From bd364aaee338fbc6e3a49043614331ff471e7f4d Mon Sep 17 00:00:00 2001 From: Jan Hubicka Date: Wed, 17 Mar 2021 22:37:11 +0100 Subject: Enable gather on zen3 hardware. For TSVC it get used by 5 benchmarks with following runtime improvements: s4114: 1.424 -> 1.209 (84.9017%) s4115: 2.021 -> 1.065 (52.6967%) s4116: 1.549 -> 0.854 (55.1323%) s4117: 1.386 -> 1.193 (86.075%) vag: 2.741 -> 1.940 (70.7771%) there is regression in s4112: 1.115 -> 1.184 (106.188%) The internal loop is: for (int i = 0; i < LEN_1D; i++) { a[i] += b[ip[i]] * s; } (so a standard accmulate and add with indirect addressing) 40a400: c5 fe 6f 24 03 vmovdqu (%rbx,%rax,1),%ymm4 40a405: c5 fc 28 da vmovaps %ymm2,%ymm3 40a409: 48 83 c0 20 add $0x20,%rax 40a40d: c4 e2 65 92 04 a5 00 vgatherdps %ymm3,0x594100(,%ymm4,4),%ymm0 40a414: 41 59 00 40a417: c4 e2 75 a8 80 e0 34 vfmadd213ps 0x5b34e0(%rax),%ymm1,%ymm0 40a41e: 5b 00 40a420: c5 fc 29 80 e0 34 5b vmovaps %ymm0,0x5b34e0(%rax) 40a427: 00 40a428: 48 3d 00 f4 01 00 cmp $0x1f400,%rax 40a42e: 75 d0 jne 40a400 compared to: 40a280: 49 63 14 04 movslq (%r12,%rax,1),%rdx 40a284: 48 83 c0 04 add $0x4,%rax 40a288: c5 fa 10 04 95 00 41 vmovss 0x594100(,%rdx,4),%xmm0 40a28f: 59 00 40a291: c4 e2 71 a9 80 fc 34 vfmadd213ss 0x5b34fc(%rax),%xmm1,%xmm0 40a298: 5b 00 40a29a: c5 fa 11 80 fc 34 5b vmovss %xmm0,0x5b34fc(%rax) 40a2a1: 00 40a2a2: 48 3d 00 f4 01 00 cmp $0x1f400,%rax 40a2a8: 75 d6 jne 40a280 Looking at instructions latencies - fmadd is 4 cycles - vgatherdps is 39 So vgather iself is 4.8 cycle per iteration and probably CPU is able to execute rest out of order getting clos to 4 cycles per iteration (it can do 2 loads in parallel, one store and rest fits easily to execution resources). That would explain 20% slowdown. gimple internal loop is: _2 = a[i_38]; _3 = (long unsigned int) i_38; _4 = _3 * 4; _5 = ip_18 + _4; _6 = *_5; _7 = b[_6]; _8 = _7 * s_19; _9 = _2 + _8; a[i_38] = _9; i_28 = i_38 + 1; ivtmp_52 = ivtmp_53 - 1; if (ivtmp_52 != 0) goto ; [98.99%] else goto ; [1.01%] 0x25bac30 a[i_38] 1 times scalar_load costs 12 in body 0x25bac30 *_5 1 times scalar_load costs 12 in body 0x25bac30 b[_6] 1 times scalar_load costs 12 in body 0x25bac30 _7 * s_19 1 times scalar_stmt costs 12 in body 0x25bac30 _2 + _8 1 times scalar_stmt costs 12 in body 0x25bac30 _9 1 times scalar_store costs 16 in body so 19 cycles estimate of scalar load 0x2668630 a[i_38] 1 times vector_load costs 12 in body 0x2668630 *_5 1 times unaligned_load (misalign -1) costs 12 in body 0x2668630 b[_6] 8 times scalar_load costs 96 in body 0x2668630 _7 * s_19 1 times scalar_to_vec costs 4 in prologue 0x2668630 _7 * s_19 1 times vector_stmt costs 12 in body 0x2668630 _2 + _8 1 times vector_stmt costs 12 in body 0x2668630 _9 1 times vector_store costs 16 in body so 40 cycles per 8x vectorized body tsvc.c:3450:27: note: operating only on full vectors. tsvc.c:3450:27: note: Cost model analysis: Vector inside of loop cost: 160 Vector prologue cost: 4 Vector epilogue cost: 0 Scalar iteration cost: 76 Scalar outside cost: 0 Vector outside cost: 4 prologue iterations: 0 epilogue iterations: 0 Calculated minimum iters for profitability: 1 I think this generally suffers from GIGO principle. One problem seems to be that we do not know about fmadd yet and compute it as two instructions (6 cycles instead of 4). More importnat problem is that we do not account the parallelism at all. 
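Spelling those estimates out (all figures are taken from the analysis and dumps above; only the division and rounding are mine):

    hardware:   vgatherdps ~39 cycles / 8 lanes ~= 4.9 cycles per element,
                against ~4 cycles per scalar iteration -> roughly the 20%
                per-element deficit mentioned above
    cost model: scalar iteration cost 76 against vector body cost 160 / 8
                lanes = 20 per scalar iteration -> the model sees the vector
                loop as almost 4x cheaper, hence the vectorization despite
                the measured regression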
I do not see how to disable the vecotrization here without bumping gather costs noticeably off reality and thus we probably can try to experiment with this if more similar problems are found. Icc is also using gather in s1115 and s128. For s1115 the vectorization does not seem to help and s128 gets slower. Clang and aocc does not use gathers. * config/i386/x86-tune-costs.h (struct processor_costs): Update costs of gather to match reality. * config/i386/x86-tune.def (X86_TUNE_USE_GATHER): Enable for znver3. --- gcc/config/i386/x86-tune-costs.h | 10 +++++----- gcc/config/i386/x86-tune.def | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h index e655e66..db03738 100644 --- a/gcc/config/i386/x86-tune-costs.h +++ b/gcc/config/i386/x86-tune-costs.h @@ -1767,11 +1767,11 @@ struct processor_costs znver3_cost = { 2, 2, 3, /* cost of moving XMM,YMM,ZMM register. */ 6, /* cost of moving SSE register to integer. */ - /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops, - throughput 12. Approx 9 uops do not depend on vector size and every load - is 7 uops. */ - 18, 8, /* Gather load static, per_elt. */ - 18, 10, /* Gather store static, per_elt. */ + /* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops, + throughput 9. Approx 7 uops do not depend on vector size and every load + is 4 uops. */ + 14, 8, /* Gather load static, per_elt. */ + 14, 10, /* Gather store static, per_elt. */ 32, /* size of l1 cache. */ 512, /* size of l2 cache. */ 64, /* size of prefetch block. */ diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index 140ccb3..caebf76 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -436,7 +436,7 @@ DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes", /* X86_TUNE_USE_GATHER: Use gather instructions. */ DEF_TUNE (X86_TUNE_USE_GATHER, "use_gather", - ~(m_ZNVER | m_GENERIC)) + ~(m_ZNVER1 | m_ZNVER2 | m_GENERIC)) /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or smaller FMA chain. */ -- cgit v1.1 From 5074c6fa38cef1abb9a355d717b41441a44c4e6a Mon Sep 17 00:00:00 2001 From: Sandra Loosemore Date: Wed, 17 Mar 2021 14:37:05 -0700 Subject: nios2: Fix format complaints and similar diagnostics. The nios2 back end has not been building with newer versions of host GCC due to several complaints about diagnostic formatting, along with a couple other warnings. This patch fixes the errors seen when building with a host compiler from current mainline head. I also made a pass through all the error messages in this file to make them use more consistent formatting, even where the host compiler was not specifically complaining. gcc/ * config/nios2/nios2.c (nios2_custom_check_insns): Clean up error message format issues. (nios2_option_override): Likewise. (nios2_expand_fpu_builtin): Likewise. (nios2_init_custom_builtins): Adjust to avoid bogus strncpy truncation warning. (nios2_expand_custom_builtin): More error message format fixes. (nios2_expand_rdwrctl_builtin): Likewise. (nios2_expand_rdprs_builtin): Likewise. (nios2_expand_eni_builtin): Likewise. (nios2_expand_builtin): Likewise. (nios2_register_custom_code): Likewise. (nios2_valid_target_attribute_rec): Likewise. (nios2_add_insn_asm): Fix uninitialized variable warning. 
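For reference, the complaints come from the host compiler's -Wformat-diag checking of diagnostic strings; the two recurring classes in this file are a trailing period and an unquoted option name.  A before/after sketch based on the hunks below (the %<...%> quoting in the "after" lines is the corrected form):

    /* Rejected by -Wformat-diag: trailing period; unquoted option name.  */
    error ("%<-mgpopt%> not supported with PIC.");
    error ("custom-fpu-cfg option requires configuration argument");

    /* Accepted: no trailing period, option name quoted.  The lower-casing
       and rewording elsewhere in the patch are the consistency pass the
       message above describes rather than things the host compiler
       rejects.  */
    error ("%<-mgpopt%> not supported with PIC");
    error ("%<custom-fpu-cfg%> option requires configuration argument");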
--- gcc/config/nios2/nios2.c | 63 ++++++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 29 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/nios2/nios2.c b/gcc/config/nios2/nios2.c index 3ff4ff1..bf5e2be 100644 --- a/gcc/config/nios2/nios2.c +++ b/gcc/config/nios2/nios2.c @@ -1179,8 +1179,8 @@ nios2_custom_check_insns (void) for (j = 0; j < ARRAY_SIZE (nios2_fpu_insn); j++) if (N2FPU_DOUBLE_REQUIRED_P (j) && ! N2FPU_ENABLED_P (j)) { - error ("switch %<-mcustom-%s%> is required for double " - "precision floating point", N2FPU_NAME (j)); + error ("switch %<-mcustom-%s%> is required for " + "double-precision floating-point", N2FPU_NAME (j)); errors = true; } break; @@ -1188,7 +1188,8 @@ nios2_custom_check_insns (void) if (errors || custom_code_conflict) fatal_error (input_location, - "conflicting use of %<-mcustom%> switches, target attributes, " + "conflicting use of %<-mcustom%> switches, " + "target attributes, " "and/or %<__builtin_custom_%> functions"); } @@ -1378,11 +1379,11 @@ nios2_option_override (void) if (flag_pic) { if (nios2_gpopt_option != gpopt_none) - error ("%<-mgpopt%> not supported with PIC."); + error ("%<-mgpopt%> not supported with PIC"); if (nios2_gprel_sec) - error ("%<-mgprel-sec=%> not supported with PIC."); + error ("%<-mgprel-sec=%> not supported with PIC"); if (nios2_r0rel_sec) - error ("%<-mr0rel-sec=%> not supported with PIC."); + error ("%<-mr0rel-sec=%> not supported with PIC"); } /* Process -mgprel-sec= and -m0rel-sec=. */ @@ -1390,13 +1391,13 @@ nios2_option_override (void) { if (regcomp (&nios2_gprel_sec_regex, nios2_gprel_sec, REG_EXTENDED | REG_NOSUB)) - error ("%<-mgprel-sec=%> argument is not a valid regular expression."); + error ("%<-mgprel-sec=%> argument is not a valid regular expression"); } if (nios2_r0rel_sec) { if (regcomp (&nios2_r0rel_sec_regex, nios2_r0rel_sec, REG_EXTENDED | REG_NOSUB)) - error ("%<-mr0rel-sec=%> argument is not a valid regular expression."); + error ("%<-mr0rel-sec=%> argument is not a valid regular expression"); } /* If we don't have mul, we don't have mulx either! */ @@ -3574,8 +3575,9 @@ nios2_expand_fpu_builtin (tree exp, unsigned int code, rtx target) if (N2FPU_N (code) < 0) fatal_error (input_location, - "Cannot call %<__builtin_custom_%s%> without specifying switch" - " %<-mcustom-%s%>", N2FPU_NAME (code), N2FPU_NAME (code)); + "cannot call %<__builtin_custom_%s%> without specifying " + "switch %<-mcustom-%s%>", + N2FPU_NAME (code), N2FPU_NAME (code)); if (has_target_p) create_output_operand (&ops[opno++], target, dst_mode); else @@ -3641,10 +3643,10 @@ nios2_init_custom_builtins (int start_code) = build_function_type_list (ret_type, integer_type_node, op[rhs1].type, op[rhs2].type, NULL_TREE); - snprintf (builtin_name + n, 32 - n, "%sn%s%s", - op[lhs].c, op[rhs1].c, op[rhs2].c); /* Save copy of parameter string into custom_builtin_name[]. 
*/ - strncpy (custom_builtin_name[builtin_code], builtin_name + n, 5); + snprintf (custom_builtin_name[builtin_code], 5, "%sn%s%s", + op[lhs].c, op[rhs1].c, op[rhs2].c); + strncpy (builtin_name + n, custom_builtin_name[builtin_code], 5); fndecl = add_builtin_function (builtin_name, builtin_ftype, start_code + builtin_code, @@ -3682,7 +3684,7 @@ nios2_expand_custom_builtin (tree exp, unsigned int index, rtx target) if (argno == 0) { if (!custom_insn_opcode (value, VOIDmode)) - error ("custom instruction opcode must be compile time " + error ("custom instruction opcode must be a compile-time " "constant in the range 0-255 for %<__builtin_custom_%s%>", custom_builtin_name[index]); } @@ -3887,7 +3889,7 @@ nios2_expand_rdwrctl_builtin (tree exp, rtx target, struct expand_operand ops[MAX_RECOG_OPERANDS]; if (!rdwrctl_operand (ctlcode, VOIDmode)) { - error ("Control register number must be in range 0-31 for %s", + error ("control register number must be in range 0-31 for %s", d->name); return has_target_p ? gen_reg_rtx (SImode) : const0_rtx; } @@ -3915,14 +3917,14 @@ nios2_expand_rdprs_builtin (tree exp, rtx target, if (!rdwrctl_operand (reg, VOIDmode)) { - error ("Register number must be in range 0-31 for %s", + error ("register number must be in range 0-31 for %s", d->name); return gen_reg_rtx (SImode); } if (!rdprs_dcache_operand (imm, VOIDmode)) { - error ("The immediate value must fit into a %d-bit integer for %s", + error ("immediate value must fit into a %d-bit integer for %s", (TARGET_ARCH_R2) ? 12 : 16, d->name); return gen_reg_rtx (SImode); } @@ -3972,7 +3974,7 @@ nios2_expand_eni_builtin (tree exp, rtx target ATTRIBUTE_UNUSED, if (INTVAL (imm) != 0 && INTVAL (imm) != 1) { - error ("The ENI instruction operand must be either 0 or 1"); + error ("the ENI instruction operand must be either 0 or 1"); return const0_rtx; } create_integer_operand (&ops[0], INTVAL (imm)); @@ -4000,7 +4002,7 @@ nios2_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, if (d->arch > nios2_arch_option) { - error ("Builtin function %s requires Nios II R%d", + error ("built-in function %s requires Nios II R%d", d->name, (int) d->arch); /* Given it is invalid, just generate a normal call. 
*/ return expand_call (exp, target, ignore); @@ -4080,14 +4082,16 @@ nios2_register_custom_code (unsigned int N, enum nios2_ccs_code status, if (custom_code_status[N] == CCS_FPU && index != custom_code_index[N]) { custom_code_conflict = true; - error ("switch %<-mcustom-%s%> conflicts with switch %<-mcustom-%s%>", + error ("switch %<-mcustom-%s%> conflicts with " + "switch %<-mcustom-%s%>", N2FPU_NAME (custom_code_index[N]), N2FPU_NAME (index)); } else if (custom_code_status[N] == CCS_BUILTIN_CALL) { custom_code_conflict = true; - error ("call to %<__builtin_custom_%s%> conflicts with switch " - "%<-mcustom-%s%>", custom_builtin_name[custom_code_index[N]], + error ("call to %<__builtin_custom_%s%> conflicts with " + "switch %<-mcustom-%s%>", + custom_builtin_name[custom_code_index[N]], N2FPU_NAME (index)); } } @@ -4096,8 +4100,9 @@ nios2_register_custom_code (unsigned int N, enum nios2_ccs_code status, if (custom_code_status[N] == CCS_FPU) { custom_code_conflict = true; - error ("call to %<__builtin_custom_%s%> conflicts with switch " - "%<-mcustom-%s%>", custom_builtin_name[index], + error ("call to %<__builtin_custom_%s%> conflicts with " + "switch %<-mcustom-%s%>", + custom_builtin_name[index], N2FPU_NAME (custom_code_index[N])); } else @@ -4204,13 +4209,13 @@ nios2_valid_target_attribute_rec (tree args) char *end_eq = p; if (no_opt) { - error ("custom-fpu-cfg option does not support %"); + error ("% option does not support %"); return false; } if (!eq) { - error ("custom-fpu-cfg option requires configuration" - " argument"); + error ("% option requires configuration " + "argument"); return false; } /* Increment and skip whitespace. */ @@ -4282,7 +4287,7 @@ nios2_valid_target_attribute_rec (tree args) } else { - error ("%<%s%> is unknown", argstr); + error ("invalid custom instruction option %qs", argstr); return false; } @@ -4707,7 +4712,7 @@ nios2_add_insn_asm (rtx_insn *insn, rtx *operands) bool nios2_cdx_narrow_form_p (rtx_insn *insn) { - rtx pat, lhs, rhs1, rhs2; + rtx pat, lhs, rhs1 = NULL_RTX, rhs2 = NULL_RTX; enum attr_type type; if (!TARGET_HAS_CDX) return false; -- cgit v1.1 From 8f0c9d53ef3a9b8ba2579b53596cc2b7f5d8bf69 Mon Sep 17 00:00:00 2001 From: Kyrylo Tkachov Date: Thu, 18 Mar 2021 08:57:01 +0000 Subject: aarch64: Improve generic SVE tuning defaults This patch adds the recently-added tweak to split some SVE VL-based scalar operations [1] to the generic tuning used for SVE, as enabled by adding +sve to the -march flag, for example -march=armv8.2-a+sve. The recommendation for best performance on a particular CPU remains unchanged: use the -mcpu option for that CPU, where possible. -mcpu=native makes this straightforward for native compilation. The tweak to split out SVE VL-based scalar operations is a consistent win for the Neoverse V1 CPU and should be neutral for the Fujitsu A64FX. A run of SPEC2017 on A64FX with this tweak on didn't show any non-noise differences. It is also expected to be neutral on SVE2 implementations. Therefore, the patch enables the tweak for generic +sve tuning e.g. -march=armv8.2-a+sve. No SVE2 CPUs are expected to benefit from it, therefore the tweak is disabled for generic tuning when +sve2 is in -march e.g. -march=armv8.2-a+sve2. The implementation of this approach requires a bit of custom logic in aarch64_override_options_internal to handle these kinds of architecture-dependent decisions, but we do believe the user-facing principle here is important to implement. 
In general, for the generic target we're using a decision framework that looks like: * If all cores that are known to benefit from an optimization are of architecture X, and all other cores that implement X or above are not impacted, or have a very slight impact, we will consider it for generic tuning for architecture X. * We will not enable that optimisation for generic tuning for architecture X+1 if no known cores of architecture X+1 or above will benefit. This framework allows us to improve generic tuning for CPUs of generation X while avoiding accumulating tweaks for future CPUs of generation X+1, X+2... that do not need them, and thus avoid even the slight negative effects of these optimisations if the user is willing to tell us the desired architecture accurately. X above can mean either annual architecture updates (Armv8.2-a, Armv8.3-a etc) or optional architecture extensions (like SVE, SVE2). [1] http://gcc.gnu.org/g:a65b9ad863c5fc0aea12db58557f4d286a1974d7 gcc/ChangeLog: * config/aarch64/aarch64.c (aarch64_adjust_generic_arch_tuning): Define. (aarch64_override_options_internal): Use it. (generic_tunings): Add AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS to tune_flags. gcc/testsuite/ChangeLog: * g++.target/aarch64/sve/aarch64-sve.exp: Add -moverride=tune=none to sve_flags. * g++.target/aarch64/sve/acle/aarch64-sve-acle-asm.exp: Likewise. * g++.target/aarch64/sve/acle/aarch64-sve-acle.exp: Likewise. * gcc.target/aarch64/sve/aarch64-sve.exp: Likewise. * gcc.target/aarch64/sve/acle/aarch64-sve-acle-asm.exp: Likewise. * gcc.target/aarch64/sve/acle/aarch64-sve-acle.exp: Likewise. --- gcc/config/aarch64/aarch64.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 7838d99..db69e69 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -1035,7 +1035,10 @@ static const struct tune_params generic_tunings = 2, /* min_div_recip_mul_df. */ 0, /* max_case_values. */ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ - (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ + /* Enabling AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS significantly benefits + Neoverse V1. It does not have a noticeable effect on A64FX and should + have at most a very minor effect on SVE2 cores. */ + (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS), /* tune_flags. */ &generic_prefetch_tune }; @@ -14485,6 +14488,19 @@ aarch64_parse_override_string (const char* input_string, free (string_root); } +/* Adjust CURRENT_TUNE (a generic tuning struct) with settings that + are best for a generic target with the currently-enabled architecture + extensions. */ +static void +aarch64_adjust_generic_arch_tuning (struct tune_params ¤t_tune) +{ + /* Neoverse V1 is the only core that is known to benefit from + AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS. There is therefore no + point enabling it for SVE2 and above. */ + if (TARGET_SVE2) + current_tune.extra_tuning_flags + &= ~AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS; +} static void aarch64_override_options_after_change_1 (struct gcc_options *opts) @@ -14555,6 +14571,8 @@ aarch64_override_options_internal (struct gcc_options *opts) we may later overwrite. 
*/ aarch64_tune_params = *(selected_tune->tune); aarch64_architecture_version = selected_arch->architecture_version; + if (selected_tune->tune == &generic_tunings) + aarch64_adjust_generic_arch_tuning (aarch64_tune_params); if (opts->x_aarch64_override_tune_string) aarch64_parse_override_string (opts->x_aarch64_override_tune_string, -- cgit v1.1 From 073595ef13b3395577a96bae051caae8cff9df0f Mon Sep 17 00:00:00 2001 From: Nick Clifton Date: Thu, 18 Mar 2021 12:57:25 +0000 Subject: Fix building the V850 port using recent versions of gcc. gcc/ * config/v850/v850.c (construct_restore_jr): Increase static buffer size. (construct_save_jarl): Likewise. * config/v850/v850.h (DWARF2_DEBUGGING_INFO): Define. --- gcc/config/v850/v850.c | 4 ++-- gcc/config/v850/v850.h | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/v850/v850.c b/gcc/config/v850/v850.c index 249cb40..e0e5005 100644 --- a/gcc/config/v850/v850.c +++ b/gcc/config/v850/v850.c @@ -2181,7 +2181,7 @@ construct_restore_jr (rtx op) unsigned long int first; unsigned long int last; int i; - static char buff [100]; /* XXX */ + static char buff [256]; /* XXX */ if (count <= 2) { @@ -2286,7 +2286,7 @@ construct_save_jarl (rtx op) unsigned long int first; unsigned long int last; int i; - static char buff [100]; /* XXX */ + static char buff [255]; /* XXX */ if (count <= (TARGET_LONG_CALLS ? 3 : 2)) { diff --git a/gcc/config/v850/v850.h b/gcc/config/v850/v850.h index 23dfdf6..386f9f5 100644 --- a/gcc/config/v850/v850.h +++ b/gcc/config/v850/v850.h @@ -700,6 +700,7 @@ typedef enum /* Use dwarf2 debugging info by default. */ #undef PREFERRED_DEBUGGING_TYPE #define PREFERRED_DEBUGGING_TYPE DWARF2_DEBUG +#define DWARF2_DEBUGGING_INFO 1 #define DWARF2_FRAME_INFO 1 #define DWARF2_UNWIND_INFO 0 -- cgit v1.1 From d9f0ade001533c9544bf2153b6baa8844ec0bee4 Mon Sep 17 00:00:00 2001 From: Sinan Lin Date: Thu, 4 Mar 2021 18:02:39 +0800 Subject: PR target/99314: Fix integer signedness issue for cpymem pattern expansion. Third operand of cpymem pattern is unsigned HOST_WIDE_INT, however we are interpret that as signed HOST_WIDE_INT, that not a problem in most case, but when the value is large than signed HOST_WIDE_INT, it might screw up since we have using that value to calculate the buffer size. 2021-03-05 Sinan Lin Kito Cheng gcc/ChangeLog: * config/riscv/riscv.c (riscv_block_move_straight): Change type to unsigned HOST_WIDE_INT for parameter and local variable with HOST_WIDE_INT type. (riscv_adjust_block_mem): Ditto. (riscv_block_move_loop): Ditto. (riscv_expand_block_move): Ditto. --- gcc/config/riscv/riscv.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/riscv/riscv.c b/gcc/config/riscv/riscv.c index fffd081..96fc0c0 100644 --- a/gcc/config/riscv/riscv.c +++ b/gcc/config/riscv/riscv.c @@ -3146,9 +3146,9 @@ riscv_legitimize_call_address (rtx addr) Assume that the areas do not overlap. */ static void -riscv_block_move_straight (rtx dest, rtx src, HOST_WIDE_INT length) +riscv_block_move_straight (rtx dest, rtx src, unsigned HOST_WIDE_INT length) { - HOST_WIDE_INT offset, delta; + unsigned HOST_WIDE_INT offset, delta; unsigned HOST_WIDE_INT bits; int i; enum machine_mode mode; @@ -3194,8 +3194,8 @@ riscv_block_move_straight (rtx dest, rtx src, HOST_WIDE_INT length) register. Store them in *LOOP_REG and *LOOP_MEM respectively. 
*/ static void -riscv_adjust_block_mem (rtx mem, HOST_WIDE_INT length, - rtx *loop_reg, rtx *loop_mem) +riscv_adjust_block_mem (rtx mem, unsigned HOST_WIDE_INT length, + rtx *loop_reg, rtx *loop_mem) { *loop_reg = copy_addr_to_reg (XEXP (mem, 0)); @@ -3210,11 +3210,11 @@ riscv_adjust_block_mem (rtx mem, HOST_WIDE_INT length, the memory regions do not overlap. */ static void -riscv_block_move_loop (rtx dest, rtx src, HOST_WIDE_INT length, - HOST_WIDE_INT bytes_per_iter) +riscv_block_move_loop (rtx dest, rtx src, unsigned HOST_WIDE_INT length, + unsigned HOST_WIDE_INT bytes_per_iter) { rtx label, src_reg, dest_reg, final_src, test; - HOST_WIDE_INT leftover; + unsigned HOST_WIDE_INT leftover; leftover = length % bytes_per_iter; length -= leftover; @@ -3259,18 +3259,19 @@ riscv_block_move_loop (rtx dest, rtx src, HOST_WIDE_INT length, bool riscv_expand_block_move (rtx dest, rtx src, rtx length) { + unsigned HOST_WIDE_INT hwi_length = UINTVAL (length); if (CONST_INT_P (length)) { - HOST_WIDE_INT factor, align; + unsigned HOST_WIDE_INT factor, align; align = MIN (MIN (MEM_ALIGN (src), MEM_ALIGN (dest)), BITS_PER_WORD); factor = BITS_PER_WORD / align; if (optimize_function_for_size_p (cfun) - && INTVAL (length) * factor * UNITS_PER_WORD > MOVE_RATIO (false)) + && hwi_length * factor * UNITS_PER_WORD > MOVE_RATIO (false)) return false; - if (INTVAL (length) <= RISCV_MAX_MOVE_BYTES_STRAIGHT / factor) + if (hwi_length <= (RISCV_MAX_MOVE_BYTES_STRAIGHT / factor)) { riscv_block_move_straight (dest, src, INTVAL (length)); return true; @@ -3280,7 +3281,8 @@ riscv_expand_block_move (rtx dest, rtx src, rtx length) unsigned min_iter_words = RISCV_MAX_MOVE_BYTES_PER_LOOP_ITER / UNITS_PER_WORD; unsigned iter_words = min_iter_words; - HOST_WIDE_INT bytes = INTVAL (length), words = bytes / UNITS_PER_WORD; + unsigned HOST_WIDE_INT bytes = hwi_length; + unsigned HOST_WIDE_INT words = bytes / UNITS_PER_WORD; /* Lengthen the loop body if it shortens the tail. */ for (unsigned i = min_iter_words; i < min_iter_words * 2 - 1; i++) -- cgit v1.1 From ab03c0d5753549f1a78eeb706510b55fb97c8651 Mon Sep 17 00:00:00 2001 From: Jan Hubicka Date: Thu, 18 Mar 2021 17:15:34 +0100 Subject: Fix idiv latencies for znver3 update costs of integer divides to match actual latencies (the scheduler model already does the right thing). It is essentially no-op, since we end up expanding idiv for all sensible constants, so this only may end up disabling vectorization in some cases, but I did not find any such examples. However in general it is better ot have actual latencies than random numbers. gcc/ChangeLog: 2021-03-18 Jan Hubicka * config/i386/x86-tune-costs.h (struct processor_costs): Fix costs of integer divides1. --- gcc/config/i386/x86-tune-costs.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h index db03738..58b3b81 100644 --- a/gcc/config/i386/x86-tune-costs.h +++ b/gcc/config/i386/x86-tune-costs.h @@ -1741,13 +1741,11 @@ struct processor_costs znver3_cost = { COSTS_N_INSNS (3)}, /* other. */ 0, /* cost of multiply per each bit set. */ - /* Depending on parameters, idiv can get faster on ryzen. This is upper - bound. */ - {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */ - COSTS_N_INSNS (22), /* HI. */ - COSTS_N_INSNS (30), /* SI. */ - COSTS_N_INSNS (45), /* DI. */ - COSTS_N_INSNS (45)}, /* other. */ + {COSTS_N_INSNS (9), /* cost of a divide/mod for QI. */ + COSTS_N_INSNS (10), /* HI. 
*/ + COSTS_N_INSNS (12), /* SI. */ + COSTS_N_INSNS (17), /* DI. */ + COSTS_N_INSNS (17)}, /* other. */ COSTS_N_INSNS (1), /* cost of movsx. */ COSTS_N_INSNS (1), /* cost of movzx. */ 8, /* "large" insn. */ -- cgit v1.1 From 55308fc26318427c1438cecc60ddd7ba24d5cd33 Mon Sep 17 00:00:00 2001 From: Andrew Stubbs Date: Wed, 17 Mar 2021 14:13:10 +0000 Subject: amdgcn: Silence warnings in gcn.c This fixes a few cases of "unquoted identifier or keyword", one "spurious trailing punctuation sequence", and a "may be used uninitialized". gcc/ChangeLog: * config/gcn/gcn.c (gcn_parse_amdgpu_hsa_kernel_attribute): Add %< and %> quote markers to error messages. (gcn_goacc_validate_dims): Likewise. (gcn_conditional_register_usage): Remove exclaimation mark from error message. (gcn_vectorize_vec_perm_const): Ensure perm is fully uninitialized. --- gcc/config/gcn/gcn.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c index e8bb0b6..22da37e 100644 --- a/gcc/config/gcn/gcn.c +++ b/gcc/config/gcn/gcn.c @@ -228,7 +228,7 @@ gcn_parse_amdgpu_hsa_kernel_attribute (struct gcn_kernel_args *args, const char *str; if (TREE_CODE (TREE_VALUE (list)) != STRING_CST) { - error ("amdgpu_hsa_kernel attribute requires string constant " + error ("% attribute requires string constant " "arguments"); break; } @@ -241,13 +241,14 @@ gcn_parse_amdgpu_hsa_kernel_attribute (struct gcn_kernel_args *args, } if (a == GCN_KERNEL_ARG_TYPES) { - error ("unknown specifier %s in amdgpu_hsa_kernel attribute", str); + error ("unknown specifier %qs in %", + str); err = true; break; } if (args->requested & (1 << a)) { - error ("duplicated parameter specifier %s in amdgpu_hsa_kernel " + error ("duplicated parameter specifier %qs in % " "attribute", str); err = true; break; @@ -2102,7 +2103,7 @@ gcn_conditional_register_usage (void) /* Requesting a set of args different from the default violates the ABI. */ if (!leaf_function_p ()) warning (0, "A non-default set of initial values has been requested, " - "which violates the ABI!"); + "which violates the ABI"); for (int i = SGPR_REGNO (0); i < SGPR_REGNO (14); i++) fixed_regs[i] = 0; @@ -3983,6 +3984,8 @@ gcn_vectorize_vec_perm_const (machine_mode vmode, rtx dst, unsigned int perm[64]; for (unsigned int i = 0; i < nelt; ++i) perm[i] = sel[i] & (2 * nelt - 1); + for (unsigned int i = nelt; i < 64; ++i) + perm[i] = 0; src0 = force_reg (vmode, src0); src1 = force_reg (vmode, src1); @@ -4882,8 +4885,8 @@ gcn_goacc_validate_dims (tree decl, int dims[], int fn_level, warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, OPT_Wopenacc_dims, (dims[GOMP_DIM_VECTOR] - ? G_("using vector_length (64), ignoring %d") - : G_("using vector_length (64), " + ? G_("using %, ignoring %d") + : G_("using %, " "ignoring runtime setting")), dims[GOMP_DIM_VECTOR]); dims[GOMP_DIM_VECTOR] = 1; @@ -4895,7 +4898,7 @@ gcn_goacc_validate_dims (tree decl, int dims[], int fn_level, { warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, OPT_Wopenacc_dims, - "using num_workers (%d), ignoring %d", + "using %, ignoring %d", max_workers, dims[GOMP_DIM_WORKER]); dims[GOMP_DIM_WORKER] = max_workers; changed = true; -- cgit v1.1 From 5cded5aff76c15e48f689842b9aba1d1df5e3c54 Mon Sep 17 00:00:00 2001 From: Andrew Stubbs Date: Fri, 19 Mar 2021 10:42:37 +0000 Subject: amdgcn: Typo fix gcc/ChangeLog: * config/gcn/gcn.c (gcn_parse_amdgpu_hsa_kernel_attribute): Fix quotes in error message. 
--- gcc/config/gcn/gcn.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c index 22da37e..9660ca6 100644 --- a/gcc/config/gcn/gcn.c +++ b/gcc/config/gcn/gcn.c @@ -241,7 +241,7 @@ gcn_parse_amdgpu_hsa_kernel_attribute (struct gcn_kernel_args *args, } if (a == GCN_KERNEL_ARG_TYPES) { - error ("unknown specifier %qs in %", + error ("unknown specifier %qs in % attribute", str); err = true; break; -- cgit v1.1 From 009528d61c796608affd1eaa18ae31a3679eb46d Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Fri, 19 Mar 2021 13:48:44 +0100 Subject: arm: Fix mve_vshlq* [PR99593] As mentioned in the PR, before the r11-6708-gbfab355012ca0f5219da8beb04f2fdaf757d34b7 change v[al]shr3 expanders were expanding the shifts by register to gen_ashl3_{,un}signed which don't support immediate CONST_VECTOR shift amounts, but now expand to mve_vshlq_ which does. The testcase ICEs, because the constraint doesn't match the predicate and because LRA works solely with the constraints, so it can e.g. from REG_EQUAL propagate there a CONST_VECTOR which matches the constraint but fails the predicate and only later on other passes will notice the predicate fails and ICE. Fixed by adding a constraint that matches the immediate part of the predicate. PR target/99593 * config/arm/constraints.md (Ds): New constraint. * config/arm/vec-common.md (mve_vshlq_): Use w,Ds constraint instead of w,Dm. * g++.target/arm/pr99593.C: New test. --- gcc/config/arm/constraints.md | 10 +++++++++- gcc/config/arm/vec-common.md | 2 +- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/constraints.md b/gcc/config/arm/constraints.md index 919f299..de0ca8e 100644 --- a/gcc/config/arm/constraints.md +++ b/gcc/config/arm/constraints.md @@ -32,7 +32,7 @@ ;; The following multi-letter normal constraints have been used: ;; in ARM/Thumb-2 state: Da, Db, Dc, Dd, Dn, DN, Dm, Dl, DL, Do, Dv, Dy, Di, -;; Dt, Dp, Dz, Tu, Te +;; Ds, Dt, Dp, Dz, Tu, Te ;; in Thumb-1 state: Pa, Pb, Pc, Pd, Pe ;; in Thumb-2 state: Ha, Pj, PJ, Ps, Pt, Pu, Pv, Pw, Px, Py, Pz, Rd, Rf, Rb, Ra, ;; Rg, Ri @@ -412,6 +412,14 @@ (and (match_code "const_double") (match_test "TARGET_32BIT && vfp3_const_double_for_fract_bits (op)"))) +(define_constraint "Ds" + "@internal + In ARM/Thumb-2 state a const_vector which can be used as immediate + in vshl instruction." + (and (match_code "const_vector") + (match_test "TARGET_32BIT + && imm_for_neon_lshift_operand (op, GET_MODE (op))"))) + (define_constraint "Dp" "@internal In ARM/ Thumb2 a const_double which can be used with a vcvt.s32.f32 with bits operation" diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md index 345ada0..d7011c6 100644 --- a/gcc/config/arm/vec-common.md +++ b/gcc/config/arm/vec-common.md @@ -299,7 +299,7 @@ (define_insn "mve_vshlq_" [(set (match_operand:VDQIW 0 "s_register_operand" "=w,w") (unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w,w") - (match_operand:VDQIW 2 "imm_lshift_or_reg_neon" "w,Dm")] + (match_operand:VDQIW 2 "imm_lshift_or_reg_neon" "w,Ds")] VSHLQ))] "ARM_HAVE__ARITH && !TARGET_REALLY_IWMMXT" "@ -- cgit v1.1 From 5e2eabe1eed1e53d39923517122d3c7de2013ad4 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Thu, 18 Mar 2021 11:47:46 -0700 Subject: x86: Issue error for return/argument only with function body If we never generate function body, we shouldn't issue errors for return nor argument. 
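In outline, the new per-function flag gates those diagnostics; the three key lines, condensed from the hunks further down (no logic beyond what the patch itself adds):

    f->silent_p = true;                /* ix86_init_machine_status: start out silent */
    cfun->machine->silent_p = false;   /* init_cumulative_args: a function body is
                                          being expanded, so diagnostics are wanted  */
    if (cfun->machine->silent_p)       /* construct_container: skip the SSE/x87      */
      return NULL;                     /* return/argument errors while silent        */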
Add silent_p to i386 machine_function to avoid issuing errors for return and argument without function body. gcc/ PR target/99652 * config/i386/i386-options.c (ix86_init_machine_status): Set silent_p to true. * config/i386/i386.c (init_cumulative_args): Set silent_p to false. (construct_container): Return early for return and argument errors if silent_p is true. * config/i386/i386.h (machine_function): Add silent_p. gcc/testsuite/ PR target/99652 * gcc.dg/torture/pr99652-1.c: New test. * gcc.dg/torture/pr99652-2.c: Likewise. * gcc.target/i386/pr57655.c: Adjusted. * gcc.target/i386/pr59794-6.c: Likewise. * gcc.target/i386/pr70738-1.c: Likewise. * gcc.target/i386/pr96744-1.c: Likewise. --- gcc/config/i386/i386-options.c | 1 + gcc/config/i386/i386.c | 12 ++++++++++++ gcc/config/i386/i386.h | 4 ++++ 3 files changed, 17 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c index 7865bc1..b653527 100644 --- a/gcc/config/i386/i386-options.c +++ b/gcc/config/i386/i386-options.c @@ -1768,6 +1768,7 @@ ix86_init_machine_status (void) f = ggc_cleared_alloc (); f->call_abi = ix86_abi; f->stack_frame_required = true; + f->silent_p = true; return f; } diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 540d4f4..7143490 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -1705,6 +1705,10 @@ init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */ struct cgraph_node *local_info_node = NULL; struct cgraph_node *target = NULL; + /* Set silent_p to false to raise an error for invalid calls when + expanding function body. */ + cfun->machine->silent_p = false; + memset (cum, 0, sizeof (*cum)); if (fndecl) @@ -2534,6 +2538,10 @@ construct_container (machine_mode mode, machine_mode orig_mode, some less clueful developer tries to use floating-point anyway. */ if (needed_sseregs && !TARGET_SSE) { + /* Return early if we shouldn't raise an error for invalid + calls. */ + if (cfun->machine->silent_p) + return NULL; if (in_return) { if (!issued_sse_ret_error) @@ -2558,6 +2566,10 @@ construct_container (machine_mode mode, machine_mode orig_mode, || regclass[i] == X86_64_X87UP_CLASS || regclass[i] == X86_64_COMPLEX_X87_CLASS) { + /* Return early if we shouldn't raise an error for invalid + calls. */ + if (cfun->machine->silent_p) + return NULL; if (!issued_x87_ret_error) { error ("x87 register return with x87 disabled"); diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 4874910..058c1cc 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -2945,6 +2945,10 @@ struct GTY(()) machine_function { function. */ BOOL_BITFIELD has_explicit_vzeroupper : 1; + /* True if we should act silently, rather than raise an error for + invalid calls. */ + BOOL_BITFIELD silent_p : 1; + /* The largest alignment, in bytes, of stack slot actually used. */ unsigned int max_used_stack_alignment; -- cgit v1.1 From 22d1a90a1526a77585333bd6c7d9bfc1a9cbdffa Mon Sep 17 00:00:00 2001 From: John David Anglin Date: Fri, 19 Mar 2021 15:57:06 +0000 Subject: Use memcpy instead of strncpy to avoid error with -Werror=stringop-truncation. gcc/ChangeLog: * config/pa/pa.c (import_milli): Use memcpy instead of strncpy. 
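The warning class being avoided, as a minimal hypothetical sketch (identifiers are illustrative, not taken from pa.c): copying a 4-character millicode name into a fixed 4-byte field never appends a terminating NUL, so strncpy with a bound of 4 trips -Wstringop-truncation even though the truncation is intentional, while memcpy expresses the same 4-byte copy without the warning.

    #include <string.h>

    /* Hypothetical illustration only, built with -Wstringop-truncation.  */
    void
    patch_name (char *field)
    {
      strncpy (field, "mulI", 4);  /* warning: output truncated before
                                      terminating nul                     */
      memcpy (field, "mulI", 4);   /* same four bytes copied, no warning  */
    }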
--- gcc/config/pa/pa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/pa/pa.c b/gcc/config/pa/pa.c index d7fcd11..46194ba 100644 --- a/gcc/config/pa/pa.c +++ b/gcc/config/pa/pa.c @@ -5950,7 +5950,7 @@ import_milli (enum millicodes code) { imported[(int) code] = 1; strcpy (str, import_string); - strncpy (str + MILLI_START, milli_names[(int) code], 4); + memcpy (str + MILLI_START, milli_names[(int) code], 4); output_asm_insn (str, 0); } } -- cgit v1.1 From eadb118e36f9295df0d5787c8a31424d05fde592 Mon Sep 17 00:00:00 2001 From: Olivier Hainque Date: Fri, 19 Mar 2021 15:21:38 +0000 Subject: target/99660 - missing VX_CPU_PREFIX for vxworksae This fixes an oversight which causes make all-gcc to fail for --target=*vxworksae or vxworksmils, a regression introduced by the recent VxWorks7 related updates. Both AE and MILS variants resort to a common config/vxworksae.h, which misses a definition of VX_CPU_PREFIX expected by port specific headers. The change just provides the missing definition. 2021-03-19 Olivier Hainque gcc/ PR target/99660 * config/vxworksae.h (VX_CPU_PREFIX): Define. --- gcc/config/vxworksae.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/vxworksae.h b/gcc/config/vxworksae.h index 0f9b553..86d1923 100644 --- a/gcc/config/vxworksae.h +++ b/gcc/config/vxworksae.h @@ -64,6 +64,10 @@ along with GCC; see the file COPYING3. If not see /* Both kernels and RTPs have the facilities required by this macro. */ #define TARGET_POSIX_IO +/* The AE/653 system headers all predate the introduction of _VX_ prefixes + ahead of CPU families of macros. */ +#define VX_CPU_PREFIX "" + /* A VxWorks 653 implementation of TARGET_OS_CPP_BUILTINS. */ #define VXWORKS_OS_CPP_BUILTINS() \ do \ -- cgit v1.1 From e1df2c343633ed27b7a7bd9f6b5989c75b38f1d4 Mon Sep 17 00:00:00 2001 From: Pat Haugen Date: Fri, 19 Mar 2021 15:51:22 -0500 Subject: Add Power10 scheduling description. 2021-03-19 Pat Haugen gcc/ * config/rs6000/rs6000.c (power10_cost): New. (rs6000_option_override_internal): Set Power10 costs. (rs6000_issue_rate): Set Power10 issue rate. * config/rs6000/power10.md: Rewrite for Power10. --- gcc/config/rs6000/power10.md | 519 ++++++++++++++++++++++--------------------- gcc/config/rs6000/rs6000.c | 28 ++- 2 files changed, 294 insertions(+), 253 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/power10.md b/gcc/config/rs6000/power10.md index f9632b6..665f0f2 100644 --- a/gcc/config/rs6000/power10.md +++ b/gcc/config/rs6000/power10.md @@ -1,9 +1,7 @@ -;; Scheduling description for IBM POWER10 processor. -;; Copyright (C) 2016-2021 Free Software Foundation, Inc. +;; Scheduling description for the IBM POWER10 processor. +;; Copyright (C) 2020-2021 Free Software Foundation, Inc. ;; -;; This is a clone of power9.md. It is intended to be a placeholder until a -;; real scheduler model can be contributed. -;; The original power9.md was contributed by Pat Haugen (pthaugen@us.ibm.com). +;; Contributed by Pat Haugen (pthaugen@us.ibm.com). ;; This file is part of GCC. ;; @@ -21,240 +19,215 @@ ;; along with GCC; see the file COPYING3. If not see ;; . -;; This file was cloned from power9.md, it does not (yet) describe the actual -;; POWER10 processor. 
- -(define_automaton "power10dsp,power10lsu,power10vsu,power10fpdiv,power10misc") - -(define_cpu_unit "lsu0_power10,lsu1_power10,lsu2_power10,lsu3_power10" "power10lsu") -(define_cpu_unit "vsu0_power10,vsu1_power10,vsu2_power10,vsu3_power10" "power10vsu") -; Two vector permute units, part of vsu -(define_cpu_unit "prm0_power10,prm1_power10" "power10vsu") -; Two fixed point divide units, not pipelined -(define_cpu_unit "fx_div0_power10,fx_div1_power10" "power10misc") -(define_cpu_unit "bru_power10,cryptu_power10,dfu_power10" "power10misc") -; Create a false unit for use by non-pipelined FP div/sqrt -(define_cpu_unit "fp_div0_power10,fp_div1_power10,fp_div2_power10,fp_div3_power10" - "power10fpdiv") - - -(define_cpu_unit "x0_power10,x1_power10,xa0_power10,xa1_power10, - x2_power10,x3_power10,xb0_power10,xb1_power10, - br0_power10,br1_power10" "power10dsp") +; For Power10 we model (and try to pack) the in-order decode/dispatch groups +; which consist of 8 instructions max. We do not try to model the details of +; the out-of-order issue queues and how insns flow to the various execution +; units except for the simple representation of the issue limitation of at +; most 4 insns to the execution units/2 insns to the load units/2 insns to +; the store units. +(define_automaton "power10dispatch,power10issue") + +; Decode/dispatch slots +(define_cpu_unit "du0_power10,du1_power10,du2_power10,du3_power10, + du4_power10,du5_power10,du6_power10,du7_power10" "power10dispatch") + +; Four execution units +(define_cpu_unit "exu0_power10,exu1_power10,exu2_power10,exu3_power10" + "power10issue") +; Two load units and two store units +(define_cpu_unit "lu0_power10,lu1_power10" "power10issue") +(define_cpu_unit "stu0_power10,stu1_power10" "power10issue") + + +; Dispatch slots are allocated in order conforming to program order. +(absence_set "du0_power10" "du1_power10,du2_power10,du3_power10,du4_power10,\ + du5_power10,du6_power10,du7_power10") +(absence_set "du1_power10" "du2_power10,du3_power10,du4_power10,du5_power10,\ + du6_power10,du7_power10") +(absence_set "du2_power10" "du3_power10,du4_power10,du5_power10,du6_power10,\ + du7_power10") +(absence_set "du3_power10" "du4_power10,du5_power10,du6_power10,du7_power10") +(absence_set "du4_power10" "du5_power10,du6_power10,du7_power10") +(absence_set "du5_power10" "du6_power10,du7_power10") +(absence_set "du6_power10" "du7_power10") ; Dispatch port reservations ; -; The processor can dispatch a maximum of 6 iops per cycle with the following -; general restrictions (other restrictions also apply): -; 1) At most 2 iops per execution slice -; 2) At most 2 iops to the branch unit -; Note that insn position in a dispatch group of 6 insns does not infer which -; execution slice the insn is routed to. The units are used to infer the -; conflicts that exist (i.e. an 'even' requirement will preclude dispatch -; with 2 insns with 'superslice' requirement). - -; The xa0/xa1 units really represent the 3rd dispatch port for a superslice but -; are listed as separate units to allow those insns that preclude its use to -; still be scheduled two to a superslice while reserving the 3rd slot. The -; same applies for xb0/xb1. 
-(define_reservation "DU_xa_power10" "xa0_power10+xa1_power10") -(define_reservation "DU_xb_power10" "xb0_power10+xb1_power10") - -; Any execution slice dispatch -(define_reservation "DU_any_power10" - "x0_power10|x1_power10|DU_xa_power10|x2_power10|x3_power10| - DU_xb_power10") - -; Even slice, actually takes even/odd slots -(define_reservation "DU_even_power10" "x0_power10+x1_power10|x2_power10+x3_power10") +; Power10 can dispatch a maximum of 8 iops per cycle. With a maximum of +; 4 VSU/2 Load/2 Store per cycle. -; Slice plus 3rd slot -(define_reservation "DU_slice_3_power10" - "x0_power10+xa0_power10|x1_power10+xa1_power10| - x2_power10+xb0_power10|x3_power10+xb1_power10") - -; Superslice -(define_reservation "DU_super_power10" - "x0_power10+x1_power10|x2_power10+x3_power10") - -; 2-way cracked -(define_reservation "DU_C2_power10" "x0_power10+x1_power10| - x1_power10+DU_xa_power10| - x1_power10+x2_power10| - DU_xa_power10+x2_power10| - x2_power10+x3_power10| - x3_power10+DU_xb_power10") - -; 2-way cracked plus 3rd slot -(define_reservation "DU_C2_3_power10" "x0_power10+x1_power10+xa0_power10| - x1_power10+x2_power10+xa1_power10| - x2_power10+x3_power10+xb0_power10") +; Any dispatch slot +(define_reservation "DU_any_power10" + "du0_power10|du1_power10|du2_power10|du3_power10| + du4_power10|du5_power10|du6_power10|du7_power10") -; 3-way cracked (consumes whole decode/dispatch cycle) -(define_reservation "DU_C3_power10" - "x0_power10+x1_power10+xa0_power10+xa1_power10+x2_power10+ - x3_power10+xb0_power10+xb1_power10+br0_power10+br1_power10") +; Even slot, actually takes even/odd slots +(define_reservation "DU_even_power10" + "du0_power10+du1_power10|du2_power10+du3_power10| + du4_power10+du5_power10|du6_power10+du7_power10") -; Branch ports -(define_reservation "DU_branch_power10" "br0_power10|br1_power10") +; 4-way cracked (consumes whole decode/dispatch cycle) +(define_reservation "DU_all_power10" + "du0_power10+du1_power10+du2_power10+du3_power10+ + du4_power10+du5_power10+du6_power10+du7_power10") ; Execution unit reservations -(define_reservation "LSU_power10" - "lsu0_power10|lsu1_power10|lsu2_power10|lsu3_power10") - -(define_reservation "LSU_pair_power10" - "lsu0_power10+lsu1_power10|lsu1_power10+lsu2_power10| - lsu2_power10+lsu3_power10|lsu3_power10+lsu0_power10") +(define_reservation "LU_power10" + "lu0_power10|lu1_power10") -(define_reservation "VSU_power10" - "vsu0_power10|vsu1_power10|vsu2_power10|vsu3_power10") +(define_reservation "STU_power10" + "stu0_power10|stu1_power10") -(define_reservation "VSU_super_power10" - "vsu0_power10+vsu1_power10|vsu2_power10+vsu3_power10") +; Certain simple fixed-point insns can execute in the Store-agen pipe +(define_reservation "SXU_power10" + "stu0_power10|stu1_power10") -(define_reservation "VSU_PRM_power10" "prm0_power10|prm1_power10") +(define_reservation "EXU_power10" + "exu0_power10|exu1_power10|exu2_power10|exu3_power10") -; Define the reservation to be used by FP div/sqrt which allows other insns -; to be issued to the VSU, but blocks other div/sqrt for a number of cycles. -; Note that the number of cycles blocked varies depending on insn, but we -; just use the same number for all in order to keep the number of DFA states -; reasonable. 
-(define_reservation "FP_DIV_power10" - "fp_div0_power10*8|fp_div1_power10*8|fp_div2_power10*8| - fp_div3_power10*8") -(define_reservation "VEC_DIV_power10" - "fp_div0_power10*8+fp_div1_power10*8| - fp_div2_power10*8+fp_div3_power10*8") +(define_reservation "EXU_super_power10" + "exu0_power10+exu1_power10|exu2_power10+exu3_power10") -; LS Unit +; Load Unit (define_insn_reservation "power10-load" 4 (and (eq_attr "type" "load") - (eq_attr "sign_extend" "no") (eq_attr "update" "no") + (eq_attr "size" "!128") + (eq_attr "prefixed" "no") (eq_attr "cpu" "power10")) - "DU_any_power10,LSU_power10") - -(define_insn_reservation "power10-load-update" 4 - (and (eq_attr "type" "load") - (eq_attr "sign_extend" "no") - (eq_attr "update" "yes") - (eq_attr "cpu" "power10")) - "DU_C2_power10,LSU_power10+VSU_power10") + "DU_any_power10,LU_power10") -(define_insn_reservation "power10-load-ext" 6 +(define_insn_reservation "power10-prefixed-load" 4 (and (eq_attr "type" "load") - (eq_attr "sign_extend" "yes") (eq_attr "update" "no") + (eq_attr "size" "!128") + (eq_attr "prefixed" "yes") (eq_attr "cpu" "power10")) - "DU_C2_power10,LSU_power10") + "DU_even_power10,LU_power10") -(define_insn_reservation "power10-load-ext-update" 6 +(define_insn_reservation "power10-load-update" 4 (and (eq_attr "type" "load") - (eq_attr "sign_extend" "yes") (eq_attr "update" "yes") (eq_attr "cpu" "power10")) - "DU_C3_power10,LSU_power10+VSU_power10") + "DU_even_power10,LU_power10+SXU_power10") (define_insn_reservation "power10-fpload-double" 4 (and (eq_attr "type" "fpload") (eq_attr "update" "no") (eq_attr "size" "64") + (eq_attr "prefixed" "no") (eq_attr "cpu" "power10")) - "DU_slice_3_power10,LSU_power10") + "DU_any_power10,LU_power10") + +(define_insn_reservation "power10-prefixed-fpload-double" 4 + (and (eq_attr "type" "fpload") + (eq_attr "update" "no") + (eq_attr "size" "64") + (eq_attr "prefixed" "yes") + (eq_attr "cpu" "power10")) + "DU_even_power10,LU_power10") (define_insn_reservation "power10-fpload-update-double" 4 (and (eq_attr "type" "fpload") (eq_attr "update" "yes") (eq_attr "size" "64") (eq_attr "cpu" "power10")) - "DU_C2_3_power10,LSU_power10+VSU_power10") + "DU_even_power10,LU_power10+SXU_power10") -; SFmode loads are cracked and have additional 2 cycles over DFmode -(define_insn_reservation "power10-fpload-single" 6 +; SFmode loads are cracked and have additional 3 cycles over DFmode +; Prefixed forms behave the same +(define_insn_reservation "power10-fpload-single" 7 (and (eq_attr "type" "fpload") (eq_attr "update" "no") (eq_attr "size" "32") (eq_attr "cpu" "power10")) - "DU_C2_3_power10,LSU_power10") + "DU_even_power10,LU_power10") -(define_insn_reservation "power10-fpload-update-single" 6 +(define_insn_reservation "power10-fpload-update-single" 7 (and (eq_attr "type" "fpload") (eq_attr "update" "yes") (eq_attr "size" "32") (eq_attr "cpu" "power10")) - "DU_C3_power10,LSU_power10+VSU_power10") + "DU_even_power10,LU_power10+SXU_power10") -(define_insn_reservation "power10-vecload" 5 +(define_insn_reservation "power10-vecload" 4 (and (eq_attr "type" "vecload") + (eq_attr "size" "!256") (eq_attr "cpu" "power10")) - "DU_any_power10,LSU_pair_power10") + "DU_any_power10,LU_power10") -; Store data can issue 2 cycles after AGEN issue, 3 cycles for vector store -(define_insn_reservation "power10-store" 0 - (and (eq_attr "type" "store") - (eq_attr "update" "no") - (eq_attr "indexed" "no") +; lxvp +(define_insn_reservation "power10-vecload-pair" 4 + (and (eq_attr "type" "vecload") + (eq_attr "size" "256") (eq_attr "cpu" 
"power10")) - "DU_slice_3_power10,LSU_power10") + "DU_even_power10,LU_power10+SXU_power10") -(define_insn_reservation "power10-store-indexed" 0 - (and (eq_attr "type" "store") +; Store Unit +(define_insn_reservation "power10-store" 0 + (and (eq_attr "type" "store,fpstore,vecstore") (eq_attr "update" "no") - (eq_attr "indexed" "yes") - (eq_attr "cpu" "power10")) - "DU_slice_3_power10,LSU_power10") - -; Update forms have 2 cycle latency for updated addr reg -(define_insn_reservation "power10-store-update" 2 - (and (eq_attr "type" "store") - (eq_attr "update" "yes") - (eq_attr "indexed" "no") - (eq_attr "cpu" "power10")) - "DU_C2_3_power10,LSU_power10+VSU_power10") - -; Update forms have 2 cycle latency for updated addr reg -(define_insn_reservation "power10-store-update-indexed" 2 - (and (eq_attr "type" "store") - (eq_attr "update" "yes") - (eq_attr "indexed" "yes") + (eq_attr "prefixed" "no") + (eq_attr "size" "!128") + (eq_attr "size" "!256") (eq_attr "cpu" "power10")) - "DU_C2_3_power10,LSU_power10+VSU_power10") + "DU_any_power10,STU_power10") -(define_insn_reservation "power10-fpstore" 0 - (and (eq_attr "type" "fpstore") - (eq_attr "update" "no") +(define_insn_reservation "power10-prefixed-store" 0 + (and (eq_attr "type" "store,fpstore,vecstore") + (eq_attr "prefixed" "yes") + (eq_attr "size" "!128") + (eq_attr "size" "!256") (eq_attr "cpu" "power10")) - "DU_slice_3_power10,LSU_power10") + "DU_even_power10,STU_power10") ; Update forms have 2 cycle latency for updated addr reg -(define_insn_reservation "power10-fpstore-update" 2 - (and (eq_attr "type" "fpstore") +(define_insn_reservation "power10-store-update" 2 + (and (eq_attr "type" "store,fpstore") (eq_attr "update" "yes") (eq_attr "cpu" "power10")) - "DU_C2_3_power10,LSU_power10+VSU_power10") + "DU_any_power10,STU_power10") -(define_insn_reservation "power10-vecstore" 0 +; stxvp +(define_insn_reservation "power10-vecstore-pair" 0 (and (eq_attr "type" "vecstore") + (eq_attr "size" "256") (eq_attr "cpu" "power10")) - "DU_super_power10,LSU_pair_power10") + "DU_even_power10,stu0_power10+stu1_power10") (define_insn_reservation "power10-larx" 4 (and (eq_attr "type" "load_l") + (eq_attr "size" "!128") + (eq_attr "cpu" "power10")) + "DU_any_power10,LU_power10") + +; All load quad forms +(define_insn_reservation "power10-lq" 4 + (and (eq_attr "type" "load,load_l") + (eq_attr "size" "128") (eq_attr "cpu" "power10")) - "DU_any_power10,LSU_power10") + "DU_even_power10,LU_power10+SXU_power10") -(define_insn_reservation "power10-stcx" 2 +(define_insn_reservation "power10-stcx" 0 (and (eq_attr "type" "store_c") + (eq_attr "size" "!128") + (eq_attr "cpu" "power10")) + "DU_any_power10,STU_power10") + +; All store quad forms +(define_insn_reservation "power10-stq" 0 + (and (eq_attr "type" "store,store_c") + (eq_attr "size" "128") (eq_attr "cpu" "power10")) - "DU_C2_3_power10,LSU_power10+VSU_power10") + "DU_even_power10,stu0_power10+stu1_power10") -(define_insn_reservation "power10-sync" 4 +(define_insn_reservation "power10-sync" 1 (and (eq_attr "type" "sync,isync") (eq_attr "cpu" "power10")) - "DU_any_power10,LSU_power10") + "DU_even_power10,STU_power10") ; VSU Execution Unit @@ -264,258 +237,302 @@ ; Most ALU insns are simple 2 cycle, including record form (define_insn_reservation "power10-alu" 2 (and (eq_attr "type" "add,exts,integer,logical,isel") + (eq_attr "prefixed" "no") (eq_attr "cpu" "power10")) - "DU_any_power10,VSU_power10") -; 5 cycle CR latency -(define_bypass 5 "power10-alu" + "DU_any_power10,EXU_power10") +; 4 cycle CR latency 
+(define_bypass 4 "power10-alu" "power10-crlogical,power10-mfcr,power10-mfcrf") -; Rotate/shift prevent use of third slot +; paddi +(define_insn_reservation "power10-paddi" 2 + (and (eq_attr "type" "add") + (eq_attr "prefixed" "yes") + (eq_attr "cpu" "power10")) + "DU_even_power10,EXU_power10") + +; Rotate/shift (non-record form) (define_insn_reservation "power10-rot" 2 (and (eq_attr "type" "insert,shift") (eq_attr "dot" "no") (eq_attr "cpu" "power10")) - "DU_slice_3_power10,VSU_power10") + "DU_any_power10,EXU_power10") -; Record form rotate/shift are cracked -(define_insn_reservation "power10-cracked-alu" 2 +; Record form rotate/shift +(define_insn_reservation "power10-rot-compare" 3 (and (eq_attr "type" "insert,shift") (eq_attr "dot" "yes") (eq_attr "cpu" "power10")) - "DU_C2_3_power10,VSU_power10") -; 7 cycle CR latency -(define_bypass 7 "power10-cracked-alu" + "DU_any_power10,EXU_power10") +; 5 cycle CR latency +(define_bypass 5 "power10-rot-compare" "power10-crlogical,power10-mfcr,power10-mfcrf") (define_insn_reservation "power10-alu2" 3 (and (eq_attr "type" "cntlz,popcnt,trap") (eq_attr "cpu" "power10")) - "DU_any_power10,VSU_power10") -; 6 cycle CR latency -(define_bypass 6 "power10-alu2" + "DU_any_power10,EXU_power10") +; 5 cycle CR latency +(define_bypass 5 "power10-alu2" "power10-crlogical,power10-mfcr,power10-mfcrf") (define_insn_reservation "power10-cmp" 2 (and (eq_attr "type" "cmp") (eq_attr "cpu" "power10")) - "DU_any_power10,VSU_power10") - + "DU_any_power10,EXU_power10") ; Treat 'two' and 'three' types as 2 or 3 way cracked (define_insn_reservation "power10-two" 4 (and (eq_attr "type" "two") (eq_attr "cpu" "power10")) - "DU_C2_power10,VSU_power10") + "DU_even_power10,EXU_power10") (define_insn_reservation "power10-three" 6 (and (eq_attr "type" "three") (eq_attr "cpu" "power10")) - "DU_C3_power10,VSU_power10") + "DU_all_power10,EXU_power10") (define_insn_reservation "power10-mul" 5 (and (eq_attr "type" "mul") (eq_attr "dot" "no") (eq_attr "cpu" "power10")) - "DU_slice_3_power10,VSU_power10") + "DU_any_power10,EXU_power10") +; 4 cycle MUL->MUL latency +(define_bypass 4 "power10-mul" + "power10-mul,power10-mul-compare") (define_insn_reservation "power10-mul-compare" 5 (and (eq_attr "type" "mul") (eq_attr "dot" "yes") (eq_attr "cpu" "power10")) - "DU_C2_3_power10,VSU_power10") -; 10 cycle CR latency -(define_bypass 10 "power10-mul-compare" + "DU_even_power10,EXU_power10") +; 4 cycle MUL->MUL latency +(define_bypass 4 "power10-mul-compare" + "power10-mul,power10-mul-compare") +; 7 cycle CR latency +(define_bypass 7 "power10-mul-compare" "power10-crlogical,power10-mfcr,power10-mfcrf") -; Fixed point divides reserve the divide units for a minimum of 8 cycles -(define_insn_reservation "power10-idiv" 16 +(define_insn_reservation "power10-div" 12 (and (eq_attr "type" "div") - (eq_attr "size" "32") + (eq_attr "dot" "no") (eq_attr "cpu" "power10")) - "DU_even_power10,fx_div0_power10*8|fx_div1_power10*8") + "DU_any_power10,EXU_power10") -(define_insn_reservation "power10-ldiv" 24 +(define_insn_reservation "power10-div-compare" 12 (and (eq_attr "type" "div") - (eq_attr "size" "64") + (eq_attr "dot" "yes") (eq_attr "cpu" "power10")) - "DU_even_power10,fx_div0_power10*8|fx_div1_power10*8") + "DU_even_power10,EXU_power10") +; 14 cycle CR latency +(define_bypass 14 "power10-div-compare" + "power10-crlogical,power10-mfcr,power10-mfcrf") (define_insn_reservation "power10-crlogical" 2 (and (eq_attr "type" "cr_logical") (eq_attr "cpu" "power10")) - "DU_any_power10,VSU_power10") + 
"DU_any_power10,EXU_power10") (define_insn_reservation "power10-mfcrf" 2 (and (eq_attr "type" "mfcrf") (eq_attr "cpu" "power10")) - "DU_any_power10,VSU_power10") + "DU_any_power10,EXU_power10") -(define_insn_reservation "power10-mfcr" 6 +(define_insn_reservation "power10-mfcr" 3 (and (eq_attr "type" "mfcr") (eq_attr "cpu" "power10")) - "DU_C3_power10,VSU_power10") + "DU_even_power10,EXU_power10") ; Should differentiate between 1 cr field and > 1 since target of > 1 cr ; is cracked -(define_insn_reservation "power10-mtcr" 2 +(define_insn_reservation "power10-mtcr" 3 (and (eq_attr "type" "mtcr") (eq_attr "cpu" "power10")) - "DU_any_power10,VSU_power10") + "DU_any_power10,EXU_power10") -; Move to LR/CTR are executed in VSU -(define_insn_reservation "power10-mtjmpr" 5 +(define_insn_reservation "power10-mtjmpr" 3 (and (eq_attr "type" "mtjmpr") (eq_attr "cpu" "power10")) - "DU_any_power10,VSU_power10") + "DU_any_power10,EXU_power10") + +(define_insn_reservation "power10-mfjmpr" 2 + (and (eq_attr "type" "mfjmpr") + (eq_attr "cpu" "power10")) + "DU_any_power10,EXU_power10") + ; Floating point/Vector ops -(define_insn_reservation "power10-fpsimple" 2 + +(define_insn_reservation "power10-fpsimple" 3 (and (eq_attr "type" "fpsimple") (eq_attr "cpu" "power10")) - "DU_slice_3_power10,VSU_power10") + "DU_any_power10,EXU_power10") (define_insn_reservation "power10-fp" 5 (and (eq_attr "type" "fp,dmul") (eq_attr "cpu" "power10")) - "DU_slice_3_power10,VSU_power10") + "DU_any_power10,EXU_power10") (define_insn_reservation "power10-fpcompare" 3 (and (eq_attr "type" "fpcompare") (eq_attr "cpu" "power10")) - "DU_slice_3_power10,VSU_power10") + "DU_any_power10,EXU_power10") -; FP div/sqrt are executed in the VSU slices. They are not pipelined wrt other -; div/sqrt insns, but for the most part do not block pipelined ops. 
(define_insn_reservation "power10-sdiv" 22 (and (eq_attr "type" "sdiv") (eq_attr "cpu" "power10")) - "DU_slice_3_power10,VSU_power10,FP_DIV_power10") + "DU_any_power10,EXU_power10") (define_insn_reservation "power10-ddiv" 27 (and (eq_attr "type" "ddiv") (eq_attr "cpu" "power10")) - "DU_slice_3_power10,VSU_power10,FP_DIV_power10") + "DU_any_power10,EXU_power10") (define_insn_reservation "power10-sqrt" 26 (and (eq_attr "type" "ssqrt") (eq_attr "cpu" "power10")) - "DU_slice_3_power10,VSU_power10,FP_DIV_power10") + "DU_any_power10,EXU_power10") (define_insn_reservation "power10-dsqrt" 36 (and (eq_attr "type" "dsqrt") (eq_attr "cpu" "power10")) - "DU_slice_3_power10,VSU_power10,FP_DIV_power10") + "DU_any_power10,EXU_power10") (define_insn_reservation "power10-vec-2cyc" 2 (and (eq_attr "type" "vecmove,veclogical,vecexts,veccmpfx") (eq_attr "cpu" "power10")) - "DU_super_power10,VSU_super_power10") + "DU_any_power10,EXU_power10") (define_insn_reservation "power10-veccmp" 3 (and (eq_attr "type" "veccmp") (eq_attr "cpu" "power10")) - "DU_super_power10,VSU_super_power10") + "DU_any_power10,EXU_power10") -(define_insn_reservation "power10-vecsimple" 3 +(define_insn_reservation "power10-vecsimple" 2 (and (eq_attr "type" "vecsimple") (eq_attr "cpu" "power10")) - "DU_super_power10,VSU_super_power10") + "DU_any_power10,EXU_power10") -(define_insn_reservation "power10-vecnormal" 7 +(define_insn_reservation "power10-vecnormal" 5 (and (eq_attr "type" "vecfloat,vecdouble") (eq_attr "size" "!128") (eq_attr "cpu" "power10")) - "DU_super_power10,VSU_super_power10") + "DU_any_power10,EXU_power10") -; Quad-precision FP ops, execute in DFU (define_insn_reservation "power10-qp" 12 (and (eq_attr "type" "vecfloat,vecdouble") (eq_attr "size" "128") (eq_attr "cpu" "power10")) - "DU_super_power10,dfu_power10") + "DU_any_power10,EXU_power10") (define_insn_reservation "power10-vecperm" 3 (and (eq_attr "type" "vecperm") + (eq_attr "prefixed" "no") + (eq_attr "dot" "no") (eq_attr "cpu" "power10")) - "DU_super_power10,VSU_PRM_power10") + "DU_any_power10,EXU_power10") -(define_insn_reservation "power10-veccomplex" 7 +(define_insn_reservation "power10-vecperm-compare" 3 + (and (eq_attr "type" "vecperm") + (eq_attr "dot" "yes") + (eq_attr "cpu" "power10")) + "DU_even_power10,EXU_power10") + +(define_insn_reservation "power10-prefixed-vecperm" 3 + (and (eq_attr "type" "vecperm") + (eq_attr "prefixed" "yes") + (eq_attr "cpu" "power10")) + "DU_even_power10,EXU_power10") + +(define_insn_reservation "power10-veccomplex" 6 (and (eq_attr "type" "veccomplex") (eq_attr "cpu" "power10")) - "DU_super_power10,VSU_super_power10") + "DU_any_power10,EXU_power10") (define_insn_reservation "power10-vecfdiv" 24 (and (eq_attr "type" "vecfdiv") (eq_attr "cpu" "power10")) - "DU_super_power10,VSU_super_power10,VEC_DIV_power10") + "DU_any_power10,EXU_power10") (define_insn_reservation "power10-vecdiv" 27 (and (eq_attr "type" "vecdiv") (eq_attr "size" "!128") (eq_attr "cpu" "power10")) - "DU_super_power10,VSU_super_power10,VEC_DIV_power10") + "DU_any_power10,EXU_power10") -; Use 8 for DFU reservation on QP div/mul to limit DFA state size (define_insn_reservation "power10-qpdiv" 56 (and (eq_attr "type" "vecdiv") (eq_attr "size" "128") (eq_attr "cpu" "power10")) - "DU_super_power10,dfu_power10*8") + "DU_any_power10,EXU_power10") (define_insn_reservation "power10-qpmul" 24 (and (eq_attr "type" "qmul") (eq_attr "size" "128") (eq_attr "cpu" "power10")) - "DU_super_power10,dfu_power10*8") + "DU_any_power10,EXU_power10") (define_insn_reservation 
"power10-mtvsr" 2 (and (eq_attr "type" "mtvsr") (eq_attr "cpu" "power10")) - "DU_slice_3_power10,VSU_power10") + "DU_any_power10,EXU_power10") (define_insn_reservation "power10-mfvsr" 2 (and (eq_attr "type" "mfvsr") (eq_attr "cpu" "power10")) - "DU_slice_3_power10,VSU_power10") + "DU_any_power10,EXU_power10") -; Branch Unit -; Move from LR/CTR are executed in BRU but consume a writeback port from an -; execution slice. -(define_insn_reservation "power10-mfjmpr" 6 - (and (eq_attr "type" "mfjmpr") - (eq_attr "cpu" "power10")) - "DU_branch_power10,bru_power10+VSU_power10") - -; Branch is 2 cycles +; Branch +; Branch is 2 cycles, grouped with STU for issue (define_insn_reservation "power10-branch" 2 (and (eq_attr "type" "jmpreg,branch") (eq_attr "cpu" "power10")) - "DU_branch_power10,bru_power10") + "DU_any_power10,STU_power10") -; Crypto Unit -(define_insn_reservation "power10-crypto" 6 +; Crypto +(define_insn_reservation "power10-crypto" 4 (and (eq_attr "type" "crypto") (eq_attr "cpu" "power10")) - "DU_super_power10,cryptu_power10") - + "DU_any_power10,EXU_power10") -; HTM Unit -(define_insn_reservation "power10-htm" 4 - (and (eq_attr "type" "htm") - (eq_attr "cpu" "power10")) - "DU_C2_power10,LSU_power10") -(define_insn_reservation "power10-htm-simple" 2 - (and (eq_attr "type" "htmsimple") +; HTM +(define_insn_reservation "power10-htm" 2 + (and (eq_attr "type" "htmsimple,htm") (eq_attr "cpu" "power10")) - "DU_any_power10,VSU_power10") + "DU_any_power10,EXU_power10") -; DFP Unit +; DFP +; Use the minimum 12 cycle latency for all DFP insns (define_insn_reservation "power10-dfp" 12 (and (eq_attr "type" "dfp") + (eq_attr "size" "!128") + (eq_attr "cpu" "power10")) + "DU_any_power10,EXU_power10") + +(define_insn_reservation "power10-dfpq" 12 + (and (eq_attr "type" "dfp") + (eq_attr "size" "128") (eq_attr "cpu" "power10")) - "DU_even_power10,dfu_power10") + "DU_even_power10,EXU_power10") + +; MMA +(define_insn_reservation "power10-mma" 9 + (and (eq_attr "type" "mma") + (eq_attr "prefixed" "no") + (eq_attr "cpu" "power10")) + "DU_any_power10,EXU_super_power10") + +(define_insn_reservation "power10-prefixed-mma" 9 + (and (eq_attr "type" "mma") + (eq_attr "prefixed" "yes") + (eq_attr "cpu" "power10")) + "DU_even_power10,EXU_super_power10") +; 4 cycle MMA->MMA latency +(define_bypass 4 "power10-mma,power10-prefixed-mma" + "power10-mma,power10-prefixed-mma") + diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 46ddf49..712dd1c 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -1080,6 +1080,26 @@ struct processor_costs power9_cost = { COSTS_N_INSNS (3), /* SF->DF convert */ }; +/* Instruction costs on POWER10 processors. */ +static const +struct processor_costs power10_cost = { + COSTS_N_INSNS (2), /* mulsi */ + COSTS_N_INSNS (2), /* mulsi_const */ + COSTS_N_INSNS (2), /* mulsi_const9 */ + COSTS_N_INSNS (2), /* muldi */ + COSTS_N_INSNS (6), /* divsi */ + COSTS_N_INSNS (6), /* divdi */ + COSTS_N_INSNS (2), /* fp */ + COSTS_N_INSNS (2), /* dmul */ + COSTS_N_INSNS (11), /* sdiv */ + COSTS_N_INSNS (13), /* ddiv */ + 128, /* cache line size */ + 32, /* l1 cache */ + 512, /* l2 cache */ + 16, /* prefetch streams */ + COSTS_N_INSNS (2), /* SF->DF convert */ +}; + /* Instruction costs on POWER A2 processors. 
*/ static const struct processor_costs ppca2_cost = { @@ -4774,10 +4794,13 @@ rs6000_option_override_internal (bool global_init_p) break; case PROCESSOR_POWER9: - case PROCESSOR_POWER10: rs6000_cost = &power9_cost; break; + case PROCESSOR_POWER10: + rs6000_cost = &power10_cost; + break; + case PROCESSOR_PPCA2: rs6000_cost = &ppca2_cost; break; @@ -18443,8 +18466,9 @@ rs6000_issue_rate (void) case PROCESSOR_POWER8: return 7; case PROCESSOR_POWER9: - case PROCESSOR_POWER10: return 6; + case PROCESSOR_POWER10: + return 8; default: return 1; } -- cgit v1.1 From 19ff0b0d816e6e7d7657a8559e9957d79dc1d77f Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Sat, 20 Mar 2021 05:17:36 -0700 Subject: x86: Check cfun != NULL before accessing silent_p Since construct_container may be called with cfun == NULL, check cfun != NULL before accessing silent_p. gcc/ PR target/99679 * config/i386/i386.c (construct_container): Check cfun != NULL before accessing silent_p. gcc/testsuite/ PR target/99679 * g++.target/i386/pr99679-1.C: New test. * g++.target/i386/pr99679-2.C: Likewise. --- gcc/config/i386/i386.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 7143490..7c41302 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -2540,7 +2540,7 @@ construct_container (machine_mode mode, machine_mode orig_mode, { /* Return early if we shouldn't raise an error for invalid calls. */ - if (cfun->machine->silent_p) + if (cfun != NULL && cfun->machine->silent_p) return NULL; if (in_return) { @@ -2568,7 +2568,7 @@ construct_container (machine_mode mode, machine_mode orig_mode, { /* Return early if we shouldn't raise an error for invalid calls. */ - if (cfun->machine->silent_p) + if (cfun != NULL && cfun->machine->silent_p) return NULL; if (!issued_x87_ret_error) { -- cgit v1.1 From d0a5e8e1a84bdd6ce915c3be65da8af2552cd49e Mon Sep 17 00:00:00 2001 From: Xionghu Luo Date: Sun, 21 Mar 2021 21:14:02 -0500 Subject: rs6000: Convert the vector set variable idx to DImode [PR98914] vec_insert defines the element argument type to be signed int by ELFv2 ABI. When expanding a vector with a variable rtx, convert the rtx type to DImode to support both intrinsic usage and other callers from rs6000_expand_vector_init produced by v[k] = val when k is long type. gcc/ChangeLog: 2021-03-21 Xionghu Luo PR target/98914 * config/rs6000/rs6000.c (rs6000_expand_vector_set_var_p9): Convert idx to DImode. (rs6000_expand_vector_set_var_p8): Likewise. gcc/testsuite/ChangeLog: 2021-03-21 Xionghu Luo PR target/98914 * gcc.target/powerpc/pr98914.c: New test. --- gcc/config/rs6000/rs6000.c | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 712dd1c..616dae3 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -7026,21 +7026,21 @@ rs6000_expand_vector_set_var_p9 (rtx target, rtx val, rtx idx) gcc_assert (VECTOR_MEM_VSX_P (mode) && !CONST_INT_P (idx)); - gcc_assert (GET_MODE (idx) == E_SImode); - machine_mode inner_mode = GET_MODE (val); - rtx tmp = gen_reg_rtx (GET_MODE (idx)); int width = GET_MODE_SIZE (inner_mode); gcc_assert (width >= 1 && width <= 8); int shift = exact_log2 (width); + + machine_mode idx_mode = GET_MODE (idx); + idx = convert_modes (DImode, idx_mode, idx, 1); + /* Generate the IDX for permute shift, width is the vector element size. idx = idx * width. 
*/ - emit_insn (gen_ashlsi3 (tmp, idx, GEN_INT (shift))); - - tmp = convert_modes (DImode, SImode, tmp, 1); + rtx tmp = gen_reg_rtx (DImode); + emit_insn (gen_ashldi3 (tmp, idx, GEN_INT (shift))); /* lvsr v1,0,idx. */ rtx pcvr = gen_reg_rtx (V16QImode); @@ -7073,28 +7073,26 @@ rs6000_expand_vector_set_var_p8 (rtx target, rtx val, rtx idx) gcc_assert (VECTOR_MEM_VSX_P (mode) && !CONST_INT_P (idx)); - gcc_assert (GET_MODE (idx) == E_SImode); - machine_mode inner_mode = GET_MODE (val); HOST_WIDE_INT mode_mask = GET_MODE_MASK (inner_mode); - rtx tmp = gen_reg_rtx (GET_MODE (idx)); int width = GET_MODE_SIZE (inner_mode); - gcc_assert (width >= 1 && width <= 4); + int shift = exact_log2 (width); + + machine_mode idx_mode = GET_MODE (idx); + idx = convert_modes (DImode, idx_mode, idx, 1); + + /* idx = idx * width. */ + rtx tmp = gen_reg_rtx (DImode); + emit_insn (gen_ashldi3 (tmp, idx, GEN_INT (shift))); + + /* For LE: idx = idx + 8. */ if (!BYTES_BIG_ENDIAN) - { - /* idx = idx * width. */ - emit_insn (gen_mulsi3 (tmp, idx, GEN_INT (width))); - /* idx = idx + 8. */ - emit_insn (gen_addsi3 (tmp, tmp, GEN_INT (8))); - } + emit_insn (gen_adddi3 (tmp, tmp, GEN_INT (8))); else - { - emit_insn (gen_mulsi3 (tmp, idx, GEN_INT (width))); - emit_insn (gen_subsi3 (tmp, GEN_INT (24 - width), tmp)); - } + emit_insn (gen_subdi3 (tmp, GEN_INT (24 - width), tmp)); /* lxv vs33, mask. DImode: 0xffffffffffffffff0000000000000000 @@ -7144,7 +7142,6 @@ rs6000_expand_vector_set_var_p8 (rtx target, rtx val, rtx idx) emit_insn (gen_rtx_SET (val_v16qi, sub_val)); /* lvsl 13,0,idx. */ - tmp = convert_modes (DImode, SImode, tmp, 1); rtx pcv = gen_reg_rtx (V16QImode); emit_insn (gen_altivec_lvsl_reg (pcv, tmp)); -- cgit v1.1 From 0ec7641ee1823a73b560e2ed2518bf728ac9e22e Mon Sep 17 00:00:00 2001 From: Kewen Lin Date: Thu, 18 Mar 2021 01:22:59 -0500 Subject: rs6000: Fix some unexpected empty split conditions This patch is to fix empty split-conditions of some define_insn_and_split definitions where their conditions for define_insn part aren't empty. As Segher and Mike pointed out, they can sometimes lead to unexpected consequences. Bootstrapped/regtested on powerpc64le-linux-gnu P9 and powerpc64-linux-gnu P8. gcc/ChangeLog: * config/rs6000/rs6000.md (*rotldi3_insert_sf, *movcc_p9, floatsi2_lfiwax, floatsi2_lfiwax_mem, floatunssi2_lfiwzx, floatunssi2_lfiwzx_mem, *floatsidf2_internal, *floatunssidf2_internal, fix_truncsi2_stfiwx, fix_truncsi2_internal, fixuns_truncsi2_stfiwx, *round322_fprs, *roundu322_fprs, *fix_truncsi2_internal): Fix empty split condition. * config/rs6000/vsx.md (*vsx_le_undo_permute_, vsx_reduc__v2df, vsx_reduc__v4sf, *vsx_reduc__v2df_scalar, *vsx_reduc__v4sf_scalar): Likewise. 
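To make the fix concrete, here is a purely illustrative C sketch (hypothetical helper names, not GCC-generated code) of how an insn condition such as "TARGET_P9_MINMAX" combines with the split condition of a define_insn_and_split: an empty split condition drops the insn predicate entirely, while the "&& 1" form keeps it ANDed in, so the splitter can only fire where the insn itself was valid.

/* Illustrative sketch only; hypothetical helpers, not generated GCC code.  */
static int
insn_condition (int target_p9_minmax)
{
  return target_p9_minmax;
}

/* Split condition ""   -> the insn predicate is not consulted at all.  */
static int
split_condition_empty (int target_p9_minmax)
{
  (void) target_p9_minmax;
  return 1;
}

/* Split condition "&& 1" -> the insn predicate is ANDed back in.  */
static int
split_condition_and_1 (int target_p9_minmax)
{
  return insn_condition (target_p9_minmax) && 1;
}
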
--- gcc/config/rs6000/rs6000.md | 28 ++++++++++++++-------------- gcc/config/rs6000/vsx.md | 10 +++++----- 2 files changed, 19 insertions(+), 19 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index c0d7b1a..c71d343 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -4286,7 +4286,7 @@ (clobber (match_scratch:V4SF 4))] "TARGET_POWERPC64 && INTVAL (operands[2]) == " "#" - "" + "&& 1" [(parallel [(set (match_dup 5) (zero_extend:DI (unspec:QHSI [(match_dup 3)] UNSPEC_SI_FROM_SF))) (clobber (match_dup 4))]) @@ -5332,7 +5332,7 @@ (clobber (match_scratch:V2DI 6 "=0,&wa"))] "TARGET_P9_MINMAX" "#" - "" + "&& 1" [(set (match_dup 6) (if_then_else:V2DI (match_dup 1) (match_dup 7) @@ -5441,7 +5441,7 @@ "TARGET_HARD_FLOAT && TARGET_LFIWAX && && can_create_pseudo_p ()" "#" - "" + "&& 1" [(pc)] { rtx dest = operands[0]; @@ -5481,7 +5481,7 @@ (clobber (match_scratch:DI 2 "=d,wa"))] "TARGET_HARD_FLOAT && TARGET_LFIWAX && " "#" - "" + "&& 1" [(pc)] { operands[1] = rs6000_force_indexed_or_indirect_mem (operands[1]); @@ -5516,7 +5516,7 @@ (clobber (match_scratch:DI 2 "=d,wa"))] "TARGET_HARD_FLOAT && TARGET_LFIWZX && " "#" - "" + "&& 1" [(pc)] { rtx dest = operands[0]; @@ -5556,7 +5556,7 @@ (clobber (match_scratch:DI 2 "=d,wa"))] "TARGET_HARD_FLOAT && TARGET_LFIWZX && " "#" - "" + "&& 1" [(pc)] { operands[1] = rs6000_force_indexed_or_indirect_mem (operands[1]); @@ -5621,7 +5621,7 @@ (clobber (match_operand:SI 6 "gpc_reg_operand" "=&r"))] "!TARGET_FCFID && TARGET_HARD_FLOAT" "#" - "" + "&& 1" [(pc)] { rtx lowword, highword; @@ -5711,7 +5711,7 @@ "!TARGET_FCFIDU && TARGET_HARD_FLOAT && !(TARGET_FCFID && TARGET_POWERPC64)" "#" - "" + "&& 1" [(pc)] { rtx lowword, highword; @@ -5867,7 +5867,7 @@ "TARGET_HARD_FLOAT && TARGET_STFIWX && can_create_pseudo_p () && !(TARGET_P8_VECTOR && TARGET_DIRECT_MOVE)" "#" - "" + "&& 1" [(pc)] { rtx dest = operands[0]; @@ -5909,7 +5909,7 @@ "TARGET_HARD_FLOAT && !(TARGET_P8_VECTOR && TARGET_DIRECT_MOVE)" "#" - "" + "&& 1" [(pc)] { rtx lowword; @@ -6015,7 +6015,7 @@ && TARGET_STFIWX && can_create_pseudo_p () && !TARGET_P8_VECTOR" "#" - "" + "&& 1" [(pc)] { rtx dest = operands[0]; @@ -6235,7 +6235,7 @@ && && TARGET_LFIWAX && TARGET_STFIWX && TARGET_FCFID && !TARGET_DIRECT_MOVE && can_create_pseudo_p ()" "#" - "" + "&& 1" [(pc)] { rtx dest = operands[0]; @@ -6268,7 +6268,7 @@ && TARGET_LFIWZX && TARGET_STFIWX && TARGET_FCFIDU && !TARGET_DIRECT_MOVE && can_create_pseudo_p ()" "#" - "" + "&& 1" [(pc)] { rtx dest = operands[0]; @@ -8251,7 +8251,7 @@ (clobber (match_operand:DI 5 "offsettable_mem_operand" "=o"))] "TARGET_HARD_FLOAT && TARGET_LONG_DOUBLE_128" "#" - "" + "&& 1" [(pc)] { rtx lowword; diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md index a1fa4f9..4404407 100644 --- a/gcc/config/rs6000/vsx.md +++ b/gcc/config/rs6000/vsx.md @@ -972,7 +972,7 @@ "@ # xxlor %x0,%x1" - "" + "&& 1" [(set (match_dup 0) (match_dup 1))] { if (reload_completed && REGNO (operands[0]) == REGNO (operands[1])) @@ -4656,7 +4656,7 @@ (clobber (match_scratch:V2DF 2 "=0,&wa"))] "VECTOR_UNIT_VSX_P (V2DFmode)" "#" - "" + "&& 1" [(const_int 0)] { rtx tmp = (GET_CODE (operands[2]) == SCRATCH) @@ -4678,7 +4678,7 @@ (clobber (match_scratch:V4SF 3 "=&wa"))] "VECTOR_UNIT_VSX_P (V4SFmode)" "#" - "" + "&& 1" [(const_int 0)] { rtx op0 = operands[0]; @@ -4726,7 +4726,7 @@ (clobber (match_scratch:DF 2 "=0,&wa"))] "BYTES_BIG_ENDIAN && VECTOR_UNIT_VSX_P (V2DFmode)" "#" - "" + "&& 1" [(const_int 0)] { rtx hi = gen_highpart 
(DFmode, operands[1]); @@ -4753,7 +4753,7 @@ (clobber (match_scratch:V4SF 4 "=0"))] "BYTES_BIG_ENDIAN && VECTOR_UNIT_VSX_P (V4SFmode)" "#" - "" + "&& 1" [(const_int 0)] { rtx op0 = operands[0]; -- cgit v1.1 From 540dace2ed3949571f2ce6cb007354e69bda0cb2 Mon Sep 17 00:00:00 2001 From: Kito Cheng Date: Mon, 22 Mar 2021 16:32:45 +0800 Subject: PR target/99702: Check RTL type before get value gcc/ChangeLog: PR target/99702 * config/riscv/riscv.c (riscv_expand_block_move): Get RTL value after type checking. gcc/testsuite/ChangeLog: PR target/99702 * gcc.target/riscv/pr99702.c: New. --- gcc/config/riscv/riscv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/riscv/riscv.c b/gcc/config/riscv/riscv.c index 96fc0c0..de8308c 100644 --- a/gcc/config/riscv/riscv.c +++ b/gcc/config/riscv/riscv.c @@ -3259,9 +3259,9 @@ riscv_block_move_loop (rtx dest, rtx src, unsigned HOST_WIDE_INT length, bool riscv_expand_block_move (rtx dest, rtx src, rtx length) { - unsigned HOST_WIDE_INT hwi_length = UINTVAL (length); if (CONST_INT_P (length)) { + unsigned HOST_WIDE_INT hwi_length = UINTVAL (length); unsigned HOST_WIDE_INT factor, align; align = MIN (MIN (MEM_ALIGN (src), MEM_ALIGN (dest)), BITS_PER_WORD); -- cgit v1.1 From 150a829accd76ddd73c20628774cb0781f6e8bfe Mon Sep 17 00:00:00 2001 From: Alex Coplan Date: Mon, 22 Mar 2021 14:43:15 +0000 Subject: arm: Fix MVE ICEs with vector moves and -mpure-code [PR97252] This fixes around 500 ICEs in the testsuite which can be seen when testing with -march=armv8.1-m.main+mve -mfloat-abi=hard -mpure-code (leaving the testsuite free of ICEs in this configuration). All of the ICEs are in arm_print_operand (which is expecting a mem and gets another rtx, e.g. a const_vector) when running the output code for *mve_mov in alternative 4. The issue is that MVE vector moves were relying on the arm_reorg pass to move constant vectors that we can't easily synthesize to the literal pool. This doesn't work for -mpure-code where the literal pool is disabled. LLVM puts these in .rodata: I've chosen to do the same here. With this change, for -mpure-code, we no longer want to allow a constant on the RHS of a vector load in RA. To achieve this, I added a new constraint which matches constants only if the literal pool is available. gcc/ChangeLog: PR target/97252 * config/arm/arm-protos.h (neon_make_constant): Add generate argument to guard emitting insns, default to true. * config/arm/arm.c (arm_legitimate_constant_p_1): Reject CONST_VECTORs which neon_make_constant can't handle. (neon_vdup_constant): Add generate argument, avoid emitting insns if it's not set. (neon_make_constant): Plumb new generate argument through. * config/arm/constraints.md (Ui): New. Use it... * config/arm/mve.md (*mve_mov): ... here. * config/arm/vec-common.md (movv8hf): Use neon_make_constant to synthesize constants. 
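A reduced example of the kind of constant that exercises the new path is sketched below; it is a hypothetical illustration, not one of the testsuite cases mentioned above. Built with -march=armv8.1-m.main+mve -mfloat-abi=hard -mpure-code, the initializer cannot be synthesized by a single VMOV/VDUP, so it now has to be loaded from .rodata rather than from the disabled literal pool.

/* Hypothetical reduced example; not a committed testcase.  */
typedef unsigned int v4si __attribute__ ((vector_size (16)));

v4si
load_const_vector (void)
{
  /* Not splattable, so neon_make_constant cannot use VDUP/VMOV and the
     constant must come from memory (.rodata under -mpure-code).  */
  return (v4si) { 1u, 2u, 3u, 4u };
}
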
--- gcc/config/arm/arm-protos.h | 2 +- gcc/config/arm/arm.c | 32 +++++++++++++++++++++----------- gcc/config/arm/constraints.md | 7 +++++++ gcc/config/arm/mve.md | 2 +- gcc/config/arm/vec-common.md | 5 +++++ 5 files changed, 35 insertions(+), 13 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h index bb5d3a2..952a825 100644 --- a/gcc/config/arm/arm-protos.h +++ b/gcc/config/arm/arm-protos.h @@ -101,7 +101,7 @@ extern char *neon_output_shift_immediate (const char *, char, rtx *, machine_mode, int, bool); extern void neon_pairwise_reduce (rtx, rtx, machine_mode, rtx (*) (rtx, rtx, rtx)); -extern rtx neon_make_constant (rtx); +extern rtx neon_make_constant (rtx, bool generate = true); extern tree arm_builtin_vectorized_function (unsigned int, tree, tree); extern void neon_expand_vector_init (rtx, rtx); extern void neon_lane_bounds (rtx, HOST_WIDE_INT, HOST_WIDE_INT, const_tree); diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 49635bc..e89f5e2 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -9461,6 +9461,9 @@ arm_tls_referenced_p (rtx x) static bool arm_legitimate_constant_p_1 (machine_mode, rtx x) { + if (GET_CODE (x) == CONST_VECTOR && !neon_make_constant (x, false)) + return false; + return flag_pic || !label_mentioned_p (x); } @@ -13025,12 +13028,14 @@ neon_pairwise_reduce (rtx op0, rtx op1, machine_mode mode, } } -/* If VALS is a vector constant that can be loaded into a register - using VDUP, generate instructions to do so and return an RTX to - assign to the register. Otherwise return NULL_RTX. */ +/* Return a non-NULL RTX iff VALS is a vector constant that can be + loaded into a register using VDUP. + + If this is the case, and GENERATE is set, we also generate + instructions to do this and return an RTX to assign to the register. */ static rtx -neon_vdup_constant (rtx vals) +neon_vdup_constant (rtx vals, bool generate) { machine_mode mode = GET_MODE (vals); machine_mode inner_mode = GET_MODE_INNER (mode); @@ -13046,6 +13051,9 @@ neon_vdup_constant (rtx vals) vdup.i16). */ return NULL_RTX; + if (!generate) + return x; + /* We can load this constant by using VDUP and a constant in a single ARM register. This will be cheaper than a vector load. */ @@ -13054,13 +13062,15 @@ neon_vdup_constant (rtx vals) return gen_vec_duplicate (mode, x); } -/* Generate code to load VALS, which is a PARALLEL containing only - constants (for vec_init) or CONST_VECTOR, efficiently into a - register. Returns an RTX to copy into the register, or NULL_RTX - for a PARALLEL that cannot be converted into a CONST_VECTOR. */ +/* Return a non-NULL RTX iff VALS, which is a PARALLEL containing only + constants (for vec_init) or CONST_VECTOR, can be effeciently loaded + into a register. + + If this is the case, and GENERATE is set, we also generate code to do + this and return an RTX to copy into the register. */ rtx -neon_make_constant (rtx vals) +neon_make_constant (rtx vals, bool generate) { machine_mode mode = GET_MODE (vals); rtx target; @@ -13092,7 +13102,7 @@ neon_make_constant (rtx vals) && simd_immediate_valid_for_move (const_vec, mode, NULL, NULL)) /* Load using VMOV. On Cortex-A8 this takes one cycle. */ return const_vec; - else if ((target = neon_vdup_constant (vals)) != NULL_RTX) + else if ((target = neon_vdup_constant (vals, generate)) != NULL_RTX) /* Loaded using VDUP. On Cortex-A8 the VDUP takes one NEON pipeline cycle; creating the constant takes one or two ARM pipeline cycles. 
*/ @@ -13102,7 +13112,7 @@ neon_make_constant (rtx vals) (for either double or quad vectors). We cannot take advantage of single-cycle VLD1 because we need a PC-relative addressing mode. */ - return const_vec; + return arm_disable_literal_pool ? NULL_RTX : const_vec; else /* A PARALLEL containing something not valid inside CONST_VECTOR. We cannot construct an initializer. */ diff --git a/gcc/config/arm/constraints.md b/gcc/config/arm/constraints.md index de0ca8e..a5a19a7 100644 --- a/gcc/config/arm/constraints.md +++ b/gcc/config/arm/constraints.md @@ -506,6 +506,13 @@ && mve_vector_mem_operand (GET_MODE (op), XEXP (op, 0), true)"))) +(define_constraint "Ui" + "@internal + Match a constant (as per the 'i' constraint) provided that we have the + literal pool available. This is useful for load insns that would need + to move such constants to the literal pool after RA." + (match_test "!arm_disable_literal_pool && satisfies_constraint_i (op)")) + (define_memory_constraint "Uq" "@internal In ARM state an address valid in ldrsb instructions." diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index ec0ef7b..440fd6a 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -19,7 +19,7 @@ (define_insn "*mve_mov" [(set (match_operand:MVE_types 0 "nonimmediate_operand" "=w,w,r,w,w,r,w,Ux,w") - (match_operand:MVE_types 1 "general_operand" "w,r,w,Dn,Uxi,r,Dm,w,Ul"))] + (match_operand:MVE_types 1 "general_operand" "w,r,w,Dn,UxUi,r,Dm,w,Ul"))] "TARGET_HAVE_MVE || TARGET_HAVE_MVE_FLOAT" { if (which_alternative == 3 || which_alternative == 6) diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md index d7011c6..0e13187 100644 --- a/gcc/config/arm/vec-common.md +++ b/gcc/config/arm/vec-common.md @@ -74,6 +74,11 @@ { if (!REG_P (operands[0])) operands[1] = force_reg (E_V8HFmode, operands[1]); + else if (TARGET_HAVE_MVE_FLOAT && CONSTANT_P (operands[1])) + { + operands[1] = neon_make_constant (operands[1]); + gcc_assert (operands[1] != NULL_RTX); + } } }) -- cgit v1.1 From 02f2dc441b1954736cc61e3f97687cd23d5586c5 Mon Sep 17 00:00:00 2001 From: "Vladimir N. Makarov" Date: Mon, 22 Mar 2021 13:34:50 -0400 Subject: [PR99581] Define relaxed memory and use it for aarch64 aarch64 needs to skip memory address validation for LD1R insns. Skipping the address validation may result in LRA crash for some targets when usual memory constraint is used. This patch introduces define_relaxed_memory_constraint, skipping address validation for it, and defining relaxed memory for aarch64 LD1r insn memory operand. gcc/ChangeLog: PR target/99581 * config/aarch64/constraints.md (UtQ): Use define_relaxed_memory_constraint for it. * doc/md.texi (define_relaxed_memory_constraint): Describe it. * genoutput.c (main): Process DEFINE_RELAXED_MEMORY_CONSTRAINT. * genpreds.c (constraint_data): Add bitfield is_relaxed_memory. (have_relaxed_memory_constraints): New static var. (relaxed_memory_start, relaxed_memory_end): Ditto. (add_constraint): Add arg is_relaxed_memory. Check name for relaxed memory. Set up is_relaxed_memory in constraint_data and have_relaxed_memory_constraints. Adjust calls. (choose_enum_order): Process relaxed memory. (write_tm_preds_h): Ditto. (main): Process DEFINE_RELAXED_MEMORY_CONSTRAINT. * gensupport.c (process_rtx): Process DEFINE_RELAXED_MEMORY_CONSTRAINT. * ira-costs.c (record_reg_classes): Process CT_RELAXED_MEMORY. * ira-lives.c (single_reg_class): Use insn_extra_relaxed_memory_constraint. * ira.c (ira_setup_alts): CT_RELAXED_MEMORY. 
* lra-constraints.c (valid_address_p): Use insn_extra_relaxed_memory_constraint instead of other memory constraints. (process_alt_operands): Process CT_RELAXED_MEMORY. (curr_insn_transform): Use insn_extra_relaxed_memory_constraint. * recog.c (asm_operand_ok, preprocess_constraints): Process CT_RELAXED_MEMORY. * reload.c (find_reloads): Ditto. * rtl.def (DEFINE_RELAXED_MEMORY_CONSTRAINT): New. * stmt.c (parse_input_constraint): Use insn_extra_relaxed_memory_constraint. gcc/testsuite/ChangeLog: PR target/99581 * gcc.target/powerpc/pr99581.c: New. --- gcc/config/aarch64/constraints.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md index 09c2b72..f08eea8b 100644 --- a/gcc/config/aarch64/constraints.md +++ b/gcc/config/aarch64/constraints.md @@ -330,7 +330,7 @@ (match_test "aarch64_legitimate_address_p (V2DImode, XEXP (op, 0), 1)"))) -(define_memory_constraint "UtQ" +(define_relaxed_memory_constraint "UtQ" "@internal An address valid for SVE LD1RQs." (and (match_code "mem") -- cgit v1.1 From a9604fcbb29e457a4824a496905057dcc2e5d78a Mon Sep 17 00:00:00 2001 From: Marcus Comstedt Date: Fri, 19 Mar 2021 20:49:03 +0100 Subject: RISC-V: Support -mlittle-endian and -mbig-endian gcc/ * config/riscv/elf.h (LINK_SPEC): Pass linker endianness flag. * config/riscv/freebsd.h (LINK_SPEC): Likewise. * config/riscv/linux.h (LINK_SPEC): Likewise. * config/riscv/riscv.h (ASM_SPEC): Pass -mbig-endian and -mlittle-endian. (BYTES_BIG_ENDIAN): Handle big endian. (WORDS_BIG_ENDIAN): Define to BYTES_BIG_ENDIAN. * config/riscv/riscv.opt (-mbig-endian, -mlittle-endian): New options. * doc/invoke.texi (-mbig-endian, -mlittle-endian): Document. --- gcc/config/riscv/elf.h | 2 ++ gcc/config/riscv/freebsd.h | 2 ++ gcc/config/riscv/linux.h | 2 ++ gcc/config/riscv/riscv.h | 6 ++++-- gcc/config/riscv/riscv.opt | 8 ++++++++ 5 files changed, 18 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/riscv/elf.h b/gcc/config/riscv/elf.h index d136d46..973efda 100644 --- a/gcc/config/riscv/elf.h +++ b/gcc/config/riscv/elf.h @@ -20,6 +20,8 @@ along with GCC; see the file COPYING3. If not see #define LINK_SPEC "\ -melf" XLEN_SPEC "lriscv \ %{mno-relax:--no-relax} \ +%{mbig-endian:-EB} \ +%{mlittle-endian:-EL} \ %{shared}" /* Link against Newlib libraries, because the ELF backend assumes Newlib. diff --git a/gcc/config/riscv/freebsd.h b/gcc/config/riscv/freebsd.h index a48bf9b..f3aca9f 100644 --- a/gcc/config/riscv/freebsd.h +++ b/gcc/config/riscv/freebsd.h @@ -44,6 +44,8 @@ along with GCC; see the file COPYING3. If not see %{p:%nconsider using `-pg' instead of `-p' with gprof (1)} \ %{v:-V} \ %{assert*} %{R*} %{rpath*} %{defsym*} \ + %{mbig-endian:-EB} \ + %{mlittle-endian:-EL} \ %{shared:-Bshareable %{h*} %{soname*}} \ %{symbolic:-Bsymbolic} \ %{static:-Bstatic} \ diff --git a/gcc/config/riscv/linux.h b/gcc/config/riscv/linux.h index 9238de5..e74f5d3 100644 --- a/gcc/config/riscv/linux.h +++ b/gcc/config/riscv/linux.h @@ -60,6 +60,8 @@ along with GCC; see the file COPYING3. 
If not see #define LINK_SPEC "\ -melf" XLEN_SPEC "lriscv" LD_EMUL_SUFFIX " \ %{mno-relax:--no-relax} \ +%{mbig-endian:-EB} \ +%{mlittle-endian:-EL} \ %{shared} \ %{!shared: \ %{!static: \ diff --git a/gcc/config/riscv/riscv.h b/gcc/config/riscv/riscv.h index c6f8bee..0b667d2 100644 --- a/gcc/config/riscv/riscv.h +++ b/gcc/config/riscv/riscv.h @@ -91,6 +91,8 @@ extern const char *riscv_default_mtune (int argc, const char **argv); %{" FPIE_OR_FPIC_SPEC ":-fpic} \ %{march=*} \ %{mabi=*} \ +%{mbig-endian} \ +%{mlittle-endian} \ %(subtarget_asm_spec)" \ ASM_MISA_SPEC @@ -126,8 +128,8 @@ ASM_MISA_SPEC /* Target machine storage layout */ #define BITS_BIG_ENDIAN 0 -#define BYTES_BIG_ENDIAN 0 -#define WORDS_BIG_ENDIAN 0 +#define BYTES_BIG_ENDIAN (TARGET_BIG_ENDIAN != 0) +#define WORDS_BIG_ENDIAN (BYTES_BIG_ENDIAN) #define MAX_BITS_PER_WORD 64 diff --git a/gcc/config/riscv/riscv.opt b/gcc/config/riscv/riscv.opt index 761a09d..e294e22 100644 --- a/gcc/config/riscv/riscv.opt +++ b/gcc/config/riscv/riscv.opt @@ -21,6 +21,14 @@ HeaderInclude config/riscv/riscv-opts.h +mbig-endian +Target RejectNegative Mask(BIG_ENDIAN) +Assume target CPU is configured as big endian. + +mlittle-endian +Target RejectNegative InverseMask(BIG_ENDIAN) +Assume target CPU is configured as little endian. + mbranch-cost= Target RejectNegative Joined UInteger Var(riscv_branch_cost) -mbranch-cost=N Set the cost of branches to roughly N instructions. -- cgit v1.1 From cd1e2f632532d04c67a17820cb25826f8b0febac Mon Sep 17 00:00:00 2001 From: Marcus Comstedt Date: Fri, 19 Mar 2021 20:49:04 +0100 Subject: RISC-V: Add riscv{32,64}be with big endian as default gcc/ * common/config/riscv/riscv-common.c (TARGET_DEFAULT_TARGET_FLAGS): Set default endianness. * config.gcc (riscv32be-*, riscv64be-*): Set TARGET_BIG_ENDIAN_DEFAULT to 1. * config/riscv/elf.h (LINK_SPEC): Change -melf* value depending on default endianness. * config/riscv/freebsd.h (LINK_SPEC): Likewise. * config/riscv/linux.h (LINK_SPEC): Likewise. * config/riscv/riscv.c (TARGET_DEFAULT_TARGET_FLAGS): Set default endianness. * config/riscv/riscv.h (DEFAULT_ENDIAN_SPEC): New macro. --- gcc/config/riscv/elf.h | 2 +- gcc/config/riscv/freebsd.h | 2 +- gcc/config/riscv/linux.h | 2 +- gcc/config/riscv/riscv.c | 5 +++++ gcc/config/riscv/riscv.h | 6 ++++++ 5 files changed, 14 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/riscv/elf.h b/gcc/config/riscv/elf.h index 973efda..7e65e49 100644 --- a/gcc/config/riscv/elf.h +++ b/gcc/config/riscv/elf.h @@ -18,7 +18,7 @@ along with GCC; see the file COPYING3. If not see . */ #define LINK_SPEC "\ --melf" XLEN_SPEC "lriscv \ +-melf" XLEN_SPEC DEFAULT_ENDIAN_SPEC "riscv \ %{mno-relax:--no-relax} \ %{mbig-endian:-EB} \ %{mlittle-endian:-EL} \ diff --git a/gcc/config/riscv/freebsd.h b/gcc/config/riscv/freebsd.h index f3aca9f..6018e7b 100644 --- a/gcc/config/riscv/freebsd.h +++ b/gcc/config/riscv/freebsd.h @@ -40,7 +40,7 @@ along with GCC; see the file COPYING3. If not see #undef LINK_SPEC #define LINK_SPEC " \ - -melf" XLEN_SPEC "lriscv \ + -melf" XLEN_SPEC DEFAULT_ENDIAN_SPEC "riscv \ %{p:%nconsider using `-pg' instead of `-p' with gprof (1)} \ %{v:-V} \ %{assert*} %{R*} %{rpath*} %{defsym*} \ diff --git a/gcc/config/riscv/linux.h b/gcc/config/riscv/linux.h index e74f5d3..fce5b89 100644 --- a/gcc/config/riscv/linux.h +++ b/gcc/config/riscv/linux.h @@ -58,7 +58,7 @@ along with GCC; see the file COPYING3. 
If not see "%{mabi=ilp32:_ilp32}" #define LINK_SPEC "\ --melf" XLEN_SPEC "lriscv" LD_EMUL_SUFFIX " \ +-melf" XLEN_SPEC DEFAULT_ENDIAN_SPEC "riscv" LD_EMUL_SUFFIX " \ %{mno-relax:--no-relax} \ %{mbig-endian:-EB} \ %{mlittle-endian:-EL} \ diff --git a/gcc/config/riscv/riscv.c b/gcc/config/riscv/riscv.c index de8308c..99b8438 100644 --- a/gcc/config/riscv/riscv.c +++ b/gcc/config/riscv/riscv.c @@ -5526,6 +5526,11 @@ riscv_asan_shadow_offset (void) #undef TARGET_ASAN_SHADOW_OFFSET #define TARGET_ASAN_SHADOW_OFFSET riscv_asan_shadow_offset +#ifdef TARGET_BIG_ENDIAN_DEFAULT +#undef TARGET_DEFAULT_TARGET_FLAGS +#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_ENDIAN) +#endif + struct gcc_target targetm = TARGET_INITIALIZER; #include "gt-riscv.h" diff --git a/gcc/config/riscv/riscv.h b/gcc/config/riscv/riscv.h index 0b667d2..3cc3e86 100644 --- a/gcc/config/riscv/riscv.h +++ b/gcc/config/riscv/riscv.h @@ -30,6 +30,12 @@ along with GCC; see the file COPYING3. If not see /* Target CPU versions for D. */ #define TARGET_D_CPU_VERSIONS riscv_d_target_versions +#ifdef TARGET_BIG_ENDIAN_DEFAULT +#define DEFAULT_ENDIAN_SPEC "b" +#else +#define DEFAULT_ENDIAN_SPEC "l" +#endif + /* Default target_flags if no switches are specified */ #ifndef TARGET_DEFAULT -- cgit v1.1 From 28bddf0e322a5fdc33d3dcc3b9928d559ad7f124 Mon Sep 17 00:00:00 2001 From: Marcus Comstedt Date: Fri, 19 Mar 2021 20:49:06 +0100 Subject: RISC-V: Fix trampoline generation on big endian gcc/ * config/riscv/riscv.c (riscv_swap_instruction): New function to byteswap an SImode rtx containing an instruction. (riscv_trampoline_init): Byteswap the generated instructions when needed. --- gcc/config/riscv/riscv.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/riscv/riscv.c b/gcc/config/riscv/riscv.c index 99b8438..fe48db7 100644 --- a/gcc/config/riscv/riscv.c +++ b/gcc/config/riscv/riscv.c @@ -1073,6 +1073,15 @@ riscv_force_binary (machine_mode mode, enum rtx_code code, rtx x, rtx y) return riscv_emit_binary (code, gen_reg_rtx (mode), x, y); } +static rtx +riscv_swap_instruction (rtx inst) +{ + gcc_assert (GET_MODE (inst) == SImode); + if (BYTES_BIG_ENDIAN) + inst = expand_unop (SImode, bswap_optab, inst, gen_reg_rtx (SImode), 1); + return inst; +} + /* Copy VALUE to a register and return that register. If new pseudos are allowed, copy it into a new register, otherwise use DEST. */ @@ -4955,7 +4964,7 @@ riscv_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value) gen_int_mode (lui_hi_chain_code, SImode)); mem = adjust_address (m_tramp, SImode, 0); - riscv_emit_move (mem, lui_hi_chain); + riscv_emit_move (mem, riscv_swap_instruction (lui_hi_chain)); /* Gen lui t0, hi(func). */ rtx hi_func = riscv_force_binary (SImode, PLUS, target_function, @@ -4967,7 +4976,7 @@ riscv_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value) gen_int_mode (lui_hi_func_code, SImode)); mem = adjust_address (m_tramp, SImode, 1 * GET_MODE_SIZE (SImode)); - riscv_emit_move (mem, lui_hi_func); + riscv_emit_move (mem, riscv_swap_instruction (lui_hi_func)); /* Gen addi t2, t2, lo(chain). */ rtx lo_chain = riscv_force_binary (SImode, AND, chain_value, @@ -4982,7 +4991,7 @@ riscv_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value) force_reg (SImode, GEN_INT (lo_chain_code))); mem = adjust_address (m_tramp, SImode, 2 * GET_MODE_SIZE (SImode)); - riscv_emit_move (mem, addi_lo_chain); + riscv_emit_move (mem, riscv_swap_instruction (addi_lo_chain)); /* Gen jr t0, lo(func). 
*/ rtx lo_func = riscv_force_binary (SImode, AND, target_function, @@ -4995,7 +5004,7 @@ riscv_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value) force_reg (SImode, GEN_INT (lo_func_code))); mem = adjust_address (m_tramp, SImode, 3 * GET_MODE_SIZE (SImode)); - riscv_emit_move (mem, jr_lo_func); + riscv_emit_move (mem, riscv_swap_instruction (jr_lo_func)); } else { @@ -5021,6 +5030,8 @@ riscv_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value) /* Copy the trampoline code. */ for (i = 0; i < ARRAY_SIZE (trampoline); i++) { + if (BYTES_BIG_ENDIAN) + trampoline[i] = __builtin_bswap32(trampoline[i]); mem = adjust_address (m_tramp, SImode, i * GET_MODE_SIZE (SImode)); riscv_emit_move (mem, gen_int_mode (trampoline[i], SImode)); } -- cgit v1.1 From 7ac4dfec3912ef0be85542a00628c3ba01ddea2a Mon Sep 17 00:00:00 2001 From: Marcus Comstedt Date: Fri, 19 Mar 2021 20:49:08 +0100 Subject: RISC-V: Fix matches against subreg with a bytenum of 0 in riscv.md These all intend the least significant subpart of the register. Use the same endian-neutral "subreg_lowpart_operator" predicate that ARM does instead. gcc/ * config/riscv/predicates.md (subreg_lowpart_operator): New predicate * config/riscv/riscv.md (*addsi3_extended2, *subsi3_extended2) (*negsi2_extended2, *mulsi3_extended2, *si3_mask) (*si3_mask_1, *di3_mask, *di3_mask_1) (*si3_extend_mask, *si3_extend_mask_1): Use new predicate "subreg_lowpart_operator" --- gcc/config/riscv/predicates.md | 5 +++ gcc/config/riscv/riscv.md | 70 +++++++++++++++++++++--------------------- 2 files changed, 40 insertions(+), 35 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/riscv/predicates.md b/gcc/config/riscv/predicates.md index ef821ad..2321151 100644 --- a/gcc/config/riscv/predicates.md +++ b/gcc/config/riscv/predicates.md @@ -198,6 +198,11 @@ (define_predicate "signed_order_operator" (match_code "eq,ne,lt,le,ge,gt")) +(define_predicate "subreg_lowpart_operator" + (ior (match_code "truncate") + (and (match_code "subreg") + (match_test "subreg_lowpart_p (op)")))) + (define_predicate "fp_native_comparison" (match_code "eq,lt,le,gt,ge")) diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md index fcdcc3a..c3687d5 100644 --- a/gcc/config/riscv/riscv.md +++ b/gcc/config/riscv/riscv.md @@ -480,9 +480,9 @@ (define_insn "*addsi3_extended2" [(set (match_operand:DI 0 "register_operand" "=r,r") (sign_extend:DI - (subreg:SI (plus:DI (match_operand:DI 1 "register_operand" " r,r") - (match_operand:DI 2 "arith_operand" " r,I")) - 0)))] + (match_operator:SI 3 "subreg_lowpart_operator" + [(plus:DI (match_operand:DI 1 "register_operand" " r,r") + (match_operand:DI 2 "arith_operand" " r,I"))])))] "TARGET_64BIT" "add%i2w\t%0,%1,%2" [(set_attr "type" "arith") @@ -536,9 +536,9 @@ (define_insn "*subsi3_extended2" [(set (match_operand:DI 0 "register_operand" "= r") (sign_extend:DI - (subreg:SI (minus:DI (match_operand:DI 1 "reg_or_0_operand" " rJ") - (match_operand:DI 2 "register_operand" " r")) - 0)))] + (match_operator:SI 3 "subreg_lowpart_operator" + [(minus:DI (match_operand:DI 1 "reg_or_0_operand" " rJ") + (match_operand:DI 2 "register_operand" " r"))])))] "TARGET_64BIT" "subw\t%0,%z1,%2" [(set_attr "type" "arith") @@ -572,8 +572,8 @@ (define_insn "*negsi2_extended2" [(set (match_operand:DI 0 "register_operand" "=r") (sign_extend:DI - (subreg:SI (neg:DI (match_operand:DI 1 "register_operand" " r")) - 0)))] + (match_operator:SI 2 "subreg_lowpart_operator" + [(neg:DI (match_operand:DI 1 "register_operand" " r"))])))] "TARGET_64BIT" 
"negw\t%0,%1" [(set_attr "type" "arith") @@ -627,9 +627,9 @@ (define_insn "*mulsi3_extended2" [(set (match_operand:DI 0 "register_operand" "=r") (sign_extend:DI - (subreg:SI (mult:DI (match_operand:DI 1 "register_operand" " r") - (match_operand:DI 2 "register_operand" " r")) - 0)))] + (match_operator:SI 3 "subreg_lowpart_operator" + [(mult:DI (match_operand:DI 1 "register_operand" " r") + (match_operand:DI 2 "register_operand" " r"))])))] "TARGET_MUL && TARGET_64BIT" "mulw\t%0,%1,%2" [(set_attr "type" "imul") @@ -1591,10 +1591,10 @@ [(set (match_operand:SI 0 "register_operand" "= r") (any_shift:SI (match_operand:SI 1 "register_operand" " r") - (subreg:QI - (and:SI - (match_operand:SI 2 "register_operand" "r") - (match_operand 3 "const_int_operand")) 0)))] + (match_operator 4 "subreg_lowpart_operator" + [(and:SI + (match_operand:SI 2 "register_operand" "r") + (match_operand 3 "const_int_operand"))])))] "(INTVAL (operands[3]) & (GET_MODE_BITSIZE (SImode)-1)) == GET_MODE_BITSIZE (SImode)-1" "#" @@ -1610,10 +1610,10 @@ [(set (match_operand:SI 0 "register_operand" "= r") (any_shift:SI (match_operand:SI 1 "register_operand" " r") - (subreg:QI - (and:DI - (match_operand:DI 2 "register_operand" "r") - (match_operand 3 "const_int_operand")) 0)))] + (match_operator 4 "subreg_lowpart_operator" + [(and:DI + (match_operand:DI 2 "register_operand" "r") + (match_operand 3 "const_int_operand"))])))] "TARGET_64BIT && (INTVAL (operands[3]) & (GET_MODE_BITSIZE (SImode)-1)) == GET_MODE_BITSIZE (SImode)-1" @@ -1646,10 +1646,10 @@ [(set (match_operand:DI 0 "register_operand" "= r") (any_shift:DI (match_operand:DI 1 "register_operand" " r") - (subreg:QI - (and:SI - (match_operand:SI 2 "register_operand" "r") - (match_operand 3 "const_int_operand")) 0)))] + (match_operator 4 "subreg_lowpart_operator" + [(and:SI + (match_operand:SI 2 "register_operand" "r") + (match_operand 3 "const_int_operand"))])))] "TARGET_64BIT && (INTVAL (operands[3]) & (GET_MODE_BITSIZE (DImode)-1)) == GET_MODE_BITSIZE (DImode)-1" @@ -1666,10 +1666,10 @@ [(set (match_operand:DI 0 "register_operand" "= r") (any_shift:DI (match_operand:DI 1 "register_operand" " r") - (subreg:QI - (and:DI - (match_operand:DI 2 "register_operand" "r") - (match_operand 3 "const_int_operand")) 0)))] + (match_operator 4 "subreg_lowpart_operator" + [(and:DI + (match_operand:DI 2 "register_operand" "r") + (match_operand 3 "const_int_operand"))])))] "TARGET_64BIT && (INTVAL (operands[3]) & (GET_MODE_BITSIZE (DImode)-1)) == GET_MODE_BITSIZE (DImode)-1" @@ -1702,10 +1702,10 @@ (sign_extend:DI (any_shift:SI (match_operand:SI 1 "register_operand" " r") - (subreg:QI - (and:SI - (match_operand:SI 2 "register_operand" " r") - (match_operand 3 "const_int_operand")) 0))))] + (match_operator 4 "subreg_lowpart_operator" + [(and:SI + (match_operand:SI 2 "register_operand" " r") + (match_operand 3 "const_int_operand"))]))))] "TARGET_64BIT && (INTVAL (operands[3]) & (GET_MODE_BITSIZE (SImode)-1)) == GET_MODE_BITSIZE (SImode)-1" @@ -1724,10 +1724,10 @@ (sign_extend:DI (any_shift:SI (match_operand:SI 1 "register_operand" " r") - (subreg:QI - (and:DI - (match_operand:DI 2 "register_operand" " r") - (match_operand 3 "const_int_operand")) 0))))] + (match_operator 4 "subreg_lowpart_operator" + [(and:DI + (match_operand:DI 2 "register_operand" " r") + (match_operand 3 "const_int_operand"))]))))] "TARGET_64BIT && (INTVAL (operands[3]) & (GET_MODE_BITSIZE (SImode)-1)) == GET_MODE_BITSIZE (SImode)-1" -- cgit v1.1 From fffefe3d9d1715f83c82331f2265e040f42d09fe Mon Sep 17 00:00:00 2001 From: 
Marcus Comstedt Date: Fri, 19 Mar 2021 20:49:09 +0100 Subject: RISC-V: Fix riscv_subword() for big endian gcc/ * config/riscv/riscv.c (riscv_subword): Take endianness into account when calculating the byte offset. --- gcc/config/riscv/riscv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/riscv/riscv.c b/gcc/config/riscv/riscv.c index fe48db7..17cdf70 100644 --- a/gcc/config/riscv/riscv.c +++ b/gcc/config/riscv/riscv.c @@ -1966,7 +1966,7 @@ riscv_address_cost (rtx addr, machine_mode mode, rtx riscv_subword (rtx op, bool high_p) { - unsigned int byte = high_p ? UNITS_PER_WORD : 0; + unsigned int byte = (high_p != BYTES_BIG_ENDIAN) ? UNITS_PER_WORD : 0; machine_mode mode = GET_MODE (op); if (mode == VOIDmode) -- cgit v1.1 From 9c89c9e9c6b59260c7745c8714b69f94784a9c13 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Sun, 21 Mar 2021 19:47:24 -0700 Subject: x86: Add __volatile__ to __cpuid and __cpuid_count Since CPUID instruction may return different values on hybrid core. volatile is needed on asm statements in . PR target/99704 * config/i386/cpuid.h (__cpuid): Add __volatile__. (__cpuid_count): Likewise. --- gcc/config/i386/cpuid.h | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/cpuid.h b/gcc/config/i386/cpuid.h index 539325d..aebc17c 100644 --- a/gcc/config/i386/cpuid.h +++ b/gcc/config/i386/cpuid.h @@ -212,28 +212,28 @@ /* At least one cpu (Winchip 2) does not set %ebx and %ecx for cpuid leaf 1. Forcibly zero the two registers before calling cpuid as a precaution. */ -#define __cpuid(level, a, b, c, d) \ - do { \ - if (__builtin_constant_p (level) && (level) != 1) \ - __asm__ ("cpuid\n\t" \ - : "=a" (a), "=b" (b), "=c" (c), "=d" (d) \ - : "0" (level)); \ - else \ - __asm__ ("cpuid\n\t" \ - : "=a" (a), "=b" (b), "=c" (c), "=d" (d) \ - : "0" (level), "1" (0), "2" (0)); \ +#define __cpuid(level, a, b, c, d) \ + do { \ + if (__builtin_constant_p (level) && (level) != 1) \ + __asm__ __volatile__ ("cpuid\n\t" \ + : "=a" (a), "=b" (b), "=c" (c), "=d" (d) \ + : "0" (level)); \ + else \ + __asm__ __volatile__ ("cpuid\n\t" \ + : "=a" (a), "=b" (b), "=c" (c), "=d" (d) \ + : "0" (level), "1" (0), "2" (0)); \ } while (0) #else -#define __cpuid(level, a, b, c, d) \ - __asm__ ("cpuid\n\t" \ - : "=a" (a), "=b" (b), "=c" (c), "=d" (d) \ - : "0" (level)) +#define __cpuid(level, a, b, c, d) \ + __asm__ __volatile__ ("cpuid\n\t" \ + : "=a" (a), "=b" (b), "=c" (c), "=d" (d) \ + : "0" (level)) #endif -#define __cpuid_count(level, count, a, b, c, d) \ - __asm__ ("cpuid\n\t" \ - : "=a" (a), "=b" (b), "=c" (c), "=d" (d) \ - : "0" (level), "2" (count)) +#define __cpuid_count(level, count, a, b, c, d) \ + __asm__ __volatile__ ("cpuid\n\t" \ + : "=a" (a), "=b" (b), "=c" (c), "=d" (d) \ + : "0" (level), "2" (count)) /* Return highest supported input value for cpuid instruction. ext can -- cgit v1.1 From d7cea7ceff9a2be7436108030c598628c51fba0f Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Tue, 23 Mar 2021 14:02:03 +0000 Subject: aarch64: Make aarch64_add_offset work with -ftrapv [PR99540] aarch64_add_offset uses expand_mult to multiply the SVE VL by an out-of-range constant. expand_mult takes an argument to indicate whether the multiplication is signed or unsigned, but in this context the multiplication is effectively signless and so the choice seemed arbitrary. 
However, one of the things that the signedness input does is indicate whether signed overflow should be trapped for -ftrapv. We don't want that here, so we must treat the multiplication as unsigned. gcc/ 2021-03-23 Jakub Jelinek PR target/99540 * config/aarch64/aarch64.c (aarch64_add_offset): Tell expand_mult to perform an unsigned rather than a signed multiplication. gcc/testsuite/ 2021-03-23 Richard Sandiford PR target/99540 * gcc.dg/vect/pr99540.c: New test. --- gcc/config/aarch64/aarch64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index db69e69..c8a87fe 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -4639,7 +4639,7 @@ aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src, if (can_create_pseudo_p ()) { rtx coeff1 = gen_int_mode (factor, mode); - val = expand_mult (mode, val, coeff1, NULL_RTX, false, true); + val = expand_mult (mode, val, coeff1, NULL_RTX, true, true); } else { -- cgit v1.1 From 7af8ec508f8105a4c4ea94246c1c4c25596cf6a5 Mon Sep 17 00:00:00 2001 From: Iain Sandoe Date: Tue, 23 Mar 2021 19:42:36 +0000 Subject: Darwin : Address a translation comment. Add a ':' to make the diagnostic read 'pch_address_space': xxx. gcc/ChangeLog: PR target/99733 * config/host-darwin.c (darwin_gt_pch_use_address): Add a colon to the diagnostic message. --- gcc/config/host-darwin.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/host-darwin.c b/gcc/config/host-darwin.c index b101fca..14a01fe 100644 --- a/gcc/config/host-darwin.c +++ b/gcc/config/host-darwin.c @@ -62,7 +62,7 @@ darwin_gt_pch_use_address (void *addr, size_t sz, int fd, size_t off) if (munmap (pch_address_space + sz, sizeof (pch_address_space) - sz) != 0) fatal_error (input_location, - "could not unmap % %m"); + "could not unmap %: %m"); if (ret) { -- cgit v1.1 From be70bb5e4babdf9d3d33e8f4658452038407fa8e Mon Sep 17 00:00:00 2001 From: "Vladimir N. Makarov" Date: Tue, 23 Mar 2021 17:51:21 -0400 Subject: [PR99581] Use relaxed memory for more aarch64 memory constraints The original patch for PR99581 resulted in GCC testsuite regression as some constraints were not declared as relaxed memory ones. This patch fixes this. gcc/ChangeLog: PR target/99581 * config/aarch64/constraints.md (Utq, UOb, UOh, UOw, UOd, UOty): Use define_relaxed_memory_constraint for them. --- gcc/config/aarch64/constraints.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md index f08eea8b..fd3e925 100644 --- a/gcc/config/aarch64/constraints.md +++ b/gcc/config/aarch64/constraints.md @@ -323,7 +323,7 @@ (and (match_code "mem") (match_test "aarch64_simd_mem_operand_p (op)"))) -(define_memory_constraint "Utq" +(define_relaxed_memory_constraint "Utq" "@internal An address valid for loading or storing a 128-bit AdvSIMD register" (and (match_code "mem") @@ -336,32 +336,32 @@ (and (match_code "mem") (match_test "aarch64_sve_ld1rq_operand_p (op)"))) -(define_memory_constraint "UOb" +(define_relaxed_memory_constraint "UOb" "@internal An address valid for SVE LD1ROH." (and (match_code "mem") (match_test "aarch64_sve_ld1ro_operand_p (op, QImode)"))) -(define_memory_constraint "UOh" +(define_relaxed_memory_constraint "UOh" "@internal An address valid for SVE LD1ROH." 
(and (match_code "mem") (match_test "aarch64_sve_ld1ro_operand_p (op, HImode)"))) -(define_memory_constraint "UOw" +(define_relaxed_memory_constraint "UOw" "@internal An address valid for SVE LD1ROW." (and (match_code "mem") (match_test "aarch64_sve_ld1ro_operand_p (op, SImode)"))) -(define_memory_constraint "UOd" +(define_relaxed_memory_constraint "UOd" "@internal An address valid for SVE LD1ROD." (and (match_code "mem") (match_test "aarch64_sve_ld1ro_operand_p (op, DImode)"))) -(define_memory_constraint "Uty" +(define_relaxed_memory_constraint "Uty" "@internal An address valid for SVE LD1Rs." (and (match_code "mem") -- cgit v1.1 From 4be312862dec5c8e49c76249dd5aed220c72039e Mon Sep 17 00:00:00 2001 From: Alexandre Oliva Date: Wed, 24 Mar 2021 05:44:35 -0300 Subject: fix ssse3_pshufbv8qi3 post-reload const pool load The split in ssse3_pshufbv8qi3 forces a const vector into the constant pool, and loads from it. That runs after reload, so if the load requires any reloading, we're out of luck. Indeed, if the load address is not legitimate, e.g. -mcmodel=large, the insn is no longer recognized. This patch turns the constant into an input operand, introduces an expander to generate the constant unconditionally, and arranges for this input operand to be retained as an unused immediate in the alternatives that don't undergo splitting, and for it to be loaded into the scratch register for those that do. It is now the register allocator that arranges to load the const vector into a register, so it deals with whatever legitimizing steps needed for the target configuration. for gcc/ChangeLog * config/i386/predicates.md (reg_or_const_vec_operand): New. * config/i386/sse.md (ssse3_pshufbv8qi3): Add an expander for the now *-prefixed insn_and_split, turn the splitter const vec into an input for the insn, making it an ignored immediate for non-split cases, and loaded into the scratch register otherwise. for gcc/testsuite/ChangeLog * gcc.target/i386/pr94467-3.c: New. --- gcc/config/i386/predicates.md | 6 ++++++ gcc/config/i386/sse.md | 25 ++++++++++++++++++------- 2 files changed, 24 insertions(+), 7 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index b6dd5e9..b1df854 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1153,6 +1153,12 @@ (ior (match_operand 0 "nonimmediate_operand") (match_code "const_vector"))) +;; Return true when OP is either register operand, or any +;; CONST_VECTOR. +(define_predicate "reg_or_const_vector_operand" + (ior (match_operand 0 "register_operand") + (match_code "const_vector"))) + ;; Return true when OP is nonimmediate or standard SSE constant. 
(define_predicate "nonimmediate_or_sse_const_operand" (ior (match_operand 0 "nonimmediate_operand") diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 43e4d57..9d3728d 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -17159,10 +17159,25 @@ (set_attr "btver2_decode" "vector") (set_attr "mode" "")]) -(define_insn_and_split "ssse3_pshufbv8qi3" +(define_expand "ssse3_pshufbv8qi3" + [(parallel + [(set (match_operand:V8QI 0 "register_operand") + (unspec:V8QI [(match_operand:V8QI 1 "register_operand") + (match_operand:V8QI 2 "register_mmxmem_operand") + (match_dup 3)] UNSPEC_PSHUFB)) + (clobber (match_scratch:V4SI 4))])] + "(TARGET_MMX || TARGET_MMX_WITH_SSE) && TARGET_SSSE3" +{ + operands[3] = ix86_build_const_vector (V4SImode, true, + gen_int_mode (0xf7f7f7f7, SImode)); +}) + +(define_insn_and_split "*ssse3_pshufbv8qi3" [(set (match_operand:V8QI 0 "register_operand" "=y,x,Yv") (unspec:V8QI [(match_operand:V8QI 1 "register_operand" "0,0,Yv") - (match_operand:V8QI 2 "register_mmxmem_operand" "ym,x,Yv")] + (match_operand:V8QI 2 "register_mmxmem_operand" "ym,x,Yv") + (match_operand:V4SI 4 "reg_or_const_vector_operand" + "i,3,3")] UNSPEC_PSHUFB)) (clobber (match_scratch:V4SI 3 "=X,&x,&Yv"))] "(TARGET_MMX || TARGET_MMX_WITH_SSE) && TARGET_SSSE3" @@ -17172,8 +17187,7 @@ #" "TARGET_SSSE3 && reload_completed && SSE_REGNO_P (REGNO (operands[0]))" - [(set (match_dup 3) (match_dup 5)) - (set (match_dup 3) + [(set (match_dup 3) (and:V4SI (match_dup 3) (match_dup 2))) (set (match_dup 0) (unspec:V16QI [(match_dup 1) (match_dup 4)] UNSPEC_PSHUFB))] @@ -17188,9 +17202,6 @@ GET_MODE (operands[2])); operands[4] = lowpart_subreg (V16QImode, operands[3], GET_MODE (operands[3])); - rtx vec_const = ix86_build_const_vector (V4SImode, true, - gen_int_mode (0xf7f7f7f7, SImode)); - operands[5] = force_const_mem (V4SImode, vec_const); } [(set_attr "mmx_isa" "native,sse_noavx,avx") (set_attr "prefix_extra" "1") -- cgit v1.1 From 4f992de4f369e17b64a66246ec35bf75f41dc98d Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Wed, 24 Mar 2021 11:22:35 +0100 Subject: arm: Fix some more vec-common.md patterns for iwmmxt [PR99724] The following patch fixes similar issues as in PR98849; in older gcc versions, the expanders were present in neon.md guarded with TARGET_NEON, but they got moved to vec-common.md and guarded with ARM_HAVE__ARITH so that they handle both MVE and Neon. The macros are enabled for some modes even for iwmmxt which has some vector support for those modes, but only limited. In particular, neither the one_cmpl, nor neg, nor movmisalign patterns are present. For some reason I've failed to construct something that ICEs with movmisalign, so that is not covered by the testsuite, but both one_cmpl and neg ICE. 2021-03-24 Jakub Jelinek PR target/99724 * config/arm/vec-common.md (one_cmpl2, neg2, movmisalign): Disable expanders for TARGET_REALLY_IWMMXT. * gcc.target/arm/pr99724.c: New test. 
--- gcc/config/arm/vec-common.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md index 0e13187..48ee659 100644 --- a/gcc/config/arm/vec-common.md +++ b/gcc/config/arm/vec-common.md @@ -202,13 +202,13 @@ (define_expand "one_cmpl2" [(set (match_operand:VDQ 0 "s_register_operand") (not:VDQ (match_operand:VDQ 1 "s_register_operand")))] - "ARM_HAVE__ARITH" + "ARM_HAVE__ARITH && !TARGET_REALLY_IWMMXT" ) (define_expand "neg2" [(set (match_operand:VDQWH 0 "s_register_operand" "") (neg:VDQWH (match_operand:VDQWH 1 "s_register_operand" "")))] - "ARM_HAVE__ARITH" + "ARM_HAVE__ARITH && !TARGET_REALLY_IWMMXT" ) (define_expand "cadd3" @@ -281,7 +281,8 @@ [(set (match_operand:VDQX 0 "neon_perm_struct_or_reg_operand") (unspec:VDQX [(match_operand:VDQX 1 "neon_perm_struct_or_reg_operand")] UNSPEC_MISALIGNED_ACCESS))] - "ARM_HAVE__LDST && !BYTES_BIG_ENDIAN && unaligned_access" + "ARM_HAVE__LDST && !BYTES_BIG_ENDIAN + && unaligned_access && !TARGET_REALLY_IWMMXT" { rtx adjust_mem; /* This pattern is not permitted to fail during expansion: if both arguments -- cgit v1.1 From 79cdbabbc4cbf79491c274220928406dff3d4039 Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Wed, 24 Mar 2021 08:51:41 +0000 Subject: arm: Fix MVE constraints for movmisalign [PR target/99727] MVE has different constraints than Neon for load/store: we should use the Ux constraint instead of Um. 2021-03-24 Christophe Lyon PR target/99727 gcc/ * config/arm/mve.md (movmisalign_mve_store): Use Ux constraint. (movmisalign_mve_load): Likewise. gcc/testsuite/ * gcc.target/arm/pr99727.c: New test. --- gcc/config/arm/mve.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index 440fd6a..1351863 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -10858,7 +10858,7 @@ ) (define_insn "*movmisalign_mve_store" - [(set (match_operand:MVE_VLD_ST 0 "neon_permissive_struct_operand" "=Um") + [(set (match_operand:MVE_VLD_ST 0 "neon_permissive_struct_operand" "=Ux") (unspec:MVE_VLD_ST [(match_operand:MVE_VLD_ST 1 "s_register_operand" " w")] UNSPEC_MISALIGNED_ACCESS))] "((TARGET_HAVE_MVE && VALID_MVE_SI_MODE (mode)) @@ -10871,7 +10871,7 @@ (define_insn "*movmisalign_mve_load" [(set (match_operand:MVE_VLD_ST 0 "s_register_operand" "=w") - (unspec:MVE_VLD_ST [(match_operand:MVE_VLD_ST 1 "neon_permissive_struct_operand" " Um")] + (unspec:MVE_VLD_ST [(match_operand:MVE_VLD_ST 1 "neon_permissive_struct_operand" " Ux")] UNSPEC_MISALIGNED_ACCESS))] "((TARGET_HAVE_MVE && VALID_MVE_SI_MODE (mode)) || (TARGET_HAVE_MVE_FLOAT && VALID_MVE_SF_MODE (mode))) -- cgit v1.1 From 4f00c4d40a539360938607561460904663c64cda Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Wed, 24 Mar 2021 15:58:03 +0100 Subject: i386: fix -march=amd crash It started with g:3e2ae3ee285a57455d5a23bd352a68c289130186 where new entry was added to processor_alias_table after generic node: + {"amdfam19h", PROCESSOR_GENERIC, CPU_GENERIC, 0, + M_CPU_TYPE (AMDFAM19H), P_NONE}, and then the following is violated: /* NB: processor_alias_table stops at the "generic" entry. */ gcc/ChangeLog: PR target/99753 * common/config/i386/i386-common.c (ARRAY_SIZE): Fix off-by-one error. * config/i386/i386-options.c (ix86_option_override_internal): Add run-time assert. gcc/testsuite/ChangeLog: PR target/99753 * gcc.target/i386/pr99753.c: New test. 
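The invariant being asserted can be pictured with a small, self-contained C sketch (simplified and illustrative; not the actual i386-common.c/i386-options.c tables): the lookup size is expected to stop exactly at the "generic" entry, so appending a row after it leaves the size computation off by one and an unknown name such as -march=amd can walk past the intended end of the table.

/* Simplified sketch of the table invariant; names and layout are
   illustrative, not the real GCC tables.  */
#include <assert.h>
#include <string.h>

struct pta { const char *name; };

static const struct pta processor_alias_table[] = {
  { "znver3" },
  { "generic" },        /* must remain the last entry */
};

static const unsigned int pta_size
  = sizeof (processor_alias_table) / sizeof (processor_alias_table[0]);

static const struct pta *
lookup_arch (const char *arch)
{
  /* Mirrors the run-time assert added by the patch.  */
  assert (strcmp (processor_alias_table[pta_size - 1].name, "generic") == 0);
  for (unsigned int i = 0; i < pta_size; i++)
    if (strcmp (arch, processor_alias_table[i].name) == 0)
      return &processor_alias_table[i];
  return NULL;
}
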
--- gcc/config/i386/i386-options.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c index b653527..88d5e71 100644 --- a/gcc/config/i386/i386-options.c +++ b/gcc/config/i386/i386-options.c @@ -2042,6 +2042,9 @@ ix86_option_override_internal (bool main_args_p, sorry ("%i-bit mode not compiled in", (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32); + /* Last processor_alias_table must point to "generic" entry. */ + gcc_checking_assert (strcmp (processor_alias_table[pta_size - 1].name, + "generic") == 0); for (i = 0; i < pta_size; i++) if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name)) { -- cgit v1.1 From 08103e4d6ada9b57366f2df2a2b745babfab914c Mon Sep 17 00:00:00 2001 From: Xionghu Luo Date: Wed, 24 Mar 2021 19:46:12 -0500 Subject: rs6000: Correct Power8 cost of l2 cache size [PR97329] l2 cache size for Power8 is 512kB, it was copied from Power7 before public. Tested no performance change for SPEC2017. gcc/ChangeLog: 2021-03-24 Xionghu Luo * config/rs6000/rs6000.c (power8_costs): Change l2 cache from 256 to 512. --- gcc/config/rs6000/rs6000.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 616dae3..34c4eda 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -1055,7 +1055,7 @@ struct processor_costs power8_cost = { COSTS_N_INSNS (17), /* ddiv */ 128, /* cache line size */ 32, /* l1 cache */ - 256, /* l2 cache */ + 512, /* l2 cache */ 12, /* prefetch streams */ COSTS_N_INSNS (3), /* SF->DF convert */ }; -- cgit v1.1 From 72982851d70dfbc547d83ed2bb45356b9ebe3ff0 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Tue, 23 Mar 2021 20:04:58 -0700 Subject: x86: Skip ISA check for always_inline in system headers For always_inline in system headers, we don't know if caller's ISAs are compatible with callee's ISAs until much later. Skip ISA check for always_inline in system headers if caller has target attribute. gcc/ PR target/98209 PR target/99744 * config/i386/i386.c (ix86_can_inline_p): Don't check ISA for always_inline in system headers. gcc/testsuite/ PR target/98209 PR target/99744 * gcc.target/i386/pr98209.c: New test. * gcc.target/i386/pr99744-1.c: Likewise. * gcc.target/i386/pr99744-2.c: Likewise. --- gcc/config/i386/i386.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 7c41302..1b4567e 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -578,21 +578,29 @@ ix86_can_inline_p (tree caller, tree callee) && lookup_attribute ("always_inline", DECL_ATTRIBUTES (callee))); + /* NB: Skip ISA check for always_inline in system headers if caller + has target attribute. */ + bool skip_isa_check = (always_inline + && caller_tree != target_option_default_node + && DECL_IN_SYSTEM_HEADER (callee)); + cgraph_node *callee_node = cgraph_node::get (callee); /* Callee's isa options should be a subset of the caller's, i.e. a SSE4 function can inline a SSE2 function but a SSE2 function can't inline a SSE4 function. 
*/ - if (((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags) - != callee_opts->x_ix86_isa_flags) - || ((caller_opts->x_ix86_isa_flags2 & callee_opts->x_ix86_isa_flags2) - != callee_opts->x_ix86_isa_flags2)) + if (!skip_isa_check + && (((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags) + != callee_opts->x_ix86_isa_flags) + || ((caller_opts->x_ix86_isa_flags2 & callee_opts->x_ix86_isa_flags2) + != callee_opts->x_ix86_isa_flags2))) ret = false; /* See if we have the same non-isa options. */ - else if ((!always_inline - && caller_opts->x_target_flags != callee_opts->x_target_flags) - || (caller_opts->x_target_flags & ~always_inline_safe_mask) - != (callee_opts->x_target_flags & ~always_inline_safe_mask)) + else if (!skip_isa_check + && ((!always_inline + && caller_opts->x_target_flags != callee_opts->x_target_flags) + || ((caller_opts->x_target_flags & ~always_inline_safe_mask) + != (callee_opts->x_target_flags & ~always_inline_safe_mask)))) ret = false; /* See if arch, tune, etc. are the same. */ -- cgit v1.1 From de00a7bda94910835012bc7150be53b460a5c8b6 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Thu, 25 Mar 2021 06:57:37 -0700 Subject: Revert "x86: Skip ISA check for always_inline in system headers" This reverts commit 72982851d70dfbc547d83ed2bb45356b9ebe3ff0. --- gcc/config/i386/i386.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 1b4567e..7c41302 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -578,29 +578,21 @@ ix86_can_inline_p (tree caller, tree callee) && lookup_attribute ("always_inline", DECL_ATTRIBUTES (callee))); - /* NB: Skip ISA check for always_inline in system headers if caller - has target attribute. */ - bool skip_isa_check = (always_inline - && caller_tree != target_option_default_node - && DECL_IN_SYSTEM_HEADER (callee)); - cgraph_node *callee_node = cgraph_node::get (callee); /* Callee's isa options should be a subset of the caller's, i.e. a SSE4 function can inline a SSE2 function but a SSE2 function can't inline a SSE4 function. */ - if (!skip_isa_check - && (((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags) - != callee_opts->x_ix86_isa_flags) - || ((caller_opts->x_ix86_isa_flags2 & callee_opts->x_ix86_isa_flags2) - != callee_opts->x_ix86_isa_flags2))) + if (((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags) + != callee_opts->x_ix86_isa_flags) + || ((caller_opts->x_ix86_isa_flags2 & callee_opts->x_ix86_isa_flags2) + != callee_opts->x_ix86_isa_flags2)) ret = false; /* See if we have the same non-isa options. */ - else if (!skip_isa_check - && ((!always_inline - && caller_opts->x_target_flags != callee_opts->x_target_flags) - || ((caller_opts->x_target_flags & ~always_inline_safe_mask) - != (callee_opts->x_target_flags & ~always_inline_safe_mask)))) + else if ((!always_inline + && caller_opts->x_target_flags != callee_opts->x_target_flags) + || (caller_opts->x_target_flags & ~always_inline_safe_mask) + != (callee_opts->x_target_flags & ~always_inline_safe_mask)) ret = false; /* See if arch, tune, etc. are the same. */ -- cgit v1.1 From 8ab1d637440532d9698daae84cc81a43d36b4aa8 Mon Sep 17 00:00:00 2001 From: Iain Buclaw Date: Sun, 21 Mar 2021 11:00:29 +0100 Subject: d: Add openbsd support for D compiler [PR99691] gcc/ChangeLog: PR d/99691 * config.gcc (*-*-openbsd*): Add openbsd-d.o. * config/t-openbsd: Add openbsd-d.o. * config/openbsd-d.c: New file. 
--- gcc/config/openbsd-d.c | 39 +++++++++++++++++++++++++++++++++++++++ gcc/config/t-openbsd | 5 +++++ 2 files changed, 44 insertions(+) create mode 100644 gcc/config/openbsd-d.c (limited to 'gcc/config') diff --git a/gcc/config/openbsd-d.c b/gcc/config/openbsd-d.c new file mode 100644 index 0000000..b42727e --- /dev/null +++ b/gcc/config/openbsd-d.c @@ -0,0 +1,39 @@ +/* Functions for generic OpenBSD as target machine for GNU D compiler. + Copyright (C) 2021 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "memmodel.h" +#include "tm.h" +#include "tm_p.h" +#include "d/d-target.h" +#include "d/d-target-def.h" + +static void +openbsd_d_os_builtins (void) +{ + d_add_builtin_version ("Posix"); + d_add_builtin_version ("OpenBSD"); +} + +#undef TARGET_D_OS_VERSIONS +#define TARGET_D_OS_VERSIONS openbsd_d_os_builtins + +struct gcc_targetdm targetdm = TARGETDM_INITIALIZER; diff --git a/gcc/config/t-openbsd b/gcc/config/t-openbsd index 7637da0..27a419d 100644 --- a/gcc/config/t-openbsd +++ b/gcc/config/t-openbsd @@ -1,2 +1,7 @@ # We don't need GCC's own include files. USER_H = $(EXTRA_HEADERS) + +# OpenBSD-specific D support. +openbsd-d.o: $(srcdir)/config/openbsd-d.c + $(COMPILE) $< + $(POSTCOMPILE) -- cgit v1.1 From 74ed3fc27966b07d701e1dead1cc37b53af227b4 Mon Sep 17 00:00:00 2001 From: Iain Buclaw Date: Sun, 21 Mar 2021 17:51:39 +0100 Subject: [freebsd] d: Fix build failures on sparc64-*-freebsd* All target platforms that could run on SPARC should include this header in order to avoid errors from memmodel being used in sparc-protos.h. gcc/ChangeLog: * config/freebsd-d.c: Include memmodel.h. --- gcc/config/freebsd-d.c | 1 + 1 file changed, 1 insertion(+) (limited to 'gcc/config') diff --git a/gcc/config/freebsd-d.c b/gcc/config/freebsd-d.c index 425ca83..8a8ddd9 100644 --- a/gcc/config/freebsd-d.c +++ b/gcc/config/freebsd-d.c @@ -18,6 +18,7 @@ along with GCC; see the file COPYING3. If not see #include "config.h" #include "system.h" #include "coretypes.h" +#include "memmodel.h" #include "tm.h" #include "tm_p.h" #include "d/d-target.h" -- cgit v1.1 From 2892e2f70287f961e3bac990b926232cc2a5b123 Mon Sep 17 00:00:00 2001 From: Iain Buclaw Date: Sun, 22 Mar 2020 01:18:42 +0100 Subject: d: Add windows support for D compiler [PR91595] gcc/ChangeLog: PR d/91595 * config.gcc (*-*-cygwin*): Add winnt-d.o (*-*-mingw*): Likewise. * config/i386/cygwin.h (EXTRA_TARGET_D_OS_VERSIONS): New macro. * config/i386/mingw32.h (EXTRA_TARGET_D_OS_VERSIONS): Likewise. * config/i386/t-cygming: Add winnt-d.o. * config/i386/winnt-d.c: New file. 
--- gcc/config/i386/cygwin.h | 9 ++++++++ gcc/config/i386/mingw32.h | 12 ++++++++++ gcc/config/i386/t-cygming | 4 ++++ gcc/config/i386/winnt-d.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 81 insertions(+) create mode 100644 gcc/config/i386/winnt-d.c (limited to 'gcc/config') diff --git a/gcc/config/i386/cygwin.h b/gcc/config/i386/cygwin.h index db0a3cc..71fb613 100644 --- a/gcc/config/i386/cygwin.h +++ b/gcc/config/i386/cygwin.h @@ -29,6 +29,15 @@ along with GCC; see the file COPYING3. If not see } \ while (0) +#define EXTRA_TARGET_D_OS_VERSIONS() \ + do \ + { \ + builtin_version ("Cygwin"); \ + builtin_version ("Posix"); \ + builtin_version ("CRuntime_Newlib"); \ + } \ + while (0) + #undef CPP_SPEC #define CPP_SPEC "%(cpp_cpu) %{posix:-D_POSIX_SOURCE} \ %{!ansi:-Dunix} \ diff --git a/gcc/config/i386/mingw32.h b/gcc/config/i386/mingw32.h index 1a6a3a0..36e7bae 100644 --- a/gcc/config/i386/mingw32.h +++ b/gcc/config/i386/mingw32.h @@ -53,6 +53,18 @@ along with GCC; see the file COPYING3. If not see } \ while (0) +#define EXTRA_TARGET_D_OS_VERSIONS() \ + do \ + { \ + builtin_version ("MinGW"); \ + if (TARGET_64BIT && ix86_abi == MS_ABI) \ + builtin_version ("Win64"); \ + else if (!TARGET_64BIT) \ + builtin_version ("Win32"); \ + builtin_version ("CRuntime_Microsoft"); \ + } \ + while (0) + #ifndef TARGET_USE_PTHREAD_BY_DEFAULT #define SPEC_PTHREAD1 "pthread" #define SPEC_PTHREAD2 "!no-pthread" diff --git a/gcc/config/i386/t-cygming b/gcc/config/i386/t-cygming index 7ccbb84..38e2f0b 100644 --- a/gcc/config/i386/t-cygming +++ b/gcc/config/i386/t-cygming @@ -39,6 +39,10 @@ winnt-stubs.o: $(srcdir)/config/i386/winnt-stubs.c $(CONFIG_H) $(SYSTEM_H) coret $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ $(srcdir)/config/i386/winnt-stubs.c +winnt-d.o: $(srcdir)/config/i386/winnt-d.c + $(COMPILE) $< + $(POSTCOMPILE) + msformat-c.o: $(srcdir)/config/i386/msformat-c.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \ $(TM_H) $(RTL_H) $(REGS_H) hard-reg-set.h output.h $(TREE_H) flags.h \ $(TM_P_H) $(HASHTAB_H) $(GGC_H) diff --git a/gcc/config/i386/winnt-d.c b/gcc/config/i386/winnt-d.c new file mode 100644 index 0000000..8a6b9c5 --- /dev/null +++ b/gcc/config/i386/winnt-d.c @@ -0,0 +1,56 @@ +/* Windows support needed only by D front-end. + Copyright (C) 2021 Free Software Foundation, Inc. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#define IN_TARGET_CODE 1 + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "target.h" +#include "d/d-target.h" +#include "d/d-target-def.h" +#include "tm_p.h" + +/* Implement TARGET_D_OS_VERSIONS for Windows targets. */ + +static void +winnt_d_os_builtins (void) +{ + d_add_builtin_version ("Windows"); + +#define builtin_version(TXT) d_add_builtin_version (TXT) + +#ifdef EXTRA_TARGET_D_OS_VERSIONS + EXTRA_TARGET_D_OS_VERSIONS (); +#endif +} + +#undef TARGET_D_OS_VERSIONS +#define TARGET_D_OS_VERSIONS winnt_d_os_builtins + +/* Define TARGET_D_MINFO_SECTION for Windows targets. 
*/ + +#undef TARGET_D_MINFO_SECTION +#define TARGET_D_MINFO_SECTION "minfo" + +#undef TARGET_D_MINFO_START_NAME +#define TARGET_D_MINFO_START_NAME "__start_minfo" + +#undef TARGET_D_MINFO_END_NAME +#define TARGET_D_MINFO_END_NAME "__stop_minfo" + +struct gcc_targetdm targetdm = TARGETDM_INITIALIZER; -- cgit v1.1 From 433b6b12dfa5ef109a5d8d40e7b3695dcb749e58 Mon Sep 17 00:00:00 2001 From: Iain Buclaw Date: Fri, 26 Mar 2021 13:12:59 +0100 Subject: d: Define IN_TARGET_CODE in all machine-specific D language files. This is to be consistent with the rest of the back-end. gcc/ChangeLog: * config/aarch64/aarch64-d.c (IN_TARGET_CODE): Define. * config/arm/arm-d.c (IN_TARGET_CODE): Likewise. * config/i386/i386-d.c (IN_TARGET_CODE): Likewise. * config/mips/mips-d.c (IN_TARGET_CODE): Likewise. * config/pa/pa-d.c (IN_TARGET_CODE): Likewise. * config/riscv/riscv-d.c (IN_TARGET_CODE): Likewise. * config/rs6000/rs6000-d.c (IN_TARGET_CODE): Likewise. * config/s390/s390-d.c (IN_TARGET_CODE): Likewise. * config/sparc/sparc-d.c (IN_TARGET_CODE): Likewise. --- gcc/config/aarch64/aarch64-d.c | 2 ++ gcc/config/arm/arm-d.c | 2 ++ gcc/config/i386/i386-d.c | 2 ++ gcc/config/mips/mips-d.c | 2 ++ gcc/config/pa/pa-d.c | 2 ++ gcc/config/riscv/riscv-d.c | 2 ++ gcc/config/rs6000/rs6000-d.c | 2 ++ gcc/config/s390/s390-d.c | 2 ++ gcc/config/sparc/sparc-d.c | 2 ++ 9 files changed, 18 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-d.c b/gcc/config/aarch64/aarch64-d.c index 5c9b4fa..4fce593 100644 --- a/gcc/config/aarch64/aarch64-d.c +++ b/gcc/config/aarch64/aarch64-d.c @@ -15,6 +15,8 @@ You should have received a copy of the GNU General Public License along with GCC; see the file COPYING3. If not see . */ +#define IN_TARGET_CODE 1 + #include "config.h" #include "system.h" #include "coretypes.h" diff --git a/gcc/config/arm/arm-d.c b/gcc/config/arm/arm-d.c index 76ede3b..2cb9f4b 100644 --- a/gcc/config/arm/arm-d.c +++ b/gcc/config/arm/arm-d.c @@ -15,6 +15,8 @@ You should have received a copy of the GNU General Public License along with GCC; see the file COPYING3. If not see . */ +#define IN_TARGET_CODE 1 + #include "config.h" #include "system.h" #include "coretypes.h" diff --git a/gcc/config/i386/i386-d.c b/gcc/config/i386/i386-d.c index cbd3ceb..b79be85 100644 --- a/gcc/config/i386/i386-d.c +++ b/gcc/config/i386/i386-d.c @@ -15,6 +15,8 @@ You should have received a copy of the GNU General Public License along with GCC; see the file COPYING3. If not see . */ +#define IN_TARGET_CODE 1 + #include "config.h" #include "system.h" #include "coretypes.h" diff --git a/gcc/config/mips/mips-d.c b/gcc/config/mips/mips-d.c index dad101c..dc57127 100644 --- a/gcc/config/mips/mips-d.c +++ b/gcc/config/mips/mips-d.c @@ -15,6 +15,8 @@ You should have received a copy of the GNU General Public License along with GCC; see the file COPYING3. If not see . */ +#define IN_TARGET_CODE 1 + #include "config.h" #include "system.h" #include "coretypes.h" diff --git a/gcc/config/pa/pa-d.c b/gcc/config/pa/pa-d.c index 1de49df..663e749 100644 --- a/gcc/config/pa/pa-d.c +++ b/gcc/config/pa/pa-d.c @@ -15,6 +15,8 @@ You should have received a copy of the GNU General Public License along with GCC; see the file COPYING3. If not see . 
*/ +#define IN_TARGET_CODE 1 + #include "config.h" #include "system.h" #include "coretypes.h" diff --git a/gcc/config/riscv/riscv-d.c b/gcc/config/riscv/riscv-d.c index 2b690b1..b20b778 100644 --- a/gcc/config/riscv/riscv-d.c +++ b/gcc/config/riscv/riscv-d.c @@ -15,6 +15,8 @@ You should have received a copy of the GNU General Public License along with GCC; see the file COPYING3. If not see . */ +#define IN_TARGET_CODE 1 + #include "config.h" #include "system.h" #include "coretypes.h" diff --git a/gcc/config/rs6000/rs6000-d.c b/gcc/config/rs6000/rs6000-d.c index 14c4133..6bfe813 100644 --- a/gcc/config/rs6000/rs6000-d.c +++ b/gcc/config/rs6000/rs6000-d.c @@ -15,6 +15,8 @@ You should have received a copy of the GNU General Public License along with GCC; see the file COPYING3. If not see . */ +#define IN_TARGET_CODE 1 + #include "config.h" #include "system.h" #include "coretypes.h" diff --git a/gcc/config/s390/s390-d.c b/gcc/config/s390/s390-d.c index 155144c..2f945eb 100644 --- a/gcc/config/s390/s390-d.c +++ b/gcc/config/s390/s390-d.c @@ -15,6 +15,8 @@ You should have received a copy of the GNU General Public License along with GCC; see the file COPYING3. If not see . */ +#define IN_TARGET_CODE 1 + #include "config.h" #include "system.h" #include "coretypes.h" diff --git a/gcc/config/sparc/sparc-d.c b/gcc/config/sparc/sparc-d.c index 186e965..0eb663b 100644 --- a/gcc/config/sparc/sparc-d.c +++ b/gcc/config/sparc/sparc-d.c @@ -15,6 +15,8 @@ You should have received a copy of the GNU General Public License along with GCC; see the file COPYING3. If not see . */ +#define IN_TARGET_CODE 1 + #include "config.h" #include "system.h" #include "coretypes.h" -- cgit v1.1 From e253bb8b796dbf88a2650e350a040cd0e0df41cd Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Fri, 26 Mar 2021 16:08:29 +0000 Subject: aarch64: Add reduction costs to simd_vec_costs This patch is part of a series that makes opt-in tweaks to the AArch64 vector cost model. At the moment, all reductions are costed as vec_to_scalar, which also includes things like extracting a single element from a vector. This is a bit too coarse in practice, since the cost of a reduction depends very much on the type of value that it's processing. This patch therefore adds separate costs for each case. To start with, all the new costs are copied from the associated vec_to_scalar ones. Due the extreme lateness of this patch in the GCC 11 cycle, I've added a new tuning flag (use_new_vector_costs) that selects the new behaviour. This should help to ensure that the risk of the new code is only borne by the CPUs that need it. Generic tuning is not affected. gcc/ * config/aarch64/aarch64-tuning-flags.def (use_new_vector_costs): New tuning flag. * config/aarch64/aarch64-protos.h (simd_vec_cost): Put comments above the fields rather than to the right. (simd_vec_cost::reduc_i8_cost): New member variable. (simd_vec_cost::reduc_i16_cost): Likewise. (simd_vec_cost::reduc_i32_cost): Likewise. (simd_vec_cost::reduc_i64_cost): Likewise. (simd_vec_cost::reduc_f16_cost): Likewise. (simd_vec_cost::reduc_f32_cost): Likewise. (simd_vec_cost::reduc_f64_cost): Likewise. * config/aarch64/aarch64.c (generic_advsimd_vector_cost): Update accordingly, using the vec_to_scalar_cost for the new fields. (generic_sve_vector_cost, a64fx_advsimd_vector_cost): Likewise. (a64fx_sve_vector_cost, qdf24xx_advsimd_vector_cost): Likewise. (thunderx_advsimd_vector_cost, tsv110_advsimd_vector_cost): Likewise. 
(cortexa57_advsimd_vector_cost, exynosm1_advsimd_vector_cost) (xgene1_advsimd_vector_cost, thunderx2t99_advsimd_vector_cost) (thunderx3t110_advsimd_vector_cost): Likewise. (aarch64_use_new_vector_costs_p): New function. (aarch64_simd_vec_costs): New function, split out from... (aarch64_builtin_vectorization_cost): ...here. (aarch64_is_reduction): New function. (aarch64_detect_vector_stmt_subtype): Likewise. (aarch64_add_stmt_cost): Call aarch64_detect_vector_stmt_subtype if using the new vector costs. --- gcc/config/aarch64/aarch64-protos.h | 56 ++++++--- gcc/config/aarch64/aarch64-tuning-flags.def | 2 + gcc/config/aarch64/aarch64.c | 180 +++++++++++++++++++++++++++- 3 files changed, 216 insertions(+), 22 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index ff87ced..e4eeb2c 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -194,22 +194,46 @@ struct cpu_regmove_cost struct simd_vec_cost { - const int int_stmt_cost; /* Cost of any int vector operation, - excluding load, store, permute, - vector-to-scalar and - scalar-to-vector operation. */ - const int fp_stmt_cost; /* Cost of any fp vector operation, - excluding load, store, permute, - vector-to-scalar and - scalar-to-vector operation. */ - const int permute_cost; /* Cost of permute operation. */ - const int vec_to_scalar_cost; /* Cost of vec-to-scalar operation. */ - const int scalar_to_vec_cost; /* Cost of scalar-to-vector - operation. */ - const int align_load_cost; /* Cost of aligned vector load. */ - const int unalign_load_cost; /* Cost of unaligned vector load. */ - const int unalign_store_cost; /* Cost of unaligned vector store. */ - const int store_cost; /* Cost of vector store. */ + /* Cost of any integer vector operation, excluding the ones handled + specially below. */ + const int int_stmt_cost; + + /* Cost of any fp vector operation, excluding the ones handled + specially below. */ + const int fp_stmt_cost; + + /* Cost of a permute operation. */ + const int permute_cost; + + /* Cost of reductions for various vector types: iN is for N-bit + integer elements and fN is for N-bit floating-point elements. + We need to single out the element type because it affects the + depth of the reduction. */ + const int reduc_i8_cost; + const int reduc_i16_cost; + const int reduc_i32_cost; + const int reduc_i64_cost; + const int reduc_f16_cost; + const int reduc_f32_cost; + const int reduc_f64_cost; + + /* Cost of a vector-to-scalar operation. */ + const int vec_to_scalar_cost; + + /* Cost of a scalar-to-vector operation. */ + const int scalar_to_vec_cost; + + /* Cost of an aligned vector load. */ + const int align_load_cost; + + /* Cost of an unaligned vector load. */ + const int unalign_load_cost; + + /* Cost of an unaligned vector store. */ + const int unalign_store_cost; + + /* Cost of a vector store. 
*/ + const int store_cost; }; typedef struct simd_vec_cost advsimd_vec_cost; diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def index 588edf4..a61fcf9 100644 --- a/gcc/config/aarch64/aarch64-tuning-flags.def +++ b/gcc/config/aarch64/aarch64-tuning-flags.def @@ -48,4 +48,6 @@ AARCH64_EXTRA_TUNING_OPTION ("rename_load_regs", RENAME_LOAD_REGS) AARCH64_EXTRA_TUNING_OPTION ("cse_sve_vl_constants", CSE_SVE_VL_CONSTANTS) +AARCH64_EXTRA_TUNING_OPTION ("use_new_vector_costs", USE_NEW_VECTOR_COSTS) + #undef AARCH64_EXTRA_TUNING_OPTION diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index c8a87fe..b44dcdc 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -591,6 +591,13 @@ static const advsimd_vec_cost generic_advsimd_vector_cost = 1, /* int_stmt_cost */ 1, /* fp_stmt_cost */ 2, /* permute_cost */ + 2, /* reduc_i8_cost */ + 2, /* reduc_i16_cost */ + 2, /* reduc_i32_cost */ + 2, /* reduc_i64_cost */ + 2, /* reduc_f16_cost */ + 2, /* reduc_f32_cost */ + 2, /* reduc_f64_cost */ 2, /* vec_to_scalar_cost */ 1, /* scalar_to_vec_cost */ 1, /* align_load_cost */ @@ -605,6 +612,13 @@ static const sve_vec_cost generic_sve_vector_cost = 1, /* int_stmt_cost */ 1, /* fp_stmt_cost */ 2, /* permute_cost */ + 2, /* reduc_i8_cost */ + 2, /* reduc_i16_cost */ + 2, /* reduc_i32_cost */ + 2, /* reduc_i64_cost */ + 2, /* reduc_f16_cost */ + 2, /* reduc_f32_cost */ + 2, /* reduc_f64_cost */ 2, /* vec_to_scalar_cost */ 1, /* scalar_to_vec_cost */ 1, /* align_load_cost */ @@ -631,6 +645,13 @@ static const advsimd_vec_cost a64fx_advsimd_vector_cost = 2, /* int_stmt_cost */ 5, /* fp_stmt_cost */ 3, /* permute_cost */ + 13, /* reduc_i8_cost */ + 13, /* reduc_i16_cost */ + 13, /* reduc_i32_cost */ + 13, /* reduc_i64_cost */ + 13, /* reduc_f16_cost */ + 13, /* reduc_f32_cost */ + 13, /* reduc_f64_cost */ 13, /* vec_to_scalar_cost */ 4, /* scalar_to_vec_cost */ 6, /* align_load_cost */ @@ -644,6 +665,13 @@ static const sve_vec_cost a64fx_sve_vector_cost = 2, /* int_stmt_cost */ 5, /* fp_stmt_cost */ 3, /* permute_cost */ + 13, /* reduc_i8_cost */ + 13, /* reduc_i16_cost */ + 13, /* reduc_i32_cost */ + 13, /* reduc_i64_cost */ + 13, /* reduc_f16_cost */ + 13, /* reduc_f32_cost */ + 13, /* reduc_f64_cost */ 13, /* vec_to_scalar_cost */ 4, /* scalar_to_vec_cost */ 6, /* align_load_cost */ @@ -669,6 +697,13 @@ static const advsimd_vec_cost qdf24xx_advsimd_vector_cost = 1, /* int_stmt_cost */ 3, /* fp_stmt_cost */ 2, /* permute_cost */ + 1, /* reduc_i8_cost */ + 1, /* reduc_i16_cost */ + 1, /* reduc_i32_cost */ + 1, /* reduc_i64_cost */ + 1, /* reduc_f16_cost */ + 1, /* reduc_f32_cost */ + 1, /* reduc_f64_cost */ 1, /* vec_to_scalar_cost */ 1, /* scalar_to_vec_cost */ 1, /* align_load_cost */ @@ -696,6 +731,13 @@ static const advsimd_vec_cost thunderx_advsimd_vector_cost = 4, /* int_stmt_cost */ 1, /* fp_stmt_cost */ 4, /* permute_cost */ + 2, /* reduc_i8_cost */ + 2, /* reduc_i16_cost */ + 2, /* reduc_i32_cost */ + 2, /* reduc_i64_cost */ + 2, /* reduc_f16_cost */ + 2, /* reduc_f32_cost */ + 2, /* reduc_f64_cost */ 2, /* vec_to_scalar_cost */ 2, /* scalar_to_vec_cost */ 3, /* align_load_cost */ @@ -722,6 +764,13 @@ static const advsimd_vec_cost tsv110_advsimd_vector_cost = 2, /* int_stmt_cost */ 2, /* fp_stmt_cost */ 2, /* permute_cost */ + 3, /* reduc_i8_cost */ + 3, /* reduc_i16_cost */ + 3, /* reduc_i32_cost */ + 3, /* reduc_i64_cost */ + 3, /* reduc_f16_cost */ + 3, /* reduc_f32_cost */ + 3, /* reduc_f64_cost */ 3, /* 
vec_to_scalar_cost */ 2, /* scalar_to_vec_cost */ 5, /* align_load_cost */ @@ -747,6 +796,13 @@ static const advsimd_vec_cost cortexa57_advsimd_vector_cost = 2, /* int_stmt_cost */ 2, /* fp_stmt_cost */ 3, /* permute_cost */ + 8, /* reduc_i8_cost */ + 8, /* reduc_i16_cost */ + 8, /* reduc_i32_cost */ + 8, /* reduc_i64_cost */ + 8, /* reduc_f16_cost */ + 8, /* reduc_f32_cost */ + 8, /* reduc_f64_cost */ 8, /* vec_to_scalar_cost */ 8, /* scalar_to_vec_cost */ 4, /* align_load_cost */ @@ -773,6 +829,13 @@ static const advsimd_vec_cost exynosm1_advsimd_vector_cost = 3, /* int_stmt_cost */ 3, /* fp_stmt_cost */ 3, /* permute_cost */ + 3, /* reduc_i8_cost */ + 3, /* reduc_i16_cost */ + 3, /* reduc_i32_cost */ + 3, /* reduc_i64_cost */ + 3, /* reduc_f16_cost */ + 3, /* reduc_f32_cost */ + 3, /* reduc_f64_cost */ 3, /* vec_to_scalar_cost */ 3, /* scalar_to_vec_cost */ 5, /* align_load_cost */ @@ -798,6 +861,13 @@ static const advsimd_vec_cost xgene1_advsimd_vector_cost = 2, /* int_stmt_cost */ 2, /* fp_stmt_cost */ 2, /* permute_cost */ + 4, /* reduc_i8_cost */ + 4, /* reduc_i16_cost */ + 4, /* reduc_i32_cost */ + 4, /* reduc_i64_cost */ + 4, /* reduc_f16_cost */ + 4, /* reduc_f32_cost */ + 4, /* reduc_f64_cost */ 4, /* vec_to_scalar_cost */ 4, /* scalar_to_vec_cost */ 10, /* align_load_cost */ @@ -824,6 +894,13 @@ static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost = 4, /* int_stmt_cost */ 5, /* fp_stmt_cost */ 10, /* permute_cost */ + 6, /* reduc_i8_cost */ + 6, /* reduc_i16_cost */ + 6, /* reduc_i32_cost */ + 6, /* reduc_i64_cost */ + 6, /* reduc_f16_cost */ + 6, /* reduc_f32_cost */ + 6, /* reduc_f64_cost */ 6, /* vec_to_scalar_cost */ 5, /* scalar_to_vec_cost */ 4, /* align_load_cost */ @@ -850,6 +927,13 @@ static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost = 5, /* int_stmt_cost */ 5, /* fp_stmt_cost */ 10, /* permute_cost */ + 5, /* reduc_i8_cost */ + 5, /* reduc_i16_cost */ + 5, /* reduc_i32_cost */ + 5, /* reduc_i64_cost */ + 5, /* reduc_f16_cost */ + 5, /* reduc_f32_cost */ + 5, /* reduc_f64_cost */ 5, /* vec_to_scalar_cost */ 5, /* scalar_to_vec_cost */ 4, /* align_load_cost */ @@ -13874,6 +13958,28 @@ aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn, /* Vectorizer cost model target hooks. */ +/* Return true if the current CPU should use the new costs defined + in GCC 11. This should be removed for GCC 12 and above, with the + costs applying to all CPUs instead. */ +static bool +aarch64_use_new_vector_costs_p () +{ + return (aarch64_tune_params.extra_tuning_flags + & AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS); +} + +/* Return the appropriate SIMD costs for vectors of type VECTYPE. */ +static const simd_vec_cost * +aarch64_simd_vec_costs (tree vectype) +{ + const cpu_vector_cost *costs = aarch64_tune_params.vec_costs; + if (vectype != NULL + && aarch64_sve_mode_p (TYPE_MODE (vectype)) + && costs->sve != NULL) + return costs->sve; + return costs->advsimd; +} + /* Implement targetm.vectorize.builtin_vectorization_cost. 
*/ static int aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, @@ -13887,12 +13993,7 @@ aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, if (vectype != NULL) fp = FLOAT_TYPE_P (vectype); - const simd_vec_cost *simd_costs; - if (vectype != NULL && aarch64_sve_mode_p (TYPE_MODE (vectype)) - && costs->sve != NULL) - simd_costs = costs->sve; - else - simd_costs = costs->advsimd; + const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype); switch (type_of_cost) { @@ -13951,6 +14052,14 @@ aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, } } +/* Return true if STMT_INFO represents part of a reduction. */ +static bool +aarch64_is_reduction (stmt_vec_info stmt_info) +{ + return (STMT_VINFO_REDUC_DEF (stmt_info) + || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))); +} + /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD vectors would produce a series of LDP or STP operations. KIND is the kind of statement that STMT_INFO represents. */ @@ -14015,6 +14124,57 @@ aarch64_integer_truncation_p (stmt_vec_info stmt_info) } /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost + for the vectorized form of STMT_INFO, which has cost kind KIND and which + when vectorized would operate on vector type VECTYPE. Try to subdivide + the target-independent categorization provided by KIND to get a more + accurate cost. WHERE specifies where the cost associated with KIND + occurs. */ +static unsigned int +aarch64_detect_vector_stmt_subtype (vect_cost_for_stmt kind, + stmt_vec_info stmt_info, tree vectype, + enum vect_cost_model_location where, + unsigned int stmt_cost) +{ + const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype); + + /* Detect cases in which vec_to_scalar represents a single reduction + instruction like FADDP or MAXV. */ + if (kind == vec_to_scalar + && where == vect_epilogue + && aarch64_is_reduction (stmt_info)) + switch (GET_MODE_INNER (TYPE_MODE (vectype))) + { + case E_QImode: + return simd_costs->reduc_i8_cost; + + case E_HImode: + return simd_costs->reduc_i16_cost; + + case E_SImode: + return simd_costs->reduc_i32_cost; + + case E_DImode: + return simd_costs->reduc_i64_cost; + + case E_HFmode: + case E_BFmode: + return simd_costs->reduc_f16_cost; + + case E_SFmode: + return simd_costs->reduc_f32_cost; + + case E_DFmode: + return simd_costs->reduc_f64_cost; + + default: + break; + } + + /* Otherwise stick with the original categorization. */ + return stmt_cost; +} + +/* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost for STMT_INFO, which has cost kind KIND and which when vectorized would operate on vector type VECTYPE. Adjust the cost as necessary for SVE targets. */ @@ -14097,6 +14257,14 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count, int stmt_cost = aarch64_builtin_vectorization_cost (kind, vectype, misalign); + /* Try to get a more accurate cost by looking at STMT_INFO instead + of just looking at KIND. */ + if (stmt_info && vectype && aarch64_use_new_vector_costs_p ()) + stmt_cost = aarch64_detect_vector_stmt_subtype (kind, stmt_info, + vectype, where, + stmt_cost); + + /* Do any SVE-specific adjustments to the cost. 
*/ if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype))) stmt_cost = aarch64_sve_adjust_stmt_cost (vinfo, kind, stmt_info, vectype, stmt_cost); -- cgit v1.1 From 1282988ba15337f21a940cd5d1c0b34a62378a2f Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Fri, 26 Mar 2021 16:08:30 +0000 Subject: aarch64: Add vector costs for SVE CLAST[AB] and FADDA Following on from the previous reduction costs patch, this one adds costs for the SVE CLAST[AB] and FADDA instructions. These instructions occur within the loop body, whereas the reductions handled by the previous patch occur outside. Like with the previous patch, this one only becomes active if a CPU selects use_new_vector_costs. It should therefore have a very low impact on other CPUs. gcc/ * config/aarch64/aarch64-protos.h (sve_vec_cost): Turn into a derived class of simd_vec_cost. Add information about CLAST[AB] and FADDA instructions. * config/aarch64/aarch64.c (generic_sve_vector_cost): Update accordingly, using the vec_to_scalar costs for the new fields. (a64fx_sve_vector_cost): Likewise. (aarch64_reduc_type): New function. (aarch64_sve_in_loop_reduction_latency): Likewise. (aarch64_detect_vector_stmt_subtype): Take a vinfo parameter. Use aarch64_sve_in_loop_reduction_latency to handle SVE reductions that occur in the loop body. (aarch64_add_stmt_cost): Update call accordingly. --- gcc/config/aarch64/aarch64-protos.h | 28 ++++++- gcc/config/aarch64/aarch64.c | 150 +++++++++++++++++++++++++++--------- 2 files changed, 141 insertions(+), 37 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index e4eeb2c..bfcab72 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -237,7 +237,33 @@ struct simd_vec_cost }; typedef struct simd_vec_cost advsimd_vec_cost; -typedef struct simd_vec_cost sve_vec_cost; + +/* SVE-specific extensions to the information provided by simd_vec_cost. */ +struct sve_vec_cost : simd_vec_cost +{ + constexpr sve_vec_cost (const simd_vec_cost &base, + unsigned int clast_cost, + unsigned int fadda_f16_cost, + unsigned int fadda_f32_cost, + unsigned int fadda_f64_cost) + : simd_vec_cost (base), + clast_cost (clast_cost), + fadda_f16_cost (fadda_f16_cost), + fadda_f32_cost (fadda_f32_cost), + fadda_f64_cost (fadda_f64_cost) + {} + + /* The cost of a vector-to-scalar CLASTA or CLASTB instruction, + with the scalar being stored in FP registers. This cost is + assumed to be a cycle latency. */ + const int clast_cost; + + /* The costs of FADDA for the three data types that it supports. + These costs are assumed to be cycle latencies. */ + const int fadda_f16_cost; + const int fadda_f32_cost; + const int fadda_f64_cost; +}; /* Cost for vector insn classes. */ struct cpu_vector_cost diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index b44dcdc..b62169a 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -609,22 +609,28 @@ static const advsimd_vec_cost generic_advsimd_vector_cost = /* Generic costs for SVE vector operations. 
*/ static const sve_vec_cost generic_sve_vector_cost = { - 1, /* int_stmt_cost */ - 1, /* fp_stmt_cost */ - 2, /* permute_cost */ - 2, /* reduc_i8_cost */ - 2, /* reduc_i16_cost */ - 2, /* reduc_i32_cost */ - 2, /* reduc_i64_cost */ - 2, /* reduc_f16_cost */ - 2, /* reduc_f32_cost */ - 2, /* reduc_f64_cost */ - 2, /* vec_to_scalar_cost */ - 1, /* scalar_to_vec_cost */ - 1, /* align_load_cost */ - 1, /* unalign_load_cost */ - 1, /* unalign_store_cost */ - 1 /* store_cost */ + { + 1, /* int_stmt_cost */ + 1, /* fp_stmt_cost */ + 2, /* permute_cost */ + 2, /* reduc_i8_cost */ + 2, /* reduc_i16_cost */ + 2, /* reduc_i32_cost */ + 2, /* reduc_i64_cost */ + 2, /* reduc_f16_cost */ + 2, /* reduc_f32_cost */ + 2, /* reduc_f64_cost */ + 2, /* vec_to_scalar_cost */ + 1, /* scalar_to_vec_cost */ + 1, /* align_load_cost */ + 1, /* unalign_load_cost */ + 1, /* unalign_store_cost */ + 1 /* store_cost */ + }, + 2, /* clast_cost */ + 2, /* fadda_f16_cost */ + 2, /* fadda_f32_cost */ + 2 /* fadda_f64_cost */ }; /* Generic costs for vector insn classes. */ @@ -662,22 +668,28 @@ static const advsimd_vec_cost a64fx_advsimd_vector_cost = static const sve_vec_cost a64fx_sve_vector_cost = { - 2, /* int_stmt_cost */ - 5, /* fp_stmt_cost */ - 3, /* permute_cost */ - 13, /* reduc_i8_cost */ - 13, /* reduc_i16_cost */ - 13, /* reduc_i32_cost */ - 13, /* reduc_i64_cost */ - 13, /* reduc_f16_cost */ - 13, /* reduc_f32_cost */ - 13, /* reduc_f64_cost */ - 13, /* vec_to_scalar_cost */ - 4, /* scalar_to_vec_cost */ - 6, /* align_load_cost */ - 6, /* unalign_load_cost */ - 1, /* unalign_store_cost */ - 1 /* store_cost */ + { + 2, /* int_stmt_cost */ + 5, /* fp_stmt_cost */ + 3, /* permute_cost */ + 13, /* reduc_i8_cost */ + 13, /* reduc_i16_cost */ + 13, /* reduc_i32_cost */ + 13, /* reduc_i64_cost */ + 13, /* reduc_f16_cost */ + 13, /* reduc_f32_cost */ + 13, /* reduc_f64_cost */ + 13, /* vec_to_scalar_cost */ + 4, /* scalar_to_vec_cost */ + 6, /* align_load_cost */ + 6, /* unalign_load_cost */ + 1, /* unalign_store_cost */ + 1 /* store_cost */ + }, + 13, /* clast_cost */ + 13, /* fadda_f16_cost */ + 13, /* fadda_f32_cost */ + 13 /* fadda_f64_cost */ }; static const struct cpu_vector_cost a64fx_vector_cost = @@ -14060,6 +14072,20 @@ aarch64_is_reduction (stmt_vec_info stmt_info) || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))); } +/* If STMT_INFO describes a reduction, return the type of reduction + it describes, otherwise return -1. */ +static int +aarch64_reduc_type (vec_info *vinfo, stmt_vec_info stmt_info) +{ + if (loop_vec_info loop_vinfo = dyn_cast (vinfo)) + if (STMT_VINFO_REDUC_DEF (stmt_info)) + { + stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info); + return int (STMT_VINFO_REDUC_TYPE (reduc_info)); + } + return -1; +} + /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD vectors would produce a series of LDP or STP operations. KIND is the kind of statement that STMT_INFO represents. */ @@ -14123,6 +14149,43 @@ aarch64_integer_truncation_p (stmt_vec_info stmt_info) && TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type)); } +/* We are considering implementing STMT_INFO using SVE vector type VECTYPE. + If STMT_INFO is an in-loop reduction that SVE supports directly, return + its latency in cycles, otherwise return zero. SVE_COSTS specifies the + latencies of the relevant instructions. 
*/ +static unsigned int +aarch64_sve_in_loop_reduction_latency (vec_info *vinfo, + stmt_vec_info stmt_info, + tree vectype, + const sve_vec_cost *sve_costs) +{ + switch (aarch64_reduc_type (vinfo, stmt_info)) + { + case EXTRACT_LAST_REDUCTION: + return sve_costs->clast_cost; + + case FOLD_LEFT_REDUCTION: + switch (GET_MODE_INNER (TYPE_MODE (vectype))) + { + case E_HFmode: + case E_BFmode: + return sve_costs->fadda_f16_cost; + + case E_SFmode: + return sve_costs->fadda_f32_cost; + + case E_DFmode: + return sve_costs->fadda_f64_cost; + + default: + break; + } + break; + } + + return 0; +} + /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost for the vectorized form of STMT_INFO, which has cost kind KIND and which when vectorized would operate on vector type VECTYPE. Try to subdivide @@ -14130,12 +14193,27 @@ aarch64_integer_truncation_p (stmt_vec_info stmt_info) accurate cost. WHERE specifies where the cost associated with KIND occurs. */ static unsigned int -aarch64_detect_vector_stmt_subtype (vect_cost_for_stmt kind, +aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind, stmt_vec_info stmt_info, tree vectype, enum vect_cost_model_location where, unsigned int stmt_cost) { const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype); + const sve_vec_cost *sve_costs = nullptr; + if (aarch64_sve_mode_p (TYPE_MODE (vectype))) + sve_costs = aarch64_tune_params.vec_costs->sve; + + /* Detect cases in which vec_to_scalar represents an in-loop reduction. */ + if (kind == vec_to_scalar + && where == vect_body + && sve_costs) + { + unsigned int latency + = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, vectype, + sve_costs); + if (latency) + return latency; + } /* Detect cases in which vec_to_scalar represents a single reduction instruction like FADDP or MAXV. */ @@ -14260,9 +14338,9 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count, /* Try to get a more accurate cost by looking at STMT_INFO instead of just looking at KIND. */ if (stmt_info && vectype && aarch64_use_new_vector_costs_p ()) - stmt_cost = aarch64_detect_vector_stmt_subtype (kind, stmt_info, - vectype, where, - stmt_cost); + stmt_cost = aarch64_detect_vector_stmt_subtype (vinfo, kind, + stmt_info, vectype, + where, stmt_cost); /* Do any SVE-specific adjustments to the cost. */ if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype))) -- cgit v1.1 From b1a831f0dd869543788f08f94dc7ff64df3f2064 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Fri, 26 Mar 2021 16:08:31 +0000 Subject: aarch64: Add costs for LD[234]/ST[234] permutes At the moment, we cost LD[234] and ST[234] as N vector loads or stores, which effectively treats the implied permute as free. This patch adds additional costs for the permutes, which apply on top of the costs for the loads and stores. Like with the previous patches, this one only becomes active if a CPU selects use_new_vector_costs. It should therefore have a very low impact on other CPUs. gcc/ * config/aarch64/aarch64-protos.h (simd_vec_cost::ld2_st2_permute_cost) (simd_vec_cost::ld3_st3_permute_cost): New member variables. (simd_vec_cost::ld4_st4_permute_cost): Likewise. * config/aarch64/aarch64.c (generic_advsimd_vector_cost): Update accordingly, using zero for the new costs. (generic_sve_vector_cost, a64fx_advsimd_vector_cost): Likewise. (a64fx_sve_vector_cost, qdf24xx_advsimd_vector_cost): Likewise. (thunderx_advsimd_vector_cost, tsv110_advsimd_vector_cost): Likewise. 
(cortexa57_advsimd_vector_cost, exynosm1_advsimd_vector_cost) (xgene1_advsimd_vector_cost, thunderx2t99_advsimd_vector_cost) (thunderx3t110_advsimd_vector_cost): Likewise. (aarch64_ld234_st234_vectors): New function. (aarch64_adjust_stmt_cost): Likewise. (aarch64_add_stmt_cost): Call aarch64_adjust_stmt_cost if using the new vector costs. --- gcc/config/aarch64/aarch64-protos.h | 7 +++ gcc/config/aarch64/aarch64.c | 94 +++++++++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index bfcab72..3d15275 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -202,6 +202,13 @@ struct simd_vec_cost specially below. */ const int fp_stmt_cost; + /* Per-vector cost of permuting vectors after an LD2, LD3 or LD4, + as well as the per-vector cost of permuting vectors before + an ST2, ST3 or ST4. */ + const int ld2_st2_permute_cost; + const int ld3_st3_permute_cost; + const int ld4_st4_permute_cost; + /* Cost of a permute operation. */ const int permute_cost; diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index b62169a..8fb723d 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -590,6 +590,9 @@ static const advsimd_vec_cost generic_advsimd_vector_cost = { 1, /* int_stmt_cost */ 1, /* fp_stmt_cost */ + 0, /* ld2_st2_permute_cost */ + 0, /* ld3_st3_permute_cost */ + 0, /* ld4_st4_permute_cost */ 2, /* permute_cost */ 2, /* reduc_i8_cost */ 2, /* reduc_i16_cost */ @@ -612,6 +615,9 @@ static const sve_vec_cost generic_sve_vector_cost = { 1, /* int_stmt_cost */ 1, /* fp_stmt_cost */ + 0, /* ld2_st2_permute_cost */ + 0, /* ld3_st3_permute_cost */ + 0, /* ld4_st4_permute_cost */ 2, /* permute_cost */ 2, /* reduc_i8_cost */ 2, /* reduc_i16_cost */ @@ -650,6 +656,9 @@ static const advsimd_vec_cost a64fx_advsimd_vector_cost = { 2, /* int_stmt_cost */ 5, /* fp_stmt_cost */ + 0, /* ld2_st2_permute_cost */ + 0, /* ld3_st3_permute_cost */ + 0, /* ld4_st4_permute_cost */ 3, /* permute_cost */ 13, /* reduc_i8_cost */ 13, /* reduc_i16_cost */ @@ -671,6 +680,9 @@ static const sve_vec_cost a64fx_sve_vector_cost = { 2, /* int_stmt_cost */ 5, /* fp_stmt_cost */ + 0, /* ld2_st2_permute_cost */ + 0, /* ld3_st3_permute_cost */ + 0, /* ld4_st4_permute_cost */ 3, /* permute_cost */ 13, /* reduc_i8_cost */ 13, /* reduc_i16_cost */ @@ -708,6 +720,9 @@ static const advsimd_vec_cost qdf24xx_advsimd_vector_cost = { 1, /* int_stmt_cost */ 3, /* fp_stmt_cost */ + 0, /* ld2_st2_permute_cost */ + 0, /* ld3_st3_permute_cost */ + 0, /* ld4_st4_permute_cost */ 2, /* permute_cost */ 1, /* reduc_i8_cost */ 1, /* reduc_i16_cost */ @@ -742,6 +757,9 @@ static const advsimd_vec_cost thunderx_advsimd_vector_cost = { 4, /* int_stmt_cost */ 1, /* fp_stmt_cost */ + 0, /* ld2_st2_permute_cost */ + 0, /* ld3_st3_permute_cost */ + 0, /* ld4_st4_permute_cost */ 4, /* permute_cost */ 2, /* reduc_i8_cost */ 2, /* reduc_i16_cost */ @@ -775,6 +793,9 @@ static const advsimd_vec_cost tsv110_advsimd_vector_cost = { 2, /* int_stmt_cost */ 2, /* fp_stmt_cost */ + 0, /* ld2_st2_permute_cost */ + 0, /* ld3_st3_permute_cost */ + 0, /* ld4_st4_permute_cost */ 2, /* permute_cost */ 3, /* reduc_i8_cost */ 3, /* reduc_i16_cost */ @@ -807,6 +828,9 @@ static const advsimd_vec_cost cortexa57_advsimd_vector_cost = { 2, /* int_stmt_cost */ 2, /* fp_stmt_cost */ + 0, /* ld2_st2_permute_cost */ + 0, /* ld3_st3_permute_cost */ + 0, /* ld4_st4_permute_cost */ 3, 
/* permute_cost */ 8, /* reduc_i8_cost */ 8, /* reduc_i16_cost */ @@ -840,6 +864,9 @@ static const advsimd_vec_cost exynosm1_advsimd_vector_cost = { 3, /* int_stmt_cost */ 3, /* fp_stmt_cost */ + 0, /* ld2_st2_permute_cost */ + 0, /* ld3_st3_permute_cost */ + 0, /* ld4_st4_permute_cost */ 3, /* permute_cost */ 3, /* reduc_i8_cost */ 3, /* reduc_i16_cost */ @@ -872,6 +899,9 @@ static const advsimd_vec_cost xgene1_advsimd_vector_cost = { 2, /* int_stmt_cost */ 2, /* fp_stmt_cost */ + 0, /* ld2_st2_permute_cost */ + 0, /* ld3_st3_permute_cost */ + 0, /* ld4_st4_permute_cost */ 2, /* permute_cost */ 4, /* reduc_i8_cost */ 4, /* reduc_i16_cost */ @@ -905,6 +935,9 @@ static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost = { 4, /* int_stmt_cost */ 5, /* fp_stmt_cost */ + 0, /* ld2_st2_permute_cost */ + 0, /* ld3_st3_permute_cost */ + 0, /* ld4_st4_permute_cost */ 10, /* permute_cost */ 6, /* reduc_i8_cost */ 6, /* reduc_i16_cost */ @@ -938,6 +971,9 @@ static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost = { 5, /* int_stmt_cost */ 5, /* fp_stmt_cost */ + 0, /* ld2_st2_permute_cost */ + 0, /* ld3_st3_permute_cost */ + 0, /* ld4_st4_permute_cost */ 10, /* permute_cost */ 5, /* reduc_i8_cost */ 5, /* reduc_i16_cost */ @@ -14086,6 +14122,26 @@ aarch64_reduc_type (vec_info *vinfo, stmt_vec_info stmt_info) return -1; } +/* Return true if an access of kind KIND for STMT_INFO represents one + vector of an LD[234] or ST[234] operation. Return the total number of + vectors (2, 3 or 4) if so, otherwise return a value outside that range. */ +static int +aarch64_ld234_st234_vectors (vect_cost_for_stmt kind, stmt_vec_info stmt_info) +{ + if ((kind == vector_load + || kind == unaligned_load + || kind == vector_store + || kind == unaligned_store) + && STMT_VINFO_DATA_REF (stmt_info)) + { + stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); + if (stmt_info + && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_LOAD_STORE_LANES) + return DR_GROUP_SIZE (stmt_info); + } + return 0; +} + /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD vectors would produce a series of LDP or STP operations. KIND is the kind of statement that STMT_INFO represents. */ @@ -14320,6 +14376,38 @@ aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind, return stmt_cost; } +/* STMT_COST is the cost calculated for STMT_INFO, which has cost kind KIND + and which when vectorized would operate on vector type VECTYPE. Add the + cost of any embedded operations. */ +static unsigned int +aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info, + tree vectype, unsigned int stmt_cost) +{ + if (vectype) + { + const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype); + + /* Detect cases in which a vector load or store represents an + LD[234] or ST[234] instruction. */ + switch (aarch64_ld234_st234_vectors (kind, stmt_info)) + { + case 2: + stmt_cost += simd_costs->ld2_st2_permute_cost; + break; + + case 3: + stmt_cost += simd_costs->ld3_st3_permute_cost; + break; + + case 4: + stmt_cost += simd_costs->ld4_st4_permute_cost; + break; + } + } + + return stmt_cost; +} + /* Implement targetm.vectorize.add_stmt_cost. 
*/ static unsigned aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count, @@ -14347,6 +14435,12 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count, stmt_cost = aarch64_sve_adjust_stmt_cost (vinfo, kind, stmt_info, vectype, stmt_cost); + if (stmt_info && aarch64_use_new_vector_costs_p ()) + /* Account for any extra "embedded" costs that apply additively + to the base cost calculated above. */ + stmt_cost = aarch64_adjust_stmt_cost (kind, stmt_info, vectype, + stmt_cost); + /* Statements in an inner loop relative to the loop being vectorized are weighted more heavily. The value here is arbitrary and could potentially be improved with analysis. */ -- cgit v1.1 From d1ff0847b2df6ad21f77e26e7e516643c5aa7d40 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Fri, 26 Mar 2021 16:08:31 +0000 Subject: aarch64: Add costs for storing one element of a vector Storing one element of a vector is costed as a vec_to_scalar followed by a scalar_store. However, vec_to_scalar is also used for reductions and for vector-to-GPR moves, which makes it difficult to pick one cost for them all. This patch therefore adds a cost for extracting one element of a vector in preparation for storing it out. The store itself is still costed separately. Like with the previous patches, this one only becomes active if a CPU selects use_new_vector_costs. It should therefore have a very low impact on other CPUs. gcc/ * config/aarch64/aarch64-protos.h (simd_vec_cost::store_elt_extra_cost): New member variable. * config/aarch64/aarch64.c (generic_advsimd_vector_cost): Update accordingly, using the vec_to_scalar cost for the new field. (generic_sve_vector_cost, a64fx_advsimd_vector_cost): Likewise. (a64fx_sve_vector_cost, qdf24xx_advsimd_vector_cost): Likewise. (thunderx_advsimd_vector_cost, tsv110_advsimd_vector_cost): Likewise. (cortexa57_advsimd_vector_cost, exynosm1_advsimd_vector_cost) (xgene1_advsimd_vector_cost, thunderx2t99_advsimd_vector_cost) (thunderx3t110_advsimd_vector_cost): Likewise. (aarch64_detect_vector_stmt_subtype): Detect single-element stores. --- gcc/config/aarch64/aarch64-protos.h | 4 ++++ gcc/config/aarch64/aarch64.c | 20 ++++++++++++++++++++ 2 files changed, 24 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index 3d15275..fabe3df 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -224,6 +224,10 @@ struct simd_vec_cost const int reduc_f32_cost; const int reduc_f64_cost; + /* Additional cost of storing a single vector element, on top of the + normal cost of a scalar store. */ + const int store_elt_extra_cost; + /* Cost of a vector-to-scalar operation. 
*/ const int vec_to_scalar_cost; diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 8fb723d..20bb75b 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -601,6 +601,7 @@ static const advsimd_vec_cost generic_advsimd_vector_cost = 2, /* reduc_f16_cost */ 2, /* reduc_f32_cost */ 2, /* reduc_f64_cost */ + 2, /* store_elt_extra_cost */ 2, /* vec_to_scalar_cost */ 1, /* scalar_to_vec_cost */ 1, /* align_load_cost */ @@ -626,6 +627,7 @@ static const sve_vec_cost generic_sve_vector_cost = 2, /* reduc_f16_cost */ 2, /* reduc_f32_cost */ 2, /* reduc_f64_cost */ + 2, /* store_elt_extra_cost */ 2, /* vec_to_scalar_cost */ 1, /* scalar_to_vec_cost */ 1, /* align_load_cost */ @@ -667,6 +669,7 @@ static const advsimd_vec_cost a64fx_advsimd_vector_cost = 13, /* reduc_f16_cost */ 13, /* reduc_f32_cost */ 13, /* reduc_f64_cost */ + 13, /* store_elt_extra_cost */ 13, /* vec_to_scalar_cost */ 4, /* scalar_to_vec_cost */ 6, /* align_load_cost */ @@ -691,6 +694,7 @@ static const sve_vec_cost a64fx_sve_vector_cost = 13, /* reduc_f16_cost */ 13, /* reduc_f32_cost */ 13, /* reduc_f64_cost */ + 13, /* store_elt_extra_cost */ 13, /* vec_to_scalar_cost */ 4, /* scalar_to_vec_cost */ 6, /* align_load_cost */ @@ -731,6 +735,7 @@ static const advsimd_vec_cost qdf24xx_advsimd_vector_cost = 1, /* reduc_f16_cost */ 1, /* reduc_f32_cost */ 1, /* reduc_f64_cost */ + 1, /* store_elt_extra_cost */ 1, /* vec_to_scalar_cost */ 1, /* scalar_to_vec_cost */ 1, /* align_load_cost */ @@ -768,6 +773,7 @@ static const advsimd_vec_cost thunderx_advsimd_vector_cost = 2, /* reduc_f16_cost */ 2, /* reduc_f32_cost */ 2, /* reduc_f64_cost */ + 2, /* store_elt_extra_cost */ 2, /* vec_to_scalar_cost */ 2, /* scalar_to_vec_cost */ 3, /* align_load_cost */ @@ -804,6 +810,7 @@ static const advsimd_vec_cost tsv110_advsimd_vector_cost = 3, /* reduc_f16_cost */ 3, /* reduc_f32_cost */ 3, /* reduc_f64_cost */ + 3, /* store_elt_extra_cost */ 3, /* vec_to_scalar_cost */ 2, /* scalar_to_vec_cost */ 5, /* align_load_cost */ @@ -839,6 +846,7 @@ static const advsimd_vec_cost cortexa57_advsimd_vector_cost = 8, /* reduc_f16_cost */ 8, /* reduc_f32_cost */ 8, /* reduc_f64_cost */ + 8, /* store_elt_extra_cost */ 8, /* vec_to_scalar_cost */ 8, /* scalar_to_vec_cost */ 4, /* align_load_cost */ @@ -875,6 +883,7 @@ static const advsimd_vec_cost exynosm1_advsimd_vector_cost = 3, /* reduc_f16_cost */ 3, /* reduc_f32_cost */ 3, /* reduc_f64_cost */ + 3, /* store_elt_extra_cost */ 3, /* vec_to_scalar_cost */ 3, /* scalar_to_vec_cost */ 5, /* align_load_cost */ @@ -910,6 +919,7 @@ static const advsimd_vec_cost xgene1_advsimd_vector_cost = 4, /* reduc_f16_cost */ 4, /* reduc_f32_cost */ 4, /* reduc_f64_cost */ + 4, /* store_elt_extra_cost */ 4, /* vec_to_scalar_cost */ 4, /* scalar_to_vec_cost */ 10, /* align_load_cost */ @@ -946,6 +956,7 @@ static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost = 6, /* reduc_f16_cost */ 6, /* reduc_f32_cost */ 6, /* reduc_f64_cost */ + 6, /* store_elt_extra_cost */ 6, /* vec_to_scalar_cost */ 5, /* scalar_to_vec_cost */ 4, /* align_load_cost */ @@ -982,6 +993,7 @@ static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost = 5, /* reduc_f16_cost */ 5, /* reduc_f32_cost */ 5, /* reduc_f64_cost */ + 5, /* store_elt_extra_cost */ 5, /* vec_to_scalar_cost */ 5, /* scalar_to_vec_cost */ 4, /* align_load_cost */ @@ -14259,6 +14271,14 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind, if (aarch64_sve_mode_p (TYPE_MODE (vectype))) sve_costs = 
aarch64_tune_params.vec_costs->sve; + /* Detect cases in which vec_to_scalar is describing the extraction of a + vector element in preparation for a scalar store. The store itself is + costed separately. */ + if (kind == vec_to_scalar + && STMT_VINFO_DATA_REF (stmt_info) + && DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info))) + return simd_costs->store_elt_extra_cost; + /* Detect cases in which vec_to_scalar represents an in-loop reduction. */ if (kind == vec_to_scalar && where == vect_body -- cgit v1.1 From 7c679969bac9b7ae5e9446bfaa5466e19063d690 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Fri, 26 Mar 2021 16:08:32 +0000 Subject: aarch64: Add costs for one element of a scatter store Currently each element in a gather load is costed as a scalar_load and each element in a scatter store is costed as a scalar_store. The load side seems to work pretty well in practice, since many CPU-specific costs give loads quite a high cost relative to arithmetic operations. However, stores usually have a cost of just 1, which means that scatters tend to appear too cheap. This patch adds a separate cost for one element in a scatter store. Like with the previous patches, this one only becomes active if a CPU selects use_new_vector_costs. It should therefore have a very low impact on other CPUs. gcc/ * config/aarch64/aarch64-protos.h (sve_vec_cost::scatter_store_elt_cost): New member variable. * config/aarch64/aarch64.c (generic_sve_vector_cost): Update accordingly, taking the cost from the cost of a scalar_store. (a64fx_sve_vector_cost): Likewise. (aarch64_detect_vector_stmt_subtype): Detect scatter stores. --- gcc/config/aarch64/aarch64-protos.h | 9 +++++++-- gcc/config/aarch64/aarch64.c | 13 +++++++++++-- 2 files changed, 18 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index fabe3df..2ffa96e 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -256,12 +256,14 @@ struct sve_vec_cost : simd_vec_cost unsigned int clast_cost, unsigned int fadda_f16_cost, unsigned int fadda_f32_cost, - unsigned int fadda_f64_cost) + unsigned int fadda_f64_cost, + unsigned int scatter_store_elt_cost) : simd_vec_cost (base), clast_cost (clast_cost), fadda_f16_cost (fadda_f16_cost), fadda_f32_cost (fadda_f32_cost), - fadda_f64_cost (fadda_f64_cost) + fadda_f64_cost (fadda_f64_cost), + scatter_store_elt_cost (scatter_store_elt_cost) {} /* The cost of a vector-to-scalar CLASTA or CLASTB instruction, @@ -274,6 +276,9 @@ struct sve_vec_cost : simd_vec_cost const int fadda_f16_cost; const int fadda_f32_cost; const int fadda_f64_cost; + + /* The per-element cost of a scatter store. */ + const int scatter_store_elt_cost; }; /* Cost for vector insn classes. */ diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 20bb75b..7f72741 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -638,7 +638,8 @@ static const sve_vec_cost generic_sve_vector_cost = 2, /* clast_cost */ 2, /* fadda_f16_cost */ 2, /* fadda_f32_cost */ - 2 /* fadda_f64_cost */ + 2, /* fadda_f64_cost */ + 1 /* scatter_store_elt_cost */ }; /* Generic costs for vector insn classes. 
*/ @@ -705,7 +706,8 @@ static const sve_vec_cost a64fx_sve_vector_cost = 13, /* clast_cost */ 13, /* fadda_f16_cost */ 13, /* fadda_f32_cost */ - 13 /* fadda_f64_cost */ + 13, /* fadda_f64_cost */ + 1 /* scatter_store_elt_cost */ }; static const struct cpu_vector_cost a64fx_vector_cost = @@ -14279,6 +14281,13 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind, && DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info))) return simd_costs->store_elt_extra_cost; + /* Detect cases in which a scalar_store is really storing one element + in a scatter operation. */ + if (kind == scalar_store + && sve_costs + && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER) + return sve_costs->scatter_store_elt_cost; + /* Detect cases in which vec_to_scalar represents an in-loop reduction. */ if (kind == vec_to_scalar && where == vect_body -- cgit v1.1 From 14bd21c2c576d6f4b9bd403f543502cff40f54fc Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Fri, 26 Mar 2021 16:08:33 +0000 Subject: aarch64: Add a CPU-specific cost table for Neoverse V1 This patch adds dedicated vector costs for Neoverse V1. Previously we just used the Cortex-A57 costs, which isn't ideal given that Cortex-A57 doesn't support SVE. gcc/ * config/aarch64/aarch64.c (neoversev1_advsimd_vector_cost) (neoversev1_sve_vector_cost): New cost structures. (neoversev1_vector_cost): Likewise. (neoversev1_tunings): Use them. Enable use_new_vector_costs. --- gcc/config/aarch64/aarch64.c | 95 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 93 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 7f72741..2e9853e 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -1619,12 +1619,102 @@ static const struct tune_params neoversen1_tunings = &generic_prefetch_tune }; +static const advsimd_vec_cost neoversev1_advsimd_vector_cost = +{ + 2, /* int_stmt_cost */ + 2, /* fp_stmt_cost */ + 4, /* ld2_st2_permute_cost */ + 4, /* ld3_st3_permute_cost */ + 5, /* ld4_st4_permute_cost */ + 3, /* permute_cost */ + 4, /* reduc_i8_cost */ + 4, /* reduc_i16_cost */ + 2, /* reduc_i32_cost */ + 2, /* reduc_i64_cost */ + 6, /* reduc_f16_cost */ + 3, /* reduc_f32_cost */ + 2, /* reduc_f64_cost */ + 2, /* store_elt_extra_cost */ + /* This value is just inherited from the Cortex-A57 table. */ + 8, /* vec_to_scalar_cost */ + /* This depends very much on what the scalar value is and + where it comes from. E.g. some constants take two dependent + instructions or a load, while others might be moved from a GPR. + 4 seems to be a reasonable compromise in practice. */ + 4, /* scalar_to_vec_cost */ + 4, /* align_load_cost */ + 4, /* unalign_load_cost */ + /* Although stores have a latency of 2 and compete for the + vector pipes, in practice it's better not to model that. */ + 1, /* unalign_store_cost */ + 1 /* store_cost */ +}; + +static const sve_vec_cost neoversev1_sve_vector_cost = +{ + { + 2, /* int_stmt_cost */ + 2, /* fp_stmt_cost */ + 4, /* ld2_st2_permute_cost */ + 7, /* ld3_st3_permute_cost */ + 8, /* ld4_st4_permute_cost */ + 3, /* permute_cost */ + /* Theoretically, a reduction involving 31 scalar ADDs could + complete in ~9 cycles and would have a cost of 31. [SU]ADDV + completes in 14 cycles, so give it a cost of 31 + 5. */ + 36, /* reduc_i8_cost */ + /* Likewise for 15 scalar ADDs (~5 cycles) vs. 12: 15 + 7. */ + 22, /* reduc_i16_cost */ + /* Likewise for 7 scalar ADDs (~3 cycles) vs. 10: 7 + 7. 
*/ + 14, /* reduc_i32_cost */ + /* Likewise for 3 scalar ADDs (~2 cycles) vs. 10: 3 + 8. */ + 11, /* reduc_i64_cost */ + /* Theoretically, a reduction involving 15 scalar FADDs could + complete in ~9 cycles and would have a cost of 30. FADDV + completes in 13 cycles, so give it a cost of 30 + 4. */ + 34, /* reduc_f16_cost */ + /* Likewise for 7 scalar FADDs (~6 cycles) vs. 11: 14 + 5. */ + 19, /* reduc_f32_cost */ + /* Likewise for 3 scalar FADDs (~4 cycles) vs. 9: 6 + 5. */ + 11, /* reduc_f64_cost */ + 2, /* store_elt_extra_cost */ + /* This value is just inherited from the Cortex-A57 table. */ + 8, /* vec_to_scalar_cost */ + /* See the comment above the Advanced SIMD versions. */ + 4, /* scalar_to_vec_cost */ + 4, /* align_load_cost */ + 4, /* unalign_load_cost */ + /* Although stores have a latency of 2 and compete for the + vector pipes, in practice it's better not to model that. */ + 1, /* unalign_store_cost */ + 1 /* store_cost */ + }, + 3, /* clast_cost */ + 19, /* fadda_f16_cost */ + 11, /* fadda_f32_cost */ + 8, /* fadda_f64_cost */ + 3 /* scatter_store_elt_cost */ +}; + +/* Neoverse V1 costs for vector insn classes. */ +static const struct cpu_vector_cost neoversev1_vector_cost = +{ + 1, /* scalar_int_stmt_cost */ + 2, /* scalar_fp_stmt_cost */ + 4, /* scalar_load_cost */ + 1, /* scalar_store_cost */ + 1, /* cond_taken_branch_cost */ + 1, /* cond_not_taken_branch_cost */ + &neoversev1_advsimd_vector_cost, /* advsimd */ + &neoversev1_sve_vector_cost /* sve */ +}; + static const struct tune_params neoversev1_tunings = { &cortexa76_extra_costs, &generic_addrcost_table, &generic_regmove_cost, - &cortexa57_vector_cost, + &neoversev1_vector_cost, &generic_branch_cost, &generic_approx_modes, SVE_256, /* sve_width */ @@ -1641,7 +1731,8 @@ static const struct tune_params neoversev1_tunings = 2, /* min_div_recip_mul_df. */ 0, /* max_case_values. */ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ - (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS), /* tune_flags. */ + (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS + | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS), /* tune_flags. */ &generic_prefetch_tune }; -- cgit v1.1 From 50a525b50c912999073a78220c6d62d87946b579 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Fri, 26 Mar 2021 16:08:34 +0000 Subject: aarch64: Use an aarch64-specific structure for vector costing This patch makes the AArch64 vector code use its own vector costs structure, rather than just using the default unsigned[3]. Unfortunately, it's not easy to make this change specific to use_new_vector_costs, so this part is one that affects all CPUs. The change is relatively mechanical though. gcc/ * config/aarch64/aarch64.c (aarch64_vector_costs): New structure. (aarch64_init_cost): New function. (aarch64_add_stmt_cost): Use aarch64_vector_costs instead of the default unsigned[3]. (aarch64_finish_cost, aarch64_destroy_cost_data): New functions. (TARGET_VECTORIZE_INIT_COST): Override. (TARGET_VECTORIZE_FINISH_COST): Likewise. (TARGET_VECTORIZE_DESTROY_COST_DATA): Likewise. --- gcc/config/aarch64/aarch64.c | 46 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 2e9853e..81683b7 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -14111,6 +14111,21 @@ aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn, /* Vectorizer cost model target hooks. 
*/ +/* Information about vector code that we're in the process of costing. */ +struct aarch64_vector_costs +{ + /* The normal latency-based costs for each region (prologue, body and + epilogue), indexed by vect_cost_model_location. */ + unsigned int region[3] = {}; +}; + +/* Implement TARGET_VECTORIZE_INIT_COST. */ +void * +aarch64_init_cost (class loop *) +{ + return new aarch64_vector_costs; +} + /* Return true if the current CPU should use the new costs defined in GCC 11. This should be removed for GCC 12 and above, with the costs applying to all CPUs instead. */ @@ -14535,7 +14550,7 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count, struct _stmt_vec_info *stmt_info, tree vectype, int misalign, enum vect_cost_model_location where) { - unsigned *cost = (unsigned *) data; + auto *costs = static_cast (data); unsigned retval = 0; if (flag_vect_cost_model) @@ -14569,12 +14584,30 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count, count *= 50; /* FIXME */ retval = (unsigned) (count * stmt_cost); - cost[where] += retval; + costs->region[where] += retval; } return retval; } +/* Implement TARGET_VECTORIZE_FINISH_COST. */ +static void +aarch64_finish_cost (void *data, unsigned *prologue_cost, + unsigned *body_cost, unsigned *epilogue_cost) +{ + auto *costs = static_cast (data); + *prologue_cost = costs->region[vect_prologue]; + *body_cost = costs->region[vect_body]; + *epilogue_cost = costs->region[vect_epilogue]; +} + +/* Implement TARGET_VECTORIZE_DESTROY_COST_DATA. */ +static void +aarch64_destroy_cost_data (void *data) +{ + delete static_cast (data); +} + static void initialize_aarch64_code_model (struct gcc_options *); /* Parse the TO_PARSE string and put the architecture struct that it @@ -24713,9 +24746,18 @@ aarch64_libgcc_floating_mode_supported_p #undef TARGET_ARRAY_MODE_SUPPORTED_P #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p +#undef TARGET_VECTORIZE_INIT_COST +#define TARGET_VECTORIZE_INIT_COST aarch64_init_cost + #undef TARGET_VECTORIZE_ADD_STMT_COST #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost +#undef TARGET_VECTORIZE_FINISH_COST +#define TARGET_VECTORIZE_FINISH_COST aarch64_finish_cost + +#undef TARGET_VECTORIZE_DESTROY_COST_DATA +#define TARGET_VECTORIZE_DESTROY_COST_DATA aarch64_destroy_cost_data + #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \ aarch64_builtin_vectorization_cost -- cgit v1.1 From 3b924b0d7c0218956dbc2ce0ca2740e8923c2c4a Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Fri, 26 Mar 2021 16:08:35 +0000 Subject: aarch64: Try to detect when Advanced SIMD code would be completely unrolled GCC usually costs the SVE and Advanced SIMD versions of a loop and picks the one with the lowest cost. By default it will choose SVE over Advanced SIMD in the event of tie. This is normally the correct behaviour, not least because SVE can handle every scalar iteration count whereas Advanced SIMD can only handle full vectors. However, there is one important exception that GCC failed to consider: we can completely unroll Advanced SIMD code at compile time, but we can't do the same for SVE. This patch therefore adds an opt-in heuristic to guess whether the Advanced SIMD version of a loop is likely to be unrolled. This will only be suitable for some CPUs, so it is not enabled by default and is controlled separately from use_new_vector_costs. 
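As a concrete illustration (this example is not part of the patch), consider a loop with a small constant trip count:

  void
  scale16 (float *restrict a, const float *restrict b)
  {
    for (int i = 0; i < 16; i++)
      a[i] = b[i] * 2.0f;
  }

With 128-bit Advanced SIMD vectors the vector loop executes exactly 4 iterations and can be completely unrolled into straight-line code. A length-agnostic SVE loop over the same data executes a VL-dependent number of iterations, so it cannot be unrolled in the same way. The heuristic therefore only fires when the Advanced SIMD iteration count is known at compile time, the SVE iteration count is not, and the estimated size of the unrolled loop stays within the usual complete-unrolling limits.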
Like with previous patches, this one only becomes active if a CPU selects both of the new tuning parameters. It should therefore have a very low impact on other CPUs. gcc/ * config/aarch64/aarch64-tuning-flags.def (matched_vector_throughput): New tuning parameter. * config/aarch64/aarch64.c (neoversev1_tunings): Use it. (aarch64_estimated_sve_vq): New function. (aarch64_vector_costs::analyzed_vinfo): New member variable. (aarch64_vector_costs::is_loop): Likewise. (aarch64_vector_costs::unrolled_advsimd_niters): Likewise. (aarch64_vector_costs::unrolled_advsimd_stmts): Likewise. (aarch64_record_potential_advsimd_unrolling): New function. (aarch64_analyze_loop_vinfo, aarch64_analyze_bb_vinfo): Likewise. (aarch64_add_stmt_cost): Call aarch64_analyze_loop_vinfo or aarch64_analyze_bb_vinfo on the first use of a costs structure. Detect whether we're vectorizing a loop for SVE that might be completely unrolled if it used Advanced SIMD instead. (aarch64_adjust_body_cost_for_latency): New function. (aarch64_finish_cost): Call it. --- gcc/config/aarch64/aarch64-tuning-flags.def | 2 + gcc/config/aarch64/aarch64.c | 215 +++++++++++++++++++++++++++- 2 files changed, 210 insertions(+), 7 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def index a61fcf9..65b4c37 100644 --- a/gcc/config/aarch64/aarch64-tuning-flags.def +++ b/gcc/config/aarch64/aarch64-tuning-flags.def @@ -50,4 +50,6 @@ AARCH64_EXTRA_TUNING_OPTION ("cse_sve_vl_constants", CSE_SVE_VL_CONSTANTS) AARCH64_EXTRA_TUNING_OPTION ("use_new_vector_costs", USE_NEW_VECTOR_COSTS) +AARCH64_EXTRA_TUNING_OPTION ("matched_vector_throughput", MATCHED_VECTOR_THROUGHPUT) + #undef AARCH64_EXTRA_TUNING_OPTION diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 81683b7..63750e3 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -1732,7 +1732,8 @@ static const struct tune_params neoversev1_tunings = 0, /* max_case_values. */ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS - | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS), /* tune_flags. */ + | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS + | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */ &generic_prefetch_tune }; @@ -2539,6 +2540,14 @@ aarch64_bit_representation (rtx x) return x; } +/* Return an estimate for the number of quadwords in an SVE vector. This is + equivalent to the number of Advanced SIMD vectors in an SVE vector. */ +static unsigned int +aarch64_estimated_sve_vq () +{ + return estimated_poly_value (BITS_PER_SVE_VECTOR) / 128; +} + /* Return true if MODE is any of the Advanced SIMD structure modes. */ static bool aarch64_advsimd_struct_mode_p (machine_mode mode) @@ -14117,6 +14126,39 @@ struct aarch64_vector_costs /* The normal latency-based costs for each region (prologue, body and epilogue), indexed by vect_cost_model_location. */ unsigned int region[3] = {}; + + /* True if we have performed one-time initialization based on the vec_info. + + This variable exists because the vec_info is not passed to the + init_cost hook. We therefore have to defer initialization based on + it till later. */ + bool analyzed_vinfo = false; + + /* True if we're costing a vector loop, false if we're costing block-level + vectorization. */ + bool is_loop = false; + + /* - If VEC_FLAGS is zero then we're costing the original scalar code. 
+ - If VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced + SIMD code. + - If VEC_FLAGS & VEC_ANY_SVE is nonzero then we're costing SVE code. */ + unsigned int vec_flags = 0; + + /* On some CPUs, SVE and Advanced SIMD provide the same theoretical vector + throughput, such as 4x128 Advanced SIMD vs. 2x256 SVE. In those + situations, we try to predict whether an Advanced SIMD implementation + of the loop could be completely unrolled and become straight-line code. + If so, it is generally better to use the Advanced SIMD version rather + than length-agnostic SVE, since the SVE loop would execute an unknown + number of times and so could not be completely unrolled in the same way. + + If we're applying this heuristic, UNROLLED_ADVSIMD_NITERS is the + number of Advanced SIMD loop iterations that would be unrolled and + UNROLLED_ADVSIMD_STMTS estimates the total number of statements + in the unrolled loop. Both values are zero if we're not applying + the heuristic. */ + unsigned HOST_WIDE_INT unrolled_advsimd_niters = 0; + unsigned HOST_WIDE_INT unrolled_advsimd_stmts = 0; }; /* Implement TARGET_VECTORIZE_INIT_COST. */ @@ -14148,6 +14190,94 @@ aarch64_simd_vec_costs (tree vectype) return costs->advsimd; } +/* Decide whether to use the unrolling heuristic described above + aarch64_vector_costs::unrolled_advsimd_niters, updating that + field if so. LOOP_VINFO describes the loop that we're vectorizing + and COSTS are the costs that we're calculating for it. */ +static void +aarch64_record_potential_advsimd_unrolling (loop_vec_info loop_vinfo, + aarch64_vector_costs *costs) +{ + /* The heuristic only makes sense on targets that have the same + vector throughput for SVE and Advanced SIMD. */ + if (!(aarch64_tune_params.extra_tuning_flags + & AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT)) + return; + + /* We only want to apply the heuristic if LOOP_VINFO is being + vectorized for SVE. */ + if (!(costs->vec_flags & VEC_ANY_SVE)) + return; + + /* Check whether it is possible in principle to use Advanced SIMD + instead. */ + if (aarch64_autovec_preference == 2) + return; + + /* We don't want to apply the heuristic to outer loops, since it's + harder to track two levels of unrolling. */ + if (LOOP_VINFO_LOOP (loop_vinfo)->inner) + return; + + /* Only handle cases in which the number of Advanced SIMD iterations + would be known at compile time but the number of SVE iterations + would not. */ + if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) + || aarch64_sve_vg.is_constant ()) + return; + + /* Guess how many times the Advanced SIMD loop would iterate and make + sure that it is within the complete unrolling limit. Even if the + number of iterations is small enough, the number of statements might + not be, which is why we need to estimate the number of statements too. */ + unsigned int estimated_vq = aarch64_estimated_sve_vq (); + unsigned int advsimd_vf = CEIL (vect_vf_for_cost (loop_vinfo), estimated_vq); + unsigned HOST_WIDE_INT unrolled_advsimd_niters + = LOOP_VINFO_INT_NITERS (loop_vinfo) / advsimd_vf; + if (unrolled_advsimd_niters > (unsigned int) param_max_completely_peel_times) + return; + + /* Record that we're applying the heuristic and should try to estimate + the number of statements in the Advanced SIMD loop. */ + costs->unrolled_advsimd_niters = unrolled_advsimd_niters; +} + +/* Do one-time initialization of COSTS given that we're costing the loop + vectorization described by LOOP_VINFO. 
*/ +static void +aarch64_analyze_loop_vinfo (loop_vec_info loop_vinfo, + aarch64_vector_costs *costs) +{ + costs->is_loop = true; + + /* Detect whether we're costing the scalar code or the vector code. + This is a bit hacky: it would be better if the vectorizer told + us directly. + + If we're costing the vector code, record whether we're vectorizing + for Advanced SIMD or SVE. */ + if (costs == LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)) + costs->vec_flags = aarch64_classify_vector_mode (loop_vinfo->vector_mode); + else + costs->vec_flags = 0; + + /* Detect whether we're vectorizing for SVE and should + apply the unrolling heuristic described above + aarch64_vector_costs::unrolled_advsimd_niters. */ + aarch64_record_potential_advsimd_unrolling (loop_vinfo, costs); +} + +/* Do one-time initialization of COSTS given that we're costing the block + vectorization described by BB_VINFO. */ +static void +aarch64_analyze_bb_vinfo (bb_vec_info bb_vinfo, aarch64_vector_costs *costs) +{ + /* Unfortunately, there's no easy way of telling whether we're costing + the vector code or the scalar code, so just assume that we're costing + the vector code. */ + costs->vec_flags = aarch64_classify_vector_mode (bb_vinfo->vector_mode); +} + /* Implement targetm.vectorize.builtin_vectorization_cost. */ static int aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, @@ -14555,8 +14685,20 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count, if (flag_vect_cost_model) { - int stmt_cost = - aarch64_builtin_vectorization_cost (kind, vectype, misalign); + int stmt_cost + = aarch64_builtin_vectorization_cost (kind, vectype, misalign); + + /* Do one-time initialization based on the vinfo. */ + loop_vec_info loop_vinfo = dyn_cast (vinfo); + bb_vec_info bb_vinfo = dyn_cast (vinfo); + if (!costs->analyzed_vinfo && aarch64_use_new_vector_costs_p ()) + { + if (loop_vinfo) + aarch64_analyze_loop_vinfo (loop_vinfo, costs); + else + aarch64_analyze_bb_vinfo (bb_vinfo, costs); + costs->analyzed_vinfo = true; + } /* Try to get a more accurate cost by looking at STMT_INFO instead of just looking at KIND. */ @@ -14571,10 +14713,21 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count, vectype, stmt_cost); if (stmt_info && aarch64_use_new_vector_costs_p ()) - /* Account for any extra "embedded" costs that apply additively - to the base cost calculated above. */ - stmt_cost = aarch64_adjust_stmt_cost (kind, stmt_info, vectype, - stmt_cost); + { + /* Account for any extra "embedded" costs that apply additively + to the base cost calculated above. */ + stmt_cost = aarch64_adjust_stmt_cost (kind, stmt_info, vectype, + stmt_cost); + + /* If we're applying the SVE vs. Advanced SIMD unrolling heuristic, + estimate the number of statements in the unrolled Advanced SIMD + loop. For simplicitly, we assume that one iteration of the + Advanced SIMD loop would need the same number of statements + as one iteration of the SVE loop. */ + if (where == vect_body && costs->unrolled_advsimd_niters) + costs->unrolled_advsimd_stmts + += count * costs->unrolled_advsimd_niters; + } /* Statements in an inner loop relative to the loop being vectorized are weighted more heavily. The value here is @@ -14590,6 +14743,49 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count, return retval; } +/* BODY_COST is the cost of a vector loop body recorded in COSTS. + Adjust the cost as necessary and return the new cost. 
*/ +static unsigned int +aarch64_adjust_body_cost (aarch64_vector_costs *costs, unsigned int body_cost) +{ + unsigned int orig_body_cost = body_cost; + + if (costs->unrolled_advsimd_stmts) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, "Number of insns in" + " unrolled Advanced SIMD loop = %d\n", + costs->unrolled_advsimd_stmts); + + /* Apply the Advanced SIMD vs. SVE unrolling heuristic described above + aarch64_vector_costs::unrolled_advsimd_niters. + + The balance here is tricky. On the one hand, we can't be sure whether + the code is vectorizable with Advanced SIMD or not. However, even if + it isn't vectorizable with Advanced SIMD, there's a possibility that + the scalar code could also be unrolled. Some of the code might then + benefit from SLP, or from using LDP and STP. We therefore apply + the heuristic regardless of can_use_advsimd_p. */ + if (costs->unrolled_advsimd_stmts + && (costs->unrolled_advsimd_stmts + <= (unsigned int) param_max_completely_peeled_insns)) + { + unsigned int estimated_vq = aarch64_estimated_sve_vq (); + unsigned int min_cost = (orig_body_cost * estimated_vq) + 1; + if (body_cost < min_cost) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Increasing body cost to %d to account for" + " unrolling\n", min_cost); + body_cost = min_cost; + } + } + } + + return body_cost; +} + /* Implement TARGET_VECTORIZE_FINISH_COST. */ static void aarch64_finish_cost (void *data, unsigned *prologue_cost, @@ -14599,6 +14795,11 @@ aarch64_finish_cost (void *data, unsigned *prologue_cost, *prologue_cost = costs->region[vect_prologue]; *body_cost = costs->region[vect_body]; *epilogue_cost = costs->region[vect_epilogue]; + + if (costs->is_loop + && costs->vec_flags + && aarch64_use_new_vector_costs_p ()) + *body_cost = aarch64_adjust_body_cost (costs, *body_cost); } /* Implement TARGET_VECTORIZE_DESTROY_COST_DATA. */ -- cgit v1.1 From ed17ad5ea1cb302951f582ae8edd1afc9c014302 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Fri, 26 Mar 2021 16:08:35 +0000 Subject: aarch64: Detect scalar extending loads If the scalar code does an integer load followed by an integer extension, we've tended to cost that as two separate operations, even though the extension is probably going to be free in practice. This patch treats the extension as having zero cost, like we already do for extending SVE loads. Like with previous patches, this one only becomes active if a CPU selects use_new_vector_costs. It should therefore have a very low impact on other CPUs. gcc/ * config/aarch64/aarch64.c (aarch64_detect_scalar_stmt_subtype): New function. (aarch64_add_stmt_cost): Call it. --- gcc/config/aarch64/aarch64.c | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 63750e3..e2d92f0 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -14493,6 +14493,23 @@ aarch64_sve_in_loop_reduction_latency (vec_info *vinfo, } /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost + for STMT_INFO, which has cost kind KIND. If this is a scalar operation, + try to subdivide the target-independent categorization provided by KIND + to get a more accurate cost. */ +static unsigned int +aarch64_detect_scalar_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind, + stmt_vec_info stmt_info, + unsigned int stmt_cost) +{ + /* Detect an extension of a loaded value. 
In general, we'll be able to fuse + the extension with the load. */ + if (kind == scalar_stmt && aarch64_extending_load_p (vinfo, stmt_info)) + return 0; + + return stmt_cost; +} + +/* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost for the vectorized form of STMT_INFO, which has cost kind KIND and which when vectorized would operate on vector type VECTYPE. Try to subdivide the target-independent categorization provided by KIND to get a more @@ -14702,10 +14719,16 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count, /* Try to get a more accurate cost by looking at STMT_INFO instead of just looking at KIND. */ - if (stmt_info && vectype && aarch64_use_new_vector_costs_p ()) - stmt_cost = aarch64_detect_vector_stmt_subtype (vinfo, kind, - stmt_info, vectype, - where, stmt_cost); + if (stmt_info && aarch64_use_new_vector_costs_p ()) + { + stmt_cost = aarch64_detect_scalar_stmt_subtype + (vinfo, kind, stmt_info, stmt_cost); + + if (vectype && costs->vec_flags) + stmt_cost = aarch64_detect_vector_stmt_subtype (vinfo, kind, + stmt_info, vectype, + where, stmt_cost); + } /* Do any SVE-specific adjustments to the cost. */ if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype))) -- cgit v1.1 From 99f94ae5018e915d0c1db1b6d4110d68bc4d242e Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Fri, 26 Mar 2021 16:08:36 +0000 Subject: aarch64: Cost comparisons embedded in COND_EXPRs So far the costing of COND_EXPRs hasn't distinguished between cases in which the condition is calculated separately or is built into the COND_EXPR itself. This patch adds the cost of any embedded comparison. Like with the previous patches, this one only becomes active if a CPU selects use_new_vector_costs. It should therefore have a very low impact on other CPUs. gcc/ * config/aarch64/aarch64.c (aarch64_embedded_comparison_type): New function. (aarch64_adjust_stmt_cost): Add the costs of embedded scalar and vector comparisons. --- gcc/config/aarch64/aarch64.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index e2d92f0..e97e71b 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -14392,6 +14392,21 @@ aarch64_ld234_st234_vectors (vect_cost_for_stmt kind, stmt_vec_info stmt_info) return 0; } +/* If STMT_INFO is a COND_EXPR that includes an embedded comparison, return the + scalar type of the values being compared. Return null otherwise. */ +static tree +aarch64_embedded_comparison_type (stmt_vec_info stmt_info) +{ + if (auto *assign = dyn_cast (stmt_info->stmt)) + if (gimple_assign_rhs_code (assign) == COND_EXPR) + { + tree cond = gimple_assign_rhs1 (assign); + if (COMPARISON_CLASS_P (cond)) + return TREE_TYPE (TREE_OPERAND (cond, 0)); + } + return NULL_TREE; +} + /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD vectors would produce a series of LDP or STP operations. KIND is the kind of statement that STMT_INFO represents. 
*/ @@ -14685,8 +14700,26 @@ aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info, stmt_cost += simd_costs->ld4_st4_permute_cost; break; } + + if (kind == vector_stmt || kind == vec_to_scalar) + if (tree cmp_type = aarch64_embedded_comparison_type (stmt_info)) + { + if (FLOAT_TYPE_P (cmp_type)) + stmt_cost += simd_costs->fp_stmt_cost; + else + stmt_cost += simd_costs->int_stmt_cost; + } } + if (kind == scalar_stmt) + if (tree cmp_type = aarch64_embedded_comparison_type (stmt_info)) + { + if (FLOAT_TYPE_P (cmp_type)) + stmt_cost += aarch64_tune_params.vec_costs->scalar_fp_stmt_cost; + else + stmt_cost += aarch64_tune_params.vec_costs->scalar_int_stmt_cost; + } + return stmt_cost; } -- cgit v1.1 From e4180ab2fea0d3e8010f23b5e73095ac13cedafa Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Fri, 26 Mar 2021 16:08:37 +0000 Subject: aarch64: Ignore inductions when costing vector code In practice it seems to be better not to cost a vector induction. The scalar code generally needs the same induction but doesn't cost it, making an apples-for-apples comparison harder. Most inductions also have a low latency and their cost usually gets hidden by other operations. Like with the previous patches, this one only becomes active if a CPU selects use_new_vector_costs. It should therefore have a very low impact on other CPUs. gcc/ * config/aarch64/aarch64.c (aarch64_detect_vector_stmt_subtype): Assume a zero cost for induction phis. --- gcc/config/aarch64/aarch64.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index e97e71b..6d18d82 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -14541,6 +14541,12 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind, if (aarch64_sve_mode_p (TYPE_MODE (vectype))) sve_costs = aarch64_tune_params.vec_costs->sve; + /* It's generally better to avoid costing inductions, since the induction + will usually be hidden by other operations. This is particularly true + for things like COND_REDUCTIONS. */ + if (is_a (stmt_info->stmt)) + return 0; + /* Detect cases in which vec_to_scalar is describing the extraction of a vector element in preparation for a scalar store. The store itself is costed separately. */ -- cgit v1.1 From 1205a8cadb6bd41cdf5b13d7aca8fb44332002e5 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Fri, 26 Mar 2021 16:08:38 +0000 Subject: aarch64: Take issue rate into account for vector loop costs When SVE is enabled, GCC needs to do a three-way comparison between scalar, Advanced SIMD and SVE code. The normal costs tend to be latency-based, which is well-suited to SLP. However, comparing sums of latency costs means that we effectively treat the code as executing sequentially. This can hide the effect of pipeline bubbles or resource contention that in practice are quite important for loop vectorisation. This is particularly true for loops that involve reductions. This patch therefore tries to estimate how quickly each piece of code could issue, using a very (very) simplistic model. It then uses this to adjust the loop vector costs up or down as appropriate. Part of the Advanced SIMD vs. SVE adjustment is opt-in and is not enabled by default even for use_new_vector_costs. Like with the previous patches, this one only becomes active if a CPU selects use_new_vector_costs. It should therefore have a very low impact on other CPUs. 
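To make the model concrete: for each candidate implementation the pass counts the loads, stores, general FP/SIMD operations and loop-carried reduction latency per iteration, and the minimum number of cycles per iteration is then roughly the largest of those counts divided by the corresponding issue width. The sketch below is only meant to illustrate the idea; it uses the structures added by this patch and GCC's MAX/CEIL macros, but it is not the exact implementation (in particular it ignores the SVE predicate operations, which the real code also takes into account):

  /* Illustrative sketch: a lower bound on the cycles per iteration implied
     by the operation counts in OPS and the issue widths in ISSUE_INFO.  */
  static unsigned int
  estimated_min_cycles_per_iter (const aarch64_vec_op_count *ops,
                                 const aarch64_base_vec_issue_info *issue_info)
  {
    unsigned int cycles = MAX (ops->reduction_latency, 1);
    cycles = MAX (cycles, CEIL (ops->stores, issue_info->stores_per_cycle));
    cycles = MAX (cycles, CEIL (ops->loads + ops->stores,
                                issue_info->loads_stores_per_cycle));
    cycles = MAX (cycles, CEIL (ops->general_ops,
                                issue_info->general_ops_per_cycle));
    return cycles;
  }

For example, a loop body with 4 general operations, 2 loads and 1 store on a CPU that can issue 2 general operations and 2 loads/stores per cycle is limited to at best one iteration every 2 cycles (4 general ops / 2 per cycle), even if the sum of the operation latencies is much higher. Comparing these per-iteration estimates for the scalar, Advanced SIMD and SVE versions of the loop is what drives the body-cost adjustment.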
The code also mostly ignores CPUs that have no issue information, even if use_new_vector_costs is enabled for some reason. gcc/ * config/aarch64/aarch64.opt (-param=aarch64-loop-vect-issue-rate-niters=): New parameter. * doc/invoke.texi: Document it. * config/aarch64/aarch64-protos.h (aarch64_base_vec_issue_info) (aarch64_scalar_vec_issue_info, aarch64_simd_vec_issue_info) (aarch64_advsimd_vec_issue_info, aarch64_sve_vec_issue_info) (aarch64_vec_issue_info): New structures. (cpu_vector_cost): Write comments above the variables rather than to the side. (cpu_vector_cost::issue_info): New member variable. * config/aarch64/aarch64.c: Include gimple-pretty-print.h and tree-ssa-loop-niter.h. (generic_vector_cost, a64fx_vector_cost, qdf24xx_vector_cost) (thunderx_vector_cost, tsv110_vector_cost, cortexa57_vector_cost) (exynosm1_vector_cost, xgene1_vector_cost, thunderx2t99_vector_cost) (thunderx3t110_vector_cost): Initialize issue_info to null. (neoversev1_scalar_issue_info, neoversev1_advsimd_issue_info) (neoversev1_sve_issue_info, neoversev1_vec_issue_info): New structures. (neoversev1_vector_cost): Use them. (aarch64_vec_op_count, aarch64_sve_op_count): New structures. (aarch64_vector_costs::saw_sve_only_op): New member variable. (aarch64_vector_costs::num_vector_iterations): Likewise. (aarch64_vector_costs::scalar_ops): Likewise. (aarch64_vector_costs::advsimd_ops): Likewise. (aarch64_vector_costs::sve_ops): Likewise. (aarch64_vector_costs::seen_loads): Likewise. (aarch64_simd_vec_costs_for_flags): New function. (aarch64_analyze_loop_vinfo): Initialize num_vector_iterations. Count the number of predicate operations required by SVE WHILE instructions. (aarch64_comparison_type, aarch64_multiply_add_p): New functions. (aarch64_sve_only_stmt_p, aarch64_in_loop_reduction_latency): Likewise. (aarch64_count_ops): Likewise. (aarch64_add_stmt_cost): Record whether see an SVE operation that cannot currently be implementing using Advanced SIMD. Record issue information about the scalar, Advanced SIMD and (where relevant) SVE versions of a loop. (aarch64_vec_op_count::dump): New function. (aarch64_sve_op_count::dump): Likewise. (aarch64_estimate_min_cycles_per_iter): Likewise. (aarch64_adjust_body_cost): If issue information is available, try to compare the issue rates of the various loop implementations and increase or decrease the vector body cost accordingly. --- gcc/config/aarch64/aarch64-protos.h | 178 +++++++- gcc/config/aarch64/aarch64.c | 798 +++++++++++++++++++++++++++++++++++- gcc/config/aarch64/aarch64.opt | 3 + 3 files changed, 958 insertions(+), 21 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index 2ffa96e..ca1ed9e 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -281,19 +281,177 @@ struct sve_vec_cost : simd_vec_cost const int scatter_store_elt_cost; }; +/* Base information about how the CPU issues code, containing + information that is relevant to scalar, Advanced SIMD and SVE + operations. + + The structure uses the general term "operation" to refer to + whichever subdivision of an instruction makes sense for the CPU. + These operations would typically be micro operations or macro + operations. + + Note that this structure and the ones derived from it are only + as general as they need to be for the CPUs that currently use them. + They will probably need to be extended or refined as more CPUs are + added. 
*/ +struct aarch64_base_vec_issue_info +{ + /* How many loads and stores can be issued per cycle. */ + const unsigned int loads_stores_per_cycle; + + /* How many stores can be issued per cycle. */ + const unsigned int stores_per_cycle; + + /* How many integer or FP/SIMD operations can be issued per cycle. + + Currently we don't try to distinguish the two. For vector code, + we only really track FP/SIMD operations during vector costing; + we don't for example try to cost arithmetic operations like + address calculations, which are only decided later during ivopts. + + For scalar code, we effectively assume that code operates entirely + on integers or entirely on floating-point values. Again, we don't + try to take address calculations into account. + + This is not very precise, but it's only meant to be a heuristic. + We could certainly try to do better in future if there's an example + of something that would benefit. */ + const unsigned int general_ops_per_cycle; + + /* How many FP/SIMD operations to count for a floating-point or + vector load operation. + + When constructing an Advanced SIMD vector from elements that have + been loaded from memory, these values apply to each individual load. + When using an SVE gather load, the values apply to each element of + the gather. */ + const unsigned int fp_simd_load_general_ops; + + /* How many FP/SIMD operations to count for a floating-point or + vector store operation. + + When storing individual elements of an Advanced SIMD vector out to + memory, these values apply to each individual store. When using an + SVE scatter store, these values apply to each element of the scatter. */ + const unsigned int fp_simd_store_general_ops; +}; + +using aarch64_scalar_vec_issue_info = aarch64_base_vec_issue_info; + +/* Base information about the issue stage for vector operations. + This structure contains information that is relevant to both + Advanced SIMD and SVE. */ +struct aarch64_simd_vec_issue_info : aarch64_base_vec_issue_info +{ + constexpr aarch64_simd_vec_issue_info (aarch64_base_vec_issue_info base, + unsigned int ld2_st2_general_ops, + unsigned int ld3_st3_general_ops, + unsigned int ld4_st4_general_ops) + : aarch64_base_vec_issue_info (base), + ld2_st2_general_ops (ld2_st2_general_ops), + ld3_st3_general_ops (ld3_st3_general_ops), + ld4_st4_general_ops (ld4_st4_general_ops) + {} + + /* How many FP/SIMD operations to count for each vector loaded or + stored by an LD[234] or ST[234] operation, in addition to the + base costs given in the parent class. For example, the full + number of operations for an LD3 would be: + + load ops: 3 + general ops: 3 * (fp_simd_load_general_ops + ld3_st3_general_ops). */ + const unsigned int ld2_st2_general_ops; + const unsigned int ld3_st3_general_ops; + const unsigned int ld4_st4_general_ops; +}; + +using aarch64_advsimd_vec_issue_info = aarch64_simd_vec_issue_info; + +/* Information about the issue stage for SVE. The main thing this adds + is a concept of "predicate operations". 
*/ +struct aarch64_sve_vec_issue_info : aarch64_simd_vec_issue_info +{ + constexpr aarch64_sve_vec_issue_info + (aarch64_simd_vec_issue_info base, + unsigned int pred_ops_per_cycle, + unsigned int while_pred_ops, + unsigned int int_cmp_pred_ops, + unsigned int fp_cmp_pred_ops, + unsigned int gather_scatter_pair_general_ops, + unsigned int gather_scatter_pair_pred_ops) + : aarch64_simd_vec_issue_info (base), + pred_ops_per_cycle (pred_ops_per_cycle), + while_pred_ops (while_pred_ops), + int_cmp_pred_ops (int_cmp_pred_ops), + fp_cmp_pred_ops (fp_cmp_pred_ops), + gather_scatter_pair_general_ops (gather_scatter_pair_general_ops), + gather_scatter_pair_pred_ops (gather_scatter_pair_pred_ops) + {} + + /* How many predicate operations can be issued per cycle. */ + const unsigned int pred_ops_per_cycle; + + /* How many predicate operations are generated by a WHILExx + instruction. */ + const unsigned int while_pred_ops; + + /* How many predicate operations are generated by an integer + comparison instruction. */ + const unsigned int int_cmp_pred_ops; + + /* How many predicate operations are generated by a floating-point + comparison instruction. */ + const unsigned int fp_cmp_pred_ops; + + /* How many general and predicate operations are generated by each pair + of elements in a gather load or scatter store. These values apply + on top of the per-element counts recorded in fp_simd_load_general_ops + and fp_simd_store_general_ops. + + The reason for using pairs is that that is the largest possible + granule size for 128-bit SVE, which can load and store 2 64-bit + elements or 4 32-bit elements. */ + const unsigned int gather_scatter_pair_general_ops; + const unsigned int gather_scatter_pair_pred_ops; +}; + +/* Information related to instruction issue for a particular CPU. */ +struct aarch64_vec_issue_info +{ + const aarch64_base_vec_issue_info *const scalar; + const aarch64_simd_vec_issue_info *const advsimd; + const aarch64_sve_vec_issue_info *const sve; +}; + /* Cost for vector insn classes. */ struct cpu_vector_cost { - const int scalar_int_stmt_cost; /* Cost of any int scalar operation, - excluding load and store. */ - const int scalar_fp_stmt_cost; /* Cost of any fp scalar operation, - excluding load and store. */ - const int scalar_load_cost; /* Cost of scalar load. */ - const int scalar_store_cost; /* Cost of scalar store. */ - const int cond_taken_branch_cost; /* Cost of taken branch. */ - const int cond_not_taken_branch_cost; /* Cost of not taken branch. */ - const advsimd_vec_cost *advsimd; /* Cost of Advanced SIMD operations. */ - const sve_vec_cost *sve; /* Cost of SVE operations. */ + /* Cost of any integer scalar operation, excluding load and store. */ + const int scalar_int_stmt_cost; + + /* Cost of any fp scalar operation, excluding load and store. */ + const int scalar_fp_stmt_cost; + + /* Cost of a scalar load. */ + const int scalar_load_cost; + + /* Cost of a scalar store. */ + const int scalar_store_cost; + + /* Cost of a taken branch. */ + const int cond_taken_branch_cost; + + /* Cost of a not-taken branch. */ + const int cond_not_taken_branch_cost; + + /* Cost of an Advanced SIMD operations. */ + const advsimd_vec_cost *advsimd; + + /* Cost of an SVE operations, or null if SVE is not implemented. */ + const sve_vec_cost *sve; + + /* Issue information, or null if none is provided. */ + const aarch64_vec_issue_info *const issue_info; }; /* Branch costs. 
*/ diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 6d18d82..6d961be 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -74,6 +74,8 @@ #include "intl.h" #include "expmed.h" #include "function-abi.h" +#include "gimple-pretty-print.h" +#include "tree-ssa-loop-niter.h" /* This file should be included last. */ #include "target-def.h" @@ -652,7 +654,8 @@ static const struct cpu_vector_cost generic_vector_cost = 3, /* cond_taken_branch_cost */ 1, /* cond_not_taken_branch_cost */ &generic_advsimd_vector_cost, /* advsimd */ - &generic_sve_vector_cost /* sve */ + &generic_sve_vector_cost, /* sve */ + nullptr /* issue_info */ }; static const advsimd_vec_cost a64fx_advsimd_vector_cost = @@ -719,7 +722,8 @@ static const struct cpu_vector_cost a64fx_vector_cost = 3, /* cond_taken_branch_cost */ 1, /* cond_not_taken_branch_cost */ &a64fx_advsimd_vector_cost, /* advsimd */ - &a64fx_sve_vector_cost /* sve */ + &a64fx_sve_vector_cost, /* sve */ + nullptr /* issue_info */ }; static const advsimd_vec_cost qdf24xx_advsimd_vector_cost = @@ -756,7 +760,8 @@ static const struct cpu_vector_cost qdf24xx_vector_cost = 3, /* cond_taken_branch_cost */ 1, /* cond_not_taken_branch_cost */ &qdf24xx_advsimd_vector_cost, /* advsimd */ - NULL /* sve */ + nullptr, /* sve */ + nullptr /* issue_info */ }; @@ -794,7 +799,8 @@ static const struct cpu_vector_cost thunderx_vector_cost = 3, /* cond_taken_branch_cost */ 3, /* cond_not_taken_branch_cost */ &thunderx_advsimd_vector_cost, /* advsimd */ - NULL /* sve */ + nullptr, /* sve */ + nullptr /* issue_info */ }; static const advsimd_vec_cost tsv110_advsimd_vector_cost = @@ -830,7 +836,8 @@ static const struct cpu_vector_cost tsv110_vector_cost = 1, /* cond_taken_branch_cost */ 1, /* cond_not_taken_branch_cost */ &tsv110_advsimd_vector_cost, /* advsimd */ - NULL, /* sve */ + nullptr, /* sve */ + nullptr /* issue_info */ }; static const advsimd_vec_cost cortexa57_advsimd_vector_cost = @@ -867,7 +874,8 @@ static const struct cpu_vector_cost cortexa57_vector_cost = 1, /* cond_taken_branch_cost */ 1, /* cond_not_taken_branch_cost */ &cortexa57_advsimd_vector_cost, /* advsimd */ - NULL /* sve */ + nullptr, /* sve */ + nullptr /* issue_info */ }; static const advsimd_vec_cost exynosm1_advsimd_vector_cost = @@ -903,7 +911,8 @@ static const struct cpu_vector_cost exynosm1_vector_cost = 1, /* cond_taken_branch_cost */ 1, /* cond_not_taken_branch_cost */ &exynosm1_advsimd_vector_cost, /* advsimd */ - NULL /* sve */ + nullptr, /* sve */ + nullptr /* issue_info */ }; static const advsimd_vec_cost xgene1_advsimd_vector_cost = @@ -940,7 +949,8 @@ static const struct cpu_vector_cost xgene1_vector_cost = 2, /* cond_taken_branch_cost */ 1, /* cond_not_taken_branch_cost */ &xgene1_advsimd_vector_cost, /* advsimd */ - NULL /* sve */ + nullptr, /* sve */ + nullptr /* issue_info */ }; static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost = @@ -977,7 +987,8 @@ static const struct cpu_vector_cost thunderx2t99_vector_cost = 2, /* cond_taken_branch_cost */ 1, /* cond_not_taken_branch_cost */ &thunderx2t99_advsimd_vector_cost, /* advsimd */ - NULL /* sve */ + nullptr, /* sve */ + nullptr /* issue_info */ }; static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost = @@ -1013,7 +1024,8 @@ static const struct cpu_vector_cost thunderx3t110_vector_cost = 2, /* cond_taken_branch_cost */ 1, /* cond_not_taken_branch_cost */ &thunderx3t110_advsimd_vector_cost, /* advsimd */ - NULL /* sve */ + nullptr, /* sve */ + nullptr /* issue_info */ 
}; @@ -1696,6 +1708,58 @@ static const sve_vec_cost neoversev1_sve_vector_cost = 3 /* scatter_store_elt_cost */ }; +static const aarch64_scalar_vec_issue_info neoversev1_scalar_issue_info = +{ + 3, /* loads_stores_per_cycle */ + 2, /* stores_per_cycle */ + 4, /* general_ops_per_cycle */ + 0, /* fp_simd_load_general_ops */ + 1 /* fp_simd_store_general_ops */ +}; + +static const aarch64_advsimd_vec_issue_info neoversev1_advsimd_issue_info = +{ + { + 3, /* loads_stores_per_cycle */ + 2, /* stores_per_cycle */ + 4, /* general_ops_per_cycle */ + 0, /* fp_simd_load_general_ops */ + 1 /* fp_simd_store_general_ops */ + }, + 2, /* ld2_st2_general_ops */ + 2, /* ld3_st3_general_ops */ + 3 /* ld4_st4_general_ops */ +}; + +static const aarch64_sve_vec_issue_info neoversev1_sve_issue_info = +{ + { + { + 2, /* loads_per_cycle */ + 2, /* stores_per_cycle */ + 2, /* general_ops_per_cycle */ + 0, /* fp_simd_load_general_ops */ + 1 /* fp_simd_store_general_ops */ + }, + 2, /* ld2_st2_general_ops */ + 2, /* ld3_st3_general_ops */ + 3 /* ld4_st4_general_ops */ + }, + 1, /* pred_ops_per_cycle */ + 2, /* while_pred_ops */ + 2, /* int_cmp_pred_ops */ + 1, /* fp_cmp_pred_ops */ + 1, /* gather_scatter_pair_general_ops */ + 1 /* gather_scatter_pair_pred_ops */ +}; + +static const aarch64_vec_issue_info neoversev1_vec_issue_info = +{ + &neoversev1_scalar_issue_info, + &neoversev1_advsimd_issue_info, + &neoversev1_sve_issue_info +}; + /* Neoverse V1 costs for vector insn classes. */ static const struct cpu_vector_cost neoversev1_vector_cost = { @@ -1706,7 +1770,8 @@ static const struct cpu_vector_cost neoversev1_vector_cost = 1, /* cond_taken_branch_cost */ 1, /* cond_not_taken_branch_cost */ &neoversev1_advsimd_vector_cost, /* advsimd */ - &neoversev1_sve_vector_cost /* sve */ + &neoversev1_sve_vector_cost, /* sve */ + &neoversev1_vec_issue_info /* issue_info */ }; static const struct tune_params neoversev1_tunings = @@ -14120,6 +14185,38 @@ aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn, /* Vectorizer cost model target hooks. */ +/* Information about how the CPU would issue the scalar, Advanced SIMD + or SVE version of a vector loop, using the scheme defined by the + aarch64_base_vec_issue_info hierarchy of structures. */ +struct aarch64_vec_op_count +{ + void dump () const; + + /* The number of individual "general" operations. See the comments + in aarch64_base_vec_issue_info for details. */ + unsigned int general_ops = 0; + + /* The number of load and store operations, under the same scheme + as above. */ + unsigned int loads = 0; + unsigned int stores = 0; + + /* The minimum number of cycles needed to execute all loop-carried + operations, which in the vector code become associated with + reductions. */ + unsigned int reduction_latency = 0; +}; + +/* Extends aarch64_vec_op_count with SVE-specific information. */ +struct aarch64_sve_op_count : aarch64_vec_op_count +{ + void dump () const; + + /* The number of individual predicate operations. See the comments + in aarch64_sve_vec_issue_info for details. */ + unsigned int pred_ops = 0; +}; + /* Information about vector code that we're in the process of costing. */ struct aarch64_vector_costs { @@ -14138,6 +14235,10 @@ struct aarch64_vector_costs vectorization. */ bool is_loop = false; + /* True if we've seen an SVE operation that we cannot currently vectorize + using Advanced SIMD. */ + bool saw_sve_only_op = false; + /* - If VEC_FLAGS is zero then we're costing the original scalar code. 
- If VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced SIMD code. @@ -14159,6 +14260,32 @@ struct aarch64_vector_costs the heuristic. */ unsigned HOST_WIDE_INT unrolled_advsimd_niters = 0; unsigned HOST_WIDE_INT unrolled_advsimd_stmts = 0; + + /* If we're vectorizing a loop that executes a constant number of times, + this variable gives the number of times that the vector loop would + iterate, otherwise it is zero. */ + uint64_t num_vector_iterations = 0; + + /* Used only when vectorizing loops. Estimates the number and kind of scalar + operations that would be needed to perform the same work as one iteration + of the vector loop. */ + aarch64_vec_op_count scalar_ops; + + /* Used only when vectorizing loops. If VEC_FLAGS & VEC_ADVSIMD, + this structure estimates the number and kind of operations that the + vector loop would contain. If VEC_FLAGS & VEC_SVE, the structure + estimates what the equivalent Advanced SIMD-only code would need in + order to perform the same work as one iteration of the SVE loop. */ + aarch64_vec_op_count advsimd_ops; + + /* Used only when vectorizing loops with SVE. It estimates the number and + kind of operations that the SVE loop would contain. */ + aarch64_sve_op_count sve_ops; + + /* Used to detect cases in which we end up costing the same load twice, + once to account for results that are actually used and once to account + for unused results. */ + hash_map, unsigned int> seen_loads; }; /* Implement TARGET_VECTORIZE_INIT_COST. */ @@ -14190,6 +14317,16 @@ aarch64_simd_vec_costs (tree vectype) return costs->advsimd; } +/* Return the appropriate SIMD costs for vectors with VEC_* flags FLAGS. */ +static const simd_vec_cost * +aarch64_simd_vec_costs_for_flags (unsigned int flags) +{ + const cpu_vector_cost *costs = aarch64_tune_params.vec_costs; + if ((flags & VEC_ANY_SVE) && costs->sve) + return costs->sve; + return costs->advsimd; +} + /* Decide whether to use the unrolling heuristic described above aarch64_vector_costs::unrolled_advsimd_niters, updating that field if so. LOOP_VINFO describes the loop that we're vectorizing @@ -14250,6 +14387,19 @@ aarch64_analyze_loop_vinfo (loop_vec_info loop_vinfo, { costs->is_loop = true; + /* Record the number of times that the vector loop would execute, + if known. */ + class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + auto scalar_niters = max_stmt_executions_int (loop); + if (scalar_niters >= 0) + { + unsigned int vf = vect_vf_for_cost (loop_vinfo); + if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ()) + costs->num_vector_iterations = scalar_niters / vf; + else + costs->num_vector_iterations = CEIL (scalar_niters, vf); + } + /* Detect whether we're costing the scalar code or the vector code. This is a bit hacky: it would be better if the vectorizer told us directly. @@ -14265,6 +14415,20 @@ aarch64_analyze_loop_vinfo (loop_vec_info loop_vinfo, apply the unrolling heuristic described above aarch64_vector_costs::unrolled_advsimd_niters. */ aarch64_record_potential_advsimd_unrolling (loop_vinfo, costs); + + /* Record the issue information for any SVE WHILE instructions that the + loop needs. 
*/ + auto *issue_info = aarch64_tune_params.vec_costs->issue_info; + if (issue_info->sve && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()) + { + unsigned int num_masks = 0; + rgroup_controls *rgm; + unsigned int num_vectors_m1; + FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm) + if (rgm->type) + num_masks += num_vectors_m1 + 1; + costs->sve_ops.pred_ops += num_masks * issue_info->sve->while_pred_ops; + } } /* Do one-time initialization of COSTS given that we're costing the block @@ -14407,6 +14571,17 @@ aarch64_embedded_comparison_type (stmt_vec_info stmt_info) return NULL_TREE; } +/* If STMT_INFO is a comparison or contains an embedded comparison, return the + scalar type of the values being compared. Return null otherwise. */ +static tree +aarch64_comparison_type (stmt_vec_info stmt_info) +{ + if (auto *assign = dyn_cast (stmt_info->stmt)) + if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison) + return TREE_TYPE (gimple_assign_rhs1 (assign)); + return aarch64_embedded_comparison_type (stmt_info); +} + /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD vectors would produce a series of LDP or STP operations. KIND is the kind of statement that STMT_INFO represents. */ @@ -14470,6 +14645,79 @@ aarch64_integer_truncation_p (stmt_vec_info stmt_info) && TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type)); } +/* Return true if STMT_INFO is the second part of a two-statement multiply-add + or multiply-subtract sequence that might be suitable for fusing into a + single instruction. */ +static bool +aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info) +{ + gassign *assign = dyn_cast (stmt_info->stmt); + if (!assign) + return false; + tree_code code = gimple_assign_rhs_code (assign); + if (code != PLUS_EXPR && code != MINUS_EXPR) + return false; + + if (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign)) + || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign))) + return false; + + for (int i = 1; i < 3; ++i) + { + tree rhs = gimple_op (assign, i); + /* ??? Should we try to check for a single use as well? */ + if (TREE_CODE (rhs) != SSA_NAME) + continue; + + stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs); + if (!def_stmt_info + || STMT_VINFO_DEF_TYPE (def_stmt_info) != vect_internal_def) + continue; + gassign *rhs_assign = dyn_cast (def_stmt_info->stmt); + if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR) + continue; + + return true; + } + return false; +} + +/* Return true if the vectorized form of STMT_INFO is something that is only + possible when using SVE instead of Advanced SIMD. VECTYPE is the type of + the vector that STMT_INFO is operating on. */ +static bool +aarch64_sve_only_stmt_p (stmt_vec_info stmt_info, tree vectype) +{ + if (!aarch64_sve_mode_p (TYPE_MODE (vectype))) + return false; + + if (STMT_VINFO_DATA_REF (stmt_info)) + { + /* Check for true gathers and scatters (rather than just strided accesses + that we've chosen to implement using gathers and scatters). Although + in principle we could use elementwise accesses for Advanced SIMD, + the vectorizer doesn't yet support that. */ + if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) + return true; + + /* Check for masked loads and stores. */ + if (auto *call = dyn_cast (stmt_info->stmt)) + if (gimple_call_internal_p (call) + && internal_fn_mask_index (gimple_call_internal_fn (call)) >= 0) + return true; + } + + /* Check for 64-bit integer multiplications. 
*/ + auto *assign = dyn_cast (stmt_info->stmt); + if (assign + && gimple_assign_rhs_code (assign) == MULT_EXPR + && GET_MODE_INNER (TYPE_MODE (vectype)) == DImode + && !integer_pow2p (gimple_assign_rhs2 (assign))) + return true; + + return false; +} + /* We are considering implementing STMT_INFO using SVE vector type VECTYPE. If STMT_INFO is an in-loop reduction that SVE supports directly, return its latency in cycles, otherwise return zero. SVE_COSTS specifies the @@ -14507,6 +14755,59 @@ aarch64_sve_in_loop_reduction_latency (vec_info *vinfo, return 0; } +/* STMT_INFO describes a loop-carried operation in the original scalar code + that we are considering implementing as a reduction. Return one of the + following values, depending on VEC_FLAGS: + + - If VEC_FLAGS is zero, return the loop carry latency of the original + scalar operation. + + - If VEC_FLAGS & VEC_ADVSIMD, return the loop carry latency of the + the Advanced SIMD implementation. + + - If VEC_FLAGS & VEC_ANY_SVE, return the loop carry latency of the + SVE implementation. + + VECTYPE is the type of vector that the vectorizer is considering using + for STMT_INFO, which might be different from the type of vector described + by VEC_FLAGS. */ +static unsigned int +aarch64_in_loop_reduction_latency (vec_info *vinfo, stmt_vec_info stmt_info, + tree vectype, unsigned int vec_flags) +{ + const cpu_vector_cost *vec_costs = aarch64_tune_params.vec_costs; + const sve_vec_cost *sve_costs = nullptr; + if (vec_flags & VEC_ANY_SVE) + sve_costs = aarch64_tune_params.vec_costs->sve; + + /* If the caller is asking for the SVE latency, check for forms of reduction + that only SVE can handle directly. */ + if (sve_costs) + { + unsigned int latency + = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, vectype, + sve_costs); + if (latency) + return latency; + } + + /* Handle scalar costs. */ + if (vec_flags == 0) + { + if (FLOAT_TYPE_P (vectype)) + return vec_costs->scalar_fp_stmt_cost; + return vec_costs->scalar_int_stmt_cost; + } + + /* Otherwise, the loop body just contains normal integer or FP operations, + with a vector reduction outside the loop. */ + const simd_vec_cost *simd_costs + = aarch64_simd_vec_costs_for_flags (vec_flags); + if (FLOAT_TYPE_P (vectype)) + return simd_costs->fp_stmt_cost; + return simd_costs->int_stmt_cost; +} + /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost for STMT_INFO, which has cost kind KIND. If this is a scalar operation, try to subdivide the target-independent categorization provided by KIND @@ -14729,6 +15030,203 @@ aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info, return stmt_cost; } +/* VINFO, COSTS, COUNT, KIND, STMT_INFO and VECTYPE are the same as for + TARGET_VECTORIZE_ADD_STMT_COST and they describe an operation in the + body of a vector loop. Record issue information relating to the vector + operation in OPS, where OPS is one of COSTS->scalar_ops, COSTS->advsimd_ops + or COSTS->sve_ops; see the comments above those variables for details. + In addition: + + - VEC_FLAGS is zero if OPS is COSTS->scalar_ops. + + - VEC_FLAGS & VEC_ADVSIMD is nonzero if OPS is COSTS->advsimd_ops. + + - VEC_FLAGS & VEC_ANY_SVE is nonzero if OPS is COSTS->sve_ops. + + ISSUE_INFO provides the scalar, Advanced SIMD or SVE issue information + associated with OPS and VEC_FLAGS. FACTOR says how many iterations of + the loop described by VEC_FLAGS would be needed to match one iteration + of the vector loop in VINFO. 
*/ +static void +aarch64_count_ops (class vec_info *vinfo, aarch64_vector_costs *costs, + unsigned int count, enum vect_cost_for_stmt kind, + _stmt_vec_info *stmt_info, tree vectype, + unsigned int vec_flags, aarch64_vec_op_count *ops, + const aarch64_base_vec_issue_info *issue_info, + unsigned int factor) +{ + if (!issue_info) + return; + + const aarch64_simd_vec_issue_info *simd_issue = nullptr; + if (vec_flags) + simd_issue = static_cast (issue_info); + + const aarch64_sve_vec_issue_info *sve_issue = nullptr; + if (vec_flags & VEC_ANY_SVE) + sve_issue = static_cast (issue_info); + + /* Calculate the minimum cycles per iteration imposed by a reduction + operation. */ + if ((kind == vector_stmt || kind == vec_to_scalar) + && aarch64_is_reduction (stmt_info)) + { + unsigned int base + = aarch64_in_loop_reduction_latency (vinfo, stmt_info, vectype, + vec_flags); + if (aarch64_reduc_type (vinfo, stmt_info) == FOLD_LEFT_REDUCTION) + { + if (aarch64_sve_mode_p (TYPE_MODE (vectype))) + { + /* When costing an SVE FADDA, the vectorizer treats vec_to_scalar + as a single operation, whereas for Advanced SIMD it is a + per-element one. Increase the factor accordingly, both for + the reduction_latency calculation and for the op couting. */ + if (vec_flags & VEC_ADVSIMD) + factor = vect_nunits_for_cost (vectype); + } + else + /* An Advanced SIMD fold-left reduction is the same as a + scalar one and the vectorizer therefore treats vec_to_scalar + as a per-element cost. There is no extra factor to apply for + scalar code, either for reduction_latency or for the op + counting below. */ + factor = 1; + } + + /* ??? Ideally for vector code we'd do COUNT * FACTOR reductions in + parallel, but unfortunately that's not yet the case. */ + ops->reduction_latency = MAX (ops->reduction_latency, + base * count * factor); + } + + /* Assume that multiply-adds will become a single operation. */ + if (stmt_info && aarch64_multiply_add_p (vinfo, stmt_info)) + return; + + /* When costing scalar statements in vector code, the count already + includes the number of scalar elements in the vector, so we don't + need to apply the factor as well. */ + if (kind == scalar_load || kind == scalar_store || kind == scalar_stmt) + factor = 1; + + /* This can go negative with the load handling below. */ + int num_copies = count * factor; + + /* Count the basic operation cost associated with KIND. */ + switch (kind) + { + case cond_branch_taken: + case cond_branch_not_taken: + case vector_gather_load: + case vector_scatter_store: + /* We currently don't expect these to be used in a loop body. */ + break; + + case vec_perm: + case vec_promote_demote: + case vec_construct: + case vec_to_scalar: + case scalar_to_vec: + /* Assume that these operations have no overhead in the original + scalar code. */ + if (!vec_flags) + break; + /* Fallthrough. */ + case vector_stmt: + case scalar_stmt: + ops->general_ops += num_copies; + break; + + case scalar_load: + case vector_load: + case unaligned_load: + /* When costing scalars, detect cases in which we are called twice for + the same load. This happens for LD[234] operations if only some of + the results are used. The first time represents the cost of loading + the unused vectors, while the second time represents the cost of + loading the useful parts. Only the latter should count towards the + scalar costs. 
*/ + if (stmt_info && !vec_flags) + { + bool existed = false; + unsigned int &prev_count + = costs->seen_loads.get_or_insert (stmt_info, &existed); + if (existed) + num_copies -= prev_count; + else + prev_count = num_copies; + } + ops->loads += num_copies; + if (vec_flags || FLOAT_TYPE_P (vectype)) + ops->general_ops += issue_info->fp_simd_load_general_ops * num_copies; + break; + + case vector_store: + case unaligned_store: + case scalar_store: + ops->stores += num_copies; + if (vec_flags || FLOAT_TYPE_P (vectype)) + ops->general_ops += issue_info->fp_simd_store_general_ops * num_copies; + break; + } + + /* Add any embedded comparison operations. */ + if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar) + && aarch64_embedded_comparison_type (stmt_info)) + ops->general_ops += num_copies; + + /* Detect COND_REDUCTIONs and things that would need to become + COND_REDUCTIONs if they were implemented using Advanced SIMD. + There are then two sets of VEC_COND_EXPRs, whereas so far we + have only accounted for one. */ + if (vec_flags && (kind == vector_stmt || kind == vec_to_scalar)) + { + int reduc_type = aarch64_reduc_type (vinfo, stmt_info); + if ((reduc_type == EXTRACT_LAST_REDUCTION && (vec_flags & VEC_ADVSIMD)) + || reduc_type == COND_REDUCTION) + ops->general_ops += num_copies; + } + + /* Count the predicate operations needed by an SVE comparison. */ + if (sve_issue && (kind == vector_stmt || kind == vec_to_scalar)) + if (tree type = aarch64_comparison_type (stmt_info)) + { + unsigned int base = (FLOAT_TYPE_P (type) + ? sve_issue->fp_cmp_pred_ops + : sve_issue->int_cmp_pred_ops); + costs->sve_ops.pred_ops += base * num_copies; + } + + /* Add any extra overhead associated with LD[234] and ST[234] operations. */ + if (simd_issue) + switch (aarch64_ld234_st234_vectors (kind, stmt_info)) + { + case 2: + ops->general_ops += simd_issue->ld2_st2_general_ops * num_copies; + break; + + case 3: + ops->general_ops += simd_issue->ld3_st3_general_ops * num_copies; + break; + + case 4: + ops->general_ops += simd_issue->ld4_st4_general_ops * num_copies; + break; + } + + /* Add any overhead associated with gather loads and scatter stores. */ + if (sve_issue + && (kind == scalar_load || kind == scalar_store) + && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER) + { + unsigned int pairs = CEIL (count, 2); + costs->sve_ops.pred_ops + += sve_issue->gather_scatter_pair_pred_ops * pairs; + ops->general_ops += sve_issue->gather_scatter_pair_general_ops * pairs; + } +} + /* Implement targetm.vectorize.add_stmt_cost. */ static unsigned aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count, @@ -14760,6 +15258,9 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count, of just looking at KIND. */ if (stmt_info && aarch64_use_new_vector_costs_p ()) { + if (vectype && aarch64_sve_only_stmt_p (stmt_info, vectype)) + costs->saw_sve_only_op = true; + stmt_cost = aarch64_detect_scalar_stmt_subtype (vinfo, kind, stmt_info, stmt_cost); @@ -14781,6 +15282,44 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count, stmt_cost = aarch64_adjust_stmt_cost (kind, stmt_info, vectype, stmt_cost); + /* If we're recording a nonzero vector loop body cost, also estimate + the operations that would need to be issued by all relevant + implementations of the loop. 
*/ + auto *issue_info = aarch64_tune_params.vec_costs->issue_info; + if (loop_vinfo + && issue_info + && costs->vec_flags + && where == vect_body + && vectype + && stmt_cost != 0) + { + /* Record estimates for the scalar code. */ + aarch64_count_ops (vinfo, costs, count, kind, stmt_info, vectype, + 0, &costs->scalar_ops, issue_info->scalar, + vect_nunits_for_cost (vectype)); + + if (aarch64_sve_mode_p (vinfo->vector_mode) && issue_info->sve) + { + /* Record estimates for a possible Advanced SIMD version + of the SVE code. */ + aarch64_count_ops (vinfo, costs, count, kind, stmt_info, + vectype, VEC_ADVSIMD, &costs->advsimd_ops, + issue_info->advsimd, + aarch64_estimated_sve_vq ()); + + /* Record estimates for the SVE code itself. */ + aarch64_count_ops (vinfo, costs, count, kind, stmt_info, + vectype, VEC_ANY_SVE, &costs->sve_ops, + issue_info->sve, 1); + } + else + /* Record estimates for the Advanced SIMD code. Treat SVE like + Advanced SIMD if the CPU has no specific SVE costs. */ + aarch64_count_ops (vinfo, costs, count, kind, stmt_info, + vectype, VEC_ADVSIMD, &costs->advsimd_ops, + issue_info->advsimd, 1); + } + /* If we're applying the SVE vs. Advanced SIMD unrolling heuristic, estimate the number of statements in the unrolled Advanced SIMD loop. For simplicitly, we assume that one iteration of the @@ -14805,12 +15344,56 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count, return retval; } +/* Dump information about the structure. */ +void +aarch64_vec_op_count::dump () const +{ + dump_printf_loc (MSG_NOTE, vect_location, + " load operations = %d\n", loads); + dump_printf_loc (MSG_NOTE, vect_location, + " store operations = %d\n", stores); + dump_printf_loc (MSG_NOTE, vect_location, + " general operations = %d\n", general_ops); + dump_printf_loc (MSG_NOTE, vect_location, + " reduction latency = %d\n", reduction_latency); +} + +/* Dump information about the structure. */ +void +aarch64_sve_op_count::dump () const +{ + aarch64_vec_op_count::dump (); + dump_printf_loc (MSG_NOTE, vect_location, + " predicate operations = %d\n", pred_ops); +} + +/* Use ISSUE_INFO to estimate the minimum number of cycles needed to issue + the operations described by OPS. This is a very simplistic model! */ +static unsigned int +aarch64_estimate_min_cycles_per_iter + (const aarch64_vec_op_count *ops, + const aarch64_base_vec_issue_info *issue_info) +{ + unsigned int cycles = MAX (ops->reduction_latency, 1); + cycles = MAX (cycles, CEIL (ops->stores, issue_info->stores_per_cycle)); + cycles = MAX (cycles, CEIL (ops->loads + ops->stores, + issue_info->loads_stores_per_cycle)); + cycles = MAX (cycles, CEIL (ops->general_ops, + issue_info->general_ops_per_cycle)); + return cycles; +} + /* BODY_COST is the cost of a vector loop body recorded in COSTS. Adjust the cost as necessary and return the new cost. 
*/ static unsigned int aarch64_adjust_body_cost (aarch64_vector_costs *costs, unsigned int body_cost) { unsigned int orig_body_cost = body_cost; + bool should_disparage = false; + + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Original vector body cost = %d\n", body_cost); if (costs->unrolled_advsimd_stmts) { @@ -14841,10 +15424,203 @@ aarch64_adjust_body_cost (aarch64_vector_costs *costs, unsigned int body_cost) "Increasing body cost to %d to account for" " unrolling\n", min_cost); body_cost = min_cost; + should_disparage = true; } } } + auto *issue_info = aarch64_tune_params.vec_costs->issue_info; + if (!issue_info) + return body_cost; + + unsigned int scalar_cycles_per_iter + = aarch64_estimate_min_cycles_per_iter (&costs->scalar_ops, + issue_info->scalar); + unsigned int advsimd_cycles_per_iter + = aarch64_estimate_min_cycles_per_iter (&costs->advsimd_ops, + issue_info->advsimd); + bool could_use_advsimd + = ((costs->vec_flags & VEC_ADVSIMD) + || (aarch64_autovec_preference != 2 + && (aarch64_tune_params.extra_tuning_flags + & AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT) + && !costs->saw_sve_only_op)); + + if (dump_enabled_p ()) + { + if (IN_RANGE (costs->num_vector_iterations, 0, 65536)) + dump_printf_loc (MSG_NOTE, vect_location, + "Vector loop iterates at most %wd times\n", + costs->num_vector_iterations); + dump_printf_loc (MSG_NOTE, vect_location, "Scalar issue estimate:\n"); + costs->scalar_ops.dump (); + dump_printf_loc (MSG_NOTE, vect_location, + " estimated cycles per iteration = %d\n", + scalar_cycles_per_iter); + if (could_use_advsimd) + { + dump_printf_loc (MSG_NOTE, vect_location, + "Advanced SIMD issue estimate:\n"); + costs->advsimd_ops.dump (); + dump_printf_loc (MSG_NOTE, vect_location, + " estimated cycles per iteration = %d\n", + advsimd_cycles_per_iter); + } + else + dump_printf_loc (MSG_NOTE, vect_location, + "Loop could not use Advanced SIMD\n"); + } + + uint64_t vector_cycles_per_iter = advsimd_cycles_per_iter; + unsigned int vector_reduction_latency = costs->advsimd_ops.reduction_latency; + if ((costs->vec_flags & VEC_ANY_SVE) && issue_info->sve) + { + /* Estimate the minimum number of cycles per iteration needed to issue + non-predicate operations. */ + unsigned int sve_cycles_per_iter + = aarch64_estimate_min_cycles_per_iter (&costs->sve_ops, + issue_info->sve); + + /* Separately estimate the minimum number of cycles per iteration needed + to issue the predicate operations. */ + unsigned int pred_cycles_per_iter + = CEIL (costs->sve_ops.pred_ops, issue_info->sve->pred_ops_per_cycle); + + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, "SVE issue estimate:\n"); + costs->sve_ops.dump (); + dump_printf_loc (MSG_NOTE, vect_location, + " estimated cycles per iteration for non-predicate" + " operations = %d\n", sve_cycles_per_iter); + if (costs->sve_ops.pred_ops) + dump_printf_loc (MSG_NOTE, vect_location, " estimated cycles per" + " iteration for predicate operations = %d\n", + pred_cycles_per_iter); + } + + vector_cycles_per_iter = MAX (sve_cycles_per_iter, pred_cycles_per_iter); + vector_reduction_latency = costs->sve_ops.reduction_latency; + + /* If the scalar version of the loop could issue at least as + quickly as the predicate parts of the SVE loop, make the SVE loop + prohibitively expensive. In this case vectorization is adding an + overhead that the original scalar code didn't have. 
+ + This is mostly intended to detect cases in which WHILELOs dominate + for very tight loops, which is something that normal latency-based + costs would not model. Adding this kind of cliffedge would be + too drastic for scalar_cycles_per_iter vs. sve_cycles_per_iter; + code later in the function handles that case in a more + conservative way. */ + uint64_t sve_estimate = pred_cycles_per_iter + 1; + if (scalar_cycles_per_iter < sve_estimate) + { + unsigned int min_cost + = orig_body_cost * estimated_poly_value (BYTES_PER_SVE_VECTOR); + if (body_cost < min_cost) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Increasing body cost to %d because the" + " scalar code could issue within the limit" + " imposed by predicate operations\n", + min_cost); + body_cost = min_cost; + should_disparage = true; + } + } + + /* If it appears that the Advanced SIMD version of a loop could issue + more quickly than the SVE one, increase the SVE cost in proportion + to the difference. The intention is to make Advanced SIMD preferable + in cases where an Advanced SIMD version exists, without increasing + the costs so much that SVE won't be used at all. + + The reasoning is similar to the scalar vs. predicate comparison above: + if the issue rate of the SVE code is limited by predicate operations + (i.e. if pred_cycles_per_iter > sve_cycles_per_iter), and if the + Advanced SIMD code could issue within the limit imposed by the + predicate operations, the predicate operations are adding an + overhead that the original code didn't have and so we should prefer + the Advanced SIMD version. However, if the predicate operations + do not dominate in this way, we should only increase the cost of + the SVE code if sve_cycles_per_iter is strictly greater than + advsimd_cycles_per_iter. Given rounding effects, this should mean + that Advanced SIMD is either better or at least no worse. */ + if (sve_cycles_per_iter >= pred_cycles_per_iter) + sve_estimate = sve_cycles_per_iter; + if (could_use_advsimd && advsimd_cycles_per_iter < sve_estimate) + { + /* This ensures that min_cost > orig_body_cost * 2. */ + unsigned int min_cost + = orig_body_cost * CEIL (sve_estimate, advsimd_cycles_per_iter) + 1; + if (body_cost < min_cost) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Increasing body cost to %d because Advanced" + " SIMD code could issue as quickly\n", + min_cost); + body_cost = min_cost; + should_disparage = true; + } + } + } + + /* Decide whether to stick to latency-based costs or whether to try to + take issue rates into account. */ + unsigned int threshold = aarch64_loop_vect_issue_rate_niters; + if (costs->vec_flags & VEC_ANY_SVE) + threshold = CEIL (threshold, aarch64_estimated_sve_vq ()); + + if (costs->num_vector_iterations >= 1 + && costs->num_vector_iterations < threshold) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Low iteration count, so using pure latency" + " costs\n"); + } + /* Increase the cost of the vector code if it looks like the scalar code + could issue more quickly. These values are only rough estimates, + so minor differences should only result in minor changes. 
*/ + else if (scalar_cycles_per_iter < vector_cycles_per_iter) + { + body_cost = CEIL (body_cost * vector_cycles_per_iter, + scalar_cycles_per_iter); + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Increasing body cost to %d because scalar code" + " would issue more quickly\n", body_cost); + } + /* In general, it's expected that the proposed vector code would be able + to issue more quickly than the original scalar code. This should + already be reflected to some extent in the latency-based costs. + + However, the latency-based costs effectively assume that the scalar + code and the vector code execute serially, which tends to underplay + one important case: if the real (non-serialized) execution time of + a scalar iteration is dominated by loop-carried dependencies, + and if the vector code is able to reduce both the length of + the loop-carried dependencies *and* the number of cycles needed + to issue the code in general, we can be more confident that the + vector code is an improvement, even if adding the other (non-loop-carried) + latencies tends to hide this saving. We therefore reduce the cost of the + vector loop body in proportion to the saving. */ + else if (costs->scalar_ops.reduction_latency > vector_reduction_latency + && costs->scalar_ops.reduction_latency == scalar_cycles_per_iter + && scalar_cycles_per_iter > vector_cycles_per_iter + && !should_disparage) + { + body_cost = CEIL (body_cost * vector_cycles_per_iter, + scalar_cycles_per_iter); + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Decreasing body cost to %d account for smaller" + " reduction latency\n", body_cost); + } + return body_cost; } diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt index 91e5c61..32191cf 100644 --- a/gcc/config/aarch64/aarch64.opt +++ b/gcc/config/aarch64/aarch64.opt @@ -277,3 +277,6 @@ The number of Newton iterations for calculating the reciprocal for double type. -param=aarch64-autovec-preference= Target Joined UInteger Var(aarch64_autovec_preference) Init(0) IntegerRange(0, 4) Param + +-param=aarch64-loop-vect-issue-rate-niters= +Target Joined UInteger Var(aarch64_loop_vect_issue_rate_niters) Init(6) IntegerRange(0, 65536) Param -- cgit v1.1 From 6b8b0c8e243bdc122a9ddd42030275494b1148ff Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Fri, 26 Mar 2021 16:08:38 +0000 Subject: aarch64: Add costs for LD[34] and ST[34] postincrements Most postincrements are cheap on Neoverse V1, but it's generally better to avoid them on LD[34] and ST[34] instructions. This patch adds separate address costs fields for these cases. Other CPUs continue to use the same costs for all postincrements. gcc/ * config/aarch64/aarch64-protos.h (cpu_addrcost_table::post_modify_ld3_st3): New member variable. (cpu_addrcost_table::post_modify_ld4_st4): Likewise. * config/aarch64/aarch64.c (generic_addrcost_table): Update accordingly, using the same costs as for post_modify. (exynosm1_addrcost_table, xgene1_addrcost_table): Likewise. (thunderx2t99_addrcost_table, thunderx3t110_addrcost_table): (tsv110_addrcost_table, qdf24xx_addrcost_table): Likewise. (a64fx_addrcost_table): Likewise. (neoversev1_addrcost_table): New. (neoversev1_tunings): Use neoversev1_addrcost_table. (aarch64_address_cost): Use the new post_modify costs for CImode and XImode. 
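As a concrete picture of what the new address-cost fields apply to, consider a hypothetical stride-3 loop of the kind GCC usually implements with LD3 on AArch64. The function below is illustrative only; it is not taken from the patch or its testsuite, and whether a post-increment address is actually chosen depends on ivopts and the surrounding code.

  /* Typical LD3 candidate: three interleaved byte channels reduced to one.
     The vectorized body commonly uses a form such as
         ld3  {v0.16b - v2.16b}, [x0], #48
     when a post-increment address is selected.  The new
     post_modify_ld3_st3 / post_modify_ld4_st4 entries let the Neoverse V1
     tuning charge extra for exactly that addressing form, steering the
     compiler towards a separate index increment instead.  */
  void
  to_gray (const unsigned char *rgb, unsigned char *gray, int n)
  {
    for (int i = 0; i < n; i++)
      gray[i] = (rgb[3 * i] + rgb[3 * i + 1] + rgb[3 * i + 2]) / 3;
  }
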
--- gcc/config/aarch64/aarch64-protos.h | 2 ++ gcc/config/aarch64/aarch64.c | 45 +++++++++++++++++++++++++++++++++++-- 2 files changed, 45 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index ca1ed9e..d5d5417 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -177,6 +177,8 @@ struct cpu_addrcost_table const struct scale_addr_mode_cost addr_scale_costs; const int pre_modify; const int post_modify; + const int post_modify_ld3_st3; + const int post_modify_ld4_st4; const int register_offset; const int register_sextend; const int register_zextend; diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 6d961be..a573850 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -364,6 +364,8 @@ static const struct cpu_addrcost_table generic_addrcost_table = }, 0, /* pre_modify */ 0, /* post_modify */ + 0, /* post_modify_ld3_st3 */ + 0, /* post_modify_ld4_st4 */ 0, /* register_offset */ 0, /* register_sextend */ 0, /* register_zextend */ @@ -380,6 +382,8 @@ static const struct cpu_addrcost_table exynosm1_addrcost_table = }, 0, /* pre_modify */ 0, /* post_modify */ + 0, /* post_modify_ld3_st3 */ + 0, /* post_modify_ld4_st4 */ 1, /* register_offset */ 1, /* register_sextend */ 2, /* register_zextend */ @@ -396,6 +400,8 @@ static const struct cpu_addrcost_table xgene1_addrcost_table = }, 1, /* pre_modify */ 1, /* post_modify */ + 1, /* post_modify_ld3_st3 */ + 1, /* post_modify_ld4_st4 */ 0, /* register_offset */ 1, /* register_sextend */ 1, /* register_zextend */ @@ -412,6 +418,8 @@ static const struct cpu_addrcost_table thunderx2t99_addrcost_table = }, 0, /* pre_modify */ 0, /* post_modify */ + 0, /* post_modify_ld3_st3 */ + 0, /* post_modify_ld4_st4 */ 2, /* register_offset */ 3, /* register_sextend */ 3, /* register_zextend */ @@ -428,6 +436,8 @@ static const struct cpu_addrcost_table thunderx3t110_addrcost_table = }, 0, /* pre_modify */ 0, /* post_modify */ + 0, /* post_modify_ld3_st3 */ + 0, /* post_modify_ld4_st4 */ 2, /* register_offset */ 3, /* register_sextend */ 3, /* register_zextend */ @@ -444,6 +454,8 @@ static const struct cpu_addrcost_table tsv110_addrcost_table = }, 0, /* pre_modify */ 0, /* post_modify */ + 0, /* post_modify_ld3_st3 */ + 0, /* post_modify_ld4_st4 */ 0, /* register_offset */ 1, /* register_sextend */ 1, /* register_zextend */ @@ -460,6 +472,8 @@ static const struct cpu_addrcost_table qdf24xx_addrcost_table = }, 1, /* pre_modify */ 1, /* post_modify */ + 1, /* post_modify_ld3_st3 */ + 1, /* post_modify_ld4_st4 */ 3, /* register_offset */ 3, /* register_sextend */ 3, /* register_zextend */ @@ -476,12 +490,32 @@ static const struct cpu_addrcost_table a64fx_addrcost_table = }, 0, /* pre_modify */ 0, /* post_modify */ + 0, /* post_modify_ld3_st3 */ + 0, /* post_modify_ld4_st4 */ 2, /* register_offset */ 3, /* register_sextend */ 3, /* register_zextend */ 0, /* imm_offset */ }; +static const struct cpu_addrcost_table neoversev1_addrcost_table = +{ + { + 1, /* hi */ + 0, /* si */ + 0, /* di */ + 1, /* ti */ + }, + 0, /* pre_modify */ + 0, /* post_modify */ + 3, /* post_modify_ld3_st3 */ + 3, /* post_modify_ld4_st4 */ + 0, /* register_offset */ + 0, /* register_sextend */ + 0, /* register_zextend */ + 0 /* imm_offset */ +}; + static const struct cpu_regmove_cost generic_regmove_cost = { 1, /* GP2GP */ @@ -1777,7 +1811,7 @@ static const struct cpu_vector_cost neoversev1_vector_cost = static 
const struct tune_params neoversev1_tunings = { &cortexa76_extra_costs, - &generic_addrcost_table, + &neoversev1_addrcost_table, &generic_regmove_cost, &neoversev1_vector_cost, &generic_branch_cost, @@ -12077,7 +12111,14 @@ aarch64_address_cost (rtx x, if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY) cost += addr_cost->pre_modify; else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY) - cost += addr_cost->post_modify; + { + if (mode == CImode) + cost += addr_cost->post_modify_ld3_st3; + else if (mode == XImode) + cost += addr_cost->post_modify_ld4_st4; + else + cost += addr_cost->post_modify; + } else gcc_unreachable (); -- cgit v1.1 From 42a21b4cb540be93548a6ff6d4cb4a73ab1665be Mon Sep 17 00:00:00 2001 From: David Edelsohn Date: Sun, 14 Mar 2021 15:09:21 -0400 Subject: aix: ABI struct alignment (PR99557) The AIX power alignment rules apply the natural alignment of the "first member" if it is of a floating-point data type (or is an aggregate whose recursively "first" member or element is such a type). The alignment associated with these types for subsequent members use an alignment value where the floating-point data type is considered to have 4-byte alignment. GCC had been stripping array type but had not recursively looked within structs and unions. This also applies to classes and subclasses and, therefore, becomes more prominent with C++. For example, struct A { double x[2]; int y; }; struct B { int i; struct A a; }; struct A has double-word alignment for the bare type, but word alignment and offset within struct B despite the alignment of struct A. If struct A were the first member of struct B, struct B would have double-word alignment. One must search for the innermost first member to increase the alignment if double and then search for the innermost first member to reduce the alignment if the TYPE had double-word alignment solely because the innermost first member was double. This patch recursively looks through the first member to apply the double-word alignment to the struct / union as a whole and to apply the word alignment to the struct or union as a member within a struct or union. This is an ABI change for GCC on AIX, but GCC on AIX had not correctly implemented the AIX ABI and had not been compatible with the IBM XL compiler. Bootstrapped on powerpc-ibm-aix7.2.3.0. gcc/ChangeLog: * config/rs6000/aix.h (ADJUST_FIELD_ALIGN): Call function. * config/rs6000/rs6000-protos.h (rs6000_special_adjust_field_align): Declare. * config/rs6000/rs6000.c (rs6000_special_adjust_field_align): New. (rs6000_special_round_type_align): Recursively check innermost first field. gcc/testsuite/ChangeLog: * gcc.target/powerpc/pr99557.c: New. --- gcc/config/rs6000/aix.h | 6 +-- gcc/config/rs6000/rs6000-protos.h | 1 + gcc/config/rs6000/rs6000.c | 89 ++++++++++++++++++++++++++++++++------- 3 files changed, 77 insertions(+), 19 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/aix.h b/gcc/config/rs6000/aix.h index 2db50c8..7fccb313 100644 --- a/gcc/config/rs6000/aix.h +++ b/gcc/config/rs6000/aix.h @@ -223,10 +223,8 @@ /* This now supports a natural alignment mode. */ /* AIX word-aligns FP doubles but doubleword-aligns 64-bit ints. */ #define ADJUST_FIELD_ALIGN(FIELD, TYPE, COMPUTED) \ - ((TARGET_ALIGN_NATURAL == 0 \ - && (TYPE_MODE (strip_array_types (TYPE)) == DFmode \ - || TYPE_MODE (strip_array_types (TYPE)) == DCmode)) \ - ? MIN ((COMPUTED), 32) \ + (TARGET_ALIGN_NATURAL == 0 \ + ? 
rs6000_special_adjust_field_align (TYPE, COMPUTED) \ : (COMPUTED)) /* AIX increases natural record alignment to doubleword if the first diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h index 203660b..c44fd3d 100644 --- a/gcc/config/rs6000/rs6000-protos.h +++ b/gcc/config/rs6000/rs6000-protos.h @@ -227,6 +227,7 @@ address_is_prefixed (rtx addr, #ifdef TREE_CODE extern unsigned int rs6000_data_alignment (tree, unsigned int, enum data_align); extern bool rs6000_special_adjust_field_align_p (tree, unsigned int); +extern unsigned int rs6000_special_adjust_field_align (tree, unsigned int); extern unsigned int rs6000_special_round_type_align (tree, unsigned int, unsigned int); extern unsigned int darwin_rs6000_special_round_type_align (tree, unsigned int, diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 34c4eda..fd2b0b5 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -7853,32 +7853,91 @@ rs6000_special_adjust_field_align_p (tree type, unsigned int computed) return false; } -/* AIX increases natural record alignment to doubleword if the first - field is an FP double while the FP fields remain word aligned. */ +/* AIX word-aligns FP doubles but doubleword-aligns 64-bit ints. */ + +unsigned int +rs6000_special_adjust_field_align (tree type, unsigned int computed) +{ + if (computed <= 32) + return computed; + + /* Strip initial arrays. */ + while (TREE_CODE (type) == ARRAY_TYPE) + type = TREE_TYPE (type); + + /* If RECORD or UNION, recursively find the first field. */ + while (AGGREGATE_TYPE_P (type)) + { + tree field = TYPE_FIELDS (type); + + /* Skip all non field decls */ + while (field != NULL + && (TREE_CODE (field) != FIELD_DECL + || DECL_FIELD_ABI_IGNORED (field))) + field = DECL_CHAIN (field); + + if (! field) + break; + + /* A packed field does not contribute any extra alignment. */ + if (DECL_PACKED (field)) + return computed; + + type = TREE_TYPE (field); + + /* Strip arrays. */ + while (TREE_CODE (type) == ARRAY_TYPE) + type = TREE_TYPE (type); + } + + if (! AGGREGATE_TYPE_P (type) && type != error_mark_node + && (TYPE_MODE (type) == DFmode || TYPE_MODE (type) == DCmode)) + computed = MIN (computed, 32); + + return computed; +} + +/* AIX increases natural record alignment to doubleword if the innermost first + field is an FP double while the FP fields remain word aligned. + Only called if TYPE initially is a RECORD or UNION. */ unsigned int rs6000_special_round_type_align (tree type, unsigned int computed, unsigned int specified) { unsigned int align = MAX (computed, specified); - tree field = TYPE_FIELDS (type); - /* Skip all non field decls */ - while (field != NULL - && (TREE_CODE (field) != FIELD_DECL - || DECL_FIELD_ABI_IGNORED (field))) - field = DECL_CHAIN (field); + if (TYPE_PACKED (type) || align >= 64) + return align; - if (field != NULL && field != type) + /* If RECORD or UNION, recursively find the first field. */ + do { + tree field = TYPE_FIELDS (type); + + /* Skip all non field decls */ + while (field != NULL + && (TREE_CODE (field) != FIELD_DECL + || DECL_FIELD_ABI_IGNORED (field))) + field = DECL_CHAIN (field); + + if (! field) + break; + + /* A packed field does not contribute any extra alignment. */ + if (DECL_PACKED (field)) + return align; + type = TREE_TYPE (field); + + /* Strip arrays. 
*/ while (TREE_CODE (type) == ARRAY_TYPE) type = TREE_TYPE (type); + } while (AGGREGATE_TYPE_P (type)); - if (type != error_mark_node - && (TYPE_MODE (type) == DFmode || TYPE_MODE (type) == DCmode)) - align = MAX (align, 64); - } + if (! AGGREGATE_TYPE_P (type) && type != error_mark_node + && (TYPE_MODE (type) == DFmode || TYPE_MODE (type) == DCmode)) + align = MAX (align, 64); return align; } @@ -10576,7 +10635,7 @@ rs6000_emit_move (rtx dest, rtx source, machine_mode mode) case E_OOmode: case E_XOmode: if (CONST_INT_P (operands[1]) && INTVAL (operands[1]) != 0) - error ("%qs is an opaque type, and you can't set it to other values.", + error ("%qs is an opaque type, and you cannot set it to other values", (mode == OOmode) ? "__vector_pair" : "__vector_quad"); break; @@ -20049,7 +20108,7 @@ rs6000_handle_altivec_attribute (tree *node, else if (TREE_CODE (type) == COMPLEX_TYPE) error ("use of % in AltiVec types is invalid"); else if (DECIMAL_FLOAT_MODE_P (mode)) - error ("use of decimal floating point types in AltiVec types is invalid"); + error ("use of decimal floating-point types in AltiVec types is invalid"); else if (!TARGET_VSX) { if (type == long_unsigned_type_node || type == long_integer_type_node) -- cgit v1.1 From 499fa254ae8c9752d8c2cf3130b13ffddfd83546 Mon Sep 17 00:00:00 2001 From: David Edelsohn Date: Sun, 28 Mar 2021 13:11:50 -0400 Subject: aix: TLS DWARF symbol decorations. GCC currently emits TLS relocation decorations on symbols in DWARF sections. Recent changes to the AIX linker cause it to reject such symbols. This patch removes the decorations (@ie, @le, @m) and emit only the qualified symbol name. gcc/ChangeLog: * config/rs6000/rs6000.c (rs6000_output_dwarf_dtprel): Do not add XCOFF TLS reloc decorations. --- gcc/config/rs6000/rs6000.c | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index fd2b0b5..6a8943d 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -9027,26 +9027,6 @@ rs6000_output_dwarf_dtprel (FILE *file, int size, rtx x) output_addr_const (file, x); if (TARGET_ELF) fputs ("@dtprel+0x8000", file); - else if (TARGET_XCOFF && SYMBOL_REF_P (x)) - { - switch (SYMBOL_REF_TLS_MODEL (x)) - { - case 0: - break; - case TLS_MODEL_LOCAL_EXEC: - fputs ("@le", file); - break; - case TLS_MODEL_INITIAL_EXEC: - fputs ("@ie", file); - break; - case TLS_MODEL_GLOBAL_DYNAMIC: - case TLS_MODEL_LOCAL_DYNAMIC: - fputs ("@m", file); - break; - default: - gcc_unreachable (); - } - } } /* Return true if X is a symbol that refers to real (rather than emulated) -- cgit v1.1 From 37d9074e12082132ae62c12fbe958c697f638c0a Mon Sep 17 00:00:00 2001 From: Kyrylo Tkachov Date: Mon, 29 Mar 2021 11:52:24 +0100 Subject: aarch64: PR target/99037 Fix RTL represntation in move_lo_quad patterns This patch fixes the RTL representation of the move_lo_quad patterns to use aarch64_simd_or_scalar_imm_zero for the zero part rather than a vec_duplicate of zero or a const_int 0. The expander that generates them is also adjusted so that we use and match the correct const_vector forms throughout. Co-Authored-By: Jakub Jelinek gcc/ChangeLog: PR target/99037 * config/aarch64/aarch64-simd.md (move_lo_quad_internal_): Use aarch64_simd_or_scalar_imm_zero to match zeroes. Remove pattern matching const_int 0. (move_lo_quad_internal_be_): Likewise. (move_lo_quad_): Update for the above. * config/aarch64/iterators.md (VQ_2E): Delete. 
gcc/testsuite/ChangeLog: PR target/99808 * gcc.target/aarch64/pr99808.c: New test. --- gcc/config/aarch64/aarch64-simd.md | 49 ++++++++------------------------------ gcc/config/aarch64/iterators.md | 3 --- 2 files changed, 10 insertions(+), 42 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 348a43d..d86e8e72 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -1586,25 +1586,10 @@ ;; On big-endian this is { zeroes, operand } (define_insn "move_lo_quad_internal_" - [(set (match_operand:VQMOV_NO2E 0 "register_operand" "=w,w,w") - (vec_concat:VQMOV_NO2E + [(set (match_operand:VQMOV 0 "register_operand" "=w,w,w") + (vec_concat:VQMOV (match_operand: 1 "register_operand" "w,r,r") - (vec_duplicate: (const_int 0))))] - "TARGET_SIMD && !BYTES_BIG_ENDIAN" - "@ - dup\\t%d0, %1.d[0] - fmov\\t%d0, %1 - dup\\t%d0, %1" - [(set_attr "type" "neon_dup,f_mcr,neon_dup") - (set_attr "length" "4") - (set_attr "arch" "simd,fp,simd")] -) - -(define_insn "move_lo_quad_internal_" - [(set (match_operand:VQ_2E 0 "register_operand" "=w,w,w") - (vec_concat:VQ_2E - (match_operand: 1 "register_operand" "w,r,r") - (const_int 0)))] + (match_operand: 2 "aarch64_simd_or_scalar_imm_zero")))] "TARGET_SIMD && !BYTES_BIG_ENDIAN" "@ dup\\t%d0, %1.d[0] @@ -1616,24 +1601,9 @@ ) (define_insn "move_lo_quad_internal_be_" - [(set (match_operand:VQMOV_NO2E 0 "register_operand" "=w,w,w") - (vec_concat:VQMOV_NO2E - (vec_duplicate: (const_int 0)) - (match_operand: 1 "register_operand" "w,r,r")))] - "TARGET_SIMD && BYTES_BIG_ENDIAN" - "@ - dup\\t%d0, %1.d[0] - fmov\\t%d0, %1 - dup\\t%d0, %1" - [(set_attr "type" "neon_dup,f_mcr,neon_dup") - (set_attr "length" "4") - (set_attr "arch" "simd,fp,simd")] -) - -(define_insn "move_lo_quad_internal_be_" - [(set (match_operand:VQ_2E 0 "register_operand" "=w,w,w") - (vec_concat:VQ_2E - (const_int 0) + [(set (match_operand:VQMOV 0 "register_operand" "=w,w,w") + (vec_concat:VQMOV + (match_operand: 2 "aarch64_simd_or_scalar_imm_zero") (match_operand: 1 "register_operand" "w,r,r")))] "TARGET_SIMD && BYTES_BIG_ENDIAN" "@ @@ -1647,13 +1617,14 @@ (define_expand "move_lo_quad_" [(match_operand:VQMOV 0 "register_operand") - (match_operand:VQMOV 1 "register_operand")] + (match_operand: 1 "register_operand")] "TARGET_SIMD" { + rtx zs = CONST0_RTX (mode); if (BYTES_BIG_ENDIAN) - emit_insn (gen_move_lo_quad_internal_be_ (operands[0], operands[1])); + emit_insn (gen_move_lo_quad_internal_be_ (operands[0], operands[1], zs)); else - emit_insn (gen_move_lo_quad_internal_ (operands[0], operands[1])); + emit_insn (gen_move_lo_quad_internal_ (operands[0], operands[1], zs)); DONE; } ) diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index fb6e228..5f5abd6 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -125,9 +125,6 @@ ;; VQ without 2 element modes. (define_mode_iterator VQ_NO2E [V16QI V8HI V4SI V8HF V4SF V8BF]) -;; Quad vector with only 2 element modes. -(define_mode_iterator VQ_2E [V2DI V2DF]) - ;; BFmode vector modes. (define_mode_iterator VBF [V4BF V8BF]) -- cgit v1.1 From e4005cf8717abe8c949f840c707e02e6c394c2e7 Mon Sep 17 00:00:00 2001 From: Alex Coplan Date: Mon, 29 Mar 2021 12:18:19 +0100 Subject: aarch64: Fix SVE ACLE builtins with LTO [PR99216] As discussed in the PR, we currently have two different numbering schemes for SVE builtins: one for C, and one for C++. 
This is problematic for LTO, where we end up getting confused about which intrinsic we're talking about. This patch inserts placeholders into the registered_functions vector to ensure that there is a consistent numbering scheme for both C and C++. We use integer_zero_node as a placeholder node instead of building a function decl. This is safe because the node is only returned by the TARGET_BUILTIN_DECL hook, which (on AArch64) is only used for validation when builtin decls are streamed into lto1. gcc/ChangeLog: PR target/99216 * config/aarch64/aarch64-sve-builtins.cc (function_builder::add_function): Add placeholder_p argument, use placeholder decls if this is set. (function_builder::add_unique_function): Instead of conditionally adding direct overloads, unconditionally add either a direct overload or a placeholder. (function_builder::add_overloaded_function): Set placeholder_p if we're using C++ overloads. Use the obstack for string storage instead of relying on the tree nodes. (function_builder::add_overloaded_functions): Don't return early for m_direct_overloads: we need to add placeholders. * config/aarch64/aarch64-sve-builtins.h (function_builder::add_function): Add placeholder_p argument. gcc/testsuite/ChangeLog: PR target/99216 * g++.target/aarch64/sve/pr99216.C: New test. --- gcc/config/aarch64/aarch64-sve-builtins.cc | 59 ++++++++++++++++++------------ gcc/config/aarch64/aarch64-sve-builtins.h | 3 +- 2 files changed, 38 insertions(+), 24 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc b/gcc/config/aarch64/aarch64-sve-builtins.cc index 25612d2..f44f81f 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins.cc +++ b/gcc/config/aarch64/aarch64-sve-builtins.cc @@ -999,12 +999,29 @@ registered_function & function_builder::add_function (const function_instance &instance, const char *name, tree fntype, tree attrs, uint64_t required_extensions, - bool overloaded_p) + bool overloaded_p, + bool placeholder_p) { unsigned int code = vec_safe_length (registered_functions); code = (code << AARCH64_BUILTIN_SHIFT) | AARCH64_BUILTIN_SVE; - tree decl = simulate_builtin_function_decl (input_location, name, fntype, - code, NULL, attrs); + + /* We need to be able to generate placeholders to enusre that we have a + consistent numbering scheme for function codes between the C and C++ + frontends, so that everything ties up in LTO. + + Currently, tree-streamer-in.c:unpack_ts_function_decl_value_fields + validates that tree nodes returned by TARGET_BUILTIN_DECL are non-NULL and + some node other than error_mark_node. This is a holdover from when builtin + decls were streamed by code rather than by value. + + Ultimately, we should be able to remove this validation of BUILT_IN_MD + nodes and remove the target hook. For now, however, we need to appease the + validation and return a non-NULL, non-error_mark_node node, so we + arbitrarily choose integer_zero_node. */ + tree decl = placeholder_p + ? integer_zero_node + : simulate_builtin_function_decl (input_location, name, fntype, + code, NULL, attrs); registered_function &rfn = *ggc_alloc (); rfn.instance = instance; @@ -1036,7 +1053,7 @@ function_builder::add_unique_function (const function_instance &instance, argument_types.address ()); tree attrs = get_attributes (instance); registered_function &rfn = add_function (instance, name, fntype, attrs, - required_extensions, false); + required_extensions, false, false); /* Enter the function into the hash table. 
*/ hashval_t hash = instance.hash (); @@ -1047,16 +1064,14 @@ function_builder::add_unique_function (const function_instance &instance, /* Also add the function under its overloaded alias, if we want a separate decl for each instance of an overloaded function. */ - if (m_direct_overloads || force_direct_overloads) + char *overload_name = get_name (instance, true); + if (strcmp (name, overload_name) != 0) { - char *overload_name = get_name (instance, true); - if (strcmp (name, overload_name) != 0) - { - /* Attribute lists shouldn't be shared. */ - tree attrs = get_attributes (instance); - add_function (instance, overload_name, fntype, attrs, - required_extensions, false); - } + /* Attribute lists shouldn't be shared. */ + tree attrs = get_attributes (instance); + bool placeholder_p = !(m_direct_overloads || force_direct_overloads); + add_function (instance, overload_name, fntype, attrs, + required_extensions, false, placeholder_p); } obstack_free (&m_string_obstack, name); @@ -1077,18 +1092,19 @@ function_builder::add_overloaded_function (const function_instance &instance, { char *name = get_name (instance, true); if (registered_function **map_value = m_overload_names.get (name)) - gcc_assert ((*map_value)->instance == instance - && ((*map_value)->required_extensions - & ~required_extensions) == 0); + { + gcc_assert ((*map_value)->instance == instance + && ((*map_value)->required_extensions + & ~required_extensions) == 0); + obstack_free (&m_string_obstack, name); + } else { registered_function &rfn = add_function (instance, name, m_overload_type, NULL_TREE, - required_extensions, true); - const char *permanent_name = IDENTIFIER_POINTER (DECL_NAME (rfn.decl)); - m_overload_names.put (permanent_name, &rfn); + required_extensions, true, m_direct_overloads); + m_overload_names.put (name, &rfn); } - obstack_free (&m_string_obstack, name); } /* If we are using manual overload resolution, add one function decl @@ -1098,9 +1114,6 @@ void function_builder::add_overloaded_functions (const function_group_info &group, mode_suffix_index mode) { - if (m_direct_overloads) - return; - unsigned int explicit_type0 = (*group.shape)->explicit_type_suffix_p (0); unsigned int explicit_type1 = (*group.shape)->explicit_type_suffix_p (1); for (unsigned int pi = 0; group.preds[pi] != NUM_PREDS; ++pi) diff --git a/gcc/config/aarch64/aarch64-sve-builtins.h b/gcc/config/aarch64/aarch64-sve-builtins.h index 620e188..b701f90 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins.h +++ b/gcc/config/aarch64/aarch64-sve-builtins.h @@ -337,7 +337,8 @@ private: tree get_attributes (const function_instance &); registered_function &add_function (const function_instance &, - const char *, tree, tree, uint64_t, bool); + const char *, tree, tree, + uint64_t, bool, bool); /* The function type to use for functions that are resolved by function_resolver. */ -- cgit v1.1 From cc2fda1328ee69b92724d6b3cffb741f07d86047 Mon Sep 17 00:00:00 2001 From: Mihailo Stojanovic Date: Tue, 30 Mar 2021 11:42:49 +0100 Subject: aarch64: Prevent use of SIMD fcvtz[su] instruction variant with "nosimd" Currently, SF->SI and DF->DI conversions on Aarch64 with the "nosimd" flag provided sometimes cause the emitting of a vector variant of the fcvtz[su] instruction (e.g. fcvtzu s0, s0). This modifies the corresponding pattern to only select the vector variant of the instruction when generating code with SIMD enabled. gcc/ChangeLog: * config/aarch64/aarch64.md (_trunc2): Set the "arch" attribute to disambiguate between SIMD and FP variants of the instruction. 
gcc/testsuite/ChangeLog: * gcc.target/aarch64/fcvt_nosimd.c: New test. --- gcc/config/aarch64/aarch64.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index b2abb5b..dd1dc2b 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -5989,7 +5989,8 @@ "@ fcvtz\t%0, %1 fcvtz\t%0, %1" - [(set_attr "type" "neon_fp_to_int_s,f_cvtf2i")] + [(set_attr "type" "neon_fp_to_int_s,f_cvtf2i") + (set_attr "arch" "simd,fp")] ) ;; Convert HF -> SI or DI -- cgit v1.1 From 48c79f054bf435051c95ee093c45a0f8c9de5b4e Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Tue, 30 Mar 2021 11:42:50 +0100 Subject: aarch64: Tweak post-RA handling of CONST_INT moves [PR98136] This PR is a regression caused by r8-5967, where we replaced a call to aarch64_internal_mov_immediate in aarch64_add_offset with a call to aarch64_force_temporary, which in turn uses the normal emit_move_insn{,_1} routines. The problem is that aarch64_add_offset can be called while outputting a thunk, where we require all instructions to be valid without splitting. However, the move expanders were not splitting CONST_INT moves themselves. I think the right fix is to make the move expanders work even in this scenario, rather than require callers to handle it as a special case. gcc/ PR target/98136 * config/aarch64/aarch64.md (mov): Pass multi-instruction CONST_INTs to aarch64_expand_mov_immediate when called after RA. gcc/testsuite/ PR target/98136 * g++.dg/pr98136.C: New test. --- gcc/config/aarch64/aarch64.md | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index dd1dc2b..a398c3d 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -1241,10 +1241,19 @@ if (GET_CODE (operands[0]) == MEM && operands[1] != const0_rtx) operands[1] = force_reg (mode, operands[1]); - /* FIXME: RR we still need to fix up what we are doing with - symbol_refs and other types of constants. */ - if (CONSTANT_P (operands[1]) - && !CONST_INT_P (operands[1])) + /* Lower moves of symbolic constants into individual instructions. + Doing this now is sometimes necessary for correctness, since some + sequences require temporary pseudo registers. Lowering now is also + often better for optimization, since more RTL passes get the + chance to optimize the individual instructions. + + When called after RA, also split multi-instruction moves into + smaller pieces now, since we can't be sure that sure that there + will be a following split pass. */ + if (CONST_INT_P (operands[1]) + ? (reload_completed + && !aarch64_mov_imm_operand (operands[1], mode)) + : CONSTANT_P (operands[1])) { aarch64_expand_mov_immediate (operands[0], operands[1]); DONE; -- cgit v1.1 From 6f93a7c7fc62b2d6ab47e5d5eb60d41366e1ee9e Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Tue, 30 Mar 2021 12:26:33 +0000 Subject: arm: Fix emission of Tag_ABI_VFP_args with MVE and -mfloat-abi=hard (PR target/99773) When compiling with -mfloat-abi=hard -march=armv8.1-m.main+mve, we want to emit Tag_ABI_VFP_args even though we are not emitting floating-point instructions (we need "+mve.fp" for that), because we use MVE registers to pass FP arguments. This patch removes the condition on (! TARGET_SOFT_FLOAT) because this is a case where TARGET_SOFT_FLOAT is true, and TARGET_HARD_FLOAT_ABI is true too. 
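A minimal example makes the ABI point concrete. The code below is hypothetical and not taken from the patch or its testsuite: compiled with -mfloat-abi=hard -march=armv8.1-m.main+mve it contains no floating-point arithmetic instructions, yet its argument and return value still travel in the floating-point/MVE register file, so the object must carry Tag_ABI_VFP_args (shown as "VFP registers" by readelf -A).

  /* Hypothetical example.  Build with something like:
       arm-none-eabi-gcc -c -mfloat-abi=hard -march=armv8.1-m.main+mve t.c
     No FP instructions are emitted (that would require +mve.fp), but the
     hard-float variant of the AAPCS still passes X and the return value
     in d0, which is why Tag_ABI_VFP_args must be emitted even though
     TARGET_SOFT_FLOAT is true for this configuration.  */
  double
  passthrough (double x)
  {
    return x;
  }
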
2021-03-30 Richard Earnshaw gcc/ PR target/99773 * config/arm/arm.c (arm_file_start): Fix emission of Tag_ABI_VFP_args attribute. --- gcc/config/arm/arm.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index e89f5e2..518bfed 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -28150,14 +28150,11 @@ arm_file_start (void) if (print_tune_info) arm_print_tune_info (); - if (! TARGET_SOFT_FLOAT) - { - if (TARGET_HARD_FLOAT && TARGET_VFP_SINGLE) - arm_emit_eabi_attribute ("Tag_ABI_HardFP_use", 27, 1); + if (TARGET_HARD_FLOAT && TARGET_VFP_SINGLE) + arm_emit_eabi_attribute ("Tag_ABI_HardFP_use", 27, 1); - if (TARGET_HARD_FLOAT_ABI) - arm_emit_eabi_attribute ("Tag_ABI_VFP_args", 28, 1); - } + if (TARGET_HARD_FLOAT_ABI) + arm_emit_eabi_attribute ("Tag_ABI_VFP_args", 28, 1); /* Some of these attributes only apply when the corresponding features are used. However we don't have any easy way of figuring this out. -- cgit v1.1 From 5463cee277038df4688b61144db498ae7d24e631 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Tue, 23 Mar 2021 20:04:58 -0700 Subject: x86: Define __rdtsc and __rdtscp as macros Define __rdtsc and __rdtscp as macros for callers with general-regs-only target attribute to avoid inline failure with always_inline attribute. gcc/ PR target/99744 * config/i386/ia32intrin.h (__rdtsc): Defined as macro. (__rdtscp): Likewise. gcc/testsuite/ PR target/99744 * gcc.target/i386/pr99744-1.c: New test. --- gcc/config/i386/ia32intrin.h | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/ia32intrin.h b/gcc/config/i386/ia32intrin.h index d336a51..5913940 100644 --- a/gcc/config/i386/ia32intrin.h +++ b/gcc/config/i386/ia32intrin.h @@ -107,22 +107,12 @@ __rdpmc (int __S) #endif /* __iamcu__ */ /* rdtsc */ -extern __inline unsigned long long -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -__rdtsc (void) -{ - return __builtin_ia32_rdtsc (); -} +#define __rdtsc() __builtin_ia32_rdtsc () #ifndef __iamcu__ /* rdtscp */ -extern __inline unsigned long long -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -__rdtscp (unsigned int *__A) -{ - return __builtin_ia32_rdtscp (__A); -} +#define __rdtscp(a) __builtin_ia32_rdtscp (a) #endif /* __iamcu__ */ -- cgit v1.1 From f64b91568f3ac8f152c6c617b4fcc6b51da10ac4 Mon Sep 17 00:00:00 2001 From: "luoxhu@cn.ibm.com" Date: Fri, 26 Mar 2021 22:26:57 -0500 Subject: rs6000: Enable 32bit variable vec_insert [PR99718] 32bit and P7 VSX could also benefit a lot from the variable vec_insert implementation with shift/insert/shift back method. 2011-03-29 Xionghu Luo PR target/99718 * config/rs6000/altivec.md (altivec_lvsl_reg): Change to ... (altivec_lvsl_reg_): ... this. (altivec_lvsr_reg): Change to ... (altivec_lvsr_reg_): ... this. * config/rs6000/predicates.md (vec_set_index_operand): New. * config/rs6000/rs6000-c.c (altivec_resolve_overloaded_builtin): Enable 32bit variable vec_insert for all TARGET_VSX. * config/rs6000/rs6000.c (rs6000_expand_vector_set_var_p9): Enable 32bit variable vec_insert for p9 and above. (rs6000_expand_vector_set_var_p8): Rename to ... (rs6000_expand_vector_set_var_p7): ... this. (rs6000_expand_vector_set): Use TARGET_VSX and adjust assert position. * config/rs6000/vector.md (vec_set): Use vec_set_index_operand. * config/rs6000/vsx.md (xl_len_r): Use gen_altivec_lvsl_reg_di and gen_altivec_lvsr_reg_di. 
gcc/testsuite/ PR target/99718 * gcc.target/powerpc/fold-vec-insert-char-p8.c: Update instruction counts. * gcc.target/powerpc/fold-vec-insert-char-p9.c: Likewise. * gcc.target/powerpc/fold-vec-insert-double.c: Likewise. * gcc.target/powerpc/fold-vec-insert-float-p8.c: Likewise. * gcc.target/powerpc/fold-vec-insert-float-p9.c: Likewise. * gcc.target/powerpc/fold-vec-insert-int-p8.c: Likewise. * gcc.target/powerpc/fold-vec-insert-int-p9.c: Likewise. * gcc.target/powerpc/fold-vec-insert-longlong.c: Likewise. * gcc.target/powerpc/fold-vec-insert-short-p8.c: Likewise. * gcc.target/powerpc/fold-vec-insert-short-p9.c: Likewise. * gcc.target/powerpc/pr79251.p8.c: Likewise. * gcc.target/powerpc/pr79251.p9.c: Likewise. * gcc.target/powerpc/vsx-builtin-7.c: Likewise. * gcc.target/powerpc/pr79251-run.p7.c: New test. * gcc.target/powerpc/pr79251.p7.c: New test. --- gcc/config/rs6000/altivec.md | 8 ++-- gcc/config/rs6000/predicates.md | 6 +++ gcc/config/rs6000/rs6000-c.c | 2 +- gcc/config/rs6000/rs6000.c | 89 ++++++++++++++++++++++++++++++++--------- gcc/config/rs6000/vector.md | 2 +- gcc/config/rs6000/vsx.md | 4 +- 6 files changed, 85 insertions(+), 26 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index 27a269b..c2b6c79 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -2771,10 +2771,10 @@ DONE; }) -(define_insn "altivec_lvsl_reg" +(define_insn "altivec_lvsl_reg_" [(set (match_operand:V16QI 0 "altivec_register_operand" "=v") (unspec:V16QI - [(match_operand:DI 1 "gpc_reg_operand" "b")] + [(match_operand:GPR 1 "gpc_reg_operand" "b")] UNSPEC_LVSL_REG))] "TARGET_ALTIVEC" "lvsl %0,0,%1" @@ -2809,10 +2809,10 @@ DONE; }) -(define_insn "altivec_lvsr_reg" +(define_insn "altivec_lvsr_reg_" [(set (match_operand:V16QI 0 "altivec_register_operand" "=v") (unspec:V16QI - [(match_operand:DI 1 "gpc_reg_operand" "b")] + [(match_operand:GPR 1 "gpc_reg_operand" "b")] UNSPEC_LVSR_REG))] "TARGET_ALTIVEC" "lvsr %0,0,%1" diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md index 859af75..e21bc74 100644 --- a/gcc/config/rs6000/predicates.md +++ b/gcc/config/rs6000/predicates.md @@ -1940,3 +1940,9 @@ return !indexed_address (addr, mode); }) + +;; Return 1 if this operand is valid as the index for vec_set. 
+(define_predicate "vec_set_index_operand" + (if_then_else (match_test "TARGET_VSX") + (match_operand 0 "reg_or_cint_operand") + (match_operand 0 "const_int_operand"))) diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c index 06b3bc0..0f8a629 100644 --- a/gcc/config/rs6000/rs6000-c.c +++ b/gcc/config/rs6000/rs6000-c.c @@ -1602,7 +1602,7 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl, stmt = build1 (COMPOUND_LITERAL_EXPR, arg1_type, stmt); } - if (TARGET_P8_VECTOR && TARGET_DIRECT_MOVE_64BIT) + if (TARGET_VSX) { stmt = build_array_ref (loc, stmt, arg2); stmt = fold_build2 (MODIFY_EXPR, TREE_TYPE (arg0), stmt, diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 6a8943d..50c768d 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -7035,20 +7035,40 @@ rs6000_expand_vector_set_var_p9 (rtx target, rtx val, rtx idx) int shift = exact_log2 (width); machine_mode idx_mode = GET_MODE (idx); - idx = convert_modes (DImode, idx_mode, idx, 1); + machine_mode shift_mode; + rtx (*gen_ashl)(rtx, rtx, rtx); + rtx (*gen_lvsl)(rtx, rtx); + rtx (*gen_lvsr)(rtx, rtx); + + if (TARGET_POWERPC64) + { + shift_mode = DImode; + gen_ashl = gen_ashldi3; + gen_lvsl = gen_altivec_lvsl_reg_di; + gen_lvsr = gen_altivec_lvsr_reg_di; + } + else + { + shift_mode = SImode; + gen_ashl = gen_ashlsi3; + gen_lvsl = gen_altivec_lvsl_reg_si; + gen_lvsr = gen_altivec_lvsr_reg_si; + } /* Generate the IDX for permute shift, width is the vector element size. idx = idx * width. */ - rtx tmp = gen_reg_rtx (DImode); - emit_insn (gen_ashldi3 (tmp, idx, GEN_INT (shift))); + rtx tmp = gen_reg_rtx (shift_mode); + idx = convert_modes (shift_mode, idx_mode, idx, 1); + + emit_insn (gen_ashl (tmp, idx, GEN_INT (shift))); /* lvsr v1,0,idx. */ rtx pcvr = gen_reg_rtx (V16QImode); - emit_insn (gen_altivec_lvsr_reg (pcvr, tmp)); + emit_insn (gen_lvsr (pcvr, tmp)); /* lvsl v2,0,idx. */ rtx pcvl = gen_reg_rtx (V16QImode); - emit_insn (gen_altivec_lvsl_reg (pcvl, tmp)); + emit_insn (gen_lvsl (pcvl, tmp)); rtx sub_target = simplify_gen_subreg (V16QImode, target, mode, 0); @@ -7064,10 +7084,10 @@ rs6000_expand_vector_set_var_p9 (rtx target, rtx val, rtx idx) } /* Insert VAL into IDX of TARGET, VAL size is same of the vector element, IDX - is variable and also counts by vector element size for p8. */ + is variable and also counts by vector element size for p7 & p8. */ static void -rs6000_expand_vector_set_var_p8 (rtx target, rtx val, rtx idx) +rs6000_expand_vector_set_var_p7 (rtx target, rtx val, rtx idx) { machine_mode mode = GET_MODE (target); @@ -7082,17 +7102,41 @@ rs6000_expand_vector_set_var_p8 (rtx target, rtx val, rtx idx) int shift = exact_log2 (width); machine_mode idx_mode = GET_MODE (idx); - idx = convert_modes (DImode, idx_mode, idx, 1); + + machine_mode shift_mode; + rtx (*gen_ashl)(rtx, rtx, rtx); + rtx (*gen_add)(rtx, rtx, rtx); + rtx (*gen_sub)(rtx, rtx, rtx); + rtx (*gen_lvsl)(rtx, rtx); + + if (TARGET_POWERPC64) + { + shift_mode = DImode; + gen_ashl = gen_ashldi3; + gen_add = gen_adddi3; + gen_sub = gen_subdi3; + gen_lvsl = gen_altivec_lvsl_reg_di; + } + else + { + shift_mode = SImode; + gen_ashl = gen_ashlsi3; + gen_add = gen_addsi3; + gen_sub = gen_subsi3; + gen_lvsl = gen_altivec_lvsl_reg_si; + } /* idx = idx * width. 
*/ - rtx tmp = gen_reg_rtx (DImode); - emit_insn (gen_ashldi3 (tmp, idx, GEN_INT (shift))); + rtx tmp = gen_reg_rtx (shift_mode); + idx = convert_modes (shift_mode, idx_mode, idx, 1); + + emit_insn (gen_ashl (tmp, idx, GEN_INT (shift))); /* For LE: idx = idx + 8. */ if (!BYTES_BIG_ENDIAN) - emit_insn (gen_adddi3 (tmp, tmp, GEN_INT (8))); + emit_insn (gen_add (tmp, tmp, GEN_INT (8))); else - emit_insn (gen_subdi3 (tmp, GEN_INT (24 - width), tmp)); + emit_insn (gen_sub (tmp, GEN_INT (24 - width), tmp)); /* lxv vs33, mask. DImode: 0xffffffffffffffff0000000000000000 @@ -7119,7 +7163,16 @@ rs6000_expand_vector_set_var_p8 (rtx target, rtx val, rtx idx) /* mtvsrd[wz] f0,tmp_val. */ rtx tmp_val = gen_reg_rtx (SImode); if (inner_mode == E_SFmode) - emit_insn (gen_movsi_from_sf (tmp_val, val)); + if (TARGET_DIRECT_MOVE_64BIT) + emit_insn (gen_movsi_from_sf (tmp_val, val)); + else + { + rtx stack = rs6000_allocate_stack_temp (SFmode, false, true); + emit_insn (gen_movsf_hardfloat (stack, val)); + rtx stack2 = copy_rtx (stack); + PUT_MODE (stack2, SImode); + emit_move_insn (tmp_val, stack2); + } else tmp_val = force_reg (SImode, val); @@ -7143,7 +7196,7 @@ rs6000_expand_vector_set_var_p8 (rtx target, rtx val, rtx idx) /* lvsl 13,0,idx. */ rtx pcv = gen_reg_rtx (V16QImode); - emit_insn (gen_altivec_lvsl_reg (pcv, tmp)); + emit_insn (gen_lvsl (pcv, tmp)); /* vperm 1,1,1,13. */ /* vperm 0,0,0,13. */ @@ -7184,11 +7237,13 @@ rs6000_expand_vector_set (rtx target, rtx val, rtx elt_rtx) rs6000_expand_vector_set_var_p9 (target, val, elt_rtx); return; } - else if (TARGET_P8_VECTOR && TARGET_DIRECT_MOVE_64BIT) + else if (TARGET_VSX) { - rs6000_expand_vector_set_var_p8 (target, val, elt_rtx); + rs6000_expand_vector_set_var_p7 (target, val, elt_rtx); return; } + else + gcc_assert (CONST_INT_P (elt_rtx)); } rtx insn = NULL_RTX; @@ -7218,8 +7273,6 @@ rs6000_expand_vector_set (rtx target, rtx val, rtx elt_rtx) } } - gcc_assert (CONST_INT_P (elt_rtx)); - /* Simplify setting single element vectors like V1TImode. 
*/ if (GET_MODE_SIZE (mode) == GET_MODE_SIZE (inner_mode) && INTVAL (elt_rtx) == 0) diff --git a/gcc/config/rs6000/vector.md b/gcc/config/rs6000/vector.md index e5191bd..3446b03 100644 --- a/gcc/config/rs6000/vector.md +++ b/gcc/config/rs6000/vector.md @@ -1227,7 +1227,7 @@ (define_expand "vec_set" [(match_operand:VEC_E 0 "vlogical_operand") (match_operand: 1 "register_operand") - (match_operand 2 "reg_or_cint_operand")] + (match_operand 2 "vec_set_index_operand")] "VECTOR_MEM_ALTIVEC_OR_VSX_P (mode)" { rs6000_expand_vector_set (operands[0], operands[1], operands[2]); diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md index 4404407..bcb92be 100644 --- a/gcc/config/rs6000/vsx.md +++ b/gcc/config/rs6000/vsx.md @@ -5423,7 +5423,7 @@ rtx rtx_vtmp = gen_reg_rtx (V16QImode); rtx tmp = gen_reg_rtx (DImode); - emit_insn (gen_altivec_lvsl_reg (shift_mask, operands[2])); + emit_insn (gen_altivec_lvsl_reg_di (shift_mask, operands[2])); emit_insn (gen_ashldi3 (tmp, operands[2], GEN_INT (56))); emit_insn (gen_lxvll (rtx_vtmp, operands[1], tmp)); emit_insn (gen_altivec_vperm_v8hiv16qi (operands[0], rtx_vtmp, rtx_vtmp, @@ -5507,7 +5507,7 @@ rtx rtx_vtmp = gen_reg_rtx (V16QImode); rtx tmp = gen_reg_rtx (DImode); - emit_insn (gen_altivec_lvsr_reg (shift_mask, operands[2])); + emit_insn (gen_altivec_lvsr_reg_di (shift_mask, operands[2])); emit_insn (gen_altivec_vperm_v8hiv16qi (rtx_vtmp, operands[0], operands[0], shift_mask)); emit_insn (gen_ashldi3 (tmp, operands[2], GEN_INT (56))); -- cgit v1.1 From 19199a6f2b0f4ce4b100856c78706d56a16b1956 Mon Sep 17 00:00:00 2001 From: Kyrylo Tkachov Date: Tue, 30 Mar 2021 15:43:36 +0100 Subject: aarch64: PR target/99822 Don't allow zero register in first operand of SUBS/ADDS-immediate In this PR we end up generating an invalid instruction: adds x1,xzr,#2 because the pattern accepts zero as an operand in the comparison, but the instruction doesn't. Fix it by adjusting the predicate and constraints. gcc/ChangeLog: PR target/99822 * config/aarch64/aarch64.md (sub3_compare1_imm): Do not allow zero in operand 1. gcc/testsuite/ChangeLog: PR target/99822 * gcc.c-torture/compile/pr99822.c: New test. --- gcc/config/aarch64/aarch64.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index a398c3d..b139c08 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -3034,7 +3034,7 @@ (define_insn "sub3_compare1_imm" [(set (reg:CC CC_REGNUM) (compare:CC - (match_operand:GPI 1 "aarch64_reg_or_zero" "rkZ,rkZ") + (match_operand:GPI 1 "register_operand" "rk,rk") (match_operand:GPI 2 "aarch64_plus_immediate" "I,J"))) (set (match_operand:GPI 0 "register_operand" "=r,r") (plus:GPI -- cgit v1.1 From c277abd9cd3d10db59f9965d7d6356868da42a9f Mon Sep 17 00:00:00 2001 From: Kyrylo Tkachov Date: Tue, 30 Mar 2021 16:42:17 +0100 Subject: aarch64: PR target/99820: Guard on available SVE issue info before using This fixes a simple segfault ICE when using the use_new_vector_costs tunable with a CPU tuning that it wasn't intended for. I'm not adding a testcase here as we intend to remove the tunable for GCC 12 anyway (the new costing logic will remain and will benefit from this extra check, but the -moverride option will no longer exist). gcc/ChangeLog: PR target/99820 * config/aarch64/aarch64.c (aarch64_analyze_loop_vinfo): Check for available issue_info before using it. 
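For illustration only (this is a sketch, not the testcase from the PR, and the exact option spelling is an assumption), the ICE could be provoked by any SVE-vectorizable loop compiled with the tunable forced on, e.g. -O3 -march=armv8.2-a+sve -moverride=tune=use_new_vector_costs, against a CPU tuning whose cost tables provide no SVE issue information:

void
f (int *restrict a, int *restrict b, int n)
{
  /* Before the fix, analysing this loop dereferenced the null issue_info
     while checking for SVE WHILE instructions.  */
  for (int i = 0; i < n; i++)
    a[i] += b[i];
}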
--- gcc/config/aarch64/aarch64.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index a573850..5eda9e8 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -14460,7 +14460,9 @@ aarch64_analyze_loop_vinfo (loop_vec_info loop_vinfo, /* Record the issue information for any SVE WHILE instructions that the loop needs. */ auto *issue_info = aarch64_tune_params.vec_costs->issue_info; - if (issue_info->sve && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()) + if (issue_info + && issue_info->sve + && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()) { unsigned int num_masks = 0; rgroup_controls *rgm; -- cgit v1.1 From a49a96f681bf13c6e77644d4507e867f00f93fe6 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Wed, 31 Mar 2021 09:11:29 +0200 Subject: i386, debug: Default to -gdwarf-4 on Windows targets with broken ld.bfd [PR98860] As mentioned in the PR, before the https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=ba6eb62ff0ea9843a018cfd7cd06777bd66ae0a0 fix from March 1st, PECOFF ld.bfd didn't know about .debug_loclists, .debug_rnglists and other debug sections new in DWARF 5. Unfortunately, unlike for ELF linkers, that means the sections were placed in wrong ordering with wrong VMA/LMA, so the resulting executables are apparently unusable. As that is pretty new change, newer than 2.35.2 or 2.36 binutils releases, the following patch adds a workaround that turns -gdwarf-4 by default instead of -gdwarf-5 if a broken linker is found at configure time. Users can still explicitly play with -gdwarf-5 and either use a non-broken linker or use custom linker scripts for the broken one, but at least by default it should work. 2021-03-31 Jakub Jelinek PR bootstrap/98860 * configure.ac (HAVE_LD_BROKEN_PE_DWARF5): New AC_DEFINE if PECOFF linker doesn't support DWARF sections new in DWARF5. * config/i386/i386-options.c (ix86_option_override_internal): Default to dwarf_version 4 if HAVE_LD_BROKEN_PE_DWARF5 for TARGET_PECOFF targets. * config.in: Regenerated. * configure: Regenerated. --- gcc/config/i386/i386-options.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c index 88d5e71..93cd6e8 100644 --- a/gcc/config/i386/i386-options.c +++ b/gcc/config/i386/i386-options.c @@ -1861,6 +1861,13 @@ ix86_option_override_internal (bool main_args_p, SUBSUBTARGET_OVERRIDE_OPTIONS; #endif +#ifdef HAVE_LD_BROKEN_PE_DWARF5 + /* If the PE linker has broken DWARF 5 support, make + DWARF 4 the default. */ + if (TARGET_PECOFF) + SET_OPTION_IF_UNSET (opts, opts_set, dwarf_version, 4); +#endif + /* -fPIC is the default for x86_64. */ if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags)) opts->x_flag_pic = 2; -- cgit v1.1 From c001c194a2f73fb32461b597e91a35f9bbcf4414 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Wed, 31 Mar 2021 10:46:01 +0200 Subject: aarch64: Fix up *add3_poly_1 [PR99813] As mentioned in the PR, Uai constraint stands for aarch64_sve_scalar_inc_dec_immediate while Uav for aarch64_sve_addvl_addpl_immediate. Both *add3_aarch64 and *add3_poly_1 patterns use * return aarch64_output_sve_scalar_inc_dec (operands[2]); * return aarch64_output_sve_addvl_addpl (operands[2]); in that order, but the former with Uai,Uav order, while the latter with Uav,Uai instead. This patch swaps the constraints so that they match the output. 
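The underlying rule can be seen in a deliberately simplified toy pattern (not taken from the tree; the mnemonics and the "I" constraint are illustrative assumptions): column N of every operand's constraint string must describe operands that line N of the "@" output template can actually encode.

(define_insn "*toy_add"
  [(set (match_operand:SI 0 "register_operand" "=r,r")
        (plus:SI (match_operand:SI 1 "register_operand" "r,r")
                 (match_operand:SI 2 "nonmemory_operand" "r,I")))]
  ""
  ;; Alternative 0 pairs the "r" constraint with the register form and
  ;; alternative 1 pairs "I" with the immediate form; listing the
  ;; constraints in the opposite order from the template lines lets the
  ;; register allocator pick an alternative whose output cannot encode
  ;; the operands it chose, which is the mismatch fixed here.
  "@
   add\\t%0, %1, %2
   addi\\t%0, %1, %2")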
Co-authored-by: Richard Sandiford 2021-03-31 Jakub Jelinek Richard Sandiford PR target/99813 * config/aarch64/aarch64.md (*add3_poly_1): Swap Uai and Uav constraints on operands[2] and similarly 0 and rk constraints on operands[1] corresponding to that. * g++.target/aarch64/sve/pr99813.C: New test. --- gcc/config/aarch64/aarch64.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index b139c08..15bbc10 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -2059,8 +2059,8 @@ [(set (match_operand:GPI 0 "register_operand" "=r,r,r,r,r,r,&r") (plus:GPI - (match_operand:GPI 1 "register_operand" "%rk,rk,rk,rk,rk,0,rk") - (match_operand:GPI 2 "aarch64_pluslong_or_poly_operand" "I,r,J,Uaa,Uav,Uai,Uat")))] + (match_operand:GPI 1 "register_operand" "%rk,rk,rk,rk,0,rk,rk") + (match_operand:GPI 2 "aarch64_pluslong_or_poly_operand" "I,r,J,Uaa,Uai,Uav,Uat")))] "TARGET_SVE && operands[0] != stack_pointer_rtx" "@ add\\t%0, %1, %2 -- cgit v1.1 From 1393938e4c7dab9306cdce5a73d93b242fc246ec Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Wed, 31 Mar 2021 11:26:06 +0100 Subject: aarch64: Fix target alignment for SVE [PR98119] The vectoriser supports peeling for alignment using predication: we move back to the previous aligned boundary and make the skipped elements inactive in the first loop iteration. As it happens, the costs for existing CPUs give an equal cost to aligned and unaligned accesses, so this feature is rarely used. However, the PR shows that when the feature was forced on, we were still trying to align to a full-vector boundary even when using partial vectors. gcc/ PR target/98119 * config/aarch64/aarch64.c (aarch64_vectorize_preferred_vector_alignment): Query the size of the provided SVE vector; do not assume that all SVE vectors have the same size. gcc/testsuite/ PR target/98119 * gcc.target/aarch64/sve/pr98119.c: New test. --- gcc/config/aarch64/aarch64.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 5eda9e8..f878721 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -20275,10 +20275,11 @@ aarch64_vectorize_preferred_vector_alignment (const_tree type) { if (aarch64_sve_data_mode_p (TYPE_MODE (type))) { - /* If the length of the vector is fixed, try to align to that length, - otherwise don't try to align at all. */ + /* If the length of the vector is a fixed power of 2, try to align + to that length, otherwise don't try to align at all. */ HOST_WIDE_INT result; - if (!BITS_PER_SVE_VECTOR.is_constant (&result)) + if (!GET_MODE_BITSIZE (TYPE_MODE (type)).is_constant (&result) + || !pow2p_hwi (result)) result = TYPE_ALIGN (TREE_TYPE (type)); return result; } -- cgit v1.1 From bf24f4ec73b65454ea0edcd6ab5616f04958d41e Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Thu, 21 Jan 2021 18:51:35 -0800 Subject: x86: Update memcpy/memset inline strategies for Ice Lake Simplify memcpy and memset inline strategies to avoid branches for -mtune=icelake: 1. With MOVE_RATIO and CLEAR_RATIO == 17, GCC will use integer/vector load and store for up to 16 * 16 (256) bytes when the data size is fixed and known. 2. Inline only if data size is known to be <= 256. a. Use "rep movsb/stosb" with simple code sequence if the data size is a constant. b. Use loop if data size is not a constant. 3.
Use memcpy/memset library function if data size is unknown or > 256. On Ice Lake processor with -march=native -Ofast -flto, 1. Performance impacts of SPEC CPU 2017 rate are: 500.perlbench_r -0.93% 502.gcc_r 0.36% 505.mcf_r 0.31% 520.omnetpp_r -0.07% 523.xalancbmk_r -0.53% 525.x264_r -0.09% 531.deepsjeng_r -0.19% 541.leela_r 0.16% 548.exchange2_r 0.22% 557.xz_r -1.64% Geomean -0.24% 503.bwaves_r -0.01% 507.cactuBSSN_r 0.00% 508.namd_r 0.12% 510.parest_r 0.07% 511.povray_r 0.29% 519.lbm_r 0.00% 521.wrf_r -0.38% 526.blender_r 0.16% 527.cam4_r 0.18% 538.imagick_r 0.76% 544.nab_r -0.84% 549.fotonik3d_r -0.07% 554.roms_r -0.01% Geomean 0.02% 2. Significant impacts on eembc benchmarks are: eembc/nnet_test 9.90% eembc/mp2decoddata2 16.42% eembc/textv2data3 -4.86% eembc/qos 12.90% gcc/ * config/i386/i386-expand.c (expand_set_or_cpymem_via_rep): For TARGET_PREFER_KNOWN_REP_MOVSB_STOSB, don't convert QImode to SImode. (decide_alg): For TARGET_PREFER_KNOWN_REP_MOVSB_STOSB, use "rep movsb/stosb" only for known sizes. * config/i386/i386-options.c (processor_cost_table): Use Ice Lake cost for Cannon Lake, Ice Lake, Tiger Lake, Sapphire Rapids and Alder Lake. * config/i386/i386.h (TARGET_PREFER_KNOWN_REP_MOVSB_STOSB): New. * config/i386/x86-tune-costs.h (icelake_memcpy): New. (icelake_memset): Likewise. (icelake_cost): Likewise. * config/i386/x86-tune.def (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB): New. gcc/testsuite/ * gcc.target/i386/memcpy-strategy-5.c: New test. * gcc.target/i386/memcpy-strategy-6.c: Likewise. * gcc.target/i386/memcpy-strategy-7.c: Likewise. * gcc.target/i386/memcpy-strategy-8.c: Likewise. * gcc.target/i386/memset-strategy-3.c: New test. * gcc.target/i386/memset-strategy-4.c: Likewise. * gcc.target/i386/memset-strategy-5.c: Likewise. * gcc.target/i386/memset-strategy-6.c: Likewise. --- gcc/config/i386/i386-expand.c | 8 ++- gcc/config/i386/i386-options.c | 12 ++-- gcc/config/i386/i386.h | 2 + gcc/config/i386/x86-tune-costs.h | 127 +++++++++++++++++++++++++++++++++++++++ gcc/config/i386/x86-tune.def | 7 +++ 5 files changed, 149 insertions(+), 7 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index ac69eed..dda08ff 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -5976,6 +5976,7 @@ expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem, /* If possible, it is shorter to use rep movs. TODO: Maybe it is better to move this logic to decide_alg. */ if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3) + && !TARGET_PREFER_KNOWN_REP_MOVSB_STOSB && (!issetmem || orig_value == const0_rtx)) mode = SImode; @@ -6984,7 +6985,12 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, else if (!any_alg_usable_p) break; } - else if (alg_usable_p (candidate, memset, have_as)) + else if (alg_usable_p (candidate, memset, have_as) + && !(TARGET_PREFER_KNOWN_REP_MOVSB_STOSB + && candidate == rep_prefix_1_byte + /* NB: If min_size != max_size, size is + unknown.
*/ + && min_size != max_size)) { *noalign = algs->size[i].noalign; return candidate; diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c index 93cd6e8..a8d0673 100644 --- a/gcc/config/i386/i386-options.c +++ b/gcc/config/i386/i386-options.c @@ -721,14 +721,14 @@ static const struct processor_costs *processor_cost_table[] = &slm_cost, &skylake_cost, &skylake_cost, + &icelake_cost, + &icelake_cost, + &icelake_cost, &skylake_cost, + &icelake_cost, &skylake_cost, - &skylake_cost, - &skylake_cost, - &skylake_cost, - &skylake_cost, - &skylake_cost, - &skylake_cost, + &icelake_cost, + &icelake_cost, &intel_cost, &geode_cost, &k6_cost, diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 058c1cc..b4001d2 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -523,6 +523,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; #define TARGET_PROMOTE_QImode ix86_tune_features[X86_TUNE_PROMOTE_QIMODE] #define TARGET_FAST_PREFIX ix86_tune_features[X86_TUNE_FAST_PREFIX] #define TARGET_SINGLE_STRINGOP ix86_tune_features[X86_TUNE_SINGLE_STRINGOP] +#define TARGET_PREFER_KNOWN_REP_MOVSB_STOSB \ + ix86_tune_features[X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB] #define TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES \ ix86_tune_features[X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES] #define TARGET_QIMODE_MATH ix86_tune_features[X86_TUNE_QIMODE_MATH] diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h index 58b3b81..0e00ff9 100644 --- a/gcc/config/i386/x86-tune-costs.h +++ b/gcc/config/i386/x86-tune-costs.h @@ -1936,6 +1936,133 @@ struct processor_costs skylake_cost = { "0:0:8", /* Label alignment. */ "16", /* Func alignment. */ }; + +/* icelake_cost should produce code tuned for Icelake family of CPUs. + NB: rep_prefix_1_byte is used only for known size. */ + +static stringop_algs icelake_memcpy[2] = { + {libcall, + {{256, rep_prefix_1_byte, true}, + {256, loop, false}, + {-1, libcall, false}}}, + {libcall, + {{256, rep_prefix_1_byte, true}, + {256, loop, false}, + {-1, libcall, false}}}}; + +static stringop_algs icelake_memset[2] = { + {libcall, + {{256, rep_prefix_1_byte, true}, + {256, loop, false}, + {-1, libcall, false}}}, + {libcall, + {{256, rep_prefix_1_byte, true}, + {256, loop, false}, + {-1, libcall, false}}}}; + +static const +struct processor_costs icelake_cost = { + { + /* Start of register allocator costs. integer->integer move cost is 2. */ + 6, /* cost for loading QImode using movzbl */ + {4, 4, 4}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {6, 6, 6}, /* cost of storing integer registers */ + 2, /* cost of reg,reg fld/fst */ + {6, 6, 8}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {6, 6, 10}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {6, 6}, /* cost of loading MMX registers + in SImode and DImode */ + {6, 6}, /* cost of storing MMX registers + in SImode and DImode */ + 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ + {6, 6, 6, 10, 20}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {8, 8, 8, 12, 24}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + 6, 6, /* SSE->integer and integer->SSE moves */ + 5, 5, /* mask->integer and integer->mask moves */ + {8, 8, 8}, /* cost of loading mask register + in QImode, HImode, SImode. */ + {6, 6, 6}, /* cost if storing mask register + in QImode, HImode, SImode. 
*/ + 3, /* cost of moving mask register. */ + /* End of register allocator costs. */ + }, + + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (1)+1, /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ + COSTS_N_INSNS (4), /* HI */ + COSTS_N_INSNS (3), /* SI */ + COSTS_N_INSNS (3), /* DI */ + COSTS_N_INSNS (3)}, /* other */ + 0, /* cost of multiply per each bit set */ + /* Expanding div/mod currently doesn't consider parallelism. So the cost + model is not realistic. We compensate by increasing the latencies a bit. */ + {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (11), /* HI */ + COSTS_N_INSNS (14), /* SI */ + COSTS_N_INSNS (76), /* DI */ + COSTS_N_INSNS (76)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (0), /* cost of movzx */ + 8, /* "large" insn */ + 17, /* MOVE_RATIO */ + 17, /* CLEAR_RATIO */ + {4, 4, 4}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {6, 6, 6}, /* cost of storing integer registers */ + {6, 6, 6, 10, 20}, /* cost of loading SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ + {8, 8, 8, 12, 24}, /* cost of storing SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ + {6, 6, 6, 10, 20}, /* cost of unaligned loads. */ + {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ + 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ + 6, /* cost of moving SSE register to integer. */ + 20, 8, /* Gather load static, per_elt. */ + 22, 10, /* Gather store static, per_elt. */ + 64, /* size of l1 cache. */ + 512, /* size of l2 cache. */ + 64, /* size of prefetch block */ + 6, /* number of parallel prefetches */ + 3, /* Branch cost */ + COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (4), /* cost of FMUL instruction. */ + COSTS_N_INSNS (20), /* cost of FDIV instruction. */ + COSTS_N_INSNS (1), /* cost of FABS instruction. */ + COSTS_N_INSNS (1), /* cost of FCHS instruction. */ + COSTS_N_INSNS (20), /* cost of FSQRT instruction. */ + + COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ + COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ + COSTS_N_INSNS (4), /* cost of MULSS instruction. */ + COSTS_N_INSNS (4), /* cost of MULSD instruction. */ + COSTS_N_INSNS (4), /* cost of FMA SS instruction. */ + COSTS_N_INSNS (4), /* cost of FMA SD instruction. */ + COSTS_N_INSNS (11), /* cost of DIVSS instruction. */ + COSTS_N_INSNS (14), /* cost of DIVSD instruction. */ + COSTS_N_INSNS (12), /* cost of SQRTSS instruction. */ + COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */ + 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ + icelake_memcpy, + icelake_memset, + COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ + COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ + "16:11:8", /* Loop alignment. */ + "16:11:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ +}; + /* BTVER1 has optimized REP instruction for medium sized blocks, but for very small blocks it is better to use loop. For large blocks, libcall can do nontemporary accesses and beat inline considerably. 
*/ diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index caebf76..134916c 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -269,6 +269,13 @@ DEF_TUNE (X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE, "avoid_mem_opnd_for_cmove", as MOVS and STOS (without a REP prefix) to move/set sequences of bytes. */ DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA) +/* X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB: Enable use of REP MOVSB/STOSB to + move/set sequences of bytes with known size. */ +DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB, + "prefer_known_rep_movsb_stosb", + m_CANNONLAKE | m_ICELAKE_CLIENT | m_ICELAKE_SERVER | m_TIGERLAKE + | m_ALDERLAKE | m_SAPPHIRERAPIDS) + /* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of compact prologues and epilogues by issuing a misaligned moves. This requires target to handle misaligned moves and partial memory stalls -- cgit v1.1 From 7c1d6e89994109e1b6efb5f13890be5586edeb75 Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Mon, 29 Mar 2021 12:41:08 +0000 Subject: arm: Fix mult autovectorization patterm for iwmmxt (PR target/99786) Similarly to other recently-added autovectorization patterns, mult has been erroneously enabled for iwmmxt. However, V4HI and V2SI modes are supported, so we make an exception for them. The new testcase is derived from gcc.dg/ubsan/pr79904.c, with additional modes added. I kept dg-do compile because 'assemble' results in error messages from the assembler, which are not related to this PR: Error: selected processor does not support `tmcrr wr0,r4,r5' in ARM mode Error: selected processor does not support `wstrd wr0,[r0]' in ARM mode Error: selected processor does not support `wldrd wr0,[r0]' in ARM mode Error: selected processor does not support `wldrd wr2,.L5' in ARM mode Error: selected processor does not support `wmulul wr0,wr0,wr2' in ARM mode Error: selected processor does not support `wstrd wr0,[r0]' in ARM mode Error: selected processor does not support `wldrd wr0,[r0]' in ARM mode Error: selected processor does not support `wldrd wr2,.L8' in ARM mode Error: selected processor does not support `wmulwl wr0,wr0,wr2' in ARM mode Error: selected processor does not support `wstrd wr0,[r0]' in ARM mode 2021-03-29 Christophe Lyon PR target/99786 gcc/ * config/arm/vec-common.md (mul3): Disable on iwMMXT, expect for V4HI and V2SI. gcc/testsuite/ * gcc.target/arm/pr99786.c: New test. --- gcc/config/arm/vec-common.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md index 48ee659..0b2b3b1 100644 --- a/gcc/config/arm/vec-common.md +++ b/gcc/config/arm/vec-common.md @@ -103,7 +103,10 @@ [(set (match_operand:VDQWH 0 "s_register_operand") (mult:VDQWH (match_operand:VDQWH 1 "s_register_operand") (match_operand:VDQWH 2 "s_register_operand")))] - "ARM_HAVE__ARITH" + "ARM_HAVE__ARITH + && (!TARGET_REALLY_IWMMXT + || mode == V4HImode + || mode == V2SImode)" ) (define_expand "smin3" -- cgit v1.1 From 1b5f74e8be4dd7abe5624ff60adceff19ca71bda Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Wed, 31 Mar 2021 19:34:00 +0100 Subject: Handle CONST_POLY_INTs in CONST_VECTORs [PR97141, PR98726] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR is caused by POLY_INT_CSTs being (necessarily) valid in tree-level VECTOR_CSTs but CONST_POLY_INTs not being valid in RTL CONST_VECTORs. 
I can't tell/remember how deliberate that was, but I'm guessing not very. In particular, valid_for_const_vector_p was added to guard against symbolic constants rather than CONST_POLY_INTs. I did briefly consider whether we should maintain the current status anyway. However, that would then require a way of constructing variable-length vectors from individual elements if, say, we have: { [2, 2], [3, 2], [4, 2], … } So I'm chalking this up to an oversight. I think the intention (and certainly the natural thing) is to have the same rules for both trees and RTL. The SVE CONST_VECTOR code should already be set up to handle CONST_POLY_INTs. However, we need to add support for Advanced SIMD CONST_VECTORs that happen to contain SVE-based values. The patch does that by expanding such CONST_VECTORs in the same way as variable vectors. gcc/ PR rtl-optimization/97141 PR rtl-optimization/98726 * emit-rtl.c (valid_for_const_vector_p): Return true for CONST_POLY_INT_P. * rtx-vector-builder.h (rtx_vector_builder::step): Return a poly_wide_int instead of a wide_int. (rtx_vector_builder::apply_set): Take a poly_wide_int instead of a wide_int. * rtx-vector-builder.c (rtx_vector_builder::apply_set): Likewise. * config/aarch64/aarch64.c (aarch64_legitimate_constant_p): Return false for CONST_VECTORs that cannot be forced to memory. * config/aarch64/aarch64-simd.md (mov): If a CONST_VECTOR is too complex to force to memory, build it up from individual elements instead. gcc/testsuite/ PR rtl-optimization/97141 PR rtl-optimization/98726 * gcc.c-torture/compile/pr97141.c: New test. * gcc.c-torture/compile/pr98726.c: Likewise. * gcc.target/aarch64/sve/pr97141.c: Likewise. * gcc.target/aarch64/sve/pr98726.c: Likewise. --- gcc/config/aarch64/aarch64-simd.md | 11 +++++++++++ gcc/config/aarch64/aarch64.c | 24 ++++++++++++++---------- 2 files changed, 25 insertions(+), 10 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index d86e8e72..4edee99 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -35,6 +35,17 @@ && aarch64_mem_pair_operand (operands[0], DImode)) || known_eq (GET_MODE_SIZE (mode), 8)))) operands[1] = force_reg (mode, operands[1]); + + /* If a constant is too complex to force to memory (e.g. because it + contains CONST_POLY_INTs), build it up from individual elements instead. + We should only need to do this before RA; aarch64_legitimate_constant_p + should ensure that we don't try to rematerialize the constant later. */ + if (GET_CODE (operands[1]) == CONST_VECTOR + && targetm.cannot_force_const_mem (mode, operands[1])) + { + aarch64_expand_vector_init (operands[0], operands[1]); + DONE; + } " ) diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index f878721..994fafc 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -17925,10 +17925,22 @@ aarch64_legitimate_constant_p (machine_mode mode, rtx x) { /* Support CSE and rematerialization of common constants. */ if (CONST_INT_P (x) - || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT) - || GET_CODE (x) == CONST_VECTOR) + || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)) return true; + /* Only accept variable-length vector constants if they can be + handled directly. + + ??? It would be possible (but complex) to handle rematerialization + of other constants via secondary reloads.
*/ + if (!GET_MODE_SIZE (mode).is_constant ()) + return aarch64_simd_valid_immediate (x, NULL); + + /* Otherwise, accept any CONST_VECTOR that, if all else fails, can at + least be forced to memory and loaded from there. */ + if (GET_CODE (x) == CONST_VECTOR) + return !targetm.cannot_force_const_mem (mode, x); + /* Do not allow vector struct mode constants for Advanced SIMD. We could support 0 and -1 easily, but they need support in aarch64-simd.md. */ @@ -17936,14 +17948,6 @@ aarch64_legitimate_constant_p (machine_mode mode, rtx x) if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT)) return false; - /* Only accept variable-length vector constants if they can be - handled directly. - - ??? It would be possible to handle rematerialization of other - constants via secondary reloads. */ - if (vec_flags & VEC_ANY_SVE) - return aarch64_simd_valid_immediate (x, NULL); - if (GET_CODE (x) == HIGH) x = XEXP (x, 0); -- cgit v1.1 From ea9a39e63eba1ba72aa3608317d1c40ae6bcef55 Mon Sep 17 00:00:00 2001 From: Pat Haugen Date: Wed, 31 Mar 2021 14:37:24 -0500 Subject: Update prefixed attribute for Power10. This patch creates a new attribute, "maybe_prefixed", which is used to mark those instructions that may have a prefixed form. The existing "prefixed" attribute is now used to mark all instructions that are prefixed form. 2021-03-31 Pat Haugen gcc/ PR target/99133 * config/rs6000/altivec.md (xxspltiw_v4si, xxspltiw_v4sf_inst, xxspltidp_v2df_inst, xxsplti32dx_v4si_inst, xxsplti32dx_v4sf_inst, xxblend_, xxpermx_inst, xxeval): Mark prefixed. * config/rs6000/mma.md (mma_, mma_, mma_, mma_, mma_, mma_, mma_, mma_, mma_, mma_): Likewise. * config/rs6000/rs6000.c (rs6000_final_prescan_insn): Adjust test. * config/rs6000/rs6000.md (define_attr "maybe_prefixed"): New. (define_attr "prefixed"): Update initializer. 
--- gcc/config/rs6000/altivec.md | 24 ++++++++++++++++-------- gcc/config/rs6000/mma.md | 30 ++++++++++++++++++++---------- gcc/config/rs6000/rs6000.c | 4 +++- gcc/config/rs6000/rs6000.md | 19 +++++++++++++------ 4 files changed, 52 insertions(+), 25 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index c2b6c79..1351daf 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -826,7 +826,8 @@ UNSPEC_XXSPLTIW))] "TARGET_POWER10" "xxspltiw %x0,%1" - [(set_attr "type" "vecsimple")]) + [(set_attr "type" "vecsimple") + (set_attr "prefixed" "yes")]) (define_expand "xxspltiw_v4sf" [(set (match_operand:V4SF 0 "register_operand" "=wa") @@ -845,7 +846,8 @@ UNSPEC_XXSPLTIW))] "TARGET_POWER10" "xxspltiw %x0,%1" - [(set_attr "type" "vecsimple")]) + [(set_attr "type" "vecsimple") + (set_attr "prefixed" "yes")]) (define_expand "xxspltidp_v2df" [(set (match_operand:V2DF 0 "register_operand" ) @@ -864,7 +866,8 @@ UNSPEC_XXSPLTID))] "TARGET_POWER10" "xxspltidp %x0,%1" - [(set_attr "type" "vecsimple")]) + [(set_attr "type" "vecsimple") + (set_attr "prefixed" "yes")]) (define_expand "xxsplti32dx_v4si" [(set (match_operand:V4SI 0 "register_operand" "=wa") @@ -893,7 +896,8 @@ UNSPEC_XXSPLTI32DX))] "TARGET_POWER10" "xxsplti32dx %x0,%2,%3" - [(set_attr "type" "vecsimple")]) + [(set_attr "type" "vecsimple") + (set_attr "prefixed" "yes")]) (define_expand "xxsplti32dx_v4sf" [(set (match_operand:V4SF 0 "register_operand" "=wa") @@ -921,7 +925,8 @@ UNSPEC_XXSPLTI32DX))] "TARGET_POWER10" "xxsplti32dx %x0,%2,%3" - [(set_attr "type" "vecsimple")]) + [(set_attr "type" "vecsimple") + (set_attr "prefixed" "yes")]) (define_insn "xxblend_" [(set (match_operand:VM3 0 "register_operand" "=wa") @@ -931,7 +936,8 @@ UNSPEC_XXBLEND))] "TARGET_POWER10" "xxblendv %x0,%x1,%x2,%x3" - [(set_attr "type" "vecsimple")]) + [(set_attr "type" "vecsimple") + (set_attr "prefixed" "yes")]) (define_expand "xxpermx" [(set (match_operand:V2DI 0 "register_operand" "+wa") @@ -975,7 +981,8 @@ UNSPEC_XXPERMX))] "TARGET_POWER10" "xxpermx %x0,%x1,%x2,%x3,%4" - [(set_attr "type" "vecsimple")]) + [(set_attr "type" "vecsimple") + (set_attr "prefixed" "yes")]) (define_expand "vstrir_" [(set (match_operand:VIshort 0 "altivec_register_operand") @@ -3623,7 +3630,8 @@ UNSPEC_XXEVAL))] "TARGET_POWER10" "xxeval %0,%1,%2,%3,%4" - [(set_attr "type" "vecsimple")]) + [(set_attr "type" "vecsimple") + (set_attr "prefixed" "yes")]) (define_expand "vec_unpacku_hi_v16qi" [(set (match_operand:V8HI 0 "register_operand" "=v") diff --git a/gcc/config/rs6000/mma.md b/gcc/config/rs6000/mma.md index a00d3a3..1f6fc03 100644 --- a/gcc/config/rs6000/mma.md +++ b/gcc/config/rs6000/mma.md @@ -540,7 +540,8 @@ MMA_VVI4I4I8))] "TARGET_MMA" " %A0,%x1,%x2,%3,%4,%5" - [(set_attr "type" "mma")]) + [(set_attr "type" "mma") + (set_attr "prefixed" "yes")]) (define_insn "mma_" [(set (match_operand:XO 0 "fpr_reg_operand" "=&d") @@ -553,7 +554,8 @@ MMA_AVVI4I4I8))] "TARGET_MMA" " %A0,%x2,%x3,%4,%5,%6" - [(set_attr "type" "mma")]) + [(set_attr "type" "mma") + (set_attr "prefixed" "yes")]) (define_insn "mma_" [(set (match_operand:XO 0 "fpr_reg_operand" "=&d") @@ -565,7 +567,8 @@ MMA_VVI4I4I2))] "TARGET_MMA" " %A0,%x1,%x2,%3,%4,%5" - [(set_attr "type" "mma")]) + [(set_attr "type" "mma") + (set_attr "prefixed" "yes")]) (define_insn "mma_" [(set (match_operand:XO 0 "fpr_reg_operand" "=&d") @@ -578,7 +581,8 @@ MMA_AVVI4I4I2))] "TARGET_MMA" " %A0,%x2,%x3,%4,%5,%6" - [(set_attr "type" "mma")]) + [(set_attr "type" "mma") 
+ (set_attr "prefixed" "yes")]) (define_insn "mma_" [(set (match_operand:XO 0 "fpr_reg_operand" "=&d") @@ -589,7 +593,8 @@ MMA_VVI4I4))] "TARGET_MMA" " %A0,%x1,%x2,%3,%4" - [(set_attr "type" "mma")]) + [(set_attr "type" "mma") + (set_attr "prefixed" "yes")]) (define_insn "mma_" [(set (match_operand:XO 0 "fpr_reg_operand" "=&d") @@ -601,7 +606,8 @@ MMA_AVVI4I4))] "TARGET_MMA" " %A0,%x2,%x3,%4,%5" - [(set_attr "type" "mma")]) + [(set_attr "type" "mma") + (set_attr "prefixed" "yes")]) (define_insn "mma_" [(set (match_operand:XO 0 "fpr_reg_operand" "=&d") @@ -612,7 +618,8 @@ MMA_PVI4I2))] "TARGET_MMA" " %A0,%x1,%x2,%3,%4" - [(set_attr "type" "mma")]) + [(set_attr "type" "mma") + (set_attr "prefixed" "yes")]) (define_insn "mma_" [(set (match_operand:XO 0 "fpr_reg_operand" "=&d") @@ -624,7 +631,8 @@ MMA_APVI4I2))] "TARGET_MMA" " %A0,%x2,%x3,%4,%5" - [(set_attr "type" "mma")]) + [(set_attr "type" "mma") + (set_attr "prefixed" "yes")]) (define_insn "mma_" [(set (match_operand:XO 0 "fpr_reg_operand" "=&d") @@ -636,7 +644,8 @@ MMA_VVI4I4I4))] "TARGET_MMA" " %A0,%x1,%x2,%3,%4,%5" - [(set_attr "type" "mma")]) + [(set_attr "type" "mma") + (set_attr "prefixed" "yes")]) (define_insn "mma_" [(set (match_operand:XO 0 "fpr_reg_operand" "=&d") @@ -649,4 +658,5 @@ MMA_AVVI4I4I4))] "TARGET_MMA" " %A0,%x2,%x3,%4,%5,%6" - [(set_attr "type" "mma")]) + [(set_attr "type" "mma") + (set_attr "prefixed" "yes")]) diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 50c768d..befab53 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -26396,7 +26396,9 @@ static bool prepend_p_to_next_insn; void rs6000_final_prescan_insn (rtx_insn *insn, rtx [], int) { - prepend_p_to_next_insn = (get_attr_prefixed (insn) != PREFIXED_NO); + prepend_p_to_next_insn = (get_attr_maybe_prefixed (insn) + == MAYBE_PREFIXED_YES + && get_attr_prefixed (insn) == PREFIXED_YES); return; } diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index c71d343..c8cdc42 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -264,15 +264,22 @@ (define_attr "cannot_copy" "no,yes" (const_string "no")) -;; Whether an insn is a prefixed insn, and an initial 'p' should be printed -;; before the instruction. A prefixed instruction has a prefix instruction -;; word that extends the immediate value of the instructions from 12-16 bits to -;; 34 bits. The macro ASM_OUTPUT_OPCODE emits a leading 'p' for prefixed -;; insns. The default "length" attribute will also be adjusted by default to +;; Whether this insn has a prefixed form and a non-prefixed form. +(define_attr "maybe_prefixed" "no,yes" + (if_then_else (eq_attr "type" "load,fpload,vecload,store,fpstore,vecstore, + integer,add") + (const_string "yes") + (const_string "no"))) + +;; Whether an insn is a prefixed insn. A prefixed instruction has a prefix +;; instruction word that conveys additional information such as a larger +;; immediate, additional operands, etc., in addition to the normal instruction +;; word. The default "length" attribute will also be adjusted by default to ;; be 12 bytes. 
(define_attr "prefixed" "no,yes" (cond [(ior (match_test "!TARGET_PREFIXED") - (match_test "!NONJUMP_INSN_P (insn)")) + (match_test "!NONJUMP_INSN_P (insn)") + (eq_attr "maybe_prefixed" "no")) (const_string "no") (eq_attr "type" "load,fpload,vecload") -- cgit v1.1 From b680b9049737198d010e49cf434704c6a6ed2b3f Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Sat, 3 Apr 2021 10:03:15 +0200 Subject: rs6000: Avoid -fpatchable-function-entry* regressions on powerpc64 be [PR98125] The SECTION_LINK_ORDER changes broke powerpc64-linux ELFv1. Seems that the assembler/linker relies on the symbol mentioned for the "awo" section to be in the same section as the symbols mentioned in the relocations in that section (i.e. labels for the patchable area in this case). That is the case for most targets, including powerpc-linux 32-bit or powerpc64 ELFv2 (that one has -fpatchable-function-entry* support broken for other reasons and it doesn't seem to be a regression). But it doesn't work on powerpc64-linux ELFv1. We emit: .section ".opd","aw" .align 3 _Z3foov: .quad .L._Z3foov,.TOC.@tocbase,0 .previous .type _Z3foov, @function .L._Z3foov: .section __patchable_function_entries,"awo",@progbits,_Z3foov .align 3 .8byte .LPFE1 .section .text._Z3foov,"axG",@progbits,_Z3foov,comdat .LPFE1: nop .LFB0: .cfi_startproc and because _Z3foov is in the .opd section rather than the function text section, it doesn't work. I'm afraid I don't know what exactly should be done, whether e.g. it could use .section __patchable_function_entries,"awo",@progbits,.L._Z3foov instead, or whether the linker should be changed to handle it as is, or something else. But because we have a P1 regression that didn't see useful progress over the 4 months since it has been filed and we don't really have much time, below is an attempt to do a targetted reversion of H.J's patch, basically act as if HAVE_GAS_SECTION_LINK_ORDER is never true for powerpc64-linux ELFv1, but for 32-bit or 64-bit ELFv2 keep working as is. This would give us time to resolve it for GCC 12 properly. 2021-04-03 Jakub Jelinek PR testsuite/98125 * targhooks.h (default_print_patchable_function_entry_1): Declare. * targhooks.c (default_print_patchable_function_entry_1): New function, copied from default_print_patchable_function_entry with an added flags argument. (default_print_patchable_function_entry): Rewritten into a small wrapper around default_print_patchable_function_entry_1. * config/rs6000/rs6000.c (TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY): Redefine. (rs6000_print_patchable_function_entry): New function. * g++.dg/pr93195a.C: Skip on powerpc*-*-* 64-bit. --- gcc/config/rs6000/rs6000.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index befab53..35f5c33 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -1341,6 +1341,10 @@ static const struct attribute_spec rs6000_attribute_table[] = #define TARGET_ASM_ASSEMBLE_VISIBILITY rs6000_assemble_visibility #endif +#undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY +#define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY \ + rs6000_print_patchable_function_entry + #undef TARGET_SET_UP_BY_PROLOGUE #define TARGET_SET_UP_BY_PROLOGUE rs6000_set_up_by_prologue @@ -14695,6 +14699,30 @@ rs6000_assemble_visibility (tree decl, int vis) } #endif +/* Write PATCH_AREA_SIZE NOPs into the asm outfile FILE around a function + entry. 
If RECORD_P is true and the target supports named sections, + the location of the NOPs will be recorded in a special object section + called "__patchable_function_entries". This routine may be called + twice per function to put NOPs before and after the function + entry. */ + +void +rs6000_print_patchable_function_entry (FILE *file, + unsigned HOST_WIDE_INT patch_area_size, + bool record_p) +{ + unsigned int flags = SECTION_WRITE | SECTION_RELRO; + /* When .opd section is emitted, the function symbol + default_print_patchable_function_entry_1 is emitted into the .opd section + while the patchable area is emitted into the function section. + Don't use SECTION_LINK_ORDER in that case. */ + if (!(TARGET_64BIT && DEFAULT_ABI != ABI_ELFv2) + && HAVE_GAS_SECTION_LINK_ORDER) + flags |= SECTION_LINK_ORDER; + default_print_patchable_function_entry_1 (file, patch_area_size, record_p, + flags); +} + enum rtx_code rs6000_reverse_condition (machine_mode mode, enum rtx_code code) { -- cgit v1.1 From ad7c6298b6f1cf4eaa70a53af3479fab14018896 Mon Sep 17 00:00:00 2001 From: Iain Sandoe Date: Sat, 3 Apr 2021 10:04:12 +0100 Subject: Darwin : Fix whitespace and delete unused code (NFC). Fix some incorrect indenting and remove two cases where we had code '#if 0'-d out. gcc/ChangeLog: * config/darwin.c (machopic_legitimize_pic_address): Fix whitespace, remove unused code. --- gcc/config/darwin.c | 34 ++++++++++++++-------------------- 1 file changed, 14 insertions(+), 20 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/darwin.c b/gcc/config/darwin.c index e2e60bb..c1086a0 100644 --- a/gcc/config/darwin.c +++ b/gcc/config/darwin.c @@ -906,9 +906,6 @@ machopic_legitimize_pic_address (rtx orig, machine_mode mode, rtx reg) emit_move_insn (reg, pic); pic = reg; } -#if 0 - emit_use (gen_rtx_REG (Pmode, PIC_OFFSET_TABLE_REGNUM)); -#endif if (lra_in_progress) df_set_regs_ever_live (REGNO (pic), true); @@ -977,9 +974,6 @@ machopic_legitimize_pic_address (rtx orig, machine_mode mode, rtx reg) emit_move_insn (reg, pic); pic = reg; } -#if 0 - emit_use (pic_offset_table_rtx); -#endif if (lra_in_progress) df_set_regs_ever_live (REGNO (pic), true); pic_ref = gen_rtx_PLUS (Pmode, @@ -990,21 +984,21 @@ machopic_legitimize_pic_address (rtx orig, machine_mode mode, rtx reg) } if (GET_CODE (pic_ref) != REG) - { - if (reg != 0) - { - emit_move_insn (reg, pic_ref); - return reg; - } - else - { - return force_reg (mode, pic_ref); - } - } + { + if (reg != 0) + { + emit_move_insn (reg, pic_ref); + return reg; + } + else + { + return force_reg (mode, pic_ref); + } + } else - { - return pic_ref; - } + { + return pic_ref; + } } else if (GET_CODE (orig) == PLUS && (GET_CODE (XEXP (orig, 0)) == MEM -- cgit v1.1 From 89bc1d4e7cdd0b2d012050134ad1d464ec357f0b Mon Sep 17 00:00:00 2001 From: Iain Sandoe Date: Sat, 3 Apr 2021 10:07:48 +0100 Subject: Darwin : Fix out-of-bounds access to df_regs_ever_live. During changes made for LRA (or, perhaps, even before) we omitted a check that the current register we are working on is a hard reg before we tried to note its liveness. A stage 1 built with fsanitize=address catches this, as does any attempt to build master with clang and -std=c++11. gcc/ChangeLog: * config/darwin.c (machopic_legitimize_pic_address): Check that the current pic register is one of the hard reg set before setting liveness. 
--- gcc/config/darwin.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/darwin.c b/gcc/config/darwin.c index c1086a0..5d17391 100644 --- a/gcc/config/darwin.c +++ b/gcc/config/darwin.c @@ -907,7 +907,7 @@ machopic_legitimize_pic_address (rtx orig, machine_mode mode, rtx reg) pic = reg; } - if (lra_in_progress) + if (lra_in_progress && HARD_REGISTER_P (pic)) df_set_regs_ever_live (REGNO (pic), true); pic_ref = gen_rtx_PLUS (Pmode, pic, machopic_gen_offset (XEXP (orig, 0))); @@ -974,7 +974,8 @@ machopic_legitimize_pic_address (rtx orig, machine_mode mode, rtx reg) emit_move_insn (reg, pic); pic = reg; } - if (lra_in_progress) + + if (lra_in_progress && HARD_REGISTER_P (pic)) df_set_regs_ever_live (REGNO (pic), true); pic_ref = gen_rtx_PLUS (Pmode, pic, -- cgit v1.1 From 16ea7f57891d3fe885ee55b2917208695e184714 Mon Sep 17 00:00:00 2001 From: Alex Coplan Date: Tue, 6 Apr 2021 09:06:27 +0100 Subject: arm: Fix PCS for SFmode -> SImode libcalls [PR99748] This patch fixes PR99748 which shows us trying to pass the argument to __aeabi_f2iz in the VFP register s0 when the library function is expecting to use the GPR r0. It also fixes the __aeabi_f2uiz case which was broken in the same way. For the testcase in the PR, here is the code we generate before the patch (with -mfloat-abi=hard -march=armv8.1-m.main+mve -O0): main: push {r7, lr} sub sp, sp, #8 add r7, sp, #0 mov r3, #1065353216 str r3, [r7, #4] @ float vldr.32 s0, [r7, #4] bl __aeabi_f2iz mov r3, r0 cmp r3, #1 [...] This becomes: main: push {r7, lr} sub sp, sp, #8 add r7, sp, #0 mov r3, #1065353216 str r3, [r7, #4] @ float ldr r0, [r7, #4] @ float bl __aeabi_f2iz mov r3, r0 cmp r3, #1 [...] after the patch. We see a similar change for the same testcase with a cast to unsigned instead of int. gcc/ChangeLog: PR target/99748 * config/arm/arm.c (arm_libcall_uses_aapcs_base): Also use base PCS for [su]fix_optab. --- gcc/config/arm/arm.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 518bfed..8910dad 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -5773,6 +5773,10 @@ arm_libcall_uses_aapcs_base (const_rtx libcall) convert_optab_libfunc (sfix_optab, DImode, SFmode)); add_libcall (libcall_htab, convert_optab_libfunc (ufix_optab, DImode, SFmode)); + add_libcall (libcall_htab, + convert_optab_libfunc (sfix_optab, SImode, SFmode)); + add_libcall (libcall_htab, + convert_optab_libfunc (ufix_optab, SImode, SFmode)); /* Values from double-precision helper functions are returned in core registers if the selected core only supports single-precision -- cgit v1.1 From a32452a5442cd05040af53787af0d8b537ac77a6 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Thu, 11 Mar 2021 16:56:26 -0800 Subject: x86: Update memcpy/memset inline strategies for Skylake family CPUs Simplify memcpy and memset inline strategies to avoid branches for Skylake family CPUs: 1. With MOVE_RATIO and CLEAR_RATIO == 17, GCC will use integer/vector load and store for up to 16 * 16 (256) bytes when the data size is fixed and known. 2. Inline only if data size is known to be <= 256. a. Use "rep movsb/stosb" with simple code sequence if the data size is a constant. b. Use loop if data size is not a constant. 3. Use memcpy/memset library function if data size is unknown or > 256. On Cascadelake processor with -march=native -Ofast -flto, 1.
Performance impacts of SPEC CPU 2017 rate are: 500.perlbench_r 0.17% 502.gcc_r -0.36% 505.mcf_r 0.00% 520.omnetpp_r 0.08% 523.xalancbmk_r -0.62% 525.x264_r 1.04% 531.deepsjeng_r 0.11% 541.leela_r -1.09% 548.exchange2_r -0.25% 557.xz_r 0.17% Geomean -0.08% 503.bwaves_r 0.00% 507.cactuBSSN_r 0.69% 508.namd_r -0.07% 510.parest_r 1.12% 511.povray_r 1.82% 519.lbm_r 0.00% 521.wrf_r -1.32% 526.blender_r -0.47% 527.cam4_r 0.23% 538.imagick_r -1.72% 544.nab_r -0.56% 549.fotonik3d_r 0.12% 554.roms_r 0.43% Geomean 0.02% 2. Significant impacts on eembc benchmarks are: eembc/idctrn01 9.23% eembc/nnet_test 29.26% gcc/ * config/i386/x86-tune-costs.h (skylake_memcpy): Updated. (skylake_memset): Likewise. (skylake_cost): Change CLEAR_RATIO to 17. * config/i386/x86-tune.def (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB): Replace m_CANNONLAKE, m_ICELAKE_CLIENT, m_ICELAKE_SERVER, m_TIGERLAKE and m_SAPPHIRERAPIDS with m_SKYLAKE and m_CORE_AVX512. gcc/testsuite/ * gcc.target/i386/memcpy-strategy-9.c: New test. * gcc.target/i386/memcpy-strategy-10.c: Likewise. * gcc.target/i386/memcpy-strategy-11.c: Likewise. * gcc.target/i386/memset-strategy-7.c: Likewise. * gcc.target/i386/memset-strategy-8.c: Likewise. * gcc.target/i386/memset-strategy-9.c: Likewise. --- gcc/config/i386/x86-tune-costs.h | 27 +++++++++++++++++---------- gcc/config/i386/x86-tune.def | 3 +-- 2 files changed, 18 insertions(+), 12 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h index 0e00ff9..ffe810f 100644 --- a/gcc/config/i386/x86-tune-costs.h +++ b/gcc/config/i386/x86-tune-costs.h @@ -1822,17 +1822,24 @@ struct processor_costs znver3_cost = { /* skylake_cost should produce code tuned for Skylake familly of CPUs. */ static stringop_algs skylake_memcpy[2] = { - {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}}, - {libcall, {{16, loop, false}, {512, unrolled_loop, false}, - {-1, libcall, false}}}}; + {libcall, + {{256, rep_prefix_1_byte, true}, + {256, loop, false}, + {-1, libcall, false}}}, + {libcall, + {{256, rep_prefix_1_byte, true}, + {256, loop, false}, + {-1, libcall, false}}}}; static stringop_algs skylake_memset[2] = { - {libcall, {{6, loop_1_byte, true}, - {24, loop, true}, - {8192, rep_prefix_4_byte, true}, - {-1, libcall, false}}}, - {libcall, {{24, loop, true}, {512, unrolled_loop, false}, - {-1, libcall, false}}}}; + {libcall, + {{256, rep_prefix_1_byte, true}, + {256, loop, false}, + {-1, libcall, false}}}, + {libcall, + {{256, rep_prefix_1_byte, true}, + {256, loop, false}, + {-1, libcall, false}}}}; static const struct processor_costs skylake_cost = { @@ -1889,7 +1896,7 @@ struct processor_costs skylake_cost = { COSTS_N_INSNS (0), /* cost of movzx */ 8, /* "large" insn */ 17, /* MOVE_RATIO */ - 6, /* CLEAR_RATIO */ + 17, /* CLEAR_RATIO */ {4, 4, 4}, /* cost of loading integer registers in QImode, HImode and SImode. Relative to reg-reg move (2). */ diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index 134916c..eb057a6 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -273,8 +273,7 @@ DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA) move/set sequences of bytes with known size. 
*/ DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB, "prefer_known_rep_movsb_stosb", - m_CANNONLAKE | m_ICELAKE_CLIENT | m_ICELAKE_SERVER | m_TIGERLAKE - | m_ALDERLAKE | m_SAPPHIRERAPIDS) + m_SKYLAKE | m_ALDERLAKE | m_CORE_AVX512) /* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of compact prologues and epilogues by issuing a misaligned moves. This -- cgit v1.1 From 498d2ba5849cd0888ad473a2ff953ede106262c5 Mon Sep 17 00:00:00 2001 From: Xianmiao Qu Date: Tue, 6 Apr 2021 20:19:15 +0800 Subject: C-SKY: Describe ck802 bypass accurately. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the following warning: insn-automata.c: In function ‘int maximal_insn_latency(rtx_insn*)’: insn-automata.c:679:37: warning: array subscript -1 is below array bounds of ‘const unsigned char [19]’ [-Warray-bounds] 679 | return default_latencies[insn_code]; | ~~~~~~~~~~~~~~~~~~~~~~~~~~~^ insn-automata.c:397:30: note: while referencing ‘default_latencies’ 397 | static const unsigned char default_latencies[] = | gcc/ * config/csky/csky_pipeline_ck802.md : Use insn reservation name instead of *. --- gcc/config/csky/csky_pipeline_ck802.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/csky/csky_pipeline_ck802.md b/gcc/config/csky/csky_pipeline_ck802.md index bf1c2a7..2406f59 100644 --- a/gcc/config/csky/csky_pipeline_ck802.md +++ b/gcc/config/csky/csky_pipeline_ck802.md @@ -70,8 +70,12 @@ (define_bypass 3 "ck802_load,ck802_store" "ck802_pool") (define_bypass 3 "ck802_pool" "ck802_load,ck802_store") -(define_bypass 1 "*" "ck802_alu") +(define_bypass 1 "ck802_alu,ck802_branch,ck802_cmp,ck802_cbranch,ck802_call,\ + ck802_load,ck802_pool,ck802_store" + "ck802_alu") -(define_bypass 1 "*" "ck802_branch") +(define_bypass 1 "ck802_alu,ck802_branch,ck802_cmp,ck802_cbranch,ck802_call,\ + ck802_load,ck802_pool,ck802_store" + "ck802_branch") (define_bypass 2 "ck802_cmp" "ck802_cbranch") -- cgit v1.1 From 67d56b272021363eb58c319ca3b73beba3a60817 Mon Sep 17 00:00:00 2001 From: Alex Coplan Date: Thu, 8 Apr 2021 09:36:57 +0100 Subject: arm: Various MVE vec_duplicate fixes [PR99647] This patch fixes various issues with vec_duplicate in the MVE patterns. Currently there are two patterns named *mve_mov. The second of these is really a vector duplicate rather than a move, so I've renamed it accordingly. As it stands, there are several issues with this pattern: 1. The MVE_types iterator has an entry for TImode, but vec_duplicate:TI is invalid. 2. The mode of the operand to vec_duplicate is SImode, but it should vary according to the vector mode iterator. 3. The second alternative of this pattern is bogus: it allows matching symbol_refs (the cause of the PR) and const_ints (which means that it matches (vec_duplicate (const_int ...)) which is non-canonical: such rtxes should be const_vectors instead and handled by the main vector move pattern). This patch fixes all of these issues, and removes the redundant *mve_vec_duplicate pattern. gcc/ChangeLog: PR target/99647 * config/arm/iterators.md (MVE_vecs): New. (V_elem): Also handle V2DF. * config/arm/mve.md (*mve_mov): Rename to ... (*mve_vdup): ... this. Remove second alternative since vec_duplicate of const_int is not canonical RTL, and we don't want to match symbol_refs. (*mve_vec_duplicate): Delete (pattern is redundant). gcc/testsuite/ChangeLog: PR target/99647 * gcc.c-torture/compile/pr99647.c: New test. 
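For reference, the canonicalization rule behind issue 3 above (illustrative RTL, not taken from a compiler dump; the mode and value are arbitrary): a duplicated constant must be represented as a const_vector rather than as a vec_duplicate of a const_int, so

;; non-canonical -- should never appear:
(vec_duplicate:V4SI (const_int 5))
;; canonical form, handled by the main vector move pattern:
(const_vector:V4SI [(const_int 5) (const_int 5) (const_int 5) (const_int 5)])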
--- gcc/config/arm/iterators.md | 8 +++++--- gcc/config/arm/mve.md | 25 +++++++------------------ 2 files changed, 12 insertions(+), 21 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index 43aab23..8fb723e 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -261,6 +261,7 @@ ;; MVE mode iterator. (define_mode_iterator MVE_types [V16QI V8HI V4SI V2DI TI V8HF V4SF V2DF]) +(define_mode_iterator MVE_vecs [V16QI V8HI V4SI V2DI V8HF V4SF V2DF]) (define_mode_iterator MVE_VLD_ST [V16QI V8HI V4SI V8HF V4SF]) (define_mode_iterator MVE_0 [V8HF V4SF]) (define_mode_iterator MVE_1 [V16QI V8HI V4SI V2DI]) @@ -567,9 +568,10 @@ (V4HI "HI") (V8HI "HI") (V4HF "HF") (V8HF "HF") (V4BF "BF") (V8BF "BF") - (V2SI "SI") (V4SI "SI") - (V2SF "SF") (V4SF "SF") - (DI "DI") (V2DI "DI")]) + (V2SI "SI") (V4SI "SI") + (V2SF "SF") (V4SF "SF") + (DI "DI") (V2DI "DI") + (V2DF "DF")]) ;; As above but in lower case. (define_mode_attr V_elem_l [(V8QI "qi") (V16QI "qi") diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index 1351863..7467d5f 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -104,18 +104,14 @@ (set_attr "thumb2_pool_range" "*,*,*,*,1018,*,*,*,*") (set_attr "neg_pool_range" "*,*,*,*,996,*,*,*,*")]) -(define_insn "*mve_mov" - [(set (match_operand:MVE_types 0 "s_register_operand" "=w,w") - (vec_duplicate:MVE_types - (match_operand:SI 1 "nonmemory_operand" "r,i")))] +(define_insn "*mve_vdup" + [(set (match_operand:MVE_vecs 0 "s_register_operand" "=w") + (vec_duplicate:MVE_vecs + (match_operand: 1 "s_register_operand" "r")))] "TARGET_HAVE_MVE || TARGET_HAVE_MVE_FLOAT" -{ - if (which_alternative == 0) - return "vdup.\t%q0, %1"; - return "vmov.\t%q0, %1"; -} - [(set_attr "length" "4,4") - (set_attr "type" "mve_move,mve_move")]) + "vdup.\t%q0, %1" + [(set_attr "length" "4") + (set_attr "type" "mve_move")]) ;; ;; [vst4q]) @@ -10737,13 +10733,6 @@ [(set_attr "type" "mve_move") (set_attr "length" "8")]) -(define_insn "*mve_vec_duplicate" - [(set (match_operand:MVE_VLD_ST 0 "s_register_operand" "=w") - (vec_duplicate:MVE_VLD_ST (match_operand: 1 "general_operand" "r")))] - "TARGET_HAVE_MVE || TARGET_HAVE_MVE_FLOAT" - "vdup.\t%q0, %1" - [(set_attr "type" "mve_move")]) - ;; CDE instructions on MVE registers. (define_insn "arm_vcx1qv16qi" -- cgit v1.1 From 94279aacd061623a160b8dc1b9ea267ee435b0f8 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Thu, 8 Apr 2021 16:55:49 +0100 Subject: VAX: Fix comment for `*bit' pattern's peephole The comment for a peephole provided for the `*bit' pattern to be produced in comparison elimination from a sequence involving a bitwise complement operation of one input operand followed by a bitwise AND operation between a bitwise complement of said intermediate result and the other input operand (which corresponds to a sequence of MCOM and BIC machine instructions) incorrectly refers to the first operation as MNEG (which is the machine instruction for arithmetic negation) rather than MCOM as it is supposed to. Fix it. gcc/ * config/vax/vax.md: Fix comment for `*bit' pattern's peephole. --- gcc/config/vax/vax.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/vax/vax.md b/gcc/config/vax/vax.md index 0a2c86c..5b1b392 100644 --- a/gcc/config/vax/vax.md +++ b/gcc/config/vax/vax.md @@ -1228,7 +1228,7 @@ ;; the "*bit" pattern does for the purpose of the compare ;; elimination pass. 
Try to get rid of the extra operation by hand ;; and where the sequence is used to set the condition codes only -;; convert MNEG/BIC => BIT. +;; convert MCOM/BIC => BIT. (define_peephole2 [(parallel [(set (match_operand:VAXint 0 "register_operand") -- cgit v1.1 From 5f7c2d20b5bd33d7afacb56e18111edb3256c903 Mon Sep 17 00:00:00 2001 From: David Edelsohn Date: Wed, 7 Apr 2021 21:34:02 -0400 Subject: aix: revert TLS common change GCC uses TLS common for both public common / BSS and local common / BSS. This patch reverts to use .comm directive to allocate TLS common / BSS. This also changes the priority of section selection to use BSS before data section. gcc/ChangeLog: * config/rs6000/rs6000.c (rs6000_xcoff_select_section): Select TLS BSS before TLS data. * config/rs6000/xcoff.h (ASM_OUTPUT_TLS_COMMON): Use .comm. gcc/testsuite/ChangeLog: * g++.dg/gomp/tls-5.C: Expect tbss failure on AIX. --- gcc/config/rs6000/rs6000.c | 6 +++--- gcc/config/rs6000/xcoff.h | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 35f5c33..48b8efd 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -21285,14 +21285,14 @@ rs6000_xcoff_select_section (tree decl, int reloc, #if HAVE_AS_TLS if (TREE_CODE (decl) == VAR_DECL && DECL_THREAD_LOCAL_P (decl)) { - if (TREE_PUBLIC (decl)) - return tls_data_section; - else if (bss_initializer_p (decl)) + if (bss_initializer_p (decl)) { /* Convert to COMMON to emit in BSS. */ DECL_COMMON (decl) = 1; return tls_comm_section; } + else if (TREE_PUBLIC (decl)) + return tls_data_section; else return tls_private_data_section; } diff --git a/gcc/config/rs6000/xcoff.h b/gcc/config/rs6000/xcoff.h index cb9aae7..c016678 100644 --- a/gcc/config/rs6000/xcoff.h +++ b/gcc/config/rs6000/xcoff.h @@ -255,11 +255,11 @@ } while (0) #ifdef HAVE_AS_TLS -#define ASM_OUTPUT_TLS_COMMON(FILE, DECL, NAME, SIZE) \ - do { fputs (LOCAL_COMMON_ASM_OP, (FILE)); \ - fprintf ((FILE), "%s," HOST_WIDE_INT_PRINT_UNSIGNED",%s[UL],3\n", \ - (*targetm.strip_name_encoding) (NAME), (SIZE), \ - (*targetm.strip_name_encoding) (NAME)); \ +#define ASM_OUTPUT_TLS_COMMON(FILE, DECL, NAME, SIZE) \ + do { fputs (COMMON_ASM_OP, (FILE)); \ + RS6000_OUTPUT_BASENAME ((FILE), (NAME)); \ + fprintf ((FILE), "[UL]," HOST_WIDE_INT_PRINT_UNSIGNED"\n", \ + (SIZE)); \ } while (0) #endif -- cgit v1.1 From ba2913f618ab2fecf15355f936028a39b5a9db87 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Fri, 9 Apr 2021 13:43:14 +0100 Subject: aarch64: Use x30 as temporary in SVE TLSDESC patterns gcc.dg/torture/tls/tls-reload-1.c started ICEing for SVE some time during the GCC 11 cycle (not sure when). The problem is that we had an output reload on a call_insn, which isn't a supported combination. This patch uses LR_REGNUM instead. The resulting "blr x30" might not perform as well on some CPUs, but in this context the difference shouldn't be noticeable. gcc/ * config/aarch64/aarch64.md (tlsdesc_small_sve_): Use X30 as the temporary register. 
--- gcc/config/aarch64/aarch64.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 15bbc10..a149748 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -6818,10 +6818,9 @@ UNSPEC_TLSDESC)) (const_int 0))) (unspec:DI [(match_operand:DI 1 "const_int_operand")] UNSPEC_CALLEE_ABI) - (clobber (reg:DI LR_REGNUM)) - (clobber (match_scratch:DI 2 "=r"))] + (clobber (reg:DI LR_REGNUM))] "TARGET_TLS_DESC && TARGET_SVE" - "adrp\\tx0, %A0\;ldr\\t%2, [x0, #%L0]\;add\\t0, 0, %L0\;.tlsdesccall\\t%0\;blr\\t%2" + "adrp\\tx0, %A0\;ldr\\t30, [x0, #%L0]\;add\\t0, 0, %L0\;.tlsdesccall\\t%0\;blr\\tx30" [(set_attr "type" "call") (set_attr "length" "16")]) -- cgit v1.1 From 1a5c82919c27a6af5eba0c2ba147dd011003cf72 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Fri, 9 Apr 2021 18:24:00 +0100 Subject: aarch64: Fix push/pop_options with --with-cpu MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a toolchain is configured with --with-cpu=X and gcc is then run with an explicit -march=Y option, we ignore the X cpu setting and tune for generic Y code: if (!selected_cpu) { if (selected_arch) { ------> selected_cpu = &all_cores[selected_arch->ident]; aarch64_isa_flags = arch_isa; explicit_arch = selected_arch->arch; } else { /* Get default configure-time CPU. */ selected_cpu = aarch64_get_tune_cpu (aarch64_none); aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6; } if (selected_tune) explicit_tune_core = selected_tune->ident; } … if (!selected_tune) selected_tune = selected_cpu; But after a push/pop_options pair, we simply did: selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core); In the above scenario, ptr->x_explicit_tune_core is aarch64_none, so we fall back on the default configure-time CPU. This means that before the push_options we tuned for generic Y but after the pop_options we tuned for X. This was picked up by an assertion failure in cl_optimization_compare. The ICE itself is a GCC 11 regression, but the problem that it shows up is much older. gcc/ * config/aarch64/aarch64.c (aarch64_option_restore): If the architecture was specified explicitly and the tuning wasn't, tune for the architecture rather than the configured default CPU. --- gcc/config/aarch64/aarch64.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 994fafc..6405504 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -16945,10 +16945,14 @@ aarch64_option_restore (struct gcc_options *opts, struct gcc_options */* opts_set */, struct cl_target_option *ptr) { - opts->x_explicit_tune_core = ptr->x_explicit_tune_core; - selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core); opts->x_explicit_arch = ptr->x_explicit_arch; selected_arch = aarch64_get_arch (ptr->x_explicit_arch); + opts->x_explicit_tune_core = ptr->x_explicit_tune_core; + if (opts->x_explicit_tune_core == aarch64_none + && opts->x_explicit_arch != aarch64_no_arch) + selected_tune = &all_cores[selected_arch->ident]; + else + selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core); opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string; opts->x_aarch64_branch_protection_string = ptr->x_aarch64_branch_protection_string; -- cgit v1.1 From 71958f740f1b8c47a86ea222418abee395d254a0 Mon Sep 17 00:00:00 2001 From: "H.J. 
Lu" Date: Fri, 9 Apr 2021 11:44:32 -0700 Subject: x86: Define _serialize as macro Define _serialize as macro for callers with general-regs-only target attribute to avoid inline failure with always_inline attribute. gcc/ PR target/99744 * config/i386/serializeintrin.h (_serialize): Defined as macro. gcc/testsuite/ PR target/99744 * gcc.target/i386/pr99744-2.c: New test. --- gcc/config/i386/serializeintrin.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/serializeintrin.h b/gcc/config/i386/serializeintrin.h index 89b5b94..e280250 100644 --- a/gcc/config/i386/serializeintrin.h +++ b/gcc/config/i386/serializeintrin.h @@ -34,12 +34,7 @@ #define __DISABLE_SERIALIZE__ #endif /* __SERIALIZE__ */ -extern __inline void -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_serialize (void) -{ - __builtin_ia32_serialize (); -} +#define _serialize() __builtin_ia32_serialize () #ifdef __DISABLE_SERIALIZE__ #undef __DISABLE_SERIALIZE__ -- cgit v1.1 From f2be08339b77d3495e210d6b5d9cea927f437720 Mon Sep 17 00:00:00 2001 From: "Cui,Lili" Date: Mon, 12 Apr 2021 09:59:25 +0800 Subject: Change march=alderlake ISA list and add m_ALDERLAKE to m_CORE_AVX2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Alder Lake Intel Hybrid Technology will not support Intel® AVX-512. ISA features such as Intel® AVX, AVX-VNNI, Intel® AVX2, and UMONITOR/UMWAIT/TPAUSE are supported. gcc/ChangeLog * config/i386/i386.h (PTA_ALDERLAKE): Change alderlake ISA list. * config/i386/i386-options.c (m_CORE_AVX2): Add m_ALDERLAKE. * common/config/i386/cpuinfo.h (get_intel_cpu): Add AlderLake model. * doc/invoke.texi: Change alderlake ISA list. --- gcc/config/i386/i386-options.c | 2 +- gcc/config/i386/i386.h | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c index a8d0673..02e9c97 100644 --- a/gcc/config/i386/i386-options.c +++ b/gcc/config/i386/i386-options.c @@ -129,7 +129,7 @@ along with GCC; see the file COPYING3. If not see #define m_CORE_AVX512 (m_SKYLAKE_AVX512 | m_CANNONLAKE \ | m_ICELAKE_CLIENT | m_ICELAKE_SERVER | m_CASCADELAKE \ | m_TIGERLAKE | m_COOPERLAKE | m_SAPPHIRERAPIDS) -#define m_CORE_AVX2 (m_HASWELL | m_SKYLAKE | m_CORE_AVX512) +#define m_CORE_AVX2 (m_HASWELL | m_SKYLAKE | m_ALDERLAKE | m_CORE_AVX512) #define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2) #define m_GOLDMONT (HOST_WIDE_INT_1U< Date: Mon, 12 Apr 2021 09:59:25 +0800 Subject: Add rocketlake to gcc. gcc/ * common/config/i386/cpuinfo.h (get_intel_cpu): Handle rocketlake. * common/config/i386/i386-common.c (processor_names): Add rocketlake. (processor_alias_table): Add rocketlake. * common/config/i386/i386-cpuinfo.h (processor_subtypes): Add INTEL_COREI7_ROCKETLAKE. * config.gcc: Add -march=rocketlake. * config/i386/i386-c.c (ix86_target_macros_internal): Handle rocketlake. * config/i386/i386-options.c (m_ROCKETLAKE) : Define. (processor_cost_table): Add rocketlake cost. * config/i386/i386.h (ix86_size_cost) : Define TARGET_ROCKETLAKE. (processor_type) : Add PROCESSOR_ROCKETLAKE. (PTA_ROCKETLAKE): Ditto. * doc/extend.texi: Add rocketlake. * doc/invoke.texi: Add rocketlake. gcc/testsuite/ * gcc.target/i386/funcspec-56.inc: Handle new march. 
* g++.target/i386/mv16.C: Handle new march --- gcc/config/i386/i386-c.c | 7 +++++++ gcc/config/i386/i386-options.c | 5 ++++- gcc/config/i386/i386.h | 3 +++ 3 files changed, 14 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-c.c b/gcc/config/i386/i386-c.c index ed4b098..be46d05 100644 --- a/gcc/config/i386/i386-c.c +++ b/gcc/config/i386/i386-c.c @@ -242,6 +242,10 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag, def_or_undef (parse_in, "__alderlake"); def_or_undef (parse_in, "__alderlake__"); break; + case PROCESSOR_ROCKETLAKE: + def_or_undef (parse_in, "__rocketlake"); + def_or_undef (parse_in, "__rocketlake__"); + break; /* use PROCESSOR_max to not set/unset the arch macro. */ case PROCESSOR_max: break; @@ -405,6 +409,9 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag, case PROCESSOR_ALDERLAKE: def_or_undef (parse_in, "__tune_alderlake__"); break; + case PROCESSOR_ROCKETLAKE: + def_or_undef (parse_in, "__tune_rocketlake__"); + break; case PROCESSOR_INTEL: case PROCESSOR_GENERIC: break; diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c index 02e9c97..91da284 100644 --- a/gcc/config/i386/i386-options.c +++ b/gcc/config/i386/i386-options.c @@ -126,9 +126,11 @@ along with GCC; see the file COPYING3. If not see #define m_COOPERLAKE (HOST_WIDE_INT_1U< Date: Mon, 12 Apr 2021 16:08:42 +0200 Subject: IBM Z: Add alternative to *movdi_{31,64} in order to load a DFP zero gcc/ChangeLog: * config/s390/s390.md ("*movdi_31", "*movdi_64"): Add alternative in order to load a DFP zero. --- gcc/config/s390/s390.md | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md index c10f25b..7faf775 100644 --- a/gcc/config/s390/s390.md +++ b/gcc/config/s390/s390.md @@ -1868,9 +1868,9 @@ (define_insn "*movdi_64" [(set (match_operand:DI 0 "nonimmediate_operand" - "=d, d, d, d, d, d, d, d,f,d,d,d,d,d,T,!*f,!*f,!*f,!R,!T,b,Q,d,t,Q,t,v,v,v,d,v,R,d") + "=d, d, d, d, d, d, d, d,f,d,!*f,d,d,d,d,T,!*f,!*f,!*f,!R,!T,b,Q,d,t,Q,t,v,v,v,d,v,R,d") (match_operand:DI 1 "general_operand" - " K,N0HD0,N1HD0,N2HD0,N3HD0,Os,N0SD0,N1SD0,d,f,L,b,d,T,d, *f, R, T,*f,*f,d,K,t,d,t,Q,K,v,d,v,R,v,ZL"))] + " K,N0HD0,N1HD0,N2HD0,N3HD0,Os,N0SD0,N1SD0,d,f,j00,L,b,d,T,d, *f, R, T,*f,*f,d,K,t,d,t,Q,K,v,d,v,R,v,ZL"))] "TARGET_ZARCH" "@ lghi\t%0,%h1 @@ -1883,6 +1883,7 @@ llilf\t%0,%k1 ldgr\t%0,%1 lgdr\t%0,%1 + lzdr\t%0 lay\t%0,%a1 lgrl\t%0,%1 lgr\t%0,%1 @@ -1906,13 +1907,13 @@ vleg\t%v0,%1,0 vsteg\t%v1,%0,0 larl\t%0,%1" - [(set_attr "op_type" "RI,RI,RI,RI,RI,RIL,RIL,RIL,RRE,RRE,RXY,RIL,RRE,RXY, + [(set_attr "op_type" "RI,RI,RI,RI,RI,RIL,RIL,RIL,RRE,RRE,RRE,RXY,RIL,RRE,RXY, RXY,RR,RX,RXY,RX,RXY,RIL,SIL,*,*,RS,RS,VRI,VRR,VRS,VRS, VRX,VRX,RIL") - (set_attr "type" "*,*,*,*,*,*,*,*,floaddf,floaddf,la,larl,lr,load,store, + (set_attr "type" "*,*,*,*,*,*,*,*,floaddf,floaddf,fsimpdf,la,larl,lr,load,store, floaddf,floaddf,floaddf,fstoredf,fstoredf,larl,*,*,*,*, *,*,*,*,*,*,*,larl") - (set_attr "cpu_facility" "*,*,*,*,*,extimm,extimm,extimm,dfp,dfp,longdisp, + (set_attr "cpu_facility" "*,*,*,*,*,extimm,extimm,extimm,dfp,dfp,*,longdisp, z10,*,*,*,*,*,longdisp,*,longdisp, z10,z10,*,*,*,*,vx,vx,vx,vx,vx,vx,*") (set_attr "z10prop" "z10_fwd_A1, @@ -1925,6 +1926,7 @@ z10_fwd_E1, *, *, + *, z10_fwd_A1, z10_fwd_A3, z10_fr_E1, @@ -1942,7 +1944,7 @@ *, *,*,*,*,*,*,*, z10_super_A1") - (set_attr "relative_long" "*,*,*,*,*,*,*,*,*,*, + (set_attr "relative_long" "*,*,*,*,*,*,*,*,*,*,*, 
*,yes,*,*,*,*,*,*,*,*, yes,*,*,*,*,*,*,*,*,*, *,*,yes") @@ -2002,9 +2004,9 @@ (define_insn "*movdi_31" [(set (match_operand:DI 0 "nonimmediate_operand" - "=d,d,Q,S,d ,o,!*f,!*f,!*f,!R,!T,d") + "=d,d,Q,S,d ,o,!*f,!*f,!*f,!*f,!R,!T,d") (match_operand:DI 1 "general_operand" - " Q,S,d,d,dPT,d, *f, R, T,*f,*f,b"))] + " Q,S,d,d,dPT,d, *f, R, T,j00,*f,*f,b"))] "!TARGET_ZARCH" "@ lm\t%0,%N0,%S1 @@ -2016,12 +2018,13 @@ ldr\t%0,%1 ld\t%0,%1 ldy\t%0,%1 + lzdr\t%0 std\t%1,%0 stdy\t%1,%0 #" - [(set_attr "op_type" "RS,RSY,RS,RSY,*,*,RR,RX,RXY,RX,RXY,*") - (set_attr "type" "lm,lm,stm,stm,*,*,floaddf,floaddf,floaddf,fstoredf,fstoredf,*") - (set_attr "cpu_facility" "*,longdisp,*,longdisp,*,*,*,*,longdisp,*,longdisp,z10")]) + [(set_attr "op_type" "RS,RSY,RS,RSY,*,*,RR,RX,RXY,RRE,RX,RXY,*") + (set_attr "type" "lm,lm,stm,stm,*,*,floaddf,floaddf,floaddf,fsimpdf,fstoredf,fstoredf,*") + (set_attr "cpu_facility" "*,longdisp,*,longdisp,*,*,*,*,longdisp,*,*,longdisp,z10")]) ; For a load from a symbol ref we can use one of the target registers ; together with larl to load the address. -- cgit v1.1 From f6ba5d039f988babdd99b5cdfb4557c380e57d69 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Tue, 13 Apr 2021 12:43:39 +0200 Subject: aarch64: Restore bfxil optimization [PR100028] Similarly to PR87763 for bfi, the GCC 9 combiner changes to not combine moves from hard registers regressed the following testcase where we no longer recognize bfxil and emit 3 instructions instead. The following patch adds define_insn patterns that match what the combiner is trying to match in these cases. I haven't been able to see patterns with the other order of the IOR operands, seems the IL is canonicalized this way no matter what is written in the source. 2021-04-13 Jakub Jelinek PR target/100028 * config/aarch64/aarch64.md (*aarch64_bfxil_extr, *aarch64_bfxilsi_extrdi): New define_insn patterns. * gcc.target/aarch64/pr100028.c: New test. 
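For illustration, the insert-extracted-bitfield shape these patterns recognize looks roughly like this (assumed sketch; the committed gcc.target/aarch64/pr100028.c testcase is not reproduced in this log):

/* Keep the high bits of a and replace its low 16 bits with bits 32..47
   of b; with the new pattern this should again assemble to a single
   "bfxil x0, x1, 32, 16" instead of three instructions.  */
unsigned long long
f (unsigned long long a, unsigned long long b)
{
  return (a & ~0xffffULL) | ((b >> 32) & 0xffff);
}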
--- gcc/config/aarch64/aarch64.md | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index a149748..9a7ed78 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -5601,6 +5601,38 @@ [(set_attr "type" "bfm")] ) +(define_insn "*aarch64_bfxil_extr" + [(set (match_operand:GPI 0 "register_operand" "=r") + (ior:GPI (and:GPI (match_operand:GPI 1 "register_operand" "0") + (match_operand:GPI 2 "const_int_operand" "n")) + (zero_extract:GPI + (match_operand:GPI 3 "register_operand" "r") + (match_operand:GPI 4 "aarch64_simd_shift_imm_" "n") + (match_operand:GPI 5 "aarch64_simd_shift_imm_" "n"))))] + "UINTVAL (operands[2]) == HOST_WIDE_INT_M1U << INTVAL (operands[4]) + && INTVAL (operands[4]) + && (UINTVAL (operands[4]) + UINTVAL (operands[5]) + <= GET_MODE_BITSIZE (mode))" + "bfxil\t%0, %3, %5, %4" + [(set_attr "type" "bfm")] +) + +(define_insn "*aarch64_bfxilsi_extrdi" + [(set (match_operand:SI 0 "register_operand" "=r") + (ior:SI (and:SI (match_operand:SI 1 "register_operand" "0") + (match_operand:SI 2 "const_int_operand" "n")) + (match_operator:SI 6 "subreg_lowpart_operator" + [(zero_extract:DI + (match_operand:DI 3 "register_operand" "r") + (match_operand:SI 4 "aarch64_simd_shift_imm_si" "n") + (match_operand:SI 5 "aarch64_simd_shift_imm_si" "n"))])))] + "UINTVAL (operands[2]) == HOST_WIDE_INT_M1U << INTVAL (operands[4]) + && INTVAL (operands[4]) + && UINTVAL (operands[4]) + UINTVAL (operands[5]) <= 32" + "bfxil\t%w0, %w3, %5, %4" + [(set_attr "type" "bfm")] +) + (define_insn "*extr_insv_lower_reg" [(set (zero_extract:GPI (match_operand:GPI 0 "register_operand" "+r") (match_operand 1 "const_int_operand" "n") -- cgit v1.1 From d1e4368ddb76a92c44f824c8e4ca1a3de8149342 Mon Sep 17 00:00:00 2001 From: Richard Earnshaw Date: Wed, 14 Apr 2021 10:56:36 +0100 Subject: arm: fix warning when -mcpu=neoverse-n1 is used with -mfpu=neon [PR100067] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the compiler is configured with --with-fpu= (or invoked with, say, -mfpu=neon), then specifying -mcpu=neoverse-n1 can lead to an unexpected warning: cc1: warning: switch ‘-mcpu=neoverse-n1’ conflicts with ‘-march=armv8.2-a’ switch The fix for this is to correctly remove all the feature bits relating to simd/fp units when -mfpu is used, not just those bits that form part of the -mfpu specification (which is a subset). gcc: PR target/100067 * config/arm/arm.c (arm_configure_build_target): Strip isa_all_fpbits from the isa_delta when -mfpu has been used. (arm_options_perform_arch_sanity_checks): It's the architecture that lacks an FPU not the processor. --- gcc/config/arm/arm.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 8910dad..475fb0d 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -3230,21 +3230,22 @@ arm_configure_build_target (struct arm_build_target *target, bitmap_xor (isa_delta, cpu_isa, target->isa); /* Ignore any bits that are quirk bits. */ bitmap_and_compl (isa_delta, isa_delta, isa_quirkbits); - /* Ignore (for now) any bits that might be set by -mfpu. */ - bitmap_and_compl (isa_delta, isa_delta, isa_all_fpubits_internal); - - /* And if the target ISA lacks floating point, ignore any - extensions that depend on that. 
*/ - if (!bitmap_bit_p (target->isa, isa_bit_vfpv2)) + /* If the user (or the default configuration) has specified a + specific FPU, then ignore any bits that depend on the FPU + configuration. Do similarly if using the soft-float + ABI. */ + if (opts->x_arm_fpu_index != TARGET_FPU_auto + || arm_float_abi == ARM_FLOAT_ABI_SOFT) bitmap_and_compl (isa_delta, isa_delta, isa_all_fpbits); if (!bitmap_empty_p (isa_delta)) { if (warn_compatible) warning (0, "switch %<-mcpu=%s%> conflicts " - "with %<-march=%s%> switch", - arm_selected_cpu->common.name, - arm_selected_arch->common.name); + "with switch %<-march=%s%>", + opts->x_arm_cpu_string, + opts->x_arm_arch_string); + /* -march wins for code generation. -mcpu wins for default tuning. */ if (!arm_selected_tune) @@ -3395,7 +3396,9 @@ arm_configure_build_target (struct arm_build_target *target, auto_sbitmap fpu_bits (isa_num_bits); arm_initialize_isa (fpu_bits, arm_selected_fpu->isa_bits); - bitmap_and_compl (target->isa, target->isa, isa_all_fpubits_internal); + /* Clear out ALL bits relating to the FPU/simd extensions, to avoid + potentially invalid combinations later on that we can't match. */ + bitmap_and_compl (target->isa, target->isa, isa_all_fpbits); bitmap_ior (target->isa, target->isa, fpu_bits); } @@ -3856,7 +3859,7 @@ arm_options_perform_arch_sanity_checks (void) arm_pcs_default = ARM_PCS_AAPCS_VFP; if (!bitmap_bit_p (arm_active_target.isa, isa_bit_vfpv2) && !bitmap_bit_p (arm_active_target.isa, isa_bit_mve)) - error ("%<-mfloat-abi=hard%>: selected processor lacks an FPU"); + error ("%<-mfloat-abi=hard%>: selected architecture lacks an FPU"); } else arm_pcs_default = ARM_PCS_AAPCS; -- cgit v1.1 From d253a6f7bb1748405bd9d12967f40dc19c8f77c4 Mon Sep 17 00:00:00 2001 From: Iain Buclaw Date: Mon, 5 Apr 2021 18:46:18 +0200 Subject: d: Add TARGET_D_HAS_STDCALL_CONVENTION This replaces the use of the D front-end `is64bit' parameter in determining whether to insert the "stdcall" function attribute. It is also used to determine whether `extern(System)' should be the same as `extern(Windows)' in the implementation of Target::systemLinkage. gcc/ChangeLog: * config/i386/i386-d.c (ix86_d_has_stdcall_convention): New function. * config/i386/i386-protos.h (ix86_d_has_stdcall_convention): Declare. * config/i386/i386.h (TARGET_D_HAS_STDCALL_CONVENTION): Define. * doc/tm.texi: Regenerate. * doc/tm.texi.in (D language and ABI): Add @hook for TARGET_D_HAS_STDCALL_CONVENTION. gcc/d/ChangeLog: * d-target.cc (Target::systemLinkage): Return LINKwindows if d_has_stdcall_convention applies to LINKsystem. * d-target.def (d_has_stdcall_convention): New hook. * types.cc (TypeVisitor::visit (TypeFunction *)): Insert "stdcall" function attribute if d_has_stdcall_convention applies to LINKwindows. --- gcc/config/i386/i386-d.c | 20 ++++++++++++++++++++ gcc/config/i386/i386-protos.h | 1 + gcc/config/i386/i386.h | 3 ++- 3 files changed, 23 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-d.c b/gcc/config/i386/i386-d.c index b79be85..58b4790 100644 --- a/gcc/config/i386/i386-d.c +++ b/gcc/config/i386/i386-d.c @@ -44,3 +44,23 @@ ix86_d_target_versions (void) else d_add_builtin_version ("D_SoftFloat"); } + +/* Implement TARGET_D_HAS_STDCALL_CONVENTION for x86 targets. */ + +bool +ix86_d_has_stdcall_convention (unsigned int *link_system, + unsigned int *link_windows) +{ + if (ix86_abi == MS_ABI) + { + *link_system = 1; + *link_windows = (!TARGET_64BIT) ? 
1 : 0; + } + else + { + *link_system = 0; + *link_windows = 0; + } + + return true; +} diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 9f8a69e..acfb9f5 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -264,6 +264,7 @@ extern void ix86_register_pragmas (void); /* In i386-d.c */ extern void ix86_d_target_versions (void); +extern bool ix86_d_has_stdcall_convention (unsigned int *, unsigned int *); /* In winnt.c */ extern void i386_pe_unique_section (tree, int); diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index fab1b3c..4b525d2 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -802,8 +802,9 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); /* Target Pragmas. */ #define REGISTER_TARGET_PRAGMAS() ix86_register_pragmas () -/* Target CPU versions for D. */ +/* Target hooks for D language. */ #define TARGET_D_CPU_VERSIONS ix86_d_target_versions +#define TARGET_D_HAS_STDCALL_CONVENTION ix86_d_has_stdcall_convention #ifndef CC1_SPEC #define CC1_SPEC "%(cc1_cpu) " -- cgit v1.1 From 3785d2b207f1958f31a79fbbb5705b261551950d Mon Sep 17 00:00:00 2001 From: Iain Buclaw Date: Mon, 5 Apr 2021 19:37:31 +0200 Subject: d: Add TARGET_D_REGISTER_CPU_TARGET_INFO This implements `__traits(getTargetInfo, "floatAbi")' for all targets that have D support files. gcc/ChangeLog: * config/aarch64/aarch64-d.c (aarch64_d_handle_target_float_abi): New function. (aarch64_d_register_target_info): New function. * config/aarch64/aarch64-protos.h (aarch64_d_register_target_info): Declare. * config/aarch64/aarch64.h (TARGET_D_REGISTER_CPU_TARGET_INFO): Define. * config/arm/arm-d.c (arm_d_handle_target_float_abi): New function. (arm_d_register_target_info): New function. * config/arm/arm-protos.h (arm_d_register_target_info): Declare. * config/arm/arm.h (TARGET_D_REGISTER_CPU_TARGET_INFO): Define. * config/i386/i386-d.c (ix86_d_handle_target_float_abi): New function. (ix86_d_register_target_info): New function. * config/i386/i386-protos.h (ix86_d_register_target_info): Declare. * config/i386/i386.h (TARGET_D_REGISTER_CPU_TARGET_INFO): Define. * config/mips/mips-d.c (mips_d_handle_target_float_abi): New function. (mips_d_register_target_info): New function. * config/mips/mips-protos.h (mips_d_register_target_info): Declare. * config/mips/mips.h (TARGET_D_REGISTER_CPU_TARGET_INFO): Define. * config/pa/pa-d.c (pa_d_handle_target_float_abi): New function. (pa_d_register_target_info): New function. * config/pa/pa-protos.h (pa_d_register_target_info): Declare. * config/pa/pa.h (TARGET_D_REGISTER_CPU_TARGET_INFO): Define. * config/riscv/riscv-d.c (riscv_d_handle_target_float_abi): New function. (riscv_d_register_target_info): New function. * config/riscv/riscv-protos.h (riscv_d_register_target_info): Declare. * config/riscv/riscv.h (TARGET_D_REGISTER_CPU_TARGET_INFO): Define. * config/rs6000/rs6000-d.c (rs6000_d_handle_target_float_abi): New function. (rs6000_d_register_target_info): New function. * config/rs6000/rs6000-protos.h (rs6000_d_register_target_info): Declare. * config/rs6000/rs6000.h (TARGET_D_REGISTER_CPU_TARGET_INFO): Define. * config/s390/s390-d.c (s390_d_handle_target_float_abi): New function. (s390_d_register_target_info): New function. * config/s390/s390-protos.h (s390_d_register_target_info): Declare. * config/s390/s390.h (TARGET_D_REGISTER_CPU_TARGET_INFO): Define. * config/sparc/sparc-d.c (sparc_d_handle_target_float_abi): New function. (sparc_d_register_target_info): New function. 
* config/sparc/sparc-protos.h (sparc_d_register_target_info): Declare. * config/sparc/sparc.h (TARGET_D_REGISTER_CPU_TARGET_INFO): Define. * doc/tm.texi: Regenerate. * doc/tm.texi.in (D language and ABI): Add @hook for TARGET_D_REGISTER_CPU_TARGET_INFO. gcc/d/ChangeLog: * d-target.cc (Target::_init): Call new targetdm hook to register CPU specific target info keys. * d-target.def (d_register_cpu_target_info): New hook. --- gcc/config/aarch64/aarch64-d.c | 23 +++++++++++++++++++ gcc/config/aarch64/aarch64-protos.h | 1 + gcc/config/aarch64/aarch64.h | 3 ++- gcc/config/arm/arm-d.c | 42 +++++++++++++++++++++++++++++++++ gcc/config/arm/arm-protos.h | 1 + gcc/config/arm/arm.h | 3 ++- gcc/config/i386/i386-d.c | 28 ++++++++++++++++++++++ gcc/config/i386/i386-protos.h | 1 + gcc/config/i386/i386.h | 1 + gcc/config/mips/mips-d.c | 30 ++++++++++++++++++++++++ gcc/config/mips/mips-protos.h | 1 + gcc/config/mips/mips.h | 3 ++- gcc/config/pa/pa-d.c | 28 ++++++++++++++++++++++ gcc/config/pa/pa-protos.h | 1 + gcc/config/pa/pa.h | 3 ++- gcc/config/riscv/riscv-d.c | 46 +++++++++++++++++++++++++++++++++++++ gcc/config/riscv/riscv-protos.h | 1 + gcc/config/riscv/riscv.h | 3 ++- gcc/config/rs6000/rs6000-d.c | 30 ++++++++++++++++++++++++ gcc/config/rs6000/rs6000-protos.h | 1 + gcc/config/rs6000/rs6000.h | 3 ++- gcc/config/s390/s390-d.c | 30 ++++++++++++++++++++++++ gcc/config/s390/s390-protos.h | 1 + gcc/config/s390/s390.h | 3 ++- gcc/config/sparc/sparc-d.c | 28 ++++++++++++++++++++++ gcc/config/sparc/sparc-protos.h | 1 + gcc/config/sparc/sparc.h | 3 ++- 27 files changed, 311 insertions(+), 8 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-d.c b/gcc/config/aarch64/aarch64-d.c index 4fce593..416bb7c 100644 --- a/gcc/config/aarch64/aarch64-d.c +++ b/gcc/config/aarch64/aarch64-d.c @@ -31,3 +31,26 @@ aarch64_d_target_versions (void) d_add_builtin_version ("AArch64"); d_add_builtin_version ("D_HardFloat"); } + +/* Handle a call to `__traits(getTargetInfo, "floatAbi")'. */ + +static tree +aarch64_d_handle_target_float_abi (void) +{ + const char *abi = "hard"; + + return build_string_literal (strlen (abi) + 1, abi); +} + +/* Implement TARGET_D_REGISTER_CPU_TARGET_INFO. */ + +void +aarch64_d_register_target_info (void) +{ + const struct d_target_info_spec handlers[] = { + { "floatAbi", aarch64_d_handle_target_float_abi }, + { NULL, NULL }, + }; + + d_add_target_info_handlers (handlers); +} diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index d5d5417..c203338 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -1011,6 +1011,7 @@ std::string aarch64_get_extension_string_for_isa_flags (uint64_t, uint64_t); /* Defined in aarch64-d.c */ extern void aarch64_d_target_versions (void); +extern void aarch64_d_register_target_info (void); rtl_opt_pass *make_pass_fma_steering (gcc::context *); rtl_opt_pass *make_pass_track_speculation (gcc::context *); diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h index d0bae61..bfffbcd 100644 --- a/gcc/config/aarch64/aarch64.h +++ b/gcc/config/aarch64/aarch64.h @@ -26,8 +26,9 @@ #define TARGET_CPU_CPP_BUILTINS() \ aarch64_cpu_cpp_builtins (pfile) -/* Target CPU versions for D. */ +/* Target hooks for D language. 
*/ #define TARGET_D_CPU_VERSIONS aarch64_d_target_versions +#define TARGET_D_REGISTER_CPU_TARGET_INFO aarch64_d_register_target_info diff --git a/gcc/config/arm/arm-d.c b/gcc/config/arm/arm-d.c index 2cb9f4b..5f43ef9 100644 --- a/gcc/config/arm/arm-d.c +++ b/gcc/config/arm/arm-d.c @@ -53,3 +53,45 @@ arm_d_target_versions (void) else if (TARGET_HARD_FLOAT) d_add_builtin_version ("D_HardFloat"); } + +/* Handle a call to `__traits(getTargetInfo, "floatAbi")'. */ + +static tree +arm_d_handle_target_float_abi (void) +{ + const char *abi; + + switch (arm_float_abi) + { + case ARM_FLOAT_ABI_HARD: + abi = "hard"; + break; + + case ARM_FLOAT_ABI_SOFT: + abi = "soft"; + break; + + case ARM_FLOAT_ABI_SOFTFP: + abi = "softfp"; + break; + + default: + abi = ""; + break; + } + + return build_string_literal (strlen (abi) + 1, abi); +} + +/* Implement TARGET_D_REGISTER_CPU_TARGET_INFO. */ + +void +arm_d_register_target_info (void) +{ + const struct d_target_info_spec handlers[] = { + { "floatAbi", arm_d_handle_target_float_abi }, + { NULL, NULL }, + }; + + d_add_target_info_handlers (handlers); +} diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h index 952a825..2521541 100644 --- a/gcc/config/arm/arm-protos.h +++ b/gcc/config/arm/arm-protos.h @@ -397,6 +397,7 @@ extern void arm_cpu_cpp_builtins (struct cpp_reader *); /* Defined in arm-d.c */ extern void arm_d_target_versions (void); +extern void arm_d_register_target_info (void); extern bool arm_is_constant_pool_ref (rtx); diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h index 113c015..c70af57 100644 --- a/gcc/config/arm/arm.h +++ b/gcc/config/arm/arm.h @@ -47,8 +47,9 @@ extern char arm_arch_name[]; /* Target CPU builtins. */ #define TARGET_CPU_CPP_BUILTINS() arm_cpu_cpp_builtins (pfile) -/* Target CPU versions for D. */ +/* Target hooks for D language. */ #define TARGET_D_CPU_VERSIONS arm_d_target_versions +#define TARGET_D_REGISTER_CPU_TARGET_INFO arm_d_register_target_info #include "config/arm/arm-opts.h" diff --git a/gcc/config/i386/i386-d.c b/gcc/config/i386/i386-d.c index 58b4790..da5958c 100644 --- a/gcc/config/i386/i386-d.c +++ b/gcc/config/i386/i386-d.c @@ -45,6 +45,34 @@ ix86_d_target_versions (void) d_add_builtin_version ("D_SoftFloat"); } +/* Handle a call to `__traits(getTargetInfo, "floatAbi")'. */ + +static tree +ix86_d_handle_target_float_abi (void) +{ + const char *abi; + + if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387)) + abi = "soft"; + else + abi = "hard"; + + return build_string_literal (strlen (abi) + 1, abi); +} + +/* Implement TARGET_D_REGISTER_CPU_TARGET_INFO. */ + +void +ix86_d_register_target_info (void) +{ + const struct d_target_info_spec handlers[] = { + { "floatAbi", ix86_d_handle_target_float_abi }, + { NULL, NULL }, + }; + + d_add_target_info_handlers (handlers); +} + /* Implement TARGET_D_HAS_STDCALL_CONVENTION for x86 targets. 
*/ bool diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index acfb9f5..7782cf11 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -264,6 +264,7 @@ extern void ix86_register_pragmas (void); /* In i386-d.c */ extern void ix86_d_target_versions (void); +extern void ix86_d_register_target_info (void); extern bool ix86_d_has_stdcall_convention (unsigned int *, unsigned int *); /* In winnt.c */ diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 4b525d2..97700d7 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -804,6 +804,7 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); /* Target hooks for D language. */ #define TARGET_D_CPU_VERSIONS ix86_d_target_versions +#define TARGET_D_REGISTER_CPU_TARGET_INFO ix86_d_register_target_info #define TARGET_D_HAS_STDCALL_CONVENTION ix86_d_has_stdcall_convention #ifndef CC1_SPEC diff --git a/gcc/config/mips/mips-d.c b/gcc/config/mips/mips-d.c index dc57127..e03f486 100644 --- a/gcc/config/mips/mips-d.c +++ b/gcc/config/mips/mips-d.c @@ -56,3 +56,33 @@ mips_d_target_versions (void) d_add_builtin_version ("D_SoftFloat"); } } + +/* Handle a call to `__traits(getTargetInfo, "floatAbi")'. */ + +static tree +mips_d_handle_target_float_abi (void) +{ + const char *abi; + + if (TARGET_HARD_FLOAT_ABI) + abi = "hard"; + else if (TARGET_SOFT_FLOAT_ABI) + abi = "soft"; + else + abi = ""; + + return build_string_literal (strlen (abi) + 1, abi); +} + +/* Implement TARGET_D_REGISTER_CPU_TARGET_INFO. */ + +void +mips_d_register_target_info (void) +{ + const struct d_target_info_spec handlers[] = { + { "floatAbi", mips_d_handle_target_float_abi }, + { NULL, NULL }, + }; + + d_add_target_info_handlers (handlers); +} diff --git a/gcc/config/mips/mips-protos.h b/gcc/config/mips/mips-protos.h index 72bbbe2..2cf4ed5 100644 --- a/gcc/config/mips/mips-protos.h +++ b/gcc/config/mips/mips-protos.h @@ -388,5 +388,6 @@ extern void mips_expand_vec_cond_expr (machine_mode, machine_mode, rtx *); /* Routines implemented in mips-d.c */ extern void mips_d_target_versions (void); +extern void mips_d_register_target_info (void); #endif /* ! GCC_MIPS_PROTOS_H */ diff --git a/gcc/config/mips/mips.h b/gcc/config/mips/mips.h index b4a60a5..47aac9d 100644 --- a/gcc/config/mips/mips.h +++ b/gcc/config/mips/mips.h @@ -658,8 +658,9 @@ struct mips_cpu_info { } \ while (0) -/* Target CPU versions for D. */ +/* Target hooks for D language. */ #define TARGET_D_CPU_VERSIONS mips_d_target_versions +#define TARGET_D_REGISTER_CPU_TARGET_INFO mips_d_register_target_info /* Default target_flags if no switches are specified */ diff --git a/gcc/config/pa/pa-d.c b/gcc/config/pa/pa-d.c index 663e749..41b2f18 100644 --- a/gcc/config/pa/pa-d.c +++ b/gcc/config/pa/pa-d.c @@ -39,3 +39,31 @@ pa_d_target_versions (void) else d_add_builtin_version ("D_HardFloat"); } + +/* Handle a call to `__traits(getTargetInfo, "floatAbi")'. */ + +static tree +pa_d_handle_target_float_abi (void) +{ + const char *abi; + + if (TARGET_DISABLE_FPREGS || TARGET_SOFT_FLOAT) + abi = "soft"; + else + abi = "hard"; + + return build_string_literal (strlen (abi) + 1, abi); +} + +/* Implement TARGET_D_REGISTER_CPU_TARGET_INFO. 
*/ + +void +pa_d_register_target_info (void) +{ + const struct d_target_info_spec handlers[] = { + { "floatAbi", pa_d_handle_target_float_abi }, + { NULL, NULL }, + }; + + d_add_target_info_handlers (handlers); +} diff --git a/gcc/config/pa/pa-protos.h b/gcc/config/pa/pa-protos.h index 0e1e471..5bf6fef 100644 --- a/gcc/config/pa/pa-protos.h +++ b/gcc/config/pa/pa-protos.h @@ -115,3 +115,4 @@ extern const int pa_magic_milli[]; /* Routines implemented in pa-d.c */ extern void pa_d_target_versions (void); +extern void pa_d_register_target_info (void); diff --git a/gcc/config/pa/pa.h b/gcc/config/pa/pa.h index 3ec015a..fbb9604 100644 --- a/gcc/config/pa/pa.h +++ b/gcc/config/pa/pa.h @@ -1302,8 +1302,9 @@ do { \ #define NEED_INDICATE_EXEC_STACK 0 -/* Target CPU versions for D. */ +/* Target hooks for D language. */ #define TARGET_D_CPU_VERSIONS pa_d_target_versions +#define TARGET_D_REGISTER_CPU_TARGET_INFO pa_d_register_target_info /* Output default function prologue for hpux. */ #define TARGET_ASM_FUNCTION_PROLOGUE pa_output_function_prologue diff --git a/gcc/config/riscv/riscv-d.c b/gcc/config/riscv/riscv-d.c index b20b778..8883cec 100644 --- a/gcc/config/riscv/riscv-d.c +++ b/gcc/config/riscv/riscv-d.c @@ -39,3 +39,49 @@ riscv_d_target_versions (void) else d_add_builtin_version ("D_SoftFloat"); } + +/* Handle a call to `__traits(getTargetInfo, "floatAbi")'. */ + +static tree +riscv_d_handle_target_float_abi (void) +{ + const char *abi; + + switch (riscv_abi) + { + case ABI_ILP32E: + case ABI_ILP32: + case ABI_LP64: + abi = "soft"; + break; + + case ABI_ILP32F: + case ABI_LP64F: + abi = "single"; + break; + + case ABI_ILP32D: + case ABI_LP64D: + abi = "double"; + break; + + default: + abi = ""; + break; + } + + return build_string_literal (strlen (abi) + 1, abi); +} + +/* Implement TARGET_D_REGISTER_CPU_TARGET_INFO. */ + +void +riscv_d_register_target_info (void) +{ + const struct d_target_info_spec handlers[] = { + { "floatAbi", riscv_d_handle_target_float_abi }, + { NULL, NULL }, + }; + + d_add_target_info_handlers (handlers); +} diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index cc0be7e..43d7224 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -80,6 +80,7 @@ void riscv_cpu_cpp_builtins (cpp_reader *); /* Routines implemented in riscv-d.c */ extern void riscv_d_target_versions (void); +extern void riscv_d_register_target_info (void); /* Routines implemented in riscv-builtins.c. */ extern void riscv_atomic_assign_expand_fenv (tree *, tree *, tree *); diff --git a/gcc/config/riscv/riscv.h b/gcc/config/riscv/riscv.h index 3cc3e86..d17096e 100644 --- a/gcc/config/riscv/riscv.h +++ b/gcc/config/riscv/riscv.h @@ -27,8 +27,9 @@ along with GCC; see the file COPYING3. If not see /* Target CPU builtins. */ #define TARGET_CPU_CPP_BUILTINS() riscv_cpu_cpp_builtins (pfile) -/* Target CPU versions for D. */ +/* Target hooks for D language. */ #define TARGET_D_CPU_VERSIONS riscv_d_target_versions +#define TARGET_D_REGISTER_CPU_TARGET_INFO riscv_d_register_target_info #ifdef TARGET_BIG_ENDIAN_DEFAULT #define DEFAULT_ENDIAN_SPEC "b" diff --git a/gcc/config/rs6000/rs6000-d.c b/gcc/config/rs6000/rs6000-d.c index 6bfe813..755de42 100644 --- a/gcc/config/rs6000/rs6000-d.c +++ b/gcc/config/rs6000/rs6000-d.c @@ -45,3 +45,33 @@ rs6000_d_target_versions (void) d_add_builtin_version ("D_SoftFloat"); } } + +/* Handle a call to `__traits(getTargetInfo, "floatAbi")'. 
*/ + +static tree +rs6000_d_handle_target_float_abi (void) +{ + const char *abi; + + if (TARGET_HARD_FLOAT) + abi = "hard"; + else if (TARGET_SOFT_FLOAT) + abi = "soft"; + else + abi = ""; + + return build_string_literal (strlen (abi) + 1, abi); +} + +/* Implement TARGET_D_REGISTER_CPU_TARGET_INFO. */ + +void +rs6000_d_register_target_info (void) +{ + const struct d_target_info_spec handlers[] = { + { "floatAbi", rs6000_d_handle_target_float_abi }, + { NULL, NULL }, + }; + + d_add_target_info_handlers (handlers); +} diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h index c44fd3d..a06a147 100644 --- a/gcc/config/rs6000/rs6000-protos.h +++ b/gcc/config/rs6000/rs6000-protos.h @@ -298,6 +298,7 @@ extern void (*rs6000_target_modify_macros_ptr) (bool, HOST_WIDE_INT, /* Declare functions in rs6000-d.c */ extern void rs6000_d_target_versions (void); +extern void rs6000_d_register_target_info (void); #ifdef NO_DOLLAR_IN_LABEL const char * rs6000_xcoff_strip_dollar (const char *); diff --git a/gcc/config/rs6000/rs6000.h b/gcc/config/rs6000/rs6000.h index 233a92b..164d359 100644 --- a/gcc/config/rs6000/rs6000.h +++ b/gcc/config/rs6000/rs6000.h @@ -641,8 +641,9 @@ extern unsigned char rs6000_recip_bits[]; #define TARGET_CPU_CPP_BUILTINS() \ rs6000_cpu_cpp_builtins (pfile) -/* Target CPU versions for D. */ +/* Target hooks for D language. */ #define TARGET_D_CPU_VERSIONS rs6000_d_target_versions +#define TARGET_D_REGISTER_CPU_TARGET_INFO rs6000_d_register_target_info /* This is used by rs6000_cpu_cpp_builtins to indicate the byte order we're compiling for. Some configurations may need to override it. */ diff --git a/gcc/config/s390/s390-d.c b/gcc/config/s390/s390-d.c index 2f945eb..1a99063 100644 --- a/gcc/config/s390/s390-d.c +++ b/gcc/config/s390/s390-d.c @@ -41,3 +41,33 @@ s390_d_target_versions (void) else if (TARGET_HARD_FLOAT) d_add_builtin_version ("D_HardFloat"); } + +/* Handle a call to `__traits(getTargetInfo, "floatAbi")'. */ + +static tree +s390_d_handle_target_float_abi (void) +{ + const char *abi; + + if (TARGET_HARD_FLOAT) + abi = "hard"; + else if (TARGET_SOFT_FLOAT) + abi = "soft"; + else + abi = ""; + + return build_string_literal (strlen (abi) + 1, abi); +} + +/* Implement TARGET_D_REGISTER_CPU_TARGET_INFO. */ + +void +s390_d_register_target_info (void) +{ + const struct d_target_info_spec handlers[] = { + { "floatAbi", s390_d_handle_target_float_abi }, + { NULL, NULL }, + }; + + d_add_target_info_handlers (handlers); +} diff --git a/gcc/config/s390/s390-protos.h b/gcc/config/s390/s390-protos.h index acbdf66..289e018 100644 --- a/gcc/config/s390/s390-protos.h +++ b/gcc/config/s390/s390-protos.h @@ -173,6 +173,7 @@ extern bool s390_const_operand_ok (tree, int, int, tree); /* s390-d.c routines */ extern void s390_d_target_versions (void); +extern void s390_d_register_target_info (void); /* Pass management. */ namespace gcc { class context; } diff --git a/gcc/config/s390/s390.h b/gcc/config/s390/s390.h index 991af96..3b87616 100644 --- a/gcc/config/s390/s390.h +++ b/gcc/config/s390/s390.h @@ -247,8 +247,9 @@ enum processor_flags /* Target CPU builtins. */ #define TARGET_CPU_CPP_BUILTINS() s390_cpu_cpp_builtins (pfile) -/* Target CPU versions for D. */ +/* Target hooks for D language. 
*/ #define TARGET_D_CPU_VERSIONS s390_d_target_versions +#define TARGET_D_REGISTER_CPU_TARGET_INFO s390_d_register_target_info #ifdef DEFAULT_TARGET_64BIT #define TARGET_DEFAULT (MASK_64BIT | MASK_ZARCH | MASK_HARD_DFP \ diff --git a/gcc/config/sparc/sparc-d.c b/gcc/config/sparc/sparc-d.c index 0eb663b..cfb8dae 100644 --- a/gcc/config/sparc/sparc-d.c +++ b/gcc/config/sparc/sparc-d.c @@ -48,3 +48,31 @@ sparc_d_target_versions (void) d_add_builtin_version ("SPARC_SoftFloat"); } } + +/* Handle a call to `__traits(getTargetInfo, "floatAbi")'. */ + +static tree +sparc_d_handle_target_float_abi (void) +{ + const char *abi; + + if (TARGET_FPU) + abi = "hard"; + else + abi = "soft"; + + return build_string_literal (strlen (abi) + 1, abi); +} + +/* Implement TARGET_D_REGISTER_CPU_TARGET_INFO. */ + +void +sparc_d_register_target_info (void) +{ + const struct d_target_info_spec handlers[] = { + { "floatAbi", sparc_d_handle_target_float_abi }, + { NULL, NULL }, + }; + + d_add_target_info_handlers (handlers); +} diff --git a/gcc/config/sparc/sparc-protos.h b/gcc/config/sparc/sparc-protos.h index ef94d4f..ad875cc 100644 --- a/gcc/config/sparc/sparc-protos.h +++ b/gcc/config/sparc/sparc-protos.h @@ -113,5 +113,6 @@ extern rtl_opt_pass *make_pass_work_around_errata (gcc::context *); /* Routines implemented in sparc-d.c */ extern void sparc_d_target_versions (void); +extern void sparc_d_register_target_info (void); #endif /* __SPARC_PROTOS_H__ */ diff --git a/gcc/config/sparc/sparc.h b/gcc/config/sparc/sparc.h index cec2f5a..4834575 100644 --- a/gcc/config/sparc/sparc.h +++ b/gcc/config/sparc/sparc.h @@ -27,8 +27,9 @@ along with GCC; see the file COPYING3. If not see #define TARGET_CPU_CPP_BUILTINS() sparc_target_macros () -/* Target CPU versions for D. */ +/* Target hooks for D language. */ #define TARGET_D_CPU_VERSIONS sparc_d_target_versions +#define TARGET_D_REGISTER_CPU_TARGET_INFO sparc_d_register_target_info /* Specify this in a cover file to provide bi-architecture (32/64) support. */ /* #define SPARC_BI_ARCH */ -- cgit v1.1 From 3191c1f4488d1f7563b563d7ae2a102a26f16d82 Mon Sep 17 00:00:00 2001 From: Andreas Krebbel Date: Wed, 14 Apr 2021 16:07:17 +0200 Subject: IBM Z: Fix error checking for immediate builtin operands This fixes the error checking for two of the vector builtins which accept irregular (e.g. non-contiguous) ranges of values. gcc/ChangeLog: * config/s390/s390-builtins.def (O_M5, O_M12, ...): Add new macros for mask operand types. (s390_vec_permi_s64, s390_vec_permi_b64, s390_vec_permi_u64) (s390_vec_permi_dbl, s390_vpdi): Use the M5 type for the immediate operand. (s390_vec_msum_u128, s390_vmslg): Use the M12 type for the immediate operand. * config/s390/s390.c (s390_const_operand_ok): Check the new operand types and generate a list of valid values. gcc/testsuite/ChangeLog: * gcc.target/s390/zvector/imm-range-error-1.c: New test. * gcc.target/s390/zvector/vec_msum_u128-1.c: New test.
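The new O_M5/O_M12 operand types accept a constant only if it "matches the bitmask", i.e. only if no bit outside the mask is set. The standalone sketch below (illustrative only, not part of the patch or its testcases) enumerates the accepted values the same way the new code in s390_const_operand_ok does:

#include <stdio.h>

/* A constant C is accepted for an O_M<mask> operand iff (C & ~mask) == 0.  */
static void
print_valid (unsigned int bitmask)
{
  for (unsigned int i = 0; i <= bitmask; i++)
    if ((i & ~bitmask) == 0)
      printf ("%u ", i);
  printf ("\n");
}

int
main (void)
{
  print_valid (5);   /* 0 1 4 5: the values accepted for O_M5 (vpdi).  */
  print_valid (12);  /* 0 4 8 12: the values accepted for O_M12 (vmslg).  */
  return 0;
}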
--- gcc/config/s390/s390-builtins.def | 85 ++++++++++++++++++++++++++------------- gcc/config/s390/s390.c | 35 +++++++++++++--- 2 files changed, 85 insertions(+), 35 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/s390/s390-builtins.def b/gcc/config/s390/s390-builtins.def index 129d712..f77ab75 100644 --- a/gcc/config/s390/s390-builtins.def +++ b/gcc/config/s390/s390-builtins.def @@ -29,6 +29,9 @@ #undef O_U16 #undef O_U32 +#undef O_M5 +#undef O_M12 + #undef O_S2 #undef O_S3 #undef O_S4 @@ -37,6 +40,7 @@ #undef O_S12 #undef O_S16 #undef O_S32 + #undef O_ELEM #undef O_LIT @@ -85,6 +89,16 @@ #undef O3_U32 #undef O4_U32 +#undef O1_M5 +#undef O2_M5 +#undef O3_M5 +#undef O4_M5 + +#undef O1_M12 +#undef O2_M12 +#undef O3_M12 +#undef O4_M12 + #undef O1_S2 #undef O2_S2 #undef O3_S2 @@ -140,31 +154,34 @@ #undef O_UIMM_P #undef O_SIMM_P -#define O_U1 1 /* unsigned 1 bit literal */ -#define O_U2 2 /* unsigned 2 bit literal */ -#define O_U3 3 /* unsigned 3 bit literal */ -#define O_U4 4 /* unsigned 4 bit literal */ -#define O_U5 5 /* unsigned 5 bit literal */ -#define O_U8 6 /* unsigned 8 bit literal */ -#define O_U12 7 /* unsigned 16 bit literal */ -#define O_U16 8 /* unsigned 16 bit literal */ -#define O_U32 9 /* unsigned 32 bit literal */ - -#define O_S2 10 /* signed 2 bit literal */ -#define O_S3 11 /* signed 3 bit literal */ -#define O_S4 12 /* signed 4 bit literal */ -#define O_S5 13 /* signed 5 bit literal */ -#define O_S8 14 /* signed 8 bit literal */ -#define O_S12 15 /* signed 12 bit literal */ -#define O_S16 16 /* signed 16 bit literal */ -#define O_S32 17 /* signed 32 bit literal */ - -#define O_ELEM 18 /* Element selector requiring modulo arithmetic. */ -#define O_LIT 19 /* Operand must be a literal fitting the target type. */ +#define O_U1 1 /* unsigned 1 bit literal */ +#define O_U2 2 /* unsigned 2 bit literal */ +#define O_U3 3 /* unsigned 3 bit literal */ +#define O_U4 4 /* unsigned 4 bit literal */ +#define O_U5 5 /* unsigned 5 bit literal */ +#define O_U8 6 /* unsigned 8 bit literal */ +#define O_U12 7 /* unsigned 16 bit literal */ +#define O_U16 8 /* unsigned 16 bit literal */ +#define O_U32 9 /* unsigned 32 bit literal */ + +#define O_M5 10 /* matches bitmask of 5 */ +#define O_M12 11 /* matches bitmask of 12 */ + +#define O_S2 12 /* signed 2 bit literal */ +#define O_S3 13 /* signed 3 bit literal */ +#define O_S4 14 /* signed 4 bit literal */ +#define O_S5 15 /* signed 5 bit literal */ +#define O_S8 16 /* signed 8 bit literal */ +#define O_S12 17 /* signed 12 bit literal */ +#define O_S16 18 /* signed 16 bit literal */ +#define O_S32 19 /* signed 32 bit literal */ + +#define O_ELEM 20 /* Element selector requiring modulo arithmetic. */ +#define O_LIT 21 /* Operand must be a literal fitting the target type. 
*/ #define O_SHIFT 5 -#define O_UIMM_P(X) ((X) >= O_U1 && (X) <= O_U32) +#define O_UIMM_P(X) ((X) >= O_U1 && (X) <= O_M12) #define O_SIMM_P(X) ((X) >= O_S2 && (X) <= O_S32) #define O_IMM_P(X) ((X) == O_LIT || ((X) >= O_U1 && (X) <= O_S32)) @@ -213,6 +230,16 @@ #define O3_U32 (O_U32 << (2 * O_SHIFT)) #define O4_U32 (O_U32 << (3 * O_SHIFT)) +#define O1_M5 O_M5 +#define O2_M5 (O_M5 << O_SHIFT) +#define O3_M5 (O_M5 << (2 * O_SHIFT)) +#define O4_M5 (O_M5 << (3 * O_SHIFT)) + +#define O1_M12 O_M12 +#define O2_M12 (O_M12 << O_SHIFT) +#define O3_M12 (O_M12 << (2 * O_SHIFT)) +#define O4_M12 (O_M12 << (3 * O_SHIFT)) + #define O1_S2 O_S2 #define O2_S2 (O_S2 << O_SHIFT) @@ -644,12 +671,12 @@ OB_DEF_VAR (s390_vec_perm_dbl, s390_vperm, 0, B_DEF (s390_vperm, vec_permv16qi, 0, B_VX, 0, BT_FN_UV16QI_UV16QI_UV16QI_UV16QI) OB_DEF (s390_vec_permi, s390_vec_permi_s64, s390_vec_permi_dbl, B_VX, BT_FN_OV4SI_OV4SI_OV4SI_INT) -OB_DEF_VAR (s390_vec_permi_s64, s390_vpdi, 0, O3_U2, BT_OV_V2DI_V2DI_V2DI_INT) -OB_DEF_VAR (s390_vec_permi_b64, s390_vpdi, 0, O3_U2, BT_OV_BV2DI_BV2DI_BV2DI_INT) -OB_DEF_VAR (s390_vec_permi_u64, s390_vpdi, 0, O3_U2, BT_OV_UV2DI_UV2DI_UV2DI_INT) -OB_DEF_VAR (s390_vec_permi_dbl, s390_vpdi, 0, O3_U2, BT_OV_V2DF_V2DF_V2DF_INT) +OB_DEF_VAR (s390_vec_permi_s64, s390_vpdi, 0, O3_M5, BT_OV_V2DI_V2DI_V2DI_INT) +OB_DEF_VAR (s390_vec_permi_b64, s390_vpdi, 0, O3_M5, BT_OV_BV2DI_BV2DI_BV2DI_INT) +OB_DEF_VAR (s390_vec_permi_u64, s390_vpdi, 0, O3_M5, BT_OV_UV2DI_UV2DI_UV2DI_INT) +OB_DEF_VAR (s390_vec_permi_dbl, s390_vpdi, 0, O3_M5, BT_OV_V2DF_V2DF_V2DF_INT) -B_DEF (s390_vpdi, vec_permiv2di, 0, B_VX, O3_U2, BT_FN_UV2DI_UV2DI_UV2DI_INT) +B_DEF (s390_vpdi, vec_permiv2di, 0, B_VX, O3_M5, BT_FN_UV2DI_UV2DI_UV2DI_INT) OB_DEF (s390_vec_splat, s390_vec_splat2_s8, s390_vec_splat2_dbl,B_VX, BT_FN_OV4SI_OV4SI_UCHAR) OB_DEF_VAR (s390_vec_splat2_s8, s390_vrepb, 0, O2_U4, BT_OV_V16QI_V16QI_UCHAR) @@ -2287,8 +2314,8 @@ OB_DEF_VAR (s390_vec_test_mask_dbl, s390_vtm, 0, B_DEF (s390_vtm, vec_test_mask_intv16qi,0, B_VX, 0, BT_FN_INT_UV16QI_UV16QI) -B_DEF (s390_vec_msum_u128, vec_msumv2di, 0, B_VXE, O4_U2, BT_FN_UV16QI_UV2DI_UV2DI_UV16QI_INT) -B_DEF (s390_vmslg, vmslg, 0, B_VXE, O4_U4, BT_FN_INT128_UV2DI_UV2DI_INT128_INT) +B_DEF (s390_vec_msum_u128, vec_msumv2di, 0, B_VXE, O4_M12, BT_FN_UV16QI_UV2DI_UV2DI_UV16QI_INT) +B_DEF (s390_vmslg, vmslg, 0, B_VXE, O4_M12, BT_FN_INT128_UV2DI_UV2DI_INT128_INT) OB_DEF (s390_vec_eqv, s390_vec_eqv_b8, s390_vec_eqv_dbl_c, B_VXE, BT_FN_OV4SI_OV4SI_OV4SI) OB_DEF_VAR (s390_vec_eqv_b8, s390_vnx, 0, 0, BT_OV_BV16QI_BV16QI_BV16QI) diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c index f7b1c03..a9c945c 100644 --- a/gcc/config/s390/s390.c +++ b/gcc/config/s390/s390.c @@ -734,15 +734,38 @@ s390_const_operand_ok (tree arg, int argnum, int op_flags, tree decl) { if (O_UIMM_P (op_flags)) { - int bitwidths[] = { 1, 2, 3, 4, 5, 8, 12, 16, 32 }; - int bitwidth = bitwidths[op_flags - O_U1]; + unsigned HOST_WIDE_INT bitwidths[] = { 1, 2, 3, 4, 5, 8, 12, 16, 32, 4, 4 }; + unsigned HOST_WIDE_INT bitmasks[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 12 }; + unsigned HOST_WIDE_INT bitwidth = bitwidths[op_flags - O_U1]; + unsigned HOST_WIDE_INT bitmask = bitmasks[op_flags - O_U1]; if (!tree_fits_uhwi_p (arg) - || tree_to_uhwi (arg) > (HOST_WIDE_INT_1U << bitwidth) - 1) + || tree_to_uhwi (arg) > (HOST_WIDE_INT_1U << bitwidth) - 1 + || (bitmask && tree_to_uhwi (arg) & ~bitmask)) { - error ("constant argument %d for builtin %qF is out of range " - "(0..%wu)", argnum, decl, - (HOST_WIDE_INT_1U << bitwidth) - 1); + if 
(bitmask) + { + gcc_assert (bitmask < 16); + char values[120] = ""; + + for (unsigned HOST_WIDE_INT i = 0; i <= bitmask; i++) + { + char buf[5]; + if (i & ~bitmask) + continue; + int ret = snprintf (buf, 5, HOST_WIDE_INT_PRINT_UNSIGNED, i & bitmask); + gcc_assert (ret < 5); + strcat (values, buf); + if (i < bitmask) + strcat (values, ", "); + } + error ("constant argument %d for builtin %qF is invalid (%s)", + argnum, decl, values); + } + else + error ("constant argument %d for builtin %qF is out of range (0..%wu)", + argnum, decl, (HOST_WIDE_INT_1U << bitwidth) - 1); + return false; } } -- cgit v1.1 From a065e0bb092a010664777394530ab1a52bb5293b Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Wed, 14 Apr 2021 16:19:46 +0100 Subject: aarch64: Handle more SVE vector constants [PR99246] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR99246 is about a case in which we failed to handle a CONST_VECTOR with NELTS_PER_PATTERN==2, i.e. a vector with a “foreground” sequence of N vectors followed by a repeating “background” sequence of N vectors. At the moment, it's difficult to produce these vectors directly, but I'm hoping that for GCC 12 we'll do more folding, which will in turn make this easier to test and easier to optimise. Until then, the patch simply relies on the testcase in the PR. gcc/ PR target/99246 * config/aarch64/aarch64.c (aarch64_expand_sve_const_vector_sel): New function. (aarch64_expand_sve_const_vector): Use it for nelts_per_pattern==2. gcc/testsuite/ PR target/99246 * gcc.target/aarch64/sve/acle/general/pr99246.c: New test. --- gcc/config/aarch64/aarch64.c | 54 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 6405504..04b55d9 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -5166,6 +5166,56 @@ aarch64_expand_sve_ld1rq (rtx dest, rtx src) return true; } +/* SRC is an SVE CONST_VECTOR that contains N "foreground" values followed + by N "background" values. Try to move it into TARGET using: + + PTRUE PRED., VL + MOV TRUE., # + MOV FALSE., # + SEL TARGET., PRED., TRUE., FALSE. + + The PTRUE is always a single instruction but the MOVs might need a + longer sequence. If the background value is zero (as it often is), + the sequence can sometimes collapse to a PTRUE followed by a + zero-predicated move. + + Return the target on success, otherwise return null. */ + +static rtx +aarch64_expand_sve_const_vector_sel (rtx target, rtx src) +{ + gcc_assert (CONST_VECTOR_NELTS_PER_PATTERN (src) == 2); + + /* Make sure that the PTRUE is valid. 
*/ + machine_mode mode = GET_MODE (src); + machine_mode pred_mode = aarch64_sve_pred_mode (mode); + unsigned int npatterns = CONST_VECTOR_NPATTERNS (src); + if (aarch64_svpattern_for_vl (pred_mode, npatterns) + == AARCH64_NUM_SVPATTERNS) + return NULL_RTX; + + rtx_vector_builder pred_builder (pred_mode, npatterns, 2); + rtx_vector_builder true_builder (mode, npatterns, 1); + rtx_vector_builder false_builder (mode, npatterns, 1); + for (unsigned int i = 0; i < npatterns; ++i) + { + true_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i)); + pred_builder.quick_push (CONST1_RTX (BImode)); + } + for (unsigned int i = 0; i < npatterns; ++i) + { + false_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i + npatterns)); + pred_builder.quick_push (CONST0_RTX (BImode)); + } + expand_operand ops[4]; + create_output_operand (&ops[0], target, mode); + create_input_operand (&ops[1], true_builder.build (), mode); + create_input_operand (&ops[2], false_builder.build (), mode); + create_input_operand (&ops[3], pred_builder.build (), pred_mode); + expand_insn (code_for_vcond_mask (mode, mode), 4, ops); + return target; +} + /* Return a register containing CONST_VECTOR SRC, given that SRC has an SVE data mode and isn't a legitimate constant. Use TARGET for the result if convenient. @@ -5300,6 +5350,10 @@ aarch64_expand_sve_const_vector (rtx target, rtx src) if (GET_MODE_NUNITS (mode).is_constant ()) return NULL_RTX; + if (nelts_per_pattern == 2) + if (rtx res = aarch64_expand_sve_const_vector_sel (target, src)) + return res; + /* Expand each pattern individually. */ gcc_assert (npatterns > 1); rtx_vector_builder builder; -- cgit v1.1 From 39d23b7960e4efb11bbe1eff056ae9da0884c539 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Thu, 15 Apr 2021 10:45:09 +0200 Subject: aarch64: Fix several *_ashl3 related regressions [PR100056] Before combiner added 2 to 2 combinations, the following testcase functions have been all compiled into 2 instructions, zero/sign extensions or and followed by orr with lsl, e.g. for the first function Trying 7 -> 8: 7: r96:SI=r94:SI<<0xb 8: r95:SI=r96:SI|r94:SI REG_DEAD r96:SI REG_DEAD r94:SI Successfully matched this instruction: (set (reg:SI 95) (ior:SI (ashift:SI (reg/v:SI 94 [ i ]) (const_int 11 [0xb])) (reg/v:SI 94 [ i ]))) is the important successful try_combine and so we end up with and w0, w0, 255 orr w0, w0, w0, lsl 11 in the body. With 2 to 2 combination, before that can trigger, another successful combination: Trying 2 -> 7: 2: r94:SI=zero_extend(x0:QI) REG_DEAD x0:QI 7: r96:SI=r94:SI<<0xb is replaced with: (set (reg/v:SI 94 [ i ]) (zero_extend:SI (reg:QI 0 x0 [ i ]))) and (set (reg:SI 96) (and:SI (ashift:SI (reg:SI 0 x0 [ i ]) (const_int 11 [0xb])) (const_int 522240 [0x7f800]))) and in the end results in 3 instructions in the body: and w1, w0, 255 ubfiz w0, w0, 11, 8 orr w0, w0, w1 The following combine splitters help undo that when combiner tries to combine 3 instructions - the zero/sign extend or and, the other insn from the 2 to 2 combination ([us]bfiz) and the logical op, the CPUs don't have an insn to do everything in one op, but we can split it back into the zero/sign extend or and followed by logical with lsl. 2021-04-15 Jakub Jelinek PR target/100056 * config/aarch64/aarch64.md (*_3): Add combine splitters for *_ashl3 with ZERO_EXTEND, SIGN_EXTEND or AND. * gcc.target/aarch64/pr100056.c: New test. 
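A sketch of the affected source shape (assumed; the committed gcc.target/aarch64/pr100056.c testcase is not reproduced in this log):

/* With the splitters below this should again compile to
   "and w0, w0, 255" followed by "orr w0, w0, w0, lsl 11".  */
unsigned int
f1 (unsigned char i)
{
  return ((unsigned int) i << 11) | i;
}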
--- gcc/config/aarch64/aarch64.md | 53 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 9a7ed78..962640b 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -4431,6 +4431,59 @@ [(set_attr "type" "logic_shift_imm")] ) +(define_split + [(set (match_operand:GPI 0 "register_operand") + (LOGICAL:GPI + (and:GPI (ashift:GPI (match_operand:GPI 1 "register_operand") + (match_operand:QI 2 "aarch64_shift_imm_")) + (match_operand:GPI 3 "const_int_operand")) + (zero_extend:GPI (match_operand 4 "register_operand"))))] + "can_create_pseudo_p () + && ((paradoxical_subreg_p (operands[1]) + && rtx_equal_p (SUBREG_REG (operands[1]), operands[4])) + || (REG_P (operands[1]) + && REG_P (operands[4]) + && REGNO (operands[1]) == REGNO (operands[4]))) + && (trunc_int_for_mode (GET_MODE_MASK (GET_MODE (operands[4])) + << INTVAL (operands[2]), mode) + == INTVAL (operands[3]))" + [(set (match_dup 5) (zero_extend:GPI (match_dup 4))) + (set (match_dup 0) (LOGICAL:GPI (ashift:GPI (match_dup 5) (match_dup 2)) + (match_dup 5)))] + "operands[5] = gen_reg_rtx (mode);" +) + +(define_split + [(set (match_operand:GPI 0 "register_operand") + (LOGICAL:GPI + (and:GPI (ashift:GPI (match_operand:GPI 1 "register_operand") + (match_operand:QI 2 "aarch64_shift_imm_")) + (match_operand:GPI 4 "const_int_operand")) + (and:GPI (match_dup 1) (match_operand:GPI 3 "const_int_operand"))))] + "can_create_pseudo_p () + && pow2_or_zerop (UINTVAL (operands[3]) + 1) + && (trunc_int_for_mode (UINTVAL (operands[3]) + << INTVAL (operands[2]), mode) + == INTVAL (operands[4]))" + [(set (match_dup 5) (and:GPI (match_dup 1) (match_dup 3))) + (set (match_dup 0) (LOGICAL:GPI (ashift:GPI (match_dup 5) (match_dup 2)) + (match_dup 5)))] + "operands[5] = gen_reg_rtx (mode);" +) + +(define_split + [(set (match_operand:GPI 0 "register_operand") + (LOGICAL:GPI + (ashift:GPI (sign_extend:GPI (match_operand 1 "register_operand")) + (match_operand:QI 2 "aarch64_shift_imm_")) + (sign_extend:GPI (match_dup 1))))] + "can_create_pseudo_p ()" + [(set (match_dup 3) (sign_extend:GPI (match_dup 1))) + (set (match_dup 0) (LOGICAL:GPI (ashift:GPI (match_dup 3) (match_dup 2)) + (match_dup 3)))] + "operands[3] = gen_reg_rtx (mode);" +) + (define_insn "*_rol3" [(set (match_operand:GPI 0 "register_operand" "=r") (LOGICAL:GPI (rotate:GPI -- cgit v1.1 From 47f42744f6e10ad41db926d739306e6f237fd3ac Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Fri, 16 Apr 2021 13:44:23 +0200 Subject: aarch64: Fix up 2 other combine opt regressions vs. GCC8 [PR100075] The testcase used to be compiled at -O2 by GCC8 and earlier to: f1: neg w1, w0, asr 16 and w1, w1, 65535 orr w0, w1, w0, lsl 16 ret f2: neg w1, w0 extr w0, w1, w0, 16 ret but since GCC9 (r9-3594 for f1 and r9-6926 for f2) we compile it into: f1: mov w1, w0 sbfx x0, x1, 16, 16 neg w0, w0 bfi w0, w1, 16, 16 ret f2: neg w1, w0 sbfx x0, x0, 16, 16 bfi w0, w1, 16, 16 ret instead, i.e. one insn longer each. With this patch we get: f1: mov w1, w0 neg w0, w1, asr 16 bfi w0, w1, 16, 16 ret f2: neg w1, w0 extr w0, w1, w0, 16 ret i.e. identical f2 and same number of insns as in GCC8 in f1. The combiner unfortunately doesn't try splitters when doing 2 -> 1 combination, so it can't be implemented as combine splitters, but it could be implemented as define_insn_and_split if desirable. 
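Source along the following lines should correspond to the f1/f2 output quoted above (assumed sketch; the committed gcc.target/aarch64/pr100075.c testcase is not reproduced in this log):

/* f1 is expected to use "neg w0, w1, asr 16" plus "bfi w0, w1, 16, 16",
   f2 "neg w1, w0" plus "extr w0, w1, w0, 16".  */
unsigned int
f1 (unsigned int x)
{
  return (x << 16) | (-((int) x >> 16) & 0xffff);
}

unsigned int
f2 (unsigned int x)
{
  return (-x << 16) | (x >> 16);
}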
2021-04-16 Jakub Jelinek PR target/100075 * config/aarch64/aarch64.md (*neg_asr_si2_extr, *extrsi5_insn_di): New define_insn patterns. * gcc.target/aarch64/pr100075.c: New test. --- gcc/config/aarch64/aarch64.md | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 962640b..abfd845 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -3572,6 +3572,18 @@ [(set_attr "autodetect_type" "alu_shift__op2")] ) +(define_insn "*neg_asr_si2_extr" + [(set (match_operand:SI 0 "register_operand" "=r") + (neg:SI (match_operator:SI 4 "subreg_lowpart_operator" + [(sign_extract:DI + (match_operand:DI 1 "register_operand" "r") + (match_operand 3 "aarch64_simd_shift_imm_offset_si" "n") + (match_operand 2 "aarch64_simd_shift_imm_offset_si" "n"))])))] + "INTVAL (operands[2]) + INTVAL (operands[3]) == 32" + "neg\\t%w0, %w1, asr %2" + [(set_attr "autodetect_type" "alu_shift_asr_op2")] +) + (define_insn "mul3" [(set (match_operand:GPI 0 "register_operand" "=r") (mult:GPI (match_operand:GPI 1 "register_operand" "r") @@ -5383,6 +5395,22 @@ [(set_attr "type" "rotate_imm")] ) +(define_insn "*extrsi5_insn_di" + [(set (match_operand:SI 0 "register_operand" "=r") + (ior:SI (ashift:SI (match_operand:SI 1 "register_operand" "r") + (match_operand 3 "const_int_operand" "n")) + (match_operator:SI 6 "subreg_lowpart_operator" + [(zero_extract:DI + (match_operand:DI 2 "register_operand" "r") + (match_operand 5 "const_int_operand" "n") + (match_operand 4 "const_int_operand" "n"))])))] + "UINTVAL (operands[3]) < 32 + && UINTVAL (operands[3]) + UINTVAL (operands[4]) == 32 + && INTVAL (operands[3]) == INTVAL (operands[5])" + "extr\\t%w0, %w1, %w2, %4" + [(set_attr "type" "rotate_imm")] +) + (define_insn "*ror3_insn" [(set (match_operand:GPI 0 "register_operand" "=r") (rotate:GPI (match_operand:GPI 1 "register_operand" "r") -- cgit v1.1 From 8535755af70f819d820553b2e73e72a16a984599 Mon Sep 17 00:00:00 2001 From: Tamar Christina Date: Fri, 16 Apr 2021 16:58:50 +0100 Subject: SVE: Fix wrong sve predicate split (PR100048) The attached testcase generates the following paradoxical subregs when creating the predicates. (insn 22 21 23 2 (set (reg:VNx8BI 100) (subreg:VNx8BI (reg:VNx2BI 103) 0)) (expr_list:REG_EQUAL (const_vector:VNx8BI [ (const_int 1 [0x1]) (const_int 0 [0]) (const_int 1 [0x1]) (const_int 0 [0]) repeated x5 ]) (nil))) and (insn 15 14 16 2 (set (reg:VNx8BI 96) (subreg:VNx8BI (reg:VNx2BI 99) 0)) (expr_list:REG_EQUAL (const_vector:VNx8BI [ (const_int 1 [0x1]) (const_int 0 [0]) repeated x7 ]) (nil))) This causes CSE to incorrectly think that the two predicates are equal because some of the significant bits get ignored due to the subreg. The attached patch instead makes it so it always looks at all 16-bits of the predicate, but in turn means we need to generate a TRN that matches the expected result mode. In effect in RTL we keep the mode as VNx16BI but during codegen re-interpret them as the mode the predicate instruction wanted: (insn 10 9 11 2 (set (reg:VNx8BI 96) (subreg:VNx8BI (reg:VNx16BI 99) 0)) (expr_list:REG_EQUAL (const_vector:VNx8BI [ (const_int 1 [0x1]) (const_int 0 [0]) repeated x7 ]) (nil))) Which needed correction to the TRN pattern. A new TRN1_CONV unspec is introduced which allows one to keep the arguments as VNx16BI but encode the instruction as a type of the last operand. 
(insn 9 8 10 2 (set (reg:VNx16BI 99) (unspec:VNx16BI [ (reg:VNx16BI 97) (reg:VNx16BI 98) (reg:VNx2BI 100) ] UNSPEC_TRN1_CONV)) (nil)) This allows us remove all the paradoxical subregs and end up with (insn 16 15 17 2 (set (reg:VNx8BI 101) (subreg:VNx8BI (reg:VNx16BI 104) 0)) (expr_list:REG_EQUAL (const_vector:VNx8BI [ (const_int 1 [0x1]) (const_int 0 [0]) (const_int 1 [0x1]) (const_int 0 [0]) repeated x5 ]) (nil))) gcc/ChangeLog: PR target/100048 * config/aarch64/aarch64-sve.md (@aarch64_sve_trn1_conv): New. * config/aarch64/aarch64.c (aarch64_expand_sve_const_pred_trn): Use new TRN optab. * config/aarch64/iterators.md (UNSPEC_TRN1_CONV): New. gcc/testsuite/ChangeLog: PR target/100048 * gcc.target/aarch64/sve/pr100048.c: New test. --- gcc/config/aarch64/aarch64-sve.md | 14 ++++++++++++++ gcc/config/aarch64/aarch64.c | 10 +++++----- gcc/config/aarch64/iterators.md | 1 + 3 files changed, 20 insertions(+), 5 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index 7db2938..b8b6f55 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -8657,6 +8657,20 @@ "\t%0., %1., %2." ) +;; Special purpose permute used by the predicate generation instructions. +;; Unlike the normal permute patterns, these instructions operate on VNx16BI +;; regardless of the element size, so that all input and output bits are +;; well-defined. Operand 3 then indicates the size of the permute. +(define_insn "@aarch64_sve_trn1_conv" + [(set (match_operand:VNx16BI 0 "register_operand" "=Upa") + (unspec:VNx16BI [(match_operand:VNx16BI 1 "register_operand" "Upa") + (match_operand:VNx16BI 2 "register_operand" "Upa") + (match_operand:PRED_ALL 3 "aarch64_simd_imm_zero")] + UNSPEC_TRN1_CONV))] + "TARGET_SVE" + "trn1\t%0., %1., %2." +) + ;; ========================================================================= ;; == Conversions ;; ========================================================================= diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 04b55d9..09d79f6 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -5535,12 +5535,12 @@ aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder, } } - /* Emit the TRN1 itself. */ + /* Emit the TRN1 itself. We emit a TRN that operates on VNx16BI + operands but permutes them as though they had mode MODE. */ machine_mode mode = aarch64_sve_pred_mode (permute_size).require (); - target = aarch64_target_reg (target, mode); - emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target, - gen_lowpart (mode, a), - gen_lowpart (mode, b))); + target = aarch64_target_reg (target, GET_MODE (a)); + rtx type_reg = CONST0_RTX (mode); + emit_insn (gen_aarch64_sve_trn1_conv (mode, target, a, b, type_reg)); return target; } diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 5f5abd6..cac33ae 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -649,6 +649,7 @@ UNSPEC_UZP2Q ; Used in aarch64-sve.md. UNSPEC_ZIP1Q ; Used in aarch64-sve.md. UNSPEC_ZIP2Q ; Used in aarch64-sve.md. + UNSPEC_TRN1_CONV ; Used in aarch64-sve.md. UNSPEC_COND_CMPEQ_WIDE ; Used in aarch64-sve.md. UNSPEC_COND_CMPGE_WIDE ; Used in aarch64-sve.md. UNSPEC_COND_CMPGT_WIDE ; Used in aarch64-sve.md. 
-- cgit v1.1 From 49813aad3292f7f2bef69206274da78a9a7116ed Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Fri, 16 Apr 2021 20:49:33 +0200 Subject: aarch64: Don't emit -Wpsabi note when ABI was never affected [PR91710] As the following testcase shows, we emit a -Wpsabi note about argument passing change since GCC 9, but in reality the ABI didn't change. The alignment is 8 bits in GCC < 9 and 32 bits in GCC >= 9 and the aarch64_function_arg_alignment returns in that case: return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY); so when both the old and new alignment are smaller or equal to PARM_BOUNDARY (or both are larger than STACK_BOUNDARY, just in theory), even when the new one is bigger, it doesn't change the argument passing. So, the following patch changes aarch64_function_arg_alignment to tell the callers the exact old alignmentm so that they can test it if needed. The other aarch64_function_arg_alignment callers either check the alignment for equality against 16-byte alignment (when old alignment was smaller than that and the new one is 16-byte, we want to emit -Wpsabi in all the cases) or the va_arg case which I think is ok now too. 2021-04-16 Jakub Jelinek PR target/91710 * config/aarch64/aarch64.c (aarch64_function_arg_alignment): Change abi_break argument from bool * to unsigned *, store there the pre-GCC 9 alignment. (aarch64_layout_arg, aarch64_gimplify_va_arg_expr): Adjust callers. (aarch64_function_arg_regno_p): Likewise. Only emit -Wpsabi note if the old and new alignment after applying MIN/MAX to it is different. * gcc.target/aarch64/pr91710.c: New test. --- gcc/config/aarch64/aarch64.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 09d79f6..12625a4 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -6337,9 +6337,9 @@ aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode, static unsigned int aarch64_function_arg_alignment (machine_mode mode, const_tree type, - bool *abi_break) + unsigned int *abi_break) { - *abi_break = false; + *abi_break = 0; if (!type) return GET_MODE_ALIGNMENT (mode); @@ -6381,7 +6381,7 @@ aarch64_function_arg_alignment (machine_mode mode, const_tree type, if (bitfield_alignment > alignment) { - *abi_break = true; + *abi_break = alignment; return bitfield_alignment; } @@ -6403,7 +6403,7 @@ aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg) int ncrn, nvrn, nregs; bool allocate_ncrn, allocate_nvrn; HOST_WIDE_INT size; - bool abi_break; + unsigned int abi_break; /* We need to do this once per argument. 
*/ if (pcum->aapcs_arg_processed) @@ -6721,14 +6721,19 @@ aarch64_function_arg_regno_p (unsigned regno) static unsigned int aarch64_function_arg_boundary (machine_mode mode, const_tree type) { - bool abi_break; + unsigned int abi_break; unsigned int alignment = aarch64_function_arg_alignment (mode, type, &abi_break); + alignment = MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY); if (abi_break & warn_psabi) - inform (input_location, "parameter passing for argument of type " - "%qT changed in GCC 9.1", type); + { + abi_break = MIN (MAX (abi_break, PARM_BOUNDARY), STACK_BOUNDARY); + if (alignment != abi_break) + inform (input_location, "parameter passing for argument of type " + "%qT changed in GCC 9.1", type); + } - return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY); + return alignment; } /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */ @@ -18253,7 +18258,7 @@ aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p, f_stack, NULL_TREE); size = int_size_in_bytes (type); - bool abi_break; + unsigned int abi_break; align = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT; -- cgit v1.1 From d81bc495a426b0020e44a9764fd904462a39983b Mon Sep 17 00:00:00 2001 From: Iain Buclaw Date: Mon, 12 Apr 2021 01:39:08 +0200 Subject: d: Implement __traits(getTargetInfo, "objectFormat") Following on from adding TARGET_D_REGISTER_OS_TARGET_INFO, this adds the required handlers to implement `__traits(getTargetInfo, "objectFormat")' for all platforms that have D support files. Some back-ends (i386, rs6000, and pa) have some awarenes of the what object format they are compiling for, so new getTargetInfo handlers have been have added both to those back-ends as well as platform-specific target files to override the default in the D front-end. gcc/ChangeLog: * config/darwin-d.c (darwin_d_handle_target_object_format): New function. (darwin_d_register_target_info): New function. (TARGET_D_REGISTER_OS_TARGET_INFO): Define. * config/dragonfly-d.c (dragonfly_d_handle_target_object_format): New function. (dragonfly_d_register_target_info): New function. (TARGET_D_REGISTER_OS_TARGET_INFO): Define. * config/freebsd-d.c (freebsd_d_handle_target_object_format): New function. (freebsd_d_register_target_info): New function. (TARGET_D_REGISTER_OS_TARGET_INFO): Define. * config/glibc-d.c (glibc_d_handle_target_object_format): New function. (glibc_d_register_target_info): New function. (TARGET_D_REGISTER_OS_TARGET_INFO): Define. * config/i386/i386-d.c (ix86_d_handle_target_object_format): New function. (ix86_d_register_target_info): Add ix86_d_handle_target_object_format as handler for objectFormat key. * config/i386/winnt-d.c (winnt_d_handle_target_object_format): New function. (winnt_d_register_target_info): New function. (TARGET_D_REGISTER_OS_TARGET_INFO): Define. * config/netbsd-d.c (netbsd_d_handle_target_object_format): New function. (netbsd_d_register_target_info): New function. (TARGET_D_REGISTER_OS_TARGET_INFO): Define. * config/openbsd-d.c (openbsd_d_handle_target_object_format): New function. (openbsd_d_register_target_info): New function. (TARGET_D_REGISTER_OS_TARGET_INFO): Define. * config/pa/pa-d.c (pa_d_handle_target_object_format): New function. (pa_d_register_target_info): Add pa_d_handle_target_object_format as handler for objectFormat key. * config/rs6000/rs6000-d.c (rs6000_d_handle_target_object_format): New function. (rs6000_d_register_target_info): Add rs6000_d_handle_target_object_format as handler for objectFormat key. 
* config/sol2-d.c (solaris_d_handle_target_object_format): New function. (solaris_d_register_target_info): New function. (TARGET_D_REGISTER_OS_TARGET_INFO): Define. gcc/d/ChangeLog: * d-target.cc (d_handle_target_object_format): New function. (d_language_target_info): Add d_handle_target_object_format as handler for objectFormat key. (Target::getTargetInfo): Continue if handler returned NULL_TREE. --- gcc/config/darwin-d.c | 26 ++++++++++++++++++++++++++ gcc/config/dragonfly-d.c | 26 ++++++++++++++++++++++++++ gcc/config/freebsd-d.c | 26 ++++++++++++++++++++++++++ gcc/config/glibc-d.c | 26 ++++++++++++++++++++++++++ gcc/config/i386/i386-d.c | 19 +++++++++++++++++++ gcc/config/i386/winnt-d.c | 25 +++++++++++++++++++++++++ gcc/config/netbsd-d.c | 28 ++++++++++++++++++++++++++++ gcc/config/openbsd-d.c | 28 ++++++++++++++++++++++++++++ gcc/config/pa/pa-d.c | 17 +++++++++++++++++ gcc/config/rs6000/rs6000-d.c | 21 +++++++++++++++++++++ gcc/config/sol2-d.c | 26 ++++++++++++++++++++++++++ 11 files changed, 268 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/darwin-d.c b/gcc/config/darwin-d.c index afc32da..67d69b7 100644 --- a/gcc/config/darwin-d.c +++ b/gcc/config/darwin-d.c @@ -32,9 +32,35 @@ darwin_d_os_builtins (void) d_add_builtin_version ("darwin"); } +/* Handle a call to `__traits(getTargetInfo, "objectFormat")'. */ + +static tree +darwin_d_handle_target_object_format (void) +{ + const char *objfmt = "macho"; + + return build_string_literal (strlen (objfmt) + 1, objfmt); +} + +/* Implement TARGET_D_REGISTER_OS_TARGET_INFO for Darwin targets. */ + +static void +darwin_d_register_target_info (void) +{ + const struct d_target_info_spec handlers[] = { + { "objectFormat", darwin_d_handle_target_object_format }, + { NULL, NULL }, + }; + + d_add_target_info_handlers (handlers); +} + #undef TARGET_D_OS_VERSIONS #define TARGET_D_OS_VERSIONS darwin_d_os_builtins +#undef TARGET_D_REGISTER_OS_TARGET_INFO +#define TARGET_D_REGISTER_OS_TARGET_INFO darwin_d_register_target_info + /* Define TARGET_D_MINFO_SECTION for Darwin targets. */ #undef TARGET_D_MINFO_SECTION diff --git a/gcc/config/dragonfly-d.c b/gcc/config/dragonfly-d.c index 76f4cc0..dc301b5 100644 --- a/gcc/config/dragonfly-d.c +++ b/gcc/config/dragonfly-d.c @@ -31,7 +31,33 @@ dragonfly_d_os_builtins (void) d_add_builtin_version ("Posix"); } +/* Handle a call to `__traits(getTargetInfo, "objectFormat")'. */ + +static tree +dragonfly_d_handle_target_object_format (void) +{ + const char *objfmt = "elf"; + + return build_string_literal (strlen (objfmt) + 1, objfmt); +} + +/* Implement TARGET_D_REGISTER_OS_TARGET_INFO for DragonFly targets. */ + +static void +dragonfly_d_register_target_info (void) +{ + const struct d_target_info_spec handlers[] = { + { "objectFormat", dragonfly_d_handle_target_object_format }, + { NULL, NULL }, + }; + + d_add_target_info_handlers (handlers); +} + #undef TARGET_D_OS_VERSIONS #define TARGET_D_OS_VERSIONS dragonfly_d_os_builtins +#undef TARGET_D_REGISTER_OS_TARGET_INFO +#define TARGET_D_REGISTER_OS_TARGET_INFO dragonfly_d_register_target_info + struct gcc_targetdm targetdm = TARGETDM_INITIALIZER; diff --git a/gcc/config/freebsd-d.c b/gcc/config/freebsd-d.c index 8a8ddd9..8bebe79 100644 --- a/gcc/config/freebsd-d.c +++ b/gcc/config/freebsd-d.c @@ -37,7 +37,33 @@ freebsd_d_os_builtins (void) d_add_builtin_version ("Posix"); } +/* Handle a call to `__traits(getTargetInfo, "objectFormat")'. 
*/ + +static tree +freebsd_d_handle_target_object_format (void) +{ + const char *objfmt = "elf"; + + return build_string_literal (strlen (objfmt) + 1, objfmt); +} + +/* Implement TARGET_D_REGISTER_OS_TARGET_INFO for FreeBSD targets. */ + +static void +freebsd_d_register_target_info (void) +{ + const struct d_target_info_spec handlers[] = { + { "objectFormat", freebsd_d_handle_target_object_format }, + { NULL, NULL }, + }; + + d_add_target_info_handlers (handlers); +} + #undef TARGET_D_OS_VERSIONS #define TARGET_D_OS_VERSIONS freebsd_d_os_builtins +#undef TARGET_D_REGISTER_OS_TARGET_INFO +#define TARGET_D_REGISTER_OS_TARGET_INFO freebsd_d_register_target_info + struct gcc_targetdm targetdm = TARGETDM_INITIALIZER; diff --git a/gcc/config/glibc-d.c b/gcc/config/glibc-d.c index 092c5d8..c98d494 100644 --- a/gcc/config/glibc-d.c +++ b/gcc/config/glibc-d.c @@ -42,7 +42,33 @@ glibc_d_os_builtins (void) #endif } +/* Handle a call to `__traits(getTargetInfo, "objectFormat")'. */ + +static tree +glibc_d_handle_target_object_format (void) +{ + const char *objfmt = "elf"; + + return build_string_literal (strlen (objfmt) + 1, objfmt); +} + +/* Implement TARGET_D_REGISTER_OS_TARGET_INFO for Glibc targets. */ + +static void +glibc_d_register_target_info (void) +{ + const struct d_target_info_spec handlers[] = { + { "objectFormat", glibc_d_handle_target_object_format }, + { NULL, NULL }, + }; + + d_add_target_info_handlers (handlers); +} + #undef TARGET_D_OS_VERSIONS #define TARGET_D_OS_VERSIONS glibc_d_os_builtins +#undef TARGET_D_REGISTER_OS_TARGET_INFO +#define TARGET_D_REGISTER_OS_TARGET_INFO glibc_d_register_target_info + struct gcc_targetdm targetdm = TARGETDM_INITIALIZER; diff --git a/gcc/config/i386/i386-d.c b/gcc/config/i386/i386-d.c index da5958c..cb99376 100644 --- a/gcc/config/i386/i386-d.c +++ b/gcc/config/i386/i386-d.c @@ -60,6 +60,24 @@ ix86_d_handle_target_float_abi (void) return build_string_literal (strlen (abi) + 1, abi); } +/* Handle a call to `__traits(getTargetInfo, "objectFormat")'. */ + +static tree +ix86_d_handle_target_object_format (void) +{ + const char *objfmt = NULL; + + if (TARGET_MACHO) + objfmt = "macho"; + else if (TARGET_COFF || TARGET_PECOFF) + objfmt = "coff"; + + if (objfmt == NULL) + return NULL_TREE; + + return build_string_literal (strlen (objfmt) + 1, objfmt); +} + /* Implement TARGET_D_REGISTER_CPU_TARGET_INFO. */ void @@ -67,6 +85,7 @@ ix86_d_register_target_info (void) { const struct d_target_info_spec handlers[] = { { "floatAbi", ix86_d_handle_target_float_abi }, + { "objectFormat", ix86_d_handle_target_object_format }, { NULL, NULL }, }; diff --git a/gcc/config/i386/winnt-d.c b/gcc/config/i386/winnt-d.c index 8a6b9c5..b978025 100644 --- a/gcc/config/i386/winnt-d.c +++ b/gcc/config/i386/winnt-d.c @@ -39,9 +39,34 @@ winnt_d_os_builtins (void) #endif } +/* Handle a call to `__traits(getTargetInfo, "objectFormat")'. */ + +static tree +winnt_d_handle_target_object_format (void) +{ + const char *objfmt = "coff"; + + return build_string_literal (strlen (objfmt) + 1, objfmt); +} + +/* Implement TARGET_D_REGISTER_OS_TARGET_INFO for Windows targets. 
*/ + +static void +winnt_d_register_target_info (void) +{ + const struct d_target_info_spec handlers[] = { + { "objectFormat", winnt_d_handle_target_object_format }, + { NULL, NULL }, + }; + + d_add_target_info_handlers (handlers); +} #undef TARGET_D_OS_VERSIONS #define TARGET_D_OS_VERSIONS winnt_d_os_builtins +#undef TARGET_D_REGISTER_OS_TARGET_INFO +#define TARGET_D_REGISTER_OS_TARGET_INFO winnt_d_register_target_info + /* Define TARGET_D_MINFO_SECTION for Windows targets. */ #undef TARGET_D_MINFO_SECTION diff --git a/gcc/config/netbsd-d.c b/gcc/config/netbsd-d.c index c3ac010..842644f 100644 --- a/gcc/config/netbsd-d.c +++ b/gcc/config/netbsd-d.c @@ -26,6 +26,8 @@ along with GCC; see the file COPYING3. If not see #include "d/d-target.h" #include "d/d-target-def.h" +/* Define TARGET_D_OS_VERSIONS for NetBSD targets. */ + static void netbsd_d_os_builtins (void) { @@ -33,7 +35,33 @@ netbsd_d_os_builtins (void) d_add_builtin_version ("NetBSD"); } +/* Handle a call to `__traits(getTargetInfo, "objectFormat")'. */ + +static tree +netbsd_d_handle_target_object_format (void) +{ + const char *objfmt = "elf"; + + return build_string_literal (strlen (objfmt) + 1, objfmt); +} + +/* Implement TARGET_D_REGISTER_OS_TARGET_INFO for NetBSD targets. */ + +static void +netbsd_d_register_target_info (void) +{ + const struct d_target_info_spec handlers[] = { + { "objectFormat", netbsd_d_handle_target_object_format }, + { NULL, NULL }, + }; + + d_add_target_info_handlers (handlers); +} + #undef TARGET_D_OS_VERSIONS #define TARGET_D_OS_VERSIONS netbsd_d_os_builtins +#undef TARGET_D_REGISTER_OS_TARGET_INFO +#define TARGET_D_REGISTER_OS_TARGET_INFO netbsd_d_register_target_info + struct gcc_targetdm targetdm = TARGETDM_INITIALIZER; diff --git a/gcc/config/openbsd-d.c b/gcc/config/openbsd-d.c index b42727e..31f8eb4 100644 --- a/gcc/config/openbsd-d.c +++ b/gcc/config/openbsd-d.c @@ -26,6 +26,8 @@ along with GCC; see the file COPYING3. If not see #include "d/d-target.h" #include "d/d-target-def.h" +/* Define TARGET_D_OS_VERSIONS for OpenBSD targets. */ + static void openbsd_d_os_builtins (void) { @@ -33,7 +35,33 @@ openbsd_d_os_builtins (void) d_add_builtin_version ("OpenBSD"); } +/* Handle a call to `__traits(getTargetInfo, "objectFormat")'. */ + +static tree +openbsd_d_handle_target_object_format (void) +{ + const char *objfmt = "elf"; + + return build_string_literal (strlen (objfmt) + 1, objfmt); +} + +/* Implement TARGET_D_REGISTER_OS_TARGET_INFO for OpenBSD targets. */ + +static void +openbsd_d_register_target_info (void) +{ + const struct d_target_info_spec handlers[] = { + { "objectFormat", openbsd_d_handle_target_object_format }, + { NULL, NULL }, + }; + + d_add_target_info_handlers (handlers); +} + #undef TARGET_D_OS_VERSIONS #define TARGET_D_OS_VERSIONS openbsd_d_os_builtins +#undef TARGET_D_REGISTER_OS_TARGET_INFO +#define TARGET_D_REGISTER_OS_TARGET_INFO openbsd_d_register_target_info + struct gcc_targetdm targetdm = TARGETDM_INITIALIZER; diff --git a/gcc/config/pa/pa-d.c b/gcc/config/pa/pa-d.c index 41b2f18..6802738 100644 --- a/gcc/config/pa/pa-d.c +++ b/gcc/config/pa/pa-d.c @@ -55,6 +55,22 @@ pa_d_handle_target_float_abi (void) return build_string_literal (strlen (abi) + 1, abi); } +/* Handle a call to `__traits(getTargetInfo, "objectFormat")'. 
*/ + +static tree +pa_d_handle_target_object_format (void) +{ + const char *objfmt = NULL; + + if (TARGET_SOM) + objfmt = "som"; + + if (objfmt == NULL) + return NULL_TREE; + + return build_string_literal (strlen (objfmt) + 1, objfmt); +} + /* Implement TARGET_D_REGISTER_CPU_TARGET_INFO. */ void @@ -62,6 +78,7 @@ pa_d_register_target_info (void) { const struct d_target_info_spec handlers[] = { { "floatAbi", pa_d_handle_target_float_abi }, + { "objectFormat", pa_d_handle_target_object_format }, { NULL, NULL }, }; diff --git a/gcc/config/rs6000/rs6000-d.c b/gcc/config/rs6000/rs6000-d.c index 755de42..6f1f55b 100644 --- a/gcc/config/rs6000/rs6000-d.c +++ b/gcc/config/rs6000/rs6000-d.c @@ -63,6 +63,26 @@ rs6000_d_handle_target_float_abi (void) return build_string_literal (strlen (abi) + 1, abi); } +/* Handle a call to `__traits(getTargetInfo, "objectFormat")'. */ + +static tree +rs6000_d_handle_target_object_format (void) +{ + const char *objfmt = NULL; + + if (TARGET_ELF) + objfmt = "elf"; + else if (TARGET_MACHO) + objfmt = "macho"; + else if (TARGET_XCOFF) + objfmt = "coff"; + + if (objfmt == NULL) + return NULL_TREE; + + return build_string_literal (strlen (objfmt) + 1, objfmt); +} + /* Implement TARGET_D_REGISTER_CPU_TARGET_INFO. */ void @@ -70,6 +90,7 @@ rs6000_d_register_target_info (void) { const struct d_target_info_spec handlers[] = { { "floatAbi", rs6000_d_handle_target_float_abi }, + { "objectFormat", rs6000_d_handle_target_object_format }, { NULL, NULL }, }; diff --git a/gcc/config/sol2-d.c b/gcc/config/sol2-d.c index 529d365..650cb37 100644 --- a/gcc/config/sol2-d.c +++ b/gcc/config/sol2-d.c @@ -33,7 +33,33 @@ solaris_d_os_builtins (void) d_add_builtin_version ("Solaris"); \ } +/* Handle a call to `__traits(getTargetInfo, "objectFormat")'. */ + +static tree +solaris_d_handle_target_object_format (void) +{ + const char *objfmt = "elf"; + + return build_string_literal (strlen (objfmt) + 1, objfmt); +} + +/* Implement TARGET_D_REGISTER_OS_TARGET_INFO for Solaris targets. */ + +static void +solaris_d_register_target_info (void) +{ + const struct d_target_info_spec handlers[] = { + { "objectFormat", solaris_d_handle_target_object_format }, + { NULL, NULL }, + }; + + d_add_target_info_handlers (handlers); +} + #undef TARGET_D_OS_VERSIONS #define TARGET_D_OS_VERSIONS solaris_d_os_builtins +#undef TARGET_D_REGISTER_OS_TARGET_INFO +#define TARGET_D_REGISTER_OS_TARGET_INFO solaris_d_register_target_info + struct gcc_targetdm targetdm = TARGETDM_INITIALIZER; -- cgit v1.1 From bda519596543e49f77914b5677693e86be5d01d0 Mon Sep 17 00:00:00 2001 From: Iain Buclaw Date: Tue, 13 Apr 2021 22:28:55 +0200 Subject: d: Add TARGET_D_TEMPLATES_ALWAYS_COMDAT Following up on the fix for PR99914, when testing on MinGW, it was found not to support weak in the same way as on ELF or Mach-O targets. So the linkage has been reverted back to COMDAT for that target, however in order to properly support overriding functions and variables, all declarations with external linkage must be put on COMDAT. For this a new target hook has been added to control the behavior. gcc/ChangeLog: PR d/99914 * config/i386/winnt-d.c (TARGET_D_TEMPLATES_ALWAYS_COMDAT): Define. * doc/tm.texi: Regenerate. * doc/tm.texi.in (D language and ABI): Add @hook for TARGET_D_TEMPLATES_ALWAYS_COMDAT. gcc/d/ChangeLog: PR d/99914 * d-target.def (d_templates_always_comdat): New hook. * d-tree.h (mark_needed): Remove prototype. * decl.cc: Include d-target.h. (mark_needed): Rename to... (d_mark_needed): ...this. Make static. 
(set_linkage_for_decl): Put variables in comdat if d_templates_always_comdat. --- gcc/config/i386/winnt-d.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/winnt-d.c b/gcc/config/i386/winnt-d.c index b978025..ea4cd13 100644 --- a/gcc/config/i386/winnt-d.c +++ b/gcc/config/i386/winnt-d.c @@ -78,4 +78,9 @@ winnt_d_register_target_info (void) #undef TARGET_D_MINFO_END_NAME #define TARGET_D_MINFO_END_NAME "__stop_minfo" +/* Define TARGET_D_TEMPLATES_ALWAYS_COMDAT for Windows targets. */ + +#undef TARGET_D_TEMPLATES_ALWAYS_COMDAT +#define TARGET_D_TEMPLATES_ALWAYS_COMDAT true + struct gcc_targetdm targetdm = TARGETDM_INITIALIZER; -- cgit v1.1 From 0bb37e80bb786e11cb7aa2d23b7d68bb0357fc15 Mon Sep 17 00:00:00 2001 From: Eric Botcazou Date: Mon, 19 Apr 2021 10:13:36 +0200 Subject: Fix another -freorder-blocks-and-partition glitch with Windows SEH Since GCC 8, the -freorder-blocks-and-partition pass can split a function into hot and cold parts, thus generating 2 FDEs for a single function in DWARF for exception purposes and doing an equivalent trick for Windows SEH. Now the Windows system unwinder does not support arbitrarily large frames and there is even a hard limit on the encoding of the CFI, which changes the stack allocation strategy when it is topped and which must be reflected everywhere. gcc/ * config/i386/winnt.c (i386_pe_seh_cold_init): Properly deal with frames larger than the SEH maximum frame size. gcc/testsuite/ * gnat.dg/opt92.adb: New test. --- gcc/config/i386/winnt.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/winnt.c b/gcc/config/i386/winnt.c index cc12196..b66263a 100644 --- a/gcc/config/i386/winnt.c +++ b/gcc/config/i386/winnt.c @@ -921,15 +921,17 @@ i386_pe_seh_cold_init (FILE *f, const char *name) /* In the normal case, the frame pointer is near the bottom of the frame so we can do the full stack allocation and set it afterwards. There - is an exception when the function accesses prior frames so, in this - case, we need to pre-allocate a small chunk before setting it. */ - if (crtl->accesses_prior_frames) - alloc_offset = seh->cfa_offset; - else + is an exception if the function overflows the SEH maximum frame size + or accesses prior frames so, in this case, we need to pre-allocate a + small chunk of stack before setting it. */ + offset = seh->sp_offset - INCOMING_FRAME_SP_OFFSET; + if (offset < SEH_MAX_FRAME_SIZE && !crtl->accesses_prior_frames) alloc_offset = seh->sp_offset; + else + alloc_offset = MIN (seh->cfa_offset + 240, seh->sp_offset); offset = alloc_offset - INCOMING_FRAME_SP_OFFSET; - if (offset > 0 && offset < SEH_MAX_FRAME_SIZE) + if (offset > 0) fprintf (f, "\t.seh_stackalloc\t" HOST_WIDE_INT_PRINT_DEC "\n", offset); for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) @@ -958,7 +960,7 @@ i386_pe_seh_cold_init (FILE *f, const char *name) fprintf (f, ", " HOST_WIDE_INT_PRINT_DEC "\n", offset); } - if (crtl->accesses_prior_frames) + if (alloc_offset != seh->sp_offset) { offset = seh->sp_offset - alloc_offset; if (offset > 0 && offset < SEH_MAX_FRAME_SIZE) -- cgit v1.1 From 3bffc4b37e85c7f6092dfb0fbe4067d268e97b46 Mon Sep 17 00:00:00 2001 From: Richard Earnshaw Date: Mon, 19 Apr 2021 16:56:31 +0100 Subject: arm: partial revert of r11-8168 [PR100067] This is a partial revert of r11-8168. 
The overall purpose of the commit is retained (to fix a bogus warning when -mfpu= is used in combination with eg -mcpu=neoverse-v1), but it removes the hunk that changed the subsequent feature bits for features of a simd/fp unit that cannot be described by -mfpu. While I still think that is the correct direction of travel, it's somewhat disruptive and not appropriate for late stage4. I'll revisit for gcc-12. gcc: PR target/100067 * config/arm/arm.c (arm_configure_build_target): Do not strip extended FPU/SIMD feature bits from the target ISA when -mfpu is specified (partial revert of r11-8168). --- gcc/config/arm/arm.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 475fb0d..340f7c9 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -3396,9 +3396,11 @@ arm_configure_build_target (struct arm_build_target *target, auto_sbitmap fpu_bits (isa_num_bits); arm_initialize_isa (fpu_bits, arm_selected_fpu->isa_bits); - /* Clear out ALL bits relating to the FPU/simd extensions, to avoid - potentially invalid combinations later on that we can't match. */ - bitmap_and_compl (target->isa, target->isa, isa_all_fpbits); + /* This should clear out ALL bits relating to the FPU/simd + extensions, to avoid potentially invalid combinations later on + that we can't match. At present we only clear out those bits + that can be set by -mfpu. This should be fixed in GCC-12. */ + bitmap_and_compl (target->isa, target->isa, isa_all_fpubits_internal); bitmap_ior (target->isa, target->isa, fpu_bits); } -- cgit v1.1 From 6156df483fa50a08f561b6c248819f2992aa380d Mon Sep 17 00:00:00 2001 From: Segher Boessenkool Date: Tue, 20 Apr 2021 12:00:50 +0000 Subject: rs6000: Fix cpu selection w/ isel (PR100108) There are various non-IBM CPUs with isel as well, so it is easiest if we just don't consider that flag here (it is not needed). 2021-04-20 Segher Boessenkool PR target/100108 * config/rs6000/rs6000.c (rs6000_machine_from_flags): Do not consider OPTION_MASK_ISEL. --- gcc/config/rs6000/rs6000.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 48b8efd..844fee8 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -5765,7 +5765,7 @@ rs6000_machine_from_flags (void) HOST_WIDE_INT flags = rs6000_isa_flags; /* Disable the flags that should never influence the .machine selection. */ - flags &= ~(OPTION_MASK_PPC_GFXOPT | OPTION_MASK_PPC_GPOPT); + flags &= ~(OPTION_MASK_PPC_GFXOPT | OPTION_MASK_PPC_GPOPT | OPTION_MASK_ISEL); if ((flags & (ISA_3_1_MASKS_SERVER & ~ISA_3_0_MASKS_SERVER)) != 0) return "power10"; -- cgit v1.1 From 39671f87b2df6a1894cc11a161e4a7949d1ddccd Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Thu, 15 Apr 2021 05:59:48 -0700 Subject: x86: Use crc32 target option for CRC32 intrinsics Use crc32 target option for CRC32 intrinsics to support CRC32 intrinsics without enabling SSE vector instructions. * config/i386/i386-c.c (ix86_target_macros_internal): Define __CRC32__ for -mcrc32. * config/i386/i386-options.c (ix86_option_override_internal): Enable crc32 instruction for -msse4.2. * config/i386/i386.md (sse4_2_crc32): Remove TARGET_SSE4_2 check. (sse4_2_crc32di): Likewise. * config/i386/ia32intrin.h: Use crc32 target option for CRC32 intrinsics. 
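A usage sketch (an assumption for illustration, not part of the patch): with this change the CRC32 intrinsics only need the crc32 ISA, so code like the following can be built with -mcrc32 alone, without turning on the SSE4.2 vector instructions:

/* Build with: gcc -O2 -mcrc32 crc.c
   Accumulates a CRC-32C (polynomial 0x11EDC6F41) over a buffer one byte
   at a time using the crc32 instruction on general-purpose registers.  */
#include <x86intrin.h>

unsigned int
crc32c_bytes (const unsigned char *buf, unsigned long len)
{
  unsigned int crc = 0xffffffff;
  for (unsigned long i = 0; i < len; i++)
    crc = __crc32b (crc, buf[i]);
  return crc ^ 0xffffffff;
}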
--- gcc/config/i386/i386-c.c | 2 ++ gcc/config/i386/i386-options.c | 5 +++++ gcc/config/i386/i386.md | 4 ++-- gcc/config/i386/ia32intrin.h | 28 ++++++++++++++-------------- 4 files changed, 23 insertions(+), 16 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-c.c b/gcc/config/i386/i386-c.c index be46d05..5ed0de0 100644 --- a/gcc/config/i386/i386-c.c +++ b/gcc/config/i386/i386-c.c @@ -532,6 +532,8 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag, def_or_undef (parse_in, "__LZCNT__"); if (isa_flag & OPTION_MASK_ISA_TBM) def_or_undef (parse_in, "__TBM__"); + if (isa_flag & OPTION_MASK_ISA_CRC32) + def_or_undef (parse_in, "__CRC32__"); if (isa_flag & OPTION_MASK_ISA_POPCNT) def_or_undef (parse_in, "__POPCNT__"); if (isa_flag & OPTION_MASK_ISA_FSGSBASE) diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c index 91da284..7e59ccd 100644 --- a/gcc/config/i386/i386-options.c +++ b/gcc/config/i386/i386-options.c @@ -2617,6 +2617,11 @@ ix86_option_override_internal (bool main_args_p, opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit; + /* Enable crc32 instruction for -msse4.2. */ + if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags)) + opts->x_ix86_isa_flags + |= OPTION_MASK_ISA_CRC32 & ~opts->x_ix86_isa_flags_explicit; + /* Enable lzcnt instruction for -mabm. */ if (TARGET_ABM_P(opts->x_ix86_isa_flags)) opts->x_ix86_isa_flags diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 9ff35d9..1f1d74e 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -20998,7 +20998,7 @@ [(match_operand:SI 1 "register_operand" "0") (match_operand:SWI124 2 "nonimmediate_operand" "m")] UNSPEC_CRC32))] - "TARGET_SSE4_2 || TARGET_CRC32" + "TARGET_CRC32" "crc32{}\t{%2, %0|%0, %2}" [(set_attr "type" "sselog1") (set_attr "prefix_rep" "1") @@ -21019,7 +21019,7 @@ [(match_operand:DI 1 "register_operand" "0") (match_operand:DI 2 "nonimmediate_operand" "rm")] UNSPEC_CRC32))] - "TARGET_64BIT && (TARGET_SSE4_2 || TARGET_CRC32)" + "TARGET_64BIT && TARGET_CRC32" "crc32{q}\t{%2, %0|%0, %2}" [(set_attr "type" "sselog1") (set_attr "prefix_rep" "1") diff --git a/gcc/config/i386/ia32intrin.h b/gcc/config/i386/ia32intrin.h index 5913940..5422b0f 100644 --- a/gcc/config/i386/ia32intrin.h +++ b/gcc/config/i386/ia32intrin.h @@ -51,11 +51,11 @@ __bswapd (int __X) #ifndef __iamcu__ -#ifndef __SSE4_2__ +#ifndef __CRC32__ #pragma GCC push_options -#pragma GCC target("sse4.2") -#define __DISABLE_SSE4_2__ -#endif /* __SSE4_2__ */ +#pragma GCC target("crc32") +#define __DISABLE_CRC32__ +#endif /* __CRC32__ */ /* 32bit accumulate CRC32 (polynomial 0x11EDC6F41) value. */ extern __inline unsigned int @@ -79,10 +79,10 @@ __crc32d (unsigned int __C, unsigned int __V) return __builtin_ia32_crc32si (__C, __V); } -#ifdef __DISABLE_SSE4_2__ -#undef __DISABLE_SSE4_2__ +#ifdef __DISABLE_CRC32__ +#undef __DISABLE_CRC32__ #pragma GCC pop_options -#endif /* __DISABLE_SSE4_2__ */ +#endif /* __DISABLE_CRC32__ */ #endif /* __iamcu__ */ @@ -199,11 +199,11 @@ __bswapq (long long __X) return __builtin_bswap64 (__X); } -#ifndef __SSE4_2__ +#ifndef __CRC32__ #pragma GCC push_options -#pragma GCC target("sse4.2") -#define __DISABLE_SSE4_2__ -#endif /* __SSE4_2__ */ +#pragma GCC target("crc32") +#define __DISABLE_CRC32__ +#endif /* __CRC32__ */ /* 64bit accumulate CRC32 (polynomial 0x11EDC6F41) value. 
*/ extern __inline unsigned long long @@ -213,10 +213,10 @@ __crc32q (unsigned long long __C, unsigned long long __V) return __builtin_ia32_crc32di (__C, __V); } -#ifdef __DISABLE_SSE4_2__ -#undef __DISABLE_SSE4_2__ +#ifdef __DISABLE_CRC32__ +#undef __DISABLE_CRC32__ #pragma GCC pop_options -#endif /* __DISABLE_SSE4_2__ */ +#endif /* __DISABLE_CRC32__ */ /* 64bit popcnt */ extern __inline long long -- cgit v1.1 From c37e8fdc41fb8cf324d0ee4600819f5fd2b1bf84 Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Mon, 8 Mar 2021 13:01:37 +0100 Subject: Remove DEF_ENUM from stringop.def. gcc/ChangeLog: * config/i386/i386-options.c (DEF_ENUM): Remove it. * config/i386/i386-opts.h (DEF_ENUM): Likewise. * config/i386/stringop.def (DEF_ENUM): Likewise. --- gcc/config/i386/i386-options.c | 2 -- gcc/config/i386/i386-opts.h | 4 ---- gcc/config/i386/stringop.def | 9 --------- 3 files changed, 15 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c index 7e59ccd..154234a 100644 --- a/gcc/config/i386/i386-options.c +++ b/gcc/config/i386/i386-options.c @@ -1455,10 +1455,8 @@ ix86_valid_target_attribute_p (tree fndecl, } const char *stringop_alg_names[] = { -#define DEF_ENUM #define DEF_ALG(alg, name) #name, #include "stringop.def" -#undef DEF_ENUM #undef DEF_ALG }; diff --git a/gcc/config/i386/i386-opts.h b/gcc/config/i386/i386-opts.h index de6e7e0..04e4ad6 100644 --- a/gcc/config/i386/i386-opts.h +++ b/gcc/config/i386/i386-opts.h @@ -28,16 +28,12 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see /* Algorithm to expand string function with. */ enum stringop_alg { -#undef DEF_ENUM -#define DEF_ENUM - #undef DEF_ALG #define DEF_ALG(alg, name) alg, #include "stringop.def" last_alg -#undef DEF_ENUM #undef DEF_ALG }; diff --git a/gcc/config/i386/stringop.def b/gcc/config/i386/stringop.def index 76898d2..cd34b7e 100644 --- a/gcc/config/i386/stringop.def +++ b/gcc/config/i386/stringop.def @@ -17,21 +17,12 @@ You should have received a copy of the GNU General Public License along with GCC; see the files COPYING3. If not, see . */ -DEF_ENUM DEF_ALG (no_stringop, no_stringop) -DEF_ENUM DEF_ALG (libcall, libcall) -DEF_ENUM DEF_ALG (rep_prefix_1_byte, rep_byte) -DEF_ENUM DEF_ALG (rep_prefix_4_byte, rep_4byte) -DEF_ENUM DEF_ALG (rep_prefix_8_byte, rep_8byte) -DEF_ENUM DEF_ALG (loop_1_byte, byte_loop) -DEF_ENUM DEF_ALG (loop, loop) -DEF_ENUM DEF_ALG (unrolled_loop, unrolled_loop) -DEF_ENUM DEF_ALG (vector_loop, vector_loop) -- cgit v1.1 From d8c6cc2ca35489bc41bb58ec96c1195928826922 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Thu, 15 Apr 2021 11:19:32 -0700 Subject: x86: Add -mmwait for -mgeneral-regs-only Add -mmwait so that the MWAIT and MONITOR intrinsics can be used with -mgeneral-regs-only and make -msse3 to imply -mmwait. gcc/ * config.gcc: Install mwaitintrin.h for i[34567]86-*-* and x86_64-*-* targets. * common/config/i386/i386-common.c (OPTION_MASK_ISA2_MWAIT_SET): New. (OPTION_MASK_ISA2_MWAIT_UNSET): Likewise. (ix86_handle_option): Handle -mmwait. * config/i386/i386-builtins.c (ix86_init_mmx_sse_builtins): Replace OPTION_MASK_ISA_SSE3 with OPTION_MASK_ISA2_MWAIT on __builtin_ia32_monitor and __builtin_ia32_mwait. * config/i386/i386-options.c (isa2_opts): Add -mmwait. (ix86_valid_target_attribute_inner_p): Likewise. (ix86_option_override_internal): Enable mwait/monitor instructions for -msse3. * config/i386/i386.h (TARGET_MWAIT): New. (TARGET_MWAIT_P): Likewise. * config/i386/i386.opt: Add -mmwait. 
* config/i386/mwaitintrin.h: New file. * config/i386/pmmintrin.h: Include . * config/i386/sse.md (sse3_mwait): Replace TARGET_SSE3 with TARGET_MWAIT. (@sse3_monitor_): Likewise. * config/i386/x86gprintrin.h: Include . * doc/extend.texi: Document mwait target attribute. * doc/invoke.texi: Document -mmwait. gcc/testsuite/ * gcc.target/i386/monitor-2.c: New test. --- gcc/config/i386/i386-builtins.c | 4 ++-- gcc/config/i386/i386-options.c | 7 ++++++ gcc/config/i386/i386.h | 2 ++ gcc/config/i386/i386.opt | 4 ++++ gcc/config/i386/mwaitintrin.h | 52 +++++++++++++++++++++++++++++++++++++++++ gcc/config/i386/pmmintrin.h | 13 +---------- gcc/config/i386/sse.md | 4 ++-- gcc/config/i386/x86gprintrin.h | 2 ++ 8 files changed, 72 insertions(+), 16 deletions(-) create mode 100644 gcc/config/i386/mwaitintrin.h (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-builtins.c b/gcc/config/i386/i386-builtins.c index 4fcdf4b..128bd39 100644 --- a/gcc/config/i386/i386-builtins.c +++ b/gcc/config/i386/i386-builtins.c @@ -628,9 +628,9 @@ ix86_init_mmx_sse_builtins (void) VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE); /* SSE3. */ - def_builtin (OPTION_MASK_ISA_SSE3, 0, "__builtin_ia32_monitor", + def_builtin (0, OPTION_MASK_ISA2_MWAIT, "__builtin_ia32_monitor", VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR); - def_builtin (OPTION_MASK_ISA_SSE3, 0, "__builtin_ia32_mwait", + def_builtin (0, OPTION_MASK_ISA2_MWAIT, "__builtin_ia32_mwait", VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT); /* AES */ diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c index 154234a..65bb0bb 100644 --- a/gcc/config/i386/i386-options.c +++ b/gcc/config/i386/i386-options.c @@ -207,6 +207,7 @@ static struct ix86_target_opts isa2_opts[] = { "-mmovbe", OPTION_MASK_ISA2_MOVBE }, { "-mclzero", OPTION_MASK_ISA2_CLZERO }, { "-mmwaitx", OPTION_MASK_ISA2_MWAITX }, + { "-mmwait", OPTION_MASK_ISA2_MWAIT }, { "-mmovdir64b", OPTION_MASK_ISA2_MOVDIR64B }, { "-mwaitpkg", OPTION_MASK_ISA2_WAITPKG }, { "-mcldemote", OPTION_MASK_ISA2_CLDEMOTE }, @@ -1015,6 +1016,7 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[], IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase), IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd), IX86_ATTR_ISA ("mwaitx", OPT_mmwaitx), + IX86_ATTR_ISA ("mwait", OPT_mmwait), IX86_ATTR_ISA ("clzero", OPT_mclzero), IX86_ATTR_ISA ("pku", OPT_mpku), IX86_ATTR_ISA ("lwp", OPT_mlwp), @@ -2609,6 +2611,11 @@ ix86_option_override_internal (bool main_args_p, || TARGET_PREFETCHWT1_P (opts->x_ix86_isa_flags)) ix86_prefetch_sse = true; + /* Enable mwait/monitor instructions for -msse3. */ + if (TARGET_SSE3_P (opts->x_ix86_isa_flags)) + opts->x_ix86_isa_flags2 + |= OPTION_MASK_ISA2_MWAIT & ~opts->x_ix86_isa_flags2_explicit; + /* Enable popcnt instruction for -msse4.2 or -mabm. */ if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags) || TARGET_ABM_P (opts->x_ix86_isa_flags)) diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 97700d7..aa3fa3e 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -181,6 +181,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. 
If not, see #define TARGET_CLWB_P(x) TARGET_ISA_CLWB_P(x) #define TARGET_MWAITX TARGET_ISA2_MWAITX #define TARGET_MWAITX_P(x) TARGET_ISA2_MWAITX_P(x) +#define TARGET_MWAIT TARGET_ISA2_MWAIT +#define TARGET_MWAIT_P(x) TARGET_ISA2_MWAIT_P(x) #define TARGET_PKU TARGET_ISA_PKU #define TARGET_PKU_P(x) TARGET_ISA_PKU_P(x) #define TARGET_SHSTK TARGET_ISA_SHSTK diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt index c781fdc..7b8547b 100644 --- a/gcc/config/i386/i386.opt +++ b/gcc/config/i386/i386.opt @@ -1162,3 +1162,7 @@ AVXVNNI built-in functions and code generation. mneeded Target Var(ix86_needed) Save Emit GNU_PROPERTY_X86_ISA_1_NEEDED GNU property. + +mmwait +Target Mask(ISA2_MWAIT) Var(ix86_isa_flags2) Save +Support MWAIT and MONITOR built-in functions and code generation. diff --git a/gcc/config/i386/mwaitintrin.h b/gcc/config/i386/mwaitintrin.h new file mode 100644 index 0000000..1ecbc4a --- /dev/null +++ b/gcc/config/i386/mwaitintrin.h @@ -0,0 +1,52 @@ +/* Copyright (C) 2021 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . 
*/ + +#ifndef _MWAITINTRIN_H_INCLUDED +#define _MWAITINTRIN_H_INCLUDED + +#ifndef __MWAIT__ +#pragma GCC push_options +#pragma GCC target("mwait") +#define __DISABLE_MWAIT__ +#endif /* __MWAIT__ */ + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_monitor (void const * __P, unsigned int __E, unsigned int __H) +{ + __builtin_ia32_monitor (__P, __E, __H); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mwait (unsigned int __E, unsigned int __H) +{ + __builtin_ia32_mwait (__E, __H); +} + +#ifdef __DISABLE_MWAIT__ +#undef __DISABLE_MWAIT__ +#pragma GCC pop_options +#endif /* __DISABLE_MWAIT__ */ + +#endif /* _MWAITINTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/pmmintrin.h b/gcc/config/i386/pmmintrin.h index fa9c5bb..f8102d2 100644 --- a/gcc/config/i386/pmmintrin.h +++ b/gcc/config/i386/pmmintrin.h @@ -29,6 +29,7 @@ /* We need definitions from the SSE2 and SSE header files*/ #include +#include #ifndef __SSE3__ #pragma GCC push_options @@ -112,18 +113,6 @@ _mm_lddqu_si128 (__m128i const *__P) return (__m128i) __builtin_ia32_lddqu ((char const *)__P); } -extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_monitor (void const * __P, unsigned int __E, unsigned int __H) -{ - __builtin_ia32_monitor (__P, __E, __H); -} - -extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mwait (unsigned int __E, unsigned int __H) -{ - __builtin_ia32_mwait (__E, __H); -} - #ifdef __DISABLE_SSE3__ #undef __DISABLE_SSE3__ #pragma GCC pop_options diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 9d3728d..319099d 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -16593,7 +16593,7 @@ [(unspec_volatile [(match_operand:SI 0 "register_operand" "c") (match_operand:SI 1 "register_operand" "a")] UNSPECV_MWAIT)] - "TARGET_SSE3" + "TARGET_MWAIT" ;; 64bit version is "mwait %rax,%rcx". But only lower 32bits are used. ;; Since 32bit register operands are implicitly zero extended to 64bit, ;; we only need to set up 32bit registers. @@ -16605,7 +16605,7 @@ (match_operand:SI 1 "register_operand" "c") (match_operand:SI 2 "register_operand" "d")] UNSPECV_MONITOR)] - "TARGET_SSE3" + "TARGET_MWAIT" ;; 64bit version is "monitor %rax,%rcx,%rdx". But only lower 32bits in ;; RCX and RDX are used. Since 32bit register operands are implicitly ;; zero extended to 64bit, we only need to set up 32bit registers. diff --git a/gcc/config/i386/x86gprintrin.h b/gcc/config/i386/x86gprintrin.h index ceda501..7793032 100644 --- a/gcc/config/i386/x86gprintrin.h +++ b/gcc/config/i386/x86gprintrin.h @@ -56,6 +56,8 @@ #include +#include + #include #include -- cgit v1.1 From fe11882ae34c49f6214f93867783ed1332f35f0f Mon Sep 17 00:00:00 2001 From: Alex Coplan Date: Wed, 21 Apr 2021 14:42:04 +0100 Subject: aarch64: Avoid duplicating bti j insns for jump tables [PR99988] This patch fixes PR99988 which shows us generating large (> 250) sequences of back-to-back bti j instructions. The fix is simply to avoid inserting bti j instructions at the target of a jump table if we've already inserted one for a given label. gcc/ChangeLog: PR target/99988 * config/aarch64/aarch64-bti-insert.c (aarch64_bti_j_insn_p): New. (rest_of_insert_bti): Avoid inserting duplicate bti j insns for jump table targets. gcc/testsuite/ChangeLog: PR target/99988 * gcc.target/aarch64/pr99988.c: New test. 
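A reduced illustration of the situation described above (an assumption, not the actual gcc.target/aarch64/pr99988.c testcase): when a switch is compiled to a jump table and several table entries share one target label, the pass previously emitted one bti j per entry after that label; with this change at most one is emitted. Compiled with -O2 -mbranch-protection=bti, a function like:

/* Several case values map to the same code, so the jump table (if one is
   used) contains the same label more than once.  */
int
dispatch (int x)
{
  switch (x)
    {
    case 0: case 1: case 2: case 3:
      return 10;
    case 4: case 5:
      return 20;
    case 6: case 7:
      return 30;
    default:
      return -1;
    }
}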
--- gcc/config/aarch64/aarch64-bti-insert.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-bti-insert.c b/gcc/config/aarch64/aarch64-bti-insert.c index 9366497..5d6bc16 100644 --- a/gcc/config/aarch64/aarch64-bti-insert.c +++ b/gcc/config/aarch64/aarch64-bti-insert.c @@ -120,6 +120,17 @@ aarch64_pac_insn_p (rtx x) return false; } +/* Check if INSN is a BTI J insn. */ +static bool +aarch64_bti_j_insn_p (rtx_insn *insn) +{ + if (!insn || !INSN_P (insn)) + return false; + + rtx pat = PATTERN (insn); + return GET_CODE (pat) == UNSPEC_VOLATILE && XINT (pat, 1) == UNSPECV_BTI_J; +} + /* Insert the BTI instruction. */ /* This is implemented as a late RTL pass that runs before branch shortening and does the following. */ @@ -165,6 +176,10 @@ rest_of_insert_bti (void) for (j = GET_NUM_ELEM (vec) - 1; j >= 0; --j) { label = as_a (XEXP (RTVEC_ELT (vec, j), 0)); + rtx_insn *next = next_nonnote_nondebug_insn (label); + if (aarch64_bti_j_insn_p (next)) + continue; + bti_insn = gen_bti_j (); emit_insn_after (bti_insn, label); } -- cgit v1.1 From 14431e66b35e69a334a35df4c157a033e23d0999 Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Mon, 8 Mar 2021 13:53:09 +0100 Subject: Generate PTA features from a def file. gcc/ChangeLog: * config/i386/i386.h (PTA_*): Remove. (enum pta_flag): New. (DEF_PTA): Generate PTA_* values from i386-isa.def. * config/i386/i386-isa.def: New file. --- gcc/config/i386/i386-isa.def | 110 +++++++++++++++++++++++++++++++++++++++++++ gcc/config/i386/i386.h | 109 +++++++----------------------------------- 2 files changed, 128 insertions(+), 91 deletions(-) create mode 100644 gcc/config/i386/i386-isa.def (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-isa.def b/gcc/config/i386/i386-isa.def new file mode 100644 index 0000000..a0d46cb --- /dev/null +++ b/gcc/config/i386/i386-isa.def @@ -0,0 +1,110 @@ +/* Definition for processor table alias flags. + Copyright (C) 2001-2021 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. 
*/ + +DEF_PTA(3DNOW) +DEF_PTA(3DNOW_A) +DEF_PTA(64BIT) +DEF_PTA(ABM) +DEF_PTA(AES) +DEF_PTA(AVX) +DEF_PTA(BMI) +DEF_PTA(CX16) +DEF_PTA(F16C) +DEF_PTA(FMA) +DEF_PTA(FMA4) +DEF_PTA(FSGSBASE) +DEF_PTA(LWP) +DEF_PTA(LZCNT) +DEF_PTA(MMX) +DEF_PTA(MOVBE) +DEF_PTA(NO_SAHF) +DEF_PTA(PCLMUL) +DEF_PTA(POPCNT) +DEF_PTA(PREFETCH_SSE) +DEF_PTA(RDRND) +DEF_PTA(SSE) +DEF_PTA(SSE2) +DEF_PTA(SSE3) +DEF_PTA(SSE4_1) +DEF_PTA(SSE4_2) +DEF_PTA(SSE4A) +DEF_PTA(SSSE3) +DEF_PTA(TBM) +DEF_PTA(XOP) +DEF_PTA(AVX2) +DEF_PTA(BMI2) +DEF_PTA(RTM) +DEF_PTA(HLE) +DEF_PTA(PRFCHW) +DEF_PTA(RDSEED) +DEF_PTA(ADX) +DEF_PTA(FXSR) +DEF_PTA(XSAVE) +DEF_PTA(XSAVEOPT) +DEF_PTA(AVX512F) +DEF_PTA(AVX512ER) +DEF_PTA(AVX512PF) +DEF_PTA(AVX512CD) +DEF_PTA(NO_TUNE) +DEF_PTA(SHA) +DEF_PTA(PREFETCHWT1) +DEF_PTA(CLFLUSHOPT) +DEF_PTA(XSAVEC) +DEF_PTA(XSAVES) +DEF_PTA(AVX512DQ) +DEF_PTA(AVX512BW) +DEF_PTA(AVX512VL) +DEF_PTA(AVX512IFMA) +DEF_PTA(AVX512VBMI) +DEF_PTA(CLWB) +DEF_PTA(MWAITX) +DEF_PTA(CLZERO) +DEF_PTA(NO_80387) +DEF_PTA(PKU) +DEF_PTA(AVX5124VNNIW) +DEF_PTA(AVX5124FMAPS) +DEF_PTA(AVX512VPOPCNTDQ) +DEF_PTA(SGX) +DEF_PTA(AVX512VNNI) +DEF_PTA(GFNI) +DEF_PTA(VAES) +DEF_PTA(AVX512VBMI2) +DEF_PTA(VPCLMULQDQ) +DEF_PTA(AVX512BITALG) +DEF_PTA(RDPID) +DEF_PTA(PCONFIG) +DEF_PTA(WBNOINVD) +DEF_PTA(AVX512VP2INTERSECT) +DEF_PTA(PTWRITE) +DEF_PTA(AVX512BF16) +DEF_PTA(WAITPKG) +DEF_PTA(MOVDIRI) +DEF_PTA(MOVDIR64B) +DEF_PTA(ENQCMD) +DEF_PTA(CLDEMOTE) +DEF_PTA(SERIALIZE) +DEF_PTA(TSXLDTRK) +DEF_PTA(AMX_TILE) +DEF_PTA(AMX_INT8) +DEF_PTA(AMX_BF16) +DEF_PTA(UINTR) +DEF_PTA(HRESET) +DEF_PTA(KL) +DEF_PTA(WIDEKL) +DEF_PTA(AVXVNNI) diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index aa3fa3e..d2e6751 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -2415,97 +2415,24 @@ extern const char *const processor_names[]; #include "wide-int-bitmask.h" -constexpr wide_int_bitmask PTA_3DNOW (HOST_WIDE_INT_1U << 0); -constexpr wide_int_bitmask PTA_3DNOW_A (HOST_WIDE_INT_1U << 1); -constexpr wide_int_bitmask PTA_64BIT (HOST_WIDE_INT_1U << 2); -constexpr wide_int_bitmask PTA_ABM (HOST_WIDE_INT_1U << 3); -constexpr wide_int_bitmask PTA_AES (HOST_WIDE_INT_1U << 4); -constexpr wide_int_bitmask PTA_AVX (HOST_WIDE_INT_1U << 5); -constexpr wide_int_bitmask PTA_BMI (HOST_WIDE_INT_1U << 6); -constexpr wide_int_bitmask PTA_CX16 (HOST_WIDE_INT_1U << 7); -constexpr wide_int_bitmask PTA_F16C (HOST_WIDE_INT_1U << 8); -constexpr wide_int_bitmask PTA_FMA (HOST_WIDE_INT_1U << 9); -constexpr wide_int_bitmask PTA_FMA4 (HOST_WIDE_INT_1U << 10); -constexpr wide_int_bitmask PTA_FSGSBASE (HOST_WIDE_INT_1U << 11); -constexpr wide_int_bitmask PTA_LWP (HOST_WIDE_INT_1U << 12); -constexpr wide_int_bitmask PTA_LZCNT (HOST_WIDE_INT_1U << 13); -constexpr wide_int_bitmask PTA_MMX (HOST_WIDE_INT_1U << 14); -constexpr wide_int_bitmask PTA_MOVBE (HOST_WIDE_INT_1U << 15); -constexpr wide_int_bitmask PTA_NO_SAHF (HOST_WIDE_INT_1U << 16); -constexpr wide_int_bitmask PTA_PCLMUL (HOST_WIDE_INT_1U << 17); -constexpr wide_int_bitmask PTA_POPCNT (HOST_WIDE_INT_1U << 18); -constexpr wide_int_bitmask PTA_PREFETCH_SSE (HOST_WIDE_INT_1U << 19); -constexpr wide_int_bitmask PTA_RDRND (HOST_WIDE_INT_1U << 20); -constexpr wide_int_bitmask PTA_SSE (HOST_WIDE_INT_1U << 21); -constexpr wide_int_bitmask PTA_SSE2 (HOST_WIDE_INT_1U << 22); -constexpr wide_int_bitmask PTA_SSE3 (HOST_WIDE_INT_1U << 23); -constexpr wide_int_bitmask PTA_SSE4_1 (HOST_WIDE_INT_1U << 24); -constexpr wide_int_bitmask PTA_SSE4_2 (HOST_WIDE_INT_1U << 25); -constexpr wide_int_bitmask PTA_SSE4A (HOST_WIDE_INT_1U << 26); 
-constexpr wide_int_bitmask PTA_SSSE3 (HOST_WIDE_INT_1U << 27); -constexpr wide_int_bitmask PTA_TBM (HOST_WIDE_INT_1U << 28); -constexpr wide_int_bitmask PTA_XOP (HOST_WIDE_INT_1U << 29); -constexpr wide_int_bitmask PTA_AVX2 (HOST_WIDE_INT_1U << 30); -constexpr wide_int_bitmask PTA_BMI2 (HOST_WIDE_INT_1U << 31); -constexpr wide_int_bitmask PTA_RTM (HOST_WIDE_INT_1U << 32); -constexpr wide_int_bitmask PTA_HLE (HOST_WIDE_INT_1U << 33); -constexpr wide_int_bitmask PTA_PRFCHW (HOST_WIDE_INT_1U << 34); -constexpr wide_int_bitmask PTA_RDSEED (HOST_WIDE_INT_1U << 35); -constexpr wide_int_bitmask PTA_ADX (HOST_WIDE_INT_1U << 36); -constexpr wide_int_bitmask PTA_FXSR (HOST_WIDE_INT_1U << 37); -constexpr wide_int_bitmask PTA_XSAVE (HOST_WIDE_INT_1U << 38); -constexpr wide_int_bitmask PTA_XSAVEOPT (HOST_WIDE_INT_1U << 39); -constexpr wide_int_bitmask PTA_AVX512F (HOST_WIDE_INT_1U << 40); -constexpr wide_int_bitmask PTA_AVX512ER (HOST_WIDE_INT_1U << 41); -constexpr wide_int_bitmask PTA_AVX512PF (HOST_WIDE_INT_1U << 42); -constexpr wide_int_bitmask PTA_AVX512CD (HOST_WIDE_INT_1U << 43); -constexpr wide_int_bitmask PTA_NO_TUNE (HOST_WIDE_INT_1U << 44); -constexpr wide_int_bitmask PTA_SHA (HOST_WIDE_INT_1U << 45); -constexpr wide_int_bitmask PTA_PREFETCHWT1 (HOST_WIDE_INT_1U << 46); -constexpr wide_int_bitmask PTA_CLFLUSHOPT (HOST_WIDE_INT_1U << 47); -constexpr wide_int_bitmask PTA_XSAVEC (HOST_WIDE_INT_1U << 48); -constexpr wide_int_bitmask PTA_XSAVES (HOST_WIDE_INT_1U << 49); -constexpr wide_int_bitmask PTA_AVX512DQ (HOST_WIDE_INT_1U << 50); -constexpr wide_int_bitmask PTA_AVX512BW (HOST_WIDE_INT_1U << 51); -constexpr wide_int_bitmask PTA_AVX512VL (HOST_WIDE_INT_1U << 52); -constexpr wide_int_bitmask PTA_AVX512IFMA (HOST_WIDE_INT_1U << 53); -constexpr wide_int_bitmask PTA_AVX512VBMI (HOST_WIDE_INT_1U << 54); -constexpr wide_int_bitmask PTA_CLWB (HOST_WIDE_INT_1U << 55); -constexpr wide_int_bitmask PTA_MWAITX (HOST_WIDE_INT_1U << 56); -constexpr wide_int_bitmask PTA_CLZERO (HOST_WIDE_INT_1U << 57); -constexpr wide_int_bitmask PTA_NO_80387 (HOST_WIDE_INT_1U << 58); -constexpr wide_int_bitmask PTA_PKU (HOST_WIDE_INT_1U << 59); -constexpr wide_int_bitmask PTA_AVX5124VNNIW (HOST_WIDE_INT_1U << 60); -constexpr wide_int_bitmask PTA_AVX5124FMAPS (HOST_WIDE_INT_1U << 61); -constexpr wide_int_bitmask PTA_AVX512VPOPCNTDQ (HOST_WIDE_INT_1U << 62); -constexpr wide_int_bitmask PTA_SGX (HOST_WIDE_INT_1U << 63); -constexpr wide_int_bitmask PTA_AVX512VNNI (0, HOST_WIDE_INT_1U); -constexpr wide_int_bitmask PTA_GFNI (0, HOST_WIDE_INT_1U << 1); -constexpr wide_int_bitmask PTA_VAES (0, HOST_WIDE_INT_1U << 2); -constexpr wide_int_bitmask PTA_AVX512VBMI2 (0, HOST_WIDE_INT_1U << 3); -constexpr wide_int_bitmask PTA_VPCLMULQDQ (0, HOST_WIDE_INT_1U << 4); -constexpr wide_int_bitmask PTA_AVX512BITALG (0, HOST_WIDE_INT_1U << 5); -constexpr wide_int_bitmask PTA_RDPID (0, HOST_WIDE_INT_1U << 6); -constexpr wide_int_bitmask PTA_PCONFIG (0, HOST_WIDE_INT_1U << 7); -constexpr wide_int_bitmask PTA_WBNOINVD (0, HOST_WIDE_INT_1U << 8); -constexpr wide_int_bitmask PTA_AVX512VP2INTERSECT (0, HOST_WIDE_INT_1U << 9); -constexpr wide_int_bitmask PTA_PTWRITE (0, HOST_WIDE_INT_1U << 10); -constexpr wide_int_bitmask PTA_AVX512BF16 (0, HOST_WIDE_INT_1U << 11); -constexpr wide_int_bitmask PTA_WAITPKG (0, HOST_WIDE_INT_1U << 12); -constexpr wide_int_bitmask PTA_MOVDIRI (0, HOST_WIDE_INT_1U << 13); -constexpr wide_int_bitmask PTA_MOVDIR64B (0, HOST_WIDE_INT_1U << 14); -constexpr wide_int_bitmask PTA_ENQCMD (0, HOST_WIDE_INT_1U << 15); -constexpr 
wide_int_bitmask PTA_CLDEMOTE (0, HOST_WIDE_INT_1U << 16); -constexpr wide_int_bitmask PTA_SERIALIZE (0, HOST_WIDE_INT_1U << 17); -constexpr wide_int_bitmask PTA_TSXLDTRK (0, HOST_WIDE_INT_1U << 18); -constexpr wide_int_bitmask PTA_AMX_TILE (0, HOST_WIDE_INT_1U << 19); -constexpr wide_int_bitmask PTA_AMX_INT8 (0, HOST_WIDE_INT_1U << 20); -constexpr wide_int_bitmask PTA_AMX_BF16 (0, HOST_WIDE_INT_1U << 21); -constexpr wide_int_bitmask PTA_UINTR (0, HOST_WIDE_INT_1U << 22); -constexpr wide_int_bitmask PTA_HRESET (0, HOST_WIDE_INT_1U << 23); -constexpr wide_int_bitmask PTA_KL (0, HOST_WIDE_INT_1U << 24); -constexpr wide_int_bitmask PTA_WIDEKL (0, HOST_WIDE_INT_1U << 25); -constexpr wide_int_bitmask PTA_AVXVNNI (0, HOST_WIDE_INT_1U << 26); +enum pta_flag +{ +#define DEF_PTA(NAME) _ ## NAME, +#include "i386-isa.def" +#undef DEF_PTA + END_PTA +}; + +/* wide_int_bitmask can handle only 128 flags. */ +STATIC_ASSERT (END_PTA <= 128); + +#define WIDE_INT_BITMASK_FROM_NTH(N) (N < 64 ? wide_int_bitmask (0, 1ULL << N) \ + : wide_int_bitmask (1ULL << (N - 64), 0)) + +#define DEF_PTA(NAME) constexpr wide_int_bitmask PTA_ ## NAME \ + = WIDE_INT_BITMASK_FROM_NTH ((pta_flag) _ ## NAME); +#include "i386-isa.def" +#undef DEF_PTA constexpr wide_int_bitmask PTA_X86_64_BASELINE = PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR; -- cgit v1.1 From 1751bec027f030515889fcf4baa9c91501aafc85 Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Mon, 8 Mar 2021 14:54:18 +0100 Subject: Overhaul in isa_flags and handling it. gcc/ChangeLog: * config/i386/i386-options.c (TARGET_EXPLICIT_NO_SAHF_P): Define. (SET_TARGET_NO_SAHF): Likewise. (TARGET_EXPLICIT_PREFETCH_SSE_P): Likewise. (SET_TARGET_PREFETCH_SSE): Likewise. (TARGET_EXPLICIT_NO_TUNE_P): Likewise. (SET_TARGET_NO_TUNE): Likewise. (TARGET_EXPLICIT_NO_80387_P): Likewise. (SET_TARGET_NO_80387): Likewise. (DEF_PTA): New. * config/i386/i386.h (TARGET_*): Remove. * opth-gen.awk: Generate new used macros. 
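The new scheme is the classic X-macro idiom: the complete feature list lives in i386-isa.def and is included several times, each time with a different definition of DEF_PTA, so the pta_flag enum, the PTA_* bitmask constants and the per-feature option-setting code below are all generated from that one list.  A rough, self-contained sketch of the idiom follows; "feature.def" and the FOO/BAR/BAZ names are made up for illustration and are not the real i386-isa.def contents.

/* feature.def would contain one line per feature:
     DEF_FEATURE (FOO)
     DEF_FEATURE (BAR)
     DEF_FEATURE (BAZ)  */

/* First expansion: number every feature.  */
enum feature_index
{
#define DEF_FEATURE(NAME) FEATURE_ ## NAME,
#include "feature.def"
#undef DEF_FEATURE
  FEATURE_COUNT
};

/* Second expansion: derive a single-bit mask from each enum value.
   The real code builds a two-word wide_int_bitmask so that more than
   64 flags fit; a plain 64-bit integer keeps the sketch short.  */
#define DEF_FEATURE(NAME) \
  static const unsigned long long FEATURE_MASK_ ## NAME \
    = 1ULL << FEATURE_ ## NAME;
#include "feature.def"
#undef DEF_FEATURE
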
--- gcc/config/i386/i386-options.c | 303 ++++------------------------------------- gcc/config/i386/i386.h | 194 +------------------------- 2 files changed, 36 insertions(+), 461 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c index 65bb0bb..2a12228 100644 --- a/gcc/config/i386/i386-options.c +++ b/gcc/config/i386/i386-options.c @@ -2096,285 +2096,40 @@ ix86_option_override_internal (bool main_args_p, else ix86_tune = PROCESSOR_GENERIC; - if (((processor_alias_table[i].flags & PTA_MMX) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX; - if (((processor_alias_table[i].flags & PTA_3DNOW) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW; - if (((processor_alias_table[i].flags & PTA_3DNOW_A) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A; - if (((processor_alias_table[i].flags & PTA_SSE) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE; - if (((processor_alias_table[i].flags & PTA_SSE2) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2; - if (((processor_alias_table[i].flags & PTA_SSE3) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3; - if (((processor_alias_table[i].flags & PTA_SSSE3) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3; - if (((processor_alias_table[i].flags & PTA_SSE4_1) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1; - if (((processor_alias_table[i].flags & PTA_SSE4_2) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2; - if (((processor_alias_table[i].flags & PTA_AVX) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX; - if (((processor_alias_table[i].flags & PTA_AVX2) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2; - if (((processor_alias_table[i].flags & PTA_FMA) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA; - if (((processor_alias_table[i].flags & PTA_SSE4A) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A; - if (((processor_alias_table[i].flags & PTA_FMA4) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4; - if (((processor_alias_table[i].flags & PTA_XOP) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP; - if (((processor_alias_table[i].flags & PTA_LWP) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP; + /* Enable PTA flags that are enabled by default by a -march option. 
*/ +#define TARGET_EXPLICIT_NO_SAHF_P(opts) (false) +#define SET_TARGET_NO_SAHF(opts) {} +#define TARGET_EXPLICIT_PREFETCH_SSE_P(opts) (false) +#define SET_TARGET_PREFETCH_SSE(opts) {} +#define TARGET_EXPLICIT_NO_TUNE_P(opts) (false) +#define SET_TARGET_NO_TUNE(opts) {} +#define TARGET_EXPLICIT_NO_80387_P(opts) (false) +#define SET_TARGET_NO_80387(opts) {} + +#define DEF_PTA(NAME) \ + if (((processor_alias_table[i].flags & PTA_ ## NAME) != 0) \ + && PTA_ ## NAME != PTA_64BIT \ + && !TARGET_EXPLICIT_ ## NAME ## _P (opts)) \ + SET_TARGET_ ## NAME (opts); +#include "i386-isa.def" +#undef DEF_PTA + + + if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags) + && ((processor_alias_table[i].flags & PTA_NO_SAHF) != 0)) + && !TARGET_EXPLICIT_SAHF_P (opts)) + SET_TARGET_SAHF (opts); + if (((processor_alias_table[i].flags & PTA_ABM) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM; - if (((processor_alias_table[i].flags & PTA_BMI) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI; - if (((processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT; - if (((processor_alias_table[i].flags & PTA_TBM) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM; - if (((processor_alias_table[i].flags & PTA_BMI2) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2; - if (((processor_alias_table[i].flags & PTA_CX16) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA2_CX16)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_CX16; - if (((processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT; - if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags) - && ((processor_alias_table[i].flags & PTA_NO_SAHF) != 0)) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF; - if (((processor_alias_table[i].flags & PTA_MOVBE) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA2_MOVBE)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_MOVBE; - if (((processor_alias_table[i].flags & PTA_AES) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AES)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AES; - if (((processor_alias_table[i].flags & PTA_SHA) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SHA)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SHA; - if (((processor_alias_table[i].flags & PTA_PCLMUL) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL; - if (((processor_alias_table[i].flags & PTA_FSGSBASE) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE; - if (((processor_alias_table[i].flags & PTA_RDRND) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND; - if (((processor_alias_table[i].flags & PTA_F16C) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C; - if (((processor_alias_table[i].flags & PTA_RTM) != 0) - && !(opts->x_ix86_isa_flags_explicit & 
OPTION_MASK_ISA_RTM)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM; - if (((processor_alias_table[i].flags & PTA_HLE) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA2_HLE)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_HLE; - if (((processor_alias_table[i].flags & PTA_PRFCHW) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW; - if (((processor_alias_table[i].flags & PTA_RDSEED) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED; - if (((processor_alias_table[i].flags & PTA_ADX) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX; - if (((processor_alias_table[i].flags & PTA_FXSR) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR; - if (((processor_alias_table[i].flags & PTA_XSAVE) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE; - if (((processor_alias_table[i].flags & PTA_XSAVEOPT) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT; - if (((processor_alias_table[i].flags & PTA_AVX512F) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F; - if (((processor_alias_table[i].flags & PTA_AVX512ER) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER; - if (((processor_alias_table[i].flags & PTA_AVX512PF) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF; - if (((processor_alias_table[i].flags & PTA_AVX512CD) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD; - if (((processor_alias_table[i].flags & PTA_PREFETCHWT1) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PREFETCHWT1)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PREFETCHWT1; - if (((processor_alias_table[i].flags & PTA_CLWB) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLWB)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLWB; - if (((processor_alias_table[i].flags & PTA_CLFLUSHOPT) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLFLUSHOPT)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLFLUSHOPT; - if (((processor_alias_table[i].flags & PTA_CLZERO) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA2_CLZERO)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_CLZERO; - if (((processor_alias_table[i].flags & PTA_XSAVEC) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEC)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEC; - if (((processor_alias_table[i].flags & PTA_XSAVES) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVES)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVES; - if (((processor_alias_table[i].flags & PTA_AVX512DQ) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512DQ)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512DQ; - if (((processor_alias_table[i].flags & PTA_AVX512BW) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512BW)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW; - if (((processor_alias_table[i].flags & PTA_AVX512VL) != 0) - && 
!(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VL)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VL; - if (((processor_alias_table[i].flags & PTA_AVX512VBMI) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VBMI)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI; - if (((processor_alias_table[i].flags & PTA_AVX512IFMA) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512IFMA)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512IFMA; - if (((processor_alias_table[i].flags & PTA_AVX512VNNI) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VNNI)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VNNI; - if (((processor_alias_table[i].flags & PTA_GFNI) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_GFNI)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_GFNI; - if (((processor_alias_table[i].flags & PTA_AVX512VBMI2) != 0) - && !(opts->x_ix86_isa_flags_explicit - & OPTION_MASK_ISA_AVX512VBMI2)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI2; - if (((processor_alias_table[i].flags & PTA_VPCLMULQDQ) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_VPCLMULQDQ)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_VPCLMULQDQ; - if (((processor_alias_table[i].flags & PTA_AVX512BITALG) != 0) - && !(opts->x_ix86_isa_flags_explicit - & OPTION_MASK_ISA_AVX512BITALG)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BITALG; - - if (((processor_alias_table[i].flags & PTA_AVX512VP2INTERSECT) != 0) - && !(opts->x_ix86_isa_flags2_explicit - & OPTION_MASK_ISA2_AVX512VP2INTERSECT)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_AVX512VP2INTERSECT; - if (((processor_alias_table[i].flags & PTA_AVX5124VNNIW) != 0) - && !(opts->x_ix86_isa_flags2_explicit - & OPTION_MASK_ISA2_AVX5124VNNIW)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_AVX5124VNNIW; - if (((processor_alias_table[i].flags & PTA_AVX5124FMAPS) != 0) - && !(opts->x_ix86_isa_flags2_explicit - & OPTION_MASK_ISA2_AVX5124FMAPS)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_AVX5124FMAPS; - if (((processor_alias_table[i].flags & PTA_AVX512VPOPCNTDQ) != 0) - && !(opts->x_ix86_isa_flags_explicit - & OPTION_MASK_ISA_AVX512VPOPCNTDQ)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VPOPCNTDQ; - if (((processor_alias_table[i].flags & PTA_AVX512BF16) != 0) - && !(opts->x_ix86_isa_flags2_explicit - & OPTION_MASK_ISA2_AVX512BF16)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_AVX512BF16; - if (((processor_alias_table[i].flags & PTA_AMX_TILE) != 0) - && !(opts->x_ix86_isa_flags2_explicit - & OPTION_MASK_ISA2_AMX_TILE)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_AMX_TILE; - if (((processor_alias_table[i].flags & PTA_AMX_INT8) != 0) - && !(opts->x_ix86_isa_flags2_explicit - & OPTION_MASK_ISA2_AMX_INT8)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_AMX_INT8; - if (((processor_alias_table[i].flags & PTA_AMX_BF16) != 0) - && !(opts->x_ix86_isa_flags2_explicit - & OPTION_MASK_ISA2_AMX_BF16)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_AMX_BF16; - if (((processor_alias_table[i].flags & PTA_AVXVNNI) != 0) - && !(opts->x_ix86_isa_flags2_explicit - & OPTION_MASK_ISA2_AVXVNNI)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_AVXVNNI; - if (((processor_alias_table[i].flags & PTA_MOVDIRI) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVDIRI)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MOVDIRI; - if (((processor_alias_table[i].flags & PTA_MOVDIR64B) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA2_MOVDIR64B)) - opts->x_ix86_isa_flags2 |= 
OPTION_MASK_ISA2_MOVDIR64B; - if (((processor_alias_table[i].flags & PTA_SGX) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA2_SGX)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_SGX; - if (((processor_alias_table[i].flags & PTA_VAES) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA2_VAES)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_VAES; - if (((processor_alias_table[i].flags & PTA_RDPID) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA2_RDPID)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_RDPID; - if (((processor_alias_table[i].flags & PTA_PCONFIG) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA2_PCONFIG)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_PCONFIG; - if (((processor_alias_table[i].flags & PTA_WBNOINVD) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA2_WBNOINVD)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_WBNOINVD; - if (((processor_alias_table[i].flags & PTA_PTWRITE) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA2_PTWRITE)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_PTWRITE; - if (((processor_alias_table[i].flags & PTA_WAITPKG) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA2_WAITPKG)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_WAITPKG; - if (((processor_alias_table[i].flags & PTA_ENQCMD) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA2_ENQCMD)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_ENQCMD; - if (((processor_alias_table[i].flags & PTA_CLDEMOTE) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA2_CLDEMOTE)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_CLDEMOTE; - if (((processor_alias_table[i].flags & PTA_SERIALIZE) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA2_SERIALIZE)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_SERIALIZE; - if (((processor_alias_table[i].flags & PTA_TSXLDTRK) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA2_TSXLDTRK)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_TSXLDTRK; - if (((processor_alias_table[i].flags & PTA_KL) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA2_KL)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_KL; - if (((processor_alias_table[i].flags & PTA_WIDEKL) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA2_WIDEKL)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_WIDEKL; + && !TARGET_EXPLICIT_ABM_P (opts)) + { + SET_TARGET_LZCNT (opts); + SET_TARGET_POPCNT (opts); + } if ((processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)) != 0) ix86_prefetch_sse = true; - if (((processor_alias_table[i].flags & PTA_MWAITX) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA2_MWAITX)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_MWAITX; - if (((processor_alias_table[i].flags & PTA_PKU) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PKU)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PKU; - if (((processor_alias_table[i].flags & PTA_UINTR) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA2_UINTR)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_UINTR; - if (((processor_alias_table[i].flags & PTA_HRESET) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA2_HRESET)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_HRESET; - /* Don't enable x87 instructions if only general registers are allowed by target("general-regs-only") function attribute or diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index d2e6751..177c2b6 100644 --- a/gcc/config/i386/i386.h +++ 
b/gcc/config/i386/i386.h @@ -39,194 +39,14 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see /* Redefines for option macros. */ -#define TARGET_64BIT TARGET_ISA_64BIT -#define TARGET_64BIT_P(x) TARGET_ISA_64BIT_P(x) -#define TARGET_MMX TARGET_ISA_MMX -#define TARGET_MMX_P(x) TARGET_ISA_MMX_P(x) -#define TARGET_3DNOW TARGET_ISA_3DNOW -#define TARGET_3DNOW_P(x) TARGET_ISA_3DNOW_P(x) -#define TARGET_3DNOW_A TARGET_ISA_3DNOW_A -#define TARGET_3DNOW_A_P(x) TARGET_ISA_3DNOW_A_P(x) -#define TARGET_SSE TARGET_ISA_SSE -#define TARGET_SSE_P(x) TARGET_ISA_SSE_P(x) -#define TARGET_SSE2 TARGET_ISA_SSE2 -#define TARGET_SSE2_P(x) TARGET_ISA_SSE2_P(x) -#define TARGET_SSE3 TARGET_ISA_SSE3 -#define TARGET_SSE3_P(x) TARGET_ISA_SSE3_P(x) -#define TARGET_SSSE3 TARGET_ISA_SSSE3 -#define TARGET_SSSE3_P(x) TARGET_ISA_SSSE3_P(x) -#define TARGET_SSE4_1 TARGET_ISA_SSE4_1 -#define TARGET_SSE4_1_P(x) TARGET_ISA_SSE4_1_P(x) -#define TARGET_SSE4_2 TARGET_ISA_SSE4_2 -#define TARGET_SSE4_2_P(x) TARGET_ISA_SSE4_2_P(x) -#define TARGET_AVX TARGET_ISA_AVX -#define TARGET_AVX_P(x) TARGET_ISA_AVX_P(x) -#define TARGET_AVX2 TARGET_ISA_AVX2 -#define TARGET_AVX2_P(x) TARGET_ISA_AVX2_P(x) -#define TARGET_AVX512F TARGET_ISA_AVX512F -#define TARGET_AVX512F_P(x) TARGET_ISA_AVX512F_P(x) -#define TARGET_AVX512PF TARGET_ISA_AVX512PF -#define TARGET_AVX512PF_P(x) TARGET_ISA_AVX512PF_P(x) -#define TARGET_AVX512ER TARGET_ISA_AVX512ER -#define TARGET_AVX512ER_P(x) TARGET_ISA_AVX512ER_P(x) -#define TARGET_AVX512CD TARGET_ISA_AVX512CD -#define TARGET_AVX512CD_P(x) TARGET_ISA_AVX512CD_P(x) -#define TARGET_AVX512DQ TARGET_ISA_AVX512DQ -#define TARGET_AVX512DQ_P(x) TARGET_ISA_AVX512DQ_P(x) -#define TARGET_AVX512BW TARGET_ISA_AVX512BW -#define TARGET_AVX512BW_P(x) TARGET_ISA_AVX512BW_P(x) -#define TARGET_AVX512VL TARGET_ISA_AVX512VL -#define TARGET_AVX512VL_P(x) TARGET_ISA_AVX512VL_P(x) -#define TARGET_AVX512VBMI TARGET_ISA_AVX512VBMI -#define TARGET_AVX512VBMI_P(x) TARGET_ISA_AVX512VBMI_P(x) -#define TARGET_AVX512IFMA TARGET_ISA_AVX512IFMA -#define TARGET_AVX512IFMA_P(x) TARGET_ISA_AVX512IFMA_P(x) -#define TARGET_AVX5124FMAPS TARGET_ISA2_AVX5124FMAPS -#define TARGET_AVX5124FMAPS_P(x) TARGET_ISA2_AVX5124FMAPS_P(x) -#define TARGET_AVX5124VNNIW TARGET_ISA2_AVX5124VNNIW -#define TARGET_AVX5124VNNIW_P(x) TARGET_ISA2_AVX5124VNNIW_P(x) -#define TARGET_AVX512VBMI2 TARGET_ISA_AVX512VBMI2 -#define TARGET_AVX512VBMI2_P(x) TARGET_ISA_AVX512VBMI2_P(x) -#define TARGET_AVX512VPOPCNTDQ TARGET_ISA_AVX512VPOPCNTDQ -#define TARGET_AVX512VPOPCNTDQ_P(x) TARGET_ISA_AVX512VPOPCNTDQ_P(x) -#define TARGET_AVX512VNNI TARGET_ISA_AVX512VNNI -#define TARGET_AVX512VNNI_P(x) TARGET_ISA_AVX512VNNI_P(x) -#define TARGET_AVX512BITALG TARGET_ISA_AVX512BITALG -#define TARGET_AVX512BITALG_P(x) TARGET_ISA_AVX512BITALG_P(x) -#define TARGET_AVX512VP2INTERSECT TARGET_ISA2_AVX512VP2INTERSECT -#define TARGET_AVX512VP2INTERSECT_P(x) TARGET_ISA2_AVX512VP2INTERSECT_P(x) -#define TARGET_FMA TARGET_ISA_FMA -#define TARGET_FMA_P(x) TARGET_ISA_FMA_P(x) -#define TARGET_SSE4A TARGET_ISA_SSE4A -#define TARGET_SSE4A_P(x) TARGET_ISA_SSE4A_P(x) -#define TARGET_FMA4 TARGET_ISA_FMA4 -#define TARGET_FMA4_P(x) TARGET_ISA_FMA4_P(x) -#define TARGET_XOP TARGET_ISA_XOP -#define TARGET_XOP_P(x) TARGET_ISA_XOP_P(x) -#define TARGET_LWP TARGET_ISA_LWP -#define TARGET_LWP_P(x) TARGET_ISA_LWP_P(x) -#define TARGET_ABM TARGET_ISA_ABM -#define TARGET_ABM_P(x) TARGET_ISA_ABM_P(x) -#define TARGET_PCONFIG TARGET_ISA2_PCONFIG -#define TARGET_PCONFIG_P(x) TARGET_ISA2_PCONFIG_P(x) -#define TARGET_WBNOINVD 
TARGET_ISA2_WBNOINVD -#define TARGET_WBNOINVD_P(x) TARGET_ISA2_WBNOINVD_P(x) -#define TARGET_SGX TARGET_ISA2_SGX -#define TARGET_SGX_P(x) TARGET_ISA2_SGX_P(x) -#define TARGET_RDPID TARGET_ISA2_RDPID -#define TARGET_RDPID_P(x) TARGET_ISA2_RDPID_P(x) -#define TARGET_GFNI TARGET_ISA_GFNI -#define TARGET_GFNI_P(x) TARGET_ISA_GFNI_P(x) -#define TARGET_VAES TARGET_ISA2_VAES -#define TARGET_VAES_P(x) TARGET_ISA2_VAES_P(x) -#define TARGET_VPCLMULQDQ TARGET_ISA_VPCLMULQDQ -#define TARGET_VPCLMULQDQ_P(x) TARGET_ISA_VPCLMULQDQ_P(x) -#define TARGET_BMI TARGET_ISA_BMI -#define TARGET_BMI_P(x) TARGET_ISA_BMI_P(x) -#define TARGET_BMI2 TARGET_ISA_BMI2 -#define TARGET_BMI2_P(x) TARGET_ISA_BMI2_P(x) -#define TARGET_LZCNT TARGET_ISA_LZCNT -#define TARGET_LZCNT_P(x) TARGET_ISA_LZCNT_P(x) -#define TARGET_TBM TARGET_ISA_TBM -#define TARGET_TBM_P(x) TARGET_ISA_TBM_P(x) -#define TARGET_POPCNT TARGET_ISA_POPCNT -#define TARGET_POPCNT_P(x) TARGET_ISA_POPCNT_P(x) -#define TARGET_SAHF TARGET_ISA_SAHF -#define TARGET_SAHF_P(x) TARGET_ISA_SAHF_P(x) -#define TARGET_MOVBE TARGET_ISA2_MOVBE -#define TARGET_MOVBE_P(x) TARGET_ISA2_MOVBE_P(x) -#define TARGET_CRC32 TARGET_ISA_CRC32 -#define TARGET_CRC32_P(x) TARGET_ISA_CRC32_P(x) -#define TARGET_AES TARGET_ISA_AES -#define TARGET_AES_P(x) TARGET_ISA_AES_P(x) -#define TARGET_SHA TARGET_ISA_SHA -#define TARGET_SHA_P(x) TARGET_ISA_SHA_P(x) -#define TARGET_CLFLUSHOPT TARGET_ISA_CLFLUSHOPT -#define TARGET_CLFLUSHOPT_P(x) TARGET_ISA_CLFLUSHOPT_P(x) -#define TARGET_CLZERO TARGET_ISA2_CLZERO -#define TARGET_CLZERO_P(x) TARGET_ISA2_CLZERO_P(x) -#define TARGET_XSAVEC TARGET_ISA_XSAVEC -#define TARGET_XSAVEC_P(x) TARGET_ISA_XSAVEC_P(x) -#define TARGET_XSAVES TARGET_ISA_XSAVES -#define TARGET_XSAVES_P(x) TARGET_ISA_XSAVES_P(x) -#define TARGET_PCLMUL TARGET_ISA_PCLMUL -#define TARGET_PCLMUL_P(x) TARGET_ISA_PCLMUL_P(x) -#define TARGET_CMPXCHG16B TARGET_ISA2_CX16 -#define TARGET_CMPXCHG16B_P(x) TARGET_ISA2_CX16_P(x) -#define TARGET_FSGSBASE TARGET_ISA_FSGSBASE -#define TARGET_FSGSBASE_P(x) TARGET_ISA_FSGSBASE_P(x) -#define TARGET_RDRND TARGET_ISA_RDRND -#define TARGET_RDRND_P(x) TARGET_ISA_RDRND_P(x) -#define TARGET_F16C TARGET_ISA_F16C -#define TARGET_F16C_P(x) TARGET_ISA_F16C_P(x) -#define TARGET_RTM TARGET_ISA_RTM -#define TARGET_RTM_P(x) TARGET_ISA_RTM_P(x) -#define TARGET_HLE TARGET_ISA2_HLE -#define TARGET_HLE_P(x) TARGET_ISA2_HLE_P(x) -#define TARGET_RDSEED TARGET_ISA_RDSEED -#define TARGET_RDSEED_P(x) TARGET_ISA_RDSEED_P(x) -#define TARGET_PRFCHW TARGET_ISA_PRFCHW -#define TARGET_PRFCHW_P(x) TARGET_ISA_PRFCHW_P(x) -#define TARGET_ADX TARGET_ISA_ADX -#define TARGET_ADX_P(x) TARGET_ISA_ADX_P(x) -#define TARGET_FXSR TARGET_ISA_FXSR -#define TARGET_FXSR_P(x) TARGET_ISA_FXSR_P(x) -#define TARGET_XSAVE TARGET_ISA_XSAVE -#define TARGET_XSAVE_P(x) TARGET_ISA_XSAVE_P(x) -#define TARGET_XSAVEOPT TARGET_ISA_XSAVEOPT -#define TARGET_XSAVEOPT_P(x) TARGET_ISA_XSAVEOPT_P(x) -#define TARGET_PREFETCHWT1 TARGET_ISA_PREFETCHWT1 -#define TARGET_PREFETCHWT1_P(x) TARGET_ISA_PREFETCHWT1_P(x) -#define TARGET_CLWB TARGET_ISA_CLWB -#define TARGET_CLWB_P(x) TARGET_ISA_CLWB_P(x) -#define TARGET_MWAITX TARGET_ISA2_MWAITX -#define TARGET_MWAITX_P(x) TARGET_ISA2_MWAITX_P(x) -#define TARGET_MWAIT TARGET_ISA2_MWAIT -#define TARGET_MWAIT_P(x) TARGET_ISA2_MWAIT_P(x) -#define TARGET_PKU TARGET_ISA_PKU -#define TARGET_PKU_P(x) TARGET_ISA_PKU_P(x) -#define TARGET_SHSTK TARGET_ISA_SHSTK -#define TARGET_SHSTK_P(x) TARGET_ISA_SHSTK_P(x) -#define TARGET_MOVDIRI TARGET_ISA_MOVDIRI -#define TARGET_MOVDIRI_P(x) 
TARGET_ISA_MOVDIRI_P(x) -#define TARGET_MOVDIR64B TARGET_ISA2_MOVDIR64B -#define TARGET_MOVDIR64B_P(x) TARGET_ISA2_MOVDIR64B_P(x) -#define TARGET_WAITPKG TARGET_ISA2_WAITPKG -#define TARGET_WAITPKG_P(x) TARGET_ISA2_WAITPKG_P(x) -#define TARGET_CLDEMOTE TARGET_ISA2_CLDEMOTE -#define TARGET_CLDEMOTE_P(x) TARGET_ISA2_CLDEMOTE_P(x) -#define TARGET_PTWRITE TARGET_ISA2_PTWRITE -#define TARGET_PTWRITE_P(x) TARGET_ISA2_PTWRITE_P(x) -#define TARGET_AVX512BF16 TARGET_ISA2_AVX512BF16 -#define TARGET_AVX512BF16_P(x) TARGET_ISA2_AVX512BF16_P(x) -#define TARGET_ENQCMD TARGET_ISA2_ENQCMD -#define TARGET_ENQCMD_P(x) TARGET_ISA2_ENQCMD_P(x) -#define TARGET_SERIALIZE TARGET_ISA2_SERIALIZE -#define TARGET_SERIALIZE_P(x) TARGET_ISA2_SERIALIZE_P(x) -#define TARGET_TSXLDTRK TARGET_ISA2_TSXLDTRK -#define TARGET_TSXLDTRK_P(x) TARGET_ISA2_TSXLDTRK_P(x) -#define TARGET_AMX_TILE TARGET_ISA2_AMX_TILE -#define TARGET_AMX_TILE_P(x) TARGET_ISA2_AMX_TILE(x) -#define TARGET_AMX_INT8 TARGET_ISA2_AMX_INT8 -#define TARGET_AMX_INT8_P(x) TARGET_ISA2_AMX_INT8(x) -#define TARGET_AMX_BF16 TARGET_ISA2_AMX_BF16 -#define TARGET_AMX_BF16_P(x) TARGET_ISA2_AMX_BF16(x) -#define TARGET_UINTR TARGET_ISA2_UINTR -#define TARGET_UINTR_P(x) TARGET_ISA2_UINTR_P(x) -#define TARGET_HRESET TARGET_ISA2_HRESET -#define TARGET_HRESET_P(x) TARGET_ISA2_HRESET_P(x) -#define TARGET_KL TARGET_ISA2_KL -#define TARGET_KL_P(x) TARGET_ISA2_KL_P(x) -#define TARGET_WIDEKL TARGET_ISA2_WIDEKL -#define TARGET_WIDEKL_P(x) TARGET_ISA2_WIDEKL_P(x) -#define TARGET_AVXVNNI TARGET_ISA2_AVXVNNI -#define TARGET_AVXVNNI_P(x) TARGET_ISA2_AVXVNNI_P(x) - -#define TARGET_LP64 TARGET_ABI_64 +#define TARGET_CMPXCHG16B TARGET_CX16 +#define TARGET_CMPXCHG16B_P(x) TARGET_CX16_P(x) + +#define TARGET_LP64 TARGET_ABI_64 #define TARGET_LP64_P(x) TARGET_ABI_64_P(x) -#define TARGET_X32 TARGET_ABI_X32 -#define TARGET_X32_P(x) TARGET_ABI_X32_P(x) -#define TARGET_16BIT TARGET_CODE16 +#define TARGET_X32 TARGET_ABI_X32 +#define TARGET_X32_P(x) TARGET_ABI_X32_P(x) +#define TARGET_16BIT TARGET_CODE16 #define TARGET_16BIT_P(x) TARGET_CODE16_P(x) #define TARGET_MMX_WITH_SSE (TARGET_64BIT && TARGET_SSE2) -- cgit v1.1 From f23881fcf081a6edd538d6d54fa0068d716973d7 Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Tue, 9 Mar 2021 14:56:54 +0100 Subject: Remove TARGET_foo (ix86_tune == PROCESSOR_foo) macros. gcc/ChangeLog: * config/i386/i386-expand.c (decide_alignment): Use newly named macro TARGET_CPU_P. * config/i386/i386.c (ix86_decompose_address): Likewise. (ix86_address_cost): Likewise. (ix86_lea_outperforms): Likewise. (ix86_avoid_lea_for_addr): Likewise. (ix86_add_stmt_cost): Likewise. * config/i386/i386.h (TARGET_*): Remove. (TARGET_CPU_P): New macro. * config/i386/i386.md: Use newly named macro TARGET_CPU_P. * config/i386/x86-tune-sched-atom.c (do_reorder_for_imul): Likewise. (swap_top_of_ready_list): Likewise. (ix86_atom_sched_reorder): Likewise. * config/i386/x86-tune-sched-bd.c (ix86_bd_has_dispatch): Likewise. * config/i386/x86-tune-sched.c (ix86_adjust_cost): Likewise. 
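The replacement is a single token-pasting test against ix86_tune, so one parameterized macro stands in for the long list of per-processor TARGET_* macros.  A tiny self-contained sketch of the pattern; the three-entry enum is illustrative, not the real processor_type list.

enum processor_type { PROCESSOR_K6, PROCESSOR_ATHLON, PROCESSOR_GENERIC };
static enum processor_type ix86_tune = PROCESSOR_GENERIC;

#define TARGET_CPU_P(CPU) (ix86_tune == PROCESSOR_ ## CPU)

static int
tuning_for_k6 (void)
{
  return TARGET_CPU_P (K6);	/* expands to (ix86_tune == PROCESSOR_K6) */
}
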
--- gcc/config/i386/i386-expand.c | 2 +- gcc/config/i386/i386.c | 16 ++++++------ gcc/config/i386/i386.h | 46 +---------------------------------- gcc/config/i386/i386.md | 8 +++--- gcc/config/i386/x86-tune-sched-atom.c | 7 +++--- gcc/config/i386/x86-tune-sched-bd.c | 5 ++-- gcc/config/i386/x86-tune-sched.c | 2 +- 7 files changed, 23 insertions(+), 63 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index dda08ff..166c23d 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -7055,7 +7055,7 @@ decide_alignment (int align, desired_align = GET_MODE_SIZE (move_mode); /* PentiumPro has special logic triggering for 8 byte aligned blocks. copying whole cacheline at once. */ - if (TARGET_PENTIUMPRO + if (TARGET_CPU_P (PENTIUMPRO) && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte)) desired_align = 8; diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 7c41302..c302bc2 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -10179,7 +10179,7 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) Avoid this by transforming to [%esi+0]. Reload calls address legitimization without cfun defined, so we need to test cfun for being non-NULL. */ - if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun) + if (TARGET_CPU_P (K6) && cfun && optimize_function_for_speed_p (cfun) && base_reg && !index_reg && !disp && REGNO (base_reg) == SI_REG) disp = const0_rtx; @@ -10257,7 +10257,7 @@ ix86_address_cost (rtx x, machine_mode, addr_space_t, bool) memory address, but I don't have AMD-K6 machine handy to check this theory. */ - if (TARGET_K6 + if (TARGET_CPU_P (K6) && ((!parts.disp && parts.base && parts.index && parts.scale != 1) || (parts.disp && !parts.base && parts.index && parts.scale != 1) || (!parts.disp && parts.base && parts.index && parts.scale == 1))) @@ -14940,7 +14940,7 @@ ix86_lea_outperforms (rtx_insn *insn, unsigned int regno0, unsigned int regno1, /* For Atom processors newer than Bonnell, if using a 2-source or 3-source LEA for non-destructive destination purposes, or due to wanting ability to use SCALE, the use of LEA is justified. */ - if (!TARGET_BONNELL) + if (!TARGET_CPU_P (BONNELL)) { if (has_scale) return true; @@ -15082,7 +15082,7 @@ ix86_avoid_lea_for_addr (rtx_insn *insn, rtx operands[]) than lea for most processors. For the processors like BONNELL, if the destination register of LEA holds an actual address which will be used soon, LEA is better and otherwise ADD is better. */ - if (!TARGET_BONNELL + if (!TARGET_CPU_P (BONNELL) && parts.scale == 1 && (!parts.disp || parts.disp == const0_rtx) && (regno0 == regno1 || regno0 == regno2)) @@ -22387,7 +22387,7 @@ ix86_add_stmt_cost (class vec_info *vinfo, void *data, int count, stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); /* Penalize DFmode vector operations for Bonnell. */ - if (TARGET_BONNELL && kind == vector_stmt + if (TARGET_CPU_P (BONNELL) && kind == vector_stmt && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode) stmt_cost *= 5; /* FIXME: The value here is arbitrary. */ @@ -22403,8 +22403,10 @@ ix86_add_stmt_cost (class vec_info *vinfo, void *data, int count, /* We need to multiply all vector stmt cost by 1.7 (estimated cost) for Silvermont as it has out of order integer pipeline and can execute 2 scalar instruction per tick, but has in order SIMD pipeline. 
*/ - if ((TARGET_SILVERMONT || TARGET_GOLDMONT || TARGET_GOLDMONT_PLUS - || TARGET_TREMONT || TARGET_INTEL) && stmt_info && stmt_info->stmt) + if ((TARGET_CPU_P (SILVERMONT) || TARGET_CPU_P (GOLDMONT) + || TARGET_CPU_P (GOLDMONT_PLUS) || TARGET_CPU_P (TREMONT) + || TARGET_CPU_P (INTEL)) + && stmt_info && stmt_info->stmt) { tree lhs_op = gimple_get_lhs (stmt_info->stmt); if (lhs_op && TREE_CODE (TREE_TYPE (lhs_op)) == INTEGER_TYPE) diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 177c2b6..96b46ba 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -263,51 +263,7 @@ extern const struct processor_costs ix86_size_cost; #define HAS_LONG_COND_BRANCH 1 #define HAS_LONG_UNCOND_BRANCH 1 -#define TARGET_386 (ix86_tune == PROCESSOR_I386) -#define TARGET_486 (ix86_tune == PROCESSOR_I486) -#define TARGET_PENTIUM (ix86_tune == PROCESSOR_PENTIUM) -#define TARGET_PENTIUMPRO (ix86_tune == PROCESSOR_PENTIUMPRO) -#define TARGET_GEODE (ix86_tune == PROCESSOR_GEODE) -#define TARGET_K6 (ix86_tune == PROCESSOR_K6) -#define TARGET_ATHLON (ix86_tune == PROCESSOR_ATHLON) -#define TARGET_PENTIUM4 (ix86_tune == PROCESSOR_PENTIUM4) -#define TARGET_K8 (ix86_tune == PROCESSOR_K8) -#define TARGET_ATHLON_K8 (TARGET_K8 || TARGET_ATHLON) -#define TARGET_NOCONA (ix86_tune == PROCESSOR_NOCONA) -#define TARGET_CORE2 (ix86_tune == PROCESSOR_CORE2) -#define TARGET_NEHALEM (ix86_tune == PROCESSOR_NEHALEM) -#define TARGET_SANDYBRIDGE (ix86_tune == PROCESSOR_SANDYBRIDGE) -#define TARGET_HASWELL (ix86_tune == PROCESSOR_HASWELL) -#define TARGET_BONNELL (ix86_tune == PROCESSOR_BONNELL) -#define TARGET_SILVERMONT (ix86_tune == PROCESSOR_SILVERMONT) -#define TARGET_GOLDMONT (ix86_tune == PROCESSOR_GOLDMONT) -#define TARGET_GOLDMONT_PLUS (ix86_tune == PROCESSOR_GOLDMONT_PLUS) -#define TARGET_TREMONT (ix86_tune == PROCESSOR_TREMONT) -#define TARGET_KNL (ix86_tune == PROCESSOR_KNL) -#define TARGET_KNM (ix86_tune == PROCESSOR_KNM) -#define TARGET_SKYLAKE (ix86_tune == PROCESSOR_SKYLAKE) -#define TARGET_SKYLAKE_AVX512 (ix86_tune == PROCESSOR_SKYLAKE_AVX512) -#define TARGET_CANNONLAKE (ix86_tune == PROCESSOR_CANNONLAKE) -#define TARGET_ICELAKE_CLIENT (ix86_tune == PROCESSOR_ICELAKE_CLIENT) -#define TARGET_ICELAKE_SERVER (ix86_tune == PROCESSOR_ICELAKE_SERVER) -#define TARGET_CASCADELAKE (ix86_tune == PROCESSOR_CASCADELAKE) -#define TARGET_TIGERLAKE (ix86_tune == PROCESSOR_TIGERLAKE) -#define TARGET_COOPERLAKE (ix86_tune == PROCESSOR_COOPERLAKE) -#define TARGET_SAPPHIRERAPIDS (ix86_tune == PROCESSOR_SAPPHIRERAPIDS) -#define TARGET_ALDERLAKE (ix86_tune == PROCESSOR_ALDERLAKE) -#define TARGET_ROCKETLAKE (ix86_tune == PROCESSOR_ROCKETLAKE) -#define TARGET_INTEL (ix86_tune == PROCESSOR_INTEL) -#define TARGET_GENERIC (ix86_tune == PROCESSOR_GENERIC) -#define TARGET_AMDFAM10 (ix86_tune == PROCESSOR_AMDFAM10) -#define TARGET_BDVER1 (ix86_tune == PROCESSOR_BDVER1) -#define TARGET_BDVER2 (ix86_tune == PROCESSOR_BDVER2) -#define TARGET_BDVER3 (ix86_tune == PROCESSOR_BDVER3) -#define TARGET_BDVER4 (ix86_tune == PROCESSOR_BDVER4) -#define TARGET_BTVER1 (ix86_tune == PROCESSOR_BTVER1) -#define TARGET_BTVER2 (ix86_tune == PROCESSOR_BTVER2) -#define TARGET_ZNVER1 (ix86_tune == PROCESSOR_ZNVER1) -#define TARGET_ZNVER2 (ix86_tune == PROCESSOR_ZNVER2) -#define TARGET_ZNVER3 (ix86_tune == PROCESSOR_ZNVER3) +#define TARGET_CPU_P(CPU) (ix86_tune == PROCESSOR_ ## CPU) /* Feature tests against the various tunings. 
*/ enum ix86_tune_indices { diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 1f1d74e..7eb7176 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -14302,13 +14302,13 @@ return "tzcnt{}\t{%1, %0|%0, %1}"; else if (optimize_function_for_size_p (cfun)) ; - else if (TARGET_GENERIC) + else if (TARGET_CPU_P (GENERIC)) /* tzcnt expands to 'rep bsf' and we can use it even if !TARGET_BMI. */ return "rep%; bsf{}\t{%1, %0|%0, %1}"; return "bsf{}\t{%1, %0|%0, %1}"; } - "(TARGET_BMI || TARGET_GENERIC) + "(TARGET_BMI || TARGET_CPU_P (GENERIC)) && TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed && optimize_function_for_speed_p (cfun) && !reg_mentioned_p (operands[0], operands[1])" @@ -14324,7 +14324,7 @@ (if_then_else (ior (match_test "TARGET_BMI") (and (not (match_test "optimize_function_for_size_p (cfun)")) - (match_test "TARGET_GENERIC"))) + (match_test "TARGET_CPU_P (GENERIC)"))) (const_string "1") (const_string "0"))) (set_attr "mode" "")]) @@ -14343,7 +14343,7 @@ { if (TARGET_BMI) return "tzcnt{}\t{%1, %0|%0, %1}"; - else if (TARGET_GENERIC) + else if (TARGET_CPU_P (GENERIC)) /* tzcnt expands to 'rep bsf' and we can use it even if !TARGET_BMI. */ return "rep%; bsf{}\t{%1, %0|%0, %1}"; else diff --git a/gcc/config/i386/x86-tune-sched-atom.c b/gcc/config/i386/x86-tune-sched-atom.c index cfb0c65..1611436 100644 --- a/gcc/config/i386/x86-tune-sched-atom.c +++ b/gcc/config/i386/x86-tune-sched-atom.c @@ -51,7 +51,7 @@ do_reorder_for_imul (rtx_insn **ready, int n_ready) int index = -1; int i; - if (!TARGET_BONNELL) + if (!TARGET_CPU_P (BONNELL)) return index; /* Check that IMUL instruction is on the top of ready list. */ @@ -131,7 +131,7 @@ swap_top_of_ready_list (rtx_insn **ready, int n_ready) int clock2 = -1; #define INSN_TICK(INSN) (HID (INSN)->tick) - if (!TARGET_SILVERMONT && !TARGET_INTEL) + if (!TARGET_CPU_P (SILVERMONT) && !TARGET_CPU_P (INTEL)) return false; if (!NONDEBUG_INSN_P (top)) @@ -204,7 +204,8 @@ ix86_atom_sched_reorder (FILE *dump, int sched_verbose, rtx_insn **ready, issue_rate = ix86_issue_rate (); /* Do reodering for BONNELL/SILVERMONT only. */ - if (!TARGET_BONNELL && !TARGET_SILVERMONT && !TARGET_INTEL) + if (!TARGET_CPU_P (BONNELL) && !TARGET_CPU_P (SILVERMONT) + && !TARGET_CPU_P (INTEL)) return issue_rate; /* Nothing to do if ready list contains only 1 instruction. */ diff --git a/gcc/config/i386/x86-tune-sched-bd.c b/gcc/config/i386/x86-tune-sched-bd.c index d696643..ad0edf7 100644 --- a/gcc/config/i386/x86-tune-sched-bd.c +++ b/gcc/config/i386/x86-tune-sched-bd.c @@ -800,8 +800,9 @@ bool ix86_bd_has_dispatch (rtx_insn *insn, int action) { /* Current implementation of dispatch scheduler models buldozer only. */ - if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3 - || TARGET_BDVER4) && flag_dispatch_scheduler) + if ((TARGET_CPU_P (BDVER1) || TARGET_CPU_P (BDVER2) + || TARGET_CPU_P (BDVER3) || TARGET_CPU_P (BDVER4)) + && flag_dispatch_scheduler) switch (action) { default: diff --git a/gcc/config/i386/x86-tune-sched.c b/gcc/config/i386/x86-tune-sched.c index 2bcc64b..6d8bca9 100644 --- a/gcc/config/i386/x86-tune-sched.c +++ b/gcc/config/i386/x86-tune-sched.c @@ -386,7 +386,7 @@ ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost, if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN) loadcost = 3; else - loadcost = TARGET_ATHLON ? 2 : 0; + loadcost = TARGET_CPU_P (ATHLON) ? 
2 : 0; if (cost >= loadcost) cost -= loadcost; -- cgit v1.1 From 8f48ec0946abdc036d74a157623b45fddd864a72 Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Mon, 15 Mar 2021 09:45:41 +0100 Subject: Fix clang warnings. gcc/ChangeLog: * config/i386/i386.c: Remove superfluous || TARGET_MACHO which remains to be '(... || 0)' and clang complains about it. * dwarf2out.c (AT_vms_delta): Declare conditionally. (add_AT_vms_delta): Likewise. * tree.c (fld_simplified_type): Use rather more common pattern for disabling of something (#if 0). (get_tree_code_name): Likewise. (verify_type_variant): Likewise. --- gcc/config/i386/i386.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index c302bc2..d3c09bf 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -10817,12 +10817,11 @@ ix86_legitimate_address_p (machine_mode, rtx addr, bool strict) else if (SYMBOLIC_CONST (disp) && (flag_pic - || (TARGET_MACHO #if TARGET_MACHO - && MACHOPIC_INDIRECT - && !machopic_operand_p (disp) + || (MACHOPIC_INDIRECT + && !machopic_operand_p (disp)) #endif - ))) + )) { is_legitimate_pic: -- cgit v1.1 From a44895ce7ffbc26b4d765c40b5b346f8c9a9b762 Mon Sep 17 00:00:00 2001 From: Iain Sandoe Date: Wed, 21 Apr 2021 20:40:48 +0100 Subject: Darwin, X86 : Fix bootstrap break from flags changes. The changes from r12-36-g1751bec027f030515889fcf4baa9c91501aafc85 did not remove the uses of TARGET_ISA_* from i386/darwin.h. Fixed thus. gcc/ChangeLog: * config/i386/darwin.h (TARGET_64BIT): Remove definition based on TARGET_ISA_64BIT. (TARGET_64BIT_P): Remove definition based on TARGET_ISA_64BIT_P(). --- gcc/config/i386/darwin.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/darwin.h b/gcc/config/i386/darwin.h index c81db9b..2657dfe 100644 --- a/gcc/config/i386/darwin.h +++ b/gcc/config/i386/darwin.h @@ -25,11 +25,6 @@ along with GCC; see the file COPYING3. If not see #undef DARWIN_X86 #define DARWIN_X86 1 -#undef TARGET_64BIT -#define TARGET_64BIT TARGET_ISA_64BIT -#undef TARGET_64BIT_P -#define TARGET_64BIT_P(x) TARGET_ISA_64BIT_P(x) - #ifdef IN_LIBGCC2 #undef TARGET_64BIT #ifdef __x86_64__ -- cgit v1.1 From 0cda606d08d6196b76524c7b6ad51d87fed0d54b Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 22 Apr 2021 16:30:38 +0200 Subject: i386: Fix unsigned int -> double conversion on i386 w/ -mfpmath=sse [PR100119] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 2021-04-22 Uroš Bizjak gcc/ PR target/100119 * config/i386/i386-expand.c (ix86_expand_convert_uns_sidf_sse): Remove the sign with FE_DOWNWARD, where x - x = -0.0. gcc/testsuite/ PR target/100119 * gcc.target/i386/pr100119.c: New test. --- gcc/config/i386/i386-expand.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 166c23d..516440e 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -1550,6 +1550,8 @@ ix86_expand_convert_uns_sixf_sse (rtx, rtx) gcc_unreachable (); } +static rtx ix86_expand_sse_fabs (rtx op0, rtx *smask); + /* Convert an unsigned SImode value into a DFmode. Only currently used for SSE, but applicable anywhere. 
*/ @@ -1569,6 +1571,11 @@ ix86_expand_convert_uns_sidf_sse (rtx target, rtx input) x = const_double_from_real_value (TWO31r, DFmode); x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT); + + /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */ + if (HONOR_SIGNED_ZEROS (DFmode) && flag_rounding_math) + x = ix86_expand_sse_fabs (x, NULL); + if (x != target) emit_move_insn (target, x); } -- cgit v1.1 From 0745b6fa66c69cc1e27547601298843c26f0e144 Mon Sep 17 00:00:00 2001 From: David Edelsohn Date: Thu, 22 Apr 2021 17:41:10 -0400 Subject: aix: Remove AIX 6.1 support. AIX 6.1 is past end of life and extended support. This patch removes the configuration option and references to AIX 6.1. contrib/ChangeLog: * config-list.mk: Remove rs6000-ibm-aix6.1. Rename rs6000-ibm-aix7.1 to powerpc-ibm-aix7.1. Add powerpc-ibm-aix7.2. gcc/ChangeLog: * config.gcc (powerpc-ibm-aix6.*): Remove. * config/rs6000/aix61.h: Delete. --- gcc/config/rs6000/aix61.h | 214 ---------------------------------------------- 1 file changed, 214 deletions(-) delete mode 100644 gcc/config/rs6000/aix61.h (limited to 'gcc/config') diff --git a/gcc/config/rs6000/aix61.h b/gcc/config/rs6000/aix61.h deleted file mode 100644 index 78c179b..0000000 --- a/gcc/config/rs6000/aix61.h +++ /dev/null @@ -1,214 +0,0 @@ -/* Definitions of target machine for GNU compiler, - for IBM RS/6000 POWER running AIX V6.1. - Copyright (C) 2002-2021 Free Software Foundation, Inc. - Contributed by David Edelsohn (edelsohn@gnu.org). - - This file is part of GCC. - - GCC is free software; you can redistribute it and/or modify it - under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3, or (at your - option) any later version. - - GCC is distributed in the hope that it will be useful, but WITHOUT - ANY WARRANTY; without even the implied warranty of MERCHANTABILITY - or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public - License for more details. - - You should have received a copy of the GNU General Public License - along with GCC; see the file COPYING3. If not see - . */ - -/* The macro SUBTARGET_OVERRIDE_OPTIONS is provided for subtargets, to - get control in TARGET_OPTION_OVERRIDE. */ - -#define SUBTARGET_OVERRIDE_OPTIONS \ -do { \ - if (TARGET_64BIT && ! TARGET_POWERPC64) \ - { \ - rs6000_isa_flags |= OPTION_MASK_POWERPC64; \ - warning (0, "%<-maix64%> requires PowerPC64 architecture remain enabled"); \ - } \ - if (TARGET_SOFT_FLOAT && TARGET_LONG_DOUBLE_128) \ - { \ - rs6000_long_double_type_size = 64; \ - if (global_options_set.x_rs6000_long_double_type_size) \ - warning (0, "soft-float and long-double-128 are incompatible"); \ - } \ - if (TARGET_POWERPC64 && ! 
TARGET_64BIT) \ - { \ - error ("%<-maix64%> required: 64-bit computation with 32-bit addressing not yet supported"); \ - } \ - if ((rs6000_isa_flags_explicit \ - & OPTION_MASK_MINIMAL_TOC) != 0) \ - { \ - if (global_options_set.x_rs6000_current_cmodel \ - && rs6000_current_cmodel != CMODEL_SMALL) \ - error ("%<-mcmodel%> incompatible with other toc options"); \ - SET_CMODEL (CMODEL_SMALL); \ - } \ - if (rs6000_current_cmodel != CMODEL_SMALL) \ - { \ - TARGET_NO_FP_IN_TOC = 1; \ - TARGET_NO_SUM_IN_TOC = 1; \ - } \ - if (rs6000_current_cmodel == CMODEL_MEDIUM) \ - { \ - rs6000_current_cmodel = CMODEL_LARGE; \ - } \ -} while (0) - -#undef ASM_SPEC -#define ASM_SPEC "-u %{maix64:-a64 %{!mcpu*:-mppc64}} %(asm_cpu)" - -/* Common ASM definitions used by ASM_SPEC amongst the various targets for - handling -mcpu=xxx switches. There is a parallel list in driver-rs6000.c to - provide the default assembler options if the user uses -mcpu=native, so if - you make changes here, make them there also. */ -#undef ASM_CPU_SPEC -#define ASM_CPU_SPEC \ -"%{!mcpu*: %{!maix64: \ - %{mpowerpc64: -mppc64} \ - %{maltivec: -m970} \ - %{!maltivec: %{!mpowerpc64: %(asm_default)}}}} \ -%{mcpu=native: %(asm_cpu_native)} \ -%{mcpu=power3: -m620} \ -%{mcpu=power4: -mpwr4} \ -%{mcpu=power5: -mpwr5} \ -%{mcpu=power5+: -mpwr5x} \ -%{mcpu=power6: -mpwr6} \ -%{mcpu=power6x: -mpwr6} \ -%{mcpu=power7: -mpwr7} \ -%{mcpu=power8: -mpwr8} \ -%{mcpu=power9: -mpwr9} \ -%{mcpu=powerpc: -mppc} \ -%{mcpu=rs64a: -mppc} \ -%{mcpu=603: -m603} \ -%{mcpu=603e: -m603} \ -%{mcpu=604: -m604} \ -%{mcpu=604e: -m604} \ -%{mcpu=620: -m620} \ -%{mcpu=630: -m620} \ -%{mcpu=970: -m970} \ -%{mcpu=G5: -m970} \ -%{mvsx: %{!mcpu*: -mpwr6}} \ --many" - -#undef ASM_DEFAULT_SPEC -#define ASM_DEFAULT_SPEC "-mpwr4" - -#undef TARGET_OS_CPP_BUILTINS -#define TARGET_OS_CPP_BUILTINS() \ - do \ - { \ - builtin_define ("_AIX43"); \ - builtin_define ("_AIX51"); \ - builtin_define ("_AIX52"); \ - builtin_define ("_AIX53"); \ - builtin_define ("_AIX61"); \ - TARGET_OS_AIX_CPP_BUILTINS (); \ - } \ - while (0) - -#undef CPP_SPEC -#define CPP_SPEC "%{posix: -D_POSIX_SOURCE} \ - %{ansi: -D_ANSI_C_SOURCE} \ - %{maix64: -D__64BIT__} \ - %{mpe: -I%R/usr/lpp/ppe.poe/include} \ - %{pthread: -D_THREAD_SAFE}" - -/* The GNU C++ standard library requires that these macros be - defined. Synchronize with libstdc++ os_defines.h. */ -#undef CPLUSPLUS_CPP_SPEC -#define CPLUSPLUS_CPP_SPEC \ - "-D_ALL_SOURCE -D__COMPATMATH__ \ - %{maix64: -D__64BIT__} \ - %{mpe: -I%R/usr/lpp/ppe.poe/include} \ - %{pthread: -D_THREAD_SAFE}" - -#undef TARGET_DEFAULT -#define TARGET_DEFAULT (MASK_PPC_GPOPT | MASK_PPC_GFXOPT | MASK_MFCRF) - -#undef PROCESSOR_DEFAULT -#define PROCESSOR_DEFAULT PROCESSOR_POWER7 -#undef PROCESSOR_DEFAULT64 -#define PROCESSOR_DEFAULT64 PROCESSOR_POWER7 - -/* AIX 6.1 kernel and assembler have necessary support for Altivec and VSX. */ -#undef OS_MISSING_ALTIVEC - -/* Define this macro as a C expression for the initializer of an - array of string to tell the driver program which options are - defaults for this target and thus do not need to be handled - specially when using `MULTILIB_OPTIONS'. - - Do not define this macro if `MULTILIB_OPTIONS' is not defined in - the target makefile fragment or if none of the options listed in - `MULTILIB_OPTIONS' are set by default. *Note Target Fragment::. 
*/ - -#undef MULTILIB_DEFAULTS - -#undef LIB_SPEC -#define LIB_SPEC "%{pg:-L%R/lib/profiled -L%R/usr/lib/profiled}\ - %{p:-L%R/lib/profiled -L%R/usr/lib/profiled}\ - %{!maix64:%{!shared:%{g*:-lg}}}\ - %{fprofile-arcs|fprofile-generate*|coverage:-lpthreads}\ - %{mpe:-L%R/usr/lpp/ppe.poe/lib -lmpi -lvtd}\ - %{mlong-double-128:-lc128}\ - %{pthread:-lpthreads} -lc" - -#undef LINK_SPEC -#define LINK_SPEC "-bpT:0x10000000 -bpD:0x20000000 %{!r:-btextro}\ - %{static:-bnso %(link_syscalls) } %{shared:-bM:SRE %{!e:-bnoentry}}\ - %{!maix64:%{!shared:%{g*: %(link_libg) }}} %{maix64:-b64}\ - %{mpe:-binitfini:poe_remote_main}" - -#undef STARTFILE_SPEC -#define STARTFILE_SPEC "%{!shared:\ - %{maix64:%{pg:gcrt0_64%O%s}%{!pg:%{p:mcrt0_64%O%s}%{!p:crt0_64%O%s}}}\ - %{!maix64:\ - %{pthread:%{pg:gcrt0_r%O%s}%{!pg:%{p:mcrt0_r%O%s}%{!p:crt0_r%O%s}}}\ - %{!pthread:%{pg:gcrt0%O%s}%{!pg:%{p:mcrt0%O%s}%{!p:crt0%O%s}}}}}\ - %{shared:crtcxa_s%O%s;:crtcxa%O%s} crtdbase%O%s" - -/* AIX V5 typedefs ptrdiff_t as "long" while earlier releases used "int". */ - -#undef PTRDIFF_TYPE -#define PTRDIFF_TYPE "long int" - -/* Type used for wchar_t, as a string used in a declaration. */ -#undef WCHAR_TYPE -#define WCHAR_TYPE (!TARGET_64BIT ? "short unsigned int" : "unsigned int") - -/* Width of wchar_t in bits. */ -#undef WCHAR_TYPE_SIZE -#define WCHAR_TYPE_SIZE (!TARGET_64BIT ? 16 : 32) - -/* AIX 4.2 and above provides initialization and finalization function - support from linker command line. */ -#undef HAS_INIT_SECTION -#define HAS_INIT_SECTION - -#undef LD_INIT_SWITCH -#define LD_INIT_SWITCH "-binitfini" - -#ifndef _AIX52 -extern long long int atoll(const char *); -#endif - -/* This target uses the aix64.opt file. */ -#define TARGET_USES_AIX64_OPT 1 - -/* Large TOC Support */ -#ifdef HAVE_LD_LARGE_TOC -#undef TARGET_CMODEL -#define TARGET_CMODEL rs6000_current_cmodel -#define SET_CMODEL(opt) rs6000_current_cmodel = opt -#else -#define SET_CMODEL(opt) do {} while (0) -#endif - -/* This target defines SUPPORTS_WEAK and TARGET_ASM_NAMED_SECTION, - but does not have crtbegin/end. */ - -#define TARGET_AIX_VERSION 61 -- cgit v1.1 From 6a2f49c6999fa2c7a3a8cb05005d21f8020d674d Mon Sep 17 00:00:00 2001 From: David Edelsohn Date: Tue, 14 Jan 2020 13:12:59 -0500 Subject: aix: Switch AIX configurtion to DWARF2 debugging This patch is in preparation for removing stabs debugging support from GCC. The rs6000 configuration files remain somewhat intertwined with the stabs debugging support, but the configuration no longer generates stabs debugging information. This patch means that earlier releases (Technology Levels) of AIX 7.1 and 7.2, prior to DWARF support and fixes, cannot build GCC or support GCC. gcc/ChangeLog: * config/rs6000/aix71.h (PREFERRED_DEBUGGING_TYPE): Change to DWARF2_DEBUG. * config/rs6000/aix72.h (PREFERRED_DEBUGGING_TYPE): Same. --- gcc/config/rs6000/aix71.h | 4 ++-- gcc/config/rs6000/aix72.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/aix71.h b/gcc/config/rs6000/aix71.h index 3612ed2..807e260 100644 --- a/gcc/config/rs6000/aix71.h +++ b/gcc/config/rs6000/aix71.h @@ -272,9 +272,9 @@ extern long long int atoll(const char *); #define TARGET_AIX_VERSION 71 -/* AIX 7.1 supports DWARF3 debugging, but XCOFF remains the default. */ +/* AIX 7.1 supports DWARF3+ debugging. 
*/ #define DWARF2_DEBUGGING_INFO 1 -#define PREFERRED_DEBUGGING_TYPE XCOFF_DEBUG +#define PREFERRED_DEBUGGING_TYPE DWARF2_DEBUG #define DEBUG_INFO_SECTION "0x10000" #define DEBUG_LINE_SECTION "0x20000" #define DEBUG_PUBNAMES_SECTION "0x30000" diff --git a/gcc/config/rs6000/aix72.h b/gcc/config/rs6000/aix72.h index d349092..36c5d99 100644 --- a/gcc/config/rs6000/aix72.h +++ b/gcc/config/rs6000/aix72.h @@ -273,9 +273,9 @@ extern long long int atoll(const char *); #define TARGET_AIX_VERSION 72 -/* AIX 7.2 supports DWARF3 debugging, but XCOFF remains the default. */ +/* AIX 7.2 supports DWARF3+ debugging. */ #define DWARF2_DEBUGGING_INFO 1 -#define PREFERRED_DEBUGGING_TYPE XCOFF_DEBUG +#define PREFERRED_DEBUGGING_TYPE DWARF2_DEBUG #define DEBUG_INFO_SECTION "0x10000" #define DEBUG_LINE_SECTION "0x20000" #define DEBUG_PUBNAMES_SECTION "0x30000" -- cgit v1.1 From 342de04d993beaa644d0b0087c20bef5dad5bf5f Mon Sep 17 00:00:00 2001 From: liuhongt Date: Fri, 16 Apr 2021 11:29:10 +0800 Subject: MASK_AVX256_SPLIT_UNALIGNED_STORE/LOAD should be cleared in opts->x_target_flags when X86_TUNE_AVX256_UNALIGNED_LOAD/STORE_OPTIMAL is enabled by target attribute. gcc/ChangeLog: PR target/100093 * config/i386/i386-options.c (ix86_option_override_internal): Clear MASK_AVX256_SPLIT_UNALIGNED_LOAD/STORE in x_target_flags when X86_TUNE_AVX256_UNALIGNED_LOAD/STORE_OPTIMAL is enabled by target attribute. gcc/testsuite/ChangeLog: PR target/100093 * gcc.target/i386/pr100093.c: New test. --- gcc/config/i386/i386-options.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c index 2a12228..2f3d40c 100644 --- a/gcc/config/i386/i386-options.c +++ b/gcc/config/i386/i386-options.c @@ -2618,9 +2618,16 @@ ix86_option_override_internal (bool main_args_p, if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL] && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD)) opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD; + else if (!main_args_p + && ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]) + opts->x_target_flags &= ~MASK_AVX256_SPLIT_UNALIGNED_LOAD; + if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL] && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE)) opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE; + else if (!main_args_p + && ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]) + opts->x_target_flags &= ~MASK_AVX256_SPLIT_UNALIGNED_STORE; /* Enable 128-bit AVX instruction generation for the auto-vectorizer. */ -- cgit v1.1 From d2324a5ab3ff097864ae6828cb1db4dd013c70d1 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Fri, 23 Apr 2021 17:29:29 +0200 Subject: i386: Fix atomic FP peepholes [PR100182] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 64bit loads to/stores from x87 and SSE registers are atomic also on 32-bit targets, so there is no need for additional atomic moves to a temporary register. Introduced load peephole2 patterns assume that there won't be any additional loads from the load location outside the peepholed sequence and wrongly removed the source location initialization. OTOH, introduced store peephole2 patterns assume there won't be any additional loads from the stored location outside the peepholed sequence and wrongly removed the destination location initialization. 
Note that we can't use plain x87 FST instruction to initialize destination location because FST converts the value to the double-precision format, changing bits during move. The patch restores removed initializations in load and store patterns. Additionally, plain x87 FST in store peephole2 patterns is prevented by limiting the store operand source to SSE registers. 2021-04-23 Uroš Bizjak gcc/ PR target/100182 * config/i386/sync.md (FILD_ATOMIC/FIST_ATOMIC FP load peephole2): Copy operand 3 to operand 4. Use sse_reg_operand as operand 3 predicate. (FILD_ATOMIC/FIST_ATOMIC FP load peephole2 with mem blockage): Ditto. (LDX_ATOMIC/STX_ATOMIC FP load peephole2): Ditto. (LDX_ATOMIC/LDX_ATOMIC FP load peephole2 with mem blockage): Ditto. (FILD_ATOMIC/FIST_ATOMIC FP store peephole2): Copy operand 1 to operand 0. (FILD_ATOMIC/FIST_ATOMIC FP store peephole2 with mem blockage): Ditto. (LDX_ATOMIC/STX_ATOMIC FP store peephole2): Ditto. (LDX_ATOMIC/LDX_ATOMIC FP store peephole2 with mem blockage): Ditto. gcc/testsuite/ PR target/100182 * gcc.target/i386/pr100182.c: New test. * gcc.target/i386/pr71245-1.c (dg-final): Xfail scan-assembler-not. * gcc.target/i386/pr71245-2.c (dg-final): Ditto. --- gcc/config/i386/sync.md | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/sync.md b/gcc/config/i386/sync.md index c7c508c..7913b91 100644 --- a/gcc/config/i386/sync.md +++ b/gcc/config/i386/sync.md @@ -226,12 +226,13 @@ (set (match_operand:DI 2 "memory_operand") (unspec:DI [(match_dup 0)] UNSPEC_FIST_ATOMIC)) - (set (match_operand:DF 3 "any_fp_register_operand") + (set (match_operand:DF 3 "sse_reg_operand") (match_operand:DF 4 "memory_operand"))] "!TARGET_64BIT && peep2_reg_dead_p (2, operands[0]) && rtx_equal_p (XEXP (operands[4], 0), XEXP (operands[2], 0))" - [(set (match_dup 3) (match_dup 5))] + [(set (match_dup 3) (match_dup 5)) + (set (match_dup 4) (match_dup 3))] "operands[5] = gen_lowpart (DFmode, operands[1]);") (define_peephole2 @@ -243,7 +244,7 @@ UNSPEC_FIST_ATOMIC)) (set (mem:BLK (scratch:SI)) (unspec:BLK [(mem:BLK (scratch:SI))] UNSPEC_MEMORY_BLOCKAGE)) - (set (match_operand:DF 3 "any_fp_register_operand") + (set (match_operand:DF 3 "sse_reg_operand") (match_operand:DF 4 "memory_operand"))] "!TARGET_64BIT && peep2_reg_dead_p (2, operands[0]) @@ -251,6 +252,7 @@ [(const_int 0)] { emit_move_insn (operands[3], gen_lowpart (DFmode, operands[1])); + emit_move_insn (operands[4], operands[3]); emit_insn (gen_memory_blockage ()); DONE; }) @@ -262,12 +264,13 @@ (set (match_operand:DI 2 "memory_operand") (unspec:DI [(match_dup 0)] UNSPEC_STX_ATOMIC)) - (set (match_operand:DF 3 "any_fp_register_operand") + (set (match_operand:DF 3 "sse_reg_operand") (match_operand:DF 4 "memory_operand"))] "!TARGET_64BIT && peep2_reg_dead_p (2, operands[0]) && rtx_equal_p (XEXP (operands[4], 0), XEXP (operands[2], 0))" - [(set (match_dup 3) (match_dup 5))] + [(set (match_dup 3) (match_dup 5)) + (set (match_dup 4) (match_dup 3))] "operands[5] = gen_lowpart (DFmode, operands[1]);") (define_peephole2 @@ -279,7 +282,7 @@ UNSPEC_STX_ATOMIC)) (set (mem:BLK (scratch:SI)) (unspec:BLK [(mem:BLK (scratch:SI))] UNSPEC_MEMORY_BLOCKAGE)) - (set (match_operand:DF 3 "any_fp_register_operand") + (set (match_operand:DF 3 "sse_reg_operand") (match_operand:DF 4 "memory_operand"))] "!TARGET_64BIT && peep2_reg_dead_p (2, operands[0]) @@ -287,6 +290,7 @@ [(const_int 0)] { emit_move_insn (operands[3], gen_lowpart (DFmode, operands[1])); + emit_move_insn (operands[4], 
operands[3]); emit_insn (gen_memory_blockage ()); DONE; }) @@ -392,7 +396,8 @@ "!TARGET_64BIT && peep2_reg_dead_p (3, operands[2]) && rtx_equal_p (XEXP (operands[0], 0), XEXP (operands[3], 0))" - [(set (match_dup 5) (match_dup 1))] + [(set (match_dup 0) (match_dup 1)) + (set (match_dup 5) (match_dup 1))] "operands[5] = gen_lowpart (DFmode, operands[4]);") (define_peephole2 @@ -411,6 +416,7 @@ && rtx_equal_p (XEXP (operands[0], 0), XEXP (operands[3], 0))" [(const_int 0)] { + emit_move_insn (operands[0], operands[1]); emit_insn (gen_memory_blockage ()); emit_move_insn (gen_lowpart (DFmode, operands[4]), operands[1]); DONE; @@ -428,7 +434,8 @@ "!TARGET_64BIT && peep2_reg_dead_p (3, operands[2]) && rtx_equal_p (XEXP (operands[0], 0), XEXP (operands[3], 0))" - [(set (match_dup 5) (match_dup 1))] + [(set (match_dup 0) (match_dup 1)) + (set (match_dup 5) (match_dup 1))] "operands[5] = gen_lowpart (DFmode, operands[4]);") (define_peephole2 @@ -447,6 +454,7 @@ && rtx_equal_p (XEXP (operands[0], 0), XEXP (operands[3], 0))" [(const_int 0)] { + emit_move_insn (operands[0], operands[1]); emit_insn (gen_memory_blockage ()); emit_move_insn (gen_lowpart (DFmode, operands[4]), operands[1]); DONE; -- cgit v1.1 From 716bb02b40ecef5564abb5ba45a594323123a104 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Fri, 23 Apr 2021 18:45:14 +0200 Subject: i386: Reject -m96bit-long-double for 64bit targets [PR100041] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 64bit targets default to 128bit long double, so -m96bit-long-double should not be used. Together with -m128bit-long-double, this option was intended to be an optimization for 32bit targets only. Error out when -m96bit-long-double is used with 64bit targets. 2021-04-23 Uroš Bizjak gcc/ PR target/100041 * config/i386/i386-options.c (ix86_option_override_internal): Error out when -m96bit-long-double is used with 64bit targets. * config/i386/i386.md (*pushxf_rounded): Remove pattern. gcc/testsuite/ PR target/100041 * gcc.target/i386/pr79514.c (dg-error): Expect error for 64bit targets. --- gcc/config/i386/i386-options.c | 3 +++ gcc/config/i386/i386.md | 30 ------------------------------ 2 files changed, 3 insertions(+), 30 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c index 2f3d40c..cc7b617 100644 --- a/gcc/config/i386/i386-options.c +++ b/gcc/config/i386/i386-options.c @@ -2312,6 +2312,9 @@ ix86_option_override_internal (bool main_args_p, opts->x_ix86_isa_flags |= TARGET_SUBTARGET64_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit; + if (!TARGET_128BIT_LONG_DOUBLE_P (opts->x_target_flags)) + error ("%<-m96bit-long-double%> is not compatible with this target"); + if (TARGET_RTD_P (opts->x_target_flags)) warning (0, main_args_p diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 7eb7176..27f100c 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -3044,36 +3044,6 @@ operands[0] = replace_equiv_address (operands[0], stack_pointer_rtx); }) -(define_insn_and_split "*pushxf_rounded" - [(set (mem:XF - (pre_modify:P - (reg:P SP_REG) - (plus:P (reg:P SP_REG) (const_int -16)))) - (match_operand:XF 0 "nonmemory_no_elim_operand" "f,r,*r,C"))] - "TARGET_64BIT" - "#" - "&& 1" - [(set (reg:P SP_REG) (plus:P (reg:P SP_REG) (const_int -16))) - (set (match_dup 1) (match_dup 0))] -{ - rtx pat = PATTERN (curr_insn); - operands[1] = SET_DEST (pat); - - /* Preserve memory attributes. 
*/ - operands[1] = replace_equiv_address (operands[1], stack_pointer_rtx); -} - [(set_attr "type" "multi") - (set_attr "unit" "i387,*,*,*") - (set (attr "mode") - (cond [(eq_attr "alternative" "1,2,3") - (const_string "DI") - ] - (const_string "XF"))) - (set (attr "preferred_for_size") - (cond [(eq_attr "alternative" "1") - (symbol_ref "false")] - (symbol_ref "true")))]) - (define_insn "*pushxf" [(set (match_operand:XF 0 "push_operand" "=<,<,<,<,<") (match_operand:XF 1 "general_no_elim_operand" "f,r,*r,oF,oC"))] -- cgit v1.1 From 0a662e103e911af935aa5c601051c135986ce3de Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Thu, 22 Apr 2021 05:05:56 -0500 Subject: bpf: align function entry point to 64 bits Libbpf does not treat paddings after functions well. If function symbols do not cover a whole text section, it will emit an error similar to: libbpf: sec '.text': failed to find program symbol at offset 56 Each instruction in BPF is a multiple of 8 bytes, so align the functions to 8 bytes, similar to how clang does it. 2021-04-22 YiFei Zhu gcc/ * config/bpf/bpf.h (FUNCTION_BOUNDARY): Set to 64. --- gcc/config/bpf/bpf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/bpf/bpf.h b/gcc/config/bpf/bpf.h index 9e2f526..6a3907f 100644 --- a/gcc/config/bpf/bpf.h +++ b/gcc/config/bpf/bpf.h @@ -57,8 +57,8 @@ 64-bit at any time. */ #define STACK_BOUNDARY 64 -/* Function entry points are aligned to 128 bits. */ -#define FUNCTION_BOUNDARY 128 +/* Function entry points are aligned to 64 bits. */ +#define FUNCTION_BOUNDARY 64 /* Maximum alignment required by data of any type. */ #define BIGGEST_ALIGNMENT 64 -- cgit v1.1 From 886b6c1e8af502b69e3f318b9830b73b88215878 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Thu, 22 Apr 2021 05:05:57 -0500 Subject: bpf: allow BSS symbols to be global symbols Prior to this, a BSS declaration such as: int foo; static int bar; Generates: .global foo .local foo .comm foo,4,4 .local bar .comm bar,4,4 Creating symbols: 0000000000000000 b foo 0000000000000004 b bar Both symbols are local. However, libbpf bpf_object__variable_offset requires symbols to be STB_GLOBAL & STT_OBJECT for data section lookup. This patch makes the same declaration generate: .global foo .type foo, @object .lcomm foo,4,4 .local bar .comm bar,4,4 Creating symbols: 0000000000000000 B foo 0000000000000004 b bar And libbpf will be okay with looking up the global symbol "foo". 2021-04-22 YiFei Zhu gcc/ * config/bpf/bpf.h (ASM_OUTPUT_ALIGNED_BSS): Use .type and .lcomm. --- gcc/config/bpf/bpf.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/bpf/bpf.h b/gcc/config/bpf/bpf.h index 6a3907f..4c5b19e 100644 --- a/gcc/config/bpf/bpf.h +++ b/gcc/config/bpf/bpf.h @@ -422,9 +422,15 @@ enum reg_class Try to use asm_output_aligned_bss to implement this macro. */ #define ASM_OUTPUT_ALIGNED_BSS(FILE, DECL, NAME, SIZE, ALIGN) \ - do { \ - ASM_OUTPUT_ALIGNED_LOCAL (FILE, NAME, SIZE, ALIGN); \ - } while (0) + do \ + { \ + ASM_OUTPUT_TYPE_DIRECTIVE (FILE, NAME, "object"); \ + fprintf ((FILE), "%s", "\t.lcomm\t"); \ + assemble_name ((FILE), (NAME)); \ + fprintf ((FILE), "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n", \ + (SIZE), (ALIGN) / BITS_PER_UNIT); \ + } \ + while (0) /*** Output and Generation of Labels.
*/ -- cgit v1.1 From b6600392bf71c4a9785f8f49948b611425896830 Mon Sep 17 00:00:00 2001 From: Iain Sandoe Date: Sat, 24 Apr 2021 13:34:49 +0100 Subject: Darwin : Adjust darwin_binds_local_p for PIC code [PR100152]. Darwin's dynamic linker supports interposition and lazy symbol binding. If we are generating PIC code and a symbol is public, then it could potentially be indirected via a lazy-resolver stub; we cannot tell at compile-time if this will be done (since the indirection can be the result of adding a -flat-namespace option at link-time). Here we are conservative and assume that any such symbol cannot bind locally. The default implementation for binds_local_p handles undefined, weak and common symbols which are always indirected (for mdynamic-no-pic also). gcc/ChangeLog: PR target/100152 * config/darwin.c (darwin_binds_local_p): Assume that any public symbol might be interposed for PIC code. Update function header comment to reflect current Darwin capability. --- gcc/config/darwin.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/darwin.c b/gcc/config/darwin.c index 5d17391..36b460a 100644 --- a/gcc/config/darwin.c +++ b/gcc/config/darwin.c @@ -3134,16 +3134,25 @@ darwin_file_end (void) /* TODO: Add a language hook for identifying if a decl is a vtable. */ #define DARWIN_VTABLE_P(DECL) 0 -/* Cross-module name binding. Darwin does not support overriding - functions at dynamic-link time, except for vtables in kexts. */ +/* Cross-module name binding. + Darwin's dynamic linker supports interposition and lazy symbol binding. + If we are generating PIC code and a symbol is public, then it could + potentially be indirected via a lazy-resolver stub; we cannot tell at + compile-time if this will be done (since the indirection can be the + result of adding a -flat-namespace option at link-time). Here we are + conservative and assume that any such symbol cannot bind locally. + The default implementation for binds_local_p handles undefined, weak and + common symbols which are always indirected. */ bool darwin_binds_local_p (const_tree decl) { /* We use the "shlib" input to indicate that a symbol should be - considered overridable; only relevant for vtables in kernel modules - on earlier system versions, and with a TODO to complete. */ + considered overridable. Older versions of the kernel also support + interposition for extensions (although this code is a place-holder + until there is an implementation for DARWIN_VTABLE_P). */ bool force_overridable = TARGET_KEXTABI && DARWIN_VTABLE_P (decl); + force_overridable |= MACHOPIC_PURE; return default_binds_local_p_3 (decl, force_overridable /* shlib */, false /* weak dominate */, false /* extern_protected_data */, -- cgit v1.1 From a1765b421eb3d01ecc88fb0cdec9f06bfdaab8e2 Mon Sep 17 00:00:00 2001 From: Iain Sandoe Date: Sat, 24 Apr 2021 19:28:25 +0100 Subject: Revert "Darwin : Adjust darwin_binds_local_p for PIC code [PR100152]." Unfortunately, although this is required to fix the PR, and is notionally correct, it regresses some of the sanitizer and IPA tests. Reverting until this can be analysed. This reverts commit b6600392bf71c4a9785f8f49948b611425896830. 
--- gcc/config/darwin.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/darwin.c b/gcc/config/darwin.c index 36b460a..5d17391 100644 --- a/gcc/config/darwin.c +++ b/gcc/config/darwin.c @@ -3134,25 +3134,16 @@ darwin_file_end (void) /* TODO: Add a language hook for identifying if a decl is a vtable. */ #define DARWIN_VTABLE_P(DECL) 0 -/* Cross-module name binding. - Darwin's dynamic linker supports interposition and lazy symbol binding. - If we are generating PIC code and a symbol is public, then it could - potentially be indirected via a lazy-resolver stub; we cannot tell at - compile-time if this will be done (since the indirection can be the - result of adding a -flat-namespace option at link-time). Here we are - conservative and assume that any such symbol cannot bind locally. - The default implementation for binds_local_p handles undefined, weak and - common symbols which are always indirected. */ +/* Cross-module name binding. Darwin does not support overriding + functions at dynamic-link time, except for vtables in kexts. */ bool darwin_binds_local_p (const_tree decl) { /* We use the "shlib" input to indicate that a symbol should be - considered overridable. Older versions of the kernel also support - interposition for extensions (although this code is a place-holder - until there is an implementation for DARWIN_VTABLE_P). */ + considered overridable; only relevant for vtables in kernel modules + on earlier system versions, and with a TODO to complete. */ bool force_overridable = TARGET_KEXTABI && DARWIN_VTABLE_P (decl); - force_overridable |= MACHOPIC_PURE; return default_binds_local_p_3 (decl, force_overridable /* shlib */, false /* weak dominate */, false /* extern_protected_data */, -- cgit v1.1 From c54a9f7259fce1a221270f21d8076c960c7d0163 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Tue, 23 Feb 2021 11:17:40 +0800 Subject: Add folding and remove expanders for x86 *pcmp{et,gt}* builtins [PR target/98911] gcc/ChangeLog: PR target/98911 * config/i386/i386-builtin.def (BDESC): Change the icode of the following builtins to CODE_FOR_nothing. * config/i386/i386.c (ix86_gimple_fold_builtin): Fold IX86_BUILTIN_PCMPEQB128, IX86_BUILTIN_PCMPEQW128, IX86_BUILTIN_PCMPEQD128, IX86_BUILTIN_PCMPEQQ, IX86_BUILTIN_PCMPEQB256, IX86_BUILTIN_PCMPEQW256, IX86_BUILTIN_PCMPEQD256, IX86_BUILTIN_PCMPEQQ256, IX86_BUILTIN_PCMPGTB128, IX86_BUILTIN_PCMPGTW128, IX86_BUILTIN_PCMPGTD128, IX86_BUILTIN_PCMPGTQ, IX86_BUILTIN_PCMPGTB256, IX86_BUILTIN_PCMPGTW256, IX86_BUILTIN_PCMPGTD256, IX86_BUILTIN_PCMPGTQ256. * config/i386/sse.md (avx2_eq3): Deleted. (sse2_eq3): Ditto. (sse4_1_eqv2di3): Ditto. (sse2_gt3): Rename to .. (*sse2_gt3): .. this. gcc/testsuite/ChangeLog: PR target/98911 * gcc.target/i386/pr98911.c: New test. * gcc.target/i386/funcspec-8.c: Replace __builtin_ia32_pcmpgtq with __builtin_ia32_pcmpistrm128 since it has been folded. 
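A minimal sketch (not the new gcc.target/i386/pr98911.c test itself) of a direct use of one of the affected builtins; after this change such a call is folded into a generic vector comparison (an EQ_EXPR or GT_EXPR feeding a VEC_COND_EXPR) at gimple time instead of expanding through the removed named patterns:

  /* Assumes an x86 target with -msse2; v16qi mirrors the __v16qi type used
     by the intrinsic headers.  */
  typedef char v16qi __attribute__ ((vector_size (16)));

  v16qi
  cmpeq (v16qi a, v16qi b)
  {
    /* Now handled by ix86_gimple_fold_builtin rather than a named insn.  */
    return __builtin_ia32_pcmpeqb128 (a, b);
  }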
--- gcc/config/i386/i386-builtin.def | 32 ++++++++++++++--------------- gcc/config/i386/i386.c | 44 ++++++++++++++++++++++++++++++++++++++++ gcc/config/i386/sse.md | 26 +----------------------- 3 files changed, 61 insertions(+), 41 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def index e3ed4e1..4dbd4f2 100644 --- a/gcc/config/i386/i386-builtin.def +++ b/gcc/config/i386/i386-builtin.def @@ -773,12 +773,12 @@ BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX8 BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI) -BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) -BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI) -BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI ) -BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) -BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI) -BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI ) +BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_nothing, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) +BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_nothing, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI) +BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_nothing, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI ) +BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_nothing, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) +BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_nothing, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI) +BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_nothing, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI ) BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI) @@ -919,7 +919,7 @@ BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__built BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI) BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI) -BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI) +BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_nothing, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, 
UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI) BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI) BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI) @@ -962,7 +962,7 @@ BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_ptestv2di, "__builtin_ia32_pte BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_ptestv2di, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST) /* SSE4.2 */ -BDESC (OPTION_MASK_ISA_SSE4_2, 0, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI) +BDESC (OPTION_MASK_ISA_SSE4_2, 0, CODE_FOR_nothing, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI) BDESC (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, 0, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR) BDESC (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, 0, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT) BDESC (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, 0, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT) @@ -1149,14 +1149,14 @@ BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb2 BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI) BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI) BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT) -BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) -BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI) -BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI ) -BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI ) -BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) -BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI) -BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI ) -BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI ) +BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_nothing, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) +BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_nothing, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) 
V16HI_FTYPE_V16HI_V16HI) +BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_nothing, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI ) +BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_nothing, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI ) +BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_nothing, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) +BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_nothing, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI) +BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_nothing, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI ) +BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_nothing, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI ) BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI) BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI) BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI) diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index d3c09bf..adcef1e 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -67,6 +67,7 @@ along with GCC; see the file COPYING3. If not see #include "pass_manager.h" #include "target-globals.h" #include "gimple-iterator.h" +#include "gimple-fold.h" #include "tree-vectorizer.h" #include "shrink-wrap.h" #include "builtins.h" @@ -17865,6 +17866,7 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi) tree decl = NULL_TREE; tree arg0, arg1, arg2; enum rtx_code rcode; + enum tree_code tcode; unsigned HOST_WIDE_INT count; bool is_vshift; @@ -17946,6 +17948,48 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi) } break; + case IX86_BUILTIN_PCMPEQB128: + case IX86_BUILTIN_PCMPEQW128: + case IX86_BUILTIN_PCMPEQD128: + case IX86_BUILTIN_PCMPEQQ: + case IX86_BUILTIN_PCMPEQB256: + case IX86_BUILTIN_PCMPEQW256: + case IX86_BUILTIN_PCMPEQD256: + case IX86_BUILTIN_PCMPEQQ256: + tcode = EQ_EXPR; + goto do_cmp; + + case IX86_BUILTIN_PCMPGTB128: + case IX86_BUILTIN_PCMPGTW128: + case IX86_BUILTIN_PCMPGTD128: + case IX86_BUILTIN_PCMPGTQ: + case IX86_BUILTIN_PCMPGTB256: + case IX86_BUILTIN_PCMPGTW256: + case IX86_BUILTIN_PCMPGTD256: + case IX86_BUILTIN_PCMPGTQ256: + tcode = GT_EXPR; + + do_cmp: + gcc_assert (n_args == 2); + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + { + location_t loc = gimple_location (stmt); + tree type = TREE_TYPE (arg0); + tree zero_vec = build_zero_cst (type); + tree minus_one_vec = build_minus_one_cst (type); + tree cmp_type = truth_type_for (type); + gimple_seq stmts = NULL; + tree cmp = gimple_build (&stmts, tcode, cmp_type, arg0, arg1); + gsi_insert_before (gsi, stmts, GSI_SAME_STMT); + gimple *g = gimple_build_assign (gimple_call_lhs (stmt), + VEC_COND_EXPR, cmp, + minus_one_vec, zero_vec); + gimple_set_location (g, loc); + gsi_replace (gsi, g, false); + } + return true; + case IX86_BUILTIN_PSLLD: case IX86_BUILTIN_PSLLD128: case IX86_BUILTIN_PSLLD128_MASK: diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 319099d..897cf3e 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -12875,14 +12875,6 @@ ;; 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(define_expand "avx2_eq3" - [(set (match_operand:VI_256 0 "register_operand") - (eq:VI_256 - (match_operand:VI_256 1 "nonimmediate_operand") - (match_operand:VI_256 2 "nonimmediate_operand")))] - "TARGET_AVX2" - "ix86_fixup_binary_operands_no_copy (EQ, mode, operands);") - (define_insn "*avx2_eq3" [(set (match_operand:VI_256 0 "register_operand" "=x") (eq:VI_256 @@ -13058,22 +13050,6 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "TI")]) -(define_expand "sse2_eq3" - [(set (match_operand:VI124_128 0 "register_operand") - (eq:VI124_128 - (match_operand:VI124_128 1 "vector_operand") - (match_operand:VI124_128 2 "vector_operand")))] - "TARGET_SSE2 && !TARGET_XOP " - "ix86_fixup_binary_operands_no_copy (EQ, mode, operands);") - -(define_expand "sse4_1_eqv2di3" - [(set (match_operand:V2DI 0 "register_operand") - (eq:V2DI - (match_operand:V2DI 1 "vector_operand") - (match_operand:V2DI 2 "vector_operand")))] - "TARGET_SSE4_1" - "ix86_fixup_binary_operands_no_copy (EQ, V2DImode, operands);") - (define_insn "sse4_2_gtv2di3" [(set (match_operand:V2DI 0 "register_operand" "=Yr,*x,x") (gt:V2DI @@ -13144,7 +13120,7 @@ (set_attr "prefix" "evex") (set_attr "mode" "")]) -(define_insn "sse2_gt3" +(define_insn "*sse2_gt3" [(set (match_operand:VI124_128 0 "register_operand" "=x,x") (gt:VI124_128 (match_operand:VI124_128 1 "register_operand" "0,x") -- cgit v1.1 From 52a5515ed6619739eb122f05ce26057dd8b06fb6 Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Wed, 14 Apr 2021 13:40:58 +0200 Subject: Simplify {gimplify_and_,}update_call_from_tree API This removes update_call_from_tree in favor of gimplify_and_update_call_from_tree, removing some code duplication and simplifying the API use. Some users of update_call_from_tree have been transitioned to replace_call_with_value and the API and its dependences have been moved to gimple-fold.h. This shaves off another user of valid_gimple_rhs_p which is now only used from within gimple-fold.c and thus moved and made private. 2021-04-14 Richard Biener * tree-ssa-propagate.h (valid_gimple_rhs_p): Remove. (update_gimple_call): Likewise. (update_call_from_tree): Likewise. * tree-ssa-propagate.c (valid_gimple_rhs_p): Remove. (valid_gimple_call_p): Likewise. (move_ssa_defining_stmt_for_defs): Likewise. (finish_update_gimple_call): Likewise. (update_gimple_call): Likewise. (update_call_from_tree): Likewise. (propagate_tree_value_into_stmt): Use replace_call_with_value. * gimple-fold.h (update_gimple_call): Declare. * gimple-fold.c (valid_gimple_rhs_p): Move here from tree-ssa-propagate.c. (update_gimple_call): Likewise. (valid_gimple_call_p): Likewise. (finish_update_gimple_call): Likewise, and simplify. (gimplify_and_update_call_from_tree): Implement update_call_from_tree functionality, avoid excessive push/pop_gimplify_context. (gimple_fold_builtin): Use only gimplify_and_update_call_from_tree. (gimple_fold_call): Likewise. * gimple-ssa-sprintf.c (try_substitute_return_value): Likewise. * tree-ssa-ccp.c (ccp_folder::fold_stmt): Likewise. (pass_fold_builtins::execute): Likewise. (optimize_stack_restore): Use replace_call_with_value. * tree-cfg.c (fold_loop_internal_call): Likewise. * tree-ssa-dce.c (maybe_optimize_arith_overflow): Use only gimplify_and_update_call_from_tree. * tree-ssa-strlen.c (handle_builtin_strlen): Likewise. (handle_builtin_strchr): Likewise. * tsan.c: Include gimple-fold.h instead of tree-ssa-propagate.h. 
* config/rs6000/rs6000-call.c (rs6000_gimple_fold_builtin): Use replace_call_with_value. --- gcc/config/rs6000/rs6000-call.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c index f567625..6f6dc47 100644 --- a/gcc/config/rs6000/rs6000-call.c +++ b/gcc/config/rs6000/rs6000-call.c @@ -12369,7 +12369,7 @@ rs6000_gimple_fold_builtin (gimple_stmt_iterator *gsi) /* Convert result back to the lhs type. */ res = gimple_build (&stmts, VIEW_CONVERT_EXPR, TREE_TYPE (lhs), res); gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); - update_call_from_tree (gsi, res); + replace_call_with_value (gsi, res); return true; } /* Vector loads. */ -- cgit v1.1 From 5320d4e4af76120a402d0c5adef0d1bc92d66e40 Mon Sep 17 00:00:00 2001 From: Kyrylo Tkachov Date: Mon, 26 Apr 2021 12:01:33 +0100 Subject: aarch64: Handle V4BF V8BF modes in vwcore attribute While playing with other unrelated changes I hit an assemble-failure bug where a pattern (one of the get_lane ones) that was using V4BF, V8BF as part of a mode iterator and outputting registers with the vwcore attribute, but there is no vwcore mapping for V4BF and V8BF. This patch fixes that in the obvious way by adding the missing mappings Bootstrapped and tested on aarch64-none-linux-gnu. gcc/ChangeLog: * config/aarch64/iterators.md (vwcore): Handle V4BF, V8BF. --- gcc/config/aarch64/iterators.md | 1 + 1 file changed, 1 insertion(+) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index cac33ae..a3d895a 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -1364,6 +1364,7 @@ (V2SI "w") (V4SI "w") (DI "x") (V2DI "x") (V4HF "w") (V8HF "w") + (V4BF "w") (V8BF "w") (V2SF "w") (V4SF "w") (V2DF "x") (VNx16QI "w") (VNx8QI "w") (VNx4QI "w") (VNx2QI "w") -- cgit v1.1 From 2cde2d620fc5ff60264ee825fd6eea457d7c51d9 Mon Sep 17 00:00:00 2001 From: "Cui,Lili" Date: Sat, 24 Apr 2021 11:52:17 +0800 Subject: Synchronize Rocket Lake's processor_names and processor_cost_table with processor_type gcc/ChangeLog * common/config/i386/i386-common.c (processor_names): Sync processor_names with processor_type. * config/i386/i386-options.c (processor_cost_table): Sync processor_cost_table with processor_type. --- gcc/config/i386/i386-options.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c index cc7b617..64c6ef4 100644 --- a/gcc/config/i386/i386-options.c +++ b/gcc/config/i386/i386-options.c @@ -727,12 +727,12 @@ static const struct processor_costs *processor_cost_table[] = &icelake_cost, &icelake_cost, &icelake_cost, - &icelake_cost, &skylake_cost, &icelake_cost, &skylake_cost, &icelake_cost, &icelake_cost, + &icelake_cost, &intel_cost, &geode_cost, &k6_cost, -- cgit v1.1 From 71c8aaf29bb122ebe5e67c84903fd23ff05f04ec Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Tue, 27 Apr 2021 10:17:45 +0200 Subject: i386: Improve [QH]Imode rotates with masked shift count [PR99405] The following testcase shows that while we nicely optimize away the useless and? of shift count before rotation for [SD]Imode rotates, we don't do that for [QH]Imode. The following patch optimizes that by using the right iterator on those 4 patterns. 2021-04-27 Jakub Jelinek PR target/99405 * config/i386/i386.md (*3_mask, *3_mask_1): For any_rotate define_insn_split and following splitters, use SWI iterator instead of SWI48. 
* gcc.target/i386/pr99405.c: New test. --- gcc/config/i386/i386.md | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 27f100c..eff189f 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -11933,9 +11933,9 @@ ;; Avoid useless masking of count operand. (define_insn_and_split "*3_mask" - [(set (match_operand:SWI48 0 "nonimmediate_operand") - (any_rotate:SWI48 - (match_operand:SWI48 1 "nonimmediate_operand") + [(set (match_operand:SWI 0 "nonimmediate_operand") + (any_rotate:SWI + (match_operand:SWI 1 "nonimmediate_operand") (subreg:QI (and:SI (match_operand:SI 2 "register_operand" "c") @@ -11949,15 +11949,15 @@ "&& 1" [(parallel [(set (match_dup 0) - (any_rotate:SWI48 (match_dup 1) - (match_dup 2))) + (any_rotate:SWI (match_dup 1) + (match_dup 2))) (clobber (reg:CC FLAGS_REG))])] "operands[2] = gen_lowpart (QImode, operands[2]);") (define_split - [(set (match_operand:SWI48 0 "register_operand") - (any_rotate:SWI48 - (match_operand:SWI48 1 "const_int_operand") + [(set (match_operand:SWI 0 "register_operand") + (any_rotate:SWI + (match_operand:SWI 1 "const_int_operand") (subreg:QI (and:SI (match_operand:SI 2 "register_operand") @@ -11966,14 +11966,14 @@ == GET_MODE_BITSIZE (mode) - 1" [(set (match_dup 4) (match_dup 1)) (set (match_dup 0) - (any_rotate:SWI48 (match_dup 4) - (subreg:QI (match_dup 2) 0)))] + (any_rotate:SWI (match_dup 4) + (subreg:QI (match_dup 2) 0)))] "operands[4] = gen_reg_rtx (mode);") (define_insn_and_split "*3_mask_1" - [(set (match_operand:SWI48 0 "nonimmediate_operand") - (any_rotate:SWI48 - (match_operand:SWI48 1 "nonimmediate_operand") + [(set (match_operand:SWI 0 "nonimmediate_operand") + (any_rotate:SWI + (match_operand:SWI 1 "nonimmediate_operand") (and:QI (match_operand:QI 2 "register_operand" "c") (match_operand:QI 3 "const_int_operand")))) @@ -11986,14 +11986,14 @@ "&& 1" [(parallel [(set (match_dup 0) - (any_rotate:SWI48 (match_dup 1) - (match_dup 2))) + (any_rotate:SWI (match_dup 1) + (match_dup 2))) (clobber (reg:CC FLAGS_REG))])]) (define_split - [(set (match_operand:SWI48 0 "register_operand") - (any_rotate:SWI48 - (match_operand:SWI48 1 "const_int_operand") + [(set (match_operand:SWI 0 "register_operand") + (any_rotate:SWI + (match_operand:SWI 1 "const_int_operand") (and:QI (match_operand:QI 2 "register_operand") (match_operand:QI 3 "const_int_operand"))))] @@ -12001,7 +12001,7 @@ == GET_MODE_BITSIZE (mode) - 1" [(set (match_dup 4) (match_dup 1)) (set (match_dup 0) - (any_rotate:SWI48 (match_dup 4) (match_dup 2)))] + (any_rotate:SWI (match_dup 4) (match_dup 2)))] "operands[4] = gen_reg_rtx (mode);") ;; Implement rotation using two double-precision -- cgit v1.1 From 4cea5b8cb715e40e10174e6de405f26202fa3d6a Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Tue, 27 Apr 2021 12:18:03 +0100 Subject: aarch64: Handle SVE attributes in comp_type_attributes [PR100270] Even though "SVE type" and "SVE sizeless type" are marked as affecting type identity, the middle end doesn't truly believe it unless we also handle them in comp_type_attributes. gcc/ PR target/100270 * config/aarch64/aarch64.c (aarch64_comp_type_attributes): Handle SVE attributes. gcc/testsuite/ PR target/100270 * gcc.target/aarch64/sve/acle/general-c/pr100270_1.c: New test. * gcc.target/aarch64/sve/acle/general-c/sizeless-2.c: Change expected error message when subtracting pointers to different vector types. 
Expect warnings when mixing them elsewhere. * gcc.target/aarch64/sve/acle/general/attributes_7.c: Remove XFAILs. Tweak error messages for some cases. --- gcc/config/aarch64/aarch64.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 12625a4..dbaf6fb 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -25173,6 +25173,10 @@ aarch64_comp_type_attributes (const_tree type1, const_tree type2) return 0; if (!check_attr ("Advanced SIMD type")) return 0; + if (!check_attr ("SVE type")) + return 0; + if (!check_attr ("SVE sizeless type")) + return 0; return 1; } -- cgit v1.1 From 01d0bda8bdf3cd804e1e00915d432ad0cdc49399 Mon Sep 17 00:00:00 2001 From: Richard Earnshaw Date: Tue, 27 Apr 2021 12:25:30 +0100 Subject: arm: fix UB when compiling thumb2 with PIC [PR100236] arm_compute_save_core_reg_mask contains UB in that the saved PIC register number is used to create a bit mask. However, for some target options this register is undefined and we end up with a shift of ~0. On native compilations this is benign since the shift will still be large enough to move the bit outside of the range of the mask, but if cross compiling from a system that truncates out-of-range shifts to zero (or worse, raises a trap for such values) we'll get potentially wrong code (or a fault). gcc: PR target/100236 * config/arm/arm.c (THUMB2_WORK_REGS): Check PIC_OFFSET_TABLE_REGNUM is valid before including it in the mask. --- gcc/config/arm/arm.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 340f7c9..352b2cd 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -1051,9 +1051,13 @@ const char *fp_sysreg_names[NB_FP_SYSREGS] = { #define ARM_LSL_NAME "lsl" #define streq(string1, string2) (strcmp (string1, string2) == 0) -#define THUMB2_WORK_REGS (0xff & ~( (1 << THUMB_HARD_FRAME_POINTER_REGNUM) \ - | (1 << SP_REGNUM) | (1 << PC_REGNUM) \ - | (1 << PIC_OFFSET_TABLE_REGNUM))) +#define THUMB2_WORK_REGS \ + (0xff & ~((1 << THUMB_HARD_FRAME_POINTER_REGNUM) \ + | (1 << SP_REGNUM) \ + | (1 << PC_REGNUM) \ + | (PIC_OFFSET_TABLE_REGNUM != INVALID_REGNUM \ + ? (1 << PIC_OFFSET_TABLE_REGNUM) \ + : 0))) /* Initialization code. */ -- cgit v1.1 From 618ae596ebcd1de03857d20485d1324931852569 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Tue, 27 Apr 2021 15:46:16 +0200 Subject: aarch64: Fix UB in the compiler [PR100200] The following patch fixes UBs in the compiler when negativing a CONST_INT containing HOST_WIDE_INT_MIN. I've changed the spots where there wasn't an obvious earlier condition check or predicate that would fail for such CONST_INTs. 2021-04-27 Jakub Jelinek PR target/100200 * config/aarch64/predicates.md (aarch64_sub_immediate, aarch64_plus_immediate): Use -UINTVAL instead of -INTVAL. * config/aarch64/aarch64.md (casesi, rotl3): Likewise. * config/aarch64/aarch64.c (aarch64_print_operand, aarch64_split_atomic_op, aarch64_expand_subvti): Likewise. 
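A small illustration in plain C (not taken from the compiler sources) of the undefined behaviour being avoided, assuming a 64-bit HOST_WIDE_INT:

  #include <limits.h>

  long long
  negate_hwi (long long v)        /* v plays the role of INTVAL (x) */
  {
    /* -v overflows for v == LLONG_MIN; negating the unsigned counterpart
       wraps modulo 2^64 instead, which is what -UINTVAL (x) relies on.
       Converting the result back to a signed type is implementation-defined
       and modulo 2^64 in GCC.  */
    return (long long) -(unsigned long long) v;
  }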
--- gcc/config/aarch64/aarch64.c | 6 +++--- gcc/config/aarch64/aarch64.md | 5 +++-- gcc/config/aarch64/predicates.md | 4 ++-- 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index dbaf6fb..aa148ac 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -10778,7 +10778,7 @@ aarch64_print_operand (FILE *f, rtx x, int code) } if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT) - asm_fprintf (f, "%wd", -INTVAL (elt)); + asm_fprintf (f, "%wd", -UINTVAL (elt)); else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT && aarch64_print_vector_float_operand (f, x, true)) ; @@ -21598,7 +21598,7 @@ aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem, case MINUS: if (CONST_INT_P (value)) { - value = GEN_INT (-INTVAL (value)); + value = GEN_INT (-UINTVAL (value)); code = PLUS; } /* Fall through. */ @@ -23514,7 +23514,7 @@ aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1, { if (aarch64_plus_immediate (low_in2, DImode)) emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2, - GEN_INT (-INTVAL (low_in2)))); + GEN_INT (-UINTVAL (low_in2)))); else { low_in2 = force_reg (DImode, low_in2); diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index abfd845..aef6da9 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -747,7 +747,8 @@ constant can be represented in SImode, this is important for the corner case where operand[1] is INT_MIN. */ - operands[1] = GEN_INT (trunc_int_for_mode (-INTVAL (operands[1]), SImode)); + operands[1] + = GEN_INT (trunc_int_for_mode (-UINTVAL (operands[1]), SImode)); if (!(*insn_data[CODE_FOR_addsi3].operand[2].predicate) (operands[1], SImode)) @@ -5008,7 +5009,7 @@ /* (SZ - cnt) % SZ == -cnt % SZ */ if (CONST_INT_P (operands[2])) { - operands[2] = GEN_INT ((-INTVAL (operands[2])) + operands[2] = GEN_INT ((-UINTVAL (operands[2])) & (GET_MODE_BITSIZE (mode) - 1)); if (operands[2] == const0_rtx) { diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md index c55842b..49f02ae 100644 --- a/gcc/config/aarch64/predicates.md +++ b/gcc/config/aarch64/predicates.md @@ -121,12 +121,12 @@ (define_predicate "aarch64_sub_immediate" (and (match_code "const_int") - (match_test "aarch64_uimm12_shift (-INTVAL (op))"))) + (match_test "aarch64_uimm12_shift (-UINTVAL (op))"))) (define_predicate "aarch64_plus_immediate" (and (match_code "const_int") (ior (match_test "aarch64_uimm12_shift (INTVAL (op))") - (match_test "aarch64_uimm12_shift (-INTVAL (op))")))) + (match_test "aarch64_uimm12_shift (-UINTVAL (op))")))) (define_predicate "aarch64_plus_operand" (ior (match_operand 0 "register_operand") -- cgit v1.1 From 42a10bb884c0d5af2583b8bfe4d239ce95bf9e43 Mon Sep 17 00:00:00 2001 From: Alex Coplan Date: Tue, 27 Apr 2021 14:56:15 +0100 Subject: arm: Fix ICEs with compare-and-swap and -march=armv8-m.base [PR99977] The PR shows two ICEs with __sync_bool_compare_and_swap and -mcpu=cortex-m23 (equivalently, -march=armv8-m.base): one in LRA and one later on, after the CAS insn is split. The LRA ICE occurs because the @atomic_compare_and_swap_1 pattern attempts to tie two output operands together (operands 0 and 1 in the third alternative). LRA can't handle this, since it doesn't make sense for an insn to assign to the same operand twice. The later (post-splitting) ICE occurs because the expansion of the cbranchsi4_scratch insn doesn't quite go according to plan. 
As it stands, arm_split_compare_and_swap calls gen_cbranchsi4_scratch, attempting to pass a register (neg_bval) to use as a scratch register. However, since the RTL template has a match_scratch here, gen_cbranchsi4_scratch ignores this argument and produces a scratch rtx. Since this is all happening after RA, this is doomed to fail (and we get an ICE about the insn not matching its constraints). It seems that the motivation for the choice of constraints in the atomic_compare_and_swap pattern comes from an attempt to satisfy the constraints of the cbranchsi4_scratch insn. This insn requires the scratch register to be the same as the input register in the case that we use a larger negative immediate (one that satisfies J, but not L). Of course, as noted above, LRA refuses to assign two output operands to the same register, so this was never going to work. The solution I'm proposing here is to collapse the alternatives to the CAS insn (allowing the two output register operands to be matched to different registers) and to ensure that the constraints for cbranchsi4_scratch are met in arm_split_compare_and_swap. We do this by inserting a move to ensure the source and destination registers match if necessary (i.e. in the case of large negative immediates). Another notable change here is that we only do: emit_move_insn (neg_bval, const1_rtx); for non-negative immediates. This is because the ADDS instruction used in the negative case suffices to leave a suitable value in neg_bval: if the operands compare equal, we don't take the branch (so neg_bval will be set by the load exclusive). Otherwise, the ADDS will leave a nonzero value in neg_bval, which will correctly signal that the CAS has failed when it is later negated. gcc/ChangeLog: PR target/99977 * config/arm/arm.c (arm_split_compare_and_swap): Fix up codegen with negative immediates: ensure we expand cbranchsi4_scratch correctly and ensure we satisfy its constraints. * config/arm/sync.md (@atomic_compare_and_swap_1): Don't attempt to tie two output operands together with constraints; collapse two alternatives. (@atomic_compare_and_swap_1): Likewise. * config/arm/thumb1.md (cbranchsi4_neg_late): New. gcc/testsuite/ChangeLog: PR target/99977 * gcc.target/arm/pr99977.c: New test. --- gcc/config/arm/arm.c | 26 ++++++++++++++++++++++---- gcc/config/arm/sync.md | 28 ++++++++++++++-------------- gcc/config/arm/thumb1.md | 15 +++++++++++++++ 3 files changed, 51 insertions(+), 18 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 352b2cd..6641e3f 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -30743,13 +30743,31 @@ arm_split_compare_and_swap (rtx operands[]) } else { - emit_move_insn (neg_bval, const1_rtx); cond = gen_rtx_NE (VOIDmode, rval, oldval); if (thumb1_cmpneg_operand (oldval, SImode)) - emit_unlikely_jump (gen_cbranchsi4_scratch (neg_bval, rval, oldval, - label2, cond)); + { + rtx src = rval; + if (!satisfies_constraint_L (oldval)) + { + gcc_assert (satisfies_constraint_J (oldval)); + + /* For such immediates, ADDS needs the source and destination regs + to be the same. + + Normally this would be handled by RA, but this is all happening + after RA. 
*/ + emit_move_insn (neg_bval, rval); + src = neg_bval; + } + + emit_unlikely_jump (gen_cbranchsi4_neg_late (neg_bval, src, oldval, + label2, cond)); + } else - emit_unlikely_jump (gen_cbranchsi4_insn (cond, rval, oldval, label2)); + { + emit_move_insn (neg_bval, const1_rtx); + emit_unlikely_jump (gen_cbranchsi4_insn (cond, rval, oldval, label2)); + } } arm_emit_store_exclusive (mode, neg_bval, mem, newval, use_release); diff --git a/gcc/config/arm/sync.md b/gcc/config/arm/sync.md index e4682c0..b9fa870 100644 --- a/gcc/config/arm/sync.md +++ b/gcc/config/arm/sync.md @@ -187,20 +187,20 @@ ;; Constraints of this pattern must be at least as strict as those of the ;; cbranchsi operations in thumb1.md and aim to be as permissive. (define_insn_and_split "@atomic_compare_and_swap_1" - [(set (match_operand:CCSI 0 "cc_register_operand" "=&c,&l,&l,&l") ;; bool out + [(set (match_operand:CCSI 0 "cc_register_operand" "=&c,&l,&l") ;; bool out (unspec_volatile:CCSI [(const_int 0)] VUNSPEC_ATOMIC_CAS)) - (set (match_operand:SI 1 "s_register_operand" "=&r,&l,&0,&l*h") ;; val out + (set (match_operand:SI 1 "s_register_operand" "=&r,&l,&l*h") ;; val out (zero_extend:SI - (match_operand:NARROW 2 "mem_noofs_operand" "+Ua,Ua,Ua,Ua"))) ;; memory + (match_operand:NARROW 2 "mem_noofs_operand" "+Ua,Ua,Ua"))) ;; memory (set (match_dup 2) (unspec_volatile:NARROW - [(match_operand:SI 3 "arm_add_operand" "rIL,lIL*h,J,*r") ;; expected - (match_operand:NARROW 4 "s_register_operand" "r,r,r,r") ;; desired + [(match_operand:SI 3 "arm_add_operand" "rIL,lILJ*h,*r") ;; expected + (match_operand:NARROW 4 "s_register_operand" "r,r,r") ;; desired (match_operand:SI 5 "const_int_operand") ;; is_weak (match_operand:SI 6 "const_int_operand") ;; mod_s (match_operand:SI 7 "const_int_operand")] ;; mod_f VUNSPEC_ATOMIC_CAS)) - (clobber (match_scratch:SI 8 "=&r,X,X,X"))] + (clobber (match_scratch:SI 8 "=&r,X,X"))] "" "#" "&& reload_completed" @@ -209,7 +209,7 @@ arm_split_compare_and_swap (operands); DONE; } - [(set_attr "arch" "32,v8mb,v8mb,v8mb")]) + [(set_attr "arch" "32,v8mb,v8mb")]) (define_mode_attr cas_cmp_operand [(SI "arm_add_operand") (DI "cmpdi_operand")]) @@ -219,19 +219,19 @@ ;; Constraints of this pattern must be at least as strict as those of the ;; cbranchsi operations in thumb1.md and aim to be as permissive. 
(define_insn_and_split "@atomic_compare_and_swap_1" - [(set (match_operand:CCSI 0 "cc_register_operand" "=&c,&l,&l,&l") ;; bool out + [(set (match_operand:CCSI 0 "cc_register_operand" "=&c,&l,&l") ;; bool out (unspec_volatile:CCSI [(const_int 0)] VUNSPEC_ATOMIC_CAS)) - (set (match_operand:SIDI 1 "s_register_operand" "=&r,&l,&0,&l*h") ;; val out - (match_operand:SIDI 2 "mem_noofs_operand" "+Ua,Ua,Ua,Ua")) ;; memory + (set (match_operand:SIDI 1 "s_register_operand" "=&r,&l,&l*h") ;; val out + (match_operand:SIDI 2 "mem_noofs_operand" "+Ua,Ua,Ua")) ;; memory (set (match_dup 2) (unspec_volatile:SIDI - [(match_operand:SIDI 3 "" ",lIL*h,J,*r") ;; expect - (match_operand:SIDI 4 "s_register_operand" "r,r,r,r") ;; desired + [(match_operand:SIDI 3 "" ",lILJ*h,*r") ;; expect + (match_operand:SIDI 4 "s_register_operand" "r,r,r") ;; desired (match_operand:SI 5 "const_int_operand") ;; is_weak (match_operand:SI 6 "const_int_operand") ;; mod_s (match_operand:SI 7 "const_int_operand")] ;; mod_f VUNSPEC_ATOMIC_CAS)) - (clobber (match_scratch:SI 8 "=&r,X,X,X"))] + (clobber (match_scratch:SI 8 "=&r,X,X"))] "" "#" "&& reload_completed" @@ -240,7 +240,7 @@ arm_split_compare_and_swap (operands); DONE; } - [(set_attr "arch" "32,v8mb,v8mb,v8mb")]) + [(set_attr "arch" "32,v8mb,v8mb")]) (define_insn_and_split "atomic_exchange" [(set (match_operand:QHSD 0 "s_register_operand" "=&r,&r") ;; output diff --git a/gcc/config/arm/thumb1.md b/gcc/config/arm/thumb1.md index c98b59c..084ed65 100644 --- a/gcc/config/arm/thumb1.md +++ b/gcc/config/arm/thumb1.md @@ -1206,6 +1206,21 @@ (set_attr "type" "multiple")] ) +;; An expander which makes use of the cbranchsi4_scratch insn, but can +;; be used safely after RA. +(define_expand "cbranchsi4_neg_late" + [(parallel [ + (set (pc) (if_then_else + (match_operator 4 "arm_comparison_operator" + [(match_operand:SI 1 "s_register_operand") + (match_operand:SI 2 "thumb1_cmpneg_operand")]) + (label_ref (match_operand 3 "" "")) + (pc))) + (clobber (match_operand:SI 0 "s_register_operand")) + ])] + "TARGET_THUMB1" +) + ;; Changes to the constraints of this pattern must be propagated to those of ;; atomic compare_and_swap splitters in sync.md. These must be at least as ;; strict as the constraints here and aim to be as permissive. -- cgit v1.1 From 1c0c371d0ea297af2e3180c64cd18f2bfce919b1 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Tue, 27 Apr 2021 17:50:53 +0200 Subject: aarch64: Fix up last commit [PR100200] Pedantically signed vs. unsigned mismatches in va_arg are only well defined if the value can be represented in both signed and unsigned integer types. 2021-04-27 Jakub Jelinek PR target/100200 * config/aarch64/aarch64.c (aarch64_print_operand): Cast -UINTVAL back to HOST_WIDE_INT. 
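A hypothetical sketch of the rule being respected here, with a plain variadic function standing in for asm_fprintf and its %wd directive:

  #include <stdarg.h>

  long long
  read_signed (int n, ...)        /* reads its argument as signed, like %wd */
  {
    va_list ap;
    va_start (ap, n);
    long long v = va_arg (ap, long long);
    va_end (ap);
    return v;
  }

  long long
  call (long long x)
  {
    /* Casting back to a signed type keeps the passed argument and the
       va_arg type matched even when -(unsigned long long) x is not
       representable as long long.  */
    return read_signed (1, (long long) -(unsigned long long) x);
  }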
--- gcc/config/aarch64/aarch64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index aa148ac..a863af1 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -10778,7 +10778,7 @@ aarch64_print_operand (FILE *f, rtx x, int code) } if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT) - asm_fprintf (f, "%wd", -UINTVAL (elt)); + asm_fprintf (f, "%wd", (HOST_WIDE_INT) -UINTVAL (elt)); else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT && aarch64_print_vector_float_operand (f, x, true)) ; -- cgit v1.1 From a21b399708175f6fc0ac723a0cebc127da421c60 Mon Sep 17 00:00:00 2001 From: David Edelsohn Date: Sun, 11 Apr 2021 19:41:26 -0400 Subject: aix: TLS precompute register parameters (PR 94177) AIX uses a compiler-managed TOC for global data, including TLS symbols. The GCC TOC implementation manages the TOC entries through the constant pool. TLS symbols sometimes require a function call to obtain the TLS base pointer. The arguments to the TLS call can conflict with arguments to a normal function call if the TLS symbol is an argument in the normal call. GCC specifically checks for this situation and precomputes the TLS arguments, but the mechanism to check for this requirement utilizes legitimate_constant_p(). The necessary result of legitimate_constant_p() for correct TOC behavior and for correct TLS argument behavior is in conflict. This patch adds a new target hook precompute_tls_p() to decide if an argument should be precomputed regardless of the result from legitmate_constant_p(). gcc/ChangeLog: PR target/94177 * calls.c (precompute_register_parameters): Additionally test targetm.precompute_tls_p to pre-compute argument. * config/rs6000/aix.h (TARGET_PRECOMPUTE_TLS_P): Define. * config/rs6000/rs6000.c (rs6000_aix_precompute_tls_p): New. * target.def (precompute_tls_p): New. * doc/tm.texi.in (TARGET_PRECOMPUTE_TLS_P): Add hook documentation. * doc/tm.texi: Regenerated. --- gcc/config/rs6000/aix.h | 1 + gcc/config/rs6000/rs6000.c | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/aix.h b/gcc/config/rs6000/aix.h index 7fccb313..b116e1a 100644 --- a/gcc/config/rs6000/aix.h +++ b/gcc/config/rs6000/aix.h @@ -279,3 +279,4 @@ /* Use standard DWARF numbering for DWARF debugging information. */ #define RS6000_USE_DWARF_NUMBERING +#define TARGET_PRECOMPUTE_TLS_P rs6000_aix_precompute_tls_p diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 844fee8..60b8e3e 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -9608,7 +9608,8 @@ rs6000_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x) && SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0)) != 0) return true; - /* Do not place an ELF TLS symbol in the constant pool. */ + /* Allow AIX TOC TLS symbols in the constant pool, + but not ELF TLS symbols. */ return TARGET_ELF && tls_referenced_p (x); } @@ -25370,6 +25371,18 @@ rs6000_legitimate_constant_p (machine_mode mode, rtx x) return true; } +/* Implement TARGET_PRECOMPUTE_TLS_P. + + On the AIX, TLS symbols are in the TOC, which is maintained in the + constant pool. AIX TOC TLS symbols need to be pre-computed, but + must be considered legitimate constants. 
*/ + +static bool +rs6000_aix_precompute_tls_p (machine_mode mode ATTRIBUTE_UNUSED, rtx x) +{ + return tls_referenced_p (x); +} + /* Return TRUE iff the sequence ending in LAST sets the static chain. */ -- cgit v1.1 From f82658338756fe9a38a728aa542d786a0e889e21 Mon Sep 17 00:00:00 2001 From: David Edelsohn Date: Tue, 27 Apr 2021 16:59:59 +0000 Subject: powerpc: fix bootstrap. gcc/ChangeLog: * config/rs6000/rs6000.c (rs6000_aix_precompute_tls_p): Protect with TARGET_AIX_OS. --- gcc/config/rs6000/rs6000.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 60b8e3e..14ff56a 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -25371,6 +25371,7 @@ rs6000_legitimate_constant_p (machine_mode mode, rtx x) return true; } +#if TARGET_AIX_OS /* Implement TARGET_PRECOMPUTE_TLS_P. On the AIX, TLS symbols are in the TOC, which is maintained in the @@ -25382,6 +25383,7 @@ rs6000_aix_precompute_tls_p (machine_mode mode ATTRIBUTE_UNUSED, rtx x) { return tls_referenced_p (x); } +#endif /* Return TRUE iff the sequence ending in LAST sets the static chain. */ -- cgit v1.1 From 436e6f7d85cbd744c3f3aa393043e615dca1d36f Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Wed, 21 Apr 2021 23:33:02 +0200 Subject: VAX: Remove dead `adjacent_operands_p' function This function has never been used and it is unclear what its intended purpose was. gcc/ * config/vax/vax-protos.h (adjacent_operands_p): Remove prototype. * config/vax/vax.c (adjacent_operands_p): Remove. --- gcc/config/vax/vax-protos.h | 1 - gcc/config/vax/vax.c | 73 --------------------------------------------- 2 files changed, 74 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/vax/vax-protos.h b/gcc/config/vax/vax-protos.h index 89fddec..6dcbf53 100644 --- a/gcc/config/vax/vax-protos.h +++ b/gcc/config/vax/vax-protos.h @@ -24,7 +24,6 @@ extern void vax_expand_prologue (void); extern bool vax_acceptable_pic_operand_p (rtx, bool, bool); extern machine_mode vax_select_cc_mode (enum rtx_code, rtx, rtx); extern const char *cond_name (rtx); -extern bool adjacent_operands_p (rtx, rtx, machine_mode); extern const char *rev_cond_name (rtx); extern void print_operand_address (FILE *, rtx); extern void print_operand (FILE *, rtx, int); diff --git a/gcc/config/vax/vax.c b/gcc/config/vax/vax.c index 726c371..870af2b 100644 --- a/gcc/config/vax/vax.c +++ b/gcc/config/vax/vax.c @@ -2108,79 +2108,6 @@ vax_expand_addsub_di_operands (rtx * operands, enum rtx_code code) } } -bool -adjacent_operands_p (rtx lo, rtx hi, machine_mode mode) -{ - HOST_WIDE_INT lo_offset; - HOST_WIDE_INT hi_offset; - - if (GET_CODE (lo) != GET_CODE (hi)) - return false; - - if (REG_P (lo)) - return mode == SImode && REGNO (lo) + 1 == REGNO (hi); - if (CONST_INT_P (lo)) - return INTVAL (hi) == 0 && UINTVAL (lo) < 64; - if (CONST_INT_P (lo)) - return mode != SImode; - - if (!MEM_P (lo)) - return false; - - if (MEM_VOLATILE_P (lo) || MEM_VOLATILE_P (hi)) - return false; - - lo = XEXP (lo, 0); - hi = XEXP (hi, 0); - - if (GET_CODE (lo) == POST_INC /* || GET_CODE (lo) == PRE_DEC */) - return rtx_equal_p (lo, hi); - - switch (GET_CODE (lo)) - { - case REG: - case SYMBOL_REF: - lo_offset = 0; - break; - case CONST: - lo = XEXP (lo, 0); - /* FALLTHROUGH */ - case PLUS: - if (!CONST_INT_P (XEXP (lo, 1))) - return false; - lo_offset = INTVAL (XEXP (lo, 1)); - lo = XEXP (lo, 0); - break; - default: - return false; - } - - switch (GET_CODE (hi)) - { - case REG: - case SYMBOL_REF: 
- hi_offset = 0; - break; - case CONST: - hi = XEXP (hi, 0); - /* FALLTHROUGH */ - case PLUS: - if (!CONST_INT_P (XEXP (hi, 1))) - return false; - hi_offset = INTVAL (XEXP (hi, 1)); - hi = XEXP (hi, 0); - break; - default: - return false; - } - - if (GET_CODE (lo) == MULT || GET_CODE (lo) == PLUS) - return false; - - return rtx_equal_p (lo, hi) - && hi_offset - lo_offset == GET_MODE_SIZE (mode); -} - /* Output assembler code for a block containing the constant parts of a trampoline, leaving space for the variable parts. */ -- cgit v1.1 From f3bfed3381be2e616599679b2a093b0ac8f1c5f7 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Wed, 21 Apr 2021 23:33:11 +0200 Subject: VAX: Fix ill-formed `jbbi' insn operands The insn has extraneous operand #3 that is aliased in RTL to operand #0 with a constraint. The operands specify a single-bit field in memory that the machine instruction produced boths reads for the purpose of determining whether to branch or not and either clears or sets according to the machine operation selected with the `ccss' iterator. The caller of the insn is supposed to supply the same rtx for both operands. This odd arrangement happens to work with old reload, but breaks with libatomic if LRA is used instead: .../libatomic/flag.c: In function 'atomic_flag_test_and_set': .../libatomic/flag.c:36:1: error: unable to generate reloads for: 36 | } | ^ (jump_insn 7 6 19 2 (unspec_volatile [ (set (pc) (if_then_else (eq (zero_extract:SI (mem/v:QI (reg:SI 27) [-1 S1 A8]) (const_int 1 [0x1]) (const_int 0 [0])) (const_int 1 [0x1])) (label_ref:SI 25) (pc))) (set (zero_extract:SI (mem/v:QI (reg:SI 28) [-1 S1 A8]) (const_int 1 [0x1]) (const_int 0 [0])) (const_int 1 [0x1])) ] 100) ".../libatomic/flag.c":35:10 669 {jbbssiqi} (nil) -> 25) during RTL pass: reload .../libatomic/flag.c:36:1: internal compiler error: in curr_insn_transform, at lra-constraints.c:4098 0x1112c587 _fatal_insn(char const*, rtx_def const*, char const*, int, char const*) .../gcc/rtl-error.c:108 0x10ee6563 curr_insn_transform .../gcc/lra-constraints.c:4098 0x10eeaf87 lra_constraints(bool) .../gcc/lra-constraints.c:5133 0x10ec97e3 lra(_IO_FILE*) .../gcc/lra.c:2336 0x10e4633f do_reload .../gcc/ira.c:5827 0x10e46b27 execute .../gcc/ira.c:6013 Please submit a full bug report, with preprocessed source if appropriate. Please include the complete backtrace with any bug report. See for instructions. Switch to using `match_dup' as expected then for a machine instruction that in its encoding only has one actual operand in for the single-bit field. gcc/ * config/vax/builtins.md (jbbi): Remove operand #3. (sync_lock_test_and_set): Adjust accordingly. (sync_lock_release): Likewise. 
--- gcc/config/vax/builtins.md | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/vax/builtins.md b/gcc/config/vax/builtins.md index 3d1cbcd..ff97ff3 100644 --- a/gcc/config/vax/builtins.md +++ b/gcc/config/vax/builtins.md @@ -174,8 +174,7 @@ label = gen_label_rtx (); emit_move_insn (operands[0], const1_rtx); - emit_jump_insn (gen_jbbssi (operands[1], const0_rtx, label, - operands[1])); + emit_jump_insn (gen_jbbssi (operands[1], const0_rtx, label)); emit_move_insn (operands[0], const0_rtx); emit_label (label); DONE; @@ -193,8 +192,7 @@ FAIL; label = gen_label_rtx (); - emit_jump_insn (gen_jbbcci (operands[0], const0_rtx, label, - operands[0])); + emit_jump_insn (gen_jbbcci (operands[0], const0_rtx, label)); emit_label (label); DONE; }") @@ -204,13 +202,13 @@ [(set (pc) (if_then_else (eq (zero_extract:SI - (match_operand:VAXint 0 "any_memory_operand" "") + (match_operand:VAXint 0 "any_memory_operand" "+") (const_int 1) (match_operand:SI 1 "general_operand" "nrmT")) (const_int bit)) (label_ref (match_operand 2 "" "")) (pc))) - (set (zero_extract:SI (match_operand:VAXint 3 "any_memory_operand" "+0") + (set (zero_extract:SI (match_dup 0) (const_int 1) (match_dup 1)) (const_int bit))] -- cgit v1.1 From c605a8bf92708e81d771426a87b3baddc32082dd Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Wed, 21 Apr 2021 23:33:25 +0200 Subject: VAX: Accept ASHIFT in address expressions Fix regressions: FAIL: gcc.c-torture/execute/20090113-2.c -O1 (internal compiler error) FAIL: gcc.c-torture/execute/20090113-2.c -O1 (test for excess errors) FAIL: gcc.c-torture/execute/20090113-3.c -O1 (internal compiler error) FAIL: gcc.c-torture/execute/20090113-3.c -O1 (test for excess errors) triggering if LRA is used rather than old reload and caused by: (plus:SI (plus:SI (mult:SI (reg:SI 30 [ _10 ]) (const_int 4 [0x4])) (reg/f:SI 26 [ _6 ])) (const_int 12 [0xc])) coming from: (insn 58 57 59 10 (set (reg:SI 33 [ _13 ]) (zero_extract:SI (mem:SI (plus:SI (plus:SI (mult:SI (reg:SI 30 [ _10 ]) (const_int 4 [0x4])) (reg/f:SI 26 [ _6 ])) (const_int 12 [0xc])) [4 _6->bits[_10]+0 S4 A32]) (reg:QI 56) (reg:SI 53))) ".../gcc/testsuite/gcc.c-torture/execute/20090113-2.c":64:12 490 {*extzv_non_const} (expr_list:REG_DEAD (reg:QI 56) (expr_list:REG_DEAD (reg:SI 53) (expr_list:REG_DEAD (reg:SI 30 [ _10 ]) (expr_list:REG_DEAD (reg/f:SI 26 [ _6 ]) (nil)))))) being converted into: (plus:SI (plus:SI (ashift:SI (reg:SI 30 [ _10 ]) (const_int 2 [0x2])) (reg/f:SI 26 [ _6 ])) (const_int 12 [0xc])) which is an rtx the VAX backend currently does not recognize as a valid machine address, although apparently it is only inside MEM rtx's that indexed addressing is supposed to be canonicalized to a MULT rather than ASHIFT form. Handle the ASHIFT form too throughout the backend then. 
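To make the equivalence concrete (an illustration, not part of the
change itself): for an SImode element the two index terms

  (mult:SI (reg:SI 30) (const_int 4))
  (ashift:SI (reg:SI 30) (const_int 2))

scale the register by the same factor, since GET_MODE_SIZE (SImode) is
4 and 4 == 1 << 2.  Accordingly, index_term_p now compares the mode
size against INTVAL of the constant for MULT and against 1 << INTVAL
for ASHIFT, and print_operand_address and vax_address_cost_1 accept
either code as an index term.
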
The change appears to also improve code generation with old reload and code size stats are as follows, collected from 18153 executables built in `check-c' GCC testing: samples average median -------------------------------------- regressions 47 0.702% 0.521% unchanged 17503 0.000% 0.000% progressions 603 -0.920% -0.403% -------------------------------------- total 18153 -0.029% 0.000% with a small number of outliers (over 5% size change): old new change %change filename ---------------------------------------------------- 1885 1645 -240 -12.7320 pr53505.exe 1331 1221 -110 -8.2644 pr89634.exe 1553 1473 -80 -5.1513 stdatomic-vm.exe 1413 1341 -72 -5.0955 pr45830.exe 1415 1343 -72 -5.0883 stdatomic-vm.exe 25765 24463 -1302 -5.0533 strlen-5.exe 25765 24463 -1302 -5.0533 strlen-5.exe 25765 24463 -1302 -5.0533 strlen-5.exe 1191 1131 -60 -5.0377 20050527-1.exe (all changes on the expansion side are below 5%). gcc/ * config/vax/vax.c (print_operand_address, vax_address_cost_1) (index_term_p): Handle ASHIFT too. --- gcc/config/vax/vax.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/vax/vax.c b/gcc/config/vax/vax.c index 870af2b..96a7925 100644 --- a/gcc/config/vax/vax.c +++ b/gcc/config/vax/vax.c @@ -333,12 +333,12 @@ print_operand_address (FILE * file, rtx addr) case PLUS: /* There can be either two or three things added here. One must be a - REG. One can be either a REG or a MULT of a REG and an appropriate - constant, and the third can only be a constant or a MEM. + REG. One can be either a REG or a MULT/ASHIFT of a REG and an + appropriate constant, and the third can only be a constant or a MEM. We get these two or three things and put the constant or MEM in - OFFSET, the MULT or REG in IREG, and the REG in BREG. If we have - a register and can't tell yet if it is a base or index register, + OFFSET, the MULT/ASHIFT or REG in IREG, and the REG in BREG. If we + have a register and can't tell yet if it is a base or index register, put it into REG1. 
*/ reg1 = 0; ireg = 0; breg = 0; offset = 0; @@ -355,12 +355,14 @@ print_operand_address (FILE * file, rtx addr) offset = XEXP (addr, 1); addr = XEXP (addr, 0); } - else if (GET_CODE (XEXP (addr, 1)) == MULT) + else if (GET_CODE (XEXP (addr, 1)) == MULT + || GET_CODE (XEXP (addr, 1)) == ASHIFT) { ireg = XEXP (addr, 1); addr = XEXP (addr, 0); } - else if (GET_CODE (XEXP (addr, 0)) == MULT) + else if (GET_CODE (XEXP (addr, 0)) == MULT + || GET_CODE (XEXP (addr, 0)) == ASHIFT) { ireg = XEXP (addr, 0); addr = XEXP (addr, 1); @@ -385,7 +387,7 @@ print_operand_address (FILE * file, rtx addr) else reg1 = addr; } - else if (GET_CODE (addr) == MULT) + else if (GET_CODE (addr) == MULT || GET_CODE (addr) == ASHIFT) ireg = addr; else { @@ -416,7 +418,8 @@ print_operand_address (FILE * file, rtx addr) } else { - gcc_assert (GET_CODE (XEXP (addr, 0)) == MULT); + gcc_assert (GET_CODE (XEXP (addr, 0)) == MULT + || GET_CODE (XEXP (addr, 0)) == ASHIFT); gcc_assert (!ireg); ireg = XEXP (addr, 0); } @@ -447,7 +450,8 @@ print_operand_address (FILE * file, rtx addr) } else { - gcc_assert (GET_CODE (XEXP (addr, 1)) == MULT); + gcc_assert (GET_CODE (XEXP (addr, 1)) == MULT + || GET_CODE (XEXP (addr, 1)) == ASHIFT); gcc_assert (!ireg); ireg = XEXP (addr, 1); } @@ -506,7 +510,7 @@ print_operand_address (FILE * file, rtx addr) if (ireg != 0) { - if (GET_CODE (ireg) == MULT) + if (GET_CODE (ireg) == MULT || GET_CODE (ireg) == ASHIFT) ireg = XEXP (ireg, 0); gcc_assert (REG_P (ireg)); fprintf (file, "[%s]", reg_names[REGNO (ireg)]); @@ -707,6 +711,7 @@ vax_address_cost_1 (rtx addr) reg = 1; break; case MULT: + case ASHIFT: indexed = 1; /* 2 on VAX 2 */ break; case CONST_INT: @@ -1824,23 +1829,26 @@ static bool index_term_p (rtx prod, machine_mode mode, bool strict) { rtx xfoo0, xfoo1; + bool log_p; if (GET_MODE_SIZE (mode) == 1) return BASE_REGISTER_P (prod, strict); - if (GET_CODE (prod) != MULT || GET_MODE_SIZE (mode) > 8) + if ((GET_CODE (prod) != MULT && GET_CODE (prod) != ASHIFT) + || GET_MODE_SIZE (mode) > 8) return false; + log_p = GET_CODE (prod) == ASHIFT; xfoo0 = XEXP (prod, 0); xfoo1 = XEXP (prod, 1); if (CONST_INT_P (xfoo0) - && INTVAL (xfoo0) == (int)GET_MODE_SIZE (mode) + && GET_MODE_SIZE (mode) == (log_p ? 1 << INTVAL (xfoo0) : INTVAL (xfoo0)) && INDEX_REGISTER_P (xfoo1, strict)) return true; if (CONST_INT_P (xfoo1) - && INTVAL (xfoo1) == (int)GET_MODE_SIZE (mode) + && GET_MODE_SIZE (mode) == (log_p ? 1 << INTVAL (xfoo1) : INTVAL (xfoo1)) && INDEX_REGISTER_P (xfoo0, strict)) return true; -- cgit v1.1 From 0366e2b40e9ea5fc61c9a694de0c8c76a238b03c Mon Sep 17 00:00:00 2001 From: David Edelsohn Date: Tue, 27 Apr 2021 16:09:07 -0400 Subject: aix: Alias -m64 to -maix64 and -m32 to -maix32. GCC on AIX historically has used -maix64 and -maix32 to switch to 64 bit mode or 32 bit mode, unlike other ports that use -m64 and -m32. The Alias() directive for options cannot be used because aix64 is expected in multiple parts of the compiler infrastructure and one cannot switch to -m64 due to backward compatibility. This patch defines DRIVER_SELF_SPECS to translate -m64 to -maix64 and -m32 to -maix32 so that the command line option compatible with other targets can be used while continuing to allow the historical options. gcc/ChangeLog: * config/rs6000/aix.h (SUBTARGET_DRIVER_SELF_SPECS): New. * config/rs6000/aix64.opt (m64): New. (m32): New. 
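As a usage note (illustration only): in GCC spec syntax
"%{m64:-maix64}" tells the driver to substitute -maix64 whenever -m64
appears on the command line, and -m32 is handled analogously, so

  gcc -m64 -O2 foo.c

is now processed the same way as

  gcc -maix64 -O2 foo.c

while the historical -maix64 and -maix32 spellings continue to work
unchanged.
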
--- gcc/config/rs6000/aix.h | 6 ++++++ gcc/config/rs6000/aix64.opt | 6 ++++++ 2 files changed, 12 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/aix.h b/gcc/config/rs6000/aix.h index b116e1a..662785c 100644 --- a/gcc/config/rs6000/aix.h +++ b/gcc/config/rs6000/aix.h @@ -280,3 +280,9 @@ #define RS6000_USE_DWARF_NUMBERING #define TARGET_PRECOMPUTE_TLS_P rs6000_aix_precompute_tls_p + +/* Replace -m64 with -maix64 and -m32 with -maix32. */ +#undef SUBTARGET_DRIVER_SELF_SPECS +#define SUBTARGET_DRIVER_SELF_SPECS \ +"%{m64:-maix64} % Date: Wed, 28 Apr 2021 17:54:52 +0100 Subject: aarch64: Fix address mode for vec_concat pattern [PR100305] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The load_pair_lanes patterns match a vec_concat of two adjacent 64-bit memory locations as a single 128-bit load. The Utq constraint made sure that the address was suitable for a 128-bit vector, but this meant that it allowed some addresses that aren't valid for the 64-bit element mode. Two obvious fixes were: (1) Continue to accept addresses that aren't valid for the element modes. This would mean changing the mode of operands[1] before printing it. It would also mean using a custom predicate instead of the current memory_operand. (2) Restrict addresses to the intersection of those that are valid element and vector addresses. The problem with (1) is that, as well as being more complicated, it doesn't deal with the fact that we still have a memory_operand for the second element. If we encourage the first operand to be outside the range of a normal element memory_operand, we'll have to reload the second operand to make it valid. This reload will often be dead code, but will be kept around because the RTL pattern makes it look as though the second element address is still needed. This patch therefore does (2) instead. As mentioned in the PR notes, I think we have a general problem with the way that the aarch64 port deals with paired addresses. There's nothing to guarantee that the two addresses will be reloaded in a way that keeps them “obviously” adjacent, so the rtx_equal_p conditions could fail if something rechecked them later. For this particular pattern, I think it would be better to teach simplify-rtx.c to fold the vec_concat to a normal vector memory reference, to remove any suggestion that targets should try to match the unsimplified form. That obviously wouldn't be suitable for backports though. gcc/ PR target/100305 * config/aarch64/constraints.md (Utq): Require the address to be valid for both the element mode and for V2DImode. gcc/testsuite/ PR target/100305 * gcc.c-torture/compile/pr100305.c: New test. 
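A hypothetical source-level example (not the new gcc.c-torture test,
which is not reproduced here) of code that can be matched by a
load_pair_lanes pattern and therefore exercises the Utq constraint:

  #include <arm_neon.h>

  /* Two loads of adjacent 64-bit elements followed by a vec_concat;
     combine can merge this into a single 128-bit load whose address
     must now be valid for both the element mode and V2DImode.  */
  float64x2_t
  load_pair (const double *p)
  {
    return vcombine_f64 (vld1_f64 (p), vld1_f64 (p + 1));
  }
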
--- gcc/config/aarch64/constraints.md | 2 ++ 1 file changed, 2 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md index fd3e925..3b49b45 100644 --- a/gcc/config/aarch64/constraints.md +++ b/gcc/config/aarch64/constraints.md @@ -327,6 +327,8 @@ "@internal An address valid for loading or storing a 128-bit AdvSIMD register" (and (match_code "mem") + (match_test "aarch64_legitimate_address_p (GET_MODE (op), + XEXP (op, 0), 1)") (match_test "aarch64_legitimate_address_p (V2DImode, XEXP (op, 0), 1)"))) -- cgit v1.1 From 59f5d16f2c5db4d9592c8ce6453afe81334bb012 Mon Sep 17 00:00:00 2001 From: Richard Earnshaw Date: Wed, 28 Apr 2021 17:56:38 +0100 Subject: arm: fix UB due to missing mode check [PR100311] Some places in the compiler iterate over all the fixed registers to check if that register can be used in a particular mode. The idiom is to iterate over the register and then for that register, if it supports the current mode to check all that register and any additional registers needed (HARD_REGNO_NREGS). If these two checks are not fully aligned then it is possible to generate a buffer overrun when testing data objects that are sized by the number of hard regs in the machine. The VPR register is a case where these checks were not consistent and because this is the last HARD register the result was that we ended up overflowing the fixed_regs array. gcc: PR target/100311 * config/arm/arm.c (arm_hard_regno_mode_ok): Only allow VPR to be used in HImode. --- gcc/config/arm/arm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 6641e3f..0371d98 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -25269,7 +25269,7 @@ arm_hard_regno_mode_ok (unsigned int regno, machine_mode mode) return false; if (IS_VPR_REGNUM (regno)) - return true; + return mode == HImode; if (TARGET_THUMB1) /* For the Thumb we only allow values bigger than SImode in -- cgit v1.1 From 3ba781d3b5c8efadb60866c9743b657e8f0eb222 Mon Sep 17 00:00:00 2001 From: Senthil Kumar Selvaraj Date: Wed, 28 Apr 2021 17:29:12 +0000 Subject: AVR cc0 conversion See https://gcc.gnu.org/pipermail/gcc-patches/2021-January/563638.html for background. This patch converts the avr backend to MODE_CC. It addresses some of the comments made in the previous submission over here (https://gcc.gnu.org/pipermail/gcc-patches/2020-December/561757.html). Specifically, this patch has 1. Automatic clobber of REG_CC in inline asm statements, via TARGET_MD_ASM_ADJUST hook. 2. Direct clobber of REG_CC in insns emitted after reload (pro and epilogue). 3. Regression testing done on atmega8, atmega128, attiny40 and atxmega128a3 devices (more details below). 4. Verification and fixes for casesi and avr_compare_pattern related code that inspects insns, by looking at avr-casesi and mach RTL dumps. 5. Use length of parallel instead of passing in operand counts when generating code for shift patterns. 6. Fixes for indentation glitches. 7. Removal of CC_xxx stuff in avr-protos.h. In the places where the macros were still used (cond_string), I've replaced them with a bool hardcoded to false. I expect this will go away/get fixed when I eventually add specific CC modes. Things still to do: 1. Adjustment of peepholes/define_splits to match against patterns with REG_CC clobber. 2. Model effect of non-compare insns on REG_CC using additional CC modes. 
I'm hoping to use of a modified version of the cc attribute and define_subst (again inspired by the cris port), to do this. 3. RTX cost adjustment. gcc/ * config/avr/avr-dimode.md: Turn existing patterns into define_insn_and_split style patterns where the splitter adds a clobber of the condition code register. Drop "cc" attribute. Add new patterns to match output of the splitters. * config/avr/avr-fixed.md: Likewise. * config/avr/avr.c (cc_reg_rtx): New. (avr_parallel_insn_from_insns): Adjust insn count for removal of set of cc0. (avr_is_casesi_sequence): Likewise. (avr_casei_sequence_check_operands): Likewise. (avr_optimize_casesi): Likewise. Also insert new insns after jump_insn. (avr_pass_casesi::avr_rest_of_handle_casesi): Adjust for removal of set of cc0. (avr_init_expanders): Initialize cc_reg_rtx. (avr_regno_reg_class): Handle REG_CC. (cond_string): Remove usage of CC_OVERFLOW_UNUSABLE. (avr_notice_update_cc): Remove function. (ret_cond_branch): Remove usage of CC_OVERFLOW_UNUSABLE. (compare_condition): Adjust for PARALLEL with REG_CC clobber. (out_shift_with_cnt): Likewise. (ashlhi3_out): Likewise. (ashrhi3_out): Likewise. (lshrhi3_out): Likewise. (avr_class_max_nregs): Return single reg for REG_CC. (avr_compare_pattern): Check for REG_CC instead of cc0_rtx. (avr_reorg_remove_redundant_compare): Likewise. (avr_reorg):Adjust for PARALLEL with REG_CC clobber. (avr_hard_regno_nregs): Return single reg for REG_CC. (avr_hard_regno_mode_ok): Allow only CCmode for REG_CC. (avr_md_asm_adjust): Clobber REG_CC. (TARGET_HARD_REGNO_NREGS): Define. (TARGET_CLASS_MAX_NREGS): Define. (TARGET_MD_ASM_ADJUST): Define. * config/avr/avr.h (FIRST_PSEUDO_REGISTER): Adjust for REG_CC. (enum reg_class): Add CC_REG class. (NOTICE_UPDATE_CC): Remove. (CC_OVERFLOW_UNUSABLE): Remove. (CC_NO_CARRY): Remove. * config/avr/avr.md: Turn existing patterns into define_insn_and_split style patterns where the splitter adds a clobber of the condition code register. Drop "cc" attribute. Add new patterns to match output of the splitters. (sez): Remove unused pattern. 
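A small user-level illustration of the automatic REG_CC clobber for
inline asm described above (a hypothetical example, not from the
patch): an AVR asm statement that alters SREG keeps working without
spelling out any condition-code clobber, because the new
avr_md_asm_adjust hook appends (clobber (reg:CC REG_CC)) automatically,
mirroring the implicit cc0 clobber that asm statements had before:

  static inline unsigned char
  ones_complement (unsigned char x)
  {
    /* COM updates the SREG flags; no explicit clobber needs to be
       written here, the hook adds the REG_CC clobber behind the
       scenes.  */
    __asm__ ("com %0" : "+r" (x));
    return x;
  }
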
--- gcc/config/avr/avr-dimode.md | 336 +++- gcc/config/avr/avr-fixed.md | 380 +++- gcc/config/avr/avr.c | 318 ++-- gcc/config/avr/avr.h | 34 +- gcc/config/avr/avr.md | 4244 +++++++++++++++++++++++++++++++++--------- 5 files changed, 4120 insertions(+), 1192 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/avr/avr-dimode.md b/gcc/config/avr/avr-dimode.md index 1817c16..1eb9599 100644 --- a/gcc/config/avr/avr-dimode.md +++ b/gcc/config/avr/avr-dimode.md @@ -95,39 +95,77 @@ ;; "adddq3_insn" "addudq3_insn" ;; "addda3_insn" "adduda3_insn" ;; "addta3_insn" "adduta3_insn" -(define_insn "add3_insn" +(define_insn_and_split "add3_insn" [(set (reg:ALL8 ACC_A) (plus:ALL8 (reg:ALL8 ACC_A) (reg:ALL8 ACC_B)))] "avr_have_dimode" + "#" + "&& reload_completed" + [(parallel [(set (reg:ALL8 ACC_A) + (plus:ALL8 (reg:ALL8 ACC_A) + (reg:ALL8 ACC_B))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*add3_insn" + [(set (reg:ALL8 ACC_A) + (plus:ALL8 (reg:ALL8 ACC_A) + (reg:ALL8 ACC_B))) + (clobber (reg:CC REG_CC))] + "avr_have_dimode && reload_completed" "%~call __adddi3" - [(set_attr "adjust_len" "call") - (set_attr "cc" "clobber")]) + [(set_attr "adjust_len" "call")]) -(define_insn "adddi3_const8_insn" +(define_insn_and_split "adddi3_const8_insn" [(set (reg:DI ACC_A) (plus:DI (reg:DI ACC_A) (sign_extend:DI (reg:QI REG_X))))] "avr_have_dimode" + "#" + "&& reload_completed" + [(parallel [(set (reg:DI ACC_A) + (plus:DI (reg:DI ACC_A) + (sign_extend:DI (reg:QI REG_X)))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*adddi3_const8_insn" + [(set (reg:DI ACC_A) + (plus:DI (reg:DI ACC_A) + (sign_extend:DI (reg:QI REG_X)))) + (clobber (reg:CC REG_CC))] + "avr_have_dimode && reload_completed" "%~call __adddi3_s8" - [(set_attr "adjust_len" "call") - (set_attr "cc" "clobber")]) + [(set_attr "adjust_len" "call")]) ;; "adddi3_const_insn" ;; "adddq3_const_insn" "addudq3_const_insn" ;; "addda3_const_insn" "adduda3_const_insn" ;; "addta3_const_insn" "adduta3_const_insn" -(define_insn "add3_const_insn" +(define_insn_and_split "add3_const_insn" [(set (reg:ALL8 ACC_A) (plus:ALL8 (reg:ALL8 ACC_A) (match_operand:ALL8 0 "const_operand" "n Ynn")))] "avr_have_dimode && !s8_operand (operands[0], VOIDmode)" + "#" + "&& reload_completed" + [(parallel [(set (reg:ALL8 ACC_A) + (plus:ALL8 (reg:ALL8 ACC_A) + (match_dup 0))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*add3_const_insn" + [(set (reg:ALL8 ACC_A) + (plus:ALL8 (reg:ALL8 ACC_A) + (match_operand:ALL8 0 "const_operand" "n Ynn"))) + (clobber (reg:CC REG_CC))] + "avr_have_dimode + && !s8_operand (operands[0], VOIDmode) + && reload_completed" { return avr_out_plus (insn, operands); } - [(set_attr "adjust_len" "plus") - (set_attr "cc" "clobber")]) + [(set_attr "adjust_len" "plus")]) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -167,29 +205,53 @@ ;; "subdq3_insn" "subudq3_insn" ;; "subda3_insn" "subuda3_insn" ;; "subta3_insn" "subuta3_insn" -(define_insn "sub3_insn" +(define_insn_and_split "sub3_insn" [(set (reg:ALL8 ACC_A) (minus:ALL8 (reg:ALL8 ACC_A) (reg:ALL8 ACC_B)))] "avr_have_dimode" + "#" + "&& reload_completed" + [(parallel [(set (reg:ALL8 ACC_A) + (minus:ALL8 (reg:ALL8 ACC_A) + (reg:ALL8 ACC_B))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*sub3_insn" + [(set (reg:ALL8 ACC_A) + (minus:ALL8 (reg:ALL8 ACC_A) + (reg:ALL8 ACC_B))) + (clobber (reg:CC REG_CC))] + "avr_have_dimode && reload_completed" "%~call __subdi3" - [(set_attr "adjust_len" "call") - (set_attr "cc" "set_czn")]) + [(set_attr "adjust_len" "call")]) ;; 
"subdi3_const_insn" ;; "subdq3_const_insn" "subudq3_const_insn" ;; "subda3_const_insn" "subuda3_const_insn" ;; "subta3_const_insn" "subuta3_const_insn" -(define_insn "sub3_const_insn" +(define_insn_and_split "sub3_const_insn" [(set (reg:ALL8 ACC_A) (minus:ALL8 (reg:ALL8 ACC_A) (match_operand:ALL8 0 "const_operand" "n Ynn")))] "avr_have_dimode" + "#" + "&& reload_completed" + [(parallel [(set (reg:ALL8 ACC_A) + (minus:ALL8 (reg:ALL8 ACC_A) + (match_dup 0))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*sub3_const_insn" + [(set (reg:ALL8 ACC_A) + (minus:ALL8 (reg:ALL8 ACC_A) + (match_operand:ALL8 0 "const_operand" "n Ynn"))) + (clobber (reg:CC REG_CC))] + "avr_have_dimode && reload_completed" { return avr_out_plus (insn, operands); } - [(set_attr "adjust_len" "plus") - (set_attr "cc" "clobber")]) + [(set_attr "adjust_len" "plus")]) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Signed Saturating Addition and Subtraction @@ -220,25 +282,49 @@ DONE; }) -(define_insn "3_insn" +(define_insn_and_split "3_insn" [(set (reg:ALL8S ACC_A) (ss_addsub:ALL8S (reg:ALL8S ACC_A) (reg:ALL8S ACC_B)))] "avr_have_dimode" + "#" + "&& reload_completed" + [(parallel [(set (reg:ALL8S ACC_A) + (ss_addsub:ALL8S (reg:ALL8S ACC_A) + (reg:ALL8S ACC_B))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*3_insn" + [(set (reg:ALL8S ACC_A) + (ss_addsub:ALL8S (reg:ALL8S ACC_A) + (reg:ALL8S ACC_B))) + (clobber (reg:CC REG_CC))] + "avr_have_dimode && reload_completed" "%~call __3" - [(set_attr "adjust_len" "call") - (set_attr "cc" "clobber")]) + [(set_attr "adjust_len" "call")]) -(define_insn "3_const_insn" +(define_insn_and_split "3_const_insn" [(set (reg:ALL8S ACC_A) (ss_addsub:ALL8S (reg:ALL8S ACC_A) (match_operand:ALL8S 0 "const_operand" "n Ynn")))] "avr_have_dimode" + "#" + "&& reload_completed" + [(parallel [(set (reg:ALL8S ACC_A) + (ss_addsub:ALL8S (reg:ALL8S ACC_A) + (match_dup 0))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*3_const_insn" + [(set (reg:ALL8S ACC_A) + (ss_addsub:ALL8S (reg:ALL8S ACC_A) + (match_operand:ALL8S 0 "const_operand" "n Ynn"))) + (clobber (reg:CC REG_CC))] + "avr_have_dimode && reload_completed" { return avr_out_plus (insn, operands); } - [(set_attr "adjust_len" "plus") - (set_attr "cc" "clobber")]) + [(set_attr "adjust_len" "plus")]) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Unsigned Saturating Addition and Subtraction @@ -269,25 +355,49 @@ DONE; }) -(define_insn "3_insn" +(define_insn_and_split "3_insn" [(set (reg:ALL8U ACC_A) (us_addsub:ALL8U (reg:ALL8U ACC_A) (reg:ALL8U ACC_B)))] "avr_have_dimode" + "#" + "&& reload_completed" + [(parallel [(set (reg:ALL8U ACC_A) + (us_addsub:ALL8U (reg:ALL8U ACC_A) + (reg:ALL8U ACC_B))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*3_insn" + [(set (reg:ALL8U ACC_A) + (us_addsub:ALL8U (reg:ALL8U ACC_A) + (reg:ALL8U ACC_B))) + (clobber (reg:CC REG_CC))] + "avr_have_dimode && reload_completed" "%~call __3" - [(set_attr "adjust_len" "call") - (set_attr "cc" "clobber")]) + [(set_attr "adjust_len" "call")]) -(define_insn "3_const_insn" +(define_insn_and_split "3_const_insn" [(set (reg:ALL8U ACC_A) (us_addsub:ALL8U (reg:ALL8U ACC_A) (match_operand:ALL8U 0 "const_operand" "n Ynn")))] "avr_have_dimode" + "#" + "&& reload_completed" + [(parallel [(set (reg:ALL8U ACC_A) + (us_addsub:ALL8U (reg:ALL8U ACC_A) + (match_operand:ALL8U 0 "const_operand" "n Ynn"))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*3_const_insn" + [(set (reg:ALL8U ACC_A) + (us_addsub:ALL8U (reg:ALL8U ACC_A) + 
(match_operand:ALL8U 0 "const_operand" "n Ynn"))) + (clobber (reg:CC REG_CC))] + "avr_have_dimode && reload_completed" { return avr_out_plus (insn, operands); } - [(set_attr "adjust_len" "plus") - (set_attr "cc" "clobber")]) + [(set_attr "adjust_len" "plus")]) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Negation @@ -306,13 +416,23 @@ DONE; }) -(define_insn "negdi2_insn" +(define_insn_and_split "negdi2_insn" [(set (reg:DI ACC_A) (neg:DI (reg:DI ACC_A)))] "avr_have_dimode" + "#" + "&& reload_completed" + [(parallel [(set (reg:DI ACC_A) + (neg:DI (reg:DI ACC_A))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*negdi2_insn" + [(set (reg:DI ACC_A) + (neg:DI (reg:DI ACC_A))) + (clobber (reg:CC REG_CC))] + "avr_have_dimode && reload_completed" "%~call __negdi2" - [(set_attr "adjust_len" "call") - (set_attr "cc" "clobber")]) + [(set_attr "adjust_len" "call")]) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -322,7 +442,7 @@ (define_expand "conditional_jump" [(set (pc) (if_then_else - (match_operator 0 "ordered_comparison_operator" [(cc0) + (match_operator 0 "ordered_comparison_operator" [(reg:CC REG_CC) (const_int 0)]) (label_ref (match_operand 1 "" "")) (pc)))] @@ -333,13 +453,14 @@ ;; "cbranchda4" "cbranchuda4" ;; "cbranchta4" "cbranchuta4" (define_expand "cbranch4" - [(parallel [(match_operand:ALL8 1 "register_operand" "") - (match_operand:ALL8 2 "nonmemory_operand" "") - (match_operator 0 "ordered_comparison_operator" [(cc0) - (const_int 0)]) - (label_ref (match_operand 3 "" ""))])] + [(set (pc) + (if_then_else (match_operator 0 "ordered_comparison_operator" + [(match_operand:ALL8 1 "register_operand" "") + (match_operand:ALL8 2 "nonmemory_operand" "")]) + (label_ref (match_operand 3 "" "")) + (pc)))] "avr_have_dimode" - { + { rtx acc_a = gen_rtx_REG (mode, ACC_A); avr_fix_inputs (operands, 1 << 2, regmask (mode, ACC_A)); @@ -348,19 +469,36 @@ if (s8_operand (operands[2], VOIDmode)) { emit_move_insn (gen_rtx_REG (QImode, REG_X), operands[2]); - emit_insn (gen_compare_const8_di2 ()); + emit_jump_insn (gen_cbranch_const8_di2_split (operands[0], operands[3])); } else if (const_operand (operands[2], GET_MODE (operands[2]))) { - emit_insn (gen_compare_const_2 (operands[2])); + emit_jump_insn (gen_cbranch_const_2_split (operands[0], + operands[2], + operands[3])); } else { emit_move_insn (gen_rtx_REG (mode, ACC_B), operands[2]); - emit_insn (gen_compare_2 ()); + emit_jump_insn (gen_cbranch_2_split (operands[0], operands[3])); } + DONE; + }) - emit_jump_insn (gen_conditional_jump (operands[0], operands[3])); +(define_insn_and_split "cbranch_2_split" + [(set (pc) + (if_then_else (match_operator 0 "ordered_comparison_operator" + [(reg:ALL8 ACC_A) + (reg:ALL8 ACC_B)]) + (label_ref (match_operand 1 "" "")) + (pc)))] + "avr_have_dimode" + "#" + "&& reload_completed" + [(const_int 0)] + { + emit_insn (gen_compare_2 ()); + emit_jump_insn (gen_conditional_jump (operands[0], operands[1])); DONE; }) @@ -369,39 +507,74 @@ ;; "compare_da2" "compare_uda2" ;; "compare_ta2" "compare_uta2" (define_insn "compare_2" - [(set (cc0) - (compare (reg:ALL8 ACC_A) - (reg:ALL8 ACC_B)))] - "avr_have_dimode" + [(set (reg:CC REG_CC) + (compare:CC (reg:ALL8 ACC_A) + (reg:ALL8 ACC_B)))] + "reload_completed && avr_have_dimode" "%~call __cmpdi2" - [(set_attr "adjust_len" "call") - (set_attr "cc" "compare")]) + [(set_attr "adjust_len" "call")]) -(define_insn "compare_const8_di2" - [(set (cc0) - (compare (reg:DI ACC_A) - (sign_extend:DI (reg:QI REG_X))))] 
+(define_insn_and_split "cbranch_const8_di2_split" + [(set (pc) + (if_then_else (match_operator 0 "ordered_comparison_operator" + [(reg:DI ACC_A) + (sign_extend:DI (reg:QI REG_X))]) + (label_ref (match_operand 1 "" "")) + (pc)))] "avr_have_dimode" + "#" + "&& reload_completed" + [(const_int 0)] + { + emit_insn (gen_compare_const8_di2 ()); + emit_jump_insn (gen_conditional_jump (operands[0], operands[1])); + DONE; + }) + +(define_insn "compare_const8_di2" + [(set (reg:CC REG_CC) + (compare:CC (reg:DI ACC_A) + (sign_extend:DI (reg:QI REG_X))))] + "reload_completed && avr_have_dimode" "%~call __cmpdi2_s8" - [(set_attr "adjust_len" "call") - (set_attr "cc" "compare")]) + [(set_attr "adjust_len" "call")]) + +(define_insn_and_split "cbranch_const_2_split" + [(set (pc) + (if_then_else (match_operator 0 "ordered_comparison_operator" + [(reg:ALL8 ACC_A) + (match_operand:ALL8 1 "const_operand" "n Ynn")]) + (label_ref (match_operand 2 "" "")) + (pc))) + (clobber (match_scratch:QI 3 "=&d"))] + "avr_have_dimode + && !s8_operand (operands[1], VOIDmode)" + "#" + "&& reload_completed" + [(const_int 0)] + { + emit_insn (gen_compare_const_2 (operands[1], operands[3])); + emit_jump_insn (gen_conditional_jump (operands[0], operands[2])); + DONE; + }) + ;; "compare_const_di2" ;; "compare_const_dq2" "compare_const_udq2" ;; "compare_const_da2" "compare_const_uda2" ;; "compare_const_ta2" "compare_const_uta2" (define_insn "compare_const_2" - [(set (cc0) - (compare (reg:ALL8 ACC_A) - (match_operand:ALL8 0 "const_operand" "n Ynn"))) - (clobber (match_scratch:QI 1 "=&d"))] - "avr_have_dimode + [(set (reg:CC REG_CC) + (compare:CC (reg:ALL8 ACC_A) + (match_operand:ALL8 0 "const_operand" "n Ynn"))) + (clobber (match_operand:QI 1 "register_operand" "=&d"))] + "reload_completed + && avr_have_dimode && !s8_operand (operands[0], VOIDmode)" { return avr_out_compare64 (insn, operands, NULL); } - [(set_attr "adjust_len" "compare64") - (set_attr "cc" "compare")]) + [(set_attr "adjust_len" "compare64")]) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -444,14 +617,26 @@ ;; "ashludq3_insn" "ashrudq3_insn" "lshrudq3_insn" "rotludq3_insn" ;; "ashluda3_insn" "ashruda3_insn" "lshruda3_insn" "rotluda3_insn" ;; "ashluta3_insn" "ashruta3_insn" "lshruta3_insn" "rotluta3_insn" -(define_insn "3_insn" +(define_insn_and_split "3_insn" [(set (reg:ALL8 ACC_A) (di_shifts:ALL8 (reg:ALL8 ACC_A) (reg:QI 16)))] "avr_have_dimode" + "#" + "&& reload_completed" + [(parallel [(set (reg:ALL8 ACC_A) + (di_shifts:ALL8 (reg:ALL8 ACC_A) + (reg:QI 16))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*3_insn" + [(set (reg:ALL8 ACC_A) + (di_shifts:ALL8 (reg:ALL8 ACC_A) + (reg:QI 16))) + (clobber (reg:CC REG_CC))] + "avr_have_dimode && reload_completed" "%~call __di3" - [(set_attr "adjust_len" "call") - (set_attr "cc" "clobber")]) + [(set_attr "adjust_len" "call")]) ;; "umulsidi3" ;; "mulsidi3" @@ -475,7 +660,8 @@ ;; "umulsidi3_insn" ;; "mulsidi3_insn" -(define_insn "mulsidi3_insn" + +(define_insn_and_split "mulsidi3_insn" [(set (reg:DI ACC_A) (mult:DI (any_extend:DI (reg:SI 18)) (any_extend:DI (reg:SI 22)))) @@ -483,6 +669,24 @@ (clobber (reg:HI REG_Z))] "avr_have_dimode && AVR_HAVE_MUL" + "#" + "&& reload_completed" + [(parallel [(set (reg:DI ACC_A) + (mult:DI (any_extend:DI (reg:SI 18)) + (any_extend:DI (reg:SI 22)))) + (clobber (reg:HI REG_X)) + (clobber (reg:HI REG_Z)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*mulsidi3_insn" + [(set (reg:DI ACC_A) + (mult:DI (any_extend:DI (reg:SI 18)) + (any_extend:DI (reg:SI 22)))) + 
(clobber (reg:HI REG_X)) + (clobber (reg:HI REG_Z)) + (clobber (reg:CC REG_CC))] + "avr_have_dimode + && AVR_HAVE_MUL + && reload_completed" "%~call __mulsidi3" - [(set_attr "adjust_len" "call") - (set_attr "cc" "clobber")]) + [(set_attr "adjust_len" "call")]) diff --git a/gcc/config/avr/avr-fixed.md b/gcc/config/avr/avr-fixed.md index a3b49d5..1c4902f 100644 --- a/gcc/config/avr/avr-fixed.md +++ b/gcc/config/avr/avr-fixed.md @@ -56,27 +56,53 @@ TA UTA QI HI SI DI]) -(define_insn "fract2" +(define_insn_and_split "fract2" [(set (match_operand:FIXED_A 0 "register_operand" "=r") (fract_convert:FIXED_A (match_operand:FIXED_B 1 "register_operand" "r")))] "mode != mode" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (fract_convert:FIXED_A + (match_dup 1))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*fract2" + [(set (match_operand:FIXED_A 0 "register_operand" "=r") + (fract_convert:FIXED_A + (match_operand:FIXED_B 1 "register_operand" "r"))) + (clobber (reg:CC REG_CC))] + "mode != mode + && reload_completed" { return avr_out_fract (insn, operands, true, NULL); } - [(set_attr "cc" "clobber") - (set_attr "adjust_len" "sfract")]) + [(set_attr "adjust_len" "sfract")]) -(define_insn "fractuns2" +(define_insn_and_split "fractuns2" [(set (match_operand:FIXED_A 0 "register_operand" "=r") (unsigned_fract_convert:FIXED_A (match_operand:FIXED_B 1 "register_operand" "r")))] "mode != mode" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (unsigned_fract_convert:FIXED_A + (match_dup 1))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*fractuns2" + [(set (match_operand:FIXED_A 0 "register_operand" "=r") + (unsigned_fract_convert:FIXED_A + (match_operand:FIXED_B 1 "register_operand" "r"))) + (clobber (reg:CC REG_CC))] + "mode != mode + && reload_completed" { return avr_out_fract (insn, operands, false, NULL); } - [(set_attr "cc" "clobber") - (set_attr "adjust_len" "ufract")]) + [(set_attr "adjust_len" "ufract")]) ;****************************************************************************** ;** Saturated Addition and Subtraction @@ -92,29 +118,53 @@ ;; "ssaddqq3" "ssaddhq3" "ssaddha3" "ssaddsq3" "ssaddsa3" ;; "sssubqq3" "sssubhq3" "sssubha3" "sssubsq3" "sssubsa3" -(define_insn "3" +(define_insn_and_split "3" [(set (match_operand:ALL124S 0 "register_operand" "=??d,d") (ss_addsub:ALL124S (match_operand:ALL124S 1 "register_operand" "0,0") (match_operand:ALL124S 2 "nonmemory_operand" "r,Ynn")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (ss_addsub:ALL124S (match_dup 1) + (match_dup 2))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*3" + [(set (match_operand:ALL124S 0 "register_operand" "=??d,d") + (ss_addsub:ALL124S (match_operand:ALL124S 1 "register_operand" "0,0") + (match_operand:ALL124S 2 "nonmemory_operand" "r,Ynn"))) + (clobber (reg:CC REG_CC))] + "reload_completed" { return avr_out_plus (insn, operands); } - [(set_attr "cc" "clobber") - (set_attr "adjust_len" "plus")]) + [(set_attr "adjust_len" "plus")]) ;; "usadduqq3" "usadduhq3" "usadduha3" "usaddusq3" "usaddusa3" ;; "ussubuqq3" "ussubuhq3" "ussubuha3" "ussubusq3" "ussubusa3" -(define_insn "3" +(define_insn_and_split "3" [(set (match_operand:ALL124U 0 "register_operand" "=??r,d") (us_addsub:ALL124U (match_operand:ALL124U 1 "register_operand" "0,0") (match_operand:ALL124U 2 "nonmemory_operand" "r,Ynn")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (us_addsub:ALL124U (match_dup 1) + (match_dup 2))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*3" + [(set 
(match_operand:ALL124U 0 "register_operand" "=??r,d") + (us_addsub:ALL124U (match_operand:ALL124U 1 "register_operand" "0,0") + (match_operand:ALL124U 2 "nonmemory_operand" "r,Ynn"))) + (clobber (reg:CC REG_CC))] + "reload_completed" { return avr_out_plus (insn, operands); } - [(set_attr "cc" "clobber") - (set_attr "adjust_len" "plus")]) + [(set_attr "adjust_len" "plus")]) ;****************************************************************************** ;** Saturated Negation and Absolute Value @@ -134,21 +184,41 @@ DONE; }) -(define_insn "ssnegqq2" +(define_insn_and_split "ssnegqq2" [(set (match_operand:QQ 0 "register_operand" "=r") (ss_neg:QQ (match_operand:QQ 1 "register_operand" "0")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (ss_neg:QQ (match_dup 1))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*ssnegqq2" + [(set (match_operand:QQ 0 "register_operand" "=r") + (ss_neg:QQ (match_operand:QQ 1 "register_operand" "0"))) + (clobber (reg:CC REG_CC))] + "reload_completed" "neg %0\;brvc 0f\;dec %0\;0:" - [(set_attr "cc" "clobber") - (set_attr "length" "3")]) + [(set_attr "length" "3")]) -(define_insn "ssabsqq2" +(define_insn_and_split "ssabsqq2" [(set (match_operand:QQ 0 "register_operand" "=r") (ss_abs:QQ (match_operand:QQ 1 "register_operand" "0")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (ss_abs:QQ (match_dup 1))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*ssabsqq2" + [(set (match_operand:QQ 0 "register_operand" "=r") + (ss_abs:QQ (match_operand:QQ 1 "register_operand" "0"))) + (clobber (reg:CC REG_CC))] + "reload_completed" "sbrc %0,7\;neg %0\;sbrc %0,7\;dec %0" - [(set_attr "cc" "clobber") - (set_attr "length" "4")]) + [(set_attr "length" "4")]) ;; "ssneghq2" "ssnegha2" "ssnegsq2" "ssnegsa2" ;; "ssabshq2" "ssabsha2" "ssabssq2" "ssabssa2" @@ -166,23 +236,43 @@ ;; "*ssneghq2" "*ssnegha2" ;; "*ssabshq2" "*ssabsha2" -(define_insn "*2" +(define_insn_and_split "*2_split" [(set (reg:ALL2S 24) (ss_abs_neg:ALL2S (reg:ALL2S 24)))] "" + "#" + "&& reload_completed" + [(parallel [(set (reg:ALL2S 24) + (ss_abs_neg:ALL2S (reg:ALL2S 24))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*2" + [(set (reg:ALL2S 24) + (ss_abs_neg:ALL2S (reg:ALL2S 24))) + (clobber (reg:CC REG_CC))] + "reload_completed" "%~call ___2" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) ;; "*ssnegsq2" "*ssnegsa2" ;; "*ssabssq2" "*ssabssa2" -(define_insn "*2" +(define_insn_and_split "*2_split" [(set (reg:ALL4S 22) (ss_abs_neg:ALL4S (reg:ALL4S 22)))] "" + "#" + "&& reload_completed" + [(parallel [(set (reg:ALL4S 22) + (ss_abs_neg:ALL4S (reg:ALL4S 22))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*2" + [(set (reg:ALL4S 22) + (ss_abs_neg:ALL4S (reg:ALL4S 22))) + (clobber (reg:CC REG_CC))] + "reload_completed" "%~call ___4" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) ;****************************************************************************** ; mul @@ -200,23 +290,47 @@ DONE; }) -(define_insn "mulqq3_enh" +(define_insn_and_split "mulqq3_enh" [(set (match_operand:QQ 0 "register_operand" "=r") (mult:QQ (match_operand:QQ 1 "register_operand" "a") (match_operand:QQ 2 "register_operand" "a")))] "AVR_HAVE_MUL" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (mult:QQ (match_dup 1) + (match_dup 2))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*mulqq3_enh" + [(set (match_operand:QQ 0 "register_operand" "=r") + (mult:QQ (match_operand:QQ 1 "register_operand" "a") + 
(match_operand:QQ 2 "register_operand" "a"))) + (clobber (reg:CC REG_CC))] + "AVR_HAVE_MUL && reload_completed" "fmuls %1,%2\;dec r1\;brvs 0f\;inc r1\;0:\;mov %0,r1\;clr __zero_reg__" - [(set_attr "length" "6") - (set_attr "cc" "clobber")]) + [(set_attr "length" "6")]) -(define_insn "muluqq3_enh" +(define_insn_and_split "muluqq3_enh" [(set (match_operand:UQQ 0 "register_operand" "=r") (mult:UQQ (match_operand:UQQ 1 "register_operand" "r") (match_operand:UQQ 2 "register_operand" "r")))] "AVR_HAVE_MUL" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (mult:UQQ (match_dup 1) + (match_dup 2))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*muluqq3_enh" + [(set (match_operand:UQQ 0 "register_operand" "=r") + (mult:UQQ (match_operand:UQQ 1 "register_operand" "r") + (match_operand:UQQ 2 "register_operand" "r"))) + (clobber (reg:CC REG_CC))] + "AVR_HAVE_MUL && reload_completed" "mul %1,%2\;mov %0,r1\;clr __zero_reg__" - [(set_attr "length" "3") - (set_attr "cc" "clobber")]) + [(set_attr "length" "3")]) (define_expand "mulqq3_nomul" [(set (reg:QQ 24) @@ -255,16 +369,32 @@ avr_fix_inputs (operands, 1 << 2, regmask (UQQmode, 22)); }) -(define_insn "*mulqq3.call" +(define_insn_and_split "*mulqq3.call_split" [(set (reg:QQ 23) (mult:QQ (reg:QQ 24) (reg:QQ 25))) (clobber (reg:QI 22)) (clobber (reg:HI 24))] "!AVR_HAVE_MUL" + "#" + "&& reload_completed" + [(parallel [(set (reg:QQ 23) + (mult:QQ (reg:QQ 24) + (reg:QQ 25))) + (clobber (reg:QI 22)) + (clobber (reg:HI 24)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*mulqq3.call" + [(set (reg:QQ 23) + (mult:QQ (reg:QQ 24) + (reg:QQ 25))) + (clobber (reg:QI 22)) + (clobber (reg:HI 24)) + (clobber (reg:CC REG_CC))] + "!AVR_HAVE_MUL && reload_completed" "%~call __mulqq3" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) ;; "mulhq3" "muluhq3" @@ -288,15 +418,29 @@ ;; "*mulhq3.call" "*muluhq3.call" ;; "*mulha3.call" "*muluha3.call" -(define_insn "*mul3.call" +(define_insn_and_split "*mul3.call_split" [(set (reg:ALL2QA 24) (mult:ALL2QA (reg:ALL2QA 18) (reg:ALL2QA 26))) (clobber (reg:HI 22))] "AVR_HAVE_MUL" + "#" + "&& reload_completed" + [(parallel [(set (reg:ALL2QA 24) + (mult:ALL2QA (reg:ALL2QA 18) + (reg:ALL2QA 26))) + (clobber (reg:HI 22)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*mul3.call" + [(set (reg:ALL2QA 24) + (mult:ALL2QA (reg:ALL2QA 18) + (reg:ALL2QA 26))) + (clobber (reg:HI 22)) + (clobber (reg:CC REG_CC))] + "AVR_HAVE_MUL && reload_completed" "%~call __mul3" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) ;; On the enhanced core, don't clobber either input and use a separate output @@ -318,14 +462,26 @@ }) ;; "*mulsa3.call" "*mulusa3.call" -(define_insn "*mul3.call" +(define_insn_and_split "*mul3.call_split" [(set (reg:ALL4A 24) (mult:ALL4A (reg:ALL4A 16) (reg:ALL4A 20)))] "AVR_HAVE_MUL" + "#" + "&& reload_completed" + [(parallel [(set (reg:ALL4A 24) + (mult:ALL4A (reg:ALL4A 16) + (reg:ALL4A 20))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*mul3.call" + [(set (reg:ALL4A 24) + (mult:ALL4A (reg:ALL4A 16) + (reg:ALL4A 20))) + (clobber (reg:CC REG_CC))] + "AVR_HAVE_MUL && reload_completed" "%~call __mul3" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) ; / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / ; div @@ -351,15 +507,29 @@ ;; "*divqq3.call" "*udivuqq3.call" -(define_insn "*3.call" +(define_insn_and_split "*3.call_split" [(set (reg:ALL1Q 24) (usdiv:ALL1Q 
(reg:ALL1Q 25) (reg:ALL1Q 22))) (clobber (reg:QI 25))] "" + "#" + "&& reload_completed" + [(parallel [(set (reg:ALL1Q 24) + (usdiv:ALL1Q (reg:ALL1Q 25) + (reg:ALL1Q 22))) + (clobber (reg:QI 25)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*3.call" + [(set (reg:ALL1Q 24) + (usdiv:ALL1Q (reg:ALL1Q 25) + (reg:ALL1Q 22))) + (clobber (reg:QI 25)) + (clobber (reg:CC REG_CC))] + "reload_completed" "%~call __3" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) ;; "divhq3" "udivuhq3" ;; "divha3" "udivuha3" @@ -382,16 +552,32 @@ ;; "*divhq3.call" "*udivuhq3.call" ;; "*divha3.call" "*udivuha3.call" -(define_insn "*3.call" +(define_insn_and_split "*3.call_split" [(set (reg:ALL2QA 24) (usdiv:ALL2QA (reg:ALL2QA 26) (reg:ALL2QA 22))) (clobber (reg:HI 26)) (clobber (reg:QI 21))] "" + "#" + "&& reload_completed" + [(parallel [(set (reg:ALL2QA 24) + (usdiv:ALL2QA (reg:ALL2QA 26) + (reg:ALL2QA 22))) + (clobber (reg:HI 26)) + (clobber (reg:QI 21)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*3.call" + [(set (reg:ALL2QA 24) + (usdiv:ALL2QA (reg:ALL2QA 26) + (reg:ALL2QA 22))) + (clobber (reg:HI 26)) + (clobber (reg:QI 21)) + (clobber (reg:CC REG_CC))] + "reload_completed" "%~call __3" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) ;; Note the first parameter gets passed in already offset by 2 bytes @@ -414,16 +600,32 @@ }) ;; "*divsa3.call" "*udivusa3.call" -(define_insn "*3.call" +(define_insn_and_split "*3.call_split" [(set (reg:ALL4A 22) (usdiv:ALL4A (reg:ALL4A 24) (reg:ALL4A 18))) (clobber (reg:HI 26)) (clobber (reg:HI 30))] "" + "#" + "&& reload_completed" + [(parallel [(set (reg:ALL4A 22) + (usdiv:ALL4A (reg:ALL4A 24) + (reg:ALL4A 18))) + (clobber (reg:HI 26)) + (clobber (reg:HI 30)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*3.call" + [(set (reg:ALL4A 22) + (usdiv:ALL4A (reg:ALL4A 24) + (reg:ALL4A 18))) + (clobber (reg:HI 26)) + (clobber (reg:HI 30)) + (clobber (reg:CC REG_CC))] + "reload_completed" "%~call __3" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) ;****************************************************************************** @@ -474,51 +676,109 @@ ;; "roundqq3_const" "rounduqq3_const" ;; "roundhq3_const" "rounduhq3_const" "roundha3_const" "rounduha3_const" ;; "roundsq3_const" "roundusq3_const" "roundsa3_const" "roundusa3_const" -(define_insn "round3_const" +(define_insn_and_split "round3_const" [(set (match_operand:ALL124QA 0 "register_operand" "=d") (unspec:ALL124QA [(match_operand:ALL124QA 1 "register_operand" "0") (match_operand:HI 2 "const_int_operand" "n") (const_int 0)] UNSPEC_ROUND))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (unspec:ALL124QA [(match_dup 1) + (match_dup 2) + (const_int 0)] + UNSPEC_ROUND)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*round3_const" + [(set (match_operand:ALL124QA 0 "register_operand" "=d") + (unspec:ALL124QA [(match_operand:ALL124QA 1 "register_operand" "0") + (match_operand:HI 2 "const_int_operand" "n") + (const_int 0)] + UNSPEC_ROUND)) + (clobber (reg:CC REG_CC))] + "reload_completed" { return avr_out_round (insn, operands); } - [(set_attr "cc" "clobber") - (set_attr "adjust_len" "round")]) + [(set_attr "adjust_len" "round")]) ;; "*roundqq3.libgcc" "*rounduqq3.libgcc" -(define_insn "*round3.libgcc" +(define_insn_and_split "*round3.libgcc_split" [(set (reg:ALL1Q 24) (unspec:ALL1Q [(reg:ALL1Q 22) (reg:QI 24)] UNSPEC_ROUND)) (clobber (reg:ALL1Q 22))] "" + "#" + "&& reload_completed" 
+ [(parallel [(set (reg:ALL1Q 24) + (unspec:ALL1Q [(reg:ALL1Q 22) + (reg:QI 24)] UNSPEC_ROUND)) + (clobber (reg:ALL1Q 22)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*round3.libgcc" + [(set (reg:ALL1Q 24) + (unspec:ALL1Q [(reg:ALL1Q 22) + (reg:QI 24)] UNSPEC_ROUND)) + (clobber (reg:ALL1Q 22)) + (clobber (reg:CC REG_CC))] + "reload_completed" "%~call __round3" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) ;; "*roundhq3.libgcc" "*rounduhq3.libgcc" ;; "*roundha3.libgcc" "*rounduha3.libgcc" -(define_insn "*round3.libgcc" +(define_insn_and_split "*round3.libgcc_split" [(set (reg:ALL2QA 24) (unspec:ALL2QA [(reg:ALL2QA 22) (reg:QI 24)] UNSPEC_ROUND)) (clobber (reg:ALL2QA 22))] "" + "#" + "&& reload_completed" + [(parallel [(set (reg:ALL2QA 24) + (unspec:ALL2QA [(reg:ALL2QA 22) + (reg:QI 24)] UNSPEC_ROUND)) + (clobber (reg:ALL2QA 22)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*round3.libgcc" + [(set (reg:ALL2QA 24) + (unspec:ALL2QA [(reg:ALL2QA 22) + (reg:QI 24)] UNSPEC_ROUND)) + (clobber (reg:ALL2QA 22)) + (clobber (reg:CC REG_CC))] + "reload_completed" "%~call __round3" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) ;; "*roundsq3.libgcc" "*roundusq3.libgcc" ;; "*roundsa3.libgcc" "*roundusa3.libgcc" -(define_insn "*round3.libgcc" +(define_insn_and_split "*round3.libgcc_split" [(set (reg:ALL4QA 22) (unspec:ALL4QA [(reg:ALL4QA 18) (reg:QI 24)] UNSPEC_ROUND)) (clobber (reg:ALL4QA 18))] "" + "#" + "&& reload_completed" + [(parallel [(set (reg:ALL4QA 22) + (unspec:ALL4QA [(reg:ALL4QA 18) + (reg:QI 24)] UNSPEC_ROUND)) + (clobber (reg:ALL4QA 18)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*round3.libgcc" + [(set (reg:ALL4QA 22) + (unspec:ALL4QA [(reg:ALL4QA 18) + (reg:QI 24)] UNSPEC_ROUND)) + (clobber (reg:ALL4QA 18)) + (clobber (reg:CC REG_CC))] + "reload_completed" "%~call __round3" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) diff --git a/gcc/config/avr/avr.c b/gcc/config/avr/avr.c index 3a250df..06c84d5 100644 --- a/gcc/config/avr/avr.c +++ b/gcc/config/avr/avr.c @@ -195,6 +195,10 @@ rtx tmp_reg_rtx; extern GTY(()) rtx zero_reg_rtx; rtx zero_reg_rtx; +/* Condition Code register RTX (reg:CC REG_CC) */ +extern GTY(()) rtx cc_reg_rtx; +rtx cc_reg_rtx; + /* RTXs for all general purpose registers as QImode */ extern GTY(()) rtx all_regs_rtx[32]; rtx all_regs_rtx[32]; @@ -376,10 +380,10 @@ make_avr_pass_casesi (gcc::context *ctxt) /* Make one parallel insn with all the patterns from insns i[0]..i[5]. */ static rtx_insn* -avr_parallel_insn_from_insns (rtx_insn *i[6]) +avr_parallel_insn_from_insns (rtx_insn *i[5]) { - rtvec vec = gen_rtvec (6, PATTERN (i[0]), PATTERN (i[1]), PATTERN (i[2]), - PATTERN (i[3]), PATTERN (i[4]), PATTERN (i[5])); + rtvec vec = gen_rtvec (5, PATTERN (i[0]), PATTERN (i[1]), PATTERN (i[2]), + PATTERN (i[3]), PATTERN (i[4])); start_sequence(); emit (gen_rtx_PARALLEL (VOIDmode, vec)); rtx_insn *insn = get_insns(); @@ -397,22 +401,21 @@ avr_parallel_insn_from_insns (rtx_insn *i[6]) pattern casesi__sequence forged from the sequence to recog_data. */ static bool -avr_is_casesi_sequence (basic_block bb, rtx_insn *insn, rtx_insn *insns[6]) +avr_is_casesi_sequence (basic_block bb, rtx_insn *insn, rtx_insn *insns[5]) { - rtx set_5, set_0; + rtx set_4, set_0; /* A first and quick test for a casesi sequences. As a side effect of - the test, harvest respective insns to INSNS[0..5]. */ + the test, harvest respective insns to INSNS[0..4]. 
*/ - if (!(JUMP_P (insns[5] = insn) + if (!(JUMP_P (insns[4] = insn) // casesi is the only insn that comes up with UNSPEC_INDEX_JMP, // hence the following test ensures that we are actually dealing // with code from casesi. - && (set_5 = single_set (insns[5])) - && UNSPEC == GET_CODE (SET_SRC (set_5)) - && UNSPEC_INDEX_JMP == XINT (SET_SRC (set_5), 1) + && (set_4 = single_set (insns[4])) + && UNSPEC == GET_CODE (SET_SRC (set_4)) + && UNSPEC_INDEX_JMP == XINT (SET_SRC (set_4), 1) - && (insns[4] = prev_real_insn (insns[5])) && (insns[3] = prev_real_insn (insns[4])) && (insns[2] = prev_real_insn (insns[3])) && (insns[1] = prev_real_insn (insns[2])) @@ -429,7 +432,7 @@ avr_is_casesi_sequence (basic_block bb, rtx_insn *insn, rtx_insn *insns[6]) { fprintf (dump_file, ";; Sequence from casesi in " "[bb %d]:\n\n", bb->index); - for (int i = 0; i < 6; i++) + for (int i = 0; i < 5; i++) print_rtl_single (dump_file, insns[i]); } @@ -519,7 +522,7 @@ avr_casei_sequence_check_operands (rtx *xop) } -/* INSNS[1..5] is a sequence as generated by casesi and INSNS[0] is an +/* INSNS[1..4] is a sequence as generated by casesi and INSNS[0] is an extension of an 8-bit or 16-bit integer to SImode. XOP contains the operands of INSNS as extracted by insn_extract from pattern casesi__sequence: @@ -541,7 +544,7 @@ avr_casei_sequence_check_operands (rtx *xop) switch value instead of SImode. */ static void -avr_optimize_casesi (rtx_insn *insns[6], rtx *xop) +avr_optimize_casesi (rtx_insn *insns[5], rtx *xop) { // Original mode of the switch value; this is QImode or HImode. machine_mode mode = GET_MODE (xop[10]); @@ -597,16 +600,21 @@ avr_optimize_casesi (rtx_insn *insns[6], rtx *xop) rtx reg = copy_to_mode_reg (mode, xop[10]); rtx (*gen_add)(rtx,rtx,rtx) = QImode == mode ? gen_addqi3 : gen_addhi3; - rtx (*gen_cmp)(rtx,rtx) = QImode == mode ? gen_cmpqi3 : gen_cmphi3; + rtx (*gen_cbranch)(rtx,rtx,rtx,rtx) + = QImode == mode ? gen_cbranchqi4 : gen_cbranchhi4; emit_insn (gen_add (reg, reg, gen_int_mode (-low_idx, mode))); - emit_insn (gen_cmp (reg, gen_int_mode (num_idx, mode))); + rtx op0 = reg; rtx op1 = gen_int_mode (num_idx, mode); + rtx labelref = copy_rtx (xop[4]); + emit_jump_insn (gen_cbranch (gen_rtx_fmt_ee (GTU, VOIDmode, op0, op1), + op0, op1, + labelref)); seq1 = get_insns(); last1 = get_last_insn(); end_sequence(); - emit_insn_before (seq1, insns[1]); + emit_insn_after (seq1, insns[2]); // After the out-of-bounds test and corresponding branch, use a // 16-bit index. If QImode is used, extend it to HImode first. @@ -627,7 +635,7 @@ avr_optimize_casesi (rtx_insn *insns[6], rtx *xop) last2 = get_last_insn(); end_sequence(); - emit_insn_after (seq2, insns[4]); + emit_insn_after (seq2, insns[3]); if (dump_file) { @@ -648,7 +656,7 @@ avr_optimize_casesi (rtx_insn *insns[6], rtx *xop) } fprintf (dump_file, ";; Deleting insns: %d, %d, %d.\n\n", - INSN_UID (insns[1]), INSN_UID (insns[2]), INSN_UID (insns[4])); + INSN_UID (insns[1]), INSN_UID (insns[2]), INSN_UID (insns[3])); } // Pseudodelete the SImode and subreg of SImode insns. 
We don't care @@ -657,7 +665,7 @@ avr_optimize_casesi (rtx_insn *insns[6], rtx *xop) SET_INSN_DELETED (insns[1]); SET_INSN_DELETED (insns[2]); - SET_INSN_DELETED (insns[4]); + SET_INSN_DELETED (insns[3]); } @@ -668,7 +676,7 @@ avr_pass_casesi::avr_rest_of_handle_casesi (function *func) FOR_EACH_BB_FN (bb, func) { - rtx_insn *insn, *insns[6]; + rtx_insn *insn, *insns[5]; FOR_BB_INSNS (bb, insn) { @@ -814,6 +822,8 @@ avr_init_expanders (void) tmp_reg_rtx = all_regs_rtx[AVR_TMP_REGNO]; zero_reg_rtx = all_regs_rtx[AVR_ZERO_REGNO]; + cc_reg_rtx = gen_rtx_REG (CCmode, REG_CC); + lpm_addr_reg_rtx = gen_rtx_REG (HImode, REG_Z); sreg_rtx = gen_rtx_MEM (QImode, GEN_INT (avr_addr.sreg)); @@ -864,6 +874,9 @@ avr_regno_reg_class (int r) if (r <= 33) return reg_class_tab[r]; + if (r == REG_CC) + return CC_REG; + return ALL_REGS; } @@ -2641,6 +2654,8 @@ ptrreg_to_str (int regno) static const char* cond_string (enum rtx_code code) { + bool cc_overflow_unusable = false; + switch (code) { case NE: @@ -2648,12 +2663,12 @@ cond_string (enum rtx_code code) case EQ: return "eq"; case GE: - if (cc_prev_status.flags & CC_OVERFLOW_UNUSABLE) + if (cc_overflow_unusable) return "pl"; else return "ge"; case LT: - if (cc_prev_status.flags & CC_OVERFLOW_UNUSABLE) + if (cc_overflow_unusable) return "mi"; else return "lt"; @@ -2989,152 +3004,6 @@ avr_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size, return size <= MOVE_MAX_PIECES; } - -/* Worker function for `NOTICE_UPDATE_CC'. */ -/* Update the condition code in the INSN. */ - -void -avr_notice_update_cc (rtx body ATTRIBUTE_UNUSED, rtx_insn *insn) -{ - rtx set; - enum attr_cc cc = get_attr_cc (insn); - - switch (cc) - { - default: - break; - - case CC_PLUS: - case CC_LDI: - { - rtx *op = recog_data.operand; - int len_dummy, icc; - - /* Extract insn's operands. */ - extract_constrain_insn_cached (insn); - - switch (cc) - { - default: - gcc_unreachable(); - - case CC_PLUS: - avr_out_plus (insn, op, &len_dummy, &icc); - cc = (enum attr_cc) icc; - break; - - case CC_LDI: - - cc = (op[1] == CONST0_RTX (GET_MODE (op[0])) - && reg_overlap_mentioned_p (op[0], zero_reg_rtx)) - /* Loading zero-reg with 0 uses CLR and thus clobbers cc0. */ - ? CC_CLOBBER - /* Any other "r,rL" combination does not alter cc0. */ - : CC_NONE; - - break; - } /* inner switch */ - - break; - } - } /* outer swicth */ - - switch (cc) - { - default: - /* Special values like CC_OUT_PLUS from above have been - mapped to "standard" CC_* values so we never come here. */ - - gcc_unreachable(); - break; - - case CC_NONE: - /* Insn does not affect CC at all, but it might set some registers - that are stored in cc_status. If such a register is affected by - the current insn, for example by means of a SET or a CLOBBER, - then we must reset cc_status; cf. PR77326. - - Unfortunately, set_of cannot be used as reg_overlap_mentioned_p - will abort on COMPARE (which might be found in cc_status.value1/2). - Thus work out the registers set by the insn and regs mentioned - in cc_status.value1/2. 
*/ - - if (cc_status.value1 - || cc_status.value2) - { - HARD_REG_SET regs_used; - HARD_REG_SET regs_set; - CLEAR_HARD_REG_SET (regs_used); - - if (cc_status.value1 - && !CONSTANT_P (cc_status.value1)) - { - find_all_hard_regs (cc_status.value1, ®s_used); - } - - if (cc_status.value2 - && !CONSTANT_P (cc_status.value2)) - { - find_all_hard_regs (cc_status.value2, ®s_used); - } - - find_all_hard_reg_sets (insn, ®s_set, false); - - if (hard_reg_set_intersect_p (regs_used, regs_set)) - { - CC_STATUS_INIT; - } - } - - break; // CC_NONE - - case CC_SET_N: - CC_STATUS_INIT; - break; - - case CC_SET_ZN: - set = single_set (insn); - CC_STATUS_INIT; - if (set) - { - cc_status.flags |= CC_NO_OVERFLOW; - cc_status.value1 = SET_DEST (set); - } - break; - - case CC_SET_VZN: - /* Insn like INC, DEC, NEG that set Z,N,V. We currently don't make use - of this combination, cf. also PR61055. */ - CC_STATUS_INIT; - break; - - case CC_SET_CZN: - /* Insn sets the Z,N,C flags of CC to recog_operand[0]. - The V flag may or may not be known but that's ok because - alter_cond will change tests to use EQ/NE. */ - set = single_set (insn); - CC_STATUS_INIT; - if (set) - { - cc_status.value1 = SET_DEST (set); - cc_status.flags |= CC_OVERFLOW_UNUSABLE; - } - break; - - case CC_COMPARE: - set = single_set (insn); - CC_STATUS_INIT; - if (set) - cc_status.value1 = SET_SRC (set); - break; - - case CC_CLOBBER: - /* Insn doesn't leave CC in a usable state. */ - CC_STATUS_INIT; - break; - } -} - /* Choose mode for jump insn: 1 - relative jump in range -63 <= x <= 62 ; 2 - relative jump in range -2046 <= x <= 2045 ; @@ -3167,11 +3036,12 @@ const char* ret_cond_branch (rtx x, int len, int reverse) { RTX_CODE cond = reverse ? reverse_condition (GET_CODE (x)) : GET_CODE (x); + bool cc_overflow_unusable = false; switch (cond) { case GT: - if (cc_prev_status.flags & CC_OVERFLOW_UNUSABLE) + if (cc_overflow_unusable) return (len == 1 ? ("breq .+2" CR_TAB "brpl %0") : len == 2 ? ("breq .+4" CR_TAB @@ -3200,7 +3070,7 @@ ret_cond_branch (rtx x, int len, int reverse) "brlo .+4" CR_TAB "jmp %0")); case LE: - if (cc_prev_status.flags & CC_OVERFLOW_UNUSABLE) + if (cc_overflow_unusable) return (len == 1 ? ("breq %0" CR_TAB "brmi %0") : len == 2 ? ("breq .+2" CR_TAB @@ -5820,6 +5690,8 @@ compare_condition (rtx_insn *insn) if (next && JUMP_P (next)) { rtx pat = PATTERN (next); + if (GET_CODE (pat) == PARALLEL) + pat = XVECEXP (pat, 0, 0); rtx src = SET_SRC (pat); if (IF_THEN_ELSE == GET_CODE (src)) @@ -6179,7 +6051,13 @@ out_shift_with_cnt (const char *templ, rtx_insn *insn, rtx operands[], if (CONST_INT_P (operands[2])) { + /* Operand 3 is a scratch register if this is a + parallel with three elements i.e. a set, + a clobber of a scratch, and clobber of REG_CC. + If a scratch reg is not available, then the parallel + will contain only a set and clobber of REG_CC. */ bool scratch = (GET_CODE (PATTERN (insn)) == PARALLEL + && XVECLEN (PATTERN (insn), 0) == 3 && REG_P (operands[3])); int count = INTVAL (operands[2]); int max_len = 10; /* If larger than this, always use a loop. 
*/ @@ -6376,7 +6254,9 @@ ashlhi3_out (rtx_insn *insn, rtx operands[], int *len) { if (CONST_INT_P (operands[2])) { - int scratch = (GET_CODE (PATTERN (insn)) == PARALLEL); + int scratch = (GET_CODE (PATTERN (insn)) == PARALLEL + && XVECLEN (PATTERN (insn), 0) == 3 + && REG_P (operands[3])); int ldi_ok = test_hard_reg_class (LD_REGS, operands[0]); int k; int *t = len; @@ -6857,7 +6737,9 @@ ashrhi3_out (rtx_insn *insn, rtx operands[], int *len) { if (CONST_INT_P (operands[2])) { - int scratch = (GET_CODE (PATTERN (insn)) == PARALLEL); + int scratch = (GET_CODE (PATTERN (insn)) == PARALLEL + && XVECLEN (PATTERN (insn), 0) == 3 + && REG_P (operands[3])); int ldi_ok = test_hard_reg_class (LD_REGS, operands[0]); int k; int *t = len; @@ -7271,7 +7153,9 @@ lshrhi3_out (rtx_insn *insn, rtx operands[], int *len) { if (CONST_INT_P (operands[2])) { - int scratch = (GET_CODE (PATTERN (insn)) == PARALLEL); + int scratch = (GET_CODE (PATTERN (insn)) == PARALLEL + && XVECLEN (PATTERN (insn), 0) == 3 + && REG_P (operands[3])); int ldi_ok = test_hard_reg_class (LD_REGS, operands[0]); int k; int *t = len; @@ -9619,6 +9503,18 @@ avr_assemble_integer (rtx x, unsigned int size, int aligned_p) return default_assemble_integer (x, size, aligned_p); } +/* Implement TARGET_CLASS_MAX_NREGS. Reasons described in comments for + avr_hard_regno_nregs. */ + +static unsigned char +avr_class_max_nregs (reg_class_t rclass, machine_mode mode) +{ + if (rclass == CC_REG && mode == CCmode) + return 1; + + return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD); +} + /* Implement `TARGET_CLASS_LIKELY_SPILLED_P'. */ /* Return value is nonzero if pseudos that have been @@ -11719,7 +11615,8 @@ avr_compare_pattern (rtx_insn *insn) if (pattern && NONJUMP_INSN_P (insn) - && SET_DEST (pattern) == cc0_rtx + && REG_P (SET_DEST (pattern)) + && REGNO (SET_DEST (pattern)) == REG_CC && GET_CODE (SET_SRC (pattern)) == COMPARE) { machine_mode mode0 = GET_MODE (XEXP (SET_SRC (pattern), 0)); @@ -11740,18 +11637,18 @@ avr_compare_pattern (rtx_insn *insn) /* Expansion of switch/case decision trees leads to code like - cc0 = compare (Reg, Num) - if (cc0 == 0) + REG_CC = compare (Reg, Num) + if (REG_CC == 0) goto L1 - cc0 = compare (Reg, Num) - if (cc0 > 0) + REG_CC = compare (Reg, Num) + if (REG_CC > 0) goto L2 The second comparison is superfluous and can be deleted. The second jump condition can be transformed from a - "difficult" one to a "simple" one because "cc0 > 0" and - "cc0 >= 0" will have the same effect here. + "difficult" one to a "simple" one because "REG_CC > 0" and + "REG_CC >= 0" will have the same effect here. This function relies on the way switch/case is being expaned as binary decision tree. For example code see PR 49903. 
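The machine-description changes in the remainder of this patch all apply one idiom: a pattern that used to set the "cc" insn attribute becomes a define_insn_and_split whose pre-reload form emits "#" and has no flags side effect, and which splits after reload_completed into a parallel of the original set plus an explicit (clobber (reg:CC REG_CC)); a matching post-reload insn carrying that clobber then supplies the real output template. A minimal sketch of that shape follows; the pattern names and the neg example are hypothetical and not taken from this patch:

  ;; Pre-reload form: no flags side effect, no asm output yet.
  (define_insn_and_split "*sketch_negqi2"
    [(set (match_operand:QI 0 "register_operand" "=r")
          (neg:QI (match_operand:QI 1 "register_operand" "0")))]
    ""
    "#"
    "&& reload_completed"
    ;; After reload, re-emit the same set together with the REG_CC clobber.
    [(parallel [(set (match_dup 0)
                     (neg:QI (match_dup 1)))
                (clobber (reg:CC REG_CC))])])

  ;; Post-reload twin: same RTL plus the clobber, and the actual template.
  (define_insn "*sketch_negqi2_cc"
    [(set (match_operand:QI 0 "register_operand" "=r")
          (neg:QI (match_operand:QI 1 "register_operand" "0")))
     (clobber (reg:CC REG_CC))]
    "reload_completed"
    "neg %0"
    [(set_attr "length" "1")])

Keeping the clobber out of the pre-reload pattern leaves combine and the other early RTL passes with clean patterns, while the post-reload twin still exposes the SREG effect to the late passes that now track REG_CC instead of cc0.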
@@ -11822,8 +11719,8 @@ avr_reorg_remove_redundant_compare (rtx_insn *insn1) || LABEL_REF != GET_CODE (XEXP (ifelse1, 1)) || LABEL_REF != GET_CODE (XEXP (ifelse2, 1)) || !COMPARISON_P (XEXP (ifelse2, 0)) - || cc0_rtx != XEXP (XEXP (ifelse1, 0), 0) - || cc0_rtx != XEXP (XEXP (ifelse2, 0), 0) + || REG_CC != REGNO (XEXP (XEXP (ifelse1, 0), 0)) + || REG_CC != REGNO (XEXP (XEXP (ifelse2, 0), 0)) || const0_rtx != XEXP (XEXP (ifelse1, 0), 1) || const0_rtx != XEXP (XEXP (ifelse2, 0), 1)) { @@ -11832,20 +11729,20 @@ avr_reorg_remove_redundant_compare (rtx_insn *insn1) /* We filtered the insn sequence to look like - (set (cc0) + (set (reg:CC cc) (compare (reg:M N) (const_int VAL))) (set (pc) - (if_then_else (eq (cc0) + (if_then_else (eq (reg:CC cc) (const_int 0)) (label_ref L1) (pc))) - (set (cc0) + (set (reg:CC cc) (compare (reg:M N) (const_int VAL))) (set (pc) - (if_then_else (CODE (cc0) + (if_then_else (CODE (reg:CC cc) (const_int 0)) (label_ref L2) (pc))) @@ -11893,7 +11790,7 @@ avr_reorg_remove_redundant_compare (rtx_insn *insn1) JUMP_LABEL (jump) = JUMP_LABEL (branch1); target = XEXP (XEXP (ifelse2, 1), 0); - cond = gen_rtx_fmt_ee (code, VOIDmode, cc0_rtx, const0_rtx); + cond = gen_rtx_fmt_ee (code, VOIDmode, cc_reg_rtx, const0_rtx); jump = emit_jump_insn_after (gen_branch_unspec (target, cond), insn2); JUMP_LABEL (jump) = JUMP_LABEL (branch2); @@ -11936,6 +11833,8 @@ avr_reorg (void) rtx_insn *next = next_real_insn (insn); rtx pat = PATTERN (next); + if (GET_CODE (pat) == PARALLEL) + pat = XVECEXP (pat, 0, 0); pattern = SET_SRC (pattern); @@ -12119,6 +12018,22 @@ jump_over_one_insn_p (rtx_insn *insn, rtx dest) && avr_2word_insn_p (next_active_insn (insn)))); } +/* Implement TARGET_HARD_REGNO_NREGS. CCmode is four units for historical + reasons. If this hook is not defined, TARGET_HARD_REGNO_NREGS + reports that CCmode requires four registers. + Define this hook to allow CCmode to fit in a single REG_CC. For + other modes and regs, return the number of words in mode (i.e whatever + the default implementation of the hook returned). */ + +static unsigned int +avr_hard_regno_nregs (unsigned int regno, machine_mode mode) +{ + if (regno == REG_CC && mode == CCmode) + return 1; + + return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD); +} + /* Implement TARGET_HARD_REGNO_MODE_OK. On the enhanced core, anything larger than 1 byte must start in even numbered register for "movw" to @@ -12127,6 +12042,9 @@ jump_over_one_insn_p (rtx_insn *insn, rtx dest) static bool avr_hard_regno_mode_ok (unsigned int regno, machine_mode mode) { + if (regno == REG_CC) + return mode == CCmode; + /* NOTE: 8-bit values must not be disallowed for R28 or R29. Disallowing QI et al. in these regs might lead to code like (set (subreg:QI (reg:HI 28) n) ...) @@ -14575,6 +14493,21 @@ avr_fold_builtin (tree fndecl, int n_args ATTRIBUTE_UNUSED, tree *arg, return NULL_TREE; } +/* Prepend to CLOBBERS hard registers that are automatically clobbered + for an asm. We do this for CC_REGNUM to maintain source compatibility + with the original cc0-based compiler. */ + +static rtx_insn * +avr_md_asm_adjust (vec &/*outputs*/, vec &/*inputs*/, + vec & /*input_modes*/, + vec &/*constraints*/, + vec &clobbers, HARD_REG_SET &clobbered_regs) +{ + clobbers.safe_push (cc_reg_rtx); + SET_HARD_REG_BIT (clobbered_regs, REG_CC); + return NULL; +} + /* Worker function for `FLOAT_LIB_COMPARE_RETURNS_BOOL'. 
*/ @@ -14669,6 +14602,9 @@ avr_float_lib_compare_returns_bool (machine_mode mode, enum rtx_code) #undef TARGET_CONDITIONAL_REGISTER_USAGE #define TARGET_CONDITIONAL_REGISTER_USAGE avr_conditional_register_usage +#undef TARGET_HARD_REGNO_NREGS +#define TARGET_HARD_REGNO_NREGS avr_hard_regno_nregs + #undef TARGET_HARD_REGNO_MODE_OK #define TARGET_HARD_REGNO_MODE_OK avr_hard_regno_mode_ok #undef TARGET_HARD_REGNO_SCRATCH_OK @@ -14694,6 +14630,9 @@ avr_float_lib_compare_returns_bool (machine_mode mode, enum rtx_code) #undef TARGET_CLASS_LIKELY_SPILLED_P #define TARGET_CLASS_LIKELY_SPILLED_P avr_class_likely_spilled_p +#undef TARGET_CLASS_MAX_NREGS +#define TARGET_CLASS_MAX_NREGS avr_class_max_nregs + #undef TARGET_OPTION_OVERRIDE #define TARGET_OPTION_OVERRIDE avr_option_override @@ -14772,6 +14711,9 @@ avr_float_lib_compare_returns_bool (machine_mode mode, enum rtx_code) #undef TARGET_STARTING_FRAME_OFFSET #define TARGET_STARTING_FRAME_OFFSET avr_starting_frame_offset +#undef TARGET_MD_ASM_ADJUST +#define TARGET_MD_ASM_ADJUST avr_md_asm_adjust + struct gcc_target targetm = TARGET_INITIALIZER; diff --git a/gcc/config/avr/avr.h b/gcc/config/avr/avr.h index 0026a66..30ce2ba 100644 --- a/gcc/config/avr/avr.h +++ b/gcc/config/avr/avr.h @@ -155,7 +155,7 @@ FIXME: DRIVER_SELF_SPECS has changed. #define WCHAR_TYPE_SIZE 16 -#define FIRST_PSEUDO_REGISTER 36 +#define FIRST_PSEUDO_REGISTER 37 #define GENERAL_REGNO_P(N) IN_RANGE (N, 2, 31) #define GENERAL_REG_P(X) (REG_P (X) && GENERAL_REGNO_P (REGNO (X))) @@ -178,7 +178,8 @@ FIXME: DRIVER_SELF_SPECS has changed. 0,0,/* r28 r29 */\ 0,0,/* r30 r31 */\ 1,1,/* STACK */\ - 1,1 /* arg pointer */ } + 1,1, /* arg pointer */ \ + 1 /* CC */ } #define CALL_USED_REGISTERS { \ 1,1,/* r0 r1 */ \ @@ -198,7 +199,8 @@ FIXME: DRIVER_SELF_SPECS has changed. 0,0,/* r28 r29 */ \ 1,1,/* r30 r31 */ \ 1,1,/* STACK */ \ - 1,1 /* arg pointer */ } + 1,1, /* arg pointer */ \ + 1 /* CC */ } #define REG_ALLOC_ORDER { \ 24,25, \ @@ -210,7 +212,7 @@ FIXME: DRIVER_SELF_SPECS has changed. 28,29, \ 17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2, \ 0,1, \ - 32,33,34,35 \ + 32,33,34,35,36 \ } #define ADJUST_REG_ALLOC_ORDER avr_adjust_reg_alloc_order() @@ -230,6 +232,7 @@ enum reg_class { LD_REGS, /* r16 - r31 */ NO_LD_REGS, /* r0 - r15 */ GENERAL_REGS, /* r0 - r31 */ + CC_REG, /* CC */ ALL_REGS, LIM_REG_CLASSES }; @@ -250,6 +253,7 @@ enum reg_class { "LD_REGS", /* r16 - r31 */ \ "NO_LD_REGS", /* r0 - r15 */ \ "GENERAL_REGS", /* r0 - r31 */ \ + "CC_REG", /* CC */ \ "ALL_REGS" } #define REG_CLASS_CONTENTS { \ @@ -270,7 +274,8 @@ enum reg_class { 0x00000000}, /* LD_REGS, r16 - r31 */ \ {0x0000ffff,0x00000000}, /* NO_LD_REGS r0 - r15 */ \ {0xffffffff,0x00000000}, /* GENERAL_REGS, r0 - r31 */ \ - {0xffffffff,0x00000003} /* ALL_REGS */ \ + {0x00000000,0x00000010}, /* CC */ \ + {0xffffffff,0x00000013} /* ALL_REGS */ \ } #define REGNO_REG_CLASS(R) avr_regno_reg_class(R) @@ -429,7 +434,7 @@ typedef struct avr_args "r8","r9","r10","r11","r12","r13","r14","r15", \ "r16","r17","r18","r19","r20","r21","r22","r23", \ "r24","r25","r26","r27","r28","r29","r30","r31", \ - "__SP_L__","__SP_H__","argL","argH"} + "__SP_L__","__SP_H__","argL","argH", "cc"} #define FINAL_PRESCAN_INSN(insn, operand, nop) \ avr_final_prescan_insn (insn, operand,nop) @@ -484,23 +489,6 @@ typedef struct avr_args #define TRAMPOLINE_SIZE 4 -/* Store in cc_status the expressions - that the condition codes will describe - after execution of an instruction whose pattern is EXP. - Do not alter them if the instruction would not alter the cc's. 
*/ - -#define NOTICE_UPDATE_CC(EXP, INSN) avr_notice_update_cc (EXP, INSN) - -/* The add insns don't set overflow in a usable way. */ -#define CC_OVERFLOW_UNUSABLE 01000 -/* The mov,and,or,xor insns don't set carry. That's ok though as the - Z bit is all we need when doing unsigned comparisons on the result of - these insns (since they're always with 0). However, conditions.h has - CC_NO_OVERFLOW defined for this purpose. Rename it to something more - understandable. */ -#define CC_NO_CARRY CC_NO_OVERFLOW - - /* Output assembler code to FILE to increment profiler label # LABELNO for profiling a function entry. */ diff --git a/gcc/config/avr/avr.md b/gcc/config/avr/avr.md index 478abc1..2206fa1 100644 --- a/gcc/config/avr/avr.md +++ b/gcc/config/avr/avr.md @@ -58,6 +58,7 @@ (REG_Z 30) (REG_W 24) (REG_SP 32) + (REG_CC 36) (LPM_REGNO 0) ; implicit target register of LPM (TMP_REGNO 0) ; temporary register r0 (ZERO_REGNO 1) ; zero register r1 @@ -459,7 +460,8 @@ "reload_completed && frame_pointer_needed && !cfun->calls_alloca - && find_reg_note (insn, REG_ARGS_SIZE, const0_rtx)" + && find_reg_note (insn, REG_ARGS_SIZE, const0_rtx) + && REGNO (operands[0]) != REG_Y" [(set (reg:HI REG_SP) (reg:HI REG_Y))]) @@ -491,19 +493,34 @@ ;; "load_psi_libgcc" ;; "load_si_libgcc" ;; "load_sf_libgcc" -(define_insn "load__libgcc" +(define_insn_and_split "load__libgcc" [(set (reg:MOVMODE 22) (match_operand:MOVMODE 0 "memory_operand" "m,m"))] "avr_load_libgcc_p (operands[0]) && REG_P (XEXP (operands[0], 0)) && REG_Z == REGNO (XEXP (operands[0], 0))" + "#" + "&& reload_completed" + [(parallel [(set (reg:MOVMODE 22) + (match_dup 0)) + (clobber (reg:CC REG_CC))])] + "" + [(set_attr "isa" "rjmp,jmp")]) + +(define_insn "*load__libgcc" + [(set (reg:MOVMODE 22) + (match_operand:MOVMODE 0 "memory_operand" "m,m")) + (clobber (reg:CC REG_CC))] + "avr_load_libgcc_p (operands[0]) + && REG_P (XEXP (operands[0], 0)) + && REG_Z == REGNO (XEXP (operands[0], 0)) + && reload_completed" { operands[0] = GEN_INT (GET_MODE_SIZE (mode)); return "%~call __load_%0"; } [(set_attr "length" "1,2") - (set_attr "isa" "rjmp,jmp") - (set_attr "cc" "clobber")]) + (set_attr "isa" "rjmp,jmp")]) ;; "xload8qi_A" @@ -591,8 +608,7 @@ } [(set_attr "length" "4,4") (set_attr "adjust_len" "*,xload") - (set_attr "isa" "lpmx,lpm") - (set_attr "cc" "none")]) + (set_attr "isa" "lpmx,lpm")]) ;; R21:Z : 24-bit source address ;; R22 : 1-4 byte output @@ -602,21 +618,35 @@ ;; "xload_si_libgcc" "xload_sq_libgcc" "xload_usq_libgcc" "xload_sa_libgcc" "xload_usa_libgcc" ;; "xload_sf_libgcc" ;; "xload_psi_libgcc" -(define_insn "xload__libgcc" + +(define_insn_and_split "xload__libgcc" [(set (reg:MOVMODE 22) (mem:MOVMODE (lo_sum:PSI (reg:QI 21) (reg:HI REG_Z)))) (clobber (reg:QI 21)) (clobber (reg:HI REG_Z))] "avr_xload_libgcc_p (mode)" + "#" + "&& reload_completed" + [(parallel [(set (reg:MOVMODE 22) + (mem:MOVMODE (lo_sum:PSI (reg:QI 21) + (reg:HI REG_Z)))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*xload__libgcc" + [(set (reg:MOVMODE 22) + (mem:MOVMODE (lo_sum:PSI (reg:QI 21) + (reg:HI REG_Z)))) + (clobber (reg:CC REG_CC))] + "avr_xload_libgcc_p (mode) + && reload_completed" { rtx x_bytes = GEN_INT (GET_MODE_SIZE (mode)); output_asm_insn ("%~call __xload_%0", &x_bytes); return ""; } - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) ;; General move expanders @@ -696,17 +726,29 @@ ;; "movqi_insn" ;; "movqq_insn" "movuqq_insn" -(define_insn "mov_insn" +(define_insn_and_split "mov_insn" [(set (match_operand:ALL1 0 
"nonimmediate_operand" "=r ,d ,Qm ,r ,q,r,*r") (match_operand:ALL1 1 "nox_general_operand" "r Y00,n Ynn,r Y00,Qm,r,q,i"))] "register_operand (operands[0], mode) || reg_or_0_operand (operands[1], mode)" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (match_dup 1)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*mov_insn" + [(set (match_operand:ALL1 0 "nonimmediate_operand" "=r ,d ,Qm ,r ,q,r,*r") + (match_operand:ALL1 1 "nox_general_operand" "r Y00,n Ynn,r Y00,Qm,r,q,i")) + (clobber (reg:CC REG_CC))] + "(register_operand (operands[0], mode) + || reg_or_0_operand (operands[1], mode)) + && reload_completed" { return output_movqi (insn, operands, NULL); } [(set_attr "length" "1,1,5,5,1,1,4") - (set_attr "adjust_len" "mov8") - (set_attr "cc" "ldi,none,clobber,clobber,none,none,clobber")]) + (set_attr "adjust_len" "mov8")]) ;; This is used in peephole2 to optimize loading immediate constants ;; if a scratch register from LD_REGS happens to be available. @@ -720,8 +762,7 @@ "reload_completed" "ldi %2,lo8(%1) mov %0,%2" - [(set_attr "length" "2") - (set_attr "cc" "none")]) + [(set_attr "length" "2")]) (define_peephole2 [(match_scratch:QI 2 "d") @@ -759,8 +800,7 @@ out %A0,%A1 out %A0,%A1\;out %B0,%B1" [(set_attr "length" "2,4,5,1,2") - (set_attr "isa" "no_xmega,no_xmega,no_xmega,*,xmega") - (set_attr "cc" "none")]) + (set_attr "isa" "no_xmega,no_xmega,no_xmega,*,xmega")]) (define_peephole2 [(match_scratch:QI 2 "d") @@ -778,29 +818,41 @@ (define_insn "*reload_in" [(set (match_operand:ALL2 0 "l_register_operand" "=l") (match_operand:ALL2 1 "immediate_operand" "i")) - (clobber (match_operand:QI 2 "register_operand" "=&d"))] + (clobber (match_operand:QI 2 "register_operand" "=&d")) + (clobber (reg:CC REG_CC))] "reload_completed" { return output_reload_inhi (operands, operands[2], NULL); } [(set_attr "length" "4") - (set_attr "adjust_len" "reload_in16") - (set_attr "cc" "clobber")]) + (set_attr "adjust_len" "reload_in16")]) ;; "*movhi" ;; "*movhq" "*movuhq" ;; "*movha" "*movuha" -(define_insn "*mov" +(define_insn_and_split "*mov_split" [(set (match_operand:ALL2 0 "nonimmediate_operand" "=r,r ,r,m ,d,*r,q,r") (match_operand:ALL2 1 "nox_general_operand" "r,Y00,m,r Y00,i,i ,r,q"))] "register_operand (operands[0], mode) || reg_or_0_operand (operands[1], mode)" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (match_dup 1)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*mov" + [(set (match_operand:ALL2 0 "nonimmediate_operand" "=r,r ,r,m ,d,*r,q,r") + (match_operand:ALL2 1 "nox_general_operand" "r,Y00,m,r Y00,i,i ,r,q")) + (clobber (reg:CC REG_CC))] + "(register_operand (operands[0], mode) + || reg_or_0_operand (operands[1], mode)) + && reload_completed" { return output_movhi (insn, operands, NULL); } [(set_attr "length" "2,2,6,7,2,6,5,2") - (set_attr "adjust_len" "mov16") - (set_attr "cc" "none,none,clobber,clobber,none,clobber,none,none")]) + (set_attr "adjust_len" "mov16")]) (define_peephole2 ; movw [(set (match_operand:ALL1 0 "even_register_operand" "") @@ -844,7 +896,10 @@ [(set (match_operand:HISI 0 "register_operand" "") (match_operand:HISI 1 "memory_operand" ""))] "reload_completed - && AVR_HAVE_LPMX" + && AVR_HAVE_LPMX + && avr_mem_flash_p (operands[1]) + && REG_P (XEXP (operands[1], 0)) + && !reg_overlap_mentioned_p (XEXP (operands[1], 0), operands[0])" [(set (match_dup 0) (match_dup 2)) (set (match_dup 3) @@ -853,13 +908,6 @@ { rtx addr = XEXP (operands[1], 0); - if (!avr_mem_flash_p (operands[1]) - || !REG_P (addr) - || reg_overlap_mentioned_p (addr, 
operands[0])) - { - FAIL; - } - operands[2] = replace_equiv_address (operands[1], gen_rtx_POST_INC (Pmode, addr)); operands[3] = addr; @@ -878,33 +926,47 @@ && operands[1] != constm1_rtx" [(parallel [(set (match_dup 0) (match_dup 1)) - (clobber (match_dup 2))])]) + (clobber (match_dup 2)) + (clobber (reg:CC REG_CC))])]) ;; '*' because it is not used in rtl generation. (define_insn "*reload_inpsi" [(set (match_operand:PSI 0 "register_operand" "=r") (match_operand:PSI 1 "immediate_operand" "i")) - (clobber (match_operand:QI 2 "register_operand" "=&d"))] + (clobber (match_operand:QI 2 "register_operand" "=&d")) + (clobber (reg:CC REG_CC))] "reload_completed" { return avr_out_reload_inpsi (operands, operands[2], NULL); } [(set_attr "length" "6") - (set_attr "adjust_len" "reload_in24") - (set_attr "cc" "clobber")]) + (set_attr "adjust_len" "reload_in24")]) -(define_insn "*movpsi" +(define_insn_and_split "*movpsi_split" [(set (match_operand:PSI 0 "nonimmediate_operand" "=r,r,r ,Qm,!d,r") (match_operand:PSI 1 "nox_general_operand" "r,L,Qm,rL,i ,i"))] "register_operand (operands[0], PSImode) || register_operand (operands[1], PSImode) || const0_rtx == operands[1]" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (match_dup 1)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*movpsi" + [(set (match_operand:PSI 0 "nonimmediate_operand" "=r,r,r ,Qm,!d,r") + (match_operand:PSI 1 "nox_general_operand" "r,L,Qm,rL,i ,i")) + (clobber (reg:CC REG_CC))] + "(register_operand (operands[0], PSImode) + || register_operand (operands[1], PSImode) + || const0_rtx == operands[1]) + && reload_completed" { return avr_out_movpsi (insn, operands, NULL); } [(set_attr "length" "3,3,8,9,4,10") - (set_attr "adjust_len" "mov24") - (set_attr "cc" "none,none,clobber,clobber,none,clobber")]) + (set_attr "adjust_len" "mov24")]) ;;========================================================================== ;; move double word (32 bit) @@ -917,7 +979,8 @@ "operands[1] != CONST0_RTX (mode)" [(parallel [(set (match_dup 0) (match_dup 1)) - (clobber (match_dup 2))])]) + (clobber (match_dup 2)) + (clobber (reg:CC REG_CC))])]) ;; '*' because it is not used in rtl generation. 
;; "*reload_insi" @@ -926,45 +989,69 @@ (define_insn "*reload_insi" [(set (match_operand:ALL4 0 "register_operand" "=r") (match_operand:ALL4 1 "immediate_operand" "n Ynn")) - (clobber (match_operand:QI 2 "register_operand" "=&d"))] + (clobber (match_operand:QI 2 "register_operand" "=&d")) + (clobber (reg:CC REG_CC))] "reload_completed" { return output_reload_insisf (operands, operands[2], NULL); } [(set_attr "length" "8") - (set_attr "adjust_len" "reload_in32") - (set_attr "cc" "clobber")]) + (set_attr "adjust_len" "reload_in32")]) ;; "*movsi" ;; "*movsq" "*movusq" ;; "*movsa" "*movusa" -(define_insn "*mov" +(define_insn_and_split "*mov_split" [(set (match_operand:ALL4 0 "nonimmediate_operand" "=r,r ,r ,Qm ,!d,r") (match_operand:ALL4 1 "nox_general_operand" "r,Y00,Qm,r Y00,i ,i"))] "register_operand (operands[0], mode) || reg_or_0_operand (operands[1], mode)" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (match_dup 1)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*mov" + [(set (match_operand:ALL4 0 "nonimmediate_operand" "=r,r ,r ,Qm ,!d,r") + (match_operand:ALL4 1 "nox_general_operand" "r,Y00,Qm,r Y00,i ,i")) + (clobber (reg:CC REG_CC))] + "(register_operand (operands[0], mode) + || reg_or_0_operand (operands[1], mode)) + && reload_completed" { return output_movsisf (insn, operands, NULL); } [(set_attr "length" "4,4,8,9,4,10") - (set_attr "adjust_len" "mov32") - (set_attr "cc" "none,none,clobber,clobber,none,clobber")]) + (set_attr "adjust_len" "mov32")]) ;; fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff ;; move floating point numbers (32 bit) -(define_insn "*movsf" +(define_insn_and_split "*movsf_split" [(set (match_operand:SF 0 "nonimmediate_operand" "=r,r,r ,Qm,!d,r") (match_operand:SF 1 "nox_general_operand" "r,G,Qm,rG,F ,F"))] "register_operand (operands[0], SFmode) || reg_or_0_operand (operands[1], SFmode)" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (match_dup 1)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*movsf" + [(set (match_operand:SF 0 "nonimmediate_operand" "=r,r,r ,Qm,!d,r") + (match_operand:SF 1 "nox_general_operand" "r,G,Qm,rG,F ,F")) + (clobber (reg:CC REG_CC))] + "(register_operand (operands[0], SFmode) + || reg_or_0_operand (operands[1], SFmode)) + && reload_completed" { return output_movsisf (insn, operands, NULL); } [(set_attr "length" "4,4,8,9,4,10") - (set_attr "adjust_len" "mov32") - (set_attr "cc" "none,none,clobber,clobber,none,clobber")]) + (set_attr "adjust_len" "mov32")]) (define_peephole2 ; *reload_insf [(match_scratch:QI 2 "d") @@ -974,20 +1061,21 @@ "operands[1] != CONST0_RTX (SFmode)" [(parallel [(set (match_dup 0) (match_dup 1)) - (clobber (match_dup 2))])]) + (clobber (match_dup 2)) + (clobber (reg:CC REG_CC))])]) ;; '*' because it is not used in rtl generation. 
(define_insn "*reload_insf" [(set (match_operand:SF 0 "register_operand" "=r") (match_operand:SF 1 "const_double_operand" "F")) - (clobber (match_operand:QI 2 "register_operand" "=&d"))] + (clobber (match_operand:QI 2 "register_operand" "=&d")) + (clobber (reg:CC REG_CC))] "reload_completed" { return output_reload_insisf (operands, operands[2], NULL); } [(set_attr "length" "8") - (set_attr "adjust_len" "reload_in32") - (set_attr "cc" "clobber")]) + (set_attr "adjust_len" "reload_in32")]) ;;========================================================================= ;; move string (like memcpy) @@ -1015,7 +1103,7 @@ ;; "cpymem_qi" ;; "cpymem_hi" -(define_insn "cpymem_" +(define_insn_and_split "cpymem_" [(set (mem:BLK (reg:HI REG_X)) (mem:BLK (reg:HI REG_Z))) (unspec [(match_operand:QI 0 "const_int_operand" "n")] @@ -1026,11 +1114,35 @@ (clobber (reg:QI LPM_REGNO)) (clobber (match_operand:QIHI 2 "register_operand" "=1"))] "" + "#" + "&& reload_completed" + [(parallel [(set (mem:BLK (reg:HI REG_X)) + (mem:BLK (reg:HI REG_Z))) + (unspec [(match_dup 0)] + UNSPEC_CPYMEM) + (use (match_dup 1)) + (clobber (reg:HI REG_X)) + (clobber (reg:HI REG_Z)) + (clobber (reg:QI LPM_REGNO)) + (clobber (match_dup 2)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*cpymem_" + [(set (mem:BLK (reg:HI REG_X)) + (mem:BLK (reg:HI REG_Z))) + (unspec [(match_operand:QI 0 "const_int_operand" "n")] + UNSPEC_CPYMEM) + (use (match_operand:QIHI 1 "register_operand" "")) + (clobber (reg:HI REG_X)) + (clobber (reg:HI REG_Z)) + (clobber (reg:QI LPM_REGNO)) + (clobber (match_operand:QIHI 2 "register_operand" "=1")) + (clobber (reg:CC REG_CC))] + "reload_completed" { return avr_out_cpymem (insn, operands, NULL); } - [(set_attr "adjust_len" "cpymem") - (set_attr "cc" "clobber")]) + [(set_attr "adjust_len" "cpymem")]) ;; $0 : Address Space @@ -1041,7 +1153,8 @@ ;; "cpymemx_qi" ;; "cpymemx_hi" -(define_insn "cpymemx_" + +(define_insn_and_split "cpymemx_" [(set (mem:BLK (reg:HI REG_X)) (mem:BLK (lo_sum:PSI (reg:QI 23) (reg:HI REG_Z)))) @@ -1055,9 +1168,39 @@ (clobber (reg:QI 23)) (clobber (mem:QI (match_operand:QI 1 "io_address_operand" "n")))] "" + "#" + "&& reload_completed" + [(parallel [(set (mem:BLK (reg:HI REG_X)) + (mem:BLK (lo_sum:PSI (reg:QI 23) + (reg:HI REG_Z)))) + (unspec [(match_dup 0)] + UNSPEC_CPYMEM) + (use (reg:QIHI 24)) + (clobber (reg:HI REG_X)) + (clobber (reg:HI REG_Z)) + (clobber (reg:QI LPM_REGNO)) + (clobber (reg:HI 24)) + (clobber (reg:QI 23)) + (clobber (mem:QI (match_dup 1))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*cpymemx_" + [(set (mem:BLK (reg:HI REG_X)) + (mem:BLK (lo_sum:PSI (reg:QI 23) + (reg:HI REG_Z)))) + (unspec [(match_operand:QI 0 "const_int_operand" "n")] + UNSPEC_CPYMEM) + (use (reg:QIHI 24)) + (clobber (reg:HI REG_X)) + (clobber (reg:HI REG_Z)) + (clobber (reg:QI LPM_REGNO)) + (clobber (reg:HI 24)) + (clobber (reg:QI 23)) + (clobber (mem:QI (match_operand:QI 1 "io_address_operand" "n"))) + (clobber (reg:CC REG_CC))] + "reload_completed" "%~call __movmemx_" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) ;; =%2 =%2 =%2 =%2 =%2 =%2 =%2 =%2 =%2 =%2 =%2 =%2 =%2 =%2 =%2 =%2 =%2 =%2 @@ -1091,7 +1234,7 @@ }) -(define_insn "*clrmemqi" +(define_insn_and_split "*clrmemqi_split" [(set (mem:BLK (match_operand:HI 0 "register_operand" "e")) (const_int 0)) (use (match_operand:QI 1 "register_operand" "r")) @@ -1099,12 +1242,30 @@ (clobber (match_scratch:HI 3 "=0")) (clobber (match_scratch:QI 4 "=&1"))] "" + "#" + "&& reload_completed" + [(parallel [(set 
(mem:BLK (match_dup 0)) + (const_int 0)) + (use (match_dup 1)) + (use (match_dup 2)) + (clobber (match_dup 3)) + (clobber (match_dup 4)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*clrmemqi" + [(set (mem:BLK (match_operand:HI 0 "register_operand" "e")) + (const_int 0)) + (use (match_operand:QI 1 "register_operand" "r")) + (use (match_operand:QI 2 "const_int_operand" "n")) + (clobber (match_scratch:HI 3 "=0")) + (clobber (match_scratch:QI 4 "=&1")) + (clobber (reg:CC REG_CC))] + "reload_completed" "0:\;st %a0+,__zero_reg__\;dec %1\;brne 0b" - [(set_attr "length" "3") - (set_attr "cc" "clobber")]) + [(set_attr "length" "3")]) -(define_insn "*clrmemhi" +(define_insn_and_split "*clrmemhi_split" [(set (mem:BLK (match_operand:HI 0 "register_operand" "e,e")) (const_int 0)) (use (match_operand:HI 1 "register_operand" "!w,d")) @@ -1112,11 +1273,30 @@ (clobber (match_scratch:HI 3 "=0,0")) (clobber (match_scratch:HI 4 "=&1,&1"))] "" + "#" + "&& reload_completed" + [(parallel [(set (mem:BLK (match_dup 0)) + (const_int 0)) + (use (match_dup 1)) + (use (match_dup 2)) + (clobber (match_dup 3)) + (clobber (match_dup 4)) + (clobber (reg:CC REG_CC))])]) + + +(define_insn "*clrmemhi" + [(set (mem:BLK (match_operand:HI 0 "register_operand" "e,e")) + (const_int 0)) + (use (match_operand:HI 1 "register_operand" "!w,d")) + (use (match_operand:HI 2 "const_int_operand" "n,n")) + (clobber (match_scratch:HI 3 "=0,0")) + (clobber (match_scratch:HI 4 "=&1,&1")) + (clobber (reg:CC REG_CC))] + "reload_completed" "@ 0:\;st %a0+,__zero_reg__\;sbiw %A1,1\;brne 0b 0:\;st %a0+,__zero_reg__\;subi %A1,1\;sbci %B1,0\;brne 0b" - [(set_attr "length" "3,4") - (set_attr "cc" "clobber,clobber")]) + [(set_attr "length" "3,4")]) (define_expand "strlenhi" [(set (match_dup 4) @@ -1142,27 +1322,57 @@ operands[4] = gen_reg_rtx (HImode); }) -(define_insn "*strlenhi" +(define_insn_and_split "*strlenhi_split" [(set (match_operand:HI 0 "register_operand" "=e") (unspec:HI [(mem:BLK (match_operand:HI 1 "register_operand" "0")) (const_int 0) (match_operand:HI 2 "immediate_operand" "i")] UNSPEC_STRLEN))] "" + "#" + "&& reload_completed" + [(parallel + [(set (match_dup 0) + (unspec:HI [(mem:BLK (match_dup 1)) + (const_int 0) + (match_dup 2)] + UNSPEC_STRLEN)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*strlenhi" + [(set (match_operand:HI 0 "register_operand" "=e") + (unspec:HI [(mem:BLK (match_operand:HI 1 "register_operand" "0")) + (const_int 0) + (match_operand:HI 2 "immediate_operand" "i")] + UNSPEC_STRLEN)) + (clobber (reg:CC REG_CC))] + "reload_completed" "0:\;ld __tmp_reg__,%a0+\;tst __tmp_reg__\;brne 0b" - [(set_attr "length" "3") - (set_attr "cc" "clobber")]) + [(set_attr "length" "3")]) ;+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ ; add bytes ;; "addqi3" ;; "addqq3" "adduqq3" -(define_insn "add3" +(define_insn_and_split "add3" [(set (match_operand:ALL1 0 "register_operand" "=r,d ,r ,r ,r ,r") (plus:ALL1 (match_operand:ALL1 1 "register_operand" "%0,0 ,0 ,0 ,0 ,0") (match_operand:ALL1 2 "nonmemory_operand" "r,n Ynn,Y01,Ym1,Y02,Ym2")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (plus:ALL1 (match_dup 1) + (match_dup 2))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*add3" + [(set (match_operand:ALL1 0 "register_operand" "=r,d ,r ,r ,r ,r") + (plus:ALL1 (match_operand:ALL1 1 "register_operand" "%0,0 ,0 ,0 ,0 ,0") + (match_operand:ALL1 2 "nonmemory_operand" "r,n Ynn,Y01,Ym1,Y02,Ym2"))) + (clobber (reg:CC REG_CC))] + "reload_completed" "@ add %0,%2 subi %0,lo8(-(%2)) @@ 
-1170,8 +1380,7 @@ dec %0 inc %0\;inc %0 dec %0\;dec %0" - [(set_attr "length" "1,1,1,1,2,2") - (set_attr "cc" "set_czn,set_czn,set_vzn,set_vzn,set_vzn,set_vzn")]) + [(set_attr "length" "1,1,1,1,2,2")]) ;; "addhi3" ;; "addhq3" "adduhq3" @@ -1205,67 +1414,144 @@ }) -(define_insn "*addhi3_zero_extend" +(define_insn_and_split "*addhi3_zero_extend_split" [(set (match_operand:HI 0 "register_operand" "=r,*?r") (plus:HI (zero_extend:HI (match_operand:QI 1 "register_operand" "r ,0")) (match_operand:HI 2 "register_operand" "0 ,r")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (plus:HI (zero_extend:HI (match_dup 1)) + (match_dup 2))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*addhi3_zero_extend" + [(set (match_operand:HI 0 "register_operand" "=r,*?r") + (plus:HI (zero_extend:HI (match_operand:QI 1 "register_operand" "r ,0")) + (match_operand:HI 2 "register_operand" "0 ,r"))) + (clobber (reg:CC REG_CC))] + "reload_completed" "@ add %A0,%1\;adc %B0,__zero_reg__ add %A0,%A2\;mov %B0,%B2\;adc %B0,__zero_reg__" - [(set_attr "length" "2,3") - (set_attr "cc" "set_n")]) + [(set_attr "length" "2,3")]) -(define_insn "*addhi3_zero_extend1" +(define_insn_and_split "*addhi3_zero_extend1_split" [(set (match_operand:HI 0 "register_operand" "=r") (plus:HI (match_operand:HI 1 "register_operand" "0") (zero_extend:HI (match_operand:QI 2 "register_operand" "r"))))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (plus:HI (match_dup 1) + (zero_extend:HI (match_dup 2)))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*addhi3_zero_extend1" + [(set (match_operand:HI 0 "register_operand" "=r") + (plus:HI (match_operand:HI 1 "register_operand" "0") + (zero_extend:HI (match_operand:QI 2 "register_operand" "r")))) + (clobber (reg:CC REG_CC))] + "reload_completed" "add %A0,%2\;adc %B0,__zero_reg__" - [(set_attr "length" "2") - (set_attr "cc" "set_n")]) + [(set_attr "length" "2")]) -(define_insn "*addhi3.sign_extend1" +(define_insn_and_split "*addhi3.sign_extend1_split" [(set (match_operand:HI 0 "register_operand" "=r") (plus:HI (sign_extend:HI (match_operand:QI 1 "register_operand" "r")) (match_operand:HI 2 "register_operand" "0")))] "" + "#" + "&& reload_completed" + [(parallel + [(set (match_dup 0) + (plus:HI + (sign_extend:HI (match_dup 1)) + (match_dup 2))) + (clobber (reg:CC REG_CC))])]) + + +(define_insn "*addhi3.sign_extend1" + [(set (match_operand:HI 0 "register_operand" "=r") + (plus:HI (sign_extend:HI (match_operand:QI 1 "register_operand" "r")) + (match_operand:HI 2 "register_operand" "0"))) + (clobber (reg:CC REG_CC))] + "reload_completed" { return reg_overlap_mentioned_p (operands[0], operands[1]) ? 
"mov __tmp_reg__,%1\;add %A0,%1\;adc %B0,__zero_reg__\;sbrc __tmp_reg__,7\;dec %B0" : "add %A0,%1\;adc %B0,__zero_reg__\;sbrc %1,7\;dec %B0"; } - [(set_attr "length" "5") - (set_attr "cc" "clobber")]) + [(set_attr "length" "5")]) -(define_insn "*addhi3_zero_extend.const" +(define_insn_and_split "*addhi3_zero_extend.const_split" [(set (match_operand:HI 0 "register_operand" "=d") (plus:HI (zero_extend:HI (match_operand:QI 1 "register_operand" "0")) (match_operand:HI 2 "const_m255_to_m1_operand" "Cn8")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (plus:HI (zero_extend:HI (match_dup 1)) + (match_dup 2))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*addhi3_zero_extend.const" + [(set (match_operand:HI 0 "register_operand" "=d") + (plus:HI (zero_extend:HI (match_operand:QI 1 "register_operand" "0")) + (match_operand:HI 2 "const_m255_to_m1_operand" "Cn8"))) + (clobber (reg:CC REG_CC))] + "reload_completed" "subi %A0,%n2\;sbc %B0,%B0" - [(set_attr "length" "2") - (set_attr "cc" "set_czn")]) + [(set_attr "length" "2")]) -(define_insn "*usum_widenqihi3" +(define_insn_and_split "*usum_widenqihi3_split" [(set (match_operand:HI 0 "register_operand" "=r") (plus:HI (zero_extend:HI (match_operand:QI 1 "register_operand" "0")) (zero_extend:HI (match_operand:QI 2 "register_operand" "r"))))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (plus:HI + (zero_extend:HI (match_dup 1)) + (zero_extend:HI (match_dup 2)))) + (clobber (reg:CC REG_CC))])]) + + +(define_insn "*usum_widenqihi3" + [(set (match_operand:HI 0 "register_operand" "=r") + (plus:HI (zero_extend:HI (match_operand:QI 1 "register_operand" "0")) + (zero_extend:HI (match_operand:QI 2 "register_operand" "r")))) + (clobber (reg:CC REG_CC))] + "reload_completed" "add %A0,%2\;clr %B0\;rol %B0" - [(set_attr "length" "3") - (set_attr "cc" "clobber")]) + [(set_attr "length" "3")]) -(define_insn "*udiff_widenqihi3" +(define_insn_and_split "*udiff_widenqihi3_split" [(set (match_operand:HI 0 "register_operand" "=r") (minus:HI (zero_extend:HI (match_operand:QI 1 "register_operand" "0")) (zero_extend:HI (match_operand:QI 2 "register_operand" "r"))))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (minus:HI (zero_extend:HI (match_dup 1)) + (zero_extend:HI (match_dup 2)))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*udiff_widenqihi3" + [(set (match_operand:HI 0 "register_operand" "=r") + (minus:HI (zero_extend:HI (match_operand:QI 1 "register_operand" "0")) + (zero_extend:HI (match_operand:QI 2 "register_operand" "r")))) + (clobber (reg:CC REG_CC))] + "reload_completed" "sub %A0,%2\;sbc %B0,%B0" - [(set_attr "length" "2") - (set_attr "cc" "set_czn")]) + [(set_attr "length" "2")]) -(define_insn "*addhi3_sp" +(define_insn_and_split "*addhi3_sp" [(set (match_operand:HI 1 "stack_register_operand" "=q") (plus:HI (match_operand:HI 2 "stack_register_operand" "q") (match_operand:HI 0 "avr_sp_immediate_operand" "Csp")))] @@ -1273,23 +1559,44 @@ { return avr_out_addto_sp (operands, NULL); } + "" + [(const_int 0)] + { + /* Do not attempt to split this pattern. This FAIL is necessary + to prevent the splitter from matching *add3_split, splitting + it, and then failing later because constraints don't match, as split + does not look at constraints. 
*/ + FAIL; + } [(set_attr "length" "6") (set_attr "adjust_len" "addto_sp")]) ;; "*addhi3" ;; "*addhq3" "*adduhq3" ;; "*addha3" "*adduha3" -(define_insn "*add3" +(define_insn_and_split "*add3_split" [(set (match_operand:ALL2 0 "register_operand" "=??r,d,!w ,d") (plus:ALL2 (match_operand:ALL2 1 "register_operand" "%0,0,0 ,0") (match_operand:ALL2 2 "nonmemory_or_const_operand" "r,s,IJ YIJ,n Ynn")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (plus:ALL2 (match_dup 1) + (match_dup 2))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*add3" + [(set (match_operand:ALL2 0 "register_operand" "=??r,d,!w ,d") + (plus:ALL2 (match_operand:ALL2 1 "register_operand" "%0,0,0 ,0") + (match_operand:ALL2 2 "nonmemory_or_const_operand" "r,s,IJ YIJ,n Ynn"))) + (clobber (reg:CC REG_CC))] + "reload_completed" { return avr_out_plus (insn, operands); } [(set_attr "length" "2") - (set_attr "adjust_len" "plus") - (set_attr "cc" "plus")]) + (set_attr "adjust_len" "plus")]) ;; Adding a constant to NO_LD_REGS might have lead to a reload of ;; that constant to LD_REGS. We don't add a scratch to *addhi3 @@ -1329,140 +1636,303 @@ ;; "addhi3_clobber" ;; "addhq3_clobber" "adduhq3_clobber" ;; "addha3_clobber" "adduha3_clobber" -(define_insn "add3_clobber" +(define_insn_and_split "add3_clobber" [(set (match_operand:ALL2 0 "register_operand" "=!w ,d ,r") (plus:ALL2 (match_operand:ALL2 1 "register_operand" "%0 ,0 ,0") (match_operand:ALL2 2 "const_operand" "IJ YIJ,n Ynn,n Ynn"))) (clobber (match_scratch:QI 3 "=X ,X ,&d"))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (plus:ALL2 (match_dup 1) + (match_dup 2))) + (clobber (match_dup 3)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*add3_clobber" + [(set (match_operand:ALL2 0 "register_operand" "=!w ,d ,r") + (plus:ALL2 (match_operand:ALL2 1 "register_operand" "%0 ,0 ,0") + (match_operand:ALL2 2 "const_operand" "IJ YIJ,n Ynn,n Ynn"))) + (clobber (match_scratch:QI 3 "=X ,X ,&d")) + (clobber (reg:CC REG_CC))] + "reload_completed" { return avr_out_plus (insn, operands); } [(set_attr "length" "4") - (set_attr "adjust_len" "plus") - (set_attr "cc" "plus")]) + (set_attr "adjust_len" "plus")]) ;; "addsi3" ;; "addsq3" "addusq3" ;; "addsa3" "addusa3" -(define_insn "add3" +(define_insn_and_split "add3" [(set (match_operand:ALL4 0 "register_operand" "=??r,d ,r") (plus:ALL4 (match_operand:ALL4 1 "register_operand" "%0,0 ,0") (match_operand:ALL4 2 "nonmemory_operand" "r,i ,n Ynn"))) (clobber (match_scratch:QI 3 "=X,X ,&d"))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (plus:ALL4 (match_dup 1) + (match_dup 2))) + (clobber (match_dup 3)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*add3" + [(set (match_operand:ALL4 0 "register_operand" "=??r,d ,r") + (plus:ALL4 (match_operand:ALL4 1 "register_operand" "%0,0 ,0") + (match_operand:ALL4 2 "nonmemory_operand" "r,i ,n Ynn"))) + (clobber (match_scratch:QI 3 "=X,X ,&d")) + (clobber (reg:CC REG_CC))] + "reload_completed" { return avr_out_plus (insn, operands); } [(set_attr "length" "4") - (set_attr "adjust_len" "plus") - (set_attr "cc" "plus")]) + (set_attr "adjust_len" "plus")]) -(define_insn "*addpsi3_zero_extend.qi" +(define_insn_and_split "*addpsi3_zero_extend.qi_split" [(set (match_operand:PSI 0 "register_operand" "=r") (plus:PSI (zero_extend:PSI (match_operand:QI 1 "register_operand" "r")) (match_operand:PSI 2 "register_operand" "0")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (plus:PSI (zero_extend:PSI (match_dup 1)) + (match_dup 2))) + 
(clobber (reg:CC REG_CC))])]) + +(define_insn "*addpsi3_zero_extend.qi" + [(set (match_operand:PSI 0 "register_operand" "=r") + (plus:PSI (zero_extend:PSI (match_operand:QI 1 "register_operand" "r")) + (match_operand:PSI 2 "register_operand" "0"))) + (clobber (reg:CC REG_CC))] + "reload_completed" "add %A0,%A1\;adc %B0,__zero_reg__\;adc %C0,__zero_reg__" - [(set_attr "length" "3") - (set_attr "cc" "set_n")]) + [(set_attr "length" "3")]) -(define_insn "*addpsi3_zero_extend.hi" +(define_insn_and_split "*addpsi3_zero_extend.hi_split" [(set (match_operand:PSI 0 "register_operand" "=r") (plus:PSI (zero_extend:PSI (match_operand:HI 1 "register_operand" "r")) (match_operand:PSI 2 "register_operand" "0")))] "" - "add %A0,%A1\;adc %B0,%B1\;adc %C0,__zero_reg__" - [(set_attr "length" "3") - (set_attr "cc" "set_n")]) + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (plus:PSI (zero_extend:PSI (match_dup 1)) + (match_dup 2))) + (clobber (reg:CC REG_CC))])]) -(define_insn "*addpsi3_sign_extend.hi" +(define_insn "*addpsi3_zero_extend.hi" + [(set (match_operand:PSI 0 "register_operand" "=r") + (plus:PSI (zero_extend:PSI (match_operand:HI 1 "register_operand" "r")) + (match_operand:PSI 2 "register_operand" "0"))) + (clobber (reg:CC REG_CC))] + "reload_completed" + "add %A0,%A1\;adc %B0,%B1\;adc %C0,__zero_reg__" + [(set_attr "length" "3")]) + +(define_insn_and_split "*addpsi3_sign_extend.hi_split" [(set (match_operand:PSI 0 "register_operand" "=r") (plus:PSI (sign_extend:PSI (match_operand:HI 1 "register_operand" "r")) (match_operand:PSI 2 "register_operand" "0")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (plus:PSI (sign_extend:PSI (match_dup 1)) + (match_dup 2))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*addpsi3_sign_extend.hi" + [(set (match_operand:PSI 0 "register_operand" "=r") + (plus:PSI (sign_extend:PSI (match_operand:HI 1 "register_operand" "r")) + (match_operand:PSI 2 "register_operand" "0"))) + (clobber (reg:CC REG_CC))] + "reload_completed" "add %A0,%1\;adc %B0,%B1\;adc %C0,__zero_reg__\;sbrc %B1,7\;dec %C0" - [(set_attr "length" "5") - (set_attr "cc" "set_n")]) + [(set_attr "length" "5")]) -(define_insn "*addsi3_zero_extend" +(define_insn_and_split "*addsi3_zero_extend_split" [(set (match_operand:SI 0 "register_operand" "=r") (plus:SI (zero_extend:SI (match_operand:QI 1 "register_operand" "r")) (match_operand:SI 2 "register_operand" "0")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (plus:SI (zero_extend:SI (match_dup 1)) + (match_dup 2))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*addsi3_zero_extend" + [(set (match_operand:SI 0 "register_operand" "=r") + (plus:SI (zero_extend:SI (match_operand:QI 1 "register_operand" "r")) + (match_operand:SI 2 "register_operand" "0"))) + (clobber (reg:CC REG_CC))] + "reload_completed" "add %A0,%1\;adc %B0,__zero_reg__\;adc %C0,__zero_reg__\;adc %D0,__zero_reg__" - [(set_attr "length" "4") - (set_attr "cc" "set_n")]) + [(set_attr "length" "4")]) -(define_insn "*addsi3_zero_extend.hi" +(define_insn_and_split "*addsi3_zero_extend.hi_split" [(set (match_operand:SI 0 "register_operand" "=r") (plus:SI (zero_extend:SI (match_operand:HI 1 "register_operand" "r")) (match_operand:SI 2 "register_operand" "0")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (plus:SI (zero_extend:SI (match_dup 1)) + (match_dup 2))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*addsi3_zero_extend.hi" + [(set (match_operand:SI 0 "register_operand" "=r") + (plus:SI (zero_extend:SI 
(match_operand:HI 1 "register_operand" "r")) + (match_operand:SI 2 "register_operand" "0"))) + (clobber (reg:CC REG_CC))] + "reload_completed" "add %A0,%1\;adc %B0,%B1\;adc %C0,__zero_reg__\;adc %D0,__zero_reg__" - [(set_attr "length" "4") - (set_attr "cc" "set_n")]) + [(set_attr "length" "4")]) -(define_insn "addpsi3" +(define_insn_and_split "addpsi3" [(set (match_operand:PSI 0 "register_operand" "=??r,d ,d,r") (plus:PSI (match_operand:PSI 1 "register_operand" "%0,0 ,0,0") (match_operand:PSI 2 "nonmemory_operand" "r,s ,n,n"))) (clobber (match_scratch:QI 3 "=X,X ,X,&d"))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (plus:PSI (match_dup 1) + (match_dup 2))) + (clobber (match_dup 3 )) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*addpsi3" + [(set (match_operand:PSI 0 "register_operand" "=??r,d ,d,r") + (plus:PSI (match_operand:PSI 1 "register_operand" "%0,0 ,0,0") + (match_operand:PSI 2 "nonmemory_operand" "r,s ,n,n"))) + (clobber (match_scratch:QI 3 "=X,X ,X,&d")) + (clobber (reg:CC REG_CC))] + "reload_completed" { return avr_out_plus (insn, operands); } [(set_attr "length" "3") - (set_attr "adjust_len" "plus") - (set_attr "cc" "plus")]) + (set_attr "adjust_len" "plus")]) -(define_insn "subpsi3" +(define_insn_and_split "subpsi3" [(set (match_operand:PSI 0 "register_operand" "=r") (minus:PSI (match_operand:PSI 1 "register_operand" "0") (match_operand:PSI 2 "register_operand" "r")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (minus:PSI (match_dup 1) + (match_dup 2))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*subpsi3" + [(set (match_operand:PSI 0 "register_operand" "=r") + (minus:PSI (match_operand:PSI 1 "register_operand" "0") + (match_operand:PSI 2 "register_operand" "r"))) + (clobber (reg:CC REG_CC))] + "reload_completed" "sub %0,%2\;sbc %B0,%B2\;sbc %C0,%C2" - [(set_attr "length" "3") - (set_attr "cc" "set_czn")]) + [(set_attr "length" "3")]) -(define_insn "*subpsi3_zero_extend.qi" +(define_insn_and_split "*subpsi3_zero_extend.qi_split" [(set (match_operand:PSI 0 "register_operand" "=r") (minus:PSI (match_operand:SI 1 "register_operand" "0") (zero_extend:PSI (match_operand:QI 2 "register_operand" "r"))))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (minus:PSI (match_dup 1) + (zero_extend:PSI (match_dup 2)))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*subpsi3_zero_extend.qi" + [(set (match_operand:PSI 0 "register_operand" "=r") + (minus:PSI (match_operand:SI 1 "register_operand" "0") + (zero_extend:PSI (match_operand:QI 2 "register_operand" "r")))) + (clobber (reg:CC REG_CC))] + "reload_completed" "sub %A0,%2\;sbc %B0,__zero_reg__\;sbc %C0,__zero_reg__" - [(set_attr "length" "3") - (set_attr "cc" "set_czn")]) + [(set_attr "length" "3")]) -(define_insn "*subpsi3_zero_extend.hi" +(define_insn_and_split "*subpsi3_zero_extend.hi_split" [(set (match_operand:PSI 0 "register_operand" "=r") (minus:PSI (match_operand:PSI 1 "register_operand" "0") (zero_extend:PSI (match_operand:HI 2 "register_operand" "r"))))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (minus:PSI (match_dup 1) + (zero_extend:PSI (match_dup 2)))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*subpsi3_zero_extend.hi" + [(set (match_operand:PSI 0 "register_operand" "=r") + (minus:PSI (match_operand:PSI 1 "register_operand" "0") + (zero_extend:PSI (match_operand:HI 2 "register_operand" "r")))) + (clobber (reg:CC REG_CC))] + "reload_completed" "sub %A0,%2\;sbc %B0,%B2\;sbc %C0,__zero_reg__" - [(set_attr "length" "3") - 
(set_attr "cc" "set_czn")]) + [(set_attr "length" "3")]) -(define_insn "*subpsi3_sign_extend.hi" +(define_insn_and_split "*subpsi3_sign_extend.hi_split" [(set (match_operand:PSI 0 "register_operand" "=r") (minus:PSI (match_operand:PSI 1 "register_operand" "0") (sign_extend:PSI (match_operand:HI 2 "register_operand" "r"))))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (minus:PSI (match_dup 1) + (sign_extend:PSI (match_dup 2)))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*subpsi3_sign_extend.hi" + [(set (match_operand:PSI 0 "register_operand" "=r") + (minus:PSI (match_operand:PSI 1 "register_operand" "0") + (sign_extend:PSI (match_operand:HI 2 "register_operand" "r")))) + (clobber (reg:CC REG_CC))] + "reload_completed" "sub %A0,%A2\;sbc %B0,%B2\;sbc %C0,__zero_reg__\;sbrc %B2,7\;inc %C0" - [(set_attr "length" "5") - (set_attr "cc" "set_czn")]) + [(set_attr "length" "5")]) ;----------------------------------------------------------------------------- ; sub bytes ;; "subqi3" ;; "subqq3" "subuqq3" -(define_insn "sub3" +(define_insn_and_split "sub3" [(set (match_operand:ALL1 0 "register_operand" "=??r,d ,r ,r ,r ,r") (minus:ALL1 (match_operand:ALL1 1 "register_operand" "0,0 ,0 ,0 ,0 ,0") (match_operand:ALL1 2 "nonmemory_or_const_operand" "r,n Ynn,Y01,Ym1,Y02,Ym2")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (minus:ALL1 (match_dup 1) + (match_dup 2))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*sub3" + [(set (match_operand:ALL1 0 "register_operand" "=??r,d ,r ,r ,r ,r") + (minus:ALL1 (match_operand:ALL1 1 "register_operand" "0,0 ,0 ,0 ,0 ,0") + (match_operand:ALL1 2 "nonmemory_or_const_operand" "r,n Ynn,Y01,Ym1,Y02,Ym2"))) + (clobber (reg:CC REG_CC))] + "reload_completed" "@ sub %0,%2 subi %0,lo8(%2) @@ -1470,78 +1940,155 @@ inc %0 dec %0\;dec %0 inc %0\;inc %0" - [(set_attr "length" "1,1,1,1,2,2") - (set_attr "cc" "set_czn,set_czn,set_vzn,set_vzn,set_vzn,set_vzn")]) + [(set_attr "length" "1,1,1,1,2,2")]) ;; "subhi3" ;; "subhq3" "subuhq3" ;; "subha3" "subuha3" -(define_insn "sub3" +(define_insn_and_split "sub3" [(set (match_operand:ALL2 0 "register_operand" "=??r,d ,*r") (minus:ALL2 (match_operand:ALL2 1 "register_operand" "0,0 ,0") (match_operand:ALL2 2 "nonmemory_or_const_operand" "r,i Ynn,Ynn"))) (clobber (match_scratch:QI 3 "=X,X ,&d"))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (minus:ALL2 (match_dup 1) + (match_dup 2))) + (clobber (match_dup 3)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*sub3" + [(set (match_operand:ALL2 0 "register_operand" "=??r,d ,*r") + (minus:ALL2 (match_operand:ALL2 1 "register_operand" "0,0 ,0") + (match_operand:ALL2 2 "nonmemory_or_const_operand" "r,i Ynn,Ynn"))) + (clobber (match_scratch:QI 3 "=X,X ,&d")) + (clobber (reg:CC REG_CC))] + "reload_completed" { return avr_out_plus (insn, operands); } - [(set_attr "adjust_len" "plus") - (set_attr "cc" "plus")]) + [(set_attr "adjust_len" "plus")]) -(define_insn "*subhi3_zero_extend1" +(define_insn_and_split "*subhi3_zero_extend1_split" [(set (match_operand:HI 0 "register_operand" "=r") (minus:HI (match_operand:HI 1 "register_operand" "0") (zero_extend:HI (match_operand:QI 2 "register_operand" "r"))))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (minus:HI (match_dup 1) + (zero_extend:HI (match_dup 2)))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*subhi3_zero_extend1" + [(set (match_operand:HI 0 "register_operand" "=r") + (minus:HI (match_operand:HI 1 "register_operand" "0") + (zero_extend:HI 
(match_operand:QI 2 "register_operand" "r")))) + (clobber (reg:CC REG_CC))] + "reload_completed" "sub %A0,%2\;sbc %B0,__zero_reg__" - [(set_attr "length" "2") - (set_attr "cc" "set_czn")]) + [(set_attr "length" "2")]) -(define_insn "*subhi3.sign_extend2" +(define_insn_and_split "*subhi3.sign_extend2_split" [(set (match_operand:HI 0 "register_operand" "=r") (minus:HI (match_operand:HI 1 "register_operand" "0") (sign_extend:HI (match_operand:QI 2 "register_operand" "r"))))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (minus:HI (match_dup 1) + (sign_extend:HI (match_dup 2)))) + (clobber (reg:CC REG_CC))])]) + + +(define_insn "*subhi3.sign_extend2" + [(set (match_operand:HI 0 "register_operand" "=r") + (minus:HI (match_operand:HI 1 "register_operand" "0") + (sign_extend:HI (match_operand:QI 2 "register_operand" "r")))) + (clobber (reg:CC REG_CC))] + "reload_completed" { return reg_overlap_mentioned_p (operands[0], operands[2]) ? "mov __tmp_reg__,%2\;sub %A0,%2\;sbc %B0,__zero_reg__\;sbrc __tmp_reg__,7\;inc %B0" : "sub %A0,%2\;sbc %B0,__zero_reg__\;sbrc %2,7\;inc %B0"; } - [(set_attr "length" "5") - (set_attr "cc" "clobber")]) + [(set_attr "length" "5")]) ;; "subsi3" ;; "subsq3" "subusq3" ;; "subsa3" "subusa3" -(define_insn "sub3" +(define_insn_and_split "sub3" [(set (match_operand:ALL4 0 "register_operand" "=??r,d ,r") (minus:ALL4 (match_operand:ALL4 1 "register_operand" "0,0 ,0") (match_operand:ALL4 2 "nonmemory_or_const_operand" "r,n Ynn,Ynn"))) (clobber (match_scratch:QI 3 "=X,X ,&d"))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (minus:ALL4 (match_dup 1) + (match_dup 2))) + (clobber (match_dup 3)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*sub3" + [(set (match_operand:ALL4 0 "register_operand" "=??r,d ,r") + (minus:ALL4 (match_operand:ALL4 1 "register_operand" "0,0 ,0") + (match_operand:ALL4 2 "nonmemory_or_const_operand" "r,n Ynn,Ynn"))) + (clobber (match_scratch:QI 3 "=X,X ,&d")) + (clobber (reg:CC REG_CC))] + "reload_completed" { return avr_out_plus (insn, operands); } - [(set_attr "adjust_len" "plus") - (set_attr "cc" "plus")]) + [(set_attr "adjust_len" "plus")]) -(define_insn "*subsi3_zero_extend" +(define_insn_and_split "*subsi3_zero_extend_split" [(set (match_operand:SI 0 "register_operand" "=r") (minus:SI (match_operand:SI 1 "register_operand" "0") (zero_extend:SI (match_operand:QI 2 "register_operand" "r"))))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (minus:SI (match_dup 1) + (zero_extend:SI (match_dup 2)))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*subsi3_zero_extend" + [(set (match_operand:SI 0 "register_operand" "=r") + (minus:SI (match_operand:SI 1 "register_operand" "0") + (zero_extend:SI (match_operand:QI 2 "register_operand" "r")))) + (clobber (reg:CC REG_CC))] + "reload_completed" "sub %A0,%2\;sbc %B0,__zero_reg__\;sbc %C0,__zero_reg__\;sbc %D0,__zero_reg__" [(set_attr "length" "4") - (set_attr "cc" "set_czn")]) + ]) -(define_insn "*subsi3_zero_extend.hi" +(define_insn_and_split "*subsi3_zero_extend.hi_split" [(set (match_operand:SI 0 "register_operand" "=r") (minus:SI (match_operand:SI 1 "register_operand" "0") (zero_extend:SI (match_operand:HI 2 "register_operand" "r"))))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (minus:SI (match_dup 1) + (zero_extend:SI (match_dup 2)))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*subsi3_zero_extend.hi" + [(set (match_operand:SI 0 "register_operand" "=r") + (minus:SI (match_operand:SI 1 "register_operand" "0") + 
(zero_extend:SI (match_operand:HI 2 "register_operand" "r")))) + (clobber (reg:CC REG_CC))] + "reload_completed" "sub %A0,%2\;sbc %B0,%B2\;sbc %C0,__zero_reg__\;sbc %D0,__zero_reg__" - [(set_attr "length" "4") - (set_attr "cc" "set_czn")]) + [(set_attr "length" "4")]) ;****************************************************************************** ; mul @@ -1559,16 +2106,28 @@ } }) -(define_insn "*mulqi3_enh" +(define_insn_and_split "*mulqi3_enh_split" [(set (match_operand:QI 0 "register_operand" "=r") (mult:QI (match_operand:QI 1 "register_operand" "r") (match_operand:QI 2 "register_operand" "r")))] "AVR_HAVE_MUL" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (mult:QI (match_dup 1) + (match_dup 2))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*mulqi3_enh" + [(set (match_operand:QI 0 "register_operand" "=r") + (mult:QI (match_operand:QI 1 "register_operand" "r") + (match_operand:QI 2 "register_operand" "r"))) + (clobber (reg:CC REG_CC))] + "AVR_HAVE_MUL && reload_completed" "mul %1,%2 mov %0,r0 clr r1" - [(set_attr "length" "3") - (set_attr "cc" "clobber")]) + [(set_attr "length" "3")]) (define_expand "mulqi3_call" [(set (reg:QI 24) (match_operand:QI 1 "register_operand" "")) @@ -1581,189 +2140,392 @@ avr_fix_inputs (operands, 1 << 2, regmask (QImode, 24)); }) -(define_insn "*mulqi3_call" +(define_insn_and_split "*mulqi3_call_split" [(set (reg:QI 24) (mult:QI (reg:QI 24) (reg:QI 22))) (clobber (reg:QI 22))] "!AVR_HAVE_MUL" + "#" + "&& reload_completed" + [(parallel [(set (reg:QI 24) (mult:QI (reg:QI 24) (reg:QI 22))) + (clobber (reg:QI 22)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*mulqi3_call" + [(set (reg:QI 24) (mult:QI (reg:QI 24) (reg:QI 22))) + (clobber (reg:QI 22)) + (clobber (reg:CC REG_CC))] + "!AVR_HAVE_MUL && reload_completed" "%~call __mulqi3" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) ;; "umulqi3_highpart" ;; "smulqi3_highpart" -(define_insn "mulqi3_highpart" + +(define_insn_and_split "mulqi3_highpart" [(set (match_operand:QI 0 "register_operand" "=r") (truncate:QI (lshiftrt:HI (mult:HI (any_extend:HI (match_operand:QI 1 "register_operand" "")) (any_extend:HI (match_operand:QI 2 "register_operand" ""))) (const_int 8))))] "AVR_HAVE_MUL" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (truncate:QI + (lshiftrt:HI (mult:HI (any_extend:HI (match_dup 1)) + (any_extend:HI (match_dup 2))) + (const_int 8)))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*mulqi3_highpart" + [(set (match_operand:QI 0 "register_operand" "=r") + (truncate:QI + (lshiftrt:HI (mult:HI (any_extend:HI (match_operand:QI 1 "register_operand" "")) + (any_extend:HI (match_operand:QI 2 "register_operand" ""))) + (const_int 8)))) + (clobber (reg:CC REG_CC))] + "AVR_HAVE_MUL && reload_completed" "mul %1,%2 mov %0,r1 clr __zero_reg__" - [(set_attr "length" "3") - (set_attr "cc" "clobber")]) + [(set_attr "length" "3")]) ;; Used when expanding div or mod inline for some special values -(define_insn "*subqi3.ashiftrt7" +(define_insn_and_split "*subqi3.ashiftrt7_split" [(set (match_operand:QI 0 "register_operand" "=r") (minus:QI (match_operand:QI 1 "register_operand" "0") (ashiftrt:QI (match_operand:QI 2 "register_operand" "r") (const_int 7))))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (minus:QI (match_dup 1) + (ashiftrt:QI (match_dup 2) + (const_int 7)))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*subqi3.ashiftrt7" + [(set (match_operand:QI 0 "register_operand" "=r") + (minus:QI 
(match_operand:QI 1 "register_operand" "0") + (ashiftrt:QI (match_operand:QI 2 "register_operand" "r") + (const_int 7)))) + (clobber (reg:CC REG_CC))] + "reload_completed" "sbrc %2,7\;inc %0" - [(set_attr "length" "2") - (set_attr "cc" "clobber")]) + [(set_attr "length" "2")]) -(define_insn "*addqi3.lt0" +(define_insn_and_split "*addqi3.lt0_split" [(set (match_operand:QI 0 "register_operand" "=r") (plus:QI (lt:QI (match_operand:QI 1 "register_operand" "r") (const_int 0)) (match_operand:QI 2 "register_operand" "0")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (plus:QI (lt:QI (match_dup 1) + (const_int 0)) + (match_dup 2))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*addqi3.lt0" + [(set (match_operand:QI 0 "register_operand" "=r") + (plus:QI (lt:QI (match_operand:QI 1 "register_operand" "r") + (const_int 0)) + (match_operand:QI 2 "register_operand" "0"))) + (clobber (reg:CC REG_CC))] + "reload_completed" "sbrc %1,7\;inc %0" - [(set_attr "length" "2") - (set_attr "cc" "clobber")]) + [(set_attr "length" "2")]) -(define_insn "*addhi3.lt0" +(define_insn_and_split "*addhi3.lt0_split" [(set (match_operand:HI 0 "register_operand" "=w,r") (plus:HI (lt:HI (match_operand:QI 1 "register_operand" "r,r") (const_int 0)) (match_operand:HI 2 "register_operand" "0,0"))) (clobber (match_scratch:QI 3 "=X,&1"))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (plus:HI (lt:HI (match_dup 1) + (const_int 0)) + (match_dup 2))) + (clobber (match_dup 3)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*addhi3.lt0" + [(set (match_operand:HI 0 "register_operand" "=w,r") + (plus:HI (lt:HI (match_operand:QI 1 "register_operand" "r,r") + (const_int 0)) + (match_operand:HI 2 "register_operand" "0,0"))) + (clobber (match_scratch:QI 3 "=X,&1")) + (clobber (reg:CC REG_CC))] + "reload_completed" "@ sbrc %1,7\;adiw %0,1 lsl %1\;adc %A0,__zero_reg__\;adc %B0,__zero_reg__" - [(set_attr "length" "2,3") - (set_attr "cc" "clobber")]) + [(set_attr "length" "2,3")]) -(define_insn "*addpsi3.lt0" +(define_insn_and_split "*addpsi3.lt0_split" [(set (match_operand:PSI 0 "register_operand" "=r") (plus:PSI (lshiftrt:PSI (match_operand:PSI 1 "register_operand" "r") (const_int 23)) (match_operand:PSI 2 "register_operand" "0")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (plus:PSI (lshiftrt:PSI (match_dup 1) + (const_int 23)) + (match_dup 2))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*addpsi3.lt0" + [(set (match_operand:PSI 0 "register_operand" "=r") + (plus:PSI (lshiftrt:PSI (match_operand:PSI 1 "register_operand" "r") + (const_int 23)) + (match_operand:PSI 2 "register_operand" "0"))) + (clobber (reg:CC REG_CC))] + "reload_completed" "mov __tmp_reg__,%C1\;lsl __tmp_reg__ adc %A0,__zero_reg__\;adc %B0,__zero_reg__\;adc %C0,__zero_reg__" - [(set_attr "length" "5") - (set_attr "cc" "clobber")]) + [(set_attr "length" "5")]) -(define_insn "*addsi3.lt0" +(define_insn_and_split "*addsi3.lt0_split" [(set (match_operand:SI 0 "register_operand" "=r") (plus:SI (lshiftrt:SI (match_operand:SI 1 "register_operand" "r") (const_int 31)) (match_operand:SI 2 "register_operand" "0")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (plus:SI (lshiftrt:SI (match_dup 1) + (const_int 31)) + (match_dup 2))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*addsi3.lt0" + [(set (match_operand:SI 0 "register_operand" "=r") + (plus:SI (lshiftrt:SI (match_operand:SI 1 "register_operand" "r") + (const_int 31)) + (match_operand:SI 2 "register_operand" "0"))) + 
(clobber (reg:CC REG_CC))] + "reload_completed" "mov __tmp_reg__,%D1\;lsl __tmp_reg__ adc %A0,__zero_reg__\;adc %B0,__zero_reg__\;adc %C0,__zero_reg__\;adc %D0,__zero_reg__" - [(set_attr "length" "6") - (set_attr "cc" "clobber")]) + [(set_attr "length" "6")]) -(define_insn "*umulqihi3.call" +(define_insn_and_split "*umulqihi3.call_split" [(set (reg:HI 24) (mult:HI (zero_extend:HI (reg:QI 22)) (zero_extend:HI (reg:QI 24)))) (clobber (reg:QI 21)) (clobber (reg:HI 22))] "!AVR_HAVE_MUL" + "#" + "&& reload_completed" + [(parallel [(set (reg:HI 24) + (mult:HI (zero_extend:HI (reg:QI 22)) + (zero_extend:HI (reg:QI 24)))) + (clobber (reg:QI 21)) + (clobber (reg:HI 22)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*umulqihi3.call" + [(set (reg:HI 24) + (mult:HI (zero_extend:HI (reg:QI 22)) + (zero_extend:HI (reg:QI 24)))) + (clobber (reg:QI 21)) + (clobber (reg:HI 22)) + (clobber (reg:CC REG_CC))] + "!AVR_HAVE_MUL && reload_completed" "%~call __umulqihi3" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) ;; "umulqihi3" ;; "mulqihi3" -(define_insn "mulqihi3" + +(define_insn_and_split "mulqihi3_split" [(set (match_operand:HI 0 "register_operand" "=r") (mult:HI (any_extend:HI (match_operand:QI 1 "register_operand" "")) (any_extend:HI (match_operand:QI 2 "register_operand" ""))))] "AVR_HAVE_MUL" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (mult:HI (any_extend:HI (match_dup 1)) + (any_extend:HI (match_dup 2)))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "mulqihi3" + [(set (match_operand:HI 0 "register_operand" "=r") + (mult:HI (any_extend:HI (match_operand:QI 1 "register_operand" "")) + (any_extend:HI (match_operand:QI 2 "register_operand" "")))) + (clobber (reg:CC REG_CC))] + "AVR_HAVE_MUL && reload_completed" "mul %1,%2 movw %0,r0 clr __zero_reg__" - [(set_attr "length" "3") - (set_attr "cc" "clobber")]) + [(set_attr "length" "3")]) -(define_insn "usmulqihi3" +(define_insn_and_split "usmulqihi3" [(set (match_operand:HI 0 "register_operand" "=r") (mult:HI (zero_extend:HI (match_operand:QI 1 "register_operand" "a")) (sign_extend:HI (match_operand:QI 2 "register_operand" "a"))))] "AVR_HAVE_MUL" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (mult:HI (zero_extend:HI (match_dup 1)) + (sign_extend:HI (match_dup 2)))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*usmulqihi3" + [(set (match_operand:HI 0 "register_operand" "=r") + (mult:HI (zero_extend:HI (match_operand:QI 1 "register_operand" "a")) + (sign_extend:HI (match_operand:QI 2 "register_operand" "a")))) + (clobber (reg:CC REG_CC))] + "AVR_HAVE_MUL && reload_completed" "mulsu %2,%1 movw %0,r0 clr __zero_reg__" - [(set_attr "length" "3") - (set_attr "cc" "clobber")]) + [(set_attr "length" "3")]) ;; Above insn is not canonicalized by insn combine, so here is a version with ;; operands swapped. 
- -(define_insn "*sumulqihi3" +(define_insn_and_split "*sumulqihi3_split" [(set (match_operand:HI 0 "register_operand" "=r") (mult:HI (sign_extend:HI (match_operand:QI 1 "register_operand" "a")) (zero_extend:HI (match_operand:QI 2 "register_operand" "a"))))] "AVR_HAVE_MUL" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (mult:HI (sign_extend:HI (match_dup 1)) + (zero_extend:HI (match_dup 2)))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*sumulqihi3" + [(set (match_operand:HI 0 "register_operand" "=r") + (mult:HI (sign_extend:HI (match_operand:QI 1 "register_operand" "a")) + (zero_extend:HI (match_operand:QI 2 "register_operand" "a")))) + (clobber (reg:CC REG_CC))] + "AVR_HAVE_MUL && reload_completed" "mulsu %1,%2 movw %0,r0 clr __zero_reg__" - [(set_attr "length" "3") - (set_attr "cc" "clobber")]) + [(set_attr "length" "3")]) ;; One-extend operand 1 -(define_insn "*osmulqihi3" +(define_insn_and_split "*osmulqihi3_split" [(set (match_operand:HI 0 "register_operand" "=&r") (mult:HI (not:HI (zero_extend:HI (not:QI (match_operand:QI 1 "register_operand" "a")))) (sign_extend:HI (match_operand:QI 2 "register_operand" "a"))))] "AVR_HAVE_MUL" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (mult:HI (not:HI (zero_extend:HI (not:QI (match_dup 1)))) + (sign_extend:HI (match_dup 2)))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*osmulqihi3" + [(set (match_operand:HI 0 "register_operand" "=&r") + (mult:HI (not:HI (zero_extend:HI (not:QI (match_operand:QI 1 "register_operand" "a")))) + (sign_extend:HI (match_operand:QI 2 "register_operand" "a")))) + (clobber (reg:CC REG_CC))] + "AVR_HAVE_MUL && reload_completed" "mulsu %2,%1 movw %0,r0 sub %B0,%2 clr __zero_reg__" - [(set_attr "length" "4") - (set_attr "cc" "clobber")]) + [(set_attr "length" "4")]) -(define_insn "*oumulqihi3" +(define_insn_and_split "*oumulqihi3_split" [(set (match_operand:HI 0 "register_operand" "=&r") (mult:HI (not:HI (zero_extend:HI (not:QI (match_operand:QI 1 "register_operand" "r")))) (zero_extend:HI (match_operand:QI 2 "register_operand" "r"))))] "AVR_HAVE_MUL" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (mult:HI (not:HI (zero_extend:HI (not:QI (match_dup 1)))) + (zero_extend:HI (match_dup 2)))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*oumulqihi3" + [(set (match_operand:HI 0 "register_operand" "=&r") + (mult:HI (not:HI (zero_extend:HI (not:QI (match_operand:QI 1 "register_operand" "r")))) + (zero_extend:HI (match_operand:QI 2 "register_operand" "r")))) + (clobber (reg:CC REG_CC))] + "AVR_HAVE_MUL && reload_completed" "mul %2,%1 movw %0,r0 sub %B0,%2 clr __zero_reg__" - [(set_attr "length" "4") - (set_attr "cc" "clobber")]) + [(set_attr "length" "4")]) ;****************************************************************************** ; multiply-add/sub QI: $0 = $3 +/- $1*$2 ;****************************************************************************** -(define_insn "*maddqi4" +(define_insn_and_split "*maddqi4_split" [(set (match_operand:QI 0 "register_operand" "=r") (plus:QI (mult:QI (match_operand:QI 1 "register_operand" "r") (match_operand:QI 2 "register_operand" "r")) (match_operand:QI 3 "register_operand" "0")))] "AVR_HAVE_MUL" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (plus:QI (mult:QI (match_dup 1) + (match_dup 2)) + (match_dup 3))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*maddqi4" + [(set (match_operand:QI 0 "register_operand" "=r") + (plus:QI (mult:QI (match_operand:QI 1 "register_operand" "r") + (match_operand:QI 2 
"register_operand" "r")) + (match_operand:QI 3 "register_operand" "0"))) + (clobber (reg:CC REG_CC))] + "AVR_HAVE_MUL && reload_completed" "mul %1,%2 add %A0,r0 clr __zero_reg__" - [(set_attr "length" "4") - (set_attr "cc" "clobber")]) + [(set_attr "length" "4")]) -(define_insn "*msubqi4" +(define_insn_and_split "*msubqi4_split" [(set (match_operand:QI 0 "register_operand" "=r") (minus:QI (match_operand:QI 3 "register_operand" "0") (mult:QI (match_operand:QI 1 "register_operand" "r") (match_operand:QI 2 "register_operand" "r"))))] "AVR_HAVE_MUL" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (minus:QI (match_dup 3) + (mult:QI (match_dup 1) + (match_dup 2)))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*msubqi4" + [(set (match_operand:QI 0 "register_operand" "=r") + (minus:QI (match_operand:QI 3 "register_operand" "0") + (mult:QI (match_operand:QI 1 "register_operand" "r") + (match_operand:QI 2 "register_operand" "r")))) + (clobber (reg:CC REG_CC))] + "AVR_HAVE_MUL && reload_completed" "mul %1,%2 sub %A0,r0 clr __zero_reg__" - [(set_attr "length" "4") - (set_attr "cc" "clobber")]) + [(set_attr "length" "4")]) (define_insn_and_split "*maddqi4.const" [(set (match_operand:QI 0 "register_operand" "=r") @@ -1821,38 +2583,66 @@ ;; "*maddqihi4" ;; "*umaddqihi4" -(define_insn "*maddqihi4" +(define_insn_and_split "*maddqihi4_split" [(set (match_operand:HI 0 "register_operand" "=r") (plus:HI (mult:HI (any_extend:HI (match_operand:QI 1 "register_operand" "")) (any_extend:HI (match_operand:QI 2 "register_operand" ""))) (match_operand:HI 3 "register_operand" "0")))] "AVR_HAVE_MUL" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (plus:HI (mult:HI (any_extend:HI (match_dup 1)) + (any_extend:HI (match_dup 2))) + (match_dup 3))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*maddqihi4" + [(set (match_operand:HI 0 "register_operand" "=r") + (plus:HI (mult:HI (any_extend:HI (match_operand:QI 1 "register_operand" "")) + (any_extend:HI (match_operand:QI 2 "register_operand" ""))) + (match_operand:HI 3 "register_operand" "0"))) + (clobber (reg:CC REG_CC))] + "AVR_HAVE_MUL && reload_completed" "mul %1,%2 add %A0,r0 adc %B0,r1 clr __zero_reg__" - [(set_attr "length" "4") - (set_attr "cc" "clobber")]) + [(set_attr "length" "4")]) ;; "*msubqihi4" ;; "*umsubqihi4" -(define_insn "*msubqihi4" +(define_insn_and_split "*msubqihi4_split" [(set (match_operand:HI 0 "register_operand" "=r") (minus:HI (match_operand:HI 3 "register_operand" "0") (mult:HI (any_extend:HI (match_operand:QI 1 "register_operand" "")) (any_extend:HI (match_operand:QI 2 "register_operand" "")))))] "AVR_HAVE_MUL" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (minus:HI (match_dup 3) + (mult:HI (any_extend:HI (match_dup 1)) + (any_extend:HI (match_dup 2))))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*msubqihi4" + [(set (match_operand:HI 0 "register_operand" "=r") + (minus:HI (match_operand:HI 3 "register_operand" "0") + (mult:HI (any_extend:HI (match_operand:QI 1 "register_operand" "")) + (any_extend:HI (match_operand:QI 2 "register_operand" ""))))) + (clobber (reg:CC REG_CC))] + "AVR_HAVE_MUL && reload_completed" "mul %1,%2 sub %A0,r0 sbc %B0,r1 clr __zero_reg__" - [(set_attr "length" "4") - (set_attr "cc" "clobber")]) + [(set_attr "length" "4")]) ;; "*usmaddqihi4" ;; "*sumaddqihi4" -(define_insn "*msubqihi4" +(define_insn_and_split "*msubqihi4_split" [(set (match_operand:HI 0 "register_operand" "=r") (plus:HI (mult:HI (any_extend:HI (match_operand:QI 1 "register_operand" "a")) 
                          (any_extend2:HI (match_operand:QI 2 "register_operand" "a")))
@@ -1860,18 +2650,34 @@
  "AVR_HAVE_MUL
   && reload_completed
   && <any_extend:CODE> != <any_extend2:CODE>"
+  "#"
+  "&& reload_completed"
+  [(parallel [(set (match_dup 0)
+                   (plus:HI (mult:HI (any_extend:HI (match_dup 1))
+                                     (any_extend2:HI (match_dup 2)))
+                            (match_dup 3)))
+              (clobber (reg:CC REG_CC))])])
+
+(define_insn "*msubqihi4"
+  [(set (match_operand:HI 0 "register_operand" "=r")
+        (plus:HI (mult:HI (any_extend:HI (match_operand:QI 1 "register_operand" "a"))
+                          (any_extend2:HI (match_operand:QI 2 "register_operand" "a")))
+                 (match_operand:HI 3 "register_operand" "0")))
+   (clobber (reg:CC REG_CC))]
+  "AVR_HAVE_MUL
+   && reload_completed
+   && <any_extend:CODE> != <any_extend2:CODE>"
  {
    output_asm_insn (<any_extend:CODE> == SIGN_EXTEND
                     ? "mulsu %1,%2" : "mulsu %2,%1", operands);
    return "add %A0,r0\;adc %B0,r1\;clr __zero_reg__";
  }
-  [(set_attr "length" "4")
-   (set_attr "cc" "clobber")])
+  [(set_attr "length" "4")])

;; "*usmsubqihi4"
;; "*sumsubqihi4"
-(define_insn "*msubqihi4"
+(define_insn_and_split "*msubqihi4_split"
  [(set (match_operand:HI 0 "register_operand" "=r")
        (minus:HI (match_operand:HI 3 "register_operand" "0")
                  (mult:HI (any_extend:HI (match_operand:QI 1 "register_operand" "a"))
@@ -1879,14 +2685,30 @@
  "AVR_HAVE_MUL
   && reload_completed
   && <any_extend:CODE> != <any_extend2:CODE>"
+  "#"
+  "&& reload_completed"
+  [(parallel [(set (match_dup 0)
+                   (minus:HI (match_dup 3)
+                             (mult:HI (any_extend:HI (match_dup 1))
+                                      (any_extend2:HI (match_dup 2)))))
+              (clobber (reg:CC REG_CC))])])
+
+(define_insn "*msubqihi4"
+  [(set (match_operand:HI 0 "register_operand" "=r")
+        (minus:HI (match_operand:HI 3 "register_operand" "0")
+                  (mult:HI (any_extend:HI (match_operand:QI 1 "register_operand" "a"))
+                           (any_extend2:HI (match_operand:QI 2 "register_operand" "a")))))
+   (clobber (reg:CC REG_CC))]
+  "AVR_HAVE_MUL
+   && reload_completed
+   && <any_extend:CODE> != <any_extend2:CODE>"
  {
    output_asm_insn (<any_extend:CODE> == SIGN_EXTEND
                     ? "mulsu %1,%2" : "mulsu %2,%1", operands);
    return "sub %A0,r0\;sbc %B0,r1\;clr __zero_reg__";
  }
-  [(set_attr "length" "4")
-   (set_attr "cc" "clobber")])
+  [(set_attr "length" "4")])

;; Handle small constants
@@ -2130,17 +2952,28 @@
;; The EXTEND of $1 only appears in combine, we don't see it in expand so that
;; expand decides to use ASHIFT instead of MUL because ASHIFT costs are cheaper
;; at that time. Fix that.
- -(define_insn "*ashiftqihi2.signx.1" +(define_insn_and_split "*ashiftqihi2.signx.1_split" [(set (match_operand:HI 0 "register_operand" "=r,*r") (ashift:HI (sign_extend:HI (match_operand:QI 1 "register_operand" "0,r")) (const_int 1)))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (ashift:HI (sign_extend:HI (match_dup 1)) + (const_int 1))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*ashiftqihi2.signx.1" + [(set (match_operand:HI 0 "register_operand" "=r,*r") + (ashift:HI (sign_extend:HI (match_operand:QI 1 "register_operand" "0,r")) + (const_int 1))) + (clobber (reg:CC REG_CC)) ] + "reload_completed" "@ lsl %A0\;sbc %B0,%B0 mov %A0,%1\;lsl %A0\;sbc %B0,%B0" - [(set_attr "length" "2,3") - (set_attr "cc" "clobber")]) + [(set_attr "length" "2,3")]) (define_insn_and_split "*ashifthi3.signx.const" [(set (match_operand:HI 0 "register_operand" "=r") @@ -2200,47 +3033,83 @@ ; mul HI: $1 = sign-/zero-/one-extend, $2 = reg ;****************************************************************************** -(define_insn "mulsqihi3" +(define_insn_and_split "mulsqihi3" [(set (match_operand:HI 0 "register_operand" "=&r") (mult:HI (sign_extend:HI (match_operand:QI 1 "register_operand" "a")) (match_operand:HI 2 "register_operand" "a")))] "AVR_HAVE_MUL" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (mult:HI (sign_extend:HI (match_dup 1)) + (match_dup 2))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*mulsqihi3" + [(set (match_operand:HI 0 "register_operand" "=&r") + (mult:HI (sign_extend:HI (match_operand:QI 1 "register_operand" "a")) + (match_operand:HI 2 "register_operand" "a"))) + (clobber (reg:CC REG_CC))] + "AVR_HAVE_MUL && reload_completed" "mulsu %1,%A2 movw %0,r0 mul %1,%B2 add %B0,r0 clr __zero_reg__" - [(set_attr "length" "5") - (set_attr "cc" "clobber")]) + [(set_attr "length" "5")]) -(define_insn "muluqihi3" +(define_insn_and_split "muluqihi3" [(set (match_operand:HI 0 "register_operand" "=&r") (mult:HI (zero_extend:HI (match_operand:QI 1 "register_operand" "r")) (match_operand:HI 2 "register_operand" "r")))] "AVR_HAVE_MUL" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (mult:HI (zero_extend:HI (match_dup 1)) + (match_dup 2))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*muluqihi3" + [(set (match_operand:HI 0 "register_operand" "=&r") + (mult:HI (zero_extend:HI (match_operand:QI 1 "register_operand" "r")) + (match_operand:HI 2 "register_operand" "r"))) + (clobber (reg:CC REG_CC))] + "AVR_HAVE_MUL && reload_completed" "mul %1,%A2 movw %0,r0 mul %1,%B2 add %B0,r0 clr __zero_reg__" - [(set_attr "length" "5") - (set_attr "cc" "clobber")]) + [(set_attr "length" "5")]) ;; one-extend operand 1 -(define_insn "muloqihi3" +(define_insn_and_split "muloqihi3" [(set (match_operand:HI 0 "register_operand" "=&r") (mult:HI (not:HI (zero_extend:HI (not:QI (match_operand:QI 1 "register_operand" "r")))) (match_operand:HI 2 "register_operand" "r")))] "AVR_HAVE_MUL" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (mult:HI (not:HI (zero_extend:HI (not:QI (match_dup 1)))) + (match_dup 2))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*muloqihi3" + [(set (match_operand:HI 0 "register_operand" "=&r") + (mult:HI (not:HI (zero_extend:HI (not:QI (match_operand:QI 1 "register_operand" "r")))) + (match_operand:HI 2 "register_operand" "r"))) + (clobber (reg:CC REG_CC))] + "AVR_HAVE_MUL && reload_completed" "mul %1,%A2 movw %0,r0 mul %1,%B2 add %B0,r0 sub %B0,%A2 clr __zero_reg__" - [(set_attr "length" "6") - (set_attr "cc" 
"clobber")]) + [(set_attr "length" "6")]) ;****************************************************************************** @@ -2288,18 +3157,30 @@ operands[2] = force_reg (HImode, operands[2]); }) -(define_insn "*mulhi3_enh" +(define_insn_and_split "*mulhi3_enh_split" [(set (match_operand:HI 0 "register_operand" "=&r") (mult:HI (match_operand:HI 1 "register_operand" "r") (match_operand:HI 2 "register_operand" "r")))] "AVR_HAVE_MUL" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (mult:HI (match_dup 1) + (match_dup 2))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*mulhi3_enh" + [(set (match_operand:HI 0 "register_operand" "=&r") + (mult:HI (match_operand:HI 1 "register_operand" "r") + (match_operand:HI 2 "register_operand" "r"))) + (clobber (reg:CC REG_CC))] + "AVR_HAVE_MUL && reload_completed" { return REGNO (operands[1]) == REGNO (operands[2]) ? "mul %A1,%A1\;movw %0,r0\;mul %A1,%B1\;add %B0,r0\;add %B0,r0\;clr r1" : "mul %A1,%A2\;movw %0,r0\;mul %A1,%B2\;add %B0,r0\;mul %B1,%A2\;add %B0,r0\;clr r1"; } - [(set_attr "length" "7") - (set_attr "cc" "clobber")]) + [(set_attr "length" "7")]) (define_expand "mulhi3_call" [(set (reg:HI 24) (match_operand:HI 1 "register_operand" "")) @@ -2315,14 +3196,26 @@ }) -(define_insn "*mulhi3_call" +(define_insn_and_split "*mulhi3_call_split" [(set (reg:HI 24) (mult:HI (reg:HI 24) (reg:HI 22))) (clobber (reg:HI 22)) (clobber (reg:QI 21))] "!AVR_HAVE_MUL" + "#" + "&& reload_completed" + [(parallel [(set (reg:HI 24) (mult:HI (reg:HI 24) (reg:HI 22))) + (clobber (reg:HI 22)) + (clobber (reg:QI 21)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*mulhi3_call" + [(set (reg:HI 24) (mult:HI (reg:HI 24) (reg:HI 22))) + (clobber (reg:HI 22)) + (clobber (reg:QI 21)) + (clobber (reg:CC REG_CC))] + "!AVR_HAVE_MUL && reload_completed" "%~call __mulhi3" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) ;; To support widening multiplication with constant we postpone ;; expanding to the implicit library call until post combine and @@ -2643,67 +3536,144 @@ avr_fix_inputs (operands, 1 << 2, regmask (HImode, 18)); }) - -(define_insn "*mulsi3_call" +(define_insn_and_split "*mulsi3_call_split" [(set (reg:SI 22) (mult:SI (reg:SI 22) (reg:SI 18))) (clobber (reg:HI 26))] "AVR_HAVE_MUL" + "#" + "&& reload_completed" + [(parallel [(set (reg:SI 22) + (mult:SI (reg:SI 22) + (reg:SI 18))) + (clobber (reg:HI 26)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*mulsi3_call" + [(set (reg:SI 22) + (mult:SI (reg:SI 22) + (reg:SI 18))) + (clobber (reg:HI 26)) + (clobber (reg:CC REG_CC))] + "AVR_HAVE_MUL && reload_completed" "%~call __mulsi3" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) ;; "*mulhisi3_call" ;; "*umulhisi3_call" -(define_insn "*mulhisi3_call" +(define_insn_and_split "*mulhisi3_call_split" [(set (reg:SI 22) (mult:SI (any_extend:SI (reg:HI 18)) (any_extend:SI (reg:HI 26))))] "AVR_HAVE_MUL" + "#" + "&& reload_completed" + [(parallel [(set (reg:SI 22) + (mult:SI (any_extend:SI (reg:HI 18)) + (any_extend:SI (reg:HI 26)))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*mulhisi3_call" + [(set (reg:SI 22) + (mult:SI (any_extend:SI (reg:HI 18)) + (any_extend:SI (reg:HI 26)))) + (clobber (reg:CC REG_CC))] + "AVR_HAVE_MUL && reload_completed" "%~call __mulhisi3" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) ;; "*umulhi3_highpart_call" ;; "*smulhi3_highpart_call" -(define_insn "*mulhi3_highpart_call" +(define_insn_and_split 
"*mulhi3_highpart_call_split" [(set (reg:HI 24) (truncate:HI (lshiftrt:SI (mult:SI (any_extend:SI (reg:HI 18)) (any_extend:SI (reg:HI 26))) (const_int 16)))) (clobber (reg:HI 22))] "AVR_HAVE_MUL" + "#" + "&& reload_completed" + [(parallel [(set (reg:HI 24) + (truncate:HI (lshiftrt:SI (mult:SI (any_extend:SI (reg:HI 18)) + (any_extend:SI (reg:HI 26))) + (const_int 16)))) + (clobber (reg:HI 22)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*mulhi3_highpart_call" + [(set (reg:HI 24) + (truncate:HI (lshiftrt:SI (mult:SI (any_extend:SI (reg:HI 18)) + (any_extend:SI (reg:HI 26))) + (const_int 16)))) + (clobber (reg:HI 22)) + (clobber (reg:CC REG_CC))] + "AVR_HAVE_MUL && reload_completed" "%~call __mulhisi3" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) -(define_insn "*usmulhisi3_call" +(define_insn_and_split "*usmulhisi3_call_split" [(set (reg:SI 22) (mult:SI (zero_extend:SI (reg:HI 18)) (sign_extend:SI (reg:HI 26))))] "AVR_HAVE_MUL" + "#" + "&& reload_completed" + [(parallel [(set (reg:SI 22) + (mult:SI (zero_extend:SI (reg:HI 18)) + (sign_extend:SI (reg:HI 26)))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*usmulhisi3_call" + [(set (reg:SI 22) + (mult:SI (zero_extend:SI (reg:HI 18)) + (sign_extend:SI (reg:HI 26)))) + (clobber (reg:CC REG_CC))] + "AVR_HAVE_MUL && reload_completed" "%~call __usmulhisi3" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) -(define_insn "*mulhisi3_call" +(define_insn_and_split "*mulhisi3_call_split" [(set (reg:SI 22) (mult:SI (any_extend:SI (reg:HI 26)) (reg:SI 18)))] "AVR_HAVE_MUL" + "#" + "&& reload_completed" + [(parallel [(set (reg:SI 22) + (mult:SI (any_extend:SI (reg:HI 26)) + (reg:SI 18))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*mulhisi3_call" + [(set (reg:SI 22) + (mult:SI (any_extend:SI (reg:HI 26)) + (reg:SI 18))) + (clobber (reg:CC REG_CC))] + "AVR_HAVE_MUL && reload_completed" "%~call __mulhisi3" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) -(define_insn "*mulohisi3_call" +(define_insn_and_split "*mulohisi3_call_split" [(set (reg:SI 22) (mult:SI (not:SI (zero_extend:SI (not:HI (reg:HI 26)))) (reg:SI 18)))] "AVR_HAVE_MUL" + "#" + "&& reload_completed" + [(parallel [(set (reg:SI 22) + (mult:SI (not:SI (zero_extend:SI (not:HI (reg:HI 26)))) + (reg:SI 18))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*mulohisi3_call" + [(set (reg:SI 22) + (mult:SI (not:SI (zero_extend:SI (not:HI (reg:HI 26)))) + (reg:SI 18))) + (clobber (reg:CC REG_CC))] + "AVR_HAVE_MUL && reload_completed" "%~call __mulohisi3" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) ; / % / % / % / % / % / % / % / % / % / % / % / % / % / % / % / % / % / % / % ; divmod @@ -2716,15 +3686,15 @@ ;; CSE has problems to operate on hard regs. 
;; (define_insn_and_split "divmodqi4" - [(parallel [(set (match_operand:QI 0 "pseudo_register_operand" "") - (div:QI (match_operand:QI 1 "pseudo_register_operand" "") - (match_operand:QI 2 "pseudo_register_operand" ""))) - (set (match_operand:QI 3 "pseudo_register_operand" "") - (mod:QI (match_dup 1) (match_dup 2))) - (clobber (reg:QI 22)) - (clobber (reg:QI 23)) - (clobber (reg:QI 24)) - (clobber (reg:QI 25))])] + [(set (match_operand:QI 0 "pseudo_register_operand" "") + (div:QI (match_operand:QI 1 "pseudo_register_operand" "") + (match_operand:QI 2 "pseudo_register_operand" ""))) + (set (match_operand:QI 3 "pseudo_register_operand" "") + (mod:QI (match_dup 1) (match_dup 2))) + (clobber (reg:QI 22)) + (clobber (reg:QI 23)) + (clobber (reg:QI 24)) + (clobber (reg:QI 25))] "" "this divmodqi4 pattern should have been splitted;" "" @@ -2737,26 +3707,40 @@ (set (match_dup 0) (reg:QI 24)) (set (match_dup 3) (reg:QI 25))]) -(define_insn "*divmodqi4_call" +(define_insn_and_split "*divmodqi4_call_split" [(set (reg:QI 24) (div:QI (reg:QI 24) (reg:QI 22))) (set (reg:QI 25) (mod:QI (reg:QI 24) (reg:QI 22))) (clobber (reg:QI 22)) (clobber (reg:QI 23))] "" + "#" + "&& reload_completed" + [(parallel [(set (reg:QI 24) (div:QI (reg:QI 24) (reg:QI 22))) + (set (reg:QI 25) (mod:QI (reg:QI 24) (reg:QI 22))) + (clobber (reg:QI 22)) + (clobber (reg:QI 23)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*divmodqi4_call" + [(set (reg:QI 24) (div:QI (reg:QI 24) (reg:QI 22))) + (set (reg:QI 25) (mod:QI (reg:QI 24) (reg:QI 22))) + (clobber (reg:QI 22)) + (clobber (reg:QI 23)) + (clobber (reg:CC REG_CC))] + "reload_completed" "%~call __divmodqi4" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) (define_insn_and_split "udivmodqi4" - [(parallel [(set (match_operand:QI 0 "pseudo_register_operand" "") - (udiv:QI (match_operand:QI 1 "pseudo_register_operand" "") - (match_operand:QI 2 "pseudo_register_operand" ""))) - (set (match_operand:QI 3 "pseudo_register_operand" "") - (umod:QI (match_dup 1) (match_dup 2))) - (clobber (reg:QI 22)) - (clobber (reg:QI 23)) - (clobber (reg:QI 24)) - (clobber (reg:QI 25))])] + [(set (match_operand:QI 0 "pseudo_register_operand" "") + (udiv:QI (match_operand:QI 1 "pseudo_register_operand" "") + (match_operand:QI 2 "pseudo_register_operand" ""))) + (set (match_operand:QI 3 "pseudo_register_operand" "") + (umod:QI (match_dup 1) (match_dup 2))) + (clobber (reg:QI 22)) + (clobber (reg:QI 23)) + (clobber (reg:QI 24)) + (clobber (reg:QI 25))] "" "this udivmodqi4 pattern should have been splitted;" "" @@ -2768,25 +3752,37 @@ (set (match_dup 0) (reg:QI 24)) (set (match_dup 3) (reg:QI 25))]) -(define_insn "*udivmodqi4_call" +(define_insn_and_split "*udivmodqi4_call_split" [(set (reg:QI 24) (udiv:QI (reg:QI 24) (reg:QI 22))) (set (reg:QI 25) (umod:QI (reg:QI 24) (reg:QI 22))) (clobber (reg:QI 23))] "" + "#" + "&& reload_completed" + [(parallel [(set (reg:QI 24) (udiv:QI (reg:QI 24) (reg:QI 22))) + (set (reg:QI 25) (umod:QI (reg:QI 24) (reg:QI 22))) + (clobber (reg:QI 23)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*udivmodqi4_call" + [(set (reg:QI 24) (udiv:QI (reg:QI 24) (reg:QI 22))) + (set (reg:QI 25) (umod:QI (reg:QI 24) (reg:QI 22))) + (clobber (reg:QI 23)) + (clobber (reg:CC REG_CC))] + "reload_completed" "%~call __udivmodqi4" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) (define_insn_and_split "divmodhi4" - [(parallel [(set (match_operand:HI 0 "pseudo_register_operand" "") - (div:HI (match_operand:HI 
1 "pseudo_register_operand" "") - (match_operand:HI 2 "pseudo_register_operand" ""))) - (set (match_operand:HI 3 "pseudo_register_operand" "") - (mod:HI (match_dup 1) (match_dup 2))) - (clobber (reg:QI 21)) - (clobber (reg:HI 22)) - (clobber (reg:HI 24)) - (clobber (reg:HI 26))])] + [(set (match_operand:HI 0 "pseudo_register_operand" "") + (div:HI (match_operand:HI 1 "pseudo_register_operand" "") + (match_operand:HI 2 "pseudo_register_operand" ""))) + (set (match_operand:HI 3 "pseudo_register_operand" "") + (mod:HI (match_dup 1) (match_dup 2))) + (clobber (reg:QI 21)) + (clobber (reg:HI 22)) + (clobber (reg:HI 24)) + (clobber (reg:HI 26))] "" "this should have been splitted;" "" @@ -2799,26 +3795,40 @@ (set (match_dup 0) (reg:HI 22)) (set (match_dup 3) (reg:HI 24))]) -(define_insn "*divmodhi4_call" +(define_insn_and_split "*divmodhi4_call_split" [(set (reg:HI 22) (div:HI (reg:HI 24) (reg:HI 22))) (set (reg:HI 24) (mod:HI (reg:HI 24) (reg:HI 22))) (clobber (reg:HI 26)) (clobber (reg:QI 21))] "" + "#" + "&& reload_completed" + [(parallel [(set (reg:HI 22) (div:HI (reg:HI 24) (reg:HI 22))) + (set (reg:HI 24) (mod:HI (reg:HI 24) (reg:HI 22))) + (clobber (reg:HI 26)) + (clobber (reg:QI 21)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*divmodhi4_call" + [(set (reg:HI 22) (div:HI (reg:HI 24) (reg:HI 22))) + (set (reg:HI 24) (mod:HI (reg:HI 24) (reg:HI 22))) + (clobber (reg:HI 26)) + (clobber (reg:QI 21)) + (clobber (reg:CC REG_CC))] + "reload_completed" "%~call __divmodhi4" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) (define_insn_and_split "udivmodhi4" - [(parallel [(set (match_operand:HI 0 "pseudo_register_operand" "") - (udiv:HI (match_operand:HI 1 "pseudo_register_operand" "") - (match_operand:HI 2 "pseudo_register_operand" ""))) - (set (match_operand:HI 3 "pseudo_register_operand" "") - (umod:HI (match_dup 1) (match_dup 2))) - (clobber (reg:QI 21)) - (clobber (reg:HI 22)) - (clobber (reg:HI 24)) - (clobber (reg:HI 26))])] + [(set (match_operand:HI 0 "pseudo_register_operand" "") + (udiv:HI (match_operand:HI 1 "pseudo_register_operand" "") + (match_operand:HI 2 "pseudo_register_operand" ""))) + (set (match_operand:HI 3 "pseudo_register_operand" "") + (umod:HI (match_dup 1) (match_dup 2))) + (clobber (reg:QI 21)) + (clobber (reg:HI 22)) + (clobber (reg:HI 24)) + (clobber (reg:HI 26))] "" "this udivmodhi4 pattern should have been splitted.;" "" @@ -2831,15 +3841,30 @@ (set (match_dup 0) (reg:HI 22)) (set (match_dup 3) (reg:HI 24))]) -(define_insn "*udivmodhi4_call" +(define_insn_and_split "*udivmodhi4_call_split" [(set (reg:HI 22) (udiv:HI (reg:HI 24) (reg:HI 22))) (set (reg:HI 24) (umod:HI (reg:HI 24) (reg:HI 22))) (clobber (reg:HI 26)) (clobber (reg:QI 21))] "" + "#" + "&& reload_completed" + [(parallel [(set (reg:HI 22) (udiv:HI (reg:HI 24) (reg:HI 22))) + (set (reg:HI 24) (umod:HI (reg:HI 24) (reg:HI 22))) + (clobber (reg:HI 26)) + (clobber (reg:QI 21)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*udivmodhi4_call" + [(set (reg:HI 22) (udiv:HI (reg:HI 24) (reg:HI 22))) + (set (reg:HI 24) (umod:HI (reg:HI 24) (reg:HI 22))) + (clobber (reg:HI 26)) + (clobber (reg:QI 21)) + (clobber (reg:CC REG_CC)) + ] + "reload_completed" "%~call __udivmodhi4" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; 24-bit multiply @@ -2870,11 +3895,24 @@ DONE; }) -(define_insn "*umulqihipsi3" +(define_insn_and_split "*umulqihipsi3_split" [(set 
(match_operand:PSI 0 "register_operand" "=&r") (mult:PSI (zero_extend:PSI (match_operand:QI 1 "register_operand" "r")) (zero_extend:PSI (match_operand:HI 2 "register_operand" "r"))))] "AVR_HAVE_MUL" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (mult:PSI (zero_extend:PSI (match_dup 1)) + (zero_extend:PSI (match_dup 2)))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*umulqihipsi3" + [(set (match_operand:PSI 0 "register_operand" "=&r") + (mult:PSI (zero_extend:PSI (match_operand:QI 1 "register_operand" "r")) + (zero_extend:PSI (match_operand:HI 2 "register_operand" "r")))) + (clobber (reg:CC REG_CC))] + "AVR_HAVE_MUL && reload_completed" "mul %1,%A2 movw %A0,r0 mul %1,%B2 @@ -2882,14 +3920,26 @@ add %B0,r0 adc %C0,r1 clr __zero_reg__" - [(set_attr "length" "7") - (set_attr "cc" "clobber")]) + [(set_attr "length" "7")]) -(define_insn "*umulhiqipsi3" +(define_insn_and_split "*umulhiqipsi3_split" [(set (match_operand:PSI 0 "register_operand" "=&r") (mult:PSI (zero_extend:PSI (match_operand:HI 2 "register_operand" "r")) (zero_extend:PSI (match_operand:QI 1 "register_operand" "r"))))] "AVR_HAVE_MUL" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (mult:PSI (zero_extend:PSI (match_dup 2)) + (zero_extend:PSI (match_dup 1)))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*umulhiqipsi3" + [(set (match_operand:PSI 0 "register_operand" "=&r") + (mult:PSI (zero_extend:PSI (match_operand:HI 2 "register_operand" "r")) + (zero_extend:PSI (match_operand:QI 1 "register_operand" "r")))) + (clobber (reg:CC REG_CC))] + "AVR_HAVE_MUL && reload_completed" "mul %1,%A2 movw %A0,r0 mul %1,%B2 @@ -2897,8 +3947,7 @@ mov %C0,r1 clr __zero_reg__ adc %C0,__zero_reg__" - [(set_attr "length" "7") - (set_attr "cc" "clobber")]) + [(set_attr "length" "7")]) (define_expand "mulsqipsi3" [(parallel [(set (match_operand:PSI 0 "pseudo_register_operand" "") @@ -2963,16 +4012,28 @@ } }) -(define_insn "*mulsqipsi3.libgcc" +(define_insn_and_split "*mulsqipsi3.libgcc_split" [(set (reg:PSI 18) (mult:PSI (sign_extend:PSI (reg:QI 25)) (reg:PSI 22)))] "AVR_HAVE_MUL" + "#" + "&& reload_completed" + [(parallel [(set (reg:PSI 18) + (mult:PSI (sign_extend:PSI (reg:QI 25)) + (reg:PSI 22))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*mulsqipsi3.libgcc" + [(set (reg:PSI 18) + (mult:PSI (sign_extend:PSI (reg:QI 25)) + (reg:PSI 22))) + (clobber (reg:CC REG_CC))] + "AVR_HAVE_MUL && reload_completed" "%~call __mulsqipsi3" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) -(define_insn "*mulpsi3.libgcc" +(define_insn_and_split "*mulpsi3.libgcc_split" [(set (reg:PSI 22) (mult:PSI (reg:PSI 22) (reg:PSI 18))) @@ -2980,9 +4041,27 @@ (clobber (reg:QI 25)) (clobber (reg:HI 26))] "AVR_HAVE_MUL" + "#" + "&& reload_completed" + [(parallel [(set (reg:PSI 22) + (mult:PSI (reg:PSI 22) + (reg:PSI 18))) + (clobber (reg:QI 21)) + (clobber (reg:QI 25)) + (clobber (reg:HI 26)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*mulpsi3.libgcc" + [(set (reg:PSI 22) + (mult:PSI (reg:PSI 22) + (reg:PSI 18))) + (clobber (reg:QI 21)) + (clobber (reg:QI 25)) + (clobber (reg:HI 26)) + (clobber (reg:CC REG_CC))] + "AVR_HAVE_MUL && reload_completed" "%~call __mulpsi3" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -3013,16 +4092,32 @@ (set (match_dup 0) (reg:PSI 22)) (set (match_dup 3) (reg:PSI 18))]) -(define_insn "*divmodpsi4_call" +(define_insn_and_split 
"*divmodpsi4_call_split" [(set (reg:PSI 22) (div:PSI (reg:PSI 22) (reg:PSI 18))) (set (reg:PSI 18) (mod:PSI (reg:PSI 22) (reg:PSI 18))) (clobber (reg:QI 21)) (clobber (reg:QI 25)) (clobber (reg:QI 26))] "" + "#" + "&& reload_completed" + [(parallel [(set (reg:PSI 22) (div:PSI (reg:PSI 22) (reg:PSI 18))) + (set (reg:PSI 18) (mod:PSI (reg:PSI 22) (reg:PSI 18))) + (clobber (reg:QI 21)) + (clobber (reg:QI 25)) + (clobber (reg:QI 26)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*divmodpsi4_call" + [(set (reg:PSI 22) (div:PSI (reg:PSI 22) (reg:PSI 18))) + (set (reg:PSI 18) (mod:PSI (reg:PSI 22) (reg:PSI 18))) + (clobber (reg:QI 21)) + (clobber (reg:QI 25)) + (clobber (reg:QI 26)) + (clobber (reg:CC REG_CC))] + "reload_completed" "%~call __divmodpsi4" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) (define_insn_and_split "udivmodpsi4" [(parallel [(set (match_operand:PSI 0 "pseudo_register_operand" "") @@ -3046,16 +4141,32 @@ (set (match_dup 0) (reg:PSI 22)) (set (match_dup 3) (reg:PSI 18))]) -(define_insn "*udivmodpsi4_call" +(define_insn_and_split "*udivmodpsi4_call_split" [(set (reg:PSI 22) (udiv:PSI (reg:PSI 22) (reg:PSI 18))) (set (reg:PSI 18) (umod:PSI (reg:PSI 22) (reg:PSI 18))) (clobber (reg:QI 21)) (clobber (reg:QI 25)) (clobber (reg:QI 26))] "" + "#" + "&& reload_completed" + [(parallel [(set (reg:PSI 22) (udiv:PSI (reg:PSI 22) (reg:PSI 18))) + (set (reg:PSI 18) (umod:PSI (reg:PSI 22) (reg:PSI 18))) + (clobber (reg:QI 21)) + (clobber (reg:QI 25)) + (clobber (reg:QI 26)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*udivmodpsi4_call" + [(set (reg:PSI 22) (udiv:PSI (reg:PSI 22) (reg:PSI 18))) + (set (reg:PSI 18) (umod:PSI (reg:PSI 22) (reg:PSI 18))) + (clobber (reg:QI 21)) + (clobber (reg:QI 25)) + (clobber (reg:QI 26)) + (clobber (reg:CC REG_CC))] + "reload_completed" "%~call __udivmodpsi4" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -3081,15 +4192,29 @@ (set (match_dup 0) (reg:SI 18)) (set (match_dup 3) (reg:SI 22))]) -(define_insn "*divmodsi4_call" +(define_insn_and_split "*divmodsi4_call_split" [(set (reg:SI 18) (div:SI (reg:SI 22) (reg:SI 18))) (set (reg:SI 22) (mod:SI (reg:SI 22) (reg:SI 18))) (clobber (reg:HI 26)) (clobber (reg:HI 30))] "" + "#" + "&& reload_completed" + [(parallel [(set (reg:SI 18) (div:SI (reg:SI 22) (reg:SI 18))) + (set (reg:SI 22) (mod:SI (reg:SI 22) (reg:SI 18))) + (clobber (reg:HI 26)) + (clobber (reg:HI 30)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*divmodsi4_call" + [(set (reg:SI 18) (div:SI (reg:SI 22) (reg:SI 18))) + (set (reg:SI 22) (mod:SI (reg:SI 22) (reg:SI 18))) + (clobber (reg:HI 26)) + (clobber (reg:HI 30)) + (clobber (reg:CC REG_CC))] + "reload_completed" "%~call __divmodsi4" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) (define_insn_and_split "udivmodsi4" [(parallel [(set (match_operand:SI 0 "pseudo_register_operand" "") @@ -3113,37 +4238,78 @@ (set (match_dup 0) (reg:SI 18)) (set (match_dup 3) (reg:SI 22))]) -(define_insn "*udivmodsi4_call" +(define_insn_and_split "*udivmodsi4_call_split" [(set (reg:SI 18) (udiv:SI (reg:SI 22) (reg:SI 18))) (set (reg:SI 22) (umod:SI (reg:SI 22) (reg:SI 18))) (clobber (reg:HI 26)) (clobber (reg:HI 30))] "" + "#" + "&& reload_completed" + [(parallel [(set (reg:SI 18) (udiv:SI (reg:SI 22) (reg:SI 18))) + (set (reg:SI 22) (umod:SI (reg:SI 22) (reg:SI 18))) + (clobber (reg:HI 26)) + (clobber (reg:HI 
30)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*udivmodsi4_call" + [(set (reg:SI 18) (udiv:SI (reg:SI 22) (reg:SI 18))) + (set (reg:SI 22) (umod:SI (reg:SI 22) (reg:SI 18))) + (clobber (reg:HI 26)) + (clobber (reg:HI 30)) + (clobber (reg:CC REG_CC))] + "reload_completed" "%~call __udivmodsi4" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) ;&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& ; and -(define_insn "andqi3" +(define_insn_and_split "andqi3" + [(set (match_operand:QI 0 "register_operand" "=??r,d,*l") + (and:QI (match_operand:QI 1 "register_operand" "%0,0,0") + (match_operand:QI 2 "nonmemory_operand" "r,i,Ca1")))] + "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (and:QI (match_dup 1) + (match_dup 2))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*andqi3" [(set (match_operand:QI 0 "register_operand" "=??r,d,*l") (and:QI (match_operand:QI 1 "register_operand" "%0,0,0") - (match_operand:QI 2 "nonmemory_operand" "r,i,Ca1")))] - "" + (match_operand:QI 2 "nonmemory_operand" "r,i,Ca1"))) + (clobber (reg:CC REG_CC))] + "reload_completed" "@ and %0,%2 andi %0,lo8(%2) * return avr_out_bitop (insn, operands, NULL);" - [(set_attr "length" "1,1,2") - (set_attr "cc" "set_zn,set_zn,none")]) + [(set_attr "length" "1,1,2")]) -(define_insn "andhi3" +(define_insn_and_split "andhi3" [(set (match_operand:HI 0 "register_operand" "=??r,d,d,r ,r") (and:HI (match_operand:HI 1 "register_operand" "%0,0,0,0 ,0") (match_operand:HI 2 "nonmemory_operand" "r,s,n,Ca2,n"))) (clobber (match_scratch:QI 3 "=X,X,X,X ,&d"))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (and:HI (match_dup 1) + (match_dup 2))) + (clobber (match_dup 3)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*andhi3" + [(set (match_operand:HI 0 "register_operand" "=??r,d,d,r ,r") + (and:HI (match_operand:HI 1 "register_operand" "%0,0,0,0 ,0") + (match_operand:HI 2 "nonmemory_operand" "r,s,n,Ca2,n"))) + (clobber (match_scratch:QI 3 "=X,X,X,X ,&d")) + (clobber (reg:CC REG_CC))] + "reload_completed" { if (which_alternative == 0) return "and %A0,%A2\;and %B0,%B2"; @@ -3153,15 +4319,29 @@ return avr_out_bitop (insn, operands, NULL); } [(set_attr "length" "2,2,2,4,4") - (set_attr "adjust_len" "*,*,out_bitop,out_bitop,out_bitop") - (set_attr "cc" "set_n,set_n,clobber,clobber,clobber")]) + (set_attr "adjust_len" "*,*,out_bitop,out_bitop,out_bitop")]) -(define_insn "andpsi3" +(define_insn_and_split "andpsi3" [(set (match_operand:PSI 0 "register_operand" "=??r,d,r ,r") (and:PSI (match_operand:PSI 1 "register_operand" "%0,0,0 ,0") (match_operand:PSI 2 "nonmemory_operand" "r,n,Ca3,n"))) (clobber (match_scratch:QI 3 "=X,X,X ,&d"))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (and:PSI (match_dup 1) + (match_dup 2))) + (clobber (match_dup 3)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*andpsi3" + [(set (match_operand:PSI 0 "register_operand" "=??r,d,r ,r") + (and:PSI (match_operand:PSI 1 "register_operand" "%0,0,0 ,0") + (match_operand:PSI 2 "nonmemory_operand" "r,n,Ca3,n"))) + (clobber (match_scratch:QI 3 "=X,X,X ,&d")) + (clobber (reg:CC REG_CC))] + "reload_completed" { if (which_alternative == 0) return "and %A0,%A2" CR_TAB @@ -3171,15 +4351,29 @@ return avr_out_bitop (insn, operands, NULL); } [(set_attr "length" "3,3,6,6") - (set_attr "adjust_len" "*,out_bitop,out_bitop,out_bitop") - (set_attr "cc" "set_n,clobber,clobber,clobber")]) + (set_attr "adjust_len" "*,out_bitop,out_bitop,out_bitop")]) 
-(define_insn "andsi3" +(define_insn_and_split "andsi3" [(set (match_operand:SI 0 "register_operand" "=??r,d,r ,r") (and:SI (match_operand:SI 1 "register_operand" "%0,0,0 ,0") (match_operand:SI 2 "nonmemory_operand" "r,n,Ca4,n"))) (clobber (match_scratch:QI 3 "=X,X,X ,&d"))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (and:SI (match_dup 1) + (match_dup 2))) + (clobber (match_dup 3)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*andsi3" + [(set (match_operand:SI 0 "register_operand" "=??r,d,r ,r") + (and:SI (match_operand:SI 1 "register_operand" "%0,0,0 ,0") + (match_operand:SI 2 "nonmemory_operand" "r,n,Ca4,n"))) + (clobber (match_scratch:QI 3 "=X,X,X ,&d")) + (clobber (reg:CC REG_CC))] + "reload_completed" { if (which_alternative == 0) return "and %0,%2" CR_TAB @@ -3190,8 +4384,7 @@ return avr_out_bitop (insn, operands, NULL); } [(set_attr "length" "4,4,8,8") - (set_attr "adjust_len" "*,out_bitop,out_bitop,out_bitop") - (set_attr "cc" "set_n,clobber,clobber,clobber")]) + (set_attr "adjust_len" "*,out_bitop,out_bitop,out_bitop")]) (define_peephole2 ; andi [(set (match_operand:QI 0 "d_register_operand" "") @@ -3209,24 +4402,51 @@ ;;||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ;; ior -(define_insn "iorqi3" +(define_insn_and_split "iorqi3" [(set (match_operand:QI 0 "register_operand" "=??r,d,*l") (ior:QI (match_operand:QI 1 "register_operand" "%0,0,0") (match_operand:QI 2 "nonmemory_operand" "r,i,Co1")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (ior:QI (match_dup 1) + (match_dup 2))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*iorqi3" + [(set (match_operand:QI 0 "register_operand" "=??r,d,*l") + (ior:QI (match_operand:QI 1 "register_operand" "%0,0,0") + (match_operand:QI 2 "nonmemory_operand" "r,i,Co1"))) + (clobber (reg:CC REG_CC))] + "reload_completed" "@ or %0,%2 ori %0,lo8(%2) * return avr_out_bitop (insn, operands, NULL);" - [(set_attr "length" "1,1,2") - (set_attr "cc" "set_zn,set_zn,none")]) + [(set_attr "length" "1,1,2")]) -(define_insn "iorhi3" +(define_insn_and_split "iorhi3" [(set (match_operand:HI 0 "register_operand" "=??r,d,d,r ,r") (ior:HI (match_operand:HI 1 "register_operand" "%0,0,0,0 ,0") (match_operand:HI 2 "nonmemory_operand" "r,s,n,Co2,n"))) (clobber (match_scratch:QI 3 "=X,X,X,X ,&d"))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (ior:HI (match_dup 1) + (match_dup 2))) + (clobber (match_dup 3)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*iorhi3" + [(set (match_operand:HI 0 "register_operand" "=??r,d,d,r ,r") + (ior:HI (match_operand:HI 1 "register_operand" "%0,0,0,0 ,0") + (match_operand:HI 2 "nonmemory_operand" "r,s,n,Co2,n"))) + (clobber (match_scratch:QI 3 "=X,X,X,X ,&d")) + (clobber (reg:CC REG_CC))] + "reload_completed" { if (which_alternative == 0) return "or %A0,%A2\;or %B0,%B2"; @@ -3236,15 +4456,29 @@ return avr_out_bitop (insn, operands, NULL); } [(set_attr "length" "2,2,2,4,4") - (set_attr "adjust_len" "*,*,out_bitop,out_bitop,out_bitop") - (set_attr "cc" "set_n,set_n,clobber,clobber,clobber")]) + (set_attr "adjust_len" "*,*,out_bitop,out_bitop,out_bitop")]) -(define_insn "iorpsi3" +(define_insn_and_split "iorpsi3" [(set (match_operand:PSI 0 "register_operand" "=??r,d,r ,r") (ior:PSI (match_operand:PSI 1 "register_operand" "%0,0,0 ,0") (match_operand:PSI 2 "nonmemory_operand" "r,n,Co3,n"))) (clobber (match_scratch:QI 3 "=X,X,X ,&d"))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (ior:PSI (match_dup 1) + (match_dup 
2))) + (clobber (match_dup 3)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*iorpsi3" + [(set (match_operand:PSI 0 "register_operand" "=??r,d,r ,r") + (ior:PSI (match_operand:PSI 1 "register_operand" "%0,0,0 ,0") + (match_operand:PSI 2 "nonmemory_operand" "r,n,Co3,n"))) + (clobber (match_scratch:QI 3 "=X,X,X ,&d")) + (clobber (reg:CC REG_CC))] + "reload_completed" { if (which_alternative == 0) return "or %A0,%A2" CR_TAB @@ -3254,15 +4488,29 @@ return avr_out_bitop (insn, operands, NULL); } [(set_attr "length" "3,3,6,6") - (set_attr "adjust_len" "*,out_bitop,out_bitop,out_bitop") - (set_attr "cc" "set_n,clobber,clobber,clobber")]) + (set_attr "adjust_len" "*,out_bitop,out_bitop,out_bitop")]) -(define_insn "iorsi3" +(define_insn_and_split "iorsi3" [(set (match_operand:SI 0 "register_operand" "=??r,d,r ,r") (ior:SI (match_operand:SI 1 "register_operand" "%0,0,0 ,0") (match_operand:SI 2 "nonmemory_operand" "r,n,Co4,n"))) (clobber (match_scratch:QI 3 "=X,X,X ,&d"))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (ior:SI (match_dup 1) + (match_dup 2))) + (clobber (match_dup 3)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*iorsi3" + [(set (match_operand:SI 0 "register_operand" "=??r,d,r ,r") + (ior:SI (match_operand:SI 1 "register_operand" "%0,0,0 ,0") + (match_operand:SI 2 "nonmemory_operand" "r,n,Co4,n"))) + (clobber (match_scratch:QI 3 "=X,X,X ,&d")) + (clobber (reg:CC REG_CC))] + "reload_completed" { if (which_alternative == 0) return "or %0,%2" CR_TAB @@ -3273,27 +4521,53 @@ return avr_out_bitop (insn, operands, NULL); } [(set_attr "length" "4,4,8,8") - (set_attr "adjust_len" "*,out_bitop,out_bitop,out_bitop") - (set_attr "cc" "set_n,clobber,clobber,clobber")]) + (set_attr "adjust_len" "*,out_bitop,out_bitop,out_bitop")]) ;;^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ;; xor -(define_insn "xorqi3" +(define_insn_and_split "xorqi3" [(set (match_operand:QI 0 "register_operand" "=r") (xor:QI (match_operand:QI 1 "register_operand" "%0") (match_operand:QI 2 "register_operand" "r")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (xor:QI (match_dup 1) + (match_dup 2))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*xorqi3" + [(set (match_operand:QI 0 "register_operand" "=r") + (xor:QI (match_operand:QI 1 "register_operand" "%0") + (match_operand:QI 2 "register_operand" "r"))) + (clobber (reg:CC REG_CC))] + "reload_completed" "eor %0,%2" - [(set_attr "length" "1") - (set_attr "cc" "set_zn")]) + [(set_attr "length" "1")]) -(define_insn "xorhi3" +(define_insn_and_split "xorhi3" [(set (match_operand:HI 0 "register_operand" "=??r,r ,r") (xor:HI (match_operand:HI 1 "register_operand" "%0,0 ,0") (match_operand:HI 2 "nonmemory_operand" "r,Cx2,n"))) (clobber (match_scratch:QI 3 "=X,X ,&d"))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (xor:HI (match_dup 1) + (match_dup 2))) + (clobber (match_dup 3)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*xorhi3" + [(set (match_operand:HI 0 "register_operand" "=??r,r ,r") + (xor:HI (match_operand:HI 1 "register_operand" "%0,0 ,0") + (match_operand:HI 2 "nonmemory_operand" "r,Cx2,n"))) + (clobber (match_scratch:QI 3 "=X,X ,&d")) + (clobber (reg:CC REG_CC))] + "reload_completed" { if (which_alternative == 0) return "eor %A0,%A2\;eor %B0,%B2"; @@ -3301,15 +4575,29 @@ return avr_out_bitop (insn, operands, NULL); } [(set_attr "length" "2,2,4") - (set_attr "adjust_len" "*,out_bitop,out_bitop") - (set_attr "cc" "set_n,clobber,clobber")]) + (set_attr 
"adjust_len" "*,out_bitop,out_bitop")]) -(define_insn "xorpsi3" +(define_insn_and_split "xorpsi3" [(set (match_operand:PSI 0 "register_operand" "=??r,r ,r") (xor:PSI (match_operand:PSI 1 "register_operand" "%0,0 ,0") (match_operand:PSI 2 "nonmemory_operand" "r,Cx3,n"))) (clobber (match_scratch:QI 3 "=X,X ,&d"))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (xor:PSI (match_dup 1) + (match_dup 2))) + (clobber (match_dup 3)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*xorpsi3" + [(set (match_operand:PSI 0 "register_operand" "=??r,r ,r") + (xor:PSI (match_operand:PSI 1 "register_operand" "%0,0 ,0") + (match_operand:PSI 2 "nonmemory_operand" "r,Cx3,n"))) + (clobber (match_scratch:QI 3 "=X,X ,&d")) + (clobber (reg:CC REG_CC))] + "reload_completed" { if (which_alternative == 0) return "eor %A0,%A2" CR_TAB @@ -3319,15 +4607,29 @@ return avr_out_bitop (insn, operands, NULL); } [(set_attr "length" "3,6,6") - (set_attr "adjust_len" "*,out_bitop,out_bitop") - (set_attr "cc" "set_n,clobber,clobber")]) + (set_attr "adjust_len" "*,out_bitop,out_bitop")]) -(define_insn "xorsi3" +(define_insn_and_split "xorsi3" [(set (match_operand:SI 0 "register_operand" "=??r,r ,r") (xor:SI (match_operand:SI 1 "register_operand" "%0,0 ,0") (match_operand:SI 2 "nonmemory_operand" "r,Cx4,n"))) (clobber (match_scratch:QI 3 "=X,X ,&d"))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (xor:SI (match_dup 1) + (match_dup 2))) + (clobber (match_dup 3)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*xorsi3" + [(set (match_operand:SI 0 "register_operand" "=??r,r ,r") + (xor:SI (match_operand:SI 1 "register_operand" "%0,0 ,0") + (match_operand:SI 2 "nonmemory_operand" "r,Cx4,n"))) + (clobber (match_scratch:QI 3 "=X,X ,&d")) + (clobber (reg:CC REG_CC))] + "reload_completed" { if (which_alternative == 0) return "eor %0,%2" CR_TAB @@ -3338,8 +4640,7 @@ return avr_out_bitop (insn, operands, NULL); } [(set_attr "length" "4,8,8") - (set_attr "adjust_len" "*,out_bitop,out_bitop") - (set_attr "cc" "set_n,clobber,clobber")]) + (set_attr "adjust_len" "*,out_bitop,out_bitop")]) (define_split @@ -3424,11 +4725,24 @@ (rotate:QI (match_operand:QI 1 "register_operand" "") (const_int 4)))]) -(define_insn "*rotlqi3" +(define_insn_and_split "*rotlqi3_split" [(set (match_operand:QI 0 "register_operand" "=r,r,r ,r ,r ,r ,r ,r") (rotate:QI (match_operand:QI 1 "register_operand" "0,0,0 ,0 ,0 ,0 ,0 ,0") (match_operand:QI 2 "const_0_to_7_operand" "P,K,C03,C04,C05,C06,C07,L")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (rotate:QI (match_dup 1) + (match_dup 2))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*rotlqi3" + [(set (match_operand:QI 0 "register_operand" "=r,r,r ,r ,r ,r ,r ,r") + (rotate:QI (match_operand:QI 1 "register_operand" "0,0,0 ,0 ,0 ,0 ,0 ,0") + (match_operand:QI 2 "const_0_to_7_operand" "P,K,C03,C04,C05,C06,C07,L"))) + (clobber (reg:CC REG_CC))] + "reload_completed" "@ lsl %0\;adc %0,__zero_reg__ lsl %0\;adc %0,__zero_reg__\;lsl %0\;adc %0,__zero_reg__ @@ -3438,8 +4752,7 @@ swap %0\;lsl %0\;adc %0,__zero_reg__\;lsl %0\;adc %0,__zero_reg__ bst %0,0\;ror %0\;bld %0,7 " ; empty - [(set_attr "length" "2,4,4,1,3,5,3,0") - (set_attr "cc" "set_n,set_n,clobber,none,set_n,set_n,clobber,none")]) + [(set_attr "length" "2,4,4,1,3,5,3,0")]) ;; Split all rotates of HI,SI and PSImode registers where rotation is by ;; a whole number of bytes. 
The split creates the appropriate moves and @@ -3487,59 +4800,131 @@ FAIL; }) -(define_insn "*rotlhi2.1" +(define_insn_and_split "*rotlhi2.1_split" [(set (match_operand:HI 0 "register_operand" "=r") (rotate:HI (match_operand:HI 1 "register_operand" "0") (const_int 1)))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (rotate:HI (match_dup 1) + (const_int 1))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*rotlhi2.1" + [(set (match_operand:HI 0 "register_operand" "=r") + (rotate:HI (match_operand:HI 1 "register_operand" "0") + (const_int 1))) + (clobber (reg:CC REG_CC))] + "reload_completed" "lsl %A0\;rol %B0\;adc %A0,__zero_reg__" - [(set_attr "length" "3") - (set_attr "cc" "clobber")]) + [(set_attr "length" "3")]) -(define_insn "*rotlhi2.15" +(define_insn_and_split "*rotlhi2.15_split" [(set (match_operand:HI 0 "register_operand" "=r") (rotate:HI (match_operand:HI 1 "register_operand" "0") (const_int 15)))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (rotate:HI (match_dup 1) + (const_int 15))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*rotlhi2.15" + [(set (match_operand:HI 0 "register_operand" "=r") + (rotate:HI (match_operand:HI 1 "register_operand" "0") + (const_int 15))) + (clobber (reg:CC REG_CC))] + "reload_completed" "bst %A0,0\;ror %B0\;ror %A0\;bld %B0,7" - [(set_attr "length" "4") - (set_attr "cc" "clobber")]) + [(set_attr "length" "4")]) -(define_insn "*rotlpsi2.1" +(define_insn_and_split "*rotlpsi2.1_split" [(set (match_operand:PSI 0 "register_operand" "=r") (rotate:PSI (match_operand:PSI 1 "register_operand" "0") (const_int 1)))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (rotate:PSI (match_dup 1) + (const_int 1))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*rotlpsi2.1" + [(set (match_operand:PSI 0 "register_operand" "=r") + (rotate:PSI (match_operand:PSI 1 "register_operand" "0") + (const_int 1))) + (clobber (reg:CC REG_CC))] + "reload_completed" "lsl %A0\;rol %B0\;rol %C0\;adc %A0,__zero_reg__" - [(set_attr "length" "4") - (set_attr "cc" "clobber")]) + [(set_attr "length" "4")]) -(define_insn "*rotlpsi2.23" +(define_insn_and_split "*rotlpsi2.23_split" [(set (match_operand:PSI 0 "register_operand" "=r") (rotate:PSI (match_operand:PSI 1 "register_operand" "0") (const_int 23)))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (rotate:PSI (match_dup 1) + (const_int 23))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*rotlpsi2.23" + [(set (match_operand:PSI 0 "register_operand" "=r") + (rotate:PSI (match_operand:PSI 1 "register_operand" "0") + (const_int 23))) + (clobber (reg:CC REG_CC))] + "reload_completed" "bst %A0,0\;ror %C0\;ror %B0\;ror %A0\;bld %C0,7" - [(set_attr "length" "5") - (set_attr "cc" "clobber")]) + [(set_attr "length" "5")]) -(define_insn "*rotlsi2.1" +(define_insn_and_split "*rotlsi2.1_split" [(set (match_operand:SI 0 "register_operand" "=r") (rotate:SI (match_operand:SI 1 "register_operand" "0") (const_int 1)))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (rotate:SI (match_dup 1) + (const_int 1))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*rotlsi2.1" + [(set (match_operand:SI 0 "register_operand" "=r") + (rotate:SI (match_operand:SI 1 "register_operand" "0") + (const_int 1))) + (clobber (reg:CC REG_CC))] + "reload_completed" "lsl %A0\;rol %B0\;rol %C0\;rol %D0\;adc %A0,__zero_reg__" - [(set_attr "length" "5") - (set_attr "cc" "clobber")]) + [(set_attr "length" "5")]) -(define_insn "*rotlsi2.31" +(define_insn_and_split 
"*rotlsi2.31_split" [(set (match_operand:SI 0 "register_operand" "=r") (rotate:SI (match_operand:SI 1 "register_operand" "0") (const_int 31)))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (rotate:SI (match_dup 1) + (const_int 31))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*rotlsi2.31" + [(set (match_operand:SI 0 "register_operand" "=r") + (rotate:SI (match_operand:SI 1 "register_operand" "0") + (const_int 31))) + (clobber (reg:CC REG_CC))] + "reload_completed" "bst %A0,0\;ror %D0\;ror %C0\;ror %B0\;ror %A0\;bld %D0,7" - [(set_attr "length" "6") - (set_attr "cc" "clobber")]) + [(set_attr "length" "6")]) ;; Overlapping non-HImode registers often (but not always) need a scratch. ;; The best we can do is use early clobber alternative "#&r" so that @@ -3644,29 +5029,53 @@ ;; "*ashlqi3" ;; "*ashlqq3" "*ashluqq3" -(define_insn "*ashl3" +(define_insn_and_split "*ashl3_split" [(set (match_operand:ALL1 0 "register_operand" "=r,r,r,r,!d,r,r") (ashift:ALL1 (match_operand:ALL1 1 "register_operand" "0,0,0,0,0 ,0,0") (match_operand:QI 2 "nop_general_operand" "r,L,P,K,n ,n,Qm")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (ashift:ALL1 (match_dup 1) + (match_dup 2))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*ashl3" + [(set (match_operand:ALL1 0 "register_operand" "=r,r,r,r,!d,r,r") + (ashift:ALL1 (match_operand:ALL1 1 "register_operand" "0,0,0,0,0 ,0,0") + (match_operand:QI 2 "nop_general_operand" "r,L,P,K,n ,n,Qm"))) + (clobber (reg:CC REG_CC))] + "reload_completed" { return ashlqi3_out (insn, operands, NULL); } [(set_attr "length" "5,0,1,2,4,6,9") - (set_attr "adjust_len" "ashlqi") - (set_attr "cc" "clobber,none,set_czn,set_czn,set_czn,set_czn,clobber")]) + (set_attr "adjust_len" "ashlqi")]) -(define_insn "ashl3" +(define_insn_and_split "ashl3" [(set (match_operand:ALL2 0 "register_operand" "=r,r,r,r,r,r,r") (ashift:ALL2 (match_operand:ALL2 1 "register_operand" "0,0,0,r,0,0,0") (match_operand:QI 2 "nop_general_operand" "r,L,P,O,K,n,Qm")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (ashift:ALL2 (match_dup 1) + (match_dup 2))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*ashl3" + [(set (match_operand:ALL2 0 "register_operand" "=r,r,r,r,r,r,r") + (ashift:ALL2 (match_operand:ALL2 1 "register_operand" "0,0,0,r,0,0,0") + (match_operand:QI 2 "nop_general_operand" "r,L,P,O,K,n,Qm"))) + (clobber (reg:CC REG_CC))] + "reload_completed" { return ashlhi3_out (insn, operands, NULL); } [(set_attr "length" "6,0,2,2,4,10,10") - (set_attr "adjust_len" "ashlhi") - (set_attr "cc" "clobber,none,set_n,clobber,set_n,clobber,clobber")]) + (set_attr "adjust_len" "ashlhi")]) ;; Insns like the following are generated when (implicitly) extending 8-bit shifts @@ -3752,17 +5161,29 @@ ;; "ashlsi3" ;; "ashlsq3" "ashlusq3" ;; "ashlsa3" "ashlusa3" -(define_insn "ashl3" +(define_insn_and_split "ashl3" [(set (match_operand:ALL4 0 "register_operand" "=r,r,r,r,r,r,r") (ashift:ALL4 (match_operand:ALL4 1 "register_operand" "0,0,0,r,0,0,0") (match_operand:QI 2 "nop_general_operand" "r,L,P,O,K,n,Qm")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (ashift:ALL4 (match_dup 1) + (match_dup 2))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*ashl3" + [(set (match_operand:ALL4 0 "register_operand" "=r,r,r,r,r,r,r") + (ashift:ALL4 (match_operand:ALL4 1 "register_operand" "0,0,0,r,0,0,0") + (match_operand:QI 2 "nop_general_operand" "r,L,P,O,K,n,Qm"))) + (clobber (reg:CC REG_CC))] + "reload_completed" { return ashlsi3_out (insn, 
operands, NULL); } [(set_attr "length" "8,0,4,4,8,10,12") - (set_attr "adjust_len" "ashlsi") - (set_attr "cc" "clobber,none,set_n,clobber,set_n,clobber,clobber")]) + (set_attr "adjust_len" "ashlsi")]) ;; Optimize if a scratch register from LD_REGS happens to be available. @@ -3821,18 +5242,32 @@ ;; "*ashlhi3_const" ;; "*ashlhq3_const" "*ashluhq3_const" ;; "*ashlha3_const" "*ashluha3_const" -(define_insn "*ashl3_const" +(define_insn_and_split "*ashl3_const_split" [(set (match_operand:ALL2 0 "register_operand" "=r,r,r,r,r") (ashift:ALL2 (match_operand:ALL2 1 "register_operand" "0,0,r,0,0") (match_operand:QI 2 "const_int_operand" "L,P,O,K,n"))) (clobber (match_scratch:QI 3 "=X,X,X,X,&d"))] "reload_completed" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (ashift:ALL2 (match_dup 1) + (match_dup 2))) + (clobber (match_dup 3)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*ashl3_const" + [(set (match_operand:ALL2 0 "register_operand" "=r,r,r,r,r") + (ashift:ALL2 (match_operand:ALL2 1 "register_operand" "0,0,r,0,0") + (match_operand:QI 2 "const_int_operand" "L,P,O,K,n"))) + (clobber (match_scratch:QI 3 "=X,X,X,X,&d")) + (clobber (reg:CC REG_CC))] + "reload_completed" { return ashlhi3_out (insn, operands, NULL); } [(set_attr "length" "0,2,2,4,10") - (set_attr "adjust_len" "ashlhi") - (set_attr "cc" "none,set_n,clobber,set_n,clobber")]) + (set_attr "adjust_len" "ashlhi")]) (define_peephole2 [(match_scratch:QI 3 "d") @@ -3848,18 +5283,32 @@ ;; "*ashlsi3_const" ;; "*ashlsq3_const" "*ashlusq3_const" ;; "*ashlsa3_const" "*ashlusa3_const" -(define_insn "*ashl3_const" +(define_insn_and_split "*ashl3_const_split" [(set (match_operand:ALL4 0 "register_operand" "=r,r,r,r") (ashift:ALL4 (match_operand:ALL4 1 "register_operand" "0,0,r,0") (match_operand:QI 2 "const_int_operand" "L,P,O,n"))) (clobber (match_scratch:QI 3 "=X,X,X,&d"))] "reload_completed" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (ashift:ALL4 (match_dup 1) + (match_dup 2))) + (clobber (match_dup 3)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*ashl3_const" + [(set (match_operand:ALL4 0 "register_operand" "=r,r,r,r") + (ashift:ALL4 (match_operand:ALL4 1 "register_operand" "0,0,r,0") + (match_operand:QI 2 "const_int_operand" "L,P,O,n"))) + (clobber (match_scratch:QI 3 "=X,X,X,&d")) + (clobber (reg:CC REG_CC))] + "reload_completed" { return ashlsi3_out (insn, operands, NULL); } [(set_attr "length" "0,4,4,10") - (set_attr "adjust_len" "ashlsi") - (set_attr "cc" "none,set_n,clobber,clobber")]) + (set_attr "adjust_len" "ashlsi")]) (define_expand "ashlpsi3" [(parallel [(set (match_operand:PSI 0 "register_operand" "") @@ -3888,76 +5337,140 @@ } }) -(define_insn "*ashlpsi3" +(define_insn_and_split "*ashlpsi3_split" [(set (match_operand:PSI 0 "register_operand" "=r,r,r,r") (ashift:PSI (match_operand:PSI 1 "register_operand" "0,0,r,0") (match_operand:QI 2 "nonmemory_operand" "r,P,O,n"))) (clobber (match_scratch:QI 3 "=X,X,X,&d"))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (ashift:PSI (match_dup 1) + (match_dup 2))) + (clobber (match_dup 3)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*ashlpsi3" + [(set (match_operand:PSI 0 "register_operand" "=r,r,r,r") + (ashift:PSI (match_operand:PSI 1 "register_operand" "0,0,r,0") + (match_operand:QI 2 "nonmemory_operand" "r,P,O,n"))) + (clobber (match_scratch:QI 3 "=X,X,X,&d")) + (clobber (reg:CC REG_CC))] + "reload_completed" { return avr_out_ashlpsi3 (insn, operands, NULL); } - [(set_attr "adjust_len" "ashlpsi") - (set_attr "cc" 
"clobber")]) + [(set_attr "adjust_len" "ashlpsi")]) ;; >> >> >> >> >> >> >> >> >> >> >> >> >> >> >> >> >> >> >> >> >> >> >> >> >> ;; arithmetic shift right ;; "ashrqi3" ;; "ashrqq3" "ashruqq3" -(define_insn "ashr3" +(define_insn_and_split "ashr3" [(set (match_operand:ALL1 0 "register_operand" "=r,r,r,r,r ,r ,r") (ashiftrt:ALL1 (match_operand:ALL1 1 "register_operand" "0,0,0,0,0 ,0 ,0") (match_operand:QI 2 "nop_general_operand" "r,L,P,K,C03 C04 C05,C06 C07,Qm")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (ashiftrt:ALL1 (match_dup 1) + (match_dup 2))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*ashr3" + [(set (match_operand:ALL1 0 "register_operand" "=r,r,r,r,r ,r ,r") + (ashiftrt:ALL1 (match_operand:ALL1 1 "register_operand" "0,0,0,0,0 ,0 ,0") + (match_operand:QI 2 "nop_general_operand" "r,L,P,K,C03 C04 C05,C06 C07,Qm"))) + (clobber (reg:CC REG_CC))] + "reload_completed" { return ashrqi3_out (insn, operands, NULL); } [(set_attr "length" "5,0,1,2,5,4,9") - (set_attr "adjust_len" "ashrqi") - (set_attr "cc" "clobber,none,set_czn,set_czn,set_czn,clobber,clobber")]) + (set_attr "adjust_len" "ashrqi")]) ;; "ashrhi3" ;; "ashrhq3" "ashruhq3" ;; "ashrha3" "ashruha3" -(define_insn "ashr3" +(define_insn_and_split "ashr3" [(set (match_operand:ALL2 0 "register_operand" "=r,r,r,r,r,r,r") (ashiftrt:ALL2 (match_operand:ALL2 1 "register_operand" "0,0,0,r,0,0,0") (match_operand:QI 2 "nop_general_operand" "r,L,P,O,K,n,Qm")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_operand:ALL2 0 "register_operand" "=r,r,r,r,r,r,r") + (ashiftrt:ALL2 (match_operand:ALL2 1 "register_operand" "0,0,0,r,0,0,0") + (match_operand:QI 2 "nop_general_operand" "r,L,P,O,K,n,Qm"))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*ashr3" + [(set (match_operand:ALL2 0 "register_operand" "=r,r,r,r,r,r,r") + (ashiftrt:ALL2 (match_operand:ALL2 1 "register_operand" "0,0,0,r,0,0,0") + (match_operand:QI 2 "nop_general_operand" "r,L,P,O,K,n,Qm"))) + (clobber (reg:CC REG_CC))] + "reload_completed" { return ashrhi3_out (insn, operands, NULL); } [(set_attr "length" "6,0,2,4,4,10,10") - (set_attr "adjust_len" "ashrhi") - (set_attr "cc" "clobber,none,clobber,set_n,clobber,clobber,clobber")]) + (set_attr "adjust_len" "ashrhi")]) -(define_insn "ashrpsi3" +(define_insn_and_split "ashrpsi3" [(set (match_operand:PSI 0 "register_operand" "=r,r,r,r,r") (ashiftrt:PSI (match_operand:PSI 1 "register_operand" "0,0,0,r,0") (match_operand:QI 2 "nonmemory_operand" "r,P,K,O,n"))) (clobber (match_scratch:QI 3 "=X,X,X,X,&d"))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (ashiftrt:PSI (match_dup 1) + (match_dup 2))) + (clobber (match_dup 3)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*ashrpsi3" + [(set (match_operand:PSI 0 "register_operand" "=r,r,r,r,r") + (ashiftrt:PSI (match_operand:PSI 1 "register_operand" "0,0,0,r,0") + (match_operand:QI 2 "nonmemory_operand" "r,P,K,O,n"))) + (clobber (match_scratch:QI 3 "=X,X,X,X,&d")) + (clobber (reg:CC REG_CC))] + "reload_completed" { return avr_out_ashrpsi3 (insn, operands, NULL); } - [(set_attr "adjust_len" "ashrpsi") - (set_attr "cc" "clobber")]) + [(set_attr "adjust_len" "ashrpsi")]) ;; "ashrsi3" ;; "ashrsq3" "ashrusq3" ;; "ashrsa3" "ashrusa3" -(define_insn "ashr3" +(define_insn_and_split "ashr3" + [(set (match_operand:ALL4 0 "register_operand" "=r,r,r,r,r,r,r") + (ashiftrt:ALL4 (match_operand:ALL4 1 "register_operand" "0,0,0,r,0,0,0") + (match_operand:QI 2 "nop_general_operand" "r,L,P,O,K,n,Qm")))] + "" + "#" + "&& reload_completed" + 
[(parallel [(set (match_dup 0) + (ashiftrt:ALL4 (match_dup 1) + (match_dup 2))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*ashr3" [(set (match_operand:ALL4 0 "register_operand" "=r,r,r,r,r,r,r") (ashiftrt:ALL4 (match_operand:ALL4 1 "register_operand" "0,0,0,r,0,0,0") - (match_operand:QI 2 "nop_general_operand" "r,L,P,O,K,n,Qm")))] - "" + (match_operand:QI 2 "nop_general_operand" "r,L,P,O,K,n,Qm"))) + (clobber (reg:CC REG_CC))] + "reload_completed" { return ashrsi3_out (insn, operands, NULL); } [(set_attr "length" "8,0,4,6,8,10,12") - (set_attr "adjust_len" "ashrsi") - (set_attr "cc" "clobber,none,clobber,set_n,clobber,clobber,clobber")]) + (set_attr "adjust_len" "ashrsi")]) ;; Optimize if a scratch register from LD_REGS happens to be available. @@ -3975,18 +5488,32 @@ ;; "*ashrhi3_const" ;; "*ashrhq3_const" "*ashruhq3_const" ;; "*ashrha3_const" "*ashruha3_const" -(define_insn "*ashr3_const" +(define_insn_and_split "*ashr3_const_split" [(set (match_operand:ALL2 0 "register_operand" "=r,r,r,r,r") (ashiftrt:ALL2 (match_operand:ALL2 1 "register_operand" "0,0,r,0,0") (match_operand:QI 2 "const_int_operand" "L,P,O,K,n"))) (clobber (match_scratch:QI 3 "=X,X,X,X,&d"))] "reload_completed" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (ashiftrt:ALL2 (match_dup 1) + (match_dup 2))) + (clobber (match_dup 3)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*ashr3_const" + [(set (match_operand:ALL2 0 "register_operand" "=r,r,r,r,r") + (ashiftrt:ALL2 (match_operand:ALL2 1 "register_operand" "0,0,r,0,0") + (match_operand:QI 2 "const_int_operand" "L,P,O,K,n"))) + (clobber (match_scratch:QI 3 "=X,X,X,X,&d")) + (clobber (reg:CC REG_CC))] + "reload_completed" { return ashrhi3_out (insn, operands, NULL); } [(set_attr "length" "0,2,4,4,10") - (set_attr "adjust_len" "ashrhi") - (set_attr "cc" "none,clobber,set_n,clobber,clobber")]) + (set_attr "adjust_len" "ashrhi")]) (define_peephole2 [(match_scratch:QI 3 "d") @@ -4002,18 +5529,32 @@ ;; "*ashrsi3_const" ;; "*ashrsq3_const" "*ashrusq3_const" ;; "*ashrsa3_const" "*ashrusa3_const" -(define_insn "*ashr3_const" +(define_insn_and_split "*ashr3_const_split" [(set (match_operand:ALL4 0 "register_operand" "=r,r,r,r") (ashiftrt:ALL4 (match_operand:ALL4 1 "register_operand" "0,0,r,0") (match_operand:QI 2 "const_int_operand" "L,P,O,n"))) (clobber (match_scratch:QI 3 "=X,X,X,&d"))] "reload_completed" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (ashiftrt:ALL4 (match_dup 1) + (match_dup 2))) + (clobber (match_dup 3)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*ashr3_const" + [(set (match_operand:ALL4 0 "register_operand" "=r,r,r,r") + (ashiftrt:ALL4 (match_operand:ALL4 1 "register_operand" "0,0,r,0") + (match_operand:QI 2 "const_int_operand" "L,P,O,n"))) + (clobber (match_scratch:QI 3 "=X,X,X,&d")) + (clobber (reg:CC REG_CC))] + "reload_completed" { return ashrsi3_out (insn, operands, NULL); } [(set_attr "length" "0,4,4,10") - (set_attr "adjust_len" "ashrsi") - (set_attr "cc" "none,clobber,set_n,clobber")]) + (set_attr "adjust_len" "ashrsi")]) ;; >> >> >> >> >> >> >> >> >> >> >> >> >> >> >> >> >> >> >> >> >> >> >> >> >> ;; logical shift right @@ -4067,59 +5608,109 @@ ;; "*lshrqi3" ;; "*lshrqq3" ;; "*lshruqq3" -(define_insn "*lshr3" +(define_insn_and_split "*lshr3_split" [(set (match_operand:ALL1 0 "register_operand" "=r,r,r,r,!d,r,r") (lshiftrt:ALL1 (match_operand:ALL1 1 "register_operand" "0,0,0,0,0 ,0,0") (match_operand:QI 2 "nop_general_operand" "r,L,P,K,n ,n,Qm")))] "" + "#" + "&& reload_completed" + [(parallel [(set 
(match_dup 0) + (lshiftrt:ALL1 (match_dup 1) + (match_dup 2))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*lshr3" + [(set (match_operand:ALL1 0 "register_operand" "=r,r,r,r,!d,r,r") + (lshiftrt:ALL1 (match_operand:ALL1 1 "register_operand" "0,0,0,0,0 ,0,0") + (match_operand:QI 2 "nop_general_operand" "r,L,P,K,n ,n,Qm"))) + (clobber (reg:CC REG_CC))] + "reload_completed" { return lshrqi3_out (insn, operands, NULL); } [(set_attr "length" "5,0,1,2,4,6,9") - (set_attr "adjust_len" "lshrqi") - (set_attr "cc" "clobber,none,set_czn,set_czn,set_czn,set_czn,clobber")]) + (set_attr "adjust_len" "lshrqi")]) ;; "lshrhi3" ;; "lshrhq3" "lshruhq3" ;; "lshrha3" "lshruha3" -(define_insn "lshr3" +(define_insn_and_split "lshr3" [(set (match_operand:ALL2 0 "register_operand" "=r,r,r,r,r,r,r") (lshiftrt:ALL2 (match_operand:ALL2 1 "register_operand" "0,0,0,r,0,0,0") (match_operand:QI 2 "nop_general_operand" "r,L,P,O,K,n,Qm")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (lshiftrt:ALL2 (match_dup 1) + (match_dup 2))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*lshr3" + [(set (match_operand:ALL2 0 "register_operand" "=r,r,r,r,r,r,r") + (lshiftrt:ALL2 (match_operand:ALL2 1 "register_operand" "0,0,0,r,0,0,0") + (match_operand:QI 2 "nop_general_operand" "r,L,P,O,K,n,Qm"))) + (clobber (reg:CC REG_CC))] + "reload_completed" { return lshrhi3_out (insn, operands, NULL); } [(set_attr "length" "6,0,2,2,4,10,10") - (set_attr "adjust_len" "lshrhi") - (set_attr "cc" "clobber,none,clobber,clobber,clobber,clobber,clobber")]) + (set_attr "adjust_len" "lshrhi")]) -(define_insn "lshrpsi3" +(define_insn_and_split "lshrpsi3" [(set (match_operand:PSI 0 "register_operand" "=r,r,r,r,r") (lshiftrt:PSI (match_operand:PSI 1 "register_operand" "0,0,r,0,0") (match_operand:QI 2 "nonmemory_operand" "r,P,O,K,n"))) (clobber (match_scratch:QI 3 "=X,X,X,X,&d"))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (lshiftrt:PSI (match_dup 1) + (match_dup 2))) + (clobber (match_dup 3)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*lshrpsi3" + [(set (match_operand:PSI 0 "register_operand" "=r,r,r,r,r") + (lshiftrt:PSI (match_operand:PSI 1 "register_operand" "0,0,r,0,0") + (match_operand:QI 2 "nonmemory_operand" "r,P,O,K,n"))) + (clobber (match_scratch:QI 3 "=X,X,X,X,&d")) + (clobber (reg:CC REG_CC))] + "reload_completed" { return avr_out_lshrpsi3 (insn, operands, NULL); } - [(set_attr "adjust_len" "lshrpsi") - (set_attr "cc" "clobber")]) + [(set_attr "adjust_len" "lshrpsi")]) ;; "lshrsi3" ;; "lshrsq3" "lshrusq3" ;; "lshrsa3" "lshrusa3" -(define_insn "lshr3" +(define_insn_and_split "lshr3" [(set (match_operand:ALL4 0 "register_operand" "=r,r,r,r,r,r,r") (lshiftrt:ALL4 (match_operand:ALL4 1 "register_operand" "0,0,0,r,0,0,0") (match_operand:QI 2 "nop_general_operand" "r,L,P,O,K,n,Qm")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (lshiftrt:ALL4 (match_dup 1) + (match_dup 2))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*lshr3" + [(set (match_operand:ALL4 0 "register_operand" "=r,r,r,r,r,r,r") + (lshiftrt:ALL4 (match_operand:ALL4 1 "register_operand" "0,0,0,r,0,0,0") + (match_operand:QI 2 "nop_general_operand" "r,L,P,O,K,n,Qm"))) + (clobber (reg:CC REG_CC))] + "reload_completed" { return lshrsi3_out (insn, operands, NULL); } [(set_attr "length" "8,0,4,4,8,10,12") - (set_attr "adjust_len" "lshrsi") - (set_attr "cc" "clobber,none,clobber,clobber,clobber,clobber,clobber")]) + (set_attr "adjust_len" "lshrsi")]) ;; Optimize if a scratch register from LD_REGS happens to 
be available. @@ -4178,18 +5769,32 @@ ;; "*lshrhi3_const" ;; "*lshrhq3_const" "*lshruhq3_const" ;; "*lshrha3_const" "*lshruha3_const" -(define_insn "*lshr3_const" +(define_insn_and_split "*lshr3_const_split" [(set (match_operand:ALL2 0 "register_operand" "=r,r,r,r,r") (lshiftrt:ALL2 (match_operand:ALL2 1 "register_operand" "0,0,r,0,0") (match_operand:QI 2 "const_int_operand" "L,P,O,K,n"))) (clobber (match_scratch:QI 3 "=X,X,X,X,&d"))] "reload_completed" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (lshiftrt:ALL2 (match_dup 1) + (match_dup 2))) + (clobber (match_dup 3)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*lshr3_const" + [(set (match_operand:ALL2 0 "register_operand" "=r,r,r,r,r") + (lshiftrt:ALL2 (match_operand:ALL2 1 "register_operand" "0,0,r,0,0") + (match_operand:QI 2 "const_int_operand" "L,P,O,K,n"))) + (clobber (match_scratch:QI 3 "=X,X,X,X,&d")) + (clobber (reg:CC REG_CC))] + "reload_completed" { return lshrhi3_out (insn, operands, NULL); } [(set_attr "length" "0,2,2,4,10") - (set_attr "adjust_len" "lshrhi") - (set_attr "cc" "none,clobber,clobber,clobber,clobber")]) + (set_attr "adjust_len" "lshrhi")]) (define_peephole2 [(match_scratch:QI 3 "d") @@ -4205,143 +5810,279 @@ ;; "*lshrsi3_const" ;; "*lshrsq3_const" "*lshrusq3_const" ;; "*lshrsa3_const" "*lshrusa3_const" -(define_insn "*lshr3_const" +(define_insn_and_split "*lshr3_const_split" [(set (match_operand:ALL4 0 "register_operand" "=r,r,r,r") (lshiftrt:ALL4 (match_operand:ALL4 1 "register_operand" "0,0,r,0") (match_operand:QI 2 "const_int_operand" "L,P,O,n"))) (clobber (match_scratch:QI 3 "=X,X,X,&d"))] "reload_completed" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (lshiftrt:ALL4 (match_dup 1) + (match_dup 2))) + (clobber (match_dup 3)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*lshr3_const" + [(set (match_operand:ALL4 0 "register_operand" "=r,r,r,r") + (lshiftrt:ALL4 (match_operand:ALL4 1 "register_operand" "0,0,r,0") + (match_operand:QI 2 "const_int_operand" "L,P,O,n"))) + (clobber (match_scratch:QI 3 "=X,X,X,&d")) + (clobber (reg:CC REG_CC))] + "reload_completed" { return lshrsi3_out (insn, operands, NULL); } [(set_attr "length" "0,4,4,10") - (set_attr "adjust_len" "lshrsi") - (set_attr "cc" "none,clobber,clobber,clobber")]) + (set_attr "adjust_len" "lshrsi")]) ;; abs(x) abs(x) abs(x) abs(x) abs(x) abs(x) abs(x) abs(x) abs(x) abs(x) abs(x) ;; abs -(define_insn "absqi2" +(define_insn_and_split "absqi2" [(set (match_operand:QI 0 "register_operand" "=r") (abs:QI (match_operand:QI 1 "register_operand" "0")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (abs:QI (match_dup 1))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*absqi2" + [(set (match_operand:QI 0 "register_operand" "=r") + (abs:QI (match_operand:QI 1 "register_operand" "0"))) + (clobber (reg:CC REG_CC))] + "reload_completed" "sbrc %0,7 neg %0" - [(set_attr "length" "2") - (set_attr "cc" "clobber")]) + [(set_attr "length" "2")]) -(define_insn "abssf2" +(define_insn_and_split "abssf2" [(set (match_operand:SF 0 "register_operand" "=d,r") (abs:SF (match_operand:SF 1 "register_operand" "0,0")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (abs:SF (match_dup 1))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*abssf2" + [(set (match_operand:SF 0 "register_operand" "=d,r") + (abs:SF (match_operand:SF 1 "register_operand" "0,0"))) + (clobber (reg:CC REG_CC))] + "reload_completed" "@ andi %D0,0x7f clt\;bld %D0,7" - [(set_attr "length" "1,2") - (set_attr "cc" 
"set_n,clobber")]) + [(set_attr "length" "1,2")]) ;; 0 - x 0 - x 0 - x 0 - x 0 - x 0 - x 0 - x 0 - x 0 - x 0 - x 0 - x ;; neg -(define_insn "negqi2" +(define_insn_and_split "negqi2" [(set (match_operand:QI 0 "register_operand" "=r") (neg:QI (match_operand:QI 1 "register_operand" "0")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (neg:QI (match_dup 1))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*negqi2" + [(set (match_operand:QI 0 "register_operand" "=r") + (neg:QI (match_operand:QI 1 "register_operand" "0"))) + (clobber (reg:CC REG_CC))] + "reload_completed" "neg %0" - [(set_attr "length" "1") - (set_attr "cc" "set_vzn")]) + [(set_attr "length" "1")]) -(define_insn "*negqihi2" +(define_insn_and_split "*negqihi2_split" [(set (match_operand:HI 0 "register_operand" "=r") (neg:HI (sign_extend:HI (match_operand:QI 1 "register_operand" "0"))))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (neg:HI (sign_extend:HI (match_dup 1)))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*negqihi2" + [(set (match_operand:HI 0 "register_operand" "=r") + (neg:HI (sign_extend:HI (match_operand:QI 1 "register_operand" "0")))) + (clobber (reg:CC REG_CC))] + "reload_completed" "clr %B0\;neg %A0\;brge .+2\;com %B0" - [(set_attr "length" "4") - (set_attr "cc" "set_n")]) + [(set_attr "length" "4")]) -(define_insn "neghi2" +(define_insn_and_split "neghi2" [(set (match_operand:HI 0 "register_operand" "=r,&r") (neg:HI (match_operand:HI 1 "register_operand" "0,r")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (neg:HI (match_dup 1))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*neghi2" + [(set (match_operand:HI 0 "register_operand" "=r,&r") + (neg:HI (match_operand:HI 1 "register_operand" "0,r"))) + (clobber (reg:CC REG_CC))] + "reload_completed" "@ neg %B0\;neg %A0\;sbc %B0,__zero_reg__ clr %A0\;clr %B0\;sub %A0,%A1\;sbc %B0,%B1" - [(set_attr "length" "3,4") - (set_attr "cc" "set_czn")]) + [(set_attr "length" "3,4")]) -(define_insn "negpsi2" +(define_insn_and_split "negpsi2" [(set (match_operand:PSI 0 "register_operand" "=!d,r,&r") (neg:PSI (match_operand:PSI 1 "register_operand" "0,0,r")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (neg:PSI (match_dup 1))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*negpsi2" + [(set (match_operand:PSI 0 "register_operand" "=!d,r,&r") + (neg:PSI (match_operand:PSI 1 "register_operand" "0,0,r"))) + (clobber (reg:CC REG_CC))] + "reload_completed" "@ com %C0\;com %B0\;neg %A0\;sbci %B0,-1\;sbci %C0,-1 com %C0\;com %B0\;com %A0\;adc %A0,__zero_reg__\;adc %B0,__zero_reg__\;adc %C0,__zero_reg__ clr %A0\;clr %B0\;clr %C0\;sub %A0,%A1\;sbc %B0,%B1\;sbc %C0,%C1" - [(set_attr "length" "5,6,6") - (set_attr "cc" "set_czn,set_n,set_czn")]) + [(set_attr "length" "5,6,6")]) -(define_insn "negsi2" +(define_insn_and_split "negsi2" [(set (match_operand:SI 0 "register_operand" "=!d,r,&r,&r") (neg:SI (match_operand:SI 1 "register_operand" "0,0,r ,r")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (neg:SI (match_dup 1))) + (clobber (reg:CC REG_CC))])] + "" + [(set_attr "isa" "*,*,mov,movw")]) + +(define_insn "*negsi2" + [(set (match_operand:SI 0 "register_operand" "=!d,r,&r,&r") + (neg:SI (match_operand:SI 1 "register_operand" "0,0,r ,r"))) + (clobber (reg:CC REG_CC))] + "reload_completed" "@ com %D0\;com %C0\;com %B0\;neg %A0\;sbci %B0,lo8(-1)\;sbci %C0,lo8(-1)\;sbci %D0,lo8(-1) com %D0\;com %C0\;com %B0\;com %A0\;adc %A0,__zero_reg__\;adc %B0,__zero_reg__\;adc 
%C0,__zero_reg__\;adc %D0,__zero_reg__ clr %A0\;clr %B0\;clr %C0\;clr %D0\;sub %A0,%A1\;sbc %B0,%B1\;sbc %C0,%C1\;sbc %D0,%D1 clr %A0\;clr %B0\;movw %C0,%A0\;sub %A0,%A1\;sbc %B0,%B1\;sbc %C0,%C1\;sbc %D0,%D1" [(set_attr "length" "7,8,8,7") - (set_attr "isa" "*,*,mov,movw") - (set_attr "cc" "set_czn,set_n,set_czn,set_czn")]) + (set_attr "isa" "*,*,mov,movw")]) -(define_insn "negsf2" +(define_insn_and_split "negsf2" [(set (match_operand:SF 0 "register_operand" "=d,r") (neg:SF (match_operand:SF 1 "register_operand" "0,0")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (neg:SF (match_dup 1))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*negsf2" + [(set (match_operand:SF 0 "register_operand" "=d,r") + (neg:SF (match_operand:SF 1 "register_operand" "0,0"))) + (clobber (reg:CC REG_CC))] + "reload_completed" "@ subi %D0,0x80 bst %D0,7\;com %D0\;bld %D0,7\;com %D0" - [(set_attr "length" "1,4") - (set_attr "cc" "set_n,set_n")]) + [(set_attr "length" "1,4")]) ;; !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ;; not -(define_insn "one_cmplqi2" +(define_insn_and_split "one_cmplqi2" [(set (match_operand:QI 0 "register_operand" "=r") (not:QI (match_operand:QI 1 "register_operand" "0")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (not:QI (match_dup 1))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*one_cmplqi2" + [(set (match_operand:QI 0 "register_operand" "=r") + (not:QI (match_operand:QI 1 "register_operand" "0"))) + (clobber (reg:CC REG_CC))] + "reload_completed" "com %0" - [(set_attr "length" "1") - (set_attr "cc" "set_czn")]) + [(set_attr "length" "1")]) -(define_insn "one_cmplhi2" +(define_insn_and_split "one_cmplhi2" [(set (match_operand:HI 0 "register_operand" "=r") (not:HI (match_operand:HI 1 "register_operand" "0")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (not:HI (match_dup 1))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*one_cmplhi2" + [(set (match_operand:HI 0 "register_operand" "=r") + (not:HI (match_operand:HI 1 "register_operand" "0"))) + (clobber (reg:CC REG_CC))] + "reload_completed" "com %0 com %B0" - [(set_attr "length" "2") - (set_attr "cc" "set_n")]) + [(set_attr "length" "2")]) -(define_insn "one_cmplpsi2" +(define_insn_and_split "one_cmplpsi2" [(set (match_operand:PSI 0 "register_operand" "=r") (not:PSI (match_operand:PSI 1 "register_operand" "0")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (not:PSI (match_dup 1))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*one_cmplpsi2" + [(set (match_operand:PSI 0 "register_operand" "=r") + (not:PSI (match_operand:PSI 1 "register_operand" "0"))) + (clobber (reg:CC REG_CC))] + "reload_completed" "com %0\;com %B0\;com %C0" - [(set_attr "length" "3") - (set_attr "cc" "set_n")]) + [(set_attr "length" "3")]) -(define_insn "one_cmplsi2" +(define_insn_and_split "one_cmplsi2" [(set (match_operand:SI 0 "register_operand" "=r") (not:SI (match_operand:SI 1 "register_operand" "0")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (not:SI (match_dup 1))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*one_cmplsi2" + [(set (match_operand:SI 0 "register_operand" "=r") + (not:SI (match_operand:SI 1 "register_operand" "0"))) + (clobber (reg:CC REG_CC))] + "reload_completed" "com %0 com %B0 com %C0 com %D0" - [(set_attr "length" "4") - (set_attr "cc" "set_n")]) + [(set_attr "length" "4")]) ;; xx<---x xx<---x xx<---x xx<---x xx<---x xx<---x xx<---x xx<---x xx<---x ;; sign extend @@ 
-4353,71 +6094,131 @@ ;; multiplication. There is no need for combine to propagate hard registers, ;; register allocation can do it just as well. -(define_insn "extendqihi2" +(define_insn_and_split "extendqihi2" [(set (match_operand:HI 0 "register_operand" "=r,r") (sign_extend:HI (match_operand:QI 1 "combine_pseudo_register_operand" "0,*r")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (sign_extend:HI (match_dup 1))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*extendqihi2" + [(set (match_operand:HI 0 "register_operand" "=r,r") + (sign_extend:HI (match_operand:QI 1 "combine_pseudo_register_operand" "0,*r"))) + (clobber (reg:CC REG_CC))] + "reload_completed" { return avr_out_sign_extend (insn, operands, NULL); } [(set_attr "length" "3,4") - (set_attr "adjust_len" "sext") - (set_attr "cc" "set_n")]) + (set_attr "adjust_len" "sext")]) -(define_insn "extendqipsi2" +(define_insn_and_split "extendqipsi2" [(set (match_operand:PSI 0 "register_operand" "=r,r") (sign_extend:PSI (match_operand:QI 1 "combine_pseudo_register_operand" "0,*r")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (sign_extend:PSI (match_dup 1))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*extendqipsi2" + [(set (match_operand:PSI 0 "register_operand" "=r,r") + (sign_extend:PSI (match_operand:QI 1 "combine_pseudo_register_operand" "0,*r"))) + (clobber (reg:CC REG_CC))] + "reload_completed" { return avr_out_sign_extend (insn, operands, NULL); } [(set_attr "length" "4,5") - (set_attr "adjust_len" "sext") - (set_attr "cc" "set_n")]) + (set_attr "adjust_len" "sext")]) -(define_insn "extendqisi2" +(define_insn_and_split "extendqisi2" [(set (match_operand:SI 0 "register_operand" "=r,r") (sign_extend:SI (match_operand:QI 1 "combine_pseudo_register_operand" "0,*r")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (sign_extend:SI (match_dup 1))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*extendqisi2" + [(set (match_operand:SI 0 "register_operand" "=r,r") + (sign_extend:SI (match_operand:QI 1 "combine_pseudo_register_operand" "0,*r"))) + (clobber (reg:CC REG_CC))] + "reload_completed" { return avr_out_sign_extend (insn, operands, NULL); } [(set_attr "length" "5,6") - (set_attr "adjust_len" "sext") - (set_attr "cc" "set_n")]) + (set_attr "adjust_len" "sext")]) -(define_insn "extendhipsi2" +(define_insn_and_split "extendhipsi2" [(set (match_operand:PSI 0 "register_operand" "=r,r") (sign_extend:PSI (match_operand:HI 1 "combine_pseudo_register_operand" "0,*r")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (sign_extend:PSI (match_dup 1))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*extendhipsi2" + [(set (match_operand:PSI 0 "register_operand" "=r,r") + (sign_extend:PSI (match_operand:HI 1 "combine_pseudo_register_operand" "0,*r"))) + (clobber (reg:CC REG_CC))] + "reload_completed" { return avr_out_sign_extend (insn, operands, NULL); } [(set_attr "length" "3,5") - (set_attr "adjust_len" "sext") - (set_attr "cc" "set_n")]) + (set_attr "adjust_len" "sext")]) -(define_insn "extendhisi2" +(define_insn_and_split "extendhisi2" [(set (match_operand:SI 0 "register_operand" "=r,r") (sign_extend:SI (match_operand:HI 1 "combine_pseudo_register_operand" "0,*r")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (sign_extend:SI (match_dup 1))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*extendhisi2" + [(set (match_operand:SI 0 "register_operand" "=r,r") + (sign_extend:SI (match_operand:HI 1 
"combine_pseudo_register_operand" "0,*r"))) + (clobber (reg:CC REG_CC))] + "reload_completed" { return avr_out_sign_extend (insn, operands, NULL); } [(set_attr "length" "4,6") - (set_attr "adjust_len" "sext") - (set_attr "cc" "set_n")]) + (set_attr "adjust_len" "sext")]) -(define_insn "extendpsisi2" +(define_insn_and_split "extendpsisi2" [(set (match_operand:SI 0 "register_operand" "=r") (sign_extend:SI (match_operand:PSI 1 "combine_pseudo_register_operand" "0")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (sign_extend:SI (match_dup 1))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*extendpsisi2" + [(set (match_operand:SI 0 "register_operand" "=r") + (sign_extend:SI (match_operand:PSI 1 "combine_pseudo_register_operand" "0"))) + (clobber (reg:CC REG_CC))] + "reload_completed" { return avr_out_sign_extend (insn, operands, NULL); } [(set_attr "length" "3") - (set_attr "adjust_len" "sext") - (set_attr "cc" "set_n")]) + (set_attr "adjust_len" "sext")]) ;; xx<---x xx<---x xx<---x xx<---x xx<---x xx<---x xx<---x xx<---x xx<---x ;; zero extend @@ -4585,145 +6386,133 @@ ; Optimize negated tests into reverse compare if overflow is undefined. (define_insn "*negated_tstqi" - [(set (cc0) - (compare (neg:QI (match_operand:QI 0 "register_operand" "r")) + [(set (reg:CC REG_CC) + (compare:CC (neg:QI (match_operand:QI 0 "register_operand" "r")) (const_int 0)))] - "!flag_wrapv && !flag_trapv" + "reload_completed && !flag_wrapv && !flag_trapv" "cp __zero_reg__,%0" - [(set_attr "cc" "compare") - (set_attr "length" "1")]) + [(set_attr "length" "1")]) (define_insn "*reversed_tstqi" - [(set (cc0) - (compare (const_int 0) + [(set (reg:CC REG_CC) + (compare:CC (const_int 0) (match_operand:QI 0 "register_operand" "r")))] - "" + "reload_completed" "cp __zero_reg__,%0" -[(set_attr "cc" "compare") - (set_attr "length" "2")]) +[(set_attr "length" "2")]) (define_insn "*negated_tsthi" - [(set (cc0) - (compare (neg:HI (match_operand:HI 0 "register_operand" "r")) + [(set (reg:CC REG_CC) + (compare:CC (neg:HI (match_operand:HI 0 "register_operand" "r")) (const_int 0)))] - "!flag_wrapv && !flag_trapv" + "reload_completed && !flag_wrapv && !flag_trapv" "cp __zero_reg__,%A0 cpc __zero_reg__,%B0" -[(set_attr "cc" "compare") - (set_attr "length" "2")]) +[(set_attr "length" "2")]) ;; Leave here the clobber used by the cmphi pattern for simplicity, even ;; though it is unused, because this pattern is synthesized by avr_reorg. 
(define_insn "*reversed_tsthi" - [(set (cc0) - (compare (const_int 0) + [(set (reg:CC REG_CC) + (compare:CC (const_int 0) (match_operand:HI 0 "register_operand" "r"))) (clobber (match_scratch:QI 1 "=X"))] - "" + "reload_completed" "cp __zero_reg__,%A0 cpc __zero_reg__,%B0" -[(set_attr "cc" "compare") - (set_attr "length" "2")]) +[(set_attr "length" "2")]) (define_insn "*negated_tstpsi" - [(set (cc0) - (compare (neg:PSI (match_operand:PSI 0 "register_operand" "r")) + [(set (reg:CC REG_CC) + (compare:CC (neg:PSI (match_operand:PSI 0 "register_operand" "r")) (const_int 0)))] - "!flag_wrapv && !flag_trapv" + "reload_completed && !flag_wrapv && !flag_trapv" "cp __zero_reg__,%A0\;cpc __zero_reg__,%B0\;cpc __zero_reg__,%C0" - [(set_attr "cc" "compare") - (set_attr "length" "3")]) + [(set_attr "length" "3")]) (define_insn "*reversed_tstpsi" - [(set (cc0) - (compare (const_int 0) + [(set (reg:CC REG_CC) + (compare:CC (const_int 0) (match_operand:PSI 0 "register_operand" "r"))) (clobber (match_scratch:QI 1 "=X"))] - "" + "reload_completed" "cp __zero_reg__,%A0\;cpc __zero_reg__,%B0\;cpc __zero_reg__,%C0" - [(set_attr "cc" "compare") - (set_attr "length" "3")]) + [(set_attr "length" "3")]) (define_insn "*negated_tstsi" - [(set (cc0) - (compare (neg:SI (match_operand:SI 0 "register_operand" "r")) + [(set (reg:CC REG_CC) + (compare:CC (neg:SI (match_operand:SI 0 "register_operand" "r")) (const_int 0)))] - "!flag_wrapv && !flag_trapv" + "reload_completed && !flag_wrapv && !flag_trapv" "cp __zero_reg__,%A0 cpc __zero_reg__,%B0 cpc __zero_reg__,%C0 cpc __zero_reg__,%D0" - [(set_attr "cc" "compare") - (set_attr "length" "4")]) + [(set_attr "length" "4")]) ;; "*reversed_tstsi" ;; "*reversed_tstsq" "*reversed_tstusq" ;; "*reversed_tstsa" "*reversed_tstusa" (define_insn "*reversed_tst" - [(set (cc0) - (compare (match_operand:ALL4 0 "const0_operand" "Y00") + [(set (reg:CC REG_CC) + (compare:CC (match_operand:ALL4 0 "const0_operand" "Y00") (match_operand:ALL4 1 "register_operand" "r"))) (clobber (match_scratch:QI 2 "=X"))] - "" + "reload_completed" "cp __zero_reg__,%A1 cpc __zero_reg__,%B1 cpc __zero_reg__,%C1 cpc __zero_reg__,%D1" - [(set_attr "cc" "compare") - (set_attr "length" "4")]) + [(set_attr "length" "4")]) ;; "cmpqi3" ;; "cmpqq3" "cmpuqq3" (define_insn "cmp3" - [(set (cc0) - (compare (match_operand:ALL1 0 "register_operand" "r ,r,d") + [(set (reg:CC REG_CC) + (compare:CC (match_operand:ALL1 0 "register_operand" "r ,r,d") (match_operand:ALL1 1 "nonmemory_operand" "Y00,r,i")))] - "" + "reload_completed" "@ tst %0 cp %0,%1 cpi %0,lo8(%1)" - [(set_attr "cc" "compare,compare,compare") - (set_attr "length" "1,1,1")]) + [(set_attr "length" "1,1,1")]) (define_insn "*cmpqi_sign_extend" - [(set (cc0) - (compare (sign_extend:HI (match_operand:QI 0 "register_operand" "d")) + [(set (reg:CC REG_CC) + (compare:CC (sign_extend:HI (match_operand:QI 0 "register_operand" "d")) (match_operand:HI 1 "s8_operand" "n")))] - "" + "reload_completed" "cpi %0,lo8(%1)" - [(set_attr "cc" "compare") - (set_attr "length" "1")]) + [(set_attr "length" "1")]) (define_insn "*cmphi.zero-extend.0" - [(set (cc0) - (compare (zero_extend:HI (match_operand:QI 0 "register_operand" "r")) + [(set (reg:CC REG_CC) + (compare:CC (zero_extend:HI (match_operand:QI 0 "register_operand" "r")) (match_operand:HI 1 "register_operand" "r")))] - "" + "reload_completed" "cp %0,%A1\;cpc __zero_reg__,%B1" - [(set_attr "cc" "compare") - (set_attr "length" "2")]) + [(set_attr "length" "2")]) (define_insn "*cmphi.zero-extend.1" - [(set (cc0) - (compare 
(match_operand:HI 0 "register_operand" "r") + [(set (reg:CC REG_CC) + (compare:CC (match_operand:HI 0 "register_operand" "r") (zero_extend:HI (match_operand:QI 1 "register_operand" "r"))))] - "" + "reload_completed" "cp %A0,%1\;cpc %B0,__zero_reg__" - [(set_attr "cc" "compare") - (set_attr "length" "2")]) + [(set_attr "length" "2")]) ;; "cmphi3" ;; "cmphq3" "cmpuhq3" ;; "cmpha3" "cmpuha3" (define_insn "cmp3" - [(set (cc0) - (compare (match_operand:ALL2 0 "register_operand" "!w ,r ,r,d ,r ,d,r") + [(set (reg:CC REG_CC) + (compare:CC (match_operand:ALL2 0 "register_operand" "!w ,r ,r,d ,r ,d,r") (match_operand:ALL2 1 "nonmemory_operand" "Y00,Y00,r,s ,s ,M,n Ynn"))) (clobber (match_scratch:QI 2 "=X ,X ,X,&d,&d ,X,&d"))] - "" + "reload_completed" { switch (which_alternative) { @@ -4749,16 +6538,15 @@ return avr_out_compare (insn, operands, NULL); } - [(set_attr "cc" "compare") - (set_attr "length" "1,2,2,3,4,2,4") + [(set_attr "length" "1,2,2,3,4,2,4") (set_attr "adjust_len" "tsthi,tsthi,*,*,*,compare,compare")]) (define_insn "*cmppsi" - [(set (cc0) - (compare (match_operand:PSI 0 "register_operand" "r,r,d ,r ,d,r") + [(set (reg:CC REG_CC) + (compare:CC (match_operand:PSI 0 "register_operand" "r,r,d ,r ,d,r") (match_operand:PSI 1 "nonmemory_operand" "L,r,s ,s ,M,n"))) (clobber (match_scratch:QI 2 "=X,X,&d,&d ,X,&d"))] - "" + "reload_completed" { switch (which_alternative) { @@ -4779,19 +6567,18 @@ return avr_out_compare (insn, operands, NULL); } - [(set_attr "cc" "compare") - (set_attr "length" "3,3,5,6,3,7") + [(set_attr "length" "3,3,5,6,3,7") (set_attr "adjust_len" "tstpsi,*,*,*,compare,compare")]) ;; "*cmpsi" ;; "*cmpsq" "*cmpusq" ;; "*cmpsa" "*cmpusa" (define_insn "*cmp" - [(set (cc0) - (compare (match_operand:ALL4 0 "register_operand" "r ,r ,d,r ,r") + [(set (reg:CC REG_CC) + (compare:CC (match_operand:ALL4 0 "register_operand" "r ,r ,d,r ,r") (match_operand:ALL4 1 "nonmemory_operand" "Y00,r ,M,M ,n Ynn"))) (clobber (match_scratch:QI 2 "=X ,X ,X,&d,&d"))] - "" + "reload_completed" { if (0 == which_alternative) return avr_out_tstsi (insn, operands, NULL); @@ -4800,8 +6587,7 @@ return avr_out_compare (insn, operands, NULL); } - [(set_attr "cc" "compare") - (set_attr "length" "4,4,4,5,8") + [(set_attr "length" "4,4,4,5,8") (set_attr "adjust_len" "tstsi,*,compare,compare,compare")]) @@ -4810,38 +6596,144 @@ ;; ---------------------------------------------------------------------- ;; Conditional jump instructions -;; "cbranchqi4" -;; "cbranchqq4" "cbranchuqq4" (define_expand "cbranch4" - [(set (cc0) - (compare (match_operand:ALL1 1 "register_operand" "") - (match_operand:ALL1 2 "nonmemory_operand" ""))) - (set (pc) - (if_then_else - (match_operator 0 "ordered_comparison_operator" [(cc0) - (const_int 0)]) + [(set (pc) + (if_then_else (match_operator 0 "ordered_comparison_operator" + [(match_operand:ALL1 1 "register_operand" "") + (match_operand:ALL1 2 "nonmemory_operand" "")]) (label_ref (match_operand 3 "" "")) (pc)))]) -;; "cbranchhi4" "cbranchhq4" "cbranchuhq4" "cbranchha4" "cbranchuha4" -;; "cbranchsi4" "cbranchsq4" "cbranchusq4" "cbranchsa4" "cbranchusa4" -;; "cbranchpsi4" (define_expand "cbranch4" - [(parallel [(set (cc0) - (compare (match_operand:ORDERED234 1 "register_operand" "") - (match_operand:ORDERED234 2 "nonmemory_operand" ""))) - (clobber (match_scratch:QI 4 ""))]) - (set (pc) + [(parallel + [(set (pc) + (if_then_else + (match_operator 0 "ordered_comparison_operator" + [(match_operand:ORDERED234 1 "register_operand" "") + (match_operand:ORDERED234 2 "nonmemory_operand" "")]) + 
(label_ref (match_operand 3 "" "")) + (pc))) + (clobber (match_scratch:QI 4 ""))])]) + +;; "*cbranchqi4" +;; "*cbranchqq4" "*cbranchuqq4" +(define_insn_and_split "*cbranch4" + [(set (pc) + (if_then_else (match_operator 0 "ordered_comparison_operator" + [(match_operand:ALL1 1 "register_operand" "r ,r,d") + (match_operand:ALL1 2 "nonmemory_operand" "Y00,r,i")]) + (label_ref (match_operand 3 "" "")) + (pc)))] + "" + "#" + "reload_completed" + [(set (reg:CC REG_CC) + (compare:CC (match_dup 1) (match_dup 2))) + (set (pc) + (if_then_else (match_op_dup 0 + [(reg:CC REG_CC) (const_int 0)]) + (label_ref (match_dup 3)) + (pc)))] + "") + +;; "*cbranchsi4" "*cbranchsq4" "*cbranchusq4" "*cbranchsa4" "*cbranchusa4" +(define_insn_and_split "*cbranch4" + [(set (pc) + (if_then_else + (match_operator 0 "ordered_comparison_operator" + [(match_operand:ALL4 1 "register_operand" "r ,r ,d,r ,r") + (match_operand:ALL4 2 "nonmemory_operand" "Y00,r ,M,M ,n Ynn")]) + (label_ref (match_operand 3 "" "")) + (pc))) + (clobber (match_scratch:QI 4 "=X ,X ,X,&d,&d"))] + "" + "#" + "reload_completed" + [(parallel [(set (reg:CC REG_CC) + (compare:CC (match_dup 1) (match_dup 2))) + (clobber (match_dup 4))]) + (set (pc) + (if_then_else (match_op_dup 0 + [(reg:CC REG_CC) (const_int 0)]) + (label_ref (match_dup 3)) + (pc)))] + "") + +;; "*cbranchpsi4" +(define_insn_and_split "*cbranchpsi4" + [(set (pc) + (if_then_else + (match_operator 0 "ordered_comparison_operator" + [(match_operand:PSI 1 "register_operand" "r,r,d ,r ,d,r") + (match_operand:PSI 2 "nonmemory_operand" "L,r,s ,s ,M,n")]) + (label_ref (match_operand 3 "" "")) + (pc))) + (clobber (match_scratch:QI 4 "=X,X,&d,&d ,X,&d"))] + "" + "#" + "reload_completed" + [(parallel [(set (reg:CC REG_CC) + (compare:CC (match_dup 1) (match_dup 2))) + (clobber (match_dup 4))]) + (set (pc) + (if_then_else (match_op_dup 0 + [(reg:CC REG_CC) (const_int 0)]) + (label_ref (match_dup 3)) + (pc)))] + "") + +;; "*cbranchhi4" "*cbranchhq4" "*cbranchuhq4" "*cbranchha4" "*cbranchuha4" +(define_insn_and_split "*cbranch4" + [(set (pc) + (if_then_else + (match_operator 0 "ordered_comparison_operator" + [(match_operand:ALL2 1 "register_operand" "!w ,r ,r,d ,r ,d,r") + (match_operand:ALL2 2 "nonmemory_operand" "Y00,Y00,r,s ,s ,M,n Ynn")]) + (label_ref (match_operand 3 "" "")) + (pc))) + (clobber (match_scratch:QI 4 "=X ,X ,X,&d,&d ,X,&d"))] + "" + "#" + "reload_completed" + [(parallel [(set (reg:CC REG_CC) + (compare:CC (match_dup 1) (match_dup 2))) + (clobber (match_dup 4))]) + (set (pc) + (if_then_else (match_op_dup 0 + [(reg:CC REG_CC) (const_int 0)]) + (label_ref (match_dup 3)) + (pc)))] + "") + +;; Test a single bit in a QI/HI/SImode register. +;; Combine will create zero extract patterns for single bit tests. +;; permit any mode in source pattern by using VOIDmode. + +(define_insn_and_split "*sbrx_branch_split" + [(set (pc) (if_then_else - (match_operator 0 "ordered_comparison_operator" [(cc0) - (const_int 0)]) + (match_operator 0 "eqne_operator" + [(zero_extract:QIDI + (match_operand:VOID 1 "register_operand" "r") + (const_int 1) + (match_operand 2 "const_int_operand" "n")) + (const_int 0)]) (label_ref (match_operand 3 "" "")) - (pc)))]) - - -;; Test a single bit in a QI/HI/SImode register. -;; Combine will create zero extract patterns for single bit tests. -;; permit any mode in source pattern by using VOIDmode. 
+ (pc)))] + "" + "#" + "&& reload_completed" + [(parallel [(set (pc) + (if_then_else + (match_op_dup 0 + [(zero_extract:QIDI + (match_dup 1) + (const_int 1) + (match_dup 2)) + (const_int 0)]) + (label_ref (match_dup 3)) + (pc))) + (clobber (reg:CC REG_CC))])]) (define_insn "*sbrx_branch" [(set (pc) @@ -4853,8 +6745,9 @@ (match_operand 2 "const_int_operand" "n")) (const_int 0)]) (label_ref (match_operand 3 "" "")) - (pc)))] - "" + (pc))) + (clobber (reg:CC REG_CC))] + "reload_completed" { return avr_out_sbxx_branch (insn, operands); } @@ -4864,14 +6757,13 @@ (const_int 2) (if_then_else (match_test "!AVR_HAVE_JMP_CALL") (const_int 2) - (const_int 4)))) - (set_attr "cc" "clobber")]) + (const_int 4))))]) ;; Same test based on bitwise AND. Keep this in case gcc changes patterns. ;; or for old peepholes. ;; Fixme - bitwise Mask will not work for DImode -(define_insn "*sbrx_and_branch" +(define_insn_and_split "*sbrx_and_branch_split" [(set (pc) (if_then_else (match_operator 0 "eqne_operator" @@ -4882,6 +6774,31 @@ (label_ref (match_operand 3 "" "")) (pc)))] "" + "#" + "&& reload_completed" + [(parallel [(set (pc) + (if_then_else + (match_op_dup 0 + [(and:QISI + (match_dup 1) + (match_dup 2)) + (const_int 0)]) + (label_ref (match_dup 3)) + (pc))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*sbrx_and_branch" + [(set (pc) + (if_then_else + (match_operator 0 "eqne_operator" + [(and:QISI + (match_operand:QISI 1 "register_operand" "r") + (match_operand:QISI 2 "single_one_operand" "n")) + (const_int 0)]) + (label_ref (match_operand 3 "" "")) + (pc))) + (clobber (reg:CC REG_CC))] + "reload_completed" { HOST_WIDE_INT bitnumber; bitnumber = exact_log2 (GET_MODE_MASK (mode) & INTVAL (operands[2])); @@ -4894,14 +6811,13 @@ (const_int 2) (if_then_else (match_test "!AVR_HAVE_JMP_CALL") (const_int 2) - (const_int 4)))) - (set_attr "cc" "clobber")]) + (const_int 4))))]) ;; Convert sign tests to bit 7/15/31 tests that match the above insns. 
(define_peephole2 - [(set (cc0) (compare (match_operand:QI 0 "register_operand" "") + [(set (reg:CC REG_CC) (compare:CC (match_operand:QI 0 "register_operand" "") (const_int 0))) - (set (pc) (if_then_else (ge (cc0) (const_int 0)) + (set (pc) (if_then_else (ge (reg:CC REG_CC) (const_int 0)) (label_ref (match_operand 1 "" "")) (pc)))] "" @@ -4913,9 +6829,9 @@ (pc)))]) (define_peephole2 - [(set (cc0) (compare (match_operand:QI 0 "register_operand" "") + [(set (reg:CC REG_CC) (compare:CC (match_operand:QI 0 "register_operand" "") (const_int 0))) - (set (pc) (if_then_else (lt (cc0) (const_int 0)) + (set (pc) (if_then_else (lt (reg:CC REG_CC) (const_int 0)) (label_ref (match_operand 1 "" "")) (pc)))] "" @@ -4927,10 +6843,10 @@ (pc)))]) (define_peephole2 - [(parallel [(set (cc0) (compare (match_operand:HI 0 "register_operand" "") + [(parallel [(set (reg:CC REG_CC) (compare:CC (match_operand:HI 0 "register_operand" "") (const_int 0))) (clobber (match_operand:HI 2 ""))]) - (set (pc) (if_then_else (ge (cc0) (const_int 0)) + (set (pc) (if_then_else (ge (reg:CC REG_CC) (const_int 0)) (label_ref (match_operand 1 "" "")) (pc)))] "" @@ -4940,10 +6856,10 @@ (pc)))]) (define_peephole2 - [(parallel [(set (cc0) (compare (match_operand:HI 0 "register_operand" "") + [(parallel [(set (reg:CC REG_CC) (compare:CC (match_operand:HI 0 "register_operand" "") (const_int 0))) (clobber (match_operand:HI 2 ""))]) - (set (pc) (if_then_else (lt (cc0) (const_int 0)) + (set (pc) (if_then_else (lt (reg:CC REG_CC) (const_int 0)) (label_ref (match_operand 1 "" "")) (pc)))] "" @@ -4953,10 +6869,10 @@ (pc)))]) (define_peephole2 - [(parallel [(set (cc0) (compare (match_operand:SI 0 "register_operand" "") + [(parallel [(set (reg:CC REG_CC) (compare:CC (match_operand:SI 0 "register_operand" "") (const_int 0))) (clobber (match_operand:SI 2 ""))]) - (set (pc) (if_then_else (ge (cc0) (const_int 0)) + (set (pc) (if_then_else (ge (reg:CC REG_CC) (const_int 0)) (label_ref (match_operand 1 "" "")) (pc)))] "" @@ -4967,10 +6883,10 @@ "operands[2] = gen_int_mode (-2147483647 - 1, SImode);") (define_peephole2 - [(parallel [(set (cc0) (compare (match_operand:SI 0 "register_operand" "") + [(parallel [(set (reg:CC REG_CC) (compare:CC (match_operand:SI 0 "register_operand" "") (const_int 0))) (clobber (match_operand:SI 2 ""))]) - (set (pc) (if_then_else (lt (cc0) (const_int 0)) + (set (pc) (if_then_else (lt (reg:CC REG_CC) (const_int 0)) (label_ref (match_operand 1 "" "")) (pc)))] "" @@ -4985,19 +6901,37 @@ ;; Compare with 0 (test) jumps ;; ************************************************************************ -(define_insn "branch" +(define_insn_and_split "branch" [(set (pc) (if_then_else (match_operator 1 "simple_comparison_operator" - [(cc0) + [(reg:CC REG_CC) (const_int 0)]) (label_ref (match_operand 0 "" "")) (pc)))] - "" + "reload_completed" + "#" + "&& reload_completed" + [(parallel [(set (pc) + (if_then_else (match_op_dup 1 + [(reg:CC REG_CC) + (const_int 0)]) + (label_ref (match_dup 0)) + (pc))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*branch" + [(set (pc) + (if_then_else (match_operator 1 "simple_comparison_operator" + [(reg:CC REG_CC) + (const_int 0)]) + (label_ref (match_operand 0 "" "")) + (pc))) + (clobber (reg:CC REG_CC))] + "reload_completed" { return ret_cond_branch (operands[1], avr_jump_mode (operands[0], insn), 0); } - [(set_attr "type" "branch") - (set_attr "cc" "clobber")]) + [(set_attr "type" "branch")]) ;; Same as above but wrap SET_SRC so that this branch won't be transformed @@ -5006,66 +6940,120 @@ 
(define_insn "branch_unspec" [(set (pc) (unspec [(if_then_else (match_operator 1 "simple_comparison_operator" - [(cc0) + [(reg:CC REG_CC) (const_int 0)]) (label_ref (match_operand 0 "" "")) (pc)) - ] UNSPEC_IDENTITY))] - "" + ] UNSPEC_IDENTITY)) + (clobber (reg:CC REG_CC))] + "reload_completed" { return ret_cond_branch (operands[1], avr_jump_mode (operands[0], insn), 0); } - [(set_attr "type" "branch") - (set_attr "cc" "none")]) + [(set_attr "type" "branch")]) ;; **************************************************************** ;; AVR does not have following conditional jumps: LE,LEU,GT,GTU. ;; Convert them all to proper jumps. ;; ****************************************************************/ -(define_insn "difficult_branch" +(define_insn_and_split "difficult_branch" [(set (pc) (if_then_else (match_operator 1 "difficult_comparison_operator" - [(cc0) + [(reg:CC REG_CC) (const_int 0)]) (label_ref (match_operand 0 "" "")) (pc)))] - "" + "reload_completed" + "#" + "&& reload_completed" + [(parallel [(set (pc) + (if_then_else (match_op_dup 1 + [(reg:CC REG_CC) + (const_int 0)]) + (label_ref (match_dup 0)) + (pc))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*difficult_branch" + [(set (pc) + (if_then_else (match_operator 1 "difficult_comparison_operator" + [(reg:CC REG_CC) + (const_int 0)]) + (label_ref (match_operand 0 "" "")) + (pc))) + (clobber (reg:CC REG_CC))] + "reload_completed" { return ret_cond_branch (operands[1], avr_jump_mode (operands[0], insn), 0); } - [(set_attr "type" "branch1") - (set_attr "cc" "clobber")]) + [(set_attr "type" "branch1")]) ;; revers branch -(define_insn "rvbranch" +(define_insn_and_split "rvbranch" [(set (pc) (if_then_else (match_operator 1 "simple_comparison_operator" - [(cc0) + [(reg:CC REG_CC) (const_int 0)]) (pc) (label_ref (match_operand 0 "" ""))))] - "" + "reload_completed" + "#" + "&& reload_completed" + [(parallel [(set (pc) + (if_then_else (match_op_dup 1 + [(reg:CC REG_CC) + (const_int 0)]) + (pc) + (label_ref (match_dup 0)))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*rvbranch" + [(set (pc) + (if_then_else (match_operator 1 "simple_comparison_operator" + [(reg:CC REG_CC) + (const_int 0)]) + (pc) + (label_ref (match_operand 0 "" "")))) + (clobber (reg:CC REG_CC))] + "reload_completed" { return ret_cond_branch (operands[1], avr_jump_mode (operands[0], insn), 1); } - [(set_attr "type" "branch1") - (set_attr "cc" "clobber")]) + [(set_attr "type" "branch1")]) -(define_insn "difficult_rvbranch" +(define_insn_and_split "difficult_rvbranch" [(set (pc) (if_then_else (match_operator 1 "difficult_comparison_operator" - [(cc0) + [(reg:CC REG_CC) (const_int 0)]) (pc) (label_ref (match_operand 0 "" ""))))] - "" + "reload_completed" + "#" + "&& reload_completed" + [(parallel [(set (pc) + (if_then_else (match_op_dup 1 + [(reg:CC REG_CC) + (const_int 0)]) + (pc) + (label_ref (match_dup 0)))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*difficult_rvbranch" + [(set (pc) + (if_then_else (match_operator 1 "difficult_comparison_operator" + [(reg:CC REG_CC) + (const_int 0)]) + (pc) + (label_ref (match_operand 0 "" "")))) + (clobber (reg:CC REG_CC))] + "reload_completed" { return ret_cond_branch (operands[1], avr_jump_mode (operands[0], insn), 1); } - [(set_attr "type" "branch") - (set_attr "cc" "clobber")]) + [(set_attr "type" "branch")]) ;; ************************************************************************** ;; Unconditional and other jump instructions. 
@@ -5087,8 +7075,7 @@ (if_then_else (and (ge (minus (pc) (match_dup 0)) (const_int -2047)) (le (minus (pc) (match_dup 0)) (const_int 2047))) (const_int 1) - (const_int 2)))) - (set_attr "cc" "none")]) + (const_int 2))))]) ;; call @@ -5136,8 +7123,7 @@ %~call %x0 %!ijmp %~jmp %x0" - [(set_attr "cc" "clobber") - (set_attr "length" "1,*,1,*") + [(set_attr "length" "1,*,1,*") (set_attr "adjust_len" "*,call,*,call")]) (define_insn "call_value_insn" @@ -5153,16 +7139,14 @@ %~call %x1 %!ijmp %~jmp %x1" - [(set_attr "cc" "clobber") - (set_attr "length" "1,*,1,*") + [(set_attr "length" "1,*,1,*") (set_attr "adjust_len" "*,call,*,call")]) (define_insn "nop" [(const_int 0)] "" "nop" - [(set_attr "cc" "none") - (set_attr "length" "1")]) + [(set_attr "length" "1")]) ; indirect jump @@ -5189,8 +7173,7 @@ push %A0\;push %B0\;ret eijmp" [(set_attr "length" "1,2,1,3,1") - (set_attr "isa" "rjmp,jmp,ijmp,ijmp,eijmp") - (set_attr "cc" "none")]) + (set_attr "isa" "rjmp,jmp,ijmp,ijmp,eijmp")]) ;; table jump ;; For entries in jump table see avr_output_addr_vec. @@ -5198,7 +7181,7 @@ ;; Table made from ;; "rjmp .L" instructions for <= 8K devices ;; ".word gs(.L)" addresses for > 8K devices -(define_insn "*tablejump" +(define_insn_and_split "*tablejump_split" [(set (pc) (unspec:HI [(match_operand:HI 0 "register_operand" "!z,*r,z")] UNSPEC_INDEX_JMP)) @@ -5206,15 +7189,35 @@ (clobber (match_dup 0)) (clobber (const_int 0))] "!AVR_HAVE_EIJMP_EICALL" + "#" + "&& reload_completed" + [(parallel [(set (pc) + (unspec:HI [(match_dup 0)] + UNSPEC_INDEX_JMP)) + (use (label_ref (match_dup 1))) + (clobber (match_dup 0)) + (clobber (const_int 0)) + (clobber (reg:CC REG_CC))])] + "" + [(set_attr "isa" "rjmp,rjmp,jmp")]) + +(define_insn "*tablejump" + [(set (pc) + (unspec:HI [(match_operand:HI 0 "register_operand" "!z,*r,z")] + UNSPEC_INDEX_JMP)) + (use (label_ref (match_operand 1 "" ""))) + (clobber (match_dup 0)) + (clobber (const_int 0)) + (clobber (reg:CC REG_CC))] + "!AVR_HAVE_EIJMP_EICALL && reload_completed" "@ ijmp push %A0\;push %B0\;ret jmp __tablejump2__" [(set_attr "length" "1,3,2") - (set_attr "isa" "rjmp,rjmp,jmp") - (set_attr "cc" "none,none,clobber")]) + (set_attr "isa" "rjmp,rjmp,jmp")]) -(define_insn "*tablejump.3byte-pc" +(define_insn_and_split "*tablejump.3byte-pc_split" [(set (pc) (unspec:HI [(reg:HI REG_Z)] UNSPEC_INDEX_JMP)) @@ -5222,10 +7225,31 @@ (clobber (reg:HI REG_Z)) (clobber (reg:QI 24))] "AVR_HAVE_EIJMP_EICALL" + "#" + "&& reload_completed" + [(parallel [(set (pc) + (unspec:HI [(reg:HI REG_Z)] + UNSPEC_INDEX_JMP)) + (use (label_ref (match_dup 0))) + (clobber (reg:HI REG_Z)) + (clobber (reg:QI 24)) + (clobber (reg:CC REG_CC))])] + "" + [(set_attr "isa" "eijmp")]) + + +(define_insn "*tablejump.3byte-pc" + [(set (pc) + (unspec:HI [(reg:HI REG_Z)] + UNSPEC_INDEX_JMP)) + (use (label_ref (match_operand 0 "" ""))) + (clobber (reg:HI REG_Z)) + (clobber (reg:QI 24)) + (clobber (reg:CC REG_CC))] + "AVR_HAVE_EIJMP_EICALL && reload_completed" "clr r24\;subi r30,pm_lo8(-(%0))\;sbci r31,pm_hi8(-(%0))\;sbci r24,pm_hh8(-(%0))\;jmp __tablejump2__" [(set_attr "length" "6") - (set_attr "isa" "eijmp") - (set_attr "cc" "clobber")]) + (set_attr "isa" "eijmp")]) ;; FIXME: casesi comes up with an SImode switch value $0 which @@ -5254,16 +7278,13 @@ (plus:SI (match_operand:SI 0 "register_operand") (match_operand:SI 1 "const_int_operand"))) (clobber (scratch:QI))]) - (parallel [(set (cc0) - (compare (match_dup 5) - (match_operand:SI 2 "const_int_operand"))) - (clobber (scratch:QI))]) - (set (pc) - (if_then_else (gtu (cc0) - 
(const_int 0)) - (label_ref (match_operand 4)) - (pc))) + (parallel [(set (pc) + (if_then_else (gtu (match_dup 5) + (match_operand:SI 2 "const_int_operand")) + (label_ref (match_operand 4)) + (pc))) + (clobber (scratch:QI))]) (set (match_dup 7) (match_dup 6)) @@ -5312,16 +7333,13 @@ (plus:SI (match_dup 0) (match_operand:SI 1 "const_int_operand"))) (clobber (scratch:QI))]) - (parallel [(set (cc0) - (compare (match_dup 5) - (match_operand:SI 2 "const_int_operand"))) - (clobber (scratch:QI))]) - (set (pc) - (if_then_else (gtu (cc0) - (const_int 0)) - (label_ref (match_operand 4)) - (pc))) + (parallel [(set (pc) + (if_then_else (gtu (match_dup 5) + (match_operand:SI 2 "const_int_operand")) + (label_ref (match_operand 4)) + (pc))) + (clobber (scratch:QI))]) (set (match_operand:HI 7 "register_operand") (match_operand:HI 6)) @@ -5338,15 +7356,6 @@ ;; ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -;; This instruction sets Z flag - -(define_insn "sez" - [(set (cc0) (const_int 0))] - "" - "sez" - [(set_attr "length" "1") - (set_attr "cc" "compare")]) - ;; Clear/set/test a single bit in I/O address space. (define_insn "*cbi" @@ -5358,8 +7367,7 @@ operands[2] = GEN_INT (exact_log2 (~INTVAL (operands[1]) & 0xff)); return "cbi %i0,%2"; } - [(set_attr "length" "1") - (set_attr "cc" "none")]) + [(set_attr "length" "1")]) (define_insn "*sbi" [(set (mem:QI (match_operand 0 "low_io_address_operand" "i")) @@ -5370,11 +7378,10 @@ operands[2] = GEN_INT (exact_log2 (INTVAL (operands[1]) & 0xff)); return "sbi %i0,%2"; } - [(set_attr "length" "1") - (set_attr "cc" "none")]) + [(set_attr "length" "1")]) ;; Lower half of the I/O space - use sbic/sbis directly. -(define_insn "*sbix_branch" +(define_insn_and_split "*sbix_branch_split" [(set (pc) (if_then_else (match_operator 0 "eqne_operator" @@ -5386,6 +7393,33 @@ (label_ref (match_operand 3 "" "")) (pc)))] "" + "#" + "&& reload_completed" + [(parallel [(set (pc) + (if_then_else + (match_operator 0 "eqne_operator" + [(zero_extract:QIHI + (mem:QI (match_dup 1)) + (const_int 1) + (match_dup 2)) + (const_int 0)]) + (label_ref (match_dup 3)) + (pc))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*sbix_branch" + [(set (pc) + (if_then_else + (match_operator 0 "eqne_operator" + [(zero_extract:QIHI + (mem:QI (match_operand 1 "low_io_address_operand" "i")) + (const_int 1) + (match_operand 2 "const_int_operand" "n")) + (const_int 0)]) + (label_ref (match_operand 3 "" "")) + (pc))) + (clobber (reg:CC REG_CC))] + "reload_completed" { return avr_out_sbxx_branch (insn, operands); } @@ -5395,11 +7429,10 @@ (const_int 2) (if_then_else (match_test "!AVR_HAVE_JMP_CALL") (const_int 2) - (const_int 4)))) - (set_attr "cc" "clobber")]) + (const_int 4))))]) ;; Tests of bit 7 are pessimized to sign tests, so we need this too... 
-(define_insn "*sbix_branch_bit7" +(define_insn_and_split "*sbix_branch_bit7_split" [(set (pc) (if_then_else (match_operator 0 "gelt_operator" @@ -5408,6 +7441,27 @@ (label_ref (match_operand 2 "" "")) (pc)))] "" + "#" + "&& reload_completed" + [(parallel [(set (pc) + (if_then_else + (match_operator 0 "gelt_operator" + [(mem:QI (match_dup 1)) + (const_int 0)]) + (label_ref (match_dup 2)) + (pc))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*sbix_branch_bit7" + [(set (pc) + (if_then_else + (match_operator 0 "gelt_operator" + [(mem:QI (match_operand 1 "low_io_address_operand" "i")) + (const_int 0)]) + (label_ref (match_operand 2 "" "")) + (pc))) + (clobber (reg:CC REG_CC))] + "reload_completed" { operands[3] = operands[2]; operands[2] = GEN_INT (7); @@ -5419,11 +7473,10 @@ (const_int 2) (if_then_else (match_test "!AVR_HAVE_JMP_CALL") (const_int 2) - (const_int 4)))) - (set_attr "cc" "clobber")]) + (const_int 4))))]) ;; Upper half of the I/O space - read port to __tmp_reg__ and use sbrc/sbrs. -(define_insn "*sbix_branch_tmp" +(define_insn_and_split "*sbix_branch_tmp_split" [(set (pc) (if_then_else (match_operator 0 "eqne_operator" @@ -5435,6 +7488,33 @@ (label_ref (match_operand 3 "" "")) (pc)))] "" + "#" + "&& reload_completed" + [(parallel [(set (pc) + (if_then_else + (match_operator 0 "eqne_operator" + [(zero_extract:QIHI + (mem:QI (match_dup 1)) + (const_int 1) + (match_dup 2)) + (const_int 0)]) + (label_ref (match_dup 3)) + (pc))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*sbix_branch_tmp" + [(set (pc) + (if_then_else + (match_operator 0 "eqne_operator" + [(zero_extract:QIHI + (mem:QI (match_operand 1 "high_io_address_operand" "n")) + (const_int 1) + (match_operand 2 "const_int_operand" "n")) + (const_int 0)]) + (label_ref (match_operand 3 "" "")) + (pc))) + (clobber (reg:CC REG_CC))] + "reload_completed" { return avr_out_sbxx_branch (insn, operands); } @@ -5444,10 +7524,9 @@ (const_int 3) (if_then_else (match_test "!AVR_HAVE_JMP_CALL") (const_int 3) - (const_int 5)))) - (set_attr "cc" "clobber")]) + (const_int 5))))]) -(define_insn "*sbix_branch_tmp_bit7" +(define_insn_and_split "*sbix_branch_tmp_bit7_split" [(set (pc) (if_then_else (match_operator 0 "gelt_operator" @@ -5456,6 +7535,27 @@ (label_ref (match_operand 2 "" "")) (pc)))] "" + "#" + "&& reload_completed" + [(parallel [(set (pc) + (if_then_else + (match_operator 0 "gelt_operator" + [(mem:QI (match_dup 1)) + (const_int 0)]) + (label_ref (match_dup 2)) + (pc))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*sbix_branch_tmp_bit7" + [(set (pc) + (if_then_else + (match_operator 0 "gelt_operator" + [(mem:QI (match_operand 1 "high_io_address_operand" "n")) + (const_int 0)]) + (label_ref (match_operand 2 "" "")) + (pc))) + (clobber (reg:CC REG_CC))] + "reload_completed" { operands[3] = operands[2]; operands[2] = GEN_INT (7); @@ -5467,8 +7567,7 @@ (const_int 3) (if_then_else (match_test "!AVR_HAVE_JMP_CALL") (const_int 3) - (const_int 5)))) - (set_attr "cc" "clobber")]) + (const_int 5))))]) ;; ************************* Peepholes ******************************** @@ -5477,12 +7576,12 @@ (plus:SI (match_dup 0) (const_int -1))) (clobber (scratch:QI))]) - (parallel [(set (cc0) - (compare (match_dup 0) + (parallel [(set (reg:CC REG_CC) + (compare:CC (match_dup 0) (const_int -1))) (clobber (match_operand:QI 1 "d_register_operand" ""))]) (set (pc) - (if_then_else (eqne (cc0) + (if_then_else (eqne (reg:CC REG_CC) (const_int 0)) (label_ref (match_operand 2 "" "")) (pc)))] @@ -5520,12 +7619,12 @@ [(set (match_operand:HI 0 
"d_register_operand" "") (plus:HI (match_dup 0) (const_int -1))) - (parallel [(set (cc0) - (compare (match_dup 0) + (parallel [(set (reg:CC REG_CC) + (compare:CC (match_dup 0) (const_int -1))) (clobber (match_operand:QI 1 "d_register_operand" ""))]) (set (pc) - (if_then_else (eqne (cc0) + (if_then_else (eqne (reg:CC REG_CC) (const_int 0)) (label_ref (match_operand 2 "" "")) (pc)))] @@ -5561,12 +7660,12 @@ (plus:HI (match_dup 0) (const_int -1))) (clobber (scratch:QI))]) - (parallel [(set (cc0) - (compare (match_dup 0) + (parallel [(set (reg:CC REG_CC) + (compare:CC (match_dup 0) (const_int -1))) (clobber (match_operand:QI 1 "d_register_operand" ""))]) (set (pc) - (if_then_else (eqne (cc0) + (if_then_else (eqne (reg:CC REG_CC) (const_int 0)) (label_ref (match_operand 2 "" "")) (pc)))] @@ -5602,12 +7701,12 @@ (plus:HI (match_dup 0) (const_int -1))) (clobber (match_operand:QI 3 "d_register_operand" ""))]) - (parallel [(set (cc0) - (compare (match_dup 0) + (parallel [(set (reg:CC REG_CC) + (compare:CC (match_dup 0) (const_int -1))) (clobber (match_operand:QI 1 "d_register_operand" ""))]) (set (pc) - (if_then_else (eqne (cc0) + (if_then_else (eqne (reg:CC REG_CC) (const_int 0)) (label_ref (match_operand 2 "" "")) (pc)))] @@ -5639,11 +7738,11 @@ [(set (match_operand:QI 0 "d_register_operand" "") (plus:QI (match_dup 0) (const_int -1))) - (set (cc0) - (compare (match_dup 0) + (set (reg:CC REG_CC) + (compare:CC (match_dup 0) (const_int -1))) (set (pc) - (if_then_else (eqne (cc0) + (if_then_else (eqne (reg:CC REG_CC) (const_int 0)) (label_ref (match_operand 1 "" "")) (pc)))] @@ -5651,9 +7750,6 @@ { const char *op; int jump_mode; - CC_STATUS_INIT; - cc_status.value1 = operands[0]; - cc_status.flags |= CC_OVERFLOW_UNUSABLE; output_asm_insn ("subi %A0,1", operands); @@ -5674,11 +7770,11 @@ (define_peephole ; "*cpse.eq" - [(set (cc0) - (compare (match_operand:ALL1 1 "register_operand" "r,r") + [(set (reg:CC REG_CC) + (compare:CC (match_operand:ALL1 1 "register_operand" "r,r") (match_operand:ALL1 2 "reg_or_0_operand" "r,Y00"))) (set (pc) - (if_then_else (eq (cc0) + (if_then_else (eq (reg:CC REG_CC) (const_int 0)) (label_ref (match_operand 0 "" "")) (pc)))] @@ -5709,11 +7805,11 @@ ;; and thus longer and slower and not easy to be rolled back. 
(define_peephole ; "*cpse.ne" - [(set (cc0) - (compare (match_operand:ALL1 1 "register_operand" "") + [(set (reg:CC REG_CC) + (compare:CC (match_operand:ALL1 1 "register_operand" "") (match_operand:ALL1 2 "reg_or_0_operand" ""))) (set (pc) - (if_then_else (ne (cc0) + (if_then_else (ne (reg:CC REG_CC) (const_int 0)) (label_ref (match_operand 0 "" "")) (pc)))] @@ -5736,8 +7832,7 @@ (mem:QI (pre_inc:HI (reg:HI REG_SP))))] "" "pop %0" - [(set_attr "cc" "none") - (set_attr "length" "1")]) + [(set_attr "length" "1")]) ;; Enable Interrupts (define_expand "enable_interrupt" @@ -5770,8 +7865,7 @@ "@ cli sei" - [(set_attr "length" "1") - (set_attr "cc" "none")]) + [(set_attr "length" "1")]) ;; Library prologue saves (define_insn "call_prologue_saves" @@ -5781,14 +7875,14 @@ (minus:HI (reg:HI REG_SP) (match_operand:HI 1 "immediate_operand" "i,i"))) (use (reg:HI REG_X)) - (clobber (reg:HI REG_Z))] + (clobber (reg:HI REG_Z)) + (clobber (reg:CC REG_CC))] "" "ldi r30,lo8(gs(1f)) ldi r31,hi8(gs(1f)) %~jmp __prologue_saves__+((18 - %0) * 2) 1:" [(set_attr "length" "5,6") - (set_attr "cc" "clobber") (set_attr "isa" "rjmp,jmp")]) ; epilogue restores using library @@ -5800,12 +7894,12 @@ (set (reg:HI REG_SP) (plus:HI (reg:HI REG_Y) (match_dup 0))) - (clobber (reg:QI REG_Z))] + (clobber (reg:QI REG_Z)) + (clobber (reg:CC REG_CC))] "" "ldi r30, lo8(%0) %~jmp __epilogue_restores__ + ((18 - %0) * 2)" [(set_attr "length" "2,3") - (set_attr "cc" "clobber") (set_attr "isa" "rjmp,jmp")]) @@ -5819,7 +7913,8 @@ (unspec_volatile:HI [(reg:HI REG_SP)] UNSPECV_GASISR)) (set (match_dup 2) (unspec_volatile:BLK [(match_dup 2)] - UNSPECV_MEMORY_BARRIER))])] + UNSPECV_MEMORY_BARRIER)) + (clobber (reg:CC REG_CC))])] "avr_gasisr_prologues" { operands[2] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode)); @@ -5833,11 +7928,11 @@ (set (reg:HI REG_SP) (unspec_volatile:HI [(reg:HI REG_SP)] UNSPECV_GASISR)) (set (match_operand:BLK 2) - (unspec_volatile:BLK [(match_dup 2)] UNSPECV_MEMORY_BARRIER))] + (unspec_volatile:BLK [(match_dup 2)] UNSPECV_MEMORY_BARRIER)) + (clobber (reg:CC REG_CC))] "avr_gasisr_prologues" "__gcc_isr %0" - [(set_attr "length" "6,5") - (set_attr "cc" "clobber")]) + [(set_attr "length" "6,5")]) ; return @@ -5845,8 +7940,7 @@ [(return)] "reload_completed && avr_simple_epilogue ()" "ret" - [(set_attr "cc" "none") - (set_attr "length" "1")]) + [(set_attr "length" "1")]) (define_insn "return_from_epilogue" [(return)] @@ -5855,8 +7949,7 @@ && !(cfun->machine->is_interrupt || cfun->machine->is_signal) && !cfun->machine->is_naked" "ret" - [(set_attr "cc" "none") - (set_attr "length" "1")]) + [(set_attr "length" "1")]) (define_insn "return_from_interrupt_epilogue" [(return)] @@ -5865,8 +7958,7 @@ && (cfun->machine->is_interrupt || cfun->machine->is_signal) && !cfun->machine->is_naked" "reti" - [(set_attr "cc" "none") - (set_attr "length" "1")]) + [(set_attr "length" "1")]) (define_insn "return_from_naked_epilogue" [(return)] @@ -5874,8 +7966,7 @@ && cfun->machine && cfun->machine->is_naked" "" - [(set_attr "cc" "none") - (set_attr "length" "0")]) + [(set_attr "length" "0")]) (define_expand "prologue" [(const_int 0)] @@ -5904,7 +7995,7 @@ ;; Some instructions resp. instruction sequences available ;; via builtins. 
-(define_insn "delay_cycles_1" +(define_insn_and_split "delay_cycles_1" [(unspec_volatile [(match_operand:QI 0 "const_int_operand" "n") (const_int 1)] UNSPECV_DELAY_CYCLES) @@ -5912,13 +8003,31 @@ (unspec_volatile:BLK [(match_dup 1)] UNSPECV_MEMORY_BARRIER)) (clobber (match_scratch:QI 2 "=&d"))] "" + "#" + "&& reload_completed" + [(parallel [(unspec_volatile [(match_dup 0) + (const_int 1)] + UNSPECV_DELAY_CYCLES) + (set (match_dup 1) + (unspec_volatile:BLK [(match_dup 1)] UNSPECV_MEMORY_BARRIER)) + (clobber (match_dup 2)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*delay_cycles_1" + [(unspec_volatile [(match_operand:QI 0 "const_int_operand" "n") + (const_int 1)] + UNSPECV_DELAY_CYCLES) + (set (match_operand:BLK 1 "" "") + (unspec_volatile:BLK [(match_dup 1)] UNSPECV_MEMORY_BARRIER)) + (clobber (match_scratch:QI 2 "=&d")) + (clobber (reg:CC REG_CC))] + "reload_completed" "ldi %2,lo8(%0) 1: dec %2 brne 1b" - [(set_attr "length" "3") - (set_attr "cc" "clobber")]) + [(set_attr "length" "3")]) -(define_insn "delay_cycles_2" +(define_insn_and_split "delay_cycles_2" [(unspec_volatile [(match_operand:HI 0 "const_int_operand" "n,n") (const_int 2)] UNSPECV_DELAY_CYCLES) @@ -5926,14 +8035,34 @@ (unspec_volatile:BLK [(match_dup 1)] UNSPECV_MEMORY_BARRIER)) (clobber (match_scratch:HI 2 "=&w,&d"))] "" + "#" + "&& reload_completed" + [(parallel [(unspec_volatile [(match_dup 0) + (const_int 2)] + UNSPECV_DELAY_CYCLES) + (set (match_dup 1) + (unspec_volatile:BLK [(match_dup 1)] UNSPECV_MEMORY_BARRIER)) + (clobber (match_dup 2)) + (clobber (reg:CC REG_CC))])] + "" + [(set_attr "isa" "no_tiny,tiny")]) + +(define_insn "*delay_cycles_2" + [(unspec_volatile [(match_operand:HI 0 "const_int_operand" "n,n") + (const_int 2)] + UNSPECV_DELAY_CYCLES) + (set (match_operand:BLK 1 "" "") + (unspec_volatile:BLK [(match_dup 1)] UNSPECV_MEMORY_BARRIER)) + (clobber (match_scratch:HI 2 "=&w,&d")) + (clobber (reg:CC REG_CC))] + "reload_completed" "@ ldi %A2,lo8(%0)\;ldi %B2,hi8(%0)\n1: sbiw %A2,1\;brne 1b ldi %A2,lo8(%0)\;ldi %B2,hi8(%0)\n1: subi %A2,1\;sbci %B2,0\;brne 1b" [(set_attr "length" "4,5") - (set_attr "isa" "no_tiny,tiny") - (set_attr "cc" "clobber")]) + (set_attr "isa" "no_tiny,tiny")]) -(define_insn "delay_cycles_3" +(define_insn_and_split "delay_cycles_3" [(unspec_volatile [(match_operand:SI 0 "const_int_operand" "n") (const_int 3)] UNSPECV_DELAY_CYCLES) @@ -5943,6 +8072,29 @@ (clobber (match_scratch:QI 3 "=&d")) (clobber (match_scratch:QI 4 "=&d"))] "" + "#" + "&& reload_completed" + [(parallel [(unspec_volatile [(match_dup 0) + (const_int 3)] + UNSPECV_DELAY_CYCLES) + (set (match_dup 1) + (unspec_volatile:BLK [(match_dup 1)] UNSPECV_MEMORY_BARRIER)) + (clobber (match_dup 2)) + (clobber (match_dup 3)) + (clobber (match_dup 4)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*delay_cycles_3" + [(unspec_volatile [(match_operand:SI 0 "const_int_operand" "n") + (const_int 3)] + UNSPECV_DELAY_CYCLES) + (set (match_operand:BLK 1 "" "") + (unspec_volatile:BLK [(match_dup 1)] UNSPECV_MEMORY_BARRIER)) + (clobber (match_scratch:QI 2 "=&d")) + (clobber (match_scratch:QI 3 "=&d")) + (clobber (match_scratch:QI 4 "=&d")) + (clobber (reg:CC REG_CC))] + "reload_completed" "ldi %2,lo8(%0) ldi %3,hi8(%0) ldi %4,hlo8(%0) @@ -5950,10 +8102,9 @@ sbci %3,0 sbci %4,0 brne 1b" - [(set_attr "length" "7") - (set_attr "cc" "clobber")]) + [(set_attr "length" "7")]) -(define_insn "delay_cycles_4" +(define_insn_and_split "delay_cycles_4" [(unspec_volatile [(match_operand:SI 0 "const_int_operand" "n") (const_int 4)] 
UNSPECV_DELAY_CYCLES) @@ -5964,6 +8115,31 @@ (clobber (match_scratch:QI 4 "=&d")) (clobber (match_scratch:QI 5 "=&d"))] "" + "#" + "&& reload_completed" + [(parallel [(unspec_volatile [(match_dup 0) + (const_int 4)] + UNSPECV_DELAY_CYCLES) + (set (match_dup 1) + (unspec_volatile:BLK [(match_dup 1)] UNSPECV_MEMORY_BARRIER)) + (clobber (match_dup 2)) + (clobber (match_dup 3)) + (clobber (match_dup 4)) + (clobber (match_dup 5)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*delay_cycles_4" + [(unspec_volatile [(match_operand:SI 0 "const_int_operand" "n") + (const_int 4)] + UNSPECV_DELAY_CYCLES) + (set (match_operand:BLK 1 "" "") + (unspec_volatile:BLK [(match_dup 1)] UNSPECV_MEMORY_BARRIER)) + (clobber (match_scratch:QI 2 "=&d")) + (clobber (match_scratch:QI 3 "=&d")) + (clobber (match_scratch:QI 4 "=&d")) + (clobber (match_scratch:QI 5 "=&d")) + (clobber (reg:CC REG_CC))] + "reload_completed" "ldi %2,lo8(%0) ldi %3,hi8(%0) ldi %4,hlo8(%0) @@ -5973,24 +8149,39 @@ sbci %4,0 sbci %5,0 brne 1b" - [(set_attr "length" "9") - (set_attr "cc" "clobber")]) + [(set_attr "length" "9")]) ;; __builtin_avr_insert_bits -(define_insn "insert_bits" +(define_insn_and_split "insert_bits" [(set (match_operand:QI 0 "register_operand" "=r ,d ,r") (unspec:QI [(match_operand:SI 1 "const_int_operand" "C0f,Cxf,C0f") (match_operand:QI 2 "register_operand" "r ,r ,r") (match_operand:QI 3 "nonmemory_operand" "n ,0 ,0")] UNSPEC_INSERT_BITS))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (unspec:QI [(match_dup 1) + (match_dup 2) + (match_dup 3)] + UNSPEC_INSERT_BITS)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*insert_bits" + [(set (match_operand:QI 0 "register_operand" "=r ,d ,r") + (unspec:QI [(match_operand:SI 1 "const_int_operand" "C0f,Cxf,C0f") + (match_operand:QI 2 "register_operand" "r ,r ,r") + (match_operand:QI 3 "nonmemory_operand" "n ,0 ,0")] + UNSPEC_INSERT_BITS)) + (clobber (reg:CC REG_CC))] + "reload_completed" { return avr_out_insert_bits (operands, NULL); } - [(set_attr "adjust_len" "insert_bits") - (set_attr "cc" "clobber")]) + [(set_attr "adjust_len" "insert_bits")]) ;; __builtin_avr_flash_segment @@ -6001,17 +8192,31 @@ [(set (match_operand:QI 0 "register_operand" "") (subreg:QI (match_operand:PSI 1 "register_operand" "") 2)) - (set (cc0) - (compare (match_dup 0) - (const_int 0))) (set (pc) - (if_then_else (ge (cc0) + (if_then_else (ge (match_dup 0) (const_int 0)) (label_ref (match_operand 2 "" "")) (pc))) (set (match_dup 0) (const_int -1))]) +(define_insn_and_split "*flash_segment1" + [(set (pc) + (if_then_else (ge (match_operand:QI 0 "register_operand" "") + (const_int 0)) + (label_ref (match_operand 1 "" "")) + (pc)))] + "" + "#" + "reload_completed" + [(set (reg:CC REG_CC) + (compare:CC (match_dup 0) (const_int 0))) + (set (pc) + (if_then_else (ge (reg:CC REG_CC) (const_int 0)) + (label_ref (match_dup 1)) + (pc)))] + "") + (define_expand "flash_segment" [(parallel [(match_operand:QI 0 "register_operand" "") (match_operand:PSI 1 "register_operand" "")])] @@ -6092,29 +8297,59 @@ operands[2] = gen_reg_rtx (HImode); }) -(define_insn "*parityhi2.libgcc" +(define_insn_and_split "*parityhi2.libgcc_split" [(set (reg:HI 24) (parity:HI (reg:HI 24)))] "" + "#" + "&& reload_completed" + [(parallel [(set (reg:HI 24) + (parity:HI (reg:HI 24))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*parityhi2.libgcc" + [(set (reg:HI 24) + (parity:HI (reg:HI 24))) + (clobber (reg:CC REG_CC))] + "reload_completed" "%~call __parityhi2" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) 
+ [(set_attr "type" "xcall")]) -(define_insn "*parityqihi2.libgcc" +(define_insn_and_split "*parityqihi2.libgcc_split" [(set (reg:HI 24) (zero_extend:HI (parity:QI (reg:QI 24))))] "" + "#" + "&& reload_completed" + [(parallel [(set (reg:HI 24) + (zero_extend:HI (parity:QI (reg:QI 24)))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*parityqihi2.libgcc" + [(set (reg:HI 24) + (zero_extend:HI (parity:QI (reg:QI 24)))) + (clobber (reg:CC REG_CC))] + "reload_completed" "%~call __parityqi2" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) -(define_insn "*paritysihi2.libgcc" +(define_insn_and_split "*paritysihi2.libgcc_split" [(set (reg:HI 24) (truncate:HI (parity:SI (reg:SI 22))))] "" + "#" + "&& reload_completed" + [(parallel [(set (reg:HI 24) + (truncate:HI (parity:SI (reg:SI 22)))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*paritysihi2.libgcc" + [(set (reg:HI 24) + (truncate:HI (parity:SI (reg:SI 22)))) + (clobber (reg:CC REG_CC))] + "reload_completed" "%~call __paritysi2" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) ;; Popcount @@ -6143,29 +8378,59 @@ operands[2] = gen_reg_rtx (HImode); }) -(define_insn "*popcounthi2.libgcc" +(define_insn_and_split "*popcounthi2.libgcc_split" [(set (reg:HI 24) (popcount:HI (reg:HI 24)))] "" + "#" + "&& reload_completed" + [(parallel [(set (reg:HI 24) + (popcount:HI (reg:HI 24))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*popcounthi2.libgcc" + [(set (reg:HI 24) + (popcount:HI (reg:HI 24))) + (clobber (reg:CC REG_CC))] + "reload_completed" "%~call __popcounthi2" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) -(define_insn "*popcountsi2.libgcc" +(define_insn_and_split "*popcountsi2.libgcc_split" [(set (reg:HI 24) (truncate:HI (popcount:SI (reg:SI 22))))] "" + "#" + "&& reload_completed" + [(parallel [(set (reg:HI 24) + (truncate:HI (popcount:SI (reg:SI 22)))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*popcountsi2.libgcc" + [(set (reg:HI 24) + (truncate:HI (popcount:SI (reg:SI 22)))) + (clobber (reg:CC REG_CC))] + "reload_completed" "%~call __popcountsi2" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) -(define_insn "*popcountqi2.libgcc" +(define_insn_and_split "*popcountqi2.libgcc_split" [(set (reg:QI 24) (popcount:QI (reg:QI 24)))] "" + "#" + "&& reload_completed" + [(parallel [(set (reg:QI 24) + (popcount:QI (reg:QI 24))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*popcountqi2.libgcc" + [(set (reg:QI 24) + (popcount:QI (reg:QI 24))) + (clobber (reg:CC REG_CC))] + "reload_completed" "%~call __popcountqi2" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) (define_insn_and_split "*popcountqihi2.libgcc" [(set (reg:HI 24) @@ -6204,23 +8469,47 @@ operands[2] = gen_reg_rtx (HImode); }) -(define_insn "*clzhi2.libgcc" +(define_insn_and_split "*clzhi2.libgcc_split" [(set (reg:HI 24) (clz:HI (reg:HI 24))) (clobber (reg:QI 26))] "" + "#" + "&& reload_completed" + [(parallel [(set (reg:HI 24) + (clz:HI (reg:HI 24))) + (clobber (reg:QI 26)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*clzhi2.libgcc" + [(set (reg:HI 24) + (clz:HI (reg:HI 24))) + (clobber (reg:QI 26)) + (clobber (reg:CC REG_CC))] + "reload_completed" "%~call __clzhi2" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) -(define_insn "*clzsihi2.libgcc" +(define_insn_and_split "*clzsihi2.libgcc_split" [(set (reg:HI 24) (truncate:HI 
(clz:SI (reg:SI 22)))) (clobber (reg:QI 26))] "" + "#" + "&& reload_completed" + [(parallel [(set (reg:HI 24) + (truncate:HI (clz:SI (reg:SI 22)))) + (clobber (reg:QI 26)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*clzsihi2.libgcc" + [(set (reg:HI 24) + (truncate:HI (clz:SI (reg:SI 22)))) + (clobber (reg:QI 26)) + (clobber (reg:CC REG_CC))] + "reload_completed" "%~call __clzsi2" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) ;; Count Trailing Zeros @@ -6249,24 +8538,50 @@ operands[2] = gen_reg_rtx (HImode); }) -(define_insn "*ctzhi2.libgcc" +(define_insn_and_split "*ctzhi2.libgcc_split" [(set (reg:HI 24) (ctz:HI (reg:HI 24))) (clobber (reg:QI 26))] "" + "#" + "&& reload_completed" + [(parallel [(set (reg:HI 24) + (ctz:HI (reg:HI 24))) + (clobber (reg:QI 26)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*ctzhi2.libgcc" + [(set (reg:HI 24) + (ctz:HI (reg:HI 24))) + (clobber (reg:QI 26)) + (clobber (reg:CC REG_CC))] + "reload_completed" "%~call __ctzhi2" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) -(define_insn "*ctzsihi2.libgcc" +(define_insn_and_split "*ctzsihi2.libgcc_split" [(set (reg:HI 24) (truncate:HI (ctz:SI (reg:SI 22)))) (clobber (reg:QI 22)) (clobber (reg:QI 26))] "" + "#" + "&& reload_completed" + [(parallel [(set (reg:HI 24) + (truncate:HI (ctz:SI (reg:SI 22)))) + (clobber (reg:QI 22)) + (clobber (reg:QI 26)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*ctzsihi2.libgcc" + [(set (reg:HI 24) + (truncate:HI (ctz:SI (reg:SI 22)))) + (clobber (reg:QI 22)) + (clobber (reg:QI 26)) + (clobber (reg:CC REG_CC))] + "reload_completed" "%~call __ctzsi2" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) ;; Find First Set @@ -6295,24 +8610,50 @@ operands[2] = gen_reg_rtx (HImode); }) -(define_insn "*ffshi2.libgcc" +(define_insn_and_split "*ffshi2.libgcc_split" [(set (reg:HI 24) (ffs:HI (reg:HI 24))) (clobber (reg:QI 26))] "" + "#" + "&& reload_completed" + [(parallel [(set (reg:HI 24) + (ffs:HI (reg:HI 24))) + (clobber (reg:QI 26)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*ffshi2.libgcc" + [(set (reg:HI 24) + (ffs:HI (reg:HI 24))) + (clobber (reg:QI 26)) + (clobber (reg:CC REG_CC))] + "reload_completed" "%~call __ffshi2" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) -(define_insn "*ffssihi2.libgcc" +(define_insn_and_split "*ffssihi2.libgcc_split" [(set (reg:HI 24) (truncate:HI (ffs:SI (reg:SI 22)))) (clobber (reg:QI 22)) (clobber (reg:QI 26))] "" + "#" + "&& reload_completed" + [(parallel [(set (reg:HI 24) + (truncate:HI (ffs:SI (reg:SI 22)))) + (clobber (reg:QI 22)) + (clobber (reg:QI 26)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*ffssihi2.libgcc" + [(set (reg:HI 24) + (truncate:HI (ffs:SI (reg:SI 22)))) + (clobber (reg:QI 22)) + (clobber (reg:QI 26)) + (clobber (reg:CC REG_CC))] + "reload_completed" "%~call __ffssi2" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) ;; Copysign @@ -6323,8 +8664,7 @@ UNSPEC_COPYSIGN))] "" "bst %D2,7\;bld %D0,7" - [(set_attr "length" "2") - (set_attr "cc" "none")]) + [(set_attr "length" "2")]) ;; Swap Bytes (change byte-endianness) @@ -6336,13 +8676,23 @@ (set (match_operand:SI 0 "register_operand" "") (reg:SI 22))]) -(define_insn "*bswapsi2.libgcc" +(define_insn_and_split "*bswapsi2.libgcc_split" [(set (reg:SI 22) (bswap:SI (reg:SI 22)))] "" + "#" + "&& reload_completed" + [(parallel [(set (reg:SI 22) + (bswap:SI 
(reg:SI 22))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*bswapsi2.libgcc" + [(set (reg:SI 22) + (bswap:SI (reg:SI 22))) + (clobber (reg:CC REG_CC))] + "reload_completed" "%~call __bswapsi2" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) ;; CPU instructions @@ -6369,8 +8719,7 @@ "@ nop rjmp ." - [(set_attr "length" "1") - (set_attr "cc" "none")]) + [(set_attr "length" "1")]) ;; SLEEP (define_expand "sleep" @@ -6390,8 +8739,7 @@ (unspec_volatile:BLK [(match_dup 0)] UNSPECV_MEMORY_BARRIER))] "" "sleep" - [(set_attr "length" "1") - (set_attr "cc" "none")]) + [(set_attr "length" "1")]) ;; WDR (define_expand "wdr" @@ -6411,8 +8759,7 @@ (unspec_volatile:BLK [(match_dup 0)] UNSPECV_MEMORY_BARRIER))] "" "wdr" - [(set_attr "length" "1") - (set_attr "cc" "none")]) + [(set_attr "length" "1")]) ;; FMUL (define_expand "fmul" @@ -6436,27 +8783,55 @@ avr_fix_inputs (operands, 1 << 2, regmask (QImode, 24)); }) -(define_insn "fmul_insn" +(define_insn_and_split "fmul_insn" [(set (match_operand:HI 0 "register_operand" "=r") (unspec:HI [(match_operand:QI 1 "register_operand" "a") (match_operand:QI 2 "register_operand" "a")] UNSPEC_FMUL))] "AVR_HAVE_MUL" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (unspec:HI [(match_dup 1) + (match_dup 2)] + UNSPEC_FMUL)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*fmul_insn" + [(set (match_operand:HI 0 "register_operand" "=r") + (unspec:HI [(match_operand:QI 1 "register_operand" "a") + (match_operand:QI 2 "register_operand" "a")] + UNSPEC_FMUL)) + (clobber (reg:CC REG_CC))] + "AVR_HAVE_MUL && reload_completed" "fmul %1,%2 movw %0,r0 clr __zero_reg__" - [(set_attr "length" "3") - (set_attr "cc" "clobber")]) + [(set_attr "length" "3")]) -(define_insn "*fmul.call" +(define_insn_and_split "*fmul.call_split" [(set (reg:HI 22) (unspec:HI [(reg:QI 24) (reg:QI 25)] UNSPEC_FMUL)) (clobber (reg:HI 24))] "!AVR_HAVE_MUL" + "#" + "&& reload_completed" + [(parallel [(set (reg:HI 22) + (unspec:HI [(reg:QI 24) + (reg:QI 25)] UNSPEC_FMUL)) + (clobber (reg:HI 24)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*fmul.call" + [(set (reg:HI 22) + (unspec:HI [(reg:QI 24) + (reg:QI 25)] UNSPEC_FMUL)) + (clobber (reg:HI 24)) + (clobber (reg:CC REG_CC))] + "!AVR_HAVE_MUL && reload_completed" "%~call __fmul" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) ;; FMULS (define_expand "fmuls" @@ -6480,27 +8855,55 @@ avr_fix_inputs (operands, 1 << 2, regmask (QImode, 24)); }) -(define_insn "fmuls_insn" +(define_insn_and_split "fmuls_insn" [(set (match_operand:HI 0 "register_operand" "=r") (unspec:HI [(match_operand:QI 1 "register_operand" "a") (match_operand:QI 2 "register_operand" "a")] UNSPEC_FMULS))] "AVR_HAVE_MUL" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (unspec:HI [(match_dup 1) + (match_dup 2)] + UNSPEC_FMULS)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*fmuls_insn" + [(set (match_operand:HI 0 "register_operand" "=r") + (unspec:HI [(match_operand:QI 1 "register_operand" "a") + (match_operand:QI 2 "register_operand" "a")] + UNSPEC_FMULS)) + (clobber (reg:CC REG_CC))] + "AVR_HAVE_MUL && reload_completed" "fmuls %1,%2 movw %0,r0 clr __zero_reg__" - [(set_attr "length" "3") - (set_attr "cc" "clobber")]) + [(set_attr "length" "3")]) -(define_insn "*fmuls.call" +(define_insn_and_split "*fmuls.call_split" [(set (reg:HI 22) (unspec:HI [(reg:QI 24) (reg:QI 25)] UNSPEC_FMULS)) (clobber (reg:HI 24))] "!AVR_HAVE_MUL" + "#" + "&& reload_completed" + [(parallel [(set 
(reg:HI 22) + (unspec:HI [(reg:QI 24) + (reg:QI 25)] UNSPEC_FMULS)) + (clobber (reg:HI 24)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*fmuls.call" + [(set (reg:HI 22) + (unspec:HI [(reg:QI 24) + (reg:QI 25)] UNSPEC_FMULS)) + (clobber (reg:HI 24)) + (clobber (reg:CC REG_CC))] + "!AVR_HAVE_MUL && reload_completed" "%~call __fmuls" - [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + [(set_attr "type" "xcall")]) ;; FMULSU (define_expand "fmulsu" @@ -6524,27 +8927,56 @@ avr_fix_inputs (operands, 1 << 2, regmask (QImode, 24)); }) -(define_insn "fmulsu_insn" +(define_insn_and_split "fmulsu_insn" [(set (match_operand:HI 0 "register_operand" "=r") (unspec:HI [(match_operand:QI 1 "register_operand" "a") (match_operand:QI 2 "register_operand" "a")] UNSPEC_FMULSU))] "AVR_HAVE_MUL" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (unspec:HI [(match_dup 1) + (match_dup 2)] + UNSPEC_FMULSU)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*fmulsu_insn" + [(set (match_operand:HI 0 "register_operand" "=r") + (unspec:HI [(match_operand:QI 1 "register_operand" "a") + (match_operand:QI 2 "register_operand" "a")] + UNSPEC_FMULSU)) + (clobber (reg:CC REG_CC))] + "AVR_HAVE_MUL && reload_completed" "fmulsu %1,%2 movw %0,r0 clr __zero_reg__" - [(set_attr "length" "3") - (set_attr "cc" "clobber")]) + [(set_attr "length" "3")]) -(define_insn "*fmulsu.call" +(define_insn_and_split "*fmulsu.call_split" [(set (reg:HI 22) (unspec:HI [(reg:QI 24) (reg:QI 25)] UNSPEC_FMULSU)) (clobber (reg:HI 24))] "!AVR_HAVE_MUL" + "#" + "&& reload_completed" + [(parallel [(set (reg:HI 22) + (unspec:HI [(reg:QI 24) + (reg:QI 25)] UNSPEC_FMULSU)) + (clobber (reg:HI 24)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*fmulsu.call" + [(set (reg:HI 22) + (unspec:HI [(reg:QI 24) + (reg:QI 25)] UNSPEC_FMULSU)) + (clobber (reg:HI 24)) + (clobber (reg:CC REG_CC))] + "!AVR_HAVE_MUL && reload_completed" "%~call __fmulsu" [(set_attr "type" "xcall") - (set_attr "cc" "clobber")]) + ]) ;; Some combiner patterns dealing with bits. @@ -6561,8 +8993,7 @@ "INTVAL(operands[4]) == exact_log2 (~INTVAL(operands[2]) & GET_MODE_MASK (QImode)) && INTVAL(operands[4]) == exact_log2 (INTVAL(operands[5]) & GET_MODE_MASK (QImode))" "bst %3,0\;bld %0,%4" - [(set_attr "length" "2") - (set_attr "cc" "none")]) + [(set_attr "length" "2")]) ;; Move bit $3.0 into bit $0.$4 ;; Variation of above. Unfortunately, there is no canonicalized representation @@ -6577,8 +9008,7 @@ (match_operand:QI 4 "const_0_to_7_operand" "n"))))] "INTVAL(operands[4]) == exact_log2 (~INTVAL(operands[2]) & GET_MODE_MASK (QImode))" "bst %3,0\;bld %0,%4" - [(set_attr "length" "2") - (set_attr "cc" "none")]) + [(set_attr "length" "2")]) ;; Move bit $3.0 into bit $0.0. ;; For bit 0, combiner generates slightly different pattern. @@ -6590,8 +9020,7 @@ (const_int 1))))] "0 == exact_log2 (~INTVAL(operands[2]) & GET_MODE_MASK (QImode))" "bst %3,0\;bld %0,0" - [(set_attr "length" "2") - (set_attr "cc" "none")]) + [(set_attr "length" "2")]) ;; Move bit $2.0 into bit $0.7. ;; For bit 7, combiner generates slightly different pattern @@ -6603,8 +9032,7 @@ (const_int 7))))] "" "bst %2,0\;bld %0,7" - [(set_attr "length" "2") - (set_attr "cc" "none")]) + [(set_attr "length" "2")]) ;; Combiner transforms above four pattern into ZERO_EXTRACT if it sees MEM ;; and input/output match. 
We provide a special pattern for this, because @@ -6620,8 +9048,7 @@ cbi %i0,%1 sbi %i0,%1 sbrc %2,0\;sbi %i0,%1\;sbrs %2,0\;cbi %i0,%1" - [(set_attr "length" "1,1,4") - (set_attr "cc" "none")]) + [(set_attr "length" "1,1,4")]) (define_insn "*insv.not.io" [(set (zero_extract:QI (mem:QI (match_operand 0 "low_io_address_operand" "i")) @@ -6630,8 +9057,7 @@ (not:QI (match_operand:QI 2 "register_operand" "r")))] "" "sbrs %2,0\;sbi %i0,%1\;sbrc %2,0\;cbi %i0,%1" - [(set_attr "length" "4") - (set_attr "cc" "none")]) + [(set_attr "length" "4")]) ;; The insv expander. ;; We only support 1-bit inserts @@ -6648,20 +9074,34 @@ ;; complicated. ;; Insert bit $2.0 into $0.$1 -(define_insn "*insv.reg" +(define_insn_and_split "*insv.reg_split" [(set (zero_extract:QI (match_operand:QI 0 "register_operand" "+r,d,d,l,l") (const_int 1) (match_operand:QI 1 "const_0_to_7_operand" "n,n,n,n,n")) (match_operand:QI 2 "nonmemory_operand" "r,L,P,L,P"))] "" + "#" + "&& reload_completed" + [(parallel [(set (zero_extract:QI (match_dup 0) + (const_int 1) + (match_dup 1)) + (match_dup 2)) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*insv.reg" + [(set (zero_extract:QI (match_operand:QI 0 "register_operand" "+r,d,d,l,l") + (const_int 1) + (match_operand:QI 1 "const_0_to_7_operand" "n,n,n,n,n")) + (match_operand:QI 2 "nonmemory_operand" "r,L,P,L,P")) + (clobber (reg:CC REG_CC))] + "reload_completed" "@ bst %2,0\;bld %0,%1 andi %0,lo8(~(1<<%1)) ori %0,lo8(1<<%1) clt\;bld %0,%1 set\;bld %0,%1" - [(set_attr "length" "2,1,1,2,2") - (set_attr "cc" "none,set_zn,set_zn,none,none")]) + [(set_attr "length" "2,1,1,2,2")]) ;; Insert bit $2.$3 into $0.$1 (define_insn "*insv.extract" @@ -6673,8 +9113,7 @@ (match_operand:QI 3 "const_0_to_7_operand" "n")))] "" "bst %2,%3\;bld %0,%1" - [(set_attr "length" "2") - (set_attr "cc" "none")]) + [(set_attr "length" "2")]) ;; Insert bit $2.$3 into $0.$1 (define_insn "*insv.shiftrt" @@ -6685,67 +9124,128 @@ (match_operand:QI 3 "const_0_to_7_operand" "n")))] "" "bst %2,%3\;bld %0,%1" - [(set_attr "length" "2") - (set_attr "cc" "none")]) + [(set_attr "length" "2")]) ;; Same, but with a NOT inverting the source bit. 
;; Insert bit ~$2.$3 into $0.$1 -(define_insn "*insv.not-shiftrt" +(define_insn_and_split "*insv.not-shiftrt_split" [(set (zero_extract:QI (match_operand:QI 0 "register_operand" "+r") (const_int 1) (match_operand:QI 1 "const_0_to_7_operand" "n")) (not:QI (any_shiftrt:QI (match_operand:QI 2 "register_operand" "r") (match_operand:QI 3 "const_0_to_7_operand" "n"))))] "" + "#" + "&& reload_completed" + [(parallel [(set (zero_extract:QI (match_dup 0) + (const_int 1) + (match_dup 1)) + (not:QI (any_shiftrt:QI (match_dup 2) + (match_dup 3)))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*insv.not-shiftrt" + [(set (zero_extract:QI (match_operand:QI 0 "register_operand" "+r") + (const_int 1) + (match_operand:QI 1 "const_0_to_7_operand" "n")) + (not:QI (any_shiftrt:QI (match_operand:QI 2 "register_operand" "r") + (match_operand:QI 3 "const_0_to_7_operand" "n")))) + (clobber (reg:CC REG_CC))] + "reload_completed" { return avr_out_insert_notbit (insn, operands, NULL_RTX, NULL); } - [(set_attr "adjust_len" "insv_notbit") - (set_attr "cc" "clobber")]) + [(set_attr "adjust_len" "insv_notbit")]) ;; Insert bit ~$2.0 into $0.$1 -(define_insn "*insv.xor1-bit.0" +(define_insn_and_split "*insv.xor1-bit.0_split" [(set (zero_extract:QI (match_operand:QI 0 "register_operand" "+r") (const_int 1) (match_operand:QI 1 "const_0_to_7_operand" "n")) (xor:QI (match_operand:QI 2 "register_operand" "r") (const_int 1)))] "" + "#" + "&& reload_completed" + [(parallel [(set (zero_extract:QI (match_dup 0) + (const_int 1) + (match_dup 1)) + (xor:QI (match_dup 2) + (const_int 1))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*insv.xor1-bit.0" + [(set (zero_extract:QI (match_operand:QI 0 "register_operand" "+r") + (const_int 1) + (match_operand:QI 1 "const_0_to_7_operand" "n")) + (xor:QI (match_operand:QI 2 "register_operand" "r") + (const_int 1))) + (clobber (reg:CC REG_CC))] + "reload_completed" { return avr_out_insert_notbit (insn, operands, const0_rtx, NULL); } - [(set_attr "adjust_len" "insv_notbit_0") - (set_attr "cc" "clobber")]) + [(set_attr "adjust_len" "insv_notbit_0")]) ;; Insert bit ~$2.0 into $0.$1 -(define_insn "*insv.not-bit.0" +(define_insn_and_split "*insv.not-bit.0_split" [(set (zero_extract:QI (match_operand:QI 0 "register_operand" "+r") (const_int 1) (match_operand:QI 1 "const_0_to_7_operand" "n")) (not:QI (match_operand:QI 2 "register_operand" "r")))] "" + "#" + "&& reload_completed" + [(parallel [(set (zero_extract:QI (match_dup 0) + (const_int 1) + (match_dup 1)) + (not:QI (match_dup 2))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*insv.not-bit.0" + [(set (zero_extract:QI (match_operand:QI 0 "register_operand" "+r") + (const_int 1) + (match_operand:QI 1 "const_0_to_7_operand" "n")) + (not:QI (match_operand:QI 2 "register_operand" "r"))) + (clobber (reg:CC REG_CC))] + "reload_completed" { return avr_out_insert_notbit (insn, operands, const0_rtx, NULL); } - [(set_attr "adjust_len" "insv_notbit_0") - (set_attr "cc" "clobber")]) + [(set_attr "adjust_len" "insv_notbit_0")]) ;; Insert bit ~$2.7 into $0.$1 -(define_insn "*insv.not-bit.7" +(define_insn_and_split "*insv.not-bit.7_split" [(set (zero_extract:QI (match_operand:QI 0 "register_operand" "+r") (const_int 1) (match_operand:QI 1 "const_0_to_7_operand" "n")) (ge:QI (match_operand:QI 2 "register_operand" "r") (const_int 0)))] "" + "#" + "&& reload_completed" + [(parallel [(set (zero_extract:QI (match_dup 0) + (const_int 1) + (match_dup 1)) + (ge:QI (match_dup 2) + (const_int 0))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*insv.not-bit.7" + 
[(set (zero_extract:QI (match_operand:QI 0 "register_operand" "+r") + (const_int 1) + (match_operand:QI 1 "const_0_to_7_operand" "n")) + (ge:QI (match_operand:QI 2 "register_operand" "r") + (const_int 0))) + (clobber (reg:CC REG_CC))] + "reload_completed" { return avr_out_insert_notbit (insn, operands, GEN_INT (7), NULL); } - [(set_attr "adjust_len" "insv_notbit_7") - (set_attr "cc" "clobber")]) + [(set_attr "adjust_len" "insv_notbit_7")]) ;; Insert bit ~$2.$3 into $0.$1 -(define_insn "*insv.xor-extract" +(define_insn_and_split "*insv.xor-extract_split" [(set (zero_extract:QI (match_operand:QI 0 "register_operand" "+r") (const_int 1) (match_operand:QI 1 "const_0_to_7_operand" "n")) @@ -6754,11 +9254,31 @@ (const_int 1) (match_operand:QI 3 "const_0_to_7_operand" "n")))] "INTVAL (operands[4]) & (1 << INTVAL (operands[3]))" + "#" + "&& reload_completed" + [(parallel [(set (zero_extract:QI (match_dup 0) + (const_int 1) + (match_dup 1)) + (any_extract:QI (xor:QI (match_dup 2) + (match_dup 4)) + (const_int 1) + (match_dup 3))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*insv.xor-extract" + [(set (zero_extract:QI (match_operand:QI 0 "register_operand" "+r") + (const_int 1) + (match_operand:QI 1 "const_0_to_7_operand" "n")) + (any_extract:QI (xor:QI (match_operand:QI 2 "register_operand" "r") + (match_operand:QI 4 "const_int_operand" "n")) + (const_int 1) + (match_operand:QI 3 "const_0_to_7_operand" "n"))) + (clobber (reg:CC REG_CC))] + "INTVAL (operands[4]) & (1 << INTVAL (operands[3])) && reload_completed" { return avr_out_insert_notbit (insn, operands, NULL_RTX, NULL); } - [(set_attr "adjust_len" "insv_notbit") - (set_attr "cc" "clobber")]) + [(set_attr "adjust_len" "insv_notbit")]) ;; Some combine patterns that try to fix bad code when a value is composed @@ -6887,20 +9407,34 @@ (match_operand:QI 2 "const1_operand" "") (match_operand:QI 3 "const_0_to_7_operand" "")))]) -(define_insn "*extzv" +(define_insn_and_split "*extzv_split" [(set (match_operand:QI 0 "register_operand" "=*d,*d,*d,*d,r") (zero_extract:QI (match_operand:QI 1 "register_operand" "0,r,0,0,r") (const_int 1) (match_operand:QI 2 "const_0_to_7_operand" "L,L,P,C04,n")))] "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) + (zero_extract:QI (match_dup 1) + (const_int 1) + (match_dup 2))) + (clobber (reg:CC REG_CC))])]) + +(define_insn "*extzv" + [(set (match_operand:QI 0 "register_operand" "=*d,*d,*d,*d,r") + (zero_extract:QI (match_operand:QI 1 "register_operand" "0,r,0,0,r") + (const_int 1) + (match_operand:QI 2 "const_0_to_7_operand" "L,L,P,C04,n"))) + (clobber (reg:CC REG_CC))] + "reload_completed" "@ andi %0,1 mov %0,%1\;andi %0,1 lsr %0\;andi %0,1 swap %0\;andi %0,1 bst %1,%2\;clr %0\;bld %0,0" - [(set_attr "length" "1,2,2,2,3") - (set_attr "cc" "set_zn,set_zn,set_zn,set_zn,clobber")]) + [(set_attr "length" "1,2,2,2,3")]) (define_insn_and_split "*extzv.qihi1" [(set (match_operand:HI 0 "register_operand" "=r") -- cgit v1.1 From a53b8229e64c78256449005929e599b2eab83fbd Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Mon, 8 Feb 2021 11:37:29 +0000 Subject: aarch64: Use RTL builtins for vq[r]dmulh[q]_n intrinsics Rewrite vq[r]dmulh[q]_n Neon intrinsics to use RTL builtins rather than inline assembly code, allowing for better scheduling and optimization. gcc/ChangeLog: 2021-02-08 Jonathan Wright * config/aarch64/aarch64-simd-builtins.def: Add sq[r]dmulh_n builtin generator macros. * config/aarch64/aarch64-simd.md (aarch64_sqdmulh_n): Define. 
* config/aarch64/arm_neon.h (vqdmulh_n_s16): Use RTL builtin instead of inline asm. (vqdmulh_n_s32): Likewise. (vqdmulhq_n_s16): Likewise. (vqdmulhq_n_s32): Likewise. (vqrdmulh_n_s16): Likewise. (vqrdmulh_n_s32): Likewise. (vqrdmulhq_n_s16): Likewise. (vqrdmulhq_n_s32): Likewise. --- gcc/config/aarch64/aarch64-simd-builtins.def | 3 ++ gcc/config/aarch64/aarch64-simd.md | 12 ++++++ gcc/config/aarch64/arm_neon.h | 56 ++++------------------------ 3 files changed, 23 insertions(+), 48 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index b885bd5..f79e716 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -348,6 +348,9 @@ /* Implemented by aarch64_sqdmulh. */ BUILTIN_VSDQ_HSI (BINOP, sqdmulh, 0, NONE) BUILTIN_VSDQ_HSI (BINOP, sqrdmulh, 0, NONE) + /* Implemented by aarch64_sqdmulh_n. */ + BUILTIN_VDQHS (BINOP, sqdmulh_n, 0, NONE) + BUILTIN_VDQHS (BINOP, sqrdmulh_n, 0, NONE) /* Implemented by aarch64_sqdmulh_lane. */ BUILTIN_VSDQ_HSI (TERNOP_LANE, sqdmulh_lane, 0, NONE) BUILTIN_VSDQ_HSI (TERNOP_LANE, sqdmulh_laneq, 0, NONE) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 4edee99..5245cf0 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -4639,6 +4639,18 @@ [(set_attr "type" "neon_sat_mul_")] ) +(define_insn "aarch64_sqdmulh_n" + [(set (match_operand:VDQHS 0 "register_operand" "=w") + (unspec:VDQHS + [(match_operand:VDQHS 1 "register_operand" "w") + (vec_duplicate:VDQHS + (match_operand: 2 "register_operand" ""))] + VQDMULH))] + "TARGET_SIMD" + "sqdmulh\\t%0., %1., %2.[0]" + [(set_attr "type" "neon_sat_mul__scalar")] +) + ;; sqdmulh_lane (define_insn "aarch64_sqdmulh_lane" diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index baa30bd..5fb2b3d 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -8769,48 +8769,28 @@ __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqdmulh_n_s16 (int16x4_t __a, int16_t __b) { - int16x4_t __result; - __asm__ ("sqdmulh %0.4h,%1.4h,%2.h[0]" - : "=w"(__result) - : "w"(__a), "x"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_sqdmulh_nv4hi (__a, __b); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqdmulh_n_s32 (int32x2_t __a, int32_t __b) { - int32x2_t __result; - __asm__ ("sqdmulh %0.2s,%1.2s,%2.s[0]" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_sqdmulh_nv2si (__a, __b); } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqdmulhq_n_s16 (int16x8_t __a, int16_t __b) { - int16x8_t __result; - __asm__ ("sqdmulh %0.8h,%1.8h,%2.h[0]" - : "=w"(__result) - : "w"(__a), "x"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_sqdmulh_nv8hi (__a, __b); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqdmulhq_n_s32 (int32x4_t __a, int32_t __b) { - int32x4_t __result; - __asm__ ("sqdmulh %0.4s,%1.4s,%2.s[0]" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_sqdmulh_nv4si (__a, __b); } __extension__ extern __inline int8x16_t @@ -8880,48 +8860,28 @@ __extension__ extern __inline 
int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqrdmulh_n_s16 (int16x4_t __a, int16_t __b) { - int16x4_t __result; - __asm__ ("sqrdmulh %0.4h,%1.4h,%2.h[0]" - : "=w"(__result) - : "w"(__a), "x"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_sqrdmulh_nv4hi (__a, __b); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqrdmulh_n_s32 (int32x2_t __a, int32_t __b) { - int32x2_t __result; - __asm__ ("sqrdmulh %0.2s,%1.2s,%2.s[0]" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_sqrdmulh_nv2si (__a, __b); } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqrdmulhq_n_s16 (int16x8_t __a, int16_t __b) { - int16x8_t __result; - __asm__ ("sqrdmulh %0.8h,%1.8h,%2.h[0]" - : "=w"(__result) - : "w"(__a), "x"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_sqrdmulh_nv8hi (__a, __b); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqrdmulhq_n_s32 (int32x4_t __a, int32_t __b) { - int32x4_t __result; - __asm__ ("sqrdmulh %0.4s,%1.4s,%2.s[0]" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_sqrdmulh_nv4si (__a, __b); } __extension__ extern __inline int8x16_t -- cgit v1.1 From eb2b36024c94bc32465777927092cdbdf2d95204 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Mon, 8 Feb 2021 16:50:30 +0000 Subject: aarch64: Use RTL builtins for vpaddq intrinsics Rewrite vpaddq Neon intrinsics to use RTL builtins rather than inline assembly code, allowing for better scheduling and optimization. gcc/ChangeLog: 2021-02-08 Jonathan Wright * config/aarch64/aarch64-simd-builtins.def: Use VDQ_I iterator for aarch64_addp builtin macro generator. * config/aarch64/aarch64-simd.md: Use VDQ_I iterator in aarch64_addp RTL pattern. * config/aarch64/arm_neon.h (vpaddq_s8): Use RTL builtin instead of inline asm. (vpaddq_s16): Likewise. (vpaddq_s32): Likewise. (vpaddq_s64): Likewise. (vpaddq_u8): Likewise. (vpaddq_u16): Likewise. (vpaddq_u32): Likewise. (vpaddq_u64): Likewise. 
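As a quick illustration (a sketch, not part of the patch — the function name and the expected codegen below are my own assumptions, not verified output), a caller of one of the rewritten intrinsics now goes through the builtin, so the compiler sees a real operation instead of an opaque asm block and can schedule or combine it with surrounding code:

    #include <arm_neon.h>

    /* Hypothetical example, not taken from the patch.  */
    int32x4_t
    sum_pairs (int32x4_t __a, int32x4_t __b)
    {
      /* After this change vpaddq_s32 expands to
         __builtin_aarch64_addpv4si rather than inline asm.  */
      return vpaddq_s32 (__a, __b);
    }

which I would still expect to compile to a single instruction at -O2, e.g.:

    addp    v0.4s, v0.4s, v1.4s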
--- gcc/config/aarch64/aarch64-simd-builtins.def | 2 +- gcc/config/aarch64/aarch64-simd.md | 8 ++-- gcc/config/aarch64/arm_neon.h | 60 ++++++---------------------- 3 files changed, 17 insertions(+), 53 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index f79e716..92804e0 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -48,7 +48,7 @@ BUILTIN_VB (BINOP, pmul, 0, NONE) BUILTIN_VHSDF_HSDF (BINOP, fmulx, 0, FP) BUILTIN_VHSDF_DF (UNOP, sqrt, 2, FP) - BUILTIN_VD_BHSI (BINOP, addp, 0, NONE) + BUILTIN_VDQ_I (BINOP, addp, 0, NONE) VAR1 (UNOP, addp, 0, NONE, di) BUILTIN_VDQ_BHSI (UNOP, clrsb, 2, NONE) BUILTIN_VDQ_BHSI (UNOP, clz, 2, NONE) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 5245cf0..60e11c6 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -6004,10 +6004,10 @@ ;; addp (define_insn "aarch64_addp" - [(set (match_operand:VD_BHSI 0 "register_operand" "=w") - (unspec:VD_BHSI - [(match_operand:VD_BHSI 1 "register_operand" "w") - (match_operand:VD_BHSI 2 "register_operand" "w")] + [(set (match_operand:VDQ_I 0 "register_operand" "=w") + (unspec:VDQ_I + [(match_operand:VDQ_I 1 "register_operand" "w") + (match_operand:VDQ_I 2 "register_operand" "w")] UNSPEC_ADDP))] "TARGET_SIMD" "addp\t%0, %1, %2" diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 5fb2b3d..52f3714 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -8673,96 +8673,60 @@ __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpaddq_s8 (int8x16_t __a, int8x16_t __b) { - int8x16_t __result; - __asm__ ("addp %0.16b,%1.16b,%2.16b" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_addpv16qi (__a, __b); } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpaddq_s16 (int16x8_t __a, int16x8_t __b) { - int16x8_t __result; - __asm__ ("addp %0.8h,%1.8h,%2.8h" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_addpv8hi (__a, __b); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpaddq_s32 (int32x4_t __a, int32x4_t __b) { - int32x4_t __result; - __asm__ ("addp %0.4s,%1.4s,%2.4s" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_addpv4si (__a, __b); } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpaddq_s64 (int64x2_t __a, int64x2_t __b) { - int64x2_t __result; - __asm__ ("addp %0.2d,%1.2d,%2.2d" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_addpv2di (__a, __b); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpaddq_u8 (uint8x16_t __a, uint8x16_t __b) { - uint8x16_t __result; - __asm__ ("addp %0.16b,%1.16b,%2.16b" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return (uint8x16_t) __builtin_aarch64_addpv16qi ((int8x16_t) __a, + (int8x16_t) __b); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpaddq_u16 
(uint16x8_t __a, uint16x8_t __b) { - uint16x8_t __result; - __asm__ ("addp %0.8h,%1.8h,%2.8h" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return (uint16x8_t) __builtin_aarch64_addpv8hi ((int16x8_t) __a, + (int16x8_t) __b); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpaddq_u32 (uint32x4_t __a, uint32x4_t __b) { - uint32x4_t __result; - __asm__ ("addp %0.4s,%1.4s,%2.4s" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return (uint32x4_t) __builtin_aarch64_addpv4si ((int32x4_t) __a, + (int32x4_t) __b); } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpaddq_u64 (uint64x2_t __a, uint64x2_t __b) { - uint64x2_t __result; - __asm__ ("addp %0.2d,%1.2d,%2.2d" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return (uint64x2_t) __builtin_aarch64_addpv2di ((int64x2_t) __a, + (int64x2_t) __b); } __extension__ extern __inline int16x4_t -- cgit v1.1 From fa18085a32df06be6e7d899fd804d537c0149baf Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Mon, 8 Feb 2021 21:23:48 +0000 Subject: aarch64: Use RTL builtins for [su]paddl[q] intrinsics Rewrite [su]paddl[q] Neon intrinsics to use RTL builtins rather than inline assembly code, allowing for better scheduling and optimization. gcc/ChangeLog: 2021-02-08 Jonathan Wright * config/aarch64/aarch64-simd-builtins.def: Add [su]addlp builtin generator macros. * config/aarch64/aarch64-simd.md (aarch64_addlp): Define. * config/aarch64/arm_neon.h (vpaddl_s8): Use RTL builtin instead of inline asm. (vpaddl_s16): Likewise. (vpaddl_s32): Likewise. (vpaddl_u8): Likewise. (vpaddl_u16): Likewise. (vpaddl_u32): Likewise. (vpaddlq_s8): Likewise. (vpaddlq_s16): Likewise. (vpaddlq_s32): Likewise. (vpaddlq_u8): Likewise. (vpaddlq_u16): Likewise. (vpaddlq_u32): Liwewise. * config/aarch64/iterators.md: Define [SU]ADDLP unspecs with appropriate attributes. --- gcc/config/aarch64/aarch64-simd-builtins.def | 4 ++ gcc/config/aarch64/aarch64-simd.md | 9 +++ gcc/config/aarch64/arm_neon.h | 84 ++++------------------------ gcc/config/aarch64/iterators.md | 6 ++ 4 files changed, 31 insertions(+), 72 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index 92804e0..ecf8019 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -154,6 +154,10 @@ BUILTIN_VDQ_BHSI (BINOP, srhadd, 0, NONE) BUILTIN_VDQ_BHSI (BINOP, urhadd, 0, NONE) + /* Implemented by aarch64_addlp. */ + BUILTIN_VDQV_L (UNOP, saddlp, 0, NONE) + BUILTIN_VDQV_L (UNOPU, uaddlp, 0, NONE) + /* Implemented by aarch64_addlv. */ BUILTIN_VDQV_L (UNOP, saddlv, 0, NONE) BUILTIN_VDQV_L (UNOPU, uaddlv, 0, NONE) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 60e11c6..8aae6a6 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -3149,6 +3149,15 @@ [(set_attr "type" "neon_reduc_add")] ) +(define_insn "aarch64_addlp" + [(set (match_operand: 0 "register_operand" "=w") + (unspec: [(match_operand:VDQV_L 1 "register_operand" "w")] + USADDLP))] + "TARGET_SIMD" + "addlp\\t%0., %1." + [(set_attr "type" "neon_reduc_add")] +) + ;; ADDV with result zero-extended to SI/DImode (for popcount). 
(define_insn "aarch64_zero_extend_reduc_plus_" [(set (match_operand:GPI 0 "register_operand" "=w") diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 52f3714..7eed6c6 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -8529,144 +8529,84 @@ __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpaddl_s8 (int8x8_t __a) { - int16x4_t __result; - __asm__ ("saddlp %0.4h,%1.8b" - : "=w"(__result) - : "w"(__a) - : /* No clobbers */); - return __result; + return __builtin_aarch64_saddlpv8qi (__a); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpaddl_s16 (int16x4_t __a) { - int32x2_t __result; - __asm__ ("saddlp %0.2s,%1.4h" - : "=w"(__result) - : "w"(__a) - : /* No clobbers */); - return __result; + return __builtin_aarch64_saddlpv4hi (__a); } __extension__ extern __inline int64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpaddl_s32 (int32x2_t __a) { - int64x1_t __result; - __asm__ ("saddlp %0.1d,%1.2s" - : "=w"(__result) - : "w"(__a) - : /* No clobbers */); - return __result; + return (int64x1_t) __builtin_aarch64_saddlpv2si (__a); } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpaddl_u8 (uint8x8_t __a) { - uint16x4_t __result; - __asm__ ("uaddlp %0.4h,%1.8b" - : "=w"(__result) - : "w"(__a) - : /* No clobbers */); - return __result; + return __builtin_aarch64_uaddlpv8qi_uu (__a); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpaddl_u16 (uint16x4_t __a) { - uint32x2_t __result; - __asm__ ("uaddlp %0.2s,%1.4h" - : "=w"(__result) - : "w"(__a) - : /* No clobbers */); - return __result; + return __builtin_aarch64_uaddlpv4hi_uu (__a); } __extension__ extern __inline uint64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpaddl_u32 (uint32x2_t __a) { - uint64x1_t __result; - __asm__ ("uaddlp %0.1d,%1.2s" - : "=w"(__result) - : "w"(__a) - : /* No clobbers */); - return __result; + return (uint64x1_t) __builtin_aarch64_uaddlpv2si_uu (__a); } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpaddlq_s8 (int8x16_t __a) { - int16x8_t __result; - __asm__ ("saddlp %0.8h,%1.16b" - : "=w"(__result) - : "w"(__a) - : /* No clobbers */); - return __result; + return __builtin_aarch64_saddlpv16qi (__a); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpaddlq_s16 (int16x8_t __a) { - int32x4_t __result; - __asm__ ("saddlp %0.4s,%1.8h" - : "=w"(__result) - : "w"(__a) - : /* No clobbers */); - return __result; + return __builtin_aarch64_saddlpv8hi (__a); } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpaddlq_s32 (int32x4_t __a) { - int64x2_t __result; - __asm__ ("saddlp %0.2d,%1.4s" - : "=w"(__result) - : "w"(__a) - : /* No clobbers */); - return __result; + return __builtin_aarch64_saddlpv4si (__a); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpaddlq_u8 (uint8x16_t __a) { - uint16x8_t __result; - __asm__ ("uaddlp %0.8h,%1.16b" - : "=w"(__result) - : "w"(__a) - : /* No clobbers */); - return __result; + return __builtin_aarch64_uaddlpv16qi_uu (__a); } __extension__ extern __inline uint32x4_t __attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) vpaddlq_u16 (uint16x8_t __a) { - uint32x4_t __result; - __asm__ ("uaddlp %0.4s,%1.8h" - : "=w"(__result) - : "w"(__a) - : /* No clobbers */); - return __result; + return __builtin_aarch64_uaddlpv8hi_uu (__a); } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpaddlq_u32 (uint32x4_t __a) { - uint64x2_t __result; - __asm__ ("uaddlp %0.2d,%1.4s" - : "=w"(__result) - : "w"(__a) - : /* No clobbers */); - return __result; + return __builtin_aarch64_uaddlpv4si_uu (__a); } __extension__ extern __inline int8x16_t diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index a3d895a..8a765ea 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -550,6 +550,8 @@ UNSPEC_SSHLL ; Used in aarch64-simd.md. UNSPEC_USHLL ; Used in aarch64-simd.md. UNSPEC_ADDP ; Used in aarch64-simd.md. + UNSPEC_SADDLP ; Used in aarch64-simd.md. + UNSPEC_UADDLP ; Used in aarch64-simd.md. UNSPEC_TBL ; Used in vector permute patterns. UNSPEC_TBX ; Used in vector permute patterns. UNSPEC_CONCAT ; Used in vector permute patterns. @@ -2209,6 +2211,8 @@ (define_int_iterator SVE_INT_ADDV [UNSPEC_SADDV UNSPEC_UADDV]) +(define_int_iterator USADDLP [UNSPEC_SADDLP UNSPEC_UADDLP]) + (define_int_iterator USADDLV [UNSPEC_SADDLV UNSPEC_UADDLV]) (define_int_iterator LOGICALF [UNSPEC_ANDF UNSPEC_IORF UNSPEC_XORF]) @@ -2961,6 +2965,8 @@ ;; "s" for signed operations and "u" for unsigned ones. (define_int_attr su [(UNSPEC_SADDV "s") (UNSPEC_UADDV "u") + (UNSPEC_SADDLP "s") + (UNSPEC_UADDLP "u") (UNSPEC_SADDLV "s") (UNSPEC_UADDLV "u") (UNSPEC_UNPACKSHI "s") -- cgit v1.1 From 8e7f6e03955244827a513777e4845c98e130319d Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Tue, 9 Feb 2021 01:14:00 +0000 Subject: aarch64: Use RTL builtins for vpadal_[su]32 intrinsics Rewrite vpadal_[su]32 Neon intrinsics to use RTL builtins rather than inline assembly code, allowing for better scheduling and optimization. gcc/ChangeLog: 2021-02-09 Jonathan Wright * config/aarch64/aarch64-simd-builtins.def: Use VDQV_L iterator to generate [su]adalp RTL builtins. * config/aarch64/aarch64-simd.md: Use VDQV_L iterator in [su]adalp RTL pattern. * config/aarch64/arm_neon.h (vpadal_s32): Use RTL builtin instead of inline asm. (vpadal_u32): Likewise. --- gcc/config/aarch64/aarch64-simd-builtins.def | 4 ++-- gcc/config/aarch64/aarch64-simd.md | 4 ++-- gcc/config/aarch64/arm_neon.h | 14 ++------------ 3 files changed, 6 insertions(+), 16 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index ecf8019..202f690 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -170,8 +170,8 @@ BUILTIN_VDQ_BHSI (TERNOP, saba, 0, NONE) BUILTIN_VDQ_BHSI (TERNOPU, uaba, 0, NONE) - BUILTIN_VDQV_S (BINOP, sadalp, 0, NONE) - BUILTIN_VDQV_S (BINOPU, uadalp, 0, NONE) + BUILTIN_VDQV_L (BINOP, sadalp, 0, NONE) + BUILTIN_VDQV_L (BINOPU, uadalp, 0, NONE) /* Implemented by aarch64_abal. 
*/ BUILTIN_VD_BHSI (TERNOP, sabal, 0, NONE) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 8aae6a6..565ce5a 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -904,8 +904,8 @@ (define_insn "aarch64_adalp" [(set (match_operand: 0 "register_operand" "=w") - (unspec: [(match_operand:VDQV_S 2 "register_operand" "w") - (match_operand: 1 "register_operand" "0")] + (unspec: [(match_operand:VDQV_L 2 "register_operand" "w") + (match_operand: 1 "register_operand" "0")] ADALP))] "TARGET_SIMD" "adalp\t%0., %2." diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 7eed6c6..164c76d 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -8449,12 +8449,7 @@ __extension__ extern __inline int64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpadal_s32 (int64x1_t __a, int32x2_t __b) { - int64x1_t __result; - __asm__ ("sadalp %0.1d,%2.2s" - : "=w"(__result) - : "0"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return (int64x1_t) __builtin_aarch64_sadalpv2si (__a[0], __b); } __extension__ extern __inline uint16x4_t @@ -8475,12 +8470,7 @@ __extension__ extern __inline uint64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpadal_u32 (uint64x1_t __a, uint32x2_t __b) { - uint64x1_t __result; - __asm__ ("uadalp %0.1d,%2.2s" - : "=w"(__result) - : "0"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return (uint64x1_t) __builtin_aarch64_uadalpv2si_uuu (__a[0], __b); } __extension__ extern __inline int16x8_t -- cgit v1.1 From 6372b05e5b14f27ddce11c28654956c1ad715dac Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Wed, 10 Feb 2021 11:39:39 +0000 Subject: aarch64: Use RTL builtins for polynomial vsli[q]_n intrinsics Rewrite vsli[q]_n_p* Neon intrinsics to use RTL builtins rather than inline assembly code, allowing for better scheduling and optimization. gcc/ChangeLog: 2021-02-10 Jonathan Wright * config/aarch64/aarch64-simd-builtins.def: Use VALLP mode iterator for polynomial ssli_n builtin generator macro. * config/aarch64/arm_neon.h (vsli_n_p8): Use RTL builtin instead of inline asm. (vsli_n_p16): Likewise. (vsliq_n_p8): Likewise. (vsliq_n_p16): Likewise. * config/aarch64/iterators.md: Define VALLP mode iterator. --- gcc/config/aarch64/aarch64-simd-builtins.def | 2 +- gcc/config/aarch64/arm_neon.h | 72 ++++++++++------------------ gcc/config/aarch64/iterators.md | 3 ++ 3 files changed, 28 insertions(+), 49 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index 202f690..5349791 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -436,7 +436,7 @@ BUILTIN_VSDQ_I_DI (SHIFTINSERT, ssri_n, 0, NONE) BUILTIN_VSDQ_I_DI (USHIFTACC, usri_n, 0, NONE) BUILTIN_VSDQ_I_DI (SHIFTINSERT, ssli_n, 0, NONE) - VAR2 (SHIFTINSERTP, ssli_n, 0, NONE, di, v2di) + BUILTIN_VALLP (SHIFTINSERTP, ssli_n, 0, NONE) BUILTIN_VSDQ_I_DI (USHIFTACC, usli_n, 0, NONE) /* Implemented by aarch64_qshl_n. 
*/ BUILTIN_VSDQ_I (SHIFTIMM_USS, sqshlu_n, 0, NONE) diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 164c76d..38a3a3f 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -9050,57 +9050,33 @@ vshrn_high_n_u64 (uint32x2_t __a, uint64x2_t __b, const int __c) __builtin_aarch64_shrn2v2di ((int32x2_t) __a, (int64x2_t) __b, __c); } -#define vsli_n_p8(a, b, c) \ - __extension__ \ - ({ \ - poly8x8_t b_ = (b); \ - poly8x8_t a_ = (a); \ - poly8x8_t result; \ - __asm__ ("sli %0.8b,%2.8b,%3" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsli_n_p8 (poly8x8_t __a, poly8x8_t __b, const int __c) +{ + return __builtin_aarch64_ssli_nv8qi_ppps (__a, __b, __c); +} -#define vsli_n_p16(a, b, c) \ - __extension__ \ - ({ \ - poly16x4_t b_ = (b); \ - poly16x4_t a_ = (a); \ - poly16x4_t result; \ - __asm__ ("sli %0.4h,%2.4h,%3" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsli_n_p16 (poly16x4_t __a, poly16x4_t __b, const int __c) +{ + return __builtin_aarch64_ssli_nv4hi_ppps (__a, __b, __c); +} -#define vsliq_n_p8(a, b, c) \ - __extension__ \ - ({ \ - poly8x16_t b_ = (b); \ - poly8x16_t a_ = (a); \ - poly8x16_t result; \ - __asm__ ("sli %0.16b,%2.16b,%3" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsliq_n_p8 (poly8x16_t __a, poly8x16_t __b, const int __c) +{ + return __builtin_aarch64_ssli_nv16qi_ppps (__a, __b, __c); +} -#define vsliq_n_p16(a, b, c) \ - __extension__ \ - ({ \ - poly16x8_t b_ = (b); \ - poly16x8_t a_ = (a); \ - poly16x8_t result; \ - __asm__ ("sli %0.8h,%2.8h,%3" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsliq_n_p16 (poly16x8_t __a, poly16x8_t __b, const int __c) +{ + return __builtin_aarch64_ssli_nv8hi_ppps (__a, __b, __c); +} #define vsri_n_p8(a, b, c) \ __extension__ \ diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 8a765ea..fe2c51c 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -203,6 +203,9 @@ (define_mode_iterator VALLDIF [V8QI V16QI V4HI V8HI V2SI V4SI V4BF V8BF V2DI V4HF V8HF V2SF V4SF V2DF DI DF]) +;; All Advanced SIMD polynomial modes and DI. +(define_mode_iterator VALLP [V8QI V16QI V4HI V8HI V2DI DI]) + ;; Advanced SIMD modes for Integer reduction across lanes. (define_mode_iterator VDQV [V8QI V16QI V4HI V8HI V4SI V2DI]) -- cgit v1.1 From 1d66367a71ef969235e10c77685f5ca4551bf519 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Wed, 10 Feb 2021 13:02:24 +0000 Subject: aarch64: Use RTL builtins for polynomial vsri[q]_n intrinsics Rewrite vsri[q]_n_p* Neon intrinsics to use RTL builtins rather than inline assembly code, allowing for better scheduling and optimization. gcc/ChangeLog: 2021-02-10 Jonathan Wright * config/aarch64/aarch64-simd-builtins.def: Add polynomial ssri_n buitin generator macro. * config/aarch64/arm_neon.h (vsri_n_p8): Use RTL builtin instead of inline asm. (vsri_n_p16): Likewise. 
(vsri_n_p64): Likewise. (vsriq_n_p8): Likewise. (vsriq_n_p16): Likewise. (vsriq_n_p64): Likewise. --- gcc/config/aarch64/aarch64-simd-builtins.def | 1 + gcc/config/aarch64/arm_neon.h | 118 ++++++++++----------------- 2 files changed, 42 insertions(+), 77 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index 5349791..86614e7 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -434,6 +434,7 @@ BUILTIN_VQN (USHIFT2IMM, uqrshrn2_n, 0, NONE) /* Implemented by aarch64_si_n. */ BUILTIN_VSDQ_I_DI (SHIFTINSERT, ssri_n, 0, NONE) + BUILTIN_VALLP (SHIFTINSERTP, ssri_n, 0, NONE) BUILTIN_VSDQ_I_DI (USHIFTACC, usri_n, 0, NONE) BUILTIN_VSDQ_I_DI (SHIFTINSERT, ssli_n, 0, NONE) BUILTIN_VALLP (SHIFTINSERTP, ssli_n, 0, NONE) diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 38a3a3f..3536052 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -9078,83 +9078,47 @@ vsliq_n_p16 (poly16x8_t __a, poly16x8_t __b, const int __c) return __builtin_aarch64_ssli_nv8hi_ppps (__a, __b, __c); } -#define vsri_n_p8(a, b, c) \ - __extension__ \ - ({ \ - poly8x8_t b_ = (b); \ - poly8x8_t a_ = (a); \ - poly8x8_t result; \ - __asm__ ("sri %0.8b,%2.8b,%3" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vsri_n_p16(a, b, c) \ - __extension__ \ - ({ \ - poly16x4_t b_ = (b); \ - poly16x4_t a_ = (a); \ - poly16x4_t result; \ - __asm__ ("sri %0.4h,%2.4h,%3" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vsri_n_p64(a, b, c) \ - __extension__ \ - ({ \ - poly64x1_t b_ = (b); \ - poly64x1_t a_ = (a); \ - poly64x1_t result; \ - __asm__ ("sri %d0,%d2,%3" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "i"(c) \ - : /* No clobbers. */); \ - result; \ - }) - -#define vsriq_n_p8(a, b, c) \ - __extension__ \ - ({ \ - poly8x16_t b_ = (b); \ - poly8x16_t a_ = (a); \ - poly8x16_t result; \ - __asm__ ("sri %0.16b,%2.16b,%3" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vsriq_n_p16(a, b, c) \ - __extension__ \ - ({ \ - poly16x8_t b_ = (b); \ - poly16x8_t a_ = (a); \ - poly16x8_t result; \ - __asm__ ("sri %0.8h,%2.8h,%3" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vsriq_n_p64(a, b, c) \ - __extension__ \ - ({ \ - poly64x2_t b_ = (b); \ - poly64x2_t a_ = (a); \ - poly64x2_t result; \ - __asm__ ("sri %0.2d,%2.2d,%3" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "i"(c) \ - : /* No clobbers. 
*/); \ - result; \ - }) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsri_n_p8 (poly8x8_t __a, poly8x8_t __b, const int __c) +{ + return __builtin_aarch64_ssri_nv8qi_ppps (__a, __b, __c); +} + +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsri_n_p16 (poly16x4_t __a, poly16x4_t __b, const int __c) +{ + return __builtin_aarch64_ssri_nv4hi_ppps (__a, __b, __c); +} + +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsri_n_p64 (poly64x1_t __a, poly64x1_t __b, const int __c) +{ + return (poly64x1_t) __builtin_aarch64_ssri_ndi_ppps (__a[0], __b[0], __c); +} + +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsriq_n_p8 (poly8x16_t __a, poly8x16_t __b, const int __c) +{ + return __builtin_aarch64_ssri_nv16qi_ppps (__a, __b, __c); +} + +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsriq_n_p16 (poly16x8_t __a, poly16x8_t __b, const int __c) +{ + return __builtin_aarch64_ssri_nv8hi_ppps (__a, __b, __c); +} + +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsriq_n_p64 (poly64x2_t __a, poly64x2_t __b, const int __c) +{ + return __builtin_aarch64_ssri_nv2di_ppps (__a, __b, __c); +} __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -- cgit v1.1 From ffb112289452f58fbf00a4e57c0d7de930aca6b1 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Fri, 12 Feb 2021 12:13:27 +0000 Subject: aarch64: Use RTL builtins for v[q]tbl intrinsics Rewrite v[q]tbl Neon intrinsics to use RTL builtins rather than inline assembly code, allowing for better scheduling and optimization. gcc/ChangeLog: 2021-02-12 Jonathan Wright * config/aarch64/aarch64-simd-builtins.def: Add tbl1 builtin generator macros. * config/aarch64/arm_neon.h (vqtbl1_p8): Use RTL builtin instead of inline asm. (vqtbl1_s8): Likewise. (vqtbl1_u8): Likewise. (vqtbl1q_p8): Likewise. (vqtbl1q_s8): Likewise. (vqtbl1q_u8): Likewise. (vtbl1_s8): Likewise. (vtbl1_u8): Likewise. (vtbl1_p8): Likewise. (vtbl2_s8): Likewise. (vtbl2_u8): Likewise. (vtbl2_p8): Likewise. --- gcc/config/aarch64/aarch64-simd-builtins.def | 4 + gcc/config/aarch64/arm_neon.h | 109 +++++++-------------------- 2 files changed, 32 insertions(+), 81 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index 86614e7..04b392b 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -696,6 +696,10 @@ VAR1 (BINOP, tbl3, 0, NONE, v8qi) VAR1 (BINOP, tbl3, 0, NONE, v16qi) + /* Implemented by aarch64_tbl1. */ + VAR2 (BINOP, tbl1, 0, NONE, v8qi, v16qi) + VAR2 (BINOPU, tbl1, 0, NONE, v8qi, v16qi) + /* Implemented by aarch64_qtbl3. 
*/ VAR1 (BINOP, qtbl3, 0, NONE, v8qi) VAR1 (BINOP, qtbl3, 0, NONE, v16qi) diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 3536052..0817129 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -9579,74 +9579,46 @@ vqrdmulhq_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __c) __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl1_p8 (poly8x16_t __a, uint8x8_t __b) +vqtbl1_p8 (poly8x16_t __tab, uint8x8_t __idx) { - poly8x8_t __result; - __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return (poly8x8_t) __builtin_aarch64_tbl1v8qi ((int8x16_t) __tab, + (int8x8_t) __idx); } __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl1_s8 (int8x16_t __a, uint8x8_t __b) +vqtbl1_s8 (int8x16_t __tab, uint8x8_t __idx) { - int8x8_t __result; - __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_tbl1v8qi (__tab, (int8x8_t) __idx); } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl1_u8 (uint8x16_t __a, uint8x8_t __b) +vqtbl1_u8 (uint8x16_t __tab, uint8x8_t __idx) { - uint8x8_t __result; - __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_tbl1v8qi_uuu (__tab, __idx); } __extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl1q_p8 (poly8x16_t __a, uint8x16_t __b) +vqtbl1q_p8 (poly8x16_t __tab, uint8x16_t __idx) { - poly8x16_t __result; - __asm__ ("tbl %0.16b, {%1.16b}, %2.16b" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return (poly8x16_t) __builtin_aarch64_tbl1v16qi ((int8x16_t) __tab, + (int8x16_t) __idx); } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl1q_s8 (int8x16_t __a, uint8x16_t __b) +vqtbl1q_s8 (int8x16_t __tab, uint8x16_t __idx) { - int8x16_t __result; - __asm__ ("tbl %0.16b, {%1.16b}, %2.16b" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_tbl1v16qi (__tab, (int8x16_t) __idx); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl1q_u8 (uint8x16_t __a, uint8x16_t __b) +vqtbl1q_u8 (uint8x16_t __tab, uint8x16_t __idx) { - uint8x16_t __result; - __asm__ ("tbl %0.16b, {%1.16b}, %2.16b" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_tbl1v16qi_uuu (__tab, __idx); } __extension__ extern __inline int8x8_t @@ -9727,78 +9699,53 @@ __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbl1_s8 (int8x8_t __tab, int8x8_t __idx) { - int8x8_t __result; - int8x16_t __temp = vcombine_s8 (__tab, vcreate_s8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(__result) - : "w"(__temp), "w"(__idx) - : /* No clobbers */); - return __result; + int8x16_t __temp = vcombine_s8 (__tab, + vcreate_s8 (__AARCH64_UINT64_C (0x0))); + return __builtin_aarch64_tbl1v8qi (__temp, __idx); } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) vtbl1_u8 (uint8x8_t __tab, uint8x8_t __idx) { - uint8x8_t __result; - uint8x16_t __temp = vcombine_u8 (__tab, vcreate_u8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(__result) - : "w"(__temp), "w"(__idx) - : /* No clobbers */); - return __result; + uint8x16_t __temp = vcombine_u8 (__tab, + vcreate_u8 (__AARCH64_UINT64_C (0x0))); + return __builtin_aarch64_tbl1v8qi_uuu (__temp, __idx); } __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbl1_p8 (poly8x8_t __tab, uint8x8_t __idx) { - poly8x8_t __result; - poly8x16_t __temp = vcombine_p8 (__tab, vcreate_p8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(__result) - : "w"(__temp), "w"(__idx) - : /* No clobbers */); - return __result; + poly8x16_t __temp = vcombine_p8 (__tab, + vcreate_p8 (__AARCH64_UINT64_C (0x0))); + return (poly8x8_t) __builtin_aarch64_tbl1v8qi ((int8x16_t) __temp, + (int8x8_t) __idx); } __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbl2_s8 (int8x8x2_t __tab, int8x8_t __idx) { - int8x8_t __result; int8x16_t __temp = vcombine_s8 (__tab.val[0], __tab.val[1]); - __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(__result) - : "w"(__temp), "w"(__idx) - : /* No clobbers */); - return __result; + return __builtin_aarch64_tbl1v8qi (__temp, __idx); } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbl2_u8 (uint8x8x2_t __tab, uint8x8_t __idx) { - uint8x8_t __result; uint8x16_t __temp = vcombine_u8 (__tab.val[0], __tab.val[1]); - __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(__result) - : "w"(__temp), "w"(__idx) - : /* No clobbers */); - return __result; + return __builtin_aarch64_tbl1v8qi_uuu (__temp, __idx); } __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbl2_p8 (poly8x8x2_t __tab, uint8x8_t __idx) { - poly8x8_t __result; poly8x16_t __temp = vcombine_p8 (__tab.val[0], __tab.val[1]); - __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(__result) - : "w"(__temp), "w"(__idx) - : /* No clobbers */); - return __result; + return (poly8x8_t) __builtin_aarch64_tbl1v8qi ((int8x16_t) __temp, + (int8x8_t) __idx); } __extension__ extern __inline int8x8_t -- cgit v1.1 From 4362c9c88d9092a6585cd061e5535cb2f0453d13 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Fri, 12 Feb 2021 15:37:05 +0000 Subject: aarch64: Use RTL builtins for v[q]tbx intrinsics Rewrite v[q]tbx Neon intrinsics to use RTL builtins rather than inline assembly code, allowing for better scheduling and optimization. gcc/ChangeLog: 2021-02-12 Jonathan Wright * config/aarch64/aarch64-simd-builtins.def: Add tbx1 builtin generator macros. * config/aarch64/aarch64-simd.md (aarch64_tbx1): Define. * config/aarch64/arm_neon.h (vqtbx1_s8): USE RTL builtin instead of inline asm. (vqtbx1_u8): Likewise. (vqtbx1_p8): Likewise. (vqtbx1q_s8): Likewise. (vqtbx1q_u8): Likewise. (vqtbx1q_p8): Likewise. (vtbx2_s8): Likewise. (vtbx2_u8): Likewise. (vtbx2_p8): Likewise. 
--- gcc/config/aarch64/aarch64-simd-builtins.def | 4 ++ gcc/config/aarch64/aarch64-simd.md | 11 +++++ gcc/config/aarch64/arm_neon.h | 69 ++++++---------------------- 3 files changed, 30 insertions(+), 54 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index 04b392b..a7d4f2b 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -708,6 +708,10 @@ VAR1 (BINOP, qtbl4, 0, NONE, v8qi) VAR1 (BINOP, qtbl4, 0, NONE, v16qi) + /* Implemented by aarch64_tbx1. */ + VAR2 (TERNOP, tbx1, 0, NONE, v8qi, v16qi) + VAR2 (TERNOPU, tbx1, 0, NONE, v8qi, v16qi) + /* Implemented by aarch64_tbx4. */ VAR1 (TERNOP, tbx4, 0, NONE, v8qi) VAR1 (TERNOP, tbx4, 0, NONE, v16qi) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 565ce5a..299d911 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -6852,6 +6852,17 @@ [(set_attr "type" "neon_tbl1")] ) +(define_insn "aarch64_tbx1" + [(set (match_operand:VB 0 "register_operand" "=w") + (unspec:VB [(match_operand:VB 1 "register_operand" "0") + (match_operand:V16QI 2 "register_operand" "w") + (match_operand:VB 3 "register_operand" "w")] + UNSPEC_TBX))] + "TARGET_SIMD" + "tbx\\t%0., {%2.16b}, %3." + [(set_attr "type" "neon_tbl1")] +) + ;; Two source registers. (define_insn "aarch64_tbl2v16qi" diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 0817129..ead2bd0 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -9625,72 +9625,46 @@ __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx1_s8 (int8x8_t __r, int8x16_t __tab, uint8x8_t __idx) { - int8x8_t __result = __r; - __asm__ ("tbx %0.8b,{%1.16b},%2.8b" - : "+w"(__result) - : "w"(__tab), "w"(__idx) - : /* No clobbers */); - return __result; + return __builtin_aarch64_tbx1v8qi (__r, __tab, (int8x8_t) __idx); } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx1_u8 (uint8x8_t __r, uint8x16_t __tab, uint8x8_t __idx) { - uint8x8_t __result = __r; - __asm__ ("tbx %0.8b,{%1.16b},%2.8b" - : "+w"(__result) - : "w"(__tab), "w"(__idx) - : /* No clobbers */); - return __result; + return __builtin_aarch64_tbx1v8qi_uuuu (__r, __tab, __idx); } __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx1_p8 (poly8x8_t __r, poly8x16_t __tab, uint8x8_t __idx) { - poly8x8_t __result = __r; - __asm__ ("tbx %0.8b,{%1.16b},%2.8b" - : "+w"(__result) - : "w"(__tab), "w"(__idx) - : /* No clobbers */); - return __result; + return (poly8x8_t) __builtin_aarch64_tbx1v8qi ((int8x8_t) __r, + (int8x16_t) __tab, + (int8x8_t) __idx); } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx1q_s8 (int8x16_t __r, int8x16_t __tab, uint8x16_t __idx) { - int8x16_t __result = __r; - __asm__ ("tbx %0.16b,{%1.16b},%2.16b" - : "+w"(__result) - : "w"(__tab), "w"(__idx) - : /* No clobbers */); - return __result; + return __builtin_aarch64_tbx1v16qi (__r, __tab, (int8x16_t) __idx); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx1q_u8 (uint8x16_t __r, uint8x16_t __tab, uint8x16_t __idx) { - uint8x16_t __result = __r; - __asm__ ("tbx %0.16b,{%1.16b},%2.16b" - : "+w"(__result) - : "w"(__tab), 
"w"(__idx) - : /* No clobbers */); - return __result; + return __builtin_aarch64_tbx1v16qi_uuuu (__r, __tab, __idx); } __extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx1q_p8 (poly8x16_t __r, poly8x16_t __tab, uint8x16_t __idx) { - poly8x16_t __result = __r; - __asm__ ("tbx %0.16b,{%1.16b},%2.16b" - : "+w"(__result) - : "w"(__tab), "w"(__idx) - : /* No clobbers */); - return __result; + return (poly8x16_t) __builtin_aarch64_tbx1v16qi ((int8x16_t) __r, + (int8x16_t) __tab, + (int8x16_t) __idx); } /* V7 legacy table intrinsics. */ @@ -9854,39 +9828,26 @@ __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbx2_s8 (int8x8_t __r, int8x8x2_t __tab, int8x8_t __idx) { - int8x8_t __result = __r; int8x16_t __temp = vcombine_s8 (__tab.val[0], __tab.val[1]); - __asm__ ("tbx %0.8b, {%1.16b}, %2.8b" - : "+w"(__result) - : "w"(__temp), "w"(__idx) - : /* No clobbers */); - return __result; + return __builtin_aarch64_tbx1v8qi (__r, __temp, __idx); } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbx2_u8 (uint8x8_t __r, uint8x8x2_t __tab, uint8x8_t __idx) { - uint8x8_t __result = __r; uint8x16_t __temp = vcombine_u8 (__tab.val[0], __tab.val[1]); - __asm__ ("tbx %0.8b, {%1.16b}, %2.8b" - : "+w"(__result) - : "w"(__temp), "w"(__idx) - : /* No clobbers */); - return __result; + return __builtin_aarch64_tbx1v8qi_uuuu (__r, __temp, __idx); } __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbx2_p8 (poly8x8_t __r, poly8x8x2_t __tab, uint8x8_t __idx) { - poly8x8_t __result = __r; poly8x16_t __temp = vcombine_p8 (__tab.val[0], __tab.val[1]); - __asm__ ("tbx %0.8b, {%1.16b}, %2.8b" - : "+w"(__result) - : "w"(__temp), "w"(__idx) - : /* No clobbers */); - return __result; + return (poly8x8_t) __builtin_aarch64_tbx1v8qi ((int8x8_t) __r, + (int8x16_t) __temp, + (int8x8_t) __idx); } /* End of temporary inline asm. */ -- cgit v1.1 From 8456a4cd96823704beec0b863010cd0dcc8dd591 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Thu, 18 Feb 2021 23:27:00 +0000 Subject: aarch64: Use RTL builtins for vcvtx intrinsics Rewrite vcvtx Neon intrinsics to use RTL builtins rather than inline assembly code, allowing for better scheduling and optimization. gcc/ChangeLog: 2021-02-18 Jonathan Wright * config/aarch64/aarch64-simd-builtins.def: Add float_trunc_rodd builtin generator macros. * config/aarch64/aarch64-simd.md (aarch64_float_trunc_rodd_df): Define. (aarch64_float_trunc_rodd_lo_v2sf): Define. (aarch64_float_trunc_rodd_hi_v4sf_le): Define. (aarch64_float_trunc_rodd_hi_v4sf_be): Define. (aarch64_float_trunc_rodd_hi_v4sf): Define. * config/aarch64/arm_neon.h (vcvtx_f32_f64): Use RTL builtin instead of inline asm. (vcvtx_high_f32_f64): Likewise. (vcvtxd_f32_f64): Likewise. * config/aarch64/iterators.md: Add FCVTXN unspec. 
--- gcc/config/aarch64/aarch64-simd-builtins.def | 4 +++ gcc/config/aarch64/aarch64-simd.md | 54 ++++++++++++++++++++++++++++ gcc/config/aarch64/arm_neon.h | 21 ++--------- gcc/config/aarch64/iterators.md | 1 + 4 files changed, 62 insertions(+), 18 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index a7d4f2b..f01a1b4 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -632,6 +632,10 @@ VAR1 (UNOP, float_extend_lo_, 0, FP, v4sf) BUILTIN_VDF (UNOP, float_truncate_lo_, 0, FP) + VAR1 (UNOP, float_trunc_rodd_, 0, FP, df) + VAR1 (UNOP, float_trunc_rodd_lo_, 0, FP, v2sf) + VAR1 (BINOP, float_trunc_rodd_hi_, 0, FP, v4sf) + /* Implemented by aarch64_ld1. */ BUILTIN_VALL_F16 (LOAD1, ld1, 0, LOAD) VAR1(STORE1P, ld1, 0, ALL, v2di) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 299d911..72f429c 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -2981,6 +2981,60 @@ ;; Float narrowing operations. +(define_insn "aarch64_float_trunc_rodd_df" + [(set (match_operand:SF 0 "register_operand" "=w") + (unspec:SF [(match_operand:DF 1 "register_operand" "w")] + UNSPEC_FCVTXN))] + "TARGET_SIMD" + "fcvtxn\\t%s0, %d1" + [(set_attr "type" "neon_fp_cvt_narrow_d_q")] +) + +(define_insn "aarch64_float_trunc_rodd_lo_v2sf" + [(set (match_operand:V2SF 0 "register_operand" "=w") + (unspec:V2SF [(match_operand:V2DF 1 "register_operand" "w")] + UNSPEC_FCVTXN))] + "TARGET_SIMD" + "fcvtxn\\t%0.2s, %1.2d" + [(set_attr "type" "neon_fp_cvt_narrow_d_q")] +) + +(define_insn "aarch64_float_trunc_rodd_hi_v4sf_le" + [(set (match_operand:V4SF 0 "register_operand" "=w") + (vec_concat:V4SF + (match_operand:V2SF 1 "register_operand" "0") + (unspec:V2SF [(match_operand:V2DF 2 "register_operand" "w")] + UNSPEC_FCVTXN)))] + "TARGET_SIMD && !BYTES_BIG_ENDIAN" + "fcvtxn2\\t%0.4s, %2.2d" + [(set_attr "type" "neon_fp_cvt_narrow_d_q")] +) + +(define_insn "aarch64_float_trunc_rodd_hi_v4sf_be" + [(set (match_operand:V4SF 0 "register_operand" "=w") + (vec_concat:V4SF + (unspec:V2SF [(match_operand:V2DF 2 "register_operand" "w")] + UNSPEC_FCVTXN) + (match_operand:V2SF 1 "register_operand" "0")))] + "TARGET_SIMD && BYTES_BIG_ENDIAN" + "fcvtxn2\\t%0.4s, %2.2d" + [(set_attr "type" "neon_fp_cvt_narrow_d_q")] +) + +(define_expand "aarch64_float_trunc_rodd_hi_v4sf" + [(match_operand:V4SF 0 "register_operand") + (match_operand:V2SF 1 "register_operand") + (match_operand:V2DF 2 "register_operand")] + "TARGET_SIMD" +{ + rtx (*gen) (rtx, rtx, rtx) = BYTES_BIG_ENDIAN + ? 
gen_aarch64_float_trunc_rodd_hi_v4sf_be + : gen_aarch64_float_trunc_rodd_hi_v4sf_le; + emit_insn (gen (operands[0], operands[1], operands[2])); + DONE; +} +) + (define_insn "aarch64_float_truncate_lo_" [(set (match_operand:VDF 0 "register_operand" "=w") (float_truncate:VDF diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index ead2bd0..4b8ec52 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -7014,36 +7014,21 @@ __extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvtx_f32_f64 (float64x2_t __a) { - float32x2_t __result; - __asm__ ("fcvtxn %0.2s,%1.2d" - : "=w"(__result) - : "w"(__a) - : /* No clobbers */); - return __result; + return __builtin_aarch64_float_trunc_rodd_lo_v2sf (__a); } __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvtx_high_f32_f64 (float32x2_t __a, float64x2_t __b) { - float32x4_t __result; - __asm__ ("fcvtxn2 %0.4s,%1.2d" - : "=w"(__result) - : "w" (__b), "0"(__a) - : /* No clobbers */); - return __result; + return __builtin_aarch64_float_trunc_rodd_hi_v4sf (__a, __b); } __extension__ extern __inline float32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvtxd_f32_f64 (float64_t __a) { - float32_t __result; - __asm__ ("fcvtxn %s0,%d1" - : "=w"(__result) - : "w"(__a) - : /* No clobbers */); - return __result; + return __builtin_aarch64_float_trunc_rodd_df (__a); } __extension__ extern __inline float32x2_t diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index fe2c51c..3d66e63 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -861,6 +861,7 @@ UNSPEC_BFCVTN ; Used in aarch64-simd.md. UNSPEC_BFCVTN2 ; Used in aarch64-simd.md. UNSPEC_BFCVT ; Used in aarch64-simd.md. + UNSPEC_FCVTXN ; Used in aarch64-simd.md. ]) ;; ------------------------------------------------------------------ -- cgit v1.1 From 67cf12a8171399a9e724a7eb6cc80908ed297eaa Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Thu, 18 Mar 2021 12:14:48 +0000 Subject: aarch64: Update attributes of arm_fp16.h intrinsics Update the attributes of all intrinsics defined in arm_fp16.h to be consistent with the attributes of the intrinsics defined in arm_neon.h. Specifically, this means updating the attributes from: __extension__ static __inline __attribute__ ((__always_inline__)) to: __extension__ extern __inline __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) gcc/ChangeLog: 2021-03-18 Jonathan Wright * config/aarch64/arm_fp16.h (__attribute__): Make intrinsic attributes consistent with those defined in arm_neon.h. --- gcc/config/aarch64/arm_fp16.h | 267 ++++++++++++++++++++++++++++-------------- 1 file changed, 178 insertions(+), 89 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/arm_fp16.h b/gcc/config/aarch64/arm_fp16.h index 2afbd12..2633953 100644 --- a/gcc/config/aarch64/arm_fp16.h +++ b/gcc/config/aarch64/arm_fp16.h @@ -36,325 +36,379 @@ typedef __fp16 float16_t; /* ARMv8.2-A FP16 one operand scalar intrinsics. 
*/ -__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabsh_f16 (float16_t __a) { return __builtin_aarch64_abshf (__a); } -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vceqzh_f16 (float16_t __a) { return __builtin_aarch64_cmeqhf_uss (__a, 0.0f); } -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcgezh_f16 (float16_t __a) { return __builtin_aarch64_cmgehf_uss (__a, 0.0f); } -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcgtzh_f16 (float16_t __a) { return __builtin_aarch64_cmgthf_uss (__a, 0.0f); } -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vclezh_f16 (float16_t __a) { return __builtin_aarch64_cmlehf_uss (__a, 0.0f); } -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcltzh_f16 (float16_t __a) { return __builtin_aarch64_cmlthf_uss (__a, 0.0f); } -__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvth_f16_s16 (int16_t __a) { return __builtin_aarch64_floathihf (__a); } -__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvth_f16_s32 (int32_t __a) { return __builtin_aarch64_floatsihf (__a); } -__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvth_f16_s64 (int64_t __a) { return __builtin_aarch64_floatdihf (__a); } -__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvth_f16_u16 (uint16_t __a) { return __builtin_aarch64_floatunshihf_us (__a); } -__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvth_f16_u32 (uint32_t __a) { return __builtin_aarch64_floatunssihf_us (__a); } -__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvth_f16_u64 (uint64_t __a) { return __builtin_aarch64_floatunsdihf_us (__a); } -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvth_s16_f16 (float16_t __a) { return __builtin_aarch64_fix_trunchfhi (__a); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32_t 
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvth_s32_f16 (float16_t __a) { return __builtin_aarch64_fix_trunchfsi (__a); } -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvth_s64_f16 (float16_t __a) { return __builtin_aarch64_fix_trunchfdi (__a); } -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvth_u16_f16 (float16_t __a) { return __builtin_aarch64_fixuns_trunchfhi_us (__a); } -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvth_u32_f16 (float16_t __a) { return __builtin_aarch64_fixuns_trunchfsi_us (__a); } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvth_u64_f16 (float16_t __a) { return __builtin_aarch64_fixuns_trunchfdi_us (__a); } -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvtah_s16_f16 (float16_t __a) { return __builtin_aarch64_lroundhfhi (__a); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvtah_s32_f16 (float16_t __a) { return __builtin_aarch64_lroundhfsi (__a); } -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvtah_s64_f16 (float16_t __a) { return __builtin_aarch64_lroundhfdi (__a); } -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvtah_u16_f16 (float16_t __a) { return __builtin_aarch64_lrounduhfhi_us (__a); } -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvtah_u32_f16 (float16_t __a) { return __builtin_aarch64_lrounduhfsi_us (__a); } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvtah_u64_f16 (float16_t __a) { return __builtin_aarch64_lrounduhfdi_us (__a); } -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvtmh_s16_f16 (float16_t __a) { return __builtin_aarch64_lfloorhfhi (__a); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvtmh_s32_f16 (float16_t __a) { return __builtin_aarch64_lfloorhfsi (__a); } -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvtmh_s64_f16 (float16_t __a) { return 
__builtin_aarch64_lfloorhfdi (__a); } -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvtmh_u16_f16 (float16_t __a) { return __builtin_aarch64_lflooruhfhi_us (__a); } -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvtmh_u32_f16 (float16_t __a) { return __builtin_aarch64_lflooruhfsi_us (__a); } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvtmh_u64_f16 (float16_t __a) { return __builtin_aarch64_lflooruhfdi_us (__a); } -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvtnh_s16_f16 (float16_t __a) { return __builtin_aarch64_lfrintnhfhi (__a); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvtnh_s32_f16 (float16_t __a) { return __builtin_aarch64_lfrintnhfsi (__a); } -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvtnh_s64_f16 (float16_t __a) { return __builtin_aarch64_lfrintnhfdi (__a); } -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvtnh_u16_f16 (float16_t __a) { return __builtin_aarch64_lfrintnuhfhi_us (__a); } -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvtnh_u32_f16 (float16_t __a) { return __builtin_aarch64_lfrintnuhfsi_us (__a); } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvtnh_u64_f16 (float16_t __a) { return __builtin_aarch64_lfrintnuhfdi_us (__a); } -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvtph_s16_f16 (float16_t __a) { return __builtin_aarch64_lceilhfhi (__a); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvtph_s32_f16 (float16_t __a) { return __builtin_aarch64_lceilhfsi (__a); } -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvtph_s64_f16 (float16_t __a) { return __builtin_aarch64_lceilhfdi (__a); } -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvtph_u16_f16 (float16_t __a) { return __builtin_aarch64_lceiluhfhi_us (__a); } -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) 
+__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvtph_u32_f16 (float16_t __a) { return __builtin_aarch64_lceiluhfsi_us (__a); } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvtph_u64_f16 (float16_t __a) { return __builtin_aarch64_lceiluhfdi_us (__a); } -__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vnegh_f16 (float16_t __a) { return __builtin_aarch64_neghf (__a); } -__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrecpeh_f16 (float16_t __a) { return __builtin_aarch64_frecpehf (__a); } -__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrecpxh_f16 (float16_t __a) { return __builtin_aarch64_frecpxhf (__a); } -__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrndh_f16 (float16_t __a) { return __builtin_aarch64_btrunchf (__a); } -__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrndah_f16 (float16_t __a) { return __builtin_aarch64_roundhf (__a); } -__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrndih_f16 (float16_t __a) { return __builtin_aarch64_nearbyinthf (__a); } -__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrndmh_f16 (float16_t __a) { return __builtin_aarch64_floorhf (__a); } -__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrndnh_f16 (float16_t __a) { return __builtin_aarch64_frintnhf (__a); } -__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrndph_f16 (float16_t __a) { return __builtin_aarch64_ceilhf (__a); } -__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrndxh_f16 (float16_t __a) { return __builtin_aarch64_rinthf (__a); } -__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrsqrteh_f16 (float16_t __a) { return __builtin_aarch64_rsqrtehf (__a); } -__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsqrth_f16 (float16_t __a) { return __builtin_aarch64_sqrthf 
(__a); @@ -362,199 +416,232 @@ vsqrth_f16 (float16_t __a) /* ARMv8.2-A FP16 two operands scalar intrinsics. */ -__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddh_f16 (float16_t __a, float16_t __b) { return __a + __b; } -__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabdh_f16 (float16_t __a, float16_t __b) { return __builtin_aarch64_fabdhf (__a, __b); } -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcageh_f16 (float16_t __a, float16_t __b) { return __builtin_aarch64_facgehf_uss (__a, __b); } -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcagth_f16 (float16_t __a, float16_t __b) { return __builtin_aarch64_facgthf_uss (__a, __b); } -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcaleh_f16 (float16_t __a, float16_t __b) { return __builtin_aarch64_faclehf_uss (__a, __b); } -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcalth_f16 (float16_t __a, float16_t __b) { return __builtin_aarch64_faclthf_uss (__a, __b); } -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vceqh_f16 (float16_t __a, float16_t __b) { return __builtin_aarch64_cmeqhf_uss (__a, __b); } -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcgeh_f16 (float16_t __a, float16_t __b) { return __builtin_aarch64_cmgehf_uss (__a, __b); } -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcgth_f16 (float16_t __a, float16_t __b) { return __builtin_aarch64_cmgthf_uss (__a, __b); } -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcleh_f16 (float16_t __a, float16_t __b) { return __builtin_aarch64_cmlehf_uss (__a, __b); } -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vclth_f16 (float16_t __a, float16_t __b) { return __builtin_aarch64_cmlthf_uss (__a, __b); } -__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvth_n_f16_s16 (int16_t __a, const int __b) { return __builtin_aarch64_scvtfhi (__a, __b); } -__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16_t 
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvth_n_f16_s32 (int32_t __a, const int __b) { return __builtin_aarch64_scvtfsihf (__a, __b); } -__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvth_n_f16_s64 (int64_t __a, const int __b) { return __builtin_aarch64_scvtfdihf (__a, __b); } -__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvth_n_f16_u16 (uint16_t __a, const int __b) { return __builtin_aarch64_ucvtfhi_sus (__a, __b); } -__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvth_n_f16_u32 (uint32_t __a, const int __b) { return __builtin_aarch64_ucvtfsihf_sus (__a, __b); } -__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvth_n_f16_u64 (uint64_t __a, const int __b) { return __builtin_aarch64_ucvtfdihf_sus (__a, __b); } -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvth_n_s16_f16 (float16_t __a, const int __b) { return __builtin_aarch64_fcvtzshf (__a, __b); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvth_n_s32_f16 (float16_t __a, const int __b) { return __builtin_aarch64_fcvtzshfsi (__a, __b); } -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvth_n_s64_f16 (float16_t __a, const int __b) { return __builtin_aarch64_fcvtzshfdi (__a, __b); } -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvth_n_u16_f16 (float16_t __a, const int __b) { return __builtin_aarch64_fcvtzuhf_uss (__a, __b); } -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvth_n_u32_f16 (float16_t __a, const int __b) { return __builtin_aarch64_fcvtzuhfsi_uss (__a, __b); } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvth_n_u64_f16 (float16_t __a, const int __b) { return __builtin_aarch64_fcvtzuhfdi_uss (__a, __b); } -__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdivh_f16 (float16_t __a, float16_t __b) { return __a / __b; } -__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmaxh_f16 (float16_t __a, float16_t __b) { return __builtin_aarch64_fmaxhf (__a, __b); } 
-__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmaxnmh_f16 (float16_t __a, float16_t __b) { return __builtin_aarch64_fmaxhf (__a, __b); } -__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vminh_f16 (float16_t __a, float16_t __b) { return __builtin_aarch64_fminhf (__a, __b); } -__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vminnmh_f16 (float16_t __a, float16_t __b) { return __builtin_aarch64_fminhf (__a, __b); } -__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmulh_f16 (float16_t __a, float16_t __b) { return __a * __b; } -__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmulxh_f16 (float16_t __a, float16_t __b) { return __builtin_aarch64_fmulxhf (__a, __b); } -__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrecpsh_f16 (float16_t __a, float16_t __b) { return __builtin_aarch64_frecpshf (__a, __b); } -__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrsqrtsh_f16 (float16_t __a, float16_t __b) { return __builtin_aarch64_rsqrtshf (__a, __b); } -__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vsubh_f16 (float16_t __a, float16_t __b) { return __a - __b; @@ -562,13 +649,15 @@ vsubh_f16 (float16_t __a, float16_t __b) /* ARMv8.2-A FP16 three operands scalar intrinsics. */ -__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vfmah_f16 (float16_t __a, float16_t __b, float16_t __c) { return __builtin_aarch64_fmahf (__b, __c, __a); } -__extension__ static __inline float16_t __attribute__ ((__always_inline__)) +__extension__ extern __inline float16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vfmsh_f16 (float16_t __a, float16_t __b, float16_t __c) { return __builtin_aarch64_fnmahf (__b, __c, __a); -- cgit v1.1 From 6a82f012c4a1c12468b154c59b817dc2d4d044c5 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Thu, 18 Mar 2021 16:23:50 +0000 Subject: aarch64: Update attributes of arm_acle.h intrinsics Update the attributes of all intrinsics defined in arm_acle.h to be consistent with the attributes of the intrinsics defined in arm_neon.h. 
Specifically, this means updating the attributes from: __extension__ static __inline __attribute__ ((__always_inline__)) to: __extension__ extern __inline __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) gcc/ChangeLog: 2021-03-18 Jonathan Wright * config/aarch64/arm_acle.h (__attribute__): Make intrinsic attributes consistent with those defined in arm_neon.h. --- gcc/config/aarch64/arm_acle.h | 69 ++++++++++++++++++++++++++++--------------- 1 file changed, 46 insertions(+), 23 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h index 73b29f4..13f2363 100644 --- a/gcc/config/aarch64/arm_acle.h +++ b/gcc/config/aarch64/arm_acle.h @@ -35,7 +35,8 @@ extern "C" { #pragma GCC push_options #pragma GCC target ("arch=armv8.3-a") -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __jcvt (double __a) { return __builtin_aarch64_jcvtzs (__a); @@ -45,49 +46,57 @@ __jcvt (double __a) #pragma GCC push_options #pragma GCC target ("arch=armv8.5-a") -__extension__ static __inline float __attribute__ ((__always_inline__)) +__extension__ extern __inline float +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __rint32zf (float __a) { return __builtin_aarch64_frint32zsf (__a); } -__extension__ static __inline double __attribute__ ((__always_inline__)) +__extension__ extern __inline double +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __rint32z (double __a) { return __builtin_aarch64_frint32zdf (__a); } -__extension__ static __inline float __attribute__ ((__always_inline__)) +__extension__ extern __inline float +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __rint64zf (float __a) { return __builtin_aarch64_frint64zsf (__a); } -__extension__ static __inline double __attribute__ ((__always_inline__)) +__extension__ extern __inline double +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __rint64z (double __a) { return __builtin_aarch64_frint64zdf (__a); } -__extension__ static __inline float __attribute__ ((__always_inline__)) +__extension__ extern __inline float +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __rint32xf (float __a) { return __builtin_aarch64_frint32xsf (__a); } -__extension__ static __inline double __attribute__ ((__always_inline__)) +__extension__ extern __inline double +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __rint32x (double __a) { return __builtin_aarch64_frint32xdf (__a); } -__extension__ static __inline float __attribute__ ((__always_inline__)) +__extension__ extern __inline float +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __rint64xf (float __a) { return __builtin_aarch64_frint64xsf (__a); } -__extension__ static __inline double __attribute__ ((__always_inline__)) +__extension__ extern __inline double +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __rint64x (double __a) { return __builtin_aarch64_frint64xdf (__a); @@ -100,49 +109,57 @@ __rint64x (double __a) #pragma GCC target ("+nothing+crc") -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __crc32b (uint32_t __a, uint8_t __b) { return __builtin_aarch64_crc32b (__a, __b); } -__extension__ static __inline uint32_t 
__attribute__ ((__always_inline__)) +__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __crc32cb (uint32_t __a, uint8_t __b) { return __builtin_aarch64_crc32cb (__a, __b); } -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __crc32ch (uint32_t __a, uint16_t __b) { return __builtin_aarch64_crc32ch (__a, __b); } -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __crc32cw (uint32_t __a, uint32_t __b) { return __builtin_aarch64_crc32cw (__a, __b); } -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __crc32cd (uint32_t __a, uint64_t __b) { return __builtin_aarch64_crc32cx (__a, __b); } -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __crc32h (uint32_t __a, uint16_t __b) { return __builtin_aarch64_crc32h (__a, __b); } -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __crc32w (uint32_t __a, uint32_t __b) { return __builtin_aarch64_crc32w (__a, __b); } -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __crc32d (uint32_t __a, uint64_t __b) { return __builtin_aarch64_crc32x (__a, __b); @@ -166,25 +183,29 @@ __crc32d (uint32_t __a, uint64_t __b) #define _TMFAILURE_INT 0x00800000u #define _TMFAILURE_TRIVIAL 0x01000000u -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __tstart (void) { return __builtin_aarch64_tstart (); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __tcommit (void) { __builtin_aarch64_tcommit (); } -__extension__ static __inline void __attribute__ ((__always_inline__)) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __tcancel (const uint64_t __reason) { __builtin_aarch64_tcancel (__reason); } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __ttest (void) { return __builtin_aarch64_ttest (); @@ -195,13 +216,15 @@ __ttest (void) #pragma GCC push_options #pragma GCC target ("+nothing+rng") -__extension__ static __inline int __attribute__ ((__always_inline__)) +__extension__ extern __inline int +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __rndr (uint64_t *__res) { return __builtin_aarch64_rndr (__res); } -__extension__ static __inline int __attribute__ ((__always_inline__)) +__extension__ extern __inline int +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __rndrrs (uint64_t *__res) { return __builtin_aarch64_rndrrs (__res); -- cgit 
v1.1 From c99f3747131377956e3bd8e393911c959ef5ff34 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Thu, 22 Apr 2021 15:04:19 +0100 Subject: aarch64: Remove unspecs from [su]qmovn RTL pattern Saturating truncation can be expressed using the RTL expressions ss_truncate and us_truncate. This patch changes the implementation of the vqmovn_* intrinsics to use these RTL expressions rather than a pair of unspecs. The redundant unspecs are removed along with their code iterator. gcc/ChangeLog: 2021-04-12 Jonathan Wright * config/aarch64/aarch64-simd-builtins.def: Modify comment to make consistent with updated RTL pattern. * config/aarch64/aarch64-simd.md (aarch64_qmovn): Implement using ss_truncate and us_truncate rather than unspecs. * config/aarch64/iterators.md: Remove redundant unspecs and iterator: UNSPEC_[SU]QXTN and SUQMOVN respectively. --- gcc/config/aarch64/aarch64-simd-builtins.def | 2 +- gcc/config/aarch64/aarch64-simd.md | 8 ++++---- gcc/config/aarch64/iterators.md | 5 ----- 3 files changed, 5 insertions(+), 10 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index f01a1b4..337ec8d 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -266,7 +266,7 @@ /* Implemented by aarch64_sqxtun2. */ BUILTIN_VQN (BINOP_UUS, sqxtun2, 0, NONE) - /* Implemented by aarch64_qmovn. */ + /* Implemented by aarch64_qmovn. */ BUILTIN_VSQN_HSDI (UNOP, sqmovn, 0, NONE) BUILTIN_VSQN_HSDI (UNOP, uqmovn, 0, NONE) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 72f429c..fbfed33 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -4591,12 +4591,12 @@ ;; sqmovn and uqmovn -(define_insn "aarch64_qmovn" +(define_insn "aarch64_qmovn" [(set (match_operand: 0 "register_operand" "=w") - (unspec: [(match_operand:VSQN_HSDI 1 "register_operand" "w")] - SUQMOVN))] + (SAT_TRUNC: + (match_operand:VSQN_HSDI 1 "register_operand" "w")))] "TARGET_SIMD" - "qxtn\\t%0, %1" + "qxtn\\t%0, %1" [(set_attr "type" "neon_sat_shift_imm_narrow_q")] ) diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 3d66e63..634c44e 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -523,8 +523,6 @@ UNSPEC_SUQADD ; Used in aarch64-simd.md. UNSPEC_SQXTUN ; Used in aarch64-simd.md. UNSPEC_SQXTUN2 ; Used in aarch64-simd.md. - UNSPEC_SQXTN ; Used in aarch64-simd.md. - UNSPEC_UQXTN ; Used in aarch64-simd.md. UNSPEC_SSRA ; Used in aarch64-simd.md. UNSPEC_USRA ; Used in aarch64-simd.md. UNSPEC_SRSRA ; Used in aarch64-simd.md. 
@@ -2258,8 +2256,6 @@ (define_int_iterator USSUQADD [UNSPEC_SUQADD UNSPEC_USQADD]) -(define_int_iterator SUQMOVN [UNSPEC_SQXTN UNSPEC_UQXTN]) - (define_int_iterator VSHL [UNSPEC_SSHL UNSPEC_USHL UNSPEC_SRSHL UNSPEC_URSHL]) @@ -2998,7 +2994,6 @@ (UNSPEC_SUBHN "") (UNSPEC_RSUBHN "r") (UNSPEC_ADDHN2 "") (UNSPEC_RADDHN2 "r") (UNSPEC_SUBHN2 "") (UNSPEC_RSUBHN2 "r") - (UNSPEC_SQXTN "s") (UNSPEC_UQXTN "u") (UNSPEC_USQADD "us") (UNSPEC_SUQADD "su") (UNSPEC_SSLI "s") (UNSPEC_USLI "u") (UNSPEC_SSRI "s") (UNSPEC_USRI "u") -- cgit v1.1 From 1bb3e2c0ce6ed363c72caf814a6ba6d7b17c3e0a Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Thu, 29 Apr 2021 11:34:50 +0200 Subject: aarch64: Fix ICE in aarch64_add_offset_1_temporaries [PR100302] In PR94121 I've changed aarch64_add_offset_1 to use absu_hwi instead of abs_hwi because offset can be HOST_WIDE_INT_MIN. As can be seen with the testcase below, aarch64_add_offset_1_temporaries suffers from the same problem and should be in sync with aarch64_add_offset_1, i.e. for HOST_WIDE_INT_MIN it needs a temporary. 2021-04-29 Jakub Jelinek PR target/100302 * config/aarch64/aarch64.c (aarch64_add_offset_1_temporaries): Use absu_hwi instead of abs_hwi. * gcc.target/aarch64/sve/pr100302.c: New test. --- gcc/config/aarch64/aarch64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index a863af1..c2f4b27 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -4736,7 +4736,7 @@ aarch64_mov128_immediate (rtx imm) static unsigned int aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset) { - return abs_hwi (offset) < 0x1000000 ? 0 : 1; + return absu_hwi (offset) < 0x1000000 ? 0 : 1; } /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for -- cgit v1.1 From 985b3a6837dee7001e6b618f073ed74f0edf5787 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Mon, 10 Jun 2019 09:57:15 -0700 Subject: Generate offset adjusted operation for op_by_pieces operations Add an overlap_op_by_pieces_p target hook for op_by_pieces operations between two areas of memory to generate one offset adjusted operation in the smallest integer mode for the remaining bytes on the last piece operation of a memory region to avoid doing more than one smaller operations. Pass the RTL information from the previous iteration to m_constfn in op_by_pieces operation so that builtin_memset_[read|gen]_str can generate the new RTL from the previous RTL. Tested on Linux/x86-64. gcc/ PR middle-end/90773 * builtins.c (builtin_memcpy_read_str): Add a dummy argument. (builtin_strncpy_read_str): Likewise. (builtin_memset_read_str): Add an argument for the previous RTL information and generate the new RTL from the previous RTL info. (builtin_memset_gen_str): Likewise. * builtins.h (builtin_strncpy_read_str): Update the prototype. (builtin_memset_read_str): Likewise. * expr.c (by_pieces_ninsns): If targetm.overlap_op_by_pieces_p() returns true, round up size and alignment to the widest integer mode for maximum size. (pieces_addr::adjust): Add a pointer to by_pieces_prev argument and pass it to m_constfn. (op_by_pieces_d): Add m_push and m_overlap_op_by_pieces. (op_by_pieces_d::op_by_pieces_d): Add a bool argument to initialize m_push. Initialize m_overlap_op_by_pieces with targetm.overlap_op_by_pieces_p (). (op_by_pieces_d::run): Pass the previous RTL information to pieces_addr::adjust and generate overlapping operations if m_overlap_op_by_pieces is true. (PUSHG_P): New. 
(move_by_pieces_d::move_by_pieces_d): Updated for op_by_pieces_d change. (store_by_pieces_d::store_by_pieces_d): Updated for op_by_pieces_d change. (can_store_by_pieces): Use by_pieces_constfn on constfun. (store_by_pieces): Use by_pieces_constfn on constfun. Updated for op_by_pieces_d change. (clear_by_pieces_1): Add a dummy argument. (clear_by_pieces): Updated for op_by_pieces_d change. (compare_by_pieces_d::compare_by_pieces_d): Likewise. (string_cst_read_str): Add a dummy argument. * expr.h (by_pieces_constfn): Add a dummy argument. (by_pieces_prev): New. * target.def (overlap_op_by_pieces_p): New target hook. * config/i386/i386.c (TARGET_OVERLAP_OP_BY_PIECES_P): New. * doc/tm.texi.in: Add TARGET_OVERLAP_OP_BY_PIECES_P. * doc/tm.texi: Regenerated. gcc/testsuite/ PR middle-end/90773 * g++.dg/pr90773-1.h: New test. * g++.dg/pr90773-1a.C: Likewise. * g++.dg/pr90773-1b.C: Likewise. * g++.dg/pr90773-1c.C: Likewise. * g++.dg/pr90773-1d.C: Likewise. * gcc.target/i386/pr90773-1.c: Likewise. * gcc.target/i386/pr90773-2.c: Likewise. * gcc.target/i386/pr90773-3.c: Likewise. * gcc.target/i386/pr90773-4.c: Likewise. * gcc.target/i386/pr90773-5.c: Likewise. * gcc.target/i386/pr90773-6.c: Likewise. * gcc.target/i386/pr90773-7.c: Likewise. * gcc.target/i386/pr90773-8.c: Likewise. * gcc.target/i386/pr90773-9.c: Likewise. * gcc.target/i386/pr90773-10.c: Likewise. * gcc.target/i386/pr90773-11.c: Likewise. * gcc.target/i386/pr90773-12.c: Likewise. * gcc.target/i386/pr90773-13.c: Likewise. * gcc.target/i386/pr90773-14.c: Likewise. --- gcc/config/i386/i386.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index adcef1e..68f33f9 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -23538,6 +23538,9 @@ ix86_run_selftests (void) #undef TARGET_ADDRESS_COST #define TARGET_ADDRESS_COST ix86_address_cost +#undef TARGET_OVERLAP_OP_BY_PIECES_P +#define TARGET_OVERLAP_OP_BY_PIECES_P hook_bool_void_true + #undef TARGET_FLAGS_REGNUM #define TARGET_FLAGS_REGNUM FLAGS_REG #undef TARGET_FIXED_CONDITION_CODE_REGS -- cgit v1.1 From 7d6f7aa409ebe37ea9eac25cc131f4a8f03acfa3 Mon Sep 17 00:00:00 2001 From: Eric Botcazou Date: Thu, 29 Apr 2021 13:34:27 +0200 Subject: Small housekeeping work in SPARC back-end gcc/ * config/sparc/sparc.c (gen_load_pcrel_sym): Delete. (load_got_register): Do the PIC dance here. (sparc_legitimize_tls_address): Simplify. (sparc_emit_probe_stack_range): Likewise. (sparc32_initialize_trampoline): Likewise. (sparc64_initialize_trampoline): Likewise. * config/sparc/sparc.md (load_pcrel_sym): Add @ marker. (probe_stack_range): Likewise. (flush): Likewise. (tgd_hi22): Likewise. (tgd_lo10): Likewise. (tgd_add): Likewise. (tgd_call): Likewise. (tldm_hi22): Likewise. (tldm_lo10): Likewise. (tldm_add): Likewise. (tldm_call): Likewise. (tldo_hix22): Likewise. (tldo_lox10): Likewise. (tldo_add): Likewise. (tie_hi22): Likewise. (tie_lo10): Likewise. (tie_add): Likewise. (tle_hix22): Likewise. (tle_lox10): Likewise. (stack_protect_setsi): Rename to... (stack_protect_set32): ...this. (stack_protect_setdi): Rename to... (stack_protect_set64): ...this. (stack_protect_set): Adjust calls to above. (stack_protect_testsi): Rename to... (stack_protect_test32): ...this. (stack_protect_testdi): Rename to... (stack_protect_test64): ...this. (stack_protect_test): Adjust calls to above. 
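The cleanup above leans on GCC's parameterized pattern names: prefixing a
define_insn name with "@" (as done below for "@flush", "@tgd_hi22" and
friends) makes genemit produce a generator that takes the machine mode as an
explicit first argument.  A caller that already knows Pmode can then write a
single call instead of an ARCH32/ARCH64 branch over the gen_*si/gen_*di
variants, for example (taken from the sparc.c hunks below):

  emit_insn (gen_tgd_hi22 (Pmode, temp1, addr));

which replaces the old

  if (TARGET_ARCH32)
    emit_insn (gen_tgd_hi22si (temp1, addr));
  else
    emit_insn (gen_tgd_hi22di (temp1, addr));
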
--- gcc/config/sparc/sparc.c | 143 ++++++++++++++-------------------------------- gcc/config/sparc/sparc.md | 54 ++++++++--------- 2 files changed, 70 insertions(+), 127 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/sparc/sparc.c b/gcc/config/sparc/sparc.c index 42ba415..3b4d416 100644 --- a/gcc/config/sparc/sparc.c +++ b/gcc/config/sparc/sparc.c @@ -4213,26 +4213,7 @@ sparc_got (void) return got_symbol_rtx; } -/* Wrapper around the load_pcrel_sym{si,di} patterns. */ - -static rtx -gen_load_pcrel_sym (rtx op0, rtx op1, rtx op2) -{ - int orig_flag_pic = flag_pic; - rtx insn; - - /* The load_pcrel_sym{si,di} patterns require absolute addressing. */ - flag_pic = 0; - if (TARGET_ARCH64) - insn = gen_load_pcrel_symdi (op0, op1, op2, GEN_INT (REGNO (op0))); - else - insn = gen_load_pcrel_symsi (op0, op1, op2, GEN_INT (REGNO (op0))); - flag_pic = orig_flag_pic; - - return insn; -} - -/* Output the load_pcrel_sym{si,di} patterns. */ +/* Output the load_pcrel_sym pattern. */ const char * output_load_pcrel_sym (rtx *operands) @@ -4299,8 +4280,15 @@ load_got_register (void) got_helper_rtx = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name)); } - insn - = gen_load_pcrel_sym (got_register_rtx, sparc_got (), got_helper_rtx); + /* The load_pcrel_sym{si,di} patterns require absolute addressing. */ + const int orig_flag_pic = flag_pic; + flag_pic = 0; + insn = gen_load_pcrel_sym (Pmode, + got_register_rtx, + sparc_got (), + got_helper_rtx, + GEN_INT (GLOBAL_OFFSET_TABLE_REGNUM)); + flag_pic = orig_flag_pic; } emit_insn (insn); @@ -4680,22 +4668,11 @@ sparc_legitimize_tls_address (rtx addr) ret = gen_reg_rtx (Pmode); o0 = gen_rtx_REG (Pmode, 8); got = sparc_tls_got (); - if (TARGET_ARCH32) - { - emit_insn (gen_tgd_hi22si (temp1, addr)); - emit_insn (gen_tgd_lo10si (temp2, temp1, addr)); - emit_insn (gen_tgd_addsi (o0, got, temp2, addr)); - insn = emit_call_insn (gen_tgd_callsi (o0, sparc_tls_get_addr (), - addr, const1_rtx)); - } - else - { - emit_insn (gen_tgd_hi22di (temp1, addr)); - emit_insn (gen_tgd_lo10di (temp2, temp1, addr)); - emit_insn (gen_tgd_adddi (o0, got, temp2, addr)); - insn = emit_call_insn (gen_tgd_calldi (o0, sparc_tls_get_addr (), - addr, const1_rtx)); - } + emit_insn (gen_tgd_hi22 (Pmode, temp1, addr)); + emit_insn (gen_tgd_lo10 (Pmode, temp2, temp1, addr)); + emit_insn (gen_tgd_add (Pmode, o0, got, temp2, addr)); + insn = emit_call_insn (gen_tgd_call (Pmode, o0, sparc_tls_get_addr (), + addr, const1_rtx)); use_reg (&CALL_INSN_FUNCTION_USAGE (insn), o0); RTL_CONST_CALL_P (insn) = 1; insn = get_insns (); @@ -4711,22 +4688,11 @@ sparc_legitimize_tls_address (rtx addr) ret = gen_reg_rtx (Pmode); o0 = gen_rtx_REG (Pmode, 8); got = sparc_tls_got (); - if (TARGET_ARCH32) - { - emit_insn (gen_tldm_hi22si (temp1)); - emit_insn (gen_tldm_lo10si (temp2, temp1)); - emit_insn (gen_tldm_addsi (o0, got, temp2)); - insn = emit_call_insn (gen_tldm_callsi (o0, sparc_tls_get_addr (), - const1_rtx)); - } - else - { - emit_insn (gen_tldm_hi22di (temp1)); - emit_insn (gen_tldm_lo10di (temp2, temp1)); - emit_insn (gen_tldm_adddi (o0, got, temp2)); - insn = emit_call_insn (gen_tldm_calldi (o0, sparc_tls_get_addr (), - const1_rtx)); - } + emit_insn (gen_tldm_hi22 (Pmode, temp1)); + emit_insn (gen_tldm_lo10 (Pmode, temp2, temp1)); + emit_insn (gen_tldm_add (Pmode, o0, got, temp2)); + insn = emit_call_insn (gen_tldm_call (Pmode, o0, sparc_tls_get_addr (), + const1_rtx)); use_reg (&CALL_INSN_FUNCTION_USAGE (insn), o0); RTL_CONST_CALL_P (insn) = 1; insn = get_insns (); @@ -4738,18 +4704,9 @@ 
sparc_legitimize_tls_address (rtx addr) UNSPEC_TLSLD_BASE)); temp1 = gen_reg_rtx (Pmode); temp2 = gen_reg_rtx (Pmode); - if (TARGET_ARCH32) - { - emit_insn (gen_tldo_hix22si (temp1, addr)); - emit_insn (gen_tldo_lox10si (temp2, temp1, addr)); - emit_insn (gen_tldo_addsi (ret, temp3, temp2, addr)); - } - else - { - emit_insn (gen_tldo_hix22di (temp1, addr)); - emit_insn (gen_tldo_lox10di (temp2, temp1, addr)); - emit_insn (gen_tldo_adddi (ret, temp3, temp2, addr)); - } + emit_insn (gen_tldo_hix22 (Pmode, temp1, addr)); + emit_insn (gen_tldo_lox10 (Pmode, temp2, temp1, addr)); + emit_insn (gen_tldo_add (Pmode, ret, temp3, temp2, addr)); break; case TLS_MODEL_INITIAL_EXEC: @@ -4757,27 +4714,17 @@ sparc_legitimize_tls_address (rtx addr) temp2 = gen_reg_rtx (Pmode); temp3 = gen_reg_rtx (Pmode); got = sparc_tls_got (); + emit_insn (gen_tie_hi22 (Pmode, temp1, addr)); + emit_insn (gen_tie_lo10 (Pmode, temp2, temp1, addr)); if (TARGET_ARCH32) - { - emit_insn (gen_tie_hi22si (temp1, addr)); - emit_insn (gen_tie_lo10si (temp2, temp1, addr)); - emit_insn (gen_tie_ld32 (temp3, got, temp2, addr)); - } + emit_insn (gen_tie_ld32 (temp3, got, temp2, addr)); else - { - emit_insn (gen_tie_hi22di (temp1, addr)); - emit_insn (gen_tie_lo10di (temp2, temp1, addr)); - emit_insn (gen_tie_ld64 (temp3, got, temp2, addr)); - } + emit_insn (gen_tie_ld64 (temp3, got, temp2, addr)); if (TARGET_SUN_TLS) { ret = gen_reg_rtx (Pmode); - if (TARGET_ARCH32) - emit_insn (gen_tie_addsi (ret, gen_rtx_REG (Pmode, 7), - temp3, addr)); - else - emit_insn (gen_tie_adddi (ret, gen_rtx_REG (Pmode, 7), - temp3, addr)); + emit_insn (gen_tie_add (Pmode, ret, gen_rtx_REG (Pmode, 7), + temp3, addr)); } else ret = gen_rtx_PLUS (Pmode, gen_rtx_REG (Pmode, 7), temp3); @@ -4786,16 +4733,8 @@ sparc_legitimize_tls_address (rtx addr) case TLS_MODEL_LOCAL_EXEC: temp1 = gen_reg_rtx (Pmode); temp2 = gen_reg_rtx (Pmode); - if (TARGET_ARCH32) - { - emit_insn (gen_tle_hix22si (temp1, addr)); - emit_insn (gen_tle_lox10si (temp2, temp1, addr)); - } - else - { - emit_insn (gen_tle_hix22di (temp1, addr)); - emit_insn (gen_tle_lox10di (temp2, temp1, addr)); - } + emit_insn (gen_tle_hix22 (Pmode, temp1, addr)); + emit_insn (gen_tle_lox10 (Pmode, temp2, temp1, addr)); ret = gen_rtx_PLUS (Pmode, gen_rtx_REG (Pmode, 7), temp2); break; @@ -5696,10 +5635,7 @@ sparc_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size) probes at FIRST + N * PROBE_INTERVAL for values of N from 1 until it is equal to ROUNDED_SIZE. */ - if (TARGET_ARCH64) - emit_insn (gen_probe_stack_rangedi (g1, g1, g4)); - else - emit_insn (gen_probe_stack_rangesi (g1, g1, g4)); + emit_insn (gen_probe_stack_range (Pmode, g1, g1, g4)); /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time @@ -9940,9 +9876,11 @@ sparc32_initialize_trampoline (rtx m_tramp, rtx fnaddr, rtx cxt) GEN_INT (trunc_int_for_mode (0x8410a000, SImode)), NULL_RTX, 1, OPTAB_DIRECT)); + emit_insn + (gen_flush (SImode, validize_mem (adjust_address (m_tramp, SImode, 0)))); + /* On UltraSPARC a flush flushes an entire cache line. The trampoline is aligned on a 16 byte boundary so one flush clears it all. 
*/ - emit_insn (gen_flushsi (validize_mem (adjust_address (m_tramp, SImode, 0)))); if (sparc_cpu != PROCESSOR_ULTRASPARC && sparc_cpu != PROCESSOR_ULTRASPARC3 && sparc_cpu != PROCESSOR_NIAGARA @@ -9951,7 +9889,8 @@ sparc32_initialize_trampoline (rtx m_tramp, rtx fnaddr, rtx cxt) && sparc_cpu != PROCESSOR_NIAGARA4 && sparc_cpu != PROCESSOR_NIAGARA7 && sparc_cpu != PROCESSOR_M8) - emit_insn (gen_flushsi (validize_mem (adjust_address (m_tramp, SImode, 8)))); + emit_insn + (gen_flush (SImode, validize_mem (adjust_address (m_tramp, SImode, 8)))); /* Call __enable_execute_stack after writing onto the stack to make sure the stack address is accessible. */ @@ -9988,8 +9927,11 @@ sparc64_initialize_trampoline (rtx m_tramp, rtx fnaddr, rtx cxt) GEN_INT (trunc_int_for_mode (0xca586010, SImode))); emit_move_insn (adjust_address (m_tramp, DImode, 16), cxt); emit_move_insn (adjust_address (m_tramp, DImode, 24), fnaddr); - emit_insn (gen_flushdi (validize_mem (adjust_address (m_tramp, DImode, 0)))); + emit_insn + (gen_flush (DImode, validize_mem (adjust_address (m_tramp, DImode, 0)))); + /* On UltraSPARC a flush flushes an entire cache line. The trampoline is + aligned on a 16 byte boundary so one flush clears it all. */ if (sparc_cpu != PROCESSOR_ULTRASPARC && sparc_cpu != PROCESSOR_ULTRASPARC3 && sparc_cpu != PROCESSOR_NIAGARA @@ -9998,7 +9940,8 @@ sparc64_initialize_trampoline (rtx m_tramp, rtx fnaddr, rtx cxt) && sparc_cpu != PROCESSOR_NIAGARA4 && sparc_cpu != PROCESSOR_NIAGARA7 && sparc_cpu != PROCESSOR_M8) - emit_insn (gen_flushdi (validize_mem (adjust_address (m_tramp, DImode, 8)))); + emit_insn + (gen_flush (DImode, validize_mem (adjust_address (m_tramp, DImode, 8)))); /* Call __enable_execute_stack after writing onto the stack to make sure the stack address is accessible. */ diff --git a/gcc/config/sparc/sparc.md b/gcc/config/sparc/sparc.md index c5d3696..a8d9962 100644 --- a/gcc/config/sparc/sparc.md +++ b/gcc/config/sparc/sparc.md @@ -1592,7 +1592,7 @@ ;; because the RDPC instruction is extremely expensive and incurs a complete ;; instruction pipeline flush. -(define_insn "load_pcrel_sym" +(define_insn "@load_pcrel_sym" [(set (match_operand:P 0 "register_operand" "=r") (unspec:P [(match_operand:P 1 "symbolic_operand" "") (match_operand:P 2 "call_address_operand" "") @@ -7290,7 +7290,7 @@ visl") = adjust_address (operands[0], GET_MODE (operands[0]), SPARC_STACK_BIAS); }) -(define_insn "probe_stack_range" +(define_insn "@probe_stack_range" [(set (match_operand:P 0 "register_operand" "=r") (unspec_volatile:P [(match_operand:P 1 "register_operand" "0") (match_operand:P 2 "register_operand" "r")] @@ -7468,7 +7468,7 @@ visl") ;; Special pattern for the FLUSH instruction. -(define_insn "flush" +(define_insn "@flush" [(unspec_volatile [(match_operand:P 0 "memory_operand" "m")] UNSPECV_FLUSH)] "" { @@ -7935,14 +7935,14 @@ visl") ;; TLS support instructions. 
-(define_insn "tgd_hi22" +(define_insn "@tgd_hi22" [(set (match_operand:P 0 "register_operand" "=r") (high:P (unspec:P [(match_operand 1 "tgd_symbolic_operand" "")] UNSPEC_TLSGD)))] "TARGET_TLS" "sethi\\t%%tgd_hi22(%a1), %0") -(define_insn "tgd_lo10" +(define_insn "@tgd_lo10" [(set (match_operand:P 0 "register_operand" "=r") (lo_sum:P (match_operand:P 1 "register_operand" "r") (unspec:P [(match_operand 2 "tgd_symbolic_operand" "")] @@ -7950,7 +7950,7 @@ visl") "TARGET_TLS" "add\\t%1, %%tgd_lo10(%a2), %0") -(define_insn "tgd_add" +(define_insn "@tgd_add" [(set (match_operand:P 0 "register_operand" "=r") (plus:P (match_operand:P 1 "register_operand" "r") (unspec:P [(match_operand:P 2 "register_operand" "r") @@ -7959,7 +7959,7 @@ visl") "TARGET_TLS" "add\\t%1, %2, %0, %%tgd_add(%a3)") -(define_insn "tgd_call" +(define_insn "@tgd_call" [(set (match_operand 0 "register_operand" "=r") (call (mem:P (unspec:P [(match_operand:P 1 "symbolic_operand" "s") (match_operand 2 "tgd_symbolic_operand" "")] @@ -7972,20 +7972,20 @@ visl") (const_string "call") (const_string "call_no_delay_slot")))]) -(define_insn "tldm_hi22" +(define_insn "@tldm_hi22" [(set (match_operand:P 0 "register_operand" "=r") (high:P (unspec:P [(const_int 0)] UNSPEC_TLSLDM)))] "TARGET_TLS" "sethi\\t%%tldm_hi22(%&), %0") -(define_insn "tldm_lo10" +(define_insn "@tldm_lo10" [(set (match_operand:P 0 "register_operand" "=r") (lo_sum:P (match_operand:P 1 "register_operand" "r") (unspec:P [(const_int 0)] UNSPEC_TLSLDM)))] "TARGET_TLS" "add\\t%1, %%tldm_lo10(%&), %0") -(define_insn "tldm_add" +(define_insn "@tldm_add" [(set (match_operand:P 0 "register_operand" "=r") (plus:P (match_operand:P 1 "register_operand" "r") (unspec:P [(match_operand:P 2 "register_operand" "r")] @@ -7993,7 +7993,7 @@ visl") "TARGET_TLS" "add\\t%1, %2, %0, %%tldm_add(%&)") -(define_insn "tldm_call" +(define_insn "@tldm_call" [(set (match_operand 0 "register_operand" "=r") (call (mem:P (unspec:P [(match_operand:P 1 "symbolic_operand" "s")] UNSPEC_TLSLDM)) @@ -8005,14 +8005,14 @@ visl") (const_string "call") (const_string "call_no_delay_slot")))]) -(define_insn "tldo_hix22" +(define_insn "@tldo_hix22" [(set (match_operand:P 0 "register_operand" "=r") (high:P (unspec:P [(match_operand 1 "tld_symbolic_operand" "")] UNSPEC_TLSLDO)))] "TARGET_TLS" "sethi\\t%%tldo_hix22(%a1), %0") -(define_insn "tldo_lox10" +(define_insn "@tldo_lox10" [(set (match_operand:P 0 "register_operand" "=r") (lo_sum:P (match_operand:P 1 "register_operand" "r") (unspec:P [(match_operand 2 "tld_symbolic_operand" "")] @@ -8020,7 +8020,7 @@ visl") "TARGET_TLS" "xor\\t%1, %%tldo_lox10(%a2), %0") -(define_insn "tldo_add" +(define_insn "@tldo_add" [(set (match_operand:P 0 "register_operand" "=r") (plus:P (match_operand:P 1 "register_operand" "r") (unspec:P [(match_operand:P 2 "register_operand" "r") @@ -8029,14 +8029,14 @@ visl") "TARGET_TLS" "add\\t%1, %2, %0, %%tldo_add(%a3)") -(define_insn "tie_hi22" +(define_insn "@tie_hi22" [(set (match_operand:P 0 "register_operand" "=r") (high:P (unspec:P [(match_operand 1 "tie_symbolic_operand" "")] UNSPEC_TLSIE)))] "TARGET_TLS" "sethi\\t%%tie_hi22(%a1), %0") -(define_insn "tie_lo10" +(define_insn "@tie_lo10" [(set (match_operand:P 0 "register_operand" "=r") (lo_sum:P (match_operand:P 1 "register_operand" "r") (unspec:P [(match_operand 2 "tie_symbolic_operand" "")] @@ -8068,7 +8068,7 @@ visl") [(set_attr "type" "load") (set_attr "subtype" "regular")]) -(define_insn "tie_add" +(define_insn "@tie_add" [(set (match_operand:P 0 "register_operand" "=r") (plus:P 
(match_operand:P 1 "register_operand" "r") (unspec:P [(match_operand:P 2 "register_operand" "r") @@ -8077,14 +8077,14 @@ visl") "TARGET_SUN_TLS" "add\\t%1, %2, %0, %%tie_add(%a3)") -(define_insn "tle_hix22" +(define_insn "@tle_hix22" [(set (match_operand:P 0 "register_operand" "=r") (high:P (unspec:P [(match_operand 1 "tle_symbolic_operand" "")] UNSPEC_TLSLE)))] "TARGET_TLS" "sethi\\t%%tle_hix22(%a1), %0") -(define_insn "tle_lox10" +(define_insn "@tle_lox10" [(set (match_operand:P 0 "register_operand" "=r") (lo_sum:P (match_operand:P 1 "register_operand" "r") (unspec:P [(match_operand 2 "tle_symbolic_operand" "")] @@ -8342,13 +8342,13 @@ visl") operands[1] = gen_rtx_MEM (Pmode, addr); #endif if (TARGET_ARCH64) - emit_insn (gen_stack_protect_setdi (operands[0], operands[1])); + emit_insn (gen_stack_protect_set64 (operands[0], operands[1])); else - emit_insn (gen_stack_protect_setsi (operands[0], operands[1])); + emit_insn (gen_stack_protect_set32 (operands[0], operands[1])); DONE; }) -(define_insn "stack_protect_setsi" +(define_insn "stack_protect_set32" [(set (match_operand:SI 0 "memory_operand" "=m") (unspec:SI [(match_operand:SI 1 "memory_operand" "m")] UNSPEC_SP_SET)) (set (match_scratch:SI 2 "=&r") (const_int 0))] @@ -8357,7 +8357,7 @@ visl") [(set_attr "type" "multi") (set_attr "length" "3")]) -(define_insn "stack_protect_setdi" +(define_insn "stack_protect_set64" [(set (match_operand:DI 0 "memory_operand" "=m") (unspec:DI [(match_operand:DI 1 "memory_operand" "m")] UNSPEC_SP_SET)) (set (match_scratch:DI 2 "=&r") (const_int 0))] @@ -8381,13 +8381,13 @@ visl") if (TARGET_ARCH64) { result = gen_reg_rtx (Pmode); - emit_insn (gen_stack_protect_testdi (result, operands[0], operands[1])); + emit_insn (gen_stack_protect_test64 (result, operands[0], operands[1])); test = gen_rtx_EQ (VOIDmode, result, const0_rtx); emit_jump_insn (gen_cbranchdi4 (test, result, const0_rtx, operands[2])); } else { - emit_insn (gen_stack_protect_testsi (operands[0], operands[1])); + emit_insn (gen_stack_protect_test32 (operands[0], operands[1])); result = gen_rtx_REG (CCmode, SPARC_ICC_REG); test = gen_rtx_EQ (VOIDmode, result, const0_rtx); emit_jump_insn (gen_cbranchcc4 (test, result, const0_rtx, operands[2])); @@ -8395,7 +8395,7 @@ visl") DONE; }) -(define_insn "stack_protect_testsi" +(define_insn "stack_protect_test32" [(set (reg:CC CC_REG) (unspec:CC [(match_operand:SI 0 "memory_operand" "m") (match_operand:SI 1 "memory_operand" "m")] @@ -8407,7 +8407,7 @@ visl") [(set_attr "type" "multi") (set_attr "length" "4")]) -(define_insn "stack_protect_testdi" +(define_insn "stack_protect_test64" [(set (match_operand:DI 0 "register_operand" "=&r") (unspec:DI [(match_operand:DI 1 "memory_operand" "m") (match_operand:DI 2 "memory_operand" "m")] -- cgit v1.1 From d03ca8a6148f55e119b8220a9c65147173b32065 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 29 Apr 2021 15:24:51 +0200 Subject: i386: Cleanup comparison predicates. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CCCmode is allowed only with GEU and LTU comparison codes. Also allow CCGZmode for these two codes. There is no need to check for trivial FP comparison operator, ix86_fp_compare_code_to_integer will return UNKNOWN code for unsupported operators. 2021-04-29 Uroš Bizjak gcc/ * config/i386/predicates.md (fcmov_comparison_operator): Do not check for trivial FP comparison operator. : Allow CCGZmode. : Do not allow CCCmode. (ix86_comparison_operator) : Allow only CCmode. 
(ix86_carry_flag_operator): Match only LTU and UNLT code. Do not check for trivial FP comparison operator. Allow CCGZmode. --- gcc/config/i386/predicates.md | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index b1df854..04a03a7 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1352,16 +1352,17 @@ enum rtx_code code = GET_CODE (op); if (inmode == CCFPmode) - { - if (!ix86_trivial_fp_comparison_operator (op, mode)) - return false; - code = ix86_fp_compare_code_to_integer (code); - } + code = ix86_fp_compare_code_to_integer (code); + /* i387 supports just limited amount of conditional codes. */ switch (code) { - case LTU: case GTU: case LEU: case GEU: - if (inmode == CCmode || inmode == CCFPmode || inmode == CCCmode) + case GEU: case LTU: + if (inmode == CCCmode || inmode == CCGZmode) + return true; + /* FALLTHRU */ + case GTU: case LEU: + if (inmode == CCmode || inmode == CCFPmode) return true; return false; case ORDERED: case UNORDERED: @@ -1418,11 +1419,11 @@ return true; return false; case GEU: case LTU: - if (inmode == CCGZmode) + if (inmode == CCCmode || inmode == CCGZmode) return true; /* FALLTHRU */ case GTU: case LEU: - if (inmode == CCmode || inmode == CCCmode || inmode == CCGZmode) + if (inmode == CCmode) return true; return false; case ORDERED: case UNORDERED: @@ -1441,20 +1442,14 @@ ;; Return true if OP is a valid comparison operator ;; testing carry flag to be set. (define_predicate "ix86_carry_flag_operator" - (match_code "ltu,lt,unlt,gtu,gt,ungt,le,unle,ge,unge,ltgt,uneq") + (match_code "ltu,unlt") { machine_mode inmode = GET_MODE (XEXP (op, 0)); enum rtx_code code = GET_CODE (op); if (inmode == CCFPmode) - { - if (!ix86_trivial_fp_comparison_operator (op, mode)) - return false; - code = ix86_fp_compare_code_to_integer (code); - } - else if (inmode == CCCmode) - return code == LTU || code == GTU; - else if (inmode != CCmode) + code = ix86_fp_compare_code_to_integer (code); + else if (inmode != CCmode && inmode != CCCmode && inmode != CCGZmode) return false; return code == LTU; -- cgit v1.1 From fd5d57946036c967dae292330fa0aa856a58fb4b Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 29 Apr 2021 16:43:33 +0200 Subject: i386: Mark x86 masked load builtins pure [PR100312] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mark x86 AVX and AVX2 masked load builtins pure to enable dead code elimination and more appropriate alias analysis. 2021-04-29 Uroš Bizjak Richard Biener gcc/ PR target/100312 * config/i386/i386-builtin.def (IX86_BUILTIN_MASKLOADPD) (IX86_BUILTIN_MASKLOADPS, IX86_BUILTIN_MASKLOADPD256) (IX86_BUILTIN_MASKLOADPS256, IX86_BUILTIN_MASKLOADD) (IX86_BUILTIN_MASKLOADQ, IX86_BUILTIN_MASKLOADD256) (IX86_BUILTIN_MASKLOADQ256): Move from SPECIAL_ARGS to PURE_ARGS category. * config/i386/i386-builtins.c (ix86_init_mmx_sse_builtins): Handle PURE_ARGS category. * config/i386/i386-expand.c (ix86_expand_builtin): Ditto. 
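A user-level sketch of what the reclassification enables (the intrinsic
wrapper is from immintrin.h, not from this patch): once the builtin is pure
rather than a "special args" builtin with unknown side effects, a masked
load whose result is unused becomes a candidate for dead code elimination.

  #include <immintrin.h>

  void
  dead_load (const float *p, __m256i mask)
  {
    /* Result unused: with __builtin_ia32_maskloadps256 marked pure,
       this load can now be removed by DCE.  */
    _mm256_maskload_ps (p, mask);
  }
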
--- gcc/config/i386/i386-builtin.def | 25 ++++++++++++++++--------- gcc/config/i386/i386-builtins.c | 22 ++++++++++++++++++++-- gcc/config/i386/i386-expand.c | 8 ++++++++ 3 files changed, 44 insertions(+), 11 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def index 4dbd4f2..80c2a2c 100644 --- a/gcc/config/i386/i386-builtin.def +++ b/gcc/config/i386/i386-builtin.def @@ -187,10 +187,6 @@ BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq25 BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF) BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF) -BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI) -BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI) -BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI) -BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI) BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF) BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF) BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF) @@ -198,10 +194,6 @@ BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_mask /* AVX2 */ BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI) -BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI) -BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI) -BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI) -BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI) BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI) BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI) BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI) @@ -473,7 +465,22 @@ BDESC (0, OPTION_MASK_ISA2_WIDEKL, CODE_FOR_nothing, "__builtin_ia32_aesdecwide2 BDESC (0, OPTION_MASK_ISA2_WIDEKL, CODE_FOR_nothing, "__builtin_ia32_aesencwide128kl_u8", IX86_BUILTIN_AESENCWIDE128KLU8, UNKNOWN, (int) UINT8_FTYPE_PV2DI_PCV2DI_PCVOID) BDESC 
(0, OPTION_MASK_ISA2_WIDEKL, CODE_FOR_nothing, "__builtin_ia32_aesencwide256kl_u8", IX86_BUILTIN_AESENCWIDE256KLU8, UNKNOWN, (int) UINT8_FTYPE_PV2DI_PCV2DI_PCVOID) -BDESC_END (SPECIAL_ARGS, ARGS) +BDESC_END (SPECIAL_ARGS, PURE_ARGS) + +/* AVX */ +BDESC_FIRST (pure_args, PURE_ARGS, + OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI) +BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI) +BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI) +BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI) + +/* AVX2 */ +BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI) +BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI) +BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI) +BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI) + +BDESC_END (PURE_ARGS, ARGS) /* Builtins with variable number of arguments. */ BDESC_FIRST (args, ARGS, diff --git a/gcc/config/i386/i386-builtins.c b/gcc/config/i386/i386-builtins.c index 128bd39..b669110 100644 --- a/gcc/config/i386/i386-builtins.c +++ b/gcc/config/i386/i386-builtins.c @@ -108,8 +108,10 @@ BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_FIRST, IX86_BUILTIN__BDESC_PCMPESTR_LAST, 1); BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, IX86_BUILTIN__BDESC_PCMPISTR_LAST, 1); -BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_FIRST, +BDESC_VERIFYS (IX86_BUILTIN__BDESC_PURE_ARGS_FIRST, IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, 1); +BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_FIRST, + IX86_BUILTIN__BDESC_PURE_ARGS_LAST, 1); BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, IX86_BUILTIN__BDESC_ARGS_LAST, 1); BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, @@ -527,7 +529,23 @@ ix86_init_mmx_sse_builtins (void) IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, ARRAY_SIZE (bdesc_special_args) - 1); - /* Add all builtins with variable number of operands. */ + /* Add all pure builtins with variable number of operands. */ + for (i = 0, d = bdesc_pure_args; + i < ARRAY_SIZE (bdesc_pure_args); + i++, d++) + { + BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PURE_ARGS_FIRST, i); + if (d->name == 0) + continue; + + ftype = (enum ix86_builtin_func_type) d->flag; + def_builtin_pure (d->mask, d->mask2, d->name, ftype, d->code); + } + BDESC_VERIFYS (IX86_BUILTIN__BDESC_PURE_ARGS_LAST, + IX86_BUILTIN__BDESC_PURE_ARGS_FIRST, + ARRAY_SIZE (bdesc_pure_args) - 1); + + /* Add all const builtins with variable number of operands. 
*/ for (i = 0, d = bdesc_args; i < ARRAY_SIZE (bdesc_args); i++, d++) diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 516440e..7699062 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -13240,6 +13240,14 @@ rdseed_step: target); } + if (fcode >= IX86_BUILTIN__BDESC_PURE_ARGS_FIRST + && fcode <= IX86_BUILTIN__BDESC_PURE_ARGS_LAST) + { + i = fcode - IX86_BUILTIN__BDESC_PURE_ARGS_FIRST; + return ix86_expand_special_args_builtin (bdesc_pure_args + i, exp, + target); + } + if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST) { -- cgit v1.1 From 449d7b40f6f6be8d7f9aa7232c73b0371f0963bf Mon Sep 17 00:00:00 2001 From: Jeff Law Date: Thu, 29 Apr 2021 09:08:56 -0600 Subject: Fix nios2 build failure gcc * config/nios2/nios2-protos.h (nios2_fpu_insn_enabled): Move outside of RTX_CODE guard. --- gcc/config/nios2/nios2-protos.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/nios2/nios2-protos.h b/gcc/config/nios2/nios2-protos.h index df5d0c9..b831b0f 100644 --- a/gcc/config/nios2/nios2-protos.h +++ b/gcc/config/nios2/nios2-protos.h @@ -28,6 +28,7 @@ extern void nios2_expand_prologue (void); extern void nios2_expand_epilogue (bool); extern bool nios2_expand_return (void); extern void nios2_function_profiler (FILE *, int); +extern bool nios2_fpu_insn_enabled (enum n2fpu_code); #ifdef RTX_CODE extern bool nios2_large_constant_p (rtx); @@ -46,7 +47,6 @@ extern bool nios2_validate_compare (machine_mode, rtx *, rtx *, rtx *); extern bool nios2_validate_fpu_compare (machine_mode, rtx *, rtx *, rtx *, bool); -extern bool nios2_fpu_insn_enabled (enum n2fpu_code); extern const char * nios2_fpu_insn_asm (enum n2fpu_code); extern const char * nios2_add_insn_asm (rtx_insn *, rtx *); -- cgit v1.1 From 86403f4e6e5f72169147aca7e0d0b63e303ad5cd Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 29 Apr 2021 22:02:00 +0200 Subject: i386: Optimize carry flag comparisons a bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In ix86_int_compare, opportunistically swap operands of GTU and LEU comparisons to emit carry flag comparison, with the expectation that the comparison will combine to *add3_carry_0 or *sub3_carry_0 insn pattern. Do not use ix86_expand_carry_flag_compare because this function prefers carry flag comparisons too much - it forces the constants into registers and/or emits additional arithmetic instructions to convert simple comparisons into carry flag comparisons - but simply swap operands to convert GTU and LEU comparisons into GEU and LTU ones. Also, change the insn predicates of *add3_carry_0 and *sub3_carry_0 insn patterns to allow more combine opportunities with memory operands. 2021-04-29 Uroš Bizjak gcc/ * config/i386/i386-expand.c (ix86_expand_int_compare): Swap operands of GTU and LEU comparison to emit carry flag comparison. * config/i386/i386.md (*add3_carry_0): Change insn predicate to allow more combine opportunities with memory operands. (*sub3_carry_0): Ditto. 
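A hedged C-level illustration of the kind of code this targets (the function
and its operands are made up for the example, not taken from the patch):

  unsigned long
  f (unsigned long x, unsigned long *p, unsigned long bound)
  {
    return x - (*p > bound);
  }

The unsigned greater-than test is a GTU comparison with a memory operand;
with the operands swapped it becomes a carry-flag (LTU) comparison, and with
the relaxed predicates the subtraction may now combine into the
*sub<mode>3_carry_0 pattern, i.e. a cmp followed by sbb $0, instead of
materializing the comparison result in a register first.
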
--- gcc/config/i386/i386-expand.c | 8 ++++++++ gcc/config/i386/i386.md | 4 ++-- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 7699062..fee4d07 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -2658,6 +2658,14 @@ ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1) machine_mode cmpmode; rtx tmp, flags; + /* Swap operands to emit carry flag comparison. */ + if ((code == GTU || code == LEU) + && nonimmediate_operand (op1, VOIDmode)) + { + std::swap (op0, op1); + code = swap_condition (code); + } + cmpmode = SELECT_CC_MODE (code, op0, op1); flags = gen_rtx_REG (cmpmode, FLAGS_REG); diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index eff189f..b7f3e36 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -6777,7 +6777,7 @@ [(match_operand 2 "flags_reg_operand") (const_int 0)]) (match_operand:SWI 1 "nonimmediate_operand" "0"))) (clobber (reg:CC FLAGS_REG))] - "ix86_unary_operator_ok (PLUS, mode, operands)" + "!MEM_P (operands[0]) || rtx_equal_p (operands[0], operands[1])" "adc{}\t{$0, %0|%0, 0}" [(set_attr "type" "alu") (set_attr "use_carry" "1") @@ -6919,7 +6919,7 @@ (match_operator:SWI 3 "ix86_carry_flag_operator" [(match_operand 2 "flags_reg_operand") (const_int 0)]))) (clobber (reg:CC FLAGS_REG))] - "ix86_unary_operator_ok (MINUS, mode, operands)" + "!MEM_P (operands[0]) || rtx_equal_p (operands[0], operands[1])" "sbb{}\t{$0, %0|%0, 0}" [(set_attr "type" "alu") (set_attr "use_carry" "1") -- cgit v1.1 From c4551a27620670397b16101592d4689a339f77cb Mon Sep 17 00:00:00 2001 From: Alexandre Oliva Date: Thu, 29 Apr 2021 19:31:30 -0300 Subject: add ASM_OUTPUT_MAX_SKIP_ALIGN to i386.h Several i386 align tests expect p2align to be used, but not all configurations define ASM_OUTPUT_MAX_SKIP_ALIGN, even when HAVE_GAS_MAX_SKIP_P2ALIGN. i386.h had an equivalent ASM_OUTPUT_MAX_SKIP_PAD. I've renamed it and its uses to the documented _ALIGN spelling, and dropped all redundant defines elsewhere in gcc/config/i386/. for gcc/ChangeLog * config/i386/i386.h (ASM_OUTPUT_MAX_SKIP_PAD): Rename to... (ASM_OUTPUT_MAX_SKIP_ALIGN): ... this. Enclose in do/while(0). * config/i386/i386.c: Adjust. * config/i386/i386.md: Adjust. * config/i386/darwin.h (ASM_OUTPUT_MAX_SKIP_ALIGN): Drop. * config/i386/dragonfly.h (ASM_OUTPUT_MAX_SKIP_ALIGN): Likewise. * config/i386/freebsd.h (ASM_OUTPUT_MAX_SKIP_ALIGN): Likewise. * config/i386/gas.h (ASM_OUTPUT_MAX_SKIP_ALIGN): Likewise. * config/i386/gnu-user.h (ASM_OUTPUT_MAX_SKIP_ALIGN): Likewise. * config/i386/iamcu.h (ASM_OUTPUT_MAX_SKIP_ALIGN): Likewise. * config/i386/lynx.h (ASM_OUTPUT_MAX_SKIP_ALIGN): Likewise. * config/i386/netbsd-elf.h (ASM_OUTPUT_MAX_SKIP_ALIGN): Likewise. * config/i386/openbsdelf.h (ASM_OUTPUT_MAX_SKIP_ALIGN): Likewise. * config/i386/x86-64.h (ASM_OUTPUT_MAX_SKIP_ALIGN): Likewise. (ASM_OUTPUT_MAX_SKIP_PAD): Likewise. 
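The consolidated definition in i386.h keeps the documented behaviour: align
to a 2^LOG byte boundary, but only when that costs at most MAX_SKIP bytes of
padding.  As an illustration (not an excerpt from the patch),

  ASM_OUTPUT_MAX_SKIP_ALIGN (asm_out_file, 4, 10);

prints "\t.p2align 4,,10", which the assembler honours as a 16-byte
alignment request unless more than 10 padding bytes would be needed.
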
--- gcc/config/i386/darwin.h | 12 ------------ gcc/config/i386/dragonfly.h | 17 ----------------- gcc/config/i386/freebsd.h | 13 ------------- gcc/config/i386/gas.h | 16 ---------------- gcc/config/i386/gnu-user.h | 12 ------------ gcc/config/i386/i386.c | 4 ++-- gcc/config/i386/i386.h | 14 +++++++------- gcc/config/i386/i386.md | 4 ++-- gcc/config/i386/iamcu.h | 10 ---------- gcc/config/i386/lynx.h | 18 ------------------ gcc/config/i386/netbsd-elf.h | 16 ---------------- gcc/config/i386/openbsdelf.h | 16 ---------------- gcc/config/i386/x86-64.h | 24 ------------------------ 13 files changed, 11 insertions(+), 165 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/darwin.h b/gcc/config/i386/darwin.h index 2657dfe..afa9f1b 100644 --- a/gcc/config/i386/darwin.h +++ b/gcc/config/i386/darwin.h @@ -217,18 +217,6 @@ along with GCC; see the file COPYING3. If not see } \ } while (0) -#ifdef HAVE_GAS_MAX_SKIP_P2ALIGN -#define ASM_OUTPUT_MAX_SKIP_ALIGN(FILE,LOG,MAX_SKIP) \ - do { \ - if ((LOG) != 0) { \ - if ((MAX_SKIP) == 0 || (MAX_SKIP) >= (1 << (LOG)) - 1) \ - fprintf ((FILE), "\t.p2align %d\n", (LOG)); \ - else \ - fprintf ((FILE), "\t.p2align %d,,%d\n", (LOG), (MAX_SKIP)); \ - } \ - } while (0) -#endif - /* Darwin x86 assemblers support the .ident directive. */ #undef TARGET_ASM_OUTPUT_IDENT diff --git a/gcc/config/i386/dragonfly.h b/gcc/config/i386/dragonfly.h index ab8a269..62fac88 100644 --- a/gcc/config/i386/dragonfly.h +++ b/gcc/config/i386/dragonfly.h @@ -61,23 +61,6 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see #define SUBTARGET_EXTRA_SPECS \ { "dfbsd_dynamic_linker", DFBSD_DYNAMIC_LINKER } -/* A C statement to output to the stdio stream FILE an assembler - command to advance the location counter to a multiple of 1<= (1 << (LOG)) - 1) \ - fprintf ((FILE), "\t.p2align %d\n", (LOG)); \ - else \ - fprintf ((FILE), "\t.p2align %d,,%d\n", (LOG), (MAX_SKIP)); \ - } -#endif - /* Don't default to pcc-struct-return, we want to retain compatibility with older gcc versions AND pcc-struct-return is nonreentrant. (even though the SVR4 ABI for the i386 says that records and unions are diff --git a/gcc/config/i386/freebsd.h b/gcc/config/i386/freebsd.h index b1b3bb3..00df79a 100644 --- a/gcc/config/i386/freebsd.h +++ b/gcc/config/i386/freebsd.h @@ -96,19 +96,6 @@ along with GCC; see the file COPYING3. If not see #define SUBALIGN_LOG 3 -#ifdef HAVE_GAS_MAX_SKIP_P2ALIGN -#undef ASM_OUTPUT_MAX_SKIP_ALIGN -#define ASM_OUTPUT_MAX_SKIP_ALIGN(FILE,LOG,MAX_SKIP) \ - do { \ - if ((LOG) != 0) { \ - if ((MAX_SKIP) == 0 || (MAX_SKIP) >= (1 << (LOG)) - 1) \ - fprintf ((FILE), "\t.p2align %d\n", (LOG)); \ - else \ - fprintf ((FILE), "\t.p2align %d,,%d\n", (LOG), (MAX_SKIP)); \ - } \ - } while (0) -#endif - /* Don't default to pcc-struct-return, we want to retain compatibility with older gcc versions AND pcc-struct-return is nonreentrant. (even though the SVR4 ABI for the i386 says that records and unions are diff --git a/gcc/config/i386/gas.h b/gcc/config/i386/gas.h index f76a283..d3cfd61 100644 --- a/gcc/config/i386/gas.h +++ b/gcc/config/i386/gas.h @@ -59,22 +59,6 @@ along with GCC; see the file COPYING3. 
If not see #define ASM_OUTPUT_ALIGN(FILE,LOG) \ if ((LOG)!=0) fprintf ((FILE), "\t.balign %d\n", 1 << (LOG)) #endif - -/* A C statement to output to the stdio stream FILE an assembler - command to advance the location counter to a multiple of 1<= (1 << (LOG)) - 1) \ - fprintf ((FILE), "\t.p2align %d\n", (LOG)); \ - else \ - fprintf ((FILE), "\t.p2align %d,,%d\n", (LOG), (MAX_SKIP)); \ - } -#endif /* A C statement or statements which output an assembler instruction opcode to the stdio stream STREAM. The macro-operand PTR is a diff --git a/gcc/config/i386/gnu-user.h b/gcc/config/i386/gnu-user.h index a23e7ab..57d6781 100644 --- a/gcc/config/i386/gnu-user.h +++ b/gcc/config/i386/gnu-user.h @@ -93,18 +93,6 @@ along with GCC; see the file COPYING3. If not see #define SUBALIGN_LOG 3 -#ifdef HAVE_GAS_MAX_SKIP_P2ALIGN -#define ASM_OUTPUT_MAX_SKIP_ALIGN(FILE,LOG,MAX_SKIP) \ - do { \ - if ((LOG) != 0) { \ - if ((MAX_SKIP) == 0 || (MAX_SKIP) >= (1 << (LOG)) - 1) \ - fprintf ((FILE), "\t.p2align %d\n", (LOG)); \ - else \ - fprintf ((FILE), "\t.p2align %d,,%d\n", (LOG), (MAX_SKIP)); \ - } \ - } while (0) -#endif - /* Handle special EH pointer encodings. Absolute, pc-relative, and indirect are handled automatically. */ #define ASM_MAYBE_OUTPUT_ENCODED_ADDR_RTX(FILE, ENCODING, SIZE, ADDR, DONE) \ diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 68f33f9..48079c8 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -20946,7 +20946,7 @@ ix86_min_insn_size (rtx_insn *insn) return 2; } -#ifdef ASM_OUTPUT_MAX_SKIP_PAD +#ifdef ASM_OUTPUT_MAX_SKIP_ALIGN /* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte window. */ @@ -21274,7 +21274,7 @@ ix86_reorg (void) ix86_pad_short_function (); else if (TARGET_PAD_RETURNS) ix86_pad_returns (); -#ifdef ASM_OUTPUT_MAX_SKIP_PAD +#ifdef ASM_OUTPUT_MAX_SKIP_ALIGN if (TARGET_FOUR_JUMP_LIMIT) ix86_avoid_jump_mispredicts (); #endif diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 96b46ba..97d6f38 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -2068,15 +2068,15 @@ extern int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER]; bytes if it is within MAX_SKIP bytes. */ #ifdef HAVE_GAS_MAX_SKIP_P2ALIGN -#undef ASM_OUTPUT_MAX_SKIP_PAD -#define ASM_OUTPUT_MAX_SKIP_PAD(FILE, LOG, MAX_SKIP) \ - if ((LOG) != 0) \ - { \ +# define ASM_OUTPUT_MAX_SKIP_ALIGN(FILE,LOG,MAX_SKIP) \ + do { \ + if ((LOG) != 0) { \ if ((MAX_SKIP) == 0 || (MAX_SKIP) >= (1 << (LOG)) - 1) \ - fprintf ((FILE), "\t.p2align %d\n", (LOG)); \ + fprintf ((FILE), "\t.p2align %d\n", (LOG)); \ else \ - fprintf ((FILE), "\t.p2align %d,,%d\n", (LOG), (MAX_SKIP)); \ - } + fprintf ((FILE), "\t.p2align %d,,%d\n", (LOG), (MAX_SKIP)); \ + } \ + } while (0) #endif /* Write the extra assembler code needed to declare a function diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index b7f3e36..70ff29b 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -13937,8 +13937,8 @@ [(unspec_volatile [(match_operand 0)] UNSPECV_ALIGN)] "" { -#ifdef ASM_OUTPUT_MAX_SKIP_PAD - ASM_OUTPUT_MAX_SKIP_PAD (asm_out_file, 4, (int)INTVAL (operands[0])); +#ifdef ASM_OUTPUT_MAX_SKIP_ALIGN + ASM_OUTPUT_MAX_SKIP_ALIGN (asm_out_file, 4, (int)INTVAL (operands[0])); #else /* It is tempting to use ASM_OUTPUT_ALIGN here, but we don't want to do that. 
The align insn is used to avoid 3 jump instructions in the row to improve diff --git a/gcc/config/i386/iamcu.h b/gcc/config/i386/iamcu.h index be99406..33012b2 100644 --- a/gcc/config/i386/iamcu.h +++ b/gcc/config/i386/iamcu.h @@ -66,16 +66,6 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see #define SUBALIGN_LOG 3 -#define ASM_OUTPUT_MAX_SKIP_ALIGN(FILE,LOG,MAX_SKIP) \ - do { \ - if ((LOG) != 0) { \ - if ((MAX_SKIP) == 0 || (MAX_SKIP) >= (1 << (LOG)) - 1) \ - fprintf ((FILE), "\t.p2align %d\n", (LOG)); \ - else \ - fprintf ((FILE), "\t.p2align %d,,%d\n", (LOG), (MAX_SKIP)); \ - } \ - } while (0) - /* Handle special EH pointer encodings. Absolute, pc-relative, and indirect are handled automatically. */ #define ASM_MAYBE_OUTPUT_ENCODED_ADDR_RTX(FILE, ENCODING, SIZE, ADDR, DONE) \ diff --git a/gcc/config/i386/lynx.h b/gcc/config/i386/lynx.h index 4804016..70b2587 100644 --- a/gcc/config/i386/lynx.h +++ b/gcc/config/i386/lynx.h @@ -50,24 +50,6 @@ along with GCC; see the file COPYING3. If not see : (n) == 7 ? 4 \ : ((n) >= FIRST_STACK_REG && (n) <= LAST_STACK_REG) ? (int) (n) + 8 \ : (-1)) - -/* A C statement to output to the stdio stream FILE an assembler - command to advance the location counter to a multiple of 1<= (1 << (LOG)) - 1) \ - fprintf ((FILE), "\t.p2align %d\n", (LOG)); \ - else \ - fprintf ((FILE), "\t.p2align %d,,%d\n", (LOG), (MAX_SKIP)); \ - } \ - } while (0) -#endif /* Undefine SUBTARGET_EXTRA_SPECS it is empty anyway. We define it in config/lynx.h. */ diff --git a/gcc/config/i386/netbsd-elf.h b/gcc/config/i386/netbsd-elf.h index a0bbfd0..66cd192 100644 --- a/gcc/config/i386/netbsd-elf.h +++ b/gcc/config/i386/netbsd-elf.h @@ -95,22 +95,6 @@ along with GCC; see the file COPYING3. If not see assemble_name(FILE, NAME2); \ fputc('\n', FILE); } while (0) -/* A C statement to output to the stdio stream FILE an assembler - command to advance the location counter to a multiple of 1<= (1 << (LOG)) - 1) \ - fprintf ((FILE), "\t.p2align %d\n", (LOG)); \ - else \ - fprintf ((FILE), "\t.p2align %d,,%d\n", (LOG), (MAX_SKIP)); \ - } -#endif - /* We always use gas here, so we don't worry about ECOFF assembler problems. */ #undef TARGET_GAS diff --git a/gcc/config/i386/openbsdelf.h b/gcc/config/i386/openbsdelf.h index c411ff5..862ba12 100644 --- a/gcc/config/i386/openbsdelf.h +++ b/gcc/config/i386/openbsdelf.h @@ -61,24 +61,8 @@ along with GCC; see the file COPYING3. If not see #undef ASM_APP_OFF #define ASM_APP_OFF "#NO_APP\n" -/* A C statement to output to the stdio stream FILE an assembler - command to advance the location counter to a multiple of 1<= (1 << (LOG)) - 1) \ - fprintf ((FILE), "\t.p2align %d\n", (LOG)); \ - else \ - fprintf ((FILE), "\t.p2align %d,,%d\n", (LOG), (MAX_SKIP)); \ - } \ - } while (0) -#endif - /* OpenBSD's profiler recovers all information from the stack pointer. The icky part is not here, but in . */ #undef FUNCTION_PROFILER diff --git a/gcc/config/i386/x86-64.h b/gcc/config/i386/x86-64.h index 0cdd980..ea872a4 100644 --- a/gcc/config/i386/x86-64.h +++ b/gcc/config/i386/x86-64.h @@ -70,32 +70,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see } \ while (0) -/* This is used to align code labels according to Intel recommendations. 
*/ - #define SUBALIGN_LOG 3 -#ifdef HAVE_GAS_MAX_SKIP_P2ALIGN -#define ASM_OUTPUT_MAX_SKIP_ALIGN(FILE,LOG,MAX_SKIP) \ - do { \ - if ((LOG) != 0) { \ - if ((MAX_SKIP) == 0 || (MAX_SKIP) >= (1 << (LOG)) - 1) \ - fprintf ((FILE), "\t.p2align %d\n", (LOG)); \ - else \ - fprintf ((FILE), "\t.p2align %d,,%d\n", (LOG), (MAX_SKIP)); \ - } \ - } while (0) -#undef ASM_OUTPUT_MAX_SKIP_PAD -#define ASM_OUTPUT_MAX_SKIP_PAD(FILE, LOG, MAX_SKIP) \ - if ((LOG) != 0) \ - { \ - if ((MAX_SKIP) == 0 || (MAX_SKIP) >= (1 << (LOG)) - 1) \ - fprintf ((FILE), "\t.p2align %d\n", (LOG)); \ - else \ - fprintf ((FILE), "\t.p2align %d,,%d\n", (LOG), (MAX_SKIP)); \ - } -#endif - - /* i386 System V Release 4 uses DWARF debugging info. x86-64 ABI specifies DWARF2. */ -- cgit v1.1 From 6efd040c301b06fae51657c8370ad940c5c3d513 Mon Sep 17 00:00:00 2001 From: LevyHsu Date: Thu, 29 Apr 2021 13:42:04 +0800 Subject: RISC-V: Add patterns for builtin overflow. gcc/ * config/riscv/riscv.c (riscv_min_arithmetic_precision): New. * config/riscv/riscv.h (TARGET_MIN_ARITHMETIC_PRECISION): New. * config/riscv/riscv.md (addv4, uaddv4): New. (subv4, usubv4, mulv4, umulv4): New. --- gcc/config/riscv/riscv.c | 8 ++ gcc/config/riscv/riscv.h | 4 + gcc/config/riscv/riscv.md | 245 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 257 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/riscv/riscv.c b/gcc/config/riscv/riscv.c index 17cdf70..e1064e3 100644 --- a/gcc/config/riscv/riscv.c +++ b/gcc/config/riscv/riscv.c @@ -351,6 +351,14 @@ static const struct riscv_tune_info riscv_tune_info_table[] = { { "size", generic, &optimize_size_tune_info }, }; +/* Implement TARGET_MIN_ARITHMETIC_PRECISION. */ + +static unsigned int +riscv_min_arithmetic_precision (void) +{ + return 32; +} + /* Return the riscv_tune_info entry for the given name string. */ static const struct riscv_tune_info * diff --git a/gcc/config/riscv/riscv.h b/gcc/config/riscv/riscv.h index d17096e..f3e8572 100644 --- a/gcc/config/riscv/riscv.h +++ b/gcc/config/riscv/riscv.h @@ -146,6 +146,10 @@ ASM_MISA_SPEC #define MIN_UNITS_PER_WORD 4 #endif +/* Allows SImode op in builtin overflow pattern, see internal-fn.c. */ +#undef TARGET_MIN_ARITHMETIC_PRECISION +#define TARGET_MIN_ARITHMETIC_PRECISION riscv_min_arithmetic_precision + /* The `Q' extension is not yet supported. */ #define UNITS_PER_FP_REG (TARGET_DOUBLE_FLOAT ? 
8 : 4) diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md index c3687d5..0e35960 100644 --- a/gcc/config/riscv/riscv.md +++ b/gcc/config/riscv/riscv.md @@ -467,6 +467,81 @@ [(set_attr "type" "arith") (set_attr "mode" "DI")]) +(define_expand "addv4" + [(set (match_operand:GPR 0 "register_operand" "=r,r") + (plus:GPR (match_operand:GPR 1 "register_operand" " r,r") + (match_operand:GPR 2 "arith_operand" " r,I"))) + (label_ref (match_operand 3 "" ""))] + "" +{ + if (TARGET_64BIT && mode == SImode) + { + rtx t3 = gen_reg_rtx (DImode); + rtx t4 = gen_reg_rtx (DImode); + rtx t5 = gen_reg_rtx (DImode); + rtx t6 = gen_reg_rtx (DImode); + + emit_insn (gen_addsi3 (operands[0], operands[1], operands[2])); + if (GET_CODE (operands[1]) != CONST_INT) + emit_insn (gen_extend_insn (t4, operands[1], DImode, SImode, 0)); + else + t4 = operands[1]; + if (GET_CODE (operands[2]) != CONST_INT) + emit_insn (gen_extend_insn (t5, operands[2], DImode, SImode, 0)); + else + t5 = operands[2]; + emit_insn (gen_adddi3 (t3, t4, t5)); + emit_insn (gen_extend_insn (t6, operands[0], DImode, SImode, 0)); + + riscv_expand_conditional_branch (operands[3], NE, t6, t3); + } + else + { + rtx t3 = gen_reg_rtx (mode); + rtx t4 = gen_reg_rtx (mode); + + emit_insn (gen_add3_insn (operands[0], operands[1], operands[2])); + rtx cmp1 = gen_rtx_LT (mode, operands[2], const0_rtx); + emit_insn (gen_cstore4 (t3, cmp1, operands[2], const0_rtx)); + rtx cmp2 = gen_rtx_LT (mode, operands[0], operands[1]); + + emit_insn (gen_cstore4 (t4, cmp2, operands[0], operands[1])); + riscv_expand_conditional_branch (operands[3], NE, t3, t4); + } + DONE; +}) + +(define_expand "uaddv4" + [(set (match_operand:GPR 0 "register_operand" "=r,r") + (plus:GPR (match_operand:GPR 1 "register_operand" " r,r") + (match_operand:GPR 2 "arith_operand" " r,I"))) + (label_ref (match_operand 3 "" ""))] + "" +{ + if (TARGET_64BIT && mode == SImode) + { + rtx t3 = gen_reg_rtx (DImode); + rtx t4 = gen_reg_rtx (DImode); + + if (GET_CODE (operands[1]) != CONST_INT) + emit_insn (gen_extend_insn (t3, operands[1], DImode, SImode, 0)); + else + t3 = operands[1]; + emit_insn (gen_addsi3 (operands[0], operands[1], operands[2])); + emit_insn (gen_extend_insn (t4, operands[0], DImode, SImode, 0)); + + riscv_expand_conditional_branch (operands[3], LTU, t4, t3); + } + else + { + emit_insn (gen_add3_insn (operands[0], operands[1], operands[2])); + riscv_expand_conditional_branch (operands[3], LTU, operands[0], + operands[1]); + } + + DONE; +}) + (define_insn "*addsi3_extended" [(set (match_operand:DI 0 "register_operand" "=r,r") (sign_extend:DI @@ -523,6 +598,85 @@ [(set_attr "type" "arith") (set_attr "mode" "SI")]) +(define_expand "subv4" + [(set (match_operand:GPR 0 "register_operand" "= r") + (minus:GPR (match_operand:GPR 1 "reg_or_0_operand" " rJ") + (match_operand:GPR 2 "register_operand" " r"))) + (label_ref (match_operand 3 "" ""))] + "" +{ + if (TARGET_64BIT && mode == SImode) + { + rtx t3 = gen_reg_rtx (DImode); + rtx t4 = gen_reg_rtx (DImode); + rtx t5 = gen_reg_rtx (DImode); + rtx t6 = gen_reg_rtx (DImode); + + emit_insn (gen_subsi3 (operands[0], operands[1], operands[2])); + if (GET_CODE (operands[1]) != CONST_INT) + emit_insn (gen_extend_insn (t4, operands[1], DImode, SImode, 0)); + else + t4 = operands[1]; + if (GET_CODE (operands[2]) != CONST_INT) + emit_insn (gen_extend_insn (t5, operands[2], DImode, SImode, 0)); + else + t5 = operands[2]; + emit_insn (gen_subdi3 (t3, t4, t5)); + emit_insn (gen_extend_insn (t6, operands[0], DImode, SImode, 0)); + + 
riscv_expand_conditional_branch (operands[3], NE, t6, t3); + } + else + { + rtx t3 = gen_reg_rtx (mode); + rtx t4 = gen_reg_rtx (mode); + + emit_insn (gen_sub3_insn (operands[0], operands[1], operands[2])); + + rtx cmp1 = gen_rtx_LT (mode, operands[2], const0_rtx); + emit_insn (gen_cstore4 (t3, cmp1, operands[2], const0_rtx)); + + rtx cmp2 = gen_rtx_LT (mode, operands[1], operands[0]); + emit_insn (gen_cstore4 (t4, cmp2, operands[1], operands[0])); + + riscv_expand_conditional_branch (operands[3], NE, t3, t4); + } + + DONE; +}) + +(define_expand "usubv4" + [(set (match_operand:GPR 0 "register_operand" "= r") + (minus:GPR (match_operand:GPR 1 "reg_or_0_operand" " rJ") + (match_operand:GPR 2 "register_operand" " r"))) + (label_ref (match_operand 3 "" ""))] + "" +{ + if (TARGET_64BIT && mode == SImode) + { + rtx t3 = gen_reg_rtx (DImode); + rtx t4 = gen_reg_rtx (DImode); + + if (GET_CODE (operands[1]) != CONST_INT) + emit_insn (gen_extend_insn (t3, operands[1], DImode, SImode, 0)); + else + t3 = operands[1]; + emit_insn (gen_subsi3 (operands[0], operands[1], operands[2])); + emit_insn (gen_extend_insn (t4, operands[0], DImode, SImode, 0)); + + riscv_expand_conditional_branch (operands[3], LTU, t3, t4); + } + else + { + emit_insn (gen_sub3_insn (operands[0], operands[1], operands[2])); + riscv_expand_conditional_branch (operands[3], LTU, operands[1], + operands[0]); + } + + DONE; +}) + + (define_insn "*subsi3_extended" [(set (match_operand:DI 0 "register_operand" "= r") (sign_extend:DI @@ -614,6 +768,97 @@ [(set_attr "type" "imul") (set_attr "mode" "DI")]) +(define_expand "mulv4" + [(set (match_operand:GPR 0 "register_operand" "=r") + (mult:GPR (match_operand:GPR 1 "register_operand" " r") + (match_operand:GPR 2 "register_operand" " r"))) + (label_ref (match_operand 3 "" ""))] + "TARGET_MUL" +{ + if (TARGET_64BIT && mode == SImode) + { + rtx t3 = gen_reg_rtx (DImode); + rtx t4 = gen_reg_rtx (DImode); + rtx t5 = gen_reg_rtx (DImode); + rtx t6 = gen_reg_rtx (DImode); + + if (GET_CODE (operands[1]) != CONST_INT) + emit_insn (gen_extend_insn (t4, operands[1], DImode, SImode, 0)); + else + t4 = operands[1]; + if (GET_CODE (operands[2]) != CONST_INT) + emit_insn (gen_extend_insn (t5, operands[2], DImode, SImode, 0)); + else + t5 = operands[2]; + emit_insn (gen_muldi3 (t3, t4, t5)); + + emit_move_insn (operands[0], gen_lowpart (SImode, t3)); + emit_insn (gen_extend_insn (t6, operands[0], DImode, SImode, 0)); + + riscv_expand_conditional_branch (operands[3], NE, t6, t3); + } + else + { + rtx hp = gen_reg_rtx (mode); + rtx lp = gen_reg_rtx (mode); + + emit_insn (gen_mul3_highpart (hp, operands[1], operands[2])); + emit_insn (gen_mul3 (operands[0], operands[1], operands[2])); + emit_insn (gen_ashr3 (lp, operands[0], + GEN_INT (BITS_PER_WORD - 1))); + + riscv_expand_conditional_branch (operands[3], NE, hp, lp); + } + + DONE; +}) + +(define_expand "umulv4" + [(set (match_operand:GPR 0 "register_operand" "=r") + (mult:GPR (match_operand:GPR 1 "register_operand" " r") + (match_operand:GPR 2 "register_operand" " r"))) + (label_ref (match_operand 3 "" ""))] + "TARGET_MUL" +{ + if (TARGET_64BIT && mode == SImode) + { + rtx t3 = gen_reg_rtx (DImode); + rtx t4 = gen_reg_rtx (DImode); + rtx t5 = gen_reg_rtx (DImode); + rtx t6 = gen_reg_rtx (DImode); + rtx t7 = gen_reg_rtx (DImode); + rtx t8 = gen_reg_rtx (DImode); + + if (GET_CODE (operands[1]) != CONST_INT) + emit_insn (gen_extend_insn (t3, operands[1], DImode, SImode, 0)); + else + t3 = operands[1]; + if (GET_CODE (operands[2]) != CONST_INT) + emit_insn 
(gen_extend_insn (t4, operands[2], DImode, SImode, 0)); + else + t4 = operands[2]; + + emit_insn (gen_ashldi3 (t5, t3, GEN_INT (32))); + emit_insn (gen_ashldi3 (t6, t4, GEN_INT (32))); + emit_insn (gen_umuldi3_highpart (t7, t5, t6)); + emit_move_insn (operands[0], gen_lowpart (SImode, t7)); + emit_insn (gen_lshrdi3 (t8, t7, GEN_INT (32))); + + riscv_expand_conditional_branch (operands[3], NE, t8, const0_rtx); + } + else + { + rtx hp = gen_reg_rtx (mode); + + emit_insn (gen_umul3_highpart (hp, operands[1], operands[2])); + emit_insn (gen_mul3 (operands[0], operands[1], operands[2])); + + riscv_expand_conditional_branch (operands[3], NE, hp, const0_rtx); + } + + DONE; +}) + (define_insn "*mulsi3_extended" [(set (match_operand:DI 0 "register_operand" "=r") (sign_extend:DI -- cgit v1.1 From 17f2908fcf058e145cff275966e34f8c7f57c2c5 Mon Sep 17 00:00:00 2001 From: Geng Qi Date: Wed, 28 Apr 2021 16:29:33 +0800 Subject: RISC-V: For '-march' and '-mabi' options, add 'Negative' property mentioning itself. When using a multilib RISC-V toolchain, a bug is triggered when there are two '-march' options on the command line. riscv64-unknown-elf-gcc -march=rv32gcp -mabi=ilp32f -march=rv32gcpzp64 HelloWorld.c /lhome/gengq/riscv64-linux-ptest/lib/gcc/riscv64-unknown-elf/10.2.0/../../../../riscv64-unknown-elf/bin/ld: /lhome/gengq/riscv64-linux-ptest/lib/gcc/riscv64-unknown-elf/10.2.0/../../../../riscv64-unknown-elf/lib/crt0.o: ABI is incompatible with that of the selected emulation: target emulation `elf64-littleriscv' does not match `elf32-littleriscv' /lhome/gengq/riscv64-linux-ptest/lib/gcc/riscv64-unknown-elf/10.2.0/../../../../riscv64-unknown-elf/bin/ld: failed to merge target specific data of file /lhome/gengq/riscv64-linux-ptest/lib/gcc/riscv64-unknown-elf/10.2.0/../../../../riscv64-unknown-elf/lib/crt0.o /lhome/gengq/riscv64-linux-ptest/lib/gcc/riscv64-unknown-elf/10.2.0/../../../../riscv64-unknown-elf/bin/ld: /lhome/gengq/riscv64-linux-ptest/lib/gcc/riscv64-unknown-elf/10.2.0/crtbegin.o: ABI is incompatible with that of the selected emulation: target emulation `elf64-littleriscv' does not match `elf32-littleriscv' /lhome/gengq/riscv64-linux-ptest/lib/gcc/riscv64-unknown-elf/10.2.0/../../../../riscv64-unknown-elf/bin/ld: failed to merge target specific data of file /lhome/gengq/riscv64-linux-ptest/lib/gcc/riscv64-unknown-elf/10.2.0/crtbegin.o ...... This patch fixes it: the driver now prunes the extra '-march' and '-mabi' options and keeps only the last one. gcc/ChangeLog: * config/riscv/riscv.opt (march=,mabi=): Negative itself. --- gcc/config/riscv/riscv.opt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/riscv/riscv.opt b/gcc/config/riscv/riscv.opt index e294e22..5ff85c2 100644 --- a/gcc/config/riscv/riscv.opt +++ b/gcc/config/riscv/riscv.opt @@ -38,7 +38,7 @@ Target Var(TARGET_PLT) Init(1) When generating -fpic code, allow the use of PLTs. Ignored for fno-pic. mabi= -Target RejectNegative Joined Enum(abi_type) Var(riscv_abi) Init(ABI_ILP32) +Target RejectNegative Joined Enum(abi_type) Var(riscv_abi) Init(ABI_ILP32) Negative(mabi=) Specify integer and floating-point calling convention. mpreferred-stack-boundary= @@ -79,7 +79,7 @@ Target Mask(DIV) Use hardware instructions for integer division. march= -Target RejectNegative Joined +Target RejectNegative Joined Negative(march=) -march= Generate code for given RISC-V ISA (e.g. RV64IM). ISA strings must be lower-case.
-- cgit v1.1 From c111f6066043d3b7bc4141ca0411eae9294aa6c5 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Fri, 30 Apr 2021 10:15:26 +0200 Subject: i386: Introduce reversed ADC and SBB patterns [PR98060] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The compiler is able to merge LTU comparisons with PLUS or MINUS pattern to form addition with carry (ADC) and subtraction with borrow (SBB) instructions: op = op + carry [ADC $0, op] op = op - carry [SBB $0, op] The patch introduces reversed ADC and SBB insn patterns: op = op + !carry [SBB $-1, op] op = op - !carry [ADC $-1, op] allowing the compiler to also merge GEU comparisons. 2021-04-30 Uroš Bizjak gcc/ PR target/98060 * config/i386/i386.md (*add3_carry_0r): New insn pattern. (*addsi3_carry_zext_0r): Ditto. (*sub3_carry_0): Ditto. (*subsi3_carry_zext_0r): Ditto. * config/i386/predicates.md (ix86_carry_flag_unset_operator): New predicate. * config/i386/i386.c (ix86_rtx_costs) : Also consider ix86_carry_flag_unset_operator to calculate the cost of adc/sbb insn. gcc/testsuite/ PR target/98060 * gcc.target/i386/pr98060.c: New test. --- gcc/config/i386/i386.c | 10 +++++-- gcc/config/i386/i386.md | 65 ++++++++++++++++++++++++++++++++++++++++--- gcc/config/i386/predicates.md | 16 +++++++++++ 3 files changed, 84 insertions(+), 7 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 48079c8..780da10 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -20057,13 +20057,16 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, } else if (GET_CODE (XEXP (x, 0)) == PLUS) { + rtx op = XEXP (XEXP (x, 0), 0); + /* Add with carry, ignore the cost of adding a carry flag. */ - if (ix86_carry_flag_operator (XEXP (XEXP (x, 0), 0), mode)) + if (ix86_carry_flag_operator (op, mode) + || ix86_carry_flag_unset_operator (op, mode)) *total = cost->add; else { *total = cost->lea; - *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode, + *total += rtx_cost (op, mode, outer_code, opno, speed); } @@ -20081,7 +20084,8 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, if (GET_MODE_CLASS (mode) == MODE_INT && GET_MODE_SIZE (mode) <= UNITS_PER_WORD && GET_CODE (XEXP (x, 0)) == MINUS - && ix86_carry_flag_operator (XEXP (XEXP (x, 0), 1), mode)) + && (ix86_carry_flag_operator (XEXP (XEXP (x, 0), 1), mode) + || ix86_carry_flag_unset_operator (XEXP (XEXP (x, 0), 1), mode))) { *total = cost->add; *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode, diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 70ff29b..f79fd12 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -6773,8 +6773,8 @@ (define_insn "*add3_carry_0" [(set (match_operand:SWI 0 "nonimmediate_operand" "=m") (plus:SWI - (match_operator:SWI 3 "ix86_carry_flag_operator" - [(match_operand 2 "flags_reg_operand") (const_int 0)]) + (match_operator:SWI 2 "ix86_carry_flag_operator" + [(reg FLAGS_REG) (const_int 0)]) (match_operand:SWI 1 "nonimmediate_operand" "0"))) (clobber (reg:CC FLAGS_REG))] "!MEM_P (operands[0]) || rtx_equal_p (operands[0], operands[1])" @@ -6784,6 +6784,20 @@ (set_attr "pent_pair" "pu") (set_attr "mode" "")]) +(define_insn "*add3_carry_0r" + [(set (match_operand:SWI 0 "nonimmediate_operand" "=m") + (plus:SWI + (match_operator:SWI 2 "ix86_carry_flag_unset_operator" + [(reg FLAGS_REG) (const_int 0)]) + (match_operand:SWI 1 "nonimmediate_operand" "0"))) + (clobber (reg:CC FLAGS_REG))] + "!MEM_P (operands[0]) || rtx_equal_p 
(operands[0], operands[1])" + "sbb{}\t{$-1, %0|%0, -1}" + [(set_attr "type" "alu") + (set_attr "use_carry" "1") + (set_attr "pent_pair" "pu") + (set_attr "mode" "")]) + (define_insn "*addsi3_carry_zext" [(set (match_operand:DI 0 "register_operand" "=r") (zero_extend:DI @@ -6814,6 +6828,20 @@ (set_attr "pent_pair" "pu") (set_attr "mode" "SI")]) +(define_insn "*addsi3_carry_zext_0r" + [(set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI + (plus:SI (match_operator:SI 2 "ix86_carry_flag_unset_operator" + [(reg FLAGS_REG) (const_int 0)]) + (match_operand:SI 1 "register_operand" "0")))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT" + "sbb{l}\t{$-1, %k0|%k0, -1}" + [(set_attr "type" "alu") + (set_attr "use_carry" "1") + (set_attr "pent_pair" "pu") + (set_attr "mode" "SI")]) + ;; There is no point to generate ADCX instruction. ADC is shorter and faster. (define_insn "addcarry" @@ -6916,8 +6944,8 @@ [(set (match_operand:SWI 0 "nonimmediate_operand" "=m") (minus:SWI (match_operand:SWI 1 "nonimmediate_operand" "0") - (match_operator:SWI 3 "ix86_carry_flag_operator" - [(match_operand 2 "flags_reg_operand") (const_int 0)]))) + (match_operator:SWI 2 "ix86_carry_flag_operator" + [(reg FLAGS_REG) (const_int 0)]))) (clobber (reg:CC FLAGS_REG))] "!MEM_P (operands[0]) || rtx_equal_p (operands[0], operands[1])" "sbb{}\t{$0, %0|%0, 0}" @@ -6926,6 +6954,20 @@ (set_attr "pent_pair" "pu") (set_attr "mode" "")]) +(define_insn "*sub3_carry_0r" + [(set (match_operand:SWI 0 "nonimmediate_operand" "=m") + (minus:SWI + (match_operand:SWI 1 "nonimmediate_operand" "0") + (match_operator:SWI 2 "ix86_carry_flag_unset_operator" + [(reg FLAGS_REG) (const_int 0)]))) + (clobber (reg:CC FLAGS_REG))] + "!MEM_P (operands[0]) || rtx_equal_p (operands[0], operands[1])" + "adc{}\t{$-1, %0|%0, -1}" + [(set_attr "type" "alu") + (set_attr "use_carry" "1") + (set_attr "pent_pair" "pu") + (set_attr "mode" "")]) + (define_insn "*subsi3_carry_zext" [(set (match_operand:DI 0 "register_operand" "=r") (zero_extend:DI @@ -6958,6 +7000,21 @@ (set_attr "pent_pair" "pu") (set_attr "mode" "SI")]) +(define_insn "*subsi3_carry_zext_0r" + [(set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI + (minus:SI + (match_operand:SI 1 "register_operand" "0") + (match_operator:SI 2 "ix86_carry_flag_unset_operator" + [(reg FLAGS_REG) (const_int 0)])))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT" + "adc{l}\t{$-1, %k0|%k0, -1}" + [(set_attr "type" "alu") + (set_attr "use_carry" "1") + (set_attr "pent_pair" "pu") + (set_attr "mode" "SI")]) + (define_insn "@sub3_carry_ccc" [(set (reg:CCC FLAGS_REG) (compare:CCC diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index 04a03a7..6dfbb08 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1455,6 +1455,22 @@ return code == LTU; }) +;; Return true if OP is a valid comparison operator +;; testing carry flag to be unset. +(define_predicate "ix86_carry_flag_unset_operator" + (match_code "geu,ge") +{ + machine_mode inmode = GET_MODE (XEXP (op, 0)); + enum rtx_code code = GET_CODE (op); + + if (inmode == CCFPmode) + code = ix86_fp_compare_code_to_integer (code); + else if (inmode != CCmode && inmode != CCCmode && inmode != CCGZmode) + return false; + + return code == GEU; +}) + ;; Return true if this comparison only requires testing one flag bit. 
(define_predicate "ix86_trivial_fp_comparison_operator" (match_code "gt,ge,unlt,unle,uneq,ltgt,ordered,unordered")) -- cgit v1.1 From d9398dd2902a0142fa3b493cf61a340f1f5ab46f Mon Sep 17 00:00:00 2001 From: Michael Meissner Date: Fri, 30 Apr 2021 12:32:08 -0400 Subject: Define target hook to emit KFmode constants for libgcc. This patch defines a target hook so that the KFmode constants (__LIBGCC_KF_MAX__, __LIBGCC_KF_MIN__, and __LIBGCC_KF_EPSILON__) needed to build _divkc3.c in libgcc are defined. The need for these constants were added in the April 28th changes to libgcc that added complex division optimizations. We only define the KFmode constants if IEEE 128-bit floating point is supported, but long double does not use the IEEE 128-bit format. If long double uses the IEEE 128-bit format, it will use TFmode and not KFmode. gcc/ 2021-04-30 Michael Meissner PR bootstrap/100327 * config/rs6000/rs6000.c (TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P): Define. (rs6000_libgcc_floating_mode_supported_p): New target hook. --- gcc/config/rs6000/rs6000.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 14ff56a..7718176 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -1569,6 +1569,10 @@ static const struct attribute_spec rs6000_attribute_table[] = #undef TARGET_SCALAR_MODE_SUPPORTED_P #define TARGET_SCALAR_MODE_SUPPORTED_P rs6000_scalar_mode_supported_p +#undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P +#define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \ + rs6000_libgcc_floating_mode_supported_p + #undef TARGET_VECTOR_MODE_SUPPORTED_P #define TARGET_VECTOR_MODE_SUPPORTED_P rs6000_vector_mode_supported_p @@ -23826,6 +23830,31 @@ rs6000_scalar_mode_supported_p (scalar_mode mode) return default_scalar_mode_supported_p (mode); } +/* Target hook for libgcc_floating_mode_supported_p. */ + +static bool +rs6000_libgcc_floating_mode_supported_p (scalar_float_mode mode) +{ + switch (mode) + { + case E_SFmode: + case E_DFmode: + case E_TFmode: + return true; + + /* We only return true for KFmode if IEEE 128-bit types are supported, and + if long double does not use the IEEE 128-bit format. If long double + uses the IEEE 128-bit format, it will use TFmode and not KFmode. + Because the code will not use KFmode in that case, there will be aborts + because it can't find KFmode in the Floatn types. */ + case E_KFmode: + return TARGET_FLOAT128_TYPE && !TARGET_IEEEQUAD; + + default: + return false; + } +} + /* Target hook for vector_mode_supported_p. */ static bool rs6000_vector_mode_supported_p (machine_mode mode) -- cgit v1.1 From 5672fe9da4ab4e8787c288b64008251065c67c98 Mon Sep 17 00:00:00 2001 From: Senthil Kumar Selvaraj Date: Fri, 30 Apr 2021 16:34:11 +0000 Subject: AVR cc0 conversion - adjust peepholes This patch adjusts peepholes to match and generate parallels with a clobber of REG_CC. It also sets mov_insn as the name of the pattern for the split insn (rather than the define_insn_and_split), so that avr_2word_insn_p, which looks for CODE_FOR_mov_insn, works correctly. This is required for the *cpse.eq peephole to fire, and also helps generate better code for avr_out_sbxx_branch. gcc/ChangeLog: * config/avr/avr.md: Adjust peepholes to match and generate parallels with clobber of REG_CC. (mov_insn): Rename to mov_insn_split. (*mov_insn): Rename to mov_insn. 
--- gcc/config/avr/avr.md | 524 +++++++++++++++++++++++++++++--------------------- 1 file changed, 308 insertions(+), 216 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/avr/avr.md b/gcc/config/avr/avr.md index 2206fa1..a1a325b 100644 --- a/gcc/config/avr/avr.md +++ b/gcc/config/avr/avr.md @@ -724,9 +724,7 @@ ;; are call-saved registers, and most of LD_REGS are call-used registers, ;; so this may still be a win for registers live across function calls. -;; "movqi_insn" -;; "movqq_insn" "movuqq_insn" -(define_insn_and_split "mov_insn" +(define_insn_and_split "mov_insn_split" [(set (match_operand:ALL1 0 "nonimmediate_operand" "=r ,d ,Qm ,r ,q,r,*r") (match_operand:ALL1 1 "nox_general_operand" "r Y00,n Ynn,r Y00,Qm,r,q,i"))] "register_operand (operands[0], mode) @@ -737,7 +735,9 @@ (match_dup 1)) (clobber (reg:CC REG_CC))])]) -(define_insn "*mov_insn" +;; "movqi_insn" +;; "movqq_insn" "movuqq_insn" +(define_insn "mov_insn" [(set (match_operand:ALL1 0 "nonimmediate_operand" "=r ,d ,Qm ,r ,q,r,*r") (match_operand:ALL1 1 "nox_general_operand" "r Y00,n Ynn,r Y00,Qm,r,q,i")) (clobber (reg:CC REG_CC))] @@ -758,7 +758,8 @@ (define_insn "*reload_in" [(set (match_operand:ALL1 0 "register_operand" "=l") (match_operand:ALL1 1 "const_operand" "i")) - (clobber (match_operand:QI 2 "register_operand" "=&d"))] + (clobber (match_operand:QI 2 "register_operand" "=&d")) + (clobber (reg:CC REG_CC))] "reload_completed" "ldi %2,lo8(%1) mov %0,%2" @@ -766,15 +767,17 @@ (define_peephole2 [(match_scratch:QI 2 "d") - (set (match_operand:ALL1 0 "l_register_operand" "") - (match_operand:ALL1 1 "const_operand" ""))] + (parallel [(set (match_operand:ALL1 0 "l_register_operand" "") + (match_operand:ALL1 1 "const_operand" "")) + (clobber (reg:CC REG_CC))])] ; No need for a clobber reg for 0x0, 0x01 or 0xff "!satisfies_constraint_Y00 (operands[1]) && !satisfies_constraint_Y01 (operands[1]) && !satisfies_constraint_Ym1 (operands[1])" [(parallel [(set (match_dup 0) (match_dup 1)) - (clobber (match_dup 2))])]) + (clobber (match_dup 2)) + (clobber (reg:CC REG_CC))])]) ;;============================================================================ ;; move word (16 bit) @@ -804,12 +807,14 @@ (define_peephole2 [(match_scratch:QI 2 "d") - (set (match_operand:ALL2 0 "l_register_operand" "") - (match_operand:ALL2 1 "const_or_immediate_operand" ""))] + (parallel [(set (match_operand:ALL2 0 "l_register_operand" "") + (match_operand:ALL2 1 "const_or_immediate_operand" "")) + (clobber (reg:CC REG_CC))])] "operands[1] != CONST0_RTX (mode)" [(parallel [(set (match_dup 0) (match_dup 1)) - (clobber (match_dup 2))])]) + (clobber (match_dup 2)) + (clobber (reg:CC REG_CC))])]) ;; '*' because it is not used in rtl generation, only in above peephole ;; "*reload_inhi" @@ -855,30 +860,36 @@ (set_attr "adjust_len" "mov16")]) (define_peephole2 ; movw - [(set (match_operand:ALL1 0 "even_register_operand" "") - (match_operand:ALL1 1 "even_register_operand" "")) - (set (match_operand:ALL1 2 "odd_register_operand" "") - (match_operand:ALL1 3 "odd_register_operand" ""))] + [(parallel [(set (match_operand:ALL1 0 "even_register_operand" "") + (match_operand:ALL1 1 "even_register_operand" "")) + (clobber (reg:CC REG_CC))]) + (parallel [(set (match_operand:ALL1 2 "odd_register_operand" "") + (match_operand:ALL1 3 "odd_register_operand" "")) + (clobber (reg:CC REG_CC))])] "AVR_HAVE_MOVW && REGNO (operands[0]) == REGNO (operands[2]) - 1 && REGNO (operands[1]) == REGNO (operands[3]) - 1" - [(set (match_dup 4) - (match_dup 5))] + [(parallel [(set 
(match_dup 4) + (match_dup 5)) + (clobber (reg:CC REG_CC))])] { operands[4] = gen_rtx_REG (HImode, REGNO (operands[0])); operands[5] = gen_rtx_REG (HImode, REGNO (operands[1])); }) (define_peephole2 ; movw_r - [(set (match_operand:ALL1 0 "odd_register_operand" "") - (match_operand:ALL1 1 "odd_register_operand" "")) - (set (match_operand:ALL1 2 "even_register_operand" "") - (match_operand:ALL1 3 "even_register_operand" ""))] + [(parallel [(set (match_operand:ALL1 0 "odd_register_operand" "") + (match_operand:ALL1 1 "odd_register_operand" "")) + (clobber (reg:CC REG_CC))]) + (parallel [(set (match_operand:ALL1 2 "even_register_operand" "") + (match_operand:ALL1 3 "even_register_operand" "")) + (clobber (reg:CC REG_CC))])] "AVR_HAVE_MOVW && REGNO (operands[2]) == REGNO (operands[0]) - 1 && REGNO (operands[3]) == REGNO (operands[1]) - 1" - [(set (match_dup 4) - (match_dup 5))] + [(parallel [(set (match_dup 4) + (match_dup 5)) + (clobber (reg:CC REG_CC))])] { operands[4] = gen_rtx_REG (HImode, REGNO (operands[2])); operands[5] = gen_rtx_REG (HImode, REGNO (operands[3])); @@ -919,8 +930,9 @@ (define_peephole2 ; *reload_inpsi [(match_scratch:QI 2 "d") - (set (match_operand:PSI 0 "l_register_operand" "") - (match_operand:PSI 1 "immediate_operand" "")) + (parallel [(set (match_operand:PSI 0 "l_register_operand" "") + (match_operand:PSI 1 "immediate_operand" "")) + (clobber (reg:CC REG_CC))]) (match_dup 2)] "operands[1] != const0_rtx && operands[1] != constm1_rtx" @@ -973,8 +985,9 @@ (define_peephole2 ; *reload_insi [(match_scratch:QI 2 "d") - (set (match_operand:ALL4 0 "l_register_operand" "") - (match_operand:ALL4 1 "immediate_operand" "")) + (parallel [(set (match_operand:ALL4 0 "l_register_operand" "") + (match_operand:ALL4 1 "immediate_operand" "")) + (clobber (reg:CC REG_CC))]) (match_dup 2)] "operands[1] != CONST0_RTX (mode)" [(parallel [(set (match_dup 0) @@ -1055,8 +1068,9 @@ (define_peephole2 ; *reload_insf [(match_scratch:QI 2 "d") - (set (match_operand:SF 0 "l_register_operand" "") - (match_operand:SF 1 "const_double_operand" "")) + (parallel [(set (match_operand:SF 0 "l_register_operand" "") + (match_operand:SF 1 "const_double_operand" "")) + (clobber (reg:CC REG_CC))]) (match_dup 2)] "operands[1] != CONST0_RTX (SFmode)" [(parallel [(set (match_dup 0) @@ -1603,16 +1617,19 @@ ;; itself because that insn is special to reload. 
(define_peephole2 ; addhi3_clobber - [(set (match_operand:ALL2 0 "d_register_operand" "") - (match_operand:ALL2 1 "const_operand" "")) - (set (match_operand:ALL2 2 "l_register_operand" "") - (plus:ALL2 (match_dup 2) - (match_dup 0)))] + [(parallel [(set (match_operand:ALL2 0 "d_register_operand" "") + (match_operand:ALL2 1 "const_operand" "")) + (clobber (reg:CC REG_CC))]) + (parallel [(set (match_operand:ALL2 2 "l_register_operand" "") + (plus:ALL2 (match_dup 2) + (match_dup 0))) + (clobber (reg:CC REG_CC))])] "peep2_reg_dead_p (2, operands[0])" [(parallel [(set (match_dup 2) (plus:ALL2 (match_dup 2) (match_dup 1))) - (clobber (match_dup 3))])] + (clobber (match_dup 3)) + (clobber (reg:CC REG_CC))])] { operands[3] = simplify_gen_subreg (QImode, operands[0], mode, 0); }) @@ -1623,15 +1640,18 @@ (define_peephole2 ; addhi3_clobber [(parallel [(set (match_operand:ALL2 0 "l_register_operand" "") (match_operand:ALL2 1 "const_operand" "")) - (clobber (match_operand:QI 2 "d_register_operand" ""))]) - (set (match_operand:ALL2 3 "l_register_operand" "") - (plus:ALL2 (match_dup 3) - (match_dup 0)))] + (clobber (match_operand:QI 2 "d_register_operand" "")) + (clobber (reg:CC REG_CC))]) + (parallel [(set (match_operand:ALL2 3 "l_register_operand" "") + (plus:ALL2 (match_dup 3) + (match_dup 0))) + (clobber (reg:CC REG_CC))])] "peep2_reg_dead_p (2, operands[0])" [(parallel [(set (match_dup 3) (plus:ALL2 (match_dup 3) (match_dup 1))) - (clobber (match_dup 2))])]) + (clobber (match_dup 2)) + (clobber (reg:CC REG_CC))])]) ;; "addhi3_clobber" ;; "addhq3_clobber" "adduhq3_clobber" @@ -4387,14 +4407,17 @@ (set_attr "adjust_len" "*,out_bitop,out_bitop,out_bitop")]) (define_peephole2 ; andi - [(set (match_operand:QI 0 "d_register_operand" "") - (and:QI (match_dup 0) - (match_operand:QI 1 "const_int_operand" ""))) - (set (match_dup 0) - (and:QI (match_dup 0) - (match_operand:QI 2 "const_int_operand" "")))] + [(parallel [(set (match_operand:QI 0 "d_register_operand" "") + (and:QI (match_dup 0) + (match_operand:QI 1 "const_int_operand" ""))) + (clobber (reg:CC REG_CC))]) + (parallel [(set (match_dup 0) + (and:QI (match_dup 0) + (match_operand:QI 2 "const_int_operand" ""))) + (clobber (reg:CC REG_CC))])] "" - [(set (match_dup 0) (and:QI (match_dup 0) (match_dup 1)))] + [(parallel [(set (match_dup 0) (and:QI (match_dup 0) (match_dup 1))) + (clobber (reg:CC REG_CC))])] { operands[1] = GEN_INT (INTVAL (operands[1]) & INTVAL (operands[2])); }) @@ -5140,13 +5163,15 @@ ;; No need to compute it, map to 8-bit shift. (define_peephole2 - [(set (match_operand:HI 0 "register_operand" "") - (ashift:HI (match_dup 0) - (match_operand:QI 1 "register_operand" "")))] + [(parallel [(set (match_operand:HI 0 "register_operand" "") + (ashift:HI (match_dup 0) + (match_operand:QI 1 "register_operand" ""))) + (clobber (reg:CC REG_CC))])] "" - [(set (match_dup 2) - (ashift:QI (match_dup 2) - (match_dup 1))) + [(parallel [(set (match_dup 2) + (ashift:QI (match_dup 2) + (match_dup 1))) + (clobber (reg:CC REG_CC))]) (clobber (match_dup 3))] { operands[3] = simplify_gen_subreg (QImode, operands[0], HImode, 1); @@ -5188,56 +5213,72 @@ ;; Optimize if a scratch register from LD_REGS happens to be available. 
(define_peephole2 ; ashlqi3_l_const4 - [(set (match_operand:ALL1 0 "l_register_operand" "") - (ashift:ALL1 (match_dup 0) - (const_int 4))) + [(parallel [(set (match_operand:ALL1 0 "l_register_operand" "") + (ashift:ALL1 (match_dup 0) + (const_int 4))) + (clobber (reg:CC REG_CC))]) (match_scratch:QI 1 "d")] "" - [(set (match_dup 2) (rotate:QI (match_dup 2) (const_int 4))) - (set (match_dup 1) (const_int -16)) - (set (match_dup 2) (and:QI (match_dup 2) (match_dup 1)))] + [(parallel [(set (match_dup 2) (rotate:QI (match_dup 2) (const_int 4))) + (clobber (reg:CC REG_CC))]) + (parallel [(set (match_dup 1) (const_int -16)) + (clobber (reg:CC REG_CC))]) + (parallel [(set (match_dup 2) (and:QI (match_dup 2) (match_dup 1))) + (clobber (reg:CC REG_CC))])] { operands[2] = avr_to_int_mode (operands[0]); }) (define_peephole2 ; ashlqi3_l_const5 - [(set (match_operand:ALL1 0 "l_register_operand" "") - (ashift:ALL1 (match_dup 0) - (const_int 5))) + [(parallel [(set (match_operand:ALL1 0 "l_register_operand" "") + (ashift:ALL1 (match_dup 0) + (const_int 5))) + (clobber (reg:CC REG_CC))]) (match_scratch:QI 1 "d")] "" - [(set (match_dup 2) (rotate:QI (match_dup 2) (const_int 4))) - (set (match_dup 2) (ashift:QI (match_dup 2) (const_int 1))) - (set (match_dup 1) (const_int -32)) - (set (match_dup 2) (and:QI (match_dup 2) (match_dup 1)))] + [(parallel [(set (match_dup 2) (rotate:QI (match_dup 2) (const_int 4))) + (clobber (reg:CC REG_CC))]) + (parallel [(set (match_dup 2) (ashift:QI (match_dup 2) (const_int 1))) + (clobber (reg:CC REG_CC))]) + (parallel [(set (match_dup 1) (const_int -32)) + (clobber (reg:CC REG_CC))]) + (parallel [(set (match_dup 2) (and:QI (match_dup 2) (match_dup 1))) + (clobber (reg:CC REG_CC))])] { operands[2] = avr_to_int_mode (operands[0]); }) (define_peephole2 ; ashlqi3_l_const6 - [(set (match_operand:ALL1 0 "l_register_operand" "") - (ashift:ALL1 (match_dup 0) - (const_int 6))) + [(parallel [(set (match_operand:ALL1 0 "l_register_operand" "") + (ashift:ALL1 (match_dup 0) + (const_int 6))) + (clobber (reg:CC REG_CC))]) (match_scratch:QI 1 "d")] "" - [(set (match_dup 2) (rotate:QI (match_dup 2) (const_int 4))) - (set (match_dup 2) (ashift:QI (match_dup 2) (const_int 2))) - (set (match_dup 1) (const_int -64)) - (set (match_dup 2) (and:QI (match_dup 2) (match_dup 1)))] + [(parallel [(set (match_dup 2) (rotate:QI (match_dup 2) (const_int 4))) + (clobber (reg:CC REG_CC))]) + (parallel [(set (match_dup 2) (ashift:QI (match_dup 2) (const_int 2))) + (clobber (reg:CC REG_CC))]) + (parallel [(set (match_dup 1) (const_int -64)) + (clobber (reg:CC REG_CC))]) + (parallel [(set (match_dup 2) (and:QI (match_dup 2) (match_dup 1))) + (clobber (reg:CC REG_CC))])] { operands[2] = avr_to_int_mode (operands[0]); }) (define_peephole2 [(match_scratch:QI 3 "d") - (set (match_operand:ALL2 0 "register_operand" "") - (ashift:ALL2 (match_operand:ALL2 1 "register_operand" "") - (match_operand:QI 2 "const_int_operand" "")))] + (parallel [(set (match_operand:ALL2 0 "register_operand" "") + (ashift:ALL2 (match_operand:ALL2 1 "register_operand" "") + (match_operand:QI 2 "const_int_operand" ""))) + (clobber (reg:CC REG_CC))])] "" [(parallel [(set (match_dup 0) (ashift:ALL2 (match_dup 1) (match_dup 2))) - (clobber (match_dup 3))])]) + (clobber (match_dup 3)) + (clobber (reg:CC REG_CC))])]) ;; "*ashlhi3_const" ;; "*ashlhq3_const" "*ashluhq3_const" @@ -5271,14 +5312,16 @@ (define_peephole2 [(match_scratch:QI 3 "d") - (set (match_operand:ALL4 0 "register_operand" "") - (ashift:ALL4 (match_operand:ALL4 1 "register_operand" 
"") - (match_operand:QI 2 "const_int_operand" "")))] + (parallel [(set (match_operand:ALL4 0 "register_operand" "") + (ashift:ALL4 (match_operand:ALL4 1 "register_operand" "") + (match_operand:QI 2 "const_int_operand" ""))) + (clobber (reg:CC REG_CC))])] "" [(parallel [(set (match_dup 0) (ashift:ALL4 (match_dup 1) (match_dup 2))) - (clobber (match_dup 3))])]) + (clobber (match_dup 3)) + (clobber (reg:CC REG_CC))])]) ;; "*ashlsi3_const" ;; "*ashlsq3_const" "*ashlusq3_const" @@ -5476,14 +5519,16 @@ (define_peephole2 [(match_scratch:QI 3 "d") - (set (match_operand:ALL2 0 "register_operand" "") - (ashiftrt:ALL2 (match_operand:ALL2 1 "register_operand" "") - (match_operand:QI 2 "const_int_operand" "")))] + (parallel [(set (match_operand:ALL2 0 "register_operand" "") + (ashiftrt:ALL2 (match_operand:ALL2 1 "register_operand" "") + (match_operand:QI 2 "const_int_operand" ""))) + (clobber (reg:CC REG_CC))])] "" [(parallel [(set (match_dup 0) (ashiftrt:ALL2 (match_dup 1) (match_dup 2))) - (clobber (match_dup 3))])]) + (clobber (match_dup 3)) + (clobber (reg:CC REG_CC))])]) ;; "*ashrhi3_const" ;; "*ashrhq3_const" "*ashruhq3_const" @@ -5517,14 +5562,16 @@ (define_peephole2 [(match_scratch:QI 3 "d") - (set (match_operand:ALL4 0 "register_operand" "") - (ashiftrt:ALL4 (match_operand:ALL4 1 "register_operand" "") - (match_operand:QI 2 "const_int_operand" "")))] + (parallel [(set (match_operand:ALL4 0 "register_operand" "") + (ashiftrt:ALL4 (match_operand:ALL4 1 "register_operand" "") + (match_operand:QI 2 "const_int_operand" ""))) + (clobber (reg:CC REG_CC))])] "" [(parallel [(set (match_dup 0) (ashiftrt:ALL4 (match_dup 1) (match_dup 2))) - (clobber (match_dup 3))])]) + (clobber (match_dup 3)) + (clobber (reg:CC REG_CC))])]) ;; "*ashrsi3_const" ;; "*ashrsq3_const" "*ashrusq3_const" @@ -5715,56 +5762,72 @@ ;; Optimize if a scratch register from LD_REGS happens to be available. 
(define_peephole2 ; lshrqi3_l_const4 - [(set (match_operand:ALL1 0 "l_register_operand" "") - (lshiftrt:ALL1 (match_dup 0) - (const_int 4))) + [(parallel [(set (match_operand:ALL1 0 "l_register_operand" "") + (lshiftrt:ALL1 (match_dup 0) + (const_int 4))) + (clobber (reg:CC REG_CC))]) (match_scratch:QI 1 "d")] "" - [(set (match_dup 2) (rotate:QI (match_dup 2) (const_int 4))) - (set (match_dup 1) (const_int 15)) - (set (match_dup 2) (and:QI (match_dup 2) (match_dup 1)))] + [(parallel [(set (match_dup 2) (rotate:QI (match_dup 2) (const_int 4))) + (clobber (reg:CC REG_CC))]) + (parallel [(set (match_dup 1) (const_int 15)) + (clobber (reg:CC REG_CC))]) + (parallel [(set (match_dup 2) (and:QI (match_dup 2) (match_dup 1))) + (clobber (reg:CC REG_CC))])] { operands[2] = avr_to_int_mode (operands[0]); }) (define_peephole2 ; lshrqi3_l_const5 - [(set (match_operand:ALL1 0 "l_register_operand" "") - (lshiftrt:ALL1 (match_dup 0) - (const_int 5))) + [(parallel [(set (match_operand:ALL1 0 "l_register_operand" "") + (lshiftrt:ALL1 (match_dup 0) + (const_int 5))) + (clobber (reg:CC REG_CC))]) (match_scratch:QI 1 "d")] "" - [(set (match_dup 2) (rotate:QI (match_dup 2) (const_int 4))) - (set (match_dup 2) (lshiftrt:QI (match_dup 2) (const_int 1))) - (set (match_dup 1) (const_int 7)) - (set (match_dup 2) (and:QI (match_dup 2) (match_dup 1)))] + [(parallel [(set (match_dup 2) (rotate:QI (match_dup 2) (const_int 4))) + (clobber (reg:CC REG_CC))]) + (parallel [(set (match_dup 2) (lshiftrt:QI (match_dup 2) (const_int 1))) + (clobber (reg:CC REG_CC))]) + (parallel [(set (match_dup 1) (const_int 7)) + (clobber (reg:CC REG_CC))]) + (parallel [(set (match_dup 2) (and:QI (match_dup 2) (match_dup 1))) + (clobber (reg:CC REG_CC))])] { operands[2] = avr_to_int_mode (operands[0]); }) (define_peephole2 ; lshrqi3_l_const6 - [(set (match_operand:ALL1 0 "l_register_operand" "") - (lshiftrt:ALL1 (match_dup 0) - (const_int 6))) + [(parallel [(set (match_operand:ALL1 0 "l_register_operand" "") + (lshiftrt:ALL1 (match_dup 0) + (const_int 6))) + (clobber (reg:CC REG_CC))]) (match_scratch:QI 1 "d")] "" - [(set (match_dup 2) (rotate:QI (match_dup 2) (const_int 4))) - (set (match_dup 2) (lshiftrt:QI (match_dup 2) (const_int 2))) - (set (match_dup 1) (const_int 3)) - (set (match_dup 2) (and:QI (match_dup 2) (match_dup 1)))] + [(parallel [(set (match_dup 2) (rotate:QI (match_dup 2) (const_int 4))) + (clobber (reg:CC REG_CC))]) + (parallel [(set (match_dup 2) (lshiftrt:QI (match_dup 2) (const_int 2))) + (clobber (reg:CC REG_CC))]) + (parallel [(set (match_dup 1) (const_int 3)) + (clobber (reg:CC REG_CC))]) + (parallel [(set (match_dup 2) (and:QI (match_dup 2) (match_dup 1))) + (clobber (reg:CC REG_CC))])] { operands[2] = avr_to_int_mode (operands[0]); }) (define_peephole2 [(match_scratch:QI 3 "d") - (set (match_operand:ALL2 0 "register_operand" "") - (lshiftrt:ALL2 (match_operand:ALL2 1 "register_operand" "") - (match_operand:QI 2 "const_int_operand" "")))] + (parallel [(set (match_operand:ALL2 0 "register_operand" "") + (lshiftrt:ALL2 (match_operand:ALL2 1 "register_operand" "") + (match_operand:QI 2 "const_int_operand" ""))) + (clobber (reg:CC REG_CC))])] "" [(parallel [(set (match_dup 0) (lshiftrt:ALL2 (match_dup 1) (match_dup 2))) - (clobber (match_dup 3))])]) + (clobber (match_dup 3)) + (clobber (reg:CC REG_CC))])]) ;; "*lshrhi3_const" ;; "*lshrhq3_const" "*lshruhq3_const" @@ -5798,14 +5861,16 @@ (define_peephole2 [(match_scratch:QI 3 "d") - (set (match_operand:ALL4 0 "register_operand" "") - (lshiftrt:ALL4 (match_operand:ALL4 1 
"register_operand" "") - (match_operand:QI 2 "const_int_operand" "")))] + (parallel [(set (match_operand:ALL4 0 "register_operand" "") + (lshiftrt:ALL4 (match_operand:ALL4 1 "register_operand" "") + (match_operand:QI 2 "const_int_operand" ""))) + (clobber (reg:CC REG_CC))])] "" [(parallel [(set (match_dup 0) (lshiftrt:ALL4 (match_dup 1) (match_dup 2))) - (clobber (match_dup 3))])]) + (clobber (match_dup 3)) + (clobber (reg:CC REG_CC))])]) ;; "*lshrsi3_const" ;; "*lshrsq3_const" "*lshrusq3_const" @@ -6817,83 +6882,95 @@ (define_peephole2 [(set (reg:CC REG_CC) (compare:CC (match_operand:QI 0 "register_operand" "") (const_int 0))) - (set (pc) (if_then_else (ge (reg:CC REG_CC) (const_int 0)) - (label_ref (match_operand 1 "" "")) - (pc)))] + (parallel [(set (pc) (if_then_else (ge (reg:CC REG_CC) (const_int 0)) + (label_ref (match_operand 1 "" "")) + (pc))) + (clobber (reg:CC REG_CC))])] "" - [(set (pc) (if_then_else (eq (zero_extract:HI (match_dup 0) - (const_int 1) - (const_int 7)) - (const_int 0)) - (label_ref (match_dup 1)) - (pc)))]) + [(parallel [(set (pc) (if_then_else (eq (zero_extract:HI (match_dup 0) + (const_int 1) + (const_int 7)) + (const_int 0)) + (label_ref (match_dup 1)) + (pc))) + (clobber (reg:CC REG_CC))])]) (define_peephole2 [(set (reg:CC REG_CC) (compare:CC (match_operand:QI 0 "register_operand" "") (const_int 0))) - (set (pc) (if_then_else (lt (reg:CC REG_CC) (const_int 0)) - (label_ref (match_operand 1 "" "")) - (pc)))] + (parallel [(set (pc) (if_then_else (lt (reg:CC REG_CC) (const_int 0)) + (label_ref (match_operand 1 "" "")) + (pc))) + (clobber (reg:CC REG_CC))])] "" - [(set (pc) (if_then_else (ne (zero_extract:HI (match_dup 0) - (const_int 1) - (const_int 7)) - (const_int 0)) - (label_ref (match_dup 1)) - (pc)))]) + [(parallel [(set (pc) (if_then_else (ne (zero_extract:HI (match_dup 0) + (const_int 1) + (const_int 7)) + (const_int 0)) + (label_ref (match_dup 1)) + (pc))) + (clobber (reg:CC REG_CC))])]) (define_peephole2 [(parallel [(set (reg:CC REG_CC) (compare:CC (match_operand:HI 0 "register_operand" "") (const_int 0))) (clobber (match_operand:HI 2 ""))]) - (set (pc) (if_then_else (ge (reg:CC REG_CC) (const_int 0)) - (label_ref (match_operand 1 "" "")) - (pc)))] + (parallel [(set (pc) (if_then_else (ge (reg:CC REG_CC) (const_int 0)) + (label_ref (match_operand 1 "" "")) + (pc))) + (clobber (reg:CC REG_CC))])] "" - [(set (pc) (if_then_else (eq (and:HI (match_dup 0) (const_int -32768)) - (const_int 0)) - (label_ref (match_dup 1)) - (pc)))]) + [(parallel [(set (pc) (if_then_else (eq (and:HI (match_dup 0) (const_int -32768)) + (const_int 0)) + (label_ref (match_dup 1)) + (pc))) + (clobber (reg:CC REG_CC))])]) (define_peephole2 [(parallel [(set (reg:CC REG_CC) (compare:CC (match_operand:HI 0 "register_operand" "") (const_int 0))) (clobber (match_operand:HI 2 ""))]) - (set (pc) (if_then_else (lt (reg:CC REG_CC) (const_int 0)) - (label_ref (match_operand 1 "" "")) - (pc)))] + (parallel [(set (pc) (if_then_else (lt (reg:CC REG_CC) (const_int 0)) + (label_ref (match_operand 1 "" "")) + (pc))) + (clobber (reg:CC REG_CC))])] "" - [(set (pc) (if_then_else (ne (and:HI (match_dup 0) (const_int -32768)) - (const_int 0)) - (label_ref (match_dup 1)) - (pc)))]) + [(parallel [(set (pc) (if_then_else (ne (and:HI (match_dup 0) (const_int -32768)) + (const_int 0)) + (label_ref (match_dup 1)) + (pc))) + (clobber (reg:CC REG_CC))])]) (define_peephole2 [(parallel [(set (reg:CC REG_CC) (compare:CC (match_operand:SI 0 "register_operand" "") (const_int 0))) (clobber (match_operand:SI 2 ""))]) - 
(set (pc) (if_then_else (ge (reg:CC REG_CC) (const_int 0)) - (label_ref (match_operand 1 "" "")) - (pc)))] - "" - [(set (pc) (if_then_else (eq (and:SI (match_dup 0) (match_dup 2)) - (const_int 0)) - (label_ref (match_dup 1)) - (pc)))] + (parallel [(set (pc) (if_then_else (ge (reg:CC REG_CC) (const_int 0)) + (label_ref (match_operand 1 "" "")) + (pc))) + (clobber (reg:CC REG_CC))])] + "" + [(parallel [(set (pc) (if_then_else (eq (and:SI (match_dup 0) (match_dup 2)) + (const_int 0)) + (label_ref (match_dup 1)) + (pc))) + (clobber (reg:CC REG_CC))])] "operands[2] = gen_int_mode (-2147483647 - 1, SImode);") (define_peephole2 [(parallel [(set (reg:CC REG_CC) (compare:CC (match_operand:SI 0 "register_operand" "") (const_int 0))) (clobber (match_operand:SI 2 ""))]) - (set (pc) (if_then_else (lt (reg:CC REG_CC) (const_int 0)) - (label_ref (match_operand 1 "" "")) - (pc)))] - "" - [(set (pc) (if_then_else (ne (and:SI (match_dup 0) (match_dup 2)) - (const_int 0)) - (label_ref (match_dup 1)) - (pc)))] + (parallel [(set (pc) (if_then_else (lt (reg:CC REG_CC) (const_int 0)) + (label_ref (match_operand 1 "" "")) + (pc))) + (clobber (reg:CC REG_CC))])] + "" + [(parallel [(set (pc) (if_then_else (ne (and:SI (match_dup 0) (match_dup 2)) + (const_int 0)) + (label_ref (match_dup 1)) + (pc))) + (clobber (reg:CC REG_CC))])] "operands[2] = gen_int_mode (-2147483647 - 1, SImode);") ;; ************************************************************************ @@ -7575,16 +7652,18 @@ [(parallel [(set (match_operand:SI 0 "d_register_operand" "") (plus:SI (match_dup 0) (const_int -1))) - (clobber (scratch:QI))]) + (clobber (scratch:QI)) + (clobber (reg:CC REG_CC))]) (parallel [(set (reg:CC REG_CC) (compare:CC (match_dup 0) (const_int -1))) (clobber (match_operand:QI 1 "d_register_operand" ""))]) - (set (pc) - (if_then_else (eqne (reg:CC REG_CC) - (const_int 0)) - (label_ref (match_operand 2 "" "")) - (pc)))] + (parallel [(set (pc) + (if_then_else (eqne (reg:CC REG_CC) + (const_int 0)) + (label_ref (match_operand 2 "" "")) + (pc))) + (clobber (reg:CC REG_CC))])] "" { const char *op; @@ -7616,18 +7695,20 @@ }) (define_peephole ; "*dec-and-branchhi!=-1" - [(set (match_operand:HI 0 "d_register_operand" "") - (plus:HI (match_dup 0) - (const_int -1))) + [(parallel [(set (match_operand:HI 0 "d_register_operand" "") + (plus:HI (match_dup 0) + (const_int -1))) + (clobber (reg:CC REG_CC))]) (parallel [(set (reg:CC REG_CC) (compare:CC (match_dup 0) (const_int -1))) (clobber (match_operand:QI 1 "d_register_operand" ""))]) - (set (pc) - (if_then_else (eqne (reg:CC REG_CC) - (const_int 0)) - (label_ref (match_operand 2 "" "")) - (pc)))] + (parallel [(set (pc) + (if_then_else (eqne (reg:CC REG_CC) + (const_int 0)) + (label_ref (match_operand 2 "" "")) + (pc))) + (clobber (reg:CC REG_CC))])] "" { const char *op; @@ -7659,16 +7740,18 @@ [(parallel [(set (match_operand:HI 0 "d_register_operand" "") (plus:HI (match_dup 0) (const_int -1))) - (clobber (scratch:QI))]) + (clobber (scratch:QI)) + (clobber (reg:CC REG_CC))]) (parallel [(set (reg:CC REG_CC) (compare:CC (match_dup 0) (const_int -1))) (clobber (match_operand:QI 1 "d_register_operand" ""))]) - (set (pc) - (if_then_else (eqne (reg:CC REG_CC) - (const_int 0)) - (label_ref (match_operand 2 "" "")) - (pc)))] + (parallel [(set (pc) + (if_then_else (eqne (reg:CC REG_CC) + (const_int 0)) + (label_ref (match_operand 2 "" "")) + (pc))) + (clobber (reg:CC REG_CC))])] "" { const char *op; @@ -7700,16 +7783,18 @@ [(parallel [(set (match_operand:HI 0 "l_register_operand" "") (plus:HI 
(match_dup 0) (const_int -1))) - (clobber (match_operand:QI 3 "d_register_operand" ""))]) + (clobber (match_operand:QI 3 "d_register_operand" "")) + (clobber (reg:CC REG_CC))]) (parallel [(set (reg:CC REG_CC) (compare:CC (match_dup 0) (const_int -1))) (clobber (match_operand:QI 1 "d_register_operand" ""))]) - (set (pc) - (if_then_else (eqne (reg:CC REG_CC) - (const_int 0)) - (label_ref (match_operand 2 "" "")) - (pc)))] + (parallel [(set (pc) + (if_then_else (eqne (reg:CC REG_CC) + (const_int 0)) + (label_ref (match_operand 2 "" "")) + (pc))) + (clobber (reg:CC REG_CC))])] "" { const char *op; @@ -7735,17 +7820,19 @@ }) (define_peephole ; "*dec-and-branchqi!=-1" - [(set (match_operand:QI 0 "d_register_operand" "") - (plus:QI (match_dup 0) - (const_int -1))) + [(parallel [(set (match_operand:QI 0 "d_register_operand" "") + (plus:QI (match_dup 0) + (const_int -1))) + (clobber (reg:CC REG_CC))]) (set (reg:CC REG_CC) (compare:CC (match_dup 0) (const_int -1))) - (set (pc) - (if_then_else (eqne (reg:CC REG_CC) - (const_int 0)) - (label_ref (match_operand 1 "" "")) - (pc)))] + (parallel [(set (pc) + (if_then_else (eqne (reg:CC REG_CC) + (const_int 0)) + (label_ref (match_operand 1 "" "")) + (pc))) + (clobber (reg:CC REG_CC))])] "" { const char *op; @@ -7773,11 +7860,12 @@ [(set (reg:CC REG_CC) (compare:CC (match_operand:ALL1 1 "register_operand" "r,r") (match_operand:ALL1 2 "reg_or_0_operand" "r,Y00"))) - (set (pc) - (if_then_else (eq (reg:CC REG_CC) - (const_int 0)) - (label_ref (match_operand 0 "" "")) - (pc)))] + (parallel [(set (pc) + (if_then_else (eq (reg:CC REG_CC) + (const_int 0)) + (label_ref (match_operand 0 "" "")) + (pc))) + (clobber (reg:CC REG_CC))])] "jump_over_one_insn_p (insn, operands[0])" "@ cpse %1,%2 @@ -7808,11 +7896,12 @@ [(set (reg:CC REG_CC) (compare:CC (match_operand:ALL1 1 "register_operand" "") (match_operand:ALL1 2 "reg_or_0_operand" ""))) - (set (pc) - (if_then_else (ne (reg:CC REG_CC) - (const_int 0)) - (label_ref (match_operand 0 "" "")) - (pc)))] + (parallel [(set (pc) + (if_then_else (ne (reg:CC REG_CC) + (const_int 0)) + (label_ref (match_operand 0 "" "")) + (pc))) + (clobber (reg:CC REG_CC))])] "!AVR_HAVE_JMP_CALL || !TARGET_SKIP_BUG" { @@ -9391,14 +9480,17 @@ (define_peephole2 - [(set (match_operand:QI 0 "register_operand") - (const_int 0)) - (set (match_dup 0) - (ior:QI (match_dup 0) - (match_operand:QI 1 "register_operand")))] + [(parallel [(set (match_operand:QI 0 "register_operand") + (const_int 0)) + (clobber (reg:CC REG_CC))]) + (parallel [(set (match_dup 0) + (ior:QI (match_dup 0) + (match_operand:QI 1 "register_operand"))) + (clobber (reg:CC REG_CC))])] "" - [(set (match_dup 0) - (match_dup 1))]) + [(parallel [(set (match_dup 0) + (match_dup 1)) + (clobber (reg:CC REG_CC))])]) (define_expand "extzv" -- cgit v1.1 From f546e0d3d0316aa76a45de1f548591bde7308c41 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Thu, 4 Feb 2021 23:00:00 +0000 Subject: aarch64: Use RTL builtins for vmull[_high]_p8 intrinsics Rewrite vmull[_high]_p8 Neon intrinsics to use RTL builtins rather than inline assembly code, allowing for better scheduling and optimization. gcc/ChangeLog: 2021-02-05 Jonathan Wright * config/aarch64/aarch64-simd-builtins.def: Add pmull[2] builtin generator macros. * config/aarch64/aarch64-simd.md (aarch64_pmullv8qi): Define. (aarch64_pmull_hiv16qi_insn): Define. (aarch64_pmull_hiv16qi): Define. * config/aarch64/arm_neon.h (vmull_high_p8): Use RTL builtin instead of inline asm. (vmull_p8): Likewise. 
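For illustration (not part of the patch), a usage sketch of the two intrinsics; the wrapper function name is made up, but the intrinsics are the ones rewritten above, which now expand through the new RTL builtins and can be scheduled like any other insn instead of being opaque inline asm:

    #include <arm_neon.h>

    /* Carry-less widening multiplies of polynomial bytes:
       vmull_p8 maps to PMULL (low 64-bit halves), vmull_high_p8 to
       PMULL2 (high halves of the 128-bit inputs).  */
    void
    widen_poly_mul (poly8x16_t a, poly8x16_t b,
                    poly16x8_t *lo, poly16x8_t *hi)
    {
      *lo = vmull_p8 (vget_low_p8 (a), vget_low_p8 (b));
      *hi = vmull_high_p8 (a, b);
    }
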
--- gcc/config/aarch64/aarch64-simd-builtins.def | 2 ++ gcc/config/aarch64/aarch64-simd.md | 38 ++++++++++++++++++++++++++++ gcc/config/aarch64/arm_neon.h | 16 +++--------- 3 files changed, 44 insertions(+), 12 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index 337ec8d..5d4c01f 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -46,6 +46,8 @@ BUILTIN_VDC (COMBINE, combine, 0, AUTO_FP) VAR1 (COMBINEP, combine, 0, NONE, di) BUILTIN_VB (BINOP, pmul, 0, NONE) + VAR1 (BINOP, pmull, 0, NONE, v8qi) + VAR1 (BINOP, pmull_hi, 0, NONE, v16qi) BUILTIN_VHSDF_HSDF (BINOP, fmulx, 0, FP) BUILTIN_VHSDF_DF (UNOP, sqrt, 2, FP) BUILTIN_VDQ_I (BINOP, addp, 0, NONE) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index fbfed33..65e6390 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -4471,6 +4471,44 @@ [(set_attr "type" "neon_mul_")] ) +(define_insn "aarch64_pmullv8qi" + [(set (match_operand:V8HI 0 "register_operand" "=w") + (unspec:V8HI [(match_operand:V8QI 1 "register_operand" "w") + (match_operand:V8QI 2 "register_operand" "w")] + UNSPEC_PMULL))] + "TARGET_SIMD" + "pmull\\t%0.8h, %1.8b, %2.8b" + [(set_attr "type" "neon_mul_b_long")] +) + +(define_insn "aarch64_pmull_hiv16qi_insn" + [(set (match_operand:V8HI 0 "register_operand" "=w") + (unspec:V8HI + [(vec_select:V8QI + (match_operand:V16QI 1 "register_operand" "w") + (match_operand:V16QI 3 "vect_par_cnst_hi_half" "")) + (vec_select:V8QI + (match_operand:V16QI 2 "register_operand" "w") + (match_dup 3))] + UNSPEC_PMULL))] + "TARGET_SIMD" + "pmull2\\t%0.8h, %1.16b, %2.16b" + [(set_attr "type" "neon_mul_b_long")] +) + +(define_expand "aarch64_pmull_hiv16qi" + [(match_operand:V8HI 0 "register_operand") + (match_operand:V16QI 1 "register_operand") + (match_operand:V16QI 2 "register_operand")] + "TARGET_SIMD" + { + rtx p = aarch64_simd_vect_par_cnst_half (V16QImode, 16, true); + emit_insn (gen_aarch64_pmull_hiv16qi_insn (operands[0], operands[1], + operands[2], p)); + DONE; + } +) + ;; fmulx. 
(define_insn "aarch64_fmulx" diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 4b8ec52..bde2d17 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -8228,12 +8228,8 @@ __extension__ extern __inline poly16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmull_high_p8 (poly8x16_t __a, poly8x16_t __b) { - poly16x8_t __result; - __asm__ ("pmull2 %0.8h,%1.16b,%2.16b" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return (poly16x8_t) __builtin_aarch64_pmull_hiv16qi ((int8x16_t) __a, + (int8x16_t) __b); } __extension__ extern __inline int16x8_t @@ -8366,12 +8362,8 @@ __extension__ extern __inline poly16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmull_p8 (poly8x8_t __a, poly8x8_t __b) { - poly16x8_t __result; - __asm__ ("pmull %0.8h, %1.8b, %2.8b" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return (poly16x8_t) __builtin_aarch64_pmullv8qi ((int8x8_t) __a, + (int8x8_t) __b); } __extension__ extern __inline int16x8_t -- cgit v1.1 From 60518e6473248b16db9125504da0351707c35d1a Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Mon, 18 Jan 2021 12:42:52 +0000 Subject: aarch64: Use RTL builtins for FP ml[as]_n intrinsics Rewrite floating-point vml[as][q]_n Neon intrinsics to use RTL builtins rather than inline assembly code, allowing for better scheduling and optimization. gcc/ChangeLog: 2021-01-18 Jonathan Wright * config/aarch64/aarch64-simd-builtins.def: Add float_ml[as]_n_builtin generator macros. * config/aarch64/aarch64-simd.md (*aarch64_mul3_elt_from_dup): Rename to... (mul_n3): This, and re-order arguments. (aarch64_float_mla_n): Define. (aarch64_float_mls_n): Define. * config/aarch64/arm_neon.h (vmla_n_f32): Use RTL builtin instead of inline asm. (vmlaq_n_f32): Likewise. (vmls_n_f32): Likewise. (vmlsq_n_f32): Likewise. --- gcc/config/aarch64/aarch64-simd-builtins.def | 3 ++ gcc/config/aarch64/aarch64-simd.md | 46 ++++++++++++++++++++++++---- gcc/config/aarch64/arm_neon.h | 32 +++---------------- 3 files changed, 47 insertions(+), 34 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index 5d4c01f..3b5e884 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -668,6 +668,9 @@ BUILTIN_VHSDF (TERNOP, fnma, 4, FP) VAR1 (TERNOP, fnma, 4, FP, hf) + BUILTIN_VDQSF (TERNOP, float_mla_n, 0, FP) + BUILTIN_VDQSF (TERNOP, float_mls_n, 0, FP) + /* Implemented by aarch64_simd_bsl. 
*/ BUILTIN_VDQQH (BSL_P, simd_bsl, 0, NONE) VAR2 (BSL_P, simd_bsl,0, NONE, di, v2di) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 65e6390..6edfd2d 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -750,14 +750,14 @@ [(set_attr "type" "neon_mul__scalar")] ) -(define_insn "*aarch64_mul3_elt_from_dup" +(define_insn "mul_n3" [(set (match_operand:VMUL 0 "register_operand" "=w") - (mult:VMUL - (vec_duplicate:VMUL - (match_operand: 1 "register_operand" "")) - (match_operand:VMUL 2 "register_operand" "w")))] + (mult:VMUL + (vec_duplicate:VMUL + (match_operand: 2 "register_operand" "")) + (match_operand:VMUL 1 "register_operand" "w")))] "TARGET_SIMD" - "mul\t%0., %2., %1.[0]"; + "mul\t%0., %1., %2.[0]"; [(set_attr "type" "neon_mul__scalar")] ) @@ -2636,6 +2636,40 @@ [(set_attr "type" "neon_fp_abs_")] ) +(define_expand "aarch64_float_mla_n" + [(set (match_operand:VDQSF 0 "register_operand") + (plus:VDQSF + (mult:VDQSF + (vec_duplicate:VDQSF + (match_operand: 3 "register_operand")) + (match_operand:VDQSF 2 "register_operand")) + (match_operand:VDQSF 1 "register_operand")))] + "TARGET_SIMD" + { + rtx scratch = gen_reg_rtx (mode); + emit_insn (gen_mul_n3 (scratch, operands[2], operands[3])); + emit_insn (gen_add3 (operands[0], operands[1], scratch)); + DONE; + } +) + +(define_expand "aarch64_float_mls_n" + [(set (match_operand:VDQSF 0 "register_operand") + (minus:VDQSF + (match_operand:VDQSF 1 "register_operand") + (mult:VDQSF + (vec_duplicate:VDQSF + (match_operand: 3 "register_operand")) + (match_operand:VDQSF 2 "register_operand"))))] + "TARGET_SIMD" + { + rtx scratch = gen_reg_rtx (mode); + emit_insn (gen_mul_n3 (scratch, operands[2], operands[3])); + emit_insn (gen_sub3 (operands[0], operands[1], scratch)); + DONE; + } +) + (define_insn "fma4" [(set (match_operand:VHSDF 0 "register_operand" "=w") (fma:VHSDF (match_operand:VHSDF 1 "register_operand" "w") diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index bde2d17..f1e1e0e 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -7035,13 +7035,7 @@ __extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmla_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c) { - float32x2_t __result; - float32x2_t __t1; - __asm__ ("fmul %1.2s, %3.2s, %4.s[0]; fadd %0.2s, %0.2s, %1.2s" - : "=w"(__result), "=w"(__t1) - : "0"(__a), "w"(__b), "w"(__c) - : /* No clobbers */); - return __result; + return __builtin_aarch64_float_mla_nv2sf (__a, __b, __c); } __extension__ extern __inline int16x4_t @@ -7388,13 +7382,7 @@ __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlaq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c) { - float32x4_t __result; - float32x4_t __t1; - __asm__ ("fmul %1.4s, %3.4s, %4.s[0]; fadd %0.4s, %0.4s, %1.4s" - : "=w"(__result), "=w"(__t1) - : "0"(__a), "w"(__b), "w"(__c) - : /* No clobbers */); - return __result; + return __builtin_aarch64_float_mla_nv4sf (__a, __b, __c); } __extension__ extern __inline int16x8_t @@ -7481,13 +7469,7 @@ __extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmls_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c) { - float32x2_t __result; - float32x2_t __t1; - __asm__ ("fmul %1.2s, %3.2s, %4.s[0]; fsub %0.2s, %0.2s, %1.2s" - : "=w"(__result), "=w"(__t1) - : "0"(__a), "w"(__b), "w"(__c) - : /* No 
clobbers */); - return __result; + return __builtin_aarch64_float_mls_nv2sf (__a, __b, __c); } __extension__ extern __inline int16x4_t @@ -7838,13 +7820,7 @@ __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlsq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c) { - float32x4_t __result; - float32x4_t __t1; - __asm__ ("fmul %1.4s, %3.4s, %4.s[0]; fsub %0.4s, %0.4s, %1.4s" - : "=w"(__result), "=w"(__t1) - : "0"(__a), "w"(__b), "w"(__c) - : /* No clobbers */); - return __result; + return __builtin_aarch64_float_mls_nv4sf (__a, __b, __c); } __extension__ extern __inline int16x8_t -- cgit v1.1 From b0d9aac8992c1f8c3198d9528a9867c653623dfb Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Tue, 16 Feb 2021 15:42:36 +0000 Subject: aarch64: Use RTL builtins for FP ml[as] intrinsics Rewrite floating-point vml[as][q] Neon intrinsics to use RTL builtins rather than relying on the GCC vector extensions. Using RTL builtins allows control over the emission of fmla/fmls instructions (which we don't want here.) With this commit, the code generated by these intrinsics changes from a fused multiply-add/subtract instruction to an fmul followed by an fadd/fsub instruction. If the programmer really wants fmla/fmls instructions, they can use the vfm[as] intrinsics. gcc/ChangeLog: 2021-02-16 Jonathan Wright * config/aarch64/aarch64-simd-builtins.def: Add float_ml[as] builtin generator macros. * config/aarch64/aarch64-simd.md (aarch64_float_mla): Define. (aarch64_float_mls): Define. * config/aarch64/arm_neon.h (vmla_f32): Use RTL builtin instead of relying on GCC vector extensions. (vmla_f64): Likewise. (vmlaq_f32): Likewise. (vmlaq_f64): Likewise. (vmls_f32): Likewise. (vmls_f64): Likewise. (vmlsq_f32): Likewise. (vmlsq_f64): Likewise. * config/aarch64/iterators.md: Define VDQF_DF mode iterator. 
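For illustration (not part of the patch), a sketch of the user-visible effect; the function names are made up, but the intrinsics are real. After this change vmla/vmls expand to an fmul followed by fadd/fsub, and code that really wants the fused instruction should call the vfm[as] intrinsics instead:

    #include <arm_neon.h>

    /* With this change, expands to fmul + fadd rather than fmla.  */
    float32x4_t
    mla_unfused (float32x4_t acc, float32x4_t x, float32x4_t y)
    {
      return vmlaq_f32 (acc, x, y);
    }

    /* Still a single fused fmla, for code that explicitly asks for it.  */
    float32x4_t
    mla_fused (float32x4_t acc, float32x4_t x, float32x4_t y)
    {
      return vfmaq_f32 (acc, x, y);
    }
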
--- gcc/config/aarch64/aarch64-simd-builtins.def | 2 ++ gcc/config/aarch64/aarch64-simd.md | 32 ++++++++++++++++++++++++++++ gcc/config/aarch64/arm_neon.h | 16 +++++++------- gcc/config/aarch64/iterators.md | 1 + 4 files changed, 43 insertions(+), 8 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index 3b5e884..2a2fc20 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -668,6 +668,8 @@ BUILTIN_VHSDF (TERNOP, fnma, 4, FP) VAR1 (TERNOP, fnma, 4, FP, hf) + BUILTIN_VDQF_DF (TERNOP, float_mla, 0, FP) + BUILTIN_VDQF_DF (TERNOP, float_mls, 0, FP) BUILTIN_VDQSF (TERNOP, float_mla_n, 0, FP) BUILTIN_VDQSF (TERNOP, float_mls_n, 0, FP) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 6edfd2d..0f96cd0 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -2636,6 +2636,38 @@ [(set_attr "type" "neon_fp_abs_")] ) +(define_expand "aarch64_float_mla" + [(set (match_operand:VDQF_DF 0 "register_operand") + (plus:VDQF_DF + (mult:VDQF_DF + (match_operand:VDQF_DF 2 "register_operand") + (match_operand:VDQF_DF 3 "register_operand")) + (match_operand:VDQF_DF 1 "register_operand")))] + "TARGET_SIMD" + { + rtx scratch = gen_reg_rtx (mode); + emit_insn (gen_mul3 (scratch, operands[2], operands[3])); + emit_insn (gen_add3 (operands[0], operands[1], scratch)); + DONE; + } +) + +(define_expand "aarch64_float_mls" + [(set (match_operand:VDQF_DF 0 "register_operand") + (minus:VDQF_DF + (match_operand:VDQF_DF 1 "register_operand") + (mult:VDQF_DF + (match_operand:VDQF_DF 2 "register_operand") + (match_operand:VDQF_DF 3 "register_operand"))))] + "TARGET_SIMD" + { + rtx scratch = gen_reg_rtx (mode); + emit_insn (gen_mul3 (scratch, operands[2], operands[3])); + emit_insn (gen_sub3 (operands[0], operands[1], scratch)); + DONE; + } +) + (define_expand "aarch64_float_mla_n" [(set (match_operand:VDQSF 0 "register_operand") (plus:VDQSF diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index f1e1e0e..0227cad 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -20347,28 +20347,28 @@ __extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmla_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c) { - return __a + __b * __c; + return __builtin_aarch64_float_mlav2sf (__a, __b, __c); } __extension__ extern __inline float64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmla_f64 (float64x1_t __a, float64x1_t __b, float64x1_t __c) { - return __a + __b * __c; + return (float64x1_t) {__builtin_aarch64_float_mladf (__a[0], __b[0], __c[0])}; } __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlaq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c) { - return __a + __b * __c; + return __builtin_aarch64_float_mlav4sf (__a, __b, __c); } __extension__ extern __inline float64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlaq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c) { - return __a + __b * __c; + return __builtin_aarch64_float_mlav2df (__a, __b, __c); } /* vmla_lane */ @@ -20545,28 +20545,28 @@ __extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmls_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c) { - return __a 
- __b * __c; + return __builtin_aarch64_float_mlsv2sf (__a, __b, __c); } __extension__ extern __inline float64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmls_f64 (float64x1_t __a, float64x1_t __b, float64x1_t __c) { - return __a - __b * __c; + return (float64x1_t) {__builtin_aarch64_float_mlsdf (__a[0], __b[0], __c[0])}; } __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlsq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c) { - return __a - __b * __c; + return __builtin_aarch64_float_mlsv4sf (__a, __b, __c); } __extension__ extern __inline float64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlsq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c) { - return __a - __b * __c; + return __builtin_aarch64_float_mlsv2df (__a, __b, __c); } /* vmls_lane */ diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 634c44e..c57aa6b 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -149,6 +149,7 @@ V2SF V4SF V2DF]) ;; Advanced SIMD Float modes, and DF. +(define_mode_iterator VDQF_DF [V2SF V4SF V2DF DF]) (define_mode_iterator VHSDF_DF [(V4HF "TARGET_SIMD_F16INST") (V8HF "TARGET_SIMD_F16INST") V2SF V4SF V2DF DF]) -- cgit v1.1 From 1baf4ed878639536c50a7aab9e7be64da43356fd Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Tue, 16 Feb 2021 23:59:22 +0000 Subject: aarch64: Use RTL builtins for FP ml[as][q]_lane intrinsics Rewrite floating-point vml[as][q]_lane Neon intrinsics to use RTL builtins rather than relying on the GCC vector extensions. Using RTL builtins allows control over the emission of fmla/fmls instructions (which we don't want here.) With this commit, the code generated by these intrinsics changes from a fused multiply-add/subtract instruction to an fmul followed by an fadd/fsub instruction. If the programmer really wants fmla/fmls instructions, they can use the vfm[as] intrinsics. gcc/ChangeLog: 2021-02-16 Jonathan Wright * config/aarch64/aarch64-simd-builtins.def: Add float_ml[as]_lane builtin generator macros. * config/aarch64/aarch64-simd.md (*aarch64_mul3_elt): Rename to... (mul_lane3): This, and re-order arguments. (aarch64_float_mla_lane): Define. (aarch64_float_mls_lane): Define. * config/aarch64/arm_neon.h (vmla_lane_f32): Use RTL builtin instead of GCC vector extensions. (vmlaq_lane_f32): Likewise. (vmls_lane_f32): Likewise. (vmlsq_lane_f32): Likewise. --- gcc/config/aarch64/aarch64-simd-builtins.def | 2 + gcc/config/aarch64/aarch64-simd.md | 58 +++++++++++++++++++++++----- gcc/config/aarch64/arm_neon.h | 8 ++-- 3 files changed, 55 insertions(+), 13 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index 2a2fc20..8e4b4ed 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -672,6 +672,8 @@ BUILTIN_VDQF_DF (TERNOP, float_mls, 0, FP) BUILTIN_VDQSF (TERNOP, float_mla_n, 0, FP) BUILTIN_VDQSF (TERNOP, float_mls_n, 0, FP) + BUILTIN_VDQSF (QUADOP_LANE, float_mla_lane, 0, FP) + BUILTIN_VDQSF (QUADOP_LANE, float_mls_lane, 0, FP) /* Implemented by aarch64_simd_bsl. 
*/ BUILTIN_VDQQH (BSL_P, simd_bsl, 0, NONE) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 0f96cd0..bdee49f 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -718,18 +718,18 @@ } ) -(define_insn "*aarch64_mul3_elt" +(define_insn "mul_lane3" [(set (match_operand:VMUL 0 "register_operand" "=w") - (mult:VMUL - (vec_duplicate:VMUL - (vec_select: - (match_operand:VMUL 1 "register_operand" "") - (parallel [(match_operand:SI 2 "immediate_operand")]))) - (match_operand:VMUL 3 "register_operand" "w")))] + (mult:VMUL + (vec_duplicate:VMUL + (vec_select: + (match_operand:VMUL 2 "register_operand" "") + (parallel [(match_operand:SI 3 "immediate_operand" "i")]))) + (match_operand:VMUL 1 "register_operand" "w")))] "TARGET_SIMD" { - operands[2] = aarch64_endian_lane_rtx (mode, INTVAL (operands[2])); - return "mul\\t%0., %3., %1.[%2]"; + operands[3] = aarch64_endian_lane_rtx (mode, INTVAL (operands[3])); + return "mul\\t%0., %1., %2.[%3]"; } [(set_attr "type" "neon_mul__scalar")] ) @@ -2702,6 +2702,46 @@ } ) +(define_expand "aarch64_float_mla_lane" + [(set (match_operand:VDQSF 0 "register_operand") + (plus:VDQSF + (mult:VDQSF + (vec_duplicate:VDQSF + (vec_select: + (match_operand:V2SF 3 "register_operand") + (parallel [(match_operand:SI 4 "immediate_operand")]))) + (match_operand:VDQSF 2 "register_operand")) + (match_operand:VDQSF 1 "register_operand")))] + "TARGET_SIMD" + { + rtx scratch = gen_reg_rtx (mode); + emit_insn (gen_mul_lane3 (scratch, operands[2], + operands[3], operands[4])); + emit_insn (gen_add3 (operands[0], operands[1], scratch)); + DONE; + } +) + +(define_expand "aarch64_float_mls_lane" + [(set (match_operand:VDQSF 0 "register_operand") + (minus:VDQSF + (match_operand:VDQSF 1 "register_operand") + (mult:VDQSF + (vec_duplicate:VDQSF + (vec_select: + (match_operand:V2SF 3 "register_operand") + (parallel [(match_operand:SI 4 "immediate_operand")]))) + (match_operand:VDQSF 2 "register_operand"))))] + "TARGET_SIMD" + { + rtx scratch = gen_reg_rtx (mode); + emit_insn (gen_mul_lane3 (scratch, operands[2], + operands[3], operands[4])); + emit_insn (gen_sub3 (operands[0], operands[1], scratch)); + DONE; + } +) + (define_insn "fma4" [(set (match_operand:VHSDF 0 "register_operand" "=w") (fma:VHSDF (match_operand:VHSDF 1 "register_operand" "w") diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 0227cad..5328d44 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -20378,7 +20378,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmla_lane_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c, const int __lane) { - return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); + return __builtin_aarch64_float_mla_lanev2sf (__a, __b, __c, __lane); } __extension__ extern __inline int16x4_t @@ -20462,7 +20462,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlaq_lane_f32 (float32x4_t __a, float32x4_t __b, float32x2_t __c, const int __lane) { - return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); + return __builtin_aarch64_float_mla_lanev4sf (__a, __b, __c, __lane); } __extension__ extern __inline int16x8_t @@ -20576,7 +20576,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmls_lane_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c, const int __lane) { - return (__a - (__b * __aarch64_vget_lane_any (__c, __lane))); + return __builtin_aarch64_float_mls_lanev2sf (__a, __b, __c, 
__lane); } __extension__ extern __inline int16x4_t @@ -20660,7 +20660,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlsq_lane_f32 (float32x4_t __a, float32x4_t __b, float32x2_t __c, const int __lane) { - return (__a - (__b * __aarch64_vget_lane_any (__c, __lane))); + return __builtin_aarch64_float_mls_lanev4sf (__a, __b, __c, __lane); } __extension__ extern __inline int16x8_t -- cgit v1.1 From d388179a798c6528563873cbabd80a0e7272c013 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Wed, 17 Feb 2021 13:13:52 +0000 Subject: aarch64: Use RTL builtins for FP ml[as][q]_laneq intrinsics Rewrite floating-point vml[as][q]_laneq Neon intrinsics to use RTL builtins rather than relying on the GCC vector extensions. Using RTL builtins allows control over the emission of fmla/fmls instructions (which we don't want here.) With this commit, the code generated by these intrinsics changes from a fused multiply-add/subtract instruction to an fmul followed by an fadd/fsub instruction. If the programmer really wants fmla/fmls instructions, they can use the vfm[as] intrinsics. gcc/ChangeLog: 2021-02-17 Jonathan Wright * config/aarch64/aarch64-simd-builtins.def: Add float_ml[as][q]_laneq builtin generator macros. * config/aarch64/aarch64-simd.md (mul_laneq3): Define. (aarch64_float_mla_laneq): Define. (aarch64_float_mls_laneq): Define. * config/aarch64/arm_neon.h (vmla_laneq_f32): Use RTL builtin instead of GCC vector extensions. (vmlaq_laneq_f32): Likewise. (vmls_laneq_f32): Likewise. (vmlsq_laneq_f32): Likewise. --- gcc/config/aarch64/aarch64-simd-builtins.def | 2 + gcc/config/aarch64/aarch64-simd.md | 56 ++++++++++++++++++++++++++++ gcc/config/aarch64/arm_neon.h | 8 ++-- 3 files changed, 62 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index 8e4b4ed..1e81bb5 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -674,6 +674,8 @@ BUILTIN_VDQSF (TERNOP, float_mls_n, 0, FP) BUILTIN_VDQSF (QUADOP_LANE, float_mla_lane, 0, FP) BUILTIN_VDQSF (QUADOP_LANE, float_mls_lane, 0, FP) + BUILTIN_VDQSF (QUADOP_LANE, float_mla_laneq, 0, FP) + BUILTIN_VDQSF (QUADOP_LANE, float_mls_laneq, 0, FP) /* Implemented by aarch64_simd_bsl. 
*/ BUILTIN_VDQQH (BSL_P, simd_bsl, 0, NONE) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index bdee49f..2347629 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -734,6 +734,22 @@ [(set_attr "type" "neon_mul__scalar")] ) +(define_insn "mul_laneq3" + [(set (match_operand:VDQSF 0 "register_operand" "=w") + (mult:VDQSF + (vec_duplicate:VDQSF + (vec_select: + (match_operand:V4SF 2 "register_operand" "w") + (parallel [(match_operand:SI 3 "immediate_operand" "i")]))) + (match_operand:VDQSF 1 "register_operand" "w")))] + "TARGET_SIMD" + { + operands[3] = aarch64_endian_lane_rtx (V4SFmode, INTVAL (operands[3])); + return "fmul\\t%0., %1., %2.[%3]"; + } + [(set_attr "type" "neon_fp_mul_s_scalar")] +) + (define_insn "*aarch64_mul3_elt_" [(set (match_operand:VMUL_CHANGE_NLANES 0 "register_operand" "=w") (mult:VMUL_CHANGE_NLANES @@ -2742,6 +2758,46 @@ } ) +(define_expand "aarch64_float_mla_laneq" + [(set (match_operand:VDQSF 0 "register_operand") + (plus:VDQSF + (mult:VDQSF + (vec_duplicate:VDQSF + (vec_select: + (match_operand:V4SF 3 "register_operand") + (parallel [(match_operand:SI 4 "immediate_operand")]))) + (match_operand:VDQSF 2 "register_operand")) + (match_operand:VDQSF 1 "register_operand")))] + "TARGET_SIMD" + { + rtx scratch = gen_reg_rtx (mode); + emit_insn (gen_mul_laneq3 (scratch, operands[2], + operands[3], operands[4])); + emit_insn (gen_add3 (operands[0], operands[1], scratch)); + DONE; + } +) + +(define_expand "aarch64_float_mls_laneq" + [(set (match_operand:VDQSF 0 "register_operand") + (minus:VDQSF + (match_operand:VDQSF 1 "register_operand") + (mult:VDQSF + (vec_duplicate:VDQSF + (vec_select: + (match_operand:V4SF 3 "register_operand") + (parallel [(match_operand:SI 4 "immediate_operand")]))) + (match_operand:VDQSF 2 "register_operand"))))] + "TARGET_SIMD" + { + rtx scratch = gen_reg_rtx (mode); + emit_insn (gen_mul_laneq3 (scratch, operands[2], + operands[3], operands[4])); + emit_insn (gen_sub3 (operands[0], operands[1], scratch)); + DONE; + } +) + (define_insn "fma4" [(set (match_operand:VHSDF 0 "register_operand" "=w") (fma:VHSDF (match_operand:VHSDF 1 "register_operand" "w") diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 5328d44..17e059e 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -20420,7 +20420,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmla_laneq_f32 (float32x2_t __a, float32x2_t __b, float32x4_t __c, const int __lane) { - return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); + return __builtin_aarch64_float_mla_laneqv2sf (__a, __b, __c, __lane); } __extension__ extern __inline int16x4_t @@ -20504,7 +20504,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlaq_laneq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c, const int __lane) { - return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); + return __builtin_aarch64_float_mla_laneqv4sf (__a, __b, __c, __lane); } __extension__ extern __inline int16x8_t @@ -20618,7 +20618,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmls_laneq_f32 (float32x2_t __a, float32x2_t __b, float32x4_t __c, const int __lane) { - return (__a - (__b * __aarch64_vget_lane_any (__c, __lane))); + return __builtin_aarch64_float_mls_laneqv2sf (__a, __b, __c, __lane); } __extension__ extern __inline int16x4_t @@ -20702,7 +20702,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) 
vmlsq_laneq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c, const int __lane) { - return (__a - (__b * __aarch64_vget_lane_any (__c, __lane))); + return __builtin_aarch64_float_mls_laneqv4sf (__a, __b, __c, __lane); } __extension__ extern __inline int16x8_t -- cgit v1.1 From 6e1ecc984ef71fc80d0fb7e91ac38af16ccb1943 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Sun, 2 May 2021 10:28:11 +0200 Subject: i386: Fix up plugin header install on x86 [PR100336] The recent addition of i386-isa.def which is included from i386.h results in failures to build gcc plugins, the i386.h header is installed, but i386-isa.def is not. 2021-05-02 Jakub Jelinek PR target/100336 * config/i386/t-i386 (TM_H): Add $(srcdir)/config/i386/i386-isa.def. --- gcc/config/i386/t-i386 | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/t-i386 b/gcc/config/i386/t-i386 index 66d5a8c..d1d69af 100644 --- a/gcc/config/i386/t-i386 +++ b/gcc/config/i386/t-i386 @@ -18,7 +18,8 @@ OPTIONS_H_EXTRA += $(srcdir)/config/i386/stringop.def TM_H += $(srcdir)/config/i386/x86-tune.def \ - $(srcdir)/common/config/i386/i386-cpuinfo.h + $(srcdir)/common/config/i386/i386-cpuinfo.h \ + $(srcdir)/config/i386/i386-isa.def PASSES_EXTRA += $(srcdir)/config/i386/i386-passes.def i386-c.o: $(srcdir)/config/i386/i386-c.c -- cgit v1.1 From 7911a905276781c20f704f5a91b5125e0184d072 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Sun, 2 May 2021 14:17:23 +0200 Subject: nvptx: Fix up nvptx build against latest libstdc++ [PR100375] The r12-220-gd96db15967e78d7cecea3b1cf3169ceb924678ac change deprecated some non-standard std::pair constructors and that apparently broke nvptx.c build, where pseudo_node_t is std::pair and so nullptr (or NULL) needs to be used for the first argument of the ctors instead of 0. 2021-05-02 Jakub Jelinek PR target/100375 * config/nvptx/nvptx.c (nvptx_sese_pseudo): Use nullptr instead of 0 as first argument of pseudo_node_t constructors. --- gcc/config/nvptx/nvptx.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index 794c5a6..7a7a913 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -3682,9 +3682,9 @@ nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir, edge e; edge_iterator ei; int hi_back = depth; - pseudo_node_t node_back (0, depth); + pseudo_node_t node_back (nullptr, depth); int hi_child = depth; - pseudo_node_t node_child (0, depth); + pseudo_node_t node_child (nullptr, depth); basic_block child = NULL; unsigned num_children = 0; int usd = -dir * sese->dir; @@ -3751,7 +3751,7 @@ nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir, else { /* Fallen off graph, backlink to entry node. 
*/ hi_back = 0; - node_back = pseudo_node_t (0, 0); + node_back = pseudo_node_t (nullptr, 0); } } @@ -3772,7 +3772,7 @@ nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir, else { /* back edge to entry node */ - sese->push (pseudo_node_t (0, 0)); + sese->push (pseudo_node_t (nullptr, 0)); } } @@ -3781,7 +3781,7 @@ nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir, if (!sese->brackets.length () || !edges || !edges->length ()) { hi_back = 0; - node_back = pseudo_node_t (0, 0); + node_back = pseudo_node_t (nullptr, 0); sese->push (node_back); } -- cgit v1.1 From 92f59e47f5a468b96b12b15233a6729904b1a1ee Mon Sep 17 00:00:00 2001 From: David Edelsohn Date: Tue, 6 Apr 2021 11:41:49 -0400 Subject: aix: Redesign section encoding and selection AIX symbol references can refer to either the symbol (a label) or a symbol with a qualifier (the storage mapping class). The storage mapping class provide information about the underlying COFF section into which the symbol will be placed, e.g. [RO] for read-only in the text section, [RW] for read-writer in the data section, or [BS] for the BSS section. A label is distinct from a qualname in the assembler language, e.g., foo and foo[RW] are different, but the symbol table of an object file strips the storage mapping class from the name, so that it no longer is relevant when referring to symbols across object files and libraries. .csect .data[RW] i: is a label "i" in the .data CSECT, which has storage mapping class [RW] so that it is placed in the read-write COFF section. .csect i[RW] is a CSECT "i[RW]". BSS does not allow interior labels. The AIX port of GCC had been emitting the storage mapping class where appropriate but not consistently using the storage mapping class on the DECL or SYM name. This patch updates the section encoding to properly place storage mapping class on symbol names and remove the decorations placed when emitting the symbol. The mapping classes correspond to sections and the encoding choices must exactly match the section choices made by get_section, so the logic for the computation of reloc in get_variable_section is split into its own function that XCOFF encode section info can call. gcc/ChangeLog: * varasm.c (compute_reloc_for_var): Split out from... (get_variable_section): Use it. * output.h (compute_reloc_for_var): Declare. * config/rs6000/rs6000-protos.h (rs6000_xcoff_asm_output_aligned_decl_common): Change alignment to unsigned int. * config/rs6000/rs6000.c (rs6000_legitimize_tls_address_aix): Don't append storage mapping class to symbol. (rs6000_xcoff_asm_named_section): Add BS and UL mapping classes. Don't convert TLS BSS to common. (rs6000_xcoff_unique_section): Don't fall back to select_secton. (rs6000_xcoff_section_type_flags): Add SECTION_BSS if DECL is bss_initializer. (rs6000_xcoff_asm_globalize_decl_name): Don't strip storage mapping class. (rs6000_xcoff_asm_output_aligned_decl_common): Align is unsigned int. If align is 0 from TLS class, use the same rules as varasm.c If not common, switch to BSS section manually. If common, emit appropriate comm or lcomm directive. (rs6000_xcoff_encode_section_info): Add logic to append all storage mapping classes. (rs6000_asm_weaken_decl): Adjust for qualname symbols. * config/rs6000/xcoff.h (ASM_OUTPUT_ALIGNED_DECL_LOCAL): Use rs6000_xcoff_asm_output_aligned_decl_common. (ASM_OUTPUT_ALIGNED_DECL_LOCAL): Use rs6000_xcoff_asm_output_aligned_decl_common. (ASM_OUTPUT_TLS_COMMON): Use rs6000_xcoff_asm_output_aligned_decl_common. 
gcc/testsuite/ChangeLog: * g++.dg/ext/visibility/fvisibility-inlines-hidden-4.C: Expect [BS] mapping class on AIX. * gcc.c-torture/compile/pr61159.c: XFAIL on AIX. * gcc.c-torture/execute/alias-2.c: Same. * gcc.dg/alias-7.c: Same. --- gcc/config/rs6000/rs6000-protos.h | 2 +- gcc/config/rs6000/rs6000.c | 160 +++++++++++++++++++++++--------------- gcc/config/rs6000/xcoff.h | 44 ++--------- 3 files changed, 104 insertions(+), 102 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h index a06a147..bef727e 100644 --- a/gcc/config/rs6000/rs6000-protos.h +++ b/gcc/config/rs6000/rs6000-protos.h @@ -243,7 +243,7 @@ extern void rs6000_xcoff_declare_object_name (FILE *, const char *, tree); extern void rs6000_xcoff_asm_output_aligned_decl_common (FILE *, tree, const char *, unsigned HOST_WIDE_INT, - unsigned HOST_WIDE_INT); + unsigned int); extern void rs6000_elf_declare_function_name (FILE *, const char *, tree); extern bool rs6000_elf_in_small_data_p (const_tree); diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 7718176..0e9cf17 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -9311,29 +9311,12 @@ rs6000_got_sym (void) static rtx rs6000_legitimize_tls_address_aix (rtx addr, enum tls_model model) { - rtx sym, mem, tocref, tlsreg, tmpreg, dest, tlsaddr; + rtx sym, mem, tocref, tlsreg, tmpreg, dest; const char *name; char *tlsname; - name = XSTR (addr, 0); - /* Append TLS CSECT qualifier, unless the symbol already is qualified - or the symbol will be in TLS private data section. */ - if (name[strlen (name) - 1] != ']' - && (TREE_PUBLIC (SYMBOL_REF_DECL (addr)) - || bss_initializer_p (SYMBOL_REF_DECL (addr)))) - { - tlsname = XALLOCAVEC (char, strlen (name) + 4); - strcpy (tlsname, name); - strcat (tlsname, - bss_initializer_p (SYMBOL_REF_DECL (addr)) ? "[UL]" : "[TL]"); - tlsaddr = copy_rtx (addr); - XSTR (tlsaddr, 0) = ggc_strdup (tlsname); - } - else - tlsaddr = addr; - /* Place addr into TOC constant pool. */ - sym = force_const_mem (GET_MODE (tlsaddr), tlsaddr); + sym = force_const_mem (GET_MODE (addr), addr); /* Output the TOC entry and create the MEM referencing the value. */ if (constant_pool_expr_p (XEXP (sym, 0)) @@ -21238,10 +21221,11 @@ rs6000_xcoff_asm_named_section (const char *name, unsigned int flags, tree decl ATTRIBUTE_UNUSED) { int smclass; - static const char * const suffix[5] = { "PR", "RO", "RW", "TL", "XO" }; + static const char * const suffix[7] + = { "PR", "RO", "RW", "BS", "TL", "UL", "XO" }; if (flags & SECTION_EXCLUDE) - smclass = 4; + smclass = 6; else if (flags & SECTION_DEBUG) { fprintf (asm_out_file, "\t.dwsect %s\n", name); @@ -21250,9 +21234,19 @@ rs6000_xcoff_asm_named_section (const char *name, unsigned int flags, else if (flags & SECTION_CODE) smclass = 0; else if (flags & SECTION_TLS) - smclass = 3; + { + if (flags & SECTION_BSS) + smclass = 5; + else + smclass = 4; + } else if (flags & SECTION_WRITE) - smclass = 2; + { + if (flags & SECTION_BSS) + smclass = 3; + else + smclass = 2; + } else smclass = 1; @@ -21291,11 +21285,7 @@ rs6000_xcoff_select_section (tree decl, int reloc, if (TREE_CODE (decl) == VAR_DECL && DECL_THREAD_LOCAL_P (decl)) { if (bss_initializer_p (decl)) - { - /* Convert to COMMON to emit in BSS. 
*/ - DECL_COMMON (decl) = 1; - return tls_comm_section; - } + return tls_comm_section; else if (TREE_PUBLIC (decl)) return tls_data_section; else @@ -21315,17 +21305,6 @@ rs6000_xcoff_unique_section (tree decl, int reloc ATTRIBUTE_UNUSED) { const char *name; - /* Use select_section for private data and uninitialized data with - alignment <= BIGGEST_ALIGNMENT. */ - if (!TREE_PUBLIC (decl) - || DECL_COMMON (decl) - || (DECL_INITIAL (decl) == NULL_TREE - && DECL_ALIGN (decl) <= BIGGEST_ALIGNMENT) - || DECL_INITIAL (decl) == error_mark_node - || (flag_zero_initialized_in_bss - && initializer_zerop (DECL_INITIAL (decl)))) - return; - name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)); name = (*targetm.strip_name_encoding) (name); set_decl_section_name (decl, name); @@ -21370,6 +21349,9 @@ rs6000_xcoff_section_type_flags (tree decl, const char *name, int reloc) unsigned int align; unsigned int flags = default_section_type_flags (decl, name, reloc); + if (decl && DECL_P (decl) && VAR_P (decl) && bss_initializer_p (decl)) + flags |= SECTION_BSS; + /* Align to at least UNIT size. */ if ((flags & SECTION_CODE) != 0 || !decl || !DECL_P (decl)) align = MIN_UNITS_PER_WORD; @@ -21632,7 +21614,7 @@ rs6000_xcoff_asm_globalize_decl_name (FILE *stream, tree decl) { const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0); fputs (GLOBAL_ASM_OP, stream); - RS6000_OUTPUT_BASENAME (stream, name); + assemble_name (stream, name); #ifdef HAVE_GAS_HIDDEN fputs (rs6000_xcoff_visibility (decl), stream); #endif @@ -21647,27 +21629,50 @@ rs6000_xcoff_asm_output_aligned_decl_common (FILE *stream, tree decl ATTRIBUTE_UNUSED, const char *name, unsigned HOST_WIDE_INT size, - unsigned HOST_WIDE_INT align) + unsigned int align) { - unsigned HOST_WIDE_INT align2 = 2; + unsigned int align2 = 2; + + if (align == 0) + align = DATA_ABI_ALIGNMENT (TREE_TYPE (decl), DECL_ALIGN (decl)); if (align > 32) align2 = floor_log2 (align / BITS_PER_UNIT); else if (size > 4) align2 = 3; - fputs (COMMON_ASM_OP, stream); - RS6000_OUTPUT_BASENAME (stream, name); + if (! DECL_COMMON (decl)) + { + /* Forget section. */ + in_section = NULL; + + /* Globalize TLS BSS. */ + if (TREE_PUBLIC (decl) && DECL_THREAD_LOCAL_P (decl)) + fprintf (stream, "\t.globl %s\n", name); - fprintf (stream, - "," HOST_WIDE_INT_PRINT_UNSIGNED "," HOST_WIDE_INT_PRINT_UNSIGNED, - size, align2); + /* Switch to section and skip space. */ + fprintf (stream, "\t.csect %s,%u\n", name, align2); + ASM_DECLARE_OBJECT_NAME (stream, name, decl); + ASM_OUTPUT_SKIP (stream, size ? size : 1); + return; + } + + if (TREE_PUBLIC (decl)) + { + fprintf (stream, + "\t.comm %s," HOST_WIDE_INT_PRINT_UNSIGNED ",%u" , + name, size, align2); #ifdef HAVE_GAS_HIDDEN - if (decl != NULL) - fputs (rs6000_xcoff_visibility (decl), stream); + if (decl != NULL) + fputs (rs6000_xcoff_visibility (decl), stream); #endif - putc ('\n', stream); + putc ('\n', stream); + } + else + fprintf (stream, + "\t.lcomm %s," HOST_WIDE_INT_PRINT_UNSIGNED ",%s,%u\n", + (*targetm.strip_name_encoding) (name), size, name, align2); } /* This macro produces the initial definition of a object (variable) name. @@ -21733,19 +21738,50 @@ rs6000_xcoff_encode_section_info (tree decl, rtx rtl, int first) SYMBOL_REF_FLAGS (symbol) = flags; - /* Append mapping class to extern decls. 
*/ symname = XSTR (symbol, 0); - if (decl /* sync condition with assemble_external () */ - && DECL_P (decl) && DECL_EXTERNAL (decl) && TREE_PUBLIC (decl) - && ((TREE_CODE (decl) == VAR_DECL && !DECL_THREAD_LOCAL_P (decl)) - || TREE_CODE (decl) == FUNCTION_DECL) + + /* Append CSECT mapping class, unless the symbol already is qualified. */ + if (decl + && DECL_P (decl) + && VAR_OR_FUNCTION_DECL_P (decl) + && lookup_attribute ("alias", DECL_ATTRIBUTES (decl)) == NULL_TREE && symname[strlen (symname) - 1] != ']') { - char *newname = (char *) alloca (strlen (symname) + 5); - strcpy (newname, symname); - strcat (newname, (TREE_CODE (decl) == FUNCTION_DECL - ? "[DS]" : "[UA]")); - XSTR (symbol, 0) = ggc_strdup (newname); + const char *smclass = NULL; + + if (TREE_CODE (decl) == FUNCTION_DECL) + { + if (DECL_EXTERNAL (decl)) + smclass = "[DS]"; + } + else if (DECL_THREAD_LOCAL_P (decl)) + { + if (bss_initializer_p (decl)) + smclass = "[UL]"; + else if (flag_data_sections) + smclass = "[TL]"; + } + else if (DECL_EXTERNAL (decl)) + smclass = "[UA]"; + else if (bss_initializer_p (decl)) + smclass = "[BS]"; + else if (flag_data_sections) + { + /* This must exactly match the logic of select section. */ + if (decl_readonly_section (decl, compute_reloc_for_var (decl))) + smclass = "[RO]"; + else + smclass = "[RW]"; + } + + if (smclass != NULL) + { + char *newname = XALLOCAVEC (char, strlen (symname) + 5); + + strcpy (newname, symname); + strcat (newname, smclass); + XSTR (symbol, 0) = ggc_strdup (newname); + } } } #endif /* HAVE_AS_TLS */ @@ -21756,11 +21792,11 @@ rs6000_asm_weaken_decl (FILE *stream, tree decl, const char *name, const char *val) { fputs ("\t.weak\t", stream); - RS6000_OUTPUT_BASENAME (stream, name); + assemble_name (stream, name); if (decl && TREE_CODE (decl) == FUNCTION_DECL && DEFAULT_ABI == ABI_AIX && DOT_SYMBOLS) { - if (TARGET_XCOFF) + if (TARGET_XCOFF && name[strlen (name) - 1] != ']') fputs ("[DS]", stream); #if TARGET_XCOFF && HAVE_GAS_HIDDEN if (TARGET_XCOFF) diff --git a/gcc/config/rs6000/xcoff.h b/gcc/config/rs6000/xcoff.h index c016678..5ba565f 100644 --- a/gcc/config/rs6000/xcoff.h +++ b/gcc/config/rs6000/xcoff.h @@ -104,6 +104,8 @@ #define TARGET_ENCODE_SECTION_INFO rs6000_xcoff_encode_section_info #endif #define ASM_OUTPUT_ALIGNED_DECL_COMMON rs6000_xcoff_asm_output_aligned_decl_common +#define ASM_OUTPUT_ALIGNED_DECL_LOCAL rs6000_xcoff_asm_output_aligned_decl_common +#define ASM_OUTPUT_ALIGNED_BSS rs6000_xcoff_asm_output_aligned_decl_common /* FP save and restore routines. */ #define SAVE_FP_PREFIX "._savef" @@ -218,48 +220,12 @@ to define a global common symbol. */ #define COMMON_ASM_OP "\t.comm " - -/* This says how to output an assembler line - to define a local common symbol. - The assembler in AIX 6.1 and later supports an alignment argument. - For earlier releases of AIX, we try to maintain - alignment after preceding TOC section if it was aligned - for 64-bit mode. 
*/ - #define LOCAL_COMMON_ASM_OP "\t.lcomm " -#if TARGET_AIX_VERSION >= 61 -#define ASM_OUTPUT_ALIGNED_LOCAL(FILE, NAME, SIZE, ALIGN) \ - do { fputs (LOCAL_COMMON_ASM_OP, (FILE)); \ - RS6000_OUTPUT_BASENAME ((FILE), (NAME)); \ - if ((ALIGN) > 32) \ - fprintf ((FILE), "," HOST_WIDE_INT_PRINT_UNSIGNED",%s%u_,%u\n", \ - (SIZE), xcoff_bss_section_name, \ - floor_log2 ((ALIGN) / BITS_PER_UNIT), \ - floor_log2 ((ALIGN) / BITS_PER_UNIT)); \ - else if ((SIZE) > 4) \ - fprintf ((FILE), "," HOST_WIDE_INT_PRINT_UNSIGNED",%s3_,3\n", \ - (SIZE), xcoff_bss_section_name); \ - else \ - fprintf ((FILE), "," HOST_WIDE_INT_PRINT_UNSIGNED",%s,2\n", \ - (SIZE), xcoff_bss_section_name); \ - } while (0) -#endif - -#define ASM_OUTPUT_LOCAL(FILE, NAME, SIZE, ROUNDED) \ - do { fputs (LOCAL_COMMON_ASM_OP, (FILE)); \ - RS6000_OUTPUT_BASENAME ((FILE), (NAME)); \ - fprintf ((FILE), "," HOST_WIDE_INT_PRINT_UNSIGNED",%s\n", \ - (TARGET_32BIT ? (SIZE) : (ROUNDED)), \ - xcoff_bss_section_name); \ - } while (0) - #ifdef HAVE_AS_TLS -#define ASM_OUTPUT_TLS_COMMON(FILE, DECL, NAME, SIZE) \ - do { fputs (COMMON_ASM_OP, (FILE)); \ - RS6000_OUTPUT_BASENAME ((FILE), (NAME)); \ - fprintf ((FILE), "[UL]," HOST_WIDE_INT_PRINT_UNSIGNED"\n", \ - (SIZE)); \ +#define ASM_OUTPUT_TLS_COMMON(FILE, DECL, NAME, SIZE) \ + do { \ + rs6000_xcoff_asm_output_aligned_decl_common ((FILE), (DECL), (NAME), (SIZE), 0); \ } while (0) #endif -- cgit v1.1 From 4f48c335d36674f90046b2823f0ac1c0545dc082 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Mon, 26 Apr 2021 14:12:08 +0200 Subject: IBM Z: Handle hard registers in s390_md_asm_adjust() gen_fprx2_to_tf() and gen_tf_to_fprx2() cannot handle hard registers, since the subregs they create do not pass validation. Change s390_md_asm_adjust() to manually copy between hard VRs and FPRs instead of using these two functions. gcc/ChangeLog: PR target/100217 * config/s390/s390.c (s390_hard_fp_reg_p): New function. (s390_md_asm_adjust): Handle hard registers. gcc/testsuite/ChangeLog: PR target/100217 * gcc.target/s390/vector/long-double-asm-in-out-hard-fp-reg.c: New test. * gcc.target/s390/vector/long-double-asm-inout-hard-fp-reg.c: New test. --- gcc/config/s390/s390.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 48 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c index a9c945c..88361f9 100644 --- a/gcc/config/s390/s390.c +++ b/gcc/config/s390/s390.c @@ -16754,6 +16754,23 @@ f_constraint_p (const char *constraint) return seen_f_p && !seen_v_p; } +/* Return TRUE iff X is a hard floating-point (and not a vector) register. */ + +static bool +s390_hard_fp_reg_p (rtx x) +{ + if (!(REG_P (x) && HARD_REGISTER_P (x) && REG_ATTRS (x))) + return false; + + tree decl = REG_EXPR (x); + if (!(HAS_DECL_ASSEMBLER_NAME_P (decl) && DECL_ASSEMBLER_NAME_SET_P (decl))) + return false; + + const char *name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)); + + return name[0] == '*' && name[1] == 'f'; +} + /* Implement TARGET_MD_ASM_ADJUST hook in order to fix up "f" constraints when long doubles are stored in vector registers. */ @@ -16787,9 +16804,24 @@ s390_md_asm_adjust (vec &outputs, vec &inputs, gcc_assert (allows_reg); gcc_assert (!is_inout); /* Copy output value from a FPR pair into a vector register. 
*/ - rtx fprx2 = gen_reg_rtx (FPRX2mode); + rtx fprx2; push_to_sequence2 (after_md_seq, after_md_end); - emit_insn (gen_fprx2_to_tf (outputs[i], fprx2)); + if (s390_hard_fp_reg_p (outputs[i])) + { + fprx2 = gen_rtx_REG (FPRX2mode, REGNO (outputs[i])); + /* The first half is already at the correct location, copy only the + * second one. Use the UNSPEC pattern instead of the SUBREG one, + * since s390_can_change_mode_class() rejects + * (subreg:DF (reg:TF %fN) 8) and thus subreg validation fails. */ + rtx v1 = gen_rtx_REG (V2DFmode, REGNO (outputs[i])); + rtx v3 = gen_rtx_REG (V2DFmode, REGNO (outputs[i]) + 1); + emit_insn (gen_vec_permiv2df (v1, v1, v3, const0_rtx)); + } + else + { + fprx2 = gen_reg_rtx (FPRX2mode); + emit_insn (gen_fprx2_to_tf (outputs[i], fprx2)); + } after_md_seq = get_insns (); after_md_end = get_last_insn (); end_sequence (); @@ -16813,8 +16845,20 @@ s390_md_asm_adjust (vec &outputs, vec &inputs, continue; gcc_assert (allows_reg); /* Copy input value from a vector register into a FPR pair. */ - rtx fprx2 = gen_reg_rtx (FPRX2mode); - emit_insn (gen_tf_to_fprx2 (fprx2, inputs[i])); + rtx fprx2; + if (s390_hard_fp_reg_p (inputs[i])) + { + fprx2 = gen_rtx_REG (FPRX2mode, REGNO (inputs[i])); + /* Copy only the second half. */ + rtx v1 = gen_rtx_REG (V2DFmode, REGNO (inputs[i]) + 1); + rtx v2 = gen_rtx_REG (V2DFmode, REGNO (inputs[i])); + emit_insn (gen_vec_permiv2df (v1, v2, v1, GEN_INT (3))); + } + else + { + fprx2 = gen_reg_rtx (FPRX2mode); + emit_insn (gen_tf_to_fprx2 (fprx2, inputs[i])); + } inputs[i] = fprx2; input_modes[i] = FPRX2mode; } -- cgit v1.1 From bd1cd0d0e0fecc6ac8632c266591767392480746 Mon Sep 17 00:00:00 2001 From: Segher Boessenkool Date: Fri, 23 Apr 2021 19:59:00 +0000 Subject: Remove CC0 This removes CC0 and all directly related infrastructure. CC_STATUS, CC_STATUS_MDEP, CC_STATUS_MDEP_INIT, and NOTICE_UPDATE_CC are deleted and poisoned. CC0 is only deleted (some targets use that name for something else). HAVE_cc0 is automatically generated, and we no longer will do that after this patch. CC_STATUS_INIT is suggested in final.c to also be useful for ports that are not CC0, and at least arm seems to use it for something. So I am leaving that alone, but most targets that have it could remove it. 2021-05-04 Segher Boessenkool * caller-save.c: Remove CC0. * cfgcleanup.c: Remove CC0. * cfgrtl.c: Remove CC0. * combine.c: Remove CC0. * compare-elim.c: Remove CC0. * conditions.h: Remove CC0. * config/h8300/h8300.h: Remove CC0. * config/h8300/h8300-protos.h: Remove CC0. * config/h8300/peepholes.md: Remove CC0. * config/i386/x86-tune-sched.c: Remove CC0. * config/m68k/m68k.c: Remove CC0. * config/rl78/rl78.c: Remove CC0. * config/sparc/sparc.c: Remove CC0. * config/xtensa/xtensa.c: Remove CC0. (gen_conditional_move): Use pc_rtx instead of cc0_rtx in a piece of RTL where that is used as a placeholder only. * cprop.c: Remove CC0. * cse.c: Remove CC0. * cselib.c: Remove CC0. * df-problems.c: Remove CC0. * df-scan.c: Remove CC0. * doc/md.texi: Remove CC0. Adjust an example. * doc/rtl.texi: Remove CC0. Adjust an example. * doc/tm.texi: Regenerate. * doc/tm.texi.in: Remove CC0. * emit-rtl.c: Remove CC0. * final.c: Remove CC0. * fwprop.c: Remove CC0. * gcse-common.c: Remove CC0. * gcse.c: Remove CC0. * genattrtab.c: Remove CC0. * genconfig.c: Remove CC0. * genemit.c: Remove CC0. * genextract.c: Remove CC0. * gengenrtl.c: Remove CC0. * genrecog.c: Remove CC0. * haifa-sched.c: Remove CC0. * ifcvt.c: Remove CC0. * ira-costs.c: Remove CC0. * ira.c: Remove CC0. 
* jump.c: Remove CC0. * loop-invariant.c: Remove CC0. * lra-constraints.c: Remove CC0. * lra-eliminations.c: Remove CC0. * optabs.c: Remove CC0. * postreload-gcse.c: Remove CC0. * postreload.c: Remove CC0. * print-rtl.c: Remove CC0. * read-rtl-function.c: Remove CC0. * reg-notes.def: Remove CC0. * reg-stack.c: Remove CC0. * reginfo.c: Remove CC0. * regrename.c: Remove CC0. * reload.c: Remove CC0. * reload1.c: Remove CC0. * reorg.c: Remove CC0. * resource.c: Remove CC0. * rtl.c: Remove CC0. * rtl.def: Remove CC0. * rtl.h: Remove CC0. * rtlanal.c: Remove CC0. * sched-deps.c: Remove CC0. * sched-rgn.c: Remove CC0. * shrink-wrap.c: Remove CC0. * simplify-rtx.c: Remove CC0. * system.h: Remove CC0. Poison NOTICE_UPDATE_CC, CC_STATUS_MDEP_INIT, CC_STATUS_MDEP, and CC_STATUS. * target.def: Remove CC0. * valtrack.c: Remove CC0. * var-tracking.c: Remove CC0. --- gcc/config/h8300/h8300-protos.h | 1 - gcc/config/h8300/h8300.h | 7 - gcc/config/h8300/peepholes.md | 947 --------------------------------------- gcc/config/i386/x86-tune-sched.c | 1 - gcc/config/m68k/m68k.c | 2 - gcc/config/rl78/rl78.c | 1 - gcc/config/sparc/sparc.c | 1 - gcc/config/xtensa/xtensa.c | 2 +- 8 files changed, 1 insertion(+), 961 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/h8300/h8300-protos.h b/gcc/config/h8300/h8300-protos.h index c5667b3..45e7dec 100644 --- a/gcc/config/h8300/h8300-protos.h +++ b/gcc/config/h8300/h8300-protos.h @@ -36,7 +36,6 @@ extern const char *output_simode_bld (int, rtx[]); extern void final_prescan_insn (rtx_insn *, rtx *, int); extern int h8300_expand_movsi (rtx[]); extern machine_mode h8300_select_cc_mode (RTX_CODE, rtx, rtx); -extern void notice_update_cc (rtx, rtx_insn *); extern const char *output_logical_op (machine_mode, rtx *); extern unsigned int compute_logical_op_length (machine_mode, rtx *); diff --git a/gcc/config/h8300/h8300.h b/gcc/config/h8300/h8300.h index b1fbcc5..ea60021 100644 --- a/gcc/config/h8300/h8300.h +++ b/gcc/config/h8300/h8300.h @@ -569,13 +569,6 @@ struct cum_arg /* Here we define machine-dependent flags and fields in cc_status (see `conditions.h'). No extra ones are needed for the h8300. */ -/* Store in cc_status the expressions - that the condition codes will describe - after execution of an instruction whose pattern is EXP. - Do not alter them if the instruction would not alter the cc's. */ - -#define NOTICE_UPDATE_CC(EXP, INSN) notice_update_cc (EXP, INSN) - /* The add insns don't set overflow in a usable way. */ #define CC_OVERFLOW_UNUSABLE 01000 /* The mov,and,or,xor insns don't set carry. 
That's OK though as the diff --git a/gcc/config/h8300/peepholes.md b/gcc/config/h8300/peepholes.md index bd69018..a836d7d 100644 --- a/gcc/config/h8300/peepholes.md +++ b/gcc/config/h8300/peepholes.md @@ -349,90 +349,6 @@ (match_dup 1)))] "") -;; Turn -;; -;; subs #1,er4 -;; mov.w r4,r4 -;; bne .L2028 -;; -;; into -;; -;; dec.w #1,r4 -;; bne .L2028 - -(define_peephole2 - [(set (match_operand:HI 0 "register_operand" "") - (plus:HI (match_dup 0) - (match_operand 1 "incdec_operand" ""))) - (set (cc0) (compare (match_dup 0) - (const_int 0))) - (set (pc) - (if_then_else (match_operator 4 "eqne_operator" - [(cc0) (const_int 0)]) - (match_operand 2 "pc_or_label_operand" "") - (match_operand 3 "pc_or_label_operand" "")))] - "" - [(set (match_operand:HI 0 "register_operand" "") - (unspec:HI [(match_dup 0) - (match_dup 1)] - UNSPEC_INCDEC)) - (set (cc0) (compare (match_dup 0) - (const_int 0))) - (set (pc) - (if_then_else (match_op_dup 4 [(cc0) (const_int 0)]) - (match_dup 2) - (match_dup 3)))]) - -;; The SImode version of the previous pattern. - -(define_peephole2 - [(set (match_operand:SI 0 "register_operand" "") - (plus:SI (match_dup 0) - (match_operand 1 "incdec_operand" ""))) - (set (cc0) (compare (match_dup 0) - (const_int 0))) - (set (pc) - (if_then_else (match_operator 4 "eqne_operator" - [(cc0) (const_int 0)]) - (match_operand 2 "pc_or_label_operand" "") - (match_operand 3 "pc_or_label_operand" "")))] - "" - [(set (match_operand:SI 0 "register_operand" "") - (unspec:SI [(match_dup 0) - (match_dup 1)] - UNSPEC_INCDEC)) - (set (cc0) (compare (match_dup 0) - (const_int 0))) - (set (pc) - (if_then_else (match_op_dup 4 [(cc0) (const_int 0)]) - (match_dup 2) - (match_dup 3)))]) - -(define_peephole2 - [(parallel [(set (cc0) - (compare (zero_extract:SI (match_operand:QI 0 "register_operand" "") - (const_int 1) - (const_int 7)) - (const_int 0))) - (clobber (scratch:QI))]) - (set (pc) - (if_then_else (match_operator 4 "eqne_operator" - [(cc0) (const_int 0)]) - (match_operand 2 "pc_or_label_operand" "") - (match_operand 3 "pc_or_label_operand" "")))] - "" - [(set (cc0) (compare (match_dup 0) - (const_int 0))) - (set (pc) - (if_then_else (match_op_dup 4 [(cc0) (const_int 0)]) - (match_dup 2) - (match_dup 3)))] - { - operands[4] = ((GET_CODE (operands[4]) == EQ) - ? gen_rtx_GE (VOIDmode, cc0_rtx, const0_rtx) - : gen_rtx_LT (VOIDmode, cc0_rtx, const0_rtx)); - }) - ;; If a load of mem:SI is followed by an AND that turns off the upper ;; half, then we can load mem:HI instead. @@ -456,829 +372,6 @@ operands[4] = gen_lowpart (HImode, operands[1]); }) -;; (compare (reg:HI) (const_int)) takes 4 bytes, so we try to achieve -;; the equivalent with shorter sequences. Here is the summary. Cases -;; are grouped for each define_peephole2. 
-;; -;; reg const_int use insn -;; -------------------------------------------------------- -;; dead -2 eq/ne inc.l -;; dead -1 eq/ne inc.l -;; dead 1 eq/ne dec.l -;; dead 2 eq/ne dec.l -;; -;; dead 1 ge/lt shar.l -;; dead 3 (H8S) ge/lt shar.l -;; -;; dead 1 geu/ltu shar.l -;; dead 3 (H8S) geu/ltu shar.l -;; -;; ---- 255 ge/lt mov.b -;; -;; ---- 255 geu/ltu mov.b - -;; Transform -;; -;; cmp.w #1,r0 -;; bne .L1 -;; -;; into -;; -;; dec.w #1,r0 -;; bne .L1 - -(define_peephole2 - [(set (cc0) - (compare (match_operand:HI 0 "register_operand" "") - (match_operand:HI 1 "incdec_operand" ""))) - (set (pc) - (if_then_else (match_operator 4 "eqne_operator" - [(cc0) (const_int 0)]) - (match_operand 2 "pc_or_label_operand" "") - (match_operand 3 "pc_or_label_operand" "")))] - "INTVAL (operands[1]) != 0 && peep2_reg_dead_p (1, operands[0])" - [(set (match_dup 0) - (unspec:HI [(match_dup 0) - (match_dup 5)] - UNSPEC_INCDEC)) - (set (cc0) (compare (match_dup 0) - (const_int 0))) - (set (pc) - (if_then_else (match_op_dup 4 [(cc0) (const_int 0)]) - (match_dup 2) - (match_dup 3)))] - { - operands[5] = GEN_INT (- INTVAL (operands[1])); - }) - -;; Transform -;; -;; cmp.w #1,r0 -;; bgt .L1 -;; -;; into -;; -;; shar.w r0 -;; bgt .L1 - -(define_peephole2 - [(set (cc0) - (compare (match_operand:HI 0 "register_operand" "") - (match_operand:HI 1 "const_int_operand" ""))) - (set (pc) - (if_then_else (match_operator 4 "gtle_operator" - [(cc0) (const_int 0)]) - (match_operand 2 "pc_or_label_operand" "") - (match_operand 3 "pc_or_label_operand" "")))] - "peep2_reg_dead_p (1, operands[0]) - && (INTVAL (operands[1]) == 1 - || (TARGET_H8300S && INTVAL (operands[1]) == 3))" - [(parallel [(set (match_dup 0) - (ashiftrt:HI (match_dup 0) - (match_dup 5))) - (clobber (scratch:QI))]) - (set (cc0) (compare (match_dup 0) - (const_int 0))) - (set (pc) - (if_then_else (match_dup 4) - (match_dup 2) - (match_dup 3)))] - { - operands[5] = GEN_INT (exact_log2 (INTVAL (operands[1]) + 1)); - }) - -;; Transform -;; -;; cmp.w #1,r0 -;; bhi .L1 -;; -;; into -;; -;; shar.w r0 -;; bne .L1 - -(define_peephole2 - [(set (cc0) - (compare (match_operand:HI 0 "register_operand" "") - (match_operand:HI 1 "const_int_operand" ""))) - (set (pc) - (if_then_else (match_operator 4 "gtuleu_operator" - [(cc0) (const_int 0)]) - (match_operand 2 "pc_or_label_operand" "") - (match_operand 3 "pc_or_label_operand" "")))] - "peep2_reg_dead_p (1, operands[0]) - && (INTVAL (operands[1]) == 1 - || (TARGET_H8300S && INTVAL (operands[1]) == 3))" - [(parallel [(set (match_dup 0) - (ashiftrt:HI (match_dup 0) - (match_dup 5))) - (clobber (scratch:QI))]) - (set (cc0) (compare (match_dup 0) - (const_int 0))) - (set (pc) - (if_then_else (match_dup 6) - (match_dup 2) - (match_dup 3)))] - { - operands[5] = GEN_INT (exact_log2 (INTVAL (operands[1]) + 1)); - operands[6] = gen_rtx_fmt_ee (GET_CODE (operands[4]) == GTU ? 
NE : EQ, - VOIDmode, cc0_rtx, const0_rtx); - }) - -;; Transform -;; -;; cmp.w #255,r0 -;; bgt .L1 -;; -;; into -;; -;; mov.b r0h,r0h -;; bgt .L1 - -(define_peephole2 - [(set (cc0) - (compare (match_operand:HI 0 "register_operand" "") - (const_int 255))) - (set (pc) - (if_then_else (match_operator 1 "gtle_operator" - [(cc0) (const_int 0)]) - (match_operand 2 "pc_or_label_operand" "") - (match_operand 3 "pc_or_label_operand" "")))] - "" - [(set (cc0) (compare (and:HI (match_dup 0) - (const_int -256)) - (const_int 0))) - (set (pc) - (if_then_else (match_dup 1) - (match_dup 2) - (match_dup 3)))]) - -;; Transform -;; -;; cmp.w #255,r0 -;; bhi .L1 -;; -;; into -;; -;; mov.b r0h,r0h -;; bne .L1 - -(define_peephole2 - [(set (cc0) - (compare (match_operand:HI 0 "register_operand" "") - (const_int 255))) - (set (pc) - (if_then_else (match_operator 1 "gtuleu_operator" - [(cc0) (const_int 0)]) - (match_operand 2 "pc_or_label_operand" "") - (match_operand 3 "pc_or_label_operand" "")))] - "" - [(set (cc0) (compare (and:HI (match_dup 0) - (const_int -256)) - (const_int 0))) - (set (pc) - (if_then_else (match_dup 4) - (match_dup 2) - (match_dup 3)))] - { - operands[4] = gen_rtx_fmt_ee (GET_CODE (operands[1]) == GTU ? NE : EQ, - VOIDmode, cc0_rtx, const0_rtx); - }) - -;; (compare (reg:SI) (const_int)) takes 6 bytes, so we try to achieve -;; the equivalent with shorter sequences. Here is the summary. Cases -;; are grouped for each define_peephole2. -;; -;; reg const_int use insn -;; -------------------------------------------------------- -;; live -2 eq/ne copy and inc.l -;; live -1 eq/ne copy and inc.l -;; live 1 eq/ne copy and dec.l -;; live 2 eq/ne copy and dec.l -;; -;; dead -2 eq/ne inc.l -;; dead -1 eq/ne inc.l -;; dead 1 eq/ne dec.l -;; dead 2 eq/ne dec.l -;; -;; dead -131072 eq/ne inc.w and test -;; dead -65536 eq/ne inc.w and test -;; dead 65536 eq/ne dec.w and test -;; dead 131072 eq/ne dec.w and test -;; -;; dead 0x000000?? except 1 and 2 eq/ne xor.b and test -;; dead 0x0000??00 eq/ne xor.b and test -;; dead 0x0000ffff eq/ne not.w and test -;; -;; dead 0xffffff?? 
except -1 and -2 eq/ne xor.b and not.l -;; dead 0xffff??ff eq/ne xor.b and not.l -;; dead 0x40000000 (H8S) eq/ne rotl.l and dec.l -;; dead 0x80000000 eq/ne rotl.l and dec.l -;; -;; live 1 ge/lt copy and shar.l -;; live 3 (H8S) ge/lt copy and shar.l -;; -;; live 1 geu/ltu copy and shar.l -;; live 3 (H8S) geu/ltu copy and shar.l -;; -;; dead 1 ge/lt shar.l -;; dead 3 (H8S) ge/lt shar.l -;; -;; dead 1 geu/ltu shar.l -;; dead 3 (H8S) geu/ltu shar.l -;; -;; dead 3 (H8/300H) ge/lt and.b and test -;; dead 7 ge/lt and.b and test -;; dead 15 ge/lt and.b and test -;; dead 31 ge/lt and.b and test -;; dead 63 ge/lt and.b and test -;; dead 127 ge/lt and.b and test -;; dead 255 ge/lt and.b and test -;; -;; dead 3 (H8/300H) geu/ltu and.b and test -;; dead 7 geu/ltu and.b and test -;; dead 15 geu/ltu and.b and test -;; dead 31 geu/ltu and.b and test -;; dead 63 geu/ltu and.b and test -;; dead 127 geu/ltu and.b and test -;; dead 255 geu/ltu and.b and test -;; -;; ---- 65535 ge/lt mov.w -;; -;; ---- 65535 geu/ltu mov.w - -;; Transform -;; -;; cmp.l #1,er0 -;; beq .L1 -;; -;; into -;; -;; dec.l #1,er0 -;; beq .L1 - -(define_peephole2 - [(set (cc0) - (compare (match_operand:SI 0 "register_operand" "") - (match_operand:SI 1 "incdec_operand" ""))) - (set (pc) - (if_then_else (match_operator 4 "eqne_operator" - [(cc0) (const_int 0)]) - (match_operand 2 "pc_or_label_operand" "") - (match_operand 3 "pc_or_label_operand" "")))] - "INTVAL (operands[1]) != 0 && peep2_reg_dead_p (1, operands[0])" - [(set (match_dup 0) - (unspec:SI [(match_dup 0) - (match_dup 5)] - UNSPEC_INCDEC)) - (set (cc0) (compare (match_dup 0) - (const_int 0))) - (set (pc) - (if_then_else (match_op_dup 4 [(cc0) (const_int 0)]) - (match_dup 2) - (match_dup 3)))] - { - operands[5] = GEN_INT (- INTVAL (operands[1])); - }) - -;; Transform -;; -;; cmp.l #65536,er0 -;; beq .L1 -;; -;; into -;; -;; dec.l #1,e0 -;; beq .L1 - -(define_peephole2 - [(set (cc0) - (compare (match_operand:SI 0 "register_operand" "") - (match_operand:SI 1 "const_int_operand" ""))) - (set (pc) - (if_then_else (match_operator 4 "eqne_operator" - [(cc0) (const_int 0)]) - (match_operand 2 "pc_or_label_operand" "") - (match_operand 3 "pc_or_label_operand" "")))] - "peep2_reg_dead_p (1, operands[0]) - && (INTVAL (operands[1]) == -131072 - || INTVAL (operands[1]) == -65536 - || INTVAL (operands[1]) == 65536 - || INTVAL (operands[1]) == 131072)" - [(set (match_dup 0) - (plus:SI (match_dup 0) - (match_dup 5))) - (set (cc0) (compare (match_dup 0) - (const_int 0))) - (set (pc) - (if_then_else (match_op_dup 4 [(cc0) (const_int 0)]) - (match_dup 2) - (match_dup 3)))] - { - operands[5] = GEN_INT (- INTVAL (operands[1])); - }) - -;; Transform -;; -;; cmp.l #100,er0 -;; beq .L1 -;; -;; into -;; -;; xor.b #100,er0 -;; mov.l er0,er0 -;; beq .L1 - -(define_peephole2 - [(set (cc0) - (compare (match_operand:SI 0 "register_operand" "") - (match_operand:SI 1 "const_int_operand" ""))) - (set (pc) - (if_then_else (match_operator 4 "eqne_operator" - [(cc0) (const_int 0)]) - (match_operand 2 "pc_or_label_operand" "") - (match_operand 3 "pc_or_label_operand" "")))] - "peep2_reg_dead_p (1, operands[0]) - && ((INTVAL (operands[1]) & 0x00ff) == INTVAL (operands[1]) - || (INTVAL (operands[1]) & 0xff00) == INTVAL (operands[1]) - || INTVAL (operands[1]) == 0x0000ffff) - && INTVAL (operands[1]) != 0 - && INTVAL (operands[1]) != 1 - && INTVAL (operands[1]) != 2" - [(set (match_dup 0) - (xor:SI (match_dup 0) - (match_dup 1))) - (set (cc0) (compare (match_dup 0) - (const_int 0))) - (set (pc) - (if_then_else 
(match_op_dup 4 [(cc0) (const_int 0)]) - (match_dup 2) - (match_dup 3)))]) - -;; Transform -;; -;; cmp.l #-100,er0 -;; beq .L1 -;; -;; into -;; -;; xor.b #99,er0 -;; not.l er0 -;; beq .L1 - -(define_peephole2 - [(set (cc0) - (compare (match_operand:SI 0 "register_operand" "") - (match_operand:SI 1 "const_int_operand" ""))) - (set (pc) - (if_then_else (match_operator 4 "eqne_operator" - [(cc0) (const_int 0)]) - (match_operand 2 "pc_or_label_operand" "") - (match_operand 3 "pc_or_label_operand" "")))] - "peep2_reg_dead_p (1, operands[0]) - && ((INTVAL (operands[1]) | 0x00ff) == -1 - || (INTVAL (operands[1]) | 0xff00) == -1) - && INTVAL (operands[1]) != -1 - && INTVAL (operands[1]) != -2" - [(set (match_dup 0) - (xor:SI (match_dup 0) - (match_dup 5))) - (set (match_dup 0) - (not:SI (match_dup 0))) - (set (cc0) (compare (match_dup 0) - (const_int 0))) - (set (pc) - (if_then_else (match_op_dup 4 [(cc0) (const_int 0)]) - (match_dup 2) - (match_dup 3)))] - { - operands[5] = GEN_INT (INTVAL (operands[1]) ^ -1); - }) - -;; Transform -;; -;; cmp.l #-2147483648,er0 -;; beq .L1 -;; -;; into -;; -;; rotl.l er0 -;; dec.l #1,er0 -;; beq .L1 - -(define_peephole2 - [(set (cc0) - (compare (match_operand:SI 0 "register_operand" "") - (match_operand:SI 1 "const_int_operand" ""))) - (set (pc) - (if_then_else (match_operator 4 "eqne_operator" - [(cc0) (const_int 0)]) - (match_operand 2 "pc_or_label_operand" "") - (match_operand 3 "pc_or_label_operand" "")))] - "peep2_reg_dead_p (1, operands[0]) - && (INTVAL (operands[1]) == -2147483647 - 1 - || (TARGET_H8300S && INTVAL (operands[1]) == 1073741824))" - [(set (match_dup 0) - (rotate:SI (match_dup 0) - (match_dup 5))) - (set (match_dup 0) - (unspec:SI [(match_dup 0) - (const_int -1)] - UNSPEC_INCDEC)) - (set (cc0) (compare (match_dup 0) - (const_int 0))) - (set (pc) - (if_then_else (match_op_dup 4 [(cc0) (const_int 0)]) - (match_dup 2) - (match_dup 3)))] - { - operands[5] = GEN_INT (INTVAL (operands[1]) == -2147483647 - 1 ? 1 : 2); - }) - -;; Transform -;; -;; cmp.l #1,er0 -;; bgt .L1 -;; -;; into -;; -;; mov.l er0,er1 -;; shar.l er1 -;; bgt .L1 - -;; We avoid this transformation if we see more than one copy of the -;; same compare insn immediately before this one. - -(define_peephole2 - [(match_scratch:SI 5 "r") - (set (cc0) - (compare (match_operand:SI 0 "register_operand" "") - (match_operand:SI 1 "const_int_operand" ""))) - (set (pc) - (if_then_else (match_operator 4 "gtle_operator" - [(cc0) (const_int 0)]) - (match_operand 2 "pc_or_label_operand" "") - (match_operand 3 "pc_or_label_operand" "")))] - "!peep2_reg_dead_p (1, operands[0]) - && (INTVAL (operands[1]) == 1 - || (TARGET_H8300S && INTVAL (operands[1]) == 3)) - && !same_cmp_preceding_p (insn)" - [(set (match_dup 5) - (match_dup 0)) - (parallel [(set (match_dup 5) - (ashiftrt:SI (match_dup 5) - (match_dup 6))) - (clobber (scratch:QI))]) - (set (cc0) (compare (match_dup 5) - (const_int 0))) - (set (pc) - (if_then_else (match_dup 4) - (match_dup 2) - (match_dup 3)))] - { - operands[6] = GEN_INT (exact_log2 (INTVAL (operands[1]) + 1)); - }) - -;; Transform -;; -;; cmp.l #1,er0 -;; bhi .L1 -;; -;; into -;; -;; mov.l er0,er1 -;; shar.l er1 -;; bne .L1 - -;; We avoid this transformation if we see more than one copy of the -;; same compare insn immediately before this one. 
- -(define_peephole2 - [(match_scratch:SI 5 "r") - (set (cc0) - (compare (match_operand:SI 0 "register_operand" "") - (match_operand:SI 1 "const_int_operand" ""))) - (set (pc) - (if_then_else (match_operator 4 "gtuleu_operator" - [(cc0) (const_int 0)]) - (match_operand 2 "pc_or_label_operand" "") - (match_operand 3 "pc_or_label_operand" "")))] - "!peep2_reg_dead_p (1, operands[0]) - && (INTVAL (operands[1]) == 1 - || (TARGET_H8300S && INTVAL (operands[1]) == 3)) - && !same_cmp_preceding_p (insn)" - [(set (match_dup 5) - (match_dup 0)) - (parallel [(set (match_dup 5) - (ashiftrt:SI (match_dup 5) - (match_dup 6))) - (clobber (scratch:QI))]) - (set (cc0) (compare (match_dup 5) - (const_int 0))) - (set (pc) - (if_then_else (match_dup 7) - (match_dup 2) - (match_dup 3)))] - { - operands[6] = GEN_INT (exact_log2 (INTVAL (operands[1]) + 1)); - operands[7] = gen_rtx_fmt_ee (GET_CODE (operands[4]) == GTU ? NE : EQ, - VOIDmode, cc0_rtx, const0_rtx); - }) - -;; Transform -;; -;; cmp.l #1,er0 -;; bgt .L1 -;; -;; into -;; -;; shar.l er0 -;; bgt .L1 - -(define_peephole2 - [(set (cc0) - (compare (match_operand:SI 0 "register_operand" "") - (match_operand:SI 1 "const_int_operand" ""))) - (set (pc) - (if_then_else (match_operator 4 "gtle_operator" - [(cc0) (const_int 0)]) - (match_operand 2 "pc_or_label_operand" "") - (match_operand 3 "pc_or_label_operand" "")))] - "peep2_reg_dead_p (1, operands[0]) - && (INTVAL (operands[1]) == 1 - || (TARGET_H8300S && INTVAL (operands[1]) == 3))" - [(parallel [(set (match_dup 0) - (ashiftrt:SI (match_dup 0) - (match_dup 5))) - (clobber (scratch:QI))]) - (set (cc0) (compare (match_dup 0) - (const_int 0))) - (set (pc) - (if_then_else (match_dup 4) - (match_dup 2) - (match_dup 3)))] - { - operands[5] = GEN_INT (exact_log2 (INTVAL (operands[1]) + 1)); - }) - -;; Transform -;; -;; cmp.l #1,er0 -;; bhi .L1 -;; -;; into -;; -;; shar.l er0 -;; bne .L1 - -(define_peephole2 - [(set (cc0) - (compare (match_operand:SI 0 "register_operand" "") - (match_operand:SI 1 "const_int_operand" ""))) - (set (pc) - (if_then_else (match_operator 4 "gtuleu_operator" - [(cc0) (const_int 0)]) - (match_operand 2 "pc_or_label_operand" "") - (match_operand 3 "pc_or_label_operand" "")))] - "peep2_reg_dead_p (1, operands[0]) - && (INTVAL (operands[1]) == 1 - || (TARGET_H8300S && INTVAL (operands[1]) == 3))" - [(parallel [(set (match_dup 0) - (ashiftrt:SI (match_dup 0) - (match_dup 5))) - (clobber (scratch:QI))]) - (set (cc0) (compare (match_dup 0) - (const_int 0))) - (set (pc) - (if_then_else (match_dup 6) - (match_dup 2) - (match_dup 3)))] - { - operands[5] = GEN_INT (exact_log2 (INTVAL (operands[1]) + 1)); - operands[6] = gen_rtx_fmt_ee (GET_CODE (operands[4]) == GTU ? 
NE : EQ, - VOIDmode, cc0_rtx, const0_rtx); - }) - -;; Transform -;; -;; cmp.l #15,er0 -;; bgt .L1 -;; -;; into -;; -;; and #240,r0l -;; mov.l er0,er0 -;; bgt .L1 - -(define_peephole2 - [(set (cc0) - (compare (match_operand:SI 0 "register_operand" "") - (match_operand:SI 1 "const_int_operand" ""))) - (set (pc) - (if_then_else (match_operator 4 "gtle_operator" - [(cc0) (const_int 0)]) - (match_operand 2 "pc_or_label_operand" "") - (match_operand 3 "pc_or_label_operand" "")))] - "peep2_reg_dead_p (1, operands[0]) - && (INTVAL (operands[1]) == 3 - || INTVAL (operands[1]) == 7 - || INTVAL (operands[1]) == 15 - || INTVAL (operands[1]) == 31 - || INTVAL (operands[1]) == 63 - || INTVAL (operands[1]) == 127 - || INTVAL (operands[1]) == 255)" - [(set (match_dup 0) - (and:SI (match_dup 0) - (match_dup 5))) - (set (cc0) (compare (match_dup 0) - (const_int 0))) - (set (pc) - (if_then_else (match_dup 4) - (match_dup 2) - (match_dup 3)))] - { - operands[5] = GEN_INT (~INTVAL (operands[1])); - }) - -;; Transform -;; -;; cmp.l #15,er0 -;; bhi .L1 -;; -;; into -;; -;; and #240,r0l -;; mov.l er0,er0 -;; bne .L1 - -(define_peephole2 - [(set (cc0) - (compare (match_operand:SI 0 "register_operand" "") - (match_operand:SI 1 "const_int_operand" ""))) - (set (pc) - (if_then_else (match_operator 4 "gtuleu_operator" - [(cc0) (const_int 0)]) - (match_operand 2 "pc_or_label_operand" "") - (match_operand 3 "pc_or_label_operand" "")))] - "peep2_reg_dead_p (1, operands[0]) - && ((TARGET_H8300H && INTVAL (operands[1]) == 3) - || INTVAL (operands[1]) == 7 - || INTVAL (operands[1]) == 15 - || INTVAL (operands[1]) == 31 - || INTVAL (operands[1]) == 63 - || INTVAL (operands[1]) == 127 - || INTVAL (operands[1]) == 255)" - [(set (match_dup 0) - (and:SI (match_dup 0) - (match_dup 5))) - (set (cc0) (compare (match_dup 0) - (const_int 0))) - (set (pc) - (if_then_else (match_dup 6) - (match_dup 2) - (match_dup 3)))] - { - operands[5] = GEN_INT (~INTVAL (operands[1])); - operands[6] = gen_rtx_fmt_ee (GET_CODE (operands[4]) == GTU ? NE : EQ, - VOIDmode, cc0_rtx, const0_rtx); - }) - -;; Transform -;; -;; cmp.l #65535,er0 -;; bgt .L1 -;; -;; into -;; -;; mov.l e0,e0 -;; bgt .L1 - -(define_peephole2 - [(set (cc0) - (compare (match_operand:SI 0 "register_operand" "") - (const_int 65535))) - (set (pc) - (if_then_else (match_operator 1 "gtle_operator" - [(cc0) (const_int 0)]) - (match_operand 2 "pc_or_label_operand" "") - (match_operand 3 "pc_or_label_operand" "")))] - "" - [(set (cc0) (compare (and:SI (match_dup 0) - (const_int -65536)) - (const_int 0))) - (set (pc) - (if_then_else (match_dup 1) - (match_dup 2) - (match_dup 3)))]) - -;; Transform -;; -;; cmp.l #65535,er0 -;; bhi .L1 -;; -;; into -;; -;; mov.l e0,e0 -;; bne .L1 - -(define_peephole2 - [(set (cc0) - (compare (match_operand:SI 0 "register_operand" "") - (const_int 65535))) - (set (pc) - (if_then_else (match_operator 1 "gtuleu_operator" - [(cc0) (const_int 0)]) - (match_operand 2 "pc_or_label_operand" "") - (match_operand 3 "pc_or_label_operand" "")))] - "" - [(set (cc0) (compare (and:SI (match_dup 0) - (const_int -65536)) - (const_int 0))) - (set (pc) - (if_then_else (match_dup 4) - (match_dup 2) - (match_dup 3)))] - { - operands[4] = gen_rtx_fmt_ee (GET_CODE (operands[1]) == GTU ? NE : EQ, - VOIDmode, cc0_rtx, const0_rtx); - }) - -;; Transform -;; -;; cmp.l #1,er0 -;; beq .L1 -;; -;; into -;; -;; mov.l er0,er1 -;; dec.l #1,er1 -;; beq .L1 - -;; We avoid this transformation if we see more than one copy of the -;; same compare insn. 
- -(define_peephole2 - [(match_scratch:SI 5 "r") - (set (cc0) - (compare (match_operand:SI 0 "register_operand" "") - (match_operand:SI 1 "incdec_operand" ""))) - (set (pc) - (if_then_else (match_operator 4 "eqne_operator" - [(cc0) (const_int 0)]) - (match_operand 2 "pc_or_label_operand" "") - (match_operand 3 "pc_or_label_operand" "")))] - "INTVAL (operands[1]) != 0 - && !peep2_reg_dead_p (1, operands[0]) - && !same_cmp_following_p (insn)" - [(set (match_dup 5) - (match_dup 0)) - (set (match_dup 5) - (unspec:SI [(match_dup 5) - (match_dup 6)] - UNSPEC_INCDEC)) - (set (cc0) (compare (match_dup 5) - (const_int 0))) - (set (pc) - (if_then_else (match_op_dup 4 [(cc0) (const_int 0)]) - (match_dup 2) - (match_dup 3)))] - { - operands[6] = GEN_INT (- INTVAL (operands[1])); - }) - -;; Narrow the mode of testing if possible. - -(define_peephole2 - [(set (match_operand:HSI 0 "register_operand" "") - (and:HSI (match_dup 0) - (match_operand:HSI 1 "const_int_operand" ""))) - (set (cc0) (compare (match_dup 0) - (const_int 0))) - (set (pc) - (if_then_else (match_operator 4 "eqne_operator" - [(cc0) (const_int 0)]) - (match_operand 2 "pc_or_label_operand" "") - (match_operand 3 "pc_or_label_operand" "")))] - "((const_int_qi_operand (operands[1], QImode) - || (GET_MODE (operands[0]) == SImode - && const_int_hi_operand (operands[1], HImode))) - && peep2_reg_dead_p (2, operands[0]))" - [(set (match_dup 5) (match_dup 7)) - (set (cc0) (compare (match_dup 5) - (const_int 0))) - (set (pc) - (if_then_else (match_op_dup 4 [(cc0) (const_int 0)]) - (match_dup 2) - (match_dup 3)))] - { - enum machine_mode mode; - - mode = const_int_qi_operand (operands[1], QImode) ? QImode : HImode; - operands[5] = gen_rtx_REG (mode, REGNO (operands[0])); - operands[6] = gen_int_mode (INTVAL (operands[1]), mode); - operands[7] = gen_rtx_AND (mode, operands[5], operands[6]); - }) - ;; These triggers right at the end of allocation of locals in the ;; prologue (and possibly at other places). @@ -1367,46 +460,6 @@ XEXP (operands[4], 0) = operands[1]; }) -;; Transform -;; -;; mov src1,reg -;; cmp reg,src2 -;; -;; into -;; -;; cmp src1,src2 -;; -;; if "reg" dies in the comparison. - -(define_peephole2 - [(set (match_operand 0 "register_operand" "") - (match_operand 1 "h8300_dst_operand" "")) - (set (cc0) - (compare (match_dup 0) - (match_operand 2 "h8300_src_operand" "")))] - "TARGET_H8300SX - && peep2_reg_dead_p (2, operands[0]) - && !reg_overlap_mentioned_p (operands[0], operands[2]) - && operands[2] != const0_rtx" - [(set (cc0) - (compare (match_dup 1) - (match_dup 2)))]) - -;; Likewise for the second operand. - -(define_peephole2 - [(set (match_operand 0 "register_operand" "") - (match_operand 1 "h8300_src_operand" "")) - (set (cc0) - (compare (match_operand 2 "h8300_dst_operand" "") - (match_dup 0)))] - "TARGET_H8300SX - && peep2_reg_dead_p (2, operands[0]) - && !reg_overlap_mentioned_p (operands[0], operands[2])" - [(set (cc0) - (compare (match_dup 2) - (match_dup 1)))]) - ;; Combine two moves. 
(define_peephole2 diff --git a/gcc/config/i386/x86-tune-sched.c b/gcc/config/i386/x86-tune-sched.c index 6d8bca9..2e5ee4e 100644 --- a/gcc/config/i386/x86-tune-sched.c +++ b/gcc/config/i386/x86-tune-sched.c @@ -181,7 +181,6 @@ exact_dependency_1 (rtx addr, rtx insn) case SYMBOL_REF: case CODE_LABEL: case PC: - case CC0: case EXPR_LIST: return false; default: diff --git a/gcc/config/m68k/m68k.c b/gcc/config/m68k/m68k.c index 40bdcb0..3f63c60 100644 --- a/gcc/config/m68k/m68k.c +++ b/gcc/config/m68k/m68k.c @@ -1993,8 +1993,6 @@ m68k_output_btst (rtx countop, rtx dataop, rtx_code code, int signpos) count == 0 followed by bcc/bcs are also possible, but need m68k-specific CC_Z_IN_NOT_V and CC_Z_IN_NOT_C flags. */ } - - cc_status.flags = CC_NOT_NEGATIVE; } output_asm_insn ("btst %0,%1", ops); return code; diff --git a/gcc/config/rl78/rl78.c b/gcc/config/rl78/rl78.c index f275cd3..4c34949 100644 --- a/gcc/config/rl78/rl78.c +++ b/gcc/config/rl78/rl78.c @@ -3854,7 +3854,6 @@ rl78_note_reg_uses (char *dead, rtx s, rtx insn) /* These codes have no constituent expressions and are unique. */ case SCRATCH: - case CC0: case PC: return; diff --git a/gcc/config/sparc/sparc.c b/gcc/config/sparc/sparc.c index 3b4d416..b6e66dc 100644 --- a/gcc/config/sparc/sparc.c +++ b/gcc/config/sparc/sparc.c @@ -8809,7 +8809,6 @@ epilogue_renumber (rtx *where, int test) *where = gen_rtx_REG (GET_MODE (*where), OUTGOING_REGNO (REGNO(*where))); /* fallthrough */ case SCRATCH: - case CC0: case PC: case CONST_INT: case CONST_WIDE_INT: diff --git a/gcc/config/xtensa/xtensa.c b/gcc/config/xtensa/xtensa.c index 9a661dd..f4f8f19 100644 --- a/gcc/config/xtensa/xtensa.c +++ b/gcc/config/xtensa/xtensa.c @@ -898,7 +898,7 @@ gen_conditional_move (enum rtx_code code, machine_mode mode, code = GE; op1 = const0_rtx; } - cmp = gen_rtx_fmt_ee (code, VOIDmode, cc0_rtx, const0_rtx); + cmp = gen_rtx_fmt_ee (code, VOIDmode, pc_rtx, const0_rtx); if (boolean_operator (cmp, VOIDmode)) { -- cgit v1.1 From b50ccaf6dd743c373af95e90935b9a2b72157f3a Mon Sep 17 00:00:00 2001 From: Jeff Law Date: Tue, 4 May 2021 08:56:28 -0600 Subject: Make bfin-elf build again gcc/ * config/bfin/bfin.h (NOTICE_UPDATE_CC): Remove. --- gcc/config/bfin/bfin.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/bfin/bfin.h b/gcc/config/bfin/bfin.h index f282d05..823ca2d 100644 --- a/gcc/config/bfin/bfin.h +++ b/gcc/config/bfin/bfin.h @@ -786,8 +786,6 @@ typedef struct { || GET_CODE (X) == LABEL_REF \ || (GET_CODE (X) == CONST && symbolic_reference_mentioned_p (X))) -#define NOTICE_UPDATE_CC(EXPR, INSN) 0 - /* Max number of bytes we can move from memory to memory in one reasonably fast instruction. */ #define MOVE_MAX UNITS_PER_WORD -- cgit v1.1 From 8b5b814d51ff73bc739c0c037ae18df07acf2d96 Mon Sep 17 00:00:00 2001 From: David Edelsohn Date: Tue, 4 May 2021 09:43:40 -0400 Subject: aix: encode function section AIX XCOFF symbols can be labels or qualnames (names with an appended mapping class). CSECTs must be declared with a mapping class. Within an assembler file, the symbol names with and without the mapping class are unique. An object file symbol table only presents the symbol name without the mapping class, but the section of the symbol depends on the mapping class. The AIX XCOFF assembly language does not support first class aliases. GCC implements symbol aliases by emitting additional labels for the function or object. 
When GCC encodes sections for a DECL, it must distinguish between the primary definition and the aliases, which don't have a mapping class encoding. .globl foo[DS] .globl .foo .globl foo1 .globl .foo1 .csect foo[DS] foo: foo1: .long .foo, TOC[tc0] 0 .csect .foo[PR] .foo: .foo1: The CSECT foo[DS] and label foo are distinct. foo1 is another label (alias) for foo, and .foo1 is another label (alias) for .foo. foo is the function descriptor and .foo is the code. This patch adds the [DS] mapping class to the encoding of FUNCTION_DECL but ensures that mapping class is not added to function aliases. rs6000_output_mi_thunk is updated to emit the function name that matches the behavior of GCC final.c for normal functions: get_fnname_from_decl based on the RTL name, not the DECL name. * config/rs6000/rs6000-call.c (rs6000_output_mi_thunk): Use get_fnname_from_decl for name of thunk. * config/rs6000/rs6000.c (rs6000_declare_alias): Use assemble_name and ASM_OUTPUT_LABEL. (rs6000_xcoff_declare_function_name): Use assemble_name and ASM_OUTPUT_LABEL. (rs6000_xcoff_declare_object_name): Use ASM_OUTPUT_LABEL. (rs6000_xcoff_encode_section_info): Don't add mapping class for aliases. Always add [DS] mapping class to primary FUNCTION_DECL. (rs6000_asm_weaken_decl): Don't explicitly add [DS]. --- gcc/config/rs6000/rs6000-call.c | 2 +- gcc/config/rs6000/rs6000.c | 41 ++++++++++++++++++++--------------------- 2 files changed, 21 insertions(+), 22 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c index 6f6dc47..c4332a6 100644 --- a/gcc/config/rs6000/rs6000-call.c +++ b/gcc/config/rs6000/rs6000-call.c @@ -15077,7 +15077,7 @@ rs6000_output_mi_thunk (FILE *file, tree thunk_fndecl ATTRIBUTE_UNUSED, HOST_WIDE_INT delta, HOST_WIDE_INT vcall_offset, tree function) { - const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk_fndecl)); + const char *fnname = get_fnname_from_decl (thunk_fndecl); rtx this_rtx, funexp; rtx_insn *insn; diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 0e9cf17..ee15af9 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -21468,7 +21468,7 @@ rs6000_declare_alias (struct symtab_node *n, void *d) putc ('\n', data->file); } fputs ("\t.globl ", data->file); - RS6000_OUTPUT_BASENAME (data->file, buffer); + assemble_name (data->file, buffer); putc ('\n', data->file); } #ifdef ASM_WEAKEN_DECL @@ -21491,13 +21491,12 @@ rs6000_declare_alias (struct symtab_node *n, void *d) putc ('\n', data->file); } fputs ("\t.lglobl ", data->file); - RS6000_OUTPUT_BASENAME (data->file, buffer); + assemble_name (data->file, buffer); putc ('\n', data->file); } if (data->function_descriptor) - fputs (".", data->file); - RS6000_OUTPUT_BASENAME (data->file, buffer); - fputs (":\n", data->file); + putc ('.', data->file); + ASM_OUTPUT_LABEL (data->file, buffer); return false; } @@ -21574,21 +21573,24 @@ rs6000_xcoff_declare_function_name (FILE *file, const char *name, tree decl) RS6000_OUTPUT_BASENAME (file, buffer); putc ('\n', file); } + fputs ("\t.csect ", file); - RS6000_OUTPUT_BASENAME (file, buffer); - fputs (TARGET_32BIT ? "[DS]\n" : "[DS],3\n", file); - RS6000_OUTPUT_BASENAME (file, buffer); - fputs (":\n", file); + assemble_name (file, buffer); + fputs (TARGET_32BIT ? "\n" : ",3\n", file); + + ASM_OUTPUT_LABEL (file, buffer); + symtab_node::get (decl)->call_for_symbol_and_aliases (rs6000_declare_alias, &data, true); fputs (TARGET_32BIT ? "\t.long ." 
: "\t.llong .", file); RS6000_OUTPUT_BASENAME (file, buffer); fputs (", TOC[tc0], 0\n", file); + in_section = NULL; switch_to_section (function_section (decl)); putc ('.', file); - RS6000_OUTPUT_BASENAME (file, buffer); - fputs (":\n", file); + ASM_OUTPUT_LABEL (file, buffer); + data.function_descriptor = true; symtab_node::get (decl)->call_for_symbol_and_aliases (rs6000_declare_alias, &data, true); @@ -21683,8 +21685,7 @@ void rs6000_xcoff_declare_object_name (FILE *file, const char *name, tree decl) { struct declare_alias_data data = {file, false}; - RS6000_OUTPUT_BASENAME (file, name); - fputs (":\n", file); + ASM_OUTPUT_LABEL (file, name); symtab_node::get_create (decl)->call_for_symbol_and_aliases (rs6000_declare_alias, &data, true); } @@ -21740,20 +21741,19 @@ rs6000_xcoff_encode_section_info (tree decl, rtx rtl, int first) symname = XSTR (symbol, 0); - /* Append CSECT mapping class, unless the symbol already is qualified. */ + /* Append CSECT mapping class, unless the symbol already is qualified. + Aliases are implemented as labels, so the symbol name should not add + a mapping class. */ if (decl && DECL_P (decl) && VAR_OR_FUNCTION_DECL_P (decl) - && lookup_attribute ("alias", DECL_ATTRIBUTES (decl)) == NULL_TREE + && symtab_node::get (decl)->alias == 0 && symname[strlen (symname) - 1] != ']') { const char *smclass = NULL; if (TREE_CODE (decl) == FUNCTION_DECL) - { - if (DECL_EXTERNAL (decl)) - smclass = "[DS]"; - } + smclass = "[DS]"; else if (DECL_THREAD_LOCAL_P (decl)) { if (bss_initializer_p (decl)) @@ -21796,8 +21796,6 @@ rs6000_asm_weaken_decl (FILE *stream, tree decl, if (decl && TREE_CODE (decl) == FUNCTION_DECL && DEFAULT_ABI == ABI_AIX && DOT_SYMBOLS) { - if (TARGET_XCOFF && name[strlen (name) - 1] != ']') - fputs ("[DS]", stream); #if TARGET_XCOFF && HAVE_GAS_HIDDEN if (TARGET_XCOFF) fputs (rs6000_xcoff_visibility (decl), stream); @@ -21810,6 +21808,7 @@ rs6000_asm_weaken_decl (FILE *stream, tree decl, fputs (rs6000_xcoff_visibility (decl), stream); #endif fputc ('\n', stream); + if (val) { #ifdef ASM_OUTPUT_DEF -- cgit v1.1 From 9aed32cc8f1baca398a203ddf4df2f80f43562c1 Mon Sep 17 00:00:00 2001 From: Alexandre Oliva Date: Tue, 4 May 2021 21:49:41 -0300 Subject: restore EH on x86-vx7r2 x86-vx7r2 needs svr4_dbx_register_map, but the default in i386/i386.h was dbx_register_map, partially swapping ebp and esp in unwind info. i386/vxworks.h had a correct overrider, but it was conditional for vxworks < 7. This patch reenables the overrider unconditionally. for gcc/ChangeLog * config/i386/vxworks.h (DBX_REGISTER_NUMBER): Make it unconditional. --- gcc/config/i386/vxworks.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/vxworks.h b/gcc/config/i386/vxworks.h index b3ca224..ebda7d9 100644 --- a/gcc/config/i386/vxworks.h +++ b/gcc/config/i386/vxworks.h @@ -37,13 +37,6 @@ along with GCC; see the file COPYING3. If not see #define TARGET_SUBTARGET_DEFAULT \ (MASK_80387 | MASK_IEEE_FP | MASK_FLOAT_RETURNS | MASK_VECT8_RETURNS) -/* Provide our target specific DBX_REGISTER_NUMBER. VxWorks relies on - the SVR4 numbering. */ - -#undef DBX_REGISTER_NUMBER -#define DBX_REGISTER_NUMBER(n) \ - (TARGET_64BIT ? dbx64_register_map[n] : svr4_dbx_register_map[n]) - #undef PTRDIFF_TYPE #define PTRDIFF_TYPE (TARGET_LP64 ? "long int" : "int") @@ -61,6 +54,13 @@ along with GCC; see the file COPYING3. If not see #endif +/* Provide our target specific DBX_REGISTER_NUMBER. VxWorks relies on + the SVR4 numbering. 
*/ + +#undef DBX_REGISTER_NUMBER +#define DBX_REGISTER_NUMBER(n) \ + (TARGET_64BIT ? dbx64_register_map[n] : svr4_dbx_register_map[n]) + /* CPU macro definitions, ordered to account for VxWorks 7 not supporting CPUs older than PENTIUM4 since SR0650. */ -- cgit v1.1 From f3661f2d63fbc5fd30c24d22137691e16b0a0a17 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 5 May 2021 15:07:25 +0200 Subject: i386: Implement integer vector compares for 64bit vectors [PR98218] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement integer vector compares for 64bit vectors for TARGET_MMX_WITH_SSE. 2021-05-05 Uroš Bizjak gcc/ PR target/98218 * config/i386/i386-expand.c (ix86_expand_int_sse_cmp): Handle V8QI, V4HI and V2SI modes. * config/i386/i386.c (ix86_build_const_vector): Handle V2SImode. (ix86_build_signbit_mask): Ditto. * config/i386/mmx.md (MMXMODE14): New mode iterator. (3): New expander. (*mmx_3): New insn pattern. (3): New expander. (*mmx_3): New insn pattern. (vec_cmp): New expander. (vec_cmpu): Ditto. (vcond): Ditto. (vcondu): Ditto. (vcond_mask_): Ditto. gcc/testsuite/ PR target/98218 * gcc.target/i386/pr98218-1.c: New test. * gcc.target/i386/pr98218-1a.c: Ditto. * gcc.target/i386/pr98218-2.c: Ditto. * gcc.target/i386/pr98218-2a.c: Ditto. * gcc.target/i386/pr98218-3.c: Ditto. * gcc.target/i386/pr98218-3a.c: Ditto. * gcc.dg/vect/vect-bool-cmp.c (dg-final): Scan vect tree dump for "LOOP VECTORIZED", not VECTORIZED. --- gcc/config/i386/i386-expand.c | 19 +++++++ gcc/config/i386/i386.c | 2 + gcc/config/i386/mmx.md | 118 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 139 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index fee4d07..4dfe7d6 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -4204,16 +4204,32 @@ ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1, else if (code == GT && TARGET_SSE4_1) gen = gen_sminv16qi3; break; + case E_V8QImode: + if (code == GTU && TARGET_SSE2) + gen = gen_uminv8qi3; + else if (code == GT && TARGET_SSE4_1) + gen = gen_sminv8qi3; + break; case E_V8HImode: if (code == GTU && TARGET_SSE4_1) gen = gen_uminv8hi3; else if (code == GT && TARGET_SSE2) gen = gen_sminv8hi3; break; + case E_V4HImode: + if (code == GTU && TARGET_SSE4_1) + gen = gen_uminv4hi3; + else if (code == GT && TARGET_SSE2) + gen = gen_sminv4hi3; + break; case E_V4SImode: if (TARGET_SSE4_1) gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3; break; + case E_V2SImode: + if (TARGET_SSE4_1) + gen = (code == GTU) ? gen_uminv2si3 : gen_sminv2si3; + break; case E_V2DImode: if (TARGET_AVX512VL) { @@ -4254,6 +4270,7 @@ ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1, case E_V8SImode: case E_V4DImode: case E_V4SImode: + case E_V2SImode: case E_V2DImode: { rtx t1, t2, mask; @@ -4278,7 +4295,9 @@ ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1, case E_V32QImode: case E_V16HImode: case E_V16QImode: + case E_V8QImode: case E_V8HImode: + case E_V4HImode: /* Perform a parallel unsigned saturating subtraction. 
*/ x = gen_reg_rtx (mode); emit_insn (gen_rtx_SET diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 780da10..06b0f58 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -15284,6 +15284,7 @@ ix86_build_const_vector (machine_mode mode, bool vect, rtx value) case E_V16SImode: case E_V8SImode: case E_V4SImode: + case E_V2SImode: case E_V8DImode: case E_V4DImode: case E_V2DImode: @@ -15334,6 +15335,7 @@ ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert) case E_V8SFmode: case E_V4SFmode: case E_V2SFmode: + case E_V2SImode: vec_mode = mode; imode = SImode; break; diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 4c2b724..347295a 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -52,6 +52,7 @@ ;; Mix-n-match (define_mode_iterator MMXMODE12 [V8QI V4HI]) +(define_mode_iterator MMXMODE14 [V8QI V2SI]) (define_mode_iterator MMXMODE24 [V4HI V2SI]) (define_mode_iterator MMXMODE248 [V4HI V2SI V1DI]) @@ -1417,6 +1418,31 @@ (set_attr "type" "mmxmul,ssemul,ssemul") (set_attr "mode" "DI,TI,TI")]) +(define_expand "3" + [(set (match_operand:MMXMODE14 0 "register_operand") + (smaxmin:MMXMODE14 + (match_operand:MMXMODE14 1 "register_operand") + (match_operand:MMXMODE14 2 "register_operand")))] + "TARGET_MMX_WITH_SSE && TARGET_SSE4_1" + "ix86_fixup_binary_operands_no_copy (, mode, operands);") + +(define_insn "*mmx_3" + [(set (match_operand:MMXMODE14 0 "register_operand" "=Yr,*x,Yv") + (smaxmin:MMXMODE14 + (match_operand:MMXMODE14 1 "register_operand" "%0,0,Yv") + (match_operand:MMXMODE14 2 "register_operand" "Yr,*x,Yv")))] + "TARGET_MMX_WITH_SSE && TARGET_SSE4_1 + && ix86_binary_operator_ok (, mode, operands)" + "@ + p\t{%2, %0|%0, %2} + p\t{%2, %0|%0, %2} + vp\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" "noavx,noavx,avx") + (set_attr "type" "sseiadd") + (set_attr "prefix_extra" "1,1,*") + (set_attr "prefix" "orig,orig,vex") + (set_attr "mode" "TI")]) + (define_expand "mmx_v4hi3" [(set (match_operand:V4HI 0 "register_operand") (smaxmin:V4HI @@ -1451,6 +1477,31 @@ (set_attr "type" "mmxadd,sseiadd,sseiadd") (set_attr "mode" "DI,TI,TI")]) +(define_expand "3" + [(set (match_operand:MMXMODE24 0 "register_operand") + (umaxmin:MMXMODE24 + (match_operand:MMXMODE24 1 "register_operand") + (match_operand:MMXMODE24 2 "register_operand")))] + "TARGET_MMX_WITH_SSE && TARGET_SSE4_1" + "ix86_fixup_binary_operands_no_copy (, mode, operands);") + +(define_insn "*mmx_3" + [(set (match_operand:MMXMODE24 0 "register_operand" "=Yr,*x,Yv") + (umaxmin:MMXMODE24 + (match_operand:MMXMODE24 1 "register_operand" "%0,0,Yv") + (match_operand:MMXMODE24 2 "register_operand" "Yr,*x,Yv")))] + "TARGET_MMX_WITH_SSE && TARGET_SSE4_1 + && ix86_binary_operator_ok (, mode, operands)" + "@ + p\t{%2, %0|%0, %2} + p\t{%2, %0|%0, %2} + vp\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" "noavx,noavx,avx") + (set_attr "type" "sseiadd") + (set_attr "prefix_extra" "1,1,*") + (set_attr "prefix" "orig,orig,vex") + (set_attr "mode" "TI")]) + (define_expand "mmx_v8qi3" [(set (match_operand:V8QI 0 "register_operand") (umaxmin:V8QI @@ -1582,6 +1633,73 @@ (set_attr "type" "mmxcmp,ssecmp,ssecmp") (set_attr "mode" "DI,TI,TI")]) +(define_expand "vec_cmp" + [(set (match_operand:MMXMODEI 0 "register_operand") + (match_operator:MMXMODEI 1 "" + [(match_operand:MMXMODEI 2 "register_operand") + (match_operand:MMXMODEI 3 "register_operand")]))] + "TARGET_MMX_WITH_SSE" +{ + bool ok = ix86_expand_int_vec_cmp (operands); + gcc_assert (ok); + DONE; +}) + +(define_expand "vec_cmpu" + [(set 
(match_operand:MMXMODEI 0 "register_operand") + (match_operator:MMXMODEI 1 "" + [(match_operand:MMXMODEI 2 "register_operand") + (match_operand:MMXMODEI 3 "register_operand")]))] + "TARGET_MMX_WITH_SSE" +{ + bool ok = ix86_expand_int_vec_cmp (operands); + gcc_assert (ok); + DONE; +}) + +(define_expand "vcond" + [(set (match_operand:MMXMODEI 0 "register_operand") + (if_then_else:MMXMODEI + (match_operator 3 "" + [(match_operand:MMXMODEI 4 "register_operand") + (match_operand:MMXMODEI 5 "register_operand")]) + (match_operand:MMXMODEI 1) + (match_operand:MMXMODEI 2)))] + "TARGET_MMX_WITH_SSE" +{ + bool ok = ix86_expand_int_vcond (operands); + gcc_assert (ok); + DONE; +}) + +(define_expand "vcondu" + [(set (match_operand:MMXMODEI 0 "register_operand") + (if_then_else:MMXMODEI + (match_operator 3 "" + [(match_operand:MMXMODEI 4 "register_operand") + (match_operand:MMXMODEI 5 "register_operand")]) + (match_operand:MMXMODEI 1) + (match_operand:MMXMODEI 2)))] + "TARGET_MMX_WITH_SSE" +{ + bool ok = ix86_expand_int_vcond (operands); + gcc_assert (ok); + DONE; +}) + +(define_expand "vcond_mask_" + [(set (match_operand:MMXMODEI 0 "register_operand") + (vec_merge:MMXMODEI + (match_operand:MMXMODEI 1 "register_operand") + (match_operand:MMXMODEI 2 "register_operand") + (match_operand:MMXMODEI 3 "register_operand")))] + "TARGET_MMX_WITH_SSE" +{ + ix86_expand_sse_movcc (operands[0], operands[3], + operands[1], operands[2]); + DONE; +}) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; Parallel integral logical operations -- cgit v1.1 From 14cf6aab8578132ec89ccb46e69899ae6008ff63 Mon Sep 17 00:00:00 2001 From: Jeff Law Date: Wed, 5 May 2021 07:49:28 -0600 Subject: Remove NOTICE_UPDATE_CC remnants on cr16 gcc * config/cr16/cr16.h (NOTICE_UPDATE_CC): Remove. * config/cr16/cr16.c (notice_update_cc): Remove. * config/cr16/cr16-protos.h (notice_update_cc): Remove. --- gcc/config/cr16/cr16-protos.h | 1 - gcc/config/cr16/cr16.c | 31 ------------------------------- gcc/config/cr16/cr16.h | 3 --- 3 files changed, 35 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/cr16/cr16-protos.h b/gcc/config/cr16/cr16-protos.h index 32f54e0..8580dfe 100644 --- a/gcc/config/cr16/cr16-protos.h +++ b/gcc/config/cr16/cr16-protos.h @@ -67,7 +67,6 @@ enum cr16_addrtype CR16_ABSOLUTE }; -extern void notice_update_cc (rtx); extern int cr16_operand_bit_pos (int val, int bitval); extern void cr16_decompose_const (rtx x, int *code, enum data_model_type *data, diff --git a/gcc/config/cr16/cr16.c b/gcc/config/cr16/cr16.c index 079706f..6c81c39 100644 --- a/gcc/config/cr16/cr16.c +++ b/gcc/config/cr16/cr16.c @@ -2095,37 +2095,6 @@ cr16_legitimate_constant_p (machine_mode mode ATTRIBUTE_UNUSED, return 1; } -void -notice_update_cc (rtx exp) -{ - if (GET_CODE (exp) == SET) - { - /* Jumps do not alter the cc's. */ - if (SET_DEST (exp) == pc_rtx) - return; - - /* Moving register or memory into a register: - it doesn't alter the cc's, but it might invalidate - the RTX's which we remember the cc's came from. - (Note that moving a constant 0 or 1 MAY set the cc's). */ - if (REG_P (SET_DEST (exp)) - && (REG_P (SET_SRC (exp)) || GET_CODE (SET_SRC (exp)) == MEM)) - { - return; - } - - /* Moving register into memory doesn't alter the cc's. - It may invalidate the RTX's which we remember the cc's came from. 
*/ - if (GET_CODE (SET_DEST (exp)) == MEM && REG_P (SET_SRC (exp))) - { - return; - } - } - - CC_STATUS_INIT; - return; -} - static scalar_int_mode cr16_unwind_word_mode (void) { diff --git a/gcc/config/cr16/cr16.h b/gcc/config/cr16/cr16.h index ae90610..4ce9e81 100644 --- a/gcc/config/cr16/cr16.h +++ b/gcc/config/cr16/cr16.h @@ -195,9 +195,6 @@ while (0) (targetm.hard_regno_nregs (REGNO, \ GET_MODE_WIDER_MODE (word_mode).require ()) == 1) -#define NOTICE_UPDATE_CC(EXP, INSN) \ - notice_update_cc ((EXP)) - /* Interrupt functions can only use registers that have already been saved by the prologue, even if they would normally be call-clobbered Check if sizes are same and then check if it is possible to rename. */ -- cgit v1.1 From b927ffdd6cecd0eeda6ef77df2623519870b1e75 Mon Sep 17 00:00:00 2001 From: Jeff Law Date: Wed, 5 May 2021 09:15:42 -0600 Subject: Remove cc0 remnants from avr port gcc/ * config/avr/avr.md: Remove references to CC_STATUS_INIT. --- gcc/config/avr/avr.md | 4 ---- 1 file changed, 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/avr/avr.md b/gcc/config/avr/avr.md index a1a325b..271f95f 100644 --- a/gcc/config/avr/avr.md +++ b/gcc/config/avr/avr.md @@ -7668,7 +7668,6 @@ { const char *op; int jump_mode; - CC_STATUS_INIT; if (test_hard_reg_class (ADDW_REGS, operands[0])) output_asm_insn ("sbiw %0,1" CR_TAB "sbc %C0,__zero_reg__" CR_TAB @@ -7713,7 +7712,6 @@ { const char *op; int jump_mode; - CC_STATUS_INIT; if (test_hard_reg_class (ADDW_REGS, operands[0])) output_asm_insn ("sbiw %0,1", operands); else @@ -7756,7 +7754,6 @@ { const char *op; int jump_mode; - CC_STATUS_INIT; if (test_hard_reg_class (ADDW_REGS, operands[0])) output_asm_insn ("sbiw %0,1", operands); else @@ -7799,7 +7796,6 @@ { const char *op; int jump_mode; - CC_STATUS_INIT; output_asm_insn ("ldi %3,1" CR_TAB "sub %A0,%3" CR_TAB "sbc %B0,__zero_reg__", operands); -- cgit v1.1 From d9937da063e5847f45f7f1f7a02bed7dbc8fb2f6 Mon Sep 17 00:00:00 2001 From: Prathamesh Kulkarni Date: Wed, 5 May 2021 21:11:45 +0530 Subject: arm/97903: Missed optimization in lowering test operation. gcc/ChangeLog: 2021-05-05 Prathamesh Kulkarni * config/arm/neon.md (neon_vtst_combine): New pattern. * config/arm/predicates.md (minus_one_operand): New predicate. 
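As an illustration only (the type and function names below are made up, and this is not the testcase from PR 97903), the kind of source whose lowered form the new pattern is meant to match can be written with GNU C vector extensions; on an Arm target with Neon enabled one would hope for a single vtst.8 here:

typedef unsigned char v8uqi __attribute__ ((vector_size (8)));
typedef signed char v8qi __attribute__ ((vector_size (8)));

v8qi
tst8 (v8uqi a, v8uqi b)
{
  /* Lanewise "test": all-ones where (a & b) has any bit set,
     zero otherwise, i.e. the operation vtst implements.  */
  v8uqi zero = { 0 };
  return (a & b) != zero;
}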
--- gcc/config/arm/neon.md | 13 +++++++++++++ gcc/config/arm/predicates.md | 4 ++++ 2 files changed, 17 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index fec2cc9..2a1e304 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -2588,6 +2588,19 @@ [(set_attr "type" "neon_tst")] ) +(define_insn "neon_vtst_combine" + [(set (match_operand:VDQIW 0 "s_register_operand" "=w") + (plus:VDQIW + (eq:VDQIW + (and:VDQIW (match_operand:VDQIW 1 "s_register_operand" "w") + (match_operand:VDQIW 2 "s_register_operand" "w")) + (match_operand:VDQIW 3 "zero_operand" "i")) + (match_operand:VDQIW 4 "minus_one_operand" "i")))] + "TARGET_NEON" + "vtst.\t%0, %1, %2" + [(set_attr "type" "neon_tst")] +) + (define_insn "neon_vabd" [(set (match_operand:VDQIW 0 "s_register_operand" "=w") (unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w") diff --git a/gcc/config/arm/predicates.md b/gcc/config/arm/predicates.md index c661f01..9db061d 100644 --- a/gcc/config/arm/predicates.md +++ b/gcc/config/arm/predicates.md @@ -200,6 +200,10 @@ (and (match_code "const_int,const_double,const_vector") (match_test "op == CONST0_RTX (mode)"))) +(define_predicate "minus_one_operand" + (and (match_code "const_int,const_double,const_vector") + (match_test "op == CONSTM1_RTX (mode)"))) + ;; Match a register, or zero in the appropriate mode. (define_predicate "reg_or_zero_operand" (ior (match_operand 0 "s_register_operand") -- cgit v1.1 From a0b4e09ab0102d9c0a5e6d603a080cb78600c40d Mon Sep 17 00:00:00 2001 From: Ivan Sorokin Date: Mon, 3 May 2021 11:39:48 +0300 Subject: x86: Build only one __cpu_model/__cpu_features2 variables GCC -O2 generated quite bad code for this function: bool f (void) { return __builtin_cpu_supports("popcnt") && __builtin_cpu_supports("ssse3"); } f: movl __cpu_model+12(%rip), %edx movl %edx, %eax shrl $6, %eax andl $1, %eax andl $4, %edx movl $0, %edx cmove %edx, %eax ret The problem was caused by the fact that internally every invocation of __builtin_cpu_supports built a new variable __cpu_model and a new type __processor_model. Because of this, GIMPLE level optimizers weren't able to CSE the loads of __cpu_model and optimize bit-operations properly. Improve GCC -O2 code generation by caching __cpu_model and__cpu_features2 variables as well as their types: f: movl __cpu_model+12(%rip), %eax andl $68, %eax cmpl $68, %eax sete %al ret 2021-05-05 Ivan Sorokin H.J. Lu gcc/ PR target/91400 * config/i386/i386-builtins.c (ix86_cpu_model_type_node): New. (ix86_cpu_model_var): Likewise. (ix86_cpu_features2_type_node): Likewise. (ix86_cpu_features2_var): Likewise. (fold_builtin_cpu): Cache __cpu_model and __cpu_features2 with their types. gcc/testsuite/ PR target/91400 * gcc.target/i386/pr91400-1.c: New test. * gcc.target/i386/pr91400-2.c: Likewise. 
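For context, a hedged sketch (not part of this patch or its tests) of the multi-feature dispatch code the caching is aimed at; with one shared __cpu_model/__cpu_features2 declaration the loads can be CSEd and the bit tests merged, just as in the f() example above:

int
pick_kernel (void)
{
  /* Each query reads __cpu_model/__cpu_features2; sharing a single
     declaration lets GIMPLE combine the masks instead of reloading
     and retesting them per call.  */
  if (__builtin_cpu_supports ("avx2") && __builtin_cpu_supports ("fma"))
    return 2;
  if (__builtin_cpu_supports ("sse4.1") && __builtin_cpu_supports ("popcnt"))
    return 1;
  return 0;
}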
--- gcc/config/i386/i386-builtins.c | 52 +++++++++++++++++++++++++++-------------- 1 file changed, 35 insertions(+), 17 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-builtins.c b/gcc/config/i386/i386-builtins.c index b669110..8036aed 100644 --- a/gcc/config/i386/i386-builtins.c +++ b/gcc/config/i386/i386-builtins.c @@ -2103,6 +2103,11 @@ make_var_decl (tree type, const char *name) return new_decl; } +static GTY(()) tree ix86_cpu_model_type_node; +static GTY(()) tree ix86_cpu_model_var; +static GTY(()) tree ix86_cpu_features2_type_node; +static GTY(()) tree ix86_cpu_features2_var; + /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded into an integer defined in libgcc/config/i386/cpuinfo.c */ @@ -2114,12 +2119,16 @@ fold_builtin_cpu (tree fndecl, tree *args) = (enum ix86_builtins) DECL_MD_FUNCTION_CODE (fndecl); tree param_string_cst = NULL; - tree __processor_model_type = build_processor_model_struct (); - tree __cpu_model_var = make_var_decl (__processor_model_type, - "__cpu_model"); - - - varpool_node::add (__cpu_model_var); + if (ix86_cpu_model_var == nullptr) + { + /* Build a single __cpu_model variable for all references to + __cpu_model so that GIMPLE level optimizers can CSE the loads + of __cpu_model and optimize bit-operations properly. */ + ix86_cpu_model_type_node = build_processor_model_struct (); + ix86_cpu_model_var = make_var_decl (ix86_cpu_model_type_node, + "__cpu_model"); + varpool_node::add (ix86_cpu_model_var); + } gcc_assert ((args != NULL) && (*args != NULL)); @@ -2160,7 +2169,7 @@ fold_builtin_cpu (tree fndecl, tree *args) return integer_zero_node; } - field = TYPE_FIELDS (__processor_model_type); + field = TYPE_FIELDS (ix86_cpu_model_type_node); field_val = processor_alias_table[i].model; /* CPU types are stored in the next field. */ @@ -2179,7 +2188,7 @@ fold_builtin_cpu (tree fndecl, tree *args) } /* Get the appropriate field in __cpu_model. */ - ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var, + ref = build3 (COMPONENT_REF, TREE_TYPE (field), ix86_cpu_model_var, field, NULL_TREE); /* Check the value. */ @@ -2212,13 +2221,22 @@ fold_builtin_cpu (tree fndecl, tree *args) if (isa_names_table[i].feature >= 32) { - tree index_type - = build_index_type (size_int (SIZE_OF_CPU_FEATURES)); - tree type = build_array_type (unsigned_type_node, index_type); - tree __cpu_features2_var = make_var_decl (type, - "__cpu_features2"); + if (ix86_cpu_features2_var == nullptr) + { + /* Build a single __cpu_features2 variable for all + references to __cpu_features2 so that GIMPLE level + optimizers can CSE the loads of __cpu_features2 and + optimize bit-operations properly. 
*/ + tree index_type + = build_index_type (size_int (SIZE_OF_CPU_FEATURES)); + ix86_cpu_features2_type_node + = build_array_type (unsigned_type_node, index_type); + ix86_cpu_features2_var + = make_var_decl (ix86_cpu_features2_type_node, + "__cpu_features2"); + varpool_node::add (ix86_cpu_features2_var); + } - varpool_node::add (__cpu_features2_var); for (unsigned int j = 0; j < SIZE_OF_CPU_FEATURES; j++) if (isa_names_table[i].feature < (32 + 32 + j * 32)) { @@ -2226,7 +2244,7 @@ fold_builtin_cpu (tree fndecl, tree *args) - (32 + j * 32))); tree index = size_int (j); array_elt = build4 (ARRAY_REF, unsigned_type_node, - __cpu_features2_var, + ix86_cpu_features2_var, index, NULL_TREE, NULL_TREE); /* Return __cpu_features2[index] & field_val */ final = build2 (BIT_AND_EXPR, unsigned_type_node, @@ -2237,13 +2255,13 @@ fold_builtin_cpu (tree fndecl, tree *args) } } - field = TYPE_FIELDS (__processor_model_type); + field = TYPE_FIELDS (ix86_cpu_model_type_node); /* Get the last field, which is __cpu_features. */ while (DECL_CHAIN (field)) field = DECL_CHAIN (field); /* Get the appropriate field: __cpu_model.__cpu_features */ - ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var, + ref = build3 (COMPONENT_REF, TREE_TYPE (field), ix86_cpu_model_var, field, NULL_TREE); /* Access the 0th element of __cpu_features array. */ -- cgit v1.1 From e8d1ca7d2c344a411779892616c423e157f4aea8 Mon Sep 17 00:00:00 2001 From: Eric Botcazou Date: Wed, 5 May 2021 22:48:51 +0200 Subject: Fix PR target/100402 This is a regression for 64-bit Windows present from mainline down to the 9 branch and introduced by the fix for PR target/99234. Again SEH, but with a twist related to the way MinGW implements setjmp/longjmp, which turns out to be piggybacked on SEH with recent versions of MinGW, i.e. the longjmp performs a bona-fide unwinding of the stack, because it calls RtlUnwindEx with the second argument initially passed to setjmp, which is the result of __builtin_frame_address (0) in the MinGW header file: define setjmp(BUF) _setjmp((BUF), __builtin_frame_address (0)) This means that we directly expose the frame pointer to the SEH machinery here (unlike with regular exception handling where we use an intermediate CFA) and thus that we cannot do whatever we want with it. The old code would leave it unaligned, i.e. not multiple of 16, whereas the new code aligns it, but this breaks for some reason; at least it appears that a .seh_setframe directive with 0 as second argument always works, so the fix aligns it this way. gcc/ PR target/100402 * config/i386/i386.c (ix86_compute_frame_layout): For a SEH target, always return the establisher frame for __builtin_frame_address (0). gcc/testsuite/ * gcc.c-torture/execute/20210505-1.c: New test. --- gcc/config/i386/i386.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 06b0f58..ecc1535 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -6672,12 +6672,29 @@ ix86_compute_frame_layout (void) area, see the SEH code in config/i386/winnt.c for the rationale. */ frame->hard_frame_pointer_offset = frame->sse_reg_save_offset; - /* If we can leave the frame pointer where it is, do so. Also, return + /* If we can leave the frame pointer where it is, do so; however return the establisher frame for __builtin_frame_address (0) or else if the - frame overflows the SEH maximum frame size. */ + frame overflows the SEH maximum frame size. 
+ + Note that the value returned by __builtin_frame_address (0) is quite + constrained, because setjmp is piggybacked on the SEH machinery with + recent versions of MinGW: + + # elif defined(__SEH__) + # if defined(__aarch64__) || defined(_ARM64_) + # define setjmp(BUF) _setjmp((BUF), __builtin_sponentry()) + # elif (__MINGW_GCC_VERSION < 40702) + # define setjmp(BUF) _setjmp((BUF), mingw_getsp()) + # else + # define setjmp(BUF) _setjmp((BUF), __builtin_frame_address (0)) + # endif + + and the second argument passed to _setjmp, if not null, is forwarded + to the TargetFrame parameter of RtlUnwindEx by longjmp (after it has + built an ExceptionRecord on the fly describing the setjmp buffer). */ const HOST_WIDE_INT diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset; - if (diff <= 255) + if (diff <= 255 && !crtl->accesses_prior_frames) { /* The resulting diff will be a multiple of 16 lower than 255, i.e. at most 240 as required by the unwind data structure. */ -- cgit v1.1 From e1fcf14f33e4f371aae8ae497ca58a760d53ca6d Mon Sep 17 00:00:00 2001 From: Christoph Muellner Date: Wed, 5 May 2021 21:23:41 +0200 Subject: RISC-V: Generate helpers for cbranch4. On RISC-V we are facing the fact, that our conditional branches require Pmode conditions. Currently, we generate them explicitly with a check for Pmode and then calling the proper generator (i.e. gen_cbranchdi4 on RV64 and gen_cbranchsi4 on RV32). Let's simplify this code by generating the INSN helpers and use gen_cbranch4 (Pmode). gcc/ PR target/100266 * config/riscv/riscv.c (riscv_block_move_loop): Use cbranch helper. * config/riscv/riscv.md (cbranch4): Generate helpers. (stack_protect_test): Use cbranch helper. --- gcc/config/riscv/riscv.c | 5 +---- gcc/config/riscv/riscv.md | 12 ++++-------- 2 files changed, 5 insertions(+), 12 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/riscv/riscv.c b/gcc/config/riscv/riscv.c index e1064e3..27665e5 100644 --- a/gcc/config/riscv/riscv.c +++ b/gcc/config/riscv/riscv.c @@ -3258,10 +3258,7 @@ riscv_block_move_loop (rtx dest, rtx src, unsigned HOST_WIDE_INT length, /* Emit the loop condition. */ test = gen_rtx_NE (VOIDmode, src_reg, final_src); - if (Pmode == DImode) - emit_jump_insn (gen_cbranchdi4 (test, src_reg, final_src, label)); - else - emit_jump_insn (gen_cbranchsi4 (test, src_reg, final_src, label)); + emit_jump_insn (gen_cbranch4 (Pmode, test, src_reg, final_src, label)); /* Mop up any left-over bytes. 
*/ if (leftover) diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md index 0e35960..f88877f 100644 --- a/gcc/config/riscv/riscv.md +++ b/gcc/config/riscv/riscv.md @@ -2153,7 +2153,7 @@ (label_ref (match_operand 1)) (pc)))]) -(define_expand "cbranch4" +(define_expand "@cbranch4" [(set (pc) (if_then_else (match_operator 0 "comparison_operator" [(match_operand:BR 1 "register_operand") @@ -2167,7 +2167,7 @@ DONE; }) -(define_expand "cbranch4" +(define_expand "@cbranch4" [(set (pc) (if_then_else (match_operator 0 "fp_branch_comparison" [(match_operand:ANYF 1 "register_operand") @@ -2829,12 +2829,8 @@ operands[0], operands[1])); - if (mode == DImode) - emit_jump_insn (gen_cbranchdi4 (gen_rtx_EQ (VOIDmode, result, const0_rtx), - result, const0_rtx, operands[2])); - else - emit_jump_insn (gen_cbranchsi4 (gen_rtx_EQ (VOIDmode, result, const0_rtx), - result, const0_rtx, operands[2])); + rtx cond = gen_rtx_EQ (VOIDmode, result, const0_rtx); + emit_jump_insn (gen_cbranch4 (mode, cond, result, const0_rtx, operands[2])); DONE; }) -- cgit v1.1 From 3c33c00f43bfe585d9414dfb620f0f518e55a457 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Tue, 4 May 2021 13:30:05 +0200 Subject: IBM Z: Fix error checking for builtin vec_permi The builtin vec_permi is peculiar in that its immediate operand is encoded differently than the immediate operand that is backing the builtin. This fixes the check for the immediate operand, adding a regression test in the process. This partially reverts commit 3191c1f4488d1f7563b563d7ae2a102a26f16d82 2021-05-06 Marius Hillenbrand gcc/ChangeLog: * config/s390/s390-builtins.def (O_M5, O1_M5, ...): Remove unused macros. (s390_vec_permi_s64, s390_vec_permi_b64, s390_vec_permi_u64) (s390_vec_permi_dbl, s390_vpdi): Use the O3_U2 type for the immediate operand. * config/s390/s390.c (s390_const_operand_ok): Remove unused values. gcc/testsuite/ChangeLog: * gcc.target/s390/zvector/imm-range-error-1.c: Fix test for __builtin_s390_vpdi. * gcc.target/s390/zvector/vec-permi.c: New test for builtin vec_permi. 
--- gcc/config/s390/s390-builtins.def | 44 ++++++++++++++------------------------- gcc/config/s390/s390.c | 7 +++++-- 2 files changed, 21 insertions(+), 30 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/s390/s390-builtins.def b/gcc/config/s390/s390-builtins.def index f77ab75..8ca002d 100644 --- a/gcc/config/s390/s390-builtins.def +++ b/gcc/config/s390/s390-builtins.def @@ -29,7 +29,6 @@ #undef O_U16 #undef O_U32 -#undef O_M5 #undef O_M12 #undef O_S2 @@ -89,11 +88,6 @@ #undef O3_U32 #undef O4_U32 -#undef O1_M5 -#undef O2_M5 -#undef O3_M5 -#undef O4_M5 - #undef O1_M12 #undef O2_M12 #undef O3_M12 @@ -164,20 +158,19 @@ #define O_U16 8 /* unsigned 16 bit literal */ #define O_U32 9 /* unsigned 32 bit literal */ -#define O_M5 10 /* matches bitmask of 5 */ -#define O_M12 11 /* matches bitmask of 12 */ +#define O_M12 10 /* matches bitmask of 12 */ -#define O_S2 12 /* signed 2 bit literal */ -#define O_S3 13 /* signed 3 bit literal */ -#define O_S4 14 /* signed 4 bit literal */ -#define O_S5 15 /* signed 5 bit literal */ -#define O_S8 16 /* signed 8 bit literal */ -#define O_S12 17 /* signed 12 bit literal */ -#define O_S16 18 /* signed 16 bit literal */ -#define O_S32 19 /* signed 32 bit literal */ +#define O_S2 11 /* signed 2 bit literal */ +#define O_S3 12 /* signed 3 bit literal */ +#define O_S4 13 /* signed 4 bit literal */ +#define O_S5 14 /* signed 5 bit literal */ +#define O_S8 15 /* signed 8 bit literal */ +#define O_S12 16 /* signed 12 bit literal */ +#define O_S16 17 /* signed 16 bit literal */ +#define O_S32 18 /* signed 32 bit literal */ -#define O_ELEM 20 /* Element selector requiring modulo arithmetic. */ -#define O_LIT 21 /* Operand must be a literal fitting the target type. */ +#define O_ELEM 19 /* Element selector requiring modulo arithmetic. */ +#define O_LIT 20 /* Operand must be a literal fitting the target type. 
*/ #define O_SHIFT 5 @@ -230,11 +223,6 @@ #define O3_U32 (O_U32 << (2 * O_SHIFT)) #define O4_U32 (O_U32 << (3 * O_SHIFT)) -#define O1_M5 O_M5 -#define O2_M5 (O_M5 << O_SHIFT) -#define O3_M5 (O_M5 << (2 * O_SHIFT)) -#define O4_M5 (O_M5 << (3 * O_SHIFT)) - #define O1_M12 O_M12 #define O2_M12 (O_M12 << O_SHIFT) #define O3_M12 (O_M12 << (2 * O_SHIFT)) @@ -671,12 +659,12 @@ OB_DEF_VAR (s390_vec_perm_dbl, s390_vperm, 0, B_DEF (s390_vperm, vec_permv16qi, 0, B_VX, 0, BT_FN_UV16QI_UV16QI_UV16QI_UV16QI) OB_DEF (s390_vec_permi, s390_vec_permi_s64, s390_vec_permi_dbl, B_VX, BT_FN_OV4SI_OV4SI_OV4SI_INT) -OB_DEF_VAR (s390_vec_permi_s64, s390_vpdi, 0, O3_M5, BT_OV_V2DI_V2DI_V2DI_INT) -OB_DEF_VAR (s390_vec_permi_b64, s390_vpdi, 0, O3_M5, BT_OV_BV2DI_BV2DI_BV2DI_INT) -OB_DEF_VAR (s390_vec_permi_u64, s390_vpdi, 0, O3_M5, BT_OV_UV2DI_UV2DI_UV2DI_INT) -OB_DEF_VAR (s390_vec_permi_dbl, s390_vpdi, 0, O3_M5, BT_OV_V2DF_V2DF_V2DF_INT) +OB_DEF_VAR (s390_vec_permi_s64, s390_vpdi, 0, O3_U2, BT_OV_V2DI_V2DI_V2DI_INT) +OB_DEF_VAR (s390_vec_permi_b64, s390_vpdi, 0, O3_U2, BT_OV_BV2DI_BV2DI_BV2DI_INT) +OB_DEF_VAR (s390_vec_permi_u64, s390_vpdi, 0, O3_U2, BT_OV_UV2DI_UV2DI_UV2DI_INT) +OB_DEF_VAR (s390_vec_permi_dbl, s390_vpdi, 0, O3_U2, BT_OV_V2DF_V2DF_V2DF_INT) -B_DEF (s390_vpdi, vec_permiv2di, 0, B_VX, O3_M5, BT_FN_UV2DI_UV2DI_UV2DI_INT) +B_DEF (s390_vpdi, vec_permiv2di, 0, B_VX, O3_U2, BT_FN_UV2DI_UV2DI_UV2DI_INT) OB_DEF (s390_vec_splat, s390_vec_splat2_s8, s390_vec_splat2_dbl,B_VX, BT_FN_OV4SI_OV4SI_UCHAR) OB_DEF_VAR (s390_vec_splat2_s8, s390_vrepb, 0, O2_U4, BT_OV_V16QI_V16QI_UCHAR) diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c index 88361f9..6bbeb64 100644 --- a/gcc/config/s390/s390.c +++ b/gcc/config/s390/s390.c @@ -734,11 +734,14 @@ s390_const_operand_ok (tree arg, int argnum, int op_flags, tree decl) { if (O_UIMM_P (op_flags)) { - unsigned HOST_WIDE_INT bitwidths[] = { 1, 2, 3, 4, 5, 8, 12, 16, 32, 4, 4 }; - unsigned HOST_WIDE_INT bitmasks[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 12 }; + unsigned HOST_WIDE_INT bitwidths[] = { 1, 2, 3, 4, 5, 8, 12, 16, 32, 4 }; + unsigned HOST_WIDE_INT bitmasks[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 12 }; unsigned HOST_WIDE_INT bitwidth = bitwidths[op_flags - O_U1]; unsigned HOST_WIDE_INT bitmask = bitmasks[op_flags - O_U1]; + gcc_assert(ARRAY_SIZE(bitwidths) == (O_M12 - O_U1 + 1)); + gcc_assert(ARRAY_SIZE(bitmasks) == (O_M12 - O_U1 + 1)); + if (!tree_fits_uhwi_p (arg) || tree_to_uhwi (arg) > (HOST_WIDE_INT_1U << bitwidth) - 1 || (bitmask && tree_to_uhwi (arg) & ~bitmask)) -- cgit v1.1 From 323b18d54b960d3ef64f60ad20838ef958334dc0 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Fri, 7 May 2021 10:37:52 +0200 Subject: i386: Fix up 8-byte vcond* with -mxop [PR100445] ix86_expand_sse_movcc has special TARGET_XOP handling and the recent addition of support of v*cond* patterns for MMXMODEI modes results in ICEs because the expected pattern doesn't exist. We can handle it using 128-bit vpcmov (if we ignore the upper 64 bits like we ignore in other TARGET_MMX_WITH_SSE support). 2021-05-07 Jakub Jelinek PR target/100445 * config/i386/mmx.md (*xop_pcmov_): New define_insn. * gcc.target/i386/pr100445.c: New test. 
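One plausible trigger (an assumption for illustration, not the new pr100445.c testcase) is a small conditional select that the vectorizer maps onto a 64-bit vector mode; built with -O2 -mxop, the conditional move goes through the XOP branch of ix86_expand_sse_movcc and now matches the new 128-bit vpcmov pattern, with the upper half simply ignored:

void
sel2 (int *restrict r, const int *a, const int *b,
      const int *c, const int *d)
{
  /* Two conditional word selects; if vectorized as V2SI this reaches
     the new v*cond* expanders and, with XOP, the vpcmov pattern.  */
  for (int i = 0; i < 2; i++)
    r[i] = a[i] > b[i] ? c[i] : d[i];
}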
--- gcc/config/i386/mmx.md | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 347295a..295501d 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -1700,6 +1700,17 @@ DONE; }) +;; XOP parallel XMM conditional moves +(define_insn "*xop_pcmov_" + [(set (match_operand:MMXMODEI 0 "register_operand" "=x") + (if_then_else:MMXMODEI + (match_operand:MMXMODEI 3 "register_operand" "x") + (match_operand:MMXMODEI 1 "register_operand" "x") + (match_operand:MMXMODEI 2 "register_operand" "x")))] + "TARGET_XOP && TARGET_MMX_WITH_SSE" + "vpcmov\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "sse4arg")]) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; Parallel integral logical operations -- cgit v1.1 From 92f372f00936a549de2cb9764eee722bb07959ba Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Fri, 7 May 2021 11:15:07 +0200 Subject: i386: Do not emit mask compares for mode sizes < 16 [PR100445] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Recent addition of v*cond* patterns for MMXMODEI modes allows 64bit MMX modes to enter ix86_expand_sse_cmp. ix86_use_mask_cmp_p was not prepared to reject mode sizes < 16, resulting in ICE due to unavailability of 64bit masked PCOM instructions. 2021-05-07 Uroš Bizjak gcc/ PR target/100445 * config/i386/i386-expand.c (ix86_use_mask_cmp_p): Return false for mode sizes < 16. gcc/testsuite/ PR target/100445 * gcc.target/i386/pr100445-1.c: New test. --- gcc/config/i386/i386-expand.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 4dfe7d6..61b2f92 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -3490,7 +3490,11 @@ static bool ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode, rtx op_true, rtx op_false) { - if (GET_MODE_SIZE (mode) == 64) + int vector_size = GET_MODE_SIZE (mode); + + if (vector_size < 16) + return false; + else if (vector_size == 64) return true; /* When op_true is NULL, op_false must be NULL, or vice versa. */ -- cgit v1.1 From 5795ec0edc30e077a9900cf3ca0a04ad8ac5ac97 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Fri, 7 May 2021 17:14:34 +0200 Subject: i386: Implement mmx_pblendv to optimize SSE conditional moves [PR98218] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement mmx_pblendv to optimize V8HI, V4HI and V2SI mode conditional moves for SSE4.1 targets. 2021-05-07 Uroš Bizjak gcc/ PR target/98218 * config/i386/i386-expand.c (ix86_expand_sse_movcc): Handle V8QI, V4HI and V2SI modes. * config/i386/mmx.md (mmx_pblendvb): New insn pattern. * config/i386/sse.md (unspec): Move UNSPEC_BLENDV ... * config/i386/i386.md (unspec): ... here. 
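Along the same lines, a sketch (illustrative only, function name made up) of a halfword select that, when vectorized for 64-bit vectors with -O2 -msse4.1, can now use the pblendvb-based conditional move rather than an and/andnot/or sequence:

void
sel4 (short *restrict r, const short *a, const short *b,
      const short *c, const short *d)
{
  /* Four conditional halfword selects; for V4HI the conditional move
     can now be emitted as pblendvb on SSE4.1 targets.  */
  for (int i = 0; i < 4; i++)
    r[i] = a[i] > b[i] ? c[i] : d[i];
}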
--- gcc/config/i386/i386-expand.c | 13 +++++++++++++ gcc/config/i386/i386.md | 1 + gcc/config/i386/mmx.md | 20 ++++++++++++++++++++ gcc/config/i386/sse.md | 1 - 4 files changed, 34 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 61b2f92..e9f11bc 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -3702,6 +3702,19 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) op_true = force_reg (mode, op_true); } break; + case E_V8QImode: + case E_V4HImode: + case E_V2SImode: + if (TARGET_SSE4_1) + { + gen = gen_mmx_pblendvb; + if (mode != V8QImode) + d = gen_reg_rtx (V8QImode); + op_false = gen_lowpart (V8QImode, op_false); + op_true = gen_lowpart (V8QImode, op_true); + cmp = gen_lowpart (V8QImode, cmp); + } + break; case E_V16QImode: case E_V8HImode: case E_V4SImode: diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index f79fd12..74e924f 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -118,6 +118,7 @@ UNSPEC_FIX_NOTRUNC UNSPEC_MASKMOV UNSPEC_MOVMSK + UNSPEC_BLENDV UNSPEC_RCP UNSPEC_RSQRT UNSPEC_PSADBW diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 295501d..f085708 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -1700,6 +1700,26 @@ DONE; }) +(define_insn "mmx_pblendvb" + [(set (match_operand:V8QI 0 "register_operand" "=Yr,*x,x") + (unspec:V8QI + [(match_operand:V8QI 1 "register_operand" "0,0,x") + (match_operand:V8QI 2 "register_operand" "Yr,*x,x") + (match_operand:V8QI 3 "register_operand" "Yz,Yz,x")] + UNSPEC_BLENDV))] + "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE" + "@ + pblendvb\t{%3, %2, %0|%0, %2, %3} + pblendvb\t{%3, %2, %0|%0, %2, %3} + vpblendvb\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "isa" "noavx,noavx,avx") + (set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "*,*,1") + (set_attr "prefix" "orig,orig,vex") + (set_attr "btver2_decode" "vector") + (set_attr "mode" "TI")]) + ;; XOP parallel XMM conditional moves (define_insn "*xop_pcmov_" [(set (match_operand:MMXMODEI 0 "register_operand" "=x") diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 897cf3e..244fb13 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -39,7 +39,6 @@ UNSPEC_INSERTQ ;; For SSE4.1 support - UNSPEC_BLENDV UNSPEC_INSERTPS UNSPEC_DP UNSPEC_MOVNTDQA -- cgit v1.1 From 7af392687952608b988bd5a476583106b3f51740 Mon Sep 17 00:00:00 2001 From: Andrew Stubbs Date: Fri, 7 May 2021 15:42:21 +0100 Subject: amdgcn: disable TImode The TImode support works for moves only, which has worked in most case up to now, but no longer. We still need TImode to exist for the instructions that take two DImode values packed together, but we don't need to advertise this to the middle-end. gcc/ChangeLog: * config/gcn/gcn.c (gcn_scalar_mode_supported_p): Disable TImode. --- gcc/config/gcn/gcn.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c index 9660ca6..2baf91d 100644 --- a/gcc/config/gcn/gcn.c +++ b/gcc/config/gcn/gcn.c @@ -361,7 +361,7 @@ gcn_scalar_mode_supported_p (scalar_mode mode) || mode == HImode /* || mode == HFmode */ || mode == SImode || mode == SFmode || mode == DImode || mode == DFmode - || mode == TImode); + /*|| mode == TImode*/); /* TI is used for back-end purposes only. */ } /* Implement TARGET_CLASS_MAX_NREGS. 
-- cgit v1.1 From 292da5c58da8fe60395c4b7569a0e828c800ded5 Mon Sep 17 00:00:00 2001 From: Andrew Stubbs Date: Fri, 7 May 2021 23:55:35 +0100 Subject: Revert "amdgcn: disable TImode" This reverts commit 7af392687952608b988bd5a476583106b3f51740. --- gcc/config/gcn/gcn.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c index 2baf91d..9660ca6 100644 --- a/gcc/config/gcn/gcn.c +++ b/gcc/config/gcn/gcn.c @@ -361,7 +361,7 @@ gcn_scalar_mode_supported_p (scalar_mode mode) || mode == HImode /* || mode == HFmode */ || mode == SImode || mode == SFmode || mode == DImode || mode == DFmode - /*|| mode == TImode*/); /* TI is used for back-end purposes only. */ + || mode == TImode); } /* Implement TARGET_CLASS_MAX_NREGS. -- cgit v1.1 From 3b9eb2f8a8655bf1cc6bf14e732999c877ee4fff Mon Sep 17 00:00:00 2001 From: Claudiu Zissulescu Date: Mon, 10 May 2021 09:03:41 +0300 Subject: arc: Add alternative names for gp and fp registers. Add alternative register name r26 for gp register, and add alternative register name r27 for fp register. gcc/ 2021-05-10 Claudiu Zissulescu * config/arc/arc.h (ADDITIONAL_REGISTER_NAMES): Add r26 and r27. --- gcc/config/arc/arc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/arc/arc.h b/gcc/config/arc/arc.h index bbb36250..b7b3473 100644 --- a/gcc/config/arc/arc.h +++ b/gcc/config/arc/arc.h @@ -1193,6 +1193,8 @@ extern char rname56[], rname57[], rname58[], rname59[]; #define ADDITIONAL_REGISTER_NAMES \ { \ + {"r26", 26}, \ + {"r27", 27}, \ {"ilink", 29}, \ {"r29", 29}, \ {"r30", 30}, \ -- cgit v1.1 From b70c7c06aaa2ef55a29e6516a710e57f7218ab53 Mon Sep 17 00:00:00 2001 From: Claudiu Zissulescu Date: Mon, 10 May 2021 09:03:41 +0300 Subject: arc: Update ctz/clz patterns ARCv2 ISA introduces special clz/ctz instructions. This patch is adding support for them when available. Corner case: mov r0,0x0 : (w0) r0 <= 0x00000000 * ffs r1,r0 : (w0) r1 <= 0x0000001f * fls r2,r0 : (w0) r2 <= 0x00000000 * gcc/ 2021-05-10 Claudiu Zissulescu * config/arc/arc.h (CLZ_DEFINED_VALUE_AT_ZERO): Define. (CTZ_DEFINED_VALUE_AT_ZERO): Likewise. * config/arc/arc.md (clrsbsi2): Cleanup pattern. (norm_f): Likewise. (ffs): Likewise. (ffs_f): Likewise. (clzsi2): Use fls instruction when available. (arc_clzsi2): Likewise. Signed-off-by: Claudiu Zissulescu --- gcc/config/arc/arc.h | 6 ++++++ gcc/config/arc/arc.md | 53 +++++++++++++++++++++++++++++---------------------- 2 files changed, 36 insertions(+), 23 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arc/arc.h b/gcc/config/arc/arc.h index b7b3473..bd1fe0a 100644 --- a/gcc/config/arc/arc.h +++ b/gcc/config/arc/arc.h @@ -1445,6 +1445,12 @@ do { \ */ #define SHIFT_COUNT_TRUNCATED 1 +/* Defines if the CLZ result is undefined or has a useful value. */ +#define CLZ_DEFINED_VALUE_AT_ZERO(MODE, VALUE) ((VALUE) = 31, 2) + +/* Defines if the CTZ result is undefined or has a useful value. */ +#define CTZ_DEFINED_VALUE_AT_ZERO(MODE, VALUE) ((VALUE) = 31, 2) + /* We assume that the store-condition-codes instructions store 0 for false and some other value for true. This is the value stored for true. 
*/ #define STORE_FLAG_VALUE 1 diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md index 7a52551..f3efe65 100644 --- a/gcc/config/arc/arc.md +++ b/gcc/config/arc/arc.md @@ -4396,24 +4396,20 @@ core_3, archs4x, archs4xd, archs4xd_slow" ;; Instructions generated through builtins (define_insn "clrsbsi2" - [(set (match_operand:SI 0 "dest_reg_operand" "=w,w") - (clrsb:SI (match_operand:SI 1 "general_operand" "cL,Cal")))] + [(set (match_operand:SI 0 "dest_reg_operand" "=r,r") + (clrsb:SI (match_operand:SI 1 "general_operand" "rL,Cal")))] "TARGET_NORM" - "@ - norm \t%0, %1 - norm \t%0, %1" + "norm\\t%0,%1" [(set_attr "length" "4,8") (set_attr "type" "two_cycle_core,two_cycle_core")]) (define_insn "norm_f" - [(set (match_operand:SI 0 "dest_reg_operand" "=w,w") - (clrsb:SI (match_operand:SI 1 "general_operand" "cL,Cal"))) + [(set (match_operand:SI 0 "dest_reg_operand" "=r,r") + (clrsb:SI (match_operand:SI 1 "general_operand" "rL,Cal"))) (set (reg:CC_ZN CC_REG) (compare:CC_ZN (match_dup 1) (const_int 0)))] "TARGET_NORM" - "@ - norm.f\t%0, %1 - norm.f\t%0, %1" + "norm.f\\t%0,%1" [(set_attr "length" "4,8") (set_attr "type" "two_cycle_core,two_cycle_core")]) @@ -4443,7 +4439,17 @@ core_3, archs4x, archs4xd, archs4xd_slow" (clz:SI (match_operand:SI 1 "register_operand" ""))) (clobber (match_dup 2))])] "TARGET_NORM" - "operands[2] = gen_rtx_REG (CC_ZNmode, CC_REG);") + " + if (TARGET_V2) + { + /* ARCv2's FLS is a bit more optimal than using norm. */ + rtx tmp = gen_reg_rtx (SImode); + emit_insn (gen_fls (tmp, operands[1])); + emit_insn (gen_subsi3 (operands[0], GEN_INT (31), tmp)); + DONE; + } + operands[2] = gen_rtx_REG (CC_ZNmode, CC_REG); + ") (define_insn_and_split "*arc_clzsi2" [(set (match_operand:SI 0 "register_operand" "=r") @@ -4475,8 +4481,13 @@ core_3, archs4x, archs4xd, archs4xd_slow" (match_operand:SI 1 "register_operand" "")] "TARGET_NORM" " - emit_insn (gen_arc_ctzsi2 (operands[0], operands[1])); - DONE; + if (TARGET_V2) + { + emit_insn (gen_ffs (operands[0], operands[1])); + DONE; + } + emit_insn (gen_arc_ctzsi2 (operands[0], operands[1])); + DONE; ") (define_insn_and_split "arc_ctzsi2" @@ -5575,26 +5586,22 @@ core_3, archs4x, archs4xd, archs4xd_slow" (set_attr "type" "misc")]) (define_insn "ffs" - [(set (match_operand:SI 0 "dest_reg_operand" "=w,w") - (unspec:SI [(match_operand:SI 1 "general_operand" "cL,Cal")] + [(set (match_operand:SI 0 "dest_reg_operand" "=r,r") + (unspec:SI [(match_operand:SI 1 "general_operand" "rL,Cal")] UNSPEC_ARC_FFS))] "TARGET_NORM && TARGET_V2" - "@ - ffs \t%0, %1 - ffs \t%0, %1" + "ffs\\t%0,%1" [(set_attr "length" "4,8") (set_attr "type" "two_cycle_core,two_cycle_core")]) (define_insn "ffs_f" - [(set (match_operand:SI 0 "dest_reg_operand" "=w,w") - (unspec:SI [(match_operand:SI 1 "general_operand" "cL,Cal")] + [(set (match_operand:SI 0 "dest_reg_operand" "=r,r") + (unspec:SI [(match_operand:SI 1 "general_operand" "rL,Cal")] UNSPEC_ARC_FFS)) (set (reg:CC_ZN CC_REG) (compare:CC_ZN (match_dup 1) (const_int 0)))] "TARGET_NORM && TARGET_V2" - "@ - ffs.f\t%0, %1 - ffs.f\t%0, %1" + "ffs.f\\t%0,%1" [(set_attr "length" "4,8") (set_attr "type" "two_cycle_core,two_cycle_core")]) -- cgit v1.1 From 89c94716241e2ba9cb42f512103528d0df6b0f3f Mon Sep 17 00:00:00 2001 From: Claudiu Zissulescu Date: Mon, 10 May 2021 09:03:41 +0300 Subject: arc: Fix compilation warnings. gcc/ 2021-05-10 Claudiu Zissulescu * common/config/arc/arc-common.c (arc_handle_option): Remove dot from string. * config/arc/arc.c (arc_reorg): Remove underscore from string. 
Signed-off-by: Claudiu Zissulescu --- gcc/config/arc/arc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/arc/arc.c b/gcc/config/arc/arc.c index 3201c3f..ec7328e 100644 --- a/gcc/config/arc/arc.c +++ b/gcc/config/arc/arc.c @@ -8433,7 +8433,7 @@ arc_reorg (void) if (!INSN_ADDRESSES_SET_P()) fatal_error (input_location, - "insn addresses not set after shorten_branches"); + "insn addresses not set after shorten branches"); for (insn = get_insns (); insn; insn = NEXT_INSN (insn)) { -- cgit v1.1 From 0c2f8805fad50a95099ed19955866c777e397f3c Mon Sep 17 00:00:00 2001 From: Claudiu Zissulescu Date: Mon, 10 May 2021 09:49:35 +0300 Subject: arc: Disable movmisalign patterns when aligned access is required Disable movmisalign patterns when aligned access is required. gcc/ 2021-05-10 Claudiu Zissulescu * config/arc/simdext.md (movmisalignv2hi): Allow misaligned access only when munaligned-access option is on. (movmisalign): Likewise. Signed-off-by: Claudiu Zissulescu --- gcc/config/arc/simdext.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arc/simdext.md b/gcc/config/arc/simdext.md index f090075..d142aac 100644 --- a/gcc/config/arc/simdext.md +++ b/gcc/config/arc/simdext.md @@ -1382,7 +1382,7 @@ (define_expand "movmisalignv2hi" [(set (match_operand:V2HI 0 "general_operand" "") (match_operand:V2HI 1 "general_operand" ""))] - "" + "unaligned_access" "{ if (prepare_move_operands (operands, V2HImode)) DONE; @@ -1441,7 +1441,7 @@ (define_expand "movmisalign" [(set (match_operand:VWH 0 "general_operand" "") (match_operand:VWH 1 "general_operand" ""))] - "" + "unaligned_access" "{ if (prepare_move_operands (operands, mode)) DONE; -- cgit v1.1 From 09ae0f6c3ee0612012a67df4387d55efa19b8cad Mon Sep 17 00:00:00 2001 From: Claudiu Zissulescu Date: Mon, 10 May 2021 09:49:35 +0300 Subject: arc: Cleanup simdext.md file Textual cleanup of the simdext.md file. Format the output assembly instructions. gcc/ 2021-05-10 Claudiu Zissulescu * config/arc/simdext.md: Format and cleanup file. 
Signed-off-by: Claudiu Zissulescu --- gcc/config/arc/simdext.md | 730 +++++++++++++++++++++++++--------------------- 1 file changed, 400 insertions(+), 330 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arc/simdext.md b/gcc/config/arc/simdext.md index d142aac..41c4269 100644 --- a/gcc/config/arc/simdext.md +++ b/gcc/config/arc/simdext.md @@ -174,7 +174,7 @@ (parallel [(match_operand:SI 2 "immediate_operand" "L")]))) (match_operand:SI 3 "immediate_operand" "P"))))] "TARGET_SIMD_SET" - "vld128 %0, [i%2, %3]" + "vld128\\t%0,[i%2,%3]" [(set_attr "type" "simd_vload128") (set_attr "length" "4") (set_attr "cond" "nocond")] @@ -186,7 +186,7 @@ (match_operand:SI 2 "immediate_operand" "P"))) (match_operand:V8HI 3 "vector_register_operand" "=v"))] "TARGET_SIMD_SET" - "vst128 %3, [i%1, %2]" + "vst128\\t%3,[i%1,%2]" [(set_attr "type" "simd_vstore") (set_attr "length" "4") (set_attr "cond" "nocond")] @@ -204,7 +204,7 @@ (match_operand:V8HI 3 "vector_register_operand" "=v") (parallel [(const_int 0) (const_int 1) (const_int 2) (const_int 3)])))] "TARGET_SIMD_SET" - "vst64 %3, [i%1, %2]" + "vst64\\t%3,[i%1,%2]" [(set_attr "type" "simd_vstore") (set_attr "length" "4") (set_attr "cond" "nocond")] @@ -215,9 +215,9 @@ (match_operand:V8HI 1 "vector_register_or_memory_operand" "m,v,v"))] "TARGET_SIMD_SET && !(GET_CODE (operands[0]) == MEM && GET_CODE(operands[1]) == MEM)" "@ - vld128r %0, %1 - vst128r %1, %0 - vmvzw %0,%1,0xffff" + vld128r\\t%0,%1 + vst128r\\t%1,%0 + vmvzw\\t%0,%1,0xffff" [(set_attr "type" "simd_vload128,simd_vstore,simd_vmove_else_zero") (set_attr "length" "8,8,4") (set_attr "cond" "nocond, nocond, nocond")]) @@ -227,55 +227,21 @@ (match_operand:TI 1 "vector_register_or_memory_operand" "m,v,v"))] "" "@ - vld128r %0, %1 - vst128r %1, %0 - vmvzw %0,%1,0xffff" + vld128r\\t%0,%1 + vst128r\\t%1,%0 + vmvzw\\t%0,%1,0xffff" [(set_attr "type" "simd_vload128,simd_vstore,simd_vmove_else_zero") (set_attr "length" "8,8,4") (set_attr "cond" "nocond, nocond, nocond")]) -;; (define_insn "*movv8hi_insn_rr" -;; [(set (match_operand:V8HI 0 "vector_register_operand" "=v") -;; (match_operand:V8HI 1 "vector_register_operand" "v"))] -;; "" -;; "mov reg,reg" -;; [(set_attr "length" "8") -;; (set_attr "type" "move")]) - -;; (define_insn "*movv8_out" -;; [(set (match_operand:V8HI 0 "memory_operand" "=m") -;; (match_operand:V8HI 1 "vector_register_operand" "v"))] -;; "" -;; "mov out" -;; [(set_attr "length" "8") -;; (set_attr "type" "move")]) - - -;; (define_insn "addv8hi3" -;; [(set (match_operand:V8HI 0 "vector_register_operand" "=v") -;; (plus:V8HI (match_operand:V8HI 1 "vector_register_operand" "v") -;; (match_operand:V8HI 2 "vector_register_operand" "v")))] -;; "TARGET_SIMD_SET" -;; "vaddw %0, %1, %2" -;; [(set_attr "length" "8") -;; (set_attr "cond" "nocond")]) - -;; (define_insn "vaddw_insn" -;; [(set (match_operand:V8HI 0 "vector_register_operand" "=v") -;; (unspec [(match_operand:V8HI 1 "vector_register_operand" "v") -;; (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VADDW))] -;; "TARGET_SIMD_SET" -;; "vaddw %0, %1, %2" -;; [(set_attr "length" "8") -;; (set_attr "cond" "nocond")]) - ;; V V V Insns (define_insn "vaddaw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VADDAW))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VADDAW))] "TARGET_SIMD_SET" - "vaddaw %0, %1, %2" + "vaddaw\\t%0,%1,%2" 
[(set_attr "type" "simd_varith_with_acc") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -283,9 +249,10 @@ (define_insn "vaddw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VADDW))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VADDW))] "TARGET_SIMD_SET" - "vaddw %0, %1, %2" + "vaddw\\t%0,%1,2" [(set_attr "type" "simd_varith_1cycle") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -293,9 +260,10 @@ (define_insn "vavb_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VAVB))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VAVB))] "TARGET_SIMD_SET" - "vavb %0, %1, %2" + "vavb\\t%0,%1,%2" [(set_attr "type" "simd_varith_1cycle") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -303,9 +271,10 @@ (define_insn "vavrb_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VAVRB))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VAVRB))] "TARGET_SIMD_SET" - "vavrb %0, %1, %2" + "vavrb\\t%0,%1,%2" [(set_attr "type" "simd_varith_1cycle") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -313,9 +282,10 @@ (define_insn "vdifaw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VDIFAW))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VDIFAW))] "TARGET_SIMD_SET" - "vdifaw %0, %1, %2" + "vdifaw\\t%0,%1,%2" [(set_attr "type" "simd_varith_with_acc") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -323,9 +293,10 @@ (define_insn "vdifw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VDIFW))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VDIFW))] "TARGET_SIMD_SET" - "vdifw %0, %1, %2" + "vdifw\\t%0,%1,%2" [(set_attr "type" "simd_varith_1cycle") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -333,9 +304,10 @@ (define_insn "vmaxaw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VMAXAW))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VMAXAW))] "TARGET_SIMD_SET" - "vmaxaw %0, %1, %2" + "vmaxaw\\t%0,%1,2" [(set_attr "type" "simd_varith_with_acc") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -343,9 +315,10 @@ (define_insn "vmaxw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VMAXW))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VMAXW))] "TARGET_SIMD_SET" - "vmaxw %0, %1, %2" + "vmaxw\\t%0,%1,%2" [(set_attr "type" "simd_varith_1cycle") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -353,9 
+326,10 @@ (define_insn "vminaw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VMINAW))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VMINAW))] "TARGET_SIMD_SET" - "vminaw %0, %1, %2" + "vminaw\\t%0,%1,%2" [(set_attr "type" "simd_varith_with_acc") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -363,9 +337,10 @@ (define_insn "vminw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VMINW))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VMINW))] "TARGET_SIMD_SET" - "vminw %0, %1, %2" + "vminw\\t%0,%1,%2" [(set_attr "type" "simd_varith_1cycle") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -373,9 +348,10 @@ (define_insn "vmulaw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VMULAW))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VMULAW))] "TARGET_SIMD_SET" - "vmulaw %0, %1, %2" + "vmulaw\\t%0,%1,%2" [(set_attr "type" "simd_varith_with_acc") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -383,9 +359,10 @@ (define_insn "vmulfaw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VMULFAW))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VMULFAW))] "TARGET_SIMD_SET" - "vmulfaw %0, %1, %2" + "vmulfaw\\t%0,%1,%2" [(set_attr "type" "simd_varith_with_acc") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -393,9 +370,10 @@ (define_insn "vmulfw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VMULFW))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VMULFW))] "TARGET_SIMD_SET" - "vmulfw %0, %1, %2" + "vmulfw\\t%0,%1,%2" [(set_attr "type" "simd_varith_2cycle") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -403,9 +381,10 @@ (define_insn "vmulw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VMULW))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VMULW))] "TARGET_SIMD_SET" - "vmulw %0, %1, %2" + "vmulw\\t%0,%1,%2" [(set_attr "type" "simd_varith_2cycle") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -413,9 +392,10 @@ (define_insn "vsubaw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VSUBAW))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VSUBAW))] "TARGET_SIMD_SET" - "vsubaw %0, %1, %2" + "vsubaw\\t%0,%1,%2" [(set_attr "type" "simd_varith_with_acc") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -423,9 +403,10 @@ (define_insn "vsubw_insn" [(set (match_operand:V8HI 0 
"vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VSUBW))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VSUBW))] "TARGET_SIMD_SET" - "vsubw %0, %1, %2" + "vsubw\\t%0,%1,%2" [(set_attr "type" "simd_varith_1cycle") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -433,9 +414,10 @@ (define_insn "vsummw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VSUMMW))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VSUMMW))] "TARGET_SIMD_SET" - "vsummw %0, %1, %2" + "vsummw\\t%0,%1,%2" [(set_attr "type" "simd_varith_2cycle") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -443,9 +425,10 @@ (define_insn "vand_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VAND))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VAND))] "TARGET_SIMD_SET" - "vand %0, %1, %2" + "vand\\t%0,%1,%2" [(set_attr "type" "simd_vlogic") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -453,9 +436,10 @@ (define_insn "vandaw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VANDAW))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VANDAW))] "TARGET_SIMD_SET" - "vandaw %0, %1, %2" + "vandaw\\t%0,%1,%2" [(set_attr "type" "simd_vlogic_with_acc") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -463,9 +447,10 @@ (define_insn "vbic_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VBIC))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VBIC))] "TARGET_SIMD_SET" - "vbic %0, %1, %2" + "vbic\\t%0,%1,%2" [(set_attr "type" "simd_vlogic") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -473,9 +458,10 @@ (define_insn "vbicaw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VBICAW))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VBICAW))] "TARGET_SIMD_SET" - "vbicaw %0, %1, %2" + "vbicaw\\t%0,%1,%2" [(set_attr "type" "simd_vlogic_with_acc") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -483,9 +469,10 @@ (define_insn "vor_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VOR))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VOR))] "TARGET_SIMD_SET" - "vor %0, %1, %2" + "vor\\t%0,%1,%2" [(set_attr "type" "simd_vlogic") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -493,9 +480,10 @@ (define_insn "vxor_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 
"vector_register_operand" "v")] UNSPEC_ARC_SIMD_VXOR))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VXOR))] "TARGET_SIMD_SET" - "vxor %0, %1, %2" + "vxor\\t%0,%1,%2" [(set_attr "type" "simd_vlogic") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -503,9 +491,10 @@ (define_insn "vxoraw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VXORAW))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VXORAW))] "TARGET_SIMD_SET" - "vxoraw %0, %1, %2" + "vxoraw\\t%0,%1,%2" [(set_attr "type" "simd_vlogic_with_acc") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -513,9 +502,10 @@ (define_insn "veqw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VEQW))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VEQW))] "TARGET_SIMD_SET" - "veqw %0, %1, %2" + "veqw\\t%0,%1,%2" [(set_attr "type" "simd_vcompare") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -523,9 +513,10 @@ (define_insn "vlew_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VLEW))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VLEW))] "TARGET_SIMD_SET" - "vlew %0, %1, %2" + "vlew\\t%0,%1,%2" [(set_attr "type" "simd_vcompare") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -533,9 +524,10 @@ (define_insn "vltw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VLTW))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VLTW))] "TARGET_SIMD_SET" - "vltw %0, %1, %2" + "vltw\\t%0,%1,%2" [(set_attr "type" "simd_vcompare") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -543,9 +535,10 @@ (define_insn "vnew_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VNEW))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VNEW))] "TARGET_SIMD_SET" - "vnew %0, %1, %2" + "vnew\\t%0,%1,%2" [(set_attr "type" "simd_vcompare") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -553,9 +546,10 @@ (define_insn "vmr1aw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VMR1AW))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VMR1AW))] "TARGET_SIMD_SET" - "vmr1aw %0, %1, %2" + "vmr1aw\\t%0,%1,%2" [(set_attr "type" "simd_valign_with_acc") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -563,9 +557,10 @@ (define_insn "vmr1w_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VMR1W))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VMR1W))] 
"TARGET_SIMD_SET" - "vmr1w %0, %1, %2" + "vmr1w\\t%0,%1,%2" [(set_attr "type" "simd_valign") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -573,9 +568,10 @@ (define_insn "vmr2aw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VMR2AW))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VMR2AW))] "TARGET_SIMD_SET" - "vmr2aw %0, %1, %2" + "vmr2aw\\t%0,%1,%2" [(set_attr "type" "simd_valign_with_acc") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -583,9 +579,10 @@ (define_insn "vmr2w_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VMR2W))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VMR2W))] "TARGET_SIMD_SET" - "vmr2w %0, %1, %2" + "vmr2w\\t%0,%1,%2" [(set_attr "type" "simd_valign") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -593,9 +590,10 @@ (define_insn "vmr3aw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VMR3AW))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VMR3AW))] "TARGET_SIMD_SET" - "vmr3aw %0, %1, %2" + "vmr3aw\\t%0,%1,%2" [(set_attr "type" "simd_valign_with_acc") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -603,9 +601,10 @@ (define_insn "vmr3w_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VMR3W))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VMR3W))] "TARGET_SIMD_SET" - "vmr3w %0, %1, %2" + "vmr3w\\t%0,%1,%2" [(set_attr "type" "simd_valign") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -613,9 +612,10 @@ (define_insn "vmr4aw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VMR4AW))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VMR4AW))] "TARGET_SIMD_SET" - "vmr4aw %0, %1, %2" + "vmr4aw\\t%0,%1,%2" [(set_attr "type" "simd_valign_with_acc") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -623,9 +623,10 @@ (define_insn "vmr4w_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VMR4W))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VMR4W))] "TARGET_SIMD_SET" - "vmr4w %0, %1, %2" + "vmr4w\\t%0,%1,%2" [(set_attr "type" "simd_valign") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -633,9 +634,10 @@ (define_insn "vmr5aw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VMR5AW))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VMR5AW))] "TARGET_SIMD_SET" - "vmr5aw %0, %1, %2" + "vmr5aw\\t%0,%1,%2" [(set_attr "type" "simd_valign_with_acc") (set_attr 
"length" "4") (set_attr "cond" "nocond")]) @@ -643,9 +645,10 @@ (define_insn "vmr5w_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VMR5W))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VMR5W))] "TARGET_SIMD_SET" - "vmr5w %0, %1, %2" + "vmr5w\\t%0,%1,%2" [(set_attr "type" "simd_valign") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -653,9 +656,10 @@ (define_insn "vmr6aw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VMR6AW))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VMR6AW))] "TARGET_SIMD_SET" - "vmr6aw %0, %1, %2" + "vmr6aw\\t%0,%1,%2" [(set_attr "type" "simd_valign_with_acc") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -663,9 +667,10 @@ (define_insn "vmr6w_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VMR6W))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VMR6W))] "TARGET_SIMD_SET" - "vmr6w %0, %1, %2" + "vmr6w\\t%0,%1,%2" [(set_attr "type" "simd_valign") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -673,9 +678,10 @@ (define_insn "vmr7aw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VMR7AW))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VMR7AW))] "TARGET_SIMD_SET" - "vmr7aw %0, %1, %2" + "vmr7aw\\t%0,%1,%2" [(set_attr "type" "simd_valign_with_acc") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -683,9 +689,10 @@ (define_insn "vmr7w_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VMR7W))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VMR7W))] "TARGET_SIMD_SET" - "vmr7w %0, %1, %2" + "vmr7w\\t%0,%1,%2" [(set_attr "type" "simd_valign") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -693,9 +700,10 @@ (define_insn "vmrb_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VMRB))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VMRB))] "TARGET_SIMD_SET" - "vmrb %0, %1, %2" + "vmrb\\t%0,%1,%2" [(set_attr "type" "simd_valign") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -703,9 +711,10 @@ (define_insn "vh264f_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VH264F))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VH264F))] "TARGET_SIMD_SET" - "vh264f %0, %1, %2" + "vh264f\\t%0,%1,%2" [(set_attr "type" "simd_vspecial_3cycle") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -713,9 +722,10 @@ (define_insn "vh264ft_insn" [(set (match_operand:V8HI 0 
"vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VH264FT))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VH264FT))] "TARGET_SIMD_SET" - "vh264ft %0, %1, %2" + "vh264ft\\t%0,%1,%2" [(set_attr "type" "simd_vspecial_3cycle") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -723,9 +733,10 @@ (define_insn "vh264fw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VH264FW))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VH264FW))] "TARGET_SIMD_SET" - "vh264fw %0, %1, %2" + "vh264fw\\t%0,%1,%2" [(set_attr "type" "simd_vspecial_3cycle") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -733,9 +744,10 @@ (define_insn "vvc1f_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VVC1F))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VVC1F))] "TARGET_SIMD_SET" - "vvc1f %0, %1, %2" + "vvc1f\\t%0,%1,%2" [(set_attr "type" "simd_vspecial_3cycle") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -743,9 +755,10 @@ (define_insn "vvc1ft_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:V8HI 2 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VVC1FT))] + (match_operand:V8HI 2 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VVC1FT))] "TARGET_SIMD_SET" - "vvc1ft %0, %1, %2" + "vvc1ft\\t%0,%1,%2" [(set_attr "type" "simd_vspecial_3cycle") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -754,22 +767,13 @@ ;;--- ;; V V r/limm Insns - -;; (define_insn "vbaddw_insn" -;; [(set (match_operand:V8HI 0 "vector_register_operand" "=v") -;; (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") -;; (match_operand:SI 2 "nonmemory_operand" "rCal")] UNSPEC_ARC_SIMD_VBADDW))] -;; "TARGET_SIMD_SET" -;; "vbaddw %0, %1, %2" -;; [(set_attr "length" "4") -;; (set_attr "cond" "nocond")]) - (define_insn "vbaddw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:SI 2 "nonmemory_operand" "r")] UNSPEC_ARC_SIMD_VBADDW))] + (match_operand:SI 2 "nonmemory_operand" "r")] + UNSPEC_ARC_SIMD_VBADDW))] "TARGET_SIMD_SET" - "vbaddw %0, %1, %2" + "vbaddw\\t%0,%1,%2" [(set_attr "type" "simd_varith_1cycle") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -777,9 +781,10 @@ (define_insn "vbmaxw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:SI 2 "nonmemory_operand" "r")] UNSPEC_ARC_SIMD_VBMAXW))] + (match_operand:SI 2 "nonmemory_operand" "r")] + UNSPEC_ARC_SIMD_VBMAXW))] "TARGET_SIMD_SET" - "vbmaxw %0, %1, %2" + "vbmaxw\\t%0,%1,%2" [(set_attr "type" "simd_varith_1cycle") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -787,9 +792,10 @@ (define_insn "vbminw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:SI 2 "nonmemory_operand" "r")] UNSPEC_ARC_SIMD_VBMINW))] + (match_operand:SI 2 "nonmemory_operand" 
"r")] + UNSPEC_ARC_SIMD_VBMINW))] "TARGET_SIMD_SET" - "vbminw %0, %1, %2" + "vbminw\\t%0,%1,%2" [(set_attr "type" "simd_varith_1cycle") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -797,9 +803,10 @@ (define_insn "vbmulaw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:SI 2 "nonmemory_operand" "r")] UNSPEC_ARC_SIMD_VBMULAW))] + (match_operand:SI 2 "nonmemory_operand" "r")] + UNSPEC_ARC_SIMD_VBMULAW))] "TARGET_SIMD_SET" - "vbmulaw %0, %1, %2" + "vbmulaw\\t%0,%1,%2" [(set_attr "type" "simd_varith_with_acc") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -807,9 +814,10 @@ (define_insn "vbmulfw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:SI 2 "nonmemory_operand" "r")] UNSPEC_ARC_SIMD_VBMULFW))] + (match_operand:SI 2 "nonmemory_operand" "r")] + UNSPEC_ARC_SIMD_VBMULFW))] "TARGET_SIMD_SET" - "vbmulfw %0, %1, %2" + "vbmulfw\\t%0,%1,%2" [(set_attr "type" "simd_varith_2cycle") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -817,9 +825,10 @@ (define_insn "vbmulw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:SI 2 "nonmemory_operand" "r")] UNSPEC_ARC_SIMD_VBMULW))] + (match_operand:SI 2 "nonmemory_operand" "r")] + UNSPEC_ARC_SIMD_VBMULW))] "TARGET_SIMD_SET" - "vbmulw %0, %1, %2" + "vbmulw\\t%0,%1,%2" [(set_attr "type" "simd_varith_2cycle") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -827,9 +836,10 @@ (define_insn "vbrsubw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:SI 2 "nonmemory_operand" "r")] UNSPEC_ARC_SIMD_VBRSUBW))] + (match_operand:SI 2 "nonmemory_operand" "r")] + UNSPEC_ARC_SIMD_VBRSUBW))] "TARGET_SIMD_SET" - "vbrsubw %0, %1, %2" + "vbrsubw\\t%0,%1,%2" [(set_attr "type" "simd_varith_1cycle") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -837,9 +847,10 @@ (define_insn "vbsubw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:SI 2 "nonmemory_operand" "r")] UNSPEC_ARC_SIMD_VBSUBW))] + (match_operand:SI 2 "nonmemory_operand" "r")] + UNSPEC_ARC_SIMD_VBSUBW))] "TARGET_SIMD_SET" - "vbsubw %0, %1, %2" + "vbsubw\\t%0,%1,%2" [(set_attr "type" "simd_varith_1cycle") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -849,9 +860,10 @@ (define_insn "vasrrwi_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:SI 2 "immediate_operand" "L")] UNSPEC_ARC_SIMD_VASRRWi))] + (match_operand:SI 2 "immediate_operand" "L")] + UNSPEC_ARC_SIMD_VASRRWi))] "TARGET_SIMD_SET" - "vasrrwi %0, %1, %2" + "vasrrwi\\t%0,%1,%2" [(set_attr "type" "simd_varith_2cycle") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -859,9 +871,10 @@ (define_insn "vasrsrwi_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:SI 2 "immediate_operand" "L")] UNSPEC_ARC_SIMD_VASRSRWi))] + (match_operand:SI 2 "immediate_operand" "L")] + UNSPEC_ARC_SIMD_VASRSRWi))] "TARGET_SIMD_SET" - "vasrsrwi %0, %1, %2" + "vasrsrwi\\t%0,%1,%2" [(set_attr "type" "simd_varith_2cycle") (set_attr "length" "4") 
(set_attr "cond" "nocond")]) @@ -869,9 +882,10 @@ (define_insn "vasrwi_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:SI 2 "immediate_operand" "L")] UNSPEC_ARC_SIMD_VASRWi))] + (match_operand:SI 2 "immediate_operand" "L")] + UNSPEC_ARC_SIMD_VASRWi))] "TARGET_SIMD_SET" - "vasrwi %0, %1, %2" + "vasrwi\\t%0,%1,%2" [(set_attr "type" "simd_varith_1cycle") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -879,9 +893,10 @@ (define_insn "vasrpwbi_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:SI 2 "immediate_operand" "L")] UNSPEC_ARC_SIMD_VASRPWBi))] + (match_operand:SI 2 "immediate_operand" "L")] + UNSPEC_ARC_SIMD_VASRPWBi))] "TARGET_SIMD_SET" - "vasrpwbi %0, %1, %2" + "vasrpwbi\\t%0,%1,%2" [(set_attr "type" "simd_vpack") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -889,9 +904,10 @@ (define_insn "vasrrpwbi_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:SI 2 "immediate_operand" "L")] UNSPEC_ARC_SIMD_VASRRPWBi))] + (match_operand:SI 2 "immediate_operand" "L")] + UNSPEC_ARC_SIMD_VASRRPWBi))] "TARGET_SIMD_SET" - "vasrrpwbi %0, %1, %2" + "vasrrpwbi\\t%0,%1,%2" [(set_attr "type" "simd_vpack") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -899,9 +915,10 @@ (define_insn "vsr8awi_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:SI 2 "immediate_operand" "L")] UNSPEC_ARC_SIMD_VSR8AWi))] + (match_operand:SI 2 "immediate_operand" "L")] + UNSPEC_ARC_SIMD_VSR8AWi))] "TARGET_SIMD_SET" - "vsr8awi %0, %1, %2" + "vsr8awi\\t%0,%1,%2" [(set_attr "type" "simd_valign_with_acc") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -909,9 +926,10 @@ (define_insn "vsr8i_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:SI 2 "immediate_operand" "L")] UNSPEC_ARC_SIMD_VSR8i))] + (match_operand:SI 2 "immediate_operand" "L")] + UNSPEC_ARC_SIMD_VSR8i))] "TARGET_SIMD_SET" - "vsr8i %0, %1, %2" + "vsr8i\\t%0,%1,%2" [(set_attr "type" "simd_valign") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -921,9 +939,10 @@ (define_insn "vmvaw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:SI 2 "immediate_operand" "P")] UNSPEC_ARC_SIMD_VMVAW))] + (match_operand:SI 2 "immediate_operand" "P")] + UNSPEC_ARC_SIMD_VMVAW))] "TARGET_SIMD_SET" - "vmvaw %0, %1, %2" + "vmvaw\\t%0,%1,%2" [(set_attr "type" "simd_vmove_with_acc") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -931,9 +950,10 @@ (define_insn "vmvw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:SI 2 "immediate_operand" "P")] UNSPEC_ARC_SIMD_VMVW))] + (match_operand:SI 2 "immediate_operand" "P")] + UNSPEC_ARC_SIMD_VMVW))] "TARGET_SIMD_SET" - "vmvw %0, %1, %2" + "vmvw\\t%0,%1,%2" [(set_attr "type" "simd_vmove") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -941,9 +961,10 @@ (define_insn "vmvzw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - 
(match_operand:SI 2 "immediate_operand" "P")] UNSPEC_ARC_SIMD_VMVZW))] + (match_operand:SI 2 "immediate_operand" "P")] + UNSPEC_ARC_SIMD_VMVZW))] "TARGET_SIMD_SET" - "vmvzw %0, %1, %2" + "vmvzw\\t%0,%1,%2" [(set_attr "type" "simd_vmove_else_zero") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -951,9 +972,10 @@ (define_insn "vd6tapf_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") - (match_operand:SI 2 "immediate_operand" "P")] UNSPEC_ARC_SIMD_VD6TAPF))] + (match_operand:SI 2 "immediate_operand" "P")] + UNSPEC_ARC_SIMD_VD6TAPF))] "TARGET_SIMD_SET" - "vd6tapf %0, %1, %2" + "vd6tapf\\t%0,%1,%2" [(set_attr "type" "simd_vspecial_4cycle") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -962,9 +984,10 @@ (define_insn "vmovaw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:SI 1 "nonmemory_operand" "r") - (match_operand:SI 2 "immediate_operand" "P")] UNSPEC_ARC_SIMD_VMOVAW))] + (match_operand:SI 2 "immediate_operand" "P")] + UNSPEC_ARC_SIMD_VMOVAW))] "TARGET_SIMD_SET" - "vmovaw %0, %1, %2" + "vmovaw\\t%0,%1,%2" [(set_attr "type" "simd_vmove_with_acc") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -972,9 +995,10 @@ (define_insn "vmovw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:SI 1 "nonmemory_operand" "r") - (match_operand:SI 2 "immediate_operand" "P")] UNSPEC_ARC_SIMD_VMOVW))] + (match_operand:SI 2 "immediate_operand" "P")] + UNSPEC_ARC_SIMD_VMOVW))] "TARGET_SIMD_SET" - "vmovw %0, %1, %2" + "vmovw\\t%0,%1,%2" [(set_attr "type" "simd_vmove") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -982,9 +1006,10 @@ (define_insn "vmovzw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:SI 1 "nonmemory_operand" "r") - (match_operand:SI 2 "immediate_operand" "P")] UNSPEC_ARC_SIMD_VMOVZW))] + (match_operand:SI 2 "immediate_operand" "P")] + UNSPEC_ARC_SIMD_VMOVZW))] "TARGET_SIMD_SET" - "vmovzw %0, %1, %2" + "vmovzw\\t%0,%1,%2" [(set_attr "type" "simd_vmove_else_zero") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -994,9 +1019,10 @@ [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") (match_operand:SI 2 "immediate_operand" "K") - (match_operand:V8HI 3 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VSR8))] + (match_operand:V8HI 3 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VSR8))] "TARGET_SIMD_SET" - "vsr8 %0, %1, i%2" + "vsr8\\t%0,%1,i%2" [(set_attr "type" "simd_valign") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -1005,9 +1031,10 @@ [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") (match_operand:SI 2 "immediate_operand" "K") - (match_operand:V8HI 3 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VASRW))] + (match_operand:V8HI 3 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VASRW))] "TARGET_SIMD_SET" - "vasrw %0, %1, i%2" + "vasrw\\t%0,%1,i%2" [(set_attr "type" "simd_varith_1cycle") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -1016,9 +1043,10 @@ [(set (match_operand:V8HI 0 "vector_register_operand" "=v") (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v") (match_operand:SI 2 "immediate_operand" "K") - (match_operand:V8HI 3 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VSR8AW))] + (match_operand:V8HI 3 "vector_register_operand" "v")] + 
UNSPEC_ARC_SIMD_VSR8AW))] "TARGET_SIMD_SET" - "vsr8aw %0, %1, i%2" + "vsr8aw\\t%0,%1,i%2" [(set_attr "type" "simd_valign_with_acc") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -1026,99 +1054,110 @@ ;; Va, Vb insns (define_insn "vabsaw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") - (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VABSAW))] + (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VABSAW))] "TARGET_SIMD_SET" - "vabsaw %0, %1" + "vabsaw\\t%0,%1" [(set_attr "type" "simd_varith_with_acc") (set_attr "length" "4") (set_attr "cond" "nocond")]) (define_insn "vabsw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") - (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VABSW))] + (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VABSW))] "TARGET_SIMD_SET" - "vabsw %0, %1" + "vabsw\\t%0,%1" [(set_attr "type" "simd_varith_1cycle") (set_attr "length" "4") (set_attr "cond" "nocond")]) (define_insn "vaddsuw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") - (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VADDSUW))] + (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VADDSUW))] "TARGET_SIMD_SET" - "vaddsuw %0, %1" + "vaddsuw\\t%0,%1" [(set_attr "type" "simd_varith_1cycle") (set_attr "length" "4") (set_attr "cond" "nocond")]) (define_insn "vsignw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") - (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VSIGNW))] + (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VSIGNW))] "TARGET_SIMD_SET" - "vsignw %0, %1" + "vsignw\\t%0,%1" [(set_attr "type" "simd_varith_1cycle") (set_attr "length" "4") (set_attr "cond" "nocond")]) (define_insn "vexch1_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") - (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VEXCH1))] + (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VEXCH1))] "TARGET_SIMD_SET" - "vexch1 %0, %1" + "vexch1\\t%0,%1" [(set_attr "type" "simd_vpermute") (set_attr "length" "4") (set_attr "cond" "nocond")]) (define_insn "vexch2_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") - (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VEXCH2))] + (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VEXCH2))] "TARGET_SIMD_SET" - "vexch2 %0, %1" + "vexch2\\t%0,%1" [(set_attr "type" "simd_vpermute") (set_attr "length" "4") (set_attr "cond" "nocond")]) (define_insn "vexch4_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") - (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VEXCH4))] + (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VEXCH4))] "TARGET_SIMD_SET" - "vexch4 %0, %1" + "vexch4\\t%0,%1" [(set_attr "type" "simd_vpermute") (set_attr "length" "4") (set_attr "cond" "nocond")]) (define_insn "vupbaw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") - (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VUPBAW))] + (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VUPBAW))] "TARGET_SIMD_SET" - "vupbaw %0, %1" + "vupbaw\\t%0,%1" 
[(set_attr "type" "simd_vpack_with_acc") (set_attr "length" "4") (set_attr "cond" "nocond")]) (define_insn "vupbw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") - (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VUPBW))] + (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VUPBW))] "TARGET_SIMD_SET" - "vupbw %0, %1" + "vupbw\\t%0,%1" [(set_attr "type" "simd_vpack") (set_attr "length" "4") (set_attr "cond" "nocond")]) (define_insn "vupsbaw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") - (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VUPSBAW))] + (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VUPSBAW))] "TARGET_SIMD_SET" - "vupsbaw %0, %1" + "vupsbaw\\t%0,%1" [(set_attr "type" "simd_vpack_with_acc") (set_attr "length" "4") (set_attr "cond" "nocond")]) (define_insn "vupsbw_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") - (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VUPSBW))] + (unspec:V8HI [(match_operand:V8HI 1 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VUPSBW))] "TARGET_SIMD_SET" - "vupsbw %0, %1" + "vupsbw\\t%0,%1" [(set_attr "type" "simd_vpack") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -1127,9 +1166,10 @@ (define_insn "vdirun_insn" [(set (match_operand:SI 0 "arc_simd_dma_register_operand" "=d") (unspec_volatile:SI [(match_operand:SI 1 "nonmemory_operand" "r") - (match_operand:SI 2 "nonmemory_operand" "r")] UNSPEC_ARC_SIMD_VDIRUN))] + (match_operand:SI 2 "nonmemory_operand" "r")] + UNSPEC_ARC_SIMD_VDIRUN))] "TARGET_SIMD_SET" - "vdirun %1, %2" + "vdirun\\t%1,%2" [(set_attr "type" "simd_dma") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -1137,60 +1177,67 @@ (define_insn "vdorun_insn" [(set (match_operand:SI 0 "arc_simd_dma_register_operand" "=d") (unspec_volatile:SI [(match_operand:SI 1 "nonmemory_operand" "r") - (match_operand:SI 2 "nonmemory_operand" "r")] UNSPEC_ARC_SIMD_VDORUN))] + (match_operand:SI 2 "nonmemory_operand" "r")] + UNSPEC_ARC_SIMD_VDORUN))] "TARGET_SIMD_SET" - "vdorun %1, %2" + "vdorun\\t%1,%2" [(set_attr "type" "simd_dma") (set_attr "length" "4") (set_attr "cond" "nocond")]) (define_insn "vdiwr_insn" [(set (match_operand:SI 0 "arc_simd_dma_register_operand" "=d,d") - (unspec_volatile:SI [(match_operand:SI 1 "nonmemory_operand" "r,Cal")] UNSPEC_ARC_SIMD_VDIWR))] + (unspec_volatile:SI [(match_operand:SI 1 "nonmemory_operand" "r,Cal")] + UNSPEC_ARC_SIMD_VDIWR))] "TARGET_SIMD_SET" - "vdiwr %0, %1" + "vdiwr\\t%0,%1" [(set_attr "type" "simd_dma") (set_attr "length" "4,8") (set_attr "cond" "nocond,nocond")]) (define_insn "vdowr_insn" [(set (match_operand:SI 0 "arc_simd_dma_register_operand" "=d,d") - (unspec_volatile:SI [(match_operand:SI 1 "nonmemory_operand" "r,Cal")] UNSPEC_ARC_SIMD_VDOWR))] + (unspec_volatile:SI [(match_operand:SI 1 "nonmemory_operand" "r,Cal")] + UNSPEC_ARC_SIMD_VDOWR))] "TARGET_SIMD_SET" - "vdowr %0, %1" + "vdowr\\t%0,%1" [(set_attr "type" "simd_dma") (set_attr "length" "4,8") (set_attr "cond" "nocond,nocond")]) ;; vector record and run instructions (define_insn "vrec_insn" - [(unspec_volatile [(match_operand:SI 0 "nonmemory_operand" "r")] UNSPEC_ARC_SIMD_VREC)] + [(unspec_volatile [(match_operand:SI 0 "nonmemory_operand" "r")] + UNSPEC_ARC_SIMD_VREC)] "TARGET_SIMD_SET" - "vrec %0" + "vrec\\t%0" [(set_attr "type" "simd_vcontrol") (set_attr "length" "4") (set_attr "cond" "nocond")]) 
(define_insn "vrun_insn" - [(unspec_volatile [(match_operand:SI 0 "nonmemory_operand" "r")] UNSPEC_ARC_SIMD_VRUN)] + [(unspec_volatile [(match_operand:SI 0 "nonmemory_operand" "r")] + UNSPEC_ARC_SIMD_VRUN)] "TARGET_SIMD_SET" - "vrun %0" + "vrun\\t%0" [(set_attr "type" "simd_vcontrol") (set_attr "length" "4") (set_attr "cond" "nocond")]) (define_insn "vrecrun_insn" - [(unspec_volatile [(match_operand:SI 0 "nonmemory_operand" "r")] UNSPEC_ARC_SIMD_VRECRUN)] + [(unspec_volatile [(match_operand:SI 0 "nonmemory_operand" "r")] + UNSPEC_ARC_SIMD_VRECRUN)] "TARGET_SIMD_SET" - "vrecrun %0" + "vrecrun\\t%0" [(set_attr "type" "simd_vcontrol") (set_attr "length" "4") (set_attr "cond" "nocond")]) (define_insn "vendrec_insn" - [(unspec_volatile [(match_operand:SI 0 "nonmemory_operand" "r")] UNSPEC_ARC_SIMD_VENDREC)] + [(unspec_volatile [(match_operand:SI 0 "nonmemory_operand" "r")] + UNSPEC_ARC_SIMD_VENDREC)] "TARGET_SIMD_SET" - "vendrec %0" + "vendrec\\t%0" [(set_attr "type" "simd_vcontrol") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -1211,7 +1258,7 @@ (parallel [(const_int 0) (const_int 1) (const_int 2) (const_int 3)]) )))] "TARGET_SIMD_SET" - "vld32wh %0, [i%3,%1]" + "vld32wh\\t%0,[i%3,%1]" [(set_attr "type" "simd_vload") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -1232,18 +1279,22 @@ [(match_operand:SI 3 "immediate_operand" "L")])) ))))))] "TARGET_SIMD_SET" - "vld32wl %0, [i%3,%1]" + "vld32wl\\t%0,[i%3,%1]" [(set_attr "type" "simd_vload") (set_attr "length" "4") (set_attr "cond" "nocond")]) (define_insn "vld64w_insn" [(set (match_operand:V8HI 0 "vector_register_operand" "=v") - (zero_extend:V8HI (mem:V4HI (plus:SI (zero_extend:SI (vec_select:HI (match_operand:V8HI 1 "vector_register_operand" "v") - (parallel [(match_operand:SI 2 "immediate_operand" "L")]))) - (match_operand:SI 3 "immediate_operand" "P")))))] + (zero_extend:V8HI + (mem:V4HI + (plus:SI + (zero_extend:SI + (vec_select:HI (match_operand:V8HI 1 "vector_register_operand" "v") + (parallel [(match_operand:SI 2 "immediate_operand" "L")]))) + (match_operand:SI 3 "immediate_operand" "P")))))] "TARGET_SIMD_SET" - "vld64w %0, [i%2, %3]" + "vld64w\\t%0,[i%2,%3]" [(set_attr "type" "simd_vload") (set_attr "length" "4") (set_attr "cond" "nocond")] @@ -1264,7 +1315,7 @@ (parallel [(match_operand:SI 3 "immediate_operand" "L")])) )))))] "TARGET_SIMD_SET" - "vld64 %0, [i%3,%1]" + "vld64\\t%0,[i%3,%1]" [(set_attr "type" "simd_vload") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -1287,40 +1338,48 @@ (match_operand:V8HI 2 "vector_register_operand" "v") (parallel [(match_operand:SI 3 "immediate_operand" "L")]))))))))] "TARGET_SIMD_SET" - "vld32 %0, [i%3,%1]" + "vld32\\t%0,[i%3,%1]" [(set_attr "type" "simd_vload") (set_attr "length" "4") (set_attr "cond" "nocond")]) (define_insn "vst16_n_insn" - [(set (mem:HI (plus:SI (match_operand:SI 0 "immediate_operand" "P") - (zero_extend: SI (vec_select:HI (match_operand:V8HI 1 "vector_register_operand" "v") - (parallel [(match_operand:SI 2 "immediate_operand" "L")]))))) + [(set (mem:HI + (plus:SI + (match_operand:SI 0 "immediate_operand" "P") + (zero_extend:SI + (vec_select:HI (match_operand:V8HI 1 "vector_register_operand" "v") + (parallel [(match_operand:SI 2 "immediate_operand" "L")]))))) (vec_select:HI (match_operand:V8HI 3 "vector_register_operand" "v") (parallel [(match_operand:SI 4 "immediate_operand" "L")])))] "TARGET_SIMD_SET" - "vst16_%4 %3,[i%2, %0]" + "vst16_%4\\t%3,[i%2,%0]" [(set_attr "type" "simd_vstore") (set_attr "length" "4") (set_attr "cond" "nocond")]) 
(define_insn "vst32_n_insn" - [(set (mem:SI (plus:SI (match_operand:SI 0 "immediate_operand" "P") - (zero_extend: SI (vec_select:HI (match_operand:V8HI 1 "vector_register_operand" "v") - (parallel [(match_operand:SI 2 "immediate_operand" "L")]))))) - (vec_select:SI (unspec:V4SI [(match_operand:V8HI 3 "vector_register_operand" "v")] UNSPEC_ARC_SIMD_VCAST) - (parallel [(match_operand:SI 4 "immediate_operand" "L")])))] + [(set (mem:SI + (plus:SI + (match_operand:SI 0 "immediate_operand" "P") + (zero_extend:SI + (vec_select:HI (match_operand:V8HI 1 "vector_register_operand" "v") + (parallel [(match_operand:SI 2 "immediate_operand" "L")]))))) + (vec_select:SI (unspec:V4SI [(match_operand:V8HI 3 "vector_register_operand" "v")] + UNSPEC_ARC_SIMD_VCAST) + (parallel [(match_operand:SI 4 "immediate_operand" "L")])))] "TARGET_SIMD_SET" - "vst32_%4 %3,[i%2, %0]" + "vst32_%4\\t%3,[i%2,%0]" [(set_attr "type" "simd_vstore") (set_attr "length" "4") (set_attr "cond" "nocond")]) ;; SIMD unit interrupt (define_insn "vinti_insn" - [(unspec_volatile [(match_operand:SI 0 "nonmemory_operand" "L")] UNSPEC_ARC_SIMD_VINTI)] + [(unspec_volatile [(match_operand:SI 0 "nonmemory_operand" "L")] + UNSPEC_ARC_SIMD_VINTI)] "TARGET_SIMD_SET" - "vinti %0" + "vinti\\t%0" [(set_attr "type" "simd_vcontrol") (set_attr "length" "4") (set_attr "cond" "nocond")]) @@ -1357,14 +1416,14 @@ (define_insn_and_split "*movv2hi_insn" [(set (match_operand:V2HI 0 "move_dest_operand" "=r,r,r,m") - (match_operand:V2HI 1 "general_operand" "i,r,m,r"))] + (match_operand:V2HI 1 "general_operand" "i,r,m,r"))] "(register_operand (operands[0], V2HImode) || register_operand (operands[1], V2HImode))" "@ # - mov%? %0, %1 - ld%U1%V1 %0,%1 - st%U0%V0 %1,%0" + mov%?\\t%0,%1 + ld%U1%V1\\t%0,%1 + st%U0%V0\\t%1,%0" "reload_completed && GET_CODE (operands[1]) == CONST_VECTOR" [(set (match_dup 0) (match_dup 2))] { @@ -1380,8 +1439,8 @@ ]) (define_expand "movmisalignv2hi" - [(set (match_operand:V2HI 0 "general_operand" "") - (match_operand:V2HI 1 "general_operand" ""))] + [(set (match_operand:V2HI 0 "general_operand" "") + (match_operand:V2HI 1 "general_operand" ""))] "unaligned_access" "{ if (prepare_move_operands (operands, V2HImode)) @@ -1411,19 +1470,19 @@ case 1: if (TARGET_PLUS_QMACW - && even_register_operand (operands[0], mode) + && even_register_operand (operands[0], mode) && even_register_operand (operands[1], mode)) - return \"vadd2%?\\t%0,%1,0\"; + return \"vadd2%?\\t%0,%1,0\"; return \"#\"; case 2: if (TARGET_LL64) - return \"ldd%U1%V1 %0,%1\"; + return \"ldd%U1%V1\\t%0,%1\"; return \"#\"; case 3: if (TARGET_LL64) - return \"std%U0%V0 %1,%0\"; + return \"std%U0%V0\\t%1,%0\"; return \"#\"; } }" @@ -1439,8 +1498,8 @@ ]) (define_expand "movmisalign" - [(set (match_operand:VWH 0 "general_operand" "") - (match_operand:VWH 1 "general_operand" ""))] + [(set (match_operand:VWH 0 "general_operand" "") + (match_operand:VWH 1 "general_operand" ""))] "unaligned_access" "{ if (prepare_move_operands (operands, mode)) @@ -1449,9 +1508,9 @@ (define_insn "bswapv2hi2" [(set (match_operand:V2HI 0 "register_operand" "=r,r") - (bswap:V2HI (match_operand:V2HI 1 "nonmemory_operand" "r,i")))] + (bswap:V2HI (match_operand:V2HI 1 "nonmemory_operand" "r,i")))] "TARGET_V2 && TARGET_SWAP" - "swape %0, %1" + "swape\\t%0,%1" [(set_attr "length" "4,8") (set_attr "type" "two_cycle_core")]) @@ -1461,7 +1520,7 @@ (plus:VCT (match_operand:VCT 1 "register_operand" "0,r") (match_operand:VCT 2 "register_operand" "r,r")))] "TARGET_PLUS_DMPY" - "vadd%? 
%0, %1, %2" + "vadd%?\\t%0,%1,%2" [(set_attr "length" "4") (set_attr "type" "multi") (set_attr "predicable" "yes,no") @@ -1472,7 +1531,7 @@ (minus:VCT (match_operand:VCT 1 "register_operand" "0,r") (match_operand:VCT 2 "register_operand" "r,r")))] "TARGET_PLUS_DMPY" - "vsub%? %0, %1, %2" + "vsub%?\\t%0,%1,%2" [(set_attr "length" "4") (set_attr "type" "multi") (set_attr "predicable" "yes,no") @@ -1482,14 +1541,16 @@ (define_insn "addsub3" [(set (match_operand:VDV 0 "register_operand" "=r,r") (vec_concat:VDV - (plus: (vec_select: (match_operand:VDV 1 "register_operand" "0,r") - (parallel [(const_int 0)])) - (vec_select: (match_operand:VDV 2 "register_operand" "r,r") - (parallel [(const_int 0)]))) - (minus: (vec_select: (match_dup 1) (parallel [(const_int 1)])) - (vec_select: (match_dup 2) (parallel [(const_int 1)])))))] + (plus: + (vec_select: (match_operand:VDV 1 "register_operand" "0,r") + (parallel [(const_int 0)])) + (vec_select: (match_operand:VDV 2 "register_operand" "r,r") + (parallel [(const_int 0)]))) + (minus: + (vec_select: (match_dup 1) (parallel [(const_int 1)])) + (vec_select: (match_dup 2) (parallel [(const_int 1)])))))] "TARGET_PLUS_DMPY" - "vaddsub%? %0, %1, %2" + "vaddsub%?\\t%0,%1,%2" [(set_attr "length" "4") (set_attr "type" "multi") (set_attr "predicable" "yes,no") @@ -1498,14 +1559,16 @@ (define_insn "subadd3" [(set (match_operand:VDV 0 "register_operand" "=r,r") (vec_concat:VDV - (minus: (vec_select: (match_operand:VDV 1 "register_operand" "0,r") - (parallel [(const_int 0)])) - (vec_select: (match_operand:VDV 2 "register_operand" "r,r") - (parallel [(const_int 0)]))) - (plus: (vec_select: (match_dup 1) (parallel [(const_int 1)])) - (vec_select: (match_dup 2) (parallel [(const_int 1)])))))] + (minus: + (vec_select: (match_operand:VDV 1 "register_operand" "0,r") + (parallel [(const_int 0)])) + (vec_select: (match_operand:VDV 2 "register_operand" "r,r") + (parallel [(const_int 0)]))) + (plus: + (vec_select: (match_dup 1) (parallel [(const_int 1)])) + (vec_select: (match_dup 2) (parallel [(const_int 1)])))))] "TARGET_PLUS_DMPY" - "vsubadd%? %0, %1, %2" + "vsubadd%?\\t%0,%1,%2" [(set_attr "length" "4") (set_attr "type" "multi") (set_attr "predicable" "yes,no") @@ -1515,10 +1578,11 @@ [(set (match_operand:V4HI 0 "even_register_operand" "=r,r") (vec_concat:V4HI (vec_concat:V2HI - (plus:HI (vec_select:HI (match_operand:V4HI 1 "even_register_operand" "0,r") - (parallel [(const_int 0)])) - (vec_select:HI (match_operand:V4HI 2 "even_register_operand" "r,r") - (parallel [(const_int 0)]))) + (plus:HI + (vec_select:HI (match_operand:V4HI 1 "even_register_operand" "0,r") + (parallel [(const_int 0)])) + (vec_select:HI (match_operand:V4HI 2 "even_register_operand" "r,r") + (parallel [(const_int 0)]))) (minus:HI (vec_select:HI (match_dup 1) (parallel [(const_int 1)])) (vec_select:HI (match_dup 2) (parallel [(const_int 1)])))) (vec_concat:V2HI @@ -1528,7 +1592,7 @@ (vec_select:HI (match_dup 2) (parallel [(const_int 3)])))) ))] "TARGET_PLUS_QMACW" - "vaddsub4h%? 
%0, %1, %2" + "vaddsub4h%?\\t%0,%1,%2" [(set_attr "length" "4") (set_attr "type" "multi") (set_attr "predicable" "yes,no") @@ -1538,10 +1602,11 @@ [(set (match_operand:V4HI 0 "even_register_operand" "=r,r") (vec_concat:V4HI (vec_concat:V2HI - (minus:HI (vec_select:HI (match_operand:V4HI 1 "even_register_operand" "0,r") - (parallel [(const_int 0)])) - (vec_select:HI (match_operand:V4HI 2 "even_register_operand" "r,r") - (parallel [(const_int 0)]))) + (minus:HI + (vec_select:HI (match_operand:V4HI 1 "even_register_operand" "0,r") + (parallel [(const_int 0)])) + (vec_select:HI (match_operand:V4HI 2 "even_register_operand" "r,r") + (parallel [(const_int 0)]))) (plus:HI (vec_select:HI (match_dup 1) (parallel [(const_int 1)])) (vec_select:HI (match_dup 2) (parallel [(const_int 1)])))) (vec_concat:V2HI @@ -1551,7 +1616,7 @@ (vec_select:HI (match_dup 2) (parallel [(const_int 3)])))) ))] "TARGET_PLUS_QMACW" - "vsubadd4h%? %0, %1, %2" + "vsubadd4h%?\\t%0,%1,%2" [(set_attr "length" "4") (set_attr "type" "multi") (set_attr "predicable" "yes,no") @@ -1581,7 +1646,7 @@ (SE:SI (vec_select:HI (match_dup 1) (parallel [(const_int 1)]))) (SE:SI (vec_select:HI (match_dup 2) (parallel [(const_int 1)])))))))] "TARGET_PLUS_DMPY" - "dmpyh%? %0, %1, %2" + "dmpyh%?\\t%0,%1,%2" [(set_attr "length" "4") (set_attr "type" "multi") (set_attr "predicable" "yes,no") @@ -1662,13 +1727,15 @@ (match_operand:V4HI 2 "even_register_operand" "r,r") (parallel [(const_int 0) (const_int 1)]))))) (set (reg:V2SI ARCV2_ACC) - (mult:V2SI (SE:V2SI (vec_select:V2HI (match_dup 1) - (parallel [(const_int 0) (const_int 1)]))) - (SE:V2SI (vec_select:V2HI (match_dup 2) - (parallel [(const_int 0) (const_int 1)]))))) + (mult:V2SI (SE:V2SI + (vec_select:V2HI (match_dup 1) + (parallel [(const_int 0) (const_int 1)]))) + (SE:V2SI + (vec_select:V2HI (match_dup 2) + (parallel [(const_int 0) (const_int 1)]))))) ] "TARGET_PLUS_MACD" - "vmpy2h%? %0, %1, %2" + "vmpy2h%?\\t%0,%1,%2" [(set_attr "length" "4") (set_attr "type" "multi") (set_attr "predicable" "yes,no") @@ -1684,7 +1751,7 @@ (parallel [(const_int 0) (const_int 1)]))))) ] "TARGET_PLUS_MACD" - "vmpy2h%? 0, %0, %1" + "vmpy2h%?\\t0,%0,%1" [(set_attr "length" "4") (set_attr "type" "multi") (set_attr "predicable" "no") @@ -1716,26 +1783,28 @@ (match_operand:V4HI 2 "even_register_operand" "r,r") (parallel [(const_int 2) (const_int 3)]))))) (set (reg:V2SI ARCV2_ACC) - (mult:V2SI (SE:V2SI (vec_select:V2HI (match_dup 1) - (parallel [(const_int 2) (const_int 3)]))) - (SE:V2SI (vec_select:V2HI (match_dup 2) - (parallel [(const_int 2) (const_int 3)]))))) + (mult:V2SI (SE:V2SI + (vec_select:V2HI (match_dup 1) + (parallel [(const_int 2) (const_int 3)]))) + (SE:V2SI + (vec_select:V2HI (match_dup 2) + (parallel [(const_int 2) (const_int 3)]))))) ] "TARGET_PLUS_QMACW" - "vmpy2h%? 
%0, %R1, %R2" + "vmpy2h%?\\t%0,%R1,%R2" [(set_attr "length" "4") (set_attr "type" "multi") (set_attr "predicable" "yes,no") (set_attr "cond" "canuse,nocond")]) (define_expand "vec_widen_mult_hi_v4hi" - [(set (match_operand:V2SI 0 "even_register_operand" "") + [(set (match_operand:V2SI 0 "even_register_operand") (mult:V2SI (SE:V2SI (vec_select:V2HI - (match_operand:V4HI 1 "even_register_operand" "") - (parallel [(const_int 2) (const_int 3)]))) + (match_operand:V4HI 1 "even_register_operand") + (parallel [(const_int 2) (const_int 3)]))) (SE:V2SI (vec_select:V2HI - (match_operand:V4HI 2 "even_register_operand" "") - (parallel [(const_int 2) (const_int 3)])))))] + (match_operand:V4HI 2 "even_register_operand") + (parallel [(const_int 2) (const_int 3)])))))] "TARGET_PLUS_MACD" { emit_insn (gen_arc_vec_mult_hi_v4hi (operands[0], @@ -1746,10 +1815,11 @@ ) (define_insn "arc_vec_mac_v2hiv2si" - [(set (match_operand:V2SI 0 "even_register_operand" "=r,Ral,r") + [(set (match_operand:V2SI 0 "even_register_operand" "=r,Ral,r") (plus:V2SI - (mult:V2SI (SE:V2SI (match_operand:V2HI 1 "register_operand" "0, r,r")) - (SE:V2SI (match_operand:V2HI 2 "register_operand" "r, r,r"))) + (mult:V2SI + (SE:V2SI (match_operand:V2HI 1 "register_operand" "0, r,r")) + (SE:V2SI (match_operand:V2HI 2 "register_operand" "r, r,r"))) (reg:V2SI ARCV2_ACC))) (set (reg:V2SI ARCV2_ACC) (plus:V2SI @@ -1786,7 +1856,7 @@ UNSPEC_ARC_DMACH)) (clobber (reg:DI ARCV2_ACC))] "TARGET_PLUS_DMPY" - "dmach%? %0, %1, %2" + "dmach%?\\t%0,%1,%2" [(set_attr "length" "4") (set_attr "type" "multi") (set_attr "predicable" "yes,no") @@ -1800,7 +1870,7 @@ UNSPEC_ARC_DMACHU)) (clobber (reg:DI ARCV2_ACC))] "TARGET_PLUS_DMPY" - "dmachu%? %0, %1, %2" + "dmachu%?\\t%0,%1,%2" [(set_attr "length" "4") (set_attr "type" "multi") (set_attr "predicable" "yes,no") @@ -1814,7 +1884,7 @@ UNSPEC_ARC_DMACWH)) (clobber (reg:DI ARCV2_ACC))] "TARGET_PLUS_QMACW" - "dmacwh%? %0, %1, %2" + "dmacwh%?\\t%0,%1,%2" [(set_attr "length" "4") (set_attr "type" "multi") (set_attr "predicable" "yes,no") @@ -1828,7 +1898,7 @@ UNSPEC_ARC_DMACWHU)) (clobber (reg:DI ARCV2_ACC))] "TARGET_PLUS_QMACW" - "dmacwhu%? %0, %1, %2" + "dmacwhu%?\\t%0,%1,%2" [(set_attr "length" "4") (set_attr "type" "multi") (set_attr "predicable" "yes,no") @@ -1842,7 +1912,7 @@ UNSPEC_ARC_VMAC2H)) (clobber (reg:DI ARCV2_ACC))] "TARGET_PLUS_MACD" - "vmac2h%? %0, %1, %2" + "vmac2h%?\\t%0,%1,%2" [(set_attr "length" "4") (set_attr "type" "multi") (set_attr "predicable" "yes,no") @@ -1856,7 +1926,7 @@ UNSPEC_ARC_VMAC2HU)) (clobber (reg:DI ARCV2_ACC))] "TARGET_PLUS_MACD" - "vmac2hu%? %0, %1, %2" + "vmac2hu%?\\t%0,%1,%2" [(set_attr "length" "4") (set_attr "type" "multi") (set_attr "predicable" "yes,no") @@ -1869,7 +1939,7 @@ UNSPEC_ARC_VMPY2H)) (clobber (reg:DI ARCV2_ACC))] "TARGET_PLUS_MACD" - "vmpy2h%? %0, %1, %2" + "vmpy2h%?\\t%0,%1,%2" [(set_attr "length" "4") (set_attr "type" "multi") (set_attr "predicable" "yes,no") @@ -1882,7 +1952,7 @@ UNSPEC_ARC_VMPY2HU)) (clobber (reg:DI ARCV2_ACC))] "TARGET_PLUS_MACD" - "vmpy2hu%? %0, %1, %2" + "vmpy2hu%?\\t%0,%1,%2" [(set_attr "length" "4") (set_attr "type" "multi") (set_attr "predicable" "yes,no") @@ -1896,7 +1966,7 @@ UNSPEC_ARC_QMACH)) (clobber (reg:DI ARCV2_ACC))] "TARGET_PLUS_QMACW" - "qmach%? %0, %1, %2" + "qmach%?\\t%0,%1,%2" [(set_attr "length" "4") (set_attr "type" "multi") (set_attr "predicable" "yes,no") @@ -1910,7 +1980,7 @@ UNSPEC_ARC_QMACHU)) (clobber (reg:DI ARCV2_ACC))] "TARGET_PLUS_QMACW" - "qmachu%? 
%0, %1, %2" + "qmachu%?\\t%0,%1,%2" [(set_attr "length" "4") (set_attr "type" "multi") (set_attr "predicable" "yes,no") @@ -1923,7 +1993,7 @@ UNSPEC_ARC_QMPYH)) (clobber (reg:DI ARCV2_ACC))] "TARGET_PLUS_QMACW" - "qmpyh%? %0, %1, %2" + "qmpyh%?\\t%0,%1,%2" [(set_attr "length" "4") (set_attr "type" "multi") (set_attr "predicable" "yes,no") @@ -1936,7 +2006,7 @@ UNSPEC_ARC_QMPYHU)) (clobber (reg:DI ARCV2_ACC))] "TARGET_PLUS_QMACW" - "qmpyhu%? %0, %1, %2" + "qmpyhu%?\\t%0,%1,%2" [(set_attr "length" "4") (set_attr "type" "multi") (set_attr "predicable" "yes,no") -- cgit v1.1 From 79a27f32df8eab0add722f75332f78fe20d94da3 Mon Sep 17 00:00:00 2001 From: Claudiu Zissulescu Date: Mon, 10 May 2021 09:49:35 +0300 Subject: arc: Improve vector support for ARCv2. Add vector negate, reduc_plus_scal, vec_duplicate, vector min/max/mult/div patterns. Besides vector negate and reduction patterns, all the others are emulated using scalar instructions. The reason is taking advantage of the double load/store instructions as well as enabling the autovectorizer to further analize a loop. gcc/ 2021-05-10 Claudiu Zissulescu * config/arc/arc.md (UNSPEC_ARC_DMPYWH): Define. * config/arc/simdext.md (VCT): Add predicates for iterator elements. (EMUVEC): Define. (voptab): Likewise. (vec_widen_mult_hi_v4hi): Change pattern predicate. (v2si3): New patterns. (neg): Likewise. (reduc_plus_scal_v4hi): Likewise. (reduc_plus_scal_v2si): Likewise. (vec_duplicatev2si): Likewise. (vec_duplicatev4hi): Likewise. Signed-off-by: Claudiu Zissulescu --- gcc/config/arc/arc.md | 1 + gcc/config/arc/simdext.md | 112 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 111 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md index f3efe65..b6f2d8e 100644 --- a/gcc/config/arc/arc.md +++ b/gcc/config/arc/arc.md @@ -128,6 +128,7 @@ UNSPEC_ARC_DMACHU UNSPEC_ARC_DMACWH UNSPEC_ARC_DMACWHU + UNSPEC_ARC_DMPYWH UNSPEC_ARC_QMACH UNSPEC_ARC_QMACHU UNSPEC_ARC_QMPYH diff --git a/gcc/config/arc/simdext.md b/gcc/config/arc/simdext.md index 41c4269..c7ca306 100644 --- a/gcc/config/arc/simdext.md +++ b/gcc/config/arc/simdext.md @@ -1395,9 +1395,20 @@ (define_mode_attr V_addsub_suffix [(V2HI "2h") (V2SI "")]) ;;all vectors -(define_mode_iterator VCT [V2HI V4HI V2SI]) +(define_mode_iterator VCT [(V2HI "TARGET_PLUS_DMPY") + (V4HI "TARGET_PLUS_QMACW") + (V2SI "TARGET_PLUS_QMACW")]) (define_mode_attr V_suffix [(V2HI "2h") (V4HI "4h") (V2SI "2")]) +(define_code_iterator EMUVEC [(mult "TARGET_MPYW") + (div "TARGET_DIVREM") + smax smin]) + +(define_code_attr voptab [(mult "mul") + (div "div") + (smin "smin") + (smax "smax")]) + ;; Widening operations. (define_code_iterator SE [sign_extend zero_extend]) (define_code_attr V_US [(sign_extend "s") (zero_extend "u")]) @@ -1805,7 +1816,7 @@ (SE:V2SI (vec_select:V2HI (match_operand:V4HI 2 "even_register_operand") (parallel [(const_int 2) (const_int 3)])))))] - "TARGET_PLUS_MACD" + "TARGET_PLUS_QMACW" { emit_insn (gen_arc_vec_mult_hi_v4hi (operands[0], operands[1], @@ -2011,3 +2022,100 @@ (set_attr "type" "multi") (set_attr "predicable" "yes,no") (set_attr "cond" "canuse,nocond")]) + +;; Emulated vector instructions. 
+(define_insn_and_split "v2si3" + [(set (match_operand:V2SI 0 "register_operand" "=r") + (EMUVEC:V2SI (match_operand:V2SI 1 "register_operand" "r") + (match_operand:V2SI 2 "nonmemory_operand" "ri")))] + "" + "#" + "reload_completed" + [(const_int 0)] + { + rtx high_dest = gen_highpart (SImode, operands[0]); + rtx low_dest = gen_lowpart (SImode, operands[0]); + rtx high_op1 = gen_highpart (SImode, operands[1]); + rtx low_op1 = gen_lowpart (SImode, operands[1]); + rtx high_op2 = gen_highpart (SImode, operands[2]); + rtx low_op2 = gen_lowpart (SImode, operands[2]); + emit_insn (gen_si3 (low_dest, low_op1, low_op2)); + emit_insn (gen_si3 (high_dest, high_op1, high_op2)); + DONE; + } + [(set_attr "length" "12") + (set_attr "type" "multi")]) + +(define_expand "neg2" + [(set (match_operand:VCT 0 "register_operand") + (neg:VCT (match_operand:VCT 1 "register_operand")))] + "TARGET_PLUS_DMPY" + "") + +(define_insn "*neg2" + [(set (match_operand:VCT 0 "register_operand" "=r") + (neg:VCT (match_operand:VCT 1 "register_operand" "r")))] + "TARGET_PLUS_DMPY" + "vsub\\t%0,0,%1" + [(set_attr "length" "8") + (set_attr "type" "multi")]) + +(define_insn "reduc_plus_scal_v4hi" + [(set (match_operand:HI 0 "even_register_operand" "=r") + (unspec:HI [(match_operand:V4HI 1 "even_register_operand" "r")] + UNSPEC_ARC_QMPYH)) + (clobber (reg:DI ARCV2_ACC))] + "TARGET_PLUS_QMACW" + "qmpyh\\t%0,%1,1" + [(set_attr "length" "4") + (set_attr "type" "multi")]) + +(define_insn "reduc_plus_scal_v2si" + [(set (match_operand:SI 0 "even_register_operand" "=r") + (unspec:SI [(match_operand:V2SI 1 "even_register_operand" "r")] + UNSPEC_ARC_DMPYWH)) + (clobber (reg:DI ARCV2_ACC))] + "TARGET_PLUS_DMPY" + "dmpywh\\t%0,%1,1" + [(set_attr "length" "4") + (set_attr "type" "multi")]) + +(define_insn_and_split "vec_duplicatev2si" + [(set (match_operand:V2SI 0 "register_operand" "=r") + (vec_duplicate:V2SI + (match_operand:SI 1 "nonmemory_operand" "ri")))] + "" + "#" + "reload_completed" + [(const_int 0)] + { + rtx high_dest = gen_highpart (SImode, operands[0]); + rtx low_dest = gen_lowpart (SImode, operands[0]); + emit_move_insn (high_dest, operands[1]); + emit_move_insn (low_dest, operands[1]); + DONE; + } + [(set_attr "length" "8") + (set_attr "type" "multi")]) + +(define_insn_and_split "vec_duplicatev4hi" + [(set (match_operand:V4HI 0 "register_operand" "=r") + (vec_duplicate:V4HI + (match_operand:HI 1 "nonmemory_operand" "ri")))] + "TARGET_BARREL_SHIFTER" + "#" + "reload_completed" + [(const_int 0)] + { + rtx high_dest = gen_highpart (SImode, operands[0]); + rtx low_dest = gen_lowpart (SImode, operands[0]); + rtx tmp = gen_lowpart (SImode, operands[1]); + emit_insn (gen_rtx_SET (high_dest, + gen_rtx_ASHIFT (SImode, tmp, GEN_INT (16)))); + emit_insn (gen_rtx_SET (low_dest, + gen_rtx_IOR (SImode, high_dest, tmp))); + emit_move_insn (high_dest, low_dest); + DONE; + } + [(set_attr "length" "12") + (set_attr "type" "multi")]) -- cgit v1.1 From 4bc6fb21bd932ba37ffb14795002f7214b8e3cfd Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Thu, 22 Apr 2021 09:39:39 +0200 Subject: Remove __cplusplus >= 201103 Right now, we require a C++11 compiler, so the check is not needed any longer. gcc/analyzer/ChangeLog: * program-state.cc (program_state::operator=): Remove __cplusplus >= 201103. (program_state::program_state): Likewise. * program-state.h: Likewise. * region-model.h (class region_model): Remove dead code. gcc/ChangeLog: * bitmap.h (class auto_bitmap): Remove __cplusplus >= 201103. * config/aarch64/aarch64.c: Likewise. 
* gimple-ssa-store-merging.c (store_immediate_info::store_immediate_info): Likewise. * sbitmap.h: Likewise. --- gcc/config/aarch64/aarch64.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index c2f4b27..04855cb 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -221,9 +221,7 @@ public: predicate in each predicate argument register. This means that we need at least 12 pieces. */ static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS; -#if __cplusplus >= 201103L static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates"); -#endif /* Describes one piece of a PST. Each piece is one of: -- cgit v1.1 From 7596c762137f26f495b53ec93471273887832e31 Mon Sep 17 00:00:00 2001 From: Alex Coplan Date: Mon, 10 May 2021 09:46:45 +0100 Subject: arm: Fix wrong code with MVE V2DImode loads and stores [PR99960] As the PR shows, we currently miscompile V2DImode loads and stores for MVE. We're currently using 64-bit loads/stores, but need to be using 128-bit vector loads and stores. Fixed thusly. Some intrinsics tests were checking that we (incorrectly) used the 64-bit loads/stores: these have been updated. gcc/ChangeLog: PR target/99960 * config/arm/mve.md (*mve_mov): Simplify output code. Use vldrw.u32 and vstrw.32 for V2D[IF]mode loads and stores. gcc/testsuite/ChangeLog: PR target/99960 * gcc.target/arm/mve/intrinsics/vldrdq_gather_base_wb_s64.c: Update now that we're (correctly) using full 128-bit vector loads/stores. * gcc.target/arm/mve/intrinsics/vldrdq_gather_base_wb_u64.c: Likewise. * gcc.target/arm/mve/intrinsics/vldrdq_gather_base_wb_z_s64.c: Likewise. * gcc.target/arm/mve/intrinsics/vldrdq_gather_base_wb_z_u64.c: Likewise. * gcc.target/arm/mve/intrinsics/vuninitializedq_int.c: Likewise. * gcc.target/arm/mve/intrinsics/vuninitializedq_int1.c: Likewise. --- gcc/config/arm/mve.md | 35 +++++------------------------------ 1 file changed, 5 insertions(+), 30 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index 7467d5f..5c11885 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -41,44 +41,19 @@ if (which_alternative == 4 || which_alternative == 7) { - rtx ops[2]; - int regno = (which_alternative == 7) - ? 
REGNO (operands[1]) : REGNO (operands[0]); - - ops[0] = operands[0]; - ops[1] = operands[1]; - if (mode == V2DFmode || mode == V2DImode) - { - if (which_alternative == 7) - { - ops[1] = gen_rtx_REG (DImode, regno); - output_asm_insn ("vstr.64\t%P1, %E0",ops); - } - else - { - ops[0] = gen_rtx_REG (DImode, regno); - output_asm_insn ("vldr.64\t%P0, %E1",ops); - } - } - else if (mode == TImode) + if (mode == V2DFmode || mode == V2DImode || mode == TImode) { if (which_alternative == 7) - output_asm_insn ("vstr.64\t%q1, %E0",ops); + output_asm_insn ("vstrw.32\t%q1, %E0", operands); else - output_asm_insn ("vldr.64\t%q0, %E1",ops); + output_asm_insn ("vldrw.u32\t%q0, %E1",operands); } else { if (which_alternative == 7) - { - ops[1] = gen_rtx_REG (TImode, regno); - output_asm_insn ("vstr.\t%q1, %E0",ops); - } + output_asm_insn ("vstr.\t%q1, %E0", operands); else - { - ops[0] = gen_rtx_REG (TImode, regno); - output_asm_insn ("vldr.\t%q0, %E1",ops); - } + output_asm_insn ("vldr.\t%q0, %E1", operands); } return ""; } -- cgit v1.1 From d1cee151e10e3099dba332d10f1f5c28ac05fb73 Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Mon, 10 May 2021 12:51:33 +0000 Subject: arm: MVE: Convert vcmp[eq|ne]* in arm_mve.h to use only 's' builtin version There is no need to have a signed and an unsigned version of these builtins. This is similar to what we do for Neon in arm_neon.h. This mechanical patch enables later cleanup patches. 2021-05-10 Christophe Lyon gcc/ * config/arm/arm_mve.h (__arm_vcmpeq*u*, __arm_vcmpne*u*): Call the 's' version of the builtin. --- gcc/config/arm/arm_mve.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h index 3a40c6e..e4dfe91 100644 --- a/gcc/config/arm/arm_mve.h +++ b/gcc/config/arm/arm_mve.h @@ -3695,21 +3695,21 @@ __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpneq_u8 (uint8x16_t __a, uint8x16_t __b) { - return __builtin_mve_vcmpneq_uv16qi (__a, __b); + return __builtin_mve_vcmpneq_sv16qi ((int8x16_t)__a, (int8x16_t)__b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpneq_u16 (uint16x8_t __a, uint16x8_t __b) { - return __builtin_mve_vcmpneq_uv8hi (__a, __b); + return __builtin_mve_vcmpneq_sv8hi ((int16x8_t)__a, (int16x8_t)__b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpneq_u32 (uint32x4_t __a, uint32x4_t __b) { - return __builtin_mve_vcmpneq_uv4si (__a, __b); + return __builtin_mve_vcmpneq_sv4si ((int32x4_t)__a, (int32x4_t)__b); } __extension__ extern __inline int8x16_t @@ -3932,7 +3932,7 @@ __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpneq_n_u8 (uint8x16_t __a, uint8_t __b) { - return __builtin_mve_vcmpneq_n_uv16qi (__a, __b); + return __builtin_mve_vcmpneq_n_sv16qi ((int8x16_t)__a, (int8_t)__b); } __extension__ extern __inline mve_pred16_t @@ -3953,14 +3953,14 @@ __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpeqq_u8 (uint8x16_t __a, uint8x16_t __b) { - return __builtin_mve_vcmpeqq_uv16qi (__a, __b); + return __builtin_mve_vcmpeqq_sv16qi ((int8x16_t)__a, (int8x16_t)__b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) 
__arm_vcmpeqq_n_u8 (uint8x16_t __a, uint8_t __b) { - return __builtin_mve_vcmpeqq_n_uv16qi (__a, __b); + return __builtin_mve_vcmpeqq_n_sv16qi ((int8x16_t)__a, (int8_t)__b); } __extension__ extern __inline mve_pred16_t @@ -4774,7 +4774,7 @@ __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpneq_n_u16 (uint16x8_t __a, uint16_t __b) { - return __builtin_mve_vcmpneq_n_uv8hi (__a, __b); + return __builtin_mve_vcmpneq_n_sv8hi ((int16x8_t)__a, (int16_t)__b); } __extension__ extern __inline mve_pred16_t @@ -4795,14 +4795,14 @@ __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpeqq_u16 (uint16x8_t __a, uint16x8_t __b) { - return __builtin_mve_vcmpeqq_uv8hi (__a, __b); + return __builtin_mve_vcmpeqq_sv8hi ((int16x8_t)__a, (int16x8_t)__b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpeqq_n_u16 (uint16x8_t __a, uint16_t __b) { - return __builtin_mve_vcmpeqq_n_uv8hi (__a, __b); + return __builtin_mve_vcmpeqq_n_sv8hi ((int16x8_t)__a, (int16_t)__b); } __extension__ extern __inline mve_pred16_t @@ -5616,7 +5616,7 @@ __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpneq_n_u32 (uint32x4_t __a, uint32_t __b) { - return __builtin_mve_vcmpneq_n_uv4si (__a, __b); + return __builtin_mve_vcmpneq_n_sv4si ((int32x4_t)__a, (int32_t)__b); } __extension__ extern __inline mve_pred16_t @@ -5637,14 +5637,14 @@ __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpeqq_u32 (uint32x4_t __a, uint32x4_t __b) { - return __builtin_mve_vcmpeqq_uv4si (__a, __b); + return __builtin_mve_vcmpeqq_sv4si ((int32x4_t)__a, (int32x4_t)__b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpeqq_n_u32 (uint32x4_t __a, uint32_t __b) { - return __builtin_mve_vcmpeqq_n_uv4si (__a, __b); + return __builtin_mve_vcmpeqq_n_sv4si ((int32x4_t)__a, (int32_t)__b); } __extension__ extern __inline mve_pred16_t -- cgit v1.1 From f56af5f9c20ca8cf4ee916bbfc06d8c1584868cb Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Mon, 10 May 2021 12:51:45 +0000 Subject: arm: MVE: Cleanup vcmpne/vcmpeq builtins After the previous patch, we no longer need to emit the unsigned variants of vcmpneq/vcmpeqq. This patch removes them as well as the corresponding iterator entries. 2021-05-10 Christophe Lyon gcc/ * config/arm/arm_mve_builtins.def (vcmpneq_u): Remove. (vcmpneq_n_u): Likewise. (vcmpeqq_u,): Likewise. (vcmpeqq_n_u): Likewise. * config/arm/iterators.md (supf): Remove VCMPNEQ_U, VCMPEQQ_U, VCMPEQQ_N_U and VCMPNEQ_N_U. * config/arm/mve.md (mve_vcmpneq): Remove iteration. (mve_vcmpeqq_n): Likewise. (mve_vcmpeqq): Likewise. (mve_vcmpneq_n): Likewise. 
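(Illustrative aside, not part of the patch: this and the previous patch rely on the fact that equality and inequality only inspect bit patterns, so an unsigned comparison can be lowered to the signed builtin and the unsigned builtin dropped.  A minimal standalone C sketch of that property, assuming nothing beyond what GCC guarantees for out-of-range signed conversions:)

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  /* Exhaustively check all 8-bit operand pairs: the eq/ne results do not
     depend on whether the bit patterns are read as signed or unsigned.  */
  for (unsigned v = 0; v < 256; v++)
    for (unsigned w = 0; w < 256; w++)
      {
        uint8_t a = (uint8_t) v, b = (uint8_t) w;
        /* Same bits, reinterpreted as signed (GCC defines this conversion
           as reduction modulo 2^8).  */
        int8_t sa = (int8_t) a, sb = (int8_t) b;
        assert ((a == b) == (sa == sb));
        assert ((a != b) == (sa != sb));
      }
  return 0;
}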
--- gcc/config/arm/arm_mve_builtins.def | 4 ---- gcc/config/arm/iterators.md | 15 +++++++-------- gcc/config/arm/mve.md | 16 ++++++++-------- 3 files changed, 15 insertions(+), 20 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm_mve_builtins.def b/gcc/config/arm/arm_mve_builtins.def index 460f6ba..ee34fd1 100644 --- a/gcc/config/arm/arm_mve_builtins.def +++ b/gcc/config/arm/arm_mve_builtins.def @@ -90,7 +90,6 @@ VAR3 (BINOP_NONE_NONE_IMM, vshrq_n_s, v16qi, v8hi, v4si) VAR1 (BINOP_NONE_NONE_UNONE, vaddlvq_p_s, v4si) VAR1 (BINOP_UNONE_UNONE_UNONE, vaddlvq_p_u, v4si) VAR3 (BINOP_UNONE_NONE_NONE, vcmpneq_s, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpneq_u, v16qi, v8hi, v4si) VAR3 (BINOP_NONE_NONE_NONE, vshlq_s, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_NONE, vshlq_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vsubq_u, v16qi, v8hi, v4si) @@ -118,11 +117,8 @@ VAR3 (BINOP_UNONE_UNONE_UNONE, vhsubq_n_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vhaddq_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vhaddq_n_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, veorq_u, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpneq_n_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vcmphiq_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vcmphiq_n_u, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpeqq_u, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpeqq_n_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpcsq_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpcsq_n_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vbicq_u, v16qi, v8hi, v4si) diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index 8fb723e..0aba93f 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -1279,13 +1279,12 @@ (VCREATEQ_U "u") (VCREATEQ_S "s") (VSHRQ_N_S "s") (VSHRQ_N_U "u") (VCVTQ_N_FROM_F_S "s") (VSHLQ_U "u") (VCVTQ_N_FROM_F_U "u") (VADDLVQ_P_S "s") (VSHLQ_S "s") - (VADDLVQ_P_U "u") (VCMPNEQ_U "u") (VCMPNEQ_S "s") + (VADDLVQ_P_U "u") (VCMPNEQ_S "s") (VABDQ_M_S "s") (VABDQ_M_U "u") (VABDQ_S "s") (VABDQ_U "u") (VADDQ_N_S "s") (VADDQ_N_U "u") (VADDVQ_P_S "s") (VADDVQ_P_U "u") (VBRSRQ_N_S "s") - (VBRSRQ_N_U "u") (VCMPEQQ_S "s") (VCMPEQQ_U "u") - (VCMPEQQ_N_S "s") (VCMPEQQ_N_U "u") (VCMPNEQ_N_S "s") - (VCMPNEQ_N_U "u") + (VBRSRQ_N_U "u") (VCMPEQQ_S "s") + (VCMPEQQ_N_S "s") (VCMPNEQ_N_S "s") (VHADDQ_N_S "s") (VHADDQ_N_U "u") (VHADDQ_S "s") (VHADDQ_U "u") (VHSUBQ_N_S "s") (VHSUBQ_N_U "u") (VHSUBQ_S "s") (VMAXQ_S "s") (VMAXQ_U "u") (VHSUBQ_U "u") @@ -1541,16 +1540,16 @@ (define_int_iterator VSHRQ_N [VSHRQ_N_S VSHRQ_N_U]) (define_int_iterator VCVTQ_N_FROM_F [VCVTQ_N_FROM_F_S VCVTQ_N_FROM_F_U]) (define_int_iterator VADDLVQ_P [VADDLVQ_P_S VADDLVQ_P_U]) -(define_int_iterator VCMPNEQ [VCMPNEQ_U VCMPNEQ_S]) +(define_int_iterator VCMPNEQ [VCMPNEQ_S]) (define_int_iterator VSHLQ [VSHLQ_S VSHLQ_U]) (define_int_iterator VABDQ [VABDQ_S VABDQ_U]) (define_int_iterator VADDQ_N [VADDQ_N_S VADDQ_N_U]) (define_int_iterator VADDVAQ [VADDVAQ_S VADDVAQ_U]) (define_int_iterator VADDVQ_P [VADDVQ_P_U VADDVQ_P_S]) (define_int_iterator VBRSRQ_N [VBRSRQ_N_U VBRSRQ_N_S]) -(define_int_iterator VCMPEQQ [VCMPEQQ_U VCMPEQQ_S]) -(define_int_iterator VCMPEQQ_N [VCMPEQQ_N_S VCMPEQQ_N_U]) -(define_int_iterator VCMPNEQ_N [VCMPNEQ_N_U VCMPNEQ_N_S]) +(define_int_iterator VCMPEQQ [VCMPEQQ_S]) +(define_int_iterator VCMPEQQ_N [VCMPEQQ_N_S]) +(define_int_iterator VCMPNEQ_N [VCMPNEQ_N_S]) (define_int_iterator VHADDQ [VHADDQ_S VHADDQ_U]) 
(define_int_iterator VHADDQ_N [VHADDQ_N_U VHADDQ_N_S]) (define_int_iterator VHSUBQ [VHSUBQ_S VHSUBQ_U]) diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index 5c11885..9712bc0 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -811,9 +811,9 @@ (set_attr "length""8")]) ;; -;; [vcmpneq_u, vcmpneq_s]) +;; [vcmpneq_s]) ;; -(define_insn "mve_vcmpneq_" +(define_insn "mve_vcmpneq_s" [ (set (match_operand:HI 0 "vpr_register_operand" "=Up") (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") @@ -1010,9 +1010,9 @@ ]) ;; -;; [vcmpeqq_n_s, vcmpeqq_n_u]) +;; [vcmpeqq_n_s]) ;; -(define_insn "mve_vcmpeqq_n_" +(define_insn "mve_vcmpeqq_n_s" [ (set (match_operand:HI 0 "vpr_register_operand" "=Up") (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") @@ -1025,9 +1025,9 @@ ]) ;; -;; [vcmpeqq_u, vcmpeqq_s]) +;; [vcmpeqq_s]) ;; -(define_insn "mve_vcmpeqq_" +(define_insn "mve_vcmpeqq_s" [ (set (match_operand:HI 0 "vpr_register_operand" "=Up") (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") @@ -1190,9 +1190,9 @@ ]) ;; -;; [vcmpneq_n_u, vcmpneq_n_s]) +;; [vcmpneq_n_s]) ;; -(define_insn "mve_vcmpneq_n_" +(define_insn "mve_vcmpneq_n_s" [ (set (match_operand:HI 0 "vpr_register_operand" "=Up") (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") -- cgit v1.1 From 929056a7689bead913be686c3581bb8a8599acb6 Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Mon, 10 May 2021 12:51:53 +0000 Subject: arm: MVE: Remove _s and _u suffixes from vcmp* builtins. This patch brings more unification in the vector comparison builtins, by removing the useless 's' (signed) suffix since we no longer need unsigned versions. 2021-05-10 Christophe Lyon gcc/ * config/arm/arm_mve.h (__arm_vcmp*): Remove 's' suffix. * config/arm/arm_mve_builtins.def (vcmp*): Remove 's' suffix. * config/arm/mve.md (mve_vcmp*): Remove 's' suffix in pattern names. 
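(Illustration only, not part of the patch: after this series the signed and unsigned user-level intrinsics funnel into the same builtin, so with an MVE-enabled invocation such as -mcpu=cortex-m55 -mfloat-abi=hard — the flags are just an example — both functions below should emit the same vcmp.i8 eq comparison:)

#include <arm_mve.h>

/* Same comparison instruction for both element signednesses; only the
   C-level argument types differ.  */
mve_pred16_t
cmp_eq_s8 (int8x16_t a, int8x16_t b)
{
  return vcmpeqq_s8 (a, b);
}

mve_pred16_t
cmp_eq_u8 (uint8x16_t a, uint8x16_t b)
{
  return vcmpeqq_u8 (a, b);
}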
--- gcc/config/arm/arm_mve.h | 120 ++++++++++++++++++------------------ gcc/config/arm/arm_mve_builtins.def | 32 +++++----- gcc/config/arm/mve.md | 64 +++++++++---------- 3 files changed, 108 insertions(+), 108 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h index e4dfe91..5d78269 100644 --- a/gcc/config/arm/arm_mve.h +++ b/gcc/config/arm/arm_mve.h @@ -3674,42 +3674,42 @@ __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpneq_s8 (int8x16_t __a, int8x16_t __b) { - return __builtin_mve_vcmpneq_sv16qi (__a, __b); + return __builtin_mve_vcmpneq_v16qi (__a, __b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpneq_s16 (int16x8_t __a, int16x8_t __b) { - return __builtin_mve_vcmpneq_sv8hi (__a, __b); + return __builtin_mve_vcmpneq_v8hi (__a, __b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpneq_s32 (int32x4_t __a, int32x4_t __b) { - return __builtin_mve_vcmpneq_sv4si (__a, __b); + return __builtin_mve_vcmpneq_v4si (__a, __b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpneq_u8 (uint8x16_t __a, uint8x16_t __b) { - return __builtin_mve_vcmpneq_sv16qi ((int8x16_t)__a, (int8x16_t)__b); + return __builtin_mve_vcmpneq_v16qi ((int8x16_t)__a, (int8x16_t)__b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpneq_u16 (uint16x8_t __a, uint16x8_t __b) { - return __builtin_mve_vcmpneq_sv8hi ((int16x8_t)__a, (int16x8_t)__b); + return __builtin_mve_vcmpneq_v8hi ((int16x8_t)__a, (int16x8_t)__b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpneq_u32 (uint32x4_t __a, uint32x4_t __b) { - return __builtin_mve_vcmpneq_sv4si ((int32x4_t)__a, (int32x4_t)__b); + return __builtin_mve_vcmpneq_v4si ((int32x4_t)__a, (int32x4_t)__b); } __extension__ extern __inline int8x16_t @@ -3932,49 +3932,49 @@ __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpneq_n_u8 (uint8x16_t __a, uint8_t __b) { - return __builtin_mve_vcmpneq_n_sv16qi ((int8x16_t)__a, (int8_t)__b); + return __builtin_mve_vcmpneq_n_v16qi ((int8x16_t)__a, (int8_t)__b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmphiq_u8 (uint8x16_t __a, uint8x16_t __b) { - return __builtin_mve_vcmphiq_uv16qi (__a, __b); + return __builtin_mve_vcmphiq_v16qi (__a, __b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmphiq_n_u8 (uint8x16_t __a, uint8_t __b) { - return __builtin_mve_vcmphiq_n_uv16qi (__a, __b); + return __builtin_mve_vcmphiq_n_v16qi (__a, __b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpeqq_u8 (uint8x16_t __a, uint8x16_t __b) { - return __builtin_mve_vcmpeqq_sv16qi ((int8x16_t)__a, (int8x16_t)__b); + return __builtin_mve_vcmpeqq_v16qi ((int8x16_t)__a, (int8x16_t)__b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpeqq_n_u8 (uint8x16_t __a, uint8_t __b) { - return __builtin_mve_vcmpeqq_n_sv16qi 
((int8x16_t)__a, (int8_t)__b); + return __builtin_mve_vcmpeqq_n_v16qi ((int8x16_t)__a, (int8_t)__b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpcsq_u8 (uint8x16_t __a, uint8x16_t __b) { - return __builtin_mve_vcmpcsq_uv16qi (__a, __b); + return __builtin_mve_vcmpcsq_v16qi (__a, __b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpcsq_n_u8 (uint8x16_t __a, uint8_t __b) { - return __builtin_mve_vcmpcsq_n_uv16qi (__a, __b); + return __builtin_mve_vcmpcsq_n_v16qi (__a, __b); } __extension__ extern __inline uint8x16_t @@ -4144,77 +4144,77 @@ __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpneq_n_s8 (int8x16_t __a, int8_t __b) { - return __builtin_mve_vcmpneq_n_sv16qi (__a, __b); + return __builtin_mve_vcmpneq_n_v16qi (__a, __b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpltq_s8 (int8x16_t __a, int8x16_t __b) { - return __builtin_mve_vcmpltq_sv16qi (__a, __b); + return __builtin_mve_vcmpltq_v16qi (__a, __b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpltq_n_s8 (int8x16_t __a, int8_t __b) { - return __builtin_mve_vcmpltq_n_sv16qi (__a, __b); + return __builtin_mve_vcmpltq_n_v16qi (__a, __b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpleq_s8 (int8x16_t __a, int8x16_t __b) { - return __builtin_mve_vcmpleq_sv16qi (__a, __b); + return __builtin_mve_vcmpleq_v16qi (__a, __b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpleq_n_s8 (int8x16_t __a, int8_t __b) { - return __builtin_mve_vcmpleq_n_sv16qi (__a, __b); + return __builtin_mve_vcmpleq_n_v16qi (__a, __b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpgtq_s8 (int8x16_t __a, int8x16_t __b) { - return __builtin_mve_vcmpgtq_sv16qi (__a, __b); + return __builtin_mve_vcmpgtq_v16qi (__a, __b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpgtq_n_s8 (int8x16_t __a, int8_t __b) { - return __builtin_mve_vcmpgtq_n_sv16qi (__a, __b); + return __builtin_mve_vcmpgtq_n_v16qi (__a, __b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpgeq_s8 (int8x16_t __a, int8x16_t __b) { - return __builtin_mve_vcmpgeq_sv16qi (__a, __b); + return __builtin_mve_vcmpgeq_v16qi (__a, __b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpgeq_n_s8 (int8x16_t __a, int8_t __b) { - return __builtin_mve_vcmpgeq_n_sv16qi (__a, __b); + return __builtin_mve_vcmpgeq_n_v16qi (__a, __b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpeqq_s8 (int8x16_t __a, int8x16_t __b) { - return __builtin_mve_vcmpeqq_sv16qi (__a, __b); + return __builtin_mve_vcmpeqq_v16qi (__a, __b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpeqq_n_s8 (int8x16_t __a, int8_t __b) { - return __builtin_mve_vcmpeqq_n_sv16qi (__a, __b); + return 
__builtin_mve_vcmpeqq_n_v16qi (__a, __b); } __extension__ extern __inline uint8x16_t @@ -4774,49 +4774,49 @@ __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpneq_n_u16 (uint16x8_t __a, uint16_t __b) { - return __builtin_mve_vcmpneq_n_sv8hi ((int16x8_t)__a, (int16_t)__b); + return __builtin_mve_vcmpneq_n_v8hi ((int16x8_t)__a, (int16_t)__b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmphiq_u16 (uint16x8_t __a, uint16x8_t __b) { - return __builtin_mve_vcmphiq_uv8hi (__a, __b); + return __builtin_mve_vcmphiq_v8hi (__a, __b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmphiq_n_u16 (uint16x8_t __a, uint16_t __b) { - return __builtin_mve_vcmphiq_n_uv8hi (__a, __b); + return __builtin_mve_vcmphiq_n_v8hi (__a, __b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpeqq_u16 (uint16x8_t __a, uint16x8_t __b) { - return __builtin_mve_vcmpeqq_sv8hi ((int16x8_t)__a, (int16x8_t)__b); + return __builtin_mve_vcmpeqq_v8hi ((int16x8_t)__a, (int16x8_t)__b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpeqq_n_u16 (uint16x8_t __a, uint16_t __b) { - return __builtin_mve_vcmpeqq_n_sv8hi ((int16x8_t)__a, (int16_t)__b); + return __builtin_mve_vcmpeqq_n_v8hi ((int16x8_t)__a, (int16_t)__b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpcsq_u16 (uint16x8_t __a, uint16x8_t __b) { - return __builtin_mve_vcmpcsq_uv8hi (__a, __b); + return __builtin_mve_vcmpcsq_v8hi (__a, __b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpcsq_n_u16 (uint16x8_t __a, uint16_t __b) { - return __builtin_mve_vcmpcsq_n_uv8hi (__a, __b); + return __builtin_mve_vcmpcsq_n_v8hi (__a, __b); } __extension__ extern __inline uint16x8_t @@ -4986,77 +4986,77 @@ __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpneq_n_s16 (int16x8_t __a, int16_t __b) { - return __builtin_mve_vcmpneq_n_sv8hi (__a, __b); + return __builtin_mve_vcmpneq_n_v8hi (__a, __b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpltq_s16 (int16x8_t __a, int16x8_t __b) { - return __builtin_mve_vcmpltq_sv8hi (__a, __b); + return __builtin_mve_vcmpltq_v8hi (__a, __b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpltq_n_s16 (int16x8_t __a, int16_t __b) { - return __builtin_mve_vcmpltq_n_sv8hi (__a, __b); + return __builtin_mve_vcmpltq_n_v8hi (__a, __b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpleq_s16 (int16x8_t __a, int16x8_t __b) { - return __builtin_mve_vcmpleq_sv8hi (__a, __b); + return __builtin_mve_vcmpleq_v8hi (__a, __b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpleq_n_s16 (int16x8_t __a, int16_t __b) { - return __builtin_mve_vcmpleq_n_sv8hi (__a, __b); + return __builtin_mve_vcmpleq_n_v8hi (__a, __b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) __arm_vcmpgtq_s16 (int16x8_t __a, int16x8_t __b) { - return __builtin_mve_vcmpgtq_sv8hi (__a, __b); + return __builtin_mve_vcmpgtq_v8hi (__a, __b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpgtq_n_s16 (int16x8_t __a, int16_t __b) { - return __builtin_mve_vcmpgtq_n_sv8hi (__a, __b); + return __builtin_mve_vcmpgtq_n_v8hi (__a, __b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpgeq_s16 (int16x8_t __a, int16x8_t __b) { - return __builtin_mve_vcmpgeq_sv8hi (__a, __b); + return __builtin_mve_vcmpgeq_v8hi (__a, __b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpgeq_n_s16 (int16x8_t __a, int16_t __b) { - return __builtin_mve_vcmpgeq_n_sv8hi (__a, __b); + return __builtin_mve_vcmpgeq_n_v8hi (__a, __b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpeqq_s16 (int16x8_t __a, int16x8_t __b) { - return __builtin_mve_vcmpeqq_sv8hi (__a, __b); + return __builtin_mve_vcmpeqq_v8hi (__a, __b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpeqq_n_s16 (int16x8_t __a, int16_t __b) { - return __builtin_mve_vcmpeqq_n_sv8hi (__a, __b); + return __builtin_mve_vcmpeqq_n_v8hi (__a, __b); } __extension__ extern __inline uint16x8_t @@ -5616,49 +5616,49 @@ __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpneq_n_u32 (uint32x4_t __a, uint32_t __b) { - return __builtin_mve_vcmpneq_n_sv4si ((int32x4_t)__a, (int32_t)__b); + return __builtin_mve_vcmpneq_n_v4si ((int32x4_t)__a, (int32_t)__b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmphiq_u32 (uint32x4_t __a, uint32x4_t __b) { - return __builtin_mve_vcmphiq_uv4si (__a, __b); + return __builtin_mve_vcmphiq_v4si (__a, __b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmphiq_n_u32 (uint32x4_t __a, uint32_t __b) { - return __builtin_mve_vcmphiq_n_uv4si (__a, __b); + return __builtin_mve_vcmphiq_n_v4si (__a, __b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpeqq_u32 (uint32x4_t __a, uint32x4_t __b) { - return __builtin_mve_vcmpeqq_sv4si ((int32x4_t)__a, (int32x4_t)__b); + return __builtin_mve_vcmpeqq_v4si ((int32x4_t)__a, (int32x4_t)__b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpeqq_n_u32 (uint32x4_t __a, uint32_t __b) { - return __builtin_mve_vcmpeqq_n_sv4si ((int32x4_t)__a, (int32_t)__b); + return __builtin_mve_vcmpeqq_n_v4si ((int32x4_t)__a, (int32_t)__b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpcsq_u32 (uint32x4_t __a, uint32x4_t __b) { - return __builtin_mve_vcmpcsq_uv4si (__a, __b); + return __builtin_mve_vcmpcsq_v4si (__a, __b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpcsq_n_u32 (uint32x4_t __a, uint32_t __b) { - return __builtin_mve_vcmpcsq_n_uv4si (__a, __b); + return __builtin_mve_vcmpcsq_n_v4si (__a, __b); } 
__extension__ extern __inline uint32x4_t @@ -5828,77 +5828,77 @@ __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpneq_n_s32 (int32x4_t __a, int32_t __b) { - return __builtin_mve_vcmpneq_n_sv4si (__a, __b); + return __builtin_mve_vcmpneq_n_v4si (__a, __b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpltq_s32 (int32x4_t __a, int32x4_t __b) { - return __builtin_mve_vcmpltq_sv4si (__a, __b); + return __builtin_mve_vcmpltq_v4si (__a, __b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpltq_n_s32 (int32x4_t __a, int32_t __b) { - return __builtin_mve_vcmpltq_n_sv4si (__a, __b); + return __builtin_mve_vcmpltq_n_v4si (__a, __b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpleq_s32 (int32x4_t __a, int32x4_t __b) { - return __builtin_mve_vcmpleq_sv4si (__a, __b); + return __builtin_mve_vcmpleq_v4si (__a, __b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpleq_n_s32 (int32x4_t __a, int32_t __b) { - return __builtin_mve_vcmpleq_n_sv4si (__a, __b); + return __builtin_mve_vcmpleq_n_v4si (__a, __b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpgtq_s32 (int32x4_t __a, int32x4_t __b) { - return __builtin_mve_vcmpgtq_sv4si (__a, __b); + return __builtin_mve_vcmpgtq_v4si (__a, __b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpgtq_n_s32 (int32x4_t __a, int32_t __b) { - return __builtin_mve_vcmpgtq_n_sv4si (__a, __b); + return __builtin_mve_vcmpgtq_n_v4si (__a, __b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpgeq_s32 (int32x4_t __a, int32x4_t __b) { - return __builtin_mve_vcmpgeq_sv4si (__a, __b); + return __builtin_mve_vcmpgeq_v4si (__a, __b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpgeq_n_s32 (int32x4_t __a, int32_t __b) { - return __builtin_mve_vcmpgeq_n_sv4si (__a, __b); + return __builtin_mve_vcmpgeq_n_v4si (__a, __b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpeqq_s32 (int32x4_t __a, int32x4_t __b) { - return __builtin_mve_vcmpeqq_sv4si (__a, __b); + return __builtin_mve_vcmpeqq_v4si (__a, __b); } __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmpeqq_n_s32 (int32x4_t __a, int32_t __b) { - return __builtin_mve_vcmpeqq_n_sv4si (__a, __b); + return __builtin_mve_vcmpeqq_n_v4si (__a, __b); } __extension__ extern __inline uint32x4_t diff --git a/gcc/config/arm/arm_mve_builtins.def b/gcc/config/arm/arm_mve_builtins.def index ee34fd1..e9b5b28 100644 --- a/gcc/config/arm/arm_mve_builtins.def +++ b/gcc/config/arm/arm_mve_builtins.def @@ -89,7 +89,7 @@ VAR3 (BINOP_UNONE_UNONE_IMM, vshrq_n_u, v16qi, v8hi, v4si) VAR3 (BINOP_NONE_NONE_IMM, vshrq_n_s, v16qi, v8hi, v4si) VAR1 (BINOP_NONE_NONE_UNONE, vaddlvq_p_s, v4si) VAR1 (BINOP_UNONE_UNONE_UNONE, vaddlvq_p_u, v4si) -VAR3 (BINOP_UNONE_NONE_NONE, vcmpneq_s, v16qi, v8hi, v4si) +VAR3 (BINOP_UNONE_NONE_NONE, vcmpneq_, v16qi, v8hi, v4si) 
VAR3 (BINOP_NONE_NONE_NONE, vshlq_s, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_NONE, vshlq_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vsubq_u, v16qi, v8hi, v4si) @@ -117,10 +117,10 @@ VAR3 (BINOP_UNONE_UNONE_UNONE, vhsubq_n_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vhaddq_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vhaddq_n_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, veorq_u, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_UNONE_UNONE, vcmphiq_u, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_UNONE_UNONE, vcmphiq_n_u, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpcsq_u, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpcsq_n_u, v16qi, v8hi, v4si) +VAR3 (BINOP_UNONE_UNONE_UNONE, vcmphiq_, v16qi, v8hi, v4si) +VAR3 (BINOP_UNONE_UNONE_UNONE, vcmphiq_n_, v16qi, v8hi, v4si) +VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpcsq_, v16qi, v8hi, v4si) +VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpcsq_n_, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vbicq_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vandq_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vaddvq_p_u, v16qi, v8hi, v4si) @@ -142,17 +142,17 @@ VAR3 (BINOP_UNONE_UNONE_NONE, vbrsrq_n_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_IMM, vshlq_n_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_IMM, vrshrq_n_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_IMM, vqshlq_n_u, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_NONE_NONE, vcmpneq_n_s, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_NONE_NONE, vcmpltq_s, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_NONE_NONE, vcmpltq_n_s, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_NONE_NONE, vcmpleq_s, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_NONE_NONE, vcmpleq_n_s, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_NONE_NONE, vcmpgtq_s, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_NONE_NONE, vcmpgtq_n_s, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_NONE_NONE, vcmpgeq_s, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_NONE_NONE, vcmpgeq_n_s, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_NONE_NONE, vcmpeqq_s, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_NONE_NONE, vcmpeqq_n_s, v16qi, v8hi, v4si) +VAR3 (BINOP_UNONE_NONE_NONE, vcmpneq_n_, v16qi, v8hi, v4si) +VAR3 (BINOP_UNONE_NONE_NONE, vcmpltq_, v16qi, v8hi, v4si) +VAR3 (BINOP_UNONE_NONE_NONE, vcmpltq_n_, v16qi, v8hi, v4si) +VAR3 (BINOP_UNONE_NONE_NONE, vcmpleq_, v16qi, v8hi, v4si) +VAR3 (BINOP_UNONE_NONE_NONE, vcmpleq_n_, v16qi, v8hi, v4si) +VAR3 (BINOP_UNONE_NONE_NONE, vcmpgtq_, v16qi, v8hi, v4si) +VAR3 (BINOP_UNONE_NONE_NONE, vcmpgtq_n_, v16qi, v8hi, v4si) +VAR3 (BINOP_UNONE_NONE_NONE, vcmpgeq_, v16qi, v8hi, v4si) +VAR3 (BINOP_UNONE_NONE_NONE, vcmpgeq_n_, v16qi, v8hi, v4si) +VAR3 (BINOP_UNONE_NONE_NONE, vcmpeqq_, v16qi, v8hi, v4si) +VAR3 (BINOP_UNONE_NONE_NONE, vcmpeqq_n_, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_NONE_IMM, vqshluq_n_s, v16qi, v8hi, v4si) VAR3 (BINOP_NONE_NONE_UNONE, vaddvq_p_s, v16qi, v8hi, v4si) VAR3 (BINOP_NONE_NONE_NONE, vsubq_s, v16qi, v8hi, v4si) diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index 9712bc0..6f5fe06 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -811,9 +811,9 @@ (set_attr "length""8")]) ;; -;; [vcmpneq_s]) +;; [vcmpneq_]) ;; -(define_insn "mve_vcmpneq_s" +(define_insn "mve_vcmpneq_" [ (set (match_operand:HI 0 "vpr_register_operand" "=Up") (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") @@ -980,9 +980,9 @@ ) ;; -;; [vcmpcsq_n_u]) +;; [vcmpcsq_n_]) ;; -(define_insn "mve_vcmpcsq_n_u" +(define_insn "mve_vcmpcsq_n_" [ (set (match_operand:HI 0 "vpr_register_operand" "=Up") (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") @@ -995,9 
+995,9 @@ ]) ;; -;; [vcmpcsq_u]) +;; [vcmpcsq_]) ;; -(define_insn "mve_vcmpcsq_u" +(define_insn "mve_vcmpcsq_" [ (set (match_operand:HI 0 "vpr_register_operand" "=Up") (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") @@ -1010,9 +1010,9 @@ ]) ;; -;; [vcmpeqq_n_s]) +;; [vcmpeqq_n_]) ;; -(define_insn "mve_vcmpeqq_n_s" +(define_insn "mve_vcmpeqq_n_" [ (set (match_operand:HI 0 "vpr_register_operand" "=Up") (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") @@ -1025,9 +1025,9 @@ ]) ;; -;; [vcmpeqq_s]) +;; [vcmpeqq_]) ;; -(define_insn "mve_vcmpeqq_s" +(define_insn "mve_vcmpeqq_" [ (set (match_operand:HI 0 "vpr_register_operand" "=Up") (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") @@ -1040,9 +1040,9 @@ ]) ;; -;; [vcmpgeq_n_s]) +;; [vcmpgeq_n_]) ;; -(define_insn "mve_vcmpgeq_n_s" +(define_insn "mve_vcmpgeq_n_" [ (set (match_operand:HI 0 "vpr_register_operand" "=Up") (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") @@ -1055,9 +1055,9 @@ ]) ;; -;; [vcmpgeq_s]) +;; [vcmpgeq_]) ;; -(define_insn "mve_vcmpgeq_s" +(define_insn "mve_vcmpgeq_" [ (set (match_operand:HI 0 "vpr_register_operand" "=Up") (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") @@ -1070,9 +1070,9 @@ ]) ;; -;; [vcmpgtq_n_s]) +;; [vcmpgtq_n_]) ;; -(define_insn "mve_vcmpgtq_n_s" +(define_insn "mve_vcmpgtq_n_" [ (set (match_operand:HI 0 "vpr_register_operand" "=Up") (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") @@ -1085,9 +1085,9 @@ ]) ;; -;; [vcmpgtq_s]) +;; [vcmpgtq_]) ;; -(define_insn "mve_vcmpgtq_s" +(define_insn "mve_vcmpgtq_" [ (set (match_operand:HI 0 "vpr_register_operand" "=Up") (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") @@ -1100,9 +1100,9 @@ ]) ;; -;; [vcmphiq_n_u]) +;; [vcmphiq_n_]) ;; -(define_insn "mve_vcmphiq_n_u" +(define_insn "mve_vcmphiq_n_" [ (set (match_operand:HI 0 "vpr_register_operand" "=Up") (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") @@ -1115,9 +1115,9 @@ ]) ;; -;; [vcmphiq_u]) +;; [vcmphiq_]) ;; -(define_insn "mve_vcmphiq_u" +(define_insn "mve_vcmphiq_" [ (set (match_operand:HI 0 "vpr_register_operand" "=Up") (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") @@ -1130,9 +1130,9 @@ ]) ;; -;; [vcmpleq_n_s]) +;; [vcmpleq_n_]) ;; -(define_insn "mve_vcmpleq_n_s" +(define_insn "mve_vcmpleq_n_" [ (set (match_operand:HI 0 "vpr_register_operand" "=Up") (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") @@ -1145,9 +1145,9 @@ ]) ;; -;; [vcmpleq_s]) +;; [vcmpleq_]) ;; -(define_insn "mve_vcmpleq_s" +(define_insn "mve_vcmpleq_" [ (set (match_operand:HI 0 "vpr_register_operand" "=Up") (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") @@ -1160,9 +1160,9 @@ ]) ;; -;; [vcmpltq_n_s]) +;; [vcmpltq_n_]) ;; -(define_insn "mve_vcmpltq_n_s" +(define_insn "mve_vcmpltq_n_" [ (set (match_operand:HI 0 "vpr_register_operand" "=Up") (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") @@ -1175,9 +1175,9 @@ ]) ;; -;; [vcmpltq_s]) +;; [vcmpltq_]) ;; -(define_insn "mve_vcmpltq_s" +(define_insn "mve_vcmpltq_" [ (set (match_operand:HI 0 "vpr_register_operand" "=Up") (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") @@ -1190,9 +1190,9 @@ ]) ;; -;; [vcmpneq_n_s]) +;; [vcmpneq_n_]) ;; -(define_insn "mve_vcmpneq_n_s" +(define_insn "mve_vcmpneq_n_" [ (set (match_operand:HI 0 "vpr_register_operand" "=Up") (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") -- cgit v1.1 From d083fbf72d4533d2009c725524983e1184981e74 Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Mon, 10 May 2021 
12:52:02 +0000 Subject: arm: MVE: Factorize all vcmp* integer patterns After removing the signed and unsigned suffixes in the previous patches, we can now factorize the vcmp* patterns: there is no longer an asymmetry where operators do not have the same set of signed and unsigned variants. The will make maintenance easier. MVE has a different set of vector comparison operators than Neon, so we have to introduce dedicated iterators. 2021-05-10 Christophe Lyon gcc/ * config/arm/iterators.md (MVE_COMPARISONS): New. (mve_cmp_op): New. (mve_cmp_type): New. * config/arm/mve.md (mve_vcmpq_): New, merge all mve_vcmp patterns. (mve_vcmpneq_, mve_vcmpcsq_n_, mve_vcmpcsq_) (mve_vcmpeqq_n_, mve_vcmpeqq_, mve_vcmpgeq_n_) (mve_vcmpgeq_, mve_vcmpgtq_n_, mve_vcmpgtq_) (mve_vcmphiq_n_, mve_vcmphiq_, mve_vcmpleq_n_) (mve_vcmpleq_, mve_vcmpltq_n_, mve_vcmpltq_) (mve_vcmpneq_n_, mve_vcmpltq_n_, mve_vcmpltq_) (mve_vcmpneq_n_): Remove. --- gcc/config/arm/iterators.md | 8 ++ gcc/config/arm/mve.md | 250 ++++---------------------------------------- 2 files changed, 27 insertions(+), 231 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index 0aba93f..29347f7 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -285,6 +285,8 @@ ;; Comparisons for vc (define_code_iterator COMPARISONS [eq gt ge le lt]) +;; Comparisons for MVE +(define_code_iterator MVE_COMPARISONS [eq ge geu gt gtu le lt ne]) ;; A list of ... (define_code_iterator IOR_XOR [ior xor]) @@ -336,8 +338,14 @@ (define_code_attr cmp_op [(eq "eq") (gt "gt") (ge "ge") (lt "lt") (le "le") (gtu "gt") (geu "ge")]) +(define_code_attr mve_cmp_op [(eq "eq") (gt "gt") (ge "ge") (lt "lt") (le "le") + (gtu "hi") (geu "cs") (ne "ne")]) + (define_code_attr cmp_type [(eq "i") (gt "s") (ge "s") (lt "s") (le "s")]) +(define_code_attr mve_cmp_type [(eq "i") (gt "s") (ge "s") (lt "s") (le "s") + (gtu "u") (geu "u") (ne "i")]) + (define_code_attr vfml_op [(plus "a") (minus "s")]) (define_code_attr ss_op [(ss_plus "qadd") (ss_minus "qsub")]) diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index 6f5fe06..85c108c 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -811,17 +811,30 @@ (set_attr "length""8")]) ;; -;; [vcmpneq_]) +;; [vcmpneq_, vcmpcsq_, vcmpeqq_, vcmpgeq_, vcmpgtq_, vcmphiq_, vcmpleq_, vcmpltq_]) ;; -(define_insn "mve_vcmpneq_" +(define_insn "mve_vcmpq_" [ (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") - (match_operand:MVE_2 2 "s_register_operand" "w")] - VCMPNEQ)) + (MVE_COMPARISONS:HI (match_operand:MVE_2 1 "s_register_operand" "w") + (match_operand:MVE_2 2 "s_register_operand" "w"))) + ] + "TARGET_HAVE_MVE" + "vcmp.%# , %q1, %q2" + [(set_attr "type" "mve_move") +]) + +;; +;; [vcmpcsq_n_, vcmpeqq_n_, vcmpgeq_n_, vcmpgtq_n_, vcmphiq_n_, vcmpleq_n_, vcmpltq_n_, vcmpneq_n_]) +;; +(define_insn "mve_vcmpq_n_" + [ + (set (match_operand:HI 0 "vpr_register_operand" "=Up") + (MVE_COMPARISONS:HI (match_operand:MVE_2 1 "s_register_operand" "w") + (match_operand: 2 "s_register_operand" "r"))) ] "TARGET_HAVE_MVE" - "vcmp.i%# ne, %q1, %q2" + "vcmp.%# , %q1, %2" [(set_attr "type" "mve_move") ]) @@ -980,231 +993,6 @@ ) ;; -;; [vcmpcsq_n_]) -;; -(define_insn "mve_vcmpcsq_n_" - [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") - (match_operand: 2 "s_register_operand" "r")] - VCMPCSQ_N_U)) - ] - "TARGET_HAVE_MVE" - "vcmp.u%# cs, %q1, 
%2" - [(set_attr "type" "mve_move") -]) - -;; -;; [vcmpcsq_]) -;; -(define_insn "mve_vcmpcsq_" - [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") - (match_operand:MVE_2 2 "s_register_operand" "w")] - VCMPCSQ_U)) - ] - "TARGET_HAVE_MVE" - "vcmp.u%# cs, %q1, %q2" - [(set_attr "type" "mve_move") -]) - -;; -;; [vcmpeqq_n_]) -;; -(define_insn "mve_vcmpeqq_n_" - [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") - (match_operand: 2 "s_register_operand" "r")] - VCMPEQQ_N)) - ] - "TARGET_HAVE_MVE" - "vcmp.i%# eq, %q1, %2" - [(set_attr "type" "mve_move") -]) - -;; -;; [vcmpeqq_]) -;; -(define_insn "mve_vcmpeqq_" - [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") - (match_operand:MVE_2 2 "s_register_operand" "w")] - VCMPEQQ)) - ] - "TARGET_HAVE_MVE" - "vcmp.i%# eq, %q1, %q2" - [(set_attr "type" "mve_move") -]) - -;; -;; [vcmpgeq_n_]) -;; -(define_insn "mve_vcmpgeq_n_" - [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") - (match_operand: 2 "s_register_operand" "r")] - VCMPGEQ_N_S)) - ] - "TARGET_HAVE_MVE" - "vcmp.s%# ge, %q1, %2" - [(set_attr "type" "mve_move") -]) - -;; -;; [vcmpgeq_]) -;; -(define_insn "mve_vcmpgeq_" - [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") - (match_operand:MVE_2 2 "s_register_operand" "w")] - VCMPGEQ_S)) - ] - "TARGET_HAVE_MVE" - "vcmp.s%# ge, %q1, %q2" - [(set_attr "type" "mve_move") -]) - -;; -;; [vcmpgtq_n_]) -;; -(define_insn "mve_vcmpgtq_n_" - [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") - (match_operand: 2 "s_register_operand" "r")] - VCMPGTQ_N_S)) - ] - "TARGET_HAVE_MVE" - "vcmp.s%# gt, %q1, %2" - [(set_attr "type" "mve_move") -]) - -;; -;; [vcmpgtq_]) -;; -(define_insn "mve_vcmpgtq_" - [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") - (match_operand:MVE_2 2 "s_register_operand" "w")] - VCMPGTQ_S)) - ] - "TARGET_HAVE_MVE" - "vcmp.s%# gt, %q1, %q2" - [(set_attr "type" "mve_move") -]) - -;; -;; [vcmphiq_n_]) -;; -(define_insn "mve_vcmphiq_n_" - [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") - (match_operand: 2 "s_register_operand" "r")] - VCMPHIQ_N_U)) - ] - "TARGET_HAVE_MVE" - "vcmp.u%# hi, %q1, %2" - [(set_attr "type" "mve_move") -]) - -;; -;; [vcmphiq_]) -;; -(define_insn "mve_vcmphiq_" - [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") - (match_operand:MVE_2 2 "s_register_operand" "w")] - VCMPHIQ_U)) - ] - "TARGET_HAVE_MVE" - "vcmp.u%# hi, %q1, %q2" - [(set_attr "type" "mve_move") -]) - -;; -;; [vcmpleq_n_]) -;; -(define_insn "mve_vcmpleq_n_" - [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") - (match_operand: 2 "s_register_operand" "r")] - VCMPLEQ_N_S)) - ] - "TARGET_HAVE_MVE" - "vcmp.s%# le, %q1, %2" - [(set_attr "type" "mve_move") -]) - -;; -;; [vcmpleq_]) -;; -(define_insn "mve_vcmpleq_" - [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") - (match_operand:MVE_2 2 
"s_register_operand" "w")] - VCMPLEQ_S)) - ] - "TARGET_HAVE_MVE" - "vcmp.s%# le, %q1, %q2" - [(set_attr "type" "mve_move") -]) - -;; -;; [vcmpltq_n_]) -;; -(define_insn "mve_vcmpltq_n_" - [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") - (match_operand: 2 "s_register_operand" "r")] - VCMPLTQ_N_S)) - ] - "TARGET_HAVE_MVE" - "vcmp.s%# lt, %q1, %2" - [(set_attr "type" "mve_move") -]) - -;; -;; [vcmpltq_]) -;; -(define_insn "mve_vcmpltq_" - [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") - (match_operand:MVE_2 2 "s_register_operand" "w")] - VCMPLTQ_S)) - ] - "TARGET_HAVE_MVE" - "vcmp.s%# lt, %q1, %q2" - [(set_attr "type" "mve_move") -]) - -;; -;; [vcmpneq_n_]) -;; -(define_insn "mve_vcmpneq_n_" - [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") - (match_operand: 2 "s_register_operand" "r")] - VCMPNEQ_N)) - ] - "TARGET_HAVE_MVE" - "vcmp.i%# ne, %q1, %2" - [(set_attr "type" "mve_move") -]) - -;; ;; [veorq_u, veorq_s]) ;; (define_insn "mve_veorq_u" -- cgit v1.1 From 902692c1cbdb5c0ce7ea865fa5677aaeb78802f4 Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Mon, 10 May 2021 12:52:11 +0000 Subject: arm: MVE: Factorize vcmp_*f* Like in the previous, we factorize the vcmp_*f* patterns to make maintenance easier. 2021-05-10 Christophe Lyon gcc/ * config/arm/iterators.md (MVE_FP_COMPARISONS): New. * config/arm/mve.md (mve_vcmpq_f) (mve_vcmpq_n_f): New, merge all vcmp_*f* patterns. (mve_vcmpeqq_f, mve_vcmpeqq_n_f, mve_vcmpgeq_f) (mve_vcmpgeq_n_f, mve_vcmpgtq_f) (mve_vcmpgtq_n_f, mve_vcmpleq_f) (mve_vcmpleq_n_f, mve_vcmpltq_f) (mve_vcmpltq_n_f, mve_vcmpneq_f) (mve_vcmpneq_n_f): Remove. * config/arm/unspecs.md (VCMPEQQ_F, VCMPEQQ_N_F, VCMPGEQ_F) (VCMPGEQ_N_F, VCMPGTQ_F, VCMPGTQ_N_F, VCMPLEQ_F, VCMPLEQ_N_F) (VCMPLTQ_F, VCMPLTQ_N_F, VCMPNEQ_F, VCMPNEQ_N_F): Remove. --- gcc/config/arm/iterators.md | 1 + gcc/config/arm/mve.md | 172 +++----------------------------------------- gcc/config/arm/unspecs.md | 12 ---- 3 files changed, 11 insertions(+), 174 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index 29347f7..95df8bd 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -287,6 +287,7 @@ (define_code_iterator COMPARISONS [eq gt ge le lt]) ;; Comparisons for MVE (define_code_iterator MVE_COMPARISONS [eq ge geu gt gtu le lt ne]) +(define_code_iterator MVE_FP_COMPARISONS [eq ge gt le lt ne]) ;; A list of ... 
(define_code_iterator IOR_XOR [ior xor]) diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index 85c108c..45df211 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -1901,182 +1901,30 @@ ]) ;; -;; [vcmpeqq_f]) +;; [vcmpeqq_f, vcmpgeq_f, vcmpgtq_f, vcmpleq_f, vcmpltq_f, vcmpneq_f]) ;; -(define_insn "mve_vcmpeqq_f" +(define_insn "mve_vcmpq_f" [ (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w") - (match_operand:MVE_0 2 "s_register_operand" "w")] - VCMPEQQ_F)) - ] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcmp.f%# eq, %q1, %q2" - [(set_attr "type" "mve_move") -]) - -;; -;; [vcmpeqq_n_f]) -;; -(define_insn "mve_vcmpeqq_n_f" - [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w") - (match_operand: 2 "s_register_operand" "r")] - VCMPEQQ_N_F)) - ] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcmp.f%# eq, %q1, %2" - [(set_attr "type" "mve_move") -]) - -;; -;; [vcmpgeq_f]) -;; -(define_insn "mve_vcmpgeq_f" - [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w") - (match_operand:MVE_0 2 "s_register_operand" "w")] - VCMPGEQ_F)) - ] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcmp.f%# ge, %q1, %q2" - [(set_attr "type" "mve_move") -]) - -;; -;; [vcmpgeq_n_f]) -;; -(define_insn "mve_vcmpgeq_n_f" - [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w") - (match_operand: 2 "s_register_operand" "r")] - VCMPGEQ_N_F)) - ] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcmp.f%# ge, %q1, %2" - [(set_attr "type" "mve_move") -]) - -;; -;; [vcmpgtq_f]) -;; -(define_insn "mve_vcmpgtq_f" - [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w") - (match_operand:MVE_0 2 "s_register_operand" "w")] - VCMPGTQ_F)) - ] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcmp.f%# gt, %q1, %q2" - [(set_attr "type" "mve_move") -]) - -;; -;; [vcmpgtq_n_f]) -;; -(define_insn "mve_vcmpgtq_n_f" - [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w") - (match_operand: 2 "s_register_operand" "r")] - VCMPGTQ_N_F)) + (MVE_FP_COMPARISONS:HI (match_operand:MVE_0 1 "s_register_operand" "w") + (match_operand:MVE_0 2 "s_register_operand" "w"))) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcmp.f%# gt, %q1, %2" + "vcmp.f%# , %q1, %q2" [(set_attr "type" "mve_move") ]) ;; -;; [vcmpleq_f]) +;; [vcmpeqq_n_f, vcmpgeq_n_f, vcmpgtq_n_f, vcmpleq_n_f, vcmpltq_n_f, vcmpneq_n_f]) ;; -(define_insn "mve_vcmpleq_f" +(define_insn "mve_vcmpq_n_f" [ (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w") - (match_operand:MVE_0 2 "s_register_operand" "w")] - VCMPLEQ_F)) - ] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcmp.f%# le, %q1, %q2" - [(set_attr "type" "mve_move") -]) - -;; -;; [vcmpleq_n_f]) -;; -(define_insn "mve_vcmpleq_n_f" - [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w") - (match_operand: 2 "s_register_operand" "r")] - VCMPLEQ_N_F)) - ] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcmp.f%# le, %q1, %2" - [(set_attr "type" "mve_move") -]) - -;; -;; [vcmpltq_f]) -;; -(define_insn "mve_vcmpltq_f" - [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI 
[(match_operand:MVE_0 1 "s_register_operand" "w") - (match_operand:MVE_0 2 "s_register_operand" "w")] - VCMPLTQ_F)) - ] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcmp.f%# lt, %q1, %q2" - [(set_attr "type" "mve_move") -]) - -;; -;; [vcmpltq_n_f]) -;; -(define_insn "mve_vcmpltq_n_f" - [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w") - (match_operand: 2 "s_register_operand" "r")] - VCMPLTQ_N_F)) - ] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcmp.f%# lt, %q1, %2" - [(set_attr "type" "mve_move") -]) - -;; -;; [vcmpneq_f]) -;; -(define_insn "mve_vcmpneq_f" - [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w") - (match_operand:MVE_0 2 "s_register_operand" "w")] - VCMPNEQ_F)) - ] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcmp.f%# ne, %q1, %q2" - [(set_attr "type" "mve_move") -]) - -;; -;; [vcmpneq_n_f]) -;; -(define_insn "mve_vcmpneq_n_f" - [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w") - (match_operand: 2 "s_register_operand" "r")] - VCMPNEQ_N_F)) + (MVE_FP_COMPARISONS:HI (match_operand:MVE_0 1 "s_register_operand" "w") + (match_operand: 2 "s_register_operand" "r"))) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcmp.f%# ne, %q1, %2" + "vcmp.f%# , %q1, %2" [(set_attr "type" "mve_move") ]) diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md index 4d47ab7..07ca53b 100644 --- a/gcc/config/arm/unspecs.md +++ b/gcc/config/arm/unspecs.md @@ -710,18 +710,6 @@ VABDQ_M_U VABDQ_F VADDQ_N_F - VCMPEQQ_F - VCMPEQQ_N_F - VCMPGEQ_F - VCMPGEQ_N_F - VCMPGTQ_F - VCMPGTQ_N_F - VCMPLEQ_F - VCMPLEQ_N_F - VCMPLTQ_F - VCMPLTQ_N_F - VCMPNEQ_F - VCMPNEQ_N_F VMAXNMAQ_F VMAXNMAVQ_F VMAXNMQ_F -- cgit v1.1 From 695776733785973e809c89a739f3fa8d4c9b9d4e Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 10 May 2021 16:41:16 +0200 Subject: i386: Force V2SI mode operands to registers in expand_sse_movcc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For some reason middle-end does not enforce operand predicates for vcond patterns. 2021-05-10 Uroš Bizjak gcc/ * config/i386/i386-expand.c (ix86_expand_sse_movcc) : Force op_true to register. --- gcc/config/i386/i386-expand.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index e9f11bc..5cfde5b 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -3707,6 +3707,8 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) case E_V2SImode: if (TARGET_SSE4_1) { + op_true = force_reg (mode, op_true); + gen = gen_mmx_pblendvb; if (mode != V8QImode) d = gen_reg_rtx (V8QImode); -- cgit v1.1 From e85f3aeddbdba27ec883bb85a58f96615726da32 Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Mon, 10 May 2021 16:39:04 +0000 Subject: arm: remove error in CPP_SPEC when float-abi soft and hard are used together arm.h has had this error message since 1997, and was never updated to take softfp into account. Anyway, it seems it was useful long ago, but it is no longer needed since option parsing has been improved: -mfloat-abi is handled via arm.opt and updates the var_float_abi variable. So, the last instance of -mfloat-abi= on the command line wins. 
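For example, assuming a trivial test.c and the arm-none-eabi-gcc driver (both names are only for illustration), the two ABIs can now be given on the same command line and the later option simply takes effect:

  $ arm-none-eabi-gcc -c test.c -mfloat-abi=hard -mfloat-abi=soft
  (no error any more; the object is built as if only -mfloat-abi=soft had been
   passed, since the last -mfloat-abi= on the command line wins)
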
This patch just removes this error message, thus enabling many more tests to pass on arm-eabi: * with -mcpu=cortex-a7/-mfloat-abi=soft/-march=armv7ve+simd (2 more passes) gcc.target/arm/pr52375.c g++.target/arm/pr99593.C (test for excess errors) * with -mthumb/-mfloat-abi=soft/-march=armv6s-m (115 more passes in C, 90 more in C++) gcc.target/arm/armv8_1m-fp16-move-1.c (test for excess errors) gcc.target/arm/armv8_1m-fp32-move-1.c (test for excess errors) gcc.target/arm/armv8_1m-fp64-move-1.c (test for excess errors) gcc.target/arm/armv8_2-fp16-move-1.c (test for excess errors) gcc.target/arm/cortex-m55-nodsp-flag-hard.c (test for excess errors) gcc.target/arm/cortex-m55-nofp-flag-hard.c (test for excess errors) gcc.target/arm/cortex-m55-nomve-flag-hard.c (test for excess errors) gcc.target/arm/cortex-m55-nomve.fp-flag-hard.c (test for excess errors) g++.target/arm/no_unique_address_1.C g++.target/arm/no_unique_address_2.C * with -mthumb/-mfloat-abi=soft/-march=armv7-m (153 more passes in C, 90 more in C++) gcc.dg/pr59418.c (test for excess errors) gcc.target/arm/armv8_1m-fp16-move-1.c (test for excess errors) gcc.target/arm/armv8_1m-fp32-move-1.c (test for excess errors) gcc.target/arm/armv8_1m-fp64-move-1.c (test for excess errors) gcc.target/arm/armv8_2-fp16-move-1.c (test for excess errors) gcc.target/arm/bfloat16_scalar_2_1.c (test for excess errors) gcc.target/arm/bfloat16_scalar_3_1.c (test for excess errors) gcc.target/arm/cortex-m55-nodsp-flag-hard.c (test for excess errors) gcc.target/arm/cortex-m55-nofp-flag-hard.c (test for excess errors) gcc.target/arm/cortex-m55-nomve-flag-hard.c (test for excess errors) gcc.target/arm/cortex-m55-nomve.fp-flag-hard.c (test for excess errors) gcc.target/arm/pr52375.c (test for excess errors) gcc.target/arm/simd/vld1_bf16_1.c (test for excess errors) gcc.target/arm/simd/vldn_lane_bf16_1.c (test for excess errors) gcc.target/arm/simd/vst1_bf16_1.c (test for excess errors) gcc.target/arm/simd/vstn_lane_bf16_1.c (test for excess errors) g++.target/arm/no_unique_address_1.C g++.target/arm/no_unique_address_2.C * with -mthumb/-mfloat-abi=hard/-march=armv7e-m+fp (65 more passes) gcc.target/arm/atomic-comp-swap-release-acquire-3.c (test for excess errors) gcc.target/arm/atomic-comp-swap-release-acquire-3.c scan-assembler-not dmb gcc.target/arm/atomic-comp-swap-release-acquire-3.c scan-assembler-times ldaex 4 gcc.target/arm/atomic-comp-swap-release-acquire-3.c scan-assembler-times stlex 4 gcc.target/arm/atomic-op-acq_rel-3.c (test for excess errors) gcc.target/arm/atomic-op-acq_rel-3.c scan-assembler-not dmb gcc.target/arm/atomic-op-acq_rel-3.c scan-assembler-times ldaex\tr[0-9]+, \\[r[0-9]+\\] 6 gcc.target/arm/atomic-op-acq_rel-3.c scan-assembler-times stlex\t...?, r[0-9]+, \\[r[0-9]+\\] 6 gcc.target/arm/atomic-op-acquire-3.c (test for excess errors) gcc.target/arm/atomic-op-acquire-3.c scan-assembler-not dmb gcc.target/arm/atomic-op-acquire-3.c scan-assembler-times ldaex\tr[0-9]+, \\[r[0-9]+\\] 6 gcc.target/arm/atomic-op-acquire-3.c scan-assembler-times strex\t...?, r[0-9]+, \\[r[0-9]+\\] 6 gcc.target/arm/atomic-op-char-3.c (test for excess errors) gcc.target/arm/atomic-op-char-3.c scan-assembler-not dmb gcc.target/arm/atomic-op-char-3.c scan-assembler-times ldrexb\tr[0-9]+, \\[r[0-9]+\\] 6 gcc.target/arm/atomic-op-char-3.c scan-assembler-times strexb\t...?, r[0-9]+, \\[r[0-9]+\\] 6 gcc.target/arm/atomic-op-consume-3.c (test for excess errors) gcc.target/arm/atomic-op-consume-3.c scan-assembler-not dmb gcc.target/arm/atomic-op-consume-3.c 
scan-assembler-times ldaex\tr[0-9]+, \\[r[0-9]+\\] 6 gcc.target/arm/atomic-op-consume-3.c scan-assembler-times strex\t...?, r[0-9]+, \\[r[0-9]+\\] 6 gcc.target/arm/atomic-op-int-3.c (test for excess errors) gcc.target/arm/atomic-op-int-3.c scan-assembler-not dmb gcc.target/arm/atomic-op-int-3.c scan-assembler-times ldrex\tr[0-9]+, \\[r[0-9]+\\] 6 gcc.target/arm/atomic-op-int-3.c scan-assembler-times strex\t...?, r[0-9]+, \\[r[0-9]+\\] 6 gcc.target/arm/atomic-op-relaxed-3.c (test for excess errors) gcc.target/arm/atomic-op-relaxed-3.c scan-assembler-not dmb gcc.target/arm/atomic-op-relaxed-3.c scan-assembler-times ldrex\tr[0-9]+, \\[r[0-9]+\\] 6 gcc.target/arm/atomic-op-relaxed-3.c scan-assembler-times strex\t...?, r[0-9]+, \\[r[0-9]+\\] 6 gcc.target/arm/atomic-op-release-3.c (test for excess errors) gcc.target/arm/atomic-op-release-3.c scan-assembler-not dmb gcc.target/arm/atomic-op-release-3.c scan-assembler-times ldrex\tr[0-9]+, \\[r[0-9]+\\] 6 gcc.target/arm/atomic-op-release-3.c scan-assembler-times stlex\t...?, r[0-9]+, \\[r[0-9]+\\] 6 gcc.target/arm/atomic-op-seq_cst-3.c (test for excess errors) gcc.target/arm/atomic-op-seq_cst-3.c scan-assembler-not dmb gcc.target/arm/atomic-op-seq_cst-3.c scan-assembler-times ldaex\tr[0-9]+, \\[r[0-9]+\\] 6 gcc.target/arm/atomic-op-seq_cst-3.c scan-assembler-times stlex\t...?, r[0-9]+, \\[r[0-9]+\\] 6 gcc.target/arm/atomic-op-short-3.c (test for excess errors) gcc.target/arm/atomic-op-short-3.c scan-assembler-not dmb gcc.target/arm/atomic-op-short-3.c scan-assembler-times ldrexh\tr[0-9]+, \\[r[0-9]+\\] 6 gcc.target/arm/atomic-op-short-3.c scan-assembler-times strexh\t...?, r[0-9]+, \\[r[0-9]+\\] 6 gcc.target/arm/builtin-bswap-2.c (test for excess errors) gcc.target/arm/builtin-bswap-2.c scan-assembler-not orr[ \t] gcc.target/arm/builtin-bswap-2.c scan-assembler-times rev16\\t 2 gcc.target/arm/builtin-bswap-2.c scan-assembler-times rev\\t 4 gcc.target/arm/builtin-bswap-2.c scan-assembler-times revsh\\t 2 gcc.target/arm/builtin-bswap16-2.c (test for excess errors) gcc.target/arm/builtin-bswap16-2.c scan-assembler-not orr[ \t] gcc.target/arm/pr89190.c (test for excess errors) gcc.target/arm/pr95646.c (test for excess errors) gcc.target/arm/pr95646.c check-function-bodies __acle_se_bar gcc.target/arm/pr95646.c scan-assembler-not mov\tr9, r0 * with -mthumb/-mfloat-abi=hard/-march=armv8-m.main+fp+dsp (870 more passes) gcc.target/arm/atomic-comp-swap-release-acquire-3.c (test for excess errors) gcc.target/arm/atomic-comp-swap-release-acquire-3.c scan-assembler-not dmb gcc.target/arm/atomic-comp-swap-release-acquire-3.c scan-assembler-times ldaex 4 gcc.target/arm/atomic-comp-swap-release-acquire-3.c scan-assembler-times stlex 4 gcc.target/arm/atomic-op-acq_rel-3.c (test for excess errors) gcc.target/arm/atomic-op-acq_rel-3.c scan-assembler-not dmb gcc.target/arm/atomic-op-acq_rel-3.c scan-assembler-times ldaex\tr[0-9]+, \\[r[0-9]+\\] 6 gcc.target/arm/atomic-op-acq_rel-3.c scan-assembler-times stlex\t...?, r[0-9]+, \\[r[0-9]+\\] 6 gcc.target/arm/atomic-op-acquire-3.c (test for excess errors) gcc.target/arm/atomic-op-acquire-3.c scan-assembler-not dmb gcc.target/arm/atomic-op-acquire-3.c scan-assembler-times ldaex\tr[0-9]+, \\[r[0-9]+\\] 6 gcc.target/arm/atomic-op-acquire-3.c scan-assembler-times strex\t...?, r[0-9]+, \\[r[0-9]+\\] 6 gcc.target/arm/atomic-op-char-3.c (test for excess errors) gcc.target/arm/atomic-op-char-3.c scan-assembler-not dmb gcc.target/arm/atomic-op-char-3.c scan-assembler-times ldrexb\tr[0-9]+, \\[r[0-9]+\\] 6 
gcc.target/arm/atomic-op-char-3.c scan-assembler-times strexb\t...?, r[0-9]+, \\[r[0-9]+\\] 6 gcc.target/arm/atomic-op-consume-3.c (test for excess errors) gcc.target/arm/atomic-op-consume-3.c scan-assembler-not dmb gcc.target/arm/atomic-op-consume-3.c scan-assembler-times ldaex\tr[0-9]+, \\[r[0-9]+\\] 6 gcc.target/arm/atomic-op-consume-3.c scan-assembler-times strex\t...?, r[0-9]+, \\[r[0-9]+\\] 6 gcc.target/arm/atomic-op-int-3.c (test for excess errors) gcc.target/arm/atomic-op-int-3.c scan-assembler-not dmb gcc.target/arm/atomic-op-int-3.c scan-assembler-times ldrex\tr[0-9]+, \\[r[0-9]+\\] 6 gcc.target/arm/atomic-op-int-3.c scan-assembler-times strex\t...?, r[0-9]+, \\[r[0-9]+\\] 6 gcc.target/arm/atomic-op-relaxed-3.c (test for excess errors) gcc.target/arm/atomic-op-relaxed-3.c scan-assembler-not dmb gcc.target/arm/atomic-op-relaxed-3.c scan-assembler-times ldrex\tr[0-9]+, \\[r[0-9]+\\] 6 gcc.target/arm/atomic-op-relaxed-3.c scan-assembler-times strex\t...?, r[0-9]+, \\[r[0-9]+\\] 6 gcc.target/arm/atomic-op-release-3.c (test for excess errors) gcc.target/arm/atomic-op-release-3.c scan-assembler-not dmb gcc.target/arm/atomic-op-release-3.c scan-assembler-times ldrex\tr[0-9]+, \\[r[0-9]+\\] 6 gcc.target/arm/atomic-op-release-3.c scan-assembler-times stlex\t...?, r[0-9]+, \\[r[0-9]+\\] 6 gcc.target/arm/atomic-op-seq_cst-3.c (test for excess errors) gcc.target/arm/atomic-op-seq_cst-3.c scan-assembler-not dmb gcc.target/arm/atomic-op-seq_cst-3.c scan-assembler-times ldaex\tr[0-9]+, \\[r[0-9]+\\] 6 gcc.target/arm/atomic-op-seq_cst-3.c scan-assembler-times stlex\t...?, r[0-9]+, \\[r[0-9]+\\] 6 gcc.target/arm/atomic-op-short-3.c (test for excess errors) gcc.target/arm/atomic-op-short-3.c scan-assembler-not dmb gcc.target/arm/atomic-op-short-3.c scan-assembler-times ldrexh\tr[0-9]+, \\[r[0-9]+\\] 6 gcc.target/arm/atomic-op-short-3.c scan-assembler-times strexh\t...?, r[0-9]+, \\[r[0-9]+\\] 6 gcc.target/arm/builtin-bswap-2.c (test for excess errors) gcc.target/arm/builtin-bswap-2.c scan-assembler-not orr[ \t] gcc.target/arm/builtin-bswap-2.c scan-assembler-times rev16\\t 2 gcc.target/arm/builtin-bswap-2.c scan-assembler-times rev\\t 4 gcc.target/arm/builtin-bswap-2.c scan-assembler-times revsh\\t 2 gcc.target/arm/builtin-bswap16-2.c (test for excess errors) gcc.target/arm/builtin-bswap16-2.c scan-assembler-not orr[ \t] gcc.target/arm/pr89190.c (test for excess errors) gcc.target/arm/pr95646.c (test for excess errors) gcc.target/arm/pr95646.c check-function-bodies __acle_se_bar gcc.target/arm/pr95646.c scan-assembler-not mov\tr9, r0 and all cmse tests 2021-05-10 Christophe Lyon gcc/ * config/arm/arm.h (CPP_SPEC): Remove error message about -mfloat-abi. --- gcc/config/arm/arm.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h index c70af57..e430e4d 100644 --- a/gcc/config/arm/arm.h +++ b/gcc/config/arm/arm.h @@ -90,8 +90,6 @@ extern tree arm_bf16_ptr_type_node; #undef CPP_SPEC #define CPP_SPEC "%(subtarget_cpp_spec) \ -%{mfloat-abi=soft:%{mfloat-abi=hard: \ - %e-mfloat-abi=soft and -mfloat-abi=hard may not be used together}} \ %{mbig-endian:%{mlittle-endian: \ %e-mbig-endian and -mlittle-endian may not be used together}}" -- cgit v1.1 From 51d89e61f7ebfe75ca752e62bd29b58cb957235c Mon Sep 17 00:00:00 2001 From: Pat Haugen Date: Mon, 10 May 2021 13:49:06 -0500 Subject: Add ALTIVEC_REGS as pressure class. 
Code that has heavy register pressure on Altivec registers can suffer from over-aggressive scheduling during sched1, which then leads to increased register spill. This is due to the fact that registers that prefer ALTIVEC_REGS are currently assigned an allocno class of VSX_REGS. This then misleads the scheduler to think there are 64 regs available, when in reality there are only 32 Altivec regs. This patch fixes the problem by assigning an allocno class of ALTIVEC_REGS and adding ALTIVEC_REGS as a pressure class. 2021-05-10 Pat Haugen gcc/ChangeLog: * config/rs6000/rs6000.c (rs6000_ira_change_pseudo_allocno_class): Return ALTIVEC_REGS if that is best_class. (rs6000_compute_pressure_classes): Add ALTIVEC_REGS. gcc/testsuite/ChangeLog: * gcc.target/powerpc/fold-vec-insert-float-p9.c: Adjust counts. * gcc.target/powerpc/vec-rlmi-rlnm.c: Likewise. --- gcc/config/rs6000/rs6000.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index ee15af9..c852fb8 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -22527,11 +22527,14 @@ rs6000_ira_change_pseudo_allocno_class (int regno ATTRIBUTE_UNUSED, of allocno class. */ if (best_class == BASE_REGS) return GENERAL_REGS; - if (TARGET_VSX - && (best_class == FLOAT_REGS || best_class == ALTIVEC_REGS)) + if (TARGET_VSX && best_class == FLOAT_REGS) return VSX_REGS; return best_class; + case VSX_REGS: + if (best_class == ALTIVEC_REGS) + return ALTIVEC_REGS; + default: break; } @@ -23649,12 +23652,12 @@ rs6000_compute_pressure_classes (enum reg_class *pressure_classes) n = 0; pressure_classes[n++] = GENERAL_REGS; + if (TARGET_ALTIVEC) + pressure_classes[n++] = ALTIVEC_REGS; if (TARGET_VSX) pressure_classes[n++] = VSX_REGS; else { - if (TARGET_ALTIVEC) - pressure_classes[n++] = ALTIVEC_REGS; if (TARGET_HARD_FLOAT) pressure_classes[n++] = FLOAT_REGS; } -- cgit v1.1 From b084bfd43a8b72d8db8702ff9cb316482662cb90 Mon Sep 17 00:00:00 2001 From: Kewen Lin Date: Mon, 10 May 2021 21:34:36 -0500 Subject: rs6000: Move rs6000_vect_nonmem into target cost_data This patch is to move rs6000_vect_nonmem (target cost_data related information) into target cost_data struct. As Richi pointed out, we can gather data from add_stmt_cost invocations. This is one pre-step to centralize target cost_data related stuffs. gcc/ChangeLog: * config/rs6000/rs6000.c (rs6000_vect_nonmem): Renamed to vect_nonmem and moved into... (struct rs6000_cost_data): ...here. (rs6000_init_cost): Use vect_nonmem of cost_data instead. (rs6000_add_stmt_cost): Likewise. (rs6000_finish_cost): Likewise. --- gcc/config/rs6000/rs6000.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index c852fb8..96d0166 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -5235,6 +5235,9 @@ typedef struct _rs6000_cost_data { struct loop *loop_info; unsigned cost[3]; + /* For each vectorized loop, this var holds TRUE iff a non-memory vector + instruction is needed by the vectorization. */ + bool vect_nonmem; } rs6000_cost_data; /* Test for likely overcommitment of vector hardware resources. If a @@ -5292,10 +5295,6 @@ rs6000_density_test (rs6000_cost_data *data) /* Implement targetm.vectorize.init_cost. */ -/* For each vectorized loop, this var holds TRUE iff a non-memory vector - instruction is needed by the vectorization. 
*/ -static bool rs6000_vect_nonmem; - static void * rs6000_init_cost (struct loop *loop_info) { @@ -5304,7 +5303,7 @@ rs6000_init_cost (struct loop *loop_info) data->cost[vect_prologue] = 0; data->cost[vect_body] = 0; data->cost[vect_epilogue] = 0; - rs6000_vect_nonmem = false; + data->vect_nonmem = false; return data; } @@ -5364,7 +5363,7 @@ rs6000_add_stmt_cost (class vec_info *vinfo, void *data, int count, || kind == vec_promote_demote || kind == vec_construct || kind == scalar_to_vec) || (where == vect_body && kind == vector_stmt)) - rs6000_vect_nonmem = true; + cost_data->vect_nonmem = true; } return retval; @@ -5419,7 +5418,7 @@ rs6000_finish_cost (void *data, unsigned *prologue_cost, if (cost_data->loop_info) { loop_vec_info vec_info = loop_vec_info_for_loop (cost_data->loop_info); - if (!rs6000_vect_nonmem + if (!cost_data->vect_nonmem && LOOP_VINFO_VECT_FACTOR (vec_info) == 2 && LOOP_REQUIRES_VERSIONING (vec_info)) cost_data->cost[vect_body] += 10000; -- cgit v1.1 From 096f8215d2172ca4177cb26035e748d8f182fc8f Mon Sep 17 00:00:00 2001 From: Kewen Lin Date: Mon, 10 May 2021 22:13:27 -0500 Subject: vect: Add costing_for_scalar parameter to init_cost hook rs6000 port function rs6000_density_test wants to differentiate the current cost model is for the scalar version of a loop or block, or the vector version. As Richi suggested, this patch introduces one new parameter costing_for_scalar to init_cost hook to pass down this information explicitly. gcc/ChangeLog: * doc/tm.texi: Regenerated. * target.def (init_cost): Add new parameter costing_for_scalar. * targhooks.c (default_init_cost): Adjust for new parameter. * targhooks.h (default_init_cost): Likewise. * tree-vect-loop.c (_loop_vec_info::_loop_vec_info): Likewise. (vect_compute_single_scalar_iteration_cost): Likewise. (vect_analyze_loop_2): Likewise. * tree-vect-slp.c (_bb_vec_info::_bb_vec_info): Likewise. (vect_bb_vectorization_profitable_p): Likewise. * tree-vectorizer.h (init_cost): Likewise. * config/aarch64/aarch64.c (aarch64_init_cost): Likewise. * config/i386/i386.c (ix86_init_cost): Likewise. * config/rs6000/rs6000.c (rs6000_init_cost): Likewise. --- gcc/config/aarch64/aarch64.c | 2 +- gcc/config/i386/i386.c | 2 +- gcc/config/rs6000/rs6000.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 04855cb..85fd80e 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -14388,7 +14388,7 @@ struct aarch64_vector_costs /* Implement TARGET_VECTORIZE_INIT_COST. */ void * -aarch64_init_cost (class loop *) +aarch64_init_cost (class loop *, bool) { return new aarch64_vector_costs; } diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index ecc1535..915f89f 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -22289,7 +22289,7 @@ ix86_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info) /* Implement targetm.vectorize.init_cost. */ static void * -ix86_init_cost (class loop *) +ix86_init_cost (class loop *, bool) { unsigned *cost = XNEWVEC (unsigned, 3); cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0; diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 96d0166..1ef5149 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -5296,7 +5296,7 @@ rs6000_density_test (rs6000_cost_data *data) /* Implement targetm.vectorize.init_cost. 
*/ static void * -rs6000_init_cost (struct loop *loop_info) +rs6000_init_cost (struct loop *loop_info, bool) { rs6000_cost_data *data = XNEW (struct _rs6000_cost_data); data->loop_info = loop_info; -- cgit v1.1 From 1866182f6cf338880c68225d9de571b787b6abcd Mon Sep 17 00:00:00 2001 From: Kewen Lin Date: Mon, 10 May 2021 23:01:15 -0500 Subject: rs6000: Guard density_test only for vector version This patch teaches rs6000_density_test to only care about the vector version cost calculation and early return when calculating the single scalar iteration cost. Bootstrapped/regtested on powerpc64le-linux-gnu P9. gcc/ChangeLog: * config/rs6000/rs6000.c (struct rs6000_cost_data): New member costing_for_scalar. (rs6000_density_test): Early return if costing_for_scalar is true. (rs6000_init_cost): Init costing_for_scalar of rs6000_cost_data. --- gcc/config/rs6000/rs6000.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 1ef5149..d1b76f6 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -5238,6 +5238,8 @@ typedef struct _rs6000_cost_data /* For each vectorized loop, this var holds TRUE iff a non-memory vector instruction is needed by the vectorization. */ bool vect_nonmem; + /* Indicates this is costing for the scalar version of a loop or block. */ + bool costing_for_scalar; } rs6000_cost_data; /* Test for likely overcommitment of vector hardware resources. If a @@ -5259,6 +5261,12 @@ rs6000_density_test (rs6000_cost_data *data) int vec_cost = data->cost[vect_body], not_vec_cost = 0; int i, density_pct; + /* This density test only cares about the cost of vector version of the + loop, so immediately return if we are passed costing for the scalar + version (namely computing single scalar iteration cost). */ + if (data->costing_for_scalar) + return; + for (i = 0; i < nbbs; i++) { basic_block bb = bbs[i]; @@ -5296,7 +5304,7 @@ rs6000_density_test (rs6000_cost_data *data) /* Implement targetm.vectorize.init_cost. */ static void * -rs6000_init_cost (struct loop *loop_info, bool) +rs6000_init_cost (struct loop *loop_info, bool costing_for_scalar) { rs6000_cost_data *data = XNEW (struct _rs6000_cost_data); data->loop_info = loop_info; @@ -5304,6 +5312,7 @@ rs6000_init_cost (struct loop *loop_info, bool) data->cost[vect_body] = 0; data->cost[vect_epilogue] = 0; data->vect_nonmem = false; + data->costing_for_scalar = costing_for_scalar; return data; } -- cgit v1.1 From 9b905ba9ebba8d2cc805c26351225e7f74c02333 Mon Sep 17 00:00:00 2001 From: Srinath Parvathaneni Date: Tue, 11 May 2021 10:43:11 +0100 Subject: arm: Remove duplicate definitions from arm_mve.h (pr100419). This patch removes several duplicated intrinsic definitions from arm_mve.h mentioned in PR100419 and also fixes the wrong arguments in few of intrinsics polymorphic variants. gcc/ChangeLog: 2021-05-04 Srinath Parvathaneni PR target/100419 * config/arm/arm_mve.h (__arm_vstrwq_scatter_offset): Fix wrong arguments. (__arm_vcmpneq): Remove duplicate definition. (__arm_vstrwq_scatter_offset_p): Likewise. (__arm_vmaxq_x): Likewise. (__arm_vmlsdavaq): Likewise. (__arm_vmlsdavaxq): Likewise. (__arm_vmlsdavq_p): Likewise. (__arm_vmlsdavxq_p): Likewise. (__arm_vrmlaldavhaq): Likewise. (__arm_vstrbq_p): Likewise. (__arm_vstrbq_scatter_offset): Likewise. (__arm_vstrbq_scatter_offset_p): Likewise. (__arm_vstrdq_scatter_offset): Likewise. (__arm_vstrdq_scatter_offset_p): Likewise. 
(__arm_vstrdq_scatter_shifted_offset): Likewise. (__arm_vstrdq_scatter_shifted_offset_p): Likewise. Co-authored-by: Joe Ramsay --- gcc/config/arm/arm_mve.h | 460 ++++++++++++++--------------------------------- 1 file changed, 132 insertions(+), 328 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h index 5d78269..1380f3a 100644 --- a/gcc/config/arm/arm_mve.h +++ b/gcc/config/arm/arm_mve.h @@ -37808,33 +37808,19 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint32x4_t]: __arm_vstrwq_scatter_base_p_u32(p0, p1, __ARM_mve_coerce(__p2, uint32x4_t), p3), \ int (*)[__ARM_mve_type_float32x4_t]: __arm_vstrwq_scatter_base_p_f32(p0, p1, __ARM_mve_coerce(__p2, float32x4_t), p3));}) -#define __arm_vstrwq_scatter_offset(p0,p1,p2) ({ __typeof(p1) __p1 = (p1); \ +#define __arm_vstrwq_scatter_offset(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p2) __p2 = (p2); \ - _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4_t]: __arm_vstrwq_scatter_offset_s32 (__ARM_mve_coerce(p0, int32_t *), __p1, __ARM_mve_coerce(__p2, int32x4_t)), \ - int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vstrwq_scatter_offset_u32 (__ARM_mve_coerce(p0, uint32_t *), __p1, __ARM_mve_coerce(__p2, uint32x4_t)), \ - int (*)[__ARM_mve_type_float32_t_ptr][__ARM_mve_type_float32x4_t]: __arm_vstrwq_scatter_offset_f32 (__ARM_mve_coerce(p0, float32_t *), __p1, __ARM_mve_coerce(__p2, float32x4_t)));}) - -#define __arm_vstrwq_scatter_offset_p(p0,p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \ - __typeof(p2) __p2 = (p2); \ - _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4_t]: __arm_vstrwq_scatter_offset_p_s32 (__ARM_mve_coerce(p0, int32_t *), __p1, __ARM_mve_coerce(__p2, int32x4_t), p3), \ - int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vstrwq_scatter_offset_p_u32 (__ARM_mve_coerce(p0, uint32_t *), __p1, __ARM_mve_coerce(__p2, uint32x4_t), p3), \ - int (*)[__ARM_mve_type_float32_t_ptr][__ARM_mve_type_float32x4_t]: __arm_vstrwq_scatter_offset_p_f32 (__ARM_mve_coerce(p0, float32_t *), __p1, __ARM_mve_coerce(__p2, float32x4_t), p3));}) - -#define __arm_vstrwq_scatter_offset_p(p0,p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \ - __typeof(p2) __p2 = (p2); \ - _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4_t]: __arm_vstrwq_scatter_offset_p_s32 (__ARM_mve_coerce(p0, int32_t *), __p1, __ARM_mve_coerce(__p2, int32x4_t), p3), \ - int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vstrwq_scatter_offset_p_u32 (__ARM_mve_coerce(p0, uint32_t *), __p1, __ARM_mve_coerce(__p2, uint32x4_t), p3), \ - int (*)[__ARM_mve_type_float32_t_ptr][__ARM_mve_type_float32x4_t]: __arm_vstrwq_scatter_offset_p_f32 (__ARM_mve_coerce(p0, float32_t *), __p1, __ARM_mve_coerce(__p2, float32x4_t), p3));}) + _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p2)])0, \ + int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4_t]: __arm_vstrwq_scatter_offset_s32 (__ARM_mve_coerce(__p0, int32_t *), p1, __ARM_mve_coerce(__p2, int32x4_t)), \ + int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vstrwq_scatter_offset_u32 (__ARM_mve_coerce(__p0, uint32_t *), p1, __ARM_mve_coerce(__p2, uint32x4_t)), \ + int (*)[__ARM_mve_type_float32_t_ptr][__ARM_mve_type_float32x4_t]: __arm_vstrwq_scatter_offset_f32 
(__ARM_mve_coerce(__p0, float32_t *), p1, __ARM_mve_coerce(__p2, float32x4_t)));}) -#define __arm_vstrwq_scatter_offset(p0,p1,p2) ({ __typeof(p1) __p1 = (p1); \ +#define __arm_vstrwq_scatter_offset_p(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \ __typeof(p2) __p2 = (p2); \ - _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4_t]: __arm_vstrwq_scatter_offset_s32 (__ARM_mve_coerce(p0, int32_t *), __p1, __ARM_mve_coerce(__p2, int32x4_t)), \ - int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vstrwq_scatter_offset_u32 (__ARM_mve_coerce(p0, uint32_t *), __p1, __ARM_mve_coerce(__p2, uint32x4_t)), \ - int (*)[__ARM_mve_type_float32_t_ptr][__ARM_mve_type_float32x4_t]: __arm_vstrwq_scatter_offset_f32 (__ARM_mve_coerce(p0, float32_t *), __p1, __ARM_mve_coerce(__p2, float32x4_t)));}) + _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p2)])0, \ + int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4_t]: __arm_vstrwq_scatter_offset_p_s32 (__ARM_mve_coerce(__p0, int32_t *), p1, __ARM_mve_coerce(__p2, int32x4_t), p3), \ + int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vstrwq_scatter_offset_p_u32 (__ARM_mve_coerce(__p0, uint32_t *), p1, __ARM_mve_coerce(__p2, uint32x4_t), p3), \ + int (*)[__ARM_mve_type_float32_t_ptr][__ARM_mve_type_float32x4_t]: __arm_vstrwq_scatter_offset_p_f32 (__ARM_mve_coerce(__p0, float32_t *), p1, __ARM_mve_coerce(__p2, float32x4_t), p3));}) #define __arm_vstrwq_scatter_shifted_offset(p0,p1,p2) ({ __typeof(p1) __p1 = (p1); \ __typeof(p2) __p2 = (p2); \ @@ -38422,6 +38408,12 @@ extern void *__ARM_undef; #define __arm_vcmpneq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ + int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vcmpneq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8_t)), \ + int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vcmpneq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16_t)), \ + int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vcmpneq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32_t)), \ + int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vcmpneq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8_t)), \ + int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vcmpneq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16_t)), \ + int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vcmpneq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32_t)), \ int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vcmpneq_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t)), \ int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vcmpneq_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t)), \ int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vcmpneq_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t)), \ @@ -38871,23 +38863,6 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vcmpeqq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16_t)), \ int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vcmpeqq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, 
uint32_t)));}) -#define __arm_vcmpneq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vcmpneq_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t)), \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vcmpneq_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t)), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vcmpneq_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t)), \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]: __arm_vcmpneq_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t)), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vcmpneq_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t)), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vcmpneq_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t)), \ - int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vcmpneq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8_t)), \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vcmpneq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16_t)), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vcmpneq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32_t)), \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vcmpneq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8_t)), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vcmpneq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16_t)), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vcmpneq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32_t)));}) - - #define __arm_vqmovntq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ @@ -39036,22 +39011,6 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vcmpneq_m_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t), p2), \ int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vcmpneq_m_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), p2));}) -#define __arm_vcmpneq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vcmpneq_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t)), \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vcmpneq_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t)), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vcmpneq_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t)), \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]: __arm_vcmpneq_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t)), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vcmpneq_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t)), \ - int 
(*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vcmpneq_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t)), \ - int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vcmpneq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8_t)), \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vcmpneq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16_t)), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vcmpneq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32_t)), \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vcmpneq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8_t)), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vcmpneq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16_t)), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vcmpneq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32_t)));}) - #define __arm_vshlcq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \ int (*)[__ARM_mve_type_int8x16_t]: __arm_vshlcq_s8 (__ARM_mve_coerce(__p0, int8x16_t), p1, p2), \ @@ -39367,52 +39326,6 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int16x8_t]: __arm_vminaq_m_s16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, int16x8_t), p2), \ int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int32x4_t]: __arm_vminaq_m_s32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, int32x4_t), p2));}) -#define __arm_vrmlaldavhaq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vcmpltq_m_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t), p2), \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vcmpltq_m_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), p2), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vcmpltq_m_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), p2), \ - int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vcmpltq_m_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8_t), p2), \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vcmpltq_m_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16_t), p2), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vcmpltq_m_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32_t), p2));}) - -#define __arm_vmlsdavxq_p(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vcmpleq_m_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t), p2), \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vcmpleq_m_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), p2), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vcmpleq_m_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), p2), \ - int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vcmpleq_m_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), 
__ARM_mve_coerce(__p1, int8_t), p2), \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vcmpleq_m_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16_t), p2), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vcmpleq_m_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32_t), p2));}) - -#define __arm_vmlsdavq_p(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vcmpgtq_m_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t), p2), \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vcmpgtq_m_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), p2), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vcmpgtq_m_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), p2), \ - int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vcmpgtq_m_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8_t), p2), \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vcmpgtq_m_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16_t), p2), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vcmpgtq_m_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32_t), p2));}) - -#define __arm_vmlsdavaxq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int16x8_t]: __arm_vshrntq_n_s16 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int16x8_t), p2), \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int32x4_t]: __arm_vshrntq_n_s32 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int32x4_t), p2), \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint16x8_t]: __arm_vshrntq_n_u16 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint16x8_t), p2), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint32x4_t]: __arm_vshrntq_n_u32 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint32x4_t), p2));}) - -#define __arm_vmlsdavaq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int16x8_t]: __arm_vrshrntq_n_s16 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int16x8_t), p2), \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int32x4_t]: __arm_vrshrntq_n_s32 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int32x4_t), p2), \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint16x8_t]: __arm_vrshrntq_n_u16 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint16x8_t), p2), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint32x4_t]: __arm_vrshrntq_n_u32 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint32x4_t), p2));}) - #define __arm_vmovlbq_m(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ @@ -39711,26 +39624,6 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vmulq_m_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t), 
__ARM_mve_coerce(__p2, uint16x8_t), p3), \ int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vmulq_m_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t), p3));}) -#define __arm_vstrbq(p0,p1) ({ __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int8x16_t]: __arm_vstrbq_s8 (__ARM_mve_coerce(p0, int8_t *), __ARM_mve_coerce(__p1, int8x16_t)), \ - int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int16x8_t]: __arm_vstrbq_s16 (__ARM_mve_coerce(p0, int8_t *), __ARM_mve_coerce(__p1, int16x8_t)), \ - int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int32x4_t]: __arm_vstrbq_s32 (__ARM_mve_coerce(p0, int8_t *), __ARM_mve_coerce(__p1, int32x4_t)), \ - int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vstrbq_u8 (__ARM_mve_coerce(p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16_t)), \ - int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vstrbq_u16 (__ARM_mve_coerce(p0, uint8_t *), __ARM_mve_coerce(__p1, uint16x8_t)), \ - int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vstrbq_u32 (__ARM_mve_coerce(p0, uint8_t *), __ARM_mve_coerce(__p1, uint32x4_t)));}) - -#define __arm_vstrbq_scatter_offset(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - __typeof(p2) __p2 = (p2); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint8x16_t][__ARM_mve_type_int8x16_t]: __arm_vstrbq_scatter_offset_s8 (__ARM_mve_coerce(__p0, int8_t *), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, int8x16_t)), \ - int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint16x8_t][__ARM_mve_type_int16x8_t]: __arm_vstrbq_scatter_offset_s16 (__ARM_mve_coerce(__p0, int8_t *), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, int16x8_t)), \ - int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint32x4_t][__ARM_mve_type_int32x4_t]: __arm_vstrbq_scatter_offset_s32 (__ARM_mve_coerce(__p0, int8_t *), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, int32x4_t)), \ - int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]: __arm_vstrbq_scatter_offset_u8 (__ARM_mve_coerce(__p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8x16_t)), \ - int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vstrbq_scatter_offset_u16 (__ARM_mve_coerce(__p0, uint8_t *), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16x8_t)), \ - int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vstrbq_scatter_offset_u32 (__ARM_mve_coerce(__p0, uint8_t *), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t)));}) - #define __arm_vstrwq_scatter_base(p0,p1,p2) ({ __typeof(p2) __p2 = (p2); \ _Generic( (int (*)[__ARM_mve_typeid(__p2)])0, \ int (*)[__ARM_mve_type_int32x4_t]: __arm_vstrwq_scatter_base_s32(p0, p1, __ARM_mve_coerce(__p2, int32x4_t)), \ @@ -39745,27 +39638,6 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vldrbq_gather_offset_u16 (__ARM_mve_coerce(p0, uint8_t const *), __ARM_mve_coerce(__p1, uint16x8_t)), \ int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vldrbq_gather_offset_u32 (__ARM_mve_coerce(p0, uint8_t 
const *), __ARM_mve_coerce(__p1, uint32x4_t)));}) -#define __arm_vstrbq_p(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int8x16_t]: __arm_vstrbq_p_s8 (__ARM_mve_coerce(__p0, int8_t *), __ARM_mve_coerce(__p1, int8x16_t), p2), \ - int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int16x8_t]: __arm_vstrbq_p_s16 (__ARM_mve_coerce(__p0, int8_t *), __ARM_mve_coerce(__p1, int16x8_t), p2), \ - int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int32x4_t]: __arm_vstrbq_p_s32 (__ARM_mve_coerce(__p0, int8_t *), __ARM_mve_coerce(__p1, int32x4_t), p2), \ - int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vstrbq_p_u8 (__ARM_mve_coerce(__p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16_t), p2), \ - int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vstrbq_p_u16 (__ARM_mve_coerce(__p0, uint8_t *), __ARM_mve_coerce(__p1, uint16x8_t), p2), \ - int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vstrbq_p_u32 (__ARM_mve_coerce(__p0, uint8_t *), __ARM_mve_coerce(__p1, uint32x4_t), p2));}) - -#define __arm_vstrbq_scatter_offset_p(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - __typeof(p2) __p2 = (p2); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint8x16_t][__ARM_mve_type_int8x16_t]: __arm_vstrbq_scatter_offset_p_s8 (__ARM_mve_coerce(__p0, int8_t *), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, int8x16_t), p3), \ - int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint16x8_t][__ARM_mve_type_int16x8_t]: __arm_vstrbq_scatter_offset_p_s16 (__ARM_mve_coerce(__p0, int8_t *), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, int16x8_t), p3), \ - int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint32x4_t][__ARM_mve_type_int32x4_t]: __arm_vstrbq_scatter_offset_p_s32 (__ARM_mve_coerce(__p0, int8_t *), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3), \ - int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]: __arm_vstrbq_scatter_offset_p_u8 (__ARM_mve_coerce(__p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8x16_t), p3), \ - int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vstrbq_scatter_offset_p_u16 (__ARM_mve_coerce(__p0, uint8_t *), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16x8_t), p3), \ - int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vstrbq_scatter_offset_p_u32 (__ARM_mve_coerce(__p0, uint8_t *), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t), p3));}) - #define __arm_vstrwq_scatter_base_p(p0,p1,p2,p3) ({ __typeof(p2) __p2 = (p2); \ _Generic( (int (*)[__ARM_mve_typeid(__p2)])0, \ int (*)[__ARM_mve_type_int32x4_t]: __arm_vstrwq_scatter_base_p_s32 (p0, p1, __ARM_mve_coerce(__p2, int32x4_t), p3), \ @@ -39921,34 +39793,6 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_int64x2_t]: __arm_vstrdq_scatter_base_s64 (p0, p1, __ARM_mve_coerce(__p2, int64x2_t)), \ int (*)[__ARM_mve_type_uint64x2_t]: __arm_vstrdq_scatter_base_u64 (p0, p1, __ARM_mve_coerce(__p2, uint64x2_t)));}) -#define __arm_vstrdq_scatter_offset_p(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - __typeof(p2) __p2 = (p2); \ - _Generic( 
(int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_int64_t_ptr][__ARM_mve_type_int64x2_t]: __arm_vstrdq_scatter_offset_p_s64 (__ARM_mve_coerce(__p0, int64_t *), __p1, __ARM_mve_coerce(__p2, int64x2_t), p3), \ - int (*)[__ARM_mve_type_uint64_t_ptr][__ARM_mve_type_uint64x2_t]: __arm_vstrdq_scatter_offset_p_u64 (__ARM_mve_coerce(__p0, uint64_t *), __p1, __ARM_mve_coerce(__p2, uint64x2_t), p3));}) - -#define __arm_vstrdq_scatter_offset(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - __typeof(p2) __p2 = (p2); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_int64_t_ptr][__ARM_mve_type_int64x2_t]: __arm_vstrdq_scatter_offset_s64 (__ARM_mve_coerce(__p0, int64_t *), __p1, __ARM_mve_coerce(__p2, int64x2_t)), \ - int (*)[__ARM_mve_type_uint64_t_ptr][__ARM_mve_type_uint64x2_t]: __arm_vstrdq_scatter_offset_u64 (__ARM_mve_coerce(__p0, uint64_t *), __p1, __ARM_mve_coerce(__p2, uint64x2_t)));}) - -#define __arm_vstrdq_scatter_shifted_offset_p(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - __typeof(p2) __p2 = (p2); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_int64_t_ptr][__ARM_mve_type_int64x2_t]: __arm_vstrdq_scatter_shifted_offset_p_s64 (__ARM_mve_coerce(__p0, int64_t *), __p1, __ARM_mve_coerce(__p2, int64x2_t), p3), \ - int (*)[__ARM_mve_type_uint64_t_ptr][__ARM_mve_type_uint64x2_t]: __arm_vstrdq_scatter_shifted_offset_p_u64 (__ARM_mve_coerce(__p0, uint64_t *), __p1, __ARM_mve_coerce(__p2, uint64x2_t), p3));}) - -#define __arm_vstrdq_scatter_shifted_offset(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - __typeof(p2) __p2 = (p2); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_int64_t_ptr][__ARM_mve_type_int64x2_t]: __arm_vstrdq_scatter_shifted_offset_s64 (__ARM_mve_coerce(__p0, int64_t *), __p1, __ARM_mve_coerce(__p2, int64x2_t)), \ - int (*)[__ARM_mve_type_uint64_t_ptr][__ARM_mve_type_uint64x2_t]: __arm_vstrdq_scatter_shifted_offset_u64 (__ARM_mve_coerce(__p0, uint64_t *), __p1, __ARM_mve_coerce(__p2, uint64x2_t)));}) - #define __arm_vstrhq_scatter_offset(p0,p1,p2) ({ __typeof(p1) __p1 = (p1); \ __typeof(p2) __p2 = (p2); \ _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ @@ -39981,29 +39825,17 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint16_t_ptr][__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vstrhq_scatter_shifted_offset_p_u16 (__ARM_mve_coerce(p0, uint16_t *), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16x8_t), p3), \ int (*)[__ARM_mve_type_uint16_t_ptr][__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vstrhq_scatter_shifted_offset_p_u32 (__ARM_mve_coerce(p0, uint16_t *), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t), p3));}) -#define __arm_vstrwq_scatter_offset(p0,p1,p2) ({ __typeof(p1) __p1 = (p1); \ - __typeof(p2) __p2 = (p2); \ - _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4_t]: __arm_vstrwq_scatter_offset_s32 (__ARM_mve_coerce(p0, int32_t *), __p1, __ARM_mve_coerce(__p2, int32x4_t)), \ - int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vstrwq_scatter_offset_u32 (__ARM_mve_coerce(p0, uint32_t *), __p1, __ARM_mve_coerce(__p2, uint32x4_t)));}) - -#define 
__arm_vstrwq_scatter_offset_p(p0,p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \ - __typeof(p2) __p2 = (p2); \ - _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4_t]: __arm_vstrwq_scatter_offset_p_s32 (__ARM_mve_coerce(p0, int32_t *), __p1, __ARM_mve_coerce(__p2, int32x4_t), p3), \ - int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vstrwq_scatter_offset_p_u32 (__ARM_mve_coerce(p0, uint32_t *), __p1, __ARM_mve_coerce(__p2, uint32x4_t)));}) - -#define __arm_vstrwq_scatter_offset_p(p0,p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \ +#define __arm_vstrwq_scatter_offset(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p2) __p2 = (p2); \ - _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4_t]: __arm_vstrwq_scatter_offset_p_s32 (__ARM_mve_coerce(p0, int32_t *), __p1, __ARM_mve_coerce(__p2, int32x4_t), p3), \ - int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vstrwq_scatter_offset_p_u32 (__ARM_mve_coerce(p0, uint32_t *), __p1, __ARM_mve_coerce(__p2, uint32x4_t), p3));}) + _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p2)])0, \ + int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4_t]: __arm_vstrwq_scatter_offset_s32 (__ARM_mve_coerce(__p0, int32_t *), p1, __ARM_mve_coerce(__p2, int32x4_t)), \ + int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vstrwq_scatter_offset_u32 (__ARM_mve_coerce(__p0, uint32_t *), p1, __ARM_mve_coerce(__p2, uint32x4_t)));}) -#define __arm_vstrwq_scatter_offset(p0,p1,p2) ({ __typeof(p1) __p1 = (p1); \ +#define __arm_vstrwq_scatter_offset_p(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \ __typeof(p2) __p2 = (p2); \ - _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4_t]: __arm_vstrwq_scatter_offset_s32 (__ARM_mve_coerce(p0, int32_t *), __p1, __ARM_mve_coerce(__p2, int32x4_t)), \ - int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vstrwq_scatter_offset_u32 (__ARM_mve_coerce(p0, uint32_t *), __p1, __ARM_mve_coerce(__p2, uint32x4_t)));}) + _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p2)])0, \ + int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4_t]: __arm_vstrwq_scatter_offset_p_s32 (__ARM_mve_coerce(__p0, int32_t *), p1, __ARM_mve_coerce(__p2, int32x4_t), p3), \ + int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vstrwq_scatter_offset_p_u32 (__ARM_mve_coerce(__p0, uint32_t *), p1, __ARM_mve_coerce(__p2, uint32x4_t), p3));}) #define __arm_vstrwq_scatter_shifted_offset(p0,p1,p2) ({ __typeof(p1) __p1 = (p1); \ __typeof(p2) __p2 = (p2); \ @@ -40160,32 +39992,6 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_veorq_x_u16 (__ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16x8_t), p3), \ int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_veorq_x_u32 (__ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t), p3));}) -#define __arm_vmaxq_x(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \ - __typeof(p2) __p2 = (p2); \ - _Generic( (int (*)[__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vmulq_x_s8 (__ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8x16_t), p3), \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vmulq_x_s16 
(__ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16x8_t), p3), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vmulq_x_s32 (__ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3), \ - int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vmulq_x_n_s8 (__ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8_t), p3), \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vmulq_x_n_s16 (__ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16_t), p3), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vmulq_x_n_s32 (__ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32_t), p3), \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]: __arm_vmulq_x_u8 (__ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8x16_t), p3), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vmulq_x_u16 (__ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16x8_t), p3), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vmulq_x_u32 (__ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t), p3), \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vmulq_x_n_u8 (__ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8_t), p3), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vmulq_x_n_u16 (__ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16_t), p3), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vmulq_x_n_u32 (__ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32_t), p3));}) - -#define __arm_vminq_x(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \ - __typeof(p2) __p2 = (p2); \ - _Generic( (int (*)[__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vminq_x_s8 (__ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8x16_t), p3), \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vminq_x_s16 (__ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16x8_t), p3), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vminq_x_s32 (__ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3), \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]: __arm_vminq_x_u8 (__ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8x16_t), p3), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vminq_x_u16 (__ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16x8_t), p3), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vminq_x_u32 (__ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t), p3));}) - #define __arm_vmovlbq_x(p1,p2) ({ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p1)])0, \ int (*)[__ARM_mve_type_int8x16_t]: __arm_vmovlbq_x_s8 (__ARM_mve_coerce(__p1, int8x16_t), p2), \ @@ -41013,13 +40819,6 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vmlsldavaxq_p_s16(__p0, __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16x8_t), p3), \ int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vmlsldavaxq_p_s32(__p0, __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3));}) -#define __arm_vrmlaldavhaq_p(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - __typeof(p2) __p2 = (p2); \ - _Generic( (int 
(*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vrmlaldavhaq_p_s32 (__ARM_mve_coerce(__p0, int64_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vrmlaldavhaq_p_u32 (__ARM_mve_coerce(__p0, uint64_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t), p3));}) - #define __arm_vrmlaldavhaxq_p(p0,p1,p2,p3) __arm_vrmlaldavhaxq_p_s32(p0,p1,p2,p3) #define __arm_vrmlsldavhaq_p(p0,p1,p2,p3) __arm_vrmlsldavhaq_p_s32(p0,p1,p2,p3) @@ -41343,21 +41142,47 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vqrdmladhxq_m_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16x8_t), p3), \ int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vqrdmladhxq_m_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3));}) -#define __arm_vmlsdavaxq_p(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ +#define __arm_vmlsdavaxq_p(p0,p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \ + __typeof(p2) __p2 = (p2); \ + _Generic( (int (*)[__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ + int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vmlsdavaxq_p_s8 (p0, __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8x16_t), p3), \ + int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vmlsdavaxq_p_s16 (p0, __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16x8_t), p3), \ + int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vmlsdavaxq_p_s32 (p0, __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3));}) + +#define __arm_vmlsdavaq(p0,p1,p2) ({ __typeof(p1) __p1 = (p1); \ __typeof(p2) __p2 = (p2); \ _Generic( (int (*)[__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vmlsdavaxq_p_s8 (__p0, __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8x16_t), p3), \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vmlsdavaxq_p_s16 (__p0, __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16x8_t), p3), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vmlsdavaxq_p_s32 (__p0, __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3));}) + int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vmlsdavaq_s8(p0, __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8x16_t)), \ + int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vmlsdavaq_s16(p0, __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16x8_t)), \ + int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vmlsdavaq_s32(p0, __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t)));}) + +#define __arm_vmlsdavaxq(p0,p1,p2) ({ __typeof(p2) __p2 = (p2); \ + __typeof(p1) __p1 = (p1); \ + _Generic( (int (*)[__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ + int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vmlsdavaxq_s8(p0, __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8x16_t)), \ + int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vmlsdavaxq_s16(p0, 
__ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16x8_t)), \ + int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vmlsdavaxq_s32(p0, __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t)));}) + +#define __arm_vmlsdavq_p(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ + __typeof(p1) __p1 = (p1); \ + _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ + int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vmlsdavq_p_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t), p2), \ + int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vmlsdavq_p_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), p2), \ + int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vmlsdavq_p_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), p2));}) -#define __arm_vmlsdavaq_p(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \ +#define __arm_vmlsdavxq_p(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ + _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ + int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vmlsdavxq_p_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t), p2), \ + int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vmlsdavxq_p_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), p2), \ + int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vmlsdavxq_p_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), p2));}) + +#define __arm_vmlsdavaq_p(p0,p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \ __typeof(p2) __p2 = (p2); \ _Generic( (int (*)[__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vmlsdavaq_p_s8(__p0, __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8x16_t), p3), \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vmlsdavaq_p_s16(__p0, __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16x8_t), p3), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vmlsdavaq_p_s32(__p0, __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3));}) + int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vmlsdavaq_p_s8(p0, __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8x16_t), p3), \ + int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vmlsdavaq_p_s16(p0, __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16x8_t), p3), \ + int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vmlsdavaq_p_s32(p0, __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3));}) #define __arm_vmladavaxq_p(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -41445,8 +41270,8 @@ extern void *__ARM_undef; #define __arm_viwdupq_u16(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \ - int (*)[__ARM_mve_type_int_n]: __arm_viwdupq_n_u16 (__ARM_mve_coerce(__p0, uint32_t), p1, p2), \ - int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_viwdupq_wb_u16 (__ARM_mve_coerce(__p0, uint32_t *), p1, p2));}) + int (*)[__ARM_mve_type_int_n]: __arm_viwdupq_n_u16 (__ARM_mve_coerce(__p0, uint32_t), p1, (const int) p2), \ + int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_viwdupq_wb_u16 (__ARM_mve_coerce(__p0, uint32_t *), p1, (const int) p2));}) #define __arm_viwdupq_u32(p0,p1,p2) ({ 
__typeof(p0) __p0 = (p0); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \ @@ -41628,16 +41453,6 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int16x8_t]: __arm_vmaxavq_p_s16 (__p0, __ARM_mve_coerce(__p1, int16x8_t), p2), \ int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t]: __arm_vmaxavq_p_s32 (__p0, __ARM_mve_coerce(__p1, int32x4_t), p2));}) -#define __arm_vmaxq_x(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \ - __typeof(p2) __p2 = (p2); \ - _Generic( (int (*)[__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vmaxq_x_s8 (__ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8x16_t), p3), \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vmaxq_x_s16 (__ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16x8_t), p3), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vmaxq_x_s32 (__ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3), \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]: __arm_vmaxq_x_u8( __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8x16_t), p3), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vmaxq_x_u16( __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16x8_t), p3), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vmaxq_x_u32( __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t), p3));}) - #define __arm_vmaxvq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ @@ -41672,6 +41487,16 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int16x8_t]: __arm_vminavq_p_s16 (__p0, __ARM_mve_coerce(__p1, int16x8_t), p2), \ int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t]: __arm_vminavq_p_s32 (__p0, __ARM_mve_coerce(__p1, int32x4_t), p2));}) +#define __arm_vmaxq_x(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \ + __typeof(p2) __p2 = (p2); \ + _Generic( (int (*)[__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ + int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vmaxq_x_s8 (__ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8x16_t), p3), \ + int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vmaxq_x_s16 (__ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16x8_t), p3), \ + int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vmaxq_x_s32 (__ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3), \ + int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]: __arm_vmaxq_x_u8 (__ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8x16_t), p3), \ + int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vmaxq_x_u16 (__ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16x8_t), p3), \ + int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vmaxq_x_u32 (__ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t), p3));}) + #define __arm_vminq_x(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \ __typeof(p2) __p2 = (p2); \ _Generic( (int (*)[__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ @@ -41810,22 +41635,6 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vmlaldavxq_p_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), p2), \ int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: 
__arm_vmlaldavxq_p_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), p2));}) -#define __arm_vmlsdavaq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - __typeof(p2) __p2 = (p2); \ - _Generic( (int (*)[__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vmlsdavaq_s8(__p0, __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8x16_t)), \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vmlsdavaq_s16(__p0, __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16x8_t)), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vmlsdavaq_s32(__p0, __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t)));}) - -#define __arm_vmlsdavaxq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - __typeof(p2) __p2 = (p2); \ - _Generic( (int (*)[__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vmlsdavaxq_s8(__p0, __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8x16_t)), \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vmlsdavaxq_s16(__p0, __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16x8_t)), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vmlsdavaxq_s32(__p0, __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t)));}) - #define __arm_vmlsdavq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ @@ -41833,13 +41642,6 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vmlsdavq_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t)), \ int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vmlsdavq_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t)));}) -#define __arm_vmlsdavq_p(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vmlsdavq_p_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t), p2), \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vmlsdavq_p_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), p2), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vmlsdavq_p_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), p2));}) - #define __arm_vmlsdavxq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ @@ -41847,13 +41649,6 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vmlsdavxq_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t)), \ int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vmlsdavxq_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t)));}) -#define __arm_vmlsdavxq_p(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vmlsdavxq_p_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t), p2), \ - int 
(*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vmlsdavxq_p_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), p2), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vmlsdavxq_p_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), p2));}) - #define __arm_vmlsldavaq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ __typeof(p2) __p2 = (p2); \ @@ -41948,13 +41743,6 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]: __arm_vmulltq_poly_x_p8 (__ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8x16_t), p3), \ int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vmulltq_poly_x_p16 (__ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16x8_t), p3));}) -#define __arm_vrmlaldavhaq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - __typeof(p2) __p2 = (p2); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vrmlaldavhaq_s32 (__ARM_mve_coerce(__p0, int64_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t)), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vrmlaldavhaq_u32 (__ARM_mve_coerce(__p0, uint64_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t)));}) - #define __arm_vrmlaldavhaxq(p0,p1,p2) __arm_vrmlaldavhaxq_s32(p0,p1,p2) #define __arm_vrmlaldavhq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ @@ -41994,35 +41782,15 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vstrbq_u16 (__ARM_mve_coerce(p0, uint8_t *), __ARM_mve_coerce(__p1, uint16x8_t)), \ int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vstrbq_u32 (__ARM_mve_coerce(p0, uint8_t *), __ARM_mve_coerce(__p1, uint32x4_t)));}) -#define __arm_vstrbq_p(p0,p1,p2) ({ __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int8x16_t]: __arm_vstrbq_p_s8 (__ARM_mve_coerce(p0, int8_t *), __ARM_mve_coerce(__p1, int8x16_t), p2), \ - int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int16x8_t]: __arm_vstrbq_p_s16 (__ARM_mve_coerce(p0, int8_t *), __ARM_mve_coerce(__p1, int16x8_t), p2), \ - int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int32x4_t]: __arm_vstrbq_p_s32 (__ARM_mve_coerce(p0, int8_t *), __ARM_mve_coerce(__p1, int32x4_t), p2), \ - int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vstrbq_p_u8 (__ARM_mve_coerce(p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16_t), p2), \ - int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vstrbq_p_u16 (__ARM_mve_coerce(p0, uint8_t *), __ARM_mve_coerce(__p1, uint16x8_t), p2), \ - int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vstrbq_p_u32 (__ARM_mve_coerce(p0, uint8_t *), __ARM_mve_coerce(__p1, uint32x4_t), p2));}) - -#define __arm_vstrbq_scatter_offset(p0,p1,p2) ({ __typeof(p1) __p1 = (p1); \ - __typeof(p2) __p2 = (p2); \ - _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint8x16_t][__ARM_mve_type_int8x16_t]: __arm_vstrbq_scatter_offset_s8 (__ARM_mve_coerce(p0, int8_t *), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, int8x16_t)), \ - int 
(*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint16x8_t][__ARM_mve_type_int16x8_t]: __arm_vstrbq_scatter_offset_s16 (__ARM_mve_coerce(p0, int8_t *), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, int16x8_t)), \ - int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint32x4_t][__ARM_mve_type_int32x4_t]: __arm_vstrbq_scatter_offset_s32 (__ARM_mve_coerce(p0, int8_t *), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, int32x4_t)), \ - int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]: __arm_vstrbq_scatter_offset_u8 (__ARM_mve_coerce(p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8x16_t)), \ - int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vstrbq_scatter_offset_u16 (__ARM_mve_coerce(p0, uint8_t *), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16x8_t)), \ - int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vstrbq_scatter_offset_u32 (__ARM_mve_coerce(p0, uint8_t *), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t)));}) - - -#define __arm_vstrbq_scatter_offset_p(p0,p1,p2,p3) ({__typeof(p1) __p1 = (p1); \ - __typeof(p2) __p2 = (p2); \ - _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint8x16_t][__ARM_mve_type_int8x16_t]: __arm_vstrbq_scatter_offset_p_s8 (__ARM_mve_coerce(p0, int8_t *), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, int8x16_t), p3), \ - int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint16x8_t][__ARM_mve_type_int16x8_t]: __arm_vstrbq_scatter_offset_p_s16 (__ARM_mve_coerce(p0, int8_t *), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, int16x8_t), p3), \ - int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint32x4_t][__ARM_mve_type_int32x4_t]: __arm_vstrbq_scatter_offset_p_s32 (__ARM_mve_coerce(p0, int8_t *), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3), \ - int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]: __arm_vstrbq_scatter_offset_p_u8 (__ARM_mve_coerce(p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8x16_t), p3), \ - int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vstrbq_scatter_offset_p_u16 (__ARM_mve_coerce(p0, uint8_t *), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16x8_t), p3), \ - int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vstrbq_scatter_offset_p_u32 (__ARM_mve_coerce(p0, uint8_t *), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t), p3));}) +#define __arm_vstrbq_p(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ + __typeof(p1) __p1 = (p1); \ + _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ + int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int8x16_t]: __arm_vstrbq_p_s8 (__ARM_mve_coerce(__p0, int8_t *), __ARM_mve_coerce(__p1, int8x16_t), p2), \ + int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int16x8_t]: __arm_vstrbq_p_s16 (__ARM_mve_coerce(__p0, int8_t *), __ARM_mve_coerce(__p1, int16x8_t), p2), \ + int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int32x4_t]: __arm_vstrbq_p_s32 (__ARM_mve_coerce(__p0, int8_t *), __ARM_mve_coerce(__p1, int32x4_t), p2), \ + int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vstrbq_p_u8 (__ARM_mve_coerce(__p0, 
uint8_t *), __ARM_mve_coerce(__p1, uint8x16_t), p2), \ + int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vstrbq_p_u16 (__ARM_mve_coerce(__p0, uint8_t *), __ARM_mve_coerce(__p1, uint16x8_t), p2), \ + int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vstrbq_p_u32 (__ARM_mve_coerce(__p0, uint8_t *), __ARM_mve_coerce(__p1, uint32x4_t), p2));}) #define __arm_vstrdq_scatter_base(p0,p1,p2) ({ __typeof(p2) __p2 = (p2); \ _Generic( (int (*)[__ARM_mve_typeid(__p2)])0, \ @@ -42034,29 +41802,65 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_int64x2_t]: __arm_vstrdq_scatter_base_p_s64 (p0, p1, __ARM_mve_coerce(__p2, int64x2_t), p3), \ int (*)[__ARM_mve_type_uint64x2_t]: __arm_vstrdq_scatter_base_p_u64 (p0, p1, __ARM_mve_coerce(__p2, uint64x2_t), p3));}) -#define __arm_vstrdq_scatter_offset(p0,p1,p2) ({ __typeof(p1) __p1 = (p1); \ +#define __arm_vrmlaldavhaq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ + __typeof(p1) __p1 = (p1); \ __typeof(p2) __p2 = (p2); \ - _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_int64_t_ptr][__ARM_mve_type_int64x2_t]: __arm_vstrdq_scatter_offset_s64 (__ARM_mve_coerce(p0, int64_t *), __p1, __ARM_mve_coerce(__p2, int64x2_t)), \ - int (*)[__ARM_mve_type_uint64_t_ptr][__ARM_mve_type_uint64x2_t]: __arm_vstrdq_scatter_offset_u64 (__ARM_mve_coerce(p0, uint64_t *), __p1, __ARM_mve_coerce(__p2, uint64x2_t)));}) + _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vrmlaldavhaq_s32 (__ARM_mve_coerce(__p0, int64_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t)), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vrmlaldavhaq_u32 (__ARM_mve_coerce(__p0, uint64_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t)));}) -#define __arm_vstrdq_scatter_offset_p(p0,p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \ +#define __arm_vrmlaldavhaq_p(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \ + __typeof(p1) __p1 = (p1); \ __typeof(p2) __p2 = (p2); \ - _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_int64_t_ptr][__ARM_mve_type_int64x2_t]: __arm_vstrdq_scatter_offset_p_s64 (__ARM_mve_coerce(p0, int64_t *), __p1, __ARM_mve_coerce(__p2, int64x2_t), p3), \ - int (*)[__ARM_mve_type_uint64_t_ptr][__ARM_mve_type_uint64x2_t]: __arm_vstrdq_scatter_offset_p_u64 (__ARM_mve_coerce(p0, uint64_t *), __p1, __ARM_mve_coerce(__p2, uint64x2_t), p3));}) + _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vrmlaldavhaq_p_s32 (__ARM_mve_coerce(__p0, int64_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vrmlaldavhaq_p_u32 (__ARM_mve_coerce(__p0, uint64_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t), p3));}) -#define __arm_vstrdq_scatter_shifted_offset(p0,p1,p2) ({ __typeof(p1) __p1 = (p1); \ +#define __arm_vstrbq_scatter_offset(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ + __typeof(p1) __p1 = (p1); \ __typeof(p2) __p2 = (p2); \ - _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_int64_t_ptr][__ARM_mve_type_int64x2_t]: 
__arm_vstrdq_scatter_shifted_offset_s64 (__ARM_mve_coerce(p0, int64_t *), __p1, __ARM_mve_coerce(__p2, int64x2_t)), \ - int (*)[__ARM_mve_type_uint64_t_ptr][__ARM_mve_type_uint64x2_t]: __arm_vstrdq_scatter_shifted_offset_u64 (__ARM_mve_coerce(p0, uint64_t *), __p1, __ARM_mve_coerce(__p2, uint64x2_t)));}) + _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ + int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint8x16_t][__ARM_mve_type_int8x16_t]: __arm_vstrbq_scatter_offset_s8 (__ARM_mve_coerce(__p0, int8_t *), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, int8x16_t)), \ + int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint16x8_t][__ARM_mve_type_int16x8_t]: __arm_vstrbq_scatter_offset_s16 (__ARM_mve_coerce(__p0, int8_t *), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, int16x8_t)), \ + int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint32x4_t][__ARM_mve_type_int32x4_t]: __arm_vstrbq_scatter_offset_s32 (__ARM_mve_coerce(__p0, int8_t *), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, int32x4_t)), \ + int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]: __arm_vstrbq_scatter_offset_u8 (__ARM_mve_coerce(__p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8x16_t)), \ + int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vstrbq_scatter_offset_u16 (__ARM_mve_coerce(__p0, uint8_t *), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16x8_t)), \ + int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vstrbq_scatter_offset_u32 (__ARM_mve_coerce(__p0, uint8_t *), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t)));}) -#define __arm_vstrdq_scatter_shifted_offset_p(p0,p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \ +#define __arm_vstrbq_scatter_offset_p(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \ + __typeof(p1) __p1 = (p1); \ __typeof(p2) __p2 = (p2); \ - _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_int64_t_ptr][__ARM_mve_type_int64x2_t]: __arm_vstrdq_scatter_shifted_offset_p_s64 (__ARM_mve_coerce(p0, int64_t *), __p1, __ARM_mve_coerce(__p2, int64x2_t), p3), \ - int (*)[__ARM_mve_type_uint64_t_ptr][__ARM_mve_type_uint64x2_t]: __arm_vstrdq_scatter_shifted_offset_p_u64 (__ARM_mve_coerce(p0, uint64_t *), __p1, __ARM_mve_coerce(__p2, uint64x2_t), p3));}) + _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ + int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint8x16_t][__ARM_mve_type_int8x16_t]: __arm_vstrbq_scatter_offset_p_s8 (__ARM_mve_coerce(__p0, int8_t *), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, int8x16_t), p3), \ + int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint16x8_t][__ARM_mve_type_int16x8_t]: __arm_vstrbq_scatter_offset_p_s16 (__ARM_mve_coerce(__p0, int8_t *), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, int16x8_t), p3), \ + int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint32x4_t][__ARM_mve_type_int32x4_t]: __arm_vstrbq_scatter_offset_p_s32 (__ARM_mve_coerce(__p0, int8_t *), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3), \ + int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]: __arm_vstrbq_scatter_offset_p_u8 (__ARM_mve_coerce(__p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8x16_t), p3), \ + int 
(*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vstrbq_scatter_offset_p_u16 (__ARM_mve_coerce(__p0, uint8_t *), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16x8_t), p3), \ + int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vstrbq_scatter_offset_p_u32 (__ARM_mve_coerce(__p0, uint8_t *), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t), p3));}) + +#define __arm_vstrdq_scatter_offset_p(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \ + __typeof(p2) __p2 = (p2); \ + _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p2)])0, \ + int (*)[__ARM_mve_type_int64_t_ptr][__ARM_mve_type_int64x2_t]: __arm_vstrdq_scatter_offset_p_s64 (__ARM_mve_coerce(__p0, int64_t *), p1, __ARM_mve_coerce(__p2, int64x2_t), p3), \ + int (*)[__ARM_mve_type_uint64_t_ptr][__ARM_mve_type_uint64x2_t]: __arm_vstrdq_scatter_offset_p_u64 (__ARM_mve_coerce(__p0, uint64_t *), p1, __ARM_mve_coerce(__p2, uint64x2_t), p3));}) + +#define __arm_vstrdq_scatter_offset(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ + __typeof(p2) __p2 = (p2); \ + _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p2)])0, \ + int (*)[__ARM_mve_type_int64_t_ptr][__ARM_mve_type_int64x2_t]: __arm_vstrdq_scatter_offset_s64 (__ARM_mve_coerce(__p0, int64_t *), p1, __ARM_mve_coerce(__p2, int64x2_t)), \ + int (*)[__ARM_mve_type_uint64_t_ptr][__ARM_mve_type_uint64x2_t]: __arm_vstrdq_scatter_offset_u64 (__ARM_mve_coerce(__p0, uint64_t *), p1, __ARM_mve_coerce(__p2, uint64x2_t)));}) + +#define __arm_vstrdq_scatter_shifted_offset_p(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \ + __typeof(p2) __p2 = (p2); \ + _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p2)])0, \ + int (*)[__ARM_mve_type_int64_t_ptr][__ARM_mve_type_int64x2_t]: __arm_vstrdq_scatter_shifted_offset_p_s64 (__ARM_mve_coerce(__p0, int64_t *), p1, __ARM_mve_coerce(__p2, int64x2_t), p3), \ + int (*)[__ARM_mve_type_uint64_t_ptr][__ARM_mve_type_uint64x2_t]: __arm_vstrdq_scatter_shifted_offset_p_u64 (__ARM_mve_coerce(__p0, uint64_t *), p1, __ARM_mve_coerce(__p2, uint64x2_t), p3));}) + +#define __arm_vstrdq_scatter_shifted_offset(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ + __typeof(p2) __p2 = (p2); \ + _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p2)])0, \ + int (*)[__ARM_mve_type_int64_t_ptr][__ARM_mve_type_int64x2_t]: __arm_vstrdq_scatter_shifted_offset_s64 (__ARM_mve_coerce(__p0, int64_t *), p1, __ARM_mve_coerce(__p2, int64x2_t)), \ + int (*)[__ARM_mve_type_uint64_t_ptr][__ARM_mve_type_uint64x2_t]: __arm_vstrdq_scatter_shifted_offset_u64 (__ARM_mve_coerce(__p0, uint64_t *), p1, __ARM_mve_coerce(__p2, uint64x2_t)));}) #endif /* __cplusplus */ #endif /* __ARM_FEATURE_MVE */ -- cgit v1.1 From 28de75d27685b2735612d264bb96f39001f4d836 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Tue, 11 May 2021 12:17:33 +0100 Subject: aarch64: A couple of mul_laneq tweaks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch removes the duplication between the mul_laneq3 and the older mul-lane patterns. The older patterns were previously divided into two based on whether the indexed operand had the same mode as the other operands or whether it had the opposite length from the other operands (64-bit vs. 128-bit). However, it seemed easier to divide them instead based on whether the indexed operand was 64-bit or 128-bit, since that maps directly to the arm_neon.h “q” conventions. 
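For reference, the arm_neon.h convention is that the plain "_lane" forms index
a 64-bit vector while the "_laneq" forms index a 128-bit one.  A small sketch
with the standard intrinsics (illustrative only, not taken from the testsuite;
the helper names are made up):

  #include <arm_neon.h>

  float32x2_t
  mul_lane (float32x2_t a, float32x2_t v)
  {
    /* Indexed operand is a 64-bit vector (float32x2_t), lane 0..1.  */
    return vmul_lane_f32 (a, v, 1);
  }

  float32x2_t
  mul_laneq (float32x2_t a, float32x4_t v)
  {
    /* Indexed operand is a 128-bit vector (float32x4_t), lane 0..3.  */
    return vmul_laneq_f32 (a, v, 3);
  }

Both should compile to a single by-element fmul on AArch64.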
Also, it looks like the older patterns were missing cases for V8HF<->V4HF combinations, which meant that vmul_laneq_f16 and vmulq_lane_f16 didn't produce single instructions. There was a typo in the V2SF entry for VCONQ, but in practice no patterns were using that entry until now. The test passes for both endiannesses, but endianness does change the mapping between regexps and functions. gcc/ * config/aarch64/iterators.md (VMUL_CHANGE_NLANES): Delete. (VMULD): New iterator. (VCOND): Handle V4HF and V8HF. (VCONQ): Fix entry for V2SF. * config/aarch64/aarch64-simd.md (mul_lane3): Use VMULD instead of VMUL. Use a 64-bit vector mode for the indexed operand. (*aarch64_mul3_elt_): Merge with... (mul_laneq3): ...this define_insn. Use VMUL instead of VDQSF. Use a 128-bit vector mode for the indexed operand. Use stype for the scheduling type. gcc/testsuite/ * gcc.target/aarch64/fmul_lane_1.c: New test. --- gcc/config/aarch64/aarch64-simd.md | 46 +++++++++++++------------------------- gcc/config/aarch64/iterators.md | 13 ++++++----- 2 files changed, 23 insertions(+), 36 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 2347629..9962089 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -719,51 +719,35 @@ ) (define_insn "mul_lane3" - [(set (match_operand:VMUL 0 "register_operand" "=w") - (mult:VMUL - (vec_duplicate:VMUL + [(set (match_operand:VMULD 0 "register_operand" "=w") + (mult:VMULD + (vec_duplicate:VMULD (vec_select: - (match_operand:VMUL 2 "register_operand" "") + (match_operand: 2 "register_operand" "") (parallel [(match_operand:SI 3 "immediate_operand" "i")]))) - (match_operand:VMUL 1 "register_operand" "w")))] + (match_operand:VMULD 1 "register_operand" "w")))] "TARGET_SIMD" { - operands[3] = aarch64_endian_lane_rtx (mode, INTVAL (operands[3])); + operands[3] = aarch64_endian_lane_rtx (mode, INTVAL (operands[3])); return "mul\\t%0., %1., %2.[%3]"; } [(set_attr "type" "neon_mul__scalar")] ) (define_insn "mul_laneq3" - [(set (match_operand:VDQSF 0 "register_operand" "=w") - (mult:VDQSF - (vec_duplicate:VDQSF - (vec_select: - (match_operand:V4SF 2 "register_operand" "w") - (parallel [(match_operand:SI 3 "immediate_operand" "i")]))) - (match_operand:VDQSF 1 "register_operand" "w")))] - "TARGET_SIMD" - { - operands[3] = aarch64_endian_lane_rtx (V4SFmode, INTVAL (operands[3])); - return "fmul\\t%0., %1., %2.[%3]"; - } - [(set_attr "type" "neon_fp_mul_s_scalar")] -) - -(define_insn "*aarch64_mul3_elt_" - [(set (match_operand:VMUL_CHANGE_NLANES 0 "register_operand" "=w") - (mult:VMUL_CHANGE_NLANES - (vec_duplicate:VMUL_CHANGE_NLANES + [(set (match_operand:VMUL 0 "register_operand" "=w") + (mult:VMUL + (vec_duplicate:VMUL (vec_select: - (match_operand: 1 "register_operand" "") - (parallel [(match_operand:SI 2 "immediate_operand")]))) - (match_operand:VMUL_CHANGE_NLANES 3 "register_operand" "w")))] + (match_operand: 2 "register_operand" "") + (parallel [(match_operand:SI 3 "immediate_operand")]))) + (match_operand:VMUL 1 "register_operand" "w")))] "TARGET_SIMD" { - operands[2] = aarch64_endian_lane_rtx (mode, INTVAL (operands[2])); - return "mul\\t%0., %3., %1.[%2]"; + operands[3] = aarch64_endian_lane_rtx (mode, INTVAL (operands[3])); + return "mul\\t%0., %1., %2.[%3]"; } - [(set_attr "type" "neon_mul__scalar")] + [(set_attr "type" "neon_mul__scalar")] ) (define_insn "mul_n3" diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index c57aa6b..69d9dbe 
100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -312,15 +312,17 @@ (define_mode_iterator DSX [DF DI SF SI]) -;; Modes available for Advanced SIMD mul lane operations. +;; Modes available for Advanced SIMD mul operations. (define_mode_iterator VMUL [V4HI V8HI V2SI V4SI (V4HF "TARGET_SIMD_F16INST") (V8HF "TARGET_SIMD_F16INST") V2SF V4SF V2DF]) -;; Modes available for Advanced SIMD mul lane operations changing lane -;; count. -(define_mode_iterator VMUL_CHANGE_NLANES [V4HI V8HI V2SI V4SI V2SF V4SF]) +;; The subset of VMUL for which VCOND is a vector mode. +(define_mode_iterator VMULD [V4HI V8HI V2SI V4SI + (V4HF "TARGET_SIMD_F16INST") + (V8HF "TARGET_SIMD_F16INST") + V2SF V4SF]) ;; Iterators for single modes, for "@" patterns. (define_mode_iterator VNx16QI_ONLY [VNx16QI]) @@ -1201,6 +1203,7 @@ (V4HI "V4HI") (V8HI "V4HI") (V2SI "V2SI") (V4SI "V2SI") (DI "DI") (V2DI "DI") + (V4HF "V4HF") (V8HF "V4HF") (V2SF "V2SF") (V4SF "V2SF") (V2DF "DF")]) @@ -1210,7 +1213,7 @@ (V2SI "V4SI") (V4SI "V4SI") (DI "V2DI") (V2DI "V2DI") (V4HF "V8HF") (V8HF "V8HF") - (V2SF "V2SF") (V4SF "V4SF") + (V2SF "V4SF") (V4SF "V4SF") (V2DF "V2DF") (SI "V4SI") (HI "V8HI") (QI "V16QI")]) -- cgit v1.1 From 2ac1f0eb3887335b7a7e845812956fa436e72a91 Mon Sep 17 00:00:00 2001 From: Alex Coplan Date: Tue, 11 May 2021 13:11:09 +0100 Subject: arm: Avoid emitting bogus CFA adjusts for CMSE nonsecure calls [PR99725] The PR shows us attaching REG_CFA_ADJUST_CFA notes to stack pointer adjustments emitted in cmse_nonsecure_call_inline_register_clear (when -march=armv8.1-m.main). However, the stack pointer is not guaranteed to be the CFA reg. If we're at -O0 or we have -fno-omit-frame-pointer, then the frame pointer will be used as the CFA reg, and these notes on the sp adjustments will lead to ICEs in dwarf2out_frame_debug_adjust_cfa. This patch avoids emitting these notes if the current function has a frame pointer. gcc/ChangeLog: PR target/99725 * config/arm/arm.c (cmse_nonsecure_call_inline_register_clear): Avoid emitting CFA adjusts on the sp if we have the fp. gcc/testsuite/ChangeLog: PR target/99725 * gcc.target/arm/cmse/pr99725.c: New test. --- gcc/config/arm/arm.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 0371d98..2962071 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -18774,10 +18774,14 @@ cmse_nonsecure_call_inline_register_clear (void) imm = gen_int_mode (- lazy_store_stack_frame_size, SImode); add_insn = emit_insn (gen_addsi3 (stack_pointer_rtx, stack_pointer_rtx, imm)); - arm_add_cfa_adjust_cfa_note (add_insn, - - lazy_store_stack_frame_size, - stack_pointer_rtx, - stack_pointer_rtx); + /* If we have the frame pointer, then it will be the + CFA reg. Otherwise, the stack pointer is the CFA + reg, so we need to emit a CFA adjust. */ + if (!frame_pointer_needed) + arm_add_cfa_adjust_cfa_note (add_insn, + - lazy_store_stack_frame_size, + stack_pointer_rtx, + stack_pointer_rtx); emit_insn (gen_lazy_store_multiple_insn (stack_pointer_rtx)); } /* Save VFP callee-saved registers. 
*/ @@ -18815,10 +18819,11 @@ cmse_nonsecure_call_inline_register_clear (void) rtx_insn *add_insn = emit_insn (gen_addsi3 (stack_pointer_rtx, stack_pointer_rtx, imm)); - arm_add_cfa_adjust_cfa_note (add_insn, - lazy_store_stack_frame_size, - stack_pointer_rtx, - stack_pointer_rtx); + if (!frame_pointer_needed) + arm_add_cfa_adjust_cfa_note (add_insn, + lazy_store_stack_frame_size, + stack_pointer_rtx, + stack_pointer_rtx); } /* Restore VFP callee-saved registers. */ else -- cgit v1.1 From b1f7fd8a2a5558da1e101de11bb1cdba081ce010 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 12 May 2021 08:11:18 +0200 Subject: i386: Implement FP vector compares for V2SFmode [PR98218] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement FP vector compares for V2SFmode for TARGET_MMX_WITH_SSE. 2021-05-12 Uroš Bizjak gcc/ PR target/98218 * config/i386/i386-expand.c (ix86_expand_sse_movcc): Handle V2SF mode. * config/i386/mmx.md (MMXMODE124): New mode iterator. (V2FI): Ditto. (mmxintvecmode): New mode attribute. (mmxintvecmodelower): Ditto. (*mmx_maskcmpv2sf3_comm): New insn pattern. (*mmx_maskcmpv2sf3): Ditto. (vec_cmpv2sfv2si): New expander. (vcondv2si): Ditto. (mmx_vlendvps): New insn pattern. (vcond): Also handle V2SFmode. (vcondu): Ditto. (vcond_mask_): Ditto. gcc/testsuite/ PR target/98218 * g++.target/i386/pr98218-1.C: Ditto. * gcc.target/i386/pr98218-4.c: New test. * gcc.target/i386/pr98218-1.c: Correct PR number. * gcc.target/i386/pr98218-1a.c: Ditto. * gcc.target/i386/pr98218-2.c: Ditto. * gcc.target/i386/pr98218-2a.c: Ditto. * gcc.target/i386/pr98218-3.c: Ditto. * gcc.target/i386/pr98218-3a.c: Ditto. --- gcc/config/i386/i386-expand.c | 7 +++ gcc/config/i386/mmx.md | 130 ++++++++++++++++++++++++++++++++++++------ 2 files changed, 119 insertions(+), 18 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 5cfde5b..dd23008 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -3680,6 +3680,13 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) switch (mode) { + case E_V2SFmode: + if (TARGET_SSE4_1) + { + gen = gen_mmx_blendvps; + op_true = force_reg (mode, op_true); + } + break; case E_V4SFmode: if (TARGET_SSE4_1) gen = gen_sse4_1_blendvps; diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index f085708..d433c52 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -49,6 +49,7 @@ ;; All 8-byte vector modes handled by MMX (define_mode_iterator MMXMODE [V8QI V4HI V2SI V1DI V2SF]) +(define_mode_iterator MMXMODE124 [V8QI V4HI V2SI V2SF]) ;; Mix-n-match (define_mode_iterator MMXMODE12 [V8QI V4HI]) @@ -56,12 +57,22 @@ (define_mode_iterator MMXMODE24 [V4HI V2SI]) (define_mode_iterator MMXMODE248 [V4HI V2SI V1DI]) +;; All V2S* modes +(define_mode_iterator V2FI [V2SF V2SI]) + ;; Mapping from integer vector mode to mnemonic suffix (define_mode_attr mmxvecsize [(V8QI "b") (V4HI "w") (V2SI "d") (V1DI "q")]) (define_mode_attr mmxdoublemode [(V8QI "V8HI") (V4HI "V4SI")]) +;; Mapping of vector float modes to an integer mode of the same size +(define_mode_attr mmxintvecmode + [(V2SF "V2SI") (V2SI "V2SI") (V4HI "V4HI") (V8QI "V8QI")]) + +(define_mode_attr mmxintvecmodelower + [(V2SF "v2si") (V2SI "v2si") (V4HI "v4hi") (V8QI "v8qi")]) + (define_mode_attr Yv_Yw [(V8QI "Yw") (V4HI "Yw") (V2SI "Yv") (V1DI "Yv") (V2SF "Yv")]) @@ -714,6 +725,85 @@ (set_attr "prefix_extra" "1") (set_attr "mode" "V2SF")]) +(define_insn 
"*mmx_maskcmpv2sf3_comm" + [(set (match_operand:V2SF 0 "register_operand" "=x,x") + (match_operator:V2SF 3 "sse_comparison_operator" + [(match_operand:V2SF 1 "register_operand" "%0,x") + (match_operand:V2SF 2 "register_operand" "x,x")]))] + "TARGET_MMX_WITH_SSE + && GET_RTX_CLASS (GET_CODE (operands[3])) == RTX_COMM_COMPARE" + "@ + cmp%D3ps\t{%2, %0|%0, %2} + vcmp%D3ps\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" "noavx,avx") + (set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "orig,vex") + (set_attr "mode" "V4SF")]) + +(define_insn "*mmx_maskcmpv2sf3" + [(set (match_operand:V2SF 0 "register_operand" "=x,x") + (match_operator:V2SF 3 "sse_comparison_operator" + [(match_operand:V2SF 1 "register_operand" "0,x") + (match_operand:V2SF 2 "register_operand" "x,x")]))] + "TARGET_MMX_WITH_SSE" + "@ + cmp%D3ps\t{%2, %0|%0, %2} + vcmp%D3ps\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" "noavx,avx") + (set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "orig,vex") + (set_attr "mode" "V4SF")]) + +(define_expand "vec_cmpv2sfv2si" + [(set (match_operand:V2SI 0 "register_operand") + (match_operator:V2SI 1 "" + [(match_operand:V2SF 2 "register_operand") + (match_operand:V2SF 3 "register_operand")]))] + "TARGET_MMX_WITH_SSE" +{ + bool ok = ix86_expand_fp_vec_cmp (operands); + gcc_assert (ok); + DONE; +}) + +(define_expand "vcondv2sf" + [(set (match_operand:V2FI 0 "register_operand") + (if_then_else:V2FI + (match_operator 3 "" + [(match_operand:V2SF 4 "register_operand") + (match_operand:V2SF 5 "register_operand")]) + (match_operand:V2FI 1) + (match_operand:V2FI 2)))] + "TARGET_MMX_WITH_SSE" +{ + bool ok = ix86_expand_fp_vcond (operands); + gcc_assert (ok); + DONE; +}) + +(define_insn "mmx_blendvps" + [(set (match_operand:V2SF 0 "register_operand" "=Yr,*x,x") + (unspec:V2SF + [(match_operand:V2SF 1 "register_operand" "0,0,x") + (match_operand:V2SF 2 "register_operand" "Yr,*x,x") + (match_operand:V2SF 3 "register_operand" "Yz,Yz,x")] + UNSPEC_BLENDV))] + "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE" + "@ + blendvps\t{%3, %2, %0|%0, %2, %3} + blendvps\t{%3, %2, %0|%0, %2, %3} + vblendvps\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "isa" "noavx,noavx,avx") + (set_attr "type" "ssemov") + (set_attr "length_immediate" "1") + (set_attr "prefix_data16" "1,1,*") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "orig,orig,vex") + (set_attr "btver2_decode" "vector") + (set_attr "mode" "V4SF")]) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; Parallel single-precision floating point logical operations @@ -1657,42 +1747,46 @@ DONE; }) -(define_expand "vcond" - [(set (match_operand:MMXMODEI 0 "register_operand") - (if_then_else:MMXMODEI +(define_expand "vcond" + [(set (match_operand:MMXMODE124 0 "register_operand") + (if_then_else:MMXMODE124 (match_operator 3 "" [(match_operand:MMXMODEI 4 "register_operand") (match_operand:MMXMODEI 5 "register_operand")]) - (match_operand:MMXMODEI 1) - (match_operand:MMXMODEI 2)))] - "TARGET_MMX_WITH_SSE" + (match_operand:MMXMODE124 1) + (match_operand:MMXMODE124 2)))] + "TARGET_MMX_WITH_SSE + && (GET_MODE_NUNITS (mode) + == GET_MODE_NUNITS (mode))" { bool ok = ix86_expand_int_vcond (operands); gcc_assert (ok); DONE; }) -(define_expand "vcondu" - [(set (match_operand:MMXMODEI 0 "register_operand") - (if_then_else:MMXMODEI +(define_expand "vcondu" + [(set (match_operand:MMXMODE124 0 "register_operand") + (if_then_else:MMXMODE124 (match_operator 3 "" [(match_operand:MMXMODEI 4 "register_operand") 
(match_operand:MMXMODEI 5 "register_operand")]) - (match_operand:MMXMODEI 1) - (match_operand:MMXMODEI 2)))] - "TARGET_MMX_WITH_SSE" + (match_operand:MMXMODE124 1) + (match_operand:MMXMODE124 2)))] + "TARGET_MMX_WITH_SSE + && (GET_MODE_NUNITS (mode) + == GET_MODE_NUNITS (mode))" { bool ok = ix86_expand_int_vcond (operands); gcc_assert (ok); DONE; }) -(define_expand "vcond_mask_" - [(set (match_operand:MMXMODEI 0 "register_operand") - (vec_merge:MMXMODEI - (match_operand:MMXMODEI 1 "register_operand") - (match_operand:MMXMODEI 2 "register_operand") - (match_operand:MMXMODEI 3 "register_operand")))] +(define_expand "vcond_mask_" + [(set (match_operand:MMXMODE124 0 "register_operand") + (vec_merge:MMXMODE124 + (match_operand:MMXMODE124 1 "register_operand") + (match_operand:MMXMODE124 2 "register_operand") + (match_operand: 3 "register_operand")))] "TARGET_MMX_WITH_SSE" { ix86_expand_sse_movcc (operands[0], operands[3], -- cgit v1.1 From 8da3b309d8fb3ddec0b42218ca6762967b402dc3 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Wed, 7 Apr 2021 09:58:54 +0800 Subject: i386: Optimize vpblendvb on inverted mask register to vpblendvb on swapping the order of operand 1 and operand 2. [PR target/99908] - vpcmpeqd %ymm3, %ymm3, %ymm3 - vpandn %ymm3, %ymm2, %ymm2 - vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 + vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 gcc/ChangeLog: PR target/99908 * config/i386/sse.md (_pblendvb): Add splitters for pblendvb of NOT mask register. gcc/testsuite/ChangeLog: PR target/99908 * gcc.target/i386/avx2-pr99908.c: New test. * gcc.target/i386/sse4_1-pr99908.c: New test. --- gcc/config/i386/sse.md | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 244fb13..49dda74 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -17734,6 +17734,35 @@ (set_attr "btver2_decode" "vector,vector,vector") (set_attr "mode" "")]) +(define_split + [(set (match_operand:VI1_AVX2 0 "register_operand") + (unspec:VI1_AVX2 + [(match_operand:VI1_AVX2 1 "vector_operand") + (match_operand:VI1_AVX2 2 "register_operand") + (not:VI1_AVX2 (match_operand:VI1_AVX2 3 "register_operand"))] + UNSPEC_BLENDV))] + "TARGET_SSE4_1" + [(set (match_dup 0) + (unspec:VI1_AVX2 + [(match_dup 2) (match_dup 1) (match_dup 3)] + UNSPEC_BLENDV))]) + +(define_split + [(set (match_operand:VI1_AVX2 0 "register_operand") + (unspec:VI1_AVX2 + [(match_operand:VI1_AVX2 1 "vector_operand") + (match_operand:VI1_AVX2 2 "register_operand") + (subreg:VI1_AVX2 (not (match_operand 3 "register_operand")) 0)] + UNSPEC_BLENDV))] + "TARGET_SSE4_1 + && GET_MODE_CLASS (GET_MODE (operands[3])) == MODE_VECTOR_INT + && GET_MODE_SIZE (GET_MODE (operands[3])) == " + [(set (match_dup 0) + (unspec:VI1_AVX2 + [(match_dup 2) (match_dup 1) (match_dup 4)] + UNSPEC_BLENDV))] + "operands[4] = gen_lowpart (mode, operands[3]);") + (define_insn_and_split "*_pblendvb_lt" [(set (match_operand:VI1_AVX2 0 "register_operand" "=Yr,*x,x") (unspec:VI1_AVX2 -- cgit v1.1 From 2a1586401a21dcd43e0f904bb6eec26c8b2f366b Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Wed, 12 May 2021 12:40:37 +0200 Subject: [nvptx] Add -mptx=3.1/6.3 Add nvptx option -mptx that sets the ptx ISA version. This is currently hardcoded to 3.1. Tested libgomp on x86_64-linux with nvptx accelerator, both with default set to 3.1 and 6.3. gcc/ChangeLog: 2021-05-12 Tom de Vries PR target/96005 * config/nvptx/nvptx-opts.h (enum ptx_version): New enum. 
* config/nvptx/nvptx.c (nvptx_file_start): Print .version according to ptx_version_option. * config/nvptx/nvptx.h (TARGET_PTX_6_3): Define. * config/nvptx/nvptx.md (define_insn "nvptx_shuffle") (define_insn "nvptx_vote_ballot"): Use sync variant for TARGET_PTX_6_3. * config/nvptx/nvptx.opt (ptx_version): Add enum. (mptx): Add option. * doc/invoke.texi (Nvidia PTX Options): Add mptx item. --- gcc/config/nvptx/nvptx-opts.h | 6 ++++++ gcc/config/nvptx/nvptx.c | 5 ++++- gcc/config/nvptx/nvptx.h | 2 ++ gcc/config/nvptx/nvptx.md | 14 ++++++++++++-- gcc/config/nvptx/nvptx.opt | 14 ++++++++++++++ 5 files changed, 38 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx-opts.h b/gcc/config/nvptx/nvptx-opts.h index ce88245..bfa926e 100644 --- a/gcc/config/nvptx/nvptx-opts.h +++ b/gcc/config/nvptx/nvptx-opts.h @@ -26,5 +26,11 @@ enum ptx_isa PTX_ISA_SM35 }; +enum ptx_version +{ + PTX_VERSION_3_1, + PTX_VERSION_6_3 +}; + #endif diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index 7a7a913..ebbfa92 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -5309,7 +5309,10 @@ static void nvptx_file_start (void) { fputs ("// BEGIN PREAMBLE\n", asm_out_file); - fputs ("\t.version\t3.1\n", asm_out_file); + if (TARGET_PTX_6_3) + fputs ("\t.version\t6.3\n", asm_out_file); + else + fputs ("\t.version\t3.1\n", asm_out_file); if (TARGET_SM35) fputs ("\t.target\tsm_35\n", asm_out_file); else diff --git a/gcc/config/nvptx/nvptx.h b/gcc/config/nvptx/nvptx.h index 2451703..fdaacdd 100644 --- a/gcc/config/nvptx/nvptx.h +++ b/gcc/config/nvptx/nvptx.h @@ -98,6 +98,8 @@ #define TARGET_SM35 (ptx_isa_option >= PTX_ISA_SM35) +#define TARGET_PTX_6_3 (ptx_version_option >= PTX_VERSION_6_3) + /* Registers. Since ptx is a virtual target, we just define a few hard registers for special purposes and leave pseudos unallocated. We have to have some available hard registers, to keep gcc setup diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index 0f15609..00bb8fe 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -1452,14 +1452,24 @@ (match_operand:SI 3 "const_int_operand" "n")] UNSPEC_SHUFFLE))] "" - "%.\\tshfl%S3.b32\\t%0, %1, %2, 31;") + { + if (TARGET_PTX_6_3) + return "%.\\tshfl.sync%S3.b32\\t%0, %1, %2, 31, 0xffffffff;"; + else + return "%.\\tshfl%S3.b32\\t%0, %1, %2, 31;"; + }) (define_insn "nvptx_vote_ballot" [(set (match_operand:SI 0 "nvptx_register_operand" "=R") (unspec:SI [(match_operand:BI 1 "nvptx_register_operand" "R")] UNSPEC_VOTE_BALLOT))] "" - "%.\\tvote.ballot.b32\\t%0, %1;") + { + if (TARGET_PTX_6_3) + return "%.\\tvote.sync.ballot.b32\\t%0, %1, 0xffffffff;"; + else + return "%.\\tvote.ballot.b32\\t%0, %1;"; + }) ;; Patterns for OpenMP SIMD-via-SIMT lowering diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt index 51363e4..468c6ca 100644 --- a/gcc/config/nvptx/nvptx.opt +++ b/gcc/config/nvptx/nvptx.opt @@ -65,3 +65,17 @@ Enum(ptx_isa) String(sm_35) Value(PTX_ISA_SM35) misa= Target RejectNegative ToLower Joined Enum(ptx_isa) Var(ptx_isa_option) Init(PTX_ISA_SM35) Specify the version of the ptx ISA to use. + +Enum +Name(ptx_version) Type(int) +Known PTX versions (for use with the -mptx= option): + +EnumValue +Enum(ptx_version) String(3.1) Value(PTX_VERSION_3_1) + +EnumValue +Enum(ptx_version) String(6.3) Value(PTX_VERSION_6_3) + +mptx= +Target RejectNegative ToLower Joined Enum(ptx_version) Var(ptx_version_option) Init(PTX_VERSION_3_1) +Specify the version of the ptx version to use. 
-- cgit v1.1 From 94de7e225c1fda079052c3f0725c926437d56c94 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Thu, 22 Apr 2021 15:33:16 +0800 Subject: Optimize __builtin_shuffle when it's used to zero the upper bits of the dest. [PR target/94680] If the second operand of __builtin_shuffle is const vector 0, and with specific mask, it can be optimized to movq/vmovps. .i.e. foo128: - vxorps %xmm1, %xmm1, %xmm1 - vmovlhps %xmm1, %xmm0, %xmm0 + vmovq %xmm0, %xmm0 foo256: - vxorps %xmm1, %xmm1, %xmm1 - vshuff32x4 $0, %ymm1, %ymm0, %ymm0 + vmovaps %xmm0, %xmm0 foo512: - vxorps %xmm1, %xmm1, %xmm1 - vshuff32x4 $68, %zmm1, %zmm0, %zmm0 + vmovaps %ymm0, %ymm0 gcc/ChangeLog: PR target/94680 * config/i386/sse.md (ssedoublevecmode): Add attribute for V64QI/V32HI/V16SI/V4DI. (ssehalfvecmode): Add attribute for V2DI/V2DF. (*vec_concatv4si_0): Extend to VI124_128. (*vec_concat_0): New pre-reload splitter. * config/i386/predicates.md (movq_parallel): New predicate. gcc/testsuite/ChangeLog: PR target/94680 * gcc.target/i386/avx-pr94680.c: New test. * gcc.target/i386/avx512f-pr94680.c: New test. * gcc.target/i386/sse2-pr94680.c: New test. --- gcc/config/i386/predicates.md | 32 ++++++++++++++++++++++++++++++++ gcc/config/i386/sse.md | 37 +++++++++++++++++++++++++++++-------- 2 files changed, 61 insertions(+), 8 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index 6dfbb08..abd307e 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1535,6 +1535,38 @@ (and (match_code "mem") (match_test "MEM_ALIGN (op) < GET_MODE_BITSIZE (mode)"))) +;; Return true if OP is a parallel for an mov{d,q,dqa,ps,pd} vec_select, +;; where one of the two operands of the vec_concat is const0_operand. +(define_predicate "movq_parallel" + (match_code "parallel") +{ + unsigned nelt = XVECLEN (op, 0); + unsigned nelt2 = nelt >> 1; + unsigned i; + + if (nelt < 2) + return false; + + /* Validate that all of the elements are constants, + lower halves of permute are lower halves of the first operand, + upper halves of permute come from any of the second operand. */ + for (i = 0; i < nelt; ++i) + { + rtx er = XVECEXP (op, 0, i); + unsigned HOST_WIDE_INT ei; + + if (!CONST_INT_P (er)) + return 0; + ei = INTVAL (er); + if (i < nelt2 && ei != i) + return 0; + if (i >= nelt2 && (ei < nelt || ei >= nelt << 1)) + return 0; + } + + return 1; +}) + ;; Return true if OP is a vzeroall operation, known to be a PARALLEL. (define_predicate "vzeroall_operation" (match_code "parallel") diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 49dda74..4072d0c 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -811,19 +811,22 @@ ;; Mapping of vector modes to a vector mode of double size (define_mode_attr ssedoublevecmode - [(V32QI "V64QI") (V16HI "V32HI") (V8SI "V16SI") (V4DI "V8DI") + [(V64QI "V128QI") (V32HI "V64HI") (V16SI "V32SI") (V8DI "V16DI") + (V32QI "V64QI") (V16HI "V32HI") (V8SI "V16SI") (V4DI "V8DI") (V16QI "V32QI") (V8HI "V16HI") (V4SI "V8SI") (V2DI "V4DI") + (V16SF "V32SF") (V8DF "V16DF") (V8SF "V16SF") (V4DF "V8DF") (V4SF "V8SF") (V2DF "V4DF")]) ;; Mapping of vector modes to a vector mode of half size +;; instead of V1DI/V1DF, DI/DF are used for V2DI/V2DF although they are scalar. 
(define_mode_attr ssehalfvecmode [(V64QI "V32QI") (V32HI "V16HI") (V16SI "V8SI") (V8DI "V4DI") (V4TI "V2TI") (V32QI "V16QI") (V16HI "V8HI") (V8SI "V4SI") (V4DI "V2DI") - (V16QI "V8QI") (V8HI "V4HI") (V4SI "V2SI") + (V16QI "V8QI") (V8HI "V4HI") (V4SI "V2SI") (V2DI "DI") (V16SF "V8SF") (V8DF "V4DF") (V8SF "V4SF") (V4DF "V2DF") - (V4SF "V2SF")]) + (V4SF "V2SF") (V2DF "DF")]) (define_mode_attr ssehalfvecmodelower [(V64QI "v32qi") (V32HI "v16hi") (V16SI "v8si") (V8DI "v4di") (V4TI "v2ti") @@ -15939,11 +15942,11 @@ (set_attr "prefix" "orig,maybe_evex,orig,orig,maybe_evex") (set_attr "mode" "TI,TI,V4SF,V2SF,V2SF")]) -(define_insn "*vec_concatv4si_0" - [(set (match_operand:V4SI 0 "register_operand" "=v,x") - (vec_concat:V4SI - (match_operand:V2SI 1 "nonimmediate_operand" "vm,?!*y") - (match_operand:V2SI 2 "const0_operand" " C,C")))] +(define_insn "*vec_concat_0" + [(set (match_operand:VI124_128 0 "register_operand" "=v,x") + (vec_concat:VI124_128 + (match_operand: 1 "nonimmediate_operand" "vm,?!*y") + (match_operand: 2 "const0_operand" " C,C")))] "TARGET_SSE2" "@ %vmovq\t{%1, %0|%0, %1} @@ -22158,6 +22161,24 @@ (set_attr "prefix" "maybe_evex") (set_attr "mode" "")]) +(define_insn_and_split "*vec_concat_0_1" + [(set (match_operand:V 0 "register_operand") + (vec_select:V + (vec_concat: + (match_operand:V 1 "nonimmediate_operand") + (match_operand:V 2 "const0_operand")) + (match_parallel 3 "movq_parallel" + [(match_operand 4 "const_int_operand")])))] + "ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 0) + (vec_concat:V (match_dup 1) (match_dup 5)))] +{ + operands[1] = gen_lowpart (mode, operands[1]); + operands[5] = CONST0_RTX (mode); +}) + (define_insn "vcvtph2ps" [(set (match_operand:V4SF 0 "register_operand" "=v") (vec_select:V4SF -- cgit v1.1 From f1693741cb2b2db519bb82155a3c0880fd820ea3 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 13 May 2021 11:09:53 +0200 Subject: i386: Fix up V2SFmode vcond* with -mxop [PR100581] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ix86_expand_sse_movcc has special TARGET_XOP handling and the recent addition of support of v*cond* patterns for V2SFmode results in ICEs because the expected pattern doesn't exist. We can handle it using 128-bit vpcmov (if we ignore the upper 64 bits like we ignore in other TARGET_MMX_WITH_SSE support). 2021-05-13 Uroš Bizjak gcc/ PR target/100581 * config/i386/i386-expand.c (ix86_expand_sse_movcc): Force mode sizes < 16 to a register when constructing vpcmov pattern. * config/i386/mmx.md (*xop_pcmov_): Use MMXMODE124 mode. gcc/testsuite/ PR target/100581 * g++.target/i386/pr100581.C: New test. 
--- gcc/config/i386/i386-expand.c | 3 ++- gcc/config/i386/mmx.md | 10 +++++----- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index dd23008..92488b8 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -3661,7 +3661,8 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) { op_true = force_reg (mode, op_true); - if (!nonimmediate_operand (op_false, mode)) + if (GET_MODE_SIZE (mode) < 16 + || !nonimmediate_operand (op_false, mode)) op_false = force_reg (mode, op_false); emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp, diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index d433c52..7fc2e5d 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -1816,11 +1816,11 @@ ;; XOP parallel XMM conditional moves (define_insn "*xop_pcmov_" - [(set (match_operand:MMXMODEI 0 "register_operand" "=x") - (if_then_else:MMXMODEI - (match_operand:MMXMODEI 3 "register_operand" "x") - (match_operand:MMXMODEI 1 "register_operand" "x") - (match_operand:MMXMODEI 2 "register_operand" "x")))] + [(set (match_operand:MMXMODE124 0 "register_operand" "=x") + (if_then_else:MMXMODE124 + (match_operand:MMXMODE124 3 "register_operand" "x") + (match_operand:MMXMODE124 1 "register_operand" "x") + (match_operand:MMXMODE124 2 "register_operand" "x")))] "TARGET_XOP && TARGET_MMX_WITH_SSE" "vpcmov\t{%3, %2, %1, %0|%0, %1, %2, %3}" [(set_attr "type" "sse4arg")]) -- cgit v1.1 From 829c4bea06600ea4201462f91ce6d76ca21fdb35 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Thu, 13 May 2021 12:14:14 +0200 Subject: ix86: Support V{2, 4}DImode arithmetic right shifts for SSE2+ [PR98856] As mentioned in the PR, we don't support arithmetic right V2DImode or V4DImode on x86 without -mavx512vl or -mxop. The ISAs indeed don't have {,v}psraq instructions until AVX512VL, but we actually can emulate it quite easily. One case is arithmetic >> 63, we can just emit {,v}pxor; {,v}pcmpgt for that for SSE4.2+, or for SSE2 psrad $31; pshufd $0xf5. Then arithmetic >> by constant > 32, that can be done with {,v}psrad $31 and {,v}psrad $(cst-32) and two operand permutation, arithmetic >> 32 can be done as {,v}psrad $31 and permutation of that and the original operand. Arithmetic >> by constant < 32 can be done as {,v}psrad $cst and {,v}psrlq $cst and two operand permutation. And arithmetic >> by variable scalar amount can be done as arithmetic >> 63, logical >> by the amount, << by (64 - amount of the >> 63 result; note that the vector << 64 result in 0) and oring together. I had to improve the permutation generation so that it actually handles the needed permutations (or handles them better). 2021-05-13 Jakub Jelinek PR tree-optimization/98856 * config/i386/i386.c (ix86_shift_rotate_cost): Add CODE argument. Expect V2DI and V4DI arithmetic right shifts to be emulated. (ix86_rtx_costs, ix86_add_stmt_cost): Adjust ix86_shift_rotate_cost caller. * config/i386/i386-expand.c (expand_vec_perm_2perm_interleave, expand_vec_perm_2perm_pblendv): New functions. (ix86_expand_vec_perm_const_1): Use them. * config/i386/sse.md (ashr3): Rename to ... (ashr3): ... this. (ashr3): New define_expand with VI248_AVX512BW iterator. (ashrv4di3): New define_expand. (ashrv2di3): Change condition to TARGET_SSE2, handle !TARGET_XOP and !TARGET_AVX512VL expansion. * gcc.target/i386/sse2-psraq-1.c: New test. * gcc.target/i386/sse4_2-psraq-1.c: New test. * gcc.target/i386/avx-psraq-1.c: New test. 
* gcc.target/i386/avx2-psraq-1.c: New test. * gcc.target/i386/avx-pr82370.c: Adjust expected number of vpsrad instructions. * gcc.target/i386/avx2-pr82370.c: Likewise. * gcc.target/i386/avx512f-pr82370.c: Likewise. * gcc.target/i386/avx512bw-pr82370.c: Likewise. * gcc.dg/torture/vshuf-4.inc: Add two further permutations. * gcc.dg/torture/vshuf-8.inc: Likewise. --- gcc/config/i386/i386-expand.c | 248 ++++++++++++++++++++++++++++++++++++++++++ gcc/config/i386/i386.c | 29 ++++- gcc/config/i386/sse.md | 246 ++++++++++++++++++++++++++++++++++++++++- 3 files changed, 517 insertions(+), 6 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 92488b8..0fa8d45 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -18662,6 +18662,242 @@ expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d) return true; } +/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement + a two vector permutation using two single vector permutations and + {,v}{,p}unpckl{ps,pd,bw,wd,dq}. If two_insn, succeed only if one + of dfirst or dsecond is identity permutation. */ + +static bool +expand_vec_perm_2perm_interleave (struct expand_vec_perm_d *d, bool two_insn) +{ + unsigned i, nelt = d->nelt, nelt2 = nelt / 2, lane = nelt; + struct expand_vec_perm_d dfirst, dsecond, dfinal; + bool ident1 = true, ident2 = true; + + if (d->one_operand_p) + return false; + + if (GET_MODE_SIZE (d->vmode) == 16) + { + if (!TARGET_SSE) + return false; + if (d->vmode != V4SFmode && d->vmode != V2DFmode && !TARGET_SSE2) + return false; + } + else if (GET_MODE_SIZE (d->vmode) == 32) + { + if (!TARGET_AVX) + return false; + if (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2) + return false; + lane = nelt2; + } + else + return false; + + for (i = 1; i < nelt; i++) + if ((d->perm[i] >= nelt) != ((d->perm[0] >= nelt) ^ (i & 1))) + return false; + + dfirst = *d; + dsecond = *d; + dfinal = *d; + dfirst.op1 = dfirst.op0; + dfirst.one_operand_p = true; + dsecond.op0 = dsecond.op1; + dsecond.one_operand_p = true; + + for (i = 0; i < nelt; i++) + if (d->perm[i] >= nelt) + { + dsecond.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i] - nelt; + if (d->perm[i] - nelt != i / 2 + (i >= lane ? lane / 2 : 0)) + ident2 = false; + dsecond.perm[i / 2 + (i >= lane ? lane : lane / 2)] + = d->perm[i] - nelt; + } + else + { + dfirst.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i]; + if (d->perm[i] != i / 2 + (i >= lane ? lane / 2 : 0)) + ident1 = false; + dfirst.perm[i / 2 + (i >= lane ? 
lane : lane / 2)] = d->perm[i]; + } + + if (two_insn && !ident1 && !ident2) + return false; + + if (!d->testing_p) + { + if (!ident1) + dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode); + if (!ident2) + dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode); + if (d->perm[0] >= nelt) + std::swap (dfinal.op0, dfinal.op1); + } + + bool ok; + rtx_insn *seq1 = NULL, *seq2 = NULL; + + if (!ident1) + { + start_sequence (); + ok = expand_vec_perm_1 (&dfirst); + seq1 = get_insns (); + end_sequence (); + + if (!ok) + return false; + } + + if (!ident2) + { + start_sequence (); + ok = expand_vec_perm_1 (&dsecond); + seq2 = get_insns (); + end_sequence (); + + if (!ok) + return false; + } + + if (d->testing_p) + return true; + + for (i = 0; i < nelt; i++) + { + dfinal.perm[i] = i / 2; + if (i >= lane) + dfinal.perm[i] += lane / 2; + if ((i & 1) != 0) + dfinal.perm[i] += nelt; + } + emit_insn (seq1); + emit_insn (seq2); + ok = expand_vselect_vconcat (dfinal.target, dfinal.op0, dfinal.op1, + dfinal.perm, dfinal.nelt, false); + gcc_assert (ok); + return true; +} + +/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify + the permutation using two single vector permutations and the SSE4_1 pblendv + instruction. If two_insn, succeed only if one of dfirst or dsecond is + identity permutation. */ + +static bool +expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn) +{ + unsigned i, nelt = d->nelt; + struct expand_vec_perm_d dfirst, dsecond, dfinal; + machine_mode vmode = d->vmode; + bool ident1 = true, ident2 = true; + + /* Use the same checks as in expand_vec_perm_blend. */ + if (d->one_operand_p) + return false; + if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32) + ; + else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode)) + ; + else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16) + ; + else + return false; + + dfirst = *d; + dsecond = *d; + dfinal = *d; + dfirst.op1 = dfirst.op0; + dfirst.one_operand_p = true; + dsecond.op0 = dsecond.op1; + dsecond.one_operand_p = true; + + for (i = 0; i < nelt; ++i) + if (d->perm[i] >= nelt) + { + dfirst.perm[i] = 0xff; + dsecond.perm[i] = d->perm[i] - nelt; + if (d->perm[i] != i + nelt) + ident2 = false; + } + else + { + dsecond.perm[i] = 0xff; + dfirst.perm[i] = d->perm[i]; + if (d->perm[i] != i) + ident1 = false; + } + + if (two_insn && !ident1 && !ident2) + return false; + + /* For now. Ideally treat 0xff as a wildcard. */ + for (i = 0; i < nelt; ++i) + if (dfirst.perm[i] == 0xff) + { + if (GET_MODE_SIZE (vmode) == 32 + && dfirst.perm[i ^ (nelt / 2)] != 0xff) + dfirst.perm[i] = dfirst.perm[i ^ (nelt / 2)] ^ (nelt / 2); + else + dfirst.perm[i] = i; + } + else + { + if (GET_MODE_SIZE (vmode) == 32 + && dsecond.perm[i ^ (nelt / 2)] != 0xff) + dsecond.perm[i] = dsecond.perm[i ^ (nelt / 2)] ^ (nelt / 2); + else + dsecond.perm[i] = i; + } + + if (!d->testing_p) + { + if (!ident1) + dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode); + if (!ident2) + dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode); + } + + bool ok; + rtx_insn *seq1 = NULL, *seq2 = NULL; + + if (!ident1) + { + start_sequence (); + ok = expand_vec_perm_1 (&dfirst); + seq1 = get_insns (); + end_sequence (); + + if (!ok) + return false; + } + + if (!ident2) + { + start_sequence (); + ok = expand_vec_perm_1 (&dsecond); + seq2 = get_insns (); + end_sequence (); + + if (!ok) + return false; + } + + if (d->testing_p) + return true; + + for (i = 0; i < nelt; ++i) + dfinal.perm[i] = (d->perm[i] >= nelt ? 
i + nelt : i); + + emit_insn (seq1); + emit_insn (seq2); + ok = expand_vec_perm_blend (&dfinal); + gcc_assert (ok); + return true; +} + /* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF permutation using two vperm2f128, followed by a vshufpd insn blending the two vectors together. */ @@ -19773,6 +20009,12 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) if (expand_vec_perm_pblendv (d)) return true; + if (expand_vec_perm_2perm_interleave (d, true)) + return true; + + if (expand_vec_perm_2perm_pblendv (d, true)) + return true; + /* Try sequences of three instructions. */ if (expand_vec_perm_even_odd_pack (d)) @@ -19790,6 +20032,12 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) if (expand_vec_perm_vperm2f128_vblend (d)) return true; + if (expand_vec_perm_2perm_interleave (d, false)) + return true; + + if (expand_vec_perm_2perm_pblendv (d, false)) + return true; + /* Try sequences of four instructions. */ if (expand_vec_perm_even_odd_trunc (d)) diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 915f89f..6a1f574 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -19732,6 +19732,7 @@ ix86_division_cost (const struct processor_costs *cost, static int ix86_shift_rotate_cost (const struct processor_costs *cost, + enum rtx_code code, enum machine_mode mode, bool constant_op1, HOST_WIDE_INT op1_val, bool speed, @@ -19770,6 +19771,19 @@ ix86_shift_rotate_cost (const struct processor_costs *cost, count = 7; return ix86_vec_cost (mode, cost->sse_op * count); } + /* V*DImode arithmetic right shift is emulated. */ + else if (code == ASHIFTRT + && (mode == V2DImode || mode == V4DImode) + && !TARGET_XOP + && !TARGET_AVX512VL) + { + int count = 4; + if (constant_op1 && op1_val == 63 && TARGET_SSE4_2) + count = 2; + else if (constant_op1) + count = 3; + return ix86_vec_cost (mode, cost->sse_op * count); + } else return ix86_vec_cost (mode, cost->sse_op); } @@ -19939,13 +19953,15 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, case LSHIFTRT: case ROTATERT: bool skip_op0, skip_op1; - *total = ix86_shift_rotate_cost (cost, mode, CONSTANT_P (XEXP (x, 1)), + *total = ix86_shift_rotate_cost (cost, code, mode, + CONSTANT_P (XEXP (x, 1)), CONST_INT_P (XEXP (x, 1)) ? INTVAL (XEXP (x, 1)) : -1, speed, GET_CODE (XEXP (x, 1)) == AND, SUBREG_P (XEXP (x, 1)) - && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND, + && GET_CODE (XEXP (XEXP (x, 1), + 0)) == AND, &skip_op0, &skip_op1); if (skip_op0 || skip_op1) { @@ -22383,11 +22399,16 @@ ix86_add_stmt_cost (class vec_info *vinfo, void *data, int count, case LROTATE_EXPR: case RROTATE_EXPR: { + tree op1 = gimple_assign_rhs1 (stmt_info->stmt); tree op2 = gimple_assign_rhs2 (stmt_info->stmt); stmt_cost = ix86_shift_rotate_cost - (ix86_cost, mode, + (ix86_cost, + (subcode == RSHIFT_EXPR + && !TYPE_UNSIGNED (TREE_TYPE (op1))) + ? ASHIFTRT : LSHIFTRT, mode, TREE_CODE (op2) == INTEGER_CST, - cst_and_fits_in_hwi (op2) ? int_cst_value (op2) : -1, + cst_and_fits_in_hwi (op2) + ? 
int_cst_value (op2) : -1, true, false, false, NULL, NULL); } break; diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 4072d0c..62f4e15f 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -12468,7 +12468,7 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "")]) -(define_insn "ashr3" +(define_insn "ashr3" [(set (match_operand:VI248_AVX512BW_AVX512VL 0 "register_operand" "=v,v") (ashiftrt:VI248_AVX512BW_AVX512VL (match_operand:VI248_AVX512BW_AVX512VL 1 "nonimmediate_operand" "v,vm") @@ -12482,6 +12482,126 @@ (const_string "0"))) (set_attr "mode" "")]) +(define_expand "ashr3" + [(set (match_operand:VI248_AVX512BW 0 "register_operand") + (ashiftrt:VI248_AVX512BW + (match_operand:VI248_AVX512BW 1 "nonimmediate_operand") + (match_operand:DI 2 "nonmemory_operand")))] + "TARGET_AVX512F") + +(define_expand "ashrv4di3" + [(set (match_operand:V4DI 0 "register_operand") + (ashiftrt:V4DI + (match_operand:V4DI 1 "nonimmediate_operand") + (match_operand:DI 2 "nonmemory_operand")))] + "TARGET_AVX2" +{ + if (!TARGET_AVX512VL) + { + if (CONST_INT_P (operands[2]) && UINTVAL (operands[2]) >= 63) + { + rtx zero = force_reg (V4DImode, CONST0_RTX (V4DImode)); + emit_insn (gen_avx2_gtv4di3 (operands[0], zero, operands[1])); + DONE; + } + if (operands[2] == const0_rtx) + { + emit_move_insn (operands[0], operands[1]); + DONE; + } + operands[1] = force_reg (V4DImode, operands[1]); + if (CONST_INT_P (operands[2])) + { + vec_perm_builder sel (8, 8, 1); + sel.quick_grow (8); + rtx arg0, arg1; + rtx op1 = lowpart_subreg (V8SImode, operands[1], V4DImode); + rtx target = gen_reg_rtx (V8SImode); + if (INTVAL (operands[2]) > 32) + { + arg0 = gen_reg_rtx (V8SImode); + arg1 = gen_reg_rtx (V8SImode); + emit_insn (gen_ashrv8si3 (arg1, op1, GEN_INT (31))); + emit_insn (gen_ashrv8si3 (arg0, op1, + GEN_INT (INTVAL (operands[2]) - 32))); + sel[0] = 1; + sel[1] = 9; + sel[2] = 3; + sel[3] = 11; + sel[4] = 5; + sel[5] = 13; + sel[6] = 7; + sel[7] = 15; + } + else if (INTVAL (operands[2]) == 32) + { + arg0 = op1; + arg1 = gen_reg_rtx (V8SImode); + emit_insn (gen_ashrv8si3 (arg1, op1, GEN_INT (31))); + sel[0] = 1; + sel[1] = 9; + sel[2] = 3; + sel[3] = 11; + sel[4] = 5; + sel[5] = 13; + sel[6] = 7; + sel[7] = 15; + } + else + { + arg0 = gen_reg_rtx (V4DImode); + arg1 = gen_reg_rtx (V8SImode); + emit_insn (gen_lshrv4di3 (arg0, operands[1], operands[2])); + emit_insn (gen_ashrv8si3 (arg1, op1, operands[2])); + arg0 = lowpart_subreg (V8SImode, arg0, V4DImode); + sel[0] = 0; + sel[1] = 9; + sel[2] = 2; + sel[3] = 11; + sel[4] = 4; + sel[5] = 13; + sel[6] = 6; + sel[7] = 15; + } + vec_perm_indices indices (sel, 2, 8); + bool ok = targetm.vectorize.vec_perm_const (V8SImode, target, + arg0, arg1, indices); + gcc_assert (ok); + emit_move_insn (operands[0], + lowpart_subreg (V4DImode, target, V8SImode)); + DONE; + } + + rtx zero = force_reg (V4DImode, CONST0_RTX (V4DImode)); + rtx zero_or_all_ones = gen_reg_rtx (V4DImode); + emit_insn (gen_avx2_gtv4di3 (zero_or_all_ones, zero, operands[1])); + rtx lshr_res = gen_reg_rtx (V4DImode); + emit_insn (gen_lshrv4di3 (lshr_res, operands[1], operands[2])); + rtx ashl_res = gen_reg_rtx (V4DImode); + rtx amount; + if (TARGET_64BIT) + { + amount = gen_reg_rtx (DImode); + emit_insn (gen_subdi3 (amount, force_reg (DImode, GEN_INT (64)), + operands[2])); + } + else + { + rtx temp = gen_reg_rtx (SImode); + emit_insn (gen_subsi3 (temp, force_reg (SImode, GEN_INT (64)), + lowpart_subreg (SImode, operands[2], + DImode))); + amount = gen_reg_rtx (V4SImode); + emit_insn 
(gen_vec_setv4si_0 (amount, CONST0_RTX (V4SImode), + temp)); + } + amount = lowpart_subreg (DImode, amount, GET_MODE (amount)); + emit_insn (gen_ashlv4di3 (ashl_res, zero_or_all_ones, amount)); + emit_insn (gen_iorv4di3 (operands[0], lshr_res, ashl_res)); + DONE; + } +}) + (define_insn "3" [(set (match_operand:VI248_AVX512BW_2 0 "register_operand" "=v,v") (any_lshift:VI248_AVX512BW_2 @@ -20329,10 +20449,132 @@ (ashiftrt:V2DI (match_operand:V2DI 1 "register_operand") (match_operand:DI 2 "nonmemory_operand")))] - "TARGET_XOP || TARGET_AVX512VL" + "TARGET_SSE2" { if (!TARGET_AVX512VL) { + if (TARGET_SSE4_2 + && CONST_INT_P (operands[2]) + && UINTVAL (operands[2]) >= 63) + { + rtx zero = force_reg (V2DImode, CONST0_RTX (V2DImode)); + emit_insn (gen_sse4_2_gtv2di3 (operands[0], zero, operands[1])); + DONE; + } + if (operands[2] == const0_rtx) + { + emit_move_insn (operands[0], operands[1]); + DONE; + } + if (CONST_INT_P (operands[2]) + && (!TARGET_XOP || UINTVAL (operands[2]) >= 63)) + { + vec_perm_builder sel (4, 4, 1); + sel.quick_grow (4); + rtx arg0, arg1; + rtx op1 = lowpart_subreg (V4SImode, operands[1], V2DImode); + rtx target = gen_reg_rtx (V4SImode); + if (UINTVAL (operands[2]) >= 63) + { + arg0 = arg1 = gen_reg_rtx (V4SImode); + emit_insn (gen_ashrv4si3 (arg0, op1, GEN_INT (31))); + sel[0] = 1; + sel[1] = 1; + sel[2] = 3; + sel[3] = 3; + } + else if (INTVAL (operands[2]) > 32) + { + arg0 = gen_reg_rtx (V4SImode); + arg1 = gen_reg_rtx (V4SImode); + emit_insn (gen_ashrv4si3 (arg1, op1, GEN_INT (31))); + emit_insn (gen_ashrv4si3 (arg0, op1, + GEN_INT (INTVAL (operands[2]) - 32))); + sel[0] = 1; + sel[1] = 5; + sel[2] = 3; + sel[3] = 7; + } + else if (INTVAL (operands[2]) == 32) + { + arg0 = op1; + arg1 = gen_reg_rtx (V4SImode); + emit_insn (gen_ashrv4si3 (arg1, op1, GEN_INT (31))); + sel[0] = 1; + sel[1] = 5; + sel[2] = 3; + sel[3] = 7; + } + else + { + arg0 = gen_reg_rtx (V2DImode); + arg1 = gen_reg_rtx (V4SImode); + emit_insn (gen_lshrv2di3 (arg0, operands[1], operands[2])); + emit_insn (gen_ashrv4si3 (arg1, op1, operands[2])); + arg0 = lowpart_subreg (V4SImode, arg0, V2DImode); + sel[0] = 0; + sel[1] = 5; + sel[2] = 2; + sel[3] = 7; + } + vec_perm_indices indices (sel, arg0 != arg1 ? 
2 : 1, 4); + bool ok = targetm.vectorize.vec_perm_const (V4SImode, target, + arg0, arg1, indices); + gcc_assert (ok); + emit_move_insn (operands[0], + lowpart_subreg (V2DImode, target, V4SImode)); + DONE; + } + if (!TARGET_XOP) + { + rtx zero = force_reg (V2DImode, CONST0_RTX (V2DImode)); + rtx zero_or_all_ones; + if (TARGET_SSE4_2) + { + zero_or_all_ones = gen_reg_rtx (V2DImode); + emit_insn (gen_sse4_2_gtv2di3 (zero_or_all_ones, zero, + operands[1])); + } + else + { + rtx temp = gen_reg_rtx (V4SImode); + emit_insn (gen_ashrv4si3 (temp, lowpart_subreg (V4SImode, + operands[1], + V2DImode), + GEN_INT (31))); + zero_or_all_ones = gen_reg_rtx (V4SImode); + emit_insn (gen_sse2_pshufd_1 (zero_or_all_ones, temp, + const1_rtx, const1_rtx, + GEN_INT (3), GEN_INT (3))); + zero_or_all_ones = lowpart_subreg (V2DImode, zero_or_all_ones, + V4SImode); + } + rtx lshr_res = gen_reg_rtx (V2DImode); + emit_insn (gen_lshrv2di3 (lshr_res, operands[1], operands[2])); + rtx ashl_res = gen_reg_rtx (V2DImode); + rtx amount; + if (TARGET_64BIT) + { + amount = gen_reg_rtx (DImode); + emit_insn (gen_subdi3 (amount, force_reg (DImode, GEN_INT (64)), + operands[2])); + } + else + { + rtx temp = gen_reg_rtx (SImode); + emit_insn (gen_subsi3 (temp, force_reg (SImode, GEN_INT (64)), + lowpart_subreg (SImode, operands[2], + DImode))); + amount = gen_reg_rtx (V4SImode); + emit_insn (gen_vec_setv4si_0 (amount, CONST0_RTX (V4SImode), + temp)); + } + amount = lowpart_subreg (DImode, amount, GET_MODE (amount)); + emit_insn (gen_ashlv2di3 (ashl_res, zero_or_all_ones, amount)); + emit_insn (gen_iorv2di3 (operands[0], lshr_res, ashl_res)); + DONE; + } + rtx reg = gen_reg_rtx (V2DImode); rtx par; bool negate = false; -- cgit v1.1 From a451598b2c02e1ca3c62fea272d73a9f31922252 Mon Sep 17 00:00:00 2001 From: Richard Earnshaw Date: Thu, 13 May 2021 11:42:58 +0100 Subject: arm: correctly handle inequality comparisons against max constants [PR100563] Normally we expect the gimple optimizers to fold away comparisons that are always true, but at some lower optimization levels this is not always the case, so the back-end has to be able to generate correct code in these cases. In this example, we have a comparison of the form (unsigned long long) op <= ~0ULL which, of course is always true. Normally, in the arm back-end we handle these expansions where the immediate cannot be handled directly by adding 1 to the constant and then adjusting the comparison operator: (unsigned long long) op < CONST + 1 but we cannot do that when the constant is already the largest value. Fortunately, we observe that the comparisons we need to handle this way are either always true or always false, so instead of forming a comparison against the maximum value, we can replace it with a comparison against the minimum value (which just happens to also be a constant we can handle. So op1 <= ~0ULL -> op1 >= 0U op1 > ~0ULL -> op1 < 0U op1 <= LONG_LONG_INT_MAX -> op1 >= (-LONG_LONG_INT_MAX - 1) op1 > LONG_LONG_INT_MAX -> op1 < (-LONG_LONG_INT_MAX - 1) gcc: PR target/100563 * config/arm/arm.c (arm_canonicalize_comparison): Correctly canonicalize DImode inequality comparisons against the maximum integral value. gcc/testsuite: * gcc.dg/pr100563.c: New test. 
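As a rough illustration (a hypothetical sketch, not the committed gcc.dg/pr100563.c), the kind of always-true or always-false DImode comparison that still has to be expanded at low optimization levels is:

    unsigned long long u;
    long long s;

    int
    le_umax (void)
    {
      return u <= ~0ULL;             /* always true: becomes u >= 0 */
    }

    int
    gt_smax (void)
    {
      return s > __LONG_LONG_MAX__;  /* always false: becomes
                                        s < (-__LONG_LONG_MAX__ - 1) */
    }

At -O0 the gimple passes leave these comparisons alone, so arm_canonicalize_comparison must map them onto constants the DImode compare patterns can actually handle.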
--- gcc/config/arm/arm.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 2962071..d0c0c50 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -5563,9 +5563,20 @@ arm_canonicalize_comparison (int *code, rtx *op0, rtx *op1, return; *op1 = GEN_INT (i + 1); *code = *code == GT ? GE : LT; - return; } - break; + else + { + /* GT maxval is always false, LE maxval is always true. + We can't fold that away here as we must make a + comparison, but we can fold them to comparisons + with the same result that can be handled: + op0 GT maxval -> op0 LT minval + op0 LE maxval -> op0 GE minval + where minval = (-maxval - 1). */ + *op1 = GEN_INT (-maxval - 1); + *code = *code == GT ? LT : GE; + } + return; case GTU: case LEU: @@ -5578,9 +5589,19 @@ arm_canonicalize_comparison (int *code, rtx *op0, rtx *op1, return; *op1 = GEN_INT (i + 1); *code = *code == GTU ? GEU : LTU; - return; } - break; + else + { + /* GTU ~0 is always false, LEU ~0 is always true. + We can't fold that away here as we must make a + comparison, but we can fold them to comparisons + with the same result that can be handled: + op0 GTU ~0 -> op0 LTU 0 + op0 LEU ~0 -> op0 GEU 0. */ + *op1 = const0_rtx; + *code = *code == GTU ? LTU : GEU; + } + return; default: gcc_unreachable (); -- cgit v1.1 From 543c0cbca0ca4e9dbe703a9ea4b8eb79744157b6 Mon Sep 17 00:00:00 2001 From: Kyrylo Tkachov Date: Wed, 12 May 2021 10:52:51 +0100 Subject: aarch64: Merge sqdmlal2 and sqdmlsl2 expanders The various sqdmlal2 and sqdmlsl2 expanders perform almost identical functions and can be merged using code iterators and attributes to reduce the code in the MD file. No behavioural change is expected. gcc/ChangeLog: * config/aarch64/aarch64-simd.md (aarch64_sqdmlal2): Merge into... (aarch64_sqdmll2): ... This. (aarch64_sqdmlsl2): Delete. (aarch64_sqdmlal2_lane): Merge this... (aarch64_sqdmlsl2_lane): ... And this... (aarch64_sqdmll2_lane): ... Into this. (aarch64_sqdmlal2_laneq): Merge this... (aarch64_sqdmlsl2_laneq): ... And this... (aarch64_sqdmll2_laneq): ... Into this. (aarch64_sqdmlal2_n): Merge this... (aarch64_sqdmlsl2_n): ... And this... (aarch64_sqdmll2_n): ... Into this. 
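For context, a minimal sketch of the intrinsics these expanders sit behind (assuming the usual arm_neon.h mapping of vqdmlal_high/vqdmlsl_high onto the sqdmlal2/sqdmlsl2 builtins); after the merge a single expander pattern serves both the saturating-add and saturating-subtract forms:

    #include <arm_neon.h>

    int32x4_t
    mlal2 (int32x4_t acc, int16x8_t a, int16x8_t b)
    {
      return vqdmlal_high_s16 (acc, a, b);   /* ss_plus form */
    }

    int32x4_t
    mlsl2 (int32x4_t acc, int16x8_t a, int16x8_t b)
    {
      return vqdmlsl_high_s16 (acc, a, b);   /* ss_minus form */
    }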
--- gcc/config/aarch64/aarch64-simd.md | 104 ++++++++++--------------------------- 1 file changed, 28 insertions(+), 76 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 9962089..802cca3 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -5356,29 +5356,19 @@ [(set_attr "type" "neon_sat_mla__scalar_long")] ) -(define_expand "aarch64_sqdmlal2" +(define_expand "aarch64_sqdmll2" [(match_operand: 0 "register_operand") - (match_operand: 1 "register_operand") + (SBINQOPS: + (match_operand: 1 "register_operand") + (match_dup 1)) (match_operand:VQ_HSI 2 "register_operand") (match_operand:VQ_HSI 3 "register_operand")] "TARGET_SIMD" { rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); - emit_insn (gen_aarch64_sqdmlal2_internal (operands[0], operands[1], - operands[2], operands[3], p)); - DONE; -}) - -(define_expand "aarch64_sqdmlsl2" - [(match_operand: 0 "register_operand") - (match_operand: 1 "register_operand") - (match_operand:VQ_HSI 2 "register_operand") - (match_operand:VQ_HSI 3 "register_operand")] - "TARGET_SIMD" -{ - rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); - emit_insn (gen_aarch64_sqdmlsl2_internal (operands[0], operands[1], - operands[2], operands[3], p)); + emit_insn (gen_aarch64_sqdmll2_internal (operands[0], + operands[1], operands[2], + operands[3], p)); DONE; }) @@ -5436,63 +5426,37 @@ [(set_attr "type" "neon_sat_mla__scalar_long")] ) -(define_expand "aarch64_sqdmlal2_lane" - [(match_operand: 0 "register_operand") - (match_operand: 1 "register_operand") - (match_operand:VQ_HSI 2 "register_operand") - (match_operand: 3 "register_operand") - (match_operand:SI 4 "immediate_operand")] - "TARGET_SIMD" -{ - rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); - emit_insn (gen_aarch64_sqdmlal2_lane_internal (operands[0], operands[1], - operands[2], operands[3], - operands[4], p)); - DONE; -}) - -(define_expand "aarch64_sqdmlal2_laneq" - [(match_operand: 0 "register_operand") - (match_operand: 1 "register_operand") - (match_operand:VQ_HSI 2 "register_operand") - (match_operand: 3 "register_operand") - (match_operand:SI 4 "immediate_operand")] - "TARGET_SIMD" -{ - rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); - emit_insn (gen_aarch64_sqdmlal2_laneq_internal (operands[0], operands[1], - operands[2], operands[3], - operands[4], p)); - DONE; -}) - -(define_expand "aarch64_sqdmlsl2_lane" +(define_expand "aarch64_sqdmll2_lane" [(match_operand: 0 "register_operand") - (match_operand: 1 "register_operand") + (SBINQOPS: + (match_operand: 1 "register_operand") + (match_dup 1)) (match_operand:VQ_HSI 2 "register_operand") (match_operand: 3 "register_operand") (match_operand:SI 4 "immediate_operand")] "TARGET_SIMD" { rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); - emit_insn (gen_aarch64_sqdmlsl2_lane_internal (operands[0], operands[1], - operands[2], operands[3], - operands[4], p)); + emit_insn (gen_aarch64_sqdmll2_lane_internal (operands[0], + operands[1], operands[2], + operands[3], operands[4], p)); DONE; }) -(define_expand "aarch64_sqdmlsl2_laneq" +(define_expand "aarch64_sqdmll2_laneq" [(match_operand: 0 "register_operand") - (match_operand: 1 "register_operand") + (SBINQOPS: + (match_operand: 1 "register_operand") + (match_dup 1)) (match_operand:VQ_HSI 2 "register_operand") (match_operand: 3 "register_operand") (match_operand:SI 4 "immediate_operand")] "TARGET_SIMD" { rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); - emit_insn 
(gen_aarch64_sqdmlsl2_laneq_internal (operands[0], operands[1], - operands[2], operands[3], - operands[4], p)); + emit_insn (gen_aarch64_sqdmll2_laneq_internal (operands[0], + operands[1], operands[2], + operands[3], operands[4], p)); DONE; }) @@ -5515,31 +5479,19 @@ [(set_attr "type" "neon_sat_mla__scalar_long")] ) -(define_expand "aarch64_sqdmlal2_n" - [(match_operand: 0 "register_operand") - (match_operand: 1 "register_operand") - (match_operand:VQ_HSI 2 "register_operand") - (match_operand: 3 "register_operand")] - "TARGET_SIMD" -{ - rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); - emit_insn (gen_aarch64_sqdmlal2_n_internal (operands[0], operands[1], - operands[2], operands[3], - p)); - DONE; -}) - -(define_expand "aarch64_sqdmlsl2_n" +(define_expand "aarch64_sqdmll2_n" [(match_operand: 0 "register_operand") - (match_operand: 1 "register_operand") + (SBINQOPS: + (match_operand: 1 "register_operand") + (match_dup 1)) (match_operand:VQ_HSI 2 "register_operand") (match_operand: 3 "register_operand")] "TARGET_SIMD" { rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); - emit_insn (gen_aarch64_sqdmlsl2_n_internal (operands[0], operands[1], - operands[2], operands[3], - p)); + emit_insn (gen_aarch64_sqdmll2_n_internal (operands[0], + operands[1], operands[2], + operands[3], p)); DONE; }) -- cgit v1.1 From 36ecd44bd2aa2623e12af6b7c8cf433a4f29aa57 Mon Sep 17 00:00:00 2001 From: "prathamesh.kulkarni" Date: Fri, 14 May 2021 16:03:43 +0530 Subject: arm/PR66791: Replace calls to vtst builtin with it's boolean logic equivalent. gcc/ChangeLog: 2021-05-14 Prathamesh Kulkarni PR target/66791 * config/arm/arm_neon.h (vtst_s8): Replace call to vtst builtin with it's boolean logic equivalent. (vtst_s16): Likewise. (vtst_s32): Likewise. (vtst_u8): Likewise. (vtst_u16): Likewise. (vtst_u32): Likewise. (vtst_p8): Likewise. (vtst_p16): Likewise. (vtstq_s8): Likewise. (vtstq_s16): Likewise. (vtstq_s32): Likewise. (vtstq_u8): Likewise. (vtstq_u16): Likewise. (vtstq_u32): Likewise. (vtstq_p8): Likewise. (vtstq_p16): Likewise. * config/arm/arm_neon_builtins.def: Remove entry for vtst. * config/arm/neon.md (neon_vtst): Remove pattern. 
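A hedged illustration (hypothetical example, not part of the committed change) of why exposing the boolean logic to the midend helps: gimple folding can now see through cases the opaque builtin used to hide, e.g. a self-test:

    #include <arm_neon.h>

    uint16x4_t
    tst_self (int16x4_t a)
    {
      /* With the builtin gone this is (a & a) != 0, which the midend can
         simplify to a != 0; the old __builtin_neon_vtstv4hi call blocked
         that simplification.  */
      return vtst_s16 (a, a);
    }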
--- gcc/config/arm/arm_neon.h | 32 ++++++++++++++++---------------- gcc/config/arm/arm_neon_builtins.def | 1 - gcc/config/arm/neon.md | 10 ---------- 3 files changed, 16 insertions(+), 27 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index dc28b92..dcd533f 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -2919,112 +2919,112 @@ __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtst_s8 (int8x8_t __a, int8x8_t __b) { - return (uint8x8_t)__builtin_neon_vtstv8qi (__a, __b); + return (uint8x8_t) ((__a & __b) != 0); } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtst_s16 (int16x4_t __a, int16x4_t __b) { - return (uint16x4_t)__builtin_neon_vtstv4hi (__a, __b); + return (uint16x4_t) ((__a & __b) != 0); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtst_s32 (int32x2_t __a, int32x2_t __b) { - return (uint32x2_t)__builtin_neon_vtstv2si (__a, __b); + return (uint32x2_t) ((__a & __b) != 0); } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtst_u8 (uint8x8_t __a, uint8x8_t __b) { - return (uint8x8_t)__builtin_neon_vtstv8qi ((int8x8_t) __a, (int8x8_t) __b); + return (uint8x8_t) ((__a & __b) != 0); } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtst_u16 (uint16x4_t __a, uint16x4_t __b) { - return (uint16x4_t)__builtin_neon_vtstv4hi ((int16x4_t) __a, (int16x4_t) __b); + return (uint16x4_t) ((__a & __b) != 0); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtst_u32 (uint32x2_t __a, uint32x2_t __b) { - return (uint32x2_t)__builtin_neon_vtstv2si ((int32x2_t) __a, (int32x2_t) __b); + return (uint32x2_t) ((__a & __b) != 0); } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtst_p8 (poly8x8_t __a, poly8x8_t __b) { - return (uint8x8_t)__builtin_neon_vtstv8qi ((int8x8_t) __a, (int8x8_t) __b); + return (uint8x8_t) ((__a & __b) != 0); } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtst_p16 (poly16x4_t __a, poly16x4_t __b) { - return (uint16x4_t)__builtin_neon_vtstv4hi ((int16x4_t) __a, (int16x4_t) __b); + return (uint16x4_t) ((__a & __b) != 0); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtstq_s8 (int8x16_t __a, int8x16_t __b) { - return (uint8x16_t)__builtin_neon_vtstv16qi (__a, __b); + return (uint8x16_t) ((__a & __b) != 0); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtstq_s16 (int16x8_t __a, int16x8_t __b) { - return (uint16x8_t)__builtin_neon_vtstv8hi (__a, __b); + return (uint16x8_t) ((__a & __b) != 0); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtstq_s32 (int32x4_t __a, int32x4_t __b) { - return (uint32x4_t)__builtin_neon_vtstv4si (__a, __b); + return (uint32x4_t) ((__a & __b) != 0); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtstq_u8 (uint8x16_t __a, uint8x16_t __b) { - return (uint8x16_t)__builtin_neon_vtstv16qi ((int8x16_t) __a, (int8x16_t) __b); + return (uint8x16_t) 
((__a & __b) != 0); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtstq_u16 (uint16x8_t __a, uint16x8_t __b) { - return (uint16x8_t)__builtin_neon_vtstv8hi ((int16x8_t) __a, (int16x8_t) __b); + return (uint16x8_t) ((__a & __b) != 0); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtstq_u32 (uint32x4_t __a, uint32x4_t __b) { - return (uint32x4_t)__builtin_neon_vtstv4si ((int32x4_t) __a, (int32x4_t) __b); + return (uint32x4_t) ((__a & __b) != 0); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtstq_p8 (poly8x16_t __a, poly8x16_t __b) { - return (uint8x16_t)__builtin_neon_vtstv16qi ((int8x16_t) __a, (int8x16_t) __b); + return (uint8x16_t) ((__a & __b) != 0); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtstq_p16 (poly16x8_t __a, poly16x8_t __b) { - return (uint16x8_t)__builtin_neon_vtstv8hi ((int16x8_t) __a, (int16x8_t) __b); + return (uint16x8_t) ((__a & __b) != 0); } __extension__ extern __inline int8x8_t diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def index 97e4f9c..70438ac 100644 --- a/gcc/config/arm/arm_neon_builtins.def +++ b/gcc/config/arm/arm_neon_builtins.def @@ -142,7 +142,6 @@ VAR2 (UNOP, vcgez, v4hf, v8hf) VAR2 (UNOP, vcgtz, v4hf, v8hf) VAR2 (UNOP, vclez, v4hf, v8hf) VAR2 (UNOP, vcltz, v4hf, v8hf) -VAR6 (BINOP, vtst, v8qi, v4hi, v2si, v16qi, v8hi, v4si) VAR6 (BINOP, vabds, v8qi, v4hi, v2si, v16qi, v8hi, v4si) VAR6 (BINOP, vabdu, v8qi, v4hi, v2si, v16qi, v8hi, v4si) VAR2 (BINOP, vabdf, v2sf, v4sf) diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 2a1e304..641d26f 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -2578,16 +2578,6 @@ DONE; }) -(define_insn "neon_vtst" - [(set (match_operand:VDQIW 0 "s_register_operand" "=w") - (unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w") - (match_operand:VDQIW 2 "s_register_operand" "w")] - UNSPEC_VTST))] - "TARGET_NEON" - "vtst.\t%0, %1, %2" - [(set_attr "type" "neon_tst")] -) - (define_insn "neon_vtst_combine" [(set (match_operand:VDQIW 0 "s_register_operand" "=w") (plus:VDQIW -- cgit v1.1 From ff3809b459db881e80f627e81ec946e7bbd7041d Mon Sep 17 00:00:00 2001 From: Kyrylo Tkachov Date: Fri, 14 May 2021 10:05:42 +0100 Subject: aarch64: Make sqdmlal2 patterns match canonical RTL The sqdmlal2 patterns are hidden beneath the SBINQOPS iterator and unfortunately they don't match canonical RTL because the simple accumulate operand comes in the first arm of the SS_PLUS. This patch splits the SS_PLUS and SS_MINUS forms with the SS_PLUS operands set up to match the canonical form, where the complex operand comes first. gcc/ChangeLog: * config/aarch64/aarch64-simd.md (aarch64_sqdmll2_lane_internal): Split into... (aarch64_sqdmlsl2_lane_internal): ... This... (aarch64_sqdmlal2_lane_internal): ... And this. (aarch64_sqdmll2_laneq_internal): Split into ... (aarch64_sqdmlsl2_laneq_internal): ... This... (aarch64_sqdmlal2_laneq_internal): ... And this. (aarch64_sqdmll2_n_internal): Split into... (aarch64_sqdmlsl2_n_internal): ... This... (aarch64_sqdmlal2_n_internal): ... And this. 
--- gcc/config/aarch64/aarch64-simd.md | 89 ++++++++++++++++++++++++++++++++++---- 1 file changed, 80 insertions(+), 9 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 802cca3..e59bc7b 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -5374,9 +5374,9 @@ ;; vqdml[sa]l2_lane -(define_insn "aarch64_sqdmll2_lane_internal" +(define_insn "aarch64_sqdmlsl2_lane_internal" [(set (match_operand: 0 "register_operand" "=w") - (SBINQOPS: + (ss_minus: (match_operand: 1 "register_operand" "0") (ss_ashift: (mult: @@ -5395,14 +5395,40 @@ { operands[4] = aarch64_endian_lane_rtx (mode, INTVAL (operands[4])); return - "sqdmll2\\t%0, %2, %3.[%4]"; + "sqdmlsl2\\t%0, %2, %3.[%4]"; + } + [(set_attr "type" "neon_sat_mla__scalar_long")] +) + +(define_insn "aarch64_sqdmlal2_lane_internal" + [(set (match_operand: 0 "register_operand" "=w") + (ss_plus: + (ss_ashift: + (mult: + (sign_extend: + (vec_select: + (match_operand:VQ_HSI 2 "register_operand" "w") + (match_operand:VQ_HSI 5 "vect_par_cnst_hi_half" ""))) + (sign_extend: + (vec_duplicate: + (vec_select: + (match_operand: 3 "register_operand" "") + (parallel [(match_operand:SI 4 "immediate_operand" "i")]) + )))) + (const_int 1)) + (match_operand: 1 "register_operand" "0")))] + "TARGET_SIMD" + { + operands[4] = aarch64_endian_lane_rtx (mode, INTVAL (operands[4])); + return + "sqdmlal2\\t%0, %2, %3.[%4]"; } [(set_attr "type" "neon_sat_mla__scalar_long")] ) -(define_insn "aarch64_sqdmll2_laneq_internal" +(define_insn "aarch64_sqdmlsl2_laneq_internal" [(set (match_operand: 0 "register_operand" "=w") - (SBINQOPS: + (ss_minus: (match_operand: 1 "register_operand" "0") (ss_ashift: (mult: @@ -5421,7 +5447,33 @@ { operands[4] = aarch64_endian_lane_rtx (mode, INTVAL (operands[4])); return - "sqdmll2\\t%0, %2, %3.[%4]"; + "sqdmlsl2\\t%0, %2, %3.[%4]"; + } + [(set_attr "type" "neon_sat_mla__scalar_long")] +) + +(define_insn "aarch64_sqdmlal2_laneq_internal" + [(set (match_operand: 0 "register_operand" "=w") + (ss_plus: + (ss_ashift: + (mult: + (sign_extend: + (vec_select: + (match_operand:VQ_HSI 2 "register_operand" "w") + (match_operand:VQ_HSI 5 "vect_par_cnst_hi_half" ""))) + (sign_extend: + (vec_duplicate: + (vec_select: + (match_operand: 3 "register_operand" "") + (parallel [(match_operand:SI 4 "immediate_operand" "i")]) + )))) + (const_int 1)) + (match_operand: 1 "register_operand" "0")))] + "TARGET_SIMD" + { + operands[4] = aarch64_endian_lane_rtx (mode, INTVAL (operands[4])); + return + "sqdmlal2\\t%0, %2, %3.[%4]"; } [(set_attr "type" "neon_sat_mla__scalar_long")] ) @@ -5460,9 +5512,9 @@ DONE; }) -(define_insn "aarch64_sqdmll2_n_internal" +(define_insn "aarch64_sqdmlsl2_n_internal" [(set (match_operand: 0 "register_operand" "=w") - (SBINQOPS: + (ss_minus: (match_operand: 1 "register_operand" "0") (ss_ashift: (mult: @@ -5475,7 +5527,26 @@ (match_operand: 3 "register_operand" "")))) (const_int 1))))] "TARGET_SIMD" - "sqdmll2\\t%0, %2, %3.[0]" + "sqdmlsl2\\t%0, %2, %3.[0]" + [(set_attr "type" "neon_sat_mla__scalar_long")] +) + +(define_insn "aarch64_sqdmlal2_n_internal" + [(set (match_operand: 0 "register_operand" "=w") + (ss_plus: + (ss_ashift: + (mult: + (sign_extend: + (vec_select: + (match_operand:VQ_HSI 2 "register_operand" "w") + (match_operand:VQ_HSI 4 "vect_par_cnst_hi_half" ""))) + (sign_extend: + (vec_duplicate: + (match_operand: 3 "register_operand" "")))) + (const_int 1)) + (match_operand: 1 "register_operand" "0")))] + "TARGET_SIMD" + 
"sqdmlal2\\t%0, %2, %3.[0]" [(set_attr "type" "neon_sat_mla__scalar_long")] ) -- cgit v1.1 From 0df864ed1d24bbb0cf7504814e2f48e496ea5669 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Fri, 14 May 2021 14:15:55 -0500 Subject: rs6000: Add -mrop-protect and -mprivileged flags 2021-05-14 Bill Schmidt gcc/ * config/rs6000/rs6000.c (rs6000_option_override_internal): Disable shrink wrap when inserting ROP-protect instructions. * config/rs6000/rs6000.opt (mrop-protect): New option. (mprivileged): Likewise. * doc/invoke.texi: Document mrop-protect and mprivileged. --- gcc/config/rs6000/rs6000.c | 4 ++++ gcc/config/rs6000/rs6000.opt | 8 ++++++++ 2 files changed, 12 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index d1b76f6..53a9f54 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -4040,6 +4040,10 @@ rs6000_option_override_internal (bool global_init_p) && ((rs6000_isa_flags_explicit & OPTION_MASK_QUAD_MEMORY_ATOMIC) == 0)) rs6000_isa_flags |= OPTION_MASK_QUAD_MEMORY_ATOMIC; + /* If we are inserting ROP-protect instructions, disable shrink wrap. */ + if (rs6000_rop_protect) + flag_shrink_wrap = 0; + /* If we can shrink-wrap the TOC register save separately, then use -msave-toc-indirect unless explicitly disabled. */ if ((rs6000_isa_flags_explicit & OPTION_MASK_SAVE_TOC_INDIRECT) == 0 diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt index 0dbdf75..2685fa7 100644 --- a/gcc/config/rs6000/rs6000.opt +++ b/gcc/config/rs6000/rs6000.opt @@ -619,3 +619,11 @@ Generate (do not generate) MMA instructions. mrelative-jumptables Target Undocumented Var(rs6000_relative_jumptables) Init(1) Save + +mrop-protect +Target Var(rs6000_rop_protect) Init(0) +Enable instructions that guard against return-oriented programming attacks. + +mprivileged +Target Var(rs6000_privileged) Init(0) +Generate code that will run in privileged state. -- cgit v1.1 From af979a98bce444082cd5fc5d358dcdf3f65fdfce Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Fri, 14 May 2021 14:25:25 -0500 Subject: rs6000: Emit ROP-mitigation instructions in prologue and epilogue 2021-05-14 Bill Schmidt gcc/ * config/rs6000/rs6000-internal.h (rs6000_stack): Add rop_hash_save_offset and rop_hash_size. * config/rs6000/rs6000-logue.c (rs6000_stack_info): Compute rop_hash_size and rop_hash_save_offset. (debug_stack_info): Dump rop_hash_save_offset and rop_hash_size. (rs6000_emit_prologue): Emit hashst[p] in prologue. (rs6000_emit_epilogue): Emit hashchk[p] in epilogue. * config/rs6000/rs6000.md (unspec): Add UNSPEC_HASHST and UNSPEC_HASHCHK. (hashst): New define_insn. (hashchk): Likewise. 
--- gcc/config/rs6000/rs6000-internal.h | 2 + gcc/config/rs6000/rs6000-logue.c | 74 +++++++++++++++++++++++++++++++++---- gcc/config/rs6000/rs6000.md | 31 ++++++++++++++++ 3 files changed, 100 insertions(+), 7 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-internal.h b/gcc/config/rs6000/rs6000-internal.h index 428a786..88cf9bd 100644 --- a/gcc/config/rs6000/rs6000-internal.h +++ b/gcc/config/rs6000/rs6000-internal.h @@ -39,6 +39,7 @@ typedef struct rs6000_stack { int gp_save_offset; /* offset to save GP regs from initial SP */ int fp_save_offset; /* offset to save FP regs from initial SP */ int altivec_save_offset; /* offset to save AltiVec regs from initial SP */ + int rop_hash_save_offset; /* offset to save ROP hash from initial SP */ int lr_save_offset; /* offset to save LR from initial SP */ int cr_save_offset; /* offset to save CR from initial SP */ int vrsave_save_offset; /* offset to save VRSAVE from initial SP */ @@ -53,6 +54,7 @@ typedef struct rs6000_stack { int gp_size; /* size of saved GP registers */ int fp_size; /* size of saved FP registers */ int altivec_size; /* size of saved AltiVec registers */ + int rop_hash_size; /* size of ROP hash slot */ int cr_size; /* size to hold CR if not in fixed area */ int vrsave_size; /* size to hold VRSAVE */ int altivec_padding_size; /* size of altivec alignment padding */ diff --git a/gcc/config/rs6000/rs6000-logue.c b/gcc/config/rs6000/rs6000-logue.c index b0ac183..13c00e7 100644 --- a/gcc/config/rs6000/rs6000-logue.c +++ b/gcc/config/rs6000/rs6000-logue.c @@ -595,19 +595,21 @@ rs6000_savres_strategy (rs6000_stack_t *info, +---------------------------------------+ | Parameter save area (+padding*) (P) | 32 +---------------------------------------+ - | Alloca space (A) | 32+P + | Optional ROP hash slot (R) | 32+P +---------------------------------------+ - | Local variable space (L) | 32+P+A + | Alloca space (A) | 32+P+R +---------------------------------------+ - | Save area for AltiVec registers (W) | 32+P+A+L + | Local variable space (L) | 32+P+R+A +---------------------------------------+ - | AltiVec alignment padding (Y) | 32+P+A+L+W + | Save area for AltiVec registers (W) | 32+P+R+A+L +---------------------------------------+ - | Save area for GP registers (G) | 32+P+A+L+W+Y + | AltiVec alignment padding (Y) | 32+P+R+A+L+W +---------------------------------------+ - | Save area for FP registers (F) | 32+P+A+L+W+Y+G + | Save area for GP registers (G) | 32+P+R+A+L+W+Y +---------------------------------------+ - old SP->| back chain to caller's caller | 32+P+A+L+W+Y+G+F + | Save area for FP registers (F) | 32+P+R+A+L+W+Y+G + +---------------------------------------+ + old SP->| back chain to caller's caller | 32+P+R+A+L+W+Y+G+F +---------------------------------------+ * If the alloca area is present, the parameter save area is @@ -716,6 +718,19 @@ rs6000_stack_info (void) /* Does this function call anything (apart from sibling calls)? */ info->calls_p = (!crtl->is_leaf || cfun->machine->ra_needs_full_frame); + info->rop_hash_size = 0; + + if (TARGET_POWER10 + && info->calls_p + && DEFAULT_ABI == ABI_ELFv2 + && rs6000_rop_protect) + info->rop_hash_size = 8; + else if (rs6000_rop_protect && DEFAULT_ABI != ABI_ELFv2) + { + /* We can't check this in rs6000_option_override_internal since + DEFAULT_ABI isn't established yet. */ + error ("%qs requires the ELFv2 ABI", "-mrop-protect"); + } /* Determine if we need to save the condition code registers. 
*/ if (save_reg_p (CR2_REGNO) @@ -808,6 +823,11 @@ rs6000_stack_info (void) /* Adjust for AltiVec case. */ info->ehrd_offset = info->altivec_save_offset - ehrd_size; + + /* Adjust for ROP protection. */ + info->rop_hash_save_offset + = info->altivec_save_offset - info->rop_hash_size; + info->ehrd_offset -= info->rop_hash_size; } else info->ehrd_offset = info->gp_save_offset - ehrd_size; @@ -849,6 +869,7 @@ rs6000_stack_info (void) + info->gp_size + info->altivec_size + info->altivec_padding_size + + info->rop_hash_size + ehrd_size + ehcr_size + info->cr_size @@ -987,6 +1008,10 @@ debug_stack_info (rs6000_stack_t *info) fprintf (stderr, "\tvrsave_save_offset = %5d\n", info->vrsave_save_offset); + if (info->rop_hash_size) + fprintf (stderr, "\trop_hash_save_offset = %5d\n", + info->rop_hash_save_offset); + if (info->lr_save_p) fprintf (stderr, "\tlr_save_offset = %5d\n", info->lr_save_offset); @@ -1026,6 +1051,9 @@ debug_stack_info (rs6000_stack_t *info) fprintf (stderr, "\taltivec_padding_size= %5d\n", info->altivec_padding_size); + if (info->rop_hash_size) + fprintf (stderr, "\trop_hash_size = %5d\n", info->rop_hash_size); + if (info->cr_size) fprintf (stderr, "\tcr_size = %5d\n", info->cr_size); @@ -3252,6 +3280,22 @@ rs6000_emit_prologue (void) } } + /* The ROP hash store must occur before a stack frame is created, + since the hash operates on r1. */ + /* NOTE: The hashst isn't needed if we're going to do a sibcall, + but there's no way to know that here. Harmless except for + performance, of course. */ + if (TARGET_POWER10 && rs6000_rop_protect && info->rop_hash_size != 0) + { + gcc_assert (DEFAULT_ABI == ABI_ELFv2); + rtx stack_ptr = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM); + rtx addr = gen_rtx_PLUS (Pmode, stack_ptr, + GEN_INT (info->rop_hash_save_offset)); + rtx mem = gen_rtx_MEM (Pmode, addr); + rtx reg0 = gen_rtx_REG (Pmode, 0); + emit_insn (gen_hashst (mem, reg0)); + } + /* If we need to save CR, put it into r12 or r11. Choose r12 except when r12 will be needed by out-of-line gpr save. */ cr_save_regno = ((DEFAULT_ABI == ABI_AIX || DEFAULT_ABI == ABI_ELFv2) @@ -4980,6 +5024,22 @@ rs6000_emit_epilogue (enum epilogue_type epilogue_type) emit_insn (gen_add3_insn (sp_reg_rtx, sp_reg_rtx, sa)); } + /* The ROP hash check must occur after the stack pointer is restored + (since the hash involves r1), and is not performed for a sibcall. */ + if (TARGET_POWER10 + && rs6000_rop_protect + && info->rop_hash_size != 0 + && epilogue_type != EPILOGUE_TYPE_SIBCALL) + { + gcc_assert (DEFAULT_ABI == ABI_ELFv2); + rtx stack_ptr = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM); + rtx addr = gen_rtx_PLUS (Pmode, stack_ptr, + GEN_INT (info->rop_hash_save_offset)); + rtx mem = gen_rtx_MEM (Pmode, addr); + rtx reg0 = gen_rtx_REG (Pmode, 0); + emit_insn (gen_hashchk (reg0, mem)); + } + if (epilogue_type != EPILOGUE_TYPE_SIBCALL && restoring_FPRs_inline) { if (cfa_restores) diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index c8cdc42..0bfeb24 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -154,6 +154,8 @@ UNSPEC_CNTTZDM UNSPEC_PDEPD UNSPEC_PEXTD + UNSPEC_HASHST + UNSPEC_HASHCHK ]) ;; @@ -14948,6 +14950,35 @@ "TARGET_P9_MISC && TARGET_64BIT" "cmpeqb %0,%1,%2" [(set_attr "type" "logical")]) + + +;; ROP mitigation instructions. 
+ +(define_insn "hashst" + [(set (match_operand:DI 0 "simple_offsettable_mem_operand" "=m") + (unspec_volatile:DI [(match_operand:DI 1 "int_reg_operand" "r")] + UNSPEC_HASHST))] + "TARGET_POWER10 && rs6000_rop_protect" +{ + static char templ[32]; + const char *p = rs6000_privileged ? "p" : ""; + sprintf (templ, "hashst%s %%1,%%0", p); + return templ; +} + [(set_attr "type" "store")]) + +(define_insn "hashchk" + [(unspec_volatile [(match_operand:DI 0 "int_reg_operand" "r") + (match_operand:DI 1 "simple_offsettable_mem_operand" "m")] + UNSPEC_HASHCHK)] + "TARGET_POWER10 && rs6000_rop_protect" +{ + static char templ[32]; + const char *p = rs6000_privileged ? "p" : ""; + sprintf (templ, "hashchk%s %%0,%%1", p); + return templ; +} + [(set_attr "type" "load")]) (include "sync.md") -- cgit v1.1 From 3ec3a9feb83d98715c97c0df4ae751a4eb582956 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Thu, 13 May 2021 13:10:10 -0500 Subject: rs6000: Conditionally define __ROP_PROTECT__ 2021-05-13 Bill Schmidt gcc/ * config/rs6000/rs6000-c.c (rs6000_target_modify_macros): Define __ROP_PROTECT__ if -mrop-protect is selected. --- gcc/config/rs6000/rs6000-c.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c index 0f8a629..afcb5bb 100644 --- a/gcc/config/rs6000/rs6000-c.c +++ b/gcc/config/rs6000/rs6000-c.c @@ -602,6 +602,9 @@ rs6000_target_modify_macros (bool define_p, HOST_WIDE_INT flags, /* Whether pc-relative code is being generated. */ if ((flags & OPTION_MASK_PCREL) != 0) rs6000_define_or_undefine_macro (define_p, "__PCREL__"); + /* Tell the user -mrop-protect is in play. */ + if (rs6000_rop_protect) + rs6000_define_or_undefine_macro (define_p, "__ROP_PROTECT__"); } void -- cgit v1.1 From 4a322345cab10879162a2ddf659fb0f873ba0182 Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Sun, 16 May 2021 13:48:21 +0000 Subject: arm: remove error in CPP_SPEC when -mlittle-endian and -mbig-endian are used together arm.h has had this error message since 1997, but it is no longer needed since option parsing has been improved: -mXXX-endian is handled via arm.opt and updates the BIG_END mask. So, the last instance of -mXXX-endian on the command line wins. Tested on many arm* configurations, with no impact on the testsuite results. 2021-05-16 Christophe Lyon gcc/ * config/arm/arm.h (CPP_SPEC): Remove error message about -mlittle-endian/-mbig-endian conflict. --- gcc/config/arm/arm.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h index e430e4d..8e5bd57 100644 --- a/gcc/config/arm/arm.h +++ b/gcc/config/arm/arm.h @@ -89,9 +89,7 @@ extern tree arm_bf16_ptr_type_node; #undef CPP_SPEC -#define CPP_SPEC "%(subtarget_cpp_spec) \ -%{mbig-endian:%{mlittle-endian: \ - %e-mbig-endian and -mlittle-endian may not be used together}}" +#define CPP_SPEC "%(subtarget_cpp_spec)" #ifndef CC1_SPEC #define CC1_SPEC "" -- cgit v1.1 From 45063c0506a00f2673049d46f12a6061dca4692f Mon Sep 17 00:00:00 2001 From: liuhongt Date: Thu, 13 May 2021 13:08:16 +0800 Subject: Fix ICE [PR target/100549] When arg0 is same as arg1 in __builtin_ia32_pcmpgtw, gimple_build (&stmts, GT_EXPR, cmp_type, arg0, arg1) will simplify the comparison to vector constant 0, no stmts is generated, which causes ICE in gsi_insert_before (gsi, stmts, GSI_SAME_STMT). So use gsi_insert_seq_before instead which will handle NULL seq. 
gcc/ChangeLog: PR target/100549 * config/i386/i386.c (ix86_gimple_fold_builtin): Use gsi_insert_seq_before instead. gcc/testsuite/ChangeLog: PR target/100549 * gcc.target/i386/pr100549.c: New test. --- gcc/config/i386/i386.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 6a1f574..befe69e 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -18000,8 +18000,8 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi) tree cmp_type = truth_type_for (type); gimple_seq stmts = NULL; tree cmp = gimple_build (&stmts, tcode, cmp_type, arg0, arg1); - gsi_insert_before (gsi, stmts, GSI_SAME_STMT); - gimple *g = gimple_build_assign (gimple_call_lhs (stmt), + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); + gimple* g = gimple_build_assign (gimple_call_lhs (stmt), VEC_COND_EXPR, cmp, minus_one_vec, zero_vec); gimple_set_location (g, loc); -- cgit v1.1 From a6eacbf1055520e968d1a25f6d30d6ff4b66272d Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Mon, 17 May 2021 12:29:42 +0000 Subject: arm: Auto-vectorization for MVE: vcmp Since MVE has a different set of vector comparison operators from Neon, we have to update the expansion to take into account the new ones, for instance 'NE' for which MVE does not require to use 'EQ' with the inverted condition. Conversely, Neon supports comparisons with #0, MVE does not. For: typedef long int vs32 __attribute__((vector_size(16))); vs32 cmp_eq_vs32_reg (vs32 a, vs32 b) { return a == b; } we now generate: cmp_eq_vs32_reg: vldr.64 d4, .L123 @ 8 [c=8 l=4] *mve_movv4si/8 vldr.64 d5, .L123+8 vldr.64 d6, .L123+16 @ 9 [c=8 l=4] *mve_movv4si/8 vldr.64 d7, .L123+24 vcmp.i32 eq, q0, q1 @ 7 [c=16 l=4] mve_vcmpeqq_v4si vpsel q0, q3, q2 @ 15 [c=8 l=4] mve_vpselq_sv4si bx lr @ 26 [c=8 l=4] *thumb2_return .L124: .align 3 .L123: .word 0 .word 0 .word 0 .word 0 .word 1 .word 1 .word 1 .word 1 For some reason emit_move_insn (zero, CONST0_RTX (cmp_mode)) produces a pair of vldr instead of vmov.i32, qX, #0 2021-05-17 Christophe Lyon gcc/ * config/arm/arm-protos.h (arm_expand_vector_compare): Update prototype. * config/arm/arm.c (arm_expand_vector_compare): Add support for MVE. (arm_expand_vcond): Likewise. * config/arm/iterators.md (supf): Remove VCMPNEQ_S, VCMPEQQ_S, VCMPEQQ_N_S, VCMPNEQ_N_S. (VCMPNEQ, VCMPEQQ, VCMPEQQ_N, VCMPNEQ_N): Remove. * config/arm/mve.md (@mve_vcmpq_): Add '@' prefix. (@mve_vcmpq_f): Likewise. (@mve_vcmpq_n_f): Likewise. (@mve_vpselq_): Likewise. (@mve_vpselq_f"): Likewise. * config/arm/neon.md (vec_cmp): Likewise. (vcond): Likewise. (vcond): Likewise. (vcondu): Likewise. (vcond_mask_): Likewise. * config/arm/unspecs.md (VCMPNEQ_U, VCMPNEQ_S, VCMPEQQ_S) (VCMPEQQ_N_S, VCMPNEQ_N_S, VCMPEQQ_U, CMPEQQ_N_U, VCMPNEQ_N_U) (VCMPGEQ_N_S, VCMPGEQ_S, VCMPGTQ_N_S, VCMPGTQ_S, VCMPLEQ_N_S) (VCMPLEQ_S, VCMPLTQ_N_S, VCMPLTQ_S, VCMPCSQ_N_U, VCMPCSQ_U) (VCMPHIQ_N_U, VCMPHIQ_U): Remove. * config/arm/vec-common.md (vec_cmp): Likewise. (vcond): Likewise. (vcond): Likewise. (vcondu): Likewise. (vcond_mask_): Likewise. Added unsafe math condition. gcc/testsuite * gcc.target/arm/simd/mve-compare-1.c: New test with GCC vectors. * gcc.target/arm/simd/mve-compare-2.c: New test with GCC vectors. * gcc.target/arm/simd/mve-compare-scalar-1.c: New test with GCC vectors. * gcc.target/arm/simd/mve-vcmp-f32.c: New test for auto-vectorization. * gcc.target/arm/simd/mve-vcmp.c: New test for auto-vectorization. 
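As a sketch of the NE case mentioned above (not one of the new tests; flags along the lines of -O3 -march=armv8.1-m.main+mve -mfloat-abi=hard are assumed):

typedef long int vs32 __attribute__((vector_size(16)));

vs32
cmp_ne_vs32_reg (vs32 a, vs32 b)
{
  /* With MVE this is expected to expand to a single vcmp.i32 ne followed
     by a vpsel, instead of the Neon-style EQ comparison with an inverted
     result.  */
  return a != b;
}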
--- gcc/config/arm/arm-protos.h | 2 +- gcc/config/arm/arm.c | 211 +++++++++++++++++++++++++++++++++---------- gcc/config/arm/iterators.md | 9 +- gcc/config/arm/mve.md | 10 +- gcc/config/arm/neon.md | 87 ------------------ gcc/config/arm/unspecs.md | 20 ---- gcc/config/arm/vec-common.md | 108 ++++++++++++++++++++++ 7 files changed, 281 insertions(+), 166 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h index 2521541..ffccaa7 100644 --- a/gcc/config/arm/arm-protos.h +++ b/gcc/config/arm/arm-protos.h @@ -373,7 +373,7 @@ extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx, extern bool arm_fusion_enabled_p (tune_params::fuse_ops); extern bool arm_valid_symbolic_address_p (rtx); extern bool arm_validize_comparison (rtx *, rtx *, rtx *); -extern bool arm_expand_vector_compare (rtx, rtx_code, rtx, rtx, bool); +extern bool arm_expand_vector_compare (rtx, rtx_code, rtx, rtx, bool, bool); #endif /* RTX_CODE */ extern bool arm_gen_setmem (rtx *); diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index d0c0c50..eee3671 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -30959,66 +30959,113 @@ arm_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem, and return true if TARGET contains the inverse. If !CAN_INVERT, always store the result in TARGET, never its inverse. + If VCOND_MVE, do not emit the vpsel instruction here, let arm_expand_vcond do + it with the right destination type to avoid emiting two vpsel, one here and + one in arm_expand_vcond. + Note that the handling of floating-point comparisons is not IEEE compliant. */ bool arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1, - bool can_invert) + bool can_invert, bool vcond_mve) { machine_mode cmp_result_mode = GET_MODE (target); machine_mode cmp_mode = GET_MODE (op0); bool inverted; - switch (code) - { - /* For these we need to compute the inverse of the requested - comparison. */ - case UNORDERED: - case UNLT: - case UNLE: - case UNGT: - case UNGE: - case UNEQ: - case NE: - code = reverse_condition_maybe_unordered (code); - if (!can_invert) - { - /* Recursively emit the inverted comparison into a temporary - and then store its inverse in TARGET. This avoids reusing - TARGET (which for integer NE could be one of the inputs). */ - rtx tmp = gen_reg_rtx (cmp_result_mode); - if (arm_expand_vector_compare (tmp, code, op0, op1, true)) - gcc_unreachable (); - emit_insn (gen_rtx_SET (target, gen_rtx_NOT (cmp_result_mode, tmp))); - return false; - } - inverted = true; - break; - default: + /* MVE supports more comparisons than Neon. */ + if (TARGET_HAVE_MVE) inverted = false; - break; - } + else + switch (code) + { + /* For these we need to compute the inverse of the requested + comparison. */ + case UNORDERED: + case UNLT: + case UNLE: + case UNGT: + case UNGE: + case UNEQ: + case NE: + code = reverse_condition_maybe_unordered (code); + if (!can_invert) + { + /* Recursively emit the inverted comparison into a temporary + and then store its inverse in TARGET. This avoids reusing + TARGET (which for integer NE could be one of the inputs). 
*/ + rtx tmp = gen_reg_rtx (cmp_result_mode); + if (arm_expand_vector_compare (tmp, code, op0, op1, true, vcond_mve)) + gcc_unreachable (); + emit_insn (gen_rtx_SET (target, gen_rtx_NOT (cmp_result_mode, tmp))); + return false; + } + inverted = true; + break; + + default: + inverted = false; + break; + } switch (code) { - /* These are natively supported for zero comparisons, but otherwise - require the operands to be swapped. */ + /* These are natively supported by Neon for zero comparisons, but otherwise + require the operands to be swapped. For MVE, we can only compare + registers. */ case LE: case LT: - if (op1 != CONST0_RTX (cmp_mode)) - { - code = swap_condition (code); - std::swap (op0, op1); - } + if (!TARGET_HAVE_MVE) + if (op1 != CONST0_RTX (cmp_mode)) + { + code = swap_condition (code); + std::swap (op0, op1); + } /* Fall through. */ - /* These are natively supported for both register and zero operands. */ + /* These are natively supported by Neon for both register and zero + operands. MVE supports registers only. */ case EQ: case GE: case GT: - emit_insn (gen_neon_vc (code, cmp_mode, target, op0, op1)); + case NE: + if (TARGET_HAVE_MVE) + { + rtx vpr_p0; + if (vcond_mve) + vpr_p0 = target; + else + vpr_p0 = gen_reg_rtx (HImode); + + switch (GET_MODE_CLASS (cmp_mode)) + { + case MODE_VECTOR_INT: + emit_insn (gen_mve_vcmpq (code, cmp_mode, vpr_p0, op0, force_reg (cmp_mode, op1))); + break; + case MODE_VECTOR_FLOAT: + if (TARGET_HAVE_MVE_FLOAT) + emit_insn (gen_mve_vcmpq_f (code, cmp_mode, vpr_p0, op0, force_reg (cmp_mode, op1))); + else + gcc_unreachable (); + break; + default: + gcc_unreachable (); + } + + /* If we are not expanding a vcond, build the result here. */ + if (!vcond_mve) + { + rtx zero = gen_reg_rtx (cmp_result_mode); + rtx one = gen_reg_rtx (cmp_result_mode); + emit_move_insn (zero, CONST0_RTX (cmp_result_mode)); + emit_move_insn (one, CONST1_RTX (cmp_result_mode)); + emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, target, one, zero, vpr_p0)); + } + } + else + emit_insn (gen_neon_vc (code, cmp_mode, target, op0, op1)); return inverted; /* These are natively supported for register operands only. @@ -31026,16 +31073,54 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1, or canonicalized by target-independent code. */ case GEU: case GTU: - emit_insn (gen_neon_vc (code, cmp_mode, target, - op0, force_reg (cmp_mode, op1))); + if (TARGET_HAVE_MVE) + { + rtx vpr_p0; + if (vcond_mve) + vpr_p0 = target; + else + vpr_p0 = gen_reg_rtx (HImode); + + emit_insn (gen_mve_vcmpq (code, cmp_mode, vpr_p0, op0, force_reg (cmp_mode, op1))); + if (!vcond_mve) + { + rtx zero = gen_reg_rtx (cmp_result_mode); + rtx one = gen_reg_rtx (cmp_result_mode); + emit_move_insn (zero, CONST0_RTX (cmp_result_mode)); + emit_move_insn (one, CONST1_RTX (cmp_result_mode)); + emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, target, one, zero, vpr_p0)); + } + } + else + emit_insn (gen_neon_vc (code, cmp_mode, target, + op0, force_reg (cmp_mode, op1))); return inverted; /* These require the operands to be swapped and likewise do not support comparisons with zero. 
*/ case LEU: case LTU: - emit_insn (gen_neon_vc (swap_condition (code), cmp_mode, - target, force_reg (cmp_mode, op1), op0)); + if (TARGET_HAVE_MVE) + { + rtx vpr_p0; + if (vcond_mve) + vpr_p0 = target; + else + vpr_p0 = gen_reg_rtx (HImode); + + emit_insn (gen_mve_vcmpq (swap_condition (code), cmp_mode, vpr_p0, force_reg (cmp_mode, op1), op0)); + if (!vcond_mve) + { + rtx zero = gen_reg_rtx (cmp_result_mode); + rtx one = gen_reg_rtx (cmp_result_mode); + emit_move_insn (zero, CONST0_RTX (cmp_result_mode)); + emit_move_insn (one, CONST1_RTX (cmp_result_mode)); + emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, target, one, zero, vpr_p0)); + } + } + else + emit_insn (gen_neon_vc (swap_condition (code), cmp_mode, + target, force_reg (cmp_mode, op1), op0)); return inverted; /* These need a combination of two comparisons. */ @@ -31047,8 +31132,8 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1, rtx gt_res = gen_reg_rtx (cmp_result_mode); rtx alt_res = gen_reg_rtx (cmp_result_mode); rtx_code alt_code = (code == LTGT ? LT : LE); - if (arm_expand_vector_compare (gt_res, GT, op0, op1, true) - || arm_expand_vector_compare (alt_res, alt_code, op0, op1, true)) + if (arm_expand_vector_compare (gt_res, GT, op0, op1, true, vcond_mve) + || arm_expand_vector_compare (alt_res, alt_code, op0, op1, true, vcond_mve)) gcc_unreachable (); emit_insn (gen_rtx_SET (target, gen_rtx_IOR (cmp_result_mode, gt_res, alt_res))); @@ -31066,13 +31151,47 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1, void arm_expand_vcond (rtx *operands, machine_mode cmp_result_mode) { - rtx mask = gen_reg_rtx (cmp_result_mode); + /* When expanding for MVE, we do not want to emit a (useless) vpsel in + arm_expand_vector_compare, and another one here. 
*/ + bool vcond_mve=false; + rtx mask; + + if (TARGET_HAVE_MVE) + { + vcond_mve=true; + mask = gen_reg_rtx (HImode); + } + else + mask = gen_reg_rtx (cmp_result_mode); + bool inverted = arm_expand_vector_compare (mask, GET_CODE (operands[3]), - operands[4], operands[5], true); + operands[4], operands[5], true, vcond_mve); if (inverted) std::swap (operands[1], operands[2]); + if (TARGET_NEON) emit_insn (gen_neon_vbsl (GET_MODE (operands[0]), operands[0], mask, operands[1], operands[2])); + else + { + machine_mode cmp_mode = GET_MODE (operands[4]); + rtx vpr_p0 = mask; + rtx zero = gen_reg_rtx (cmp_mode); + rtx one = gen_reg_rtx (cmp_mode); + emit_move_insn (zero, CONST0_RTX (cmp_mode)); + emit_move_insn (one, CONST1_RTX (cmp_mode)); + switch (GET_MODE_CLASS (cmp_mode)) + { + case MODE_VECTOR_INT: + emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, operands[0], one, zero, vpr_p0)); + break; + case MODE_VECTOR_FLOAT: + if (TARGET_HAVE_MVE_FLOAT) + emit_insn (gen_mve_vpselq_f (cmp_mode, operands[0], one, zero, vpr_p0)); + break; + default: + gcc_unreachable (); + } + } } #define MAX_VECT_LEN 16 diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index 95df8bd..a128465 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -1288,12 +1288,11 @@ (VCREATEQ_U "u") (VCREATEQ_S "s") (VSHRQ_N_S "s") (VSHRQ_N_U "u") (VCVTQ_N_FROM_F_S "s") (VSHLQ_U "u") (VCVTQ_N_FROM_F_U "u") (VADDLVQ_P_S "s") (VSHLQ_S "s") - (VADDLVQ_P_U "u") (VCMPNEQ_S "s") + (VADDLVQ_P_U "u") (VABDQ_M_S "s") (VABDQ_M_U "u") (VABDQ_S "s") (VABDQ_U "u") (VADDQ_N_S "s") (VADDQ_N_U "u") (VADDVQ_P_S "s") (VADDVQ_P_U "u") (VBRSRQ_N_S "s") - (VBRSRQ_N_U "u") (VCMPEQQ_S "s") - (VCMPEQQ_N_S "s") (VCMPNEQ_N_S "s") + (VBRSRQ_N_U "u") (VHADDQ_N_S "s") (VHADDQ_N_U "u") (VHADDQ_S "s") (VHADDQ_U "u") (VHSUBQ_N_S "s") (VHSUBQ_N_U "u") (VHSUBQ_S "s") (VMAXQ_S "s") (VMAXQ_U "u") (VHSUBQ_U "u") @@ -1549,16 +1548,12 @@ (define_int_iterator VSHRQ_N [VSHRQ_N_S VSHRQ_N_U]) (define_int_iterator VCVTQ_N_FROM_F [VCVTQ_N_FROM_F_S VCVTQ_N_FROM_F_U]) (define_int_iterator VADDLVQ_P [VADDLVQ_P_S VADDLVQ_P_U]) -(define_int_iterator VCMPNEQ [VCMPNEQ_S]) (define_int_iterator VSHLQ [VSHLQ_S VSHLQ_U]) (define_int_iterator VABDQ [VABDQ_S VABDQ_U]) (define_int_iterator VADDQ_N [VADDQ_N_S VADDQ_N_U]) (define_int_iterator VADDVAQ [VADDVAQ_S VADDVAQ_U]) (define_int_iterator VADDVQ_P [VADDVQ_P_U VADDVQ_P_S]) (define_int_iterator VBRSRQ_N [VBRSRQ_N_U VBRSRQ_N_S]) -(define_int_iterator VCMPEQQ [VCMPEQQ_S]) -(define_int_iterator VCMPEQQ_N [VCMPEQQ_N_S]) -(define_int_iterator VCMPNEQ_N [VCMPNEQ_N_S]) (define_int_iterator VHADDQ [VHADDQ_S VHADDQ_U]) (define_int_iterator VHADDQ_N [VHADDQ_N_U VHADDQ_N_S]) (define_int_iterator VHSUBQ [VHSUBQ_S VHSUBQ_U]) diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index 45df211..133ebe9 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -813,7 +813,7 @@ ;; ;; [vcmpneq_, vcmpcsq_, vcmpeqq_, vcmpgeq_, vcmpgtq_, vcmphiq_, vcmpleq_, vcmpltq_]) ;; -(define_insn "mve_vcmpq_" +(define_insn "@mve_vcmpq_" [ (set (match_operand:HI 0 "vpr_register_operand" "=Up") (MVE_COMPARISONS:HI (match_operand:MVE_2 1 "s_register_operand" "w") @@ -1903,7 +1903,7 @@ ;; ;; [vcmpeqq_f, vcmpgeq_f, vcmpgtq_f, vcmpleq_f, vcmpltq_f, vcmpneq_f]) ;; -(define_insn "mve_vcmpq_f" +(define_insn "@mve_vcmpq_f" [ (set (match_operand:HI 0 "vpr_register_operand" "=Up") (MVE_FP_COMPARISONS:HI (match_operand:MVE_0 1 "s_register_operand" "w") @@ -1917,7 +1917,7 @@ ;; ;; [vcmpeqq_n_f, vcmpgeq_n_f, vcmpgtq_n_f, 
vcmpleq_n_f, vcmpltq_n_f, vcmpneq_n_f]) ;; -(define_insn "mve_vcmpq_n_f" +(define_insn "@mve_vcmpq_n_f" [ (set (match_operand:HI 0 "vpr_register_operand" "=Up") (MVE_FP_COMPARISONS:HI (match_operand:MVE_0 1 "s_register_operand" "w") @@ -3282,7 +3282,7 @@ ;; ;; [vpselq_u, vpselq_s]) ;; -(define_insn "mve_vpselq_" +(define_insn "@mve_vpselq_" [ (set (match_operand:MVE_1 0 "s_register_operand" "=w") (unspec:MVE_1 [(match_operand:MVE_1 1 "s_register_operand" "w") @@ -4377,7 +4377,7 @@ ;; ;; [vpselq_f]) ;; -(define_insn "mve_vpselq_f" +(define_insn "@mve_vpselq_f" [ (set (match_operand:MVE_0 0 "s_register_operand" "=w") (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w") diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 641d26f..cc82d06 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -1416,93 +1416,6 @@ [(set_attr "type" "neon_qsub")] ) -(define_expand "vec_cmp" - [(set (match_operand: 0 "s_register_operand") - (match_operator: 1 "comparison_operator" - [(match_operand:VDQW 2 "s_register_operand") - (match_operand:VDQW 3 "reg_or_zero_operand")]))] - "TARGET_NEON && (! || flag_unsafe_math_optimizations)" -{ - arm_expand_vector_compare (operands[0], GET_CODE (operands[1]), - operands[2], operands[3], false); - DONE; -}) - -(define_expand "vec_cmpu" - [(set (match_operand:VDQIW 0 "s_register_operand") - (match_operator:VDQIW 1 "comparison_operator" - [(match_operand:VDQIW 2 "s_register_operand") - (match_operand:VDQIW 3 "reg_or_zero_operand")]))] - "TARGET_NEON" -{ - arm_expand_vector_compare (operands[0], GET_CODE (operands[1]), - operands[2], operands[3], false); - DONE; -}) - -;; Conditional instructions. These are comparisons with conditional moves for -;; vectors. They perform the assignment: -;; -;; Vop0 = (Vop4 Vop5) ? Vop1 : Vop2; -;; -;; where op3 is <, <=, ==, !=, >= or >. Operations are performed -;; element-wise. - -(define_expand "vcond" - [(set (match_operand:VDQW 0 "s_register_operand") - (if_then_else:VDQW - (match_operator 3 "comparison_operator" - [(match_operand:VDQW 4 "s_register_operand") - (match_operand:VDQW 5 "reg_or_zero_operand")]) - (match_operand:VDQW 1 "s_register_operand") - (match_operand:VDQW 2 "s_register_operand")))] - "TARGET_NEON && (! || flag_unsafe_math_optimizations)" -{ - arm_expand_vcond (operands, mode); - DONE; -}) - -(define_expand "vcond" - [(set (match_operand: 0 "s_register_operand") - (if_then_else: - (match_operator 3 "comparison_operator" - [(match_operand:V32 4 "s_register_operand") - (match_operand:V32 5 "reg_or_zero_operand")]) - (match_operand: 1 "s_register_operand") - (match_operand: 2 "s_register_operand")))] - "TARGET_NEON && (! || flag_unsafe_math_optimizations)" -{ - arm_expand_vcond (operands, mode); - DONE; -}) - -(define_expand "vcondu" - [(set (match_operand:VDQW 0 "s_register_operand") - (if_then_else:VDQW - (match_operator 3 "arm_comparison_operator" - [(match_operand: 4 "s_register_operand") - (match_operand: 5 "reg_or_zero_operand")]) - (match_operand:VDQW 1 "s_register_operand") - (match_operand:VDQW 2 "s_register_operand")))] - "TARGET_NEON" -{ - arm_expand_vcond (operands, mode); - DONE; -}) - -(define_expand "vcond_mask_" - [(set (match_operand:VDQW 0 "s_register_operand") - (if_then_else:VDQW - (match_operand: 3 "s_register_operand") - (match_operand:VDQW 1 "s_register_operand") - (match_operand:VDQW 2 "s_register_operand")))] - "TARGET_NEON" -{ - emit_insn (gen_neon_vbsl (operands[0], operands[3], operands[1], - operands[2])); - DONE; -}) - ;; Patterns for builtins. 
; good for plain vadd, vaddq. diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md index 07ca53b..0778db1 100644 --- a/gcc/config/arm/unspecs.md +++ b/gcc/config/arm/unspecs.md @@ -596,8 +596,6 @@ VCVTQ_N_FROM_F_U VADDLVQ_P_S VADDLVQ_P_U - VCMPNEQ_U - VCMPNEQ_S VSHLQ_S VSHLQ_U VABDQ_S @@ -605,9 +603,6 @@ VADDVAQ_S VADDVQ_P_S VBRSRQ_N_S - VCMPEQQ_S - VCMPEQQ_N_S - VCMPNEQ_N_S VHADDQ_S VHADDQ_N_S VHSUBQ_S @@ -645,9 +640,6 @@ VADDVAQ_U VADDVQ_P_U VBRSRQ_N_U - VCMPEQQ_U - VCMPEQQ_N_U - VCMPNEQ_N_U VHADDQ_U VHADDQ_N_U VHSUBQ_U @@ -680,14 +672,6 @@ VSHLQ_R_U VSUBQ_U VSUBQ_N_U - VCMPGEQ_N_S - VCMPGEQ_S - VCMPGTQ_N_S - VCMPGTQ_S - VCMPLEQ_N_S - VCMPLEQ_S - VCMPLTQ_N_S - VCMPLTQ_S VHCADDQ_ROT270_S VHCADDQ_ROT90_S VMAXAQ_S @@ -702,10 +686,6 @@ VQRDMULHQ_N_S VQRDMULHQ_S VQSHLUQ_N_S - VCMPCSQ_N_U - VCMPCSQ_U - VCMPHIQ_N_U - VCMPHIQ_U VABDQ_M_S VABDQ_M_U VABDQ_F diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md index 0b2b3b1..448731f 100644 --- a/gcc/config/arm/vec-common.md +++ b/gcc/config/arm/vec-common.md @@ -362,3 +362,111 @@ DONE; } }) + +(define_expand "vec_cmp" + [(set (match_operand: 0 "s_register_operand") + (match_operator: 1 "comparison_operator" + [(match_operand:VDQW 2 "s_register_operand") + (match_operand:VDQW 3 "reg_or_zero_operand")]))] + "ARM_HAVE__ARITH + && !TARGET_REALLY_IWMMXT + && (! || flag_unsafe_math_optimizations)" +{ + arm_expand_vector_compare (operands[0], GET_CODE (operands[1]), + operands[2], operands[3], false, false); + DONE; +}) + +(define_expand "vec_cmpu" + [(set (match_operand:VDQIW 0 "s_register_operand") + (match_operator:VDQIW 1 "comparison_operator" + [(match_operand:VDQIW 2 "s_register_operand") + (match_operand:VDQIW 3 "reg_or_zero_operand")]))] + "ARM_HAVE__ARITH + && !TARGET_REALLY_IWMMXT" +{ + arm_expand_vector_compare (operands[0], GET_CODE (operands[1]), + operands[2], operands[3], false, false); + DONE; +}) + +;; Conditional instructions. These are comparisons with conditional moves for +;; vectors. They perform the assignment: +;; +;; Vop0 = (Vop4 Vop5) ? Vop1 : Vop2; +;; +;; where op3 is <, <=, ==, !=, >= or >. Operations are performed +;; element-wise. + +(define_expand "vcond" + [(set (match_operand:VDQW 0 "s_register_operand") + (if_then_else:VDQW + (match_operator 3 "comparison_operator" + [(match_operand:VDQW 4 "s_register_operand") + (match_operand:VDQW 5 "reg_or_zero_operand")]) + (match_operand:VDQW 1 "s_register_operand") + (match_operand:VDQW 2 "s_register_operand")))] + "ARM_HAVE__ARITH + && !TARGET_REALLY_IWMMXT + && (! || flag_unsafe_math_optimizations)" +{ + arm_expand_vcond (operands, mode); + DONE; +}) + +(define_expand "vcond" + [(set (match_operand: 0 "s_register_operand") + (if_then_else: + (match_operator 3 "comparison_operator" + [(match_operand:V32 4 "s_register_operand") + (match_operand:V32 5 "reg_or_zero_operand")]) + (match_operand: 1 "s_register_operand") + (match_operand: 2 "s_register_operand")))] + "ARM_HAVE__ARITH + && !TARGET_REALLY_IWMMXT + && (! 
|| flag_unsafe_math_optimizations)" +{ + arm_expand_vcond (operands, mode); + DONE; +}) + +(define_expand "vcondu" + [(set (match_operand:VDQW 0 "s_register_operand") + (if_then_else:VDQW + (match_operator 3 "arm_comparison_operator" + [(match_operand: 4 "s_register_operand") + (match_operand: 5 "reg_or_zero_operand")]) + (match_operand:VDQW 1 "s_register_operand") + (match_operand:VDQW 2 "s_register_operand")))] + "ARM_HAVE__ARITH + && !TARGET_REALLY_IWMMXT" +{ + arm_expand_vcond (operands, mode); + DONE; +}) + +(define_expand "vcond_mask_" + [(set (match_operand:VDQW 0 "s_register_operand") + (if_then_else:VDQW + (match_operand: 3 "s_register_operand") + (match_operand:VDQW 1 "s_register_operand") + (match_operand:VDQW 2 "s_register_operand")))] + "ARM_HAVE__ARITH + && !TARGET_REALLY_IWMMXT + && (! || flag_unsafe_math_optimizations)" +{ + if (TARGET_NEON) + { + emit_insn (gen_neon_vbsl (mode, operands[0], operands[3], + operands[1], operands[2])); + } + else if (TARGET_HAVE_MVE) + { + emit_insn (gen_mve_vpselq (VPSELQ_S, mode, operands[0], + operands[1], operands[2], operands[3])); + } + else + gcc_unreachable (); + + DONE; +}) -- cgit v1.1 From 7606865198b241b4c944f66761d6506b02ead951 Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Mon, 17 May 2021 12:31:58 +0000 Subject: arm: Auto-vectorization for MVE: add __fp16 support to VCMP This patch adds __fp16 support to the previous patch that added vcmp support with MVE. For this we update existing expanders to use VDQWH iterator, and add a new expander vcond. In the process we need to create suitable iterators, and update v_cmp_result as needed. 2021-05-17 Christophe Lyon gcc/ * config/arm/iterators.md (V16): New iterator. (VH_cvtto): New iterator. (v_cmp_result): Added V4HF and V8HF support. * config/arm/vec-common.md (vec_cmp): Use VDQWH. (vcond): Likewise. (vcond_mask_): Likewise. (vcond): New expander. gcc/testsuite/ * gcc.target/arm/simd/mve-compare-3.c: New test with GCC vectors. * gcc.target/arm/simd/mve-vcmp-f16.c: New test for auto-vectorization. * gcc.target/arm/armv8_2-fp16-arith-1.c: Adjust since we now vectorize float16_t vectors. --- gcc/config/arm/iterators.md | 6 ++++++ gcc/config/arm/vec-common.md | 40 ++++++++++++++++++++++++++++------------ 2 files changed, 34 insertions(+), 12 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index a128465..3042baf 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -231,6 +231,9 @@ ;; Vector modes for 16-bit floating-point support. (define_mode_iterator VH [V8HF V4HF]) +;; Modes with 16-bit elements only. +(define_mode_iterator V16 [V4HI V4HF V8HI V8HF]) + ;; 16-bit floating-point vector modes suitable for moving (includes BFmode). (define_mode_iterator VHFBF [V8HF V4HF V4BF V8BF]) @@ -571,6 +574,8 @@ ;; (Opposite) mode to convert to/from for vector-half mode conversions. (define_mode_attr VH_CVTTO [(V4HI "V4HF") (V4HF "V4HI") (V8HI "V8HF") (V8HF "V8HI")]) +(define_mode_attr VH_cvtto [(V4HI "v4hf") (V4HF "v4hi") + (V8HI "v8hf") (V8HF "v8hi")]) ;; Define element mode for each vector mode. 
(define_mode_attr V_elem [(V8QI "QI") (V16QI "QI") @@ -720,6 +725,7 @@ (define_mode_attr v_cmp_result [(V8QI "v8qi") (V16QI "v16qi") (V4HI "v4hi") (V8HI "v8hi") (V2SI "v2si") (V4SI "v4si") + (V4HF "v4hi") (V8HF "v8hi") (DI "di") (V2DI "v2di") (V2SF "v2si") (V4SF "v4si")]) diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md index 448731f..265fa40 100644 --- a/gcc/config/arm/vec-common.md +++ b/gcc/config/arm/vec-common.md @@ -366,8 +366,8 @@ (define_expand "vec_cmp" [(set (match_operand: 0 "s_register_operand") (match_operator: 1 "comparison_operator" - [(match_operand:VDQW 2 "s_register_operand") - (match_operand:VDQW 3 "reg_or_zero_operand")]))] + [(match_operand:VDQWH 2 "s_register_operand") + (match_operand:VDQWH 3 "reg_or_zero_operand")]))] "ARM_HAVE__ARITH && !TARGET_REALLY_IWMMXT && (! || flag_unsafe_math_optimizations)" @@ -399,13 +399,13 @@ ;; element-wise. (define_expand "vcond" - [(set (match_operand:VDQW 0 "s_register_operand") - (if_then_else:VDQW + [(set (match_operand:VDQWH 0 "s_register_operand") + (if_then_else:VDQWH (match_operator 3 "comparison_operator" - [(match_operand:VDQW 4 "s_register_operand") - (match_operand:VDQW 5 "reg_or_zero_operand")]) - (match_operand:VDQW 1 "s_register_operand") - (match_operand:VDQW 2 "s_register_operand")))] + [(match_operand:VDQWH 4 "s_register_operand") + (match_operand:VDQWH 5 "reg_or_zero_operand")]) + (match_operand:VDQWH 1 "s_register_operand") + (match_operand:VDQWH 2 "s_register_operand")))] "ARM_HAVE__ARITH && !TARGET_REALLY_IWMMXT && (! || flag_unsafe_math_optimizations)" @@ -430,6 +430,22 @@ DONE; }) +(define_expand "vcond" + [(set (match_operand: 0 "s_register_operand") + (if_then_else: + (match_operator 3 "comparison_operator" + [(match_operand:V16 4 "s_register_operand") + (match_operand:V16 5 "reg_or_zero_operand")]) + (match_operand: 1 "s_register_operand") + (match_operand: 2 "s_register_operand")))] + "ARM_HAVE__ARITH + && !TARGET_REALLY_IWMMXT + && (! || flag_unsafe_math_optimizations)" +{ + arm_expand_vcond (operands, mode); + DONE; +}) + (define_expand "vcondu" [(set (match_operand:VDQW 0 "s_register_operand") (if_then_else:VDQW @@ -446,11 +462,11 @@ }) (define_expand "vcond_mask_" - [(set (match_operand:VDQW 0 "s_register_operand") - (if_then_else:VDQW + [(set (match_operand:VDQWH 0 "s_register_operand") + (if_then_else:VDQWH (match_operand: 3 "s_register_operand") - (match_operand:VDQW 1 "s_register_operand") - (match_operand:VDQW 2 "s_register_operand")))] + (match_operand:VDQWH 1 "s_register_operand") + (match_operand:VDQWH 2 "s_register_operand")))] "ARM_HAVE__ARITH && !TARGET_REALLY_IWMMXT && (! || flag_unsafe_math_optimizations)" -- cgit v1.1 From e91a17fe39c39e98cebe6e1cbc8064ee6846a3a7 Mon Sep 17 00:00:00 2001 From: Tamar Christina Date: Mon, 17 May 2021 15:22:39 +0100 Subject: AArch64: Have -mcpu=native and -march=native enable extensions when CPU is unknown Currently when using -mcpu=native or -march=native on a CPU that is unknown to the compiler the compiler currently just used -march=armv8-a and enables none of the extensions. To make this a bit more useful this patch changes it to still use -march=armv8.a but to enable the extensions. We still cannot do tuning but at least if using this on a future SVE core the compiler will at the very least enable SVE etc. gcc/ChangeLog: * config/aarch64/driver-aarch64.c (DEFAULT_ARCH): New. (host_detect_local_cpu): Use it. gcc/testsuite/ChangeLog: * gcc.target/aarch64/cpunative/info_16: New test. * gcc.target/aarch64/cpunative/info_17: New test. 
* gcc.target/aarch64/cpunative/native_cpu_16.c: New test. * gcc.target/aarch64/cpunative/native_cpu_17.c: New test. --- gcc/config/aarch64/driver-aarch64.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/driver-aarch64.c b/gcc/config/aarch64/driver-aarch64.c index e2935a1..b58591d 100644 --- a/gcc/config/aarch64/driver-aarch64.c +++ b/gcc/config/aarch64/driver-aarch64.c @@ -58,6 +58,8 @@ struct aarch64_core_data #define INVALID_IMP ((unsigned char) -1) #define INVALID_CORE ((unsigned)-1) #define ALL_VARIANTS ((unsigned)-1) +/* Default architecture to use if -mcpu=native did not detect a known CPU. */ +#define DEFAULT_ARCH "8A" #define AARCH64_CORE(CORE_NAME, CORE_IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \ { CORE_NAME, #ARCH, IMP, PART, VARIANT, FLAGS }, @@ -390,10 +392,18 @@ host_detect_local_cpu (int argc, const char **argv) && (aarch64_cpu_data[i].variant == ALL_VARIANTS || variants[0] == aarch64_cpu_data[i].variant)) break; + if (aarch64_cpu_data[i].name == NULL) - goto not_found; + { + aarch64_arch_driver_info* arch_info + = get_arch_from_id (DEFAULT_ARCH); + + gcc_assert (arch_info); - if (arch) + res = concat ("-march=", arch_info->name, NULL); + default_flags = arch_info->flags; + } + else if (arch) { const char *arch_id = aarch64_cpu_data[i].arch; aarch64_arch_driver_info* arch_info = get_arch_from_id (arch_id); -- cgit v1.1 From 58f7c7e098b79c96403c8341823ec3ba1e8b3945 Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Mon, 17 May 2021 10:11:52 +0200 Subject: [nvptx] Handle memmodel for atomic ops The atomic ops in nvptx.md have memmodel arguments, which are currently ignored. Handle these, fixing test-case fails libgomp.c-c++-common/reduction-{5,6}.c on volta. Tested libgomp on x86_64-linux with nvptx accelerator. gcc/ChangeLog: 2021-05-17 Tom de Vries PR target/100497 * config/nvptx/nvptx-protos.h (nvptx_output_atomic_insn): Declare * config/nvptx/nvptx.c (nvptx_output_barrier) (nvptx_output_atomic_insn): New function. (nvptx_print_operand): Add support for 'B'. * config/nvptx/nvptx.md: Use nvptx_output_atomic_insn for atomic insns. --- gcc/config/nvptx/nvptx-protos.h | 1 + gcc/config/nvptx/nvptx.c | 77 +++++++++++++++++++++++++++++++++++++++++ gcc/config/nvptx/nvptx.md | 31 ++++++++++++++--- 3 files changed, 104 insertions(+), 5 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx-protos.h b/gcc/config/nvptx/nvptx-protos.h index 1512209..b7e6ae2 100644 --- a/gcc/config/nvptx/nvptx-protos.h +++ b/gcc/config/nvptx/nvptx-protos.h @@ -57,5 +57,6 @@ extern const char *nvptx_output_set_softstack (unsigned); extern const char *nvptx_output_simt_enter (rtx, rtx, rtx); extern const char *nvptx_output_simt_exit (rtx); extern const char *nvptx_output_red_partition (rtx, rtx); +extern const char *nvptx_output_atomic_insn (const char *, rtx *, int, int); #endif #endif diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index ebbfa92..722b0fa 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -2444,6 +2444,53 @@ nvptx_output_mov_insn (rtx dst, rtx src) return "%.\tcvt%t0%t1\t%0, %1;"; } +/* Output a pre/post barrier for MEM_OPERAND according to MEMMODEL. 
*/ + +static void +nvptx_output_barrier (rtx *mem_operand, int memmodel, bool pre_p) +{ + bool post_p = !pre_p; + + switch (memmodel) + { + case MEMMODEL_RELAXED: + return; + case MEMMODEL_CONSUME: + case MEMMODEL_ACQUIRE: + case MEMMODEL_SYNC_ACQUIRE: + if (post_p) + break; + return; + case MEMMODEL_RELEASE: + case MEMMODEL_SYNC_RELEASE: + if (pre_p) + break; + return; + case MEMMODEL_ACQ_REL: + case MEMMODEL_SEQ_CST: + case MEMMODEL_SYNC_SEQ_CST: + if (pre_p || post_p) + break; + return; + default: + gcc_unreachable (); + } + + output_asm_insn ("%.\tmembar%B0;", mem_operand); +} + +const char * +nvptx_output_atomic_insn (const char *asm_template, rtx *operands, int mem_pos, + int memmodel_pos) +{ + nvptx_output_barrier (&operands[mem_pos], INTVAL (operands[memmodel_pos]), + true); + output_asm_insn (asm_template, operands); + nvptx_output_barrier (&operands[mem_pos], INTVAL (operands[memmodel_pos]), + false); + return ""; +} + static void nvptx_print_operand (FILE *, rtx, int); /* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this @@ -2660,6 +2707,36 @@ nvptx_print_operand (FILE *file, rtx x, int code) switch (code) { + case 'B': + if (SYMBOL_REF_P (XEXP (x, 0))) + switch (SYMBOL_DATA_AREA (XEXP (x, 0))) + { + case DATA_AREA_GENERIC: + /* Assume worst-case: global. */ + gcc_fallthrough (); /* FALLTHROUGH. */ + case DATA_AREA_GLOBAL: + break; + case DATA_AREA_SHARED: + fputs (".cta", file); + return; + case DATA_AREA_LOCAL: + case DATA_AREA_CONST: + case DATA_AREA_PARAM: + default: + gcc_unreachable (); + } + + /* There are 2 cases where membar.sys differs from membar.gl: + - host accesses global memory (f.i. systemwide atomics) + - 2 or more devices are setup in peer-to-peer mode, and one + peer can access global memory of other peer. + Neither are currently supported by openMP/OpenACC on nvptx, but + that could change, so we default to membar.sys. We could support + this more optimally by adding DATA_AREA_SYS and then emitting + .gl for DATA_AREA_GLOBAL and .sys for DATA_AREA_SYS. */ + fputs (".sys", file); + return; + case 'A': x = XEXP (x, 0); gcc_fallthrough (); /* FALLTHROUGH. 
*/ diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index 00bb8fe..108de1c 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -1642,7 +1642,11 @@ (set (match_dup 1) (unspec_volatile:SDIM [(const_int 0)] UNSPECV_CAS))] "" - "%.\\tatom%A1.cas.b%T0\\t%0, %1, %2, %3;" + { + const char *t + = "%.\\tatom%A1.cas.b%T0\\t%0, %1, %2, %3;"; + return nvptx_output_atomic_insn (t, operands, 1, 4); + } [(set_attr "atomic" "true")]) (define_insn "atomic_exchange" @@ -1654,7 +1658,11 @@ (set (match_dup 1) (match_operand:SDIM 2 "nvptx_nonmemory_operand" "Ri"))] ;; input "" - "%.\\tatom%A1.exch.b%T0\\t%0, %1, %2;" + { + const char *t + = "%.\tatom%A1.exch.b%T0\t%0, %1, %2;"; + return nvptx_output_atomic_insn (t, operands, 1, 3); + } [(set_attr "atomic" "true")]) (define_insn "atomic_fetch_add" @@ -1667,7 +1675,11 @@ (set (match_operand:SDIM 0 "nvptx_register_operand" "=R") (match_dup 1))] "" - "%.\\tatom%A1.add%t0\\t%0, %1, %2;" + { + const char *t + = "%.\\tatom%A1.add%t0\\t%0, %1, %2;"; + return nvptx_output_atomic_insn (t, operands, 1, 3); + } [(set_attr "atomic" "true")]) (define_insn "atomic_fetch_addsf" @@ -1680,7 +1692,11 @@ (set (match_operand:SF 0 "nvptx_register_operand" "=R") (match_dup 1))] "" - "%.\\tatom%A1.add%t0\\t%0, %1, %2;" + { + const char *t + = "%.\\tatom%A1.add%t0\\t%0, %1, %2;"; + return nvptx_output_atomic_insn (t, operands, 1, 3); + } [(set_attr "atomic" "true")]) (define_code_iterator any_logic [and ior xor]) @@ -1696,7 +1712,12 @@ (set (match_operand:SDIM 0 "nvptx_register_operand" "=R") (match_dup 1))] "mode == SImode || TARGET_SM35" - "%.\\tatom%A1.b%T0.\\t%0, %1, %2;" + { + const char *t + = "%.\\tatom%A1.b%T0.\\t%0, %1, %2;"; + return nvptx_output_atomic_insn (t, operands, 1, 3); + } + [(set_attr "atomic" "true")]) (define_expand "atomic_test_and_set" -- cgit v1.1 From 7e75d62442fc3707c96c53d22f6c185fdf893c72 Mon Sep 17 00:00:00 2001 From: Andreas Krebbel Date: Tue, 18 May 2021 08:51:08 +0200 Subject: IBM Z: Support vector _Bool language extension _Bool needs to be defined as macro in order to trigger the context-sensitive macro expansion mechanism. gcc/ChangeLog: * config/s390/s390-c.c (s390_cpu_cpp_builtins_internal): Define _Bool as macro expanding to _Bool. gcc/testsuite/ChangeLog: * gcc.target/s390/zvector/vec-_Bool.c: New test. --- gcc/config/s390/s390-c.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/s390/s390-c.c b/gcc/config/s390/s390-c.c index 7dbd8bf..4cce261 100644 --- a/gcc/config/s390/s390-c.c +++ b/gcc/config/s390/s390-c.c @@ -367,6 +367,8 @@ s390_cpu_cpp_builtins_internal (cpp_reader *pfile, old_opts, opts, "vector=vector", "vector"); s390_def_or_undef_macro (pfile, target_flag_set_p (MASK_ZVECTOR), old_opts, opts, "bool=bool", "bool"); + s390_def_or_undef_macro (pfile, target_flag_set_p (MASK_ZVECTOR), + old_opts, opts, "_Bool=_Bool", "_Bool"); if (TARGET_ZVECTOR_P (opts->x_target_flags) && __vector_keyword == NULL) { __vector_keyword = get_identifier ("__vector"); -- cgit v1.1 From def010e4156f8d39c2e8c914eab0b0df8a9fa078 Mon Sep 17 00:00:00 2001 From: Claudiu Zissulescu Date: Tue, 18 May 2021 10:54:38 +0300 Subject: arc: Fix typo in negv2si2 pattern gcc/ 2021-05-18 Claudiu Zissulescu * config/arc/simdext.md (negv2si2): Remove round bracket. 
Signed-off-by: Claudiu Zissulescu --- gcc/config/arc/simdext.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/arc/simdext.md b/gcc/config/arc/simdext.md index c7ca306..dd63f93 100644 --- a/gcc/config/arc/simdext.md +++ b/gcc/config/arc/simdext.md @@ -2056,7 +2056,7 @@ [(set (match_operand:VCT 0 "register_operand" "=r") (neg:VCT (match_operand:VCT 1 "register_operand" "r")))] "TARGET_PLUS_DMPY" - "vsub\\t%0,0,%1" + "vsub\\t%0,0,%1" [(set_attr "length" "8") (set_attr "type" "multi")]) -- cgit v1.1 From c0129e2d489cc0ab419c58c4f8451898c2b5e3ae Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Fri, 19 Mar 2021 10:21:35 +0100 Subject: Use startswith in targets. gcc/ChangeLog: * common/config/aarch64/aarch64-common.c (aarch64_parse_extension): Use startswith function instead of strncmp. * common/config/bfin/bfin-common.c (bfin_handle_option): Likewise. * common/config/riscv/riscv-common.c (riscv_subset_list::parse): Likewise. * config/aarch64/aarch64-sve-builtins-shapes.cc (parse_type): Likewise. * config/aarch64/aarch64.c (aarch64_process_one_target_attr): Likewise. * config/alpha/alpha.c (alpha_elf_section_type_flags): Likewise. * config/arm/aarch-common.c (arm_md_asm_adjust): Likewise. * config/arm/arm.c (arm_file_start): Likewise. (arm_valid_target_attribute_rec): Likewise. (thumb1_md_asm_adjust): Likewise. * config/arm/driver-arm.c (host_detect_local_cpu): Likewise. * config/avr/avr.c (STR_PREFIX_P): Likewise. (avr_set_current_function): Likewise. (avr_handle_addr_attribute): Likewise. (avr_asm_output_aligned_decl_common): Likewise. (avr_asm_named_section): Likewise. (avr_section_type_flags): Likewise. (avr_asm_select_section): Likewise. * config/c6x/c6x.c (c6x_in_small_data_p): Likewise. (c6x_section_type_flags): Likewise. * config/darwin-c.c (darwin_cfstring_ref_p): Likewise. (darwin_objc_declare_unresolved_class_reference): Likewise. (darwin_objc_declare_class_definition): Likewise. * config/darwin.c (indirect_data): Likewise. (darwin_encode_section_info): Likewise. (darwin_objc2_section): Likewise. (darwin_objc1_section): Likewise. (machopic_select_section): Likewise. (darwin_globalize_label): Likewise. (darwin_label_is_anonymous_local_objc_name): Likewise. (darwin_asm_named_section): Likewise. (darwin_asm_output_dwarf_offset): Likewise. * config/frv/frv.c (frv_string_begins_with): Likewise. (frv_in_small_data_p): Likewise. * config/gcn/mkoffload.c (STR): Likewise. (main): Likewise. * config/i386/i386-builtins.c (get_builtin_code_for_version): Likewise. * config/i386/i386-options.c (ix86_option_override_internal): Likewise. * config/i386/i386.c (x86_64_elf_section_type_flags): Likewise. (ix86_md_asm_adjust): Likewise. * config/i386/intelmic-mkoffload.c (STR): Likewise. * config/i386/winnt.c (i386_pe_asm_named_section): Likewise. (i386_pe_file_end): Likewise. * config/ia64/ia64.c (ia64_in_small_data_p): Likewise. (ia64_section_type_flags): Likewise. * config/mips/driver-native.c (host_detect_local_cpu): Likewise. * config/mips/mips.c (mips_handle_interrupt_attr): Likewise. (mips16_stub_function_p): Likewise. (mips_function_rodata_section): Likewise. * config/msp430/msp430.c (msp430_mcu_name): Likewise. (msp430_function_section): Likewise. (msp430_section_type_flags): Likewise. (msp430_expand_helper): Likewise. * config/nios2/nios2.c (nios2_small_section_name_p): Likewise. (nios2_valid_target_attribute_rec): Likewise. * config/nvptx/mkoffload.c (process): Likewise. (STR): Likewise. * config/pa/som.h: Likewise. 
* config/pdp11/pdp11.c (pdp11_output_ident): Likewise. * config/riscv/riscv.c (riscv_elf_select_rtx_section): Likewise. * config/rs6000/rs6000.c (VTABLE_NAME_P): Likewise. (rs6000_inner_target_options): Likewise. * config/s390/driver-native.c (s390_host_detect_local_cpu): Likewise. * config/sparc/driver-sparc.c (host_detect_local_cpu): Likewise. * config/vax/vax.c (vax_output_int_move): Likewise. * config/vms/vms-ld.c (startswith): Likewise. (process_args): Likewise. (main): Likewise. * config/vms/vms.c: Likewise. --- gcc/config/aarch64/aarch64-sve-builtins-shapes.cc | 4 +- gcc/config/aarch64/aarch64.c | 2 +- gcc/config/alpha/alpha.c | 8 +- gcc/config/arm/aarch-common.c | 2 +- gcc/config/arm/arm.c | 8 +- gcc/config/arm/driver-arm.c | 4 +- gcc/config/avr/avr.c | 25 ++-- gcc/config/c6x/c6x.c | 14 +-- gcc/config/darwin-c.c | 9 +- gcc/config/darwin.c | 141 +++++++++++----------- gcc/config/frv/frv.c | 16 +-- gcc/config/gcn/mkoffload.c | 10 +- gcc/config/i386/i386-builtins.c | 2 +- gcc/config/i386/i386-options.c | 2 +- gcc/config/i386/i386.c | 7 +- gcc/config/i386/intelmic-mkoffload.c | 4 +- gcc/config/i386/winnt.c | 5 +- gcc/config/ia64/ia64.c | 20 +-- gcc/config/mips/driver-native.c | 2 +- gcc/config/mips/mips.c | 10 +- gcc/config/msp430/msp430.c | 13 +- gcc/config/nios2/nios2.c | 13 +- gcc/config/nvptx/mkoffload.c | 10 +- gcc/config/pa/som.h | 13 +- gcc/config/pdp11/pdp11.c | 2 +- gcc/config/riscv/riscv.c | 2 +- gcc/config/rs6000/rs6000.c | 18 +-- gcc/config/s390/driver-native.c | 12 +- gcc/config/sparc/driver-sparc.c | 2 +- gcc/config/vax/vax.c | 8 +- gcc/config/vms/vms-ld.c | 22 ++-- gcc/config/vms/vms.c | 2 +- 32 files changed, 195 insertions(+), 217 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc b/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc index e16c81c..2cc3fba 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc +++ b/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc @@ -191,12 +191,12 @@ parse_type (const function_instance &instance, const char *&format) if (ch == 'e') { - if (strncmp (format, "pattern", 7) == 0) + if (startswith (format, "pattern")) { format += 7; return acle_svpattern; } - if (strncmp (format, "prfop", 5) == 0) + if (startswith (format, "prfop")) { format += 5; return acle_svprfop; diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 85fd80e..0835646 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -17377,7 +17377,7 @@ aarch64_process_one_target_attr (char *arg_str) if (*str_to_check == '+') return aarch64_handle_attr_isa_flags (str_to_check); - if (len > 3 && strncmp (str_to_check, "no-", 3) == 0) + if (len > 3 && startswith (str_to_check, "no-")) { invert = true; str_to_check += 3; diff --git a/gcc/config/alpha/alpha.c b/gcc/config/alpha/alpha.c index 335f1db..c702e68 100644 --- a/gcc/config/alpha/alpha.c +++ b/gcc/config/alpha/alpha.c @@ -9457,11 +9457,11 @@ alpha_elf_section_type_flags (tree decl, const char *name, int reloc) unsigned int flags = 0; if (strcmp (name, ".sdata") == 0 - || strncmp (name, ".sdata.", 7) == 0 - || strncmp (name, ".gnu.linkonce.s.", 16) == 0 + || startswith (name, ".sdata.") + || startswith (name, ".gnu.linkonce.s.") || strcmp (name, ".sbss") == 0 - || strncmp (name, ".sbss.", 6) == 0 - || strncmp (name, ".gnu.linkonce.sb.", 17) == 0) + || startswith (name, ".sbss.") + || startswith (name, ".gnu.linkonce.sb.")) flags = SECTION_SMALL; flags |= default_section_type_flags (decl, name, reloc); diff --git 
a/gcc/config/arm/aarch-common.c b/gcc/config/arm/aarch-common.c index 24711d5..0dbdc56 100644 --- a/gcc/config/arm/aarch-common.c +++ b/gcc/config/arm/aarch-common.c @@ -542,7 +542,7 @@ arm_md_asm_adjust (vec &outputs, vec & /*inputs*/, for (unsigned i = 0, n = outputs.length (); i < n; ++i) { const char *con = constraints[i]; - if (strncmp (con, "=@cc", 4) != 0) + if (!startswith (con, "=@cc")) continue; con += 4; if (strchr (con, ',') != NULL) diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index eee3671..28cfd81 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -28169,7 +28169,7 @@ arm_file_start (void) else arm_print_asm_arch_directives (); } - else if (strncmp (arm_active_target.core_name, "generic", 7) == 0) + else if (startswith (arm_active_target.core_name, "generic")) { asm_fprintf (asm_out_file, "\t.arch %s\n", arm_active_target.core_name + 8); @@ -33054,7 +33054,7 @@ arm_valid_target_attribute_rec (tree args, struct gcc_options *opts) else if (!strcmp (q, "general-regs-only")) opts->x_target_flags |= MASK_GENERAL_REGS_ONLY; - else if (!strncmp (q, "fpu=", 4)) + else if (startswith (q, "fpu=")) { int fpu_index; if (! opt_enum_arg_to_value (OPT_mfpu_, q + 4, @@ -33073,7 +33073,7 @@ arm_valid_target_attribute_rec (tree args, struct gcc_options *opts) } opts->x_arm_fpu_index = (enum fpu_type) fpu_index; } - else if (!strncmp (q, "arch=", 5)) + else if (startswith (q, "arch=")) { char *arch = q + 5; const arch_option *arm_selected_arch @@ -34105,7 +34105,7 @@ thumb1_md_asm_adjust (vec &outputs, vec & /*inputs*/, HARD_REG_SET & /*clobbered_regs*/) { for (unsigned i = 0, n = outputs.length (); i < n; ++i) - if (strncmp (constraints[i], "=@cc", 4) == 0) + if (startswith (constraints[i], "=@cc")) { sorry ("asm flags not supported in thumb1 mode"); break; diff --git a/gcc/config/arm/driver-arm.c b/gcc/config/arm/driver-arm.c index 21ae2d7..247eab3 100644 --- a/gcc/config/arm/driver-arm.c +++ b/gcc/config/arm/driver-arm.c @@ -82,7 +82,7 @@ host_detect_local_cpu (int argc, const char **argv) while (fgets (buf, sizeof (buf), f) != NULL) { /* Find the vendor table associated with this implementer. */ - if (strncmp (buf, "CPU implementer", sizeof ("CPU implementer") - 1) == 0) + if (startswith (buf, "CPU implementer")) { int i; for (i = 0; vendors_table[i].vendor_no != NULL; i++) @@ -94,7 +94,7 @@ host_detect_local_cpu (int argc, const char **argv) } /* Detect arch/cpu. */ - if (strncmp (buf, "CPU part", sizeof ("CPU part") - 1) == 0) + if (startswith (buf, "CPU part")) { int i; diff --git a/gcc/config/avr/avr.c b/gcc/config/avr/avr.c index 06c84d5..c95c436 100644 --- a/gcc/config/avr/avr.c +++ b/gcc/config/avr/avr.c @@ -63,9 +63,6 @@ /* Maximal allowed offset for an address in the LD command */ #define MAX_LD_OFFSET(MODE) (64 - (signed)GET_MODE_SIZE (MODE)) -/* Return true if STR starts with PREFIX and false, otherwise. */ -#define STR_PREFIX_P(STR,PREFIX) (strncmp (STR, PREFIX, strlen (PREFIX)) == 0) - /* The 4 bits starting at SECTION_MACH_DEP are reserved to store the address space where data is to be located. As the only non-generic address spaces are all located in flash, @@ -1092,7 +1089,7 @@ avr_set_current_function (tree decl) that the name of the function is "__vector_NN" so as to catch when the user misspells the vector name. 
*/ - if (!STR_PREFIX_P (name, "__vector")) + if (!startswith (name, "__vector")) warning_at (loc, OPT_Wmisspelled_isr, "%qs appears to be a misspelled " "%qs handler, missing %<__vector%> prefix", name, isr); #endif // AVR-LibC naming conventions @@ -9642,7 +9639,7 @@ static tree avr_handle_addr_attribute (tree *node, tree name, tree args, int flags ATTRIBUTE_UNUSED, bool *no_add) { - bool io_p = (strncmp (IDENTIFIER_POINTER (name), "io", 2) == 0); + bool io_p = startswith (IDENTIFIER_POINTER (name), "io"); location_t loc = DECL_SOURCE_LOCATION (*node); if (!VAR_P (*node)) @@ -10055,7 +10052,7 @@ avr_asm_output_aligned_decl_common (FILE * stream, /* __gnu_lto_slim is just a marker for the linker injected by toplev.c. There is no need to trigger __do_clear_bss code for them. */ - if (!STR_PREFIX_P (name, "__gnu_lto")) + if (!startswith (name, "__gnu_lto")) avr_need_clear_bss_p = true; if (local_p) @@ -10154,7 +10151,7 @@ avr_asm_named_section (const char *name, unsigned int flags, tree decl) const char *old_prefix = ".rodata"; const char *new_prefix = avr_addrspace[as].section_name; - if (STR_PREFIX_P (name, old_prefix)) + if (startswith (name, old_prefix)) { const char *sname = ACONCAT ((new_prefix, name + strlen (old_prefix), NULL)); @@ -10167,19 +10164,19 @@ avr_asm_named_section (const char *name, unsigned int flags, tree decl) } if (!avr_need_copy_data_p) - avr_need_copy_data_p = (STR_PREFIX_P (name, ".data") - || STR_PREFIX_P (name, ".gnu.linkonce.d")); + avr_need_copy_data_p = (startswith (name, ".data") + || startswith (name, ".gnu.linkonce.d")); if (!avr_need_copy_data_p #if defined HAVE_LD_AVR_AVRXMEGA3_RODATA_IN_FLASH && avr_arch->flash_pm_offset == 0 #endif ) - avr_need_copy_data_p = (STR_PREFIX_P (name, ".rodata") - || STR_PREFIX_P (name, ".gnu.linkonce.r")); + avr_need_copy_data_p = (startswith (name, ".rodata") + || startswith (name, ".gnu.linkonce.r")); if (!avr_need_clear_bss_p) - avr_need_clear_bss_p = STR_PREFIX_P (name, ".bss"); + avr_need_clear_bss_p = startswith (name, ".bss"); default_elf_asm_named_section (name, flags, decl); } @@ -10192,7 +10189,7 @@ avr_section_type_flags (tree decl, const char *name, int reloc) { unsigned int flags = default_section_type_flags (decl, name, reloc); - if (STR_PREFIX_P (name, ".noinit")) + if (startswith (name, ".noinit")) { if (decl && TREE_CODE (decl) == VAR_DECL && DECL_INITIAL (decl) == NULL_TREE) @@ -10402,7 +10399,7 @@ avr_asm_select_section (tree decl, int reloc, unsigned HOST_WIDE_INT align) const char * old_prefix = ".rodata"; const char * new_prefix = avr_addrspace[as].section_name; - if (STR_PREFIX_P (name, old_prefix)) + if (startswith (name, old_prefix)) { const char *sname = ACONCAT ((new_prefix, name + strlen (old_prefix), NULL)); diff --git a/gcc/config/c6x/c6x.c b/gcc/config/c6x/c6x.c index f9ad1e5..e7e1d6c 100644 --- a/gcc/config/c6x/c6x.c +++ b/gcc/config/c6x/c6x.c @@ -862,14 +862,14 @@ c6x_in_small_data_p (const_tree exp) const char *section = DECL_SECTION_NAME (exp); if (strcmp (section, ".neardata") == 0 - || strncmp (section, ".neardata.", 10) == 0 - || strncmp (section, ".gnu.linkonce.s.", 16) == 0 + || startswith (section, ".neardata.") + || startswith (section, ".gnu.linkonce.s.") || strcmp (section, ".bss") == 0 - || strncmp (section, ".bss.", 5) == 0 - || strncmp (section, ".gnu.linkonce.sb.", 17) == 0 + || startswith (section, ".bss.") + || startswith (section, ".gnu.linkonce.sb.") || strcmp (section, ".rodata") == 0 - || strncmp (section, ".rodata.", 8) == 0 - || strncmp (section, ".gnu.linkonce.s2.", 
17) == 0) + || startswith (section, ".rodata.") + || startswith (section, ".gnu.linkonce.s2.")) return true; } else @@ -1063,7 +1063,7 @@ c6x_section_type_flags (tree decl, const char *name, int reloc) unsigned int flags = 0; if (strcmp (name, ".far") == 0 - || strncmp (name, ".far.", 5) == 0) + || startswith (name, ".far.")) flags |= SECTION_BSS; flags |= default_section_type_flags (decl, name, reloc); diff --git a/gcc/config/darwin-c.c b/gcc/config/darwin-c.c index b0424a9..951a998 100644 --- a/gcc/config/darwin-c.c +++ b/gcc/config/darwin-c.c @@ -808,8 +808,7 @@ darwin_cfstring_ref_p (const_tree strp) tn = DECL_NAME (tn); return (tn && IDENTIFIER_POINTER (tn) - && !strncmp (IDENTIFIER_POINTER (tn), "CFStringRef", - strlen ("CFStringRef"))); + && startswith (IDENTIFIER_POINTER (tn), "CFStringRef")); } /* At present the behavior of this is undefined and it does nothing. */ @@ -843,7 +842,7 @@ darwin_objc_declare_unresolved_class_reference (const char *name) size_t len = strlen (reference) + strlen(name) + 2; char *buf = (char *) alloca (len); - gcc_checking_assert (!strncmp (name, ".objc_class_name_", 17)); + gcc_checking_assert (startswith (name, ".objc_class_name_")); snprintf (buf, len, "%s%s", reference, name); symtab->finalize_toplevel_asm (build_string (strlen (buf), buf)); @@ -856,8 +855,8 @@ darwin_objc_declare_class_definition (const char *name) size_t len = strlen (xname) + 7 + 5; char *buf = (char *) alloca (len); - gcc_checking_assert (!strncmp (name, ".objc_class_name_", 17) - || !strncmp (name, "*.objc_category_name_", 21)); + gcc_checking_assert (startswith (name, ".objc_class_name_") + || startswith (name, "*.objc_category_name_")); /* Mimic default_globalize_label. */ snprintf (buf, len, ".globl\t%s", xname); diff --git a/gcc/config/darwin.c b/gcc/config/darwin.c index 5d17391..c4016fe 100644 --- a/gcc/config/darwin.c +++ b/gcc/config/darwin.c @@ -329,7 +329,7 @@ indirect_data (rtx sym_ref) lprefix = (((name[0] == '*' || name[0] == '&') && (name[1] == 'L' || (name[1] == '"' && name[2] == 'L'))) - || (strncmp (name, "_OBJC_", 6) == 0)); + || (startswith (name, "_OBJC_"))); return ! lprefix; } @@ -1284,7 +1284,7 @@ darwin_encode_section_info (tree decl, rtx rtl, int first) tree o2meta = lookup_attribute ("OBJC2META", DECL_ATTRIBUTES (decl)); o2meta = o2meta ? TREE_VALUE (o2meta) : NULL_TREE; - if (o2meta && strncmp (IDENTIFIER_POINTER (o2meta), "V2_IVRF",7) == 0) + if (o2meta && startswith (IDENTIFIER_POINTER (o2meta), "V2_IVRF")) SYMBOL_REF_FLAGS (sym_ref) |= MACHO_SYMBOL_FLAG_MUST_INDIRECT; #endif } @@ -1443,58 +1443,58 @@ darwin_objc2_section (tree decl ATTRIBUTE_UNUSED, tree meta, section * base) /* Most of the OBJC2 META-data end up in the base section, so check it first. 
*/ - if (!strncmp (p, "V2_BASE", 7)) + if (startswith (p, "V2_BASE")) return base; - else if (!strncmp (p, "V2_CNAM", 7)) + else if (startswith (p, "V2_CNAM")) return darwin_sections[objc2_class_names_section]; - else if (!strncmp (p, "V2_MNAM", 7)) + else if (startswith (p, "V2_MNAM")) return darwin_sections[objc2_method_names_section]; - else if (!strncmp (p, "V2_MTYP", 7)) + else if (startswith (p, "V2_MTYP")) return darwin_sections[objc2_method_types_section]; - else if (!strncmp (p, "V2_STRG", 7)) + else if (startswith (p, "V2_STRG")) return darwin_sections[cstring_section]; - else if (!strncmp (p, "G2_META", 7) || !strncmp (p, "G2_CLAS", 7)) + else if (startswith (p, "G2_META") || startswith (p, "G2_CLAS")) return darwin_sections[objc2_classdefs_section]; - else if (!strncmp (p, "V2_PCOL", 7)) + else if (startswith (p, "V2_PCOL")) return ld_uses_coal_sects ? darwin_sections[data_coal_section] : darwin_sections[objc2_data_section]; - else if (!strncmp (p, "V2_MREF", 7)) + else if (startswith (p, "V2_MREF")) return darwin_sections[objc2_message_refs_section]; - else if (!strncmp (p, "V2_CLRF", 7)) + else if (startswith (p, "V2_CLRF")) return darwin_sections[objc2_classrefs_section]; - else if (!strncmp (p, "V2_SURF", 7)) + else if (startswith (p, "V2_SURF")) return darwin_sections[objc2_super_classrefs_section]; - else if (!strncmp (p, "V2_NLCL", 7)) + else if (startswith (p, "V2_NLCL")) return darwin_sections[objc2_nonlazy_class_section]; - else if (!strncmp (p, "V2_CLAB", 7)) + else if (startswith (p, "V2_CLAB")) { classes_seen = 1; return darwin_sections[objc2_classlist_section]; } - else if (!strncmp (p, "V2_SRFS", 7)) + else if (startswith (p, "V2_SRFS")) return darwin_sections[objc2_selector_refs_section]; - else if (!strncmp (p, "V2_NLCA", 7)) + else if (startswith (p, "V2_NLCA")) return darwin_sections[objc2_nonlazy_category_section]; - else if (!strncmp (p, "V2_CALA", 7)) + else if (startswith (p, "V2_CALA")) return darwin_sections[objc2_categorylist_section]; - else if (!strncmp (p, "V2_PLST", 7)) + else if (startswith (p, "V2_PLST")) return darwin_sections[objc2_protocollist_section]; - else if (!strncmp (p, "V2_PRFS", 7)) + else if (startswith (p, "V2_PRFS")) return darwin_sections[objc2_protocolrefs_section]; - else if (!strncmp (p, "V2_INFO", 7)) + else if (startswith (p, "V2_INFO")) return darwin_sections[objc2_image_info_section]; - else if (!strncmp (p, "V2_EHTY", 7)) + else if (startswith (p, "V2_EHTY")) return ld_uses_coal_sects ? darwin_sections[data_coal_section] : data_section; - else if (!strncmp (p, "V2_CSTR", 7)) + else if (startswith (p, "V2_CSTR")) return darwin_sections[objc2_constant_string_object_section]; - else if (!strncmp (p, "V2_IVRF", 7)) + else if (startswith (p, "V2_IVRF")) return darwin_sections[objc2_ivar_section]; /* Not recognized, default. */ @@ -1515,72 +1515,72 @@ darwin_objc1_section (tree decl ATTRIBUTE_UNUSED, tree meta, section * base) objc_metadata_seen = 1; /* String sections first, cos there are lots of strings. 
*/ - if (!strncmp (p, "V1_STRG", 7)) + if (startswith (p, "V1_STRG")) return darwin_sections[cstring_section]; - else if (!strncmp (p, "V1_CLSN", 7)) + else if (startswith (p, "V1_CLSN")) return darwin_sections[objc_class_names_section]; - else if (!strncmp (p, "V1_METN", 7)) + else if (startswith (p, "V1_METN")) return darwin_sections[objc_meth_var_names_section]; - else if (!strncmp (p, "V1_METT", 7)) + else if (startswith (p, "V1_METT")) return darwin_sections[objc_meth_var_types_section]; - else if (!strncmp (p, "V1_CLAS", 7)) + else if (startswith (p, "V1_CLAS")) { classes_seen = 1; return darwin_sections[objc_class_section]; } - else if (!strncmp (p, "V1_META", 7)) + else if (startswith (p, "V1_META")) return darwin_sections[objc_meta_class_section]; - else if (!strncmp (p, "V1_CATG", 7)) + else if (startswith (p, "V1_CATG")) return darwin_sections[objc_category_section]; - else if (!strncmp (p, "V1_PROT", 7)) + else if (startswith (p, "V1_PROT")) return darwin_sections[objc_protocol_section]; - else if (!strncmp (p, "V1_CLCV", 7)) + else if (startswith (p, "V1_CLCV")) return darwin_sections[objc_class_vars_section]; - else if (!strncmp (p, "V1_CLIV", 7)) + else if (startswith (p, "V1_CLIV")) return darwin_sections[objc_instance_vars_section]; - else if (!strncmp (p, "V1_CLCM", 7)) + else if (startswith (p, "V1_CLCM")) return darwin_sections[objc_cls_meth_section]; - else if (!strncmp (p, "V1_CLIM", 7)) + else if (startswith (p, "V1_CLIM")) return darwin_sections[objc_inst_meth_section]; - else if (!strncmp (p, "V1_CACM", 7)) + else if (startswith (p, "V1_CACM")) return darwin_sections[objc_cat_cls_meth_section]; - else if (!strncmp (p, "V1_CAIM", 7)) + else if (startswith (p, "V1_CAIM")) return darwin_sections[objc_cat_inst_meth_section]; - else if (!strncmp (p, "V1_PNSM", 7)) + else if (startswith (p, "V1_PNSM")) return darwin_sections[objc_cat_inst_meth_section]; - else if (!strncmp (p, "V1_PCLM", 7)) + else if (startswith (p, "V1_PCLM")) return darwin_sections[objc_cat_cls_meth_section]; - else if (!strncmp (p, "V1_CLPR", 7)) + else if (startswith (p, "V1_CLPR")) return darwin_sections[objc_cat_cls_meth_section]; - else if (!strncmp (p, "V1_CAPR", 7)) + else if (startswith (p, "V1_CAPR")) return darwin_sections[objc_category_section]; /* ??? CHECK me. 
*/ - else if (!strncmp (p, "V1_PRFS", 7)) + else if (startswith (p, "V1_PRFS")) return darwin_sections[objc_cat_cls_meth_section]; - else if (!strncmp (p, "V1_CLRF", 7)) + else if (startswith (p, "V1_CLRF")) return darwin_sections[objc_cls_refs_section]; - else if (!strncmp (p, "V1_SRFS", 7)) + else if (startswith (p, "V1_SRFS")) return darwin_sections[objc_selector_refs_section]; - else if (!strncmp (p, "V1_MODU", 7)) + else if (startswith (p, "V1_MODU")) return darwin_sections[objc_module_info_section]; - else if (!strncmp (p, "V1_SYMT", 7)) + else if (startswith (p, "V1_SYMT")) return darwin_sections[objc_symbols_section]; - else if (!strncmp (p, "V1_INFO", 7)) + else if (startswith (p, "V1_INFO")) return darwin_sections[objc_image_info_section]; - else if (!strncmp (p, "V1_PLST", 7)) + else if (startswith (p, "V1_PLST")) return darwin_sections[objc1_prop_list_section]; - else if (!strncmp (p, "V1_PEXT", 7)) + else if (startswith (p, "V1_PEXT")) return darwin_sections[objc1_protocol_ext_section]; - else if (!strncmp (p, "V1_CEXT", 7)) + else if (startswith (p, "V1_CEXT")) return darwin_sections[objc1_class_ext_section]; - else if (!strncmp (p, "V2_CSTR", 7)) + else if (startswith (p, "V2_CSTR")) return darwin_sections[objc_constant_string_object_section]; return base; @@ -1747,7 +1747,7 @@ machopic_select_section (tree decl, && DECL_NAME (decl) && TREE_CODE (DECL_NAME (decl)) == IDENTIFIER_NODE && IDENTIFIER_POINTER (DECL_NAME (decl)) - && !strncmp (IDENTIFIER_POINTER (DECL_NAME (decl)), "_OBJC_", 6)) + && startswith (IDENTIFIER_POINTER (DECL_NAME (decl)), "_OBJC_")) /* c) legacy meta-data selection was deprecated at 4.6, removed now. */ gcc_unreachable (); @@ -1869,15 +1869,15 @@ finalize_dtors () void darwin_globalize_label (FILE *stream, const char *name) { - if (!!strncmp (name, "_OBJC_", 6)) + if (!startswith (name, "_OBJC_")) default_globalize_label (stream, name); /* We have some Objective C cases that need to be global, but only on newer OS versions. 
*/ if (flag_objc_abi < 2 || flag_next_runtime < 100700) return; - if (!strncmp (name+6, "LabelPro", 8)) + if (startswith (name+6, "LabelPro")) default_globalize_label (stream, name); - if (!strncmp (name+6, "Protocol_", 9)) + if (startswith (name+6, "Protocol_")) default_globalize_label (stream, name); } @@ -1897,7 +1897,7 @@ darwin_label_is_anonymous_local_objc_name (const char *name) while (*p >= '0' && *p <= '9') p++; } - if (strncmp ((const char *)p, "_OBJC_", 6) != 0) + if (!startswith ((const char *)p, "_OBJC_")) return false; /* We need some of the objective c meta-data symbols to be visible to the @@ -1908,36 +1908,36 @@ darwin_label_is_anonymous_local_objc_name (const char *name) return true; p += 6; - if (!strncmp ((const char *)p, "ClassRef", 8)) + if (startswith ((const char *)p, "ClassRef")) return false; - else if (!strncmp ((const char *)p, "SelRef", 6)) + else if (startswith ((const char *)p, "SelRef")) return false; - else if (!strncmp ((const char *)p, "Category", 8)) + else if (startswith ((const char *)p, "Category")) { if (p[8] == '_' || p[8] == 'I' || p[8] == 'P' || p[8] == 'C' ) return false; return true; } - else if (!strncmp ((const char *)p, "ClassMethods", 12)) + else if (startswith ((const char *)p, "ClassMethods")) return false; - else if (!strncmp ((const char *)p, "Instance", 8)) + else if (startswith ((const char *)p, "Instance")) { if (p[8] == 'I' || p[8] == 'M') return false; return true; } - else if (!strncmp ((const char *)p, "CLASS_RO", 8)) + else if (startswith ((const char *)p, "CLASS_RO")) return false; - else if (!strncmp ((const char *)p, "METACLASS_RO", 12)) + else if (startswith ((const char *)p, "METACLASS_RO")) return false; - else if (!strncmp ((const char *)p, "Protocol", 8)) + else if (startswith ((const char *)p, "Protocol")) { if (p[8] == '_' || p[8] == 'I' || p[8] == 'P' || p[8] == 'M' || p[8] == 'C' || p[8] == 'O') return false; return true; } - else if (!strncmp ((const char *)p, "LabelPro", 8)) + else if (startswith ((const char *)p, "LabelPro")) return false; return true; } @@ -2032,8 +2032,7 @@ darwin_asm_named_section (const char *name, { /* LTO sections go in a special section that encapsulates the (unlimited) number of GNU LTO sections within a single mach-o one. */ - if (strncmp (name, LTO_SECTION_NAME_PREFIX, - strlen (LTO_SECTION_NAME_PREFIX)) == 0) + if (startswith (name, LTO_SECTION_NAME_PREFIX)) { darwin_lto_section_e e; /* We expect certain flags to be set... 
*/ @@ -2062,9 +2061,9 @@ darwin_asm_named_section (const char *name, vec_alloc (lto_section_names, 16); vec_safe_push (lto_section_names, e); } - else if (strncmp (name, "__DWARF,", 8) == 0) + else if (startswith (name, "__DWARF,")) darwin_asm_dwarf_section (name, flags, decl, false); - else if (strncmp (name, "__GNU_DWARF_LTO,", 16) == 0) + else if (startswith (name, "__GNU_DWARF_LTO,")) darwin_asm_dwarf_section (name, flags, decl, true); else fprintf (asm_out_file, "\t.section %s\n", name); @@ -2973,9 +2972,9 @@ darwin_asm_output_dwarf_offset (FILE *file, int size, const char * lab, const char *lto_add = ""; gcc_checking_assert (base->common.flags & SECTION_NAMED); - is_for_lto = strncmp (base->named.name, "__GNU_DWARF_LTO,", 16) == 0; + is_for_lto = startswith (base->named.name, "__GNU_DWARF_LTO,"); gcc_checking_assert (is_for_lto - || strncmp (base->named.name, "__DWARF,", 8) == 0); + || startswith (base->named.name, "__DWARF,")); const char *name = strchr (base->named.name, ',') + 1; gcc_checking_assert (name); diff --git a/gcc/config/frv/frv.c b/gcc/config/frv/frv.c index 8201a20..a7f7f08 100644 --- a/gcc/config/frv/frv.c +++ b/gcc/config/frv/frv.c @@ -262,7 +262,6 @@ static frv_stack_t *frv_stack_cache = (frv_stack_t *)0; static void frv_option_override (void); static bool frv_legitimate_address_p (machine_mode, rtx, bool); static int frv_default_flags_for_cpu (void); -static int frv_string_begins_with (const char *, const char *); static FRV_INLINE bool frv_small_data_reloc_p (rtx, int); static void frv_print_operand (FILE *, rtx, int); static void frv_print_operand_address (FILE *, machine_mode, rtx); @@ -783,17 +782,6 @@ frv_option_override (void) } -/* Return true if NAME (a STRING_CST node) begins with PREFIX. */ - -static int -frv_string_begins_with (const char *name, const char *prefix) -{ - const int prefix_len = strlen (prefix); - - /* Remember: NAME's length includes the null terminator. */ - return (strncmp (name, prefix, prefix_len) == 0); -} - /* Implement TARGET_CONDITIONAL_REGISTER_USAGE. 
*/ static void @@ -9312,9 +9300,9 @@ frv_in_small_data_p (const_tree decl) section_name = DECL_SECTION_NAME (decl); if (section_name) { - if (frv_string_begins_with (section_name, ".sdata")) + if (startswith (section_name, ".sdata")) return true; - if (frv_string_begins_with (section_name, ".sbss")) + if (startswith (section_name, ".sbss")) return true; return false; } diff --git a/gcc/config/gcn/mkoffload.c b/gcc/config/gcn/mkoffload.c index dc9d518..5432f95 100644 --- a/gcc/config/gcn/mkoffload.c +++ b/gcc/config/gcn/mkoffload.c @@ -825,8 +825,7 @@ main (int argc, char **argv) bool fpic = false; for (int i = 1; i < argc; i++) { -#define STR "-foffload-abi=" - if (strncmp (argv[i], STR, strlen (STR)) == 0) + if (startswith (argv[i], "-foffload-abi=")) { if (strcmp (argv[i] + strlen (STR), "lp64") == 0) offload_abi = OFFLOAD_ABI_LP64; @@ -836,7 +835,6 @@ main (int argc, char **argv) fatal_error (input_location, "unrecognizable argument of option " STR); } -#undef STR else if (strcmp (argv[i], "-fopenmp") == 0) fopenmp = true; else if (strcmp (argv[i], "-fopenacc") == 0) @@ -995,9 +993,9 @@ main (int argc, char **argv) obstack_ptr_grow (&ld_argv_obstack, "-lgomp"); for (int i = 1; i < argc; i++) - if (strncmp (argv[i], "-l", 2) == 0 - || strncmp (argv[i], "-Wl", 3) == 0 - || strncmp (argv[i], "-march", 6) == 0) + if (startswith (argv[i], "-l") + || startswith (argv[i], "-Wl") + || startswith (argv[i], "-march")) obstack_ptr_grow (&ld_argv_obstack, argv[i]); obstack_ptr_grow (&cc_argv_obstack, "-dumpdir"); diff --git a/gcc/config/i386/i386-builtins.c b/gcc/config/i386/i386-builtins.c index 8036aed..204e290 100644 --- a/gcc/config/i386/i386-builtins.c +++ b/gcc/config/i386/i386-builtins.c @@ -1992,7 +1992,7 @@ get_builtin_code_for_version (tree decl, tree *predicate_list) while (token != NULL) { /* Do not process "arch=" */ - if (strncmp (token, "arch=", 5) == 0) + if (startswith (token, "arch=")) { token = strtok (NULL, ","); continue; diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c index 64c6ef4..0eccb54 100644 --- a/gcc/config/i386/i386-options.c +++ b/gcc/config/i386/i386-options.c @@ -1904,7 +1904,7 @@ ix86_option_override_internal (bool main_args_p, /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string or defaulted. We need to use a sensible tune option. 
*/ - if (!strncmp (opts->x_ix86_tune_string, "x86-64", 6) + if (startswith (opts->x_ix86_tune_string, "x86-64") && (opts->x_ix86_tune_string[6] == '\0' || (!strcmp (opts->x_ix86_tune_string + 6, "-v2") || !strcmp (opts->x_ix86_tune_string + 6, "-v3") diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index befe69e..743d8a2 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -752,9 +752,8 @@ x86_64_elf_section_type_flags (tree decl, const char *name, int reloc) flags |= SECTION_RELRO; if (strcmp (name, ".lbss") == 0 - || strncmp (name, ".lbss.", sizeof (".lbss.") - 1) == 0 - || strncmp (name, ".gnu.linkonce.lb.", - sizeof (".gnu.linkonce.lb.") - 1) == 0) + || startswith (name, ".lbss.") + || startswith (name, ".gnu.linkonce.lb.")) flags |= SECTION_BSS; return flags; @@ -21500,7 +21499,7 @@ ix86_md_asm_adjust (vec &outputs, vec & /*inputs*/, for (unsigned i = 0, n = outputs.length (); i < n; ++i) { const char *con = constraints[i]; - if (strncmp (con, "=@cc", 4) != 0) + if (!startswith (con, "=@cc")) continue; con += 4; if (strchr (con, ',') != NULL) diff --git a/gcc/config/i386/intelmic-mkoffload.c b/gcc/config/i386/intelmic-mkoffload.c index 475f071..cb946d6 100644 --- a/gcc/config/i386/intelmic-mkoffload.c +++ b/gcc/config/i386/intelmic-mkoffload.c @@ -613,8 +613,7 @@ main (int argc, char **argv) /* Scan the argument vector. */ for (int i = 1; i < argc; i++) { -#define STR "-foffload-abi=" - if (strncmp (argv[i], STR, strlen (STR)) == 0) + if (startswith (argv[i], "-foffload-abi=")) { if (strcmp (argv[i] + strlen (STR), "lp64") == 0) offload_abi = OFFLOAD_ABI_LP64; @@ -624,7 +623,6 @@ main (int argc, char **argv) fatal_error (input_location, "unrecognizable argument of option " STR); } -#undef STR else if (strcmp (argv[i], "-save-temps") == 0) save_temps = true; else if (strcmp (argv[i], "-v") == 0) diff --git a/gcc/config/i386/winnt.c b/gcc/config/i386/winnt.c index b66263a..4158a45 100644 --- a/gcc/config/i386/winnt.c +++ b/gcc/config/i386/winnt.c @@ -505,8 +505,7 @@ i386_pe_asm_named_section (const char *name, unsigned int flags, /* LTO sections need 1-byte alignment to avoid confusing the zlib decompression algorithm with trailing zero pad bytes. 
*/ - if (strncmp (name, LTO_SECTION_NAME_PREFIX, - strlen (LTO_SECTION_NAME_PREFIX)) == 0) + if (startswith (name, LTO_SECTION_NAME_PREFIX)) *f++ = '0'; *f = '\0'; @@ -797,7 +796,7 @@ i386_pe_file_end (void) oname = name; if (name[0] == '.') ++name; - if (strncmp (name, "refptr.", 7) != 0) + if (!startswith (name, "refptr.")) continue; name += 7; fprintf (asm_out_file, "\t.section\t.rdata$%s, \"dr\"\n" diff --git a/gcc/config/ia64/ia64.c b/gcc/config/ia64/ia64.c index f1a6de1..632b9df 100644 --- a/gcc/config/ia64/ia64.c +++ b/gcc/config/ia64/ia64.c @@ -10007,11 +10007,11 @@ ia64_in_small_data_p (const_tree exp) const char *section = DECL_SECTION_NAME (exp); if (strcmp (section, ".sdata") == 0 - || strncmp (section, ".sdata.", 7) == 0 - || strncmp (section, ".gnu.linkonce.s.", 16) == 0 + || startswith (section, ".sdata.") + || startswith (section, ".gnu.linkonce.s.") || strcmp (section, ".sbss") == 0 - || strncmp (section, ".sbss.", 6) == 0 - || strncmp (section, ".gnu.linkonce.sb.", 17) == 0) + || startswith (section, ".sbss.") + || startswith (section, ".gnu.linkonce.sb.")) return true; } else @@ -10869,13 +10869,13 @@ ia64_section_type_flags (tree decl, const char *name, int reloc) unsigned int flags = 0; if (strcmp (name, ".sdata") == 0 - || strncmp (name, ".sdata.", 7) == 0 - || strncmp (name, ".gnu.linkonce.s.", 16) == 0 - || strncmp (name, ".sdata2.", 8) == 0 - || strncmp (name, ".gnu.linkonce.s2.", 17) == 0 + || startswith (name, ".sdata.") + || startswith (name, ".gnu.linkonce.s.") + || startswith (name, ".sdata2.") + || startswith (name, ".gnu.linkonce.s2.") || strcmp (name, ".sbss") == 0 - || strncmp (name, ".sbss.", 6) == 0 - || strncmp (name, ".gnu.linkonce.sb.", 17) == 0) + || startswith (name, ".sbss.") + || startswith (name, ".gnu.linkonce.sb.")) flags = SECTION_SMALL; flags |= default_section_type_flags (decl, name, reloc); diff --git a/gcc/config/mips/driver-native.c b/gcc/config/mips/driver-native.c index eaf5f7e..46bb3ca 100644 --- a/gcc/config/mips/driver-native.c +++ b/gcc/config/mips/driver-native.c @@ -57,7 +57,7 @@ host_detect_local_cpu (int argc, const char **argv) return NULL; while (fgets (buf, sizeof (buf), f) != NULL) - if (strncmp (buf, "cpu model", sizeof ("cpu model") - 1) == 0) + if (startswith (buf, "cpu model")) { if (strstr (buf, "Godson2 V0.2") != NULL || strstr (buf, "Loongson-2 V0.2") != NULL diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c index 3155459..e5ba273 100644 --- a/gcc/config/mips/mips.c +++ b/gcc/config/mips/mips.c @@ -1513,14 +1513,14 @@ mips_handle_interrupt_attr (tree *node ATTRIBUTE_UNUSED, tree name, tree args, *no_add_attrs = true; } else if (strcmp (TREE_STRING_POINTER (cst), "eic") != 0 - && strncmp (TREE_STRING_POINTER (cst), "vector=", 7) != 0) + && !startswith (TREE_STRING_POINTER (cst), "vector=")) { warning (OPT_Wattributes, "argument to %qE attribute is neither eic, nor " "vector=", name); *no_add_attrs = true; } - else if (strncmp (TREE_STRING_POINTER (cst), "vector=", 7) == 0) + else if (startswith (TREE_STRING_POINTER (cst), "vector=")) { const char *arg = TREE_STRING_POINTER (cst) + 7; @@ -1849,7 +1849,7 @@ static bool mips16_stub_function_p (const_rtx x) { return (GET_CODE (x) == SYMBOL_REF - && strncmp (XSTR (x, 0), "__mips16_", 9) == 0); + && startswith (XSTR (x, 0), "__mips16_")); } /* Return true if function X is a locally-defined and locally-binding @@ -9323,7 +9323,7 @@ mips_function_rodata_section (tree decl, bool) if (decl && DECL_SECTION_NAME (decl)) { const char *name = DECL_SECTION_NAME (decl); - 
if (DECL_COMDAT_GROUP (decl) && strncmp (name, ".gnu.linkonce.t.", 16) == 0) + if (DECL_COMDAT_GROUP (decl) && startswith (name, ".gnu.linkonce.t.")) { char *rname = ASTRDUP (name); rname[14] = 'd'; @@ -9331,7 +9331,7 @@ mips_function_rodata_section (tree decl, bool) } else if (flag_function_sections && flag_data_sections - && strncmp (name, ".text.", 6) == 0) + && startswith (name, ".text.")) { char *rname = ASTRDUP (name); memcpy (rname + 1, "data", 4); diff --git a/gcc/config/msp430/msp430.c b/gcc/config/msp430/msp430.c index 581e051..1cdacb7 100644 --- a/gcc/config/msp430/msp430.c +++ b/gcc/config/msp430/msp430.c @@ -122,7 +122,7 @@ msp430_mcu_name (void) /* The 'i' in the device name symbol for msp430i* devices must be lower case, to match the expected symbol in msp430.h. */ - if (strncmp (target_mcu, "msp430i", 7) == 0) + if (startswith (target_mcu, "msp430i")) { snprintf (mcu_name, sizeof (mcu_name) - 1, "__MSP430i%s__", target_mcu + 7); @@ -2466,7 +2466,7 @@ msp430_function_section (tree decl, enum node_frequency freq, bool startup, const char * prefix = gen_prefix (decl); if (prefix == NULL - || strncmp (name, prefix, strlen (prefix)) == 0) + || startswith (name, prefix)) return default_function_section (decl, freq, startup, exit); name = ACONCAT ((prefix, name, NULL)); @@ -2479,11 +2479,11 @@ msp430_function_section (tree decl, enum node_frequency freq, bool startup, unsigned int msp430_section_type_flags (tree decl, const char * name, int reloc) { - if (strncmp (name, lower_prefix, strlen (lower_prefix)) == 0) + if (startswith (name, lower_prefix)) name += strlen (lower_prefix); - else if (strncmp (name, upper_prefix, strlen (upper_prefix)) == 0) + else if (startswith (name, upper_prefix)) name += strlen (upper_prefix); - else if (strncmp (name, either_prefix, strlen (either_prefix)) == 0) + else if (startswith (name, either_prefix)) name += strlen (either_prefix); return default_section_type_flags (decl, name, reloc); @@ -3243,8 +3243,7 @@ msp430_expand_helper (rtx *operands, const char *helper_name, machine_mode arg0mode = GET_MODE (operands[0]); machine_mode arg1mode = GET_MODE (operands[1]); machine_mode arg2mode = GET_MODE (operands[2]); - int expand_mpy = strncmp (helper_name, "__mspabi_mpy", - sizeof ("__mspabi_mpy") - 1) == 0; + int expand_mpy = startswith (helper_name, "__mspabi_mpy"); /* This function has been used incorrectly if CONST_VARIANTS is TRUE for a hwmpy function. 
*/ gcc_assert (!(expand_mpy && const_variants)); diff --git a/gcc/config/nios2/nios2.c b/gcc/config/nios2/nios2.c index bf5e2be..26d4333 100644 --- a/gcc/config/nios2/nios2.c +++ b/gcc/config/nios2/nios2.c @@ -2336,9 +2336,9 @@ static bool nios2_small_section_name_p (const char *section) { return (strcmp (section, ".sbss") == 0 - || strncmp (section, ".sbss.", 6) == 0 + || startswith (section, ".sbss.") || strcmp (section, ".sdata") == 0 - || strncmp (section, ".sdata.", 7) == 0 + || startswith (section, ".sdata.") || (nios2_gprel_sec && regexec (&nios2_gprel_sec_regex, section, 0, NULL, 0) == 0)); } @@ -4199,12 +4199,12 @@ nios2_valid_target_attribute_rec (tree args) *p = '\0'; if (eq) *eq = '\0'; - if (!strncmp (argstr, "no-", 3)) + if (startswith (argstr, "no-")) { no_opt = true; argstr += 3; } - if (!strncmp (argstr, "custom-fpu-cfg", 14)) + if (startswith (argstr, "custom-fpu-cfg")) { char *end_eq = p; if (no_opt) @@ -4225,13 +4225,12 @@ nios2_valid_target_attribute_rec (tree args) nios2_handle_custom_fpu_cfg (eq, end_eq + 1, true); } - else if (!strncmp (argstr, "custom-", 7)) + else if (startswith (argstr, "custom-")) { int code = -1; unsigned int i; for (i = 0; i < ARRAY_SIZE (nios2_fpu_insn); i++) - if (!strncmp (argstr + 7, N2FPU_NAME (i), - strlen (N2FPU_NAME (i)))) + if (startswith (argstr + 7, N2FPU_NAME (i))) { /* Found insn. */ code = i; diff --git a/gcc/config/nvptx/mkoffload.c b/gcc/config/nvptx/mkoffload.c index b0a4dfa..c46c85d 100644 --- a/gcc/config/nvptx/mkoffload.c +++ b/gcc/config/nvptx/mkoffload.c @@ -256,13 +256,13 @@ process (FILE *in, FILE *out) case '\n': fprintf (out, "\\n\"\n\t\""); /* Look for mappings on subsequent lines. */ - while (strncmp (input + i, "//:", 3) == 0) + while (startswith (input + i, "//:")) { i += 3; - if (strncmp (input + i, "VAR_MAP ", 8) == 0) + if (startswith (input + i, "VAR_MAP ")) record_id (input + i + 8, &vars_tail); - else if (strncmp (input + i, "FUNC_MAP ", 9) == 0) + else if (startswith (input + i, "FUNC_MAP ")) record_id (input + i + 9, &funcs_tail); else abort (); @@ -481,8 +481,7 @@ main (int argc, char **argv) bool fpic = false; for (int i = 1; i < argc; i++) { -#define STR "-foffload-abi=" - if (strncmp (argv[i], STR, strlen (STR)) == 0) + if (startswith (argv[i], "-foffload-abi=")) { if (strcmp (argv[i] + strlen (STR), "lp64") == 0) offload_abi = OFFLOAD_ABI_LP64; @@ -492,7 +491,6 @@ main (int argc, char **argv) fatal_error (input_location, "unrecognizable argument of option " STR); } -#undef STR else if (strcmp (argv[i], "-fopenmp") == 0) fopenmp = true; else if (strcmp (argv[i], "-fopenacc") == 0) diff --git a/gcc/config/pa/som.h b/gcc/config/pa/som.h index d25a2ed..05cc315 100644 --- a/gcc/config/pa/som.h +++ b/gcc/config/pa/som.h @@ -47,32 +47,29 @@ along with GCC; see the file COPYING3. 
If not see do { \ static int in_shlib_list = 0; \ while (*PTR == ' ') PTR++; \ - if (strncmp (PTR, "shared library list:", \ - sizeof ("shared library list:") - 1) == 0) \ + if (startswith (PTR, "shared library list:")) \ { \ PTR = 0; \ in_shlib_list = 1; \ } \ - else if (strncmp (PTR, "shared library binding:", \ - sizeof ("shared library binding:") - 1) == 0)\ + else if (startswith (PTR, "shared library binding:")) \ { \ PTR = 0; \ in_shlib_list = 0; \ } \ - else if (strncmp (PTR, "static branch prediction disabled", \ - sizeof ("static branch prediction disabled") - 1) == 0)\ + else if (startswith (PTR, "static branch prediction disabled")) \ { \ PTR = 0; \ in_shlib_list = 0; \ } \ else if (in_shlib_list \ - && strncmp (PTR, "dynamic", sizeof ("dynamic") - 1) == 0) \ + && startswith (PTR, "dynamic")) \ { \ PTR += sizeof ("dynamic") - 1; \ while (*p == ' ') PTR++; \ } \ else if (in_shlib_list \ - && strncmp (PTR, "static", sizeof ("static") - 1) == 0) \ + && startswith (PTR, "static")) \ { \ PTR += sizeof ("static") - 1; \ while (*p == ' ') PTR++; \ diff --git a/gcc/config/pdp11/pdp11.c b/gcc/config/pdp11/pdp11.c index eb3bea4..b663b43 100644 --- a/gcc/config/pdp11/pdp11.c +++ b/gcc/config/pdp11/pdp11.c @@ -2251,7 +2251,7 @@ static void pdp11_output_ident (const char *ident) { if (TARGET_DEC_ASM) { - if (strncmp (ident, "GCC:", 4) != 0) + if (!startswith (ident, "GCC:")) fprintf (asm_out_file, "\t.ident\t\"%s\"\n", ident); } diff --git a/gcc/config/riscv/riscv.c b/gcc/config/riscv/riscv.c index 27665e5..1baa299 100644 --- a/gcc/config/riscv/riscv.c +++ b/gcc/config/riscv/riscv.c @@ -3593,7 +3593,7 @@ riscv_elf_select_rtx_section (machine_mode mode, rtx x, if (riscv_size_ok_for_small_data_p (GET_MODE_SIZE (mode))) { - if (strncmp (s->named.name, ".rodata.cst", strlen (".rodata.cst")) == 0) + if (startswith (s->named.name, ".rodata.cst")) { /* Rename .rodata.cst* to .srodata.cst*. */ char *name = (char *) alloca (strlen (s->named.name) + 2); diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 53a9f54..c304596 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -17205,12 +17205,12 @@ toc_hasher::equal (toc_hash_struct *h1, toc_hash_struct *h2) instead, there should be some programmatic way of inquiring as to whether or not an object is a vtable. 
*/ -#define VTABLE_NAME_P(NAME) \ - (strncmp ("_vt.", name, strlen ("_vt.")) == 0 \ - || strncmp ("_ZTV", name, strlen ("_ZTV")) == 0 \ - || strncmp ("_ZTT", name, strlen ("_ZTT")) == 0 \ - || strncmp ("_ZTI", name, strlen ("_ZTI")) == 0 \ - || strncmp ("_ZTC", name, strlen ("_ZTC")) == 0) +#define VTABLE_NAME_P(NAME) \ + (startswith (name, "_vt.") \ + || startswith (name, "_ZTV") \ + || startswith (name, "_ZTT") \ + || startswith (name, "_ZTI") \ + || startswith (name, "_ZTC")) #ifdef NO_DOLLAR_IN_LABEL /* Return a GGC-allocated character string translating dollar signs in @@ -24204,7 +24204,7 @@ rs6000_inner_target_options (tree args, bool attr_p) const char *cpu_opt = NULL; p = NULL; - if (strncmp (q, "cpu=", 4) == 0) + if (startswith (q, "cpu=")) { int cpu_index = rs6000_cpu_name_lookup (q+4); if (cpu_index >= 0) @@ -24215,7 +24215,7 @@ rs6000_inner_target_options (tree args, bool attr_p) cpu_opt = q+4; } } - else if (strncmp (q, "tune=", 5) == 0) + else if (startswith (q, "tune=")) { int tune_index = rs6000_cpu_name_lookup (q+5); if (tune_index >= 0) @@ -24233,7 +24233,7 @@ rs6000_inner_target_options (tree args, bool attr_p) char *r = q; error_p = true; - if (strncmp (r, "no-", 3) == 0) + if (startswith (r, "no-")) { invert = true; r += 3; diff --git a/gcc/config/s390/driver-native.c b/gcc/config/s390/driver-native.c index c024715..71c4ff6 100644 --- a/gcc/config/s390/driver-native.c +++ b/gcc/config/s390/driver-native.c @@ -73,7 +73,7 @@ s390_host_detect_local_cpu (int argc, const char **argv) (has_features == 0 || has_processor == 0) && fgets (buf, sizeof (buf), f) != NULL; ) { - if (has_processor == 0 && strncmp (buf, "processor", 9) == 0) + if (has_processor == 0 && startswith (buf, "processor")) { const char *p; long machine_id; @@ -128,7 +128,7 @@ s390_host_detect_local_cpu (int argc, const char **argv) break; } } - if (has_features == 0 && strncmp (buf, "features", 8) == 0) + if (has_features == 0 && startswith (buf, "features")) { const char *p; @@ -144,13 +144,13 @@ s390_host_detect_local_cpu (int argc, const char **argv) p++; for (i = 0; !ISSPACE (p[i]) && p[i] != 0; i++) ; - if (i == 3 && strncmp (p, "dfp", 3) == 0) + if (i == 3 && startswith (p, "dfp")) has_dfp = 1; - else if (i == 2 && strncmp (p, "te", 2) == 0) + else if (i == 2 && startswith (p, "te")) has_te = 1; - else if (i == 2 && strncmp (p, "vx", 2) == 0) + else if (i == 2 && startswith (p, "vx")) has_vx = 1; - else if (i == 8 && strncmp (p, "highgprs", 8) == 0) + else if (i == 8 && startswith (p, "highgprs")) has_highgprs = 1; p += i; } diff --git a/gcc/config/sparc/driver-sparc.c b/gcc/config/sparc/driver-sparc.c index f70c53f..698c18e 100644 --- a/gcc/config/sparc/driver-sparc.c +++ b/gcc/config/sparc/driver-sparc.c @@ -148,7 +148,7 @@ host_detect_local_cpu (int argc, const char **argv) return NULL; while (fgets (buf, sizeof (buf), f) != NULL) - if (strncmp (buf, "cpu\t\t:", sizeof ("cpu\t\t:") - 1) == 0) + if (startswith (buf, "cpu\t\t:")) { for (i = 0; cpu_names [i].name; i++) if (strstr (buf, cpu_names [i].name) != NULL) diff --git a/gcc/config/vax/vax.c b/gcc/config/vax/vax.c index 96a7925..3aacd1e 100644 --- a/gcc/config/vax/vax.c +++ b/gcc/config/vax/vax.c @@ -1325,10 +1325,10 @@ vax_output_int_move (rtx insn ATTRIBUTE_UNUSED, rtx *operands, be shorter (1 opcode byte + 1 addrmode byte + 8 immediate value bytes .vs. 2 opcode bytes + 2 addrmode bytes + 8 immediate value value bytes. 
*/ - if ((!strncmp (pattern_lo, "movl", 4) - && !strncmp (pattern_hi, "movl", 4)) - || (!strncmp (pattern_lo, "pushl", 5) - && !strncmp (pattern_hi, "pushl", 5))) + if ((startswith (pattern_lo, "movl") + && startswith (pattern_hi, "movl")) + || (startswith (pattern_lo, "pushl") + && startswith (pattern_hi, "pushl"))) return "movq %1,%0"; if (MEM_P (operands[0]) diff --git a/gcc/config/vms/vms-ld.c b/gcc/config/vms/vms-ld.c index 451ad0d..121aebd 100644 --- a/gcc/config/vms/vms-ld.c +++ b/gcc/config/vms/vms-ld.c @@ -94,6 +94,14 @@ static int translate_unix (char *, int); #endif +/* Return 1 if STR string starts with PREFIX. */ + +static inline int +startswith (const char *str, const char *prefix) +{ + return strncmp (str, prefix, strlen (prefix)) == 0; +} + /* Append STR to the command line to invoke the linker. Expand the line as necessary to accommodate. */ @@ -319,7 +327,7 @@ process_args (int argc, char **argv) for (i = 1; i < argc; i++) { - if (strncmp (argv[i], "-L", 2) == 0) + if (startswith (argv[i], "-L")) { search_dirs = XRESIZEVEC(const char *, search_dirs, search_dirs_len + 1); @@ -341,7 +349,7 @@ process_args (int argc, char **argv) } else if (strcmp (argv[i], "-g0") == 0) addarg ("/notraceback"); - else if (strncmp (argv[i], "-g", 2) == 0) + else if (startswith (argv[i], "-g")) { addarg ("/debug"); debug = 1; @@ -654,7 +662,7 @@ main (int argc, char **argv) /* Already handled. */ i++; } - else if (arg_len > 2 && strncmp (argv[i], "-l", 2) == 0) + else if (arg_len > 2 && startswith (argv[i], "-l")) { const char *libname; @@ -676,17 +684,17 @@ main (int argc, char **argv) } } else if (strcmp (argv[i], "-v" ) == 0 - || strncmp (argv[i], "-g", 2 ) == 0 + || startswith (argv[i], "-g") || strcmp (argv[i], "-static" ) == 0 || strcmp (argv[i], "-map" ) == 0 || strcmp (argv[i], "-save-temps") == 0 || strcmp (argv[i], "--noinhibit-exec") == 0 - || (arg_len > 2 && strncmp (argv[i], "-L", 2) == 0) - || (arg_len >= 6 && strncmp (argv[i], "-share", 6) == 0)) + || (arg_len > 2 && startswith (argv[i], "-L")) + || (arg_len >= 6 && startswith (argv[i], "-share"))) { /* Already handled. */ } - else if (strncmp (argv[i], "--opt=", 6) == 0) + else if (startswith (argv[i], "--opt=")) fprintf (optfile, "%s\n", argv[i] + 6); else if (arg_len > 1 && argv[i][0] == '@') { diff --git a/gcc/config/vms/vms.c b/gcc/config/vms/vms.c index 1ee1c86..bbf174e 100644 --- a/gcc/config/vms/vms.c +++ b/gcc/config/vms/vms.c @@ -302,7 +302,7 @@ vms_start_function (const char *fnname) #if VMS_DEBUGGING_INFO if (vms_debug_main && debug_info_level > DINFO_LEVEL_NONE - && strncmp (vms_debug_main, fnname, strlen (vms_debug_main)) == 0) + && startswith (vms_debug_main, fnname)) { targetm.asm_out.globalize_label (asm_out_file, VMS_DEBUG_MAIN_POINTER); ASM_OUTPUT_DEF (asm_out_file, VMS_DEBUG_MAIN_POINTER, fnname); -- cgit v1.1 From 6806469dbed7e2613fad5468a94830c2cc817c95 Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Tue, 18 May 2021 11:17:56 +0200 Subject: startswith: Fix offloading targets. gcc/ChangeLog: * config/gcn/mkoffload.c (STR): Redefine. * config/i386/intelmic-mkoffload.c (STR): Likewise. * config/nvptx/mkoffload.c (STR): Likewise. 
--- gcc/config/gcn/mkoffload.c | 4 +++- gcc/config/i386/intelmic-mkoffload.c | 4 +++- gcc/config/nvptx/mkoffload.c | 4 +++- 3 files changed, 9 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/gcn/mkoffload.c b/gcc/config/gcn/mkoffload.c index 5432f95..1469a68 100644 --- a/gcc/config/gcn/mkoffload.c +++ b/gcc/config/gcn/mkoffload.c @@ -825,7 +825,8 @@ main (int argc, char **argv) bool fpic = false; for (int i = 1; i < argc; i++) { - if (startswith (argv[i], "-foffload-abi=")) +#define STR "-foffload-abi=" + if (startswith (argv[i], STR)) { if (strcmp (argv[i] + strlen (STR), "lp64") == 0) offload_abi = OFFLOAD_ABI_LP64; @@ -835,6 +836,7 @@ main (int argc, char **argv) fatal_error (input_location, "unrecognizable argument of option " STR); } +#undef STR else if (strcmp (argv[i], "-fopenmp") == 0) fopenmp = true; else if (strcmp (argv[i], "-fopenacc") == 0) diff --git a/gcc/config/i386/intelmic-mkoffload.c b/gcc/config/i386/intelmic-mkoffload.c index cb946d6..03858e6 100644 --- a/gcc/config/i386/intelmic-mkoffload.c +++ b/gcc/config/i386/intelmic-mkoffload.c @@ -613,7 +613,8 @@ main (int argc, char **argv) /* Scan the argument vector. */ for (int i = 1; i < argc; i++) { - if (startswith (argv[i], "-foffload-abi=")) +#define STR "-foffload-abi=" + if (startswith (argv[i], STR)) { if (strcmp (argv[i] + strlen (STR), "lp64") == 0) offload_abi = OFFLOAD_ABI_LP64; @@ -623,6 +624,7 @@ main (int argc, char **argv) fatal_error (input_location, "unrecognizable argument of option " STR); } +#undef STR else if (strcmp (argv[i], "-save-temps") == 0) save_temps = true; else if (strcmp (argv[i], "-v") == 0) diff --git a/gcc/config/nvptx/mkoffload.c b/gcc/config/nvptx/mkoffload.c index c46c85d..17f17e5 100644 --- a/gcc/config/nvptx/mkoffload.c +++ b/gcc/config/nvptx/mkoffload.c @@ -481,7 +481,8 @@ main (int argc, char **argv) bool fpic = false; for (int i = 1; i < argc; i++) { - if (startswith (argv[i], "-foffload-abi=")) +#define STR "-foffload-abi=" + if (startswith (argv[i], STR)) { if (strcmp (argv[i] + strlen (STR), "lp64") == 0) offload_abi = OFFLOAD_ABI_LP64; @@ -491,6 +492,7 @@ main (int argc, char **argv) fatal_error (input_location, "unrecognizable argument of option " STR); } +#undef STR else if (strcmp (argv[i], "-fopenmp") == 0) fopenmp = true; else if (strcmp (argv[i], "-fopenacc") == 0) -- cgit v1.1 From d39fbed75810fc7478842503ecb0268b85dc9c2e Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Tue, 18 May 2021 15:45:54 +0200 Subject: i386: Fix split_double_mode with paradoxical subreg [PR100626] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit split_double_mode calls simplify_gen_subreg, which fails for the high half of the paradoxical subreg. Return temporary register instead of NULL RTX in this case. 2021-05-18 Uroš Bizjak gcc/ PR target/100626 * config/i386/i386-expand.c (split_double_mode): Return temporary register when simplify_gen_subreg fails with the high half od the paradoxical subreg. --- gcc/config/i386/i386-expand.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 0fa8d45..9f3d419 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -154,9 +154,13 @@ split_double_mode (machine_mode mode, rtx operands[], lo_half[num] = simplify_gen_subreg (half_mode, op, GET_MODE (op) == VOIDmode ? 
mode : GET_MODE (op), 0); - hi_half[num] = simplify_gen_subreg (half_mode, op, - GET_MODE (op) == VOIDmode - ? mode : GET_MODE (op), byte); + + rtx tmp = simplify_gen_subreg (half_mode, op, + GET_MODE (op) == VOIDmode + ? mode : GET_MODE (op), byte); + /* simplify_gen_subreg will return NULL RTX for the + high half of the paradoxical subreg. */ + hi_half[num] = tmp ? tmp : gen_reg_rtx (half_mode); } } } -- cgit v1.1 From 8b9484c54b4000209d4bfb270e22c9c8b9673fdb Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Tue, 18 May 2021 15:56:22 +0200 Subject: i386: Fix v4qiv4di2 expander MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a mode mismatch. 2021-05-18 Uroš Bizjak gcc/ * config/i386/sse.md (v4qiv4di2): Fix a mode mismatch with operand 1. --- gcc/config/i386/sse.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 62f4e15f..a4503dd 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -18558,8 +18558,8 @@ { if (!MEM_P (operands[1])) { - operands[1] = force_reg (V8QImode, operands[1]); - operands[1] = simplify_gen_subreg (V16QImode, operands[1], V8QImode, 0); + operands[1] = force_reg (V4QImode, operands[1]); + operands[1] = simplify_gen_subreg (V16QImode, operands[1], V4QImode, 0); emit_insn (gen_avx2_v4qiv4di2 (operands[0], operands[1])); DONE; } -- cgit v1.1 From 46ca31d65092e5afcef292f807fcf14c5363280d Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Tue, 18 May 2021 17:25:54 +0200 Subject: i386: Implement 4-byte vector support [PR100637] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add infrastructure, logic and arithmetic support for 4-byte vectors. These can be used with SSE2 targets, where movd instructions from/to XMM registers are available. x86_64 ABI passes 4-byte vectors in integer registers, so also add logic operations with integer registers. 2021-05-18 Uroš Bizjak gcc/ PR target/100637 * config/i386/i386.h (VALID_SSE2_REG_MODE): Add V4QI and V2HI modes. (VALID_INT_MODE_P): Ditto. * config/i386/mmx.md (VI_32): New mode iterator. (mmxvecsize): Handle V4QI and V2HI. (Yv_Yw): Ditto. (mov): New expander. (*mov_internal): New insn pattern. (movmisalign): New expander. (neg): New expander. (3): New expander. (*3): New insn pattern. (mulv2hi3): New expander. (*mulv2hi3): New insn pattern. (one_cmpl2): New expander. (*andnot3): New insn pattern. (3): New expander. (*3): New insn pattern. gcc/testsuite/ PR target/100637 * gcc.target/i386/pr100637-1b.c: New test. * gcc.target/i386/pr100637-1w.c: Ditto. * gcc.target/i386/pr92658-avx2-2.c: Do not XFAIL scan for pmovsxbq. * gcc.target/i386/pr92658-avx2.c: Do not XFAIL scan for pmovzxbq. * gcc.target/i386/pr92658-avx512vl.c: Do not XFAIL scan for vpmovdb. * gcc.target/i386/pr92658-sse4-2.c: Do not XFAIL scan for pmovsxbd and pmovsxwq. * gcc.target/i386/pr92658-sse4.c: Do not XFAIL scan for pmovzxbd and pmovzxwq. 
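For illustration, a minimal sketch of the kind of source the new patterns target.  This is not one of the testcases listed above; the typedefs just use the generic GCC vector_size extension, and the pattern names in the comments refer to the expanders added in the mmx.md hunks below.  On an SSE2-enabled x86 target such operations can now be carried out on 4-byte vectors kept in XMM registers (or, for the logic operations, in integer registers) instead of being scalarized:

/* 4-byte integer vectors built with the generic vector extension.  */
typedef unsigned char  v4qi __attribute__ ((vector_size (4)));
typedef unsigned short v2hi __attribute__ ((vector_size (4)));

v4qi
add_v4qi (v4qi a, v4qi b)
{
  return a + b;		/* addv4qi3 */
}

v2hi
mul_v2hi (v2hi a, v2hi b)
{
  return a * b;		/* mulv2hi3 */
}

v4qi
not_v4qi (v4qi a)
{
  return ~a;		/* one_cmplv4qi2 */
}

v2hi
xor_v2hi (v2hi a, v2hi b)
{
  return a ^ b;		/* xorv2hi3 */
}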
--- gcc/config/i386/i386.h | 15 ++-- gcc/config/i386/mmx.md | 195 ++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 200 insertions(+), 10 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 97d6f38..d15f9b2 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -1007,6 +1007,7 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); #define VALID_SSE2_REG_MODE(MODE) \ ((MODE) == V16QImode || (MODE) == V8HImode || (MODE) == V2DFmode \ + || (MODE) == V4QImode || (MODE) == V2HImode \ || (MODE) == V2DImode || (MODE) == DFmode) #define VALID_SSE_REG_MODE(MODE) \ @@ -1034,12 +1035,14 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); || (MODE) == SCmode || (MODE) == DCmode || (MODE) == XCmode) \ #define VALID_INT_MODE_P(MODE) \ - ((MODE) == QImode || (MODE) == HImode || (MODE) == SImode \ - || (MODE) == DImode \ - || (MODE) == CQImode || (MODE) == CHImode || (MODE) == CSImode \ - || (MODE) == CDImode \ - || (TARGET_64BIT && ((MODE) == TImode || (MODE) == CTImode \ - || (MODE) == TFmode || (MODE) == TCmode))) + ((MODE) == QImode || (MODE) == HImode \ + || (MODE) == SImode || (MODE) == DImode \ + || (MODE) == CQImode || (MODE) == CHImode \ + || (MODE) == CSImode || (MODE) == CDImode \ + || (TARGET_64BIT \ + && ((MODE) == TImode || (MODE) == CTImode \ + || (MODE) == TFmode || (MODE) == TCmode)) \ + || (MODE) == V4QImode || (MODE) == V2HImode) /* Return true for modes passed in SSE registers. */ #define SSE_REG_MODE_P(MODE) \ diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 7fc2e5d..7806b62 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -57,11 +57,15 @@ (define_mode_iterator MMXMODE24 [V4HI V2SI]) (define_mode_iterator MMXMODE248 [V4HI V2SI V1DI]) +;; All 32bit integer vector modes +(define_mode_iterator VI_32 [V4QI V2HI]) + ;; All V2S* modes (define_mode_iterator V2FI [V2SF V2SI]) ;; Mapping from integer vector mode to mnemonic suffix -(define_mode_attr mmxvecsize [(V8QI "b") (V4HI "w") (V2SI "d") (V1DI "q")]) +(define_mode_attr mmxvecsize + [(V8QI "b") (V4QI "b") (V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")]) (define_mode_attr mmxdoublemode [(V8QI "V8HI") (V4HI "V4SI")]) @@ -74,7 +78,8 @@ [(V2SF "v2si") (V2SI "v2si") (V4HI "v4hi") (V8QI "v8qi")]) (define_mode_attr Yv_Yw - [(V8QI "Yw") (V4HI "Yw") (V2SI "Yv") (V1DI "Yv") (V2SF "Yv")]) + [(V8QI "Yw") (V4QI "Yw") (V4HI "Yw") (V2HI "Yw") + (V2SI "Yv") (V1DI "Yv") (V2SF "Yv")]) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; @@ -233,6 +238,80 @@ DONE; }) +(define_expand "mov" + [(set (match_operand:VI_32 0 "nonimmediate_operand") + (match_operand:VI_32 1 "nonimmediate_operand"))] + "TARGET_SSE2" +{ + ix86_expand_vector_move (mode, operands); + DONE; +}) + +(define_insn "*mov_internal" + [(set (match_operand:VI_32 0 "nonimmediate_operand" + "=r ,m ,v,v,v,m,r,v") + (match_operand:VI_32 1 "general_operand" + "rmC,rC,C,v,m,v,v,r"))] + "TARGET_SSE2 && + !(MEM_P (operands[0]) && MEM_P (operands[1]))" +{ + switch (get_attr_type (insn)) + { + case TYPE_IMOV: + return "mov{l}\t{%1, %0|%0, %1}"; + + case TYPE_SSELOG1: + return standard_sse_constant_opcode (insn, operands); + + case TYPE_SSEMOV: + return ix86_output_ssemov (insn, operands); + + default: + gcc_unreachable (); + } +} + [(set (attr "type") + (cond [(eq_attr "alternative" "2") + (const_string "sselog1") + (eq_attr "alternative" "3,4,5,6,7") + (const_string "ssemov") + ] + (const_string "imov"))) + (set (attr 
"prefix") + (if_then_else (eq_attr "type" "sselog1,ssemov") + (const_string "maybe_vex") + (const_string "orig"))) + (set (attr "prefix_data16") + (if_then_else (and (eq_attr "type" "ssemov") (eq_attr "mode" "SI")) + (const_string "1") + (const_string "*"))) + (set (attr "mode") + (cond [(eq_attr "alternative" "2,3") + (cond [(match_test "TARGET_AVX") + (const_string "TI") + (match_test "optimize_function_for_size_p (cfun)") + (const_string "V4SF") + ] + (const_string "TI")) + ] + (const_string "SI"))) + (set (attr "preferred_for_speed") + (cond [(eq_attr "alternative" "6") + (symbol_ref "TARGET_INTER_UNIT_MOVES_FROM_VEC") + (eq_attr "alternative" "7") + (symbol_ref "TARGET_INTER_UNIT_MOVES_TO_VEC") + ] + (symbol_ref "true")))]) + +(define_expand "movmisalign" + [(set (match_operand:VI_32 0 "nonimmediate_operand") + (match_operand:VI_32 1 "nonimmediate_operand"))] + "TARGET_SSE2" +{ + ix86_expand_vector_move (mode, operands); + DONE; +}) + (define_insn "sse_movntq" [(set (match_operand:DI 0 "memory_operand" "=m,m") (unspec:DI [(match_operand:DI 1 "register_operand" "y,r")] @@ -1229,6 +1308,14 @@ ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(define_expand "neg2" + [(set (match_operand:MMXMODEI 0 "register_operand") + (minus:MMXMODEI + (match_dup 2) + (match_operand:MMXMODEI 1 "register_operand")))] + "TARGET_MMX_WITH_SSE" + "operands[2] = force_reg (mode, CONST0_RTX (mode));") + (define_expand "mmx_3" [(set (match_operand:MMXMODEI8 0 "register_operand") (plusminus:MMXMODEI8 @@ -1248,8 +1335,10 @@ (define_insn "*mmx_3" [(set (match_operand:MMXMODEI8 0 "register_operand" "=y,x,") (plusminus:MMXMODEI8 - (match_operand:MMXMODEI8 1 "register_mmxmem_operand" "0,0,") - (match_operand:MMXMODEI8 2 "register_mmxmem_operand" "ym,x,")))] + (match_operand:MMXMODEI8 1 "register_mmxmem_operand" + "0,0,") + (match_operand:MMXMODEI8 2 "register_mmxmem_operand" + "ym,x,")))] "(TARGET_MMX || TARGET_MMX_WITH_SSE) && ix86_binary_operator_ok (, mode, operands)" "@ @@ -1261,6 +1350,36 @@ (set_attr "type" "mmxadd,sseadd,sseadd") (set_attr "mode" "DI,TI,TI")]) +(define_expand "neg2" + [(set (match_operand:VI_32 0 "register_operand") + (minus:VI_32 + (match_dup 2) + (match_operand:VI_32 1 "register_operand")))] + "TARGET_SSE2" + "operands[2] = force_reg (mode, CONST0_RTX (mode));") + +(define_expand "3" + [(set (match_operand:VI_32 0 "register_operand") + (plusminus:VI_32 + (match_operand:VI_32 1 "register_operand") + (match_operand:VI_32 2 "register_operand")))] + "TARGET_SSE2" + "ix86_fixup_binary_operands_no_copy (, mode, operands);") + +(define_insn "*3" + [(set (match_operand:VI_32 0 "register_operand" "=x,") + (plusminus:VI_32 + (match_operand:VI_32 1 "register_operand" "0,") + (match_operand:VI_32 2 "register_operand" "x,")))] + "TARGET_SSE2 + && ix86_binary_operator_ok (, mode, operands)" + "@ + p\t{%2, %0|%0, %2} + vp\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" "noavx,avx") + (set_attr "type" "sseadd") + (set_attr "mode" "TI")]) + (define_expand "mmx_3" [(set (match_operand:MMXMODE12 0 "register_operand") (sat_plusminus:MMXMODE12 @@ -1314,6 +1433,26 @@ (set_attr "type" "mmxmul,ssemul,ssemul") (set_attr "mode" "DI,TI,TI")]) +(define_expand "mulv2hi3" + [(set (match_operand:V2HI 0 "register_operand") + (mult:V2HI (match_operand:V2HI 1 "register_operand") + (match_operand:V2HI 2 "register_operand")))] + "TARGET_SSE2" + "ix86_fixup_binary_operands_no_copy (MULT, V2HImode, operands);") + +(define_insn "*mulv2hi3" + [(set (match_operand:V2HI 0 "register_operand" "=x,Yw") + (mult:V2HI 
(match_operand:V2HI 1 "register_operand" "%0,Yw") + (match_operand:V2HI 2 "register_operand" "x,Yw")))] + "TARGET_SSE2 + && ix86_binary_operator_ok (MULT, V2HImode, operands)" + "@ + pmullw\t{%2, %0|%0, %2} + vpmullw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" "noavx,avx") + (set_attr "type" "ssemul") + (set_attr "mode" "TI")]) + (define_expand "mmx_smulv4hi3_highpart" [(set (match_operand:V4HI 0 "register_operand") (truncate:V4HI @@ -1839,6 +1978,14 @@ "TARGET_MMX_WITH_SSE" "operands[2] = force_reg (mode, CONSTM1_RTX (mode));") +(define_expand "one_cmpl2" + [(set (match_operand:VI_32 0 "register_operand") + (xor:VI_32 + (match_operand:VI_32 1 "register_operand") + (match_dup 2)))] + "TARGET_SSE2" + "operands[2] = force_reg (mode, CONSTM1_RTX (mode));") + (define_insn "mmx_andnot3" [(set (match_operand:MMXMODEI 0 "register_operand" "=y,x,x,v") (and:MMXMODEI @@ -1855,6 +2002,22 @@ (set_attr "type" "mmxadd,sselog,sselog,sselog") (set_attr "mode" "DI,TI,TI,TI")]) +(define_insn "*andnot3" + [(set (match_operand:VI_32 0 "register_operand" "=r,x,x,v") + (and:VI_32 + (not:VI_32 (match_operand:VI_32 1 "register_operand" "r,0,x,v")) + (match_operand:VI_32 2 "register_operand" "r,x,x,v")))] + "TARGET_SSE2" + "@ + andn\t{%2, %1, %0|%0, %1, %2} + pandn\t{%2, %0|%0, %2} + vpandn\t{%2, %1, %0|%0, %1, %2} + vpandnd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" "bmi,noavx,avx,avx512vl") + (set_attr "type" "bitmanip,sselog,sselog,sselog") + (set_attr "btver2_decode" "direct,*,*,*") + (set_attr "mode" "SI,TI,TI,TI")]) + (define_expand "mmx_3" [(set (match_operand:MMXMODEI 0 "register_operand") (any_logic:MMXMODEI @@ -1888,6 +2051,30 @@ (set_attr "type" "mmxadd,sselog,sselog,sselog") (set_attr "mode" "DI,TI,TI,TI")]) +(define_expand "3" + [(set (match_operand:VI_32 0 "register_operand") + (any_logic:VI_32 + (match_operand:VI_32 1 "register_operand") + (match_operand:VI_32 2 "register_operand")))] + "TARGET_SSE2" + "ix86_fixup_binary_operands_no_copy (, mode, operands);") + +(define_insn "*3" + [(set (match_operand:VI_32 0 "register_operand" "=r,x,x,v") + (any_logic:VI_32 + (match_operand:VI_32 1 "register_operand" "%0,0,x,v") + (match_operand:VI_32 2 "register_operand" "r,x,x,v")))] + "TARGET_SSE2 + && ix86_binary_operator_ok (, mode, operands)" + "@ + \t{%2, %0|%0, %2} + p\t{%2, %0|%0, %2} + vp\t{%2, %1, %0|%0, %1, %2} + vpd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" "*,noavx,avx,avx512vl") + (set_attr "type" "alu,sselog,sselog,sselog") + (set_attr "mode" "SI,TI,TI,TI")]) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; Parallel integral element swizzling -- cgit v1.1 From 79cf0004ffbb0c7f12729a1fc7801d2f25539c5a Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Tue, 18 May 2021 09:04:39 -0500 Subject: rs6000: Remove old psabi warnings Long ago we were forced to make some small ABI breaks to correct errors in the implementation, and we added warning messages for the changes from GCC 4.9 to GCC 5. Enough time has passed that these are now just irritants, so let's remove them. Also clean up associated macros using rs6000_special_adjust_field_align_p, which has been always returning false for a long time. 2021-05-18 Bill Schmidt gcc/ * config/rs6000/freebsd64.h (ADJUST_FIELD_ALIGN): Remove call to rs6000_special_adjust_field_align_p. * config/rs6000/linux64.h (ADJUST_FIELD_ALIGN): Likewise. * config/rs6000/rs6000-call.c (rs6000_function_arg_boundary): Remove ABI warning. (rs6000_function_arg): Likewise. 
* config/rs6000/rs6000-protos.h (rs6000_special_adjust_field_align_p): Remove prototype. * config/rs6000/rs6000.c (rs6000_special_adjust_field_align_p): Remove. * config/rs6000/sysv4.h (ADJUST_FIELD_ALIGN): Remove call to rs6000_special_adjust_field_align_p. gcc/testsuite/ * gcc.target/powerpc/ppc64-abi-warn-1.c: Remove. * gcc.target/powerpc/ppc64-abi-warn-2.c: Remove. * gcc.target/powerpc/ppc64-abi-warn-3.c: Remove. --- gcc/config/rs6000/freebsd64.h | 10 ++++------ gcc/config/rs6000/linux64.h | 8 +++----- gcc/config/rs6000/rs6000-call.c | 31 ++----------------------------- gcc/config/rs6000/rs6000-protos.h | 1 - gcc/config/rs6000/rs6000.c | 26 -------------------------- gcc/config/rs6000/sysv4.h | 3 +-- 6 files changed, 10 insertions(+), 69 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/freebsd64.h b/gcc/config/rs6000/freebsd64.h index 11ea22e..7987983 100644 --- a/gcc/config/rs6000/freebsd64.h +++ b/gcc/config/rs6000/freebsd64.h @@ -306,12 +306,10 @@ extern int dot_symbols; /* PowerPC64 Linux word-aligns FP doubles when -malign-power is given. */ #undef ADJUST_FIELD_ALIGN #define ADJUST_FIELD_ALIGN(FIELD, TYPE, COMPUTED) \ - (rs6000_special_adjust_field_align_p ((TYPE), (COMPUTED)) \ - ? 128 \ - : (TARGET_64BIT \ - && TARGET_ALIGN_NATURAL == 0 \ - && TYPE_MODE (strip_array_types (TYPE)) == DFmode) \ - ? MIN ((COMPUTED), 32) \ + ((TARGET_64BIT \ + && TARGET_ALIGN_NATURAL == 0 \ + && TYPE_MODE (strip_array_types (TYPE)) == DFmode) \ + ? MIN ((COMPUTED), 32) \ : (COMPUTED)) #undef TOC_SECTION_ASM_OP diff --git a/gcc/config/rs6000/linux64.h b/gcc/config/rs6000/linux64.h index e3f2cd2..74be9a9 100644 --- a/gcc/config/rs6000/linux64.h +++ b/gcc/config/rs6000/linux64.h @@ -213,11 +213,9 @@ extern int dot_symbols; /* PowerPC64 Linux word-aligns FP doubles when -malign-power is given. */ #undef ADJUST_FIELD_ALIGN #define ADJUST_FIELD_ALIGN(FIELD, TYPE, COMPUTED) \ - (rs6000_special_adjust_field_align_p ((TYPE), (COMPUTED)) \ - ? 128 \ - : (TARGET_64BIT \ - && TARGET_ALIGN_NATURAL == 0 \ - && TYPE_MODE (strip_array_types (TYPE)) == DFmode) \ + ((TARGET_64BIT \ + && TARGET_ALIGN_NATURAL == 0 \ + && TYPE_MODE (strip_array_types (TYPE)) == DFmode) \ ? MIN ((COMPUTED), 32) \ : (COMPUTED)) diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c index c4332a6..f271b0a 100644 --- a/gcc/config/rs6000/rs6000-call.c +++ b/gcc/config/rs6000/rs6000-call.c @@ -6793,25 +6793,8 @@ rs6000_function_arg_boundary (machine_mode mode, const_tree type) /* "Aggregate" means any AGGREGATE_TYPE except for single-element or homogeneous float/vector aggregates here. We already handled vector aggregates above, but still need to check for float here. */ - bool aggregate_p = (AGGREGATE_TYPE_P (type) - && !SCALAR_FLOAT_MODE_P (elt_mode)); - - /* We used to check for BLKmode instead of the above aggregate type - check. Warn when this results in any difference to the ABI. 
*/ - if (aggregate_p != (mode == BLKmode)) - { - static bool warned; - if (!warned && warn_psabi) - { - warned = true; - inform (input_location, - "the ABI of passing aggregates with %d-byte alignment" - " has changed in GCC 5", - (int) TYPE_ALIGN (type) / BITS_PER_UNIT); - } - } - - if (aggregate_p) + if (AGGREGATE_TYPE_P (type) + && !SCALAR_FLOAT_MODE_P (elt_mode)) return 128; } @@ -7805,8 +7788,6 @@ rs6000_function_arg (cumulative_args_t cum_v, const function_arg_info &arg) if (i < n_elts && align_words + fpr_words < GP_ARG_NUM_REG && cum->nargs_prototype > 0) { - static bool warned; - machine_mode rmode = TARGET_32BIT ? SImode : DImode; int n_words = rs6000_arg_size (mode, type); @@ -7820,14 +7801,6 @@ rs6000_function_arg (cumulative_args_t cum_v, const function_arg_info &arg) rvec[k++] = gen_rtx_EXPR_LIST (VOIDmode, r, off); } while (++align_words < GP_ARG_NUM_REG && --n_words != 0); - - if (!warned && warn_psabi) - { - warned = true; - inform (input_location, - "the ABI of passing homogeneous % aggregates" - " has changed in GCC 5"); - } } return rs6000_finish_function_arg (mode, rvec, k); diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h index bef727e..9de294d 100644 --- a/gcc/config/rs6000/rs6000-protos.h +++ b/gcc/config/rs6000/rs6000-protos.h @@ -226,7 +226,6 @@ address_is_prefixed (rtx addr, #ifdef TREE_CODE extern unsigned int rs6000_data_alignment (tree, unsigned int, enum data_align); -extern bool rs6000_special_adjust_field_align_p (tree, unsigned int); extern unsigned int rs6000_special_adjust_field_align (tree, unsigned int); extern unsigned int rs6000_special_round_type_align (tree, unsigned int, unsigned int); diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index c304596..6db450a 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -7900,32 +7900,6 @@ rs6000_slow_unaligned_access (machine_mode mode, unsigned int align) && (int) align < VECTOR_ALIGN (mode))))); } -/* Previous GCC releases forced all vector types to have 16-byte alignment. */ - -bool -rs6000_special_adjust_field_align_p (tree type, unsigned int computed) -{ - if (TARGET_ALTIVEC && TREE_CODE (type) == VECTOR_TYPE) - { - if (computed != 128) - { - static bool warned; - if (!warned && warn_psabi) - { - warned = true; - inform (input_location, - "the layout of aggregates containing vectors with" - " %d-byte alignment has changed in GCC 5", - computed / BITS_PER_UNIT); - } - } - /* In current GCC there is no special case. */ - return false; - } - - return false; -} - /* AIX word-aligns FP doubles but doubleword-aligns 64-bit ints. */ unsigned int diff --git a/gcc/config/rs6000/sysv4.h b/gcc/config/rs6000/sysv4.h index 510abe1..23ff594 100644 --- a/gcc/config/rs6000/sysv4.h +++ b/gcc/config/rs6000/sysv4.h @@ -325,8 +325,7 @@ do { \ /* An expression for the alignment of a structure field FIELD if the alignment computed in the usual way is COMPUTED. */ #define ADJUST_FIELD_ALIGN(FIELD, TYPE, COMPUTED) \ - (rs6000_special_adjust_field_align_p ((TYPE), (COMPUTED)) \ - ? 
128 : COMPUTED) + (COMPUTED) #undef BIGGEST_FIELD_ALIGNMENT -- cgit v1.1 From ea30c7bd497bcd390f7b177e1e156f630a90f232 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 19 May 2021 09:57:29 +0200 Subject: i386: Allow 64bit vector modes in general registers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow V8QI, V4HI and V2SI modes in 64bit general registers for TARGET_64BIT and add alternatives using general registers to 64bit vector logic instructions. 2021-05-19 Uroš Bizjak gcc/ * config/i386/i386.h (VALID_INT_MODE_P): Add V8QI, V4HI and V2SI modes for TARGET_64BIT. * config/i386/i386.md (isa): Add x64_bmi. (enabled): Handle x64_bmi. * config/i386/mmx.md (mmx_andnot3): Add alternative using 64bit general registers. (*mmx_3): Ditto. --- gcc/config/i386/i386.h | 6 ++++-- gcc/config/i386/i386.md | 15 +++++++++------ gcc/config/i386/mmx.md | 33 +++++++++++++++++++-------------- 3 files changed, 32 insertions(+), 22 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index d15f9b2..53d503f 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -1039,10 +1039,12 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); || (MODE) == SImode || (MODE) == DImode \ || (MODE) == CQImode || (MODE) == CHImode \ || (MODE) == CSImode || (MODE) == CDImode \ + || (MODE) == V4QImode || (MODE) == V2HImode \ || (TARGET_64BIT \ && ((MODE) == TImode || (MODE) == CTImode \ - || (MODE) == TFmode || (MODE) == TCmode)) \ - || (MODE) == V4QImode || (MODE) == V2HImode) + || (MODE) == TFmode || (MODE) == TCmode \ + || (MODE) == V8QImode || (MODE) == V4HImode \ + || (MODE) == V2SImode))) /* Return true for modes passed in SSE registers. */ #define SSE_REG_MODE_P(MODE) \ diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 74e924f..2fc8fae 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -815,11 +815,12 @@ (define_attr "movu" "0,1" (const_string "0")) ;; Used to control the "enabled" attribute on a per-instruction basis. 
-(define_attr "isa" "base,x64,x64_sse2,x64_sse4,x64_sse4_noavx,x64_avx,nox64, +(define_attr "isa" "base,x64,nox64,x64_bmi,x64_sse2,x64_sse4,x64_sse4_noavx, + x64_avx,x64_avx512bw,x64_avx512dq, sse_noavx,sse2,sse2_noavx,sse3,sse3_noavx,sse4,sse4_noavx, avx,noavx,avx2,noavx2,bmi,bmi2,fma4,fma,avx512f,noavx512f, avx512bw,noavx512bw,avx512dq,noavx512dq, - avx512vl,noavx512vl,x64_avx512dq,x64_avx512bw, + avx512vl,noavx512vl, avxvnni,avx512vnnivl" (const_string "base")) @@ -829,6 +830,9 @@ (define_attr "enabled" "" (cond [(eq_attr "isa" "x64") (symbol_ref "TARGET_64BIT") + (eq_attr "isa" "nox64") (symbol_ref "!TARGET_64BIT") + (eq_attr "isa" "x64_bmi") + (symbol_ref "TARGET_64BIT && TARGET_BMI") (eq_attr "isa" "x64_sse2") (symbol_ref "TARGET_64BIT && TARGET_SSE2") (eq_attr "isa" "x64_sse4") @@ -837,14 +841,13 @@ (symbol_ref "TARGET_64BIT && TARGET_SSE4_1 && !TARGET_AVX") (eq_attr "isa" "x64_avx") (symbol_ref "TARGET_64BIT && TARGET_AVX") - (eq_attr "isa" "x64_avx512dq") - (symbol_ref "TARGET_64BIT && TARGET_AVX512DQ") (eq_attr "isa" "x64_avx512bw") (symbol_ref "TARGET_64BIT && TARGET_AVX512BW") - (eq_attr "isa" "nox64") (symbol_ref "!TARGET_64BIT") - (eq_attr "isa" "sse2") (symbol_ref "TARGET_SSE2") + (eq_attr "isa" "x64_avx512dq") + (symbol_ref "TARGET_64BIT && TARGET_AVX512DQ") (eq_attr "isa" "sse_noavx") (symbol_ref "TARGET_SSE && !TARGET_AVX") + (eq_attr "isa" "sse2") (symbol_ref "TARGET_SSE2") (eq_attr "isa" "sse2_noavx") (symbol_ref "TARGET_SSE2 && !TARGET_AVX") (eq_attr "isa" "sse3") (symbol_ref "TARGET_SSE3") diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 7806b62..d847978 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -1987,20 +1987,24 @@ "operands[2] = force_reg (mode, CONSTM1_RTX (mode));") (define_insn "mmx_andnot3" - [(set (match_operand:MMXMODEI 0 "register_operand" "=y,x,x,v") + [(set (match_operand:MMXMODEI 0 "register_operand" "=y,r,x,x,v") (and:MMXMODEI - (not:MMXMODEI (match_operand:MMXMODEI 1 "register_operand" "0,0,x,v")) - (match_operand:MMXMODEI 2 "register_mmxmem_operand" "ym,x,x,v")))] + (not:MMXMODEI (match_operand:MMXMODEI 1 "register_operand" + "0,r,0,x,v")) + (match_operand:MMXMODEI 2 "register_mmxmem_operand" + "ym,r,x,x,v")))] "TARGET_MMX || TARGET_MMX_WITH_SSE" "@ pandn\t{%2, %0|%0, %2} + andn\t{%2, %1, %0|%0, %1, %2} pandn\t{%2, %0|%0, %2} vpandn\t{%2, %1, %0|%0, %1, %2} vpandnd\t{%2, %1, %0|%0, %1, %2}" - [(set_attr "isa" "*,sse2_noavx,avx,avx512vl") - (set_attr "mmx_isa" "native,*,*,*") - (set_attr "type" "mmxadd,sselog,sselog,sselog") - (set_attr "mode" "DI,TI,TI,TI")]) + [(set_attr "isa" "*,x64_bmi,sse2_noavx,avx,avx512vl") + (set_attr "mmx_isa" "native,*,*,*,*") + (set_attr "type" "mmxadd,bitmanip,sselog,sselog,sselog") + (set_attr "btver2_decode" "*,direct,*,*,*") + (set_attr "mode" "DI,DI,TI,TI,TI")]) (define_insn "*andnot3" [(set (match_operand:VI_32 0 "register_operand" "=r,x,x,v") @@ -2035,21 +2039,22 @@ "ix86_fixup_binary_operands_no_copy (, mode, operands);") (define_insn "*mmx_3" - [(set (match_operand:MMXMODEI 0 "register_operand" "=y,x,x,v") + [(set (match_operand:MMXMODEI 0 "register_operand" "=y,r,x,x,v") (any_logic:MMXMODEI - (match_operand:MMXMODEI 1 "register_mmxmem_operand" "%0,0,x,v") - (match_operand:MMXMODEI 2 "register_mmxmem_operand" "ym,x,x,v")))] + (match_operand:MMXMODEI 1 "register_mmxmem_operand" "%0,0,0,x,v") + (match_operand:MMXMODEI 2 "register_mmxmem_operand" "ym,r,x,x,v")))] "(TARGET_MMX || TARGET_MMX_WITH_SSE) && ix86_binary_operator_ok (, mode, operands)" "@ p\t{%2, %0|%0, %2} + \t{%2, 
%0|%0, %2} p\t{%2, %0|%0, %2} vp\t{%2, %1, %0|%0, %1, %2} vpd\t{%2, %1, %0|%0, %1, %2}" - [(set_attr "isa" "*,sse2_noavx,avx,avx512vl") - (set_attr "mmx_isa" "native,*,*,*") - (set_attr "type" "mmxadd,sselog,sselog,sselog") - (set_attr "mode" "DI,TI,TI,TI")]) + [(set_attr "isa" "*,x64,sse2_noavx,avx,avx512vl") + (set_attr "mmx_isa" "native,*,*,*,*") + (set_attr "type" "mmxadd,alu,sselog,sselog,sselog") + (set_attr "mode" "DI,DI,TI,TI,TI")]) (define_expand "3" [(set (match_operand:VI_32 0 "register_operand") -- cgit v1.1 From 7bb4b7a5ca984f7d860f33c9b791f425b264a71f Mon Sep 17 00:00:00 2001 From: Andre Simoes Dias Vieira Date: Mon, 17 May 2021 15:43:53 +0100 Subject: aarch64: Enable aarch64_load to use UNSPEC_PRED_X loads This patch will enable the use of loads using the UNSPEC_PRED_X enum in the aarch64_load pattern, thus enabling combine to combine such loads with extends. gcc/ChangeLog: 2021-05-19 Andre Vieira * config/aarch64/iterators.md (SVE_PRED_LOAD): New iterator. (pred_load): New int attribute. * config/aarch64/aarch64-sve.md (aarch64_load_): Use SVE_PRED_LOAD enum iterator and corresponding pred_load attribute. * config/aarch64/aarch64-sve-builtins-base.cc (expand): Update call to code_for_aarch64_load. gcc/testsuite/ChangeLog: 2021-05-19 Andre Vieira * gcc.target/aarch64/sve/logical_unpacked_and_2.c: Change scan-assembly-times to scan-assembly not for superfluous uxtb. * gcc.target/aarch64/sve/logical_unpacked_and_3.c: Likewise. * gcc.target/aarch64/sve/logical_unpacked_and_4.c: Likewise. * gcc.target/aarch64/sve/logical_unpacked_and_6.c: Likewise. * gcc.target/aarch64/sve/logical_unpacked_and_7.c: Likewise. * gcc.target/aarch64/sve/logical_unpacked_eor_2.c: Likewise. * gcc.target/aarch64/sve/logical_unpacked_eor_3.c: Likewise. * gcc.target/aarch64/sve/logical_unpacked_eor_4.c: Likewise. * gcc.target/aarch64/sve/logical_unpacked_eor_6.c: Likewise. * gcc.target/aarch64/sve/logical_unpacked_eor_7.c: Likewise. * gcc.target/aarch64/sve/logical_unpacked_orr_2.c: Likewise. * gcc.target/aarch64/sve/logical_unpacked_orr_3.c: Likewise. * gcc.target/aarch64/sve/logical_unpacked_orr_4.c: Likewise. * gcc.target/aarch64/sve/logical_unpacked_orr_6.c: Likewise. * gcc.target/aarch64/sve/logical_unpacked_orr_7.c: Likewise. * gcc.target/aarch64/sve/ld1_extend.c: New test. --- gcc/config/aarch64/aarch64-sve-builtins-base.cc | 2 +- gcc/config/aarch64/aarch64-sve.md | 4 ++-- gcc/config/aarch64/iterators.md | 4 ++++ 3 files changed, 7 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc index dfdf0e2..8fd6d3f 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc @@ -1123,7 +1123,7 @@ public: rtx expand (function_expander &e) const OVERRIDE { - insn_code icode = code_for_aarch64_load (extend_rtx_code (), + insn_code icode = code_for_aarch64_load (UNSPEC_LD1_SVE, extend_rtx_code (), e.vector_mode (0), e.memory_vector_mode ()); return e.use_contiguous_load_insn (icode); diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index b8b6f55..9e48c0e 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -1287,7 +1287,7 @@ ;; ------------------------------------------------------------------------- ;; Predicated load and extend, with 8 elements per 128-bit block. 
-(define_insn_and_rewrite "@aarch64_load_" +(define_insn_and_rewrite "@aarch64_load_" [(set (match_operand:SVE_HSDI 0 "register_operand" "=w") (unspec:SVE_HSDI [(match_operand: 3 "general_operand" "UplDnm") @@ -1295,7 +1295,7 @@ (unspec:SVE_PARTIAL_I [(match_operand: 2 "register_operand" "Upl") (match_operand:SVE_PARTIAL_I 1 "memory_operand" "m")] - UNSPEC_LD1_SVE))] + SVE_PRED_LOAD))] UNSPEC_PRED_X))] "TARGET_SVE && (~ & ) == 0" "ld1\t%0., %2/z, %1" diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 69d9dbe..f133bfd 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -2516,6 +2516,10 @@ (define_int_iterator SVE_LDFF1_LDNF1 [UNSPEC_LDFF1 UNSPEC_LDNF1]) +(define_int_iterator SVE_PRED_LOAD [UNSPEC_PRED_X UNSPEC_LD1_SVE]) + +(define_int_attr pred_load [(UNSPEC_PRED_X "_x") (UNSPEC_LD1_SVE "")]) + (define_int_iterator SVE2_U32_UNARY [UNSPEC_URECPE UNSPEC_RSQRTE]) (define_int_iterator SVE2_INT_UNARY_NARROWB [UNSPEC_SQXTNB -- cgit v1.1 From 3eddaad02dcce21fb67c42cc6e1e8f951a630ac1 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Wed, 3 Mar 2021 16:59:28 +0000 Subject: aarch64: Relax aarch64_hn2 RTL pattern Implement v[r]addhn2 and v[r]subhn2 Neon intrinsic RTL patterns using a vec_concat of a register_operand and an ADDSUBHN unspec - instead of just an ADDSUBHN2 unspec. This more relaxed pattern allows for more aggressive combinations and ultimately better code generation. This patch also removes the now redundant [R]ADDHN2 and [R]SUBHN2 unspecs and their iterator. gcc/ChangeLog: 2021-03-03 Jonathan Wright * config/aarch64/aarch64-simd.md (aarch64_hn2): Implement as an expand emitting a big/little endian instruction pattern. (aarch64_hn2_insn_le): Define. (aarch64_hn2_insn_be): Define. * config/aarch64/iterators.md: Remove UNSPEC_[R]ADDHN2 and UNSPEC_[R]SUBHN2 unspecs and ADDSUBHN2 iterator. --- gcc/config/aarch64/aarch64-simd.md | 43 ++++++++++++++++++++++++++++++++------ gcc/config/aarch64/iterators.md | 15 +------------ 2 files changed, 38 insertions(+), 20 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index e59bc7b..1efc854 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -4594,17 +4594,48 @@ [(set_attr "type" "neon__halve_narrow_q")] ) -(define_insn "aarch64_hn2" +(define_insn "aarch64_hn2_insn_le" [(set (match_operand: 0 "register_operand" "=w") - (unspec: [(match_operand: 1 "register_operand" "0") - (match_operand:VQN 2 "register_operand" "w") - (match_operand:VQN 3 "register_operand" "w")] - ADDSUBHN2))] - "TARGET_SIMD" + (vec_concat: + (match_operand: 1 "register_operand" "0") + (unspec: [(match_operand:VQN 2 "register_operand" "w") + (match_operand:VQN 3 "register_operand" "w")] + ADDSUBHN)))] + "TARGET_SIMD && !BYTES_BIG_ENDIAN" + "hn2\\t%0., %2., %3." + [(set_attr "type" "neon__halve_narrow_q")] +) + +(define_insn "aarch64_hn2_insn_be" + [(set (match_operand: 0 "register_operand" "=w") + (vec_concat: + (unspec: [(match_operand:VQN 2 "register_operand" "w") + (match_operand:VQN 3 "register_operand" "w")] + ADDSUBHN) + (match_operand: 1 "register_operand" "0")))] + "TARGET_SIMD && BYTES_BIG_ENDIAN" "hn2\\t%0., %2., %3." 
[(set_attr "type" "neon__halve_narrow_q")] ) +(define_expand "aarch64_hn2" + [(match_operand: 0 "register_operand") + (match_operand: 1 "register_operand") + (unspec [(match_operand:VQN 2 "register_operand") + (match_operand:VQN 3 "register_operand")] + ADDSUBHN)] + "TARGET_SIMD" + { + if (BYTES_BIG_ENDIAN) + emit_insn (gen_aarch64_hn2_insn_be (operands[0], + operands[1], operands[2], operands[3])); + else + emit_insn (gen_aarch64_hn2_insn_le (operands[0], + operands[1], operands[2], operands[3])); + DONE; + } +) + ;; pmul. (define_insn "aarch64_pmul" diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index f133bfd..29ce669 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -514,10 +514,6 @@ UNSPEC_RADDHN ; Used in aarch64-simd.md. UNSPEC_SUBHN ; Used in aarch64-simd.md. UNSPEC_RSUBHN ; Used in aarch64-simd.md. - UNSPEC_ADDHN2 ; Used in aarch64-simd.md. - UNSPEC_RADDHN2 ; Used in aarch64-simd.md. - UNSPEC_SUBHN2 ; Used in aarch64-simd.md. - UNSPEC_RSUBHN2 ; Used in aarch64-simd.md. UNSPEC_SQDMULH ; Used in aarch64-simd.md. UNSPEC_SQRDMULH ; Used in aarch64-simd.md. UNSPEC_PMUL ; Used in aarch64-simd.md. @@ -2241,9 +2237,6 @@ (define_int_iterator ADDSUBHN [UNSPEC_ADDHN UNSPEC_RADDHN UNSPEC_SUBHN UNSPEC_RSUBHN]) -(define_int_iterator ADDSUBHN2 [UNSPEC_ADDHN2 UNSPEC_RADDHN2 - UNSPEC_SUBHN2 UNSPEC_RSUBHN2]) - (define_int_iterator FMAXMIN_UNS [UNSPEC_FMAX UNSPEC_FMIN UNSPEC_FMAXNM UNSPEC_FMINNM]) @@ -3000,8 +2993,6 @@ (UNSPEC_SABDL2 "s") (UNSPEC_UABDL2 "u") (UNSPEC_SADALP "s") (UNSPEC_UADALP "u") (UNSPEC_SUBHN "") (UNSPEC_RSUBHN "r") - (UNSPEC_ADDHN2 "") (UNSPEC_RADDHN2 "r") - (UNSPEC_SUBHN2 "") (UNSPEC_RSUBHN2 "r") (UNSPEC_USQADD "us") (UNSPEC_SUQADD "su") (UNSPEC_SSLI "s") (UNSPEC_USLI "u") (UNSPEC_SSRI "s") (UNSPEC_USRI "u") @@ -3064,11 +3055,7 @@ (UNSPEC_ADDHN "add") (UNSPEC_SUBHN "sub") (UNSPEC_RADDHN "add") - (UNSPEC_RSUBHN "sub") - (UNSPEC_ADDHN2 "add") - (UNSPEC_SUBHN2 "sub") - (UNSPEC_RADDHN2 "add") - (UNSPEC_RSUBHN2 "sub")]) + (UNSPEC_RSUBHN "sub")]) ;; BSL variants: first commutative operand. (define_int_attr bsl_1st [(1 "w") (2 "0")]) -- cgit v1.1 From 4e26303e0b90038473e3d7490dc0369a74866b1b Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Thu, 4 Mar 2021 12:36:09 +0000 Subject: aarch64: Relax aarch64_qshrn2_n RTL pattern Implement saturating right-shift and narrow high Neon intrinsic RTL patterns using a vec_concat of a register_operand and a VQSHRN_N unspec - instead of just a VQSHRN_N unspec. This more relaxed pattern allows for more aggressive combinations and ultimately better code generation. gcc/ChangeLog: 2021-03-04 Jonathan Wright * config/aarch64/aarch64-simd.md (aarch64_qshrn2_n): Implement as an expand emitting a big/little endian instruction pattern. (aarch64_qshrn2_n_insn_le): Define. (aarch64_qshrn2_n_insn_be): Define. 
--- gcc/config/aarch64/aarch64-simd.md | 49 +++++++++++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 6 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 1efc854..5473d61 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -6054,17 +6054,54 @@ [(set_attr "type" "neon_sat_shift_imm_narrow_q")] ) -(define_insn "aarch64_qshrn2_n" +(define_insn "aarch64_qshrn2_n_insn_le" [(set (match_operand: 0 "register_operand" "=w") - (unspec: [(match_operand: 1 "register_operand" "0") - (match_operand:VQN 2 "register_operand" "w") - (match_operand:SI 3 "aarch64_simd_shift_imm_offset_" "i")] - VQSHRN_N))] - "TARGET_SIMD" + (vec_concat: + (match_operand: 1 "register_operand" "0") + (unspec: [(match_operand:VQN 2 "register_operand" "w") + (match_operand:VQN 3 + "aarch64_simd_shift_imm_vec_")] + VQSHRN_N)))] + "TARGET_SIMD && !BYTES_BIG_ENDIAN" "qshrn2\\t%0., %2., %3" [(set_attr "type" "neon_sat_shift_imm_narrow_q")] ) +(define_insn "aarch64_qshrn2_n_insn_be" + [(set (match_operand: 0 "register_operand" "=w") + (vec_concat: + (unspec: [(match_operand:VQN 2 "register_operand" "w") + (match_operand:VQN 3 + "aarch64_simd_shift_imm_vec_")] + VQSHRN_N) + (match_operand: 1 "register_operand" "0")))] + "TARGET_SIMD && BYTES_BIG_ENDIAN" + "qshrn2\\t%0., %2., %3" + [(set_attr "type" "neon_sat_shift_imm_narrow_q")] +) + +(define_expand "aarch64_qshrn2_n" + [(match_operand: 0 "register_operand") + (match_operand: 1 "register_operand") + (unspec: + [(match_operand:VQN 2 "register_operand") + (match_operand:SI 3 "aarch64_simd_shift_imm_offset_")] + VQSHRN_N)] + "TARGET_SIMD" + { + operands[3] = aarch64_simd_gen_const_vector_dup (mode, + INTVAL (operands[3])); + + if (BYTES_BIG_ENDIAN) + emit_insn (gen_aarch64_qshrn2_n_insn_be (operands[0], + operands[1], operands[2], operands[3])); + else + emit_insn (gen_aarch64_qshrn2_n_insn_le (operands[0], + operands[1], operands[2], operands[3])); + DONE; + } +) + ;; cm(eq|ge|gt|lt|le) ;; Note, we have constraints for Dz and Z as different expanders -- cgit v1.1 From 778ac63fe244b63380bd3b2dee4d20ff27332bce Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Fri, 14 May 2021 14:49:47 +0100 Subject: aarch64: Relax aarch64_sqxtun2 RTL pattern Use UNSPEC_SQXTUN instead of UNSPEC_SQXTUN2 in aarch64_sqxtun2 patterns. This allows for more more aggressive combinations and ultimately better code generation. The now redundant UNSPEC_SQXTUN2 is removed. gcc/ChangeLog: 2021-05-14 Jonathn Wright * config/aarch64/aarch64-simd.md: Use UNSPEC_SQXTUN instead of UNSPEC_SQXTUN2. * config/aarch64/iterators.md: Remove UNSPEC_SQXTUN2. --- gcc/config/aarch64/aarch64-simd.md | 6 +++--- gcc/config/aarch64/iterators.md | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 5473d61..7952309 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -4859,7 +4859,7 @@ (vec_concat: (match_operand: 1 "register_operand" "0") (unspec: - [(match_operand:VQN 2 "register_operand" "w")] UNSPEC_SQXTUN2)))] + [(match_operand:VQN 2 "register_operand" "w")] UNSPEC_SQXTUN)))] "TARGET_SIMD && !BYTES_BIG_ENDIAN" "sqxtun2\\t%0., %2." 
[(set_attr "type" "neon_sat_shift_imm_narrow_q")] @@ -4869,7 +4869,7 @@ [(set (match_operand: 0 "register_operand" "=w") (vec_concat: (unspec: - [(match_operand:VQN 2 "register_operand" "w")] UNSPEC_SQXTUN2) + [(match_operand:VQN 2 "register_operand" "w")] UNSPEC_SQXTUN) (match_operand: 1 "register_operand" "0")))] "TARGET_SIMD && BYTES_BIG_ENDIAN" "sqxtun2\\t%0., %2." @@ -4880,7 +4880,7 @@ [(match_operand: 0 "register_operand") (match_operand: 1 "register_operand") (unspec: - [(match_operand:VQN 2 "register_operand")] UNSPEC_SQXTUN2)] + [(match_operand:VQN 2 "register_operand")] UNSPEC_SQXTUN)] "TARGET_SIMD" { if (BYTES_BIG_ENDIAN) diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 29ce669..0ec93b0 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -521,7 +521,6 @@ UNSPEC_USQADD ; Used in aarch64-simd.md. UNSPEC_SUQADD ; Used in aarch64-simd.md. UNSPEC_SQXTUN ; Used in aarch64-simd.md. - UNSPEC_SQXTUN2 ; Used in aarch64-simd.md. UNSPEC_SSRA ; Used in aarch64-simd.md. UNSPEC_USRA ; Used in aarch64-simd.md. UNSPEC_SRSRA ; Used in aarch64-simd.md. -- cgit v1.1 From ddbdb9a384f53419d0e6fbcca2a4534a2668e5f8 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Fri, 14 May 2021 17:18:34 +0100 Subject: aarch64: Refactor aarch64_qshrn_n RTL pattern Split the aarch64_qshrn_n pattern into separate scalar and vector variants. Further split the vector pattern into big/little endian variants that model the zero-high-half semantics of the underlying instruction - allowing for more combinations with the write-to-high-half variant (aarch64_qshrn2_n.) gcc/ChangeLog: 2021-05-14 Jonathan Wright * config/aarch64/aarch64-simd-builtins.def: Split builtin generation for aarch64_qshrn_n pattern into separate scalar and vector generators. * config/aarch64/aarch64-simd.md (aarch64_qshrn_n): Define as an expander and split into... (aarch64_qshrn_n_insn_le): This and... (aarch64_qshrn_n_insn_be): This. * config/aarch64/iterators.md: Define SD_HSDI iterator. --- gcc/config/aarch64/aarch64-simd-builtins.def | 18 ++++++---- gcc/config/aarch64/aarch64-simd.md | 54 +++++++++++++++++++++++++++- gcc/config/aarch64/iterators.md | 3 ++ 3 files changed, 68 insertions(+), 7 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index 1e81bb5..18baa67 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -421,12 +421,18 @@ BUILTIN_VQW (SHIFTIMM, sshll2_n, 0, NONE) BUILTIN_VQW (SHIFTIMM, ushll2_n, 0, NONE) /* Implemented by aarch64_qshrn_n. 
*/ - BUILTIN_VSQN_HSDI (SHIFTIMM, sqshrun_n, 0, NONE) - BUILTIN_VSQN_HSDI (SHIFTIMM, sqrshrun_n, 0, NONE) - BUILTIN_VSQN_HSDI (SHIFTIMM, sqshrn_n, 0, NONE) - BUILTIN_VSQN_HSDI (USHIFTIMM, uqshrn_n, 0, NONE) - BUILTIN_VSQN_HSDI (SHIFTIMM, sqrshrn_n, 0, NONE) - BUILTIN_VSQN_HSDI (USHIFTIMM, uqrshrn_n, 0, NONE) + BUILTIN_VQN (SHIFTIMM, sqshrun_n, 0, NONE) + BUILTIN_VQN (SHIFTIMM, sqrshrun_n, 0, NONE) + BUILTIN_VQN (SHIFTIMM, sqshrn_n, 0, NONE) + BUILTIN_VQN (USHIFTIMM, uqshrn_n, 0, NONE) + BUILTIN_VQN (SHIFTIMM, sqrshrn_n, 0, NONE) + BUILTIN_VQN (USHIFTIMM, uqrshrn_n, 0, NONE) + BUILTIN_SD_HSDI (SHIFTIMM, sqshrun_n, 0, NONE) + BUILTIN_SD_HSDI (SHIFTIMM, sqrshrun_n, 0, NONE) + BUILTIN_SD_HSDI (SHIFTIMM, sqshrn_n, 0, NONE) + BUILTIN_SD_HSDI (USHIFTIMM, uqshrn_n, 0, NONE) + BUILTIN_SD_HSDI (SHIFTIMM, sqrshrn_n, 0, NONE) + BUILTIN_SD_HSDI (USHIFTIMM, uqrshrn_n, 0, NONE) /* Implemented by aarch64_qshrn2_n. */ BUILTIN_VQN (SHIFT2IMM_UUSS, sqshrun2_n, 0, NONE) BUILTIN_VQN (SHIFT2IMM_UUSS, sqrshrun2_n, 0, NONE) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 7952309..c67fa3f 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -6045,7 +6045,7 @@ (define_insn "aarch64_qshrn_n" [(set (match_operand: 0 "register_operand" "=w") - (unspec: [(match_operand:VSQN_HSDI 1 "register_operand" "w") + (unspec: [(match_operand:SD_HSDI 1 "register_operand" "w") (match_operand:SI 2 "aarch64_simd_shift_imm_offset_" "i")] VQSHRN_N))] @@ -6054,6 +6054,58 @@ [(set_attr "type" "neon_sat_shift_imm_narrow_q")] ) +(define_insn "aarch64_qshrn_n_insn_le" + [(set (match_operand: 0 "register_operand" "=w") + (vec_concat: + (unspec: + [(match_operand:VQN 1 "register_operand" "w") + (match_operand:VQN 2 "aarch64_simd_shift_imm_vec_")] + VQSHRN_N) + (match_operand: 3 "aarch64_simd_or_scalar_imm_zero")))] + "TARGET_SIMD && !BYTES_BIG_ENDIAN" + "qshrn\\t%0, %1, %2" + [(set_attr "type" "neon_shift_imm_narrow_q")] +) + +(define_insn "aarch64_qshrn_n_insn_be" + [(set (match_operand: 0 "register_operand" "=w") + (vec_concat: + (match_operand: 3 "aarch64_simd_or_scalar_imm_zero") + (unspec: + [(match_operand:VQN 1 "register_operand" "w") + (match_operand:VQN 2 "aarch64_simd_shift_imm_vec_")] + VQSHRN_N)))] + "TARGET_SIMD && BYTES_BIG_ENDIAN" + "qshrn\\t%0, %1, %2" + [(set_attr "type" "neon_shift_imm_narrow_q")] +) + +(define_expand "aarch64_qshrn_n" + [(set (match_operand: 0 "register_operand" "=w") + (unspec: [(match_operand:VQN 1 "register_operand") + (match_operand:SI 2 + "aarch64_simd_shift_imm_offset_")] + VQSHRN_N))] + "TARGET_SIMD" + { + operands[2] = aarch64_simd_gen_const_vector_dup (mode, + INTVAL (operands[2])); + rtx tmp = gen_reg_rtx (mode); + if (BYTES_BIG_ENDIAN) + emit_insn (gen_aarch64_qshrn_n_insn_be (tmp, + operands[1], operands[2], CONST0_RTX (mode))); + else + emit_insn (gen_aarch64_qshrn_n_insn_le (tmp, + operands[1], operands[2], CONST0_RTX (mode))); + + /* The intrinsic expects a narrow result, so emit a subreg that will get + optimized away as appropriate. 
*/ + emit_move_insn (operands[0], lowpart_subreg (mode, tmp, + mode)); + DONE; + } +) + (define_insn "aarch64_qshrn2_n_insn_le" [(set (match_operand: 0 "register_operand" "=w") (vec_concat: diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 0ec93b0..e9047d0 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -282,6 +282,9 @@ ;; Scalar 64-bit container: 16, 32-bit integer modes (define_mode_iterator SD_HSI [HI SI]) +;; Scalar 64-bit container: 16-bit, 32-bit and 64-bit integer modes. +(define_mode_iterator SD_HSDI [HI SI DI]) + ;; Advanced SIMD 64-bit container: 16, 32-bit integer modes. (define_mode_iterator VQ_HSI [V8HI V4SI]) -- cgit v1.1 From 577d5819e0cada818aca975752809d55ccecc6e8 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Sun, 16 May 2021 13:01:47 +0100 Subject: aarch64: Use an expander for quad-word vec_pack_trunc pattern The existing vec_pack_trunc RTL pattern emits an opaque two- instruction assembly code sequence that prevents proper instruction scheduling. This commit changes the pattern to an expander that emits individual xtn and xtn2 instructions. This commit also consolidates the duplicate truncation patterns. gcc/ChangeLog: 2021-05-17 Jonathan Wright * config/aarch64/aarch64-simd.md (aarch64_simd_vec_pack_trunc_): Remove as duplicate of... (aarch64_xtn): This. (aarch64_xtn2_le): Move position in file. (aarch64_xtn2_be): Move position in file. (aarch64_xtn2): Move position in file. (vec_pack_trunc_): Define as an expander. --- gcc/config/aarch64/aarch64-simd.md | 113 ++++++++++++++++++------------------- 1 file changed, 56 insertions(+), 57 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index c67fa3f..447b557 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -1691,14 +1691,51 @@ ;; Narrowing operations. ;; For doubles. -(define_insn "aarch64_simd_vec_pack_trunc_" - [(set (match_operand: 0 "register_operand" "=w") - (truncate: (match_operand:VQN 1 "register_operand" "w")))] - "TARGET_SIMD" - "xtn\\t%0., %1." + +(define_insn "aarch64_xtn" + [(set (match_operand: 0 "register_operand" "=w") + (truncate: (match_operand:VQN 1 "register_operand" "w")))] + "TARGET_SIMD" + "xtn\\t%0., %1." [(set_attr "type" "neon_shift_imm_narrow_q")] ) +(define_insn "aarch64_xtn2_le" + [(set (match_operand: 0 "register_operand" "=w") + (vec_concat: + (match_operand: 1 "register_operand" "0") + (truncate: (match_operand:VQN 2 "register_operand" "w"))))] + "TARGET_SIMD && !BYTES_BIG_ENDIAN" + "xtn2\t%0., %2." + [(set_attr "type" "neon_shift_imm_narrow_q")] +) + +(define_insn "aarch64_xtn2_be" + [(set (match_operand: 0 "register_operand" "=w") + (vec_concat: + (truncate: (match_operand:VQN 2 "register_operand" "w")) + (match_operand: 1 "register_operand" "0")))] + "TARGET_SIMD && BYTES_BIG_ENDIAN" + "xtn2\t%0., %2." 
+ [(set_attr "type" "neon_shift_imm_narrow_q")] +) + +(define_expand "aarch64_xtn2" + [(match_operand: 0 "register_operand") + (match_operand: 1 "register_operand") + (truncate: (match_operand:VQN 2 "register_operand"))] + "TARGET_SIMD" + { + if (BYTES_BIG_ENDIAN) + emit_insn (gen_aarch64_xtn2_be (operands[0], operands[1], + operands[2])); + else + emit_insn (gen_aarch64_xtn2_le (operands[0], operands[1], + operands[2])); + DONE; + } +) + (define_expand "vec_pack_trunc_" [(match_operand: 0 "register_operand") (match_operand:VDN 1 "register_operand") @@ -1711,7 +1748,7 @@ emit_insn (gen_move_lo_quad_ (tempreg, operands[lo])); emit_insn (gen_move_hi_quad_ (tempreg, operands[hi])); - emit_insn (gen_aarch64_simd_vec_pack_trunc_ (operands[0], tempreg)); + emit_insn (gen_aarch64_xtn (operands[0], tempreg)); DONE; }) @@ -1901,20 +1938,25 @@ ;; For quads. -(define_insn "vec_pack_trunc_" - [(set (match_operand: 0 "register_operand" "=&w") +(define_expand "vec_pack_trunc_" + [(set (match_operand: 0 "register_operand") (vec_concat: - (truncate: (match_operand:VQN 1 "register_operand" "w")) - (truncate: (match_operand:VQN 2 "register_operand" "w"))))] + (truncate: (match_operand:VQN 1 "register_operand")) + (truncate: (match_operand:VQN 2 "register_operand"))))] "TARGET_SIMD" { + rtx tmpreg = gen_reg_rtx (mode); + int lo = BYTES_BIG_ENDIAN ? 2 : 1; + int hi = BYTES_BIG_ENDIAN ? 1 : 2; + + emit_insn (gen_aarch64_xtn (tmpreg, operands[lo])); + if (BYTES_BIG_ENDIAN) - return "xtn\\t%0., %2.\;xtn2\\t%0., %1."; + emit_insn (gen_aarch64_xtn2_be (operands[0], tmpreg, operands[hi])); else - return "xtn\\t%0., %1.\;xtn2\\t%0., %2."; + emit_insn (gen_aarch64_xtn2_le (operands[0], tmpreg, operands[hi])); + DONE; } - [(set_attr "type" "multiple") - (set_attr "length" "8")] ) ;; Widening operations. @@ -8570,13 +8612,6 @@ "" ) -(define_expand "aarch64_xtn" - [(set (match_operand: 0 "register_operand" "=w") - (truncate: (match_operand:VQN 1 "register_operand" "w")))] - "TARGET_SIMD" - "" -) - ;; Truncate a 128-bit integer vector to a 64-bit vector. (define_insn "trunc2" [(set (match_operand: 0 "register_operand" "=w") @@ -8586,42 +8621,6 @@ [(set_attr "type" "neon_shift_imm_narrow_q")] ) -(define_insn "aarch64_xtn2_le" - [(set (match_operand: 0 "register_operand" "=w") - (vec_concat: - (match_operand: 1 "register_operand" "0") - (truncate: (match_operand:VQN 2 "register_operand" "w"))))] - "TARGET_SIMD && !BYTES_BIG_ENDIAN" - "xtn2\t%0., %2." - [(set_attr "type" "neon_shift_imm_narrow_q")] -) - -(define_insn "aarch64_xtn2_be" - [(set (match_operand: 0 "register_operand" "=w") - (vec_concat: - (truncate: (match_operand:VQN 2 "register_operand" "w")) - (match_operand: 1 "register_operand" "0")))] - "TARGET_SIMD && BYTES_BIG_ENDIAN" - "xtn2\t%0., %2." 
- [(set_attr "type" "neon_shift_imm_narrow_q")] -) - -(define_expand "aarch64_xtn2" - [(match_operand: 0 "register_operand") - (match_operand: 1 "register_operand") - (truncate: (match_operand:VQN 2 "register_operand"))] - "TARGET_SIMD" - { - if (BYTES_BIG_ENDIAN) - emit_insn (gen_aarch64_xtn2_be (operands[0], operands[1], - operands[2])); - else - emit_insn (gen_aarch64_xtn2_le (operands[0], operands[1], - operands[2])); - DONE; - } -) - (define_insn "aarch64_bfdot" [(set (match_operand:VDQSF 0 "register_operand" "=w") (plus:VDQSF -- cgit v1.1 From 45364338209929542b14b805796f40b71a0fa960 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Tue, 18 May 2021 15:56:53 +0100 Subject: aarch64: Use correct type attributes for RTL generating XTN(2) Use the correct "neon_move_narrow_q" type attribute in RTL patterns that generate XTN/XTN2 instructions. This makes a material difference because these instructions can be executed on both SIMD pipes in the Cortex-A57 core model, whereas the "neon_shift_imm_narrow_q" attribute (in use until now) would suggest to the scheduler that they could only execute on one of the two pipes. gcc/ChangeLog: 2021-05-18 Jonathan Wright * config/aarch64/aarch64-simd.md: Use "neon_move_narrow_q" type attribute in patterns generating XTN(2). --- gcc/config/aarch64/aarch64-simd.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 447b557..e750fae 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -1697,7 +1697,7 @@ (truncate: (match_operand:VQN 1 "register_operand" "w")))] "TARGET_SIMD" "xtn\\t%0., %1." - [(set_attr "type" "neon_shift_imm_narrow_q")] + [(set_attr "type" "neon_move_narrow_q")] ) (define_insn "aarch64_xtn2_le" @@ -1707,7 +1707,7 @@ (truncate: (match_operand:VQN 2 "register_operand" "w"))))] "TARGET_SIMD && !BYTES_BIG_ENDIAN" "xtn2\t%0., %2." - [(set_attr "type" "neon_shift_imm_narrow_q")] + [(set_attr "type" "neon_move_narrow_q")] ) (define_insn "aarch64_xtn2_be" @@ -1717,7 +1717,7 @@ (match_operand: 1 "register_operand" "0")))] "TARGET_SIMD && BYTES_BIG_ENDIAN" "xtn2\t%0., %2." - [(set_attr "type" "neon_shift_imm_narrow_q")] + [(set_attr "type" "neon_move_narrow_q")] ) (define_expand "aarch64_xtn2" @@ -8618,7 +8618,7 @@ (truncate: (match_operand:VQN 1 "register_operand" "w")))] "TARGET_SIMD" "xtn\t%0., %1." - [(set_attr "type" "neon_shift_imm_narrow_q")] + [(set_attr "type" "neon_move_narrow_q")] ) (define_insn "aarch64_bfdot" -- cgit v1.1 From 5b953740da1976d90d974055c6d825c509c6e654 Mon Sep 17 00:00:00 2001 From: Alex Coplan Date: Wed, 19 May 2021 15:52:45 +0100 Subject: arm: Fix ICE with CMSE nonsecure calls on Armv8.1-M [PR100333] As the PR shows, we ICE shortly after expanding nonsecure calls for Armv8.1-M. For Armv8.1-M, we have TARGET_HAVE_FPCXT_CMSE. As it stands, the expander (arm.md:nonsecure_call_internal) moves the callee's address to a register (with copy_to_suggested_reg) only if !TARGET_HAVE_FPCXT_CMSE. However, looking at the pattern which the insn appears to be intended to match (thumb2.md:*nonsecure_call_reg_thumb2_fpcxt), it requires the callee's address to be in a register. This patch therefore just forces the callee's address into a register in the expander. gcc/ChangeLog: PR target/100333 * config/arm/arm.md (nonsecure_call_internal): Always ensure callee's address is in a register. gcc/testsuite/ChangeLog: PR target/100333 * gcc.target/arm/cmse/pr100333.c: New test. 
--- gcc/config/arm/arm.md | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md index 45a471a..0646048 100644 --- a/gcc/config/arm/arm.md +++ b/gcc/config/arm/arm.md @@ -8580,18 +8580,21 @@ (use (match_operand 2 "" "")) (clobber (reg:SI LR_REGNUM))])] "use_cmse" - " { + rtx addr = XEXP (operands[0], 0); + rtx tmp = REG_P (addr) ? addr : force_reg (SImode, addr); + if (!TARGET_HAVE_FPCXT_CMSE) { - rtx tmp = - copy_to_suggested_reg (XEXP (operands[0], 0), - gen_rtx_REG (SImode, R4_REGNUM), - SImode); - - operands[0] = replace_equiv_address (operands[0], tmp); + rtx r4 = gen_rtx_REG (SImode, R4_REGNUM); + emit_move_insn (r4, tmp); + tmp = r4; } - }") + + if (tmp != addr) + operands[0] = replace_equiv_address (operands[0], tmp); + } +) (define_insn "*call_reg_armv5" [(call (mem:SI (match_operand:SI 0 "s_register_operand" "r")) -- cgit v1.1 From 9c5bd1e9811babe255ddbbdcda1d00ea5997b826 Mon Sep 17 00:00:00 2001 From: Kewen Lin Date: Wed, 19 May 2021 05:42:51 -0500 Subject: vect: Replace hardcoded inner loop cost factor This patch is to replace the current hardcoded weight factor 50, which is applied by the loop vectorizer to the cost of statements in an inner loop relative to the loop being vectorized, with one newly added member inner_loop_cost_factor in loop vinfo. It also introduces one parameter vect-inner-loop-cost-factor whose default value is 50, and is used to initialize the inner_loop_cost_factor member. The motivation here is that: if targets want to have one unique function to gather some information in each add_stmt_cost call, no matter that it's put before or after the cost tweaking part for inner loop, it may have the need to adjust (expand or shrink) the gathered data as the factor. Now the factor is hardcoded, it's not easily maintained. Bootstrapped/regtested on powerpc64le-linux-gnu P9, x86_64-redhat-linux and aarch64-linux-gnu. gcc/ChangeLog: * doc/invoke.texi (vect-inner-loop-cost-factor): Document new parameter. * params.opt (vect-inner-loop-cost-factor): New. * targhooks.c (default_add_stmt_cost): Replace hardcoded factor 50 with LOOP_VINFO_INNER_LOOP_COST_FACTOR, include head file tree-vectorizer.h and its required ones. * config/aarch64/aarch64.c (aarch64_add_stmt_cost): Replace hardcoded factor 50 with LOOP_VINFO_INNER_LOOP_COST_FACTOR. * config/arm/arm.c (arm_add_stmt_cost): Likewise. * config/i386/i386.c (ix86_add_stmt_cost): Likewise. * config/rs6000/rs6000.c (rs6000_add_stmt_cost): Likewise. * tree-vect-loop.c (vect_compute_single_scalar_iteration_cost): Likewise. (_loop_vec_info::_loop_vec_info): Init inner_loop_cost_factor. * tree-vectorizer.h (_loop_vec_info): Add inner_loop_cost_factor. (LOOP_VINFO_INNER_LOOP_COST_FACTOR): New macro. --- gcc/config/aarch64/aarch64.c | 5 ++++- gcc/config/arm/arm.c | 6 +++++- gcc/config/i386/i386.c | 6 +++++- gcc/config/rs6000/rs6000.c | 6 +++++- 4 files changed, 19 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 0835646..c1e451e 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -15435,7 +15435,10 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count, arbitrary and could potentially be improved with analysis. 
*/ if (where == vect_body && stmt_info && stmt_in_inner_loop_p (vinfo, stmt_info)) - count *= 50; /* FIXME */ + { + gcc_assert (loop_vinfo); + count *= LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo); /* FIXME */ + } retval = (unsigned) (count * stmt_cost); costs->region[where] += retval; diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 28cfd81..caf4e56 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -12226,7 +12226,11 @@ arm_add_stmt_cost (vec_info *vinfo, void *data, int count, arbitrary and could potentially be improved with analysis. */ if (where == vect_body && stmt_info && stmt_in_inner_loop_p (vinfo, stmt_info)) - count *= 50; /* FIXME. */ + { + loop_vec_info loop_vinfo = dyn_cast (vinfo); + gcc_assert (loop_vinfo); + count *= LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo); /* FIXME. */ + } retval = (unsigned) (count * stmt_cost); cost[where] += retval; diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 743d8a2..f3b4518 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -22482,7 +22482,11 @@ ix86_add_stmt_cost (class vec_info *vinfo, void *data, int count, arbitrary and could potentially be improved with analysis. */ if (where == vect_body && stmt_info && stmt_in_inner_loop_p (vinfo, stmt_info)) - count *= 50; /* FIXME. */ + { + loop_vec_info loop_vinfo = dyn_cast (vinfo); + gcc_assert (loop_vinfo); + count *= LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo); /* FIXME. */ + } retval = (unsigned) (count * stmt_cost); diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 6db450a..dfa517b 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -5364,7 +5364,11 @@ rs6000_add_stmt_cost (class vec_info *vinfo, void *data, int count, arbitrary and could potentially be improved with analysis. */ if (where == vect_body && stmt_info && stmt_in_inner_loop_p (vinfo, stmt_info)) - count *= 50; /* FIXME. */ + { + loop_vec_info loop_vinfo = dyn_cast (vinfo); + gcc_assert (loop_vinfo); + count *= LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo); /* FIXME. */ + } retval = (unsigned) (count * stmt_cost); cost_data->cost[where] += retval; -- cgit v1.1 From 507359e1d4d18614eb9679043995edf0675b6ff5 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 20 May 2021 11:11:21 +0200 Subject: i386: Add mult-high and shift patterns for 4-byte vectors [PR100637] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 2021-05-20 Uroš Bizjak gcc/ PR target/100637 * config/i386/mmx.md (Yv_Yw): Revert adding V4QI and V2HI modes. (*3): Use Yw instad of constrint. (mulv4hi3_highpart): New expander. (*mulv2hi3_highpart): New insn pattern. (mulv2hi3_higpart): New expander. (*v2hi3): New insn pattern. (v2hi3): New expander. * config/i386/sse.md (smulhrsv2hi3): New expander. (*smulhrsv2hi3): New insn pattern. gcc/testsuite/ PR target/100637 * gcc.target/i386/pr100637-1w.c (shl, ashr, lshr): New tests. 
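As an illustration (not taken from the new tests): GNU C generic-vector shifts over a 4-byte (2 x 16-bit) vector, the shape of code the new V2HI shift patterns cover. The highpart-multiply expander is reached through the vectorizer's mul_highpart support rather than plain C, so only the shifts are sketched; type and function names are arbitrary.

  typedef short v2hi __attribute__((vector_size (4)));

  /* With SSE2 these 2x16-bit shifts can now stay in vector registers.  */
  v2hi shl  (v2hi x) { return x << 2; }   /* psllw-style */
  v2hi ashr (v2hi x) { return x >> 3; }   /* psraw-style */
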
--- gcc/config/i386/mmx.md | 78 ++++++++++++++++++++++++++++++++++++++++++++++---- gcc/config/i386/sse.md | 45 +++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+), 5 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index d847978..948ba47 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -78,8 +78,7 @@ [(V2SF "v2si") (V2SI "v2si") (V4HI "v4hi") (V8QI "v8qi")]) (define_mode_attr Yv_Yw - [(V8QI "Yw") (V4QI "Yw") (V4HI "Yw") (V2HI "Yw") - (V2SI "Yv") (V1DI "Yv") (V2SF "Yv")]) + [(V8QI "Yw") (V4HI "Yw") (V2SI "Yv") (V1DI "Yv") (V2SF "Yv")]) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; @@ -1367,10 +1366,10 @@ "ix86_fixup_binary_operands_no_copy (, mode, operands);") (define_insn "*3" - [(set (match_operand:VI_32 0 "register_operand" "=x,") + [(set (match_operand:VI_32 0 "register_operand" "=x,Yw") (plusminus:VI_32 - (match_operand:VI_32 1 "register_operand" "0,") - (match_operand:VI_32 2 "register_operand" "x,")))] + (match_operand:VI_32 1 "register_operand" "0,Yw") + (match_operand:VI_32 2 "register_operand" "x,Yw")))] "TARGET_SSE2 && ix86_binary_operator_ok (, mode, operands)" "@ @@ -1523,6 +1522,51 @@ (set_attr "type" "mmxmul,ssemul,ssemul") (set_attr "mode" "DI,TI,TI")]) +(define_expand "mulv4hi3_highpart" + [(set (match_operand:V4HI 0 "register_operand") + (truncate:V4HI + (lshiftrt:V4SI + (mult:V4SI + (any_extend:V4SI + (match_operand:V4HI 1 "register_operand")) + (any_extend:V4SI + (match_operand:V4HI 2 "register_operand"))) + (const_int 16))))] + "TARGET_MMX_WITH_SSE" + "ix86_fixup_binary_operands_no_copy (MULT, V4HImode, operands);") + +(define_insn "*mulv2hi3_highpart" + [(set (match_operand:V2HI 0 "register_operand" "=x,Yw") + (truncate:V2HI + (lshiftrt:V2SI + (mult:V2SI + (any_extend:V2SI + (match_operand:V2HI 1 "register_operand" "%0,Yw")) + (any_extend:V2SI + (match_operand:V2HI 2 "register_operand" "x,Yw"))) + (const_int 16))))] + "TARGET_SSE2 + && ix86_binary_operator_ok (MULT, V2HImode, operands)" + "@ + pmulhw\t{%2, %0|%0, %2} + vpmulhw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" "noavx,avx") + (set_attr "type" "ssemul") + (set_attr "mode" "TI")]) + +(define_expand "mulv2hi3_highpart" + [(set (match_operand:V2HI 0 "register_operand") + (truncate:V2HI + (lshiftrt:V2SI + (mult:V2SI + (any_extend:V2SI + (match_operand:V2HI 1 "register_operand")) + (any_extend:V2SI + (match_operand:V2HI 2 "register_operand"))) + (const_int 16))))] + "TARGET_SSE2" + "ix86_fixup_binary_operands_no_copy (MULT, V2HImode, operands);") + (define_expand "mmx_pmaddwd" [(set (match_operand:V2SI 0 "register_operand") (plus:V2SI @@ -1817,6 +1861,30 @@ (match_operand:DI 2 "nonmemory_operand")))] "TARGET_MMX_WITH_SSE") +(define_insn "*v2hi3" + [(set (match_operand:V2HI 0 "register_operand" "=x,Yw") + (any_shift:V2HI + (match_operand:V2HI 1 "register_operand" "0,Yw") + (match_operand:DI 2 "nonmemory_operand" "xN,YwN")))] + "TARGET_SSE2" + "@ + pw\t{%2, %0|%0, %2} + vpw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" "noavx,avx") + (set_attr "type" "sseishft") + (set (attr "length_immediate") + (if_then_else (match_operand 2 "const_int_operand") + (const_string "1") + (const_string "0"))) + (set_attr "mode" "TI")]) + +(define_expand "v2hi3" + [(set (match_operand:V2HI 0 "register_operand") + (any_shift:V2HI + (match_operand:V2HI 1 "register_operand") + (match_operand:DI 2 "nonmemory_operand")))] + "TARGET_SSE2") + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; Parallel integral 
comparisons diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index a4503dd..0f1108f 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -17239,6 +17239,51 @@ (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)")) (set_attr "mode" "DI,TI,TI")]) +(define_expand "smulhrsv2hi3" + [(set (match_operand:V2HI 0 "register_operand") + (truncate:V2HI + (lshiftrt:V2SI + (plus:V2SI + (lshiftrt:V2SI + (mult:V2SI + (sign_extend:V2SI + (match_operand:V2HI 1 "register_operand")) + (sign_extend:V2SI + (match_operand:V2HI 2 "register_operand"))) + (const_int 14)) + (match_dup 3)) + (const_int 1))))] + "TARGET_SSSE3" +{ + operands[3] = CONST1_RTX(V2HImode); + ix86_fixup_binary_operands_no_copy (MULT, V2HImode, operands); +}) + +(define_insn "*smulhrsv2hi3" + [(set (match_operand:V2HI 0 "register_operand" "=x,Yv") + (truncate:V2HI + (lshiftrt:V2SI + (plus:V2SI + (lshiftrt:V2SI + (mult:V2SI + (sign_extend:V2SI + (match_operand:V2HI 1 "register_operand" "%0,Yv")) + (sign_extend:V2SI + (match_operand:V2HI 2 "register_operand" "x,Yv"))) + (const_int 14)) + (match_operand:V2HI 3 "const1_operand")) + (const_int 1))))] + "TARGET_SSSE3 + && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + "@ + pmulhrsw\t{%2, %0|%0, %2} + vpmulhrsw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" "noavx,avx") + (set_attr "type" "sseimul") + (set_attr "prefix_extra" "1") + (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)")) + (set_attr "mode" "TI")]) + (define_insn "_pshufb3" [(set (match_operand:VI1_AVX512 0 "register_operand" "=x,") (unspec:VI1_AVX512 -- cgit v1.1 From 928e4ff1aea45165efbdc8ba6e409b191c2dde6c Mon Sep 17 00:00:00 2001 From: Kewen Lin Date: Thu, 20 May 2021 05:16:19 -0500 Subject: arm: Fix build failure by adding includes Commit r12-939 missed to add the required include files for the newly used type loop_vec_info. This patch is to add the include file "tree-vectorizer.h" which defines loop_vec_info and its required include file "cfgloop.h". gcc/ChangeLog: * config/arm/arm.c: Include head files tree-vectorizer.h and cfgloop.h. --- gcc/config/arm/arm.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index caf4e56..9377aae 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -32,6 +32,7 @@ #include "tree.h" #include "memmodel.h" #include "cfghooks.h" +#include "cfgloop.h" #include "df.h" #include "tm_p.h" #include "stringpool.h" @@ -69,6 +70,7 @@ #include "gimplify.h" #include "gimple.h" #include "selftest.h" +#include "tree-vectorizer.h" /* This file should be included last. */ #include "target-def.h" -- cgit v1.1 From a71f55c482ada2c6c31d450ac22494b547512127 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 20 May 2021 18:48:16 +0200 Subject: i386: Avoid integer logic insns for 32bit and 64bit vector modes [PR100701] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Integer logic instructions clobber flags, do not use them for 32bit and 64bit vector modes. 2021-05-20 Uroš Bizjak gcc/ PR target/100701 * config/i386/i386.md (isa): Remove x64_bmi. (enabled): Remove x64_bmi. * config/i386/mmx.md (mmx_andnot3): Remove general register alternative. (*andnot3): Ditto. (*mmx_3): Ditto. (*3): Ditto. gcc/testsuite/ PR target/100701 * gcc.target/i386/pr100701.c: New test. 
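As a hedged illustration (not the PR testcase): an 8-byte generic-vector and-not of the kind affected. After this change it is expected to be emitted with pandn/vpandn rather than the general-register andn alternative, which would clobber the flags register.

  typedef char v8qi __attribute__((vector_size (8)));

  /* And-not on a 64-bit vector; keeping it in SSE registers avoids
     clobbering EFLAGS with a GPR andn.  */
  v8qi
  andnot (v8qi a, v8qi b)
  {
    return ~a & b;
  }
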
--- gcc/config/i386/i386.md | 4 +--- gcc/config/i386/mmx.md | 60 +++++++++++++++++++++---------------------------- 2 files changed, 27 insertions(+), 37 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 2fc8fae..960ecbd 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -815,7 +815,7 @@ (define_attr "movu" "0,1" (const_string "0")) ;; Used to control the "enabled" attribute on a per-instruction basis. -(define_attr "isa" "base,x64,nox64,x64_bmi,x64_sse2,x64_sse4,x64_sse4_noavx, +(define_attr "isa" "base,x64,nox64,x64_sse2,x64_sse4,x64_sse4_noavx, x64_avx,x64_avx512bw,x64_avx512dq, sse_noavx,sse2,sse2_noavx,sse3,sse3_noavx,sse4,sse4_noavx, avx,noavx,avx2,noavx2,bmi,bmi2,fma4,fma,avx512f,noavx512f, @@ -831,8 +831,6 @@ (define_attr "enabled" "" (cond [(eq_attr "isa" "x64") (symbol_ref "TARGET_64BIT") (eq_attr "isa" "nox64") (symbol_ref "!TARGET_64BIT") - (eq_attr "isa" "x64_bmi") - (symbol_ref "TARGET_64BIT && TARGET_BMI") (eq_attr "isa" "x64_sse2") (symbol_ref "TARGET_64BIT && TARGET_SSE2") (eq_attr "isa" "x64_sse4") diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 948ba47..baeed04 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -2055,40 +2055,34 @@ "operands[2] = force_reg (mode, CONSTM1_RTX (mode));") (define_insn "mmx_andnot3" - [(set (match_operand:MMXMODEI 0 "register_operand" "=y,r,x,x,v") + [(set (match_operand:MMXMODEI 0 "register_operand" "=y,x,x,v") (and:MMXMODEI - (not:MMXMODEI (match_operand:MMXMODEI 1 "register_operand" - "0,r,0,x,v")) - (match_operand:MMXMODEI 2 "register_mmxmem_operand" - "ym,r,x,x,v")))] + (not:MMXMODEI (match_operand:MMXMODEI 1 "register_operand" "0,0,x,v")) + (match_operand:MMXMODEI 2 "register_mmxmem_operand" "ym,x,x,v")))] "TARGET_MMX || TARGET_MMX_WITH_SSE" "@ pandn\t{%2, %0|%0, %2} - andn\t{%2, %1, %0|%0, %1, %2} pandn\t{%2, %0|%0, %2} vpandn\t{%2, %1, %0|%0, %1, %2} vpandnd\t{%2, %1, %0|%0, %1, %2}" - [(set_attr "isa" "*,x64_bmi,sse2_noavx,avx,avx512vl") - (set_attr "mmx_isa" "native,*,*,*,*") - (set_attr "type" "mmxadd,bitmanip,sselog,sselog,sselog") - (set_attr "btver2_decode" "*,direct,*,*,*") - (set_attr "mode" "DI,DI,TI,TI,TI")]) + [(set_attr "isa" "*,sse2_noavx,avx,avx512vl") + (set_attr "mmx_isa" "native,*,*,*") + (set_attr "type" "mmxadd,sselog,sselog,sselog") + (set_attr "mode" "DI,TI,TI,TI")]) (define_insn "*andnot3" - [(set (match_operand:VI_32 0 "register_operand" "=r,x,x,v") + [(set (match_operand:VI_32 0 "register_operand" "=x,x,v") (and:VI_32 - (not:VI_32 (match_operand:VI_32 1 "register_operand" "r,0,x,v")) - (match_operand:VI_32 2 "register_operand" "r,x,x,v")))] + (not:VI_32 (match_operand:VI_32 1 "register_operand" "0,x,v")) + (match_operand:VI_32 2 "register_operand" "x,x,v")))] "TARGET_SSE2" "@ - andn\t{%2, %1, %0|%0, %1, %2} pandn\t{%2, %0|%0, %2} vpandn\t{%2, %1, %0|%0, %1, %2} vpandnd\t{%2, %1, %0|%0, %1, %2}" - [(set_attr "isa" "bmi,noavx,avx,avx512vl") - (set_attr "type" "bitmanip,sselog,sselog,sselog") - (set_attr "btver2_decode" "direct,*,*,*") - (set_attr "mode" "SI,TI,TI,TI")]) + [(set_attr "isa" "noavx,avx,avx512vl") + (set_attr "type" "sselog") + (set_attr "mode" "TI")]) (define_expand "mmx_3" [(set (match_operand:MMXMODEI 0 "register_operand") @@ -2107,22 +2101,21 @@ "ix86_fixup_binary_operands_no_copy (, mode, operands);") (define_insn "*mmx_3" - [(set (match_operand:MMXMODEI 0 "register_operand" "=y,r,x,x,v") + [(set (match_operand:MMXMODEI 0 "register_operand" "=y,x,x,v") (any_logic:MMXMODEI - 
(match_operand:MMXMODEI 1 "register_mmxmem_operand" "%0,0,0,x,v") - (match_operand:MMXMODEI 2 "register_mmxmem_operand" "ym,r,x,x,v")))] + (match_operand:MMXMODEI 1 "register_mmxmem_operand" "%0,0,x,v") + (match_operand:MMXMODEI 2 "register_mmxmem_operand" "ym,x,x,v")))] "(TARGET_MMX || TARGET_MMX_WITH_SSE) && ix86_binary_operator_ok (, mode, operands)" "@ p\t{%2, %0|%0, %2} - \t{%2, %0|%0, %2} p\t{%2, %0|%0, %2} vp\t{%2, %1, %0|%0, %1, %2} vpd\t{%2, %1, %0|%0, %1, %2}" - [(set_attr "isa" "*,x64,sse2_noavx,avx,avx512vl") - (set_attr "mmx_isa" "native,*,*,*,*") - (set_attr "type" "mmxadd,alu,sselog,sselog,sselog") - (set_attr "mode" "DI,DI,TI,TI,TI")]) + [(set_attr "isa" "*,sse2_noavx,avx,avx512vl") + (set_attr "mmx_isa" "native,*,*,*") + (set_attr "type" "mmxadd,sselog,sselog,sselog") + (set_attr "mode" "DI,TI,TI,TI")]) (define_expand "3" [(set (match_operand:VI_32 0 "register_operand") @@ -2133,20 +2126,19 @@ "ix86_fixup_binary_operands_no_copy (, mode, operands);") (define_insn "*3" - [(set (match_operand:VI_32 0 "register_operand" "=r,x,x,v") + [(set (match_operand:VI_32 0 "register_operand" "=x,x,v") (any_logic:VI_32 - (match_operand:VI_32 1 "register_operand" "%0,0,x,v") - (match_operand:VI_32 2 "register_operand" "r,x,x,v")))] + (match_operand:VI_32 1 "register_operand" "%0,x,v") + (match_operand:VI_32 2 "register_operand" "x,x,v")))] "TARGET_SSE2 && ix86_binary_operator_ok (, mode, operands)" "@ - \t{%2, %0|%0, %2} p\t{%2, %0|%0, %2} vp\t{%2, %1, %0|%0, %1, %2} vpd\t{%2, %1, %0|%0, %1, %2}" - [(set_attr "isa" "*,noavx,avx,avx512vl") - (set_attr "type" "alu,sselog,sselog,sselog") - (set_attr "mode" "SI,TI,TI,TI")]) + [(set_attr "isa" "noavx,avx,avx512vl") + (set_attr "type" "sselog") + (set_attr "mode" "TI")]) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; -- cgit v1.1 From 66168f96f07b12bbe0beb6e0e988818f624d56bd Mon Sep 17 00:00:00 2001 From: Indu Bhagat Date: Fri, 30 Apr 2021 08:03:52 -0700 Subject: dwarf: new dwarf_debuginfo_p predicate This patch introduces a dwarf_debuginfo_p predicate that abstracts and replaces complex checks on write_symbols. gcc/c-family/ChangeLog: * c-lex.c (init_c_lex): Use dwarf_debuginfo_p. gcc/ChangeLog: * config/c6x/c6x.c (c6x_output_file_unwind): Use dwarf_debuginfo_p. * config/darwin.c (darwin_override_options): Likewise. * config/i386/cygming.h (DBX_REGISTER_NUMBER): Likewise. * config/i386/darwin.h (DBX_REGISTER_NUMBER): Likewise. (DWARF2_FRAME_REG_OUT): Likewise. * config/mips/mips.c (mips_output_filename): Likewise. * config/rs6000/rs6000.c (rs6000_xcoff_declare_function_name): Likewise. (rs6000_dbx_register_number): Likewise. * dbxout.c: Include flags.h. * dwarf2cfi.c (cfi_label_required_p): Likewise. (dwarf2out_do_frame): Likewise. * except.c: Include flags.h. * final.c (dwarf2_debug_info_emitted_p): Likewise. (final_scan_insn_1): Likewise. * flags.h (dwarf_debuginfo_p): New function declaration. * opts.c (dwarf_debuginfo_p): New function definition. * targhooks.c (default_debug_unwind_info): Use dwarf_debuginfo_p. * toplev.c (process_options): Likewise. 
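The patch below only shows the call sites; as a hedged reconstruction, inferred from the conditions it replaces in the c6x and darwin hunks, the new predicate presumably looks roughly like the sketch here. The exact definition lives in opts.c and may differ; write_symbols, DWARF2_DEBUG and VMS_AND_DWARF2_DEBUG are existing GCC globals/enumerators.

  /* Sketch of the flags.h/opts.c predicate this patch introduces.  */
  bool
  dwarf_debuginfo_p (void)
  {
    return (write_symbols == DWARF2_DEBUG
            || write_symbols == VMS_AND_DWARF2_DEBUG);
  }
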
--- gcc/config/c6x/c6x.c | 4 ++-- gcc/config/darwin.c | 3 ++- gcc/config/i386/cygming.h | 2 +- gcc/config/i386/darwin.h | 4 ++-- gcc/config/mips/mips.c | 3 ++- gcc/config/rs6000/rs6000.c | 4 ++-- 6 files changed, 11 insertions(+), 9 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/c6x/c6x.c b/gcc/config/c6x/c6x.c index e7e1d6c..ce49495 100644 --- a/gcc/config/c6x/c6x.c +++ b/gcc/config/c6x/c6x.c @@ -59,6 +59,7 @@ #include "regrename.h" #include "dumpfile.h" #include "builtins.h" +#include "flags.h" /* This file should be included last. */ #include "target-def.h" @@ -439,8 +440,7 @@ c6x_output_file_unwind (FILE * f) { if (flag_unwind_tables || flag_exceptions) { - if (write_symbols == DWARF2_DEBUG - || write_symbols == VMS_AND_DWARF2_DEBUG) + if (dwarf_debuginfo_p ()) asm_fprintf (f, "\t.cfi_sections .debug_frame, .c6xabi.exidx\n"); else asm_fprintf (f, "\t.cfi_sections .c6xabi.exidx\n"); diff --git a/gcc/config/darwin.c b/gcc/config/darwin.c index c4016fe..b160c23 100644 --- a/gcc/config/darwin.c +++ b/gcc/config/darwin.c @@ -46,6 +46,7 @@ along with GCC; see the file COPYING3. If not see #include "lto-section-names.h" #include "intl.h" #include "optabs.h" +#include "flags.h" /* Fix and Continue. @@ -3347,7 +3348,7 @@ darwin_override_options (void) && generating_for_darwin_version >= 9 && (flag_gtoggle ? (debug_info_level == DINFO_LEVEL_NONE) : (debug_info_level >= DINFO_LEVEL_NORMAL)) - && write_symbols == DWARF2_DEBUG) + && dwarf_debuginfo_p ()) flag_var_tracking_uninit = flag_var_tracking; /* Final check on PCI options; for Darwin these are not dependent on the PIE diff --git a/gcc/config/i386/cygming.h b/gcc/config/i386/cygming.h index cfbca34..ac458cd 100644 --- a/gcc/config/i386/cygming.h +++ b/gcc/config/i386/cygming.h @@ -82,7 +82,7 @@ along with GCC; see the file COPYING3. If not see #undef DBX_REGISTER_NUMBER #define DBX_REGISTER_NUMBER(n) \ (TARGET_64BIT ? dbx64_register_map[n] \ - : (write_symbols == DWARF2_DEBUG \ + : (dwarf_debuginfo_p () \ ? svr4_dbx_register_map[n] : dbx_register_map[n])) /* Map gcc register number to DWARF 2 CFA column number. For 32 bit diff --git a/gcc/config/i386/darwin.h b/gcc/config/i386/darwin.h index afa9f1b..5312003 100644 --- a/gcc/config/i386/darwin.h +++ b/gcc/config/i386/darwin.h @@ -275,13 +275,13 @@ along with GCC; see the file COPYING3. If not see #undef DBX_REGISTER_NUMBER #define DBX_REGISTER_NUMBER(n) \ (TARGET_64BIT ? dbx64_register_map[n] \ - : write_symbols == DWARF2_DEBUG ? svr4_dbx_register_map[n] \ + : dwarf_debuginfo_p () ? svr4_dbx_register_map[n] \ : dbx_register_map[n]) /* Unfortunately, the 32-bit EH information also doesn't use the standard DWARF register numbers. */ #define DWARF2_FRAME_REG_OUT(n, for_eh) \ - (! (for_eh) || write_symbols != DWARF2_DEBUG || TARGET_64BIT ? (n) \ + (! (for_eh) || !dwarf_debuginfo_p () || TARGET_64BIT ? (n) \ : (n) == 5 ? 4 \ : (n) == 4 ? 5 \ : (n) >= 11 && (n) <= 18 ? (n) + 1 \ diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c index e5ba273..1f1475c 100644 --- a/gcc/config/mips/mips.c +++ b/gcc/config/mips/mips.c @@ -65,6 +65,7 @@ along with GCC; see the file COPYING3. If not see #include "context.h" #include "builtins.h" #include "rtl-iter.h" +#include "flags.h" /* This file should be included last. */ #include "target-def.h" @@ -9489,7 +9490,7 @@ mips_output_filename (FILE *stream, const char *name) { /* If we are emitting DWARF-2, let dwarf2out handle the ".file" directives. 
*/ - if (write_symbols == DWARF2_DEBUG) + if (dwarf_debuginfo_p ()) return; else if (mips_output_filename_first_time) { diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index dfa517b..af3cc90 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -21588,7 +21588,7 @@ rs6000_xcoff_declare_function_name (FILE *file, const char *name, tree decl) { if (write_symbols == DBX_DEBUG || write_symbols == XCOFF_DEBUG) xcoffout_declare_function (file, decl, buffer); - else if (write_symbols == DWARF2_DEBUG) + else if (dwarf_debuginfo_p ()) { name = (*targetm.strip_name_encoding) (name); fprintf (file, "\t.function .%s,.%s,2,0\n", name, name); @@ -23747,7 +23747,7 @@ rs6000_dbx_register_number (unsigned int regno, unsigned int format) { /* On some platforms, we use the standard DWARF register numbering for .debug_info and .debug_frame. */ - if ((format == 0 && write_symbols == DWARF2_DEBUG) || format == 1) + if ((format == 0 && dwarf_debuginfo_p ()) || format == 1) { #ifdef RS6000_USE_DWARF_NUMBERING if (regno <= 31) -- cgit v1.1 From dcde81134cb24da8e261a4346c806c676297922b Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Fri, 21 May 2021 08:01:34 +0200 Subject: i386: Add minmax and abs patterns for 4-byte vectors [PR100637] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 2021-05-21 Uroš Bizjak gcc/ PR target/100637 * config/i386/mmx.md (SMAXMIN_MMXMODEI): New mode iterator. (3): Macroize expander from v4hi3> and 3 using SMAXMIN_MMXMODEI mode iterator. (*v4qi3): New insn pattern. (*v2hi3): Ditto. (SMAXMIN_VI_32): New mode iterator. (mode3): New expander. (UMAXMIN_MMXMODEI): New mode iterator. (3): Macroize expander from v8qi3> and 3 using UMAXMIN_MMXMODEI mode iterator. (*v4qi3): New insn pattern. (*v2hi3): Ditto. (UMAXMIN_VI_32): New mode iterator. (mode3): New expander. (abs2): New insn pattern. (ssse3_abs2, abs2): Move from ... * config/i386/sse.md: ... here. 
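As an illustration (not from the patch): GNU C generic-vector max/min/abs over four signed bytes, the shape of code the new V4QI/V2HI smax/smin/umax/umin and abs patterns target (pmaxsb/pminsb/pabsb-style instructions once SSE4.1/SSSE3 are available). Names are arbitrary.

  typedef signed char v4qi __attribute__((vector_size (4)));

  v4qi vmax (v4qi a, v4qi b) { return a > b ? a : b; }   /* smax */
  v4qi vmin (v4qi a, v4qi b) { return a < b ? a : b; }   /* smin */
  v4qi vabs (v4qi a)
  {
    v4qi zero = { 0 };
    return a < zero ? -a : a;                            /* abs  */
  }
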
--- gcc/config/i386/mmx.md | 173 +++++++++++++++++++++++++++++++++++++++++-------- gcc/config/i386/sse.md | 21 ------ 2 files changed, 145 insertions(+), 49 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index baeed04..5e92be3 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -1691,13 +1691,11 @@ (set_attr "type" "mmxmul,ssemul,ssemul") (set_attr "mode" "DI,TI,TI")]) -(define_expand "3" - [(set (match_operand:MMXMODE14 0 "register_operand") - (smaxmin:MMXMODE14 - (match_operand:MMXMODE14 1 "register_operand") - (match_operand:MMXMODE14 2 "register_operand")))] - "TARGET_MMX_WITH_SSE && TARGET_SSE4_1" - "ix86_fixup_binary_operands_no_copy (, mode, operands);") +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; Parallel integral shifts +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (define_insn "*mmx_3" [(set (match_operand:MMXMODE14 0 "register_operand" "=Yr,*x,Yv") @@ -1725,14 +1723,6 @@ && (TARGET_SSE || TARGET_3DNOW_A)" "ix86_fixup_binary_operands_no_copy (, V4HImode, operands);") -(define_expand "v4hi3" - [(set (match_operand:V4HI 0 "register_operand") - (smaxmin:V4HI - (match_operand:V4HI 1 "register_operand") - (match_operand:V4HI 2 "register_operand")))] - "TARGET_MMX_WITH_SSE" - "ix86_fixup_binary_operands_no_copy (, V4HImode, operands);") - (define_insn "*mmx_v4hi3" [(set (match_operand:V4HI 0 "register_operand" "=y,x,Yw") (smaxmin:V4HI @@ -1750,14 +1740,58 @@ (set_attr "type" "mmxadd,sseiadd,sseiadd") (set_attr "mode" "DI,TI,TI")]) +(define_mode_iterator SMAXMIN_MMXMODEI + [(V8QI "TARGET_SSE4_1") V4HI (V2SI "TARGET_SSE4_1")]) + (define_expand "3" - [(set (match_operand:MMXMODE24 0 "register_operand") - (umaxmin:MMXMODE24 - (match_operand:MMXMODE24 1 "register_operand") - (match_operand:MMXMODE24 2 "register_operand")))] - "TARGET_MMX_WITH_SSE && TARGET_SSE4_1" + [(set (match_operand:SMAXMIN_MMXMODEI 0 "register_operand") + (smaxmin:SMAXMIN_MMXMODEI + (match_operand:SMAXMIN_MMXMODEI 1 "register_operand") + (match_operand:SMAXMIN_MMXMODEI 2 "register_operand")))] + "TARGET_MMX_WITH_SSE" "ix86_fixup_binary_operands_no_copy (, mode, operands);") +(define_insn "*v4qi3" + [(set (match_operand:V4QI 0 "register_operand" "=Yr,*x,Yv") + (smaxmin:V4QI + (match_operand:V4QI 1 "register_operand" "%0,0,Yv") + (match_operand:V4QI 2 "register_operand" "Yr,*x,Yv")))] + "TARGET_SSE4_1 + && ix86_binary_operator_ok (, V4QImode, operands)" + "@ + pb\t{%2, %0|%0, %2} + pb\t{%2, %0|%0, %2} + vpb\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" "noavx,noavx,avx") + (set_attr "type" "sseiadd") + (set_attr "prefix_extra" "1,1,*") + (set_attr "prefix" "orig,orig,vex") + (set_attr "mode" "TI")]) + +(define_insn "*v2hi3" + [(set (match_operand:V2HI 0 "register_operand" "=x,Yw") + (smaxmin:V2HI + (match_operand:V2HI 1 "register_operand" "%0,Yw") + (match_operand:V2HI 2 "register_operand" "x,Yw")))] + "TARGET_SSE2 + && ix86_binary_operator_ok (, V2HImode, operands)" + "@ + pw\t{%2, %0|%0, %2} + vpw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" "noavx,avx") + (set_attr "type" "sseiadd") + (set_attr "mode" "TI")]) + +(define_mode_iterator SMAXMIN_VI_32 [(V4QI "TARGET_SSE4_1") V2HI]) + +(define_expand "3" + [(set (match_operand:SMAXMIN_VI_32 0 "register_operand") + (smaxmin:SMAXMIN_VI_32 + (match_operand:SMAXMIN_VI_32 1 "register_operand") + (match_operand:SMAXMIN_VI_32 2 "register_operand")))] + "TARGET_SSE2" + "ix86_fixup_binary_operands_no_copy (, V4HImode, operands);") + (define_insn 
"*mmx_3" [(set (match_operand:MMXMODE24 0 "register_operand" "=Yr,*x,Yv") (umaxmin:MMXMODE24 @@ -1784,14 +1818,6 @@ && (TARGET_SSE || TARGET_3DNOW_A)" "ix86_fixup_binary_operands_no_copy (, V8QImode, operands);") -(define_expand "v8qi3" - [(set (match_operand:V8QI 0 "register_operand") - (umaxmin:V8QI - (match_operand:V8QI 1 "register_operand") - (match_operand:V8QI 2 "register_operand")))] - "TARGET_MMX_WITH_SSE" - "ix86_fixup_binary_operands_no_copy (, V8QImode, operands);") - (define_insn "*mmx_v8qi3" [(set (match_operand:V8QI 0 "register_operand" "=y,x,Yw") (umaxmin:V8QI @@ -1809,6 +1835,97 @@ (set_attr "type" "mmxadd,sseiadd,sseiadd") (set_attr "mode" "DI,TI,TI")]) +(define_mode_iterator UMAXMIN_MMXMODEI + [V8QI (V4HI "TARGET_SSE4_1") (V2SI "TARGET_SSE4_1")]) + +(define_expand "3" + [(set (match_operand:UMAXMIN_MMXMODEI 0 "register_operand") + (umaxmin:UMAXMIN_MMXMODEI + (match_operand:UMAXMIN_MMXMODEI 1 "register_operand") + (match_operand:UMAXMIN_MMXMODEI 2 "register_operand")))] + "TARGET_MMX_WITH_SSE" + "ix86_fixup_binary_operands_no_copy (, mode, operands);") + +(define_insn "*v4qi3" + [(set (match_operand:V4QI 0 "register_operand" "=x,Yw") + (umaxmin:V4QI + (match_operand:V4QI 1 "register_operand" "%0,Yw") + (match_operand:V4QI 2 "register_operand" "x,Yw")))] + "TARGET_SSE2 + && ix86_binary_operator_ok (, V4QImode, operands)" + "@ + pb\t{%2, %0|%0, %2} + vpb\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" "noavx,avx") + (set_attr "type" "sseiadd") + (set_attr "mode" "TI")]) + +(define_insn "*v2hi3" + [(set (match_operand:V2HI 0 "register_operand" "=Yr,*x,Yv") + (umaxmin:V2HI + (match_operand:V2HI 1 "register_operand" "%0,0,Yv") + (match_operand:V2HI 2 "register_operand" "Yr,*x,Yv")))] + "TARGET_SSE4_1 + && ix86_binary_operator_ok (, V2HImode, operands)" + "@ + pw\t{%2, %0|%0, %2} + pw\t{%2, %0|%0, %2} + vpw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" "noavx,noavx,avx") + (set_attr "type" "sseiadd") + (set_attr "prefix_extra" "1,1,*") + (set_attr "prefix" "orig,orig,vex") + (set_attr "mode" "TI")]) + +(define_mode_iterator UMAXMIN_VI_32 [V4QI (V2HI "TARGET_SSE4_1")]) + +(define_expand "3" + [(set (match_operand:UMAXMIN_VI_32 0 "register_operand") + (umaxmin:UMAXMIN_VI_32 + (match_operand:UMAXMIN_VI_32 1 "register_operand") + (match_operand:UMAXMIN_VI_32 2 "register_operand")))] + "TARGET_SSE2" + "ix86_fixup_binary_operands_no_copy (, V4HImode, operands);") + +(define_insn "ssse3_abs2" + [(set (match_operand:MMXMODEI 0 "register_operand" "=y,Yv") + (abs:MMXMODEI + (match_operand:MMXMODEI 1 "register_mmxmem_operand" "ym,Yv")))] + "(TARGET_MMX || TARGET_MMX_WITH_SSE) && TARGET_SSSE3" + "@ + pabs\t{%1, %0|%0, %1} + %vpabs\t{%1, %0|%0, %1}" + [(set_attr "mmx_isa" "native,*") + (set_attr "type" "sselog1") + (set_attr "prefix_rep" "0") + (set_attr "prefix_extra" "1") + (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)")) + (set_attr "mode" "DI,TI")]) + +(define_expand "abs2" + [(set (match_operand:MMXMODEI 0 "register_operand") + (abs:MMXMODEI + (match_operand:MMXMODEI 1 "register_operand")))] + "TARGET_MMX_WITH_SSE && TARGET_SSSE3") + +(define_insn "abs2" + [(set (match_operand:VI_32 0 "register_operand" "=Yv") + (abs:VI_32 + (match_operand:VI_32 1 "register_operand" "Yv")))] + "TARGET_SSSE3" + "%vpabs\t{%1, %0|%0, %1}" + [(set_attr "type" "sselog1") + (set_attr "prefix_rep" "0") + (set_attr "prefix_extra" "1") + (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)")) + (set_attr "mode" "TI")]) + 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; Parallel integral shifts +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + (define_insn "mmx_ashr3" [(set (match_operand:MMXMODE24 0 "register_operand" "=y,x,") (ashiftrt:MMXMODE24 diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 0f1108f..7269147 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -17553,27 +17553,6 @@ } }) -(define_insn "ssse3_abs2" - [(set (match_operand:MMXMODEI 0 "register_operand" "=y,Yv") - (abs:MMXMODEI - (match_operand:MMXMODEI 1 "register_mmxmem_operand" "ym,Yv")))] - "(TARGET_MMX || TARGET_MMX_WITH_SSE) && TARGET_SSSE3" - "@ - pabs\t{%1, %0|%0, %1} - %vpabs\t{%1, %0|%0, %1}" - [(set_attr "mmx_isa" "native,*") - (set_attr "type" "sselog1") - (set_attr "prefix_rep" "0") - (set_attr "prefix_extra" "1") - (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)")) - (set_attr "mode" "DI,TI")]) - -(define_insn "abs2" - [(set (match_operand:MMXMODEI 0 "register_operand") - (abs:MMXMODEI - (match_operand:MMXMODEI 1 "register_operand")))] - "TARGET_MMX_WITH_SSE && TARGET_SSSE3") - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; AMD SSE4A instructions -- cgit v1.1 From 2df9d3c52e6758f6640e7c0ae0b7502c7cc1d430 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Fri, 21 May 2021 13:03:04 +0200 Subject: i386: Add comparisons for 4-byte vectors [PR100637] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 2021-05-21 Uroš Bizjak gcc/ PR target/100637 * config/i386/i386-expand.c (ix86_expand_sse_movcc): Handle V4QI and V2HI modes. (ix86_expand_sse_movcc): Ditto. * config/i386/mmx.md (*3): New instruction pattern. (*eq3): Ditto. (*gt3): Ditto. (*xop_pcmov_): Ditto. (mmx_pblendvb32): Ditto. (mmx_pblendvb64): Rename from mmx_pblendvb. (vec_cmp): New expander. (vec_cmpu): Ditto. (vcond): Ditto. (vcondu): Ditto. (vcond_mask_): Ditto. gcc/testsuite/ PR target/100637 * g++.target/i386/pr100637-1b.C: New test. * g++.target/i386/pr100637-1w.C: Ditto. * gcc.target/i386/pr100637-2b.c: Ditto. * gcc.target/i386/pr100637-2w.c: Ditto. 
--- gcc/config/i386/i386-expand.c | 30 ++++++++- gcc/config/i386/mmx.md | 140 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 168 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 9f3d419..931b336 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -3721,7 +3721,7 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) { op_true = force_reg (mode, op_true); - gen = gen_mmx_pblendvb; + gen = gen_mmx_pblendvb64; if (mode != V8QImode) d = gen_reg_rtx (V8QImode); op_false = gen_lowpart (V8QImode, op_false); @@ -3729,6 +3729,20 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) cmp = gen_lowpart (V8QImode, cmp); } break; + case E_V4QImode: + case E_V2HImode: + if (TARGET_SSE4_1) + { + op_true = force_reg (mode, op_true); + + gen = gen_mmx_pblendvb32; + if (mode != V4QImode) + d = gen_reg_rtx (V4QImode); + op_false = gen_lowpart (V4QImode, op_false); + op_true = gen_lowpart (V4QImode, op_true); + cmp = gen_lowpart (V4QImode, cmp); + } + break; case E_V16QImode: case E_V8HImode: case E_V4SImode: @@ -4241,6 +4255,12 @@ ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1, else if (code == GT && TARGET_SSE4_1) gen = gen_sminv8qi3; break; + case E_V4QImode: + if (code == GTU && TARGET_SSE2) + gen = gen_uminv4qi3; + else if (code == GT && TARGET_SSE4_1) + gen = gen_sminv4qi3; + break; case E_V8HImode: if (code == GTU && TARGET_SSE4_1) gen = gen_uminv8hi3; @@ -4253,6 +4273,12 @@ ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1, else if (code == GT && TARGET_SSE2) gen = gen_sminv4hi3; break; + case E_V2HImode: + if (code == GTU && TARGET_SSE4_1) + gen = gen_uminv2hi3; + else if (code == GT && TARGET_SSE2) + gen = gen_sminv2hi3; + break; case E_V4SImode: if (TARGET_SSE4_1) gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3; @@ -4327,8 +4353,10 @@ ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1, case E_V16HImode: case E_V16QImode: case E_V8QImode: + case E_V4QImode: case E_V8HImode: case E_V4HImode: + case E_V2HImode: /* Perform a parallel unsigned saturating subtraction. 
*/ x = gen_reg_rtx (mode); emit_insn (gen_rtx_SET diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 5e92be3..4c42e6d 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -1403,6 +1403,20 @@ (set_attr "type" "mmxadd,sseadd,sseadd") (set_attr "mode" "DI,TI,TI")]) +(define_insn "*3" + [(set (match_operand:VI_32 0 "register_operand" "=x,Yw") + (sat_plusminus:VI_32 + (match_operand:VI_32 1 "register_operand" "0,Yw") + (match_operand:VI_32 2 "register_operand" "x,Yw")))] + "TARGET_SSE2 + && ix86_binary_operator_ok (, mode, operands)" + "@ + p\t{%2, %0|%0, %2} + vp\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" "noavx,avx") + (set_attr "type" "sseadd") + (set_attr "mode" "TI")]) + (define_expand "mmx_mulv4hi3" [(set (match_operand:V4HI 0 "register_operand") (mult:V4HI (match_operand:V4HI 1 "register_mmxmem_operand") @@ -2032,6 +2046,20 @@ (set_attr "type" "mmxcmp,ssecmp,ssecmp") (set_attr "mode" "DI,TI,TI")]) +(define_insn "*eq3" + [(set (match_operand:VI_32 0 "register_operand" "=x,x") + (eq:VI_32 + (match_operand:VI_32 1 "register_operand" "%0,x") + (match_operand:VI_32 2 "register_operand" "x,x")))] + "TARGET_SSE2 + && ix86_binary_operator_ok (EQ, mode, operands)" + "@ + pcmpeq\t{%2, %0|%0, %2} + vpcmpeq\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" "noavx,avx") + (set_attr "type" "ssecmp") + (set_attr "mode" "TI")]) + (define_insn "mmx_gt3" [(set (match_operand:MMXMODEI 0 "register_operand" "=y,x,x") (gt:MMXMODEI @@ -2047,6 +2075,19 @@ (set_attr "type" "mmxcmp,ssecmp,ssecmp") (set_attr "mode" "DI,TI,TI")]) +(define_insn "*gt3" + [(set (match_operand:VI_32 0 "register_operand" "=x,x") + (gt:VI_32 + (match_operand:VI_32 1 "register_operand" "0,x") + (match_operand:VI_32 2 "register_operand" "x,x")))] + "TARGET_SSE2" + "@ + pcmpgt\t{%2, %0|%0, %2} + vpcmpgt\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" "noavx,avx") + (set_attr "type" "ssecmp") + (set_attr "mode" "TI")]) + (define_expand "vec_cmp" [(set (match_operand:MMXMODEI 0 "register_operand") (match_operator:MMXMODEI 1 "" @@ -2059,6 +2100,18 @@ DONE; }) +(define_expand "vec_cmp" + [(set (match_operand:VI_32 0 "register_operand") + (match_operator:VI_32 1 "" + [(match_operand:VI_32 2 "register_operand") + (match_operand:VI_32 3 "register_operand")]))] + "TARGET_SSE2" +{ + bool ok = ix86_expand_int_vec_cmp (operands); + gcc_assert (ok); + DONE; +}) + (define_expand "vec_cmpu" [(set (match_operand:MMXMODEI 0 "register_operand") (match_operator:MMXMODEI 1 "" @@ -2071,6 +2124,18 @@ DONE; }) +(define_expand "vec_cmpu" + [(set (match_operand:VI_32 0 "register_operand") + (match_operator:VI_32 1 "" + [(match_operand:VI_32 2 "register_operand") + (match_operand:VI_32 3 "register_operand")]))] + "TARGET_SSE2" +{ + bool ok = ix86_expand_int_vec_cmp (operands); + gcc_assert (ok); + DONE; +}) + (define_expand "vcond" [(set (match_operand:MMXMODE124 0 "register_operand") (if_then_else:MMXMODE124 @@ -2088,6 +2153,21 @@ DONE; }) +(define_expand "vcond" + [(set (match_operand:VI_32 0 "register_operand") + (if_then_else:VI_32 + (match_operator 3 "" + [(match_operand:VI_32 4 "register_operand") + (match_operand:VI_32 5 "register_operand")]) + (match_operand:VI_32 1) + (match_operand:VI_32 2)))] + "TARGET_SSE2" +{ + bool ok = ix86_expand_int_vcond (operands); + gcc_assert (ok); + DONE; +}) + (define_expand "vcondu" [(set (match_operand:MMXMODE124 0 "register_operand") (if_then_else:MMXMODE124 @@ -2105,6 +2185,21 @@ DONE; }) +(define_expand "vcondu" + [(set (match_operand:VI_32 0 "register_operand") + (if_then_else:VI_32 + 
(match_operator 3 "" + [(match_operand:VI_32 4 "register_operand") + (match_operand:VI_32 5 "register_operand")]) + (match_operand:VI_32 1) + (match_operand:VI_32 2)))] + "TARGET_SSE2" +{ + bool ok = ix86_expand_int_vcond (operands); + gcc_assert (ok); + DONE; +}) + (define_expand "vcond_mask_" [(set (match_operand:MMXMODE124 0 "register_operand") (vec_merge:MMXMODE124 @@ -2118,7 +2213,20 @@ DONE; }) -(define_insn "mmx_pblendvb" +(define_expand "vcond_mask_" + [(set (match_operand:VI_32 0 "register_operand") + (vec_merge:VI_32 + (match_operand:VI_32 1 "register_operand") + (match_operand:VI_32 2 "register_operand") + (match_operand:VI_32 3 "register_operand")))] + "TARGET_SSE2" +{ + ix86_expand_sse_movcc (operands[0], operands[3], + operands[1], operands[2]); + DONE; +}) + +(define_insn "mmx_pblendvb64" [(set (match_operand:V8QI 0 "register_operand" "=Yr,*x,x") (unspec:V8QI [(match_operand:V8QI 1 "register_operand" "0,0,x") @@ -2138,6 +2246,26 @@ (set_attr "btver2_decode" "vector") (set_attr "mode" "TI")]) +(define_insn "mmx_pblendvb32" + [(set (match_operand:V4QI 0 "register_operand" "=Yr,*x,x") + (unspec:V4QI + [(match_operand:V4QI 1 "register_operand" "0,0,x") + (match_operand:V4QI 2 "register_operand" "Yr,*x,x") + (match_operand:V4QI 3 "register_operand" "Yz,Yz,x")] + UNSPEC_BLENDV))] + "TARGET_SSE4_1" + "@ + pblendvb\t{%3, %2, %0|%0, %2, %3} + pblendvb\t{%3, %2, %0|%0, %2, %3} + vpblendvb\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "isa" "noavx,noavx,avx") + (set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "*,*,1") + (set_attr "prefix" "orig,orig,vex") + (set_attr "btver2_decode" "vector") + (set_attr "mode" "TI")]) + ;; XOP parallel XMM conditional moves (define_insn "*xop_pcmov_" [(set (match_operand:MMXMODE124 0 "register_operand" "=x") @@ -2149,6 +2277,16 @@ "vpcmov\t{%3, %2, %1, %0|%0, %1, %2, %3}" [(set_attr "type" "sse4arg")]) +(define_insn "*xop_pcmov_" + [(set (match_operand:VI_32 0 "register_operand" "=x") + (if_then_else:VI_32 + (match_operand:VI_32 3 "register_operand" "x") + (match_operand:VI_32 1 "register_operand" "x") + (match_operand:VI_32 2 "register_operand" "x")))] + "TARGET_XOP" + "vpcmov\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "sse4arg")]) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; Parallel integral logical operations -- cgit v1.1 From 5ae352467a5a9414c0abea008f3719c6d3ae54e7 Mon Sep 17 00:00:00 2001 From: Aaron Sawdey Date: Tue, 2 Mar 2021 17:50:52 -0600 Subject: Add insn types for fusion pairs This adds new values for insn attr type for p10 fusion. The genfusion.pl script is modified to use them, and fusion.md regenerated to capture the new patterns. There are also some formatting only changes to fusion.md that apparently weren't captured after a previous commit of genfusion.pl. gcc/ * config/rs6000/rs6000.md (define_attr "type"): Add types for fusion. * config/rs6000/genfusion.pl (gen_ld_cmpi_p10): Use new fusion types. (gen_2logical): Use new fusion types. * config/rs6000/fusion.md: Regenerate. 
--- gcc/config/rs6000/fusion.md | 288 ++++++++++++++++++++--------------------- gcc/config/rs6000/genfusion.pl | 8 +- gcc/config/rs6000/rs6000.md | 20 ++- 3 files changed, 168 insertions(+), 148 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/fusion.md b/gcc/config/rs6000/fusion.md index 56478fc..6d71bc2 100644 --- a/gcc/config/rs6000/fusion.md +++ b/gcc/config/rs6000/fusion.md @@ -35,7 +35,7 @@ (set (match_dup 2) (compare:CC (match_dup 0) (match_dup 3)))] "" - [(set_attr "type" "load") + [(set_attr "type" "fused_load_cmpi") (set_attr "cost" "8") (set_attr "length" "8")]) @@ -56,7 +56,7 @@ (set (match_dup 2) (compare:CCUNS (match_dup 0) (match_dup 3)))] "" - [(set_attr "type" "load") + [(set_attr "type" "fused_load_cmpi") (set_attr "cost" "8") (set_attr "length" "8")]) @@ -77,7 +77,7 @@ (set (match_dup 2) (compare:CC (match_dup 0) (match_dup 3)))] "" - [(set_attr "type" "load") + [(set_attr "type" "fused_load_cmpi") (set_attr "cost" "8") (set_attr "length" "8")]) @@ -98,7 +98,7 @@ (set (match_dup 2) (compare:CCUNS (match_dup 0) (match_dup 3)))] "" - [(set_attr "type" "load") + [(set_attr "type" "fused_load_cmpi") (set_attr "cost" "8") (set_attr "length" "8")]) @@ -119,7 +119,7 @@ (set (match_dup 2) (compare:CC (match_dup 0) (match_dup 3)))] "" - [(set_attr "type" "load") + [(set_attr "type" "fused_load_cmpi") (set_attr "cost" "8") (set_attr "length" "8")]) @@ -140,7 +140,7 @@ (set (match_dup 2) (compare:CCUNS (match_dup 0) (match_dup 3)))] "" - [(set_attr "type" "load") + [(set_attr "type" "fused_load_cmpi") (set_attr "cost" "8") (set_attr "length" "8")]) @@ -161,7 +161,7 @@ (set (match_dup 2) (compare:CC (match_dup 0) (match_dup 3)))] "" - [(set_attr "type" "load") + [(set_attr "type" "fused_load_cmpi") (set_attr "cost" "8") (set_attr "length" "8")]) @@ -182,7 +182,7 @@ (set (match_dup 2) (compare:CCUNS (match_dup 0) (match_dup 3)))] "" - [(set_attr "type" "load") + [(set_attr "type" "fused_load_cmpi") (set_attr "cost" "8") (set_attr "length" "8")]) @@ -203,7 +203,7 @@ (set (match_dup 2) (compare:CC (match_dup 0) (match_dup 3)))] "" - [(set_attr "type" "load") + [(set_attr "type" "fused_load_cmpi") (set_attr "cost" "8") (set_attr "length" "8")]) @@ -224,7 +224,7 @@ (set (match_dup 2) (compare:CCUNS (match_dup 0) (match_dup 3)))] "" - [(set_attr "type" "load") + [(set_attr "type" "fused_load_cmpi") (set_attr "cost" "8") (set_attr "length" "8")]) @@ -245,7 +245,7 @@ (set (match_dup 2) (compare:CC (match_dup 0) (match_dup 3)))] "" - [(set_attr "type" "load") + [(set_attr "type" "fused_load_cmpi") (set_attr "cost" "8") (set_attr "length" "8")]) @@ -266,7 +266,7 @@ (set (match_dup 2) (compare:CCUNS (match_dup 0) (match_dup 3)))] "" - [(set_attr "type" "load") + [(set_attr "type" "fused_load_cmpi") (set_attr "cost" "8") (set_attr "length" "8")]) @@ -287,7 +287,7 @@ (set (match_dup 2) (compare:CC (match_dup 0) (match_dup 3)))] "" - [(set_attr "type" "load") + [(set_attr "type" "fused_load_cmpi") (set_attr "cost" "8") (set_attr "length" "8")]) @@ -308,7 +308,7 @@ (set (match_dup 2) (compare:CCUNS (match_dup 0) (match_dup 3)))] "" - [(set_attr "type" "load") + [(set_attr "type" "fused_load_cmpi") (set_attr "cost" "8") (set_attr "length" "8")]) @@ -329,7 +329,7 @@ (set (match_dup 2) (compare:CCUNS (match_dup 0) (match_dup 3)))] "" - [(set_attr "type" "load") + [(set_attr "type" "fused_load_cmpi") (set_attr "cost" "8") (set_attr "length" "8")]) @@ -350,7 +350,7 @@ (set (match_dup 2) (compare:CCUNS (match_dup 0) (match_dup 3)))] "" - [(set_attr "type" "load") + [(set_attr "type" 
"fused_load_cmpi") (set_attr "cost" "8") (set_attr "length" "8")]) @@ -369,7 +369,7 @@ and %3,%1,%0\;and %3,%3,%2 and %3,%1,%0\;and %3,%3,%2 and %4,%1,%0\;and %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -387,7 +387,7 @@ andc %3,%1,%0\;and %3,%3,%2 andc %3,%1,%0\;and %3,%3,%2 andc %4,%1,%0\;and %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -405,7 +405,7 @@ eqv %3,%1,%0\;and %3,%3,%2 eqv %3,%1,%0\;and %3,%3,%2 eqv %4,%1,%0\;and %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -423,7 +423,7 @@ nand %3,%1,%0\;and %3,%3,%2 nand %3,%1,%0\;and %3,%3,%2 nand %4,%1,%0\;and %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -441,7 +441,7 @@ nor %3,%1,%0\;and %3,%3,%2 nor %3,%1,%0\;and %3,%3,%2 nor %4,%1,%0\;and %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -459,7 +459,7 @@ or %3,%1,%0\;and %3,%3,%2 or %3,%1,%0\;and %3,%3,%2 or %4,%1,%0\;and %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -477,7 +477,7 @@ orc %3,%1,%0\;and %3,%3,%2 orc %3,%1,%0\;and %3,%3,%2 orc %4,%1,%0\;and %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -495,7 +495,7 @@ xor %3,%1,%0\;and %3,%3,%2 xor %3,%1,%0\;and %3,%3,%2 xor %4,%1,%0\;and %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -513,7 +513,7 @@ and %3,%1,%0\;andc %3,%3,%2 and %3,%1,%0\;andc %3,%3,%2 and %4,%1,%0\;andc %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -531,7 +531,7 @@ andc %3,%1,%0\;andc %3,%3,%2 andc %3,%1,%0\;andc %3,%3,%2 andc %4,%1,%0\;andc %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -549,7 +549,7 @@ eqv %3,%1,%0\;andc %3,%3,%2 eqv %3,%1,%0\;andc %3,%3,%2 eqv %4,%1,%0\;andc %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -567,7 +567,7 @@ nand %3,%1,%0\;andc %3,%3,%2 nand %3,%1,%0\;andc %3,%3,%2 nand %4,%1,%0\;andc %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -585,7 +585,7 @@ nor %3,%1,%0\;andc %3,%3,%2 nor %3,%1,%0\;andc %3,%3,%2 nor %4,%1,%0\;andc %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -603,7 +603,7 @@ or %3,%1,%0\;andc %3,%3,%2 or %3,%1,%0\;andc %3,%3,%2 or %4,%1,%0\;andc %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -621,7 +621,7 @@ orc %3,%1,%0\;andc %3,%3,%2 orc %3,%1,%0\;andc %3,%3,%2 orc %4,%1,%0\;andc %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -639,7 +639,7 @@ xor %3,%1,%0\;andc %3,%3,%2 xor %3,%1,%0\;andc %3,%3,%2 xor %4,%1,%0\;andc 
%3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -657,7 +657,7 @@ and %3,%1,%0\;eqv %3,%3,%2 and %3,%1,%0\;eqv %3,%3,%2 and %4,%1,%0\;eqv %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -675,7 +675,7 @@ andc %3,%1,%0\;eqv %3,%3,%2 andc %3,%1,%0\;eqv %3,%3,%2 andc %4,%1,%0\;eqv %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -693,7 +693,7 @@ eqv %3,%1,%0\;eqv %3,%3,%2 eqv %3,%1,%0\;eqv %3,%3,%2 eqv %4,%1,%0\;eqv %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -711,7 +711,7 @@ nand %3,%1,%0\;eqv %3,%3,%2 nand %3,%1,%0\;eqv %3,%3,%2 nand %4,%1,%0\;eqv %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -729,7 +729,7 @@ nor %3,%1,%0\;eqv %3,%3,%2 nor %3,%1,%0\;eqv %3,%3,%2 nor %4,%1,%0\;eqv %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -747,7 +747,7 @@ or %3,%1,%0\;eqv %3,%3,%2 or %3,%1,%0\;eqv %3,%3,%2 or %4,%1,%0\;eqv %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -765,7 +765,7 @@ orc %3,%1,%0\;eqv %3,%3,%2 orc %3,%1,%0\;eqv %3,%3,%2 orc %4,%1,%0\;eqv %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -783,7 +783,7 @@ xor %3,%1,%0\;eqv %3,%3,%2 xor %3,%1,%0\;eqv %3,%3,%2 xor %4,%1,%0\;eqv %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -801,7 +801,7 @@ and %3,%1,%0\;nand %3,%3,%2 and %3,%1,%0\;nand %3,%3,%2 and %4,%1,%0\;nand %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -819,7 +819,7 @@ andc %3,%1,%0\;nand %3,%3,%2 andc %3,%1,%0\;nand %3,%3,%2 andc %4,%1,%0\;nand %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -837,7 +837,7 @@ eqv %3,%1,%0\;nand %3,%3,%2 eqv %3,%1,%0\;nand %3,%3,%2 eqv %4,%1,%0\;nand %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -855,7 +855,7 @@ nand %3,%1,%0\;nand %3,%3,%2 nand %3,%1,%0\;nand %3,%3,%2 nand %4,%1,%0\;nand %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -873,7 +873,7 @@ nor %3,%1,%0\;nand %3,%3,%2 nor %3,%1,%0\;nand %3,%3,%2 nor %4,%1,%0\;nand %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -891,7 +891,7 @@ or %3,%1,%0\;nand %3,%3,%2 or %3,%1,%0\;nand %3,%3,%2 or %4,%1,%0\;nand %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -909,7 +909,7 @@ orc %3,%1,%0\;nand %3,%3,%2 orc %3,%1,%0\;nand %3,%3,%2 orc %4,%1,%0\;nand %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -927,7 +927,7 @@ xor 
%3,%1,%0\;nand %3,%3,%2 xor %3,%1,%0\;nand %3,%3,%2 xor %4,%1,%0\;nand %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -945,7 +945,7 @@ and %3,%1,%0\;nor %3,%3,%2 and %3,%1,%0\;nor %3,%3,%2 and %4,%1,%0\;nor %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -963,7 +963,7 @@ andc %3,%1,%0\;nor %3,%3,%2 andc %3,%1,%0\;nor %3,%3,%2 andc %4,%1,%0\;nor %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -981,7 +981,7 @@ eqv %3,%1,%0\;nor %3,%3,%2 eqv %3,%1,%0\;nor %3,%3,%2 eqv %4,%1,%0\;nor %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -999,7 +999,7 @@ nand %3,%1,%0\;nor %3,%3,%2 nand %3,%1,%0\;nor %3,%3,%2 nand %4,%1,%0\;nor %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1017,7 +1017,7 @@ nor %3,%1,%0\;nor %3,%3,%2 nor %3,%1,%0\;nor %3,%3,%2 nor %4,%1,%0\;nor %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1035,7 +1035,7 @@ or %3,%1,%0\;nor %3,%3,%2 or %3,%1,%0\;nor %3,%3,%2 or %4,%1,%0\;nor %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1053,7 +1053,7 @@ orc %3,%1,%0\;nor %3,%3,%2 orc %3,%1,%0\;nor %3,%3,%2 orc %4,%1,%0\;nor %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1071,7 +1071,7 @@ xor %3,%1,%0\;nor %3,%3,%2 xor %3,%1,%0\;nor %3,%3,%2 xor %4,%1,%0\;nor %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1089,7 +1089,7 @@ and %3,%1,%0\;or %3,%3,%2 and %3,%1,%0\;or %3,%3,%2 and %4,%1,%0\;or %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1107,7 +1107,7 @@ andc %3,%1,%0\;or %3,%3,%2 andc %3,%1,%0\;or %3,%3,%2 andc %4,%1,%0\;or %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1125,7 +1125,7 @@ eqv %3,%1,%0\;or %3,%3,%2 eqv %3,%1,%0\;or %3,%3,%2 eqv %4,%1,%0\;or %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1143,7 +1143,7 @@ nand %3,%1,%0\;or %3,%3,%2 nand %3,%1,%0\;or %3,%3,%2 nand %4,%1,%0\;or %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1161,7 +1161,7 @@ nor %3,%1,%0\;or %3,%3,%2 nor %3,%1,%0\;or %3,%3,%2 nor %4,%1,%0\;or %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1179,7 +1179,7 @@ or %3,%1,%0\;or %3,%3,%2 or %3,%1,%0\;or %3,%3,%2 or %4,%1,%0\;or %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1197,7 +1197,7 @@ orc %3,%1,%0\;or %3,%3,%2 orc %3,%1,%0\;or %3,%3,%2 orc %4,%1,%0\;or %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr 
"length" "8")]) @@ -1215,7 +1215,7 @@ xor %3,%1,%0\;or %3,%3,%2 xor %3,%1,%0\;or %3,%3,%2 xor %4,%1,%0\;or %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1233,7 +1233,7 @@ and %3,%1,%0\;orc %3,%3,%2 and %3,%1,%0\;orc %3,%3,%2 and %4,%1,%0\;orc %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1251,7 +1251,7 @@ andc %3,%1,%0\;orc %3,%3,%2 andc %3,%1,%0\;orc %3,%3,%2 andc %4,%1,%0\;orc %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1269,7 +1269,7 @@ eqv %3,%1,%0\;orc %3,%3,%2 eqv %3,%1,%0\;orc %3,%3,%2 eqv %4,%1,%0\;orc %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1287,7 +1287,7 @@ nand %3,%1,%0\;orc %3,%3,%2 nand %3,%1,%0\;orc %3,%3,%2 nand %4,%1,%0\;orc %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1305,7 +1305,7 @@ nor %3,%1,%0\;orc %3,%3,%2 nor %3,%1,%0\;orc %3,%3,%2 nor %4,%1,%0\;orc %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1323,7 +1323,7 @@ or %3,%1,%0\;orc %3,%3,%2 or %3,%1,%0\;orc %3,%3,%2 or %4,%1,%0\;orc %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1341,7 +1341,7 @@ orc %3,%1,%0\;orc %3,%3,%2 orc %3,%1,%0\;orc %3,%3,%2 orc %4,%1,%0\;orc %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1359,7 +1359,7 @@ xor %3,%1,%0\;orc %3,%3,%2 xor %3,%1,%0\;orc %3,%3,%2 xor %4,%1,%0\;orc %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1377,7 +1377,7 @@ and %3,%1,%0\;xor %3,%3,%2 and %3,%1,%0\;xor %3,%3,%2 and %4,%1,%0\;xor %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1395,7 +1395,7 @@ andc %3,%1,%0\;xor %3,%3,%2 andc %3,%1,%0\;xor %3,%3,%2 andc %4,%1,%0\;xor %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1413,7 +1413,7 @@ eqv %3,%1,%0\;xor %3,%3,%2 eqv %3,%1,%0\;xor %3,%3,%2 eqv %4,%1,%0\;xor %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1431,7 +1431,7 @@ nand %3,%1,%0\;xor %3,%3,%2 nand %3,%1,%0\;xor %3,%3,%2 nand %4,%1,%0\;xor %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1449,7 +1449,7 @@ nor %3,%1,%0\;xor %3,%3,%2 nor %3,%1,%0\;xor %3,%3,%2 nor %4,%1,%0\;xor %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1467,7 +1467,7 @@ or %3,%1,%0\;xor %3,%3,%2 or %3,%1,%0\;xor %3,%3,%2 or %4,%1,%0\;xor %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1485,7 +1485,7 @@ orc %3,%1,%0\;xor %3,%3,%2 orc %3,%1,%0\;xor %3,%3,%2 orc %4,%1,%0\;xor %3,%4,%2" - [(set_attr "type" "logical") + 
[(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1503,7 +1503,7 @@ xor %3,%1,%0\;xor %3,%3,%2 xor %3,%1,%0\;xor %3,%3,%2 xor %4,%1,%0\;xor %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1521,7 +1521,7 @@ vand %3,%1,%0\;vand %3,%3,%2 vand %3,%1,%0\;vand %3,%3,%2 vand %4,%1,%0\;vand %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1539,7 +1539,7 @@ vandc %3,%1,%0\;vand %3,%3,%2 vandc %3,%1,%0\;vand %3,%3,%2 vandc %4,%1,%0\;vand %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1557,7 +1557,7 @@ veqv %3,%1,%0\;vand %3,%3,%2 veqv %3,%1,%0\;vand %3,%3,%2 veqv %4,%1,%0\;vand %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1575,7 +1575,7 @@ vnand %3,%1,%0\;vand %3,%3,%2 vnand %3,%1,%0\;vand %3,%3,%2 vnand %4,%1,%0\;vand %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1593,7 +1593,7 @@ vnor %3,%1,%0\;vand %3,%3,%2 vnor %3,%1,%0\;vand %3,%3,%2 vnor %4,%1,%0\;vand %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1611,7 +1611,7 @@ vor %3,%1,%0\;vand %3,%3,%2 vor %3,%1,%0\;vand %3,%3,%2 vor %4,%1,%0\;vand %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1629,7 +1629,7 @@ vorc %3,%1,%0\;vand %3,%3,%2 vorc %3,%1,%0\;vand %3,%3,%2 vorc %4,%1,%0\;vand %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1647,7 +1647,7 @@ vxor %3,%1,%0\;vand %3,%3,%2 vxor %3,%1,%0\;vand %3,%3,%2 vxor %4,%1,%0\;vand %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1665,7 +1665,7 @@ vand %3,%1,%0\;vandc %3,%3,%2 vand %3,%1,%0\;vandc %3,%3,%2 vand %4,%1,%0\;vandc %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1683,7 +1683,7 @@ vandc %3,%1,%0\;vandc %3,%3,%2 vandc %3,%1,%0\;vandc %3,%3,%2 vandc %4,%1,%0\;vandc %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1701,7 +1701,7 @@ veqv %3,%1,%0\;vandc %3,%3,%2 veqv %3,%1,%0\;vandc %3,%3,%2 veqv %4,%1,%0\;vandc %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1719,7 +1719,7 @@ vnand %3,%1,%0\;vandc %3,%3,%2 vnand %3,%1,%0\;vandc %3,%3,%2 vnand %4,%1,%0\;vandc %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1737,7 +1737,7 @@ vnor %3,%1,%0\;vandc %3,%3,%2 vnor %3,%1,%0\;vandc %3,%3,%2 vnor %4,%1,%0\;vandc %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1755,7 +1755,7 @@ vor %3,%1,%0\;vandc %3,%3,%2 vor %3,%1,%0\;vandc %3,%3,%2 vor %4,%1,%0\;vandc %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1773,7 +1773,7 @@ vorc %3,%1,%0\;vandc %3,%3,%2 vorc 
%3,%1,%0\;vandc %3,%3,%2 vorc %4,%1,%0\;vandc %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1791,7 +1791,7 @@ vxor %3,%1,%0\;vandc %3,%3,%2 vxor %3,%1,%0\;vandc %3,%3,%2 vxor %4,%1,%0\;vandc %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1809,7 +1809,7 @@ vand %3,%1,%0\;veqv %3,%3,%2 vand %3,%1,%0\;veqv %3,%3,%2 vand %4,%1,%0\;veqv %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1827,7 +1827,7 @@ vandc %3,%1,%0\;veqv %3,%3,%2 vandc %3,%1,%0\;veqv %3,%3,%2 vandc %4,%1,%0\;veqv %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1845,7 +1845,7 @@ veqv %3,%1,%0\;veqv %3,%3,%2 veqv %3,%1,%0\;veqv %3,%3,%2 veqv %4,%1,%0\;veqv %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1863,7 +1863,7 @@ vnand %3,%1,%0\;veqv %3,%3,%2 vnand %3,%1,%0\;veqv %3,%3,%2 vnand %4,%1,%0\;veqv %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1881,7 +1881,7 @@ vnor %3,%1,%0\;veqv %3,%3,%2 vnor %3,%1,%0\;veqv %3,%3,%2 vnor %4,%1,%0\;veqv %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1899,7 +1899,7 @@ vor %3,%1,%0\;veqv %3,%3,%2 vor %3,%1,%0\;veqv %3,%3,%2 vor %4,%1,%0\;veqv %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1917,7 +1917,7 @@ vorc %3,%1,%0\;veqv %3,%3,%2 vorc %3,%1,%0\;veqv %3,%3,%2 vorc %4,%1,%0\;veqv %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1935,7 +1935,7 @@ vxor %3,%1,%0\;veqv %3,%3,%2 vxor %3,%1,%0\;veqv %3,%3,%2 vxor %4,%1,%0\;veqv %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1953,7 +1953,7 @@ vand %3,%1,%0\;vnand %3,%3,%2 vand %3,%1,%0\;vnand %3,%3,%2 vand %4,%1,%0\;vnand %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1971,7 +1971,7 @@ vandc %3,%1,%0\;vnand %3,%3,%2 vandc %3,%1,%0\;vnand %3,%3,%2 vandc %4,%1,%0\;vnand %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1989,7 +1989,7 @@ veqv %3,%1,%0\;vnand %3,%3,%2 veqv %3,%1,%0\;vnand %3,%3,%2 veqv %4,%1,%0\;vnand %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -2007,7 +2007,7 @@ vnand %3,%1,%0\;vnand %3,%3,%2 vnand %3,%1,%0\;vnand %3,%3,%2 vnand %4,%1,%0\;vnand %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -2025,7 +2025,7 @@ vnor %3,%1,%0\;vnand %3,%3,%2 vnor %3,%1,%0\;vnand %3,%3,%2 vnor %4,%1,%0\;vnand %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -2043,7 +2043,7 @@ vor %3,%1,%0\;vnand %3,%3,%2 vor %3,%1,%0\;vnand %3,%3,%2 vor %4,%1,%0\;vnand %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") 
(set_attr "length" "8")]) @@ -2061,7 +2061,7 @@ vorc %3,%1,%0\;vnand %3,%3,%2 vorc %3,%1,%0\;vnand %3,%3,%2 vorc %4,%1,%0\;vnand %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -2079,7 +2079,7 @@ vxor %3,%1,%0\;vnand %3,%3,%2 vxor %3,%1,%0\;vnand %3,%3,%2 vxor %4,%1,%0\;vnand %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -2097,7 +2097,7 @@ vand %3,%1,%0\;vnor %3,%3,%2 vand %3,%1,%0\;vnor %3,%3,%2 vand %4,%1,%0\;vnor %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -2115,7 +2115,7 @@ vandc %3,%1,%0\;vnor %3,%3,%2 vandc %3,%1,%0\;vnor %3,%3,%2 vandc %4,%1,%0\;vnor %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -2133,7 +2133,7 @@ veqv %3,%1,%0\;vnor %3,%3,%2 veqv %3,%1,%0\;vnor %3,%3,%2 veqv %4,%1,%0\;vnor %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -2151,7 +2151,7 @@ vnand %3,%1,%0\;vnor %3,%3,%2 vnand %3,%1,%0\;vnor %3,%3,%2 vnand %4,%1,%0\;vnor %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -2169,7 +2169,7 @@ vnor %3,%1,%0\;vnor %3,%3,%2 vnor %3,%1,%0\;vnor %3,%3,%2 vnor %4,%1,%0\;vnor %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -2187,7 +2187,7 @@ vor %3,%1,%0\;vnor %3,%3,%2 vor %3,%1,%0\;vnor %3,%3,%2 vor %4,%1,%0\;vnor %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -2205,7 +2205,7 @@ vorc %3,%1,%0\;vnor %3,%3,%2 vorc %3,%1,%0\;vnor %3,%3,%2 vorc %4,%1,%0\;vnor %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -2223,7 +2223,7 @@ vxor %3,%1,%0\;vnor %3,%3,%2 vxor %3,%1,%0\;vnor %3,%3,%2 vxor %4,%1,%0\;vnor %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -2241,7 +2241,7 @@ vand %3,%1,%0\;vor %3,%3,%2 vand %3,%1,%0\;vor %3,%3,%2 vand %4,%1,%0\;vor %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -2259,7 +2259,7 @@ vandc %3,%1,%0\;vor %3,%3,%2 vandc %3,%1,%0\;vor %3,%3,%2 vandc %4,%1,%0\;vor %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -2277,7 +2277,7 @@ veqv %3,%1,%0\;vor %3,%3,%2 veqv %3,%1,%0\;vor %3,%3,%2 veqv %4,%1,%0\;vor %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -2295,7 +2295,7 @@ vnand %3,%1,%0\;vor %3,%3,%2 vnand %3,%1,%0\;vor %3,%3,%2 vnand %4,%1,%0\;vor %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -2313,7 +2313,7 @@ vnor %3,%1,%0\;vor %3,%3,%2 vnor %3,%1,%0\;vor %3,%3,%2 vnor %4,%1,%0\;vor %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -2331,7 +2331,7 @@ vor %3,%1,%0\;vor %3,%3,%2 vor %3,%1,%0\;vor %3,%3,%2 vor %4,%1,%0\;vor %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr 
"type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -2349,7 +2349,7 @@ vorc %3,%1,%0\;vor %3,%3,%2 vorc %3,%1,%0\;vor %3,%3,%2 vorc %4,%1,%0\;vor %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -2367,7 +2367,7 @@ vxor %3,%1,%0\;vor %3,%3,%2 vxor %3,%1,%0\;vor %3,%3,%2 vxor %4,%1,%0\;vor %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -2385,7 +2385,7 @@ vand %3,%1,%0\;vorc %3,%3,%2 vand %3,%1,%0\;vorc %3,%3,%2 vand %4,%1,%0\;vorc %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -2403,7 +2403,7 @@ vandc %3,%1,%0\;vorc %3,%3,%2 vandc %3,%1,%0\;vorc %3,%3,%2 vandc %4,%1,%0\;vorc %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -2421,7 +2421,7 @@ veqv %3,%1,%0\;vorc %3,%3,%2 veqv %3,%1,%0\;vorc %3,%3,%2 veqv %4,%1,%0\;vorc %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -2439,7 +2439,7 @@ vnand %3,%1,%0\;vorc %3,%3,%2 vnand %3,%1,%0\;vorc %3,%3,%2 vnand %4,%1,%0\;vorc %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -2457,7 +2457,7 @@ vnor %3,%1,%0\;vorc %3,%3,%2 vnor %3,%1,%0\;vorc %3,%3,%2 vnor %4,%1,%0\;vorc %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -2475,7 +2475,7 @@ vor %3,%1,%0\;vorc %3,%3,%2 vor %3,%1,%0\;vorc %3,%3,%2 vor %4,%1,%0\;vorc %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -2493,7 +2493,7 @@ vorc %3,%1,%0\;vorc %3,%3,%2 vorc %3,%1,%0\;vorc %3,%3,%2 vorc %4,%1,%0\;vorc %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -2511,7 +2511,7 @@ vxor %3,%1,%0\;vorc %3,%3,%2 vxor %3,%1,%0\;vorc %3,%3,%2 vxor %4,%1,%0\;vorc %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -2529,7 +2529,7 @@ vand %3,%1,%0\;vxor %3,%3,%2 vand %3,%1,%0\;vxor %3,%3,%2 vand %4,%1,%0\;vxor %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -2547,7 +2547,7 @@ vandc %3,%1,%0\;vxor %3,%3,%2 vandc %3,%1,%0\;vxor %3,%3,%2 vandc %4,%1,%0\;vxor %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -2565,7 +2565,7 @@ veqv %3,%1,%0\;vxor %3,%3,%2 veqv %3,%1,%0\;vxor %3,%3,%2 veqv %4,%1,%0\;vxor %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -2583,7 +2583,7 @@ vnand %3,%1,%0\;vxor %3,%3,%2 vnand %3,%1,%0\;vxor %3,%3,%2 vnand %4,%1,%0\;vxor %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -2601,7 +2601,7 @@ vnor %3,%1,%0\;vxor %3,%3,%2 vnor %3,%1,%0\;vxor %3,%3,%2 vnor %4,%1,%0\;vxor %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -2619,7 +2619,7 @@ vor %3,%1,%0\;vxor %3,%3,%2 vor %3,%1,%0\;vxor %3,%3,%2 vor %4,%1,%0\;vxor 
%3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -2637,7 +2637,7 @@ vorc %3,%1,%0\;vxor %3,%3,%2 vorc %3,%1,%0\;vxor %3,%3,%2 vorc %4,%1,%0\;vxor %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -2655,6 +2655,6 @@ vxor %3,%1,%0\;vxor %3,%3,%2 vxor %3,%1,%0\;vxor %3,%3,%2 vxor %4,%1,%0\;vxor %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) diff --git a/gcc/config/rs6000/genfusion.pl b/gcc/config/rs6000/genfusion.pl index c86c743..ce48fd9 100755 --- a/gcc/config/rs6000/genfusion.pl +++ b/gcc/config/rs6000/genfusion.pl @@ -135,7 +135,7 @@ sub gen_ld_cmpi_p10 print " (set (match_dup 2)\n"; print " (compare:${ccmode} (match_dup 0) (match_dup 3)))]\n"; print " \"\"\n"; - print " [(set_attr \"type\" \"load\")\n"; + print " [(set_attr \"type\" \"fused_load_cmpi\")\n"; print " (set_attr \"cost\" \"8\")\n"; print " (set_attr \"length\" \"8\")])\n"; print "\n"; @@ -159,18 +159,20 @@ sub gen_2logical my ($kind, $vchr, $mode, $pred, $constraint, $cr, $outer, $outer_op, $outer_comp, $outer_inv, $outer_rtl, $inner, $inner_comp, $inner_inv, $inner_rtl, $inner_op, $both_commute, $c4, $bc, $inner_arg0, - $inner_arg1, $inner_exp, $outer_arg2, $outer_exp, $insn); + $inner_arg1, $inner_exp, $outer_arg2, $outer_exp, $insn, $fuse_type); KIND: foreach $kind ('scalar','vector') { if ( $kind eq 'vector' ) { $vchr = "v"; $mode = "VM"; $pred = "altivec_register_operand"; $constraint = "v"; + $fuse_type = "fused_vector"; } else { $vchr = ""; $mode = "GPR"; $pred = "gpc_reg_operand"; $constraint = "r"; + $fuse_type = "fused_arith_logical"; } $c4 = "${constraint},${constraint},${constraint},${constraint}"; OUTER: foreach $outer ( @logicals ) { @@ -227,7 +229,7 @@ sub gen_2logical ${inner_op} %3,%1,%0\\;${outer_op} %3,%3,%2 ${inner_op} %3,%1,%0\\;${outer_op} %3,%3,%2 ${inner_op} %4,%1,%0\\;${outer_op} %3,%4,%2" - [(set_attr "type" "logical") + [(set_attr "type" "$fuse_type") (set_attr "cost" "6") (set_attr "length" "8")]) EOF diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index 0bfeb24..3f59b54 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -206,8 +206,26 @@ vecsimple,veccomplex,vecdiv,veccmp,veccmpsimple,vecperm, vecfloat,vecfdiv,vecdouble,mtvsr,mfvsr,crypto, veclogical,veccmpfx,vecexts,vecmove, - htm,htmsimple,dfp,mma" + htm,htmsimple,dfp,mma, + fused_arith_logical, + fused_cmp_isel, + fused_carry, + fused_load_cmpi, + fused_load_load,fused_store_store, + fused_addis_load, + fused_mtbc, + fused_vector" (const_string "integer")) +;; Attr type definitions for fused pairs: +;; fused_arith_logical is used for scalar logical+add/subf and +;; add/subf+logical pairs of instructions. +;; fused_load_cmpi is used for a D-form load fused with +;; a compare immediate. +;; fused_load_load is for a fused pair of loads to adjacent addresses. +;; fused_store_store is for a fused pair of stores to adjacent addresses. +;; fused_addis_load is for addis fused to D-form load for a larger immediate. +;; fused_mtbc is for fused mtlr and bclr[l] pairs. +;; fused_vector is for a fused pair of vector logical instructions. ;; What data size does this instruction work on? ;; This is used for insert, mul and others as necessary. 
-- cgit v1.1 From 079c23cfe079f203d5df83fea8e92a60c7d7e878 Mon Sep 17 00:00:00 2001 From: Kyrylo Tkachov Date: Fri, 21 May 2021 14:46:00 +0100 Subject: aarch64: Add attributes for builtins specified in aarch64-builtins.c Besides the builtins in aarch64-simd-builtins.def there are a number of builtins defined in aarch64-builtins.c itself. They could also benefit from the attributes generated by aarch64_get_attributes. However aarch64_get_attributes and its helpers are only set up to handle a aarch64_simd_builtin_datum. This patch changes these functions to instead take a flag and mode value that are extracted from aarch64_simd_builtin_datum.flags and aarch64_simd_builtin_datum.mode anyway. Then the various builtin init functions in aarch64-builtins.c can pass down their own FLAG_* flags that they want to derive attributes from. gcc/ChangeLog: * config/aarch64/aarch64-builtins.c (aarch64_call_properties): Take a flag and mode value as arguments. (aarch64_modifies_global_state_p): Likewise. (aarch64_reads_global_state_p): Likewise. (aarch64_could_trap_p): Likewise. (aarch64_get_attributes): Likewise. (aarch64_init_simd_builtins): Adjust callsite of above. (aarch64_init_fcmla_laneq_builtins): Use aarch64_get_attributes to get function attributes to apply to builtins. (aarch64_init_crc32_builtins): Likewise. (aarch64_init_builtin_rsqrt): Likewise. --- gcc/config/aarch64/aarch64-builtins.c | 56 +++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 25 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c index acdea2a..3cab3ec 100644 --- a/gcc/config/aarch64/aarch64-builtins.c +++ b/gcc/config/aarch64/aarch64-builtins.c @@ -906,14 +906,13 @@ aarch64_init_simd_builtin_scalar_types (void) "__builtin_aarch64_simd_udi"); } -/* Return a set of FLAG_* flags that describe what the function could do, +/* Return a set of FLAG_* flags derived from FLAGS + that describe what a function with result MODE could do, taking the command-line flags into account. */ static unsigned int -aarch64_call_properties (aarch64_simd_builtin_datum *d) +aarch64_call_properties (unsigned int flags, machine_mode mode) { - unsigned int flags = d->flags; - - if (!(flags & FLAG_AUTO_FP) && FLOAT_MODE_P (d->mode)) + if (!(flags & FLAG_AUTO_FP) && FLOAT_MODE_P (mode)) flags |= FLAG_FP; /* -fno-trapping-math means that we can assume any FP exceptions @@ -924,12 +923,12 @@ aarch64_call_properties (aarch64_simd_builtin_datum *d) return flags; } -/* Return true if calls to the function could modify some form of - global state. */ +/* Return true if calls to a function with flags F and mode MODE + could modify some form of global state. */ static bool -aarch64_modifies_global_state_p (aarch64_simd_builtin_datum *d) +aarch64_modifies_global_state_p (unsigned int f, machine_mode mode) { - unsigned int flags = aarch64_call_properties (d); + unsigned int flags = aarch64_call_properties (f, mode); if (flags & FLAG_RAISE_FP_EXCEPTIONS) return true; @@ -940,12 +939,12 @@ aarch64_modifies_global_state_p (aarch64_simd_builtin_datum *d) return flags & FLAG_WRITE_MEMORY; } -/* Return true if calls to the function could read some form of - global state. */ +/* Return true if calls to a function with flags F and mode MODE + could read some form of global state. 
*/ static bool -aarch64_reads_global_state_p (aarch64_simd_builtin_datum *d) +aarch64_reads_global_state_p (unsigned int f, machine_mode mode) { - unsigned int flags = aarch64_call_properties (d); + unsigned int flags = aarch64_call_properties (f, mode); if (flags & FLAG_READ_FPCR) return true; @@ -953,11 +952,12 @@ aarch64_reads_global_state_p (aarch64_simd_builtin_datum *d) return flags & FLAG_READ_MEMORY; } -/* Return true if calls to the function could raise a signal. */ +/* Return true if calls to a function with flags F and mode MODE + could raise a signal. */ static bool -aarch64_could_trap_p (aarch64_simd_builtin_datum *d) +aarch64_could_trap_p (unsigned int f, machine_mode mode) { - unsigned int flags = aarch64_call_properties (d); + unsigned int flags = aarch64_call_properties (f, mode); if (flags & FLAG_RAISE_FP_EXCEPTIONS) return true; @@ -975,21 +975,22 @@ aarch64_add_attribute (const char *name, tree attrs) return tree_cons (get_identifier (name), NULL_TREE, attrs); } -/* Return the appropriate function attributes. */ +/* Return the appropriate attributes for a function that has + flags F and mode MODE. */ static tree -aarch64_get_attributes (aarch64_simd_builtin_datum *d) +aarch64_get_attributes (unsigned int f, machine_mode mode) { tree attrs = NULL_TREE; - if (!aarch64_modifies_global_state_p (d)) + if (!aarch64_modifies_global_state_p (f, mode)) { - if (aarch64_reads_global_state_p (d)) + if (aarch64_reads_global_state_p (f, mode)) attrs = aarch64_add_attribute ("pure", attrs); else attrs = aarch64_add_attribute ("const", attrs); } - if (!flag_non_call_exceptions || !aarch64_could_trap_p (d)) + if (!flag_non_call_exceptions || !aarch64_could_trap_p (f, mode)) attrs = aarch64_add_attribute ("nothrow", attrs); return aarch64_add_attribute ("leaf", attrs); @@ -1018,7 +1019,9 @@ aarch64_init_fcmla_laneq_builtins (void) = aarch64_simd_builtin_std_type (SImode, qualifier_lane_pair_index); tree ftype = build_function_type_list (argtype, argtype, argtype, quadtype, lanetype, NULL_TREE); - tree fndecl = aarch64_general_add_builtin (d->name, ftype, d->fcode); + tree attrs = aarch64_get_attributes (FLAG_FP, d->mode); + tree fndecl + = aarch64_general_add_builtin (d->name, ftype, d->fcode, attrs); aarch64_builtin_decls[d->fcode] = fndecl; } @@ -1148,7 +1151,7 @@ aarch64_init_simd_builtins (void) snprintf (namebuf, sizeof (namebuf), "__builtin_aarch64_%s", d->name); - tree attrs = aarch64_get_attributes (d); + tree attrs = aarch64_get_attributes (d->flags, d->mode); fndecl = aarch64_general_add_builtin (namebuf, ftype, fcode, attrs); aarch64_builtin_decls[fcode] = fndecl; @@ -1170,7 +1173,9 @@ aarch64_init_crc32_builtins () tree argtype = aarch64_simd_builtin_std_type (d->mode, qualifier_unsigned); tree ftype = build_function_type_list (usi_type, usi_type, argtype, NULL_TREE); - tree fndecl = aarch64_general_add_builtin (d->name, ftype, d->fcode); + tree attrs = aarch64_get_attributes (FLAG_NONE, d->mode); + tree fndecl + = aarch64_general_add_builtin (d->name, ftype, d->fcode, attrs); aarch64_builtin_decls[d->fcode] = fndecl; } @@ -1210,8 +1215,9 @@ aarch64_init_builtin_rsqrt (void) for (; bdd < bdd_end; bdd++) { ftype = build_function_type_list (bdd->type_node, bdd->type_node, NULL_TREE); + tree attrs = aarch64_get_attributes (FLAG_FP, TYPE_MODE (bdd->type_node)); fndecl = aarch64_general_add_builtin (bdd->builtin_name, - ftype, bdd->function_code); + ftype, bdd->function_code, attrs); aarch64_builtin_decls[bdd->function_code] = fndecl; } } -- cgit v1.1 From 
782e57f2c0900f3c3bbaec4b367568b6d05236b8 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 21 May 2021 05:52:11 -0700 Subject: x86: Remove MAX_BITSIZE_MODE_ANY_INT It is only defined for i386 and everyone uses the default: #define MAX_BITSIZE_MODE_ANY_INT (64*BITS_PER_UNIT) Whatever problems we had before, they have been fixed now. * config/i386/i386-modes.def (MAX_BITSIZE_MODE_ANY_INT): Removed. --- gcc/config/i386/i386-modes.def | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-modes.def b/gcc/config/i386/i386-modes.def index dbddfd8..4e7014b 100644 --- a/gcc/config/i386/i386-modes.def +++ b/gcc/config/i386/i386-modes.def @@ -107,19 +107,10 @@ INT_MODE (XI, 64); PARTIAL_INT_MODE (HI, 16, P2QI); PARTIAL_INT_MODE (SI, 32, P2HI); -/* Mode used for signed overflow checking of TImode. As - MAX_BITSIZE_MODE_ANY_INT is only 160, wide-int.h reserves only that - rounded up to multiple of HOST_BITS_PER_WIDE_INT bits in wide_int etc., - so OImode is too large. For the overflow checking we actually need - just 1 or 2 bits beyond TImode precision. Use 160 bits to have - a multiple of 32. */ +/* Mode used for signed overflow checking of TImode. For the overflow + checking we actually need just 1 or 2 bits beyond TImode precision. + Use 160 bits to have a multiple of 32. */ PARTIAL_INT_MODE (OI, 160, POI); -/* Keep the OI and XI modes from confusing the compiler into thinking - that these modes could actually be used for computation. They are - only holders for vectors during data movement. Include POImode precision - though. */ -#define MAX_BITSIZE_MODE_ANY_INT (160) - /* The symbol Pmode stands for one of the above machine modes (usually SImode). The tm.h file specifies which one. It is not a distinct mode. */ -- cgit v1.1 From 29a2f51806c5b30e17a8d0e9ba7915a3c53c34ff Mon Sep 17 00:00:00 2001 From: Julian Brown Date: Fri, 26 Feb 2021 04:34:49 -0800 Subject: openacc: Add support for gang local storage allocation in shared memory [PR90115] This patch implements a method to track the "private-ness" of OpenACC variables declared in offload regions in gang-partitioned, worker-partitioned or vector-partitioned modes. Variables declared implicitly in scoped blocks and those declared "private" on enclosing directives (e.g. "acc parallel") are both handled. Variables that are e.g. gang-private can then be adjusted so they reside in GPU shared memory. The reason for doing this is twofold: correct implementation of OpenACC semantics, and optimisation, since shared memory might be faster than the main memory on a GPU. Handling of private variables is intimately tied to the execution model for gangs/workers/vectors implemented by a particular target: for current targets, we use (or on mainline, will soon use) a broadcasting/neutering scheme. That is sufficient for code that e.g. sets a variable in worker-single mode and expects to use the value in worker-partitioned mode. The difficulty (semantics-wise) comes when the user wants to do something like an atomic operation in worker-partitioned mode and expects a worker-single (gang private) variable to be shared across each partitioned worker. Forcing use of shared memory for such variables makes that work properly. In terms of implementation, the parallelism level of a given loop is not fixed until the oaccdevlow pass in the offload compiler, so the patch delays fixing the parallelism level of variables declared on or within such loops until the same point. 
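To make the semantics issue above concrete, the kind of code involved looks roughly like this (a sketch loosely modelled on the new private-atomic-1-gang tests; names and sizes are illustrative, not the actual testcase):

#define N 32
int res[N];

void
f (void)
{
#pragma acc parallel loop gang copyout(res)
  for (int i = 0; i < N; i++)
    {
      /* "w" is declared inside the gang-partitioned loop, so it is
         gang-private.  */
      int w = 0;
#pragma acc loop worker
      for (int j = 0; j < 64; j++)
        {
#pragma acc atomic update
          w++;  /* Every worker of the gang must update the same copy.  */
        }
      /* Only correct (res[i] == 64) if "w" lives in storage shared by
         all workers of the gang, e.g. GPU shared memory.  */
      res[i] = w;
    }
}
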
This is done by adding a new internal UNIQUE function (OACC_PRIVATE) that lists (the address of) each private variable as an argument, and other arguments set so as to be able to determine the correct parallelism level to use for the listed variables. This new internal function fits into the existing scheme for demarcating OpenACC loops, as described in comments in the patch. Two new target hooks are introduced: TARGET_GOACC_ADJUST_PRIVATE_DECL and TARGET_GOACC_EXPAND_VAR_DECL. The first can tweak a variable declaration at oaccdevlow time, and the second at expand time. The first or both of these target hooks can be used by a given offload target, depending on its strategy for implementing private variables. This patch updates the TARGET_GOACC_ADJUST_PRIVATE_DECL target hook in the AMD GCN backend to the current name and prototype. (An earlier version of the hook was already present, but dormant.) gcc/ PR middle-end/90115 * doc/tm.texi.in (TARGET_GOACC_EXPAND_VAR_DECL) (TARGET_GOACC_ADJUST_PRIVATE_DECL): Add documentation hooks. * doc/tm.texi: Regenerate. * expr.c (expand_expr_real_1): Expand decls using the expand_var_decl OpenACC hook if defined. * internal-fn.c (expand_UNIQUE): Handle IFN_UNIQUE_OACC_PRIVATE. * internal-fn.h (IFN_UNIQUE_CODES): Add OACC_PRIVATE. * omp-low.c (omp_context): Add oacc_privatization_candidates field. (lower_oacc_reductions): Add PRIVATE_MARKER parameter. Insert before fork. (lower_oacc_head_tail): Add PRIVATE_MARKER parameter. Modify private marker's gimple call arguments, and pass it to lower_oacc_reductions. (oacc_privatization_scan_clause_chain) (oacc_privatization_scan_decl_chain, lower_oacc_private_marker): New functions. (lower_omp_for, lower_omp_target, lower_omp_1): Use these. * omp-offload.c (convert.h): Include. (oacc_loop_xform_head_tail): Treat private-variable markers like fork/join when transforming head/tail sequences. (struct var_decl_rewrite_info): Add struct. (oacc_rewrite_var_decl, is_sync_builtin_call): New functions. (execute_oacc_device_lower): Support rewriting gang-private variables using target hook, and fix up addr_expr and var_decl nodes afterwards. * target.def (adjust_private_decl, expand_var_decl): New hooks. * config/gcn/gcn-protos.h (gcn_goacc_adjust_gangprivate_decl): Rename to... (gcn_goacc_adjust_private_decl): ...this. * config/gcn/gcn-tree.c (gcn_goacc_adjust_gangprivate_decl): Rename to... (gcn_goacc_adjust_private_decl): ...this. Add LEVEL parameter. * config/gcn/gcn.c (TARGET_GOACC_ADJUST_GANGPRIVATE_DECL): Rename definition using gcn_goacc_adjust_gangprivate_decl... (TARGET_GOACC_ADJUST_PRIVATE_DECL): ...to this, using gcn_goacc_adjust_private_decl. * config/nvptx/nvptx.c (tree-pretty-print.h): Include. (gang_private_shared_size): New global variable. (gang_private_shared_align): Likewise. (gang_private_shared_sym): Likewise. (gang_private_shared_hmap): Likewise. (nvptx_option_override): Initialize these. (nvptx_file_end): Output gang_private_shared_sym. (nvptx_goacc_adjust_private_decl, nvptx_goacc_expand_var_decl): New functions. (nvptx_set_current_function): Clear gang_private_shared_hmap. (TARGET_GOACC_ADJUST_PRIVATE_DECL): Define hook. (TARGET_GOACC_EXPAND_VAR_DECL): Likewise. libgomp/ PR middle-end/90115 * testsuite/libgomp.oacc-c-c++-common/private-atomic-1-gang.c: New test. * testsuite/libgomp.oacc-fortran/private-atomic-1-gang.f90: Likewise. * testsuite/libgomp.oacc-fortran/private-atomic-1-worker.f90: Likewise. 
Co-Authored-By: Chung-Lin Tang Co-Authored-By: Thomas Schwinge --- gcc/config/gcn/gcn-protos.h | 2 +- gcc/config/gcn/gcn-tree.c | 9 +++-- gcc/config/gcn/gcn.c | 4 +-- gcc/config/nvptx/nvptx.c | 80 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 90 insertions(+), 5 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/gcn/gcn-protos.h b/gcc/config/gcn/gcn-protos.h index dc9331c..7ef7ae8 100644 --- a/gcc/config/gcn/gcn-protos.h +++ b/gcc/config/gcn/gcn-protos.h @@ -40,7 +40,7 @@ extern rtx gcn_gen_undef (machine_mode); extern bool gcn_global_address_p (rtx); extern tree gcn_goacc_adjust_propagation_record (tree record_type, bool sender, const char *name); -extern void gcn_goacc_adjust_gangprivate_decl (tree var); +extern tree gcn_goacc_adjust_private_decl (tree var, int level); extern void gcn_goacc_reduction (gcall *call); extern bool gcn_hard_regno_rename_ok (unsigned int from_reg, unsigned int to_reg); diff --git a/gcc/config/gcn/gcn-tree.c b/gcc/config/gcn/gcn-tree.c index 8f27099..75ea50c 100644 --- a/gcc/config/gcn/gcn-tree.c +++ b/gcc/config/gcn/gcn-tree.c @@ -577,9 +577,12 @@ gcn_goacc_adjust_propagation_record (tree record_type, bool sender, return decl; } -void -gcn_goacc_adjust_gangprivate_decl (tree var) +tree +gcn_goacc_adjust_private_decl (tree var, int level) { + if (level != GOMP_DIM_GANG) + return var; + tree type = TREE_TYPE (var); tree lds_type = build_qualified_type (type, TYPE_QUALS_NO_ADDR_SPACE (type) @@ -597,6 +600,8 @@ gcn_goacc_adjust_gangprivate_decl (tree var) if (machfun) machfun->use_flat_addressing = true; + + return var; } /* }}} */ diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c index 9660ca6..283a91f 100644 --- a/gcc/config/gcn/gcn.c +++ b/gcc/config/gcn/gcn.c @@ -6320,8 +6320,8 @@ gcn_dwarf_register_span (rtx rtl) #undef TARGET_GOACC_ADJUST_PROPAGATION_RECORD #define TARGET_GOACC_ADJUST_PROPAGATION_RECORD \ gcn_goacc_adjust_propagation_record -#undef TARGET_GOACC_ADJUST_GANGPRIVATE_DECL -#define TARGET_GOACC_ADJUST_GANGPRIVATE_DECL gcn_goacc_adjust_gangprivate_decl +#undef TARGET_GOACC_ADJUST_PRIVATE_DECL +#define TARGET_GOACC_ADJUST_PRIVATE_DECL gcn_goacc_adjust_private_decl #undef TARGET_GOACC_FORK_JOIN #define TARGET_GOACC_FORK_JOIN gcn_fork_join #undef TARGET_GOACC_REDUCTION diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index 722b0fa..80116e5 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -75,6 +75,7 @@ #include "fold-const.h" #include "intl.h" #include "opts.h" +#include "tree-pretty-print.h" /* This file should be included last. */ #include "target-def.h" @@ -167,6 +168,12 @@ static unsigned vector_red_align; static unsigned vector_red_partition; static GTY(()) rtx vector_red_sym; +/* Shared memory block for gang-private variables. */ +static unsigned gang_private_shared_size; +static unsigned gang_private_shared_align; +static GTY(()) rtx gang_private_shared_sym; +static hash_map gang_private_shared_hmap; + /* Global lock variable, needed for 128bit worker & gang reductions. 
*/ static GTY(()) tree global_lock_var; @@ -251,6 +258,10 @@ nvptx_option_override (void) vector_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT; vector_red_partition = 0; + gang_private_shared_sym = gen_rtx_SYMBOL_REF (Pmode, "__gang_private_shared"); + SET_SYMBOL_DATA_AREA (gang_private_shared_sym, DATA_AREA_SHARED); + gang_private_shared_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT; + diagnose_openacc_conflict (TARGET_GOMP, "-mgomp"); diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack"); diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt"); @@ -5435,6 +5446,10 @@ nvptx_file_end (void) write_shared_buffer (asm_out_file, vector_red_sym, vector_red_align, vector_red_size); + if (gang_private_shared_size) + write_shared_buffer (asm_out_file, gang_private_shared_sym, + gang_private_shared_align, gang_private_shared_size); + if (need_softstack_decl) { write_var_marker (asm_out_file, false, true, "__nvptx_stacks"); @@ -6662,6 +6677,64 @@ nvptx_truly_noop_truncation (poly_uint64, poly_uint64) return false; } +/* Implement TARGET_GOACC_ADJUST_PRIVATE_DECL. */ + +static tree +nvptx_goacc_adjust_private_decl (tree decl, int level) +{ + if (level != GOMP_DIM_GANG) + return decl; + + /* Set "oacc gang-private" attribute for gang-private variable + declarations. */ + if (!lookup_attribute ("oacc gang-private", DECL_ATTRIBUTES (decl))) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "Setting 'oacc gang-private' attribute for decl:"); + print_generic_decl (dump_file, decl, TDF_SLIM); + fputc ('\n', dump_file); + } + tree id = get_identifier ("oacc gang-private"); + DECL_ATTRIBUTES (decl) = tree_cons (id, NULL, DECL_ATTRIBUTES (decl)); + } + + return decl; +} + +/* Implement TARGET_GOACC_EXPAND_VAR_DECL. */ + +static rtx +nvptx_goacc_expand_var_decl (tree var) +{ + /* Place "oacc gang-private" variables in shared memory. 
*/ + if (VAR_P (var) + && lookup_attribute ("oacc gang-private", DECL_ATTRIBUTES (var))) + { + unsigned int offset, *poffset; + poffset = gang_private_shared_hmap.get (var); + if (poffset) + offset = *poffset; + else + { + unsigned HOST_WIDE_INT align = DECL_ALIGN (var); + gang_private_shared_size + = (gang_private_shared_size + align - 1) & ~(align - 1); + if (gang_private_shared_align < align) + gang_private_shared_align = align; + + offset = gang_private_shared_size; + bool existed = gang_private_shared_hmap.put (var, offset); + gcc_checking_assert (!existed); + gang_private_shared_size += tree_to_uhwi (DECL_SIZE_UNIT (var)); + } + rtx addr = plus_constant (Pmode, gang_private_shared_sym, offset); + return gen_rtx_MEM (TYPE_MODE (TREE_TYPE (var)), addr); + } + + return NULL_RTX; +} + static GTY(()) tree nvptx_previous_fndecl; static void @@ -6670,6 +6743,7 @@ nvptx_set_current_function (tree fndecl) if (!fndecl || fndecl == nvptx_previous_fndecl) return; + gang_private_shared_hmap.empty (); nvptx_previous_fndecl = fndecl; vector_red_partition = 0; oacc_bcast_partition = 0; @@ -6834,6 +6908,12 @@ nvptx_libc_has_function (enum function_class fn_class, tree type) #undef TARGET_HAVE_SPECULATION_SAFE_VALUE #define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed +#undef TARGET_GOACC_ADJUST_PRIVATE_DECL +#define TARGET_GOACC_ADJUST_PRIVATE_DECL nvptx_goacc_adjust_private_decl + +#undef TARGET_GOACC_EXPAND_VAR_DECL +#define TARGET_GOACC_EXPAND_VAR_DECL nvptx_goacc_expand_var_decl + #undef TARGET_SET_CURRENT_FUNCTION #define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function -- cgit v1.1 From f6f45309d9fc140006886456b291e4ac24812cea Mon Sep 17 00:00:00 2001 From: Thomas Schwinge Date: Thu, 20 May 2021 15:08:38 +0200 Subject: [OpenACC privatization, nvptx] Tighten some aspects [PR90115] No functional change. gcc/ PR middle-end/90115 * config/nvptx/nvptx.c (nvptx_goacc_adjust_private_decl) (nvptx_goacc_expand_var_decl): Tighten. --- gcc/config/nvptx/nvptx.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index 80116e5..60d3f07 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -6682,12 +6682,12 @@ nvptx_truly_noop_truncation (poly_uint64, poly_uint64) static tree nvptx_goacc_adjust_private_decl (tree decl, int level) { - if (level != GOMP_DIM_GANG) - return decl; + gcc_checking_assert (!lookup_attribute ("oacc gang-private", + DECL_ATTRIBUTES (decl))); /* Set "oacc gang-private" attribute for gang-private variable declarations. */ - if (!lookup_attribute ("oacc gang-private", DECL_ATTRIBUTES (decl))) + if (level == GOMP_DIM_GANG) { if (dump_file && (dump_flags & TDF_DETAILS)) { @@ -6708,9 +6708,10 @@ static rtx nvptx_goacc_expand_var_decl (tree var) { /* Place "oacc gang-private" variables in shared memory. */ - if (VAR_P (var) - && lookup_attribute ("oacc gang-private", DECL_ATTRIBUTES (var))) + if (lookup_attribute ("oacc gang-private", DECL_ATTRIBUTES (var))) { + gcc_checking_assert (VAR_P (var)); + unsigned int offset, *poffset; poffset = gang_private_shared_hmap.get (var); if (poffset) -- cgit v1.1 From 11b8286a83289f5b54e813f14ff56d730c3f3185 Mon Sep 17 00:00:00 2001 From: Thomas Schwinge Date: Thu, 20 May 2021 16:11:37 +0200 Subject: [OpenACC privatization] Largely extend diagnostics and corresponding testsuite coverage [PR90115] gcc/ PR middle-end/90115 * flag-types.h (enum openacc_privatization): New. 
* params.opt (-param=openacc-privatization): New. * doc/invoke.texi (openacc-privatization): Document it. * omp-general.h (get_openacc_privatization_dump_flags): New function. * omp-low.c (oacc_privatization_candidate_p): Add diagnostics. * omp-offload.c (execute_oacc_device_lower) : Re-work diagnostics. * target.def (goacc.adjust_private_decl): Add 'location_t' parameter. * doc/tm.texi: Regenerate. * config/gcn/gcn-protos.h (gcn_goacc_adjust_private_decl): Adjust. * config/gcn/gcn-tree.c (gcn_goacc_adjust_private_decl): Likewise. * config/nvptx/nvptx.c (nvptx_goacc_adjust_private_decl): Likewise. Preserve it for... (nvptx_goacc_expand_var_decl): ... use here. gcc/testsuite/ PR middle-end/90115 * c-c++-common/goacc/privatization-1-compute-loop.c: New file. * c-c++-common/goacc/privatization-1-compute.c: Likewise. * c-c++-common/goacc/privatization-1-routine_gang-loop.c: Likewise. * c-c++-common/goacc/privatization-1-routine_gang.c: Likewise. * gfortran.dg/goacc/privatization-1-compute-loop.f90: Likewise. * gfortran.dg/goacc/privatization-1-compute.f90: Likewise. * gfortran.dg/goacc/privatization-1-routine_gang-loop.f90: Likewise. * gfortran.dg/goacc/privatization-1-routine_gang.f90: Likewise. * c-c++-common/goacc-gomp/nesting-1.c: Update. * c-c++-common/goacc/private-reduction-1.c: Likewise. * gfortran.dg/goacc/private-3.f95: Likewise. libgomp/ PR middle-end/90115 * testsuite/libgomp.oacc-fortran/private-atomic-1-vector.f90: New file. * testsuite/libgomp.oacc-c-c++-common/firstprivate-1.c: Update. * testsuite/libgomp.oacc-c-c++-common/host_data-7.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-decompose-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-local-worker-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-local-worker-2.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-local-worker-3.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-local-worker-4.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-local-worker-5.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-2.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-3.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-4.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-5.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-6.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-vector-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-vector-2.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-2.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-3.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-4.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-5.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-6.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-7.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/loop-g-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/loop-g-2.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c: Likewise. 
* testsuite/libgomp.oacc-c-c++-common/loop-gwv-2.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/loop-red-g-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/loop-red-v-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/loop-red-v-2.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/loop-red-w-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/loop-red-w-2.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/loop-v-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/loop-w-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/parallel-reduction.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/private-atomic-1-gang.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/private-atomic-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/private-variables.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/routine-4.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/static-variable-1.c: Likewise. * testsuite/libgomp.oacc-fortran/acc_on_device-1-1.f90: Likewise. * testsuite/libgomp.oacc-fortran/acc_on_device-1-2.f: Likewise. * testsuite/libgomp.oacc-fortran/acc_on_device-1-3.f: Likewise. * testsuite/libgomp.oacc-fortran/declare-1.f90: Likewise. * testsuite/libgomp.oacc-fortran/host_data-5.F90: Likewise. * testsuite/libgomp.oacc-fortran/if-1.f90: Likewise. * testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-gang-1.f90: Likewise. * testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-gang-2.f90: Likewise. * testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-gang-3.f90: Likewise. * testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-gang-6.f90: Likewise. * testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-vector-1.f90: Likewise. * testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-vector-2.f90: Likewise. * testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-1.f90: Likewise. * testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-2.f90: Likewise. * testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-3.f90: Likewise. * testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-4.f90: Likewise. * testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-5.f90: Likewise. * testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-6.f90: Likewise. * testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-7.f90: Likewise. * testsuite/libgomp.oacc-fortran/optional-private.f90: Likewise. * testsuite/libgomp.oacc-fortran/parallel-dims.f90: Likewise. * testsuite/libgomp.oacc-fortran/private-atomic-1-gang.f90: Likewise. * testsuite/libgomp.oacc-fortran/private-atomic-1-worker.f90: Likewise. * testsuite/libgomp.oacc-fortran/private-variables.f90: Likewise. * testsuite/libgomp.oacc-fortran/privatized-ref-2.f90: Likewise. * testsuite/libgomp.oacc-fortran/routine-7.f90: Likewise. 
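To make the new diagnostics concrete, here is a hedged sketch (illustrative only, not one of the new tests; the note wording is taken from the nvptx hunk in the diff below, and the exact value spellings of the new --param=openacc-privatization parameter are defined in params.opt rather than repeated here):

/* Compiled for an OpenACC offload target with --param=openacc-privatization
   set to its non-quiet value, the compiler now points at the declaration it
   privatized, along the lines of:

     note: variable 'x' adjusted for OpenACC privatization level: 'gang'  */

void
f (void)
{
#pragma acc parallel num_gangs(8)
  {
    int x = 0;   /* implicitly gang-private: block-scoped inside the region */
    x += 1;
  }
}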
--- gcc/config/gcn/gcn-protos.h | 2 +- gcc/config/gcn/gcn-tree.c | 2 +- gcc/config/nvptx/nvptx.c | 61 ++++++++++++++++++++++++++++++++++++++------- 3 files changed, 54 insertions(+), 11 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/gcn/gcn-protos.h b/gcc/config/gcn/gcn-protos.h index 7ef7ae8..8bd0b43 100644 --- a/gcc/config/gcn/gcn-protos.h +++ b/gcc/config/gcn/gcn-protos.h @@ -40,7 +40,7 @@ extern rtx gcn_gen_undef (machine_mode); extern bool gcn_global_address_p (rtx); extern tree gcn_goacc_adjust_propagation_record (tree record_type, bool sender, const char *name); -extern tree gcn_goacc_adjust_private_decl (tree var, int level); +extern tree gcn_goacc_adjust_private_decl (location_t, tree var, int level); extern void gcn_goacc_reduction (gcall *call); extern bool gcn_hard_regno_rename_ok (unsigned int from_reg, unsigned int to_reg); diff --git a/gcc/config/gcn/gcn-tree.c b/gcc/config/gcn/gcn-tree.c index 75ea50c..1eb8882 100644 --- a/gcc/config/gcn/gcn-tree.c +++ b/gcc/config/gcn/gcn-tree.c @@ -578,7 +578,7 @@ gcn_goacc_adjust_propagation_record (tree record_type, bool sender, } tree -gcn_goacc_adjust_private_decl (tree var, int level) +gcn_goacc_adjust_private_decl (location_t, tree var, int level) { if (level != GOMP_DIM_GANG) return var; diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index 60d3f07..6642bdf 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -6680,7 +6680,7 @@ nvptx_truly_noop_truncation (poly_uint64, poly_uint64) /* Implement TARGET_GOACC_ADJUST_PRIVATE_DECL. */ static tree -nvptx_goacc_adjust_private_decl (tree decl, int level) +nvptx_goacc_adjust_private_decl (location_t loc, tree decl, int level) { gcc_checking_assert (!lookup_attribute ("oacc gang-private", DECL_ATTRIBUTES (decl))); @@ -6689,14 +6689,12 @@ nvptx_goacc_adjust_private_decl (tree decl, int level) declarations. */ if (level == GOMP_DIM_GANG) { - if (dump_file && (dump_flags & TDF_DETAILS)) - { - fprintf (dump_file, "Setting 'oacc gang-private' attribute for decl:"); - print_generic_decl (dump_file, decl, TDF_SLIM); - fputc ('\n', dump_file); - } tree id = get_identifier ("oacc gang-private"); - DECL_ATTRIBUTES (decl) = tree_cons (id, NULL, DECL_ATTRIBUTES (decl)); + /* For later diagnostic purposes, pass LOC as VALUE (wrapped as a + TREE). */ + tree loc_tree = build_empty_stmt (loc); + DECL_ATTRIBUTES (decl) + = tree_cons (id, loc_tree, DECL_ATTRIBUTES (decl)); } return decl; @@ -6708,7 +6706,8 @@ static rtx nvptx_goacc_expand_var_decl (tree var) { /* Place "oacc gang-private" variables in shared memory. */ - if (lookup_attribute ("oacc gang-private", DECL_ATTRIBUTES (var))) + if (tree attr = lookup_attribute ("oacc gang-private", + DECL_ATTRIBUTES (var))) { gcc_checking_assert (VAR_P (var)); @@ -6728,6 +6727,50 @@ nvptx_goacc_expand_var_decl (tree var) bool existed = gang_private_shared_hmap.put (var, offset); gcc_checking_assert (!existed); gang_private_shared_size += tree_to_uhwi (DECL_SIZE_UNIT (var)); + + location_t loc = EXPR_LOCATION (TREE_VALUE (attr)); +#if 0 /* For some reason, this doesn't work. */ + if (dump_enabled_p ()) + { + dump_flags_t l_dump_flags + = get_openacc_privatization_dump_flags (); + + const dump_user_location_t d_u_loc + = dump_user_location_t::from_location_t (loc); +/* PR100695 "Format decoder, quoting in 'dump_printf' etc." 
*/ +#if __GNUC__ >= 10 +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wformat" +#endif + dump_printf_loc (l_dump_flags, d_u_loc, + "variable %<%T%> adjusted for OpenACC" + " privatization level: %qs\n", + var, "gang"); +#if __GNUC__ >= 10 +# pragma GCC diagnostic pop +#endif + } +#else /* ..., thus emulate that, good enough for testsuite usage. */ + if (param_openacc_privatization != OPENACC_PRIVATIZATION_QUIET) + inform (loc, + "variable %qD adjusted for OpenACC privatization level:" + " %qs", + var, "gang"); + if (dump_file && (dump_flags & TDF_DETAILS)) + { + /* 'dumpfile.c:dump_loc' */ + fprintf (dump_file, "%s:%d:%d: ", LOCATION_FILE (loc), + LOCATION_LINE (loc), LOCATION_COLUMN (loc)); + fprintf (dump_file, "%s: ", "note"); + + fprintf (dump_file, + "variable '"); + print_generic_expr (dump_file, var, TDF_SLIM); + fprintf (dump_file, + "' adjusted for OpenACC privatization level: '%s'\n", + "gang"); + } +#endif } rtx addr = plus_constant (Pmode, gang_private_shared_sym, offset); return gen_rtx_MEM (TYPE_MODE (TREE_TYPE (var)), addr); -- cgit v1.1 From 842a05518982a130c8087d9ad6bdc457bec184e3 Mon Sep 17 00:00:00 2001 From: Aaron Sawdey Date: Mon, 25 Jan 2021 21:11:52 -0600 Subject: combine patterns for add-add fusion This patch adds a function to genfusion.pl to add a couple more patterns so combine can do fusion of pairs of add and vaddudm instructions. gcc/ChangeLog: * config/rs6000/genfusion.pl (gen_addadd): New function. * config/rs6000/fusion.md: Regenerate file. * config/rs6000/rs6000-cpus.def: Add OPTION_MASK_P10_FUSION_2ADD to masks. * config/rs6000/rs6000.c (rs6000_option_override_internal): Handle default value of OPTION_MASK_P10_FUSION_2ADD. * config/rs6000/rs6000.opt: Add -mpower10-fusion-2add. gcc/testsuite/ChangeLog: * gcc.target/powerpc/fusion-p10-addadd.c: New file. 
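For reference, this is the kind of source that should now map onto the new fused pairs (a hedged sketch: flags such as -O2 -mcpu=power10, or the -mpower10-fusion-2add option added by this patch, are assumed, and combine must actually form the pair):

/* Two dependent GPR adds; combine can match the *fuse_add_add pattern
   and keep them back to back for the fusion facility.  */
long
add3 (long a, long b, long c)
{
  return a + b + c;
}

/* Likewise two dependent vaddudm instructions for V2DI, matched by the
   *fuse_vaddudm_vaddudm pattern.  */
__vector unsigned long long
vadd3 (__vector unsigned long long a, __vector unsigned long long b,
       __vector unsigned long long c)
{
  return a + b + c;
}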
--- gcc/config/rs6000/fusion.md | 36 ++++++++++++++++++++++++++++++++ gcc/config/rs6000/genfusion.pl | 44 +++++++++++++++++++++++++++++++++++++++ gcc/config/rs6000/rs6000-cpus.def | 4 +++- gcc/config/rs6000/rs6000.c | 10 +++++++-- gcc/config/rs6000/rs6000.opt | 4 ++++ 5 files changed, 95 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/fusion.md b/gcc/config/rs6000/fusion.md index 6d71bc2..6dfe1fa 100644 --- a/gcc/config/rs6000/fusion.md +++ b/gcc/config/rs6000/fusion.md @@ -2658,3 +2658,39 @@ [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) + +;; add-add fusion pattern generated by gen_addadd +(define_insn "*fuse_add_add" + [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + (plus:GPR + (plus:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (match_operand:GPR 1 "gpc_reg_operand" "%r,r,r,r")) + (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) + (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] + "(TARGET_P10_FUSION && TARGET_P10_FUSION_2ADD)" + "@ + add %3,%1,%0\;add %3,%3,%2 + add %3,%1,%0\;add %3,%3,%2 + add %3,%1,%0\;add %3,%3,%2 + add %4,%1,%0\;add %3,%4,%2" + [(set_attr "type" "fuse_arithlog") + (set_attr "cost" "6") + (set_attr "length" "8")]) + +;; vaddudm-vaddudm fusion pattern generated by gen_addadd +(define_insn "*fuse_vaddudm_vaddudm" + [(set (match_operand:V2DI 3 "altivec_register_operand" "=0,1,&v,v") + (plus:V2DI + (plus:V2DI (match_operand:V2DI 0 "altivec_register_operand" "v,v,v,v") + (match_operand:V2DI 1 "altivec_register_operand" "%v,v,v,v")) + (match_operand:V2DI 2 "altivec_register_operand" "v,v,v,v"))) + (clobber (match_scratch:V2DI 4 "=X,X,X,&v"))] + "(TARGET_P10_FUSION && TARGET_P10_FUSION_2ADD)" + "@ + vaddudm %3,%1,%0\;vaddudm %3,%3,%2 + vaddudm %3,%1,%0\;vaddudm %3,%3,%2 + vaddudm %3,%1,%0\;vaddudm %3,%3,%2 + vaddudm %4,%1,%0\;vaddudm %3,%4,%2" + [(set_attr "type" "fuse_vec") + (set_attr "cost" "6") + (set_attr "length" "8")]) diff --git a/gcc/config/rs6000/genfusion.pl b/gcc/config/rs6000/genfusion.pl index ce48fd9..f8ba978 100755 --- a/gcc/config/rs6000/genfusion.pl +++ b/gcc/config/rs6000/genfusion.pl @@ -240,8 +240,52 @@ EOF } } +sub gen_addadd +{ + my ($kind, $vchr, $op, $type, $mode, $pred, $constraint); + foreach $kind ('scalar','vector') { + if ( $kind eq 'vector' ) { + $vchr = "v"; + $op = "vaddudm"; + $type = "fuse_vec"; + $mode = "V2DI"; + $pred = "altivec_register_operand"; + $constraint = "v"; + } else { + $vchr = ""; + $op = "add"; + $type = "fuse_arithlog"; + $mode = "GPR"; + $pred = "gpc_reg_operand"; + $constraint = "r"; + } + my $c4 = "${constraint},${constraint},${constraint},${constraint}"; + print <<"EOF"; + +;; ${op}-${op} fusion pattern generated by gen_addadd +(define_insn "*fuse_${op}_${op}" + [(set (match_operand:${mode} 3 "${pred}" "=0,1,&${constraint},${constraint}") + (plus:${mode} + (plus:${mode} (match_operand:${mode} 0 "${pred}" "${c4}") + (match_operand:${mode} 1 "${pred}" "%${c4}")) + (match_operand:${mode} 2 "${pred}" "${c4}"))) + (clobber (match_scratch:${mode} 4 "=X,X,X,&${constraint}"))] + "(TARGET_P10_FUSION && TARGET_P10_FUSION_2ADD)" + "@ + ${op} %3,%1,%0\\;${op} %3,%3,%2 + ${op} %3,%1,%0\\;${op} %3,%3,%2 + ${op} %3,%1,%0\\;${op} %3,%3,%2 + ${op} %4,%1,%0\\;${op} %3,%4,%2" + [(set_attr "type" "${type}") + (set_attr "cost" "6") + (set_attr "length" "8")]) +EOF + } +} + gen_ld_cmpi_p10(); gen_2logical(); +gen_addadd; exit(0); diff --git a/gcc/config/rs6000/rs6000-cpus.def b/gcc/config/rs6000/rs6000-cpus.def index cbbb42c..d46a91d 100644 --- 
a/gcc/config/rs6000/rs6000-cpus.def +++ b/gcc/config/rs6000/rs6000-cpus.def @@ -85,7 +85,8 @@ | OTHER_POWER10_MASKS \ | OPTION_MASK_P10_FUSION \ | OPTION_MASK_P10_FUSION_LD_CMPI \ - | OPTION_MASK_P10_FUSION_2LOGICAL) + | OPTION_MASK_P10_FUSION_2LOGICAL \ + | OPTION_MASK_P10_FUSION_2ADD) /* Flags that need to be turned off if -mno-power9-vector. */ #define OTHER_P9_VECTOR_MASKS (OPTION_MASK_FLOAT128_HW \ @@ -135,6 +136,7 @@ | OPTION_MASK_P10_FUSION \ | OPTION_MASK_P10_FUSION_LD_CMPI \ | OPTION_MASK_P10_FUSION_2LOGICAL \ + | OPTION_MASK_P10_FUSION_2ADD \ | OPTION_MASK_HTM \ | OPTION_MASK_ISEL \ | OPTION_MASK_MFCRF \ diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index af3cc90..9f03256 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -4465,16 +4465,22 @@ rs6000_option_override_internal (bool global_init_p) if (TARGET_POWER10 && (rs6000_isa_flags_explicit & OPTION_MASK_MMA) == 0) rs6000_isa_flags |= OPTION_MASK_MMA; - if (TARGET_POWER10 && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION) == 0) + if (TARGET_POWER10 + && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION) == 0) rs6000_isa_flags |= OPTION_MASK_P10_FUSION; if (TARGET_POWER10 && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_LD_CMPI) == 0) rs6000_isa_flags |= OPTION_MASK_P10_FUSION_LD_CMPI; - if (TARGET_POWER10 && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_2LOGICAL) == 0) + if (TARGET_POWER10 + && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_2LOGICAL) == 0) rs6000_isa_flags |= OPTION_MASK_P10_FUSION_2LOGICAL; + if (TARGET_POWER10 + && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_2ADD) == 0) + rs6000_isa_flags |= OPTION_MASK_P10_FUSION_2ADD; + /* Turn off vector pair/mma options on non-power10 systems. */ else if (!TARGET_POWER10 && TARGET_MMA) { diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt index 2685fa7..e30dc04 100644 --- a/gcc/config/rs6000/rs6000.opt +++ b/gcc/config/rs6000/rs6000.opt @@ -502,6 +502,10 @@ mpower10-fusion-2logical Target Undocumented Mask(P10_FUSION_2LOGICAL) Var(rs6000_isa_flags) Fuse certain integer operations together for better performance on power10. +mpower10-fusion-2add +Target Undocumented Mask(P10_FUSION_2ADD) Var(rs6000_isa_flags) +Fuse certain add operations together for better performance on power10. + mcrypto Target Mask(CRYPTO) Var(rs6000_isa_flags) Use ISA 2.07 Category:Vector.AES and Category:Vector.SHA2 instructions. -- cgit v1.1 From 5e28089157dc5b2631ddbf612b233b9ad6f9c4aa Mon Sep 17 00:00:00 2001 From: Aaron Sawdey Date: Fri, 21 May 2021 21:59:39 -0500 Subject: Fix rs6000 p10 fusion patterns with old attr type names Somehow I managed to check in a version of genfusion.pl this afternoon that was not updated to the new insn attr type names. Committing as obvious and to make the code match what was posted and reviewed. gcc/ * config/rs6000/genfusion.pl (gen_addadd): Fix incorrect attr types. * config/rs6000/fusion.md: Regenerate file. 
--- gcc/config/rs6000/fusion.md | 4 ++-- gcc/config/rs6000/genfusion.pl | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/fusion.md b/gcc/config/rs6000/fusion.md index 6dfe1fa..4d810e6 100644 --- a/gcc/config/rs6000/fusion.md +++ b/gcc/config/rs6000/fusion.md @@ -2673,7 +2673,7 @@ add %3,%1,%0\;add %3,%3,%2 add %3,%1,%0\;add %3,%3,%2 add %4,%1,%0\;add %3,%4,%2" - [(set_attr "type" "fuse_arithlog") + [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -2691,6 +2691,6 @@ vaddudm %3,%1,%0\;vaddudm %3,%3,%2 vaddudm %3,%1,%0\;vaddudm %3,%3,%2 vaddudm %4,%1,%0\;vaddudm %3,%4,%2" - [(set_attr "type" "fuse_vec") + [(set_attr "type" "fused_vector") (set_attr "cost" "6") (set_attr "length" "8")]) diff --git a/gcc/config/rs6000/genfusion.pl b/gcc/config/rs6000/genfusion.pl index f8ba978..1fd46cc 100755 --- a/gcc/config/rs6000/genfusion.pl +++ b/gcc/config/rs6000/genfusion.pl @@ -247,14 +247,14 @@ sub gen_addadd if ( $kind eq 'vector' ) { $vchr = "v"; $op = "vaddudm"; - $type = "fuse_vec"; + $type = "fused_vector"; $mode = "V2DI"; $pred = "altivec_register_operand"; $constraint = "v"; } else { $vchr = ""; $op = "add"; - $type = "fuse_arithlog"; + $type = "fused_arith_logical"; $mode = "GPR"; $pred = "gpc_reg_operand"; $constraint = "r"; -- cgit v1.1 From c01c4331112aaf45f0de20ed8883dbeab83ed896 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Sun, 23 May 2021 22:14:21 +0200 Subject: i386: Add push insns for 4-byte vectors [PR100722] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 2021-05-23 Uroš Bizjak gcc/ PR target/100722 * config/i386/mmx.md (*push2_rex64): New instruction pattern. (*push2): Ditto. (push splitter for SSE registers): New splitter. gcc/testsuite/ PR target/100722 * gcc.target/i386/pr100722.c: New test. --- gcc/config/i386/mmx.md | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 4c42e6d..453e8ea 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -302,6 +302,39 @@ ] (symbol_ref "true")))]) +;; For TARGET_64BIT we always round up to 8 bytes. +(define_insn "*push2_rex64" + [(set (match_operand:VI_32 0 "push_operand" "=X,X") + (match_operand:VI_32 1 "nonmemory_no_elim_operand" "rC,*v"))] + "TARGET_SSE2 && TARGET_64BIT" + "@ + push{q}\t%q1 + #" + [(set_attr "type" "push,multi") + (set_attr "mode" "DI")]) + +(define_insn "*push2" + [(set (match_operand:VI_32 0 "push_operand" "=<,<") + (match_operand:VI_32 1 "general_no_elim_operand" "rC*m,*v"))] + "TARGET_SSE2 && !TARGET_64BIT" + "@ + push{l}\t%1 + #" + [(set_attr "type" "push,multi") + (set_attr "mode" "SI")]) + +(define_split + [(set (match_operand:VI_32 0 "push_operand") + (match_operand:VI_32 1 "sse_reg_operand"))] + "TARGET_SSE2 && reload_completed" + [(set (reg:P SP_REG) (plus:P (reg:P SP_REG) (match_dup 2))) + (set (match_dup 0) (match_dup 1))] +{ + operands[2] = GEN_INT (-PUSH_ROUNDING (GET_MODE_SIZE (mode))); + /* Preserve memory attributes. */ + operands[0] = replace_equiv_address (operands[0], stack_pointer_rtx); +}) + (define_expand "movmisalign" [(set (match_operand:VI_32 0 "nonimmediate_operand") (match_operand:VI_32 1 "nonimmediate_operand"))] -- cgit v1.1 From dc084c487e997e7f47cee55467628ade4488538b Mon Sep 17 00:00:00 2001 From: liuhongt Date: Thu, 20 May 2021 09:59:36 +0800 Subject: Fix ICE when lhs is NULL. 
gcc/ChangeLog: PR target/100660 * config/i386/i386.c (ix86_gimple_fold_builtin): Replacing stmt with GIMPLE_NOP when lhs doesn't exist. gcc/testsuite/ChangeLog: PR target/100660 * gcc.target/i386/pr100660.c: New test. --- gcc/config/i386/i386.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index f3b4518..28e6113 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -17991,21 +17991,24 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi) gcc_assert (n_args == 2); arg0 = gimple_call_arg (stmt, 0); arg1 = gimple_call_arg (stmt, 1); - { - location_t loc = gimple_location (stmt); - tree type = TREE_TYPE (arg0); - tree zero_vec = build_zero_cst (type); - tree minus_one_vec = build_minus_one_cst (type); - tree cmp_type = truth_type_for (type); - gimple_seq stmts = NULL; - tree cmp = gimple_build (&stmts, tcode, cmp_type, arg0, arg1); - gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); - gimple* g = gimple_build_assign (gimple_call_lhs (stmt), - VEC_COND_EXPR, cmp, - minus_one_vec, zero_vec); - gimple_set_location (g, loc); - gsi_replace (gsi, g, false); - } + if (gimple_call_lhs (stmt)) + { + location_t loc = gimple_location (stmt); + tree type = TREE_TYPE (arg0); + tree zero_vec = build_zero_cst (type); + tree minus_one_vec = build_minus_one_cst (type); + tree cmp_type = truth_type_for (type); + gimple_seq stmts = NULL; + tree cmp = gimple_build (&stmts, tcode, cmp_type, arg0, arg1); + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); + gimple* g = gimple_build_assign (gimple_call_lhs (stmt), + VEC_COND_EXPR, cmp, + minus_one_vec, zero_vec); + gimple_set_location (g, loc); + gsi_replace (gsi, g, false); + } + else + gsi_replace (gsi, gimple_build_nop (), false); return true; case IX86_BUILTIN_PSLLD: -- cgit v1.1 From 5ba5e856f327b1d6b69d51a11ef63ce89bfbc868 Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Mon, 8 Mar 2021 12:23:49 +0000 Subject: arm: Auto-vectorization for MVE: vld2/vst2 This patch enables MVE vld2/vst2 instructions for auto-vectorization. We move the existing expanders from neon.md and enable them for MVE, calling the respective emitter. 2021-03-12 Christophe Lyon gcc/ * config/arm/neon.md (vec_load_lanesoi) (vec_store_lanesoi): Move ... * config/arm/vec-common.md: here. 
gcc/testsuite/ * gcc.target/arm/simd/mve-vld2.c: New test, derived from slp-perm-2.c --- gcc/config/arm/neon.md | 14 -------------- gcc/config/arm/vec-common.md | 27 +++++++++++++++++++++++++++ 2 files changed, 27 insertions(+), 14 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index cc82d06..25d4252 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -5066,13 +5066,6 @@ if (BYTES_BIG_ENDIAN) (const_string "neon_load2_2reg")))] ) -(define_expand "vec_load_lanesoi" - [(set (match_operand:OI 0 "s_register_operand") - (unspec:OI [(match_operand:OI 1 "neon_struct_operand") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] - UNSPEC_VLD2))] - "TARGET_NEON") - (define_insn "neon_vld2" [(set (match_operand:OI 0 "s_register_operand" "=w") (unspec:OI [(match_operand:OI 1 "neon_struct_operand" "Um") @@ -5200,13 +5193,6 @@ if (BYTES_BIG_ENDIAN) (const_string "neon_store2_one_lane")))] ) -(define_expand "vec_store_lanesoi" - [(set (match_operand:OI 0 "neon_struct_operand") - (unspec:OI [(match_operand:OI 1 "s_register_operand") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] - UNSPEC_VST2))] - "TARGET_NEON") - (define_insn "neon_vst2" [(set (match_operand:OI 0 "neon_struct_operand" "=Um") (unspec:OI [(match_operand:OI 1 "s_register_operand" "w") diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md index 265fa40..0b79e68 100644 --- a/gcc/config/arm/vec-common.md +++ b/gcc/config/arm/vec-common.md @@ -483,6 +483,33 @@ } else gcc_unreachable (); + DONE; +}) +(define_expand "vec_load_lanesoi" + [(set (match_operand:OI 0 "s_register_operand") + (unspec:OI [(match_operand:OI 1 "neon_struct_operand") + (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VLD2))] + "TARGET_NEON || TARGET_HAVE_MVE" +{ + if (TARGET_NEON) + emit_insn (gen_neon_vld2 (operands[0], operands[1])); + else + emit_insn (gen_mve_vld2q (operands[0], operands[1])); + DONE; +}) + +(define_expand "vec_store_lanesoi" + [(set (match_operand:OI 0 "neon_struct_operand") + (unspec:OI [(match_operand:OI 1 "s_register_operand") + (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VST2))] + "TARGET_NEON || TARGET_HAVE_MVE" +{ + if (TARGET_NEON) + emit_insn (gen_neon_vst2 (operands[0], operands[1])); + else + emit_insn (gen_mve_vst2q (operands[0], operands[1])); DONE; }) -- cgit v1.1 From 4eaf65ed6a6fbeefae28bd850329fb226e76f861 Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Thu, 11 Mar 2021 11:08:49 +0000 Subject: arm: Auto-vectorization for MVE: vld4/vst4 This patch enables MVE vld4/vst4 instructions for auto-vectorization. We move the existing expanders from neon.md and enable them for MVE, calling the respective emitter. 2021-03-12 Christophe Lyon gcc/ * config/arm/neon.md (vec_load_lanesxi) (vec_store_lanexoi): Move ... * config/arm/vec-common.md: here. 
gcc/testsuite/ * gcc.target/arm/simd/mve-vld4.c: New test, derived from slp-perm-3.c --- gcc/config/arm/neon.md | 20 -------------------- gcc/config/arm/vec-common.md | 26 ++++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 20 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 25d4252..977adef 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -5620,16 +5620,6 @@ if (BYTES_BIG_ENDIAN) (const_string "neon_load4_4reg")))] ) -(define_expand "vec_load_lanesxi" - [(match_operand:XI 0 "s_register_operand") - (match_operand:XI 1 "neon_struct_operand") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] - "TARGET_NEON" -{ - emit_insn (gen_neon_vld4 (operands[0], operands[1])); - DONE; -}) - (define_expand "neon_vld4" [(match_operand:XI 0 "s_register_operand") (match_operand:XI 1 "neon_struct_operand") @@ -5821,16 +5811,6 @@ if (BYTES_BIG_ENDIAN) (const_string "neon_store4_4reg")))] ) -(define_expand "vec_store_lanesxi" - [(match_operand:XI 0 "neon_struct_operand") - (match_operand:XI 1 "s_register_operand") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] - "TARGET_NEON" -{ - emit_insn (gen_neon_vst4 (operands[0], operands[1])); - DONE; -}) - (define_expand "neon_vst4" [(match_operand:XI 0 "neon_struct_operand") (match_operand:XI 1 "s_register_operand") diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md index 0b79e68..e8b2901 100644 --- a/gcc/config/arm/vec-common.md +++ b/gcc/config/arm/vec-common.md @@ -513,3 +513,29 @@ emit_insn (gen_mve_vst2q (operands[0], operands[1])); DONE; }) + +(define_expand "vec_load_lanesxi" + [(match_operand:XI 0 "s_register_operand") + (match_operand:XI 1 "neon_struct_operand") + (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + "TARGET_NEON || TARGET_HAVE_MVE" +{ + if (TARGET_NEON) + emit_insn (gen_neon_vld4 (operands[0], operands[1])); + else + emit_insn (gen_mve_vld4q (operands[0], operands[1])); + DONE; +}) + +(define_expand "vec_store_lanesxi" + [(match_operand:XI 0 "neon_struct_operand") + (match_operand:XI 1 "s_register_operand") + (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + "TARGET_NEON || TARGET_HAVE_MVE" +{ + if (TARGET_NEON) + emit_insn (gen_neon_vst4 (operands[0], operands[1])); + else + emit_insn (gen_mve_vst4q (operands[0], operands[1])); + DONE; +}) -- cgit v1.1 From fb0746f3a6b7fd0223efa71d0dc3fc02166e338b Mon Sep 17 00:00:00 2001 From: Wilco Dijkstra Date: Mon, 24 May 2021 14:23:50 +0100 Subject: AArch64: Cleanup aarch64_classify_symbol Use a GOT indirection for extern weak symbols instead of a literal - this is the same as PIC/PIE and mirrors LLVM behaviour. Ensure PIC/PIE use the same offset limits for symbols that don't use the GOT. Passes bootstrap and regress. ChangeLog: 2021-04-27 Wilco Dijkstra * config/aarch64/aarch64.c (aarch64_classify_symbol): Use GOT for extern weak symbols. Limit symbol offsets for non-GOT symbols with PIC/PIE. --- gcc/config/aarch64/aarch64.c | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index c1e451e..e9f961d 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -17902,7 +17902,14 @@ aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset) switch (aarch64_cmodel) { + case AARCH64_CMODEL_TINY_PIC: case AARCH64_CMODEL_TINY: + /* With -fPIC non-local symbols use the GOT. For orthogonality + always use the GOT for extern weak symbols. 
*/ + if ((flag_pic || SYMBOL_REF_WEAK (x)) + && !aarch64_symbol_binds_local_p (x)) + return SYMBOL_TINY_GOT; + /* When we retrieve symbol + offset address, we have to make sure the offset does not cause overflow of the final address. But we have no way of knowing the address of symbol at compile time @@ -17910,42 +17917,30 @@ aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset) symbol + offset is outside the addressible range of +/-1MB in the TINY code model. So we limit the maximum offset to +/-64KB and assume the offset to the symbol is not larger than +/-(1MB - 64KB). - If offset_within_block_p is true we allow larger offsets. - Furthermore force to memory if the symbol is a weak reference to - something that doesn't resolve to a symbol in this module. */ - - if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x)) - return SYMBOL_FORCE_TO_MEM; + If offset_within_block_p is true we allow larger offsets. */ if (!(IN_RANGE (offset, -0x10000, 0x10000) || offset_within_block_p (x, offset))) return SYMBOL_FORCE_TO_MEM; return SYMBOL_TINY_ABSOLUTE; + + case AARCH64_CMODEL_SMALL_SPIC: + case AARCH64_CMODEL_SMALL_PIC: case AARCH64_CMODEL_SMALL: + if ((flag_pic || SYMBOL_REF_WEAK (x)) + && !aarch64_symbol_binds_local_p (x)) + return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC + ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G; + /* Same reasoning as the tiny code model, but the offset cap here is 1MB, allowing +/-3.9GB for the offset to the symbol. */ - - if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x)) - return SYMBOL_FORCE_TO_MEM; if (!(IN_RANGE (offset, -0x100000, 0x100000) || offset_within_block_p (x, offset))) return SYMBOL_FORCE_TO_MEM; return SYMBOL_SMALL_ABSOLUTE; - case AARCH64_CMODEL_TINY_PIC: - if (!aarch64_symbol_binds_local_p (x)) - return SYMBOL_TINY_GOT; - return SYMBOL_TINY_ABSOLUTE; - - case AARCH64_CMODEL_SMALL_SPIC: - case AARCH64_CMODEL_SMALL_PIC: - if (!aarch64_symbol_binds_local_p (x)) - return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC - ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G); - return SYMBOL_SMALL_ABSOLUTE; - case AARCH64_CMODEL_LARGE: /* This is alright even in PIC code as the constant pool reference is always PC relative and within -- cgit v1.1 From b326f495218a213079f572bd9960903b3425da74 Mon Sep 17 00:00:00 2001 From: Wilco Dijkstra Date: Mon, 24 May 2021 14:31:37 +0100 Subject: AArch64: Enable fast shifts on Neoverse N1 Enable the fast shift feature in Neoverse N1 tuning - this means additions with a shift left by 1-4 are as fast as addition. This improves multiply by constant expansions, eg. x * 25 is now emitted using shifts rather than a multiply: add w0, w0, w0, lsl 2 add w0, w0, w0, lsl 2 ChangeLog: 2020-09-11 Wilco Dijkstra * config/aarch64/aarch64.c (neoversen1_tunings): Enable AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND. --- gcc/config/aarch64/aarch64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index e9f961d..2753c85 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -1659,7 +1659,7 @@ static const struct tune_params neoversen1_tunings = 2, /* min_div_recip_mul_df. */ 0, /* max_case_values. */ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ - (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ + (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. 
*/ &generic_prefetch_tune }; -- cgit v1.1 From a8764071f2eb6b4cdc9ecb788dfaa2b095b52598 Mon Sep 17 00:00:00 2001 From: Aaron Sawdey Date: Tue, 2 Mar 2021 18:06:37 -0600 Subject: Fusion patterns for add-logical/logical-add This patch modifies the function in genfusion.pl for generating the logical-logical patterns so that it can also generate the add-logical and logical-add patterns which are very similar. gcc/ChangeLog: * config/rs6000/genfusion.pl (gen_logical_addsubf): Refactor to add generation of logical-add and add-logical fusion pairs. * config/rs6000/rs6000-cpus.def: Add new fusion to ISA 3.1 mask and powerpc mask. * config/rs6000/rs6000.c (rs6000_option_override_internal): Turn on logical-add and add-logical fusion by default. * config/rs6000/rs6000.opt: Add -mpower10-fusion-logical-add and -mpower10-fusion-add-logical options. * config/rs6000/fusion.md: Regenerate file. gcc/testsuite/ChangeLog: * gcc.target/powerpc/fusion-p10-logadd.c: New file. --- gcc/config/rs6000/fusion.md | 872 +++++++++++++++++++++++++++----------- gcc/config/rs6000/genfusion.pl | 83 +++- gcc/config/rs6000/rs6000-cpus.def | 4 + gcc/config/rs6000/rs6000.c | 8 + gcc/config/rs6000/rs6000.opt | 12 +- 5 files changed, 700 insertions(+), 279 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/fusion.md b/gcc/config/rs6000/fusion.md index 4d810e6..5191210 100644 --- a/gcc/config/rs6000/fusion.md +++ b/gcc/config/rs6000/fusion.md @@ -355,11 +355,11 @@ (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar and -> and (define_insn "*fuse_and_and" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (and:GPR (and:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (and:GPR (and:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "%r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -373,11 +373,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar andc -> and (define_insn "*fuse_andc_and" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (and:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) + (and:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -391,11 +391,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar eqv -> and (define_insn "*fuse_eqv_and" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (and:GPR (not:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (and:GPR (not:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -409,11 +409,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar nand -> and (define_insn "*fuse_nand_and" [(set (match_operand:GPR 3 "gpc_reg_operand" 
"=0,1,&r,r") - (and:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) + (and:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -427,11 +427,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar nor -> and (define_insn "*fuse_nor_and" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (and:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) + (and:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -445,11 +445,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar or -> and (define_insn "*fuse_or_and" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (and:GPR (ior:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (and:GPR (ior:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -463,11 +463,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar orc -> and (define_insn "*fuse_orc_and" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (and:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) + (and:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -481,11 +481,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar xor -> and (define_insn "*fuse_xor_and" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (and:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (and:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -499,11 +499,47 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; add-logical fusion pattern generated by gen_logical_addsubf +;; scalar add -> and +(define_insn "*fuse_add_and" + [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + (and:GPR (plus:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) + (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) + (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] + "(TARGET_P10_FUSION && TARGET_P10_FUSION_ADDLOG)" + "@ + add %3,%1,%0\;and %3,%3,%2 + add %3,%1,%0\;and %3,%3,%2 + add %3,%1,%0\;and %3,%3,%2 + add %4,%1,%0\;and %3,%4,%2" + [(set_attr "type" "fused_arith_logical") + (set_attr "cost" "6") + (set_attr "length" 
"8")]) + +;; add-logical fusion pattern generated by gen_logical_addsubf +;; scalar subf -> and +(define_insn "*fuse_subf_and" + [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + (and:GPR (minus:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) + (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) + (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] + "(TARGET_P10_FUSION && TARGET_P10_FUSION_ADDLOG)" + "@ + subf %3,%1,%0\;and %3,%3,%2 + subf %3,%1,%0\;and %3,%3,%2 + subf %3,%1,%0\;and %3,%3,%2 + subf %4,%1,%0\;and %3,%4,%2" + [(set_attr "type" "fused_arith_logical") + (set_attr "cost" "6") + (set_attr "length" "8")]) + +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar and -> andc (define_insn "*fuse_and_andc" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (and:GPR (and:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (and:GPR (and:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -517,11 +553,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar andc -> andc (define_insn "*fuse_andc_andc" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (and:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) + (and:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -535,11 +571,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar eqv -> andc (define_insn "*fuse_eqv_andc" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (and:GPR (not:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (and:GPR (not:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -553,11 +589,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar nand -> andc (define_insn "*fuse_nand_andc" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (and:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) + (and:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -571,11 +607,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar nor -> andc (define_insn "*fuse_nor_andc" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (and:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) + (and:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 
1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -589,11 +625,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar or -> andc (define_insn "*fuse_or_andc" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (and:GPR (ior:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (and:GPR (ior:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -607,11 +643,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar orc -> andc (define_insn "*fuse_orc_andc" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (and:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) + (and:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -625,11 +661,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar xor -> andc (define_insn "*fuse_xor_andc" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (and:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (and:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -643,11 +679,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar and -> eqv (define_insn "*fuse_and_eqv" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (not:GPR (xor:GPR (and:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (not:GPR (xor:GPR (and:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -661,11 +697,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar andc -> eqv (define_insn "*fuse_andc_eqv" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (not:GPR (xor:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) + (not:GPR (xor:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -679,11 +715,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar eqv -> eqv (define_insn "*fuse_eqv_eqv" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (not:GPR 
(xor:GPR (not:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (not:GPR (xor:GPR (not:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "%r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -697,11 +733,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar nand -> eqv (define_insn "*fuse_nand_eqv" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (not:GPR (xor:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) + (not:GPR (xor:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -715,11 +751,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar nor -> eqv (define_insn "*fuse_nor_eqv" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (not:GPR (xor:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) + (not:GPR (xor:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -733,11 +769,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar or -> eqv (define_insn "*fuse_or_eqv" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (not:GPR (xor:GPR (ior:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (not:GPR (xor:GPR (ior:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -751,11 +787,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar orc -> eqv (define_insn "*fuse_orc_eqv" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (not:GPR (xor:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) + (not:GPR (xor:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -769,11 +805,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar xor -> eqv (define_insn "*fuse_xor_eqv" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (not:GPR (xor:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (not:GPR (xor:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -787,11 +823,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; 
logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar and -> nand (define_insn "*fuse_and_nand" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (ior:GPR (not:GPR (and:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (ior:GPR (not:GPR (and:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -805,11 +841,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar andc -> nand (define_insn "*fuse_andc_nand" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (ior:GPR (not:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) + (ior:GPR (not:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -823,11 +859,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar eqv -> nand (define_insn "*fuse_eqv_nand" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (ior:GPR (not:GPR (not:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (ior:GPR (not:GPR (not:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -841,11 +877,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar nand -> nand (define_insn "*fuse_nand_nand" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (ior:GPR (not:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) + (ior:GPR (not:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -859,11 +895,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar nor -> nand (define_insn "*fuse_nor_nand" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (ior:GPR (not:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) + (ior:GPR (not:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -877,11 +913,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar or -> nand (define_insn "*fuse_or_nand" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (ior:GPR (not:GPR (ior:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + 
(ior:GPR (not:GPR (ior:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -895,11 +931,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar orc -> nand (define_insn "*fuse_orc_nand" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (ior:GPR (not:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) + (ior:GPR (not:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -913,11 +949,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar xor -> nand (define_insn "*fuse_xor_nand" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (ior:GPR (not:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (ior:GPR (not:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -931,11 +967,47 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; add-logical fusion pattern generated by gen_logical_addsubf +;; scalar add -> nand +(define_insn "*fuse_add_nand" + [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + (ior:GPR (not:GPR (plus:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) + (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) + (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] + "(TARGET_P10_FUSION && TARGET_P10_FUSION_ADDLOG)" + "@ + add %3,%1,%0\;nand %3,%3,%2 + add %3,%1,%0\;nand %3,%3,%2 + add %3,%1,%0\;nand %3,%3,%2 + add %4,%1,%0\;nand %3,%4,%2" + [(set_attr "type" "fused_arith_logical") + (set_attr "cost" "6") + (set_attr "length" "8")]) + +;; add-logical fusion pattern generated by gen_logical_addsubf +;; scalar subf -> nand +(define_insn "*fuse_subf_nand" + [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + (ior:GPR (not:GPR (minus:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) + (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) + (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] + "(TARGET_P10_FUSION && TARGET_P10_FUSION_ADDLOG)" + "@ + subf %3,%1,%0\;nand %3,%3,%2 + subf %3,%1,%0\;nand %3,%3,%2 + subf %3,%1,%0\;nand %3,%3,%2 + subf %4,%1,%0\;nand %3,%4,%2" + [(set_attr "type" "fused_arith_logical") + (set_attr "cost" "6") + (set_attr "length" "8")]) + +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar and -> nor (define_insn "*fuse_and_nor" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (and:GPR (not:GPR (and:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (and:GPR (not:GPR (and:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -949,11 +1021,11 
@@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar andc -> nor (define_insn "*fuse_andc_nor" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (and:GPR (not:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) + (and:GPR (not:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -967,11 +1039,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar eqv -> nor (define_insn "*fuse_eqv_nor" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (and:GPR (not:GPR (not:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (and:GPR (not:GPR (not:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -985,11 +1057,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar nand -> nor (define_insn "*fuse_nand_nor" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (and:GPR (not:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) + (and:GPR (not:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -1003,11 +1075,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar nor -> nor (define_insn "*fuse_nor_nor" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (and:GPR (not:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) + (and:GPR (not:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -1021,11 +1093,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar or -> nor (define_insn "*fuse_or_nor" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (and:GPR (not:GPR (ior:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (and:GPR (not:GPR (ior:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -1039,11 +1111,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar orc -> nor (define_insn "*fuse_orc_nor" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (and:GPR (not:GPR (ior:GPR (not:GPR 
(match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) + (and:GPR (not:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -1057,11 +1129,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar xor -> nor (define_insn "*fuse_xor_nor" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (and:GPR (not:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (and:GPR (not:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -1075,11 +1147,47 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; add-logical fusion pattern generated by gen_logical_addsubf +;; scalar add -> nor +(define_insn "*fuse_add_nor" + [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + (and:GPR (not:GPR (plus:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) + (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) + (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] + "(TARGET_P10_FUSION && TARGET_P10_FUSION_ADDLOG)" + "@ + add %3,%1,%0\;nor %3,%3,%2 + add %3,%1,%0\;nor %3,%3,%2 + add %3,%1,%0\;nor %3,%3,%2 + add %4,%1,%0\;nor %3,%4,%2" + [(set_attr "type" "fused_arith_logical") + (set_attr "cost" "6") + (set_attr "length" "8")]) + +;; add-logical fusion pattern generated by gen_logical_addsubf +;; scalar subf -> nor +(define_insn "*fuse_subf_nor" + [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + (and:GPR (not:GPR (minus:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) + (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) + (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] + "(TARGET_P10_FUSION && TARGET_P10_FUSION_ADDLOG)" + "@ + subf %3,%1,%0\;nor %3,%3,%2 + subf %3,%1,%0\;nor %3,%3,%2 + subf %3,%1,%0\;nor %3,%3,%2 + subf %4,%1,%0\;nor %3,%4,%2" + [(set_attr "type" "fused_arith_logical") + (set_attr "cost" "6") + (set_attr "length" "8")]) + +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar and -> or (define_insn "*fuse_and_or" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (ior:GPR (and:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (ior:GPR (and:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -1093,11 +1201,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar andc -> or (define_insn "*fuse_andc_or" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (ior:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) + (ior:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -1111,11 +1219,11 @@ 
(set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar eqv -> or (define_insn "*fuse_eqv_or" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (ior:GPR (not:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (ior:GPR (not:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -1129,11 +1237,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar nand -> or (define_insn "*fuse_nand_or" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (ior:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) + (ior:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -1147,11 +1255,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar nor -> or (define_insn "*fuse_nor_or" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (ior:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) + (ior:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -1165,11 +1273,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar or -> or (define_insn "*fuse_or_or" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (ior:GPR (ior:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (ior:GPR (ior:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "%r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -1183,11 +1291,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar orc -> or (define_insn "*fuse_orc_or" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (ior:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) + (ior:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -1201,11 +1309,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar xor -> or (define_insn "*fuse_xor_or" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (ior:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (ior:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) 
(match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -1219,11 +1327,47 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; add-logical fusion pattern generated by gen_logical_addsubf +;; scalar add -> or +(define_insn "*fuse_add_or" + [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + (ior:GPR (plus:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) + (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) + (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] + "(TARGET_P10_FUSION && TARGET_P10_FUSION_ADDLOG)" + "@ + add %3,%1,%0\;or %3,%3,%2 + add %3,%1,%0\;or %3,%3,%2 + add %3,%1,%0\;or %3,%3,%2 + add %4,%1,%0\;or %3,%4,%2" + [(set_attr "type" "fused_arith_logical") + (set_attr "cost" "6") + (set_attr "length" "8")]) + +;; add-logical fusion pattern generated by gen_logical_addsubf +;; scalar subf -> or +(define_insn "*fuse_subf_or" + [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + (ior:GPR (minus:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) + (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) + (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] + "(TARGET_P10_FUSION && TARGET_P10_FUSION_ADDLOG)" + "@ + subf %3,%1,%0\;or %3,%3,%2 + subf %3,%1,%0\;or %3,%3,%2 + subf %3,%1,%0\;or %3,%3,%2 + subf %4,%1,%0\;or %3,%4,%2" + [(set_attr "type" "fused_arith_logical") + (set_attr "cost" "6") + (set_attr "length" "8")]) + +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar and -> orc (define_insn "*fuse_and_orc" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (ior:GPR (and:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (ior:GPR (and:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -1237,11 +1381,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar andc -> orc (define_insn "*fuse_andc_orc" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (ior:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) + (ior:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -1255,11 +1399,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar eqv -> orc (define_insn "*fuse_eqv_orc" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (ior:GPR (not:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (ior:GPR (not:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -1273,11 +1417,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar nand -> orc (define_insn "*fuse_nand_orc" 
[(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (ior:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) + (ior:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -1291,11 +1435,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar nor -> orc (define_insn "*fuse_nor_orc" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (ior:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) + (ior:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -1309,11 +1453,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar or -> orc (define_insn "*fuse_or_orc" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (ior:GPR (ior:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (ior:GPR (ior:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -1327,11 +1471,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar orc -> orc (define_insn "*fuse_orc_orc" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (ior:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) + (ior:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -1345,11 +1489,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar xor -> orc (define_insn "*fuse_xor_orc" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (ior:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (ior:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -1363,11 +1507,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar and -> xor (define_insn "*fuse_and_xor" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (xor:GPR (and:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (xor:GPR (and:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -1381,11 +1525,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical 
fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar andc -> xor (define_insn "*fuse_andc_xor" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (xor:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) + (xor:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -1399,11 +1543,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar eqv -> xor (define_insn "*fuse_eqv_xor" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (xor:GPR (not:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (xor:GPR (not:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -1417,11 +1561,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar nand -> xor (define_insn "*fuse_nand_xor" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (xor:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) + (xor:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -1435,11 +1579,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar nor -> xor (define_insn "*fuse_nor_xor" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (xor:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) + (xor:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -1453,11 +1597,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar or -> xor (define_insn "*fuse_or_xor" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (xor:GPR (ior:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (xor:GPR (ior:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -1471,11 +1615,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar orc -> xor (define_insn "*fuse_orc_xor" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (xor:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) + (xor:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" 
"r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -1489,11 +1633,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar xor -> xor (define_insn "*fuse_xor_xor" [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") - (xor:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (xor:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "%r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] @@ -1507,11 +1651,227 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-add fusion pattern generated by gen_logical_addsubf +;; scalar and -> add +(define_insn "*fuse_and_add" + [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + (plus:GPR (and:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) + (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) + (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] + "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)" + "@ + and %3,%1,%0\;add %3,%3,%2 + and %3,%1,%0\;add %3,%3,%2 + and %3,%1,%0\;add %3,%3,%2 + and %4,%1,%0\;add %3,%4,%2" + [(set_attr "type" "fused_arith_logical") + (set_attr "cost" "6") + (set_attr "length" "8")]) + +;; logical-add fusion pattern generated by gen_logical_addsubf +;; scalar nand -> add +(define_insn "*fuse_nand_add" + [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + (plus:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) + (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) + (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) + (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] + "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)" + "@ + nand %3,%1,%0\;add %3,%3,%2 + nand %3,%1,%0\;add %3,%3,%2 + nand %3,%1,%0\;add %3,%3,%2 + nand %4,%1,%0\;add %3,%4,%2" + [(set_attr "type" "fused_arith_logical") + (set_attr "cost" "6") + (set_attr "length" "8")]) + +;; logical-add fusion pattern generated by gen_logical_addsubf +;; scalar nor -> add +(define_insn "*fuse_nor_add" + [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + (plus:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) + (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) + (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) + (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] + "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)" + "@ + nor %3,%1,%0\;add %3,%3,%2 + nor %3,%1,%0\;add %3,%3,%2 + nor %3,%1,%0\;add %3,%3,%2 + nor %4,%1,%0\;add %3,%4,%2" + [(set_attr "type" "fused_arith_logical") + (set_attr "cost" "6") + (set_attr "length" "8")]) + +;; logical-add fusion pattern generated by gen_logical_addsubf +;; scalar or -> add +(define_insn "*fuse_or_add" + [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + (plus:GPR (ior:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) + (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) + (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] + "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)" + "@ + or %3,%1,%0\;add %3,%3,%2 + or %3,%1,%0\;add %3,%3,%2 + or %3,%1,%0\;add %3,%3,%2 + or %4,%1,%0\;add %3,%4,%2" + [(set_attr "type" "fused_arith_logical") + (set_attr "cost" "6") + (set_attr "length" "8")]) + +;; logical-add fusion pattern 
generated by gen_logical_addsubf +;; scalar and -> subf +(define_insn "*fuse_and_subf" + [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + (minus:GPR (and:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) + (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) + (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] + "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)" + "@ + and %3,%1,%0\;subf %3,%3,%2 + and %3,%1,%0\;subf %3,%3,%2 + and %3,%1,%0\;subf %3,%3,%2 + and %4,%1,%0\;subf %3,%4,%2" + [(set_attr "type" "fused_arith_logical") + (set_attr "cost" "6") + (set_attr "length" "8")]) + +;; logical-add fusion pattern generated by gen_logical_addsubf +;; scalar nand -> subf +(define_insn "*fuse_nand_subf" + [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + (minus:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) + (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) + (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) + (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] + "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)" + "@ + nand %3,%1,%0\;subf %3,%3,%2 + nand %3,%1,%0\;subf %3,%3,%2 + nand %3,%1,%0\;subf %3,%3,%2 + nand %4,%1,%0\;subf %3,%4,%2" + [(set_attr "type" "fused_arith_logical") + (set_attr "cost" "6") + (set_attr "length" "8")]) + +;; logical-add fusion pattern generated by gen_logical_addsubf +;; scalar nor -> subf +(define_insn "*fuse_nor_subf" + [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + (minus:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) + (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) + (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) + (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] + "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)" + "@ + nor %3,%1,%0\;subf %3,%3,%2 + nor %3,%1,%0\;subf %3,%3,%2 + nor %3,%1,%0\;subf %3,%3,%2 + nor %4,%1,%0\;subf %3,%4,%2" + [(set_attr "type" "fused_arith_logical") + (set_attr "cost" "6") + (set_attr "length" "8")]) + +;; logical-add fusion pattern generated by gen_logical_addsubf +;; scalar or -> subf +(define_insn "*fuse_or_subf" + [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + (minus:GPR (ior:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) + (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) + (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] + "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)" + "@ + or %3,%1,%0\;subf %3,%3,%2 + or %3,%1,%0\;subf %3,%3,%2 + or %3,%1,%0\;subf %3,%3,%2 + or %4,%1,%0\;subf %3,%4,%2" + [(set_attr "type" "fused_arith_logical") + (set_attr "cost" "6") + (set_attr "length" "8")]) + +;; logical-add fusion pattern generated by gen_logical_addsubf +;; scalar and -> rsubf +(define_insn "*fuse_and_rsubf" + [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + (minus:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r") + (and:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")))) + (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] + "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)" + "@ + and %3,%1,%0\;subf %3,%2,%3 + and %3,%1,%0\;subf %3,%2,%3 + and %3,%1,%0\;subf %3,%2,%3 + and %4,%1,%0\;subf %3,%2,%4" + [(set_attr "type" "fused_arith_logical") + (set_attr "cost" "6") + (set_attr "length" "8")]) + +;; logical-add fusion pattern generated by gen_logical_addsubf +;; scalar nand -> rsubf +(define_insn "*fuse_nand_rsubf" + [(set (match_operand:GPR 
3 "gpc_reg_operand" "=0,1,&r,r") + (minus:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r") + (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) + (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))))) + (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] + "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)" + "@ + nand %3,%1,%0\;subf %3,%2,%3 + nand %3,%1,%0\;subf %3,%2,%3 + nand %3,%1,%0\;subf %3,%2,%3 + nand %4,%1,%0\;subf %3,%2,%4" + [(set_attr "type" "fused_arith_logical") + (set_attr "cost" "6") + (set_attr "length" "8")]) + +;; logical-add fusion pattern generated by gen_logical_addsubf +;; scalar nor -> rsubf +(define_insn "*fuse_nor_rsubf" + [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + (minus:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r") + (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) + (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))))) + (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] + "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)" + "@ + nor %3,%1,%0\;subf %3,%2,%3 + nor %3,%1,%0\;subf %3,%2,%3 + nor %3,%1,%0\;subf %3,%2,%3 + nor %4,%1,%0\;subf %3,%2,%4" + [(set_attr "type" "fused_arith_logical") + (set_attr "cost" "6") + (set_attr "length" "8")]) + +;; logical-add fusion pattern generated by gen_logical_addsubf +;; scalar or -> rsubf +(define_insn "*fuse_or_rsubf" + [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + (minus:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r") + (ior:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") + (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")))) + (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] + "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)" + "@ + or %3,%1,%0\;subf %3,%2,%3 + or %3,%1,%0\;subf %3,%2,%3 + or %3,%1,%0\;subf %3,%2,%3 + or %4,%1,%0\;subf %3,%2,%4" + [(set_attr "type" "fused_arith_logical") + (set_attr "cost" "6") + (set_attr "length" "8")]) + +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vand -> vand (define_insn "*fuse_vand_vand" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (and:VM (and:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") + (and:VM (and:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "%v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -1525,11 +1885,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vandc -> vand (define_insn "*fuse_vandc_vand" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (and:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) + (and:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -1543,11 +1903,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector veqv -> vand (define_insn "*fuse_veqv_vand" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (and:VM (not:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") + (and:VM (not:VM (xor:VM (match_operand:VM 0 
"altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -1561,11 +1921,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vnand -> vand (define_insn "*fuse_vnand_vand" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (and:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) + (and:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -1579,11 +1939,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vnor -> vand (define_insn "*fuse_vnor_vand" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (and:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) + (and:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -1597,11 +1957,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vor -> vand (define_insn "*fuse_vor_vand" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (and:VM (ior:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") + (and:VM (ior:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -1615,11 +1975,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vorc -> vand (define_insn "*fuse_vorc_vand" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (and:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) + (and:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -1633,11 +1993,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vxor -> vand (define_insn "*fuse_vxor_vand" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (and:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") + (and:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -1651,11 +2011,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion 
pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vand -> vandc (define_insn "*fuse_vand_vandc" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (and:VM (and:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") + (and:VM (and:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -1669,11 +2029,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vandc -> vandc (define_insn "*fuse_vandc_vandc" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (and:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) + (and:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -1687,11 +2047,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector veqv -> vandc (define_insn "*fuse_veqv_vandc" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (and:VM (not:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") + (and:VM (not:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -1705,11 +2065,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vnand -> vandc (define_insn "*fuse_vnand_vandc" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (and:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) + (and:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -1723,11 +2083,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vnor -> vandc (define_insn "*fuse_vnor_vandc" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (and:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) + (and:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -1741,11 +2101,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vor -> vandc (define_insn "*fuse_vor_vandc" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - 
(and:VM (ior:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") + (and:VM (ior:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -1759,11 +2119,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vorc -> vandc (define_insn "*fuse_vorc_vandc" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (and:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) + (and:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -1777,11 +2137,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vxor -> vandc (define_insn "*fuse_vxor_vandc" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (and:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") + (and:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -1795,11 +2155,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vand -> veqv (define_insn "*fuse_vand_veqv" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (not:VM (xor:VM (and:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") + (not:VM (xor:VM (and:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -1813,11 +2173,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vandc -> veqv (define_insn "*fuse_vandc_veqv" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (not:VM (xor:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) + (not:VM (xor:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -1831,11 +2191,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector veqv -> veqv (define_insn "*fuse_veqv_veqv" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (not:VM (xor:VM (not:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") + (not:VM (xor:VM (not:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "%v,v,v,v"))) (match_operand:VM 2 
"altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -1849,11 +2209,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vnand -> veqv (define_insn "*fuse_vnand_veqv" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (not:VM (xor:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) + (not:VM (xor:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -1867,11 +2227,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vnor -> veqv (define_insn "*fuse_vnor_veqv" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (not:VM (xor:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) + (not:VM (xor:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -1885,11 +2245,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vor -> veqv (define_insn "*fuse_vor_veqv" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (not:VM (xor:VM (ior:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") + (not:VM (xor:VM (ior:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -1903,11 +2263,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vorc -> veqv (define_insn "*fuse_vorc_veqv" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (not:VM (xor:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) + (not:VM (xor:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -1921,11 +2281,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vxor -> veqv (define_insn "*fuse_vxor_veqv" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (not:VM (xor:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") + (not:VM (xor:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -1939,11 +2299,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical 
+;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vand -> vnand (define_insn "*fuse_vand_vnand" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (ior:VM (not:VM (and:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") + (ior:VM (not:VM (and:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -1957,11 +2317,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vandc -> vnand (define_insn "*fuse_vandc_vnand" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (ior:VM (not:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) + (ior:VM (not:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -1975,11 +2335,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector veqv -> vnand (define_insn "*fuse_veqv_vnand" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (ior:VM (not:VM (not:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") + (ior:VM (not:VM (not:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -1993,11 +2353,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vnand -> vnand (define_insn "*fuse_vnand_vnand" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (ior:VM (not:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) + (ior:VM (not:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -2011,11 +2371,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vnor -> vnand (define_insn "*fuse_vnor_vnand" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (ior:VM (not:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) + (ior:VM (not:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -2029,11 +2389,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vor -> vnand (define_insn "*fuse_vor_vnand" [(set 
(match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (ior:VM (not:VM (ior:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") + (ior:VM (not:VM (ior:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -2047,11 +2407,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vorc -> vnand (define_insn "*fuse_vorc_vnand" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (ior:VM (not:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) + (ior:VM (not:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -2065,11 +2425,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vxor -> vnand (define_insn "*fuse_vxor_vnand" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (ior:VM (not:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") + (ior:VM (not:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -2083,11 +2443,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vand -> vnor (define_insn "*fuse_vand_vnor" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (and:VM (not:VM (and:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") + (and:VM (not:VM (and:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -2101,11 +2461,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vandc -> vnor (define_insn "*fuse_vandc_vnor" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (and:VM (not:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) + (and:VM (not:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -2119,11 +2479,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector veqv -> vnor (define_insn "*fuse_veqv_vnor" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (and:VM (not:VM (not:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") + (and:VM (not:VM (not:VM (xor:VM 
(match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -2137,11 +2497,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vnand -> vnor (define_insn "*fuse_vnand_vnor" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (and:VM (not:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) + (and:VM (not:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -2155,11 +2515,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vnor -> vnor (define_insn "*fuse_vnor_vnor" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (and:VM (not:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) + (and:VM (not:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -2173,11 +2533,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vor -> vnor (define_insn "*fuse_vor_vnor" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (and:VM (not:VM (ior:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") + (and:VM (not:VM (ior:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -2191,11 +2551,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vorc -> vnor (define_insn "*fuse_vorc_vnor" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (and:VM (not:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) + (and:VM (not:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -2209,11 +2569,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vxor -> vnor (define_insn "*fuse_vxor_vnor" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (and:VM (not:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") + (and:VM (not:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 
"altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -2227,11 +2587,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vand -> vor (define_insn "*fuse_vand_vor" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (ior:VM (and:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") + (ior:VM (and:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -2245,11 +2605,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vandc -> vor (define_insn "*fuse_vandc_vor" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (ior:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) + (ior:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -2263,11 +2623,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector veqv -> vor (define_insn "*fuse_veqv_vor" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (ior:VM (not:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") + (ior:VM (not:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -2281,11 +2641,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vnand -> vor (define_insn "*fuse_vnand_vor" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (ior:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) + (ior:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -2299,11 +2659,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vnor -> vor (define_insn "*fuse_vnor_vor" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (ior:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) + (ior:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -2317,11 +2677,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; 
vector vor -> vor (define_insn "*fuse_vor_vor" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (ior:VM (ior:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") + (ior:VM (ior:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "%v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -2335,11 +2695,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vorc -> vor (define_insn "*fuse_vorc_vor" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (ior:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) + (ior:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -2353,11 +2713,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vxor -> vor (define_insn "*fuse_vxor_vor" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (ior:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") + (ior:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -2371,11 +2731,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vand -> vorc (define_insn "*fuse_vand_vorc" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (ior:VM (and:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") + (ior:VM (and:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -2389,11 +2749,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vandc -> vorc (define_insn "*fuse_vandc_vorc" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (ior:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) + (ior:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -2407,11 +2767,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector veqv -> vorc (define_insn "*fuse_veqv_vorc" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (ior:VM (not:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") + (ior:VM (not:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 
"altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -2425,11 +2785,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vnand -> vorc (define_insn "*fuse_vnand_vorc" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (ior:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) + (ior:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -2443,11 +2803,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vnor -> vorc (define_insn "*fuse_vnor_vorc" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (ior:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) + (ior:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -2461,11 +2821,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vor -> vorc (define_insn "*fuse_vor_vorc" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (ior:VM (ior:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") + (ior:VM (ior:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -2479,11 +2839,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vorc -> vorc (define_insn "*fuse_vorc_vorc" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (ior:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) + (ior:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -2497,11 +2857,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vxor -> vorc (define_insn "*fuse_vxor_vorc" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (ior:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") + (ior:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -2515,11 +2875,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion 
pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vand -> vxor (define_insn "*fuse_vand_vxor" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (xor:VM (and:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") + (xor:VM (and:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -2533,11 +2893,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vandc -> vxor (define_insn "*fuse_vandc_vxor" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (xor:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) + (xor:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -2551,11 +2911,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector veqv -> vxor (define_insn "*fuse_veqv_vxor" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (xor:VM (not:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") + (xor:VM (not:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -2569,11 +2929,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vnand -> vxor (define_insn "*fuse_vnand_vxor" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (xor:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) + (xor:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -2587,11 +2947,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vnor -> vxor (define_insn "*fuse_vnor_vxor" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (xor:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) + (xor:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -2605,11 +2965,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vor -> vxor (define_insn "*fuse_vor_vxor" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (xor:VM (ior:VM (match_operand:VM 0 
"altivec_register_operand" "v,v,v,v") + (xor:VM (ior:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -2623,11 +2983,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vorc -> vxor (define_insn "*fuse_vorc_vxor" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (xor:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) + (xor:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] @@ -2641,11 +3001,11 @@ (set_attr "cost" "6") (set_attr "length" "8")]) -;; logical-logical fusion pattern generated by gen_2logical +;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vxor -> vxor (define_insn "*fuse_vxor_vxor" [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") - (xor:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") + (xor:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "%v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&r"))] diff --git a/gcc/config/rs6000/genfusion.pl b/gcc/config/rs6000/genfusion.pl index 1fd46cc..1285dd4 100755 --- a/gcc/config/rs6000/genfusion.pl +++ b/gcc/config/rs6000/genfusion.pl @@ -144,23 +144,32 @@ sub gen_ld_cmpi_p10 } } -sub gen_2logical +sub gen_logical_addsubf { my @logicals = ( "and", "andc", "eqv", "nand", "nor", "or", "orc", "xor" ); + my %logicals_addsub = ( "and"=>1, "nand"=>1, "nor"=>1, "or"=>1 ); + my @addsub = ( "add", "subf" ); + my %isaddsub = ( "add"=>1, "subf"=>1 ); my %complement = ( "and"=> 0, "andc"=> 1, "eqv"=> 0, "nand"=> 3, - "nor"=> 3, "or"=> 0, "orc"=> 1, "xor"=> 0 ); + "nor"=> 3, "or"=> 0, "orc"=> 1, "xor"=> 0, + "add"=> 0, "subf"=> 0 ); my %invert = ( "and"=> 0, "andc"=> 0, "eqv"=> 1, "nand"=> 0, - "nor"=> 0, "or"=> 0, "orc"=> 0, "xor"=> 0 ); + "nor"=> 0, "or"=> 0, "orc"=> 0, "xor"=> 0, + "add"=> 0, "subf"=> 0 ); my %commute2 = ( "and"=> 1, "andc"=> 0, "eqv"=> 1, "nand"=> 0, "nor"=> 0, "or"=> 1, "orc"=> 0, "xor"=> 1 ); my %rtlop = ( "and"=>"and", "andc"=>"and", "eqv"=>"xor", "nand"=>"ior", - "nor"=>"and", "or"=>"ior", "orc"=>"ior", "xor"=>"xor" ); + "nor"=>"and", "or"=>"ior", "orc"=>"ior", "xor"=>"xor", + "add"=>"plus", "subf"=>"minus" ); - my ($kind, $vchr, $mode, $pred, $constraint, $cr, $outer, $outer_op, - $outer_comp, $outer_inv, $outer_rtl, $inner, $inner_comp, $inner_inv, - $inner_rtl, $inner_op, $both_commute, $c4, $bc, $inner_arg0, - $inner_arg1, $inner_exp, $outer_arg2, $outer_exp, $insn, $fuse_type); + my ($kind, $vchr, $mode, $pred, $constraint, $cr, $outer, @outer_ops, + $outer_op, $outer_comp, $outer_inv, $outer_rtl, $inner, @inner_ops, + $inner_comp, $inner_inv, $inner_rtl, $inner_op, $both_commute, $c4, + $bc, $inner_arg0, $inner_arg1, $inner_exp, $outer_arg2, $outer_exp, + $target_flag, $ftype, $insn, $is_rsubf, $outer_32, $outer_42, + $outer_name, $fuse_type); KIND: foreach $kind ('scalar','vector') { + @outer_ops = @logicals; if ( $kind eq 'vector' ) { $vchr = "v"; $mode = "VM"; @@ -173,14 +182,37 @@ 
sub gen_2logical $pred = "gpc_reg_operand"; $constraint = "r"; $fuse_type = "fused_arith_logical"; + push (@outer_ops, @addsub); + push (@outer_ops, ( "rsubf" )); } $c4 = "${constraint},${constraint},${constraint},${constraint}"; - OUTER: foreach $outer ( @logicals ) { + OUTER: foreach $outer ( @outer_ops ) { + $outer_name = "${vchr}${outer}"; + if ( $outer eq "rsubf" ) { + $is_rsubf = 1; + $outer = "subf"; + } else { + $is_rsubf = 0; + } $outer_op = "${vchr}${outer}"; $outer_comp = $complement{$outer}; $outer_inv = $invert{$outer}; $outer_rtl = $rtlop{$outer}; - INNER: foreach $inner ( @logicals ) { + @inner_ops = @logicals; + $ftype = "logical-logical"; + $target_flag = "TARGET_P10_FUSION_2LOGICAL"; + if ( exists $isaddsub{$outer} ) { + @inner_ops = sort keys %logicals_addsub; + $ftype = "logical-add"; + $target_flag = "TARGET_P10_FUSION_LOGADD"; + } elsif ( $kind ne 'vector' && exists $logicals_addsub{$outer} ) { + push (@inner_ops, @addsub); + } + INNER: foreach $inner ( @inner_ops ) { + if ( exists $isaddsub{$inner} ) { + $ftype = "add-logical"; + $target_flag = "TARGET_P10_FUSION_ADDLOG"; + } $inner_comp = $complement{$inner}; $inner_inv = $invert{$inner}; $inner_rtl = $rtlop{$inner}; @@ -197,7 +229,7 @@ sub gen_2logical if ( ($inner_comp & 2) == 2 ) { $inner_arg1 = "(not:${mode} $inner_arg1)"; } - $inner_exp = "(${inner_rtl}:${mode} ${inner_arg0} + $inner_exp = "(${inner_rtl}:${mode} ${inner_arg0} ${inner_arg1})"; if ( $inner_inv == 1 ) { $inner_exp = "(not:${mode} $inner_exp)"; @@ -209,26 +241,35 @@ sub gen_2logical if ( ($outer_comp & 2) == 2 ) { $inner_exp = "(not:${mode} $inner_exp)"; } - $outer_exp = "(${outer_rtl}:${mode} ${inner_exp} + if ( $is_rsubf == 1 ) { + $outer_exp = "(${outer_rtl}:${mode} ${outer_arg2} + ${inner_exp})"; + $outer_32 = "%2,%3"; + $outer_42 = "%2,%4"; + } else { + $outer_exp = "(${outer_rtl}:${mode} ${inner_exp} ${outer_arg2})"; + $outer_32 = "%3,%2"; + $outer_42 = "%4,%2"; + } if ( $outer_inv == 1 ) { $outer_exp = "(not:${mode} $outer_exp)"; } $insn = <<"EOF"; -;; logical-logical fusion pattern generated by gen_2logical -;; $kind $inner_op -> $outer_op -(define_insn "*fuse_${inner_op}_${outer_op}" +;; $ftype fusion pattern generated by gen_logical_addsubf +;; $kind $inner_op -> $outer_name +(define_insn "*fuse_${inner_op}_${outer_name}" [(set (match_operand:${mode} 3 "${pred}" "=0,1,&${constraint},${constraint}") ${outer_exp}) (clobber (match_scratch:${mode} 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION && $target_flag)" "@ - ${inner_op} %3,%1,%0\\;${outer_op} %3,%3,%2 - ${inner_op} %3,%1,%0\\;${outer_op} %3,%3,%2 - ${inner_op} %3,%1,%0\\;${outer_op} %3,%3,%2 - ${inner_op} %4,%1,%0\\;${outer_op} %3,%4,%2" + ${inner_op} %3,%1,%0\\;${outer_op} %3,${outer_32} + ${inner_op} %3,%1,%0\\;${outer_op} %3,${outer_32} + ${inner_op} %3,%1,%0\\;${outer_op} %3,${outer_32} + ${inner_op} %4,%1,%0\\;${outer_op} %3,${outer_42}" [(set_attr "type" "$fuse_type") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -284,7 +325,7 @@ EOF } gen_ld_cmpi_p10(); -gen_2logical(); +gen_logical_addsubf(); gen_addadd; exit(0); diff --git a/gcc/config/rs6000/rs6000-cpus.def b/gcc/config/rs6000/rs6000-cpus.def index d46a91d..52ce848 100644 --- a/gcc/config/rs6000/rs6000-cpus.def +++ b/gcc/config/rs6000/rs6000-cpus.def @@ -86,6 +86,8 @@ | OPTION_MASK_P10_FUSION \ | OPTION_MASK_P10_FUSION_LD_CMPI \ | OPTION_MASK_P10_FUSION_2LOGICAL \ + | OPTION_MASK_P10_FUSION_LOGADD \ + | OPTION_MASK_P10_FUSION_ADDLOG \ | OPTION_MASK_P10_FUSION_2ADD) /* Flags 
that need to be turned off if -mno-power9-vector. */ @@ -136,6 +138,8 @@ | OPTION_MASK_P10_FUSION \ | OPTION_MASK_P10_FUSION_LD_CMPI \ | OPTION_MASK_P10_FUSION_2LOGICAL \ + | OPTION_MASK_P10_FUSION_LOGADD \ + | OPTION_MASK_P10_FUSION_ADDLOG \ | OPTION_MASK_P10_FUSION_2ADD \ | OPTION_MASK_HTM \ | OPTION_MASK_ISEL \ diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 9f03256..835af77 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -4478,6 +4478,14 @@ rs6000_option_override_internal (bool global_init_p) rs6000_isa_flags |= OPTION_MASK_P10_FUSION_2LOGICAL; if (TARGET_POWER10 + && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_LOGADD) == 0) + rs6000_isa_flags |= OPTION_MASK_P10_FUSION_LOGADD; + + if (TARGET_POWER10 + && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_ADDLOG) == 0) + rs6000_isa_flags |= OPTION_MASK_P10_FUSION_ADDLOG; + + if (TARGET_POWER10 && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_2ADD) == 0) rs6000_isa_flags |= OPTION_MASK_P10_FUSION_2ADD; diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt index e30dc04..0538db3 100644 --- a/gcc/config/rs6000/rs6000.opt +++ b/gcc/config/rs6000/rs6000.opt @@ -500,11 +500,19 @@ Fuse certain integer operations together for better performance on power10. mpower10-fusion-2logical Target Undocumented Mask(P10_FUSION_2LOGICAL) Var(rs6000_isa_flags) -Fuse certain integer operations together for better performance on power10. +Fuse pairs of scalar or vector logical operations together for better performance on power10. + +mpower10-fusion-logical-add +Target Undocumented Mask(P10_FUSION_LOGADD) Var(rs6000_isa_flags) +Fuse scalar logical op with add/subf for better performance on power10. + +mpower10-fusion-add-logical +Target Undocumented Mask(P10_FUSION_ADDLOG) Var(rs6000_isa_flags) +Fuse scalar add/subf with logical op for better performance on power10. mpower10-fusion-2add Target Undocumented Mask(P10_FUSION_2ADD) Var(rs6000_isa_flags) -Fuse certain add operations together for better performance on power10. +Fuse dependent pairs of add or vaddudm instructions for better performance on power10. mcrypto Target Mask(CRYPTO) Var(rs6000_isa_flags) -- cgit v1.1 From db92bd223e3957ee58b5a0c0fffd8b7766f1def3 Mon Sep 17 00:00:00 2001 From: Geng Qi Date: Mon, 24 May 2021 20:22:52 +0800 Subject: C-SKY: Add fpuv3 instructions and CK860 arch. gcc/ChangeLog: * config/csky/constraints.md ("W"): New constriant for mem operand with base reg, index register. ("Q"): Renamed and modified "csky_valid_fpuv2_mem_operand" to "csky_valid_mem_constraint_operand" to deal with both "Q" and "W" constraint. ("Dv"): New constraint for const double value that can be used at fmovi instruction. * config/csky/csky-modes.def (HFmode): New mode. * config/csky/csky-protos.h (csky_valid_fpuv2_mem_operand): Rename to "csky_valid_mem_constraint_operand" and support new constraint "W". (csky_get_movedouble_length): New. (fpuv3_output_move): New. (fpuv3_const_double): New. * config/csky/csky.c (csky_option_override): New arch CK860 with fpv3. (decompose_csky_address): Refine. (csky_print_operand): New "CONST_DOUBLE" operand. (csky_output_move): Support fpv3 instructions. (csky_get_movedouble_length): New. (fpuv3_output_move): New. (fpuv3_const_double): New. (csky_emit_compare): Cover float comparsion. (csky_emit_compare_float): Refine. (csky_vaild_fpuv2_mem_operand): Rename to "csky_valid_mem_constraint_operand" and support new constraint "W". (ck860_rtx_costs): New. 
(csky_rtx_costs): Add the cost calculation of CK860. (regno_reg_class): New vregs for fpuv3. (csky_dbx_regno): Likewise. (csky_cpu_cpp_builtins): New builtin macro for fpuv3. (csky_conditional_register_usage): Suporrot fpuv3. (csky_dwarf_register_span): Suporrot fpuv3. (csky_init_builtins, csky_mangle_type): Support "__fp16" type. (ck810_legitimate_index_p): Support fp16. * config/csky/csky.h (TARGET_TLS): ADD CK860. (CSKY_VREG_P, CSKY_VREG_LO_P, CSKY_VREG_HI_P): Support fpuv3. (TARGET_SINGLE_FPU): Support fpuv3. (TARGET_SUPPORT_FPV3): New. (FIRST_PSEUDO_REGISTER): Change to 202 to hold the new fpuv3 registers. (FIXED_REGISTERS, CALL_REALLY_USED_REGISTERS, REGISTER_NAMES, REG_CLASS_CONTENTS): Support fpuv3. * config/csky/csky.md (movsf): Move to cksy_insn_fpu.md and refine. (csky_movsf_fpv2): Likewise. (ck801_movsf): Likewise. (csky_movsf): Likewise. (movdf): Likewise. (csky_movdf_fpv2): Likewise. (ck801_movdf): Likewise. (csky_movdf): Likewise. (movsicc): Refine. Use "comparison_operatior" instead of "ordered_comparison_operatior". (addsicc): Likewise. (CSKY_FIRST_VFP3_REGNUM, CSKY_LAST_VFP3_REGNUM): New constant. (call_value_internal_vh): New. * config/csky/csky_cores.def (CK860): New arch and cpu. (fpv3_hf): New. (fpv3_hsf): New. (fpv3_sdf): New. (fpv3): New. * config/csky/csky_insn_fpu.md: Refactor. Separate all float patterns into emit-patterns and match-patterns, remain the emit-patterns here, and move the match-patterns to csky_insn_fpuv2.md or csky_insn_fpuv3.md. * config/csky/csky_insn_fpuv2.md: New file for fpuv2 instructions. * config/csky/csky_insn_fpuv3.md: New file and new patterns for fpuv3 isntructions. * config/csky/csky_isa.def (fcr): New. (fpv3_hi): New. (fpv3_hf): New. (fpv3_sf): New. (fpv3_df): New. (CK860): New definition for ck860. * config/csky/csky_tables.opt (ck860): New processors ck860, ck860f. And new arch ck860. (fpv3_hf): New. (fpv3_hsf): New. (fpv3_hdf): New. (fpv3): New. * config/csky/predicates.md (csky_float_comparsion_operator): Delete "geu", "gtu", "leu", "ltu", which will never appear at float comparison. * config/csky/t-csky-elf: Support 860. * config/csky/t-csky-linux: Likewise. * doc/md.texi: Add "Q" and "W" constraints for C-SKY. 
--- gcc/config/csky/constraints.md | 13 +- gcc/config/csky/csky-modes.def | 2 + gcc/config/csky/csky-protos.h | 7 +- gcc/config/csky/csky.c | 644 ++++++++++++++++++++++++++---- gcc/config/csky/csky.h | 162 ++++++-- gcc/config/csky/csky.md | 127 ++---- gcc/config/csky/csky_cores.def | 13 + gcc/config/csky/csky_insn_fpu.md | 798 +++++++++++++++---------------------- gcc/config/csky/csky_insn_fpuv2.md | 470 ++++++++++++++++++++++ gcc/config/csky/csky_insn_fpuv3.md | 497 +++++++++++++++++++++++ gcc/config/csky/csky_isa.def | 15 + gcc/config/csky/csky_tables.opt | 21 + gcc/config/csky/predicates.md | 3 +- gcc/config/csky/t-csky-elf | 9 +- gcc/config/csky/t-csky-linux | 11 +- 15 files changed, 2117 insertions(+), 675 deletions(-) create mode 100644 gcc/config/csky/csky-modes.def create mode 100644 gcc/config/csky/csky_insn_fpuv2.md create mode 100644 gcc/config/csky/csky_insn_fpuv3.md (limited to 'gcc/config') diff --git a/gcc/config/csky/constraints.md b/gcc/config/csky/constraints.md index 6067d3d..937cb81 100644 --- a/gcc/config/csky/constraints.md +++ b/gcc/config/csky/constraints.md @@ -34,7 +34,11 @@ (define_memory_constraint "Q" "Memory operands with base register, index register and short displacement for FPUV2" - (match_test "csky_valid_fpuv2_mem_operand (op)")) + (match_test "csky_valid_mem_constraint_operand (op, \"Q\")")) + +(define_memory_constraint "W" + "Memory operands with base register, index register" + (match_test "csky_valid_mem_constraint_operand (op, \"W\")")) (define_constraint "R" "Memory operands whose address is a label_ref" @@ -172,3 +176,10 @@ "Constant in range [-8, -1]" (and (match_code "const_int") (match_test "CSKY_CONST_OK_FOR_US (ival)"))) + +(define_constraint "Dv" + "@VFPv3 + A const_double which can be used with a VFP fmovi + instruction." + (and (match_code "const_double") + (match_test "fpuv3_const_double_rtx (op)"))) diff --git a/gcc/config/csky/csky-modes.def b/gcc/config/csky/csky-modes.def new file mode 100644 index 0000000..a2427ff --- /dev/null +++ b/gcc/config/csky/csky-modes.def @@ -0,0 +1,2 @@ +/* Float modes. */ +FLOAT_MODE (HF, 2, ieee_half_format); /* Half-precision floating point */ diff --git a/gcc/config/csky/csky-protos.h b/gcc/config/csky/csky-protos.h index 7a2e23e..7c6528b 100644 --- a/gcc/config/csky/csky-protos.h +++ b/gcc/config/csky/csky-protos.h @@ -30,7 +30,7 @@ extern void csky_cpu_cpp_builtins (cpp_reader *); extern bool csky_inlinable_constant (HOST_WIDE_INT value); extern bool csky_shifted_imm8_constant (unsigned HOST_WIDE_INT, unsigned int *, unsigned int *); -extern bool csky_valid_fpuv2_mem_operand (rtx); +extern bool csky_valid_mem_constraint_operand (rtx, const char*); extern bool csky_minipool_load_p (rtx_insn *); extern const char *csky_output_move (rtx insn, rtx *, machine_mode); @@ -70,4 +70,9 @@ extern int csky_default_branch_cost (bool, bool); extern bool csky_default_logical_op_non_short_circuit (void); extern void csky_init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree); +extern int csky_get_movedouble_length(rtx operands[]); + +/* The functions was used for fpuv3. */ +extern const char *fpuv3_output_move (rtx *operands); +extern int fpuv3_const_double_rtx (rtx); #endif /* GCC_CSKY_PROTOS_H */ diff --git a/gcc/config/csky/csky.c b/gcc/config/csky/csky.c index cdb95fe..6e97994 100644 --- a/gcc/config/csky/csky.c +++ b/gcc/config/csky/csky.c @@ -126,7 +126,46 @@ enum reg_class regno_reg_class[FIRST_PSEUDO_REGISTER] = /* Reserved. */ RESERVE_REGS, RESERVE_REGS, /* Register epc. 
*/ - OTHER_REGS + OTHER_REGS, + /* Vec registers. */ + V_REGS, V_REGS, V_REGS, V_REGS, + V_REGS, V_REGS, V_REGS, V_REGS, + V_REGS, V_REGS, V_REGS, V_REGS, + V_REGS, V_REGS, V_REGS, V_REGS, + /* Reserved. */ + RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, + RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, + RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, + RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, + /* Reserved. */ + RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, + RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, + RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, + RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, + RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, + RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, + RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, + RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, + + RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, + RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, + RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, + RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, + RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, + RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, + RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, + RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, + + RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, + RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, + RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, + RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, + RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, + RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, + RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, + RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, + + RESERVE_REGS, RESERVE_REGS, RESERVE_REGS }; /* Arrays that map GCC register numbers to debugger register numbers, @@ -138,11 +177,34 @@ const int csky_dbx_regno[FIRST_PSEUDO_REGISTER] = 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - -1, -1, 36, 37, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, 56, 57, 58, 59, - 60, 61, 62, 63, 64, 65, 66, 67, - 68, 69, 70, 71, -1, -1, 72 + -1, -1, 36, 37, + 75, 79, 83, 87, 91, 95, 99, 103, + 107, 111, 115, 119, 123, 127, 131, 135, + 74, 78, 82, 86, 90, 94, 98, 102, + 106, 110, 114, 118, 122, 126, 130, 134, + -1, -1, 72, + /* vr: 71 - 86 */ + 139, 143, 147, 151, 155, 159, 163, 167, + 171, 175, 179, 183, 187, 191, 195, 199, + 138, 142, 146, 150, 154, 158, 162, 166, + 170, 174, 178, 182, 186, 190, 194, 198, + /* resereved */ + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1 }; /* Table of machine attributes. 
*/ @@ -351,6 +413,12 @@ csky_cpu_cpp_builtins (cpp_reader *pfile) builtin_define ("__CSKY_FPUV2__"); } + if (TARGET_SUPPORT_FPV3) + { + builtin_define ("__csky_fpuv3__"); + builtin_define ("__CSKY_FPUV3__"); + } + if (TARGET_ELRW) { builtin_define ("__csky_elrw__"); @@ -408,7 +476,6 @@ csky_cpu_cpp_builtins (cpp_reader *pfile) * Storage Layout * ******************************************************************/ - #undef TARGET_PROMOTE_FUNCTION_MODE #define TARGET_PROMOTE_FUNCTION_MODE \ default_promote_function_mode_always_promote @@ -416,6 +483,9 @@ csky_cpu_cpp_builtins (cpp_reader *pfile) #undef TARGET_CONSTANT_ALIGNMENT #define TARGET_CONSTANT_ALIGNMENT csky_constant_alignment +#undef TARGET_MANGLE_TYPE +#define TARGET_MANGLE_TYPE csky_mangle_type + /****************************************************************** * Stack Layout and Calling Conventions * @@ -692,6 +762,15 @@ csky_default_logical_op_non_short_circuit (void) #define TARGET_SCHED_ADJUST_COST csky_sched_adjust_cost +/****************************************************************** + * Builtin * + ******************************************************************/ + + +#undef TARGET_INIT_BUILTINS +#define TARGET_INIT_BUILTINS csky_init_builtins + + /* The declaration of functions. */ static void push_csky_minipool_fix (rtx_insn *, HOST_WIDE_INT, rtx *, machine_mode, rtx); @@ -837,6 +916,7 @@ Mfix *minipool_fix_tail; Mfix *minipool_barrier; /* Allow GC scanning of the minipool obstack. */ + static void csky_add_gc_roots (void) { @@ -846,6 +926,7 @@ csky_add_gc_roots (void) /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so strcpy from constants will be faster. */ + static HOST_WIDE_INT csky_constant_alignment (const_tree exp, HOST_WIDE_INT align) { @@ -1109,6 +1190,7 @@ get_csky_barrier_cost (rtx_insn *insn) (FIX->address,MAX_ADDRESS) to forcibly insert a minipool barrier. Create the barrier by inserting a jump and add a new fix entry for it. */ + static Mfix * create_csky_fix_barrier (Mfix *fix, Mfix *fix_next, HOST_WIDE_INT max_address) @@ -1455,6 +1537,7 @@ csky_compute_pushpop_length (rtx *operands) } /* Emit constant pools for -mconstpool. */ + static void csky_emit_constant_pools (void) { @@ -1796,6 +1879,7 @@ csky_initial_elimination_offset (int from, int to) CUM is a variable of type CUMULATIVE_ARGS which gives info about the preceding args and about the function being called. ARG is a description of the argument. */ + static rtx csky_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg) { @@ -1921,6 +2005,7 @@ csky_function_value (const_tree type, const_tree func, /* Implement TARGET_LIBCALL_VALUE. */ + static rtx csky_libcall_value (machine_mode mode, const_rtx libcall ATTRIBUTE_UNUSED) @@ -1949,6 +2034,7 @@ csky_function_value_regno_p (const unsigned int regno) /* Return an RTX indicating where the return address to the calling function can be found. */ + rtx csky_return_addr (int count, rtx frame ATTRIBUTE_UNUSED) { @@ -1964,6 +2050,7 @@ csky_return_addr (int count, rtx frame ATTRIBUTE_UNUSED) that must be put in registers. The value must be zero for arguments that are passed entirely in registers or that are entirely pushed on the stack. 
*/ + static int csky_arg_partial_bytes (cumulative_args_t pcum_v, const function_arg_info &arg) { @@ -2180,7 +2267,19 @@ csky_conditional_register_usage (void) int regno; for (regno = CSKY_FIRST_VFP_REGNUM; - regno <= CSKY_LAST_VFP_REGNUM; regno++) + regno <= CSKY_LAST_VFP3_REGNUM; regno++) + { + fixed_regs[regno] = 1; + call_used_regs[regno] = 1; + } + } + + if (!TARGET_SUPPORT_FPV3) + { + int regno; + + for (regno = CSKY_FIRST_VFP3_REGNUM; + regno <= CSKY_LAST_VFP3_REGNUM; regno++) { fixed_regs[regno] = 1; call_used_regs[regno] = 1; @@ -2198,6 +2297,7 @@ csky_conditional_register_usage (void) } /* Implement TARGET_HARD_REGNO_NREGS. */ + static unsigned int csky_hard_regno_nregs (unsigned int regno, machine_mode mode) { @@ -2261,6 +2361,7 @@ csky_hard_regno_mode_ok (unsigned int regno, machine_mode mode) /* Implement TARGET_MODES_TIEABLE_P. We can't tie DFmode with other modes when V_REGs might be in use because those registers mess with the stored bits. */ + static bool csky_modes_tieable_p (machine_mode mode1, machine_mode mode2) { @@ -2272,6 +2373,7 @@ csky_modes_tieable_p (machine_mode mode1, machine_mode mode2) /* Implement TARGET_CAN_CHANGE_MODE_CLASS. V_REG registers can't do subreg as all values are reformatted to internal precision. */ + static bool csky_can_change_mode_class (machine_mode from, machine_mode to, @@ -2406,6 +2508,7 @@ csky_spill_class (reg_class_t rclass, machine_mode mode ATTRIBUTE_UNUSED) /* Convert a static initializer array of feature bits to sbitmap representation. */ + static void csky_initialize_isa (sbitmap isa, const enum csky_isa_feature *isa_bits) { @@ -2417,6 +2520,7 @@ csky_initialize_isa (sbitmap isa, const enum csky_isa_feature *isa_bits) /* Configure a build target TARGET from the user-specified options OPTS and OPTS_SET. 
*/ + static void csky_configure_build_target (struct csky_build_target *target, struct cl_target_option *opts, @@ -2508,7 +2612,9 @@ csky_option_override (void) csky_base_arch = csky_active_target.base_arch; - if (flag_pic && !(CSKY_TARGET_ARCH (CK810) || CSKY_TARGET_ARCH (CK807))) + if (flag_pic && !(CSKY_TARGET_ARCH (CK807) + || CSKY_TARGET_ARCH (CK810) + || CSKY_TARGET_ARCH (CK860))) { flag_pic = 0; warning (0, "%qs is not supported by arch %s", @@ -2526,19 +2632,21 @@ csky_option_override (void) bool ok; int fpu_index; -#ifdef CSKY_FPUTYPE_DEFAULT - target_fpu_name = CSKY_FPUTYPE_DEFAULT; -#else - target_fpu_name = "fpv2"; -#endif - if (csky_active_target.core_name != NULL && !strchr (csky_active_target.core_name, 'f')) target_fpu_name = "auto"; else if (CSKY_TARGET_ARCH (CK803) || !TARGET_DOUBLE_FLOAT) target_fpu_name = "fpv2_sf"; + else if (CSKY_TARGET_ARCH (CK860)) + target_fpu_name = "fpv3"; else if (TARGET_DOUBLE_FLOAT && TARGET_FDIVDU) target_fpu_name = "fpv2_divd"; + else +#ifdef CSKY_FPUTYPE_DEFAULT + target_fpu_name = CSKY_FPUTYPE_DEFAULT; +#else + target_fpu_name = "fpv2"; +#endif ok = opt_enum_arg_to_value (OPT_mfpu_, target_fpu_name, &fpu_index, CL_TARGET); @@ -3020,10 +3128,8 @@ ck810_legitimate_index_p (machine_mode mode, rtx index, int strict_p) { enum rtx_code code = GET_CODE (index); - if (TARGET_HARD_FLOAT - && (mode == SFmode || mode == DFmode)) - return (code == CONST_INT && INTVAL (index) < 1024 - && INTVAL (index) >= 0 + if (code == CONST_INT && TARGET_HARD_FLOAT && CSKY_VREG_MODE_P (mode)) + return (INTVAL (index) < 1024 && INTVAL (index) >= 0 && (INTVAL (index) & 3) == 0); if (code == CONST_INT) @@ -3183,7 +3289,7 @@ static bool decompose_csky_address (rtx addr, struct csky_address *out) { rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX; - HOST_WIDE_INT scale = 1; + HOST_WIDE_INT scale = 0; rtx scale_rtx = NULL_RTX; int i; @@ -3231,7 +3337,10 @@ decompose_csky_address (rtx addr, struct csky_address *out) if (!base) base = op; else if (!index) - index = op; + { + index = op; + scale = 1; + } else return false; break; @@ -3259,7 +3368,7 @@ decompose_csky_address (rtx addr, struct csky_address *out) scale_rtx = XEXP (op, 1); if (!CONST_INT_P (scale_rtx)) return false; - scale = scale << INTVAL (scale_rtx); + scale = 1 << INTVAL (scale_rtx); break; default: return false; @@ -3484,6 +3593,14 @@ csky_print_operand (FILE *stream, rtx x, int code) case UNSPEC: csky_output_pic_addr_const (stream, x, code); break; + case CONST_DOUBLE: + { + char fpstr[20]; + real_to_decimal ( fpstr, CONST_DOUBLE_REAL_VALUE (x), + sizeof (fpstr), 0, 1); + fprintf (stream, "%s", fpstr); + } + break; default: output_addr_const (stream, x); break; @@ -3997,17 +4114,37 @@ csky_output_move (rtx insn ATTRIBUTE_UNUSED, rtx operands[], return "mfhi\t%0"; } - if (CSKY_VREG_P (dstreg) && CSKY_VREG_P (srcreg)) - return "fmovs\t%0, %1"; - if (CSKY_VREG_P (dstreg)) - return "fmtvrl\t%0, %1"; - if (CSKY_VREG_P (srcreg)) - return "fmfvrl\t%0, %1"; - - if (REGNO (src) == CSKY_CC_REGNUM) - return "mvc\t%0"; - else - return "mov\t%0, %1"; + if (CSKY_VREG_P (dstreg) && CSKY_VREG_P (srcreg)) + { + if (CSKY_ISA_FEATURE (fpv2_sf)) + return "fmovs\t%0, %1"; + else if (CSKY_ISA_FEATURE (fpv3_sf)) + return "fmov.32\t%0, %1"; + else + gcc_unreachable (); + } + if (CSKY_VREG_P (dstreg)) + { + if (CSKY_ISA_FEATURE (fpv2_sf)) + return "fmtvrl\t%0, %1"; + else if (CSKY_ISA_FEATURE (fpv3_sf)) + return "fmtvr.32.1\t%0, %1"; + else + gcc_unreachable (); + } + if (CSKY_VREG_P (srcreg)) + { + if (CSKY_ISA_FEATURE 
(fpv2_sf)) + return "fmfvrl\t%0, %1"; + else if (CSKY_ISA_FEATURE (fpv3_sf)) + return "fmfvr.32.1\t%0, %1"; + else + gcc_unreachable (); + } + if (REGNO (src) == CSKY_CC_REGNUM) + return "mvc\t%0"; + else + return "mov\t%0, %1"; } /* The situation mov memory to reg. */ else if (GET_CODE (src) == MEM) @@ -4018,13 +4155,21 @@ csky_output_move (rtx insn ATTRIBUTE_UNUSED, rtx operands[], switch (GET_MODE (src)) { case E_HImode: + case E_HFmode: return "ldr.h\t%0, %1"; case E_QImode: return "ldr.b\t%0, %1"; case E_SImode: case E_SFmode: if (CSKY_VREG_P (REGNO (dst))) - return "fldrs\t%0, %1"; + { + if (CSKY_ISA_FEATURE(fpv2_sf)) + return "fldrs\t%0, %1"; + else if (CSKY_ISA_FEATURE(fpv3_sf)) + return "fldr.32\t%0, %1"; + else + gcc_unreachable (); + } else return "ldr.w\t%0, %1"; default: @@ -4042,13 +4187,21 @@ csky_output_move (rtx insn ATTRIBUTE_UNUSED, rtx operands[], switch (GET_MODE (src)) { case E_HImode: + case E_HFmode: return "ld.h\t%0, %1"; case E_QImode: return "ld.b\t%0, %1"; case E_SFmode: case E_SImode: if (CSKY_VREG_P (REGNO (dst))) - return "flds\t%0, %1"; + { + if (CSKY_ISA_FEATURE(fpv2_sf)) + return "flds\t%0, %1"; + else if (CSKY_ISA_FEATURE(fpv3_sf)) + return "fld.32\t%0, %1"; + else + gcc_unreachable (); + } else return "ld.w\t%0, %1"; default: @@ -4106,7 +4259,14 @@ csky_output_move (rtx insn ATTRIBUTE_UNUSED, rtx operands[], case E_SFmode: case E_SImode: if (CSKY_VREG_P (REGNO (src))) - return "fstrs\t%1, %0"; + { + if (CSKY_ISA_FEATURE(fpv2_sf)) + return "fstrs\t%1, %0"; + else if (CSKY_ISA_FEATURE(fpv3_sf)) + return "fstr.32\t%1, %0"; + else + gcc_unreachable (); + } else return "str.w\t%1, %0"; default: @@ -4122,7 +4282,14 @@ csky_output_move (rtx insn ATTRIBUTE_UNUSED, rtx operands[], case E_SImode: case E_SFmode: if (CSKY_VREG_P (REGNO (src))) - return "fsts\t%1, %0"; + { + if (CSKY_ISA_FEATURE(fpv2_sf)) + return "fsts\t%1, %0"; + else if (CSKY_ISA_FEATURE(fpv3_sf)) + return "fst.32\t%1, %0"; + else + gcc_unreachable (); + } else return "st.w\t%1, %0"; default: @@ -4261,7 +4428,14 @@ csky_output_movedouble (rtx operands[], return "mthi\t%R1\n\tmtlo\t%1"; } else if (CSKY_VREG_P (srcreg) && CSKY_VREG_P (dstreg)) - return "fmovd\t%0, %1"; + { + if (CSKY_ISA_FEATURE(fpv2_df)) + return "fmovd\t%0, %1"; + else if (CSKY_ISA_FEATURE(fpv3_df)) + return "fmov.64\t%0, %1"; + else + gcc_unreachable (); + } else if (CSKY_VREG_P (srcreg)) { /* Since the vector registers in fpuv2_soft processors @@ -4270,18 +4444,46 @@ csky_output_movedouble (rtx operands[], if (TARGET_SOFT_FPU) return "fmfvrl\t%0, %1"; else if (TARGET_BIG_ENDIAN) - return "fmfvrh\t%0, %1\n\tfmfvrl\t%R0, %1"; + { + if (CSKY_ISA_FEATURE(fpv2_df)) + return "fmfvrh\t%0, %1\n\tfmfvrl\t%R0, %1"; + else if (CSKY_ISA_FEATURE(fpv3_df)) + return "fmfvr.64\t%R0, %0, %1"; + else + gcc_unreachable (); + } else - return "fmfvrh\t%R0, %1\n\tfmfvrl\t%0, %1"; + { + if (CSKY_ISA_FEATURE(fpv2_df)) + return "fmfvrh\t%R0, %1\n\tfmfvrl\t%0, %1"; + else if (CSKY_ISA_FEATURE(fpv3_df)) + return "fmfvr.64\t%0, %R0, %1"; + else + gcc_unreachable (); + } } else if (CSKY_VREG_P (dstreg)) { if (TARGET_SOFT_FPU) return "fmtvrl\t%0, %1"; else if (TARGET_BIG_ENDIAN) - return "fmtvrh\t%0, %1\n\tfmtvrl\t%0, %R1"; + { + if (CSKY_ISA_FEATURE(fpv2_df)) + return "fmtvrh\t%0, %1\n\tfmtvrl\t%0, %R1"; + else if (CSKY_ISA_FEATURE(fpv3_df)) + return "fmtvr.64\t%0, %R1, %1"; + else + gcc_unreachable (); + } else - return "fmtvrh\t%0, %R1\n\tfmtvrl\t%0, %1"; + { + if (CSKY_ISA_FEATURE(fpv2_df)) + return "fmtvrh\t%0, %R1\n\tfmtvrl\t%0, %1"; + else if 
(CSKY_ISA_FEATURE(fpv3_df)) + return "fmtvr.64\t%0, %1, %R1"; + else + gcc_unreachable (); + } } /* Ensure the second source not overwritten. */ @@ -4323,9 +4525,23 @@ csky_output_movedouble (rtx operands[], if (CSKY_VREG_P (dstreg)) { if (op0.index) - return "fldrd\t%0, %1"; + { + if (CSKY_ISA_FEATURE(fpv2_df)) + return "fldrd\t%0, %1"; + else if (CSKY_ISA_FEATURE(fpv3_df)) + return "fldr.64\t%0, %1"; + else + gcc_unreachable (); + } else - return "fldd\t%0, %1"; + { + if (CSKY_ISA_FEATURE(fpv2_df)) + return "fldd\t%0, %1"; + else if (CSKY_ISA_FEATURE(fpv3_df)) + return "fld.64\t%0, %1"; + else + gcc_unreachable (); + } } /* FIXME length attribute is wrong here. */ if (dstreg == basereg) @@ -4389,9 +4605,23 @@ csky_output_movedouble (rtx operands[], if (CSKY_VREG_P (srcreg)) { if (op0.index) - return "fstrd\t%1, %0"; + { + if (CSKY_ISA_FEATURE(fpv2_df)) + return "fstrd\t%1, %0"; + else if (CSKY_ISA_FEATURE(fpv3_df)) + return "fstr.64\t%1, %0"; + else + gcc_unreachable (); + } else - return "fstd\t%1, %0"; + { + if (CSKY_ISA_FEATURE(fpv2_df)) + return "fstd\t%1, %0"; + else if (CSKY_ISA_FEATURE(fpv3_df)) + return "fst.64\t%1, %0"; + else + gcc_unreachable (); + } } /* FIXME length attribute is wrong here. */ if (srcreg == basereg) @@ -4518,9 +4748,181 @@ csky_output_ck801_movedouble (rtx operands[], gcc_unreachable (); } +/* Calculate the instruction's length for moving double-word data. */ + +int +csky_get_movedouble_length(rtx operands[]) +{ + rtx dst = operands[0]; + rtx src = operands[1]; + + if (REG_P (dst)) + { + if (REG_P (src)) + { + int dstreg = REGNO (dst); + int srcreg = REGNO (src); + + if (CSKY_VREG_P (srcreg) && CSKY_VREG_P (dstreg)) + return 4; + else + return 8; + } + else if (GET_CODE (src) == MEM) + { + rtx memexp = XEXP (src, 0); + int dstreg = REGNO (dst); + struct csky_address op0; + decompose_csky_address (XEXP (src, 0), &op0); + + if (GET_CODE (memexp) == LABEL_REF) + return 8; + if (CSKY_VREG_P (dstreg)) + return 4; + return 8; + } + else if (GET_CODE (src) == CONST_INT || GET_CODE (src) == CONST_DOUBLE) + { + split_double (src, operands + 2, operands + 3); + if (CSKY_CONST_OK_FOR_N (INTVAL (operands[2]) + 1) + && CSKY_CONST_OK_FOR_N (INTVAL (operands[3]) + 1) + && REGNO (operands[0]) < 6) + return 4; + else + return 8; + } + } + else if (GET_CODE (dst) == MEM && GET_CODE (src) == REG) + { + rtx memexp = XEXP (dst, 0); + int srcreg = REGNO (src); + int offset = -1; + if (CSKY_VREG_P (srcreg)) + return 4; + + if (GET_CODE (memexp) == REG) + offset = 0; + else if (GET_CODE (memexp) == PLUS) + { + if (GET_CODE (XEXP (memexp, 0)) == REG) + offset = INTVAL (XEXP (memexp, 1)); + else if (GET_CODE (XEXP (memexp, 1)) == REG) + offset = INTVAL (XEXP (memexp, 0)); + else + gcc_unreachable (); + } + else + gcc_unreachable (); + + if (srcreg <= 6 && offset <= 1020) + return 4; + else if ((srcreg == 7 && offset <= 1024) || (srcreg <= 7 && offset == 1024)) + return 6; + else + return 8; + } + else + gcc_unreachable (); + + return 0; +} + +/* Output float point load/store instructions for fpuv3. */ + +const char * +fpuv3_output_move (rtx *operands) +{ + rtx reg, mem, addr, ops[2]; + bool isload = REG_P (operands[0]); + + const char *templ = "f%s%s.%s\t%%0, %%1"; + char buff[50]; + machine_mode mode; + + reg = operands[isload ? 0 : 1]; + mem = operands[isload ? 1 : 0]; + + gcc_assert (REG_P (reg)); + gcc_assert (CSKY_VREG_P (REGNO (reg))); + gcc_assert (MEM_P (mem)); + + mode = GET_MODE (reg); + const char *type = mode == DFmode ? "64" : + mode == SFmode ? "32" : + mode == HFmode ? 
"16" : + NULL; + gcc_assert(type != NULL); + + addr = XEXP (mem, 0); + struct csky_address caddr; + decompose_csky_address (addr, &caddr); + + ops[0] = reg; + ops[1] = mem; + sprintf (buff, templ, + isload ? "ld" : "st", + caddr.index ? "r" : "", + type); + output_asm_insn (buff, ops); + + return ""; +} + +/* Check if a const_double can be used by a VFP fmovi instruction. */ + +int +fpuv3_const_double_rtx (rtx x) +{ + REAL_VALUE_TYPE r, m; + r = *CONST_DOUBLE_REAL_VALUE (x); + + /* Fpuv3 doesn't support the following values. */ + if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r) || REAL_VALUE_MINUS_ZERO (r) + || r.cl == rvc_zero) + return 0; + + /* Extract sign, exponent and mantissa. */ + int exponent; + r = real_value_abs (&r); + exponent = REAL_EXP (&r); + + bool fail; + unsigned HOST_WIDE_INT mantissa, mant_hi; + unsigned HOST_WIDE_INT mask; + int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1; + real_ldexp (&m, &r, point_pos - exponent); + wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2); + mantissa = w.elt (0); + mant_hi = w.elt (1); + + exponent -= 1; + + if (!IN_RANGE (exponent, -4, 11)) + return 0; + + /* If there are bits set in the low part of the mantissa, these values are + not supported. */ + if (mantissa != 0) + return 0; + + /* Now, make the mantissa contain the most-significant bits, and the + point_pos indicates the number of these bits. */ + point_pos -= HOST_BITS_PER_WIDE_INT; + mantissa = mant_hi; + + /* We can only allow a mantissa of 9 significant digits, top of which is always 1. */ + mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 9)) - 1; + if ((mantissa & mask) != 0) + return 0; + + return 1; +} + + /* Split operands for an AND expression when OPERANDS[2] is a constant. Note operands[0] is marked earlyclobber in this case and can be overwritten. Return true if "DONE", false otherwise. */ + bool csky_split_and (rtx *operands) { @@ -4650,6 +5052,7 @@ csky_split_and (rtx *operands) /* Split operands for an IOR expression when OPERANDS[2] is a constant. Note operands[0] is marked earlyclobber in this case and can be overwritten. Return true if "DONE", false otherwise. */ + bool csky_split_ior (rtx *operands) { @@ -4717,6 +5120,7 @@ csky_split_ior (rtx *operands) /* Split operands for an XOR expression when OPERANDS[2] is a constant. Note operands[0] is marked earlyclobber in this case and can be overwritten. Return true if "DONE", false otherwise. */ + bool csky_split_xor (rtx *operands) { @@ -4765,6 +5169,7 @@ csky_split_xor (rtx *operands) /* Return true if X is an address form involving a symbol or label ref. */ + bool csky_symbolic_address_p (rtx x) { @@ -4793,6 +5198,9 @@ csky_emit_compare (enum rtx_code code, rtx op0, rtx op1) bool invert; rtx cc_reg = gen_rtx_REG (CCmode, CSKY_CC_REGNUM); + if (GET_MODE_CLASS(GET_MODE (op0)) == MODE_FLOAT) + return csky_emit_compare_float(code, op0, op1); + if (GET_CODE (op1) == CONST_INT) { HOST_WIDE_INT val = INTVAL (op1); @@ -5707,6 +6115,7 @@ tls_unspec_mentioned_p (rtx x) /* Implement LEGITIMATE_PIC_OPERAND_P. 
*/ + bool csky_legitimate_pic_operand_p (rtx x) { @@ -5938,33 +6347,20 @@ csky_emit_compare_float (enum rtx_code code, rtx op0, rtx op1) op1 = force_reg (mode, op1); invert = false; + switch (code) { case EQ: code = NE; invert = true; break; - - case NE: - break; - case LE: - if (op1 == CONST0_RTX (mode)) - op1 = force_reg (mode, op1); - break; case GT: - if (op1 == CONST0_RTX (mode)) - op1 = force_reg (mode, op1); - break; - case GE: - break; case LT: - if (op1 == CONST0_RTX (mode)) - { - code = GE; - invert = true; - } - break; - case UNORDERED: + case LE: + if (op1 == CONST0_RTX (mode) && (CSKY_ISA_FEATURE_GET(fpv2_sf) + || CSKY_ISA_FEATURE_GET(fpv2_df) + || CSKY_ISA_FEATURE_GET(fpv2_divd))) + op1 = force_reg (mode, op1); break; case ORDERED: code = UNORDERED; @@ -5980,10 +6376,11 @@ csky_emit_compare_float (enum rtx_code code, rtx op0, rtx op1) return invert; } -/* Support for the Q memory constraint. Returns true if OP is a MEM RTX - with an address consisting of base + index or base + displacement. */ +/* Support for the Q or W memory constraint. Returns true if OP is a MEM + RTX with an address consisting of base + index or base + displacement. */ + bool -csky_valid_fpuv2_mem_operand (rtx op) +csky_valid_mem_constraint_operand (rtx op, const char *constraint) { struct csky_address addr; @@ -5998,7 +6395,7 @@ csky_valid_fpuv2_mem_operand (rtx op) return false; /* Verify index operand. */ - if (addr.index) + if (addr.index && (constraint[0] == 'Q' || constraint[0] == 'W')) { if (!is_csky_address_register_rtx_p (addr.index, 0)) return false; @@ -6010,7 +6407,7 @@ csky_valid_fpuv2_mem_operand (rtx op) return false; } /* Verify disp operand. */ - else if (addr.disp) + else if (addr.disp && constraint[0] == 'Q') { rtx disp = addr.disp; @@ -6023,7 +6420,11 @@ csky_valid_fpuv2_mem_operand (rtx op) return false; } - return true; + else if (constraint[0] == 'Q') + /* Single reg is valid for 'Q'. */ + return true; + + return false; } @@ -6442,7 +6843,7 @@ ck803_rtx_costs (rtx x, int code, int outer_code ATTRIBUTE_UNUSED, } } -/* TARGET_RTX_COSTS helper for ck807+ arches. */ +/* TARGET_RTX_COSTS helper for ck807/ck810 arches. */ static bool ck807_ck810_rtx_costs (rtx x, int code, @@ -6473,6 +6874,52 @@ ck807_ck810_rtx_costs (rtx x, int code, } } +/* TARGET_RTX_COSTS helper for ck860 arches. */ + +static bool +ck860_rtx_costs (rtx x, int code, machine_mode mode, + int outer_code ATTRIBUTE_UNUSED, + int *total, bool speed ATTRIBUTE_UNUSED) +{ + switch (code) + { + case PLUS: + /* The costs of mula is 1 more than mult. */ + if (GET_CODE (XEXP (x, 0)) == MULT && REG_P (XEXP (x, 1)) && speed) + { + rtx mul_op0 = XEXP (XEXP (x, 0), 0); + rtx mul_op1 = XEXP (XEXP (x, 0), 1); + if (REG_P (mul_op0) && REG_P (mul_op1)) + { + *total = COSTS_N_INSNS (1); + *total += rtx_cost (XEXP (x, 0), mode, + (enum rtx_code) code, 0, speed); + return true; + } + } + return false; + case MULT: + if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1))) + { + HOST_WIDE_INT val = INTVAL (XEXP (x, 1)); + if (val % 2 == 0 && val < 0xffffffff && val > 0) + { + *total = COSTS_N_INSNS (1); + return true; + } + } + return false; + + case CONST: + case LABEL_REF: + case SYMBOL_REF: + *total = COSTS_N_INSNS (3); + return true; + default: + return false; + } +} + /* Implement TARGET_RTX_COSTS, to compute a (partial) cost for rtx X. 
Return true if the complete cost has been computed, and false if @@ -6491,6 +6938,8 @@ csky_rtx_costs (rtx x, machine_mode mode ATTRIBUTE_UNUSED, int outer_code, return ck803_rtx_costs (x, code, outer_code, total, speed); else if (CSKY_TARGET_ARCH (CK807) || CSKY_TARGET_ARCH (CK810)) return ck807_ck810_rtx_costs (x, code, outer_code, total, speed); + else if (CSKY_TARGET_ARCH (CK860)) + return ck860_rtx_costs (x, code, mode, outer_code, total, speed); else gcc_unreachable (); } @@ -6633,6 +7082,7 @@ csky_warn_func_return (tree decl) /* Implement TARGET_RETURN_IN_MEMORY to decide whether TYPE should be returned in memory (true) or in a register (false). FNTYPE is the type of the function making the call. */ + static bool csky_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED) @@ -6646,6 +7096,7 @@ csky_return_in_memory (const_tree type, Dwarf models VFP registers as 64-bit or 128-bit registers by default. GCC models them as 32-bit registers, so we need to describe this to the DWARF generation code. Other registers can use the default. */ + static rtx csky_dwarf_register_span (rtx rtl) { @@ -6659,11 +7110,15 @@ csky_dwarf_register_span (rtx rtl) if (!CSKY_VREG_P (regno)) return NULL_RTX; + if (CSKY_VREG_HI_P (regno)) + regno += 16; + mode = GET_MODE (rtl); if (GET_MODE_SIZE (mode) < 8) return NULL_RTX; - if (TARGET_SOFT_FPU) + + if (TARGET_SINGLE_FPU) { nregs = GET_MODE_SIZE (mode) / 4; for (i = 0; i < nregs; i += 2) @@ -6684,9 +7139,18 @@ csky_dwarf_register_span (rtx rtl) as the CPU bit width. Transform the 64-bit FPU registers to 32 bits here, and we will modify the unwind processing to fit CSKY architecture later. */ - nregs = GET_MODE_SIZE (mode) / 8; - for (i = 0; i < nregs; i++) - parts[i] = gen_rtx_REG (SImode, regno + i); + nregs = GET_MODE_SIZE (mode) / 4; + for (i = 0; i < nregs; i += 2) + if (TARGET_BIG_ENDIAN) + { + parts[i] = gen_rtx_REG (SImode, regno + i - 16); + parts[i + 1] = gen_rtx_REG (SImode, regno + i); + } + else + { + parts[i] = gen_rtx_REG (SImode, regno + i); + parts[i + 1] = gen_rtx_REG (SImode, regno + i - 16); + } } return gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (nregs , parts)); @@ -6847,6 +7311,34 @@ csky_init_cumulative_args (CUMULATIVE_ARGS *pcum, tree fntype, pcum->is_stdarg = true; } + +/* Implement the TARGET_INIT_BUILTINS target macro. */ + +void +csky_init_builtins (void) +{ + /* Init fp16. */ + static tree csky_floatHF_type_node = make_node (REAL_TYPE); + TYPE_PRECISION (csky_floatHF_type_node) = GET_MODE_PRECISION (HFmode); + layout_type (csky_floatHF_type_node); + (*lang_hooks.types.register_builtin_type) (csky_floatHF_type_node, "__fp16"); +} + + +/* Implement TARGET_MANGLE_TYPE. */ + +static const char * +csky_mangle_type (const_tree type) +{ + if (TYPE_NAME (type) && TREE_CODE (TYPE_NAME (type)) == TYPE_DECL + && DECL_NAME (TYPE_NAME (type)) + && !strcmp (IDENTIFIER_POINTER (DECL_NAME (TYPE_NAME (type))), "__fp16")) + return "__fp16"; + + /* Use the default mangling. 
*/ + return NULL; +} + struct gcc_target targetm = TARGET_INITIALIZER; #include "gt-csky.h" diff --git a/gcc/config/csky/csky.h b/gcc/config/csky/csky.h index c7590ab..f535c42 100644 --- a/gcc/config/csky/csky.h +++ b/gcc/config/csky/csky.h @@ -28,8 +28,17 @@ #define CSKY_GENERAL_REGNO_P(N) \ ((N) < CSKY_NGPR_REGS && (int)(N) >= 0) -#define CSKY_VREG_P(N) \ - ((N) >= CSKY_FIRST_VFP_REGNUM && (N) <= CSKY_LAST_VFP_REGNUM) +#define CSKY_VREG_LO_P(N) \ + ((N) >= CSKY_FIRST_VFP_REGNUM \ + && (N) <= CSKY_LAST_VFP_REGNUM) + + #define CSKY_VREG_HI_P(N) \ + ((N) >= CSKY_FIRST_VFP3_REGNUM \ + && (N) <= CSKY_LAST_VFP3_REGNUM) + + #define CSKY_VREG_P(N) \ + (CSKY_VREG_LO_P(N) \ + || CSKY_VREG_HI_P(N)) #define CSKY_HILO_REG_P(N) \ ((N) == CSKY_HI_REGNUM || (N) == CSKY_LO_REGNUM) @@ -124,7 +133,7 @@ (optimize_size && TARGET_CONSTANT_POOL \ && (CSKY_TARGET_ARCH (CK801) || CSKY_TARGET_ARCH (CK802))) #define TARGET_TLS \ - (CSKY_TARGET_ARCH (CK807) || CSKY_TARGET_ARCH (CK810)) + (CSKY_TARGET_ARCH (CK807) || CSKY_TARGET_ARCH (CK810) || CSKY_TARGET_ARCH (CK860)) /* Run-time Target Specification. */ #define TARGET_SOFT_FLOAT (csky_float_abi == CSKY_FLOAT_ABI_SOFT) @@ -133,7 +142,9 @@ /* Use hardware floating point calling convention. */ #define TARGET_HARD_FLOAT_ABI (csky_float_abi == CSKY_FLOAT_ABI_HARD) -#define TARGET_SINGLE_FPU (csky_fpu_index == TARGET_FPU_fpv2_sf) +#define TARGET_SINGLE_FPU (csky_fpu_index == TARGET_FPU_fpv2_sf \ + || csky_fpu_index == TARGET_FPU_fpv3_hsf \ + || csky_fpu_index == TARGET_FPU_fpv3_hf) #define TARGET_DOUBLE_FPU (TARGET_HARD_FLOAT && !TARGET_SINGLE_FPU) #define FUNCTION_VARG_REGNO_P(REGNO) \ @@ -142,13 +153,18 @@ CSKY_FIRST_VFP_REGNUM + CSKY_NPARM_FREGS - 1)) #define CSKY_VREG_MODE_P(mode) \ - ((mode) == SFmode || (mode) == DFmode) + ((mode) == SFmode || (mode) == DFmode \ + || (CSKY_ISA_FEATURE(fpv3_hf) && (mode) == HFmode)) #define FUNCTION_VARG_MODE_P(mode) \ (TARGET_HARD_FLOAT_ABI \ && CSKY_VREG_MODE_P(mode) \ && !(mode == DFmode && TARGET_SINGLE_FPU)) +#define TARGET_SUPPORT_FPV3 (CSKY_ISA_FEATURE (fpv3_hf) \ + || CSKY_ISA_FEATURE (fpv3_sf) \ + || CSKY_ISA_FEATURE (fpv3_df)) + /* Number of loads/stores handled by ldm/stm. */ #define CSKY_MIN_MULTIPLE_STLD 3 #define CSKY_MAX_MULTIPLE_STLD 12 @@ -427,7 +443,7 @@ typedef struct ******************************************************************/ -#define FIRST_PSEUDO_REGISTER 71 +#define FIRST_PSEUDO_REGISTER 202 /* 1 for registers that have pervasive standard uses and are not available for the register allocator. 
@@ -456,7 +472,31 @@ typedef struct /* reserved */ \ 1, 1, \ /* epc */ \ - 1 \ + 1, \ + /* vr16 vr17 vr18 vr19 vr20 vr21 vr22 vr23 */ \ + 0, 0, 0, 0, 0, 0, 0, 0, \ + /* vr24 vr25 vr26 vr27 vr28 vr29 vr30 vr31 */ \ + 0, 0, 0, 0, 0, 0, 0, 0 , \ + /* reserved */ \ + 1, 1, 1, 1, 1, 1, 1, 1, \ + 1, 1, 1, 1, 1, 1, 1, 1, \ + /* reserved */ \ + 1, 1, 1, 1, 1, 1, 1, 1, \ + 1, 1, 1, 1, 1, 1, 1, 1, \ + 1, 1, 1, 1, 1, 1, 1, 1, \ + 1, 1, 1, 1, 1, 1, 1, 1, \ + \ + 1, 1, 1, 1, 1, 1, 1, 1, \ + 1, 1, 1, 1, 1, 1, 1, 1, \ + 1, 1, 1, 1, 1, 1, 1, 1, \ + 1, 1, 1, 1, 1, 1, 1, 1, \ + \ + 1, 1, 1, 1, 1, 1, 1, 1, \ + 1, 1, 1, 1, 1, 1, 1, 1, \ + 1, 1, 1, 1, 1, 1, 1, 1, \ + 1, 1, 1, 1, 1, 1, 1, 1, \ + \ + 1, 1, 1 \ } /* Like `CALL_USED_REGISTERS' but used to overcome a historical @@ -487,7 +527,31 @@ typedef struct /* reserved */ \ 1, 1, \ /* epc */ \ - 1 \ + 1, \ + /* vr16 vr17 vr18 vr19 vr20 vr21 vr22 vr23*/ \ + 1, 1, 1, 1, 1, 1, 1, 1, \ + /* vr24 vr25 vr26 vr27 vr28 vr29 vr30 vr31 */ \ + 1, 1, 1, 1, 1, 1, 1, 1, \ + /* reserved */ \ + 1, 1, 1, 1, 1, 1, 1, 1, \ + 1, 1, 1, 1, 1, 1, 1, 1, \ + /* reserved */ \ + 1, 1, 1, 1, 1, 1, 1, 1, \ + 1, 1, 1, 1, 1, 1, 1, 1, \ + 1, 1, 1, 1, 1, 1, 1, 1, \ + 1, 1, 1, 1, 1, 1, 1, 1, \ + \ + 1, 1, 1, 1, 1, 1, 1, 1, \ + 1, 1, 1, 1, 1, 1, 1, 1, \ + 1, 1, 1, 1, 1, 1, 1, 1, \ + 1, 1, 1, 1, 1, 1, 1, 1, \ + \ + 1, 1, 1, 1, 1, 1, 1, 1, \ + 1, 1, 1, 1, 1, 1, 1, 1, \ + 1, 1, 1, 1, 1, 1, 1, 1, \ + 1, 1, 1, 1, 1, 1, 1, 1, \ + \ + 1, 1, 1 \ } #define REGISTER_NAMES \ @@ -510,7 +574,37 @@ typedef struct "vr0", "vr1", "vr2", "vr3", "vr4", "vr5", "vr6", "vr7", \ "vr8", "vr9", "vr10", "vr11", "vr12", "vr13", "vr14", "vr15", \ "reserved", "reserved", \ - "epc" \ + "epc", \ + /* V registers: 71~86 */ \ + "vr16", "vr17", "vr18", "vr19", "vr20", "vr21", "vr22", "vr23", \ + "vr24", "vr25", "vr26", "vr27", "vr28", "vr29", "vr30", "vr31", \ + "reserved", "reserved", "reserved", "reserved", "reserved", \ + "reserved", "reserved", "reserved", "reserved", "reserved", \ + "reserved", "reserved", "reserved", "reserved", "reserved", \ + "reserved", \ + /* reserved: 87~201*/ \ + "reserved", "reserved", "reserved", "reserved", "reserved", \ + "reserved", "reserved", "reserved", "reserved", "reserved", \ + "reserved", "reserved", "reserved", "reserved", "reserved", \ + "reserved", "reserved", "reserved", "reserved", "reserved", \ + "reserved", "reserved", "reserved", "reserved", "reserved", \ + "reserved", "reserved", "reserved", "reserved", "reserved", \ + "reserved", "reserved", \ + "reserved", "reserved", "reserved", "reserved", "reserved", \ + "reserved", "reserved", "reserved", "reserved", "reserved", \ + "reserved", "reserved", "reserved", "reserved", "reserved", \ + "reserved", "reserved", "reserved", "reserved", "reserved", \ + "reserved", "reserved", "reserved", "reserved", "reserved", \ + "reserved", "reserved", "reserved", "reserved", "reserved", \ + "reserved", "reserved", \ + "reserved", "reserved", "reserved", "reserved", "reserved", \ + "reserved", "reserved", "reserved", "reserved", "reserved", \ + "reserved", "reserved", "reserved", "reserved", "reserved", \ + "reserved", "reserved", "reserved", "reserved", "reserved", \ + "reserved", "reserved", "reserved", "reserved", "reserved", \ + "reserved", "reserved", "reserved", "reserved", "reserved", \ + "reserved", "reserved", \ + "reserved", "reserved", "reserved" \ } /* Table of additional register names to use in user input. 
*/ @@ -569,9 +663,16 @@ typedef struct 52, 53, 54, 55, 56, 57, 58, 59, \ /* vr8 vr9 vr10 vr11 vr12 vr13 vr14 vr15 */ \ 60, 61, 62, 63, 64, 65, 66, 67, \ +/* vr16 vr17 vr18 vr18 vr20 vr21 vr22 vr23 */ \ + 71, 72, 73, 74, 75, 76, 77, 78, \ +/* vr24 vr25 vr26 vr27 vr28 vr28 vr30 vr31 */ \ + 79, 80, 81, 82, 83, 84, 85, 86, \ /* reserved */ \ 36, 37, 38, 39, 40, 41, 42, 43, \ 44, 45, 46, 47, 48, 49, 50, 51, \ +/* reserved */ \ + 87, 88, 89, 90, 91, 92, 93, 94, \ + 95, 96, 97, 98, 99, 100, 101, 102, \ /* sp tls reserved c reserved epc */ \ 14, 31, 32, 33, 68, 69, 70 } @@ -616,21 +717,34 @@ enum reg_class /* Define which registers fit in which classes. This is an initializer for a vector of HARD_REG_SET of length N_REG_CLASSES. */ -#define REG_CLASS_CONTENTS \ -{ \ - {0x00000000, 0x00000000, 0x00000000 }, /* NO_REGS */ \ - {0x000000FF, 0x00000000, 0x00000000 }, /* MINI_REGS */ \ - {0x00004000, 0x00000000, 0x00000000 }, /* SP_REGS */ \ - {0x0000FFFF, 0x00000000, 0x00000000 }, /* LOW_REGS */ \ - {0xFFFFFFFF, 0x00000000, 0x00000000 }, /* GENERAL_REGS */ \ - {0x00000000, 0x00000002, 0x00000000 }, /* C_REGS */ \ - {0x00000000, 0x00000004, 0x00000000 }, /* HI_REG */ \ - {0x00000000, 0x00000008, 0x00000000 }, /* LO_REG */ \ - {0x00000000, 0x0000000c, 0x00000000 }, /* HILO_REGS */ \ - {0x00000000, 0xFFF00000, 0x0000000F }, /* V_REGS */ \ - {0x00000000, 0x00000000, 0x00000040 }, /* OTHER_REGS */ \ - {0x00000000, 0x0FF00001, 0x00000030 }, /* RESERVE_REGS */ \ - {0xFFFFFFFF, 0xFFFFFFFF, 0x0000007F }, /* ALL_REGS */ \ +#define REG_CLASS_CONTENTS \ +{ \ + {0x00000000, 0x00000000, 0x00000000, 0x00000000, \ + 0x00000000, 0x00000000, 0x00000000}, /* NO_REGS */ \ + {0x000000FF, 0x00000000, 0x00000000, 0x00000000, \ + 0x00000000, 0x00000000, 0x00000000}, /* MINI_REGS */ \ + {0x00004000, 0x00000000, 0x00000000, 0x00000000, \ + 0x00000000, 0x00000000, 0x00000000}, /* SP_REGS */ \ + {0x0000FFFF, 0x00000000, 0x00000000, 0x00000000, \ + 0x00000000, 0x00000000, 0x00000000}, /* LOW_REGS */ \ + {0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, \ + 0x00000000, 0x00000000, 0x00000000}, /* GENERAL_REGS */ \ + {0x00000000, 0x00000002, 0x00000000, 0x00000000, \ + 0x00000000, 0x00000000, 0x00000000}, /* C_REGS */ \ + {0x00000000, 0x00000004, 0x00000000, 0x00000000, \ + 0x00000000, 0x00000000, 0x00000000}, /* HI_REG */ \ + {0x00000000, 0x00000008, 0x00000000, 0x00000000, \ + 0x00000000, 0x00000000, 0x00000000}, /* LO_REG */ \ + {0x00000000, 0x0000000c, 0x00000000, 0x00000000, \ + 0x00000000, 0x00000000, 0x00000000}, /* HILO_REGS */ \ + {0x00000000, 0xFFF00000, 0x007FFF8F, 0x00000000, \ + 0x00000000, 0x00000000, 0x00000000}, /* V_REGS */ \ + {0x00000000, 0x00000000, 0x00000040, 0x00000000, \ + 0x00000000, 0x00000000, 0x00000000}, /* OTHER_REGS */ \ + {0x00000000, 0x000FFFF1, 0xFF800030, 0xFFFFFFFF, \ + 0xFFFFFFFF, 0xFFFFFFFF, 0x000003FF}, /* RESERVE_REGS */ \ + {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, \ + 0xFFFFFFFF, 0xFFFFFFFF, 0x000003FF}, /* ALL_REGS */ \ } /* Return register class from regno. */ diff --git a/gcc/config/csky/csky.md b/gcc/config/csky/csky.md index 8bb3b2b..c27d627 100644 --- a/gcc/config/csky/csky.md +++ b/gcc/config/csky/csky.md @@ -32,6 +32,8 @@ (CSKY_FIRST_RET_REGNUM 0) (CSKY_FIRST_VFP_REGNUM 52) (CSKY_LAST_VFP_REGNUM 67) + (CSKY_FIRST_VFP3_REGNUM 71) + (CSKY_LAST_VFP3_REGNUM 86) (CSKY_FIRST_HIGH_REGNUM 16) (CSKY_LAST_HIGH_REGNUM 31) (CSKY_FIRST_MINI_REGNUM 0) @@ -423,85 +425,6 @@ (set_attr "type" "alu,alu,alu,load,load,store")] ) -;; Float mov instructions. 
- -(define_expand "movsf" - [(set (match_operand:SF 0 "general_operand" "") - (match_operand:SF 1 "general_operand" ""))] - "" - " - if (GET_CODE (operands[0]) == MEM && can_create_pseudo_p ()) - operands[1] = force_reg (SFmode, operands[1]); - " -) - -;; FIXME: maybe the vreg load/stores should have their own type attr. -(define_insn "*csky_movsf_fpv2" - [(set (match_operand:SF 0 "nonimmediate_operand" "=b,r,v,r,r,r, m,Q,v,v,v") - (match_operand:SF 1 "general_operand" " b,r,r,v,m,mF,r,v,Q,v,m"))] - "CSKY_ISA_FEATURE (fpv2_sf)" - "* return csky_output_move (insn, operands, SFmode);" - [(set_attr "length" "2,4,4,4,4,4,4,4,4,4,4") - (set_attr "type" "alu,alu,alu,alu,load,load,store,alu,alu,alu,alu")] -) - -(define_insn "*ck801_movsf" - [(set (match_operand:SF 0 "nonimmediate_operand" "=r,r,r, m") - (match_operand:SF 1 "general_operand" " r,m,mF,r"))] - "CSKY_ISA_FEATURE (E1)" - "* return csky_output_ck801_move (insn, operands, SFmode);" - [(set_attr "length" "2,4,4,4") - (set_attr "type" "alu,load,load,store")] -) - -(define_insn "*csky_movsf" - [(set (match_operand:SF 0 "nonimmediate_operand" "=b,r,r,r, m") - (match_operand:SF 1 "general_operand" " b,r,m,mF,r"))] - "CSKY_ISA_FEATURE (E2) && !CSKY_ISA_FEATURE (fpv2_sf)" - "* return csky_output_move (insn, operands, SFmode);" - [(set_attr "length" "2,4,4,4,4") - (set_attr "type" "alu,alu,load,load,store")] -) - - -(define_expand "movdf" - [(set (match_operand:DF 0 "general_operand" "") - (match_operand:DF 1 "general_operand" ""))] - "" - " - if (GET_CODE (operands[0]) == MEM && can_create_pseudo_p ()) - operands[1] = force_reg (DFmode, operands[1]); - " -) - -;; FIXME: maybe the vreg load/stores should have their own type attr. -(define_insn "*csky_movdf_fpv2" - [(set (match_operand:DF 0 "nonimmediate_operand" "=b,r,v,r,r,r, m,Q,v,v,v") - (match_operand:DF 1 "general_operand" "b,r,r,v,m,mF,r,v,Q,v,m"))] - "CSKY_ISA_FEATURE (fpv2_df)" - "* return csky_output_movedouble (operands, DFmode);" - [(set_attr "length" "4,8,8,8,8,8,8,8,8,8,8") - (set_attr "type" "alu,alu,alu,alu,load,load,store,alu,alu,alu,alu")] -) - -(define_insn "*ck801_movdf" - [(set (match_operand:DF 0 "nonimmediate_operand" "=r,r,r, m") - (match_operand:DF 1 "general_operand" " r,m,mF,r"))] - "CSKY_ISA_FEATURE (E1)" - "* return csky_output_ck801_movedouble (operands, DFmode);" - [(set_attr "length" "4,8,8,8") - (set_attr "type" "alu,load,load,store")] -) - -(define_insn "*csky_movdf" - [(set (match_operand:DF 0 "nonimmediate_operand" "=b,r,r,r, m") - (match_operand:DF 1 "general_operand" " b,r,m,mF,r"))] - "CSKY_ISA_FEATURE (E2) && !CSKY_ISA_FEATURE (fpv2_df)" - "* return csky_output_movedouble (operands, DFmode);" - [(set_attr "length" "4,8,8,8,8") - (set_attr "type" "alu,alu,load,load,store")] -) - ;; The only CCmode move supported is a nop. Without this pattern, ;; CSE is unable to eliminate redundant comparisons in conditional ;; execution expressions. 
@@ -522,7 +445,7 @@ (define_expand "movsicc" [(set (match_operand 0 "register_operand" "") - (if_then_else:SI (match_operand 1 "ordered_comparison_operator" "") + (if_then_else:SI (match_operand 1 "comparison_operator" "") (match_operand:SI 2 "register_operand" "") (match_operand:SI 3 "register_operand" "")))] "CSKY_ISA_FEATURE (E2)" @@ -1321,7 +1244,7 @@ (define_expand "addsicc" [(match_operand:SI 0 "register_operand" "") - (match_operand 1 "ordered_comparison_operator" "") + (match_operand 1 "comparison_operator" "") (match_operand:SI 2 "register_operand" "") (match_operand:SI 3 "csky_literal_K_Uh_operand" "")] "CSKY_ISA_FEATURE (E2)" @@ -3316,9 +3239,9 @@ (define_expand "untyped_call" [(parallel [(call (match_operand 0 "" "") - (const_int 0)) - (match_operand 1 "" "") - (match_operand 2 "" "")])] + (const_int 0)) + (match_operand 1 "" "") + (match_operand 2 "" "")])] "" { int i; @@ -3349,11 +3272,25 @@ "" [(set_attr "length" "0")]) -(define_insn "*call_value_internal_vs" - [(set (match_operand:SF 0 "register_operand" "=v,v,v") +(define_insn "*call_value_internal_vh" + [(set (match_operand:HF 0 "register_operand" "=v,v,v") (call (mem:SI (match_operand:SI 1 "csky_call_address_operand" "b, r,S")) (match_operand 2 "" ""))) (clobber (reg:SI CSKY_LR_REGNUM))] + "TARGET_HARD_FLOAT_ABI && CSKY_ISA_FEATURE (fpv3_hf)" + "@ + jsr\t%1 + jsr\t%1 + jbsr\t%1" + [(set_attr "length" "2,4,4") + (set_attr "type" "call_jsr,call_jsr,call")] +) + +(define_insn "*call_value_internal_vs" + [(set (match_operand:SF 0 "register_operand" "=v,v,v") + (call (mem:SI (match_operand:SI 1 "csky_call_address_operand" "b, r,S")) + (match_operand 2 "" ""))) + (clobber (reg:SI CSKY_LR_REGNUM))] "TARGET_HARD_FLOAT_ABI" "@ jsr\t%1 @@ -3364,9 +3301,9 @@ ) (define_insn "*call_value_internal_vd" - [(set (match_operand:DF 0 "register_operand" "=v,v,v") - (call (mem:SI (match_operand:SI 1 "csky_call_address_operand" "b, r,S")) - (match_operand 2 "" ""))) + [(set (match_operand:DF 0 "register_operand" "=v,v,v") + (call (mem:SI (match_operand:SI 1 "csky_call_address_operand" "b, r,S")) + (match_operand 2 "" ""))) (clobber (reg:SI CSKY_LR_REGNUM))] "TARGET_HARD_FLOAT_ABI && TARGET_DOUBLE_FPU" "@ @@ -3378,18 +3315,18 @@ ) (define_insn "*call_value_internal_pic_vs" - [(set (match_operand:SF 0 "register_operand" "=v") - (call (mem:SI (match_operand:SI 1 "csky_unspec_operand" "X")) - (match_operand 2 "" ""))) + [(set (match_operand:SF 0 "register_operand" "=v") + (call (mem:SI (match_operand:SI 1 "csky_unspec_operand" "X")) + (match_operand 2 "" ""))) (clobber (reg:SI CSKY_LR_REGNUM))] "flag_pic && TARGET_HARD_FLOAT_ABI" "* return csky_output_call (operands, 1);" ) (define_insn "*call_value_internal_pic_vd" - [(set (match_operand:DF 0 "register_operand" "=v") - (call (mem:SI (match_operand:SI 1 "csky_unspec_operand" "X")) - (match_operand 2 "" ""))) + [(set (match_operand:DF 0 "register_operand" "=v") + (call (mem:SI (match_operand:SI 1 "csky_unspec_operand" "X")) + (match_operand 2 "" ""))) (clobber (reg:SI CSKY_LR_REGNUM))] "flag_pic && TARGET_HARD_FLOAT_ABI && TARGET_DOUBLE_FPU" "* return csky_output_call (operands, 1);" diff --git a/gcc/config/csky/csky_cores.def b/gcc/config/csky/csky_cores.def index 8309e99..fcf42a4 100644 --- a/gcc/config/csky/csky_cores.def +++ b/gcc/config/csky/csky_cores.def @@ -38,6 +38,8 @@ CSKY_ARCH ("ck807", ck807, CK807, CSKY_ISA_FEAT (CSKY_ISA_CK807) CSKY_ISA_FEAT (CSKY_ISA_DSP)) CSKY_ARCH ("ck810", ck810, CK810, CSKY_ISA_FEAT (CSKY_ISA_CK810) CSKY_ISA_FEAT (CSKY_ISA_DSP)) +CSKY_ARCH ("ck860", ck860, 
CK860, + CSKY_ISA_FEAT (CSKY_ISA_CK860)) #endif @@ -181,6 +183,12 @@ CSKY_CORE ("ck810ft", ck810ff, ck810ft, CK810, CSKY_ISA_FEAT_NONE) CSKY_CORE ("ck810ftv", ck810ftv, ck810ftv, CK810, CSKY_ISA_FEAT_NONE) + +/* ck860 Architecture Processors */ +CSKY_CORE("ck860", ck860, ck860, CK860, + CSKY_ISA_FEAT_NONE) +CSKY_CORE("ck860f", ck860f, ck860f, CK860, + CSKY_ISA_FEAT_NONE) #endif @@ -196,4 +204,9 @@ CSKY_CORE ("ck810ftv", ck810ftv, ck810ftv, CK810, CSKY_FPU ("fpv2_sf", fpv2_sf, CSKY_ISA_FEAT (CSKY_ISA_FPv2_SF)) CSKY_FPU ("fpv2", fpv2, CSKY_ISA_FEAT (CSKY_ISA_FPv2)) CSKY_FPU ("fpv2_divd", fpv2_divd, CSKY_ISA_FEAT (CSKY_ISA_FPv2_DIVD)) + +CSKY_FPU ("fpv3_hf", fpv3_hf, CSKY_ISA_FEAT (CSKY_ISA_FPv3_HF)) +CSKY_FPU ("fpv3_hsf", fpv3_hsf, CSKY_ISA_FEAT (CSKY_ISA_FPv3_HSF)) +CSKY_FPU ("fpv3_sdf", fpv3_sdf, CSKY_ISA_FEAT (CSKY_ISA_FPv3_SDF)) +CSKY_FPU ("fpv3", fpv3, CSKY_ISA_FEAT (CSKY_ISA_FPv3)) #endif diff --git a/gcc/config/csky/csky_insn_fpu.md b/gcc/config/csky/csky_insn_fpu.md index c1e78af..e0d01ab 100644 --- a/gcc/config/csky/csky_insn_fpu.md +++ b/gcc/config/csky/csky_insn_fpu.md @@ -18,528 +18,314 @@ ;; along with GCC; see the file COPYING3. If not see ;; . */ -;; ------------------------------------------------------------------------- -;; Float Abs instructions -;; ------------------------------------------------------------------------- +(define_c_enum "unspec" [ + UNSPEC_FLOOR + UNSPEC_CEIL + UNSPEC_BTRUNC + UNSPEC_RINT +]) -(define_insn "abssf2" - [(set (match_operand:SF 0 "register_operand" "=v,r") - (abs:SF (match_operand:SF 1 "register_operand" "v, r")))] - "CSKY_ISA_FEATURE (fpv2_sf)" - "@ - fabss\t%0, %1 - bclri\t%0, %1, 31") +(define_c_enum "unspecv" [ + VUNSPEC_GET_FCR ; Represent fetch of FCR content. + VUNSPEC_SET_FCR ; Represent assign of FCR content. + VUNSPEC_INS_FCR ; Represent insert of FCR content. 
+]) -(define_insn "absdf2" - [(set (match_operand:DF 0 "register_operand" "=v") - (abs:DF (match_operand:DF 1 "register_operand" "v")))] - "CSKY_ISA_FEATURE (fpv2_df)" - "fabsd\t%0, %1") +(define_mode_iterator F3ANY [HF SF DF]) +(define_mode_attr f3t [(HF "16") (SF "32") (DF "64")]) +(define_mode_iterator SFDF [SF DF]) +(define_mode_attr f2t [(SF "32") (DF "64")]) -;; ------------------------------------------------------------------------- -;; Float Neg instructions -;; ------------------------------------------------------------------------- +(define_code_iterator FCMPZ [ne ge lt gt le]) +(define_code_attr zero_inst [(ne "nez") (ge "hsz") (lt "ltz") (gt "hz") (le "lsz")]) -(define_insn "negsf2" - [(set (match_operand:SF 0 "register_operand" "=v") - (neg:SF (match_operand:SF 1 "register_operand" "v")))] - "CSKY_ISA_FEATURE (fpv2_sf)" - "fnegs\t%0, %1") +(define_code_iterator FCMP [ne ge lt]) +(define_code_attr reg_inst [(ne "ne") (ge "hs") (lt "lt")]) -(define_insn "negdf2" - [(set (match_operand:DF 0 "register_operand" "=v") - (neg:DF (match_operand:DF 1 "register_operand" "v")))] - "CSKY_ISA_FEATURE (fpv2_df)" - "fnegd\t%0, %1") +(define_code_iterator FIX_SU [fix unsigned_fix]) +(define_code_attr fixsuop [(fix "") (unsigned_fix "uns")]) +(define_code_attr fixsu [(fix "s") (unsigned_fix "u")]) +(define_code_iterator FLOAT_SU [float unsigned_float]) +(define_code_attr floatsuop [(float "") (unsigned_float "uns")]) +(define_code_attr floatsu [(float "s") (unsigned_float "u")]) -;; ------------------------------------------------------------------------- -;; Float Sqrt instructions -;; ------------------------------------------------------------------------- +(define_int_iterator FRM [UNSPEC_FLOOR + UNSPEC_CEIL UNSPEC_RINT]) -(define_insn "sqrtsf2" - [(set (match_operand:SF 0 "register_operand" "=v") - (sqrt:SF (match_operand:SF 1 "register_operand" "v")))] - "CSKY_ISA_FEATURE (fpv2_sf)" - "fsqrts\t%0, %1") +(define_int_iterator FRMF [UNSPEC_FLOOR + UNSPEC_CEIL UNSPEC_BTRUNC]) -(define_insn "sqrtdf2" - [(set (match_operand:DF 0 "register_operand" "=v") - (sqrt:DF (match_operand:DF 1 "register_operand" "v")))] - "CSKY_ISA_FEATURE (fpv2_divd)" - "fsqrtd\t%0, %1") +(define_int_attr frm_pattern [(UNSPEC_FLOOR "floor") + (UNSPEC_CEIL "ceil") (UNSPEC_BTRUNC "btrunc") + (UNSPEC_RINT "rint")]) + +(define_int_attr rm [(UNSPEC_FLOOR ".rni") + (UNSPEC_CEIL ".rpi") (UNSPEC_BTRUNC ".rz") + (UNSPEC_RINT "")]) ;; ------------------------------------------------------------------------- -;; Float Add instructions +;; Float mov instructions ;; ------------------------------------------------------------------------- -(define_insn "addsf3" - [(set (match_operand:SF 0 "register_operand" "=v") - (plus:SF (match_operand:SF 1 "register_operand" "v") - (match_operand:SF 2 "register_operand" "v")))] - "CSKY_ISA_FEATURE (fpv2_sf)" - "fadds\t%0, %1, %2") +(define_expand "movhf" + [(set (match_operand:HF 0 "general_operand" "") + (match_operand:HF 1 "general_operand" ""))] + "CSKY_ISA_FEATURE(fpv3_hf)" + " + { + if (GET_CODE(operands[0]) == MEM && can_create_pseudo_p ()) + { + operands[1] = force_reg (HFmode, operands[1]); + } + } +") + +(define_expand "mov" + [(set (match_operand:SFDF 0 "general_operand" "") + (match_operand:SFDF 1 "general_operand" ""))] + "CSKY_ISA_FEATURE(fpv2_) + || CSKY_ISA_FEATURE(fpv3_)" + " + { + if (GET_CODE(operands[0]) == MEM && can_create_pseudo_p ()) + { + operands[1] = force_reg (mode, operands[1]); + } + } +") + +;; Move float value with general register. 
+ +(define_insn "*e2_movsf" + [(set (match_operand:SF 0 "nonimmediate_operand" "=b,r,r,r, m") + (match_operand:SF 1 "general_operand" " b,r,m,mF,r"))] + "CSKY_ISA_FEATURE (E2) + && !CSKY_ISA_FEATURE (fpv2_sf) + && !CSKY_ISA_FEATURE (fpv3_sf)" + "* return csky_output_move (insn, operands, SFmode);" + [(set_attr "length" "2,4,4,4,4") + (set_attr "type" "alu,alu,load,load,store")] +) + +(define_insn "*e2_movdf" + [(set (match_operand:DF 0 "nonimmediate_operand" "=b,r,r,r, m") + (match_operand:DF 1 "general_operand" " b,r,m,mF,r"))] + "CSKY_ISA_FEATURE (E2) + && !CSKY_ISA_FEATURE (fpv2_df) + && !CSKY_ISA_FEATURE (fpv3_df)" + "* return csky_output_movedouble (operands, DFmode);" + [(set_attr "length" "4,8,8,8,8") + (set_attr "type" "alu,alu,load,load,store")] +) -(define_insn "adddf3" - [(set (match_operand:DF 0 "register_operand" "=v") - (plus:DF (match_operand:DF 1 "register_operand" "v") - (match_operand:DF 2 "register_operand" "v")))] - "CSKY_ISA_FEATURE (fpv2_df)" - "faddd\t%0, %1, %2") +(define_insn "*e1_movsf" + [(set (match_operand:SF 0 "nonimmediate_operand" "=r,r,r, m") + (match_operand:SF 1 "general_operand" " r,m,mF,r"))] + "CSKY_ISA_FEATURE (E1)" + "* return csky_output_ck801_move (insn, operands, SFmode);" + [(set_attr "length" "2,4,4,4") + (set_attr "type" "alu,load,load,store")] +) +(define_insn "*e1_movdf" + [(set (match_operand:DF 0 "nonimmediate_operand" "=r,r,r, m") + (match_operand:DF 1 "general_operand" " r,m,mF,r"))] + "CSKY_ISA_FEATURE (E1)" + "* return csky_output_ck801_movedouble (operands, DFmode);" + [(set_attr "length" "4,8,8,8") + (set_attr "type" "alu,load,load,store")] +) ;; ------------------------------------------------------------------------- -;; Float Sub instructions +;; Float Mul instructions ;; ------------------------------------------------------------------------- -(define_insn "subsf3" - [(set (match_operand:SF 0 "register_operand" "=v") - (minus:SF (match_operand:SF 1 "register_operand" "v") - (match_operand:SF 2 "register_operand" "v")))] - "CSKY_ISA_FEATURE (fpv2_sf)" - "fsubs\t%0, %1, %2") +(define_expand "mulhf3" + [(set (match_operand:HF 0 "register_operand" "=v") + (mult:HF (match_operand:HF 1 "register_operand" "v") + (match_operand:HF 2 "register_operand" "v")))] + "CSKY_ISA_FEATURE(fpv3_hf)" + "") -(define_insn "subdf3" - [(set (match_operand:DF 0 "register_operand" "=v") - (minus:DF (match_operand:DF 1 "register_operand" "v") - (match_operand:DF 2 "register_operand" "v")))] - "CSKY_ISA_FEATURE (fpv2_df)" - "fsubd\t%0, %1, %2") +(define_expand "mul3" + [(set (match_operand:SFDF 0 "register_operand" "=v") + (mult:SFDF (match_operand:SFDF 1 "register_operand" "v") + (match_operand:SFDF 2 "register_operand" "v")))] + "CSKY_ISA_FEATURE(fpv2_) + || CSKY_ISA_FEATURE(fpv3_)" + "") +(define_expand "fma4" + [(set (match_operand:F3ANY 0 "register_operand" "=v") + (fma:F3ANY (match_operand:F3ANY 1 "register_operand" "v") + (match_operand:F3ANY 2 "register_operand" "v") + (match_operand:F3ANY 3 "register_operand" "0")))] + "CSKY_ISA_FEATURE(fpv3_)" + "") ;; ------------------------------------------------------------------------- -;; Float Mul instructions +;; Float ADD SUB NEG ABS instructions ;; ------------------------------------------------------------------------- -(define_insn "mulsf3" - [(set (match_operand:SF 0 "register_operand" "=v") - (mult:SF (match_operand:SF 1 "register_operand" "v") - (match_operand:SF 2 "register_operand" "v")))] - "CSKY_ISA_FEATURE (fpv2_sf)" - "fmuls\t%0, %1, %2") - -(define_insn "muldf3" - [(set (match_operand:DF 0 
"register_operand" "=v") - (mult:DF (match_operand:DF 1 "register_operand" "v") - (match_operand:DF 2 "register_operand" "v")))] - "CSKY_ISA_FEATURE (fpv2_df)" - "fmuld\t%0, %1, %2") - -(define_insn "*fpuv2_nmulsf3_1" - [(set (match_operand:SF 0 "register_operand" "=v") - (mult:SF (neg:SF (match_operand:SF 1 "register_operand" "%v")) - (match_operand:SF 2 "register_operand" "v")))] - "CSKY_ISA_FEATURE (fpv2_sf) && !flag_rounding_math" - "fnmuls\t%0, %1, %2") - -(define_insn "*fpuv2_nmulsf3_2" - [(set (match_operand:SF 0 "register_operand" "=v") - (neg:SF (mult:SF (match_operand:SF 1 "register_operand" "v") - (match_operand:SF 2 "register_operand" "v"))))] - "CSKY_ISA_FEATURE (fpv2_sf)" - "fnmuls\t%0, %1, %2") - -(define_insn "*fpuv2_nmuldf3_1" - [(set (match_operand:DF 0 "register_operand" "=v") - (mult:DF (neg:DF (match_operand:DF 1 "register_operand" "%v")) - (match_operand:DF 2 "register_operand" "v")))] - "CSKY_ISA_FEATURE (fpv2_df) && !flag_rounding_math" - "fnmuld\t%0, %1, %2") - -(define_insn "*fpuv2_nmuldf3_2" - [(set (match_operand:DF 0 "register_operand" "=v") - (neg:DF (mult:DF (match_operand:DF 1 "register_operand" "v") - (match_operand:DF 2 "register_operand" "v"))))] - "CSKY_ISA_FEATURE (fpv2_df)" - "fnmuld\t%0, %1, %2") +(define_expand "addhf3" + [(set (match_operand:HF 0 "register_operand" "") + (plus:HF (match_operand:HF 1 "register_operand" "") + (match_operand:HF 2 "register_operand" "")))] + "CSKY_ISA_FEATURE(fpv3_hf)" + "" +) + +(define_expand "add3" + [(set (match_operand:SFDF 0 "register_operand" "") + (plus:SFDF (match_operand:SFDF 1 "register_operand" "") + (match_operand:SFDF 2 "register_operand" "")))] + "CSKY_ISA_FEATURE(fpv2_) || CSKY_ISA_FEATURE(fpv3_)" + "" +) +(define_expand "subhf3" + [(set (match_operand:HF 0 "register_operand" "") + (minus:HF (match_operand:HF 1 "register_operand" "") + (match_operand:HF 2 "register_operand" "")))] + "CSKY_ISA_FEATURE(fpv3_hf)" + "" +) -;; ------------------------------------------------------------------------- -;; Float Div instructions -;; ------------------------------------------------------------------------- +(define_expand "sub3" + [(set (match_operand:SFDF 0 "register_operand" "") + (minus:SFDF (match_operand:SFDF 1 "register_operand" "") + (match_operand:SFDF 2 "register_operand" "")))] + "CSKY_ISA_FEATURE(fpv2_) || CSKY_ISA_FEATURE(fpv3_)" + "" +) -(define_expand "divsf3" - [(set (match_operand:SF 0 "register_operand" "") - (div:SF (match_operand:SF 1 "csky_arith_float1_operand" "") - (match_operand:SF 2 "register_operand" "")))] - "CSKY_ISA_FEATURE (fpv2_sf)" - "") +(define_expand "abshf2" + [(set (match_operand:HF 0 "register_operand" "") + (abs:HF (match_operand:HF 1 "register_operand" "")))] + "CSKY_ISA_FEATURE(fpv3_hf)" + "" +) -(define_insn "*fpuv2_divsf3" - [(set (match_operand:SF 0 "register_operand" "=v") - (div:SF (match_operand:SF 1 "register_operand" "v") - (match_operand:SF 2 "register_operand" "v")))] - "CSKY_ISA_FEATURE (fpv2_sf)" - "fdivs\t%0, %1, %2") - -(define_insn "*fpuv2_1_divsf3" - [(set (match_operand:SF 0 "register_operand" "=v") - (div:SF (match_operand:SF 1 "csky_const_float1_operand" "i") - (match_operand:SF 2 "register_operand" "v")))] - "CSKY_ISA_FEATURE (fpv2_sf)" - "frecips\t%0, %2") - - -(define_expand "divdf3" - [(set (match_operand:DF 0 "register_operand" "") - (div:DF (match_operand:DF 1 "csky_arith_float1_operand" "") - (match_operand:DF 2 "register_operand" "")))] - "CSKY_ISA_FEATURE (fpv2_divd)" - "") +(define_expand "abs2" + [(set (match_operand:SFDF 0 "register_operand" 
"") + (abs:SFDF (match_operand:SFDF 1 "register_operand" "")))] + "CSKY_ISA_FEATURE(fpv2_) || CSKY_ISA_FEATURE(fpv3_)" + "" +) -(define_insn "*fpuv2_divdf3" - [(set (match_operand:DF 0 "register_operand" "=v") - (div:DF (match_operand:DF 1 "register_operand" "v") - (match_operand:DF 2 "register_operand" "v")))] - "CSKY_ISA_FEATURE (fpv2_divd)" - "fdivd\t%0, %1, %2") +(define_expand "neghf2" + [(set (match_operand:HF 0 "register_operand" "") + (neg:HF (match_operand:HF 1 "register_operand" "")))] + "CSKY_ISA_FEATURE(fpv3_hf)" + "" +) -(define_insn "*fpuv2_1_divdf3" - [(set (match_operand:DF 0 "register_operand" "=v") - (div:DF (match_operand:DF 1 "csky_const_float1_operand" "i") - (match_operand:DF 2 "register_operand" "v")))] - "CSKY_ISA_FEATURE (fpv2_divd)" - "frecipd\t%0, %2") +(define_expand "neg2" + [(set (match_operand:SFDF 0 "register_operand" "") + (neg:SFDF (match_operand:SFDF 1 "register_operand" "")))] + "CSKY_ISA_FEATURE(fpv2_) || CSKY_ISA_FEATURE(fpv3_)" + "" +) +(define_expand "sqrthf2" + [(set (match_operand:HF 0 "register_operand" "") + (sqrt:HF (match_operand:HF 1 "register_operand" "")))] + "CSKY_ISA_FEATURE(fpv3_hf)" + "" +) + +(define_expand "sqrt2" + [(set (match_operand:SFDF 0 "register_operand" "") + (sqrt:SFDF (match_operand:SFDF 1 "register_operand" "")))] + "CSKY_ISA_FEATURE(fpv2_) || CSKY_ISA_FEATURE(fpv3_)" + "" +) ;; ------------------------------------------------------------------------- -;; Float add(sub) with mult instructions +;; Float div instructions ;; ------------------------------------------------------------------------- -;; vrz <= vrz + vrx * vry -(define_insn "*fpuv2_fmacs" - [(set (match_operand:SF 0 "register_operand" "=v") - (plus:SF (mult:SF (match_operand:SF 1 "register_operand" "v") - (match_operand:SF 2 "register_operand" "v")) - (match_operand:SF 3 "register_operand" "0")))] - "CSKY_ISA_FEATURE (fpv2_sf)" - "fmacs\t%0, %1, %2") - -(define_insn "*fpuv2_fmacd" - [(set (match_operand:DF 0 "register_operand" "=v") - (plus:DF (mult:DF (match_operand:DF 1 "register_operand" "v") - (match_operand:DF 2 "register_operand" "v")) - (match_operand:DF 3 "register_operand" "0")))] - "CSKY_ISA_FEATURE (fpv2_df)" - "fmacd\t%0, %1, %2") - -;; vrz <= vrz - vrx * vry -(define_insn "*fpuv2_fnmacs" - [(set (match_operand:SF 0 "register_operand" "=v") - (minus:SF (match_operand:SF 1 "register_operand" "0") - (mult:SF (match_operand:SF 2 "register_operand" "v") - (match_operand:SF 3 "register_operand" "v"))))] - "CSKY_ISA_FEATURE (fpv2_sf)" - "fnmacs\t%0, %2, %3") - -(define_insn "*fpuv2_fnmacd" - [(set (match_operand:DF 0 "register_operand" "=v") - (minus:DF (match_operand:DF 1 "register_operand" "0") - (mult:DF (match_operand:DF 2 "register_operand" "v") - (match_operand:DF 3 "register_operand" "v"))))] - "CSKY_ISA_FEATURE (fpv2_df)" - "fnmacd\t%0, %2, %3") - -;; vrz <= vrx * vry - vrz -(define_insn "*fpuv2_fmscs" - [(set (match_operand:SF 0 "register_operand" "=v") - (minus:SF (mult:SF (match_operand:SF 1 "register_operand" "v") - (match_operand:SF 2 "register_operand" "v")) - (match_operand:SF 3 "register_operand" "0")))] - "CSKY_ISA_FEATURE (fpv2_sf)" - "fmscs\t%0, %1, %2") - -(define_insn "*fpuv2_fmscd" - [(set (match_operand:DF 0 "register_operand" "=v") - (minus:DF (mult:DF (match_operand:DF 1 "register_operand" "v") - (match_operand:DF 2 "register_operand" "v")) - (match_operand:DF 3 "register_operand" "0")))] - "CSKY_ISA_FEATURE (fpv2_df)" - "fmscd\t%0, %1, %2") - -;; vrz = - (vrz + vrx * vry) -(define_insn "*fpuv2_fnmscs_1" - [(set (match_operand:SF 0 
"register_operand" "=v") - (minus:SF (mult:SF (neg:SF (match_operand:SF 1 "register_operand" "%v")) - (match_operand:SF 2 "register_operand" "v")) - (match_operand:SF 3 "register_operand" "0")))] - "CSKY_ISA_FEATURE (fpv2_sf)" - "fnmscs\t%0, %1, %2") - -(define_insn "*fpuv2_fnmscs_2" - [(set (match_operand:SF 0 "register_operand" "=v") - (neg:SF (plus:SF (mult:SF (match_operand:SF 1 "register_operand" "v") - (match_operand:SF 2 "register_operand" "v")) - (match_operand:SF 3 "register_operand" "0"))))] - "CSKY_ISA_FEATURE (fpv2_sf)" - "fnmscs\t%0, %1, %2") - -(define_insn "*fpuv2_fnmscd_1" - [(set (match_operand:DF 0 "register_operand" "=v") - (minus:DF (mult:DF (neg:DF (match_operand:DF 1 "register_operand" "%v")) - (match_operand:DF 2 "register_operand" "v")) - (match_operand:DF 3 "register_operand" "0")))] - "CSKY_ISA_FEATURE (fpv2_df)" - "fnmscd\t%0, %1, %2") - -(define_insn "*fpuv2_fnmscd_2" - [(set (match_operand:DF 0 "register_operand" "=v") - (neg:DF (plus:DF (mult:DF (match_operand:DF 1 "register_operand" "v") - (match_operand:DF 2 "register_operand" "v")) - (match_operand:DF 3 "register_operand" "0"))))] - "CSKY_ISA_FEATURE (fpv2_df)" - "fnmscd\t%0, %1, %2") +(define_expand "div3" + [(set (match_operand:SFDF 0 "register_operand" "") + (div:SFDF (match_operand:SFDF 1 "csky_arith_float1_operand" "") + (match_operand:SFDF 2 "register_operand" "")))] + "CSKY_ISA_FEATURE(fpv2_) || CSKY_ISA_FEATURE(fpv3_)" + "") +(define_expand "divhf3" + [(set (match_operand:HF 0 "register_operand" "") + (div:HF (match_operand:HF 1 "csky_arith_float1_operand" "") + (match_operand:HF 2 "register_operand" "")))] + "CSKY_ISA_FEATURE(fpv3_hf)" + "") ;; ------------------------------------------------------------------------- ;; Float compare instructions ;; ------------------------------------------------------------------------- -(define_expand "cbranchsf4" +(define_expand "cbranch4" [(set (pc) (if_then_else (match_operator 0 "csky_float_comparison_operator" - [(match_operand:SF 1 "register_operand") - (match_operand:SF 2 "csky_compare_operand_float")]) + [(match_operand:SFDF 1 "register_operand") + (match_operand:SFDF 2 "csky_compare_operand_float")]) (label_ref (match_operand 3 "")) (pc)))] - "CSKY_ISA_FEATURE (fpv2_sf)" - " - { - enum rtx_code code = GET_CODE (operands[0]); - bool invert = csky_emit_compare_float (code, operands[1], operands[2]); +"CSKY_ISA_FEATURE(fpv2_) || CSKY_ISA_FEATURE(fpv3_)" +"{ + enum rtx_code code = GET_CODE (operands[0]); + bool invert; - if (invert) - emit_jump_insn (gen_csky_jbf (operands[3])); - else - emit_jump_insn (gen_csky_jbt (operands[3])); + invert = csky_emit_compare_float (code, operands[1], operands[2]); - DONE; - }") - -(define_insn "*fpuv2_unordered" - [(set (reg:CC 33) (unordered:CC (match_operand:SF 0 "register_operand" "v") - (match_operand:SF 1 "register_operand" "v")))] - "CSKY_ISA_FEATURE (fpv2_sf)" - "fcmpuos\t%0, %1") - -(define_insn "*fpuv2_unordered_zero" - [(set (reg:CC 33) (unordered:CC (match_operand:SF 0 "register_operand" "v") - (match_operand:SF 1 "csky_const_float0_operand" "i")))] - "CSKY_ISA_FEATURE (fpv2_sf)" - "fcmpuos\t%0, %0") - -(define_insn "*fpuv2_ne" - [(set (reg:CC 33) (ne:CC (match_operand:SF 0 "register_operand" "v") - (match_operand:SF 1 "register_operand" "v")))] - "CSKY_ISA_FEATURE (fpv2_sf)" - "fcmpnes\t%0, %1") - -(define_insn "*fpuv2_gt" - [(set (reg:CC 33) (gt:CC (match_operand:SF 0 "register_operand" "v") - (match_operand:SF 1 "register_operand" "v")))] - "CSKY_ISA_FEATURE (fpv2_sf)" - "fcmplts\t%1, %0") - -(define_insn 
"*fpuv2_ge" - [(set (reg:CC 33) (ge:CC (match_operand:SF 0 "register_operand" "v") - (match_operand:SF 1 "register_operand" "v")))] - "CSKY_ISA_FEATURE (fpv2_sf)" - "fcmphss\t%0, %1") - -(define_insn "*fpuv2_lt" - [(set (reg:CC 33) (lt:CC (match_operand:SF 0 "register_operand" "v") - (match_operand:SF 1 "register_operand" "v")))] - "CSKY_ISA_FEATURE (fpv2_sf)" - "fcmplts\t%0, %1") - -(define_insn "*fpuv2_le" - [(set (reg:CC 33) (le:CC (match_operand:SF 0 "register_operand" "v") - (match_operand:SF 1 "register_operand" "v")))] - "CSKY_ISA_FEATURE (fpv2_sf)" - "fcmphss\t%1, %0") - -(define_insn "*fpuv2_gez" - [(set (reg:CC 33) (ge:CC (match_operand:SF 0 "register_operand" "v") - (match_operand:SF 1 "csky_const_float0_operand" "i")))] - "CSKY_ISA_FEATURE (fpv2_sf)" - "fcmpzhss\t%0") - -(define_insn "*fpuv2_nez" - [(set (reg:CC 33) (ne:CC (match_operand:SF 0 "register_operand" "v") - (match_operand:SF 1 "csky_const_float0_operand" "i")))] - "CSKY_ISA_FEATURE (fpv2_sf)" - "fcmpznes\t%0") - - -(define_expand "cbranchdf4" - [(set (pc) (if_then_else (match_operator 0 "csky_float_comparison_operator" - [(match_operand:DF 1 "register_operand") - (match_operand:DF 2 "csky_compare_operand_float")]) - (label_ref (match_operand 3 "")) - (pc)))] - "CSKY_ISA_FEATURE (fpv2_df)" - " - { - enum rtx_code code = GET_CODE (operands[0]); - bool invert = csky_emit_compare_float (code, operands[1], operands[2]); + if (invert) + emit_jump_insn (gen_csky_jbf (operands[3])); + else + emit_jump_insn (gen_csky_jbt (operands[3])); - if (invert) - emit_jump_insn (gen_csky_jbf (operands[3])); - else - emit_jump_insn (gen_csky_jbt (operands[3])); + DONE; - DONE; }") -(define_insn "*fpuv2_dunordered" - [(set (reg:CC 33) (unordered:CC (match_operand:DF 0 "register_operand" "v") - (match_operand:DF 1 "register_operand" "v")))] - "CSKY_ISA_FEATURE (fpv2_df)" - "fcmpuod\t%0, %1") - -(define_insn "*fpuv2_dunordered_zero" - [(set (reg:CC 33) (unordered:CC (match_operand:DF 0 "register_operand" "v") - (match_operand:DF 1 "csky_const_float0_operand" "i")))] - "CSKY_ISA_FEATURE (fpv2_df)" - "fcmpuod\t%0, %0") - -(define_insn "*fpuv2_dne" - [(set (reg:CC 33) (ne:CC (match_operand:DF 0 "register_operand" "v") - (match_operand:DF 1 "register_operand" "v")))] - "CSKY_ISA_FEATURE (fpv2_df)" - "fcmpned\t%0, %1") - -(define_insn "*fpuv2_dgt" - [(set (reg:CC 33) (gt:CC (match_operand:DF 0 "register_operand" "v") - (match_operand:DF 1 "register_operand" "v")))] - "CSKY_ISA_FEATURE (fpv2_df)" - "fcmpltd\t%1, %0") - -(define_insn "*fpuv2_dge" - [(set (reg:CC 33) (ge:CC (match_operand:DF 0 "register_operand" "v") - (match_operand:DF 1 "register_operand" "v")))] - "CSKY_ISA_FEATURE (fpv2_df)" - "fcmphsd\t%0, %1") - -(define_insn "*fpuv2_dlt" - [(set (reg:CC 33) (lt:CC (match_operand:DF 0 "register_operand" "v") - (match_operand:DF 1 "register_operand" "v")))] - "CSKY_ISA_FEATURE (fpv2_df)" - "fcmpltd\t%0, %1") - -(define_insn "*fpuv2_dle" - [(set (reg:CC 33) (le:CC (match_operand:DF 0 "register_operand" "v") - (match_operand:DF 1 "register_operand" "v")))] - "CSKY_ISA_FEATURE (fpv2_df)" - "fcmphsd\t%1, %0") - -(define_insn "*fpuv2_dgez" - [(set (reg:CC 33) (ge:CC (match_operand:DF 0 "register_operand" "v") - (match_operand:DF 1 "csky_const_float0_operand" "i")))] - "CSKY_ISA_FEATURE (fpv2_df)" - "fcmpzhsd\t%0") - -(define_insn "*fpuv2_dnez" - [(set (reg:CC 33) (ne:CC (match_operand:DF 0 "register_operand" "v") - (match_operand:DF 1 "csky_const_float0_operand" "i")))] - "CSKY_ISA_FEATURE (fpv2_df)" - "fcmpzned\t%0") +(define_expand "cbranchhf4" + 
[(set (pc) (if_then_else (match_operator 0 "csky_float_comparison_operator" + [(match_operand:HF 1 "register_operand") + (match_operand:HF 2 "csky_compare_operand_float")]) + (label_ref (match_operand 3 "")) + (pc)))] +"CSKY_ISA_FEATURE(fpv3_hf)" +"{ + enum rtx_code code = GET_CODE (operands[0]); + bool invert; + invert = csky_emit_compare_float (code, operands[1], operands[2]); -;; ------------------------------------------------------------------------- -;; Float convert instructions -;; ------------------------------------------------------------------------- + if (invert) + emit_jump_insn (gen_csky_jbf (operands[3])); + else + emit_jump_insn (gen_csky_jbt (operands[3])); -;; DF <- SF -(define_insn "extendsfdf2" - [(set (match_operand:DF 0 "register_operand" "=v") - (float_extend:DF (match_operand:SF 1 "register_operand" "v")))] - "CSKY_ISA_FEATURE (fpv2_df)" - "fstod\t%0, %1") - -;; SF <- DF -(define_insn "truncdfsf2" - [(set (match_operand:SF 0 "register_operand" "=v") - (float_truncate:SF (match_operand:DF 1 "register_operand" "v")))] - "CSKY_ISA_FEATURE (fpv2_df)" - "fdtos\t%0, %1") - -;; SF <- SI -(define_insn "floatsisf2" - [(set (match_operand:SF 0 "register_operand" "=v") - (float:SF (match_operand:SI 1 "register_operand" "v")))] - "CSKY_ISA_FEATURE (fpv2_sf)" - "fsitos\t%0, %1") - -;; DF <- SI -(define_insn "floatsidf2" - [(set (match_operand:DF 0 "register_operand" "=v") - (float:DF (match_operand:SI 1 "register_operand" "v")))] - "CSKY_ISA_FEATURE (fpv2_df)" - "fsitod\t%0, %1") - -;; SF <- unsigned SI -(define_insn "floatunssisf2" - [(set (match_operand:SF 0 "register_operand" "=v") - (unsigned_float:SF (match_operand:SI 1 "register_operand" "v")))] - "CSKY_ISA_FEATURE (fpv2_sf)" - "fuitos\t%0, %1") - -;; DF <- unsigned SI -(define_insn "floatunssidf2" - [(set (match_operand:DF 0 "register_operand" "=v") - (unsigned_float:DF (match_operand:SI 1 "register_operand" "v")))] - "CSKY_ISA_FEATURE (fpv2_df)" - "fuitod\t%0, %1") - -;; SI <- SF -(define_insn "fix_truncsfsi2" - [(set (match_operand:SI 0 "register_operand" "=v") - (fix:SI (match_operand:SF 1 "register_operand" "v")))] - "CSKY_ISA_FEATURE (fpv2_sf)" - "fstosi.rz\t%0, %1") - -;; SI <- DF -(define_insn "fix_truncdfsi2" - [(set (match_operand:SI 0 "register_operand" "=v") - (fix:SI (match_operand:DF 1 "register_operand" "v")))] - "CSKY_ISA_FEATURE (fpv2_df)" - "fdtosi.rz\t%0, %1") - -;; unsigned SI <- SF -(define_insn "fixuns_truncsfsi2" - [(set (match_operand:SI 0 "register_operand" "=v") - (unsigned_fix:SI (match_operand:SF 1 "register_operand" "v")))] - "CSKY_ISA_FEATURE (fpv2_sf)" - "fstoui.rz\t%0, %1") - -;; unsigned SI <- DF -(define_insn "fixuns_truncdfsi2" - [(set (match_operand:SI 0 "register_operand" "=v") - (unsigned_fix:SI (match_operand:DF 1 "register_operand" "v")))] - "CSKY_ISA_FEATURE (fpv2_df)" - "fdtoui.rz\t%0, %1") + DONE; +}") ;; ------------------------------------------------------------------------- -;; Float mov instructions +;; Instructions for float cstore ;; ------------------------------------------------------------------------- -;; Note: movsf and movdf patterns are in csky.md. 
- -;; cstore SF -(define_expand "cstoresf4" +(define_expand "cstore4" [(set (match_operand:SI 0 "register_operand" "") - (match_operator 1 "ordered_comparison_operator" - [(match_operand:SF 2 "register_operand" "") - (match_operand:SF 3 "csky_compare_operand_float" "")]))] - "CSKY_ISA_FEATURE (fpv2_sf)" - " - { - bool invert = csky_emit_compare_float (GET_CODE (operands[1]), - operands[2], operands[3]); - if (invert) + (match_operator 1 "csky_float_comparison_operator" + [(match_operand:SFDF 2 "register_operand" "") + (match_operand:SFDF 3 "csky_compare_operand_float" "")]))] + "CSKY_ISA_FEATURE (fpv2_) || CSKY_ISA_FEATURE(fpv3_)" + "{ + bool invert; + + invert = csky_emit_compare_float (GET_CODE (operands[1]), + operands[2], operands[3]); + if(invert) emit_insn (gen_mvcv (operands[0])); else emit_insn (gen_mvc (operands[0])); @@ -547,21 +333,91 @@ }" ) -;; cstore DF -(define_expand "cstoredf4" +(define_expand "cstorehf4" [(set (match_operand:SI 0 "register_operand" "") - (match_operator 1 "ordered_comparison_operator" - [(match_operand:DF 2 "register_operand" "") - (match_operand:DF 3 "csky_compare_operand_float" "")]))] - "CSKY_ISA_FEATURE (fpv2_df)" - " - { - bool invert = csky_emit_compare_float (GET_CODE (operands[1]), - operands[2], operands[3]); - if (invert) + (match_operator 1 "csky_float_comparison_operator" + [(match_operand:HF 2 "register_operand" "") + (match_operand:HF 3 "csky_compare_operand_float" "")]))] + "CSKY_ISA_FEATURE(fpv3_hf)" + "{ + bool invert; + + invert = csky_emit_compare_float (GET_CODE (operands[1]), + operands[2], operands[3]); + if(invert) emit_insn (gen_mvcv (operands[0])); else emit_insn (gen_mvc (operands[0])); DONE; }" ) + +;; ------------------------------------------------------------------------- +;; Float convert instructions +;; ------------------------------------------------------------------------- + +;; SF <- HF +(define_expand "extendhfsf2" + [(set (match_operand:SF 0 "register_operand" "") + (float_extend:SF (match_operand:HF 1 "register_operand" "")))] + "CSKY_ISA_FEATURE(fpv3_hf)" + "") + +;; HF <- SF +(define_expand "truncsfhf2" + [(set (match_operand:HF 0 "register_operand" "") + (float_truncate:HF (match_operand:SF 1 "register_operand" "")))] + "CSKY_ISA_FEATURE(fpv3_hf)" + "") + +;; DF <- SF +(define_expand "extendsfdf2" + [(set (match_operand:DF 0 "register_operand" "") + (float_extend:DF (match_operand:SF 1 "register_operand" "")))] + "CSKY_ISA_FEATURE(fpv2_df) || CSKY_ISA_FEATURE(fpv3_df)" + "") + +;; SF <- DF +(define_expand "truncdfsf2" + [(set (match_operand:SF 0 "register_operand" "") + (float_truncate:SF (match_operand:DF 1 "register_operand" "")))] + "CSKY_ISA_FEATURE(fpv2_df) || CSKY_ISA_FEATURE(fpv3_df)" + "") + +;; HF <- unsigned SI,SI +(define_expand "floatsihf2" + [(set (match_operand:HF 0 "register_operand" "") + (FLOAT_SU:HF (match_operand:SI 1 "register_operand" "")))] + "CSKY_ISA_FEATURE(fpv3_hf)" + "") + +;; DF,SF <- unsigned SI,SI +(define_expand "floatsi2" + [(set (match_operand:SFDF 0 "register_operand" "") + (FLOAT_SU:SFDF (match_operand:SI 1 "register_operand" "")))] + "CSKY_ISA_FEATURE(fpv2_) || CSKY_ISA_FEATURE(fpv3_)" + "") + +;; HF <- unsigned HI,HI +(define_expand "floathihf2" + [(set (match_operand:HF 0 "register_operand" "") + (FLOAT_SU:HF (match_operand:HI 1 "register_operand" "")))] + "CSKY_ISA_FEATURE(fpv3_hi) && CSKY_ISA_FEATURE(fpv3_hf)" + "") + +;; unsigned SI,SI <- HF +(define_expand "fix_trunchfsi2" + [(set (match_operand:SI 0 "register_operand" "") + (FIX_SU:SI (fix:HF (match_operand:HF 1 
"register_operand" ""))))] + "CSKY_ISA_FEATURE(fpv3_hf)" + "") + +;; unsigned SI,SI <- DF,SF +(define_expand "fix_truncsi2" + [(set (match_operand:SI 0 "register_operand" "") + (FIX_SU:SI (fix:SFDF (match_operand:SFDF 1 "register_operand" ""))))] + "CSKY_ISA_FEATURE(fpv2_) || CSKY_ISA_FEATURE(fpv3_)" + "") + +(include "csky_insn_fpuv3.md") +(include "csky_insn_fpuv2.md") diff --git a/gcc/config/csky/csky_insn_fpuv2.md b/gcc/config/csky/csky_insn_fpuv2.md new file mode 100644 index 0000000..0a680f8 --- /dev/null +++ b/gcc/config/csky/csky_insn_fpuv2.md @@ -0,0 +1,470 @@ + +;; ------------------------------------------------------------------------- +;; Float Abs instructions +;; ------------------------------------------------------------------------- + +(define_insn "*fpuv2_abssf2" + [(set (match_operand:SF 0 "register_operand" "=v,a,r") + (abs:SF (match_operand:SF 1 "register_operand" "v, 0,r")))] + "CSKY_ISA_FEATURE (fpv2_sf)" + "@ + fabss\t%0, %1 + bclri\t%0, %1, 31 + bclri\t%0, %1, 31" + [(set_attr "length" "4,2,4")]) + +(define_insn "*fpuv2_absdf2" + [(set (match_operand:DF 0 "register_operand" "=v") + (abs:DF (match_operand:DF 1 "register_operand" "v")))] + "CSKY_ISA_FEATURE (fpv2_df)" + "fabsd\t%0, %1") + + +;; ------------------------------------------------------------------------- +;; Float Neg instructions +;; ------------------------------------------------------------------------- + +(define_insn "*fpuv2_negsf2" + [(set (match_operand:SF 0 "register_operand" "=v") + (neg:SF (match_operand:SF 1 "register_operand" "v")))] + "CSKY_ISA_FEATURE (fpv2_sf)" + "fnegs\t%0, %1") + +(define_insn "*fpuv2_negdf2" + [(set (match_operand:DF 0 "register_operand" "=v") + (neg:DF (match_operand:DF 1 "register_operand" "v")))] + "CSKY_ISA_FEATURE (fpv2_df)" + "fnegd\t%0, %1") + + +;; ------------------------------------------------------------------------- +;; Float Sqrt instructions +;; ------------------------------------------------------------------------- + +(define_insn "*fpuv2_sqrtsf2" + [(set (match_operand:SF 0 "register_operand" "=v") + (sqrt:SF (match_operand:SF 1 "register_operand" "v")))] + "CSKY_ISA_FEATURE (fpv2_sf)" + "fsqrts\t%0, %1") + +(define_insn "*fpuv2_sqrtdf2" + [(set (match_operand:DF 0 "register_operand" "=v") + (sqrt:DF (match_operand:DF 1 "register_operand" "v")))] + "CSKY_ISA_FEATURE (fpv2_divd)" + "fsqrtd\t%0, %1") + + +;; ------------------------------------------------------------------------- +;; Float Add instructions +;; ------------------------------------------------------------------------- + +(define_insn "*fpuv2_addsf3" + [(set (match_operand:SF 0 "register_operand" "=v") + (plus:SF (match_operand:SF 1 "register_operand" "v") + (match_operand:SF 2 "register_operand" "v")))] + "CSKY_ISA_FEATURE (fpv2_sf)" + "fadds\t%0, %1, %2") + +(define_insn "*fpuv2_adddf3" + [(set (match_operand:DF 0 "register_operand" "=v") + (plus:DF (match_operand:DF 1 "register_operand" "v") + (match_operand:DF 2 "register_operand" "v")))] + "CSKY_ISA_FEATURE (fpv2_df)" + "faddd\t%0, %1, %2") + + +;; ------------------------------------------------------------------------- +;; Float Sub instructions +;; ------------------------------------------------------------------------- + +(define_insn "*fpuv2_subsf3" + [(set (match_operand:SF 0 "register_operand" "=v") + (minus:SF (match_operand:SF 1 "register_operand" "v") + (match_operand:SF 2 "register_operand" "v")))] + "CSKY_ISA_FEATURE (fpv2_sf)" + "fsubs\t%0, %1, %2") + +(define_insn "*fpuv2_subdf3" + [(set (match_operand:DF 0 
"register_operand" "=v") + (minus:DF (match_operand:DF 1 "register_operand" "v") + (match_operand:DF 2 "register_operand" "v")))] + "CSKY_ISA_FEATURE (fpv2_df)" + "fsubd\t%0, %1, %2") + + +;; ------------------------------------------------------------------------- +;; Float Mul instructions +;; ------------------------------------------------------------------------- + +(define_insn "*fpv2_mulsf3" + [(set (match_operand:SF 0 "register_operand" "=v") + (mult:SF (match_operand:SF 1 "register_operand" "v") + (match_operand:SF 2 "register_operand" "v")))] + "CSKY_ISA_FEATURE (fpv2_sf)" + "fmuls\t%0, %1, %2") + +(define_insn "*fpv2_muldf3" + [(set (match_operand:DF 0 "register_operand" "=v") + (mult:DF (match_operand:DF 1 "register_operand" "v") + (match_operand:DF 2 "register_operand" "v")))] + "CSKY_ISA_FEATURE (fpv2_df)" + "fmuld\t%0, %1, %2") + +(define_insn "*fpuv2_nmulsf3_1" + [(set (match_operand:SF 0 "register_operand" "=v") + (mult:SF (neg:SF (match_operand:SF 1 "register_operand" "%v")) + (match_operand:SF 2 "register_operand" "v")))] + "CSKY_ISA_FEATURE (fpv2_sf) && !flag_rounding_math" + "fnmuls\t%0, %1, %2") + +(define_insn "*fpuv2_nmulsf3_2" + [(set (match_operand:SF 0 "register_operand" "=v") + (neg:SF (mult:SF (match_operand:SF 1 "register_operand" "v") + (match_operand:SF 2 "register_operand" "v"))))] + "CSKY_ISA_FEATURE (fpv2_sf)" + "fnmuls\t%0, %1, %2") + +(define_insn "*fpuv2_nmuldf3_1" + [(set (match_operand:DF 0 "register_operand" "=v") + (mult:DF (neg:DF (match_operand:DF 1 "register_operand" "%v")) + (match_operand:DF 2 "register_operand" "v")))] + "CSKY_ISA_FEATURE (fpv2_df) && !flag_rounding_math" + "fnmuld\t%0, %1, %2") + +(define_insn "*fpuv2_nmuldf3_2" + [(set (match_operand:DF 0 "register_operand" "=v") + (neg:DF (mult:DF (match_operand:DF 1 "register_operand" "v") + (match_operand:DF 2 "register_operand" "v"))))] + "CSKY_ISA_FEATURE (fpv2_df)" + "fnmuld\t%0, %1, %2") + + +;; ------------------------------------------------------------------------- +;; Float Div instructions +;; ------------------------------------------------------------------------- + +(define_insn "*fpuv2_divsf3" + [(set (match_operand:SF 0 "register_operand" "=v") + (div:SF (match_operand:SF 1 "register_operand" "v") + (match_operand:SF 2 "register_operand" "v")))] + "CSKY_ISA_FEATURE (fpv2_sf)" + "fdivs\t%0, %1, %2") + +(define_insn "*fpuv2_1_divsf3" + [(set (match_operand:SF 0 "register_operand" "=v") + (div:SF (match_operand:SF 1 "csky_const_float1_operand" "i") + (match_operand:SF 2 "register_operand" "v")))] + "CSKY_ISA_FEATURE (fpv2_sf)" + "frecips\t%0, %2") + +(define_insn "*fpuv2_divdf3" + [(set (match_operand:DF 0 "register_operand" "=v") + (div:DF (match_operand:DF 1 "register_operand" "v") + (match_operand:DF 2 "register_operand" "v")))] + "CSKY_ISA_FEATURE (fpv2_divd)" + "fdivd\t%0, %1, %2") + +(define_insn "*fpuv2_1_divdf3" + [(set (match_operand:DF 0 "register_operand" "=v") + (div:DF (match_operand:DF 1 "csky_const_float1_operand" "i") + (match_operand:DF 2 "register_operand" "v")))] + "CSKY_ISA_FEATURE (fpv2_divd)" + "frecipd\t%0, %2") + + +;; ------------------------------------------------------------------------- +;; Float add(sub) with mult instructions +;; ------------------------------------------------------------------------- + +;; vrz <= vrz + vrx * vry +(define_insn "*fpuv2_fmacs" + [(set (match_operand:SF 0 "register_operand" "=v") + (plus:SF (mult:SF (match_operand:SF 1 "register_operand" "v") + (match_operand:SF 2 "register_operand" "v")) + (match_operand:SF 3 
"register_operand" "0")))] + "CSKY_ISA_FEATURE (fpv2_sf)" + "fmacs\t%0, %1, %2") + +(define_insn "*fpuv2_fmacd" + [(set (match_operand:DF 0 "register_operand" "=v") + (plus:DF (mult:DF (match_operand:DF 1 "register_operand" "v") + (match_operand:DF 2 "register_operand" "v")) + (match_operand:DF 3 "register_operand" "0")))] + "CSKY_ISA_FEATURE (fpv2_df)" + "fmacd\t%0, %1, %2") + +;; vrz <= vrz - vrx * vry +(define_insn "*fpuv2_fnmacs" + [(set (match_operand:SF 0 "register_operand" "=v") + (minus:SF (match_operand:SF 1 "register_operand" "0") + (mult:SF (match_operand:SF 2 "register_operand" "v") + (match_operand:SF 3 "register_operand" "v"))))] + "CSKY_ISA_FEATURE (fpv2_sf)" + "fnmacs\t%0, %2, %3") + +(define_insn "*fpuv2_fnmacd" + [(set (match_operand:DF 0 "register_operand" "=v") + (minus:DF (match_operand:DF 1 "register_operand" "0") + (mult:DF (match_operand:DF 2 "register_operand" "v") + (match_operand:DF 3 "register_operand" "v"))))] + "CSKY_ISA_FEATURE (fpv2_df)" + "fnmacd\t%0, %2, %3") + +;; vrz <= vrx * vry - vrz +(define_insn "*fpuv2_fmscs" + [(set (match_operand:SF 0 "register_operand" "=v") + (minus:SF (mult:SF (match_operand:SF 1 "register_operand" "v") + (match_operand:SF 2 "register_operand" "v")) + (match_operand:SF 3 "register_operand" "0")))] + "CSKY_ISA_FEATURE (fpv2_sf)" + "fmscs\t%0, %1, %2") + +(define_insn "*fpuv2_fmscd" + [(set (match_operand:DF 0 "register_operand" "=v") + (minus:DF (mult:DF (match_operand:DF 1 "register_operand" "v") + (match_operand:DF 2 "register_operand" "v")) + (match_operand:DF 3 "register_operand" "0")))] + "CSKY_ISA_FEATURE (fpv2_df)" + "fmscd\t%0, %1, %2") + +;; vrz = - (vrz + vrx * vry) +(define_insn "*fpuv2_fnmscs_1" + [(set (match_operand:SF 0 "register_operand" "=v") + (minus:SF (mult:SF (neg:SF (match_operand:SF 1 "register_operand" "%v")) + (match_operand:SF 2 "register_operand" "v")) + (match_operand:SF 3 "register_operand" "0")))] + "CSKY_ISA_FEATURE (fpv2_sf)" + "fnmscs\t%0, %1, %2") + +(define_insn "*fpuv2_fnmscs_2" + [(set (match_operand:SF 0 "register_operand" "=v") + (neg:SF (plus:SF (mult:SF (match_operand:SF 1 "register_operand" "v") + (match_operand:SF 2 "register_operand" "v")) + (match_operand:SF 3 "register_operand" "0"))))] + "CSKY_ISA_FEATURE (fpv2_sf)" + "fnmscs\t%0, %1, %2") + +(define_insn "*fpuv2_fnmscd_1" + [(set (match_operand:DF 0 "register_operand" "=v") + (minus:DF (mult:DF (neg:DF (match_operand:DF 1 "register_operand" "%v")) + (match_operand:DF 2 "register_operand" "v")) + (match_operand:DF 3 "register_operand" "0")))] + "CSKY_ISA_FEATURE (fpv2_df)" + "fnmscd\t%0, %1, %2") + +(define_insn "*fpuv2_fnmscd_2" + [(set (match_operand:DF 0 "register_operand" "=v") + (neg:DF (plus:DF (mult:DF (match_operand:DF 1 "register_operand" "v") + (match_operand:DF 2 "register_operand" "v")) + (match_operand:DF 3 "register_operand" "0"))))] + "CSKY_ISA_FEATURE (fpv2_df)" + "fnmscd\t%0, %1, %2") + + +;; ------------------------------------------------------------------------- +;; Float compare instructions +;; ------------------------------------------------------------------------- + +(define_insn "*fpuv2_unordered" + [(set (reg:CC 33) (unordered:CC (match_operand:SF 0 "register_operand" "v") + (match_operand:SF 1 "register_operand" "v")))] + "CSKY_ISA_FEATURE (fpv2_sf)" + "fcmpuos\t%0, %1") + +(define_insn "*fpuv2_unordered_zero" + [(set (reg:CC 33) (unordered:CC (match_operand:SF 0 "register_operand" "v") + (match_operand:SF 1 "csky_const_float0_operand" "i")))] + "CSKY_ISA_FEATURE (fpv2_sf)" + "fcmpuos\t%0, %0") + 
+(define_insn "*fpuv2_ne" + [(set (reg:CC 33) (ne:CC (match_operand:SF 0 "register_operand" "v") + (match_operand:SF 1 "register_operand" "v")))] + "CSKY_ISA_FEATURE (fpv2_sf)" + "fcmpnes\t%0, %1") + +(define_insn "*fpuv2_gt" + [(set (reg:CC 33) (gt:CC (match_operand:SF 0 "register_operand" "v") + (match_operand:SF 1 "register_operand" "v")))] + "CSKY_ISA_FEATURE (fpv2_sf)" + "fcmplts\t%1, %0") + +(define_insn "*fpuv2_ge" + [(set (reg:CC 33) (ge:CC (match_operand:SF 0 "register_operand" "v") + (match_operand:SF 1 "register_operand" "v")))] + "CSKY_ISA_FEATURE (fpv2_sf)" + "fcmphss\t%0, %1") + +(define_insn "*fpuv2_lt" + [(set (reg:CC 33) (lt:CC (match_operand:SF 0 "register_operand" "v") + (match_operand:SF 1 "register_operand" "v")))] + "CSKY_ISA_FEATURE (fpv2_sf)" + "fcmplts\t%0, %1") + +(define_insn "*fpuv2_le" + [(set (reg:CC 33) (le:CC (match_operand:SF 0 "register_operand" "v") + (match_operand:SF 1 "register_operand" "v")))] + "CSKY_ISA_FEATURE (fpv2_sf)" + "fcmphss\t%1, %0") + +(define_insn "*fpuv2_gez" + [(set (reg:CC 33) (ge:CC (match_operand:SF 0 "register_operand" "v") + (match_operand:SF 1 "csky_const_float0_operand" "i")))] + "CSKY_ISA_FEATURE (fpv2_sf)" + "fcmpzhss\t%0") + +(define_insn "*fpuv2_nez" + [(set (reg:CC 33) (ne:CC (match_operand:SF 0 "register_operand" "v") + (match_operand:SF 1 "csky_const_float0_operand" "i")))] + "CSKY_ISA_FEATURE (fpv2_sf)" + "fcmpznes\t%0") + +(define_insn "*fpuv2_dunordered" + [(set (reg:CC 33) (unordered:CC (match_operand:DF 0 "register_operand" "v") + (match_operand:DF 1 "register_operand" "v")))] + "CSKY_ISA_FEATURE (fpv2_df)" + "fcmpuod\t%0, %1") + +(define_insn "*fpuv2_dunordered_zero" + [(set (reg:CC 33) (unordered:CC (match_operand:DF 0 "register_operand" "v") + (match_operand:DF 1 "csky_const_float0_operand" "i")))] + "CSKY_ISA_FEATURE (fpv2_df)" + "fcmpuod\t%0, %0") + +(define_insn "*fpuv2_dne" + [(set (reg:CC 33) (ne:CC (match_operand:DF 0 "register_operand" "v") + (match_operand:DF 1 "register_operand" "v")))] + "CSKY_ISA_FEATURE (fpv2_df)" + "fcmpned\t%0, %1") + +(define_insn "*fpuv2_dgt" + [(set (reg:CC 33) (gt:CC (match_operand:DF 0 "register_operand" "v") + (match_operand:DF 1 "register_operand" "v")))] + "CSKY_ISA_FEATURE (fpv2_df)" + "fcmpltd\t%1, %0") + +(define_insn "*fpuv2_dge" + [(set (reg:CC 33) (ge:CC (match_operand:DF 0 "register_operand" "v") + (match_operand:DF 1 "register_operand" "v")))] + "CSKY_ISA_FEATURE (fpv2_df)" + "fcmphsd\t%0, %1") + +(define_insn "*fpuv2_dlt" + [(set (reg:CC 33) (lt:CC (match_operand:DF 0 "register_operand" "v") + (match_operand:DF 1 "register_operand" "v")))] + "CSKY_ISA_FEATURE (fpv2_df)" + "fcmpltd\t%0, %1") + +(define_insn "*fpuv2_dle" + [(set (reg:CC 33) (le:CC (match_operand:DF 0 "register_operand" "v") + (match_operand:DF 1 "register_operand" "v")))] + "CSKY_ISA_FEATURE (fpv2_df)" + "fcmphsd\t%1, %0") + +(define_insn "*fpuv2_dgez" + [(set (reg:CC 33) (ge:CC (match_operand:DF 0 "register_operand" "v") + (match_operand:DF 1 "csky_const_float0_operand" "i")))] + "CSKY_ISA_FEATURE (fpv2_df)" + "fcmpzhsd\t%0") + +(define_insn "*fpuv2_dnez" + [(set (reg:CC 33) (ne:CC (match_operand:DF 0 "register_operand" "v") + (match_operand:DF 1 "csky_const_float0_operand" "i")))] + "CSKY_ISA_FEATURE (fpv2_df)" + "fcmpzned\t%0") + + +;; ------------------------------------------------------------------------- +;; Float convert instructions +;; ------------------------------------------------------------------------- + +;; DF <- SF +(define_insn "*fpuv2_extendsfdf2" + [(set (match_operand:DF 0 
"register_operand" "=v") + (float_extend:DF (match_operand:SF 1 "register_operand" "v")))] + "CSKY_ISA_FEATURE (fpv2_df)" + "fstod\t%0, %1") + +;; SF <- DF +(define_insn "*fpuv2_truncdfsf2" + [(set (match_operand:SF 0 "register_operand" "=v") + (float_truncate:SF (match_operand:DF 1 "register_operand" "v")))] + "CSKY_ISA_FEATURE (fpv2_df)" + "fdtos\t%0, %1") + +;; SF <- SI +(define_insn "*fpuv2_floatsisf2" + [(set (match_operand:SF 0 "register_operand" "=v") + (float:SF (match_operand:SI 1 "register_operand" "v")))] + "CSKY_ISA_FEATURE (fpv2_sf)" + "fsitos\t%0, %1") + +;; DF <- SI +(define_insn "*fpuv2_floatsidf2" + [(set (match_operand:DF 0 "register_operand" "=v") + (float:DF (match_operand:SI 1 "register_operand" "v")))] + "CSKY_ISA_FEATURE (fpv2_df)" + "fsitod\t%0, %1") + +;; SF <- unsigned SI +(define_insn "*fpuv2_floatunssisf2" + [(set (match_operand:SF 0 "register_operand" "=v") + (unsigned_float:SF (match_operand:SI 1 "register_operand" "v")))] + "CSKY_ISA_FEATURE (fpv2_sf)" + "fuitos\t%0, %1") + +;; DF <- unsigned SI +(define_insn "*fpuv2_floatunssidf2" + [(set (match_operand:DF 0 "register_operand" "=v") + (unsigned_float:DF (match_operand:SI 1 "register_operand" "v")))] + "CSKY_ISA_FEATURE (fpv2_df)" + "fuitod\t%0, %1") + +;; SI <- SF +(define_insn "*fpuv2_fix_truncsfsi2" + [(set (match_operand:SI 0 "register_operand" "=v") + (fix:SI (fix:SF (match_operand:SF 1 "register_operand" "v"))))] + "CSKY_ISA_FEATURE (fpv2_sf)" + "fstosi.rz\t%0, %1") + +;; SI <- DF +(define_insn "*fpuv2_fix_truncdfsi2" + [(set (match_operand:SI 0 "register_operand" "=v") + (fix:SI (fix:DF (match_operand:DF 1 "register_operand" "v"))))] + "CSKY_ISA_FEATURE (fpv2_df)" + "fdtosi.rz\t%0, %1") + +;; unsigned SI <- SF +(define_insn "*fpuv2_fixuns_truncsfsi2" + [(set (match_operand:SI 0 "register_operand" "=v") + (unsigned_fix:SI (fix:SF (match_operand:SF 1 "register_operand" "v"))))] + "CSKY_ISA_FEATURE (fpv2_sf)" + "fstoui.rz\t%0, %1") + +;; unsigned SI <- DF +(define_insn "*fpuv2_fixuns_truncdfsi2" + [(set (match_operand:SI 0 "register_operand" "=v") + (unsigned_fix:SI (fix:DF (match_operand:DF 1 "register_operand" "v"))))] + "CSKY_ISA_FEATURE (fpv2_df)" + "fdtoui.rz\t%0, %1") + + +;; ------------------------------------------------------------------------- +;; Float mov instructions +;; ------------------------------------------------------------------------- + +(define_insn "*fpuv2_movsf" + [(set (match_operand:SF 0 "nonimmediate_operand" "=r,r, r,m,v,r,Q,v,v,v") + (match_operand:SF 1 "general_operand" " r,m,mF,r,r,v,v,Q,v,W"))] + "CSKY_ISA_FEATURE (fpv2_sf)" + "* return csky_output_move(insn, operands, SFmode);" +) + +(define_insn "*fpuv2_movdf" + [(set (match_operand:DF 0 "nonimmediate_operand" "=r,r, r,m, v,?r,Q,v,v,v") + (match_operand:DF 1 "general_operand" " r,m,mF,r,?r, v,v,Q,v,m"))] + "CSKY_ISA_FEATURE (fpv2_df)" + "* return csky_output_movedouble(operands, DFmode);" + [(set (attr "length") + (symbol_ref "csky_get_movedouble_length (operands)"))] +) diff --git a/gcc/config/csky/csky_insn_fpuv3.md b/gcc/config/csky/csky_insn_fpuv3.md new file mode 100644 index 0000000..053673c --- /dev/null +++ b/gcc/config/csky/csky_insn_fpuv3.md @@ -0,0 +1,497 @@ + +(define_c_enum "unspec" [ + UNSPEC_MAXNM_F3 + UNSPEC_MINNM_F3 +]) + +;; ------------------------------------------------------------------------- +;; Float mov instructions +;; ------------------------------------------------------------------------- + +(define_insn "*fpv3_movhf" + [(set (match_operand:HF 0 "nonimmediate_operand" "=r,r,v,r,m,r,Q,v,v,v, 
v") + (match_operand:HF 1 "general_operand" " r,F,r,v,r,m,v,Q,v,W,Dv"))] + "CSKY_ISA_FEATURE(fpv3_hf)" + "* + switch (which_alternative) + { + case 2: + return \"fmtvr.16\\t%0, %1\"; + case 3: + return \"fmfvr.16\\t%0, %1\"; + case 6: + case 7: + case 9: + return fpuv3_output_move(operands); + case 8: + return \"fmov.16\\t%0, %1\"; + case 10: + return \"fmovi.16\\t%0, %1\"; + case 1: + { + long bits; + rtx ops[4]; + + bits = real_to_target (NULL, CONST_DOUBLE_REAL_VALUE (operands[1]), HFmode); + ops[0] = operands[0]; + ops[1] = GEN_INT (bits); + + output_asm_insn (\"lrw\\t%0, %1\", ops); + return \"\"; + } + default: + return csky_output_move(insn, operands, HFmode); + } + " +) + +(define_insn "*fpv3_movsf" + [(set (match_operand:SF 0 "nonimmediate_operand" "=r,r, r,m,v,r,Q,v,v,v, v") + (match_operand:SF 1 "general_operand" " r,m,mF,r,r,v,v,Q,v,W,Dv"))] + "CSKY_ISA_FEATURE(fpv3_sf)" + "* + switch (which_alternative) + { + case 4: + return \"fmtvr.32.1\\t%0, %1\"; + case 5: + return \"fmfvr.32.1\\t%0, %1\"; + case 6: + case 7: + case 9: + return fpuv3_output_move(operands); + case 8: + return \"fmov.32\\t%0, %1\"; + case 10: + return \"fmovi.32\\t%0, %1\"; + default: + return csky_output_move(insn, operands, SFmode); + } + " +) + +(define_insn "*fpv3_movdf" + [(set (match_operand:DF 0 "nonimmediate_operand" "=r,r, r,m,v,?r,Q,v,v,v, v") + (match_operand:DF 1 "general_operand" " r,m,mF,r,?r,v,v,Q,v,m,Dv"))] + "CSKY_ISA_FEATURE(fpv3_df)" + "* + switch (which_alternative) + { + case 4: + if (TARGET_BIG_ENDIAN) + return \"fmtvr.64\\t%0, %R1, %1\"; + return \"fmtvr.64\\t%0, %1, %R1\"; + case 5: + if (TARGET_BIG_ENDIAN) + return \"fmfvr.64\\t%R0, %0, %1\"; + return \"fmfvr.64\\t%0, %R0, %1\"; + case 6: + case 7: + case 9: + return fpuv3_output_move(operands); + case 8: + return \"fmov.64\\t%0, %1\"; + case 10: + return \"fmovi.64\\t%0, %1\"; + default: + return csky_output_movedouble(operands, DFmode); + } + " +) + +;; ------------------------------------------------------------------------- +;; Float Mul instructions +;; ------------------------------------------------------------------------- + +(define_insn "*fpv3_mul3" + [(set (match_operand:F3ANY 0 "register_operand" "=v") + (mult:F3ANY (match_operand:F3ANY 1 "register_operand" " v") + (match_operand:F3ANY 2 "register_operand" " v")))] + "CSKY_ISA_FEATURE(fpv3_)" + "fmul.\t%0, %1, %2" +) + +;; ------------------------------------------------------------------------- +;; Float Muladd and mulsub instructions +;; ------------------------------------------------------------------------- + +(define_insn "*fpv3_mula3" + [(set (match_operand:F3ANY 0 "register_operand" "+v") + (plus:F3ANY (mult:F3ANY (match_operand:F3ANY 1 "register_operand" " v") + (match_operand:F3ANY 2 "register_operand" " v")) + (match_dup 0)))] + "CSKY_ISA_FEATURE(fpv3_)" + "fmula.\t%0, %1, %2" +) + +(define_insn "*fpv3_muls3" + [(set (match_operand:F3ANY 0 "register_operand" "+v") + (minus:F3ANY (match_dup 0) + (mult:F3ANY (match_operand:F3ANY 1 "register_operand" " v") + (match_operand:F3ANY 2 "register_operand" " v"))))] + "CSKY_ISA_FEATURE(fpv3_)" + "fmuls.\t%0, %1, %2" +) + +;; ------------------------------------------------------------------------- +;; Float fmula/fmuls/fnmula/fnmuls instructions +;; ------------------------------------------------------------------------- + +(define_insn "*fpv3_fmuls_4" + [(set (match_operand:F3ANY 0 "register_operand" "=v") + (fma:F3ANY (neg:F3ANY (match_operand:F3ANY 1 "register_operand" "v")) + (match_operand:F3ANY 2 
"register_operand" "v") + (match_operand:F3ANY 3 "register_operand" "0")))] + "CSKY_ISA_FEATURE(fpv3_)" + "ffmuls.\t%0, %1, %2" +) + +(define_insn "*fpv3_fmula_4" + [(set (match_operand:F3ANY 0 "register_operand" "=v") + (fma:F3ANY (match_operand:F3ANY 1 "register_operand" " v") + (match_operand:F3ANY 2 "register_operand" " v") + (match_operand:F3ANY 3 "register_operand" "0")))] + "CSKY_ISA_FEATURE(fpv3_)" + "ffmula.\t%0, %1, %2" +) + +(define_insn "*fpv3_fnmula_4" + [(set (match_operand:F3ANY 0 "register_operand" "=v") + (neg: F3ANY (fma:F3ANY (match_operand:F3ANY 1 "register_operand" " v") + (match_operand:F3ANY 2 "register_operand" " v") + (match_operand:F3ANY 3 "register_operand" "0"))))] + "CSKY_ISA_FEATURE(fpv3_)" + "ffnmula.\t%0, %1, %2" +) + +(define_insn "*fpv3_fnmuls_4" + [(set (match_operand:F3ANY 0 "register_operand" "=v") + (fma:F3ANY (match_operand:F3ANY 1 "register_operand" " v") + (match_operand:F3ANY 2 "register_operand" " v") + (neg:F3ANY (match_operand:F3ANY 3 "register_operand" "0"))))] + "CSKY_ISA_FEATURE(fpv3_sf)" + "ffnmuls.\t%0, %1, %2" +) + +;; ------------------------------------------------------------------------- +;; Float div/recipe/sqrt instructions +;; ------------------------------------------------------------------------- + +(define_insn "*fpv3_div3" + [(set (match_operand:F3ANY 0 "register_operand" "=v") + (div:F3ANY (match_operand:F3ANY 1 "register_operand" " v") + (match_operand:F3ANY 2 "register_operand" " v")))] + "CSKY_ISA_FEATURE(fpv3_)" + "fdiv.\t%0, %1, %2" +) + +(define_insn "*fpv3_recip3" + [(set (match_operand:F3ANY 0 "register_operand" "=v") + (div:F3ANY (match_operand:F3ANY 1 "csky_const_float1_operand" " i") + (match_operand:F3ANY 2 "register_operand" " v")))] + "CSKY_ISA_FEATURE(fpv3_)" + "frecip.\t%0, %2" +) + +(define_insn "*fpv3_sqrt2" + [(set (match_operand:F3ANY 0 "register_operand" "=v") + (sqrt:F3ANY (match_operand:F3ANY 1 "register_operand" " v")))] + "CSKY_ISA_FEATURE(fpv3_)" + "fsqrt.\t%0, %1" +) + +;; ------------------------------------------------------------------------- +;; Float fmax/fmin instructions +;; ------------------------------------------------------------------------- + +(define_insn "fmax3" + [(set (match_operand:F3ANY 0 "register_operand" "=v") + (unspec:F3ANY [(match_operand:F3ANY 1 "register_operand" " v") + (match_operand:F3ANY 2 "register_operand" " v")] + UNSPEC_MAXNM_F3))] + "CSKY_ISA_FEATURE(fpv3_)" + "fmaxnm.\t%0, %1, %2" +) + +(define_insn "fmin3" + [(set (match_operand:F3ANY 0 "register_operand" "=v") + (unspec:F3ANY [(match_operand:F3ANY 1 "register_operand" " v") + (match_operand:F3ANY 2 "register_operand" " v")] + UNSPEC_MINNM_F3))] + "CSKY_ISA_FEATURE(fpv3_)" + "fminnm.\t%0, %1, %2" +) + +;; ------------------------------------------------------------------------- +;; Float compare instructions +;; ------------------------------------------------------------------------- + +(define_insn "*fpv3__3" + [(set (reg:CC CSKY_CC_REGNUM) + (FCMPZ:CC (match_operand:F3ANY 0 "register_operand" "v") + (match_operand:F3ANY 1 "csky_const_float0_operand" "i")))] + "CSKY_ISA_FEATURE(fpv3_)" + "fcmp.\t%0" +) + +(define_insn "*fpv3__3" + [(set (reg:CC CSKY_CC_REGNUM) + (FCMP:CC (match_operand:F3ANY 0 "register_operand" "v") + (match_operand:F3ANY 1 "register_operand" "v")))] + "CSKY_ISA_FEATURE(fpv3_)" + "fcmp.\t%0, %1" +) + +(define_insn "*fpv3_gt3" + [(set (reg:CC CSKY_CC_REGNUM) + (gt:CC (match_operand:F3ANY 0 "register_operand" "v") + (match_operand:F3ANY 1 "register_operand" "v")))] + "CSKY_ISA_FEATURE(fpv3_)" + 
"fcmplt.\t%1, %0" +) + +(define_insn "*fpv3_le3" + [(set (reg:CC CSKY_CC_REGNUM) + (le:CC (match_operand:F3ANY 0 "register_operand" "v") + (match_operand:F3ANY 1 "register_operand" "v")))] + "CSKY_ISA_FEATURE(fpv3_)" + "fcmphs.\t%1, %0" +) + +(define_insn "*fpv3_unordered" + [(set (reg:CC CSKY_CC_REGNUM) + (unordered:CC (match_operand:F3ANY 0 "register_operand" "v") + (match_operand:F3ANY 1 "register_operand" "v")))] + "CSKY_ISA_FEATURE(fpv3_)" + "fcmpuo.\t%0, %1") + +(define_insn "*fpv3_unordered_zero" + [(set (reg:CC CSKY_CC_REGNUM) + (unordered:CC (match_operand:F3ANY 0 "register_operand" "v") + (match_operand:F3ANY 1 "csky_const_float0_operand" "i")))] + "CSKY_ISA_FEATURE(fpv3_)" + "fcmpuoz.\t%0") + +;; ------------------------------------------------------------------------- +;; Float ADD instructions +;; ------------------------------------------------------------------------- + +(define_insn "*fpv3_add3" + [(set (match_operand:F3ANY 0 "register_operand" "=v") + (plus:F3ANY (match_operand:F3ANY 1 "register_operand" " v") + (match_operand:F3ANY 2 "register_operand" " v")))] + "CSKY_ISA_FEATURE(fpv3_)" + "fadd.\t%0, %1, %2" +) + +;; ------------------------------------------------------------------------- +;; Float SUB instructions +;; ------------------------------------------------------------------------- + +(define_insn "*fpv3_sub3" + [(set (match_operand:F3ANY 0 "register_operand" "=v") + (minus:F3ANY (match_operand:F3ANY 1 "register_operand" " v") + (match_operand:F3ANY 2 "register_operand" " v")))] + "CSKY_ISA_FEATURE(fpv3_)" + "fsub.\t%0, %1, %2" +) + +;; ------------------------------------------------------------------------- +;; Float NEG instructions +;; ------------------------------------------------------------------------- + +(define_insn "*fpv3_neg2" + [(set (match_operand:F3ANY 0 "register_operand" "=v") + (neg:F3ANY (match_operand:F3ANY 1 "register_operand" " v")))] + "CSKY_ISA_FEATURE(fpv3_)" + "fneg.\t%0, %1" +) + +;; ------------------------------------------------------------------------- +;; Float ABS instructions +;; ------------------------------------------------------------------------- + +(define_insn "*fpv3_abs2" + [(set (match_operand:F3ANY 0 "register_operand" "=v") + (abs:F3ANY (match_operand:F3ANY 1 "register_operand" " v")))] + "CSKY_ISA_FEATURE(fpv3_)" + "fabs.\t%0, %1" +) + +;; ------------------------------------------------------------------------- +;; Float common convert instructions +;; ------------------------------------------------------------------------- + +;; SF <- HF +(define_insn "*fpv3_extendhfsf2" + [(set (match_operand:SF 0 "register_operand" "=v") + (float_extend:SF (match_operand:HF 1 "register_operand" "v")))] + "CSKY_ISA_FEATURE(fpv3_hf)" + "fhtos\t%0, %1") + +;; HF <- SF +(define_insn "*fpv3_truncsfhf2" + [(set (match_operand:HF 0 "register_operand" "=v") + (float_truncate:HF (match_operand:SF 1 "register_operand" "v")))] + "CSKY_ISA_FEATURE(fpv3_hf)" + "fstoh\t%0, %1") + +;; DF <- SF +(define_insn "*fpv3_extendsfdf2" + [(set (match_operand:DF 0 "register_operand" "=v") + (float_extend:DF (match_operand:SF 1 "register_operand" "v")))] + "CSKY_ISA_FEATURE(fpv3_df)" + "fstod\t%0, %1") + +;; SF <- DF +(define_insn "*fpv3_truncdfsf2" + [(set (match_operand:SF 0 "register_operand" "=v") + (float_truncate:SF (match_operand:DF 1 "register_operand" "v")))] + "CSKY_ISA_FEATURE(fpv3_df)" + "fdtos\t%0, %1") + +;; DF,SF,HF <- unsigned SI,SI +(define_insn "*fpv3_floatsi2" + [(set (match_operand:F3ANY 0 "register_operand" "=v") + 
(FLOAT_SU:F3ANY (match_operand:SI 1 "register_operand" "v")))] + "CSKY_ISA_FEATURE(fpv3_)" + "fitof.32.f\t%0, %1") + +;; HF <- unsigned HI,HI +(define_insn "*fpv3_floathihf2" + [(set (match_operand:HF 0 "register_operand" "=v") + (FLOAT_SU:HF (match_operand:HI 1 "register_operand" "v")))] + "CSKY_ISA_FEATURE(fpv3_hi) && CSKY_ISA_FEATURE(fpv3_hf)" + "fitof.16.f16\t%0, %1") + +;; unsigned SI,SI <- DF,SF,HF +(define_insn "*fpv3_fix_truncsi2" + [(set (match_operand:SI 0 "register_operand" "=v") + (FIX_SU:SI (fix:F3ANY (match_operand:F3ANY 1 "register_operand" "v"))))] + "CSKY_ISA_FEATURE(fpv3_)" + "fftoi.f.32.rz\t%0, %1") + +;; ------------------------------------------------------------------------- +;; Float complex convert instructions +;; ------------------------------------------------------------------------- + +;; Fixed point to floating point conversions. + +;(define_insn "*combine_fcvt_fixed16_" +; [(set (match_operand:F3ANY 0 "register_operand" "=v") +; (mult:F3ANY (float:F3ANY (match_operand:HI 1 "register_operand" "0")) +; (match_operand 2 +; "const_double_fcvt_power_of_two_reciprocal_hq" "Dt")))] +; "CSKY_ISA_FEATURE(fpv3_) && !flag_rounding_math +; && CSKY_ISA_FEATURE(fpv3_hi)" +; "fxtof.s16.f\t%0, %1, %v2") +; +;(define_insn "*combine_fcvt_fixed32_" +; [(set (match_operand:F3ANY 0 "register_operand" "=v") +; (mult:F3ANY (float:F3ANY (match_operand:SI 1 "register_operand" "0")) +; (match_operand 2 +; "const_double_fcvt_power_of_two_reciprocal_sq" "Dt")))] +; "CSKY_ISA_FEATURE(fpv3_) && !flag_rounding_math" +; "fxtof.s32.f\t%0, %1, %v2") +; +;(define_insn "*combine_fcvt_unfixed16_" +; [(set (match_operand:F3ANY 0 "register_operand" "=v") +; (mult:F3ANY (unsigned_float:F3ANY (match_operand:HI 1 "register_operand" "0")) +; (match_operand 2 +; "const_double_fcvt_power_of_two_reciprocal_hq" "Dt")))] +; "CSKY_ISA_FEATURE(fpv3_) && !flag_rounding_math +; && CSKY_ISA_FEATURE(fpv3_hi)" +; "fxtof.u16.f\t%0, %1, %v2") +; +;(define_insn "*combine_fcvt_unfixed32_" +; [(set (match_operand:F3ANY 0 "register_operand" "=v") +; (mult:F3ANY (unsigned_float:F3ANY (match_operand:SI 1 "register_operand" "0")) +; (match_operand 2 +; "const_double_fcvt_power_of_two_reciprocal_sq" "Dt")))] +; "CSKY_ISA_FEATURE(fpv3_) && !flag_rounding_math" +; "fxtof.u32.f\t%0, %1, %v2") + +;; Floating point to fixed point conversions. 
+ +;(define_insn "*combine_fcvt_fixed16" +; [(set (match_operand:HI 0 "register_operand" "=v") +; (fix:HI (fix:F3ANY (mult:F3ANY (match_operand:F3ANY 1 "register_operand" "0") +; (match_operand 2 +; "const_double_fcvt_power_of_two_hq" "Du")))))] +; "CSKY_ISA_FEATURE(fpv3_) && !flag_rounding_math +; && CSKY_ISA_FEATURE(fpv3_hi)" +; "fftox.f.s16\t%0, %1, %v2" +; ) +; +;(define_insn "*combine_fcvt_fixed32" +; [(set (match_operand:SI 0 "register_operand" "=v") +; (fix:SI (fix:F3ANY (mult:F3ANY (match_operand:F3ANY 1 "register_operand" "0") +; (match_operand 2 +; "const_double_fcvt_power_of_two_sq" "Du")))))] +; "CSKY_ISA_FEATURE(fpv3_) && !flag_rounding_math" +; "fftox.f.s32\t%0, %1, %v2" +; ) +; +;(define_insn "*combine_fcvt_unfixed16" +; [(set (match_operand:HI 0 "register_operand" "=v") +; (unsigned_fix:HI (fix:F3ANY (mult:F3ANY (match_operand:F3ANY 1 "register_operand" "0") +; (match_operand 2 +; "const_double_fcvt_power_of_two_hq" "Du")))))] +; "CSKY_ISA_FEATURE(fpv3_) && !flag_rounding_math +; && CSKY_ISA_FEATURE(fpv3_hi)" +; "fftox.f.u16\t%0, %1, %v2" +; ) +; +;(define_insn "*combine_fcvt_unfixed32" +; [(set (match_operand:SI 0 "register_operand" "=v") +; (unsigned_fix:SI (fix:F3ANY (mult:F3ANY (match_operand:F3ANY 1 "register_operand" "0") +; (match_operand 2 +; "const_double_fcvt_power_of_two_sq" "Du")))))] +; "CSKY_ISA_FEATURE(fpv3_) && !flag_rounding_math" +; "fftox.f.u32\t%0, %1, %v2" +; ) + +;; conversions need to be rounding to nearest. + +(define_insn "lsi2" + [(set (match_operand:SI 0 "register_operand" "=v") + (FIX_SU:SI (unspec:F3ANY [(match_operand:F3ANY 1 "register_operand" "0")] + FRM)))] + "CSKY_ISA_FEATURE(fpv3_)" + "fftoi.f.32\t%0, %1" +) + +(define_insn "2" + [(set (match_operand:F3ANY 0 "register_operand" "=v") + (unspec:F3ANY [(match_operand:F3ANY 1 "register_operand" "0")] FRMF))] + "CSKY_ISA_FEATURE(fpv3_)" + "fftofi.f\t%0, %1" +) + +;; Write Floating-point Control Register. +(define_insn "csky_setfcrsi" + [(unspec_volatile [(match_operand:SI 0 "register_operand" "r")] VUNSPEC_SET_FCR)] + "CSKY_ISA_FEATURE(fcr)" + "mtcr\t%0, fcr" +) + +;; Read Floating-point Control Register. +(define_insn "csky_getfcrsi" + [(set (match_operand:SI 0 "register_operand" "=r") + (unspec_volatile:SI [(const_int 0)] VUNSPEC_GET_FCR))] + "CSKY_ISA_FEATURE(fcr)" + "mfcr\t%0, fcr" +) + +;; Insert Floating-point Control Register. 
+(define_insn "csky_insfcrsi" + [(unspec_volatile [(match_operand:SI 0 "register_operand" "r") + (match_operand:SI 1 "const_int_operand" "i") + (match_operand:SI 2 "const_int_operand" "i")]VUNSPEC_INS_FCR) + (clobber (reg: SI 13))] + "CSKY_ISA_FEATURE(fcr)" + { + operands[1] = GEN_INT (INTVAL (operands[2]) + INTVAL (operands[1]) - 1); + return "mfcr\tt1, fcr\n\tins\tt1, %0, %1, %2\n\tmtcr\tt1, fcr"; + } +) diff --git a/gcc/config/csky/csky_isa.def b/gcc/config/csky/csky_isa.def index 5edce16..58498196 100644 --- a/gcc/config/csky/csky_isa.def +++ b/gcc/config/csky/csky_isa.def @@ -32,6 +32,7 @@ CSKY_ISA (7E10, "Extended insns for arch ck810 from ck807") /* Special insns */ CSKY_ISA (div, "divide insns") +CSKY_ISA (fcr, "Control the fcr register") /* Extended insns */ CSKY_ISA (dsp, "Extended insns for DSP") @@ -41,6 +42,11 @@ CSKY_ISA (fpv2_sf, "Single precision operations supported") CSKY_ISA (fpv2_df, "Double precision operations supported") CSKY_ISA (fpv2_divd, "Double precision div operations supported") +CSKY_ISA (fpv3_hi, "half word for fpu convert supported") +CSKY_ISA (fpv3_hf, "half precision operations supported") +CSKY_ISA (fpv3_sf, "Single precision operations supported") +CSKY_ISA (fpv3_df, "Double precision operations supported") + /* Specific insns mode */ #ifdef CSKY_ISA_MACRO #define CSKY_ISA_CK801 CSKY_ISA_FEATURE_GET (E1) @@ -50,10 +56,19 @@ CSKY_ISA (fpv2_divd, "Double precision div operations supported") #define CSKY_ISA_CK803R1 CSKY_ISA_CK803, CSKY_ISA_FEATURE_GET (3E3r1) #define CSKY_ISA_CK807 CSKY_ISA_CK803, CSKY_ISA_FEATURE_GET (3E7) #define CSKY_ISA_CK810 CSKY_ISA_CK807, CSKY_ISA_FEATURE_GET (7E10) +#define CSKY_ISA_CK860 CSKY_ISA_CK810, CSKY_ISA_FEATURE_GET(3E3r1) #define CSKY_ISA_DSP CSKY_ISA_FEATURE_GET (dsp) #define CSKY_ISA_FPv2_SF CSKY_ISA_FEATURE_GET (fpv2_sf) #define CSKY_ISA_FPv2 CSKY_ISA_FPv2_SF, CSKY_ISA_FEATURE_GET (fpv2_df) #define CSKY_ISA_FPv2_DIVD CSKY_ISA_FPv2, CSKY_ISA_FEATURE_GET (fpv2_divd) + +#define CSKY_ISA_FPv3_HF CSKY_ISA_FEATURE_GET (fpv3_hf), \ + CSKY_ISA_FEATURE_GET (fpv3_hi) +#define CSKY_ISA_FPv3_HSF CSKY_ISA_FPv3_HF, \ + CSKY_ISA_FEATURE_GET (fpv3_sf) +#define CSKY_ISA_FPv3_SDF CSKY_ISA_FEATURE_GET (fpv3_sf), \ + CSKY_ISA_FEATURE_GET (fpv3_df) +#define CSKY_ISA_FPv3 CSKY_ISA_FPv3_HF, CSKY_ISA_FPv3_SDF #endif diff --git a/gcc/config/csky/csky_tables.opt b/gcc/config/csky/csky_tables.opt index 3501f90..ca113dd 100644 --- a/gcc/config/csky/csky_tables.opt +++ b/gcc/config/csky/csky_tables.opt @@ -194,6 +194,12 @@ Enum(csky_processor_type) String(ck810ft) Value( TARGET_CPU_ck810ff) EnumValue Enum(csky_processor_type) String(ck810ftv) Value( TARGET_CPU_ck810ftv) +EnumValue +Enum(csky_processor_type) String(ck860) Value( TARGET_CPU_ck860) + +EnumValue +Enum(csky_processor_type) String(ck860f) Value( TARGET_CPU_ck860f) + Enum Name(csky_arch) Type(int) Known CSKY architectures (for use with the -march= option): @@ -213,6 +219,9 @@ Enum(csky_arch) String(ck807) Value(3) EnumValue Enum(csky_arch) String(ck810) Value(4) +EnumValue +Enum(csky_arch) String(ck860) Value(5) + Enum Name(csky_fpu) Type(enum csky_fpu_type) Known CSKY FPUs (for use with the -mfpu= option): @@ -227,4 +236,16 @@ EnumValue Enum(csky_fpu) String(fpv2_divd) Value(TARGET_FPU_fpv2_divd) EnumValue +Enum(csky_fpu) String(fpv3_hf) Value(TARGET_FPU_fpv3_hf) + +EnumValue +Enum(csky_fpu) String(fpv3_hsf) Value(TARGET_FPU_fpv3_hsf) + +EnumValue +Enum(csky_fpu) String(fpv3_sdf) Value(TARGET_FPU_fpv3_sdf) + +EnumValue +Enum(csky_fpu) String(fpv3) Value(TARGET_FPU_fpv3) + +EnumValue 
Enum(csky_fpu) String(auto) Value(TARGET_FPU_auto) diff --git a/gcc/config/csky/predicates.md b/gcc/config/csky/predicates.md index 4ffecb0..878446d 100644 --- a/gcc/config/csky/predicates.md +++ b/gcc/config/csky/predicates.md @@ -294,5 +294,4 @@ }) (define_special_predicate "csky_float_comparison_operator" - (match_code "eq,ne,le,lt,ge,gt,geu,gtu,leu,ltu, - unordered,ordered")) + (match_code "eq,ne,le,lt,ge,gt,unordered,ordered")) diff --git a/gcc/config/csky/t-csky-elf b/gcc/config/csky/t-csky-elf index bbdf286..4e7fcbe 100644 --- a/gcc/config/csky/t-csky-elf +++ b/gcc/config/csky/t-csky-elf @@ -27,8 +27,8 @@ MULTILIB_MATCHES = mbig-endian=EB MULTILIB_EXCEPTIONS = # Arch variants. -MULTILIB_OPTIONS += mcpu=ck802/mcpu=ck801/mcpu=ck803f/mcpu=ck807f/mcpu=ck810f -MULTILIB_DIRNAMES += ck802 ck801 ck803 ck807 ck810 +MULTILIB_OPTIONS += mcpu=ck802/mcpu=ck801/mcpu=ck803f/mcpu=ck807f/mcpu=ck810f/mcpu=ck860f +MULTILIB_DIRNAMES += ck802 ck801 ck803 ck807 ck810 ck860 # For arch ck802. MULTILIB_MATCHES += mcpu?ck802=march?ck802 @@ -100,6 +100,11 @@ MULTILIB_MATCHES += mcpu?ck807f=march?ck807ef MULTILIB_MATCHES += mcpu?ck807f=march?ck807 MULTILIB_MATCHES += mcpu?ck807f=mcpu?ck807 +# For arch ck860 +MULTILIB_MATCHES += mcpu?ck860f=march?ck860 +MULTILIB_MATCHES += mcpu?ck860f=mcpu?ck860 +MULTILIB_MATCHES += mcpu?ck860f=mcpu?c860 + # For option -mfloat-abi= MULTILIB_OPTIONS += mfloat-abi=soft/mfloat-abi=softfp/mfloat-abi=hard MULTILIB_DIRNAMES += soft soft-fp hard-fp diff --git a/gcc/config/csky/t-csky-linux b/gcc/config/csky/t-csky-linux index 9435b7a..0730c3a 100644 --- a/gcc/config/csky/t-csky-linux +++ b/gcc/config/csky/t-csky-linux @@ -21,11 +21,11 @@ MULTILIB_EXCEPTIONS = -CSKY_MULTILIB_OSDIRNAMES = mfloat-abi.softfp=/soft-fp mfloat-abi.hard=/hard-fp mfloat-abi.soft=/. mcpu.ck810f=/. mcpu.ck807f=/ck807 +CSKY_MULTILIB_OSDIRNAMES = mfloat-abi.softfp=/soft-fp mfloat-abi.hard=/hard-fp mfloat-abi.soft=/. mcpu.ck810f=/. mcpu.ck807f=/ck807 mcpu.ck860f=/ck860 # Arch variants. -MULTILIB_OPTIONS += mcpu=ck810f/mcpu=ck807f -MULTILIB_DIRNAMES += ck810 ck807 +MULTILIB_OPTIONS += mcpu=ck810f/mcpu=ck807f/mcpu=ck860f +MULTILIB_DIRNAMES += ck810 ck807 ck860 # For ck807. MULTILIB_MATCHES += mcpu?ck807f=march?ck807 @@ -41,6 +41,11 @@ MULTILIB_MATCHES += mcpu?ck810f=mcpu?ck810vf MULTILIB_MATCHES += mcpu?ck810f=mcpu?ck810ft MULTILIB_MATCHES += mcpu?ck810f=mcpu?ck810vft +# For ck860 +MULTILIB_MATCHES += mcpu?ck860f=march?ck860 +MULTILIB_MATCHES += mcpu?ck860f=mcpu?ck860 +MULTILIB_MATCHES += mcpu?ck860f=mcpu?c860 + # For option -mfloat-abi= MULTILIB_OPTIONS += mfloat-abi=soft/mfloat-abi=softfp/mfloat-abi=hard MULTILIB_DIRNAMES += soft soft-fp hard-fp -- cgit v1.1 From b8a618539c26f19cb5753f0757848c0933f3ac7c Mon Sep 17 00:00:00 2001 From: Geng Qi Date: Mon, 24 May 2021 20:22:53 +0800 Subject: C-SKY: Delete LO_REGS and HI_REGS, use HILO_REGS instead. gcc/ChangeLog: * config/csky/constraints.md ("l", "h"): Delete. * config/csky/csky.h (reg_class, REG_CLASS_NAMES, REG_CLASS_CONTENTS): Delete LO_REGS and HI_REGS. * config/csky/csky.c (regno_reg_classm, csky_secondary_reload, csky_register_move_cost): Use HILO_REGS instead of LO_REGS and HI_REGS. 
--- gcc/config/csky/constraints.md | 2 -- gcc/config/csky/csky.c | 7 +++---- gcc/config/csky/csky.h | 8 -------- 3 files changed, 3 insertions(+), 14 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/csky/constraints.md b/gcc/config/csky/constraints.md index 937cb81..c9bc9f2 100644 --- a/gcc/config/csky/constraints.md +++ b/gcc/config/csky/constraints.md @@ -24,8 +24,6 @@ (define_register_constraint "b" "LOW_REGS" "r0 - r15") (define_register_constraint "c" "C_REGS" "C register") (define_register_constraint "y" "HILO_REGS" "HI and LO registers") -(define_register_constraint "l" "LO_REGS" "LO register") -(define_register_constraint "h" "HI_REGS" "HI register") (define_register_constraint "v" "V_REGS" "vector registers") (define_register_constraint "z" "SP_REGS" "SP register") diff --git a/gcc/config/csky/csky.c b/gcc/config/csky/csky.c index 6e97994..b2160b9 100644 --- a/gcc/config/csky/csky.c +++ b/gcc/config/csky/csky.c @@ -112,7 +112,7 @@ enum reg_class regno_reg_class[FIRST_PSEUDO_REGISTER] = /* Reserved. */ RESERVE_REGS, /* CC,HI,LO registers. */ - C_REGS, HI_REGS, LO_REGS, + C_REGS, HILO_REGS, HILO_REGS, /* Reserved. */ RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, RESERVE_REGS, @@ -2477,8 +2477,7 @@ csky_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x, /* We always require a general register when copying anything to HI/LO_REGNUM, except when copying an SImode value from HI/LO_REGNUM to a general register, or when copying from register 0. */ - if ((rclass == HILO_REGS || rclass == LO_REGS || rclass == HI_REGS) - && !CSKY_GENERAL_REGNO_P (regno)) + if (rclass == HILO_REGS && !CSKY_GENERAL_REGNO_P (regno)) return GENERAL_REGS; if (rclass == V_REGS && !CSKY_GENERAL_REGNO_P (regno)) @@ -6546,7 +6545,7 @@ csky_register_move_cost (machine_mode mode ATTRIBUTE_UNUSED, || (CLASS) == LOW_REGS) #define HILO_REG_CLASS_P(CLASS) \ - ((CLASS) == HI_REGS || (CLASS) == LO_REGS || (CLASS) == HILO_REGS) + ((CLASS) == HILO_REGS) #define V_REG_CLASS_P(CLASS) \ ((CLASS) == V_REGS) diff --git a/gcc/config/csky/csky.h b/gcc/config/csky/csky.h index f535c42..1fd72d0 100644 --- a/gcc/config/csky/csky.h +++ b/gcc/config/csky/csky.h @@ -685,8 +685,6 @@ enum reg_class LOW_REGS, GENERAL_REGS, C_REGS, - HI_REGS, - LO_REGS, HILO_REGS, V_REGS, OTHER_REGS, @@ -706,8 +704,6 @@ enum reg_class "LOW_REGS", \ "GENERAL_REGS", \ "C_REGS", \ - "HI_REGS", \ - "LO_REGS", \ "HILO_REGS", \ "V_REGS", \ "OTHER_REGS", \ @@ -731,10 +727,6 @@ enum reg_class 0x00000000, 0x00000000, 0x00000000}, /* GENERAL_REGS */ \ {0x00000000, 0x00000002, 0x00000000, 0x00000000, \ 0x00000000, 0x00000000, 0x00000000}, /* C_REGS */ \ - {0x00000000, 0x00000004, 0x00000000, 0x00000000, \ - 0x00000000, 0x00000000, 0x00000000}, /* HI_REG */ \ - {0x00000000, 0x00000008, 0x00000000, 0x00000000, \ - 0x00000000, 0x00000000, 0x00000000}, /* LO_REG */ \ {0x00000000, 0x0000000c, 0x00000000, 0x00000000, \ 0x00000000, 0x00000000, 0x00000000}, /* HILO_REGS */ \ {0x00000000, 0xFFF00000, 0x007FFF8F, 0x00000000, \ -- cgit v1.1 From 62fa9cb24a8cfb197717c809c20e69161f464720 Mon Sep 17 00:00:00 2001 From: Geng Qi Date: Mon, 24 May 2021 20:22:54 +0800 Subject: C-SKY: Bug fix for bad setting of TARGET_DSP and TARGET_DIV. gcc/ChangeLog: * config/csky/csky.c (csky_option_override): Init csky_arch_isa_features[] in advance, so TARGET_DSP and TARGET_DIV can be set well. 
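For context, a minimal self-contained sketch of the ordering hazard this commit fixes.  The
names below (isa_features, HAS_DSP, override_options) are stand-ins rather than the real
csky.c symbols; the assumption, supported by the ChangeLog above, is that TARGET_DSP and
TARGET_DIV read csky_arch_isa_features[], so option handling that runs before the array is
populated sees stale zeros.

  #include <stdio.h>

  static int isa_features[2];          /* stands in for csky_arch_isa_features[] */
  #define HAS_DSP (isa_features[0])    /* stands in for TARGET_DSP */

  static void
  override_options (int cpu_has_dsp)
  {
    int dsp_default = HAS_DSP;         /* bug: read before the array is filled in */
    isa_features[0] = cpu_has_dsp;     /* the fix hoists this initialization first */
    printf ("dsp_default=%d HAS_DSP=%d\n", dsp_default, HAS_DSP);
  }

  int
  main (void)
  {
    override_options (1);              /* prints dsp_default=0 -- the stale value */
    return 0;
  }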
--- gcc/config/csky/csky.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/csky/csky.c b/gcc/config/csky/csky.c index b2160b9..1a6cfd7 100644 --- a/gcc/config/csky/csky.c +++ b/gcc/config/csky/csky.c @@ -2680,6 +2680,18 @@ csky_option_override (void) TARGET_FDIVDU = 0; } + /* Initialize boolean versions of the architectural flags, for use + in the .md file. */ + +#undef CSKY_ISA +#define CSKY_ISA(IDENT, DESC) \ + { \ + csky_arch_isa_features[CSKY_ISA_FEATURE_GET (IDENT)] = \ + bitmap_bit_p (csky_active_target.isa, CSKY_ISA_FEATURE_GET (IDENT)); \ + } +#include "csky_isa.def" +#undef CSKY_ISA + /* Extended LRW instructions are enabled by default on CK801, disabled otherwise. */ if (TARGET_ELRW == -1) @@ -2752,18 +2764,6 @@ csky_option_override (void) TARGET_MULTIPLE_STLD = 0; } - /* Initialize boolean versions of the architectural flags, for use - in the .md file. */ - -#undef CSKY_ISA -#define CSKY_ISA(IDENT, DESC) \ - { \ - csky_arch_isa_features[CSKY_ISA_FEATURE_GET (IDENT)] = \ - bitmap_bit_p (csky_active_target.isa, CSKY_ISA_FEATURE_GET (IDENT)); \ - } -#include "csky_isa.def" -#undef CSKY_ISA - /* TODO */ /* Resynchronize the saved target options. */ -- cgit v1.1 From d19a00c60c15c253282f72299315261ecaa9c92f Mon Sep 17 00:00:00 2001 From: Geng Qi Date: Mon, 24 May 2021 20:22:55 +0800 Subject: C-SKY: Separate FRAME_POINTER_REGNUM into FRAME_POINTER_REGNUM and HARD_FRAME_POINTER_REGNUM. gcc/ChangeLog: * config/csky/csky.h (FRAME_POINTER_REGNUM): Use HARD_FRAME_POINTER_REGNUM and FRAME_POINTER_REGNUM instead of the signle definition. The signle definition may not work well at simplify_subreg_regno(). (HARD_FRAME_POINTER_REGNUM): New. (ELIMINABLE_REGS): Add for HARD_FRAME_POINTER_REGNUM. * config/csky/csky.c (get_csky_live_regs, csky_can_eliminate, csky_initial_elimination_offset, csky_expand_prologue, csky_expand_epilogue): Add for HARD_FRAME_POINTER_REGNUM. --- gcc/config/csky/csky.c | 15 +++++++++------ gcc/config/csky/csky.h | 7 +++++-- 2 files changed, 14 insertions(+), 8 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/csky/csky.c b/gcc/config/csky/csky.c index 1a6cfd7..7f2af82 100644 --- a/gcc/config/csky/csky.c +++ b/gcc/config/csky/csky.c @@ -1751,12 +1751,12 @@ get_csky_live_regs (int *count) save = true; /* Frame pointer marked used. */ - else if (frame_pointer_needed && reg == FRAME_POINTER_REGNUM) + else if (frame_pointer_needed && reg == HARD_FRAME_POINTER_REGNUM) save = true; /* This is required for CK801/802 where FP is a fixed reg, otherwise we end up with no FP value available to the DWARF-2 unwinder. 
*/ - else if (crtl->calls_eh_return && reg == FRAME_POINTER_REGNUM) + else if (crtl->calls_eh_return && reg == HARD_FRAME_POINTER_REGNUM) save = true; /* CK801/802 also need special handling for LR because it's clobbered @@ -1832,6 +1832,8 @@ csky_layout_stack_frame (void) static bool csky_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to) { + if (to == FRAME_POINTER_REGNUM) + return from != ARG_POINTER_REGNUM; if (to == STACK_POINTER_REGNUM) return !frame_pointer_needed; return true; @@ -1852,6 +1854,7 @@ csky_initial_elimination_offset (int from, int to) switch (from) { case FRAME_POINTER_REGNUM: + case HARD_FRAME_POINTER_REGNUM: offset = cfun->machine->reg_offset; break; @@ -1866,7 +1869,7 @@ csky_initial_elimination_offset (int from, int to) /* If we are asked for the offset to the frame pointer instead, then subtract the difference between the frame pointer and stack pointer. */ - if (to == FRAME_POINTER_REGNUM) + if (to == FRAME_POINTER_REGNUM || to == HARD_FRAME_POINTER_REGNUM) offset -= cfun->machine->reg_offset; return offset; } @@ -5785,7 +5788,7 @@ csky_expand_prologue (void) of the register save area. */ if (frame_pointer_needed) { - insn = emit_insn (gen_movsi (frame_pointer_rtx, stack_pointer_rtx)); + insn = emit_insn (gen_movsi (hard_frame_pointer_rtx, stack_pointer_rtx)); RTX_FRAME_RELATED_P (insn) = 1; } @@ -5848,7 +5851,7 @@ csky_expand_epilogue (void) /* Restore the SP to the base of the register save area. */ if (frame_pointer_needed) { - insn = emit_move_insn (stack_pointer_rtx, frame_pointer_rtx); + insn = emit_move_insn (stack_pointer_rtx, hard_frame_pointer_rtx); RTX_FRAME_RELATED_P (insn) = 1; } else @@ -6004,7 +6007,7 @@ csky_set_eh_return_address (rtx source, rtx scratch) if (frame_pointer_needed) { - basereg = frame_pointer_rtx; + basereg = hard_frame_pointer_rtx; delta = 0; } else diff --git a/gcc/config/csky/csky.h b/gcc/config/csky/csky.h index 1fd72d0..f2b0d1c 100644 --- a/gcc/config/csky/csky.h +++ b/gcc/config/csky/csky.h @@ -342,7 +342,8 @@ extern int csky_arch_isa_features[]; #define STACK_POINTER_REGNUM CSKY_SP_REGNUM /* Base register for access to local variables of the function. */ -#define FRAME_POINTER_REGNUM 8 +#define FRAME_POINTER_REGNUM 36 +#define HARD_FRAME_POINTER_REGNUM 8 /* Base register for access to arguments of the function. This is a fake register that is always eliminated. */ @@ -370,7 +371,9 @@ extern int csky_arch_isa_features[]; #define ELIMINABLE_REGS \ {{ ARG_POINTER_REGNUM, STACK_POINTER_REGNUM },\ { ARG_POINTER_REGNUM, FRAME_POINTER_REGNUM },\ - { FRAME_POINTER_REGNUM, STACK_POINTER_REGNUM }} + { ARG_POINTER_REGNUM, HARD_FRAME_POINTER_REGNUM },\ + { FRAME_POINTER_REGNUM, STACK_POINTER_REGNUM },\ + { FRAME_POINTER_REGNUM, HARD_FRAME_POINTER_REGNUM }} /* Define the offset between two registers, one to be eliminated, and the other its replacement, at the start of a routine. */ -- cgit v1.1 From 45d5c4769a40447885ece0cf001933874af9daa1 Mon Sep 17 00:00:00 2001 From: Cooper Qu Date: Tue, 25 May 2021 16:56:58 +0800 Subject: C-SKY: Amend copyrights of recently added files. gcc/ChangeLog: * config/csky/csky-modes.def : Amend copyright. * config/csky/csky_insn_fpuv2.md : Likewise. * config/csky/csky_insn_fpuv3.md : Likewise. gcc/testsuite/ChangeLog: * gcc.target/csky/fpuv3/fpuv3.exp : Amend copyright. 
--- gcc/config/csky/csky-modes.def | 20 ++++++++++++++++++++ gcc/config/csky/csky_insn_fpuv2.md | 19 +++++++++++++++++++ gcc/config/csky/csky_insn_fpuv3.md | 19 +++++++++++++++++++ 3 files changed, 58 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/csky/csky-modes.def b/gcc/config/csky/csky-modes.def index a2427ff..9062efc 100644 --- a/gcc/config/csky/csky-modes.def +++ b/gcc/config/csky/csky-modes.def @@ -1,2 +1,22 @@ +;; C-SKY extra machine modes. +;; Copyright (C) 2018-2021 Free Software Foundation, Inc. +;; Contributed by C-SKY Microsystems and Mentor Graphics. +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify it +;; under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, but +;; WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;; General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . */ + /* Float modes. */ FLOAT_MODE (HF, 2, ieee_half_format); /* Half-precision floating point */ diff --git a/gcc/config/csky/csky_insn_fpuv2.md b/gcc/config/csky/csky_insn_fpuv2.md index 0a680f8..d56b61f 100644 --- a/gcc/config/csky/csky_insn_fpuv2.md +++ b/gcc/config/csky/csky_insn_fpuv2.md @@ -1,3 +1,22 @@ +;; C-SKY FPUV2 instruction descriptions. +;; Copyright (C) 2018-2021 Free Software Foundation, Inc. +;; Contributed by C-SKY Microsystems and Mentor Graphics. +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify it +;; under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, but +;; WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;; General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . */ ;; ------------------------------------------------------------------------- ;; Float Abs instructions diff --git a/gcc/config/csky/csky_insn_fpuv3.md b/gcc/config/csky/csky_insn_fpuv3.md index 053673c..b5f4798 100644 --- a/gcc/config/csky/csky_insn_fpuv3.md +++ b/gcc/config/csky/csky_insn_fpuv3.md @@ -1,3 +1,22 @@ +;; C-SKY FPUV3 instruction descriptions. +;; Copyright (C) 2018-2021 Free Software Foundation, Inc. +;; Contributed by C-SKY Microsystems and Mentor Graphics. +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify it +;; under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, but +;; WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;; General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . 
*/ (define_c_enum "unspec" [ UNSPEC_MAXNM_F3 -- cgit v1.1 From f5c6b71c9b02a2b2e0b7107d982def09abeeae88 Mon Sep 17 00:00:00 2001 From: Cooper Qu Date: Tue, 25 May 2021 20:03:48 +0800 Subject: C-SKY: Fix copyright of csky-modes.def. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The incorrect copyright comment format causes build error: builddir/source//gcc/gcc/config/csky/csky-modes.def: In function ‘void create_modes()’: builddir/source//gcc/gcc/config/csky/csky-modes.def:1:4: error: ‘C’ was not declared in this scope ;; C-SKY extra machine modes. ^ builddir/source//gcc/gcc/config/csky/csky-modes.def:1:6: error: ‘SKY’ was not declared in this scope ;; C-SKY extra machine modes. ^ builddir/source//gcc/gcc/config/csky/csky-modes.def:2:16: error: ‘Copyright’ was not declared in this scope ;; Copyright (C) 2018-2021 Free Software Foundation, Inc. ^ builddir/source//gcc/gcc/config/csky/csky-modes.def:3:4: error: ‘Contributed’ was not declared in this scope ;; Contributed by C-SKY Microsystems and Mentor Graphics. ^ gcc/ChangeLog: * config/csky/csky-modes.def : Fix copyright. --- gcc/config/csky/csky-modes.def | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/csky/csky-modes.def b/gcc/config/csky/csky-modes.def index 9062efc..109ee51 100644 --- a/gcc/config/csky/csky-modes.def +++ b/gcc/config/csky/csky-modes.def @@ -1,22 +1,22 @@ -;; C-SKY extra machine modes. -;; Copyright (C) 2018-2021 Free Software Foundation, Inc. -;; Contributed by C-SKY Microsystems and Mentor Graphics. -;; -;; This file is part of GCC. -;; -;; GCC is free software; you can redistribute it and/or modify it -;; under the terms of the GNU General Public License as published by -;; the Free Software Foundation; either version 3, or (at your option) -;; any later version. -;; -;; GCC is distributed in the hope that it will be useful, but -;; WITHOUT ANY WARRANTY; without even the implied warranty of -;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;; General Public License for more details. -;; -;; You should have received a copy of the GNU General Public License -;; along with GCC; see the file COPYING3. If not see -;; . */ +/* C-SKY extra machine modes. + Copyright (C) 2018-2021 Free Software Foundation, Inc. + Contributed by C-SKY Microsystems and Mentor Graphics. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ /* Float modes. */ FLOAT_MODE (HF, 2, ieee_half_format); /* Half-precision floating point */ -- cgit v1.1 From 3b0a7d624e64eeb81e4d5e8c62c46d86ef521857 Mon Sep 17 00:00:00 2001 From: Kito Cheng Date: Tue, 25 May 2021 21:26:12 +0800 Subject: RISC-V: Pass -mno-relax to assembler gcc/ChangeLog: * config/riscv/riscv.h (ASM_SPEC): Pass -mno-relax. 
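A brief note on the mechanism: ASM_SPEC is the spec string the driver uses to build the
assembler command line, so adding %{mno-relax} simply forwards the user's flag to the
assembler (which accepts a flag of the same spelling, which is why plain forwarding is
enough).  The sketch below is offered only as an illustration and is not spelled out in the
patch: text inside an asm statement is assembled rather than compiled, so it only sees
options that the driver forwards.  The file name and command line are assumptions, e.g.
riscv64-unknown-elf-gcc -O2 -mno-relax -c spin.c.

  /* Illustrative only: the asm text is emitted verbatim and handled by the
     assembler, so it is governed by the options the driver passes along.  */
  int
  spin_once (void)
  {
    asm volatile ("nop");
    return 0;
  }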
--- gcc/config/riscv/riscv.h | 1 + 1 file changed, 1 insertion(+) (limited to 'gcc/config') diff --git a/gcc/config/riscv/riscv.h b/gcc/config/riscv/riscv.h index f3e8572..f47d5b4 100644 --- a/gcc/config/riscv/riscv.h +++ b/gcc/config/riscv/riscv.h @@ -98,6 +98,7 @@ extern const char *riscv_default_mtune (int argc, const char **argv); %{" FPIE_OR_FPIC_SPEC ":-fpic} \ %{march=*} \ %{mabi=*} \ +%{mno-relax} \ %{mbig-endian} \ %{mlittle-endian} \ %(subtarget_asm_spec)" \ -- cgit v1.1 From 94079e642d95ba4bcb75354d6cd628a473a94479 Mon Sep 17 00:00:00 2001 From: Geng Qi Date: Tue, 25 May 2021 18:45:25 +0800 Subject: C-SKY: Add instruction "ld.bs". gcc/ * config/csky/csky.md (cskyv2_sextend_ldbs): New. gcc/testsuite/ * gcc.target/csky/ldbs.c: New. --- gcc/config/csky/csky.md | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/csky/csky.md b/gcc/config/csky/csky.md index c27d627..b980d4c 100644 --- a/gcc/config/csky/csky.md +++ b/gcc/config/csky/csky.md @@ -1533,6 +1533,7 @@ }" ) +;; hi -> si (define_insn "extendhisi2" [(set (match_operand:SI 0 "register_operand" "=r") (sign_extend:SI (match_operand:HI 1 "register_operand" "r")))] @@ -1557,6 +1558,15 @@ "sextb %0, %1" ) +(define_insn "*cskyv2_sextend_ldbs" + [(set (match_operand:SI 0 "register_operand" "=r") + (sign_extend:SI (match_operand:QI 1 "csky_simple_mem_operand" "m")))] + "CSKY_ISA_FEATURE (E2)" + "ld.bs\t%0, %1" + [(set_attr "length" "4") + (set_attr "type" "load")] +) + ;; qi -> hi (define_insn "extendqihi2" [(set (match_operand:HI 0 "register_operand" "=r") -- cgit v1.1 From 155d3038c23002581eb29d59ea23e44b3758c6dc Mon Sep 17 00:00:00 2001 From: Geng Qi Date: Fri, 30 Apr 2021 21:02:37 +0800 Subject: C-SKY: Fix FAIL of gcc.dg/torture/stackalign/builtin-return-2.c. gcc/ChangeLog: * config/csky/csky.md (untyped_call): Emit clobber for return registers to mark them used. --- gcc/config/csky/csky.md | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/csky/csky.md b/gcc/config/csky/csky.md index b980d4c..f91d851 100644 --- a/gcc/config/csky/csky.md +++ b/gcc/config/csky/csky.md @@ -3258,6 +3258,10 @@ emit_call_insn (gen_call (operands[0], const0_rtx)); + for (int i = 0; i < XVECLEN (operands[2], 0); i++) + emit_clobber (SET_SRC (XVECEXP (operands[2], 0, i))); + emit_insn (gen_blockage ()); + for (i = 0; i < XVECLEN (operands[2], 0); i++) { rtx set = XVECEXP (operands[2], 0, i); -- cgit v1.1 From 4553b95516176d578aa6ce81499509f6ec099bdb Mon Sep 17 00:00:00 2001 From: Geng Qi Date: Fri, 30 Apr 2021 21:03:33 +0800 Subject: C-SKY: Delete definition TARGET_PROMOTE_PROTOTYPES, just use the default definition. gcc/ChangeLog: * config/csky/csky.c (TARGET_PROMOTE_PROTOTYPES): Delete. 
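For readers unfamiliar with the hook being dropped here: TARGET_PROMOTE_PROTOTYPES says
whether integer arguments narrower than int are widened to int at the call site even when
the callee has a prototype.  The example below only illustrates the kind of call the hook
is about; the function names are invented, and the patch itself merely switches C-SKY from
the hard-coded "true" hook to the default.

  extern int consume_short (short value);

  /* Whether 'value' is widened to int before the call, despite the prototype
     saying short, is exactly what TARGET_PROMOTE_PROTOTYPES decides.  */
  int
  forward (short value)
  {
    return consume_short (value);
  }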
--- gcc/config/csky/csky.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/csky/csky.c b/gcc/config/csky/csky.c index 7f2af82..2b44edf 100644 --- a/gcc/config/csky/csky.c +++ b/gcc/config/csky/csky.c @@ -512,9 +512,6 @@ csky_cpu_cpp_builtins (cpp_reader *pfile) #undef TARGET_SPLIT_COMPLEX_ARG #define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true -#undef TARGET_PROMOTE_PROTOTYPES -#define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true - #undef TARGET_MUST_PASS_IN_STACK #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size -- cgit v1.1 From 41eba35b08a9bbd1f06b15e74942a94ea838d8cf Mon Sep 17 00:00:00 2001 From: Geng Qi Date: Wed, 26 May 2021 11:29:19 +0800 Subject: C-SKY: Support fldrd/fstrd for fpuv2 and fldr.64/fstr.64 for fpuv3. gcc/ChangeLog: * config/csky/csky.c (ck810_legitimate_index_p): Support "base + index" with DF mode. * config/csky/constraints.md ("Y"): New constraint for memory operands without index register. * config/csky/csky_insn_fpuv2.md (fpuv3_movdf): Use "Y" instead of "m" when mov between memory and general registers, and lower their priority. * config/csky/csky_insn_fpuv3.md (fpuv2_movdf): Likewise. gcc/testsuite/ChangeLog: * gcc.target/csky/fldrd_fstrd.c: New. * gcc.target/csky/fpuv3/fldr64_fstr64.c: New. --- gcc/config/csky/constraints.md | 4 ++++ gcc/config/csky/csky.c | 3 ++- gcc/config/csky/csky_insn_fpuv2.md | 4 ++-- gcc/config/csky/csky_insn_fpuv3.md | 16 ++++++++-------- 4 files changed, 16 insertions(+), 11 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/csky/constraints.md b/gcc/config/csky/constraints.md index c9bc9f2..2641ab3 100644 --- a/gcc/config/csky/constraints.md +++ b/gcc/config/csky/constraints.md @@ -38,6 +38,10 @@ "Memory operands with base register, index register" (match_test "csky_valid_mem_constraint_operand (op, \"W\")")) +(define_memory_constraint "Y" + "Memory operands without index register" + (not (match_test "csky_valid_mem_constraint_operand (op, \"W\")"))) + (define_constraint "R" "Memory operands whose address is a label_ref" (and (match_code "mem") diff --git a/gcc/config/csky/csky.c b/gcc/config/csky/csky.c index 2b44edf..c0e42a2 100644 --- a/gcc/config/csky/csky.c +++ b/gcc/config/csky/csky.c @@ -3152,7 +3152,8 @@ ck810_legitimate_index_p (machine_mode mode, rtx index, int strict_p) /* The follow index is for ldr instruction, the ldr cannot load dword data, so the mode size should not be larger than 4. 
*/ - else if (GET_MODE_SIZE (mode) <= 4) + else if (GET_MODE_SIZE (mode) <= 4 + || (TARGET_HARD_FLOAT && CSKY_VREG_MODE_P (mode))) { if (is_csky_address_register_rtx_p (index, strict_p)) return 1; diff --git a/gcc/config/csky/csky_insn_fpuv2.md b/gcc/config/csky/csky_insn_fpuv2.md index d56b61f..7bab99e 100644 --- a/gcc/config/csky/csky_insn_fpuv2.md +++ b/gcc/config/csky/csky_insn_fpuv2.md @@ -480,8 +480,8 @@ ) (define_insn "*fpuv2_movdf" - [(set (match_operand:DF 0 "nonimmediate_operand" "=r,r, r,m, v,?r,Q,v,v,v") - (match_operand:DF 1 "general_operand" " r,m,mF,r,?r, v,v,Q,v,m"))] + [(set (match_operand:DF 0 "nonimmediate_operand" "=r, v,?r,Q,v,v,v,r, r,Y") + (match_operand:DF 1 "general_operand" " r,?r, v,v,Q,v,m,Y,YF,r"))] "CSKY_ISA_FEATURE (fpv2_df)" "* return csky_output_movedouble(operands, DFmode);" [(set (attr "length") diff --git a/gcc/config/csky/csky_insn_fpuv3.md b/gcc/config/csky/csky_insn_fpuv3.md index b5f4798..7b9d4a7 100644 --- a/gcc/config/csky/csky_insn_fpuv3.md +++ b/gcc/config/csky/csky_insn_fpuv3.md @@ -90,27 +90,27 @@ ) (define_insn "*fpv3_movdf" - [(set (match_operand:DF 0 "nonimmediate_operand" "=r,r, r,m,v,?r,Q,v,v,v, v") - (match_operand:DF 1 "general_operand" " r,m,mF,r,?r,v,v,Q,v,m,Dv"))] + [(set (match_operand:DF 0 "nonimmediate_operand" "=r, v,?r,Q,v,v,v, v,r, r,Y") + (match_operand:DF 1 "general_operand" " r,?r, v,v,Q,v,m,Dv,Y,YF,r"))] "CSKY_ISA_FEATURE(fpv3_df)" "* switch (which_alternative) { - case 4: + case 1: if (TARGET_BIG_ENDIAN) return \"fmtvr.64\\t%0, %R1, %1\"; return \"fmtvr.64\\t%0, %1, %R1\"; - case 5: + case 2: if (TARGET_BIG_ENDIAN) return \"fmfvr.64\\t%R0, %0, %1\"; return \"fmfvr.64\\t%0, %R0, %1\"; + case 3: + case 4: case 6: - case 7: - case 9: return fpuv3_output_move(operands); - case 8: + case 5: return \"fmov.64\\t%0, %1\"; - case 10: + case 7: return \"fmovi.64\\t%0, %1\"; default: return csky_output_movedouble(operands, DFmode); -- cgit v1.1 From 5f338210456bf4f142a2da6eb0a01ae8ffecaa88 Mon Sep 17 00:00:00 2001 From: Geng Qi Date: Fri, 30 Apr 2021 21:02:15 +0800 Subject: C-SKY: Delete TARGET_CAN_CHANGE_MODE_CLASS, use defualt definition. gcc/ChangeLog: * config/csky/csky.c (csky_can_change_mode_class): Delete. For csky, HF/SF mode use the low bits of VREGS. --- gcc/config/csky/csky.c | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/csky/csky.c b/gcc/config/csky/csky.c index c0e42a2..e55821f 100644 --- a/gcc/config/csky/csky.c +++ b/gcc/config/csky/csky.c @@ -608,9 +608,6 @@ csky_default_logical_op_non_short_circuit (void) #undef TARGET_MODES_TIEABLE_P #define TARGET_MODES_TIEABLE_P csky_modes_tieable_p -#undef TARGET_CAN_CHANGE_MODE_CLASS -#define TARGET_CAN_CHANGE_MODE_CLASS csky_can_change_mode_class - #undef TARGET_CONDITIONAL_REGISTER_USAGE #define TARGET_CONDITIONAL_REGISTER_USAGE csky_conditional_register_usage @@ -2370,19 +2367,6 @@ csky_modes_tieable_p (machine_mode mode1, machine_mode mode2) && (mode1 == DFmode || mode2 == DFmode)); } -/* Implement TARGET_CAN_CHANGE_MODE_CLASS. - V_REG registers can't do subreg as all values are reformatted to - internal precision. */ - -static bool -csky_can_change_mode_class (machine_mode from, - machine_mode to, - reg_class_t rclass) -{ - return (GET_MODE_SIZE (from) == GET_MODE_SIZE (to) - || !reg_classes_intersect_p (V_REGS, rclass)); -} - /* Implement TARGET_CLASS_LIKELY_SPILLED_P. We need to define this for MINI_REGS when we only use r0 - r7. 
Otherwise we can end up using r0-r4 for function arguments, and don't -- cgit v1.1 From 74045879770ace0b14f0f809c8e795069044cf41 Mon Sep 17 00:00:00 2001 From: Jan-Benedict Glaw Date: Wed, 26 May 2021 15:22:11 +0200 Subject: arc: Remove useless register keyword The "register" keyword was removed in C++17, remove them to get the backend to build again. gcc/ * config/arc/arc.c (arc_address_cost, arc_print_operand_address, arc_ccfsm_advance, symbolic_reference_mentioned_p, arc_raw_symbolic_reference_mentioned_p): Remove register keyword. --- gcc/config/arc/arc.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arc/arc.c b/gcc/config/arc/arc.c index ec7328e..9153f05 100644 --- a/gcc/config/arc/arc.c +++ b/gcc/config/arc/arc.c @@ -2488,8 +2488,8 @@ arc_address_cost (rtx addr, machine_mode, addr_space_t, bool speed) case PLUS : { - register rtx plus0 = XEXP (addr, 0); - register rtx plus1 = XEXP (addr, 1); + rtx plus0 = XEXP (addr, 0); + rtx plus1 = XEXP (addr, 1); if (GET_CODE (plus0) != REG && (GET_CODE (plus0) != MULT @@ -5032,7 +5032,7 @@ arc_print_operand (FILE *file, rtx x, int code) void arc_print_operand_address (FILE *file , rtx addr) { - register rtx base, index = 0; + rtx base, index = 0; switch (GET_CODE (addr)) { @@ -5157,7 +5157,7 @@ static void arc_ccfsm_advance (rtx_insn *insn, struct arc_ccfsm *state) { /* BODY will hold the body of INSN. */ - register rtx body; + rtx body; /* This will be 1 if trying to repeat the trick (ie: do the `else' part of an if/then/else), and things need to be reversed. */ @@ -6130,8 +6130,8 @@ arc_legitimate_pic_addr_p (rtx addr) static bool symbolic_reference_mentioned_p (rtx op) { - register const char *fmt; - register int i; + const char *fmt; + int i; if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF) return true; @@ -6141,7 +6141,7 @@ symbolic_reference_mentioned_p (rtx op) { if (fmt[i] == 'E') { - register int j; + int j; for (j = XVECLEN (op, i) - 1; j >= 0; j--) if (symbolic_reference_mentioned_p (XVECEXP (op, i, j))) @@ -6163,8 +6163,8 @@ symbolic_reference_mentioned_p (rtx op) bool arc_raw_symbolic_reference_mentioned_p (rtx op, bool skip_local) { - register const char *fmt; - register int i; + const char *fmt; + int i; if (GET_CODE(op) == UNSPEC) return false; @@ -6184,7 +6184,7 @@ arc_raw_symbolic_reference_mentioned_p (rtx op, bool skip_local) { if (fmt[i] == 'E') { - register int j; + int j; for (j = XVECLEN (op, i) - 1; j >= 0; j--) if (arc_raw_symbolic_reference_mentioned_p (XVECEXP (op, i, j), -- cgit v1.1 From 76898cec437561a5e74d92b98f4631b80300409d Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Wed, 26 May 2021 15:54:17 +0200 Subject: Remove useless register keywords This patch removes useless register keywords from several backends and one spot in the Ada FE. 2021-05-26 Jakub Jelinek gcc/ * config/epiphany/epiphany.c (epiphany_print_operand_address): Remove register keywords. * config/microblaze/microblaze.c (microblaze_legitimize_address, call_internal1, microblaze_option_override, print_operand): Likewise. * config/microblaze/microblaze.md (call_internal_plt, call_value_intern_plt, call_value_intern): Likewise. * config/arm/aout.h (ASM_OUTPUT_ALIGN): Likewise. * config/iq2000/iq2000.md (call_internal1, call_value_internal1, call_value_multiple_internal1): Likewise. * config/bfin/bfin.c (symbolic_reference_mentioned_p): Likewise. gcc/ada/ * init.c (__gnat_error_handler): Remove register keyword. 
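Both register-keyword commits above are mechanical, but the motivation bears a one-line
illustration: 'register' was never more than an optimization hint in C, and C++17 removed
it as a storage-class specifier, so backend files using the old spelling no longer build
with a C++17 host compiler, as the commit messages note.  The function below is invented
purely to show the before/after spelling.

  static int
  sum_to (int n)
  {
    /* register int i;     old spelling; rejected as of C++17 */
    int i, total = 0;   /* current spelling; behaviour is identical */
    for (i = 1; i <= n; i++)
      total += i;
    return total;
  }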
--- gcc/config/arm/aout.h | 2 +- gcc/config/bfin/bfin.c | 6 +++--- gcc/config/epiphany/epiphany.c | 2 +- gcc/config/iq2000/iq2000.md | 6 +++--- gcc/config/microblaze/microblaze.c | 26 +++++++++++++------------- gcc/config/microblaze/microblaze.md | 18 ++++++++++-------- 6 files changed, 31 insertions(+), 29 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/aout.h b/gcc/config/arm/aout.h index 9688fb6..25a2812 100644 --- a/gcc/config/arm/aout.h +++ b/gcc/config/arm/aout.h @@ -257,7 +257,7 @@ #define ASM_OUTPUT_ALIGN(STREAM, POWER) \ do \ { \ - register int amount = 1 << (POWER); \ + int amount = 1 << (POWER); \ \ if (amount == 2) \ fprintf (STREAM, "\t.even\n"); \ diff --git a/gcc/config/bfin/bfin.c b/gcc/config/bfin/bfin.c index a000b7a..698dd87 100644 --- a/gcc/config/bfin/bfin.c +++ b/gcc/config/bfin/bfin.c @@ -1773,8 +1773,8 @@ function_arg_regno_p (int n) int symbolic_reference_mentioned_p (rtx op) { - register const char *fmt; - register int i; + const char *fmt; + int i; if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF) return 1; @@ -1784,7 +1784,7 @@ symbolic_reference_mentioned_p (rtx op) { if (fmt[i] == 'E') { - register int j; + int j; for (j = XVECLEN (op, i) - 1; j >= 0; j--) if (symbolic_reference_mentioned_p (XVECEXP (op, i, j))) diff --git a/gcc/config/epiphany/epiphany.c b/gcc/config/epiphany/epiphany.c index b60daa7..f248294 100644 --- a/gcc/config/epiphany/epiphany.c +++ b/gcc/config/epiphany/epiphany.c @@ -1394,7 +1394,7 @@ epiphany_print_operand (FILE *file, rtx x, int code) static void epiphany_print_operand_address (FILE *file, machine_mode /*mode*/, rtx addr) { - register rtx base, index = 0; + rtx base, index = 0; int offset = 0; switch (GET_CODE (addr)) diff --git a/gcc/config/iq2000/iq2000.md b/gcc/config/iq2000/iq2000.md index fd25e05..4637703 100644 --- a/gcc/config/iq2000/iq2000.md +++ b/gcc/config/iq2000/iq2000.md @@ -1553,7 +1553,7 @@ "" "* { - register rtx target = operands[0]; + rtx target = operands[0]; if (GET_CODE (target) == CONST_INT) return \"li\\t%@,%0\\n\\tjalr\\t%2,%@\"; @@ -1641,7 +1641,7 @@ "" "* { - register rtx target = operands[1]; + rtx target = operands[1]; if (GET_CODE (target) == CONST_INT) return \"li\\t%@,%1\\n\\tjalr\\t%3,%@\"; @@ -1678,7 +1678,7 @@ "" "* { - register rtx target = operands[1]; + rtx target = operands[1]; if (GET_CODE (target) == CONST_INT) return \"li\\t%@,%1\\n\\tjalr\\t%4,%@\"; diff --git a/gcc/config/microblaze/microblaze.c b/gcc/config/microblaze/microblaze.c index b444db1..6e8f367 100644 --- a/gcc/config/microblaze/microblaze.c +++ b/gcc/config/microblaze/microblaze.c @@ -991,7 +991,7 @@ static rtx microblaze_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED, machine_mode mode ATTRIBUTE_UNUSED) { - register rtx xinsn = x, result; + rtx xinsn = x, result; if (GET_CODE (xinsn) == CONST && flag_pic && pic_address_needs_scratch (xinsn)) @@ -1011,10 +1011,10 @@ microblaze_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED, if (GET_CODE (xinsn) == PLUS) { - register rtx xplus0 = XEXP (xinsn, 0); - register rtx xplus1 = XEXP (xinsn, 1); - register enum rtx_code code0 = GET_CODE (xplus0); - register enum rtx_code code1 = GET_CODE (xplus1); + rtx xplus0 = XEXP (xinsn, 0); + rtx xplus1 = XEXP (xinsn, 1); + enum rtx_code code0 = GET_CODE (xplus0); + enum rtx_code code1 = GET_CODE (xplus1); if (code0 != REG && code1 == REG) { @@ -1736,9 +1736,9 @@ microblaze_version_to_int (const char *version) static void microblaze_option_override (void) { - register int i, start; - register int regno; - 
register machine_mode mode; + int i, start; + int regno; + machine_mode mode; int ver; microblaze_section_threshold = (global_options_set.x_g_switch_value @@ -1891,11 +1891,11 @@ microblaze_option_override (void) for (mode = VOIDmode; mode != MAX_MACHINE_MODE; mode = (machine_mode) ((int) mode + 1)) { - register int size = GET_MODE_SIZE (mode); + int size = GET_MODE_SIZE (mode); for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) { - register int ok; + int ok; if (mode == CCmode) { @@ -2267,7 +2267,7 @@ microblaze_initial_elimination_offset (int from, int to) void print_operand (FILE * file, rtx op, int letter) { - register enum rtx_code code; + enum rtx_code code; if (PRINT_OPERAND_PUNCT_VALID_P (letter)) { @@ -2406,7 +2406,7 @@ print_operand (FILE * file, rtx op, int letter) else if (code == REG || code == SUBREG) { - register int regnum; + int regnum; if (code == REG) regnum = REGNO (op); @@ -2431,7 +2431,7 @@ print_operand (FILE * file, rtx op, int letter) rtx mem_reg = XEXP (op, 0); if (GET_CODE (mem_reg) == REG) { - register int regnum = REGNO (mem_reg); + int regnum = REGNO (mem_reg); fprintf (file, "%s", reg_names[regnum]); } } diff --git a/gcc/config/microblaze/microblaze.md b/gcc/config/microblaze/microblaze.md index 472ef4c..6d77752 100644 --- a/gcc/config/microblaze/microblaze.md +++ b/gcc/config/microblaze/microblaze.md @@ -2107,8 +2107,8 @@ (use (reg:SI R_GOT))] "flag_pic" { - register rtx target2 = gen_rtx_REG (Pmode, - GP_REG_FIRST + MB_ABI_SUB_RETURN_ADDR_REGNUM); + rtx target2 + = gen_rtx_REG (Pmode, GP_REG_FIRST + MB_ABI_SUB_RETURN_ADDR_REGNUM); gen_rtx_CLOBBER (VOIDmode, target2); return "brlid\tr15,%0\;%#"; } @@ -2122,9 +2122,9 @@ (clobber (reg:SI R_SR))] "" { - register rtx target = operands[0]; - register rtx target2 = gen_rtx_REG (Pmode, - GP_REG_FIRST + MB_ABI_SUB_RETURN_ADDR_REGNUM); + rtx target = operands[0]; + rtx target2 + = gen_rtx_REG (Pmode, GP_REG_FIRST + MB_ABI_SUB_RETURN_ADDR_REGNUM); if (GET_CODE (target) == SYMBOL_REF) { if (microblaze_break_function_p (SYMBOL_REF_DECL (target))) { gen_rtx_CLOBBER (VOIDmode, target2); @@ -2216,7 +2216,8 @@ (use (match_operand:SI 4 "register_operand"))] "flag_pic" { - register rtx target2=gen_rtx_REG (Pmode,GP_REG_FIRST + MB_ABI_SUB_RETURN_ADDR_REGNUM); + rtx target2 + = gen_rtx_REG (Pmode,GP_REG_FIRST + MB_ABI_SUB_RETURN_ADDR_REGNUM); gen_rtx_CLOBBER (VOIDmode,target2); return "brlid\tr15,%1\;%#"; @@ -2232,8 +2233,9 @@ (clobber (match_operand:SI 3 "register_operand" "=d"))] "" { - register rtx target = operands[1]; - register rtx target2=gen_rtx_REG (Pmode,GP_REG_FIRST + MB_ABI_SUB_RETURN_ADDR_REGNUM); + rtx target = operands[1]; + rtx target2 + = gen_rtx_REG (Pmode,GP_REG_FIRST + MB_ABI_SUB_RETURN_ADDR_REGNUM); if (GET_CODE (target) == SYMBOL_REF) { -- cgit v1.1 From 0e1fd432e9cd5a2a4703c9ef9cc61255ea22cc49 Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Fri, 21 May 2021 16:12:58 +0000 Subject: arm: Auto-vectorization for MVE: vaddv This patch adds support for the reduc_plus_scal optab with MVE, which maps to the vaddv instruction. It moves the reduc_plus_scal_ expander from neon.md to vec-common.md and adds support for MVE to it. Since vaddv uses a 32-bits accumulator, we have to truncate it's result. For instance: int32_t test__s8x16 (int8_t *a) { int i; int8_t result = 0; for (i=0; i<16; i++) { result += a[i]; } return result; } is compiled into: vldrb.8 q3, [r0] vaddv.s8 r0, q3 sxtb r0, r0 bx lr If we used uint8_t instead of int8_t, we still use vaddv.s8 r0, q3, but truncate with uxtb r0, r0. 
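For reference, a sketch of the unsigned case described above (the function name is made up to mirror the signed test; only the element type changes):

  int32_t
  test__u8x16 (uint8_t *a)
  {
    int i;
    uint8_t result = 0;

    for (i = 0; i < 16; i++)
      {
        result += a[i];
      }
    return result;
  }

which is expected to compile to roughly

  vldrb.8  q3, [r0]
  vaddv.s8 r0, q3
  uxtb     r0, r0
  bx       lr

i.e. the same sequence as the signed test, with the final sign-extension replaced by a zero-extension.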
2021-05-25 Christophe Lyon gcc/ * config/arm/mve.md (mve_vaddvq_): Prefix with '@'. * config/arm/neon.md (reduc_plus_scal_): Move to .. * config/arm/vec-common.md: .. here. Add support for MVE. gcc/testsuite/ * gcc.target/arm/simd/mve-vaddv-1.c: New test. --- gcc/config/arm/mve.md | 2 +- gcc/config/arm/neon.md | 13 ------------- gcc/config/arm/vec-common.md | 26 ++++++++++++++++++++++++++ 3 files changed, 27 insertions(+), 14 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index 133ebe9..0a6ba80 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -464,7 +464,7 @@ ;; ;; [vaddvq_s, vaddvq_u]) ;; -(define_insn "mve_vaddvq_" +(define_insn "@mve_vaddvq_" [ (set (match_operand:SI 0 "s_register_operand" "=Te") (unspec:SI [(match_operand:MVE_2 1 "s_register_operand" "w")] diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 977adef..6a65733 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -1161,19 +1161,6 @@ DONE; }) -(define_expand "reduc_plus_scal_" - [(match_operand: 0 "nonimmediate_operand") - (match_operand:VQ 1 "s_register_operand")] - "ARM_HAVE_NEON__ARITH && !BYTES_BIG_ENDIAN" -{ - rtx step1 = gen_reg_rtx (mode); - - emit_insn (gen_quad_halves_plus (step1, operands[1])); - emit_insn (gen_reduc_plus_scal_ (operands[0], step1)); - - DONE; -}) - (define_expand "reduc_plus_scal_v2di" [(match_operand:DI 0 "nonimmediate_operand") (match_operand:V2DI 1 "s_register_operand")] diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md index e8b2901..8e35151 100644 --- a/gcc/config/arm/vec-common.md +++ b/gcc/config/arm/vec-common.md @@ -539,3 +539,29 @@ emit_insn (gen_mve_vst4q (operands[0], operands[1])); DONE; }) + +(define_expand "reduc_plus_scal_" + [(match_operand: 0 "nonimmediate_operand") + (match_operand:VQ 1 "s_register_operand")] + "ARM_HAVE__ARITH + && !(TARGET_HAVE_MVE && FLOAT_MODE_P (mode)) + && !BYTES_BIG_ENDIAN" +{ + if (TARGET_NEON) + { + rtx step1 = gen_reg_rtx (mode); + + emit_insn (gen_quad_halves_plus (step1, operands[1])); + emit_insn (gen_reduc_plus_scal_ (operands[0], step1)); + } + else + { + /* vaddv generates a 32 bits accumulator. */ + rtx op0 = gen_reg_rtx (SImode); + + emit_insn (gen_mve_vaddvq (VADDVQ_S, mode, op0, operands[1])); + emit_move_insn (operands[0], gen_lowpart (mode, op0)); + } + + DONE; +}) -- cgit v1.1 From 28484d00c45b7bf094a22a4fddf9ffdc7482c7e1 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 26 May 2021 20:44:49 +0200 Subject: i386: Autovectorize 4-byte vectors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 2021-05-26 Uroš Bizjak gcc/ * config/i386/i386.c (ix86_autovectorize_vector_modes): Add V4QImode and V16QImode for TARGET_SSE2. * doc/sourcebuild.texi (Vector-specific attributes): Add vect64 and vect32 description. gcc/testsuite/ * lib/target-supports.exp (check_effective_target_vect32): New. (available_vector_sizes): Append 32 for x86 targets. * gcc.dg/vect/pr71264.c (dg-final): Xfail scan dump for vect32 targets. * gcc.dg/vect/slp-28.c (dg-final): Adjust dumps for vect32 targets. * gcc.dg/vect/slp-3.c (dg-final): Ditto. * gcc.target/i386/pr100637-3b.c: New test. * gcc.target/i386/pr100637-3w.c: Ditto. * gcc.target/i386/pr100637-4b.c: Ditto. * gcc.target/i386/pr100637-4w.c: Ditto. 
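As a rough illustration of what this enables (the loop below is made up for exposition; the new pr100637-* tests listed above are the real coverage): with plain SSE2, a 4-element char loop such as

  void
  f (char *restrict a, char *restrict b)
  {
    int i;

    /* Only four byte elements, so a 16-byte (or even 8-byte) vector
       mode is of no use here; V4QImode now is.  */
    for (i = 0; i < 4; i++)
      a[i] += b[i];
  }

becomes a candidate for vectorization in a 4-byte vector, since V4QImode is now among the modes offered to the auto-vectorizer for TARGET_SSE2.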
--- gcc/config/i386/i386.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 28e6113..04649b4 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -22190,12 +22190,15 @@ ix86_autovectorize_vector_modes (vector_modes *modes, bool all) modes->safe_push (V16QImode); modes->safe_push (V32QImode); } - else if (TARGET_MMX_WITH_SSE) + else if (TARGET_SSE2) modes->safe_push (V16QImode); if (TARGET_MMX_WITH_SSE) modes->safe_push (V8QImode); + if (TARGET_SSE2) + modes->safe_push (V4QImode); + return 0; } -- cgit v1.1 From 04ba00d4ed735242c5284d2c623a3a9d42d94742 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 27 May 2021 09:22:01 +0200 Subject: i386: Add uavg_ceil patterns for 4-byte vectors [PR100637] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 2021-05-27 Uroš Bizjak gcc/ PR target/100637 * config/i386/mmx.md (uavgv4qi3_ceil): New insn pattern. (uavgv2hi3_ceil): Ditto. gcc/testsuite/ PR target/100637 * gcc.target/i386/pr100637-3b.c (avgu): New test. * gcc.target/i386/pr100637-3w.c (avgu): Ditto. --- gcc/config/i386/mmx.md | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 453e8ea..23d88a4 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -3270,6 +3270,47 @@ ix86_fixup_binary_operands_no_copy (PLUS, mode, operands); }) +(define_insn "uavgv4qi3_ceil" + [(set (match_operand:V4QI 0 "register_operand" "=x,Yw") + (truncate:V4QI + (lshiftrt:V4HI + (plus:V4HI + (plus:V4HI + (zero_extend:V4HI + (match_operand:V4QI 1 "register_operand" "%0,Yw")) + (zero_extend:V4HI + (match_operand:V4QI 2 "register_operand" "x,Yw"))) + (const_vector:V4HI [(const_int 1) (const_int 1) + (const_int 1) (const_int 1)])) + (const_int 1))))] + "TARGET_SSE2" + "@ + pavgb\t{%2, %0|%0, %2} + vpavgb\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" "noavx,avx") + (set_attr "type" "sseiadd") + (set_attr "mode" "TI")]) + +(define_insn "uavgv2hi3_ceil" + [(set (match_operand:V2HI 0 "register_operand" "=x,Yw") + (truncate:V2HI + (lshiftrt:V2SI + (plus:V2SI + (plus:V2SI + (zero_extend:V2SI + (match_operand:V2HI 1 "register_operand" "%0,Yw")) + (zero_extend:V2SI + (match_operand:V2HI 2 "register_operand" "x,Yw"))) + (const_vector:V2SI [(const_int 1) (const_int 1)])) + (const_int 1))))] + "TARGET_SSE2" + "@ + pavgw\t{%2, %0|%0, %2} + vpavgw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" "noavx,avx") + (set_attr "type" "sseiadd") + (set_attr "mode" "TI")]) + (define_insn "mmx_psadbw" [(set (match_operand:V1DI 0 "register_operand" "=y,x,Yw") (unspec:V1DI [(match_operand:V8QI 1 "register_operand" "0,0,Yw") -- cgit v1.1 From 262e75d22c350acbdf4c1fb4f224cc5d3d711eff Mon Sep 17 00:00:00 2001 From: Richard Earnshaw Date: Thu, 27 May 2021 10:25:37 +0100 Subject: arm: Remove use of opts_set in arm_configure_build_target [PR100767] The variable global_options_set is a reflection of which options have been explicitly set from the command line in the structure global_options. But it doesn't describe the contents of a cl_target_option. cl_target_option is a set of options to apply and once configured should represent a viable set of options without needing to know which were explicitly set by the user. Unfortunately arm_configure_build_target was incorrectly conflating the two. 
Fortunately, however, we do not really need to know this since the various override_options functions should have sanitized the target_options values before constructing a cl_target_option structure. It is safe, therefore, to simply drop this parameter to arm_configure_build_target and rely on checking that various string parameters are non-null before dereferencing them. gcc: PR target/100767 * config/arm/arm.c (arm_configure_build_target): Remove parameter opts_set, directly check opts parameters for being non-null. (arm_option_restore): Update call to arm_configure_build_target. (arm_option_override): Likewise. (arm_can_inline_p): Likewise. (arm_valid_target_attribute_tree): Likewise. * config/arm/arm-c.c (arm_pragma_target_parse): Likewise. * config/arm/arm-protos.h (arm_configure_build_target): Adjust prototype. --- gcc/config/arm/arm-c.c | 3 +-- gcc/config/arm/arm-protos.h | 3 +-- gcc/config/arm/arm.c | 23 ++++++++++------------- 3 files changed, 12 insertions(+), 17 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm-c.c b/gcc/config/arm/arm-c.c index 7f97b84..ae2139c 100644 --- a/gcc/config/arm/arm-c.c +++ b/gcc/config/arm/arm-c.c @@ -408,8 +408,7 @@ arm_pragma_target_parse (tree args, tree pop_target) target_option_current_node, but not handle_pragma_target. */ target_option_current_node = cur_tree; arm_configure_build_target (&arm_active_target, - TREE_TARGET_OPTION (cur_tree), - &global_options_set, false); + TREE_TARGET_OPTION (cur_tree), false); } /* Update macros if target_node changes. The global state will be restored diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h index ffccaa7..9b1f613 100644 --- a/gcc/config/arm/arm-protos.h +++ b/gcc/config/arm/arm-protos.h @@ -243,8 +243,7 @@ extern bool arm_change_mode_p (tree); extern tree arm_valid_target_attribute_tree (tree, struct gcc_options *, struct gcc_options *); extern void arm_configure_build_target (struct arm_build_target *, - struct cl_target_option *, - struct gcc_options *, bool); + struct cl_target_option *, bool); extern void arm_option_reconfigure_globals (void); extern void arm_options_perform_arch_sanity_checks (void); extern void arm_pr_long_calls (struct cpp_reader *); diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 9377aae..7b37e1b 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -3054,9 +3054,10 @@ arm_override_options_after_change (void) /* Implement TARGET_OPTION_RESTORE. */ static void arm_option_restore (struct gcc_options */* opts */, - struct gcc_options *opts_set, struct cl_target_option *ptr) + struct gcc_options */* opts_set */, + struct cl_target_option *ptr) { - arm_configure_build_target (&arm_active_target, ptr, opts_set, false); + arm_configure_build_target (&arm_active_target, ptr, false); } /* Reset options between modes that the user has specified. 
*/ @@ -3179,7 +3180,6 @@ static sbitmap isa_quirkbits; void arm_configure_build_target (struct arm_build_target *target, struct cl_target_option *opts, - struct gcc_options *opts_set, bool warn_compatible) { const cpu_option *arm_selected_tune = NULL; @@ -3194,7 +3194,7 @@ arm_configure_build_target (struct arm_build_target *target, target->core_name = NULL; target->arch_name = NULL; - if (opts_set->x_arm_arch_string) + if (opts->x_arm_arch_string) { arm_selected_arch = arm_parse_arch_option_name (all_architectures, "-march", @@ -3202,7 +3202,7 @@ arm_configure_build_target (struct arm_build_target *target, arch_opts = strchr (opts->x_arm_arch_string, '+'); } - if (opts_set->x_arm_cpu_string) + if (opts->x_arm_cpu_string) { arm_selected_cpu = arm_parse_cpu_option_name (all_cores, "-mcpu", opts->x_arm_cpu_string); @@ -3212,7 +3212,7 @@ arm_configure_build_target (struct arm_build_target *target, options for tuning. */ } - if (opts_set->x_arm_tune_string) + if (opts->x_arm_tune_string) { arm_selected_tune = arm_parse_cpu_option_name (all_cores, "-mtune", opts->x_arm_tune_string); @@ -3476,8 +3476,7 @@ arm_option_override (void) } cl_target_option_save (&opts, &global_options, &global_options_set); - arm_configure_build_target (&arm_active_target, &opts, &global_options_set, - true); + arm_configure_build_target (&arm_active_target, &opts, true); #ifdef SUBTARGET_OVERRIDE_OPTIONS SUBTARGET_OVERRIDE_OPTIONS; @@ -32982,10 +32981,8 @@ arm_can_inline_p (tree caller, tree callee) caller_target.isa = sbitmap_alloc (isa_num_bits); callee_target.isa = sbitmap_alloc (isa_num_bits); - arm_configure_build_target (&caller_target, caller_opts, &global_options_set, - false); - arm_configure_build_target (&callee_target, callee_opts, &global_options_set, - false); + arm_configure_build_target (&caller_target, caller_opts, false); + arm_configure_build_target (&callee_target, callee_opts, false); if (!bitmap_subset_p (callee_target.isa, caller_target.isa)) can_inline = false; @@ -33121,7 +33118,7 @@ arm_valid_target_attribute_tree (tree args, struct gcc_options *opts, return NULL_TREE; cl_target_option_save (&cl_opts, opts, opts_set); - arm_configure_build_target (&arm_active_target, &cl_opts, opts_set, false); + arm_configure_build_target (&arm_active_target, &cl_opts, false); arm_option_check_internal (opts); /* Do any overrides, such as global options arch=xxx. We do this since arm_active_target was overridden. */ -- cgit v1.1 From 6c67afaf524a5e0e9220f78271a0f5764ca27bd0 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 27 May 2021 14:46:45 +0200 Subject: i386: Add XOP comparisons for 4- and 8-byte vectors [PR100637] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 2021-05-27 Uroš Bizjak gcc/ PR target/100637 * config/i386/i386-expand.c (ix86_expand_int_sse_cmp): For TARGET_XOP bypass SSE comparisons for all supported vector modes. * config/i386/mmx.md (*xop_maskcmp3): New insn pattern. (*xop_maskcmp3): Ditto. (*xop_maskcmp_uns3): Ditto. (*xop_maskcmp_uns3): Ditto. 
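As an illustration (the function below is made up for exposition, not taken from the patch): with -mxop, a sub-16-byte comparison loop such as

  void
  f (signed char *restrict r, const signed char *restrict a,
     const signed char *restrict b)
  {
    int i;

    /* Element-wise signed compare producing a -1/0 mask in an
       8-byte (V8QI) vector.  */
    for (i = 0; i < 8; i++)
      r[i] = a[i] > b[i] ? -1 : 0;
  }

is the kind of case the new patterns target: the comparison can be emitted as a single XOP vpcom instruction (vpcomgtb here) instead of being routed through the generic SSE comparison expansion.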
--- gcc/config/i386/i386-expand.c | 4 ++-- gcc/config/i386/mmx.md | 56 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 931b336..4185f58 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -4124,8 +4124,8 @@ ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1, /* XOP supports all of the comparisons on all 128-bit vector int types. */ if (TARGET_XOP - && (mode == V16QImode || mode == V8HImode - || mode == V4SImode || mode == V2DImode)) + && GET_MODE_CLASS (mode) == MODE_VECTOR_INT + && GET_MODE_SIZE (mode) <= 16) ; /* AVX512F supports all of the comparsions on all 128/256/512-bit vector int types. */ diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 23d88a4..35e4123 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -2121,6 +2121,62 @@ (set_attr "type" "ssecmp") (set_attr "mode" "TI")]) +(define_insn "*xop_maskcmp3" + [(set (match_operand:MMXMODEI 0 "register_operand" "=x") + (match_operator:MMXMODEI 1 "ix86_comparison_int_operator" + [(match_operand:MMXMODEI 2 "register_operand" "x") + (match_operand:MMXMODEI 3 "register_operand" "x")]))] + "TARGET_XOP" + "vpcom%Y1\t{%3, %2, %0|%0, %2, %3}" + [(set_attr "type" "sse4arg") + (set_attr "prefix_data16" "0") + (set_attr "prefix_rep" "0") + (set_attr "prefix_extra" "2") + (set_attr "length_immediate" "1") + (set_attr "mode" "TI")]) + +(define_insn "*xop_maskcmp3" + [(set (match_operand:VI_32 0 "register_operand" "=x") + (match_operator:VI_32 1 "ix86_comparison_int_operator" + [(match_operand:VI_32 2 "register_operand" "x") + (match_operand:VI_32 3 "register_operand" "x")]))] + "TARGET_XOP" + "vpcom%Y1\t{%3, %2, %0|%0, %2, %3}" + [(set_attr "type" "sse4arg") + (set_attr "prefix_data16" "0") + (set_attr "prefix_rep" "0") + (set_attr "prefix_extra" "2") + (set_attr "length_immediate" "1") + (set_attr "mode" "TI")]) + +(define_insn "*xop_maskcmp_uns3" + [(set (match_operand:MMXMODEI 0 "register_operand" "=x") + (match_operator:MMXMODEI 1 "ix86_comparison_uns_operator" + [(match_operand:MMXMODEI 2 "register_operand" "x") + (match_operand:MMXMODEI 3 "register_operand" "x")]))] + "TARGET_XOP" + "vpcom%Y1u\t{%3, %2, %0|%0, %2, %3}" + [(set_attr "type" "ssecmp") + (set_attr "prefix_data16" "0") + (set_attr "prefix_rep" "0") + (set_attr "prefix_extra" "2") + (set_attr "length_immediate" "1") + (set_attr "mode" "TI")]) + +(define_insn "*xop_maskcmp_uns3" + [(set (match_operand:VI_32 0 "register_operand" "=x") + (match_operator:VI_32 1 "ix86_comparison_uns_operator" + [(match_operand:VI_32 2 "register_operand" "x") + (match_operand:VI_32 3 "register_operand" "x")]))] + "TARGET_XOP" + "vpcom%Y1u\t{%3, %2, %0|%0, %2, %3}" + [(set_attr "type" "ssecmp") + (set_attr "prefix_data16" "0") + (set_attr "prefix_rep" "0") + (set_attr "prefix_extra" "2") + (set_attr "length_immediate" "1") + (set_attr "mode" "TI")]) + (define_expand "vec_cmp" [(set (match_operand:MMXMODEI 0 "register_operand") (match_operator:MMXMODEI 1 "" -- cgit v1.1 From a617e5d516b508dfea411cf3e82e407f7beed170 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 27 May 2021 22:23:52 +0200 Subject: i386: Remove unneeded binary operand fixup from expanders. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is no need to call ix86_fixup_binary_operands when there are only one or no memory operands allowed. 
2021-05-27 Uroš Bizjak gcc/ * config/i386/mmx.md (addv2sf3): Do not call ix86_fixup_binary_operands_no_copy. (subv2sf3): Ditto. (mulv2sf3): Ditto. (v2sf3): Ditto. (3): Ditto. (3): Remove expander. (3): Rename from "*3". (mulv4hi): Do not call ix86_fixup_binary_operands_no_copy. (mulv2hi3): Remove expander. (mulv2hi3): Rename from *mulv2hi3. (mulv2hi3_highpart): Remove expander. (mulv2hi3_highpart): Rename from *mulv2hi3_highpart. (3): Rename from "*mmx_3". (3): Remove expander. (SMAXMIN_MMXMODEI): Remove mode iterator. (v4hi3): New expander. (v4qi3): Rename from *v4qi3. (v2hi3): Rename from *v2hi3. (3): Remove expander. (SMAXMIN_VI_32): Remove mode iterator. (3): Rename from "*mmx_3". (3): Remove expander. (UMAXMIN_MMXMODEI): Remove mode iterator. (v8qi3): New expander. (v4qi3): Rename from *v4qi3. (v2hi3): Rename from *v2hi3. (3): Remove expander. (UMAXMIN_VI_32): Remove mode iterator. (v2hi3): Remove expander. (v2hi3): Rename from *v2hi3. (3): Do not call ix86_fixup_binary_operands_no_copy. (3): Remove expander. (3): Rename from "*3". (uavg3_ceil): Do not call ix86_fixup_binary_operands_no_copy. * config/i386/sse.md (div3): Do not call ix86_fixup_binary_operands_no_copy. (div3): Ditto. (3): Ditto. (smulhrsv4hi3): Ditto. (smulhrsv2hi3): Ditto. --- gcc/config/i386/mmx.md | 199 +++++++++++++------------------------------------ gcc/config/i386/sse.md | 17 +---- 2 files changed, 56 insertions(+), 160 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 35e4123..f39e062 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -417,8 +417,7 @@ (plus:V2SF (match_operand:V2SF 1 "register_operand") (match_operand:V2SF 2 "register_operand")))] - "TARGET_MMX_WITH_SSE" - "ix86_fixup_binary_operands_no_copy (PLUS, V2SFmode, operands);") + "TARGET_MMX_WITH_SSE") (define_insn "*mmx_addv2sf3" [(set (match_operand:V2SF 0 "register_operand" "=y,x,v") @@ -455,8 +454,7 @@ (minus:V2SF (match_operand:V2SF 1 "register_operand") (match_operand:V2SF 2 "register_operand")))] - "TARGET_MMX_WITH_SSE" - "ix86_fixup_binary_operands_no_copy (MINUS, V2SFmode, operands);") + "TARGET_MMX_WITH_SSE") (define_insn "*mmx_subv2sf3" [(set (match_operand:V2SF 0 "register_operand" "=y,y,x,v") @@ -489,8 +487,7 @@ (mult:V2SF (match_operand:V2SF 1 "register_operand") (match_operand:V2SF 2 "register_operand")))] - "TARGET_MMX_WITH_SSE" - "ix86_fixup_binary_operands_no_copy (MULT, V2SFmode, operands);") + "TARGET_MMX_WITH_SSE") (define_insn "*mmx_mulv2sf3" [(set (match_operand:V2SF 0 "register_operand" "=y,x,v") @@ -542,8 +539,6 @@ (operands[0], operands[1], operands[2])); DONE; } - else - ix86_fixup_binary_operands_no_copy (, V2SFmode, operands); }) ;; These versions of the min/max patterns are intentionally ignorant of @@ -709,7 +704,7 @@ (vec_select:SF (match_dup 1) (parallel [(match_operand:SI 3 "const_0_to_1_operand")]))))] - "TARGET_MMX_WITH_SSE && TARGET_SSE3 + "TARGET_SSE3 && TARGET_MMX_WITH_SSE && INTVAL (operands[2]) != INTVAL (operands[3])" "@ haddps\t{%0, %0|%0, %0} @@ -747,7 +742,7 @@ (vec_select:SF (match_dup 1) (parallel [(const_int 1)]))))] - "TARGET_MMX_WITH_SSE && TARGET_SSE3" + "TARGET_SSE3 && TARGET_MMX_WITH_SSE" "@ hsubps\t{%0, %0|%0, %0} vhsubps\t{%1, %1, %0|%0, %1, %1}" @@ -1091,7 +1086,7 @@ (define_insn "fixuns_truncv2sfv2si2" [(set (match_operand:V2SI 0 "register_operand" "=v") (unsigned_fix:V2SI (match_operand:V2SF 1 "register_operand" "v")))] - "TARGET_MMX_WITH_SSE && TARGET_AVX512VL" + "TARGET_AVX512VL && TARGET_MMX_WITH_SSE" "vcvttps2udq\t{%1, 
%0|%0, %1}" [(set_attr "type" "ssecvt") (set_attr "prefix" "evex") @@ -1119,7 +1114,7 @@ (define_insn "floatunsv2siv2sf2" [(set (match_operand:V2SF 0 "register_operand" "=v") (unsigned_float:V2SF (match_operand:V2SI 1 "register_operand" "v")))] - "TARGET_MMX_WITH_SSE && TARGET_AVX512VL" + "TARGET_AVX512VL && TARGET_MMX_WITH_SSE" "vcvtudq2ps\t{%1, %0|%0, %1}" [(set_attr "type" "ssecvt") (set_attr "prefix" "evex") @@ -1361,8 +1356,7 @@ (plusminus:MMXMODEI (match_operand:MMXMODEI 1 "register_operand") (match_operand:MMXMODEI 2 "register_operand")))] - "TARGET_MMX_WITH_SSE" - "ix86_fixup_binary_operands_no_copy (, mode, operands);") + "TARGET_MMX_WITH_SSE") (define_insn "*mmx_3" [(set (match_operand:MMXMODEI8 0 "register_operand" "=y,x,") @@ -1390,21 +1384,12 @@ "TARGET_SSE2" "operands[2] = force_reg (mode, CONST0_RTX (mode));") -(define_expand "3" - [(set (match_operand:VI_32 0 "register_operand") - (plusminus:VI_32 - (match_operand:VI_32 1 "register_operand") - (match_operand:VI_32 2 "register_operand")))] - "TARGET_SSE2" - "ix86_fixup_binary_operands_no_copy (, mode, operands);") - -(define_insn "*3" +(define_insn "3" [(set (match_operand:VI_32 0 "register_operand" "=x,Yw") (plusminus:VI_32 (match_operand:VI_32 1 "register_operand" "0,Yw") (match_operand:VI_32 2 "register_operand" "x,Yw")))] - "TARGET_SSE2 - && ix86_binary_operator_ok (, mode, operands)" + "TARGET_SSE2" "@ p\t{%2, %0|%0, %2} vp\t{%2, %1, %0|%0, %1, %2}" @@ -1441,8 +1426,7 @@ (sat_plusminus:VI_32 (match_operand:VI_32 1 "register_operand" "0,Yw") (match_operand:VI_32 2 "register_operand" "x,Yw")))] - "TARGET_SSE2 - && ix86_binary_operator_ok (, mode, operands)" + "TARGET_SSE2" "@ p\t{%2, %0|%0, %2} vp\t{%2, %1, %0|%0, %1, %2}" @@ -1461,8 +1445,7 @@ [(set (match_operand:V4HI 0 "register_operand") (mult:V4HI (match_operand:V4HI 1 "register_operand") (match_operand:V4HI 2 "register_operand")))] - "TARGET_MMX_WITH_SSE" - "ix86_fixup_binary_operands_no_copy (MULT, V4HImode, operands);") + "TARGET_MMX_WITH_SSE") (define_insn "*mmx_mulv4hi3" [(set (match_operand:V4HI 0 "register_operand" "=y,x,Yw") @@ -1479,19 +1462,11 @@ (set_attr "type" "mmxmul,ssemul,ssemul") (set_attr "mode" "DI,TI,TI")]) -(define_expand "mulv2hi3" - [(set (match_operand:V2HI 0 "register_operand") - (mult:V2HI (match_operand:V2HI 1 "register_operand") - (match_operand:V2HI 2 "register_operand")))] - "TARGET_SSE2" - "ix86_fixup_binary_operands_no_copy (MULT, V2HImode, operands);") - -(define_insn "*mulv2hi3" +(define_insn "mulv2hi3" [(set (match_operand:V2HI 0 "register_operand" "=x,Yw") (mult:V2HI (match_operand:V2HI 1 "register_operand" "%0,Yw") (match_operand:V2HI 2 "register_operand" "x,Yw")))] - "TARGET_SSE2 - && ix86_binary_operator_ok (MULT, V2HImode, operands)" + "TARGET_SSE2" "@ pmullw\t{%2, %0|%0, %2} vpmullw\t{%2, %1, %0|%0, %1, %2}" @@ -1579,10 +1554,9 @@ (any_extend:V4SI (match_operand:V4HI 2 "register_operand"))) (const_int 16))))] - "TARGET_MMX_WITH_SSE" - "ix86_fixup_binary_operands_no_copy (MULT, V4HImode, operands);") + "TARGET_MMX_WITH_SSE") -(define_insn "*mulv2hi3_highpart" +(define_insn "mulv2hi3_highpart" [(set (match_operand:V2HI 0 "register_operand" "=x,Yw") (truncate:V2HI (lshiftrt:V2SI @@ -1592,8 +1566,7 @@ (any_extend:V2SI (match_operand:V2HI 2 "register_operand" "x,Yw"))) (const_int 16))))] - "TARGET_SSE2 - && ix86_binary_operator_ok (MULT, V2HImode, operands)" + "TARGET_SSE2" "@ pmulhw\t{%2, %0|%0, %2} vpmulhw\t{%2, %1, %0|%0, %1, %2}" @@ -1601,19 +1574,6 @@ (set_attr "type" "ssemul") (set_attr "mode" "TI")]) -(define_expand 
"mulv2hi3_highpart" - [(set (match_operand:V2HI 0 "register_operand") - (truncate:V2HI - (lshiftrt:V2SI - (mult:V2SI - (any_extend:V2SI - (match_operand:V2HI 1 "register_operand")) - (any_extend:V2SI - (match_operand:V2HI 2 "register_operand"))) - (const_int 16))))] - "TARGET_SSE2" - "ix86_fixup_binary_operands_no_copy (MULT, V2HImode, operands);") - (define_expand "mmx_pmaddwd" [(set (match_operand:V2SI 0 "register_operand") (plus:V2SI @@ -1744,13 +1704,12 @@ ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(define_insn "*mmx_3" +(define_insn "3" [(set (match_operand:MMXMODE14 0 "register_operand" "=Yr,*x,Yv") (smaxmin:MMXMODE14 (match_operand:MMXMODE14 1 "register_operand" "%0,0,Yv") (match_operand:MMXMODE14 2 "register_operand" "Yr,*x,Yv")))] - "TARGET_MMX_WITH_SSE && TARGET_SSE4_1 - && ix86_binary_operator_ok (, mode, operands)" + "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE" "@ p\t{%2, %0|%0, %2} p\t{%2, %0|%0, %2} @@ -1787,24 +1746,19 @@ (set_attr "type" "mmxadd,sseiadd,sseiadd") (set_attr "mode" "DI,TI,TI")]) -(define_mode_iterator SMAXMIN_MMXMODEI - [(V8QI "TARGET_SSE4_1") V4HI (V2SI "TARGET_SSE4_1")]) - -(define_expand "3" - [(set (match_operand:SMAXMIN_MMXMODEI 0 "register_operand") - (smaxmin:SMAXMIN_MMXMODEI - (match_operand:SMAXMIN_MMXMODEI 1 "register_operand") - (match_operand:SMAXMIN_MMXMODEI 2 "register_operand")))] - "TARGET_MMX_WITH_SSE" - "ix86_fixup_binary_operands_no_copy (, mode, operands);") +(define_expand "v4hi3" + [(set (match_operand:V4HI 0 "register_operand") + (smaxmin:V4HI + (match_operand:V4HI 1 "register_operand") + (match_operand:V4HI 2 "register_operand")))] + "TARGET_MMX_WITH_SSE") -(define_insn "*v4qi3" +(define_insn "v4qi3" [(set (match_operand:V4QI 0 "register_operand" "=Yr,*x,Yv") (smaxmin:V4QI (match_operand:V4QI 1 "register_operand" "%0,0,Yv") (match_operand:V4QI 2 "register_operand" "Yr,*x,Yv")))] - "TARGET_SSE4_1 - && ix86_binary_operator_ok (, V4QImode, operands)" + "TARGET_SSE4_1" "@ pb\t{%2, %0|%0, %2} pb\t{%2, %0|%0, %2} @@ -1815,13 +1769,12 @@ (set_attr "prefix" "orig,orig,vex") (set_attr "mode" "TI")]) -(define_insn "*v2hi3" +(define_insn "v2hi3" [(set (match_operand:V2HI 0 "register_operand" "=x,Yw") (smaxmin:V2HI (match_operand:V2HI 1 "register_operand" "%0,Yw") (match_operand:V2HI 2 "register_operand" "x,Yw")))] - "TARGET_SSE2 - && ix86_binary_operator_ok (, V2HImode, operands)" + "TARGET_SSE2" "@ pw\t{%2, %0|%0, %2} vpw\t{%2, %1, %0|%0, %1, %2}" @@ -1829,23 +1782,12 @@ (set_attr "type" "sseiadd") (set_attr "mode" "TI")]) -(define_mode_iterator SMAXMIN_VI_32 [(V4QI "TARGET_SSE4_1") V2HI]) - -(define_expand "3" - [(set (match_operand:SMAXMIN_VI_32 0 "register_operand") - (smaxmin:SMAXMIN_VI_32 - (match_operand:SMAXMIN_VI_32 1 "register_operand") - (match_operand:SMAXMIN_VI_32 2 "register_operand")))] - "TARGET_SSE2" - "ix86_fixup_binary_operands_no_copy (, V4HImode, operands);") - -(define_insn "*mmx_3" +(define_insn "3" [(set (match_operand:MMXMODE24 0 "register_operand" "=Yr,*x,Yv") (umaxmin:MMXMODE24 (match_operand:MMXMODE24 1 "register_operand" "%0,0,Yv") (match_operand:MMXMODE24 2 "register_operand" "Yr,*x,Yv")))] - "TARGET_MMX_WITH_SSE && TARGET_SSE4_1 - && ix86_binary_operator_ok (, mode, operands)" + "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE" "@ p\t{%2, %0|%0, %2} p\t{%2, %0|%0, %2} @@ -1882,24 +1824,19 @@ (set_attr "type" "mmxadd,sseiadd,sseiadd") (set_attr "mode" "DI,TI,TI")]) -(define_mode_iterator UMAXMIN_MMXMODEI - [V8QI (V4HI "TARGET_SSE4_1") (V2SI "TARGET_SSE4_1")]) - -(define_expand "3" - [(set 
(match_operand:UMAXMIN_MMXMODEI 0 "register_operand") - (umaxmin:UMAXMIN_MMXMODEI - (match_operand:UMAXMIN_MMXMODEI 1 "register_operand") - (match_operand:UMAXMIN_MMXMODEI 2 "register_operand")))] - "TARGET_MMX_WITH_SSE" - "ix86_fixup_binary_operands_no_copy (, mode, operands);") +(define_expand "v8qi3" + [(set (match_operand:V8QI 0 "register_operand") + (umaxmin:V8QI + (match_operand:V8QI 1 "register_operand") + (match_operand:V8QI 2 "register_operand")))] + "TARGET_MMX_WITH_SSE") -(define_insn "*v4qi3" +(define_insn "v4qi3" [(set (match_operand:V4QI 0 "register_operand" "=x,Yw") (umaxmin:V4QI (match_operand:V4QI 1 "register_operand" "%0,Yw") (match_operand:V4QI 2 "register_operand" "x,Yw")))] - "TARGET_SSE2 - && ix86_binary_operator_ok (, V4QImode, operands)" + "TARGET_SSE2" "@ pb\t{%2, %0|%0, %2} vpb\t{%2, %1, %0|%0, %1, %2}" @@ -1907,13 +1844,12 @@ (set_attr "type" "sseiadd") (set_attr "mode" "TI")]) -(define_insn "*v2hi3" +(define_insn "v2hi3" [(set (match_operand:V2HI 0 "register_operand" "=Yr,*x,Yv") (umaxmin:V2HI (match_operand:V2HI 1 "register_operand" "%0,0,Yv") (match_operand:V2HI 2 "register_operand" "Yr,*x,Yv")))] - "TARGET_SSE4_1 - && ix86_binary_operator_ok (, V2HImode, operands)" + "TARGET_SSE4_1" "@ pw\t{%2, %0|%0, %2} pw\t{%2, %0|%0, %2} @@ -1924,16 +1860,6 @@ (set_attr "prefix" "orig,orig,vex") (set_attr "mode" "TI")]) -(define_mode_iterator UMAXMIN_VI_32 [V4QI (V2HI "TARGET_SSE4_1")]) - -(define_expand "3" - [(set (match_operand:UMAXMIN_VI_32 0 "register_operand") - (umaxmin:UMAXMIN_VI_32 - (match_operand:UMAXMIN_VI_32 1 "register_operand") - (match_operand:UMAXMIN_VI_32 2 "register_operand")))] - "TARGET_SSE2" - "ix86_fixup_binary_operands_no_copy (, V4HImode, operands);") - (define_insn "ssse3_abs2" [(set (match_operand:MMXMODEI 0 "register_operand" "=y,Yv") (abs:MMXMODEI @@ -1953,7 +1879,7 @@ [(set (match_operand:MMXMODEI 0 "register_operand") (abs:MMXMODEI (match_operand:MMXMODEI 1 "register_operand")))] - "TARGET_MMX_WITH_SSE && TARGET_SSSE3") + "TARGET_SSSE3 && TARGET_MMX_WITH_SSE") (define_insn "abs2" [(set (match_operand:VI_32 0 "register_operand" "=Yv") @@ -2025,7 +1951,7 @@ (match_operand:DI 2 "nonmemory_operand")))] "TARGET_MMX_WITH_SSE") -(define_insn "*v2hi3" +(define_insn "v2hi3" [(set (match_operand:V2HI 0 "register_operand" "=x,Yw") (any_shift:V2HI (match_operand:V2HI 1 "register_operand" "0,Yw") @@ -2042,13 +1968,6 @@ (const_string "0"))) (set_attr "mode" "TI")]) -(define_expand "v2hi3" - [(set (match_operand:V2HI 0 "register_operand") - (any_shift:V2HI - (match_operand:V2HI 1 "register_operand") - (match_operand:DI 2 "nonmemory_operand")))] - "TARGET_SSE2") - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; Parallel integral comparisons @@ -2084,8 +2003,7 @@ (eq:VI_32 (match_operand:VI_32 1 "register_operand" "%0,x") (match_operand:VI_32 2 "register_operand" "x,x")))] - "TARGET_SSE2 - && ix86_binary_operator_ok (EQ, mode, operands)" + "TARGET_SSE2" "@ pcmpeq\t{%2, %0|%0, %2} vpcmpeq\t{%2, %1, %0|%0, %1, %2}" @@ -2441,8 +2359,7 @@ (any_logic:MMXMODEI (match_operand:MMXMODEI 1 "register_operand") (match_operand:MMXMODEI 2 "register_operand")))] - "TARGET_MMX_WITH_SSE" - "ix86_fixup_binary_operands_no_copy (, mode, operands);") + "TARGET_MMX_WITH_SSE") (define_insn "*mmx_3" [(set (match_operand:MMXMODEI 0 "register_operand" "=y,x,x,v") @@ -2461,21 +2378,12 @@ (set_attr "type" "mmxadd,sselog,sselog,sselog") (set_attr "mode" "DI,TI,TI,TI")]) -(define_expand "3" - [(set (match_operand:VI_32 0 "register_operand") - (any_logic:VI_32 
- (match_operand:VI_32 1 "register_operand") - (match_operand:VI_32 2 "register_operand")))] - "TARGET_SSE2" - "ix86_fixup_binary_operands_no_copy (, mode, operands);") - -(define_insn "*3" +(define_insn "3" [(set (match_operand:VI_32 0 "register_operand" "=x,x,v") (any_logic:VI_32 (match_operand:VI_32 1 "register_operand" "%0,x,v") (match_operand:VI_32 2 "register_operand" "x,x,v")))] - "TARGET_SSE2 - && ix86_binary_operator_ok (, mode, operands)" + "TARGET_SSE2" "@ p\t{%2, %0|%0, %2} vp\t{%2, %1, %0|%0, %1, %2} @@ -2672,7 +2580,7 @@ (match_operand:SI 2 "nonimmediate_operand" "rm,rm")) (match_operand:V2SI 1 "register_operand" "0,Yv") (match_operand:SI 3 "const_int_operand")))] - "TARGET_MMX_WITH_SSE && TARGET_SSE4_1 + "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE && ((unsigned) exact_log2 (INTVAL (operands[3])) < GET_MODE_NUNITS (V2SImode))" { @@ -2752,7 +2660,7 @@ (match_operand:QI 2 "nonimmediate_operand" "rm,rm")) (match_operand:V8QI 1 "register_operand" "0,YW") (match_operand:SI 3 "const_int_operand")))] - "TARGET_MMX_WITH_SSE && TARGET_SSE4_1 + "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE && ((unsigned) exact_log2 (INTVAL (operands[3])) < GET_MODE_NUNITS (V8QImode))" { @@ -2822,7 +2730,7 @@ (vec_select:QI (match_operand:V8QI 1 "register_operand" "YW,YW") (parallel [(match_operand:SI 2 "const_0_to_7_operand" "n,n")])))] - "TARGET_MMX_WITH_SSE && TARGET_SSE4_1" + "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE" "@ %vpextrb\t{%2, %1, %k0|%k0, %1, %2} %vpextrb\t{%2, %1, %0|%0, %1, %2}" @@ -2839,7 +2747,7 @@ (vec_select:QI (match_operand:V8QI 1 "register_operand" "YW") (parallel [(match_operand:SI 2 "const_0_to_7_operand" "n")]))))] - "TARGET_MMX_WITH_SSE && TARGET_SSE4_1" + "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE" "%vpextrb\t{%2, %1, %k0|%k0, %1, %2}" [(set_attr "type" "sselog1") (set_attr "prefix_data16" "1") @@ -3321,10 +3229,7 @@ (match_dup 3)) (const_int 1))))] "TARGET_MMX_WITH_SSE" -{ - operands[3] = CONST1_RTX(mode); - ix86_fixup_binary_operands_no_copy (PLUS, mode, operands); -}) + "operands[3] = CONST1_RTX(mode);") (define_insn "uavgv4qi3_ceil" [(set (match_operand:V4QI 0 "register_operand" "=x,Yw") diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 7269147..1b3df21 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -1999,8 +1999,7 @@ [(set (match_operand:VF2 0 "register_operand") (div:VF2 (match_operand:VF2 1 "register_operand") (match_operand:VF2 2 "vector_operand")))] - "TARGET_SSE2" - "ix86_fixup_binary_operands_no_copy (DIV, mode, operands);") + "TARGET_SSE2") (define_expand "div3" [(set (match_operand:VF1 0 "register_operand") @@ -2008,8 +2007,6 @@ (match_operand:VF1 2 "vector_operand")))] "TARGET_SSE" { - ix86_fixup_binary_operands_no_copy (DIV, mode, operands); - if (TARGET_SSE_MATH && TARGET_RECIP_VEC_DIV && !optimize_insn_for_size_p () @@ -12801,7 +12798,7 @@ { if (TARGET_AVX512F && (mode == V8DImode || TARGET_AVX512VL)) - ix86_fixup_binary_operands_no_copy (, mode, operands); + ; else { enum rtx_code code; @@ -17186,10 +17183,7 @@ (match_dup 3)) (const_int 1))))] "TARGET_MMX_WITH_SSE && TARGET_SSSE3" -{ - operands[3] = CONST1_RTX(V4HImode); - ix86_fixup_binary_operands_no_copy (MULT, V4HImode, operands); -}) + "operands[3] = CONST1_RTX(V4HImode);") (define_expand "ssse3_pmulhrswv4hi3" [(set (match_operand:V4HI 0 "register_operand") @@ -17254,10 +17248,7 @@ (match_dup 3)) (const_int 1))))] "TARGET_SSSE3" -{ - operands[3] = CONST1_RTX(V2HImode); - ix86_fixup_binary_operands_no_copy (MULT, V2HImode, operands); -}) + "operands[3] = CONST1_RTX(V2HImode);") 
(define_insn "*smulhrsv2hi3" [(set (match_operand:V2HI 0 "register_operand" "=x,Yv") -- cgit v1.1 From f69a1aee157f496e6d3f373fff1b26b03bbf3409 Mon Sep 17 00:00:00 2001 From: Cooper Qu Date: Fri, 28 May 2021 17:02:05 +0800 Subject: C-SKY: Define HAVE_sync_compare_and_swap*. The SYNC operations are implemented as library functions, not NSN patterns. As a result, the HAVE defines for the patterns are not defined. We need to define them to generate the corresponding __GCC_HAVE_SYNC_COMPARE_AND_SWAP_* and __GCC_ATOMIC_*_LOCK_FREE defines. gcc/ * config/csky/csky-linux-elf.h (HAVE_sync_compare_and_swapqi): Defined. (HAVE_sync_compare_and_swaphi): Likewise. (HAVE_sync_compare_and_swapsi): Likewise. --- gcc/config/csky/csky-linux-elf.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/csky/csky-linux-elf.h b/gcc/config/csky/csky-linux-elf.h index 58a1f39..e94eb8c 100644 --- a/gcc/config/csky/csky-linux-elf.h +++ b/gcc/config/csky/csky-linux-elf.h @@ -133,3 +133,13 @@ #ifdef IN_LIBGCC2 extern int cacheflush (void *__addr, const int __nbytes, const int __op); #endif + +/* The SYNC operations are implemented as library functions, not + INSN patterns. As a result, the HAVE defines for the patterns are + not defined. We need to define them to generate the corresponding + __GCC_HAVE_SYNC_COMPARE_AND_SWAP_* and __GCC_ATOMIC_*_LOCK_FREE + defines. */ + +#define HAVE_sync_compare_and_swapqi 1 +#define HAVE_sync_compare_and_swaphi 1 +#define HAVE_sync_compare_and_swapsi 1 -- cgit v1.1 From 12bb62fbb47bd2848746da53c72ed068a4274daf Mon Sep 17 00:00:00 2001 From: Jeff Law Date: Sat, 29 May 2021 09:37:30 -0600 Subject: [committed][PR bootstrap/100730] Fix warnings in H8 target files gcc/ * config/h8300/h8300.c (h8300_emit_stack_adjustment): Drop unused parameter. Call callers fixed. (push): Likewise. (output_plussi): Add FALLTHRU markers. (h8300_shift_needs_scratch_p): Add gcc_unreachable marker. --- gcc/config/h8300/h8300.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/h8300/h8300.c b/gcc/config/h8300/h8300.c index 0ae8030..ba2b9da 100644 --- a/gcc/config/h8300/h8300.c +++ b/gcc/config/h8300/h8300.c @@ -91,7 +91,7 @@ static int h8300_interrupt_function_p (tree); static int h8300_saveall_function_p (tree); static int h8300_monitor_function_p (tree); static int h8300_os_task_function_p (tree); -static void h8300_emit_stack_adjustment (int, HOST_WIDE_INT, bool); +static void h8300_emit_stack_adjustment (int, HOST_WIDE_INT); static HOST_WIDE_INT round_frame_size (HOST_WIDE_INT); static unsigned int compute_saved_regs (void); static const char *cond_string (enum rtx_code); @@ -452,7 +452,7 @@ Fpa (rtx par) SIZE to adjust the stack pointer. */ static void -h8300_emit_stack_adjustment (int sign, HOST_WIDE_INT size, bool in_prologue) +h8300_emit_stack_adjustment (int sign, HOST_WIDE_INT size) { /* If the frame size is 0, we don't have anything to do. */ if (size == 0) @@ -511,7 +511,7 @@ compute_saved_regs (void) /* Emit an insn to push register RN. */ static rtx -push (int rn, bool in_prologue) +push (int rn) { rtx reg = gen_rtx_REG (word_mode, rn); rtx x; @@ -571,7 +571,7 @@ h8300_push_pop (int regno, int nregs, bool pop_p, bool return_p) if (pop_p) pop (regno); else - push (regno, false); + push (regno); return; } @@ -755,7 +755,7 @@ h8300_expand_prologue (void) if (frame_pointer_needed) { /* Push fp. 
*/ - push (HARD_FRAME_POINTER_REGNUM, true); + push (HARD_FRAME_POINTER_REGNUM); F (emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx), 0); } @@ -787,7 +787,7 @@ h8300_expand_prologue (void) } /* Leave room for locals. */ - h8300_emit_stack_adjustment (-1, round_frame_size (get_frame_size ()), true); + h8300_emit_stack_adjustment (-1, round_frame_size (get_frame_size ())); if (flag_stack_usage_info) current_function_static_stack_size @@ -828,7 +828,7 @@ h8300_expand_epilogue (void) returned_p = false; /* Deallocate locals. */ - h8300_emit_stack_adjustment (1, frame_size, false); + h8300_emit_stack_adjustment (1, frame_size); /* Pop the saved registers in descending order. */ saved_regs = compute_saved_regs (); @@ -2707,10 +2707,14 @@ output_plussi (rtx *operands, bool need_flags) if (!need_flags) return "adds\t%2,%S0"; + /* FALLTHRU */ + case 0xfffffffc: if (!need_flags) return "subs\t%G2,%S0"; + /* FALLTHRU */ + case 0x00010000: case 0x00020000: if (!need_flags) @@ -2719,6 +2723,8 @@ output_plussi (rtx *operands, bool need_flags) return "inc.w\t%2,%e0"; } + /* FALLTHRU */ + case 0xffff0000: case 0xfffe0000: if (!need_flags) @@ -2726,6 +2732,9 @@ output_plussi (rtx *operands, bool need_flags) operands[2] = GEN_INT (intval >> 16); return "dec.w\t%G2,%e0"; } + + /* FALLTHRU */ + } /* See if we can finish with 4 bytes. */ @@ -2792,10 +2801,15 @@ compute_plussi_length (rtx *operands, bool need_flags) if (!need_flags) return 2; + /* FALLTHRU */ + case 0xffff0000: case 0xfffe0000: if (!need_flags) return 2; + + /* FALLTHRU */ + } /* See if we can finish with 4 bytes. */ @@ -3999,6 +4013,7 @@ h8300_shift_needs_scratch_p (int count, machine_mode mode, enum rtx_code type) else if (type == ASHIFTRT) return (ar == SHIFT_LOOP || (TARGET_H8300H && mode == SImode && count == 8)); + gcc_unreachable (); } /* Output the assembler code for doing shifts. */ -- cgit v1.1 From df4e0359dad239854af0ea9eacb8e7e3719557d0 Mon Sep 17 00:00:00 2001 From: Peter Bergner Date: Sun, 30 May 2021 22:45:55 -0500 Subject: rs6000: MMA test case ICEs using -O3 [PR99842] The mma_assemble_input_operand predicate does not accept reg+reg indexed addresses which can lead to ICEs. The lxv and lxvp instructions have indexed forms (lxvx and lxvpx), so the simple solution is to just allow indexed addresses in the predicate. 2021-05-30 Peter Bergner gcc/ PR target/99842 * config/rs6000/predicates.md(mma_assemble_input_operand): Allow indexed form addresses. gcc/testsuite/ PR target/99842 * g++.target/powerpc/pr99842.C: New. --- gcc/config/rs6000/predicates.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md index e21bc74..121cbf1 100644 --- a/gcc/config/rs6000/predicates.md +++ b/gcc/config/rs6000/predicates.md @@ -1172,7 +1172,8 @@ (match_test "(mode == V16QImode && (vsx_register_operand (op, mode) || (MEM_P (op) - && quad_address_p (XEXP (op, 0), mode, false))))")) + && (indexed_or_indirect_address (XEXP (op, 0), mode) + || quad_address_p (XEXP (op, 0), mode, false)))))")) ;; Return 1 if this operand is valid for an MMA disassemble insn. (define_predicate "mma_disassemble_output_operand" -- cgit v1.1 From 4ea5fe8b4002d15c8706749a3c43ed107c9a02f9 Mon Sep 17 00:00:00 2001 From: Jeff Law Date: Wed, 2 Jun 2021 00:56:38 -0400 Subject: Fix minor bugs in H8 port logical ops. Prepare for more compare/test removal gcc/ * config/h8300/h8300-protos.h (compute_a_shift_length): Drop unused argument from prototype. 
(output_logical_op): Add rtx_code argument. (compute_logical_op_length): Likewise. * config/h8300/h8300.c (h8300_and_costs): Pass additional argument to compute_a_shift_length. (output_logical_op); New argument with the rtx code rather than extracting it from an operand. Handle QImode too. (compute_logical_op_length): Similary. (compute_a_shift_length): Drop unused argument. * config/h8300/h8300.md (logicals): New code iterator. * config/h8300/logical.md (3 expander): Combine the "and" expander with the "ior"/"xor" expander. (bclrmsx): Combine the QI/HI mode patterns. (3 insns): Use code iterator rather than match_operator. Handle QImode as well. Update call to output_logical_op and compute_logical_op_length to pass in rtx_code Fix split condition on all define_insn_and_split patterns. (one_cmpl2): Use to support both clobbering the flags and setting ZN via existing define_subst. * config/h8300/shiftrotate.md: Drop unused argument from calls to compute_a_shift_length. Signed-off-by: Jeff Law --- gcc/config/h8300/h8300-protos.h | 7 ++-- gcc/config/h8300/h8300.c | 21 ++++++----- gcc/config/h8300/h8300.md | 2 ++ gcc/config/h8300/logical.md | 77 +++++++++++++++++------------------------ gcc/config/h8300/shiftrotate.md | 12 +++---- 5 files changed, 55 insertions(+), 64 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/h8300/h8300-protos.h b/gcc/config/h8300/h8300-protos.h index 45e7dec..af65329 100644 --- a/gcc/config/h8300/h8300-protos.h +++ b/gcc/config/h8300/h8300-protos.h @@ -29,16 +29,15 @@ extern unsigned int compute_mov_length (rtx *); extern const char *output_plussi (rtx *, bool); extern unsigned int compute_plussi_length (rtx *, bool); extern const char *output_a_shift (rtx *); -extern unsigned int compute_a_shift_length (rtx, rtx *); +extern unsigned int compute_a_shift_length (rtx *); extern const char *output_a_rotate (enum rtx_code, rtx *); extern unsigned int compute_a_rotate_length (rtx *); extern const char *output_simode_bld (int, rtx[]); extern void final_prescan_insn (rtx_insn *, rtx *, int); extern int h8300_expand_movsi (rtx[]); extern machine_mode h8300_select_cc_mode (RTX_CODE, rtx, rtx); -extern const char *output_logical_op (machine_mode, rtx *); -extern unsigned int compute_logical_op_length (machine_mode, - rtx *); +extern const char *output_logical_op (machine_mode, rtx_code code, rtx *); +extern unsigned int compute_logical_op_length (machine_mode, rtx_code, rtx *); extern int compute_logical_op_cc (machine_mode, rtx *); extern int compute_a_shift_cc (rtx, rtx *); diff --git a/gcc/config/h8300/h8300.c b/gcc/config/h8300/h8300.c index ba2b9da..ef947aa 100644 --- a/gcc/config/h8300/h8300.c +++ b/gcc/config/h8300/h8300.c @@ -1100,7 +1100,7 @@ h8300_and_costs (rtx x) operands[1] = XEXP (x, 0); operands[2] = XEXP (x, 1); operands[3] = x; - return compute_logical_op_length (GET_MODE (x), operands) / 2; + return compute_logical_op_length (GET_MODE (x), AND, operands) / 2; } /* Compute the cost of a shift insn. */ @@ -1119,7 +1119,7 @@ h8300_shift_costs (rtx x) operands[1] = NULL; operands[2] = XEXP (x, 1); operands[3] = x; - return compute_a_shift_length (NULL, operands) / 2; + return compute_a_shift_length (operands) / 2; } /* Worker function for TARGET_RTX_COSTS. */ @@ -2879,10 +2879,8 @@ compute_plussi_cc (rtx *operands) /* Output a logical insn. */ const char * -output_logical_op (machine_mode mode, rtx *operands) +output_logical_op (machine_mode mode, rtx_code code, rtx *operands) { - /* Figure out the logical op that we need to perform. 
*/ - enum rtx_code code = GET_CODE (operands[3]); /* Pretend that every byte is affected if both operands are registers. */ const unsigned HOST_WIDE_INT intval = (unsigned HOST_WIDE_INT) ((GET_CODE (operands[2]) == CONST_INT) @@ -2923,6 +2921,10 @@ output_logical_op (machine_mode mode, rtx *operands) switch (mode) { + case E_QImode: + sprintf (insn_buf, "%s.b\t%%X2,%%X0", opname); + output_asm_insn (insn_buf, operands); + break; case E_HImode: /* First, see if we can finish with one insn. */ if (b0 != 0 && b1 != 0) @@ -3033,10 +3035,8 @@ output_logical_op (machine_mode mode, rtx *operands) /* Compute the length of a logical insn. */ unsigned int -compute_logical_op_length (machine_mode mode, rtx *operands) +compute_logical_op_length (machine_mode mode, rtx_code code, rtx *operands) { - /* Figure out the logical op that we need to perform. */ - enum rtx_code code = GET_CODE (operands[3]); /* Pretend that every byte is affected if both operands are registers. */ const unsigned HOST_WIDE_INT intval = (unsigned HOST_WIDE_INT) ((GET_CODE (operands[2]) == CONST_INT) @@ -3061,6 +3061,9 @@ compute_logical_op_length (machine_mode mode, rtx *operands) switch (mode) { + case E_QImode: + return 2; + case E_HImode: /* First, see if we can finish with one insn. */ if (b0 != 0 && b1 != 0) @@ -4189,7 +4192,7 @@ h8300_asm_insn_count (const char *templ) /* Compute the length of a shift insn. */ unsigned int -compute_a_shift_length (rtx insn ATTRIBUTE_UNUSED, rtx *operands) +compute_a_shift_length (rtx *operands) { rtx shift = operands[3]; machine_mode mode = GET_MODE (shift); diff --git a/gcc/config/h8300/h8300.md b/gcc/config/h8300/h8300.md index 9a42547..e596987 100644 --- a/gcc/config/h8300/h8300.md +++ b/gcc/config/h8300/h8300.md @@ -229,6 +229,8 @@ (define_code_iterator shifts [ashift ashiftrt lshiftrt]) +(define_code_iterator logicals [ior xor and]) + (define_code_iterator ors [ior xor]) (include "movepush.md") diff --git a/gcc/config/h8300/logical.md b/gcc/config/h8300/logical.md index eb99c20..d778d24 100644 --- a/gcc/config/h8300/logical.md +++ b/gcc/config/h8300/logical.md @@ -1,11 +1,20 @@ +;; Generic for binary logicals across the supported integer modes +(define_expand "3" + [(set (match_operand:QHSI 0 "register_operand" "") + (logicals:QHSI (match_operand:QHSI 1 "register_operand" "") + (match_operand:QHSI 2 "h8300_src_operand" "")))] + "" + "") + +;; There's a ton of cleanup to do from here below. 
;; ---------------------------------------------------------------------- ;; AND INSTRUCTIONS ;; ---------------------------------------------------------------------- -(define_insn "bclrqi_msx" - [(set (match_operand:QI 0 "bit_register_indirect_operand" "=WU") - (and:QI (match_operand:QI 1 "bit_register_indirect_operand" "%0") - (match_operand:QI 2 "single_zero_operand" "Y0")))] +(define_insn "bclr_msx" + [(set (match_operand:QHI 0 "bit_register_indirect_operand" "=WU") + (and:QHI (match_operand:QHI 1 "bit_register_indirect_operand" "%0") + (match_operand:QHI 2 "single_zero_operand" "Y0")))] "TARGET_H8300SX && rtx_equal_p (operands[0], operands[1])" "bclr\\t%W2,%0" [(set_attr "length" "8")]) @@ -24,21 +33,13 @@ operands[2] = GEN_INT ((INTVAL (operands[2])) >> 8); }) -(define_insn "bclrhi_msx" - [(set (match_operand:HI 0 "bit_register_indirect_operand" "=m") - (and:HI (match_operand:HI 1 "bit_register_indirect_operand" "%0") - (match_operand:HI 2 "single_zero_operand" "Y0")))] - "TARGET_H8300SX" - "bclr\\t%W2,%0" - [(set_attr "length" "8")]) - (define_insn_and_split "*andqi3_2" [(set (match_operand:QI 0 "bit_operand" "=U,rQ,r") (and:QI (match_operand:QI 1 "bit_operand" "%0,0,WU") (match_operand:QI 2 "h8300_src_operand" "Y0,rQi,IP1>X")))] "TARGET_H8300SX" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (and:QI (match_dup 1) (match_dup 2))) (clobber (reg:CC CC_REG))])]) @@ -62,7 +63,7 @@ "register_operand (operands[0], QImode) || single_zero_operand (operands[2], QImode)" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (and:QI (match_dup 1) (match_dup 2))) (clobber (reg:CC CC_REG))])]) @@ -78,13 +79,6 @@ and %X2,%X0" [(set_attr "length" "2,8")]) -(define_expand "and3" - [(set (match_operand:QHSI 0 "register_operand" "") - (and:QHSI (match_operand:QHSI 1 "register_operand" "") - (match_operand:QHSI 2 "h8300_src_operand" "")))] - "" - "") - (define_insn_and_split "*andor3" [(set (match_operand:QHSI 0 "register_operand" "=r") (ior:QHSI (and:QHSI (match_operand:QHSI 2 "register_operand" "r") @@ -95,7 +89,7 @@ || (mode == SImode && (INTVAL (operands[3]) & 0xffff) != 0))" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (ior:QHSI (and:QHSI (match_dup 2) (match_dup 3)) (match_dup 1))) @@ -150,7 +144,7 @@ (match_operand:SI 1 "register_operand" "0")))] "" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (ior:SI (and:SI (ashift:SI (match_dup 2) (const_int 8)) (const_int 65280)) @@ -195,7 +189,7 @@ "TARGET_H8300SX || register_operand (operands[0], QImode) || single_one_operand (operands[2], QImode)" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (ors:QI (match_dup 1) (match_dup 2))) (clobber (reg:CC CC_REG))])]) @@ -216,39 +210,32 @@ [(set_attr "length" "8,*") (set_attr "length_table" "*,logicb")]) -(define_expand "3" - [(set (match_operand:QHSI 0 "register_operand" "") - (ors:QHSI (match_operand:QHSI 1 "register_operand" "") - (match_operand:QHSI 2 "h8300_src_operand" "")))] - "" - "") - ;; ---------------------------------------------------------------------- ;; {AND,IOR,XOR}{HI3,SI3} PATTERNS ;; ---------------------------------------------------------------------- (define_insn_and_split "*logical3" - [(set (match_operand:HSI 0 "h8300_dst_operand" "=rQ") - (match_operator:HSI 3 "bit_operator" - [(match_operand:HSI 1 "h8300_dst_operand" "%0") - (match_operand:HSI 2 "h8300_src_operand" "rQi")]))] + [(set (match_operand:QHSI 0 "h8300_dst_operand" 
"=rQ") + (logicals:QHSI + (match_operand:QHSI 1 "h8300_dst_operand" "%0") + (match_operand:QHSI 2 "h8300_src_operand" "rQi")))] "h8300_operands_match_p (operands)" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (match_op_dup 3 [(match_dup 1) (match_dup 2)])) (clobber (reg:CC CC_REG))])]) -(define_insn "*logical3_clobber_flags" - [(set (match_operand:HSI 0 "h8300_dst_operand" "=rQ") - (match_operator:HSI 3 "bit_operator" - [(match_operand:HSI 1 "h8300_dst_operand" "%0") - (match_operand:HSI 2 "h8300_src_operand" "rQi")])) +(define_insn "*3_clobber_flags" + [(set (match_operand:QHSI 0 "h8300_dst_operand" "=rQ") + (logicals:QHSI + (match_operand:QHSI 1 "h8300_dst_operand" "%0") + (match_operand:QHSI 2 "h8300_src_operand" "rQi"))) (clobber (reg:CC CC_REG))] "h8300_operands_match_p (operands)" - { return output_logical_op (mode, operands); } + { return output_logical_op (mode, , operands); } [(set (attr "length") - (symbol_ref "compute_logical_op_length (mode, operands)"))]) + (symbol_ref "compute_logical_op_length (mode, , operands)"))]) ;; ---------------------------------------------------------------------- @@ -260,11 +247,11 @@ (not:QHSI (match_operand:QHSI 1 "h8300_dst_operand" "0")))] "" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (not:QHSI (match_dup 1))) (clobber (reg:CC CC_REG))])]) -(define_insn "one_cmpl2_clobber_flags" +(define_insn "one_cmpl2_" [(set (match_operand:QHSI 0 "h8300_dst_operand" "=rQ") (not:QHSI (match_operand:QHSI 1 "h8300_dst_operand" "0"))) (clobber (reg:CC CC_REG))] diff --git a/gcc/config/h8300/shiftrotate.md b/gcc/config/h8300/shiftrotate.md index f1c86f7..4bf8fe1 100644 --- a/gcc/config/h8300/shiftrotate.md +++ b/gcc/config/h8300/shiftrotate.md @@ -175,7 +175,7 @@ return output_a_shift (operands); } [(set (attr "length") - (symbol_ref "compute_a_shift_length (insn, operands)"))]) + (symbol_ref "compute_a_shift_length (operands)"))]) (define_insn_and_split "*shiftqi_noscratch" [(set (match_operand:QI 0 "register_operand" "=r,r") @@ -203,7 +203,7 @@ return output_a_shift (operands); } [(set (attr "length") - (symbol_ref "compute_a_shift_length (insn, operands)"))]) + (symbol_ref "compute_a_shift_length (operands)"))]) (define_insn_and_split "*shifthi" [(set (match_operand:HI 0 "register_operand" "=r,r") @@ -230,7 +230,7 @@ return output_a_shift (operands); } [(set (attr "length") - (symbol_ref "compute_a_shift_length (insn, operands)"))]) + (symbol_ref "compute_a_shift_length (operands)"))]) (define_insn_and_split "*shifthi_noscratch" [(set (match_operand:HI 0 "register_operand" "=r,r") @@ -258,7 +258,7 @@ return output_a_shift (operands); } [(set (attr "length") - (symbol_ref "compute_a_shift_length (insn, operands)"))]) + (symbol_ref "compute_a_shift_length (operands)"))]) (define_insn_and_split "*shiftsi" [(set (match_operand:SI 0 "register_operand" "=r,r") @@ -285,7 +285,7 @@ return output_a_shift (operands); } [(set (attr "length") - (symbol_ref "compute_a_shift_length (insn, operands)"))]) + (symbol_ref "compute_a_shift_length (operands)"))]) (define_insn_and_split "*shiftsi_noscratch" [(set (match_operand:SI 0 "register_operand" "=r,r") @@ -313,7 +313,7 @@ return output_a_shift (operands); } [(set (attr "length") - (symbol_ref "compute_a_shift_length (insn, operands)"))]) + (symbol_ref "compute_a_shift_length (operands)"))]) ;; Split a variable shift into a loop. If the register containing ;; the shift count dies, then we just use that register. 
-- cgit v1.1 From 22d834e32b509b22f68000b7f012d8e45d833ea8 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Mon, 31 May 2021 21:59:50 +0200 Subject: IBM Z: Remove match_scratch workaround Since commit dd1ef00c45ba ("Fix bug in the define_subst handling that made match_scratch unusable for multi-alternative patterns.") the workaround for that bug in *ashrdi3_31 is not only no longer necessary, but actually breaks the build. Get rid of it by using only one alternative in (match_scratch). It will be replicated as many times as needed in order to match the pattern with which (define_subst) is used. gcc/ChangeLog: * config/s390/s390.md(*ashrdi3_31): Use a single constraint. * config/s390/subst.md(cconly_subst): Use a single constraint in (match_scratch). gcc/testsuite/ChangeLog: * gcc.target/s390/ashr.c: New test. --- gcc/config/s390/s390.md | 14 ++++---------- gcc/config/s390/subst.md | 2 +- 2 files changed, 5 insertions(+), 11 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md index 7faf775..0c5b4dc 100644 --- a/gcc/config/s390/s390.md +++ b/gcc/config/s390/s390.md @@ -9328,19 +9328,13 @@ "" "") -; FIXME: The number of alternatives is doubled here to match the fix -; number of 2 in the subst pattern for the (clobber (match_scratch... -; The right fix should be to support match_scratch in the output -; pattern of a define_subst. (define_insn "*ashrdi3_31" - [(set (match_operand:DI 0 "register_operand" "=d, d") - (ashiftrt:DI (match_operand:DI 1 "register_operand" "0, 0") - (match_operand:QI 2 "shift_count_operand" "jsc,jsc"))) + [(set (match_operand:DI 0 "register_operand" "=d") + (ashiftrt:DI (match_operand:DI 1 "register_operand" "0") + (match_operand:QI 2 "shift_count_operand" "jsc"))) (clobber (reg:CC CC_REGNUM))] "!TARGET_ZARCH" - "@ - srda\t%0,%Y2 - srda\t%0,%Y2" + "srda\t%0,%Y2" [(set_attr "op_type" "RS") (set_attr "atype" "reg")]) diff --git a/gcc/config/s390/subst.md b/gcc/config/s390/subst.md index 384af11..3ea6fc4 100644 --- a/gcc/config/s390/subst.md +++ b/gcc/config/s390/subst.md @@ -45,7 +45,7 @@ "s390_match_ccmode(insn, CCSmode)" [(set (reg CC_REGNUM) (compare (match_dup 1) (const_int 0))) - (clobber (match_scratch:DSI 0 "=d,d"))]) + (clobber (match_scratch:DSI 0 "=d"))]) (define_subst_attr "cconly" "cconly_subst" "" "_cconly") -- cgit v1.1 From 46d04271a4983b5430ed4830ab65ea26052176fb Mon Sep 17 00:00:00 2001 From: Claudiu Zissulescu Date: Wed, 2 Jun 2021 14:31:00 +0300 Subject: ARC: gcc driver default to hs38_linux arc700 is legacy and there's no active development for it, so switch to latest hs38_linux as default Signed-off-by: Vineet Gupta Signed-off-by: Claudiu Zissulescu gcc/ 2021-06-02 Vineet Gupta * config/arc/arc.h (TARGET_CPU_DEFAULT): Change to hs38_linux. --- gcc/config/arc/arc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/arc/arc.h b/gcc/config/arc/arc.h index bd1fe0a..252241a 100644 --- a/gcc/config/arc/arc.h +++ b/gcc/config/arc/arc.h @@ -34,7 +34,7 @@ along with GCC; see the file COPYING3. If not see #define SYMBOL_FLAG_CMEM (SYMBOL_FLAG_MACH_DEP << 3) #ifndef TARGET_CPU_DEFAULT -#define TARGET_CPU_DEFAULT PROCESSOR_arc700 +#define TARGET_CPU_DEFAULT PROCESSOR_hs38_linux #endif /* Check if this symbol has a long_call attribute in its declaration */ -- cgit v1.1 From d2d74c9fc0cf46f66cd02698a52f5e5db109271d Mon Sep 17 00:00:00 2001 From: Pat Haugen Date: Wed, 2 Jun 2021 15:03:40 -0500 Subject: Make sure link reg save MEM has frame alias set. 
gcc/ChangeLog: * config/rs6000/rs6000-logue.c (rs6000_emit_prologue): Use gen_frame_store. --- gcc/config/rs6000/rs6000-logue.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-logue.c b/gcc/config/rs6000/rs6000-logue.c index 13c00e7..07337c4 100644 --- a/gcc/config/rs6000/rs6000-logue.c +++ b/gcc/config/rs6000/rs6000-logue.c @@ -3257,7 +3257,7 @@ rs6000_emit_prologue (void) if (!WORLD_SAVE_P (info) && info->lr_save_p && !cfun->machine->lr_is_wrapped_separately) { - rtx addr, reg, mem; + rtx reg; reg = gen_rtx_REG (Pmode, 0); START_USE (0); @@ -3267,13 +3267,8 @@ rs6000_emit_prologue (void) if (!(strategy & (SAVE_NOINLINE_GPRS_SAVES_LR | SAVE_NOINLINE_FPRS_SAVES_LR))) { - addr = gen_rtx_PLUS (Pmode, frame_reg_rtx, - GEN_INT (info->lr_save_offset + frame_off)); - mem = gen_rtx_MEM (Pmode, addr); - /* This should not be of rs6000_sr_alias_set, because of - __builtin_return_address. */ - - insn = emit_move_insn (mem, reg); + insn = emit_insn (gen_frame_store (reg, frame_reg_rtx, + info->lr_save_offset + frame_off)); rs6000_frame_related (insn, frame_reg_rtx, sp_off - frame_off, NULL_RTX, NULL_RTX); END_USE (0); -- cgit v1.1 From 50b1de860a58bf85b40a72219bc2fdfaf0dff355 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Wed, 2 Jun 2021 22:09:53 +0200 Subject: xtensa: Fix 2 warnings during xtensa build [PR100841] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When building gcc targetting xtensa-linux, there are 2 warnings the PR complains about: ../../gcc/dwarf2cfi.c: In function ‘void init_one_dwarf_reg_size(int, machine_mode, rtx, machine_mode, init_one_dwarf_reg_state*)’: ../../gcc/dwarf2cfi.c:291:12: warning: comparison of integer expressions of different signedness: ‘const unsigned int’ and ‘int’ [-Wsign-compare] 291 | if (rnum >= DWARF_FRAME_REGISTERS) ../../gcc/function.c: In function ‘void gen_call_used_regs_seq(rtx_insn*, unsigned int)’: ../../gcc/function.c:5897:63: warning: comparison of unsigned expression in ‘< 0’ is always false [-Wtype-limits] 5897 | if (crtl->uses_only_leaf_regs && LEAF_REG_REMAP (regno) < 0) which might during bootstrap or when configured with --enable-werror-always be turned into errors. The first one is the -Wsign-compare warning, in c-family we do: 2281 /* Do not warn if the signed quantity is an unsuffixed integer 2282 literal (or some static constant expression involving such 2283 literals or a conditional expression involving such literals) 2284 and it is non-negative. */ 2285 if (tree_expr_nonnegative_warnv_p (sop, &ovf)) 2286 /* OK */; and so don't warn if that function determines the expression is non-negative. But xtensa defines DWARF_FRAME_REGISTERS as (16 + (something ? 0 : 1)) and that isn't handled by tree_expr_nonnegative_warnv_p, VRP can handle it of course, but that is much later. The second chunk rewrites it into a form that tree_expr_nonnegative_warnv_p can handle, in particular (something ? 16 : 16 + 1), where for COND_EXPRs that function checks both the 2nd and 3rd operand of the ternary operator and if both are nonnegative, returns true. The other warning has been introduced fairly recently; LEAF_REG_REMAP is currently used by 2 targets only, and is documented to yield -1 if a hard reg number can't be remapped and the remapped register number otherwise. 
That means that the type of the expression should be signed (otherwise -1 could never appear), and on SPARC indeed it is defined as extern char leaf_reg_remap[]; #define LEAF_REG_REMAP(REGNO) (leaf_reg_remap[REGNO]) so unless the host is -funsigned-char by default it works fine. I guess sparc.[ch] should be fixed to use signed char of leaf_reg_remap, Eric? The argument to LEAF_REG_REMAP is often unsigned int though, hard register numbers are usually not negative, and thus the warning. I think xtensa doesn't have 2G hard registers and so it is ok to just cast it to int. 2021-06-02 Jakub Jelinek PR target/100841 * config/xtensa/xtensa.h (LEAF_REG_REMAP): Cast REGNO to int to avoid -Wtype-limits warnings. (DWARF_FRAME_REGISTER): Rewrite into ternary operator with addition in operands to avoid -Wsign-compare warnings. --- gcc/config/xtensa/xtensa.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/xtensa/xtensa.h b/gcc/config/xtensa/xtensa.h index b01f6af..923ab5a 100644 --- a/gcc/config/xtensa/xtensa.h +++ b/gcc/config/xtensa/xtensa.h @@ -279,7 +279,7 @@ extern const char xtensa_leaf_regs[FIRST_PSEUDO_REGISTER]; /* For Xtensa, no remapping is necessary, but this macro must be defined if LEAF_REGISTERS is defined. */ -#define LEAF_REG_REMAP(REGNO) (REGNO) +#define LEAF_REG_REMAP(REGNO) ((int) (REGNO)) /* This must be declared if LEAF_REGISTERS is set. */ extern int leaf_function; @@ -775,8 +775,9 @@ typedef struct xtensa_args #define INCOMING_RETURN_ADDR_RTX gen_rtx_REG (Pmode, 0) #define DWARF_FRAME_RETURN_COLUMN DWARF_FRAME_REGNUM (0) #define DWARF_ALT_FRAME_RETURN_COLUMN 16 -#define DWARF_FRAME_REGISTERS (DWARF_ALT_FRAME_RETURN_COLUMN \ - + (TARGET_WINDOWED_ABI ? 0 : 1)) +#define DWARF_FRAME_REGISTERS (TARGET_WINDOWED_ABI \ + ? DWARF_ALT_FRAME_RETURN_COLUMN \ + : DWARF_ALT_FRAME_RETURN_COLUMN + 1) #define EH_RETURN_DATA_REGNO(N) ((N) < 2 ? (N) + 2 : INVALID_REGNUM) #define ASM_PREFERRED_EH_DATA_FORMAT(CODE, GLOBAL) \ (flag_pic \ -- cgit v1.1 From bff9a7ec6e3b8bf9d9635445c94e6c166e6f43e1 Mon Sep 17 00:00:00 2001 From: Kewen Lin Date: Wed, 2 Jun 2021 21:15:17 -0500 Subject: arc: Remove define_insn_and_split *bbit_di define_insn_and_split *bbit_di has unexpected empty split condition when its insn condition isn't empty. But as Claudiu pointed out, this pattern looks useless and it's better to remove it. gcc/ChangeLog: * config/arc/arc.md (*bbit_di): Remove. --- gcc/config/arc/arc.md | 28 ---------------------------- 1 file changed, 28 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md index b6f2d8e..a67bb58 100644 --- a/gcc/config/arc/arc.md +++ b/gcc/config/arc/arc.md @@ -5016,34 +5016,6 @@ core_3, archs4x, archs4xd, archs4xd_slow" (if_then_else (match_test "get_attr_length (insn) == 6") (const_string "true") (const_string "false")))]) -; ??? When testing a bit from a DImode register, combine creates a -; zero_extract in DImode. This goes via an AND with a DImode constant, -; so can only be observed on 64 bit hosts. 
-(define_insn_and_split "*bbit_di" - [(set (pc) - (if_then_else - (match_operator 3 "equality_comparison_operator" - [(zero_extract:DI (match_operand:SI 1 "register_operand" "Rcqq,c") - (const_int 1) - (match_operand 2 "immediate_operand" "L,L")) - (const_int 0)]) - (label_ref (match_operand 0 "" "")) - (pc))) - (clobber (reg:CC_ZN CC_REG))] - "!CROSSING_JUMP_P (insn)" - "#" - "" - [(parallel - [(set (pc) (if_then_else (match_dup 3) (label_ref (match_dup 0)) (pc))) - (clobber (reg:CC_ZN CC_REG))])] -{ - rtx xtr; - - xtr = gen_rtx_ZERO_EXTRACT (SImode, operands[1], const1_rtx, operands[2]); - operands[3] = gen_rtx_fmt_ee (GET_CODE (operands[3]), GET_MODE (operands[3]), - xtr, const0_rtx); -}) - ;; ------------------------------------------------------------------- ;; Hardware loop ;; ------------------------------------------------------------------- -- cgit v1.1 From 592ed7db12ed0d6c71bca0cbfef6dcdf383bc24f Mon Sep 17 00:00:00 2001 From: Claudiu Zissulescu Date: Thu, 3 Jun 2021 13:44:53 +0300 Subject: arc: Remove obsolete options Remove the following obsolete options: - munalign-prob-threshold - malign-call - mmixed-code The ARC's options are marked as obsolete and ignored for backwards compatibility. gcc/ 2021-06-03 Claudiu Zissulescu * common/config/arc/arc-common.c (arc_option_optimization_table): Remove malign-call. * config/arc/arc.c (arc_unalign_branch_p): Remove unused function. * config/arc/arc.h (TARGET_MIXED_CODE): Remove macro. (INDEX_REG_CLASS): Only refer to GENERAL_REGS. * config/arc/arc.md (abssi2_mixed): Remove pattern. * config/arc/arc.opt (munalign-prob-threshold): Mark it obsolete. (malign-call): Likewise. (mmixed-code): Likewise. * doc/invoke.texi (ARC): Update doc. Signed-off-by: Claudiu Zissulescu --- gcc/config/arc/arc.c | 23 ----------------------- gcc/config/arc/arc.h | 4 +--- gcc/config/arc/arc.md | 8 -------- gcc/config/arc/arc.opt | 18 ++++++------------ 4 files changed, 7 insertions(+), 46 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arc/arc.c b/gcc/config/arc/arc.c index 9153f05..b77d056 100644 --- a/gcc/config/arc/arc.c +++ b/gcc/config/arc/arc.c @@ -9868,29 +9868,6 @@ gen_acc2 (void) return gen_rtx_REG (SImode, TARGET_BIG_ENDIAN ? 57: 56); } -/* FIXME: a parameter should be added, and code added to final.c, - to reproduce this functionality in shorten_branches. */ -#if 0 -/* Return nonzero iff BRANCH should be unaligned if possible by upsizing - a previous instruction. */ -int -arc_unalign_branch_p (rtx branch) -{ - rtx note; - - if (!TARGET_UNALIGN_BRANCH) - return 0; - /* Do not do this if we have a filled delay slot. */ - if (get_attr_delay_slot_filled (branch) == DELAY_SLOT_FILLED_YES - && !NEXT_INSN (branch)->deleted ()) - return 0; - note = find_reg_note (branch, REG_BR_PROB, 0); - return (!note - || (arc_unalign_prob_threshold && !br_prob_note_reliable_p (note)) - || INTVAL (XEXP (note, 0)) < arc_unalign_prob_threshold); -} -#endif - /* When estimating sizes during arc_reorg, when optimizing for speed, there are three reasons why we need to consider branches to be length 6: - annull-false delay slot insns are implemented using conditional execution, diff --git a/gcc/config/arc/arc.h b/gcc/config/arc/arc.h index 252241a..0224ae6 100644 --- a/gcc/config/arc/arc.h +++ b/gcc/config/arc/arc.h @@ -115,8 +115,6 @@ extern const char *arc_cpu_to_as (int argc, const char **argv); /* Run-time compilation parameters selecting different hardware subsets. 
*/ -#define TARGET_MIXED_CODE (TARGET_MIXED_CODE_SET) - #define TARGET_SPFP (TARGET_SPFP_FAST_SET || TARGET_SPFP_COMPACT_SET) #define TARGET_DPFP (TARGET_DPFP_FAST_SET || TARGET_DPFP_COMPACT_SET \ || TARGET_FP_DP_AX) @@ -571,7 +569,7 @@ extern enum reg_class arc_regno_reg_class[]; a scale factor or added to another register (as well as added to a displacement). */ -#define INDEX_REG_CLASS (TARGET_MIXED_CODE ? ARCOMPACT16_REGS : GENERAL_REGS) +#define INDEX_REG_CLASS GENERAL_REGS /* The class value for valid base registers. A base register is one used in an address which is the register value plus a displacement. */ diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md index a67bb58..de61b2b 100644 --- a/gcc/config/arc/arc.md +++ b/gcc/config/arc/arc.md @@ -2011,14 +2011,6 @@ core_3, archs4x, archs4xd, archs4xd_slow" ;; Absolute instructions -(define_insn "*abssi2_mixed" - [(set (match_operand:SI 0 "compact_register_operand" "=q") - (abs:SI (match_operand:SI 1 "compact_register_operand" "q")))] - "TARGET_MIXED_CODE" - "abs%? %0,%1%&" - [(set_attr "type" "two_cycle_core") - (set_attr "iscompact" "true")]) - (define_insn "abssi2" [(set (match_operand:SI 0 "dest_reg_operand" "=Rcq#q,w,w") (abs:SI (match_operand:SI 1 "nonmemory_operand" "Rcq#q,cL,Cal")))] diff --git a/gcc/config/arc/arc.opt b/gcc/config/arc/arc.opt index 85688d5..a8935db 100644 --- a/gcc/config/arc/arc.opt +++ b/gcc/config/arc/arc.opt @@ -136,12 +136,8 @@ Target Mask(CODE_DENSITY) Enable code density instructions for ARCv2. mmixed-code -Target Mask(MIXED_CODE_SET) -Tweak register allocation to help 16-bit instruction generation. -; originally this was: -;Generate ARCompact 16-bit instructions intermixed with 32-bit instructions -; but we do that without -mmixed-code, too, it's just a different instruction -; count / size tradeoff. +Target Ignore +Does nothing. Preserved for backward compatibility. ; We use an explict definition for the negative form because that is the ; actually interesting option, and we want that to have its own comment. @@ -292,11 +288,9 @@ mmul32x16 Target Mask(MULMAC_32BY16_SET) Generate 32x16 multiply and mac instructions. -; the initializer is supposed to be: Init(REG_BR_PROB_BASE/2) , -; alas, basic-block.h is not included in options.c . munalign-prob-threshold= -Target RejectNegative Joined UInteger Var(arc_unalign_prob_threshold) Init(10000/2) -Set probability threshold for unaligning branches. +Target Ignore +Does nothing. Preserved for backward compatibility. mmedium-calls Target Var(TARGET_MEDIUM_CALLS) Init(TARGET_MMEDIUM_CALLS_DEFAULT) @@ -307,8 +301,8 @@ Target Var(TARGET_ANNOTATE_ALIGN) Explain what alignment considerations lead to the decision to make an insn short or long. malign-call -Target Var(TARGET_ALIGN_CALL) -Do alignment optimizations for call instructions. +Target Ignore +Does nothing. Preserved for backward compatibility. mRcq Target Var(TARGET_Rcq) -- cgit v1.1 From f4242710c7966a965bbd14cd50ecdba28fd5717e Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Thu, 3 Jun 2021 12:06:38 +0000 Subject: arm: Auto-vectorization for MVE: vabs This patch adds support for auto-vectorization of absolute value computation using vabs. We use a similar pattern to what is used in neon.md and extend the existing neg2 expander to match both 'neg' and 'abs'. This implies renaming the existing abs2 define_insn in neon.md to avoid a clash with the new expander with the same name. 2021-06-03 Christophe Lyon gcc/ * config/arm/mve.md (mve_vabsq_f): Use 'abs' instead of unspec. 
(mve_vabsq_s): Likewise. * config/arm/neon.md (abs2): Rename to neon_abs2. * config/arm/unspecs.md (VABSQ_F, VABSQ_S): Delete. * config/arm/vec-common.md (neg2): Rename to 2. gcc/testsuite/ * gcc.target/arm/simd/mve-vabs.c: New test. --- gcc/config/arm/mve.md | 6 ++---- gcc/config/arm/neon.md | 2 +- gcc/config/arm/unspecs.md | 2 -- gcc/config/arm/vec-common.md | 4 ++-- 4 files changed, 5 insertions(+), 9 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index 0a6ba80..0bfa6a9 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -269,8 +269,7 @@ (define_insn "mve_vabsq_f" [ (set (match_operand:MVE_0 0 "s_register_operand" "=w") - (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w")] - VABSQ_F)) + (abs:MVE_0 (match_operand:MVE_0 1 "s_register_operand" "w"))) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" "vabs.f%# %q0, %q1" @@ -481,8 +480,7 @@ (define_insn "mve_vabsq_s" [ (set (match_operand:MVE_2 0 "s_register_operand" "=w") - (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "w")] - VABSQ_S)) + (abs:MVE_2 (match_operand:MVE_2 1 "s_register_operand" "w"))) ] "TARGET_HAVE_MVE" "vabs.s%#\t%q0, %q1" diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 6a65733..077c62f 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -739,7 +739,7 @@ [(set_attr "type" "neon_move")] ) -(define_insn "abs2" +(define_insn "neon_abs2" [(set (match_operand:VDQW 0 "s_register_operand" "=w") (abs:VDQW (match_operand:VDQW 1 "s_register_operand" "w")))] "TARGET_NEON" diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md index 0778db1..ed1bc29 100644 --- a/gcc/config/arm/unspecs.md +++ b/gcc/config/arm/unspecs.md @@ -538,7 +538,6 @@ VRNDAQ_F VREV64Q_F VDUPQ_N_F - VABSQ_F VREV32Q_F VCVTTQ_F32_F16 VCVTBQ_F32_F16 @@ -562,7 +561,6 @@ VCLSQ_S VADDVQ_S VADDVQ_U - VABSQ_S VREV32Q_U VREV32Q_S VMOVLTQ_U diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md index 8e35151..80b2732 100644 --- a/gcc/config/arm/vec-common.md +++ b/gcc/config/arm/vec-common.md @@ -208,9 +208,9 @@ "ARM_HAVE__ARITH && !TARGET_REALLY_IWMMXT" ) -(define_expand "neg2" +(define_expand "2" [(set (match_operand:VDQWH 0 "s_register_operand" "") - (neg:VDQWH (match_operand:VDQWH 1 "s_register_operand" "")))] + (ABSNEG:VDQWH (match_operand:VDQWH 1 "s_register_operand" "")))] "ARM_HAVE__ARITH && !TARGET_REALLY_IWMMXT" ) -- cgit v1.1 From 52e130652a76ff3d14c0f572fcd79fa53637ce2c Mon Sep 17 00:00:00 2001 From: Aaron Sawdey Date: Wed, 2 Jun 2021 22:54:08 -0500 Subject: Fix operand order to subf for p10 fusion. This certainly causes a bootstrap miscompare, and might also be responsible for PR/100820. The operands to subf were reversed in the logical-add/sub fusion patterns, and I screwed up my bootstrap test which is how it ended up getting committed. gcc/ChangeLog * config/rs6000/genfusion.pl (gen_logical_addsubf): Fix input order to subf instruction. * config/rs6000/fusion.md: Regenerate. 
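A note on the instruction semantics (illustration only, not part of the patch; function and variable names are invented): on Power, "subf RT,RA,RB" computes RT = RB - RA, so the two source operands are not interchangeable, and the regenerated patterns below differ from the old ones only in which value feeds which slot. A rough C sketch of the two orderings:

   /* t mirrors the result of the first insn of the fusion pair.  */
   long and_subf_order_a (long reg0, long reg1, long reg2)
   {
     long t = reg1 & reg0;
     return reg2 - t;        /* subf with t in the RA slot */
   }

   long and_subf_order_b (long reg0, long reg1, long reg2)
   {
     long t = reg1 & reg0;
     return t - reg2;        /* subf with reg2 in the RA slot */
   }

Swapping the subf inputs flips which value is subtracted from which, which is what the bootstrap miscompare tripped over.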
--- gcc/config/rs6000/fusion.md | 64 +++++++++++++++++++++--------------------- gcc/config/rs6000/genfusion.pl | 20 +++++++------ 2 files changed, 43 insertions(+), 41 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/fusion.md b/gcc/config/rs6000/fusion.md index 5191210..e642ff5 100644 --- a/gcc/config/rs6000/fusion.md +++ b/gcc/config/rs6000/fusion.md @@ -1733,10 +1733,10 @@ (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)" "@ - and %3,%1,%0\;subf %3,%3,%2 - and %3,%1,%0\;subf %3,%3,%2 - and %3,%1,%0\;subf %3,%3,%2 - and %4,%1,%0\;subf %3,%4,%2" + and %3,%1,%0\;subf %3,%2,%3 + and %3,%1,%0\;subf %3,%2,%3 + and %3,%1,%0\;subf %3,%2,%3 + and %4,%1,%0\;subf %3,%2,%4" [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1751,10 +1751,10 @@ (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)" "@ - nand %3,%1,%0\;subf %3,%3,%2 - nand %3,%1,%0\;subf %3,%3,%2 - nand %3,%1,%0\;subf %3,%3,%2 - nand %4,%1,%0\;subf %3,%4,%2" + nand %3,%1,%0\;subf %3,%2,%3 + nand %3,%1,%0\;subf %3,%2,%3 + nand %3,%1,%0\;subf %3,%2,%3 + nand %4,%1,%0\;subf %3,%2,%4" [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1769,10 +1769,10 @@ (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)" "@ - nor %3,%1,%0\;subf %3,%3,%2 - nor %3,%1,%0\;subf %3,%3,%2 - nor %3,%1,%0\;subf %3,%3,%2 - nor %4,%1,%0\;subf %3,%4,%2" + nor %3,%1,%0\;subf %3,%2,%3 + nor %3,%1,%0\;subf %3,%2,%3 + nor %3,%1,%0\;subf %3,%2,%3 + nor %4,%1,%0\;subf %3,%2,%4" [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1787,10 +1787,10 @@ (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)" "@ - or %3,%1,%0\;subf %3,%3,%2 - or %3,%1,%0\;subf %3,%3,%2 - or %3,%1,%0\;subf %3,%3,%2 - or %4,%1,%0\;subf %3,%4,%2" + or %3,%1,%0\;subf %3,%2,%3 + or %3,%1,%0\;subf %3,%2,%3 + or %3,%1,%0\;subf %3,%2,%3 + or %4,%1,%0\;subf %3,%2,%4" [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1805,10 +1805,10 @@ (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)" "@ - and %3,%1,%0\;subf %3,%2,%3 - and %3,%1,%0\;subf %3,%2,%3 - and %3,%1,%0\;subf %3,%2,%3 - and %4,%1,%0\;subf %3,%2,%4" + and %3,%1,%0\;subf %3,%3,%2 + and %3,%1,%0\;subf %3,%3,%2 + and %3,%1,%0\;subf %3,%3,%2 + and %4,%1,%0\;subf %3,%4,%2" [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1823,10 +1823,10 @@ (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)" "@ - nand %3,%1,%0\;subf %3,%2,%3 - nand %3,%1,%0\;subf %3,%2,%3 - nand %3,%1,%0\;subf %3,%2,%3 - nand %4,%1,%0\;subf %3,%2,%4" + nand %3,%1,%0\;subf %3,%3,%2 + nand %3,%1,%0\;subf %3,%3,%2 + nand %3,%1,%0\;subf %3,%3,%2 + nand %4,%1,%0\;subf %3,%4,%2" [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) @@ -1841,10 +1841,10 @@ (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)" "@ - nor %3,%1,%0\;subf %3,%2,%3 - nor %3,%1,%0\;subf %3,%2,%3 - nor %3,%1,%0\;subf %3,%2,%3 - nor %4,%1,%0\;subf %3,%2,%4" + nor %3,%1,%0\;subf %3,%3,%2 + nor %3,%1,%0\;subf %3,%3,%2 + nor %3,%1,%0\;subf %3,%3,%2 + nor %4,%1,%0\;subf %3,%4,%2" [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) 
@@ -1859,10 +1859,10 @@ (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)" "@ - or %3,%1,%0\;subf %3,%2,%3 - or %3,%1,%0\;subf %3,%2,%3 - or %3,%1,%0\;subf %3,%2,%3 - or %4,%1,%0\;subf %3,%2,%4" + or %3,%1,%0\;subf %3,%3,%2 + or %3,%1,%0\;subf %3,%3,%2 + or %3,%1,%0\;subf %3,%3,%2 + or %4,%1,%0\;subf %3,%4,%2" [(set_attr "type" "fused_arith_logical") (set_attr "cost" "6") (set_attr "length" "8")]) diff --git a/gcc/config/rs6000/genfusion.pl b/gcc/config/rs6000/genfusion.pl index 1285dd4..577b955 100755 --- a/gcc/config/rs6000/genfusion.pl +++ b/gcc/config/rs6000/genfusion.pl @@ -166,7 +166,7 @@ sub gen_logical_addsubf $outer_op, $outer_comp, $outer_inv, $outer_rtl, $inner, @inner_ops, $inner_comp, $inner_inv, $inner_rtl, $inner_op, $both_commute, $c4, $bc, $inner_arg0, $inner_arg1, $inner_exp, $outer_arg2, $outer_exp, - $target_flag, $ftype, $insn, $is_rsubf, $outer_32, $outer_42, + $target_flag, $ftype, $insn, $is_subf, $is_rsubf, $outer_32, $outer_42, $outer_name, $fuse_type); KIND: foreach $kind ('scalar','vector') { @outer_ops = @logicals; @@ -188,11 +188,10 @@ sub gen_logical_addsubf $c4 = "${constraint},${constraint},${constraint},${constraint}"; OUTER: foreach $outer ( @outer_ops ) { $outer_name = "${vchr}${outer}"; - if ( $outer eq "rsubf" ) { - $is_rsubf = 1; + $is_subf = ( $outer eq "subf" ); + $is_rsubf = ( $outer eq "rsubf" ); + if ( $is_rsubf ) { $outer = "subf"; - } else { - $is_rsubf = 0; } $outer_op = "${vchr}${outer}"; $outer_comp = $complement{$outer}; @@ -241,16 +240,19 @@ sub gen_logical_addsubf if ( ($outer_comp & 2) == 2 ) { $inner_exp = "(not:${mode} $inner_exp)"; } + if ( $is_subf ) { + $outer_32 = "%2,%3"; + $outer_42 = "%2,%4"; + } else { + $outer_32 = "%3,%2"; + $outer_42 = "%4,%2"; + } if ( $is_rsubf == 1 ) { $outer_exp = "(${outer_rtl}:${mode} ${outer_arg2} ${inner_exp})"; - $outer_32 = "%2,%3"; - $outer_42 = "%2,%4"; } else { $outer_exp = "(${outer_rtl}:${mode} ${inner_exp} ${outer_arg2})"; - $outer_32 = "%3,%2"; - $outer_42 = "%4,%2"; } if ( $outer_inv == 1 ) { $outer_exp = "(not:${mode} $outer_exp)"; -- cgit v1.1 From 5883e567564c5b3caecba0c13e8a360a14cdc846 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 3 Jun 2021 20:05:31 +0200 Subject: i386: Add insert and extract patterns for 4-byte vectors [PR100637] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The patch introduces insert and extract patterns for 4-byte vectors. It effectively only emits PINSR and PEXTR instructions when available, otherwise falls back to generic code that emulates these instructions via inserts, extracts, logic operations and shifts in integer registers. Please note that generic fallback produces better code than the current approach of constructing new vector in memory (due to store forwarding stall) so also enable QImode 8-byte vector inserts only with TARGET_SSE4_1. 2021-06-03 Uroš Bizjak gcc/ PR target/100637 * config/i386/i386-expand.c (ix86_expand_vector_set): Handle V2HI and V4QI modes. (ix86_expand_vector_extract): Ditto. * config/i386/mmx.md (*pinsrw): New insn pattern. (*pinsrb): Ditto. (*pextrw): Ditto. (*pextrw_zext): Ditto. (*pextrb): Ditto. (*pextrb_zext): Ditto. (vec_setv2hi): New expander. (vec_extractv2hihi): Ditto. (vec_setv4qi): Ditto. (vec_extractv4qiqi): Ditto. (vec_setv8qi): Enable only for TARGET_SSE4_1. (vec_extractv8qiqi): Ditto. gcc/testsuite/ PR target/100637 * gcc.target/i386/vperm-v2hi.c: New test. * gcc.target/i386/vperm-v4qi.c: Ditto. 
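As an illustration (this example is not from the patch; the type and function names are invented), the kind of GNU C generic-vector code that exercises the new patterns looks roughly like this. With SSE4.1 enabled, such element accesses can be expanded through the new vec_setv4qi/vec_extractv4qiqi expanders, i.e. single pinsrb/pextrb instructions, rather than being bounced through memory:

   typedef unsigned char v4qi __attribute__ ((vector_size (4)));

   v4qi
   set_elt2 (v4qi v, unsigned char x)
   {
     v[2] = x;       /* element insert, a pinsrb candidate */
     return v;
   }

   unsigned char
   get_elt1 (v4qi v)
   {
     return v[1];    /* element extract, a pextrb candidate */
   }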
--- gcc/config/i386/i386-expand.c | 6 ++ gcc/config/i386/mmx.md | 176 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 180 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 4185f58..eb7cdb0 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -14968,6 +14968,7 @@ ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt) return; case E_V8HImode: + case E_V2HImode: use_vec_merge = TARGET_SSE2; break; case E_V4HImode: @@ -14975,6 +14976,7 @@ ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt) break; case E_V16QImode: + case E_V4QImode: use_vec_merge = TARGET_SSE4_1; break; @@ -15274,6 +15276,7 @@ ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt) break; case E_V8HImode: + case E_V2HImode: use_vec_extr = TARGET_SSE2; break; case E_V4HImode: @@ -15294,6 +15297,9 @@ ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt) return; } break; + case E_V4QImode: + use_vec_extr = TARGET_SSE4_1; + break; case E_V8SFmode: if (TARGET_AVX) diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index f39e062..914e5e9 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -3092,7 +3092,7 @@ [(match_operand:V8QI 0 "register_operand") (match_operand:QI 1 "register_operand") (match_operand 2 "const_int_operand")] - "TARGET_MMX || TARGET_MMX_WITH_SSE" + "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE" { ix86_expand_vector_set (TARGET_MMX_WITH_SSE, operands[0], operands[1], INTVAL (operands[2])); @@ -3103,7 +3103,7 @@ [(match_operand:QI 0 "register_operand") (match_operand:V8QI 1 "register_operand") (match_operand 2 "const_int_operand")] - "TARGET_MMX || TARGET_MMX_WITH_SSE" + "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE" { ix86_expand_vector_extract (TARGET_MMX_WITH_SSE, operands[0], operands[1], INTVAL (operands[2])); @@ -3120,6 +3120,178 @@ DONE; }) +(define_insn "*pinsrw" + [(set (match_operand:V2HI 0 "register_operand" "=x,YW") + (vec_merge:V2HI + (vec_duplicate:V2HI + (match_operand:HI 2 "nonimmediate_operand" "rm,rm")) + (match_operand:V2HI 1 "register_operand" "0,YW") + (match_operand:SI 3 "const_int_operand")))] + "TARGET_SSE2 + && ((unsigned) exact_log2 (INTVAL (operands[3])) + < GET_MODE_NUNITS (V2HImode))" +{ + operands[3] = GEN_INT (exact_log2 (INTVAL (operands[3]))); + switch (which_alternative) + { + case 1: + if (MEM_P (operands[2])) + return "vpinsrw\t{%3, %2, %1, %0|%0, %1, %2, %3}"; + else + return "vpinsrw\t{%3, %k2, %1, %0|%0, %1, %k2, %3}"; + case 0: + if (MEM_P (operands[2])) + return "pinsrw\t{%3, %2, %0|%0, %2, %3}"; + else + return "pinsrw\t{%3, %k2, %0|%0, %k2, %3}"; + default: + gcc_unreachable (); + } +} + [(set_attr "isa" "noavx,avx") + (set_attr "type" "sselog") + (set_attr "length_immediate" "1") + (set_attr "mode" "TI")]) + +(define_insn "*pinsrb" + [(set (match_operand:V4QI 0 "register_operand" "=x,YW") + (vec_merge:V4QI + (vec_duplicate:V4QI + (match_operand:QI 2 "nonimmediate_operand" "rm,rm")) + (match_operand:V4QI 1 "register_operand" "0,YW") + (match_operand:SI 3 "const_int_operand")))] + "TARGET_SSE4_1 + && ((unsigned) exact_log2 (INTVAL (operands[3])) + < GET_MODE_NUNITS (V4QImode))" +{ + operands[3] = GEN_INT (exact_log2 (INTVAL (operands[3]))); + switch (which_alternative) + { + case 1: + if (MEM_P (operands[2])) + return "vpinsrb\t{%3, %2, %1, %0|%0, %1, %2, %3}"; + else + return "vpinsrb\t{%3, %k2, %1, %0|%0, %1, %k2, %3}"; + case 0: + if (MEM_P (operands[2])) + return "pinsrb\t{%3, 
%2, %0|%0, %2, %3}"; + else + return "pinsrb\t{%3, %k2, %0|%0, %k2, %3}"; + default: + gcc_unreachable (); + } +} + [(set_attr "isa" "noavx,avx") + (set_attr "type" "sselog") + (set_attr "prefix_data16" "1") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "orig,vex") + (set_attr "mode" "TI")]) + +(define_insn "*pextrw" + [(set (match_operand:HI 0 "register_sse4nonimm_operand" "=r,m") + (vec_select:HI + (match_operand:V2HI 1 "register_operand" "YW,YW") + (parallel [(match_operand:SI 2 "const_0_to_1_operand" "n,n")])))] + "TARGET_SSE2" + "@ + %vpextrw\t{%2, %1, %k0|%k0, %1, %2} + %vpextrw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" "*,sse4") + (set_attr "type" "sselog1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "TI")]) + +(define_insn "*pextrw_zext" + [(set (match_operand:SWI48 0 "register_operand" "=r") + (zero_extend:SWI48 + (vec_select:HI + (match_operand:V2HI 1 "register_operand" "YW") + (parallel [(match_operand:SI 2 "const_0_to_1_operand" "n")]))))] + "TARGET_SSE2" + "%vpextrw\t{%2, %1, %k0|%k0, %1, %2}" + [(set_attr "type" "sselog1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "TI")]) + +(define_insn "*pextrb" + [(set (match_operand:QI 0 "nonimmediate_operand" "=r,m") + (vec_select:QI + (match_operand:V4QI 1 "register_operand" "YW,YW") + (parallel [(match_operand:SI 2 "const_0_to_3_operand" "n,n")])))] + "TARGET_SSE4_1" + "@ + %vpextrb\t{%2, %1, %k0|%k0, %1, %2} + %vpextrb\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog1") + (set_attr "prefix_data16" "1") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "TI")]) + +(define_insn "*pextrb_zext" + [(set (match_operand:SWI248 0 "register_operand" "=r") + (zero_extend:SWI248 + (vec_select:QI + (match_operand:V4QI 1 "register_operand" "YW") + (parallel [(match_operand:SI 2 "const_0_to_3_operand" "n")]))))] + "TARGET_SSE4_1" + "%vpextrb\t{%2, %1, %k0|%k0, %1, %2}" + [(set_attr "type" "sselog1") + (set_attr "prefix_data16" "1") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "TI")]) + +(define_expand "vec_setv2hi" + [(match_operand:V2HI 0 "register_operand") + (match_operand:HI 1 "register_operand") + (match_operand 2 "const_int_operand")] + "TARGET_SSE2" +{ + ix86_expand_vector_set (false, operands[0], operands[1], + INTVAL (operands[2])); + DONE; +}) + +(define_expand "vec_extractv2hihi" + [(match_operand:HI 0 "register_operand") + (match_operand:V2HI 1 "register_operand") + (match_operand 2 "const_int_operand")] + "TARGET_SSE2" +{ + ix86_expand_vector_extract (false, operands[0], + operands[1], INTVAL (operands[2])); + DONE; +}) + +(define_expand "vec_setv4qi" + [(match_operand:V4QI 0 "register_operand") + (match_operand:QI 1 "register_operand") + (match_operand 2 "const_int_operand")] + "TARGET_SSE4_1" +{ + ix86_expand_vector_set (false, operands[0], operands[1], + INTVAL (operands[2])); + DONE; +}) + +(define_expand "vec_extractv4qiqi" + [(match_operand:QI 0 "register_operand") + (match_operand:V4QI 1 "register_operand") + (match_operand 2 "const_int_operand")] + "TARGET_SSE4_1" +{ + ix86_expand_vector_extract (false, operands[0], + operands[1], INTVAL (operands[2])); + DONE; +}) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; Miscellaneous -- cgit v1.1 From 47d25a0314d8f28d399cd93f673df5886ca81d78 Mon Sep 17 00:00:00 2001 From: 
Kewen Lin Date: Wed, 2 Jun 2021 07:04:54 +0200 Subject: cris: Update unexpected empty split condition gcc/ChangeLog: * config/cris/cris.md (*addi_reload): Fix empty split condition. --- gcc/config/cris/cris.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/cris/cris.md b/gcc/config/cris/cris.md index 7de0ec6..d5a3c70 100644 --- a/gcc/config/cris/cris.md +++ b/gcc/config/cris/cris.md @@ -1311,7 +1311,7 @@ && (INTVAL (operands[3]) == 2 || INTVAL (operands[3]) == 4) && (reload_in_progress || reload_completed)" "#" - "" + "&& 1" [(set (match_dup 0) (plus:SI (ashift:SI (match_dup 2) (match_dup 3)) (match_dup 1)))] "operands[3] = operands[3] == const2_rtx ? const1_rtx : const2_rtx;") -- cgit v1.1 From a3f6bd7891495a0ed65f7da7a55d36c730328692 Mon Sep 17 00:00:00 2001 From: Haochen Gui Date: Fri, 4 Jun 2021 11:04:31 +0800 Subject: rs6000: Expand PROMOTE_MODE marco in rs6000_promote_function_mode This patch prepares for the patch which disables mode promotion of pseudos on rs6000. gcc/ChangeLog: * config/rs6000/rs6000-call.c (rs6000_promote_function_mode): Replace PROMOTE_MODE marco with its content. --- gcc/config/rs6000/rs6000-call.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c index f271b0a..b4e13af 100644 --- a/gcc/config/rs6000/rs6000-call.c +++ b/gcc/config/rs6000/rs6000-call.c @@ -6646,7 +6646,9 @@ rs6000_promote_function_mode (const_tree type ATTRIBUTE_UNUSED, int *punsignedp ATTRIBUTE_UNUSED, const_tree, int for_return ATTRIBUTE_UNUSED) { - PROMOTE_MODE (mode, *punsignedp, type); + if (GET_MODE_CLASS (mode) == MODE_INT + && GET_MODE_SIZE (mode) < (TARGET_32BIT ? 4 : 8)) + mode = TARGET_32BIT ? SImode : DImode; return mode; } -- cgit v1.1 From 9080a3bf23297885fdc47221da37a71d6dec93c5 Mon Sep 17 00:00:00 2001 From: Haochen Gui Date: Fri, 4 Jun 2021 14:38:53 +0800 Subject: rs6000: Disable mode promotion for pseudos rs6000 has instructions that can do almost everything 32 bit at least as efficiently as corresponding 64 bit things. The mode promotion can be defered to when a wide mode is necessary. So it helps a lot not promote mode for pseudos. SPECint test shows that the overall performance improvement (by geomean) is more than 2% with this patch. testsuite/gcc.target/powerpc/not-promote-mode.c illustrates how the patch eliminates the redundant extensions and do further optimization by disabling mode promotion for pseduos. gcc/ChangeLog * config/rs6000/rs6000.h (PROMOTE_MODE): Remove. gcc/testsuite/ChangeLog: * gcc.target/powerpc/not-promote-mode.c: New. --- gcc/config/rs6000/rs6000.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.h b/gcc/config/rs6000/rs6000.h index 164d359..a5f7b1d 100644 --- a/gcc/config/rs6000/rs6000.h +++ b/gcc/config/rs6000/rs6000.h @@ -667,17 +667,6 @@ extern unsigned char rs6000_recip_bits[]; /* Target machine storage layout. */ -/* Define this macro if it is advisable to hold scalars in registers - in a wider mode than that declared by the program. In such cases, - the value is constrained to be within the bounds of the declared - type, but kept valid in the wider mode. The signedness of the - extension may differ from that of the type. */ - -#define PROMOTE_MODE(MODE,UNSIGNEDP,TYPE) \ - if (GET_MODE_CLASS (MODE) == MODE_INT \ - && GET_MODE_SIZE (MODE) < (TARGET_32BIT ? 4 : 8)) \ - (MODE) = TARGET_32BIT ? 
SImode : DImode; - /* Define this if most significant bit is lowest numbered in instructions that operate on numbered bit-fields. */ /* That is true on RS/6000. */ -- cgit v1.1 From 39e5a954c156f7af16aa1a8f87405433d8031c4e Mon Sep 17 00:00:00 2001 From: Claudiu Zissulescu Date: Fri, 4 Jun 2021 10:12:32 +0300 Subject: arc: Don't allow millicode thunks with reduced register set CPUs. The millicode thunks are not reduced register set safe. Disable them for CPUs having this option on. gcc/ 2021-06-04 Claudiu Zissulescu * config/arc/arc.c (arc_override_options): Disable millicode thunks when RF16 is on. Signed-off-by: Claudiu Zissulescu --- gcc/config/arc/arc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arc/arc.c b/gcc/config/arc/arc.c index b77d056..0d34c96 100644 --- a/gcc/config/arc/arc.c +++ b/gcc/config/arc/arc.c @@ -1451,8 +1451,10 @@ arc_override_options (void) if (TARGET_ARC700 && (arc_tune != ARC_TUNE_ARC7XX)) flag_delayed_branch = 0; - /* Millicode thunks doesn't work with long calls. */ - if (TARGET_LONG_CALLS_SET) + /* Millicode thunks doesn't work for long calls. */ + if (TARGET_LONG_CALLS_SET + /* neither for RF16. */ + || TARGET_RF16) target_flags &= ~MASK_MILLICODE_THUNK_SET; /* Set unaligned to all HS cpus. */ -- cgit v1.1 From b7dd2e4eeb44bc8678ecde8a6c7401de85e63561 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Fri, 4 Jun 2021 11:20:02 +0200 Subject: x86: Fix ix86_expand_vector_init for V*TImode [PR100887] We have vec_initv4tiv2ti and vec_initv2titi patterns which call ix86_expand_vector_init and assume it works for those modes. For the case of construction from two half-sized vectors, the code assumes it will always succeed, but we have only insn patterns with SImode and DImode element types. QImode and HImode element types are already handled by performing it with same sized vectors with SImode elements and the following patch extends that to V*TImode vectors. 2021-06-04 Jakub Jelinek PR target/100887 * config/i386/i386-expand.c (ix86_expand_vector_init): Handle concatenation from half-sized modes with TImode elements. * gcc.target/i386/pr100887.c: New test. --- gcc/config/i386/i386-expand.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index eb7cdb0..68bb5ab 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -14610,11 +14610,15 @@ ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals) if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts) { rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) }; - if (inner_mode == QImode || inner_mode == HImode) + if (inner_mode == QImode + || inner_mode == HImode + || inner_mode == TImode) { unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode); - mode = mode_for_vector (SImode, n_bits / 4).require (); - inner_mode = mode_for_vector (SImode, n_bits / 8).require (); + scalar_mode elt_mode = inner_mode == TImode ? 
DImode : SImode; + n_bits /= GET_MODE_SIZE (elt_mode); + mode = mode_for_vector (elt_mode, n_bits).require (); + inner_mode = mode_for_vector (elt_mode, n_bits / 2).require (); ops[0] = gen_lowpart (inner_mode, ops[0]); ops[1] = gen_lowpart (inner_mode, ops[1]); subtarget = gen_reg_rtx (mode); -- cgit v1.1 From ed106d6544c785ca61296a64bec4b33b703dc586 Mon Sep 17 00:00:00 2001 From: Kewen Lin Date: Fri, 28 May 2021 00:21:00 -0500 Subject: i386: Update unexpected empty split condition gcc/ChangeLog: * config/i386/i386.md (*load_tp_x32_zext, *add_tp_x32_zext, *tls_dynamic_gnu2_combine_32): Fix empty split condition. * config/i386/sse.md (*_pmovmskb_lt, *_pmovmskb_zext_lt, *sse2_pmovmskb_ext_lt, *_pblendvb_lt): Likewise. --- gcc/config/i386/i386.md | 6 +++--- gcc/config/i386/sse.md | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 960ecbd..f0bb798 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -15741,7 +15741,7 @@ (unspec:SI [(const_int 0)] UNSPEC_TP)))] "TARGET_X32" "#" - "" + "&& 1" [(set (match_dup 0) (zero_extend:DI (match_dup 1)))] { @@ -15779,7 +15779,7 @@ (clobber (reg:CC FLAGS_REG))] "TARGET_X32" "#" - "" + "&& 1" [(parallel [(set (match_dup 0) (zero_extend:DI @@ -15870,7 +15870,7 @@ (clobber (reg:CC FLAGS_REG))] "!TARGET_64BIT && TARGET_GNU2_TLS" "#" - "" + "&& 1" [(set (match_dup 0) (match_dup 5))] { operands[5] = can_create_pseudo_p () ? gen_reg_rtx (Pmode) : operands[0]; diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 1b3df21..e4248e5 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -16562,7 +16562,7 @@ UNSPEC_MOVMSK))] "TARGET_SSE2" "#" - "" + "&& 1" [(set (match_dup 0) (unspec:SI [(match_dup 1)] UNSPEC_MOVMSK))] "" @@ -16584,7 +16584,7 @@ UNSPEC_MOVMSK)))] "TARGET_64BIT && TARGET_SSE2" "#" - "" + "&& 1" [(set (match_dup 0) (zero_extend:DI (unspec:SI [(match_dup 1)] UNSPEC_MOVMSK)))] "" @@ -16606,7 +16606,7 @@ UNSPEC_MOVMSK)))] "TARGET_64BIT && TARGET_SSE2" "#" - "" + "&& 1" [(set (match_dup 0) (sign_extend:DI (unspec:SI [(match_dup 1)] UNSPEC_MOVMSK)))] "" @@ -17911,7 +17911,7 @@ UNSPEC_BLENDV))] "TARGET_SSE4_1" "#" - "" + "&& 1" [(set (match_dup 0) (unspec:VI1_AVX2 [(match_dup 1) (match_dup 2) (match_dup 3)] UNSPEC_BLENDV))] -- cgit v1.1 From 9651794fff3a16c476e148de855d4f2136234c73 Mon Sep 17 00:00:00 2001 From: Kewen Lin Date: Fri, 28 May 2021 00:20:49 -0500 Subject: arm: Update unexpected empty split condition gcc/ChangeLog: * config/arm/vfp.md (no_literal_pool_df_immediate, no_literal_pool_sf_immediate): Fix empty split condition. 
--- gcc/config/arm/vfp.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/vfp.md b/gcc/config/arm/vfp.md index f97af92..55b6c1a 100644 --- a/gcc/config/arm/vfp.md +++ b/gcc/config/arm/vfp.md @@ -2129,7 +2129,7 @@ && !arm_const_double_rtx (operands[1]) && !(TARGET_VFP_DOUBLE && vfp3_const_double_rtx (operands[1]))" "#" - "" + "&& 1" [(const_int 0)] { long buf[2]; @@ -2154,7 +2154,7 @@ && TARGET_VFP_BASE && !vfp3_const_double_rtx (operands[1])" "#" - "" + "&& 1" [(const_int 0)] { long buf; -- cgit v1.1 From 8d7dae0eb366a88a1baba1857ecc54c09e4a520e Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Fri, 4 Jun 2021 17:37:15 +0200 Subject: i386: Add init pattern for V2HI vectors [PR100637] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 2021-06-03 Uroš Bizjak gcc/ PR target/100637 * config/i386/i386-expand.c (ix86_expand_vector_init_duplicate): Handle V2HI mode. (ix86_expand_vector_init_general): Ditto. Use SImode instead of word_mode for logic operations when GET_MODE_SIZE (mode) < UNITS_PER_WORD. (expand_vec_perm_even_odd_1): Assert that V2HI mode should be implemented by expand_vec_perm_1. (expand_vec_perm_broadcast_1): Assert that V2HI and V4HI modes should be implemented using standard shuffle patterns. (ix86_vectorize_vec_perm_const): Handle V2HImode. Add V4HI and V2HI modes to modes, implementable with shuffle for one operand. * config/i386/mmx.md (*punpckwd): New insn_and_split pattern. (*pshufw_1): New insn pattern. (*vec_dupv2hi): Ditto. (vec_initv2hihi): New expander. gcc/testsuite/ PR target/100637 * gcc.dg/vect/slp-perm-9.c (dg-final): Adjust dumps for vect32 targets. --- gcc/config/i386/i386-expand.c | 45 ++++++++++++++++++------ gcc/config/i386/mmx.md | 82 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 117 insertions(+), 10 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 68bb5ab..804cb59 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -13723,6 +13723,19 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode, } goto widen; + case E_V2HImode: + if (TARGET_SSE2) + { + rtx x; + + val = gen_lowpart (SImode, val); + x = gen_rtx_TRUNCATE (HImode, val); + x = gen_rtx_VEC_DUPLICATE (mode, x); + emit_insn (gen_rtx_SET (target, x)); + return true; + } + return false; + case E_V8QImode: if (!mmx_ok) return false; @@ -14524,6 +14537,8 @@ quarter: case E_V4HImode: case E_V8QImode: + + case E_V2HImode: break; default: @@ -14532,12 +14547,14 @@ quarter: { int i, j, n_elts, n_words, n_elt_per_word; - machine_mode inner_mode; + machine_mode tmp_mode, inner_mode; rtx words[4], shift; + tmp_mode = (GET_MODE_SIZE (mode) < UNITS_PER_WORD) ? 
SImode : word_mode; + inner_mode = GET_MODE_INNER (mode); n_elts = GET_MODE_NUNITS (mode); - n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD; + n_words = GET_MODE_SIZE (mode) / GET_MODE_SIZE (tmp_mode); n_elt_per_word = n_elts / n_words; shift = GEN_INT (GET_MODE_BITSIZE (inner_mode)); @@ -14548,15 +14565,15 @@ quarter: for (j = 0; j < n_elt_per_word; ++j) { rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1); - elt = convert_modes (word_mode, inner_mode, elt, true); + elt = convert_modes (tmp_mode, inner_mode, elt, true); if (j == 0) word = elt; else { - word = expand_simple_binop (word_mode, ASHIFT, word, shift, + word = expand_simple_binop (tmp_mode, ASHIFT, word, shift, word, 1, OPTAB_LIB_WIDEN); - word = expand_simple_binop (word_mode, IOR, word, elt, + word = expand_simple_binop (tmp_mode, IOR, word, elt, word, 1, OPTAB_LIB_WIDEN); } } @@ -14570,14 +14587,14 @@ quarter: { rtx tmp = gen_reg_rtx (mode); emit_clobber (tmp); - emit_move_insn (gen_lowpart (word_mode, tmp), words[0]); - emit_move_insn (gen_highpart (word_mode, tmp), words[1]); + emit_move_insn (gen_lowpart (tmp_mode, tmp), words[0]); + emit_move_insn (gen_highpart (tmp_mode, tmp), words[1]); emit_move_insn (target, tmp); } else if (n_words == 4) { rtx tmp = gen_reg_rtx (V4SImode); - gcc_assert (word_mode == SImode); + gcc_assert (tmp_mode == SImode); vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words)); ix86_expand_vector_init_general (false, V4SImode, tmp, vals); emit_move_insn (target, gen_lowpart (mode, tmp)); @@ -19548,6 +19565,7 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd) case E_V2DImode: case E_V2SImode: case E_V4SImode: + case E_V2HImode: /* These are always directly implementable by expand_vec_perm_1. */ gcc_unreachable (); @@ -19758,6 +19776,8 @@ expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d) case E_V2DImode: case E_V2SImode: case E_V4SImode: + case E_V2HImode: + case E_V4HImode: /* These are always implementable using standard shuffle patterns. */ gcc_unreachable (); @@ -20267,6 +20287,10 @@ ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, if (!TARGET_MMX_WITH_SSE) return false; break; + case E_V2HImode: + if (!TARGET_SSE2) + return false; + break; case E_V2DImode: case E_V2DFmode: if (!TARGET_SSE) @@ -20298,10 +20322,11 @@ ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, /* Check whether the mask can be applied to the vector type. */ d.one_operand_p = (which != 3); - /* Implementable with shufps or pshufd. */ + /* Implementable with shufps, pshufd or pshuflw. 
*/ if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V2SFmode - || d.vmode == V4SImode || d.vmode == V2SImode)) + || d.vmode == V4SImode || d.vmode == V2SImode + || d.vmode == V4HImode || d.vmode == V2HImode)) return true; /* Otherwise we have to go through the motions and see if we can diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 914e5e9..c3fd280 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -3292,6 +3292,88 @@ DONE; }) +(define_insn_and_split "*punpckwd" + [(set (match_operand:V2HI 0 "register_operand" "=x,Yw") + (vec_select:V2HI + (vec_concat:V4HI + (match_operand:V2HI 1 "register_operand" "0,Yw") + (match_operand:V2HI 2 "register_operand" "x,Yw")) + (parallel [(match_operand 3 "const_0_to_3_operand") + (match_operand 4 "const_0_to_3_operand")])))] + "TARGET_SSE2" + "#" + "&& reload_completed" + [(set (match_dup 5) + (vec_select:V4HI + (match_dup 5) + (parallel [(match_dup 3) (match_dup 4) + (const_int 0) (const_int 0)])))] +{ + rtx dest = lowpart_subreg (V8HImode, operands[0], V2HImode); + rtx op1 = lowpart_subreg (V8HImode, operands[1], V2HImode); + rtx op2 = lowpart_subreg (V8HImode, operands[2], V2HImode); + + emit_insn (gen_vec_interleave_lowv8hi (dest, op1, op2)); + + static const int map[4] = { 0, 2, 1, 3 }; + + int sel0 = map[INTVAL (operands[3])]; + int sel1 = map[INTVAL (operands[4])]; + + if (sel0 == 0 && sel1 == 1) + DONE; + + operands[3] = GEN_INT (sel0); + operands[4] = GEN_INT (sel1); + + operands[5] = lowpart_subreg (V4HImode, dest, V8HImode); +} + [(set_attr "isa" "noavx,avx") + (set_attr "type" "sselog") + (set_attr "mode" "TI")]) + +(define_insn "*pshufw_1" + [(set (match_operand:V2HI 0 "register_operand" "=Yw") + (vec_select:V2HI + (match_operand:V2HI 1 "register_operand" "Yw") + (parallel [(match_operand 2 "const_0_to_1_operand") + (match_operand 3 "const_0_to_1_operand")])))] + "TARGET_SSE2" +{ + int mask = 0; + mask |= INTVAL (operands[2]) << 0; + mask |= INTVAL (operands[3]) << 2; + mask |= 2 << 4; + mask |= 3 << 6; + operands[2] = GEN_INT (mask); + + return "%vpshuflw\t{%2, %1, %0|%0, %1, %2}"; +} + [(set_attr "type" "sselog1") + (set_attr "length_immediate" "1") + (set_attr "mode" "TI")]) + +(define_insn "*vec_dupv2hi" + [(set (match_operand:V2HI 0 "register_operand" "=Yw") + (vec_duplicate:V2HI + (truncate:HI + (match_operand:SI 1 "register_operand" "Yw"))))] + "TARGET_SSE2" + "%vpshuflw\t{$0, %1, %0|%0, %1, 0}" + [(set_attr "type" "sselog1") + (set_attr "length_immediate" "1") + (set_attr "mode" "TI")]) + +(define_expand "vec_initv2hihi" + [(match_operand:V2HI 0 "register_operand") + (match_operand 1)] + "TARGET_SSE2" +{ + ix86_expand_vector_init (false, operands[0], + operands[1]); + DONE; +}) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; Miscellaneous -- cgit v1.1 From 1b51f038cf027fdc1bf00240cacee59dd5cbe458 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Fri, 4 Jun 2021 17:51:05 +0200 Subject: i386: Convert a couple of predicates to use match_code RTXes. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No functional changes. 2021-06-04 Uroš Bizjak gcc/ * config/i386/predicates.md (GOT_memory_operand): Implement using match_code RTXes. (GOT32_symbol_operand): Ditto. 
--- gcc/config/i386/predicates.md | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index abd307e..d2f5f15 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -734,13 +734,10 @@ ;; Return true if OP is a GOT memory operand. (define_predicate "GOT_memory_operand" - (match_operand 0 "memory_operand") -{ - op = XEXP (op, 0); - return (GET_CODE (op) == CONST - && GET_CODE (XEXP (op, 0)) == UNSPEC - && XINT (XEXP (op, 0), 1) == UNSPEC_GOTPCREL); -}) + (and (match_operand 0 "memory_operand") + (match_code "const" "0") + (match_code "unspec" "00") + (match_test "XINT (XEXP (XEXP (op, 0), 0), 1) == UNSPEC_GOTPCREL"))) ;; Test for a valid operand for a call instruction. ;; Allow constant call address operands in Pmode only. @@ -767,9 +764,9 @@ ;; Return true if OP is a 32-bit GOT symbol operand. (define_predicate "GOT32_symbol_operand" - (match_test "GET_CODE (op) == CONST - && GET_CODE (XEXP (op, 0)) == UNSPEC - && XINT (XEXP (op, 0), 1) == UNSPEC_GOT")) + (and (match_code "const") + (match_code "unspec" "0") + (match_test "XINT (XEXP (op, 0), 1) == UNSPEC_GOT"))) ;; Match exactly zero. (define_predicate "const0_operand" -- cgit v1.1 From 549d7f4310f6f8c2c64efcb6f3efcee99c9d9f4f Mon Sep 17 00:00:00 2001 From: Jeff Law Date: Sat, 5 Jun 2021 01:27:02 -0400 Subject: Fix split conditions in H8/300 port gcc/ * config/h8300/addsub.md: Fix split condition in define_insn_and_split patterns. * config/h8300/bitfield.md: Likewise. * config/h8300/combiner.md: Likewise. * config/h8300/divmod.md: Likewise. * config/h8300/extensions.md: Likewise. * config/h8300/jumpcall.md: Likewise. * config/h8300/movepush.md: Likewise. * config/h8300/multiply.md: Likewise. * config/h8300/other.md: Likewise. * config/h8300/shiftrotate.md: Likewise. * config/h8300/logical.md: Likewise. Fix split pattern to use code iterator that somehow slipped through. 
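Background for this and the similar cris/i386/arm fixes above (a mental model, not the actual gensupport implementation; the helper below is invented): in a define_insn_and_split, a split condition that starts with "&&" gets the insn condition prepended to it, while a split condition without the leading "&&" stands on its own, so a bare "reload_completed" lets the splitter fire even when the pattern's own predicate is false. The combination can be pictured as simple string joining:

   #include <stdio.h>

   /* Toy model only: show the condition the splitter ends up using.  */
   static void
   show_split_condition (const char *insn_cond, const char *split_cond)
   {
     if (split_cond[0] == '&' && split_cond[1] == '&')
       printf ("(%s) %s\n", insn_cond, split_cond);   /* insn condition kept */
     else
       printf ("%s\n", split_cond[0] ? split_cond : "always");  /* insn condition dropped */
   }

   int
   main (void)
   {
     show_split_condition ("TARGET_H8300SX", "reload_completed");
     show_split_condition ("TARGET_H8300SX", "&& reload_completed");
     return 0;
   }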
--- gcc/config/h8300/addsub.md | 16 +++++----- gcc/config/h8300/bitfield.md | 16 +++++----- gcc/config/h8300/combiner.md | 68 ++++++++++++++++++++--------------------- gcc/config/h8300/divmod.md | 12 ++++---- gcc/config/h8300/extensions.md | 12 ++++---- gcc/config/h8300/jumpcall.md | 2 +- gcc/config/h8300/logical.md | 2 +- gcc/config/h8300/movepush.md | 22 ++++++------- gcc/config/h8300/multiply.md | 12 ++++---- gcc/config/h8300/other.md | 2 +- gcc/config/h8300/shiftrotate.md | 18 +++++------ 11 files changed, 91 insertions(+), 91 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/h8300/addsub.md b/gcc/config/h8300/addsub.md index 3585bff..b1eb0d2 100644 --- a/gcc/config/h8300/addsub.md +++ b/gcc/config/h8300/addsub.md @@ -15,7 +15,7 @@ (match_operand:QI 2 "h8300_src_operand" "rQi")))] "h8300_operands_match_p (operands)" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (plus:QI (match_dup 1) (match_dup 2))) (clobber (reg:CC CC_REG))])]) @@ -34,7 +34,7 @@ (match_operand:HI 2 "h8300_src_operand" "L,N,J,n,r")))] "!TARGET_H8300SX" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (plus:HI (match_dup 1) (match_dup 2))) (clobber (reg:CC CC_REG))])]) @@ -81,7 +81,7 @@ (match_operand:HI 2 "h8300_src_operand" "P3>X,P3"))))] "" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (ior:SI (and:SI (match_dup 1) (const_int -256)) (zero_extend:SI (match_dup 2)))) @@ -758,7 +758,7 @@ (match_operand:SI 2 "register_operand" "0")))] "" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (ior:SI (ashift:SI (match_dup 1) (const_int 31)) (match_dup 2))) @@ -782,7 +782,7 @@ (match_operand:SI 4 "register_operand" "0")))] "(INTVAL (operands[3]) & ~0xffff) == 0" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (ior:SI (and:SI (ashift:SI (match_dup 1) (match_dup 2)) (match_dup 3)) @@ -815,7 +815,7 @@ (match_operand:SI 4 "register_operand" "0")))] "((INTVAL (operands[3]) << INTVAL (operands[2])) & ~0xffff) == 0" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (ior:SI (and:SI (lshiftrt:SI (match_dup 1) (match_dup 2)) (match_dup 3)) @@ -848,7 +848,7 @@ (match_operand:SI 3 "register_operand" "0")))] "INTVAL (operands[2]) < 16" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (ior:SI (zero_extract:SI (match_dup 1) (const_int 1) @@ -875,7 +875,7 @@ (match_operand:SI 2 "register_operand" "0")))] "" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (ior:SI (and:SI (lshiftrt:SI (match_dup 1) (const_int 30)) (const_int 2)) @@ -902,7 +902,7 @@ (clobber (match_scratch:HI 3 "=&r"))] "" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (ior:SI (and:SI (lshiftrt:SI (match_dup 1) (const_int 9)) (const_int 4194304)) @@ -993,7 +993,7 @@ (const_int 1))))] "" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (ior:SI (and:SI (match_dup 1) (const_int 1)) (lshiftrt:SI (match_dup 1) (const_int 1)))) @@ -1147,7 +1147,7 @@ (const_int 8)) 1))] "" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (subreg:QI (lshiftrt:HI (match_dup 1) (const_int 8)) 1)) (clobber (reg:CC CC_REG))])]) @@ -1169,7 +1169,7 @@ (const_int 8)) 3))] "" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (subreg:QI (lshiftrt:SI (match_dup 1) (const_int 8)) 3)) (clobber (reg:CC CC_REG))])]) @@ -1190,7 
+1190,7 @@ (clobber (match_scratch:SI 2 "=&r"))] "" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (subreg:QI (lshiftrt:SI (match_dup 1) (const_int 16)) 3)) (clobber (match_dup 2)) @@ -1213,7 +1213,7 @@ (clobber (match_scratch:SI 2 "=&r"))] "" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (subreg:QI (lshiftrt:SI (match_dup 1) (const_int 24)) 3)) (clobber (match_dup 2)) diff --git a/gcc/config/h8300/divmod.md b/gcc/config/h8300/divmod.md index b5ab6b7..67f253c 100644 --- a/gcc/config/h8300/divmod.md +++ b/gcc/config/h8300/divmod.md @@ -8,7 +8,7 @@ (match_operand:HSI 2 "reg_or_nibble_operand" "r IP4>X")))] "TARGET_H8300SX" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (udiv:HSI (match_dup 1) (match_dup 2))) (clobber (reg:CC CC_REG))])]) @@ -27,7 +27,7 @@ (match_operand:HSI 2 "reg_or_nibble_operand" "r IP4>X")))] "TARGET_H8300SX" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (div:HSI (match_dup 1) (match_dup 2))) (clobber (reg:CC CC_REG))])]) @@ -53,7 +53,7 @@ (zero_extend:HI (match_dup 2)))))] "" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (truncate:QI (udiv:HI (match_dup 1) (zero_extend:HI (match_dup 2))))) @@ -97,7 +97,7 @@ (sign_extend:HI (match_dup 2)))))] "" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (truncate:QI (div:HI (match_dup 1) (sign_extend:HI (match_dup 2))))) @@ -140,7 +140,7 @@ (zero_extend:SI (match_dup 2)))))] "" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (truncate:HI (udiv:SI (match_dup 1) (zero_extend:SI (match_dup 2))))) @@ -183,7 +183,7 @@ (sign_extend:SI (match_dup 2)))))] "" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (truncate:HI (div:SI (match_dup 1) (sign_extend:SI (match_dup 2))))) diff --git a/gcc/config/h8300/extensions.md b/gcc/config/h8300/extensions.md index 7631230..bc10179 100644 --- a/gcc/config/h8300/extensions.md +++ b/gcc/config/h8300/extensions.md @@ -16,7 +16,7 @@ (zero_extend:HI (match_operand:QI 1 "general_operand_src" "0,g>")))] "" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (zero_extend:HI (match_dup 1))) (clobber (reg:CC CC_REG))])]) @@ -91,7 +91,7 @@ (zero_extend:SI (match_operand:QI 1 "register_operand" "0")))] "TARGET_H8300SX" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (zero_extend:SI (match_dup 1))) (clobber (reg:CC CC_REG))])]) @@ -114,7 +114,7 @@ (zero_extend:SI (match_operand:HI 1 "register_operand" "0")))] "" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (zero_extend:SI (match_dup 1))) (clobber (reg:CC CC_REG))])]) @@ -137,7 +137,7 @@ (sign_extend:HI (match_operand:QI 1 "register_operand" "0")))] "" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (sign_extend:HI (match_dup 1))) (clobber (reg:CC CC_REG))])]) @@ -172,7 +172,7 @@ (sign_extend:SI (match_operand:QI 1 "register_operand" "0")))] "TARGET_H8300SX" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (sign_extend:SI (match_dup 1))) (clobber (reg:CC CC_REG))])]) @@ -195,7 +195,7 @@ (sign_extend:SI (match_operand:HI 1 "register_operand" "0")))] "" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (sign_extend:SI (match_dup 1))) (clobber (reg:CC CC_REG))])]) diff --git a/gcc/config/h8300/jumpcall.md 
b/gcc/config/h8300/jumpcall.md index 49d1e43..7b6a66a 100644 --- a/gcc/config/h8300/jumpcall.md +++ b/gcc/config/h8300/jumpcall.md @@ -22,7 +22,7 @@ (pc)))] "" "#" - "reload_completed" + "&& reload_completed" [(set (reg:H8cc CC_REG) (compare:H8cc (match_dup 1) (match_dup 2))) (set (pc) diff --git a/gcc/config/h8300/logical.md b/gcc/config/h8300/logical.md index d778d24..34cf74e 100644 --- a/gcc/config/h8300/logical.md +++ b/gcc/config/h8300/logical.md @@ -223,7 +223,7 @@ "#" "&& reload_completed" [(parallel [(set (match_dup 0) - (match_op_dup 3 [(match_dup 1) (match_dup 2)])) + (logicals:QHSI (match_dup 1) (match_dup 2))) (clobber (reg:CC CC_REG))])]) (define_insn "*3_clobber_flags" diff --git a/gcc/config/h8300/movepush.md b/gcc/config/h8300/movepush.md index b106cd5..9ce00fb 100644 --- a/gcc/config/h8300/movepush.md +++ b/gcc/config/h8300/movepush.md @@ -9,7 +9,7 @@ (match_operand:QI 1 "general_operand_src" " I,r>,r,n,m,r"))] "!TARGET_H8300SX && h8300_move_ok (operands[0], operands[1])" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (match_dup 1)) (clobber (reg:CC CC_REG))])]) @@ -32,7 +32,7 @@ (match_operand:QI 1 "general_operand_src" "P4>X,rQi"))] "TARGET_H8300SX" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (match_dup 1)) (clobber (reg:CC CC_REG))])]) @@ -69,7 +69,7 @@ (match_operand:QI 1 "general_operand_src" "I,rmi>"))] "" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (strict_low_part (match_dup 0)) (match_dup 1)) (clobber (reg:CC CC_REG))])]) @@ -93,7 +93,7 @@ "!TARGET_H8300SX && h8300_move_ok (operands[0], operands[1])" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (match_dup 1)) (clobber (reg:CC CC_REG))])]) @@ -117,7 +117,7 @@ (match_operand:HI 1 "general_operand_src" "I,P3>X,P4>X,IP8>X,rQi"))] "TARGET_H8300SX" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (match_dup 1)) (clobber (reg:CC CC_REG))])]) @@ -140,7 +140,7 @@ (match_operand:HI 1 "general_operand_src" "I,P3>X,rmi"))] "" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (strict_low_part (match_dup 0)) (match_dup 1)) (clobber (reg:CC CC_REG))])]) @@ -163,7 +163,7 @@ "(TARGET_H8300S || TARGET_H8300H) && !TARGET_H8300SX && h8300_move_ok (operands[0], operands[1])" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (match_dup 1)) (clobber (reg:CC CC_REG))])]) @@ -240,7 +240,7 @@ (match_operand:SI 1 "general_operand_src" "I,P3>X,IP8>X,rQi,I,r,*a"))] "TARGET_H8300SX" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (match_dup 1)) (clobber (reg:CC CC_REG))])]) @@ -265,7 +265,7 @@ (match_operand:SF 1 "general_operand_src" "G,rQi"))] "TARGET_H8300SX" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (match_dup 1)) (clobber (reg:CC CC_REG))])]) @@ -287,7 +287,7 @@ && (register_operand (operands[0], SFmode) || register_operand (operands[1], SFmode))" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (match_dup 1)) (clobber (reg:CC CC_REG))])]) @@ -319,7 +319,7 @@ (match_operand:QHI 0 "register_no_sp_elim_operand" "r"))] "" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (mem:QHI (pre_modify:P (reg:P SP_REG) (plus:P (reg:P SP_REG) (const_int -4)))) diff --git a/gcc/config/h8300/multiply.md b/gcc/config/h8300/multiply.md index 56f2b6f..1d56d47 100644 --- a/gcc/config/h8300/multiply.md +++ b/gcc/config/h8300/multiply.md @@ -21,7 
+21,7 @@ (match_operand:QI 2 "nibble_operand" "IP4>X")))] "TARGET_H8300SX" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (mult:HI (sign_extend:HI (match_dup 1)) (match_dup 2))) (clobber (reg:CC CC_REG))])]) @@ -41,7 +41,7 @@ (sign_extend:HI (match_operand:QI 2 "register_operand" "r"))))] "" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (mult:HI (sign_extend:HI (match_dup 1)) (sign_extend:HI (match_dup 2)))) @@ -73,7 +73,7 @@ (match_operand:SI 2 "nibble_operand" "IP4>X")))] "TARGET_H8300SX" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (mult:SI (sign_extend:SI (match_dup 1)) (match_dup 2))) (clobber (reg:CC CC_REG))])]) @@ -93,7 +93,7 @@ (sign_extend:SI (match_operand:HI 2 "register_operand" "r"))))] "" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (mult:SI (sign_extend:SI (match_dup 1)) (sign_extend:SI (match_dup 2)))) @@ -172,7 +172,7 @@ (match_operand:HSI 2 "reg_or_nibble_operand" "r IP4>X")))] "TARGET_H8300SX" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (mult:HSI (match_dup 1) (match_dup 2))) (clobber (reg:CC CC_REG))])]) @@ -195,7 +195,7 @@ (const_int 32))))] "TARGET_H8300SXMUL" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (truncate:SI (lshiftrt:DI (mult:DI (sign_extend:DI (match_dup 1)) diff --git a/gcc/config/h8300/other.md b/gcc/config/h8300/other.md index 572a29f..c754227 100644 --- a/gcc/config/h8300/other.md +++ b/gcc/config/h8300/other.md @@ -7,7 +7,7 @@ (abs:SF (match_operand:SF 1 "register_operand" "0")))] "" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (abs:SF (match_dup 1))) (clobber (reg:CC CC_REG))])]) diff --git a/gcc/config/h8300/shiftrotate.md b/gcc/config/h8300/shiftrotate.md index 4bf8fe1..23140d9a 100644 --- a/gcc/config/h8300/shiftrotate.md +++ b/gcc/config/h8300/shiftrotate.md @@ -57,7 +57,7 @@ (match_operand:QI 2 "const_int_operand" "")]))] "h8300_operands_match_p (operands)" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (match_op_dup 3 [(match_dup 1) (match_dup 2)])) (clobber (reg:CC CC_REG))])]) @@ -107,7 +107,7 @@ (match_operand:QI 2 "nonmemory_operand" "r P5>X")]))] "" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (match_op_dup 3 [(match_dup 1) (match_dup 2)])) (clobber (reg:CC CC_REG))])]) @@ -158,7 +158,7 @@ (clobber (match_scratch:QI 4 "=X,&r"))] "" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (match_op_dup 3 [(match_dup 1) (match_dup 2)])) (clobber (match_dup 4)) (clobber (reg:CC CC_REG))])]) @@ -186,7 +186,7 @@ && !h8300_shift_needs_scratch_p (INTVAL (operands[2]), QImode, GET_CODE (operands[3])))" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (match_op_dup 3 [(match_dup 1) (match_dup 2)])) (clobber (reg:CC CC_REG))])]) @@ -213,7 +213,7 @@ (clobber (match_scratch:QI 4 "=X,&r"))] "" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (match_op_dup 3 [(match_dup 1) (match_dup 2)])) (clobber (match_dup 4)) (clobber (reg:CC CC_REG))])]) @@ -241,7 +241,7 @@ && !h8300_shift_needs_scratch_p (INTVAL (operands[2]), HImode, GET_CODE (operands[3])))" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (match_op_dup 3 [(match_dup 1) (match_dup 2)])) (clobber (reg:CC CC_REG))])]) @@ -268,7 +268,7 @@ (clobber (match_scratch:QI 4 "=X,&r"))] "" 
"#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (match_op_dup 3 [(match_dup 1) (match_dup 2)])) (clobber (match_dup 4)) (clobber (reg:CC CC_REG))])]) @@ -296,7 +296,7 @@ && !h8300_shift_needs_scratch_p (INTVAL (operands[2]), SImode, GET_CODE (operands[3])))" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (match_op_dup 3 [(match_dup 1) (match_dup 2)])) (clobber (reg:CC CC_REG))])]) @@ -410,7 +410,7 @@ (match_operand:QI 2 "immediate_operand" "")))] "" "#" - "reload_completed" + "&& reload_completed" [(parallel [(set (match_dup 0) (rotate:QHSI (match_dup 1) (match_dup 2))) (clobber (reg:CC CC_REG))])]) -- cgit v1.1 From 9147affc04e1188a385748ad0f51eb7491a792ab Mon Sep 17 00:00:00 2001 From: Kewen Lin Date: Fri, 28 May 2021 00:21:04 -0500 Subject: m68k: Update unexpected empty split condition gcc/ChangeLog: * config/m68k/m68k.md (*zero_extend_inc, *zero_extend_dec, *zero_extendsidi2): Fix empty split condition. --- gcc/config/m68k/m68k.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/m68k/m68k.md b/gcc/config/m68k/m68k.md index 59a456c..82d075e 100644 --- a/gcc/config/m68k/m68k.md +++ b/gcc/config/m68k/m68k.md @@ -1693,7 +1693,7 @@ GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT && GET_MODE_SIZE (GET_MODE (operands[0])) == GET_MODE_SIZE (GET_MODE (operands[1])) * 2" "#" - "" + "&& 1" [(set (match_dup 0) (const_int 0)) (set (match_dup 0) @@ -1710,7 +1710,7 @@ GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT && GET_MODE_SIZE (GET_MODE (operands[0])) == GET_MODE_SIZE (GET_MODE (operands[1])) * 2" "#" - "" + "&& 1" [(set (match_dup 0) (match_dup 1)) (set (match_dup 0) @@ -1764,7 +1764,7 @@ (zero_extend:DI (match_operand:SI 1 "nonimmediate_src_operand" "")))] "GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM" "#" - "" + "&& 1" [(set (match_dup 2) (match_dup 1)) (set (match_dup 3) -- cgit v1.1 From bdce6760c2098558cbf9c89e166ddb172ec56cc7 Mon Sep 17 00:00:00 2001 From: Kewen Lin Date: Fri, 28 May 2021 00:21:07 -0500 Subject: mips: Update unexpected empty split condition gcc/ChangeLog: * config/mips/mips.md (, bswapsi2, bswapdi2): Fix empty split condition. --- gcc/config/mips/mips.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/mips/mips.md b/gcc/config/mips/mips.md index eef3cfd..455b9b8 100644 --- a/gcc/config/mips/mips.md +++ b/gcc/config/mips/mips.md @@ -5835,7 +5835,7 @@ (match_operand:SI 2 "immediate_operand" "I")))] "TARGET_MIPS16" "#" - "" + "&& 1" [(set (match_dup 0) (match_dup 1)) (set (match_dup 0) (lshiftrt:SI (match_dup 0) (match_dup 2)))] "" @@ -5871,7 +5871,7 @@ (bswap:SI (match_operand:SI 1 "register_operand" "d")))] "ISA_HAS_WSBH && ISA_HAS_ROR" "#" - "" + "&& 1" [(set (match_dup 0) (unspec:SI [(match_dup 1)] UNSPEC_WSBH)) (set (match_dup 0) (rotatert:SI (match_dup 0) (const_int 16)))] "" @@ -5882,7 +5882,7 @@ (bswap:DI (match_operand:DI 1 "register_operand" "d")))] "TARGET_64BIT && ISA_HAS_WSBH" "#" - "" + "&& 1" [(set (match_dup 0) (unspec:DI [(match_dup 1)] UNSPEC_DSBH)) (set (match_dup 0) (unspec:DI [(match_dup 0)] UNSPEC_DSHD))] "" -- cgit v1.1 From 8afd2e822903b3df63e69bb04a2aa533047ceb01 Mon Sep 17 00:00:00 2001 From: Kewen Lin Date: Fri, 28 May 2021 00:21:11 -0500 Subject: or1k: Update unexpected empty split condition gcc/ChangeLog: * config/or1k/or1k.md (*movdi): Fix empty split condition. 
--- gcc/config/or1k/or1k.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/or1k/or1k.md b/gcc/config/or1k/or1k.md index eb94efb..495b3e2 100644 --- a/gcc/config/or1k/or1k.md +++ b/gcc/config/or1k/or1k.md @@ -351,7 +351,7 @@ "register_operand (operands[0], DImode) || reg_or_0_operand (operands[1], DImode)" "#" - "" + "&& 1" [(const_int 0)] { rtx l0 = operand_subword (operands[0], 0, 0, DImode); -- cgit v1.1 From 10f36fe50cb3cb75d17903df116719ee2f4e492c Mon Sep 17 00:00:00 2001 From: Kewen Lin Date: Fri, 28 May 2021 00:21:18 -0500 Subject: sparc: Update unexpected empty split condition gcc/ChangeLog: * config/sparc/sparc.md (*snedi_zero_vis3, *neg_snedi_zero_subxc, *plus_snedi_zero, *plus_plus_snedi_zero, *minus_snedi_zero, *minus_minus_snedi_zero): Fix empty split condition. --- gcc/config/sparc/sparc.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/sparc/sparc.md b/gcc/config/sparc/sparc.md index a8d9962..24b76e0 100644 --- a/gcc/config/sparc/sparc.md +++ b/gcc/config/sparc/sparc.md @@ -855,7 +855,7 @@ (clobber (reg:CCX CC_REG))] "TARGET_ARCH64 && TARGET_VIS3" "#" - "" + "&& 1" [(set (reg:CCXC CC_REG) (compare:CCXC (not:DI (match_dup 1)) (const_int -1))) (set (match_dup 0) (ltu:W (reg:CCXC CC_REG) (const_int 0)))] "" @@ -882,7 +882,7 @@ (clobber (reg:CCX CC_REG))] "TARGET_ARCH64 && TARGET_SUBXC" "#" - "" + "&& 1" [(set (reg:CCXC CC_REG) (compare:CCXC (not:DI (match_dup 1)) (const_int -1))) (set (match_dup 0) (neg:W (ltu:W (reg:CCXC CC_REG) (const_int 0))))] "" @@ -984,7 +984,7 @@ (clobber (reg:CCX CC_REG))] "TARGET_ARCH64 && TARGET_VIS3" "#" - "" + "&& 1" [(set (reg:CCXC CC_REG) (compare:CCXC (not:DI (match_dup 1)) (const_int -1))) (set (match_dup 0) (plus:W (ltu:W (reg:CCXC CC_REG) (const_int 0)) (match_dup 2)))] @@ -1000,7 +1000,7 @@ (clobber (reg:CCX CC_REG))] "TARGET_ARCH64 && TARGET_VIS3" "#" - "" + "&& 1" [(set (reg:CCXC CC_REG) (compare:CCXC (not:DI (match_dup 1)) (const_int -1))) (set (match_dup 0) (plus:W (plus:W (ltu:W (reg:CCXC CC_REG) (const_int 0)) (match_dup 2)) @@ -1048,7 +1048,7 @@ (clobber (reg:CCX CC_REG))] "TARGET_ARCH64 && TARGET_SUBXC" "#" - "" + "&& 1" [(set (reg:CCXC CC_REG) (compare:CCXC (not:DI (match_dup 1)) (const_int -1))) (set (match_dup 0) (minus:W (match_dup 2) (ltu:W (reg:CCXC CC_REG) (const_int 0))))] @@ -1064,7 +1064,7 @@ (clobber (reg:CCX CC_REG))] "TARGET_ARCH64 && TARGET_SUBXC" "#" - "" + "&& 1" [(set (reg:CCXC CC_REG) (compare:CCXC (not:DI (match_dup 1)) (const_int -1))) (set (match_dup 0) (minus:W (minus:W (match_dup 2) (ltu:W (reg:CCXC CC_REG) (const_int 0))) -- cgit v1.1 From 081c9dfb67a0d2e7425ddb5420ada588026f92ca Mon Sep 17 00:00:00 2001 From: Kewen Lin Date: Fri, 28 May 2021 00:21:14 -0500 Subject: sh: Update unexpected empty split condition gcc/ChangeLog: * config/sh/sh.md (doloop_end_split): Fix empty split condition. 
--- gcc/config/sh/sh.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/sh/sh.md b/gcc/config/sh/sh.md index e3af9ae..93ee7c9 100644 --- a/gcc/config/sh/sh.md +++ b/gcc/config/sh/sh.md @@ -6424,7 +6424,7 @@ (clobber (reg:SI T_REG))] "TARGET_SH2" "#" - "" + "&& 1" [(parallel [(set (reg:SI T_REG) (eq:SI (match_dup 2) (const_int 1))) (set (match_dup 0) (plus:SI (match_dup 2) (const_int -1)))]) -- cgit v1.1 From 09bf5279e87a15f658e37c7f873ccc40e5ef2576 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Sun, 6 Jun 2021 22:07:05 +0200 Subject: i386: Clean up constraints.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No functional changes. 2021-06-06 Uroš Bizjak gcc/ * config/i386/constraints.md (Bs): Remove boolean operators from match_test RTX. (Bw): Ditto. (L): Ditto. (M): Use "mode" variable instead of GET_MODE (op) in match_test RTX. (Wz): Ditto. --- gcc/config/i386/constraints.md | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/constraints.md b/gcc/config/i386/constraints.md index eaa582d..485e3f5 100644 --- a/gcc/config/i386/constraints.md +++ b/gcc/config/i386/constraints.md @@ -198,7 +198,8 @@ (ior (and (not (match_test "TARGET_INDIRECT_BRANCH_REGISTER")) (not (match_test "TARGET_X32")) (match_operand 0 "sibcall_memory_operand")) - (and (match_test "TARGET_X32 && Pmode == DImode") + (and (match_test "TARGET_X32") + (match_test "Pmode == DImode") (match_operand 0 "GOT_memory_operand")))) (define_constraint "Bw" @@ -206,7 +207,8 @@ (ior (and (not (match_test "TARGET_INDIRECT_BRANCH_REGISTER")) (not (match_test "TARGET_X32")) (match_operand 0 "memory_operand")) - (and (match_test "TARGET_X32 && Pmode == DImode") + (and (match_test "TARGET_X32") + (match_test "Pmode == DImode") (match_operand 0 "GOT_memory_operand")))) (define_constraint "Bz" @@ -239,8 +241,9 @@ "@code{0xFF}, @code{0xFFFF} or @code{0xFFFFFFFF} for AND as a zero-extending move." (and (match_code "const_int") - (match_test "ival == 0xff || ival == 0xffff - || ival == (HOST_WIDE_INT) 0xffffffff"))) + (ior (match_test "ival == 0xff") + (match_test "ival == 0xffff") + (match_test "ival == (HOST_WIDE_INT) 0xffffffff")))) (define_constraint "M" "0, 1, 2, or 3 (shifts for the @code{lea} instruction)." @@ -289,14 +292,14 @@ to fit that range (for sign-extending conversion operations that require non-VOIDmode immediate operands)." (and (match_operand 0 "x86_64_immediate_operand") - (match_test "GET_MODE (op) != VOIDmode"))) + (match_test "mode != VOIDmode"))) (define_constraint "Wz" "32-bit unsigned integer constant, or a symbolic reference known to fit that range (for zero-extending conversion operations that require non-VOIDmode immediate operands)." (and (match_operand 0 "x86_64_zext_immediate_operand") - (match_test "GET_MODE (op) != VOIDmode"))) + (match_test "mode != VOIDmode"))) (define_constraint "Wd" "128-bit integer constant where both the high and low 64-bit word -- cgit v1.1 From 03d921abe60fe9ff54a3c449eff1531e73a19215 Mon Sep 17 00:00:00 2001 From: David Edelsohn Date: Wed, 2 Jun 2021 13:32:45 -0400 Subject: aix: Use assemble_name to output BSS section name. The code to emit BSS CSECT needs to support user assembler name. * config/rs6000/rs6000.c (rs6000_xcoff_asm_output_aligned_decl_common): Use assemble_name to output BSS section name. 
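The reason a plain fprintf of NAME is not enough is that symbols renamed with __asm__ carry a leading '*' marker internally and must bypass the user label prefix, both of which assemble_name handles.  A minimal sketch of the idiom (illustration only, not part of the patch; the helper name is invented and the code assumes GCC's output.h context):

static void
output_globl_sketch (FILE *stream, const char *name)
{
  fputs ("\t.globl ", stream);
  /* assemble_name strips the '*' marker of user assembler names and
     goes through ASM_OUTPUT_LABELREF; fprintf (stream, "%s", name)
     would print the marker literally.  */
  assemble_name (stream, name);
  fputc ('\n', stream);
}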
--- gcc/config/rs6000/rs6000.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 835af77..b01bb5c 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -21654,10 +21654,16 @@ rs6000_xcoff_asm_output_aligned_decl_common (FILE *stream, /* Globalize TLS BSS. */ if (TREE_PUBLIC (decl) && DECL_THREAD_LOCAL_P (decl)) - fprintf (stream, "\t.globl %s\n", name); + { + fputs (GLOBAL_ASM_OP, stream); + assemble_name (stream, name); + fputc ('\n', stream); + } /* Switch to section and skip space. */ - fprintf (stream, "\t.csect %s,%u\n", name, align2); + fputs ("\t.csect ", stream); + assemble_name (stream, name); + fprintf (stream, ",%u\n", align2); ASM_DECLARE_OBJECT_NAME (stream, name, decl); ASM_OUTPUT_SKIP (stream, size ? size : 1); return; -- cgit v1.1 From fed94fc9e704b0de228499495b7ca4d4c79ef76b Mon Sep 17 00:00:00 2001 From: Eric Botcazou Date: Sun, 6 Jun 2021 23:54:24 +0200 Subject: Reimplement LEAF_REG_REMAP macro for the SPARC The current implementation as an array of chars is indeed a bit awkward so this reimplements it as a function taking and returning an int. gcc/ * config/sparc/sparc-protos.h (order_regs_for_local_alloc): Rename to... (sparc_order_regs_for_local_alloc): ...this. (sparc_leaf_reg_remap): Declare. * config/sparc/sparc.h (ADJUST_REG_ALLOC_ORDER): Adjust. (LEAF_REG_REMAP): Reimplement as call to sparc_leaf_reg_remap. * config/sparc/sparc.c (leaf_reg_remap): Delete. (order_regs_for_local_alloc): Rename to... (sparc_order_regs_for_local_alloc): ...this. (sparc_leaf_reg_remap): New function. (sparc_conditional_register_usage): Do not modify leaf_reg_remap. --- gcc/config/sparc/sparc-protos.h | 3 +- gcc/config/sparc/sparc.c | 67 ++++++++++++++++++++--------------------- gcc/config/sparc/sparc.h | 5 ++- 3 files changed, 37 insertions(+), 38 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/sparc/sparc-protos.h b/gcc/config/sparc/sparc-protos.h index ad875cc..8879ac3 100644 --- a/gcc/config/sparc/sparc-protos.h +++ b/gcc/config/sparc/sparc-protos.h @@ -30,7 +30,8 @@ extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree); extern unsigned long sparc_type_code (tree); #endif /* TREE_CODE */ -extern void order_regs_for_local_alloc (void); +extern void sparc_order_regs_for_local_alloc (void); +extern int sparc_leaf_reg_remap (int); extern int sparc_initial_elimination_offset (int); extern void sparc_expand_prologue (void); extern void sparc_flat_expand_prologue (void); diff --git a/gcc/config/sparc/sparc.c b/gcc/config/sparc/sparc.c index b6e66dc..04fc80f 100644 --- a/gcc/config/sparc/sparc.c +++ b/gcc/config/sparc/sparc.c @@ -507,25 +507,6 @@ static const struct processor_costs *sparc_costs = &cypress_costs; ((TARGET_ARCH64 && !TARGET_CM_MEDLOW) || flag_pic) #endif -/* Vector to say how input registers are mapped to output registers. - HARD_FRAME_POINTER_REGNUM cannot be remapped by this function to - eliminate it. You must use -fomit-frame-pointer to get that. 
*/ -char leaf_reg_remap[] = -{ 0, 1, 2, 3, 4, 5, 6, 7, - -1, -1, -1, -1, -1, -1, 14, -1, - -1, -1, -1, -1, -1, -1, -1, -1, - 8, 9, 10, 11, 12, 13, -1, 15, - - 32, 33, 34, 35, 36, 37, 38, 39, - 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 54, 55, - 56, 57, 58, 59, 60, 61, 62, 63, - 64, 65, 66, 67, 68, 69, 70, 71, - 72, 73, 74, 75, 76, 77, 78, 79, - 80, 81, 82, 83, 84, 85, 86, 87, - 88, 89, 90, 91, 92, 93, 94, 95, - 96, 97, 98, 99, 100, 101, 102}; - /* Vector, indexed by hard register number, which contains 1 for a register that is allowable in a candidate for leaf function treatment. */ @@ -8863,18 +8844,18 @@ epilogue_renumber (rtx *where, int test) /* Leaf functions and non-leaf functions have different needs. */ -static const int -reg_leaf_alloc_order[] = REG_LEAF_ALLOC_ORDER; +static const int reg_leaf_alloc_order[] = REG_LEAF_ALLOC_ORDER; -static const int -reg_nonleaf_alloc_order[] = REG_ALLOC_ORDER; +static const int reg_nonleaf_alloc_order[] = REG_ALLOC_ORDER; -static const int *const reg_alloc_orders[] = { +static const int *const reg_alloc_orders[] = +{ reg_leaf_alloc_order, - reg_nonleaf_alloc_order}; + reg_nonleaf_alloc_order +}; void -order_regs_for_local_alloc (void) +sparc_order_regs_for_local_alloc (void) { static int last_order_nonleaf = 1; @@ -8886,7 +8867,28 @@ order_regs_for_local_alloc (void) FIRST_PSEUDO_REGISTER * sizeof (int)); } } - + +int +sparc_leaf_reg_remap (int regno) +{ + gcc_checking_assert (regno >= 0); + + /* Do not remap in flat mode. */ + if (TARGET_FLAT) + return regno; + + /* Do not remap global, stack pointer or floating-point registers. */ + if (regno < 8 || regno == STACK_POINTER_REGNUM || regno > SPARC_LAST_INT_REG) + return regno; + + /* Neither out nor local nor frame pointer registers must appear. */ + if ((regno >= 8 && regno <= 23) || regno == HARD_FRAME_POINTER_REGNUM) + return -1; + + /* Remap in to out registers. */ + return regno - 16; +} + /* Return 1 if REG and MEM are legitimate enough to allow the various MEM<-->REG splits to be run. */ @@ -12983,14 +12985,11 @@ sparc_conditional_register_usage (void) fixed_regs[4] = 1; else if (fixed_regs[4] == 2) fixed_regs[4] = 0; + + /* Disable leaf function optimization in flat mode. */ if (TARGET_FLAT) - { - int regno; - /* Disable leaf functions. */ - memset (sparc_leaf_regs, 0, FIRST_PSEUDO_REGISTER); - for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) - leaf_reg_remap [regno] = regno; - } + memset (sparc_leaf_regs, 0, FIRST_PSEUDO_REGISTER); + if (TARGET_VIS) global_regs[SPARC_GSR_REG] = 1; } diff --git a/gcc/config/sparc/sparc.h b/gcc/config/sparc/sparc.h index 4834575..4da5a06 100644 --- a/gcc/config/sparc/sparc.h +++ b/gcc/config/sparc/sparc.h @@ -965,13 +965,12 @@ extern enum reg_class sparc_regno_reg_class[FIRST_PSEUDO_REGISTER]; 96, 97, 98, 99, /* %fcc0-3 */ \ 100, 0, 14, 30, 31, 101, 102 } /* %icc, %g0, %o6, %i6, %i7, %sfp, %gsr */ -#define ADJUST_REG_ALLOC_ORDER order_regs_for_local_alloc () +#define ADJUST_REG_ALLOC_ORDER sparc_order_regs_for_local_alloc () extern char sparc_leaf_regs[]; #define LEAF_REGISTERS sparc_leaf_regs -extern char leaf_reg_remap[]; -#define LEAF_REG_REMAP(REGNO) (leaf_reg_remap[REGNO]) +#define LEAF_REG_REMAP(REGNO) sparc_leaf_reg_remap (REGNO) /* The class value for index registers, and the one for base regs. 
*/ #define INDEX_REG_CLASS GENERAL_REGS -- cgit v1.1 From 9a90b311f22956addaf4f5f9bdb3592afd45083f Mon Sep 17 00:00:00 2001 From: liuhongt Date: Tue, 1 Jun 2021 09:09:44 +0800 Subject: Fix _mm256_zeroupper by representing the instructions as call_insns in which the call has a special vzeroupper ABI. When __builtin_ia32_vzeroupper is called explicitly, the corresponding vzeroupper pattern does not carry any CLOBBERS or SETs before LRA, which leads to incorrect optimization in pass_reload. In order to solve this problem, this patch refine instructions as call_insns in which the call has a special vzeroupper ABI. gcc/ChangeLog: PR target/82735 * config/i386/i386-expand.c (ix86_expand_builtin): Remove assignment of cfun->machine->has_explicit_vzeroupper. * config/i386/i386-features.c (ix86_add_reg_usage_to_vzerouppers): Delete. (ix86_add_reg_usage_to_vzeroupper): Ditto. (rest_of_handle_insert_vzeroupper): Remove ix86_add_reg_usage_to_vzerouppers, add df_analyze at the end of the function. (gate): Remove cfun->machine->has_explicit_vzeroupper. * config/i386/i386-protos.h (ix86_expand_avx_vzeroupper): Declared. * config/i386/i386.c (ix86_insn_callee_abi): New function. (ix86_initialize_callee_abi): Ditto. (ix86_expand_avx_vzeroupper): Ditto. (ix86_hard_regno_call_part_clobbered): Adjust for vzeroupper ABI. (TARGET_INSN_CALLEE_ABI): Define as ix86_insn_callee_abi. (ix86_emit_mode_set): Call ix86_expand_avx_vzeroupper directly. * config/i386/i386.h (struct GTY(()) machine_function): Delete has_explicit_vzeroupper. * config/i386/i386.md (enum unspec): New member UNSPEC_CALLEE_ABI. (ABI_DEFAULT,ABI_VZEROUPPER,ABI_UNKNOWN): New define_constants for insn callee abi index. * config/i386/predicates.md (vzeroupper_pattern): Adjust. * config/i386/sse.md (UNSPECV_VZEROUPPER): Deleted. (avx_vzeroupper): Call ix86_expand_avx_vzeroupper. (*avx_vzeroupper): Rename to .. (avx_vzeroupper_callee_abi): .. this, and adjust pattern as call_insn which has a special vzeroupper ABI. (*avx_vzeroupper_1): Deleted. gcc/testsuite/ChangeLog: PR target/82735 * gcc.target/i386/pr82735-1.c: New test. * gcc.target/i386/pr82735-2.c: New test. * gcc.target/i386/pr82735-3.c: New test. * gcc.target/i386/pr82735-4.c: New test. * gcc.target/i386/pr82735-5.c: New test. --- gcc/config/i386/i386-expand.c | 4 -- gcc/config/i386/i386-features.c | 99 ++++++----------------------------------- gcc/config/i386/i386-protos.h | 1 + gcc/config/i386/i386.c | 55 ++++++++++++++++++++++- gcc/config/i386/i386.h | 4 -- gcc/config/i386/i386.md | 10 +++++ gcc/config/i386/predicates.md | 5 ++- gcc/config/i386/sse.md | 59 ++++++------------------ 8 files changed, 94 insertions(+), 143 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 804cb59..fb0676f 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -13310,10 +13310,6 @@ rdseed_step: return 0; - case IX86_BUILTIN_VZEROUPPER: - cfun->machine->has_explicit_vzeroupper = true; - break; - default: break; } diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c index 77783a1..a25769a 100644 --- a/gcc/config/i386/i386-features.c +++ b/gcc/config/i386/i386-features.c @@ -1768,92 +1768,22 @@ convert_scalars_to_vector (bool timode_p) return 0; } -/* Modify the vzeroupper pattern in INSN so that it describes the effect - that the instruction has on the SSE registers. LIVE_REGS are the set - of registers that are live across the instruction. 
- - For a live register R we use: - - (set (reg:V2DF R) (reg:V2DF R)) - - which preserves the low 128 bits but clobbers the upper bits. */ - -static void -ix86_add_reg_usage_to_vzeroupper (rtx_insn *insn, bitmap live_regs) -{ - rtx pattern = PATTERN (insn); - unsigned int nregs = TARGET_64BIT ? 16 : 8; - unsigned int npats = nregs; - for (unsigned int i = 0; i < nregs; ++i) - { - unsigned int regno = GET_SSE_REGNO (i); - if (!bitmap_bit_p (live_regs, regno)) - npats--; - } - if (npats == 0) - return; - rtvec vec = rtvec_alloc (npats + 1); - RTVEC_ELT (vec, 0) = XVECEXP (pattern, 0, 0); - for (unsigned int i = 0, j = 0; i < nregs; ++i) - { - unsigned int regno = GET_SSE_REGNO (i); - if (!bitmap_bit_p (live_regs, regno)) - continue; - rtx reg = gen_rtx_REG (V2DImode, regno); - ++j; - RTVEC_ELT (vec, j) = gen_rtx_SET (reg, reg); - } - XVEC (pattern, 0) = vec; - INSN_CODE (insn) = -1; - df_insn_rescan (insn); -} - -/* Walk the vzeroupper instructions in the function and annotate them - with the effect that they have on the SSE registers. */ - -static void -ix86_add_reg_usage_to_vzerouppers (void) -{ - basic_block bb; - rtx_insn *insn; - auto_bitmap live_regs; - - df_analyze (); - FOR_EACH_BB_FN (bb, cfun) - { - bitmap_copy (live_regs, df_get_live_out (bb)); - df_simulate_initialize_backwards (bb, live_regs); - FOR_BB_INSNS_REVERSE (bb, insn) - { - if (!NONDEBUG_INSN_P (insn)) - continue; - if (vzeroupper_pattern (PATTERN (insn), VOIDmode)) - ix86_add_reg_usage_to_vzeroupper (insn, live_regs); - df_simulate_one_insn_backwards (bb, insn, live_regs); - } - } -} - static unsigned int rest_of_handle_insert_vzeroupper (void) { - if (TARGET_VZEROUPPER - && flag_expensive_optimizations - && !optimize_size) - { - /* vzeroupper instructions are inserted immediately after reload to - account for possible spills from 256bit or 512bit registers. The pass - reuses mode switching infrastructure by re-running mode insertion - pass, so disable entities that have already been processed. */ - for (int i = 0; i < MAX_386_ENTITIES; i++) - ix86_optimize_mode_switching[i] = 0; + /* vzeroupper instructions are inserted immediately after reload to + account for possible spills from 256bit or 512bit registers. The pass + reuses mode switching infrastructure by re-running mode insertion + pass, so disable entities that have already been processed. */ + for (int i = 0; i < MAX_386_ENTITIES; i++) + ix86_optimize_mode_switching[i] = 0; - ix86_optimize_mode_switching[AVX_U128] = 1; + ix86_optimize_mode_switching[AVX_U128] = 1; - /* Call optimize_mode_switching. */ - g->get_passes ()->execute_pass_mode_switching (); - } - ix86_add_reg_usage_to_vzerouppers (); + /* Call optimize_mode_switching. 
*/ + g->get_passes ()->execute_pass_mode_switching (); + + df_analyze (); return 0; } @@ -1882,11 +1812,8 @@ public: /* opt_pass methods: */ virtual bool gate (function *) { - return TARGET_AVX - && ((TARGET_VZEROUPPER - && flag_expensive_optimizations - && !optimize_size) - || cfun->machine->has_explicit_vzeroupper); + return TARGET_AVX && TARGET_VZEROUPPER + && flag_expensive_optimizations && !optimize_size; } virtual unsigned int execute (function *) diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 7782cf11..e6ac939 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -216,6 +216,7 @@ extern rtx ix86_split_stack_guard (void); extern void ix86_move_vector_high_sse_to_mmx (rtx); extern void ix86_split_mmx_pack (rtx[], enum rtx_code); extern void ix86_split_mmx_punpck (rtx[], bool); +extern void ix86_expand_avx_vzeroupper (void); #ifdef TREE_CODE extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int); diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 04649b4..b0d19a6 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -14426,7 +14426,7 @@ ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED, break; case AVX_U128: if (mode == AVX_U128_CLEAN) - emit_insn (gen_avx_vzeroupper ()); + ix86_expand_avx_vzeroupper (); break; case I387_ROUNDEVEN: case I387_TRUNC: @@ -19497,15 +19497,63 @@ ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode) return false; } +/* Implement TARGET_INSN_CALLEE_ABI. */ + +const predefined_function_abi & +ix86_insn_callee_abi (const rtx_insn *insn) +{ + unsigned int abi_id = 0; + rtx pat = PATTERN (insn); + if (vzeroupper_pattern (pat, VOIDmode)) + abi_id = ABI_VZEROUPPER; + + return function_abis[abi_id]; +} + +/* Initialize function_abis with corresponding abi_id, + currently only handle vzeroupper. */ +void +ix86_initialize_callee_abi (unsigned int abi_id) +{ + gcc_assert (abi_id == ABI_VZEROUPPER); + predefined_function_abi &vzeroupper_abi = function_abis[abi_id]; + if (!vzeroupper_abi.initialized_p ()) + { + HARD_REG_SET full_reg_clobbers; + CLEAR_HARD_REG_SET (full_reg_clobbers); + vzeroupper_abi.initialize (ABI_VZEROUPPER, full_reg_clobbers); + } +} + +void +ix86_expand_avx_vzeroupper (void) +{ + /* Initialize vzeroupper_abi here. */ + ix86_initialize_callee_abi (ABI_VZEROUPPER); + rtx_insn *insn = emit_call_insn (gen_avx_vzeroupper_callee_abi ()); + /* Return false for non-local goto in can_nonlocal_goto. */ + make_reg_eh_region_note (insn, 0, INT_MIN); + /* Flag used for call_insn indicates it's a fake call. */ + RTX_FLAG (insn, used) = 1; +} + + /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The only ABI that saves SSE registers across calls is Win64 (thus no need to check the current ABI here), and with AVX enabled Win64 only guarantees that the low 16 bytes are saved. */ static bool -ix86_hard_regno_call_part_clobbered (unsigned int, unsigned int regno, +ix86_hard_regno_call_part_clobbered (unsigned int abi_id, unsigned int regno, machine_mode mode) { + /* Special ABI for vzeroupper which only clobber higher part of sse regs. 
*/ + if (abi_id == ABI_VZEROUPPER) + return (GET_MODE_SIZE (mode) > 16 + && ((TARGET_64BIT + && (IN_RANGE (regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG))) + || (IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG)))); + return SSE_REGNO_P (regno) && GET_MODE_SIZE (mode) > 16; } @@ -23926,6 +23974,9 @@ ix86_run_selftests (void) #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \ ix86_hard_regno_call_part_clobbered +#undef TARGET_INSN_CALLEE_ABI +#define TARGET_INSN_CALLEE_ABI ix86_insn_callee_abi + #undef TARGET_CAN_CHANGE_MODE_CLASS #define TARGET_CAN_CHANGE_MODE_CLASS ix86_can_change_mode_class diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 53d503f..919d0b2 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -2659,10 +2659,6 @@ struct GTY(()) machine_function { /* True if the function needs a stack frame. */ BOOL_BITFIELD stack_frame_required : 1; - /* True if __builtin_ia32_vzeroupper () has been expanded in current - function. */ - BOOL_BITFIELD has_explicit_vzeroupper : 1; - /* True if we should act silently, rather than raise an error for invalid calls. */ BOOL_BITFIELD silent_p : 1; diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index f0bb798..5ff49ec 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -191,6 +191,10 @@ ;; For MOVDIRI and MOVDIR64B support UNSPEC_MOVDIRI UNSPEC_MOVDIR64B + + ;; For insn_callee_abi: + UNSPEC_CALLEE_ABI + ]) (define_c_enum "unspecv" [ @@ -447,6 +451,12 @@ (FIRST_PSEUDO_REG 76) ]) +;; Insn callee abi index. +(define_constants + [(ABI_DEFAULT 0) + (ABI_VZEROUPPER 1) + (ABI_UNKNOWN 2)]) + ;; Insns whose names begin with "x86_" are emitted by gen_FOO calls ;; from i386.c. diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index d2f5f15..3dd134e 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1596,8 +1596,9 @@ ;; return true if OP is a vzeroupper pattern. (define_predicate "vzeroupper_pattern" (and (match_code "parallel") - (match_code "unspec_volatile" "a") - (match_test "XINT (XVECEXP (op, 0, 0), 1) == UNSPECV_VZEROUPPER"))) + (match_code "unspec" "b") + (match_test "XINT (XVECEXP (op, 0, 1), 1) == UNSPEC_CALLEE_ABI") + (match_test "INTVAL (XVECEXP (XVECEXP (op, 0, 1), 0, 0)) == ABI_VZEROUPPER"))) ;; Return true if OP is an addsub vec_merge operation (define_predicate "addsub_vm_operator" diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index e4248e5..9fc1176 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -205,7 +205,6 @@ UNSPECV_MONITOR UNSPECV_MWAIT UNSPECV_VZEROALL - UNSPECV_VZEROUPPER ;; For KEYLOCKER UNSPECV_LOADIWKEY @@ -20872,14 +20871,22 @@ ;; if the upper 128bits are unused. Initially we expand the instructions ;; as though they had no effect on the SSE registers, but later add SETs and ;; CLOBBERs to the PARALLEL to model the real effect. + (define_expand "avx_vzeroupper" - [(parallel [(unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER)])] - "TARGET_AVX") + [(parallel [(call (mem:QI (const_int 0)) + (const_int 0)) + (unspec [(const_int ABI_VZEROUPPER)] UNSPEC_CALLEE_ABI)])] + "TARGET_AVX" +{ + ix86_expand_avx_vzeroupper (); + DONE; +}) -(define_insn "*avx_vzeroupper" - [(match_parallel 0 "vzeroupper_pattern" - [(unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER)])] - "TARGET_AVX && XVECLEN (operands[0], 0) == (TARGET_64BIT ? 
16 : 8) + 1" +(define_insn "avx_vzeroupper_callee_abi" + [(call (mem:QI (const_int 0)) + (const_int 0)) + (unspec [(const_int ABI_VZEROUPPER)] UNSPEC_CALLEE_ABI)] + "TARGET_AVX" "vzeroupper" [(set_attr "type" "sse") (set_attr "modrm" "0") @@ -20888,44 +20895,6 @@ (set_attr "btver2_decode" "vector") (set_attr "mode" "OI")]) -(define_insn_and_split "*avx_vzeroupper_1" - [(match_parallel 0 "vzeroupper_pattern" - [(unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER)])] - "TARGET_AVX && XVECLEN (operands[0], 0) != (TARGET_64BIT ? 16 : 8) + 1" - "#" - "&& epilogue_completed" - [(match_dup 0)] -{ - /* For IPA-RA purposes, make it clear the instruction clobbers - even XMM registers not mentioned explicitly in the pattern. */ - unsigned int nregs = TARGET_64BIT ? 16 : 8; - unsigned int npats = XVECLEN (operands[0], 0); - rtvec vec = rtvec_alloc (nregs + 1); - RTVEC_ELT (vec, 0) = XVECEXP (operands[0], 0, 0); - for (unsigned int i = 0, j = 1; i < nregs; ++i) - { - unsigned int regno = GET_SSE_REGNO (i); - if (j < npats - && REGNO (SET_DEST (XVECEXP (operands[0], 0, j))) == regno) - { - RTVEC_ELT (vec, i + 1) = XVECEXP (operands[0], 0, j); - j++; - } - else - { - rtx reg = gen_rtx_REG (V2DImode, regno); - RTVEC_ELT (vec, i + 1) = gen_rtx_CLOBBER (VOIDmode, reg); - } - } - operands[0] = gen_rtx_PARALLEL (VOIDmode, vec); -} - [(set_attr "type" "sse") - (set_attr "modrm" "0") - (set_attr "memory" "none") - (set_attr "prefix" "vex") - (set_attr "btver2_decode" "vector") - (set_attr "mode" "OI")]) - (define_mode_attr pbroadcast_evex_isa [(V64QI "avx512bw") (V32QI "avx512bw") (V16QI "avx512bw") (V32HI "avx512bw") (V16HI "avx512bw") (V8HI "avx512bw") -- cgit v1.1 From be5efe9c12cb852c788f74f8555e6ab8d755479b Mon Sep 17 00:00:00 2001 From: liuhongt Date: Thu, 3 Jun 2021 16:38:32 +0800 Subject: Fix ICE of insn does not satisfy its constraints. evex encoding vpmovzxbx needs both AVX512BW and AVX512VL which means constraint "Yw" should be used instead of constraint "v". gcc/ChangeLog: PR target/100885 * config/i386/sse.md (*sse4_1_zero_extendv8qiv8hi2_3): Refine constraints. (v4siv4di2): Delete constraints for define_expand. gcc/testsuite/ChangeLog: PR target/100885 * g++.target/i386/pr100885.C: New test. 
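A reduced example of the kind of source that reaches this pattern (hypothetical, not the committed g++ testcase):

/* With AVX512F enabled but without AVX512BW/AVX512VL, vpmovzxbw cannot
   use the EVEX-only %xmm16-%xmm31 registers that the old "v" constraint
   allowed, hence the switch to "Yw".  */
void
widen8 (unsigned short *__restrict dst, const unsigned char *__restrict src)
{
  for (int i = 0; i < 8; i++)
    dst[i] = src[i];
}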
--- gcc/config/i386/sse.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 9fc1176..2a34756 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -18115,10 +18115,10 @@ "operands[1] = adjust_address_nv (operands[1], V8QImode, 0);") (define_insn_and_split "*sse4_1_zero_extendv8qiv8hi2_3" - [(set (match_operand:V16QI 0 "register_operand" "=Yr,*x,v") + [(set (match_operand:V16QI 0 "register_operand" "=Yr,*x,Yw") (vec_select:V16QI (vec_concat:V32QI - (match_operand:V16QI 1 "vector_operand" "YrBm,*xBm,vm") + (match_operand:V16QI 1 "vector_operand" "YrBm,*xBm,Ywm") (match_operand:V16QI 2 "const0_operand" "C,C,C")) (match_parallel 3 "pmovzx_parallel" [(match_operand 4 "const_int_operand" "n,n,n")])))] @@ -18803,9 +18803,9 @@ }) (define_expand "v4siv4di2" - [(set (match_operand:V4DI 0 "register_operand" "=v") + [(set (match_operand:V4DI 0 "register_operand") (any_extend:V4DI - (match_operand:V4SI 1 "nonimmediate_operand" "vm")))] + (match_operand:V4SI 1 "nonimmediate_operand")))] "TARGET_AVX2") (define_insn "sse4_1_v2siv2di2" -- cgit v1.1 From f0d1a675e0f621fc12c7a9db47446ae38289408a Mon Sep 17 00:00:00 2001 From: Jeff Law Date: Sun, 6 Jun 2021 00:44:13 -0400 Subject: Use moves to eliminate redundant test/compare instructions gcc/ * config/h8300/movepush.md: Change most _clobber_flags patterns to instead use subst. (movsi_cczn): New pattern with usable CC cases split out. (movsi_h8sx_cczn): Likewise. --- gcc/config/h8300/movepush.md | 53 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 44 insertions(+), 9 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/h8300/movepush.md b/gcc/config/h8300/movepush.md index 9ce00fb..ada4ddd 100644 --- a/gcc/config/h8300/movepush.md +++ b/gcc/config/h8300/movepush.md @@ -13,7 +13,7 @@ [(parallel [(set (match_dup 0) (match_dup 1)) (clobber (reg:CC CC_REG))])]) -(define_insn "*movqi_clobber_flags" +(define_insn "*movqi" [(set (match_operand:QI 0 "general_operand_dst" "=r,r ,<,r,r,m") (match_operand:QI 1 "general_operand_src" " I,r>,r,n,m,r")) (clobber (reg:CC CC_REG))] @@ -36,7 +36,7 @@ [(parallel [(set (match_dup 0) (match_dup 1)) (clobber (reg:CC CC_REG))])]) -(define_insn "*movqi_h8sx_clobber_flags" +(define_insn "*movqi_h8sx" [(set (match_operand:QI 0 "general_operand_dst" "=Z,rQ") (match_operand:QI 1 "general_operand_src" "P4>X,rQi")) (clobber (reg:CC CC_REG))] @@ -74,7 +74,7 @@ (clobber (reg:CC CC_REG))])]) -(define_insn "movstrictqi_clobber_flags" +(define_insn "*movstrictqi" [(set (strict_low_part (match_operand:QI 0 "general_operand_dst" "+r,r")) (match_operand:QI 1 "general_operand_src" "I,rmi>")) (clobber (reg:CC CC_REG))] @@ -97,7 +97,7 @@ [(parallel [(set (match_dup 0) (match_dup 1)) (clobber (reg:CC CC_REG))])]) -(define_insn "*movhi_clobber_flags" +(define_insn "*movhi" [(set (match_operand:HI 0 "general_operand_dst" "=r,r,<,r,r,m") (match_operand:HI 1 "general_operand_src" "I,r>,r,i,m,r")) (clobber (reg:CC CC_REG))] @@ -121,7 +121,7 @@ [(parallel [(set (match_dup 0) (match_dup 1)) (clobber (reg:CC CC_REG))])]) -(define_insn "*movhi_h8sx_clobber_flags" +(define_insn "*movhi_h8sx" [(set (match_operand:HI 0 "general_operand_dst" "=r,r,Z,Q,rQ") (match_operand:HI 1 "general_operand_src" "I,P3>X,P4>X,IP8>X,rQi")) (clobber (reg:CC CC_REG))] @@ -144,7 +144,7 @@ [(parallel [(set (strict_low_part (match_dup 0)) (match_dup 1)) (clobber (reg:CC CC_REG))])]) -(define_insn "movstricthi_clobber_flags" +(define_insn 
"*movstricthi" [(set (strict_low_part (match_operand:HI 0 "general_operand_dst" "+r,r,r")) (match_operand:HI 1 "general_operand_src" "I,P3>X,rmi")) (clobber (reg:CC CC_REG))] @@ -168,8 +168,8 @@ (clobber (reg:CC CC_REG))])]) (define_insn "*movsi_clobber_flags" - [(set (match_operand:SI 0 "general_operand_dst" "=r,r,r,<,r,r,m,*a,*a,r") - (match_operand:SI 1 "general_operand_src" "I,r,i,r,>,m,r,I,r,*a")) + [(set (match_operand:SI 0 "general_operand_dst" "=r,r,r,<,r,r,m,*a,*a, r") + (match_operand:SI 1 "general_operand_src" " I,r,i,r,>,m,r, I, r,*a")) (clobber (reg:CC CC_REG))] "(TARGET_H8300S || TARGET_H8300H) && !TARGET_H8300SX && h8300_move_ok (operands[0], operands[1])" @@ -235,6 +235,25 @@ } [(set (attr "length") (symbol_ref "compute_mov_length (operands)"))]) +(define_insn "*movsi_cczn" + [(set (reg:CCZN CC_REG) + (compare:CCZN + (match_operand:SI 1 "general_operand_src" " I,r,i,r,>,m,r") + (const_int 0))) + (set (match_operand:SI 0 "general_operand_dst" "=r,r,r,<,r,r,m") + (match_dup 1))] + "(TARGET_H8300S || TARGET_H8300H) && !TARGET_H8300SX + && h8300_move_ok (operands[0], operands[1])" + "@ + sub.l %S0,%S0 + mov.l %S1,%S0 + mov.l %S1,%S0 + mov.l %S1,%S0 + mov.l %S1,%S0 + mov.l %S1,%S0 + mov.l %S1,%S0" + [(set (attr "length") (symbol_ref "compute_mov_length (operands)"))]) + (define_insn_and_split "*movsi_h8sx" [(set (match_operand:SI 0 "general_operand_dst" "=r,r,Q,rQ,*a,*a,r") (match_operand:SI 1 "general_operand_src" "I,P3>X,IP8>X,rQi,I,r,*a"))] @@ -260,6 +279,22 @@ [(set_attr "length_table" "*,*,short_immediate,movl,*,*,*") (set_attr "length" "2,2,*,*,2,6,4")]) +(define_insn "*movsi_h8sx_ccnz" + [(set (reg:CCZN CC_REG) + (compare:CCZN + (match_operand:SI 1 "general_operand_src" "I,P3>X,IP8>X,rQi") + (const_int 0))) + (set (match_operand:SI 0 "general_operand_dst" "=r,r,Q,rQ") + (match_dup 1))] + "TARGET_H8300SX" + "@ + sub.l %S0,%S0 + mov.l %S1:3,%S0 + mov.l %S1,%S0 + mov.l %S1,%S0" + [(set_attr "length_table" "*,*,short_immediate,movl") + (set_attr "length" "2,2,*,*")]) + (define_insn_and_split "*movsf_h8sx" [(set (match_operand:SF 0 "general_operand_dst" "=r,rQ") (match_operand:SF 1 "general_operand_src" "G,rQi"))] @@ -326,7 +361,7 @@ (match_dup 0)) (clobber (reg:CC CC_REG))])]) -(define_insn "*push1__clobber_flags" +(define_insn "*push1_" [(set (mem:QHI (pre_modify:P (reg:P SP_REG) -- cgit v1.1 From 64735dc923e0a1a2e04c5313471d91ca8b954e9a Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 7 Jun 2021 22:58:15 +0200 Subject: i386: Add init pattern for V4QI vectors [PR100637] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 2021-06-07 Uroš Bizjak gcc/ PR target/100637 * config/i386/i386-expand.c (ix86_expand_vector_init_duplicate): Handle V4QI mode. (ix86_expand_vector_init_one_nonzero): Ditto. (ix86_expand_vector_init_one_var): Ditto. (ix86_expand_vector_init_general): Ditto. * config/i386/mmx.md (vec_initv4qiqi): New expander. gcc/testsuite/ PR target/100637 * gcc.target/i386/pr100637-5b.c: New test. * gcc.target/i386/pr100637-5w.c: Ditto. 
--- gcc/config/i386/i386-expand.c | 9 +++++++++ gcc/config/i386/mmx.md | 12 +++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index fb0676f..c3ce21b 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -13733,6 +13733,7 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode, return false; case E_V8QImode: + case E_V4QImode: if (!mmx_ok) return false; goto widen; @@ -13878,6 +13879,9 @@ ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode, case E_V4HImode: use_vector_set = TARGET_SSE || TARGET_3DNOW_A; break; + case E_V4QImode: + use_vector_set = TARGET_SSE4_1; + break; case E_V32QImode: case E_V16HImode: use_vector_set = TARGET_AVX; @@ -14086,6 +14090,10 @@ ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode, break; wmode = V4HImode; goto widen; + case E_V4QImode: + if (TARGET_SSE4_1) + break; + wmode = V2HImode; widen: /* There's no way to set one QImode entry easily. Combine the variable value with its adjacent constant value, and @@ -14535,6 +14543,7 @@ quarter: case E_V8QImode: case E_V2HImode: + case E_V4QImode: break; default: diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index c3fd280..0a17a54 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -3369,7 +3369,17 @@ (match_operand 1)] "TARGET_SSE2" { - ix86_expand_vector_init (false, operands[0], + ix86_expand_vector_init (TARGET_MMX_WITH_SSE, operands[0], + operands[1]); + DONE; +}) + +(define_expand "vec_initv4qiqi" + [(match_operand:V2HI 0 "register_operand") + (match_operand 1)] + "TARGET_SSE2" +{ + ix86_expand_vector_init (TARGET_MMX_WITH_SSE, operands[0], operands[1]); DONE; }) -- cgit v1.1 From 941aa24ca9553b422dba6e267448ddd952bc52d1 Mon Sep 17 00:00:00 2001 From: Jeff Law Date: Tue, 8 Jun 2021 10:10:23 -0400 Subject: Further improve redundant test/compare removal on the H8 gcc/ * config/h8300/logical.md (andqi3_1): Move BCLR case into define_insn_and_split. Create length attribute on define_insn_and_split. Only split for cases which we know will use AND. (andqi3_1): Renamed from andqi3_1_clobber_flags. Only handle AND here and fix length computation. (bmsx): Combine QImode and HImode H8/SX patterns using iterator. 
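Like the moves in the previous change, the H8 logical instructions set the Z and N flags as a side effect, which is what lets the compare-elimination pass drop an explicit test.  A hypothetical fragment of the sort that benefits (not from the testsuite):

int
mask_and_test (unsigned char x)
{
  unsigned char t = x & 0x0f;   /* and.b already sets Z/N */
  return t != 0;                /* the separate test can be elided */
}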
--- gcc/config/h8300/logical.md | 41 ++++++++++++++++------------------------- 1 file changed, 16 insertions(+), 25 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/h8300/logical.md b/gcc/config/h8300/logical.md index 34cf74e..fae3c7c 100644 --- a/gcc/config/h8300/logical.md +++ b/gcc/config/h8300/logical.md @@ -62,22 +62,21 @@ (match_operand:QI 2 "h8300_src_operand" "Y0,rn")))] "register_operand (operands[0], QImode) || single_zero_operand (operands[2], QImode)" - "#" - "&& reload_completed" + "bclr %W2,%R0" + "&& reload_completed && !single_zero_operand (operands[2], QImode)" [(parallel [(set (match_dup 0) (and:QI (match_dup 1) (match_dup 2))) - (clobber (reg:CC CC_REG))])]) + (clobber (reg:CC CC_REG))])] + "" + [(set_attr "length" "8,2")]) -(define_insn "andqi3_1_clobber_flags" - [(set (match_operand:QI 0 "bit_operand" "=U,r") - (and:QI (match_operand:QI 1 "bit_operand" "%0,0") - (match_operand:QI 2 "h8300_src_operand" "Y0,rn"))) +(define_insn "*andqi3_1" + [(set (match_operand:QI 0 "register_operand" "=r") + (and:QI (match_operand:QI 1 "register_operand" "%0") + (match_operand:QI 2 "h8300_src_operand" "rn"))) (clobber (reg:CC CC_REG))] - "register_operand (operands[0], QImode) - || single_zero_operand (operands[2], QImode)" - "@ - bclr %W2,%R0 - and %X2,%X0" - [(set_attr "length" "2,8")]) + "" + "and %X2,%X0" + [(set_attr "length" "2")]) (define_insn_and_split "*andor3" [(set (match_operand:QHSI 0 "register_operand" "=r") @@ -166,22 +165,14 @@ ;; OR/XOR INSTRUCTIONS ;; ---------------------------------------------------------------------- -(define_insn "bqi_msx" - [(set (match_operand:QI 0 "bit_register_indirect_operand" "=WU") - (ors:QI (match_operand:QI 1 "bit_register_indirect_operand" "%0") - (match_operand:QI 2 "single_one_operand" "Y2")))] +(define_insn "b_msx" + [(set (match_operand:QHI 0 "bit_register_indirect_operand" "=WU") + (ors:QHI (match_operand:QHI 1 "bit_register_indirect_operand" "%0") + (match_operand:QHI 2 "single_one_operand" "Y2")))] "TARGET_H8300SX && rtx_equal_p (operands[0], operands[1])" { return == IOR ? "bset\\t%V2,%0" : "bnot\\t%V2,%0"; } [(set_attr "length" "8")]) -(define_insn "bhi_msx" - [(set (match_operand:HI 0 "bit_register_indirect_operand" "=m") - (ors:HI (match_operand:HI 1 "bit_register_indirect_operand" "%0") - (match_operand:HI 2 "single_one_operand" "Y2")))] - "TARGET_H8300SX" - { return == IOR ? "bset\\t%V2,%0" : "bnot\\t%V2,%0"; } - [(set_attr "length" "8")]) - (define_insn_and_split "qi3_1" [(set (match_operand:QI 0 "bit_operand" "=U,rQ") (ors:QI (match_operand:QI 1 "bit_operand" "%0,0") -- cgit v1.1 From 69bb37f9e0143fbca3124069c0e9b6937ccf1fc7 Mon Sep 17 00:00:00 2001 From: Pat Haugen Date: Tue, 8 Jun 2021 11:41:55 -0500 Subject: Update Power10 scheduling description for new fused instruction types. gcc/ChangeLog: * config/rs6000/power10.md (power10-fused-load, power10-fused-store, power10-fused_alu, power10-fused-vec, power10-fused-branch): New. 
--- gcc/config/rs6000/power10.md | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/power10.md b/gcc/config/rs6000/power10.md index 665f0f2..0186ae9 100644 --- a/gcc/config/rs6000/power10.md +++ b/gcc/config/rs6000/power10.md @@ -100,6 +100,11 @@ (eq_attr "cpu" "power10")) "DU_any_power10,LU_power10") +(define_insn_reservation "power10-fused-load" 4 + (and (eq_attr "type" "fused_load_cmpi,fused_addis_load,fused_load_load") + (eq_attr "cpu" "power10")) + "DU_even_power10,LU_power10") + (define_insn_reservation "power10-prefixed-load" 4 (and (eq_attr "type" "load") (eq_attr "update" "no") @@ -176,6 +181,11 @@ (eq_attr "cpu" "power10")) "DU_any_power10,STU_power10") +(define_insn_reservation "power10-fused-store" 0 + (and (eq_attr "type" "fused_store_store") + (eq_attr "cpu" "power10")) + "DU_even_power10,STU_power10") + (define_insn_reservation "power10-prefixed-store" 0 (and (eq_attr "type" "store,fpstore,vecstore") (eq_attr "prefixed" "yes") @@ -244,6 +254,11 @@ (define_bypass 4 "power10-alu" "power10-crlogical,power10-mfcr,power10-mfcrf") +(define_insn_reservation "power10-fused_alu" 2 + (and (eq_attr "type" "fused_arith_logical,fused_cmp_isel,fused_carry") + (eq_attr "cpu" "power10")) + "DU_even_power10,EXU_power10") + ; paddi (define_insn_reservation "power10-paddi" 2 (and (eq_attr "type" "add") @@ -403,6 +418,11 @@ (eq_attr "cpu" "power10")) "DU_any_power10,EXU_power10") +(define_insn_reservation "power10-fused-vec" 2 + (and (eq_attr "type" "fused_vector") + (eq_attr "cpu" "power10")) + "DU_even_power10,EXU_power10") + (define_insn_reservation "power10-veccmp" 3 (and (eq_attr "type" "veccmp") (eq_attr "cpu" "power10")) @@ -490,6 +510,11 @@ (eq_attr "cpu" "power10")) "DU_any_power10,STU_power10") +(define_insn_reservation "power10-fused-branch" 3 + (and (eq_attr "type" "fused_mtbc") + (eq_attr "cpu" "power10")) + "DU_even_power10,STU_power10") + ; Crypto (define_insn_reservation "power10-crypto" 4 -- cgit v1.1 From f700e4b0ee3ef53b48975cf89be26b9177e3a3f3 Mon Sep 17 00:00:00 2001 From: Xionghu Luo Date: Tue, 8 Jun 2021 21:48:12 -0500 Subject: rs6000: Support doubleword swaps removal in rot64 load store [PR100085] On P8LE, extra rot64+rot64 load or store instructions are generated in float128 to vector __int128 conversion. This patch teaches pass swaps to also handle such pattens to remove extra swap instructions. (insn 7 6 8 2 (set (subreg:V1TI (reg:KF 123) 0) (rotate:V1TI (mem/u/c:V1TI (reg/f:DI 121) [0 S16 A128]) (const_int 64 [0x40]))) {*vsx_le_permute_v1ti}) (insn 8 7 9 2 (set (subreg:V1TI (reg:KF 122) 0) (rotate:V1TI (subreg:V1TI (reg:KF 123) 0) (const_int 64 [0x40]))) {*vsx_le_permute_v1ti}) => (insn 22 6 23 2 (set (subreg:V1TI (reg:KF 123) 0) (mem/u/c:V1TI (and:DI (reg/f:DI 121) (const_int -16 [0xfffffffffffffff0])) [0 S16 A128]))) (insn 23 22 25 2 (set (subreg:V1TI (reg:KF 122) 0) (subreg:V1TI (reg:KF 123) 0))) gcc/ChangeLog: 2021-06-09 Xionghu Luo * config/rs6000/rs6000-p8swap.c (pattern_is_rotate64): New. (insn_is_load_p): Use pattern_is_rotate64. (insn_is_swap_p): Likewise. (quad_aligned_load_p): Likewise. (const_load_sequence_p): Likewise. (replace_swapped_aligned_load): Likewise. (recombine_lvx_pattern): Likewise. (recombine_stvx_pattern): Likewise. gcc/testsuite/ChangeLog: 2021-06-09 Xionghu Luo * gcc.target/powerpc/float128-call.c: Adjust. * gcc.target/powerpc/pr100085.c: New test. 
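A minimal source sketch of the scenario being optimized, assuming the usual reduction of PR100085 (the function name and exact form are illustrative, not the committed testcase):

    #include <string.h>
    #include <altivec.h>

    /* Bit-copy a __float128 into a vector __int128; on power8 LE this
       conversion previously went through a pair of rotate-by-64
       (doubleword swap) moves that the swaps pass can now recognize
       and remove.  */
    vector unsigned __int128
    to_v1ti (__float128 f)
    {
      vector unsigned __int128 v;
      memcpy (&v, &f, sizeof v);
      return v;
    }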
--- gcc/config/rs6000/rs6000-p8swap.c | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-p8swap.c b/gcc/config/rs6000/rs6000-p8swap.c index ad2b302..21cbcb2 100644 --- a/gcc/config/rs6000/rs6000-p8swap.c +++ b/gcc/config/rs6000/rs6000-p8swap.c @@ -250,6 +250,21 @@ union_uses (swap_web_entry *insn_entry, rtx insn, df_ref def) } } +/* Return 1 iff PAT (a SINGLE_SET) is a rotate 64 bit expression; else return + 0. */ + +static bool +pattern_is_rotate64 (rtx pat) +{ + rtx rot = SET_SRC (pat); + + if (GET_CODE (rot) == ROTATE && CONST_INT_P (XEXP (rot, 1)) + && INTVAL (XEXP (rot, 1)) == 64) + return true; + + return false; +} + /* Return 1 iff INSN is a load insn, including permuting loads that represent an lvxd2x instruction; else return 0. */ static unsigned int @@ -266,6 +281,9 @@ insn_is_load_p (rtx insn) && MEM_P (XEXP (SET_SRC (body), 0))) return 1; + if (pattern_is_rotate64 (body) && MEM_P (XEXP (SET_SRC (body), 0))) + return 1; + return 0; } @@ -305,6 +323,8 @@ insn_is_swap_p (rtx insn) if (GET_CODE (body) != SET) return 0; rtx rhs = SET_SRC (body); + if (pattern_is_rotate64 (body)) + return 1; if (GET_CODE (rhs) != VEC_SELECT) return 0; rtx parallel = XEXP (rhs, 1); @@ -392,7 +412,8 @@ quad_aligned_load_p (swap_web_entry *insn_entry, rtx_insn *insn) false. */ rtx body = PATTERN (def_insn); if (GET_CODE (body) != SET - || GET_CODE (SET_SRC (body)) != VEC_SELECT + || !(GET_CODE (SET_SRC (body)) == VEC_SELECT + || pattern_is_rotate64 (body)) || !MEM_P (XEXP (SET_SRC (body), 0))) return false; @@ -531,7 +552,8 @@ const_load_sequence_p (swap_web_entry *insn_entry, rtx insn) false. */ rtx body = PATTERN (def_insn); if (GET_CODE (body) != SET - || GET_CODE (SET_SRC (body)) != VEC_SELECT + || !(GET_CODE (SET_SRC (body)) == VEC_SELECT + || pattern_is_rotate64 (body)) || !MEM_P (XEXP (SET_SRC (body), 0))) return false; @@ -1732,7 +1754,8 @@ replace_swapped_aligned_load (swap_web_entry *insn_entry, rtx swap_insn) swap (indicated by code VEC_SELECT). 
*/ rtx body = PATTERN (def_insn); gcc_assert ((GET_CODE (body) == SET) - && (GET_CODE (SET_SRC (body)) == VEC_SELECT) + && (GET_CODE (SET_SRC (body)) == VEC_SELECT + || pattern_is_rotate64 (body)) && MEM_P (XEXP (SET_SRC (body), 0))); rtx src_exp = XEXP (SET_SRC (body), 0); @@ -2150,7 +2173,8 @@ recombine_lvx_pattern (rtx_insn *insn, del_info *to_delete) { rtx body = PATTERN (insn); gcc_assert (GET_CODE (body) == SET - && GET_CODE (SET_SRC (body)) == VEC_SELECT + && (GET_CODE (SET_SRC (body)) == VEC_SELECT + || pattern_is_rotate64 (body)) && MEM_P (XEXP (SET_SRC (body), 0))); rtx mem = XEXP (SET_SRC (body), 0); @@ -2227,7 +2251,8 @@ recombine_stvx_pattern (rtx_insn *insn, del_info *to_delete) rtx body = PATTERN (insn); gcc_assert (GET_CODE (body) == SET && MEM_P (SET_DEST (body)) - && GET_CODE (SET_SRC (body)) == VEC_SELECT); + && (GET_CODE (SET_SRC (body)) == VEC_SELECT + || pattern_is_rotate64 (body))); rtx mem = SET_DEST (body); rtx base_reg = XEXP (mem, 0); -- cgit v1.1 From 267dbd42f42c52a515f49c0875d296a9cf5988fe Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 9 Jun 2021 09:46:00 +0200 Subject: i386: Do not emit segment overrides for %p and %P [PR100936] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using %p to move the address of a symbol using LEA: asm ("lea %p1, %0" : "=r"(addr) : "m"(var)); emits assembler warning when VAR is declared in a non-generic address space: Warning: segment override on `lea' is ineffectual The problem is with %p operand modifier, which should emit raw symbol name: p -- print raw symbol name. Similar problem exists with %P modifier, trying to CALL or JMP to an overridden symbol,e.g: call %gs:zzz jmp %gs:zzz emits assembler warning: Warning: skipping prefixes on `call' Warning: skipping prefixes on `jmp' Ensure that %p and %P never emit segment overrides. 2021-06-08 Uroš Bizjak gcc/ PR target/100936 * config/i386/i386.c (print_operand_address_as): Rename "no_rip" argument to "raw". Do not emit segment overrides when "raw" is true. gcc/testsuite/ PR target/100936 * gcc.target/i386/pr100936.c: New test. --- gcc/config/i386/i386.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index b0d19a6..05b8dc8 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -13531,7 +13531,7 @@ ix86_print_operand_punct_valid_p (unsigned char code) static void ix86_print_operand_address_as (FILE *file, rtx addr, - addr_space_t as, bool no_rip) + addr_space_t as, bool raw) { struct ix86_address parts; rtx base, index, disp; @@ -13570,7 +13570,7 @@ ix86_print_operand_address_as (FILE *file, rtx addr, else gcc_assert (ADDR_SPACE_GENERIC_P (parts.seg)); - if (!ADDR_SPACE_GENERIC_P (as)) + if (!ADDR_SPACE_GENERIC_P (as) && !raw) { if (ASSEMBLER_DIALECT == ASM_ATT) putc ('%', file); @@ -13589,7 +13589,7 @@ ix86_print_operand_address_as (FILE *file, rtx addr, } /* Use one byte shorter RIP relative addressing for 64bit mode. */ - if (TARGET_64BIT && !base && !index && !no_rip) + if (TARGET_64BIT && !base && !index && !raw) { rtx symbol = disp; -- cgit v1.1 From c0ba7a8af5366c37241f20e8be41e362f7260389 Mon Sep 17 00:00:00 2001 From: Claudiu Zissulescu Date: Wed, 9 Jun 2021 12:12:57 +0300 Subject: arc: Update 64bit move split patterns. ARCv2HS can use a limited number of instructions to implement 64bit moves. The VADD2 is used as a 64bit move, the LDD/STD are 64 bit loads and stores. 
All those instructions are not baseline, hence we need to provide alternatives when they are not available or cannot be generate due to instruction restriction. This patch is cleaning up those move patterns, and updates splits instruction lengths. gcc/ 2021-06-09 Claudiu Zissulescu * config/arc/arc-protos.h (arc_split_move_p): New prototype. * config/arc/arc.c (arc_split_move_p): New function. (arc_split_move): Clean up. * config/arc/arc.md (movdi_insn): Clean up, use arc_split_move_p. (movdf_insn): Likewise. * config/arc/simdext.md (mov_insn): Likewise. Signed-off-by: Claudiu Zissulescu --- gcc/config/arc/arc-protos.h | 1 + gcc/config/arc/arc.c | 44 ++++++++++++---------- gcc/config/arc/arc.md | 91 +++++++++------------------------------------ gcc/config/arc/simdext.md | 38 ++++--------------- 4 files changed, 52 insertions(+), 122 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arc/arc-protos.h b/gcc/config/arc/arc-protos.h index 1f56a0d..62d7e45 100644 --- a/gcc/config/arc/arc-protos.h +++ b/gcc/config/arc/arc-protos.h @@ -50,6 +50,7 @@ extern void arc_split_ior (rtx *); extern bool arc_check_mov_const (HOST_WIDE_INT ); extern bool arc_split_mov_const (rtx *); extern bool arc_can_use_return_insn (void); +extern bool arc_split_move_p (rtx *); #endif /* RTX_CODE */ extern bool arc_ccfsm_branch_deleted_p (void); diff --git a/gcc/config/arc/arc.c b/gcc/config/arc/arc.c index 0d34c96..69f6ae4 100644 --- a/gcc/config/arc/arc.c +++ b/gcc/config/arc/arc.c @@ -10108,6 +10108,31 @@ arc_process_double_reg_moves (rtx *operands) return true; } + +/* Check if we need to split a 64bit move. We do not need to split it if we can + use vadd2 or ldd/std instructions. */ + +bool +arc_split_move_p (rtx *operands) +{ + machine_mode mode = GET_MODE (operands[0]); + + if (TARGET_LL64 + && ((memory_operand (operands[0], mode) + && (even_register_operand (operands[1], mode) + || satisfies_constraint_Cm3 (operands[1]))) + || (memory_operand (operands[1], mode) + && even_register_operand (operands[0], mode)))) + return false; + + if (TARGET_PLUS_QMACW + && even_register_operand (operands[0], mode) + && even_register_operand (operands[1], mode)) + return false; + + return true; +} + /* operands 0..1 are the operands of a 64 bit move instruction. split it into two moves with operands 2/3 and 4/5. 
*/ @@ -10125,25 +10150,6 @@ arc_split_move (rtx *operands) return; } - if (TARGET_LL64 - && ((memory_operand (operands[0], mode) - && (even_register_operand (operands[1], mode) - || satisfies_constraint_Cm3 (operands[1]))) - || (memory_operand (operands[1], mode) - && even_register_operand (operands[0], mode)))) - { - emit_move_insn (operands[0], operands[1]); - return; - } - - if (TARGET_PLUS_QMACW - && even_register_operand (operands[0], mode) - && even_register_operand (operands[1], mode)) - { - emit_move_insn (operands[0], operands[1]); - return; - } - if (TARGET_PLUS_QMACW && GET_CODE (operands[1]) == CONST_VECTOR) { diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md index de61b2b..6f13b3a 100644 --- a/gcc/config/arc/arc.md +++ b/gcc/config/arc/arc.md @@ -1330,47 +1330,20 @@ core_3, archs4x, archs4xd, archs4xd_slow" "register_operand (operands[0], DImode) || register_operand (operands[1], DImode) || (satisfies_constraint_Cm3 (operands[1]) - && memory_operand (operands[0], DImode))" - "* -{ - switch (which_alternative) - { - default: - return \"#\"; - - case 0: - if (TARGET_PLUS_QMACW - && even_register_operand (operands[0], DImode) - && even_register_operand (operands[1], DImode)) - return \"vadd2%?\\t%0,%1,0\"; - return \"#\"; - - case 2: - if (TARGET_LL64 - && memory_operand (operands[1], DImode) - && even_register_operand (operands[0], DImode)) - return \"ldd%U1%V1 %0,%1%&\"; - return \"#\"; - - case 3: - if (TARGET_LL64 - && memory_operand (operands[0], DImode) - && (even_register_operand (operands[1], DImode) - || satisfies_constraint_Cm3 (operands[1]))) - return \"std%U0%V0 %1,%0\"; - return \"#\"; - } -}" - "&& reload_completed" + && memory_operand (operands[0], DImode))" + "@ + vadd2\\t%0,%1,0 + # + ldd%U1%V1\\t%0,%1 + std%U0%V0\\t%1,%0" + "&& reload_completed && arc_split_move_p (operands)" [(const_int 0)] { arc_split_move (operands); DONE; } [(set_attr "type" "move,move,load,store") - ;; ??? The ld/st values could be 4 if it's [reg,bignum]. - (set_attr "length" "8,16,*,*")]) - + (set_attr "length" "8,16,16,16")]) ;; Floating point move insns. 
@@ -1409,50 +1382,22 @@ core_3, archs4x, archs4xd, archs4xd_slow" (define_insn_and_split "*movdf_insn" [(set (match_operand:DF 0 "move_dest_operand" "=D,r,r,r,r,m") (match_operand:DF 1 "move_double_src_operand" "r,D,r,E,m,r"))] - "register_operand (operands[0], DFmode) - || register_operand (operands[1], DFmode)" - "* -{ - switch (which_alternative) - { - default: - return \"#\"; - - case 2: - if (TARGET_PLUS_QMACW - && even_register_operand (operands[0], DFmode) - && even_register_operand (operands[1], DFmode)) - return \"vadd2%?\\t%0,%1,0\"; - return \"#\"; - - case 4: - if (TARGET_LL64 - && ((even_register_operand (operands[0], DFmode) - && memory_operand (operands[1], DFmode)) - || (memory_operand (operands[0], DFmode) - && even_register_operand (operands[1], DFmode)))) - return \"ldd%U1%V1 %0,%1%&\"; - return \"#\"; - - case 5: - if (TARGET_LL64 - && ((even_register_operand (operands[0], DFmode) - && memory_operand (operands[1], DFmode)) - || (memory_operand (operands[0], DFmode) - && even_register_operand (operands[1], DFmode)))) - return \"std%U0%V0 %1,%0\"; - return \"#\"; - } -}" - "reload_completed" + "(register_operand (operands[0], DFmode) + || register_operand (operands[1], DFmode))" + "@ + # + # + vadd2\\t%0,%1,0 + # + ldd%U1%V1\\t%0,%1 + std%U0%V0\\t%1,%0" + "&& reload_completed && arc_split_move_p (operands)" [(const_int 0)] { arc_split_move (operands); DONE; } [(set_attr "type" "move,move,move,move,load,store") - (set_attr "predicable" "no,no,no,yes,no,no") - ;; ??? The ld/st values could be 16 if it's [reg,bignum]. (set_attr "length" "4,16,8,16,16,16")]) (define_insn_and_split "*movdf_insn_nolrsr" diff --git a/gcc/config/arc/simdext.md b/gcc/config/arc/simdext.md index dd63f93..303f52c 100644 --- a/gcc/config/arc/simdext.md +++ b/gcc/config/arc/simdext.md @@ -1472,41 +1472,19 @@ (match_operand:VWH 1 "general_operand" "i,r,m,r"))] "(register_operand (operands[0], mode) || register_operand (operands[1], mode))" - "* -{ - switch (which_alternative) - { - default: - return \"#\"; - - case 1: - if (TARGET_PLUS_QMACW - && even_register_operand (operands[0], mode) - && even_register_operand (operands[1], mode)) - return \"vadd2%?\\t%0,%1,0\"; - return \"#\"; - - case 2: - if (TARGET_LL64) - return \"ldd%U1%V1\\t%0,%1\"; - return \"#\"; - - case 3: - if (TARGET_LL64) - return \"std%U0%V0\\t%1,%0\"; - return \"#\"; - } -}" - "reload_completed" + "@ + # + vadd2\\t%0,%1,0 + ldd%U1%V1\\t%0,%1 + std%U0%V0\\t%1,%0" + "&& reload_completed && arc_split_move_p (operands)" [(const_int 0)] { arc_split_move (operands); DONE; } - [(set_attr "type" "move,multi,load,store") - (set_attr "predicable" "no,no,no,no") - (set_attr "iscompact" "false,false,false,false") - ]) + [(set_attr "type" "move,move,load,store") + (set_attr "length" "16,8,16,16")]) (define_expand "movmisalign" [(set (match_operand:VWH 0 "general_operand" "") -- cgit v1.1 From dd4778a59b4693777c732075021375e19eee6a76 Mon Sep 17 00:00:00 2001 From: Claudiu Zissulescu Date: Wed, 9 Jun 2021 12:12:57 +0300 Subject: arc: Fix (u)maddhisi patterns Rework the (u)maddhisi4 patterns and use VMAC2H(U) instruction instead of the 64bit MAC(U) instruction. This fixes the next execute.exp failures: arith-rand-ll.c -O2 execution test arith-rand-ll.c -O3 execution test pr78726.c -O2 execution test pr78726.c -O3 execution test gcc/ 2021-06-09 Claudiu Zissulescu * config/arc/arc.md (maddhisi4): Use VMAC2H instruction. (machi): New pattern. (umaddhisi4): Use VMAC2HU instruction. (umachi): New pattern. 
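A small example of the scalar shape the maddhisi4 expander covers (an assumed illustration, not one of the failing tests listed above):

    /* 16-bit x 16-bit multiply accumulated into a 32-bit value; with
       this change it goes through VMAC2H(U) and a 32-bit accumulator
       instead of the 64-bit MAC(U) path.  */
    int
    mac (int acc, short a, short b)
    {
      return acc + a * b;
    }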
Signed-off-by: Claudiu Zissulescu --- gcc/config/arc/arc.md | 66 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 25 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md index 6f13b3a..aed0b40 100644 --- a/gcc/config/arc/arc.md +++ b/gcc/config/arc/arc.md @@ -6025,48 +6025,64 @@ core_3, archs4x, archs4xd, archs4xd_slow" ;; MAC and DMPY instructions -; Use MAC instruction to emulate 16bit mac. +; Use VMAC2H(U) instruction to emulate scalar 16bit mac. (define_expand "maddhisi4" [(match_operand:SI 0 "register_operand" "") (match_operand:HI 1 "register_operand" "") (match_operand:HI 2 "extend_operand" "") (match_operand:SI 3 "register_operand" "")] - "TARGET_PLUS_DMPY" + "TARGET_PLUS_MACD" "{ - rtx acc_reg = gen_rtx_REG (DImode, ACC_REG_FIRST); - rtx tmp1 = gen_reg_rtx (SImode); - rtx tmp2 = gen_reg_rtx (SImode); - rtx accl = gen_lowpart (SImode, acc_reg); - - emit_move_insn (accl, operands[3]); - emit_insn (gen_rtx_SET (tmp1, gen_rtx_SIGN_EXTEND (SImode, operands[1]))); - emit_insn (gen_rtx_SET (tmp2, gen_rtx_SIGN_EXTEND (SImode, operands[2]))); - emit_insn (gen_mac (tmp1, tmp2)); - emit_move_insn (operands[0], accl); + rtx acc_reg = gen_rtx_REG (SImode, ACC_REG_FIRST); + + emit_move_insn (acc_reg, operands[3]); + emit_insn (gen_machi (operands[1], operands[2])); + emit_move_insn (operands[0], acc_reg); DONE; }") -; The same for the unsigned variant, but using MACU instruction. +(define_insn "machi" + [(set (reg:SI ARCV2_ACC) + (plus:SI + (mult:SI (sign_extend:SI (match_operand:HI 0 "register_operand" "%r")) + (sign_extend:SI (match_operand:HI 1 "register_operand" "r"))) + (reg:SI ARCV2_ACC)))] + "TARGET_PLUS_MACD" + "vmac2h\\t0,%0,%1" + [(set_attr "length" "4") + (set_attr "type" "multi") + (set_attr "predicable" "no") + (set_attr "cond" "nocond")]) + +; The same for the unsigned variant, but using VMAC2HU instruction. 
(define_expand "umaddhisi4" [(match_operand:SI 0 "register_operand" "") (match_operand:HI 1 "register_operand" "") - (match_operand:HI 2 "extend_operand" "") + (match_operand:HI 2 "register_operand" "") (match_operand:SI 3 "register_operand" "")] - "TARGET_PLUS_DMPY" + "TARGET_PLUS_MACD" "{ - rtx acc_reg = gen_rtx_REG (DImode, ACC_REG_FIRST); - rtx tmp1 = gen_reg_rtx (SImode); - rtx tmp2 = gen_reg_rtx (SImode); - rtx accl = gen_lowpart (SImode, acc_reg); - - emit_move_insn (accl, operands[3]); - emit_insn (gen_rtx_SET (tmp1, gen_rtx_ZERO_EXTEND (SImode, operands[1]))); - emit_insn (gen_rtx_SET (tmp2, gen_rtx_ZERO_EXTEND (SImode, operands[2]))); - emit_insn (gen_macu (tmp1, tmp2)); - emit_move_insn (operands[0], accl); + rtx acc_reg = gen_rtx_REG (SImode, ACC_REG_FIRST); + + emit_move_insn (acc_reg, operands[3]); + emit_insn (gen_umachi (operands[1], operands[2])); + emit_move_insn (operands[0], acc_reg); DONE; }") +(define_insn "umachi" + [(set (reg:SI ARCV2_ACC) + (plus:SI + (mult:SI (zero_extend:SI (match_operand:HI 0 "register_operand" "%r")) + (zero_extend:SI (match_operand:HI 1 "register_operand" "r"))) + (reg:SI ARCV2_ACC)))] + "TARGET_PLUS_MACD" + "vmac2hu\\t0,%0,%1" + [(set_attr "length" "4") + (set_attr "type" "multi") + (set_attr "predicable" "no") + (set_attr "cond" "nocond")]) + (define_expand "maddsidi4" [(match_operand:DI 0 "register_operand" "") (match_operand:SI 1 "register_operand" "") -- cgit v1.1 From 174e75a210753b68de0f2c398a13ace0f512e35b Mon Sep 17 00:00:00 2001 From: Claudiu Zissulescu Date: Wed, 9 Jun 2021 12:12:57 +0300 Subject: arc: Update doloop_end patterns ARC processor can use LP instruction to implement zero overlay loops. The current inplementation doesn't handle the unlikely situation when the loop iterator is located in memory. Refurbish the loop_end insn pattern into a define_insn_and_split pattern. gcc/ 2021-07-09 Claudiu Zissulescu * config/arc/arc.md (loop_end): Change it to define_insn_and_split. Signed-off-by: Claudiu Zissulescu --- gcc/config/arc/arc.md | 46 ++++++++++++++++++++-------------------------- 1 file changed, 20 insertions(+), 26 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md index aed0b40..90ba85e 100644 --- a/gcc/config/arc/arc.md +++ b/gcc/config/arc/arc.md @@ -4962,7 +4962,7 @@ core_3, archs4x, archs4xd, archs4xd_slow" (define_expand "doloop_end" [(parallel [(set (pc) (if_then_else - (ne (match_operand 0 "" "") + (ne (match_operand 0 "nonimmediate_operand") (const_int 1)) (label_ref (match_operand 1 "" "")) (pc))) @@ -4988,44 +4988,38 @@ core_3, archs4x, archs4xd, archs4xd_slow" ;; if by any chance the lp_count is not used, then use an 'r' ;; register, instead of going to memory. -(define_insn "loop_end" - [(set (pc) - (if_then_else (ne (match_operand:SI 2 "nonimmediate_operand" "0,m") - (const_int 1)) - (label_ref (match_operand 1 "" "")) - (pc))) - (set (match_operand:SI 0 "nonimmediate_operand" "=r,m") - (plus (match_dup 2) (const_int -1))) - (unspec [(const_int 0)] UNSPEC_ARC_LP) - (clobber (match_scratch:SI 3 "=X,&r"))] - "" - "; ZOL_END, begins @%l1" - [(set_attr "length" "0") - (set_attr "predicable" "no") - (set_attr "type" "loop_end")]) - ;; split pattern for the very slim chance when the loop register is ;; memory. 
-(define_split +(define_insn_and_split "loop_end" [(set (pc) - (if_then_else (ne (match_operand:SI 0 "memory_operand") + (if_then_else (ne (match_operand:SI 0 "nonimmediate_operand" "+r,!m") (const_int 1)) - (label_ref (match_operand 1 "")) + (label_ref (match_operand 1 "" "")) (pc))) (set (match_dup 0) (plus (match_dup 0) (const_int -1))) (unspec [(const_int 0)] UNSPEC_ARC_LP) - (clobber (match_scratch:SI 2))] - "memory_operand (operands[0], SImode)" + (clobber (match_scratch:SI 2 "=X,&r"))] + "" + "@ + ; ZOL_END, begins @%l1 + #" + "reload_completed && memory_operand (operands[0], Pmode)" [(set (match_dup 2) (match_dup 0)) - (set (match_dup 2) (plus:SI (match_dup 2) (const_int -1))) + (parallel + [(set (reg:CC_ZN CC_REG) + (compare:CC_ZN (plus:SI (match_dup 2) (const_int -1)) + (const_int 0))) + (set (match_dup 2) (plus:SI (match_dup 2) (const_int -1)))]) (set (match_dup 0) (match_dup 2)) - (set (reg:CC CC_REG) (compare:CC (match_dup 2) (const_int 0))) (set (pc) - (if_then_else (ne (reg:CC CC_REG) + (if_then_else (ne (reg:CC_ZN CC_REG) (const_int 0)) (label_ref (match_dup 1)) (pc)))] - "") + "" + [(set_attr "length" "0,24") + (set_attr "predicable" "no") + (set_attr "type" "loop_end")]) (define_insn "loop_fail" [(set (reg:SI LP_COUNT) -- cgit v1.1 From 880198da50e1beac9b7cf8ff1bff570359c5f2a0 Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Wed, 9 Jun 2021 16:00:01 +0000 Subject: arm: Auto-vectorization for MVE and Neon: vhadd/vrhadd This patch adds support for auto-vectorization of average value computation using vhadd or vrhadd, for both MVE and Neon. The patch adds the needed [u]avg3_[floor|ceil] patterns to vec-common.md, I'm not sure how to factorize them without introducing an unspec iterator? It also adds tests for 'floor' and for 'ceil', each for MVE and Neon. 2021-06-09 Christophe Lyon gcc/ * config/arm/mve.md (mve_vhaddq_): Prefix with '@'. (@mve_vrhaddq_hadd): Likewise. * config/arm/vec-common.md (avg3_floor, uavg3_floor) (avg3_ceil", uavg3_ceil): New patterns. gcc/testsuite/ * gcc.target/arm/simd/mve-vhadd-1.c: New test. * gcc.target/arm/simd/mve-vhadd-2.c: New test. * gcc.target/arm/simd/neon-vhadd-1.c: New test. * gcc.target/arm/simd/neon-vhadd-2.c: New test. --- gcc/config/arm/mve.md | 4 +-- gcc/config/arm/neon.md | 2 +- gcc/config/arm/vec-common.md | 60 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index 0bfa6a9..04aa612 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -1030,7 +1030,7 @@ ;; ;; [vhaddq_s, vhaddq_u]) ;; -(define_insn "mve_vhaddq_" +(define_insn "@mve_vhaddq_" [ (set (match_operand:MVE_2 0 "s_register_operand" "=w") (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "w") @@ -1652,7 +1652,7 @@ ;; ;; [vrhaddq_s, vrhaddq_u]) ;; -(define_insn "mve_vrhaddq_" +(define_insn "@mve_vrhaddq_" [ (set (match_operand:MVE_2 0 "s_register_operand" "=w") (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "w") diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 077c62f..18571d8 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -1488,7 +1488,7 @@ ; vhadd and vrhadd. 
-(define_insn "neon_vhadd" +(define_insn "@neon_vhadd" [(set (match_operand:VDQIW 0 "s_register_operand" "=w") (unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w") (match_operand:VDQIW 2 "s_register_operand" "w")] diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md index 80b2732..2779c1a 100644 --- a/gcc/config/arm/vec-common.md +++ b/gcc/config/arm/vec-common.md @@ -565,3 +565,63 @@ DONE; }) + +(define_expand "avg3_floor" + [(match_operand:MVE_2 0 "s_register_operand") + (match_operand:MVE_2 1 "s_register_operand") + (match_operand:MVE_2 2 "s_register_operand")] + "ARM_HAVE__ARITH" +{ + if (TARGET_HAVE_MVE) + emit_insn (gen_mve_vhaddq (VHADDQ_S, mode, + operands[0], operands[1], operands[2])); + else + emit_insn (gen_neon_vhadd (UNSPEC_VHADD_S, UNSPEC_VHADD_S, mode, + operands[0], operands[1], operands[2])); + DONE; +}) + +(define_expand "uavg3_floor" + [(match_operand:MVE_2 0 "s_register_operand") + (match_operand:MVE_2 1 "s_register_operand") + (match_operand:MVE_2 2 "s_register_operand")] + "ARM_HAVE__ARITH" +{ + if (TARGET_HAVE_MVE) + emit_insn (gen_mve_vhaddq (VHADDQ_U, mode, + operands[0], operands[1], operands[2])); + else + emit_insn (gen_neon_vhadd (UNSPEC_VHADD_U, UNSPEC_VHADD_U, mode, + operands[0], operands[1], operands[2])); + DONE; +}) + +(define_expand "avg3_ceil" + [(match_operand:MVE_2 0 "s_register_operand") + (match_operand:MVE_2 1 "s_register_operand") + (match_operand:MVE_2 2 "s_register_operand")] + "ARM_HAVE__ARITH" +{ + if (TARGET_HAVE_MVE) + emit_insn (gen_mve_vrhaddq (VRHADDQ_S, mode, + operands[0], operands[1], operands[2])); + else + emit_insn (gen_neon_vhadd (UNSPEC_VRHADD_S, UNSPEC_VRHADD_S, mode, + operands[0], operands[1], operands[2])); + DONE; +}) + +(define_expand "uavg3_ceil" + [(match_operand:MVE_2 0 "s_register_operand") + (match_operand:MVE_2 1 "s_register_operand") + (match_operand:MVE_2 2 "s_register_operand")] + "ARM_HAVE__ARITH" +{ + if (TARGET_HAVE_MVE) + emit_insn (gen_mve_vrhaddq (VRHADDQ_U, mode, + operands[0], operands[1], operands[2])); + else + emit_insn (gen_neon_vhadd (UNSPEC_VRHADD_U, UNSPEC_VRHADD_U, mode, + operands[0], operands[1], operands[2])); + DONE; +}) -- cgit v1.1 From 7969d9c83d061e57ea80795768469cffb1a859f8 Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Wed, 9 Jun 2021 16:07:43 +0000 Subject: arm: Auto-vectorization for MVE: vclz This patch adds support for auto-vectorization of clz for MVE. It does so by removing the unspec from mve_vclzq_ and uses 'clz' instead. It moves to neon_vclz expander from neon.md to vec-common.md and renames it into the standard name clz2. 2021-06-09 Christophe Lyon gcc/ * config/arm/iterators.md (): Remove VCLZQ_U, VCLZQ_S. (VCLZQ): Remove. * config/arm/mve.md (mve_vclzq_): Add '@' prefix, remove iterator. (mve_vclzq_u): New. * config/arm/neon.md (clz2): Rename to neon_vclz. 
(neon_vclz" +(define_insn "@mve_vclzq_s" [ (set (match_operand:MVE_2 0 "s_register_operand" "=w") - (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "w")] - VCLZQ)) + (clz:MVE_2 (match_operand:MVE_2 1 "s_register_operand" "w"))) ] "TARGET_HAVE_MVE" "vclz.i%# %q0, %q1" [(set_attr "type" "mve_move") ]) +(define_expand "mve_vclzq_u" + [ + (set (match_operand:MVE_2 0 "s_register_operand") + (clz:MVE_2 (match_operand:MVE_2 1 "s_register_operand"))) + ] + "TARGET_HAVE_MVE" +) ;; ;; [vclsq_s]) diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 18571d8..0fdffaf 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -3018,7 +3018,7 @@ [(set_attr "type" "neon_cls")] ) -(define_insn "clz2" +(define_insn "neon_vclz" [(set (match_operand:VDQIW 0 "s_register_operand" "=w") (clz:VDQIW (match_operand:VDQIW 1 "s_register_operand" "w")))] "TARGET_NEON" @@ -3026,15 +3026,6 @@ [(set_attr "type" "neon_cnt")] ) -(define_expand "neon_vclz" - [(match_operand:VDQIW 0 "s_register_operand") - (match_operand:VDQIW 1 "s_register_operand")] - "TARGET_NEON" -{ - emit_insn (gen_clz2 (operands[0], operands[1])); - DONE; -}) - (define_insn "popcount2" [(set (match_operand:VE 0 "s_register_operand" "=w") (popcount:VE (match_operand:VE 1 "s_register_operand" "w")))] diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md index ed1bc29..ad1c6ed 100644 --- a/gcc/config/arm/unspecs.md +++ b/gcc/config/arm/unspecs.md @@ -556,8 +556,6 @@ VQABSQ_S VDUPQ_N_U VDUPQ_N_S - VCLZQ_U - VCLZQ_S VCLSQ_S VADDVQ_S VADDVQ_U diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md index 2779c1a..430a92c 100644 --- a/gcc/config/arm/vec-common.md +++ b/gcc/config/arm/vec-common.md @@ -625,3 +625,10 @@ operands[0], operands[1], operands[2])); DONE; }) + +(define_expand "clz2" + [(set (match_operand:VDQIW 0 "s_register_operand") + (clz:VDQIW (match_operand:VDQIW 1 "s_register_operand")))] + "ARM_HAVE__ARITH + && !TARGET_REALLY_IWMMXT" +) -- cgit v1.1 From 2142e34340523e1553c0dc131f657893f307e291 Mon Sep 17 00:00:00 2001 From: Carl Love Date: Mon, 7 Jun 2021 16:06:04 -0500 Subject: rs6000, Fix arguments in altivec_vrlwmi and altivec_rlwdi builtins 2021-06-07 Carl Love gcc/ * config/rs6000/altivec.md (altivec_vrlmi): Fix bug in argument generation. gcc/testsuite/ * gcc.target/powerpc/check-builtin-vec_rlnm-runnable.c: New runnable test case. * gcc.target/powerpc/vec-rlmi-rlnm.c: Update scan assembler times for xxlor instruction. 
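For reference, a hedged sketch of the operand roles the fix restores, assuming the usual vec_rlmi argument order (the wrapper below is illustrative and is not part of the testsuite changes):

    #include <altivec.h>

    vector unsigned int
    rlmi (vector unsigned int a, vector unsigned int b,
          vector unsigned int c)
    {
      /* a is the rotated source (%1 in the corrected "vrlmi %0,%1,%3"),
         b supplies the insert target and is tied to the output (the "0"
         constraint now on operand 2), and c carries the mask/shift
         control field.  */
      return vec_rlmi (a, b, c);
    }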
--- gcc/config/rs6000/altivec.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index 1351daf..97dc9d2 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -1987,12 +1987,12 @@ (define_insn "altivec_vrlmi" [(set (match_operand:VIlong 0 "register_operand" "=v") - (unspec:VIlong [(match_operand:VIlong 1 "register_operand" "0") - (match_operand:VIlong 2 "register_operand" "v") + (unspec:VIlong [(match_operand:VIlong 1 "register_operand" "v") + (match_operand:VIlong 2 "register_operand" "0") (match_operand:VIlong 3 "register_operand" "v")] UNSPEC_VRLMI))] "TARGET_P9_VECTOR" - "vrlmi %0,%2,%3" + "vrlmi %0,%1,%3" [(set_attr "type" "veclogical")]) (define_insn "altivec_vrlnm" -- cgit v1.1 From f03122f2a7626772fe13ab77f677141377104502 Mon Sep 17 00:00:00 2001 From: Carl Love Date: Sat, 6 Jun 2020 16:56:08 -0500 Subject: RS6000 add 128-bit Integer Operations part 1 2021-06-07 Carl Love gcc/ChangeLog * config/rs6000/altivec.h (vec_dive, vec_mod): Add define for new builtins. * config/rs6000/altivec.md (UNSPEC_VMULEUD, UNSPEC_VMULESD, UNSPEC_VMULOUD, UNSPEC_VMULOSD): New unspecs. (altivec_eqv1ti, altivec_gtv1ti, altivec_gtuv1ti, altivec_vmuleud, altivec_vmuloud, altivec_vmulesd, altivec_vmulosd, altivec_vrlq, altivec_vrlqmi, altivec_vrlqmi_inst, altivec_vrlqnm, altivec_vrlqnm_inst, altivec_vslq, altivec_vsrq, altivec_vsraq, altivec_vcmpequt_p, altivec_vcmpgtst_p, altivec_vcmpgtut_p): New define_insn. (vec_widen_umult_even_v2di, vec_widen_smult_even_v2di, vec_widen_umult_odd_v2di, vec_widen_smult_odd_v2di, altivec_vrlqmi, altivec_vrlqnm): New define_expands. * config/rs6000/rs6000-builtin.def (VCMPEQUT_P, VCMPGTST_P, VCMPGTUT_P): Add macro expansions. (BU_P10V_AV_P): Add builtin predicate definition. (VCMPGTUT, VCMPGTST, VCMPEQUT, CMPNET, CMPGE_1TI, CMPGE_U1TI, CMPLE_1TI, CMPLE_U1TI, VNOR_V1TI_UNS, VNOR_V1TI, VCMPNET_P, VCMPAET_P, VMULEUD, VMULESD, VMULOUD, VMULOSD, VRLQ, VSLQ, VSRQ, VSRAQ, VRLQNM, DIV_V1TI, UDIV_V1TI, DIVES_V1TI, DIVEU_V1TI, MODS_V1TI, MODU_V1TI, VRLQMI): New macro expansions. (VRLQ, VSLQ, VSRQ, VSRAQ, DIVE, MOD): New overload expansions. * config/rs6000/rs6000-call.c (P10_BUILTIN_VCMPEQUT, P10V_BUILTIN_CMPGE_1TI, P10V_BUILTIN_CMPGE_U1TI, P10V_BUILTIN_VCMPGTUT, P10V_BUILTIN_VCMPGTST, P10V_BUILTIN_CMPLE_1TI, P10V_BUILTIN_VCMPLE_U1TI, P10V_BUILTIN_DIV_V1TI, P10V_BUILTIN_UDIV_V1TI, P10V_BUILTIN_VMULESD, P10V_BUILTIN_VMULEUD, P10V_BUILTIN_VMULOSD, P10V_BUILTIN_VMULOUD, P10V_BUILTIN_VNOR_V1TI, P10V_BUILTIN_VNOR_V1TI_UNS, P10V_BUILTIN_VRLQ, P10V_BUILTIN_VRLQMI, P10V_BUILTIN_VRLQNM, P10V_BUILTIN_VSLQ, P10V_BUILTIN_VSRQ, P10V_BUILTIN_VSRAQ, P10V_BUILTIN_VCMPGTUT_P, P10V_BUILTIN_VCMPGTST_P, P10V_BUILTIN_VCMPEQUT_P, P10V_BUILTIN_VCMPGTUT_P, P10V_BUILTIN_VCMPGTST_P, P10V_BUILTIN_CMPNET, P10V_BUILTIN_VCMPNET_P, P10V_BUILTIN_VCMPAET_P, P10V_BUILTIN_DIVES_V1TI, P10V_BUILTIN_MODS_V1TI, P10V_BUILTIN_MODU_V1TI): New overloaded definitions. (rs6000_gimple_fold_builtin) [P10V_BUILTIN_VCMPEQUT, P10V_BUILTIN_CMPNET, P10V_BUILTIN_CMPGE_1TI, P10V_BUILTIN_CMPGE_U1TI, P10V_BUILTIN_VCMPGTUT, P10V_BUILTIN_VCMPGTST, P10V_BUILTIN_CMPLE_1TI, P10V_BUILTIN_CMPLE_U1TI]: New case statements. (rs6000_init_builtins) [bool_V1TI_type_node, int_ftype_int_v1ti_v1ti]: New assignments. (altivec_init_builtins): New E_V1TImode case statement. 
(builtin_function_type)[P10_BUILTIN_128BIT_VMULEUD, P10_BUILTIN_128BIT_VMULOUD, P10_BUILTIN_128BIT_DIVEU_V1TI, P10_BUILTIN_128BIT_MODU_V1TI, P10_BUILTIN_CMPGE_U1TI, P10_BUILTIN_VCMPGTUT, P10_BUILTIN_VCMPEQUT]: New case statements. * config/rs6000/rs6000.c (rs6000_handle_altivec_attribute) [E_TImode, E_V1TImode]: New case statements. * config/rs6000/rs6000.h (rs6000_builtin_type_index): New enum value RS6000_BTI_bool_V1TI. * config/rs6000/vector.md (vector_gtv1ti,vector_nltv1ti, vector_gtuv1ti, vector_nltuv1ti, vector_ngtv1ti, vector_ngtuv1ti, vector_eq_v1ti_p, vector_ne_v1ti_p, vector_ae_v1ti_p, vector_gt_v1ti_p, vector_gtu_v1ti_p, vrotlv1ti3, vashlv1ti3, vlshrv1ti3, vashrv1ti3): New define_expands. * config/rs6000/vsx.md (UNSPEC_VSX_DIVSQ, UNSPEC_VSX_DIVUQ, UNSPEC_VSX_DIVESQ, UNSPEC_VSX_DIVEUQ, UNSPEC_VSX_MODSQ, UNSPEC_VSX_MODUQ): New unspecs. (mulv2di3, vsx_div_v1ti, vsx_udiv_v1ti, vsx_dives_v1ti, vsx_diveu_v1ti, vsx_mods_v1ti, vsx_modu_v1ti, xxswapd_v1ti): New define_insns. (vcmpnet): New define_expand. * doc/extend.texi: Add documentation for the new builtins vec_rl, vec_rlmi, vec_rlnm, vec_sl, vec_sr, vec_sra, vec_mule, vec_mulo, vec_div, vec_dive, vec_mod, vec_cmpeq, vec_cmpne, vec_cmpgt, vec_cmplt, vec_cmpge, vec_cmple, vec_all_eq, vec_all_ne, vec_all_gt, vec_all_lt, vec_all_ge, vec_all_le, vec_any_eq, vec_any_ne, vec_any_gt, vec_any_lt, vec_any_ge, vec_any_le. gcc/testsuite/ChangeLog * gcc.target/powerpc/int_128bit-runnable.c: New test file. --- gcc/config/rs6000/altivec.h | 3 + gcc/config/rs6000/altivec.md | 241 +++++++++++++++++++++++++++++++++++ gcc/config/rs6000/rs6000-builtin.def | 48 ++++++- gcc/config/rs6000/rs6000-call.c | 136 +++++++++++++++++++- gcc/config/rs6000/rs6000.c | 1 + gcc/config/rs6000/rs6000.h | 3 +- gcc/config/rs6000/vector.md | 186 ++++++++++++++++++++++++++- gcc/config/rs6000/vsx.md | 89 +++++++++++++ 8 files changed, 703 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/altivec.h b/gcc/config/rs6000/altivec.h index 961621a..314695a 100644 --- a/gcc/config/rs6000/altivec.h +++ b/gcc/config/rs6000/altivec.h @@ -715,6 +715,9 @@ __altivec_scalar_pred(vec_any_nle, #define vec_step(x) __builtin_vec_step (* (__typeof__ (x) *) 0) #ifdef _ARCH_PWR10 +#define vec_dive __builtin_vec_dive +#define vec_mod __builtin_vec_mod + /* May modify these macro definitions if future capabilities overload with support for different vector argument and result types. 
*/ #define vec_cntlzm(a, b) __builtin_altivec_vclzdm (a, b) diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index 97dc9d2..0fa69b7 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -39,12 +39,16 @@ UNSPEC_VMULESH UNSPEC_VMULEUW UNSPEC_VMULESW + UNSPEC_VMULEUD + UNSPEC_VMULESD UNSPEC_VMULOUB UNSPEC_VMULOSB UNSPEC_VMULOUH UNSPEC_VMULOSH UNSPEC_VMULOUW UNSPEC_VMULOSW + UNSPEC_VMULOUD + UNSPEC_VMULOSD UNSPEC_VPKPX UNSPEC_VPACK_SIGN_SIGN_SAT UNSPEC_VPACK_SIGN_UNS_SAT @@ -619,6 +623,14 @@ "vcmpbfp %0,%1,%2" [(set_attr "type" "veccmp")]) +(define_insn "altivec_eqv1ti" + [(set (match_operand:V1TI 0 "altivec_register_operand" "=v") + (eq:V1TI (match_operand:V1TI 1 "altivec_register_operand" "v") + (match_operand:V1TI 2 "altivec_register_operand" "v")))] + "TARGET_POWER10" + "vcmpequq %0,%1,%2" + [(set_attr "type" "veccmpfx")]) + (define_insn "altivec_eq" [(set (match_operand:VI2 0 "altivec_register_operand" "=v") (eq:VI2 (match_operand:VI2 1 "altivec_register_operand" "v") @@ -635,6 +647,14 @@ "vcmpgts %0,%1,%2" [(set_attr "type" "veccmpfx")]) +(define_insn "*altivec_gtv1ti" + [(set (match_operand:V1TI 0 "altivec_register_operand" "=v") + (gt:V1TI (match_operand:V1TI 1 "altivec_register_operand" "v") + (match_operand:V1TI 2 "altivec_register_operand" "v")))] + "TARGET_POWER10" + "vcmpgtsq %0,%1,%2" + [(set_attr "type" "veccmpfx")]) + (define_insn "*altivec_gtu" [(set (match_operand:VI2 0 "altivec_register_operand" "=v") (gtu:VI2 (match_operand:VI2 1 "altivec_register_operand" "v") @@ -643,6 +663,14 @@ "vcmpgtu %0,%1,%2" [(set_attr "type" "veccmpfx")]) +(define_insn "*altivec_gtuv1ti" + [(set (match_operand:V1TI 0 "altivec_register_operand" "=v") + (gtu:V1TI (match_operand:V1TI 1 "altivec_register_operand" "v") + (match_operand:V1TI 2 "altivec_register_operand" "v")))] + "TARGET_POWER10" + "vcmpgtuq %0,%1,%2" + [(set_attr "type" "veccmpfx")]) + (define_insn "*altivec_eqv4sf" [(set (match_operand:V4SF 0 "altivec_register_operand" "=v") (eq:V4SF (match_operand:V4SF 1 "altivec_register_operand" "v") @@ -1693,6 +1721,19 @@ DONE; }) +(define_expand "vec_widen_umult_even_v2di" + [(use (match_operand:V1TI 0 "register_operand")) + (use (match_operand:V2DI 1 "register_operand")) + (use (match_operand:V2DI 2 "register_operand"))] + "TARGET_POWER10" +{ + if (BYTES_BIG_ENDIAN) + emit_insn (gen_altivec_vmuleud (operands[0], operands[1], operands[2])); + else + emit_insn (gen_altivec_vmuloud (operands[0], operands[1], operands[2])); + DONE; +}) + (define_expand "vec_widen_smult_even_v4si" [(use (match_operand:V2DI 0 "register_operand")) (use (match_operand:V4SI 1 "register_operand")) @@ -1706,6 +1747,19 @@ DONE; }) +(define_expand "vec_widen_smult_even_v2di" + [(use (match_operand:V1TI 0 "register_operand")) + (use (match_operand:V2DI 1 "register_operand")) + (use (match_operand:V2DI 2 "register_operand"))] + "TARGET_POWER10" +{ + if (BYTES_BIG_ENDIAN) + emit_insn (gen_altivec_vmulesd (operands[0], operands[1], operands[2])); + else + emit_insn (gen_altivec_vmulosd (operands[0], operands[1], operands[2])); + DONE; +}) + (define_expand "vec_widen_umult_odd_v16qi" [(use (match_operand:V8HI 0 "register_operand")) (use (match_operand:V16QI 1 "register_operand")) @@ -1771,6 +1825,19 @@ DONE; }) +(define_expand "vec_widen_umult_odd_v2di" + [(use (match_operand:V1TI 0 "register_operand")) + (use (match_operand:V2DI 1 "register_operand")) + (use (match_operand:V2DI 2 "register_operand"))] + "TARGET_POWER10" +{ + if (BYTES_BIG_ENDIAN) + emit_insn (gen_altivec_vmuloud 
(operands[0], operands[1], operands[2])); + else + emit_insn (gen_altivec_vmuleud (operands[0], operands[1], operands[2])); + DONE; +}) + (define_expand "vec_widen_smult_odd_v4si" [(use (match_operand:V2DI 0 "register_operand")) (use (match_operand:V4SI 1 "register_operand")) @@ -1784,6 +1851,19 @@ DONE; }) +(define_expand "vec_widen_smult_odd_v2di" + [(use (match_operand:V1TI 0 "register_operand")) + (use (match_operand:V2DI 1 "register_operand")) + (use (match_operand:V2DI 2 "register_operand"))] + "TARGET_POWER10" +{ + if (BYTES_BIG_ENDIAN) + emit_insn (gen_altivec_vmulosd (operands[0], operands[1], operands[2])); + else + emit_insn (gen_altivec_vmulesd (operands[0], operands[1], operands[2])); + DONE; +}) + (define_insn "altivec_vmuleub" [(set (match_operand:V8HI 0 "register_operand" "=v") (unspec:V8HI [(match_operand:V16QI 1 "register_operand" "v") @@ -1865,6 +1945,15 @@ "vmuleuw %0,%1,%2" [(set_attr "type" "veccomplex")]) +(define_insn "altivec_vmuleud" + [(set (match_operand:V1TI 0 "register_operand" "=v") + (unspec:V1TI [(match_operand:V2DI 1 "register_operand" "v") + (match_operand:V2DI 2 "register_operand" "v")] + UNSPEC_VMULEUD))] + "TARGET_POWER10" + "vmuleud %0,%1,%2" + [(set_attr "type" "veccomplex")]) + (define_insn "altivec_vmulouw" [(set (match_operand:V2DI 0 "register_operand" "=v") (unspec:V2DI [(match_operand:V4SI 1 "register_operand" "v") @@ -1874,6 +1963,15 @@ "vmulouw %0,%1,%2" [(set_attr "type" "veccomplex")]) +(define_insn "altivec_vmuloud" + [(set (match_operand:V1TI 0 "register_operand" "=v") + (unspec:V1TI [(match_operand:V2DI 1 "register_operand" "v") + (match_operand:V2DI 2 "register_operand" "v")] + UNSPEC_VMULOUD))] + "TARGET_POWER10" + "vmuloud %0,%1,%2" + [(set_attr "type" "veccomplex")]) + (define_insn "altivec_vmulesw" [(set (match_operand:V2DI 0 "register_operand" "=v") (unspec:V2DI [(match_operand:V4SI 1 "register_operand" "v") @@ -1883,6 +1981,15 @@ "vmulesw %0,%1,%2" [(set_attr "type" "veccomplex")]) +(define_insn "altivec_vmulesd" + [(set (match_operand:V1TI 0 "register_operand" "=v") + (unspec:V1TI [(match_operand:V2DI 1 "register_operand" "v") + (match_operand:V2DI 2 "register_operand" "v")] + UNSPEC_VMULESD))] + "TARGET_POWER10" + "vmulesd %0,%1,%2" + [(set_attr "type" "veccomplex")]) + (define_insn "altivec_vmulosw" [(set (match_operand:V2DI 0 "register_operand" "=v") (unspec:V2DI [(match_operand:V4SI 1 "register_operand" "v") @@ -1892,6 +1999,15 @@ "vmulosw %0,%1,%2" [(set_attr "type" "veccomplex")]) +(define_insn "altivec_vmulosd" + [(set (match_operand:V1TI 0 "register_operand" "=v") + (unspec:V1TI [(match_operand:V2DI 1 "register_operand" "v") + (match_operand:V2DI 2 "register_operand" "v")] + UNSPEC_VMULOSD))] + "TARGET_POWER10" + "vmulosd %0,%1,%2" + [(set_attr "type" "veccomplex")]) + ;; Vector pack/unpack (define_insn "altivec_vpkpx" [(set (match_operand:V8HI 0 "register_operand" "=v") @@ -1985,6 +2101,15 @@ "vrl %0,%1,%2" [(set_attr "type" "vecsimple")]) +(define_insn "altivec_vrlq" + [(set (match_operand:V1TI 0 "vsx_register_operand" "=v") + (rotate:V1TI (match_operand:V1TI 1 "vsx_register_operand" "v") + (match_operand:V1TI 2 "vsx_register_operand" "v")))] + "TARGET_POWER10" +;; rotate amount in needs to be in bits[57:63] of operand2. 
+ "vrlq %0,%1,%2" + [(set_attr "type" "vecsimple")]) + (define_insn "altivec_vrlmi" [(set (match_operand:VIlong 0 "register_operand" "=v") (unspec:VIlong [(match_operand:VIlong 1 "register_operand" "v") @@ -1995,6 +2120,34 @@ "vrlmi %0,%1,%3" [(set_attr "type" "veclogical")]) +(define_expand "altivec_vrlqmi" + [(set (match_operand:V1TI 0 "vsx_register_operand") + (unspec:V1TI [(match_operand:V1TI 1 "vsx_register_operand") + (match_operand:V1TI 2 "vsx_register_operand") + (match_operand:V1TI 3 "vsx_register_operand")] + UNSPEC_VRLMI))] + "TARGET_POWER10" +{ + /* Mask bit begin, end fields need to be in bits [41:55] of 128-bit operand2. + Shift amount in needs to be put in bits[57:63] of 128-bit operand2. */ + rtx tmp = gen_reg_rtx (V1TImode); + + emit_insn (gen_xxswapd_v1ti (tmp, operands[3])); + emit_insn (gen_altivec_vrlqmi_inst (operands[0], operands[1], operands[2], + tmp)); + DONE; +}) + +(define_insn "altivec_vrlqmi_inst" + [(set (match_operand:V1TI 0 "vsx_register_operand" "=v") + (unspec:V1TI [(match_operand:V1TI 1 "vsx_register_operand" "v") + (match_operand:V1TI 2 "vsx_register_operand" "0") + (match_operand:V1TI 3 "vsx_register_operand" "v")] + UNSPEC_VRLMI))] + "TARGET_POWER10" + "vrlqmi %0,%1,%3" + [(set_attr "type" "veclogical")]) + (define_insn "altivec_vrlnm" [(set (match_operand:VIlong 0 "register_operand" "=v") (unspec:VIlong [(match_operand:VIlong 1 "register_operand" "v") @@ -2004,6 +2157,31 @@ "vrlnm %0,%1,%2" [(set_attr "type" "veclogical")]) +(define_expand "altivec_vrlqnm" + [(set (match_operand:V1TI 0 "vsx_register_operand") + (unspec:V1TI [(match_operand:V1TI 1 "vsx_register_operand") + (match_operand:V1TI 2 "vsx_register_operand")] + UNSPEC_VRLNM))] + "TARGET_POWER10" +{ + /* Shift amount in needs to be put in bits[57:63] of 128-bit operand2. */ + rtx tmp = gen_reg_rtx (V1TImode); + + emit_insn (gen_xxswapd_v1ti (tmp, operands[2])); + emit_insn (gen_altivec_vrlqnm_inst (operands[0], operands[1], tmp)); + DONE; +}) + +(define_insn "altivec_vrlqnm_inst" + [(set (match_operand:V1TI 0 "vsx_register_operand" "=v") + (unspec:V1TI [(match_operand:V1TI 1 "vsx_register_operand" "v") + (match_operand:V1TI 2 "vsx_register_operand" "v")] + UNSPEC_VRLNM))] + "TARGET_POWER10" + ;; rotate and mask bits need to be in upper 64-bits of operand2. + "vrlqnm %0,%1,%2" + [(set_attr "type" "veclogical")]) + (define_insn "altivec_vsl" [(set (match_operand:V4SI 0 "register_operand" "=v") (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v") @@ -2048,6 +2226,15 @@ "vsl %0,%1,%2" [(set_attr "type" "vecsimple")]) +(define_insn "altivec_vslq" + [(set (match_operand:V1TI 0 "vsx_register_operand" "=v") + (ashift:V1TI (match_operand:V1TI 1 "vsx_register_operand" "v") + (match_operand:V1TI 2 "vsx_register_operand" "v")))] + "TARGET_POWER10" + /* Shift amount in needs to be in bits[57:63] of 128-bit operand. */ + "vslq %0,%1,%2" + [(set_attr "type" "vecsimple")]) + (define_insn "*altivec_vsr" [(set (match_operand:VI2 0 "register_operand" "=v") (lshiftrt:VI2 (match_operand:VI2 1 "register_operand" "v") @@ -2056,6 +2243,15 @@ "vsr %0,%1,%2" [(set_attr "type" "vecsimple")]) +(define_insn "altivec_vsrq" + [(set (match_operand:V1TI 0 "vsx_register_operand" "=v") + (lshiftrt:V1TI (match_operand:V1TI 1 "vsx_register_operand" "v") + (match_operand:V1TI 2 "vsx_register_operand" "v")))] + "TARGET_POWER10" + /* Shift amount in needs to be in bits[57:63] of 128-bit operand. 
*/ + "vsrq %0,%1,%2" + [(set_attr "type" "vecsimple")]) + (define_insn "*altivec_vsra" [(set (match_operand:VI2 0 "register_operand" "=v") (ashiftrt:VI2 (match_operand:VI2 1 "register_operand" "v") @@ -2064,6 +2260,15 @@ "vsra %0,%1,%2" [(set_attr "type" "vecsimple")]) +(define_insn "altivec_vsraq" + [(set (match_operand:V1TI 0 "vsx_register_operand" "=v") + (ashiftrt:V1TI (match_operand:V1TI 1 "vsx_register_operand" "v") + (match_operand:V1TI 2 "vsx_register_operand" "v")))] + "TARGET_POWER10" + /* Shift amount in needs to be in bits[57:63] of 128-bit operand. */ + "vsraq %0,%1,%2" + [(set_attr "type" "vecsimple")]) + (define_insn "altivec_vsr" [(set (match_operand:V4SI 0 "register_operand" "=v") (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v") @@ -2624,6 +2829,18 @@ "vcmpequ. %0,%1,%2" [(set_attr "type" "veccmpfx")]) +(define_insn "altivec_vcmpequt_p" + [(set (reg:CC CR6_REGNO) + (unspec:CC [(eq:CC (match_operand:V1TI 1 "altivec_register_operand" "v") + (match_operand:V1TI 2 "altivec_register_operand" "v"))] + UNSPEC_PREDICATE)) + (set (match_operand:V1TI 0 "altivec_register_operand" "=v") + (eq:V1TI (match_dup 1) + (match_dup 2)))] + "TARGET_POWER10" + "vcmpequq. %0,%1,%2" + [(set_attr "type" "veccmpfx")]) + (define_insn "*altivec_vcmpgts_p" [(set (reg:CC CR6_REGNO) (unspec:CC [(gt:CC (match_operand:VI2 1 "register_operand" "v") @@ -2636,6 +2853,18 @@ "vcmpgts. %0,%1,%2" [(set_attr "type" "veccmpfx")]) +(define_insn "*altivec_vcmpgtst_p" + [(set (reg:CC CR6_REGNO) + (unspec:CC [(gt:CC (match_operand:V1TI 1 "register_operand" "v") + (match_operand:V1TI 2 "register_operand" "v"))] + UNSPEC_PREDICATE)) + (set (match_operand:V1TI 0 "register_operand" "=v") + (gt:V1TI (match_dup 1) + (match_dup 2)))] + "TARGET_POWER10" + "vcmpgtsq. %0,%1,%2" + [(set_attr "type" "veccmpfx")]) + (define_insn "*altivec_vcmpgtu_p" [(set (reg:CC CR6_REGNO) (unspec:CC [(gtu:CC (match_operand:VI2 1 "register_operand" "v") @@ -2648,6 +2877,18 @@ "vcmpgtu. %0,%1,%2" [(set_attr "type" "veccmpfx")]) +(define_insn "*altivec_vcmpgtut_p" + [(set (reg:CC CR6_REGNO) + (unspec:CC [(gtu:CC (match_operand:V1TI 1 "register_operand" "v") + (match_operand:V1TI 2 "register_operand" "v"))] + UNSPEC_PREDICATE)) + (set (match_operand:V1TI 0 "register_operand" "=v") + (gtu:V1TI (match_dup 1) + (match_dup 2)))] + "TARGET_POWER10" + "vcmpgtuq. %0,%1,%2" + [(set_attr "type" "veccmpfx")]) + (define_insn "*altivec_vcmpeqfp_p" [(set (reg:CC CR6_REGNO) (unspec:CC [(eq:CC (match_operand:V4SF 1 "register_operand" "v") diff --git a/gcc/config/rs6000/rs6000-builtin.def b/gcc/config/rs6000/rs6000-builtin.def index 609bebd..dba2282 100644 --- a/gcc/config/rs6000/rs6000-builtin.def +++ b/gcc/config/rs6000/rs6000-builtin.def @@ -1269,6 +1269,15 @@ | RS6000_BTC_TERNARY), \ CODE_FOR_ ## ICODE) /* ICODE */ +/* See the comment on BU_ALTIVEC_P. */ +#define BU_P10V_AV_P(ENUM, NAME, ATTR, ICODE) \ + RS6000_BUILTIN_P (P10V_BUILTIN_ ## ENUM, /* ENUM */ \ + "__builtin_altivec_" NAME, /* NAME */ \ + RS6000_BTM_P10, /* MASK */ \ + (RS6000_BTC_ ## ATTR /* ATTR */ \ + | RS6000_BTC_PREDICATE), \ + CODE_FOR_ ## ICODE) /* ICODE */ + #define BU_P10V_AV_X(ENUM, NAME, ATTR) \ RS6000_BUILTIN_X (P10_BUILTIN_ ## ENUM, /* ENUM */ \ "__builtin_altivec_" NAME, /* NAME */ \ @@ -2880,6 +2889,10 @@ BU_P9_OVERLOAD_2 (CMPRB2, "byte_in_either_range") BU_P9_OVERLOAD_2 (CMPEQB, "byte_in_set") /* Builtins for scalar instructions added in ISA 3.1 (power10). 
*/ +BU_P10V_AV_P (VCMPEQUT_P, "vcmpequt_p", CONST, vector_eq_v1ti_p) +BU_P10V_AV_P (VCMPGTST_P, "vcmpgtst_p", CONST, vector_gt_v1ti_p) +BU_P10V_AV_P (VCMPGTUT_P, "vcmpgtut_p", CONST, vector_gtu_v1ti_p) + BU_P10_POWERPC64_MISC_2 (CFUGED, "cfuged", CONST, cfuged) BU_P10_POWERPC64_MISC_2 (CNTLZDM, "cntlzdm", CONST, cntlzdm) BU_P10_POWERPC64_MISC_2 (CNTTZDM, "cnttzdm", CONST, cnttzdm) @@ -2900,7 +2913,36 @@ BU_P10V_VSX_2 (XXGENPCVM_V16QI, "xxgenpcvm_v16qi", CONST, xxgenpcvm_v16qi) BU_P10V_VSX_2 (XXGENPCVM_V8HI, "xxgenpcvm_v8hi", CONST, xxgenpcvm_v8hi) BU_P10V_VSX_2 (XXGENPCVM_V4SI, "xxgenpcvm_v4si", CONST, xxgenpcvm_v4si) BU_P10V_VSX_2 (XXGENPCVM_V2DI, "xxgenpcvm_v2di", CONST, xxgenpcvm_v2di) - +BU_P10V_AV_2 (VCMPGTUT, "vcmpgtut", CONST, vector_gtuv1ti) +BU_P10V_AV_2 (VCMPGTST, "vcmpgtst", CONST, vector_gtv1ti) +BU_P10V_AV_2 (VCMPEQUT, "vcmpequt", CONST, eqvv1ti3) +BU_P10V_AV_2 (CMPNET, "vcmpnet", CONST, vcmpnet) +BU_P10V_AV_2 (CMPGE_1TI, "cmpge_1ti", CONST, vector_nltv1ti) +BU_P10V_AV_2 (CMPGE_U1TI, "cmpge_u1ti", CONST, vector_nltuv1ti) +BU_P10V_AV_2 (CMPLE_1TI, "cmple_1ti", CONST, vector_ngtv1ti) +BU_P10V_AV_2 (CMPLE_U1TI, "cmple_u1ti", CONST, vector_ngtuv1ti) +BU_P10V_AV_2 (VNOR_V1TI_UNS, "vnor_v1ti_uns",CONST, norv1ti3) +BU_P10V_AV_2 (VNOR_V1TI, "vnor_v1ti", CONST, norv1ti3) +BU_P10V_AV_2 (VCMPNET_P, "vcmpnet_p", CONST, vector_ne_v1ti_p) +BU_P10V_AV_2 (VCMPAET_P, "vcmpaet_p", CONST, vector_ae_v1ti_p) + +BU_P10V_AV_2 (VMULEUD, "vmuleud", CONST, vec_widen_umult_even_v2di) +BU_P10V_AV_2 (VMULESD, "vmulesd", CONST, vec_widen_smult_even_v2di) +BU_P10V_AV_2 (VMULOUD, "vmuloud", CONST, vec_widen_umult_odd_v2di) +BU_P10V_AV_2 (VMULOSD, "vmulosd", CONST, vec_widen_smult_odd_v2di) +BU_P10V_AV_2 (VRLQ, "vrlq", CONST, vrotlv1ti3) +BU_P10V_AV_2 (VSLQ, "vslq", CONST, vashlv1ti3) +BU_P10V_AV_2 (VSRQ, "vsrq", CONST, vlshrv1ti3) +BU_P10V_AV_2 (VSRAQ, "vsraq", CONST, vashrv1ti3) +BU_P10V_AV_2 (VRLQNM, "vrlqnm", CONST, altivec_vrlqnm) +BU_P10V_AV_2 (DIV_V1TI, "div_1ti", CONST, vsx_div_v1ti) +BU_P10V_AV_2 (UDIV_V1TI, "udiv_1ti", CONST, vsx_udiv_v1ti) +BU_P10V_AV_2 (DIVES_V1TI, "dives", CONST, vsx_dives_v1ti) +BU_P10V_AV_2 (DIVEU_V1TI, "diveu", CONST, vsx_diveu_v1ti) +BU_P10V_AV_2 (MODS_V1TI, "mods", CONST, vsx_mods_v1ti) +BU_P10V_AV_2 (MODU_V1TI, "modu", CONST, vsx_modu_v1ti) + +BU_P10V_AV_3 (VRLQMI, "vrlqmi", CONST, altivec_vrlqmi) BU_P10V_AV_3 (VEXTRACTBL, "vextdubvlx", CONST, vextractlv16qi) BU_P10V_AV_3 (VEXTRACTHL, "vextduhvlx", CONST, vextractlv8hi) BU_P10V_AV_3 (VEXTRACTWL, "vextduwvlx", CONST, vextractlv4si) @@ -3025,6 +3067,10 @@ BU_P10_OVERLOAD_2 (CLRR, "clrr") BU_P10_OVERLOAD_2 (GNB, "gnb") BU_P10_OVERLOAD_4 (XXEVAL, "xxeval") BU_P10_OVERLOAD_2 (XXGENPCVM, "xxgenpcvm") +BU_P10_OVERLOAD_2 (VRLQ, "vrlq") +BU_P10_OVERLOAD_2 (VSLQ, "vslq") +BU_P10_OVERLOAD_2 (VSRQ, "vsrq") +BU_P10_OVERLOAD_2 (VSRAQ, "vsraq") BU_P10_OVERLOAD_3 (EXTRACTL, "extractl") BU_P10_OVERLOAD_3 (EXTRACTH, "extracth") diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c index b4e13af..d1f29a5 100644 --- a/gcc/config/rs6000/rs6000-call.c +++ b/gcc/config/rs6000/rs6000-call.c @@ -843,6 +843,10 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = { RS6000_BTI_bool_V2DI, RS6000_BTI_V2DI, RS6000_BTI_V2DI, 0 }, { ALTIVEC_BUILTIN_VEC_CMPEQ, P8V_BUILTIN_VCMPEQUD, RS6000_BTI_bool_V2DI, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, 0 }, + { ALTIVEC_BUILTIN_VEC_CMPEQ, P10V_BUILTIN_VCMPEQUT, + RS6000_BTI_bool_V1TI, RS6000_BTI_V1TI, RS6000_BTI_V1TI, 0 }, + { ALTIVEC_BUILTIN_VEC_CMPEQ, 
P10V_BUILTIN_VCMPEQUT, + RS6000_BTI_bool_V1TI, RS6000_BTI_unsigned_V1TI, RS6000_BTI_unsigned_V1TI, 0 }, { ALTIVEC_BUILTIN_VEC_CMPEQ, ALTIVEC_BUILTIN_VCMPEQFP, RS6000_BTI_bool_V4SI, RS6000_BTI_V4SF, RS6000_BTI_V4SF, 0 }, { ALTIVEC_BUILTIN_VEC_CMPEQ, VSX_BUILTIN_XVCMPEQDP, @@ -889,6 +893,12 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = { { ALTIVEC_BUILTIN_VEC_CMPGE, VSX_BUILTIN_CMPGE_U2DI, RS6000_BTI_bool_V2DI, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, 0}, + + { ALTIVEC_BUILTIN_VEC_CMPGE, P10V_BUILTIN_CMPGE_1TI, + RS6000_BTI_bool_V1TI, RS6000_BTI_V1TI, RS6000_BTI_V1TI, 0}, + { ALTIVEC_BUILTIN_VEC_CMPGE, P10V_BUILTIN_CMPGE_U1TI, + RS6000_BTI_bool_V1TI, RS6000_BTI_unsigned_V1TI, + RS6000_BTI_unsigned_V1TI, 0}, { ALTIVEC_BUILTIN_VEC_CMPGT, ALTIVEC_BUILTIN_VCMPGTUB, RS6000_BTI_bool_V16QI, RS6000_BTI_unsigned_V16QI, RS6000_BTI_unsigned_V16QI, 0 }, { ALTIVEC_BUILTIN_VEC_CMPGT, ALTIVEC_BUILTIN_VCMPGTSB, @@ -903,8 +913,12 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = { RS6000_BTI_bool_V4SI, RS6000_BTI_V4SI, RS6000_BTI_V4SI, 0 }, { ALTIVEC_BUILTIN_VEC_CMPGT, P8V_BUILTIN_VCMPGTUD, RS6000_BTI_bool_V2DI, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, 0 }, + { ALTIVEC_BUILTIN_VEC_CMPGT, P10V_BUILTIN_VCMPGTUT, + RS6000_BTI_bool_V1TI, RS6000_BTI_unsigned_V1TI, RS6000_BTI_unsigned_V1TI, 0 }, { ALTIVEC_BUILTIN_VEC_CMPGT, P8V_BUILTIN_VCMPGTSD, RS6000_BTI_bool_V2DI, RS6000_BTI_V2DI, RS6000_BTI_V2DI, 0 }, + { ALTIVEC_BUILTIN_VEC_CMPGT, P10V_BUILTIN_VCMPGTST, + RS6000_BTI_bool_V1TI, RS6000_BTI_V1TI, RS6000_BTI_V1TI, 0 }, { ALTIVEC_BUILTIN_VEC_CMPGT, ALTIVEC_BUILTIN_VCMPGTFP, RS6000_BTI_bool_V4SI, RS6000_BTI_V4SF, RS6000_BTI_V4SF, 0 }, { ALTIVEC_BUILTIN_VEC_CMPGT, VSX_BUILTIN_XVCMPGTDP, @@ -947,6 +961,11 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = { { ALTIVEC_BUILTIN_VEC_CMPLE, VSX_BUILTIN_CMPLE_U2DI, RS6000_BTI_bool_V2DI, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, 0}, + { ALTIVEC_BUILTIN_VEC_CMPLE, P10V_BUILTIN_CMPLE_1TI, + RS6000_BTI_bool_V1TI, RS6000_BTI_V1TI, RS6000_BTI_V1TI, 0}, + { ALTIVEC_BUILTIN_VEC_CMPLE, P10V_BUILTIN_CMPLE_U1TI, + RS6000_BTI_bool_V1TI, RS6000_BTI_unsigned_V1TI, + RS6000_BTI_unsigned_V1TI, 0}, { ALTIVEC_BUILTIN_VEC_CMPLT, ALTIVEC_BUILTIN_VCMPGTUB, RS6000_BTI_bool_V16QI, RS6000_BTI_unsigned_V16QI, RS6000_BTI_unsigned_V16QI, 0 }, { ALTIVEC_BUILTIN_VEC_CMPLT, ALTIVEC_BUILTIN_VCMPGTSB, @@ -1086,6 +1105,11 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = { { VSX_BUILTIN_VEC_DIV, P10V_BUILTIN_DIVU_V2DI, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, 0 }, + { VSX_BUILTIN_VEC_DIV, P10V_BUILTIN_DIV_V1TI, + RS6000_BTI_V1TI, RS6000_BTI_V1TI, RS6000_BTI_V1TI, 0 }, + { VSX_BUILTIN_VEC_DIV, P10V_BUILTIN_UDIV_V1TI, + RS6000_BTI_unsigned_V1TI, RS6000_BTI_unsigned_V1TI, + RS6000_BTI_unsigned_V1TI, 0 }, { P10_BUILTIN_VEC_DIVE, P10V_BUILTIN_DIVES_V4SI, RS6000_BTI_V4SI, RS6000_BTI_V4SI, RS6000_BTI_V4SI, 0 }, @@ -1097,6 +1121,11 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = { { P10_BUILTIN_VEC_DIVE, P10V_BUILTIN_DIVEU_V2DI, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, 0 }, + { P10_BUILTIN_VEC_DIVE, P10V_BUILTIN_DIVES_V1TI, + RS6000_BTI_V1TI, RS6000_BTI_V1TI, RS6000_BTI_V1TI, 0 }, + { P10_BUILTIN_VEC_DIVE, P10V_BUILTIN_DIVEU_V1TI, + RS6000_BTI_unsigned_V1TI, RS6000_BTI_unsigned_V1TI, + RS6000_BTI_unsigned_V1TI, 0 }, { P10_BUILTIN_VEC_MOD, P10V_BUILTIN_MODS_V4SI, RS6000_BTI_V4SI, RS6000_BTI_V4SI, 
RS6000_BTI_V4SI, 0 }, @@ -1108,6 +1137,11 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = { { P10_BUILTIN_VEC_MOD, P10V_BUILTIN_MODU_V2DI, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, 0 }, + { P10_BUILTIN_VEC_MOD, P10V_BUILTIN_MODS_V1TI, + RS6000_BTI_V1TI, RS6000_BTI_V1TI, RS6000_BTI_V1TI, 0 }, + { P10_BUILTIN_VEC_MOD, P10V_BUILTIN_MODU_V1TI, + RS6000_BTI_unsigned_V1TI, RS6000_BTI_unsigned_V1TI, + RS6000_BTI_unsigned_V1TI, 0 }, { VSX_BUILTIN_VEC_DOUBLE, VSX_BUILTIN_XVCVSXDDP, RS6000_BTI_V2DF, RS6000_BTI_V2DI, 0, 0 }, @@ -1973,6 +2007,11 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = { { ALTIVEC_BUILTIN_VEC_MULE, P8V_BUILTIN_VMULEUW, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V4SI, RS6000_BTI_unsigned_V4SI, 0 }, + { ALTIVEC_BUILTIN_VEC_MULE, P10V_BUILTIN_VMULESD, + RS6000_BTI_V1TI, RS6000_BTI_V2DI, RS6000_BTI_V2DI, 0 }, + { ALTIVEC_BUILTIN_VEC_MULE, P10V_BUILTIN_VMULEUD, + RS6000_BTI_unsigned_V1TI, RS6000_BTI_unsigned_V2DI, + RS6000_BTI_unsigned_V2DI, 0 }, { ALTIVEC_BUILTIN_VEC_VMULEUB, ALTIVEC_BUILTIN_VMULEUB, RS6000_BTI_unsigned_V8HI, RS6000_BTI_unsigned_V16QI, RS6000_BTI_unsigned_V16QI, 0 }, { ALTIVEC_BUILTIN_VEC_VMULESB, ALTIVEC_BUILTIN_VMULESB, @@ -1996,6 +2035,11 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = { { ALTIVEC_BUILTIN_VEC_MULO, P8V_BUILTIN_VMULOUW, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V4SI, RS6000_BTI_unsigned_V4SI, 0 }, + { ALTIVEC_BUILTIN_VEC_MULO, P10V_BUILTIN_VMULOSD, + RS6000_BTI_V1TI, RS6000_BTI_V2DI, RS6000_BTI_V2DI, 0 }, + { ALTIVEC_BUILTIN_VEC_MULO, P10V_BUILTIN_VMULOUD, + RS6000_BTI_unsigned_V1TI, RS6000_BTI_unsigned_V2DI, + RS6000_BTI_unsigned_V2DI, 0 }, { ALTIVEC_BUILTIN_VEC_MULO, ALTIVEC_BUILTIN_VMULOSH, RS6000_BTI_V4SI, RS6000_BTI_V8HI, RS6000_BTI_V8HI, 0 }, { ALTIVEC_BUILTIN_VEC_VMULOSH, ALTIVEC_BUILTIN_VMULOSH, @@ -2038,6 +2082,16 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = { RS6000_BTI_V2DI, RS6000_BTI_V2DI, RS6000_BTI_bool_V2DI, 0 }, { ALTIVEC_BUILTIN_VEC_NOR, ALTIVEC_BUILTIN_VNOR_V2DI, RS6000_BTI_V2DI, RS6000_BTI_bool_V2DI, RS6000_BTI_V2DI, 0 }, + { ALTIVEC_BUILTIN_VEC_NOR, P10V_BUILTIN_VNOR_V1TI, + RS6000_BTI_V1TI, RS6000_BTI_V1TI, RS6000_BTI_bool_V1TI, 0 }, + { ALTIVEC_BUILTIN_VEC_NOR, P10V_BUILTIN_VNOR_V1TI, + RS6000_BTI_V1TI, RS6000_BTI_bool_V1TI, RS6000_BTI_V1TI, 0 }, + { ALTIVEC_BUILTIN_VEC_NOR, P10V_BUILTIN_VNOR_V1TI_UNS, + RS6000_BTI_unsigned_V1TI, RS6000_BTI_unsigned_V1TI, RS6000_BTI_unsigned_V1TI, 0 }, + { ALTIVEC_BUILTIN_VEC_NOR, P10V_BUILTIN_VNOR_V1TI_UNS, + RS6000_BTI_unsigned_V1TI, RS6000_BTI_unsigned_V1TI, RS6000_BTI_bool_V1TI, 0 }, + { ALTIVEC_BUILTIN_VEC_NOR, P10V_BUILTIN_VNOR_V1TI_UNS, + RS6000_BTI_unsigned_V1TI, RS6000_BTI_bool_V1TI, RS6000_BTI_unsigned_V1TI, 0 }, { ALTIVEC_BUILTIN_VEC_NOR, ALTIVEC_BUILTIN_VNOR_V2DI_UNS, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, 0 }, { ALTIVEC_BUILTIN_VEC_NOR, ALTIVEC_BUILTIN_VNOR_V2DI_UNS, @@ -2299,6 +2353,11 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = { RS6000_BTI_V2DI, RS6000_BTI_V2DI, RS6000_BTI_unsigned_V2DI, 0 }, { ALTIVEC_BUILTIN_VEC_RL, P8V_BUILTIN_VRLD, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, 0 }, + { ALTIVEC_BUILTIN_VEC_RL, P10V_BUILTIN_VRLQ, + RS6000_BTI_V1TI, RS6000_BTI_V1TI, RS6000_BTI_unsigned_V1TI, 0 }, + { ALTIVEC_BUILTIN_VEC_RL, P10V_BUILTIN_VRLQ, + RS6000_BTI_unsigned_V1TI, RS6000_BTI_unsigned_V1TI, + RS6000_BTI_unsigned_V1TI, 0 }, { 
ALTIVEC_BUILTIN_VEC_VRLW, ALTIVEC_BUILTIN_VRLW, RS6000_BTI_V4SI, RS6000_BTI_V4SI, RS6000_BTI_unsigned_V4SI, 0 }, { ALTIVEC_BUILTIN_VEC_VRLW, ALTIVEC_BUILTIN_VRLW, @@ -2317,12 +2376,23 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = { { P9V_BUILTIN_VEC_RLMI, P9V_BUILTIN_VRLDMI, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI }, + { P9V_BUILTIN_VEC_RLMI, P10V_BUILTIN_VRLQMI, + RS6000_BTI_V1TI, RS6000_BTI_V1TI, + RS6000_BTI_V1TI, RS6000_BTI_unsigned_V1TI }, + { P9V_BUILTIN_VEC_RLMI, P10V_BUILTIN_VRLQMI, + RS6000_BTI_unsigned_V1TI, RS6000_BTI_unsigned_V1TI, + RS6000_BTI_unsigned_V1TI, RS6000_BTI_unsigned_V1TI }, { P9V_BUILTIN_VEC_RLNM, P9V_BUILTIN_VRLWNM, RS6000_BTI_unsigned_V4SI, RS6000_BTI_unsigned_V4SI, RS6000_BTI_unsigned_V4SI, 0 }, { P9V_BUILTIN_VEC_RLNM, P9V_BUILTIN_VRLDNM, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, 0 }, + { P9V_BUILTIN_VEC_RLNM, P10V_BUILTIN_VRLQNM, + RS6000_BTI_unsigned_V1TI, RS6000_BTI_unsigned_V1TI, + RS6000_BTI_unsigned_V1TI, 0 }, + { P9V_BUILTIN_VEC_RLNM, P10V_BUILTIN_VRLQNM, + RS6000_BTI_V1TI, RS6000_BTI_V1TI, RS6000_BTI_unsigned_V1TI, 0 }, { ALTIVEC_BUILTIN_VEC_SL, ALTIVEC_BUILTIN_VSLB, RS6000_BTI_V16QI, RS6000_BTI_V16QI, RS6000_BTI_unsigned_V16QI, 0 }, { ALTIVEC_BUILTIN_VEC_SL, ALTIVEC_BUILTIN_VSLB, @@ -2339,6 +2409,11 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = { RS6000_BTI_V2DI, RS6000_BTI_V2DI, RS6000_BTI_unsigned_V2DI, 0 }, { ALTIVEC_BUILTIN_VEC_SL, P8V_BUILTIN_VSLD, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, 0 }, + { ALTIVEC_BUILTIN_VEC_SL, P10V_BUILTIN_VSLQ, + RS6000_BTI_V1TI, RS6000_BTI_V1TI, RS6000_BTI_unsigned_V1TI, 0 }, + { ALTIVEC_BUILTIN_VEC_SL, P10V_BUILTIN_VSLQ, + RS6000_BTI_unsigned_V1TI, RS6000_BTI_unsigned_V1TI, + RS6000_BTI_unsigned_V1TI, 0 }, { ALTIVEC_BUILTIN_VEC_SQRT, VSX_BUILTIN_XVSQRTDP, RS6000_BTI_V2DF, RS6000_BTI_V2DF, 0, 0 }, { ALTIVEC_BUILTIN_VEC_SQRT, VSX_BUILTIN_XVSQRTSP, @@ -2535,6 +2610,11 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = { RS6000_BTI_V2DI, RS6000_BTI_V2DI, RS6000_BTI_unsigned_V2DI, 0 }, { ALTIVEC_BUILTIN_VEC_SR, P8V_BUILTIN_VSRD, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, 0 }, + { ALTIVEC_BUILTIN_VEC_SR, P10V_BUILTIN_VSRQ, + RS6000_BTI_V1TI, RS6000_BTI_V1TI, RS6000_BTI_unsigned_V1TI, 0 }, + { ALTIVEC_BUILTIN_VEC_SR, P10V_BUILTIN_VSRQ, + RS6000_BTI_unsigned_V1TI, RS6000_BTI_unsigned_V1TI, + RS6000_BTI_unsigned_V1TI, 0 }, { ALTIVEC_BUILTIN_VEC_VSRW, ALTIVEC_BUILTIN_VSRW, RS6000_BTI_V4SI, RS6000_BTI_V4SI, RS6000_BTI_unsigned_V4SI, 0 }, { ALTIVEC_BUILTIN_VEC_VSRW, ALTIVEC_BUILTIN_VSRW, @@ -2563,6 +2643,11 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = { RS6000_BTI_V2DI, RS6000_BTI_V2DI, RS6000_BTI_unsigned_V2DI, 0 }, { ALTIVEC_BUILTIN_VEC_SRA, P8V_BUILTIN_VSRAD, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, 0 }, + { ALTIVEC_BUILTIN_VEC_SRA, P10V_BUILTIN_VSRAQ, + RS6000_BTI_V1TI, RS6000_BTI_V1TI, RS6000_BTI_unsigned_V1TI, 0 }, + { ALTIVEC_BUILTIN_VEC_SRA, P10V_BUILTIN_VSRAQ, + RS6000_BTI_unsigned_V1TI, RS6000_BTI_unsigned_V1TI, + RS6000_BTI_unsigned_V1TI, 0 }, { ALTIVEC_BUILTIN_VEC_VSRAW, ALTIVEC_BUILTIN_VSRAW, RS6000_BTI_V4SI, RS6000_BTI_V4SI, RS6000_BTI_unsigned_V4SI, 0 }, { ALTIVEC_BUILTIN_VEC_VSRAW, ALTIVEC_BUILTIN_VSRAW, @@ -4180,12 +4265,16 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = { RS6000_BTI_INTSI, 
RS6000_BTI_INTSI, RS6000_BTI_unsigned_V2DI, RS6000_BTI_bool_V2DI }, { ALTIVEC_BUILTIN_VEC_VCMPGT_P, P8V_BUILTIN_VCMPGTUD_P, RS6000_BTI_INTSI, RS6000_BTI_INTSI, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI }, + { ALTIVEC_BUILTIN_VEC_VCMPGT_P, P10V_BUILTIN_VCMPGTUT_P, + RS6000_BTI_INTSI, RS6000_BTI_INTSI, RS6000_BTI_unsigned_V1TI, RS6000_BTI_unsigned_V1TI }, { ALTIVEC_BUILTIN_VEC_VCMPGT_P, P8V_BUILTIN_VCMPGTSD_P, RS6000_BTI_INTSI, RS6000_BTI_INTSI, RS6000_BTI_bool_V2DI, RS6000_BTI_V2DI }, { ALTIVEC_BUILTIN_VEC_VCMPGT_P, P8V_BUILTIN_VCMPGTSD_P, RS6000_BTI_INTSI, RS6000_BTI_INTSI, RS6000_BTI_V2DI, RS6000_BTI_bool_V2DI }, { ALTIVEC_BUILTIN_VEC_VCMPGT_P, P8V_BUILTIN_VCMPGTSD_P, RS6000_BTI_INTSI, RS6000_BTI_INTSI, RS6000_BTI_V2DI, RS6000_BTI_V2DI }, + { ALTIVEC_BUILTIN_VEC_VCMPGT_P, P10V_BUILTIN_VCMPGTST_P, + RS6000_BTI_INTSI, RS6000_BTI_INTSI, RS6000_BTI_V1TI, RS6000_BTI_V1TI }, { ALTIVEC_BUILTIN_VEC_VCMPGT_P, ALTIVEC_BUILTIN_VCMPGTFP_P, RS6000_BTI_INTSI, RS6000_BTI_INTSI, RS6000_BTI_V4SF, RS6000_BTI_V4SF }, { ALTIVEC_BUILTIN_VEC_VCMPGT_P, VSX_BUILTIN_XVCMPGTDP_P, @@ -4250,6 +4339,10 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = { RS6000_BTI_INTSI, RS6000_BTI_INTSI, RS6000_BTI_V2DI, RS6000_BTI_V2DI }, { ALTIVEC_BUILTIN_VEC_VCMPEQ_P, P8V_BUILTIN_VCMPEQUD_P, RS6000_BTI_INTSI, RS6000_BTI_INTSI, RS6000_BTI_bool_V2DI, RS6000_BTI_bool_V2DI }, + { ALTIVEC_BUILTIN_VEC_VCMPEQ_P, P10V_BUILTIN_VCMPEQUT_P, + RS6000_BTI_INTSI, RS6000_BTI_INTSI, RS6000_BTI_V1TI, RS6000_BTI_V1TI }, + { ALTIVEC_BUILTIN_VEC_VCMPEQ_P, P10V_BUILTIN_VCMPEQUT_P, + RS6000_BTI_INTSI, RS6000_BTI_INTSI, RS6000_BTI_unsigned_V1TI, RS6000_BTI_unsigned_V1TI }, { ALTIVEC_BUILTIN_VEC_VCMPEQ_P, ALTIVEC_BUILTIN_VCMPEQFP_P, RS6000_BTI_INTSI, RS6000_BTI_INTSI, RS6000_BTI_V4SF, RS6000_BTI_V4SF }, { ALTIVEC_BUILTIN_VEC_VCMPEQ_P, VSX_BUILTIN_XVCMPEQDP_P, @@ -4301,12 +4394,16 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = { RS6000_BTI_INTSI, RS6000_BTI_INTSI, RS6000_BTI_unsigned_V2DI, RS6000_BTI_bool_V2DI }, { ALTIVEC_BUILTIN_VEC_VCMPGE_P, P8V_BUILTIN_VCMPGTUD_P, RS6000_BTI_INTSI, RS6000_BTI_INTSI, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI }, + { ALTIVEC_BUILTIN_VEC_VCMPGE_P, P10V_BUILTIN_VCMPGTUT_P, + RS6000_BTI_INTSI, RS6000_BTI_INTSI, RS6000_BTI_unsigned_V1TI, RS6000_BTI_unsigned_V1TI }, { ALTIVEC_BUILTIN_VEC_VCMPGE_P, P8V_BUILTIN_VCMPGTSD_P, RS6000_BTI_INTSI, RS6000_BTI_INTSI, RS6000_BTI_bool_V2DI, RS6000_BTI_V2DI }, { ALTIVEC_BUILTIN_VEC_VCMPGE_P, P8V_BUILTIN_VCMPGTSD_P, RS6000_BTI_INTSI, RS6000_BTI_INTSI, RS6000_BTI_V2DI, RS6000_BTI_bool_V2DI }, { ALTIVEC_BUILTIN_VEC_VCMPGE_P, P8V_BUILTIN_VCMPGTSD_P, RS6000_BTI_INTSI, RS6000_BTI_INTSI, RS6000_BTI_V2DI, RS6000_BTI_V2DI }, + { ALTIVEC_BUILTIN_VEC_VCMPGE_P, P10V_BUILTIN_VCMPGTST_P, + RS6000_BTI_INTSI, RS6000_BTI_INTSI, RS6000_BTI_V1TI, RS6000_BTI_V1TI }, { ALTIVEC_BUILTIN_VEC_VCMPGE_P, ALTIVEC_BUILTIN_VCMPGEFP_P, RS6000_BTI_INTSI, RS6000_BTI_INTSI, RS6000_BTI_V4SF, RS6000_BTI_V4SF }, { ALTIVEC_BUILTIN_VEC_VCMPGE_P, VSX_BUILTIN_XVCMPGEDP_P, @@ -4955,6 +5052,12 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = { { ALTIVEC_BUILTIN_VEC_CMPNE, P9V_BUILTIN_CMPNEW, RS6000_BTI_bool_V4SI, RS6000_BTI_unsigned_V4SI, RS6000_BTI_unsigned_V4SI, 0 }, + { ALTIVEC_BUILTIN_VEC_CMPNE, P10V_BUILTIN_CMPNET, + RS6000_BTI_bool_V1TI, RS6000_BTI_V1TI, + RS6000_BTI_V1TI, 0 }, + { ALTIVEC_BUILTIN_VEC_CMPNE, P10V_BUILTIN_CMPNET, + RS6000_BTI_bool_V1TI, RS6000_BTI_unsigned_V1TI, + RS6000_BTI_unsigned_V1TI, 0 }, /* The following 2 entries have 
been deprecated. */ { P9V_BUILTIN_VEC_VCMPNE_P, P9V_BUILTIN_VCMPNEB_P, @@ -5055,6 +5158,10 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = { { P9V_BUILTIN_VEC_VCMPNE_P, P9V_BUILTIN_VCMPNED_P, RS6000_BTI_INTSI, RS6000_BTI_bool_V2DI, RS6000_BTI_bool_V2DI, 0 }, + { P9V_BUILTIN_VEC_VCMPNE_P, P10V_BUILTIN_VCMPNET_P, + RS6000_BTI_INTSI, RS6000_BTI_V1TI, RS6000_BTI_V1TI, 0 }, + { P9V_BUILTIN_VEC_VCMPNE_P, P10V_BUILTIN_VCMPNET_P, + RS6000_BTI_INTSI, RS6000_BTI_unsigned_V1TI, RS6000_BTI_unsigned_V1TI, 0 }, { P9V_BUILTIN_VEC_VCMPNE_P, P9V_BUILTIN_VCMPNEFP_P, RS6000_BTI_INTSI, RS6000_BTI_V4SF, RS6000_BTI_V4SF, 0 }, @@ -5160,7 +5267,10 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = { { P9V_BUILTIN_VEC_VCMPAE_P, P9V_BUILTIN_VCMPAED_P, RS6000_BTI_INTSI, RS6000_BTI_bool_V2DI, RS6000_BTI_bool_V2DI, 0 }, - + { P9V_BUILTIN_VEC_VCMPAE_P, P10V_BUILTIN_VCMPAET_P, + RS6000_BTI_INTSI, RS6000_BTI_V1TI, RS6000_BTI_V1TI, 0 }, + { P9V_BUILTIN_VEC_VCMPAE_P, P10V_BUILTIN_VCMPAET_P, + RS6000_BTI_INTSI, RS6000_BTI_unsigned_V1TI, RS6000_BTI_unsigned_V1TI, 0 }, { P9V_BUILTIN_VEC_VCMPAE_P, P9V_BUILTIN_VCMPAEFP_P, RS6000_BTI_INTSI, RS6000_BTI_V4SF, RS6000_BTI_V4SF, 0 }, { P9V_BUILTIN_VEC_VCMPAE_P, P9V_BUILTIN_VCMPAEDP_P, @@ -12552,12 +12662,14 @@ rs6000_gimple_fold_builtin (gimple_stmt_iterator *gsi) case ALTIVEC_BUILTIN_VCMPEQUH: case ALTIVEC_BUILTIN_VCMPEQUW: case P8V_BUILTIN_VCMPEQUD: + case P10V_BUILTIN_VCMPEQUT: fold_compare_helper (gsi, EQ_EXPR, stmt); return true; case P9V_BUILTIN_CMPNEB: case P9V_BUILTIN_CMPNEH: case P9V_BUILTIN_CMPNEW: + case P10V_BUILTIN_CMPNET: fold_compare_helper (gsi, NE_EXPR, stmt); return true; @@ -12569,6 +12681,8 @@ rs6000_gimple_fold_builtin (gimple_stmt_iterator *gsi) case VSX_BUILTIN_CMPGE_U4SI: case VSX_BUILTIN_CMPGE_2DI: case VSX_BUILTIN_CMPGE_U2DI: + case P10V_BUILTIN_CMPGE_1TI: + case P10V_BUILTIN_CMPGE_U1TI: fold_compare_helper (gsi, GE_EXPR, stmt); return true; @@ -12580,6 +12694,8 @@ rs6000_gimple_fold_builtin (gimple_stmt_iterator *gsi) case ALTIVEC_BUILTIN_VCMPGTUW: case P8V_BUILTIN_VCMPGTUD: case P8V_BUILTIN_VCMPGTSD: + case P10V_BUILTIN_VCMPGTUT: + case P10V_BUILTIN_VCMPGTST: fold_compare_helper (gsi, GT_EXPR, stmt); return true; @@ -12591,6 +12707,8 @@ rs6000_gimple_fold_builtin (gimple_stmt_iterator *gsi) case VSX_BUILTIN_CMPLE_U4SI: case VSX_BUILTIN_CMPLE_2DI: case VSX_BUILTIN_CMPLE_U2DI: + case P10V_BUILTIN_CMPLE_1TI: + case P10V_BUILTIN_CMPLE_U1TI: fold_compare_helper (gsi, LE_EXPR, stmt); return true; @@ -13318,6 +13436,8 @@ rs6000_init_builtins (void) ? 
"__vector __bool long" : "__vector __bool long long", bool_long_long_type_node, 2); + bool_V1TI_type_node = rs6000_vector_type ("__vector __bool __int128", + intTI_type_node, 1); pixel_V8HI_type_node = rs6000_vector_type ("__vector __pixel", pixel_type_node, 8); @@ -13515,6 +13635,10 @@ altivec_init_builtins (void) = build_function_type_list (integer_type_node, integer_type_node, V2DI_type_node, V2DI_type_node, NULL_TREE); + tree int_ftype_int_v1ti_v1ti + = build_function_type_list (integer_type_node, + integer_type_node, V1TI_type_node, + V1TI_type_node, NULL_TREE); tree void_ftype_v4si = build_function_type_list (void_type_node, V4SI_type_node, NULL_TREE); tree v8hi_ftype_void @@ -13882,6 +14006,9 @@ altivec_init_builtins (void) case E_VOIDmode: type = int_ftype_int_opaque_opaque; break; + case E_V1TImode: + type = int_ftype_int_v1ti_v1ti; + break; case E_V2DImode: type = int_ftype_int_v2di_v2di; break; @@ -14487,12 +14614,16 @@ builtin_function_type (machine_mode mode_ret, machine_mode mode_arg0, case P10V_BUILTIN_XXGENPCVM_V2DI: case P10V_BUILTIN_DIVEU_V4SI: case P10V_BUILTIN_DIVEU_V2DI: + case P10V_BUILTIN_DIVEU_V1TI: case P10V_BUILTIN_DIVU_V4SI: case P10V_BUILTIN_DIVU_V2DI: + case P10V_BUILTIN_MODU_V1TI: case P10V_BUILTIN_MODU_V2DI: case P10V_BUILTIN_MODU_V4SI: case P10V_BUILTIN_MULHU_V2DI: case P10V_BUILTIN_MULHU_V4SI: + case P10V_BUILTIN_VMULEUD: + case P10V_BUILTIN_VMULOUD: h.uns_p[0] = 1; h.uns_p[1] = 1; h.uns_p[2] = 1; @@ -14592,10 +14723,13 @@ builtin_function_type (machine_mode mode_ret, machine_mode mode_arg0, case VSX_BUILTIN_CMPGE_U8HI: case VSX_BUILTIN_CMPGE_U4SI: case VSX_BUILTIN_CMPGE_U2DI: + case P10V_BUILTIN_CMPGE_U1TI: case ALTIVEC_BUILTIN_VCMPGTUB: case ALTIVEC_BUILTIN_VCMPGTUH: case ALTIVEC_BUILTIN_VCMPGTUW: case P8V_BUILTIN_VCMPGTUD: + case P10V_BUILTIN_VCMPGTUT: + case P10V_BUILTIN_VCMPEQUT: h.uns_p[1] = 1; h.uns_p[2] = 1; break; diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index b01bb5c..328dc10 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -20217,6 +20217,7 @@ rs6000_handle_altivec_attribute (tree *node, case 'b': switch (mode) { + case E_TImode: case E_V1TImode: result = bool_V1TI_type_node; break; case E_DImode: case E_V2DImode: result = bool_V2DI_type_node; break; case E_SImode: case E_V4SImode: result = bool_V4SI_type_node; break; case E_HImode: case E_V8HImode: result = bool_V8HI_type_node; break; diff --git a/gcc/config/rs6000/rs6000.h b/gcc/config/rs6000/rs6000.h index a5f7b1d..4ca6372 100644 --- a/gcc/config/rs6000/rs6000.h +++ b/gcc/config/rs6000/rs6000.h @@ -2321,7 +2321,6 @@ extern int frame_pointer_needed; #define RS6000_BTM_MMA MASK_MMA /* ISA 3.1 MMA. 
*/ #define RS6000_BTM_P10 MASK_POWER10 - #define RS6000_BTM_COMMON (RS6000_BTM_ALTIVEC \ | RS6000_BTM_VSX \ | RS6000_BTM_P8_VECTOR \ @@ -2434,6 +2433,7 @@ enum rs6000_builtin_type_index RS6000_BTI_bool_V8HI, /* __vector __bool short */ RS6000_BTI_bool_V4SI, /* __vector __bool int */ RS6000_BTI_bool_V2DI, /* __vector __bool long */ + RS6000_BTI_bool_V1TI, /* __vector __bool 128-bit */ RS6000_BTI_pixel_V8HI, /* __vector __pixel */ RS6000_BTI_long, /* long_integer_type_node */ RS6000_BTI_unsigned_long, /* long_unsigned_type_node */ @@ -2487,6 +2487,7 @@ enum rs6000_builtin_type_index #define bool_V8HI_type_node (rs6000_builtin_types[RS6000_BTI_bool_V8HI]) #define bool_V4SI_type_node (rs6000_builtin_types[RS6000_BTI_bool_V4SI]) #define bool_V2DI_type_node (rs6000_builtin_types[RS6000_BTI_bool_V2DI]) +#define bool_V1TI_type_node (rs6000_builtin_types[RS6000_BTI_bool_V1TI]) #define pixel_V8HI_type_node (rs6000_builtin_types[RS6000_BTI_pixel_V8HI]) #define long_long_integer_type_internal_node (rs6000_builtin_types[RS6000_BTI_long_long]) diff --git a/gcc/config/rs6000/vector.md b/gcc/config/rs6000/vector.md index 3446b03..ea88a97 100644 --- a/gcc/config/rs6000/vector.md +++ b/gcc/config/rs6000/vector.md @@ -53,7 +53,7 @@ (define_mode_iterator VEC_N [V4SI V4SF V2DI V2DF V1TI KF TF]) ;; Vector comparison modes -(define_mode_iterator VEC_C [V16QI V8HI V4SI V2DI V4SF V2DF]) +(define_mode_iterator VEC_C [V16QI V8HI V4SI V2DI V4SF V2DF V1TI]) ;; Vector init/extract modes (define_mode_iterator VEC_E [V16QI V8HI V4SI V2DI V4SF V2DF]) @@ -697,6 +697,17 @@ operands[3] = gen_reg_rtx_and_attrs (operands[0]); }) +(define_expand "vector_nltv1ti" + [(set (match_operand:V1TI 3 "vlogical_operand") + (gt:V1TI (match_operand:V1TI 2 "vlogical_operand") + (match_operand:V1TI 1 "vlogical_operand"))) + (set (match_operand:V1TI 0 "vlogical_operand") + (not:V1TI (match_dup 3)))] + "TARGET_POWER10" +{ + operands[3] = gen_reg_rtx_and_attrs (operands[0]); +}) + (define_expand "vector_gtu" [(set (match_operand:VEC_I 0 "vint_operand") (gtu:VEC_I (match_operand:VEC_I 1 "vint_operand") @@ -704,6 +715,13 @@ "VECTOR_UNIT_ALTIVEC_OR_VSX_P (mode)" "") +(define_expand "vector_gtuv1ti" + [(set (match_operand:V1TI 0 "altivec_register_operand") + (gtu:V1TI (match_operand:V1TI 1 "altivec_register_operand") + (match_operand:V1TI 2 "altivec_register_operand")))] + "TARGET_POWER10" + "") + ; >= for integer vectors: swap operands and apply not-greater-than (define_expand "vector_nltu" [(set (match_operand:VEC_I 3 "vlogical_operand") @@ -716,6 +734,17 @@ operands[3] = gen_reg_rtx_and_attrs (operands[0]); }) +(define_expand "vector_nltuv1ti" + [(set (match_operand:V1TI 3 "vlogical_operand") + (gtu:V1TI (match_operand:V1TI 2 "vlogical_operand") + (match_operand:V1TI 1 "vlogical_operand"))) + (set (match_operand:V1TI 0 "vlogical_operand") + (not:V1TI (match_dup 3)))] + "TARGET_POWER10" +{ + operands[3] = gen_reg_rtx_and_attrs (operands[0]); +}) + (define_expand "vector_geu" [(set (match_operand:VEC_I 0 "vint_operand") (geu:VEC_I (match_operand:VEC_I 1 "vint_operand") @@ -735,6 +764,17 @@ operands[3] = gen_reg_rtx_and_attrs (operands[0]); }) +(define_expand "vector_ngtv1ti" + [(set (match_operand:V1TI 3 "vlogical_operand") + (gt:V1TI (match_operand:V1TI 1 "vlogical_operand") + (match_operand:V1TI 2 "vlogical_operand"))) + (set (match_operand:V1TI 0 "vlogical_operand") + (not:V1TI (match_dup 3)))] + "TARGET_POWER10" +{ + operands[3] = gen_reg_rtx_and_attrs (operands[0]); +}) + (define_expand "vector_ngtu" [(set (match_operand:VEC_I 3 
"vlogical_operand") (gtu:VEC_I (match_operand:VEC_I 1 "vlogical_operand") @@ -746,6 +786,17 @@ operands[3] = gen_reg_rtx_and_attrs (operands[0]); }) +(define_expand "vector_ngtuv1ti" + [(set (match_operand:V1TI 3 "vlogical_operand") + (gtu:V1TI (match_operand:V1TI 1 "vlogical_operand") + (match_operand:V1TI 2 "vlogical_operand"))) + (set (match_operand:V1TI 0 "vlogical_operand") + (not:V1TI (match_dup 3)))] + "TARGET_POWER10" +{ + operands[3] = gen_reg_rtx_and_attrs (operands[0]); +}) + ; There are 14 possible vector FP comparison operators, gt and eq of them have ; been expanded above, so just support 12 remaining operators here. @@ -894,6 +945,18 @@ "VECTOR_UNIT_ALTIVEC_OR_VSX_P (mode)" "") +(define_expand "vector_eq_v1ti_p" + [(parallel + [(set (reg:CC CR6_REGNO) + (unspec:CC [(eq:CC (match_operand:V1TI 1 "altivec_register_operand") + (match_operand:V1TI 2 "altivec_register_operand"))] + UNSPEC_PREDICATE)) + (set (match_operand:V1TI 0 "vlogical_operand") + (eq:V1TI (match_dup 1) + (match_dup 2)))])] + "TARGET_POWER10" + "") + ;; This expansion handles the V16QI, V8HI, and V4SI modes in the ;; implementation of the vec_all_ne built-in functions on Power9. (define_expand "vector_ne__p" @@ -976,6 +1039,23 @@ operands[3] = gen_reg_rtx (V2DImode); }) +(define_expand "vector_ne_v1ti_p" + [(parallel + [(set (reg:CC CR6_REGNO) + (unspec:CC [(eq:CC (match_operand:V1TI 1 "altivec_register_operand") + (match_operand:V1TI 2 "altivec_register_operand"))] + UNSPEC_PREDICATE)) + (set (match_dup 3) + (eq:V1TI (match_dup 1) + (match_dup 2)))]) + (set (match_operand:SI 0 "register_operand" "=r") + (eq:SI (reg:CC CR6_REGNO) + (const_int 0)))] + "TARGET_POWER10" +{ + operands[3] = gen_reg_rtx (V1TImode); +}) + ;; This expansion handles the V2DI mode in the implementation of the ;; vec_any_eq built-in function on Power9. ;; @@ -1002,6 +1082,26 @@ operands[3] = gen_reg_rtx (V2DImode); }) +(define_expand "vector_ae_v1ti_p" + [(parallel + [(set (reg:CC CR6_REGNO) + (unspec:CC [(eq:CC (match_operand:V1TI 1 "altivec_register_operand") + (match_operand:V1TI 2 "altivec_register_operand"))] + UNSPEC_PREDICATE)) + (set (match_dup 3) + (eq:V1TI (match_dup 1) + (match_dup 2)))]) + (set (match_operand:SI 0 "register_operand" "=r") + (eq:SI (reg:CC CR6_REGNO) + (const_int 0))) + (set (match_dup 0) + (xor:SI (match_dup 0) + (const_int 1)))] + "TARGET_POWER10" +{ + operands[3] = gen_reg_rtx (V1TImode); +}) + ;; This expansion handles the V4SF and V2DF modes in the Power9 ;; implementation of the vec_all_ne built-in functions. Note that the ;; expansions for this pattern with these modes makes no use of power9- @@ -1061,6 +1161,18 @@ "VECTOR_UNIT_ALTIVEC_OR_VSX_P (mode)" "") +(define_expand "vector_gt_v1ti_p" + [(parallel + [(set (reg:CC CR6_REGNO) + (unspec:CC [(gt:CC (match_operand:V1TI 1 "vlogical_operand") + (match_operand:V1TI 2 "vlogical_operand"))] + UNSPEC_PREDICATE)) + (set (match_operand:V1TI 0 "vlogical_operand") + (gt:V1TI (match_dup 1) + (match_dup 2)))])] + "TARGET_POWER10" + "") + (define_expand "vector_ge__p" [(parallel [(set (reg:CC CR6_REGNO) @@ -1085,6 +1197,18 @@ "VECTOR_UNIT_ALTIVEC_OR_VSX_P (mode)" "") +(define_expand "vector_gtu_v1ti_p" + [(parallel + [(set (reg:CC CR6_REGNO) + (unspec:CC [(gtu:CC (match_operand:V1TI 1 "altivec_register_operand") + (match_operand:V1TI 2 "altivec_register_operand"))] + UNSPEC_PREDICATE)) + (set (match_operand:V1TI 0 "altivec_register_operand") + (gtu:V1TI (match_dup 1) + (match_dup 2)))])] + "TARGET_POWER10" + "") + ;; AltiVec/VSX predicates. 
;; This expansion is triggered during expansion of predicate built-in @@ -1460,6 +1584,20 @@ "VECTOR_UNIT_ALTIVEC_OR_VSX_P (mode)" "") +(define_expand "vrotlv1ti3" + [(set (match_operand:V1TI 0 "vsx_register_operand" "=v") + (rotate:V1TI (match_operand:V1TI 1 "vsx_register_operand" "v") + (match_operand:V1TI 2 "vsx_register_operand" "v")))] + "TARGET_POWER10" +{ + /* Shift amount in needs to be put in bits[57:63] of 128-bit operand2. */ + rtx tmp = gen_reg_rtx (V1TImode); + + emit_insn (gen_xxswapd_v1ti (tmp, operands[2])); + emit_insn (gen_altivec_vrlq (operands[0], operands[1], tmp)); + DONE; +}) + ;; Expanders for rotatert to make use of vrotl (define_expand "vrotr3" [(set (match_operand:VEC_I 0 "vint_operand") @@ -1481,6 +1619,21 @@ "VECTOR_UNIT_ALTIVEC_OR_VSX_P (mode)" "") +;; No immediate version of this 128-bit instruction +(define_expand "vashlv1ti3" + [(set (match_operand:V1TI 0 "vsx_register_operand" "=v") + (ashift:V1TI (match_operand:V1TI 1 "vsx_register_operand" "v") + (match_operand:V1TI 2 "vsx_register_operand" "v")))] + "TARGET_POWER10" +{ + /* Shift amount in needs to be put in bits[57:63] of 128-bit operand2. */ + rtx tmp = gen_reg_rtx (V1TImode); + + emit_insn (gen_xxswapd_v1ti (tmp, operands[2])); + emit_insn (gen_altivec_vslq (operands[0], operands[1], tmp)); + DONE; +}) + ;; Expanders for logical shift right on each vector element (define_expand "vlshr3" [(set (match_operand:VEC_I 0 "vint_operand") @@ -1489,6 +1642,21 @@ "VECTOR_UNIT_ALTIVEC_OR_VSX_P (mode)" "") +;; No immediate version of this 128-bit instruction +(define_expand "vlshrv1ti3" + [(set (match_operand:V1TI 0 "vsx_register_operand" "=v") + (lshiftrt:V1TI (match_operand:V1TI 1 "vsx_register_operand" "v") + (match_operand:V1TI 2 "vsx_register_operand" "v")))] + "TARGET_POWER10" +{ + /* Shift amount in needs to be put into bits[57:63] of 128-bit operand2. */ + rtx tmp = gen_reg_rtx (V1TImode); + + emit_insn (gen_xxswapd_v1ti (tmp, operands[2])); + emit_insn (gen_altivec_vsrq (operands[0], operands[1], tmp)); + DONE; +}) + ;; Expanders for arithmetic shift right on each vector element (define_expand "vashr3" [(set (match_operand:VEC_I 0 "vint_operand") @@ -1496,6 +1664,22 @@ (match_operand:VEC_I 2 "vint_operand")))] "VECTOR_UNIT_ALTIVEC_OR_VSX_P (mode)" "") + +;; No immediate version of this 128-bit instruction +(define_expand "vashrv1ti3" + [(set (match_operand:V1TI 0 "vsx_register_operand" "=v") + (ashiftrt:V1TI (match_operand:V1TI 1 "vsx_register_operand" "v") + (match_operand:V1TI 2 "vsx_register_operand" "v")))] + "TARGET_POWER10" +{ + /* Shift amount in needs to be put into bits[57:63] of 128-bit operand2. */ + rtx tmp = gen_reg_rtx (V1TImode); + + emit_insn (gen_xxswapd_v1ti (tmp, operands[2])); + emit_insn (gen_altivec_vsraq (operands[0], operands[1], tmp)); + DONE; +}) + ;; Vector reduction expanders for VSX ; The (VEC_reduc:... 
diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md index bcb92be..5403d02 100644 --- a/gcc/config/rs6000/vsx.md +++ b/gcc/config/rs6000/vsx.md @@ -302,6 +302,12 @@ UNSPEC_VSX_XXSPLTD UNSPEC_VSX_DIVSD UNSPEC_VSX_DIVUD + UNSPEC_VSX_DIVSQ + UNSPEC_VSX_DIVUQ + UNSPEC_VSX_DIVESQ + UNSPEC_VSX_DIVEUQ + UNSPEC_VSX_MODSQ + UNSPEC_VSX_MODUQ UNSPEC_VSX_MULSD UNSPEC_VSX_SIGN_EXTEND UNSPEC_VSX_XVCVBF16SPN @@ -1781,6 +1787,61 @@ } [(set_attr "type" "div")]) +;; Vector integer signed/unsigned divide +(define_insn "vsx_div_v1ti" + [(set (match_operand:V1TI 0 "vsx_register_operand" "=v") + (unspec:V1TI [(match_operand:V1TI 1 "vsx_register_operand" "v") + (match_operand:V1TI 2 "vsx_register_operand" "v")] + UNSPEC_VSX_DIVSQ))] + "TARGET_POWER10" + "vdivsq %0,%1,%2" + [(set_attr "type" "div")]) + +(define_insn "vsx_udiv_v1ti" + [(set (match_operand:V1TI 0 "vsx_register_operand" "=v") + (unspec:V1TI [(match_operand:V1TI 1 "vsx_register_operand" "v") + (match_operand:V1TI 2 "vsx_register_operand" "v")] + UNSPEC_VSX_DIVUQ))] + "TARGET_POWER10" + "vdivuq %0,%1,%2" + [(set_attr "type" "div")]) + +(define_insn "vsx_dives_v1ti" + [(set (match_operand:V1TI 0 "vsx_register_operand" "=v") + (unspec:V1TI [(match_operand:V1TI 1 "vsx_register_operand" "v") + (match_operand:V1TI 2 "vsx_register_operand" "v")] + UNSPEC_VSX_DIVESQ))] + "TARGET_POWER10" + "vdivesq %0,%1,%2" + [(set_attr "type" "div")]) + +(define_insn "vsx_diveu_v1ti" + [(set (match_operand:V1TI 0 "vsx_register_operand" "=v") + (unspec:V1TI [(match_operand:V1TI 1 "vsx_register_operand" "v") + (match_operand:V1TI 2 "vsx_register_operand" "v")] + UNSPEC_VSX_DIVEUQ))] + "TARGET_POWER10" + "vdiveuq %0,%1,%2" + [(set_attr "type" "div")]) + +(define_insn "vsx_mods_v1ti" + [(set (match_operand:V1TI 0 "vsx_register_operand" "=v") + (unspec:V1TI [(match_operand:V1TI 1 "vsx_register_operand" "v") + (match_operand:V1TI 2 "vsx_register_operand" "v")] + UNSPEC_VSX_MODSQ))] + "TARGET_POWER10" + "vmodsq %0,%1,%2" + [(set_attr "type" "div")]) + +(define_insn "vsx_modu_v1ti" + [(set (match_operand:V1TI 0 "vsx_register_operand" "=v") + (unspec:V1TI [(match_operand:V1TI 1 "vsx_register_operand" "v") + (match_operand:V1TI 2 "vsx_register_operand" "v")] + UNSPEC_VSX_MODUQ))] + "TARGET_POWER10" + "vmoduq %0,%1,%2" + [(set_attr "type" "div")]) + ;; *tdiv* instruction returning the FG flag (define_expand "vsx_tdiv3_fg" [(set (match_dup 3) @@ -3126,6 +3187,21 @@ "xxpermdi %x0,%x1,%x1,2" [(set_attr "type" "vecperm")]) +;; Swap upper/lower 64-bit values in a 128-bit vector +(define_insn "xxswapd_v1ti" + [(set (match_operand:V1TI 0 "vsx_register_operand" "=v") + (subreg:V1TI + (vec_select:V2DI + (subreg:V2DI + (match_operand:V1TI 1 "vsx_register_operand" "v") 0 ) + (parallel [(const_int 1)(const_int 0)])) + 0))] + "TARGET_POWER10" +;; AIX does not support extended mnemonic xxswapd. Use the basic +;; mnemonic xxpermdi instead. 
+ "xxpermdi %x0,%x1,%x1,2" + [(set_attr "type" "vecperm")]) + (define_insn "xxgenpcvm__internal" [(set (match_operand:VSX_EXTRACT_I4 0 "altivec_register_operand" "=wa") (unspec:VSX_EXTRACT_I4 @@ -5525,6 +5601,19 @@ "vcmpneb %0,%1,%2" [(set_attr "type" "vecsimple")]) +;; Vector Compare Not Equal v1ti (specified/not+eq:) +(define_expand "vcmpnet" + [(set (match_operand:V1TI 0 "altivec_register_operand") + (not:V1TI + (eq:V1TI (match_operand:V1TI 1 "altivec_register_operand") + (match_operand:V1TI 2 "altivec_register_operand"))))] + "TARGET_POWER10" +{ + emit_insn (gen_eqvv1ti3 (operands[0], operands[1], operands[2])); + emit_insn (gen_one_cmplv1ti2 (operands[0], operands[0])); + DONE; +}) + ;; Vector Compare Not Equal or Zero Byte (define_insn "vcmpnezb" [(set (match_operand:V16QI 0 "altivec_register_operand" "=v") -- cgit v1.1 From 976ffcf87da321a3eead9715032e599b226fcbf7 Mon Sep 17 00:00:00 2001 From: Carl Love Date: Tue, 24 Nov 2020 18:18:05 -0600 Subject: Add 128-bit int to 128-bit DFP (floattitd2) and 128-bit DFP to 128-bit int (fixtdti2) support 2021-06-08 Carl Love gcc/ChangeLog * config/rs6000/dfp.md (floattitd2, fixtdti2): New define_insns. gcc/testsuite/ChangeLog * gcc.target/powerpc/int_128bit-runnable.c: Add 128-bit DFP conversion tests. --- gcc/config/rs6000/dfp.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/dfp.md b/gcc/config/rs6000/dfp.md index 026be5d..b89d5ec 100644 --- a/gcc/config/rs6000/dfp.md +++ b/gcc/config/rs6000/dfp.md @@ -226,6 +226,13 @@ [(set_attr "type" "dfp") (set_attr "size" "128")]) +(define_insn "floattitd2" + [(set (match_operand:TD 0 "gpc_reg_operand" "=d") + (float:TD (match_operand:TI 1 "gpc_reg_operand" "v")))] + "TARGET_POWER10" + "dcffixqq %0,%1" + [(set_attr "type" "dfp")]) + ;; Convert a decimal64/128 to a decimal64/128 whose value is an integer. ;; This is the first stage of converting it to an integer type. @@ -247,6 +254,13 @@ "dctfix %0,%1" [(set_attr "type" "dfp") (set_attr "size" "")]) + +(define_insn "fixtdti2" + [(set (match_operand:TI 0 "gpc_reg_operand" "=v") + (fix:TI (match_operand:TD 1 "gpc_reg_operand" "d")))] + "TARGET_POWER10" + "dctfixqq %0,%1" + [(set_attr "type" "dfp")]) ;; Decimal builtin support -- cgit v1.1 From f170186446d03638cd82e37aa2a1bcf4dc1adba2 Mon Sep 17 00:00:00 2001 From: Carl Love Date: Wed, 16 Sep 2020 16:05:49 -0500 Subject: rs6000, Add test 128-bit shifts for just the int128 type. This patch also renames and moves the VSX_TI iterator from vsx.md to VEC_TI in vector.md. The uses of VEC_TI are also updated. 2021-04-29 Carl Love gcc/ChangeLog * config/rs6000/altivec.md (altivec_vslq, altivec_vsrq): Rename to altivec_vslq_, altivec_vsrq_, mode VEC_TI. * config/rs6000/vector.md (VEC_TI): Was named VSX_TI in vsx.md. (vashlv1ti3): Change to vashl3, mode VEC_TI. (vlshrv1ti3): Change to vlshr3, mode VEC_TI. * config/rs6000/vsx.md (VSX_TI): Remove define_mode_iterator. Update uses of VSX_TI to VEC_TI. gcc/testsuite/ChangeLog * gcc.target/powerpc/int_128bit-runnable.c: Add shift_right, shift_left tests. 
--- gcc/config/rs6000/altivec.md | 16 ++++++++-------- gcc/config/rs6000/vector.md | 27 +++++++++++++++------------ gcc/config/rs6000/vsx.md | 33 +++++++++++++++------------------ 3 files changed, 38 insertions(+), 38 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index 0fa69b7..a27e466 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -2226,10 +2226,10 @@ "vsl %0,%1,%2" [(set_attr "type" "vecsimple")]) -(define_insn "altivec_vslq" - [(set (match_operand:V1TI 0 "vsx_register_operand" "=v") - (ashift:V1TI (match_operand:V1TI 1 "vsx_register_operand" "v") - (match_operand:V1TI 2 "vsx_register_operand" "v")))] +(define_insn "altivec_vslq_" + [(set (match_operand:VEC_TI 0 "vsx_register_operand" "=v") + (ashift:VEC_TI (match_operand:VEC_TI 1 "vsx_register_operand" "v") + (match_operand:VEC_TI 2 "vsx_register_operand" "v")))] "TARGET_POWER10" /* Shift amount in needs to be in bits[57:63] of 128-bit operand. */ "vslq %0,%1,%2" @@ -2243,10 +2243,10 @@ "vsr %0,%1,%2" [(set_attr "type" "vecsimple")]) -(define_insn "altivec_vsrq" - [(set (match_operand:V1TI 0 "vsx_register_operand" "=v") - (lshiftrt:V1TI (match_operand:V1TI 1 "vsx_register_operand" "v") - (match_operand:V1TI 2 "vsx_register_operand" "v")))] +(define_insn "altivec_vsrq_" + [(set (match_operand:VEC_TI 0 "vsx_register_operand" "=v") + (lshiftrt:VEC_TI (match_operand:VEC_TI 1 "vsx_register_operand" "v") + (match_operand:VEC_TI 2 "vsx_register_operand" "v")))] "TARGET_POWER10" /* Shift amount in needs to be in bits[57:63] of 128-bit operand. */ "vsrq %0,%1,%2" diff --git a/gcc/config/rs6000/vector.md b/gcc/config/rs6000/vector.md index ea88a97..7e36c788 100644 --- a/gcc/config/rs6000/vector.md +++ b/gcc/config/rs6000/vector.md @@ -26,6 +26,9 @@ ;; Vector int modes (define_mode_iterator VEC_I [V16QI V8HI V4SI V2DI]) +;; 128-bit int modes +(define_mode_iterator VEC_TI [V1TI TI]) + ;; Vector int modes for parity (define_mode_iterator VEC_IP [V8HI V4SI @@ -1620,17 +1623,17 @@ "") ;; No immediate version of this 128-bit instruction -(define_expand "vashlv1ti3" - [(set (match_operand:V1TI 0 "vsx_register_operand" "=v") - (ashift:V1TI (match_operand:V1TI 1 "vsx_register_operand" "v") - (match_operand:V1TI 2 "vsx_register_operand" "v")))] +(define_expand "vashl3" + [(set (match_operand:VEC_TI 0 "vsx_register_operand" "=v") + (ashift:VEC_TI (match_operand:VEC_TI 1 "vsx_register_operand") + (match_operand:VEC_TI 2 "vsx_register_operand")))] "TARGET_POWER10" { /* Shift amount in needs to be put in bits[57:63] of 128-bit operand2. */ - rtx tmp = gen_reg_rtx (V1TImode); + rtx tmp = gen_reg_rtx (mode); emit_insn (gen_xxswapd_v1ti (tmp, operands[2])); - emit_insn (gen_altivec_vslq (operands[0], operands[1], tmp)); + emit_insn(gen_altivec_vslq_ (operands[0], operands[1], tmp)); DONE; }) @@ -1643,17 +1646,17 @@ "") ;; No immediate version of this 128-bit instruction -(define_expand "vlshrv1ti3" - [(set (match_operand:V1TI 0 "vsx_register_operand" "=v") - (lshiftrt:V1TI (match_operand:V1TI 1 "vsx_register_operand" "v") - (match_operand:V1TI 2 "vsx_register_operand" "v")))] +(define_expand "vlshr3" + [(set (match_operand:VEC_TI 0 "vsx_register_operand" "=v") + (lshiftrt:VEC_TI (match_operand:VEC_TI 1 "vsx_register_operand") + (match_operand:VEC_TI 2 "vsx_register_operand")))] "TARGET_POWER10" { /* Shift amount in needs to be put into bits[57:63] of 128-bit operand2. 
*/ - rtx tmp = gen_reg_rtx (V1TImode); + rtx tmp = gen_reg_rtx (mode); emit_insn (gen_xxswapd_v1ti (tmp, operands[2])); - emit_insn (gen_altivec_vsrq (operands[0], operands[1], tmp)); + emit_insn(gen_altivec_vsrq_ (operands[0], operands[1], tmp)); DONE; }) diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md index 5403d02..ce8f82b 100644 --- a/gcc/config/rs6000/vsx.md +++ b/gcc/config/rs6000/vsx.md @@ -37,9 +37,6 @@ TI V1TI]) -;; Iterator for 128-bit integer types that go in a single vector register. -(define_mode_iterator VSX_TI [TI V1TI]) - ;; Iterator for the 2 32-bit vector types (define_mode_iterator VSX_W [V4SF V4SI]) @@ -952,9 +949,9 @@ ;; special V1TI container class, which it is not appropriate to use vec_select ;; for the type. (define_insn "*vsx_le_permute_" - [(set (match_operand:VSX_TI 0 "nonimmediate_operand" "=wa,wa,Z,&r,&r,Q") - (rotate:VSX_TI - (match_operand:VSX_TI 1 "input_operand" "wa,Z,wa,r,Q,r") + [(set (match_operand:VEC_TI 0 "nonimmediate_operand" "=wa,wa,Z,&r,&r,Q") + (rotate:VEC_TI + (match_operand:VEC_TI 1 "input_operand" "wa,Z,wa,r,Q,r") (const_int 64)))] "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR" "@ @@ -968,10 +965,10 @@ (set_attr "type" "vecperm,vecload,vecstore,*,load,store")]) (define_insn_and_split "*vsx_le_undo_permute_" - [(set (match_operand:VSX_TI 0 "vsx_register_operand" "=wa,wa") - (rotate:VSX_TI - (rotate:VSX_TI - (match_operand:VSX_TI 1 "vsx_register_operand" "0,wa") + [(set (match_operand:VEC_TI 0 "vsx_register_operand" "=wa,wa") + (rotate:VEC_TI + (rotate:VEC_TI + (match_operand:VEC_TI 1 "vsx_register_operand" "0,wa") (const_int 64)) (const_int 64)))] "!BYTES_BIG_ENDIAN && TARGET_VSX" @@ -1043,11 +1040,11 @@ ;; Peepholes to catch loads and stores for TImode if TImode landed in ;; GPR registers on a little endian system. (define_peephole2 - [(set (match_operand:VSX_TI 0 "int_reg_operand") - (rotate:VSX_TI (match_operand:VSX_TI 1 "memory_operand") + [(set (match_operand:VEC_TI 0 "int_reg_operand") + (rotate:VEC_TI (match_operand:VEC_TI 1 "memory_operand") (const_int 64))) - (set (match_operand:VSX_TI 2 "int_reg_operand") - (rotate:VSX_TI (match_dup 0) + (set (match_operand:VEC_TI 2 "int_reg_operand") + (rotate:VEC_TI (match_dup 0) (const_int 64)))] "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR && (rtx_equal_p (operands[0], operands[2]) @@ -1055,11 +1052,11 @@ [(set (match_dup 2) (match_dup 1))]) (define_peephole2 - [(set (match_operand:VSX_TI 0 "int_reg_operand") - (rotate:VSX_TI (match_operand:VSX_TI 1 "int_reg_operand") + [(set (match_operand:VEC_TI 0 "int_reg_operand") + (rotate:VEC_TI (match_operand:VEC_TI 1 "int_reg_operand") (const_int 64))) - (set (match_operand:VSX_TI 2 "memory_operand") - (rotate:VSX_TI (match_dup 0) + (set (match_operand:VEC_TI 2 "memory_operand") + (rotate:VEC_TI (match_dup 0) (const_int 64)))] "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR && peep2_reg_dead_p (2, operands[0])" -- cgit v1.1 From 9090f4807161876033f6bf0e1133364b38d91865 Mon Sep 17 00:00:00 2001 From: Carl Love Date: Wed, 21 Apr 2021 18:07:39 -0400 Subject: Conversions between 128-bit integer and floating point values. The files fixkfti-sw.c and fixunskfti-sw.c are renamed versions of fixkfti.c and fixunskfti.c respectively to do the conversions in software. The function names in the files were updated with the rename as well as some white spaces fixes. The file float128-p10.c contains the functions for using the ISA 3.1 hardware instructions to perform the conversions. 
2021-06-08 Carl Love gcc/ChangeLog * config/rs6000/rs6000.c (__fixkfti, __fixunskfti, __floattikf, __floatuntikf): Names changed to __fixkfti_sw, __fixunskfti_sw, __floattikf_sw, __floatuntikf_sw respectively. * config/rs6000/rs6000.md (floatti2, floatunsti2, fix_truncti2, fixuns_truncti2): Add define_insn for mode IEEE 128. gcc/testsuite/ChangeLog * gcc.target/powerpc/fp128_conversions.c: New file. * gcc.target/powerpc/int_128bit-runnable.c(vextsd2q, vcmpuq, vcmpsq, vcmpequq, vcmpequq., vcmpgtsq, vcmpgtsq. vcmpgtuq, vcmpgtuq.): Update scan-assembler-times. (ppc_native_128bit): Remove dg-require-effective-target. libgcc/ChangeLog * config.host: Add if test and set for libgcc_cv_powerpc_3_1_float128_hw. * config/rs6000/fixkfti.c: Renamed to fixkfti-sw.c. Change calls of __fixkfti to __fixkfti_sw. * config/rs6000/fixunskfti.c: Renamed to fixunskfti-sw.c. Change calls of __fixunskfti to __fixunskfti_sw. * config/rs6000/float128-p10.c (__floattikf_hw, __floatuntikf_hw, __fixkfti_hw, __fixunskfti_hw): New file. * config/rs6000/float128-ifunc.c (SW_OR_HW_ISA3_1): New macro. (__floattikf_resolve, __floatuntikf_resolve, __fixkfti_resolve, __fixunskfti_resolve): Add resolve functions. (__floattikf, __floatuntikf, __fixkfti, __fixunskfti): New functions. * config/rs6000/float128-sed (floattitf, __floatuntitf, __fixtfti, __fixunstfti): Add editor commands to change names. * config/rs6000/float128-sed-hw (__floattitf, __floatuntitf, __fixtfti, __fixunstfti): Add editor commands to change names. * config/rs6000/floattikf.c: Renamed to floattikf-sw.c. * config/rs6000/floatuntikf.c: Renamed to floatuntikf-sw.c. * config/rs6000/quad-float128.h (__floattikf_sw, __floatuntikf_sw, __fixkfti_sw, __fixunskfti_sw, __floattikf_hw, __floatuntikf_hw, __fixkfti_hw, __fixunskfti_hw, __floattikf, __floatuntikf, __fixkfti, __fixunskfti): New extern declarations. * config/rs6000/t-float128 (floattikf, floatuntikf, fixkfti, fixunskfti): Remove file names from fp128_ppc_funcs. (floattikf-sw, floatuntikf-sw, fixkfti-sw, fixunskfti-sw): Add file names to fp128_ppc_funcs. * config/rs6000/t-float128-hw(fp128_3_1_hw_funcs, fp128_3_1_hw_src, fp128_3_1_hw_static_obj, fp128_3_1_hw_shared_obj, fp128_3_1_hw_obj): Add variables for ISA 3.1 support. * config/rs6000/t-float128-p10-hw: New file. * configure: Update script for isa 3.1 128-bit float support. * configure.ac: Add check for 128-bit float hardware support. 
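As a rough illustration (not part of the patch), the conversions involved look like this in source form; on power10 the new define_insns emit the ISA 3.1 instructions directly, while on older hardware the calls keep going through the (renamed) software routines. Function names are made up, and IEEE 128-bit float support is assumed to be enabled:

    /* Illustrative sketch only.  */
    __float128
    i128_to_f128 (__int128 i)
    {
      return (__float128) i;	/* xscvsqqp, or the __floattikf library routine */
    }

    __int128
    f128_to_i128 (__float128 f)
    {
      return (__int128) f;	/* xscvqpsqz, or the __fixkfti library routine */
    }
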
--- gcc/config/rs6000/rs6000.c | 8 ++++---- gcc/config/rs6000/rs6000.md | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 328dc10..75c2cc4 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -11014,10 +11014,10 @@ init_float128_ieee (machine_mode mode) if (TARGET_POWERPC64) { - set_conv_libfunc (sfix_optab, TImode, mode, "__fixkfti"); - set_conv_libfunc (ufix_optab, TImode, mode, "__fixunskfti"); - set_conv_libfunc (sfloat_optab, mode, TImode, "__floattikf"); - set_conv_libfunc (ufloat_optab, mode, TImode, "__floatuntikf"); + set_conv_libfunc (sfix_optab, TImode, mode, "__fixkfti_sw"); + set_conv_libfunc (ufix_optab, TImode, mode, "__fixunskfti_sw"); + set_conv_libfunc (sfloat_optab, mode, TImode, "__floattikf_sw"); + set_conv_libfunc (ufloat_optab, mode, TImode, "__floatuntikf_sw"); } } diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index 3f59b54..89c70f4 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -6441,6 +6441,42 @@ xscvsxddp %x0,%x1" [(set_attr "type" "fp")]) +(define_insn "floatti2" + [(set (match_operand:IEEE128 0 "vsx_register_operand" "=v") + (float:IEEE128 (match_operand:TI 1 "vsx_register_operand" "v")))] + "TARGET_POWER10" +{ + return "xscvsqqp %0,%1"; +} + [(set_attr "type" "fp")]) + +(define_insn "floatunsti2" + [(set (match_operand:IEEE128 0 "vsx_register_operand" "=v") + (unsigned_float:IEEE128 (match_operand:TI 1 "vsx_register_operand" "v")))] + "TARGET_POWER10" +{ + return "xscvuqqp %0,%1"; +} + [(set_attr "type" "fp")]) + +(define_insn "fix_truncti2" + [(set (match_operand:TI 0 "vsx_register_operand" "=v") + (fix:TI (match_operand:IEEE128 1 "vsx_register_operand" "v")))] + "TARGET_POWER10" +{ + return "xscvqpsqz %0,%1"; +} + [(set_attr "type" "fp")]) + +(define_insn "fixuns_truncti2" + [(set (match_operand:TI 0 "vsx_register_operand" "=v") + (unsigned_fix:TI (match_operand:IEEE128 1 "vsx_register_operand" "v")))] + "TARGET_POWER10" +{ + return "xscvqpuqz %0,%1"; +} + [(set_attr "type" "fp")]) + ; Allow the combiner to merge source memory operands to the conversion so that ; the optimizer/register allocator doesn't try to load the value too early in a ; GPR and then use store/load to move it to a FPR and suffer from a store-load -- cgit v1.1 From db042e1603db5057314c404eded73c45f60ad2d6 Mon Sep 17 00:00:00 2001 From: Carl Love Date: Mon, 3 Feb 2020 14:41:42 -0600 Subject: RS6000 Add 128-bit Binary Integer sign extend operations This patch adds the 128-bit sign extension instruction support and corresponding builtin support. RS6000 Add 128-bit Binary Integer sign extend operations 2021-06-08 Carl Love gcc/ChangeLog * config/rs6000/altivec.h (vec_signextll, vec_signexti, vec_signextq): Add define for new builtins. * config/rs6000/altivec.md(altivec_vreveti2): Add define_expand. * config/rs6000/rs6000-builtin.def (VSIGNEXTI, VSIGNEXTLL): Add overloaded builtin definitions. (VSIGNEXTSB2W, VSIGNEXTSH2W, VSIGNEXTSB2D, VSIGNEXTSH2D,VSIGNEXTSW2D, VSIGNEXTSD2Q): Add builtin expansions. (SIGNEXT): Add P10 overload definition. * config/rs6000/rs6000-call.c (P9V_BUILTIN_VEC_VSIGNEXTI, P9V_BUILTIN_VEC_VSIGNEXTLL, P10_BUILTIN_VEC_SIGNEXT): Add overloaded argument definitions. * config/rs6000/vsx.md (vsx_sign_extend_v2di_v1ti): Add define_insn. (vsignextend_v2di_v1ti, vsignextend_qi_, vsignextend_hi_, vsignextend_si_v2di)[VIlong]: Add define_expand. 
Make define_insn vsx_sign_extend_si_v2di visible. * doc/extend.texi: Add documentation for the vec_signexti, vec_signextll builtins and vec_signextq. gcc/testsuite/ChangeLog * gcc.target/powerpc/int_128bit-runnable.c (extsd2q): Update expected count. Add tests for vec_signextq. * gcc.target/powerpc/p9-sign_extend-runnable.c: New test case. --- gcc/config/rs6000/altivec.h | 3 ++ gcc/config/rs6000/altivec.md | 24 +++++++++++ gcc/config/rs6000/rs6000-builtin.def | 12 ++++++ gcc/config/rs6000/rs6000-call.c | 16 +++++++ gcc/config/rs6000/vsx.md | 83 +++++++++++++++++++++++++++++++++++- 5 files changed, 137 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/altivec.h b/gcc/config/rs6000/altivec.h index 314695a..5b631c7 100644 --- a/gcc/config/rs6000/altivec.h +++ b/gcc/config/rs6000/altivec.h @@ -497,6 +497,8 @@ #define vec_xlx __builtin_vec_vextulx #define vec_xrx __builtin_vec_vexturx +#define vec_signexti __builtin_vec_vsignexti +#define vec_signextll __builtin_vec_vsignextll #endif @@ -715,6 +717,7 @@ __altivec_scalar_pred(vec_any_nle, #define vec_step(x) __builtin_vec_step (* (__typeof__ (x) *) 0) #ifdef _ARCH_PWR10 +#define vec_signextq __builtin_vec_vsignextq #define vec_dive __builtin_vec_dive #define vec_mod __builtin_vec_mod diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index a27e466..dad3a07 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -4291,6 +4291,30 @@ }) ;; Vector reverse elements +(define_expand "altivec_vreveti2" + [(set (match_operand:TI 0 "register_operand" "=v") + (unspec:TI [(match_operand:TI 1 "register_operand" "v")] + UNSPEC_VREVEV))] + "TARGET_ALTIVEC" +{ + int i, j, size, num_elements; + rtvec v = rtvec_alloc (16); + rtx mask = gen_reg_rtx (V16QImode); + + size = GET_MODE_UNIT_SIZE (TImode); + num_elements = GET_MODE_NUNITS (TImode); + + for (j = 0; j < num_elements; j++) + for (i = 0; i < size; i++) + RTVEC_ELT (v, i + j * size) + = GEN_INT (i + (num_elements - 1 - j) * size); + + emit_insn (gen_vec_initv16qiqi (mask, gen_rtx_PARALLEL (V16QImode, v))); + emit_insn (gen_altivec_vperm_ti (operands[0], operands[1], + operands[1], mask)); + DONE; +}) + (define_expand "altivec_vreve2" [(set (match_operand:VEC_A 0 "register_operand" "=v") (unspec:VEC_A [(match_operand:VEC_A 1 "register_operand" "v")] diff --git a/gcc/config/rs6000/rs6000-builtin.def b/gcc/config/rs6000/rs6000-builtin.def index dba2282..d55095b 100644 --- a/gcc/config/rs6000/rs6000-builtin.def +++ b/gcc/config/rs6000/rs6000-builtin.def @@ -2877,6 +2877,8 @@ BU_P9V_OVERLOAD_1 (VPRTYBD, "vprtybd") BU_P9V_OVERLOAD_1 (VPRTYBQ, "vprtybq") BU_P9V_OVERLOAD_1 (VPRTYBW, "vprtybw") BU_P9V_OVERLOAD_1 (VPARITY_LSBB, "vparity_lsbb") +BU_P9V_OVERLOAD_1 (VSIGNEXTI, "vsignexti") +BU_P9V_OVERLOAD_1 (VSIGNEXTLL, "vsignextll") /* 2 argument functions added in ISA 3.0 (power9). */ BU_P9_2 (CMPRB, "byte_in_range", CONST, cmprb) @@ -2888,6 +2890,13 @@ BU_P9_OVERLOAD_2 (CMPRB, "byte_in_range") BU_P9_OVERLOAD_2 (CMPRB2, "byte_in_either_range") BU_P9_OVERLOAD_2 (CMPEQB, "byte_in_set") + +BU_P9V_AV_1 (VSIGNEXTSB2W, "vsignextsb2w", CONST, vsignextend_qi_v4si) +BU_P9V_AV_1 (VSIGNEXTSH2W, "vsignextsh2w", CONST, vsignextend_hi_v4si) +BU_P9V_AV_1 (VSIGNEXTSB2D, "vsignextsb2d", CONST, vsignextend_qi_v2di) +BU_P9V_AV_1 (VSIGNEXTSH2D, "vsignextsh2d", CONST, vsignextend_hi_v2di) +BU_P9V_AV_1 (VSIGNEXTSW2D, "vsignextsw2d", CONST, vsignextend_si_v2di) + /* Builtins for scalar instructions added in ISA 3.1 (power10). 
*/ BU_P10V_AV_P (VCMPEQUT_P, "vcmpequt_p", CONST, vector_eq_v1ti_p) BU_P10V_AV_P (VCMPGTST_P, "vcmpgtst_p", CONST, vector_gt_v1ti_p) @@ -2926,6 +2935,8 @@ BU_P10V_AV_2 (VNOR_V1TI, "vnor_v1ti", CONST, norv1ti3) BU_P10V_AV_2 (VCMPNET_P, "vcmpnet_p", CONST, vector_ne_v1ti_p) BU_P10V_AV_2 (VCMPAET_P, "vcmpaet_p", CONST, vector_ae_v1ti_p) +BU_P10V_AV_1 (VSIGNEXTSD2Q, "vsignext", CONST, vsignextend_v2di_v1ti) + BU_P10V_AV_2 (VMULEUD, "vmuleud", CONST, vec_widen_umult_even_v2di) BU_P10V_AV_2 (VMULESD, "vmulesd", CONST, vec_widen_smult_even_v2di) BU_P10V_AV_2 (VMULOUD, "vmuloud", CONST, vec_widen_umult_odd_v2di) @@ -3145,6 +3156,7 @@ BU_CRYPTO_OVERLOAD_2A (VPMSUM, "vpmsum") BU_CRYPTO_OVERLOAD_3A (VPERMXOR, "vpermxor") BU_CRYPTO_OVERLOAD_3 (VSHASIGMA, "vshasigma") +BU_P10_OVERLOAD_1 (SIGNEXT, "vsignextq") /* HTM functions. */ BU_HTM_1 (TABORT, "tabort", CR, tabort) diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c index d1f29a5..b0b7f12 100644 --- a/gcc/config/rs6000/rs6000-call.c +++ b/gcc/config/rs6000/rs6000-call.c @@ -5821,6 +5821,19 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = { RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, RS6000_BTI_INTSI, RS6000_BTI_INTSI }, + /* Sign extend builtins that work work on ISA 3.0, not added until ISA 3.1 */ + { P9V_BUILTIN_VEC_VSIGNEXTI, P9V_BUILTIN_VSIGNEXTSB2W, + RS6000_BTI_V4SI, RS6000_BTI_V16QI, 0, 0 }, + { P9V_BUILTIN_VEC_VSIGNEXTI, P9V_BUILTIN_VSIGNEXTSH2W, + RS6000_BTI_V4SI, RS6000_BTI_V8HI, 0, 0 }, + + { P9V_BUILTIN_VEC_VSIGNEXTLL, P9V_BUILTIN_VSIGNEXTSB2D, + RS6000_BTI_V2DI, RS6000_BTI_V16QI, 0, 0 }, + { P9V_BUILTIN_VEC_VSIGNEXTLL, P9V_BUILTIN_VSIGNEXTSH2D, + RS6000_BTI_V2DI, RS6000_BTI_V8HI, 0, 0 }, + { P9V_BUILTIN_VEC_VSIGNEXTLL, P9V_BUILTIN_VSIGNEXTSW2D, + RS6000_BTI_V2DI, RS6000_BTI_V4SI, 0, 0 }, + /* Overloaded built-in functions for ISA3.1 (power10). 
*/ { P10_BUILTIN_VEC_CLRL, P10V_BUILTIN_VCLRLB, RS6000_BTI_V16QI, RS6000_BTI_V16QI, RS6000_BTI_UINTSI, 0 }, @@ -6184,6 +6197,9 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = { { P10_BUILTIN_VEC_XVTLSBB_ONES, P10V_BUILTIN_XVTLSBB_ONES, RS6000_BTI_INTSI, RS6000_BTI_unsigned_V16QI, 0, 0 }, + { P10_BUILTIN_VEC_SIGNEXT, P10V_BUILTIN_VSIGNEXTSD2Q, + RS6000_BTI_V1TI, RS6000_BTI_V2DI, 0, 0 }, + { RS6000_BUILTIN_NONE, RS6000_BUILTIN_NONE, 0, 0, 0, 0 } }; diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md index ce8f82b..f2260ba 100644 --- a/gcc/config/rs6000/vsx.md +++ b/gcc/config/rs6000/vsx.md @@ -4883,6 +4883,33 @@ (set_attr "type" "vecload")]) +;; ISA 3.1 vector extend sign support +(define_insn "vsx_sign_extend_v2di_v1ti" + [(set (match_operand:V1TI 0 "vsx_register_operand" "=v") + (unspec:V1TI [(match_operand:V2DI 1 "vsx_register_operand" "v")] + UNSPEC_VSX_SIGN_EXTEND))] + "TARGET_POWER10" + "vextsd2q %0,%1" +[(set_attr "type" "vecexts")]) + +(define_expand "vsignextend_v2di_v1ti" + [(set (match_operand:V1TI 0 "vsx_register_operand" "=v") + (unspec:V1TI [(match_operand:V2DI 1 "vsx_register_operand" "v")] + UNSPEC_VSX_SIGN_EXTEND))] + "TARGET_POWER10" +{ + if (BYTES_BIG_ENDIAN) + { + rtx tmp = gen_reg_rtx (V2DImode); + + emit_insn (gen_altivec_vrevev2di2(tmp, operands[1])); + emit_insn (gen_vsx_sign_extend_v2di_v1ti(operands[0], tmp)); + DONE; + } + + emit_insn (gen_vsx_sign_extend_v2di_v1ti(operands[0], operands[1])); +}) + ;; ISA 3.0 vector extend sign support (define_insn "vsx_sign_extend_qi_" @@ -4894,6 +4921,24 @@ "vextsb2 %0,%1" [(set_attr "type" "vecexts")]) +(define_expand "vsignextend_qi_" + [(set (match_operand:VIlong 0 "vsx_register_operand" "=v") + (unspec:VIlong + [(match_operand:V16QI 1 "vsx_register_operand" "v")] + UNSPEC_VSX_SIGN_EXTEND))] + "TARGET_P9_VECTOR" +{ + if (BYTES_BIG_ENDIAN) + { + rtx tmp = gen_reg_rtx (V16QImode); + emit_insn (gen_altivec_vrevev16qi2(tmp, operands[1])); + emit_insn (gen_vsx_sign_extend_qi_(operands[0], tmp)); + } + else + emit_insn (gen_vsx_sign_extend_qi_(operands[0], operands[1])); + DONE; +}) + (define_insn "vsx_sign_extend_hi_" [(set (match_operand:VSINT_84 0 "vsx_register_operand" "=v") (unspec:VSINT_84 @@ -4903,7 +4948,25 @@ "vextsh2 %0,%1" [(set_attr "type" "vecexts")]) -(define_insn "*vsx_sign_extend_si_v2di" +(define_expand "vsignextend_hi_" + [(set (match_operand:VIlong 0 "vsx_register_operand" "=v") + (unspec:VIlong + [(match_operand:V8HI 1 "vsx_register_operand" "v")] + UNSPEC_VSX_SIGN_EXTEND))] + "TARGET_P9_VECTOR" +{ + if (BYTES_BIG_ENDIAN) + { + rtx tmp = gen_reg_rtx (V8HImode); + emit_insn (gen_altivec_vrevev8hi2(tmp, operands[1])); + emit_insn (gen_vsx_sign_extend_hi_(operands[0], tmp)); + } + else + emit_insn (gen_vsx_sign_extend_hi_(operands[0], operands[1])); + DONE; +}) + +(define_insn "vsx_sign_extend_si_v2di" [(set (match_operand:V2DI 0 "vsx_register_operand" "=v") (unspec:V2DI [(match_operand:V4SI 1 "vsx_register_operand" "v")] UNSPEC_VSX_SIGN_EXTEND))] @@ -4911,6 +4974,24 @@ "vextsw2d %0,%1" [(set_attr "type" "vecexts")]) +(define_expand "vsignextend_si_v2di" + [(set (match_operand:V2DI 0 "vsx_register_operand" "=v") + (unspec:V2DI [(match_operand:V4SI 1 "vsx_register_operand" "v")] + UNSPEC_VSX_SIGN_EXTEND))] + "TARGET_P9_VECTOR" +{ + if (BYTES_BIG_ENDIAN) + { + rtx tmp = gen_reg_rtx (V4SImode); + + emit_insn (gen_altivec_vrevev4si2(tmp, operands[1])); + emit_insn (gen_vsx_sign_extend_si_v2di(operands[0], tmp)); + } + else + emit_insn (gen_vsx_sign_extend_si_v2di(operands[0], 
operands[1])); + DONE; +}) + ;; ISA 3.1 vector sign extend ;; Move DI value from GPR to TI mode in VSX register, word 1. (define_insn "mtvsrdd_diti_w1" -- cgit v1.1 From f8b067056ba5dd53f7bc883a1f59833efc26bd3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Chigot?= Date: Wed, 9 Jun 2021 22:00:55 -0400 Subject: aix: Power10 assembler invocation. gcc/ChangeLog: 2021-06-09 Clement Chigot * config/rs6000/aix71.h (ASM_CPU_SPEC): Add Power10 directive. * config/rs6000/aix72.h (ASM_CPU_SPEC): Likewise. --- gcc/config/rs6000/aix71.h | 1 + gcc/config/rs6000/aix72.h | 1 + 2 files changed, 2 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/aix71.h b/gcc/config/rs6000/aix71.h index 807e260..38cfa9e 100644 --- a/gcc/config/rs6000/aix71.h +++ b/gcc/config/rs6000/aix71.h @@ -78,6 +78,7 @@ do { \ #undef ASM_CPU_SPEC #define ASM_CPU_SPEC \ "%{mcpu=native: %(asm_cpu_native); \ + mcpu=power10: -mpwr10; \ mcpu=power9: -mpwr9; \ mcpu=power8: -mpwr8; \ mcpu=power7: -mpwr7; \ diff --git a/gcc/config/rs6000/aix72.h b/gcc/config/rs6000/aix72.h index 36c5d99..4cd27e3 100644 --- a/gcc/config/rs6000/aix72.h +++ b/gcc/config/rs6000/aix72.h @@ -78,6 +78,7 @@ do { \ #undef ASM_CPU_SPEC #define ASM_CPU_SPEC \ "%{mcpu=native: %(asm_cpu_native); \ + mcpu=power10: -mpwr10; \ mcpu=power9: -mpwr9; \ mcpu=power8: -mpwr8; \ mcpu=power7: -mpwr7; \ -- cgit v1.1 From 6961091b385c5c27c4a555aad0250d3390be05bf Mon Sep 17 00:00:00 2001 From: Robin Dapp Date: Thu, 10 Jun 2021 14:18:28 +0200 Subject: s390: Allow more vcond_mask patterns. Change vcond_mask iterator as to allow the corresponding int mode for the condition/mask so e.g. boolean conditions become possible: vtarget = bool_cond ? vsource1 : vsource2. gcc/ChangeLog: * config/s390/vector.md (vcond_mask_): Change to (vcond_mask_): this. gcc/testsuite/ChangeLog: * gcc.target/s390/vector/vcond-mixed-double.c: New test. * gcc.target/s390/vector/vcond-mixed-float.c: New test. --- gcc/config/s390/vector.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md index c80d582..ab605b3 100644 --- a/gcc/config/s390/vector.md +++ b/gcc/config/s390/vector.md @@ -715,7 +715,7 @@ DONE; }) -(define_expand "vcond_mask_" +(define_expand "vcond_mask_" [(set (match_operand:V 0 "register_operand" "") (if_then_else:V (eq (match_operand: 3 "register_operand" "") -- cgit v1.1 From 6fcba9ef23e4261a6279a76890b2c1488cc14d12 Mon Sep 17 00:00:00 2001 From: Jeff Law Date: Thu, 10 Jun 2021 09:57:51 -0400 Subject: Use memory loads and extensions to eliminate redundant test/compare insns gcc/ * config/h8300/h8300.c (select_cc_mode): Handle MEM. Use REG_P. * config/h8300/extensions.md: Replace _clobber_flags patterns with . 
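As a rough illustration (not part of the patch), the redundant compare being targeted is the one emitted after an extending load whose condition-code result already tells the branch what it needs; the function name below is made up:

    /* Illustrative sketch only.  */
    int
    load_and_test (signed char *p)
    {
      int v = *p;	/* sign-extending load already sets Z/N */
      if (v < 0)	/* separate test/compare can be dropped */
        return -1;
      return v;
    }
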
--- gcc/config/h8300/extensions.md | 12 ++++++------ gcc/config/h8300/h8300.c | 4 +++- 2 files changed, 9 insertions(+), 7 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/h8300/extensions.md b/gcc/config/h8300/extensions.md index bc10179..74647c7 100644 --- a/gcc/config/h8300/extensions.md +++ b/gcc/config/h8300/extensions.md @@ -20,7 +20,7 @@ [(parallel [(set (match_dup 0) (zero_extend:HI (match_dup 1))) (clobber (reg:CC CC_REG))])]) -(define_insn "*zero_extendqihi2_clobber_flags" +(define_insn "*zero_extendqihi2" [(set (match_operand:HI 0 "register_operand" "=r,r") (zero_extend:HI (match_operand:QI 1 "general_operand_src" "0,g>"))) (clobber (reg:CC CC_REG))] @@ -95,7 +95,7 @@ [(parallel [(set (match_dup 0) (zero_extend:SI (match_dup 1))) (clobber (reg:CC CC_REG))])]) -(define_insn "*zero_extendqisi2_h8sx_clobber_flags" +(define_insn "*zero_extendqisi2_h8sx" [(set (match_operand:SI 0 "register_operand" "=r") (zero_extend:SI (match_operand:QI 1 "register_operand" "0"))) (clobber (reg:CC CC_REG))] @@ -118,7 +118,7 @@ [(parallel [(set (match_dup 0) (zero_extend:SI (match_dup 1))) (clobber (reg:CC CC_REG))])]) -(define_insn "*zero_extendhisi2_clobber_flags" +(define_insn "*zero_extendhisi2" [(set (match_operand:SI 0 "register_operand" "=r") (zero_extend:SI (match_operand:HI 1 "register_operand" "0"))) (clobber (reg:CC CC_REG))] @@ -141,7 +141,7 @@ [(parallel [(set (match_dup 0) (sign_extend:HI (match_dup 1))) (clobber (reg:CC CC_REG))])]) -(define_insn "*extendqihi2_clobber_flags" +(define_insn "*extendqihi2" [(set (match_operand:HI 0 "register_operand" "=r") (sign_extend:HI (match_operand:QI 1 "register_operand" "0"))) (clobber (reg:CC CC_REG))] @@ -176,7 +176,7 @@ [(parallel [(set (match_dup 0) (sign_extend:SI (match_dup 1))) (clobber (reg:CC CC_REG))])]) -(define_insn "*extendqisi2_h8sx_clobber_flags" +(define_insn "*extendqisi2_h8sx" [(set (match_operand:SI 0 "register_operand" "=r") (sign_extend:SI (match_operand:QI 1 "register_operand" "0"))) (clobber (reg:CC CC_REG))] @@ -199,7 +199,7 @@ [(parallel [(set (match_dup 0) (sign_extend:SI (match_dup 1))) (clobber (reg:CC CC_REG))])]) -(define_insn "*extendhisi2_clobber_flags" +(define_insn "*extendhisi2" [(set (match_operand:SI 0 "register_operand" "=r") (sign_extend:SI (match_operand:HI 1 "register_operand" "0"))) (clobber (reg:CC CC_REG))] diff --git a/gcc/config/h8300/h8300.c b/gcc/config/h8300/h8300.c index ef947aa..1077a2b 100644 --- a/gcc/config/h8300/h8300.c +++ b/gcc/config/h8300/h8300.c @@ -1950,7 +1950,9 @@ h8300_select_cc_mode (enum rtx_code cond, rtx op0, rtx op1) || GET_CODE (op0) == NEG || GET_CODE (op0) == AND || GET_CODE (op0) == IOR || GET_CODE (op0) == XOR || GET_CODE (op0) == NOT || GET_CODE (op0) == ASHIFT - || GET_CODE (op0) == REG || GET_CODE (op0) == MULT)) + || GET_CODE (op0) == MULT + || GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND + || REG_P (op0) || MEM_P (op0))) return CCZNmode; return CCmode; -- cgit v1.1 From 00d07ec6e12451acc7a290cd93be03bed50cb666 Mon Sep 17 00:00:00 2001 From: Peter Bergner Date: Thu, 10 Jun 2021 13:54:12 -0500 Subject: rs6000: Add new __builtin_vsx_build_pair and __builtin_mma_build_acc built-ins The __builtin_vsx_assemble_pair and __builtin_mma_assemble_acc built-ins currently assign their first source operand to the first VSX register in a pair/quad, their second operand to the second register in a pair/quad, etc. This is not endian friendly and forces the user to generate different calls depending on endianness. 
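As a rough illustration (not part of the patch) of the problem just described, endian-neutral code currently has to swap the assemble operands by hand; the helper name below is made up, and -mcpu=power10 with MMA support is assumed:

    #include <altivec.h>

    /* Illustrative sketch only.  */
    void
    make_pair (__vector_pair *pair, vector unsigned char v0, vector unsigned char v1)
    {
    #ifdef __LITTLE_ENDIAN__
      __builtin_vsx_assemble_pair (pair, v1, v0);	/* operands reversed on LE */
    #else
      __builtin_vsx_assemble_pair (pair, v0, v1);
    #endif
    }
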
In agreement with the POWER LLVM team, we've decided to lightly deprecate the assemble built-ins and replace them with "build" built-ins that automatically handle endianness so the same built-in call can be used for both little-endian and big-endian compiles. We are not removing the assemble built-ins, since there is code in the wild that uses them, but we are removing their documentation to encourage the use of the new "build" variants. gcc/ * config/rs6000/rs6000-builtin.def (build_pair): New built-in. (build_acc): Likewise. * config/rs6000/rs6000-call.c (mma_expand_builtin): Swap assemble source operands in little-endian mode. (rs6000_gimple_fold_mma_builtin): Handle VSX_BUILTIN_BUILD_PAIR. (mma_init_builtins): Likewise. * config/rs6000/rs6000.c (rs6000_split_multireg_move): Handle endianness ordering for the MMA assemble and build source operands. * doc/extend.texi (__builtin_vsx_build_acc, __builtin_mma_build_pair): Document. (__builtin_mma_assemble_acc, __builtin_mma_assemble_pair): Remove documentation. gcc/testsuite/ * gcc.target/powerpc/mma-builtin-4.c (__builtin_vsx_build_pair): Add tests. Update expected counts. * gcc.target/powerpc/mma-builtin-5.c (__builtin_mma_build_acc): Add tests. Update expected counts. --- gcc/config/rs6000/rs6000-builtin.def | 2 ++ gcc/config/rs6000/rs6000-call.c | 19 ++++++++++++++++--- gcc/config/rs6000/rs6000.c | 6 ++++-- 3 files changed, 22 insertions(+), 5 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-builtin.def b/gcc/config/rs6000/rs6000-builtin.def index d55095b..231e7c9 100644 --- a/gcc/config/rs6000/rs6000-builtin.def +++ b/gcc/config/rs6000/rs6000-builtin.def @@ -3265,6 +3265,7 @@ BU_MMA_2 (DISASSEMBLE_ACC, "disassemble_acc", QUAD, mma_disassemble_acc) BU_MMA_V2 (DISASSEMBLE_PAIR, "disassemble_pair", PAIR, vsx_disassemble_pair) BU_COMPAT (VSX_BUILTIN_DISASSEMBLE_PAIR, "mma_disassemble_pair") +BU_MMA_V3 (BUILD_PAIR, "build_pair", MISC, vsx_assemble_pair) BU_MMA_V3 (ASSEMBLE_PAIR, "assemble_pair", MISC, vsx_assemble_pair) BU_COMPAT (VSX_BUILTIN_ASSEMBLE_PAIR, "mma_assemble_pair") BU_MMA_3 (XVBF16GER2, "xvbf16ger2", MISC, mma_xvbf16ger2) @@ -3297,6 +3298,7 @@ BU_MMA_3 (XVI8GER4SPP, "xvi8ger4spp", QUAD, mma_xvi8ger4spp) BU_MMA_3 (XVI16GER2PP, "xvi16ger2pp", QUAD, mma_xvi16ger2pp) BU_MMA_3 (XVI16GER2SPP, "xvi16ger2spp", QUAD, mma_xvi16ger2spp) +BU_MMA_5 (BUILD_ACC, "build_acc", MISC, mma_assemble_acc) BU_MMA_5 (ASSEMBLE_ACC, "assemble_acc", MISC, mma_assemble_acc) BU_MMA_5 (PMXVF32GER, "pmxvf32ger", MISC, mma_pmxvf32ger) BU_MMA_5 (PMXVF64GER, "pmxvf64ger", PAIR, mma_pmxvf64ger) diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c index b0b7f12..0ac6b6e 100644 --- a/gcc/config/rs6000/rs6000-call.c +++ b/gcc/config/rs6000/rs6000-call.c @@ -10244,12 +10244,23 @@ mma_expand_builtin (tree exp, rtx target, bool *expandedp) pat = GEN_FCN (icode) (op[0], op[1]); break; case 3: + /* The ASSEMBLE builtin source operands are reversed in little-endian + mode, so reorder them. */ + if (fcode == VSX_BUILTIN_ASSEMBLE_PAIR_INTERNAL && !WORDS_BIG_ENDIAN) + std::swap (op[1], op[2]); pat = GEN_FCN (icode) (op[0], op[1], op[2]); break; case 4: pat = GEN_FCN (icode) (op[0], op[1], op[2], op[3]); break; case 5: + /* The ASSEMBLE builtin source operands are reversed in little-endian + mode, so reorder them.
*/ + if (fcode == MMA_BUILTIN_ASSEMBLE_ACC_INTERNAL && !WORDS_BIG_ENDIAN) + { + std::swap (op[1], op[4]); + std::swap (op[2], op[3]); + } pat = GEN_FCN (icode) (op[0], op[1], op[2], op[3], op[4]); break; case 6: @@ -11961,7 +11972,7 @@ rs6000_gimple_fold_mma_builtin (gimple_stmt_iterator *gsi) gcc_unreachable (); } - if (fncode == VSX_BUILTIN_ASSEMBLE_PAIR) + if (fncode == VSX_BUILTIN_BUILD_PAIR || fncode == VSX_BUILTIN_ASSEMBLE_PAIR) lhs = make_ssa_name (vector_pair_type_node); else lhs = make_ssa_name (vector_quad_type_node); @@ -14293,8 +14304,10 @@ mma_init_builtins (void) machine_mode mode = insn_data[icode].operand[j].mode; if (gimple_func && mode == XOmode) op[nopnds++] = build_pointer_type (vector_quad_type_node); - else if (gimple_func && mode == OOmode - && d->code == VSX_BUILTIN_ASSEMBLE_PAIR) + else if (gimple_func + && mode == OOmode + && (d->code == VSX_BUILTIN_BUILD_PAIR + || d->code == VSX_BUILTIN_ASSEMBLE_PAIR)) op[nopnds++] = build_pointer_type (vector_pair_type_node); else /* MMA uses unsigned types. */ diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 75c2cc4..38f9281 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -16807,9 +16807,11 @@ rs6000_split_multireg_move (rtx dst, rtx src) gcc_assert (VSX_REGNO_P (REGNO (dst))); reg_mode = GET_MODE (XVECEXP (src, 0, 0)); - for (int i = 0; i < XVECLEN (src, 0); i++) + int nvecs = XVECLEN (src, 0); + for (int i = 0; i < nvecs; i++) { - rtx dst_i = gen_rtx_REG (reg_mode, reg + i); + int index = WORDS_BIG_ENDIAN ? i : nvecs - 1 - i; + rtx dst_i = gen_rtx_REG (reg_mode, reg + index); emit_insn (gen_rtx_SET (dst_i, XVECEXP (src, 0, i))); } -- cgit v1.1 From a325bdd195ee96f826b208c3afb9bed2ec077e12 Mon Sep 17 00:00:00 2001 From: Peter Bergner Date: Thu, 10 Jun 2021 13:54:12 -0500 Subject: i386: Add V8QI and other 64bit vector permutations [PR89021] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In addition to V8QI permutations, several other missing permutations are added for 64bit vector modes for TARGET_SSSE3 and TARGET_SSE4_1 targets. 2021-06-10 Uroš Bizjak gcc/ PR target/89021 * config/i386/i386-expand.c (ix86_split_mmx_punpck): Handle V2SF mode. Emit SHUFPS to fixup unpack-high for V2SF mode. (expand_vec_perm_blend): Handle 64bit modes for TARGET_SSE4_1. (expand_vec_perm_pshufb): Handle 64bit modes for TARGET_SSSE3. (expand_vec_perm_pblendv): Handle 64bit modes for TARGET_SSE4_1. (expand_vec_perm_interleave2): Handle 64bit modes. (expand_vec_perm_even_odd_pack): Handle V8QI mode. (expand_vec_perm_even_odd_1): Ditto. (ix86_vectorize_vec_perm_const): Ditto. * config/i386/i386.md (UNSPEC_PSHUFB): Move from ... * config/i386/sse.md: ... here. * config/i386/mmx.md (*vec_interleave_lowv2sf): New insn_and_split pattern. (*vec_interleave_highv2sf): Ditto. (mmx_pshufbv8qi3): New insn pattern. (*mmx_pblendw): Ditto. 
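As a rough illustration (not taken from the patch; the typedef and function name are invented), a 64-bit vector permutation of the kind these patterns now cover when MMX operations are carried out in SSE registers:

    /* Hypothetical example: an 8-byte (V8QI) shuffle.  With TARGET_SSSE3
       or TARGET_SSE4_1 this can now be emitted as a pshufb/pblendw style
       sequence instead of falling back to scalar code.  */
    typedef unsigned char v8qi __attribute__ ((vector_size (8)));

    v8qi
    reverse_bytes (v8qi x)
    {
      v8qi mask = { 7, 6, 5, 4, 3, 2, 1, 0 };
      return __builtin_shuffle (x, mask);
    }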
--- gcc/config/i386/i386-expand.c | 191 ++++++++++++++++++++++++++++++++++++------ gcc/config/i386/i386.md | 1 + gcc/config/i386/mmx.md | 86 +++++++++++++++++-- gcc/config/i386/sse.md | 1 - 4 files changed, 246 insertions(+), 33 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index c3ce21b..9ee5257 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -798,6 +798,15 @@ ix86_split_mmx_punpck (rtx operands[], bool high_p) GEN_INT (1), GEN_INT (5))); break; + case E_V2SFmode: + sse_mode = V4SFmode; + double_sse_mode = V8SFmode; + mask = gen_rtx_PARALLEL (VOIDmode, + gen_rtvec (4, + GEN_INT (0), GEN_INT (4), + GEN_INT (1), GEN_INT (5))); + break; + default: gcc_unreachable (); } @@ -812,14 +821,26 @@ ix86_split_mmx_punpck (rtx operands[], bool high_p) rtx insn = gen_rtx_SET (dest, op2); emit_insn (insn); + /* Move bits 64:127 to bits 0:63. */ if (high_p) { - /* Move bits 64:127 to bits 0:63. */ - mask = gen_rtx_PARALLEL (VOIDmode, - gen_rtvec (4, GEN_INT (2), GEN_INT (3), - GEN_INT (0), GEN_INT (0))); - dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest)); - op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask); + if (sse_mode == V4SFmode) + { + mask = gen_rtx_PARALLEL (VOIDmode, + gen_rtvec (4, GEN_INT (2), GEN_INT (3), + GEN_INT (4), GEN_INT (5))); + op2 = gen_rtx_VEC_CONCAT (V8SFmode, dest, dest); + op1 = gen_rtx_VEC_SELECT (V4SFmode, op2, mask); + } + else + { + mask = gen_rtx_PARALLEL (VOIDmode, + gen_rtvec (4, GEN_INT (2), GEN_INT (3), + GEN_INT (0), GEN_INT (1))); + dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest)); + op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask); + } + insn = gen_rtx_SET (dest, op1); emit_insn (insn); } @@ -17062,7 +17083,8 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d) ; else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode)) ; - else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16) + else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16 + || GET_MODE_SIZE (vmode) == 8)) ; else return false; @@ -17095,6 +17117,7 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d) case E_V8SFmode: case E_V2DFmode: case E_V4SFmode: + case E_V4HImode: case E_V8HImode: case E_V8SImode: case E_V32HImode: @@ -17111,6 +17134,12 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d) vmode = V8HImode; goto do_subreg; + case E_V2SImode: + for (i = 0; i < 2; ++i) + mask |= (d->perm[i] >= 2 ? 3 : 0) << (i * 2); + vmode = V4HImode; + goto do_subreg; + case E_V4SImode: for (i = 0; i < 4; ++i) mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2); @@ -17132,7 +17161,9 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d) vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm)); vperm = force_reg (vmode, vperm); - if (GET_MODE_SIZE (vmode) == 16) + if (GET_MODE_SIZE (vmode) == 8) + emit_insn (gen_mmx_pblendvb64 (target, op0, op1, vperm)); + else if (GET_MODE_SIZE (vmode) == 16) emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm)); else emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm)); @@ -17152,6 +17183,16 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d) op1 = gen_lowpart (vmode, op1); break; + case E_V8QImode: + for (i = 0; i < 8; i += 2) + if (d->perm[i] + 1 != d->perm[i + 1]) + goto use_pblendvb; + + for (i = 0; i < 4; ++i) + mask |= (d->perm[i * 2] >= 8) << i; + vmode = V4HImode; + goto do_subreg; + case E_V32QImode: /* See if bytes move in pairs. If not, vpblendvb must be used. 
*/ for (i = 0; i < 32; i += 2) @@ -17384,7 +17425,13 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) } else { - if (GET_MODE_SIZE (d->vmode) == 16) + if (GET_MODE_SIZE (d->vmode) == 8) + { + if (!TARGET_SSSE3) + return false; + vmode = V8QImode; + } + else if (GET_MODE_SIZE (d->vmode) == 16) { if (!TARGET_SSSE3) return false; @@ -17506,12 +17553,12 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) eltsz = GET_MODE_UNIT_SIZE (d->vmode); if (!d->one_operand_p) mask = 2 * nelt - 1; - else if (vmode == V16QImode) - mask = nelt - 1; else if (vmode == V64QImode) mask = nelt / 4 - 1; - else + else if (vmode == V32QImode) mask = nelt / 2 - 1; + else + mask = nelt - 1; for (i = 0; i < nelt; ++i) { @@ -17521,9 +17568,18 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) } } - vperm = gen_rtx_CONST_VECTOR (vmode, - gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm)); - vperm = force_reg (vmode, vperm); + machine_mode vpmode = vmode; + + if (vmode == V8QImode) + { + for (i = nelt; i < 16; ++i) + rperm[i] = constm1_rtx; + vpmode = V16QImode; + } + + vperm = gen_rtx_CONST_VECTOR (vpmode, + gen_rtvec_v (GET_MODE_NUNITS (vpmode), rperm)); + vperm = force_reg (vpmode, vperm); target = d->target; if (d->vmode != vmode) @@ -17531,7 +17587,9 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) op0 = gen_lowpart (vmode, d->op0); if (d->one_operand_p) { - if (vmode == V16QImode) + if (vmode == V8QImode) + emit_insn (gen_mmx_pshufbv8qi3 (target, op0, vperm)); + else if (vmode == V16QImode) emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm)); else if (vmode == V32QImode) emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm)); @@ -18041,7 +18099,8 @@ expand_vec_perm_pblendv (struct expand_vec_perm_d *d) ; else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode)) ; - else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16) + else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 8 + || GET_MODE_SIZE (vmode) == 16)) ; else return false; @@ -18120,7 +18179,8 @@ expand_vec_perm_interleave2 (struct expand_vec_perm_d *d) rtx_insn *seq; bool ok, same_halves = false; - if (GET_MODE_SIZE (d->vmode) == 16) + if (GET_MODE_SIZE (d->vmode) == 8 + || GET_MODE_SIZE (d->vmode) == 16) { if (d->one_operand_p) return false; @@ -18155,7 +18215,44 @@ expand_vec_perm_interleave2 (struct expand_vec_perm_d *d) memset (remap, 0xff, sizeof (remap)); dremap = *d; - if (GET_MODE_SIZE (d->vmode) == 16) + if (GET_MODE_SIZE (d->vmode) == 8) + { + unsigned HOST_WIDE_INT h1, h2, h3, h4; + + /* Split the two input vectors into 4 halves. */ + h1 = (HOST_WIDE_INT_1U << nelt2) - 1; + h2 = h1 << nelt2; + h3 = h2 << nelt2; + h4 = h3 << nelt2; + + /* If the elements from the low halves use interleave low, + and similarly for interleave high. */ + if ((contents & (h1 | h3)) == contents) + { + /* punpckl* */ + for (i = 0; i < nelt2; ++i) + { + remap[i] = i * 2; + remap[i + nelt] = i * 2 + 1; + dremap.perm[i * 2] = i; + dremap.perm[i * 2 + 1] = i + nelt; + } + } + else if ((contents & (h2 | h4)) == contents) + { + /* punpckh* */ + for (i = 0; i < nelt2; ++i) + { + remap[i + nelt2] = i * 2; + remap[i + nelt + nelt2] = i * 2 + 1; + dremap.perm[i * 2] = i + nelt2; + dremap.perm[i * 2 + 1] = i + nelt + nelt2; + } + } + else + return false; + } + else if (GET_MODE_SIZE (d->vmode) == 16) { unsigned HOST_WIDE_INT h1, h2, h3, h4; @@ -19328,9 +19425,9 @@ expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d) } /* A subroutine of expand_vec_perm_even_odd_1. 
Implement extract-even - and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands - with two "and" and "pack" or two "shift" and "pack" insns. We should - have already failed all two instruction sequences. */ + and extract-odd permutations of two V8QI, V8HI, V16QI, V16HI or V32QI + operands with two "and" and "pack" or two "shift" and "pack" insns. + We should have already failed all two instruction sequences. */ static bool expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d) @@ -19359,6 +19456,15 @@ expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d) gen_pack = gen_sse4_1_packusdw; gen_shift = gen_lshrv4si3; break; + case E_V8QImode: + /* No check as all instructions are SSE2. */ + c = 0xff; + s = 8; + half_mode = V4HImode; + gen_and = gen_andv4hi3; + gen_pack = gen_mmx_packuswb; + gen_shift = gen_lshrv4hi3; + break; case E_V16QImode: /* No check as all instructions are SSE2. */ c = 0xff; @@ -19391,8 +19497,8 @@ expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d) end_perm = true; break; default: - /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than - general shuffles. */ + /* Only V8QI, V8HI, V16QI, V16HI and V32QI modes + are more profitable than general shuffles. */ return false; } @@ -19621,6 +19727,7 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd) } break; + case E_V8QImode: case E_V16QImode: return expand_vec_perm_even_odd_pack (d); @@ -19786,6 +19893,41 @@ expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d) /* These are always implementable using standard shuffle patterns. */ gcc_unreachable (); + case E_V8QImode: + /* These can be implemented via interleave. We save one insn by + stopping once we have promoted to V2SImode and then use pshufd. */ + if (d->testing_p) + return true; + do + { + rtx dest; + rtx (*gen) (rtx, rtx, rtx) + = vmode == V8QImode ? gen_mmx_punpcklbw + : gen_mmx_punpcklwd; + + if (elt >= nelt2) + { + gen = vmode == V8QImode ? gen_mmx_punpckhbw + : gen_mmx_punpckhwd; + elt -= nelt2; + } + nelt2 /= 2; + + dest = gen_reg_rtx (vmode); + emit_insn (gen (dest, op0, op0)); + vmode = get_mode_wider_vector (vmode); + op0 = gen_lowpart (vmode, dest); + } + while (vmode != V2SImode); + + memset (perm2, elt, 2); + dest = gen_reg_rtx (V2SImode); + ok = expand_vselect (dest, op0, perm2, 2, d->testing_p); + gcc_assert (ok); + if (!d->testing_p) + emit_move_insn (d->target, gen_lowpart (d->vmode, dest)); + return true; + case E_V8HImode: case E_V16QImode: /* These can be implemented via interleave. 
We save one insn by @@ -20289,6 +20431,7 @@ ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, case E_V2SFmode: case E_V2SImode: case E_V4HImode: + case E_V8QImode: if (!TARGET_MMX_WITH_SSE) return false; break; diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 5ff49ec..7743c61e 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -119,6 +119,7 @@ UNSPEC_MASKMOV UNSPEC_MOVMSK UNSPEC_BLENDV + UNSPEC_PSHUFB UNSPEC_RCP UNSPEC_RSQRT UNSPEC_PSADBW diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 0a17a54..f9e7d27 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -1198,6 +1198,40 @@ (set_attr "prefix" "maybe_vex,orig") (set_attr "mode" "V4SF")]) +(define_insn_and_split "*vec_interleave_lowv2sf" + [(set (match_operand:V2SF 0 "register_operand" "=x,v") + (vec_select:V2SF + (vec_concat:V4SF + (match_operand:V2SF 1 "register_operand" "0,v") + (match_operand:V2SF 2 "register_operand" "x,v")) + (parallel [(const_int 0) (const_int 2)])))] + "TARGET_MMX_WITH_SSE" + "#" + "&& reload_completed" + [(const_int 0)] + "ix86_split_mmx_punpck (operands, false); DONE;" + [(set_attr "isa" "noavx,avx") + (set_attr "type" "sselog") + (set_attr "prefix" "orig,maybe_evex") + (set_attr "mode" "V4SF")]) + +(define_insn_and_split "*vec_interleave_highv2sf" + [(set (match_operand:V2SF 0 "register_operand" "=x,v") + (vec_select:V2SF + (vec_concat:V4SF + (match_operand:V2SF 1 "register_operand" "0,v") + (match_operand:V2SF 2 "register_operand" "x,v")) + (parallel [(const_int 1) (const_int 3)])))] + "TARGET_MMX_WITH_SSE" + "#" + "&& reload_completed" + [(const_int 0)] + "ix86_split_mmx_punpck (operands, true); DONE;" + [(set_attr "isa" "noavx,avx") + (set_attr "type" "sselog") + (set_attr "prefix" "orig,vex") + (set_attr "mode" "V4SF")]) + (define_insn "*vec_dupv2sf" [(set (match_operand:V2SF 0 "register_operand" "=y,Yv,x") (vec_duplicate:V2SF @@ -2415,7 +2449,7 @@ packswb\t{%2, %0|%0, %2} # #" - "TARGET_SSE2 && reload_completed + "&& reload_completed && SSE_REGNO_P (REGNO (operands[0]))" [(const_int 0)] "ix86_split_mmx_pack (operands, ); DONE;" @@ -2435,7 +2469,7 @@ packssdw\t{%2, %0|%0, %2} # #" - "TARGET_SSE2 && reload_completed + "&& reload_completed && SSE_REGNO_P (REGNO (operands[0]))" [(const_int 0)] "ix86_split_mmx_pack (operands, SS_TRUNCATE); DONE;" @@ -2458,7 +2492,7 @@ punpckhbw\t{%2, %0|%0, %2} # #" - "TARGET_SSE2 && reload_completed + "&& reload_completed && SSE_REGNO_P (REGNO (operands[0]))" [(const_int 0)] "ix86_split_mmx_punpck (operands, true); DONE;" @@ -2481,7 +2515,7 @@ punpcklbw\t{%2, %0|%0, %k2} # #" - "TARGET_SSE2 && reload_completed + "&& reload_completed && SSE_REGNO_P (REGNO (operands[0]))" [(const_int 0)] "ix86_split_mmx_punpck (operands, false); DONE;" @@ -2502,7 +2536,7 @@ punpckhwd\t{%2, %0|%0, %2} # #" - "TARGET_SSE2 && reload_completed + "&& reload_completed && SSE_REGNO_P (REGNO (operands[0]))" [(const_int 0)] "ix86_split_mmx_punpck (operands, true); DONE;" @@ -2523,7 +2557,7 @@ punpcklwd\t{%2, %0|%0, %k2} # #" - "TARGET_SSE2 && reload_completed + "&& reload_completed && SSE_REGNO_P (REGNO (operands[0]))" [(const_int 0)] "ix86_split_mmx_punpck (operands, false); DONE;" @@ -2544,7 +2578,7 @@ punpckhdq\t{%2, %0|%0, %2} # #" - "TARGET_SSE2 && reload_completed + "&& reload_completed && SSE_REGNO_P (REGNO (operands[0]))" [(const_int 0)] "ix86_split_mmx_punpck (operands, true); DONE;" @@ -2565,7 +2599,7 @@ punpckldq\t{%2, %0|%0, %k2} # #" - "TARGET_SSE2 && reload_completed + "&& reload_completed && 
SSE_REGNO_P (REGNO (operands[0]))" [(const_int 0)] "ix86_split_mmx_punpck (operands, false); DONE;" @@ -2756,6 +2790,24 @@ (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) +(define_insn "mmx_pshufbv8qi3" + [(set (match_operand:V8QI 0 "register_operand" "=x,Yw") + (unspec:V8QI + [(match_operand:V8QI 1 "register_operand" "0,Yw") + (match_operand:V16QI 2 "vector_operand" "xBm,Ywm")] + UNSPEC_PSHUFB))] + "TARGET_SSSE3 && TARGET_MMX_WITH_SSE" + "@ + pshufb\t{%2, %0|%0, %2} + vpshufb\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" "noavx,avx") + (set_attr "type" "sselog1") + (set_attr "prefix_data16" "1,*") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "orig,maybe_evex") + (set_attr "btver2_decode" "vector") + (set_attr "mode" "TI")]) + (define_expand "mmx_pshufw" [(match_operand:V4HI 0 "register_operand") (match_operand:V4HI 1 "register_mmxmem_operand") @@ -2828,6 +2880,24 @@ (set_attr "length_immediate" "1") (set_attr "mode" "TI")]) +(define_insn "*mmx_pblendw" + [(set (match_operand:V4HI 0 "register_operand" "=Yr,*x,x") + (vec_merge:V4HI + (match_operand:V4HI 2 "register_operand" "Yr,*x,x") + (match_operand:V4HI 1 "register_operand" "0,0,x") + (match_operand:SI 3 "const_0_to_63_operand" "n,n,n")))] + "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE" + "@ + pblendw\t{%3, %2, %0|%0, %2, %3} + pblendw\t{%3, %2, %0|%0, %2, %3} + vpblendw\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "isa" "noavx,noavx,avx") + (set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "orig,orig,vex") + (set_attr "mode" "TI")]) + ;; Optimize V2SImode load from memory, swapping the elements and ;; storing back into the memory into DImode rotate of the memory by 32. (define_split diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 2a34756..8403a07 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -28,7 +28,6 @@ UNSPEC_LDDQU ;; SSSE3 - UNSPEC_PSHUFB UNSPEC_PSIGN UNSPEC_PALIGNR -- cgit v1.1 From 2f5ab546e2be8e42b00416b2e5860d04a881beab Mon Sep 17 00:00:00 2001 From: Kewen Lin Date: Fri, 11 Jun 2021 02:43:40 -0500 Subject: rs6000: Support more short/char to float conversion For some cases that when we load unsigned char/short values from the appropriate unsigned char/short memories and convert them to double/single precision floating point value, there would be implicit conversions to int first. It makes GCC not leverage the P9 instructions lxsibzx/lxsihzx. This patch is to add the related define_insn_and_split to support this kind of scenario. Bootstrapped/regtested on powerpc64le-linux-gnu P9 and powerpc64-linux-gnu P8. gcc/ChangeLog: * config/rs6000/rs6000.md (floatsi2_lfiwax__mem_zext): New define_insn_and_split. gcc/testsuite/ChangeLog: * gcc.target/powerpc/p9-fpcvt-3.c: New test. 
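For illustration only (these functions are invented and are not the committed test), the scenario described above looks roughly like this:

    /* Hypothetical example: loading an unsigned char/short and converting
       it straight to floating point.  With the new pattern the load and the
       implicit widening are combined, so Power9 can use lxsibzx/lxsihzx
       followed by a direct convert instead of a GPR load plus move.  */
    double
    uchar_to_double (unsigned char *p)
    {
      return (double) *p;
    }

    float
    ushort_to_float (unsigned short *p)
    {
      return (float) *p;
    }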
--- gcc/config/rs6000/rs6000.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index 89c70f4..510dbff 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -5524,6 +5524,27 @@ [(set_attr "length" "8") (set_attr "type" "fpload")]) +(define_insn_and_split "floatsi2_lfiwax__mem_zext" + [(set (match_operand:SFDF 0 "gpc_reg_operand" "=d,") + (float:SFDF + (zero_extend:SI + (match_operand:QHI 1 "indexed_or_indirect_operand" "Z,Z")))) + (clobber (match_scratch:DI 2 "=d,wa"))] + "TARGET_HARD_FLOAT && && TARGET_P9_VECTOR + && TARGET_POWERPC64 && TARGET_DIRECT_MOVE" + "#" + "&& 1" + [(pc)] +{ + if (GET_CODE (operands[2]) == SCRATCH) + operands[2] = gen_reg_rtx (DImode); + emit_insn (gen_zero_extendhidi2 (operands[2], operands[1])); + emit_insn (gen_floatdi2 (operands[0], operands[2])); + DONE; +} + [(set_attr "length" "8") + (set_attr "type" "fpload")]) + (define_insn "lfiwzx" [(set (match_operand:DI 0 "gpc_reg_operand" "=d,wa,wa,wa") (unspec:DI [(match_operand:SI 1 "reg_or_indexed_operand" "Z,Z,r,wa")] -- cgit v1.1 From 1fa991d1d74cb1ce96c48ede70ae0be7a9683ce3 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Fri, 11 Jun 2021 12:31:42 +0200 Subject: i386: Try to avoid variable permutation instruction [PR101021] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some permutations can be implemented without costly PSHUFB instruction, e.g.: { 8,9,10,11,12,13,14,15, 0,1,2,3,4,5,6,7 } with PALIGNR, { 0,1,2,3, 4,5,6,7, 4,5,6,7, 12,13,14,15 } with PSHUFD, { 0,1, 2,3, 2,3, 6,7, 8,9,10,11,12,13,14,15 } with PSHUFLW and { 0,1,2,3,4,5,6,7, 8,9, 10,11, 10,11, 14,15 } with PSHUFHW. All these instructions have constant shuffle control mask and do not need to load shuffle mask from a memory to a temporary XMM register. 2021-06-11 Uroš Bizjak gcc/ PR target/101021 * config/i386/i386-expand.c (expand_vec_perm_pshufb): Return false if the permutation can be implemented with constant permutation instruction in wider mode. (canonicalize_vector_int_perm): Move above expand_vec_perm_pshufb. Handle V8QImode and V4HImode. gcc/testsuite/ PR target/101021 * gcc.target/i386/pr101021-1.c: New test. * gcc.target/i386/pr101021-2.c: Ditto. --- gcc/config/i386/i386-expand.c | 109 ++++++++++++++++++++++-------------------- 1 file changed, 58 insertions(+), 51 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 9ee5257..2fa3a18 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -17354,6 +17354,59 @@ expand_vec_perm_vpermil (struct expand_vec_perm_d *d) return true; } +/* For V*[QHS]Imode permutations, check if the same permutation + can't be performed in a 2x, 4x or 8x wider inner mode. 
*/ + +static bool +canonicalize_vector_int_perm (const struct expand_vec_perm_d *d, + struct expand_vec_perm_d *nd) +{ + int i; + machine_mode mode = VOIDmode; + + switch (d->vmode) + { + case E_V8QImode: mode = V4HImode; break; + case E_V16QImode: mode = V8HImode; break; + case E_V32QImode: mode = V16HImode; break; + case E_V64QImode: mode = V32HImode; break; + case E_V4HImode: mode = V2SImode; break; + case E_V8HImode: mode = V4SImode; break; + case E_V16HImode: mode = V8SImode; break; + case E_V32HImode: mode = V16SImode; break; + case E_V4SImode: mode = V2DImode; break; + case E_V8SImode: mode = V4DImode; break; + case E_V16SImode: mode = V8DImode; break; + default: return false; + } + for (i = 0; i < d->nelt; i += 2) + if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1) + return false; + nd->vmode = mode; + nd->nelt = d->nelt / 2; + for (i = 0; i < nd->nelt; i++) + nd->perm[i] = d->perm[2 * i] / 2; + if (GET_MODE_INNER (mode) != DImode) + canonicalize_vector_int_perm (nd, nd); + if (nd != d) + { + nd->one_operand_p = d->one_operand_p; + nd->testing_p = d->testing_p; + if (d->op0 == d->op1) + nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0); + else + { + nd->op0 = gen_lowpart (nd->vmode, d->op0); + nd->op1 = gen_lowpart (nd->vmode, d->op1); + } + if (d->testing_p) + nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1); + else + nd->target = gen_reg_rtx (nd->vmode); + } + return true; +} + /* Return true if permutation D can be performed as VMODE permutation instead. */ @@ -17391,6 +17444,7 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) unsigned i, nelt, eltsz, mask; unsigned char perm[64]; machine_mode vmode = V16QImode; + struct expand_vec_perm_d nd; rtx rperm[64], vperm, target, op0, op1; nelt = d->nelt; @@ -17539,6 +17593,10 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) return false; } + /* Try to avoid variable permutation instruction. */ + if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd)) + return false; + if (d->testing_p) return true; @@ -17617,57 +17675,6 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) return true; } -/* For V*[QHS]Imode permutations, check if the same permutation - can't be performed in a 2x, 4x or 8x wider inner mode. 
*/ - -static bool -canonicalize_vector_int_perm (const struct expand_vec_perm_d *d, - struct expand_vec_perm_d *nd) -{ - int i; - machine_mode mode = VOIDmode; - - switch (d->vmode) - { - case E_V16QImode: mode = V8HImode; break; - case E_V32QImode: mode = V16HImode; break; - case E_V64QImode: mode = V32HImode; break; - case E_V8HImode: mode = V4SImode; break; - case E_V16HImode: mode = V8SImode; break; - case E_V32HImode: mode = V16SImode; break; - case E_V4SImode: mode = V2DImode; break; - case E_V8SImode: mode = V4DImode; break; - case E_V16SImode: mode = V8DImode; break; - default: return false; - } - for (i = 0; i < d->nelt; i += 2) - if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1) - return false; - nd->vmode = mode; - nd->nelt = d->nelt / 2; - for (i = 0; i < nd->nelt; i++) - nd->perm[i] = d->perm[2 * i] / 2; - if (GET_MODE_INNER (mode) != DImode) - canonicalize_vector_int_perm (nd, nd); - if (nd != d) - { - nd->one_operand_p = d->one_operand_p; - nd->testing_p = d->testing_p; - if (d->op0 == d->op1) - nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0); - else - { - nd->op0 = gen_lowpart (nd->vmode, d->op0); - nd->op1 = gen_lowpart (nd->vmode, d->op1); - } - if (d->testing_p) - nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1); - else - nd->target = gen_reg_rtx (nd->vmode); - } - return true; -} - /* Try to expand one-operand permutation with constant mask. */ static bool -- cgit v1.1 From a984da88a35b42f444d1f9eeba77aa520b950d35 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Fri, 11 Jun 2021 12:58:22 +0200 Subject: i386: Fix up *vec_concat_0_1 [PR101007] On Fri, Apr 23, 2021 at 12:53:58PM +0800, Hongtao Liu via Gcc-patches wrote: > -(define_insn "*vec_concatv4si_0" > - [(set (match_operand:V4SI 0 "register_operand" "=v,x") > - (vec_concat:V4SI > - (match_operand:V2SI 1 "nonimmediate_operand" "vm,?!*y") > - (match_operand:V2SI 2 "const0_operand" " C,C")))] > +(define_insn "*vec_concat_0" > + [(set (match_operand:VI124_128 0 "register_operand" "=v,x") > + (vec_concat:VI124_128 > + (match_operand: 1 "nonimmediate_operand" "vm,?!*y") > + (match_operand: 2 "const0_operand" " C,C")))] > "TARGET_SSE2" > "@ > %vmovq\t{%1, %0|%0, %1} > @@ -22154,6 +22157,24 @@ (define_insn "avx_vec_concat" > (set_attr "prefix" "maybe_evex") > (set_attr "mode" "")]) > > +(define_insn_and_split "*vec_concat_0" > + [(set (match_operand:V 0 "register_operand") > + (vec_select:V > + (vec_concat: > + (match_operand:V 1 "nonimmediate_operand") > + (match_operand:V 2 "const0_operand")) > + (match_parallel 3 "movq_parallel" > + [(match_operand 4 "const_int_operand")])))] > + "ix86_pre_reload_split ()" > + "#" > + "&& 1" > + [(set (match_dup 0) > + (vec_concat:V (match_dup 1) (match_dup 5)))] > +{ > + operands[1] = gen_lowpart (mode, operands[1]); > + operands[5] = CONST0_RTX (mode); > +}) This regressed the following testcase with -msse -mno-sse2. The define_insn_and_split splits the permutation into *vec_concat_0 or *vec_concatv2di_0 insns which both have TARGET_SSE2 in their conditions (for the former you can see it above), but the define_insn_and_split matches always when the V mode's condition do, which for V16QI/V8HI/V4SI/V2DI/V4SF modes is always (well, when those modes are valid, which is TARGET_SSE). 2021-06-11 Jakub Jelinek PR target/101007 * config/i386/sse.md (*vec_concat_0_1): Require TARGET_SSE2. * gcc.target/i386/sse-pr101007.c: New test. 
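Illustrative only (this is the rough shape of the failing case, not the committed testcase; the names are invented): with -msse -mno-sse2, a permutation that keeps the low half of a vector and zeroes the high half could be split into the SSE2-only patterns mentioned above.

    /* Hypothetical example: select elements 0 and 1 from x and two zeros,
       i.e. a movq-style "keep low 64 bits, clear high 64 bits" shuffle.  */
    typedef float v4sf __attribute__ ((vector_size (16)));
    typedef int   v4si __attribute__ ((vector_size (16)));

    v4sf
    keep_low_half (v4sf x)
    {
      v4sf zero = { 0.0f, 0.0f, 0.0f, 0.0f };
      v4si mask = { 0, 1, 4, 5 };
      return __builtin_shuffle (x, zero, mask);
    }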
--- gcc/config/i386/sse.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 8403a07..94296bc 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -22394,7 +22394,7 @@ (match_operand:V 2 "const0_operand")) (match_parallel 3 "movq_parallel" [(match_operand 4 "const_int_operand")])))] - "ix86_pre_reload_split ()" + "TARGET_SSE2 && ix86_pre_reload_split ()" "#" "&& 1" [(set (match_dup 0) -- cgit v1.1 From b13f297f01c943aa167f7c6eb94bed40dce0d553 Mon Sep 17 00:00:00 2001 From: Srinath Parvathaneni Date: Fri, 11 Jun 2021 18:11:58 +0100 Subject: arm: Fix polymorphic variants failing with undefined reference to `__ARM_undef` error. This patch fixes the issue mentioned in PR101016, which is mve polymorphic variants failing at linking with undefined reference to "__ARM_undef" error. gcc/ChangeLog: 2021-06-11 Srinath Parvathaneni PR target/101016 * config/arm/arm_mve.h (__arm_vld1q): Change __ARM_mve_coerce(p0, int8_t const *) to __ARM_mve_coerce1(p0, int8_t *) in the argument for the polymorphic variants matching code. (__arm_vld1q_z): Likewise. (__arm_vld2q): Likewise. (__arm_vld4q): Likewise. (__arm_vldrbq_gather_offset): Likewise. (__arm_vldrbq_gather_offset_z): Likewise. gcc/testsuite/ChangeLog: 2021-06-11 Srinath Parvathaneni PR target/101016 * gcc.target/arm/mve/intrinsics/pr101016.c: New test. --- gcc/config/arm/arm_mve.h | 151 ++++++++++++++++++++++++----------------------- 1 file changed, 76 insertions(+), 75 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h index 1380f3a..83f1003 100644 --- a/gcc/config/arm/arm_mve.h +++ b/gcc/config/arm/arm_mve.h @@ -37565,47 +37565,47 @@ extern void *__ARM_undef; #define __arm_vld1q(p0) (\ _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \ - int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld1q_s8 (__ARM_mve_coerce(p0, int8_t const *)), \ - int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld1q_s16 (__ARM_mve_coerce(p0, int16_t const *)), \ - int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld1q_s32 (__ARM_mve_coerce(p0, int32_t const *)), \ - int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld1q_u8 (__ARM_mve_coerce(p0, uint8_t const *)), \ - int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld1q_u16 (__ARM_mve_coerce(p0, uint16_t const *)), \ - int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld1q_u32 (__ARM_mve_coerce(p0, uint32_t const *)), \ - int (*)[__ARM_mve_type_float16_t_ptr]: __arm_vld1q_f16 (__ARM_mve_coerce(p0, float16_t const *)), \ - int (*)[__ARM_mve_type_float32_t_ptr]: __arm_vld1q_f32 (__ARM_mve_coerce(p0, float32_t const *)))) + int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld1q_s8 (__ARM_mve_coerce1(p0, int8_t *)), \ + int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld1q_s16 (__ARM_mve_coerce1(p0, int16_t *)), \ + int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld1q_s32 (__ARM_mve_coerce1(p0, int32_t *)), \ + int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld1q_u8 (__ARM_mve_coerce1(p0, uint8_t *)), \ + int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld1q_u16 (__ARM_mve_coerce1(p0, uint16_t *)), \ + int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld1q_u32 (__ARM_mve_coerce1(p0, uint32_t *)), \ + int (*)[__ARM_mve_type_float16_t_ptr]: __arm_vld1q_f16 (__ARM_mve_coerce1(p0, float16_t *)), \ + int (*)[__ARM_mve_type_float32_t_ptr]: __arm_vld1q_f32 (__ARM_mve_coerce1(p0, float32_t *)))) #define __arm_vld1q_z(p0,p1) ( \ _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \ - int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld1q_z_s8 
(__ARM_mve_coerce(p0, int8_t const *), p1), \ - int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld1q_z_s16 (__ARM_mve_coerce(p0, int16_t const *), p1), \ - int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld1q_z_s32 (__ARM_mve_coerce(p0, int32_t const *), p1), \ - int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld1q_z_u8 (__ARM_mve_coerce(p0, uint8_t const *), p1), \ - int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld1q_z_u16 (__ARM_mve_coerce(p0, uint16_t const *), p1), \ - int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld1q_z_u32 (__ARM_mve_coerce(p0, uint32_t const *), p1), \ - int (*)[__ARM_mve_type_float16_t_ptr]: __arm_vld1q_z_f16 (__ARM_mve_coerce(p0, float16_t const *), p1), \ - int (*)[__ARM_mve_type_float32_t_ptr]: __arm_vld1q_z_f32 (__ARM_mve_coerce(p0, float32_t const *), p1))) + int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld1q_z_s8 (__ARM_mve_coerce1(p0, int8_t *), p1), \ + int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld1q_z_s16 (__ARM_mve_coerce1(p0, int16_t *), p1), \ + int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld1q_z_s32 (__ARM_mve_coerce1(p0, int32_t *), p1), \ + int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld1q_z_u8 (__ARM_mve_coerce1(p0, uint8_t *), p1), \ + int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld1q_z_u16 (__ARM_mve_coerce1(p0, uint16_t *), p1), \ + int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld1q_z_u32 (__ARM_mve_coerce1(p0, uint32_t *), p1), \ + int (*)[__ARM_mve_type_float16_t_ptr]: __arm_vld1q_z_f16 (__ARM_mve_coerce1(p0, float16_t *), p1), \ + int (*)[__ARM_mve_type_float32_t_ptr]: __arm_vld1q_z_f32 (__ARM_mve_coerce1(p0, float32_t *), p1))) #define __arm_vld2q(p0) ( \ _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \ - int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld2q_s8 (__ARM_mve_coerce(p0, int8_t const *)), \ - int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld2q_s16 (__ARM_mve_coerce(p0, int16_t const *)), \ - int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld2q_s32 (__ARM_mve_coerce(p0, int32_t const *)), \ - int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld2q_u8 (__ARM_mve_coerce(p0, uint8_t const *)), \ - int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld2q_u16 (__ARM_mve_coerce(p0, uint16_t const *)), \ - int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld2q_u32 (__ARM_mve_coerce(p0, uint32_t const *)), \ - int (*)[__ARM_mve_type_float16_t_ptr]: __arm_vld2q_f16 (__ARM_mve_coerce(p0, float16_t const *)), \ - int (*)[__ARM_mve_type_float32_t_ptr]: __arm_vld2q_f32 (__ARM_mve_coerce(p0, float32_t const *)))) + int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld2q_s8 (__ARM_mve_coerce1(p0, int8_t *)), \ + int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld2q_s16 (__ARM_mve_coerce1(p0, int16_t *)), \ + int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld2q_s32 (__ARM_mve_coerce1(p0, int32_t *)), \ + int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld2q_u8 (__ARM_mve_coerce1(p0, uint8_t *)), \ + int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld2q_u16 (__ARM_mve_coerce1(p0, uint16_t *)), \ + int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld2q_u32 (__ARM_mve_coerce1(p0, uint32_t *)), \ + int (*)[__ARM_mve_type_float16_t_ptr]: __arm_vld2q_f16 (__ARM_mve_coerce1(p0, float16_t *)), \ + int (*)[__ARM_mve_type_float32_t_ptr]: __arm_vld2q_f32 (__ARM_mve_coerce1(p0, float32_t *)))) #define __arm_vld4q(p0) ( \ _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \ - int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld4q_s8 (__ARM_mve_coerce(p0, int8_t const *)), \ - int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld4q_s16 (__ARM_mve_coerce(p0, int16_t const *)), \ - int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld4q_s32 (__ARM_mve_coerce(p0, int32_t 
const *)), \ - int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld4q_u8 (__ARM_mve_coerce(p0, uint8_t const *)), \ - int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld4q_u16 (__ARM_mve_coerce(p0, uint16_t const *)), \ - int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld4q_u32 (__ARM_mve_coerce(p0, uint32_t const *)), \ - int (*)[__ARM_mve_type_float16_t_ptr]: __arm_vld4q_f16 (__ARM_mve_coerce(p0, float16_t const *)), \ - int (*)[__ARM_mve_type_float32_t_ptr]: __arm_vld4q_f32 (__ARM_mve_coerce(p0, float32_t const *)))) + int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld4q_s8 (__ARM_mve_coerce1(p0, int8_t *)), \ + int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld4q_s16 (__ARM_mve_coerce1(p0, int16_t *)), \ + int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld4q_s32 (__ARM_mve_coerce1(p0, int32_t *)), \ + int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld4q_u8 (__ARM_mve_coerce1(p0, uint8_t *)), \ + int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld4q_u16 (__ARM_mve_coerce1(p0, uint16_t *)), \ + int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld4q_u32 (__ARM_mve_coerce1(p0, uint32_t *)), \ + int (*)[__ARM_mve_type_float16_t_ptr]: __arm_vld4q_f16 (__ARM_mve_coerce1(p0, float16_t *)), \ + int (*)[__ARM_mve_type_float32_t_ptr]: __arm_vld4q_f32 (__ARM_mve_coerce1(p0, float32_t *)))) #define __arm_vldrhq_gather_offset(p0,p1) ({ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \ @@ -39631,25 +39631,26 @@ extern void *__ARM_undef; #define __arm_vldrbq_gather_offset(p0,p1) ({ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vldrbq_gather_offset_s8 (__ARM_mve_coerce(p0, int8_t const *), __ARM_mve_coerce(__p1, uint8x16_t)), \ - int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vldrbq_gather_offset_s16 (__ARM_mve_coerce(p0, int8_t const *), __ARM_mve_coerce(__p1, uint16x8_t)), \ - int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vldrbq_gather_offset_s32 (__ARM_mve_coerce(p0, int8_t const *), __ARM_mve_coerce(__p1, uint32x4_t)), \ - int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vldrbq_gather_offset_u8 (__ARM_mve_coerce(p0, uint8_t const *), __ARM_mve_coerce(__p1, uint8x16_t)), \ - int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vldrbq_gather_offset_u16 (__ARM_mve_coerce(p0, uint8_t const *), __ARM_mve_coerce(__p1, uint16x8_t)), \ - int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vldrbq_gather_offset_u32 (__ARM_mve_coerce(p0, uint8_t const *), __ARM_mve_coerce(__p1, uint32x4_t)));}) + int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vldrbq_gather_offset_s8 (__ARM_mve_coerce1(p0, int8_t *), __ARM_mve_coerce(__p1, uint8x16_t)), \ + int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vldrbq_gather_offset_s16 (__ARM_mve_coerce1(p0, int8_t *), __ARM_mve_coerce(__p1, uint16x8_t)), \ + int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vldrbq_gather_offset_s32 (__ARM_mve_coerce1(p0, int8_t *), __ARM_mve_coerce(__p1, uint32x4_t)), \ + int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vldrbq_gather_offset_u8 (__ARM_mve_coerce1(p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16_t)), \ + int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vldrbq_gather_offset_u16 (__ARM_mve_coerce1(p0, uint8_t *), __ARM_mve_coerce(__p1, uint16x8_t)), \ + int 
(*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vldrbq_gather_offset_u32 (__ARM_mve_coerce1(p0, uint8_t *), __ARM_mve_coerce(__p1, uint32x4_t)));}) #define __arm_vstrwq_scatter_base_p(p0,p1,p2,p3) ({ __typeof(p2) __p2 = (p2); \ _Generic( (int (*)[__ARM_mve_typeid(__p2)])0, \ int (*)[__ARM_mve_type_int32x4_t]: __arm_vstrwq_scatter_base_p_s32 (p0, p1, __ARM_mve_coerce(__p2, int32x4_t), p3), \ int (*)[__ARM_mve_type_uint32x4_t]: __arm_vstrwq_scatter_base_p_u32 (p0, p1, __ARM_mve_coerce(__p2, uint32x4_t), p3));}) -#define __arm_vld1q(p0) (_Generic( (int (*)[__ARM_mve_typeid(p0)])0, \ - int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld1q_s8 (__ARM_mve_coerce(p0, int8_t const *)), \ - int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld1q_s16 (__ARM_mve_coerce(p0, int16_t const *)), \ - int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld1q_s32 (__ARM_mve_coerce(p0, int32_t const *)), \ - int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld1q_u8 (__ARM_mve_coerce(p0, uint8_t const *)), \ - int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld1q_u16 (__ARM_mve_coerce(p0, uint16_t const *)), \ - int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld1q_u32 (__ARM_mve_coerce(p0, uint32_t const *)))) +#define __arm_vld1q(p0) (\ + _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \ + int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld1q_s8 (__ARM_mve_coerce1(p0, int8_t *)), \ + int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld1q_s16 (__ARM_mve_coerce1(p0, int16_t *)), \ + int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld1q_s32 (__ARM_mve_coerce1(p0, int32_t *)), \ + int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld1q_u8 (__ARM_mve_coerce1(p0, uint8_t *)), \ + int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld1q_u16 (__ARM_mve_coerce1(p0, uint16_t *)), \ + int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld1q_u32 (__ARM_mve_coerce1(p0, uint32_t *)))) #define __arm_vldrhq_gather_offset(p0,p1) ({ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \ @@ -40146,29 +40147,29 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint32x4_t]: __arm_vbrsrq_x_n_u32 (__ARM_mve_coerce(__p1, uint32x4_t), p2, p3));}) #define __arm_vld1q_z(p0,p1) ( _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \ - int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld1q_z_s8 (__ARM_mve_coerce(p0, int8_t const *), p1), \ - int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld1q_z_s16 (__ARM_mve_coerce(p0, int16_t const *), p1), \ - int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld1q_z_s32 (__ARM_mve_coerce(p0, int32_t const *), p1), \ - int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld1q_z_u8 (__ARM_mve_coerce(p0, uint8_t const *), p1), \ - int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld1q_z_u16 (__ARM_mve_coerce(p0, uint16_t const *), p1), \ - int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld1q_z_u32 (__ARM_mve_coerce(p0, uint32_t const *), p1))) + int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld1q_z_s8 (__ARM_mve_coerce1(p0, int8_t *), p1), \ + int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld1q_z_s16 (__ARM_mve_coerce1(p0, int16_t *), p1), \ + int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld1q_z_s32 (__ARM_mve_coerce1(p0, int32_t *), p1), \ + int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld1q_z_u8 (__ARM_mve_coerce1(p0, uint8_t *), p1), \ + int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld1q_z_u16 (__ARM_mve_coerce1(p0, uint16_t *), p1), \ + int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld1q_z_u32 (__ARM_mve_coerce1(p0, uint32_t *), p1))) #define __arm_vld2q(p0) ( _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \ - int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld2q_s8 
(__ARM_mve_coerce(p0, int8_t const *)), \ - int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld2q_s16 (__ARM_mve_coerce(p0, int16_t const *)), \ - int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld2q_s32 (__ARM_mve_coerce(p0, int32_t const *)), \ - int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld2q_u8 (__ARM_mve_coerce(p0, uint8_t const *)), \ - int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld2q_u16 (__ARM_mve_coerce(p0, uint16_t const *)), \ - int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld2q_u32 (__ARM_mve_coerce(p0, uint32_t const *)))) + int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld2q_s8 (__ARM_mve_coerce1(p0, int8_t *)), \ + int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld2q_s16 (__ARM_mve_coerce1(p0, int16_t *)), \ + int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld2q_s32 (__ARM_mve_coerce1(p0, int32_t *)), \ + int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld2q_u8 (__ARM_mve_coerce1(p0, uint8_t *)), \ + int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld2q_u16 (__ARM_mve_coerce1(p0, uint16_t *)), \ + int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld2q_u32 (__ARM_mve_coerce1(p0, uint32_t *)))) #define __arm_vld4q(p0) ( _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \ - int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld4q_s8 (__ARM_mve_coerce(p0, int8_t const *)), \ - int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld4q_s16 (__ARM_mve_coerce(p0, int16_t const *)), \ - int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld4q_s32 (__ARM_mve_coerce(p0, int32_t const *)), \ - int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld4q_u8 (__ARM_mve_coerce(p0, uint8_t const *)), \ - int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld4q_u16 (__ARM_mve_coerce(p0, uint16_t const *)), \ - int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld4q_u32 (__ARM_mve_coerce(p0, uint32_t const *)))) + int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld4q_s8 (__ARM_mve_coerce1(p0, int8_t *)), \ + int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld4q_s16 (__ARM_mve_coerce1(p0, int16_t *)), \ + int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld4q_s32 (__ARM_mve_coerce1(p0, int32_t *)), \ + int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld4q_u8 (__ARM_mve_coerce1(p0, uint8_t *)), \ + int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld4q_u16 (__ARM_mve_coerce1(p0, uint16_t *)), \ + int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld4q_u32 (__ARM_mve_coerce1(p0, uint32_t *)))) #define __arm_vgetq_lane(p0,p1) ({ __typeof(p0) __p0 = (p0); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \ @@ -40515,12 +40516,12 @@ extern void *__ARM_undef; #define __arm_vldrbq_gather_offset_z(p0,p1,p2) ({ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vldrbq_gather_offset_z_s8 (__ARM_mve_coerce(p0, int8_t const *), __ARM_mve_coerce(__p1, uint8x16_t), p2), \ - int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vldrbq_gather_offset_z_s16 (__ARM_mve_coerce(p0, int8_t const *), __ARM_mve_coerce(__p1, uint16x8_t), p2), \ - int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vldrbq_gather_offset_z_s32 (__ARM_mve_coerce(p0, int8_t const *), __ARM_mve_coerce(__p1, uint32x4_t), p2), \ - int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vldrbq_gather_offset_z_u8 (__ARM_mve_coerce(p0, uint8_t const *), __ARM_mve_coerce(__p1, uint8x16_t), p2), \ - int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vldrbq_gather_offset_z_u16 (__ARM_mve_coerce(p0, uint8_t const *), __ARM_mve_coerce(__p1, uint16x8_t), p2), \ - int 
(*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vldrbq_gather_offset_z_u32 (__ARM_mve_coerce(p0, uint8_t const *), __ARM_mve_coerce(__p1, uint32x4_t), p2));}) + int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vldrbq_gather_offset_z_s8 (__ARM_mve_coerce1(p0, int8_t *), __ARM_mve_coerce(__p1, uint8x16_t), p2), \ + int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vldrbq_gather_offset_z_s16 (__ARM_mve_coerce1(p0, int8_t *), __ARM_mve_coerce(__p1, uint16x8_t), p2), \ + int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vldrbq_gather_offset_z_s32 (__ARM_mve_coerce1(p0, int8_t *), __ARM_mve_coerce(__p1, uint32x4_t), p2), \ + int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vldrbq_gather_offset_z_u8 (__ARM_mve_coerce1(p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16_t), p2), \ + int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vldrbq_gather_offset_z_u16 (__ARM_mve_coerce1(p0, uint8_t *), __ARM_mve_coerce(__p1, uint16x8_t), p2), \ + int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vldrbq_gather_offset_z_u32 (__ARM_mve_coerce1(p0, uint8_t *), __ARM_mve_coerce(__p1, uint32x4_t), p2));}) #define __arm_vqrdmlahq_m(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -41201,12 +41202,12 @@ extern void *__ARM_undef; #define __arm_vldrbq_gather_offset(p0,p1) ({ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vldrbq_gather_offset_s8 (__ARM_mve_coerce(p0, int8_t const *), __ARM_mve_coerce(__p1, uint8x16_t)), \ - int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vldrbq_gather_offset_s16 (__ARM_mve_coerce(p0, int8_t const *), __ARM_mve_coerce(__p1, uint16x8_t)), \ - int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vldrbq_gather_offset_s32 (__ARM_mve_coerce(p0, int8_t const *), __ARM_mve_coerce(__p1, uint32x4_t)), \ - int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vldrbq_gather_offset_u8 (__ARM_mve_coerce(p0, uint8_t const *), __ARM_mve_coerce(__p1, uint8x16_t)), \ - int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vldrbq_gather_offset_u16 (__ARM_mve_coerce(p0, uint8_t const *), __ARM_mve_coerce(__p1, uint16x8_t)), \ - int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vldrbq_gather_offset_u32 (__ARM_mve_coerce(p0, uint8_t const *), __ARM_mve_coerce(__p1, uint32x4_t)));}) + int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vldrbq_gather_offset_s8(__ARM_mve_coerce1(p0, int8_t *), __ARM_mve_coerce(__p1, uint8x16_t)), \ + int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vldrbq_gather_offset_s16(__ARM_mve_coerce1(p0, int8_t *), __ARM_mve_coerce(__p1, uint16x8_t)), \ + int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vldrbq_gather_offset_s32(__ARM_mve_coerce1(p0, int8_t *), __ARM_mve_coerce(__p1, uint32x4_t)), \ + int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vldrbq_gather_offset_u8(__ARM_mve_coerce1(p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16_t)), \ + int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vldrbq_gather_offset_u16(__ARM_mve_coerce1(p0, uint8_t *), __ARM_mve_coerce(__p1, uint16x8_t)), \ + int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vldrbq_gather_offset_u32(__ARM_mve_coerce1(p0, uint8_t *), 
__ARM_mve_coerce(__p1, uint32x4_t)));}) #define __arm_vidupq_m(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ -- cgit v1.1 From 3f04e3782536ad2f9cfbb8cfe6630e9f9dd8af4c Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 11 Jun 2021 07:31:29 -0700 Subject: x86: Replace ix86_red_zone_size with ix86_red_zone_used Add red_zone_used to machine_function to track if red zone is used. When expanding function prologue, set red_zone_used to true if red zone is used. gcc/ PR target/101023 * config/i386/i386.c (ix86_expand_prologue): Set red_zone_used to true if red zone is used. (ix86_output_indirect_jmp): Replace ix86_red_zone_size with ix86_red_zone_used. * config/i386/i386.h (machine_function): Add red_zone_used. (ix86_red_zone_size): Removed. (ix86_red_zone_used): New. * config/i386/i386.md (peephole2 patterns): Replace ix86_red_zone_size with ix86_red_zone_used. gcc/testsuite/ PR target/101023 * g++.target/i386/pr101023a.C: New test. * g++.target/i386/pr101023b.C: Likewise. --- gcc/config/i386/i386.c | 6 +++++- gcc/config/i386/i386.h | 5 ++++- gcc/config/i386/i386.md | 8 ++++---- 3 files changed, 13 insertions(+), 6 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 05b8dc8..a612558 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -8401,10 +8401,14 @@ ix86_expand_prologue (void) || frame.stack_pointer_offset < CHECK_STACK_LIMIT)) { ix86_emit_save_regs_using_mov (frame.reg_save_offset); + cfun->machine->red_zone_used = true; int_registers_saved = true; } } + if (frame.red_zone_size != 0) + cfun->machine->red_zone_used = true; + if (stack_realign_fp) { int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT; @@ -15915,7 +15919,7 @@ ix86_output_indirect_jmp (rtx call_op) { /* We can't have red-zone since "call" in the indirect thunk pushes the return address onto stack, destroying red-zone. */ - if (ix86_red_zone_size != 0) + if (ix86_red_zone_used) gcc_unreachable (); ix86_output_indirect_branch (call_op, "%0", true); diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 919d0b2..182b327 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -2663,6 +2663,9 @@ struct GTY(()) machine_function { invalid calls. */ BOOL_BITFIELD silent_p : 1; + /* True if red zone is used. */ + BOOL_BITFIELD red_zone_used : 1; + /* The largest alignment, in bytes, of stack slot actually used. */ unsigned int max_used_stack_alignment; @@ -2693,7 +2696,7 @@ extern GTY(()) tree ms_va_list_type_node; #define ix86_current_function_calls_tls_descriptor \ (ix86_tls_descriptor_calls_expanded_in_cfun && df_regs_ever_live_p (SP_REG)) #define ix86_static_chain_on_stack (cfun->machine->static_chain_on_stack) -#define ix86_red_zone_size (cfun->machine->frame.red_zone_size) +#define ix86_red_zone_used (cfun->machine->red_zone_used) /* Control behavior of x86_file_start. 
*/ #define X86_FILE_START_VERSION_DIRECTIVE false diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 7743c61e..6e4abf3 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -20491,7 +20491,7 @@ (clobber (mem:BLK (scratch)))])] "(TARGET_SINGLE_PUSH || optimize_insn_for_size_p ()) && INTVAL (operands[0]) == -GET_MODE_SIZE (word_mode) - && ix86_red_zone_size == 0" + && !ix86_red_zone_used" [(clobber (match_dup 1)) (parallel [(set (mem:W (pre_dec:P (reg:P SP_REG))) (match_dup 1)) (clobber (mem:BLK (scratch)))])]) @@ -20505,7 +20505,7 @@ (clobber (mem:BLK (scratch)))])] "(TARGET_DOUBLE_PUSH || optimize_insn_for_size_p ()) && INTVAL (operands[0]) == -2*GET_MODE_SIZE (word_mode) - && ix86_red_zone_size == 0" + && !ix86_red_zone_used" [(clobber (match_dup 1)) (set (mem:W (pre_dec:P (reg:P SP_REG))) (match_dup 1)) (parallel [(set (mem:W (pre_dec:P (reg:P SP_REG))) (match_dup 1)) @@ -20520,7 +20520,7 @@ (clobber (reg:CC FLAGS_REG))])] "(TARGET_SINGLE_PUSH || optimize_insn_for_size_p ()) && INTVAL (operands[0]) == -GET_MODE_SIZE (word_mode) - && ix86_red_zone_size == 0" + && !ix86_red_zone_used" [(clobber (match_dup 1)) (set (mem:W (pre_dec:P (reg:P SP_REG))) (match_dup 1))]) @@ -20532,7 +20532,7 @@ (clobber (reg:CC FLAGS_REG))])] "(TARGET_DOUBLE_PUSH || optimize_insn_for_size_p ()) && INTVAL (operands[0]) == -2*GET_MODE_SIZE (word_mode) - && ix86_red_zone_size == 0" + && !ix86_red_zone_used" [(clobber (match_dup 1)) (set (mem:W (pre_dec:P (reg:P SP_REG))) (match_dup 1)) (set (mem:W (pre_dec:P (reg:P SP_REG))) (match_dup 1))]) -- cgit v1.1 From 8a7d54b1e10b8f4fba1358260ed2e7056ed23cbd Mon Sep 17 00:00:00 2001 From: Jeff Law Date: Sun, 13 Jun 2021 11:09:38 -0400 Subject: [committed] More improvements to H8 logicals for test/compare elimination gcc/ * config/h8300/logical.md (qi3_1): New pattern. (andqi3_1): Removed. (qi3_1): Do not split for IOR/XOR a single bit. (H8/SX bit logicals): Split out from other patterns. * config/h8300/multiply.md (mulqihi3_const): Renamed from mulqihi3_const_clobber_flags. (mulqihi3, mulhisi3_const, mulhisi3): Similarly --- gcc/config/h8300/logical.md | 64 +++++++++++++++++++++++++++----------------- gcc/config/h8300/multiply.md | 8 +++--- 2 files changed, 43 insertions(+), 29 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/h8300/logical.md b/gcc/config/h8300/logical.md index fae3c7c..cb4c638 100644 --- a/gcc/config/h8300/logical.md +++ b/gcc/config/h8300/logical.md @@ -69,14 +69,6 @@ "" [(set_attr "length" "8,2")]) -(define_insn "*andqi3_1" - [(set (match_operand:QI 0 "register_operand" "=r") - (and:QI (match_operand:QI 1 "register_operand" "%0") - (match_operand:QI 2 "h8300_src_operand" "rn"))) - (clobber (reg:CC CC_REG))] - "" - "and %X2,%X0" - [(set_attr "length" "2")]) (define_insn_and_split "*andor3" [(set (match_operand:QHSI 0 "register_operand" "=r") @@ -179,27 +171,49 @@ (match_operand:QI 2 "h8300_src_operand" "Y2,rQi")))] "TARGET_H8300SX || register_operand (operands[0], QImode) || single_one_operand (operands[2], QImode)" - "#" - "&& reload_completed" + { return == IOR ? 
"bset\\t%V2,%R0" : "bnot\\t%V2,%R0"; } + "&& reload_completed && !single_one_operand (operands[2], QImode)" [(parallel [(set (match_dup 0) (ors:QI (match_dup 1) (match_dup 2))) - (clobber (reg:CC CC_REG))])]) + (clobber (reg:CC CC_REG))])] + "" + [(set_attr "length" "8")]) -(define_insn "qi3_1_clobber_flags" - [(set (match_operand:QI 0 "bit_operand" "=U,rQ") - (ors:QI (match_operand:QI 1 "bit_operand" "%0,0") - (match_operand:QI 2 "h8300_src_operand" "Y2,rQi"))) +(define_insn "*qi3_1" + [(set (match_operand:QI 0 "bit_operand" "=rQ") + (ors:QI (match_operand:QI 1 "bit_operand" "%0") + (match_operand:QI 2 "h8300_src_operand" "rQi"))) (clobber (reg:CC CC_REG))] - "TARGET_H8300SX || register_operand (operands[0], QImode) - || single_one_operand (operands[2], QImode)" - { - if (which_alternative == 0) - return == IOR ? "bset\\t%V2,%R0" : "bnot\\t%V2,%R0"; - else if (which_alternative == 1) - return == IOR ? "or\\t%X2,%X0" : "xor\\t%X2,%X0"; - gcc_unreachable (); + "TARGET_H8300SX" + { return == IOR ? "or\\t%X2,%X0" : "xor\\t%X2,%X0"; } + [(set_attr "length" "*") + (set_attr "length_table" "logicb")]) + +(define_insn "*qi3_1" + [(set (match_operand:QI 0 "register_operand" "=r") + (ors:QI (match_operand:QI 1 "register_operand" "%0") + (match_operand:QI 2 "h8300_src_operand" "ri"))) + (clobber (reg:CC CC_REG))] + "TARGET_H8300SX" + { return == IOR ? "or\\t%X2,%X0" : "xor\\t%X2,%X0"; } + [(set_attr "length" "*") + (set_attr "length_table" "logicb")]) + +(define_insn "*qi3_1" + [(set (match_operand:QI 0 "register_operand" "=r") + (logicals:QI (match_operand:QI 1 "register_operand" "%0") + (match_operand:QI 2 "h8300_src_operand" "rn"))) + (clobber (reg:CC CC_REG))] + "" + { + if ( == IOR) + return "or\\t%X2,%X0"; + else if ( == XOR) + return "xor\\t%X2,%X0"; + else if ( == AND) + return "and\\t%X2,%X0"; + gcc_unreachable (); } - [(set_attr "length" "8,*") - (set_attr "length_table" "*,logicb")]) + [(set_attr "length" "2")]) ;; ---------------------------------------------------------------------- ;; {AND,IOR,XOR}{HI3,SI3} PATTERNS diff --git a/gcc/config/h8300/multiply.md b/gcc/config/h8300/multiply.md index 1d56d47..8b9328c 100644 --- a/gcc/config/h8300/multiply.md +++ b/gcc/config/h8300/multiply.md @@ -26,7 +26,7 @@ (mult:HI (sign_extend:HI (match_dup 1)) (match_dup 2))) (clobber (reg:CC CC_REG))])]) -(define_insn "*mulqihi3_const_clobber_flags" +(define_insn "*mulqihi3_const" [(set (match_operand:HI 0 "register_operand" "=r") (mult:HI (sign_extend:HI (match_operand:QI 1 "register_operand" "%0")) (match_operand:QI 2 "nibble_operand" "IP4>X"))) @@ -47,7 +47,7 @@ (sign_extend:HI (match_dup 2)))) (clobber (reg:CC CC_REG))])]) -(define_insn "*mulqihi3_clobber_flags" +(define_insn "*mulqihi3" [(set (match_operand:HI 0 "register_operand" "=r") (mult:HI (sign_extend:HI (match_operand:QI 1 "register_operand" "%0")) (sign_extend:HI (match_operand:QI 2 "register_operand" "r")))) @@ -78,7 +78,7 @@ (mult:SI (sign_extend:SI (match_dup 1)) (match_dup 2))) (clobber (reg:CC CC_REG))])]) -(define_insn "*mulhisi3_const_clobber_flags" +(define_insn "*mulhisi3_const" [(set (match_operand:SI 0 "register_operand" "=r") (mult:SI (sign_extend:SI (match_operand:HI 1 "register_operand" "%0")) (match_operand:SI 2 "nibble_operand" "IP4>X"))) @@ -99,7 +99,7 @@ (sign_extend:SI (match_dup 2)))) (clobber (reg:CC CC_REG))])]) -(define_insn "*mulhisi3_clobber_flags" +(define_insn "*mulhisi3" [(set (match_operand:SI 0 "register_operand" "=r") (mult:SI (sign_extend:SI (match_operand:HI 1 "register_operand" "%0")) (sign_extend:SI 
(match_operand:HI 2 "register_operand" "r")))) -- cgit v1.1 From 681143b9b94d7f1c88a7c34e2250865c31191959 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Sun, 13 Jun 2021 21:50:51 +0200 Subject: i386: Improve variable permutation insn avoidance [PR101021] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Emit constant permutation insn directly from expand_vec_perm_shufb. 2021-06-13 Uroš Bizjak gcc/ PR target/101021 * config/i386/i386-expand.c (expand_vec_perm_pshufb): Emit constant permutation insn directly from here. --- gcc/config/i386/i386-expand.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 2fa3a18..6e33f6f 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -17593,13 +17593,16 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) return false; } - /* Try to avoid variable permutation instruction. */ - if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd)) - return false; - if (d->testing_p) return true; + /* Try to avoid variable permutation instruction. */ + if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd)) + { + emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target)); + return true; + } + if (vmode == V8SImode) for (i = 0; i < 8; ++i) rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7); -- cgit v1.1 From c4c47a84a16a29d7077f537051d90de1f3475957 Mon Sep 17 00:00:00 2001 From: Claudiu Zissulescu Date: Mon, 14 Jun 2021 15:33:17 +0300 Subject: arc: Add --with-fpu support for ARCv2 cpus Support for a compile-time default FPU. The --with-fpu configuration option is ignored if -mfpu compiler option is specified. The FPU options are only available for ARCv2 cpus. gcc/ 2021-06-14 Claudiu Zissulescu * config.gcc (arc): Add support for with_cpu option. * config/arc/arc.h (OPTION_DEFAULT_SPECS): Add fpu. Signed-off-by: Claudiu Zissulescu --- gcc/config/arc/arc.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/arc/arc.h b/gcc/config/arc/arc.h index 0224ae6..8cd6350 100644 --- a/gcc/config/arc/arc.h +++ b/gcc/config/arc/arc.h @@ -100,7 +100,11 @@ extern const char *arc_cpu_to_as (int argc, const char **argv); "%:cpu_to_as(%{mcpu=*:%*}) %{mspfp*} %{mdpfp*} " \ "%{mfpu=fpuda*:-mfpuda} %{mcode-density}" +/* Support for a compile-time default CPU and FPU. The rules are: + --with-cpu is ignored if -mcpu, mARC*, marc*, mA7, mA6 are specified. + --with-fpu is ignored if -mfpu is specified. */ #define OPTION_DEFAULT_SPECS \ + {"fpu", "%{!mfpu=*:-mfpu=%(VALUE)}"}, \ {"cpu", "%{!mcpu=*:%{!mARC*:%{!marc*:%{!mA7:%{!mA6:-mcpu=%(VALUE)}}}}}" } #ifndef DRIVER_ENDIAN_SELF_SPECS -- cgit v1.1 From 046a3beb1673bf4a61c131373b6a5e84158e92bf Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Thu, 3 Jun 2021 14:35:50 +0000 Subject: arm: Auto-vectorization for MVE: add pack/unpack patterns This patch adds vec_unpack_hi_, vec_unpack_lo_, vec_pack_trunc_ patterns for MVE. It does so by moving the unpack patterns from neon.md to vec-common.md, while adding them support for MVE. The pack expander is derived from the Neon one (which in turn is renamed into neon_quad_vec_pack_trunc_). The patch introduces mve_vec_unpack_lo_ and mve_vec_unpack_hi_ which are similar to their Neon counterparts, except for the assembly syntax. 
The patch introduces mve_vec_pack_trunc_lo_ to avoid the need for a zero-initialized temporary, which is needed if the vec_pack_trunc_ expander calls @mve_vmovn[bt]q_ instead. With this patch, we can now vectorize the 16 and 8-bit versions of vclz and vshl, although the generated code could still be improved. For test_clz_s16, we now generate vldrh.16 q3, [r1] vmovlb.s16 q2, q3 vmovlt.s16 q3, q3 vclz.i32 q2, q2 vclz.i32 q3, q3 vmovnb.i32 q1, q2 vmovnt.i32 q1, q3 vstrh.16 q1, [r0] which could be improved to vldrh.16 q3, [r1] vclz.i16 q1, q3 vstrh.16 q1, [r0] if we could avoid the need for unpack/pack steps. For reference, clang-12 generates: vldrh.s32 q0, [r1] vldrh.s32 q1, [r1, #8] vclz.i32 q0, q0 vstrh.32 q0, [r0] vclz.i32 q0, q1 vstrh.32 q0, [r0, #8] 2021-06-11 Christophe Lyon gcc/ * config/arm/mve.md (mve_vec_unpack_lo_): New pattern. (mve_vec_unpack_hi_): New pattern. (@mve_vec_pack_trunc_lo_): New pattern. (mve_vmovntq_): Prefix with '@'. * config/arm/neon.md (vec_unpack_hi_): Move to vec-common.md. (vec_unpack_lo_): Likewise. (vec_pack_trunc_): Rename to neon_quad_vec_pack_trunc_. * config/arm/vec-common.md (vec_unpack_hi_): New pattern. (vec_unpack_lo_): New. (vec_pack_trunc_): New. gcc/testsuite/ * gcc.target/arm/simd/mve-vclz.c: Update expected results. * gcc.target/arm/simd/mve-vshl.c: Likewise. * gcc.target/arm/simd/mve-vec-pack.c: New test. * gcc.target/arm/simd/mve-vec-unpack.c: New test. --- gcc/config/arm/mve.md | 35 +++++++++++++++++++++- gcc/config/arm/neon.md | 39 +----------------------- gcc/config/arm/vec-common.md | 70 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 105 insertions(+), 39 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index 99e46d0..e393518 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -535,6 +535,26 @@ [(set_attr "type" "mve_move") ]) +(define_insn "mve_vec_unpack_lo_" + [(set (match_operand: 0 "register_operand" "=w") + (SE: (vec_select: + (match_operand:MVE_3 1 "register_operand" "w") + (match_operand:MVE_3 2 "vect_par_constant_low" ""))))] + "TARGET_HAVE_MVE" + "vmovlb.%# %q0, %q1" + [(set_attr "type" "mve_move")] +) + +(define_insn "mve_vec_unpack_hi_" + [(set (match_operand: 0 "register_operand" "=w") + (SE: (vec_select: + (match_operand:MVE_3 1 "register_operand" "w") + (match_operand:MVE_3 2 "vect_par_constant_high" ""))))] + "TARGET_HAVE_MVE" + "vmovlt.%# %q0, %q1" + [(set_attr "type" "mve_move")] +) + ;; ;; [vcvtpq_s, vcvtpq_u]) ;; @@ -2199,10 +2219,23 @@ [(set_attr "type" "mve_move") ]) +;; vmovnb pattern used by the vec_pack_trunc expander to avoid the +;; need for an uninitialized input operand. 
+(define_insn "@mve_vec_pack_trunc_lo_" + [ + (set (match_operand: 0 "s_register_operand" "=w") + (unspec: [(match_operand:MVE_5 1 "s_register_operand" "w")] + VMOVNBQ_S)) + ] + "TARGET_HAVE_MVE" + "vmovnb.i%# %q0, %q1" + [(set_attr "type" "mve_move") +]) + ;; ;; [vmovntq_s, vmovntq_u]) ;; -(define_insn "mve_vmovntq_" +(define_insn "@mve_vmovntq_" [ (set (match_operand: 0 "s_register_operand" "=w") (unspec: [(match_operand: 1 "s_register_operand" "0") diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 0fdffaf..392d960 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -5924,43 +5924,6 @@ if (BYTES_BIG_ENDIAN) [(set_attr "type" "neon_shift_imm_long")] ) -(define_expand "vec_unpack_hi_" - [(match_operand: 0 "register_operand") - (SE: (match_operand:VU 1 "register_operand"))] - "TARGET_NEON && !BYTES_BIG_ENDIAN" - { - rtvec v = rtvec_alloc (/2) ; - rtx t1; - int i; - for (i = 0; i < (/2); i++) - RTVEC_ELT (v, i) = GEN_INT ((/2) + i); - - t1 = gen_rtx_PARALLEL (mode, v); - emit_insn (gen_neon_vec_unpack_hi_ (operands[0], - operands[1], - t1)); - DONE; - } -) - -(define_expand "vec_unpack_lo_" - [(match_operand: 0 "register_operand") - (SE: (match_operand:VU 1 "register_operand"))] - "TARGET_NEON && !BYTES_BIG_ENDIAN" - { - rtvec v = rtvec_alloc (/2) ; - rtx t1; - int i; - for (i = 0; i < (/2) ; i++) - RTVEC_ELT (v, i) = GEN_INT (i); - t1 = gen_rtx_PARALLEL (mode, v); - emit_insn (gen_neon_vec_unpack_lo_ (operands[0], - operands[1], - t1)); - DONE; - } -) - (define_insn "neon_vec_mult_lo_" [(set (match_operand: 0 "register_operand" "=w") (mult: (SE: (vec_select: @@ -6176,7 +6139,7 @@ if (BYTES_BIG_ENDIAN) ; because the ordering of vector elements in Q registers is different from what ; the semantics of the instructions require. -(define_insn "vec_pack_trunc_" +(define_insn "neon_quad_vec_pack_trunc_" [(set (match_operand: 0 "register_operand" "=&w") (vec_concat: (truncate: diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md index 430a92c..f90afa4 100644 --- a/gcc/config/arm/vec-common.md +++ b/gcc/config/arm/vec-common.md @@ -632,3 +632,73 @@ "ARM_HAVE__ARITH && !TARGET_REALLY_IWMMXT" ) + +;; vmovl[tb] are not available for V4SI on MVE +(define_expand "vec_unpack_hi_" + [(set (match_operand: 0 "register_operand") + (SE: (vec_select: + (match_operand:VU 1 "register_operand") + (match_dup 2))))] + "ARM_HAVE__ARITH + && !TARGET_REALLY_IWMMXT + && ! (mode == V4SImode && TARGET_HAVE_MVE) + && !BYTES_BIG_ENDIAN" + { + rtvec v = rtvec_alloc (/2); + int i; + for (i = 0; i < (/2); i++) + RTVEC_ELT (v, i) = GEN_INT ((/2) + i); + + operands[2] = gen_rtx_PARALLEL (mode, v); + } +) + +;; vmovl[tb] are not available for V4SI on MVE +(define_expand "vec_unpack_lo_" + [(set (match_operand: 0 "register_operand") + (SE: (vec_select: + (match_operand:VU 1 "register_operand") + (match_dup 2))))] + "ARM_HAVE__ARITH + && !TARGET_REALLY_IWMMXT + && ! (mode == V4SImode && TARGET_HAVE_MVE) + && !BYTES_BIG_ENDIAN" + { + rtvec v = rtvec_alloc (/2); + int i; + for (i = 0; i < (/2) ; i++) + RTVEC_ELT (v, i) = GEN_INT (i); + + operands[2] = gen_rtx_PARALLEL (mode, v); + + } +) + +;; vmovn[tb] are not available for V2DI on MVE +(define_expand "vec_pack_trunc_" + [(set (match_operand: 0 "register_operand") + (vec_concat: + (truncate: + (match_operand:VN 1 "register_operand")) + (truncate: + (match_operand:VN 2 "register_operand"))))] + "ARM_HAVE__ARITH + && !TARGET_REALLY_IWMMXT + && ! 
(mode == V2DImode && TARGET_HAVE_MVE) + && !BYTES_BIG_ENDIAN" + { + if (TARGET_NEON) + { + emit_insn (gen_neon_quad_vec_pack_trunc_ (operands[0], operands[1], + operands[2])); + } + else + { + rtx tmpreg = gen_reg_rtx (mode); + emit_insn (gen_mve_vec_pack_trunc_lo (mode, tmpreg, operands[1])); + emit_insn (gen_mve_vmovntq (VMOVNTQ_S, mode, + operands[0], tmpreg, operands[2])); + } + DONE; + } +) -- cgit v1.1 From 4986946f3b761dd4c3e0d79ca735c90e33f4bb83 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 14 Jun 2021 20:56:18 +0200 Subject: i386: Split V2HImode *punpckwd to SSE instruction [PR101058] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit V2HImode *punpckwd should not be split to the insn that depends on TARGET_MMX_WITH_SSE, since the later is disabled on 32bit targets. Also return true early from ix86_vectorize_vec_perm_const when testing with V2HI mode. *punpckwd can be used to implement all permutations. 2021-06-14 Uroš Bizjak gcc/ PR target/101058 * config/i386/i386-expand.c (ix86_vectorize_vec_perm_const): Return true early when testing with V2HImode. * config/i386/mmx.md (*punpckwd): Split to sse2_pshuflw_1. gcc/testsuite/ PR target/101058 * gcc.target/i386/pr101058.c: New test. --- gcc/config/i386/i386-expand.c | 9 ++++++--- gcc/config/i386/mmx.md | 13 +++++++------ 2 files changed, 13 insertions(+), 9 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 6e33f6f..dee3df2 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -20446,9 +20446,12 @@ ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, return false; break; case E_V2HImode: - if (!TARGET_SSE2) - return false; - break; + if (!TARGET_SSE2) + return false; + /* All implementable with *punpckwd. */ + if (d.testing_p) + return true; + break; case E_V2DImode: case E_V2DFmode: if (!TARGET_SSE) diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index f9e7d27..1a9e7b0 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -3368,16 +3368,18 @@ (vec_concat:V4HI (match_operand:V2HI 1 "register_operand" "0,Yw") (match_operand:V2HI 2 "register_operand" "x,Yw")) - (parallel [(match_operand 3 "const_0_to_3_operand") - (match_operand 4 "const_0_to_3_operand")])))] + (parallel [(match_operand 3 "const_0_to_3_operand") + (match_operand 4 "const_0_to_3_operand")])))] "TARGET_SSE2" "#" "&& reload_completed" [(set (match_dup 5) - (vec_select:V4HI + (vec_select:V8HI (match_dup 5) (parallel [(match_dup 3) (match_dup 4) - (const_int 0) (const_int 0)])))] + (const_int 2) (const_int 3) + (const_int 4) (const_int 5) + (const_int 6) (const_int 7)])))] { rtx dest = lowpart_subreg (V8HImode, operands[0], V2HImode); rtx op1 = lowpart_subreg (V8HImode, operands[1], V2HImode); @@ -3395,8 +3397,7 @@ operands[3] = GEN_INT (sel0); operands[4] = GEN_INT (sel1); - - operands[5] = lowpart_subreg (V4HImode, dest, V8HImode); + operands[5] = dest; } [(set_attr "isa" "noavx,avx") (set_attr "type" "sselog") -- cgit v1.1 From 20073534c0ccca0a4e079c053ee0874af10b2ea0 Mon Sep 17 00:00:00 2001 From: Peter Bergner Date: Mon, 14 Jun 2021 16:55:18 -0500 Subject: rs6000: MMA builtin usage ICEs when used in a #pragma omp parallel and using -fopenmp [PR100777] Using an MMA builtin within an openmp parallel code block, leads to an SSA verification ICE on the temporaries we create while expanding the MMA builtins at gimple time. 
The solution is to use create_tmp_reg_or_ssa_name(), which knows when to create either an SSA or register temporary. 2021-06-14 Peter Bergner gcc/ PR target/100777 * config/rs6000/rs6000-call.c (rs6000_gimple_fold_mma_builtin): Use create_tmp_reg_or_ssa_name(). gcc/testsuite/ PR target/100777 * gcc.target/powerpc/pr100777.c: New test. --- gcc/config/rs6000/rs6000-call.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c index 0ac6b6e..b677898 100644 --- a/gcc/config/rs6000/rs6000-call.c +++ b/gcc/config/rs6000/rs6000-call.c @@ -11863,7 +11863,7 @@ rs6000_gimple_fold_mma_builtin (gimple_stmt_iterator *gsi) tree dst_ptr = gimple_call_arg (stmt, 0); tree src_ptr = gimple_call_arg (stmt, 1); tree src_type = TREE_TYPE (src_ptr); - tree src = make_ssa_name (TREE_TYPE (src_type)); + tree src = create_tmp_reg_or_ssa_name (TREE_TYPE (src_type)); gimplify_assign (src, build_simple_mem_ref (src_ptr), &new_seq); /* If we are not disassembling an accumulator/pair or our destination is @@ -11887,7 +11887,7 @@ rs6000_gimple_fold_mma_builtin (gimple_stmt_iterator *gsi) { new_decl = rs6000_builtin_decls[MMA_BUILTIN_XXMFACC_INTERNAL]; new_call = gimple_build_call (new_decl, 1, src); - src = make_ssa_name (vector_quad_type_node); + src = create_tmp_reg_or_ssa_name (vector_quad_type_node); gimple_call_set_lhs (new_call, src); gimple_seq_add_stmt (&new_seq, new_call); } @@ -11902,7 +11902,7 @@ rs6000_gimple_fold_mma_builtin (gimple_stmt_iterator *gsi) unsigned index = WORDS_BIG_ENDIAN ? i : nvec - 1 - i; tree dst = build2 (MEM_REF, unsigned_V16QI_type_node, dst_base, build_int_cst (dst_type, index * 16)); - tree dstssa = make_ssa_name (unsigned_V16QI_type_node); + tree dstssa = create_tmp_reg_or_ssa_name (unsigned_V16QI_type_node); new_call = gimple_build_call (new_decl, 2, src, build_int_cstu (uint16_type_node, i)); gimple_call_set_lhs (new_call, dstssa); @@ -11925,7 +11925,7 @@ rs6000_gimple_fold_mma_builtin (gimple_stmt_iterator *gsi) { /* This built-in has a pass-by-reference accumulator input, so load it into a temporary accumulator for use as a pass-by-value input. */ - op[0] = make_ssa_name (vector_quad_type_node); + op[0] = create_tmp_reg_or_ssa_name (vector_quad_type_node); for (unsigned i = 1; i < nopnds; i++) op[i] = gimple_call_arg (stmt, i); gimplify_assign (op[0], build_simple_mem_ref (acc), &new_seq); @@ -11973,9 +11973,9 @@ rs6000_gimple_fold_mma_builtin (gimple_stmt_iterator *gsi) } if (fncode == VSX_BUILTIN_BUILD_PAIR || fncode == VSX_BUILTIN_ASSEMBLE_PAIR) - lhs = make_ssa_name (vector_pair_type_node); + lhs = create_tmp_reg_or_ssa_name (vector_pair_type_node); else - lhs = make_ssa_name (vector_quad_type_node); + lhs = create_tmp_reg_or_ssa_name (vector_quad_type_node); gimple_call_set_lhs (new_call, lhs); gimple_seq_add_stmt (&new_seq, new_call); gimplify_assign (build_simple_mem_ref (acc), lhs, &new_seq); -- cgit v1.1 From 913b13fcb1dceea0e57a04cb77b11097b132cbf6 Mon Sep 17 00:00:00 2001 From: Carl Love Date: Thu, 10 Jun 2021 17:36:25 -0500 Subject: Fix for vcmpequt builtin The vcmpequt builtin define eqvv1ti3 points to the eqv define instruction for the eqv instruction. The vcmpequt builtin define should point to the vector_eqv1ti instruction definition for the vcmpequq instruction. 2021-06-15 Carl Love gcc/ChangeLog PR target/101022 * config/rs6000/rs6000-builtin.def (VCMPEQUT): Fix the ICODE for the enum definition. 
(VRLQ, VSLQ, VSRQ, VSRAQ): Remove unused BU_P10_OVERLOAD_2 definitions. --- gcc/config/rs6000/rs6000-builtin.def | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-builtin.def b/gcc/config/rs6000/rs6000-builtin.def index 231e7c9..d7ce4de 100644 --- a/gcc/config/rs6000/rs6000-builtin.def +++ b/gcc/config/rs6000/rs6000-builtin.def @@ -2924,7 +2924,7 @@ BU_P10V_VSX_2 (XXGENPCVM_V4SI, "xxgenpcvm_v4si", CONST, xxgenpcvm_v4si) BU_P10V_VSX_2 (XXGENPCVM_V2DI, "xxgenpcvm_v2di", CONST, xxgenpcvm_v2di) BU_P10V_AV_2 (VCMPGTUT, "vcmpgtut", CONST, vector_gtuv1ti) BU_P10V_AV_2 (VCMPGTST, "vcmpgtst", CONST, vector_gtv1ti) -BU_P10V_AV_2 (VCMPEQUT, "vcmpequt", CONST, eqvv1ti3) +BU_P10V_AV_2 (VCMPEQUT, "vcmpequt", CONST, vector_eqv1ti) BU_P10V_AV_2 (CMPNET, "vcmpnet", CONST, vcmpnet) BU_P10V_AV_2 (CMPGE_1TI, "cmpge_1ti", CONST, vector_nltv1ti) BU_P10V_AV_2 (CMPGE_U1TI, "cmpge_u1ti", CONST, vector_nltuv1ti) @@ -3078,10 +3078,6 @@ BU_P10_OVERLOAD_2 (CLRR, "clrr") BU_P10_OVERLOAD_2 (GNB, "gnb") BU_P10_OVERLOAD_4 (XXEVAL, "xxeval") BU_P10_OVERLOAD_2 (XXGENPCVM, "xxgenpcvm") -BU_P10_OVERLOAD_2 (VRLQ, "vrlq") -BU_P10_OVERLOAD_2 (VSLQ, "vslq") -BU_P10_OVERLOAD_2 (VSRQ, "vsrq") -BU_P10_OVERLOAD_2 (VSRAQ, "vsraq") BU_P10_OVERLOAD_3 (EXTRACTL, "extractl") BU_P10_OVERLOAD_3 (EXTRACTH, "extracth") -- cgit v1.1 From 3155d51bfd1de8b6c4645dcb2292248a8d7cc3c9 Mon Sep 17 00:00:00 2001 From: Roger Sayle Date: Wed, 16 Jun 2021 09:56:09 +0100 Subject: [PATCH] PR rtl-optimization/46235: Improved use of bt for bit tests on x86_64. This patch tackles PR46235 to improve the code generated for bit tests on x86_64 by making more use of the bt instruction. Currently, GCC emits bt instructions when followed by condition jumps (thanks to Uros' splitters). This patch adds splitters in i386.md, to catch the cases where bt is followed by a conditional move (as in the original report), or by a setc/setnc (as in comment 5 of the Bugzilla PR). With this patch, the function in the original PR int foo(int a, int x, int y) { if (a & (1 << x)) return a; return 1; } which with -O2 on mainline generates: foo: movl %edi, %eax movl %esi, %ecx sarl %cl, %eax testb $1, %al movl $1, %eax cmovne %edi, %eax ret now generates: foo: btl %esi, %edi movl $1, %eax cmovc %edi, %eax ret Likewise, IsBitSet1 and IsBitSet2 (from comment 5) bool IsBitSet1(unsigned char byte, int index) { return (byte & (1<> index) & 1; } Before: movzbl %dil, %eax movl %esi, %ecx sarl %cl, %eax andl $1, %eax ret After: movzbl %dil, %edi btl %esi, %edi setc %al ret According to Agner Fog, SAR/SHR r,cl takes 2 cycles on skylake, where BT r,r takes only one, so the performance improvements on recent hardware may be more significant than implied by just the reduced number of instructions. I've avoided transforming cases (such as btsi_setcsi) where using bt sequences may not be a clear win (over sarq/andl). 2010-06-15 Roger Sayle gcc/ChangeLog PR rtl-optimization/46235 * config/i386/i386.md: New define_split for bt followed by cmov. (*bt_setcqi): New define_insn_and_split for bt followed by setc. (*bt_setncqi): New define_insn_and_split for bt then setnc. (*bt_setnc): New define_insn_and_split for bt followed by setnc with zero extension. gcc/testsuite/ChangeLog PR rtl-optimization/46235 * gcc.target/i386/bt-5.c: New test. * gcc.target/i386/bt-6.c: New test. * gcc.target/i386/bt-7.c: New test. 
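One more illustrative case, not taken from the PR but complementary to IsBitSet2 above: testing for a clear bit, which the new *bt_setncqi splitter is meant to catch.

bool
IsBitClear (unsigned char byte, int index)
{
  /* Complement of the bit test; should combine into bt + setnc.  */
  return !((byte >> index) & 1);
}

With -O2 one would expect a bt followed by setnc rather than a shift/and/xor sequence, though the exact output still depends on -mtune and TARGET_USE_BT.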
--- gcc/config/i386/i386.md | 94 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 6e4abf3..48532eb 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -12794,6 +12794,100 @@ operands[0] = shallow_copy_rtx (operands[0]); PUT_CODE (operands[0], reverse_condition (GET_CODE (operands[0]))); }) + +;; Help combine recognize bt followed by cmov +(define_split + [(set (match_operand:SWI248 0 "register_operand") + (if_then_else:SWI248 + (ne + (zero_extract:SWI48 + (match_operand:SWI48 1 "register_operand") + (const_int 1) + (zero_extend:SI (match_operand:QI 2 "register_operand"))) + (const_int 0)) + (match_operand:SWI248 3 "nonimmediate_operand") + (match_operand:SWI248 4 "nonimmediate_operand")))] + "TARGET_USE_BT && TARGET_CMOVE + && !(MEM_P (operands[3]) && MEM_P (operands[4])) + && ix86_pre_reload_split ()" + [(set (reg:CCC FLAGS_REG) + (compare:CCC + (zero_extract:SWI48 (match_dup 1) (const_int 1) (match_dup 2)) + (const_int 0))) + (set (match_dup 0) + (if_then_else:SWI248 (eq (reg:CCC FLAGS_REG) (const_int 0)) + (match_dup 3) + (match_dup 4)))] +{ + operands[2] = lowpart_subreg (SImode, operands[2], QImode); +}) + +;; Help combine recognize bt followed by setc +(define_insn_and_split "*bt_setcqi" + [(set (subreg:SWI48 (match_operand:QI 0 "register_operand") 0) + (zero_extract:SWI48 + (match_operand:SWI48 1 "register_operand") + (const_int 1) + (zero_extend:SI (match_operand:QI 2 "register_operand")))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_USE_BT && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (reg:CCC FLAGS_REG) + (compare:CCC + (zero_extract:SWI48 (match_dup 1) (const_int 1) (match_dup 2)) + (const_int 0))) + (set (match_dup 0) + (eq:QI (reg:CCC FLAGS_REG) (const_int 0)))] +{ + operands[2] = lowpart_subreg (SImode, operands[2], QImode); +}) + +;; Help combine recognize bt followed by setnc +(define_insn_and_split "*bt_setncqi" + [(set (match_operand:QI 0 "register_operand") + (and:QI + (not:QI + (subreg:QI + (lshiftrt:SWI48 (match_operand:SWI48 1 "register_operand") + (match_operand:QI 2 "register_operand")) 0)) + (const_int 1))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_USE_BT && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (reg:CCC FLAGS_REG) + (compare:CCC + (zero_extract:SWI48 (match_dup 1) (const_int 1) (match_dup 2)) + (const_int 0))) + (set (match_dup 0) + (ne:QI (reg:CCC FLAGS_REG) (const_int 0)))] +{ + operands[2] = lowpart_subreg (SImode, operands[2], QImode); +}) + +(define_insn_and_split "*bt_setnc" + [(set (match_operand:SWI48 0 "register_operand") + (and:SWI48 + (not:SWI48 + (lshiftrt:SWI48 (match_operand:SWI48 1 "register_operand") + (match_operand:QI 2 "register_operand"))) + (const_int 1))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_USE_BT && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (reg:CCC FLAGS_REG) + (compare:CCC + (zero_extract:SWI48 (match_dup 1) (const_int 1) (match_dup 2)) + (const_int 0))) + (set (match_dup 3) + (ne:QI (reg:CCC FLAGS_REG) (const_int 0))) + (set (match_dup 0) (zero_extend:SWI48 (match_dup 3)))] +{ + operands[2] = lowpart_subreg (SImode, operands[2], QImode); + operands[3] = gen_reg_rtx (QImode); +}) ;; Store-flag instructions. 
-- cgit v1.1 From d8a88cdae9c0c42ab7c5c65a5043c4f8bad349d2 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Fri, 11 Jun 2021 15:48:51 +0100 Subject: aarch64: Model zero-high-half semantics of XTN instruction in RTL Modeling the zero-high-half semantics of the XTN narrowing instruction in RTL indicates to the compiler that this is a totally destructive operation. This enables more RTL simplifications and also prevents some register allocation issues. Add new tests to narrow_zero_high_half.c to verify the benefit of this change. gcc/ChangeLog: 2021-06-11 Jonathan Wright * config/aarch64/aarch64-simd.md (aarch64_xtn_insn_le): Define - modeling zero-high-half semantics. (aarch64_xtn): Change to an expander that emits the appropriate instruction depending on endianness. (aarch64_xtn_insn_be): Define - modeling zero-high-half semantics. (aarch64_xtn2_le): Rename to... (aarch64_xtn2_insn_le): This. (aarch64_xtn2_be): Rename to... (aarch64_xtn2_insn_be): This. (vec_pack_trunc_): Emit truncation instruction instead of aarch64_xtn. * config/aarch64/iterators.md (Vnarrowd): Add Vnarrowd mode attribute iterator. gcc/testsuite/ChangeLog: * gcc.target/aarch64/narrow_zero_high_half.c: Add new tests. --- gcc/config/aarch64/aarch64-simd.md | 105 ++++++++++++++++++++++++------------- gcc/config/aarch64/iterators.md | 2 + 2 files changed, 72 insertions(+), 35 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index e750fae..b23556b 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -1690,17 +1690,48 @@ ;; Narrowing operations. -;; For doubles. +(define_insn "aarch64_xtn_insn_le" + [(set (match_operand: 0 "register_operand" "=w") + (vec_concat: + (truncate: (match_operand:VQN 1 "register_operand" "w")) + (match_operand: 2 "aarch64_simd_or_scalar_imm_zero")))] + "TARGET_SIMD && !BYTES_BIG_ENDIAN" + "xtn\\t%0., %1." + [(set_attr "type" "neon_move_narrow_q")] +) -(define_insn "aarch64_xtn" - [(set (match_operand: 0 "register_operand" "=w") - (truncate: (match_operand:VQN 1 "register_operand" "w")))] - "TARGET_SIMD" +(define_insn "aarch64_xtn_insn_be" + [(set (match_operand: 0 "register_operand" "=w") + (vec_concat: + (match_operand: 2 "aarch64_simd_or_scalar_imm_zero") + (truncate: (match_operand:VQN 1 "register_operand" "w"))))] + "TARGET_SIMD && BYTES_BIG_ENDIAN" "xtn\\t%0., %1." [(set_attr "type" "neon_move_narrow_q")] ) -(define_insn "aarch64_xtn2_le" +(define_expand "aarch64_xtn" + [(set (match_operand: 0 "register_operand") + (truncate: (match_operand:VQN 1 "register_operand")))] + "TARGET_SIMD" + { + rtx tmp = gen_reg_rtx (mode); + if (BYTES_BIG_ENDIAN) + emit_insn (gen_aarch64_xtn_insn_be (tmp, operands[1], + CONST0_RTX (mode))); + else + emit_insn (gen_aarch64_xtn_insn_le (tmp, operands[1], + CONST0_RTX (mode))); + + /* The intrinsic expects a narrow result, so emit a subreg that will get + optimized away as appropriate. 
*/ + emit_move_insn (operands[0], lowpart_subreg (mode, tmp, + mode)); + DONE; + } +) + +(define_insn "aarch64_xtn2_insn_le" [(set (match_operand: 0 "register_operand" "=w") (vec_concat: (match_operand: 1 "register_operand" "0") @@ -1710,7 +1741,7 @@ [(set_attr "type" "neon_move_narrow_q")] ) -(define_insn "aarch64_xtn2_be" +(define_insn "aarch64_xtn2_insn_be" [(set (match_operand: 0 "register_operand" "=w") (vec_concat: (truncate: (match_operand:VQN 2 "register_operand" "w")) @@ -1727,15 +1758,17 @@ "TARGET_SIMD" { if (BYTES_BIG_ENDIAN) - emit_insn (gen_aarch64_xtn2_be (operands[0], operands[1], - operands[2])); + emit_insn (gen_aarch64_xtn2_insn_be (operands[0], operands[1], + operands[2])); else - emit_insn (gen_aarch64_xtn2_le (operands[0], operands[1], - operands[2])); + emit_insn (gen_aarch64_xtn2_insn_le (operands[0], operands[1], + operands[2])); DONE; } ) +;; Packing doubles. + (define_expand "vec_pack_trunc_" [(match_operand: 0 "register_operand") (match_operand:VDN 1 "register_operand") @@ -1748,10 +1781,35 @@ emit_insn (gen_move_lo_quad_ (tempreg, operands[lo])); emit_insn (gen_move_hi_quad_ (tempreg, operands[hi])); - emit_insn (gen_aarch64_xtn (operands[0], tempreg)); + emit_insn (gen_trunc2 (operands[0], tempreg)); DONE; }) +;; Packing quads. + +(define_expand "vec_pack_trunc_" + [(set (match_operand: 0 "register_operand") + (vec_concat: + (truncate: (match_operand:VQN 1 "register_operand")) + (truncate: (match_operand:VQN 2 "register_operand"))))] + "TARGET_SIMD" + { + rtx tmpreg = gen_reg_rtx (mode); + int lo = BYTES_BIG_ENDIAN ? 2 : 1; + int hi = BYTES_BIG_ENDIAN ? 1 : 2; + + emit_insn (gen_trunc2 (tmpreg, operands[lo])); + + if (BYTES_BIG_ENDIAN) + emit_insn (gen_aarch64_xtn2_insn_be (operands[0], tmpreg, + operands[hi])); + else + emit_insn (gen_aarch64_xtn2_insn_le (operands[0], tmpreg, + operands[hi])); + DONE; + } +) + (define_insn "aarch64_shrn_insn_le" [(set (match_operand: 0 "register_operand" "=w") (vec_concat: @@ -1936,29 +1994,6 @@ } ) -;; For quads. - -(define_expand "vec_pack_trunc_" - [(set (match_operand: 0 "register_operand") - (vec_concat: - (truncate: (match_operand:VQN 1 "register_operand")) - (truncate: (match_operand:VQN 2 "register_operand"))))] - "TARGET_SIMD" - { - rtx tmpreg = gen_reg_rtx (mode); - int lo = BYTES_BIG_ENDIAN ? 2 : 1; - int hi = BYTES_BIG_ENDIAN ? 1 : 2; - - emit_insn (gen_aarch64_xtn (tmpreg, operands[lo])); - - if (BYTES_BIG_ENDIAN) - emit_insn (gen_aarch64_xtn2_be (operands[0], tmpreg, operands[hi])); - else - emit_insn (gen_aarch64_xtn2_le (operands[0], tmpreg, operands[hi])); - DONE; - } -) - ;; Widening operations. (define_insn "aarch64_simd_vec_unpack_lo_" diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index e9047d0..caa42f8 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -1257,6 +1257,8 @@ ;; Narrowed modes for VDN. (define_mode_attr VNARROWD [(V4HI "V8QI") (V2SI "V4HI") (DI "V2SI")]) +(define_mode_attr Vnarrowd [(V4HI "v8qi") (V2SI "v4hi") + (DI "v2si")]) ;; Narrowed double-modes for VQN (Used for XTN). (define_mode_attr VNARROWQ [(V8HI "V8QI") (V4SI "V4HI") -- cgit v1.1 From c86a3039683a8d2bb1006c1a0277678de3786ceb Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Mon, 14 Jun 2021 13:16:35 +0100 Subject: aarch64: Model zero-high-half semantics of SQXTUN instruction in RTL Split the aarch64_sqmovun pattern into separate scalar and vector variants. 
Further split the vector pattern into big/little endian variants that model the zero-high-half semantics of the underlying instruction. Modeling these semantics allows for better RTL combinations while also removing some register allocation issues as the compiler now knows that the operation is totally destructive. Add new tests to narrow_zero_high_half.c to verify the benefit of this change. gcc/ChangeLog: 2021-06-14 Jonathan Wright * config/aarch64/aarch64-simd-builtins.def: Split generator for aarch64_sqmovun builtins into scalar and vector variants. * config/aarch64/aarch64-simd.md (aarch64_sqmovun): Split into scalar and vector variants. Change vector variant to an expander that emits the correct instruction depending on endianness. (aarch64_sqmovun_insn_le): Define. (aarch64_sqmovun_insn_be): Define. gcc/testsuite/ChangeLog: * gcc.target/aarch64/narrow_zero_high_half.c: Add new tests. --- gcc/config/aarch64/aarch64-simd-builtins.def | 4 +- gcc/config/aarch64/aarch64-simd.md | 66 +++++++++++++++++++++++----- 2 files changed, 58 insertions(+), 12 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index 18baa67..2adb4b1 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -263,7 +263,9 @@ BUILTIN_VQ_HSI (TERNOP, smlal_hi_n, 0, NONE) BUILTIN_VQ_HSI (TERNOPU, umlal_hi_n, 0, NONE) - BUILTIN_VSQN_HSDI (UNOPUS, sqmovun, 0, NONE) + /* Implemented by aarch64_sqmovun. */ + BUILTIN_VQN (UNOPUS, sqmovun, 0, NONE) + BUILTIN_SD_HSDI (UNOPUS, sqmovun, 0, NONE) /* Implemented by aarch64_sqxtun2. */ BUILTIN_VQN (BINOP_UUS, sqxtun2, 0, NONE) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index b23556b..59779b8 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -4870,17 +4870,6 @@ [(set_attr "type" "neon_qadd")] ) -;; sqmovun - -(define_insn "aarch64_sqmovun" - [(set (match_operand: 0 "register_operand" "=w") - (unspec: [(match_operand:VSQN_HSDI 1 "register_operand" "w")] - UNSPEC_SQXTUN))] - "TARGET_SIMD" - "sqxtun\\t%0, %1" - [(set_attr "type" "neon_sat_shift_imm_narrow_q")] -) - ;; sqmovn and uqmovn (define_insn "aarch64_qmovn" @@ -4931,6 +4920,61 @@ } ) +;; sqmovun + +(define_insn "aarch64_sqmovun" + [(set (match_operand: 0 "register_operand" "=w") + (unspec: [(match_operand:SD_HSDI 1 "register_operand" "w")] + UNSPEC_SQXTUN))] + "TARGET_SIMD" + "sqxtun\\t%0, %1" + [(set_attr "type" "neon_sat_shift_imm_narrow_q")] +) + +(define_insn "aarch64_sqmovun_insn_le" + [(set (match_operand: 0 "register_operand" "=w") + (vec_concat: + (unspec: [(match_operand:VQN 1 "register_operand" "w")] + UNSPEC_SQXTUN) + (match_operand: 2 "aarch64_simd_or_scalar_imm_zero")))] + "TARGET_SIMD && !BYTES_BIG_ENDIAN" + "sqxtun\\t%0, %1" + [(set_attr "type" "neon_sat_shift_imm_narrow_q")] +) + +(define_insn "aarch64_sqmovun_insn_be" + [(set (match_operand: 0 "register_operand" "=w") + (vec_concat: + (match_operand: 2 "aarch64_simd_or_scalar_imm_zero") + (unspec: [(match_operand:VQN 1 "register_operand" "w")] + UNSPEC_SQXTUN)))] + "TARGET_SIMD && BYTES_BIG_ENDIAN" + "sqxtun\\t%0, %1" + [(set_attr "type" "neon_sat_shift_imm_narrow_q")] +) + +(define_expand "aarch64_sqmovun" + [(set (match_operand: 0 "register_operand") + (unspec: [(match_operand:VQN 1 "register_operand")] + UNSPEC_SQXTUN))] + "TARGET_SIMD" + { + rtx tmp = gen_reg_rtx (mode); + if (BYTES_BIG_ENDIAN) + emit_insn 
(gen_aarch64_sqmovun_insn_be (tmp, operands[1], + CONST0_RTX (mode))); + else + emit_insn (gen_aarch64_sqmovun_insn_le (tmp, operands[1], + CONST0_RTX (mode))); + + /* The intrinsic expects a narrow result, so emit a subreg that will get + optimized away as appropriate. */ + emit_move_insn (operands[0], lowpart_subreg (mode, tmp, + mode)); + DONE; + } +) + (define_insn "aarch64_sqxtun2_le" [(set (match_operand: 0 "register_operand" "=w") (vec_concat: -- cgit v1.1 From d0889b5d37ff40149b44e3c7d82f693d430cd891 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Mon, 14 Jun 2021 15:09:18 +0100 Subject: aarch64: Model zero-high-half semantics of [SU]QXTN instructions Split the aarch64_qmovn pattern into separate scalar and vector variants. Further split the vector RTL pattern into big/ little endian variants that model the zero-high-half semantics of the underlying instruction. Modeling these semantics allows for better RTL combinations while also removing some register allocation issues as the compiler now knows that the operation is totally destructive. Add new tests to narrow_zero_high_half.c to verify the benefit of this change. gcc/ChangeLog: 2021-06-14 Jonathan Wright * config/aarch64/aarch64-simd-builtins.def: Split generator for aarch64_qmovn builtins into scalar and vector variants. * config/aarch64/aarch64-simd.md (aarch64_qmovn_insn_le): Define. (aarch64_qmovn_insn_be): Define. (aarch64_qmovn): Split into scalar and vector variants. Change vector variant to an expander that emits the correct instruction depending on endianness. gcc/testsuite/ChangeLog: * gcc.target/aarch64/narrow_zero_high_half.c: Add new tests. --- gcc/config/aarch64/aarch64-simd-builtins.def | 6 ++-- gcc/config/aarch64/aarch64-simd.md | 48 ++++++++++++++++++++++++++-- 2 files changed, 50 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index 2adb4b1..ac5d4fc 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -271,8 +271,10 @@ BUILTIN_VQN (BINOP_UUS, sqxtun2, 0, NONE) /* Implemented by aarch64_qmovn. */ - BUILTIN_VSQN_HSDI (UNOP, sqmovn, 0, NONE) - BUILTIN_VSQN_HSDI (UNOP, uqmovn, 0, NONE) + BUILTIN_VQN (UNOP, sqmovn, 0, NONE) + BUILTIN_SD_HSDI (UNOP, sqmovn, 0, NONE) + BUILTIN_VQN (UNOP, uqmovn, 0, NONE) + BUILTIN_SD_HSDI (UNOP, uqmovn, 0, NONE) /* Implemented by aarch64_qxtn2. 
*/ BUILTIN_VQN (BINOP, sqxtn2, 0, NONE) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 59779b8..2b75e57 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -4875,10 +4875,54 @@ (define_insn "aarch64_qmovn" [(set (match_operand: 0 "register_operand" "=w") (SAT_TRUNC: - (match_operand:VSQN_HSDI 1 "register_operand" "w")))] + (match_operand:SD_HSDI 1 "register_operand" "w")))] "TARGET_SIMD" "qxtn\\t%0, %1" - [(set_attr "type" "neon_sat_shift_imm_narrow_q")] + [(set_attr "type" "neon_sat_shift_imm_narrow_q")] +) + +(define_insn "aarch64_qmovn_insn_le" + [(set (match_operand: 0 "register_operand" "=w") + (vec_concat: + (SAT_TRUNC: + (match_operand:VQN 1 "register_operand" "w")) + (match_operand: 2 "aarch64_simd_or_scalar_imm_zero")))] + "TARGET_SIMD && !BYTES_BIG_ENDIAN" + "qxtn\\t%0, %1" + [(set_attr "type" "neon_sat_shift_imm_narrow_q")] +) + +(define_insn "aarch64_qmovn_insn_be" + [(set (match_operand: 0 "register_operand" "=w") + (vec_concat: + (match_operand: 2 "aarch64_simd_or_scalar_imm_zero") + (SAT_TRUNC: + (match_operand:VQN 1 "register_operand" "w"))))] + "TARGET_SIMD && BYTES_BIG_ENDIAN" + "qxtn\\t%0, %1" + [(set_attr "type" "neon_sat_shift_imm_narrow_q")] +) + +(define_expand "aarch64_qmovn" + [(set (match_operand: 0 "register_operand") + (SAT_TRUNC: + (match_operand:VQN 1 "register_operand")))] + "TARGET_SIMD" + { + rtx tmp = gen_reg_rtx (mode); + if (BYTES_BIG_ENDIAN) + emit_insn (gen_aarch64_qmovn_insn_be (tmp, operands[1], + CONST0_RTX (mode))); + else + emit_insn (gen_aarch64_qmovn_insn_le (tmp, operands[1], + CONST0_RTX (mode))); + + /* The intrinsic expects a narrow result, so emit a subreg that will get + optimized away as appropriate. */ + emit_move_insn (operands[0], lowpart_subreg (mode, tmp, + mode)); + DONE; + } ) (define_insn "aarch64_qxtn2_le" -- cgit v1.1 From dbfc149b639342a9555c60aa9ee787fb3d009316 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Mon, 14 Jun 2021 16:18:44 +0100 Subject: aarch64: Model zero-high-half semantics of ADDHN/SUBHN instructions Model the zero-high-half semantics of the narrowing arithmetic Neon instructions in the aarch64_hn RTL pattern. Modeling these semantics allows for better RTL combinations while also removing some register allocation issues as the compiler now knows that the operation is totally destructive. Add new tests to narrow_zero_high_half.c to verify the benefit of this change. gcc/ChangeLog: 2021-06-14 Jonathan Wright * config/aarch64/aarch64-simd.md (aarch64_hn): Change to an expander that emits the correct instruction depending on endianness. (aarch64_hn_insn_le): Define. (aarch64_hn_insn_be): Define. gcc/testsuite/ChangeLog: * gcc.target/aarch64/narrow_zero_high_half.c: Add new tests. --- gcc/config/aarch64/aarch64-simd.md | 49 +++++++++++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 6 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 2b75e57..540244c 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -4661,16 +4661,53 @@ ;; hn. 
-(define_insn "aarch64_hn" - [(set (match_operand: 0 "register_operand" "=w") - (unspec: [(match_operand:VQN 1 "register_operand" "w") - (match_operand:VQN 2 "register_operand" "w")] - ADDSUBHN))] - "TARGET_SIMD" +(define_insn "aarch64_hn_insn_le" + [(set (match_operand: 0 "register_operand" "=w") + (vec_concat: + (unspec: [(match_operand:VQN 1 "register_operand" "w") + (match_operand:VQN 2 "register_operand" "w")] + ADDSUBHN) + (match_operand: 3 "aarch64_simd_or_scalar_imm_zero")))] + "TARGET_SIMD && !BYTES_BIG_ENDIAN" + "hn\\t%0., %1., %2." + [(set_attr "type" "neon__halve_narrow_q")] +) + +(define_insn "aarch64_hn_insn_be" + [(set (match_operand: 0 "register_operand" "=w") + (vec_concat: + (match_operand: 3 "aarch64_simd_or_scalar_imm_zero") + (unspec: [(match_operand:VQN 1 "register_operand" "w") + (match_operand:VQN 2 "register_operand" "w")] + ADDSUBHN)))] + "TARGET_SIMD && BYTES_BIG_ENDIAN" "hn\\t%0., %1., %2." [(set_attr "type" "neon__halve_narrow_q")] ) +(define_expand "aarch64_hn" + [(set (match_operand: 0 "register_operand") + (unspec: [(match_operand:VQN 1 "register_operand") + (match_operand:VQN 2 "register_operand")] + ADDSUBHN))] + "TARGET_SIMD" + { + rtx tmp = gen_reg_rtx (mode); + if (BYTES_BIG_ENDIAN) + emit_insn (gen_aarch64_hn_insn_be (tmp, operands[1], + operands[2], CONST0_RTX (mode))); + else + emit_insn (gen_aarch64_hn_insn_le (tmp, operands[1], + operands[2], CONST0_RTX (mode))); + + /* The intrinsic expects a narrow result, so emit a subreg that will get + optimized away as appropriate. */ + emit_move_insn (operands[0], lowpart_subreg (mode, tmp, + mode)); + DONE; + } +) + (define_insn "aarch64_hn2_insn_le" [(set (match_operand: 0 "register_operand" "=w") (vec_concat: -- cgit v1.1 From dd835ec24be9b1a89c6b0c78673de88c81a23966 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 16 Jun 2021 16:07:01 +0200 Subject: ii386: Add missing two element 64bit vector permutations [PR89021] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In addition to V8QI permutations, several other missing permutations are added for 64bit vector modes for TARGET_SSSE3 and TARGET_SSE4_1 targets. 2021-06-16 Uroš Bizjak gcc/ PR target/89021 * config/i386/i386-expand.c (expand_vec_perm_2perm_pblendv): Handle 64bit modes for TARGET_SSE4_1. (expand_vec_perm_pshufb2): Handle 64bit modes for TARGET_SSSE3. (expand_vec_perm_even_odd_pack): Handle V4HI mode. (expand_vec_perm_even_odd_1) : Expand via expand_vec_perm_pshufb2 for TARGET_SSSE3 and via expand_vec_perm_even_odd_pack for TARGET_SSE4_1. * config/i386/mmx.md (mmx_packusdw): New insn pattern. 
--- gcc/config/i386/i386-expand.c | 91 +++++++++++++++++++++++++++++++------------ gcc/config/i386/mmx.md | 16 ++++++++ 2 files changed, 82 insertions(+), 25 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index dee3df2..eb6f9b0 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -17633,8 +17633,10 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) if (vmode == V8QImode) { + rtx m128 = GEN_INT (-128); + for (i = nelt; i < 16; ++i) - rperm[i] = constm1_rtx; + rperm[i] = m128; vpmode = V16QImode; } @@ -18972,7 +18974,8 @@ expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn) ; else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode)) ; - else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16) + else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16 + || GET_MODE_SIZE (vmode) == 8)) ; else return false; @@ -19229,14 +19232,31 @@ expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d) { rtx rperm[2][16], vperm, l, h, op, m128; unsigned int i, nelt, eltsz; + machine_mode mode; + rtx (*gen) (rtx, rtx, rtx); - if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16) + if (!TARGET_SSSE3 || (GET_MODE_SIZE (d->vmode) != 16 + && GET_MODE_SIZE (d->vmode) != 8)) return false; gcc_assert (!d->one_operand_p); if (d->testing_p) return true; + switch (GET_MODE_SIZE (d->vmode)) + { + case 8: + mode = V8QImode; + gen = gen_mmx_pshufbv8qi3; + break; + case 16: + mode = V16QImode; + gen = gen_ssse3_pshufbv16qi3; + break; + default: + gcc_unreachable (); + } + nelt = d->nelt; eltsz = GET_MODE_UNIT_SIZE (d->vmode); @@ -19247,7 +19267,7 @@ expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d) m128 = GEN_INT (-128); for (i = 0; i < nelt; ++i) { - unsigned j, e = d->perm[i]; + unsigned j, k, e = d->perm[i]; unsigned which = (e >= nelt); if (e >= nelt) e -= nelt; @@ -19257,26 +19277,29 @@ expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d) rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j); rperm[1-which][i*eltsz + j] = m128; } + + for (k = i*eltsz + j; k < 16; ++k) + rperm[0][k] = rperm[1][k] = m128; } vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0])); vperm = force_reg (V16QImode, vperm); - l = gen_reg_rtx (V16QImode); - op = gen_lowpart (V16QImode, d->op0); - emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm)); + l = gen_reg_rtx (mode); + op = gen_lowpart (mode, d->op0); + emit_insn (gen (l, op, vperm)); vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1])); vperm = force_reg (V16QImode, vperm); - h = gen_reg_rtx (V16QImode); - op = gen_lowpart (V16QImode, d->op1); - emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm)); + h = gen_reg_rtx (mode); + op = gen_lowpart (mode, d->op1); + emit_insn (gen (h, op, vperm)); op = d->target; - if (d->vmode != V16QImode) - op = gen_reg_rtx (V16QImode); - emit_insn (gen_iorv16qi3 (op, l, h)); + if (d->vmode != mode) + op = gen_reg_rtx (mode); + emit_insn (gen_rtx_SET (op, gen_rtx_IOR (mode, l, h))); if (op != d->target) emit_move_insn (d->target, gen_lowpart (d->vmode, op)); @@ -19455,6 +19478,17 @@ expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d) switch (d->vmode) { + case E_V4HImode: + /* Required for "pack". */ + if (!TARGET_SSE4_1) + return false; + c = 0xffff; + s = 16; + half_mode = V2SImode; + gen_and = gen_andv2si3; + gen_pack = gen_mmx_packusdw; + gen_shift = gen_lshrv2si3; + break; case E_V8HImode: /* Required for "pack". 
*/ if (!TARGET_SSE4_1) @@ -19507,7 +19541,7 @@ expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d) end_perm = true; break; default: - /* Only V8QI, V8HI, V16QI, V16HI and V32QI modes + /* Only V4HI, V8QI, V8HI, V16QI, V16HI and V32QI modes are more profitable than general shuffles. */ return false; } @@ -19698,18 +19732,25 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd) break; case E_V4HImode: - if (d->testing_p) - break; - /* We need 2*log2(N)-1 operations to achieve odd/even - with interleave. */ - t1 = gen_reg_rtx (V4HImode); - emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1)); - emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1)); - if (odd) - t2 = gen_mmx_punpckhwd (d->target, d->target, t1); + if (TARGET_SSE4_1) + return expand_vec_perm_even_odd_pack (d); + else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB) + return expand_vec_perm_pshufb2 (d); else - t2 = gen_mmx_punpcklwd (d->target, d->target, t1); - emit_insn (t2); + { + if (d->testing_p) + break; + /* We need 2*log2(N)-1 operations to achieve odd/even + with interleave. */ + t1 = gen_reg_rtx (V4HImode); + emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1)); + emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1)); + if (odd) + t2 = gen_mmx_punpckhwd (d->target, d->target, t1); + else + t2 = gen_mmx_punpcklwd (d->target, d->target, t1); + emit_insn (t2); + } break; case E_V8HImode: diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 1a9e7b0..59a16f4 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -2477,6 +2477,22 @@ (set_attr "type" "mmxshft,sselog,sselog") (set_attr "mode" "DI,TI,TI")]) +(define_insn_and_split "mmx_packusdw" + [(set (match_operand:V4HI 0 "register_operand" "=Yr,*x,Yw") + (vec_concat:V4HI + (us_truncate:V2HI + (match_operand:V2SI 1 "register_operand" "0,0,Yw")) + (us_truncate:V2HI + (match_operand:V2SI 2 "register_operand" "Yr,*x,Yw"))))] + "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE" + "#" + "&& reload_completed" + [(const_int 0)] + "ix86_split_mmx_pack (operands, US_TRUNCATE); DONE;" + [(set_attr "isa" "noavx,noavx,avx") + (set_attr "type" "sselog") + (set_attr "mode" "TI")]) + (define_insn_and_split "mmx_punpckhbw" [(set (match_operand:V8QI 0 "register_operand" "=y,x,Yw") (vec_select:V8QI -- cgit v1.1 From a4fc63e0c3c6c7b895225c883137d152743be7fc Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Wed, 16 Jun 2021 18:26:48 +0200 Subject: IBM Z: Fix vector intrinsics vec_double and vec_floate Fix the mapping of vec_double and vec_floate to builtins. gcc/ChangeLog: PR target/100871 * config/s390/vecintrin.h (vec_doublee): Fix to use __builtin_s390_vflls. (vec_floate): Fix to use __builtin_s390_vflrd. gcc/testsuite/ChangeLog: * gcc.target/s390/zvector/vec-doublee.c: New test. * gcc.target/s390/zvector/vec-floate.c: New test. 
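A minimal usage sketch of the two fixed intrinsics (assuming -mzvector and an -march level with the vector-enhancements facility, e.g. z14; the new tests listed above are the authoritative versions):

#include <vecintrin.h>

__vector double
to_double_even (__vector float x)
{
  return vec_doublee (x);   /* now expands via __builtin_s390_vflls */
}

__vector float
to_float_even (__vector double x)
{
  return vec_floate (x);    /* now expands via __builtin_s390_vflrd */
}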
--- gcc/config/s390/vecintrin.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/s390/vecintrin.h b/gcc/config/s390/vecintrin.h index 6bd26f8..9a3f7c3 100644 --- a/gcc/config/s390/vecintrin.h +++ b/gcc/config/s390/vecintrin.h @@ -109,8 +109,8 @@ __lcbb(const void *ptr, int bndry) #define vec_rint(X) __builtin_s390_vfi((X), 0, 0) #define vec_roundc(X) __builtin_s390_vfi((X), 4, 0) #define vec_round(X) __builtin_s390_vfi((X), 4, 4) -#define vec_doublee(X) __builtin_s390_vfll((X)) -#define vec_floate(X) __builtin_s390_vflr((X), 0, 0) +#define vec_doublee(X) __builtin_s390_vflls((X)) +#define vec_floate(X) __builtin_s390_vflrd((X), 0, 0) #define vec_load_len_r(X,L) \ (__vector unsigned char)__builtin_s390_vlrlr((L),(X)) #define vec_store_len_r(X,Y,L) \ -- cgit v1.1 From 20a2c8ace0ab56c147fd995432abd5e7cf89b0e3 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 17 Jun 2021 15:19:12 +0200 Subject: i386: Add variable vec_set for 64bit vectors [PR97194] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To generate sane code a SSE4.1 variable PBLENDV instruction is needed. 2021-06-17 Uroš Bizjak gcc/ PR target/97194 * config/i386/i386-expand.c (expand_vector_set_var): Handle V2FS mode remapping. Pass TARGET_MMX_WITH_SSE to ix86_expand_vector_init_duplicate. (ix86_expand_vector_init_duplicate): Emit insv_1 for QImode for !TARGET_PARTIAL_REG_STALL. * config/i386/predicates.md (vec_setm_mmx_operand): New predicate. * config/i386/mmx.md (vec_setv2sf): Use vec_setm_mmx_operand as operand 2 predicate. Call ix86_expand_vector_set_var for non-constant index operand. (vec_setv2si): Ditto. (vec_setv4hi): Ditto. (vec_setv8qi): ditto. gcc/testsuite/ PR target/97194 * gcc.target/i386/sse4_1-vec-set-1.c: New test. * gcc.target/i386/sse4_1-vec-set-2.c: ditto. 
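A sketch of the kind of variable-index insertion that now expands through the new path (a hypothetical reduction; the two sse4_1-vec-set tests listed above are the real testcases):

typedef float v2sf __attribute__ ((vector_size (8)));

v2sf
set_elem (v2sf v, float x, int idx)
{
  v[idx] = x;   /* element index only known at run time */
  return v;
}

With -msse4.1 on x86-64 the insertion can be done by broadcasting the value and the index, comparing against a constant index vector and blending with pblendv, rather than spilling the vector to memory.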
--- gcc/config/i386/i386-expand.c | 24 ++++++++++++++++++------ gcc/config/i386/mmx.md | 36 ++++++++++++++++++++++++------------ gcc/config/i386/predicates.md | 6 ++++++ 3 files changed, 48 insertions(+), 18 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index eb6f9b0..8f4e4e4 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -13811,10 +13811,17 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode, wsmode = GET_MODE_INNER (wvmode); val = convert_modes (wsmode, smode, val, true); - x = expand_simple_binop (wsmode, ASHIFT, val, - GEN_INT (GET_MODE_BITSIZE (smode)), - NULL_RTX, 1, OPTAB_LIB_WIDEN); - val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN); + + if (smode == QImode && !TARGET_PARTIAL_REG_STALL) + emit_insn (gen_insv_1 (wsmode, val, val)); + else + { + x = expand_simple_binop (wsmode, ASHIFT, val, + GEN_INT (GET_MODE_BITSIZE (smode)), + NULL_RTX, 1, OPTAB_LIB_WIDEN); + val = expand_simple_binop (wsmode, IOR, val, x, x, 1, + OPTAB_LIB_WIDEN); + } x = gen_reg_rtx (wvmode); ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val); @@ -14788,6 +14795,9 @@ ix86_expand_vector_set_var (rtx target, rtx val, rtx idx) case E_V8DFmode: cmp_mode = V8DImode; break; + case E_V2SFmode: + cmp_mode = V2SImode; + break; case E_V4SFmode: cmp_mode = V4SImode; break; @@ -14809,9 +14819,11 @@ ix86_expand_vector_set_var (rtx target, rtx val, rtx idx) idxv = gen_reg_rtx (cmp_mode); idx_tmp = convert_to_mode (GET_MODE_INNER (cmp_mode), idx, 1); - ok = ix86_expand_vector_init_duplicate (false, mode, valv, val); + ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE, + mode, valv, val); gcc_assert (ok); - ok = ix86_expand_vector_init_duplicate (false, cmp_mode, idxv, idx_tmp); + ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE, + cmp_mode, idxv, idx_tmp); gcc_assert (ok); vec[0] = target; vec[1] = valv; diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 59a16f4..a107ac5 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -1279,11 +1279,14 @@ (define_expand "vec_setv2sf" [(match_operand:V2SF 0 "register_operand") (match_operand:SF 1 "register_operand") - (match_operand 2 "const_int_operand")] + (match_operand 2 "vec_setm_mmx_operand")] "TARGET_MMX || TARGET_MMX_WITH_SSE" { - ix86_expand_vector_set (TARGET_MMX_WITH_SSE, operands[0], operands[1], - INTVAL (operands[2])); + if (CONST_INT_P (operands[2])) + ix86_expand_vector_set (TARGET_MMX_WITH_SSE, operands[0], operands[1], + INTVAL (operands[2])); + else + ix86_expand_vector_set_var (operands[0], operands[1], operands[2]); DONE; }) @@ -2989,11 +2992,14 @@ (define_expand "vec_setv2si" [(match_operand:V2SI 0 "register_operand") (match_operand:SI 1 "register_operand") - (match_operand 2 "const_int_operand")] + (match_operand 2 "vec_setm_mmx_operand")] "TARGET_MMX || TARGET_MMX_WITH_SSE" { - ix86_expand_vector_set (TARGET_MMX_WITH_SSE, operands[0], operands[1], - INTVAL (operands[2])); + if (CONST_INT_P (operands[2])) + ix86_expand_vector_set (TARGET_MMX_WITH_SSE, operands[0], operands[1], + INTVAL (operands[2])); + else + ix86_expand_vector_set_var (operands[0], operands[1], operands[2]); DONE; }) @@ -3145,11 +3151,14 @@ (define_expand "vec_setv4hi" [(match_operand:V4HI 0 "register_operand") (match_operand:HI 1 "register_operand") - (match_operand 2 "const_int_operand")] + (match_operand 2 "vec_setm_mmx_operand")] "TARGET_MMX || TARGET_MMX_WITH_SSE" { - ix86_expand_vector_set 
(TARGET_MMX_WITH_SSE, operands[0], operands[1], - INTVAL (operands[2])); + if (CONST_INT_P (operands[2])) + ix86_expand_vector_set (TARGET_MMX_WITH_SSE, operands[0], operands[1], + INTVAL (operands[2])); + else + ix86_expand_vector_set_var (operands[0], operands[1], operands[2]); DONE; }) @@ -3177,11 +3186,14 @@ (define_expand "vec_setv8qi" [(match_operand:V8QI 0 "register_operand") (match_operand:QI 1 "register_operand") - (match_operand 2 "const_int_operand")] + (match_operand 2 "vec_setm_mmx_operand")] "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE" { - ix86_expand_vector_set (TARGET_MMX_WITH_SSE, operands[0], operands[1], - INTVAL (operands[2])); + if (CONST_INT_P (operands[2])) + ix86_expand_vector_set (TARGET_MMX_WITH_SSE, operands[0], operands[1], + INTVAL (operands[2])); + else + ix86_expand_vector_set_var (operands[0], operands[1], operands[2]); DONE; }) diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index 3dd134e..e7a8968 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1026,6 +1026,12 @@ (match_test "TARGET_AVX2")) (match_code "const_int"))) +(define_predicate "vec_setm_mmx_operand" + (ior (and (match_operand 0 "register_operand") + (match_test "TARGET_SSE4_1") + (match_test "TARGET_MMX_WITH_SSE")) + (match_code "const_int"))) + ;; True for registers, or 1 or -1. Used to optimize double-word shifts. (define_predicate "reg_or_pm1_operand" (ior (match_operand 0 "register_operand") -- cgit v1.1 From 967b46530234b4e6ad3983057705aea6c20a03c4 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 21 May 2021 11:56:55 -0700 Subject: Add a target calls hook: TARGET_PUSH_ARGUMENT 1. Replace PUSH_ARGS with a target calls hook, TARGET_PUSH_ARGUMENT, which takes an integer argument. When it returns true, push instructions will be used to pass outgoing arguments. If the argument is nonzero, it is the number of bytes to push and indicates the PUSH instruction usage is optional so that the backend can decide if PUSH instructions should be generated. Otherwise, the argument is zero. 2. Implement x86 target hook which returns false when the number of bytes to push is no less than 16 (8 for 32-bit targets) if vector load and store can be used. 3. Remove target PUSH_ARGS definitions which return 0 as it is the same as the default. 4. Define TARGET_PUSH_ARGUMENT of cr16 and m32c to always return true. gcc/ PR target/100704 * calls.c (expand_call): Replace PUSH_ARGS with targetm.calls.push_argument (0). (emit_library_call_value_1): Likewise. * defaults.h (PUSH_ARGS): Removed. (PUSH_ARGS_REVERSED): Replace PUSH_ARGS with targetm.calls.push_argument (0). * expr.c (block_move_libcall_safe_for_call_parm): Likewise. (emit_push_insn): Pass the number bytes to push to targetm.calls.push_argument and pass 0 if ARGS_ADDR is 0. * hooks.c (hook_bool_uint_true): New. * hooks.h (hook_bool_uint_true): Likewise. * rtlanal.c (nonzero_bits1): Replace PUSH_ARGS with targetm.calls.push_argument (0). * target.def (push_argument): Add a targetm.calls hook. * targhooks.c (default_push_argument): New. * targhooks.h (default_push_argument): Likewise. * config/bpf/bpf.h (PUSH_ARGS): Removed. * config/cr16/cr16.c (TARGET_PUSH_ARGUMENT): New. * config/cr16/cr16.h (PUSH_ARGS): Removed. * config/i386/i386.c (ix86_push_argument): New. (TARGET_PUSH_ARGUMENT): Likewise. * config/i386/i386.h (PUSH_ARGS): Removed. * config/m32c/m32c.c (TARGET_PUSH_ARGUMENT): New. * config/m32c/m32c.h (PUSH_ARGS): Removed. * config/nios2/nios2.h (PUSH_ARGS): Likewise. 
* config/pru/pru.h (PUSH_ARGS): Likewise. * doc/tm.texi.in: Remove PUSH_ARGS documentation. Add TARGET_PUSH_ARGUMENT hook. * doc/tm.texi: Regenerated. gcc/testsuite/ PR target/100704 * gcc.target/i386/pr100704-1.c: New test. * gcc.target/i386/pr100704-2.c: Likewise. * gcc.target/i386/pr100704-3.c: Likewise. --- gcc/config/bpf/bpf.h | 3 --- gcc/config/cr16/cr16.c | 2 ++ gcc/config/cr16/cr16.h | 2 -- gcc/config/i386/i386.c | 14 ++++++++++++++ gcc/config/i386/i386.h | 7 +------ gcc/config/m32c/m32c.c | 3 +++ gcc/config/m32c/m32c.h | 1 - gcc/config/nios2/nios2.h | 1 - gcc/config/pru/pru.h | 1 - 9 files changed, 20 insertions(+), 14 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/bpf/bpf.h b/gcc/config/bpf/bpf.h index 4c5b19e..80195ce 100644 --- a/gcc/config/bpf/bpf.h +++ b/gcc/config/bpf/bpf.h @@ -288,9 +288,6 @@ enum reg_class never used when passing arguments. However, we still have to define the constants below. */ -/* If nonzero, push insns will be used to pass outgoing arguments. */ -#define PUSH_ARGS 0 - /* If nonzero, function arguments will be evaluated from last to first, rather than from first to last. */ #define PUSH_ARGS_REVERSED 1 diff --git a/gcc/config/cr16/cr16.c b/gcc/config/cr16/cr16.c index 6c81c39..aaa2260 100644 --- a/gcc/config/cr16/cr16.c +++ b/gcc/config/cr16/cr16.c @@ -158,6 +158,8 @@ static void cr16_print_operand_address (FILE *, machine_mode, rtx); #define TARGET_CLASS_LIKELY_SPILLED_P cr16_class_likely_spilled_p /* Passing function arguments. */ +#undef TARGET_PUSH_ARGUMENT +#define TARGET_PUSH_ARGUMENT hook_bool_uint_true #undef TARGET_FUNCTION_ARG #define TARGET_FUNCTION_ARG cr16_function_arg #undef TARGET_FUNCTION_ARG_ADVANCE diff --git a/gcc/config/cr16/cr16.h b/gcc/config/cr16/cr16.h index 4ce9e81..a3ad035 100644 --- a/gcc/config/cr16/cr16.h +++ b/gcc/config/cr16/cr16.h @@ -376,8 +376,6 @@ enum reg_class #define ACCUMULATE_OUTGOING_ARGS 0 -#define PUSH_ARGS 1 - #define PUSH_ROUNDING(BYTES) cr16_push_rounding (BYTES) #ifndef CUMULATIVE_ARGS diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index a612558..7d0d414 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -4191,6 +4191,18 @@ ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED) } } +/* Implement TARGET_PUSH_ARGUMENT. */ + +static bool +ix86_push_argument (unsigned int npush) +{ + /* If SSE2 is available, use vector move to put large argument onto + stack. NB: In 32-bit mode, use 8-byte vector move. */ + return ((!TARGET_SSE2 || npush < (TARGET_64BIT ? 16 : 8)) + && TARGET_PUSH_ARGS + && !ACCUMULATE_OUTGOING_ARGS); +} + /* Create the va_list data type. */ @@ -23695,6 +23707,8 @@ ix86_run_selftests (void) #define TARGET_C_EXCESS_PRECISION ix86_get_excess_precision #undef TARGET_PROMOTE_PROTOTYPES #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true +#undef TARGET_PUSH_ARGUMENT +#define TARGET_PUSH_ARGUMENT ix86_push_argument #undef TARGET_SETUP_INCOMING_VARARGS #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs #undef TARGET_MUST_PASS_IN_STACK diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 182b327..6e0340a 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -1462,13 +1462,8 @@ enum reg_class || TARGET_64BIT_MS_ABI \ || (TARGET_MACHO && crtl->profile)) -/* If defined, a C expression whose value is nonzero when we want to use PUSH - instructions to pass outgoing arguments. 
*/ - -#define PUSH_ARGS (TARGET_PUSH_ARGS && !ACCUMULATE_OUTGOING_ARGS) - /* We want the stack and args grow in opposite directions, even if - PUSH_ARGS is 0. */ + targetm.calls.push_argument returns false. */ #define PUSH_ARGS_REVERSED 1 /* Offset of first parameter from the argument pointer register value. */ diff --git a/gcc/config/m32c/m32c.c b/gcc/config/m32c/m32c.c index b1cb359..d22bdd7 100644 --- a/gcc/config/m32c/m32c.c +++ b/gcc/config/m32c/m32c.c @@ -1296,6 +1296,9 @@ m32c_push_rounding (poly_int64 n) return (n + 1) & ~1; } +#undef TARGET_PUSH_ARGUMENT +#define TARGET_PUSH_ARGUMENT hook_bool_uint_true + /* Passing Arguments in Registers */ /* Implements TARGET_FUNCTION_ARG. Arguments are passed partly in diff --git a/gcc/config/m32c/m32c.h b/gcc/config/m32c/m32c.h index 635f592..228a73d 100644 --- a/gcc/config/m32c/m32c.h +++ b/gcc/config/m32c/m32c.h @@ -472,7 +472,6 @@ enum reg_class /* Passing Function Arguments on the Stack */ -#define PUSH_ARGS 1 #define PUSH_ROUNDING(N) m32c_push_rounding (N) #define CALL_POPS_ARGS(C) 0 diff --git a/gcc/config/nios2/nios2.h b/gcc/config/nios2/nios2.h index 1840a46..dfca12c 100644 --- a/gcc/config/nios2/nios2.h +++ b/gcc/config/nios2/nios2.h @@ -297,7 +297,6 @@ typedef struct nios2_args ((REGNO) >= FIRST_ARG_REGNO && (REGNO) <= LAST_ARG_REGNO) /* Passing function arguments on stack. */ -#define PUSH_ARGS 0 #define ACCUMULATE_OUTGOING_ARGS 1 /* We define TARGET_RETURN_IN_MEMORY, so set to zero. */ diff --git a/gcc/config/pru/pru.h b/gcc/config/pru/pru.h index 4c35a7d..9b6be32 100644 --- a/gcc/config/pru/pru.h +++ b/gcc/config/pru/pru.h @@ -339,7 +339,6 @@ typedef struct pru_args ((REGNO) >= FIRST_ARG_REGNUM && (REGNO) <= LAST_ARG_REGNUM) /* Passing function arguments on stack. */ -#define PUSH_ARGS 0 #define ACCUMULATE_OUTGOING_ARGS 1 /* We define TARGET_RETURN_IN_MEMORY, so set to zero. */ -- cgit v1.1 From 00f730ec3a24fd1453b3ee96e8a50a29d5db3ac3 Mon Sep 17 00:00:00 2001 From: Aaron Sawdey Date: Wed, 16 Jun 2021 10:58:08 -0500 Subject: Add needed earlyclobber to fusion patterns The add-logical and add-add fusion patterns all have constraint alternatives "=0,1,&r,r" for the output (3). The inputs 0 and 1 are used in the first fusion instruction and then either may be reused as a temp for the output of the first insn which is input to the second. However, if input 2 is the same as 0 or 1, it gets clobbered unexpectedly. So the first 2 alts need to be "=&0,&1,&r,r" instead to indicate that in alts 0 and 1, the register used for 3 is earlyclobber, hence can't be the same as input 2. This was actually encountered in the backport of the add-logical fusion patch to gcc-11. Some code in go hit this case: : andc r30,r30,r9 r30 now (~(x|((x&c)+c)))&(~c) --> this is new x : b : addi r31,r31,-1 r31 now m-1 : srd r31,r30,r31 r31 now x>>(m-1) : subf r30,r31,r30 r30 now x-(x>>(m-1)) : or r30,r30,r30 # mdoom nop : not r3,r30 r3 now ~(x-(x>>(m-1))) -- WHOOPS The or r30,r30,r30 was meant to be or-ing in the earlier value of r30 which was overwritten by the output of the subf. gcc/ChangeLog * config/rs6000/genfusion.pl (gen_logical_addsubf): Add earlyclobber to alts 0/1. (gen_addadd): Add earlyclobber to alts 0/1. * config/rs6000/fusion.md: Regenerate file. 
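To make the constraint change easier to follow, here is a schematic pattern in the same shape as the generated ones. It is not taken from fusion.md; the pattern name, the insn condition, and the output template are placeholders, and the comments only restate the reasoning from the description above:

;; Illustration only; not one of the generated fusion.md patterns.
;; With "=0,1,&r,r", alternative 0 ties output 3 to input 0 and
;; alternative 1 ties it to input 1, but nothing stops input 2 from
;; being the same register as 0 or 1 (e.g. when one value feeds both
;; operands).  The first fused insn then writes 3 -- i.e. that shared
;; register -- before the second insn has read 2, so 2 is clobbered.
;; "=&0,&1,&r,r" keeps the tie to 0 or 1 but marks 3 earlyclobber in
;; those alternatives, so the register allocator must not let it
;; overlap input 2.
(define_insn "*fuse_example_and_or"
  [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r")
        (ior:GPR (and:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")
                          (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))
                 (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))]
  "TARGET_FUSION" ; placeholder condition
  "and %3,%0,%1\;or %3,%3,%2")

The actual patch below makes exactly this "=0,1,&r,r" to "=&0,&1,&r,r" change in genfusion.pl and in every generated add-logical, logical-add and add-add pattern in fusion.md.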
--- gcc/config/rs6000/fusion.md | 300 ++++++++++++++++++++--------------------- gcc/config/rs6000/genfusion.pl | 4 +- 2 files changed, 152 insertions(+), 152 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/fusion.md b/gcc/config/rs6000/fusion.md index e642ff5..516baa0 100644 --- a/gcc/config/rs6000/fusion.md +++ b/gcc/config/rs6000/fusion.md @@ -358,7 +358,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar and -> and (define_insn "*fuse_and_and" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (and:GPR (and:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "%r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) @@ -376,7 +376,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar andc -> and (define_insn "*fuse_andc_and" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (and:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) @@ -394,7 +394,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar eqv -> and (define_insn "*fuse_eqv_and" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (and:GPR (not:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) @@ -412,7 +412,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar nand -> and (define_insn "*fuse_nand_and" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (and:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) @@ -430,7 +430,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar nor -> and (define_insn "*fuse_nor_and" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (and:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) @@ -448,7 +448,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar or -> and (define_insn "*fuse_or_and" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (and:GPR (ior:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) @@ -466,7 +466,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar orc -> and (define_insn "*fuse_orc_and" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (and:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) @@ -484,7 +484,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar xor -> 
and (define_insn "*fuse_xor_and" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (and:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) @@ -502,7 +502,7 @@ ;; add-logical fusion pattern generated by gen_logical_addsubf ;; scalar add -> and (define_insn "*fuse_add_and" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (and:GPR (plus:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) @@ -520,7 +520,7 @@ ;; add-logical fusion pattern generated by gen_logical_addsubf ;; scalar subf -> and (define_insn "*fuse_subf_and" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (and:GPR (minus:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) @@ -538,7 +538,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar and -> andc (define_insn "*fuse_and_andc" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (and:GPR (and:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -556,7 +556,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar andc -> andc (define_insn "*fuse_andc_andc" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (and:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -574,7 +574,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar eqv -> andc (define_insn "*fuse_eqv_andc" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (and:GPR (not:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -592,7 +592,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar nand -> andc (define_insn "*fuse_nand_andc" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (and:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -610,7 +610,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar nor -> andc (define_insn "*fuse_nor_andc" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (and:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -628,7 +628,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar 
or -> andc (define_insn "*fuse_or_andc" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (and:GPR (ior:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -646,7 +646,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar orc -> andc (define_insn "*fuse_orc_andc" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (and:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -664,7 +664,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar xor -> andc (define_insn "*fuse_xor_andc" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (and:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -682,7 +682,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar and -> eqv (define_insn "*fuse_and_eqv" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (not:GPR (xor:GPR (and:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -700,7 +700,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar andc -> eqv (define_insn "*fuse_andc_eqv" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (not:GPR (xor:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -718,7 +718,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar eqv -> eqv (define_insn "*fuse_eqv_eqv" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (not:GPR (xor:GPR (not:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "%r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -736,7 +736,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar nand -> eqv (define_insn "*fuse_nand_eqv" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (not:GPR (xor:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -754,7 +754,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar nor -> eqv (define_insn "*fuse_nor_eqv" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (not:GPR (xor:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -772,7 +772,7 @@ ;; logical-logical fusion 
pattern generated by gen_logical_addsubf ;; scalar or -> eqv (define_insn "*fuse_or_eqv" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (not:GPR (xor:GPR (ior:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -790,7 +790,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar orc -> eqv (define_insn "*fuse_orc_eqv" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (not:GPR (xor:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -808,7 +808,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar xor -> eqv (define_insn "*fuse_xor_eqv" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (not:GPR (xor:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -826,7 +826,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar and -> nand (define_insn "*fuse_and_nand" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (ior:GPR (not:GPR (and:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -844,7 +844,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar andc -> nand (define_insn "*fuse_andc_nand" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (ior:GPR (not:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -862,7 +862,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar eqv -> nand (define_insn "*fuse_eqv_nand" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (ior:GPR (not:GPR (not:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -880,7 +880,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar nand -> nand (define_insn "*fuse_nand_nand" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (ior:GPR (not:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -898,7 +898,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar nor -> nand (define_insn "*fuse_nor_nand" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (ior:GPR (not:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")))) (not:GPR 
(match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -916,7 +916,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar or -> nand (define_insn "*fuse_or_nand" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (ior:GPR (not:GPR (ior:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -934,7 +934,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar orc -> nand (define_insn "*fuse_orc_nand" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (ior:GPR (not:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -952,7 +952,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar xor -> nand (define_insn "*fuse_xor_nand" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (ior:GPR (not:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -970,7 +970,7 @@ ;; add-logical fusion pattern generated by gen_logical_addsubf ;; scalar add -> nand (define_insn "*fuse_add_nand" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (ior:GPR (not:GPR (plus:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -988,7 +988,7 @@ ;; add-logical fusion pattern generated by gen_logical_addsubf ;; scalar subf -> nand (define_insn "*fuse_subf_nand" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (ior:GPR (not:GPR (minus:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -1006,7 +1006,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar and -> nor (define_insn "*fuse_and_nor" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (and:GPR (not:GPR (and:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -1024,7 +1024,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar andc -> nor (define_insn "*fuse_andc_nor" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (and:GPR (not:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -1042,7 +1042,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar eqv -> nor (define_insn "*fuse_eqv_nor" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (and:GPR (not:GPR (not:GPR (xor:GPR (match_operand:GPR 0 
"gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -1060,7 +1060,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar nand -> nor (define_insn "*fuse_nand_nor" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (and:GPR (not:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -1078,7 +1078,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar nor -> nor (define_insn "*fuse_nor_nor" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (and:GPR (not:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -1096,7 +1096,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar or -> nor (define_insn "*fuse_or_nor" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (and:GPR (not:GPR (ior:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -1114,7 +1114,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar orc -> nor (define_insn "*fuse_orc_nor" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (and:GPR (not:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -1132,7 +1132,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar xor -> nor (define_insn "*fuse_xor_nor" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (and:GPR (not:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -1150,7 +1150,7 @@ ;; add-logical fusion pattern generated by gen_logical_addsubf ;; scalar add -> nor (define_insn "*fuse_add_nor" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (and:GPR (not:GPR (plus:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -1168,7 +1168,7 @@ ;; add-logical fusion pattern generated by gen_logical_addsubf ;; scalar subf -> nor (define_insn "*fuse_subf_nor" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (and:GPR (not:GPR (minus:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -1186,7 +1186,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar and -> or (define_insn "*fuse_and_or" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + 
[(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (ior:GPR (and:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) @@ -1204,7 +1204,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar andc -> or (define_insn "*fuse_andc_or" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (ior:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) @@ -1222,7 +1222,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar eqv -> or (define_insn "*fuse_eqv_or" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (ior:GPR (not:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) @@ -1240,7 +1240,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar nand -> or (define_insn "*fuse_nand_or" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (ior:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) @@ -1258,7 +1258,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar nor -> or (define_insn "*fuse_nor_or" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (ior:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) @@ -1276,7 +1276,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar or -> or (define_insn "*fuse_or_or" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (ior:GPR (ior:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "%r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) @@ -1294,7 +1294,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar orc -> or (define_insn "*fuse_orc_or" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (ior:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) @@ -1312,7 +1312,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar xor -> or (define_insn "*fuse_xor_or" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (ior:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) @@ -1330,7 +1330,7 @@ ;; add-logical fusion pattern generated by gen_logical_addsubf ;; scalar add -> or (define_insn "*fuse_add_or" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" 
"=&0,&1,&r,r") (ior:GPR (plus:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) @@ -1348,7 +1348,7 @@ ;; add-logical fusion pattern generated by gen_logical_addsubf ;; scalar subf -> or (define_insn "*fuse_subf_or" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (ior:GPR (minus:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) @@ -1366,7 +1366,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar and -> orc (define_insn "*fuse_and_orc" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (ior:GPR (and:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -1384,7 +1384,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar andc -> orc (define_insn "*fuse_andc_orc" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (ior:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -1402,7 +1402,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar eqv -> orc (define_insn "*fuse_eqv_orc" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (ior:GPR (not:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -1420,7 +1420,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar nand -> orc (define_insn "*fuse_nand_orc" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (ior:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -1438,7 +1438,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar nor -> orc (define_insn "*fuse_nor_orc" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (ior:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -1456,7 +1456,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar or -> orc (define_insn "*fuse_or_orc" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (ior:GPR (ior:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -1474,7 +1474,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar orc -> orc (define_insn "*fuse_orc_orc" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set 
(match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (ior:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -1492,7 +1492,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar xor -> orc (define_insn "*fuse_xor_orc" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (ior:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) @@ -1510,7 +1510,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar and -> xor (define_insn "*fuse_and_xor" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (xor:GPR (and:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) @@ -1528,7 +1528,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar andc -> xor (define_insn "*fuse_andc_xor" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (xor:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) @@ -1546,7 +1546,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar eqv -> xor (define_insn "*fuse_eqv_xor" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (xor:GPR (not:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) @@ -1564,7 +1564,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar nand -> xor (define_insn "*fuse_nand_xor" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (xor:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) @@ -1582,7 +1582,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar nor -> xor (define_insn "*fuse_nor_xor" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (xor:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) @@ -1600,7 +1600,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar or -> xor (define_insn "*fuse_or_xor" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (xor:GPR (ior:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) @@ -1618,7 +1618,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar orc -> xor (define_insn "*fuse_orc_xor" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set 
(match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (xor:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) @@ -1636,7 +1636,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; scalar xor -> xor (define_insn "*fuse_xor_xor" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (xor:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "%r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) @@ -1654,7 +1654,7 @@ ;; logical-add fusion pattern generated by gen_logical_addsubf ;; scalar and -> add (define_insn "*fuse_and_add" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (plus:GPR (and:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) @@ -1672,7 +1672,7 @@ ;; logical-add fusion pattern generated by gen_logical_addsubf ;; scalar nand -> add (define_insn "*fuse_nand_add" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (plus:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) @@ -1690,7 +1690,7 @@ ;; logical-add fusion pattern generated by gen_logical_addsubf ;; scalar nor -> add (define_insn "*fuse_nor_add" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (plus:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) @@ -1708,7 +1708,7 @@ ;; logical-add fusion pattern generated by gen_logical_addsubf ;; scalar or -> add (define_insn "*fuse_or_add" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (plus:GPR (ior:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) @@ -1726,7 +1726,7 @@ ;; logical-add fusion pattern generated by gen_logical_addsubf ;; scalar and -> subf (define_insn "*fuse_and_subf" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (minus:GPR (and:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) @@ -1744,7 +1744,7 @@ ;; logical-add fusion pattern generated by gen_logical_addsubf ;; scalar nand -> subf (define_insn "*fuse_nand_subf" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (minus:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) @@ -1762,7 +1762,7 @@ ;; logical-add fusion pattern generated by gen_logical_addsubf ;; scalar nor -> subf (define_insn "*fuse_nor_subf" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" 
"=&0,&1,&r,r") (minus:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) @@ -1780,7 +1780,7 @@ ;; logical-add fusion pattern generated by gen_logical_addsubf ;; scalar or -> subf (define_insn "*fuse_or_subf" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (minus:GPR (ior:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) @@ -1798,7 +1798,7 @@ ;; logical-add fusion pattern generated by gen_logical_addsubf ;; scalar and -> rsubf (define_insn "*fuse_and_rsubf" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (minus:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r") (and:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")))) @@ -1816,7 +1816,7 @@ ;; logical-add fusion pattern generated by gen_logical_addsubf ;; scalar nand -> rsubf (define_insn "*fuse_nand_rsubf" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (minus:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r") (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))))) @@ -1834,7 +1834,7 @@ ;; logical-add fusion pattern generated by gen_logical_addsubf ;; scalar nor -> rsubf (define_insn "*fuse_nor_rsubf" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (minus:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r") (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))))) @@ -1852,7 +1852,7 @@ ;; logical-add fusion pattern generated by gen_logical_addsubf ;; scalar or -> rsubf (define_insn "*fuse_or_rsubf" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (minus:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r") (ior:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")))) @@ -1870,7 +1870,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vand -> vand (define_insn "*fuse_vand_vand" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (and:VM (and:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "%v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) @@ -1888,7 +1888,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vandc -> vand (define_insn "*fuse_vandc_vand" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (and:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) @@ -1906,7 +1906,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector veqv -> vand (define_insn "*fuse_veqv_vand" - [(set (match_operand:VM 3 
"altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (and:VM (not:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) @@ -1924,7 +1924,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vnand -> vand (define_insn "*fuse_vnand_vand" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (and:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) @@ -1942,7 +1942,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vnor -> vand (define_insn "*fuse_vnor_vand" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (and:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) @@ -1960,7 +1960,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vor -> vand (define_insn "*fuse_vor_vand" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (and:VM (ior:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) @@ -1978,7 +1978,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vorc -> vand (define_insn "*fuse_vorc_vand" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (and:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) @@ -1996,7 +1996,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vxor -> vand (define_insn "*fuse_vxor_vand" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (and:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) @@ -2014,7 +2014,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vand -> vandc (define_insn "*fuse_vand_vandc" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (and:VM (and:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) @@ -2032,7 +2032,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vandc -> vandc (define_insn "*fuse_vandc_vandc" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (and:VM (and:VM (not:VM (match_operand:VM 0 
"altivec_register_operand" "v,v,v,v")) (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) @@ -2050,7 +2050,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector veqv -> vandc (define_insn "*fuse_veqv_vandc" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (and:VM (not:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) @@ -2068,7 +2068,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vnand -> vandc (define_insn "*fuse_vnand_vandc" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (and:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) @@ -2086,7 +2086,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vnor -> vandc (define_insn "*fuse_vnor_vandc" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (and:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) @@ -2104,7 +2104,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vor -> vandc (define_insn "*fuse_vor_vandc" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (and:VM (ior:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) @@ -2122,7 +2122,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vorc -> vandc (define_insn "*fuse_vorc_vandc" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (and:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) @@ -2140,7 +2140,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vxor -> vandc (define_insn "*fuse_vxor_vandc" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (and:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) @@ -2158,7 +2158,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vand -> veqv (define_insn "*fuse_vand_veqv" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (not:VM (xor:VM (and:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) 
(match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) @@ -2176,7 +2176,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vandc -> veqv (define_insn "*fuse_vandc_veqv" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (not:VM (xor:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) @@ -2194,7 +2194,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector veqv -> veqv (define_insn "*fuse_veqv_veqv" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (not:VM (xor:VM (not:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "%v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) @@ -2212,7 +2212,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vnand -> veqv (define_insn "*fuse_vnand_veqv" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (not:VM (xor:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) @@ -2230,7 +2230,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vnor -> veqv (define_insn "*fuse_vnor_veqv" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (not:VM (xor:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) @@ -2248,7 +2248,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vor -> veqv (define_insn "*fuse_vor_veqv" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (not:VM (xor:VM (ior:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) @@ -2266,7 +2266,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vorc -> veqv (define_insn "*fuse_vorc_veqv" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (not:VM (xor:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) @@ -2284,7 +2284,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vxor -> veqv (define_insn "*fuse_vxor_veqv" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (not:VM (xor:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) @@ -2302,7 +2302,7 @@ ;; logical-logical 
fusion pattern generated by gen_logical_addsubf ;; vector vand -> vnand (define_insn "*fuse_vand_vnand" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (ior:VM (not:VM (and:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) @@ -2320,7 +2320,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vandc -> vnand (define_insn "*fuse_vandc_vnand" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (ior:VM (not:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) @@ -2338,7 +2338,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector veqv -> vnand (define_insn "*fuse_veqv_vnand" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (ior:VM (not:VM (not:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) @@ -2356,7 +2356,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vnand -> vnand (define_insn "*fuse_vnand_vnand" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (ior:VM (not:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) @@ -2374,7 +2374,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vnor -> vnand (define_insn "*fuse_vnor_vnand" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (ior:VM (not:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) @@ -2392,7 +2392,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vor -> vnand (define_insn "*fuse_vor_vnand" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (ior:VM (not:VM (ior:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) @@ -2410,7 +2410,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vorc -> vnand (define_insn "*fuse_vorc_vnand" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (ior:VM (not:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) @@ -2428,7 +2428,7 @@ ;; logical-logical fusion pattern generated by 
gen_logical_addsubf ;; vector vxor -> vnand (define_insn "*fuse_vxor_vnand" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (ior:VM (not:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) @@ -2446,7 +2446,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vand -> vnor (define_insn "*fuse_vand_vnor" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (and:VM (not:VM (and:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) @@ -2464,7 +2464,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vandc -> vnor (define_insn "*fuse_vandc_vnor" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (and:VM (not:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) @@ -2482,7 +2482,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector veqv -> vnor (define_insn "*fuse_veqv_vnor" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (and:VM (not:VM (not:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) @@ -2500,7 +2500,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vnand -> vnor (define_insn "*fuse_vnand_vnor" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (and:VM (not:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) @@ -2518,7 +2518,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vnor -> vnor (define_insn "*fuse_vnor_vnor" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (and:VM (not:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) @@ -2536,7 +2536,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vor -> vnor (define_insn "*fuse_vor_vnor" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (and:VM (not:VM (ior:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) @@ -2554,7 +2554,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vorc -> vnor 
(define_insn "*fuse_vorc_vnor" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (and:VM (not:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) @@ -2572,7 +2572,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vxor -> vnor (define_insn "*fuse_vxor_vnor" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (and:VM (not:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) @@ -2590,7 +2590,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vand -> vor (define_insn "*fuse_vand_vor" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (ior:VM (and:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) @@ -2608,7 +2608,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vandc -> vor (define_insn "*fuse_vandc_vor" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (ior:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) @@ -2626,7 +2626,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector veqv -> vor (define_insn "*fuse_veqv_vor" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (ior:VM (not:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) @@ -2644,7 +2644,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vnand -> vor (define_insn "*fuse_vnand_vor" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (ior:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) @@ -2662,7 +2662,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vnor -> vor (define_insn "*fuse_vnor_vor" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (ior:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) @@ -2680,7 +2680,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vor -> vor (define_insn "*fuse_vor_vor" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 
"altivec_register_operand" "=&0,&1,&v,v") (ior:VM (ior:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "%v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) @@ -2698,7 +2698,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vorc -> vor (define_insn "*fuse_vorc_vor" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (ior:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) @@ -2716,7 +2716,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vxor -> vor (define_insn "*fuse_vxor_vor" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (ior:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) @@ -2734,7 +2734,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vand -> vorc (define_insn "*fuse_vand_vorc" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (ior:VM (and:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) @@ -2752,7 +2752,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vandc -> vorc (define_insn "*fuse_vandc_vorc" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (ior:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) @@ -2770,7 +2770,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector veqv -> vorc (define_insn "*fuse_veqv_vorc" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (ior:VM (not:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) @@ -2788,7 +2788,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vnand -> vorc (define_insn "*fuse_vnand_vorc" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (ior:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) @@ -2806,7 +2806,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vnor -> vorc (define_insn "*fuse_vnor_vorc" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (ior:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 1 
"altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) @@ -2824,7 +2824,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vor -> vorc (define_insn "*fuse_vor_vorc" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (ior:VM (ior:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) @@ -2842,7 +2842,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vorc -> vorc (define_insn "*fuse_vorc_vorc" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (ior:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) @@ -2860,7 +2860,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vxor -> vorc (define_insn "*fuse_vxor_vorc" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (ior:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) @@ -2878,7 +2878,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vand -> vxor (define_insn "*fuse_vand_vxor" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (xor:VM (and:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) @@ -2896,7 +2896,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vandc -> vxor (define_insn "*fuse_vandc_vxor" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (xor:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) @@ -2914,7 +2914,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector veqv -> vxor (define_insn "*fuse_veqv_vxor" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (xor:VM (not:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) @@ -2932,7 +2932,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vnand -> vxor (define_insn "*fuse_vnand_vxor" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (xor:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) @@ -2950,7 +2950,7 @@ ;; logical-logical fusion 
pattern generated by gen_logical_addsubf ;; vector vnor -> vxor (define_insn "*fuse_vnor_vxor" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (xor:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) @@ -2968,7 +2968,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vor -> vxor (define_insn "*fuse_vor_vxor" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (xor:VM (ior:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) @@ -2986,7 +2986,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vorc -> vxor (define_insn "*fuse_vorc_vxor" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (xor:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) @@ -3004,7 +3004,7 @@ ;; logical-logical fusion pattern generated by gen_logical_addsubf ;; vector vxor -> vxor (define_insn "*fuse_vxor_vxor" - [(set (match_operand:VM 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:VM 3 "altivec_register_operand" "=&0,&1,&v,v") (xor:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "%v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) @@ -3021,7 +3021,7 @@ ;; add-add fusion pattern generated by gen_addadd (define_insn "*fuse_add_add" - [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r") + [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r") (plus:GPR (plus:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "%r,r,r,r")) @@ -3039,7 +3039,7 @@ ;; vaddudm-vaddudm fusion pattern generated by gen_addadd (define_insn "*fuse_vaddudm_vaddudm" - [(set (match_operand:V2DI 3 "altivec_register_operand" "=0,1,&v,v") + [(set (match_operand:V2DI 3 "altivec_register_operand" "=&0,&1,&v,v") (plus:V2DI (plus:V2DI (match_operand:V2DI 0 "altivec_register_operand" "v,v,v,v") (match_operand:V2DI 1 "altivec_register_operand" "%v,v,v,v")) diff --git a/gcc/config/rs6000/genfusion.pl b/gcc/config/rs6000/genfusion.pl index 577b955..ac22852 100755 --- a/gcc/config/rs6000/genfusion.pl +++ b/gcc/config/rs6000/genfusion.pl @@ -263,7 +263,7 @@ sub gen_logical_addsubf ;; $ftype fusion pattern generated by gen_logical_addsubf ;; $kind $inner_op -> $outer_name (define_insn "*fuse_${inner_op}_${outer_name}" - [(set (match_operand:${mode} 3 "${pred}" "=0,1,&${constraint},${constraint}") + [(set (match_operand:${mode} 3 "${pred}" "=&0,&1,&${constraint},${constraint}") ${outer_exp}) (clobber (match_scratch:${mode} 4 "=X,X,X,&r"))] "(TARGET_P10_FUSION && $target_flag)" @@ -307,7 +307,7 @@ sub gen_addadd ;; ${op}-${op} fusion pattern generated by gen_addadd (define_insn "*fuse_${op}_${op}" - [(set (match_operand:${mode} 3 "${pred}" "=0,1,&${constraint},${constraint}") + [(set (match_operand:${mode} 3 "${pred}" "=&0,&1,&${constraint},${constraint}") (plus:${mode} 
(plus:${mode} (match_operand:${mode} 0 "${pred}" "${c4}") (match_operand:${mode} 1 "${pred}" "%${c4}")) -- cgit v1.1 From 7d08043da935095543172f91f691917bd6379c53 Mon Sep 17 00:00:00 2001 From: Michael Meissner Date: Thu, 17 Jun 2021 22:05:16 -0400 Subject: Add IEEE 128-bit min/max support on PowerPC. This patch adds the support for the IEEE 128-bit floating point C minimum and maximum instructions. The next patch will add the support for using the compare and set mask instruction to implement conditional moves. This patch does not try to re-use the code used for SF/DF min/max support. It defines a separate insn for the IEEE 128-bit support. It uses the code iterator to simplify adding both operations. GCC will not convert ternary operations into using min/max instructions provided in this patch unless the user uses -Ofast. The next patch that adds conditional move instructions will enable the ternary conversion in many cases. gcc/ 2021-06-17 Michael Meissner * config/rs6000/rs6000.c (rs6000_emit_minmax): Add support for ISA 3.1 IEEE 128-bit floating point xsmaxcqp/xsmincqp instructions. * config/rs6000/rs6000.md (s3, IEEE128 iterator): New insns. gcc/testsuite/ 2021-06-17 Michael Meissner * gcc.target/powerpc/float128-minmax-2.c: New test. --- gcc/config/rs6000/rs6000.c | 3 ++- gcc/config/rs6000/rs6000.md | 11 +++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 38f9281..2c249e1 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -16103,7 +16103,8 @@ rs6000_emit_minmax (rtx dest, enum rtx_code code, rtx op0, rtx op1) /* VSX/altivec have direct min/max insns. */ if ((code == SMAX || code == SMIN) && (VECTOR_UNIT_ALTIVEC_OR_VSX_P (mode) - || (mode == SFmode && VECTOR_UNIT_VSX_P (DFmode)))) + || (mode == SFmode && VECTOR_UNIT_VSX_P (DFmode)) + || (TARGET_POWER10 && TARGET_FLOAT128_HW && FLOAT128_IEEE_P (mode)))) { emit_insn (gen_rtx_SET (dest, gen_rtx_fmt_ee (code, mode, op0, op1))); return; diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index 510dbff..abd825f 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -5214,6 +5214,17 @@ } [(set_attr "type" "fp")]) +;; Min/max for ISA 3.1 IEEE 128-bit floating point +(define_insn "s3" + [(set (match_operand:IEEE128 0 "altivec_register_operand" "=v") + (fp_minmax:IEEE128 + (match_operand:IEEE128 1 "altivec_register_operand" "v") + (match_operand:IEEE128 2 "altivec_register_operand" "v")))] + "TARGET_POWER10 && TARGET_FLOAT128_HW" + "xscqp %0,%1,%2" + [(set_attr "type" "vecfloat") + (set_attr "size" "128")]) + ;; The conditional move instructions allow us to perform max and min operations ;; even when we don't have the appropriate max/min instruction using the FSEL ;; instruction. -- cgit v1.1 From cfa1f8226f275447015e2cb3fb0d876133e6509b Mon Sep 17 00:00:00 2001 From: Marcel Vollweiler Date: Fri, 18 Jun 2021 04:50:36 -0700 Subject: gcc/configure.ac: fix register issue for global_load assembler functions gcc/ChangeLog: * config.in: Regenerate. * config/gcn/gcn.c (print_operand_address): Fix for global_load assembler functions. * configure: Regenerate. * configure.ac: Fix for global_load assembler functions. 
--- gcc/config/gcn/gcn.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c index 283a91f..54a1c0b 100644 --- a/gcc/config/gcn/gcn.c +++ b/gcc/config/gcn/gcn.c @@ -5481,13 +5481,22 @@ print_operand_address (FILE *file, rtx mem) if (vgpr_offset == NULL_RTX) /* In this case, the vector offset is zero, so we use the first lane of v1, which is initialized to zero. */ - fprintf (file, "v[1:2]"); + { + if (HAVE_GCN_ASM_GLOBAL_LOAD_FIXED) + fprintf (file, "v1"); + else + fprintf (file, "v[1:2]"); + } else if (REG_P (vgpr_offset) && VGPR_REGNO_P (REGNO (vgpr_offset))) { - fprintf (file, "v[%d:%d]", - REGNO (vgpr_offset) - FIRST_VGPR_REG, - REGNO (vgpr_offset) - FIRST_VGPR_REG + 1); + if (HAVE_GCN_ASM_GLOBAL_LOAD_FIXED) + fprintf (file, "v%d", + REGNO (vgpr_offset) - FIRST_VGPR_REG); + else + fprintf (file, "v[%d:%d]", + REGNO (vgpr_offset) - FIRST_VGPR_REG, + REGNO (vgpr_offset) - FIRST_VGPR_REG + 1); } else output_operand_lossage ("bad ADDR_SPACE_GLOBAL address"); -- cgit v1.1 From f58d03b5df25f9eab06b7eacea8da780fc2e0219 Mon Sep 17 00:00:00 2001 From: Srinath Parvathaneni Date: Fri, 18 Jun 2021 13:21:51 +0100 Subject: arm: Fix multilib mapping for CDE extensions [PR100856]. On passing +cdecp[0-7] extension to the -march string in command line options, multilib linking is failing as mentioned in PR100856. This patch fixes this issue by generating a separate canonical string by removing compiler options which are not required for multilib linking from march string and assign the new string to mlibarch option. This mlibarch string is used for multilib comparison. gcc/ChangeLog: 2021-06-10 Srinath Parvathaneni PR target/100856 * common/config/arm/arm-common.c (arm_canon_arch_option_1): New function derived from arm_canon_arch. (arm_canon_arch_option): Call it. (arm_canon_arch_multilib_option): New function. * config/arm/arm-cpus.in (IGNORE_FOR_MULTILIB): New fgroup. * config/arm/arm.h (arm_canon_arch_multilib_option): New prototype. (CANON_ARCH_MULTILIB_SPEC_FUNCTION): New macro. (MULTILIB_ARCH_CANONICAL_SPECS): New macro. (DRIVER_SELF_SPECS): Add MULTILIB_ARCH_CANONICAL_SPECS. * config/arm/arm.opt (mlibarch): New option. * config/arm/t-rmprofile (MULTILIB_MATCHES): For armv8*-m, replace use of march on RHS with mlibarch. gcc/testsuite/ChangeLog: 2021-06-10 Srinath Parvathaneni PR target/100856 * gcc.target/arm/acle/pr100856.c: New test. * gcc.target/arm/multilib.exp: Add tests for cde options. --- gcc/config/arm/arm-cpus.in | 2 ++ gcc/config/arm/arm.h | 15 +++++++++++++++ gcc/config/arm/arm.opt | 6 ++++++ gcc/config/arm/t-rmprofile | 25 ++++++++++++++++++------- 4 files changed, 41 insertions(+), 7 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm-cpus.in b/gcc/config/arm/arm-cpus.in index 0becb43..ab4b6ac 100644 --- a/gcc/config/arm/arm-cpus.in +++ b/gcc/config/arm/arm-cpus.in @@ -324,6 +324,8 @@ define implied vfp_base MVE MVE_FP ALL_FP # need to ignore it for matching purposes. 
define fgroup ALL_QUIRKS quirk_no_volatile_ce quirk_armv6kz quirk_cm3_ldrd xscale quirk_no_asmcpu +define fgroup IGNORE_FOR_MULTILIB cdecp0 cdecp1 cdecp2 cdecp3 cdecp4 cdecp5 cdecp6 cdecp7 + # Architecture entries # format: # begin arch diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h index 8e5bd57..015299c 100644 --- a/gcc/config/arm/arm.h +++ b/gcc/config/arm/arm.h @@ -2444,10 +2444,14 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); #endif const char *arm_canon_arch_option (int argc, const char **argv); +const char *arm_canon_arch_multilib_option (int argc, const char **argv); #define CANON_ARCH_SPEC_FUNCTION \ { "canon_arch", arm_canon_arch_option }, +#define CANON_ARCH_MULTILIB_SPEC_FUNCTION \ + { "canon_arch_multilib", arm_canon_arch_multilib_option }, + const char *arm_be8_option (int argc, const char **argv); #define BE8_SPEC_FUNCTION \ { "be8_linkopt", arm_be8_option }, @@ -2456,6 +2460,7 @@ const char *arm_be8_option (int argc, const char **argv); MCPU_MTUNE_NATIVE_FUNCTIONS \ ASM_CPU_SPEC_FUNCTIONS \ CANON_ARCH_SPEC_FUNCTION \ + CANON_ARCH_MULTILIB_SPEC_FUNCTION \ TARGET_MODE_SPEC_FUNCTIONS \ BE8_SPEC_FUNCTION @@ -2476,12 +2481,22 @@ const char *arm_be8_option (int argc, const char **argv); " %{mfloat-abi=*: abi %*}" \ " % Date: Fri, 18 Jun 2021 18:02:16 -0400 Subject: [committed] More useless code elimination on the H8 gcc/ * config/h8300/h8300.c (h8300_select_cc_mode): Handle SYMBOL_REF. * config/h8300/logical.md (3 logcial expander): Generate more efficient code when the source can be trivially simplified. --- gcc/config/h8300/h8300.c | 2 +- gcc/config/h8300/logical.md | 22 +++++++++++++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/h8300/h8300.c b/gcc/config/h8300/h8300.c index 1077a2b..2b88325 100644 --- a/gcc/config/h8300/h8300.c +++ b/gcc/config/h8300/h8300.c @@ -1950,7 +1950,7 @@ h8300_select_cc_mode (enum rtx_code cond, rtx op0, rtx op1) || GET_CODE (op0) == NEG || GET_CODE (op0) == AND || GET_CODE (op0) == IOR || GET_CODE (op0) == XOR || GET_CODE (op0) == NOT || GET_CODE (op0) == ASHIFT - || GET_CODE (op0) == MULT + || GET_CODE (op0) == MULT || GET_CODE (op0) == SYMBOL_REF || GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND || REG_P (op0) || MEM_P (op0))) return CCZNmode; diff --git a/gcc/config/h8300/logical.md b/gcc/config/h8300/logical.md index cb4c638..07d36cf 100644 --- a/gcc/config/h8300/logical.md +++ b/gcc/config/h8300/logical.md @@ -4,7 +4,27 @@ (logicals:QHSI (match_operand:QHSI 1 "register_operand" "") (match_operand:QHSI 2 "h8300_src_operand" "")))] "" - "") + " + { + enum machine_mode mode = GET_MODE (operands[0]); + /* DImodes are not considered tieable, as a result operations involving + subregs of DImode objects are considered expensive which can prevent + CSE from doing obvious simplifications. + + We may ultimately change what is tieable, but this is an immediate + workaround while we evaluate changes to tieable modes. + + The key in terms of what we want to handle is then the result of + the operation is not a constant. */ + if (( == AND && operands[2] == CONSTM1_RTX (mode)) + || ( == IOR && operands[2] == CONST0_RTX (mode)) + || ( == XOR && operands[2] == CONST0_RTX (mode)) + || (( == AND || == IOR) && operands[1] == operands[2])) + { + emit_move_insn (operands[0], operands[1]); + DONE; + } + }") ;; There's a ton of cleanup to do from here below. 
;; ---------------------------------------------------------------------- -- cgit v1.1 From 9cedbaab8e048b90ceb9ceef0d851385fae67cde Mon Sep 17 00:00:00 2001 From: Roger Sayle Date: Mon, 21 Jun 2021 08:54:50 +0100 Subject: PR target/11877: Use xor to write zero to memory with -Os The following patch attempts to resolve PR target/11877 (without triggering PR/23102). On x86_64, writing an SImode or DImode zero to memory uses an instruction encoding that is larger than first clearing a register (using xor) then writing that to memory. Hence, after reload, the peephole2 pass can determine if there's a suitable free register, and if so, use that to shrink the code size with -Os. To improve code size, and avoid inserting a large number of xor instructions (PR target/23102), this patch makes use of peephole2's efficient pattern matching to use a single temporary for a run of consecutive writes. In theory, one could do better still with a new target-specific pass, gated on -Os, to shrink these instructions (like stv), but that's probably overkill for the little remaining space savings. Evaluating this patch on the CSiBE benchmark (v2.1.1) results in a 0.26% code size improvement (3715273 bytes down to 3705477) on x86_64 with -Os [saving 1 byte every 400]. 549 of 894 tests improve, two tests grow larger. Analysis of these 2 pathological cases reveals that although peephole2's match_scratch prefers to use a call-clobbered register (to avoid requiring a new stack frame), very rarely this interacts with GCC's shrink wrapping optimization, which may previously have avoided saving/restoring a call clobbered register, such as %eax, in the calling function. 2021-06-21 Roger Sayle gcc/ChangeLog PR target/11877 * config/i386/i386.md: New define_peephole2s to shrink writing 1, 2 or 4 consecutive zeros to memory when optimizing for size. gcc/testsuite/ChangeLog PR target/11877 * gcc.target/i386/pr11877.c: New test case. --- gcc/config/i386/i386.md | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 48532eb..2333261 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -19357,6 +19357,42 @@ ix86_expand_clear (operands[1]); }) +;; When optimizing for size, zeroing memory should use a register. 
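+;; The three peephole2 patterns below match runs of four, two or one
+;; consecutive zero stores and reuse a single cleared scratch register
+;; for the whole run, so code size shrinks without emitting one xor per
+;; store (compare PR target/23102).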
+(define_peephole2 + [(match_scratch:SWI48 0 "r") + (set (match_operand:SWI48 1 "memory_operand" "") (const_int 0)) + (set (match_operand:SWI48 2 "memory_operand" "") (const_int 0)) + (set (match_operand:SWI48 3 "memory_operand" "") (const_int 0)) + (set (match_operand:SWI48 4 "memory_operand" "") (const_int 0))] + "optimize_insn_for_size_p () && peep2_regno_dead_p (0, FLAGS_REG)" + [(set (match_dup 1) (match_dup 0)) + (set (match_dup 2) (match_dup 0)) + (set (match_dup 3) (match_dup 0)) + (set (match_dup 4) (match_dup 0))] +{ + ix86_expand_clear (operands[0]); +}) + +(define_peephole2 + [(match_scratch:SWI48 0 "r") + (set (match_operand:SWI48 1 "memory_operand" "") (const_int 0)) + (set (match_operand:SWI48 2 "memory_operand" "") (const_int 0))] + "optimize_insn_for_size_p () && peep2_regno_dead_p (0, FLAGS_REG)" + [(set (match_dup 1) (match_dup 0)) + (set (match_dup 2) (match_dup 0))] +{ + ix86_expand_clear (operands[0]); +}) + +(define_peephole2 + [(match_scratch:SWI48 0 "r") + (set (match_operand:SWI48 1 "memory_operand" "") (const_int 0))] + "optimize_insn_for_size_p () && peep2_regno_dead_p (0, FLAGS_REG)" + [(set (match_dup 1) (match_dup 0))] +{ + ix86_expand_clear (operands[0]); +}) + ;; Reload dislikes loading constants directly into class_likely_spilled ;; hard registers. Try to tidy things up here. (define_peephole2 -- cgit v1.1 From 08c85f609a73fb36fdcbd9f327a5a645c20ac816 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Tue, 15 Jun 2021 16:25:16 +0800 Subject: Disparage slightly the mask register alternative for bitwise operations. The avx512 supports bitwise operations with mask registers, but the throughput of those instructions is much lower than that of the corresponding gpr version, so we additionally disparage slightly the mask register alternative for bitwise operations in the LRA. Also, when the allocno cost of GENERAL_REGS is the same as that of MASK_REGS, allocate MASK_REGS first since it has already been disparaged. gcc/ChangeLog: PR target/101142 * config/i386/i386.md: (*anddi_1): Disparage slightly the mask register alternative. (*and_1): Ditto. (*andqi_1): Ditto. (*andn_1): Ditto. (*_1): Ditto. (*qi_1): Ditto. (*one_cmpl2_1): Ditto. (*one_cmplsi2_1_zext): Ditto. (*one_cmplqi2_1): Ditto. * config/i386/i386.c (x86_order_regs_for_local_alloc): Change the order of mask registers to be before general registers. gcc/testsuite/ChangeLog: PR target/101142 * gcc.target/i386/spill_to_mask-1.c: Adjust testcase. * gcc.target/i386/spill_to_mask-2.c: Adjust testcase. * gcc.target/i386/spill_to_mask-3.c: Adjust testcase. * gcc.target/i386/spill_to_mask-4.c: Adjust testcase. --- gcc/config/i386/i386.c | 13 +++++++++---- gcc/config/i386/i386.md | 20 ++++++++++---------- 2 files changed, 19 insertions(+), 14 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 7d0d414..c3740ff 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -20475,6 +20475,15 @@ x86_order_regs_for_local_alloc (void) int pos = 0; int i; + /* When allocno cost of GENERAL_REGS is same as MASK_REGS, allocate + MASK_REGS first since it has already been disparaged. This is for + testcase bitwise_mask_op3.c where the input is allocated as mask + registers, then mask bitwise instructions should be used there. + Refer to pr101142. */ + /* Mask register. */ + for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++) + reg_alloc_order [pos++] = i; + /* First allocate the local general purpose registers.
*/ for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) if (GENERAL_REGNO_P (i) && call_used_or_fixed_reg_p (i)) @@ -20501,10 +20510,6 @@ x86_order_regs_for_local_alloc (void) for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++) reg_alloc_order [pos++] = i; - /* Mask register. */ - for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++) - reg_alloc_order [pos++] = i; - /* x87 registers. */ if (TARGET_SSE_MATH) for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++) diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 2333261..9116828 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -9138,7 +9138,7 @@ }) (define_insn "*anddi_1" - [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r,r,k") + [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r,r,?k") (and:DI (match_operand:DI 1 "nonimmediate_operand" "%0,0,0,qm,k") (match_operand:DI 2 "x86_64_szext_general_operand" "Z,re,m,L,k"))) @@ -9226,7 +9226,7 @@ (set_attr "mode" "SI")]) (define_insn "*and_1" - [(set (match_operand:SWI24 0 "nonimmediate_operand" "=rm,r,Ya,k") + [(set (match_operand:SWI24 0 "nonimmediate_operand" "=rm,r,Ya,?k") (and:SWI24 (match_operand:SWI24 1 "nonimmediate_operand" "%0,0,qm,k") (match_operand:SWI24 2 "" "r,m,L,k"))) (clobber (reg:CC FLAGS_REG))] @@ -9255,7 +9255,7 @@ (set_attr "mode" ",,SI,")]) (define_insn "*andqi_1" - [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,k") + [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,?k") (and:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0,k") (match_operand:QI 2 "general_operand" "qn,m,rn,k"))) (clobber (reg:CC FLAGS_REG))] @@ -9651,7 +9651,7 @@ }) (define_insn "*andn_1" - [(set (match_operand:SWI48 0 "register_operand" "=r,r,k") + [(set (match_operand:SWI48 0 "register_operand" "=r,r,?k") (and:SWI48 (not:SWI48 (match_operand:SWI48 1 "register_operand" "r,r,k")) (match_operand:SWI48 2 "nonimmediate_operand" "r,m,k"))) @@ -9667,7 +9667,7 @@ (set_attr "mode" "")]) (define_insn "*andn_1" - [(set (match_operand:SWI12 0 "register_operand" "=r,k") + [(set (match_operand:SWI12 0 "register_operand" "=r,?k") (and:SWI12 (not:SWI12 (match_operand:SWI12 1 "register_operand" "r,k")) (match_operand:SWI12 2 "register_operand" "r,k"))) @@ -9757,7 +9757,7 @@ }) (define_insn "*_1" - [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,r,k") + [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,r,?k") (any_or:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "%0,0,k") (match_operand:SWI248 2 "" "r,m,k"))) @@ -9847,7 +9847,7 @@ (set_attr "mode" "SI")]) (define_insn "*qi_1" - [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,k") + [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,?k") (any_or:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0,k") (match_operand:QI 2 "general_operand" "qn,m,rn,k"))) (clobber (reg:CC FLAGS_REG))] @@ -10603,7 +10603,7 @@ "split_double_mode (DImode, &operands[0], 2, &operands[0], &operands[2]);") (define_insn "*one_cmpl2_1" - [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,k") + [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,?k") (not:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "0,k")))] "ix86_unary_operator_ok (NOT, mode, operands)" "@ @@ -10620,7 +10620,7 @@ (set_attr "mode" "")]) (define_insn "*one_cmplsi2_1_zext" - [(set (match_operand:DI 0 "register_operand" "=r,k") + [(set (match_operand:DI 0 "register_operand" "=r,?k") (zero_extend:DI (not:SI (match_operand:SI 1 "register_operand" "0,k"))))] "TARGET_64BIT && ix86_unary_operator_ok (NOT, 
SImode, operands)" @@ -10632,7 +10632,7 @@ (set_attr "mode" "SI,SI")]) (define_insn "*one_cmplqi2_1" - [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r,k") + [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r,?k") (not:QI (match_operand:QI 1 "nonimmediate_operand" "0,0,k")))] "ix86_unary_operator_ok (NOT, QImode, operands)" "@ -- cgit v1.1 From 29a539a675b8ffd8e20fd3926d6ba0482ea0f275 Mon Sep 17 00:00:00 2001 From: "prathamesh.kulkarni" Date: Mon, 21 Jun 2021 14:38:32 +0530 Subject: arm/97906: Adjust neon_vca patterns to use GLTE instead of GTGE iterator. gcc/ChangeLog: PR target/97906 * config/arm/iterators.md (NEON_VACMP): Remove. * config/arm/neon.md (neon_vca): Use GLTE instead of GTGE iterator. (neon_vca_insn): Likewise. (neon_vca_insn_unspec): Use NEON_VAGLTE instead of NEON_VACMP. gcc/testsuite/ChangeLog: PR target/97906 * gcc.target/arm/simd/pr97906.c: New test. --- gcc/config/arm/iterators.md | 2 -- gcc/config/arm/neon.md | 6 +++--- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index 5c4fe89..fafbd2f 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -364,8 +364,6 @@ (define_int_iterator NEON_VCMP [UNSPEC_VCEQ UNSPEC_VCGT UNSPEC_VCGE UNSPEC_VCLT UNSPEC_VCLE]) -(define_int_iterator NEON_VACMP [UNSPEC_VCAGE UNSPEC_VCAGT]) - (define_int_iterator NEON_VAGLTE [UNSPEC_VCAGE UNSPEC_VCAGT UNSPEC_VCALE UNSPEC_VCALT]) diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 392d960..81cc8d3 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -2387,7 +2387,7 @@ (define_expand "neon_vca" [(set (match_operand: 0 "s_register_operand") (neg: - (GTGE: + (GLTE: (abs:VCVTF (match_operand:VCVTF 1 "s_register_operand")) (abs:VCVTF (match_operand:VCVTF 2 "s_register_operand")))))] "TARGET_NEON" @@ -2406,7 +2406,7 @@ (define_insn "neon_vca_insn" [(set (match_operand: 0 "s_register_operand" "=w") (neg: - (GTGE: + (GLTE: (abs:VCVTF (match_operand:VCVTF 1 "s_register_operand" "w")) (abs:VCVTF (match_operand:VCVTF 2 "s_register_operand" "w")))))] "TARGET_NEON && flag_unsafe_math_optimizations" @@ -2418,7 +2418,7 @@ [(set (match_operand: 0 "s_register_operand" "=w") (unspec: [(match_operand:VCVTF 1 "s_register_operand" "w") (match_operand:VCVTF 2 "s_register_operand" "w")] - NEON_VACMP))] + NEON_VAGLTE))] "TARGET_NEON" "vac.\t%0, %1, %2" [(set_attr "type" "neon_fp_compare_s")] -- cgit v1.1 From 316dd79876873222552bdf6aa31338012bc9b955 Mon Sep 17 00:00:00 2001 From: "prathamesh.kulkarni" Date: Mon, 21 Jun 2021 14:52:54 +0530 Subject: arm/66791: Replace builtins in vceq_* (a, b) with a == b. gcc/ChangeLog: * config/arm/arm_neon.h (vceq_s8): Replace builtin with __a == __b. (vceq_s16): Likewise. (vceq_s32): Likewise. (vceq_u8): Likewise. (vceq_u16): Likewise. (vceq_u32): Likewise. (vceq_p8): Likewise. (vceqq_s8): Likewise. (vceqq_s16): Likewise. (vceqq_s32): Likewise. (vceqq_u8): Likewise. (vceqq_u16): Likewise. (vceqq_u32): Likewise. (vceqq_p8): Likewise. (vceq_f32): Gate __a == __b on __FAST_MATH__. (vceqq_f32): Likewise. (vceq_f16): Likewise. (vceqq_f16): Likewise. 
--- gcc/config/arm/arm_neon.h | 44 ++++++++++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 14 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index dcd533f..7a800062 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -2359,112 +2359,120 @@ __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vceq_s8 (int8x8_t __a, int8x8_t __b) { - return (uint8x8_t)__builtin_neon_vceqv8qi (__a, __b); + return (uint8x8_t) (__a == __b); } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vceq_s16 (int16x4_t __a, int16x4_t __b) { - return (uint16x4_t)__builtin_neon_vceqv4hi (__a, __b); + return (uint16x4_t) (__a == __b); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vceq_s32 (int32x2_t __a, int32x2_t __b) { - return (uint32x2_t)__builtin_neon_vceqv2si (__a, __b); + return (uint32x2_t) (__a == __b); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vceq_f32 (float32x2_t __a, float32x2_t __b) { +#ifdef __FAST_MATH__ + return (uint32x2_t) (__a == __b); +#else return (uint32x2_t)__builtin_neon_vceqv2sf (__a, __b); +#endif } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vceq_u8 (uint8x8_t __a, uint8x8_t __b) { - return (uint8x8_t)__builtin_neon_vceqv8qi ((int8x8_t) __a, (int8x8_t) __b); + return (uint8x8_t) (__a == __b); } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vceq_u16 (uint16x4_t __a, uint16x4_t __b) { - return (uint16x4_t)__builtin_neon_vceqv4hi ((int16x4_t) __a, (int16x4_t) __b); + return (uint16x4_t) (__a == __b); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vceq_u32 (uint32x2_t __a, uint32x2_t __b) { - return (uint32x2_t)__builtin_neon_vceqv2si ((int32x2_t) __a, (int32x2_t) __b); + return (uint32x2_t) (__a == __b); } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vceq_p8 (poly8x8_t __a, poly8x8_t __b) { - return (uint8x8_t)__builtin_neon_vceqv8qi ((int8x8_t) __a, (int8x8_t) __b); + return (uint8x8_t) (__a == __b); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vceqq_s8 (int8x16_t __a, int8x16_t __b) { - return (uint8x16_t)__builtin_neon_vceqv16qi (__a, __b); + return (uint8x16_t) (__a == __b); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vceqq_s16 (int16x8_t __a, int16x8_t __b) { - return (uint16x8_t)__builtin_neon_vceqv8hi (__a, __b); + return (uint16x8_t) (__a == __b); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vceqq_s32 (int32x4_t __a, int32x4_t __b) { - return (uint32x4_t)__builtin_neon_vceqv4si (__a, __b); + return (uint32x4_t) (__a == __b); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vceqq_f32 (float32x4_t __a, float32x4_t __b) { +#ifdef __FAST_MATH__ + return (uint32x4_t) (__a == __b); +#else return (uint32x4_t)__builtin_neon_vceqv4sf (__a, __b); +#endif } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) vceqq_u8 (uint8x16_t __a, uint8x16_t __b) { - return (uint8x16_t)__builtin_neon_vceqv16qi ((int8x16_t) __a, (int8x16_t) __b); + return (uint8x16_t) (__a == __b); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vceqq_u16 (uint16x8_t __a, uint16x8_t __b) { - return (uint16x8_t)__builtin_neon_vceqv8hi ((int16x8_t) __a, (int16x8_t) __b); + return (uint16x8_t) (__a == __b); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vceqq_u32 (uint32x4_t __a, uint32x4_t __b) { - return (uint32x4_t)__builtin_neon_vceqv4si ((int32x4_t) __a, (int32x4_t) __b); + return (uint32x4_t) (__a == __b); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vceqq_p8 (poly8x16_t __a, poly8x16_t __b) { - return (uint8x16_t)__builtin_neon_vceqv16qi ((int8x16_t) __a, (int8x16_t) __b); + return (uint8x16_t) (__a == __b); } __extension__ extern __inline uint8x8_t @@ -17195,14 +17203,22 @@ __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vceq_f16 (float16x4_t __a, float16x4_t __b) { +#ifdef __FAST_MATH__ + return (uint16x4_t) (__a == __b); +#else return (uint16x4_t)__builtin_neon_vceqv4hf (__a, __b); +#endif } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vceqq_f16 (float16x8_t __a, float16x8_t __b) { +#ifdef __FAST_MATH__ + return (uint16x8_t) (__a == __b); +#else return (uint16x8_t)__builtin_neon_vceqv8hf (__a, __b); +#endif } __extension__ extern __inline uint16x4_t -- cgit v1.1 From b6efffa552cee6a20a58c91e5f41466c5715d73d Mon Sep 17 00:00:00 2001 From: liuhongt Date: Wed, 28 Apr 2021 14:52:59 +0800 Subject: Fix ICE for vpexpand*. gcc/ChangeLog PR target/100310 * config/i386/i386-expand.c (ix86_expand_special_args_builtin): Keep constm1_operand only if it satisfies insn's operand predicate. gcc/testsuite/ChangeLog PR target/100310 * gcc.target/i386/pr100310.c: New test. --- gcc/config/i386/i386-expand.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 8f4e4e4..cc2eaee 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -10969,11 +10969,12 @@ ix86_expand_special_args_builtin (const struct builtin_description *d, op = fixup_modeless_constant (op, mode); - /* NB: 3-operands load implied it's a mask load, + /* NB: 3-operands load implied it's a mask load or v{p}expand*, and that mask operand shoud be at the end. Keep all-ones mask which would be simplified by the expander. */ if (nargs == 3 && i == 2 && klass == load - && constm1_operand (op, mode)) + && constm1_operand (op, mode) + && insn_p->operand[i].predicate (op, mode)) ; else if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode) op = copy_to_mode_reg (mode, op); -- cgit v1.1 From f51618f301664d02cc41205f1386c0c9b9a29a54 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Thu, 29 Apr 2021 18:27:09 +0800 Subject: Optimize vpexpand* to mask mov when mask have all ones in it's lower part (including 0 and -1). gcc/ChangeLog: PR target/100267 * config/i386/i386-builtin.def (BDESC): Adjust builtin name. * config/i386/sse.md (_expand_mask): Rename to .. (expand_mask): this .. 
(*expand_mask): New pre_reload splitter to transform v{,p}expand* to vmov* when mask is zero, all ones, or has all ones in it's lower part, otherwise still generate v{,p}expand*. gcc/testsuite/ChangeLog: PR target/100267 * gcc.target/i386/avx512bw-pr100267-1.c: New test. * gcc.target/i386/avx512bw-pr100267-b-2.c: New test. * gcc.target/i386/avx512bw-pr100267-d-2.c: New test. * gcc.target/i386/avx512bw-pr100267-q-2.c: New test. * gcc.target/i386/avx512bw-pr100267-w-2.c: New test. * gcc.target/i386/avx512f-pr100267-1.c: New test. * gcc.target/i386/avx512f-pr100267-pd-2.c: New test. * gcc.target/i386/avx512f-pr100267-ps-2.c: New test. * gcc.target/i386/avx512vl-pr100267-1.c: New test. * gcc.target/i386/avx512vl-pr100267-pd-2.c: New test. * gcc.target/i386/avx512vl-pr100267-ps-2.c: New test. * gcc.target/i386/avx512vlbw-pr100267-1.c: New test. * gcc.target/i386/avx512vlbw-pr100267-b-2.c: New test. * gcc.target/i386/avx512vlbw-pr100267-d-2.c: New test. * gcc.target/i386/avx512vlbw-pr100267-q-2.c: New test. * gcc.target/i386/avx512vlbw-pr100267-w-2.c: New test. --- gcc/config/i386/i386-builtin.def | 48 ++++++++++++++-------------- gcc/config/i386/sse.md | 69 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 92 insertions(+), 25 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def index 80c2a2c..31df3a6 100644 --- a/gcc/config/i386/i386-builtin.def +++ b/gcc/config/i386/i386-builtin.def @@ -204,13 +204,13 @@ BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_compressstorev16sf_mask, "__ BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_compressstorev16si_mask, "__builtin_ia32_compressstoresi512_mask", IX86_BUILTIN_PCOMPRESSDSTORE512, UNKNOWN, (int) VOID_FTYPE_PV16SI_V16SI_UHI) BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_compressstorev8df_mask, "__builtin_ia32_compressstoredf512_mask", IX86_BUILTIN_COMPRESSPDSTORE512, UNKNOWN, (int) VOID_FTYPE_PV8DF_V8DF_UQI) BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_compressstorev8di_mask, "__builtin_ia32_compressstoredi512_mask", IX86_BUILTIN_PCOMPRESSQSTORE512, UNKNOWN, (int) VOID_FTYPE_PV8DI_V8DI_UQI) -BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_expandv16sf_mask, "__builtin_ia32_expandloadsf512_mask", IX86_BUILTIN_EXPANDPSLOAD512, UNKNOWN, (int) V16SF_FTYPE_PCV16SF_V16SF_UHI) +BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_expandv16sf_mask, "__builtin_ia32_expandloadsf512_mask", IX86_BUILTIN_EXPANDPSLOAD512, UNKNOWN, (int) V16SF_FTYPE_PCV16SF_V16SF_UHI) BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_expandv16sf_maskz, "__builtin_ia32_expandloadsf512_maskz", IX86_BUILTIN_EXPANDPSLOAD512Z, UNKNOWN, (int) V16SF_FTYPE_PCV16SF_V16SF_UHI) -BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_expandv16si_mask, "__builtin_ia32_expandloadsi512_mask", IX86_BUILTIN_PEXPANDDLOAD512, UNKNOWN, (int) V16SI_FTYPE_PCV16SI_V16SI_UHI) +BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_expandv16si_mask, "__builtin_ia32_expandloadsi512_mask", IX86_BUILTIN_PEXPANDDLOAD512, UNKNOWN, (int) V16SI_FTYPE_PCV16SI_V16SI_UHI) BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_expandv16si_maskz, "__builtin_ia32_expandloadsi512_maskz", IX86_BUILTIN_PEXPANDDLOAD512Z, UNKNOWN, (int) V16SI_FTYPE_PCV16SI_V16SI_UHI) -BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_expandv8df_mask, "__builtin_ia32_expandloaddf512_mask", IX86_BUILTIN_EXPANDPDLOAD512, UNKNOWN, (int) V8DF_FTYPE_PCV8DF_V8DF_UQI) +BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_expandv8df_mask, "__builtin_ia32_expandloaddf512_mask", 
IX86_BUILTIN_EXPANDPDLOAD512, UNKNOWN, (int) V8DF_FTYPE_PCV8DF_V8DF_UQI) BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_expandv8df_maskz, "__builtin_ia32_expandloaddf512_maskz", IX86_BUILTIN_EXPANDPDLOAD512Z, UNKNOWN, (int) V8DF_FTYPE_PCV8DF_V8DF_UQI) -BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_expandv8di_mask, "__builtin_ia32_expandloaddi512_mask", IX86_BUILTIN_PEXPANDQLOAD512, UNKNOWN, (int) V8DI_FTYPE_PCV8DI_V8DI_UQI) +BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_expandv8di_mask, "__builtin_ia32_expandloaddi512_mask", IX86_BUILTIN_PEXPANDQLOAD512, UNKNOWN, (int) V8DI_FTYPE_PCV8DI_V8DI_UQI) BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_expandv8di_maskz, "__builtin_ia32_expandloaddi512_maskz", IX86_BUILTIN_PEXPANDQLOAD512Z, UNKNOWN, (int) V8DI_FTYPE_PCV8DI_V8DI_UQI) BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_loadv16si_mask, "__builtin_ia32_loaddqusi512_mask", IX86_BUILTIN_LOADDQUSI512, UNKNOWN, (int) V16SI_FTYPE_PCINT_V16SI_UHI) BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_loadv8di_mask, "__builtin_ia32_loaddqudi512_mask", IX86_BUILTIN_LOADDQUDI512, UNKNOWN, (int) V8DI_FTYPE_PCINT64_V8DI_UQI) @@ -337,14 +337,14 @@ BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_compressstorev4di_mask, "_ BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_compressstorev2di_mask, "__builtin_ia32_compressstoredi128_mask", IX86_BUILTIN_PCOMPRESSQSTORE128, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_UQI) BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_compressstorev8si_mask, "__builtin_ia32_compressstoresi256_mask", IX86_BUILTIN_PCOMPRESSDSTORE256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_UQI) BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_compressstorev4si_mask, "__builtin_ia32_compressstoresi128_mask", IX86_BUILTIN_PCOMPRESSDSTORE128, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_UQI) -BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_expandv4df_mask, "__builtin_ia32_expandloaddf256_mask", IX86_BUILTIN_EXPANDPDLOAD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DF_UQI) -BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_expandv2df_mask, "__builtin_ia32_expandloaddf128_mask", IX86_BUILTIN_EXPANDPDLOAD128, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DF_UQI) -BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_expandv8sf_mask, "__builtin_ia32_expandloadsf256_mask", IX86_BUILTIN_EXPANDPSLOAD256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SF_UQI) -BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_expandv4sf_mask, "__builtin_ia32_expandloadsf128_mask", IX86_BUILTIN_EXPANDPSLOAD128, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SF_UQI) -BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_expandv4di_mask, "__builtin_ia32_expandloaddi256_mask", IX86_BUILTIN_PEXPANDQLOAD256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI_UQI) -BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_expandv2di_mask, "__builtin_ia32_expandloaddi128_mask", IX86_BUILTIN_PEXPANDQLOAD128, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI_UQI) -BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_expandv8si_mask, "__builtin_ia32_expandloadsi256_mask", IX86_BUILTIN_PEXPANDDLOAD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI_UQI) -BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_expandv4si_mask, "__builtin_ia32_expandloadsi128_mask", IX86_BUILTIN_PEXPANDDLOAD128, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_expandv4df_mask, "__builtin_ia32_expandloaddf256_mask", IX86_BUILTIN_EXPANDPDLOAD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_expandv2df_mask, 
"__builtin_ia32_expandloaddf128_mask", IX86_BUILTIN_EXPANDPDLOAD128, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_expandv8sf_mask, "__builtin_ia32_expandloadsf256_mask", IX86_BUILTIN_EXPANDPSLOAD256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_expandv4sf_mask, "__builtin_ia32_expandloadsf128_mask", IX86_BUILTIN_EXPANDPSLOAD128, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_expandv4di_mask, "__builtin_ia32_expandloaddi256_mask", IX86_BUILTIN_PEXPANDQLOAD256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_expandv2di_mask, "__builtin_ia32_expandloaddi128_mask", IX86_BUILTIN_PEXPANDQLOAD128, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_expandv8si_mask, "__builtin_ia32_expandloadsi256_mask", IX86_BUILTIN_PEXPANDDLOAD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_expandv4si_mask, "__builtin_ia32_expandloadsi128_mask", IX86_BUILTIN_PEXPANDDLOAD128, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI_UQI) BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_expandv4df_maskz, "__builtin_ia32_expandloaddf256_maskz", IX86_BUILTIN_EXPANDPDLOAD256Z, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DF_UQI) BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_expandv2df_maskz, "__builtin_ia32_expandloaddf128_maskz", IX86_BUILTIN_EXPANDPDLOAD128Z, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DF_UQI) BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_expandv8sf_maskz, "__builtin_ia32_expandloadsf256_maskz", IX86_BUILTIN_EXPANDPSLOAD256Z, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SF_UQI) @@ -1342,9 +1342,9 @@ BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_floatv8siv8df2_mask, "__builtin_ia32 BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_vcvtps2ph512_mask, "__builtin_ia32_vcvtps2ph512_mask", IX86_BUILTIN_CVTPS2PH512, UNKNOWN, (int) V16HI_FTYPE_V16SF_INT_V16HI_UHI) BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_ufloatv8siv8df2_mask, "__builtin_ia32_cvtudq2pd512_mask", IX86_BUILTIN_CVTUDQ2PD512, UNKNOWN, (int) V8DF_FTYPE_V8SI_V8DF_UQI) BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_cvtusi2sd32, "__builtin_ia32_cvtusi2sd32", IX86_BUILTIN_CVTUSI2SD32, UNKNOWN, (int) V2DF_FTYPE_V2DF_UINT) -BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_expandv8df_mask, "__builtin_ia32_expanddf512_mask", IX86_BUILTIN_EXPANDPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_UQI) +BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_expandv8df_mask, "__builtin_ia32_expanddf512_mask", IX86_BUILTIN_EXPANDPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_UQI) BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_expandv8df_maskz, "__builtin_ia32_expanddf512_maskz", IX86_BUILTIN_EXPANDPD512Z, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_UQI) -BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_expandv16sf_mask, "__builtin_ia32_expandsf512_mask", IX86_BUILTIN_EXPANDPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_UHI) +BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_expandv16sf_mask, "__builtin_ia32_expandsf512_mask", IX86_BUILTIN_EXPANDPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_UHI) BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_expandv16sf_maskz, "__builtin_ia32_expandsf512_maskz", IX86_BUILTIN_EXPANDPS512Z, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_UHI) BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_vextractf32x4_mask, "__builtin_ia32_extractf32x4_mask", IX86_BUILTIN_EXTRACTF32X4, UNKNOWN, (int) V4SF_FTYPE_V16SF_INT_V4SF_UQI) BDESC 
(OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_vextractf64x4_mask, "__builtin_ia32_extractf64x4_mask", IX86_BUILTIN_EXTRACTF64X4, UNKNOWN, (int) V4DF_FTYPE_V8DF_INT_V4DF_UQI) @@ -1381,9 +1381,9 @@ BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_gtv16si3_mask, "__builtin_ia BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_gtv8di3_mask, "__builtin_ia32_pcmpgtq512_mask", IX86_BUILTIN_PCMPGTQ512_MASK, UNKNOWN, (int) UQI_FTYPE_V8DI_V8DI_UQI) BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_compressv16si_mask, "__builtin_ia32_compresssi512_mask", IX86_BUILTIN_PCOMPRESSD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_UHI) BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_compressv8di_mask, "__builtin_ia32_compressdi512_mask", IX86_BUILTIN_PCOMPRESSQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_UQI) -BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_expandv16si_mask, "__builtin_ia32_expandsi512_mask", IX86_BUILTIN_PEXPANDD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_UHI) +BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_expandv16si_mask, "__builtin_ia32_expandsi512_mask", IX86_BUILTIN_PEXPANDD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_UHI) BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_expandv16si_maskz, "__builtin_ia32_expandsi512_maskz", IX86_BUILTIN_PEXPANDD512Z, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_UHI) -BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_expandv8di_mask, "__builtin_ia32_expanddi512_mask", IX86_BUILTIN_PEXPANDQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_UQI) +BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_expandv8di_mask, "__builtin_ia32_expanddi512_mask", IX86_BUILTIN_PEXPANDQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_UQI) BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_expandv8di_maskz, "__builtin_ia32_expanddi512_maskz", IX86_BUILTIN_PEXPANDQ512Z, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_UQI) BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_smaxv16si3_mask, "__builtin_ia32_pmaxsd512_mask", IX86_BUILTIN_PMAXSD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_UHI) BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_smaxv8di3_mask, "__builtin_ia32_pmaxsq512_mask", IX86_BUILTIN_PMAXSQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_UQI) @@ -2187,14 +2187,14 @@ BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_compressv4di_mask, "__buil BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_compressv2di_mask, "__builtin_ia32_compressdi128_mask", IX86_BUILTIN_PCOMPRESSQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UQI) BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_compressv8si_mask, "__builtin_ia32_compresssi256_mask", IX86_BUILTIN_PCOMPRESSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_UQI) BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_compressv4si_mask, "__builtin_ia32_compresssi128_mask", IX86_BUILTIN_PCOMPRESSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_UQI) -BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_expandv4df_mask, "__builtin_ia32_expanddf256_mask", IX86_BUILTIN_EXPANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_UQI) -BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_expandv2df_mask, "__builtin_ia32_expanddf128_mask", IX86_BUILTIN_EXPANDPD128, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_UQI) -BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_expandv8sf_mask, "__builtin_ia32_expandsf256_mask", IX86_BUILTIN_EXPANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_UQI) -BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_expandv4sf_mask, "__builtin_ia32_expandsf128_mask", IX86_BUILTIN_EXPANDPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_UQI) -BDESC (OPTION_MASK_ISA_AVX512VL, 0, 
CODE_FOR_avx512vl_expandv4di_mask, "__builtin_ia32_expanddi256_mask", IX86_BUILTIN_PEXPANDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_UQI) -BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_expandv2di_mask, "__builtin_ia32_expanddi128_mask", IX86_BUILTIN_PEXPANDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UQI) -BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_expandv8si_mask, "__builtin_ia32_expandsi256_mask", IX86_BUILTIN_PEXPANDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_UQI) -BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_expandv4si_mask, "__builtin_ia32_expandsi128_mask", IX86_BUILTIN_PEXPANDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_expandv4df_mask, "__builtin_ia32_expanddf256_mask", IX86_BUILTIN_EXPANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_expandv2df_mask, "__builtin_ia32_expanddf128_mask", IX86_BUILTIN_EXPANDPD128, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_expandv8sf_mask, "__builtin_ia32_expandsf256_mask", IX86_BUILTIN_EXPANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_expandv4sf_mask, "__builtin_ia32_expandsf128_mask", IX86_BUILTIN_EXPANDPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_expandv4di_mask, "__builtin_ia32_expanddi256_mask", IX86_BUILTIN_PEXPANDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_expandv2di_mask, "__builtin_ia32_expanddi128_mask", IX86_BUILTIN_PEXPANDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_expandv8si_mask, "__builtin_ia32_expandsi256_mask", IX86_BUILTIN_PEXPANDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_expandv4si_mask, "__builtin_ia32_expandsi128_mask", IX86_BUILTIN_PEXPANDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_UQI) BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_expandv4df_maskz, "__builtin_ia32_expanddf256_maskz", IX86_BUILTIN_EXPANDPD256Z, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_UQI) BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_expandv2df_maskz, "__builtin_ia32_expanddf128_maskz", IX86_BUILTIN_EXPANDPD128Z, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_UQI) BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_expandv8sf_maskz, "__builtin_ia32_expandsf256_maskz", IX86_BUILTIN_EXPANDPS256Z, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_UQI) diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 94296bc..f5f9403 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -699,6 +699,17 @@ (V4DI "TARGET_AVX512VL") (V4DF "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL") (V2DF "TARGET_AVX512VL")]) +(define_mode_iterator VI12_VI48F_AVX512VLBW + [(V16SI "TARGET_AVX512F") (V16SF "TARGET_AVX512F") + (V8DI "TARGET_AVX512F") (V8DF "TARGET_AVX512F") + (V8SI "TARGET_AVX512VL") (V8SF "TARGET_AVX512VL") + (V4DI "TARGET_AVX512VL") (V4DF "TARGET_AVX512VL") + (V4SI "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL") + (V2DI "TARGET_AVX512VL") (V2DF "TARGET_AVX512VL") + (V64QI "TARGET_AVX512BW") (V16QI "TARGET_AVX512VL") + (V32QI "TARGET_AVX512VL && TARGET_AVX512BW") (V32HI "TARGET_AVX512BW") + (V16HI "TARGET_AVX512VL") (V8HI "TARGET_AVX512VL")]) + (define_mode_iterator VI48F_256 [V8SI V8SF V4DI V4DF]) (define_mode_iterator VF_AVX512 @@ -23009,7 +23020,7 @@ "TARGET_AVX512F" "operands[2] = CONST0_RTX (mode);") -(define_insn 
"_expand_mask" +(define_insn "expand_mask" [(set (match_operand:VI48F 0 "register_operand" "=v,v") (unspec:VI48F [(match_operand:VI48F 1 "nonimmediate_operand" "v,m") @@ -23037,6 +23048,62 @@ (set_attr "memory" "none,load") (set_attr "mode" "")]) +(define_insn_and_split "*expand_mask" + [(set (match_operand:VI12_VI48F_AVX512VLBW 0 "register_operand") + (unspec:VI12_VI48F_AVX512VLBW + [(match_operand:VI12_VI48F_AVX512VLBW 1 "nonimmediate_operand") + (match_operand:VI12_VI48F_AVX512VLBW 2 "nonimm_or_0_operand") + (match_operand 3 "const_int_operand")] + UNSPEC_EXPAND))] + "ix86_pre_reload_split () + && (TARGET_AVX512VBMI2 || GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4)" + "#" + "&& 1" + [(const_int 0)] +{ + unsigned HOST_WIDE_INT mask = INTVAL (operands[3]); + bool has_zero = false; + unsigned n = GET_MODE_NUNITS (mode), i; + unsigned ones = 0; + + /* If all ones bits is in mask's lower part, + get number of ones and assign it to ONES. */ + for (i = 0; i != n; i++) + { + if ((mask & HOST_WIDE_INT_1U << i) && has_zero) + break; + + /* Record first zero bit. */ + if (!(mask & HOST_WIDE_INT_1U << i) && !has_zero) + { + has_zero = true; + ones = i; + } + } + + if (!has_zero) + ones = n; + + if (i != n || (ones != 0 && ones != n)) + { + rtx reg = gen_reg_rtx (mode); + emit_move_insn (reg, operands[3]); + enum insn_code icode; + if (i == n) + /* For masks with all one bits in it's lower part, + we can transform v{,p}expand* to vmovdq* with + mask operand. */ + icode = CODE_FOR__load_mask; + else + icode = CODE_FOR_expand_mask; + emit_insn (GEN_FCN (icode) (operands[0], operands[1], operands[2], reg)); + } + else + /* For ALL_MASK_ONES or CONST0_RTX mask, transform it to simple mov. */ + emit_move_insn (operands[0], ones ? operands[1] : operands[2]); + DONE; +}) + (define_expand "expand_maskz" [(set (match_operand:VI12_AVX512VLBW 0 "register_operand") (unspec:VI12_AVX512VLBW -- cgit v1.1 From d58a66aa0faa64bfbd85e528be5104293dd41d0e Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Tue, 22 Jun 2021 10:16:18 +0200 Subject: i386: Use xor to write zero to memory with -Os even for more than 4 stores [PR11877] > > 2021-06-20 Roger Sayle > > > > gcc/ChangeLog > > PR target/11877 > > * config/i386/i386.md: New define_peephole2s to shrink writing > > 1, 2 or 4 consecutive zeros to memory when optimizing for size. It unfortunately doesn't extend well to larger memory clearing. Consider e.g. void foo (int *p) { p[0] = 0; p[7] = 0; p[23] = 0; p[41] = 0; p[48] = 0; p[59] = 0; p[69] = 0; p[78] = 0; p[83] = 0; p[89] = 0; p[98] = 0; p[121] = 0; p[132] = 0; p[143] = 0; p[154] = 0; } where with the patch we emit: xorl %eax, %eax xorl %edx, %edx xorl %ecx, %ecx xorl %esi, %esi xorl %r8d, %r8d movl %eax, (%rdi) movl %eax, 28(%rdi) movl %eax, 92(%rdi) movl %eax, 164(%rdi) movl %edx, 192(%rdi) movl %edx, 236(%rdi) movl %edx, 276(%rdi) movl %edx, 312(%rdi) movl %ecx, 332(%rdi) movl %ecx, 356(%rdi) movl %ecx, 392(%rdi) movl %ecx, 484(%rdi) movl %esi, 528(%rdi) movl %esi, 572(%rdi) movl %r8d, 616(%rdi) Here is an incremental patch that emits: xorl %eax, %eax movl %eax, (%rdi) movl %eax, 28(%rdi) movl %eax, 92(%rdi) movl %eax, 164(%rdi) movl %eax, 192(%rdi) movl %eax, 236(%rdi) movl %eax, 276(%rdi) movl %eax, 312(%rdi) movl %eax, 332(%rdi) movl %eax, 356(%rdi) movl %eax, 392(%rdi) movl %eax, 484(%rdi) movl %eax, 528(%rdi) movl %eax, 572(%rdi) movl %eax, 616(%rdi) instead. 2021-06-22 Jakub Jelinek PR target/11877 * config/i386/i386-protos.h (ix86_last_zero_store_uid): Declare. 
* config/i386/i386-expand.c (ix86_last_zero_store_uid): New variable. * config/i386/i386.c (ix86_expand_prologue): Clear it. * config/i386/i386.md (peephole2s for 1/2/4 stores of const0_rtx): Remove "" from match_operand. Emit new insns using emit_move_insn and set ix86_last_zero_store_uid to INSN_UID of the last store. Add peephole2s for 1/2/4 stores of const0_rtx following previous successful peep2s. * gcc.target/i386/pr11877-2.c: New test. --- gcc/config/i386/i386-expand.c | 3 ++ gcc/config/i386/i386-protos.h | 1 + gcc/config/i386/i386.c | 1 + gcc/config/i386/i386.md | 87 ++++++++++++++++++++++++++++++++++++------- 4 files changed, 78 insertions(+), 14 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index cc2eaee..2986b49 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -1316,6 +1316,9 @@ find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2) return false; } +/* INSN_UID of the last insn emitted by zero store peephole2s. */ +int ix86_last_zero_store_uid; + /* Split lea instructions into a sequence of instructions which are executed on ALU to avoid AGU stalls. It is assumed that it is allowed to clobber flags register diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index e6ac939..1d05206 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -111,6 +111,7 @@ extern bool ix86_use_lea_for_mov (rtx_insn *, rtx[]); extern bool ix86_avoid_lea_for_addr (rtx_insn *, rtx[]); extern void ix86_split_lea_for_addr (rtx_insn *, rtx[], machine_mode); extern bool ix86_lea_for_add_ok (rtx_insn *, rtx[]); +extern int ix86_last_zero_store_uid; extern bool ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high); extern bool ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn); extern bool ix86_agi_dependent (rtx_insn *set_insn, rtx_insn *use_insn); diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index c3740ff..3d5883b 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -8196,6 +8196,7 @@ ix86_expand_prologue (void) bool save_stub_call_needed; rtx static_chain = NULL_RTX; + ix86_last_zero_store_uid = 0; if (ix86_function_naked (current_function_decl)) { if (flag_stack_usage_info) diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 9116828..700c158 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -19360,37 +19360,96 @@ ;; When optimizing for size, zeroing memory should use a register. 
(define_peephole2 [(match_scratch:SWI48 0 "r") - (set (match_operand:SWI48 1 "memory_operand" "") (const_int 0)) - (set (match_operand:SWI48 2 "memory_operand" "") (const_int 0)) - (set (match_operand:SWI48 3 "memory_operand" "") (const_int 0)) - (set (match_operand:SWI48 4 "memory_operand" "") (const_int 0))] + (set (match_operand:SWI48 1 "memory_operand") (const_int 0)) + (set (match_operand:SWI48 2 "memory_operand") (const_int 0)) + (set (match_operand:SWI48 3 "memory_operand") (const_int 0)) + (set (match_operand:SWI48 4 "memory_operand") (const_int 0))] "optimize_insn_for_size_p () && peep2_regno_dead_p (0, FLAGS_REG)" - [(set (match_dup 1) (match_dup 0)) - (set (match_dup 2) (match_dup 0)) - (set (match_dup 3) (match_dup 0)) - (set (match_dup 4) (match_dup 0))] + [(const_int 0)] { ix86_expand_clear (operands[0]); + emit_move_insn (operands[1], operands[0]); + emit_move_insn (operands[2], operands[0]); + emit_move_insn (operands[3], operands[0]); + ix86_last_zero_store_uid + = INSN_UID (emit_move_insn (operands[4], operands[0])); + DONE; }) (define_peephole2 [(match_scratch:SWI48 0 "r") - (set (match_operand:SWI48 1 "memory_operand" "") (const_int 0)) - (set (match_operand:SWI48 2 "memory_operand" "") (const_int 0))] + (set (match_operand:SWI48 1 "memory_operand") (const_int 0)) + (set (match_operand:SWI48 2 "memory_operand") (const_int 0))] "optimize_insn_for_size_p () && peep2_regno_dead_p (0, FLAGS_REG)" - [(set (match_dup 1) (match_dup 0)) - (set (match_dup 2) (match_dup 0))] + [(const_int 0)] { ix86_expand_clear (operands[0]); + emit_move_insn (operands[1], operands[0]); + ix86_last_zero_store_uid + = INSN_UID (emit_move_insn (operands[2], operands[0])); + DONE; }) (define_peephole2 [(match_scratch:SWI48 0 "r") - (set (match_operand:SWI48 1 "memory_operand" "") (const_int 0))] + (set (match_operand:SWI48 1 "memory_operand") (const_int 0))] "optimize_insn_for_size_p () && peep2_regno_dead_p (0, FLAGS_REG)" - [(set (match_dup 1) (match_dup 0))] + [(const_int 0)] { ix86_expand_clear (operands[0]); + ix86_last_zero_store_uid + = INSN_UID (emit_move_insn (operands[1], operands[0])); + DONE; +}) + +(define_peephole2 + [(set (match_operand:SWI48 5 "memory_operand") + (match_operand:SWI48 0 "general_reg_operand")) + (set (match_operand:SWI48 1 "memory_operand") (const_int 0)) + (set (match_operand:SWI48 2 "memory_operand") (const_int 0)) + (set (match_operand:SWI48 3 "memory_operand") (const_int 0)) + (set (match_operand:SWI48 4 "memory_operand") (const_int 0))] + "optimize_insn_for_size_p () + && INSN_UID (peep2_next_insn (0)) == ix86_last_zero_store_uid" + [(const_int 0)] +{ + emit_move_insn (operands[5], operands[0]); + emit_move_insn (operands[1], operands[0]); + emit_move_insn (operands[2], operands[0]); + emit_move_insn (operands[3], operands[0]); + ix86_last_zero_store_uid + = INSN_UID (emit_move_insn (operands[4], operands[0])); + DONE; +}) + +(define_peephole2 + [(set (match_operand:SWI48 3 "memory_operand") + (match_operand:SWI48 0 "general_reg_operand")) + (set (match_operand:SWI48 1 "memory_operand") (const_int 0)) + (set (match_operand:SWI48 2 "memory_operand") (const_int 0))] + "optimize_insn_for_size_p () + && INSN_UID (peep2_next_insn (0)) == ix86_last_zero_store_uid" + [(const_int 0)] +{ + emit_move_insn (operands[3], operands[0]); + emit_move_insn (operands[1], operands[0]); + ix86_last_zero_store_uid + = INSN_UID (emit_move_insn (operands[2], operands[0])); + DONE; +}) + +(define_peephole2 + [(set (match_operand:SWI48 2 "memory_operand") + (match_operand:SWI48 0 
"general_reg_operand")) + (set (match_operand:SWI48 1 "memory_operand") (const_int 0))] + "optimize_insn_for_size_p () + && INSN_UID (peep2_next_insn (0)) == ix86_last_zero_store_uid" + [(const_int 0)] +{ + emit_move_insn (operands[2], operands[0]); + ix86_last_zero_store_uid + = INSN_UID (emit_move_insn (operands[1], operands[0])); + DONE; }) ;; Reload dislikes loading constants directly into class_likely_spilled -- cgit v1.1 From 7822285515cd4dab86f722a9f4969b6952904a37 Mon Sep 17 00:00:00 2001 From: Jojo R Date: Mon, 21 Jun 2021 20:42:43 +0800 Subject: RISC-V: Add tune info for T-HEAD C906. gcc/ * config/riscv/riscv.c (thead_c906_tune_info): New. (riscv_tune_info_table): Use new tune. --- gcc/config/riscv/riscv.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/riscv/riscv.c b/gcc/config/riscv/riscv.c index 1baa299..576960b 100644 --- a/gcc/config/riscv/riscv.c +++ b/gcc/config/riscv/riscv.c @@ -300,6 +300,19 @@ static const struct riscv_tune_param sifive_7_tune_info = { true, /* slow_unaligned_access */ }; +/* Costs to use when optimizing for T-HEAD c906. */ +static const struct riscv_tune_param thead_c906_tune_info = { + {COSTS_N_INSNS (4), COSTS_N_INSNS (5)}, /* fp_add */ + {COSTS_N_INSNS (4), COSTS_N_INSNS (5)}, /* fp_mul */ + {COSTS_N_INSNS (20), COSTS_N_INSNS (20)}, /* fp_div */ + {COSTS_N_INSNS (4), COSTS_N_INSNS (4)}, /* int_mul */ + {COSTS_N_INSNS (6), COSTS_N_INSNS (6)}, /* int_div */ + 1, /* issue_rate */ + 3, /* branch_cost */ + 5, /* memory_cost */ + false, /* slow_unaligned_access */ +}; + /* Costs to use when optimizing for size. */ static const struct riscv_tune_param optimize_size_tune_info = { {COSTS_N_INSNS (1), COSTS_N_INSNS (1)}, /* fp_add */ @@ -348,6 +361,7 @@ static const struct riscv_tune_info riscv_tune_info_table[] = { { "sifive-3-series", generic, &rocket_tune_info }, { "sifive-5-series", generic, &rocket_tune_info }, { "sifive-7-series", sifive_7, &sifive_7_tune_info }, + { "thead-c906", generic, &thead_c906_tune_info }, { "size", generic, &optimize_size_tune_info }, }; -- cgit v1.1 From 1e16f2b472c7d253d564556a048dc4ae16119c00 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 23 Jun 2021 12:50:53 +0200 Subject: i386: Prevent unwanted combine from LZCNT to BSR [PR101175] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current RTX pattern for BSR allows combine pass to convert LZCNT insn to BSR. Note that the LZCNT has a defined behavior to return the operand size when operand is zero, where BSR has not. Add a BSR specific setting of zero-flag to RTX pattern of BSR insn in order to avoid matching unwanted combinations. 2021-06-23 Uroš Bizjak gcc/ PR target/101175 * config/i386/i386.md (bsr_rex64): Add zero-flag setting RTX. (bsr): Ditto. (*bsrhi): Remove. (clz2): Update RTX pattern for additions. gcc/testsuite/ PR target/101175 * gcc.target/i386/pr101175.c: New test. 
--- gcc/config/i386/i386.md | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 700c158..4e24210 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -14533,10 +14533,12 @@ (set_attr "mode" "SI")]) (define_insn "bsr_rex64" - [(set (match_operand:DI 0 "register_operand" "=r") + [(set (reg:CCZ FLAGS_REG) + (compare:CCZ (match_operand:DI 1 "nonimmediate_operand" "rm") + (const_int 0))) + (set (match_operand:DI 0 "register_operand" "=r") (minus:DI (const_int 63) - (clz:DI (match_operand:DI 1 "nonimmediate_operand" "rm")))) - (clobber (reg:CC FLAGS_REG))] + (clz:DI (match_dup 1))))] "TARGET_64BIT" "bsr{q}\t{%1, %0|%0, %1}" [(set_attr "type" "alu1") @@ -14545,10 +14547,12 @@ (set_attr "mode" "DI")]) (define_insn "bsr" - [(set (match_operand:SI 0 "register_operand" "=r") + [(set (reg:CCZ FLAGS_REG) + (compare:CCZ (match_operand:SI 1 "nonimmediate_operand" "rm") + (const_int 0))) + (set (match_operand:SI 0 "register_operand" "=r") (minus:SI (const_int 31) - (clz:SI (match_operand:SI 1 "nonimmediate_operand" "rm")))) - (clobber (reg:CC FLAGS_REG))] + (clz:SI (match_dup 1))))] "" "bsr{l}\t{%1, %0|%0, %1}" [(set_attr "type" "alu1") @@ -14556,25 +14560,15 @@ (set_attr "znver1_decode" "vector") (set_attr "mode" "SI")]) -(define_insn "*bsrhi" - [(set (match_operand:HI 0 "register_operand" "=r") - (minus:HI (const_int 15) - (clz:HI (match_operand:HI 1 "nonimmediate_operand" "rm")))) - (clobber (reg:CC FLAGS_REG))] - "" - "bsr{w}\t{%1, %0|%0, %1}" - [(set_attr "type" "alu1") - (set_attr "prefix_0f" "1") - (set_attr "znver1_decode" "vector") - (set_attr "mode" "HI")]) - (define_expand "clz2" [(parallel - [(set (match_operand:SWI48 0 "register_operand") + [(set (reg:CCZ FLAGS_REG) + (compare:CCZ (match_operand:SWI48 1 "nonimmediate_operand" "rm") + (const_int 0))) + (set (match_operand:SWI48 0 "register_operand") (minus:SWI48 (match_dup 2) - (clz:SWI48 (match_operand:SWI48 1 "nonimmediate_operand")))) - (clobber (reg:CC FLAGS_REG))]) + (clz:SWI48 (match_dup 1))))]) (parallel [(set (match_dup 0) (xor:SWI48 (match_dup 0) (match_dup 2))) (clobber (reg:CC FLAGS_REG))])] -- cgit v1.1 From 37e93925366676201b526624e9f8dc32d82b4ff2 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 23 Jun 2021 16:14:31 +0200 Subject: i386: Add PPERM two-operand 64bit vector permutation [PR89021] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add emulation of V8QI PPERM permutations for TARGET_XOP target. Similar to PSHUFB, the permutation is performed with V16QI PPERM instruction, where selector is defined in V16QI mode with inactive elements set to 0x80. Specific to two operand permutations is the remapping of elements from the second operand (e.g. e[8] -> e[16]), as we have to account for the inactive elements from the first operand. 2021-06-23 Uroš Bizjak gcc/ PR target/89021 * config/i386/i386-expand.c (expand_vec_perm_pshufb): Handle 64bit modes for TARGET_XOP. Use indirect gen_* functions. * config/i386/mmx.md (mmx_ppermv64): New insn pattern. * config/i386/i386.md (unspec): Move UNSPEC_XOP_PERMUTE from ... * config/i386/sse.md (unspec): ... here. 
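As a rough sketch of the selector construction described above (the helper
below is illustrative only; the real logic lives in expand_vec_perm_pshufb
in the diff that follows):

/* Build the V16QI PPERM selector for a two-operand V8QI permutation.
   perm[i] in 0..15 indexes the virtual concatenation of the two V8QI
   inputs: 0..7 pick from op0, 8..15 pick from op1.  */
static void
build_pperm_selector (const unsigned char perm[8], unsigned char sel[16])
{
  for (unsigned int i = 0; i < 8; i++)
    /* Elements taken from the second operand must skip the 8 inactive
       upper bytes of the first V16QI operand, hence 8 -> 16, 9 -> 17, ...  */
    sel[i] = perm[i] < 8 ? perm[i] : perm[i] + 8;

  for (unsigned int i = 8; i < 16; i++)
    /* 0x80 makes PPERM produce a zero byte, matching the m128 filler
       used for the inactive top half of the result.  */
    sel[i] = 0x80;
}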
--- gcc/config/i386/i386-expand.c | 75 +++++++++++++++++++++++++++++++++++-------- gcc/config/i386/i386.md | 1 + gcc/config/i386/mmx.md | 13 ++++++++ gcc/config/i386/sse.md | 1 - 4 files changed, 75 insertions(+), 15 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 2986b49..9c922bf 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -17467,10 +17467,23 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) if (!d->one_operand_p) { - if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16) + if (GET_MODE_SIZE (d->vmode) == 8) + { + if (!TARGET_XOP) + return false; + vmode = V8QImode; + } + else if (GET_MODE_SIZE (d->vmode) == 16) + { + if (!TARGET_XOP) + return false; + } + else if (GET_MODE_SIZE (d->vmode) == 32) { - if (TARGET_AVX2 - && valid_perm_using_mode_p (V2TImode, d)) + if (!TARGET_AVX2) + return false; + + if (valid_perm_using_mode_p (V2TImode, d)) { if (d->testing_p) return true; @@ -17492,6 +17505,8 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) } return false; } + else + return false; } else { @@ -17651,8 +17666,22 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) { rtx m128 = GEN_INT (-128); + /* Remap elements from the second operand, as we have to + account for inactive top 8 elements from the first operand. */ + if (!d->one_operand_p) + for (i = 0; i < nelt; ++i) + { + int ival = INTVAL (rperm[i]); + if (ival >= 8) + ival += 8; + rperm[i] = GEN_INT (ival); + } + + /* V8QI is emulated with V16QI instruction, fill inactive + elements in the top 8 positions with zeros. */ for (i = nelt; i < 16; ++i) rperm[i] = m128; + vpmode = V16QImode; } @@ -17660,36 +17689,54 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) gen_rtvec_v (GET_MODE_NUNITS (vpmode), rperm)); vperm = force_reg (vpmode, vperm); - target = d->target; - if (d->vmode != vmode) + if (vmode == d->vmode) + target = d->target; + else target = gen_reg_rtx (vmode); + op0 = gen_lowpart (vmode, d->op0); + if (d->one_operand_p) { + rtx (*gen) (rtx, rtx, rtx); + if (vmode == V8QImode) - emit_insn (gen_mmx_pshufbv8qi3 (target, op0, vperm)); + gen = gen_mmx_pshufbv8qi3; else if (vmode == V16QImode) - emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm)); + gen = gen_ssse3_pshufbv16qi3; else if (vmode == V32QImode) - emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm)); + gen = gen_avx2_pshufbv32qi3; else if (vmode == V64QImode) - emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm)); + gen = gen_avx512bw_pshufbv64qi3; else if (vmode == V8SFmode) - emit_insn (gen_avx2_permvarv8sf (target, op0, vperm)); + gen = gen_avx2_permvarv8sf; else if (vmode == V8SImode) - emit_insn (gen_avx2_permvarv8si (target, op0, vperm)); + gen = gen_avx2_permvarv8si; else if (vmode == V16SFmode) - emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm)); + gen = gen_avx512f_permvarv16sf; else if (vmode == V16SImode) - emit_insn (gen_avx512f_permvarv16si (target, op0, vperm)); + gen = gen_avx512f_permvarv16si; else gcc_unreachable (); + + emit_insn (gen (target, op0, vperm)); } else { + rtx (*gen) (rtx, rtx, rtx, rtx); + op1 = gen_lowpart (vmode, d->op1); - emit_insn (gen_xop_pperm (target, op0, op1, vperm)); + + if (vmode == V8QImode) + gen = gen_mmx_ppermv64; + else if (vmode == V16QImode) + gen = gen_xop_pperm; + else + gcc_unreachable (); + + emit_insn (gen (target, op0, op1, vperm)); } + if (target != d->target) emit_move_insn (d->target, gen_lowpart (d->vmode, target)); diff --git a/gcc/config/i386/i386.md 
b/gcc/config/i386/i386.md index 4e24210..9043be3 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -120,6 +120,7 @@ UNSPEC_MOVMSK UNSPEC_BLENDV UNSPEC_PSHUFB + UNSPEC_XOP_PERMUTE UNSPEC_RCP UNSPEC_RSQRT UNSPEC_PSADBW diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index a107ac5..7a827dc 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -2331,6 +2331,19 @@ "vpcmov\t{%3, %2, %1, %0|%0, %1, %2, %3}" [(set_attr "type" "sse4arg")]) +;; XOP permute instructions +(define_insn "mmx_ppermv64" + [(set (match_operand:V8QI 0 "register_operand" "=x") + (unspec:V8QI + [(match_operand:V8QI 1 "register_operand" "x") + (match_operand:V8QI 2 "register_operand" "x") + (match_operand:V16QI 3 "nonimmediate_operand" "xm")] + UNSPEC_XOP_PERMUTE))] + "TARGET_XOP && TARGET_MMX_WITH_SSE" + "vpperm\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "sse4arg") + (set_attr "mode" "TI")]) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; Parallel integral logical operations diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index f5f9403..c5f739c 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -53,7 +53,6 @@ UNSPEC_FMADDSUB UNSPEC_XOP_UNSIGNED_CMP UNSPEC_XOP_TRUEFALSE - UNSPEC_XOP_PERMUTE UNSPEC_FRCZ ;; For AES support -- cgit v1.1 From 402c818ac0b19d168e9ffc0b3413344dd6020f6a Mon Sep 17 00:00:00 2001 From: Jeff Law Date: Tue, 22 Jun 2021 15:25:11 -0400 Subject: Use more logicals to eliminate useless test/compare instructions gcc/ * config/h8300/logical.md (3): Use so this pattern can be used for test/compare removal. Pass current insn to compute_logical_op_length and output_logical_op. * config/h8300/h8300.c (compute_logical_op_cc): Remove. (h8300_and_costs): Add argument to compute_logical_op_length. (output_logical_op): Add new argument. Use it to determine if the condition codes are used and adjust the output accordingly. (compute_logical_op_length): Add new argument and update length computations when condition codes are used. * config/h8300/h8300-protos.h (compute_logical_op_length): Update prototype. (output_logical_op): Likewise. 
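The kind of source this helps looks roughly like the following (an
illustrative example, not taken from the patch or its testsuite):

int
f (int x)
{
  x &= 0xff0f;
  /* The logical insn itself can set the condition codes for the whole
     result, so the separate test/compare feeding this branch becomes
     redundant - but only when the AND is emitted in the full mode rather
     than split into narrower insns, which is what the new cc_meaningful
     check below enforces.  */
  if (x == 0)
    return 1;
  return x;
}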
--- gcc/config/h8300/h8300-protos.h | 7 ++- gcc/config/h8300/h8300.c | 136 ++++++++++++++-------------------------- gcc/config/h8300/logical.md | 7 +-- 3 files changed, 55 insertions(+), 95 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/h8300/h8300-protos.h b/gcc/config/h8300/h8300-protos.h index af65329..d7efa97 100644 --- a/gcc/config/h8300/h8300-protos.h +++ b/gcc/config/h8300/h8300-protos.h @@ -36,10 +36,11 @@ extern const char *output_simode_bld (int, rtx[]); extern void final_prescan_insn (rtx_insn *, rtx *, int); extern int h8300_expand_movsi (rtx[]); extern machine_mode h8300_select_cc_mode (RTX_CODE, rtx, rtx); -extern const char *output_logical_op (machine_mode, rtx_code code, rtx *); -extern unsigned int compute_logical_op_length (machine_mode, rtx_code, rtx *); +extern const char *output_logical_op (machine_mode, rtx_code code, + rtx *, rtx_insn *); +extern unsigned int compute_logical_op_length (machine_mode, rtx_code, + rtx *, rtx_insn *); -extern int compute_logical_op_cc (machine_mode, rtx *); extern int compute_a_shift_cc (rtx, rtx *); #ifdef HAVE_ATTR_cc extern enum attr_cc compute_plussi_cc (rtx *); diff --git a/gcc/config/h8300/h8300.c b/gcc/config/h8300/h8300.c index 2b88325..511c2b2 100644 --- a/gcc/config/h8300/h8300.c +++ b/gcc/config/h8300/h8300.c @@ -1100,7 +1100,7 @@ h8300_and_costs (rtx x) operands[1] = XEXP (x, 0); operands[2] = XEXP (x, 1); operands[3] = x; - return compute_logical_op_length (GET_MODE (x), AND, operands) / 2; + return compute_logical_op_length (GET_MODE (x), AND, operands, NULL) / 2; } /* Compute the cost of a shift insn. */ @@ -2881,7 +2881,7 @@ compute_plussi_cc (rtx *operands) /* Output a logical insn. */ const char * -output_logical_op (machine_mode mode, rtx_code code, rtx *operands) +output_logical_op (machine_mode mode, rtx_code code, rtx *operands, rtx_insn *insn) { /* Pretend that every byte is affected if both operands are registers. */ const unsigned HOST_WIDE_INT intval = @@ -2906,6 +2906,19 @@ output_logical_op (machine_mode mode, rtx_code code, rtx *operands) const char *opname; char insn_buf[100]; + /* INSN is the current insn, we examine its overall form to see if we're + supposed to set or clobber the condition codes. + + This is important to know. If we are setting condition codes, then we + must do the operation in MODE and not in some smaller size. + + The key is to look at the second object in the PARALLEL. If it is not + a CLOBBER, then we care about the condition codes. */ + rtx pattern = PATTERN (insn); + gcc_assert (GET_CODE (pattern) == PARALLEL); + rtx second_op = XVECEXP (pattern, 0, 1); + bool cc_meaningful = (GET_CODE (second_op) != CLOBBER); + switch (code) { case AND: @@ -2928,8 +2941,9 @@ output_logical_op (machine_mode mode, rtx_code code, rtx *operands) output_asm_insn (insn_buf, operands); break; case E_HImode: - /* First, see if we can finish with one insn. */ - if (b0 != 0 && b1 != 0) + /* First, see if we can (or must) finish with one insn. */ + if (cc_meaningful + || (b0 != 0 && b1 != 0)) { sprintf (insn_buf, "%s.w\t%%T2,%%T0", opname); output_asm_insn (insn_buf, operands); @@ -2964,10 +2978,11 @@ output_logical_op (machine_mode mode, rtx_code code, rtx *operands) /* Check if doing everything with one insn is no worse than using multiple insns. 
*/ - if (w0 != 0 && w1 != 0 - && !(lower_half_easy_p && upper_half_easy_p) - && !(code == IOR && w1 == 0xffff - && (w0 & 0x8000) != 0 && lower_half_easy_p)) + if (cc_meaningful + || (w0 != 0 && w1 != 0 + && !(lower_half_easy_p && upper_half_easy_p) + && !(code == IOR && w1 == 0xffff + && (w0 & 0x8000) != 0 && lower_half_easy_p))) { sprintf (insn_buf, "%s.l\t%%S2,%%S0", opname); output_asm_insn (insn_buf, operands); @@ -3037,7 +3052,7 @@ output_logical_op (machine_mode mode, rtx_code code, rtx *operands) /* Compute the length of a logical insn. */ unsigned int -compute_logical_op_length (machine_mode mode, rtx_code code, rtx *operands) +compute_logical_op_length (machine_mode mode, rtx_code code, rtx *operands, rtx_insn *insn) { /* Pretend that every byte is affected if both operands are registers. */ const unsigned HOST_WIDE_INT intval = @@ -3061,6 +3076,23 @@ compute_logical_op_length (machine_mode mode, rtx_code code, rtx *operands) /* Insn length. */ unsigned int length = 0; + /* INSN is the current insn, we examine its overall form to see if we're + supposed to set or clobber the condition codes. + + This is important to know. If we are setting condition codes, then we + must do the operation in MODE and not in some smaller size. + + The key is to look at the second object in the PARALLEL. If it is not + a CLOBBER, then we care about the condition codes. */ + bool cc_meaningful = false; + if (insn) + { + rtx pattern = PATTERN (insn); + gcc_assert (GET_CODE (pattern) == PARALLEL); + rtx second_op = XVECEXP (pattern, 0, 1); + cc_meaningful = (GET_CODE (second_op) != CLOBBER); + } + switch (mode) { case E_QImode: @@ -3068,7 +3100,8 @@ compute_logical_op_length (machine_mode mode, rtx_code code, rtx *operands) case E_HImode: /* First, see if we can finish with one insn. */ - if (b0 != 0 && b1 != 0) + if (cc_meaningful + || (b0 != 0 && b1 != 0)) { length = h8300_length_from_table (operands[1], operands[2], &logicw_length_table); @@ -3098,10 +3131,11 @@ compute_logical_op_length (machine_mode mode, rtx_code code, rtx *operands) /* Check if doing everything with one insn is no worse than using multiple insns. */ - if (w0 != 0 && w1 != 0 - && !(lower_half_easy_p && upper_half_easy_p) - && !(code == IOR && w1 == 0xffff - && (w0 & 0x8000) != 0 && lower_half_easy_p)) + if (cc_meaningful + || (w0 != 0 && w1 != 0 + && !(lower_half_easy_p && upper_half_easy_p) + && !(code == IOR && w1 == 0xffff + && (w0 & 0x8000) != 0 && lower_half_easy_p))) { length = h8300_length_from_table (operands[1], operands[2], &logicl_length_table); @@ -3158,80 +3192,6 @@ compute_logical_op_length (machine_mode mode, rtx_code code, rtx *operands) return length; } -/* Compute which flag bits are valid after a logical insn. */ - -int -compute_logical_op_cc (machine_mode mode, rtx *operands) -{ - /* Figure out the logical op that we need to perform. */ - enum rtx_code code = GET_CODE (operands[3]); - /* Pretend that every byte is affected if both operands are registers. */ - const unsigned HOST_WIDE_INT intval = - (unsigned HOST_WIDE_INT) ((GET_CODE (operands[2]) == CONST_INT) - /* Always use the full instruction if the - first operand is in memory. It is better - to use define_splits to generate the shorter - sequence where valid. */ - && register_operand (operands[1], VOIDmode) - ? INTVAL (operands[2]) : 0x55555555); - /* The determinant of the algorithm. If we perform an AND, 0 - affects a bit. Otherwise, 1 affects a bit. */ - const unsigned HOST_WIDE_INT det = (code != AND) ? 
intval : ~intval; - /* Break up DET into pieces. */ - const unsigned HOST_WIDE_INT b0 = (det >> 0) & 0xff; - const unsigned HOST_WIDE_INT b1 = (det >> 8) & 0xff; - const unsigned HOST_WIDE_INT w0 = (det >> 0) & 0xffff; - const unsigned HOST_WIDE_INT w1 = (det >> 16) & 0xffff; - int lower_half_easy_p = 0; - int upper_half_easy_p = 0; - /* Condition code. */ - enum attr_old_cc cc = OLD_CC_CLOBBER; - - switch (mode) - { - case E_HImode: - /* First, see if we can finish with one insn. */ - if (b0 != 0 && b1 != 0) - { - cc = OLD_CC_SET_ZNV; - } - break; - case E_SImode: - /* Determine if the lower half can be taken care of in no more - than two bytes. */ - lower_half_easy_p = (b0 == 0 - || b1 == 0 - || (code != IOR && w0 == 0xffff)); - - /* Determine if the upper half can be taken care of in no more - than two bytes. */ - upper_half_easy_p = ((code != IOR && w1 == 0xffff) - || (code == AND && w1 == 0xff00)); - - /* Check if doing everything with one insn is no worse than - using multiple insns. */ - if (w0 != 0 && w1 != 0 - && !(lower_half_easy_p && upper_half_easy_p) - && !(code == IOR && w1 == 0xffff - && (w0 & 0x8000) != 0 && lower_half_easy_p)) - { - cc = OLD_CC_SET_ZNV; - } - else - { - if (code == IOR - && w1 == 0xffff - && (w0 & 0x8000) != 0) - { - cc = OLD_CC_SET_ZNV; - } - } - break; - default: - gcc_unreachable (); - } - return cc; -} #if 0 /* Expand a conditional branch. */ diff --git a/gcc/config/h8300/logical.md b/gcc/config/h8300/logical.md index 07d36cf..f07c79e 100644 --- a/gcc/config/h8300/logical.md +++ b/gcc/config/h8300/logical.md @@ -251,17 +251,16 @@ (logicals:QHSI (match_dup 1) (match_dup 2))) (clobber (reg:CC CC_REG))])]) -(define_insn "*3_clobber_flags" +(define_insn "*3" [(set (match_operand:QHSI 0 "h8300_dst_operand" "=rQ") (logicals:QHSI (match_operand:QHSI 1 "h8300_dst_operand" "%0") (match_operand:QHSI 2 "h8300_src_operand" "rQi"))) (clobber (reg:CC CC_REG))] "h8300_operands_match_p (operands)" - { return output_logical_op (mode, , operands); } + { return output_logical_op (mode, , operands, insn); } [(set (attr "length") - (symbol_ref "compute_logical_op_length (mode, , operands)"))]) - + (symbol_ref "compute_logical_op_length (mode, , operands, insn)"))]) ;; ---------------------------------------------------------------------- ;; NOT INSTRUCTIONS -- cgit v1.1 From b7574ca6edb5ac7fab741f46d4f50ac4294c8180 Mon Sep 17 00:00:00 2001 From: Aaron Sawdey Date: Tue, 22 Jun 2021 16:02:15 -0500 Subject: Do not enable pcrel-opt by default SPEC2017 testing on p10 shows that this optimization does not have a positive impact on performance. So we are no longer going to enable it by default. The test cases for it needed to be updated so they always enable it to test it. gcc/ * config/rs6000/rs6000-cpus.def: Take OPTION_MASK_PCREL_OPT out of OTHER_POWER10_MASKS so it will not be enabled by default. gcc/testsuite/ * gcc.target/powerpc/pcrel-opt-inc-di.c: Enable -mpcrel-opt to test it. * gcc.target/powerpc/pcrel-opt-ld-df.c: Enable -mpcrel-opt to test it. * gcc.target/powerpc/pcrel-opt-ld-di.c: Enable -mpcrel-opt to test it. * gcc.target/powerpc/pcrel-opt-ld-hi.c: Enable -mpcrel-opt to test it. * gcc.target/powerpc/pcrel-opt-ld-qi.c: Enable -mpcrel-opt to test it. * gcc.target/powerpc/pcrel-opt-ld-sf.c: Enable -mpcrel-opt to test it. * gcc.target/powerpc/pcrel-opt-ld-si.c: Enable -mpcrel-opt to test it. * gcc.target/powerpc/pcrel-opt-ld-vector.c: Enable -mpcrel-opt to test it. * gcc.target/powerpc/pcrel-opt-st-df.c: Enable -mpcrel-opt to test it. 
* gcc.target/powerpc/pcrel-opt-st-di.c: Enable -mpcrel-opt to test it. * gcc.target/powerpc/pcrel-opt-st-hi.c: Enable -mpcrel-opt to test it. * gcc.target/powerpc/pcrel-opt-st-qi.c: Enable -mpcrel-opt to test it. * gcc.target/powerpc/pcrel-opt-st-sf.c: Enable -mpcrel-opt to test it. * gcc.target/powerpc/pcrel-opt-st-si.c: Enable -mpcrel-opt to test it. * gcc.target/powerpc/pcrel-opt-st-vector.c: Enable -mpcrel-opt to test it. --- gcc/config/rs6000/rs6000-cpus.def | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-cpus.def b/gcc/config/rs6000/rs6000-cpus.def index 52ce848..6758296 100644 --- a/gcc/config/rs6000/rs6000-cpus.def +++ b/gcc/config/rs6000/rs6000-cpus.def @@ -75,9 +75,11 @@ | OPTION_MASK_P9_VECTOR) /* Flags that need to be turned off if -mno-power10. */ +/* We comment out PCREL_OPT here to disable it by default because SPEC2017 + performance was degraded by it. */ #define OTHER_POWER10_MASKS (OPTION_MASK_MMA \ | OPTION_MASK_PCREL \ - | OPTION_MASK_PCREL_OPT \ + /* | OPTION_MASK_PCREL_OPT */ \ | OPTION_MASK_PREFIXED) #define ISA_3_1_MASKS_SERVER (ISA_3_0_MASKS_SERVER \ -- cgit v1.1 From 3bd86940c428de9dde53e41265fb1435ed236f5e Mon Sep 17 00:00:00 2001 From: liuhongt Date: Tue, 26 Jan 2021 16:29:32 +0800 Subject: i386: Add vashlm3/vashrm3/vlshrm3 to enable vectorization of vector shift vector. [PR98434] Add expanders for vashl, vlshr, vashr and vashr. Besides there's some assumption in expand_mult_const that mul and add must be available at the same time, but for i386, addv8qi is restricted under TARGET_64BIT, but mulv8qi not, that could cause ICE. So restrict mulv8qi and shiftv8qi under TARGET_64BIT. gcc/ChangeLog: PR target/98434 * config/i386/i386-expand.c (ix86_expand_vec_interleave): Adjust comments for ix86_expand_vecop_qihi2. (ix86_expand_vecmul_qihi): Renamed to .. (ix86_expand_vecop_qihi2): Adjust function prototype to support shift operation, add static to definition. (ix86_expand_vec_shift_qihi_constant): Add static to definition. (ix86_expand_vecop_qihi): Call ix86_expand_vecop_qihi2 and ix86_expand_vec_shift_qihi_constant. * config/i386/i386-protos.h (ix86_expand_vecmul_qihi): Deleted. (ix86_expand_vec_shift_qihi_constant): Deleted. * config/i386/sse.md (VI12_256_512_AVX512VL): New mode iterator. (mulv8qi3): Call ix86_expand_vecop_qihi directly, add condition TARGET_64BIT. (mul3): Ditto. (3): Ditto. (vlshr3): Extend to support avx512 vlshr. (v3): New expander for vashr/vlshr/vashl. (vv8qi3): Ditto. (vashrv8hi3): Renamed to .. (vashr3): And extend to support V16QImode for avx512. (vashrv16qi3): Deleted. (vashrv2di3): Extend expander to support avx512 instruction. gcc/testsuite/ChangeLog: PR target/98434 * gcc.target/i386/pr98434-1.c: New test. * gcc.target/i386/pr98434-2.c: New test. * gcc.target/i386/avx512vl-pr95488-1.c: Adjust testcase. --- gcc/config/i386/i386-expand.c | 73 +++++++++++++++++++++------ gcc/config/i386/i386-protos.h | 3 -- gcc/config/i386/sse.md | 111 +++++++++++++++++++++++++++++------------- 3 files changed, 133 insertions(+), 54 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 9c922bf..2cb939e 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -20705,8 +20705,9 @@ ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p) gcc_assert (ok); } -/* Optimize vector MUL generation for V8QI, V16QI and V32QI - under TARGET_AVX512BW. i.e. 
for v16qi a * b, it has +/* This function is similar as ix86_expand_vecop_qihi, + but optimized under AVX512BW by using vpmovwb. + For example, optimize vector MUL generation like vpmovzxbw ymm2, xmm0 vpmovzxbw ymm3, xmm1 @@ -20716,13 +20717,14 @@ ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p) it would take less instructions than ix86_expand_vecop_qihi. Return true if success. */ -bool -ix86_expand_vecmul_qihi (rtx dest, rtx op1, rtx op2) +static bool +ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2) { machine_mode himode, qimode = GET_MODE (dest); rtx hop1, hop2, hdest; rtx (*gen_extend)(rtx, rtx); rtx (*gen_truncate)(rtx, rtx); + bool uns_p = (code == ASHIFTRT) ? false : true; /* There's no V64HImode multiplication instruction. */ if (qimode == E_V64QImode) @@ -20743,17 +20745,17 @@ ix86_expand_vecmul_qihi (rtx dest, rtx op1, rtx op2) { case E_V8QImode: himode = V8HImode; - gen_extend = gen_zero_extendv8qiv8hi2; + gen_extend = uns_p ? gen_zero_extendv8qiv8hi2 : gen_extendv8qiv8hi2; gen_truncate = gen_truncv8hiv8qi2; break; case E_V16QImode: himode = V16HImode; - gen_extend = gen_zero_extendv16qiv16hi2; + gen_extend = uns_p ? gen_zero_extendv16qiv16hi2 : gen_extendv16qiv16hi2; gen_truncate = gen_truncv16hiv16qi2; break; case E_V32QImode: himode = V32HImode; - gen_extend = gen_zero_extendv32qiv32hi2; + gen_extend = uns_p ? gen_zero_extendv32qiv32hi2 : gen_extendv32qiv32hi2; gen_truncate = gen_truncv32hiv32qi2; break; default: @@ -20765,7 +20767,7 @@ ix86_expand_vecmul_qihi (rtx dest, rtx op1, rtx op2) hdest = gen_reg_rtx (himode); emit_insn (gen_extend (hop1, op1)); emit_insn (gen_extend (hop2, op2)); - emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (MULT, himode, + emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (code, himode, hop1, hop2))); emit_insn (gen_truncate (dest, hdest)); return true; @@ -20773,8 +20775,9 @@ ix86_expand_vecmul_qihi (rtx dest, rtx op1, rtx op2) /* Expand a vector operation shift by constant for a V*QImode in terms of the same operation on V*HImode. Return true if success. 
*/ -bool -ix86_expand_vec_shift_qihi_constant (enum rtx_code code, rtx dest, rtx op1, rtx op2) +static bool +ix86_expand_vec_shift_qihi_constant (enum rtx_code code, + rtx dest, rtx op1, rtx op2) { machine_mode qimode, himode; HOST_WIDE_INT and_constant, xor_constant; @@ -20886,6 +20889,16 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2) bool uns_p = false; int i; + if (CONST_INT_P (op2) + && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT) + && ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2)) + return; + + if (TARGET_AVX512BW + && VECTOR_MODE_P (GET_MODE (op2)) + && ix86_expand_vecop_qihi2 (code, dest, op1, op2)) + return; + switch (qimode) { case E_V16QImode: @@ -20907,7 +20920,6 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2) gcc_unreachable (); } - op2_l = op2_h = op2; switch (code) { case MULT: @@ -20936,17 +20948,46 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2) op1_h = gen_reg_rtx (himode); ix86_expand_sse_unpack (op1_l, op1, uns_p, false); ix86_expand_sse_unpack (op1_h, op1, uns_p, true); + /* vashr/vlshr/vashl */ + if (GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT) + { + rtx tmp = force_reg (qimode, op2); + op2_l = gen_reg_rtx (himode); + op2_h = gen_reg_rtx (himode); + ix86_expand_sse_unpack (op2_l, tmp, uns_p, false); + ix86_expand_sse_unpack (op2_h, tmp, uns_p, true); + } + else + op2_l = op2_h = op2; + full_interleave = true; break; default: gcc_unreachable (); } - /* Perform the operation. */ - res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX, - 1, OPTAB_DIRECT); - res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX, - 1, OPTAB_DIRECT); + /* Perform vashr/vlshr/vashl. */ + if (code != MULT + && GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT) + { + res_l = gen_reg_rtx (himode); + res_h = gen_reg_rtx (himode); + emit_insn (gen_rtx_SET (res_l, + simplify_gen_binary (code, himode, + op1_l, op2_l))); + emit_insn (gen_rtx_SET (res_h, + simplify_gen_binary (code, himode, + op1_h, op2_h))); + } + /* Performance mult/ashr/lshr/ashl. */ + else + { + res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX, + 1, OPTAB_DIRECT); + res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX, + 1, OPTAB_DIRECT); + } + gcc_assert (res_l && res_h); /* Merge the data back into the right place. 
*/ diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 1d05206..65fc307 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -208,10 +208,7 @@ extern void ix86_expand_round (rtx, rtx); extern void ix86_expand_rounddf_32 (rtx, rtx); extern void ix86_expand_round_sse4 (rtx, rtx); -extern bool ix86_expand_vecmul_qihi (rtx, rtx, rtx); extern void ix86_expand_vecop_qihi (enum rtx_code, rtx, rtx, rtx); -extern bool ix86_expand_vec_shift_qihi_constant (enum rtx_code, rtx, rtx, rtx); - extern rtx ix86_split_stack_guard (void); extern void ix86_move_vector_high_sse_to_mmx (rtx); diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index c5f739c..5bd65dd 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -397,6 +397,10 @@ (define_mode_iterator VI1_AVX512F [(V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX") V16QI]) +(define_mode_iterator VI12_256_512_AVX512VL + [V64QI (V32QI "TARGET_AVX512VL") + V32HI (V16HI "TARGET_AVX512VL")]) + (define_mode_iterator VI2_AVX2 [(V32HI "TARGET_AVX512BW") (V16HI "TARGET_AVX2") V8HI]) @@ -11780,9 +11784,9 @@ [(set (match_operand:V8QI 0 "register_operand") (mult:V8QI (match_operand:V8QI 1 "register_operand") (match_operand:V8QI 2 "register_operand")))] - "TARGET_AVX512VL && TARGET_AVX512BW" + "TARGET_AVX512VL && TARGET_AVX512BW && TARGET_64BIT" { - gcc_assert (ix86_expand_vecmul_qihi (operands[0], operands[1], operands[2])); + ix86_expand_vecop_qihi (MULT, operands[0], operands[1], operands[2]); DONE; }) @@ -11792,8 +11796,6 @@ (match_operand:VI1_AVX512 2 "register_operand")))] "TARGET_SSE2" { - if (ix86_expand_vecmul_qihi (operands[0], operands[1], operands[2])) - DONE; ix86_expand_vecop_qihi (MULT, operands[0], operands[1], operands[2]); DONE; }) @@ -20239,12 +20241,20 @@ (lshiftrt:VI12_128 (match_operand:VI12_128 1 "register_operand") (match_operand:VI12_128 2 "nonimmediate_operand")))] - "TARGET_XOP" + "TARGET_XOP || (TARGET_AVX512BW && TARGET_AVX512VL)" { - rtx neg = gen_reg_rtx (mode); - emit_insn (gen_neg2 (neg, operands[2])); - emit_insn (gen_xop_shl3 (operands[0], operands[1], neg)); - DONE; + if (TARGET_XOP) + { + rtx neg = gen_reg_rtx (mode); + emit_insn (gen_neg2 (neg, operands[2])); + emit_insn (gen_xop_shl3 (operands[0], operands[1], neg)); + DONE; + } + else if (mode == V16QImode) + { + ix86_expand_vecop_qihi (LSHIFTRT, operands[0], operands[1], operands[2]); + DONE; + } }) (define_expand "vlshr3" @@ -20263,6 +20273,31 @@ } }) +(define_expand "v3" + [(set (match_operand:VI12_256_512_AVX512VL 0 "register_operand") + (any_shift:VI12_256_512_AVX512VL + (match_operand:VI12_256_512_AVX512VL 1 "register_operand") + (match_operand:VI12_256_512_AVX512VL 2 "nonimmediate_operand")))] + "TARGET_AVX512BW" +{ + if (mode == V32QImode || mode == V64QImode) + { + ix86_expand_vecop_qihi (, operands[0], operands[1], operands[2]); + DONE; + } +}) + +(define_expand "vv8qi3" + [(set (match_operand:V8QI 0 "register_operand") + (any_shift:V8QI + (match_operand:V8QI 1 "register_operand") + (match_operand:V8QI 2 "nonimmediate_operand")))] + "TARGET_AVX512BW && TARGET_AVX512VL && TARGET_64BIT" +{ + ix86_expand_vecop_qihi (, operands[0], operands[1], operands[2]); + DONE; +}) + (define_expand "vlshr3" [(set (match_operand:VI48_512 0 "register_operand") (lshiftrt:VI48_512 @@ -20277,33 +20312,32 @@ (match_operand:VI48_256 2 "nonimmediate_operand")))] "TARGET_AVX2") -(define_expand "vashrv8hi3" - [(set (match_operand:V8HI 0 "register_operand") - (ashiftrt:V8HI - (match_operand:V8HI 1 "register_operand") - 
(match_operand:V8HI 2 "nonimmediate_operand")))] +(define_expand "vashr3" + [(set (match_operand:VI8_256_512 0 "register_operand") + (ashiftrt:VI8_256_512 + (match_operand:VI8_256_512 1 "register_operand") + (match_operand:VI8_256_512 2 "nonimmediate_operand")))] + "TARGET_AVX512F") + +(define_expand "vashr3" + [(set (match_operand:VI12_128 0 "register_operand") + (ashiftrt:VI12_128 + (match_operand:VI12_128 1 "register_operand") + (match_operand:VI12_128 2 "nonimmediate_operand")))] "TARGET_XOP || (TARGET_AVX512BW && TARGET_AVX512VL)" { if (TARGET_XOP) { - rtx neg = gen_reg_rtx (V8HImode); - emit_insn (gen_negv8hi2 (neg, operands[2])); - emit_insn (gen_xop_shav8hi3 (operands[0], operands[1], neg)); + rtx neg = gen_reg_rtx (mode); + emit_insn (gen_neg2 (neg, operands[2])); + emit_insn (gen_xop_sha3 (operands[0], operands[1], neg)); + DONE; + } + else if(mode == V16QImode) + { + ix86_expand_vecop_qihi (ASHIFTRT, operands[0],operands[1], operands[2]); DONE; } -}) - -(define_expand "vashrv16qi3" - [(set (match_operand:V16QI 0 "register_operand") - (ashiftrt:V16QI - (match_operand:V16QI 1 "register_operand") - (match_operand:V16QI 2 "nonimmediate_operand")))] - "TARGET_XOP" -{ - rtx neg = gen_reg_rtx (V16QImode); - emit_insn (gen_negv16qi2 (neg, operands[2])); - emit_insn (gen_xop_shav16qi3 (operands[0], operands[1], neg)); - DONE; }) (define_expand "vashrv2di3" @@ -20354,10 +20388,18 @@ (ashift:VI12_128 (match_operand:VI12_128 1 "register_operand") (match_operand:VI12_128 2 "nonimmediate_operand")))] - "TARGET_XOP" + "TARGET_XOP || (TARGET_AVX512BW && TARGET_AVX512VL)" { - emit_insn (gen_xop_sha3 (operands[0], operands[1], operands[2])); - DONE; + if (TARGET_XOP) + { + emit_insn (gen_xop_sha3 (operands[0], operands[1], operands[2])); + DONE; + } + else if (mode == V16QImode) + { + ix86_expand_vecop_qihi (ASHIFT, operands[0], operands[1], operands[2]); + DONE; + } }) (define_expand "vashl3" @@ -20461,8 +20503,7 @@ gen = ( == LSHIFTRT ? gen_xop_shlv16qi3 : gen_xop_shav16qi3); emit_insn (gen (operands[0], operands[1], tmp)); } - else if (!ix86_expand_vec_shift_qihi_constant (, operands[0], - operands[1], operands[2])) + else ix86_expand_vecop_qihi (, operands[0], operands[1], operands[2]); DONE; }) -- cgit v1.1 From a1c1b7a888ade6f21bc7c7f05a2cbff290273fcc Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Thu, 17 Jun 2021 14:18:17 +0200 Subject: IBM Z: Define NO_PROFILE_COUNTERS s390 glibc does not need counters in the .data section, since it stores edge hits in its own data structure. Therefore counters only waste space and confuse diffing tools (e.g. kpatch), so don't generate them. gcc/ChangeLog: * config/s390/s390.c (s390_function_profiler): Ignore labelno parameter. * config/s390/s390.h (NO_PROFILE_COUNTERS): Define. gcc/testsuite/ChangeLog: * gcc.target/s390/mnop-mcount-m31-mzarch.c: Adapt to the new prologue size. * gcc.target/s390/mnop-mcount-m64.c: Likewise. --- gcc/config/s390/s390.c | 42 ++++++++++++++++-------------------------- gcc/config/s390/s390.h | 2 ++ 2 files changed, 18 insertions(+), 26 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c index 6bbeb64..590dd8f 100644 --- a/gcc/config/s390/s390.c +++ b/gcc/config/s390/s390.c @@ -13110,33 +13110,25 @@ output_asm_nops (const char *user, int hw) } } -/* Output assembler code to FILE to increment profiler label # LABELNO - for profiling a function entry. */ +/* Output assembler code to FILE to call a profiler hook. 
*/ void -s390_function_profiler (FILE *file, int labelno) +s390_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED) { - rtx op[8]; - - char label[128]; - ASM_GENERATE_INTERNAL_LABEL (label, "LP", labelno); + rtx op[4]; fprintf (file, "# function profiler \n"); op[0] = gen_rtx_REG (Pmode, RETURN_REGNUM); op[1] = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM); op[1] = gen_rtx_MEM (Pmode, plus_constant (Pmode, op[1], UNITS_PER_LONG)); - op[7] = GEN_INT (UNITS_PER_LONG); - - op[2] = gen_rtx_REG (Pmode, 1); - op[3] = gen_rtx_SYMBOL_REF (Pmode, label); - SYMBOL_REF_FLAGS (op[3]) = SYMBOL_FLAG_LOCAL; + op[3] = GEN_INT (UNITS_PER_LONG); - op[4] = gen_rtx_SYMBOL_REF (Pmode, flag_fentry ? "__fentry__" : "_mcount"); + op[2] = gen_rtx_SYMBOL_REF (Pmode, flag_fentry ? "__fentry__" : "_mcount"); if (flag_pic) { - op[4] = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op[4]), UNSPEC_PLT); - op[4] = gen_rtx_CONST (Pmode, op[4]); + op[2] = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op[2]), UNSPEC_PLT); + op[2] = gen_rtx_CONST (Pmode, op[2]); } if (flag_record_mcount) @@ -13150,20 +13142,19 @@ s390_function_profiler (FILE *file, int labelno) warning (OPT_Wcannot_profile, "nested functions cannot be profiled " "with %<-mfentry%> on s390"); else - output_asm_insn ("brasl\t0,%4", op); + output_asm_insn ("brasl\t0,%2", op); } else if (TARGET_64BIT) { if (flag_nop_mcount) - output_asm_nops ("-mnop-mcount", /* stg */ 3 + /* larl */ 3 + - /* brasl */ 3 + /* lg */ 3); + output_asm_nops ("-mnop-mcount", /* stg */ 3 + /* brasl */ 3 + + /* lg */ 3); else { output_asm_insn ("stg\t%0,%1", op); if (flag_dwarf2_cfi_asm) - output_asm_insn (".cfi_rel_offset\t%0,%7", op); - output_asm_insn ("larl\t%2,%3", op); - output_asm_insn ("brasl\t%0,%4", op); + output_asm_insn (".cfi_rel_offset\t%0,%3", op); + output_asm_insn ("brasl\t%0,%2", op); output_asm_insn ("lg\t%0,%1", op); if (flag_dwarf2_cfi_asm) output_asm_insn (".cfi_restore\t%0", op); @@ -13172,15 +13163,14 @@ s390_function_profiler (FILE *file, int labelno) else { if (flag_nop_mcount) - output_asm_nops ("-mnop-mcount", /* st */ 2 + /* larl */ 3 + - /* brasl */ 3 + /* l */ 2); + output_asm_nops ("-mnop-mcount", /* st */ 2 + /* brasl */ 3 + + /* l */ 2); else { output_asm_insn ("st\t%0,%1", op); if (flag_dwarf2_cfi_asm) - output_asm_insn (".cfi_rel_offset\t%0,%7", op); - output_asm_insn ("larl\t%2,%3", op); - output_asm_insn ("brasl\t%0,%4", op); + output_asm_insn (".cfi_rel_offset\t%0,%3", op); + output_asm_insn ("brasl\t%0,%2", op); output_asm_insn ("l\t%0,%1", op); if (flag_dwarf2_cfi_asm) output_asm_insn (".cfi_restore\t%0", op); diff --git a/gcc/config/s390/s390.h b/gcc/config/s390/s390.h index 3b87616..fb16a45 100644 --- a/gcc/config/s390/s390.h +++ b/gcc/config/s390/s390.h @@ -787,6 +787,8 @@ CUMULATIVE_ARGS; #define PROFILE_BEFORE_PROLOGUE 1 +#define NO_PROFILE_COUNTERS 1 + /* Trampolines for nested functions. */ -- cgit v1.1 From 7a6c31f0f84a7295433ebac09b94fae2d5cc2892 Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Mon, 31 May 2021 13:19:01 +0200 Subject: Add x86 addsub SLP pattern This addds SLP pattern recognition for the SSE3/AVX [v]addsubp{ds} v0, v1 instructions which compute { v0[0] - v1[0], v0[1], + v1[1], ... } thus subtract, add alternating on lanes, starting with subtract. It adds a corresponding optab and direct internal function, vec_addsub$a3 and renames the existing i386 backend patterns to the new canonical name. 
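A minimal example of the lane pattern being recognized (a sketch in the
spirit of the new gcc.target/i386/vect-addsub* testcases, not copied from
them):

void
f (double *restrict r, const double *a, const double *b)
{
  /* Lane 0 subtracts, lane 1 adds - exactly the alternation addsubpd
     implements, starting with the subtract.  */
  r[0] = a[0] - b[0];
  r[1] = a[1] + b[1];
}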
The SLP pattern matches the exact alternating lane sequence rather than trying to be clever and anticipating incoming permutes - we could permute the two input vectors to the needed lane alternation, do the addsub and then permute the result vector back but that's only profitable in case the two input or the output permute will vanish - something Tamars refactoring of SLP pattern recog should make possible. 2021-06-17 Richard Biener * config/i386/sse.md (avx_addsubv4df3): Rename to vec_addsubv4df3. (avx_addsubv8sf3): Rename to vec_addsubv8sf3. (sse3_addsubv2df3): Rename to vec_addsubv2df3. (sse3_addsubv4sf3): Rename to vec_addsubv4sf3. * config/i386/i386-builtin.def: Adjust. * internal-fn.def (VEC_ADDSUB): New internal optab fn. * optabs.def (vec_addsub_optab): New optab. * tree-vect-slp-patterns.c (class addsub_pattern): New. (slp_patterns): Add addsub_pattern. * tree-vect-slp.c (vect_optimize_slp): Disable propagation across CFN_VEC_ADDSUB. * tree-vectorizer.h (vect_pattern::vect_pattern): Make m_ops optional. * doc/md.texi (vec_addsub3): Document. * gcc.target/i386/vect-addsubv2df.c: New testcase. * gcc.target/i386/vect-addsubv4sf.c: Likewise. * gcc.target/i386/vect-addsubv4df.c: Likewise. * gcc.target/i386/vect-addsubv8sf.c: Likewise. * gcc.target/i386/vect-addsub-2.c: Likewise. * gcc.target/i386/vect-addsub-3.c: Likewise. --- gcc/config/i386/i386-builtin.def | 8 ++++---- gcc/config/i386/sse.md | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def index 31df3a6..ea79e0b 100644 --- a/gcc/config/i386/i386-builtin.def +++ b/gcc/config/i386/i386-builtin.def @@ -855,8 +855,8 @@ BDESC (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX, 0, CODE_FOR_mmx_subv1di3, "__ BDESC (OPTION_MASK_ISA_SSE3, 0, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF) BDESC (OPTION_MASK_ISA_SSE3, 0, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF) -BDESC (OPTION_MASK_ISA_SSE3, 0, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF) -BDESC (OPTION_MASK_ISA_SSE3, 0, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF) +BDESC (OPTION_MASK_ISA_SSE3, 0, CODE_FOR_vec_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF) +BDESC (OPTION_MASK_ISA_SSE3, 0, CODE_FOR_vec_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF) BDESC (OPTION_MASK_ISA_SSE3, 0, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF) BDESC (OPTION_MASK_ISA_SSE3, 0, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF) BDESC (OPTION_MASK_ISA_SSE3, 0, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF) @@ -996,8 +996,8 @@ BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128 /* AVX */ BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF) BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF) -BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", 
IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF) -BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF) +BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_vec_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF) +BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_vec_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF) BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF) BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF) BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF) diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 5bd65dd..1f1db82 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -2410,7 +2410,7 @@ (set_attr "prefix" "") (set_attr "mode" "")]) -(define_insn "avx_addsubv4df3" +(define_insn "vec_addsubv4df3" [(set (match_operand:V4DF 0 "register_operand" "=x") (vec_merge:V4DF (minus:V4DF @@ -2424,7 +2424,7 @@ (set_attr "prefix" "vex") (set_attr "mode" "V4DF")]) -(define_insn "sse3_addsubv2df3" +(define_insn "vec_addsubv2df3" [(set (match_operand:V2DF 0 "register_operand" "=x,x") (vec_merge:V2DF (minus:V2DF @@ -2442,7 +2442,7 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "V2DF")]) -(define_insn "avx_addsubv8sf3" +(define_insn "vec_addsubv8sf3" [(set (match_operand:V8SF 0 "register_operand" "=x") (vec_merge:V8SF (minus:V8SF @@ -2456,7 +2456,7 @@ (set_attr "prefix" "vex") (set_attr "mode" "V8SF")]) -(define_insn "sse3_addsubv4sf3" +(define_insn "vec_addsubv4sf3" [(set (match_operand:V4SF 0 "register_operand" "=x,x") (vec_merge:V4SF (minus:V4SF -- cgit v1.1 From 67e872336d0945523071ceec6580a584db7061a9 Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Fri, 18 Jun 2021 09:29:10 +0200 Subject: Merge vec_addsub patterns This merges the vec_addsub3 patterns using a mode attribute for the vec_merge merge operand. 2021-06-18 Richard Biener * config/i386/sse.md (vec_addsubv4df3, vec_addsubv2df3, vec_addsubv8sf3, vec_addsubv4sf3): Merge into ... (vec_addsub3): ... using a new addsub_cst mode attribute. 
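For reference, the new addsub_cst values are simply the vec_merge
selectors that keep the "minus" result in the even lanes (and the "plus"
result in the odd lanes), with lane 0 in the least significant bit:

  V2DF:   1 = 0b1
  V4SF:   5 = 0b0101
  V4DF:   5 = 0b0101
  V8SF:  85 = 0b01010101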
--- gcc/config/i386/sse.md | 81 +++++++++++++++----------------------------------- 1 file changed, 24 insertions(+), 57 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 1f1db82..2d29877 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -2410,69 +2410,36 @@ (set_attr "prefix" "") (set_attr "mode" "")]) -(define_insn "vec_addsubv4df3" - [(set (match_operand:V4DF 0 "register_operand" "=x") - (vec_merge:V4DF - (minus:V4DF - (match_operand:V4DF 1 "register_operand" "x") - (match_operand:V4DF 2 "nonimmediate_operand" "xm")) - (plus:V4DF (match_dup 1) (match_dup 2)) - (const_int 5)))] - "TARGET_AVX" - "vaddsubpd\t{%2, %1, %0|%0, %1, %2}" - [(set_attr "type" "sseadd") - (set_attr "prefix" "vex") - (set_attr "mode" "V4DF")]) - -(define_insn "vec_addsubv2df3" - [(set (match_operand:V2DF 0 "register_operand" "=x,x") - (vec_merge:V2DF - (minus:V2DF - (match_operand:V2DF 1 "register_operand" "0,x") - (match_operand:V2DF 2 "vector_operand" "xBm,xm")) - (plus:V2DF (match_dup 1) (match_dup 2)) - (const_int 1)))] - "TARGET_SSE3" - "@ - addsubpd\t{%2, %0|%0, %2} - vaddsubpd\t{%2, %1, %0|%0, %1, %2}" - [(set_attr "isa" "noavx,avx") - (set_attr "type" "sseadd") - (set_attr "atom_unit" "complex") - (set_attr "prefix" "orig,vex") - (set_attr "mode" "V2DF")]) - -(define_insn "vec_addsubv8sf3" - [(set (match_operand:V8SF 0 "register_operand" "=x") - (vec_merge:V8SF - (minus:V8SF - (match_operand:V8SF 1 "register_operand" "x") - (match_operand:V8SF 2 "nonimmediate_operand" "xm")) - (plus:V8SF (match_dup 1) (match_dup 2)) - (const_int 85)))] - "TARGET_AVX" - "vaddsubps\t{%2, %1, %0|%0, %1, %2}" - [(set_attr "type" "sseadd") - (set_attr "prefix" "vex") - (set_attr "mode" "V8SF")]) +(define_mode_attr addsub_cst [(V4DF "5") (V2DF "1") + (V4SF "5") (V8SF "85")]) -(define_insn "vec_addsubv4sf3" - [(set (match_operand:V4SF 0 "register_operand" "=x,x") - (vec_merge:V4SF - (minus:V4SF - (match_operand:V4SF 1 "register_operand" "0,x") - (match_operand:V4SF 2 "vector_operand" "xBm,xm")) - (plus:V4SF (match_dup 1) (match_dup 2)) - (const_int 5)))] +(define_insn "vec_addsub3" + [(set (match_operand:VF_128_256 0 "register_operand" "=x,x") + (vec_merge:VF_128_256 + (minus:VF_128_256 + (match_operand:VF_128_256 1 "register_operand" "0,x") + (match_operand:VF_128_256 2 "vector_operand" "xBm, xm")) + (plus:VF_128_256 (match_dup 1) (match_dup 2)) + (const_int )))] "TARGET_SSE3" "@ - addsubps\t{%2, %0|%0, %2} - vaddsubps\t{%2, %1, %0|%0, %1, %2}" + addsub\t{%2, %0|%0, %2} + vaddsub\t{%2, %1, %0|%0, %1, %2}" [(set_attr "isa" "noavx,avx") (set_attr "type" "sseadd") + (set (attr "atom_unit") + (if_then_else + (match_test "mode == V2DFmode") + (const_string "complex") + (const_string "*"))) (set_attr "prefix" "orig,vex") - (set_attr "prefix_rep" "1,*") - (set_attr "mode" "V4SF")]) + (set (attr "prefix_rep") + (if_then_else + (and (match_test "mode == V4SFmode") + (eq_attr "alternative" "0")) + (const_string "1") + (const_string "*"))) + (set_attr "mode" "")]) (define_split [(set (match_operand:VF_128_256 0 "register_operand") -- cgit v1.1 From 836328b2c99f5b8d45dcca5797f162af322e74da Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 24 Jun 2021 15:39:26 +0200 Subject: i386: Add pack/unpack patterns for 64bit vectors [PR89021] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 2021-06-24 Uroš Bizjak gcc/ PR target/89021 * config/i386/i386-expand.c (ix86_expand_sse_unpack): Handle V8QI and V4HI modes. 
* config/i386/mmx.md (sse4_1_v4qiv4hi2): New insn pattern. (sse4_1_v4qiv4hi2): Ditto. (mmxpackmode): New mode attribute. (vec_pack_trunc_): New expander. (mmxunpackmode): New mode attribute. (vec_unpacks_lo_): New expander. (vec_unpacks_hi_): Ditto. (vec_unpacku_lo_): Ditto. (vec_unpacku_hi_): Ditto. * config/i386/i386.md (extsuffix): Move from ... * config/i386/sse.md: ... here. gcc/testsuite/ PR target/89021 * gcc.dg/vect/vect-nb-iter-ub-3.c (dg-additional-options): Add --param vect-epilogues-nomask=0. * gcc.target/i386/pr97249-1.c (foo): Add #pragma GCC unroll to avoid loop vectorization. (foo1): Ditto. (foo2): Ditto. --- gcc/config/i386/i386-expand.c | 46 ++++++++++++++++++++++++--- gcc/config/i386/i386.md | 3 ++ gcc/config/i386/mmx.md | 72 +++++++++++++++++++++++++++++++++++++++++++ gcc/config/i386/sse.md | 3 -- 4 files changed, 117 insertions(+), 7 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 2cb939e..e9763eb 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -5161,6 +5161,18 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p) else unpack = gen_sse4_1_sign_extendv2siv2di2; break; + case E_V8QImode: + if (unsigned_p) + unpack = gen_sse4_1_zero_extendv4qiv4hi2; + else + unpack = gen_sse4_1_sign_extendv4qiv4hi2; + break; + case E_V4HImode: + if (unsigned_p) + unpack = gen_sse4_1_zero_extendv2hiv2si2; + else + unpack = gen_sse4_1_sign_extendv2hiv2si2; + break; default: gcc_unreachable (); } @@ -5172,10 +5184,24 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p) } else if (high_p) { - /* Shift higher 8 bytes to lower 8 bytes. */ - tmp = gen_reg_rtx (V1TImode); - emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src), - GEN_INT (64))); + switch (GET_MODE_SIZE (imode)) + { + case 16: + /* Shift higher 8 bytes to lower 8 bytes. */ + tmp = gen_reg_rtx (V1TImode); + emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src), + GEN_INT (64))); + break; + case 8: + /* Shift higher 4 bytes to lower 4 bytes. */ + tmp = gen_reg_rtx (V1DImode); + emit_insn (gen_mmx_lshrv1di3 (tmp, gen_lowpart (V1DImode, src), + GEN_INT (32))); + break; + default: + gcc_unreachable (); + } + tmp = gen_lowpart (imode, tmp); } else @@ -5207,6 +5233,18 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p) else unpack = gen_vec_interleave_lowv4si; break; + case E_V8QImode: + if (high_p) + unpack = gen_mmx_punpckhbw; + else + unpack = gen_mmx_punpcklbw; + break; + case E_V4HImode: + if (high_p) + unpack = gen_mmx_punpckhwd; + else + unpack = gen_mmx_punpcklwd; + break; default: gcc_unreachable (); } diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 9043be3..9b619e2 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -1000,6 +1000,9 @@ (define_code_attr trunsuffix [(ss_truncate "s") (truncate "") (us_truncate "us")]) +;; Instruction suffix for SSE sign and zero extensions. +(define_code_attr extsuffix [(sign_extend "sx") (zero_extend "zx")]) + ;; Used in signed and unsigned fix. 
(define_code_iterator any_fix [fix unsigned_fix]) (define_code_attr fixsuffix [(fix "") (unsigned_fix "u")]) diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 7a827dc..e887f034 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -2639,6 +2639,78 @@ (set_attr "type" "mmxcvt,sselog,sselog") (set_attr "mode" "DI,TI,TI")]) +(define_insn "sse4_1_v4qiv4hi2" + [(set (match_operand:V4HI 0 "register_operand" "=Yr,*x,Yw") + (any_extend:V4HI + (vec_select:V4QI + (match_operand:V8QI 1 "register_operand" "Yr,*x,Yw") + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3)]))))] + "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE" + "%vpmovbw\t{%1, %0|%0, %1}" + [(set_attr "isa" "noavx,noavx,avx") + (set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "orig,orig,maybe_evex") + (set_attr "mode" "TI")]) + +(define_insn "sse4_1_v2hiv2si2" + [(set (match_operand:V2SI 0 "register_operand" "=Yr,*x,v") + (any_extend:V2SI + (vec_select:V2HI + (match_operand:V4HI 1 "register_operand" "Yr,*x,v") + (parallel [(const_int 0) (const_int 1)]))))] + "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE" + "%vpmovwd\t{%1, %0|%0, %1}" + [(set_attr "isa" "noavx,noavx,avx") + (set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "orig,orig,maybe_evex") + (set_attr "mode" "TI")]) + +;; Pack/unpack vector modes +(define_mode_attr mmxpackmode + [(V4HI "V8QI") (V2SI "V4HI")]) + +(define_expand "vec_pack_trunc_" + [(match_operand: 0 "register_operand") + (match_operand:MMXMODE24 1 "register_operand") + (match_operand:MMXMODE24 2 "register_operand")] + "TARGET_MMX_WITH_SSE" +{ + rtx op1 = gen_lowpart (mode, operands[1]); + rtx op2 = gen_lowpart (mode, operands[2]); + ix86_expand_vec_extract_even_odd (operands[0], op1, op2, 0); + DONE; +}) + +(define_mode_attr mmxunpackmode + [(V8QI "V4HI") (V4HI "V2SI")]) + +(define_expand "vec_unpacks_lo_" + [(match_operand: 0 "register_operand") + (match_operand:MMXMODE12 1 "register_operand")] + "TARGET_MMX_WITH_SSE" + "ix86_expand_sse_unpack (operands[0], operands[1], false, false); DONE;") + +(define_expand "vec_unpacks_hi_" + [(match_operand: 0 "register_operand") + (match_operand:MMXMODE12 1 "register_operand")] + "TARGET_MMX_WITH_SSE" + "ix86_expand_sse_unpack (operands[0], operands[1], false, true); DONE;") + +(define_expand "vec_unpacku_lo_" + [(match_operand: 0 "register_operand") + (match_operand:MMXMODE12 1 "register_operand")] + "TARGET_MMX_WITH_SSE" + "ix86_expand_sse_unpack (operands[0], operands[1], true, false); DONE;") + +(define_expand "vec_unpacku_hi_" + [(match_operand: 0 "register_operand") + (match_operand:MMXMODE12 1 "register_operand")] + "TARGET_MMX_WITH_SSE" + "ix86_expand_sse_unpack (operands[0], operands[1], true, true); DONE;") + (define_insn "*mmx_pinsrd" [(set (match_operand:V2SI 0 "register_operand" "=x,Yv") (vec_merge:V2SI diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 2d29877..e4f01e6 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -976,9 +976,6 @@ [(V8SI "si") (V8SF "ps") (V4DF "pd") (V16SI "si") (V16SF "ps") (V8DF "pd")]) -;; Instruction suffix for sign and zero extensions. -(define_code_attr extsuffix [(sign_extend "sx") (zero_extend "zx")]) - ;; i128 for integer vectors and TARGET_AVX2, f128 otherwise. ;; i64x4 or f64x4 for 512bit modes. 
(define_mode_attr i128 -- cgit v1.1 From 980e278dbe5b50dc5a856ea627359c521f1cda53 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Thu, 24 Jun 2021 16:14:13 +0800 Subject: Revert x86_order_regs_for_local_alloc changes in r12-1669. Still put general regs as first alloca order. gcc/ChangeLog: PR target/101185 * config/i386/i386.c (x86_order_regs_for_local_alloc): Revert r12-1669. gcc/testsuite/ChangeLog PR target/101185 * gcc.target/i386/bitwise_mask_op-3.c: Add xfail to temporarily avoid regression, eventually xfail should be removed. --- gcc/config/i386/i386.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 3d5883b..c71c9e6 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -20476,15 +20476,6 @@ x86_order_regs_for_local_alloc (void) int pos = 0; int i; - /* When allocano cost of GENERAL_REGS is same as MASK_REGS, allocate - MASK_REGS first since it has already been disparaged. This is for - testcase bitwise_mask_op3.c where the input is allocated as mask - registers, then mask bitwise instructions should be used there. - Refer to pr101142. */ - /* Mask register. */ - for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++) - reg_alloc_order [pos++] = i; - /* First allocate the local general purpose registers. */ for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) if (GENERAL_REGNO_P (i) && call_used_or_fixed_reg_p (i)) @@ -20511,6 +20502,10 @@ x86_order_regs_for_local_alloc (void) for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++) reg_alloc_order [pos++] = i; + /* Mask register. */ + for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++) + reg_alloc_order [pos++] = i; + /* x87 registers. */ if (TARGET_SSE_MATH) for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++) -- cgit v1.1 From 3a50aed09edc5e69080a0e49851acdb874227256 Mon Sep 17 00:00:00 2001 From: Jeff Law Date: Fri, 25 Jun 2021 09:22:28 -0400 Subject: Use right shifts to eliminate redundant test/compare insns on the H8 gcc/ * config/h8300/h8300.c (select_cc_mode): Handle ASHIFTRT and LSHIFTRT. --- gcc/config/h8300/h8300.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/h8300/h8300.c b/gcc/config/h8300/h8300.c index 511c2b2..d8b4bfc 100644 --- a/gcc/config/h8300/h8300.c +++ b/gcc/config/h8300/h8300.c @@ -1947,9 +1947,10 @@ h8300_select_cc_mode (enum rtx_code cond, rtx op0, rtx op1) if (op1 == const0_rtx && (cond == EQ || cond == NE || cond == LT || cond == GE) && (GET_CODE (op0) == PLUS || GET_CODE (op0) == MINUS - || GET_CODE (op0) == NEG || GET_CODE (op0) == AND - || GET_CODE (op0) == IOR || GET_CODE (op0) == XOR - || GET_CODE (op0) == NOT || GET_CODE (op0) == ASHIFT + || GET_CODE (op0) == NEG || GET_CODE (op0) == AND + || GET_CODE (op0) == IOR || GET_CODE (op0) == XOR + || GET_CODE (op0) == NOT || GET_CODE (op0) == ASHIFT + || GET_CODE (op0) == ASHIFTRT || GET_CODE (op0) == LSHIFTRT || GET_CODE (op0) == MULT || GET_CODE (op0) == SYMBOL_REF || GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND || REG_P (op0) || MEM_P (op0))) -- cgit v1.1 From 3966726333b2a4cf54333549c8331d833364266e Mon Sep 17 00:00:00 2001 From: David Edelsohn Date: Thu, 24 Jun 2021 15:40:25 -0400 Subject: aix: Add AIX 7.3 configuration and SPDX License Identifiers. The anticipated release of AIX 7.3 has been announced. This patch adds the configuration bits based on AIX 7.2 configuration. gcc/ChangeLog: * config.gcc: Add SPDX License Identifier. 
(powerpc-ibm-aix789): Default to aix73.h. (powerpc-ibm-aix7.2.*.*): New stanza. * config/rs6000/aix72.h: Add SPDX License Identifier. * config/rs6000/aix73.h: New file. --- gcc/config/rs6000/aix72.h | 5 +- gcc/config/rs6000/aix73.h | 294 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 297 insertions(+), 2 deletions(-) create mode 100644 gcc/config/rs6000/aix73.h (limited to 'gcc/config') diff --git a/gcc/config/rs6000/aix72.h b/gcc/config/rs6000/aix72.h index 4cd27e3..a497a7d 100644 --- a/gcc/config/rs6000/aix72.h +++ b/gcc/config/rs6000/aix72.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-3.0-or-later /* Definitions of target machine for GNU compiler, for IBM RS/6000 POWER running AIX V7.2. Copyright (C) 2002-2021 Free Software Foundation, Inc. @@ -124,7 +125,7 @@ do { \ %{mpe: -I%R/usr/lpp/ppe.poe/include} \ %{pthread: -D_THREAD_SAFE}" -/* The GNU C++ standard library requires that these macros be +/* The GNU C++ standard library requires that these macros be defined. Synchronize with libstdc++ os_defines.h. */ #define CPLUSPLUS_CPP_SPEC_COMMON \ "-D_ALL_SOURCE -D__COMPATMATH__ \ @@ -254,7 +255,7 @@ do { \ #define LD_INIT_SWITCH "-binitfini" #ifndef _AIX52 -extern long long int atoll(const char *); +extern long long int atoll(const char *); #endif /* This target uses the aix64.opt file. */ diff --git a/gcc/config/rs6000/aix73.h b/gcc/config/rs6000/aix73.h new file mode 100644 index 0000000..c707c7e --- /dev/null +++ b/gcc/config/rs6000/aix73.h @@ -0,0 +1,294 @@ +// SPDX-License-Identifier: GPL-3.0-or-later +/* Definitions of target machine for GNU compiler, + for IBM RS/6000 POWER running AIX V7.3. + Copyright (C) 2002-2021 Free Software Foundation, Inc. + Contributed by David Edelsohn (edelsohn@gnu.org). + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +/* The macro SUBTARGET_OVERRIDE_OPTIONS is provided for subtargets, to + get control in TARGET_OPTION_OVERRIDE. */ + +#define SUBTARGET_OVERRIDE_OPTIONS \ +do { \ + if (TARGET_64BIT && ! TARGET_POWERPC64) \ + { \ + rs6000_isa_flags |= OPTION_MASK_POWERPC64; \ + warning (0, "%<-maix64%> requires PowerPC64 architecture remain enabled"); \ + } \ + if (TARGET_SOFT_FLOAT && TARGET_LONG_DOUBLE_128) \ + { \ + rs6000_long_double_type_size = 64; \ + if (global_options_set.x_rs6000_long_double_type_size) \ + warning (0, "soft-float and long-double-128 are incompatible"); \ + } \ + if (TARGET_POWERPC64 && ! 
TARGET_64BIT) \ + { \ + error ("%<-maix64%> required: 64-bit computation with 32-bit addressing not yet supported"); \ + } \ + if ((rs6000_isa_flags_explicit \ + & OPTION_MASK_MINIMAL_TOC) != 0) \ + { \ + if (global_options_set.x_rs6000_current_cmodel \ + && rs6000_current_cmodel != CMODEL_SMALL) \ + error ("%<-mcmodel%> incompatible with other toc options"); \ + SET_CMODEL (CMODEL_SMALL); \ + } \ + if (rs6000_current_cmodel != CMODEL_SMALL) \ + { \ + TARGET_NO_FP_IN_TOC = 1; \ + TARGET_NO_SUM_IN_TOC = 1; \ + } \ + if (rs6000_current_cmodel == CMODEL_MEDIUM) \ + { \ + rs6000_current_cmodel = CMODEL_LARGE; \ + } \ + if (! strcmp (lang_hooks.name, "GNU Go") \ + && TARGET_32BIT) \ + { \ + /* aix/ppc doesn't support -mvsx and -maltivec with Go */ \ + rs6000_isa_flags &= ~(OPTION_MASK_VSX | OPTION_MASK_ALTIVEC); \ + } \ + if (!global_options_set.x_dwarf_version) \ + /* AIX only supports DWARF 4. */ \ + dwarf_version = 4; \ +} while (0) + +#define ASM_SPEC32 "-a32" +#define ASM_SPEC64 "-a64" +#define ASM_SPEC_COMMON "-u %(asm_cpu)" + +/* Common ASM definitions used by ASM_SPEC amongst the various targets for + handling -mcpu=xxx switches. There is a parallel list in driver-rs6000.c to + provide the default assembler options if the user uses -mcpu=native, so if + you make changes here, make them there also. */ +#undef ASM_CPU_SPEC +#define ASM_CPU_SPEC \ +"%{mcpu=native: %(asm_cpu_native); \ + mcpu=power10: -mpwr10; \ + mcpu=power9: -mpwr9; \ + mcpu=power8: -mpwr8; \ + mcpu=power7: -mpwr7; \ + mcpu=power6x|mcpu=power6: -mpwr6; \ + mcpu=power5+: -mpwr5x; \ + mcpu=power5: -mpwr5; \ + mcpu=power4: -mpwr4; \ + mcpu=power3: -m620; \ + mcpu=powerpc: -mppc; \ + mcpu=rs64: -mppc; \ + mcpu=603: -m603; \ + mcpu=603e: -m603; \ + mcpu=604: -m604; \ + mcpu=604e: -m604; \ + mcpu=620: -m620; \ + mcpu=630: -m620; \ + mcpu=970|mcpu=G5: -m970; \ + !mcpu*: %(asm_default)} \ +-many" + +#undef ASM_DEFAULT_SPEC +#define ASM_DEFAULT_SPEC "-mpwr7" + +#undef TARGET_OS_CPP_BUILTINS +#define TARGET_OS_CPP_BUILTINS() \ + do \ + { \ + builtin_define ("_AIX43"); \ + builtin_define ("_AIX51"); \ + builtin_define ("_AIX52"); \ + builtin_define ("_AIX53"); \ + builtin_define ("_AIX61"); \ + builtin_define ("_AIX71"); \ + builtin_define ("_AIX72"); \ + builtin_define ("_AIX73"); \ + TARGET_OS_AIX_CPP_BUILTINS (); \ + } \ + while (0) + +#define CPP_SPEC32 "" +#define CPP_SPEC64 "-D__64BIT__" +#define CPP_SPEC_COMMON "%{posix: -D_POSIX_SOURCE} \ + %{ansi: -D_ANSI_C_SOURCE} \ + %{mpe: -I%R/usr/lpp/ppe.poe/include} \ + %{pthread: -D_THREAD_SAFE}" + +/* The GNU C++ standard library requires that these macros be + defined. Synchronize with libstdc++ os_defines.h. */ +#define CPLUSPLUS_CPP_SPEC_COMMON \ + "-D_ALL_SOURCE -D__COMPATMATH__ \ + %{mpe: -I%R/usr/lpp/ppe.poe/include} \ + %{pthread: -D_THREAD_SAFE}" + +#define RS6000_CPU(NAME, CPU, FLAGS) +#include "rs6000-cpus.def" +#undef RS6000_CPU + +#undef TARGET_DEFAULT +#ifdef RS6000_BI_ARCH +#define TARGET_DEFAULT (ISA_2_6_MASKS_EMBEDDED | MASK_POWERPC64 | MASK_64BIT) +#else +#define TARGET_DEFAULT ISA_2_6_MASKS_EMBEDDED +#endif + +#undef PROCESSOR_DEFAULT +#define PROCESSOR_DEFAULT PROCESSOR_POWER7 +#undef PROCESSOR_DEFAULT64 +#define PROCESSOR_DEFAULT64 PROCESSOR_POWER7 + +/* AIX 7.2 kernel and assembler have necessary support for Altivec and VSX. 
*/ +#undef OS_MISSING_ALTIVEC + +/* Define this macro as a C expression for the initializer of an + array of string to tell the driver program which options are + defaults for this target and thus do not need to be handled + specially when using `MULTILIB_OPTIONS'. + + Do not define this macro if `MULTILIB_OPTIONS' is not defined in + the target makefile fragment or if none of the options listed in + `MULTILIB_OPTIONS' are set by default. *Note Target Fragment::. */ + +#undef MULTILIB_DEFAULTS + +#define DEFAULT_ARCH64_P (TARGET_DEFAULT & MASK_64BIT) + +#define LIB_SPEC32 "%{!shared:%{g*:-lg}}" +#define LIB_SPEC64 "" +#define LIB_SPEC_COMMON "%{pg:-L%R/lib/profiled -L%R/usr/lib/profiled}\ + %{p:-L%R/lib/profiled -L%R/usr/lib/profiled}\ + %{fprofile-arcs|fprofile-generate*|coverage:-lpthreads}\ + %{mpe:-L%R/usr/lpp/ppe.poe/lib -lmpi -lvtd}\ + %{mlong-double-128:-lc128}\ + %{pthread:-lpthreads} -lc" + +#define LINK_SPEC32 "%{!shared:%{g*: %(link_libg) }} -b32" +#define LINK_SPEC64 "-b64" +#define LINK_SPEC_COMMON "-bpT:0x10000000 -bpD:0x20000000 %{!r:-btextro}\ + %{static:-bnso %(link_syscalls) } %{shared:-bM:SRE %{!e:-bnoentry}}\ + %{mpe:-binitfini:poe_remote_main} " + +#undef STARTFILE_SPEC +#if DEFAULT_ARCH64_P +#define STARTFILE_SPEC "%{!shared:\ + %{!maix32:%{pg:gcrt0_64%O%s;:%{p:mcrt0_64%O%s;:crt0_64%O%s}};:\ + %{pthread:%{pg:gcrt0_r%O%s;:%{p:mcrt0_r%O%s;:crt0_r%O%s}};:\ + %{pg:gcrt0%O%s;:%{p:mcrt0%O%s;:crt0%O%s}}}}}\ + %{!maix32:%{shared:crtcxa_64_s%O%s;:crtcxa_64%O%s} crtdbase_64%O%s;:\ + %{shared:crtcxa_s%O%s;:crtcxa%O%s} crtdbase%O%s}" +#else +#define STARTFILE_SPEC "%{!shared:\ + %{maix64:%{pg:gcrt0_64%O%s;:%{p:mcrt0_64%O%s;:crt0_64%O%s}};:\ + %{pthread:%{pg:gcrt0_r%O%s;:%{p:mcrt0_r%O%s;:crt0_r%O%s}};:\ + %{pg:gcrt0%O%s;:%{p:mcrt0%O%s;:crt0%O%s}}}}}\ + %{maix64:%{shared:crtcxa_64_s%O%s;:crtcxa_64%O%s} crtdbase_64%O%s;:\ + %{shared:crtcxa_s%O%s;:crtcxa%O%s} crtdbase%O%s}" +#endif + + +#undef ASM_SPEC +#undef CPP_SPEC +#undef CPLUSPLUS_CPP_SPEC +#undef LIB_SPEC +#undef LINK_SPEC + +#if DEFAULT_ARCH64_P +#define ASM_SPEC "%{maix32:%(asm_spec32);:%(asm_spec64)} %(asm_spec_common)" +#define CPP_SPEC "%{maix32:%(cpp_spec32);:%(cpp_spec64)} %(cpp_spec_common)" +#define CPLUSPLUS_CPP_SPEC "%{maix32:%(cpp_spec32);:%(cpp_spec64)} %(cplusplus_cpp_spec_common)" +#define LIB_SPEC "%{maix32:%(lib_spec32);:%(lib_spec64)} %(lib_spec_common)" +#define LINK_SPEC "%{maix32:%(link_spec32);:%(link_spec64)} %(link_spec_common)" +#else +#define ASM_SPEC "%{maix64:%(asm_spec64);:%(asm_spec32)} %(asm_spec_common)" +#define CPP_SPEC "%{maix64:%(cpp_spec64);:%(cpp_spec32)} %(cpp_spec_common)" +#define CPLUSPLUS_CPP_SPEC "%{maix64:%(cpp_spec64);:%(cpp_spec32)} %(cplusplus_cpp_spec_common)" +#define LIB_SPEC "%{maix64:%(lib_spec64);:%(lib_spec32)} %(lib_spec_common)" +#define LINK_SPEC "%{maix64:%(link_spec64);:%(link_spec32)} %(link_spec_common)" +#endif + +#undef SUBTARGET_EXTRA_SPECS +#define SUBTARGET_EXTRA_SPECS \ + { "asm_spec_common", ASM_SPEC_COMMON }, \ + { "asm_spec32", ASM_SPEC32 }, \ + { "asm_spec64", ASM_SPEC64 }, \ + { "cpp_spec_common", CPP_SPEC_COMMON }, \ + { "cplusplus_cpp_spec_common", CPLUSPLUS_CPP_SPEC_COMMON }, \ + { "cpp_spec32", CPP_SPEC32 }, \ + { "cpp_spec64", CPP_SPEC64 }, \ + { "lib_spec_common", LIB_SPEC_COMMON }, \ + { "lib_spec32", LIB_SPEC32 }, \ + { "lib_spec64", LIB_SPEC64 }, \ + { "link_spec_common", LINK_SPEC_COMMON }, \ + { "link_spec32", LINK_SPEC32 }, \ + { "link_spec64", LINK_SPEC64 }, + +/* AIX V5 typedefs ptrdiff_t as "long" while earlier releases used "int". 
*/ + +#undef PTRDIFF_TYPE +#define PTRDIFF_TYPE "long int" + +/* Type used for wchar_t, as a string used in a declaration. */ +#undef WCHAR_TYPE +#define WCHAR_TYPE (!TARGET_64BIT ? "short unsigned int" : "unsigned int") + +/* Width of wchar_t in bits. */ +#undef WCHAR_TYPE_SIZE +#define WCHAR_TYPE_SIZE (!TARGET_64BIT ? 16 : 32) + +/* AIX 4.2 and above provides initialization and finalization function + support from linker command line. */ +#undef HAS_INIT_SECTION +#define HAS_INIT_SECTION + +#undef LD_INIT_SWITCH +#define LD_INIT_SWITCH "-binitfini" + +#ifndef _AIX52 +extern long long int atoll(const char *); +#endif + +/* This target uses the aix64.opt file. */ +#define TARGET_USES_AIX64_OPT 1 + +/* Large TOC Support */ +#ifdef HAVE_LD_LARGE_TOC +#undef TARGET_CMODEL +#define TARGET_CMODEL rs6000_current_cmodel +#define SET_CMODEL(opt) rs6000_current_cmodel = opt +#else +#define SET_CMODEL(opt) do {} while (0) +#endif + +/* This target defines SUPPORTS_WEAK and TARGET_ASM_NAMED_SECTION, + but does not have crtbegin/end. */ + +#define TARGET_AIX_VERSION 72 + +/* AIX 7.2 supports DWARF3+ debugging. */ +#define DWARF2_DEBUGGING_INFO 1 +#define PREFERRED_DEBUGGING_TYPE DWARF2_DEBUG +#define DEBUG_INFO_SECTION "0x10000" +#define DEBUG_LINE_SECTION "0x20000" +#define DEBUG_PUBNAMES_SECTION "0x30000" +#define DEBUG_PUBTYPES_SECTION "0x40000" +#define DEBUG_ARANGES_SECTION "0x50000" +#define DEBUG_ABBREV_SECTION "0x60000" +#define DEBUG_STR_SECTION "0x70000" +#define DEBUG_RANGES_SECTION "0x80000" +#define DEBUG_LOC_SECTION "0x90000" +#define DEBUG_FRAME_SECTION "0xA0000" +#define DEBUG_MACINFO_SECTION "0xB0000" +#define DEBUG_MACRO_SECTION "0xB0000" + -- cgit v1.1 From 28560c6d4043d8f6ac570f35fb84e952e9c719fe Mon Sep 17 00:00:00 2001 From: liuhongt Date: Fri, 21 May 2021 09:48:18 +0800 Subject: Fold blendv builtins into gimple. Fold __builtin_ia32_pblendvb128 (a, b, c) as VEC_COND_EXPR (c < 0, b, a), similar for float version but with mask operand VIEW_CONVERT_EXPR to same sized integer vectype. gcc/ChangeLog: * config/i386/i386-builtin.def (IX86_BUILTIN_BLENDVPD256, IX86_BUILTIN_BLENDVPS256, IX86_BUILTIN_PBLENDVB256, IX86_BUILTIN_BLENDVPD, IX86_BUILTIN_BLENDVPS, IX86_BUILTIN_PBLENDVB128): Replace icode with CODE_FOR_nothing. * config/i386/i386.c (ix86_gimple_fold_builtin): Fold blendv builtins. * config/i386/sse.md (*_pblendvb_lt_subreg_not): New pre_reload splitter. gcc/testsuite/ChangeLog: * gcc.target/i386/funcspec-8.c: Replace __builtin_ia32_blendvpd with __builtin_ia32_roundps_az. * gcc.target/i386/blendv-1.c: New test. * gcc.target/i386/blendv-2.c: New test. 
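At the source level the fold is visible to intrinsics users as well; a
minimal sketch (function name made up, needs -msse4.1):

  #include <immintrin.h>

  __m128i
  sel (__m128i a, __m128i b, __m128i mask)
  {
    /* Formerly kept as an opaque __builtin_ia32_pblendvb128 call until
       expand; now folded in gimple to a VEC_COND_EXPR on mask < 0.  */
    return _mm_blendv_epi8 (a, b, mask);
  }

Folding this early lets later gimple optimizations see through the blend,
for example combining it with the comparison that produced the mask.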
--- gcc/config/i386/i386-builtin.def | 12 ++++++------ gcc/config/i386/i386.c | 37 +++++++++++++++++++++++++++++++++++++ gcc/config/i386/sse.md | 22 ++++++++++++++++++++++ 3 files changed, 65 insertions(+), 6 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def index ea79e0b..1cc0cc6 100644 --- a/gcc/config/i386/i386-builtin.def +++ b/gcc/config/i386/i386-builtin.def @@ -902,13 +902,13 @@ BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, CODE_FOR_ssse3_palignrdi, /* SSE4.1 */ BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT) BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT) -BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF) -BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF) +BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_nothing, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF) +BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_nothing, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF) BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT) BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT) BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT) BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT) -BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI) +BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_nothing, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI) BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT) BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI) @@ -1028,8 +1028,8 @@ BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpe BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT) BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT) -BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF) -BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF) +BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF) +BDESC 
(OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF) BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT) BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT) BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT) @@ -1154,7 +1154,7 @@ BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI) BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI) -BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI) +BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_nothing, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI) BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT) BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_nothing, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_nothing, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI) diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index c71c9e6..a93128f 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -17983,6 +17983,43 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi) } break; + case IX86_BUILTIN_PBLENDVB128: + case IX86_BUILTIN_PBLENDVB256: + case IX86_BUILTIN_BLENDVPS: + case IX86_BUILTIN_BLENDVPD: + case IX86_BUILTIN_BLENDVPS256: + case IX86_BUILTIN_BLENDVPD256: + gcc_assert (n_args == 3); + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + arg2 = gimple_call_arg (stmt, 2); + if (gimple_call_lhs (stmt)) + { + location_t loc = gimple_location (stmt); + tree type = TREE_TYPE (arg2); + gimple_seq stmts = NULL; + if (VECTOR_FLOAT_TYPE_P (type)) + { + tree itype = GET_MODE_INNER (TYPE_MODE (type)) == E_SFmode + ? 
intSI_type_node : intDI_type_node; + type = get_same_sized_vectype (itype, type); + arg2 = gimple_build (&stmts, VIEW_CONVERT_EXPR, type, arg2); + } + tree zero_vec = build_zero_cst (type); + tree cmp_type = truth_type_for (type); + tree cmp = gimple_build (&stmts, LT_EXPR, cmp_type, arg2, zero_vec); + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); + gimple *g = gimple_build_assign (gimple_call_lhs (stmt), + VEC_COND_EXPR, cmp, + arg1, arg0); + gimple_set_location (g, loc); + gsi_replace (gsi, g, false); + } + else + gsi_replace (gsi, gimple_build_nop (), false); + return true; + + case IX86_BUILTIN_PCMPEQB128: case IX86_BUILTIN_PCMPEQW128: case IX86_BUILTIN_PCMPEQD128: diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index e4f01e6..3100635 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -17898,6 +17898,28 @@ (set_attr "btver2_decode" "vector,vector,vector") (set_attr "mode" "")]) +(define_insn_and_split "*_pblendvb_lt_subreg_not" + [(set (match_operand:VI1_AVX2 0 "register_operand") + (unspec:VI1_AVX2 + [(match_operand:VI1_AVX2 2 "vector_operand") + (match_operand:VI1_AVX2 1 "register_operand") + (lt:VI1_AVX2 + (subreg:VI1_AVX2 + (not (match_operand 3 "register_operand")) 0) + (match_operand:VI1_AVX2 4 "const0_operand"))] + UNSPEC_BLENDV))] + "TARGET_SSE4_1 + && GET_MODE_CLASS (GET_MODE (operands[3])) == MODE_VECTOR_INT + && GET_MODE_SIZE (GET_MODE (operands[3])) == + && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 0) + (unspec:VI1_AVX2 + [(match_dup 1) (match_dup 2) + (lt:VI1_AVX2 (match_dup 3) (match_dup 4))] UNSPEC_BLENDV))] + "operands[3] = gen_lowpart (mode, operands[3]);") + (define_insn "sse4_1_pblendw" [(set (match_operand:V8HI 0 "register_operand" "=Yr,*x,x") (vec_merge:V8HI -- cgit v1.1 From 3f1a08d9d731975d4061c306837ab28d52f37c7e Mon Sep 17 00:00:00 2001 From: liuhongt Date: Mon, 24 May 2021 10:57:52 +0800 Subject: For 128/256-bit vec_cond_expr, When mask operands is lt reg const0_rtx, blendv can be used instead of avx512 mask. gcc/ChangeLog: PR target/100648 * config/i386/sse.md (*avx_cmp3_lt): New define_insn_and_split. (*avx_cmp3_ltint): Ditto. (*avx2_pcmp3_3): Ditto. (*avx2_pcmp3_4): Ditto. (*avx2_pcmp3_5): Ditto. gcc/testsuite/ChangeLog: PR target/100648 * g++.target/i386/avx2-pr54700-2.C: Adjust testcase. * g++.target/i386/avx512vl-pr54700-1a.C: New test. * g++.target/i386/avx512vl-pr54700-1b.C: New test. * g++.target/i386/avx512vl-pr54700-2a.C: New test. * g++.target/i386/avx512vl-pr54700-2b.C: New test. * gcc.target/i386/avx512vl-pr100648.c: New test. * gcc.target/i386/avx512vl-blendv-1.c: New test. * gcc.target/i386/avx512vl-blendv-2.c: New test. 
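A sketch of the kind of scalar source this is aimed at (assuming
-O3 -mavx512vl; the committed testcases may differ in detail):

  void
  f (float *restrict r, const float *restrict a, const float *restrict b,
     const int *restrict m, int n)
  {
    for (int i = 0; i < n; i++)
      r[i] = m[i] < 0 ? a[i] : b[i];
  }

The vectorized VEC_COND_EXPR only tests the sign bit of m, so the new
pre-reload splitters can emit vblendvps on the mask vector directly
instead of a k-register compare followed by a masked move.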
--- gcc/config/i386/sse.md | 152 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 152 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 3100635..ffcc0c8 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -3048,6 +3048,68 @@ UNSPEC_PCMP))] "operands[5] = GEN_INT (INTVAL (operands[5]) ^ 4);") +(define_insn_and_split "*avx_cmp3_lt" + [(set (match_operand:VF_128_256 0 "register_operand") + (vec_merge:VF_128_256 + (match_operand:VF_128_256 1 "vector_operand") + (match_operand:VF_128_256 2 "vector_operand") + (unspec: + [(match_operand: 3 "register_operand") + (match_operand: 4 "const0_operand") + (match_operand:SI 5 "const_0_to_7_operand")] + UNSPEC_PCMP)))] + "TARGET_AVX512VL && ix86_pre_reload_split () + /* LT or GE 0 */ + && ((INTVAL (operands[5]) == 1 && !MEM_P (operands[2])) + || (INTVAL (operands[5]) == 5 && !MEM_P (operands[1])))" + "#" + "&& 1" + [(set (match_dup 0) + (unspec:VF_128_256 + [(match_dup 2) + (match_dup 1) + (lt:VF_128_256 + (match_dup 3) + (match_dup 4))] + UNSPEC_BLENDV))] +{ + if (INTVAL (operands[5]) == 5) + std::swap (operands[1], operands[2]); +}) + +(define_insn_and_split "*avx_cmp3_ltint" + [(set (match_operand:VI48_AVX 0 "register_operand") + (vec_merge:VI48_AVX + (match_operand:VI48_AVX 1 "vector_operand") + (match_operand:VI48_AVX 2 "vector_operand") + (unspec: + [(match_operand:VI48_AVX 3 "register_operand") + (match_operand:VI48_AVX 4 "const0_operand") + (match_operand:SI 5 "const_0_to_7_operand")] + UNSPEC_PCMP)))] + "TARGET_AVX512VL && ix86_pre_reload_split () + /* LT or GE 0 */ + && ((INTVAL (operands[5]) == 1 && !MEM_P (operands[2])) + || (INTVAL (operands[5]) == 5 && !MEM_P (operands[1])))" + "#" + "&& 1" + [(set (match_dup 0) + (unspec: + [(match_dup 2) + (match_dup 1) + (subreg: + (lt:VI48_AVX + (match_dup 3) + (match_dup 4)) 0)] + UNSPEC_BLENDV))] +{ + if (INTVAL (operands[5]) == 5) + std::swap (operands[1], operands[2]); + operands[0] = gen_lowpart (mode, operands[0]); + operands[1] = gen_lowpart (mode, operands[1]); + operands[2] = gen_lowpart (mode, operands[2]); +}) + (define_insn "avx_vmcmp3" [(set (match_operand:VF_128 0 "register_operand" "=x") (vec_merge:VF_128 @@ -13063,6 +13125,96 @@ DONE; }) +(define_insn_and_split "*avx2_pcmp3_3" + [(set (match_operand:VI1_AVX2 0 "register_operand") + (vec_merge:VI1_AVX2 + (match_operand:VI1_AVX2 1 "vector_operand") + (match_operand:VI1_AVX2 2 "vector_operand") + (unspec: + [(match_operand:VI1_AVX2 3 "register_operand") + (match_operand:VI1_AVX2 4 "const0_operand") + (match_operand:SI 5 "const_0_to_7_operand")] + UNSPEC_PCMP)))] + "TARGET_AVX512VL && ix86_pre_reload_split () + /* LT or GE 0 */ + && ((INTVAL (operands[5]) == 1 && !MEM_P (operands[2])) + || (INTVAL (operands[5]) == 5 && !MEM_P (operands[1])))" + "#" + "&& 1" + [(set (match_dup 0) + (unspec:VI1_AVX2 + [(match_dup 2) + (match_dup 1) + (lt:VI1_AVX2 + (match_dup 3) + (match_dup 4))] + UNSPEC_BLENDV))] +{ + if (INTVAL (operands[5]) == 5) + std::swap (operands[1], operands[2]); +}) + +(define_insn_and_split "*avx2_pcmp3_4" + [(set (match_operand:VI1_AVX2 0 "register_operand") + (vec_merge:VI1_AVX2 + (match_operand:VI1_AVX2 1 "vector_operand") + (match_operand:VI1_AVX2 2 "vector_operand") + (unspec: + [(subreg:VI1_AVX2 (not (match_operand 3 "register_operand")) 0) + (match_operand:VI1_AVX2 4 "const0_operand") + (match_operand:SI 5 "const_0_to_7_operand")] + UNSPEC_PCMP)))] + "TARGET_AVX512VL && ix86_pre_reload_split () + && GET_MODE_CLASS (GET_MODE 
(operands[3])) == MODE_VECTOR_INT + && GET_MODE_SIZE (GET_MODE (operands[3])) == + /* LT or GE 0 */ + && ((INTVAL (operands[5]) == 1 && !MEM_P (operands[1])) + || (INTVAL (operands[5]) == 5 && !MEM_P (operands[2])))" + "#" + "&& 1" + [(set (match_dup 0) + (unspec:VI1_AVX2 + [(match_dup 1) + (match_dup 2) + (lt:VI1_AVX2 + (match_dup 3) + (match_dup 4))] + UNSPEC_BLENDV))] +{ + if (INTVAL (operands[5]) == 1) + std::swap (operands[1], operands[2]); + operands[3] = gen_lowpart (mode, operands[3]); +}) + +(define_insn_and_split "*avx2_pcmp3_5" + [(set (match_operand:VI1_AVX2 0 "register_operand") + (vec_merge:VI1_AVX2 + (match_operand:VI1_AVX2 1 "vector_operand") + (match_operand:VI1_AVX2 2 "vector_operand") + (unspec: + [(not:VI1_AVX2 (match_operand:VI1_AVX2 3 "register_operand")) + (match_operand:VI1_AVX2 4 "const0_operand") + (match_operand:SI 5 "const_0_to_7_operand")] + UNSPEC_PCMP)))] + "TARGET_AVX512VL && ix86_pre_reload_split () + /* LT or GE 0 */ + && ((INTVAL (operands[5]) == 1 && !MEM_P (operands[1])) + || (INTVAL (operands[5]) == 5 && !MEM_P (operands[2])))" + "#" + "&& 1" + [(set (match_dup 0) + (unspec:VI1_AVX2 + [(match_dup 1) + (match_dup 2) + (lt:VI1_AVX2 + (match_dup 3) + (match_dup 4))] + UNSPEC_BLENDV))] +{ + if (INTVAL (operands[5]) == 1) + std::swap (operands[1], operands[2]); +}) + (define_expand "_eq3" [(set (match_operand: 0 "register_operand") (unspec: -- cgit v1.1 From 22069036efda4661862aeee213859f1ee8511ab5 Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Wed, 23 Jun 2021 15:46:22 +0200 Subject: v850: add v850_can_inline_p target hook gcc/ChangeLog: * config/v850/v850.c (v850_option_override): Build default target node. (v850_can_inline_p): New. Allow MASK_PROLOG_FUNCTION to be ignored for inlining. (TARGET_CAN_INLINE_P): New. --- gcc/config/v850/v850.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/v850/v850.c b/gcc/config/v850/v850.c index e0e5005..371e602 100644 --- a/gcc/config/v850/v850.c +++ b/gcc/config/v850/v850.c @@ -3140,6 +3140,11 @@ v850_option_override (void) /* The RH850 ABI does not (currently) support the use of the CALLT instruction. */ if (! TARGET_GCC_ABI) target_flags |= MASK_DISABLE_CALLT; + + /* Save the initial options in case the user does function specific + options. */ + target_option_default_node = target_option_current_node + = build_target_option_node (&global_options, &global_options_set); } const char * @@ -3192,6 +3197,29 @@ v850_modes_tieable_p (machine_mode mode1, machine_mode mode2) return (mode1 == mode2 || (GET_MODE_SIZE (mode1) <= 4 && GET_MODE_SIZE (mode2) <= 4)); } + +static bool +v850_can_inline_p (tree caller, tree callee) +{ + tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller); + tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee); + + const unsigned HOST_WIDE_INT safe_flags = MASK_PROLOG_FUNCTION; + + if (!callee_tree) + callee_tree = target_option_default_node; + if (!caller_tree) + caller_tree = target_option_default_node; + if (callee_tree == caller_tree) + return true; + + cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree); + cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree); + + return ((caller_opts->x_target_flags & ~safe_flags) + == (callee_opts->x_target_flags & ~safe_flags)); +} + /* Initialize the GCC target structure. 
*/ @@ -3306,6 +3334,10 @@ v850_modes_tieable_p (machine_mode mode1, machine_mode mode2) #undef TARGET_HAVE_SPECULATION_SAFE_VALUE #define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed +#undef TARGET_CAN_INLINE_P +#define TARGET_CAN_INLINE_P v850_can_inline_p + + struct gcc_target targetm = TARGET_INITIALIZER; #include "gt-v850.h" -- cgit v1.1 From ad26c076aaa6f9af144c33c9c04c5dc8010ad156 Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Wed, 23 Jun 2021 15:48:28 +0200 Subject: v850: silent 2 warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Silents: /home/marxin/Programming/gcc/gcc/config/v850/v850.c: In function ‘char* construct_dispose_instruction(rtx)’: /home/marxin/Programming/gcc/gcc/config/v850/v850.c:2690:22: warning: ‘%s’ directive writing up to 99 bytes into a region of size between 79 and 89 [-Wformat-overflow=] 2690 | sprintf (buff, "dispose %d {%s}, r31", stack_bytes / 4, regs); | ^~~~~~~~~~~~~~~~~~~~~~ ~~~~ /home/marxin/Programming/gcc/gcc/config/v850/v850.c:2690:15: note: ‘sprintf’ output between 18 and 127 bytes into a destination of size 100 2690 | sprintf (buff, "dispose %d {%s}, r31", stack_bytes / 4, regs); | ~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /home/marxin/Programming/gcc/gcc/config/v850/v850.c: In function ‘char* construct_prepare_instruction(rtx)’: /home/marxin/Programming/gcc/gcc/config/v850/v850.c:2814:22: warning: ‘%s’ directive writing up to 99 bytes into a region of size 91 [-Wformat-overflow=] 2814 | sprintf (buff, "prepare {%s}, %d", regs, (- stack_bytes) / 4); | ^~~~~~~~~~~~~~~~~~ ~~~~ /home/marxin/Programming/gcc/gcc/config/v850/v850.c:2814:15: note: ‘sprintf’ output between 14 and 123 bytes into a destination of size 100 2814 | sprintf (buff, "prepare {%s}, %d", regs, (- stack_bytes) / 4); | ~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ gcc/ChangeLog: * config/v850/v850.c (construct_dispose_instruction): Allocate a bigger buffer. (construct_prepare_instruction): Likewise. --- gcc/config/v850/v850.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/v850/v850.c b/gcc/config/v850/v850.c index 371e602..4978faf 100644 --- a/gcc/config/v850/v850.c +++ b/gcc/config/v850/v850.c @@ -2583,7 +2583,7 @@ construct_dispose_instruction (rtx op) int stack_bytes; unsigned long int mask; int i; - static char buff[ 100 ]; /* XXX */ + static char buff[ 120 ]; /* XXX */ int use_callt = 0; if (count <= 2) @@ -2704,7 +2704,7 @@ construct_prepare_instruction (rtx op) int stack_bytes; unsigned long int mask; int i; - static char buff[ 100 ]; /* XXX */ + static char buff[ 120 ]; /* XXX */ int use_callt = 0; if (XVECLEN (op, 0) <= 1) -- cgit v1.1 From d5cf2b5db325fd5c053ca7bc8d6a54a06cd71124 Mon Sep 17 00:00:00 2001 From: Indu Bhagat Date: Thu, 20 May 2021 11:21:39 -0700 Subject: Enable BTF generation in the BPF backend This patch changes the BPF GCC backend in order to use the DWARF debug hooks and therefore enables the user to generate BTF debugging information with -gbtf. Generating BTF is crucial when compiling BPF programs, since the CO-RE (compile-once, run-everwhere) mechanism used by the kernel BPF loader relies on it. Note that since in eBPF it is not possible to unwind frames due to the restrictive nature of the target architecture, we are disabling the generation of CFA in this target. 2021-06-28 David Faust * config/bpf/bpf.c (bpf_expand_prologue): Do not mark insns as frame related. 
(bpf_expand_epilogue): Likewise. * config/bpf/bpf.h (DWARF2_FRAME_INFO): Define to 0. Do not define DBX_DEBUGGING_INFO. --- gcc/config/bpf/bpf.c | 4 ---- gcc/config/bpf/bpf.h | 12 ++---------- 2 files changed, 2 insertions(+), 14 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/bpf/bpf.c b/gcc/config/bpf/bpf.c index 126d4a2..e635f9e 100644 --- a/gcc/config/bpf/bpf.c +++ b/gcc/config/bpf/bpf.c @@ -349,7 +349,6 @@ bpf_expand_prologue (void) hard_frame_pointer_rtx, fp_offset - 8)); insn = emit_move_insn (mem, gen_rtx_REG (DImode, regno)); - RTX_FRAME_RELATED_P (insn) = 1; fp_offset -= 8; } } @@ -364,7 +363,6 @@ bpf_expand_prologue (void) { insn = emit_move_insn (stack_pointer_rtx, hard_frame_pointer_rtx); - RTX_FRAME_RELATED_P (insn) = 1; if (size > 0) { @@ -372,7 +370,6 @@ bpf_expand_prologue (void) gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (-size)))); - RTX_FRAME_RELATED_P (insn) = 1; } } } @@ -412,7 +409,6 @@ bpf_expand_epilogue (void) hard_frame_pointer_rtx, fp_offset - 8)); insn = emit_move_insn (gen_rtx_REG (DImode, regno), mem); - RTX_FRAME_RELATED_P (insn) = 1; fp_offset -= 8; } } diff --git a/gcc/config/bpf/bpf.h b/gcc/config/bpf/bpf.h index 80195ce..82be0c3 100644 --- a/gcc/config/bpf/bpf.h +++ b/gcc/config/bpf/bpf.h @@ -235,17 +235,9 @@ enum reg_class /**** Debugging Info ****/ -/* We cannot support DWARF2 because of the limitations of eBPF. */ +/* In eBPF it is not possible to unwind frames. Disable CFA. */ -/* elfos.h insists in using DWARF. Undo that here. */ -#ifdef DWARF2_DEBUGGING_INFO -# undef DWARF2_DEBUGGING_INFO -#endif -#ifdef PREFERRED_DEBUGGING_TYPE -# undef PREFERRED_DEBUGGING_TYPE -#endif - -#define DBX_DEBUGGING_INFO +#define DWARF2_FRAME_INFO 0 /**** Stack Layout and Calling Conventions. */ -- cgit v1.1 From dc6866b033b1b85f690bf32c0dae7d787cbf58f0 Mon Sep 17 00:00:00 2001 From: Jan-Benedict Glaw Date: Tue, 29 Jun 2021 09:23:07 +0200 Subject: pdp11: Fix signednedd warnings, remove "register" keywords gcc/ChangeLog: * config/pdp11/pdp11.h (ASM_OUTPUT_SKIP): Fix signedness warning. * config/pdp11/pdp11.c (pdp11_asm_print_operand_punct_valid_p): Remove "register" keyword. (pdp11_initial_elimination_offset) Remove unused variable. (pdp11_cmp_length) Ditto. (pdp11_insn_cost): Ditto, and fix signedness warning. --- gcc/config/pdp11/pdp11.c | 27 ++++++++++++--------------- gcc/config/pdp11/pdp11.h | 10 ++++++---- 2 files changed, 18 insertions(+), 19 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/pdp11/pdp11.c b/gcc/config/pdp11/pdp11.c index b663b43..4cab3ae 100644 --- a/gcc/config/pdp11/pdp11.c +++ b/gcc/config/pdp11/pdp11.c @@ -829,12 +829,12 @@ pdp11_asm_print_operand_punct_valid_p (unsigned char c) } void -print_operand_address (FILE *file, register rtx addr) +print_operand_address (FILE *file, rtx addr) { - register rtx breg; + rtx breg; rtx offset; int again = 0; - + retry: switch (GET_CODE (addr)) @@ -1160,12 +1160,11 @@ pdp11_addr_cost (rtx addr, machine_mode mode, addr_space_t as ATTRIBUTE_UNUSED, static int pdp11_insn_cost (rtx_insn *insn, bool speed) { - int base_cost, i; + int base_cost; rtx pat, set, dest, src, src2; machine_mode mode; - const char *fmt; enum rtx_code op; - + if (recog_memoized (insn) < 0) return 0; @@ -1462,24 +1461,24 @@ bool pushpop_regeq (rtx op, int regno) { rtx addr; - + /* False if not memory reference. */ if (GET_CODE (op) != MEM) return FALSE; - + /* Get the address of the memory reference. 
*/ addr = XEXP (op, 0); if (GET_CODE (addr) == MEM) addr = XEXP (addr, 0); - + switch (GET_CODE (addr)) { case PRE_DEC: case POST_INC: case PRE_MODIFY: case POST_MODIFY: - return REGNO (XEXP (addr, 0)) == regno; + return REGNO (XEXP (addr, 0)) == (unsigned) regno; default: return FALSE; } @@ -1771,8 +1770,7 @@ int pdp11_initial_elimination_offset (int from, int to) { /* Get the size of the register save area. */ - int spoff; - + if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM) return get_frame_size (); else if (from == ARG_POINTER_REGNUM && to == FRAME_POINTER_REGNUM) @@ -2106,15 +2104,14 @@ pdp11_cmp_length (rtx *operands, int words) { rtx inops[2]; rtx exops[4][2]; - rtx lb[1]; int i, len = 0; if (!reload_completed) return 2; - + inops[0] = operands[0]; inops[1] = operands[1]; - + pdp11_expand_operands (inops, exops, 2, words, NULL, big); for (i = 0; i < words; i++) diff --git a/gcc/config/pdp11/pdp11.h b/gcc/config/pdp11/pdp11.h index a21ae64..9bc5e08 100644 --- a/gcc/config/pdp11/pdp11.h +++ b/gcc/config/pdp11/pdp11.h @@ -618,10 +618,12 @@ extern int current_first_parm_offset; fprintf (FILE, "\t.even\n") #define ASM_OUTPUT_SKIP(FILE,SIZE) \ - if (TARGET_DEC_ASM) \ - fprintf (FILE, "\t.blkb\t%o\n", (SIZE) & 0xffff); \ - else \ - fprintf (FILE, "\t.=.+ %#o\n", (SIZE) & 0xffff); + do { \ + if (TARGET_DEC_ASM) \ + fprintf (FILE, "\t.blkb\t%o\n", (int) ((SIZE) & 0xffff)); \ + else \ + fprintf (FILE, "\t.=.+ %#o\n", (int) ((SIZE) & 0xffff)); \ + } while (0) /* This says how to output an assembler line to define a global common symbol. */ -- cgit v1.1 From 5c127c4cac308429cba483a2ac4e175c2ab26165 Mon Sep 17 00:00:00 2001 From: Julian Brown Date: Mon, 28 Jun 2021 06:58:52 -0700 Subject: amdgcn: Mark s_mulk_i32 as clobbering SCC The s_mulk_i32 instruction sets the SCC status register according to whether the multiplication overflows, but that is not currently modelled in the GCN backend. AFAIK this is a latent bug and hasn't been noticed "in the wild", but it should be fixed. 2021-06-29 Julian Brown gcc/ * config/gcn/gcn.md (mulsi3): Make s_mulk_i32 variant clobber SCC. --- gcc/config/gcn/gcn.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md index b5f895a..cca4552 100644 --- a/gcc/config/gcn/gcn.md +++ b/gcc/config/gcn/gcn.md @@ -1371,10 +1371,13 @@ ; Vector multiply has vop3a encoding, but no corresponding vop2a, so no long ; immediate. +; The "s_mulk_i32" variant sets SCC to indicate overflow (which we don't care +; about here, but we need to indicate the clobbering). (define_insn "mulsi3" [(set (match_operand:SI 0 "register_operand" "= Sg,Sg, Sg, v") (mult:SI (match_operand:SI 1 "gcn_alu_operand" "%SgA, 0,SgA, v") - (match_operand:SI 2 "gcn_alu_operand" " SgA, J, B,vASv")))] + (match_operand:SI 2 "gcn_alu_operand" " SgA, J, B,vASv"))) + (clobber (match_scratch:BI 3 "=X,cs, X, X"))] "" "@ s_mul_i32\t%0, %1, %2 -- cgit v1.1 From 53b1d1691857a1d3e28566d05bb434fa555c4e8a Mon Sep 17 00:00:00 2001 From: Julian Brown Date: Thu, 17 Jun 2021 11:02:55 -0700 Subject: amdgcn: Add [us]mulsi3_highpart SGPR alternatives This patch splits the mulsi3_highpart pattern into an expander and register/immediate alternatives (to avoid meaningless sign/zero_extends on constants), and adds alternatives for SGPR high-part multiply instructions on GCN5+. 2021-06-29 Julian Brown gcc/ * config/gcn/gcn.md (mulsi3_highpart): Change to expander. (mulsi3_highpart_reg, mulsi3_highpart_imm): New patterns. 
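For reference, what the highpart patterns compute, written out in C
(signed variant shown; the unsigned variant is the same with uint32_t
and uint64_t):

  #include <stdint.h>

  static int32_t
  mulhi (int32_t a, int32_t b)
  {
    return (int32_t) (((int64_t) a * (int64_t) b) >> 32);
  }

On GCN5 and later this can now stay in scalar (SGPR) registers as a
single s_mul_hi instruction instead of being forced through the vector
unit and copied back.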
--- gcc/config/gcn/gcn.md | 62 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 55 insertions(+), 7 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md index cca4552..d1d4998 100644 --- a/gcc/config/gcn/gcn.md +++ b/gcc/config/gcn/gcn.md @@ -1394,20 +1394,68 @@ (define_code_attr iu [(sign_extend "i") (zero_extend "u")]) (define_code_attr e [(sign_extend "e") (zero_extend "")]) -(define_insn "mulsi3_highpart" - [(set (match_operand:SI 0 "register_operand" "= v") +(define_expand "mulsi3_highpart" + [(set (match_operand:SI 0 "register_operand" "") (truncate:SI (lshiftrt:DI (mult:DI (any_extend:DI - (match_operand:SI 1 "register_operand" "% v")) + (match_operand:SI 1 "register_operand" "")) (any_extend:DI - (match_operand:SI 2 "register_operand" "vSv"))) + (match_operand:SI 2 "gcn_alu_operand" ""))) (const_int 32))))] "" - "v_mul_hi0\t%0, %2, %1" - [(set_attr "type" "vop3a") - (set_attr "length" "8")]) +{ + if (can_create_pseudo_p () + && !TARGET_GCN5 + && !gcn_inline_immediate_operand (operands[2], SImode)) + operands[2] = force_reg (SImode, operands[2]); + + if (REG_P (operands[2])) + emit_insn (gen_mulsi3_highpart_reg (operands[0], operands[1], + operands[2])); + else + emit_insn (gen_mulsi3_highpart_imm (operands[0], operands[1], + operands[2])); + + DONE; +}) + +(define_insn "mulsi3_highpart_reg" + [(set (match_operand:SI 0 "register_operand" "=Sg, v") + (truncate:SI + (lshiftrt:DI + (mult:DI + (any_extend:DI + (match_operand:SI 1 "register_operand" "%Sg, v")) + (any_extend:DI + (match_operand:SI 2 "register_operand" "Sg,vSv"))) + (const_int 32))))] + "" + "@ + s_mul_hi0\t%0, %1, %2 + v_mul_hi0\t%0, %2, %1" + [(set_attr "type" "sop2,vop3a") + (set_attr "length" "4,8") + (set_attr "gcn_version" "gcn5,*")]) + +(define_insn "mulsi3_highpart_imm" + [(set (match_operand:SI 0 "register_operand" "=Sg,Sg,v") + (truncate:SI + (lshiftrt:DI + (mult:DI + (any_extend:DI + (match_operand:SI 1 "register_operand" "Sg,Sg,v")) + (match_operand:DI 2 "gcn_32bit_immediate_operand" "A, B,A")) + (const_int 32))))] + "TARGET_GCN5 || gcn_inline_immediate_operand (operands[2], SImode)" + "@ + s_mul_hi0\t%0, %1, %2 + s_mul_hi0\t%0, %1, %2 + v_mul_hi0\t%0, %2, %1" + [(set_attr "type" "sop2,sop2,vop3a") + (set_attr "length" "4,8,8") + (set_attr "gcn_version" "gcn5,gcn5,*")]) (define_insn "mulhisi3" [(set (match_operand:SI 0 "register_operand" "=v") -- cgit v1.1 From 8f332122589f97b9c974b168ca5b0b186296f0e4 Mon Sep 17 00:00:00 2001 From: Julian Brown Date: Tue, 29 Jun 2021 03:57:31 -0700 Subject: amdgcn: Add [us]mulsid3/muldi3 patterns This patch improves 64-bit multiplication for AMD GCN: patterns for unsigned and signed 32x32->64 bit multiplication have been added, and also 64x64->64 bit multiplication is now open-coded rather than calling a library function (which may be a win for code size as well as speed: the function calling sequence isn't particularly concise for GCN). This version of the patch uses define_insn_and_split in order to keep multiply operations together during RTL optimisations up to register allocation: this appears to produce more compact code via inspection on small test cases than the previous approach using an expander. The DImode multiply implementation is lost from libgcc if we build it for DImode/TImode rather than SImode/DImode, a change we make in a later patch in this series. 2021-06-29 Julian Brown gcc/ * config/gcn/gcn.md (mulsidi3, mulsidi3_reg, mulsidi3_imm, muldi3): Add patterns. 
--- gcc/config/gcn/gcn.md | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md index d1d4998..82f7a46 100644 --- a/gcc/config/gcn/gcn.md +++ b/gcc/config/gcn/gcn.md @@ -1457,6 +1457,100 @@ (set_attr "length" "4,8,8") (set_attr "gcn_version" "gcn5,gcn5,*")]) +(define_expand "mulsidi3" + [(set (match_operand:DI 0 "register_operand" "") + (mult:DI (any_extend:DI + (match_operand:SI 1 "register_operand" "")) + (any_extend:DI + (match_operand:SI 2 "nonmemory_operand" ""))))] + "" +{ + if (can_create_pseudo_p () + && !TARGET_GCN5 + && !gcn_inline_immediate_operand (operands[2], SImode)) + operands[2] = force_reg (SImode, operands[2]); + + if (REG_P (operands[2])) + emit_insn (gen_mulsidi3_reg (operands[0], operands[1], operands[2])); + else + emit_insn (gen_mulsidi3_imm (operands[0], operands[1], operands[2])); + + DONE; +}) + +(define_insn_and_split "mulsidi3_reg" + [(set (match_operand:DI 0 "register_operand" "=&Sg, &v") + (mult:DI (any_extend:DI + (match_operand:SI 1 "register_operand" "%Sg, v")) + (any_extend:DI + (match_operand:SI 2 "register_operand" "Sg,vSv"))))] + "" + "#" + "reload_completed" + [(const_int 0)] + { + rtx dstlo = gen_lowpart (SImode, operands[0]); + rtx dsthi = gen_highpart_mode (SImode, DImode, operands[0]); + emit_insn (gen_mulsi3 (dstlo, operands[1], operands[2])); + emit_insn (gen_mulsi3_highpart (dsthi, operands[1], operands[2])); + DONE; + } + [(set_attr "gcn_version" "gcn5,*")]) + +(define_insn_and_split "mulsidi3_imm" + [(set (match_operand:DI 0 "register_operand" "=&Sg,&Sg,&v") + (mult:DI (any_extend:DI + (match_operand:SI 1 "register_operand" "Sg, Sg, v")) + (match_operand:DI 2 "gcn_32bit_immediate_operand" + "A, B, A")))] + "TARGET_GCN5 || gcn_inline_immediate_operand (operands[2], SImode)" + "#" + "&& reload_completed" + [(const_int 0)] + { + rtx dstlo = gen_lowpart (SImode, operands[0]); + rtx dsthi = gen_highpart_mode (SImode, DImode, operands[0]); + emit_insn (gen_mulsi3 (dstlo, operands[1], operands[2])); + emit_insn (gen_mulsi3_highpart (dsthi, operands[1], operands[2])); + DONE; + } + [(set_attr "gcn_version" "gcn5,gcn5,*")]) + +(define_insn_and_split "muldi3" + [(set (match_operand:DI 0 "register_operand" "=&Sg,&Sg, &v,&v") + (mult:DI (match_operand:DI 1 "register_operand" "%Sg, Sg, v, v") + (match_operand:DI 2 "nonmemory_operand" "Sg, i,vSv, A"))) + (clobber (match_scratch:SI 3 "=&Sg,&Sg,&v,&v")) + (clobber (match_scratch:BI 4 "=cs, cs, X, X")) + (clobber (match_scratch:DI 5 "=X, X,cV,cV"))] + "" + "#" + "reload_completed" + [(const_int 0)] + { + rtx tmp = operands[3]; + rtx dsthi = gen_highpart_mode (SImode, DImode, operands[0]); + rtx op1lo = gcn_operand_part (DImode, operands[1], 0); + rtx op1hi = gcn_operand_part (DImode, operands[1], 1); + rtx op2lo = gcn_operand_part (DImode, operands[2], 0); + rtx op2hi = gcn_operand_part (DImode, operands[2], 1); + emit_insn (gen_umulsidi3 (operands[0], op1lo, op2lo)); + emit_insn (gen_mulsi3 (tmp, op1lo, op2hi)); + rtx add = gen_rtx_SET (dsthi, gen_rtx_PLUS (SImode, dsthi, tmp)); + rtx clob1 = gen_rtx_CLOBBER (VOIDmode, operands[4]); + rtx clob2 = gen_rtx_CLOBBER (VOIDmode, operands[5]); + add = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (3, add, clob1, clob2)); + emit_insn (add); + emit_insn (gen_mulsi3 (tmp, op1hi, op2lo)); + add = gen_rtx_SET (dsthi, gen_rtx_PLUS (SImode, dsthi, tmp)); + clob1 = gen_rtx_CLOBBER (VOIDmode, operands[4]); + clob2 = gen_rtx_CLOBBER (VOIDmode, operands[5]); + add 
= gen_rtx_PARALLEL (VOIDmode, gen_rtvec (3, add, clob1, clob2)); + emit_insn (add); + DONE; + } + [(set_attr "gcn_version" "gcn5,gcn5,*,*")]) + (define_insn "mulhisi3" [(set (match_operand:SI 0 "register_operand" "=v") (mult:SI -- cgit v1.1 From 0c06e46a81d86d70d788ca1a93d27b6902bd4dc1 Mon Sep 17 00:00:00 2001 From: Julian Brown Date: Thu, 17 Jun 2021 10:52:51 -0700 Subject: amdgcn: Add clrsbsi2/clrsbdi2 implementation This patch adds an open-coded implementation of the clrsb2 (count leading redundant sign bit) standard names using the GCN flbit_i* instructions for SImode and DImode. Those don't count exactly as we need, so we need a couple of other instructions to fix up the result afterwards. These patterns are lost from libgcc if we build it for DImode/TImode rather than SImode/DImode, a change we make in a later patch in this series. 2021-06-18 Julian Brown gcc/ * config/gcn/gcn.md (UNSPEC_FLBIT_INT): New unspec constant. (s_mnemonic): Add clrsb. (gcn_flbit_int): Add insn pattern for SImode/DImode. (clrsb2): Add expander for SImode/DImode. --- gcc/config/gcn/gcn.md | 40 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md index 82f7a46..ae7249a 100644 --- a/gcc/config/gcn/gcn.md +++ b/gcc/config/gcn/gcn.md @@ -81,7 +81,8 @@ UNSPEC_MOV_FROM_LANE63 UNSPEC_GATHER UNSPEC_SCATTER - UNSPEC_RCP]) + UNSPEC_RCP + UNSPEC_FLBIT_INT]) ;; }}} ;; {{{ Attributes @@ -338,7 +339,8 @@ [(not "not%b") (popcount "bcnt1_i32%b") (clz "flbit_i32%b") - (ctz "ff1_i32%b")]) + (ctz "ff1_i32%b") + (clrsb "flbit_i32%i")]) (define_code_attr revmnemonic [(minus "subrev%i") @@ -1611,6 +1613,40 @@ [(set_attr "type" "sop1") (set_attr "length" "4,8")]) +(define_insn "gcn_flbit_int" + [(set (match_operand:SI 0 "register_operand" "=Sg,Sg") + (unspec:SI [(match_operand:SIDI 1 "gcn_alu_operand" "SgA, B")] + UNSPEC_FLBIT_INT))] + "" + { + if (mode == SImode) + return "s_flbit_i32\t%0, %1"; + else + return "s_flbit_i32_i64\t%0, %1"; + } + [(set_attr "type" "sop1") + (set_attr "length" "4,8")]) + +(define_expand "clrsb2" + [(set (match_operand:SI 0 "register_operand" "") + (clrsb:SI (match_operand:SIDI 1 "gcn_alu_operand" "")))] + "" + { + rtx tmp = gen_reg_rtx (SImode); + /* FLBIT_I* counts sign or zero bits at the most-significant end of the + input register (and returns -1 for 0/-1 inputs). We want the number of + *redundant* bits (i.e. that value minus one), and an answer of 31/63 for + 0/-1 inputs. We can do that in three instructions... */ + emit_insn (gen_gcn_flbit_int (tmp, operands[1])); + emit_insn (gen_uminsi3 (tmp, tmp, + gen_int_mode (GET_MODE_BITSIZE (mode), + SImode))); + /* If we put this last, it can potentially be folded into a subsequent + arithmetic operation. */ + emit_insn (gen_subsi3 (operands[0], tmp, const1_rtx)); + DONE; + }) + ;; }}} ;; {{{ ALU: generic 32-bit binop -- cgit v1.1 From a8a730cd99184e62c4d026b8c775b96589a9c262 Mon Sep 17 00:00:00 2001 From: Julian Brown Date: Wed, 9 Jun 2021 06:18:23 -0700 Subject: amdgcn: Enable support for TImode for AMD GCN This patch enables support for TImode for AMD GCN, the lack of which is currently causing a number of test failures for the target and which is also needed to support "omp_depend_kind" for OpenMP 5.0, since that is implemented as a 128-bit integer. Several libgcc support routines are built by default for the "word size" of a machine, and also for "2 * word size" of the machine. 
The libgcc build for AMD GCN is changed so that it builds for a "word size" of 64 bits, in order to better match the (64-bit) host compiler. However it isn't really true that we have 64-bit words -- GCN has 32-bit registers, so changing UNITS_PER_WORD unconditionally would be the wrong thing to do. Changing this setting for libgcc (only) means that support routines are built for "single word" operations that are DImode (64 bits), and those for "double word" operations are built for TImode (128 bits). That leaves some gaps regarding previous operations that were built for a "single word" size of 32 bits and a "double word" size of 64 bits (generic code doesn't cover both alternatives for all operations that might be needed). Those gaps are filled in by this patch, or by the preceding patches in the series. 2021-06-18 Julian Brown gcc/ * config/gcn/gcn.c (gcn_init_libfuncs): New function. (TARGET_INIT_LIBFUNCS): Define target hook using above function. * config/gcn/gcn.h (UNITS_PER_WORD): Define to 8 for IN_LIBGCC2, 4 otherwise. (LIBGCC2_UNITS_PER_WORD, BITS_PER_WORD): Remove definitions. (MAX_FIXED_MODE_SIZE): Change to 128. libgcc/ * config/gcn/lib2-bswapti2.c: New file. * config/gcn/lib2-divmod-di.c: New file. * config/gcn/lib2-gcn.h (DItype, UDItype, TItype, UTItype): Add typedefs. (__divdi3, __moddi3, __udivdi3, __umoddi3): Add prototypes. * config/gcn/t-amdgcn (LIB2ADD): Add lib2-divmod-di.c and lib2-bswapti2.c. --- gcc/config/gcn/gcn.c | 30 ++++++++++++++++++++++++++++++ gcc/config/gcn/gcn.h | 11 +++++++---- 2 files changed, 37 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c index 54a1c0b..aa9d455 100644 --- a/gcc/config/gcn/gcn.c +++ b/gcc/config/gcn/gcn.c @@ -3610,6 +3610,34 @@ gcn_init_builtins (void) #endif } +/* Implement TARGET_INIT_LIBFUNCS. */ + +static void +gcn_init_libfuncs (void) +{ + /* BITS_PER_UNIT * 2 is 64 bits, which causes + optabs-libfuncs.c:gen_int_libfunc to omit TImode (i.e 128 bits) + libcalls that we need to support operations for that type. Initialise + them here instead. */ + set_optab_libfunc (udiv_optab, TImode, "__udivti3"); + set_optab_libfunc (umod_optab, TImode, "__umodti3"); + set_optab_libfunc (sdiv_optab, TImode, "__divti3"); + set_optab_libfunc (smod_optab, TImode, "__modti3"); + set_optab_libfunc (smul_optab, TImode, "__multi3"); + set_optab_libfunc (addv_optab, TImode, "__addvti3"); + set_optab_libfunc (subv_optab, TImode, "__subvti3"); + set_optab_libfunc (negv_optab, TImode, "__negvti2"); + set_optab_libfunc (absv_optab, TImode, "__absvti2"); + set_optab_libfunc (smulv_optab, TImode, "__mulvti3"); + set_optab_libfunc (ffs_optab, TImode, "__ffsti2"); + set_optab_libfunc (clz_optab, TImode, "__clzti2"); + set_optab_libfunc (ctz_optab, TImode, "__ctzti2"); + set_optab_libfunc (clrsb_optab, TImode, "__clrsbti2"); + set_optab_libfunc (popcount_optab, TImode, "__popcountti2"); + set_optab_libfunc (parity_optab, TImode, "__parityti2"); + set_optab_libfunc (bswap_optab, TImode, "__bswapti2"); +} + /* Expand the CMP_SWAP GCN builtins. We have our own versions that do not require taking the address of any object, other than the memory cell being operated on. 
@@ -6345,6 +6373,8 @@ gcn_dwarf_register_span (rtx rtl) #define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed #undef TARGET_INIT_BUILTINS #define TARGET_INIT_BUILTINS gcn_init_builtins +#undef TARGET_INIT_LIBFUNCS +#define TARGET_INIT_LIBFUNCS gcn_init_libfuncs #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \ gcn_ira_change_pseudo_allocno_class diff --git a/gcc/config/gcn/gcn.h b/gcc/config/gcn/gcn.h index eba4646..540835b 100644 --- a/gcc/config/gcn/gcn.h +++ b/gcc/config/gcn/gcn.h @@ -46,9 +46,12 @@ #define BYTES_BIG_ENDIAN 0 #define WORDS_BIG_ENDIAN 0 -#define BITS_PER_WORD 32 -#define UNITS_PER_WORD (BITS_PER_WORD/BITS_PER_UNIT) -#define LIBGCC2_UNITS_PER_WORD 4 +#ifdef IN_LIBGCC2 +/* We want DImode and TImode helpers. */ +#define UNITS_PER_WORD 8 +#else +#define UNITS_PER_WORD 4 +#endif #define POINTER_SIZE 64 #define PARM_BOUNDARY 64 @@ -56,7 +59,7 @@ #define FUNCTION_BOUNDARY 32 #define BIGGEST_ALIGNMENT 64 #define EMPTY_FIELD_BOUNDARY 32 -#define MAX_FIXED_MODE_SIZE 64 +#define MAX_FIXED_MODE_SIZE 128 #define MAX_REGS_PER_ADDRESS 2 #define STACK_SIZE_MODE DImode #define Pmode DImode -- cgit v1.1 From c60d9160b4d966dbea5b1bbea4f817c64d0bee2d Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Tue, 29 Jun 2021 19:14:35 +0200 Subject: i386: Add V2SFmode vec_addsub pattern [PR95046] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gcc/ 2021-06-21 Uroš Bizjak PR target/95046 * config/i386/mmx.md (vec_addsubv2sf3): New insn pattern. gcc/testsuite/ 2021-06-21 Uroš Bizjak PR target/95046 * gcc.target/i386/pr95046-9.c: New test. --- gcc/config/i386/mmx.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index e887f034..5f105727 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -788,6 +788,24 @@ (set_attr "prefix_extra" "1") (set_attr "mode" "V2SF")]) +(define_insn "vec_addsubv2sf3" + [(set (match_operand:V2SF 0 "register_operand" "=x,x") + (vec_merge:V2SF + (minus:V2SF + (match_operand:V2SF 1 "register_operand" "0,x") + (match_operand:V2SF 2 "register_operand" "x,x")) + (plus:V2SF (match_dup 1) (match_dup 2)) + (const_int 1)))] + "TARGET_SSE3 && TARGET_MMX_WITH_SSE" + "@ + addsubps\t{%2, %0|%0, %2} + vaddsubps\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" "noavx,avx") + (set_attr "type" "sseadd") + (set_attr "prefix" "orig,vex") + (set_attr "prefix_rep" "1,*") + (set_attr "mode" "V4SF")]) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; Parallel single-precision floating point comparisons -- cgit v1.1 From 652abe22205f268c90b3b15f28c56c030ef68a34 Mon Sep 17 00:00:00 2001 From: David Edelsohn Date: Tue, 11 May 2021 21:07:19 -0400 Subject: aix: align text CSECTs to at least 32 bytes. gcc/ChangeLog: * config/rs6000/rs6000.c (rs6000_xcoff_section_type_flags): Increase code CSECT alignment to at least 32 bytes. * config/rs6000/xcoff.h (TEXT_SECTION_ASM_OP): Add 32 byte alignment designation. --- gcc/config/rs6000/rs6000.c | 6 +++++- gcc/config/rs6000/xcoff.h | 3 ++- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 2c249e1..075c156 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-3.0-or-later /* Subroutines used for code generation on IBM RS/6000. 
Copyright (C) 1991-2021 Free Software Foundation, Inc. Contributed by Richard Kenner (kenner@vlsi1.ultra.nyu.edu) @@ -21361,8 +21362,11 @@ rs6000_xcoff_section_type_flags (tree decl, const char *name, int reloc) flags |= SECTION_BSS; /* Align to at least UNIT size. */ - if ((flags & SECTION_CODE) != 0 || !decl || !DECL_P (decl)) + if (!decl || !DECL_P (decl)) align = MIN_UNITS_PER_WORD; + /* Align code CSECT to at least 32 bytes. */ + else if ((flags & SECTION_CODE) != 0) + align = MAX ((DECL_ALIGN (decl) / BITS_PER_UNIT), 32); else /* Increase alignment of large objects if not already stricter. */ align = MAX ((DECL_ALIGN (decl) / BITS_PER_UNIT), diff --git a/gcc/config/rs6000/xcoff.h b/gcc/config/rs6000/xcoff.h index 5ba565f..f3546fa 100644 --- a/gcc/config/rs6000/xcoff.h +++ b/gcc/config/rs6000/xcoff.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-3.0-or-later /* Definitions of target machine for GNU compiler, for some generic XCOFF file format Copyright (C) 2001-2021 Free Software Foundation, Inc. @@ -249,7 +250,7 @@ #define DOUBLE_INT_ASM_OP "\t.llong\t" /* Output before instructions. */ -#define TEXT_SECTION_ASM_OP "\t.csect .text[PR]" +#define TEXT_SECTION_ASM_OP "\t.csect .text[PR],5" /* Output before writable data. */ #define DATA_SECTION_ASM_OP \ -- cgit v1.1 From ed392e9db434898eccec81edd85323d21d555fe9 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Mon, 28 Jun 2021 19:27:23 +0800 Subject: The upper bits of FIXUPIMMS{S,D} should come from src1 not dest. gcc/ChangeLog: PR target/101248 * config/i386/sse.md (avx512f_sfixupimm): Refined to .. (avx512f_sfixupimm): this. (avx512f_sfixupimm_mask"): Refined. * config/i386/subst.md (maskz_scalar): New define_subst. (maskz_scalar_name): New subst_attr. (maskz_scalar_op5): Ditto. (round_saeonly_maskz_scalar_op5): Ditto. (round_saeonly_maskz_scalar_operand5): Ditto. gcc/testsuite/ChangeLog PR target/101248 * gcc.target/i386/pr101248.c: New test. 
--- gcc/config/i386/sse.md | 8 ++++---- gcc/config/i386/subst.md | 21 +++++++++++++++++++++ 2 files changed, 25 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index ffcc0c8..d3f5a74 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -9942,7 +9942,7 @@ DONE; }) -(define_insn "avx512f_sfixupimm" +(define_insn "avx512f_sfixupimm" [(set (match_operand:VF_128 0 "register_operand" "=v") (vec_merge:VF_128 (unspec:VF_128 @@ -9951,10 +9951,10 @@ (match_operand: 3 "" "") (match_operand:SI 4 "const_0_to_255_operand")] UNSPEC_FIXUPIMM) - (match_dup 1) + (match_dup 2) (const_int 1)))] "TARGET_AVX512F" - "vfixupimm\t{%4, %3, %2, %0|%0, %2, %3, %4}"; + "vfixupimm\t{%4, %3, %2, %0|%0, %2, %3, %4}"; [(set_attr "prefix" "evex") (set_attr "mode" "")]) @@ -9968,7 +9968,7 @@ (match_operand: 3 "" "") (match_operand:SI 4 "const_0_to_255_operand")] UNSPEC_FIXUPIMM) - (match_dup 1) + (match_dup 2) (const_int 1)) (match_dup 1) (match_operand: 5 "register_operand" "Yk")))] diff --git a/gcc/config/i386/subst.md b/gcc/config/i386/subst.md index 477a898..6614e04 100644 --- a/gcc/config/i386/subst.md +++ b/gcc/config/i386/subst.md @@ -117,6 +117,25 @@ (match_operand: 3 "register_operand" "Yk"))) ]) +(define_subst_attr "maskz_scalar_name" "maskz_scalar" "" "_maskz_1") +(define_subst_attr "maskz_scalar_op5" "maskz_scalar" "" "%{%6%}%N5") + +(define_subst "maskz_scalar" + [(set (match_operand:SUBST_V 0) + (vec_merge:SUBST_V + (match_operand:SUBST_V 1) + (match_operand:SUBST_V 2) + (const_int 1)))] + "TARGET_AVX512F" + [(set (match_dup 0) + (vec_merge:SUBST_V + (vec_merge:SUBST_V + (match_dup 1) + (match_operand:SUBST_V 3 "const0_operand" "C") + (match_operand: 4 "register_operand" "Yk")) + (match_dup 2) + (const_int 1)))]) + (define_subst_attr "round_name" "round" "" "_round") (define_subst_attr "round_mask_operand2" "mask" "%R2" "%R4") (define_subst_attr "round_mask_operand3" "mask" "%R3" "%R5") @@ -163,6 +182,7 @@ (define_subst_attr "round_saeonly_mask_operand3" "mask" "%r3" "%r5") (define_subst_attr "round_saeonly_mask_operand4" "mask" "%r4" "%r6") (define_subst_attr "round_saeonly_mask_scalar_merge_operand4" "mask_scalar_merge" "%r4" "%r5") +(define_subst_attr "round_saeonly_maskz_scalar_operand5" "maskz_scalar" "%r5" "%r7") (define_subst_attr "round_saeonly_sd_mask_operand5" "sd" "%r5" "%r7") (define_subst_attr "round_saeonly_op2" "round_saeonly" "" "%r2") (define_subst_attr "round_saeonly_op3" "round_saeonly" "" "%r3") @@ -175,6 +195,7 @@ (define_subst_attr "round_saeonly_mask_op4" "round_saeonly" "" "") (define_subst_attr "round_saeonly_mask_scalar_merge_op4" "round_saeonly" "" "") (define_subst_attr "round_saeonly_sd_mask_op5" "round_saeonly" "" "") +(define_subst_attr "round_saeonly_maskz_scalar_op5" "round_saeonly" "" "") (define_subst_attr "round_saeonly_mask_arg3" "round_saeonly" "" ", operands[]") (define_subst_attr "round_saeonly_constraint" "round_saeonly" "vm" "v") (define_subst_attr "round_saeonly_constraint2" "round_saeonly" "m" "v") -- cgit v1.1 From 0a9d038ec10aa0d109ca965cc435934bfea92d14 Mon Sep 17 00:00:00 2001 From: "prathamesh.kulkarni" Date: Wed, 30 Jun 2021 15:12:06 +0530 Subject: arm/66791: Gate comparison in vca intrinsics on __FAST_MATH__. gcc/ChangeLog: PR target/66791 * config/arm/arm_neon.h: Move vabs intrinsics before vcage_f32. (vcage_f32): Gate comparison on __FAST_MATH__. (vcageq_f32): Likewise. (vcale_f32): Likewise. (vcaleq_f32): Likewise. (vcagt_f32): Likewise. (vcagtq_f32): Likewise. 
(vcalt_f32): Likewise. (vcaltq_f32): Likewise. (vcage_f16): Likewise. (vcageq_f16): Likewise. (vcale_f16): Likewise. (vcaleq_f16): Likewise. (vcagt_f16): Likewise. (vcagtq_f16): Likewise. (vcalt_f16): Likewise. (vcaltq_f16): Likewise. --- gcc/config/arm/arm_neon.h | 259 ++++++++++++++++++++++++++++------------------ 1 file changed, 161 insertions(+), 98 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index 7a800062..f42a15f 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -2867,60 +2867,189 @@ vcltq_u32 (uint32x4_t __a, uint32x4_t __b) return (__a < __b); } +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vabs_s8 (int8x8_t __a) +{ + return (int8x8_t)__builtin_neon_vabsv8qi (__a); +} + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vabs_s16 (int16x4_t __a) +{ + return (int16x4_t)__builtin_neon_vabsv4hi (__a); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vabs_s32 (int32x2_t __a) +{ + return (int32x2_t)__builtin_neon_vabsv2si (__a); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vabs_f32 (float32x2_t __a) +{ + return (float32x2_t)__builtin_neon_vabsv2sf (__a); +} + +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vabsq_s8 (int8x16_t __a) +{ + return (int8x16_t)__builtin_neon_vabsv16qi (__a); +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vabsq_s16 (int16x8_t __a) +{ + return (int16x8_t)__builtin_neon_vabsv8hi (__a); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vabsq_s32 (int32x4_t __a) +{ + return (int32x4_t)__builtin_neon_vabsv4si (__a); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vabsq_f32 (float32x4_t __a) +{ + return (float32x4_t)__builtin_neon_vabsv4sf (__a); +} + +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqabs_s8 (int8x8_t __a) +{ + return (int8x8_t)__builtin_neon_vqabsv8qi (__a); +} + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqabs_s16 (int16x4_t __a) +{ + return (int16x4_t)__builtin_neon_vqabsv4hi (__a); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqabs_s32 (int32x2_t __a) +{ + return (int32x2_t)__builtin_neon_vqabsv2si (__a); +} + +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqabsq_s8 (int8x16_t __a) +{ + return (int8x16_t)__builtin_neon_vqabsv16qi (__a); +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqabsq_s16 (int16x8_t __a) +{ + return (int16x8_t)__builtin_neon_vqabsv8hi (__a); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqabsq_s32 (int32x4_t __a) +{ + return (int32x4_t)__builtin_neon_vqabsv4si (__a); +} __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcage_f32 (float32x2_t __a, float32x2_t __b) { 
+#ifdef __FAST_MATH__ + return (uint32x2_t) (vabs_f32 (__a) >= vabs_f32 (__b)); +#else return (uint32x2_t)__builtin_neon_vcagev2sf (__a, __b); +#endif } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcageq_f32 (float32x4_t __a, float32x4_t __b) { +#ifdef __FAST_MATH__ + return (uint32x4_t) (vabsq_f32 (__a) >= vabsq_f32 (__b)); +#else return (uint32x4_t)__builtin_neon_vcagev4sf (__a, __b); +#endif } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcale_f32 (float32x2_t __a, float32x2_t __b) { +#ifdef __FAST_MATH__ + return (uint32x2_t) (vabs_f32 (__a) <= vabs_f32 (__b)); +#else return (uint32x2_t)__builtin_neon_vcagev2sf (__b, __a); +#endif } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcaleq_f32 (float32x4_t __a, float32x4_t __b) { +#ifdef __FAST_MATH__ + return (uint32x4_t) (vabsq_f32 (__a) <= vabsq_f32 (__b)); +#else return (uint32x4_t)__builtin_neon_vcagev4sf (__b, __a); +#endif } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcagt_f32 (float32x2_t __a, float32x2_t __b) { +#ifdef __FAST_MATH__ + return (uint32x2_t) (vabs_f32 (__a) > vabs_f32 (__b)); +#else return (uint32x2_t)__builtin_neon_vcagtv2sf (__a, __b); +#endif } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcagtq_f32 (float32x4_t __a, float32x4_t __b) { +#ifdef __FAST_MATH__ + return (uint32x4_t) (vabsq_f32 (__a) > vabsq_f32 (__b)); +#else return (uint32x4_t)__builtin_neon_vcagtv4sf (__a, __b); +#endif } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcalt_f32 (float32x2_t __a, float32x2_t __b) { +#ifdef __FAST_MATH__ + return (uint32x2_t) (vabs_f32 (__a) < vabs_f32 (__b)); +#else return (uint32x2_t)__builtin_neon_vcagtv2sf (__b, __a); +#endif } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcaltq_f32 (float32x4_t __a, float32x4_t __b) { +#ifdef __FAST_MATH__ + return (uint32x4_t) (vabsq_f32 (__a) < vabsq_f32 (__b)); +#else return (uint32x4_t)__builtin_neon_vcagtv4sf (__b, __a); +#endif } __extension__ extern __inline uint8x8_t @@ -5622,104 +5751,6 @@ vsliq_n_p16 (poly16x8_t __a, poly16x8_t __b, const int __c) __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabs_s8 (int8x8_t __a) -{ - return (int8x8_t)__builtin_neon_vabsv8qi (__a); -} - -__extension__ extern __inline int16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabs_s16 (int16x4_t __a) -{ - return (int16x4_t)__builtin_neon_vabsv4hi (__a); -} - -__extension__ extern __inline int32x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabs_s32 (int32x2_t __a) -{ - return (int32x2_t)__builtin_neon_vabsv2si (__a); -} - -__extension__ extern __inline float32x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabs_f32 (float32x2_t __a) -{ - return (float32x2_t)__builtin_neon_vabsv2sf (__a); -} - -__extension__ extern __inline int8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabsq_s8 (int8x16_t __a) -{ - return (int8x16_t)__builtin_neon_vabsv16qi (__a); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabsq_s16 
(int16x8_t __a) -{ - return (int16x8_t)__builtin_neon_vabsv8hi (__a); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabsq_s32 (int32x4_t __a) -{ - return (int32x4_t)__builtin_neon_vabsv4si (__a); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabsq_f32 (float32x4_t __a) -{ - return (float32x4_t)__builtin_neon_vabsv4sf (__a); -} - -__extension__ extern __inline int8x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqabs_s8 (int8x8_t __a) -{ - return (int8x8_t)__builtin_neon_vqabsv8qi (__a); -} - -__extension__ extern __inline int16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqabs_s16 (int16x4_t __a) -{ - return (int16x4_t)__builtin_neon_vqabsv4hi (__a); -} - -__extension__ extern __inline int32x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqabs_s32 (int32x2_t __a) -{ - return (int32x2_t)__builtin_neon_vqabsv2si (__a); -} - -__extension__ extern __inline int8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqabsq_s8 (int8x16_t __a) -{ - return (int8x16_t)__builtin_neon_vqabsv16qi (__a); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqabsq_s16 (int16x8_t __a) -{ - return (int16x8_t)__builtin_neon_vqabsv8hi (__a); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqabsq_s32 (int32x4_t __a) -{ - return (int32x4_t)__builtin_neon_vqabsv4si (__a); -} - -__extension__ extern __inline int8x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vneg_s8 (int8x8_t __a) { return -__a; @@ -17147,56 +17178,88 @@ __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcage_f16 (float16x4_t __a, float16x4_t __b) { +#ifdef __FAST_MATH__ + return (uint16x4_t) (vabs_f16 (__a) >= vabs_f16 (__b)); +#else return (uint16x4_t)__builtin_neon_vcagev4hf (__a, __b); +#endif } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcageq_f16 (float16x8_t __a, float16x8_t __b) { +#ifdef __FAST_MATH__ + return (uint16x8_t) (vabsq_f16 (__a) >= vabsq_f16 (__b)); +#else return (uint16x8_t)__builtin_neon_vcagev8hf (__a, __b); +#endif } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcagt_f16 (float16x4_t __a, float16x4_t __b) { +#ifdef __FAST_MATH__ + return (uint16x4_t) (vabs_f16 (__a) > vabs_f16 (__b)); +#else return (uint16x4_t)__builtin_neon_vcagtv4hf (__a, __b); +#endif } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcagtq_f16 (float16x8_t __a, float16x8_t __b) { +#ifdef __FAST_MATH__ + return (uint16x8_t) (vabsq_f16 (__a) > vabsq_f16 (__b)); +#else return (uint16x8_t)__builtin_neon_vcagtv8hf (__a, __b); +#endif } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcale_f16 (float16x4_t __a, float16x4_t __b) { +#ifdef __FAST_MATH__ + return (uint16x4_t) (vabs_f16 (__a) <= vabs_f16 (__b)); +#else return (uint16x4_t)__builtin_neon_vcalev4hf (__a, __b); +#endif } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcaleq_f16 (float16x8_t __a, float16x8_t __b) { +#ifdef __FAST_MATH__ 
+ return (uint16x8_t) (vabsq_f16 (__a) <= vabsq_f16 (__b)); +#else return (uint16x8_t)__builtin_neon_vcalev8hf (__a, __b); +#endif } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcalt_f16 (float16x4_t __a, float16x4_t __b) { +#ifdef __FAST_MATH__ + return (uint16x4_t) (vabs_f16 (__a) < vabs_f16 (__b)); +#else return (uint16x4_t)__builtin_neon_vcaltv4hf (__a, __b); +#endif } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcaltq_f16 (float16x8_t __a, float16x8_t __b) { +#ifdef __FAST_MATH__ + return (uint16x8_t) (vabsq_f16 (__a) < vabsq_f16 (__b)); +#else return (uint16x8_t)__builtin_neon_vcaltv8hf (__a, __b); +#endif } __extension__ extern __inline uint16x4_t -- cgit v1.1 From 251697a64ba4d7f10f072702771a59d30364cc26 Mon Sep 17 00:00:00 2001 From: Hafiz Abid Qadeer Date: Fri, 28 May 2021 17:49:46 +0100 Subject: [amdgcn] Update CFI configuration Currently we don't get any call frame information for the amdgcn target. This patch makes necessary adjustments to generate CFI that can work with ROCGDB (ROCm 3.8+). gcc/ * config/gcn/gcn.c (move_callee_saved_registers): Emit CFI notes for prologue register saves. (gcn_debug_unwind_info): Use UI_DWARF2. (gcn_dwarf_register_number): Map DWARF_LINK_REGISTER to DWARF PC. (gcn_dwarf_register_span): DWARF_LINK_REGISTER doesn't span. * config/gcn/gcn.h: (DWARF_FRAME_RETURN_COLUMN): New define. (DWARF_LINK_REGISTER): New define. (FIRST_PSEUDO_REGISTER): Increment. (FIXED_REGISTERS): Add entry for DWARF_LINK_REGISTER. (CALL_USED_REGISTERS): Likewise. (REGISTER_NAMES): Likewise. --- gcc/config/gcn/gcn.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++----- gcc/config/gcn/gcn.h | 10 ++++--- 2 files changed, 81 insertions(+), 11 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c index aa9d455..ef4ed7c 100644 --- a/gcc/config/gcn/gcn.c +++ b/gcc/config/gcn/gcn.c @@ -2649,6 +2649,7 @@ move_callee_saved_registers (rtx sp, machine_function *offsets, rtx as = gen_rtx_CONST_INT (VOIDmode, STACK_ADDR_SPACE); HOST_WIDE_INT exec_set = 0; int offreg_set = 0; + auto_vec saved_sgprs; start_sequence (); @@ -2665,7 +2666,10 @@ move_callee_saved_registers (rtx sp, machine_function *offsets, int lane = saved_scalars % 64; if (prologue) - emit_insn (gen_vec_setv64si (vreg, reg, GEN_INT (lane))); + { + emit_insn (gen_vec_setv64si (vreg, reg, GEN_INT (lane))); + saved_sgprs.safe_push (regno); + } else emit_insn (gen_vec_extractv64sisi (reg, vreg, GEN_INT (lane))); @@ -2698,7 +2702,7 @@ move_callee_saved_registers (rtx sp, machine_function *offsets, gcn_gen_undef (V64SImode), exec)); /* Move vectors. */ - for (regno = FIRST_VGPR_REG, offset = offsets->pretend_size; + for (regno = FIRST_VGPR_REG, offset = 0; regno < FIRST_PSEUDO_REGISTER; regno++) if ((df_regs_ever_live_p (regno) && !call_used_or_fixed_reg_p (regno)) || (regno == VGPR_REGNO (6) && saved_scalars > 0) @@ -2719,8 +2723,67 @@ move_callee_saved_registers (rtx sp, machine_function *offsets, } if (prologue) - emit_insn (gen_scatterv64si_insn_1offset_exec (vsp, const0_rtx, reg, - as, const0_rtx, exec)); + { + rtx insn = emit_insn (gen_scatterv64si_insn_1offset_exec + (vsp, const0_rtx, reg, as, const0_rtx, + exec)); + + /* Add CFI metadata. */ + rtx note; + if (regno == VGPR_REGNO (6) || regno == VGPR_REGNO (7)) + { + int start = (regno == VGPR_REGNO (7) ? 
64 : 0); + int count = MIN (saved_scalars - start, 64); + int add_lr = (regno == VGPR_REGNO (6) + && df_regs_ever_live_p (LINK_REGNUM)); + int lrdest = -1; + rtvec seq = rtvec_alloc (count + add_lr); + + /* Add an REG_FRAME_RELATED_EXPR entry for each scalar + register that was saved in this batch. */ + for (int idx = 0; idx < count; idx++) + { + int stackaddr = offset + idx * 4; + rtx dest = gen_rtx_MEM (SImode, + gen_rtx_PLUS + (DImode, sp, + GEN_INT (stackaddr))); + rtx src = gen_rtx_REG (SImode, saved_sgprs[start + idx]); + rtx set = gen_rtx_SET (dest, src); + RTX_FRAME_RELATED_P (set) = 1; + RTVEC_ELT (seq, idx) = set; + + if (saved_sgprs[start + idx] == LINK_REGNUM) + lrdest = stackaddr; + } + + /* Add an additional expression for DWARF_LINK_REGISTER if + LINK_REGNUM was saved. */ + if (lrdest != -1) + { + rtx dest = gen_rtx_MEM (DImode, + gen_rtx_PLUS + (DImode, sp, + GEN_INT (lrdest))); + rtx src = gen_rtx_REG (DImode, DWARF_LINK_REGISTER); + rtx set = gen_rtx_SET (dest, src); + RTX_FRAME_RELATED_P (set) = 1; + RTVEC_ELT (seq, count) = set; + } + + note = gen_rtx_SEQUENCE (VOIDmode, seq); + } + else + { + rtx dest = gen_rtx_MEM (V64SImode, + gen_rtx_PLUS (DImode, sp, + GEN_INT (offset))); + rtx src = gen_rtx_REG (V64SImode, regno); + note = gen_rtx_SET (dest, src); + } + RTX_FRAME_RELATED_P (insn) = 1; + add_reg_note (insn, REG_FRAME_RELATED_EXPR, note); + } else emit_insn (gen_gatherv64si_insn_1offset_exec (reg, vsp, const0_rtx, as, const0_rtx, @@ -3224,8 +3287,7 @@ gcn_cannot_copy_insn_p (rtx_insn *insn) static enum unwind_info_type gcn_debug_unwind_info () { - /* No support for debug info, yet. */ - return UI_NONE; + return UI_DWARF2; } /* Determine if there is a suitable hardware conversion instruction. @@ -6251,6 +6313,8 @@ gcn_dwarf_register_number (unsigned int regno) return 768; */ else if (regno == SCC_REG) return 128; + else if (regno == DWARF_LINK_REGISTER) + return 16; else if (SGPR_REGNO_P (regno)) { if (regno - FIRST_SGPR_REG < 64) @@ -6280,8 +6344,12 @@ gcn_dwarf_register_span (rtx rtl) if (GET_MODE_SIZE (mode) != 8) return NULL_RTX; - rtx p = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (2)); unsigned regno = REGNO (rtl); + + if (regno == DWARF_LINK_REGISTER) + return NULL_RTX; + + rtx p = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (2)); XVECEXP (p, 0, 0) = gen_rtx_REG (SImode, regno); XVECEXP (p, 0, 1) = gen_rtx_REG (SImode, regno + 1); diff --git a/gcc/config/gcn/gcn.h b/gcc/config/gcn/gcn.h index 540835b..5822ec3 100644 --- a/gcc/config/gcn/gcn.h +++ b/gcc/config/gcn/gcn.h @@ -88,6 +88,7 @@ #define FIRST_PARM_OFFSET(FNDECL) 0 #define DYNAMIC_CHAIN_ADDRESS(FP) plus_constant (Pmode, (FP), -16) #define INCOMING_RETURN_ADDR_RTX gen_rtx_REG (Pmode, LINK_REGNUM) +#define DWARF_FRAME_RETURN_COLUMN 16 #define STACK_DYNAMIC_OFFSET(FNDECL) (-crtl->outgoing_args_size) #define ACCUMULATE_OUTGOING_ARGS 1 #define RETURN_ADDR_RTX(COUNT,FRAMEADDR) \ @@ -138,7 +139,8 @@ #define WORK_ITEM_ID_Z_REG 162 #define SOFT_ARG_REG 416 #define FRAME_POINTER_REGNUM 418 -#define FIRST_PSEUDO_REGISTER 420 +#define DWARF_LINK_REGISTER 420 +#define FIRST_PSEUDO_REGISTER 421 #define FIRST_PARM_REG 24 #define NUM_PARM_REGS 6 @@ -200,7 +202,7 @@ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ /* Other registers. */ \ - 1, 1, 1, 1 \ + 1, 1, 1, 1, 1 \ } #define CALL_USED_REGISTERS { \ @@ -238,7 +240,7 @@ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ /* Other registers. 
*/ \ - 1, 1, 1, 1 \ + 1, 1, 1, 1, 1 \ } @@ -517,7 +519,7 @@ enum gcn_address_spaces "v236", "v237", "v238", "v239", "v240", "v241", "v242", "v243", "v244", \ "v245", "v246", "v247", "v248", "v249", "v250", "v251", "v252", "v253", \ "v254", "v255", \ - "?ap0", "?ap1", "?fp0", "?fp1" } + "?ap0", "?ap1", "?fp0", "?fp1", "?dwlr" } #define PRINT_OPERAND(FILE, X, CODE) print_operand(FILE, X, CODE) #define PRINT_OPERAND_ADDRESS(FILE, ADDR) print_operand_address (FILE, ADDR) -- cgit v1.1 From 22f201e4b32a4f8bc1e6462ee19643edae5d25a3 Mon Sep 17 00:00:00 2001 From: Hafiz Abid Qadeer Date: Mon, 21 Jun 2021 22:47:58 +0100 Subject: [amdgcn] Use frame pointer for CFA expressions. As size of address is bigger than registers in amdgcn, we are forced to use DW_CFA_def_cfa_expression to make an expression that concatenates multiple registers for the value of the CFA. This then prohibits us from using many of the dwarf ops which expect CFA rule to be a single regsiter plus an offset. Using frame pointer in the CFA rule is only real possibility as it is saved in every frame and it is easy to unwind its value. So unless user gives fomit-frame-pointer, we use frame pointer for the cfi information. This options also has a different default now. gcc/ * common/config/gcn/gcn-common.c (gcn_option_optimization_table): Change OPT_fomit_frame_pointer to -O3. * config/gcn/gcn.c (gcn_expand_prologue): Prefer the frame pointer when emitting CFI. (gcn_expand_prologue): Prefer the frame pointer when emitting CFI. (gcn_frame_pointer_rqd): New function. (TARGET_FRAME_POINTER_REQUIRED): New hook. --- gcc/config/gcn/gcn.c | 60 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 14 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c index ef4ed7c..a999897 100644 --- a/gcc/config/gcn/gcn.c +++ b/gcc/config/gcn/gcn.c @@ -2900,10 +2900,14 @@ gcn_expand_prologue () rtx adjustment = gen_int_mode (sp_adjust, SImode); rtx insn = emit_insn (gen_addsi3_scalar_carry (sp_lo, sp_lo, adjustment, scc)); - RTX_FRAME_RELATED_P (insn) = 1; - add_reg_note (insn, REG_FRAME_RELATED_EXPR, - gen_rtx_SET (sp, - gen_rtx_PLUS (DImode, sp, adjustment))); + if (!offsets->need_frame_pointer) + { + RTX_FRAME_RELATED_P (insn) = 1; + add_reg_note (insn, REG_FRAME_RELATED_EXPR, + gen_rtx_SET (sp, + gen_rtx_PLUS (DImode, sp, + adjustment))); + } emit_insn (gen_addcsi3_scalar_zero (sp_hi, sp_hi, scc)); } @@ -2917,25 +2921,24 @@ gcn_expand_prologue () rtx adjustment = gen_int_mode (fp_adjust, SImode); rtx insn = emit_insn (gen_addsi3_scalar_carry(fp_lo, sp_lo, adjustment, scc)); - RTX_FRAME_RELATED_P (insn) = 1; - add_reg_note (insn, REG_FRAME_RELATED_EXPR, - gen_rtx_SET (fp, - gen_rtx_PLUS (DImode, sp, adjustment))); emit_insn (gen_addcsi3_scalar (fp_hi, sp_hi, (fp_adjust < 0 ? GEN_INT (-1) : const0_rtx), scc, scc)); + + /* Set the CFA to the entry stack address, as an offset from the + frame pointer. This is preferred because the frame pointer is + saved in each frame, whereas the stack pointer is not. */ + RTX_FRAME_RELATED_P (insn) = 1; + add_reg_note (insn, REG_CFA_DEF_CFA, + gen_rtx_PLUS (DImode, fp, + GEN_INT (-(offsets->pretend_size + + offsets->callee_saves)))); } rtx_insn *seq = get_insns (); end_sequence (); - /* FIXME: Prologue insns should have this flag set for debug output, etc. - but it causes issues for now. 
- for (insn = seq; insn; insn = NEXT_INSN (insn)) - if (INSN_P (insn)) - RTX_FRAME_RELATED_P (insn) = 1;*/ - emit_insn (seq); } else @@ -3011,6 +3014,16 @@ gcn_expand_prologue () gen_rtx_SET (sp, gen_rtx_PLUS (DImode, sp, dbg_adjustment))); + if (offsets->need_frame_pointer) + { + /* Set the CFA to the entry stack address, as an offset from the + frame pointer. This is necessary when alloca is used, and + harmless otherwise. */ + rtx neg_adjust = gen_int_mode (-offsets->callee_saves, DImode); + add_reg_note (insn, REG_CFA_DEF_CFA, + gen_rtx_PLUS (DImode, fp, neg_adjust)); + } + /* Make sure the flat scratch reg doesn't get optimised away. */ emit_insn (gen_prologue_use (gen_rtx_REG (DImode, FLAT_SCRATCH_REG))); } @@ -3114,6 +3127,23 @@ gcn_expand_epilogue (void) emit_jump_insn (gen_gcn_return ()); } +/* Implement TARGET_FRAME_POINTER_REQUIRED. + + Return true if the frame pointer should not be eliminated. */ + +bool +gcn_frame_pointer_rqd (void) +{ + /* GDB needs the frame pointer in order to unwind properly, + but that's not important for the entry point, unless alloca is used. + It's not important for code execution, so we should repect the + -fomit-frame-pointer flag. */ + return (!flag_omit_frame_pointer + && cfun + && (cfun->calls_alloca + || (cfun->machine && cfun->machine->normal_function))); +} + /* Implement TARGET_CAN_ELIMINATE. Return true if the compiler is allowed to try to replace register number @@ -6410,6 +6440,8 @@ gcn_dwarf_register_span (rtx rtl) #define TARGET_EMUTLS_VAR_INIT gcn_emutls_var_init #undef TARGET_EXPAND_BUILTIN #define TARGET_EXPAND_BUILTIN gcn_expand_builtin +#undef TARGET_FRAME_POINTER_REQUIRED +#define TARGET_FRAME_POINTER_REQUIRED gcn_frame_pointer_rqd #undef TARGET_FUNCTION_ARG #undef TARGET_FUNCTION_ARG_ADVANCE #define TARGET_FUNCTION_ARG_ADVANCE gcn_function_arg_advance -- cgit v1.1 From b5bb7f328d56cccfb85932ff586138e5a4ef1436 Mon Sep 17 00:00:00 2001 From: Hafiz Abid Qadeer Date: Fri, 28 May 2021 18:06:57 +0100 Subject: [amdgcn] Add hook for DWARF address spaces. Map GCN address spaces to the proposed DWARF address spaces defined by AMD at https://llvm.org/docs/AMDGPUUsage.html#amdgpu-dwarf-address-class-mapping-table gcc/ * config/gcn/gcn.c: Include dwarf2.h. (gcn_addr_space_debug): New function. (TARGET_ADDR_SPACE_DEBUG): New hook. --- gcc/config/gcn/gcn.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c index a999897..6d02a4a 100644 --- a/gcc/config/gcn/gcn.c +++ b/gcc/config/gcn/gcn.c @@ -50,6 +50,7 @@ #include "varasm.h" #include "intl.h" #include "rtl-iter.h" +#include "dwarf2.h" /* This file should be included last. */ #include "target-def.h" @@ -1497,6 +1498,32 @@ gcn_addr_space_convert (rtx op, tree from_type, tree to_type) gcc_unreachable (); } +/* Implement TARGET_ADDR_SPACE_DEBUG. + + Return the dwarf address space class for each hardware address space. 
*/ + +static int +gcn_addr_space_debug (addr_space_t as) +{ + switch (as) + { + case ADDR_SPACE_DEFAULT: + case ADDR_SPACE_FLAT: + case ADDR_SPACE_SCALAR_FLAT: + case ADDR_SPACE_FLAT_SCRATCH: + return DW_ADDR_none; + case ADDR_SPACE_GLOBAL: + return 1; // DW_ADDR_LLVM_global + case ADDR_SPACE_LDS: + return 3; // DW_ADDR_LLVM_group + case ADDR_SPACE_SCRATCH: + return 4; // DW_ADDR_LLVM_private + case ADDR_SPACE_GDS: + return 0x8000; // DW_ADDR_AMDGPU_region + } + gcc_unreachable (); +} + /* Implement REGNO_MODE_CODE_OK_FOR_BASE_P via gcn.h @@ -6391,6 +6418,8 @@ gcn_dwarf_register_span (rtx rtl) #undef TARGET_ADDR_SPACE_ADDRESS_MODE #define TARGET_ADDR_SPACE_ADDRESS_MODE gcn_addr_space_address_mode +#undef TARGET_ADDR_SPACE_DEBUG +#define TARGET_ADDR_SPACE_DEBUG gcn_addr_space_debug #undef TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P #define TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P \ gcn_addr_space_legitimate_address_p -- cgit v1.1 From 9f6aeb85ee87c6b4e580b6b71e26cbe99e1dab70 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 1 Jul 2021 10:56:32 +0200 Subject: i386: Add integer nabs instructions [PR101044] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The patch adds integer nabs "(NEG (ABS (...)))" instructions, adds STV conversion and adjusts STV cost calculations accordingly. When CMOV instruction is used to implement abs, the sign is determined from the preceeding operand negation, and CMOVS is used to select between negated and non-negated value. To implement nabs, just reverse the condition and emit CMOVNS instead. The STV costs are adjusted for inherent NOT of nabs insn. V2DI NOT is somehow costly operation, since it is implemented as a load of zero, followed by a SUB insn. OTOH, integer nabs with inherent NOT is relatively cheap, so some STV chains became less profitable for conversion. The patch rewrites operand scanner in compute_convert_gain to a switch and reorders case instances in general_scalar_to_vector_candidate_p to benefit from fallthroughs, and to remove special processing of andnot in the later case. gcc/ 2021-07-01 Uroš Bizjak PR target/101044 * config/i386/i386.md (*nabs2_doubleword): New insn_and_split pattern. (*nabs2_1): Ditto. * config/i386/i386-features.c (general_scalar_chain::compute_convert_gain): Handle (NEG (ABS (...))) RTX. Rewrite src code scanner as switch statement. (general_scalar_chain::convert_insn): Handle (NEG (ABS (...))) RTX. (general_scalar_to_vector_candidate_p): Detect (NEG (ABS (...))) RTX. Reorder case statements for (AND (NOT (...) ...)) fallthrough. gcc/testsuite/ 2021-07-01 Uroš Bizjak PR target/101044 * gcc.target/i386/pr101044.c: New test. 
--- gcc/config/i386/i386-features.c | 195 +++++++++++++++++++++++----------------- gcc/config/i386/i386.md | 72 +++++++++++++++ 2 files changed, 187 insertions(+), 80 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c index a25769a..cbd430a 100644 --- a/gcc/config/i386/i386-features.c +++ b/gcc/config/i386/i386-features.c @@ -544,71 +544,83 @@ general_scalar_chain::compute_convert_gain () += m * ix86_cost->int_store[2] - ix86_cost->sse_store[sse_cost_idx]; else if (MEM_P (src) && REG_P (dst)) igain += m * ix86_cost->int_load[2] - ix86_cost->sse_load[sse_cost_idx]; - else if (GET_CODE (src) == ASHIFT - || GET_CODE (src) == ASHIFTRT - || GET_CODE (src) == LSHIFTRT) - { - if (m == 2) - { - if (INTVAL (XEXP (src, 1)) >= 32) - igain += ix86_cost->add; - else - igain += ix86_cost->shift_const; - } + else + switch (GET_CODE (src)) + { + case ASHIFT: + case ASHIFTRT: + case LSHIFTRT: + if (m == 2) + { + if (INTVAL (XEXP (src, 1)) >= 32) + igain += ix86_cost->add; + else + igain += ix86_cost->shift_const; + } - igain += ix86_cost->shift_const - ix86_cost->sse_op; + igain += ix86_cost->shift_const - ix86_cost->sse_op; - if (CONST_INT_P (XEXP (src, 0))) - igain -= vector_const_cost (XEXP (src, 0)); - } - else if (GET_CODE (src) == PLUS - || GET_CODE (src) == MINUS - || GET_CODE (src) == IOR - || GET_CODE (src) == XOR - || GET_CODE (src) == AND) - { - igain += m * ix86_cost->add - ix86_cost->sse_op; - /* Additional gain for andnot for targets without BMI. */ - if (GET_CODE (XEXP (src, 0)) == NOT - && !TARGET_BMI) - igain += m * ix86_cost->add; - - if (CONST_INT_P (XEXP (src, 0))) - igain -= vector_const_cost (XEXP (src, 0)); - if (CONST_INT_P (XEXP (src, 1))) - igain -= vector_const_cost (XEXP (src, 1)); - } - else if (GET_CODE (src) == NEG - || GET_CODE (src) == NOT) - igain += m * ix86_cost->add - ix86_cost->sse_op - COSTS_N_INSNS (1); - else if (GET_CODE (src) == ABS - || GET_CODE (src) == SMAX - || GET_CODE (src) == SMIN - || GET_CODE (src) == UMAX - || GET_CODE (src) == UMIN) - { - /* We do not have any conditional move cost, estimate it as a - reg-reg move. Comparisons are costed as adds. */ - igain += m * (COSTS_N_INSNS (2) + ix86_cost->add); - /* Integer SSE ops are all costed the same. */ - igain -= ix86_cost->sse_op; - } - else if (GET_CODE (src) == COMPARE) - { - /* Assume comparison cost is the same. */ - } - else if (CONST_INT_P (src)) - { - if (REG_P (dst)) - /* DImode can be immediate for TARGET_64BIT and SImode always. */ - igain += m * COSTS_N_INSNS (1); - else if (MEM_P (dst)) - igain += (m * ix86_cost->int_store[2] - - ix86_cost->sse_store[sse_cost_idx]); - igain -= vector_const_cost (src); - } - else - gcc_unreachable (); + if (CONST_INT_P (XEXP (src, 0))) + igain -= vector_const_cost (XEXP (src, 0)); + break; + + case AND: + case IOR: + case XOR: + case PLUS: + case MINUS: + igain += m * ix86_cost->add - ix86_cost->sse_op; + /* Additional gain for andnot for targets without BMI. 
*/ + if (GET_CODE (XEXP (src, 0)) == NOT + && !TARGET_BMI) + igain += m * ix86_cost->add; + + if (CONST_INT_P (XEXP (src, 0))) + igain -= vector_const_cost (XEXP (src, 0)); + if (CONST_INT_P (XEXP (src, 1))) + igain -= vector_const_cost (XEXP (src, 1)); + break; + + case NEG: + case NOT: + igain -= ix86_cost->sse_op + COSTS_N_INSNS (1); + + if (GET_CODE (XEXP (src, 0)) != ABS) + { + igain += m * ix86_cost->add; + break; + } + /* FALLTHRU */ + + case ABS: + case SMAX: + case SMIN: + case UMAX: + case UMIN: + /* We do not have any conditional move cost, estimate it as a + reg-reg move. Comparisons are costed as adds. */ + igain += m * (COSTS_N_INSNS (2) + ix86_cost->add); + /* Integer SSE ops are all costed the same. */ + igain -= ix86_cost->sse_op; + break; + + case COMPARE: + /* Assume comparison cost is the same. */ + break; + + case CONST_INT: + if (REG_P (dst)) + /* DImode can be immediate for TARGET_64BIT and SImode always. */ + igain += m * COSTS_N_INSNS (1); + else if (MEM_P (dst)) + igain += (m * ix86_cost->int_store[2] + - ix86_cost->sse_store[sse_cost_idx]); + igain -= vector_const_cost (src); + break; + + default: + gcc_unreachable (); + } if (igain != 0 && dump_file) { @@ -1009,7 +1021,19 @@ general_scalar_chain::convert_insn (rtx_insn *insn) case NEG: src = XEXP (src, 0); - convert_op (&src, insn); + + if (GET_CODE (src) == ABS) + { + src = XEXP (src, 0); + convert_op (&src, insn); + subreg = gen_reg_rtx (vmode); + emit_insn_before (gen_rtx_SET (subreg, + gen_rtx_ABS (vmode, src)), insn); + src = subreg; + } + else + convert_op (&src, insn); + subreg = gen_reg_rtx (vmode); emit_insn_before (gen_move_insn (subreg, CONST0_RTX (vmode)), insn); src = gen_rtx_MINUS (vmode, subreg, src); @@ -1042,9 +1066,10 @@ general_scalar_chain::convert_insn (rtx_insn *insn) gcc_assert (REG_P (src) && GET_MODE (src) == DImode); subreg = gen_rtx_SUBREG (V2DImode, src, 0); - emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg), - copy_rtx_if_shared (subreg), - copy_rtx_if_shared (subreg)), + emit_insn_before (gen_vec_interleave_lowv2di + (copy_rtx_if_shared (subreg), + copy_rtx_if_shared (subreg), + copy_rtx_if_shared (subreg)), insn); dst = gen_rtx_REG (CCmode, FLAGS_REG); src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (subreg), @@ -1400,11 +1425,11 @@ general_scalar_to_vector_candidate_p (rtx_insn *insn, enum machine_mode mode) return false; /* Fallthru. */ - case PLUS: - case MINUS: + case AND: case IOR: case XOR: - case AND: + case PLUS: + case MINUS: if (!REG_P (XEXP (src, 1)) && !MEM_P (XEXP (src, 1)) && !CONST_INT_P (XEXP (src, 1))) @@ -1413,18 +1438,32 @@ general_scalar_to_vector_candidate_p (rtx_insn *insn, enum machine_mode mode) if (GET_MODE (XEXP (src, 1)) != mode && !CONST_INT_P (XEXP (src, 1))) return false; + + /* Check for andnot case. */ + if (GET_CODE (src) != AND + || GET_CODE (XEXP (src, 0)) != NOT) + break; + + src = XEXP (src, 0); + /* FALLTHRU */ + + case NOT: break; + case NEG: + /* Check for nabs case. */ + if (GET_CODE (XEXP (src, 0)) != ABS) + break; + + src = XEXP (src, 0); + /* FALLTHRU */ + case ABS: if ((mode == DImode && !TARGET_AVX512VL) || (mode == SImode && !TARGET_SSSE3)) return false; break; - case NEG: - case NOT: - break; - case REG: return true; @@ -1438,12 +1477,8 @@ general_scalar_to_vector_candidate_p (rtx_insn *insn, enum machine_mode mode) if (!REG_P (XEXP (src, 0)) && !MEM_P (XEXP (src, 0)) - && !CONST_INT_P (XEXP (src, 0)) - /* Check for andnot case. 
*/ - && (GET_CODE (src) != AND - || GET_CODE (XEXP (src, 0)) != NOT - || !REG_P (XEXP (XEXP (src, 0), 0)))) - return false; + && !CONST_INT_P (XEXP (src, 0))) + return false; if (GET_MODE (XEXP (src, 0)) != mode && !CONST_INT_P (XEXP (src, 0))) diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 9b619e2..156c6a9 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -10305,6 +10305,50 @@ split_double_mode (mode, &operands[0], 3, &operands[0], &operands[3]); }) +(define_insn_and_split "*nabs2_doubleword" + [(set (match_operand: 0 "register_operand") + (neg: + (abs: + (match_operand: 1 "general_operand")))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_CMOVE + && ix86_pre_reload_split ()" + "#" + "&& 1" + [(parallel + [(set (reg:CCC FLAGS_REG) + (ne:CCC (match_dup 1) (const_int 0))) + (set (match_dup 2) (neg:DWIH (match_dup 1)))]) + (parallel + [(set (match_dup 5) + (plus:DWIH (plus:DWIH (ltu:DWIH (reg:CC FLAGS_REG) (const_int 0)) + (match_dup 4)) + (const_int 0))) + (clobber (reg:CC FLAGS_REG))]) + (parallel + [(set (reg:CCGOC FLAGS_REG) + (compare:CCGOC + (neg:DWIH (match_dup 5)) + (const_int 0))) + (set (match_dup 5) + (neg:DWIH (match_dup 5)))]) + (set (match_dup 0) + (if_then_else:DWIH + (lt (reg:CCGOC FLAGS_REG) (const_int 0)) + (match_dup 2) + (match_dup 1))) + (set (match_dup 3) + (if_then_else:DWIH + (lt (reg:CCGOC FLAGS_REG) (const_int 0)) + (match_dup 5) + (match_dup 4)))] +{ + operands[1] = force_reg (mode, operands[1]); + operands[2] = gen_reg_rtx (mode); + + split_double_mode (mode, &operands[0], 3, &operands[0], &operands[3]); +}) + (define_insn_and_split "*abs2_1" [(set (match_operand:SWI 0 "register_operand") (abs:SWI @@ -10332,6 +10376,34 @@ operands[2] = gen_reg_rtx (mode); }) +(define_insn_and_split "*nabs2_1" + [(set (match_operand:SWI 0 "register_operand") + (neg:SWI + (abs:SWI + (match_operand:SWI 1 "general_operand")))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_CMOVE + && (mode != QImode || !TARGET_PARTIAL_REG_STALL) + && ix86_pre_reload_split ()" + "#" + "&& 1" + [(parallel + [(set (reg:CCGOC FLAGS_REG) + (compare:CCGOC + (neg:SWI (match_dup 1)) + (const_int 0))) + (set (match_dup 2) + (neg:SWI (match_dup 1)))]) + (set (match_dup 0) + (if_then_else:SWI + (lt (reg:CCGOC FLAGS_REG) (const_int 0)) + (match_dup 2) + (match_dup 1)))] +{ + operands[1] = force_reg (mode, operands[1]); + operands[2] = gen_reg_rtx (mode); +}) + (define_expand "tf2" [(set (match_operand:TF 0 "register_operand") (absneg:TF (match_operand:TF 1 "register_operand")))] -- cgit v1.1 From a86b3453fc6e29cf0e19916b01c393652d838d56 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 1 Jul 2021 15:18:50 +0200 Subject: Change the type of predicates to bool. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The patch was tested on many targets, but some fallout is expected. To fix the build error, simply change the function type from int to bool, as was done in the patch for several targets. 2021-07-01 Uroš Bizjak gcc/ * genpreds.c (write_predicate_subfunction): Change the type of written subfunction to bool. (write_one_predicate_function): Change the type of written function to bool. (write_tm_preds_h): Ditto. * recog.h (*insn_operand_predicate_fn): Change the type to bool. * recog.c (general_operand): Change the type to bool. (address_operand): Ditto. (register_operand): Ditto. (pmode_register_operand): Ditto. (scratch_operand): Ditto. (immediate_operand): Ditto. (const_int_operand): Ditto. (const_scalar_int_operand): Ditto. 
(const_double_operand): Ditto. (nonimmediate_operand): Ditto. (nonmemory_operand): Ditto. (push_operand): Ditto. (pop_operand): Ditto. (memory_operand): Ditto. (indirect_operand): Ditto. (ordered_comparison_operator): Ditto. (comparison_operator): Ditto. * config/i386/i386-expand.c (ix86_expand_sse_cmp): Change the type of indirect predicate function to bool. * config/rs6000/rs6000.c (easy_vector_constant): Change the type to bool. * config/mips/mips-protos.h (m16_based_address_p): Change the type of operand 3 to bool. --- gcc/config/i386/i386-expand.c | 2 +- gcc/config/mips/mips-protos.h | 2 +- gcc/config/rs6000/rs6000.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index e9763eb..76d6afd 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -3571,7 +3571,7 @@ ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1, cmp_op0 = force_reg (cmp_ops_mode, cmp_op0); - int (*op1_predicate)(rtx, machine_mode) + bool (*op1_predicate)(rtx, machine_mode) = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand; if (!op1_predicate (cmp_op1, cmp_ops_mode)) diff --git a/gcc/config/mips/mips-protos.h b/gcc/config/mips/mips-protos.h index 2cf4ed5..51b82b1 100644 --- a/gcc/config/mips/mips-protos.h +++ b/gcc/config/mips/mips-protos.h @@ -366,7 +366,7 @@ extern bool umips_12bit_offset_address_p (rtx, machine_mode); extern bool mips_9bit_offset_address_p (rtx, machine_mode); extern bool lwsp_swsp_address_p (rtx, machine_mode); extern bool m16_based_address_p (rtx, machine_mode, - int (*)(rtx_def*, machine_mode)); + bool (*)(rtx_def*, machine_mode)); extern rtx mips_expand_thread_pointer (rtx); extern void mips16_expand_get_fcsr (rtx); extern void mips16_expand_set_fcsr (rtx); diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 075c156..f3e5f95 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -1146,7 +1146,7 @@ static bool set_to_load_agen (rtx_insn *,rtx_insn *); static bool insn_terminates_group_p (rtx_insn *, enum group_termination); static bool insn_must_be_first_in_group (rtx_insn *); static bool insn_must_be_last_in_group (rtx_insn *); -int easy_vector_constant (rtx, machine_mode); +bool easy_vector_constant (rtx, machine_mode); static rtx rs6000_debug_legitimize_address (rtx, rtx, machine_mode); static rtx rs6000_legitimize_tls_address (rtx, enum tls_model); #if TARGET_MACHO -- cgit v1.1 From d63454815de3b93331025bd990efdad5296ae706 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 1 Jul 2021 16:57:57 +0200 Subject: i386: Return true/false instead of 1/0 from predicates. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No functional changes. 2021-07-01 Uroš Bizjak gcc/ * config/i386/predicates.md (ix86_endbr_immediate_operand): Return true/false instead of 1/0. (movq_parallel): Ditto. --- gcc/config/i386/predicates.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index e7a8968..c4b35c8 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -145,16 +145,16 @@ unsigned HOST_WIDE_INT val = TARGET_64BIT ? 0xfa1e0ff3 : 0xfb1e0ff3; if (imm == val) - return 1; + return true; /* NB: Encoding is byte based. 
*/ if (TARGET_64BIT) for (; imm >= val; imm >>= 8) if (imm == val) - return 1; + return true; } - return 0; + return false; }) ;; Return true if VALUE can be stored in a sign extended immediate field. @@ -1559,15 +1559,15 @@ unsigned HOST_WIDE_INT ei; if (!CONST_INT_P (er)) - return 0; + return false; ei = INTVAL (er); if (i < nelt2 && ei != i) - return 0; + return false; if (i >= nelt2 && (ei < nelt || ei >= nelt << 1)) - return 0; + return false; } - return 1; + return true; }) ;; Return true if OP is a vzeroall operation, known to be a PARALLEL. -- cgit v1.1 From edafb35bdadf309ebb9d1eddc5549f9e1ad49c09 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Wed, 2 Jun 2021 07:15:45 -0700 Subject: x86: Convert CONST_WIDE_INT/CONST_VECTOR to broadcast 1. Update move expanders to convert the CONST_WIDE_INT and CONST_VECTOR operands to vector broadcast from an integer with AVX. 2. Add ix86_gen_scratch_sse_rtx to return a scratch SSE register which won't increase stack alignment requirement and blocks transformation by the combine pass. A small benchmark: https://gitlab.com/x86-benchmarks/microbenchmark/-/tree/memset/broadcast shows that broadcast is a little bit faster on Intel Core i7-8559U: $ make gcc -g -I. -O2 -c -o test.o test.c gcc -g -c -o memory.o memory.S gcc -g -c -o broadcast.o broadcast.S gcc -g -c -o vec_dup_sse2.o vec_dup_sse2.S gcc -o test test.o memory.o broadcast.o vec_dup_sse2.o ./test memory : 147215 broadcast : 121213 vec_dup_sse2: 171366 $ broadcast is also smaller: $ size memory.o broadcast.o text data bss dec hex filename 132 0 0 132 84 memory.o 122 0 0 122 7a broadcast.o $ 3. Update PR 87767 tests to expect integer broadcast instead of broadcast from memory. 4. Update avx512f_cond_move.c to expect integer broadcast. A small benchmark: https://gitlab.com/x86-benchmarks/microbenchmark/-/tree/vpaddd/broadcast shows that integer broadcast is faster than embedded memory broadcast: $ make gcc -g -I. -O2 -march=skylake-avx512 -c -o test.o test.c gcc -g -c -o memory.o memory.S gcc -g -c -o broadcast.o broadcast.S gcc -o test test.o memory.o broadcast.o ./test memory : 425538 broadcast : 375260 $ gcc/ PR target/100865 * config/i386/i386-expand.c (ix86_expand_vector_init_duplicate): New prototype. (ix86_byte_broadcast): New function. (ix86_convert_const_wide_int_to_broadcast): Likewise. (ix86_expand_move): Convert CONST_WIDE_INT to broadcast if mode size is 16 bytes or bigger. (ix86_broadcast_from_integer_constant): New function. (ix86_expand_vector_move): Convert CONST_WIDE_INT and CONST_VECTOR to broadcast if mode size is 16 bytes or bigger. * config/i386/i386-protos.h (ix86_gen_scratch_sse_rtx): New prototype. * config/i386/i386.c (ix86_gen_scratch_sse_rtx): New function. gcc/testsuite/ PR target/100865 * gcc.target/i386/avx512f-broadcast-pr87767-1.c: Expect integer broadcast. * gcc.target/i386/avx512f-broadcast-pr87767-5.c: Likewise. * gcc.target/i386/avx512vl-broadcast-pr87767-1.c: Likewise. * gcc.target/i386/avx512vl-broadcast-pr87767-5.c: Likewise. * gcc.target/i386/avx512f_cond_move.c: Also pass -mprefer-vector-width=512 and expect integer broadcast. * gcc.target/i386/pr100865-1.c: New test. * gcc.target/i386/pr100865-2.c: Likewise. * gcc.target/i386/pr100865-3.c: Likewise. * gcc.target/i386/pr100865-4a.c: Likewise. * gcc.target/i386/pr100865-4b.c: Likewise. * gcc.target/i386/pr100865-5a.c: Likewise. * gcc.target/i386/pr100865-5b.c: Likewise. * gcc.target/i386/pr100865-6a.c: Likewise. * gcc.target/i386/pr100865-6b.c: Likewise. * gcc.target/i386/pr100865-6c.c: Likewise. 
* gcc.target/i386/pr100865-7a.c: Likewise. * gcc.target/i386/pr100865-7b.c: Likewise. * gcc.target/i386/pr100865-7c.c: Likewise. * gcc.target/i386/pr100865-8a.c: Likewise. * gcc.target/i386/pr100865-8b.c: Likewise. * gcc.target/i386/pr100865-8c.c: Likewise. * gcc.target/i386/pr100865-9a.c: Likewise. * gcc.target/i386/pr100865-9b.c: Likewise. * gcc.target/i386/pr100865-9c.c: Likewise. * gcc.target/i386/pr100865-10a.c: Likewise. * gcc.target/i386/pr100865-10b.c: Likewise. * gcc.target/i386/pr100865-11a.c: Likewise. * gcc.target/i386/pr100865-11b.c: Likewise. * gcc.target/i386/pr100865-11c.c: Likewise. * gcc.target/i386/pr100865-12a.c: Likewise. * gcc.target/i386/pr100865-12b.c: Likewise. * gcc.target/i386/pr100865-12c.c: Likewise. --- gcc/config/i386/i386-expand.c | 194 +++++++++++++++++++++++++++++++++++++++--- gcc/config/i386/i386-protos.h | 2 + gcc/config/i386/i386.c | 13 +++ 3 files changed, 197 insertions(+), 12 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 76d6afd..0738141 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -93,6 +93,9 @@ along with GCC; see the file COPYING3. If not see #include "i386-builtins.h" #include "i386-expand.h" +static bool ix86_expand_vector_init_duplicate (bool, machine_mode, rtx, + rtx); + /* Split one or more double-mode RTL references into pairs of half-mode references. The RTL can be REG, offsettable MEM, integer constant, or CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to @@ -190,6 +193,82 @@ ix86_expand_clear (rtx dest) emit_insn (tmp); } +/* Return true if V can be broadcasted from an integer of WIDTH bits + which is returned in VAL_BROADCAST. Otherwise, return false. */ + +static bool +ix86_broadcast (HOST_WIDE_INT v, unsigned int width, + HOST_WIDE_INT &val_broadcast) +{ + wide_int val = wi::uhwi (v, HOST_BITS_PER_WIDE_INT); + val_broadcast = wi::extract_uhwi (val, 0, width); + for (unsigned int i = width; i < HOST_BITS_PER_WIDE_INT; i += width) + { + HOST_WIDE_INT each = wi::extract_uhwi (val, i, width); + if (val_broadcast != each) + return false; + } + val_broadcast = sext_hwi (val_broadcast, width); + return true; +} + +/* Convert the CONST_WIDE_INT operand OP to broadcast in MODE. */ + +static rtx +ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op) +{ + /* Don't use integer vector broadcast if we can't move from GPR to SSE + register directly. */ + if (!TARGET_INTER_UNIT_MOVES_TO_VEC) + return nullptr; + + /* Convert CONST_WIDE_INT to a non-standard SSE constant integer + broadcast only if vector broadcast is available. */ + if (!TARGET_AVX + || !CONST_WIDE_INT_P (op) + || standard_sse_constant_p (op, mode)) + return nullptr; + + HOST_WIDE_INT val = CONST_WIDE_INT_ELT (op, 0); + HOST_WIDE_INT val_broadcast; + scalar_int_mode broadcast_mode; + if (TARGET_AVX2 + && ix86_broadcast (val, GET_MODE_BITSIZE (QImode), + val_broadcast)) + broadcast_mode = QImode; + else if (TARGET_AVX2 + && ix86_broadcast (val, GET_MODE_BITSIZE (HImode), + val_broadcast)) + broadcast_mode = HImode; + else if (ix86_broadcast (val, GET_MODE_BITSIZE (SImode), + val_broadcast)) + broadcast_mode = SImode; + else if (TARGET_64BIT + && ix86_broadcast (val, GET_MODE_BITSIZE (DImode), + val_broadcast)) + broadcast_mode = DImode; + else + return nullptr; + + /* Check if OP can be broadcasted from VAL. 
*/ + for (int i = 1; i < CONST_WIDE_INT_NUNITS (op); i++) + if (val != CONST_WIDE_INT_ELT (op, i)) + return nullptr; + + unsigned int nunits = (GET_MODE_SIZE (mode) + / GET_MODE_SIZE (broadcast_mode)); + machine_mode vector_mode; + if (!mode_for_vector (broadcast_mode, nunits).exists (&vector_mode)) + gcc_unreachable (); + rtx target = ix86_gen_scratch_sse_rtx (vector_mode); + bool ok = ix86_expand_vector_init_duplicate (false, vector_mode, + target, + GEN_INT (val_broadcast)); + gcc_assert (ok); + target = lowpart_subreg (mode, target, vector_mode); + return target; +} + void ix86_expand_move (machine_mode mode, rtx operands[]) { @@ -347,20 +426,29 @@ ix86_expand_move (machine_mode mode, rtx operands[]) && optimize) op1 = copy_to_mode_reg (mode, op1); - if (can_create_pseudo_p () - && CONST_DOUBLE_P (op1)) + if (can_create_pseudo_p ()) { - /* If we are loading a floating point constant to a register, - force the value to memory now, since we'll get better code - out the back end. */ + if (CONST_DOUBLE_P (op1)) + { + /* If we are loading a floating point constant to a + register, force the value to memory now, since we'll + get better code out the back end. */ - op1 = validize_mem (force_const_mem (mode, op1)); - if (!register_operand (op0, mode)) + op1 = validize_mem (force_const_mem (mode, op1)); + if (!register_operand (op0, mode)) + { + rtx temp = gen_reg_rtx (mode); + emit_insn (gen_rtx_SET (temp, op1)); + emit_move_insn (op0, temp); + return; + } + } + else if (GET_MODE_SIZE (mode) >= 16) { - rtx temp = gen_reg_rtx (mode); - emit_insn (gen_rtx_SET (temp, op1)); - emit_move_insn (op0, temp); - return; + rtx tmp = ix86_convert_const_wide_int_to_broadcast + (GET_MODE (op0), op1); + if (tmp != nullptr) + op1 = tmp; } } } @@ -368,6 +456,59 @@ ix86_expand_move (machine_mode mode, rtx operands[]) emit_insn (gen_rtx_SET (op0, op1)); } +static rtx +ix86_broadcast_from_integer_constant (machine_mode mode, rtx op) +{ + int nunits = GET_MODE_NUNITS (mode); + if (nunits < 2) + return nullptr; + + /* Don't use integer vector broadcast if we can't move from GPR to SSE + register directly. */ + if (!TARGET_INTER_UNIT_MOVES_TO_VEC) + return nullptr; + + /* Convert CONST_VECTOR to a non-standard SSE constant integer + broadcast only if vector broadcast is available. */ + if (!(TARGET_AVX2 + || (TARGET_AVX + && (GET_MODE_INNER (mode) == SImode + || GET_MODE_INNER (mode) == DImode))) + || standard_sse_constant_p (op, mode)) + return nullptr; + + /* Don't broadcast from a 64-bit integer constant in 32-bit mode. */ + if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT) + return nullptr; + + rtx constant = get_pool_constant (XEXP (op, 0)); + if (GET_CODE (constant) != CONST_VECTOR) + return nullptr; + + /* There could be some rtx like + (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1"))) + but with "*.LC1" refer to V2DI constant vector. */ + if (GET_MODE (constant) != mode) + { + constant = simplify_subreg (mode, constant, GET_MODE (constant), + 0); + if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR) + return nullptr; + } + + rtx first = XVECEXP (constant, 0, 0); + + for (int i = 1; i < nunits; ++i) + { + rtx tmp = XVECEXP (constant, 0, i); + /* Vector duplicate value. 
*/ + if (!rtx_equal_p (tmp, first)) + return nullptr; + } + + return first; +} + void ix86_expand_vector_move (machine_mode mode, rtx operands[]) { @@ -407,7 +548,36 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[]) op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1)); } else - op1 = validize_mem (force_const_mem (mode, op1)); + { + machine_mode mode = GET_MODE (op0); + rtx tmp = ix86_convert_const_wide_int_to_broadcast + (mode, op1); + if (tmp == nullptr) + op1 = validize_mem (force_const_mem (mode, op1)); + else + op1 = tmp; + } + } + + if (can_create_pseudo_p () + && GET_MODE_SIZE (mode) >= 16 + && GET_MODE_CLASS (mode) == MODE_VECTOR_INT + && (MEM_P (op1) + && SYMBOL_REF_P (XEXP (op1, 0)) + && CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0)))) + { + rtx first = ix86_broadcast_from_integer_constant (mode, op1); + if (first != nullptr) + { + /* Broadcast to XMM/YMM/ZMM register from an integer + constant. */ + op1 = ix86_gen_scratch_sse_rtx (mode); + bool ok = ix86_expand_vector_init_duplicate (false, mode, + op1, first); + gcc_assert (ok); + emit_move_insn (op0, op1); + return; + } } /* We need to check memory alignment for SSE mode since attribute diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 65fc307..71745b9 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -50,6 +50,8 @@ extern void ix86_reset_previous_fndecl (void); extern bool ix86_using_red_zone (void); +extern rtx ix86_gen_scratch_sse_rtx (machine_mode); + extern unsigned int ix86_regmode_natural_size (machine_mode); #ifdef RTX_CODE extern int standard_80387_constant_p (rtx); diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index a93128f..2fbaae7 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -23163,6 +23163,19 @@ ix86_optab_supported_p (int op, machine_mode mode1, machine_mode, } } +/* Return a scratch register in MODE for vector load and store. */ + +rtx +ix86_gen_scratch_sse_rtx (machine_mode mode) +{ + if (TARGET_SSE) + return gen_rtx_REG (mode, (TARGET_64BIT + ? LAST_REX_SSE_REG + : LAST_SSE_REG)); + else + return gen_reg_rtx (mode); +} + /* Address space support. This is not "far pointers" in the 16-bit sense, but an easy way -- cgit v1.1 From 51c30227fa6c739e2d367abf327b3b7dfd86dd46 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Mon, 7 Jun 2021 14:23:04 -0700 Subject: x86: Add vec_duplicate expander Add vec_duplicate expander for SSE2 if we can move from GPR to SSE register directly. * config/i386/i386-expand.c (ix86_expand_vector_init_duplicate): Make it global. * config/i386/i386-protos.h (ix86_expand_vector_init_duplicate): New prototype. * config/i386/sse.md (INT_BROADCAST_MODE): New mode iterator. (vec_duplicate): New expander. --- gcc/config/i386/i386-expand.c | 5 +---- gcc/config/i386/i386-protos.h | 2 ++ gcc/config/i386/sse.md | 31 +++++++++++++++++++++++++++++++ 3 files changed, 34 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 0738141..5c9170e 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -93,9 +93,6 @@ along with GCC; see the file COPYING3. If not see #include "i386-builtins.h" #include "i386-expand.h" -static bool ix86_expand_vector_init_duplicate (bool, machine_mode, rtx, - rtx); - /* Split one or more double-mode RTL references into pairs of half-mode references. The RTL can be REG, offsettable MEM, integer constant, or CONST_DOUBLE. 
"operands" is a pointer to an array of double-mode RTLs to @@ -13909,7 +13906,7 @@ static bool expand_vec_perm_1 (struct expand_vec_perm_d *d); /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector with all elements equal to VAR. Return true if successful. */ -static bool +bool ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode, rtx target, rtx val) { diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 71745b9..51376fc 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -258,6 +258,8 @@ extern void ix86_expand_mul_widen_hilo (rtx, rtx, rtx, bool, bool); extern void ix86_expand_sse2_mulv4si3 (rtx, rtx, rtx); extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx); extern void ix86_expand_sse2_abs (rtx, rtx); +extern bool ix86_expand_vector_init_duplicate (bool, machine_mode, rtx, + rtx); /* In i386-c.c */ extern void ix86_target_macros (void); diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index d3f5a74..f0b450a 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -24814,3 +24814,34 @@ "TARGET_WIDEKL" "aes\t{%0}" [(set_attr "type" "other")]) + +;; Modes handled by broadcast patterns. NB: Allow V64QI and V32HI with +;; TARGET_AVX512F since ix86_expand_vector_init_duplicate can expand +;; without TARGET_AVX512BW which is used by memset vector broadcast +;; expander to XI with: +;; vmovd %edi, %xmm15 +;; vpbroadcastb %xmm15, %ymm15 +;; vinserti64x4 $0x1, %ymm15, %zmm15, %zmm15 + +(define_mode_iterator INT_BROADCAST_MODE + [(V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX") V16QI + (V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX") V8HI + (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI + (V8DI "TARGET_AVX512F && TARGET_64BIT") + (V4DI "TARGET_AVX && TARGET_64BIT") (V2DI "TARGET_64BIT")]) + +;; Broadcast from an integer. NB: Enable broadcast only if we can move +;; from GPR to SSE register directly. +(define_expand "vec_duplicate" + [(set (match_operand:INT_BROADCAST_MODE 0 "register_operand") + (vec_duplicate:INT_BROADCAST_MODE + (match_operand: 1 "nonimmediate_operand")))] + "TARGET_SSE2 && TARGET_INTER_UNIT_MOVES_TO_VEC" +{ + if (!ix86_expand_vector_init_duplicate (false, + GET_MODE (operands[0]), + operands[0], + operands[1])) + gcc_unreachable (); + DONE; +}) -- cgit v1.1 From cc8453012f75dc6dbd20bf3a94c4819a2bff46db Mon Sep 17 00:00:00 2001 From: Iain Sandoe Date: Thu, 1 Jul 2021 15:02:43 +0100 Subject: Darwin: Define a suitable section name for CTF [PR101283] This is a placeholder name ahead of any CTF implementation on LLVM (which sets Darwin ABI). Ideally, we would get agreement on this choice (or any replacement) before GCC12 is shipped. PR debug/101283 - Several tests fail on Darwin with -gctf PR debug/101283 gcc/ChangeLog: * config/darwin.h (CTF_INFO_SECTION_NAME): New. --- gcc/config/darwin.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/darwin.h b/gcc/config/darwin.h index d2b2c14..b7c3af3 100644 --- a/gcc/config/darwin.h +++ b/gcc/config/darwin.h @@ -1115,4 +1115,8 @@ extern void darwin_driver_init (unsigned int *,struct cl_decoded_option **); # endif #endif +/* CTF support. */ +#undef CTF_INFO_SECTION_NAME +#define CTF_INFO_SECTION_NAME "__CTF,__ctf,regular,debug" + #endif /* CONFIG_DARWIN_H */ -- cgit v1.1 From b97486f465ff7ee2ed1a5305bcc211563891c37e Mon Sep 17 00:00:00 2001 From: Michael Meissner Date: Thu, 1 Jul 2021 14:50:34 -0400 Subject: Add IEEE 128-bit fp conditional move on PowerPC. 
This patch adds the support for power10 IEEE 128-bit floating point conditional move and for automatically generating min/max. In this patch, I simplified things compared to previous patches. Instead of allowing any four of the modes to be used for the conditional move comparison and the move itself could use different modes, I restricted the conditional move to just the same mode. I.e. you can do: _Float128 a, b, c, d, e, r; r = (a == b) ? c : d; But you can't do: _Float128 c, d, r; double a, b; r = (a == b) ? c : d; or: _Float128 a, b; double c, d, r; r = (a == b) ? c : d; This eliminates a lot of the complexity of the code, because you don't have to worry about the sizes being different, and the IEEE 128-bit types being restricted to Altivec registers, while the SF/DF modes can use any VSX register. I did not modify the existing support that allowed conditional moves where SFmode operands are compared and DFmode operands are moved (and vice versa). I modified the test cases that I added to reflect this change. I have also fixed the test for not equal to use '!=' instead of '=='. 2021-07-01 Michael Meissner gcc/ * config/rs6000/rs6000.c (rs6000_maybe_emit_fp_cmove): Add IEEE 128-bit floating point conditional move support. (have_compare_and_set_mask): Add IEEE 128-bit floating point types. * config/rs6000/rs6000.md (movcc, IEEE128 iterator): New insn. (movcc_p10, IEEE128 iterator): New insn. (movcc_invert_p10, IEEE128 iterator): New insn. (fpmask, IEEE128 iterator): New insn. (xxsel, IEEE128 iterator): New insn. gcc/testsuite/ * gcc.target/powerpc/float128-cmove.c: New test. * gcc.target/powerpc/float128-minmax-3.c: New test. --- gcc/config/rs6000/rs6000.c | 32 ++++++++++++- gcc/config/rs6000/rs6000.md | 106 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 136 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index f3e5f95..9a5db63 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -15699,8 +15699,8 @@ rs6000_emit_vector_cond_expr (rtx dest, rtx op_true, rtx op_false, return 1; } -/* Possibly emit the xsmaxcdp and xsmincdp instructions to emit a maximum or - minimum with "C" semantics. +/* Possibly emit the xsmaxc{dp,qp} and xsminc{dp,qp} instructions to emit a + maximum or minimum with "C" semantics. Unless you use -ffast-math, you can't use these instructions to replace conditions that implicitly reverse the condition because the comparison @@ -15776,6 +15776,7 @@ rs6000_maybe_emit_fp_cmove (rtx dest, rtx op, rtx true_cond, rtx false_cond) enum rtx_code code = GET_CODE (op); rtx op0 = XEXP (op, 0); rtx op1 = XEXP (op, 1); + machine_mode compare_mode = GET_MODE (op0); machine_mode result_mode = GET_MODE (dest); rtx compare_rtx; rtx cmove_rtx; @@ -15784,6 +15785,29 @@ rs6000_maybe_emit_fp_cmove (rtx dest, rtx op, rtx true_cond, rtx false_cond) if (!can_create_pseudo_p ()) return 0; + /* We allow the comparison to be either SFmode/DFmode and the true/false + condition to be either SFmode/DFmode. I.e. we allow: + + float a, b; + double c, d, r; + + r = (a == b) ? c : d; + + and: + + double a, b; + float c, d, r; + + r = (a == b) ? c : d; + + but we don't allow intermixing the IEEE 128-bit floating point types with + the 32/64-bit scalar types. 
*/ + + if (!(compare_mode == result_mode + || (compare_mode == SFmode && result_mode == DFmode) + || (compare_mode == DFmode && result_mode == SFmode))) + return false; + switch (code) { case EQ: @@ -15836,6 +15860,10 @@ have_compare_and_set_mask (machine_mode mode) case E_DFmode: return TARGET_P9_MINMAX; + case E_KFmode: + case E_TFmode: + return TARGET_POWER10 && TARGET_FLOAT128_HW && FLOAT128_IEEE_P (mode); + default: break; } diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index abd825f..e84d031 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -5449,6 +5449,112 @@ "xxsel %x0,%x4,%x3,%x1" [(set_attr "type" "vecmove")]) +;; Support for ISA 3.1 IEEE 128-bit conditional move. The mode used in the +;; comparison must be the same as used in the move. +(define_expand "movcc" + [(set (match_operand:IEEE128 0 "gpc_reg_operand") + (if_then_else:IEEE128 (match_operand 1 "comparison_operator") + (match_operand:IEEE128 2 "gpc_reg_operand") + (match_operand:IEEE128 3 "gpc_reg_operand")))] + "TARGET_POWER10 && TARGET_FLOAT128_HW && FLOAT128_IEEE_P (mode)" +{ + if (rs6000_emit_cmove (operands[0], operands[1], operands[2], operands[3])) + DONE; + else + FAIL; +}) + +(define_insn_and_split "*movcc_p10" + [(set (match_operand:IEEE128 0 "altivec_register_operand" "=&v,v") + (if_then_else:IEEE128 + (match_operator:CCFP 1 "fpmask_comparison_operator" + [(match_operand:IEEE128 2 "altivec_register_operand" "v,v") + (match_operand:IEEE128 3 "altivec_register_operand" "v,v")]) + (match_operand:IEEE128 4 "altivec_register_operand" "v,v") + (match_operand:IEEE128 5 "altivec_register_operand" "v,v"))) + (clobber (match_scratch:V2DI 6 "=0,&v"))] + "TARGET_POWER10 && TARGET_FLOAT128_HW && FLOAT128_IEEE_P (mode)" + "#" + "&& 1" + [(set (match_dup 6) + (if_then_else:V2DI (match_dup 1) + (match_dup 7) + (match_dup 8))) + (set (match_dup 0) + (if_then_else:IEEE128 (ne (match_dup 6) + (match_dup 8)) + (match_dup 4) + (match_dup 5)))] +{ + if (GET_CODE (operands[6]) == SCRATCH) + operands[6] = gen_reg_rtx (V2DImode); + + operands[7] = CONSTM1_RTX (V2DImode); + operands[8] = CONST0_RTX (V2DImode); +} + [(set_attr "length" "8") + (set_attr "type" "vecperm")]) + +;; Handle inverting the fpmask comparisons. 
+(define_insn_and_split "*movcc_invert_p10" + [(set (match_operand:IEEE128 0 "altivec_register_operand" "=&v,v") + (if_then_else:IEEE128 + (match_operator:CCFP 1 "invert_fpmask_comparison_operator" + [(match_operand:IEEE128 2 "altivec_register_operand" "v,v") + (match_operand:IEEE128 3 "altivec_register_operand" "v,v")]) + (match_operand:IEEE128 4 "altivec_register_operand" "v,v") + (match_operand:IEEE128 5 "altivec_register_operand" "v,v"))) + (clobber (match_scratch:V2DI 6 "=0,&v"))] + "TARGET_POWER10 && TARGET_FLOAT128_HW && FLOAT128_IEEE_P (mode)" + "#" + "&& 1" + [(set (match_dup 6) + (if_then_else:V2DI (match_dup 9) + (match_dup 7) + (match_dup 8))) + (set (match_dup 0) + (if_then_else:IEEE128 (ne (match_dup 6) + (match_dup 8)) + (match_dup 5) + (match_dup 4)))] +{ + rtx op1 = operands[1]; + enum rtx_code cond = reverse_condition_maybe_unordered (GET_CODE (op1)); + + if (GET_CODE (operands[6]) == SCRATCH) + operands[6] = gen_reg_rtx (V2DImode); + + operands[7] = CONSTM1_RTX (V2DImode); + operands[8] = CONST0_RTX (V2DImode); + + operands[9] = gen_rtx_fmt_ee (cond, CCFPmode, operands[2], operands[3]); +} + [(set_attr "length" "8") + (set_attr "type" "vecperm")]) + +(define_insn "*fpmask" + [(set (match_operand:V2DI 0 "altivec_register_operand" "=v") + (if_then_else:V2DI + (match_operator:CCFP 1 "fpmask_comparison_operator" + [(match_operand:IEEE128 2 "altivec_register_operand" "v") + (match_operand:IEEE128 3 "altivec_register_operand" "v")]) + (match_operand:V2DI 4 "all_ones_constant" "") + (match_operand:V2DI 5 "zero_constant" "")))] + "TARGET_POWER10 && TARGET_FLOAT128_HW && FLOAT128_IEEE_P (mode)" + "xscmp%V1qp %0,%2,%3" + [(set_attr "type" "fpcompare")]) + +(define_insn "*xxsel" + [(set (match_operand:IEEE128 0 "altivec_register_operand" "=v") + (if_then_else:IEEE128 + (ne (match_operand:V2DI 1 "altivec_register_operand" "v") + (match_operand:V2DI 2 "zero_constant" "")) + (match_operand:IEEE128 3 "altivec_register_operand" "v") + (match_operand:IEEE128 4 "altivec_register_operand" "v")))] + "TARGET_POWER10 && TARGET_FLOAT128_HW && FLOAT128_IEEE_P (mode)" + "xxsel %x0,%x4,%x3,%x1" + [(set_attr "type" "vecmove")]) + ;; Conversions to and from floating-point. -- cgit v1.1 From 73494401241b183ca188954a035734fcc53d97de Mon Sep 17 00:00:00 2001 From: liuhongt Date: Wed, 30 Jun 2021 17:10:44 +0800 Subject: Fix typo in standard pattern name of trunc2. gcc/ChangeLog * config/i386/sse.md (trunc2): Refined to .. (trunc2): this. --- gcc/config/i386/sse.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index f0b450a..bcf1605 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -11199,12 +11199,14 @@ (define_mode_iterator PMOV_SRC_MODE_4 [V4DI V2DI V4SI]) (define_mode_attr pmov_dst_4 [(V4DI "V4HI") (V2DI "V2HI") (V4SI "V4HI")]) +(define_mode_attr pmov_dst_4_lower + [(V4DI "v4hi") (V2DI "v2hi") (V4SI "v4hi")]) (define_mode_attr pmov_dst_zeroed_4 [(V4DI "V4HI") (V2DI "V6HI") (V4SI "V4HI")]) (define_mode_attr pmov_suff_4 [(V4DI "qw") (V2DI "qw") (V4SI "dw")]) -(define_expand "trunc2" +(define_expand "trunc2" [(set (match_operand: 0 "register_operand") (truncate: (match_operand:PMOV_SRC_MODE_4 1 "register_operand")))] -- cgit v1.1 From 01d402c5e0ac1ddf5618bbe316b50067625fda46 Mon Sep 17 00:00:00 2001 From: Eugene Rozenfeld Date: Thu, 1 Jul 2021 16:21:36 -0700 Subject: Update gen_autofdo_event.py and gcc-auto-profile. 
gen_autofdo_event.py was stumbling on models with stepping so I updated the script to handle this case similar to the code in https://github.com/andikleen/pmu-tools/blob/c6a5f63aede19def8886d6a8b74d7a55c38ca947/event_download.py The second change was to tolerate cases when the CPU supports PEBS but the perf command with /p fails. This can happen in, e.g., a virtual machine. I regenerated gcc-auto-profile using the updated script. contrib/ChangeLog: * gen_autofdo_event.py: handle stepping, non-working PEBS gcc/ChangeLog: * config/i386/gcc-auto-profile: regenerate --- gcc/config/i386/gcc-auto-profile | 41 +++++++++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 7 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/gcc-auto-profile b/gcc/config/i386/gcc-auto-profile index 5da5c63..56f64cb 100755 --- a/gcc/config/i386/gcc-auto-profile +++ b/gcc/config/i386/gcc-auto-profile @@ -1,7 +1,7 @@ #!/bin/sh -# profile workload for gcc profile feedback (autofdo) using Linux perf -# auto generated. to regenerate for new CPUs run -# contrib/gen_autofdo_event.py --shell --all in gcc source +# Profile workload for gcc profile feedback (autofdo) using Linux perf. +# Auto generated. To regenerate for new CPUs run +# contrib/gen_autofdo_event.py --script --all in gcc source # usages: # gcc-auto-profile program (profile program and children) @@ -10,7 +10,7 @@ # gcc-auto-profile --kernel -a sleep X (profile kernel) # gcc-auto-profile --all -a sleep X (profile kernel and user space) -# identify branches taken event for CPU +# Identify branches taken event for CPU. # FLAGS=u @@ -37,7 +37,12 @@ case `egrep -q "^cpu family\s*: 6" /proc/cpuinfo && egrep "^model\s*:" /proc/cpuinfo | head -n1` in model*:\ 55|\ model*:\ 77|\ -model*:\ 76) E="cpu/event=0xC4,umask=0xFE/p$FLAGS" ;; +model*:\ 76|\ +model*:\ 92|\ +model*:\ 95|\ +model*:\ 87|\ +model*:\ 133|\ +model*:\ 122) E="cpu/event=0xC4,umask=0xFE/p$FLAGS" ;; model*:\ 42|\ model*:\ 45|\ model*:\ 58|\ @@ -48,9 +53,16 @@ model*:\ 70|\ model*:\ 63|\ model*:\ 61|\ model*:\ 71|\ +model*:\ 79|\ model*:\ 86|\ model*:\ 78|\ -model*:\ 94) E="cpu/event=0xC4,umask=0x20/p$FLAGS" ;; +model*:\ 94|\ +model*:\ 142|\ +model*:\ 158|\ +model*:\ 165|\ +model*:\ 166|\ +model*:\ 85|\ +model*:\ 85) E="cpu/event=0xC4,umask=0x20/p$FLAGS" ;; model*:\ 46|\ model*:\ 30|\ model*:\ 31|\ @@ -63,8 +75,23 @@ model*:\ 38|\ model*:\ 39|\ model*:\ 54|\ model*:\ 53) E="cpu/event=0x88,umask=0x41/p$FLAGS" ;; +model*:\ 126|\ +model*:\ 140|\ +model*:\ 141|\ +model*:\ 106|\ +model*:\ 108) E="cpu/event=0xc4,umask=0x20/p$FLAGS" ;; *) echo >&2 "Unknown CPU. Run contrib/gen_autofdo_event.py --all --script to update script." exit 1 ;; esac -exec perf record -e $E -b "$@" +set -x +if ! perf record -e $E -b "$@" ; then + # PEBS may not actually be working even if the processor supports it + # (e.g., in a virtual machine). Trying to run without /p. + set +x + echo >&2 "Retrying without /p." + E="$(echo "${E}" | sed -e 's/\/p/\//')" + set -x + exec perf record -e $E -b "$@" + set +x +fi -- cgit v1.1 From 1aeefa5720a71e622e2f26bf10ec8e7ecbd76f4c Mon Sep 17 00:00:00 2001 From: Hongyu Wang Date: Wed, 30 Jun 2021 14:38:31 +0800 Subject: Clear odata for aes(enc|dec)(wide)?kl intrinsics when ZF is set. For Keylocker aesenc/aesdec intrinsics, current implementation moves idata to odata unconditionally, which causes safety issue when the instruction meets runtime error. So we add a branch to clear odata when ZF is set after instruction exectution. 
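In C terms the intended behaviour is roughly the sketch below.  It is only an
illustration: run_aesenc128kl is a placeholder standing in for the keylocker
instruction itself, and the polarity of the returned success flag is an
assumption, not the documented builtin interface.

  #include <immintrin.h>

  /* Placeholder for the AESENC128KL instruction; returns the ZF flag.  */
  extern unsigned char run_aesenc128kl (__m128i *idata, const void *handle);

  unsigned char
  aesenc_kl_sketch (__m128i *odata, __m128i idata, const void *handle)
  {
    unsigned char zf = run_aesenc128kl (&idata, handle);
    if (__builtin_expect (zf, 0))
      /* Runtime error: clear the output instead of leaking idata.  */
      *odata = _mm_setzero_si128 ();
    else
      *odata = idata;
    /* Assumed success flag: nonzero when the operation succeeded.  */
    return !zf;
  }

The error path is emitted as the cold branch (the OK path is predicted at 90%),
so the common case remains a straight fallthrough.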
gcc/ChangeLog: * config/i386/i386-expand.c (ix86_expand_builtin): Add branch to clear odata when ZF is set for asedecenc_expand and wideaesdecenc_expand. gcc/testsuite/ChangeLog: * gcc.target/i386/keylocker-aesdec128kl.c: Update test. * gcc.target/i386/keylocker-aesdec256kl.c: Likewise. * gcc.target/i386/keylocker-aesdecwide128kl.c: Likewise. * gcc.target/i386/keylocker-aesdecwide256kl.c: Likewise. * gcc.target/i386/keylocker-aesenc128kl.c: Likewise. * gcc.target/i386/keylocker-aesenc256kl.c: Likewise. * gcc.target/i386/keylocker-aesencwide128kl.c: Likewise. * gcc.target/i386/keylocker-aesencwide256kl.c: Likewise. --- gcc/config/i386/i386-expand.c | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 5c9170e..a63319a 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -11753,10 +11753,24 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget, if (target == 0) target = gen_reg_rtx (QImode); - pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCZmode, FLAGS_REG), - const0_rtx); - emit_insn (gen_rtx_SET (target, pat)); + /* NB: For aesenc/aesdec keylocker insn, ZF will be set when runtime + error occurs. Then the output should be cleared for safety. */ + rtx_code_label *ok_label; + rtx tmp; + + tmp = gen_rtx_REG (CCZmode, FLAGS_REG); + pat = gen_rtx_EQ (QImode, tmp, const0_rtx); + ok_label = gen_label_rtx (); + emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp), + true, ok_label); + /* Usually the runtime error seldom occur, so predict OK path as + hotspot to optimize it as fallthrough block. */ + predict_jump (REG_BR_PROB_BASE * 90 / 100); + + emit_insn (gen_rtx_SET (op1, const0_rtx)); + emit_label (ok_label); + emit_insn (gen_rtx_SET (target, pat)); emit_insn (gen_rtx_SET (op0, op1)); return target; @@ -11811,8 +11825,17 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget, if (target == 0) target = gen_reg_rtx (QImode); - pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCZmode, FLAGS_REG), - const0_rtx); + tmp = gen_rtx_REG (CCZmode, FLAGS_REG); + pat = gen_rtx_EQ (QImode, tmp, const0_rtx); + ok_label = gen_label_rtx (); + emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp), + true, ok_label); + predict_jump (REG_BR_PROB_BASE * 90 / 100); + + for (i = 0; i < 8; i++) + emit_insn (gen_rtx_SET (xmm_regs[i], const0_rtx)); + + emit_label (ok_label); emit_insn (gen_rtx_SET (target, pat)); for (i = 0; i < 8; i++) -- cgit v1.1 From f7cad1a0ffe9f003ec347521dfd33f320f4c2b04 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Fri, 2 Jul 2021 10:06:56 +0200 Subject: i386: Punt on broadcasts from TImode integers [PR101286] ix86_expand_vector_init_duplicate doesn't handle TImode -> V2TImode or TImode -> V4TImode broadcasts, so I think we should punt on TImode inner mode in ix86_broadcast_from_integer_constant, otherwise we ICE in ix86_expand_vector_move when ix86_broadcast_from_integer_constant returns non-NULL and ix86_expand_vector_init_duplicate returns false. In theory TImode element broadcasts could be handled by some permutations, but I'm not sure it is worth it. 2021-07-02 Jakub Jelinek PR target/101286 * config/i386/i386-expand.c (ix86_broadcast_from_integer_constant): Return nullptr for TImode inner mode. * gcc.target/i386/avx2-pr101286.c: New test. 
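For illustration, the kind of source that ends up asking for a TImode element
broadcast looks like the reduced sketch below (an assumption for exposition;
it is not the committed avx2-pr101286.c testcase):

  typedef __int128 v2ti __attribute__ ((__vector_size__ (32)));

  v2ti
  broadcast_ti (void)
  {
    /* Both elements are identical, so the constant looks like a broadcast
       of a single TImode value; before this fix such a constant could be
       handed to ix86_expand_vector_init_duplicate, which does not handle
       TImode elements.  */
    return (v2ti) { 2, 2 };
  }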
--- gcc/config/i386/i386-expand.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index a63319a..b37642e 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -478,6 +478,9 @@ ix86_broadcast_from_integer_constant (machine_mode mode, rtx op) if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT) return nullptr; + if (GET_MODE_INNER (mode) == TImode) + return nullptr; + rtx constant = get_pool_constant (XEXP (op, 0)); if (GET_CODE (constant) != CONST_VECTOR) return nullptr; -- cgit v1.1 From 496e1d6a1f973b3952a37163441f9149501dfb26 Mon Sep 17 00:00:00 2001 From: Eric Botcazou Date: Fri, 2 Jul 2021 10:21:11 +0200 Subject: Change EH pointer encodings to PC relative on Windows A big difference between ELF and PE-COFF is that, with the latter, you can build position-independent executables or DLLs without generating PIC; as a matter of fact, flag_pic has historically been forced to 0 for 32-bit: /* Don't allow flag_pic to propagate since gas may produce invalid code otherwise. */ \ do { \ flag_pic = TARGET_64BIT ? 1 : 0; \ } while (0) The reason is that the linker builds a .reloc section that collects the absolute relocations in the generated binary, and the loader uses them to relocate it at load time if need be (e.g. if --dynamicbase is enabled). Up to binutils 2.35, the GNU linker didn't build the .reloc section for executables and defaulted to --enable-auto-image-base for DLLs, which means that DLLs had an essentially unique load address and, therefore, need not be relocated by the loader in most cases. With binutils 2.36 and later, the GNU linker builds a .reloc section for executables (thus making them PIE), --enable-auto-image-base is disabled and --dynamicbase is enabled by default, which means that essentially all the binaries are relocated at load time. This badly breaks the 32-bit compiler configured to use DWARF-2 EH because the loader corrupts the .eh_frame section when processing the relocations contained in the .reloc section. gcc/ * config/i386/i386.c (asm_preferred_eh_data_format): Always use the PIC encodings for PE-COFF targets. --- gcc/config/i386/i386.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 2fbaae7..cff2690 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -21930,10 +21930,12 @@ ix86_stack_protect_fail (void) After all, the relocation needed is the same as for the call insn. Whether or not a particular assembler allows us to enter such, I guess we'll have to see. */ + int asm_preferred_eh_data_format (int code, int global) { - if (flag_pic) + /* PE-COFF is effectively always -fPIC because of the .reloc section. */ + if (flag_pic || TARGET_PECOFF) { int type = DW_EH_PE_sdata8; if (!TARGET_64BIT @@ -21942,9 +21944,11 @@ asm_preferred_eh_data_format (int code, int global) type = DW_EH_PE_sdata4; return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type; } + if (ix86_cmodel == CM_SMALL || (ix86_cmodel == CM_MEDIUM && code)) return DW_EH_PE_udata4; + return DW_EH_PE_absptr; } -- cgit v1.1 From a6fef2e1b6d7e8cea0c0489496cc8f96391200c6 Mon Sep 17 00:00:00 2001 From: Jeff Law Date: Fri, 2 Jul 2021 10:31:31 -0400 Subject: Fix xstormy16 target specific fallout from recent int->bool changes gcc/ * config/stormy16/stormy16-protos.h (xstormy16_below_100_symbol): Change return type to a bool. (nonimmediate_nonstack_operand): Likewise. 
(xstormy16_splittable_below100_operand): Likewise. * config/stormy16/stormy16.c (xstormy16_below_100_symbol): Fix return type. (xstormy16_splittable_below100_operand): Likewise. --- gcc/config/stormy16/stormy16-protos.h | 6 +++--- gcc/config/stormy16/stormy16.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/stormy16/stormy16-protos.h b/gcc/config/stormy16/stormy16-protos.h index c81ea8c..d8010af 100644 --- a/gcc/config/stormy16/stormy16-protos.h +++ b/gcc/config/stormy16/stormy16-protos.h @@ -54,7 +54,7 @@ extern void xstormy16_expand_andqi3 (rtx *); #if defined (HAVE_MACHINE_MODES) && defined (RTX_CODE) extern void xstormy16_split_cbranch (machine_mode, rtx, rtx, rtx); extern int short_memory_operand (rtx, machine_mode); -extern int nonimmediate_nonstack_operand (rtx, machine_mode); +extern bool nonimmediate_nonstack_operand (rtx, machine_mode); extern enum reg_class xstormy16_secondary_reload_class (enum reg_class, machine_mode, rtx); extern void xstormy16_split_move (machine_mode, rtx, rtx); @@ -63,8 +63,8 @@ extern void xstormy16_expand_arith (machine_mode, enum rtx_code, rtx, rtx, rtx); extern const char * xstormy16_output_shift (machine_mode, enum rtx_code, rtx, rtx, rtx); -extern int xstormy16_below100_symbol (rtx, machine_mode); -extern int xstormy16_splittable_below100_operand (rtx, machine_mode); +extern bool xstormy16_below100_symbol (rtx, machine_mode); +extern bool xstormy16_splittable_below100_operand (rtx, machine_mode); extern bool xstormy16_legitimate_address_p (machine_mode, rtx, bool); #endif diff --git a/gcc/config/stormy16/stormy16.c b/gcc/config/stormy16/stormy16.c index fb7670f..92011fd 100644 --- a/gcc/config/stormy16/stormy16.c +++ b/gcc/config/stormy16/stormy16.c @@ -516,7 +516,7 @@ xstormy16_preferred_reload_class (rtx x, reg_class_t rclass) /* Predicate for symbols and addresses that reflect special 8-bit addressing. */ -int +bool xstormy16_below100_symbol (rtx x, machine_mode mode ATTRIBUTE_UNUSED) { @@ -542,7 +542,7 @@ xstormy16_below100_symbol (rtx x, /* Likewise, but only for non-volatile MEMs, for patterns where the MEM will get split into smaller sized accesses. */ -int +bool xstormy16_splittable_below100_operand (rtx x, machine_mode mode) { if (MEM_P (x) && MEM_VOLATILE_P (x)) -- cgit v1.1 From ef9cc434a476954b5ef3493955d4e668338990c2 Mon Sep 17 00:00:00 2001 From: Jeff Law Date: Fri, 2 Jul 2021 10:37:52 -0400 Subject: Fix frv target specific fallout from recent int->bool changes gcc/ChangeLog * config/frv/frv-protos.h (integer_register_operand): Adjust return type. (frv_load_operand, gpr_or_fpr_operand, gpr_no_subreg_operand): Likewise. (fpr_or_int6_operand, gpr_or_int_operand); Likewise. (gpr_or_int12_operand, gpr_or_int10_operand); Likewise. (move_source_operand, move_destination_operand): Likewise. (condexec_source_operand, condexec_dest_operand): Likewise. (lr_operand, gpr_or_memory_operand, fpr_or_memory_operand): Likewise. (reg_or_0_operand, fcc_operand, icc_operand, cc_operand): Likewise. (fcr_operand, icr_operand, cr_operand, call_operand): Likewise. (fpr_operand, even_reg_operand, odd_reg_operand): Likewise. (even_gpr_operand, odd_gpr_operand, quad_fpr_operand): Likewise. (even_fpr_operand, odd_fpr_operand): Likewise. (dbl_memory_one_insn_operand, dbl_memory_two_insn_operand): Likewise. (int12_operand, int6_operand, int5_operand, uint5_operand): Likewise. (uint4_operand, uint1_operand, int_2word_operand): Likewise (upper_int16_operand, uint16_operand, symbolic_operand): Likewise. 
(relational_operator, float_relational_operator): Likewise. (ccr_eqne_operator, minmax_operator): Likewise. (condexec_si_binary_operator, condexec_si_media_operator): Likewise. (condexec_si_divide_operator, condexec_si_unary_operator): Likewise. (condexec_sf_conv_operator, condexec_sf_add_operator): Likewise. (intop_compare_operator, acc_operand, even_acc_operand): Likewise. (quad_acc_operand, accg_operand): Likewise. --- gcc/config/frv/frv-protos.h | 118 ++++++++++++++++++++++---------------------- 1 file changed, 59 insertions(+), 59 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/frv/frv-protos.h b/gcc/config/frv/frv-protos.h index 8d79d73..29d4425 100644 --- a/gcc/config/frv/frv-protos.h +++ b/gcc/config/frv/frv-protos.h @@ -89,73 +89,73 @@ extern int frv_adjust_field_align (tree, int); #endif #ifdef RTX_CODE -extern int integer_register_operand (rtx, machine_mode); -extern int frv_load_operand (rtx, machine_mode); -extern int gpr_or_fpr_operand (rtx, machine_mode); -extern int gpr_no_subreg_operand (rtx, machine_mode); +extern bool integer_register_operand (rtx, machine_mode); +extern bool frv_load_operand (rtx, machine_mode); +extern bool gpr_or_fpr_operand (rtx, machine_mode); +extern bool gpr_no_subreg_operand (rtx, machine_mode); extern int gpr_or_int6_operand (rtx, machine_mode); -extern int fpr_or_int6_operand (rtx, machine_mode); -extern int gpr_or_int_operand (rtx, machine_mode); -extern int gpr_or_int12_operand (rtx, machine_mode); -extern int gpr_fpr_or_int12_operand (rtx, machine_mode); -extern int gpr_or_int10_operand (rtx, machine_mode); -extern int move_source_operand (rtx, machine_mode); -extern int move_destination_operand (rtx, machine_mode); -extern int condexec_source_operand (rtx, machine_mode); -extern int condexec_dest_operand (rtx, machine_mode); -extern int lr_operand (rtx, machine_mode); -extern int gpr_or_memory_operand (rtx, machine_mode); -extern int fpr_or_memory_operand (rtx, machine_mode); -extern int reg_or_0_operand (rtx, machine_mode); -extern int fcc_operand (rtx, machine_mode); -extern int icc_operand (rtx, machine_mode); -extern int cc_operand (rtx, machine_mode); -extern int fcr_operand (rtx, machine_mode); -extern int icr_operand (rtx, machine_mode); -extern int cr_operand (rtx, machine_mode); -extern int call_operand (rtx, machine_mode); -extern int fpr_operand (rtx, machine_mode); -extern int even_reg_operand (rtx, machine_mode); -extern int odd_reg_operand (rtx, machine_mode); -extern int even_gpr_operand (rtx, machine_mode); -extern int odd_gpr_operand (rtx, machine_mode); -extern int quad_fpr_operand (rtx, machine_mode); -extern int even_fpr_operand (rtx, machine_mode); -extern int odd_fpr_operand (rtx, machine_mode); -extern int dbl_memory_one_insn_operand (rtx, machine_mode); -extern int dbl_memory_two_insn_operand (rtx, machine_mode); -extern int int12_operand (rtx, machine_mode); -extern int int6_operand (rtx, machine_mode); -extern int int5_operand (rtx, machine_mode); -extern int uint5_operand (rtx, machine_mode); -extern int uint4_operand (rtx, machine_mode); -extern int uint1_operand (rtx, machine_mode); -extern int int_2word_operand (rtx, machine_mode); +extern bool fpr_or_int6_operand (rtx, machine_mode); +extern bool gpr_or_int_operand (rtx, machine_mode); +extern bool gpr_or_int12_operand (rtx, machine_mode); +extern bool gpr_fpr_or_int12_operand (rtx, machine_mode); +extern bool gpr_or_int10_operand (rtx, machine_mode); +extern bool move_source_operand (rtx, machine_mode); +extern bool move_destination_operand 
(rtx, machine_mode); +extern bool condexec_source_operand (rtx, machine_mode); +extern bool condexec_dest_operand (rtx, machine_mode); +extern bool lr_operand (rtx, machine_mode); +extern bool gpr_or_memory_operand (rtx, machine_mode); +extern bool fpr_or_memory_operand (rtx, machine_mode); +extern bool reg_or_0_operand (rtx, machine_mode); +extern bool fcc_operand (rtx, machine_mode); +extern bool icc_operand (rtx, machine_mode); +extern bool cc_operand (rtx, machine_mode); +extern bool fcr_operand (rtx, machine_mode); +extern bool icr_operand (rtx, machine_mode); +extern bool cr_operand (rtx, machine_mode); +extern bool call_operand (rtx, machine_mode); +extern bool fpr_operand (rtx, machine_mode); +extern bool even_reg_operand (rtx, machine_mode); +extern bool odd_reg_operand (rtx, machine_mode); +extern bool even_gpr_operand (rtx, machine_mode); +extern bool odd_gpr_operand (rtx, machine_mode); +extern bool quad_fpr_operand (rtx, machine_mode); +extern bool even_fpr_operand (rtx, machine_mode); +extern bool odd_fpr_operand (rtx, machine_mode); +extern bool dbl_memory_one_insn_operand (rtx, machine_mode); +extern bool dbl_memory_two_insn_operand (rtx, machine_mode); +extern bool int12_operand (rtx, machine_mode); +extern bool int6_operand (rtx, machine_mode); +extern bool int5_operand (rtx, machine_mode); +extern bool uint5_operand (rtx, machine_mode); +extern bool uint4_operand (rtx, machine_mode); +extern bool uint1_operand (rtx, machine_mode); +extern bool int_2word_operand (rtx, machine_mode); extern int pic_register_operand (rtx, machine_mode); extern int pic_symbolic_operand (rtx, machine_mode); extern int small_data_register_operand (rtx, machine_mode); extern int small_data_symbolic_operand (rtx, machine_mode); -extern int upper_int16_operand (rtx, machine_mode); -extern int uint16_operand (rtx, machine_mode); -extern int symbolic_operand (rtx, machine_mode); -extern int relational_operator (rtx, machine_mode); +extern bool upper_int16_operand (rtx, machine_mode); +extern bool uint16_operand (rtx, machine_mode); +extern bool symbolic_operand (rtx, machine_mode); +extern bool relational_operator (rtx, machine_mode); extern int signed_relational_operator (rtx, machine_mode); extern int unsigned_relational_operator (rtx, machine_mode); -extern int float_relational_operator (rtx, machine_mode); -extern int ccr_eqne_operator (rtx, machine_mode); -extern int minmax_operator (rtx, machine_mode); -extern int condexec_si_binary_operator (rtx, machine_mode); -extern int condexec_si_media_operator (rtx, machine_mode); -extern int condexec_si_divide_operator (rtx, machine_mode); -extern int condexec_si_unary_operator (rtx, machine_mode); -extern int condexec_sf_conv_operator (rtx, machine_mode); -extern int condexec_sf_add_operator (rtx, machine_mode); +extern bool float_relational_operator (rtx, machine_mode); +extern bool ccr_eqne_operator (rtx, machine_mode); +extern bool minmax_operator (rtx, machine_mode); +extern bool condexec_si_binary_operator (rtx, machine_mode); +extern bool condexec_si_media_operator (rtx, machine_mode); +extern bool condexec_si_divide_operator (rtx, machine_mode); +extern bool condexec_si_unary_operator (rtx, machine_mode); +extern bool condexec_sf_conv_operator (rtx, machine_mode); +extern bool condexec_sf_add_operator (rtx, machine_mode); extern int condexec_memory_operand (rtx, machine_mode); -extern int intop_compare_operator (rtx, machine_mode); -extern int acc_operand (rtx, machine_mode); -extern int even_acc_operand (rtx, machine_mode); -extern int 
quad_acc_operand (rtx, machine_mode); -extern int accg_operand (rtx, machine_mode); +extern bool intop_compare_operator (rtx, machine_mode); +extern bool acc_operand (rtx, machine_mode); +extern bool even_acc_operand (rtx, machine_mode); +extern bool quad_acc_operand (rtx, machine_mode); +extern bool accg_operand (rtx, machine_mode); extern rtx frv_matching_accg_for_acc (rtx); extern void frv_expand_fdpic_call (rtx *, bool, bool); extern rtx frv_gen_GPsym2reg (rtx, rtx); -- cgit v1.1 From f6aa1c34e4a89c8c93518c49a108f3c43b78ea47 Mon Sep 17 00:00:00 2001 From: Jeff Law Date: Fri, 2 Jul 2021 10:48:26 -0400 Subject: Fix m32r target specific fallout from recent int->bool changes gcc/ChangeLog * config/m32r/m32r-protos.h (call_operand): Adjust return type. (small_data_operand, memreg_operand, small_insn_p): Likewise. * config/m32r/m32r.c (call_operand): Adjust return type. (small_data_operand, memreg_operand): Likewise. --- gcc/config/m32r/m32r-protos.h | 8 ++++---- gcc/config/m32r/m32r.c | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/m32r/m32r-protos.h b/gcc/config/m32r/m32r-protos.h index 23313fb..82b2c70 100644 --- a/gcc/config/m32r/m32r-protos.h +++ b/gcc/config/m32r/m32r-protos.h @@ -49,13 +49,13 @@ extern rtx m32r_return_addr (int); extern rtx m32r_function_symbol (const char *); #ifdef HAVE_MACHINE_MODES -extern int call_operand (rtx, machine_mode); -extern int small_data_operand (rtx, machine_mode); +extern bool call_operand (rtx, machine_mode); +extern bool small_data_operand (rtx, machine_mode); extern int addr24_operand (rtx, machine_mode); extern int addr32_operand (rtx, machine_mode); extern int call26_operand (rtx, machine_mode); -extern int memreg_operand (rtx, machine_mode); -extern int small_insn_p (rtx, machine_mode); +extern bool memreg_operand (rtx, machine_mode); +extern bool small_insn_p (rtx, machine_mode); #endif /* HAVE_MACHINE_MODES */ diff --git a/gcc/config/m32r/m32r.c b/gcc/config/m32r/m32r.c index 3444ed4..1aaba94 100644 --- a/gcc/config/m32r/m32r.c +++ b/gcc/config/m32r/m32r.c @@ -532,7 +532,7 @@ m32r_init_expanders (void) to make it easy to experiment. */ } -int +bool call_operand (rtx op, machine_mode mode) { if (!MEM_P (op)) @@ -543,7 +543,7 @@ call_operand (rtx op, machine_mode mode) /* Return 1 if OP is a reference to an object in .sdata/.sbss. */ -int +bool small_data_operand (rtx op, machine_mode mode ATTRIBUTE_UNUSED) { if (! TARGET_SDATA_USE) @@ -674,7 +674,7 @@ easy_df_const (rtx op) /* Return 1 if OP is (mem (reg ...)). This is used in insn length calcs. */ -int +bool memreg_operand (rtx op, machine_mode mode ATTRIBUTE_UNUSED) { return MEM_P (op) && REG_P (XEXP (op, 0)); -- cgit v1.1 From eb817f27e82769aef545d580a0c47a3aa50d1ec4 Mon Sep 17 00:00:00 2001 From: Iain Sandoe Date: Fri, 2 Jul 2021 09:44:59 +0100 Subject: Darwin, BTF: Provide a suitable section name for BTF [PR101283]. In a similar manner to r12-1960-gcc8453012f75d, this provides a placeholder section name for BTF data. This change groups BTF and CTF debug formats in the same segment, but keeps them in separate sections. As per the CTF section designation, this should be agreed or amended to an agreed form before GCC 12 ships. Signed-off-by: Iain Sandoe PR debug/101283 - Several tests fail on Darwin with -gctf/gbtf PR debug/101283 gcc/ChangeLog: * config/darwin.h (CTF_INFO_SECTION_NAME): Update the segment to include BTF. (BTF_INFO_SECTION_NAME): New. 
--- gcc/config/darwin.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/darwin.h b/gcc/config/darwin.h index b7c3af3..6840aeb 100644 --- a/gcc/config/darwin.h +++ b/gcc/config/darwin.h @@ -1115,8 +1115,10 @@ extern void darwin_driver_init (unsigned int *,struct cl_decoded_option **); # endif #endif -/* CTF support. */ +/* CTF and BTF support. */ #undef CTF_INFO_SECTION_NAME -#define CTF_INFO_SECTION_NAME "__CTF,__ctf,regular,debug" +#define CTF_INFO_SECTION_NAME "__CTF_BTF,__ctf,regular,debug" +#undef BTF_INFO_SECTION_NAME +#define BTF_INFO_SECTION_NAME "__CTF_BTF,__btf,regular,debug" #endif /* CONFIG_DARWIN_H */ -- cgit v1.1 From 85017431068251628478f38346c273418c71209b Mon Sep 17 00:00:00 2001 From: Iain Sandoe Date: Fri, 2 Jul 2021 09:51:57 +0100 Subject: Darwin, CTF, BTF: Do not run the DWARF debug link for BTF/CTF [PR101283]. Darwin uses an efficient two-stage process for debug linking. The static linker (ld64) notes the inputs required but does not link the debug. When required / on demand the debug is linked into a separate package by the debug linker (dsymutil). At present none of the Darwin tools consume or understand BTF/CTF. The static linker silently accepts the sections (but will not act on them as containing anything to be processed). However, the debug linker produces a warning that it has been presented with input with no [DWARF] debug content: warning: no debug symbols in executable (-arch x86_64). This causes several testsuite fails with excess errors. Signed-off-by: Iain Sandoe PR debug/101283 - Several tests fail on Darwin with -gctf/gbtf PR debug/101283 gcc/ChangeLog: * config/darwin.h (DSYMUTIL_SPEC): Do not try to run dsymutil for BTF/CTF. --- gcc/config/darwin.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/darwin.h b/gcc/config/darwin.h index 6840aeb..5f11978 100644 --- a/gcc/config/darwin.h +++ b/gcc/config/darwin.h @@ -250,10 +250,10 @@ extern GTY(()) int darwin_ms_struct; #define DSYMUTIL_SPEC \ "%{!fdump=*:%{!fsyntax-only:%{!c:%{!M:%{!MM:%{!E:%{!S:\ %{v} \ - %{g*:%{!gstabs*:%{%:debug-level-gt(0): -idsym}}}\ + %{g*:%{!gctf:%{!gbtf:%{!gstabs*:%{%:debug-level-gt(0): -idsym}}}}}\ %{.c|.cc|.C|.cpp|.cp|.c++|.cxx|.CPP|.m|.mm|.s|.f|.f90|\ .f95|.f03|.f77|.for|.F|.F90|.F95|.F03: \ - %{g*:%{!gstabs*:%{%:debug-level-gt(0): -dsym}}}}}}}}}}}" + %{g*:%{!gctf:%{!gbtf:%{!gstabs*:%{%:debug-level-gt(0): -dsym}}}}}}}}}}}}}" #define LINK_COMMAND_SPEC LINK_COMMAND_SPEC_A DSYMUTIL_SPEC -- cgit v1.1 From b60761baa6fd6acf3200e732283d133f4ce0f0e9 Mon Sep 17 00:00:00 2001 From: Jeff Law Date: Fri, 2 Jul 2021 11:07:37 -0400 Subject: Preparing to use shifts to eliminate redundant test/compare insns on H8 * config/h8300/h8300-protos.h (output_a_shift): Make first argument an array of rtx rather than a pointer to rtx. Add code argument. (compute_a_shift_length): Similarly. * config/h8300/h8300.c (h8300_shift_costs): Adjust now that the shift itself isn't an operand. Create dummy operand[0] to carry a mode and pass a suitable rtx code to compute_a_shift_length. (get_shift_alg): Adjust operand number of clobber in output templates. (output_a_shift): Make first argument an array of rtx rather than a pointer to rtx. Add code argument for the type of shift. Adjust now that the shift itself is no longer an operand. (compute_a_shift_length): Similarly. * config/h8300/shiftrotate.md (shiftqi, shifthi, shiftsi): Use an iterator rather than nshift_operator. 
(shiftqi_noscratch, shifthi_noscratch, shiftsi_noscratch): Likewise. (shiftqi_clobber_flags): Adjust to API changes in output_a_shift and compute_a_shift_length. (shiftqi_noscratch_clobber_flags): Likewise. (shifthi_noscratch_clobber_flags): Likewise. (shiftsi_noscratch_clobber_flags): Likewise. --- gcc/config/h8300/h8300-protos.h | 4 +- gcc/config/h8300/h8300.c | 33 ++++------ gcc/config/h8300/shiftrotate.md | 143 +++++++++++++++++++--------------------- 3 files changed, 85 insertions(+), 95 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/h8300/h8300-protos.h b/gcc/config/h8300/h8300-protos.h index d7efa97..86bcc3f 100644 --- a/gcc/config/h8300/h8300-protos.h +++ b/gcc/config/h8300/h8300-protos.h @@ -28,8 +28,8 @@ along with GCC; see the file COPYING3. If not see extern unsigned int compute_mov_length (rtx *); extern const char *output_plussi (rtx *, bool); extern unsigned int compute_plussi_length (rtx *, bool); -extern const char *output_a_shift (rtx *); -extern unsigned int compute_a_shift_length (rtx *); +extern const char *output_a_shift (rtx[4], rtx_code); +extern unsigned int compute_a_shift_length (rtx[4], rtx_code); extern const char *output_a_rotate (enum rtx_code, rtx *); extern unsigned int compute_a_rotate_length (rtx *); extern const char *output_simode_bld (int, rtx[]); diff --git a/gcc/config/h8300/h8300.c b/gcc/config/h8300/h8300.c index d8b4bfc..0fdc68b 100644 --- a/gcc/config/h8300/h8300.c +++ b/gcc/config/h8300/h8300.c @@ -1108,18 +1108,17 @@ h8300_and_costs (rtx x) static int h8300_shift_costs (rtx x) { - rtx operands[4]; + rtx operands[3]; if (GET_MODE (x) != QImode && GET_MODE (x) != HImode && GET_MODE (x) != SImode) return 100; - operands[0] = NULL; + operands[0] = gen_rtx_REG (GET_MODE (x), 0); operands[1] = NULL; operands[2] = XEXP (x, 1); - operands[3] = x; - return compute_a_shift_length (operands) / 2; + return compute_a_shift_length (operands, GET_CODE (x)) / 2; } /* Worker function for TARGET_RTX_COSTS. */ @@ -3759,13 +3758,13 @@ get_shift_alg (enum shift_type shift_type, enum shift_mode shift_mode, switch (shift_type) { case SHIFT_ASHIFT: - info->special = "mov.w\t%e0,%f4\n\tmov.b\t%s4,%t4\n\tmov.b\t%t0,%s4\n\tmov.b\t%s0,%t0\n\tsub.b\t%s0,%s0\n\tmov.w\t%f4,%e0"; + info->special = "mov.w\t%e0,%f3\n\tmov.b\t%s3,%t3\n\tmov.b\t%t0,%s3\n\tmov.b\t%s0,%t0\n\tsub.b\t%s0,%s0\n\tmov.w\t%f3,%e0"; goto end; case SHIFT_LSHIFTRT: - info->special = "mov.w\t%e0,%f4\n\tmov.b\t%t0,%s0\n\tmov.b\t%s4,%t0\n\tmov.b\t%t4,%s4\n\textu.w\t%f4\n\tmov.w\t%f4,%e0"; + info->special = "mov.w\t%e0,%f3\n\tmov.b\t%t0,%s0\n\tmov.b\t%s3,%t0\n\tmov.b\t%t3,%s3\n\textu.w\t%f3\n\tmov.w\t%f3,%e0"; goto end; case SHIFT_ASHIFTRT: - info->special = "mov.w\t%e0,%f4\n\tmov.b\t%t0,%s0\n\tmov.b\t%s4,%t0\n\tmov.b\t%t4,%s4\n\texts.w\t%f4\n\tmov.w\t%f4,%e0"; + info->special = "mov.w\t%e0,%f3\n\tmov.b\t%t0,%s0\n\tmov.b\t%s3,%t0\n\tmov.b\t%t3,%s3\n\texts.w\t%f3\n\tmov.w\t%f3,%e0"; goto end; } } @@ -3985,12 +3984,10 @@ h8300_shift_needs_scratch_p (int count, machine_mode mode, enum rtx_code type) /* Output the assembler code for doing shifts. 
*/ const char * -output_a_shift (rtx *operands) +output_a_shift (rtx operands[4], rtx_code code) { static int loopend_lab; - rtx shift = operands[3]; - machine_mode mode = GET_MODE (shift); - enum rtx_code code = GET_CODE (shift); + machine_mode mode = GET_MODE (operands[0]); enum shift_type shift_type; enum shift_mode shift_mode; struct shift_info info; @@ -4114,10 +4111,10 @@ output_a_shift (rtx *operands) if (info.shift2 != NULL) { fprintf (asm_out_file, "\tmov.b #%d,%sl\n", n / 2, - names_big[REGNO (operands[4])]); + names_big[REGNO (operands[3])]); fprintf (asm_out_file, ".Llt%d:\n", loopend_lab); output_asm_insn (info.shift2, operands); - output_asm_insn ("add #0xff,%X4", operands); + output_asm_insn ("add #0xff,%X3", operands); fprintf (asm_out_file, "\tbne .Llt%d\n", loopend_lab); if (n % 2) output_asm_insn (info.shift1, operands); @@ -4125,10 +4122,10 @@ output_a_shift (rtx *operands) else { fprintf (asm_out_file, "\tmov.b #%d,%sl\n", n, - names_big[REGNO (operands[4])]); + names_big[REGNO (operands[3])]); fprintf (asm_out_file, ".Llt%d:\n", loopend_lab); output_asm_insn (info.shift1, operands); - output_asm_insn ("add #0xff,%X4", operands); + output_asm_insn ("add #0xff,%X3", operands); fprintf (asm_out_file, "\tbne .Llt%d\n", loopend_lab); } return ""; @@ -4155,11 +4152,9 @@ h8300_asm_insn_count (const char *templ) /* Compute the length of a shift insn. */ unsigned int -compute_a_shift_length (rtx *operands) +compute_a_shift_length (rtx operands[3], rtx_code code) { - rtx shift = operands[3]; - machine_mode mode = GET_MODE (shift); - enum rtx_code code = GET_CODE (shift); + enum machine_mode mode = GET_MODE (operands[0]); enum shift_type shift_type; enum shift_mode shift_mode; struct shift_info info; diff --git a/gcc/config/h8300/shiftrotate.md b/gcc/config/h8300/shiftrotate.md index 23140d9a..c5d32cd 100644 --- a/gcc/config/h8300/shiftrotate.md +++ b/gcc/config/h8300/shiftrotate.md @@ -152,168 +152,163 @@ (define_insn_and_split "*shiftqi" [(set (match_operand:QI 0 "register_operand" "=r,r") - (match_operator:QI 3 "nshift_operator" - [(match_operand:QI 1 "register_operand" "0,0") - (match_operand:QI 2 "nonmemory_operand" "R,rn")])) - (clobber (match_scratch:QI 4 "=X,&r"))] + (shifts:QI + (match_operand:QI 1 "register_operand" "0,0") + (match_operand:QI 2 "nonmemory_operand" "R,rn"))) + (clobber (match_scratch:QI 3 "=X,&r"))] "" "#" "&& reload_completed" - [(parallel [(set (match_dup 0) (match_op_dup 3 [(match_dup 1) (match_dup 2)])) - (clobber (match_dup 4)) + [(parallel [(set (match_dup 0) (shifts:QI (match_dup 1) (match_dup 2))) + (clobber (match_dup 3)) (clobber (reg:CC CC_REG))])]) (define_insn "*shiftqi_clobber_flags" [(set (match_operand:QI 0 "register_operand" "=r,r") - (match_operator:QI 3 "nshift_operator" - [(match_operand:QI 1 "register_operand" "0,0") - (match_operand:QI 2 "nonmemory_operand" "R,rn")])) - (clobber (match_scratch:QI 4 "=X,&r")) + (shifts:QI + (match_operand:QI 1 "register_operand" "0,0") + (match_operand:QI 2 "nonmemory_operand" "R,rn"))) + (clobber (match_scratch:QI 3 "=X,&r")) (clobber (reg:CC CC_REG))] "" { - return output_a_shift (operands); + return output_a_shift (operands, ); } [(set (attr "length") - (symbol_ref "compute_a_shift_length (operands)"))]) + (symbol_ref "compute_a_shift_length (operands, )"))]) (define_insn_and_split "*shiftqi_noscratch" [(set (match_operand:QI 0 "register_operand" "=r,r") - (match_operator:QI 3 "nshift_operator" - [(match_operand:QI 1 "register_operand" "0,0") - (match_operand:QI 2 "nonmemory_operand" "R,rn")]))] 
+ (shifts:QI + (match_operand:QI 1 "register_operand" "0,0") + (match_operand:QI 2 "nonmemory_operand" "R,rn")))] "(GET_CODE (operands[2]) == CONST_INT && !h8300_shift_needs_scratch_p (INTVAL (operands[2]), QImode, - GET_CODE (operands[3])))" + ))" "#" "&& reload_completed" - [(parallel [(set (match_dup 0) (match_op_dup 3 [(match_dup 1) (match_dup 2)])) + [(parallel [(set (match_dup 0) (shifts:QI (match_dup 1) (match_dup 2))) (clobber (reg:CC CC_REG))])]) (define_insn "*shiftqi_noscratch_clobber_flags" [(set (match_operand:QI 0 "register_operand" "=r,r") - (match_operator:QI 3 "nshift_operator" - [(match_operand:QI 1 "register_operand" "0,0") - (match_operand:QI 2 "nonmemory_operand" "R,rn")])) + (shifts:QI + (match_operand:QI 1 "register_operand" "0,0") + (match_operand:QI 2 "nonmemory_operand" "R,rn"))) (clobber (reg:CC CC_REG))] "(GET_CODE (operands[2]) == CONST_INT - && !h8300_shift_needs_scratch_p (INTVAL (operands[2]), QImode, - GET_CODE (operands[3])))" + && !h8300_shift_needs_scratch_p (INTVAL (operands[2]), QImode, ))" { - return output_a_shift (operands); + return output_a_shift (operands, ); } [(set (attr "length") - (symbol_ref "compute_a_shift_length (operands)"))]) + (symbol_ref "compute_a_shift_length (operands, )"))]) (define_insn_and_split "*shifthi" [(set (match_operand:HI 0 "register_operand" "=r,r") - (match_operator:HI 3 "nshift_operator" - [(match_operand:HI 1 "register_operand" "0,0") - (match_operand:QI 2 "nonmemory_operand" "S,rn")])) - (clobber (match_scratch:QI 4 "=X,&r"))] + (shifts:HI + (match_operand:HI 1 "register_operand" "0,0") + (match_operand:QI 2 "nonmemory_operand" "S,rn"))) + (clobber (match_scratch:QI 3 "=X,&r"))] "" "#" "&& reload_completed" - [(parallel [(set (match_dup 0) (match_op_dup 3 [(match_dup 1) (match_dup 2)])) - (clobber (match_dup 4)) + [(parallel [(set (match_dup 0) (shifts:HI (match_dup 1) (match_dup 2))) + (clobber (match_dup 3)) (clobber (reg:CC CC_REG))])]) (define_insn "*shifthi_clobber_flags" [(set (match_operand:HI 0 "register_operand" "=r,r") - (match_operator:HI 3 "nshift_operator" - [(match_operand:HI 1 "register_operand" "0,0") - (match_operand:QI 2 "nonmemory_operand" "S,rn")])) - (clobber (match_scratch:QI 4 "=X,&r")) + (shifts:HI + (match_operand:HI 1 "register_operand" "0,0") + (match_operand:QI 2 "nonmemory_operand" "S,rn"))) + (clobber (match_scratch:QI 3 "=X,&r")) (clobber (reg:CC CC_REG))] "" { - return output_a_shift (operands); + return output_a_shift (operands, ); } [(set (attr "length") - (symbol_ref "compute_a_shift_length (operands)"))]) + (symbol_ref "compute_a_shift_length (operands, )"))]) (define_insn_and_split "*shifthi_noscratch" [(set (match_operand:HI 0 "register_operand" "=r,r") - (match_operator:HI 3 "nshift_operator" - [(match_operand:HI 1 "register_operand" "0,0") - (match_operand:HI 2 "nonmemory_operand" "S,rn")]))] + (shifts:HI + (match_operand:HI 1 "register_operand" "0,0") + (match_operand:HI 2 "nonmemory_operand" "S,rn")))] "(GET_CODE (operands[2]) == CONST_INT - && !h8300_shift_needs_scratch_p (INTVAL (operands[2]), HImode, - GET_CODE (operands[3])))" + && !h8300_shift_needs_scratch_p (INTVAL (operands[2]), HImode, ))" "#" "&& reload_completed" - [(parallel [(set (match_dup 0) (match_op_dup 3 [(match_dup 1) (match_dup 2)])) + [(parallel [(set (match_dup 0) (shifts:HI (match_dup 1) (match_dup 2))) (clobber (reg:CC CC_REG))])]) (define_insn "*shifthi_noscratch_clobber_flags" [(set (match_operand:HI 0 "register_operand" "=r,r") - (match_operator:HI 3 "nshift_operator" - [(match_operand:HI 1 
"register_operand" "0,0") - (match_operand:HI 2 "nonmemory_operand" "S,rn")])) + (shifts:HI + (match_operand:HI 1 "register_operand" "0,0") + (match_operand:HI 2 "nonmemory_operand" "S,rn"))) (clobber (reg:CC CC_REG))] "(GET_CODE (operands[2]) == CONST_INT - && !h8300_shift_needs_scratch_p (INTVAL (operands[2]), HImode, - GET_CODE (operands[3])))" + && !h8300_shift_needs_scratch_p (INTVAL (operands[2]), HImode, ))" { - return output_a_shift (operands); + return output_a_shift (operands, ); } [(set (attr "length") - (symbol_ref "compute_a_shift_length (operands)"))]) + (symbol_ref "compute_a_shift_length (operands, )"))]) (define_insn_and_split "*shiftsi" [(set (match_operand:SI 0 "register_operand" "=r,r") - (match_operator:SI 3 "nshift_operator" - [(match_operand:SI 1 "register_operand" "0,0") - (match_operand:QI 2 "nonmemory_operand" "T,rn")])) - (clobber (match_scratch:QI 4 "=X,&r"))] + (shifts:SI + (match_operand:SI 1 "register_operand" "0,0") + (match_operand:QI 2 "nonmemory_operand" "T,rn"))) + (clobber (match_scratch:QI 3 "=X,&r"))] "" "#" "&& reload_completed" - [(parallel [(set (match_dup 0) (match_op_dup 3 [(match_dup 1) (match_dup 2)])) - (clobber (match_dup 4)) + [(parallel [(set (match_dup 0) (shifts:SI (match_dup 1) (match_dup 2))) + (clobber (match_dup 3)) (clobber (reg:CC CC_REG))])]) (define_insn "*shiftsi_clobber_flags" [(set (match_operand:SI 0 "register_operand" "=r,r") - (match_operator:SI 3 "nshift_operator" - [(match_operand:SI 1 "register_operand" "0,0") - (match_operand:QI 2 "nonmemory_operand" "T,rn")])) - (clobber (match_scratch:QI 4 "=X,&r")) + (shifts:SI + (match_operand:SI 1 "register_operand" "0,0") + (match_operand:QI 2 "nonmemory_operand" "T,rn"))) + (clobber (match_scratch:QI 3 "=X,&r")) (clobber (reg:CC CC_REG))] "" { - return output_a_shift (operands); + return output_a_shift (operands, ); } [(set (attr "length") - (symbol_ref "compute_a_shift_length (operands)"))]) + (symbol_ref "compute_a_shift_length (operands, )"))]) (define_insn_and_split "*shiftsi_noscratch" [(set (match_operand:SI 0 "register_operand" "=r,r") - (match_operator:SI 3 "nshift_operator" - [(match_operand:SI 1 "register_operand" "0,0") - (match_operand:QI 2 "nonmemory_operand" "T,rn")]))] + (shifts:SI + (match_operand:SI 1 "register_operand" "0,0") + (match_operand:QI 2 "nonmemory_operand" "T,rn")))] "(GET_CODE (operands[2]) == CONST_INT - && !h8300_shift_needs_scratch_p (INTVAL (operands[2]), SImode, - GET_CODE (operands[3])))" + && !h8300_shift_needs_scratch_p (INTVAL (operands[2]), SImode, ))" "#" "&& reload_completed" - [(parallel [(set (match_dup 0) (match_op_dup 3 [(match_dup 1) (match_dup 2)])) + [(parallel [(set (match_dup 0) (shifts:SI (match_dup 1) (match_dup 2))) (clobber (reg:CC CC_REG))])]) (define_insn "*shiftsi_noscratch_clobber_flags" [(set (match_operand:SI 0 "register_operand" "=r,r") - (match_operator:SI 3 "nshift_operator" - [(match_operand:SI 1 "register_operand" "0,0") - (match_operand:SI 2 "nonmemory_operand" "T,rn")])) + (shifts:SI + (match_operand:SI 1 "register_operand" "0,0") + (match_operand:SI 2 "nonmemory_operand" "T,rn"))) (clobber (reg:CC CC_REG))] "(GET_CODE (operands[2]) == CONST_INT - && !h8300_shift_needs_scratch_p (INTVAL (operands[2]), SImode, - GET_CODE (operands[3])))" + && !h8300_shift_needs_scratch_p (INTVAL (operands[2]), SImode, ))" { - return output_a_shift (operands); + return output_a_shift (operands, ); } [(set (attr "length") - (symbol_ref "compute_a_shift_length (operands)"))]) + (symbol_ref "compute_a_shift_length (operands, )"))]) ;; 
Split a variable shift into a loop. If the register containing ;; the shift count dies, then we just use that register. -- cgit v1.1 From 7aa5fb17a30ff0ce9928e5eac35b892d95e7eba5 Mon Sep 17 00:00:00 2001 From: Jeff Law Date: Fri, 2 Jul 2021 11:58:47 -0400 Subject: Use shift instructions to eliminate redundant compare/test instructions on the H8 gcc/ChangeLog * config/h8300/h8300-protos.h (compute_a_shift_cc): Accept additional argument for the code. * config/h8300/h8300.c (compute_a_shift_cc): Accept additional argument for the code. Just return if the ZN bits are useful or not rather than the old style CC_* enums. * config/h8300/shiftrotate.md (shiftqi_noscratch): Move before more generic shiftqi patterns. (shifthi_noscratch, shiftsi_noscratch): Similarly. (shiftqi_noscratch_set_flags): New pattern. (shifthi_noscratch_set_flags, shiftsi_noscratch_set_flags): Likewise. --- gcc/config/h8300/h8300-protos.h | 2 +- gcc/config/h8300/h8300.c | 20 +++--- gcc/config/h8300/shiftrotate.md | 153 +++++++++++++++++++++++++++------------- 3 files changed, 116 insertions(+), 59 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/h8300/h8300-protos.h b/gcc/config/h8300/h8300-protos.h index 86bcc3f..744337d 100644 --- a/gcc/config/h8300/h8300-protos.h +++ b/gcc/config/h8300/h8300-protos.h @@ -41,7 +41,7 @@ extern const char *output_logical_op (machine_mode, rtx_code code, extern unsigned int compute_logical_op_length (machine_mode, rtx_code, rtx *, rtx_insn *); -extern int compute_a_shift_cc (rtx, rtx *); +extern int compute_a_shift_cc (rtx *, rtx_code); #ifdef HAVE_ATTR_cc extern enum attr_cc compute_plussi_cc (rtx *); #endif diff --git a/gcc/config/h8300/h8300.c b/gcc/config/h8300/h8300.c index 0fdc68b..d2f6548 100644 --- a/gcc/config/h8300/h8300.c +++ b/gcc/config/h8300/h8300.c @@ -4297,11 +4297,9 @@ compute_a_shift_length (rtx operands[3], rtx_code code) /* Compute which flag bits are valid after a shift insn. */ int -compute_a_shift_cc (rtx insn ATTRIBUTE_UNUSED, rtx *operands) +compute_a_shift_cc (rtx operands[3], rtx_code code) { - rtx shift = operands[3]; - machine_mode mode = GET_MODE (shift); - enum rtx_code code = GET_CODE (shift); + machine_mode mode = GET_MODE (operands[0]); enum shift_type shift_type; enum shift_mode shift_mode; struct shift_info info; @@ -4358,16 +4356,18 @@ compute_a_shift_cc (rtx insn ATTRIBUTE_UNUSED, rtx *operands) { case SHIFT_SPECIAL: if (info.remainder == 0) - return info.cc_special; + return (info.cc_special == OLD_CC_SET_ZN + || info.cc_special == OLD_CC_SET_ZNV); /* Fall through. */ case SHIFT_INLINE: - return info.cc_inline; + return (info.cc_inline == OLD_CC_SET_ZN + || info.cc_inline == OLD_CC_SET_ZNV); case SHIFT_ROT_AND: /* This case always ends with an and instruction. */ - return OLD_CC_SET_ZNV; + return true; case SHIFT_LOOP: /* A loop to shift by a "large" constant value. 
@@ -4375,9 +4375,11 @@ compute_a_shift_cc (rtx insn ATTRIBUTE_UNUSED, rtx *operands) if (info.shift2 != NULL) { if (n % 2) - return info.cc_inline; + return (info.cc_inline == OLD_CC_SET_ZN + || info.cc_inline == OLD_CC_SET_ZNV); + } - return OLD_CC_CLOBBER; + return false; default: gcc_unreachable (); diff --git a/gcc/config/h8300/shiftrotate.md b/gcc/config/h8300/shiftrotate.md index c5d32cd..0476324 100644 --- a/gcc/config/h8300/shiftrotate.md +++ b/gcc/config/h8300/shiftrotate.md @@ -150,33 +150,6 @@ } [(set_attr "length" "4")]) -(define_insn_and_split "*shiftqi" - [(set (match_operand:QI 0 "register_operand" "=r,r") - (shifts:QI - (match_operand:QI 1 "register_operand" "0,0") - (match_operand:QI 2 "nonmemory_operand" "R,rn"))) - (clobber (match_scratch:QI 3 "=X,&r"))] - "" - "#" - "&& reload_completed" - [(parallel [(set (match_dup 0) (shifts:QI (match_dup 1) (match_dup 2))) - (clobber (match_dup 3)) - (clobber (reg:CC CC_REG))])]) - -(define_insn "*shiftqi_clobber_flags" - [(set (match_operand:QI 0 "register_operand" "=r,r") - (shifts:QI - (match_operand:QI 1 "register_operand" "0,0") - (match_operand:QI 2 "nonmemory_operand" "R,rn"))) - (clobber (match_scratch:QI 3 "=X,&r")) - (clobber (reg:CC CC_REG))] - "" -{ - return output_a_shift (operands, ); -} - [(set (attr "length") - (symbol_ref "compute_a_shift_length (operands, )"))]) - (define_insn_and_split "*shiftqi_noscratch" [(set (match_operand:QI 0 "register_operand" "=r,r") (shifts:QI @@ -204,24 +177,43 @@ [(set (attr "length") (symbol_ref "compute_a_shift_length (operands, )"))]) -(define_insn_and_split "*shifthi" - [(set (match_operand:HI 0 "register_operand" "=r,r") - (shifts:HI - (match_operand:HI 1 "register_operand" "0,0") - (match_operand:QI 2 "nonmemory_operand" "S,rn"))) +(define_insn "*shiftqi_noscratch_set_flags" + [(set (reg:CCZN CC_REG) + (compare:CCZN + (shifts:QI + (match_operand:QI 1 "register_operand" "0,0") + (match_operand:QI 2 "nonmemory_operand" "R,rn")) + (const_int 0))) + (set (match_operand:QI 0 "register_operand" "=r,r") + (shifts:QI (match_dup 1) (match_dup 2)))] + "(GET_CODE (operands[2]) == CONST_INT + && !h8300_shift_needs_scratch_p (INTVAL (operands[2]), QImode, ) + && compute_a_shift_cc (operands, ))" +{ + return output_a_shift (operands, ); +} + [(set (attr "length") + (symbol_ref "compute_a_shift_length (operands, )"))]) + + +(define_insn_and_split "*shiftqi" + [(set (match_operand:QI 0 "register_operand" "=r,r") + (shifts:QI + (match_operand:QI 1 "register_operand" "0,0") + (match_operand:QI 2 "nonmemory_operand" "R,rn"))) (clobber (match_scratch:QI 3 "=X,&r"))] "" "#" "&& reload_completed" - [(parallel [(set (match_dup 0) (shifts:HI (match_dup 1) (match_dup 2))) + [(parallel [(set (match_dup 0) (shifts:QI (match_dup 1) (match_dup 2))) (clobber (match_dup 3)) (clobber (reg:CC CC_REG))])]) -(define_insn "*shifthi_clobber_flags" - [(set (match_operand:HI 0 "register_operand" "=r,r") - (shifts:HI - (match_operand:HI 1 "register_operand" "0,0") - (match_operand:QI 2 "nonmemory_operand" "S,rn"))) +(define_insn "*shiftqi_clobber_flags" + [(set (match_operand:QI 0 "register_operand" "=r,r") + (shifts:QI + (match_operand:QI 1 "register_operand" "0,0") + (match_operand:QI 2 "nonmemory_operand" "R,rn"))) (clobber (match_scratch:QI 3 "=X,&r")) (clobber (reg:CC CC_REG))] "" @@ -257,24 +249,41 @@ [(set (attr "length") (symbol_ref "compute_a_shift_length (operands, )"))]) -(define_insn_and_split "*shiftsi" - [(set (match_operand:SI 0 "register_operand" "=r,r") - (shifts:SI - (match_operand:SI 1 
"register_operand" "0,0") - (match_operand:QI 2 "nonmemory_operand" "T,rn"))) +(define_insn "*shifthi_noscratch_setzn" + [(set (reg:CCZN CC_REG) + (compare:CCZN + (shifts:HI (match_operand:HI 1 "register_operand" "0,0") + (match_operand:HI 2 "nonmemory_operand" "S,rn")) + (const_int 0))) + (set (match_operand:HI 0 "register_operand" "=r,r") + (shifts:HI (match_dup 1) (match_dup 2)))] + "(GET_CODE (operands[2]) == CONST_INT + && !h8300_shift_needs_scratch_p (INTVAL (operands[2]), HImode, ) + && compute_a_shift_cc (operands, ))" +{ + return output_a_shift (operands, ); +} + [(set (attr "length") + (symbol_ref "compute_a_shift_length (operands, )"))]) + +(define_insn_and_split "*shifthi" + [(set (match_operand:HI 0 "register_operand" "=r,r") + (shifts:HI + (match_operand:HI 1 "register_operand" "0,0") + (match_operand:QI 2 "nonmemory_operand" "S,rn"))) (clobber (match_scratch:QI 3 "=X,&r"))] "" "#" "&& reload_completed" - [(parallel [(set (match_dup 0) (shifts:SI (match_dup 1) (match_dup 2))) + [(parallel [(set (match_dup 0) (shifts:HI (match_dup 1) (match_dup 2))) (clobber (match_dup 3)) (clobber (reg:CC CC_REG))])]) -(define_insn "*shiftsi_clobber_flags" - [(set (match_operand:SI 0 "register_operand" "=r,r") - (shifts:SI - (match_operand:SI 1 "register_operand" "0,0") - (match_operand:QI 2 "nonmemory_operand" "T,rn"))) +(define_insn "*shifthi_clobber_flags" + [(set (match_operand:HI 0 "register_operand" "=r,r") + (shifts:HI + (match_operand:HI 1 "register_operand" "0,0") + (match_operand:QI 2 "nonmemory_operand" "S,rn"))) (clobber (match_scratch:QI 3 "=X,&r")) (clobber (reg:CC CC_REG))] "" @@ -310,9 +319,55 @@ [(set (attr "length") (symbol_ref "compute_a_shift_length (operands, )"))]) +(define_insn "*shiftsi_noscratch_cczn" + [(set (reg:CCZN CC_REG) + (compare:CCZN + (shifts:SI + (match_operand:SI 1 "register_operand" "0,0") + (match_operand:SI 2 "nonmemory_operand" "T,rn")) + (const_int 0))) + (set (match_operand:SI 0 "register_operand" "=r,r") + (shifts:SI (match_dup 1) (match_dup 2)))] + "(GET_CODE (operands[2]) == CONST_INT + && !h8300_shift_needs_scratch_p (INTVAL (operands[2]), SImode, ) + && compute_a_shift_cc (operands, ))" +{ + return output_a_shift (operands, ); +} + [(set (attr "length") + (symbol_ref "compute_a_shift_length (operands, )"))]) + ;; Split a variable shift into a loop. If the register containing ;; the shift count dies, then we just use that register. 
+ +(define_insn_and_split "*shiftsi" + [(set (match_operand:SI 0 "register_operand" "=r,r") + (shifts:SI + (match_operand:SI 1 "register_operand" "0,0") + (match_operand:QI 2 "nonmemory_operand" "T,rn"))) + (clobber (match_scratch:QI 3 "=X,&r"))] + "" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) (shifts:SI (match_dup 1) (match_dup 2))) + (clobber (match_dup 3)) + (clobber (reg:CC CC_REG))])]) + +(define_insn "*shiftsi_clobber_flags" + [(set (match_operand:SI 0 "register_operand" "=r,r") + (shifts:SI + (match_operand:SI 1 "register_operand" "0,0") + (match_operand:QI 2 "nonmemory_operand" "T,rn"))) + (clobber (match_scratch:QI 3 "=X,&r")) + (clobber (reg:CC CC_REG))] + "" +{ + return output_a_shift (operands, ); +} + [(set (attr "length") + (symbol_ref "compute_a_shift_length (operands, )"))]) + (define_split [(set (match_operand 0 "register_operand" "") (match_operator 2 "nshift_operator" -- cgit v1.1 From 42a9e4e1381ba14d8ef21b331494945c2f51e6be Mon Sep 17 00:00:00 2001 From: Peter Bergner Date: Fri, 2 Jul 2021 11:27:52 -0500 Subject: rs6000: Add MMA __builtin_vsx_lxvp and __builtin_vsx_stxvp built-ins Add support for the __builtin_vsx_lxvp and __builtin_vsx_stxvp built-ins to keep in sync with LLVM which added these earlier. 2021-07-02 Peter Bergner gcc/ * config/rs6000/rs6000-builtin.def (BU_MMA_PAIR_LD, BU_MMA_PAIR_ST): New macros. (__builtin_vsx_lxvp, __builtin_vsx_stxvp): New built-ins. * config/rs6000/rs6000-call.c (rs6000_gimple_fold_mma_builtin): Expand lxvp and stxvp built-ins. (mma_init_builtins): Handle lxvp and stxvp built-ins. (builtin_function_type): Likewise. * doc/extend.texi (__builtin_vsx_lxvp, __builtin_mma_stxvp): Document. gcc/testsuite/ * gcc.target/powerpc/mma-builtin-7.c: New test. * gcc.target/powerpc/mma-builtin-8.c: New test. --- gcc/config/rs6000/rs6000-builtin.def | 22 ++++++++++++++ gcc/config/rs6000/rs6000-call.c | 56 ++++++++++++++++++++++++++++++++++-- 2 files changed, 76 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-builtin.def b/gcc/config/rs6000/rs6000-builtin.def index d7ce4de..6270444 100644 --- a/gcc/config/rs6000/rs6000-builtin.def +++ b/gcc/config/rs6000/rs6000-builtin.def @@ -484,6 +484,25 @@ | RS6000_BTC_SENARY), \ CODE_FOR_ ## ICODE) /* ICODE */ +#define BU_MMA_PAIR_LD(ENUM, NAME, ATTR) \ + RS6000_BUILTIN_M (VSX_BUILTIN_ ## ENUM, /* ENUM */ \ + "__builtin_vsx_" NAME, /* NAME */ \ + RS6000_BTM_MMA, /* MASK */ \ + (RS6000_BTC_ ## ATTR /* ATTR */ \ + | RS6000_BTC_BINARY \ + | RS6000_BTC_GIMPLE), \ + CODE_FOR_nothing) /* ICODE */ + +#define BU_MMA_PAIR_ST(ENUM, NAME, ATTR) \ + RS6000_BUILTIN_M (VSX_BUILTIN_ ## ENUM, /* ENUM */ \ + "__builtin_vsx_" NAME, /* NAME */ \ + RS6000_BTM_MMA, /* MASK */ \ + (RS6000_BTC_ ## ATTR /* ATTR */ \ + | RS6000_BTC_TERNARY \ + | RS6000_BTC_VOID \ + | RS6000_BTC_GIMPLE), \ + CODE_FOR_nothing) /* ICODE */ + /* ISA 2.05 (power6) convenience macros. 
*/ /* For functions that depend on the CMPB instruction */ #define BU_P6_2(ENUM, NAME, ATTR, ICODE) \ @@ -3253,6 +3272,9 @@ BU_SPECIAL_X (RS6000_BUILTIN_CFSTRING, "__builtin_cfstring", RS6000_BTM_ALWAYS, BU_P10V_VSX_1 (XVCVBF16SPN, "xvcvbf16spn", MISC, vsx_xvcvbf16spn) BU_P10V_VSX_1 (XVCVSPBF16, "xvcvspbf16", MISC, vsx_xvcvspbf16) +BU_MMA_PAIR_LD (LXVP, "lxvp", MISC) +BU_MMA_PAIR_ST (STXVP, "stxvp", PAIR) + BU_MMA_1 (XXMFACC, "xxmfacc", QUAD, mma_xxmfacc) BU_MMA_1 (XXMTACC, "xxmtacc", QUAD, mma_xxmtacc) BU_MMA_1 (XXSETACCZ, "xxsetaccz", MISC, mma_xxsetaccz) diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c index b677898..6115e3b 100644 --- a/gcc/config/rs6000/rs6000-call.c +++ b/gcc/config/rs6000/rs6000-call.c @@ -11913,6 +11913,32 @@ rs6000_gimple_fold_mma_builtin (gimple_stmt_iterator *gsi) gsi_replace_with_seq (gsi, new_seq, true); return true; } + else if (fncode == VSX_BUILTIN_LXVP) + { + push_gimplify_context (true); + tree offset = gimple_call_arg (stmt, 0); + tree ptr = gimple_call_arg (stmt, 1); + tree lhs = gimple_call_lhs (stmt); + tree mem = build_simple_mem_ref (build2 (POINTER_PLUS_EXPR, + TREE_TYPE (ptr), ptr, offset)); + gimplify_assign (lhs, mem, &new_seq); + pop_gimplify_context (NULL); + gsi_replace_with_seq (gsi, new_seq, true); + return true; + } + else if (fncode == VSX_BUILTIN_STXVP) + { + push_gimplify_context (true); + tree src = gimple_call_arg (stmt, 0); + tree offset = gimple_call_arg (stmt, 1); + tree ptr = gimple_call_arg (stmt, 2); + tree mem = build_simple_mem_ref (build2 (POINTER_PLUS_EXPR, + TREE_TYPE (ptr), ptr, offset)); + gimplify_assign (mem, src, &new_seq); + pop_gimplify_context (NULL); + gsi_replace_with_seq (gsi, new_seq, true); + return true; + } /* Convert this built-in into an internal version that uses pass-by-value arguments. The internal built-in follows immediately after this one. */ @@ -14264,11 +14290,14 @@ mma_init_builtins (void) if (gimple_func) { gcc_assert (icode == CODE_FOR_nothing); - op[nopnds++] = void_type_node; /* Some MMA built-ins that are expanded into gimple are converted into internal MMA built-ins that are expanded into rtl. The internal built-in follows immediately after this built-in. */ - icode = d[1].icode; + if (d[1].icode != CODE_FOR_nothing) + { + op[nopnds++] = void_type_node; + icode = d[1].icode; + } } else { @@ -14291,6 +14320,19 @@ mma_init_builtins (void) else op[nopnds++] = build_pointer_type (vector_pair_type_node); } + else if (d->code == VSX_BUILTIN_LXVP) + { + op[nopnds++] = vector_pair_type_node; + op[nopnds++] = sizetype; + op[nopnds++] = build_pointer_type (vector_pair_type_node); + } + else if (d->code == VSX_BUILTIN_STXVP) + { + op[nopnds++] = void_type_node; + op[nopnds++] = vector_pair_type_node; + op[nopnds++] = sizetype; + op[nopnds++] = build_pointer_type (vector_pair_type_node); + } else { /* This is a normal MMA built-in function. */ @@ -14781,6 +14823,16 @@ builtin_function_type (machine_mode mode_ret, machine_mode mode_arg0, h.uns_p[2] = 1; break; + case VSX_BUILTIN_LXVP: + h.uns_p[0] = 1; + h.uns_p[2] = 1; + break; + + case VSX_BUILTIN_STXVP: + h.uns_p[1] = 1; + h.uns_p[3] = 1; + break; + default: break; } -- cgit v1.1 From d07092a61d5a6907b2d92563e810bf5bb8e61c01 Mon Sep 17 00:00:00 2001 From: Iain Sandoe Date: Fri, 30 Apr 2021 16:20:42 +0100 Subject: Darwin, config : Adjust X86 biarch definitions ordering [PR100269]. This reorganises the biarch definitions to cater for the macro changes and removals at r12-36. 
Signed-off-by: Iain Sandoe PR target/100269 - [12 Regression] i686 biarch compiler fails for Darwin after r12-36. PR target/100269 gcc/ChangeLog: PR target/100269 * config.gcc: Ensure that Darwin biarch definitions are added before i386.h. * config/i386/darwin.h (TARGET_64BIT): Remove. (PR80556_WORKAROUND): New. (REAL_LIBGCC_SPEC): Amend to use PR80556_WORKAROUND. (DARWIN_SUBARCH_SPEC): New. * config/i386/darwin32-biarch.h (TARGET_64BIT_DEFAULT, TARGET_BI_ARCH, PR80556_WORKAROUND): New. (REAL_LIBGCC_SPEC): Remove. * config/i386/darwin64-biarch.h (TARGET_64BIT_DEFAULT, TARGET_BI_ARCH, PR80556_WORKAROUND): New. (REAL_LIBGCC_SPEC): Remove. --- gcc/config/i386/darwin.h | 22 ++++++++++------------ gcc/config/i386/darwin32-biarch.h | 22 ++++++---------------- gcc/config/i386/darwin64-biarch.h | 22 ++++++---------------- 3 files changed, 22 insertions(+), 44 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/darwin.h b/gcc/config/i386/darwin.h index 5312003..bac3219 100644 --- a/gcc/config/i386/darwin.h +++ b/gcc/config/i386/darwin.h @@ -25,15 +25,6 @@ along with GCC; see the file COPYING3. If not see #undef DARWIN_X86 #define DARWIN_X86 1 -#ifdef IN_LIBGCC2 -#undef TARGET_64BIT -#ifdef __x86_64__ -#define TARGET_64BIT 1 -#else -#define TARGET_64BIT 0 -#endif -#endif - /* WORKAROUND pr80556: For x86_64 Darwin10 and later, the unwinder is in libunwind (redirected from libSystem). This doesn't use the keymgr (see keymgr.c) and therefore @@ -44,11 +35,15 @@ along with GCC; see the file COPYING3. If not see even when static-libgcc is specified. We put libSystem first so that unwinder symbols are satisfied from there. We default to 64b for single-arch builds, so apply this unconditionally. */ +#ifndef PR80556_WORKAROUND +#define PR80556_WORKAROUND \ +" %:version-compare(>= 10.6 mmacosx-version-min= -lSystem) " +#endif #undef REAL_LIBGCC_SPEC #define REAL_LIBGCC_SPEC \ - "%{static-libgcc|static: \ - %:version-compare(>= 10.6 mmacosx-version-min= -lSystem) \ - -lgcc_eh -lgcc; \ + "%{static-libgcc|static: " \ + PR80556_WORKAROUND \ + " -lgcc_eh -lgcc; \ shared-libgcc|fexceptions|fgnu-runtime: \ %:version-compare(!> 10.5 mmacosx-version-min= -lgcc_s.10.4) \ %:version-compare(>< 10.5 10.6 mmacosx-version-min= -lgcc_s.10.5) \ @@ -142,8 +137,11 @@ along with GCC; see the file COPYING3. If not see %{mpc64:crtprec64.o%s} \ %{mpc80:crtprec80.o%s}" TM_DESTRUCTOR +#ifndef DARWIN_ARCH_SPEC /* We default to x86_64 for single-arch builds, bi-arch overrides. */ #define DARWIN_ARCH_SPEC "x86_64" +#define DARWIN_SUBARCH_SPEC DARWIN_ARCH_SPEC +#endif #undef SUBTARGET_EXTRA_SPECS #define SUBTARGET_EXTRA_SPECS \ diff --git a/gcc/config/i386/darwin32-biarch.h b/gcc/config/i386/darwin32-biarch.h index 73b83eb..5470edf 100644 --- a/gcc/config/i386/darwin32-biarch.h +++ b/gcc/config/i386/darwin32-biarch.h @@ -21,6 +21,9 @@ along with GCC; see the file COPYING3. If not see #undef DARWIN_ARCH_SPEC #define DARWIN_ARCH_SPEC "%{m64:x86_64;:i386}" +#define TARGET_64BIT_DEFAULT 0 +#define TARGET_BI_ARCH 1 + /* WORKAROUND pr80556: For x86_64 Darwin10 and later, the unwinder is in libunwind (redirected from libSystem). This doesn't use the keymgr (see keymgr.c) and therefore @@ -30,22 +33,9 @@ along with GCC; see the file COPYING3. If not see Therefore, for 64b exes at least, we must use the libunwind implementation, even when static-libgcc is specified. We put libSystem first so that unwinder symbols are satisfied from there. 
*/ -#undef REAL_LIBGCC_SPEC -#define REAL_LIBGCC_SPEC \ - "%{static-libgcc|static: \ - %{m64:%:version-compare(>= 10.6 mmacosx-version-min= -lSystem)} \ - -lgcc_eh -lgcc; \ - shared-libgcc|fexceptions|fgnu-runtime: \ - %:version-compare(!> 10.5 mmacosx-version-min= -lgcc_s.10.4) \ - %:version-compare(>< 10.5 10.6 mmacosx-version-min= -lgcc_s.10.5) \ - %:version-compare(!> 10.5 mmacosx-version-min= -lgcc_ext.10.4) \ - %:version-compare(>= 10.5 mmacosx-version-min= -lgcc_ext.10.5) \ - -lgcc ; \ - :%:version-compare(>< 10.3.9 10.5 mmacosx-version-min= -lgcc_s.10.4) \ - %:version-compare(>< 10.5 10.6 mmacosx-version-min= -lgcc_s.10.5) \ - %:version-compare(!> 10.5 mmacosx-version-min= -lgcc_ext.10.4) \ - %:version-compare(>= 10.5 mmacosx-version-min= -lgcc_ext.10.5) \ - -lgcc }" +#undef PR80556_WORKAROUND +#define PR80556_WORKAROUND \ +" %{m64:%:version-compare(>= 10.6 mmacosx-version-min= -lSystem)} " #undef DARWIN_SUBARCH_SPEC #define DARWIN_SUBARCH_SPEC DARWIN_ARCH_SPEC diff --git a/gcc/config/i386/darwin64-biarch.h b/gcc/config/i386/darwin64-biarch.h index 1ae76b8..f5bc3d6 100644 --- a/gcc/config/i386/darwin64-biarch.h +++ b/gcc/config/i386/darwin64-biarch.h @@ -22,6 +22,9 @@ along with GCC; see the file COPYING3. If not see #undef DARWIN_ARCH_SPEC #define DARWIN_ARCH_SPEC "%{m32:i386;:x86_64}" +#define TARGET_64BIT_DEFAULT (OPTION_MASK_ISA_64BIT | OPTION_MASK_ABI_64) +#define TARGET_BI_ARCH 1 + /* WORKAROUND pr80556: For x86_64 Darwin10 and later, the unwinder is in libunwind (redirected from libSystem). This doesn't use the keymgr (see keymgr.c) and therefore @@ -31,22 +34,9 @@ along with GCC; see the file COPYING3. If not see Therefore, for 64b exes at least, we must use the libunwind implementation, even when static-libgcc is specified. We put libSystem first so that unwinder symbols are satisfied from there. */ -#undef REAL_LIBGCC_SPEC -#define REAL_LIBGCC_SPEC \ - "%{static-libgcc|static: \ - %{!m32:%:version-compare(>= 10.6 mmacosx-version-min= -lSystem)} \ - -lgcc_eh -lgcc; \ - shared-libgcc|fexceptions|fgnu-runtime: \ - %:version-compare(!> 10.5 mmacosx-version-min= -lgcc_s.10.4) \ - %:version-compare(>< 10.5 10.6 mmacosx-version-min= -lgcc_s.10.5) \ - %:version-compare(!> 10.5 mmacosx-version-min= -lgcc_ext.10.4) \ - %:version-compare(>= 10.5 mmacosx-version-min= -lgcc_ext.10.5) \ - -lgcc ; \ - :%:version-compare(>< 10.3.9 10.5 mmacosx-version-min= -lgcc_s.10.4) \ - %:version-compare(>< 10.5 10.6 mmacosx-version-min= -lgcc_s.10.5) \ - %:version-compare(!> 10.5 mmacosx-version-min= -lgcc_ext.10.4) \ - %:version-compare(>= 10.5 mmacosx-version-min= -lgcc_ext.10.5) \ - -lgcc }" +#undef PR80556_WORKAROUND +#define PR80556_WORKAROUND \ +" %{!m32:%:version-compare(>= 10.6 mmacosx-version-min= -lSystem)} " #undef DARWIN_SUBARCH_SPEC #define DARWIN_SUBARCH_SPEC DARWIN_ARCH_SPEC -- cgit v1.1 From be8749f939a933bca6de19d9cf1a510d5954c2fa Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 5 Jul 2021 21:05:10 +0200 Subject: i386: Implement 4-byte vector (V4QI/V2HI) constant permutations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 2021-07-05 Uroš Bizjak gcc/ * config/i386/i386-expand.c (ix86_split_mmx_punpck): Handle V4QI and V2HI modes. (expand_vec_perm_blend): Allow 4-byte vector modes with TARGET_SSE4_1. Handle V4QI mode. Emit mmx_pblendvb32 for 4-byte modes. (expand_vec_perm_pshufb): Rewrite to use switch statemets. Handle 4-byte dual operands with TARGET_XOP and single operands with TARGET_SSSE3. 
Emit mmx_ppermv32 for TARGET_XOP and mmx_pshufbv4qi3 for TARGET_SSSE3. (expand_vec_perm_pblendv): Allow 4-byte vector modes with TARGET_SSE4_1. (expand_vec_perm_interleave2): Allow 4-byte vector modes. (expand_vec_perm_pshufb2): Allow 4-byte vector modes with TARGET_SSSE3. (expand_vec_perm_even_odd_1): Handle V4QI mode. (expand_vec_perm_broadcast_1): Handle V4QI mode. (ix86_vectorize_vec_perm_const): Handle V4QI mode. * config/i386/mmx.md (mmx_ppermv32): New insn pattern. (mmx_pshufbv4qi3): Ditto. (*mmx_pblendw32): Ditto. (*mmx_pblendw64): Rename from *mmx_pblendw. (mmx_punpckhbw_low): New insn_and_split pattern. (mmx_punpcklbw_low): Ditto. --- gcc/config/i386/i386-expand.c | 469 ++++++++++++++++++++++++++---------------- gcc/config/i386/mmx.md | 86 +++++++- 2 files changed, 370 insertions(+), 185 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index b37642e..58c208e 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -933,6 +933,7 @@ ix86_split_mmx_punpck (rtx operands[], bool high_p) switch (mode) { + case E_V4QImode: case E_V8QImode: sse_mode = V16QImode; double_sse_mode = V32QImode; @@ -949,6 +950,7 @@ ix86_split_mmx_punpck (rtx operands[], bool high_p) break; case E_V4HImode: + case E_V2HImode: sse_mode = V8HImode; double_sse_mode = V16HImode; mask = gen_rtx_PARALLEL (VOIDmode, @@ -991,7 +993,7 @@ ix86_split_mmx_punpck (rtx operands[], bool high_p) rtx insn = gen_rtx_SET (dest, op2); emit_insn (insn); - /* Move bits 64:127 to bits 0:63. */ + /* Move high bits to low bits. */ if (high_p) { if (sse_mode == V4SFmode) @@ -1004,9 +1006,19 @@ ix86_split_mmx_punpck (rtx operands[], bool high_p) } else { - mask = gen_rtx_PARALLEL (VOIDmode, - gen_rtvec (4, GEN_INT (2), GEN_INT (3), - GEN_INT (0), GEN_INT (1))); + int sz = GET_MODE_SIZE (mode); + + if (sz == 4) + mask = gen_rtx_PARALLEL (VOIDmode, + gen_rtvec (4, GEN_INT (1), GEN_INT (0), + GEN_INT (0), GEN_INT (1))); + else if (sz == 8) + mask = gen_rtx_PARALLEL (VOIDmode, + gen_rtvec (4, GEN_INT (2), GEN_INT (3), + GEN_INT (0), GEN_INT (1))); + else + gcc_unreachable (); + dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest)); op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask); } @@ -17331,7 +17343,8 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d) else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode)) ; else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16 - || GET_MODE_SIZE (vmode) == 8)) + || GET_MODE_SIZE (vmode) == 8 + || GET_MODE_SIZE (vmode) == 4)) ; else return false; @@ -17408,7 +17421,9 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d) vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm)); vperm = force_reg (vmode, vperm); - if (GET_MODE_SIZE (vmode) == 8) + if (GET_MODE_SIZE (vmode) == 4) + emit_insn (gen_mmx_pblendvb32 (target, op0, op1, vperm)); + else if (GET_MODE_SIZE (vmode) == 8) emit_insn (gen_mmx_pblendvb64 (target, op0, op1, vperm)); else if (GET_MODE_SIZE (vmode) == 16) emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm)); @@ -17440,6 +17455,16 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d) vmode = V4HImode; goto do_subreg; + case E_V4QImode: + for (i = 0; i < 4; i += 2) + if (d->perm[i] + 1 != d->perm[i + 1]) + goto use_pblendvb; + + for (i = 0; i < 2; ++i) + mask |= (d->perm[i * 2] >= 4) << i; + vmode = V2HImode; + goto do_subreg; + case E_V32QImode: /* See if bytes move in pairs. If not, vpblendvb must be used. 
*/ for (i = 0; i < 32; i += 2) @@ -17697,163 +17722,176 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) nelt = d->nelt; if (!d->one_operand_p) - { - if (GET_MODE_SIZE (d->vmode) == 8) - { - if (!TARGET_XOP) - return false; - vmode = V8QImode; - } - else if (GET_MODE_SIZE (d->vmode) == 16) - { - if (!TARGET_XOP) - return false; - } - else if (GET_MODE_SIZE (d->vmode) == 32) - { - if (!TARGET_AVX2) - return false; + switch (GET_MODE_SIZE (d->vmode)) + { + case 4: + if (!TARGET_XOP) + return false; + vmode = V4QImode; + break; - if (valid_perm_using_mode_p (V2TImode, d)) - { - if (d->testing_p) - return true; + case 8: + if (!TARGET_XOP) + return false; + vmode = V8QImode; + break; - /* Use vperm2i128 insn. The pattern uses - V4DImode instead of V2TImode. */ - target = d->target; - if (d->vmode != V4DImode) - target = gen_reg_rtx (V4DImode); - op0 = gen_lowpart (V4DImode, d->op0); - op1 = gen_lowpart (V4DImode, d->op1); - rperm[0] - = GEN_INT ((d->perm[0] / (nelt / 2)) - | ((d->perm[nelt / 2] / (nelt / 2)) * 16)); - emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0])); - if (target != d->target) - emit_move_insn (d->target, gen_lowpart (d->vmode, target)); - return true; - } + case 16: + if (!TARGET_XOP) return false; - } - else + break; + + case 32: + if (!TARGET_AVX2) + return false; + + if (valid_perm_using_mode_p (V2TImode, d)) + { + if (d->testing_p) + return true; + + /* Use vperm2i128 insn. The pattern uses + V4DImode instead of V2TImode. */ + target = d->target; + if (d->vmode != V4DImode) + target = gen_reg_rtx (V4DImode); + op0 = gen_lowpart (V4DImode, d->op0); + op1 = gen_lowpart (V4DImode, d->op1); + rperm[0] + = GEN_INT ((d->perm[0] / (nelt / 2)) + | ((d->perm[nelt / 2] / (nelt / 2)) * 16)); + emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0])); + if (target != d->target) + emit_move_insn (d->target, gen_lowpart (d->vmode, target)); + return true; + } + /* FALLTHRU */ + + default: return false; - } + } else - { - if (GET_MODE_SIZE (d->vmode) == 8) - { - if (!TARGET_SSSE3) - return false; - vmode = V8QImode; - } - else if (GET_MODE_SIZE (d->vmode) == 16) - { - if (!TARGET_SSSE3) - return false; - } - else if (GET_MODE_SIZE (d->vmode) == 32) - { - if (!TARGET_AVX2) - return false; + switch (GET_MODE_SIZE (d->vmode)) + { + case 4: + if (!TARGET_SSSE3) + return false; + vmode = V4QImode; + break; - /* V4DImode should be already handled through - expand_vselect by vpermq instruction. */ - gcc_assert (d->vmode != V4DImode); + case 8: + if (!TARGET_SSSE3) + return false; + vmode = V8QImode; + break; - vmode = V32QImode; - if (d->vmode == V8SImode - || d->vmode == V16HImode - || d->vmode == V32QImode) - { - /* First see if vpermq can be used for - V8SImode/V16HImode/V32QImode. */ - if (valid_perm_using_mode_p (V4DImode, d)) - { - for (i = 0; i < 4; i++) - perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3; - if (d->testing_p) + case 16: + if (!TARGET_SSSE3) + return false; + break; + + case 32: + if (!TARGET_AVX2) + return false; + + /* V4DImode should be already handled through + expand_vselect by vpermq instruction. */ + gcc_assert (d->vmode != V4DImode); + + vmode = V32QImode; + if (d->vmode == V8SImode + || d->vmode == V16HImode + || d->vmode == V32QImode) + { + /* First see if vpermq can be used for + V8SImode/V16HImode/V32QImode. 
*/ + if (valid_perm_using_mode_p (V4DImode, d)) + { + for (i = 0; i < 4; i++) + perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3; + if (d->testing_p) + return true; + target = gen_reg_rtx (V4DImode); + if (expand_vselect (target, gen_lowpart (V4DImode, d->op0), + perm, 4, false)) + { + emit_move_insn (d->target, + gen_lowpart (d->vmode, target)); return true; - target = gen_reg_rtx (V4DImode); - if (expand_vselect (target, gen_lowpart (V4DImode, d->op0), - perm, 4, false)) - { - emit_move_insn (d->target, - gen_lowpart (d->vmode, target)); - return true; - } - return false; - } + } + return false; + } - /* Next see if vpermd can be used. */ - if (valid_perm_using_mode_p (V8SImode, d)) - vmode = V8SImode; - } - /* Or if vpermps can be used. */ - else if (d->vmode == V8SFmode) - vmode = V8SImode; + /* Next see if vpermd can be used. */ + if (valid_perm_using_mode_p (V8SImode, d)) + vmode = V8SImode; + } + /* Or if vpermps can be used. */ + else if (d->vmode == V8SFmode) + vmode = V8SImode; - if (vmode == V32QImode) - { - /* vpshufb only works intra lanes, it is not - possible to shuffle bytes in between the lanes. */ - for (i = 0; i < nelt; ++i) - if ((d->perm[i] ^ i) & (nelt / 2)) - return false; - } - } - else if (GET_MODE_SIZE (d->vmode) == 64) - { - if (!TARGET_AVX512BW) - return false; + if (vmode == V32QImode) + { + /* vpshufb only works intra lanes, it is not + possible to shuffle bytes in between the lanes. */ + for (i = 0; i < nelt; ++i) + if ((d->perm[i] ^ i) & (nelt / 2)) + return false; + } + break; - /* If vpermq didn't work, vpshufb won't work either. */ - if (d->vmode == V8DFmode || d->vmode == V8DImode) - return false; + case 64: + if (!TARGET_AVX512BW) + return false; - vmode = V64QImode; - if (d->vmode == V16SImode - || d->vmode == V32HImode - || d->vmode == V64QImode) - { - /* First see if vpermq can be used for - V16SImode/V32HImode/V64QImode. */ - if (valid_perm_using_mode_p (V8DImode, d)) - { - for (i = 0; i < 8; i++) - perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7; - if (d->testing_p) + /* If vpermq didn't work, vpshufb won't work either. */ + if (d->vmode == V8DFmode || d->vmode == V8DImode) + return false; + + vmode = V64QImode; + if (d->vmode == V16SImode + || d->vmode == V32HImode + || d->vmode == V64QImode) + { + /* First see if vpermq can be used for + V16SImode/V32HImode/V64QImode. */ + if (valid_perm_using_mode_p (V8DImode, d)) + { + for (i = 0; i < 8; i++) + perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7; + if (d->testing_p) + return true; + target = gen_reg_rtx (V8DImode); + if (expand_vselect (target, gen_lowpart (V8DImode, d->op0), + perm, 8, false)) + { + emit_move_insn (d->target, + gen_lowpart (d->vmode, target)); return true; - target = gen_reg_rtx (V8DImode); - if (expand_vselect (target, gen_lowpart (V8DImode, d->op0), - perm, 8, false)) - { - emit_move_insn (d->target, - gen_lowpart (d->vmode, target)); - return true; - } - return false; - } + } + return false; + } - /* Next see if vpermd can be used. */ - if (valid_perm_using_mode_p (V16SImode, d)) - vmode = V16SImode; - } - /* Or if vpermps can be used. */ - else if (d->vmode == V16SFmode) - vmode = V16SImode; - if (vmode == V64QImode) - { - /* vpshufb only works intra lanes, it is not - possible to shuffle bytes in between the lanes. */ - for (i = 0; i < nelt; ++i) - if ((d->perm[i] ^ i) & (3 * nelt / 4)) - return false; - } - } - else + /* Next see if vpermd can be used. */ + if (valid_perm_using_mode_p (V16SImode, d)) + vmode = V16SImode; + } + /* Or if vpermps can be used. 
*/ + else if (d->vmode == V16SFmode) + vmode = V16SImode; + if (vmode == V64QImode) + { + /* vpshufb only works intra lanes, it is not + possible to shuffle bytes in between the lanes. */ + for (i = 0; i < nelt; ++i) + if ((d->perm[i] ^ i) & (3 * nelt / 4)) + return false; + } + break; + + default: return false; - } + } if (d->testing_p) return true; @@ -17893,23 +17931,28 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) machine_mode vpmode = vmode; - if (vmode == V8QImode) + if (vmode == V4QImode + || vmode == V8QImode) { rtx m128 = GEN_INT (-128); /* Remap elements from the second operand, as we have to - account for inactive top 8 elements from the first operand. */ + account for inactive top elements from the first operand. */ if (!d->one_operand_p) - for (i = 0; i < nelt; ++i) - { - int ival = INTVAL (rperm[i]); - if (ival >= 8) - ival += 8; - rperm[i] = GEN_INT (ival); - } + { + int sz = GET_MODE_SIZE (vmode); - /* V8QI is emulated with V16QI instruction, fill inactive - elements in the top 8 positions with zeros. */ + for (i = 0; i < nelt; ++i) + { + int ival = INTVAL (rperm[i]); + if (ival >= sz) + ival += 16-sz; + rperm[i] = GEN_INT (ival); + } + } + + /* V4QI/V8QI is emulated with V16QI instruction, fill inactive + elements in the top positions with zeros. */ for (i = nelt; i < 16; ++i) rperm[i] = m128; @@ -17931,7 +17974,9 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) { rtx (*gen) (rtx, rtx, rtx); - if (vmode == V8QImode) + if (vmode == V4QImode) + gen = gen_mmx_pshufbv4qi3; + else if (vmode == V8QImode) gen = gen_mmx_pshufbv8qi3; else if (vmode == V16QImode) gen = gen_ssse3_pshufbv16qi3; @@ -17958,7 +18003,9 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) op1 = gen_lowpart (vmode, d->op1); - if (vmode == V8QImode) + if (vmode == V4QImode) + gen = gen_mmx_ppermv32; + else if (vmode == V8QImode) gen = gen_mmx_ppermv64; else if (vmode == V16QImode) gen = gen_xop_pperm; @@ -18405,7 +18452,8 @@ expand_vec_perm_pblendv (struct expand_vec_perm_d *d) ; else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode)) ; - else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 8 + else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 4 + || GET_MODE_SIZE (vmode) == 8 || GET_MODE_SIZE (vmode) == 16)) ; else @@ -18485,7 +18533,8 @@ expand_vec_perm_interleave2 (struct expand_vec_perm_d *d) rtx_insn *seq; bool ok, same_halves = false; - if (GET_MODE_SIZE (d->vmode) == 8 + if (GET_MODE_SIZE (d->vmode) == 4 + || GET_MODE_SIZE (d->vmode) == 8 || GET_MODE_SIZE (d->vmode) == 16) { if (d->one_operand_p) @@ -18521,7 +18570,8 @@ expand_vec_perm_interleave2 (struct expand_vec_perm_d *d) memset (remap, 0xff, sizeof (remap)); dremap = *d; - if (GET_MODE_SIZE (d->vmode) == 8) + if (GET_MODE_SIZE (d->vmode) == 4 + || GET_MODE_SIZE (d->vmode) == 8) { unsigned HOST_WIDE_INT h1, h2, h3, h4; @@ -19269,7 +19319,8 @@ expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn) else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode)) ; else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16 - || GET_MODE_SIZE (vmode) == 8)) + || GET_MODE_SIZE (vmode) == 8 + || GET_MODE_SIZE (vmode) == 4)) ; else return false; @@ -19530,7 +19581,8 @@ expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d) rtx (*gen) (rtx, rtx, rtx); if (!TARGET_SSSE3 || (GET_MODE_SIZE (d->vmode) != 16 - && GET_MODE_SIZE (d->vmode) != 8)) + && GET_MODE_SIZE (d->vmode) != 8 + && GET_MODE_SIZE (d->vmode) != 4)) return false; gcc_assert (!d->one_operand_p); @@ -19539,6 +19591,10 @@ expand_vec_perm_pshufb2 
(struct expand_vec_perm_d *d) switch (GET_MODE_SIZE (d->vmode)) { + case 4: + mode = V4QImode; + gen = gen_mmx_pshufbv4qi3; + break; case 8: mode = V8QImode; gen = gen_mmx_pshufbv8qi3; @@ -20025,6 +20081,26 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd) return false; break; + case E_V4QImode: + if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB) + return expand_vec_perm_pshufb2 (d); + else + { + if (d->testing_p) + break; + /* We need 2*log2(N)-1 operations to achieve odd/even + with interleave. */ + t1 = gen_reg_rtx (V4QImode); + emit_insn (gen_mmx_punpckhbw_low (t1, d->op0, d->op1)); + emit_insn (gen_mmx_punpcklbw_low (d->target, d->op0, d->op1)); + if (odd) + t2 = gen_mmx_punpckhbw_low (d->target, d->target, t1); + else + t2 = gen_mmx_punpcklbw_low (d->target, d->target, t1); + emit_insn (t2); + } + break; + case E_V4HImode: if (TARGET_SSE4_1) return expand_vec_perm_even_odd_pack (d); @@ -20214,6 +20290,7 @@ expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d) { unsigned elt = d->perm[0], nelt2 = d->nelt / 2; machine_mode vmode = d->vmode; + rtx (*gen) (rtx, rtx, rtx); unsigned char perm2[4]; rtx op0 = d->op0, dest; bool ok; @@ -20238,24 +20315,48 @@ expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d) /* These are always implementable using standard shuffle patterns. */ gcc_unreachable (); + case E_V4QImode: + /* This can be implemented via interleave and pshuflw. */ + if (d->testing_p) + return true; + + if (elt >= nelt2) + { + gen = gen_mmx_punpckhbw_low; + elt -= nelt2; + } + else + gen = gen_mmx_punpcklbw_low; + + dest = gen_reg_rtx (vmode); + emit_insn (gen (dest, op0, op0)); + vmode = get_mode_wider_vector (vmode); + op0 = gen_lowpart (vmode, dest); + + memset (perm2, elt, 2); + dest = gen_reg_rtx (vmode); + ok = expand_vselect (dest, op0, perm2, 2, d->testing_p); + gcc_assert (ok); + + emit_move_insn (d->target, gen_lowpart (d->vmode, dest)); + return true; + case E_V8QImode: - /* These can be implemented via interleave. We save one insn by + /* This can be implemented via interleave. We save one insn by stopping once we have promoted to V2SImode and then use pshufd. */ if (d->testing_p) return true; do { - rtx dest; - rtx (*gen) (rtx, rtx, rtx) - = vmode == V8QImode ? gen_mmx_punpcklbw - : gen_mmx_punpcklwd; - if (elt >= nelt2) { gen = vmode == V8QImode ? gen_mmx_punpckhbw : gen_mmx_punpckhwd; elt -= nelt2; } + else + gen = vmode == V8QImode ? gen_mmx_punpcklbw + : gen_mmx_punpcklwd; nelt2 /= 2; dest = gen_reg_rtx (vmode); @@ -20266,11 +20367,11 @@ expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d) while (vmode != V2SImode); memset (perm2, elt, 2); - dest = gen_reg_rtx (V2SImode); + dest = gen_reg_rtx (vmode); ok = expand_vselect (dest, op0, perm2, 2, d->testing_p); gcc_assert (ok); - if (!d->testing_p) - emit_move_insn (d->target, gen_lowpart (d->vmode, dest)); + + emit_move_insn (d->target, gen_lowpart (d->vmode, dest)); return true; case E_V8HImode: @@ -20281,17 +20382,15 @@ expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d) return true; do { - rtx dest; - rtx (*gen) (rtx, rtx, rtx) - = vmode == V16QImode ? gen_vec_interleave_lowv16qi - : gen_vec_interleave_lowv8hi; - if (elt >= nelt2) { gen = vmode == V16QImode ? gen_vec_interleave_highv16qi : gen_vec_interleave_highv8hi; elt -= nelt2; } + else + gen = vmode == V16QImode ? 
gen_vec_interleave_lowv16qi + : gen_vec_interleave_lowv8hi; nelt2 /= 2; dest = gen_reg_rtx (vmode); @@ -20302,11 +20401,11 @@ expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d) while (vmode != V4SImode); memset (perm2, elt, 4); - dest = gen_reg_rtx (V4SImode); + dest = gen_reg_rtx (vmode); ok = expand_vselect (dest, op0, perm2, 4, d->testing_p); gcc_assert (ok); - if (!d->testing_p) - emit_move_insn (d->target, gen_lowpart (d->vmode, dest)); + + emit_move_insn (d->target, gen_lowpart (d->vmode, dest)); return true; case E_V64QImode: @@ -20787,6 +20886,10 @@ ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, if (d.testing_p) return true; break; + case E_V4QImode: + if (!TARGET_SSE2) + return false; + break; case E_V2DImode: case E_V2DFmode: if (!TARGET_SSE) diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 5f105727..4ead8be 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -2362,6 +2362,18 @@ [(set_attr "type" "sse4arg") (set_attr "mode" "TI")]) +(define_insn "mmx_ppermv32" + [(set (match_operand:V4QI 0 "register_operand" "=x") + (unspec:V4QI + [(match_operand:V4QI 1 "register_operand" "x") + (match_operand:V4QI 2 "register_operand" "x") + (match_operand:V16QI 3 "nonimmediate_operand" "xm")] + UNSPEC_XOP_PERMUTE))] + "TARGET_XOP" + "vpperm\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "sse4arg") + (set_attr "mode" "TI")]) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; Parallel integral logical operations @@ -2550,6 +2562,23 @@ (set_attr "type" "mmxcvt,sselog,sselog") (set_attr "mode" "DI,TI,TI")]) +(define_insn_and_split "mmx_punpckhbw_low" + [(set (match_operand:V4QI 0 "register_operand" "=x,Yw") + (vec_select:V4QI + (vec_concat:V8QI + (match_operand:V4QI 1 "register_operand" "0,Yw") + (match_operand:V4QI 2 "register_operand" "x,Yw")) + (parallel [(const_int 2) (const_int 6) + (const_int 3) (const_int 7)])))] + "TARGET_SSE2" + "#" + "&& reload_completed" + [(const_int 0)] + "ix86_split_mmx_punpck (operands, true); DONE;" + [(set_attr "isa" "noavx,avx") + (set_attr "type" "sselog") + (set_attr "mode" "TI")]) + (define_insn_and_split "mmx_punpcklbw" [(set (match_operand:V8QI 0 "register_operand" "=y,x,Yw") (vec_select:V8QI @@ -2573,6 +2602,23 @@ (set_attr "type" "mmxcvt,sselog,sselog") (set_attr "mode" "DI,TI,TI")]) +(define_insn_and_split "mmx_punpcklbw_low" + [(set (match_operand:V4QI 0 "register_operand" "=x,Yw") + (vec_select:V4QI + (vec_concat:V8QI + (match_operand:V4QI 1 "register_operand" "0,Yw") + (match_operand:V4QI 2 "register_operand" "x,Yw")) + (parallel [(const_int 0) (const_int 4) + (const_int 1) (const_int 5)])))] + "TARGET_SSE2" + "#" + "&& reload_completed" + [(const_int 0)] + "ix86_split_mmx_punpck (operands, false); DONE;" + [(set_attr "isa" "noavx,avx") + (set_attr "type" "sselog") + (set_attr "mode" "TI")]) + (define_insn_and_split "mmx_punpckhwd" [(set (match_operand:V4HI 0 "register_operand" "=y,x,Yw") (vec_select:V4HI @@ -2930,6 +2976,24 @@ (set_attr "btver2_decode" "vector") (set_attr "mode" "TI")]) +(define_insn "mmx_pshufbv4qi3" + [(set (match_operand:V4QI 0 "register_operand" "=x,Yw") + (unspec:V4QI + [(match_operand:V4QI 1 "register_operand" "0,Yw") + (match_operand:V16QI 2 "vector_operand" "xBm,Ywm")] + UNSPEC_PSHUFB))] + "TARGET_SSSE3" + "@ + pshufb\t{%2, %0|%0, %2} + vpshufb\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" "noavx,avx") + (set_attr "type" "sselog1") + (set_attr "prefix_data16" "1,*") + (set_attr "prefix_extra" "1") + (set_attr "prefix" 
"orig,maybe_evex") + (set_attr "btver2_decode" "vector") + (set_attr "mode" "TI")]) + (define_expand "mmx_pshufw" [(match_operand:V4HI 0 "register_operand") (match_operand:V4HI 1 "register_mmxmem_operand") @@ -3002,12 +3066,12 @@ (set_attr "length_immediate" "1") (set_attr "mode" "TI")]) -(define_insn "*mmx_pblendw" +(define_insn "*mmx_pblendw64" [(set (match_operand:V4HI 0 "register_operand" "=Yr,*x,x") (vec_merge:V4HI (match_operand:V4HI 2 "register_operand" "Yr,*x,x") (match_operand:V4HI 1 "register_operand" "0,0,x") - (match_operand:SI 3 "const_0_to_63_operand" "n,n,n")))] + (match_operand:SI 3 "const_0_to_15_operand" "n,n,n")))] "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE" "@ pblendw\t{%3, %2, %0|%0, %2, %3} @@ -3020,6 +3084,24 @@ (set_attr "prefix" "orig,orig,vex") (set_attr "mode" "TI")]) +(define_insn "*mmx_pblendw32" + [(set (match_operand:V2HI 0 "register_operand" "=Yr,*x,x") + (vec_merge:V2HI + (match_operand:V2HI 2 "register_operand" "Yr,*x,x") + (match_operand:V2HI 1 "register_operand" "0,0,x") + (match_operand:SI 3 "const_0_to_7_operand" "n,n,n")))] + "TARGET_SSE4_1" + "@ + pblendw\t{%3, %2, %0|%0, %2, %3} + pblendw\t{%3, %2, %0|%0, %2, %3} + vpblendw\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "isa" "noavx,noavx,avx") + (set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "orig,orig,vex") + (set_attr "mode" "TI")]) + ;; Optimize V2SImode load from memory, swapping the elements and ;; storing back into the memory into DImode rotate of the memory by 32. (define_split -- cgit v1.1 From 3b5e8ee4f1ecc6d407f391695f65960bcbd63cff Mon Sep 17 00:00:00 2001 From: Iain Sandoe Date: Sun, 13 Dec 2020 17:21:16 +0000 Subject: Darwin, configury : Allow for specification and detection of dsymutil. In order to enable DWARF versions > 2 we need a sufficiently modern version of dsymutil (in addition to the assembler / linker). This allows the user to configure a different path from the installed one. In addition, there are several sources of dsymutil so we differentiate these in order to be get accurate version information. Signed-off-by: Iain Sandoe gcc/ChangeLog: * configure.ac: Handle --with-dsymutil in the same way as we do for the assembler and linker. (DEFAULT_DSYMUTIL): New. Extract the type and version for the dsymutil configured or found by the default searches. * config.in: Regenerated. * configure: Regenerated. * collect2.c (do_dsymutil): Handle locating dsymutil in the same way as for the assembler and linker. * config/darwin.h (DSYMUTIL): Delete. * gcc.c: Report a configured dsymutil correctly. * exec-tool.in: Allow for dsymutil. ChangeLog: * Makefile.def: Add dsymutil defs. * Makefile.in: Regenerated. * Makefile.tpl: Add dsymutil to flags. * configure: Regenerated. * configure.ac: Add dsymutil to target and build recipes. --- gcc/config/darwin.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/darwin.h b/gcc/config/darwin.h index 5f11978..20d6b1e 100644 --- a/gcc/config/darwin.h +++ b/gcc/config/darwin.h @@ -240,8 +240,6 @@ extern GTY(()) int darwin_ms_struct; DARWIN_NOCOMPACT_UNWIND \ "}}}}}}} % Date: Mon, 5 Jul 2021 17:23:43 -0400 Subject: Remove redundant compare in shift loop on H8 gcc/ChangeLog * config/h8300/shiftrotate.md (shift-by-variable patterns): Update to generate condition code aware RTL directly. 
--- gcc/config/h8300/shiftrotate.md | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/h8300/shiftrotate.md b/gcc/config/h8300/shiftrotate.md index 0476324..485303c 100644 --- a/gcc/config/h8300/shiftrotate.md +++ b/gcc/config/h8300/shiftrotate.md @@ -385,10 +385,15 @@ (parallel [(set (match_dup 0) (match_op_dup 2 [(match_dup 0) (const_int 1)])) - (clobber (scratch:QI))]) - (set (match_dup 1) (plus:QI (match_dup 1) (const_int -1))) + (clobber (reg:CC CC_REG))]) + (parallel + [(set (reg:CCZN CC_REG) + (compare:CCZN + (plus:QI (match_dup 1) (const_int -1)) + (const_int 0))) + (set (match_dup 1) (plus:QI (match_dup 1) (const_int -1)))]) (set (pc) - (if_then_else (ne (match_dup 1) (const_int 0)) + (if_then_else (ne (reg:CCZN CC_REG) (const_int 0)) (label_ref (match_dup 4)) (pc))) (match_dup 5)] @@ -416,10 +421,15 @@ (parallel [(set (match_dup 0) (match_op_dup 2 [(match_dup 0) (const_int 1)])) - (clobber (scratch:QI))]) - (set (match_dup 3) (plus:QI (match_dup 3) (const_int -1))) + (clobber (reg:CC CC_REG))]) + (parallel + [(set (reg:CCZN CC_REG) + (compare:CCZN + (plus:QI (match_dup 3) (const_int -1)) + (const_int 0))) + (set (match_dup 3) (plus:QI (match_dup 3) (const_int -1)))]) (set (pc) - (if_then_else (ne (match_dup 3) (const_int 0)) + (if_then_else (ne (reg:CCZN CC_REG) (const_int 0)) (label_ref (match_dup 4)) (pc))) (match_dup 5)] -- cgit v1.1 From c64d15224c6dc4dc3b6b3c2c6b6fd5c7e0e9c072 Mon Sep 17 00:00:00 2001 From: Kewen Lin Date: Mon, 5 Jul 2021 20:54:16 -0500 Subject: i386: Disable param ira-consider-dup-in-all-alts [PR100328] With Hongtao's SPEC2017 performance evaluation result here: https://gcc.gnu.org/pipermail/gcc-patches/2021-June/573986.html this new parameter ira-consider-dup-in-all-alts has negative effects on i386, this patch is to disable it explicitly on i386. Bootstrapped & regtested on x86_64-redhat-linux. gcc/ChangeLog: PR rtl-optimization/100328 * config/i386/i386-options.c (ix86_option_override_internal): Set param_ira_consider_dup_in_all_alts to 0. --- gcc/config/i386/i386-options.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c index 0eccb54..7a35c46 100644 --- a/gcc/config/i386/i386-options.c +++ b/gcc/config/i386/i386-options.c @@ -2831,6 +2831,8 @@ ix86_option_override_internal (bool main_args_p, if (ix86_indirect_branch != indirect_branch_keep) SET_OPTION_IF_UNSET (opts, opts_set, flag_jump_tables, 0); + SET_OPTION_IF_UNSET (opts, opts_set, param_ira_consider_dup_in_all_alts, 0); + return true; } -- cgit v1.1 From 7d810646d421f6975300c0d06f4e9af27a48f26d Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Mon, 5 Jul 2021 15:51:49 +0200 Subject: Add FMADDSUB and FMSUBADD SLP vectorization patterns and optabs This adds named expanders for vec_fmaddsub4 and vec_fmsubadd4 which map to x86 vfmaddsubXXXp{ds} and vfmsubaddXXXp{ds} instructions. This complements the previous addition of ADDSUB support. x86 lacks SUBADD and the negate variants of FMA with mixed plus minus so I did not add optabs or patterns for those but it would not be difficult if there's a target that has them. 2021-07-05 Richard Biener * doc/md.texi (vec_fmaddsub4): Document. (vec_fmsubadd4): Likewise. * optabs.def (vec_fmaddsub$a4): Add. (vec_fmsubadd$a4): Likewise. * internal-fn.def (IFN_VEC_FMADDSUB): Add. (IFN_VEC_FMSUBADD): Likewise. 
* tree-vect-slp-patterns.c (addsub_pattern::recognize): Refactor to handle IFN_VEC_FMADDSUB and IFN_VEC_FMSUBADD. (addsub_pattern::build): Likewise. * tree-vect-slp.c (vect_optimize_slp): CFN_VEC_FMADDSUB and CFN_VEC_FMSUBADD are not transparent for permutes. * config/i386/sse.md (vec_fmaddsub4): New expander. (vec_fmsubadd4): Likewise. * gcc.target/i386/vect-fmaddsubXXXpd.c: New testcase. * gcc.target/i386/vect-fmaddsubXXXps.c: Likewise. * gcc.target/i386/vect-fmsubaddXXXpd.c: Likewise. * gcc.target/i386/vect-fmsubaddXXXps.c: Likewise. --- gcc/config/i386/sse.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index bcf1605..17c9e57 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -4644,6 +4644,25 @@ ;; ;; But this doesn't seem useful in practice. +(define_expand "vec_fmaddsub4" + [(set (match_operand:VF 0 "register_operand") + (unspec:VF + [(match_operand:VF 1 "nonimmediate_operand") + (match_operand:VF 2 "nonimmediate_operand") + (match_operand:VF 3 "nonimmediate_operand")] + UNSPEC_FMADDSUB))] + "TARGET_FMA || TARGET_FMA4 || ( == 64 || TARGET_AVX512VL)") + +(define_expand "vec_fmsubadd4" + [(set (match_operand:VF 0 "register_operand") + (unspec:VF + [(match_operand:VF 1 "nonimmediate_operand") + (match_operand:VF 2 "nonimmediate_operand") + (neg:VF + (match_operand:VF 3 "nonimmediate_operand"))] + UNSPEC_FMADDSUB))] + "TARGET_FMA || TARGET_FMA4 || ( == 64 || TARGET_AVX512VL)") + (define_expand "fmaddsub_" [(set (match_operand:VF 0 "register_operand") (unspec:VF -- cgit v1.1 From f99f6eb58e1f894dae024f63cc2fe30fa7605e59 Mon Sep 17 00:00:00 2001 From: Iain Sandoe Date: Sat, 3 Jul 2021 19:47:48 +0100 Subject: X86: Provide a CTOR for stringop_algs [PR100246]. Several older compilers fail to build modern GCC because of missing or incomplete C++11 support. Signed-off-by: Iain Sandoe PR bootstrap/100246 - [11/12 Regression] GCC will not bootstrap with clang 3.4/3.5 [xcode 5/6, Darwin 12/13] PR bootstrap/100246 gcc/ChangeLog: * config/i386/i386.h (struct stringop_algs): Define a CTOR for this type. --- gcc/config/i386/i386.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 6e0340a..03d1761 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -73,6 +73,15 @@ struct stringop_algs { const enum stringop_alg unknown_size; const struct stringop_strategy { + /* Several older compilers delete the default constructor because of the + const entries (see PR100246). Manually specifying a CTOR works around + this issue. Since this header is used by code compiled with the C + compiler we must guard the addition. */ +#ifdef __cplusplus + stringop_strategy(int _max = -1, enum stringop_alg _alg = libcall, + int _noalign = false) + : max (_max), alg (_alg), noalign (_noalign) {} +#endif const int max; const enum stringop_alg alg; int noalign; -- cgit v1.1 From 73c49ff53235d92aba4ee748fcb06b06e83e0b8f Mon Sep 17 00:00:00 2001 From: Jeff Law Date: Tue, 6 Jul 2021 10:55:53 -0400 Subject: Use H8 nop moves as tst insns gcc * config/h8300/jumpcall.md (*branch): When possible, generate the comparison in CCZN mode. * config/h8300/predicates.md (simple_memory_operand): Reject all auto-increment addressing modes. 
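A hedged sketch of the source shape this helps (a reduced example, not from the patch): a branch conditioned on a register compared against zero, where an ordinary move of that register already leaves the needed zero/negative flags, so no explicit cmp/tst is required.

  int
  sign_dispatch (int x)
  {
    /* EQ/NE/LT/GE against zero only need the Z/N flags; per the comment
       in the patch, LE/GT also work because the tst forms clear V.  So
       the compare can be generated in CCZN mode and later elided when a
       preceding move has already set those flags.  */
    if (x < 0)
      return -1;
    return 1;
  }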
--- gcc/config/h8300/jumpcall.md | 27 +++++++++++++++++++++++---- gcc/config/h8300/predicates.md | 2 ++ 2 files changed, 25 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/h8300/jumpcall.md b/gcc/config/h8300/jumpcall.md index 7b6a66a..e1f0418 100644 --- a/gcc/config/h8300/jumpcall.md +++ b/gcc/config/h8300/jumpcall.md @@ -23,13 +23,32 @@ "" "#" "&& reload_completed" - [(set (reg:H8cc CC_REG) - (compare:H8cc (match_dup 1) (match_dup 2))) + [(set (match_dup 4) + (match_dup 5)) (set (pc) (if_then_else (match_op_dup 0 - [(reg:H8cc CC_REG) (const_int 0)]) + [(match_dup 4) (const_int 0)]) (label_ref (match_dup 3)) (pc)))] - "") + " +{ + machine_mode mode; + + if (REG_P (operands[1]) + && operands[2] == const0_rtx + && (GET_CODE (operands[0]) == EQ + || GET_CODE (operands[0]) == NE + || GET_CODE (operands[0]) == LT + || GET_CODE (operands[0]) == GE + /* Our tstxx insns will set ZN and clear V, so we can handle + a couple additional cases. */ + || GET_CODE (operands[0]) == LE + || GET_CODE (operands[0]) == GT)) + mode = E_CCZNmode; + else + mode = E_CCmode; + operands[4] = gen_rtx_REG (mode, CC_REG); + operands[5] = gen_rtx_COMPARE (mode, operands[1], operands[2]); +}") (define_insn "*branch_1" [(set (pc) diff --git a/gcc/config/h8300/predicates.md b/gcc/config/h8300/predicates.md index f4e3ed4..bed23e9 100644 --- a/gcc/config/h8300/predicates.md +++ b/gcc/config/h8300/predicates.md @@ -506,6 +506,8 @@ { if (GET_MODE (op) == mode && (GET_CODE (XEXP (op, 0)) != PRE_DEC + && GET_CODE (XEXP (op, 0)) != PRE_INC + && GET_CODE (XEXP (op, 0)) != POST_DEC && GET_CODE (XEXP (op, 0)) != POST_INC)) return 1; return 0; -- cgit v1.1 From f65878178ab05180a5937f11f8fdb755678a82ce Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Tue, 6 Jul 2021 19:27:34 +0200 Subject: i386: Add variable vec_set for 32bit vectors [PR97194] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To generate sane code a SSE4.1 variable PBLENDV instruction is needed. Also enable variable vec_set through vec_setm_operand predicate for TARGET_SSE4_1 instead of TARGET_AVX2. ix86_expand_vector_init_duplicate is able to emulate vpbroadcast{b,w} with pxor/pshufb. 2021-07-06 Uroš Bizjak gcc/ PR target/97194 * config/i386/predicates.md (vec_setm_operand): Enable register_operand for TARGET_SSE4_1. * config/i386/mmx.md (vec_setv2hi): Use vec_setm_operand as operand 2 predicate. Call ix86_expand_vector_set_var for non-constant index operand. (vec_setv4qi): Use vec_setm_mmx_operand as operand 2 predicate. Call ix86_expand_vector_set_var for non-constant index operand. gcc/testsuite/ PR target/97194 * gcc.target/i386/sse4_1-vec-set-1a.c: New test. * gcc.target/i386/sse4_1-vec-set-2a.c: Ditto. 
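A hedged sketch of what the expanders now accept (modelled on, but not copied from, the new sse4_1-vec-set-*a.c tests): storing one element of a 4-byte vector at a run-time index, which with SSE4.1 can use a variable blend instead of a round trip through memory.

  typedef unsigned char v4qi __attribute__ ((vector_size (4)));

  v4qi
  set_elem (v4qi v, unsigned char val, unsigned int idx)
  {
    v[idx] = val;  /* non-constant index: routed to ix86_expand_vector_set_var */
    return v;
  }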
--- gcc/config/i386/mmx.md | 18 ++++++++++++------ gcc/config/i386/predicates.md | 2 +- 2 files changed, 13 insertions(+), 7 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 4ead8be..7e83b64 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -3534,11 +3534,14 @@ (define_expand "vec_setv2hi" [(match_operand:V2HI 0 "register_operand") (match_operand:HI 1 "register_operand") - (match_operand 2 "const_int_operand")] + (match_operand 2 "vec_setm_operand")] "TARGET_SSE2" { - ix86_expand_vector_set (false, operands[0], operands[1], - INTVAL (operands[2])); + if (CONST_INT_P (operands[2])) + ix86_expand_vector_set (false, operands[0], operands[1], + INTVAL (operands[2])); + else + ix86_expand_vector_set_var (operands[0], operands[1], operands[2]); DONE; }) @@ -3556,11 +3559,14 @@ (define_expand "vec_setv4qi" [(match_operand:V4QI 0 "register_operand") (match_operand:QI 1 "register_operand") - (match_operand 2 "const_int_operand")] + (match_operand 2 "vec_setm_mmx_operand")] "TARGET_SSE4_1" { - ix86_expand_vector_set (false, operands[0], operands[1], - INTVAL (operands[2])); + if (CONST_INT_P (operands[2])) + ix86_expand_vector_set (false, operands[0], operands[1], + INTVAL (operands[2])); + else + ix86_expand_vector_set_var (operands[0], operands[1], operands[2]); DONE; }) diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index c4b35c8..9488632 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1023,7 +1023,7 @@ ;; True for registers, or const_int_operand, used to vec_setm expander. (define_predicate "vec_setm_operand" (ior (and (match_operand 0 "register_operand") - (match_test "TARGET_AVX2")) + (match_test "TARGET_SSE4_1")) (match_code "const_int"))) (define_predicate "vec_setm_mmx_operand" -- cgit v1.1 From 62e43587ef1c874b62a6c4c5c3980969e4a2da97 Mon Sep 17 00:00:00 2001 From: Martin Sebor Date: Tue, 6 Jul 2021 13:45:54 -0600 Subject: Improve warning suppression for inlined functions. gcc/ChangeLog: * config/aarch64/aarch64-builtins.c (aarch64_simd_expand_builtin): Remove %K and use error_at. (aarch64_expand_fcmla_builtin): Same. (aarch64_expand_builtin_tme): Same. (aarch64_expand_builtin_memtag): Same. * config/arm/arm-builtins.c (arm_expand_acle_builtin): Same. (arm_expand_builtin): Same. * config/arm/arm.c (bounds_check): Same. 
--- gcc/config/aarch64/aarch64-builtins.c | 23 +++++++++++++++-------- gcc/config/arm/arm-builtins.c | 33 ++++++++++++++++++++------------- gcc/config/arm/arm.c | 4 ++-- 3 files changed, 37 insertions(+), 23 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c index 3cab3ec..9ed4b72 100644 --- a/gcc/config/aarch64/aarch64-builtins.c +++ b/gcc/config/aarch64/aarch64-builtins.c @@ -1598,8 +1598,9 @@ constant_arg: if (!(*insn_data[icode].operand[opc].predicate) (op[opc], mode)) { - error ("%Kargument %d must be a constant immediate", - exp, opc + 1 - have_retval); + error_at (EXPR_LOCATION (exp), + "argument %d must be a constant immediate", + opc + 1 - have_retval); return const0_rtx; } break; @@ -1669,10 +1670,13 @@ aarch64_simd_expand_builtin (int fcode, tree exp, rtx target) / UINTVAL (elementsize), exp); else - error ("%Klane index must be a constant immediate", exp); + error_at (EXPR_LOCATION (exp), + "lane index must be a constant immediate"); } else - error ("%Ktotal size and element size must be a non-zero constant immediate", exp); + error_at (EXPR_LOCATION (exp), + "total size and element size must be a non-zero " + "constant immediate"); /* Don't generate any RTL. */ return const0_rtx; } @@ -1828,7 +1832,8 @@ aarch64_expand_fcmla_builtin (tree exp, rtx target, int fcode) /* Validate that the lane index is a constant. */ if (!CONST_INT_P (lane_idx)) { - error ("%Kargument %d must be a constant immediate", exp, 4); + error_at (EXPR_LOCATION (exp), + "argument %d must be a constant immediate", 4); return const0_rtx; } @@ -1917,7 +1922,8 @@ aarch64_expand_builtin_tme (int fcode, tree exp, rtx target) emit_insn (GEN_FCN (CODE_FOR_tcancel) (op0)); else { - error ("%Kargument must be a 16-bit constant immediate", exp); + error_at (EXPR_LOCATION (exp), + "argument must be a 16-bit constant immediate"); return const0_rtx; } } @@ -2006,8 +2012,9 @@ aarch64_expand_builtin_memtag (int fcode, tree exp, rtx target) pat = GEN_FCN (icode) (target, op0, const0_rtx, op1); break; } - error ("%Kargument %d must be a constant immediate " - "in range [0,15]", exp, 2); + error_at (EXPR_LOCATION (exp), + "argument %d must be a constant immediate " + "in range [0,15]", 2); return const0_rtx; } else diff --git a/gcc/config/arm/arm-builtins.c b/gcc/config/arm/arm-builtins.c index fa0fb0b..3a9ff8f 100644 --- a/gcc/config/arm/arm-builtins.c +++ b/gcc/config/arm/arm-builtins.c @@ -3092,26 +3092,30 @@ constant_arg: unsigned int cp_bit = (CONST_INT_P (op[argc]) ? UINTVAL (op[argc]) : -1); if (IN_RANGE (cp_bit, 0, ARM_CDE_CONST_COPROC)) - error ("%Kcoprocessor %d is not enabled " - "with +cdecp%d", exp, cp_bit, cp_bit); + error_at (EXPR_LOCATION (exp), + "coprocessor %d is not enabled " + "with +cdecp%d", cp_bit, cp_bit); else - error ("%Kcoproc must be a constant immediate in " - "range [0-%d] enabled with +cdecp", exp, - ARM_CDE_CONST_COPROC); + error_at (EXPR_LOCATION (exp), + "coproc must be a constant immediate in " + "range [0-%d] enabled with +cdecp", + ARM_CDE_CONST_COPROC); } else /* Here we mention the builtin name to follow the same format that the C/C++ frontends use for referencing a given argument index. 
*/ - error ("%Kargument %d to %qE must be a constant immediate " - "in range [0-%d]", exp, argc + 1, + error_at (EXPR_LOCATION (exp), + "argument %d to %qE must be a constant " + "immediate in range [0-%d]", argc + 1, arm_builtin_decls[fcode], cde_builtin_data[fcode - ARM_BUILTIN_CDE_PATTERN_START].imm_max); } else - error ("%Kargument %d must be a constant immediate", - exp, argc + 1); + error_at (EXPR_LOCATION (exp), + "argument %d must be a constant immediate", + argc + 1); /* We have failed to expand the pattern, and are safely in to invalid code. But the mid-end will still try to build an assignment for this node while it expands, @@ -3328,11 +3332,13 @@ arm_expand_acle_builtin (int fcode, tree exp, rtx target) if (CONST_INT_P (sat_imm)) { if (!IN_RANGE (sat_imm, min_sat, max_sat)) - error ("%Ksaturation bit range must be in the range [%wd, %wd]", - exp, UINTVAL (min_sat), UINTVAL (max_sat)); + error_at (EXPR_LOCATION (exp), + "saturation bit range must be in the range [%wd, %wd]", + UINTVAL (min_sat), UINTVAL (max_sat)); } else - error ("%Ksaturation bit range must be a constant immediate", exp); + error_at (EXPR_LOCATION (exp), + "saturation bit range must be a constant immediate"); /* Don't generate any RTL. */ return const0_rtx; } @@ -3455,7 +3461,8 @@ arm_expand_builtin (tree exp, if (CONST_INT_P (lane_idx)) neon_lane_bounds (lane_idx, 0, TREE_INT_CST_LOW (nlanes), exp); else - error ("%Klane index must be a constant immediate", exp); + error_at (EXPR_LOCATION (exp), + "lane index must be a constant immediate"); /* Don't generate any RTL. */ return const0_rtx; } diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 7b37e1b..de37c90 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -13244,8 +13244,8 @@ bounds_check (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high, if (lane < low || lane >= high) { if (exp) - error ("%K%s %wd out of range %wd - %wd", - exp, desc, lane, low, high - 1); + error_at (EXPR_LOCATION (exp), + "%s %wd out of range %wd - %wd", desc, lane, low, high - 1); else error ("%s %wd out of range %wd - %wd", desc, lane, low, high - 1); } -- cgit v1.1 From 06357071d0557c4f0e9b0a6dfc4d37f5680e34cc Mon Sep 17 00:00:00 2001 From: Martin Sebor Date: Wed, 7 Jul 2021 08:49:13 -0600 Subject: aarch64: Remove a vestigial %K [PR101363] gcc/ChangeLog: PR target/101363 * config/aarch64/aarch64.c (aarch64_simd_lane_bounds): Remove a stray %K from error_at() missed in r12-2088. --- gcc/config/aarch64/aarch64.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 2753c85..f5b25a7 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -20128,7 +20128,8 @@ aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high, if (lane < low || lane >= high) { if (exp) - error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1); + error_at (EXPR_LOCATION (exp), "lane %wd out of range %wd - %wd", + lane, low, high - 1); else error ("lane %wd out of range %wd - %wd", lane, low, high - 1); } -- cgit v1.1 From 6278065af07634278ba30029d92a82b089969baa Mon Sep 17 00:00:00 2001 From: Peter Bergner Date: Wed, 7 Jul 2021 11:39:34 -0500 Subject: rs6000: Harden mma_init_builtins The previous MMA patch added some fragile code to initialize its new built-ins. This patch hardens the initialization. 2021-07-07 Peter Bergner gcc/ * config/rs6000/rs6000-call.c (mma_init_builtins): Use VSX_BUILTIN_LXVP and VSX_BUILTIN_STXVP. 
--- gcc/config/rs6000/rs6000-call.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c index 6115e3b..904e104 100644 --- a/gcc/config/rs6000/rs6000-call.c +++ b/gcc/config/rs6000/rs6000-call.c @@ -14293,7 +14293,8 @@ mma_init_builtins (void) /* Some MMA built-ins that are expanded into gimple are converted into internal MMA built-ins that are expanded into rtl. The internal built-in follows immediately after this built-in. */ - if (d[1].icode != CODE_FOR_nothing) + if (d->code != VSX_BUILTIN_LXVP + && d->code != VSX_BUILTIN_STXVP) { op[nopnds++] = void_type_node; icode = d[1].icode; -- cgit v1.1 From 852b11da11a181df517c0348df044354ff0656d6 Mon Sep 17 00:00:00 2001 From: Michael Meissner Date: Wed, 7 Jul 2021 21:55:38 -0400 Subject: Generate 128-bit int divide/modulus on power10. This patch adds support for the VDIVSQ, VDIVUQ, VMODSQ, and VMODUQ instructions to do 128-bit arithmetic. 2021-07-07 Michael Meissner gcc/ PR target/100809 * config/rs6000/rs6000.md (udivti3): New insn. (divti3): New insn. (umodti3): New insn. (modti3): New insn. gcc/testsuite/ PR target/100809 * gcc.target/powerpc/p10-vdivq-vmodq.c: New test. --- gcc/config/rs6000/rs6000.md | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index e84d031..2368153 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -3234,6 +3234,14 @@ [(set_attr "type" "div") (set_attr "size" "")]) +(define_insn "udivti3" + [(set (match_operand:TI 0 "altivec_register_operand" "=v") + (udiv:TI (match_operand:TI 1 "altivec_register_operand" "v") + (match_operand:TI 2 "altivec_register_operand" "v")))] + "TARGET_POWER10 && TARGET_POWERPC64" + "vdivuq %0,%1,%2" + [(set_attr "type" "vecdiv") + (set_attr "size" "128")]) ;; For powers of two we can do sra[wd]i/addze for divide and then adjust for ;; modulus. 
If it isn't a power of two, force operands into register and do @@ -3324,6 +3332,15 @@ (set_attr "length" "8,12") (set_attr "cell_micro" "not")]) +(define_insn "divti3" + [(set (match_operand:TI 0 "altivec_register_operand" "=v") + (div:TI (match_operand:TI 1 "altivec_register_operand" "v") + (match_operand:TI 2 "altivec_register_operand" "v")))] + "TARGET_POWER10 && TARGET_POWERPC64" + "vdivsq %0,%1,%2" + [(set_attr "type" "vecdiv") + (set_attr "size" "128")]) + (define_expand "mod3" [(set (match_operand:GPR 0 "gpc_reg_operand") (mod:GPR (match_operand:GPR 1 "gpc_reg_operand") @@ -3424,6 +3441,23 @@ (minus:GPR (match_dup 1) (match_dup 3)))]) +(define_insn "umodti3" + [(set (match_operand:TI 0 "altivec_register_operand" "=v") + (umod:TI (match_operand:TI 1 "altivec_register_operand" "v") + (match_operand:TI 2 "altivec_register_operand" "v")))] + "TARGET_POWER10 && TARGET_POWERPC64" + "vmoduq %0,%1,%2" + [(set_attr "type" "vecdiv") + (set_attr "size" "128")]) + +(define_insn "modti3" + [(set (match_operand:TI 0 "altivec_register_operand" "=v") + (mod:TI (match_operand:TI 1 "altivec_register_operand" "v") + (match_operand:TI 2 "altivec_register_operand" "v")))] + "TARGET_POWER10 && TARGET_POWERPC64" + "vmodsq %0,%1,%2" + [(set_attr "type" "vecdiv") + (set_attr "size" "128")]) ;; Logical instructions ;; The logical instructions are mostly combined by using match_operator, -- cgit v1.1 From 663a014e77709bfbd4145c605b178169eaf334fc Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 8 Jul 2021 12:19:54 +0200 Subject: i386: Add pack/unpack patterns for 32bit vectors [PR100637] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit V1SI mode shift is needed to shift 32bit operands and consequently we need to implement V1SI moves and pushes. 2021-07-08 Uroš Bizjak gcc/ PR target/100637 * config/i386/i386-expand.c (ix86_expand_sse_unpack): Handle V4QI mode. * config/i386/mmx.md (V_32): New mode iterator. (mov): Use V_32 mode iterator. (*mov_internal): Ditto. (*push2_rex64): Ditto. (*push2): Ditto. (movmisalign): Ditto. (mmx_v1si3): New insn pattern. (sse4_1_v2qiv2hi2): Ditto. (vec_unpacks_lo_v4qi): New expander. (vec_unpacks_hi_v4qi): Ditto. (vec_unpacku_lo_v4qi): Ditto. (vec_unpacku_hi_v4qi): Ditto. * config/i386/i386.h (VALID_SSE2_REG_MODE): Add V1SImode. (VALID_INT_MODE_P): Ditto. --- gcc/config/i386/i386-expand.c | 18 ++++++++ gcc/config/i386/i386.h | 4 +- gcc/config/i386/mmx.md | 100 +++++++++++++++++++++++++++++++++++------- 3 files changed, 105 insertions(+), 17 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 58c208e..65764ad 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -5355,6 +5355,12 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p) else unpack = gen_sse4_1_sign_extendv2hiv2si2; break; + case E_V4QImode: + if (unsigned_p) + unpack = gen_sse4_1_zero_extendv2qiv2hi2; + else + unpack = gen_sse4_1_sign_extendv2qiv2hi2; + break; default: gcc_unreachable (); } @@ -5380,6 +5386,12 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p) emit_insn (gen_mmx_lshrv1di3 (tmp, gen_lowpart (V1DImode, src), GEN_INT (32))); break; + case 4: + /* Shift higher 2 bytes to lower 2 bytes. 
*/ + tmp = gen_reg_rtx (V1SImode); + emit_insn (gen_mmx_lshrv1si3 (tmp, gen_lowpart (V1SImode, src), + GEN_INT (16))); + break; default: gcc_unreachable (); } @@ -5427,6 +5439,12 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p) else unpack = gen_mmx_punpcklwd; break; + case E_V4QImode: + if (high_p) + unpack = gen_mmx_punpckhbw_low; + else + unpack = gen_mmx_punpcklbw_low; + break; default: gcc_unreachable (); } diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 03d1761..8c3eace 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -1016,7 +1016,7 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); #define VALID_SSE2_REG_MODE(MODE) \ ((MODE) == V16QImode || (MODE) == V8HImode || (MODE) == V2DFmode \ - || (MODE) == V4QImode || (MODE) == V2HImode \ + || (MODE) == V4QImode || (MODE) == V2HImode || (MODE) == V1SImode \ || (MODE) == V2DImode || (MODE) == DFmode) #define VALID_SSE_REG_MODE(MODE) \ @@ -1048,7 +1048,7 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); || (MODE) == SImode || (MODE) == DImode \ || (MODE) == CQImode || (MODE) == CHImode \ || (MODE) == CSImode || (MODE) == CDImode \ - || (MODE) == V4QImode || (MODE) == V2HImode \ + || (MODE) == V4QImode || (MODE) == V2HImode || (MODE) == V1SImode \ || (TARGET_64BIT \ && ((MODE) == TImode || (MODE) == CTImode \ || (MODE) == TFmode || (MODE) == TCmode \ diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 7e83b64..986b758 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -57,10 +57,13 @@ (define_mode_iterator MMXMODE24 [V4HI V2SI]) (define_mode_iterator MMXMODE248 [V4HI V2SI V1DI]) -;; All 32bit integer vector modes +;; All 4-byte integer vector modes +(define_mode_iterator V_32 [V4QI V2HI V1SI]) + +;; 4-byte integer vector modes (define_mode_iterator VI_32 [V4QI V2HI]) -;; All V2S* modes +;; V2S* modes (define_mode_iterator V2FI [V2SF V2SI]) ;; Mapping from integer vector mode to mnemonic suffix @@ -238,8 +241,8 @@ }) (define_expand "mov" - [(set (match_operand:VI_32 0 "nonimmediate_operand") - (match_operand:VI_32 1 "nonimmediate_operand"))] + [(set (match_operand:V_32 0 "nonimmediate_operand") + (match_operand:V_32 1 "nonimmediate_operand"))] "TARGET_SSE2" { ix86_expand_vector_move (mode, operands); @@ -247,9 +250,9 @@ }) (define_insn "*mov_internal" - [(set (match_operand:VI_32 0 "nonimmediate_operand" + [(set (match_operand:V_32 0 "nonimmediate_operand" "=r ,m ,v,v,v,m,r,v") - (match_operand:VI_32 1 "general_operand" + (match_operand:V_32 1 "general_operand" "rmC,rC,C,v,m,v,v,r"))] "TARGET_SSE2 && !(MEM_P (operands[0]) && MEM_P (operands[1]))" @@ -304,8 +307,8 @@ ;; For TARGET_64BIT we always round up to 8 bytes. 
(define_insn "*push2_rex64" - [(set (match_operand:VI_32 0 "push_operand" "=X,X") - (match_operand:VI_32 1 "nonmemory_no_elim_operand" "rC,*v"))] + [(set (match_operand:V_32 0 "push_operand" "=X,X") + (match_operand:V_32 1 "nonmemory_no_elim_operand" "rC,*v"))] "TARGET_SSE2 && TARGET_64BIT" "@ push{q}\t%q1 @@ -314,8 +317,8 @@ (set_attr "mode" "DI")]) (define_insn "*push2" - [(set (match_operand:VI_32 0 "push_operand" "=<,<") - (match_operand:VI_32 1 "general_no_elim_operand" "rC*m,*v"))] + [(set (match_operand:V_32 0 "push_operand" "=<,<") + (match_operand:V_32 1 "general_no_elim_operand" "rC*m,*v"))] "TARGET_SSE2 && !TARGET_64BIT" "@ push{l}\t%1 @@ -324,20 +327,20 @@ (set_attr "mode" "SI")]) (define_split - [(set (match_operand:VI_32 0 "push_operand") - (match_operand:VI_32 1 "sse_reg_operand"))] + [(set (match_operand:V_32 0 "push_operand") + (match_operand:V_32 1 "sse_reg_operand"))] "TARGET_SSE2 && reload_completed" [(set (reg:P SP_REG) (plus:P (reg:P SP_REG) (match_dup 2))) (set (match_dup 0) (match_dup 1))] { - operands[2] = GEN_INT (-PUSH_ROUNDING (GET_MODE_SIZE (mode))); + operands[2] = GEN_INT (-PUSH_ROUNDING (GET_MODE_SIZE (mode))); /* Preserve memory attributes. */ operands[0] = replace_equiv_address (operands[0], stack_pointer_rtx); }) (define_expand "movmisalign" - [(set (match_operand:VI_32 0 "nonimmediate_operand") - (match_operand:VI_32 1 "nonimmediate_operand"))] + [(set (match_operand:V_32 0 "nonimmediate_operand") + (match_operand:V_32 1 "nonimmediate_operand"))] "TARGET_SSE2" { ix86_expand_vector_move (mode, operands); @@ -2006,6 +2009,23 @@ (match_operand:DI 2 "nonmemory_operand")))] "TARGET_MMX_WITH_SSE") +(define_insn "mmx_v1si3" + [(set (match_operand:V1SI 0 "register_operand" "=x,Yw") + (any_lshift:V1SI + (match_operand:V1SI 1 "register_operand" "0,Yw") + (match_operand:DI 2 "nonmemory_operand" "xN,YwN")))] + "TARGET_SSE2" + "@ + pd\t{%2, %0|%0, %2} + vpd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" "noavx,avx") + (set_attr "type" "sseishft") + (set (attr "length_immediate") + (if_then_else (match_operand 2 "const_int_operand") + (const_string "1") + (const_string "0"))) + (set_attr "mode" "TI")]) + (define_insn "v2hi3" [(set (match_operand:V2HI 0 "register_operand" "=x,Yw") (any_shift:V2HI @@ -2732,6 +2752,20 @@ (set_attr "prefix" "orig,orig,maybe_evex") (set_attr "mode" "TI")]) +(define_insn "sse4_1_v2qiv2hi2" + [(set (match_operand:V2HI 0 "register_operand" "=Yr,*x,Yw") + (any_extend:V2HI + (vec_select:V2QI + (match_operand:V4QI 1 "register_operand" "Yr,*x,Yw") + (parallel [(const_int 0) (const_int 1)]))))] + "TARGET_SSE4_1" + "%vpmovbw\t{%1, %0|%0, %1}" + [(set_attr "isa" "noavx,noavx,avx") + (set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "orig,orig,maybe_evex") + (set_attr "mode" "TI")]) + ;; Pack/unpack vector modes (define_mode_attr mmxpackmode [(V4HI "V8QI") (V2SI "V4HI")]) @@ -2748,6 +2782,18 @@ DONE; }) +(define_expand "vec_pack_trunc_v2hi" + [(match_operand:V4QI 0 "register_operand") + (match_operand:V2HI 1 "register_operand") + (match_operand:V2HI 2 "register_operand")] + "TARGET_SSE2" +{ + rtx op1 = gen_lowpart (V4QImode, operands[1]); + rtx op2 = gen_lowpart (V4QImode, operands[2]); + ix86_expand_vec_extract_even_odd (operands[0], op1, op2, 0); + DONE; +}) + (define_mode_attr mmxunpackmode [(V8QI "V4HI") (V4HI "V2SI")]) @@ -2775,6 +2821,30 @@ "TARGET_MMX_WITH_SSE" "ix86_expand_sse_unpack (operands[0], operands[1], true, true); DONE;") +(define_expand "vec_unpacks_lo_v4qi" + [(match_operand:V2HI 0 "register_operand") + 
(match_operand:V4QI 1 "register_operand")] + "TARGET_SSE2" + "ix86_expand_sse_unpack (operands[0], operands[1], false, false); DONE;") + +(define_expand "vec_unpacks_hi_v4qi" + [(match_operand:V2HI 0 "register_operand") + (match_operand:V4QI 1 "register_operand")] + "TARGET_SSE2" + "ix86_expand_sse_unpack (operands[0], operands[1], false, true); DONE;") + +(define_expand "vec_unpacku_lo_v4qi" + [(match_operand:V2HI 0 "register_operand") + (match_operand:V4QI 1 "register_operand")] + "TARGET_SSE2" + "ix86_expand_sse_unpack (operands[0], operands[1], true, false); DONE;") + +(define_expand "vec_unpacku_hi_v4qi" + [(match_operand:V2HI 0 "register_operand") + (match_operand:V4QI 1 "register_operand")] + "TARGET_SSE2" + "ix86_expand_sse_unpack (operands[0], operands[1], true, true); DONE;") + (define_insn "*mmx_pinsrd" [(set (match_operand:V2SI 0 "register_operand" "=x,Yv") (vec_merge:V2SI -- cgit v1.1 From b14ac7b29c9a05c94f62fe065c219bbaa83653db Mon Sep 17 00:00:00 2001 From: Jeff Law Date: Thu, 8 Jul 2021 17:09:36 -0400 Subject: Further improvements to H8 variable shift patterns gcc/ * config/h8300/shiftrotate.md (variable shifts): Expose condition code handling for the test before the loop. --- gcc/config/h8300/shiftrotate.md | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/h8300/shiftrotate.md b/gcc/config/h8300/shiftrotate.md index 485303c..d3aa6be 100644 --- a/gcc/config/h8300/shiftrotate.md +++ b/gcc/config/h8300/shiftrotate.md @@ -377,8 +377,10 @@ (clobber (reg:CC CC_REG))] "epilogue_completed && find_regno_note (insn, REG_DEAD, REGNO (operands[1]))" - [(set (pc) - (if_then_else (le (match_dup 1) (const_int 0)) + [(set (reg:CCZN CC_REG) + (compare:CCZN (match_dup 1) (const_int 0))) + (set (pc) + (if_then_else (le (reg:CCZN CC_REG) (const_int 0)) (label_ref (match_dup 5)) (pc))) (match_dup 4) @@ -411,10 +413,12 @@ (clobber (reg:CC CC_REG))] "epilogue_completed && !find_regno_note (insn, REG_DEAD, REGNO (operands[1]))" - [(set (match_dup 3) - (match_dup 1)) + [(parallel + [(set (reg:CCZN CC_REG) + (compare:CCZN (match_dup 1) (const_int 0))) + (set (match_dup 3) (match_dup 1))]) (set (pc) - (if_then_else (le (match_dup 3) (const_int 0)) + (if_then_else (le (reg:CCZN CC_REG) (const_int 0)) (label_ref (match_dup 5)) (pc))) (match_dup 4) -- cgit v1.1 From 062c762ef264dca89d01ebca2ef023ea91f31d50 Mon Sep 17 00:00:00 2001 From: Kewen Lin Date: Thu, 8 Jul 2021 22:00:24 -0500 Subject: rs6000: Support [u]mod3 for vector modulo insns This patch is to make Power10 newly introduced vector modulo instructions exploited in vectorized loops, it just simply renames existing define_insns as standard pattern names. gcc/ChangeLog: * config/rs6000/vsx.md (mods_): Rename to... (mod3): ... this. (modu_): Rename to... (umod3): ... this. * config/rs6000/rs6000-builtin.def (MODS_V2DI, MODS_V4SI, MODU_V2DI, MODU_V4SI): Adjust. gcc/testsuite/ChangeLog: * gcc.target/powerpc/mod-vectorize.c: New test. 
--- gcc/config/rs6000/rs6000-builtin.def | 8 ++++---- gcc/config/rs6000/vsx.md | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-builtin.def b/gcc/config/rs6000/rs6000-builtin.def index 6270444..a0dfefc 100644 --- a/gcc/config/rs6000/rs6000-builtin.def +++ b/gcc/config/rs6000/rs6000-builtin.def @@ -3031,10 +3031,10 @@ BU_P10V_AV_2 (DIVS_V4SI, "vdivsw", CONST, divv4si3) BU_P10V_AV_2 (DIVS_V2DI, "vdivsd", CONST, divv2di3) BU_P10V_AV_2 (DIVU_V4SI, "vdivuw", CONST, udivv4si3) BU_P10V_AV_2 (DIVU_V2DI, "vdivud", CONST, udivv2di3) -BU_P10V_AV_2 (MODS_V2DI, "vmodsd", CONST, mods_v2di) -BU_P10V_AV_2 (MODS_V4SI, "vmodsw", CONST, mods_v4si) -BU_P10V_AV_2 (MODU_V2DI, "vmodud", CONST, modu_v2di) -BU_P10V_AV_2 (MODU_V4SI, "vmoduw", CONST, modu_v4si) +BU_P10V_AV_2 (MODS_V2DI, "vmodsd", CONST, modv2di3) +BU_P10V_AV_2 (MODS_V4SI, "vmodsw", CONST, modv4si3) +BU_P10V_AV_2 (MODU_V2DI, "vmodud", CONST, umodv2di3) +BU_P10V_AV_2 (MODU_V4SI, "vmoduw", CONST, umodv4si3) BU_P10V_AV_2 (MULHS_V2DI, "vmulhsd", CONST, mulhs_v2di) BU_P10V_AV_2 (MULHS_V4SI, "vmulhsw", CONST, mulhs_v4si) BU_P10V_AV_2 (MULHU_V2DI, "vmulhud", CONST, mulhu_v2di) diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md index f2260ba..f622873 100644 --- a/gcc/config/rs6000/vsx.md +++ b/gcc/config/rs6000/vsx.md @@ -6333,7 +6333,7 @@ [(set_attr "type" "vecdiv") (set_attr "size" "")]) -(define_insn "mods_" +(define_insn "mod3" [(set (match_operand:VIlong 0 "vsx_register_operand" "=v") (mod:VIlong (match_operand:VIlong 1 "vsx_register_operand" "v") (match_operand:VIlong 2 "vsx_register_operand" "v")))] @@ -6342,7 +6342,7 @@ [(set_attr "type" "vecdiv") (set_attr "size" "")]) -(define_insn "modu_" +(define_insn "umod3" [(set (match_operand:VIlong 0 "vsx_register_operand" "=v") (umod:VIlong (match_operand:VIlong 1 "vsx_register_operand" "v") (match_operand:VIlong 2 "vsx_register_operand" "v")))] -- cgit v1.1 From 82625a42e652d52fc6bbe6070f8d0589d5e0c8ad Mon Sep 17 00:00:00 2001 From: Xi Ruoyao Date: Fri, 18 Jun 2021 20:11:42 +0800 Subject: mips: check MSA support for vector modes [PR100760,PR100761,PR100762] Check if the vector mode is really supported by MSA in certain cases, instead of testing ISA_HAS_MSA. Simply testing ISA_HAS_MSA can cause ICE when MSA is enabled besides other MIPS SIMD extensions (notably, Loongson MMI). gcc/ PR target/100760 PR target/100761 PR target/100762 * config/mips/mips.c (mips_const_insns): Use MSA_SUPPORTED_MODE_P instead of ISA_HAS_MSA. (mips_expand_vec_unpack): Likewise. (mips_expand_vector_init): Likewise. gcc/testsuite/ PR target/100760 PR target/100761 PR target/100762 * gcc.target/mips/pr100760.c: New test. * gcc.target/mips/pr100761.c: New test. * gcc.target/mips/pr100762.c: New test. --- gcc/config/mips/mips.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c index 1f1475c..00a8eef 100644 --- a/gcc/config/mips/mips.c +++ b/gcc/config/mips/mips.c @@ -2879,7 +2879,7 @@ mips_const_insns (rtx x) return mips_build_integer (codes, INTVAL (x)); case CONST_VECTOR: - if (ISA_HAS_MSA + if (MSA_SUPPORTED_MODE_P (GET_MODE (x)) && mips_const_vector_same_int_p (x, GET_MODE (x), -512, 511)) return 1; /* Fall through. 
*/ @@ -21732,7 +21732,7 @@ mips_expand_vec_unpack (rtx operands[2], bool unsigned_p, bool high_p) rtx (*cmpFunc) (rtx, rtx, rtx); rtx tmp, dest, zero; - if (ISA_HAS_MSA) + if (MSA_SUPPORTED_MODE_P (imode)) { switch (imode) { @@ -21994,7 +21994,7 @@ mips_expand_vector_init (rtx target, rtx vals) all_same = false; } - if (ISA_HAS_MSA) + if (MSA_SUPPORTED_MODE_P (vmode)) { if (all_same) { -- cgit v1.1 From 10722fb36de944c2385a275bb2b270727f7c93a4 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Fri, 9 Jul 2021 11:45:54 +0200 Subject: i386: Fix *udivmodsi4_pow2_zext_? patterns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In addition to the obvious cut-n-pasto where *udivmodsi4_pow2_zext_2 never matches, limit the range of the immediate operand to prevent out of range immediate operand of AND instruction. Found by inspection, the patterns rarely match (if at all), since tree optimizers do the transformation before RTL is generated. But according to the comment above *udivmod4_pow2, the constant can materialize after expansion, so leave these patterns around for now. 2021-07-09 Uroš Bizjak gcc/ * config/i386/i386.md (*udivmodsi4_pow2_zext_1): Limit the log2 range of operands[3] to [1,31]. (*udivmodsi4_pow2_zext_2): Ditto. Correct insn RTX pattern. --- gcc/config/i386/i386.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 156c6a9..26fb81b 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -8518,7 +8518,7 @@ (umod:SI (match_dup 2) (match_dup 3))) (clobber (reg:CC FLAGS_REG))] "TARGET_64BIT - && exact_log2 (UINTVAL (operands[3])) > 0" + && IN_RANGE (exact_log2 (UINTVAL (operands[3])), 1, 31)" "#" "&& reload_completed" [(set (match_dup 1) (match_dup 2)) @@ -8599,10 +8599,10 @@ (umod:SI (match_operand:SI 2 "register_operand" "0") (match_operand:SI 3 "const_int_operand" "n")))) (set (match_operand:SI 0 "register_operand" "=r") - (umod:SI (match_dup 2) (match_dup 3))) + (udiv:SI (match_dup 2) (match_dup 3))) (clobber (reg:CC FLAGS_REG))] "TARGET_64BIT - && exact_log2 (UINTVAL (operands[3])) > 0" + && IN_RANGE (exact_log2 (UINTVAL (operands[3])), 1, 31)" "#" "&& reload_completed" [(set (match_dup 1) (match_dup 2)) -- cgit v1.1 From 41bd1b190358fce213f5add8396faf14a32d5c23 Mon Sep 17 00:00:00 2001 From: Iain Sandoe Date: Mon, 3 May 2021 08:22:53 +0100 Subject: Darwin, X86: Adjust call clobbers to allow for lazy-binding [PR 100152]. We allow public functions defined in a TU to bind locally for PIC code (the default) on 64bit Mach-O. If such functions are not inlined, we cannot tell at compile-time if they might be called via the lazy symbol resolver (this can depend on options given at link-time). Therefore, we must assume that the lazy resolver could be used which clobbers R11 and R10. Signed-off-by: Iain Sandoe gcc/ChangeLog: PR target/100152 * config/i386/i386-expand.c (ix86_expand_call): If a call is to a non-local-binding, or local but to a public symbol, then assume that it might be indirected via the lazy symbol binder. Mark R10 and R10 as clobbered in that case. 
--- gcc/config/i386/i386-expand.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 65764ad..69ea79e 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -8410,6 +8410,7 @@ ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1, pop = NULL; gcc_assert (!TARGET_64BIT || !pop); + rtx addr = XEXP (fnaddr, 0); if (TARGET_MACHO && !TARGET_64BIT) { #if TARGET_MACHO @@ -8422,7 +8423,6 @@ ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1, /* Static functions and indirect calls don't need the pic register. Also, check if PLT was explicitly avoided via no-plt or "noplt" attribute, making it an indirect call. */ - rtx addr = XEXP (fnaddr, 0); if (flag_pic && GET_CODE (addr) == SYMBOL_REF && !SYMBOL_REF_LOCAL_P (addr)) @@ -8585,6 +8585,20 @@ ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1, } } + if (TARGET_MACHO && TARGET_64BIT && !sibcall + && ((GET_CODE (addr) == SYMBOL_REF && !SYMBOL_REF_LOCAL_P (addr)) + || !fndecl || TREE_PUBLIC (fndecl))) + { + /* We allow public functions defined in a TU to bind locally for PIC + code (the default) on 64bit Mach-O. + If such functions are not inlined, we cannot tell at compile-time if + they will be called via the lazy symbol resolver (this can depend on + options given at link-time). Therefore, we must assume that the lazy + resolver could be used which clobbers R11 and R10. */ + clobber_reg (&use, gen_rtx_REG (DImode, R11_REG)); + clobber_reg (&use, gen_rtx_REG (DImode, R10_REG)); + } + if (vec_len > 1) call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec)); rtx_insn *call_insn = emit_call_insn (call); -- cgit v1.1 From 59045273cc648e354ba72f9188f69927f00802e2 Mon Sep 17 00:00:00 2001 From: Roger Sayle Date: Fri, 9 Jul 2021 17:45:40 +0100 Subject: Improvement to signed division of integer constant on x86_64. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch tweaks the way GCC handles 32-bit integer division on x86_64, when the numerator is constant. Currently the function int foo (int x) { return 100/x; } generates the code: foo: movl $100, %eax cltd idivl %edi ret where the sign-extension instruction "cltd" creates a long dependency chain, as it depends on the "mov" before it, and is depended upon by "idivl" after it. With this patch, GCC now matches both icc and LLVM and uses an xor instead, generating: foo: xorl %edx, %edx movl $100, %eax idivl %edi ret Microbenchmarking confirms that this is faster on Intel processors (Kaby lake), and no worse on AMD processors (Zen2), which agrees with intuition, but oddly disagrees with the llvm-mca cycle count prediction on godbolt.org. The tricky bit is that this sign-extension instruction is only produced by late (postreload) splitting, and unfortunately none of the subsequent passes (e.g. cprop_hardreg) is able to propagate and simplify its constant argument. The solution here is to introduce a define_insn_and_split that allows the constant numerator operand to be captured (by combine) and then split into an optimal form after reload. The above microbenchmarking also shows that eliminating the sign extension of negative values (using movl $-1,%edx) is also a performance improvement, as performed by icc but not by LLVM. Both the xor and movl sign-extensions are larger than cltd, so this transformation is prevented for -Os. 
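The negative-numerator case mentioned above can be illustrated with a hedged example (the expected output is an assumption derived from the new splitter, not output quoted from the patch):

int bar (int x) { return -100 / x; }

bar:
	movl	$-1, %edx
	movl	$-100, %eax
	idivl	%edi
	ret

i.e. the known-negative constant lets the sign extension become a move of -1 into %edx rather than cltd.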
2021-07-09 Roger Sayle Uroš Bizjak gcc/ChangeLog * config/i386/i386.md (*divmodsi4_const): Optimize SImode divmod of a constant numerator with new define_insn_and_split. gcc/testsuite/ChangeLog * gcc.target/i386/divmod-9.c: New test case. --- gcc/config/i386/i386.md | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 26fb81b..8b809c4 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -8385,7 +8385,7 @@ (ashiftrt:SWIM248 (match_dup 4) (match_dup 5))) (clobber (reg:CC FLAGS_REG))]) (parallel [(set (match_dup 0) - (div:SWIM248 (match_dup 2) (match_dup 3))) + (div:SWIM248 (match_dup 2) (match_dup 3))) (set (match_dup 1) (mod:SWIM248 (match_dup 2) (match_dup 3))) (use (match_dup 1)) @@ -8661,6 +8661,31 @@ [(set_attr "type" "idiv") (set_attr "mode" "SI")]) +;; Avoid sign-extension (using cdq) for constant numerators. +(define_insn_and_split "*divmodsi4_const" + [(set (match_operand:SI 0 "register_operand" "=&a") + (div:SI (match_operand:SI 2 "const_int_operand" "n") + (match_operand:SI 3 "nonimmediate_operand" "rm"))) + (set (match_operand:SI 1 "register_operand" "=&d") + (mod:SI (match_dup 2) (match_dup 3))) + (clobber (reg:CC FLAGS_REG))] + "!optimize_function_for_size_p (cfun)" + "#" + "reload_completed" + [(set (match_dup 0) (match_dup 2)) + (set (match_dup 1) (match_dup 4)) + (parallel [(set (match_dup 0) + (div:SI (match_dup 0) (match_dup 3))) + (set (match_dup 1) + (mod:SI (match_dup 0) (match_dup 3))) + (use (match_dup 1)) + (clobber (reg:CC FLAGS_REG))])] +{ + operands[4] = INTVAL (operands[2]) < 0 ? constm1_rtx : const0_rtx; +} + [(set_attr "type" "multi") + (set_attr "mode" "SI")]) + (define_expand "divmodqi4" [(parallel [(set (match_operand:QI 0 "register_operand") (div:QI -- cgit v1.1 From 1e72c24d2f3b1427f5e117e371928e7af50d2036 Mon Sep 17 00:00:00 2001 From: "prathamesh.kulkarni" Date: Mon, 12 Jul 2021 13:23:06 +0530 Subject: arm/98435: Missed optimization in expanding vector constructor. The patch moves vec_init pattern from neon.md to vec-common.md, and adjusts the mode to VDQX to accomodate binary floats. Also, the pattern is additionally gated on VALID_MVE_MODE. gcc/ChangeLog: PR target/98435 * config/arm/neon.md (vec_init): Move to ... * config/arm/vec-common.md (vec_init): ... here. Change the pattern's mode to VDQX and gate it on VALID_MVE_MODE. gcc/testsuite/ChangeLog: PR target/98435 * gcc.target/arm/simd/pr98435.c: New test. --- gcc/config/arm/neon.md | 9 --------- gcc/config/arm/vec-common.md | 9 +++++++++ 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 81cc8d3..64365e0 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -458,15 +458,6 @@ [(set_attr "type" "neon_store1_one_lane_q,neon_to_gp_q")] ) -(define_expand "vec_init" - [(match_operand:VDQ 0 "s_register_operand") - (match_operand 1 "" "")] - "TARGET_NEON || TARGET_HAVE_MVE" -{ - neon_expand_vector_init (operands[0], operands[1]); - DONE; -}) - ;; Doubleword and quadword arithmetic. 
;; NOTE: some other instructions also support 64-bit integer diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md index f90afa4..68de4f0 100644 --- a/gcc/config/arm/vec-common.md +++ b/gcc/config/arm/vec-common.md @@ -702,3 +702,12 @@ DONE; } ) + +(define_expand "vec_init" + [(match_operand:VDQX 0 "s_register_operand") + (match_operand 1 "" "")] + "TARGET_NEON || (TARGET_HAVE_MVE && VALID_MVE_MODE (mode))" +{ + neon_expand_vector_init (operands[0], operands[1]); + DONE; +}) -- cgit v1.1 From 6785eb595981abd93ad85edcfdf1d2e43c0841f5 Mon Sep 17 00:00:00 2001 From: "prathamesh.kulkarni" Date: Mon, 12 Jul 2021 15:18:21 +0530 Subject: arm/66791: Replace builtins for unsigned and fp vmul_n intrinsics. gcc/ChangeLog: PR target/66791 * config/arm/arm_neon.h (vmul_n_u32): Replace call to builtin with __a * __b. (vmulq_n_u32): Likewise. (vmul_n_f32): Gate __a * __b on __FAST_MATH__. (vmulq_n_f32): Likewise. (vmul_n_f16): Likewise. (vmulq_n_f16): Likewise. gcc/testsuite/ChangeLog: PR target/66791 * gcc.target/arm/armv8_2-fp16-neon-2.c: Adjust. --- gcc/config/arm/arm_neon.h | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index f42a15f..41b596b 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -8384,21 +8384,25 @@ __extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmul_n_f32 (float32x2_t __a, float32_t __b) { +#ifdef __FAST_MATH__ + return __a * __b; +#else return (float32x2_t)__builtin_neon_vmul_nv2sf (__a, (__builtin_neon_sf) __b); +#endif } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmul_n_u16 (uint16x4_t __a, uint16_t __b) { - return (uint16x4_t)__builtin_neon_vmul_nv4hi ((int16x4_t) __a, (__builtin_neon_hi) __b); + return __a * __b; } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmul_n_u32 (uint32x2_t __a, uint32_t __b) { - return (uint32x2_t)__builtin_neon_vmul_nv2si ((int32x2_t) __a, (__builtin_neon_si) __b); + return __a * __b; } __extension__ extern __inline int16x8_t @@ -8419,21 +8423,25 @@ __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmulq_n_f32 (float32x4_t __a, float32_t __b) { +#ifdef __FAST_MATH__ + return __a * __b; +#else return (float32x4_t)__builtin_neon_vmul_nv4sf (__a, (__builtin_neon_sf) __b); +#endif } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmulq_n_u16 (uint16x8_t __a, uint16_t __b) { - return (uint16x8_t)__builtin_neon_vmul_nv8hi ((int16x8_t) __a, (__builtin_neon_hi) __b); + return __a * __b; } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmulq_n_u32 (uint32x4_t __a, uint32_t __b) { - return (uint32x4_t)__builtin_neon_vmul_nv4si ((int32x4_t) __a, (__builtin_neon_si) __b); + return __a * __b; } __extension__ extern __inline int32x4_t @@ -17740,7 +17748,11 @@ __extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmul_n_f16 (float16x4_t __a, float16_t __b) { +#ifdef __FAST_MATH__ + return __a * __b; +#else return __builtin_neon_vmul_nv4hf (__a, __b); +#endif } __extension__ extern __inline float16x8_t @@ -17765,7 +17777,11 @@ __extension__ extern __inline float16x8_t __attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) vmulq_n_f16 (float16x8_t __a, float16_t __b) { +#ifdef __FAST_MATH__ + return __a * __b; +#else return __builtin_neon_vmul_nv8hf (__a, __b); +#endif } __extension__ extern __inline float16x4_t -- cgit v1.1 From 8d980e84240c82502661758fbecd5f456018ea89 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 12 Jul 2021 21:06:32 +0200 Subject: i386: Fix vec_set expanders [PR101424] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit AVX does not support 32-byte integer compares, required by ix86_expand_vector_set_var. The following patch fixes vec_set expanders by introducing new vec_setm_avx2_operand predicate for AVX vector modes. gcc/ 2021-07-12 Uroš Bizjak PR target/101424 * config/i386/predicates.md (vec_setm_sse41_operand): Rename from vec_setm_operand. (vec_setm_avx2_operand): New predicate. * config/i386/sse.md (vec_set): Use V_128 mode iterator. Use vec_setm_sse41_operand as operand 2 predicate. (vec_set PR target/101424 * gcc.target/i386/pr101424.c: New test. --- gcc/config/i386/mmx.md | 2 +- gcc/config/i386/predicates.md | 7 ++++++- gcc/config/i386/sse.md | 18 ++++++++++++++++-- 3 files changed, 23 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 986b758..0984f7c 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -3604,7 +3604,7 @@ (define_expand "vec_setv2hi" [(match_operand:V2HI 0 "register_operand") (match_operand:HI 1 "register_operand") - (match_operand 2 "vec_setm_operand")] + (match_operand 2 "vec_setm_sse41_operand")] "TARGET_SSE2" { if (CONST_INT_P (operands[2])) diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index 9488632..6aa1ea3 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1021,11 +1021,16 @@ }) ;; True for registers, or const_int_operand, used to vec_setm expander. 
-(define_predicate "vec_setm_operand" +(define_predicate "vec_setm_sse41_operand" (ior (and (match_operand 0 "register_operand") (match_test "TARGET_SSE4_1")) (match_code "const_int"))) +(define_predicate "vec_setm_avx2_operand" + (ior (and (match_operand 0 "register_operand") + (match_test "TARGET_AVX2")) + (match_code "const_int"))) + (define_predicate "vec_setm_mmx_operand" (ior (and (match_operand 0 "register_operand") (match_test "TARGET_SSE4_1") diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 17c9e57..ab29999 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -8486,9 +8486,9 @@ (set_attr "mode" "DF")]) (define_expand "vec_set" - [(match_operand:V 0 "register_operand") + [(match_operand:V_128 0 "register_operand") (match_operand: 1 "register_operand") - (match_operand 2 "vec_setm_operand")] + (match_operand 2 "vec_setm_sse41_operand")] "TARGET_SSE" { if (CONST_INT_P (operands[2])) @@ -8499,6 +8499,20 @@ DONE; }) +(define_expand "vec_set" + [(match_operand:V_256_512 0 "register_operand") + (match_operand: 1 "register_operand") + (match_operand 2 "vec_setm_avx2_operand")] + "TARGET_AVX" +{ + if (CONST_INT_P (operands[2])) + ix86_expand_vector_set (false, operands[0], operands[1], + INTVAL (operands[2])); + else + ix86_expand_vector_set_var (operands[0], operands[1], operands[2]); + DONE; +}) + (define_insn_and_split "*vec_extractv4sf_0" [(set (match_operand:SF 0 "nonimmediate_operand" "=v,m,f,r") (vec_select:SF -- cgit v1.1 From 7591309696537212a1d0497bc09c09b7abc7e650 Mon Sep 17 00:00:00 2001 From: Michael Meissner Date: Mon, 12 Jul 2021 23:50:38 -0400 Subject: Change rs6000_const_f32_to_i32 return type. The function rs6000_const_f32_to_i32 called REAL_VALUE_TO_TARGET_SINGLE with a long long type and returns it. This patch changes the type to long which is the proper type for REAL_VALUE_TO_TARGET_SINGLE. 2021-07-12 Michael Meissner gcc/ * config/rs6000/altivec.md (xxspltiw_v4sf): Change local variable value to to long. * config/rs6000/rs6000-protos.h (rs6000_const_f32_to_i32): Change return type to long. * config/rs6000/rs6000.c (rs6000_const_f32_to_i32): Change return type to long. 
--- gcc/config/rs6000/altivec.md | 2 +- gcc/config/rs6000/rs6000-protos.h | 2 +- gcc/config/rs6000/rs6000.c | 6 ++++-- 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index dad3a07..a20d6ac 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -863,7 +863,7 @@ UNSPEC_XXSPLTIW))] "TARGET_POWER10" { - long long value = rs6000_const_f32_to_i32 (operands[1]); + long value = rs6000_const_f32_to_i32 (operands[1]); emit_insn (gen_xxspltiw_v4sf_inst (operands[0], GEN_INT (value))); DONE; }) diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h index 9de294d..94bf961 100644 --- a/gcc/config/rs6000/rs6000-protos.h +++ b/gcc/config/rs6000/rs6000-protos.h @@ -281,7 +281,7 @@ extern void rs6000_asm_output_dwarf_pcrel (FILE *file, int size, const char *label); extern void rs6000_asm_output_dwarf_datarel (FILE *file, int size, const char *label); -extern long long rs6000_const_f32_to_i32 (rtx operand); +extern long rs6000_const_f32_to_i32 (rtx operand); /* Declare functions in rs6000-c.c */ diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 9a5db63..de11de5 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -27936,10 +27936,12 @@ rs6000_invalid_conversion (const_tree fromtype, const_tree totype) return NULL; } -long long +/* Convert a SFmode constant to the integer bit pattern. */ + +long rs6000_const_f32_to_i32 (rtx operand) { - long long value; + long value; const struct real_value *rv = CONST_DOUBLE_REAL_VALUE (operand); gcc_assert (GET_MODE (operand) == SFmode); -- cgit v1.1 From 18a463bb666cc8f3421589e7641ec617acb84741 Mon Sep 17 00:00:00 2001 From: Kito Cheng Date: Fri, 2 Jul 2021 10:19:30 +0800 Subject: docs: Add 'S' to Machine Constraints for RISC-V It was undocument before, but it might used in linux kernel for resolve code model issue, so LLVM community suggest we should document that, so that make it become supported/documented/non-internal machine constraints. gcc/ChangeLog: PR target/101275 * config/riscv/constraints.md ("S"): Update description and remove @internal. * doc/md.texi (Machine Constraints): Document the 'S' constraints for RISC-V. --- gcc/config/riscv/constraints.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/riscv/constraints.md b/gcc/config/riscv/constraints.md index 8c15c6c..c87d5b7 100644 --- a/gcc/config/riscv/constraints.md +++ b/gcc/config/riscv/constraints.md @@ -67,8 +67,7 @@ (match_test "GET_CODE(XEXP(op,0)) == REG"))) (define_constraint "S" - "@internal - A constant call address." + "A constraint that matches an absolute symbolic address." (match_operand 0 "absolute_symbolic_operand")) (define_constraint "U" -- cgit v1.1 From acd4b9103c1a30c833de4eee31fb69c3ff13cd77 Mon Sep 17 00:00:00 2001 From: "Paul A. Clarke" Date: Tue, 29 Jun 2021 09:18:55 -0500 Subject: rs6000: Add support for SSE4.1 "test" intrinsics 2021-07-13 Paul A. Clarke gcc * config/rs6000/smmintrin.h (_mm_testz_si128, _mm_testc_si128, _mm_testnzc_si128, _mm_test_all_ones, _mm_test_all_zeros, _mm_test_mix_ones_zeros): New. 
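A hedged usage sketch (an illustrative example, not part of the patch): the new helpers mirror the x86 SSE4.1 ptest-style intrinsics, though as the comments in the implementation note, they return the predicate result directly rather than setting condition flags.

  /* Defining NO_WARN_X86_INTRINSICS is assumed to be needed here, as for
     the other rs6000 x86-compatibility headers.  */
  #define NO_WARN_X86_INTRINSICS 1
  #include <smmintrin.h>

  int
  all_masked_bits_clear (__m128i a, __m128i mask)
  {
    return _mm_testz_si128 (a, mask);  /* nonzero iff (a & mask) == 0 */
  }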
--- gcc/config/rs6000/smmintrin.h | 56 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/smmintrin.h b/gcc/config/rs6000/smmintrin.h index bdf6eb3..16fd34d 100644 --- a/gcc/config/rs6000/smmintrin.h +++ b/gcc/config/rs6000/smmintrin.h @@ -116,4 +116,60 @@ _mm_blendv_epi8 (__m128i __A, __m128i __B, __m128i __mask) return (__m128i) vec_sel ((__v16qu) __A, (__v16qu) __B, __lmask); } +__inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testz_si128 (__m128i __A, __m128i __B) +{ + /* Note: This implementation does NOT set "zero" or "carry" flags. */ + const __v16qu __zero = {0}; + return vec_all_eq (vec_and ((__v16qu) __A, (__v16qu) __B), __zero); +} + +__inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testc_si128 (__m128i __A, __m128i __B) +{ + /* Note: This implementation does NOT set "zero" or "carry" flags. */ + const __v16qu __zero = {0}; + const __v16qu __notA = vec_nor ((__v16qu) __A, (__v16qu) __A); + return vec_all_eq (vec_and ((__v16qu) __notA, (__v16qu) __B), __zero); +} + +__inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testnzc_si128 (__m128i __A, __m128i __B) +{ + /* Note: This implementation does NOT set "zero" or "carry" flags. */ + return _mm_testz_si128 (__A, __B) == 0 && _mm_testc_si128 (__A, __B) == 0; +} + +__inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_test_all_zeros (__m128i __A, __m128i __mask) +{ + const __v16qu __zero = {0}; + return vec_all_eq (vec_and ((__v16qu) __A, (__v16qu) __mask), __zero); +} + +__inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_test_all_ones (__m128i __A) +{ + const __v16qu __ones = vec_splats ((unsigned char) 0xff); + return vec_all_eq ((__v16qu) __A, __ones); +} + +__inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_test_mix_ones_zeros (__m128i __A, __m128i __mask) +{ + const __v16qu __zero = {0}; + const __v16qu __Amasked = vec_and ((__v16qu) __A, (__v16qu) __mask); + const int any_ones = vec_any_ne (__Amasked, __zero); + const __v16qu __notA = vec_nor ((__v16qu) __A, (__v16qu) __A); + const __v16qu __notAmasked = vec_and ((__v16qu) __notA, (__v16qu) __mask); + const int any_zeros = vec_any_ne (__notAmasked, __zero); + return any_ones * any_zeros; +} + #endif -- cgit v1.1 From 8695bf78dad1a42636775843ca832a2f4dba4da3 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Wed, 2 Jun 2021 16:55:00 +0100 Subject: gcc: Add vec_select -> subreg RTL simplification Add a new RTL simplification for the case of a VEC_SELECT selecting the low part of a vector. The simplification returns a SUBREG. The primary goal of this patch is to enable better combinations of Neon RTL patterns - specifically allowing generation of 'write-to- high-half' narrowing intructions. Adding this RTL simplification means that the expected results for a number of tests need to be updated: * aarch64 Neon: Update the scan-assembler regex for intrinsics tests to expect a scalar register instead of lane 0 of a vector. * aarch64 SVE: Likewise. * arm MVE: Use lane 1 instead of lane 0 for lane-extraction intrinsics tests (as the move instructions get optimized away for lane 0.) This patch also adds new code generation tests to narrow_high_combine.c to verify the benefit of this RTL simplification. 
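As a deliberately minimal illustration of the access pattern this simplification targets, consider reading the low lane of a vector value via the GNU vector_size extension. After the vec_select -> subreg fold, the extraction is just the low part of the vector register, so it can be emitted as a plain scalar register move rather than a lane-extract instruction (the exact code depends on target and endianness):

typedef int v4si __attribute__ ((vector_size (16)));

int
low_lane (v4si v)
{
  /* Low element of the vector: with the new fold this is a subreg of the
     vector register, i.e. a scalar move, not a lane extract.  */
  return v[0];
}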
gcc/ChangeLog: 2021-06-08 Jonathan Wright * combine.c (combine_simplify_rtx): Add vec_select -> subreg simplification. * config/aarch64/aarch64.md (*zero_extend2_aarch64): Add Neon to general purpose register case for zero-extend pattern. * config/arm/vfp.md (*arm_movsi_vfp): Remove "*" from *t -> r case to prevent some cases opting to go through memory. * cse.c (fold_rtx): Add vec_select -> subreg simplification. * rtl.c (rtvec_series_p): Define predicate to determine whether a vector contains a linear series of integers. * rtl.h (rtvec_series_p): Define. * rtlanal.c (vec_series_lowpart_p): Define predicate to determine if a vector selection is equivalent to the low part of the vector. * rtlanal.h (vec_series_lowpart_p): Define. * simplify-rtx.c (simplify_context::simplify_binary_operation_1): Add vec_select -> subreg simplification. gcc/testsuite/ChangeLog: * gcc.target/aarch64/extract_zero_extend.c: Remove dump scan for RTL pattern match. * gcc.target/aarch64/narrow_high_combine.c: Add new tests. * gcc.target/aarch64/simd/vmulx_laneq_f64_1.c: Update scan-assembler regex to look for a scalar register instead of lane 0 of a vector. * gcc.target/aarch64/simd/vmulxd_laneq_f64_1.c: Likewise. * gcc.target/aarch64/simd/vmulxs_lane_f32_1.c: Likewise. * gcc.target/aarch64/simd/vmulxs_laneq_f32_1.c: Likewise. * gcc.target/aarch64/simd/vqdmlalh_lane_s16.c: Likewise. * gcc.target/aarch64/simd/vqdmlals_lane_s32.c: Likewise. * gcc.target/aarch64/simd/vqdmlslh_lane_s16.c: Likewise. * gcc.target/aarch64/simd/vqdmlsls_lane_s32.c: Likewise. * gcc.target/aarch64/simd/vqdmullh_lane_s16.c: Likewise. * gcc.target/aarch64/simd/vqdmullh_laneq_s16.c: Likewise. * gcc.target/aarch64/simd/vqdmulls_lane_s32.c: Likewise. * gcc.target/aarch64/simd/vqdmulls_laneq_s32.c: Likewise. * gcc.target/aarch64/sve/dup_lane_1.c: Likewise. * gcc.target/aarch64/sve/extract_1.c: Likewise. * gcc.target/aarch64/sve/extract_2.c: Likewise. * gcc.target/aarch64/sve/extract_3.c: Likewise. * gcc.target/aarch64/sve/extract_4.c: Likewise. * gcc.target/aarch64/sve/live_1.c: Update scan-assembler regex cases to look for 'b' and 'h' registers instead of 'w'. * gcc.target/arm/crypto-vsha1cq_u32.c: Update scan-assembler regex to reflect lane 0 vector extractions being simplified to scalar register moves. * gcc.target/arm/crypto-vsha1h_u32.c: Likewise. * gcc.target/arm/crypto-vsha1mq_u32.c: Likewise. * gcc.target/arm/crypto-vsha1pq_u32.c: Likewise. * gcc.target/arm/mve/intrinsics/vgetq_lane_f16.c: Extract lane 1 as the moves for lane 0 now get optimized away. * gcc.target/arm/mve/intrinsics/vgetq_lane_f32.c: Likewise. * gcc.target/arm/mve/intrinsics/vgetq_lane_s16.c: Likewise. * gcc.target/arm/mve/intrinsics/vgetq_lane_s32.c: Likewise. * gcc.target/arm/mve/intrinsics/vgetq_lane_s8.c: Likewise. * gcc.target/arm/mve/intrinsics/vgetq_lane_u16.c: Likewise. * gcc.target/arm/mve/intrinsics/vgetq_lane_u32.c: Likewise. * gcc.target/arm/mve/intrinsics/vgetq_lane_u8.c: Likewise. 
--- gcc/config/aarch64/aarch64.md | 11 ++++++----- gcc/config/arm/vfp.md | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index aef6da9..f12a0be 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -1884,15 +1884,16 @@ ) (define_insn "*zero_extend2_aarch64" - [(set (match_operand:GPI 0 "register_operand" "=r,r,w") - (zero_extend:GPI (match_operand:SHORT 1 "nonimmediate_operand" "r,m,m")))] + [(set (match_operand:GPI 0 "register_operand" "=r,r,w,r") + (zero_extend:GPI (match_operand:SHORT 1 "nonimmediate_operand" "r,m,m,w")))] "" "@ and\t%0, %1, ldr\t%w0, %1 - ldr\t%0, %1" - [(set_attr "type" "logic_imm,load_4,f_loads") - (set_attr "arch" "*,*,fp")] + ldr\t%0, %1 + umov\t%w0, %1.[0]" + [(set_attr "type" "logic_imm,load_4,f_loads,neon_to_gp") + (set_attr "arch" "*,*,fp,fp")] ) (define_expand "qihi2" diff --git a/gcc/config/arm/vfp.md b/gcc/config/arm/vfp.md index 55b6c1a..93e96369 100644 --- a/gcc/config/arm/vfp.md +++ b/gcc/config/arm/vfp.md @@ -224,7 +224,7 @@ ;; problems because small constants get converted into adds. (define_insn "*arm_movsi_vfp" [(set (match_operand:SI 0 "nonimmediate_operand" "=rk,r,r,r,rk,m ,*t,r,*t,*t, *Uv") - (match_operand:SI 1 "general_operand" "rk, I,K,j,mi,rk,r,*t,*t,*Uvi,*t"))] + (match_operand:SI 1 "general_operand" "rk, I,K,j,mi,rk,r,t,*t,*Uvi,*t"))] "TARGET_ARM && TARGET_HARD_FLOAT && ( s_register_operand (operands[0], SImode) || s_register_operand (operands[1], SImode))" -- cgit v1.1 From cc11b924bfe7752edbba052ca71653f46a60887a Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 9 Jul 2021 09:16:01 -0700 Subject: x86: Don't enable UINTR in 32-bit mode UINTR is available only in 64-bit mode. Since the codegen target is unknown when the the gcc driver is processing -march=native, to properly handle UINTR for -march=native: 1. Pass "arch [32|64]" and "tune [32|64]" to host_detect_local_cpu to indicate 32-bit and 64-bit codegen. 2. Change ix86_option_override_internal to enable UINTR only in 64-bit mode for -march=CPU when PTA_CPU includes PTA_UINTR. gcc/ PR target/101395 * config/i386/driver-i386.c (host_detect_local_cpu): Check "arch [32|64]" and "tune [32|64]" for 32-bit and 64-bit codegen. Enable UINTR only for 64-bit codegen. * config/i386/i386-options.c (ix86_option_override_internal::DEF_PTA): Skip PTA_UINTR if not in 64-bit mode. * config/i386/i386.h (ARCH_ARG): New. (CC1_CPU_SPEC): Pass "[arch|tune] 32" for 32-bit codegen and "[arch|tune] 64" for 64-bit codegen. gcc/testsuite/ PR target/101395 * gcc.target/i386/pr101395-1.c: New test. * gcc.target/i386/pr101395-2.c: Likewise. * gcc.target/i386/pr101395-3.c: Likewise. --- gcc/config/i386/driver-i386.c | 25 +++++++++++++++++++------ gcc/config/i386/i386-options.c | 1 + gcc/config/i386/i386.h | 7 ++++--- 3 files changed, 24 insertions(+), 9 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/driver-i386.c b/gcc/config/i386/driver-i386.c index dd92366..f844a16 100644 --- a/gcc/config/i386/driver-i386.c +++ b/gcc/config/i386/driver-i386.c @@ -370,9 +370,9 @@ detect_caches_intel (bool xeon_mp, unsigned max_level, } /* This will be called by the spec parser in gcc.c when it sees - a %:local_cpu_detect(args) construct. Currently it will be called - with either "arch" or "tune" as argument depending on if -march=native - or -mtune=native is to be substituted. + a %:local_cpu_detect(args) construct. 
Currently it will be + called with either "arch [32|64]" or "tune [32|64]" as argument + depending on if -march=native or -mtune=native is to be substituted. It returns a string containing new command line parameters to be put at the place of the above two options, depending on what CPU @@ -401,7 +401,7 @@ const char *host_detect_local_cpu (int argc, const char **argv) unsigned int l2sizekb = 0; - if (argc < 1) + if (argc < 2) return NULL; arch = !strcmp (argv[0], "arch"); @@ -409,6 +409,15 @@ const char *host_detect_local_cpu (int argc, const char **argv) if (!arch && strcmp (argv[0], "tune")) return NULL; + bool codegen_x86_64; + + if (!strcmp (argv[1], "32")) + codegen_x86_64 = false; + else if (!strcmp (argv[1], "64")) + codegen_x86_64 = true; + else + return NULL; + struct __processor_model cpu_model = { }; struct __processor_model2 cpu_model2 = { }; unsigned int cpu_features2[SIZE_OF_CPU_FEATURES] = { }; @@ -804,8 +813,12 @@ const char *host_detect_local_cpu (int argc, const char **argv) if (isa_names_table[i].option) { if (has_feature (isa_names_table[i].feature)) - options = concat (options, " ", - isa_names_table[i].option, NULL); + { + if (codegen_x86_64 + || isa_names_table[i].feature != FEATURE_UINTR) + options = concat (options, " ", + isa_names_table[i].option, NULL); + } else options = concat (options, neg_option, isa_names_table[i].option + 2, NULL); diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c index 7a35c46..7cba655 100644 --- a/gcc/config/i386/i386-options.c +++ b/gcc/config/i386/i386-options.c @@ -2109,6 +2109,7 @@ ix86_option_override_internal (bool main_args_p, #define DEF_PTA(NAME) \ if (((processor_alias_table[i].flags & PTA_ ## NAME) != 0) \ && PTA_ ## NAME != PTA_64BIT \ + && (TARGET_64BIT || PTA_ ## NAME != PTA_UINTR) \ && !TARGET_EXPLICIT_ ## NAME ## _P (opts)) \ SET_TARGET_ ## NAME (opts); #include "i386-isa.def" diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 8c3eace..324e8a9 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -576,10 +576,11 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); #ifndef HAVE_LOCAL_CPU_DETECT #define CC1_CPU_SPEC CC1_CPU_SPEC_1 #else +#define ARCH_ARG "%{" OPT_ARCH64 ":64;:32}" #define CC1_CPU_SPEC CC1_CPU_SPEC_1 \ -"%{march=native:%>march=native %:local_cpu_detect(arch) \ - %{!mtune=*:%>mtune=native %:local_cpu_detect(tune)}} \ -%{mtune=native:%>mtune=native %:local_cpu_detect(tune)}" +"%{march=native:%>march=native %:local_cpu_detect(arch " ARCH_ARG ") \ + %{!mtune=*:%>mtune=native %:local_cpu_detect(tune " ARCH_ARG ")}} \ +%{mtune=native:%>mtune=native %:local_cpu_detect(tune " ARCH_ARG ")}" #endif #endif -- cgit v1.1 From 752045ed1eea0eddc48923df78999dab7f2827ba Mon Sep 17 00:00:00 2001 From: Tamar Christina Date: Wed, 14 Jul 2021 15:19:32 +0100 Subject: AArch64: Add support for sign differing dot-product usdot for NEON and SVE. Hi All, This adds optabs implementing usdot_prod. 
The following testcase: #define N 480 #define SIGNEDNESS_1 unsigned #define SIGNEDNESS_2 signed #define SIGNEDNESS_3 signed #define SIGNEDNESS_4 unsigned SIGNEDNESS_1 int __attribute__ ((noipa)) f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a, SIGNEDNESS_4 char *restrict b) { for (__INTPTR_TYPE__ i = 0; i < N; ++i) { int av = a[i]; int bv = b[i]; SIGNEDNESS_2 short mult = av * bv; res += mult; } return res; } Generates for NEON f: movi v0.4s, 0 mov x3, 0 .p2align 3,,7 .L2: ldr q1, [x2, x3] ldr q2, [x1, x3] usdot v0.4s, v1.16b, v2.16b add x3, x3, 16 cmp x3, 480 bne .L2 addv s0, v0.4s fmov w1, s0 add w0, w0, w1 ret and for SVE f: mov x3, 0 cntb x5 mov w4, 480 mov z1.b, #0 whilelo p0.b, wzr, w4 mov z3.b, #0 ptrue p1.b, all .p2align 3,,7 .L2: ld1b z2.b, p0/z, [x1, x3] ld1b z0.b, p0/z, [x2, x3] add x3, x3, x5 sel z0.b, p0, z0.b, z3.b whilelo p0.b, w3, w4 usdot z1.s, z0.b, z2.b b.any .L2 uaddv d0, p1, z1.s fmov x1, d0 add w0, w0, w1 ret instead of f: movi v0.4s, 0 mov x3, 0 .p2align 3,,7 .L2: ldr q2, [x1, x3] ldr q1, [x2, x3] add x3, x3, 16 sxtl v4.8h, v2.8b sxtl2 v3.8h, v2.16b uxtl v2.8h, v1.8b uxtl2 v1.8h, v1.16b mul v2.8h, v2.8h, v4.8h mul v1.8h, v1.8h, v3.8h saddw v0.4s, v0.4s, v2.4h saddw2 v0.4s, v0.4s, v2.8h saddw v0.4s, v0.4s, v1.4h saddw2 v0.4s, v0.4s, v1.8h cmp x3, 480 bne .L2 addv s0, v0.4s fmov w1, s0 add w0, w0, w1 ret and f: mov x3, 0 cnth x5 mov w4, 480 mov z1.b, #0 whilelo p0.h, wzr, w4 ptrue p2.b, all .p2align 3,,7 .L2: ld1sb z2.h, p0/z, [x1, x3] punpklo p1.h, p0.b ld1b z0.h, p0/z, [x2, x3] add x3, x3, x5 mul z0.h, p2/m, z0.h, z2.h sunpklo z2.s, z0.h sunpkhi z0.s, z0.h add z1.s, p1/m, z1.s, z2.s punpkhi p1.h, p0.b whilelo p0.h, w3, w4 add z1.s, p1/m, z1.s, z0.s b.any .L2 uaddv d0, p2, z1.s fmov x1, d0 add w0, w0, w1 ret gcc/ChangeLog: * config/aarch64/aarch64-simd.md (aarch64_usdot): Rename to... (usdot_prod): ... This. * config/aarch64/aarch64-simd-builtins.def (usdot): Rename to... (usdot_prod): ...This. * config/aarch64/arm_neon.h (vusdot_s32, vusdotq_s32): Likewise. * config/aarch64/aarch64-sve.md (@aarch64_dot_prod): Rename to... (@dot_prod): ...This. * config/aarch64/aarch64-sve-builtins-base.cc (svusdot_impl::expand): Use it. gcc/testsuite/ChangeLog: * gcc.target/aarch64/simd/vusdot-autovec.c: New test. * gcc.target/aarch64/sve/vusdot-autovec.c: New test. --- gcc/config/aarch64/aarch64-simd-builtins.def | 5 +++-- gcc/config/aarch64/aarch64-simd.md | 2 +- gcc/config/aarch64/aarch64-sve-builtins-base.cc | 2 +- gcc/config/aarch64/aarch64-sve.md | 2 +- gcc/config/aarch64/arm_neon.h | 4 ++-- 5 files changed, 8 insertions(+), 7 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index ac5d4fc..063f503 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -374,10 +374,11 @@ BUILTIN_VSDQ_I_DI (BINOP, srshl, 0, NONE) BUILTIN_VSDQ_I_DI (BINOP_UUS, urshl, 0, NONE) - /* Implemented by aarch64_{_lane}{q}. */ + /* Implemented by _prod. */ BUILTIN_VB (TERNOP, sdot, 0, NONE) BUILTIN_VB (TERNOPU, udot, 0, NONE) - BUILTIN_VB (TERNOP_SSUS, usdot, 0, NONE) + BUILTIN_VB (TERNOP_SSUS, usdot_prod, 10, NONE) + /* Implemented by aarch64__lane{q}. 
*/ BUILTIN_VB (QUADOP_LANE, sdot_lane, 0, NONE) BUILTIN_VB (QUADOPU_LANE, udot_lane, 0, NONE) BUILTIN_VB (QUADOP_LANE, sdot_laneq, 0, NONE) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 540244c..7489098 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -601,7 +601,7 @@ ;; These instructions map to the __builtins for the armv8.6a I8MM usdot ;; (vector) Dot Product operation. -(define_insn "aarch64_usdot" +(define_insn "usdot_prod" [(set (match_operand:VS 0 "register_operand" "=w") (plus:VS (unspec:VS [(match_operand: 2 "register_operand" "w") diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc index 8fd6d3f..02e42a7 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc @@ -2366,7 +2366,7 @@ public: Hence we do the same rotation on arguments as svdot_impl does. */ e.rotate_inputs_left (0, 3); machine_mode mode = e.vector_mode (0); - insn_code icode = code_for_aarch64_dot_prod (UNSPEC_USDOT, mode); + insn_code icode = code_for_dot_prod (UNSPEC_USDOT, mode); return e.use_exact_insn (icode); } diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index 9e48c0e..359fe0e 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -6870,7 +6870,7 @@ [(set_attr "movprfx" "*,yes")] ) -(define_insn "@aarch64_dot_prod" +(define_insn "@dot_prod" [(set (match_operand:VNx4SI_ONLY 0 "register_operand" "=w, ?&w") (plus:VNx4SI_ONLY (unspec:VNx4SI_ONLY diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 17e059e..00d76ea 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -34039,14 +34039,14 @@ __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vusdot_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b) { - return __builtin_aarch64_usdotv8qi_ssus (__r, __a, __b); + return __builtin_aarch64_usdot_prodv8qi_ssus (__r, __a, __b); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vusdotq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b) { - return __builtin_aarch64_usdotv16qi_ssus (__r, __a, __b); + return __builtin_aarch64_usdot_prodv16qi_ssus (__r, __a, __b); } __extension__ extern __inline int32x2_t -- cgit v1.1 From 6412c58c781f64b60e7353e762cd5cec62a863e7 Mon Sep 17 00:00:00 2001 From: Tamar Christina Date: Wed, 14 Jul 2021 15:20:45 +0100 Subject: AArch32: Add support for sign differing dot-product usdot for NEON. This adds optabs implementing usdot_prod. The following testcase: #define N 480 #define SIGNEDNESS_1 unsigned #define SIGNEDNESS_2 signed #define SIGNEDNESS_3 signed #define SIGNEDNESS_4 unsigned SIGNEDNESS_1 int __attribute__ ((noipa)) f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a, SIGNEDNESS_4 char *restrict b) { for (__INTPTR_TYPE__ i = 0; i < N; ++i) { int av = a[i]; int bv = b[i]; SIGNEDNESS_2 short mult = av * bv; res += mult; } return res; } Generates f: vmov.i32 q8, #0 @ v4si add r3, r2, #480 .L2: vld1.8 {q10}, [r2]! vld1.8 {q9}, [r1]! vusdot.s8 q8, q9, q10 cmp r3, r2 bne .L2 vadd.i32 d16, d16, d17 vpadd.i32 d16, d16, d16 vmov.32 r3, d16[0] add r0, r0, r3 bx lr instead of f: vmov.i32 q8, #0 @ v4si add r3, r2, #480 .L2: vld1.8 {q9}, [r2]! vld1.8 {q11}, [r1]! 
cmp r3, r2 vmull.s8 q10, d18, d22 vmull.s8 q9, d19, d23 vaddw.s16 q8, q8, d20 vaddw.s16 q8, q8, d21 vaddw.s16 q8, q8, d18 vaddw.s16 q8, q8, d19 bne .L2 vadd.i32 d16, d16, d17 vpadd.i32 d16, d16, d16 vmov.32 r3, d16[0] add r0, r0, r3 bx lr For NEON. I couldn't figure out if the MVE instruction vmlaldav.s16 could be used to emulate this. Because it would require additional widening to work I left MVE out of this patch set but perhaps someone should take a look. gcc/ChangeLog: * config/arm/neon.md (usdot_prod): New. gcc/testsuite/ChangeLog: * gcc.target/arm/simd/vusdot-autovec.c: New test. --- gcc/config/arm/neon.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 64365e0..8b0a396 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -2969,6 +2969,18 @@ DONE; }) +;; Auto-vectorizer pattern for usdot +(define_expand "usdot_prod" + [(set (match_operand:VCVTI 0 "register_operand") + (plus:VCVTI (unspec:VCVTI [(match_operand: 1 + "register_operand") + (match_operand: 2 + "register_operand")] + UNSPEC_DOT_US) + (match_operand:VCVTI 3 "register_operand")))] + "TARGET_I8MM" +) + (define_expand "neon_copysignf" [(match_operand:VCVTF 0 "register_operand") (match_operand:VCVTF 1 "register_operand") -- cgit v1.1 From c9165e2d58bb037793c1c93e1b5633a61f88db30 Mon Sep 17 00:00:00 2001 From: Tamar Christina Date: Wed, 14 Jul 2021 15:22:37 +0100 Subject: AArch32: Correct sdot RTL on aarch32 The RTL Generated from dot_prod is invalid as operand3 cannot be written to, it's a normal input. For the expand it's just another operand but the caller does not expect it to be written to. gcc/ChangeLog: * config/arm/neon.md (dot_prod): Drop statements. --- gcc/config/arm/neon.md | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 8b0a396..7645121 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -2961,13 +2961,7 @@ DOTPROD) (match_operand:VCVTI 3 "register_operand")))] "TARGET_DOTPROD" -{ - emit_insn ( - gen_neon_dot (operands[3], operands[3], operands[1], - operands[2])); - emit_insn (gen_rtx_SET (operands[0], operands[3])); - DONE; -}) +) ;; Auto-vectorizer pattern for usdot (define_expand "usdot_prod" -- cgit v1.1 From 6d1cdb27828d2ef1ae1ab0209836646a269b9610 Mon Sep 17 00:00:00 2001 From: Tamar Christina Date: Wed, 14 Jul 2021 15:23:23 +0100 Subject: AArch64: Correct dot-product auto-vect optab RTL The current RTL for the vectorizer patterns for dot-product are incorrect. Operand3 isn't an output parameter so we can't write to it. This fixes this issue and reduces the number of RTL. gcc/ChangeLog: * config/aarch64/aarch64-simd-builtins.def (udot, sdot): Rename to... (sdot_prod, udot_prod): ...These. * config/aarch64/aarch64-simd.md (dot_prod): Remove. (aarch64_dot): Rename to... (dot_prod): ...This. * config/aarch64/arm_neon.h (vdot_u32, vdotq_u32, vdot_s32, vdotq_s32): Update builtins. 
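For reference, a minimal C loop of the shape the dot_prod optab covers (narrow inputs, widening multiply, accumulation into a wider sum); on a dot-product-capable AArch64 target the vectorizer is expected to lower this through the renamed sdot_prod/udot_prod expanders, with the accumulator supplied as operand 3 and the result written to operand 0. N below is an arbitrary illustrative trip count, not taken from the patch:

#define N 256

int
dot_s8 (signed char *a, signed char *b)
{
  int sum = 0;
  for (int i = 0; i < N; i++)
    sum += a[i] * b[i];	/* s8 * s8 products accumulated into an s32 sum */
  return sum;
}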
--- gcc/config/aarch64/aarch64-simd-builtins.def | 4 +- gcc/config/aarch64/aarch64-simd.md | 62 +++++++++++----------------- gcc/config/aarch64/arm_neon.h | 8 ++-- 3 files changed, 29 insertions(+), 45 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index 063f503..99e7348 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -375,8 +375,8 @@ BUILTIN_VSDQ_I_DI (BINOP_UUS, urshl, 0, NONE) /* Implemented by _prod. */ - BUILTIN_VB (TERNOP, sdot, 0, NONE) - BUILTIN_VB (TERNOPU, udot, 0, NONE) + BUILTIN_VB (TERNOP, sdot_prod, 10, NONE) + BUILTIN_VB (TERNOPU, udot_prod, 10, NONE) BUILTIN_VB (TERNOP_SSUS, usdot_prod, 10, NONE) /* Implemented by aarch64__lane{q}. */ BUILTIN_VB (QUADOP_LANE, sdot_lane, 0, NONE) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 7489098..88fa5ba 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -587,8 +587,28 @@ DONE; }) -;; These instructions map to the __builtins for the Dot Product operations. -(define_insn "aarch64_dot" +;; These expands map to the Dot Product optab the vectorizer checks for +;; and to the intrinsics patttern. +;; The auto-vectorizer expects a dot product builtin that also does an +;; accumulation into the provided register. +;; Given the following pattern +;; +;; for (i=0; idot_prod" [(set (match_operand:VS 0 "register_operand" "=w") (plus:VS (match_operand:VS 1 "register_operand" "0") (unspec:VS [(match_operand: 2 "register_operand" "w") @@ -613,41 +633,6 @@ [(set_attr "type" "neon_dot")] ) -;; These expands map to the Dot Product optab the vectorizer checks for. -;; The auto-vectorizer expects a dot product builtin that also does an -;; accumulation into the provided register. -;; Given the following pattern -;; -;; for (i=0; idot_prod" - [(set (match_operand:VS 0 "register_operand") - (plus:VS (unspec:VS [(match_operand: 1 "register_operand") - (match_operand: 2 "register_operand")] - DOTPROD) - (match_operand:VS 3 "register_operand")))] - "TARGET_DOTPROD" -{ - emit_insn ( - gen_aarch64_dot (operands[3], operands[3], operands[1], - operands[2])); - emit_insn (gen_rtx_SET (operands[0], operands[3])); - DONE; -}) - ;; These instructions map to the __builtins for the Dot Product ;; indexed operations. 
(define_insn "aarch64_dot_lane" @@ -944,8 +929,7 @@ rtx ones = force_reg (V16QImode, CONST1_RTX (V16QImode)); rtx abd = gen_reg_rtx (V16QImode); emit_insn (gen_aarch64_abdv16qi (abd, operands[1], operands[2])); - emit_insn (gen_aarch64_udotv16qi (operands[0], operands[3], - abd, ones)); + emit_insn (gen_udot_prodv16qi (operands[0], operands[3], abd, ones)); DONE; } rtx reduc = gen_reg_rtx (V8HImode); diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 00d76ea..597f44c 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -31767,28 +31767,28 @@ __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdot_u32 (uint32x2_t __r, uint8x8_t __a, uint8x8_t __b) { - return __builtin_aarch64_udotv8qi_uuuu (__r, __a, __b); + return __builtin_aarch64_udot_prodv8qi_uuuu (__r, __a, __b); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdotq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b) { - return __builtin_aarch64_udotv16qi_uuuu (__r, __a, __b); + return __builtin_aarch64_udot_prodv16qi_uuuu (__r, __a, __b); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdot_s32 (int32x2_t __r, int8x8_t __a, int8x8_t __b) { - return __builtin_aarch64_sdotv8qi (__r, __a, __b); + return __builtin_aarch64_sdot_prodv8qi (__r, __a, __b); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdotq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b) { - return __builtin_aarch64_sdotv16qi (__r, __a, __b); + return __builtin_aarch64_sdot_prodv16qi (__r, __a, __b); } __extension__ extern __inline uint32x2_t -- cgit v1.1 From 7d914777fc6c6151f430d798fc97bae927a430f7 Mon Sep 17 00:00:00 2001 From: Peter Bergner Date: Wed, 14 Jul 2021 18:23:31 -0500 Subject: rs6000: Move rs6000_split_multireg_move to later in file An upcoming change to rs6000_split_multireg_move requires it to be moved later in the file to fix a declaration issue. 2021-07-14 Peter Bergner gcc/ * config/rs6000/rs6000.c (rs6000_split_multireg_move): Move to later in the file. --- gcc/config/rs6000/rs6000.c | 1845 ++++++++++++++++++++++---------------------- 1 file changed, 922 insertions(+), 923 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index de11de5..1d27bb8 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -16690,533 +16690,157 @@ rs6000_expand_atomic_op (enum rtx_code code, rtx mem, rtx val, emit_move_insn (orig_after, after); } -/* Emit instructions to move SRC to DST. Called by splitters for - multi-register moves. It will emit at most one instruction for - each register that is accessed; that is, it won't emit li/lis pairs - (or equivalent for 64-bit code). One of SRC or DST must be a hard - register. */ +static GTY(()) alias_set_type TOC_alias_set = -1; -void -rs6000_split_multireg_move (rtx dst, rtx src) +alias_set_type +get_TOC_alias_set (void) { - /* The register number of the first register being moved. */ - int reg; - /* The mode that is to be moved. */ - machine_mode mode; - /* The mode that the move is being done in, and its size. */ - machine_mode reg_mode; - int reg_mode_size; - /* The number of registers that will be moved. */ - int nregs; + if (TOC_alias_set == -1) + TOC_alias_set = new_alias_set (); + return TOC_alias_set; +} - reg = REG_P (dst) ? 
REGNO (dst) : REGNO (src); - mode = GET_MODE (dst); - nregs = hard_regno_nregs (reg, mode); +/* The mode the ABI uses for a word. This is not the same as word_mode + for -m32 -mpowerpc64. This is used to implement various target hooks. */ - /* If we have a vector quad register for MMA, and this is a load or store, - see if we can use vector paired load/stores. */ - if (mode == XOmode && TARGET_MMA - && (MEM_P (dst) || MEM_P (src))) - { - reg_mode = OOmode; - nregs /= 2; - } - /* If we have a vector pair/quad mode, split it into two/four separate - vectors. */ - else if (mode == OOmode || mode == XOmode) - reg_mode = V1TImode; - else if (FP_REGNO_P (reg)) - reg_mode = DECIMAL_FLOAT_MODE_P (mode) ? DDmode : - (TARGET_HARD_FLOAT ? DFmode : SFmode); - else if (ALTIVEC_REGNO_P (reg)) - reg_mode = V16QImode; +static scalar_int_mode +rs6000_abi_word_mode (void) +{ + return TARGET_32BIT ? SImode : DImode; +} + +/* Implement the TARGET_OFFLOAD_OPTIONS hook. */ +static char * +rs6000_offload_options (void) +{ + if (TARGET_64BIT) + return xstrdup ("-foffload-abi=lp64"); else - reg_mode = word_mode; - reg_mode_size = GET_MODE_SIZE (reg_mode); + return xstrdup ("-foffload-abi=ilp32"); +} - gcc_assert (reg_mode_size * nregs == GET_MODE_SIZE (mode)); + +/* A quick summary of the various types of 'constant-pool tables' + under PowerPC: - /* TDmode residing in FP registers is special, since the ISA requires that - the lower-numbered word of a register pair is always the most significant - word, even in little-endian mode. This does not match the usual subreg - semantics, so we cannnot use simplify_gen_subreg in those cases. Access - the appropriate constituent registers "by hand" in little-endian mode. + Target Flags Name One table per + AIX (none) AIX TOC object file + AIX -mfull-toc AIX TOC object file + AIX -mminimal-toc AIX minimal TOC translation unit + SVR4/EABI (none) SVR4 SDATA object file + SVR4/EABI -fpic SVR4 pic object file + SVR4/EABI -fPIC SVR4 PIC translation unit + SVR4/EABI -mrelocatable EABI TOC function + SVR4/EABI -maix AIX TOC object file + SVR4/EABI -maix -mminimal-toc + AIX minimal TOC translation unit - Note we do not need to check for destructive overlap here since TDmode - can only reside in even/odd register pairs. */ - if (FP_REGNO_P (reg) && DECIMAL_FLOAT_MODE_P (mode) && !BYTES_BIG_ENDIAN) - { - rtx p_src, p_dst; - int i; + Name Reg. Set by entries contains: + made by addrs? fp? sum? - for (i = 0; i < nregs; i++) - { - if (REG_P (src) && FP_REGNO_P (REGNO (src))) - p_src = gen_rtx_REG (reg_mode, REGNO (src) + nregs - 1 - i); - else - p_src = simplify_gen_subreg (reg_mode, src, mode, - i * reg_mode_size); + AIX TOC 2 crt0 as Y option option + AIX minimal TOC 30 prolog gcc Y Y option + SVR4 SDATA 13 crt0 gcc N Y N + SVR4 pic 30 prolog ld Y not yet N + SVR4 PIC 30 prolog gcc Y option option + EABI TOC 30 prolog gcc Y option option - if (REG_P (dst) && FP_REGNO_P (REGNO (dst))) - p_dst = gen_rtx_REG (reg_mode, REGNO (dst) + nregs - 1 - i); - else - p_dst = simplify_gen_subreg (reg_mode, dst, mode, - i * reg_mode_size); +*/ - emit_insn (gen_rtx_SET (p_dst, p_src)); - } +/* Hash functions for the hash table. 
*/ - return; - } +static unsigned +rs6000_hash_constant (rtx k) +{ + enum rtx_code code = GET_CODE (k); + machine_mode mode = GET_MODE (k); + unsigned result = (code << 3) ^ mode; + const char *format; + int flen, fidx; - /* The __vector_pair and __vector_quad modes are multi-register - modes, so if we have to load or store the registers, we have to be - careful to properly swap them if we're in little endian mode - below. This means the last register gets the first memory - location. We also need to be careful of using the right register - numbers if we are splitting XO to OO. */ - if (mode == OOmode || mode == XOmode) + format = GET_RTX_FORMAT (code); + flen = strlen (format); + fidx = 0; + + switch (code) { - nregs = hard_regno_nregs (reg, mode); - int reg_mode_nregs = hard_regno_nregs (reg, reg_mode); - if (MEM_P (dst)) - { - unsigned offset = 0; - unsigned size = GET_MODE_SIZE (reg_mode); + case LABEL_REF: + return result * 1231 + (unsigned) INSN_UID (XEXP (k, 0)); - /* If we are reading an accumulator register, we have to - deprime it before we can access it. */ - if (TARGET_MMA - && GET_MODE (src) == XOmode && FP_REGNO_P (REGNO (src))) - emit_insn (gen_mma_xxmfacc (src, src)); + case CONST_WIDE_INT: + { + int i; + flen = CONST_WIDE_INT_NUNITS (k); + for (i = 0; i < flen; i++) + result = result * 613 + CONST_WIDE_INT_ELT (k, i); + return result; + } - for (int i = 0; i < nregs; i += reg_mode_nregs) - { - unsigned subreg = - (WORDS_BIG_ENDIAN) ? i : (nregs - reg_mode_nregs - i); - rtx dst2 = adjust_address (dst, reg_mode, offset); - rtx src2 = gen_rtx_REG (reg_mode, reg + subreg); - offset += size; - emit_insn (gen_rtx_SET (dst2, src2)); - } + case CONST_DOUBLE: + return real_hash (CONST_DOUBLE_REAL_VALUE (k)) * result; - return; - } + case CODE_LABEL: + fidx = 3; + break; - if (MEM_P (src)) + default: + break; + } + + for (; fidx < flen; fidx++) + switch (format[fidx]) + { + case 's': { - unsigned offset = 0; - unsigned size = GET_MODE_SIZE (reg_mode); + unsigned i, len; + const char *str = XSTR (k, fidx); + len = strlen (str); + result = result * 613 + len; + for (i = 0; i < len; i++) + result = result * 613 + (unsigned) str[i]; + break; + } + case 'u': + case 'e': + result = result * 1231 + rs6000_hash_constant (XEXP (k, fidx)); + break; + case 'i': + case 'n': + result = result * 613 + (unsigned) XINT (k, fidx); + break; + case 'w': + if (sizeof (unsigned) >= sizeof (HOST_WIDE_INT)) + result = result * 613 + (unsigned) XWINT (k, fidx); + else + { + size_t i; + for (i = 0; i < sizeof (HOST_WIDE_INT) / sizeof (unsigned); i++) + result = result * 613 + (unsigned) (XWINT (k, fidx) + >> CHAR_BIT * i); + } + break; + case '0': + break; + default: + gcc_unreachable (); + } - for (int i = 0; i < nregs; i += reg_mode_nregs) - { - unsigned subreg = - (WORDS_BIG_ENDIAN) ? i : (nregs - reg_mode_nregs - i); - rtx dst2 = gen_rtx_REG (reg_mode, reg + subreg); - rtx src2 = adjust_address (src, reg_mode, offset); - offset += size; - emit_insn (gen_rtx_SET (dst2, src2)); - } + return result; +} - /* If we are writing an accumulator register, we have to - prime it after we've written it. */ - if (TARGET_MMA - && GET_MODE (dst) == XOmode && FP_REGNO_P (REGNO (dst))) - emit_insn (gen_mma_xxmtacc (dst, dst)); +hashval_t +toc_hasher::hash (toc_hash_struct *thc) +{ + return rs6000_hash_constant (thc->key) ^ thc->key_mode; +} - return; - } +/* Compare H1 and H2 for equivalence. 
*/ - if (GET_CODE (src) == UNSPEC) - { - gcc_assert (XINT (src, 1) == UNSPEC_MMA_ASSEMBLE); - gcc_assert (REG_P (dst)); - if (GET_MODE (src) == XOmode) - gcc_assert (FP_REGNO_P (REGNO (dst))); - if (GET_MODE (src) == OOmode) - gcc_assert (VSX_REGNO_P (REGNO (dst))); - - reg_mode = GET_MODE (XVECEXP (src, 0, 0)); - int nvecs = XVECLEN (src, 0); - for (int i = 0; i < nvecs; i++) - { - int index = WORDS_BIG_ENDIAN ? i : nvecs - 1 - i; - rtx dst_i = gen_rtx_REG (reg_mode, reg + index); - emit_insn (gen_rtx_SET (dst_i, XVECEXP (src, 0, i))); - } - - /* We are writing an accumulator register, so we have to - prime it after we've written it. */ - if (GET_MODE (src) == XOmode) - emit_insn (gen_mma_xxmtacc (dst, dst)); - - return; - } - - /* Register -> register moves can use common code. */ - } - - if (REG_P (src) && REG_P (dst) && (REGNO (src) < REGNO (dst))) - { - /* If we are reading an accumulator register, we have to - deprime it before we can access it. */ - if (TARGET_MMA - && GET_MODE (src) == XOmode && FP_REGNO_P (REGNO (src))) - emit_insn (gen_mma_xxmfacc (src, src)); - - /* Move register range backwards, if we might have destructive - overlap. */ - int i; - /* XO/OO are opaque so cannot use subregs. */ - if (mode == OOmode || mode == XOmode ) - { - for (i = nregs - 1; i >= 0; i--) - { - rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + i); - rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + i); - emit_insn (gen_rtx_SET (dst_i, src_i)); - } - } - else - { - for (i = nregs - 1; i >= 0; i--) - emit_insn (gen_rtx_SET (simplify_gen_subreg (reg_mode, dst, mode, - i * reg_mode_size), - simplify_gen_subreg (reg_mode, src, mode, - i * reg_mode_size))); - } - - /* If we are writing an accumulator register, we have to - prime it after we've written it. */ - if (TARGET_MMA - && GET_MODE (dst) == XOmode && FP_REGNO_P (REGNO (dst))) - emit_insn (gen_mma_xxmtacc (dst, dst)); - } - else - { - int i; - int j = -1; - bool used_update = false; - rtx restore_basereg = NULL_RTX; - - if (MEM_P (src) && INT_REGNO_P (reg)) - { - rtx breg; - - if (GET_CODE (XEXP (src, 0)) == PRE_INC - || GET_CODE (XEXP (src, 0)) == PRE_DEC) - { - rtx delta_rtx; - breg = XEXP (XEXP (src, 0), 0); - delta_rtx = (GET_CODE (XEXP (src, 0)) == PRE_INC - ? GEN_INT (GET_MODE_SIZE (GET_MODE (src))) - : GEN_INT (-GET_MODE_SIZE (GET_MODE (src)))); - emit_insn (gen_add3_insn (breg, breg, delta_rtx)); - src = replace_equiv_address (src, breg); - } - else if (! rs6000_offsettable_memref_p (src, reg_mode, true)) - { - if (GET_CODE (XEXP (src, 0)) == PRE_MODIFY) - { - rtx basereg = XEXP (XEXP (src, 0), 0); - if (TARGET_UPDATE) - { - rtx ndst = simplify_gen_subreg (reg_mode, dst, mode, 0); - emit_insn (gen_rtx_SET (ndst, - gen_rtx_MEM (reg_mode, - XEXP (src, 0)))); - used_update = true; - } - else - emit_insn (gen_rtx_SET (basereg, - XEXP (XEXP (src, 0), 1))); - src = replace_equiv_address (src, basereg); - } - else - { - rtx basereg = gen_rtx_REG (Pmode, reg); - emit_insn (gen_rtx_SET (basereg, XEXP (src, 0))); - src = replace_equiv_address (src, basereg); - } - } - - breg = XEXP (src, 0); - if (GET_CODE (breg) == PLUS || GET_CODE (breg) == LO_SUM) - breg = XEXP (breg, 0); - - /* If the base register we are using to address memory is - also a destination reg, then change that register last. 
*/ - if (REG_P (breg) - && REGNO (breg) >= REGNO (dst) - && REGNO (breg) < REGNO (dst) + nregs) - j = REGNO (breg) - REGNO (dst); - } - else if (MEM_P (dst) && INT_REGNO_P (reg)) - { - rtx breg; - - if (GET_CODE (XEXP (dst, 0)) == PRE_INC - || GET_CODE (XEXP (dst, 0)) == PRE_DEC) - { - rtx delta_rtx; - breg = XEXP (XEXP (dst, 0), 0); - delta_rtx = (GET_CODE (XEXP (dst, 0)) == PRE_INC - ? GEN_INT (GET_MODE_SIZE (GET_MODE (dst))) - : GEN_INT (-GET_MODE_SIZE (GET_MODE (dst)))); - - /* We have to update the breg before doing the store. - Use store with update, if available. */ - - if (TARGET_UPDATE) - { - rtx nsrc = simplify_gen_subreg (reg_mode, src, mode, 0); - emit_insn (TARGET_32BIT - ? (TARGET_POWERPC64 - ? gen_movdi_si_update (breg, breg, delta_rtx, nsrc) - : gen_movsi_si_update (breg, breg, delta_rtx, nsrc)) - : gen_movdi_di_update (breg, breg, delta_rtx, nsrc)); - used_update = true; - } - else - emit_insn (gen_add3_insn (breg, breg, delta_rtx)); - dst = replace_equiv_address (dst, breg); - } - else if (!rs6000_offsettable_memref_p (dst, reg_mode, true) - && GET_CODE (XEXP (dst, 0)) != LO_SUM) - { - if (GET_CODE (XEXP (dst, 0)) == PRE_MODIFY) - { - rtx basereg = XEXP (XEXP (dst, 0), 0); - if (TARGET_UPDATE) - { - rtx nsrc = simplify_gen_subreg (reg_mode, src, mode, 0); - emit_insn (gen_rtx_SET (gen_rtx_MEM (reg_mode, - XEXP (dst, 0)), - nsrc)); - used_update = true; - } - else - emit_insn (gen_rtx_SET (basereg, - XEXP (XEXP (dst, 0), 1))); - dst = replace_equiv_address (dst, basereg); - } - else - { - rtx basereg = XEXP (XEXP (dst, 0), 0); - rtx offsetreg = XEXP (XEXP (dst, 0), 1); - gcc_assert (GET_CODE (XEXP (dst, 0)) == PLUS - && REG_P (basereg) - && REG_P (offsetreg) - && REGNO (basereg) != REGNO (offsetreg)); - if (REGNO (basereg) == 0) - { - rtx tmp = offsetreg; - offsetreg = basereg; - basereg = tmp; - } - emit_insn (gen_add3_insn (basereg, basereg, offsetreg)); - restore_basereg = gen_sub3_insn (basereg, basereg, offsetreg); - dst = replace_equiv_address (dst, basereg); - } - } - else if (GET_CODE (XEXP (dst, 0)) != LO_SUM) - gcc_assert (rs6000_offsettable_memref_p (dst, reg_mode, true)); - } - - /* If we are reading an accumulator register, we have to - deprime it before we can access it. */ - if (TARGET_MMA && REG_P (src) - && GET_MODE (src) == XOmode && FP_REGNO_P (REGNO (src))) - emit_insn (gen_mma_xxmfacc (src, src)); - - for (i = 0; i < nregs; i++) - { - /* Calculate index to next subword. */ - ++j; - if (j == nregs) - j = 0; - - /* If compiler already emitted move of first word by - store with update, no need to do anything. */ - if (j == 0 && used_update) - continue; - - /* XO/OO are opaque so cannot use subregs. */ - if (mode == OOmode || mode == XOmode ) - { - rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + j); - rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + j); - emit_insn (gen_rtx_SET (dst_i, src_i)); - } - else - emit_insn (gen_rtx_SET (simplify_gen_subreg (reg_mode, dst, mode, - j * reg_mode_size), - simplify_gen_subreg (reg_mode, src, mode, - j * reg_mode_size))); - } - - /* If we are writing an accumulator register, we have to - prime it after we've written it. 
*/ - if (TARGET_MMA && REG_P (dst) - && GET_MODE (dst) == XOmode && FP_REGNO_P (REGNO (dst))) - emit_insn (gen_mma_xxmtacc (dst, dst)); - - if (restore_basereg != NULL_RTX) - emit_insn (restore_basereg); - } -} - -static GTY(()) alias_set_type TOC_alias_set = -1; - -alias_set_type -get_TOC_alias_set (void) -{ - if (TOC_alias_set == -1) - TOC_alias_set = new_alias_set (); - return TOC_alias_set; -} - -/* The mode the ABI uses for a word. This is not the same as word_mode - for -m32 -mpowerpc64. This is used to implement various target hooks. */ - -static scalar_int_mode -rs6000_abi_word_mode (void) -{ - return TARGET_32BIT ? SImode : DImode; -} - -/* Implement the TARGET_OFFLOAD_OPTIONS hook. */ -static char * -rs6000_offload_options (void) -{ - if (TARGET_64BIT) - return xstrdup ("-foffload-abi=lp64"); - else - return xstrdup ("-foffload-abi=ilp32"); -} - - -/* A quick summary of the various types of 'constant-pool tables' - under PowerPC: - - Target Flags Name One table per - AIX (none) AIX TOC object file - AIX -mfull-toc AIX TOC object file - AIX -mminimal-toc AIX minimal TOC translation unit - SVR4/EABI (none) SVR4 SDATA object file - SVR4/EABI -fpic SVR4 pic object file - SVR4/EABI -fPIC SVR4 PIC translation unit - SVR4/EABI -mrelocatable EABI TOC function - SVR4/EABI -maix AIX TOC object file - SVR4/EABI -maix -mminimal-toc - AIX minimal TOC translation unit - - Name Reg. Set by entries contains: - made by addrs? fp? sum? - - AIX TOC 2 crt0 as Y option option - AIX minimal TOC 30 prolog gcc Y Y option - SVR4 SDATA 13 crt0 gcc N Y N - SVR4 pic 30 prolog ld Y not yet N - SVR4 PIC 30 prolog gcc Y option option - EABI TOC 30 prolog gcc Y option option - -*/ - -/* Hash functions for the hash table. */ - -static unsigned -rs6000_hash_constant (rtx k) -{ - enum rtx_code code = GET_CODE (k); - machine_mode mode = GET_MODE (k); - unsigned result = (code << 3) ^ mode; - const char *format; - int flen, fidx; - - format = GET_RTX_FORMAT (code); - flen = strlen (format); - fidx = 0; - - switch (code) - { - case LABEL_REF: - return result * 1231 + (unsigned) INSN_UID (XEXP (k, 0)); - - case CONST_WIDE_INT: - { - int i; - flen = CONST_WIDE_INT_NUNITS (k); - for (i = 0; i < flen; i++) - result = result * 613 + CONST_WIDE_INT_ELT (k, i); - return result; - } - - case CONST_DOUBLE: - return real_hash (CONST_DOUBLE_REAL_VALUE (k)) * result; - - case CODE_LABEL: - fidx = 3; - break; - - default: - break; - } - - for (; fidx < flen; fidx++) - switch (format[fidx]) - { - case 's': - { - unsigned i, len; - const char *str = XSTR (k, fidx); - len = strlen (str); - result = result * 613 + len; - for (i = 0; i < len; i++) - result = result * 613 + (unsigned) str[i]; - break; - } - case 'u': - case 'e': - result = result * 1231 + rs6000_hash_constant (XEXP (k, fidx)); - break; - case 'i': - case 'n': - result = result * 613 + (unsigned) XINT (k, fidx); - break; - case 'w': - if (sizeof (unsigned) >= sizeof (HOST_WIDE_INT)) - result = result * 613 + (unsigned) XWINT (k, fidx); - else - { - size_t i; - for (i = 0; i < sizeof (HOST_WIDE_INT) / sizeof (unsigned); i++) - result = result * 613 + (unsigned) (XWINT (k, fidx) - >> CHAR_BIT * i); - } - break; - case '0': - break; - default: - gcc_unreachable (); - } - - return result; -} - -hashval_t -toc_hasher::hash (toc_hash_struct *thc) -{ - return rs6000_hash_constant (thc->key) ^ thc->key_mode; -} - -/* Compare H1 and H2 for equivalence. 
*/ - -bool -toc_hasher::equal (toc_hash_struct *h1, toc_hash_struct *h2) -{ - rtx r1 = h1->key; - rtx r2 = h2->key; +bool +toc_hasher::equal (toc_hash_struct *h1, toc_hash_struct *h2) +{ + rtx r1 = h1->key; + rtx r2 = h2->key; if (h1->key_mode != h2->key_mode) return 0; @@ -26450,538 +26074,913 @@ prefixed_load_p (rtx_insn *insn) if (non_prefixed == NON_PREFIXED_X && is_lfs_stfs_insn (insn)) return address_is_prefixed (XEXP (mem, 0), mem_mode, NON_PREFIXED_DEFAULT); else - return address_is_prefixed (XEXP (mem, 0), mem_mode, non_prefixed); + return address_is_prefixed (XEXP (mem, 0), mem_mode, non_prefixed); +} + +/* Whether a store instruction is a prefixed instruction. This is called from + the prefixed attribute processing. */ + +bool +prefixed_store_p (rtx_insn *insn) +{ + /* Validate the insn to make sure it is a normal store insn. */ + extract_insn_cached (insn); + if (recog_data.n_operands < 2) + return false; + + rtx mem = recog_data.operand[0]; + rtx reg = recog_data.operand[1]; + + if (!REG_P (reg) && !SUBREG_P (reg)) + return false; + + if (!MEM_P (mem)) + return false; + + /* Prefixed store instructions do not support update or indexed forms. */ + if (get_attr_indexed (insn) == INDEXED_YES + || get_attr_update (insn) == UPDATE_YES) + return false; + + machine_mode mem_mode = GET_MODE (mem); + rtx addr = XEXP (mem, 0); + enum non_prefixed_form non_prefixed = reg_to_non_prefixed (reg, mem_mode); + + /* Need to make sure we aren't looking at a stfs which doesn't look + like the other things reg_to_non_prefixed/address_is_prefixed + looks for. */ + if (non_prefixed == NON_PREFIXED_X && is_lfs_stfs_insn (insn)) + return address_is_prefixed (addr, mem_mode, NON_PREFIXED_DEFAULT); + else + return address_is_prefixed (addr, mem_mode, non_prefixed); +} + +/* Whether a load immediate or add instruction is a prefixed instruction. This + is called from the prefixed attribute processing. */ + +bool +prefixed_paddi_p (rtx_insn *insn) +{ + rtx set = single_set (insn); + if (!set) + return false; + + rtx dest = SET_DEST (set); + rtx src = SET_SRC (set); + + if (!REG_P (dest) && !SUBREG_P (dest)) + return false; + + /* Is this a load immediate that can't be done with a simple ADDI or + ADDIS? */ + if (CONST_INT_P (src)) + return (satisfies_constraint_eI (src) + && !satisfies_constraint_I (src) + && !satisfies_constraint_L (src)); + + /* Is this a PADDI instruction that can't be done with a simple ADDI or + ADDIS? */ + if (GET_CODE (src) == PLUS) + { + rtx op1 = XEXP (src, 1); + + return (CONST_INT_P (op1) + && satisfies_constraint_eI (op1) + && !satisfies_constraint_I (op1) + && !satisfies_constraint_L (op1)); + } + + /* If not, is it a load of a PC-relative address? */ + if (!TARGET_PCREL || GET_MODE (dest) != Pmode) + return false; + + if (!SYMBOL_REF_P (src) && !LABEL_REF_P (src) && GET_CODE (src) != CONST) + return false; + + enum insn_form iform = address_to_insn_form (src, Pmode, + NON_PREFIXED_DEFAULT); + + return (iform == INSN_FORM_PCREL_EXTERNAL || iform == INSN_FORM_PCREL_LOCAL); +} + +/* Whether the next instruction needs a 'p' prefix issued before the + instruction is printed out. */ +static bool prepend_p_to_next_insn; + +/* Define FINAL_PRESCAN_INSN if some processing needs to be done before + outputting the assembler code. On the PowerPC, we remember if the current + insn is a prefixed insn where we need to emit a 'p' before the insn. + + In addition, if the insn is part of a PC-relative reference to an external + label optimization, this is recorded also. 
*/ +void +rs6000_final_prescan_insn (rtx_insn *insn, rtx [], int) +{ + prepend_p_to_next_insn = (get_attr_maybe_prefixed (insn) + == MAYBE_PREFIXED_YES + && get_attr_prefixed (insn) == PREFIXED_YES); + return; +} + +/* Define ASM_OUTPUT_OPCODE to do anything special before emitting an opcode. + We use it to emit a 'p' for prefixed insns that is set in + FINAL_PRESCAN_INSN. */ +void +rs6000_asm_output_opcode (FILE *stream) +{ + if (prepend_p_to_next_insn) + { + fprintf (stream, "p"); + + /* Reset the flag in the case where there are separate insn lines in the + sequence, so the 'p' is only emitted for the first line. This shows up + when we are doing the PCREL_OPT optimization, in that the label created + with %r would have a leading 'p' printed. */ + prepend_p_to_next_insn = false; + } + + return; +} + +/* Emit the relocation to tie the next instruction to a previous instruction + that loads up an external address. This is used to do the PCREL_OPT + optimization. Note, the label is generated after the PLD of the got + pc-relative address to allow for the assembler to insert NOPs before the PLD + instruction. The operand is a constant integer that is the label + number. */ + +void +output_pcrel_opt_reloc (rtx label_num) +{ + rtx operands[1] = { label_num }; + output_asm_insn (".reloc .Lpcrel%0-8,R_PPC64_PCREL_OPT,.-(.Lpcrel%0-8)", + operands); +} + +/* Adjust the length of an INSN. LENGTH is the currently-computed length and + should be adjusted to reflect any required changes. This macro is used when + there is some systematic length adjustment required that would be difficult + to express in the length attribute. + + In the PowerPC, we use this to adjust the length of an instruction if one or + more prefixed instructions are generated, using the attribute + num_prefixed_insns. A prefixed instruction is 8 bytes instead of 4, but the + hardware requires that a prefied instruciton does not cross a 64-byte + boundary. This means the compiler has to assume the length of the first + prefixed instruction is 12 bytes instead of 8 bytes. Since the length is + already set for the non-prefixed instruction, we just need to udpate for the + difference. */ + +int +rs6000_adjust_insn_length (rtx_insn *insn, int length) +{ + if (TARGET_PREFIXED && NONJUMP_INSN_P (insn)) + { + rtx pattern = PATTERN (insn); + if (GET_CODE (pattern) != USE && GET_CODE (pattern) != CLOBBER + && get_attr_prefixed (insn) == PREFIXED_YES) + { + int num_prefixed = get_attr_max_prefixed_insns (insn); + length += 4 * (num_prefixed + 1); + } + } + + return length; +} + + +#ifdef HAVE_GAS_HIDDEN +# define USE_HIDDEN_LINKONCE 1 +#else +# define USE_HIDDEN_LINKONCE 0 +#endif + +/* Fills in the label name that should be used for a 476 link stack thunk. */ + +void +get_ppc476_thunk_name (char name[32]) +{ + gcc_assert (TARGET_LINK_STACK); + + if (USE_HIDDEN_LINKONCE) + sprintf (name, "__ppc476.get_thunk"); + else + ASM_GENERATE_INTERNAL_LABEL (name, "LPPC476_", 0); } -/* Whether a store instruction is a prefixed instruction. This is called from - the prefixed attribute processing. */ +/* This function emits the simple thunk routine that is used to preserve + the link stack on the 476 cpu. */ -bool -prefixed_store_p (rtx_insn *insn) +static void rs6000_code_end (void) ATTRIBUTE_UNUSED; +static void +rs6000_code_end (void) { - /* Validate the insn to make sure it is a normal store insn. 
*/ - extract_insn_cached (insn); - if (recog_data.n_operands < 2) - return false; + char name[32]; + tree decl; - rtx mem = recog_data.operand[0]; - rtx reg = recog_data.operand[1]; + if (!TARGET_LINK_STACK) + return; - if (!REG_P (reg) && !SUBREG_P (reg)) - return false; + get_ppc476_thunk_name (name); - if (!MEM_P (mem)) - return false; + decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL, get_identifier (name), + build_function_type_list (void_type_node, NULL_TREE)); + DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL, + NULL_TREE, void_type_node); + TREE_PUBLIC (decl) = 1; + TREE_STATIC (decl) = 1; - /* Prefixed store instructions do not support update or indexed forms. */ - if (get_attr_indexed (insn) == INDEXED_YES - || get_attr_update (insn) == UPDATE_YES) - return false; +#if RS6000_WEAK + if (USE_HIDDEN_LINKONCE && !TARGET_XCOFF) + { + cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl)); + targetm.asm_out.unique_section (decl, 0); + switch_to_section (get_named_section (decl, NULL, 0)); + DECL_WEAK (decl) = 1; + ASM_WEAKEN_DECL (asm_out_file, decl, name, 0); + targetm.asm_out.globalize_label (asm_out_file, name); + targetm.asm_out.assemble_visibility (decl, VISIBILITY_HIDDEN); + ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl); + } + else +#endif + { + switch_to_section (text_section); + ASM_OUTPUT_LABEL (asm_out_file, name); + } - machine_mode mem_mode = GET_MODE (mem); - rtx addr = XEXP (mem, 0); - enum non_prefixed_form non_prefixed = reg_to_non_prefixed (reg, mem_mode); + DECL_INITIAL (decl) = make_node (BLOCK); + current_function_decl = decl; + allocate_struct_function (decl, false); + init_function_start (decl); + first_function_block_is_cold = false; + /* Make sure unwind info is emitted for the thunk if needed. */ + final_start_function (emit_barrier (), asm_out_file, 1); - /* Need to make sure we aren't looking at a stfs which doesn't look - like the other things reg_to_non_prefixed/address_is_prefixed - looks for. */ - if (non_prefixed == NON_PREFIXED_X && is_lfs_stfs_insn (insn)) - return address_is_prefixed (addr, mem_mode, NON_PREFIXED_DEFAULT); - else - return address_is_prefixed (addr, mem_mode, non_prefixed); + fputs ("\tblr\n", asm_out_file); + + final_end_function (); + init_insn_lengths (); + free_after_compilation (cfun); + set_cfun (NULL); + current_function_decl = NULL; } -/* Whether a load immediate or add instruction is a prefixed instruction. This - is called from the prefixed attribute processing. */ +/* Add r30 to hard reg set if the prologue sets it up and it is not + pic_offset_table_rtx. */ -bool -prefixed_paddi_p (rtx_insn *insn) +static void +rs6000_set_up_by_prologue (struct hard_reg_set_container *set) { - rtx set = single_set (insn); - if (!set) - return false; + if (!TARGET_SINGLE_PIC_BASE + && TARGET_TOC + && TARGET_MINIMAL_TOC + && !constant_pool_empty_p ()) + add_to_hard_reg_set (&set->set, Pmode, RS6000_PIC_OFFSET_TABLE_REGNUM); + if (cfun->machine->split_stack_argp_used) + add_to_hard_reg_set (&set->set, Pmode, 12); - rtx dest = SET_DEST (set); - rtx src = SET_SRC (set); + /* Make sure the hard reg set doesn't include r2, which was possibly added + via PIC_OFFSET_TABLE_REGNUM. */ + if (TARGET_TOC) + remove_from_hard_reg_set (&set->set, Pmode, TOC_REGNUM); +} - if (!REG_P (dest) && !SUBREG_P (dest)) - return false; + +/* Helper function for rs6000_split_logical to emit a logical instruction after + spliting the operation to single GPR registers. 
- /* Is this a load immediate that can't be done with a simple ADDI or - ADDIS? */ - if (CONST_INT_P (src)) - return (satisfies_constraint_eI (src) - && !satisfies_constraint_I (src) - && !satisfies_constraint_L (src)); + DEST is the destination register. + OP1 and OP2 are the input source registers. + CODE is the base operation (AND, IOR, XOR, NOT). + MODE is the machine mode. + If COMPLEMENT_FINAL_P is true, wrap the whole operation with NOT. + If COMPLEMENT_OP1_P is true, wrap operand1 with NOT. + If COMPLEMENT_OP2_P is true, wrap operand2 with NOT. */ - /* Is this a PADDI instruction that can't be done with a simple ADDI or - ADDIS? */ - if (GET_CODE (src) == PLUS) +static void +rs6000_split_logical_inner (rtx dest, + rtx op1, + rtx op2, + enum rtx_code code, + machine_mode mode, + bool complement_final_p, + bool complement_op1_p, + bool complement_op2_p) +{ + rtx bool_rtx; + + /* Optimize AND of 0/0xffffffff and IOR/XOR of 0. */ + if (op2 && CONST_INT_P (op2) + && (mode == SImode || (mode == DImode && TARGET_POWERPC64)) + && !complement_final_p && !complement_op1_p && !complement_op2_p) { - rtx op1 = XEXP (src, 1); + HOST_WIDE_INT mask = GET_MODE_MASK (mode); + HOST_WIDE_INT value = INTVAL (op2) & mask; - return (CONST_INT_P (op1) - && satisfies_constraint_eI (op1) - && !satisfies_constraint_I (op1) - && !satisfies_constraint_L (op1)); + /* Optimize AND of 0 to just set 0. Optimize AND of -1 to be a move. */ + if (code == AND) + { + if (value == 0) + { + emit_insn (gen_rtx_SET (dest, const0_rtx)); + return; + } + + else if (value == mask) + { + if (!rtx_equal_p (dest, op1)) + emit_insn (gen_rtx_SET (dest, op1)); + return; + } + } + + /* Optimize IOR/XOR of 0 to be a simple move. Split large operations + into separate ORI/ORIS or XORI/XORIS instrucitons. */ + else if (code == IOR || code == XOR) + { + if (value == 0) + { + if (!rtx_equal_p (dest, op1)) + emit_insn (gen_rtx_SET (dest, op1)); + return; + } + } } - /* If not, is it a load of a PC-relative address? */ - if (!TARGET_PCREL || GET_MODE (dest) != Pmode) - return false; + if (code == AND && mode == SImode + && !complement_final_p && !complement_op1_p && !complement_op2_p) + { + emit_insn (gen_andsi3 (dest, op1, op2)); + return; + } - if (!SYMBOL_REF_P (src) && !LABEL_REF_P (src) && GET_CODE (src) != CONST) - return false; + if (complement_op1_p) + op1 = gen_rtx_NOT (mode, op1); - enum insn_form iform = address_to_insn_form (src, Pmode, - NON_PREFIXED_DEFAULT); + if (complement_op2_p) + op2 = gen_rtx_NOT (mode, op2); - return (iform == INSN_FORM_PCREL_EXTERNAL || iform == INSN_FORM_PCREL_LOCAL); -} + /* For canonical RTL, if only one arm is inverted it is the first. */ + if (!complement_op1_p && complement_op2_p) + std::swap (op1, op2); -/* Whether the next instruction needs a 'p' prefix issued before the - instruction is printed out. */ -static bool prepend_p_to_next_insn; + bool_rtx = ((code == NOT) + ? gen_rtx_NOT (mode, op1) + : gen_rtx_fmt_ee (code, mode, op1, op2)); -/* Define FINAL_PRESCAN_INSN if some processing needs to be done before - outputting the assembler code. On the PowerPC, we remember if the current - insn is a prefixed insn where we need to emit a 'p' before the insn. + if (complement_final_p) + bool_rtx = gen_rtx_NOT (mode, bool_rtx); - In addition, if the insn is part of a PC-relative reference to an external - label optimization, this is recorded also. 
*/ -void -rs6000_final_prescan_insn (rtx_insn *insn, rtx [], int) -{ - prepend_p_to_next_insn = (get_attr_maybe_prefixed (insn) - == MAYBE_PREFIXED_YES - && get_attr_prefixed (insn) == PREFIXED_YES); - return; + emit_insn (gen_rtx_SET (dest, bool_rtx)); } -/* Define ASM_OUTPUT_OPCODE to do anything special before emitting an opcode. - We use it to emit a 'p' for prefixed insns that is set in - FINAL_PRESCAN_INSN. */ -void -rs6000_asm_output_opcode (FILE *stream) +/* Split a DImode AND/IOR/XOR with a constant on a 32-bit system. These + operations are split immediately during RTL generation to allow for more + optimizations of the AND/IOR/XOR. + + OPERANDS is an array containing the destination and two input operands. + CODE is the base operation (AND, IOR, XOR, NOT). + MODE is the machine mode. + If COMPLEMENT_FINAL_P is true, wrap the whole operation with NOT. + If COMPLEMENT_OP1_P is true, wrap operand1 with NOT. + If COMPLEMENT_OP2_P is true, wrap operand2 with NOT. + CLOBBER_REG is either NULL or a scratch register of type CC to allow + formation of the AND instructions. */ + +static void +rs6000_split_logical_di (rtx operands[3], + enum rtx_code code, + bool complement_final_p, + bool complement_op1_p, + bool complement_op2_p) { - if (prepend_p_to_next_insn) + const HOST_WIDE_INT lower_32bits = HOST_WIDE_INT_C(0xffffffff); + const HOST_WIDE_INT upper_32bits = ~ lower_32bits; + const HOST_WIDE_INT sign_bit = HOST_WIDE_INT_C(0x80000000); + enum hi_lo { hi = 0, lo = 1 }; + rtx op0_hi_lo[2], op1_hi_lo[2], op2_hi_lo[2]; + size_t i; + + op0_hi_lo[hi] = gen_highpart (SImode, operands[0]); + op1_hi_lo[hi] = gen_highpart (SImode, operands[1]); + op0_hi_lo[lo] = gen_lowpart (SImode, operands[0]); + op1_hi_lo[lo] = gen_lowpart (SImode, operands[1]); + + if (code == NOT) + op2_hi_lo[hi] = op2_hi_lo[lo] = NULL_RTX; + else { - fprintf (stream, "p"); + if (!CONST_INT_P (operands[2])) + { + op2_hi_lo[hi] = gen_highpart_mode (SImode, DImode, operands[2]); + op2_hi_lo[lo] = gen_lowpart (SImode, operands[2]); + } + else + { + HOST_WIDE_INT value = INTVAL (operands[2]); + HOST_WIDE_INT value_hi_lo[2]; - /* Reset the flag in the case where there are separate insn lines in the - sequence, so the 'p' is only emitted for the first line. This shows up - when we are doing the PCREL_OPT optimization, in that the label created - with %r would have a leading 'p' printed. */ - prepend_p_to_next_insn = false; - } + gcc_assert (!complement_final_p); + gcc_assert (!complement_op1_p); + gcc_assert (!complement_op2_p); - return; -} + value_hi_lo[hi] = value >> 32; + value_hi_lo[lo] = value & lower_32bits; -/* Emit the relocation to tie the next instruction to a previous instruction - that loads up an external address. This is used to do the PCREL_OPT - optimization. Note, the label is generated after the PLD of the got - pc-relative address to allow for the assembler to insert NOPs before the PLD - instruction. The operand is a constant integer that is the label - number. */ + for (i = 0; i < 2; i++) + { + HOST_WIDE_INT sub_value = value_hi_lo[i]; -void -output_pcrel_opt_reloc (rtx label_num) -{ - rtx operands[1] = { label_num }; - output_asm_insn (".reloc .Lpcrel%0-8,R_PPC64_PCREL_OPT,.-(.Lpcrel%0-8)", - operands); -} + if (sub_value & sign_bit) + sub_value |= upper_32bits; -/* Adjust the length of an INSN. LENGTH is the currently-computed length and - should be adjusted to reflect any required changes. 
This macro is used when - there is some systematic length adjustment required that would be difficult - to express in the length attribute. + op2_hi_lo[i] = GEN_INT (sub_value); - In the PowerPC, we use this to adjust the length of an instruction if one or - more prefixed instructions are generated, using the attribute - num_prefixed_insns. A prefixed instruction is 8 bytes instead of 4, but the - hardware requires that a prefied instruciton does not cross a 64-byte - boundary. This means the compiler has to assume the length of the first - prefixed instruction is 12 bytes instead of 8 bytes. Since the length is - already set for the non-prefixed instruction, we just need to udpate for the - difference. */ + /* If this is an AND instruction, check to see if we need to load + the value in a register. */ + if (code == AND && sub_value != -1 && sub_value != 0 + && !and_operand (op2_hi_lo[i], SImode)) + op2_hi_lo[i] = force_reg (SImode, op2_hi_lo[i]); + } + } + } -int -rs6000_adjust_insn_length (rtx_insn *insn, int length) -{ - if (TARGET_PREFIXED && NONJUMP_INSN_P (insn)) + for (i = 0; i < 2; i++) { - rtx pattern = PATTERN (insn); - if (GET_CODE (pattern) != USE && GET_CODE (pattern) != CLOBBER - && get_attr_prefixed (insn) == PREFIXED_YES) + /* Split large IOR/XOR operations. */ + if ((code == IOR || code == XOR) + && CONST_INT_P (op2_hi_lo[i]) + && !complement_final_p + && !complement_op1_p + && !complement_op2_p + && !logical_const_operand (op2_hi_lo[i], SImode)) { - int num_prefixed = get_attr_max_prefixed_insns (insn); - length += 4 * (num_prefixed + 1); + HOST_WIDE_INT value = INTVAL (op2_hi_lo[i]); + HOST_WIDE_INT hi_16bits = value & HOST_WIDE_INT_C(0xffff0000); + HOST_WIDE_INT lo_16bits = value & HOST_WIDE_INT_C(0x0000ffff); + rtx tmp = gen_reg_rtx (SImode); + + /* Make sure the constant is sign extended. */ + if ((hi_16bits & sign_bit) != 0) + hi_16bits |= upper_32bits; + + rs6000_split_logical_inner (tmp, op1_hi_lo[i], GEN_INT (hi_16bits), + code, SImode, false, false, false); + + rs6000_split_logical_inner (op0_hi_lo[i], tmp, GEN_INT (lo_16bits), + code, SImode, false, false, false); } + else + rs6000_split_logical_inner (op0_hi_lo[i], op1_hi_lo[i], op2_hi_lo[i], + code, SImode, complement_final_p, + complement_op1_p, complement_op2_p); } - return length; + return; } - -#ifdef HAVE_GAS_HIDDEN -# define USE_HIDDEN_LINKONCE 1 -#else -# define USE_HIDDEN_LINKONCE 0 -#endif +/* Split the insns that make up boolean operations operating on multiple GPR + registers. The boolean MD patterns ensure that the inputs either are + exactly the same as the output registers, or there is no overlap. -/* Fills in the label name that should be used for a 476 link stack thunk. */ + OPERANDS is an array containing the destination and two input operands. + CODE is the base operation (AND, IOR, XOR, NOT). + If COMPLEMENT_FINAL_P is true, wrap the whole operation with NOT. + If COMPLEMENT_OP1_P is true, wrap operand1 with NOT. + If COMPLEMENT_OP2_P is true, wrap operand2 with NOT. 
*/ void -get_ppc476_thunk_name (char name[32]) +rs6000_split_logical (rtx operands[3], + enum rtx_code code, + bool complement_final_p, + bool complement_op1_p, + bool complement_op2_p) { - gcc_assert (TARGET_LINK_STACK); - - if (USE_HIDDEN_LINKONCE) - sprintf (name, "__ppc476.get_thunk"); - else - ASM_GENERATE_INTERNAL_LABEL (name, "LPPC476_", 0); -} + machine_mode mode = GET_MODE (operands[0]); + machine_mode sub_mode; + rtx op0, op1, op2; + int sub_size, regno0, regno1, nregs, i; -/* This function emits the simple thunk routine that is used to preserve - the link stack on the 476 cpu. */ + /* If this is DImode, use the specialized version that can run before + register allocation. */ + if (mode == DImode && !TARGET_POWERPC64) + { + rs6000_split_logical_di (operands, code, complement_final_p, + complement_op1_p, complement_op2_p); + return; + } -static void rs6000_code_end (void) ATTRIBUTE_UNUSED; -static void -rs6000_code_end (void) -{ - char name[32]; - tree decl; + op0 = operands[0]; + op1 = operands[1]; + op2 = (code == NOT) ? NULL_RTX : operands[2]; + sub_mode = (TARGET_POWERPC64) ? DImode : SImode; + sub_size = GET_MODE_SIZE (sub_mode); + regno0 = REGNO (op0); + regno1 = REGNO (op1); - if (!TARGET_LINK_STACK) - return; + gcc_assert (reload_completed); + gcc_assert (IN_RANGE (regno0, FIRST_GPR_REGNO, LAST_GPR_REGNO)); + gcc_assert (IN_RANGE (regno1, FIRST_GPR_REGNO, LAST_GPR_REGNO)); - get_ppc476_thunk_name (name); + nregs = rs6000_hard_regno_nregs[(int)mode][regno0]; + gcc_assert (nregs > 1); - decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL, get_identifier (name), - build_function_type_list (void_type_node, NULL_TREE)); - DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL, - NULL_TREE, void_type_node); - TREE_PUBLIC (decl) = 1; - TREE_STATIC (decl) = 1; + if (op2 && REG_P (op2)) + gcc_assert (IN_RANGE (REGNO (op2), FIRST_GPR_REGNO, LAST_GPR_REGNO)); -#if RS6000_WEAK - if (USE_HIDDEN_LINKONCE && !TARGET_XCOFF) - { - cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl)); - targetm.asm_out.unique_section (decl, 0); - switch_to_section (get_named_section (decl, NULL, 0)); - DECL_WEAK (decl) = 1; - ASM_WEAKEN_DECL (asm_out_file, decl, name, 0); - targetm.asm_out.globalize_label (asm_out_file, name); - targetm.asm_out.assemble_visibility (decl, VISIBILITY_HIDDEN); - ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl); - } - else -#endif + for (i = 0; i < nregs; i++) { - switch_to_section (text_section); - ASM_OUTPUT_LABEL (asm_out_file, name); - } - - DECL_INITIAL (decl) = make_node (BLOCK); - current_function_decl = decl; - allocate_struct_function (decl, false); - init_function_start (decl); - first_function_block_is_cold = false; - /* Make sure unwind info is emitted for the thunk if needed. */ - final_start_function (emit_barrier (), asm_out_file, 1); + int offset = i * sub_size; + rtx sub_op0 = simplify_subreg (sub_mode, op0, mode, offset); + rtx sub_op1 = simplify_subreg (sub_mode, op1, mode, offset); + rtx sub_op2 = ((code == NOT) + ? NULL_RTX + : simplify_subreg (sub_mode, op2, mode, offset)); - fputs ("\tblr\n", asm_out_file); + rs6000_split_logical_inner (sub_op0, sub_op1, sub_op2, code, sub_mode, + complement_final_p, complement_op1_p, + complement_op2_p); + } - final_end_function (); - init_insn_lengths (); - free_after_compilation (cfun); - set_cfun (NULL); - current_function_decl = NULL; + return; } -/* Add r30 to hard reg set if the prologue sets it up and it is not - pic_offset_table_rtx. */ +/* Emit instructions to move SRC to DST. 
Called by splitters for + multi-register moves. It will emit at most one instruction for + each register that is accessed; that is, it won't emit li/lis pairs + (or equivalent for 64-bit code). One of SRC or DST must be a hard + register. */ -static void -rs6000_set_up_by_prologue (struct hard_reg_set_container *set) +void +rs6000_split_multireg_move (rtx dst, rtx src) { - if (!TARGET_SINGLE_PIC_BASE - && TARGET_TOC - && TARGET_MINIMAL_TOC - && !constant_pool_empty_p ()) - add_to_hard_reg_set (&set->set, Pmode, RS6000_PIC_OFFSET_TABLE_REGNUM); - if (cfun->machine->split_stack_argp_used) - add_to_hard_reg_set (&set->set, Pmode, 12); + /* The register number of the first register being moved. */ + int reg; + /* The mode that is to be moved. */ + machine_mode mode; + /* The mode that the move is being done in, and its size. */ + machine_mode reg_mode; + int reg_mode_size; + /* The number of registers that will be moved. */ + int nregs; - /* Make sure the hard reg set doesn't include r2, which was possibly added - via PIC_OFFSET_TABLE_REGNUM. */ - if (TARGET_TOC) - remove_from_hard_reg_set (&set->set, Pmode, TOC_REGNUM); -} + reg = REG_P (dst) ? REGNO (dst) : REGNO (src); + mode = GET_MODE (dst); + nregs = hard_regno_nregs (reg, mode); - -/* Helper function for rs6000_split_logical to emit a logical instruction after - spliting the operation to single GPR registers. + /* If we have a vector quad register for MMA, and this is a load or store, + see if we can use vector paired load/stores. */ + if (mode == XOmode && TARGET_MMA + && (MEM_P (dst) || MEM_P (src))) + { + reg_mode = OOmode; + nregs /= 2; + } + /* If we have a vector pair/quad mode, split it into two/four separate + vectors. */ + else if (mode == OOmode || mode == XOmode) + reg_mode = V1TImode; + else if (FP_REGNO_P (reg)) + reg_mode = DECIMAL_FLOAT_MODE_P (mode) ? DDmode : + (TARGET_HARD_FLOAT ? DFmode : SFmode); + else if (ALTIVEC_REGNO_P (reg)) + reg_mode = V16QImode; + else + reg_mode = word_mode; + reg_mode_size = GET_MODE_SIZE (reg_mode); - DEST is the destination register. - OP1 and OP2 are the input source registers. - CODE is the base operation (AND, IOR, XOR, NOT). - MODE is the machine mode. - If COMPLEMENT_FINAL_P is true, wrap the whole operation with NOT. - If COMPLEMENT_OP1_P is true, wrap operand1 with NOT. - If COMPLEMENT_OP2_P is true, wrap operand2 with NOT. */ + gcc_assert (reg_mode_size * nregs == GET_MODE_SIZE (mode)); -static void -rs6000_split_logical_inner (rtx dest, - rtx op1, - rtx op2, - enum rtx_code code, - machine_mode mode, - bool complement_final_p, - bool complement_op1_p, - bool complement_op2_p) -{ - rtx bool_rtx; + /* TDmode residing in FP registers is special, since the ISA requires that + the lower-numbered word of a register pair is always the most significant + word, even in little-endian mode. This does not match the usual subreg + semantics, so we cannnot use simplify_gen_subreg in those cases. Access + the appropriate constituent registers "by hand" in little-endian mode. - /* Optimize AND of 0/0xffffffff and IOR/XOR of 0. */ - if (op2 && CONST_INT_P (op2) - && (mode == SImode || (mode == DImode && TARGET_POWERPC64)) - && !complement_final_p && !complement_op1_p && !complement_op2_p) + Note we do not need to check for destructive overlap here since TDmode + can only reside in even/odd register pairs. 
*/ + if (FP_REGNO_P (reg) && DECIMAL_FLOAT_MODE_P (mode) && !BYTES_BIG_ENDIAN) { - HOST_WIDE_INT mask = GET_MODE_MASK (mode); - HOST_WIDE_INT value = INTVAL (op2) & mask; + rtx p_src, p_dst; + int i; - /* Optimize AND of 0 to just set 0. Optimize AND of -1 to be a move. */ - if (code == AND) + for (i = 0; i < nregs; i++) { - if (value == 0) - { - emit_insn (gen_rtx_SET (dest, const0_rtx)); - return; - } + if (REG_P (src) && FP_REGNO_P (REGNO (src))) + p_src = gen_rtx_REG (reg_mode, REGNO (src) + nregs - 1 - i); + else + p_src = simplify_gen_subreg (reg_mode, src, mode, + i * reg_mode_size); - else if (value == mask) - { - if (!rtx_equal_p (dest, op1)) - emit_insn (gen_rtx_SET (dest, op1)); - return; - } - } + if (REG_P (dst) && FP_REGNO_P (REGNO (dst))) + p_dst = gen_rtx_REG (reg_mode, REGNO (dst) + nregs - 1 - i); + else + p_dst = simplify_gen_subreg (reg_mode, dst, mode, + i * reg_mode_size); - /* Optimize IOR/XOR of 0 to be a simple move. Split large operations - into separate ORI/ORIS or XORI/XORIS instrucitons. */ - else if (code == IOR || code == XOR) - { - if (value == 0) - { - if (!rtx_equal_p (dest, op1)) - emit_insn (gen_rtx_SET (dest, op1)); - return; - } + emit_insn (gen_rtx_SET (p_dst, p_src)); } - } - if (code == AND && mode == SImode - && !complement_final_p && !complement_op1_p && !complement_op2_p) - { - emit_insn (gen_andsi3 (dest, op1, op2)); return; } - if (complement_op1_p) - op1 = gen_rtx_NOT (mode, op1); - - if (complement_op2_p) - op2 = gen_rtx_NOT (mode, op2); - - /* For canonical RTL, if only one arm is inverted it is the first. */ - if (!complement_op1_p && complement_op2_p) - std::swap (op1, op2); - - bool_rtx = ((code == NOT) - ? gen_rtx_NOT (mode, op1) - : gen_rtx_fmt_ee (code, mode, op1, op2)); + /* The __vector_pair and __vector_quad modes are multi-register + modes, so if we have to load or store the registers, we have to be + careful to properly swap them if we're in little endian mode + below. This means the last register gets the first memory + location. We also need to be careful of using the right register + numbers if we are splitting XO to OO. */ + if (mode == OOmode || mode == XOmode) + { + nregs = hard_regno_nregs (reg, mode); + int reg_mode_nregs = hard_regno_nregs (reg, reg_mode); + if (MEM_P (dst)) + { + unsigned offset = 0; + unsigned size = GET_MODE_SIZE (reg_mode); - if (complement_final_p) - bool_rtx = gen_rtx_NOT (mode, bool_rtx); + /* If we are reading an accumulator register, we have to + deprime it before we can access it. */ + if (TARGET_MMA + && GET_MODE (src) == XOmode && FP_REGNO_P (REGNO (src))) + emit_insn (gen_mma_xxmfacc (src, src)); - emit_insn (gen_rtx_SET (dest, bool_rtx)); -} + for (int i = 0; i < nregs; i += reg_mode_nregs) + { + unsigned subreg = + (WORDS_BIG_ENDIAN) ? i : (nregs - reg_mode_nregs - i); + rtx dst2 = adjust_address (dst, reg_mode, offset); + rtx src2 = gen_rtx_REG (reg_mode, reg + subreg); + offset += size; + emit_insn (gen_rtx_SET (dst2, src2)); + } -/* Split a DImode AND/IOR/XOR with a constant on a 32-bit system. These - operations are split immediately during RTL generation to allow for more - optimizations of the AND/IOR/XOR. + return; + } - OPERANDS is an array containing the destination and two input operands. - CODE is the base operation (AND, IOR, XOR, NOT). - MODE is the machine mode. - If COMPLEMENT_FINAL_P is true, wrap the whole operation with NOT. - If COMPLEMENT_OP1_P is true, wrap operand1 with NOT. - If COMPLEMENT_OP2_P is true, wrap operand2 with NOT. 
- CLOBBER_REG is either NULL or a scratch register of type CC to allow - formation of the AND instructions. */ + if (MEM_P (src)) + { + unsigned offset = 0; + unsigned size = GET_MODE_SIZE (reg_mode); -static void -rs6000_split_logical_di (rtx operands[3], - enum rtx_code code, - bool complement_final_p, - bool complement_op1_p, - bool complement_op2_p) -{ - const HOST_WIDE_INT lower_32bits = HOST_WIDE_INT_C(0xffffffff); - const HOST_WIDE_INT upper_32bits = ~ lower_32bits; - const HOST_WIDE_INT sign_bit = HOST_WIDE_INT_C(0x80000000); - enum hi_lo { hi = 0, lo = 1 }; - rtx op0_hi_lo[2], op1_hi_lo[2], op2_hi_lo[2]; - size_t i; + for (int i = 0; i < nregs; i += reg_mode_nregs) + { + unsigned subreg = + (WORDS_BIG_ENDIAN) ? i : (nregs - reg_mode_nregs - i); + rtx dst2 = gen_rtx_REG (reg_mode, reg + subreg); + rtx src2 = adjust_address (src, reg_mode, offset); + offset += size; + emit_insn (gen_rtx_SET (dst2, src2)); + } - op0_hi_lo[hi] = gen_highpart (SImode, operands[0]); - op1_hi_lo[hi] = gen_highpart (SImode, operands[1]); - op0_hi_lo[lo] = gen_lowpart (SImode, operands[0]); - op1_hi_lo[lo] = gen_lowpart (SImode, operands[1]); + /* If we are writing an accumulator register, we have to + prime it after we've written it. */ + if (TARGET_MMA + && GET_MODE (dst) == XOmode && FP_REGNO_P (REGNO (dst))) + emit_insn (gen_mma_xxmtacc (dst, dst)); - if (code == NOT) - op2_hi_lo[hi] = op2_hi_lo[lo] = NULL_RTX; - else - { - if (!CONST_INT_P (operands[2])) - { - op2_hi_lo[hi] = gen_highpart_mode (SImode, DImode, operands[2]); - op2_hi_lo[lo] = gen_lowpart (SImode, operands[2]); + return; } - else + + if (GET_CODE (src) == UNSPEC) { - HOST_WIDE_INT value = INTVAL (operands[2]); - HOST_WIDE_INT value_hi_lo[2]; + gcc_assert (XINT (src, 1) == UNSPEC_MMA_ASSEMBLE); + gcc_assert (REG_P (dst)); + if (GET_MODE (src) == XOmode) + gcc_assert (FP_REGNO_P (REGNO (dst))); + if (GET_MODE (src) == OOmode) + gcc_assert (VSX_REGNO_P (REGNO (dst))); - gcc_assert (!complement_final_p); - gcc_assert (!complement_op1_p); - gcc_assert (!complement_op2_p); + reg_mode = GET_MODE (XVECEXP (src, 0, 0)); + int nvecs = XVECLEN (src, 0); + for (int i = 0; i < nvecs; i++) + { + int index = WORDS_BIG_ENDIAN ? i : nvecs - 1 - i; + rtx dst_i = gen_rtx_REG (reg_mode, reg + index); + emit_insn (gen_rtx_SET (dst_i, XVECEXP (src, 0, i))); + } - value_hi_lo[hi] = value >> 32; - value_hi_lo[lo] = value & lower_32bits; + /* We are writing an accumulator register, so we have to + prime it after we've written it. */ + if (GET_MODE (src) == XOmode) + emit_insn (gen_mma_xxmtacc (dst, dst)); - for (i = 0; i < 2; i++) - { - HOST_WIDE_INT sub_value = value_hi_lo[i]; + return; + } - if (sub_value & sign_bit) - sub_value |= upper_32bits; + /* Register -> register moves can use common code. */ + } - op2_hi_lo[i] = GEN_INT (sub_value); + if (REG_P (src) && REG_P (dst) && (REGNO (src) < REGNO (dst))) + { + /* If we are reading an accumulator register, we have to + deprime it before we can access it. */ + if (TARGET_MMA + && GET_MODE (src) == XOmode && FP_REGNO_P (REGNO (src))) + emit_insn (gen_mma_xxmfacc (src, src)); - /* If this is an AND instruction, check to see if we need to load - the value in a register. */ - if (code == AND && sub_value != -1 && sub_value != 0 - && !and_operand (op2_hi_lo[i], SImode)) - op2_hi_lo[i] = force_reg (SImode, op2_hi_lo[i]); + /* Move register range backwards, if we might have destructive + overlap. */ + int i; + /* XO/OO are opaque so cannot use subregs. 
*/ + if (mode == OOmode || mode == XOmode ) + { + for (i = nregs - 1; i >= 0; i--) + { + rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + i); + rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + i); + emit_insn (gen_rtx_SET (dst_i, src_i)); } } - } + else + { + for (i = nregs - 1; i >= 0; i--) + emit_insn (gen_rtx_SET (simplify_gen_subreg (reg_mode, dst, mode, + i * reg_mode_size), + simplify_gen_subreg (reg_mode, src, mode, + i * reg_mode_size))); + } - for (i = 0; i < 2; i++) + /* If we are writing an accumulator register, we have to + prime it after we've written it. */ + if (TARGET_MMA + && GET_MODE (dst) == XOmode && FP_REGNO_P (REGNO (dst))) + emit_insn (gen_mma_xxmtacc (dst, dst)); + } + else { - /* Split large IOR/XOR operations. */ - if ((code == IOR || code == XOR) - && CONST_INT_P (op2_hi_lo[i]) - && !complement_final_p - && !complement_op1_p - && !complement_op2_p - && !logical_const_operand (op2_hi_lo[i], SImode)) + int i; + int j = -1; + bool used_update = false; + rtx restore_basereg = NULL_RTX; + + if (MEM_P (src) && INT_REGNO_P (reg)) { - HOST_WIDE_INT value = INTVAL (op2_hi_lo[i]); - HOST_WIDE_INT hi_16bits = value & HOST_WIDE_INT_C(0xffff0000); - HOST_WIDE_INT lo_16bits = value & HOST_WIDE_INT_C(0x0000ffff); - rtx tmp = gen_reg_rtx (SImode); + rtx breg; - /* Make sure the constant is sign extended. */ - if ((hi_16bits & sign_bit) != 0) - hi_16bits |= upper_32bits; + if (GET_CODE (XEXP (src, 0)) == PRE_INC + || GET_CODE (XEXP (src, 0)) == PRE_DEC) + { + rtx delta_rtx; + breg = XEXP (XEXP (src, 0), 0); + delta_rtx = (GET_CODE (XEXP (src, 0)) == PRE_INC + ? GEN_INT (GET_MODE_SIZE (GET_MODE (src))) + : GEN_INT (-GET_MODE_SIZE (GET_MODE (src)))); + emit_insn (gen_add3_insn (breg, breg, delta_rtx)); + src = replace_equiv_address (src, breg); + } + else if (! rs6000_offsettable_memref_p (src, reg_mode, true)) + { + if (GET_CODE (XEXP (src, 0)) == PRE_MODIFY) + { + rtx basereg = XEXP (XEXP (src, 0), 0); + if (TARGET_UPDATE) + { + rtx ndst = simplify_gen_subreg (reg_mode, dst, mode, 0); + emit_insn (gen_rtx_SET (ndst, + gen_rtx_MEM (reg_mode, + XEXP (src, 0)))); + used_update = true; + } + else + emit_insn (gen_rtx_SET (basereg, + XEXP (XEXP (src, 0), 1))); + src = replace_equiv_address (src, basereg); + } + else + { + rtx basereg = gen_rtx_REG (Pmode, reg); + emit_insn (gen_rtx_SET (basereg, XEXP (src, 0))); + src = replace_equiv_address (src, basereg); + } + } - rs6000_split_logical_inner (tmp, op1_hi_lo[i], GEN_INT (hi_16bits), - code, SImode, false, false, false); + breg = XEXP (src, 0); + if (GET_CODE (breg) == PLUS || GET_CODE (breg) == LO_SUM) + breg = XEXP (breg, 0); - rs6000_split_logical_inner (op0_hi_lo[i], tmp, GEN_INT (lo_16bits), - code, SImode, false, false, false); + /* If the base register we are using to address memory is + also a destination reg, then change that register last. */ + if (REG_P (breg) + && REGNO (breg) >= REGNO (dst) + && REGNO (breg) < REGNO (dst) + nregs) + j = REGNO (breg) - REGNO (dst); } - else - rs6000_split_logical_inner (op0_hi_lo[i], op1_hi_lo[i], op2_hi_lo[i], - code, SImode, complement_final_p, - complement_op1_p, complement_op2_p); - } - - return; -} - -/* Split the insns that make up boolean operations operating on multiple GPR - registers. The boolean MD patterns ensure that the inputs either are - exactly the same as the output registers, or there is no overlap. + else if (MEM_P (dst) && INT_REGNO_P (reg)) + { + rtx breg; - OPERANDS is an array containing the destination and two input operands. 
- CODE is the base operation (AND, IOR, XOR, NOT). - If COMPLEMENT_FINAL_P is true, wrap the whole operation with NOT. - If COMPLEMENT_OP1_P is true, wrap operand1 with NOT. - If COMPLEMENT_OP2_P is true, wrap operand2 with NOT. */ + if (GET_CODE (XEXP (dst, 0)) == PRE_INC + || GET_CODE (XEXP (dst, 0)) == PRE_DEC) + { + rtx delta_rtx; + breg = XEXP (XEXP (dst, 0), 0); + delta_rtx = (GET_CODE (XEXP (dst, 0)) == PRE_INC + ? GEN_INT (GET_MODE_SIZE (GET_MODE (dst))) + : GEN_INT (-GET_MODE_SIZE (GET_MODE (dst)))); -void -rs6000_split_logical (rtx operands[3], - enum rtx_code code, - bool complement_final_p, - bool complement_op1_p, - bool complement_op2_p) -{ - machine_mode mode = GET_MODE (operands[0]); - machine_mode sub_mode; - rtx op0, op1, op2; - int sub_size, regno0, regno1, nregs, i; + /* We have to update the breg before doing the store. + Use store with update, if available. */ - /* If this is DImode, use the specialized version that can run before - register allocation. */ - if (mode == DImode && !TARGET_POWERPC64) - { - rs6000_split_logical_di (operands, code, complement_final_p, - complement_op1_p, complement_op2_p); - return; - } + if (TARGET_UPDATE) + { + rtx nsrc = simplify_gen_subreg (reg_mode, src, mode, 0); + emit_insn (TARGET_32BIT + ? (TARGET_POWERPC64 + ? gen_movdi_si_update (breg, breg, delta_rtx, nsrc) + : gen_movsi_si_update (breg, breg, delta_rtx, nsrc)) + : gen_movdi_di_update (breg, breg, delta_rtx, nsrc)); + used_update = true; + } + else + emit_insn (gen_add3_insn (breg, breg, delta_rtx)); + dst = replace_equiv_address (dst, breg); + } + else if (!rs6000_offsettable_memref_p (dst, reg_mode, true) + && GET_CODE (XEXP (dst, 0)) != LO_SUM) + { + if (GET_CODE (XEXP (dst, 0)) == PRE_MODIFY) + { + rtx basereg = XEXP (XEXP (dst, 0), 0); + if (TARGET_UPDATE) + { + rtx nsrc = simplify_gen_subreg (reg_mode, src, mode, 0); + emit_insn (gen_rtx_SET (gen_rtx_MEM (reg_mode, + XEXP (dst, 0)), + nsrc)); + used_update = true; + } + else + emit_insn (gen_rtx_SET (basereg, + XEXP (XEXP (dst, 0), 1))); + dst = replace_equiv_address (dst, basereg); + } + else + { + rtx basereg = XEXP (XEXP (dst, 0), 0); + rtx offsetreg = XEXP (XEXP (dst, 0), 1); + gcc_assert (GET_CODE (XEXP (dst, 0)) == PLUS + && REG_P (basereg) + && REG_P (offsetreg) + && REGNO (basereg) != REGNO (offsetreg)); + if (REGNO (basereg) == 0) + { + rtx tmp = offsetreg; + offsetreg = basereg; + basereg = tmp; + } + emit_insn (gen_add3_insn (basereg, basereg, offsetreg)); + restore_basereg = gen_sub3_insn (basereg, basereg, offsetreg); + dst = replace_equiv_address (dst, basereg); + } + } + else if (GET_CODE (XEXP (dst, 0)) != LO_SUM) + gcc_assert (rs6000_offsettable_memref_p (dst, reg_mode, true)); + } - op0 = operands[0]; - op1 = operands[1]; - op2 = (code == NOT) ? NULL_RTX : operands[2]; - sub_mode = (TARGET_POWERPC64) ? DImode : SImode; - sub_size = GET_MODE_SIZE (sub_mode); - regno0 = REGNO (op0); - regno1 = REGNO (op1); + /* If we are reading an accumulator register, we have to + deprime it before we can access it. */ + if (TARGET_MMA && REG_P (src) + && GET_MODE (src) == XOmode && FP_REGNO_P (REGNO (src))) + emit_insn (gen_mma_xxmfacc (src, src)); - gcc_assert (reload_completed); - gcc_assert (IN_RANGE (regno0, FIRST_GPR_REGNO, LAST_GPR_REGNO)); - gcc_assert (IN_RANGE (regno1, FIRST_GPR_REGNO, LAST_GPR_REGNO)); + for (i = 0; i < nregs; i++) + { + /* Calculate index to next subword. 
*/ + ++j; + if (j == nregs) + j = 0; - nregs = rs6000_hard_regno_nregs[(int)mode][regno0]; - gcc_assert (nregs > 1); + /* If compiler already emitted move of first word by + store with update, no need to do anything. */ + if (j == 0 && used_update) + continue; - if (op2 && REG_P (op2)) - gcc_assert (IN_RANGE (REGNO (op2), FIRST_GPR_REGNO, LAST_GPR_REGNO)); + /* XO/OO are opaque so cannot use subregs. */ + if (mode == OOmode || mode == XOmode ) + { + rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + j); + rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + j); + emit_insn (gen_rtx_SET (dst_i, src_i)); + } + else + emit_insn (gen_rtx_SET (simplify_gen_subreg (reg_mode, dst, mode, + j * reg_mode_size), + simplify_gen_subreg (reg_mode, src, mode, + j * reg_mode_size))); + } - for (i = 0; i < nregs; i++) - { - int offset = i * sub_size; - rtx sub_op0 = simplify_subreg (sub_mode, op0, mode, offset); - rtx sub_op1 = simplify_subreg (sub_mode, op1, mode, offset); - rtx sub_op2 = ((code == NOT) - ? NULL_RTX - : simplify_subreg (sub_mode, op2, mode, offset)); + /* If we are writing an accumulator register, we have to + prime it after we've written it. */ + if (TARGET_MMA && REG_P (dst) + && GET_MODE (dst) == XOmode && FP_REGNO_P (REGNO (dst))) + emit_insn (gen_mma_xxmtacc (dst, dst)); - rs6000_split_logical_inner (sub_op0, sub_op1, sub_op2, code, sub_mode, - complement_final_p, complement_op1_p, - complement_op2_p); + if (restore_basereg != NULL_RTX) + emit_insn (restore_basereg); } - - return; } - /* Return true if the peephole2 can combine a load involving a combination of an addis instruction and a load with an offset that can be fused together on -- cgit v1.1 From 69feb7601e86274fa9abbfb420b00c8adf947e7b Mon Sep 17 00:00:00 2001 From: Peter Bergner Date: Wed, 14 Jul 2021 18:27:02 -0500 Subject: rs6000: Generate an lxvp instead of two adjacent lxv instructions The MMA build built-ins currently use individual lxv instructions to load up the registers of a __vector_pair or __vector_quad. If the memory addresses of the built-in operands are to adjacent locations, then we can use an lxvp in some cases to load up two registers at once. The patch below adds support for checking whether memory addresses are adjacent and emitting an lxvp instead of two lxv instructions. 2021-07-14 Peter Bergner gcc/ * config/rs6000/rs6000.c (adjacent_mem_locations): Return the lower addressed memory rtx, if any. (rs6000_split_multireg_move): Fix code formatting. Handle MMA build built-ins with operands in adjacent memory locations. gcc/testsuite/ * gcc.target/powerpc/mma-builtin-9.c: New test. --- gcc/config/rs6000/rs6000.c | 82 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 64 insertions(+), 18 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 1d27bb8..ce29b37 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -18051,23 +18051,29 @@ get_memref_parts (rtx mem, rtx *base, HOST_WIDE_INT *offset, return true; } -/* The function returns true if the target storage location of - mem1 is adjacent to the target storage location of mem2 */ -/* Return 1 if memory locations are adjacent. */ +/* If the target storage locations of arguments MEM1 and MEM2 are + adjacent, then return the argument that has the lower address. + Otherwise, return NULL_RTX. 
*/ -static bool +static rtx adjacent_mem_locations (rtx mem1, rtx mem2) { rtx reg1, reg2; HOST_WIDE_INT off1, size1, off2, size2; - if (get_memref_parts (mem1, ®1, &off1, &size1) - && get_memref_parts (mem2, ®2, &off2, &size2)) - return ((REGNO (reg1) == REGNO (reg2)) - && ((off1 + size1 == off2) - || (off2 + size2 == off1))); + if (MEM_P (mem1) + && MEM_P (mem2) + && get_memref_parts (mem1, ®1, &off1, &size1) + && get_memref_parts (mem2, ®2, &off2, &size2) + && REGNO (reg1) == REGNO (reg2)) + { + if (off1 + size1 == off2) + return mem1; + else if (off2 + size2 == off1) + return mem2; + } - return false; + return NULL_RTX; } /* This function returns true if it can be determined that the two MEM @@ -26708,8 +26714,8 @@ rs6000_split_multireg_move (rtx dst, rtx src) for (int i = 0; i < nregs; i += reg_mode_nregs) { - unsigned subreg = - (WORDS_BIG_ENDIAN) ? i : (nregs - reg_mode_nregs - i); + unsigned subreg + = WORDS_BIG_ENDIAN ? i : (nregs - reg_mode_nregs - i); rtx dst2 = adjust_address (dst, reg_mode, offset); rtx src2 = gen_rtx_REG (reg_mode, reg + subreg); offset += size; @@ -26726,8 +26732,8 @@ rs6000_split_multireg_move (rtx dst, rtx src) for (int i = 0; i < nregs; i += reg_mode_nregs) { - unsigned subreg = - (WORDS_BIG_ENDIAN) ? i : (nregs - reg_mode_nregs - i); + unsigned subreg + = WORDS_BIG_ENDIAN ? i : (nregs - reg_mode_nregs - i); rtx dst2 = gen_rtx_REG (reg_mode, reg + subreg); rtx src2 = adjust_address (src, reg_mode, offset); offset += size; @@ -26752,13 +26758,53 @@ rs6000_split_multireg_move (rtx dst, rtx src) if (GET_MODE (src) == OOmode) gcc_assert (VSX_REGNO_P (REGNO (dst))); - reg_mode = GET_MODE (XVECEXP (src, 0, 0)); int nvecs = XVECLEN (src, 0); for (int i = 0; i < nvecs; i++) { - int index = WORDS_BIG_ENDIAN ? i : nvecs - 1 - i; - rtx dst_i = gen_rtx_REG (reg_mode, reg + index); - emit_insn (gen_rtx_SET (dst_i, XVECEXP (src, 0, i))); + rtx op; + int regno = reg + i; + + if (WORDS_BIG_ENDIAN) + { + op = XVECEXP (src, 0, i); + + /* If we are loading an even VSX register and the memory location + is adjacent to the next register's memory location (if any), + then we can load them both with one LXVP instruction. */ + if ((regno & 1) == 0) + { + rtx op2 = XVECEXP (src, 0, i + 1); + if (adjacent_mem_locations (op, op2) == op) + { + op = adjust_address (op, OOmode, 0); + /* Skip the next register, since we're going to + load it together with this register. */ + i++; + } + } + } + else + { + op = XVECEXP (src, 0, nvecs - i - 1); + + /* If we are loading an even VSX register and the memory location + is adjacent to the next register's memory location (if any), + then we can load them both with one LXVP instruction. */ + if ((regno & 1) == 0) + { + rtx op2 = XVECEXP (src, 0, nvecs - i - 2); + if (adjacent_mem_locations (op2, op) == op2) + { + op = adjust_address (op2, OOmode, 0); + /* Skip the next register, since we're going to + load it together with this register. */ + i++; + } + } + } + + rtx dst_i = gen_rtx_REG (GET_MODE (op), regno); + emit_insn (gen_rtx_SET (dst_i, op)); } /* We are writing an accumulator register, so we have to -- cgit v1.1 From 8d76ff99220c7aa428516e93998457dbe299f037 Mon Sep 17 00:00:00 2001 From: Trevor Saunders Date: Mon, 12 Jul 2021 02:55:17 -0400 Subject: pass location to md_asm_adjust So the hook can use it as the location of diagnostics. gcc/ChangeLog: * cfgexpand.c (expand_asm_loc): Adjust. (expand_asm_stmt): Likewise. * config/arm/aarch-common-protos.h (arm_md_asm_adjust): Likewise. * config/arm/aarch-common.c (arm_md_asm_adjust): Likewise. 
* config/arm/arm.c (thumb1_md_asm_adjust): Likewise. * config/avr/avr.c (avr_md_asm_adjust): Likewise. * config/cris/cris.c (cris_md_asm_adjust): Likewise. * config/i386/i386.c (ix86_md_asm_adjust): Likewise. * config/mn10300/mn10300.c (mn10300_md_asm_adjust): Likewise. * config/nds32/nds32.c (nds32_md_asm_adjust): Likewise. * config/pdp11/pdp11.c (pdp11_md_asm_adjust): Likewise. * config/rs6000/rs6000.c (rs6000_md_asm_adjust): Likewise. * config/s390/s390.c (s390_md_asm_adjust): Likewise. * config/vax/vax.c (vax_md_asm_adjust): Likewise. * config/visium/visium.c (visium_md_asm_adjust): Likewise. * doc/tm.texi: Regenerate. * target.def: Add location argument to md_asm_adjust. Signed-off-by: Trevor Saunders --- gcc/config/arm/aarch-common-protos.h | 3 ++- gcc/config/arm/aarch-common.c | 8 ++++---- gcc/config/arm/arm.c | 4 ++-- gcc/config/avr/avr.c | 3 ++- gcc/config/cris/cris.c | 4 ++-- gcc/config/i386/i386.c | 8 ++++---- gcc/config/mn10300/mn10300.c | 2 +- gcc/config/nds32/nds32.c | 3 ++- gcc/config/pdp11/pdp11.c | 4 ++-- gcc/config/rs6000/rs6000.c | 2 +- gcc/config/s390/s390.c | 2 +- gcc/config/vax/vax.c | 5 +++-- gcc/config/visium/visium.c | 4 ++-- 13 files changed, 28 insertions(+), 24 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/aarch-common-protos.h b/gcc/config/arm/aarch-common-protos.h index b6171e8..6be5fb1 100644 --- a/gcc/config/arm/aarch-common-protos.h +++ b/gcc/config/arm/aarch-common-protos.h @@ -147,6 +147,7 @@ struct cpu_cost_table rtx_insn *arm_md_asm_adjust (vec &outputs, vec & /*inputs*/, vec & /*input_modes*/, vec &constraints, - vec &clobbers, HARD_REG_SET &clobbered_regs); + vec &clobbers, HARD_REG_SET &clobbered_regs, + location_t loc); #endif /* GCC_AARCH_COMMON_PROTOS_H */ diff --git a/gcc/config/arm/aarch-common.c b/gcc/config/arm/aarch-common.c index 0dbdc56..67343fe 100644 --- a/gcc/config/arm/aarch-common.c +++ b/gcc/config/arm/aarch-common.c @@ -534,7 +534,7 @@ rtx_insn * arm_md_asm_adjust (vec &outputs, vec & /*inputs*/, vec & /*input_modes*/, vec &constraints, vec & /*clobbers*/, - HARD_REG_SET & /*clobbered_regs*/) + HARD_REG_SET & /*clobbered_regs*/, location_t loc) { bool saw_asm_flag = false; @@ -547,7 +547,7 @@ arm_md_asm_adjust (vec &outputs, vec & /*inputs*/, con += 4; if (strchr (con, ',') != NULL) { - error ("alternatives not allowed in % flag output"); + error_at (loc, "alternatives not allowed in % flag output"); continue; } @@ -608,7 +608,7 @@ arm_md_asm_adjust (vec &outputs, vec & /*inputs*/, mode = CC_Vmode, code = NE; break; default: - error ("unknown % flag output %qs", constraints[i]); + error_at (loc, "unknown % flag output %qs", constraints[i]); continue; } @@ -618,7 +618,7 @@ arm_md_asm_adjust (vec &outputs, vec & /*inputs*/, machine_mode dest_mode = GET_MODE (dest); if (!SCALAR_INT_MODE_P (dest_mode)) { - error ("invalid type for % flag output"); + error_at (loc, "invalid type for % flag output"); continue; } diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index de37c90..6d781e2 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -333,7 +333,7 @@ static HOST_WIDE_INT arm_constant_alignment (const_tree, HOST_WIDE_INT); static rtx_insn *thumb1_md_asm_adjust (vec &, vec &, vec &, vec &, vec &, - HARD_REG_SET &); + HARD_REG_SET &, location_t); /* Table of machine attributes. 
*/ static const struct attribute_spec arm_attribute_table[] = @@ -34105,7 +34105,7 @@ rtx_insn * thumb1_md_asm_adjust (vec &outputs, vec & /*inputs*/, vec & /*input_modes*/, vec &constraints, vec & /*clobbers*/, - HARD_REG_SET & /*clobbered_regs*/) + HARD_REG_SET & /*clobbered_regs*/, location_t /*loc*/) { for (unsigned i = 0, n = outputs.length (); i < n; ++i) if (startswith (constraints[i], "=@cc")) diff --git a/gcc/config/avr/avr.c b/gcc/config/avr/avr.c index c95c436..200701a 100644 --- a/gcc/config/avr/avr.c +++ b/gcc/config/avr/avr.c @@ -14498,7 +14498,8 @@ static rtx_insn * avr_md_asm_adjust (vec &/*outputs*/, vec &/*inputs*/, vec & /*input_modes*/, vec &/*constraints*/, - vec &clobbers, HARD_REG_SET &clobbered_regs) + vec &clobbers, HARD_REG_SET &clobbered_regs, + location_t /*loc*/) { clobbers.safe_push (cc_reg_rtx); SET_HARD_REG_BIT (clobbered_regs, REG_CC); diff --git a/gcc/config/cris/cris.c b/gcc/config/cris/cris.c index d9213d7..f458ea0 100644 --- a/gcc/config/cris/cris.c +++ b/gcc/config/cris/cris.c @@ -151,7 +151,7 @@ static void cris_function_arg_advance (cumulative_args_t, const function_arg_info &); static rtx_insn *cris_md_asm_adjust (vec &, vec &, vec &, vec &, - vec &, HARD_REG_SET &); + vec &, HARD_REG_SET &, location_t); static void cris_option_override (void); @@ -3507,7 +3507,7 @@ static rtx_insn * cris_md_asm_adjust (vec &outputs, vec &inputs, vec & /*input_modes*/, vec &constraints, vec &clobbers, - HARD_REG_SET &clobbered_regs) + HARD_REG_SET &clobbered_regs, location_t /*loc*/) { /* For the time being, all asms clobber condition codes. Revisit when there's a reasonable use for inputs/outputs diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index cff2690..530d357 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -21596,7 +21596,7 @@ static rtx_insn * ix86_md_asm_adjust (vec &outputs, vec & /*inputs*/, vec & /*input_modes*/, vec &constraints, vec &clobbers, - HARD_REG_SET &clobbered_regs) + HARD_REG_SET &clobbered_regs, location_t loc) { bool saw_asm_flag = false; @@ -21609,7 +21609,7 @@ ix86_md_asm_adjust (vec &outputs, vec & /*inputs*/, con += 4; if (strchr (con, ',') != NULL) { - error ("alternatives not allowed in % flag output"); + error_at (loc, "alternatives not allowed in % flag output"); continue; } @@ -21673,7 +21673,7 @@ ix86_md_asm_adjust (vec &outputs, vec & /*inputs*/, } if (code == UNKNOWN) { - error ("unknown % flag output %qs", constraints[i]); + error_at (loc, "unknown % flag output %qs", constraints[i]); continue; } if (invert) @@ -21702,7 +21702,7 @@ ix86_md_asm_adjust (vec &outputs, vec & /*inputs*/, machine_mode dest_mode = GET_MODE (dest); if (!SCALAR_INT_MODE_P (dest_mode)) { - error ("invalid type for % flag output"); + error_at (loc, "invalid type for % flag output"); continue; } diff --git a/gcc/config/mn10300/mn10300.c b/gcc/config/mn10300/mn10300.c index c1c2e6e..6f842a3 100644 --- a/gcc/config/mn10300/mn10300.c +++ b/gcc/config/mn10300/mn10300.c @@ -2850,7 +2850,7 @@ static rtx_insn * mn10300_md_asm_adjust (vec & /*outputs*/, vec & /*inputs*/, vec & /*input_modes*/, vec & /*constraints*/, vec &clobbers, - HARD_REG_SET &clobbered_regs) + HARD_REG_SET &clobbered_regs, location_t /*loc*/) { clobbers.safe_push (gen_rtx_REG (CCmode, CC_REG)); SET_HARD_REG_BIT (clobbered_regs, CC_REG); diff --git a/gcc/config/nds32/nds32.c b/gcc/config/nds32/nds32.c index 7217d78..2c9cfcf 100644 --- a/gcc/config/nds32/nds32.c +++ b/gcc/config/nds32/nds32.c @@ -4199,7 +4199,8 @@ nds32_md_asm_adjust (vec &outputs 
ATTRIBUTE_UNUSED, vec &inputs ATTRIBUTE_UNUSED, vec &input_modes ATTRIBUTE_UNUSED, vec &constraints ATTRIBUTE_UNUSED, - vec &clobbers, HARD_REG_SET &clobbered_regs) + vec &clobbers, HARD_REG_SET &clobbered_regs, + location_t /*loc*/) { if (!flag_inline_asm_r15) { diff --git a/gcc/config/pdp11/pdp11.c b/gcc/config/pdp11/pdp11.c index 4cab3ae..ced6531 100644 --- a/gcc/config/pdp11/pdp11.c +++ b/gcc/config/pdp11/pdp11.c @@ -156,7 +156,7 @@ static int pdp11_addr_cost (rtx, machine_mode, addr_space_t, bool); static int pdp11_insn_cost (rtx_insn *insn, bool speed); static rtx_insn *pdp11_md_asm_adjust (vec &, vec &, vec &, vec &, - vec &, HARD_REG_SET &); + vec &, HARD_REG_SET &, location_t); static bool pdp11_return_in_memory (const_tree, const_tree); static rtx pdp11_function_value (const_tree, const_tree, bool); static rtx pdp11_libcall_value (machine_mode, const_rtx); @@ -2139,7 +2139,7 @@ static rtx_insn * pdp11_md_asm_adjust (vec & /*outputs*/, vec & /*inputs*/, vec & /*input_modes*/, vec & /*constraints*/, vec &clobbers, - HARD_REG_SET &clobbered_regs) + HARD_REG_SET &clobbered_regs, location_t /*loc*/) { clobbers.safe_push (gen_rtx_REG (CCmode, CC_REGNUM)); SET_HARD_REG_BIT (clobbered_regs, CC_REGNUM); diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index ce29b37..779de95 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -3444,7 +3444,7 @@ static rtx_insn * rs6000_md_asm_adjust (vec & /*outputs*/, vec & /*inputs*/, vec & /*input_modes*/, vec & /*constraints*/, vec &clobbers, - HARD_REG_SET &clobbered_regs) + HARD_REG_SET &clobbered_regs, location_t /*loc*/) { clobbers.safe_push (gen_rtx_REG (SImode, CA_REGNO)); SET_HARD_REG_BIT (clobbered_regs, CA_REGNO); diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c index 590dd8f..800e0ab 100644 --- a/gcc/config/s390/s390.c +++ b/gcc/config/s390/s390.c @@ -16771,7 +16771,7 @@ static rtx_insn * s390_md_asm_adjust (vec &outputs, vec &inputs, vec &input_modes, vec &constraints, vec & /*clobbers*/, - HARD_REG_SET & /*clobbered_regs*/) + HARD_REG_SET & /*clobbered_regs*/, location_t /*loc*/) { if (!TARGET_VXE) /* Long doubles are stored in FPR pairs - nothing to do. 
*/ diff --git a/gcc/config/vax/vax.c b/gcc/config/vax/vax.c index 3aacd1e..e26ab3b 100644 --- a/gcc/config/vax/vax.c +++ b/gcc/config/vax/vax.c @@ -57,7 +57,7 @@ static bool vax_rtx_costs (rtx, machine_mode, int, int, int *, bool); static machine_mode vax_cc_modes_compatible (machine_mode, machine_mode); static rtx_insn *vax_md_asm_adjust (vec &, vec &, vec &, vec &, - vec &, HARD_REG_SET &); + vec &, HARD_REG_SET &, location_t); static rtx vax_function_arg (cumulative_args_t, const function_arg_info &); static void vax_function_arg_advance (cumulative_args_t, const function_arg_info &); @@ -1181,7 +1181,8 @@ vax_md_asm_adjust (vec &outputs ATTRIBUTE_UNUSED, vec &inputs ATTRIBUTE_UNUSED, vec &input_modes ATTRIBUTE_UNUSED, vec &constraints ATTRIBUTE_UNUSED, - vec &clobbers, HARD_REG_SET &clobbered_regs) + vec &clobbers, HARD_REG_SET &clobbered_regs, + location_t /*loc*/) { clobbers.safe_push (gen_rtx_REG (CCmode, VAX_PSL_REGNUM)); SET_HARD_REG_BIT (clobbered_regs, VAX_PSL_REGNUM); diff --git a/gcc/config/visium/visium.c b/gcc/config/visium/visium.c index 7eb2248..58e5355 100644 --- a/gcc/config/visium/visium.c +++ b/gcc/config/visium/visium.c @@ -190,7 +190,7 @@ static tree visium_build_builtin_va_list (void); static rtx_insn *visium_md_asm_adjust (vec &, vec &, vec &, vec &, vec &, - HARD_REG_SET &); + HARD_REG_SET &, location_t); static bool visium_legitimate_constant_p (machine_mode, rtx); @@ -795,7 +795,7 @@ static rtx_insn * visium_md_asm_adjust (vec & /*outputs*/, vec & /*inputs*/, vec & /*input_modes*/, vec & /*constraints*/, vec &clobbers, - HARD_REG_SET &clobbered_regs) + HARD_REG_SET &clobbered_regs, location_t /*loc*/) { clobbers.safe_push (gen_rtx_REG (CCmode, FLAGS_REGNUM)); SET_HARD_REG_BIT (clobbered_regs, FLAGS_REGNUM); -- cgit v1.1 From 5402023f05e8fc28c2f1cfd7107264403b118a17 Mon Sep 17 00:00:00 2001 From: Tamar Christina Date: Thu, 15 Jul 2021 13:16:00 +0100 Subject: Revert "AArch64: Correct dot-product auto-vect optab RTL" This reverts commit 6d1cdb27828d2ef1ae1ab0209836646a269b9610. --- gcc/config/aarch64/aarch64-simd-builtins.def | 4 +- gcc/config/aarch64/aarch64-simd.md | 62 +++++++++++++++++----------- gcc/config/aarch64/arm_neon.h | 8 ++-- 3 files changed, 45 insertions(+), 29 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index 99e7348..063f503 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -375,8 +375,8 @@ BUILTIN_VSDQ_I_DI (BINOP_UUS, urshl, 0, NONE) /* Implemented by _prod. */ - BUILTIN_VB (TERNOP, sdot_prod, 10, NONE) - BUILTIN_VB (TERNOPU, udot_prod, 10, NONE) + BUILTIN_VB (TERNOP, sdot, 0, NONE) + BUILTIN_VB (TERNOPU, udot, 0, NONE) BUILTIN_VB (TERNOP_SSUS, usdot_prod, 10, NONE) /* Implemented by aarch64__lane{q}. */ BUILTIN_VB (QUADOP_LANE, sdot_lane, 0, NONE) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 88fa5ba..7489098 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -587,28 +587,8 @@ DONE; }) -;; These expands map to the Dot Product optab the vectorizer checks for -;; and to the intrinsics patttern. -;; The auto-vectorizer expects a dot product builtin that also does an -;; accumulation into the provided register. -;; Given the following pattern -;; -;; for (i=0; idot_prod" +;; These instructions map to the __builtins for the Dot Product operations. 
+(define_insn "aarch64_dot" [(set (match_operand:VS 0 "register_operand" "=w") (plus:VS (match_operand:VS 1 "register_operand" "0") (unspec:VS [(match_operand: 2 "register_operand" "w") @@ -633,6 +613,41 @@ [(set_attr "type" "neon_dot")] ) +;; These expands map to the Dot Product optab the vectorizer checks for. +;; The auto-vectorizer expects a dot product builtin that also does an +;; accumulation into the provided register. +;; Given the following pattern +;; +;; for (i=0; idot_prod" + [(set (match_operand:VS 0 "register_operand") + (plus:VS (unspec:VS [(match_operand: 1 "register_operand") + (match_operand: 2 "register_operand")] + DOTPROD) + (match_operand:VS 3 "register_operand")))] + "TARGET_DOTPROD" +{ + emit_insn ( + gen_aarch64_dot (operands[3], operands[3], operands[1], + operands[2])); + emit_insn (gen_rtx_SET (operands[0], operands[3])); + DONE; +}) + ;; These instructions map to the __builtins for the Dot Product ;; indexed operations. (define_insn "aarch64_dot_lane" @@ -929,7 +944,8 @@ rtx ones = force_reg (V16QImode, CONST1_RTX (V16QImode)); rtx abd = gen_reg_rtx (V16QImode); emit_insn (gen_aarch64_abdv16qi (abd, operands[1], operands[2])); - emit_insn (gen_udot_prodv16qi (operands[0], operands[3], abd, ones)); + emit_insn (gen_aarch64_udotv16qi (operands[0], operands[3], + abd, ones)); DONE; } rtx reduc = gen_reg_rtx (V8HImode); diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 597f44c..00d76ea 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -31767,28 +31767,28 @@ __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdot_u32 (uint32x2_t __r, uint8x8_t __a, uint8x8_t __b) { - return __builtin_aarch64_udot_prodv8qi_uuuu (__r, __a, __b); + return __builtin_aarch64_udotv8qi_uuuu (__r, __a, __b); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdotq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b) { - return __builtin_aarch64_udot_prodv16qi_uuuu (__r, __a, __b); + return __builtin_aarch64_udotv16qi_uuuu (__r, __a, __b); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdot_s32 (int32x2_t __r, int8x8_t __a, int8x8_t __b) { - return __builtin_aarch64_sdot_prodv8qi (__r, __a, __b); + return __builtin_aarch64_sdotv8qi (__r, __a, __b); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdotq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b) { - return __builtin_aarch64_sdot_prodv16qi (__r, __a, __b); + return __builtin_aarch64_sdotv16qi (__r, __a, __b); } __extension__ extern __inline uint32x2_t -- cgit v1.1 From 8e321f2a6383e378f64e556707de1cdae0a8562d Mon Sep 17 00:00:00 2001 From: Tamar Christina Date: Thu, 15 Jul 2021 13:16:15 +0100 Subject: Revert "AArch32: Correct sdot RTL on aarch32" This reverts commit c9165e2d58bb037793c1c93e1b5633a61f88db30. 
--- gcc/config/arm/neon.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 7645121..8b0a396 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -2961,7 +2961,13 @@ DOTPROD) (match_operand:VCVTI 3 "register_operand")))] "TARGET_DOTPROD" -) +{ + emit_insn ( + gen_neon_dot (operands[3], operands[3], operands[1], + operands[2])); + emit_insn (gen_rtx_SET (operands[0], operands[3])); + DONE; +}) ;; Auto-vectorizer pattern for usdot (define_expand "usdot_prod" -- cgit v1.1 From ad5f8ac1d2f2dc92d43663243b52f9e9eb3cf7c0 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Thu, 15 Jul 2021 10:16:17 -0500 Subject: rs6000: Don't let swaps pass break multiply low-part (PR101129) 2021-07-15 Bill Schmidt gcc/ PR target/101129 * config/rs6000/rs6000-p8swap.c (has_part_mult): New. (rs6000_analyze_swaps): Insns containing a subreg of a mult are not swappable. gcc/testsuite/ PR target/101129 * gcc.target/powerpc/pr101129.c: New. --- gcc/config/rs6000/rs6000-p8swap.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-p8swap.c b/gcc/config/rs6000/rs6000-p8swap.c index 21cbcb2..6b559aa 100644 --- a/gcc/config/rs6000/rs6000-p8swap.c +++ b/gcc/config/rs6000/rs6000-p8swap.c @@ -1523,6 +1523,22 @@ replace_swap_with_copy (swap_web_entry *insn_entry, unsigned i) insn->set_deleted (); } +/* INSN is known to contain a SUBREG, which we can normally handle, + but if the SUBREG itself contains a MULT then we need to leave it alone + to avoid turning a mult_hipart into a mult_lopart, for example. */ +static bool +has_part_mult (rtx_insn *insn) +{ + rtx body = PATTERN (insn); + if (GET_CODE (body) != SET) + return false; + rtx src = SET_SRC (body); + if (GET_CODE (src) != SUBREG) + return false; + rtx inner = XEXP (src, 0); + return (GET_CODE (inner) == MULT); +} + /* Make NEW_MEM_EXP's attributes and flags resemble those of ORIGINAL_MEM_EXP. */ static void @@ -2501,6 +2517,9 @@ rs6000_analyze_swaps (function *fun) insn_entry[uid].is_swappable = 0; else if (special != SH_NONE) insn_entry[uid].special_handling = special; + else if (insn_entry[uid].contains_subreg + && has_part_mult (insn)) + insn_entry[uid].is_swappable = 0; else if (insn_entry[uid].contains_subreg) insn_entry[uid].special_handling = SH_SUBREG; } -- cgit v1.1 From f364cdffa47af574f90f671b2dcf5afa91442741 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 15 Jul 2021 22:34:25 +0200 Subject: i386: Fix ix86_hard_regno_mode_ok for TDmode on 32bit targets [PR101346] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit General regs on 32bit targets do not support 128bit modes, including TDmode. gcc/ 2021-07-15 Uroš Bizjak PR target/101346 * config/i386/i386.h (VALID_SSE_REG_MODE): Add TDmode. (VALID_INT_MODE_P): Add SDmode and DDmode. Add TDmode for TARGET_64BIT. (VALID_DFP_MODE_P): Remove. * config/i386/i386.c (ix86_hard_regno_mode_ok): Do not use VALID_DFP_MODE_P. gcc/testsuite/ 2021-07-15 Uroš Bizjak PR target/101346 * gcc.target/i386/pr101346.c: New test. 
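For reference, TDmode is the 128-bit decimal floating-point mode, i.e. _Decimal128 at the source level. A hypothetical reproducer shape (the actual testcase is gcc.target/i386/pr101346.c, whose contents are not shown here) looks like:

/* Hypothetical sketch: a 128-bit decimal float (TDmode) value must
   not be allocated to 32-bit general-purpose registers.  */
_Decimal128
add_d128 (_Decimal128 x, _Decimal128 y)
{
  return x + y;
}

With -m32 such a value can only live in SSE registers or memory, which is what the VALID_*_MODE_P adjustments below express.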
--- gcc/config/i386/i386.c | 7 ++----- gcc/config/i386/i386.h | 8 +++----- 2 files changed, 5 insertions(+), 10 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 530d357..9d74b7a 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -19535,11 +19535,8 @@ ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode) return !can_create_pseudo_p (); } /* We handle both integer and floats in the general purpose registers. */ - else if (VALID_INT_MODE_P (mode)) - return true; - else if (VALID_FP_MODE_P (mode)) - return true; - else if (VALID_DFP_MODE_P (mode)) + else if (VALID_INT_MODE_P (mode) + || VALID_FP_MODE_P (mode)) return true; /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go on to use that value in smaller contexts, this can easily force a diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 324e8a9..0c2c93d 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -1023,7 +1023,7 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); #define VALID_SSE_REG_MODE(MODE) \ ((MODE) == V1TImode || (MODE) == TImode \ || (MODE) == V4SFmode || (MODE) == V4SImode \ - || (MODE) == SFmode || (MODE) == TFmode) + || (MODE) == SFmode || (MODE) == TFmode || (MODE) == TDmode) #define VALID_MMX_REG_MODE_3DNOW(MODE) \ ((MODE) == V2SFmode || (MODE) == SFmode) @@ -1037,9 +1037,6 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); #define VALID_MASK_AVX512BW_MODE(MODE) ((MODE) == SImode || (MODE) == DImode) -#define VALID_DFP_MODE_P(MODE) \ - ((MODE) == SDmode || (MODE) == DDmode || (MODE) == TDmode) - #define VALID_FP_MODE_P(MODE) \ ((MODE) == SFmode || (MODE) == DFmode || (MODE) == XFmode \ || (MODE) == SCmode || (MODE) == DCmode || (MODE) == XCmode) \ @@ -1049,12 +1046,13 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); || (MODE) == SImode || (MODE) == DImode \ || (MODE) == CQImode || (MODE) == CHImode \ || (MODE) == CSImode || (MODE) == CDImode \ + || (MODE) == SDmode || (MODE) == DDmode \ || (MODE) == V4QImode || (MODE) == V2HImode || (MODE) == V1SImode \ || (TARGET_64BIT \ && ((MODE) == TImode || (MODE) == CTImode \ || (MODE) == TFmode || (MODE) == TCmode \ || (MODE) == V8QImode || (MODE) == V4HImode \ - || (MODE) == V2SImode))) + || (MODE) == V2SImode || (MODE) == TDmode))) /* Return true for modes passed in SSE registers. */ #define SSE_REG_MODE_P(MODE) \ -- cgit v1.1 From a314d50336db752f2ae2c50262956ce4490567ac Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Thu, 15 Jul 2021 14:30:48 +0200 Subject: Disable --param vect-partial-vector-usage by default on x86 The following defaults --param vect-partial-vector-usage to zero for x86_64 matching existing behavior where support for this is not present. 2021-07-15 Richard Biener * config/i386/i386-options.c (ix86_option_override_internal): Set param_vect_partial_vector_usage to zero if not set. 
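Partial-vector usage here means fully masked vectorization, where iterations that do not fill a whole vector are handled with a masked vector operation instead of a scalar epilogue. A small illustrative loop of the kind the parameter governs (not part of the patch):

/* Illustrative sketch: when n is not a multiple of the vector length,
   partial-vector usage would mask the final vector iteration rather
   than falling back to a scalar epilogue loop.  */
void
scale (float *restrict a, float s, int n)
{
  for (int i = 0; i < n; i++)
    a[i] *= s;
}

Because SET_OPTION_IF_UNSET only applies when the user has not set the parameter, --param vect-partial-vector-usage given on the command line still takes precedence over this new default.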
--- gcc/config/i386/i386-options.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c index 7cba655..3416a4f 100644 --- a/gcc/config/i386/i386-options.c +++ b/gcc/config/i386/i386-options.c @@ -2834,6 +2834,11 @@ ix86_option_override_internal (bool main_args_p, SET_OPTION_IF_UNSET (opts, opts_set, param_ira_consider_dup_in_all_alts, 0); + /* Fully masking the main or the epilogue vectorized loop is not + profitable generally so leave it disabled until we get more + fine grained control & costing. */ + SET_OPTION_IF_UNSET (opts, opts_set, param_vect_partial_vector_usage, 0); + return true; } -- cgit v1.1 From 2f11ca2a3a3bea38a7c5bd63e777620a4887e649 Mon Sep 17 00:00:00 2001 From: Cooper Qu Date: Fri, 16 Jul 2021 16:05:39 +0800 Subject: C-SKY: Use the common way to define MULTILIB_DIRNAMES. C-SKY previously used a forked print-sysroot-suffix.sh and define CSKY_MULTILIB_DIRNAMES to specify OS multilib directories. This patch delete the forked print-sysroot-suffix.sh and define MULTILIB_DIRNAMES to generate same directories. gcc/ * config.gcc: Don't use forked print-sysroot-suffix.sh and t-sysroot-suffix for C-SKY. * config/csky/print-sysroot-suffix.sh: Delete. * config/csky/t-csky-linux: Delete. * config/csky/t-sysroot-suffix: Define MULTILIB_DIRNAMES instead of CSKY_MULTILIB_DIRNAMES. --- gcc/config/csky/print-sysroot-suffix.sh | 147 -------------------------------- gcc/config/csky/t-csky-linux | 2 +- gcc/config/csky/t-sysroot-suffix | 28 ------ 3 files changed, 1 insertion(+), 176 deletions(-) delete mode 100644 gcc/config/csky/print-sysroot-suffix.sh delete mode 100644 gcc/config/csky/t-sysroot-suffix (limited to 'gcc/config') diff --git a/gcc/config/csky/print-sysroot-suffix.sh b/gcc/config/csky/print-sysroot-suffix.sh deleted file mode 100644 index 4840bc6..0000000 --- a/gcc/config/csky/print-sysroot-suffix.sh +++ /dev/null @@ -1,147 +0,0 @@ -#! /bin/sh -# Script to generate SYSROOT_SUFFIX_SPEC equivalent to MULTILIB_OSDIRNAMES -# Arguments are MULTILIB_OSDIRNAMES, MULTILIB_OPTIONS and MULTILIB_MATCHES. - -# Copyright (C) 2018-2021 Free Software Foundation, Inc. -# Contributed by C-SKY Microsystems and Mentor Graphics. - -# This file is part of GCC. - -# GCC is free software; you can redistribute it and/or modify it under -# the terms of the GNU General Public License as published by the Free -# Software Foundation; either version 3, or (at your option) any later -# version. - -# GCC is distributed in the hope that it will be useful, but WITHOUT -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -# for more details. - -# You should have received a copy of the GNU General Public License -# along with GCC; see the file COPYING3. If not see -# . - -# This shell script produces a header file fragment that defines -# SYSROOT_SUFFIX_SPEC. It assumes that the sysroots will have the same -# structure and names used by the multilibs. - -# Invocation: -# print-sysroot-suffix.sh \ -# MULTILIB_OSDIRNAMES \ -# MULTILIB_OPTIONS \ -# MULTILIB_MATCHES \ -# > t-sysroot-suffix.h - -# The three options exactly correspond to the variables of the same -# names defined in the tmake_file fragments. 
- -# Example: -# sh ./gcc/config/print-sysroot-suffix.sh "a=A" "a b/c/d" "" -# => -# #undef SYSROOT_SUFFIX_SPEC -# #define SYSROOT_SUFFIX_SPEC "" \ -# "%{a:" \ -# "%{b:A/b/;" \ -# "c:A/c/;" \ -# "d:A/d/;" \ -# ":A/};" \ -# ":}" - -# The script uses temporary subscripts in order to permit a recursive -# algorithm without the use of functions. - -set -e - -dirnames="$1" -options="$2" -matches="$3" - -cat > print-sysroot-suffix3.sh <<\EOF -#! /bin/sh -# Print all the multilib matches for this option -result="$1" -EOF -for x in $matches; do - l=`echo $x | sed -e 's/=.*$//' -e 's/?/=/g'` - r=`echo $x | sed -e 's/^.*=//' -e 's/?/=/g'` - echo "[ \"\$1\" = \"$l\" ] && result=\"\$result|$r\"" >> print-sysroot-suffix3.sh -done -echo 'echo $result' >> print-sysroot-suffix3.sh -chmod +x print-sysroot-suffix3.sh - -cat > print-sysroot-suffix2.sh <<\EOF -#! /bin/sh -# Recursive script to enumerate all multilib combinations, match against -# multilib directories and output a spec string of the result. -# Will fold identical trees. - -padding="$1" -optstring="$2" -shift 2 -n="\" \\ -$padding\"" -if [ $# = 0 ]; then -EOF - -pat= -for x in $dirnames; do -# p=`echo $x | sed -e 's,=!,/$=/,'` - p=`echo $x | sed -e 's/=//g'` -# pat="$pat -e 's=^//$p='" - pat="$pat -e 's/$p/g'" -done -echo ' optstring=`echo "/$optstring" | sed '"$pat\`" >> print-sysroot-suffix2.sh -cat >> print-sysroot-suffix2.sh <<\EOF - case $optstring in - //*) - ;; - *) - echo "$optstring" - ;; - esac -else - thisopt="$1" - shift - bit= - lastcond= - result= - for x in `echo "$thisopt" | sed -e 's,/, ,g'`; do - case $x in -EOF -for x in `echo "$options" | sed -e 's,/, ,g'`; do - match=`./print-sysroot-suffix3.sh "$x"` - echo "$x) optmatch=\"$match\" ;;" >> print-sysroot-suffix2.sh -done -cat >> print-sysroot-suffix2.sh <<\EOF - esac - bit=`"$0" "$padding " "$optstring$x/" "$@"` - if [ -z "$lastopt" ]; then - lastopt="$optmatch" - else - if [ "$lastbit" = "$bit" ]; then - lastopt="$lastopt|$optmatch" - else - result="$result$lastopt:$lastbit;$n" - lastopt="$optmatch" - fi - fi - lastbit="$bit" - done - bit=`"$0" "$padding " "$optstring" "$@"` - if [ "$bit" = "$lastbit" ]; then - if [ -z "$result" ]; then - echo "$bit" - else - echo "$n%{$result:$bit}" - fi - else - echo "$n%{$result$lastopt:$lastbit;$n:$bit}" - fi -fi -EOF -chmod +x ./print-sysroot-suffix2.sh -result=`./print-sysroot-suffix2.sh \"\" \"\" $options` -echo "#undef SYSROOT_SUFFIX_SPEC" -echo "#define SYSROOT_SUFFIX_SPEC \"$result\"" -rm print-sysroot-suffix2.sh -rm print-sysroot-suffix3.sh diff --git a/gcc/config/csky/t-csky-linux b/gcc/config/csky/t-csky-linux index 0730c3a..9139040 100644 --- a/gcc/config/csky/t-csky-linux +++ b/gcc/config/csky/t-csky-linux @@ -21,7 +21,7 @@ MULTILIB_EXCEPTIONS = -CSKY_MULTILIB_OSDIRNAMES = mfloat-abi.softfp=/soft-fp mfloat-abi.hard=/hard-fp mfloat-abi.soft=/. mcpu.ck810f=/. mcpu.ck807f=/ck807 mcpu.ck860f=/ck860 +MULTILIB_OSDIRNAMES = ./ ./ck807 ./ck860 ./ ./soft-fp ./hard-fp # Arch variants. MULTILIB_OPTIONS += mcpu=ck810f/mcpu=ck807f/mcpu=ck860f diff --git a/gcc/config/csky/t-sysroot-suffix b/gcc/config/csky/t-sysroot-suffix deleted file mode 100644 index d891f69..0000000 --- a/gcc/config/csky/t-sysroot-suffix +++ /dev/null @@ -1,28 +0,0 @@ -# Makefile fragment for C-SKY sysroot suffix. -# -# Copyright (C) 2018-2021 Free Software Foundation, Inc. -# Contributed by C-SKY Microsystems and Mentor Graphics. -# -# This file is part of GCC. 
-# -# GCC is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 3, or (at your option) -# any later version. -# -# GCC is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with GCC; see the file COPYING3. If not see -# . - -# Generate SYSROOT_SUFFIX_SPEC from MULTILIB_OSDIRNAMES. - -sysroot-suffix.h: $(srcdir)/config/csky/print-sysroot-suffix.sh - $(SHELL) $(srcdir)/config/csky/print-sysroot-suffix.sh \ - "$(CSKY_MULTILIB_OSDIRNAMES)" "$(MULTILIB_OPTIONS)" \ - "$(MULTILIB_MATCHES)" > tmp-sysroot-suffix.h - mv tmp-sysroot-suffix.h $@ -- cgit v1.1 From 0990d93dd8a4268bff5bbe48aa26748cf63201c7 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Mon, 7 Jun 2021 13:44:15 +0200 Subject: IBM Z: Use @PLT symbols for local functions in 64-bit mode This helps with generating code for kernel hotpatches, which contain individual functions and are loaded more than 2G away from vmlinux. This should not create performance regressions for the normal use cases, because for local functions ld replaces @PLT calls with direct calls. gcc/ChangeLog: * config/s390/predicates.md (bras_sym_operand): Accept all functions in 64-bit mode, use UNSPEC_PLT31. (larl_operand): Use UNSPEC_PLT31. * config/s390/s390.c (s390_loadrelative_operand_p): Likewise. (legitimize_pic_address): Likewise. (s390_emit_tls_call_insn): Mark __tls_get_offset as function, use UNSPEC_PLT31. (s390_delegitimize_address): Use UNSPEC_PLT31. (s390_output_addr_const_extra): Likewise. (print_operand): Add @PLT to TLS calls, handle %K. (s390_function_profiler): Mark __fentry__/_mcount as function, use %K, use UNSPEC_PLT31. (s390_output_mi_thunk): Use only UNSPEC_GOT, use %K. (s390_emit_call): Use UNSPEC_PLT31. (s390_emit_tpf_eh_return): Mark __tpf_eh_return as function. * config/s390/s390.md (UNSPEC_PLT31): Rename from UNSPEC_PLT. (*movdi_64): Use %K. (reload_base_64): Likewise. (*sibcall_brc): Likewise. (*sibcall_brcl): Likewise. (*sibcall_value_brc): Likewise. (*sibcall_value_brcl): Likewise. (*bras): Likewise. (*brasl): Likewise. (*bras_r): Likewise. (*brasl_r): Likewise. (*bras_tls): Likewise. (*brasl_tls): Likewise. (main_base_64): Likewise. (reload_base_64): Likewise. (@split_stack_call): Likewise. gcc/testsuite/ChangeLog: * g++.dg/ext/visibility/noPLT.C: Skip on s390x. * g++.target/s390/mi-thunk.C: New test. * gcc.target/s390/nodatarel-1.c: Move foostatic to the new tests. * gcc.target/s390/pr80080-4.c: Allow @PLT suffix. * gcc.target/s390/risbg-ll-3.c: Likewise. * gcc.target/s390/call.h: Common code for the new tests. * gcc.target/s390/call-z10-pic-nodatarel.c: New test. * gcc.target/s390/call-z10-pic.c: New test. * gcc.target/s390/call-z10.c: New test. * gcc.target/s390/call-z9-pic-nodatarel.c: New test. * gcc.target/s390/call-z9-pic.c: New test. * gcc.target/s390/call-z9.c: New test. * gcc.target/s390/mfentry-m64-pic.c: New test. * gcc.target/s390/tls.h: Common code for the new TLS tests. * gcc.target/s390/tls-pic.c: New test. * gcc.target/s390/tls.c: New test. 
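As an informal illustration of the intended effect (not taken from the patch), consider a small translation unit with a local function; the assembly in the comment is an assumption based on the description above, not compiler output.

/* Sketch only.  After this change, 64-bit code is expected to emit the
   call below roughly as
       brasl   %r14,callee@PLT
   rather than
       brasl   %r14,callee
   (exact assembly assumed for illustration).  For an ordinary link, ld
   resolves the @PLT reference back to a direct call, while a kernel
   livepatch object loaded more than 2G away from vmlinux keeps the PLT
   indirection.  */
static int __attribute__ ((noinline))
callee (int x)
{
  return x + 1;
}

int
caller (int x)
{
  return callee (x);
}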
--- gcc/config/s390/predicates.md | 9 +++-- gcc/config/s390/s390.c | 81 ++++++++++++++++++++++++++++++------------- gcc/config/s390/s390.md | 32 ++++++++--------- 3 files changed, 79 insertions(+), 43 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/s390/predicates.md b/gcc/config/s390/predicates.md index 15093cb..99c343a 100644 --- a/gcc/config/s390/predicates.md +++ b/gcc/config/s390/predicates.md @@ -101,10 +101,13 @@ (define_special_predicate "bras_sym_operand" (ior (and (match_code "symbol_ref") - (match_test "!flag_pic || SYMBOL_REF_LOCAL_P (op)")) + (ior (match_test "!flag_pic") + (match_test "SYMBOL_REF_LOCAL_P (op)") + (and (match_test "TARGET_64BIT") + (match_test "SYMBOL_REF_FUNCTION_P (op)")))) (and (match_code "const") (and (match_test "GET_CODE (XEXP (op, 0)) == UNSPEC") - (match_test "XINT (XEXP (op, 0), 1) == UNSPEC_PLT"))))) + (match_test "XINT (XEXP (op, 0), 1) == UNSPEC_PLT31"))))) ;; Return true if OP is a PLUS that is not a legitimate ;; operand for the LA instruction. @@ -197,7 +200,7 @@ && XINT (op, 1) == UNSPEC_GOTENT) return true; if (GET_CODE (op) == UNSPEC - && XINT (op, 1) == UNSPEC_PLT) + && XINT (op, 1) == UNSPEC_PLT31) return true; if (GET_CODE (op) == UNSPEC && XINT (op, 1) == UNSPEC_INDNTPOFF) diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c index 800e0ab..b1d3b99 100644 --- a/gcc/config/s390/s390.c +++ b/gcc/config/s390/s390.c @@ -3291,7 +3291,7 @@ s390_loadrelative_operand_p (rtx addr, rtx *symref, HOST_WIDE_INT *addend) if (GET_CODE (addr) == SYMBOL_REF || (GET_CODE (addr) == UNSPEC && (XINT (addr, 1) == UNSPEC_GOTENT - || XINT (addr, 1) == UNSPEC_PLT))) + || XINT (addr, 1) == UNSPEC_PLT31))) { if (symref) *symref = addr; @@ -4964,7 +4964,7 @@ legitimize_pic_address (rtx orig, rtx reg) || (SYMBOL_REF_P (addr) && s390_rel_address_ok_p (addr)) || (GET_CODE (addr) == UNSPEC && (XINT (addr, 1) == UNSPEC_GOTENT - || XINT (addr, 1) == UNSPEC_PLT))) + || XINT (addr, 1) == UNSPEC_PLT31))) && GET_CODE (addend) == CONST_INT) { /* This can be locally addressed. */ @@ -5125,7 +5125,7 @@ legitimize_pic_address (rtx orig, rtx reg) /* For @PLT larl is used. This is handled like local symbol refs. */ - case UNSPEC_PLT: + case UNSPEC_PLT31: gcc_unreachable (); break; @@ -5191,7 +5191,10 @@ s390_emit_tls_call_insn (rtx result_reg, rtx tls_call) emit_insn (s390_load_got ()); if (!s390_tls_symbol) - s390_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, "__tls_get_offset"); + { + s390_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, "__tls_get_offset"); + SYMBOL_REF_FLAGS (s390_tls_symbol) |= SYMBOL_FLAG_FUNCTION; + } insn = s390_emit_call (s390_tls_symbol, tls_call, result_reg, gen_rtx_REG (Pmode, RETURN_REGNUM)); @@ -7596,7 +7599,7 @@ s390_delegitimize_address (rtx orig_x) y = XEXP (x, 0); if (GET_CODE (y) == UNSPEC && (XINT (y, 1) == UNSPEC_GOTENT - || XINT (y, 1) == UNSPEC_PLT)) + || XINT (y, 1) == UNSPEC_PLT31)) y = XVECEXP (y, 0, 0); else return orig_x; @@ -7849,7 +7852,7 @@ s390_output_addr_const_extra (FILE *file, rtx x) output_addr_const (file, XVECEXP (x, 0, 0)); fprintf (file, "@GOTOFF"); return true; - case UNSPEC_PLT: + case UNSPEC_PLT31: output_addr_const (file, XVECEXP (x, 0, 0)); fprintf (file, "@PLT"); return true; @@ -7943,6 +7946,7 @@ print_operand_address (FILE *file, rtx addr) 'E': print opcode suffix for branch on index instruction. 'G': print the size of the operand in bytes. 'J': print tls_load/tls_gdcall/tls_ldcall suffix + 'K': print @PLT suffix for call targets and load address values. 'M': print the second word of a TImode operand. 
'N': print the second word of a DImode operand. 'O': print only the displacement of a memory reference or address. @@ -8129,6 +8133,29 @@ print_operand (FILE *file, rtx x, int code) case 'Y': print_shift_count_operand (file, x); return; + + case 'K': + /* Append @PLT to both local and non-local symbols in order to support + Linux Kernel livepatching: patches contain individual functions and + are loaded further than 2G away from vmlinux, and therefore they must + call even static functions via PLT. ld will optimize @PLT away for + normal code, and keep it for patches. + + Do not indiscriminately add @PLT in 31-bit mode due to the %r12 + restriction, use UNSPEC_PLT31 instead. + + @PLT only makes sense for functions, data is taken care of by + -mno-pic-data-is-text-relative. + + Adding @PLT interferes with handling of weak symbols in non-PIC code, + since their addresses are loaded with larl, which then always produces + a non-NULL result, so skip them here as well. */ + if (TARGET_64BIT + && GET_CODE (x) == SYMBOL_REF + && SYMBOL_REF_FUNCTION_P (x) + && !(SYMBOL_REF_WEAK (x) && !flag_pic)) + fprintf (file, "@PLT"); + return; } switch (GET_CODE (x)) @@ -13125,9 +13152,10 @@ s390_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED) op[3] = GEN_INT (UNITS_PER_LONG); op[2] = gen_rtx_SYMBOL_REF (Pmode, flag_fentry ? "__fentry__" : "_mcount"); - if (flag_pic) + SYMBOL_REF_FLAGS (op[2]) |= SYMBOL_FLAG_FUNCTION; + if (flag_pic && !TARGET_64BIT) { - op[2] = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op[2]), UNSPEC_PLT); + op[2] = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op[2]), UNSPEC_PLT31); op[2] = gen_rtx_CONST (Pmode, op[2]); } @@ -13142,7 +13170,7 @@ s390_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED) warning (OPT_Wcannot_profile, "nested functions cannot be profiled " "with %<-mfentry%> on s390"); else - output_asm_insn ("brasl\t0,%2", op); + output_asm_insn ("brasl\t0,%2%K2", op); } else if (TARGET_64BIT) { @@ -13154,7 +13182,7 @@ s390_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED) output_asm_insn ("stg\t%0,%1", op); if (flag_dwarf2_cfi_asm) output_asm_insn (".cfi_rel_offset\t%0,%3", op); - output_asm_insn ("brasl\t%0,%2", op); + output_asm_insn ("brasl\t%0,%2%K2", op); output_asm_insn ("lg\t%0,%1", op); if (flag_dwarf2_cfi_asm) output_asm_insn (".cfi_restore\t%0", op); @@ -13170,7 +13198,7 @@ s390_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED) output_asm_insn ("st\t%0,%1", op); if (flag_dwarf2_cfi_asm) output_asm_insn (".cfi_rel_offset\t%0,%3", op); - output_asm_insn ("brasl\t%0,%2", op); + output_asm_insn ("brasl\t%0,%2%K2", op); output_asm_insn ("l\t%0,%1", op); if (flag_dwarf2_cfi_asm) output_asm_insn (".cfi_restore\t%0", op); @@ -13246,9 +13274,11 @@ s390_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED, if (flag_pic && !SYMBOL_REF_LOCAL_P (op[0])) { nonlocal = 1; - op[0] = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op[0]), - TARGET_64BIT ? UNSPEC_PLT : UNSPEC_GOT); - op[0] = gen_rtx_CONST (Pmode, op[0]); + if (!TARGET_64BIT) + { + op[0] = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op[0]), UNSPEC_GOT); + op[0] = gen_rtx_CONST (Pmode, op[0]); + } } /* Operand 1 is the 'this' pointer. */ @@ -13338,7 +13368,7 @@ s390_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED, } /* Jump to target. */ - output_asm_insn ("jg\t%0", op); + output_asm_insn ("jg\t%0%K0", op); /* Output literal pool if required. 
*/ if (op[5]) @@ -13729,7 +13759,7 @@ rtx_insn * s390_emit_call (rtx addr_location, rtx tls_call, rtx result_reg, rtx retaddr_reg) { - bool plt_call = false; + bool plt31_call_p = false; rtx_insn *insn; rtx vec[4] = { NULL_RTX }; int elts = 0; @@ -13744,15 +13774,15 @@ s390_emit_call (rtx addr_location, rtx tls_call, rtx result_reg, { /* When calling a global routine in PIC mode, we must replace the symbol itself with the PLT stub. */ - if (flag_pic && !SYMBOL_REF_LOCAL_P (addr_location)) + if (flag_pic && !SYMBOL_REF_LOCAL_P (addr_location) && !TARGET_64BIT) { - if (TARGET_64BIT || retaddr_reg != NULL_RTX) + if (retaddr_reg != NULL_RTX) { addr_location = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr_location), - UNSPEC_PLT); + UNSPEC_PLT31); addr_location = gen_rtx_CONST (Pmode, addr_location); - plt_call = true; + plt31_call_p = true; } else /* For -fpic code the PLT entries might use r12 which is @@ -13773,7 +13803,7 @@ s390_emit_call (rtx addr_location, rtx tls_call, rtx result_reg, register 1. */ if (retaddr_reg == NULL_RTX && GET_CODE (addr_location) != SYMBOL_REF - && !plt_call) + && !plt31_call_p) { emit_move_insn (gen_rtx_REG (Pmode, SIBCALL_REGNUM), addr_location); addr_location = gen_rtx_REG (Pmode, SIBCALL_REGNUM); @@ -13781,7 +13811,7 @@ s390_emit_call (rtx addr_location, rtx tls_call, rtx result_reg, if (TARGET_INDIRECT_BRANCH_NOBP_CALL && GET_CODE (addr_location) != SYMBOL_REF - && !plt_call) + && !plt31_call_p) { /* Indirect branch thunks require the target to be a single GPR. */ addr_location = force_reg (Pmode, addr_location); @@ -13833,7 +13863,7 @@ s390_emit_call (rtx addr_location, rtx tls_call, rtx result_reg, insn = emit_call_insn (*call); /* 31-bit PLT stubs and tls calls use the GOT register implicitly. */ - if ((!TARGET_64BIT && plt_call) || tls_call != NULL_RTX) + if (plt31_call_p || tls_call != NULL_RTX) { /* s390_function_ok_for_sibcall should have denied sibcalls in this case. 
*/ @@ -13889,7 +13919,10 @@ s390_emit_tpf_eh_return (rtx target) rtx reg, orig_ra; if (!s390_tpf_eh_return_symbol) - s390_tpf_eh_return_symbol = gen_rtx_SYMBOL_REF (Pmode, "__tpf_eh_return"); + { + s390_tpf_eh_return_symbol = gen_rtx_SYMBOL_REF (Pmode, "__tpf_eh_return"); + SYMBOL_REF_FLAGS (s390_tpf_eh_return_symbol) |= SYMBOL_FLAG_FUNCTION; + } reg = gen_rtx_REG (Pmode, 2); orig_ra = gen_rtx_REG (Pmode, 3); diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md index 0c5b4dc..8ad21b0 100644 --- a/gcc/config/s390/s390.md +++ b/gcc/config/s390/s390.md @@ -79,7 +79,7 @@ UNSPEC_GOTENT UNSPEC_GOT UNSPEC_GOTOFF - UNSPEC_PLT + UNSPEC_PLT31 UNSPEC_PLTOFF ; Literal pool @@ -1906,7 +1906,7 @@ vlgvg\t%0,%v1,0 vleg\t%v0,%1,0 vsteg\t%v1,%0,0 - larl\t%0,%1" + larl\t%0,%1%K1" [(set_attr "op_type" "RI,RI,RI,RI,RI,RIL,RIL,RIL,RRE,RRE,RRE,RXY,RIL,RRE,RXY, RXY,RR,RX,RXY,RX,RXY,RIL,SIL,*,*,RS,RS,VRI,VRR,VRS,VRS, VRX,VRX,RIL") @@ -2180,7 +2180,7 @@ (match_operand:SI 1 "larl_operand" "X"))] "!TARGET_64BIT && !FP_REG_P (operands[0])" - "larl\t%0,%1" + "larl\t%0,%1%K1" [(set_attr "op_type" "RIL") (set_attr "type" "larl") (set_attr "z10prop" "z10_fwd_A1") @@ -10373,7 +10373,7 @@ [(call (mem:QI (match_operand 0 "bras_sym_operand" "X")) (match_operand 1 "const_int_operand" "n"))] "SIBLING_CALL_P (insn) && TARGET_SMALL_EXEC" - "j\t%0" + "j\t%0%K0" [(set_attr "op_type" "RI") (set_attr "type" "branch")]) @@ -10381,7 +10381,7 @@ [(call (mem:QI (match_operand 0 "bras_sym_operand" "X")) (match_operand 1 "const_int_operand" "n"))] "SIBLING_CALL_P (insn)" - "jg\t%0" + "jg\t%0%K0" [(set_attr "op_type" "RIL") (set_attr "type" "branch")]) @@ -10434,7 +10434,7 @@ (call (mem:QI (match_operand 1 "bras_sym_operand" "X")) (match_operand 2 "const_int_operand" "n")))] "SIBLING_CALL_P (insn) && TARGET_SMALL_EXEC" - "j\t%1" + "j\t%1%K1" [(set_attr "op_type" "RI") (set_attr "type" "branch")]) @@ -10443,7 +10443,7 @@ (call (mem:QI (match_operand 1 "bras_sym_operand" "X")) (match_operand 2 "const_int_operand" "n")))] "SIBLING_CALL_P (insn)" - "jg\t%1" + "jg\t%1%K1" [(set_attr "op_type" "RIL") (set_attr "type" "branch")]) @@ -10470,7 +10470,7 @@ "!SIBLING_CALL_P (insn) && TARGET_SMALL_EXEC && GET_MODE (operands[2]) == Pmode" - "bras\t%2,%0" + "bras\t%2,%0%K0" [(set_attr "op_type" "RI") (set_attr "type" "jsr") (set_attr "z196prop" "z196_cracked")]) @@ -10482,7 +10482,7 @@ "!SIBLING_CALL_P (insn) && GET_MODE (operands[2]) == Pmode" - "brasl\t%2,%0" + "brasl\t%2,%0%K0" [(set_attr "op_type" "RIL") (set_attr "type" "jsr") (set_attr "z196prop" "z196_cracked") @@ -10576,7 +10576,7 @@ "!SIBLING_CALL_P (insn) && TARGET_SMALL_EXEC && GET_MODE (operands[3]) == Pmode" - "bras\t%3,%1" + "bras\t%3,%1%K1" [(set_attr "op_type" "RI") (set_attr "type" "jsr") (set_attr "z196prop" "z196_cracked")]) @@ -10589,7 +10589,7 @@ "!SIBLING_CALL_P (insn) && GET_MODE (operands[3]) == Pmode" - "brasl\t%3,%1" + "brasl\t%3,%1%K1" [(set_attr "op_type" "RIL") (set_attr "type" "jsr") (set_attr "z196prop" "z196_cracked") @@ -10720,7 +10720,7 @@ "!SIBLING_CALL_P (insn) && TARGET_SMALL_EXEC && GET_MODE (operands[3]) == Pmode" - "bras\t%3,%1%J4" + "bras\t%3,%1%K1%J4" [(set_attr "op_type" "RI") (set_attr "type" "jsr") (set_attr "z196prop" "z196_cracked")]) @@ -10734,7 +10734,7 @@ "!SIBLING_CALL_P (insn) && GET_MODE (operands[3]) == Pmode" - "brasl\t%3,%1%J4" + "brasl\t%3,%1%K1%J4" [(set_attr "op_type" "RIL") (set_attr "type" "jsr") (set_attr "z196prop" "z196_cracked") @@ -11343,7 +11343,7 @@ [(set (match_operand 0 "register_operand" "=a") (unspec [(label_ref 
(match_operand 1 "" ""))] UNSPEC_MAIN_BASE))] "GET_MODE (operands[0]) == Pmode" - "larl\t%0,%1" + "larl\t%0,%1%K1" [(set_attr "op_type" "RIL") (set_attr "type" "larl") (set_attr "z10prop" "z10_fwd_A1") @@ -11363,7 +11363,7 @@ [(set (match_operand 0 "register_operand" "=a") (unspec [(label_ref (match_operand 1 "" ""))] UNSPEC_RELOAD_BASE))] "GET_MODE (operands[0]) == Pmode" - "larl\t%0,%1" + "larl\t%0,%1%K1" [(set_attr "op_type" "RIL") (set_attr "type" "larl") (set_attr "z10prop" "z10_fwd_A1")]) @@ -12220,7 +12220,7 @@ "" { s390_output_split_stack_data (operands[1], operands[2], operands[3], operands[4]); - return "jg\t%0"; + return "jg\t%0%K0"; } [(set_attr "op_type" "RIL") (set_attr "type" "branch")]) -- cgit v1.1 From bd5b625228d545d5ecb35df24f9f094edc95e3fa Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Tue, 15 Jun 2021 09:35:34 -0500 Subject: rs6000: Initial create of rs6000-gen-builtins.c 2021-04-02 Bill Schmidt gcc/ * config/rs6000/rs6000-gen-builtins.c: New. --- gcc/config/rs6000/rs6000-gen-builtins.c | 165 ++++++++++++++++++++++++++++++++ 1 file changed, 165 insertions(+) create mode 100644 gcc/config/rs6000/rs6000-gen-builtins.c (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-gen-builtins.c b/gcc/config/rs6000/rs6000-gen-builtins.c new file mode 100644 index 0000000..6ab7d7b --- /dev/null +++ b/gcc/config/rs6000/rs6000-gen-builtins.c @@ -0,0 +1,165 @@ +/* Generate built-in function initialization and recognition for Power. + Copyright (C) 2020-21 Free Software Foundation, Inc. + Contributed by Bill Schmidt, IBM + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +/* This program generates built-in function initialization and + recognition code for Power targets, based on text files that + describe the built-in functions and vector overloads: + + rs6000-builtin-new.def Table of built-in functions + rs6000-overload.def Table of overload functions + + Both files group similar functions together in "stanzas," as + described below. + + Each stanza in the built-in function file starts with a line + identifying the circumstances in which the group of functions is + permitted, with the gating predicate in square brackets. For + example, this could be + + [altivec] + + or it could be + + [power9] + + The bracketed gating predicate is the only information allowed on + the stanza header line, other than whitespace. + + Following the stanza header are two lines for each function: the + prototype line and the attributes line. The prototype line has + this format, where the square brackets indicate optional + information and angle brackets indicate required information: + + [kind] (); + + Here [kind] can be one of "const", "pure", or "fpmath"; + is a legal type for a built-in function result; + is the name by which the function can be called; + and is a comma-separated list of legal types + for built-in function arguments. The argument list may be + empty, but the parentheses and semicolon are required. 
+ + The attributes line looks like this: + + {} + + Here is a unique internal identifier for the built-in + function that will be used as part of an enumeration of all + built-in functions; is the define_expand or + define_insn that will be invoked when the call is expanded; + and is a comma-separated list of special + conditions that apply to the built-in function. The attribute + list may be empty, but the braces are required. + + Attributes are strings, such as these: + + init Process as a vec_init function + set Process as a vec_set function + extract Process as a vec_extract function + nosoft Not valid with -msoft-float + ldvec Needs special handling for vec_ld semantics + stvec Needs special handling for vec_st semantics + reve Needs special handling for element reversal + pred Needs special handling for comparison predicates + htm Needs special handling for transactional memory + htmspr HTM function using an SPR + htmcr HTM function using a CR + mma Needs special handling for MMA instructions + quad MMA instruction using a register quad as an input operand + pair MMA instruction using a register pair as an input operand + no32bit Not valid for TARGET_32BIT + 32bit Requires different handling for TARGET_32BIT + cpu This is a "cpu_is" or "cpu_supports" builtin + ldstmask Altivec mask for load or store + lxvrse Needs special handling for load-rightmost, sign-extended + lxvrze Needs special handling for load-rightmost, zero-extended + endian Needs special handling for endianness + + An example stanza might look like this: + +[altivec] + const vsc __builtin_altivec_abs_v16qi (vsc); + ABS_V16QI absv16qi2 {} + const vss __builtin_altivec_abs_v8hi (vss); + ABS_V8HI absv8hi2 {} + + Here "vsc" and "vss" are shorthand for "vector signed char" and + "vector signed short" to shorten line lengths and improve readability. + Note the use of indentation, which is recommended but not required. + + The overload file has more complex stanza headers. Here the stanza + represents all functions with the same overloaded function name: + + [, , [[, ]] ] + + Here the single square brackets are part of the syntax, + is a unique internal identifier for the overload that will be used as + part of an enumeration of all overloaded functions; is the + name that will appear as a #define in rs6000-vecdefines.h; + is the name that is overloaded in the back end; and + is an optional token used to guard the #define with an #ifdef + in rs6000-vecdefines.h. + + Each function entry again has two lines. The first line is again a + prototype line (this time without [kind]): + + (); + + The second line contains the that this particular instance of + the overloaded function maps to. It must match a token that appears in + rs6000-builtin-new.def. Optionally, a second token may appear. If only + one token is on the line, it is also used to build the unique identifier + for the overloaded function. If a second token is present, the second + token is used instead for this purpose. This is necessary in cases + where a built-in function accepts more than one type signature. It is + common to have a built-in function that, for example, specifies a + "vector signed char" argument, but accepts "vector unsigned char" and + "vector bool char" as well because only the mode matters. Note that + the overload resolution mechanism has always handled these cases by + performing fold_convert on vector arguments to hide type mismatches, + and it will continue to do so. 
+ + As a concrete example, __builtin_altivec_mtvscr uses an opaque argument + type for the source operand. Its built-in function id is MTVSCR. The + overloaded function __builtin_vec_mtvscr takes a variety of specific + types, but not all vector types. Each of these maps to the same + __builtin_altivec_mtvscr built-in function, but the overload ID must + be unique, so we must specify the second token as shown here. + + [VEC_MTVSCR, vec_mtvscr, __builtin_vec_mtvscr] + void __builtin_vec_mtvscr (vbc); + MTVSCR MTVSCR_VBC + void __builtin_vec_mtvscr (vsc); + MTVSCR MTVSCR_VSC + ... + + Blank lines may be used as desired in these files between the lines as + defined above; that is, you can introduce as many extra newlines as you + like after a required newline, but nowhere else. Lines beginning with + a semicolon are also treated as blank lines. */ + +#include +#include +#include +#include +#include +#include +#include +#include -- cgit v1.1 From 4a720a9547320699aceda7d2e0b08de5ab40132f Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Fri, 2 Apr 2021 16:23:13 -0500 Subject: rs6000: Add initial input files This patch adds a tiny subset of the built-in and overload descriptions. 2021-04-02 Bill Schmidt gcc/ * config/rs6000/rs6000-builtin-new.def: New. * config/rs6000/rs6000-overload.def: New. --- gcc/config/rs6000/rs6000-builtin-new.def | 199 +++++++++++++++++++++++++++++++ gcc/config/rs6000/rs6000-overload.def | 82 +++++++++++++ 2 files changed, 281 insertions(+) create mode 100644 gcc/config/rs6000/rs6000-builtin-new.def create mode 100644 gcc/config/rs6000/rs6000-overload.def (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-builtin-new.def b/gcc/config/rs6000/rs6000-builtin-new.def new file mode 100644 index 0000000..a84a3de --- /dev/null +++ b/gcc/config/rs6000/rs6000-builtin-new.def @@ -0,0 +1,199 @@ +; Built-in functions for PowerPC. +; Copyright (C) 2020-21 Free Software Foundation, Inc. +; Contributed by Bill Schmidt, IBM +; +; This file is part of GCC. +; +; GCC is free software; you can redistribute it and/or modify it under +; the terms of the GNU General Public License as published by the Free +; Software Foundation; either version 3, or (at your option) any later +; version. +; +; GCC is distributed in the hope that it will be useful, but WITHOUT ANY +; WARRANTY; without even the implied warranty of MERCHANTABILITY or +; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +; for more details. +; +; You should have received a copy of the GNU General Public License +; along with GCC; see the file COPYING3. If not see +; . + + +; Built-in functions in this file are organized into "stanzas", where +; all built-ins in a given stanza are enabled together. Each stanza +; starts with a line identifying the circumstances in which the group of +; functions is permitted, with the gating predicate in square brackets. +; For example, this could be +; +; [altivec] +; +; or it could be +; +; [power9] +; +; The bracketed gating predicate is the only information allowed on +; the stanza header line, other than whitespace. +; +; Following the stanza header are two lines for each function: the +; prototype line and the attributes line. 
The prototype line has +; this format, where the square brackets indicate optional +; information and angle brackets indicate required information: +; +; [kind] (); +; +; Here [kind] can be one of "const", "pure", or "fpmath"; +; is a legal type for a built-in function result; +; is the name by which the function can be called; +; and is a comma-separated list of legal types +; for built-in function arguments. The argument list may be +; empty, but the parentheses and semicolon are required. +; +; A legal type is of the form: +; +; [const] [[signed|unsigned] | ] [*] +; +; where "const" applies only to a of "int". Legal values +; of are (for now): +; +; char +; short +; int +; long +; long double +; long long +; float +; double +; __int128 +; _Float128 +; bool +; string +; _Decimal32 +; _Decimal64 +; _Decimal128 +; __ibm128 +; +; Legal values of are as follows, and are shorthand for +; the associated meaning: +; +; vsc vector signed char +; vuc vector unsigned char +; vbc vector bool char +; vss vector signed short +; vus vector unsigned short +; vbs vector bool short +; vsi vector signed int +; vui vector unsigned int +; vbi vector bool int +; vsll vector signed long long +; vull vector unsigned long long +; vbll vector bool long long +; vsq vector signed __int128 +; vuq vector unsigned __int128 +; vbq vector bool __int128 +; vp vector pixel +; vf vector float +; vd vector double +; v256 __vector_pair +; v512 __vector_quad +; +; For simplicity, We don't support "short int" and "long long int". +; We don't currently support a of "_Float16". "signed" +; and "unsigned" only apply to integral base types. The optional * +; indicates a pointer type. +; +; The attributes line looks like this: +; +; {} +; +; Here is a unique internal identifier for the built-in +; function that will be used as part of an enumeration of all +; built-in functions; is the define_expand or +; define_insn that will be invoked when the call is expanded; +; and is a comma-separated list of special +; conditions that apply to the built-in function. The attribute +; list may be empty, but the braces are required. +; +; Attributes are strings, and the allowed ones are listed below. +; +; init Process as a vec_init function +; set Process as a vec_set function +; extract Process as a vec_extract function +; nosoft Not valid with -msoft-float +; ldvec Needs special handling for vec_ld semantics +; stvec Needs special handling for vec_st semantics +; reve Needs special handling for element reversal +; pred Needs special handling for comparison predicates +; htm Needs special handling for transactional memory +; htmspr HTM function using an SPR +; htmcr HTM function using a CR +; mma Needs special handling for MMA +; quad MMA instruction using a register quad as an input operand +; pair MMA instruction using a register pair as an input operand +; no32bit Not valid for TARGET_32BIT +; 32bit Requires different handling for TARGET_32BIT +; cpu This is a "cpu_is" or "cpu_supports" builtin +; ldstmask Altivec mask for load or store +; lxvrse Needs special handling for load-rightmost, sign-extended +; lxvrze Needs special handling for load-rightmost, zero-extended +; endian Needs special handling for endianness +; +; Each attribute corresponds to extra processing required when +; the built-in is expanded. All such special processing should +; be controlled by an attribute from now on. +; +; It is important to note that each entry's must be +; unique. 
The code generated from this file will call def_builtin +; for each entry, and this can only happen once per name. +; +; The type signature for the builtin must match the modes of the RTL +; pattern . When a builtin is used only as a basis for +; overloading, you can use an arbitrary type for each mode (for example, +; for V8HImode, you could use vp, vss, vus, or vbs). The overloading +; machinery takes care of adding appropriate casts between vectors to +; satisfy impedance matching. The overloaded prototypes are the ones +; that must match what users expect. Thus you will often have a small +; number of entries in this file that correspond to a much greater +; number of entries in rs6000-overload.def. +; +; However, builtins in this file that are expected to be directly called +; by users must have one version for each expected type combination. +; +; Eventually we want to automatically generate built-in documentation +; from the entries in this file. Documenting of built-ins with more +; than one acceptable prototype can be done by cross-referencing +; against rs6000-overload.def and picking up the allowable prototypes +; from there. +; +; Blank lines may be used as desired in this file between the lines as +; defined above; that is, you can introduce as many extra newlines as you +; like after a required newline, but nowhere else. Lines beginning with +; a semicolon are also treated as blank lines. +; +; A const int argument may be restricted to certain values. This is +; indicated by one of the following occurring after the "int" token: +; +; restricts the constant to x bits, interpreted as unsigned +; restricts the constant to the inclusive range [x,y] +; [x,y] restricts the constant to the inclusive range [x,y], +; but only applies if the argument is constant. +; {x,y} restricts the constant to one of two values, x or y. +; +; Here x and y are integer tokens. Note that the "const" token is a +; lie when the restriction is [x,y], but this simplifies the parsing +; significantly and is hopefully forgivable. + + + +; AltiVec builtins. +[altivec] + const vsc __builtin_altivec_abs_v16qi (vsc); + ABS_V16QI absv16qi2 {} + + const vf __builtin_altivec_abs_v4sf (vf); + ABS_V4SF absv4sf2 {} + + const vsi __builtin_altivec_abs_v4si (vsi); + ABS_V4SI absv4si2 {} + + const vss __builtin_altivec_abs_v8hi (vss); + ABS_V8HI absv8hi2 {} diff --git a/gcc/config/rs6000/rs6000-overload.def b/gcc/config/rs6000/rs6000-overload.def new file mode 100644 index 0000000..d8028c9 --- /dev/null +++ b/gcc/config/rs6000/rs6000-overload.def @@ -0,0 +1,82 @@ +; Overloaded built-in functions for PowerPC. +; Copyright (C) 2020-21 Free Software Foundation, Inc. +; Contributed by Bill Schmidt, IBM +; +; This file is part of GCC. +; +; GCC is free software; you can redistribute it and/or modify it under +; the terms of the GNU General Public License as published by the Free +; Software Foundation; either version 3, or (at your option) any later +; version. +; +; GCC is distributed in the hope that it will be useful, but WITHOUT ANY +; WARRANTY; without even the implied warranty of MERCHANTABILITY or +; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +; for more details. +; +; You should have received a copy of the GNU General Public License +; along with GCC; see the file COPYING3. If not see +; . 
+ + +; Overloaded built-in functions in this file are organized into "stanzas", +; where all built-ins in a given stanza have the same overloaded function +; name: +; +; [, , [[, ]] ] +; +; Here the single square brackets are part of the syntax; +; is a unique internal identifier for the overload that will be used as +; part of an enumeration of all overloaded functions; is the +; name that will appear as a #define in rs6000-vecdefines.h; +; is the name that is overloaded in the back end; and +; is an optional token used to guard the #define with an #ifdef +; in rs6000-vecdefines.h. If no #define is desired, the should +; be replaced with the token SKIP. +; +; Each function entry has two lines. The first line is a prototype line. +; See rs6000-builtin-new.def for a description of the prototype line. +; A prototype line in this file differs in that it doesn't have an +; optional [kind] token: +; +; (); +; +; The second line contains the that this particular instance of +; the overloaded function maps to. It must match a token that appears in +; rs6000-builtin-new.def. Optionally, a second token may appear. If only +; one token is on the line, it is also used to build the unique identifier +; for the overloaded function. If a second token is present, the second +; token is used instead for this purpose. This is necessary in cases +; where a built-in function accepts more than one type signature. It is +; common to have a built-in function that, for example, specifies a +; "vector signed char" argument, but accepts "vector unsigned char" and +; "vector bool char" as well because only the mode matters. Note that +; the overload resolution mechanism has always handled these cases by +; performing fold_convert on vector arguments to hide type mismatches, +; and it will continue to do so. +; +; As a concrete example, __builtin_altivec_mtvscr uses an opaque argument +; type for the source operand. Its built-in function id is MTVSCR. The +; overloaded function __builtin_vec_mtvscr takes a variety of specific +; types, but not all vector types. Each of these maps to the same +; __builtin_altivec_mtvscr built-in function, but the overload ID must +; be unique, so we must specify the second token as shown here. +; +;[VEC_MTVSCR, vec_mtvscr, __builtin_vec_mtvscr] +; void __builtin_vec_mtvscr (vbc); +; MTVSCR MTVSCR_VBC +; void __builtin_vec_mtvscr (vsc); +; MTVSCR MTVSCR_VSC +; ... +; +; Blank lines may be used as desired in this file between the lines as +; defined above; that is, you can introduce as many extra newlines as you +; like after a required newline, but nowhere else. Lines beginning with +; a semicolon are also treated as blank lines. + + +[VEC_ABS, vec_abs, __builtin_vec_abs] + vsc __builtin_vec_abs (vsc); + ABS_V16QI + vss __builtin_vec_abs (vss); + ABS_V8HI -- cgit v1.1 From 43fa306f1d723d9d6c0884e38b102b954d3a4c30 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Mon, 7 Jun 2021 11:20:56 -0500 Subject: rs6000: Add file support and functions for diagnostic support 2021-06-07 Bill Schmidt gcc/ * config/rs6000/rs6000-gen-builtins.c (bif_file): New variable. (ovld_file): Likewise. (header_file): Likewise. (init_file): Likewise. (defines_file): Likewise. (pgm_path): Likewise. (bif_path): Likewise. (ovld_path): Likewise. (header_path): Likewise. (init_path): Likewise. (defines_path): Likewise. (LINELEN): New macro. (linebuf): New variable. (line): Likewise. (pos): Likewise. (diag): Likewise. (bif_diag): New function. (ovld_diag): Likewise. 
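The hunk below only introduces the hooks themselves. As a rough sketch of how they are expected to be used by later parsing code (bif_diag, ovld_diag, diag, linebuf and pos are the names introduced by this commit; parse_bif_stanza is a hypothetical caller shown only for illustration), the per-file reporter is selected through the diag pointer and columns are reported one-indexed:

/* Sketch, not part of this patch: intended use of the diagnostic hooks.  */
static void
parse_bif_stanza (void)
{
  diag = &bif_diag;   /* Errors now name the built-in function file.  */
  if (linebuf[pos] != '[')
    (*diag) ("missing '[' at column %d.\n", pos + 1);
}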
--- gcc/config/rs6000/rs6000-gen-builtins.c | 47 +++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-gen-builtins.c b/gcc/config/rs6000/rs6000-gen-builtins.c index 6ab7d7b..3c53c34 100644 --- a/gcc/config/rs6000/rs6000-gen-builtins.c +++ b/gcc/config/rs6000/rs6000-gen-builtins.c @@ -163,3 +163,50 @@ along with GCC; see the file COPYING3. If not see #include #include #include + +/* Input and output file descriptors and pathnames. */ +static FILE *bif_file; +static FILE *ovld_file; +static FILE *header_file; +static FILE *init_file; +static FILE *defines_file; + +static const char *pgm_path; +static const char *bif_path; +static const char *ovld_path; +static const char *header_path; +static const char *init_path; +static const char *defines_path; + +/* Position information. Note that "pos" is zero-indexed, but users + expect one-indexed column information, so representations of "pos" + as columns in diagnostic messages must be adjusted. */ +#define LINELEN 1024 +static char linebuf[LINELEN]; +static int line; +static int pos; + +/* Pointer to a diagnostic function. */ +static void (*diag) (const char *, ...) + __attribute__ ((format (printf, 1, 2))); + +/* Custom diagnostics. */ +static void __attribute__ ((format (printf, 1, 2))) +bif_diag (const char * fmt, ...) +{ + va_list args; + fprintf (stderr, "%s:%d: ", bif_path, line); + va_start (args, fmt); + vfprintf (stderr, fmt, args); + va_end (args); +} + +static void __attribute__ ((format (printf, 1, 2))) +ovld_diag (const char * fmt, ...) +{ + va_list args; + fprintf (stderr, "%s:%d: ", ovld_path, line); + va_start (args, fmt); + vfprintf (stderr, fmt, args); + va_end (args); +} -- cgit v1.1 From 9abd2ac5a9b694bcdd871165d109f94866032534 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Fri, 16 Jul 2021 12:21:08 -0400 Subject: rs6000: Add helper functions for parsing 2021-07-16 Bill Schmidt gcc/ * config/rs6000/rs6000-gen-builtins.c (consume_whitespace): New function. (advance_line): Likewise. (safe_inc_pos): Likewise. (match_identifier): Likewise. (match_integer): Likewise. (match_to_right_bracket): Likewise. --- gcc/config/rs6000/rs6000-gen-builtins.c | 110 ++++++++++++++++++++++++++++++++ 1 file changed, 110 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-gen-builtins.c b/gcc/config/rs6000/rs6000-gen-builtins.c index 3c53c34..7923cc4 100644 --- a/gcc/config/rs6000/rs6000-gen-builtins.c +++ b/gcc/config/rs6000/rs6000-gen-builtins.c @@ -210,3 +210,113 @@ ovld_diag (const char * fmt, ...) vfprintf (stderr, fmt, args); va_end (args); } + +/* Pass over whitespace (other than a newline, which terminates the scan). */ +static void +consume_whitespace (void) +{ + while (pos < LINELEN && isspace(linebuf[pos]) && linebuf[pos] != '\n') + pos++; + return; +} + +/* Get the next nonblank, noncomment line, returning 0 on EOF, 1 otherwise. */ +static int +advance_line (FILE *file) +{ + while (1) + { + /* Read ahead one line and check for EOF. 
*/ + if (!fgets (linebuf, sizeof linebuf, file)) + return 0; + line++; + size_t len = strlen (linebuf); + if (linebuf[len - 1] != '\n') + (*diag) ("line doesn't terminate with newline\n"); + pos = 0; + consume_whitespace (); + if (linebuf[pos] != '\n' && linebuf[pos] != ';') + return 1; + } +} + +static inline void +safe_inc_pos (void) +{ + if (pos++ >= LINELEN) + { + (*diag) ("line length overrun.\n"); + exit (1); + } +} + +/* Match an identifier, returning NULL on failure, else a pointer to a + buffer containing the identifier. */ +static char * +match_identifier (void) +{ + int lastpos = pos - 1; + while (isalnum (linebuf[lastpos + 1]) || linebuf[lastpos + 1] == '_') + ++lastpos; + + if (lastpos < pos) + return 0; + + char *buf = (char *) malloc (lastpos - pos + 2); + memcpy (buf, &linebuf[pos], lastpos - pos + 1); + buf[lastpos - pos + 1] = '\0'; + + pos = lastpos + 1; + return buf; +} + +/* Match an integer and return the string representing its value, + or a null string on failure. */ +static char * +match_integer (void) +{ + int startpos = pos; + if (linebuf[pos] == '-') + safe_inc_pos (); + + int lastpos = pos - 1; + while (isdigit (linebuf[lastpos + 1])) + ++lastpos; + + if (lastpos < pos) + return NULL; + + pos = lastpos + 1; + char *buf = (char *) malloc (lastpos - startpos + 2); + memcpy (buf, &linebuf[startpos], lastpos - startpos + 1); + buf[lastpos - startpos + 1] = '\0'; + return buf; +} + +/* Match a string up to but not including a ']', and return its value, + or zero if there is nothing before the ']'. Error if we don't find + such a character. */ +static const char * +match_to_right_bracket (void) +{ + int lastpos = pos - 1; + while (linebuf[lastpos + 1] != ']') + { + if (linebuf[lastpos + 1] == '\n') + { + (*diag) ("no ']' found before end of line.\n"); + exit (1); + } + ++lastpos; + } + + if (lastpos < pos) + return 0; + + char *buf = (char *) malloc (lastpos - pos + 2); + memcpy (buf, &linebuf[pos], lastpos - pos + 1); + buf[lastpos - pos + 1] = '\0'; + + pos = lastpos + 1; + return buf; +} -- cgit v1.1 From 0d685dfbb603b631c0e1d121dd73e73d33573ec5 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Mon, 7 Jun 2021 11:49:56 -0500 Subject: rs6000: Add functions for matching types, part 1 of 3 2021-06-07 Bill Schmidt gcc/ * config/rs6000/rs6000-gen-builtins.c (void_status): New enum. (basetype): Likewise. (typeinfo): Likewise. (handle_pointer): New function. (match_basetype): New stub function. (match_const_restriction): Likewise. (match_type): New function. --- gcc/config/rs6000/rs6000-gen-builtins.c | 367 ++++++++++++++++++++++++++++++++ 1 file changed, 367 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-gen-builtins.c b/gcc/config/rs6000/rs6000-gen-builtins.c index 7923cc4..3845071 100644 --- a/gcc/config/rs6000/rs6000-gen-builtins.c +++ b/gcc/config/rs6000/rs6000-gen-builtins.c @@ -186,6 +186,52 @@ static char linebuf[LINELEN]; static int line; static int pos; +/* Used to determine whether a type can be void (only return types). */ +enum void_status +{ + VOID_NOTOK, + VOID_OK +}; + +/* Legal base types for an argument or return type. */ +enum basetype +{ + BT_CHAR, + BT_SHORT, + BT_INT, + BT_LONG, + BT_LONGLONG, + BT_FLOAT, + BT_DOUBLE, + BT_LONGDOUBLE, + BT_INT128, + BT_FLOAT128, + BT_BOOL, + BT_STRING, + BT_DECIMAL32, + BT_DECIMAL64, + BT_DECIMAL128, + BT_IBM128, + BT_VPAIR, + BT_VQUAD +}; + +/* Type modifiers for an argument or return type. 
*/ +struct typeinfo +{ + char isvoid; + char isconst; + char isvector; + char issigned; + char isunsigned; + char isbool; + char ispixel; + char ispointer; + basetype base; + char *val1; + char *val2; +}; + /* Pointer to a diagnostic function. */ static void (*diag) (const char *, ...) __attribute__ ((format (printf, 1, 2))); @@ -320,3 +366,324 @@ match_to_right_bracket (void) pos = lastpos + 1; return buf; } + +static inline void +handle_pointer (typeinfo *typedata) +{ + consume_whitespace (); + if (linebuf[pos] == '*') + { + typedata->ispointer = 1; + safe_inc_pos (); + } +} + +/* Match one of the allowable base types. Consumes one token unless the + token is "long", which must be paired with a second "long". Optionally + consumes a following '*' token for pointers. Return 1 for success, + 0 for failure. */ +static int +match_basetype (typeinfo *typedata) +{ + return 1; +} + +/* A const int argument may be restricted to certain values. This is + indicated by one of the following occurring after the "int' token: + + restricts the constant to x bits, interpreted as unsigned + restricts the constant to the inclusive range [x,y] + [x,y] restricts the constant to the inclusive range [x,y], + but only applies if the argument is constant. + {x,y} restricts the constant to one of two values, x or y. + + Here x and y are integer tokens. Note that the "const" token is a + lie when the restriction is [x,y], but this simplifies the parsing + significantly and is hopefully forgivable. + + Return 1 for success, else 0. */ +static int +match_const_restriction (typeinfo *typedata) +{ + return 1; +} + +/* Look for a type, which can be terminated by a token that is not part of + a type, a comma, or a closing parenthesis. Place information about the + type in TYPEDATA. Return 1 for success, 0 for failure. */ +static int +match_type (typeinfo *typedata, int voidok) +{ + /* A legal type is of the form: + + [const] [[signed|unsigned] | ] [*] + + Legal values of are (for now): + + char + short + int + long + long double + long long + float + double + __int128 + _Float128 + bool + string + _Decimal32 + _Decimal64 + _Decimal128 + __ibm128 + + Legal values of are as follows, and are shorthand for + the associated meaning: + + vsc vector signed char + vuc vector unsigned char + vbc vector bool char + vss vector signed short + vus vector unsigned short + vbs vector bool short + vsi vector signed int + vui vector unsigned int + vbi vector bool int + vsll vector signed long long + vull vector unsigned long long + vbll vector bool long long + vsq vector signed __int128 + vuq vector unsigned __int128 + vbq vector bool __int128 + vp vector pixel + vf vector float + vd vector double + v256 __vector_pair + v512 __vector_quad + + For simplicity, We don't support "short int" and "long long int". + We don't currently support a of "_Float16". "signed" + and "unsigned" only apply to integral base types. The optional * + indicates a pointer type. 
*/ + + consume_whitespace (); + memset (typedata, 0, sizeof *typedata); + int oldpos = pos; + + char *token = match_identifier (); + if (!token) + return 0; + + if (!strcmp (token, "const")) + { + typedata->isconst = 1; + consume_whitespace (); + oldpos = pos; + token = match_identifier (); + } + + if (!strcmp (token, "void")) + typedata->isvoid = 1; + + if (!strcmp (token, "vsc")) + { + typedata->isvector = 1; + typedata->issigned = 1; + typedata->base = BT_CHAR; + handle_pointer (typedata); + return 1; + } + else if (!strcmp (token, "vuc")) + { + typedata->isvector = 1; + typedata->isunsigned = 1; + typedata->base = BT_CHAR; + handle_pointer (typedata); + return 1; + } + else if (!strcmp (token, "vbc")) + { + typedata->isvector = 1; + typedata->isbool = 1; + typedata->base = BT_CHAR; + handle_pointer (typedata); + return 1; + } + else if (!strcmp (token, "vss")) + { + typedata->isvector = 1; + typedata->issigned = 1; + typedata->base = BT_SHORT; + handle_pointer (typedata); + return 1; + } + else if (!strcmp (token, "vus")) + { + typedata->isvector = 1; + typedata->isunsigned = 1; + typedata->base = BT_SHORT; + handle_pointer (typedata); + return 1; + } + else if (!strcmp (token, "vbs")) + { + typedata->isvector = 1; + typedata->isbool = 1; + typedata->base = BT_SHORT; + handle_pointer (typedata); + return 1; + } + else if (!strcmp (token, "vsi")) + { + typedata->isvector = 1; + typedata->issigned = 1; + typedata->base = BT_INT; + handle_pointer (typedata); + return 1; + } + else if (!strcmp (token, "vui")) + { + typedata->isvector = 1; + typedata->isunsigned = 1; + typedata->base = BT_INT; + handle_pointer (typedata); + return 1; + } + else if (!strcmp (token, "vbi")) + { + typedata->isvector = 1; + typedata->isbool = 1; + typedata->base = BT_INT; + handle_pointer (typedata); + return 1; + } + else if (!strcmp (token, "vsll")) + { + typedata->isvector = 1; + typedata->issigned = 1; + typedata->base = BT_LONGLONG; + handle_pointer (typedata); + return 1; + } + else if (!strcmp (token, "vull")) + { + typedata->isvector = 1; + typedata->isunsigned = 1; + typedata->base = BT_LONGLONG; + handle_pointer (typedata); + return 1; + } + else if (!strcmp (token, "vbll")) + { + typedata->isvector = 1; + typedata->isbool = 1; + typedata->base = BT_LONGLONG; + handle_pointer (typedata); + return 1; + } + else if (!strcmp (token, "vsq")) + { + typedata->isvector = 1; + typedata->issigned = 1; + typedata->base = BT_INT128; + handle_pointer (typedata); + return 1; + } + else if (!strcmp (token, "vuq")) + { + typedata->isvector = 1; + typedata->isunsigned = 1; + typedata->base = BT_INT128; + handle_pointer (typedata); + return 1; + } + else if (!strcmp (token, "vbq")) + { + typedata->isvector = 1; + typedata->isbool = 1; + typedata->base = BT_INT128; + handle_pointer (typedata); + return 1; + } + else if (!strcmp (token, "vp")) + { + typedata->isvector = 1; + typedata->ispixel = 1; + typedata->base = BT_SHORT; + handle_pointer (typedata); + return 1; + } + else if (!strcmp (token, "vf")) + { + typedata->isvector = 1; + typedata->base = BT_FLOAT; + handle_pointer (typedata); + return 1; + } + else if (!strcmp (token, "vd")) + { + typedata->isvector = 1; + typedata->base = BT_DOUBLE; + handle_pointer (typedata); + return 1; + } + else if (!strcmp (token, "v256")) + { + typedata->isvector = 1; + typedata->base = BT_VPAIR; + handle_pointer (typedata); + return 1; + } + else if (!strcmp (token, "v512")) + { + typedata->isvector = 1; + typedata->base = BT_VQUAD; + handle_pointer (typedata); + return 1; + } + 
else if (!strcmp (token, "signed")) + typedata->issigned = 1; + else if (!strcmp (token, "unsigned")) + typedata->isunsigned = 1; + else if (!typedata->isvoid && !typedata->isconst) + { + /* Push back token. */ + pos = oldpos; + return match_basetype (typedata); + } + + if (typedata->isvoid) + { + consume_whitespace (); + if (linebuf[pos] == '*') + { + typedata->ispointer = 1; + safe_inc_pos (); + } + else if (!voidok) + return 0; + return 1; + } + + if (!typedata->issigned && !typedata->isunsigned) + pos = oldpos; + if (!match_basetype (typedata)) + return 0; + + if (typedata->isconst) + { + if (typedata->ispointer) + return 1; + if (typedata->base != BT_INT) + { + (*diag)("'const' at %d requires pointer or integer type", + oldpos + 1); + return 0; + } + consume_whitespace (); + if (linebuf[pos] == '<' || linebuf[pos] == '{' || linebuf[pos] == '[') + return match_const_restriction (typedata); + } + + return 1; +} -- cgit v1.1 From 81736934dd7928f67dacb6f0a5656b8622b0ff46 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Fri, 2 Apr 2021 16:31:38 -0500 Subject: rs6000: Add functions for matching types, part 2 of 3 2021-04-02 Bill Schmidt gcc/ * config/rs6000/rs6000-gen-builtins.c (match_basetype): Implement. --- gcc/config/rs6000/rs6000-gen-builtins.c | 64 +++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-gen-builtins.c b/gcc/config/rs6000/rs6000-gen-builtins.c index 3845071..44d6e8a 100644 --- a/gcc/config/rs6000/rs6000-gen-builtins.c +++ b/gcc/config/rs6000/rs6000-gen-builtins.c @@ -385,6 +385,70 @@ handle_pointer (typeinfo *typedata) static int match_basetype (typeinfo *typedata) { + consume_whitespace (); + int oldpos = pos; + char *token = match_identifier (); + if (!token) + { + (*diag) ("missing base type in return type at column %d\n", pos + 1); + return 0; + } + + if (!strcmp (token, "char")) + typedata->base = BT_CHAR; + else if (!strcmp (token, "short")) + typedata->base = BT_SHORT; + else if (!strcmp (token, "int")) + typedata->base = BT_INT; + else if (!strcmp (token, "long")) + { + consume_whitespace (); + oldpos = pos; + char *mustbelongordbl = match_identifier (); + if (!mustbelongordbl) + typedata->base = BT_LONG; + else if (!strcmp (mustbelongordbl, "long")) + typedata->base = BT_LONGLONG; + else if (!strcmp (mustbelongordbl, "double")) + typedata->base = BT_LONGDOUBLE; + else + /* Speculatively accept "long" here and push back the token. + This occurs when "long" is a return type and the next token + is the function name. */ + { + typedata->base = BT_LONG; + pos = oldpos; + } + } + else if (!strcmp (token, "float")) + typedata->base = BT_FLOAT; + else if (!strcmp (token, "double")) + typedata->base = BT_DOUBLE; + else if (!strcmp (token, "__int128")) + typedata->base = BT_INT128; + else if (!strcmp (token, "_Float128")) + typedata->base = BT_FLOAT128; + else if (!strcmp (token, "bool")) + typedata->base = BT_BOOL; + /* A "string" is a special "const char *" -- we need it because it + cannot match either signed or unsigned char *. 
*/ + else if (!strcmp (token, "string")) + typedata->base = BT_STRING; + else if (!strcmp (token, "_Decimal32")) + typedata->base = BT_DECIMAL32; + else if (!strcmp (token, "_Decimal64")) + typedata->base = BT_DECIMAL64; + else if (!strcmp (token, "_Decimal128")) + typedata->base = BT_DECIMAL128; + else if (!strcmp (token, "__ibm128")) + typedata->base = BT_IBM128; + else + { + (*diag) ("unrecognized base type at column %d\n", oldpos + 1); + return 0; + } + + handle_pointer (typedata); return 1; } -- cgit v1.1 From e11b02f17206af70f72a82ff1167a2676d5b18c9 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Mon, 7 Jun 2021 12:03:49 -0500 Subject: rs6000: Add functions for matching types, part 3 of 3 2021-06-07 Bill Schmidt gcc/ * config/rs6000/rs6000-gen-builtins.c (restriction): New enum. (typeinfo): Add restr field. (match_bracketed_pair): New function. (match_const_restriction): Implement. --- gcc/config/rs6000/rs6000-gen-builtins.c | 115 +++++++++++++++++++++++++++++++- 1 file changed, 114 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-gen-builtins.c b/gcc/config/rs6000/rs6000-gen-builtins.c index 44d6e8a..34566fc 100644 --- a/gcc/config/rs6000/rs6000-gen-builtins.c +++ b/gcc/config/rs6000/rs6000-gen-builtins.c @@ -216,6 +216,22 @@ enum basetype BT_VQUAD }; +/* Ways in which a const int value can be restricted. RES_BITS indicates + that the integer is restricted to val1 bits, interpreted as an unsigned + number. RES_RANGE indicates that the integer is restricted to values + between val1 and val2, inclusive. RES_VAR_RANGE is like RES_RANGE, but + the argument may be variable, so it can only be checked if it is constant. + RES_VALUES indicates that the integer must have one of the values val1 + or val2. */ +enum restriction +{ + RES_NONE, + RES_BITS, + RES_RANGE, + RES_VAR_RANGE, + RES_VALUES +}; + /* Type modifiers for an argument or return type. */ struct typeinfo { @@ -228,6 +244,7 @@ struct typeinfo char ispixel; char ispointer; basetype base; + restriction restr; char *val1; char *val2; }; @@ -452,6 +469,53 @@ match_basetype (typeinfo *typedata) return 1; } +/* Helper routine for match_const_restriction. */ +static int +match_bracketed_pair (typeinfo *typedata, char open, char close, + restriction restr) +{ + if (linebuf[pos] == open) + { + safe_inc_pos (); + int oldpos = pos; + char *x = match_integer (); + if (x == NULL) + { + (*diag) ("malformed integer at column %d.\n", oldpos + 1); + return 0; + } + consume_whitespace (); + if (linebuf[pos] != ',') + { + (*diag) ("missing comma at column %d.\n", pos + 1); + return 0; + } + safe_inc_pos (); + consume_whitespace (); + oldpos = pos; + char *y = match_integer (); + if (y == NULL) + { + (*diag) ("malformed integer at column %d.\n", oldpos + 1); + return 0; + } + typedata->restr = restr; + typedata->val1 = x; + typedata->val2 = y; + + consume_whitespace (); + if (linebuf[pos] != close) + { + (*diag) ("malformed restriction at column %d.\n", pos + 1); + return 0; + } + safe_inc_pos (); + return 1; + } + + return 0; +} + /* A const int argument may be restricted to certain values. 
This is indicated by one of the following occurring after the "int' token: @@ -469,7 +533,56 @@ match_basetype (typeinfo *typedata) static int match_const_restriction (typeinfo *typedata) { - return 1; + int oldpos = pos; + if (linebuf[pos] == '<') + { + safe_inc_pos (); + oldpos = pos; + char *x = match_integer (); + if (x == NULL) + { + (*diag) ("malformed integer at column %d.\n", oldpos + 1); + return 0; + } + consume_whitespace (); + if (linebuf[pos] == '>') + { + typedata->restr = RES_BITS; + typedata->val1 = x; + safe_inc_pos (); + return 1; + } + else if (linebuf[pos] != ',') + { + (*diag) ("malformed restriction at column %d.\n", pos + 1); + return 0; + } + safe_inc_pos (); + oldpos = pos; + char *y = match_integer (); + if (y == NULL) + { + (*diag) ("malformed integer at column %d.\n", oldpos + 1); + return 0; + } + typedata->restr = RES_RANGE; + typedata->val1 = x; + typedata->val2 = y; + + consume_whitespace (); + if (linebuf[pos] != '>') + { + (*diag) ("malformed restriction at column %d.\n", pos + 1); + return 0; + } + safe_inc_pos (); + return 1; + } + else if (match_bracketed_pair (typedata, '{', '}', RES_VALUES) + || match_bracketed_pair (typedata, '[', ']', RES_VAR_RANGE)) + return 1; + + return 0; } /* Look for a type, which can be terminated by a token that is not part of -- cgit v1.1 From fa5f8b49e55caf5bb341f5eb6b5ab828b9286425 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Tue, 8 Jun 2021 09:33:40 -0500 Subject: rs6000: Red-black tree implementation for balanced tree search 2021-06-08 Bill Schmidt gcc/ * config/rs6000/rbtree.c: New file. * config/rs6000/rbtree.h: New file. --- gcc/config/rs6000/rbtree.c | 242 +++++++++++++++++++++++++++++++++++++++++++++ gcc/config/rs6000/rbtree.h | 52 ++++++++++ 2 files changed, 294 insertions(+) create mode 100644 gcc/config/rs6000/rbtree.c create mode 100644 gcc/config/rs6000/rbtree.h (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rbtree.c b/gcc/config/rs6000/rbtree.c new file mode 100644 index 0000000..37a559c --- /dev/null +++ b/gcc/config/rs6000/rbtree.c @@ -0,0 +1,242 @@ +/* Partial red-black tree implementation for rs6000-gen-builtins.c. + Copyright (C) 2020-21 Free Software Foundation, Inc. + Contributed by Bill Schmidt, IBM + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#include +#include +#include +#include +#include "rbtree.h" + +/* Initialize a red-black tree. */ +void +rbt_new (struct rbt_strings *t) +{ + t->rbt_nil = (rbt_string_node *) malloc (sizeof (rbt_string_node)); + t->rbt_nil->color = RBT_BLACK; + t->rbt_root = t->rbt_nil; +} + +/* Create a new node to be inserted into the red-black tree. An inserted + node starts out red. 
*/ +static struct rbt_string_node * +rbt_create_node (struct rbt_strings *t, char *str) +{ + struct rbt_string_node *nodeptr + = (struct rbt_string_node *) malloc (sizeof (rbt_string_node)); + nodeptr->str = str; + nodeptr->left = t->rbt_nil; + nodeptr->right = t->rbt_nil; + nodeptr->par = NULL; + nodeptr->color = RBT_RED; + return nodeptr; +} + +/* Perform a left-rotate operation on NODE in the red-black tree. */ +static void +rbt_left_rotate (struct rbt_strings *t, struct rbt_string_node *node) +{ + struct rbt_string_node *right = node->right; + assert (right); + + /* Turn RIGHT's left subtree into NODE's right subtree. */ + node->right = right->left; + if (right->left != t->rbt_nil) + right->left->par = node; + + /* Link NODE's parent to RIGHT. */ + right->par = node->par; + + if (node->par == t->rbt_nil) + t->rbt_root = right; + else if (node == node->par->left) + node->par->left = right; + else + node->par->right = right; + + /* Put NODE on RIGHT's left. */ + right->left = node; + node->par = right; +} + +/* Perform a right-rotate operation on NODE in the red-black tree. */ +static void +rbt_right_rotate (struct rbt_strings *t, struct rbt_string_node *node) +{ + struct rbt_string_node *left = node->left; + assert (left); + + /* Turn LEFT's right subtree into NODE's left subtree. */ + node->left = left->right; + if (left->right != t->rbt_nil) + left->right->par = node; + + /* Link NODE's parent to LEFT. */ + left->par = node->par; + + if (node->par == t->rbt_nil) + t->rbt_root = left; + else if (node == node->par->right) + node->par->right = left; + else + node->par->left = left; + + /* Put NODE on LEFT's right. */ + left->right = node; + node->par = left; +} + +/* Insert STR into the tree, returning 1 for success and 0 if STR already + appears in the tree. */ +int +rbt_insert (struct rbt_strings *t, char *str) +{ + struct rbt_string_node *curr = t->rbt_root; + struct rbt_string_node *trail = t->rbt_nil; + + while (curr != t->rbt_nil) + { + trail = curr; + int cmp = strcmp (str, curr->str); + if (cmp < 0) + curr = curr->left; + else if (cmp > 0) + curr = curr->right; + else + return 0; + } + + struct rbt_string_node *fresh = rbt_create_node (t, str); + fresh->par = trail; + + if (trail == t->rbt_nil) + t->rbt_root = fresh; + else if (strcmp (fresh->str, trail->str) < 0) + trail->left = fresh; + else + trail->right = fresh; + + fresh->left = t->rbt_nil; + fresh->right = t->rbt_nil; + + /* FRESH has now been inserted as a red leaf. If we have invalidated + one of the following preconditions, we must fix things up: + (a) If a node is red, both of its children are black. + (b) The root must be black. + Note that only (a) or (b) applies at any given time during the + process. This algorithm works up the tree from NEW looking + for a red child with a red parent, and cleaning that up. If the + root ends up red, it gets turned black at the end. */ + curr = fresh; + while (curr->par->color == RBT_RED) + if (curr->par == curr->par->par->left) + { + struct rbt_string_node *uncle = curr->par->par->right; + if (uncle->color == RBT_RED) + { + curr->par->color = RBT_BLACK; + uncle->color = RBT_BLACK; + curr->par->par->color = RBT_RED; + curr = curr->par->par; + } + else if (curr == curr->par->right) + { + curr = curr->par; + rbt_left_rotate (t, curr); + } + else + { + curr->par->color = RBT_BLACK; + curr->par->par->color = RBT_RED; + rbt_right_rotate (t, curr->par->par); + } + } + else /* curr->par == curr->par->par->right */ + { + /* Gender-neutral formations are awkward, so let's be fair. 
;-) + ("Parent-sibling" is just awful.) */ + struct rbt_string_node *aunt = curr->par->par->left; + if (aunt->color == RBT_RED) + { + curr->par->color = RBT_BLACK; + aunt->color = RBT_BLACK; + curr->par->par->color = RBT_RED; + curr = curr->par->par; + } + else if (curr == curr->par->left) + { + curr = curr->par; + rbt_right_rotate (t, curr); + } + else + { + curr->par->color = RBT_BLACK; + curr->par->par->color = RBT_RED; + rbt_left_rotate (t, curr->par->par); + } + } + + t->rbt_root->color = RBT_BLACK; + return 1; +} + +/* Return 1 if STR is in the red-black tree, else 0. */ +int +rbt_find (struct rbt_strings *t, char *str) +{ + struct rbt_string_node *curr = t->rbt_root; + + while (curr != t->rbt_nil) + { + int cmp = strcmp (str, curr->str); + if (cmp < 0) + curr = curr->left; + else if (cmp > 0) + curr = curr->right; + else + return 1; + } + + return 0; +} + +/* Inorder dump of the binary search tree. */ +void +rbt_dump (struct rbt_strings *t, struct rbt_string_node *subtree) +{ + if (subtree != t->rbt_nil) + { + rbt_dump (t, subtree->left); + fprintf (stderr, "%s\n", subtree->str); + rbt_dump (t, subtree->right); + } +} + +/* Inorder call-back for iteration over the tree. */ +void +rbt_inorder_callback (struct rbt_strings *t, struct rbt_string_node *subtree, + void (*fn) (char *)) +{ + if (subtree != t->rbt_nil) + { + rbt_inorder_callback (t, subtree->left, fn); + (*fn) (subtree->str); + rbt_inorder_callback (t, subtree->right, fn); + } +} diff --git a/gcc/config/rs6000/rbtree.h b/gcc/config/rs6000/rbtree.h new file mode 100644 index 0000000..fab0001 --- /dev/null +++ b/gcc/config/rs6000/rbtree.h @@ -0,0 +1,52 @@ +/* Partial red-black tree implementation for rs6000-gen-builtins.c. + Copyright (C) 2020-21 Free Software Foundation, Inc. + Contributed by Bill Schmidt, IBM + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +/* Red-black binary search tree on strings. Presently we don't support + deletes; only insert/find operations are implemented. */ +enum rbt_color + { + RBT_BLACK, + RBT_RED + }; + +struct rbt_string_node { + char *str; + struct rbt_string_node *left; + struct rbt_string_node *right; + struct rbt_string_node *par; + enum rbt_color color; +}; + +/* Root and sentinel nodes of a red-black tree. + rbt_nil points to a sentinel node, which is the parent of root + and the child of every node without a "real" left or right child. + rbt_root points to the root of the tree, if it exists yet. The + root and sentinel nodes are always black. */ +struct rbt_strings { + struct rbt_string_node *rbt_nil; + struct rbt_string_node *rbt_root; +}; + +void rbt_new (struct rbt_strings *); +int rbt_insert (struct rbt_strings *, char *); +int rbt_find (struct rbt_strings *, char *); +void rbt_dump (struct rbt_strings *, struct rbt_string_node *); +void rbt_inorder_callback (struct rbt_strings *, struct rbt_string_node *, + void (*) (char *)); -- cgit v1.1 From 5586e7e85de381f3df843091494889f8ad8e3e1f Mon Sep 17 00:00:00 2001 From: "H.J. 
Lu" Date: Sun, 18 Jul 2021 06:12:29 -0700 Subject: x86: Don't issue vzeroupper if callee returns AVX register Don't issue vzeroupper before function call if callee returns AVX register since callee must be compiled with AVX. gcc/ PR target/101495 * config/i386/i386.c (ix86_check_avx_upper_stores): Moved before ix86_avx_u128_mode_needed. (ix86_avx_u128_mode_needed): Return AVX_U128_DIRTY if callee returns AVX register. gcc/testsuite/ PR target/101495 * gcc.target/i386/avx-vzeroupper-28.c: New test. --- gcc/config/i386/i386.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 9d74b7a..8481693 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -14093,6 +14093,18 @@ ix86_check_avx_upper_register (const_rtx exp) && GET_MODE_BITSIZE (GET_MODE (exp)) > 128); } +/* Check if a 256bit or 512bit AVX register is referenced in stores. */ + +static void +ix86_check_avx_upper_stores (rtx dest, const_rtx, void *data) + { + if (ix86_check_avx_upper_register (dest)) + { + bool *used = (bool *) data; + *used = true; + } + } + /* Return needed mode for entity in optimize_mode_switching pass. */ static int @@ -14117,6 +14129,14 @@ ix86_avx_u128_mode_needed (rtx_insn *insn) } } + /* Needed mode is set to AVX_U128_CLEAN if there are no 256bit + nor 512bit registers used in the function return register. */ + bool avx_upper_reg_found = false; + note_stores (insn, ix86_check_avx_upper_stores, + &avx_upper_reg_found); + if (avx_upper_reg_found) + return AVX_U128_DIRTY; + /* If the function is known to preserve some SSE registers, RA and previous passes can legitimately rely on that for modes wider than 256 bits. It's only safe to issue a @@ -14217,18 +14237,6 @@ ix86_mode_needed (int entity, rtx_insn *insn) return 0; } -/* Check if a 256bit or 512bit AVX register is referenced in stores. */ - -static void -ix86_check_avx_upper_stores (rtx dest, const_rtx, void *data) - { - if (ix86_check_avx_upper_register (dest)) - { - bool *used = (bool *) data; - *used = true; - } - } - /* Calculate mode of upper 128bit AVX registers after the insn. */ static int -- cgit v1.1 From db95ac7745b284d1fd667ee6262b4afc778fe074 Mon Sep 17 00:00:00 2001 From: Andrew Pinski Date: Sat, 17 Jul 2021 01:16:28 -0700 Subject: [AARCH64] Fix PR 101205: csinv does not have an zero_extend version So the problem is even though there was a csneg with a zero_extend in the front, there was not one for csinv. This fixes it by extending that pattern. OK? Bootstrapped and tested on aarch64-linux-gnu with no regressions. gcc/ChangeLog: PR target/101205 * config/aarch64/aarch64.md (csneg3_uxtw_insn): Rename to ... (*cs3_uxtw_insn4): and extend to NEG_NOT. gcc/testsuite/ChangeLog: PR target/101205 * gcc.target/aarch64/csinv-neg-1.c: New test. 
--- gcc/config/aarch64/aarch64.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index f12a0be..8cd259f 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -4203,15 +4203,15 @@ [(set_attr "type" "csel")] ) -(define_insn "csneg3_uxtw_insn" +(define_insn "*cs3_uxtw_insn4" [(set (match_operand:DI 0 "register_operand" "=r") (zero_extend:DI (if_then_else:SI (match_operand 1 "aarch64_comparison_operation" "") - (neg:SI (match_operand:SI 2 "register_operand" "r")) + (NEG_NOT:SI (match_operand:SI 2 "register_operand" "r")) (match_operand:SI 3 "aarch64_reg_or_zero" "rZ"))))] "" - "csneg\\t%w0, %w3, %w2, %M1" + "cs\\t%w0, %w3, %w2, %M1" [(set_attr "type" "csel")] ) -- cgit v1.1 From aad32a00b7d2b64ae158b2b167768a9ae3e20f6e Mon Sep 17 00:00:00 2001 From: Andrew Stubbs Date: Thu, 8 Jul 2021 15:47:53 +0100 Subject: amdgcn: Add -mxnack and -msram-ecc [PR 100208] gcc/ChangeLog: PR target/100208 * config/gcn/gcn-hsa.h (DRIVER_SELF_SPECS): New. (ASM_SPEC): Set -mattr for xnack and sram-ecc. * config/gcn/gcn-opts.h (enum sram_ecc_type): New. * config/gcn/gcn-valu.md: Add a warning comment. * config/gcn/gcn.c (gcn_option_override): Add "sorry" for -mxnack. (output_file_start): Add xnack and sram-ecc state to ".amdgcn_target". * config/gcn/gcn.md: Add a warning comment. * config/gcn/gcn.opt: Add -mxnack and -msram-ecc. * config/gcn/mkoffload.c (EF_AMDGPU_MACH_AMDGCN_GFX908): Remove SRAM-ECC flag. (EF_AMDGPU_XNACK): New. (EF_AMDGPU_SRAM_ECC): New. (elf_flags): New. (copy_early_debug_info): Use elf_flags. (main): Handle -mxnack and -msram-ecc options. * doc/invoke.texi: Document -mxnack and -msram-ecc. gcc/testsuite/ChangeLog: PR target/100208 * gcc.target/gcn/sram-ecc-1.c: New test. * gcc.target/gcn/sram-ecc-2.c: New test. * gcc.target/gcn/sram-ecc-3.c: New test. * gcc.target/gcn/sram-ecc-4.c: New test. * gcc.target/gcn/sram-ecc-5.c: New test. * gcc.target/gcn/sram-ecc-6.c: New test. * gcc.target/gcn/sram-ecc-7.c: New test. * gcc.target/gcn/sram-ecc-8.c: New test. --- gcc/config/gcn/gcn-hsa.h | 6 +++++ gcc/config/gcn/gcn-opts.h | 7 ++++++ gcc/config/gcn/gcn-valu.md | 2 ++ gcc/config/gcn/gcn.c | 13 +++++++++-- gcc/config/gcn/gcn.md | 1 + gcc/config/gcn/gcn.opt | 21 +++++++++++++++++ gcc/config/gcn/mkoffload.c | 58 ++++++++++++++++++++++++++++++++++++++++++++-- 7 files changed, 104 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/gcn/gcn-hsa.h b/gcc/config/gcn/gcn-hsa.h index 61cdb31..724e9a3 100644 --- a/gcc/config/gcn/gcn-hsa.h +++ b/gcc/config/gcn/gcn-hsa.h @@ -75,9 +75,15 @@ extern unsigned int gcn_local_sym_hash (const char *name); supported for gcn. */ #define GOMP_SELF_SPECS "" +#define DRIVER_SELF_SPECS \ + "%{march=fiji|march=gfx900|march=gfx906:%{!msram-ecc=*:-msram-ecc=off}}" + /* Use LLVM assembler and linker options. */ #define ASM_SPEC "-triple=amdgcn--amdhsa " \ "%:last_arg(%{march=*:-mcpu=%*}) " \ + "-mattr=%{mxnack:+xnack;:-xnack} " \ + /* FIXME: support "any" when we move to HSACOv4. 
*/ \ + "-mattr=%{!msram-ecc=off:+sram-ecc;:-sram-ecc} " \ "-filetype=obj" #define LINK_SPEC "--pie --export-dynamic" #define LIB_SPEC "-lc" diff --git a/gcc/config/gcn/gcn-opts.h b/gcc/config/gcn/gcn-opts.h index ed67d01..b255160 100644 --- a/gcc/config/gcn/gcn-opts.h +++ b/gcc/config/gcn/gcn-opts.h @@ -34,4 +34,11 @@ extern int gcn_isa; #define TARGET_GCN5 (gcn_isa == 5) #define TARGET_GCN5_PLUS (gcn_isa >= 5) +enum sram_ecc_type +{ + SRAM_ECC_OFF, + SRAM_ECC_ON, + SRAM_ECC_ANY +}; + #endif diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md index beefcf7..84ff675 100644 --- a/gcc/config/gcn/gcn-valu.md +++ b/gcc/config/gcn/gcn-valu.md @@ -703,6 +703,8 @@ ;; - The address space and glc (volatile) fields are there to replace the ;; fields normally found in a MEM. ;; - Multiple forms of address expression are supported, below. +;; +;; TODO: implement combined gather and zero_extend, but only for -msram-ecc=on (define_expand "gather_load" [(match_operand:V_ALL 0 "register_operand") diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c index 6d02a4a..385b90c 100644 --- a/gcc/config/gcn/gcn.c +++ b/gcc/config/gcn/gcn.c @@ -144,6 +144,10 @@ gcn_option_override (void) /* 1MB total. */ stack_size_opt = 1048576; } + + /* The xnack option is a placeholder, for now. */ + if (flag_xnack) + sorry ("XNACK support"); } /* }}} */ @@ -5182,11 +5186,16 @@ output_file_start (void) case PROCESSOR_FIJI: cpu = "gfx803"; break; case PROCESSOR_VEGA10: cpu = "gfx900"; break; case PROCESSOR_VEGA20: cpu = "gfx906"; break; - case PROCESSOR_GFX908: cpu = "gfx908+sram-ecc"; break; + case PROCESSOR_GFX908: cpu = "gfx908"; break; default: gcc_unreachable (); } - fprintf(asm_out_file, "\t.amdgcn_target \"amdgcn-unknown-amdhsa--%s\"\n", cpu); + const char *xnack = (flag_xnack ? "+xnack" : ""); + /* FIXME: support "any" when we move to HSACOv4. */ + const char *sram_ecc = (flag_sram_ecc ? "+sram-ecc" : ""); + + fprintf(asm_out_file, "\t.amdgcn_target \"amdgcn-unknown-amdhsa--%s%s%s\"\n", + cpu, xnack, sram_ecc); } /* Implement ASM_DECLARE_FUNCTION_NAME via gcn-hsa.h. diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md index ae7249a..8ffa43c 100644 --- a/gcc/config/gcn/gcn.md +++ b/gcc/config/gcn/gcn.md @@ -569,6 +569,7 @@ (set_attr "length" "4,4,8,12,12,12,12,4,8,8,12,12,8,12,12,8,12,12")]) ; 8/16bit move pattern +; TODO: implement combined load and zero_extend, but *only* for -msram-ecc=on (define_insn "*mov_insn" [(set (match_operand:QIHI 0 "nonimmediate_operand" diff --git a/gcc/config/gcn/gcn.opt b/gcc/config/gcn/gcn.opt index 767d458..b2b10b0 100644 --- a/gcc/config/gcn/gcn.opt +++ b/gcc/config/gcn/gcn.opt @@ -76,3 +76,24 @@ Target RejectNegative Joined UInteger Var(stack_size_opt) Init(-1) Wopenacc-dims Target Var(warn_openacc_dims) Warning Warn about invalid OpenACC dimensions. + +mxnack +Target Var(flag_xnack) Init(0) +Compile for devices requiring XNACK enabled. Default off. + +Enum +Name(sram_ecc_type) Type(enum sram_ecc_type) +SRAM-ECC modes: + +EnumValue +Enum(sram_ecc_type) String(off) Value(SRAM_ECC_OFF) + +EnumValue +Enum(sram_ecc_type) String(on) Value(SRAM_ECC_ON) + +EnumValue +Enum(sram_ecc_type) String(any) Value(SRAM_ECC_ANY) + +msram-ecc= +Target RejectNegative Joined ToLower Enum(sram_ecc_type) Var(flag_sram_ecc) Init(SRAM_ECC_ANY) +Compile for devices with the SRAM ECC feature enabled, or not. Default \"any\". 
diff --git a/gcc/config/gcn/mkoffload.c b/gcc/config/gcn/mkoffload.c index 1469a68..804cc26 100644 --- a/gcc/config/gcn/mkoffload.c +++ b/gcc/config/gcn/mkoffload.c @@ -52,7 +52,10 @@ #undef EF_AMDGPU_MACH_AMDGCN_GFX906 #define EF_AMDGPU_MACH_AMDGCN_GFX906 0x2f #undef EF_AMDGPU_MACH_AMDGCN_GFX908 -#define EF_AMDGPU_MACH_AMDGCN_GFX908 0x230 // Assume SRAM-ECC enabled. +#define EF_AMDGPU_MACH_AMDGCN_GFX908 0x30 + +#define EF_AMDGPU_XNACK 0x100 +#define EF_AMDGPU_SRAM_ECC 0x200 #ifndef R_AMDGPU_NONE #define R_AMDGPU_NONE 0 @@ -77,6 +80,7 @@ static struct obstack files_to_cleanup; enum offload_abi offload_abi = OFFLOAD_ABI_UNSET; uint32_t elf_arch = EF_AMDGPU_MACH_AMDGCN_GFX803; // Default GPU architecture. +uint32_t elf_flags = 0; /* Delete tempfiles. */ @@ -298,7 +302,7 @@ copy_early_debug_info (const char *infile, const char *outfile) ehdr.e_ident[8] = ELFABIVERSION_AMDGPU_HSA; ehdr.e_type = ET_REL; ehdr.e_machine = EM_AMDGPU; - ehdr.e_flags = elf_arch; + ehdr.e_flags = elf_arch | elf_flags; /* Load the section headers so we can walk them later. */ Elf64_Shdr *sections = (Elf64_Shdr *)xmalloc (sizeof (Elf64_Shdr) @@ -823,6 +827,7 @@ main (int argc, char **argv) bool fopenacc = false; bool fPIC = false; bool fpic = false; + bool sram_seen = false; for (int i = 1; i < argc; i++) { #define STR "-foffload-abi=" @@ -845,6 +850,26 @@ main (int argc, char **argv) fPIC = true; else if (strcmp (argv[i], "-fpic") == 0) fpic = true; + else if (strcmp (argv[i], "-mxnack") == 0) + elf_flags |= EF_AMDGPU_XNACK; + else if (strcmp (argv[i], "-mno-xnack") == 0) + elf_flags &= ~EF_AMDGPU_XNACK; + else if (strcmp (argv[i], "-msram-ecc=on") == 0) + { + elf_flags |= EF_AMDGPU_SRAM_ECC; + sram_seen = true; + } + else if (strcmp (argv[i], "-msram-ecc=any") == 0) + { + /* FIXME: change this when we move to HSACOv4. */ + elf_flags |= EF_AMDGPU_SRAM_ECC; + sram_seen = true; + } + else if (strcmp (argv[i], "-msram-ecc=off") == 0) + { + elf_flags &= ~EF_AMDGPU_SRAM_ECC; + sram_seen = true; + } else if (strcmp (argv[i], "-save-temps") == 0) save_temps = true; else if (strcmp (argv[i], "-v") == 0) @@ -865,6 +890,21 @@ main (int argc, char **argv) if (!(fopenacc ^ fopenmp)) fatal_error (input_location, "either -fopenacc or -fopenmp must be set"); + /* The SRAM-ECC feature defaults to "any" on GPUs where the feature is + available. */ + if (!sram_seen) + switch (elf_arch) + { + case EF_AMDGPU_MACH_AMDGCN_GFX803: + case EF_AMDGPU_MACH_AMDGCN_GFX900: + case EF_AMDGPU_MACH_AMDGCN_GFX906: + break; + default: + /* FIXME: change this when we move to HSACOv4. */ + elf_flags |= EF_AMDGPU_SRAM_ECC; + break; + } + const char *abi; switch (offload_abi) { @@ -892,6 +932,12 @@ main (int argc, char **argv) obstack_ptr_grow (&cc_argv_obstack, "-xlto"); if (fopenmp) obstack_ptr_grow (&cc_argv_obstack, "-mgomp"); + obstack_ptr_grow (&cc_argv_obstack, + (elf_flags & EF_AMDGPU_XNACK + ? "-mxnack" : "-mno-xnack")); + obstack_ptr_grow (&cc_argv_obstack, + (elf_flags & EF_AMDGPU_SRAM_ECC + ? "-msram-ecc=on" : "-msram-ecc=off")); for (int ix = 1; ix != argc; ix++) { @@ -993,6 +1039,14 @@ main (int argc, char **argv) } obstack_ptr_grow (&ld_argv_obstack, gcn_s2_name); obstack_ptr_grow (&ld_argv_obstack, "-lgomp"); + obstack_ptr_grow (&ld_argv_obstack, + (elf_flags & EF_AMDGPU_XNACK + ? "-mxnack" : "-mno-xnack")); + obstack_ptr_grow (&ld_argv_obstack, + (elf_flags & EF_AMDGPU_SRAM_ECC + ? 
"-msram-ecc=on" : "-msram-ecc=off")); + if (verbose) + obstack_ptr_grow (&ld_argv_obstack, "-v"); for (int i = 1; i < argc; i++) if (startswith (argv[i], "-l") -- cgit v1.1 From f007a638a86e4b59bef0a0d8efa5bb8c5e5b200a Mon Sep 17 00:00:00 2001 From: Indu Bhagat Date: Mon, 19 Jul 2021 10:24:59 -0700 Subject: debug: Allow means for targets to opt out of CTF/BTF support CTF/BTF debug formats can be safely enabled for all ELF-based targets by default in GCC. CTF/BTF debug formats now adopt a similar approach as taken for DWARF debug format via the DWARF2_DEBUGGING_INFO. - By default, CTF/BTF formats can be enabled for all ELF-based targets. - By default, CTF/BTF formats can be disabled for all non ELF-based targets. - If the user passed a -gctf but CTF is not enabled for the target, GCC issues an error to the user (as is done currently with other debug formats) - "target system does not support the 'ctf' debug format". Analogous behavior for -gbtf command line option. A previous commit disabled the CTF and BTF testcases on the AIX platform. This is not necessary now that CTF and BTF debug formats are disabled by default on all non-ELF targets. GCC emits an error message when -gctf/-gbtf is used on such platforms and these tests will be skipped. gcc/ * config/elfos.h (CTF_DEBUGGING_INFO): New definition. (BTF_DEBUGGING_INFO): Likewise. * doc/tm.texi.in: Document the new macros. * doc/tm.texi: Regenerated. * toplev.c: Guard initialization of debug hooks. gcc/testsuite/ * gcc.dg/debug/btf/btf.exp: Do not run BTF testsuite if target does not support BTF format. Remove redundant check for AIX. * gcc.dg/debug/ctf/ctf.exp: Do not run CTF testsuite if target does not support CTF format. Remove redundant check for AIX. * lib/gcc-dg.exp: Remove redundant check for AIX. --- gcc/config/elfos.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/elfos.h b/gcc/config/elfos.h index 7a736cc..e5cb487 100644 --- a/gcc/config/elfos.h +++ b/gcc/config/elfos.h @@ -68,6 +68,14 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see #define DWARF2_DEBUGGING_INFO 1 +/* All ELF targets can support CTF. */ + +#define CTF_DEBUGGING_INFO 1 + +/* All ELF targets can support BTF. */ + +#define BTF_DEBUGGING_INFO 1 + /* The GNU tools operate better with dwarf2, and it is required by some psABI's. Since we don't have any native tools to be compatible with, default to dwarf2. */ -- cgit v1.1 From a1d27560770818c514ad1ad6683f89e1e1bcd0ec Mon Sep 17 00:00:00 2001 From: Kewen Lin Date: Mon, 19 Jul 2021 20:49:17 -0500 Subject: vect: Recog mul_highpart pattern [PR100696] This patch is to extend the existing pattern mulhs handlings to cover normal multiply highpart pattern recognization, it introduces one new internal function IFN_MULH for 1:1 map to [su]mul_highpart optab. Since it covers MULT_HIGHPART_EXPR with optab support, i386 part change is to ensure it follows the consistent costing path. Bootstrapped & regtested on powerpc64le-linux-gnu P9, x86_64-redhat-linux and aarch64-linux-gnu. gcc/ChangeLog: PR tree-optimization/100696 * internal-fn.c (first_commutative_argument): Add info for IFN_MULH. * internal-fn.def (IFN_MULH): New internal function. * tree-vect-patterns.c (vect_recog_mulhs_pattern): Add support to recog normal multiply highpart as IFN_MULH. * config/i386/i386.c (ix86_add_stmt_cost): Adjust for combined function CFN_MULH. gcc/testsuite/ChangeLog: PR tree-optimization/100696 * gcc.target/i386/pr100637-3w.c: Adjust for mul_highpart recog. 
--- gcc/config/i386/i386.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 8481693..ff96134 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -22568,6 +22568,9 @@ ix86_add_stmt_cost (class vec_info *vinfo, void *data, int count, mode == SFmode ? ix86_cost->fmass : ix86_cost->fmasd); break; + case CFN_MULH: + stmt_cost = ix86_multiplication_cost (ix86_cost, mode); + break; default: break; } -- cgit v1.1 From 1c0d49b9ce9ab011fa77d4eac689fa1a038123ef Mon Sep 17 00:00:00 2001 From: Kewen Lin Date: Mon, 19 Jul 2021 20:50:13 -0500 Subject: rs6000: Support [u]mul3_highpart for vector This patch is to make Power10 newly introduced vector multiply high (part) instructions exploited in vectorized loops, it renames existing define_insns as standard pattern names. It depends on that patch which enables vectorizer to recog mul_highpart. gcc/ChangeLog: * config/rs6000/vsx.md (mulhs_): Rename to... (smul3_highpart): ... this. (mulhu_): Rename to... (umul3_highpart): ... this. * config/rs6000/rs6000-builtin.def (MULHS_V2DI, MULHS_V4SI, MULHU_V2DI, MULHU_V4SI): Adjust. gcc/testsuite/ChangeLog: * gcc.target/powerpc/mul-vectorize-3.c: New test. * gcc.target/powerpc/mul-vectorize-4.c: New test. --- gcc/config/rs6000/rs6000-builtin.def | 8 ++++---- gcc/config/rs6000/vsx.md | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-builtin.def b/gcc/config/rs6000/rs6000-builtin.def index a0dfefc..9dbf16f 100644 --- a/gcc/config/rs6000/rs6000-builtin.def +++ b/gcc/config/rs6000/rs6000-builtin.def @@ -3035,10 +3035,10 @@ BU_P10V_AV_2 (MODS_V2DI, "vmodsd", CONST, modv2di3) BU_P10V_AV_2 (MODS_V4SI, "vmodsw", CONST, modv4si3) BU_P10V_AV_2 (MODU_V2DI, "vmodud", CONST, umodv2di3) BU_P10V_AV_2 (MODU_V4SI, "vmoduw", CONST, umodv4si3) -BU_P10V_AV_2 (MULHS_V2DI, "vmulhsd", CONST, mulhs_v2di) -BU_P10V_AV_2 (MULHS_V4SI, "vmulhsw", CONST, mulhs_v4si) -BU_P10V_AV_2 (MULHU_V2DI, "vmulhud", CONST, mulhu_v2di) -BU_P10V_AV_2 (MULHU_V4SI, "vmulhuw", CONST, mulhu_v4si) +BU_P10V_AV_2 (MULHS_V2DI, "vmulhsd", CONST, smulv2di3_highpart) +BU_P10V_AV_2 (MULHS_V4SI, "vmulhsw", CONST, smulv4si3_highpart) +BU_P10V_AV_2 (MULHU_V2DI, "vmulhud", CONST, umulv2di3_highpart) +BU_P10V_AV_2 (MULHU_V4SI, "vmulhuw", CONST, umulv4si3_highpart) BU_P10V_AV_2 (MULLD_V2DI, "vmulld", CONST, mulv2di3) BU_P10V_VSX_1 (VXXSPLTIW_V4SI, "vxxspltiw_v4si", CONST, xxspltiw_v4si) diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md index f622873..6f6fc0b 100644 --- a/gcc/config/rs6000/vsx.md +++ b/gcc/config/rs6000/vsx.md @@ -6351,7 +6351,7 @@ [(set_attr "type" "vecdiv") (set_attr "size" "")]) -(define_insn "mulhs_" +(define_insn "smul3_highpart" [(set (match_operand:VIlong 0 "vsx_register_operand" "=v") (mult:VIlong (ashiftrt (match_operand:VIlong 1 "vsx_register_operand" "v") @@ -6363,7 +6363,7 @@ "vmulhs %0,%1,%2" [(set_attr "type" "veccomplex")]) -(define_insn "mulhu_" +(define_insn "umul3_highpart" [(set (match_operand:VIlong 0 "vsx_register_operand" "=v") (us_mult:VIlong (ashiftrt (match_operand:VIlong 1 "vsx_register_operand" "v") -- cgit v1.1 From 6d4da4aeef5b20f7f9693ddc27d26740d0dbe36c Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Tue, 20 Jul 2021 06:15:16 +0200 Subject: i386: Remove atomic_storedi_fpu and atomic_loaddi_fpu peepholes [PR100182] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These patterns result in non-atomic sequence. 
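To make the correctness angle concrete (an illustrative sketch, not the PR 100182 reproducer): on !TARGET_64BIT a 64-bit atomic access is expanded through the x87 or SSE unit precisely so that the location is read or written by a single 8-byte instruction, and a peephole output that no longer preserves that single access is observable as a torn value by a concurrent reader.

  #include <stdatomic.h>

  _Atomic long long v;

  void
  store_v (long long x)
  {
    /* With -m32 this must reach memory as one 64-bit store; splitting
       it into two 32-bit stores would let another thread read half of
       the old value and half of the new one.  */
    atomic_store_explicit (&v, x, memory_order_relaxed);
  }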
2021-07-21 Uroš Bizjak gcc/ PR target/100182 * config/i386/sync.md (define_peephole2 atomic_storedi_fpu): Remove. (define_peephole2 atomic_loaddi_fpu): Ditto. gcc/testsuite/ PR target/100182 * gcc.target/i386/pr71245-1.c: Remove. * gcc.target/i386/pr71245-2.c: Ditto. --- gcc/config/i386/sync.md | 152 ------------------------------------------------ 1 file changed, 152 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/sync.md b/gcc/config/i386/sync.md index 7913b91..05a8352 100644 --- a/gcc/config/i386/sync.md +++ b/gcc/config/i386/sync.md @@ -219,82 +219,6 @@ DONE; }) -(define_peephole2 - [(set (match_operand:DF 0 "fp_register_operand") - (unspec:DF [(match_operand:DI 1 "memory_operand")] - UNSPEC_FILD_ATOMIC)) - (set (match_operand:DI 2 "memory_operand") - (unspec:DI [(match_dup 0)] - UNSPEC_FIST_ATOMIC)) - (set (match_operand:DF 3 "sse_reg_operand") - (match_operand:DF 4 "memory_operand"))] - "!TARGET_64BIT - && peep2_reg_dead_p (2, operands[0]) - && rtx_equal_p (XEXP (operands[4], 0), XEXP (operands[2], 0))" - [(set (match_dup 3) (match_dup 5)) - (set (match_dup 4) (match_dup 3))] - "operands[5] = gen_lowpart (DFmode, operands[1]);") - -(define_peephole2 - [(set (match_operand:DF 0 "fp_register_operand") - (unspec:DF [(match_operand:DI 1 "memory_operand")] - UNSPEC_FILD_ATOMIC)) - (set (match_operand:DI 2 "memory_operand") - (unspec:DI [(match_dup 0)] - UNSPEC_FIST_ATOMIC)) - (set (mem:BLK (scratch:SI)) - (unspec:BLK [(mem:BLK (scratch:SI))] UNSPEC_MEMORY_BLOCKAGE)) - (set (match_operand:DF 3 "sse_reg_operand") - (match_operand:DF 4 "memory_operand"))] - "!TARGET_64BIT - && peep2_reg_dead_p (2, operands[0]) - && rtx_equal_p (XEXP (operands[4], 0), XEXP (operands[2], 0))" - [(const_int 0)] -{ - emit_move_insn (operands[3], gen_lowpart (DFmode, operands[1])); - emit_move_insn (operands[4], operands[3]); - emit_insn (gen_memory_blockage ()); - DONE; -}) - -(define_peephole2 - [(set (match_operand:DF 0 "sse_reg_operand") - (unspec:DF [(match_operand:DI 1 "memory_operand")] - UNSPEC_LDX_ATOMIC)) - (set (match_operand:DI 2 "memory_operand") - (unspec:DI [(match_dup 0)] - UNSPEC_STX_ATOMIC)) - (set (match_operand:DF 3 "sse_reg_operand") - (match_operand:DF 4 "memory_operand"))] - "!TARGET_64BIT - && peep2_reg_dead_p (2, operands[0]) - && rtx_equal_p (XEXP (operands[4], 0), XEXP (operands[2], 0))" - [(set (match_dup 3) (match_dup 5)) - (set (match_dup 4) (match_dup 3))] - "operands[5] = gen_lowpart (DFmode, operands[1]);") - -(define_peephole2 - [(set (match_operand:DF 0 "sse_reg_operand") - (unspec:DF [(match_operand:DI 1 "memory_operand")] - UNSPEC_LDX_ATOMIC)) - (set (match_operand:DI 2 "memory_operand") - (unspec:DI [(match_dup 0)] - UNSPEC_STX_ATOMIC)) - (set (mem:BLK (scratch:SI)) - (unspec:BLK [(mem:BLK (scratch:SI))] UNSPEC_MEMORY_BLOCKAGE)) - (set (match_operand:DF 3 "sse_reg_operand") - (match_operand:DF 4 "memory_operand"))] - "!TARGET_64BIT - && peep2_reg_dead_p (2, operands[0]) - && rtx_equal_p (XEXP (operands[4], 0), XEXP (operands[2], 0))" - [(const_int 0)] -{ - emit_move_insn (operands[3], gen_lowpart (DFmode, operands[1])); - emit_move_insn (operands[4], operands[3]); - emit_insn (gen_memory_blockage ()); - DONE; -}) - (define_expand "atomic_store" [(set (match_operand:ATOMIC 0 "memory_operand") (unspec:ATOMIC [(match_operand:ATOMIC 1 "nonimmediate_operand") @@ -384,82 +308,6 @@ DONE; }) -(define_peephole2 - [(set (match_operand:DF 0 "memory_operand") - (match_operand:DF 1 "any_fp_register_operand")) - (set (match_operand:DF 2 "fp_register_operand") - 
(unspec:DF [(match_operand:DI 3 "memory_operand")] - UNSPEC_FILD_ATOMIC)) - (set (match_operand:DI 4 "memory_operand") - (unspec:DI [(match_dup 2)] - UNSPEC_FIST_ATOMIC))] - "!TARGET_64BIT - && peep2_reg_dead_p (3, operands[2]) - && rtx_equal_p (XEXP (operands[0], 0), XEXP (operands[3], 0))" - [(set (match_dup 0) (match_dup 1)) - (set (match_dup 5) (match_dup 1))] - "operands[5] = gen_lowpart (DFmode, operands[4]);") - -(define_peephole2 - [(set (match_operand:DF 0 "memory_operand") - (match_operand:DF 1 "any_fp_register_operand")) - (set (mem:BLK (scratch:SI)) - (unspec:BLK [(mem:BLK (scratch:SI))] UNSPEC_MEMORY_BLOCKAGE)) - (set (match_operand:DF 2 "fp_register_operand") - (unspec:DF [(match_operand:DI 3 "memory_operand")] - UNSPEC_FILD_ATOMIC)) - (set (match_operand:DI 4 "memory_operand") - (unspec:DI [(match_dup 2)] - UNSPEC_FIST_ATOMIC))] - "!TARGET_64BIT - && peep2_reg_dead_p (4, operands[2]) - && rtx_equal_p (XEXP (operands[0], 0), XEXP (operands[3], 0))" - [(const_int 0)] -{ - emit_move_insn (operands[0], operands[1]); - emit_insn (gen_memory_blockage ()); - emit_move_insn (gen_lowpart (DFmode, operands[4]), operands[1]); - DONE; -}) - -(define_peephole2 - [(set (match_operand:DF 0 "memory_operand") - (match_operand:DF 1 "any_fp_register_operand")) - (set (match_operand:DF 2 "sse_reg_operand") - (unspec:DF [(match_operand:DI 3 "memory_operand")] - UNSPEC_LDX_ATOMIC)) - (set (match_operand:DI 4 "memory_operand") - (unspec:DI [(match_dup 2)] - UNSPEC_STX_ATOMIC))] - "!TARGET_64BIT - && peep2_reg_dead_p (3, operands[2]) - && rtx_equal_p (XEXP (operands[0], 0), XEXP (operands[3], 0))" - [(set (match_dup 0) (match_dup 1)) - (set (match_dup 5) (match_dup 1))] - "operands[5] = gen_lowpart (DFmode, operands[4]);") - -(define_peephole2 - [(set (match_operand:DF 0 "memory_operand") - (match_operand:DF 1 "any_fp_register_operand")) - (set (mem:BLK (scratch:SI)) - (unspec:BLK [(mem:BLK (scratch:SI))] UNSPEC_MEMORY_BLOCKAGE)) - (set (match_operand:DF 2 "sse_reg_operand") - (unspec:DF [(match_operand:DI 3 "memory_operand")] - UNSPEC_LDX_ATOMIC)) - (set (match_operand:DI 4 "memory_operand") - (unspec:DI [(match_dup 2)] - UNSPEC_STX_ATOMIC))] - "!TARGET_64BIT - && peep2_reg_dead_p (4, operands[2]) - && rtx_equal_p (XEXP (operands[0], 0), XEXP (operands[3], 0))" - [(const_int 0)] -{ - emit_move_insn (operands[0], operands[1]); - emit_insn (gen_memory_blockage ()); - emit_move_insn (gen_lowpart (DFmode, operands[4]), operands[1]); - DONE; -}) - ;; ??? You'd think that we'd be able to perform this via FLOAT + FIX_TRUNC ;; operations. But the fix_trunc patterns want way more setup than we want ;; to provide. Note that the scratch is DFmode instead of XFmode in order -- cgit v1.1 From b7e450c97340789687b65ab013dbe25e012b0b6c Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Fri, 9 Jul 2021 10:12:19 +0100 Subject: aarch64: Refactor TBL/TBX RTL patterns Rename two-source-register TBL/TBX RTL patterns so that their names better reflect what they do, rather than confusing them with tbl3 or tbx4 patterns. Also use the correct "neon_tbl2" type attribute for both patterns. Rename single-source-register TBL/TBX patterns for consistency. gcc/ChangeLog: 2021-07-08 Jonathan Wright * config/aarch64/aarch64-simd-builtins.def: Use two variant generators for all TBL/TBX intrinsics and rename to consistent forms: qtbl[1234] or qtbx[1234]. * config/aarch64/aarch64-simd.md (aarch64_tbl1): Rename to... (aarch64_qtbl1): This. (aarch64_tbx1): Rename to... (aarch64_qtbx1): This. (aarch64_tbl2v16qi): Delete. 
(aarch64_tbl3): Rename to... (aarch64_qtbl2): This. (aarch64_tbx4): Rename to... (aarch64_qtbx2): This. * config/aarch64/aarch64.c (aarch64_expand_vec_perm_1): Use renamed qtbl1 and qtbl2 RTL patterns. * config/aarch64/arm_neon.h (vqtbl1_p8): Use renamed qtbl1 RTL pattern. (vqtbl1_s8): Likewise. (vqtbl1_u8): Likewise. (vqtbl1q_p8): Likewise. (vqtbl1q_s8): Likewise. (vqtbl1q_u8): Likewise. (vqtbx1_s8): Use renamed qtbx1 RTL pattern. (vqtbx1_u8): Likewise. (vqtbx1_p8): Likewise. (vqtbx1q_s8): Likewise. (vqtbx1q_u8): Likewise. (vqtbx1q_p8): Likewise. (vtbl1_s8): Use renamed qtbl1 RTL pattern. (vtbl1_u8): Likewise. (vtbl1_p8): Likewise. (vtbl2_s8): Likewise (vtbl2_u8): Likewise. (vtbl2_p8): Likewise. (vtbl3_s8): Use renamed qtbl2 RTL pattern. (vtbl3_u8): Likewise. (vtbl3_p8): Likewise. (vtbl4_s8): Likewise. (vtbl4_u8): Likewise. (vtbl4_p8): Likewise. (vtbx2_s8): Use renamed qtbx2 RTL pattern. (vtbx2_u8): Likewise. (vtbx2_p8): Likewise. (vqtbl2_s8): Use renamed qtbl2 RTL pattern. (vqtbl2_u8): Likewise. (vqtbl2_p8): Likewise. (vqtbl2q_s8): Likewise. (vqtbl2q_u8): Likewise. (vqtbl2q_p8): Likewise. (vqtbx2_s8): Use renamed qtbx2 RTL pattern. (vqtbx2_u8): Likewise. (vqtbx2_p8): Likewise. (vqtbx2q_s8): Likewise. (vqtbx2q_u8): Likewise. (vqtbx2q_p8): Likewise. (vtbx4_s8): Likewise. (vtbx4_u8): Likewise. (vtbx4_p8): Likewise. --- gcc/config/aarch64/aarch64-simd-builtins.def | 34 +++---- gcc/config/aarch64/aarch64-simd.md | 24 ++--- gcc/config/aarch64/aarch64.c | 8 +- gcc/config/aarch64/arm_neon.h | 132 ++++++++++++--------------- 4 files changed, 82 insertions(+), 116 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index 063f503..b7f1237 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -718,37 +718,31 @@ VAR1 (BINOPP, crypto_pmull, 0, NONE, di) VAR1 (BINOPP, crypto_pmull, 0, NONE, v2di) - /* Implemented by aarch64_tbl3. */ - VAR1 (BINOP, tbl3, 0, NONE, v8qi) - VAR1 (BINOP, tbl3, 0, NONE, v16qi) + /* Implemented by aarch64_qtbl1. */ + VAR2 (BINOP, qtbl1, 0, NONE, v8qi, v16qi) + VAR2 (BINOPU, qtbl1, 0, NONE, v8qi, v16qi) - /* Implemented by aarch64_tbl1. */ - VAR2 (BINOP, tbl1, 0, NONE, v8qi, v16qi) - VAR2 (BINOPU, tbl1, 0, NONE, v8qi, v16qi) + /* Implemented by aarch64_qtbl2. */ + VAR2 (BINOP, qtbl2, 0, NONE, v8qi, v16qi) /* Implemented by aarch64_qtbl3. */ - VAR1 (BINOP, qtbl3, 0, NONE, v8qi) - VAR1 (BINOP, qtbl3, 0, NONE, v16qi) + VAR2 (BINOP, qtbl3, 0, NONE, v8qi, v16qi) /* Implemented by aarch64_qtbl4. */ - VAR1 (BINOP, qtbl4, 0, NONE, v8qi) - VAR1 (BINOP, qtbl4, 0, NONE, v16qi) + VAR2 (BINOP, qtbl4, 0, NONE, v8qi, v16qi) - /* Implemented by aarch64_tbx1. */ - VAR2 (TERNOP, tbx1, 0, NONE, v8qi, v16qi) - VAR2 (TERNOPU, tbx1, 0, NONE, v8qi, v16qi) + /* Implemented by aarch64_qtbx1. */ + VAR2 (TERNOP, qtbx1, 0, NONE, v8qi, v16qi) + VAR2 (TERNOPU, qtbx1, 0, NONE, v8qi, v16qi) - /* Implemented by aarch64_tbx4. */ - VAR1 (TERNOP, tbx4, 0, NONE, v8qi) - VAR1 (TERNOP, tbx4, 0, NONE, v16qi) + /* Implemented by aarch64_qtbx2. */ + VAR2 (TERNOP, qtbx2, 0, NONE, v8qi, v16qi) /* Implemented by aarch64_qtbx3. */ - VAR1 (TERNOP, qtbx3, 0, NONE, v8qi) - VAR1 (TERNOP, qtbx3, 0, NONE, v16qi) + VAR2 (TERNOP, qtbx3, 0, NONE, v8qi, v16qi) /* Implemented by aarch64_qtbx4. */ - VAR1 (TERNOP, qtbx4, 0, NONE, v8qi) - VAR1 (TERNOP, qtbx4, 0, NONE, v16qi) + VAR2 (TERNOP, qtbx4, 0, NONE, v8qi, v16qi) /* Builtins for ARMv8.1-A Adv.SIMD instructions. 
*/ diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 7489098..7332a73 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -6948,7 +6948,7 @@ { rtx op0 = gen_rtx_REG (V16QImode, REGNO (operands[0]) + i); rtx op1 = gen_rtx_REG (V16QImode, REGNO (operands[1]) + i); - emit_insn (gen_aarch64_tbl1v16qi (op0, op1, operands[2])); + emit_insn (gen_aarch64_qtbl1v16qi (op0, op1, operands[2])); } DONE; } @@ -7425,7 +7425,7 @@ DONE; }) -(define_insn "aarch64_tbl1" +(define_insn "aarch64_qtbl1" [(set (match_operand:VB 0 "register_operand" "=w") (unspec:VB [(match_operand:V16QI 1 "register_operand" "w") (match_operand:VB 2 "register_operand" "w")] @@ -7435,7 +7435,7 @@ [(set_attr "type" "neon_tbl1")] ) -(define_insn "aarch64_tbx1" +(define_insn "aarch64_qtbx1" [(set (match_operand:VB 0 "register_operand" "=w") (unspec:VB [(match_operand:VB 1 "register_operand" "0") (match_operand:V16QI 2 "register_operand" "w") @@ -7448,27 +7448,17 @@ ;; Two source registers. -(define_insn "aarch64_tbl2v16qi" - [(set (match_operand:V16QI 0 "register_operand" "=w") - (unspec:V16QI [(match_operand:OI 1 "register_operand" "w") - (match_operand:V16QI 2 "register_operand" "w")] - UNSPEC_TBL))] - "TARGET_SIMD" - "tbl\\t%0.16b, {%S1.16b - %T1.16b}, %2.16b" - [(set_attr "type" "neon_tbl2_q")] -) - -(define_insn "aarch64_tbl3" +(define_insn "aarch64_qtbl2" [(set (match_operand:VB 0 "register_operand" "=w") (unspec:VB [(match_operand:OI 1 "register_operand" "w") (match_operand:VB 2 "register_operand" "w")] UNSPEC_TBL))] "TARGET_SIMD" "tbl\\t%S0., {%S1.16b - %T1.16b}, %S2." - [(set_attr "type" "neon_tbl3")] + [(set_attr "type" "neon_tbl2")] ) -(define_insn "aarch64_tbx4" +(define_insn "aarch64_qtbx2" [(set (match_operand:VB 0 "register_operand" "=w") (unspec:VB [(match_operand:VB 1 "register_operand" "0") (match_operand:OI 2 "register_operand" "w") @@ -7476,7 +7466,7 @@ UNSPEC_TBX))] "TARGET_SIMD" "tbx\\t%S0., {%S2.16b - %T2.16b}, %S3." - [(set_attr "type" "neon_tbl4")] + [(set_attr "type" "neon_tbl2")] ) ;; Three source registers. diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index f5b25a7..3bdf19d 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -22047,11 +22047,11 @@ aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel) /* Expand the argument to a V16QI mode by duplicating it. 
*/ rtx pair = gen_reg_rtx (V16QImode); emit_insn (gen_aarch64_combinev8qi (pair, op0, op0)); - emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel)); + emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel)); } else { - emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel)); + emit_insn (gen_aarch64_qtbl1v16qi (target, op0, sel)); } } else @@ -22062,13 +22062,13 @@ aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel) { pair = gen_reg_rtx (V16QImode); emit_insn (gen_aarch64_combinev8qi (pair, op0, op1)); - emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel)); + emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel)); } else { pair = gen_reg_rtx (OImode); emit_insn (gen_aarch64_combinev16qi (pair, op0, op1)); - emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel)); + emit_insn (gen_aarch64_qtbl2v16qi (target, pair, sel)); } } } diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 00d76ea..1048d7c 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -9534,90 +9534,90 @@ __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl1_p8 (poly8x16_t __tab, uint8x8_t __idx) { - return (poly8x8_t) __builtin_aarch64_tbl1v8qi ((int8x16_t) __tab, - (int8x8_t) __idx); + return (poly8x8_t) __builtin_aarch64_qtbl1v8qi ((int8x16_t) __tab, + (int8x8_t) __idx); } __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl1_s8 (int8x16_t __tab, uint8x8_t __idx) { - return __builtin_aarch64_tbl1v8qi (__tab, (int8x8_t) __idx); + return __builtin_aarch64_qtbl1v8qi (__tab, (int8x8_t) __idx); } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl1_u8 (uint8x16_t __tab, uint8x8_t __idx) { - return __builtin_aarch64_tbl1v8qi_uuu (__tab, __idx); + return __builtin_aarch64_qtbl1v8qi_uuu (__tab, __idx); } __extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl1q_p8 (poly8x16_t __tab, uint8x16_t __idx) { - return (poly8x16_t) __builtin_aarch64_tbl1v16qi ((int8x16_t) __tab, - (int8x16_t) __idx); + return (poly8x16_t) __builtin_aarch64_qtbl1v16qi ((int8x16_t) __tab, + (int8x16_t) __idx); } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl1q_s8 (int8x16_t __tab, uint8x16_t __idx) { - return __builtin_aarch64_tbl1v16qi (__tab, (int8x16_t) __idx); + return __builtin_aarch64_qtbl1v16qi (__tab, (int8x16_t) __idx); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl1q_u8 (uint8x16_t __tab, uint8x16_t __idx) { - return __builtin_aarch64_tbl1v16qi_uuu (__tab, __idx); + return __builtin_aarch64_qtbl1v16qi_uuu (__tab, __idx); } __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx1_s8 (int8x8_t __r, int8x16_t __tab, uint8x8_t __idx) { - return __builtin_aarch64_tbx1v8qi (__r, __tab, (int8x8_t) __idx); + return __builtin_aarch64_qtbx1v8qi (__r, __tab, (int8x8_t) __idx); } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx1_u8 (uint8x8_t __r, uint8x16_t __tab, uint8x8_t __idx) { - return __builtin_aarch64_tbx1v8qi_uuuu (__r, __tab, __idx); + return __builtin_aarch64_qtbx1v8qi_uuuu (__r, __tab, __idx); } __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) vqtbx1_p8 (poly8x8_t __r, poly8x16_t __tab, uint8x8_t __idx) { - return (poly8x8_t) __builtin_aarch64_tbx1v8qi ((int8x8_t) __r, - (int8x16_t) __tab, - (int8x8_t) __idx); + return (poly8x8_t) __builtin_aarch64_qtbx1v8qi ((int8x8_t) __r, + (int8x16_t) __tab, + (int8x8_t) __idx); } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx1q_s8 (int8x16_t __r, int8x16_t __tab, uint8x16_t __idx) { - return __builtin_aarch64_tbx1v16qi (__r, __tab, (int8x16_t) __idx); + return __builtin_aarch64_qtbx1v16qi (__r, __tab, (int8x16_t) __idx); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx1q_u8 (uint8x16_t __r, uint8x16_t __tab, uint8x16_t __idx) { - return __builtin_aarch64_tbx1v16qi_uuuu (__r, __tab, __idx); + return __builtin_aarch64_qtbx1v16qi_uuuu (__r, __tab, __idx); } __extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx1q_p8 (poly8x16_t __r, poly8x16_t __tab, uint8x16_t __idx) { - return (poly8x16_t) __builtin_aarch64_tbx1v16qi ((int8x16_t) __r, - (int8x16_t) __tab, - (int8x16_t) __idx); + return (poly8x16_t) __builtin_aarch64_qtbx1v16qi ((int8x16_t) __r, + (int8x16_t) __tab, + (int8x16_t) __idx); } /* V7 legacy table intrinsics. */ @@ -9628,7 +9628,7 @@ vtbl1_s8 (int8x8_t __tab, int8x8_t __idx) { int8x16_t __temp = vcombine_s8 (__tab, vcreate_s8 (__AARCH64_UINT64_C (0x0))); - return __builtin_aarch64_tbl1v8qi (__temp, __idx); + return __builtin_aarch64_qtbl1v8qi (__temp, __idx); } __extension__ extern __inline uint8x8_t @@ -9637,7 +9637,7 @@ vtbl1_u8 (uint8x8_t __tab, uint8x8_t __idx) { uint8x16_t __temp = vcombine_u8 (__tab, vcreate_u8 (__AARCH64_UINT64_C (0x0))); - return __builtin_aarch64_tbl1v8qi_uuu (__temp, __idx); + return __builtin_aarch64_qtbl1v8qi_uuu (__temp, __idx); } __extension__ extern __inline poly8x8_t @@ -9646,8 +9646,8 @@ vtbl1_p8 (poly8x8_t __tab, uint8x8_t __idx) { poly8x16_t __temp = vcombine_p8 (__tab, vcreate_p8 (__AARCH64_UINT64_C (0x0))); - return (poly8x8_t) __builtin_aarch64_tbl1v8qi ((int8x16_t) __temp, - (int8x8_t) __idx); + return (poly8x8_t) __builtin_aarch64_qtbl1v8qi ((int8x16_t) __temp, + (int8x8_t) __idx); } __extension__ extern __inline int8x8_t @@ -9655,7 +9655,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbl2_s8 (int8x8x2_t __tab, int8x8_t __idx) { int8x16_t __temp = vcombine_s8 (__tab.val[0], __tab.val[1]); - return __builtin_aarch64_tbl1v8qi (__temp, __idx); + return __builtin_aarch64_qtbl1v8qi (__temp, __idx); } __extension__ extern __inline uint8x8_t @@ -9663,7 +9663,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbl2_u8 (uint8x8x2_t __tab, uint8x8_t __idx) { uint8x16_t __temp = vcombine_u8 (__tab.val[0], __tab.val[1]); - return __builtin_aarch64_tbl1v8qi_uuu (__temp, __idx); + return __builtin_aarch64_qtbl1v8qi_uuu (__temp, __idx); } __extension__ extern __inline poly8x8_t @@ -9671,15 +9671,14 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbl2_p8 (poly8x8x2_t __tab, uint8x8_t __idx) { poly8x16_t __temp = vcombine_p8 (__tab.val[0], __tab.val[1]); - return (poly8x8_t) __builtin_aarch64_tbl1v8qi ((int8x16_t) __temp, - (int8x8_t) __idx); + return (poly8x8_t) __builtin_aarch64_qtbl1v8qi ((int8x16_t) __temp, + (int8x8_t) __idx); } __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbl3_s8 (int8x8x3_t __tab, int8x8_t 
__idx) { - int8x8_t __result; int8x16x2_t __temp; __builtin_aarch64_simd_oi __o; __temp.val[0] = vcombine_s8 (__tab.val[0], __tab.val[1]); @@ -9688,15 +9687,13 @@ vtbl3_s8 (int8x8x3_t __tab, int8x8_t __idx) (int8x16_t) __temp.val[0], 0); __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[1], 1); - __result = __builtin_aarch64_tbl3v8qi (__o, __idx); - return __result; + return __builtin_aarch64_qtbl2v8qi (__o, __idx); } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbl3_u8 (uint8x8x3_t __tab, uint8x8_t __idx) { - uint8x8_t __result; uint8x16x2_t __temp; __builtin_aarch64_simd_oi __o; __temp.val[0] = vcombine_u8 (__tab.val[0], __tab.val[1]); @@ -9705,15 +9702,13 @@ vtbl3_u8 (uint8x8x3_t __tab, uint8x8_t __idx) (int8x16_t) __temp.val[0], 0); __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[1], 1); - __result = (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)__idx); - return __result; + return (uint8x8_t)__builtin_aarch64_qtbl2v8qi (__o, (int8x8_t)__idx); } __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbl3_p8 (poly8x8x3_t __tab, uint8x8_t __idx) { - poly8x8_t __result; poly8x16x2_t __temp; __builtin_aarch64_simd_oi __o; __temp.val[0] = vcombine_p8 (__tab.val[0], __tab.val[1]); @@ -9722,15 +9717,13 @@ vtbl3_p8 (poly8x8x3_t __tab, uint8x8_t __idx) (int8x16_t) __temp.val[0], 0); __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[1], 1); - __result = (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)__idx); - return __result; + return (poly8x8_t)__builtin_aarch64_qtbl2v8qi (__o, (int8x8_t)__idx); } __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbl4_s8 (int8x8x4_t __tab, int8x8_t __idx) { - int8x8_t __result; int8x16x2_t __temp; __builtin_aarch64_simd_oi __o; __temp.val[0] = vcombine_s8 (__tab.val[0], __tab.val[1]); @@ -9739,15 +9732,13 @@ vtbl4_s8 (int8x8x4_t __tab, int8x8_t __idx) (int8x16_t) __temp.val[0], 0); __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[1], 1); - __result = __builtin_aarch64_tbl3v8qi (__o, __idx); - return __result; + return __builtin_aarch64_qtbl2v8qi (__o, __idx); } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbl4_u8 (uint8x8x4_t __tab, uint8x8_t __idx) { - uint8x8_t __result; uint8x16x2_t __temp; __builtin_aarch64_simd_oi __o; __temp.val[0] = vcombine_u8 (__tab.val[0], __tab.val[1]); @@ -9756,15 +9747,13 @@ vtbl4_u8 (uint8x8x4_t __tab, uint8x8_t __idx) (int8x16_t) __temp.val[0], 0); __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[1], 1); - __result = (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)__idx); - return __result; + return (uint8x8_t)__builtin_aarch64_qtbl2v8qi (__o, (int8x8_t)__idx); } __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbl4_p8 (poly8x8x4_t __tab, uint8x8_t __idx) { - poly8x8_t __result; poly8x16x2_t __temp; __builtin_aarch64_simd_oi __o; __temp.val[0] = vcombine_p8 (__tab.val[0], __tab.val[1]); @@ -9773,8 +9762,7 @@ vtbl4_p8 (poly8x8x4_t __tab, uint8x8_t __idx) (int8x16_t) __temp.val[0], 0); __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[1], 1); - __result = (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)__idx); - return __result; + return(poly8x8_t)__builtin_aarch64_qtbl2v8qi (__o, 
(int8x8_t)__idx); } __extension__ extern __inline int8x8_t @@ -9782,7 +9770,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbx2_s8 (int8x8_t __r, int8x8x2_t __tab, int8x8_t __idx) { int8x16_t __temp = vcombine_s8 (__tab.val[0], __tab.val[1]); - return __builtin_aarch64_tbx1v8qi (__r, __temp, __idx); + return __builtin_aarch64_qtbx1v8qi (__r, __temp, __idx); } __extension__ extern __inline uint8x8_t @@ -9790,7 +9778,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbx2_u8 (uint8x8_t __r, uint8x8x2_t __tab, uint8x8_t __idx) { uint8x16_t __temp = vcombine_u8 (__tab.val[0], __tab.val[1]); - return __builtin_aarch64_tbx1v8qi_uuuu (__r, __temp, __idx); + return __builtin_aarch64_qtbx1v8qi_uuuu (__r, __temp, __idx); } __extension__ extern __inline poly8x8_t @@ -9798,9 +9786,9 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbx2_p8 (poly8x8_t __r, poly8x8x2_t __tab, uint8x8_t __idx) { poly8x16_t __temp = vcombine_p8 (__tab.val[0], __tab.val[1]); - return (poly8x8_t) __builtin_aarch64_tbx1v8qi ((int8x8_t) __r, - (int8x16_t) __temp, - (int8x8_t) __idx); + return (poly8x8_t) __builtin_aarch64_qtbx1v8qi ((int8x8_t) __r, + (int8x16_t) __temp, + (int8x8_t) __idx); } /* End of temporary inline asm. */ @@ -23335,7 +23323,7 @@ vqtbl2_s8 (int8x16x2_t __tab, uint8x8_t __idx) __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_set_qregoiv16qi (__o, __tab.val[0], 0); __o = __builtin_aarch64_set_qregoiv16qi (__o, __tab.val[1], 1); - return __builtin_aarch64_tbl3v8qi (__o, (int8x8_t)__idx); + return __builtin_aarch64_qtbl2v8qi (__o, (int8x8_t)__idx); } __extension__ extern __inline uint8x8_t @@ -23345,7 +23333,7 @@ vqtbl2_u8 (uint8x16x2_t __tab, uint8x8_t __idx) __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); - return (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)__idx); + return (uint8x8_t)__builtin_aarch64_qtbl2v8qi (__o, (int8x8_t)__idx); } __extension__ extern __inline poly8x8_t @@ -23355,7 +23343,7 @@ vqtbl2_p8 (poly8x16x2_t __tab, uint8x8_t __idx) __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); - return (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)__idx); + return (poly8x8_t)__builtin_aarch64_qtbl2v8qi (__o, (int8x8_t)__idx); } __extension__ extern __inline int8x16_t @@ -23365,7 +23353,7 @@ vqtbl2q_s8 (int8x16x2_t __tab, uint8x16_t __idx) __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); - return __builtin_aarch64_tbl3v16qi (__o, (int8x16_t)__idx); + return __builtin_aarch64_qtbl2v16qi (__o, (int8x16_t)__idx); } __extension__ extern __inline uint8x16_t @@ -23375,7 +23363,7 @@ vqtbl2q_u8 (uint8x16x2_t __tab, uint8x16_t __idx) __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); - return (uint8x16_t)__builtin_aarch64_tbl3v16qi (__o, (int8x16_t)__idx); + return (uint8x16_t)__builtin_aarch64_qtbl2v16qi (__o, (int8x16_t)__idx); } __extension__ extern __inline poly8x16_t @@ -23385,7 +23373,7 @@ vqtbl2q_p8 (poly8x16x2_t __tab, uint8x16_t __idx) __builtin_aarch64_simd_oi __o; __o = 
__builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); - return (poly8x16_t)__builtin_aarch64_tbl3v16qi (__o, (int8x16_t)__idx); + return (poly8x16_t)__builtin_aarch64_qtbl2v16qi (__o, (int8x16_t)__idx); } /* vqtbl3 */ @@ -23539,7 +23527,7 @@ vqtbx2_s8 (int8x8_t __r, int8x16x2_t __tab, uint8x8_t __idx) __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_set_qregoiv16qi (__o, __tab.val[0], 0); __o = __builtin_aarch64_set_qregoiv16qi (__o, __tab.val[1], 1); - return __builtin_aarch64_tbx4v8qi (__r, __o, (int8x8_t)__idx); + return __builtin_aarch64_qtbx2v8qi (__r, __o, (int8x8_t)__idx); } __extension__ extern __inline uint8x8_t @@ -23549,8 +23537,8 @@ vqtbx2_u8 (uint8x8_t __r, uint8x16x2_t __tab, uint8x8_t __idx) __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); - return (uint8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)__r, __o, - (int8x8_t)__idx); + return (uint8x8_t)__builtin_aarch64_qtbx2v8qi ((int8x8_t)__r, __o, + (int8x8_t)__idx); } __extension__ extern __inline poly8x8_t @@ -23560,8 +23548,8 @@ vqtbx2_p8 (poly8x8_t __r, poly8x16x2_t __tab, uint8x8_t __idx) __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); - return (poly8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)__r, __o, - (int8x8_t)__idx); + return (poly8x8_t)__builtin_aarch64_qtbx2v8qi ((int8x8_t)__r, __o, + (int8x8_t)__idx); } __extension__ extern __inline int8x16_t @@ -23571,7 +23559,7 @@ vqtbx2q_s8 (int8x16_t __r, int8x16x2_t __tab, uint8x16_t __idx) __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_set_qregoiv16qi (__o, __tab.val[0], 0); __o = __builtin_aarch64_set_qregoiv16qi (__o, __tab.val[1], 1); - return __builtin_aarch64_tbx4v16qi (__r, __o, (int8x16_t)__idx); + return __builtin_aarch64_qtbx2v16qi (__r, __o, (int8x16_t)__idx); } __extension__ extern __inline uint8x16_t @@ -23581,7 +23569,7 @@ vqtbx2q_u8 (uint8x16_t __r, uint8x16x2_t __tab, uint8x16_t __idx) __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); - return (uint8x16_t)__builtin_aarch64_tbx4v16qi ((int8x16_t)__r, __o, + return (uint8x16_t)__builtin_aarch64_qtbx2v16qi ((int8x16_t)__r, __o, (int8x16_t)__idx); } @@ -23592,8 +23580,8 @@ vqtbx2q_p8 (poly8x16_t __r, poly8x16x2_t __tab, uint8x16_t __idx) __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); - return (poly8x16_t)__builtin_aarch64_tbx4v16qi ((int8x16_t)__r, __o, - (int8x16_t)__idx); + return (poly8x16_t)__builtin_aarch64_qtbx2v16qi ((int8x16_t)__r, __o, + (int8x16_t)__idx); } /* vqtbx3 */ @@ -28511,7 +28499,6 @@ __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbx4_s8 (int8x8_t __r, int8x8x4_t __tab, int8x8_t __idx) { - int8x8_t __result; int8x16x2_t __temp; __builtin_aarch64_simd_oi __o; __temp.val[0] = vcombine_s8 (__tab.val[0], __tab.val[1]); @@ -28520,15 +28507,13 @@ vtbx4_s8 (int8x8_t __r, int8x8x4_t __tab, int8x8_t __idx) (int8x16_t) __temp.val[0], 0); __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[1], 1); - __result 
= __builtin_aarch64_tbx4v8qi (__r, __o, __idx); - return __result; + return __builtin_aarch64_qtbx2v8qi (__r, __o, __idx); } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbx4_u8 (uint8x8_t __r, uint8x8x4_t __tab, uint8x8_t __idx) { - uint8x8_t __result; uint8x16x2_t __temp; __builtin_aarch64_simd_oi __o; __temp.val[0] = vcombine_u8 (__tab.val[0], __tab.val[1]); @@ -28537,16 +28522,14 @@ vtbx4_u8 (uint8x8_t __r, uint8x8x4_t __tab, uint8x8_t __idx) (int8x16_t) __temp.val[0], 0); __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[1], 1); - __result = (uint8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)__r, __o, - (int8x8_t)__idx); - return __result; + return (uint8x8_t)__builtin_aarch64_qtbx2v8qi ((int8x8_t)__r, __o, + (int8x8_t)__idx); } __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbx4_p8 (poly8x8_t __r, poly8x8x4_t __tab, uint8x8_t __idx) { - poly8x8_t __result; poly8x16x2_t __temp; __builtin_aarch64_simd_oi __o; __temp.val[0] = vcombine_p8 (__tab.val[0], __tab.val[1]); @@ -28555,9 +28538,8 @@ vtbx4_p8 (poly8x8_t __r, poly8x8x4_t __tab, uint8x8_t __idx) (int8x16_t) __temp.val[0], 0); __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[1], 1); - __result = (poly8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)__r, __o, - (int8x8_t)__idx); - return __result; + return (poly8x8_t)__builtin_aarch64_qtbx2v8qi ((int8x8_t)__r, __o, + (int8x8_t)__idx); } /* vtrn */ -- cgit v1.1 From e0e82856d535f56c916382f892ed2435dde54d4d Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Tue, 20 Jul 2021 17:26:10 +0200 Subject: rs6000: Fix up easy_vector_constant_msb handling [PR101384] The following gcc.dg/pr101384.c testcase is miscompiled on powerpc64le-linux. easy_altivec_constant has code that tries to construct vector constants with different element sizes, perhaps different from CONST_VECTOR's mode. But as written, that works fine for the vspltis[bhw] cases, but not for the vspltisw x,-1; vsl[bhw] x,x,x case, because that path always creates a V16QImode, V8HImode or V4SImode constant containing a broadcast constant with just the MSB set. The vspltis_constant function and friends expect the vspltis[bhw] instructions, where the small [-16..15] or even [-32..30] constant is sign-extended into the remaining step bytes, but that is not the case for the 0x80...00 constants. With step > 1 we can't handle e.g. { 0x80, 0xff, 0xff, 0xff, 0x80, 0xff, 0xff, 0xff, 0x80, 0xff, 0xff, 0xff, 0x80, 0xff, 0xff, 0xff } vectors, but we do want to handle e.g. { 0, 0, 0, 0x80, 0, 0, 0, 0x80, 0, 0, 0, 0x80, 0, 0, 0, 0x80 }, and similarly with copies > 1 we do want to handle e.g. { 0x80808080, 0x80808080, 0x80808080, 0x80808080 }. 2021-07-20 Jakub Jelinek PR target/101384 * config/rs6000/rs6000-protos.h (easy_altivec_constant): Change return type from bool to int. * config/rs6000/rs6000.c (vspltis_constant): Fix up handling of the EASY_VECTOR_MSB case when either step or copies is not 1. (vspltis_shifted): Fix comment typo. (easy_altivec_constant): Change return type from bool to int; instead of returning true, return the byte size of the element mode that should be used to synthesize the constant. * config/rs6000/predicates.md (easy_vector_constant_msb): Require that vspltis_shifted is 0, and handle the case where easy_altivec_constant assumes a different vector mode from CONST_VECTOR's mode.
* config/rs6000/altivec.md (easy_vector_constant_msb splitter): Use easy_altivec_constant to determine mode in which -1 >> -1 should be performed, use rs6000_expand_vector_init instead of gen_vec_initv4sisi. * gcc.dg/pr101384.c: New test. * gcc.target/powerpc/pr101384-1.c: New test. * gcc.target/powerpc/pr101384-2.c: New test. --- gcc/config/rs6000/altivec.md | 19 ++++++++++--- gcc/config/rs6000/predicates.md | 17 +++++++++-- gcc/config/rs6000/rs6000-protos.h | 2 +- gcc/config/rs6000/rs6000.c | 59 +++++++++++++++++++++++++++------------ 4 files changed, 71 insertions(+), 26 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index a20d6ac..d70c17e 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -317,22 +317,33 @@ [(const_int 0)] { rtx dest = operands[0]; - machine_mode mode = GET_MODE (operands[0]); + machine_mode mode; rtvec v; int i, num_elements; - if (mode == V4SFmode) + switch (easy_altivec_constant (operands[1], mode)) { + case 1: + mode = V16QImode; + break; + case 2: + mode = V8HImode; + break; + case 4: mode = V4SImode; - dest = gen_lowpart (V4SImode, dest); + break; + default: + gcc_unreachable (); } + if (mode != mode) + dest = gen_lowpart (mode, dest); num_elements = GET_MODE_NUNITS (mode); v = rtvec_alloc (num_elements); for (i = 0; i < num_elements; i++) RTVEC_ELT (v, i) = constm1_rtx; - emit_insn (gen_vec_initv4sisi (dest, gen_rtx_PARALLEL (mode, v))); + rs6000_expand_vector_init (dest, gen_rtx_PARALLEL (mode, v)); emit_insn (gen_rtx_SET (dest, gen_rtx_ASHIFT (mode, dest, dest))); DONE; }) diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md index 121cbf1..956e42b 100644 --- a/gcc/config/rs6000/predicates.md +++ b/gcc/config/rs6000/predicates.md @@ -683,15 +683,26 @@ (define_predicate "easy_vector_constant_msb" (and (match_code "const_vector") (and (match_test "TARGET_ALTIVEC") - (match_test "easy_altivec_constant (op, mode)"))) + (match_test "easy_altivec_constant (op, mode)") + (match_test "vspltis_shifted (op) == 0"))) { HOST_WIDE_INT val; - int elt; + int elt, sz = easy_altivec_constant (op, mode); + machine_mode inner = GET_MODE_INNER (mode); + int isz = GET_MODE_SIZE (inner); if (mode == V2DImode || mode == V2DFmode) return 0; elt = BYTES_BIG_ENDIAN ? GET_MODE_NUNITS (mode) - 1 : 0; + if (isz < sz) + { + if (const_vector_elt_as_int (op, elt) != 0) + return 0; + elt += (BYTES_BIG_ENDIAN ? 
-1 : 1) * (sz - isz) / isz; + } + else if (isz > sz) + inner = smallest_int_mode_for_size (sz * BITS_PER_UNIT); val = const_vector_elt_as_int (op, elt); - return EASY_VECTOR_MSB (val, GET_MODE_INNER (mode)); + return EASY_VECTOR_MSB (val, inner); }) ;; Return true if this is an easy altivec constant that we form diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h index 94bf961..14f6b31 100644 --- a/gcc/config/rs6000/rs6000-protos.h +++ b/gcc/config/rs6000/rs6000-protos.h @@ -30,7 +30,7 @@ extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, int, int, int, tree, machine_mode); #endif /* TREE_CODE */ -extern bool easy_altivec_constant (rtx, machine_mode); +extern int easy_altivec_constant (rtx, machine_mode); extern bool xxspltib_constant_p (rtx, machine_mode, int *, int *); extern int vspltis_shifted (rtx); extern HOST_WIDE_INT const_vector_elt_as_int (rtx, unsigned int); diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 779de95..279f00c 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -6134,6 +6134,27 @@ vspltis_constant (rtx op, unsigned step, unsigned copies) splat_val = val; msb_val = val >= 0 ? 0 : -1; + if (val == 0 && step > 1) + { + /* Special case for loading most significant bit with step > 1. + In that case, match 0s in all but step-1s elements, where match + EASY_VECTOR_MSB. */ + for (i = 1; i < nunits; ++i) + { + unsigned elt = BYTES_BIG_ENDIAN ? nunits - 1 - i : i; + HOST_WIDE_INT elt_val = const_vector_elt_as_int (op, elt); + if ((i & (step - 1)) == step - 1) + { + if (!EASY_VECTOR_MSB (elt_val, inner)) + break; + } + else if (elt_val) + break; + } + if (i == nunits) + return true; + } + /* Construct the value to be splatted, if possible. If not, return 0. */ for (i = 2; i <= copies; i *= 2) { @@ -6146,6 +6167,7 @@ vspltis_constant (rtx op, unsigned step, unsigned copies) | (small_val & mask))) return false; splat_val = small_val; + inner = smallest_int_mode_for_size (bitsize); } /* Check if SPLAT_VAL can really be the operand of a vspltis[bhw]. */ @@ -6160,8 +6182,9 @@ vspltis_constant (rtx op, unsigned step, unsigned copies) ; /* Also check if are loading up the most significant bit which can be done by - loading up -1 and shifting the value left by -1. */ - else if (EASY_VECTOR_MSB (splat_val, inner)) + loading up -1 and shifting the value left by -1. Only do this for + step 1 here, for larger steps it is done earlier. */ + else if (EASY_VECTOR_MSB (splat_val, inner) && step == 1) ; else @@ -6271,15 +6294,15 @@ vspltis_shifted (rtx op) } } - /* If all elements are equal, we don't need to do VLSDOI. */ + /* If all elements are equal, we don't need to do VSLDOI. */ return 0; } -/* Return true if OP is of the given MODE and can be synthesized - with a vspltisb, vspltish or vspltisw. */ +/* Return non-zero (element mode byte size) if OP is of the given MODE + and can be synthesized with a vspltisb, vspltish or vspltisw. */ -bool +int easy_altivec_constant (rtx op, machine_mode mode) { unsigned step, copies; @@ -6287,39 +6310,39 @@ easy_altivec_constant (rtx op, machine_mode mode) if (mode == VOIDmode) mode = GET_MODE (op); else if (mode != GET_MODE (op)) - return false; + return 0; /* V2DI/V2DF was added with VSX. Only allow 0 and all 1's as easy constants. */ if (mode == V2DFmode) - return zero_constant (op, mode); + return zero_constant (op, mode) ? 
8 : 0; else if (mode == V2DImode) { if (!CONST_INT_P (CONST_VECTOR_ELT (op, 0)) || !CONST_INT_P (CONST_VECTOR_ELT (op, 1))) - return false; + return 0; if (zero_constant (op, mode)) - return true; + return 8; if (INTVAL (CONST_VECTOR_ELT (op, 0)) == -1 && INTVAL (CONST_VECTOR_ELT (op, 1)) == -1) - return true; + return 8; - return false; + return 0; } /* V1TImode is a special container for TImode. Ignore for now. */ else if (mode == V1TImode) - return false; + return 0; /* Start with a vspltisw. */ step = GET_MODE_NUNITS (mode) / 4; copies = 1; if (vspltis_constant (op, step, copies)) - return true; + return 4; /* Then try with a vspltish. */ if (step == 1) @@ -6328,7 +6351,7 @@ easy_altivec_constant (rtx op, machine_mode mode) step >>= 1; if (vspltis_constant (op, step, copies)) - return true; + return 2; /* And finally a vspltisb. */ if (step == 1) @@ -6337,12 +6360,12 @@ easy_altivec_constant (rtx op, machine_mode mode) step >>= 1; if (vspltis_constant (op, step, copies)) - return true; + return 1; if (vspltis_shifted (op) != 0) - return true; + return GET_MODE_SIZE (GET_MODE_INNER (mode)); - return false; + return 0; } /* Generate a VEC_DUPLICATE representing a vspltis[bhw] instruction whose -- cgit v1.1 From 7aa28dbc371cf3c09c05c68672b00d9006391595 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Wed, 21 Jul 2021 05:15:55 -0700 Subject: x86: Remove OPTION_MASK_ISA_SSE4_2 from CRC32 _builtin functions Since commit 39671f87b2df6a1894cc11a161e4a7949d1ddccd Author: H.J. Lu Date: Thu Apr 15 05:59:48 2021 -0700 x86: Use crc32 target option for CRC32 intrinsics enabled OPTION_MASK_ISA_CRC32 for -msse4 and removed the TARGET_SSE4_2 check in sse4_2_crc32 patterns, remove OPTION_MASK_ISA_SSE4_2 from CRC32 _builtin functions. gcc/ PR target/101549 * config/i386/i386-builtin.def: Remove OPTION_MASK_ISA_SSE4_2 from CRC32 _builtin functions. gcc/testsuite/ PR target/101549 * gcc.target/i386/crc32-6.c: New test.
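As a user-visible illustration (not part of the patch): with this change the CRC32 intrinsics should be usable with -mcrc32 alone, without also enabling SSE4.2. A minimal sketch, assuming the usual <x86intrin.h> wrappers that expand to the __builtin_ia32_crc32* functions; the file name and compile line are hypothetical:

/* crc32-sketch.c: compile with "gcc -O2 -mcrc32 crc32-sketch.c".  */
#include <x86intrin.h>

unsigned int
crc32_of_buffer (unsigned int crc, const unsigned char *buf, unsigned long len)
{
  /* Each byte goes through _mm_crc32_u8, which expands to
     __builtin_ia32_crc32qi, one of the builtins whose ISA requirement
     is relaxed by this patch.  */
  for (unsigned long i = 0; i < len; i++)
    crc = _mm_crc32_u8 (crc, buf[i]);
  return crc;
}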
--- gcc/config/i386/i386-builtin.def | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def index 1cc0cc6..4b1ae0e 100644 --- a/gcc/config/i386/i386-builtin.def +++ b/gcc/config/i386/i386-builtin.def @@ -970,10 +970,10 @@ BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_ptestv2di, "__builtin_ia32_pte /* SSE4.2 */ BDESC (OPTION_MASK_ISA_SSE4_2, 0, CODE_FOR_nothing, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI) -BDESC (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, 0, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR) -BDESC (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, 0, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT) -BDESC (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, 0, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT) -BDESC (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64) +BDESC (OPTION_MASK_ISA_CRC32, 0, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR) +BDESC (OPTION_MASK_ISA_CRC32, 0, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT) +BDESC (OPTION_MASK_ISA_CRC32, 0, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT) +BDESC (OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64) /* SSE4A */ BDESC (OPTION_MASK_ISA_SSE4A, 0, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT) -- cgit v1.1 From 005054e48e2d6c4d9b0aac7fda2e4a324886307c Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Tue, 20 Jul 2021 18:25:53 -0400 Subject: rs6000: Main function with stubs for parsing and output 2021-07-20 Bill Schmidt gcc/ * config/rs6000/rs6000-gen-builtins.c (rbtree.h): New #include. (num_bifs): New variable. (num_ovld_stanzas): Likewise. (num_ovlds): Likewise. (parse_codes): New enum. (bif_rbt): New variable. (ovld_rbt): Likewise. (fntype_rbt): Likewise. (bifo_rbt): Likewise. (parse_bif): New stub function. (create_bif_order): Likewise. (parse_ovld): Likewise. (write_header_file): Likewise. (write_init_file): Likewise. (write_defines_file): Likewise. (delete_output_files): New function. (main): Likewise. --- gcc/config/rs6000/rs6000-gen-builtins.c | 215 ++++++++++++++++++++++++++++++++ 1 file changed, 215 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-gen-builtins.c b/gcc/config/rs6000/rs6000-gen-builtins.c index 34566fc..08aa76b 100644 --- a/gcc/config/rs6000/rs6000-gen-builtins.c +++ b/gcc/config/rs6000/rs6000-gen-builtins.c @@ -163,6 +163,7 @@ along with GCC; see the file COPYING3. If not see #include #include #include +#include "rbtree.h" /* Input and output file descriptors and pathnames. */ static FILE *bif_file; @@ -249,6 +250,29 @@ struct typeinfo char *val2; }; +static int num_bifs; +static int num_ovld_stanzas; +static int num_ovlds; + +/* Return codes for parsing routines. 
*/ +enum parse_codes +{ + PC_OK, + PC_EOFILE, + PC_EOSTANZA, + PC_PARSEFAIL +}; + +/* The red-black trees for built-in function identifiers, built-in + overload identifiers, and function type descriptors. */ +static rbt_strings bif_rbt; +static rbt_strings ovld_rbt; +static rbt_strings fntype_rbt; + +/* Another red-black tree containing a mapping from built-in function + identifiers to the order in which they were encountered. */ +static rbt_strings bifo_rbt; + /* Pointer to a diagnostic function. */ static void (*diag) (const char *, ...) __attribute__ ((format (printf, 1, 2))); @@ -864,3 +888,194 @@ match_type (typeinfo *typedata, int voidok) return 1; } + +/* Parse the built-in file. */ +static parse_codes +parse_bif (void) +{ + return PC_OK; +} + +/* Create a mapping from function IDs in their final order to the order + they appear in the built-in function file. */ +static void +create_bif_order (void) +{ +} + +/* Parse the overload file. */ +static parse_codes +parse_ovld (void) +{ + return PC_OK; +} + +/* Write everything to the header file (rs6000-builtins.h). Return + 1 if successful, 0 otherwise. */ +static int +write_header_file (void) +{ + return 1; +} + +/* Write everything to the initialization file (rs6000-builtins.c). + Return 1 if successful, 0 otherwise. */ +static int +write_init_file (void) +{ + return 1; +} + +/* Write everything to the include file (rs6000-vecdefines.h). + Return 1 if successful, 0 otherwise. */ +static int +write_defines_file (void) +{ + return 1; +} + +/* Close and delete output files after any failure, so that subsequent + build dependencies will fail. */ +static void +delete_output_files (void) +{ + /* Depending on whence we're called, some of these may already be + closed. Don't check for errors. */ + fclose (header_file); + fclose (init_file); + fclose (defines_file); + + remove (header_path); + remove (init_path); + remove (defines_path); +} + +/* Main program to convert flat files into built-in initialization code. */ +int +main (int argc, const char **argv) +{ + if (argc != 6) + { + fprintf (stderr, + "Five arguments required: two input files and three output " + "files.\n"); + exit (1); + } + + pgm_path = argv[0]; + bif_path = argv[1]; + ovld_path = argv[2]; + header_path = argv[3]; + init_path = argv[4]; + defines_path = argv[5]; + + bif_file = fopen (bif_path, "r"); + if (!bif_file) + { + fprintf (stderr, "Cannot open input built-in file '%s'.\n", bif_path); + exit (1); + } + ovld_file = fopen (ovld_path, "r"); + if (!ovld_file) + { + fprintf (stderr, "Cannot open input overload file '%s'.\n", ovld_path); + exit (1); + } + header_file = fopen (header_path, "w"); + if (!header_file) + { + fprintf (stderr, "Cannot open header file '%s' for output.\n", + header_path); + exit (1); + } + init_file = fopen (init_path, "w"); + if (!init_file) + { + fprintf (stderr, "Cannot open init file '%s' for output.\n", init_path); + exit (1); + } + defines_file = fopen (defines_path, "w"); + if (!defines_file) + { + fprintf (stderr, "Cannot open defines file '%s' for output.\n", + defines_path); + exit (1); + } + + /* Initialize the balanced trees containing built-in function ids, + overload function ids, and function type declaration ids. */ + rbt_new (&bif_rbt); + rbt_new (&ovld_rbt); + rbt_new (&fntype_rbt); + + /* Initialize another balanced tree that contains a map from built-in + function ids to the order in which they were encountered. */ + rbt_new (&bifo_rbt); + + /* Parse the built-in function file. 
*/ + num_bifs = 0; + line = 0; + if (parse_bif () == PC_PARSEFAIL) + { + fprintf (stderr, "Parsing of '%s' failed, aborting.\n", bif_path); + delete_output_files (); + exit (1); + } + fclose (bif_file); + + /* Create a mapping from function IDs in their final order to + the order they appear in the built-in function file. */ + create_bif_order (); + +#ifdef DEBUG + fprintf (stderr, "\nFunction ID list:\n"); + rbt_dump (&bif_rbt, bif_rbt.rbt_root); + fprintf (stderr, "\n"); +#endif + + /* Parse the overload file. */ + num_ovld_stanzas = 0; + num_ovlds = 0; + line = 0; + if (parse_ovld () == PC_PARSEFAIL) + { + fprintf (stderr, "Parsing of '%s' failed, aborting.\n", ovld_path); + delete_output_files (); + exit (1); + } + fclose (ovld_file); + +#ifdef DEBUG + fprintf (stderr, "\nFunction type decl list:\n"); + rbt_dump (&fntype_rbt, fntype_rbt.rbt_root); + fprintf (stderr, "\n"); +#endif + + /* Write the header file and the file containing initialization code. */ + if (!write_header_file ()) + { + fprintf (stderr, "Output to '%s' failed, aborting.\n", header_path); + delete_output_files (); + exit (1); + } + if (!write_init_file ()) + { + fprintf (stderr, "Output to '%s' failed, aborting.\n", init_path); + delete_output_files (); + exit (1); + } + + /* Write the defines file to be included into altivec.h. */ + if (!write_defines_file ()) + { + fprintf (stderr, "Output to '%s' failed, aborting.\n", defines_path); + delete_output_files (); + exit (1); + } + + fclose (header_file); + fclose (init_file); + fclose (defines_file); + + return 0; +} -- cgit v1.1 From c2d777d6f3a17ac07f78bc4c7dc4d1e0ddd566ae Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Tue, 20 Jul 2021 18:45:57 -0400 Subject: rs6000: Parsing built-in input file, part 1 of 3 2021-07-20 Bill Schmidt gcc/ * config/rs6000/rs6000-gen-builtins.c (bif_stanza): New enum. (curr_bif_stanza): New variable. (stanza_entry): New struct. (stanza_map): New initialized variable. (enable_string): Likewise. (fnkinds): New enum. (typelist): New struct. (attrinfo): Likewise. (MAXRESTROPNDS): New macro. (prototype): New struct. (MAXBIFS): New macro. (bifdata): New struct. (bifs): New variable. (curr_bif): Likewise. (bif_order): Likewise. (bif_index): Likewise. (fatal): New function. (stanza_name_to_stanza): Likewise. (parse_bif_attrs): New stub function. (parse_prototype): Likewise. (parse_bif_entry): New function. (parse_bif_stanza): Likewise. (parse_bif): Implement. (set_bif_order): New function. (create_bif_order): Implement. --- gcc/config/rs6000/rs6000-gen-builtins.c | 379 +++++++++++++++++++++++++++++++- 1 file changed, 378 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-gen-builtins.c b/gcc/config/rs6000/rs6000-gen-builtins.c index 08aa76b..b066ece 100644 --- a/gcc/config/rs6000/rs6000-gen-builtins.c +++ b/gcc/config/rs6000/rs6000-gen-builtins.c @@ -194,6 +194,101 @@ enum void_status VOID_OK }; +/* Stanzas are groupings of built-in functions and overloads by some + common feature/attribute. These definitions are for built-in function + stanzas. 
*/ +enum bif_stanza +{ + BSTZ_ALWAYS, + BSTZ_P5, + BSTZ_P6, + BSTZ_ALTIVEC, + BSTZ_CELL, + BSTZ_VSX, + BSTZ_P7, + BSTZ_P7_64, + BSTZ_P8, + BSTZ_P8V, + BSTZ_P9, + BSTZ_P9_64, + BSTZ_P9V, + BSTZ_IEEE128_HW, + BSTZ_DFP, + BSTZ_CRYPTO, + BSTZ_HTM, + BSTZ_P10, + BSTZ_P10_64, + BSTZ_MMA, + NUMBIFSTANZAS +}; + +static bif_stanza curr_bif_stanza; + +struct stanza_entry +{ + const char *stanza_name; + bif_stanza stanza; +}; + +static stanza_entry stanza_map[NUMBIFSTANZAS] = + { + { "always", BSTZ_ALWAYS }, + { "power5", BSTZ_P5 }, + { "power6", BSTZ_P6 }, + { "altivec", BSTZ_ALTIVEC }, + { "cell", BSTZ_CELL }, + { "vsx", BSTZ_VSX }, + { "power7", BSTZ_P7 }, + { "power7-64", BSTZ_P7_64 }, + { "power8", BSTZ_P8 }, + { "power8-vector", BSTZ_P8V }, + { "power9", BSTZ_P9 }, + { "power9-64", BSTZ_P9_64 }, + { "power9-vector", BSTZ_P9V }, + { "ieee128-hw", BSTZ_IEEE128_HW }, + { "dfp", BSTZ_DFP }, + { "crypto", BSTZ_CRYPTO }, + { "htm", BSTZ_HTM }, + { "power10", BSTZ_P10 }, + { "power10-64", BSTZ_P10_64 }, + { "mma", BSTZ_MMA } + }; + +static const char *enable_string[NUMBIFSTANZAS] = + { + "ENB_ALWAYS", + "ENB_P5", + "ENB_P6", + "ENB_ALTIVEC", + "ENB_CELL", + "ENB_VSX", + "ENB_P7", + "ENB_P7_64", + "ENB_P8", + "ENB_P8V", + "ENB_P9", + "ENB_P9_64", + "ENB_P9V", + "ENB_IEEE128_HW", + "ENB_DFP", + "ENB_CRYPTO", + "ENB_HTM", + "ENB_P10", + "ENB_P10_64", + "ENB_MMA" + }; + +/* Function modifiers provide special handling for const, pure, and fpmath + functions. These are mutually exclusive, and therefore kept separate + from other bif attributes. */ +enum fnkinds +{ + FNK_NONE, + FNK_CONST, + FNK_PURE, + FNK_FPMATH +}; + /* Legal base types for an argument or return type. */ enum basetype { @@ -250,7 +345,76 @@ struct typeinfo char *val2; }; +/* A list of argument types. */ +struct typelist +{ + typeinfo info; + typelist *next; +}; + +/* Attributes of a builtin function. */ +struct attrinfo +{ + bool isinit; + bool isset; + bool isextract; + bool isnosoft; + bool isldvec; + bool isstvec; + bool isreve; + bool ispred; + bool ishtm; + bool ishtmspr; + bool ishtmcr; + bool ismma; + bool isquad; + bool ispair; + bool isno32bit; + bool is32bit; + bool iscpu; + bool isldstmask; + bool islxvrse; + bool islxvrze; + bool isendian; +}; + +/* Fields associated with a function prototype (bif or overload). */ +#define MAXRESTROPNDS 3 +struct prototype +{ + typeinfo rettype; + char *bifname; + int nargs; + typelist *args; + int restr_opnd[MAXRESTROPNDS]; + restriction restr[MAXRESTROPNDS]; + char *restr_val1[MAXRESTROPNDS]; + char *restr_val2[MAXRESTROPNDS]; +}; + +/* Data associated with a builtin function, and a table of such data. */ +#define MAXBIFS 16384 +struct bifdata +{ + int stanza; + fnkinds kind; + prototype proto; + char *idname; + char *patname; + attrinfo attrs; + char *fndecl; +}; + +static bifdata bifs[MAXBIFS]; static int num_bifs; +static int curr_bif; + +/* Array used to track the order in which built-ins appeared in the + built-in file. We reorder them alphabetically but sometimes need + this information. */ +static int *bif_order; +static int bif_index = 0; + static int num_ovld_stanzas; static int num_ovlds; @@ -419,6 +583,25 @@ handle_pointer (typeinfo *typedata) } } +/* Produce a fatal error message. 
*/ +static void +fatal (const char *msg) +{ + fprintf (stderr, "FATAL: %s\n", msg); + abort (); +} + +static bif_stanza +stanza_name_to_stanza (const char *stanza_name) +{ + for (int i = 0; i < NUMBIFSTANZAS; i++) + if (!strcmp (stanza_name, stanza_map[i].stanza_name)) + return stanza_map[i].stanza; + fatal ("Stanza mapping is inconsistent."); + /* Unreachable. */ + return BSTZ_ALWAYS; +} + /* Match one of the allowable base types. Consumes one token unless the token is "long", which must be paired with a second "long". Optionally consumes a following '*' token for pointers. Return 1 for success, @@ -889,11 +1072,203 @@ match_type (typeinfo *typedata, int voidok) return 1; } +/* Parse the attribute list. */ +static parse_codes +parse_bif_attrs (attrinfo *attrptr) +{ + return PC_OK; +} + +/* Parse a function prototype. This code is shared by the bif and overload + file processing. */ +static parse_codes +parse_prototype (prototype *protoptr) +{ + return PC_OK; +} + +/* Parse a two-line entry for a built-in function. */ +static parse_codes +parse_bif_entry (void) +{ + /* Check for end of stanza. */ + pos = 0; + consume_whitespace (); + if (linebuf[pos] == '[') + return PC_EOSTANZA; + + /* Allocate an entry in the bif table. */ + if (num_bifs >= MAXBIFS - 1) + { + (*diag) ("too many built-in functions.\n"); + return PC_PARSEFAIL; + } + + curr_bif = num_bifs++; + bifs[curr_bif].stanza = curr_bif_stanza; + + /* Read the first token and see if it is a function modifier. */ + consume_whitespace (); + int oldpos = pos; + char *token = match_identifier (); + if (!token) + { + (*diag) ("malformed entry at column %d\n", oldpos + 1); + return PC_PARSEFAIL; + } + + if (!strcmp (token, "const")) + bifs[curr_bif].kind = FNK_CONST; + else if (!strcmp (token, "pure")) + bifs[curr_bif].kind = FNK_PURE; + else if (!strcmp (token, "fpmath")) + bifs[curr_bif].kind = FNK_FPMATH; + else + { + /* No function modifier, so push the token back. */ + pos = oldpos; + bifs[curr_bif].kind = FNK_NONE; + } + + if (parse_prototype (&bifs[curr_bif].proto) == PC_PARSEFAIL) + return PC_PARSEFAIL; + + /* Now process line 2. First up is the builtin id. */ + if (!advance_line (bif_file)) + { + (*diag) ("unexpected EOF.\n"); + return PC_PARSEFAIL; + } + + pos = 0; + consume_whitespace (); + oldpos = pos; + bifs[curr_bif].idname = match_identifier (); + if (!bifs[curr_bif].idname) + { + (*diag) ("missing builtin id at column %d.\n", pos + 1); + return PC_PARSEFAIL; + } + +#ifdef DEBUG + (*diag) ("ID name is '%s'.\n", bifs[curr_bif].idname); +#endif + + /* Save the ID in a lookup structure. */ + if (!rbt_insert (&bif_rbt, bifs[curr_bif].idname)) + { + (*diag) ("duplicate function ID '%s' at column %d.\n", + bifs[curr_bif].idname, oldpos + 1); + return PC_PARSEFAIL; + } + + /* Append a number representing the order in which this function + was encountered to its name, and save in another lookup + structure. */ + char *buf; + asprintf (&buf, "%s:%05d", bifs[curr_bif].idname, curr_bif); + + if (!rbt_insert (&bifo_rbt, buf)) + { + (*diag) ("internal error inserting '%s' in bifo_rbt\n", buf); + return PC_PARSEFAIL; + } + + /* Now the pattern name. */ + consume_whitespace (); + bifs[curr_bif].patname = match_identifier (); + if (!bifs[curr_bif].patname) + { + (*diag) ("missing pattern name at column %d.\n", pos + 1); + return PC_PARSEFAIL; + } + +#ifdef DEBUG + (*diag) ("pattern name is '%s'.\n", bifs[curr_bif].patname); +#endif + + /* Process attributes. 
*/ + return parse_bif_attrs (&bifs[curr_bif].attrs); +} + +/* Parse one stanza of the input BIF file. linebuf already contains the + first line to parse. */ +static parse_codes +parse_bif_stanza (void) +{ + /* Parse the stanza header. */ + pos = 0; + consume_whitespace (); + + if (linebuf[pos] != '[') + { + (*diag) ("ill-formed stanza header at column %d.\n", pos + 1); + return PC_PARSEFAIL; + } + safe_inc_pos (); + + const char *stanza_name = match_to_right_bracket (); + if (!stanza_name) + { + (*diag) ("no expression found in stanza header.\n"); + return PC_PARSEFAIL; + } + + curr_bif_stanza = stanza_name_to_stanza (stanza_name); + + if (linebuf[pos] != ']') + { + (*diag) ("ill-formed stanza header at column %d.\n", pos + 1); + return PC_PARSEFAIL; + } + safe_inc_pos (); + + consume_whitespace (); + if (linebuf[pos] != '\n' && pos != LINELEN - 1) + { + (*diag) ("garbage after stanza header.\n"); + return PC_PARSEFAIL; + } + + parse_codes result = PC_OK; + + while (result != PC_EOSTANZA) + { + if (!advance_line (bif_file)) + return PC_EOFILE; + result = parse_bif_entry (); + if (result == PC_PARSEFAIL) + return PC_PARSEFAIL; + } + + return PC_OK; +} + /* Parse the built-in file. */ static parse_codes parse_bif (void) { - return PC_OK; + parse_codes result; + diag = &bif_diag; + if (!advance_line (bif_file)) + return PC_OK; + + do + result = parse_bif_stanza (); + while (result == PC_OK); + + if (result == PC_EOFILE) + return PC_OK; + return result; +} + +/* Callback function for create_bif_order. */ +void set_bif_order (char *str) +{ + int num = 0; + char *colon = strchr (str, ':'); + sscanf (++colon, "%d", &num); + bif_order[bif_index++] = num; } /* Create a mapping from function IDs in their final order to the order @@ -901,6 +1276,8 @@ parse_bif (void) static void create_bif_order (void) { + bif_order = (int *) malloc ((curr_bif + 1) * sizeof (int)); + rbt_inorder_callback (&bifo_rbt, bifo_rbt.rbt_root, set_bif_order); } /* Parse the overload file. */ -- cgit v1.1 From 3c51b62bb8f052a610c3de2a8a0892ee3c3945ad Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Wed, 21 Jul 2021 08:32:32 -0400 Subject: rs6000: Parsing built-in input file, part 2 of 3 2021-07-21 Bill Schmidt gcc/ * config/rs6000/rs6000-gen-builtins.c (parse_args): New function. (parse_prototype): Implement. --- gcc/config/rs6000/rs6000-gen-builtins.c | 145 ++++++++++++++++++++++++++++++++ 1 file changed, 145 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-gen-builtins.c b/gcc/config/rs6000/rs6000-gen-builtins.c index b066ece..ee32a0d 100644 --- a/gcc/config/rs6000/rs6000-gen-builtins.c +++ b/gcc/config/rs6000/rs6000-gen-builtins.c @@ -1072,6 +1072,93 @@ match_type (typeinfo *typedata, int voidok) return 1; } +/* Parse the argument list. */ +static parse_codes +parse_args (prototype *protoptr) +{ + typelist **argptr = &protoptr->args; + int *nargs = &protoptr->nargs; + int *restr_opnd = protoptr->restr_opnd; + restriction *restr = protoptr->restr; + char **val1 = protoptr->restr_val1; + char **val2 = protoptr->restr_val2; + int restr_cnt = 0; + + int success; + *nargs = 0; + + /* Start the argument list. 
*/ + consume_whitespace (); + if (linebuf[pos] != '(') + { + (*diag) ("missing '(' at column %d.\n", pos + 1); + return PC_PARSEFAIL; + } + safe_inc_pos (); + + do { + consume_whitespace (); + int oldpos = pos; + typelist *argentry = (typelist *) malloc (sizeof (typelist)); + memset (argentry, 0, sizeof *argentry); + typeinfo *argtype = &argentry->info; + success = match_type (argtype, VOID_NOTOK); + if (success) + { + if (argtype->restr) + { + if (restr_cnt >= MAXRESTROPNDS) + { + (*diag) ("More than two %d operands\n", MAXRESTROPNDS); + return PC_PARSEFAIL; + } + restr_opnd[restr_cnt] = *nargs + 1; + restr[restr_cnt] = argtype->restr; + val1[restr_cnt] = argtype->val1; + val2[restr_cnt] = argtype->val2; + restr_cnt++; + } + (*nargs)++; + *argptr = argentry; + argptr = &argentry->next; + consume_whitespace (); + if (linebuf[pos] == ',') + safe_inc_pos (); + else if (linebuf[pos] != ')') + { + (*diag) ("arg not followed by ',' or ')' at column %d.\n", + pos + 1); + return PC_PARSEFAIL; + } + +#ifdef DEBUG + (*diag) ("argument type: isvoid = %d, isconst = %d, isvector = %d, " + "issigned = %d, isunsigned = %d, isbool = %d, ispixel = %d, " + "ispointer = %d, base = %d, restr = %d, val1 = \"%s\", " + "val2 = \"%s\", pos = %d.\n", + argtype->isvoid, argtype->isconst, argtype->isvector, + argtype->issigned, argtype->isunsigned, argtype->isbool, + argtype->ispixel, argtype->ispointer, argtype->base, + argtype->restr, argtype->val1, argtype->val2, pos + 1); +#endif + } + else + { + free (argentry); + *argptr = NULL; + pos = oldpos; + if (linebuf[pos] != ')') + { + (*diag) ("badly terminated arg list at column %d.\n", pos + 1); + return PC_PARSEFAIL; + } + safe_inc_pos (); + } + } while (success); + + return PC_OK; +} + /* Parse the attribute list. */ static parse_codes parse_bif_attrs (attrinfo *attrptr) @@ -1084,6 +1171,64 @@ parse_bif_attrs (attrinfo *attrptr) static parse_codes parse_prototype (prototype *protoptr) { + typeinfo *ret_type = &protoptr->rettype; + char **bifname = &protoptr->bifname; + + /* Get the return type. */ + consume_whitespace (); + int oldpos = pos; + int success = match_type (ret_type, VOID_OK); + if (!success) + { + (*diag) ("missing or badly formed return type at column %d.\n", + oldpos + 1); + return PC_PARSEFAIL; + } + +#ifdef DEBUG + (*diag) ("return type: isvoid = %d, isconst = %d, isvector = %d, " + "issigned = %d, isunsigned = %d, isbool = %d, ispixel = %d, " + "ispointer = %d, base = %d, restr = %d, val1 = \"%s\", " + "val2 = \"%s\", pos = %d.\n", + ret_type->isvoid, ret_type->isconst, ret_type->isvector, + ret_type->issigned, ret_type->isunsigned, ret_type->isbool, + ret_type->ispixel, ret_type->ispointer, ret_type->base, + ret_type->restr, ret_type->val1, ret_type->val2, pos + 1); +#endif + + /* Get the bif name. */ + consume_whitespace (); + oldpos = pos; + *bifname = match_identifier (); + if (!*bifname) + { + (*diag) ("missing function name at column %d.\n", oldpos + 1); + return PC_PARSEFAIL; + } + +#ifdef DEBUG + (*diag) ("function name is '%s'.\n", *bifname); +#endif + + /* Process arguments. */ + if (parse_args (protoptr) == PC_PARSEFAIL) + return PC_PARSEFAIL; + + /* Process terminating semicolon. 
*/ + consume_whitespace (); + if (linebuf[pos] != ';') + { + (*diag) ("missing semicolon at column %d.\n", pos + 1); + return PC_PARSEFAIL; + } + safe_inc_pos (); + consume_whitespace (); + if (linebuf[pos] != '\n') + { + (*diag) ("garbage at end of line at column %d.\n", pos + 1); + return PC_PARSEFAIL; + } + return PC_OK; } -- cgit v1.1 From 63c334f286e764349f2961c6a4c2492ec0394d01 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Wed, 21 Jul 2021 08:36:25 -0400 Subject: rs6000: Parsing built-in input file, part 3 of 3 2021-07-21 Bill Schmidt gcc/ * config/rs6000/rs6000-gen-builtins.c (parse_bif_attrs): Implement. --- gcc/config/rs6000/rs6000-gen-builtins.c | 102 ++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-gen-builtins.c b/gcc/config/rs6000/rs6000-gen-builtins.c index ee32a0d..6030e13 100644 --- a/gcc/config/rs6000/rs6000-gen-builtins.c +++ b/gcc/config/rs6000/rs6000-gen-builtins.c @@ -1163,6 +1163,108 @@ parse_args (prototype *protoptr) static parse_codes parse_bif_attrs (attrinfo *attrptr) { + consume_whitespace (); + if (linebuf[pos] != '{') + { + (*diag) ("missing attribute set at column %d.\n", pos + 1); + return PC_PARSEFAIL; + } + safe_inc_pos (); + + memset (attrptr, 0, sizeof *attrptr); + char *attrname = NULL; + + do { + consume_whitespace (); + int oldpos = pos; + attrname = match_identifier (); + if (attrname) + { + if (!strcmp (attrname, "init")) + attrptr->isinit = 1; + else if (!strcmp (attrname, "set")) + attrptr->isset = 1; + else if (!strcmp (attrname, "extract")) + attrptr->isextract = 1; + else if (!strcmp (attrname, "nosoft")) + attrptr->isnosoft = 1; + else if (!strcmp (attrname, "ldvec")) + attrptr->isldvec = 1; + else if (!strcmp (attrname, "stvec")) + attrptr->isstvec = 1; + else if (!strcmp (attrname, "reve")) + attrptr->isreve = 1; + else if (!strcmp (attrname, "pred")) + attrptr->ispred = 1; + else if (!strcmp (attrname, "htm")) + attrptr->ishtm = 1; + else if (!strcmp (attrname, "htmspr")) + attrptr->ishtmspr = 1; + else if (!strcmp (attrname, "htmcr")) + attrptr->ishtmcr = 1; + else if (!strcmp (attrname, "mma")) + attrptr->ismma = 1; + else if (!strcmp (attrname, "quad")) + attrptr->isquad = 1; + else if (!strcmp (attrname, "pair")) + attrptr->ispair = 1; + else if (!strcmp (attrname, "no32bit")) + attrptr->isno32bit = 1; + else if (!strcmp (attrname, "32bit")) + attrptr->is32bit = 1; + else if (!strcmp (attrname, "cpu")) + attrptr->iscpu = 1; + else if (!strcmp (attrname, "ldstmask")) + attrptr->isldstmask = 1; + else if (!strcmp (attrname, "lxvrse")) + attrptr->islxvrse = 1; + else if (!strcmp (attrname, "lxvrze")) + attrptr->islxvrze = 1; + else if (!strcmp (attrname, "endian")) + attrptr->isendian = 1; + else + { + (*diag) ("unknown attribute at column %d.\n", oldpos + 1); + return PC_PARSEFAIL; + } + + consume_whitespace (); + if (linebuf[pos] == ',') + safe_inc_pos (); + else if (linebuf[pos] != '}') + { + (*diag) ("arg not followed by ',' or '}' at column %d.\n", + pos + 1); + return PC_PARSEFAIL; + } + } + else + { + pos = oldpos; + if (linebuf[pos] != '}') + { + (*diag) ("badly terminated attr set at column %d.\n", pos + 1); + return PC_PARSEFAIL; + } + safe_inc_pos (); + } + } while (attrname); + +#ifdef DEBUG + (*diag) ("attribute set: init = %d, set = %d, extract = %d, nosoft = %d, " + "ldvec = %d, stvec = %d, reve = %d, pred = %d, htm = %d, " + "htmspr = %d, htmcr = %d, mma = %d, quad = %d, pair = %d, " + "no32bit = %d, 32bit = %d, cpu = %d, ldstmask = %d, 
lxvrse = %d, " + "lxvrze = %d, endian = %d.\n", + attrptr->isinit, attrptr->isset, attrptr->isextract, + attrptr->isnosoft, attrptr->isldvec, attrptr->isstvec, + attrptr->isreve, attrptr->ispred, attrptr->ishtm, attrptr->ishtmspr, + attrptr->ishtmcr, attrptr->ismma, attrptr->isquad, attrptr->ispair, + attrptr->isno32bit, attrptr->is32bit, attrptr->iscpu, + attrptr->isldstmask, attrptr->islxvrse, attrptr->islxvrze, + attrptr->isendian); +#endif + return PC_OK; } -- cgit v1.1 From 582b56dfd023077e9210a3adce478dd73d96d340 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Wed, 21 Jul 2021 08:39:37 -0400 Subject: rs6000: Parsing of overload input file 2021-06-07 Bill Schmidt gcc/ * config/rs6000/rs6000-gen-builtins.c (ovld_stanza): New struct. (MAXOVLDSTANZAS): New macro. (ovld_stanzas): New variable. (curr_ovld_stanza): Likewise. (MAXOVLDS): New macro. (ovlddata): New struct. (ovlds): New variable. (curr_ovld): Likewise. (max_ovld_args): Likewise. (parse_ovld_entry): New function. (parse_ovld_stanza): Likewise. (parse_ovld): Implement. --- gcc/config/rs6000/rs6000-gen-builtins.c | 235 +++++++++++++++++++++++++++++++- 1 file changed, 234 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-gen-builtins.c b/gcc/config/rs6000/rs6000-gen-builtins.c index 6030e13..ddeb39e 100644 --- a/gcc/config/rs6000/rs6000-gen-builtins.c +++ b/gcc/config/rs6000/rs6000-gen-builtins.c @@ -415,8 +415,35 @@ static int curr_bif; static int *bif_order; static int bif_index = 0; +/* Stanzas are groupings of built-in functions and overloads by some + common feature/attribute. These definitions are for overload stanzas. */ +struct ovld_stanza +{ + char *stanza_id; + char *extern_name; + char *intern_name; + char *ifdef; +}; + +#define MAXOVLDSTANZAS 512 +static ovld_stanza ovld_stanzas[MAXOVLDSTANZAS]; static int num_ovld_stanzas; +static int curr_ovld_stanza; + +#define MAXOVLDS 16384 +struct ovlddata +{ + int stanza; + prototype proto; + char *bif_id_name; + char *ovld_id_name; + char *fndecl; +}; + +static ovlddata ovlds[MAXOVLDS]; static int num_ovlds; +static int curr_ovld; +static int max_ovld_args = 0; /* Return codes for parsing routines. */ enum parse_codes @@ -1527,11 +1554,217 @@ create_bif_order (void) rbt_inorder_callback (&bifo_rbt, bifo_rbt.rbt_root, set_bif_order); } +/* Parse one two-line entry in the overload file. */ +static parse_codes +parse_ovld_entry (void) +{ + /* Check for end of stanza. */ + pos = 0; + consume_whitespace (); + if (linebuf[pos] == '[') + return PC_EOSTANZA; + + /* Allocate an entry in the overload table. */ + if (num_ovlds >= MAXOVLDS - 1) + { + (*diag) ("too many overloads.\n"); + return PC_PARSEFAIL; + } + + curr_ovld = num_ovlds++; + ovlds[curr_ovld].stanza = curr_ovld_stanza; + + if (parse_prototype (&ovlds[curr_ovld].proto) == PC_PARSEFAIL) + return PC_PARSEFAIL; + + if (ovlds[curr_ovld].proto.nargs > max_ovld_args) + max_ovld_args = ovlds[curr_ovld].proto.nargs; + + /* Now process line 2, which just contains the builtin id and an + optional overload id. */ + if (!advance_line (ovld_file)) + { + (*diag) ("unexpected EOF.\n"); + return PC_EOFILE; + } + + pos = 0; + consume_whitespace (); + int oldpos = pos; + char *id = match_identifier (); + ovlds[curr_ovld].bif_id_name = id; + ovlds[curr_ovld].ovld_id_name = id; + if (!id) + { + (*diag) ("missing overload id at column %d.\n", pos + 1); + return PC_PARSEFAIL; + } + +#ifdef DEBUG + (*diag) ("ID name is '%s'.\n", id); +#endif + + /* The builtin id has to match one from the bif file. 
*/ + if (!rbt_find (&bif_rbt, id)) + { + (*diag) ("builtin ID '%s' not found in bif file.\n", id); + return PC_PARSEFAIL; + } + + /* Check for an optional overload id. Usually we use the builtin + function id for that purpose, but sometimes we need multiple + overload entries for the same builtin id, and it needs to be unique. */ + consume_whitespace (); + if (linebuf[pos] != '\n') + { + id = match_identifier (); + ovlds[curr_ovld].ovld_id_name = id; + consume_whitespace (); + } + + /* Save the overload ID in a lookup structure. */ + if (!rbt_insert (&ovld_rbt, id)) + { + (*diag) ("duplicate overload ID '%s' at column %d.\n", id, oldpos + 1); + return PC_PARSEFAIL; + } + + if (linebuf[pos] != '\n') + { + (*diag) ("garbage at end of line at column %d.\n", pos + 1); + return PC_PARSEFAIL; + } + return PC_OK; +} + +/* Parse one stanza of the input overload file. linebuf already contains the + first line to parse. */ +static parse_codes +parse_ovld_stanza (void) +{ + /* Parse the stanza header. */ + pos = 0; + consume_whitespace (); + + if (linebuf[pos] != '[') + { + (*diag) ("ill-formed stanza header at column %d.\n", pos + 1); + return PC_PARSEFAIL; + } + safe_inc_pos (); + + char *stanza_name = match_identifier (); + if (!stanza_name) + { + (*diag) ("no identifier found in stanza header.\n"); + return PC_PARSEFAIL; + } + + /* Add the identifier to a table and set the number to be recorded + with subsequent overload entries. */ + if (num_ovld_stanzas >= MAXOVLDSTANZAS) + { + (*diag) ("too many stanza headers.\n"); + return PC_PARSEFAIL; + } + + curr_ovld_stanza = num_ovld_stanzas++; + ovld_stanza *stanza = &ovld_stanzas[curr_ovld_stanza]; + stanza->stanza_id = stanza_name; + + consume_whitespace (); + if (linebuf[pos] != ',') + { + (*diag) ("missing comma at column %d.\n", pos + 1); + return PC_PARSEFAIL; + } + safe_inc_pos (); + + consume_whitespace (); + stanza->extern_name = match_identifier (); + if (!stanza->extern_name) + { + (*diag) ("missing external name at column %d.\n", pos + 1); + return PC_PARSEFAIL; + } + + consume_whitespace (); + if (linebuf[pos] != ',') + { + (*diag) ("missing comma at column %d.\n", pos + 1); + return PC_PARSEFAIL; + } + safe_inc_pos (); + + consume_whitespace (); + stanza->intern_name = match_identifier (); + if (!stanza->intern_name) + { + (*diag) ("missing internal name at column %d.\n", pos + 1); + return PC_PARSEFAIL; + } + + consume_whitespace (); + if (linebuf[pos] == ',') + { + safe_inc_pos (); + consume_whitespace (); + stanza->ifdef = match_identifier (); + if (!stanza->ifdef) + { + (*diag) ("missing ifdef token at column %d.\n", pos + 1); + return PC_PARSEFAIL; + } + consume_whitespace (); + } + else + stanza->ifdef = 0; + + if (linebuf[pos] != ']') + { + (*diag) ("ill-formed stanza header at column %d.\n", pos + 1); + return PC_PARSEFAIL; + } + safe_inc_pos (); + + consume_whitespace (); + if (linebuf[pos] != '\n' && pos != LINELEN - 1) + { + (*diag) ("garbage after stanza header.\n"); + return PC_PARSEFAIL; + } + + parse_codes result = PC_OK; + + while (result != PC_EOSTANZA) + { + if (!advance_line (ovld_file)) + return PC_EOFILE; + + result = parse_ovld_entry (); + if (result == PC_EOFILE || result == PC_PARSEFAIL) + return result; + } + + return PC_OK; +} + /* Parse the overload file. 
*/ static parse_codes parse_ovld (void) { - return PC_OK; + parse_codes result = PC_OK; + diag = &ovld_diag; + + if (!advance_line (ovld_file)) + return PC_OK; + + while (result == PC_OK) + result = parse_ovld_stanza (); + + if (result == PC_EOFILE) + return PC_OK; + return result; } /* Write everything to the header file (rs6000-builtins.h). Return -- cgit v1.1 From 04ef43c7b35e006559781f758a81c207e6d54a15 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Wed, 21 Jul 2021 08:45:36 -0400 Subject: rs6000: Build and store function type identifiers 2021-07-21 Bill Schmidt gcc/ * config/rs6000/rs6000-gen-builtins.c (complete_vector_type): New function. (complete_base_type): Likewise. (construct_fntype_id): Likewise. (parse_bif_entry): Call contruct_fntype_id. (parse_ovld_entry): Likewise. --- gcc/config/rs6000/rs6000-gen-builtins.c | 231 ++++++++++++++++++++++++++++++++ 1 file changed, 231 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-gen-builtins.c b/gcc/config/rs6000/rs6000-gen-builtins.c index ddeb39e..2ddebcb 100644 --- a/gcc/config/rs6000/rs6000-gen-builtins.c +++ b/gcc/config/rs6000/rs6000-gen-builtins.c @@ -1295,6 +1295,229 @@ parse_bif_attrs (attrinfo *attrptr) return PC_OK; } +/* Convert a vector type into a mode string. */ +static void +complete_vector_type (typeinfo *typeptr, char *buf, int *bufi) +{ + if (typeptr->isbool) + buf[(*bufi)++] = 'b'; + buf[(*bufi)++] = 'v'; + if (typeptr->ispixel) + { + memcpy (&buf[*bufi], "p8hi", 4); + *bufi += 4; + return; + } + switch (typeptr->base) + { + case BT_CHAR: + memcpy (&buf[*bufi], "16qi", 4); + *bufi += 4; + break; + case BT_SHORT: + memcpy (&buf[*bufi], "8hi", 3); + *bufi += 3; + break; + case BT_INT: + memcpy (&buf[*bufi], "4si", 3); + *bufi += 3; + break; + case BT_LONGLONG: + memcpy (&buf[*bufi], "2di", 3); + *bufi += 3; + break; + case BT_FLOAT: + memcpy (&buf[*bufi], "4sf", 3); + *bufi += 3; + break; + case BT_DOUBLE: + memcpy (&buf[*bufi], "2df", 3); + *bufi += 3; + break; + case BT_INT128: + memcpy (&buf[*bufi], "1ti", 3); + *bufi += 3; + break; + case BT_FLOAT128: + memcpy (&buf[*bufi], "1tf", 3); + *bufi += 3; + break; + case BT_VPAIR: + memcpy (&buf[*bufi], "1poi", 4); + *bufi += 4; + break; + case BT_VQUAD: + memcpy (&buf[*bufi], "1pxi", 4); + *bufi += 4; + break; + default: + (*diag) ("unhandled basetype %d.\n", typeptr->base); + exit (1); + } +} + +/* Convert a base type into a mode string. 
*/ +static void +complete_base_type (typeinfo *typeptr, char *buf, int *bufi) +{ + switch (typeptr->base) + { + case BT_CHAR: + memcpy (&buf[*bufi], "qi", 2); + break; + case BT_SHORT: + memcpy (&buf[*bufi], "hi", 2); + break; + case BT_INT: + memcpy (&buf[*bufi], "si", 2); + break; + case BT_LONG: + memcpy (&buf[*bufi], "lg", 2); + break; + case BT_LONGLONG: + memcpy (&buf[*bufi], "di", 2); + break; + case BT_FLOAT: + memcpy (&buf[*bufi], "sf", 2); + break; + case BT_DOUBLE: + memcpy (&buf[*bufi], "df", 2); + break; + case BT_LONGDOUBLE: + memcpy (&buf[*bufi], "ld", 2); + break; + case BT_INT128: + memcpy (&buf[*bufi], "ti", 2); + break; + case BT_FLOAT128: + memcpy (&buf[*bufi], "tf", 2); + break; + case BT_BOOL: + memcpy (&buf[*bufi], "bi", 2); + break; + case BT_STRING: + memcpy (&buf[*bufi], "st", 2); + break; + case BT_DECIMAL32: + memcpy (&buf[*bufi], "sd", 2); + break; + case BT_DECIMAL64: + memcpy (&buf[*bufi], "dd", 2); + break; + case BT_DECIMAL128: + memcpy (&buf[*bufi], "td", 2); + break; + case BT_IBM128: + memcpy (&buf[*bufi], "if", 2); + break; + default: + (*diag) ("unhandled basetype %d.\n", typeptr->base); + exit (1); + } + + *bufi += 2; +} + +/* Build a function type descriptor identifier from the return type + and argument types described by PROTOPTR, and store it if it does + not already exist. Return the identifier. */ +static char * +construct_fntype_id (prototype *protoptr) +{ + /* Determine the maximum space for a function type descriptor id. + Each type requires at most 9 characters (6 for the mode*, 1 for + the optional 'u' preceding the mode, 1 for the optional 'p' + preceding the mode, and 1 for an underscore following the mode). + We also need 5 characters for the string "ftype" that separates + the return mode from the argument modes. The last argument doesn't + need a trailing underscore, but we count that as the one trailing + "ftype" instead. For the special case of zero arguments, we need 9 + for the return type and 7 for "ftype_v". Finally, we need one + character for the terminating null. Thus for a function with N + arguments, we need at most 9N+15 characters for N>0, otherwise 17. + ---- + *Worst case is bv16qi for "vector bool char". */ + int len = protoptr->nargs ? 
(protoptr->nargs + 1) * 9 + 6 : 17; + char *buf = (char *) malloc (len); + int bufi = 0; + + if (protoptr->rettype.ispointer) + buf[bufi++] = 'p'; + + if (protoptr->rettype.isvoid) + buf[bufi++] = 'v'; + else + { + if (protoptr->rettype.isunsigned) + buf[bufi++] = 'u'; + if (protoptr->rettype.isvector) + complete_vector_type (&protoptr->rettype, buf, &bufi); + else + complete_base_type (&protoptr->rettype, buf, &bufi); + } + + memcpy (&buf[bufi], "_ftype", 6); + bufi += 6; + + if (!protoptr->nargs) + { + memcpy (&buf[bufi], "_v", 2); + bufi += 2; + } + else + { + typelist *argptr = protoptr->args; + for (int i = 0; i < protoptr->nargs; i++, argptr = argptr->next) + { + assert (argptr); + buf[bufi++] = '_'; + if (argptr->info.isconst + && argptr->info.base == BT_INT + && !argptr->info.ispointer) + { + buf[bufi++] = 'c'; + buf[bufi++] = 'i'; + continue; + } + if (argptr->info.ispointer) + { + if (argptr->info.isvoid) + { + if (argptr->info.isconst) + { + memcpy (&buf[bufi], "pcvoid", 6); + bufi += 6; + continue; + } + else + { + buf[bufi++] = 'p'; + buf[bufi++] = 'v'; + continue; + } + } + else + buf[bufi++] = 'p'; + } + + if (argptr->info.isunsigned) + buf[bufi++] = 'u'; + if (argptr->info.isvector) + complete_vector_type (&argptr->info, buf, &bufi); + else + complete_base_type (&argptr->info, buf, &bufi); + } + assert (!argptr); + } + + buf[bufi] = '\0'; + + /* Ignore return value, as duplicates are fine and expected here. */ + rbt_insert (&fntype_rbt, buf); + + return buf; +} + /* Parse a function prototype. This code is shared by the bif and overload file processing. */ static parse_codes @@ -1407,6 +1630,10 @@ parse_bif_entry (void) if (parse_prototype (&bifs[curr_bif].proto) == PC_PARSEFAIL) return PC_PARSEFAIL; + /* Build a function type descriptor identifier from the return type + and argument types, and store it if it does not already exist. */ + bifs[curr_bif].fndecl = construct_fntype_id (&bifs[curr_bif].proto); + /* Now process line 2. First up is the builtin id. */ if (!advance_line (bif_file)) { @@ -1580,6 +1807,10 @@ parse_ovld_entry (void) if (ovlds[curr_ovld].proto.nargs > max_ovld_args) max_ovld_args = ovlds[curr_ovld].proto.nargs; + /* Build a function type descriptor identifier from the return type + and argument types, and store it if it does not already exist. */ + ovlds[curr_ovld].fndecl = construct_fntype_id (&ovlds[curr_ovld].proto); + /* Now process line 2, which just contains the builtin id and an optional overload id. */ if (!advance_line (ovld_file)) -- cgit v1.1 From 86e5e4c93716b84a49a2aba2b52649b366a77b95 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Wed, 21 Jul 2021 08:47:49 -0400 Subject: rs6000: Write output to the builtin definition include file 2021-06-07 Bill Schmidt gcc/ * config/rs6000/rs6000-gen-builtins.c (write_defines_file): Implement. 
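For orientation before the hunk below: write_defines_file only emits a small include file. With a hypothetical overload stanza whose external name is vec_splats, internal name __builtin_vec_splats, and no ifdef guard, the generated rs6000-vecdefines.h would look roughly like this (a sketch inferred from the fprintf calls, not actual generator output):

#ifndef _RS6000_VECDEFINES_H
#define _RS6000_VECDEFINES_H 1

#if defined(_ARCH_PPC64) && defined (_ARCH_PWR9)
  #define _ARCH_PPC64_PWR9 1
#endif

#define vec_splats __builtin_vec_splats

#endif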
--- gcc/config/rs6000/rs6000-gen-builtins.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-gen-builtins.c b/gcc/config/rs6000/rs6000-gen-builtins.c index 2ddebcb..0f6fd0c 100644 --- a/gcc/config/rs6000/rs6000-gen-builtins.c +++ b/gcc/config/rs6000/rs6000-gen-builtins.c @@ -2019,6 +2019,23 @@ write_init_file (void) static int write_defines_file (void) { + fprintf (defines_file, "#ifndef _RS6000_VECDEFINES_H\n"); + fprintf (defines_file, "#define _RS6000_VECDEFINES_H 1\n\n"); + fprintf (defines_file, "#if defined(_ARCH_PPC64) && defined (_ARCH_PWR9)\n"); + fprintf (defines_file, " #define _ARCH_PPC64_PWR9 1\n"); + fprintf (defines_file, "#endif\n\n"); + for (int i = 0; i < num_ovld_stanzas; i++) + if (strcmp (ovld_stanzas[i].extern_name, "SKIP")) + { + if (ovld_stanzas[i].ifdef) + fprintf (defines_file, "#ifdef %s\n", ovld_stanzas[i].ifdef); + fprintf (defines_file, "#define %s %s\n", + ovld_stanzas[i].extern_name, + ovld_stanzas[i].intern_name); + if (ovld_stanzas[i].ifdef) + fprintf (defines_file, "#endif\n"); + } + fprintf (defines_file, "\n#endif\n"); return 1; } -- cgit v1.1 From 89c0330163f94043b65a0be4d8b29b2f695efc8a Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Wed, 21 Jul 2021 08:57:06 -0400 Subject: rs6000: Write output to the builtins header file 2021-07-21 Bill Schmidt gcc/ * config/rs6000/rs6000-gen-builtins.c (write_autogenerated_header): New function. (write_decls): Likewise. (write_extern_fntype): New callback function. (write_header_file): Implement. --- gcc/config/rs6000/rs6000-gen-builtins.c | 228 ++++++++++++++++++++++++++++++++ 1 file changed, 228 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-gen-builtins.c b/gcc/config/rs6000/rs6000-gen-builtins.c index 0f6fd0c..1b3a114 100644 --- a/gcc/config/rs6000/rs6000-gen-builtins.c +++ b/gcc/config/rs6000/rs6000-gen-builtins.c @@ -1998,11 +1998,239 @@ parse_ovld (void) return result; } +/* Write a comment at the top of FILE about how the code was generated. */ +static void +write_autogenerated_header (FILE *file) +{ + fprintf (file, "/* Automatically generated by the program '%s'\n", + pgm_path); + fprintf (file, " from the files '%s' and '%s'. */\n\n", + bif_path, ovld_path); +} + +/* Write declarations into the header file. 
*/ +static void +write_decls (void) +{ + fprintf (header_file, "enum rs6000_gen_builtins\n{\n RS6000_BIF_NONE,\n"); + for (int i = 0; i <= curr_bif; i++) + fprintf (header_file, " RS6000_BIF_%s,\n", bifs[bif_order[i]].idname); + fprintf (header_file, " RS6000_BIF_MAX,\n"); + fprintf (header_file, " RS6000_OVLD_NONE,\n"); + for (int i = 0; i < num_ovld_stanzas; i++) + fprintf (header_file, " RS6000_OVLD_%s,\n", ovld_stanzas[i].stanza_id); + fprintf (header_file, " RS6000_OVLD_MAX\n};\n\n"); + + fprintf (header_file, + "extern GTY(()) tree rs6000_builtin_decls_x[RS6000_OVLD_MAX];\n\n"); + + fprintf (header_file, + "enum rs6000_ovld_instances\n{\n RS6000_INST_NONE,\n"); + for (int i = 0; i <= curr_ovld; i++) + fprintf (header_file, " RS6000_INST_%s,\n", ovlds[i].ovld_id_name); + fprintf (header_file, " RS6000_INST_MAX\n};\n\n"); + + fprintf (header_file, "#define MAX_OVLD_ARGS %d\n", max_ovld_args); + + fprintf (header_file, "enum restriction {\n"); + fprintf (header_file, " RES_NONE,\n"); + fprintf (header_file, " RES_BITS,\n"); + fprintf (header_file, " RES_RANGE,\n"); + fprintf (header_file, " RES_VAR_RANGE,\n"); + fprintf (header_file, " RES_VALUES\n"); + fprintf (header_file, "};\n\n"); + + fprintf (header_file, "enum bif_enable {\n"); + fprintf (header_file, " ENB_ALWAYS,\n"); + fprintf (header_file, " ENB_P5,\n"); + fprintf (header_file, " ENB_P6,\n"); + fprintf (header_file, " ENB_ALTIVEC,\n"); + fprintf (header_file, " ENB_CELL,\n"); + fprintf (header_file, " ENB_VSX,\n"); + fprintf (header_file, " ENB_P7,\n"); + fprintf (header_file, " ENB_P7_64,\n"); + fprintf (header_file, " ENB_P8,\n"); + fprintf (header_file, " ENB_P8V,\n"); + fprintf (header_file, " ENB_P9,\n"); + fprintf (header_file, " ENB_P9_64,\n"); + fprintf (header_file, " ENB_P9V,\n"); + fprintf (header_file, " ENB_IEEE128_HW,\n"); + fprintf (header_file, " ENB_DFP,\n"); + fprintf (header_file, " ENB_CRYPTO,\n"); + fprintf (header_file, " ENB_HTM,\n"); + fprintf (header_file, " ENB_P10,\n"); + fprintf (header_file, " ENB_P10_64,\n"); + fprintf (header_file, " ENB_MMA\n"); + fprintf (header_file, "};\n\n"); + + fprintf (header_file, "#define PPC_MAXRESTROPNDS 3\n"); + fprintf (header_file, "struct GTY((user)) bifdata\n"); + fprintf (header_file, "{\n"); + fprintf (header_file, " const char *bifname;\n"); + fprintf (header_file, " bif_enable enable;\n"); + fprintf (header_file, " tree fntype;\n"); + fprintf (header_file, " insn_code icode;\n"); + fprintf (header_file, " int nargs;\n"); + fprintf (header_file, " int bifattrs;\n"); + fprintf (header_file, " int restr_opnd[PPC_MAXRESTROPNDS];\n"); + fprintf (header_file, " restriction restr[PPC_MAXRESTROPNDS];\n"); + fprintf (header_file, " int restr_val1[PPC_MAXRESTROPNDS];\n"); + fprintf (header_file, " int restr_val2[PPC_MAXRESTROPNDS];\n"); + fprintf (header_file, " const char *attr_string;\n"); + fprintf (header_file, " rs6000_gen_builtins assoc_bif;\n"); + fprintf (header_file, "};\n\n"); + + fprintf (header_file, "#define bif_init_bit\t\t(0x00000001)\n"); + fprintf (header_file, "#define bif_set_bit\t\t(0x00000002)\n"); + fprintf (header_file, "#define bif_extract_bit\t\t(0x00000004)\n"); + fprintf (header_file, "#define bif_nosoft_bit\t\t(0x00000008)\n"); + fprintf (header_file, "#define bif_ldvec_bit\t\t(0x00000010)\n"); + fprintf (header_file, "#define bif_stvec_bit\t\t(0x00000020)\n"); + fprintf (header_file, "#define bif_reve_bit\t\t(0x00000040)\n"); + fprintf (header_file, "#define bif_pred_bit\t\t(0x00000080)\n"); + fprintf (header_file, "#define 
bif_htm_bit\t\t(0x00000100)\n"); + fprintf (header_file, "#define bif_htmspr_bit\t\t(0x00000200)\n"); + fprintf (header_file, "#define bif_htmcr_bit\t\t(0x00000400)\n"); + fprintf (header_file, "#define bif_mma_bit\t\t(0x00000800)\n"); + fprintf (header_file, "#define bif_quad_bit\t\t(0x00001000)\n"); + fprintf (header_file, "#define bif_pair_bit\t\t(0x00002000)\n"); + fprintf (header_file, "#define bif_no32bit_bit\t\t(0x00004000)\n"); + fprintf (header_file, "#define bif_32bit_bit\t\t(0x00008000)\n"); + fprintf (header_file, "#define bif_cpu_bit\t\t(0x00010000)\n"); + fprintf (header_file, "#define bif_ldstmask_bit\t(0x00020000)\n"); + fprintf (header_file, "#define bif_lxvrse_bit\t\t(0x00040000)\n"); + fprintf (header_file, "#define bif_lxvrze_bit\t\t(0x00080000)\n"); + fprintf (header_file, "#define bif_endian_bit\t\t(0x00100000)\n"); + fprintf (header_file, "\n"); + fprintf (header_file, + "#define bif_is_init(x)\t\t((x).bifattrs & bif_init_bit)\n"); + fprintf (header_file, + "#define bif_is_set(x)\t\t((x).bifattrs & bif_set_bit)\n"); + fprintf (header_file, + "#define bif_is_extract(x)\t((x).bifattrs & bif_extract_bit)\n"); + fprintf (header_file, + "#define bif_is_nosoft(x)\t((x).bifattrs & bif_nosoft_bit)\n"); + fprintf (header_file, + "#define bif_is_ldvec(x)\t\t((x).bifattrs & bif_ldvec_bit)\n"); + fprintf (header_file, + "#define bif_is_stvec(x)\t\t((x).bifattrs & bif_stvec_bit)\n"); + fprintf (header_file, + "#define bif_is_reve(x)\t\t((x).bifattrs & bif_reve_bit)\n"); + fprintf (header_file, + "#define bif_is_predicate(x)\t((x).bifattrs & bif_pred_bit)\n"); + fprintf (header_file, + "#define bif_is_htm(x)\t\t((x).bifattrs & bif_htm_bit)\n"); + fprintf (header_file, + "#define bif_is_htmspr(x)\t((x).bifattrs & bif_htmspr_bit)\n"); + fprintf (header_file, + "#define bif_is_htmcr(x)\t\t((x).bifattrs & bif_htmcr_bit)\n"); + fprintf (header_file, + "#define bif_is_mma(x)\t\t((x).bifattrs & bif_mma_bit)\n"); + fprintf (header_file, + "#define bif_is_quad(x)\t\t((x).bifattrs & bif_quad_bit)\n"); + fprintf (header_file, + "#define bif_is_pair(x)\t\t((x).bifattrs & bif_pair_bit)\n"); + fprintf (header_file, + "#define bif_is_no32bit(x)\t((x).bifattrs & bif_no32bit_bit)\n"); + fprintf (header_file, + "#define bif_is_32bit(x)\t((x).bifattrs & bif_32bit_bit)\n"); + fprintf (header_file, + "#define bif_is_cpu(x)\t\t((x).bifattrs & bif_cpu_bit)\n"); + fprintf (header_file, + "#define bif_is_ldstmask(x)\t((x).bifattrs & bif_ldstmask_bit)\n"); + fprintf (header_file, + "#define bif_is_lxvrse(x)\t((x).bifattrs & bif_lxvrse_bit)\n"); + fprintf (header_file, + "#define bif_is_lxvrze(x)\t((x).bifattrs & bif_lxvrze_bit)\n"); + fprintf (header_file, + "#define bif_is_endian(x)\t((x).bifattrs & bif_endian_bit)\n"); + fprintf (header_file, "\n"); + + /* #### Note that the _x is added for now to avoid conflict with + the existing rs6000_builtin_info[] file while testing. It will + be removed as we progress. */ + /* #### Cannot mark this as a GC root because only pointer types can + be marked as GTY((user)) and be GC roots. All trees in here are + kept alive by other globals, so not a big deal. Alternatively, + we could change the enum fields to ints and cast them in and out + to avoid requiring a GTY((user)) designation, but that seems + unnecessarily gross. 
*/ + fprintf (header_file, + "extern bifdata rs6000_builtin_info_x[RS6000_BIF_MAX];\n\n"); + + fprintf (header_file, "struct GTY((user)) ovlddata\n"); + fprintf (header_file, "{\n"); + fprintf (header_file, " const char *bifname;\n"); + fprintf (header_file, " rs6000_gen_builtins bifid;\n"); + fprintf (header_file, " tree fntype;\n"); + fprintf (header_file, " ovlddata *next;\n"); + fprintf (header_file, "};\n\n"); + + fprintf (header_file, "struct ovldrecord\n"); + fprintf (header_file, "{\n"); + fprintf (header_file, " const char *ovld_name;\n"); + fprintf (header_file, " ovlddata *first_instance;\n"); + fprintf (header_file, "};\n\n"); + + fprintf (header_file, + "/* #### Cannot mark this as a GC root because only pointer\n" + " types can be marked as GTY((user)) and be GC roots. All\n" + " trees in here are kept alive by other globals, so not a big\n" + " deal. Alternatively, we could change the enum fields to ints\n" + " and cast them in and out to avoid requiring a GTY((user))\n" + " designation, but that seems unnecessarily gross. */\n"); + fprintf (header_file, + "extern ovlddata rs6000_instance_info[RS6000_INST_MAX];\n"); + fprintf (header_file, "extern ovldrecord rs6000_overload_info[];\n\n"); + + fprintf (header_file, "extern void rs6000_autoinit_builtins ();\n\n"); + fprintf (header_file, + "extern bool rs6000_new_builtin_is_supported_p " + "(rs6000_gen_builtins);\n"); + fprintf (header_file, + "extern tree rs6000_builtin_decl (unsigned, " + "bool ATTRIBUTE_UNUSED);\n\n"); + fprintf (header_file, + "extern void gt_ggc_mx (bifdata *bd);\n"); + fprintf (header_file, + "extern void gt_pch_nx (bifdata *bd);\n"); + fprintf (header_file, + "extern void gt_pch_nx (bifdata *bd, gt_pointer_operator op, " + "void *cookie);\n"); + fprintf (header_file, + "extern void gt_ggc_mx (ovlddata *od);\n"); + fprintf (header_file, + "extern void gt_pch_nx (ovlddata *od);\n"); + fprintf (header_file, + "extern void gt_pch_nx (ovlddata *od, gt_pointer_operator op, " + "void *cookie);\n"); +} + +/* Callback functions used for generating trees for function types. */ +void +write_extern_fntype (char *str) +{ + fprintf (header_file, "extern GTY(()) tree %s;\n", str); +} + /* Write everything to the header file (rs6000-builtins.h). Return 1 if successful, 0 otherwise. */ static int write_header_file (void) { + write_autogenerated_header (header_file); + + fprintf (header_file, "#ifndef _RS6000_BUILTINS_H\n"); + fprintf (header_file, "#define _RS6000_BUILTINS_H 1\n\n"); + fprintf (header_file, "extern int new_builtins_are_live;\n\n"); + + write_decls (); + + /* Write function type list declarators to the header file. */ + rbt_inorder_callback (&fntype_rbt, fntype_rbt.rbt_root, write_extern_fntype); + fprintf (header_file, "\n"); + fprintf (header_file, "\n#endif\n"); + return 1; } -- cgit v1.1 From ef9af12d24108340f9eafa572a5c3aebfff6df88 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Wed, 21 Jul 2021 09:16:55 -0400 Subject: rs6000: Write output to the builtins init file, part 1 of 3 2021-07-21 Bill Schmidt gcc/ * config/rs6000/rs6000-gen-builtins.c (write_fntype): New callback function. (write_fntype_init): New stub function. (write_init_bif_table): Likewise. (write_init_ovld_table): New function. (write_init_file): Implement. 
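The overall shape of the generated rs6000-builtins.c after this patch is
sketched below; the function type, instance, and stanza names are illustrative
placeholders, and the built-in table and type initializers are still stubs at
this point:

    /* Sketch of the generated rs6000-builtins.c; names are placeholders.  */
    #include "config.h"
    /* ...remaining #includes emitted by write_init_file...  */

    int new_builtins_are_live = 0;

    tree rs6000_builtin_decls_x[RS6000_OVLD_MAX];

    /* One "tree NAME;" per function type, emitted via write_fntype.  */
    tree uns_ftype_example;

    void
    rs6000_autoinit_builtins ()
    {
      tree t;
      /* Type initializers go here once write_fntype_init is implemented.  */

      rs6000_builtin_decls_x[RS6000_BIF_NONE] = NULL_TREE;
      rs6000_builtin_decls_x[RS6000_BIF_MAX] = NULL_TREE;
      rs6000_builtin_decls_x[RS6000_OVLD_NONE] = NULL_TREE;

      /* Built-in table entries go here once write_init_bif_table is
	 implemented.  Then, for each overload stanza, a guarded
	 add_builtin_function call plus entries like these:  */
      int base = RS6000_OVLD_NONE;

      rs6000_instance_info[RS6000_INST_EXAMPLE].fntype
	= uns_ftype_example;
      rs6000_overload_info[RS6000_OVLD_EXAMPLE - base].first_instance
	= &rs6000_instance_info[RS6000_INST_EXAMPLE];
    }

    /* ...followed by the gt_ggc_mx / gt_pch_nx walkers for bifdata and
       ovlddata.  */

The "_x" suffix and the new_builtins_are_live guard keep these generated
tables from clashing with the existing built-in machinery while both are
present.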
--- gcc/config/rs6000/rs6000-gen-builtins.c | 164 ++++++++++++++++++++++++++++++++ 1 file changed, 164 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-gen-builtins.c b/gcc/config/rs6000/rs6000-gen-builtins.c index 1b3a114..dd24369 100644 --- a/gcc/config/rs6000/rs6000-gen-builtins.c +++ b/gcc/config/rs6000/rs6000-gen-builtins.c @@ -2213,6 +2213,18 @@ write_extern_fntype (char *str) fprintf (header_file, "extern GTY(()) tree %s;\n", str); } +void +write_fntype (char *str) +{ + fprintf (init_file, "tree %s;\n", str); +} + +/* Write an initializer for a function type identified by STR. */ +void +write_fntype_init (char *str) +{ +} + /* Write everything to the header file (rs6000-builtins.h). Return 1 if successful, 0 otherwise. */ static int @@ -2234,11 +2246,163 @@ write_header_file (void) return 1; } +/* Write code to initialize the built-in function table. */ +static void +write_init_bif_table (void) +{ +} + +/* Write code to initialize the overload table. */ +static void +write_init_ovld_table (void) +{ + fprintf (init_file, " int base = RS6000_OVLD_NONE;\n\n"); + fprintf (init_file, + " /* The fndecl for an overload is arbitrarily the first one\n" + " for the overload. We sort out the real types when\n" + " processing the overload in the gcc front end. */\n"); + + for (int i = 0; i <= curr_ovld; i++) + { + fprintf (init_file, + " rs6000_instance_info[RS6000_INST_%s].fntype" + "\n = %s;\n", + ovlds[i].ovld_id_name, ovlds[i].fndecl); + + if (i == 0 || ovlds[i].stanza != ovlds[i-1].stanza) + { + ovld_stanza *stanza = &ovld_stanzas[ovlds[i].stanza]; + fprintf (init_file, "\n"); + + /* Check whether we have a "tf" token in this string, representing + a float128_type_node. It's possible that float128_type_node is + undefined (occurs for -maltivec -mno-vsx, for example), so we + must guard against that. */ + int tf_found = strstr (ovlds[i].fndecl, "tf") != NULL; + + /* Similarly, look for decimal float tokens. */ + int dfp_found = (strstr (ovlds[i].fndecl, "sd") != NULL + || strstr (ovlds[i].fndecl, "dd") != NULL + || strstr (ovlds[i].fndecl, "td") != NULL); + + fprintf (init_file, + " if (new_builtins_are_live)\n"); + fprintf (init_file, " {\n"); + + if (tf_found) + { + fprintf (init_file, " if (float128_type_node)\n"); + fprintf (init_file, " {\n"); + } + else if (dfp_found) + { + fprintf (init_file, " if (dfloat64_type_node)\n"); + fprintf (init_file, " {\n"); + } + + fprintf (init_file, + " rs6000_builtin_decls_x[(int)RS6000_OVLD_%s] = t\n", + stanza->stanza_id); + fprintf (init_file, + " = add_builtin_function (\"%s\",\n", + stanza->intern_name); + fprintf (init_file, + " %s,\n", + ovlds[i].fndecl); + fprintf (init_file, + " (int)RS6000_OVLD_%s," + " BUILT_IN_MD,\n", + stanza->stanza_id); + fprintf (init_file, + " NULL, NULL_TREE);\n"); + + if (tf_found || dfp_found) + fprintf (init_file, " }\n"); + + fprintf (init_file, " }\n\n"); + + fprintf (init_file, + " rs6000_overload_info[RS6000_OVLD_%s - base]" + ".first_instance\n", + stanza->stanza_id); + fprintf (init_file, + " = &rs6000_instance_info[RS6000_INST_%s];\n\n", + ovlds[i].ovld_id_name); + } + } +} + /* Write everything to the initialization file (rs6000-builtins.c). Return 1 if successful, 0 otherwise. 
*/ static int write_init_file (void) { + write_autogenerated_header (init_file); + + fprintf (init_file, "#include \"config.h\"\n"); + fprintf (init_file, "#include \"system.h\"\n"); + fprintf (init_file, "#include \"coretypes.h\"\n"); + fprintf (init_file, "#include \"backend.h\"\n"); + fprintf (init_file, "#include \"rtl.h\"\n"); + fprintf (init_file, "#include \"tree.h\"\n"); + fprintf (init_file, "#include \"langhooks.h\"\n"); + fprintf (init_file, "#include \"insn-codes.h\"\n"); + fprintf (init_file, "#include \"rs6000-builtins.h\"\n"); + fprintf (init_file, "\n"); + + fprintf (init_file, "int new_builtins_are_live = 0;\n\n"); + + fprintf (init_file, "tree rs6000_builtin_decls_x[RS6000_OVLD_MAX];\n\n"); + + rbt_inorder_callback (&fntype_rbt, fntype_rbt.rbt_root, write_fntype); + fprintf (init_file, "\n"); + + fprintf (init_file, "void\n"); + fprintf (init_file, "rs6000_autoinit_builtins ()\n"); + fprintf (init_file, "{\n"); + fprintf (init_file, " tree t;\n"); + rbt_inorder_callback (&fntype_rbt, fntype_rbt.rbt_root, write_fntype_init); + fprintf (init_file, "\n"); + + fprintf (init_file, + " rs6000_builtin_decls_x[RS6000_BIF_NONE] = NULL_TREE;\n"); + fprintf (init_file, + " rs6000_builtin_decls_x[RS6000_BIF_MAX] = NULL_TREE;\n"); + fprintf (init_file, + " rs6000_builtin_decls_x[RS6000_OVLD_NONE] = NULL_TREE;\n\n"); + + write_init_bif_table (); + write_init_ovld_table (); + + fprintf (init_file, "}\n\n"); + + fprintf (init_file, + "void gt_ggc_mx (bifdata *bd)\n"); + fprintf (init_file, + "{\n gt_ggc_mx (bd->fntype);\n}\n\n"); + fprintf (init_file, + "void gt_pch_nx (bifdata *bd)\n"); + fprintf (init_file, + "{\n gt_pch_nx (bd->fntype);\n}\n\n"); + fprintf (init_file, + "void gt_pch_nx (bifdata *bd, gt_pointer_operator op, " + "void *cookie)\n"); + fprintf (init_file, + "{\n op(&(bd->fntype), cookie);\n}\n\n"); + fprintf (init_file, + "void gt_ggc_mx (ovlddata *od)\n"); + fprintf (init_file, + "{\n gt_ggc_mx (od->fntype);\n}\n\n"); + fprintf (init_file, + "void gt_pch_nx (ovlddata *od)\n"); + fprintf (init_file, + "{\n gt_pch_nx (od->fntype);\n}\n\n"); + fprintf (init_file, + "void gt_pch_nx (ovlddata *od, gt_pointer_operator op, " + "void *cookie)\n"); + fprintf (init_file, + "{\n op(&(od->fntype), cookie);\n}\n"); + return 1; } -- cgit v1.1 From d3f5a1418b423ec8c12f05025997c4d5b6e417de Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Wed, 21 Jul 2021 09:19:46 -0400 Subject: rs6000: Write output to the builtins init file, part 2 of 3 2021-07-21 Bill Schmidt gcc/ * config/rs6000/rs6000-gen-builtins.c (write_init_bif_table): Implement. --- gcc/config/rs6000/rs6000-gen-builtins.c | 81 +++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-gen-builtins.c b/gcc/config/rs6000/rs6000-gen-builtins.c index dd24369..da0d14e 100644 --- a/gcc/config/rs6000/rs6000-gen-builtins.c +++ b/gcc/config/rs6000/rs6000-gen-builtins.c @@ -2250,6 +2250,87 @@ write_header_file (void) static void write_init_bif_table (void) { + for (int i = 0; i <= curr_bif; i++) + { + fprintf (init_file, + " rs6000_builtin_info_x[RS6000_BIF_%s].fntype" + "\n = %s;\n", + bifs[i].idname, bifs[i].fndecl); + + /* Check whether we have a "tf" token in this string, representing + a float128_type_node. It's possible that float128_type_node is + undefined (occurs for -maltivec -mno-vsx, for example), so we + must guard against that. */ + int tf_found = strstr (bifs[i].fndecl, "tf") != NULL; + + /* Similarly, look for decimal float tokens. 
*/ + int dfp_found = (strstr (bifs[i].fndecl, "sd") != NULL + || strstr (bifs[i].fndecl, "dd") != NULL + || strstr (bifs[i].fndecl, "td") != NULL); + + fprintf (init_file, + " if (new_builtins_are_live)\n"); + fprintf (init_file, " {\n"); + + if (tf_found) + { + fprintf (init_file, " if (float128_type_node)\n"); + fprintf (init_file, " {\n"); + } + else if (dfp_found) + { + fprintf (init_file, " if (dfloat64_type_node)\n"); + fprintf (init_file, " {\n"); + } + + fprintf (init_file, + " rs6000_builtin_decls_x[(int)RS6000_BIF_%s] = t\n", + bifs[i].idname); + fprintf (init_file, + " = add_builtin_function (\"%s\",\n", + bifs[i].proto.bifname); + fprintf (init_file, + " %s,\n", + bifs[i].fndecl); + fprintf (init_file, + " (int)RS6000_BIF_%s," + " BUILT_IN_MD,\n", + bifs[i].idname); + fprintf (init_file, + " NULL, NULL_TREE);\n"); + if (bifs[i].kind == FNK_CONST) + { + fprintf (init_file, " TREE_READONLY (t) = 1;\n"); + fprintf (init_file, " TREE_NOTHROW (t) = 1;\n"); + } + else if (bifs[i].kind == FNK_PURE) + { + fprintf (init_file, " DECL_PURE_P (t) = 1;\n"); + fprintf (init_file, " TREE_NOTHROW (t) = 1;\n"); + } + else if (bifs[i].kind == FNK_FPMATH) + { + fprintf (init_file, " TREE_NOTHROW (t) = 1;\n"); + fprintf (init_file, " if (flag_rounding_math)\n"); + fprintf (init_file, " {\n"); + fprintf (init_file, " DECL_PURE_P (t) = 1;\n"); + fprintf (init_file, " DECL_IS_NOVOPS (t) = 1;\n"); + fprintf (init_file, " }\n"); + fprintf (init_file, " else\n"); + fprintf (init_file, " TREE_READONLY (t) = 1;\n"); + } + + if (tf_found || dfp_found) + { + fprintf (init_file, " }\n"); + fprintf (init_file, " else\n"); + fprintf (init_file, " {\n"); + fprintf (init_file, " rs6000_builtin_decls_x" + "[(int)RS6000_BIF_%s] = NULL_TREE;\n", bifs[i].idname); + fprintf (init_file, " }\n"); + } + fprintf (init_file, " }\n\n"); + } } /* Write code to initialize the overload table. */ -- cgit v1.1 From a56c251898ea70b46798d7893a871bcfe318529b Mon Sep 17 00:00:00 2001 From: liuhongt Date: Tue, 20 Jul 2021 18:32:35 +0800 Subject: Support logic shift left/right for avx512 mask type. gcc/ChangeLog: * config/i386/constraints.md (Wb): New constraint. (Ww): Ditto. * config/i386/i386.md (*ashlhi3_1): Extend to avx512 mask shift. (*ashlqi3_1): Ditto. (*3_1): Split to .. (*ashr3_1): this, ... (*lshr3_1): and this, also extend this pattern to avx512 mask registers. (*3_1): Split to .. (*ashr3_1): this, ... (*lshrqi3_1): and this, also extend this pattern to avx512 mask registers. (*lshrhi3_1): And this, also extend this pattern to avx512 mask registers. * config/i386/sse.md (k): New define_split after it to convert generic shift pattern to mask shift ones. gcc/testsuite/ChangeLog: * gcc.target/i386/mask-shift.c: New test. --- gcc/config/i386/constraints.md | 10 +++ gcc/config/i386/i386.md | 162 +++++++++++++++++++++++++++++++++++------ gcc/config/i386/sse.md | 14 ++++ 3 files changed, 162 insertions(+), 24 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/constraints.md b/gcc/config/i386/constraints.md index 485e3f5..4aa28a5 100644 --- a/gcc/config/i386/constraints.md +++ b/gcc/config/i386/constraints.md @@ -222,6 +222,16 @@ (match_operand 0 "vector_all_ones_operand")))) ;; Integer constant constraints. +(define_constraint "Wb" + "Integer constant in the range 0 @dots{} 7, for 8-bit shifts." + (and (match_code "const_int") + (match_test "IN_RANGE (ival, 0, 7)"))) + +(define_constraint "Ww" + "Integer constant in the range 0 @dots{} 15, for 16-bit shifts." 
+ (and (match_code "const_int") + (match_test "IN_RANGE (ival, 0, 15)"))) + (define_constraint "I" "Integer constant in the range 0 @dots{} 31, for 32-bit shifts." (and (match_code "const_int") diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 8b809c4..44ae18e 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -1136,6 +1136,7 @@ ;; Immediate operand constraint for shifts. (define_mode_attr S [(QI "I") (HI "I") (SI "I") (DI "J") (TI "O")]) +(define_mode_attr KS [(QI "Wb") (HI "Ww") (SI "I") (DI "J")]) ;; Print register name in the specified mode. (define_mode_attr k [(QI "b") (HI "w") (SI "k") (DI "q")]) @@ -11088,9 +11089,9 @@ (set_attr "mode" "")]) (define_insn "*ashl3_1" - [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,r") - (ashift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "0,l,rm") - (match_operand:QI 2 "nonmemory_operand" "c,M,r"))) + [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,r,?k") + (ashift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "0,l,rm,k") + (match_operand:QI 2 "nonmemory_operand" "c,M,r,"))) (clobber (reg:CC FLAGS_REG))] "ix86_binary_operator_ok (ASHIFT, mode, operands)" { @@ -11098,6 +11099,7 @@ { case TYPE_LEA: case TYPE_ISHIFTX: + case TYPE_MSKLOG: return "#"; case TYPE_ALU: @@ -11113,7 +11115,7 @@ return "sal{}\t{%2, %0|%0, %2}"; } } - [(set_attr "isa" "*,*,bmi2") + [(set_attr "isa" "*,*,bmi2,avx512bw") (set (attr "type") (cond [(eq_attr "alternative" "1") (const_string "lea") @@ -11123,6 +11125,8 @@ (match_operand 0 "register_operand")) (match_operand 2 "const1_operand")) (const_string "alu") + (eq_attr "alternative" "3") + (const_string "msklog") ] (const_string "ishift"))) (set (attr "length_immediate") @@ -11218,15 +11222,16 @@ "operands[2] = gen_lowpart (SImode, operands[2]);") (define_insn "*ashlhi3_1" - [(set (match_operand:HI 0 "nonimmediate_operand" "=rm,Yp") - (ashift:HI (match_operand:HI 1 "nonimmediate_operand" "0,l") - (match_operand:QI 2 "nonmemory_operand" "cI,M"))) + [(set (match_operand:HI 0 "nonimmediate_operand" "=rm,Yp,?k") + (ashift:HI (match_operand:HI 1 "nonimmediate_operand" "0,l,k") + (match_operand:QI 2 "nonmemory_operand" "cI,M,Ww"))) (clobber (reg:CC FLAGS_REG))] "ix86_binary_operator_ok (ASHIFT, HImode, operands)" { switch (get_attr_type (insn)) { case TYPE_LEA: + case TYPE_MSKLOG: return "#"; case TYPE_ALU: @@ -11241,9 +11246,12 @@ return "sal{w}\t{%2, %0|%0, %2}"; } } - [(set (attr "type") + [(set_attr "isa" "*,*,avx512f") + (set (attr "type") (cond [(eq_attr "alternative" "1") (const_string "lea") + (eq_attr "alternative" "2") + (const_string "msklog") (and (and (match_test "TARGET_DOUBLE_WITH_ADD") (match_operand 0 "register_operand")) (match_operand 2 "const1_operand")) @@ -11259,18 +11267,19 @@ (match_test "optimize_function_for_size_p (cfun)"))))) (const_string "0") (const_string "*"))) - (set_attr "mode" "HI,SI")]) + (set_attr "mode" "HI,SI,HI")]) (define_insn "*ashlqi3_1" - [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r,Yp") - (ashift:QI (match_operand:QI 1 "nonimmediate_operand" "0,0,l") - (match_operand:QI 2 "nonmemory_operand" "cI,cI,M"))) + [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r,Yp,?k") + (ashift:QI (match_operand:QI 1 "nonimmediate_operand" "0,0,l,k") + (match_operand:QI 2 "nonmemory_operand" "cI,cI,M,Wb"))) (clobber (reg:CC FLAGS_REG))] "ix86_binary_operator_ok (ASHIFT, QImode, operands)" { switch (get_attr_type (insn)) { case TYPE_LEA: + case TYPE_MSKLOG: return "#"; case TYPE_ALU: @@ -11298,9 +11307,12 @@ } } } - 
[(set (attr "type") + [(set_attr "isa" "*,*,*,avx512dq") + (set (attr "type") (cond [(eq_attr "alternative" "2") (const_string "lea") + (eq_attr "alternative" "3") + (const_string "msklog") (and (and (match_test "TARGET_DOUBLE_WITH_ADD") (match_operand 0 "register_operand")) (match_operand 2 "const1_operand")) @@ -11316,7 +11328,7 @@ (match_test "optimize_function_for_size_p (cfun)"))))) (const_string "0") (const_string "*"))) - (set_attr "mode" "QI,SI,SI") + (set_attr "mode" "QI,SI,SI,QI") ;; Potential partial reg stall on alternative 1. (set (attr "preferred_for_speed") (cond [(eq_attr "alternative" "1") @@ -11818,13 +11830,13 @@ [(set_attr "type" "ishiftx") (set_attr "mode" "")]) -(define_insn "*3_1" +(define_insn "*ashr3_1" [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r") - (any_shiftrt:SWI48 + (ashiftrt:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "0,rm") (match_operand:QI 2 "nonmemory_operand" "c,r"))) (clobber (reg:CC FLAGS_REG))] - "ix86_binary_operator_ok (, mode, operands)" + "ix86_binary_operator_ok (ASHIFTRT, mode, operands)" { switch (get_attr_type (insn)) { @@ -11834,9 +11846,9 @@ default: if (operands[2] == const1_rtx && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))) - return "{}\t%0"; + return "sar{}\t%0"; else - return "{}\t{%2, %0|%0, %2}"; + return "sar{}\t{%2, %0|%0, %2}"; } } [(set_attr "isa" "*,bmi2") @@ -11850,6 +11862,40 @@ (const_string "*"))) (set_attr "mode" "")]) +(define_insn "*lshr3_1" + [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,?k") + (lshiftrt:SWI48 + (match_operand:SWI48 1 "nonimmediate_operand" "0,rm,k") + (match_operand:QI 2 "nonmemory_operand" "c,r,"))) + (clobber (reg:CC FLAGS_REG))] + "ix86_binary_operator_ok (LSHIFTRT, mode, operands)" +{ + switch (get_attr_type (insn)) + { + case TYPE_ISHIFTX: + case TYPE_MSKLOG: + return "#"; + + default: + if (operands[2] == const1_rtx + && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))) + return "shr{}\t%0"; + else + return "shr{}\t{%2, %0|%0, %2}"; + } +} + [(set_attr "isa" "*,bmi2,avx512bw") + (set_attr "type" "ishift,ishiftx,msklog") + (set (attr "length_immediate") + (if_then_else + (and (and (match_operand 2 "const1_operand") + (eq_attr "alternative" "0")) + (ior (match_test "TARGET_SHIFT1") + (match_test "optimize_function_for_size_p (cfun)"))) + (const_string "0") + (const_string "*"))) + (set_attr "mode" "")]) + ;; Convert shift to the shiftx pattern to avoid flags dependency. 
(define_split [(set (match_operand:SWI48 0 "register_operand") @@ -11915,19 +11961,19 @@ (zero_extend:DI (any_shiftrt:SI (match_dup 1) (match_dup 2))))] "operands[2] = gen_lowpart (SImode, operands[2]);") -(define_insn "*3_1" +(define_insn "*ashr3_1" [(set (match_operand:SWI12 0 "nonimmediate_operand" "=m") - (any_shiftrt:SWI12 + (ashiftrt:SWI12 (match_operand:SWI12 1 "nonimmediate_operand" "0") (match_operand:QI 2 "nonmemory_operand" "c"))) (clobber (reg:CC FLAGS_REG))] - "ix86_binary_operator_ok (, mode, operands)" + "ix86_binary_operator_ok (ASHIFTRT, mode, operands)" { if (operands[2] == const1_rtx && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))) - return "{}\t%0"; + return "sar{}\t%0"; else - return "{}\t{%2, %0|%0, %2}"; + return "sar{}\t{%2, %0|%0, %2}"; } [(set_attr "type" "ishift") (set (attr "length_immediate") @@ -11939,6 +11985,74 @@ (const_string "*"))) (set_attr "mode" "")]) +(define_insn "*lshrqi3_1" + [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,?k") + (lshiftrt:QI + (match_operand:QI 1 "nonimmediate_operand" "0, k") + (match_operand:QI 2 "nonmemory_operand" "cI,Wb"))) + (clobber (reg:CC FLAGS_REG))] + "ix86_binary_operator_ok (LSHIFTRT, QImode, operands)" +{ + switch (get_attr_type (insn)) + { + case TYPE_ISHIFT: + if (operands[2] == const1_rtx + && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))) + return "shr{b}\t%0"; + else + return "shr{b}\t{%2, %0|%0, %2}"; + case TYPE_MSKLOG: + return "#"; + default: + gcc_unreachable (); + } +} + [(set_attr "isa" "*,avx512dq") + (set_attr "type" "ishift,msklog") + (set (attr "length_immediate") + (if_then_else + (and (and (match_operand 2 "const1_operand") + (eq_attr "alternative" "0")) + (ior (match_test "TARGET_SHIFT1") + (match_test "optimize_function_for_size_p (cfun)"))) + (const_string "0") + (const_string "*"))) + (set_attr "mode" "QI")]) + +(define_insn "*lshrhi3_1" + [(set (match_operand:HI 0 "nonimmediate_operand" "=rm, ?k") + (lshiftrt:HI + (match_operand:HI 1 "nonimmediate_operand" "0, k") + (match_operand:QI 2 "nonmemory_operand" "cI, Ww"))) + (clobber (reg:CC FLAGS_REG))] + "ix86_binary_operator_ok (LSHIFTRT, HImode, operands)" +{ + switch (get_attr_type (insn)) + { + case TYPE_ISHIFT: + if (operands[2] == const1_rtx + && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))) + return "shr{w}\t%0"; + else + return "shr{w}\t{%2, %0|%0, %2}"; + case TYPE_MSKLOG: + return "#"; + default: + gcc_unreachable (); + } +} + [(set_attr "isa" "*, avx512f") + (set_attr "type" "ishift,msklog") + (set (attr "length_immediate") + (if_then_else + (and (and (match_operand 2 "const1_operand") + (eq_attr "alternative" "0")) + (ior (match_test "TARGET_SHIFT1") + (match_test "optimize_function_for_size_p (cfun)"))) + (const_string "0") + (const_string "*"))) + (set_attr "mode" "HI")]) + (define_insn "*3_1_slp" [(set (strict_low_part (match_operand:SWI12 0 "register_operand" "+")) (any_shiftrt:SWI12 (match_operand:SWI12 1 "register_operand" "0") diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index ab29999..f8759e4 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -1755,6 +1755,20 @@ (set_attr "prefix" "vex") (set_attr "mode" "")]) +(define_split + [(set (match_operand:SWI1248_AVX512BW 0 "mask_reg_operand") + (any_lshift:SWI1248_AVX512BW + (match_operand:SWI1248_AVX512BW 1 "mask_reg_operand") + (match_operand 2 "const_int_operand"))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_AVX512F && reload_completed" + [(parallel + [(set (match_dup 0) + (any_lshift:SWI1248_AVX512BW + (match_dup 1) 
+ (match_dup 2))) + (unspec [(const_int 0)] UNSPEC_MASKOP)])]) + (define_insn "ktest" [(set (reg:CC FLAGS_REG) (unspec:CC -- cgit v1.1 From a6291d88d5b6c17d41950e21d7d452f7f0f73020 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Tue, 13 Jul 2021 18:22:03 +0800 Subject: Remove pass_cpb which is related to enable avx512 embedded broadcast from constant pool. By optimizing vector movement to broadcast in ix86_expand_vector_move during pass_expand, pass_reload/LRA can automatically generate an avx512 embedded broadcast, pass_cpb is not needed. Considering that in the absence of avx512f, broadcast from memory is still slightly faster than loading the entire memory, so always enable broadcast. benchmark: https://gitlab.com/x86-benchmarks/microbenchmark/-/tree/vaddps/broadcast The performance diff strategy : cycles memory : 1046611188 memory : 1255420817 memory : 1044720793 memory : 1253414145 average : 1097868397 broadcast : 1044430688 broadcast : 1044477630 broadcast : 1253554603 broadcast : 1044561934 average : 1096756213 But however broadcast has larger size. the size diff size broadcast.o text data bss dec hex filename 137 0 0 137 89 broadcast.o size memory.o text data bss dec hex filename 115 0 0 115 73 memory.o gcc/ChangeLog: * config/i386/i386-expand.c (ix86_broadcast_from_integer_constant): Rename to .. (ix86_broadcast_from_constant): .. this, and extend it to handle float mode. (ix86_expand_vector_move): Extend to float mode. * config/i386/i386-features.c (replace_constant_pool_with_broadcast): Remove. (remove_partial_avx_dependency_gate): Ditto. (constant_pool_broadcast): Ditto. (class pass_constant_pool_broadcast): Ditto. (make_pass_constant_pool_broadcast): Ditto. (remove_partial_avx_dependency): Adjust gate. * config/i386/i386-passes.def: Remove pass_constant_pool_broadcast. * config/i386/i386-protos.h (make_pass_constant_pool_broadcast): Remove. gcc/testsuite/ChangeLog: * gcc.target/i386/fuse-caller-save-xmm.c: Adjust testcase. --- gcc/config/i386/i386-expand.c | 36 +++++++-- gcc/config/i386/i386-features.c | 157 ++-------------------------------------- gcc/config/i386/i386-passes.def | 1 - gcc/config/i386/i386-protos.h | 1 - 4 files changed, 33 insertions(+), 162 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 69ea79e..896bd68 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -453,8 +453,10 @@ ix86_expand_move (machine_mode mode, rtx operands[]) emit_insn (gen_rtx_SET (op0, op1)); } +/* OP is a memref of CONST_VECTOR, return scalar constant mem + if CONST_VECTOR is a vec_duplicate, else return NULL. */ static rtx -ix86_broadcast_from_integer_constant (machine_mode mode, rtx op) +ix86_broadcast_from_constant (machine_mode mode, rtx op) { int nunits = GET_MODE_NUNITS (mode); if (nunits < 2) @@ -462,7 +464,8 @@ ix86_broadcast_from_integer_constant (machine_mode mode, rtx op) /* Don't use integer vector broadcast if we can't move from GPR to SSE register directly. 
*/ - if (!TARGET_INTER_UNIT_MOVES_TO_VEC) + if (!TARGET_INTER_UNIT_MOVES_TO_VEC + && INTEGRAL_MODE_P (mode)) return nullptr; /* Convert CONST_VECTOR to a non-standard SSE constant integer @@ -470,12 +473,17 @@ ix86_broadcast_from_integer_constant (machine_mode mode, rtx op) if (!(TARGET_AVX2 || (TARGET_AVX && (GET_MODE_INNER (mode) == SImode - || GET_MODE_INNER (mode) == DImode))) + || GET_MODE_INNER (mode) == DImode)) + || FLOAT_MODE_P (mode)) || standard_sse_constant_p (op, mode)) return nullptr; - /* Don't broadcast from a 64-bit integer constant in 32-bit mode. */ - if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT) + /* Don't broadcast from a 64-bit integer constant in 32-bit mode. + We can still put 64-bit integer constant in memory when + avx512 embed broadcast is available. */ + if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT + && (!TARGET_AVX512F + || (GET_MODE_SIZE (mode) < 64 && !TARGET_AVX512VL))) return nullptr; if (GET_MODE_INNER (mode) == TImode) @@ -561,17 +569,29 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[]) if (can_create_pseudo_p () && GET_MODE_SIZE (mode) >= 16 - && GET_MODE_CLASS (mode) == MODE_VECTOR_INT + && VECTOR_MODE_P (mode) && (MEM_P (op1) && SYMBOL_REF_P (XEXP (op1, 0)) && CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0)))) { - rtx first = ix86_broadcast_from_integer_constant (mode, op1); + rtx first = ix86_broadcast_from_constant (mode, op1); if (first != nullptr) { /* Broadcast to XMM/YMM/ZMM register from an integer - constant. */ + constant or scalar mem. */ + /* Hard registers are used for 2 purposes: + 1. Prevent stack realignment when the original code + doesn't use vector registers, which is the same for + memcpy and memset. + 2. Prevent combine to convert constant broadcast to + load from constant pool. */ op1 = ix86_gen_scratch_sse_rtx (mode); + if (FLOAT_MODE_P (mode) + || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode)) + { + first = force_const_mem (GET_MODE_INNER (mode), first); + op1 = gen_reg_rtx (mode); + } bool ok = ix86_expand_vector_init_duplicate (false, mode, op1, first); gcc_assert (ok); diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c index cbd430a..d9c6652 100644 --- a/gcc/config/i386/i386-features.c +++ b/gcc/config/i386/i386-features.c @@ -2136,81 +2136,6 @@ make_pass_insert_endbr_and_patchable_area (gcc::context *ctxt) return new pass_insert_endbr_and_patchable_area (ctxt); } -/* Replace all one-value const vector that are referenced by SYMBOL_REFs in x - with embedded broadcast. i.e.transform - - vpaddq .LC0(%rip), %zmm0, %zmm0 - ret - .LC0: - .quad 3 - .quad 3 - .quad 3 - .quad 3 - .quad 3 - .quad 3 - .quad 3 - .quad 3 - - to - - vpaddq .LC0(%rip){1to8}, %zmm0, %zmm0 - ret - .LC0: - .quad 3 */ -static void -replace_constant_pool_with_broadcast (rtx_insn *insn) -{ - subrtx_ptr_iterator::array_type array; - FOR_EACH_SUBRTX_PTR (iter, array, &PATTERN (insn), ALL) - { - rtx *loc = *iter; - rtx x = *loc; - rtx broadcast_mem, vec_dup, constant, first; - machine_mode mode; - - /* Constant pool. */ - if (!MEM_P (x) - || !SYMBOL_REF_P (XEXP (x, 0)) - || !CONSTANT_POOL_ADDRESS_P (XEXP (x, 0))) - continue; - - /* Const vector. */ - mode = GET_MODE (x); - if (!VECTOR_MODE_P (mode)) - return; - constant = get_pool_constant (XEXP (x, 0)); - if (GET_CODE (constant) != CONST_VECTOR) - return; - - /* There could be some rtx like - (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1"))) - but with "*.LC1" refer to V2DI constant vector. 
*/ - if (GET_MODE (constant) != mode) - { - constant = simplify_subreg (mode, constant, GET_MODE (constant), 0); - if (constant == NULL_RTX || GET_CODE (constant) != CONST_VECTOR) - return; - } - first = XVECEXP (constant, 0, 0); - - for (int i = 1; i < GET_MODE_NUNITS (mode); ++i) - { - rtx tmp = XVECEXP (constant, 0, i); - /* Vector duplicate value. */ - if (!rtx_equal_p (tmp, first)) - return; - } - - /* Replace with embedded broadcast. */ - broadcast_mem = force_const_mem (GET_MODE_INNER (mode), first); - vec_dup = gen_rtx_VEC_DUPLICATE (mode, broadcast_mem); - validate_change (insn, loc, vec_dup, 0); - - /* At most 1 memory_operand in an insn. */ - return; - } -} - /* At entry of the nearest common dominator for basic blocks with conversions, generate a single vxorps %xmmN, %xmmN, %xmmN @@ -2249,10 +2174,6 @@ remove_partial_avx_dependency (void) if (!NONDEBUG_INSN_P (insn)) continue; - /* Handle AVX512 embedded broadcast here to save compile time. */ - if (TARGET_AVX512F) - replace_constant_pool_with_broadcast (insn); - set = single_set (insn); if (!set) continue; @@ -2384,16 +2305,6 @@ remove_partial_avx_dependency (void) return 0; } -static bool -remove_partial_avx_dependency_gate () -{ - return (TARGET_AVX - && TARGET_SSE_PARTIAL_REG_DEPENDENCY - && TARGET_SSE_MATH - && optimize - && optimize_function_for_speed_p (cfun)); -} - namespace { const pass_data pass_data_remove_partial_avx_dependency = @@ -2419,7 +2330,11 @@ public: /* opt_pass methods: */ virtual bool gate (function *) { - return remove_partial_avx_dependency_gate (); + return (TARGET_AVX + && TARGET_SSE_PARTIAL_REG_DEPENDENCY + && TARGET_SSE_MATH + && optimize + && optimize_function_for_speed_p (cfun)); } virtual unsigned int execute (function *) @@ -2436,68 +2351,6 @@ make_pass_remove_partial_avx_dependency (gcc::context *ctxt) return new pass_remove_partial_avx_dependency (ctxt); } -/* For const vector having one duplicated value, there's no need to put - whole vector in the constant pool when target supports embedded broadcast. */ -static unsigned int -constant_pool_broadcast (void) -{ - timevar_push (TV_MACH_DEP); - rtx_insn *insn; - - for (insn = get_insns (); insn; insn = NEXT_INSN (insn)) - { - if (INSN_P (insn)) - replace_constant_pool_with_broadcast (insn); - } - timevar_pop (TV_MACH_DEP); - return 0; -} - -namespace { - -const pass_data pass_data_constant_pool_broadcast = -{ - RTL_PASS, /* type */ - "cpb", /* name */ - OPTGROUP_NONE, /* optinfo_flags */ - TV_MACH_DEP, /* tv_id */ - 0, /* properties_required */ - 0, /* properties_provided */ - 0, /* properties_destroyed */ - 0, /* todo_flags_start */ - TODO_df_finish, /* todo_flags_finish */ -}; - -class pass_constant_pool_broadcast : public rtl_opt_pass -{ -public: - pass_constant_pool_broadcast (gcc::context *ctxt) - : rtl_opt_pass (pass_data_constant_pool_broadcast, ctxt) - {} - - /* opt_pass methods: */ - virtual bool gate (function *) - { - /* Return false if rpad pass gate is true. - replace_constant_pool_with_broadcast is called - from both this pass and rpad pass. */ - return (TARGET_AVX512F && !remove_partial_avx_dependency_gate ()); - } - - virtual unsigned int execute (function *) - { - return constant_pool_broadcast (); - } -}; // class pass_cpb - -} // anon namespace - -rtl_opt_pass * -make_pass_constant_pool_broadcast (gcc::context *ctxt) -{ - return new pass_constant_pool_broadcast (ctxt); -} - /* This compares the priority of target features in function DECL1 and DECL2. 
It returns positive value if DECL1 is higher priority, negative value if DECL2 is higher priority and 0 if they are the diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def index 44df00e..29baf8a 100644 --- a/gcc/config/i386/i386-passes.def +++ b/gcc/config/i386/i386-passes.def @@ -33,4 +33,3 @@ along with GCC; see the file COPYING3. If not see INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_endbr_and_patchable_area); INSERT_PASS_AFTER (pass_combine, 1, pass_remove_partial_avx_dependency); - INSERT_PASS_AFTER (pass_combine, 1, pass_constant_pool_broadcast); diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 51376fc..07ac02a 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -395,4 +395,3 @@ extern rtl_opt_pass *make_pass_insert_endbr_and_patchable_area (gcc::context *); extern rtl_opt_pass *make_pass_remove_partial_avx_dependency (gcc::context *); -extern rtl_opt_pass *make_pass_constant_pool_broadcast (gcc::context *); -- cgit v1.1 From 3382846558e02044598556e66e5ea1cb3115429d Mon Sep 17 00:00:00 2001 From: Haochen Gui Date: Fri, 23 Jul 2021 09:47:23 +0800 Subject: Fix execution failure of parity_1.f90 on P10 [PR100952] gcc/ PR target/100952 * config/rs6000/rs6000.md (cstore4): Fix wrong fall through. --- gcc/config/rs6000/rs6000.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index 2368153..a84438f 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -11831,7 +11831,10 @@ { /* Everything is best done with setbc[r] if available. */ if (TARGET_POWER10 && TARGET_ISEL) - rs6000_emit_int_cmove (operands[0], operands[1], const1_rtx, const0_rtx); + { + rs6000_emit_int_cmove (operands[0], operands[1], const1_rtx, const0_rtx); + DONE; + } /* Expanding EQ and NE directly to some machine instructions does not help but does hurt combine. So don't. */ -- cgit v1.1 From 484acfa4cfe9385d7b78919ca9eb2047ded8f078 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Tue, 6 Jul 2021 16:20:02 +0100 Subject: aarch64: Use memcpy to copy vector tables in vqtbl[234] intrinsics Use __builtin_memcpy to copy vector structures instead of building a new opaque structure one vector at a time in each of the vqtbl[234] Neon intrinsics in arm_neon.h. This simplifies the header file and also improves code generation - superfluous move instructions were emitted for every register extraction/set in this additional structure. Add new code generation tests to verify that superfluous move instructions are no longer generated for the vqtbl[234] intrinsics. gcc/ChangeLog: 2021-07-08 Jonathan Wright * config/aarch64/arm_neon.h (vqtbl2_s8): Use __builtin_memcpy instead of constructing __builtin_aarch64_simd_oi one vector at a time. (vqtbl2_u8): Likewise. (vqtbl2_p8): Likewise. (vqtbl2q_s8): Likewise. (vqtbl2q_u8): Likewise. (vqtbl2q_p8): Likewise. (vqtbl3_s8): Use __builtin_memcpy instead of constructing __builtin_aarch64_simd_ci one vector at a time. (vqtbl3_u8): Likewise. (vqtbl3_p8): Likewise. (vqtbl3q_s8): Likewise. (vqtbl3q_u8): Likewise. (vqtbl3q_p8): Likewise. (vqtbl4_s8): Use __builtin_memcpy instead of constructing __builtin_aarch64_simd_xi one vector at a time. (vqtbl4_u8): Likewise. (vqtbl4_p8): Likewise. (vqtbl4q_s8): Likewise. (vqtbl4q_u8): Likewise. (vqtbl4q_p8): Likewise. gcc/testsuite/ChangeLog: * gcc.target/aarch64/vector_structure_intrinsics.c: New test. 
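As a concrete illustration (not the actual contents of the new test file), a
single call such as:

    #include <arm_neon.h>

    uint8x16_t
    use_vqtbl2q_u8 (uint8x16x2_t tab, uint8x16_t idx)
    {
      /* The two-vector table is now packed with one __builtin_memcpy,
	 so this should expand to a tbl on the incoming register pair.  */
      return vqtbl2q_u8 (tab, idx);
    }

should no longer produce the superfluous mov instructions that were previously
emitted while building the opaque structure one vector at a time.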
--- gcc/config/aarch64/arm_neon.h | 72 +++++++++++-------------------------------- 1 file changed, 18 insertions(+), 54 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 1048d7c..31ae86e 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -23321,8 +23321,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl2_s8 (int8x16x2_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, __tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, __tab.val[1], 1); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return __builtin_aarch64_qtbl2v8qi (__o, (int8x8_t)__idx); } @@ -23331,8 +23330,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl2_u8 (uint8x16x2_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (uint8x8_t)__builtin_aarch64_qtbl2v8qi (__o, (int8x8_t)__idx); } @@ -23341,8 +23339,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl2_p8 (poly8x16x2_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (poly8x8_t)__builtin_aarch64_qtbl2v8qi (__o, (int8x8_t)__idx); } @@ -23351,8 +23348,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl2q_s8 (int8x16x2_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return __builtin_aarch64_qtbl2v16qi (__o, (int8x16_t)__idx); } @@ -23361,8 +23357,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl2q_u8 (uint8x16x2_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (uint8x16_t)__builtin_aarch64_qtbl2v16qi (__o, (int8x16_t)__idx); } @@ -23371,8 +23366,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl2q_p8 (poly8x16x2_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (poly8x16_t)__builtin_aarch64_qtbl2v16qi (__o, (int8x16_t)__idx); } @@ -23383,9 +23377,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl3_s8 (int8x16x3_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return __builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)__idx); } @@ -23394,9 +23386,7 @@ 
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl3_u8 (uint8x16x3_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (uint8x8_t)__builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)__idx); } @@ -23405,9 +23395,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl3_p8 (poly8x16x3_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (poly8x8_t)__builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)__idx); } @@ -23416,9 +23404,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl3q_s8 (int8x16x3_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return __builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)__idx); } @@ -23427,9 +23413,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl3q_u8 (uint8x16x3_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (uint8x16_t)__builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)__idx); } @@ -23438,9 +23422,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl3q_p8 (poly8x16x3_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (poly8x16_t)__builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)__idx); } @@ -23451,10 +23433,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl4_s8 (int8x16x4_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return __builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)__idx); } @@ -23463,10 +23442,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl4_u8 (uint8x16x4_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1); - __o = 
__builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (uint8x8_t)__builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)__idx); } @@ -23475,10 +23451,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl4_p8 (poly8x16x4_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (poly8x8_t)__builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)__idx); } @@ -23487,10 +23460,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl4q_s8 (int8x16x4_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return __builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)__idx); } @@ -23499,10 +23469,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl4q_u8 (uint8x16x4_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (uint8x16_t)__builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)__idx); } @@ -23511,10 +23478,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl4q_p8 (poly8x16x4_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (poly8x16_t)__builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)__idx); } -- cgit v1.1 From 5f65676eba16f38e5e22122e6885c0bd8e504276 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Thu, 8 Jul 2021 12:32:45 +0100 Subject: aarch64: Use memcpy to copy vector tables in vqtbx[234] intrinsics Use __builtin_memcpy to copy vector structures instead of building a new opaque structure one vector at a time in each of the vqtbx[234] Neon intrinsics in arm_neon.h. This simplifies the header file and also improves code generation - superfluous move instructions were emitted for every register extraction/set in this additional structure. Add new code generation tests to verify that superfluous move instructions are no longer generated for the vqtbx[234] intrinsics. 
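As with the vqtbl[234] change, a minimal illustration (again, not the actual
test file contents) is an extended-table lookup such as:

    #include <arm_neon.h>

    int8x8_t
    use_vqtbx4_s8 (int8x8_t r, int8x16x4_t tab, uint8x8_t idx)
    {
      /* The four-vector table is copied into the opaque argument with a
	 single __builtin_memcpy, so this should map to a tbx without the
	 per-register set operations used before.  */
      return vqtbx4_s8 (r, tab, idx);
    }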
gcc/ChangeLog: 2021-07-08 Jonathan Wright * config/aarch64/arm_neon.h (vqtbx2_s8): Use __builtin_memcpy instead of constructing __builtin_aarch64_simd_oi one vector at a time. (vqtbx2_u8): Likewise. (vqtbx2_p8): Likewise. (vqtbx2q_s8): Likewise. (vqtbx2q_u8): Likewise. (vqtbx2q_p8): Likewise. (vqtbx3_s8): Use __builtin_memcpy instead of constructing __builtin_aarch64_simd_ci one vector at a time. (vqtbx3_u8): Likewise. (vqtbx3_p8): Likewise. (vqtbx3q_s8): Likewise. (vqtbx3q_u8): Likewise. (vqtbx3q_p8): Likewise. (vqtbx4_s8): Use __builtin_memcpy instead of constructing __builtin_aarch64_simd_xi one vector at a time. (vqtbx4_u8): Likewise. (vqtbx4_p8): Likewise. (vqtbx4q_s8): Likewise. (vqtbx4q_u8): Likewise. (vqtbx4q_p8): Likewise. gcc/testsuite/ChangeLog: * gcc.target/aarch64/vector_structure_intrinsics.c: New tests. --- gcc/config/aarch64/arm_neon.h | 77 ++++++++++++------------------------------- 1 file changed, 21 insertions(+), 56 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 31ae86e..a7b8449 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -23482,15 +23482,14 @@ vqtbl4q_p8 (poly8x16x4_t __tab, uint8x16_t __idx) return (poly8x16_t)__builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)__idx); } - /* vqtbx2 */ + __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx2_s8 (int8x8_t __r, int8x16x2_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, __tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, __tab.val[1], 1); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return __builtin_aarch64_qtbx2v8qi (__r, __o, (int8x8_t)__idx); } @@ -23499,8 +23498,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx2_u8 (uint8x8_t __r, uint8x16x2_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (uint8x8_t)__builtin_aarch64_qtbx2v8qi ((int8x8_t)__r, __o, (int8x8_t)__idx); } @@ -23510,8 +23508,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx2_p8 (poly8x8_t __r, poly8x16x2_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (poly8x8_t)__builtin_aarch64_qtbx2v8qi ((int8x8_t)__r, __o, (int8x8_t)__idx); } @@ -23521,8 +23518,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx2q_s8 (int8x16_t __r, int8x16x2_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, __tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, __tab.val[1], 1); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return __builtin_aarch64_qtbx2v16qi (__r, __o, (int8x16_t)__idx); } @@ -23531,10 +23527,9 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx2q_u8 (uint8x16_t __r, uint8x16x2_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); + __builtin_memcpy 
(&__o, &__tab, sizeof (__tab)); return (uint8x16_t)__builtin_aarch64_qtbx2v16qi ((int8x16_t)__r, __o, - (int8x16_t)__idx); + (int8x16_t)__idx); } __extension__ extern __inline poly8x16_t @@ -23542,21 +23537,19 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx2q_p8 (poly8x16_t __r, poly8x16x2_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (poly8x16_t)__builtin_aarch64_qtbx2v16qi ((int8x16_t)__r, __o, (int8x16_t)__idx); } /* vqtbx3 */ + __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx3_s8 (int8x8_t __r, int8x16x3_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, __tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, __tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, __tab.val[2], 2); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return __builtin_aarch64_qtbx3v8qi (__r, __o, (int8x8_t)__idx); } @@ -23565,9 +23558,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx3_u8 (uint8x8_t __r, uint8x16x3_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (uint8x8_t)__builtin_aarch64_qtbx3v8qi ((int8x8_t)__r, __o, (int8x8_t)__idx); } @@ -23577,9 +23568,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx3_p8 (poly8x8_t __r, poly8x16x3_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (poly8x8_t)__builtin_aarch64_qtbx3v8qi ((int8x8_t)__r, __o, (int8x8_t)__idx); } @@ -23589,9 +23578,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx3q_s8 (int8x16_t __r, int8x16x3_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, __tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, __tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, __tab.val[2], 2); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return __builtin_aarch64_qtbx3v16qi (__r, __o, (int8x16_t)__idx); } @@ -23600,9 +23587,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx3q_u8 (uint8x16_t __r, uint8x16x3_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (uint8x16_t)__builtin_aarch64_qtbx3v16qi ((int8x16_t)__r, __o, (int8x16_t)__idx); } @@ -23612,9 +23597,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx3q_p8 (poly8x16_t __r, poly8x16x3_t __tab, uint8x16_t __idx) { 
__builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (poly8x16_t)__builtin_aarch64_qtbx3v16qi ((int8x16_t)__r, __o, (int8x16_t)__idx); } @@ -23626,10 +23609,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx4_s8 (int8x8_t __r, int8x16x4_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, __tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, __tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, __tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, __tab.val[3], 3); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return __builtin_aarch64_qtbx4v8qi (__r, __o, (int8x8_t)__idx); } @@ -23638,10 +23618,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx4_u8 (uint8x8_t __r, uint8x16x4_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (uint8x8_t)__builtin_aarch64_qtbx4v8qi ((int8x8_t)__r, __o, (int8x8_t)__idx); } @@ -23651,10 +23628,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx4_p8 (poly8x8_t __r, poly8x16x4_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (poly8x8_t)__builtin_aarch64_qtbx4v8qi ((int8x8_t)__r, __o, (int8x8_t)__idx); } @@ -23664,10 +23638,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx4q_s8 (int8x16_t __r, int8x16x4_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, __tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, __tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, __tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, __tab.val[3], 3); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return __builtin_aarch64_qtbx4v16qi (__r, __o, (int8x16_t)__idx); } @@ -23676,10 +23647,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx4q_u8 (uint8x16_t __r, uint8x16x4_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (uint8x16_t)__builtin_aarch64_qtbx4v16qi ((int8x16_t)__r, __o, (int8x16_t)__idx); } @@ -23689,10 +23657,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) 
vqtbx4q_p8 (poly8x16_t __r, poly8x16x4_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (poly8x16_t)__builtin_aarch64_qtbx4v16qi ((int8x16_t)__r, __o, (int8x16_t)__idx); } -- cgit v1.1 From f2f04d8b9d1f5d4fc8c3a17c7fa5ac518574f2df Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Thu, 8 Jul 2021 23:27:54 +0100 Subject: aarch64: Use memcpy to copy vector tables in vtbl[34] intrinsics Use __builtin_memcpy to copy vector structures instead of building a new opaque structure one vector at a time in each of the vtbl[34] Neon intrinsics in arm_neon.h. This simplifies the header file and also improves code generation - superfluous move instructions were emitted for every register extraction/set in this additional structure. gcc/ChangeLog: 2021-07-08 Jonathan Wright * config/aarch64/arm_neon.h (vtbl3_s8): Use __builtin_memcpy instead of constructing __builtin_aarch64_simd_oi one vector at a time. (vtbl3_u8): Likewise. (vtbl3_p8): Likewise. (vtbl4_s8): Likewise. (vtbl4_u8): Likewise. (vtbl4_p8): Likewise. --- gcc/config/aarch64/arm_neon.h | 39 ++++++++++++--------------------------- 1 file changed, 12 insertions(+), 27 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index a7b8449..0ec46ef 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -9682,11 +9682,9 @@ vtbl3_s8 (int8x8x3_t __tab, int8x8_t __idx) int8x16x2_t __temp; __builtin_aarch64_simd_oi __o; __temp.val[0] = vcombine_s8 (__tab.val[0], __tab.val[1]); - __temp.val[1] = vcombine_s8 (__tab.val[2], vcreate_s8 (__AARCH64_UINT64_C (0x0))); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) __temp.val[1], 1); + __temp.val[1] = vcombine_s8 (__tab.val[2], + vcreate_s8 (__AARCH64_UINT64_C (0x0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); return __builtin_aarch64_qtbl2v8qi (__o, __idx); } @@ -9697,11 +9695,9 @@ vtbl3_u8 (uint8x8x3_t __tab, uint8x8_t __idx) uint8x16x2_t __temp; __builtin_aarch64_simd_oi __o; __temp.val[0] = vcombine_u8 (__tab.val[0], __tab.val[1]); - __temp.val[1] = vcombine_u8 (__tab.val[2], vcreate_u8 (__AARCH64_UINT64_C (0x0))); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) __temp.val[1], 1); + __temp.val[1] = vcombine_u8 (__tab.val[2], + vcreate_u8 (__AARCH64_UINT64_C (0x0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); return (uint8x8_t)__builtin_aarch64_qtbl2v8qi (__o, (int8x8_t)__idx); } @@ -9712,11 +9708,9 @@ vtbl3_p8 (poly8x8x3_t __tab, uint8x8_t __idx) poly8x16x2_t __temp; __builtin_aarch64_simd_oi __o; __temp.val[0] = vcombine_p8 (__tab.val[0], __tab.val[1]); - __temp.val[1] = vcombine_p8 (__tab.val[2], vcreate_p8 (__AARCH64_UINT64_C (0x0))); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) __temp.val[1], 1); + __temp.val[1] = vcombine_p8 (__tab.val[2], + vcreate_p8 (__AARCH64_UINT64_C (0x0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); return 
(poly8x8_t)__builtin_aarch64_qtbl2v8qi (__o, (int8x8_t)__idx); } @@ -9728,10 +9722,7 @@ vtbl4_s8 (int8x8x4_t __tab, int8x8_t __idx) __builtin_aarch64_simd_oi __o; __temp.val[0] = vcombine_s8 (__tab.val[0], __tab.val[1]); __temp.val[1] = vcombine_s8 (__tab.val[2], __tab.val[3]); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); return __builtin_aarch64_qtbl2v8qi (__o, __idx); } @@ -9743,10 +9734,7 @@ vtbl4_u8 (uint8x8x4_t __tab, uint8x8_t __idx) __builtin_aarch64_simd_oi __o; __temp.val[0] = vcombine_u8 (__tab.val[0], __tab.val[1]); __temp.val[1] = vcombine_u8 (__tab.val[2], __tab.val[3]); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); return (uint8x8_t)__builtin_aarch64_qtbl2v8qi (__o, (int8x8_t)__idx); } @@ -9758,10 +9746,7 @@ vtbl4_p8 (poly8x8x4_t __tab, uint8x8_t __idx) __builtin_aarch64_simd_oi __o; __temp.val[0] = vcombine_p8 (__tab.val[0], __tab.val[1]); __temp.val[1] = vcombine_p8 (__tab.val[2], __tab.val[3]); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); return(poly8x8_t)__builtin_aarch64_qtbl2v8qi (__o, (int8x8_t)__idx); } -- cgit v1.1 From 4848e283ccaed451ddcc38edcb9f5ce9e9f2d7eb Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Thu, 8 Jul 2021 23:27:54 +0100 Subject: aarch64: Use memcpy to copy vector tables in vtbx4 intrinsics Use __builtin_memcpy to copy vector structures instead of building a new opaque structure one vector at a time in each of the vtbx4 Neon intrinsics in arm_neon.h. This simplifies the header file and also improves code generation - superfluous move instructions were emitted for every register extraction/set in this additional structure. gcc/ChangeLog: 2021-07-19 Jonathan Wright * config/aarch64/arm_neon.h (vtbx4_s8): Use __builtin_memcpy instead of constructing __builtin_aarch64_simd_oi one vector at a time. (vtbx4_u8): Likewise. (vtbx4_p8): Likewise. 
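For context, a minimal use of one of the intrinsics touched by this commit (an illustrative sketch, not part of the patch; the comments restate the commit description rather than quote compiler output):

#include <arm_neon.h>

/* vtbx4_u8 looks up eight bytes in a 32-byte table held as four
   uint8x8_t values; indices outside 0-31 leave the corresponding
   byte of r unchanged.  */
uint8x8_t
lookup (uint8x8_t r, uint8x8x4_t tab, uint8x8_t idx)
{
  return vtbx4_u8 (r, tab, idx);
}

Inside arm_neon.h the four D-register table halves are first combined into two Q registers; with this change that pair is handed to the qtbx2 builtin through a single __builtin_memcpy instead of one __builtin_aarch64_set_qregoiv16qi call per vector, which is where the superfluous moves came from.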
--- gcc/config/aarch64/arm_neon.h | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 0ec46ef..d383af3 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -28417,10 +28417,7 @@ vtbx4_s8 (int8x8_t __r, int8x8x4_t __tab, int8x8_t __idx) __builtin_aarch64_simd_oi __o; __temp.val[0] = vcombine_s8 (__tab.val[0], __tab.val[1]); __temp.val[1] = vcombine_s8 (__tab.val[2], __tab.val[3]); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); return __builtin_aarch64_qtbx2v8qi (__r, __o, __idx); } @@ -28432,10 +28429,7 @@ vtbx4_u8 (uint8x8_t __r, uint8x8x4_t __tab, uint8x8_t __idx) __builtin_aarch64_simd_oi __o; __temp.val[0] = vcombine_u8 (__tab.val[0], __tab.val[1]); __temp.val[1] = vcombine_u8 (__tab.val[2], __tab.val[3]); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); return (uint8x8_t)__builtin_aarch64_qtbx2v8qi ((int8x8_t)__r, __o, (int8x8_t)__idx); } @@ -28448,10 +28442,7 @@ vtbx4_p8 (poly8x8_t __r, poly8x8x4_t __tab, uint8x8_t __idx) __builtin_aarch64_simd_oi __o; __temp.val[0] = vcombine_p8 (__tab.val[0], __tab.val[1]); __temp.val[1] = vcombine_p8 (__tab.val[2], __tab.val[3]); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); return (poly8x8_t)__builtin_aarch64_qtbx2v8qi ((int8x8_t)__r, __o, (int8x8_t)__idx); } -- cgit v1.1 From e8de7edde6c5c3cc60f15c78422b85b4ccdc08bf Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Tue, 20 Jul 2021 10:28:34 +0100 Subject: aarch64: Use memcpy to copy vector tables in vst4[q] intrinsics Use __builtin_memcpy to copy vector structures instead of building a new opaque structure one vector at a time in each of the vst4[q] Neon intrinsics in arm_neon.h. This simplifies the header file and also improves code generation - superfluous move instructions were emitted for every register extraction/set in this additional structure. Add new code generation tests to verify that superfluous move instructions are no longer generated for the vst4q intrinsics. gcc/ChangeLog: 2021-07-20 Jonathan Wright * config/aarch64/arm_neon.h (vst4_s64): Use __builtin_memcpy instead of constructing __builtin_aarch64_simd_xi one vector at a time. (vst4_u64): Likewise. (vst4_f64): Likewise. (vst4_s8): Likewise. (vst4_p8): Likewise. (vst4_s16): Likewise. (vst4_p16): Likewise. (vst4_s32): Likewise. (vst4_u8): Likewise. (vst4_u16): Likewise. (vst4_u32): Likewise. (vst4_f16): Likewise. (vst4_f32): Likewise. (vst4_p64): Likewise. (vst4q_s8): Likewise. (vst4q_p8): Likewise. (vst4q_s16): Likewise. (vst4q_p16): Likewise. (vst4q_s32): Likewise. (vst4q_s64): Likewise. (vst4q_u8): Likewise. (vst4q_u16): Likewise. (vst4q_u32): Likewise. (vst4q_u64): Likewise. (vst4q_f16): Likewise. (vst4q_f32): Likewise. (vst4q_f64): Likewise. (vst4q_p64): Likewise. gcc/testsuite/ChangeLog: * gcc.target/aarch64/vector_structure_intrinsics.c: Add new tests. 
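As a usage illustration (a sketch, not part of the patch): an interleaving store through one of the affected intrinsics. With this change the header builds the XI tuple for the store with a single __builtin_memcpy before issuing the store.

#include <arm_neon.h>

/* Store four vectors interleaved: out[0..3] receive lane 0 of
   val.val[0..3], out[4..7] receive lane 1, and so on.  */
void
store_interleaved (float32_t *out, float32x4x4_t val)
{
  vst4q_f32 (out, val);
}

The intent is that this reduces to a single st4 of the four Q registers, without the per-vector __builtin_aarch64_set_qregxiv4sf moves the old header code produced.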
--- gcc/config/aarch64/arm_neon.h | 148 ++++++++---------------------------------- 1 file changed, 28 insertions(+), 120 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index d383af3..ae3ce8c 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -27914,10 +27914,7 @@ vst4_s64 (int64_t * __a, int64x1x4_t __val) __temp.val[1] = vcombine_s64 (__val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); __temp.val[2] = vcombine_s64 (__val.val[2], vcreate_s64 (__AARCH64_INT64_C (0))); __temp.val[3] = vcombine_s64 (__val.val[3], vcreate_s64 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[3], 3); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st4di ((__builtin_aarch64_simd_di *) __a, __o); } @@ -27931,10 +27928,7 @@ vst4_u64 (uint64_t * __a, uint64x1x4_t __val) __temp.val[1] = vcombine_u64 (__val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); __temp.val[2] = vcombine_u64 (__val.val[2], vcreate_u64 (__AARCH64_UINT64_C (0))); __temp.val[3] = vcombine_u64 (__val.val[3], vcreate_u64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[3], 3); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st4di ((__builtin_aarch64_simd_di *) __a, __o); } @@ -27948,10 +27942,7 @@ vst4_f64 (float64_t * __a, float64x1x4_t __val) __temp.val[1] = vcombine_f64 (__val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); __temp.val[2] = vcombine_f64 (__val.val[2], vcreate_f64 (__AARCH64_UINT64_C (0))); __temp.val[3] = vcombine_f64 (__val.val[3], vcreate_f64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __temp.val[3], 3); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st4df ((__builtin_aarch64_simd_df *) __a, __o); } @@ -27965,10 +27956,7 @@ vst4_s8 (int8_t * __a, int8x8x4_t __val) __temp.val[1] = vcombine_s8 (__val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); __temp.val[2] = vcombine_s8 (__val.val[2], vcreate_s8 (__AARCH64_INT64_C (0))); __temp.val[3] = vcombine_s8 (__val.val[3], vcreate_s8 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[3], 3); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st4v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } @@ -27982,10 +27970,7 @@ vst4_p8 (poly8_t * __a, poly8x8x4_t __val) __temp.val[1] = vcombine_p8 (__val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); __temp.val[2] = vcombine_p8 (__val.val[2], vcreate_p8 
(__AARCH64_UINT64_C (0))); __temp.val[3] = vcombine_p8 (__val.val[3], vcreate_p8 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[3], 3); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st4v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } @@ -27999,10 +27984,7 @@ vst4_s16 (int16_t * __a, int16x4x4_t __val) __temp.val[1] = vcombine_s16 (__val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); __temp.val[2] = vcombine_s16 (__val.val[2], vcreate_s16 (__AARCH64_INT64_C (0))); __temp.val[3] = vcombine_s16 (__val.val[3], vcreate_s16 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[3], 3); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st4v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } @@ -28016,10 +27998,7 @@ vst4_p16 (poly16_t * __a, poly16x4x4_t __val) __temp.val[1] = vcombine_p16 (__val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); __temp.val[2] = vcombine_p16 (__val.val[2], vcreate_p16 (__AARCH64_UINT64_C (0))); __temp.val[3] = vcombine_p16 (__val.val[3], vcreate_p16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[3], 3); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st4v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } @@ -28033,10 +28012,7 @@ vst4_s32 (int32_t * __a, int32x2x4_t __val) __temp.val[1] = vcombine_s32 (__val.val[1], vcreate_s32 (__AARCH64_INT64_C (0))); __temp.val[2] = vcombine_s32 (__val.val[2], vcreate_s32 (__AARCH64_INT64_C (0))); __temp.val[3] = vcombine_s32 (__val.val[3], vcreate_s32 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[3], 3); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st4v2si ((__builtin_aarch64_simd_si *) __a, __o); } @@ -28050,10 +28026,7 @@ vst4_u8 (uint8_t * __a, uint8x8x4_t __val) __temp.val[1] = vcombine_u8 (__val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); __temp.val[2] = vcombine_u8 (__val.val[2], vcreate_u8 (__AARCH64_UINT64_C (0))); __temp.val[3] = vcombine_u8 (__val.val[3], vcreate_u8 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[3], 3); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st4v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } @@ 
-28067,10 +28040,7 @@ vst4_u16 (uint16_t * __a, uint16x4x4_t __val) __temp.val[1] = vcombine_u16 (__val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); __temp.val[2] = vcombine_u16 (__val.val[2], vcreate_u16 (__AARCH64_UINT64_C (0))); __temp.val[3] = vcombine_u16 (__val.val[3], vcreate_u16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[3], 3); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st4v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } @@ -28084,10 +28054,7 @@ vst4_u32 (uint32_t * __a, uint32x2x4_t __val) __temp.val[1] = vcombine_u32 (__val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); __temp.val[2] = vcombine_u32 (__val.val[2], vcreate_u32 (__AARCH64_UINT64_C (0))); __temp.val[3] = vcombine_u32 (__val.val[3], vcreate_u32 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[3], 3); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st4v2si ((__builtin_aarch64_simd_si *) __a, __o); } @@ -28101,10 +28068,7 @@ vst4_f16 (float16_t * __a, float16x4x4_t __val) __temp.val[1] = vcombine_f16 (__val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0))); __temp.val[2] = vcombine_f16 (__val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0))); __temp.val[3] = vcombine_f16 (__val.val[3], vcreate_f16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __temp.val[3], 3); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st4v4hf ((__builtin_aarch64_simd_hf *) __a, __o); } @@ -28118,10 +28082,7 @@ vst4_f32 (float32_t * __a, float32x2x4_t __val) __temp.val[1] = vcombine_f32 (__val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0))); __temp.val[2] = vcombine_f32 (__val.val[2], vcreate_f32 (__AARCH64_UINT64_C (0))); __temp.val[3] = vcombine_f32 (__val.val[3], vcreate_f32 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __temp.val[3], 3); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st4v2sf ((__builtin_aarch64_simd_sf *) __a, __o); } @@ -28135,14 +28096,7 @@ vst4_p64 (poly64_t * __a, poly64x1x4_t __val) __temp.val[1] = vcombine_p64 (__val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0))); __temp.val[2] = vcombine_p64 (__val.val[2], vcreate_p64 (__AARCH64_UINT64_C (0))); __temp.val[3] = vcombine_p64 (__val.val[3], vcreate_p64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, - (poly64x2_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, - (poly64x2_t) __temp.val[1], 1); - __o = 
__builtin_aarch64_set_qregxiv2di_ssps (__o, - (poly64x2_t) __temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, - (poly64x2_t) __temp.val[3], 3); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st4di ((__builtin_aarch64_simd_di *) __a, __o); } @@ -28151,10 +28105,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4q_s8 (int8_t * __a, int8x16x4_t __val) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[3], 3); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } @@ -28163,10 +28114,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4q_p8 (poly8_t * __a, poly8x16x4_t __val) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[3], 3); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } @@ -28175,10 +28123,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4q_s16 (int16_t * __a, int16x8x4_t __val) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[3], 3); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } @@ -28187,10 +28132,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4q_p16 (poly16_t * __a, poly16x8x4_t __val) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[3], 3); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } @@ -28199,10 +28141,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4q_s32 (int32_t * __a, int32x4x4_t __val) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __val.val[3], 3); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st4v4si ((__builtin_aarch64_simd_si *) __a, __o); } @@ -28211,10 +28150,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4q_s64 (int64_t * __a, int64x2x4_t __val) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __val.val[0], 0); - __o = 
__builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __val.val[3], 3); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o); } @@ -28223,10 +28159,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4q_u8 (uint8_t * __a, uint8x16x4_t __val) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[3], 3); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } @@ -28235,10 +28168,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4q_u16 (uint16_t * __a, uint16x8x4_t __val) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[3], 3); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } @@ -28247,10 +28177,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4q_u32 (uint32_t * __a, uint32x4x4_t __val) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __val.val[3], 3); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st4v4si ((__builtin_aarch64_simd_si *) __a, __o); } @@ -28259,10 +28186,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4q_u64 (uint64_t * __a, uint64x2x4_t __val) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __val.val[3], 3); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o); } @@ -28271,10 +28195,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4q_f16 (float16_t * __a, float16x8x4_t __val) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __val.val[3], 3); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st4v8hf ((__builtin_aarch64_simd_hf *) __a, __o); } @@ -28283,10 +28204,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4q_f32 (float32_t * __a, float32x4x4_t __val) { __builtin_aarch64_simd_xi __o; - __o = 
__builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __val.val[3], 3); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st4v4sf ((__builtin_aarch64_simd_sf *) __a, __o); } @@ -28295,10 +28213,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4q_f64 (float64_t * __a, float64x2x4_t __val) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __val.val[3], 3); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st4v2df ((__builtin_aarch64_simd_df *) __a, __o); } @@ -28307,14 +28222,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4q_p64 (poly64_t * __a, poly64x2x4_t __val) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, - (poly64x2_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, - (poly64x2_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, - (poly64x2_t) __val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, - (poly64x2_t) __val.val[3], 3); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o); } -- cgit v1.1 From 95509ee2c135c5338b0bb69bdce63e3b20420bd3 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Wed, 21 Jul 2021 10:55:00 +0100 Subject: aarch64: Use memcpy to copy vector tables in vst3[q] intrinsics Use __builtin_memcpy to copy vector structures instead of building a new opaque structure one vector at a time in each of the vst3[q] Neon intrinsics in arm_neon.h. This simplifies the header file and also improves code generation - superfluous move instructions were emitted for every register extraction/set in this additional structure. Add new code generation tests to verify that superfluous move instructions are no longer generated for the vst3q intrinsics. gcc/ChangeLog: 2021-07-21 Jonathan Wright * config/aarch64/arm_neon.h (vst3_s64): Use __builtin_memcpy instead of constructing __builtin_aarch64_simd_ci one vector at a time. (vst3_u64): Likewise. (vst3_f64): Likewise. (vst3_s8): Likewise. (vst3_p8): Likewise. (vst3_s16): Likewise. (vst3_p16): Likewise. (vst3_s32): Likewise. (vst3_u8): Likewise. (vst3_u16): Likewise. (vst3_u32): Likewise. (vst3_f16): Likewise. (vst3_f32): Likewise. (vst3_p64): Likewise. (vst3q_s8): Likewise. (vst3q_p8): Likewise. (vst3q_s16): Likewise. (vst3q_p16): Likewise. (vst3q_s32): Likewise. (vst3q_s64): Likewise. (vst3q_u8): Likewise. (vst3q_u16): Likewise. (vst3q_u32): Likewise. (vst3q_u64): Likewise. (vst3q_f16): Likewise. (vst3q_f32): Likewise. (vst3q_f64): Likewise. (vst3q_p64): Likewise. gcc/testsuite/ChangeLog: * gcc.target/aarch64/vector_structure_intrinsics.c: Add new tests. 
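Because the hunks below only show the changed lines, here is a consolidated view of roughly what one of the 64-bit (D-register) variants looks like after this change. The function header and the declarations of __o and __temp are reconstructed from the surrounding header style and are assumptions, not lines quoted from the patch:

__extension__ extern __inline void
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst3_s64 (int64_t * __a, int64x1x3_t __val)
{
  __builtin_aarch64_simd_ci __o;
  int64x2x3_t __temp;
  /* Widen each 64-bit element to a Q register, zero-filling the top half.  */
  __temp.val[0] = vcombine_s64 (__val.val[0], vcreate_s64 (__AARCH64_INT64_C (0)));
  __temp.val[1] = vcombine_s64 (__val.val[1], vcreate_s64 (__AARCH64_INT64_C (0)));
  __temp.val[2] = vcombine_s64 (__val.val[2], vcreate_s64 (__AARCH64_INT64_C (0)));
  /* Copy the whole 48-byte structure into the opaque CI tuple in one go,
     replacing the three __builtin_aarch64_set_qregciv2di calls.  */
  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
  __builtin_aarch64_st3di ((__builtin_aarch64_simd_di *) __a, __o);
}

The q variants (vst3q_*) need no widening step, so their bodies shrink to just the __builtin_memcpy followed by the store builtin.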
--- gcc/config/aarch64/arm_neon.h | 118 ++++++++++-------------------------------- 1 file changed, 28 insertions(+), 90 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index ae3ce8c..fde321e 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -27543,9 +27543,7 @@ vst3_s64 (int64_t * __a, int64x1x3_t __val) __temp.val[0] = vcombine_s64 (__val.val[0], vcreate_s64 (__AARCH64_INT64_C (0))); __temp.val[1] = vcombine_s64 (__val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); __temp.val[2] = vcombine_s64 (__val.val[2], vcreate_s64 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[2], 2); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st3di ((__builtin_aarch64_simd_di *) __a, __o); } @@ -27558,9 +27556,7 @@ vst3_u64 (uint64_t * __a, uint64x1x3_t __val) __temp.val[0] = vcombine_u64 (__val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_u64 (__val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); __temp.val[2] = vcombine_u64 (__val.val[2], vcreate_u64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[2], 2); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st3di ((__builtin_aarch64_simd_di *) __a, __o); } @@ -27573,9 +27569,7 @@ vst3_f64 (float64_t * __a, float64x1x3_t __val) __temp.val[0] = vcombine_f64 (__val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_f64 (__val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); __temp.val[2] = vcombine_f64 (__val.val[2], vcreate_f64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __temp.val[2], 2); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st3df ((__builtin_aarch64_simd_df *) __a, __o); } @@ -27588,9 +27582,7 @@ vst3_s8 (int8_t * __a, int8x8x3_t __val) __temp.val[0] = vcombine_s8 (__val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); __temp.val[1] = vcombine_s8 (__val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); __temp.val[2] = vcombine_s8 (__val.val[2], vcreate_s8 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[2], 2); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st3v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } @@ -27603,9 +27595,7 @@ vst3_p8 (poly8_t * __a, poly8x8x3_t __val) __temp.val[0] = vcombine_p8 (__val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_p8 (__val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); __temp.val[2] = vcombine_p8 (__val.val[2], vcreate_p8 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) 
__temp.val[2], 2); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st3v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } @@ -27618,9 +27608,7 @@ vst3_s16 (int16_t * __a, int16x4x3_t __val) __temp.val[0] = vcombine_s16 (__val.val[0], vcreate_s16 (__AARCH64_INT64_C (0))); __temp.val[1] = vcombine_s16 (__val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); __temp.val[2] = vcombine_s16 (__val.val[2], vcreate_s16 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[2], 2); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st3v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } @@ -27633,9 +27621,7 @@ vst3_p16 (poly16_t * __a, poly16x4x3_t __val) __temp.val[0] = vcombine_p16 (__val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_p16 (__val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); __temp.val[2] = vcombine_p16 (__val.val[2], vcreate_p16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[2], 2); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st3v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } @@ -27648,9 +27634,7 @@ vst3_s32 (int32_t * __a, int32x2x3_t __val) __temp.val[0] = vcombine_s32 (__val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); __temp.val[1] = vcombine_s32 (__val.val[1], vcreate_s32 (__AARCH64_INT64_C (0))); __temp.val[2] = vcombine_s32 (__val.val[2], vcreate_s32 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[2], 2); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st3v2si ((__builtin_aarch64_simd_si *) __a, __o); } @@ -27663,9 +27647,7 @@ vst3_u8 (uint8_t * __a, uint8x8x3_t __val) __temp.val[0] = vcombine_u8 (__val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_u8 (__val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); __temp.val[2] = vcombine_u8 (__val.val[2], vcreate_u8 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[2], 2); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st3v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } @@ -27678,9 +27660,7 @@ vst3_u16 (uint16_t * __a, uint16x4x3_t __val) __temp.val[0] = vcombine_u16 (__val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_u16 (__val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); __temp.val[2] = vcombine_u16 (__val.val[2], vcreate_u16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[2], 2); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st3v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } @@ -27693,9 +27673,7 @@ vst3_u32 (uint32_t * 
__a, uint32x2x3_t __val) __temp.val[0] = vcombine_u32 (__val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_u32 (__val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); __temp.val[2] = vcombine_u32 (__val.val[2], vcreate_u32 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[2], 2); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st3v2si ((__builtin_aarch64_simd_si *) __a, __o); } @@ -27708,9 +27686,7 @@ vst3_f16 (float16_t * __a, float16x4x3_t __val) __temp.val[0] = vcombine_f16 (__val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_f16 (__val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0))); __temp.val[2] = vcombine_f16 (__val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __temp.val[2], 2); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st3v4hf ((__builtin_aarch64_simd_hf *) __a, __o); } @@ -27723,9 +27699,7 @@ vst3_f32 (float32_t * __a, float32x2x3_t __val) __temp.val[0] = vcombine_f32 (__val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_f32 (__val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0))); __temp.val[2] = vcombine_f32 (__val.val[2], vcreate_f32 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __temp.val[2], 2); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st3v2sf ((__builtin_aarch64_simd_sf *) __a, __o); } @@ -27738,12 +27712,7 @@ vst3_p64 (poly64_t * __a, poly64x1x3_t __val) __temp.val[0] = vcombine_p64 (__val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_p64 (__val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0))); __temp.val[2] = vcombine_p64 (__val.val[2], vcreate_p64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv2di_ssps (__o, - (poly64x2_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv2di_ssps (__o, - (poly64x2_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv2di_ssps (__o, - (poly64x2_t) __temp.val[2], 2); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st3di ((__builtin_aarch64_simd_di *) __a, __o); } @@ -27752,9 +27721,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst3q_s8 (int8_t * __a, int8x16x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[2], 2); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } @@ -27763,9 +27730,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst3q_p8 (poly8_t * __a, poly8x16x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[1], 1); - __o 
= __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[2], 2); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } @@ -27774,9 +27739,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst3q_s16 (int16_t * __a, int16x8x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[2], 2); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } @@ -27785,9 +27748,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst3q_p16 (poly16_t * __a, poly16x8x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[2], 2); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } @@ -27796,9 +27757,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst3q_s32 (int32_t * __a, int32x4x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[2], 2); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st3v4si ((__builtin_aarch64_simd_si *) __a, __o); } @@ -27807,9 +27766,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst3q_s64 (int64_t * __a, int64x2x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[2], 2); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o); } @@ -27818,9 +27775,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst3q_u8 (uint8_t * __a, uint8x16x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[2], 2); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } @@ -27829,9 +27784,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst3q_u16 (uint16_t * __a, uint16x8x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[2], 2); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } @@ -27840,9 +27793,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst3q_u32 (uint32_t * __a, uint32x4x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv4si (__o, 
(int32x4_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[2], 2); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st3v4si ((__builtin_aarch64_simd_si *) __a, __o); } @@ -27851,9 +27802,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst3q_u64 (uint64_t * __a, uint64x2x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[2], 2); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o); } @@ -27862,9 +27811,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst3q_f16 (float16_t * __a, float16x8x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __val.val[2], 2); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st3v8hf ((__builtin_aarch64_simd_hf *) __a, __o); } @@ -27873,9 +27820,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst3q_f32 (float32_t * __a, float32x4x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __val.val[2], 2); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st3v4sf ((__builtin_aarch64_simd_sf *) __a, __o); } @@ -27884,9 +27829,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst3q_f64 (float64_t * __a, float64x2x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __val.val[2], 2); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st3v2df ((__builtin_aarch64_simd_df *) __a, __o); } @@ -27895,12 +27838,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst3q_p64 (poly64_t * __a, poly64x2x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv2di_ssps (__o, - (poly64x2_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregciv2di_ssps (__o, - (poly64x2_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregciv2di_ssps (__o, - (poly64x2_t) __val.val[2], 2); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o); } -- cgit v1.1 From 03148b8e508ea09ce62259ffb95844182c0b90c6 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Wed, 21 Jul 2021 12:37:01 +0100 Subject: aarch64: Use memcpy to copy vector tables in vst2[q] intrinsics Use __builtin_memcpy to copy vector structures instead of building a new opaque structure one vector at a time in each of the vst2[q] Neon intrinsics in arm_neon.h. This simplifies the header file and also improves code generation - superfluous move instructions were emitted for every register extraction/set in this additional structure. 
Add new code generation tests to verify that superfluous move instructions are no longer generated for the vst2q intrinsics. gcc/ChangeLog: 2021-07-21 Jonathan Wright * config/aarch64/arm_neon.h (vst2_s64): Use __builtin_memcpy instead of constructing __builtin_aarch64_simd_oi one vector at a time. (vst2_u64): Likewise. (vst2_f64): Likewise. (vst2_s8): Likewise. (vst2_p8): Likewise. (vst2_s16): Likewise. (vst2_p16): Likewise. (vst2_s32): Likewise. (vst2_u8): Likewise. (vst2_u16): Likewise. (vst2_u32): Likewise. (vst2_f16): Likewise. (vst2_f32): Likewise. (vst2_p64): Likewise. (vst2q_s8): Likewise. (vst2q_p8): Likewise. (vst2q_s16): Likewise. (vst2q_p16): Likewise. (vst2q_s32): Likewise. (vst2q_s64): Likewise. (vst2q_u8): Likewise. (vst2q_u16): Likewise. (vst2q_u32): Likewise. (vst2q_u64): Likewise. (vst2q_f16): Likewise. (vst2q_f32): Likewise. (vst2q_f64): Likewise. (vst2q_p64): Likewise. gcc/testsuite/ChangeLog: * gcc.target/aarch64/vector_structure_intrinsics.c: Add new tests. --- gcc/config/aarch64/arm_neon.h | 88 ++++++++++++++----------------------------- 1 file changed, 28 insertions(+), 60 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index fde321e..0e4ab35 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -27216,8 +27216,7 @@ vst2_s64 (int64_t * __a, int64x1x2_t __val) int64x2x2_t __temp; __temp.val[0] = vcombine_s64 (__val.val[0], vcreate_s64 (__AARCH64_INT64_C (0))); __temp.val[1] = vcombine_s64 (__val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st2di ((__builtin_aarch64_simd_di *) __a, __o); } @@ -27229,8 +27228,7 @@ vst2_u64 (uint64_t * __a, uint64x1x2_t __val) uint64x2x2_t __temp; __temp.val[0] = vcombine_u64 (__val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_u64 (__val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st2di ((__builtin_aarch64_simd_di *) __a, __o); } @@ -27242,8 +27240,7 @@ vst2_f64 (float64_t * __a, float64x1x2_t __val) float64x2x2_t __temp; __temp.val[0] = vcombine_f64 (__val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_f64 (__val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st2df ((__builtin_aarch64_simd_df *) __a, __o); } @@ -27255,8 +27252,7 @@ vst2_s8 (int8_t * __a, int8x8x2_t __val) int8x16x2_t __temp; __temp.val[0] = vcombine_s8 (__val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); __temp.val[1] = vcombine_s8 (__val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } @@ -27268,8 +27264,7 @@ vst2_p8 (poly8_t * __a, poly8x8x2_t __val) poly8x16x2_t __temp; 
__temp.val[0] = vcombine_p8 (__val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_p8 (__val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } @@ -27281,8 +27276,7 @@ vst2_s16 (int16_t * __a, int16x4x2_t __val) int16x8x2_t __temp; __temp.val[0] = vcombine_s16 (__val.val[0], vcreate_s16 (__AARCH64_INT64_C (0))); __temp.val[1] = vcombine_s16 (__val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } @@ -27294,8 +27288,7 @@ vst2_p16 (poly16_t * __a, poly16x4x2_t __val) poly16x8x2_t __temp; __temp.val[0] = vcombine_p16 (__val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_p16 (__val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } @@ -27307,8 +27300,7 @@ vst2_s32 (int32_t * __a, int32x2x2_t __val) int32x4x2_t __temp; __temp.val[0] = vcombine_s32 (__val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); __temp.val[1] = vcombine_s32 (__val.val[1], vcreate_s32 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st2v2si ((__builtin_aarch64_simd_si *) __a, __o); } @@ -27320,8 +27312,7 @@ vst2_u8 (uint8_t * __a, uint8x8x2_t __val) uint8x16x2_t __temp; __temp.val[0] = vcombine_u8 (__val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_u8 (__val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } @@ -27333,8 +27324,7 @@ vst2_u16 (uint16_t * __a, uint16x4x2_t __val) uint16x8x2_t __temp; __temp.val[0] = vcombine_u16 (__val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_u16 (__val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } @@ -27346,8 +27336,7 @@ vst2_u32 (uint32_t * __a, uint32x2x2_t __val) uint32x4x2_t __temp; __temp.val[0] = vcombine_u32 (__val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_u32 (__val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __temp.val[1], 1); + __builtin_memcpy (&__o, 
&__temp, sizeof (__temp)); __builtin_aarch64_st2v2si ((__builtin_aarch64_simd_si *) __a, __o); } @@ -27359,8 +27348,7 @@ vst2_f16 (float16_t * __a, float16x4x2_t __val) float16x8x2_t __temp; __temp.val[0] = vcombine_f16 (__val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_f16 (__val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv8hf (__o, __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hf (__o, __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st2v4hf (__a, __o); } @@ -27372,8 +27360,7 @@ vst2_f32 (float32_t * __a, float32x2x2_t __val) float32x4x2_t __temp; __temp.val[0] = vcombine_f32 (__val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_f32 (__val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st2v2sf ((__builtin_aarch64_simd_sf *) __a, __o); } @@ -27385,10 +27372,7 @@ vst2_p64 (poly64_t * __a, poly64x1x2_t __val) poly64x2x2_t __temp; __temp.val[0] = vcombine_p64 (__val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_p64 (__val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, - (poly64x2_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, - (poly64x2_t) __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st2di ((__builtin_aarch64_simd_di *) __a, __o); } @@ -27397,8 +27381,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst2q_s8 (int8_t * __a, int8x16x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[1], 1); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } @@ -27407,8 +27390,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst2q_p8 (poly8_t * __a, poly8x16x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[1], 1); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } @@ -27417,8 +27399,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst2q_s16 (int16_t * __a, int16x8x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[1], 1); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } @@ -27427,8 +27408,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst2q_p16 (poly16_t * __a, poly16x8x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[1], 1); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } @@ -27437,8 +27417,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) 
vst2q_s32 (int32_t * __a, int32x4x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __val.val[1], 1); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st2v4si ((__builtin_aarch64_simd_si *) __a, __o); } @@ -27447,8 +27426,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst2q_s64 (int64_t * __a, int64x2x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __val.val[1], 1); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o); } @@ -27457,8 +27435,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst2q_u8 (uint8_t * __a, uint8x16x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[1], 1); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } @@ -27467,8 +27444,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst2q_u16 (uint16_t * __a, uint16x8x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[1], 1); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } @@ -27477,8 +27453,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst2q_u32 (uint32_t * __a, uint32x4x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __val.val[1], 1); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st2v4si ((__builtin_aarch64_simd_si *) __a, __o); } @@ -27487,8 +27462,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst2q_u64 (uint64_t * __a, uint64x2x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __val.val[1], 1); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o); } @@ -27497,8 +27471,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst2q_f16 (float16_t * __a, float16x8x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv8hf (__o, __val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hf (__o, __val.val[1], 1); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st2v8hf (__a, __o); } @@ -27507,8 +27480,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst2q_f32 (float32_t * __a, float32x4x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) __val.val[1], 1); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st2v4sf ((__builtin_aarch64_simd_sf *) __a, __o); } @@ -27517,8 +27489,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst2q_f64 (float64_t * __a, 
float64x2x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) __val.val[1], 1); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st2v2df ((__builtin_aarch64_simd_df *) __a, __o); } @@ -27527,10 +27498,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst2q_p64 (poly64_t * __a, poly64x2x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, - (poly64x2_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, - (poly64x2_t) __val.val[1], 1); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o); } -- cgit v1.1 From 1711b045829d281da9da440d70f2bf410127eea4 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Wed, 21 Jul 2021 16:55:01 +0100 Subject: aarch64: Use memcpy to copy vector tables in vst1[q]_x4 intrinsics Use __builtin_memcpy to copy vector structures instead of using a union in each of the vst1[q]_x4 Neon intrinsics in arm_neon.h. Add new code generation tests to verify that superfluous move instructions are not generated for the vst1q_x4 intrinsics. gcc/ChangeLog: 2021-07-21 Jonathan Wright * config/aarch64/arm_neon.h (vst1_s8_x4): Use __builtin_memcpy instead of using a union. (vst1q_s8_x4): Likewise. (vst1_s16_x4): Likewise. (vst1q_s16_x4): Likewise. (vst1_s32_x4): Likewise. (vst1q_s32_x4): Likewise. (vst1_u8_x4): Likewise. (vst1q_u8_x4): Likewise. (vst1_u16_x4): Likewise. (vst1q_u16_x4): Likewise. (vst1_u32_x4): Likewise. (vst1q_u32_x4): Likewise. (vst1_f16_x4): Likewise. (vst1q_f16_x4): Likewise. (vst1_f32_x4): Likewise. (vst1q_f32_x4): Likewise. (vst1_p8_x4): Likewise. (vst1q_p8_x4): Likewise. (vst1_p16_x4): Likewise. (vst1q_p16_x4): Likewise. (vst1_s64_x4): Likewise. (vst1_u64_x4): Likewise. (vst1_p64_x4): Likewise. (vst1q_s64_x4): Likewise. (vst1q_u64_x4): Likewise. (vst1q_p64_x4): Likewise. (vst1_f64_x4): Likewise. (vst1q_f64_x4): Likewise. gcc/testsuite/ChangeLog: * gcc.target/aarch64/vector_structure_intrinsics.c: Add new tests. 
--- gcc/config/aarch64/arm_neon.h | 266 +++++++++++++++++++++++++++++------------- 1 file changed, 182 insertions(+), 84 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 0e4ab35..9cf16a8 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -26984,226 +26984,324 @@ vst1q_p64_x3 (poly64_t * __a, poly64x2x3_t __val) __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_s8_x4 (int8_t * __a, int8x8x4_t val) +vst1_s8_x4 (int8_t * __a, int8x8x4_t __val) { - union { int8x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4v8qi ((__builtin_aarch64_simd_qi *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + int8x16x4_t __temp; + __temp.val[0] = vcombine_s8 (__val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); + __temp.val[1] = vcombine_s8 (__val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); + __temp.val[2] = vcombine_s8 (__val.val[2], vcreate_s8 (__AARCH64_INT64_C (0))); + __temp.val[3] = vcombine_s8 (__val.val[3], vcreate_s8 (__AARCH64_INT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st1x4v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_s8_x4 (int8_t * __a, int8x16x4_t val) +vst1q_s8_x4 (int8_t * __a, int8x16x4_t __val) { - union { int8x16x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4v16qi ((__builtin_aarch64_simd_qi *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st1x4v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_s16_x4 (int16_t * __a, int16x4x4_t val) +vst1_s16_x4 (int16_t * __a, int16x4x4_t __val) { - union { int16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4v4hi ((__builtin_aarch64_simd_hi *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + int16x8x4_t __temp; + __temp.val[0] = vcombine_s16 (__val.val[0], vcreate_s16 (__AARCH64_INT64_C (0))); + __temp.val[1] = vcombine_s16 (__val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); + __temp.val[2] = vcombine_s16 (__val.val[2], vcreate_s16 (__AARCH64_INT64_C (0))); + __temp.val[3] = vcombine_s16 (__val.val[3], vcreate_s16 (__AARCH64_INT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st1x4v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_s16_x4 (int16_t * __a, int16x8x4_t val) +vst1q_s16_x4 (int16_t * __a, int16x8x4_t __val) { - union { int16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4v8hi ((__builtin_aarch64_simd_hi *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st1x4v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_s32_x4 (int32_t * __a, int32x2x4_t val) +vst1_s32_x4 (int32_t * __a, int32x2x4_t __val) { - union { int32x2x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4v2si ((__builtin_aarch64_simd_si *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + int32x4x4_t 
__temp; + __temp.val[0] = vcombine_s32 (__val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); + __temp.val[1] = vcombine_s32 (__val.val[1], vcreate_s32 (__AARCH64_INT64_C (0))); + __temp.val[2] = vcombine_s32 (__val.val[2], vcreate_s32 (__AARCH64_INT64_C (0))); + __temp.val[3] = vcombine_s32 (__val.val[3], vcreate_s32 (__AARCH64_INT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st1x4v2si ((__builtin_aarch64_simd_si *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_s32_x4 (int32_t * __a, int32x4x4_t val) +vst1q_s32_x4 (int32_t * __a, int32x4x4_t __val) { - union { int32x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4v4si ((__builtin_aarch64_simd_si *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st1x4v4si ((__builtin_aarch64_simd_si *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_u8_x4 (uint8_t * __a, uint8x8x4_t val) +vst1_u8_x4 (uint8_t * __a, uint8x8x4_t __val) { - union { uint8x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4v8qi ((__builtin_aarch64_simd_qi *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + uint8x16x4_t __temp; + __temp.val[0] = vcombine_u8 (__val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u8 (__val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u8 (__val.val[2], vcreate_u8 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_u8 (__val.val[3], vcreate_u8 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st1x4v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_u8_x4 (uint8_t * __a, uint8x16x4_t val) +vst1q_u8_x4 (uint8_t * __a, uint8x16x4_t __val) { - union { uint8x16x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4v16qi ((__builtin_aarch64_simd_qi *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st1x4v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_u16_x4 (uint16_t * __a, uint16x4x4_t val) +vst1_u16_x4 (uint16_t * __a, uint16x4x4_t __val) { - union { uint16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4v4hi ((__builtin_aarch64_simd_hi *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + uint16x8x4_t __temp; + __temp.val[0] = vcombine_u16 (__val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u16 (__val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u16 (__val.val[2], vcreate_u16 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_u16 (__val.val[3], vcreate_u16 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st1x4v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_u16_x4 (uint16_t * __a, uint16x8x4_t val) +vst1q_u16_x4 (uint16_t * __a, uint16x8x4_t __val) { - union { uint16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4v8hi 
((__builtin_aarch64_simd_hi *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st1x4v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_u32_x4 (uint32_t * __a, uint32x2x4_t val) +vst1_u32_x4 (uint32_t * __a, uint32x2x4_t __val) { - union { uint32x2x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4v2si ((__builtin_aarch64_simd_si *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + uint32x4x4_t __temp; + __temp.val[0] = vcombine_u32 (__val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u32 (__val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u32 (__val.val[2], vcreate_u32 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_u32 (__val.val[3], vcreate_u32 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st1x4v2si ((__builtin_aarch64_simd_si *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_u32_x4 (uint32_t * __a, uint32x4x4_t val) +vst1q_u32_x4 (uint32_t * __a, uint32x4x4_t __val) { - union { uint32x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4v4si ((__builtin_aarch64_simd_si *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st1x4v4si ((__builtin_aarch64_simd_si *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_f16_x4 (float16_t * __a, float16x4x4_t val) +vst1_f16_x4 (float16_t * __a, float16x4x4_t __val) { - union { float16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4v4hf ((__builtin_aarch64_simd_hf *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + float16x8x4_t __temp; + __temp.val[0] = vcombine_f16 (__val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f16 (__val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_f16 (__val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_f16 (__val.val[3], vcreate_f16 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st1x4v4hf ((__builtin_aarch64_simd_hf *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_f16_x4 (float16_t * __a, float16x8x4_t val) +vst1q_f16_x4 (float16_t * __a, float16x8x4_t __val) { - union { float16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4v8hf ((__builtin_aarch64_simd_hf *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st1x4v8hf ((__builtin_aarch64_simd_hf *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_f32_x4 (float32_t * __a, float32x2x4_t val) +vst1_f32_x4 (float32_t * __a, float32x2x4_t __val) { - union { float32x2x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4v2sf ((__builtin_aarch64_simd_sf *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + float32x4x4_t __temp; + __temp.val[0] = vcombine_f32 (__val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f32 (__val.val[1], 
vcreate_f32 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_f32 (__val.val[2], vcreate_f32 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_f32 (__val.val[3], vcreate_f32 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st1x4v2sf ((__builtin_aarch64_simd_sf *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_f32_x4 (float32_t * __a, float32x4x4_t val) +vst1q_f32_x4 (float32_t * __a, float32x4x4_t __val) { - union { float32x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4v4sf ((__builtin_aarch64_simd_sf *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st1x4v4sf ((__builtin_aarch64_simd_sf *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_p8_x4 (poly8_t * __a, poly8x8x4_t val) +vst1_p8_x4 (poly8_t * __a, poly8x8x4_t __val) { - union { poly8x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4v8qi ((__builtin_aarch64_simd_qi *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + poly8x16x4_t __temp; + __temp.val[0] = vcombine_p8 (__val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p8 (__val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_p8 (__val.val[2], vcreate_p8 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_p8 (__val.val[3], vcreate_p8 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st1x4v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_p8_x4 (poly8_t * __a, poly8x16x4_t val) +vst1q_p8_x4 (poly8_t * __a, poly8x16x4_t __val) { - union { poly8x16x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4v16qi ((__builtin_aarch64_simd_qi *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st1x4v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_p16_x4 (poly16_t * __a, poly16x4x4_t val) +vst1_p16_x4 (poly16_t * __a, poly16x4x4_t __val) { - union { poly16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4v4hi ((__builtin_aarch64_simd_hi *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + poly16x8x4_t __temp; + __temp.val[0] = vcombine_p16 (__val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p16 (__val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_p16 (__val.val[2], vcreate_p16 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_p16 (__val.val[3], vcreate_p16 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st1x4v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_p16_x4 (poly16_t * __a, poly16x8x4_t val) +vst1q_p16_x4 (poly16_t * __a, poly16x8x4_t __val) { - union { poly16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4v8hi ((__builtin_aarch64_simd_hi *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + 
__builtin_aarch64_st1x4v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_s64_x4 (int64_t * __a, int64x1x4_t val) +vst1_s64_x4 (int64_t * __a, int64x1x4_t __val) { - union { int64x1x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4di ((__builtin_aarch64_simd_di *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + int64x2x4_t __temp; + __temp.val[0] = vcombine_s64 (__val.val[0], vcreate_s64 (__AARCH64_INT64_C (0))); + __temp.val[1] = vcombine_s64 (__val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); + __temp.val[2] = vcombine_s64 (__val.val[2], vcreate_s64 (__AARCH64_INT64_C (0))); + __temp.val[3] = vcombine_s64 (__val.val[3], vcreate_s64 (__AARCH64_INT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st1x4di ((__builtin_aarch64_simd_di *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_u64_x4 (uint64_t * __a, uint64x1x4_t val) +vst1_u64_x4 (uint64_t * __a, uint64x1x4_t __val) { - union { uint64x1x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4di ((__builtin_aarch64_simd_di *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + uint64x2x4_t __temp; + __temp.val[0] = vcombine_u64 (__val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u64 (__val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u64 (__val.val[2], vcreate_u64 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_u64 (__val.val[3], vcreate_u64 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st1x4di ((__builtin_aarch64_simd_di *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_p64_x4 (poly64_t * __a, poly64x1x4_t val) +vst1_p64_x4 (poly64_t * __a, poly64x1x4_t __val) { - union { poly64x1x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4di ((__builtin_aarch64_simd_di *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + poly64x2x4_t __temp; + __temp.val[0] = vcombine_p64 (__val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p64 (__val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_p64 (__val.val[2], vcreate_p64 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_p64 (__val.val[3], vcreate_p64 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st1x4di ((__builtin_aarch64_simd_di *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_s64_x4 (int64_t * __a, int64x2x4_t val) +vst1q_s64_x4 (int64_t * __a, int64x2x4_t __val) { - union { int64x2x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4v2di ((__builtin_aarch64_simd_di *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st1x4v2di ((__builtin_aarch64_simd_di *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_u64_x4 (uint64_t * __a, uint64x2x4_t val) +vst1q_u64_x4 (uint64_t * __a, uint64x2x4_t __val) { - union { uint64x2x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4v2di ((__builtin_aarch64_simd_di *) __a, __u.__o); + 
__builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st1x4v2di ((__builtin_aarch64_simd_di *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_p64_x4 (poly64_t * __a, poly64x2x4_t val) +vst1q_p64_x4 (poly64_t * __a, poly64x2x4_t __val) { - union { poly64x2x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4v2di ((__builtin_aarch64_simd_di *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st1x4v2di ((__builtin_aarch64_simd_di *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_f64_x4 (float64_t * __a, float64x1x4_t val) +vst1_f64_x4 (float64_t * __a, float64x1x4_t __val) { - union { float64x1x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4df ((__builtin_aarch64_simd_df *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + float64x2x4_t __temp; + __temp.val[0] = vcombine_f64 (__val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f64 (__val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_f64 (__val.val[2], vcreate_f64 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_f64 (__val.val[3], vcreate_f64 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st1x4df ((__builtin_aarch64_simd_df *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_f64_x4 (float64_t * __a, float64x2x4_t val) +vst1q_f64_x4 (float64_t * __a, float64x2x4_t __val) { - union { float64x2x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4v2df ((__builtin_aarch64_simd_df *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st1x4v2df ((__builtin_aarch64_simd_df *) __a, __o); } /* vstn */ -- cgit v1.1 From 085666673db03c2e53db368d699c47032c6c5f2e Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Thu, 22 Jul 2021 05:17:27 -0700 Subject: x86: Don't return hard register when LRA is in progress Don't return hard register in ix86_gen_scratch_sse_rtx when LRA is in progress to avoid ICE when there are no available hard registers for LRA. gcc/ PR target/101504 * config/i386/i386.c (ix86_gen_scratch_sse_rtx): Don't return hard register when LRA is in progress. gcc/testsuite/ PR target/101504 * gcc.target/i386/pr101504.c: New test. --- gcc/config/i386/i386.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index ff96134..876a19f 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -23180,7 +23180,7 @@ ix86_optab_supported_p (int op, machine_mode mode1, machine_mode, rtx ix86_gen_scratch_sse_rtx (machine_mode mode) { - if (TARGET_SSE) + if (TARGET_SSE && !lra_in_progress) return gen_rtx_REG (mode, (TARGET_64BIT ? LAST_REX_SSE_REG : LAST_SSE_REG)); -- cgit v1.1 From ccf6e2c21be84a478bcef4cced49879879a1104c Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Fri, 23 Jul 2021 12:41:05 +0100 Subject: aarch64: Use memcpy to copy vector tables in vst1[q]_x3 intrinsics Use __builtin_memcpy to copy vector structures instead of building a new opaque structure one vector at a time in each of the vst1[q]_x3 Neon intrinsics in arm_neon.h. 
This simplifies the header file and also improves code generation - superfluous move instructions were emitted for every register extraction/set in this additional structure. Add new code generation tests to verify that superfluous move instructions are not generated for the vst1q_x3 intrinsics. gcc/ChangeLog: 2021-07-23 Jonathan Wright * config/aarch64/arm_neon.h (vst1_s64_x3): Use __builtin_memcpy instead of constructing __builtin_aarch64_simd_ci one vector at a time. (vst1_u64_x3): Likewise. (vst1_f64_x3): Likewise. (vst1_s8_x3): Likewise. (vst1_p8_x3): Likewise. (vst1_s16_x3): Likewise. (vst1_p16_x3): Likewise. (vst1_s32_x3): Likewise. (vst1_u8_x3): Likewise. (vst1_u16_x3): Likewise. (vst1_u32_x3): Likewise. (vst1_f16_x3): Likewise. (vst1_f32_x3): Likewise. (vst1_p64_x3): Likewise. (vst1q_s8_x3): Likewise. (vst1q_p8_x3): Likewise. (vst1q_s16_x3): Likewise. (vst1q_p16_x3): Likewise. (vst1q_s32_x3): Likewise. (vst1q_s64_x3): Likewise. (vst1q_u8_x3): Likewise. (vst1q_u16_x3): Likewise. (vst1q_u32_x3): Likewise. (vst1q_u64_x3): Likewise. (vst1q_f16_x3): Likewise. (vst1q_f32_x3): Likewise. (vst1q_f64_x3): Likewise. (vst1q_p64_x3): Likewise. gcc/testsuite/ChangeLog: * gcc.target/aarch64/vector_structure_intrinsics.c: Add new tests. --- gcc/config/aarch64/arm_neon.h | 118 ++++++++++-------------------------------- 1 file changed, 28 insertions(+), 90 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 9cf16a8..47bb94c 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -26619,9 +26619,7 @@ vst1_s64_x3 (int64_t * __a, int64x1x3_t __val) __temp.val[0] = vcombine_s64 (__val.val[0], vcreate_s64 (__AARCH64_INT64_C (0))); __temp.val[1] = vcombine_s64 (__val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); __temp.val[2] = vcombine_s64 (__val.val[2], vcreate_s64 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[2], 2); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st1x3di ((__builtin_aarch64_simd_di *) __a, __o); } @@ -26634,9 +26632,7 @@ vst1_u64_x3 (uint64_t * __a, uint64x1x3_t __val) __temp.val[0] = vcombine_u64 (__val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_u64 (__val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); __temp.val[2] = vcombine_u64 (__val.val[2], vcreate_u64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[2], 2); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st1x3di ((__builtin_aarch64_simd_di *) __a, __o); } @@ -26649,9 +26645,7 @@ vst1_f64_x3 (float64_t * __a, float64x1x3_t __val) __temp.val[0] = vcombine_f64 (__val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_f64 (__val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); __temp.val[2] = vcombine_f64 (__val.val[2], vcreate_f64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __temp.val[2], 2); + __builtin_memcpy (&__o, &__temp, sizeof 
(__temp)); __builtin_aarch64_st1x3df ((__builtin_aarch64_simd_df *) __a, __o); } @@ -26664,9 +26658,7 @@ vst1_s8_x3 (int8_t * __a, int8x8x3_t __val) __temp.val[0] = vcombine_s8 (__val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); __temp.val[1] = vcombine_s8 (__val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); __temp.val[2] = vcombine_s8 (__val.val[2], vcreate_s8 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[2], 2); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st1x3v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } @@ -26679,9 +26671,7 @@ vst1_p8_x3 (poly8_t * __a, poly8x8x3_t __val) __temp.val[0] = vcombine_p8 (__val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_p8 (__val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); __temp.val[2] = vcombine_p8 (__val.val[2], vcreate_p8 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[2], 2); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st1x3v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } @@ -26694,9 +26684,7 @@ vst1_s16_x3 (int16_t * __a, int16x4x3_t __val) __temp.val[0] = vcombine_s16 (__val.val[0], vcreate_s16 (__AARCH64_INT64_C (0))); __temp.val[1] = vcombine_s16 (__val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); __temp.val[2] = vcombine_s16 (__val.val[2], vcreate_s16 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[2], 2); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st1x3v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } @@ -26709,9 +26697,7 @@ vst1_p16_x3 (poly16_t * __a, poly16x4x3_t __val) __temp.val[0] = vcombine_p16 (__val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_p16 (__val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); __temp.val[2] = vcombine_p16 (__val.val[2], vcreate_p16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[2], 2); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st1x3v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } @@ -26724,9 +26710,7 @@ vst1_s32_x3 (int32_t * __a, int32x2x3_t __val) __temp.val[0] = vcombine_s32 (__val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); __temp.val[1] = vcombine_s32 (__val.val[1], vcreate_s32 (__AARCH64_INT64_C (0))); __temp.val[2] = vcombine_s32 (__val.val[2], vcreate_s32 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[2], 2); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st1x3v2si ((__builtin_aarch64_simd_si *) __a, __o); } @@ -26739,9 +26723,7 @@ vst1_u8_x3 (uint8_t * __a, uint8x8x3_t __val) __temp.val[0] = 
vcombine_u8 (__val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_u8 (__val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); __temp.val[2] = vcombine_u8 (__val.val[2], vcreate_u8 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[2], 2); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st1x3v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } @@ -26754,9 +26736,7 @@ vst1_u16_x3 (uint16_t * __a, uint16x4x3_t __val) __temp.val[0] = vcombine_u16 (__val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_u16 (__val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); __temp.val[2] = vcombine_u16 (__val.val[2], vcreate_u16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[2], 2); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st1x3v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } @@ -26769,9 +26749,7 @@ vst1_u32_x3 (uint32_t * __a, uint32x2x3_t __val) __temp.val[0] = vcombine_u32 (__val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_u32 (__val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); __temp.val[2] = vcombine_u32 (__val.val[2], vcreate_u32 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[2], 2); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st1x3v2si ((__builtin_aarch64_simd_si *) __a, __o); } @@ -26784,9 +26762,7 @@ vst1_f16_x3 (float16_t * __a, float16x4x3_t __val) __temp.val[0] = vcombine_f16 (__val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_f16 (__val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0))); __temp.val[2] = vcombine_f16 (__val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __temp.val[2], 2); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st1x3v4hf ((__builtin_aarch64_simd_hf *) __a, __o); } @@ -26799,9 +26775,7 @@ vst1_f32_x3 (float32_t * __a, float32x2x3_t __val) __temp.val[0] = vcombine_f32 (__val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_f32 (__val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0))); __temp.val[2] = vcombine_f32 (__val.val[2], vcreate_f32 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __temp.val[2], 2); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st1x3v2sf ((__builtin_aarch64_simd_sf *) __a, __o); } @@ -26814,12 +26788,7 @@ vst1_p64_x3 (poly64_t * __a, poly64x1x3_t __val) __temp.val[0] = vcombine_p64 (__val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_p64 (__val.val[1], vcreate_p64 
(__AARCH64_UINT64_C (0))); __temp.val[2] = vcombine_p64 (__val.val[2], vcreate_p64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv2di_ssps (__o, - (poly64x2_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv2di_ssps (__o, - (poly64x2_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv2di_ssps (__o, - (poly64x2_t) __temp.val[2], 2); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st1x3di ((__builtin_aarch64_simd_di *) __a, __o); } @@ -26828,9 +26797,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_s8_x3 (int8_t * __a, int8x16x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[2], 2); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st1x3v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } @@ -26839,9 +26806,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_p8_x3 (poly8_t * __a, poly8x16x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[2], 2); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st1x3v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } @@ -26850,9 +26815,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_s16_x3 (int16_t * __a, int16x8x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[2], 2); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st1x3v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } @@ -26861,9 +26824,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_p16_x3 (poly16_t * __a, poly16x8x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[2], 2); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st1x3v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } @@ -26872,9 +26833,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_s32_x3 (int32_t * __a, int32x4x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[2], 2); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st1x3v4si ((__builtin_aarch64_simd_si *) __a, __o); } @@ -26883,9 +26842,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_s64_x3 (int64_t * __a, int64x2x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[2], 2); + 
__builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st1x3v2di ((__builtin_aarch64_simd_di *) __a, __o); } @@ -26894,9 +26851,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_u8_x3 (uint8_t * __a, uint8x16x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[2], 2); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st1x3v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } @@ -26905,9 +26860,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_u16_x3 (uint16_t * __a, uint16x8x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[2], 2); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st1x3v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } @@ -26916,9 +26869,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_u32_x3 (uint32_t * __a, uint32x4x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[2], 2); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st1x3v4si ((__builtin_aarch64_simd_si *) __a, __o); } @@ -26927,9 +26878,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_u64_x3 (uint64_t * __a, uint64x2x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[2], 2); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st1x3v2di ((__builtin_aarch64_simd_di *) __a, __o); } @@ -26938,9 +26887,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_f16_x3 (float16_t * __a, float16x8x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __val.val[2], 2); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st1x3v8hf ((__builtin_aarch64_simd_hf *) __a, __o); } @@ -26949,9 +26896,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_f32_x3 (float32_t * __a, float32x4x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __val.val[2], 2); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st1x3v4sf ((__builtin_aarch64_simd_sf *) __a, __o); } @@ -26960,9 +26905,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_f64_x3 (float64_t * __a, float64x2x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) 
__val.val[0], 0); - __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __val.val[2], 2); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st1x3v2df ((__builtin_aarch64_simd_df *) __a, __o); } @@ -26971,12 +26914,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_p64_x3 (poly64_t * __a, poly64x2x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv2di_ssps (__o, - (poly64x2_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregciv2di_ssps (__o, - (poly64x2_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregciv2di_ssps (__o, - (poly64x2_t) __val.val[2], 2); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st1x3v2di ((__builtin_aarch64_simd_di *) __a, __o); } -- cgit v1.1 From 50752b751fff56e7e2c74024bae659d5e9dea50f Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Fri, 23 Jul 2021 13:41:39 +0100 Subject: aarch64: Use memcpy to copy vector tables in vst1[q]_x2 intrinsics Use __builtin_memcpy to copy vector structures instead of building a new opaque structure one vector at a time in each of the vst1[q]_x2 Neon intrinsics in arm_neon.h. This simplifies the header file and also improves code generation - superfluous move instructions were emitted for every register extraction/set in this additional structure. Add new code generation tests to verify that superfluous move instructions are not generated for the vst1q_x2 intrinsics. gcc/ChangeLog: 2021-07-23 Jonathan Wright * config/aarch64/arm_neon.h (vst1_s64_x2): Use __builtin_memcpy instead of constructing __builtin_aarch64_simd_oi one vector at a time. (vst1_u64_x2): Likewise. (vst1_f64_x2): Likewise. (vst1_s8_x2): Likewise. (vst1_p8_x2): Likewise. (vst1_s16_x2): Likewise. (vst1_p16_x2): Likewise. (vst1_s32_x2): Likewise. (vst1_u8_x2): Likewise. (vst1_u16_x2): Likewise. (vst1_u32_x2): Likewise. (vst1_f16_x2): Likewise. (vst1_f32_x2): Likewise. (vst1_p64_x2): Likewise. (vst1q_s8_x2): Likewise. (vst1q_p8_x2): Likewise. (vst1q_s16_x2): Likewise. (vst1q_p16_x2): Likewise. (vst1q_s32_x2): Likewise. (vst1q_s64_x2): Likewise. (vst1q_u8_x2): Likewise. (vst1q_u16_x2): Likewise. (vst1q_u32_x2): Likewise. (vst1q_u64_x2): Likewise. (vst1q_f16_x2): Likewise. (vst1q_f32_x2): Likewise. (vst1q_f64_x2): Likewise. (vst1q_p64_x2): Likewise. gcc/testsuite/ChangeLog: * gcc.target/aarch64/vector_structure_intrinsics.c: Add new tests. 
--- gcc/config/aarch64/arm_neon.h | 88 ++++++++++++++----------------------------- 1 file changed, 28 insertions(+), 60 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 47bb94c..7523974 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -26276,8 +26276,7 @@ vst1_s64_x2 (int64_t * __a, int64x1x2_t __val) = vcombine_s64 (__val.val[0], vcreate_s64 (__AARCH64_INT64_C (0))); __temp.val[1] = vcombine_s64 (__val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st1x2di ((__builtin_aarch64_simd_di *) __a, __o); } @@ -26291,8 +26290,7 @@ vst1_u64_x2 (uint64_t * __a, uint64x1x2_t __val) = vcombine_u64 (__val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_u64 (__val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st1x2di ((__builtin_aarch64_simd_di *) __a, __o); } @@ -26306,8 +26304,7 @@ vst1_f64_x2 (float64_t * __a, float64x1x2_t __val) = vcombine_f64 (__val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_f64 (__val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st1x2df ((__builtin_aarch64_simd_df *) __a, __o); } @@ -26321,8 +26318,7 @@ vst1_s8_x2 (int8_t * __a, int8x8x2_t __val) = vcombine_s8 (__val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); __temp.val[1] = vcombine_s8 (__val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st1x2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } @@ -26336,8 +26332,7 @@ vst1_p8_x2 (poly8_t * __a, poly8x8x2_t __val) = vcombine_p8 (__val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_p8 (__val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st1x2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } @@ -26351,8 +26346,7 @@ vst1_s16_x2 (int16_t * __a, int16x4x2_t __val) = vcombine_s16 (__val.val[0], vcreate_s16 (__AARCH64_INT64_C (0))); __temp.val[1] = vcombine_s16 (__val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st1x2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } @@ -26366,8 +26360,7 @@ vst1_p16_x2 (poly16_t * __a, poly16x4x2_t __val) = vcombine_p16 (__val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_p16 (__val.val[1], vcreate_p16 
(__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st1x2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } @@ -26381,8 +26374,7 @@ vst1_s32_x2 (int32_t * __a, int32x2x2_t __val) = vcombine_s32 (__val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); __temp.val[1] = vcombine_s32 (__val.val[1], vcreate_s32 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st1x2v2si ((__builtin_aarch64_simd_si *) __a, __o); } @@ -26394,8 +26386,7 @@ vst1_u8_x2 (uint8_t * __a, uint8x8x2_t __val) uint8x16x2_t __temp; __temp.val[0] = vcombine_u8 (__val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_u8 (__val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st1x2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } @@ -26407,8 +26398,7 @@ vst1_u16_x2 (uint16_t * __a, uint16x4x2_t __val) uint16x8x2_t __temp; __temp.val[0] = vcombine_u16 (__val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_u16 (__val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st1x2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } @@ -26420,8 +26410,7 @@ vst1_u32_x2 (uint32_t * __a, uint32x2x2_t __val) uint32x4x2_t __temp; __temp.val[0] = vcombine_u32 (__val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_u32 (__val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st1x2v2si ((__builtin_aarch64_simd_si *) __a, __o); } @@ -26433,8 +26422,7 @@ vst1_f16_x2 (float16_t * __a, float16x4x2_t __val) float16x8x2_t __temp; __temp.val[0] = vcombine_f16 (__val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_f16 (__val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv8hf (__o, __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hf (__o, __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st1x2v4hf (__a, __o); } @@ -26446,8 +26434,7 @@ vst1_f32_x2 (float32_t * __a, float32x2x2_t __val) float32x4x2_t __temp; __temp.val[0] = vcombine_f32 (__val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_f32 (__val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st1x2v2sf ((__builtin_aarch64_simd_sf *) __a, __o); } @@ -26459,10 +26446,7 @@ vst1_p64_x2 (poly64_t * __a, poly64x1x2_t __val) poly64x2x2_t 
__temp; __temp.val[0] = vcombine_p64 (__val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_p64 (__val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, - (poly64x2_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, - (poly64x2_t) __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st1x2di ((__builtin_aarch64_simd_di *) __a, __o); } @@ -26471,8 +26455,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_s8_x2 (int8_t * __a, int8x16x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[1], 1); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st1x2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } @@ -26481,8 +26464,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_p8_x2 (poly8_t * __a, poly8x16x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[1], 1); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st1x2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } @@ -26491,8 +26473,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_s16_x2 (int16_t * __a, int16x8x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[1], 1); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st1x2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } @@ -26501,8 +26482,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_p16_x2 (poly16_t * __a, poly16x8x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[1], 1); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st1x2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } @@ -26511,8 +26491,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_s32_x2 (int32_t * __a, int32x4x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __val.val[1], 1); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st1x2v4si ((__builtin_aarch64_simd_si *) __a, __o); } @@ -26521,8 +26500,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_s64_x2 (int64_t * __a, int64x2x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __val.val[1], 1); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st1x2v2di ((__builtin_aarch64_simd_di *) __a, __o); } @@ -26531,8 +26509,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_u8_x2 (uint8_t * __a, uint8x16x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[1], 1); + __builtin_memcpy (&__o, &__val, sizeof (__val)); 
__builtin_aarch64_st1x2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } @@ -26541,8 +26518,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_u16_x2 (uint16_t * __a, uint16x8x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[1], 1); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st1x2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } @@ -26551,8 +26527,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_u32_x2 (uint32_t * __a, uint32x4x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __val.val[1], 1); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st1x2v4si ((__builtin_aarch64_simd_si *) __a, __o); } @@ -26561,8 +26536,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_u64_x2 (uint64_t * __a, uint64x2x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __val.val[1], 1); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st1x2v2di ((__builtin_aarch64_simd_di *) __a, __o); } @@ -26571,8 +26545,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_f16_x2 (float16_t * __a, float16x8x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv8hf (__o, __val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hf (__o, __val.val[1], 1); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st1x2v8hf (__a, __o); } @@ -26581,8 +26554,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_f32_x2 (float32_t * __a, float32x4x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) __val.val[1], 1); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st1x2v4sf ((__builtin_aarch64_simd_sf *) __a, __o); } @@ -26591,8 +26563,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_f64_x2 (float64_t * __a, float64x2x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) __val.val[1], 1); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st1x2v2df ((__builtin_aarch64_simd_df *) __a, __o); } @@ -26601,10 +26572,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_p64_x2 (poly64_t * __a, poly64x2x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, - (poly64x2_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, - (poly64x2_t) __val.val[1], 1); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st1x2v2di ((__builtin_aarch64_simd_di *) __a, __o); } -- cgit v1.1 From 2050ac1a547eebe7de4af98b57429a934e75fff4 Mon Sep 17 00:00:00 2001 From: Tamar Christina Date: Mon, 26 Jul 2021 10:22:23 +0100 Subject: AArch64: correct usdot vectorizer and intrinsics optabs There's a slight mismatch between the vectorizer optabs and the intrinsics patterns for NEON. 
The vectorizer expects operands[3] and operands[0] to be the same but the aarch64 intrinsics expanders expect operands[0] and operands[1] to be the same. This means we need different patterns here. This adds a separate usdot vectorizer pattern which just shuffles around the RTL params. There's also an inconsistency between the usdot and (u|s)dot intrinsics RTL patterns which is not corrected here. gcc/ChangeLog: * config/aarch64/aarch64-builtins.c (TYPES_TERNOP_SUSS, aarch64_types_ternop_suss_qualifiers): New. * config/aarch64/aarch64-simd-builtins.def (usdot_prod): Use it. * config/aarch64/aarch64-simd.md (usdot_prod): Re-organize RTL. * config/aarch64/arm_neon.h (vusdot_s32, vusdotq_s32): Use it. --- gcc/config/aarch64/aarch64-builtins.c | 4 ++++ gcc/config/aarch64/aarch64-simd-builtins.def | 2 +- gcc/config/aarch64/aarch64-simd.md | 28 ++++++++++++++-------------- gcc/config/aarch64/arm_neon.h | 4 ++-- 4 files changed, 21 insertions(+), 17 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c index 9ed4b72..f6b41d9 100644 --- a/gcc/config/aarch64/aarch64-builtins.c +++ b/gcc/config/aarch64/aarch64-builtins.c @@ -209,6 +209,10 @@ static enum aarch64_type_qualifiers aarch64_types_ternop_ssus_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_none, qualifier_none, qualifier_unsigned, qualifier_none }; #define TYPES_TERNOP_SSUS (aarch64_types_ternop_ssus_qualifiers) +static enum aarch64_type_qualifiers +aarch64_types_ternop_suss_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_none, qualifier_unsigned, qualifier_none, qualifier_none }; +#define TYPES_TERNOP_SUSS (aarch64_types_ternop_suss_qualifiers) static enum aarch64_type_qualifiers diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index b7f1237..3bb45a8 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -377,7 +377,7 @@ /* Implemented by _prod. */ BUILTIN_VB (TERNOP, sdot, 0, NONE) BUILTIN_VB (TERNOPU, udot, 0, NONE) - BUILTIN_VB (TERNOP_SSUS, usdot_prod, 10, NONE) + BUILTIN_VB (TERNOP_SUSS, usdot_prod, 10, NONE) /* Implemented by aarch64__lane{q}. */ BUILTIN_VB (QUADOP_LANE, sdot_lane, 0, NONE) BUILTIN_VB (QUADOPU_LANE, udot_lane, 0, NONE) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 7332a73..bf667b9 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -599,20 +599,6 @@ [(set_attr "type" "neon_dot")] ) -;; These instructions map to the __builtins for the armv8.6a I8MM usdot -;; (vector) Dot Product operation. -(define_insn "usdot_prod" - [(set (match_operand:VS 0 "register_operand" "=w") - (plus:VS - (unspec:VS [(match_operand: 2 "register_operand" "w") - (match_operand: 3 "register_operand" "w")] - UNSPEC_USDOT) - (match_operand:VS 1 "register_operand" "0")))] - "TARGET_I8MM" - "usdot\\t%0., %2., %3." - [(set_attr "type" "neon_dot")] -) - ;; These expands map to the Dot Product optab the vectorizer checks for. ;; The auto-vectorizer expects a dot product builtin that also does an ;; accumulation into the provided register. @@ -648,6 +634,20 @@ DONE; }) +;; These instructions map to the __builtins for the Armv8.6-a I8MM usdot +;; (vector) Dot Product operation and the vectorized optab. 
+(define_insn "usdot_prod" + [(set (match_operand:VS 0 "register_operand" "=w") + (plus:VS + (unspec:VS [(match_operand: 1 "register_operand" "w") + (match_operand: 2 "register_operand" "w")] + UNSPEC_USDOT) + (match_operand:VS 3 "register_operand" "0")))] + "TARGET_I8MM" + "usdot\\t%0., %1., %2." + [(set_attr "type" "neon_dot")] +) + ;; These instructions map to the __builtins for the Dot Product ;; indexed operations. (define_insn "aarch64_dot_lane" diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 7523974..0f43994 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -33744,14 +33744,14 @@ __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vusdot_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b) { - return __builtin_aarch64_usdot_prodv8qi_ssus (__r, __a, __b); + return __builtin_aarch64_usdot_prodv8qi_suss (__a, __b, __r); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vusdotq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b) { - return __builtin_aarch64_usdot_prodv16qi_ssus (__r, __a, __b); + return __builtin_aarch64_usdot_prodv16qi_suss (__a, __b, __r); } __extension__ extern __inline int32x2_t -- cgit v1.1 From 1ab2270036dc0f2a13442ce682267bc7433ffb34 Mon Sep 17 00:00:00 2001 From: Tamar Christina Date: Mon, 26 Jul 2021 10:23:21 +0100 Subject: AArch64: correct dot-product RTL patterns for aarch64. The previous fix for this problem was wrong due to a subtle difference between where NEON expects the RMW values and where intrinsics expects them. The insn pattern is modeled after the intrinsics and so needs an expand for the vectorizer optab to switch the RTL. However operand[3] is not expected to be written to so the current pattern is bogus. Instead I rewrite the RTL to be in canonical ordering and merge them. gcc/ChangeLog: * config/aarch64/aarch64-simd-builtins.def (sdot, udot): Rename to.. (sdot_prod, udot_prod): ... This. * config/aarch64/aarch64-simd.md (aarch64_dot): Merged into... (dot_prod): ... this. (aarch64_dot_lane, aarch64_dot_laneq): Change operands order. (sadv16qi): Use new operands order. * config/aarch64/arm_neon.h (vdot_u32, vdotq_u32, vdot_s32, vdotq_s32): Use new RTL ordering. --- gcc/config/aarch64/aarch64-simd-builtins.def | 4 +- gcc/config/aarch64/aarch64-simd.md | 63 +++++++++++----------------- gcc/config/aarch64/arm_neon.h | 8 ++-- 3 files changed, 31 insertions(+), 44 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index 3bb45a8..402453a 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -375,8 +375,8 @@ BUILTIN_VSDQ_I_DI (BINOP_UUS, urshl, 0, NONE) /* Implemented by _prod. */ - BUILTIN_VB (TERNOP, sdot, 0, NONE) - BUILTIN_VB (TERNOPU, udot, 0, NONE) + BUILTIN_VB (TERNOP, sdot_prod, 10, NONE) + BUILTIN_VB (TERNOPU, udot_prod, 10, NONE) BUILTIN_VB (TERNOP_SUSS, usdot_prod, 10, NONE) /* Implemented by aarch64__lane{q}. */ BUILTIN_VB (QUADOP_LANE, sdot_lane, 0, NONE) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index bf667b9..13c8698 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -587,19 +587,8 @@ DONE; }) -;; These instructions map to the __builtins for the Dot Product operations. 
-(define_insn "aarch64_dot" - [(set (match_operand:VS 0 "register_operand" "=w") - (plus:VS (match_operand:VS 1 "register_operand" "0") - (unspec:VS [(match_operand: 2 "register_operand" "w") - (match_operand: 3 "register_operand" "w")] - DOTPROD)))] - "TARGET_DOTPROD" - "dot\\t%0., %2., %3." - [(set_attr "type" "neon_dot")] -) - -;; These expands map to the Dot Product optab the vectorizer checks for. +;; These expands map to the Dot Product optab the vectorizer checks for +;; and to the intrinsics patttern. ;; The auto-vectorizer expects a dot product builtin that also does an ;; accumulation into the provided register. ;; Given the following pattern @@ -619,20 +608,17 @@ ;; ... ;; ;; and so the vectorizer provides r, in which the result has to be accumulated. -(define_expand "dot_prod" - [(set (match_operand:VS 0 "register_operand") - (plus:VS (unspec:VS [(match_operand: 1 "register_operand") - (match_operand: 2 "register_operand")] - DOTPROD) - (match_operand:VS 3 "register_operand")))] +(define_insn "dot_prod" + [(set (match_operand:VS 0 "register_operand" "=w") + (plus:VS + (unspec:VS [(match_operand: 1 "register_operand" "w") + (match_operand: 2 "register_operand" "w")] + DOTPROD) + (match_operand:VS 3 "register_operand" "0")))] "TARGET_DOTPROD" -{ - emit_insn ( - gen_aarch64_dot (operands[3], operands[3], operands[1], - operands[2])); - emit_insn (gen_rtx_SET (operands[0], operands[3])); - DONE; -}) + "dot\\t%0., %1., %2." + [(set_attr "type" "neon_dot")] +) ;; These instructions map to the __builtins for the Armv8.6-a I8MM usdot ;; (vector) Dot Product operation and the vectorized optab. @@ -652,11 +638,12 @@ ;; indexed operations. (define_insn "aarch64_dot_lane" [(set (match_operand:VS 0 "register_operand" "=w") - (plus:VS (match_operand:VS 1 "register_operand" "0") - (unspec:VS [(match_operand: 2 "register_operand" "w") - (match_operand:V8QI 3 "register_operand" "") - (match_operand:SI 4 "immediate_operand" "i")] - DOTPROD)))] + (plus:VS + (unspec:VS [(match_operand: 2 "register_operand" "w") + (match_operand:V8QI 3 "register_operand" "") + (match_operand:SI 4 "immediate_operand" "i")] + DOTPROD) + (match_operand:VS 1 "register_operand" "0")))] "TARGET_DOTPROD" { operands[4] = aarch64_endian_lane_rtx (V8QImode, INTVAL (operands[4])); @@ -667,11 +654,12 @@ (define_insn "aarch64_dot_laneq" [(set (match_operand:VS 0 "register_operand" "=w") - (plus:VS (match_operand:VS 1 "register_operand" "0") - (unspec:VS [(match_operand: 2 "register_operand" "w") - (match_operand:V16QI 3 "register_operand" "") - (match_operand:SI 4 "immediate_operand" "i")] - DOTPROD)))] + (plus:VS + (unspec:VS [(match_operand: 2 "register_operand" "w") + (match_operand:V16QI 3 "register_operand" "") + (match_operand:SI 4 "immediate_operand" "i")] + DOTPROD) + (match_operand:VS 1 "register_operand" "0")))] "TARGET_DOTPROD" { operands[4] = aarch64_endian_lane_rtx (V16QImode, INTVAL (operands[4])); @@ -944,8 +932,7 @@ rtx ones = force_reg (V16QImode, CONST1_RTX (V16QImode)); rtx abd = gen_reg_rtx (V16QImode); emit_insn (gen_aarch64_abdv16qi (abd, operands[1], operands[2])); - emit_insn (gen_aarch64_udotv16qi (operands[0], operands[3], - abd, ones)); + emit_insn (gen_udot_prodv16qi (operands[0], abd, ones, operands[3])); DONE; } rtx reduc = gen_reg_rtx (V8HImode); diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 0f43994..313b35f 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -31472,28 +31472,28 @@ __extension__ extern __inline uint32x2_t 
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdot_u32 (uint32x2_t __r, uint8x8_t __a, uint8x8_t __b) { - return __builtin_aarch64_udotv8qi_uuuu (__r, __a, __b); + return __builtin_aarch64_udot_prodv8qi_uuuu (__a, __b, __r); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdotq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b) { - return __builtin_aarch64_udotv16qi_uuuu (__r, __a, __b); + return __builtin_aarch64_udot_prodv16qi_uuuu (__a, __b, __r); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdot_s32 (int32x2_t __r, int8x8_t __a, int8x8_t __b) { - return __builtin_aarch64_sdotv8qi (__r, __a, __b); + return __builtin_aarch64_sdot_prodv8qi (__a, __b, __r); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdotq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b) { - return __builtin_aarch64_sdotv16qi (__r, __a, __b); + return __builtin_aarch64_sdot_prodv16qi (__a, __b, __r); } __extension__ extern __inline uint32x2_t -- cgit v1.1 From 3bc9db6a989671bedf19e61bd1b21f79588e99da Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Fri, 16 Jul 2021 15:34:38 +0100 Subject: simplify-rtx: Push sign/zero-extension inside vec_duplicate As a general principle, vec_duplicate should be as close to the root of an expression as possible. Where unary operations have vec_duplicate as an argument, these operations should be pushed inside the vec_duplicate. This patch modifies unary operation simplification to push sign/zero-extension of a scalar inside vec_duplicate. This patch also updates all RTL patterns in aarch64-simd.md to use the new canonical form. gcc/ChangeLog: 2021-07-19 Jonathan Wright * config/aarch64/aarch64-simd.md: Push sign/zero-extension inside vec_duplicate for all patterns. * simplify-rtx.c (simplify_context::simplify_unary_operation_1): Push sign/zero-extension inside vec_duplicate. 
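To make the new canonical form concrete (an illustrative sketch, not part of the patch; the function name and the expected assembly are assumptions), consider an intrinsic that multiplies a vector by a duplicated, widened scalar, such as vmlal_n_s16, whose mlal_n insn pattern is among those updated below.  After this change the duplicated operand of such patterns is matched as (vec_duplicate (sign_extend x)) rather than (sign_extend (vec_duplicate x)):

#include <arm_neon.h>

/* Widening multiply-accumulate by a scalar: the scalar s is sign-extended
   and then duplicated across the lanes, which is the shape the updated
   RTL patterns now expect.  */
int32x4_t
mla_by_scalar (int32x4_t acc, int16x4_t v, int16_t s)
{
  return vmlal_n_s16 (acc, v, s);  /* expected: smlal v0.4s, v1.4h, v2.h[0] */
}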
--- gcc/config/aarch64/aarch64-simd.md | 359 +++++++++++++++++++------------------ 1 file changed, 187 insertions(+), 172 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 13c8698..c5638d0 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -2079,14 +2079,16 @@ (define_insn "aarch64_mlal_hi_n_insn" [(set (match_operand: 0 "register_operand" "=w") - (plus: - (mult: - (ANY_EXTEND: (vec_select: - (match_operand:VQ_HSI 2 "register_operand" "w") - (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" ""))) - (ANY_EXTEND: (vec_duplicate: - (match_operand: 4 "register_operand" "")))) - (match_operand: 1 "register_operand" "0")))] + (plus: + (mult: + (ANY_EXTEND: + (vec_select: + (match_operand:VQ_HSI 2 "register_operand" "w") + (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" ""))) + (vec_duplicate: + (ANY_EXTEND: + (match_operand: 4 "register_operand" "")))) + (match_operand: 1 "register_operand" "0")))] "TARGET_SIMD" "mlal2\t%0., %2., %4.[0]" [(set_attr "type" "neon_mla__long")] @@ -2154,14 +2156,16 @@ (define_insn "aarch64_mlsl_hi_n_insn" [(set (match_operand: 0 "register_operand" "=w") - (minus: - (match_operand: 1 "register_operand" "0") - (mult: - (ANY_EXTEND: (vec_select: - (match_operand:VQ_HSI 2 "register_operand" "w") - (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" ""))) - (ANY_EXTEND: (vec_duplicate: - (match_operand: 4 "register_operand" ""))))))] + (minus: + (match_operand: 1 "register_operand" "0") + (mult: + (ANY_EXTEND: + (vec_select: + (match_operand:VQ_HSI 2 "register_operand" "w") + (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" ""))) + (vec_duplicate: + (ANY_EXTEND: + (match_operand: 4 "register_operand" ""))))))] "TARGET_SIMD" "mlsl2\t%0., %2., %4.[0]" [(set_attr "type" "neon_mla__long")] @@ -2197,14 +2201,14 @@ (define_insn "aarch64_mlal_n" [(set (match_operand: 0 "register_operand" "=w") - (plus: - (mult: - (ANY_EXTEND: - (match_operand:VD_HSI 2 "register_operand" "w")) - (ANY_EXTEND: - (vec_duplicate:VD_HSI - (match_operand: 3 "register_operand" "")))) - (match_operand: 1 "register_operand" "0")))] + (plus: + (mult: + (ANY_EXTEND: + (match_operand:VD_HSI 2 "register_operand" "w")) + (vec_duplicate: + (ANY_EXTEND: + (match_operand: 3 "register_operand" "")))) + (match_operand: 1 "register_operand" "0")))] "TARGET_SIMD" "mlal\t%0., %2., %3.[0]" [(set_attr "type" "neon_mla__long")] @@ -2226,14 +2230,14 @@ (define_insn "aarch64_mlsl_n" [(set (match_operand: 0 "register_operand" "=w") - (minus: - (match_operand: 1 "register_operand" "0") - (mult: - (ANY_EXTEND: - (match_operand:VD_HSI 2 "register_operand" "w")) - (ANY_EXTEND: - (vec_duplicate:VD_HSI - (match_operand: 3 "register_operand" ""))))))] + (minus: + (match_operand: 1 "register_operand" "0") + (mult: + (ANY_EXTEND: + (match_operand:VD_HSI 2 "register_operand" "w")) + (vec_duplicate: + (ANY_EXTEND: + (match_operand: 3 "register_operand" ""))))))] "TARGET_SIMD" "mlsl\t%0., %2., %3.[0]" [(set_attr "type" "neon_mla__long")] @@ -2311,8 +2315,8 @@ (mult: (ANY_EXTEND: (match_operand: 1 "register_operand" "w")) - (ANY_EXTEND: - (vec_duplicate: + (vec_duplicate: + (ANY_EXTEND: (vec_select: (match_operand:VDQHS 2 "register_operand" "") (parallel [(match_operand:SI 3 "immediate_operand" "i")]))))))] @@ -2327,13 +2331,15 @@ (define_insn "aarch64_mull_hi_lane_insn" [(set (match_operand: 0 "register_operand" "=w") (mult: - (ANY_EXTEND: (vec_select: - (match_operand:VQ_HSI 1 "register_operand" "w") - 
(match_operand:VQ_HSI 2 "vect_par_cnst_hi_half" ""))) - (ANY_EXTEND: (vec_duplicate: - (vec_select: - (match_operand: 3 "register_operand" "") - (parallel [(match_operand:SI 4 "immediate_operand" "i")]))))))] + (ANY_EXTEND: + (vec_select: + (match_operand:VQ_HSI 1 "register_operand" "w") + (match_operand:VQ_HSI 2 "vect_par_cnst_hi_half" ""))) + (vec_duplicate: + (ANY_EXTEND: + (vec_select: + (match_operand: 3 "register_operand" "") + (parallel [(match_operand:SI 4 "immediate_operand" "i")]))))))] "TARGET_SIMD" { operands[4] = aarch64_endian_lane_rtx (mode, INTVAL (operands[4])); @@ -2359,13 +2365,15 @@ (define_insn "aarch64_mull_hi_laneq_insn" [(set (match_operand: 0 "register_operand" "=w") (mult: - (ANY_EXTEND: (vec_select: - (match_operand:VQ_HSI 1 "register_operand" "w") - (match_operand:VQ_HSI 2 "vect_par_cnst_hi_half" ""))) - (ANY_EXTEND: (vec_duplicate: - (vec_select: - (match_operand: 3 "register_operand" "") - (parallel [(match_operand:SI 4 "immediate_operand" "i")]))))))] + (ANY_EXTEND: + (vec_select: + (match_operand:VQ_HSI 1 "register_operand" "w") + (match_operand:VQ_HSI 2 "vect_par_cnst_hi_half" ""))) + (vec_duplicate: + (ANY_EXTEND: + (vec_select: + (match_operand: 3 "register_operand" "") + (parallel [(match_operand:SI 4 "immediate_operand" "i")]))))))] "TARGET_SIMD" { operands[4] = aarch64_endian_lane_rtx (mode, INTVAL (operands[4])); @@ -2390,11 +2398,11 @@ (define_insn "aarch64_mull_n" [(set (match_operand: 0 "register_operand" "=w") - (mult: - (ANY_EXTEND: - (match_operand:VD_HSI 1 "register_operand" "w")) - (ANY_EXTEND: - (vec_duplicate: + (mult: + (ANY_EXTEND: + (match_operand:VD_HSI 1 "register_operand" "w")) + (vec_duplicate: + (ANY_EXTEND: (match_operand: 2 "register_operand" "")))))] "TARGET_SIMD" "mull\t%0., %1., %2.[0]" @@ -2404,11 +2412,12 @@ (define_insn "aarch64_mull_hi_n_insn" [(set (match_operand: 0 "register_operand" "=w") (mult: - (ANY_EXTEND: (vec_select: - (match_operand:VQ_HSI 1 "register_operand" "w") - (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" ""))) (ANY_EXTEND: - (vec_duplicate: + (vec_select: + (match_operand:VQ_HSI 1 "register_operand" "w") + (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" ""))) + (vec_duplicate: + (ANY_EXTEND: (match_operand: 2 "register_operand" "")))))] "TARGET_SIMD" "mull2\\t%0., %1., %2.[0]" @@ -2435,8 +2444,8 @@ (mult: (ANY_EXTEND: (match_operand: 2 "register_operand" "w")) - (ANY_EXTEND: - (vec_duplicate: + (vec_duplicate: + (ANY_EXTEND: (vec_select: (match_operand:VDQHS 3 "register_operand" "") (parallel [(match_operand:SI 4 "immediate_operand" "i")]))))) @@ -2453,13 +2462,15 @@ [(set (match_operand: 0 "register_operand" "=w") (plus: (mult: - (ANY_EXTEND: (vec_select: - (match_operand:VQ_HSI 2 "register_operand" "w") - (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" ""))) - (ANY_EXTEND: (vec_duplicate: - (vec_select: - (match_operand: 4 "register_operand" "") - (parallel [(match_operand:SI 5 "immediate_operand" "i")]))))) + (ANY_EXTEND: + (vec_select: + (match_operand:VQ_HSI 2 "register_operand" "w") + (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" ""))) + (vec_duplicate: + (ANY_EXTEND: + (vec_select: + (match_operand: 4 "register_operand" "") + (parallel [(match_operand:SI 5 "immediate_operand" "i")]))))) (match_operand: 1 "register_operand" "0")))] "TARGET_SIMD" { @@ -2488,13 +2499,15 @@ [(set (match_operand: 0 "register_operand" "=w") (plus: (mult: - (ANY_EXTEND: (vec_select: - (match_operand:VQ_HSI 2 "register_operand" "w") - (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" ""))) - (ANY_EXTEND: (vec_duplicate: - 
(vec_select: - (match_operand: 4 "register_operand" "") - (parallel [(match_operand:SI 5 "immediate_operand" "i")]))))) + (ANY_EXTEND: + (vec_select: + (match_operand:VQ_HSI 2 "register_operand" "w") + (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" ""))) + (vec_duplicate: + (ANY_EXTEND: + (vec_select: + (match_operand: 4 "register_operand" "") + (parallel [(match_operand:SI 5 "immediate_operand" "i")]))))) (match_operand: 1 "register_operand" "0")))] "TARGET_SIMD" { @@ -2526,8 +2539,8 @@ (mult: (ANY_EXTEND: (match_operand: 2 "register_operand" "w")) - (ANY_EXTEND: - (vec_duplicate: + (vec_duplicate: + (ANY_EXTEND: (vec_select: (match_operand:VDQHS 3 "register_operand" "") (parallel [(match_operand:SI 4 "immediate_operand" "i")])))))))] @@ -2544,13 +2557,15 @@ (minus: (match_operand: 1 "register_operand" "0") (mult: - (ANY_EXTEND: (vec_select: - (match_operand:VQ_HSI 2 "register_operand" "w") - (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" ""))) - (ANY_EXTEND: (vec_duplicate: - (vec_select: - (match_operand: 4 "register_operand" "") - (parallel [(match_operand:SI 5 "immediate_operand" "i")])))) + (ANY_EXTEND: + (vec_select: + (match_operand:VQ_HSI 2 "register_operand" "w") + (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" ""))) + (vec_duplicate: + (ANY_EXTEND: + (vec_select: + (match_operand: 4 "register_operand" "") + (parallel [(match_operand:SI 5 "immediate_operand" "i")])))) )))] "TARGET_SIMD" { @@ -2580,13 +2595,15 @@ (minus: (match_operand: 1 "register_operand" "0") (mult: - (ANY_EXTEND: (vec_select: - (match_operand:VQ_HSI 2 "register_operand" "w") - (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" ""))) - (ANY_EXTEND: (vec_duplicate: - (vec_select: - (match_operand: 4 "register_operand" "") - (parallel [(match_operand:SI 5 "immediate_operand" "i")])))) + (ANY_EXTEND: + (vec_select: + (match_operand:VQ_HSI 2 "register_operand" "w") + (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" ""))) + (vec_duplicate: + (ANY_EXTEND: + (vec_select: + (match_operand: 4 "register_operand" "") + (parallel [(match_operand:SI 5 "immediate_operand" "i")])))) )))] "TARGET_SIMD" { @@ -5313,12 +5330,12 @@ (mult: (sign_extend: (match_operand:VD_HSI 2 "register_operand" "w")) - (sign_extend: - (vec_duplicate:VD_HSI + (vec_duplicate: + (sign_extend: (vec_select: (match_operand: 3 "register_operand" "") (parallel [(match_operand:SI 4 "immediate_operand" "i")]))) - )) + )) (const_int 1)) (match_operand: 1 "register_operand" "0")))] "TARGET_SIMD" @@ -5338,12 +5355,12 @@ (mult: (sign_extend: (match_operand:VD_HSI 2 "register_operand" "w")) - (sign_extend: - (vec_duplicate:VD_HSI + (vec_duplicate: + (sign_extend: (vec_select: (match_operand: 3 "register_operand" "") (parallel [(match_operand:SI 4 "immediate_operand" "i")]))) - )) + )) (const_int 1))))] "TARGET_SIMD" { @@ -5363,12 +5380,12 @@ (mult: (sign_extend: (match_operand:VD_HSI 2 "register_operand" "w")) - (sign_extend: - (vec_duplicate:VD_HSI + (vec_duplicate: + (sign_extend: (vec_select: (match_operand: 3 "register_operand" "") (parallel [(match_operand:SI 4 "immediate_operand" "i")]))) - )) + )) (const_int 1))))] "TARGET_SIMD" { @@ -5386,12 +5403,12 @@ (mult: (sign_extend: (match_operand:VD_HSI 2 "register_operand" "w")) - (sign_extend: - (vec_duplicate:VD_HSI + (vec_duplicate: + (sign_extend: (vec_select: (match_operand: 3 "register_operand" "") (parallel [(match_operand:SI 4 "immediate_operand" "i")]))) - )) + )) (const_int 1)) (match_operand: 1 "register_operand" "0")))] "TARGET_SIMD" @@ -5507,8 +5524,8 @@ (mult: (sign_extend: (match_operand:VD_HSI 2 
"register_operand" "w")) - (sign_extend: - (vec_duplicate:VD_HSI + (vec_duplicate: + (sign_extend: (match_operand: 3 "register_operand" "")))) (const_int 1))))] "TARGET_SIMD" @@ -5523,8 +5540,8 @@ (mult: (sign_extend: (match_operand:VD_HSI 2 "register_operand" "w")) - (sign_extend: - (vec_duplicate:VD_HSI + (vec_duplicate: + (sign_extend: (match_operand: 3 "register_operand" "")))) (const_int 1)) (match_operand: 1 "register_operand" "0")))] @@ -5601,11 +5618,11 @@ (ss_ashift: (mult: (sign_extend: - (vec_select: - (match_operand:VQ_HSI 2 "register_operand" "w") - (match_operand:VQ_HSI 5 "vect_par_cnst_hi_half" ""))) - (sign_extend: - (vec_duplicate: + (vec_select: + (match_operand:VQ_HSI 2 "register_operand" "w") + (match_operand:VQ_HSI 5 "vect_par_cnst_hi_half" ""))) + (vec_duplicate: + (sign_extend: (vec_select: (match_operand: 3 "register_operand" "") (parallel [(match_operand:SI 4 "immediate_operand" "i")]) @@ -5622,15 +5639,15 @@ (define_insn "aarch64_sqdmlal2_lane_internal" [(set (match_operand: 0 "register_operand" "=w") - (ss_plus: + (ss_plus: (ss_ashift: (mult: (sign_extend: - (vec_select: - (match_operand:VQ_HSI 2 "register_operand" "w") - (match_operand:VQ_HSI 5 "vect_par_cnst_hi_half" ""))) - (sign_extend: - (vec_duplicate: + (vec_select: + (match_operand:VQ_HSI 2 "register_operand" "w") + (match_operand:VQ_HSI 5 "vect_par_cnst_hi_half" ""))) + (vec_duplicate: + (sign_extend: (vec_select: (match_operand: 3 "register_operand" "") (parallel [(match_operand:SI 4 "immediate_operand" "i")]) @@ -5648,16 +5665,16 @@ (define_insn "aarch64_sqdmlsl2_laneq_internal" [(set (match_operand: 0 "register_operand" "=w") - (ss_minus: + (ss_minus: (match_operand: 1 "register_operand" "0") (ss_ashift: (mult: (sign_extend: - (vec_select: - (match_operand:VQ_HSI 2 "register_operand" "w") - (match_operand:VQ_HSI 5 "vect_par_cnst_hi_half" ""))) - (sign_extend: - (vec_duplicate: + (vec_select: + (match_operand:VQ_HSI 2 "register_operand" "w") + (match_operand:VQ_HSI 5 "vect_par_cnst_hi_half" ""))) + (vec_duplicate: + (sign_extend: (vec_select: (match_operand: 3 "register_operand" "") (parallel [(match_operand:SI 4 "immediate_operand" "i")]) @@ -5674,15 +5691,15 @@ (define_insn "aarch64_sqdmlal2_laneq_internal" [(set (match_operand: 0 "register_operand" "=w") - (ss_plus: + (ss_plus: (ss_ashift: (mult: (sign_extend: - (vec_select: - (match_operand:VQ_HSI 2 "register_operand" "w") - (match_operand:VQ_HSI 5 "vect_par_cnst_hi_half" ""))) - (sign_extend: - (vec_duplicate: + (vec_select: + (match_operand:VQ_HSI 2 "register_operand" "w") + (match_operand:VQ_HSI 5 "vect_par_cnst_hi_half" ""))) + (vec_duplicate: + (sign_extend: (vec_select: (match_operand: 3 "register_operand" "") (parallel [(match_operand:SI 4 "immediate_operand" "i")]) @@ -5734,16 +5751,16 @@ (define_insn "aarch64_sqdmlsl2_n_internal" [(set (match_operand: 0 "register_operand" "=w") - (ss_minus: + (ss_minus: (match_operand: 1 "register_operand" "0") (ss_ashift: (mult: (sign_extend: - (vec_select: - (match_operand:VQ_HSI 2 "register_operand" "w") - (match_operand:VQ_HSI 4 "vect_par_cnst_hi_half" ""))) - (sign_extend: - (vec_duplicate: + (vec_select: + (match_operand:VQ_HSI 2 "register_operand" "w") + (match_operand:VQ_HSI 4 "vect_par_cnst_hi_half" ""))) + (vec_duplicate: + (sign_extend: (match_operand: 3 "register_operand" "")))) (const_int 1))))] "TARGET_SIMD" @@ -5753,15 +5770,15 @@ (define_insn "aarch64_sqdmlal2_n_internal" [(set (match_operand: 0 "register_operand" "=w") - (ss_plus: + (ss_plus: (ss_ashift: (mult: (sign_extend: - (vec_select: 
- (match_operand:VQ_HSI 2 "register_operand" "w") - (match_operand:VQ_HSI 4 "vect_par_cnst_hi_half" ""))) - (sign_extend: - (vec_duplicate: + (vec_select: + (match_operand:VQ_HSI 2 "register_operand" "w") + (match_operand:VQ_HSI 4 "vect_par_cnst_hi_half" ""))) + (vec_duplicate: + (sign_extend: (match_operand: 3 "register_operand" "")))) (const_int 1)) (match_operand: 1 "register_operand" "0")))] @@ -5806,13 +5823,13 @@ (define_insn "aarch64_sqdmull_lane" [(set (match_operand: 0 "register_operand" "=w") - (ss_ashift: + (ss_ashift: (mult: (sign_extend: (match_operand:VD_HSI 1 "register_operand" "w")) - (sign_extend: - (vec_duplicate:VD_HSI - (vec_select: + (vec_duplicate: + (sign_extend: + (vec_select: (match_operand: 2 "register_operand" "") (parallel [(match_operand:SI 3 "immediate_operand" "i")]))) )) @@ -5827,13 +5844,13 @@ (define_insn "aarch64_sqdmull_laneq" [(set (match_operand: 0 "register_operand" "=w") - (ss_ashift: + (ss_ashift: (mult: (sign_extend: (match_operand:VD_HSI 1 "register_operand" "w")) - (sign_extend: - (vec_duplicate:VD_HSI - (vec_select: + (vec_duplicate: + (sign_extend: + (vec_select: (match_operand: 2 "register_operand" "") (parallel [(match_operand:SI 3 "immediate_operand" "i")]))) )) @@ -5890,13 +5907,13 @@ (define_insn "aarch64_sqdmull_n" [(set (match_operand: 0 "register_operand" "=w") - (ss_ashift: + (ss_ashift: (mult: (sign_extend: (match_operand:VD_HSI 1 "register_operand" "w")) - (sign_extend: - (vec_duplicate:VD_HSI - (match_operand: 2 "register_operand" ""))) + (vec_duplicate: + (sign_extend: + (match_operand: 2 "register_operand" ""))) ) (const_int 1)))] "TARGET_SIMD" @@ -5906,8 +5923,6 @@ ;; vqdmull2 - - (define_insn "aarch64_sqdmull2_internal" [(set (match_operand: 0 "register_operand" "=w") (ss_ashift: @@ -5943,15 +5958,15 @@ (define_insn "aarch64_sqdmull2_lane_internal" [(set (match_operand: 0 "register_operand" "=w") - (ss_ashift: + (ss_ashift: (mult: (sign_extend: (vec_select: - (match_operand:VQ_HSI 1 "register_operand" "w") - (match_operand:VQ_HSI 4 "vect_par_cnst_hi_half" ""))) - (sign_extend: - (vec_duplicate: - (vec_select: + (match_operand:VQ_HSI 1 "register_operand" "w") + (match_operand:VQ_HSI 4 "vect_par_cnst_hi_half" ""))) + (vec_duplicate: + (sign_extend: + (vec_select: (match_operand: 2 "register_operand" "") (parallel [(match_operand:SI 3 "immediate_operand" "i")]))) )) @@ -5966,15 +5981,15 @@ (define_insn "aarch64_sqdmull2_laneq_internal" [(set (match_operand: 0 "register_operand" "=w") - (ss_ashift: + (ss_ashift: (mult: (sign_extend: (vec_select: - (match_operand:VQ_HSI 1 "register_operand" "w") - (match_operand:VQ_HSI 4 "vect_par_cnst_hi_half" ""))) - (sign_extend: - (vec_duplicate: - (vec_select: + (match_operand:VQ_HSI 1 "register_operand" "w") + (match_operand:VQ_HSI 4 "vect_par_cnst_hi_half" ""))) + (vec_duplicate: + (sign_extend: + (vec_select: (match_operand: 2 "register_operand" "") (parallel [(match_operand:SI 3 "immediate_operand" "i")]))) )) @@ -6019,15 +6034,15 @@ (define_insn "aarch64_sqdmull2_n_internal" [(set (match_operand: 0 "register_operand" "=w") - (ss_ashift: + (ss_ashift: (mult: (sign_extend: (vec_select: - (match_operand:VQ_HSI 1 "register_operand" "w") - (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" ""))) - (sign_extend: - (vec_duplicate: - (match_operand: 2 "register_operand" ""))) + (match_operand:VQ_HSI 1 "register_operand" "w") + (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" ""))) + (vec_duplicate: + (sign_extend: + (match_operand: 2 "register_operand" ""))) ) (const_int 1)))] "TARGET_SIMD" -- cgit 
v1.1 From 5b58057b6e7b0d1551907725da515ea18179010d Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Tue, 27 Jul 2021 11:31:20 -0400 Subject: rs6000: Write output to the builtins init file, part 3 of 3 2021-07-27 Bill Schmidt gcc/ * config/rs6000/rs6000-gen-builtins.c (typemap): New struct. (TYPE_MAP_SIZE): New macro. (type_map): New initialized variable. (typemap_cmp): New function. (write_type_node): Likewise. (write_fntype_init): Implement. --- gcc/config/rs6000/rs6000-gen-builtins.c | 163 ++++++++++++++++++++++++++++++++ 1 file changed, 163 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-gen-builtins.c b/gcc/config/rs6000/rs6000-gen-builtins.c index da0d14e..59b0632 100644 --- a/gcc/config/rs6000/rs6000-gen-builtins.c +++ b/gcc/config/rs6000/rs6000-gen-builtins.c @@ -464,6 +464,108 @@ static rbt_strings fntype_rbt; identifiers to the order in which they were encountered. */ static rbt_strings bifo_rbt; +/* Mapping from type tokens to type node names. */ +struct typemap +{ + const char *key; + const char *value; +}; + +/* This table must be kept in alphabetical order, as we use binary + search for table lookups in map_token_to_type_node. The table + maps tokens from a fntype string to a tree type. For example, + in "si_ftype_hi" we would map "si" to "intSI_type_node" and + map "hi" to "intHI_type_node". */ +#define TYPE_MAP_SIZE 86 +static typemap type_map[TYPE_MAP_SIZE] = + { + { "bi", "bool_int" }, + { "bv16qi", "bool_V16QI" }, + { "bv1ti", "bool_V1TI" }, + { "bv2di", "bool_V2DI" }, + { "bv4si", "bool_V4SI" }, + { "bv8hi", "bool_V8HI" }, + { "ci", "integer" }, + { "dd", "dfloat64" }, + { "df", "double" }, + { "di", "long_long_integer" }, + { "hi", "intHI" }, + { "if", "ibm128_float" }, + { "ld", "long_double" }, + { "lg", "long_integer" }, + { "pbv16qi", "ptr_bool_V16QI" }, + { "pbv1ti", "ptr_bool_V1TI" }, + { "pbv2di", "ptr_bool_V2DI" }, + { "pbv4si", "ptr_bool_V4SI" }, + { "pbv8hi", "ptr_bool_V8HI" }, + { "pcvoid", "pcvoid" }, + { "pdd", "ptr_dfloat64" }, + { "pdf", "ptr_double" }, + { "pdi", "ptr_long_long_integer" }, + { "phi", "ptr_intHI" }, + { "pif", "ptr_ibm128_float" }, + { "pld", "ptr_long_double" }, + { "plg", "ptr_long_integer" }, + { "pqi", "ptr_intQI" }, + { "psf", "ptr_float" }, + { "psi", "ptr_intSI" }, + { "ptd", "ptr_dfloat128" }, + { "ptf", "ptr_float128" }, + { "pti", "ptr_intTI" }, + { "pudi", "ptr_long_long_unsigned" }, + { "puhi", "ptr_uintHI" }, + { "pulg", "ptr_long_unsigned" }, + { "puqi", "ptr_uintQI" }, + { "pusi", "ptr_uintSI" }, + { "puti", "ptr_uintTI" }, + { "puv16qi", "ptr_unsigned_V16QI" }, + { "puv1ti", "ptr_unsigned_V1TI" }, + { "puv2di", "ptr_unsigned_V2DI" }, + { "puv4si", "ptr_unsigned_V4SI" }, + { "puv8hi", "ptr_unsigned_V8HI" }, + { "pv", "ptr" }, + { "pv16qi", "ptr_V16QI" }, + { "pv1poi", "ptr_vector_pair" }, + { "pv1pxi", "ptr_vector_quad" }, + { "pv1ti", "ptr_V1TI" }, + { "pv2df", "ptr_V2DF" }, + { "pv2di", "ptr_V2DI" }, + { "pv4sf", "ptr_V4SF" }, + { "pv4si", "ptr_V4SI" }, + { "pv8hi", "ptr_V8HI" }, + { "pvp8hi", "ptr_pixel_V8HI" }, + { "qi", "intQI" }, + { "sd", "dfloat32" }, + { "sf", "float" }, + { "si", "intSI" }, + { "st", "const_str" }, + { "td", "dfloat128" }, + { "tf", "float128" }, + { "ti", "intTI" }, + { "udi", "long_long_unsigned" }, + { "uhi", "unsigned_intHI" }, + { "ulg", "long_unsigned" }, + { "uqi", "unsigned_intQI" }, + { "usi", "unsigned_intSI" }, + { "uti", "unsigned_intTI" }, + { "uv16qi", "unsigned_V16QI" }, + { "uv1ti", "unsigned_V1TI" }, + { "uv2di", "unsigned_V2DI" }, + { "uv4si", 
"unsigned_V4SI" }, + { "uv8hi", "unsigned_V8HI" }, + { "v", "void" }, + { "v16qi", "V16QI" }, + { "v1poi", "vector_pair" }, + { "v1pxi", "vector_quad" }, + { "v1ti", "V1TI" }, + { "v2df", "V2DF" }, + { "v2di", "V2DI" }, + { "v4sf", "V4SF" }, + { "v4si", "V4SI" }, + { "v8hi", "V8HI" }, + { "vp8hi", "pixel_V8HI" }, + }; + /* Pointer to a diagnostic function. */ static void (*diag) (const char *, ...) __attribute__ ((format (printf, 1, 2))); @@ -2219,10 +2321,71 @@ write_fntype (char *str) fprintf (init_file, "tree %s;\n", str); } +/* Comparator for bsearch on the type map. */ +int +typemap_cmp (const void *key, const void *entry) +{ + return strcmp ((const char *)key, ((const typemap *)entry)->key); +} + +/* Write the type node corresponding to TOK. */ +static void +write_type_node (char *tok, bool indent) +{ + if (indent) + fprintf (init_file, " "); + typemap *entry = (typemap *) bsearch (tok, type_map, TYPE_MAP_SIZE, + sizeof (typemap), typemap_cmp); + if (!entry) + fatal ("Type map is inconsistent."); + fprintf (init_file, "%s_type_node", entry->value); +} + /* Write an initializer for a function type identified by STR. */ void write_fntype_init (char *str) { + char *tok; + + /* Check whether we have a "tf" token in this string, representing + a float128_type_node. It's possible that float128_type_node is + undefined (occurs for -maltivec -mno-vsx, for example), so we + must guard against that. */ + int tf_found = strstr (str, "tf") != NULL; + + /* Similarly, look for decimal float tokens. */ + int dfp_found = (strstr (str, "dd") != NULL + || strstr (str, "td") != NULL + || strstr (str, "sd") != NULL); + + /* Avoid side effects of strtok on the original string by using a copy. */ + char *buf = strdup (str); + + if (tf_found) + fprintf (init_file, " if (float128_type_node)\n "); + else if (dfp_found) + fprintf (init_file, " if (dfloat64_type_node)\n "); + + fprintf (init_file, " %s\n = build_function_type_list (", buf); + tok = strtok (buf, "_"); + write_type_node (tok, tf_found || dfp_found); + tok = strtok (0, "_"); + assert (tok); + assert (!strcmp (tok, "ftype")); + + tok = strtok (0, "_"); + if (tok) + fprintf (init_file, ",\n\t\t\t\t"); + + /* Note: A function with no arguments ends with '_ftype_v'. */ + while (tok && strcmp (tok, "v")) + { + write_type_node (tok, tf_found || dfp_found); + tok = strtok (0, "_"); + fprintf (init_file, ",\n\t\t\t\t"); + } + fprintf (init_file, "NULL_TREE);\n"); + free (buf); } /* Write everything to the header file (rs6000-builtins.h). Return -- cgit v1.1 From bb4d8febb3660965cc4617f2a91be0014236faf7 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Mon, 26 Jul 2021 23:04:44 -0400 Subject: rs6000: Write static initializations for built-in table 2021-07-26 Bill Schmidt gcc/ * config/rs6000/rs6000-gen-builtins.c (write_bif_static_init): New function. (write_init_file): Call write_bif_static_init. --- gcc/config/rs6000/rs6000-gen-builtins.c | 112 ++++++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-gen-builtins.c b/gcc/config/rs6000/rs6000-gen-builtins.c index 59b0632..1d7f744 100644 --- a/gcc/config/rs6000/rs6000-gen-builtins.c +++ b/gcc/config/rs6000/rs6000-gen-builtins.c @@ -2409,6 +2409,116 @@ write_header_file (void) return 1; } +/* Write the decl and initializer for rs6000_builtin_info_x[]. 
*/ +static void +write_bif_static_init (void) +{ + const char *res[3]; + fprintf (init_file, "bifdata rs6000_builtin_info_x[RS6000_BIF_MAX] =\n"); + fprintf (init_file, " {\n"); + fprintf (init_file, " { /* RS6000_BIF_NONE: */\n"); + fprintf (init_file, " \"\", ENB_ALWAYS, 0, CODE_FOR_nothing, 0,\n"); + fprintf (init_file, " 0, {0, 0, 0}, {RES_NONE, RES_NONE, RES_NONE},\n"); + fprintf (init_file, " {0, 0, 0}, {0, 0, 0}, \"\", RS6000_BIF_NONE\n"); + fprintf (init_file, " },\n"); + for (int i = 0; i <= curr_bif; i++) + { + bifdata *bifp = &bifs[bif_order[i]]; + fprintf (init_file, " { /* RS6000_BIF_%s: */\n", bifp->idname); + fprintf (init_file, " /* bifname */\t\"%s\",\n", + bifp->proto.bifname); + fprintf (init_file, " /* enable*/\t%s,\n", + enable_string[bifp->stanza]); + /* Type must be instantiated at run time. */ + fprintf (init_file, " /* fntype */\t0,\n"); + fprintf (init_file, " /* icode */\tCODE_FOR_%s,\n", + bifp->patname); + fprintf (init_file, " /* nargs */\t%d,\n", + bifp->proto.nargs); + fprintf (init_file, " /* bifattrs */\t0"); + if (bifp->attrs.isinit) + fprintf (init_file, " | bif_init_bit"); + if (bifp->attrs.isset) + fprintf (init_file, " | bif_set_bit"); + if (bifp->attrs.isextract) + fprintf (init_file, " | bif_extract_bit"); + if (bifp->attrs.isnosoft) + fprintf (init_file, " | bif_nosoft_bit"); + if (bifp->attrs.isldvec) + fprintf (init_file, " | bif_ldvec_bit"); + if (bifp->attrs.isstvec) + fprintf (init_file, " | bif_stvec_bit"); + if (bifp->attrs.isreve) + fprintf (init_file, " | bif_reve_bit"); + if (bifp->attrs.ispred) + fprintf (init_file, " | bif_pred_bit"); + if (bifp->attrs.ishtm) + fprintf (init_file, " | bif_htm_bit"); + if (bifp->attrs.ishtmspr) + fprintf (init_file, " | bif_htmspr_bit"); + if (bifp->attrs.ishtmcr) + fprintf (init_file, " | bif_htmcr_bit"); + if (bifp->attrs.ismma) + fprintf (init_file, " | bif_mma_bit"); + if (bifp->attrs.isquad) + fprintf (init_file, " | bif_quad_bit"); + if (bifp->attrs.ispair) + fprintf (init_file, " | bif_pair_bit"); + if (bifp->attrs.isno32bit) + fprintf (init_file, " | bif_no32bit_bit"); + if (bifp->attrs.is32bit) + fprintf (init_file, " | bif_32bit_bit"); + if (bifp->attrs.iscpu) + fprintf (init_file, " | bif_cpu_bit"); + if (bifp->attrs.isldstmask) + fprintf (init_file, " | bif_ldstmask_bit"); + if (bifp->attrs.islxvrse) + fprintf (init_file, " | bif_lxvrse_bit"); + if (bifp->attrs.islxvrze) + fprintf (init_file, " | bif_lxvrze_bit"); + if (bifp->attrs.isendian) + fprintf (init_file, " | bif_endian_bit"); + fprintf (init_file, ",\n"); + fprintf (init_file, " /* restr_opnd */\t{%d, %d, %d},\n", + bifp->proto.restr_opnd[0], bifp->proto.restr_opnd[1], + bifp->proto.restr_opnd[2]); + for (int j = 0; j < 3; j++) + if (!bifp->proto.restr_opnd[j]) + res[j] = "RES_NONE"; + else if (bifp->proto.restr[j] == RES_BITS) + res[j] = "RES_BITS"; + else if (bifp->proto.restr[j] == RES_RANGE) + res[j] = "RES_RANGE"; + else if (bifp->proto.restr[j] == RES_VALUES) + res[j] = "RES_VALUES"; + else if (bifp->proto.restr[j] == RES_VAR_RANGE) + res[j] = "RES_VAR_RANGE"; + else + res[j] = "ERROR"; + fprintf (init_file, " /* restr */\t{%s, %s, %s},\n", + res[0], res[1], res[2]); + fprintf (init_file, " /* restr_val1 */\t{%s, %s, %s},\n", + bifp->proto.restr_val1[0] ? bifp->proto.restr_val1[0] : "0", + bifp->proto.restr_val1[1] ? bifp->proto.restr_val1[1] : "0", + bifp->proto.restr_val1[2] ? bifp->proto.restr_val1[2] : "0"); + fprintf (init_file, " /* restr_val2 */\t{%s, %s, %s},\n", + bifp->proto.restr_val2[0] ? 
bifp->proto.restr_val2[0] : "0", + bifp->proto.restr_val2[1] ? bifp->proto.restr_val2[1] : "0", + bifp->proto.restr_val2[2] ? bifp->proto.restr_val2[2] : "0"); + fprintf (init_file, " /* attr_string */\t\"%s\",\n", + (bifp->kind == FNK_CONST ? "= const" + : (bifp->kind == FNK_PURE ? "= pure" + : (bifp->kind == FNK_FPMATH ? "= fp, const" + : "")))); + bool no_icode = !strcmp (bifp->patname, "nothing"); + fprintf (init_file, " /* assoc_bif */\tRS6000_BIF_%s%s\n", + bifp->attrs.ismma && no_icode ? bifp->idname : "NONE", + bifp->attrs.ismma && no_icode ? "_INTERNAL" : ""); + fprintf (init_file, " },\n"); + } + fprintf (init_file, " };\n\n"); +} + /* Write code to initialize the built-in function table. */ static void write_init_bif_table (void) @@ -2598,6 +2708,8 @@ write_init_file (void) fprintf (init_file, "tree rs6000_builtin_decls_x[RS6000_OVLD_MAX];\n\n"); + write_bif_static_init (); + rbt_inorder_callback (&fntype_rbt, fntype_rbt.rbt_root, write_fntype); fprintf (init_file, "\n"); -- cgit v1.1 From 7590016ba956de8d036138e3b4dd2435ce625ba2 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Mon, 26 Jul 2021 23:07:19 -0400 Subject: rs6000: Write static initializations for overload tables 2021-06-07 Bill Schmidt gcc/ * config/rs6000/rs6000-gen-builtins.c (write_ovld_static_init): New function. (write_init_file): Call write_ovld_static_init. --- gcc/config/rs6000/rs6000-gen-builtins.c | 53 +++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-gen-builtins.c b/gcc/config/rs6000/rs6000-gen-builtins.c index 1d7f744..e5d3b71 100644 --- a/gcc/config/rs6000/rs6000-gen-builtins.c +++ b/gcc/config/rs6000/rs6000-gen-builtins.c @@ -2519,6 +2519,58 @@ write_bif_static_init (void) fprintf (init_file, " };\n\n"); } +/* Write the decls and initializers for rs6000_overload_info[] and + rs6000_instance_info[]. */ +static void +write_ovld_static_init (void) +{ + fprintf (init_file, + "ovldrecord rs6000_overload_info[RS6000_OVLD_MAX " + "- RS6000_OVLD_NONE] =\n"); + fprintf (init_file, " {\n"); + fprintf (init_file, " { /* RS6000_OVLD_NONE: */\n"); + fprintf (init_file, " \"\", NULL\n"); + fprintf (init_file, " },\n"); + for (int i = 0; i <= curr_ovld_stanza; i++) + { + fprintf (init_file, " { /* RS6000_OVLD_%s: */\n", + ovld_stanzas[i].stanza_id); + fprintf (init_file, " /* ovld_name */\t\"%s\",\n", + ovld_stanzas[i].intern_name); + /* First-instance must currently be instantiated at run time. */ + fprintf (init_file, " /* first_instance */\tNULL\n"); + fprintf (init_file, " },\n"); + } + fprintf (init_file, " };\n\n"); + + fprintf (init_file, "ovlddata rs6000_instance_info[RS6000_INST_MAX] =\n"); + fprintf (init_file, " {\n"); + fprintf (init_file, " { /* RS6000_INST_NONE: */\n"); + fprintf (init_file, " \"\", RS6000_BIF_NONE, NULL_TREE, NULL\n"); + fprintf (init_file, " },\n"); + for (int i = 0; i <= curr_ovld; i++) + { + fprintf (init_file, " { /* RS6000_INST_%s: */\n", + ovlds[i].ovld_id_name); + fprintf (init_file, " /* bifname */\t\"%s\",\n", + ovlds[i].proto.bifname); + fprintf (init_file, " /* bifid */\tRS6000_BIF_%s,\n", + ovlds[i].bif_id_name); + /* Type must be instantiated at run time. 
*/ + fprintf (init_file, " /* fntype */\t0,\n"); + fprintf (init_file, " /* next */\t"); + if (i < curr_ovld + && !strcmp (ovlds[i+1].proto.bifname, ovlds[i].proto.bifname)) + fprintf (init_file, + "&rs6000_instance_info[RS6000_INST_%s]\n", + ovlds[i+1].ovld_id_name); + else + fprintf (init_file, "NULL\n"); + fprintf (init_file, " },\n"); + } + fprintf (init_file, " };\n\n"); +} + /* Write code to initialize the built-in function table. */ static void write_init_bif_table (void) @@ -2709,6 +2761,7 @@ write_init_file (void) fprintf (init_file, "tree rs6000_builtin_decls_x[RS6000_OVLD_MAX];\n\n"); write_bif_static_init (); + write_ovld_static_init (); rbt_inorder_callback (&fntype_rbt, fntype_rbt.rbt_root, write_fntype); fprintf (init_file, "\n"); -- cgit v1.1 From 872da9a6f664a06d73c987aa0cb2e5b830158a10 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Fri, 26 Mar 2021 10:56:47 +0800 Subject: Add the member integer_to_sse to processor_cost as a cost simulation for movd/pinsrd. It will be used to calculate the cost of vec_construct. gcc/ChangeLog: PR target/99881 * config/i386/i386.h (processor_costs): Add new member integer_to_sse. * config/i386/x86-tune-costs.h (ix86_size_cost, i386_cost, i486_cost, pentium_cost, lakemont_cost, pentiumpro_cost, geode_cost, k6_cost, athlon_cost, k8_cost, amdfam10_cost, bdver_cost, znver1_cost, znver2_cost, znver3_cost, btver1_cost, btver2_cost, btver3_cost, pentium4_cost, nocona_cost, atom_cost, atom_cost, slm_cost, intel_cost, generic_cost, core_cost): Initialize integer_to_sse same value as sse_op. (skylake_cost): Initialize integer_to_sse twice as much as sse_op. * config/i386/i386.c (ix86_builtin_vectorization_cost): Use integer_to_sse instead of sse_op to calculate the cost of vec_construct. gcc/testsuite/ChangeLog: PR target/99881 * gcc.target/i386/pr99881.c: New test. --- gcc/config/i386/i386.c | 6 +++++- gcc/config/i386/i386.h | 1 + gcc/config/i386/x86-tune-costs.h | 26 ++++++++++++++++++++++++++ 3 files changed, 32 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 876a19f..ac59ebf 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -22051,7 +22051,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, case vec_construct: { /* N element inserts into SSE vectors. */ - int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op; + int cost + = TYPE_VECTOR_SUBPARTS (vectype) * (fp ? + ix86_cost->sse_op + : ix86_cost->integer_to_sse); + /* One vinserti128 for combining two SSE vectors for AVX256. */ if (GET_MODE_BITSIZE (mode) == 256) cost += ix86_vec_cost (mode, ix86_cost->addss); diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 0c2c93d..d1e1c22 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -165,6 +165,7 @@ struct processor_costs { const int xmm_move, ymm_move, /* cost of moving XMM and YMM register. */ zmm_move; const int sse_to_integer; /* cost of moving SSE register to integer. */ + const int integer_to_sse; /* cost of moving integer to SSE register. */ const int gather_static, gather_per_elt; /* Cost of gather load is computed as static + per_item * nelts. 
*/ const int scatter_static, scatter_per_elt; /* Cost of gather store is diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h index ffe810f..67cfa00 100644 --- a/gcc/config/i386/x86-tune-costs.h +++ b/gcc/config/i386/x86-tune-costs.h @@ -102,6 +102,7 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */ in 128bit, 256bit and 512bit */ 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */ 3, /* cost of moving SSE register to integer. */ + COSTS_N_BYTES (2), /* cost of moving integer to sse register. */ 5, 0, /* Gather load static, per_elt. */ 5, 0, /* Gather store static, per_elt. */ 0, /* size of l1 cache */ @@ -211,6 +212,7 @@ struct processor_costs i386_cost = { /* 386 specific costs */ {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 3, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ 4, 4, /* Gather load static, per_elt. */ 4, 4, /* Gather store static, per_elt. */ 0, /* size of l1 cache */ @@ -319,6 +321,7 @@ struct processor_costs i486_cost = { /* 486 specific costs */ {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 3, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ 4, 4, /* Gather load static, per_elt. */ 4, 4, /* Gather store static, per_elt. */ 4, /* size of l1 cache. 486 has 8kB cache @@ -429,6 +432,7 @@ struct processor_costs pentium_cost = { {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 3, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ 4, 4, /* Gather load static, per_elt. */ 4, 4, /* Gather store static, per_elt. */ 8, /* size of l1 cache. */ @@ -530,6 +534,7 @@ struct processor_costs lakemont_cost = { {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 3, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ 4, 4, /* Gather load static, per_elt. */ 4, 4, /* Gather store static, per_elt. */ 8, /* size of l1 cache. */ @@ -646,6 +651,7 @@ struct processor_costs pentiumpro_cost = { {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 3, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ 4, 4, /* Gather load static, per_elt. */ 4, 4, /* Gather store static, per_elt. */ 8, /* size of l1 cache. */ @@ -753,6 +759,7 @@ struct processor_costs geode_cost = { {2, 2, 8, 16, 32}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 6, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ 2, 2, /* Gather load static, per_elt. */ 2, 2, /* Gather store static, per_elt. */ 64, /* size of l1 cache. */ @@ -860,6 +867,7 @@ struct processor_costs k6_cost = { {2, 2, 8, 16, 32}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 6, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ 2, 2, /* Gather load static, per_elt. */ 2, 2, /* Gather store static, per_elt. */ 32, /* size of l1 cache. */ @@ -973,6 +981,7 @@ struct processor_costs athlon_cost = { {4, 4, 10, 10, 20}, /* cost of unaligned stores. 
*/ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 5, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ 4, 4, /* Gather load static, per_elt. */ 4, 4, /* Gather store static, per_elt. */ 64, /* size of l1 cache. */ @@ -1088,6 +1097,7 @@ struct processor_costs k8_cost = { {4, 4, 10, 10, 20}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 5, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ 4, 4, /* Gather load static, per_elt. */ 4, 4, /* Gather store static, per_elt. */ 64, /* size of l1 cache. */ @@ -1216,6 +1226,7 @@ struct processor_costs amdfam10_cost = { {4, 4, 5, 10, 20}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 3, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ 4, 4, /* Gather load static, per_elt. */ 4, 4, /* Gather store static, per_elt. */ 64, /* size of l1 cache. */ @@ -1336,6 +1347,7 @@ const struct processor_costs bdver_cost = { {10, 10, 10, 40, 60}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 16, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ 12, 12, /* Gather load static, per_elt. */ 10, 10, /* Gather store static, per_elt. */ 16, /* size of l1 cache. */ @@ -1477,6 +1489,7 @@ struct processor_costs znver1_cost = { {8, 8, 8, 16, 32}, /* cost of unaligned stores. */ 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */ 6, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops, throughput 12. Approx 9 uops do not depend on vector size and every load is 7 uops. */ @@ -1633,6 +1646,7 @@ struct processor_costs znver2_cost = { 2, 2, 3, /* cost of moving XMM,YMM,ZMM register. */ 6, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops, throughput 12. Approx 9 uops do not depend on vector size and every load is 7 uops. */ @@ -1765,6 +1779,7 @@ struct processor_costs znver3_cost = { 2, 2, 3, /* cost of moving XMM,YMM,ZMM register. */ 6, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ /* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops, throughput 9. Approx 7 uops do not depend on vector size and every load is 4 uops. */ @@ -1909,6 +1924,7 @@ struct processor_costs skylake_cost = { {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ 6, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (2)+1, /* cost of moving integer to sse register. */ 20, 8, /* Gather load static, per_elt. */ 22, 10, /* Gather store static, per_elt. */ 64, /* size of l1 cache. */ @@ -2035,6 +2051,7 @@ struct processor_costs icelake_cost = { {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ 6, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ 20, 8, /* Gather load static, per_elt. */ 22, 10, /* Gather store static, per_elt. */ 64, /* size of l1 cache. 
*/ @@ -2148,6 +2165,7 @@ const struct processor_costs btver1_cost = { {10, 10, 12, 48, 96}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 14, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ 10, 10, /* Gather load static, per_elt. */ 10, 10, /* Gather store static, per_elt. */ 32, /* size of l1 cache. */ @@ -2258,6 +2276,7 @@ const struct processor_costs btver2_cost = { {10, 10, 12, 48, 96}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 14, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ 10, 10, /* Gather load static, per_elt. */ 10, 10, /* Gather store static, per_elt. */ 32, /* size of l1 cache. */ @@ -2367,6 +2386,7 @@ struct processor_costs pentium4_cost = { {32, 32, 32, 64, 128}, /* cost of unaligned stores. */ 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */ 20, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ 16, 16, /* Gather load static, per_elt. */ 16, 16, /* Gather store static, per_elt. */ 8, /* size of l1 cache. */ @@ -2479,6 +2499,7 @@ struct processor_costs nocona_cost = { {24, 24, 24, 48, 96}, /* cost of unaligned stores. */ 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */ 20, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ 12, 12, /* Gather load static, per_elt. */ 12, 12, /* Gather store static, per_elt. */ 8, /* size of l1 cache. */ @@ -2589,6 +2610,7 @@ struct processor_costs atom_cost = { {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 8, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ 8, 8, /* Gather load static, per_elt. */ 8, 8, /* Gather store static, per_elt. */ 32, /* size of l1 cache. */ @@ -2699,6 +2721,7 @@ struct processor_costs slm_cost = { {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 8, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ 8, 8, /* Gather load static, per_elt. */ 8, 8, /* Gather store static, per_elt. */ 32, /* size of l1 cache. */ @@ -2809,6 +2832,7 @@ struct processor_costs intel_cost = { {10, 10, 10, 10, 10}, /* cost of unaligned loads. */ 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */ 4, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ 6, 6, /* Gather load static, per_elt. */ 6, 6, /* Gather store static, per_elt. */ 32, /* size of l1 cache. */ @@ -2926,6 +2950,7 @@ struct processor_costs generic_cost = { {6, 6, 6, 10, 15}, /* cost of unaligned storess. */ 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */ 6, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ 18, 6, /* Gather load static, per_elt. */ 18, 6, /* Gather store static, per_elt. */ 32, /* size of l1 cache. */ @@ -3049,6 +3074,7 @@ struct processor_costs core_cost = { {6, 6, 6, 6, 12}, /* cost of unaligned stores. */ 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ 2, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops, rec. throughput 6. 
So 5 uops statically and one uops per load. */ -- cgit v1.1 From 88d0f70a326eeb42b479aa537f8a81bf5a199346 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Wed, 28 Jul 2021 10:52:51 +0200 Subject: i386: Improve AVX2 expansion of vector >> vector DImode arithm. shifts [PR101611] AVX2 introduced vector >> vector shifts, but unfortunately for V{2,4}DImode it only supports logical and not arithmetic shifts, only AVX512F for V8DImode or AVX512VL for V{2,4}DImode fixed that omission. Earlier in GCC12 cycle I've committed vector >> scalar arithmetic shift emulation using various sequences, this patch handles the vector >> vector case. No need to adjust costs, the previous cost adjustment actually covers even the vector by vector shifts. The patch emits the right arithmetic V{2,4}DImode shifts using 2 logical right V{2,4}DImode shifts (once of the original operands, once of sign mask constant by the vector shift count), xor and subtraction, on each element (long long) x >> y is done as (((unsigned long long) x >> y) ^ (0x8000000000000000ULL >> y)) - (0x8000000000000000ULL >> y) i.e. if x doesn't have in some element the MSB set, it is just the logical shift, if it does, then the xor and subtraction cause also all higher bits to be set. 2021-07-28 Jakub Jelinek PR target/101611 * config/i386/sse.md (vashr3): Split into vashrv8di3 expander and vashrv4di3 expander, where the latter requires just TARGET_AVX2 and has special !TARGET_AVX512VL expansion. (vashrv2di3): Rename to ... (vashrv2di3): ... this. Change condition to TARGET_XOP || TARGET_AVX2 and add special !TARGET_XOP && !TARGET_AVX512VL expansion. * gcc.target/i386/avx2-pr101611-1.c: New test. * gcc.target/i386/avx2-pr101611-2.c: New test. --- gcc/config/i386/sse.md | 47 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 40 insertions(+), 7 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index f8759e4..b5a0898 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -20499,13 +20499,34 @@ (match_operand:VI48_256 2 "nonimmediate_operand")))] "TARGET_AVX2") -(define_expand "vashr3" - [(set (match_operand:VI8_256_512 0 "register_operand") - (ashiftrt:VI8_256_512 - (match_operand:VI8_256_512 1 "register_operand") - (match_operand:VI8_256_512 2 "nonimmediate_operand")))] +(define_expand "vashrv8di3" + [(set (match_operand:V8DI 0 "register_operand") + (ashiftrt:V8DI + (match_operand:V8DI 1 "register_operand") + (match_operand:V8DI 2 "nonimmediate_operand")))] "TARGET_AVX512F") +(define_expand "vashrv4di3" + [(set (match_operand:V4DI 0 "register_operand") + (ashiftrt:V4DI + (match_operand:V4DI 1 "register_operand") + (match_operand:V4DI 2 "nonimmediate_operand")))] + "TARGET_AVX2" +{ + if (!TARGET_AVX512VL) + { + rtx mask = ix86_build_signbit_mask (V4DImode, 1, 0); + rtx t1 = gen_reg_rtx (V4DImode); + rtx t2 = gen_reg_rtx (V4DImode); + rtx t3 = gen_reg_rtx (V4DImode); + emit_insn (gen_vlshrv4di3 (t1, operands[1], operands[2])); + emit_insn (gen_vlshrv4di3 (t2, mask, operands[2])); + emit_insn (gen_xorv4di3 (t3, t1, t2)); + emit_insn (gen_subv4di3 (operands[0], t3, t2)); + DONE; + } +}) + (define_expand "vashr3" [(set (match_operand:VI12_128 0 "register_operand") (ashiftrt:VI12_128 @@ -20527,12 +20548,12 @@ } }) -(define_expand "vashrv2di3" +(define_expand "vashrv2di3" [(set (match_operand:V2DI 0 "register_operand") (ashiftrt:V2DI (match_operand:V2DI 1 "register_operand") (match_operand:V2DI 2 "nonimmediate_operand")))] - "TARGET_XOP || TARGET_AVX512VL" + "TARGET_XOP || 
TARGET_AVX2" { if (TARGET_XOP) { @@ -20541,6 +20562,18 @@ emit_insn (gen_xop_shav2di3 (operands[0], operands[1], neg)); DONE; } + if (!TARGET_AVX512VL) + { + rtx mask = ix86_build_signbit_mask (V2DImode, 1, 0); + rtx t1 = gen_reg_rtx (V2DImode); + rtx t2 = gen_reg_rtx (V2DImode); + rtx t3 = gen_reg_rtx (V2DImode); + emit_insn (gen_vlshrv2di3 (t1, operands[1], operands[2])); + emit_insn (gen_vlshrv2di3 (t2, mask, operands[2])); + emit_insn (gen_xorv2di3 (t3, t1, t2)); + emit_insn (gen_subv2di3 (operands[0], t3, t2)); + DONE; + } }) (define_expand "vashrv4si3" -- cgit v1.1 From b662250c1f5e0e453f1fd6022cc47e517f70fa8e Mon Sep 17 00:00:00 2001 From: Bin Cheng Date: Wed, 28 Jul 2021 17:50:59 +0800 Subject: AArch64: use stable sorting in generating ldp/stp In some corner cases, we have code as below: [base + 0x310] = A [base + 0x320] = B [base + 0x330] = C [base + 0x320] = D unstable sorting could result in wrong value in offset 0x320. The patch fixes it by using gcc_stablesort. 2021-07-28 Bin Cheng * config/aarch64/aarch64.c (aarch64_gen_adjusted_ldpstp): use gcc_stablesort. --- gcc/config/aarch64/aarch64.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 3bdf19d..e211460 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -24441,8 +24441,16 @@ aarch64_gen_adjusted_ldpstp (rtx *operands, bool load, for (int i = 0; i < 8; i ++) temp_operands[i] = operands[i]; - /* Sort the operands. */ - qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare); + /* Sort the operands. Note for cases as below: + [base + 0x310] = A + [base + 0x320] = B + [base + 0x330] = C + [base + 0x320] = D + We need stable sorting otherwise wrong data may be store to offset 0x320. + Also note the dead store in above case should be optimized away, but no + guarantees here. */ + gcc_stablesort(temp_operands, 4, 2 * sizeof (rtx *), + aarch64_ldrstr_offset_compare); /* Copy the memory operands so that if we have to bail for some reason the original addresses are unchanged. */ -- cgit v1.1 From 1af16666943ef075673501765a13e425e47015cd Mon Sep 17 00:00:00 2001 From: Andrew Stubbs Date: Tue, 27 Jul 2021 15:40:21 +0100 Subject: amdgcn: Fix attributes for LLVM-12 [PR 100208] This should work for a wider range of LLVM 12 variants now. More work required for LLVM 13 though. gcc/ChangeLog: PR target/100208 * config.in: Regenerate. * config/gcn/gcn-hsa.h (A_FIJI): New define. (A_900): New define. (A_906): New define. (A_908): New define. (ASM_SPEC): Use A_FIJI, A_900, A_906 and A_908. * config/gcn/gcn.c (output_file_start): Adjust attributes according to the assembler capabilities. * config/gcn/mkoffload.c (main): Likewise. * configure: Regenerate. * configure.ac: Add tests for LLVM assembler attribute features. --- gcc/config/gcn/gcn-hsa.h | 25 ++++++++++++++++++++++++- gcc/config/gcn/gcn.c | 31 ++++++++++++++++++++++++++----- gcc/config/gcn/mkoffload.c | 3 +++ 3 files changed, 53 insertions(+), 6 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/gcn/gcn-hsa.h b/gcc/config/gcn/gcn-hsa.h index 724e9a3..fc99c8d 100644 --- a/gcc/config/gcn/gcn-hsa.h +++ b/gcc/config/gcn/gcn-hsa.h @@ -75,6 +75,28 @@ extern unsigned int gcn_local_sym_hash (const char *name); supported for gcn. 
*/ #define GOMP_SELF_SPECS "" +#ifdef HAVE_GCN_SRAM_ECC_FIJI +#define A_FIJI +#else +#define A_FIJI "!march=*:;march=fiji:;" +#endif +#ifdef HAVE_GCN_SRAM_ECC_GFX900 +#define A_900 +#else +#define A_900 "march=gfx900:;" +#endif +#ifdef HAVE_GCN_SRAM_ECC_GFX906 +#define A_906 +#else +#define A_906 "march=gfx906:;" +#endif +#ifdef HAVE_GCN_SRAM_ECC_GFX908 +#define A_908 +#else +#define A_908 "march=gfx908:;" +#endif + +/* These targets can't have SRAM-ECC, even if a broken assembler allows it. */ #define DRIVER_SELF_SPECS \ "%{march=fiji|march=gfx900|march=gfx906:%{!msram-ecc=*:-msram-ecc=off}}" @@ -83,7 +105,8 @@ extern unsigned int gcn_local_sym_hash (const char *name); "%:last_arg(%{march=*:-mcpu=%*}) " \ "-mattr=%{mxnack:+xnack;:-xnack} " \ /* FIXME: support "any" when we move to HSACOv4. */ \ - "-mattr=%{!msram-ecc=off:+sram-ecc;:-sram-ecc} " \ + "-mattr=%{" A_FIJI A_900 A_906 A_908 \ + "!msram-ecc=off:+sram-ecc;:-sram-ecc} " \ "-filetype=obj" #define LINK_SPEC "--pie --export-dynamic" #define LIB_SPEC "-lc" diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c index 385b90c..d25c4e5 100644 --- a/gcc/config/gcn/gcn.c +++ b/gcc/config/gcn/gcn.c @@ -5181,18 +5181,39 @@ static void output_file_start (void) { const char *cpu; + bool use_sram = flag_sram_ecc; switch (gcn_arch) { - case PROCESSOR_FIJI: cpu = "gfx803"; break; - case PROCESSOR_VEGA10: cpu = "gfx900"; break; - case PROCESSOR_VEGA20: cpu = "gfx906"; break; - case PROCESSOR_GFX908: cpu = "gfx908"; break; + case PROCESSOR_FIJI: + cpu = "gfx803"; +#ifndef HAVE_GCN_SRAM_ECC_FIJI + use_sram = false; +#endif + break; + case PROCESSOR_VEGA10: + cpu = "gfx900"; +#ifndef HAVE_GCN_SRAM_ECC_GFX900 + use_sram = false; +#endif + break; + case PROCESSOR_VEGA20: + cpu = "gfx906"; +#ifndef HAVE_GCN_SRAM_ECC_GFX906 + use_sram = false; +#endif + break; + case PROCESSOR_GFX908: + cpu = "gfx908"; +#ifndef HAVE_GCN_SRAM_ECC_GFX908 + use_sram = false; +#endif + break; default: gcc_unreachable (); } const char *xnack = (flag_xnack ? "+xnack" : ""); /* FIXME: support "any" when we move to HSACOv4. */ - const char *sram_ecc = (flag_sram_ecc ? "+sram-ecc" : ""); + const char *sram_ecc = (use_sram ? "+sram-ecc" : ""); fprintf(asm_out_file, "\t.amdgcn_target \"amdgcn-unknown-amdhsa--%s%s%s\"\n", cpu, xnack, sram_ecc); diff --git a/gcc/config/gcn/mkoffload.c b/gcc/config/gcn/mkoffload.c index 804cc26..732bdfd 100644 --- a/gcc/config/gcn/mkoffload.c +++ b/gcc/config/gcn/mkoffload.c @@ -898,6 +898,9 @@ main (int argc, char **argv) case EF_AMDGPU_MACH_AMDGCN_GFX803: case EF_AMDGPU_MACH_AMDGCN_GFX900: case EF_AMDGPU_MACH_AMDGCN_GFX906: +#ifndef HAVE_GCN_SRAM_ECC_GFX908 + case EF_AMDGPU_MACH_AMDGCN_GFX908: +#endif break; default: /* FIXME: change this when we move to HSACOv4. */ -- cgit v1.1 From 9775e465c1fbfc32656de77c618c61acf5bd905d Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Tue, 27 Jul 2021 07:46:04 -0700 Subject: x86: Don't set AVX_U128_DIRTY when zeroing YMM/ZMM register There is no SSE <-> AVX transition penalty if the upper bits of YMM/ZMM registers are unchanged and YMM/ZMM store doesn't change the upper bits of YMM/ZMM registers. 1. Since zeroing YMM/ZMM register is implemented with zeroing XMM register, don't set AVX_U128_DIRTY when zeroing YMM/ZMM register. 2. Since store doesn't change the INIT state on the upper bits of YMM/ZMM register, don't set AVX_U128_DIRTY on store if the source of store was never non-zero. 
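As a hedged illustration of the pattern this helps (not part of the patch; the function below is made up), zeroing a YMM register and then storing it no longer counts as dirtying the upper bits:

  #include <immintrin.h>

  void
  store_zero (float *p)
  {
    /* Zeroing a YMM register is implemented by zeroing its XMM half
       (e.g. vxorps %xmm0, %xmm0, %xmm0), so the upper bits stay in the
       INIT state and this insn does not force AVX_U128_DIRTY.  */
    __m256 zero = _mm256_setzero_ps ();

    /* The store does not change the upper bits either, and the stored
       value was never non-zero, so no vzeroupper is needed before any
       following SSE-only code.  */
    _mm256_storeu_ps (p, zero);
  }
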
Here are the vzeroupper count differences on SPEC CPU 2017 with -Ofast -march=skylake-avx512 Before After Diff 500.perlbench_r 226 225 -0.44% 502.gcc_r 1263 1103 -12.67% 503.bwaves_r 14 14 0.00% 505.mcf_r 29 28 -3.45% 507.cactuBSSN_r 4651 4628 -0.49% 508.namd_r 433 432 -0.23% 510.parest_r 20380 19347 -5.07% 511.povray_r 495 452 -8.69% 519.lbm_r 2 2 0.00% 520.omnetpp_r 5954 5677 -4.65% 521.wrf_r 12353 12339 -0.11% 523.xalancbmk_r 13137 13001 -1.04% 525.x264_r 192 191 -0.52% 526.blender_r 2515 2366 -5.92% 527.cam4_r 4601 4583 -0.39% 531.deepsjeng_r 20 19 -5.00% 538.imagick_r 898 805 -10.36% 541.leela_r 427 399 -6.56% 544.nab_r 74 74 0.00% 548.exchange2_r 72 72 0.00% 549.fotonik3d_r 318 318 0.00% 554.roms_r 558 554 -0.72% 557.xz_r 79 52 -34.18% and performance differences are within noise range. gcc/ PR target/101456 * config/i386/i386.c (ix86_avx_u128_mode_needed): Don't set AVX_U128_DIRTY when all bits are zero. gcc/testsuite/ PR target/101456 * gcc.target/i386/pr101456-1.c: New test. * gcc.target/i386/pr101456-2.c: Likewise. --- gcc/config/i386/i386.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index ac59ebf..12ae37e 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -14149,6 +14149,94 @@ ix86_avx_u128_mode_needed (rtx_insn *insn) return AVX_U128_CLEAN; } + rtx set = single_set (insn); + if (set) + { + rtx dest = SET_DEST (set); + rtx src = SET_SRC (set); + if (ix86_check_avx_upper_register (dest)) + { + /* This is an YMM/ZMM load. Return AVX_U128_DIRTY if the + source isn't zero. */ + if (standard_sse_constant_p (src, GET_MODE (dest)) != 1) + return AVX_U128_DIRTY; + else + return AVX_U128_ANY; + } + else if (ix86_check_avx_upper_register (src)) + { + /* This is an YMM/ZMM store. Check for the source operand + of SRC DEFs in the same basic block before INSN. */ + basic_block bb = BLOCK_FOR_INSN (insn); + rtx_insn *end = BB_END (bb); + + /* Return AVX_U128_DIRTY if there is no DEF in the same basic + block. */ + int status = AVX_U128_DIRTY; + + for (df_ref def = DF_REG_DEF_CHAIN (REGNO (src)); + def; def = DF_REF_NEXT_REG (def)) + if (DF_REF_BB (def) == bb) + { + /* Ignore DEF from different basic blocks. */ + rtx_insn *def_insn = DF_REF_INSN (def); + + /* Check if DEF_INSN is before INSN. */ + rtx_insn *next; + for (next = NEXT_INSN (def_insn); + next != nullptr && next != end && next != insn; + next = NEXT_INSN (next)) + ; + + /* Skip if DEF_INSN isn't before INSN. */ + if (next != insn) + continue; + + /* Return AVX_U128_DIRTY if the source operand of + DEF_INSN isn't constant zero. */ + + if (CALL_P (def_insn)) + { + bool avx_upper_reg_found = false; + note_stores (def_insn, ix86_check_avx_upper_stores, + &avx_upper_reg_found); + + /* Return AVX_U128_DIRTY if call returns AVX. */ + if (avx_upper_reg_found) + return AVX_U128_DIRTY; + + continue; + } + + set = single_set (def_insn); + if (!set) + return AVX_U128_DIRTY; + + dest = SET_DEST (set); + + /* Skip if DEF_INSN is not an AVX load. */ + if (ix86_check_avx_upper_register (dest)) + { + src = SET_SRC (set); + /* Return AVX_U128_DIRTY if the source operand isn't + constant zero. */ + if (standard_sse_constant_p (src, GET_MODE (dest)) + != 1) + return AVX_U128_DIRTY; + } + + /* We get here only if all AVX loads are from constant + zero. */ + status = AVX_U128_ANY; + } + + return status; + } + + /* This isn't YMM/ZMM load/store. 
*/ + return AVX_U128_ANY; + } + /* Require DIRTY mode if a 256bit or 512bit AVX register is referenced. Hardware changes state only when a 256bit register is written to, but we need to prevent the compiler from moving optimal insertion -- cgit v1.1 From 8b06ccb20eaf2e7cb85e9dca23f1599ee37d17a7 Mon Sep 17 00:00:00 2001 From: Kyrylo Tkachov Date: Wed, 28 Jul 2021 16:34:03 +0100 Subject: aarch64: Add smov alternative to sign_extend pattern In the testcase here we were generating a umov + sxth to move a half-word value from SIMD to GP regs with sign-extension. We can use a single smov instruction for it instead but the sign-extend pattern was missing the right alternative. The *zero_extend2_aarch64 pattern for zero-extension already has the right alternative for the analogous umov instruction, so this mirrors that pattern. Bootstrapped and tested on aarch64-none-linux-gnu. The test gcc.target/aarch64/sve/clastb_4.c is adjusted to scan for the clastb h0, p0, h0, z0.h form instead of the clastb w0, p0, w0, z0.h form. This is an improvement as the W forms of the clast instructions are more expensive. gcc/ChangeLog: * config/aarch64/aarch64.md (*extend2_aarch64): Add "r,w" alternative. gcc/testsuite/ChangeLog: * gcc.target/aarch64/smov_1.c: New test. * gcc.target/aarch64/sve/clastb_4.c: Adjust clast scan-assembler. --- gcc/config/aarch64/aarch64.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 8cd259f..eb8ccd4 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -1874,13 +1874,15 @@ ) (define_insn "*extend2_aarch64" - [(set (match_operand:GPI 0 "register_operand" "=r,r") - (sign_extend:GPI (match_operand:SHORT 1 "nonimmediate_operand" "r,m")))] + [(set (match_operand:GPI 0 "register_operand" "=r,r,r") + (sign_extend:GPI (match_operand:SHORT 1 "nonimmediate_operand" "r,m,w")))] "" "@ sxt\t%0, %w1 - ldrs\t%0, %1" - [(set_attr "type" "extend,load_4")] + ldrs\t%0, %1 + smov\t%w0, %1.[0]" + [(set_attr "type" "extend,load_4,neon_to_gp") + (set_attr "arch" "*,*,fp")] ) (define_insn "*zero_extend2_aarch64" -- cgit v1.1 From aafa38b5bfed5e3eff258aa5354ed928f4986709 Mon Sep 17 00:00:00 2001 From: Jiufu Guo Date: Thu, 15 Jul 2021 17:21:00 +0800 Subject: Use preferred mode for doloop IV [PR61837] Currently, doloop.xx variable is using the type as niter which may be shorter than word size. For some targets, it would be better to use word size type. For example, on 64bit system, to access 32bit value, subreg maybe used. Then using 64bit type maybe better for niter if it can be present in both 32bit and 64bit. This patch add target hook to query preferred mode for doloop IV, and update mode accordingly. gcc/ChangeLog: 2021-07-29 Jiufu Guo PR target/61837 * config/rs6000/rs6000.c (TARGET_PREFERRED_DOLOOP_MODE): New hook. (rs6000_preferred_doloop_mode): New hook. * doc/tm.texi: Regenerate. * doc/tm.texi.in: Add hook preferred_doloop_mode. * target.def (preferred_doloop_mode): New hook. * targhooks.c (default_preferred_doloop_mode): New hook. * targhooks.h (default_preferred_doloop_mode): New hook. * tree-ssa-loop-ivopts.c (compute_doloop_base_on_mode): New function. (add_iv_candidate_for_doloop): Call targetm.preferred_doloop_mode and compute_doloop_base_on_mode. gcc/testsuite/ChangeLog: 2021-07-29 Jiufu Guo PR target/61837 * gcc.target/powerpc/pr61837.c: New test. 
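A hedged sketch of the sort of loop this is aimed at (the function is made up; the point is only that the iteration count has a 32-bit type on a 64-bit target):

  void
  scale (int n, double scale_by, double *a)
  {
    /* The doloop counter is derived from the 32-bit 'n'.  Without the hook,
       ivopts keeps the counter in SImode and later accesses it through
       subregs; with rs6000_preferred_doloop_mode returning word_mode, the
       counter is built directly in DImode, which matches the CTR-based
       counted-loop (bdnz) hardware.  */
    for (int i = 0; i < n; i++)
      a[i] = a[i] * scale_by;
  }
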
--- gcc/config/rs6000/rs6000.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 279f00c..2de5a96 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -1700,6 +1700,9 @@ static const struct attribute_spec rs6000_attribute_table[] = #undef TARGET_DOLOOP_COST_FOR_ADDRESS #define TARGET_DOLOOP_COST_FOR_ADDRESS 1000000000 +#undef TARGET_PREFERRED_DOLOOP_MODE +#define TARGET_PREFERRED_DOLOOP_MODE rs6000_preferred_doloop_mode + #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV rs6000_atomic_assign_expand_fenv @@ -27935,6 +27938,14 @@ rs6000_predict_doloop_p (struct loop *loop) return true; } +/* Implement TARGET_PREFERRED_DOLOOP_MODE. */ + +static machine_mode +rs6000_preferred_doloop_mode (machine_mode) +{ + return word_mode; +} + /* Implement TARGET_CANNOT_SUBSTITUTE_MEM_EQUIV_P. */ static bool -- cgit v1.1 From 231bcc77b953406b8381c7f55a3ec181da67d1e7 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Wed, 28 Jul 2021 16:24:52 +0800 Subject: Add a separate function to calculate cost for WIDEN_MULT_EXPR. gcc/ChangeLog: PR target/39821 * config/i386/i386.c (ix86_widen_mult_cost): New function. (ix86_add_stmt_cost): Use ix86_widen_mult_cost for WIDEN_MULT_EXPR. gcc/testsuite/ChangeLog: PR target/39821 * gcc.target/i386/sse2-pr39821.c: New test. * gcc.target/i386/sse4-pr39821.c: New test. --- gcc/config/i386/i386.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 12ae37e..a0285e6 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -19845,6 +19845,44 @@ ix86_vec_cost (machine_mode mode, int cost) return cost; } +/* Return cost of vec_widen_mult_hi/lo_, + vec_widen_mul_hi/lo_ is only available for VI124_AVX2. */ +static int +ix86_widen_mult_cost (const struct processor_costs *cost, + enum machine_mode mode, bool uns_p) +{ + gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT); + int extra_cost = 0; + int basic_cost = 0; + switch (mode) + { + case V8HImode: + case V16HImode: + if (!uns_p || mode == V16HImode) + extra_cost = cost->sse_op * 2; + basic_cost = cost->mulss * 2 + cost->sse_op * 4; + break; + case V4SImode: + case V8SImode: + /* pmulhw/pmullw can be used. */ + basic_cost = cost->mulss * 2 + cost->sse_op * 2; + break; + case V2DImode: + /* pmuludq under sse2, pmuldq under sse4.1, for sign_extend, + require extra 4 mul, 4 add, 4 cmp and 2 shift. */ + if (!TARGET_SSE4_1 && !uns_p) + extra_cost = (cost->mulss + cost->addss + cost->sse_op) * 4 + + cost->sse_op * 2; + /* Fallthru. */ + case V4DImode: + basic_cost = cost->mulss * 2 + cost->sse_op * 4; + break; + default: + gcc_unreachable(); + } + return ix86_vec_cost (mode, basic_cost + extra_cost); +} + /* Return cost of multiplication in MODE. */ static int @@ -22575,10 +22613,18 @@ ix86_add_stmt_cost (class vec_info *vinfo, void *data, int count, break; case MULT_EXPR: - case WIDEN_MULT_EXPR: + /* For MULT_HIGHPART_EXPR, x86 only supports pmulhw, + take it as MULT_EXPR. */ case MULT_HIGHPART_EXPR: stmt_cost = ix86_multiplication_cost (ix86_cost, mode); break; + /* There's no direct instruction for WIDEN_MULT_EXPR, + take emulation into account. 
*/ + case WIDEN_MULT_EXPR: + stmt_cost = ix86_widen_mult_cost (ix86_cost, mode, + TYPE_UNSIGNED (vectype)); + break; + case NEGATE_EXPR: if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) stmt_cost = ix86_cost->sse_op; -- cgit v1.1 From 0c6d21faa426bd6e6fdb3a6b47af530e49944118 Mon Sep 17 00:00:00 2001 From: Jeff Law Date: Thu, 29 Jul 2021 14:32:59 -0400 Subject: Reinstate branch-on-bit insns for H8 gcc/ * config/h8300/h8300-modes.def: Add CCZ, CCV and CCC, drop CCZNV. * config/h8300/h8300.md (H8cc mode iterator): Add CCZ. (cc mode_attr): Similarly. (ccz subst_attr): Similarly. * config/h8300/jumpcall.md: Add new patterns for branch-on-bit. * config/h8300/testcompare.md: Remove various cc0 based patterns that had been commented out. Add pattern to set CCZ from a bit test. --- gcc/config/h8300/h8300-modes.def | 4 +- gcc/config/h8300/h8300.md | 5 ++- gcc/config/h8300/jumpcall.md | 46 +++++++++++++++++++++++ gcc/config/h8300/testcompare.md | 80 +++++----------------------------------- 4 files changed, 61 insertions(+), 74 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/h8300/h8300-modes.def b/gcc/config/h8300/h8300-modes.def index 23b777b..6ab5260 100644 --- a/gcc/config/h8300/h8300-modes.def +++ b/gcc/config/h8300/h8300-modes.def @@ -18,4 +18,6 @@ . */ CC_MODE (CCZN); -CC_MODE (CCZNV); +CC_MODE (CCZ); +CC_MODE (CCV); +CC_MODE (CCC); diff --git a/gcc/config/h8300/h8300.md b/gcc/config/h8300/h8300.md index e596987..7f49e42 100644 --- a/gcc/config/h8300/h8300.md +++ b/gcc/config/h8300/h8300.md @@ -140,11 +140,11 @@ ;; The modes we're supporting. This is used when we want to generate ;; multiple patterns where only the mode differs from a single template -(define_mode_iterator H8cc [CC CCZN]) +(define_mode_iterator H8cc [CC CCZN CCZ]) ;; This is used to generate multiple define_substs from a single ;; template for the different variants we might have. -(define_mode_attr cc [(CC "cc") (CCZN "cczn")]) +(define_mode_attr cc [(CC "cc") (CCZN "cczn") (CCZ "ccz")]) ;; The primary substitution pattern. is used to create multiple ;; substitutions based on the CC bits that are set. @@ -165,6 +165,7 @@ ;; apply the subst_cczn or subset_cc define_subst to generate a ;; new pattern that compare-elim can use (define_subst_attr "cczn" "subst_cczn" "" "_cczn") +(define_subst_attr "ccz" "subst_ccz" "" "_ccz") (define_subst_attr "cc" "subst_cc" "" "_cc") ;; Type of delay slot. NONE means the instruction has no delay slot. 
diff --git a/gcc/config/h8300/jumpcall.md b/gcc/config/h8300/jumpcall.md index e1f0418..3e59fee 100644 --- a/gcc/config/h8300/jumpcall.md +++ b/gcc/config/h8300/jumpcall.md @@ -143,6 +143,52 @@ [(set_attr "type" "bitbranch") (set_attr "length_table" "bitbranch")]) +(define_insn_and_split "" + [(set (pc) + (if_then_else (match_operator 3 "eqne_operator" + [(zero_extract:QHSI (match_operand:QHSI 1 "register_operand" "r") + (const_int 1) + (match_operand 2 "const_int_operand" "n")) + (const_int 0)]) + (label_ref (match_operand 0 "" "")) + (pc)))] + "INTVAL (operands[2]) < 16" + "#" + "&& reload_completed" + [(set (reg:CCZ CC_REG) + (eq (zero_extract:QHSI (match_dup 1) (const_int 1) (match_dup 2)) + (const_int 0))) + (set (pc) + (if_then_else (match_op_dup 3 [(reg:CCZ CC_REG) (const_int 0)]) + (label_ref (match_dup 0)) + (pc)))]) + +(define_insn_and_split "" + [(set (pc) + (if_then_else (match_operator 3 "eqne_operator" + [(zero_extract:SI (match_operand:SI 1 "register_operand" "r") + (const_int 1) + (match_operand 2 "const_int_operand" "n")) + (const_int 0)]) + (label_ref (match_operand 0 "" "")) + (pc))) + (clobber (match_scratch:SI 4 "=&r"))] + "INTVAL (operands[2]) >= 16" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 4) + (ior:SI (and:SI (match_dup 4) (const_int -65536)) + (lshiftrt:SI (match_dup 1) (const_int 16)))) + (clobber (reg:CC CC_REG))]) + (set (reg:CCZ CC_REG) + (eq (zero_extract:SI (match_dup 4) (const_int 1) (match_dup 2)) + (const_int 0))) + (set (pc) + (if_then_else (match_op_dup 3 [(reg:CCZ CC_REG) (const_int 0)]) + (label_ref (match_dup 0)) + (pc)))] + "operands[2] = GEN_INT (INTVAL (operands[2]) - 16);") + ;; Unconditional and other jump instructions. (define_insn "jump" diff --git a/gcc/config/h8300/testcompare.md b/gcc/config/h8300/testcompare.md index e9f6ddc..2919053 100644 --- a/gcc/config/h8300/testcompare.md +++ b/gcc/config/h8300/testcompare.md @@ -26,77 +26,15 @@ ;; "" ;; [(set_attr "length" "2,8,10")]) ;; -;;(define_insn "" -;; [(set (cc0) -;; (compare (zero_extract:HSI (match_operand:HSI 0 "register_operand" "r") -;; (const_int 1) -;; (match_operand 1 "const_int_operand" "n")) -;; (const_int 0)))] -;; "INTVAL (operands[1]) <= 15" -;; "btst %Z1,%Y0" -;; [(set_attr "length" "2")]) -;; -;;(define_insn_and_split "*tstsi_upper_bit" -;; [(set (cc0) -;; (compare (zero_extract:SI (match_operand:SI 0 "register_operand" "r") -;; (const_int 1) -;; (match_operand 1 "const_int_operand" "n")) -;; (const_int 0))) -;; (clobber (match_scratch:SI 2 "=&r"))] -;; "INTVAL (operands[1]) >= 16" -;; "#" -;; "&& reload_completed" -;; [(set (match_dup 2) -;; (ior:SI (and:SI (match_dup 2) -;; (const_int -65536)) -;; (lshiftrt:SI (match_dup 0) -;; (const_int 16)))) -;; (set (cc0) -;; (compare (zero_extract:SI (match_dup 2) -;; (const_int 1) -;; (match_dup 3)) -;; (const_int 0)))] -;; { -;; operands[3] = GEN_INT (INTVAL (operands[1]) - 16); -;; }) -;; -;;(define_insn "*tstsi_variable_bit" -;; [(set (cc0) -;; (compare (zero_extract:SI (match_operand:SI 0 "register_operand" "r") -;; (const_int 1) -;; (and:SI (match_operand:SI 1 "register_operand" "r") -;; (const_int 7))) -;; (const_int 0)))] -;; "" -;; "btst %w1,%w0" -;; [(set_attr "length" "2")]) -;; -;;(define_insn_and_split "*tstsi_variable_bit_qi" -;; [(set (cc0) -;; (compare (zero_extract:SI (zero_extend:SI (match_operand:QI 0 "general_operand_src" "r,U,mn>")) -;; (const_int 1) -;; (and:SI (match_operand:SI 1 "register_operand" "r,r,r") -;; (const_int 7))) -;; (const_int 0))) -;; (clobber (match_scratch:QI 2 "=X,X,&r"))] 
-;; "!CONSTANT_P (operands[0])" -;; "@ -;; btst\\t%w1,%X0 -;; btst\\t%w1,%X0 -;; #" -;; "&& reload_completed -;; && !satisfies_constraint_U (operands[0])" -;; [(set (match_dup 2) -;; (match_dup 0)) -;; (parallel [(set (cc0) -;; (compare (zero_extract:SI (zero_extend:SI (match_dup 2)) -;; (const_int 1) -;; (and:SI (match_dup 1) -;; (const_int 7))) -;; (const_int 0))) -;; (clobber (scratch:QI))])] -;; "" -;; [(set_attr "length" "2,8,10")]) +(define_insn "" + [(set (reg:CCZ CC_REG) + (eq (zero_extract:HSI (match_operand:HSI 0 "register_operand" "r") + (const_int 1) + (match_operand 1 "const_int_operand" "n")) + (const_int 0)))] + "INTVAL (operands[1]) < 16" + "btst %Z1,%Y0" + [(set_attr "length" "2")]) (define_insn "*tst" [(set (reg:CCZN CC_REG) -- cgit v1.1 From ef22e9c725cc94b68c27b09503bfc4b4064d8dbf Mon Sep 17 00:00:00 2001 From: Hans-Peter Nilsson Date: Sat, 17 Jul 2021 13:19:08 +0200 Subject: Fix MMIX breakage; ICE in df_ref_record, at df-scan.c:2598 This bug made me dive into some of the murkier waters of gcc, namely the source of operand 2 to the "call" pattern. It can be pretty poisonous, but is unused (either directly or later) by most targets. The target function_arg (and function_incoming_arg), can unless specially handled, cause a VOIDmode reg RTX to be generated, for the function arguments end-marker. This is then passed on by expand_call to the target "call" pattern, as operand[2] (which is wrongly documented or wrongly implemented, see comment in mmix.c) but unused by most targets that do not handle it specially, as in operand 2 not making it into the insn generated for the "call" (et al) patterns. Of course, the MMIX port stands out here: the RTX makes it into the generated RTX but is then actually unused and is just a placeholder; see mmix_print_operand 'p'. Anyway, df-scan inspects the emitted call rtx and horks on the void-mode RTX (actually: that it represents a zero-sized register range) from r12-1702. While I could replace or remove the emitted unused call insn operand, that would still leave unusable rtx to future users of function_arg actually looking for next_arg_reg. Better replace VOIDmode with DImode here; that's the "natural" mode of MMIX registers. (As a future improvement, I'll also remove the placeholder argument and replace the intended user; the print_operand output modifier 'p' modifier (as in "PUSHJ $%p2,%0") with some punctuation, perhaps '!' (as in "PUSHJ $%!,%0"). I inspected all ports, but other targets emit a special function_arg_info::end_marker cookie or just don't emit "call" operand[2] (etc) in the expanded "call" pattern. gcc: * config/mmix/mmix.c (mmix_function_arg_1): Avoid generating a VOIDmode register for e.g the function_arg_info::end_marker. --- gcc/config/mmix/mmix.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/mmix/mmix.c b/gcc/config/mmix/mmix.c index 40bfb43..db7af7b 100644 --- a/gcc/config/mmix/mmix.c +++ b/gcc/config/mmix/mmix.c @@ -667,10 +667,17 @@ mmix_function_arg_1 (const cumulative_args_t argsp_v, { CUMULATIVE_ARGS *argsp = get_cumulative_args (argsp_v); + /* The mode of the argument will be VOIDmode for the "end_marker". Make sure + we don't ever generate a VOIDmode register; later passes will barf on that. + We may want to use the register number, so return something nominally + useful. Thus, for VOIDmode, use DImode, being the natural mode for the + register. */ + machine_mode mode = arg.mode == VOIDmode ? DImode : arg.mode; + /* Last-argument marker. 
*/ if (arg.end_marker_p ()) return (argsp->regs < MMIX_MAX_ARGS_IN_REGS) - ? gen_rtx_REG (arg.mode, + ? gen_rtx_REG (mode, (incoming ? MMIX_FIRST_INCOMING_ARG_REGNUM : MMIX_FIRST_ARG_REGNUM) + argsp->regs) @@ -678,10 +685,10 @@ mmix_function_arg_1 (const cumulative_args_t argsp_v, return (argsp->regs < MMIX_MAX_ARGS_IN_REGS && !targetm.calls.must_pass_in_stack (arg) - && (GET_MODE_BITSIZE (arg.mode) <= 64 + && (GET_MODE_BITSIZE (mode) <= 64 || argsp->lib || TARGET_LIBFUNC)) - ? gen_rtx_REG (arg.mode, + ? gen_rtx_REG (mode, (incoming ? MMIX_FIRST_INCOMING_ARG_REGNUM : MMIX_FIRST_ARG_REGNUM) -- cgit v1.1 From e41ba804ba5f5ca433e09238d561b1b4c8b10985 Mon Sep 17 00:00:00 2001 From: Kewen Lin Date: Thu, 29 Jul 2021 22:26:25 -0500 Subject: Use range-based for loops for traversing loops This patch follows Martin's suggestion here[1], to support range based loop for iterating loops, analogously to the patch for vec[2]. For example, use below range-based for loop for (auto loop : loops_list (cfun, 0)) to replace the previous macro FOR_EACH_LOOP FOR_EACH_LOOP (loop, 0) [1] https://gcc.gnu.org/pipermail/gcc-patches/2021-June/573424.html [2] https://gcc.gnu.org/pipermail/gcc-patches/2021-June/572315.html gcc/ChangeLog: * cfgloop.h (as_const): New function. (class loop_iterator): Rename to ... (class loops_list): ... this. (loop_iterator::next): Rename to ... (loops_list::Iter::fill_curr_loop): ... this and adjust. (loop_iterator::loop_iterator): Rename to ... (loops_list::loops_list): ... this and adjust. (loops_list::Iter): New class. (loops_list::iterator): New type. (loops_list::const_iterator): New type. (loops_list::begin): New function. (loops_list::end): Likewise. (loops_list::begin const): Likewise. (loops_list::end const): Likewise. (FOR_EACH_LOOP): Remove. (FOR_EACH_LOOP_FN): Remove. * cfgloop.c (flow_loops_dump): Adjust FOR_EACH_LOOP* with range-based for loop with loops_list instance. (sort_sibling_loops): Likewise. (disambiguate_loops_with_multiple_latches): Likewise. (verify_loop_structure): Likewise. * cfgloopmanip.c (create_preheaders): Likewise. (force_single_succ_latches): Likewise. * config/aarch64/falkor-tag-collision-avoidance.c (execute_tag_collision_avoidance): Likewise. * config/mn10300/mn10300.c (mn10300_scan_for_setlb_lcc): Likewise. * config/s390/s390.c (s390_adjust_loops): Likewise. * doc/loop.texi: Likewise. * gimple-loop-interchange.cc (pass_linterchange::execute): Likewise. * gimple-loop-jam.c (tree_loop_unroll_and_jam): Likewise. * gimple-loop-versioning.cc (loop_versioning::analyze_blocks): Likewise. (loop_versioning::make_versioning_decisions): Likewise. * gimple-ssa-split-paths.c (split_paths): Likewise. * graphite-isl-ast-to-gimple.c (graphite_regenerate_ast_isl): Likewise. * graphite.c (canonicalize_loop_form): Likewise. (graphite_transform_loops): Likewise. * ipa-fnsummary.c (analyze_function_body): Likewise. * ipa-pure-const.c (analyze_function): Likewise. * loop-doloop.c (doloop_optimize_loops): Likewise. * loop-init.c (loop_optimizer_finalize): Likewise. (fix_loop_structure): Likewise. * loop-invariant.c (calculate_loop_reg_pressure): Likewise. (move_loop_invariants): Likewise. * loop-unroll.c (decide_unrolling): Likewise. (unroll_loops): Likewise. * modulo-sched.c (sms_schedule): Likewise. * predict.c (predict_loops): Likewise. (pass_profile::execute): Likewise. * profile.c (branch_prob): Likewise. * sel-sched-ir.c (sel_finish_pipelining): Likewise. (sel_find_rgns): Likewise. * tree-cfg.c (replace_loop_annotate): Likewise. (replace_uses_by): Likewise. 
(move_sese_region_to_fn): Likewise. * tree-if-conv.c (pass_if_conversion::execute): Likewise. * tree-loop-distribution.c (loop_distribution::execute): Likewise. * tree-parloops.c (parallelize_loops): Likewise. * tree-predcom.c (tree_predictive_commoning): Likewise. * tree-scalar-evolution.c (scev_initialize): Likewise. (scev_reset): Likewise. * tree-ssa-dce.c (find_obviously_necessary_stmts): Likewise. * tree-ssa-live.c (remove_unused_locals): Likewise. * tree-ssa-loop-ch.c (ch_base::copy_headers): Likewise. * tree-ssa-loop-im.c (analyze_memory_references): Likewise. (tree_ssa_lim_initialize): Likewise. * tree-ssa-loop-ivcanon.c (canonicalize_induction_variables): Likewise. * tree-ssa-loop-ivopts.c (tree_ssa_iv_optimize): Likewise. * tree-ssa-loop-manip.c (get_loops_exits): Likewise. * tree-ssa-loop-niter.c (estimate_numbers_of_iterations): Likewise. (free_numbers_of_iterations_estimates): Likewise. * tree-ssa-loop-prefetch.c (tree_ssa_prefetch_arrays): Likewise. * tree-ssa-loop-split.c (tree_ssa_split_loops): Likewise. * tree-ssa-loop-unswitch.c (tree_ssa_unswitch_loops): Likewise. * tree-ssa-loop.c (gate_oacc_kernels): Likewise. (pass_scev_cprop::execute): Likewise. * tree-ssa-propagate.c (clean_up_loop_closed_phi): Likewise. * tree-ssa-sccvn.c (do_rpo_vn): Likewise. * tree-ssa-threadupdate.c (jump_thread_path_registry::thread_through_all_blocks): Likewise. * tree-vectorizer.c (vectorize_loops): Likewise. * tree-vrp.c (vrp_asserts::find_assert_locations): Likewise. --- gcc/config/aarch64/falkor-tag-collision-avoidance.c | 4 +--- gcc/config/mn10300/mn10300.c | 4 +--- gcc/config/s390/s390.c | 4 +--- 3 files changed, 3 insertions(+), 9 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/falkor-tag-collision-avoidance.c b/gcc/config/aarch64/falkor-tag-collision-avoidance.c index de214e4..6c8e02a 100644 --- a/gcc/config/aarch64/falkor-tag-collision-avoidance.c +++ b/gcc/config/aarch64/falkor-tag-collision-avoidance.c @@ -808,8 +808,6 @@ record_loads (tag_map_t &tag_map, struct loop *loop) void execute_tag_collision_avoidance () { - struct loop *loop; - df_set_flags (DF_RD_PRUNE_DEAD_DEFS); df_chain_add_problem (DF_UD_CHAIN); df_compute_regs_ever_live (true); @@ -824,7 +822,7 @@ execute_tag_collision_avoidance () calculate_dominance_info (CDI_DOMINATORS); loop_optimizer_init (AVOID_CFG_MODIFICATIONS); - FOR_EACH_LOOP (loop, LI_FROM_INNERMOST) + for (auto loop : loops_list (cfun, LI_FROM_INNERMOST)) { tag_map_t tag_map (512); diff --git a/gcc/config/mn10300/mn10300.c b/gcc/config/mn10300/mn10300.c index 6f842a3..aeb5d04 100644 --- a/gcc/config/mn10300/mn10300.c +++ b/gcc/config/mn10300/mn10300.c @@ -3234,8 +3234,6 @@ mn10300_loop_contains_call_insn (loop_p loop) static void mn10300_scan_for_setlb_lcc (void) { - loop_p loop; - DUMP ("Looking for loops that can use the SETLB insn", NULL_RTX); df_analyze (); @@ -3248,7 +3246,7 @@ mn10300_scan_for_setlb_lcc (void) if an inner loop is not suitable for use with the SETLB/Lcc insns, it may be the case that its parent loop is suitable. Thus we should check all loops, but work from the innermost outwards. 
*/ - FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST) + for (auto loop : loops_list (cfun, LI_ONLY_INNERMOST)) { const char * reason = NULL; diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c index b1d3b99..8c7d366 100644 --- a/gcc/config/s390/s390.c +++ b/gcc/config/s390/s390.c @@ -14479,15 +14479,13 @@ s390_adjust_loop_scan_osc (struct loop* loop) static void s390_adjust_loops () { - struct loop *loop = NULL; - df_analyze (); compute_bb_for_insn (); /* Find the loops. */ loop_optimizer_init (AVOID_CFG_MODIFICATIONS); - FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST) + for (auto loop : loops_list (cfun, LI_ONLY_INNERMOST)) { if (dump_file) { -- cgit v1.1 From 5a973aec601cb69024e0ebf6b0961906cec7c446 Mon Sep 17 00:00:00 2001 From: "prathamesh.kulkarni" Date: Fri, 30 Jul 2021 15:10:37 +0530 Subject: arm/66791: Replace builtins in vld1. gcc/ChangeLog: PR target/66791 * config/arm/arm_neon.h (vld1_p64): Replace call to builtin by explicitly dereferencing __a. (vld1_s64): Likewise. (vld1_u64): Likewise. * config/arm/arm_neon_builtins.def (vld1): Remove entry for di and change to VAR13. --- gcc/config/arm/arm_neon.h | 6 +++--- gcc/config/arm/arm_neon_builtins.def | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index 41b596b..5a91d15 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -10301,7 +10301,7 @@ __extension__ extern __inline poly64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_p64 (const poly64_t * __a) { - return (poly64x1_t)__builtin_neon_vld1di ((const __builtin_neon_di *) __a); + return (poly64x1_t) { *__a }; } #pragma GCC pop_options @@ -10330,7 +10330,7 @@ __extension__ extern __inline int64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_s64 (const int64_t * __a) { - return (int64x1_t)__builtin_neon_vld1di ((const __builtin_neon_di *) __a); + return (int64x1_t) { *__a }; } #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) @@ -10374,7 +10374,7 @@ __extension__ extern __inline uint64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_u64 (const uint64_t * __a) { - return (uint64x1_t)__builtin_neon_vld1di ((const __builtin_neon_di *) __a); + return (uint64x1_t) { *__a }; } __extension__ extern __inline poly8x8_t diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def index 70438ac..fb6d66e 100644 --- a/gcc/config/arm/arm_neon_builtins.def +++ b/gcc/config/arm/arm_neon_builtins.def @@ -302,8 +302,8 @@ VAR1 (TERNOP, vtbx1, v8qi) VAR1 (TERNOP, vtbx2, v8qi) VAR1 (TERNOP, vtbx3, v8qi) VAR1 (TERNOP, vtbx4, v8qi) -VAR14 (LOAD1, vld1, - v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v2di, +VAR13 (LOAD1, vld1, + v8qi, v4hi, v4hf, v2si, v2sf, v16qi, v8hi, v8hf, v4si, v4sf, v2di, v4bf, v8bf) VAR12 (LOAD1LANE, vld1_lane, v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di, v4bf, v8bf) -- cgit v1.1 From 854ef6e50acf1b182ddaf007cff2cf60545692b0 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 30 Jul 2021 05:58:38 -0700 Subject: x86: Don't enable LZCNT/POPCNT if disabled explicitly gcc/ PR target/101685 * config/i386/i386-options.c (ix86_option_override_internal): Don't enable LZCNT/POPCNT if they have been disabled explicitly. gcc/testsuite/ PR target/101685 * gcc.target/i386/pr101685.c: New test. 
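A hedged example of the behaviour being fixed (the function and the exact option combination are illustrative, not taken from the new test):

  /* Compiled with something like: gcc -O2 -march=znver3 -mno-popcnt -mno-lzcnt
     The -march entry includes PTA_ABM, which previously switched LZCNT and
     POPCNT back on; after the patch the explicit -mno-* options are
     respected and the builtin expands to the generic fallback sequence
     instead of the popcnt instruction.  */
  int
  popcount_wrapper (unsigned int x)
  {
    return __builtin_popcount (x);
  }
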
--- gcc/config/i386/i386-options.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c index 3416a4f..6b78998 100644 --- a/gcc/config/i386/i386-options.c +++ b/gcc/config/i386/i386-options.c @@ -2124,8 +2124,10 @@ ix86_option_override_internal (bool main_args_p, if (((processor_alias_table[i].flags & PTA_ABM) != 0) && !TARGET_EXPLICIT_ABM_P (opts)) { - SET_TARGET_LZCNT (opts); - SET_TARGET_POPCNT (opts); + if (!TARGET_EXPLICIT_LZCNT_P (opts)) + SET_TARGET_LZCNT (opts); + if (!TARGET_EXPLICIT_POPCNT_P (opts)) + SET_TARGET_POPCNT (opts); } if ((processor_alias_table[i].flags -- cgit v1.1 From 45cb789e6adf5d571c574a94b77413c845fed106 Mon Sep 17 00:00:00 2001 From: Xi Ruoyao Date: Sun, 20 Jun 2021 15:21:39 +0800 Subject: mips: add MSA vec_cmp and vec_cmpu expand pattern [PR101132] Middle-end started to emit vec_cmp and vec_cmpu since GCC 11, causing ICE on MIPS with MSA enabled. Add the pattern to prevent it. gcc/ PR target/101132 * config/mips/mips-protos.h (mips_expand_vec_cmp_expr): Declare. * config/mips/mips.c (mips_expand_vec_cmp_expr): New function. * config/mips/mips-msa.md (vec_cmp): New expander. (vec_cmpu): New expander. gcc/testsuite/ PR target/101132 * gcc.target/mips/pr101132.c: New test. --- gcc/config/mips/mips-msa.md | 22 ++++++++++++++++++++++ gcc/config/mips/mips-protos.h | 1 + gcc/config/mips/mips.c | 11 +++++++++++ 3 files changed, 34 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/mips/mips-msa.md b/gcc/config/mips/mips-msa.md index 3ecf2bd..3a67f25 100644 --- a/gcc/config/mips/mips-msa.md +++ b/gcc/config/mips/mips-msa.md @@ -435,6 +435,28 @@ DONE; }) +(define_expand "vec_cmp" + [(match_operand: 0 "register_operand") + (match_operator 1 "" + [(match_operand:MSA 2 "register_operand") + (match_operand:MSA 3 "register_operand")])] + "ISA_HAS_MSA" +{ + mips_expand_vec_cmp_expr (operands); + DONE; +}) + +(define_expand "vec_cmpu" + [(match_operand: 0 "register_operand") + (match_operator 1 "" + [(match_operand:IMSA 2 "register_operand") + (match_operand:IMSA 3 "register_operand")])] + "ISA_HAS_MSA" +{ + mips_expand_vec_cmp_expr (operands); + DONE; +}) + (define_insn "msa_insert_" [(set (match_operand:MSA 0 "register_operand" "=f,f") (vec_merge:MSA diff --git a/gcc/config/mips/mips-protos.h b/gcc/config/mips/mips-protos.h index 51b82b1..a5e4151 100644 --- a/gcc/config/mips/mips-protos.h +++ b/gcc/config/mips/mips-protos.h @@ -385,6 +385,7 @@ extern mulsidi3_gen_fn mips_mulsidi3_gen_fn (enum rtx_code); extern void mips_register_frame_header_opt (void); extern void mips_expand_vec_cond_expr (machine_mode, machine_mode, rtx *); +extern void mips_expand_vec_cmp_expr (rtx *); /* Routines implemented in mips-d.c */ extern void mips_d_target_versions (void); diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c index 00a8eef..8f04339 100644 --- a/gcc/config/mips/mips.c +++ b/gcc/config/mips/mips.c @@ -22321,6 +22321,17 @@ mips_expand_msa_cmp (rtx dest, enum rtx_code cond, rtx op0, rtx op1) } } +void +mips_expand_vec_cmp_expr (rtx *operands) +{ + rtx cond = operands[1]; + rtx op0 = operands[2]; + rtx op1 = operands[3]; + rtx res = operands[0]; + + mips_expand_msa_cmp (res, GET_CODE (cond), op0, op1); +} + /* Expand VEC_COND_EXPR, where: MODE is mode of the result VIMODE equivalent integer mode -- cgit v1.1 From 2065654435e3d97676366f82b939bc9273382dbe Mon Sep 17 00:00:00 2001 From: Xi Ruoyao Date: Fri, 30 Jul 2021 23:44:14 +0800 Subject: mips: Fix up 
mips_atomic_assign_expand_fenv [PR94780] Commit message shamelessly copied from 1777beb6b129 by jakub: This function, because it is sometimes called even outside of function bodies, uses create_tmp_var_raw rather than create_tmp_var. But in order for that to work, when first referenced, the VAR_DECLs need to appear in a TARGET_EXPR so that during gimplification the var gets the right DECL_CONTEXT and is added to local decls. gcc/ PR target/94780 * config/mips/mips.c (mips_atomic_assign_expand_fenv): Use TARGET_EXPR instead of MODIFY_EXPR. --- gcc/config/mips/mips.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c index 8f04339..89d1be6 100644 --- a/gcc/config/mips/mips.c +++ b/gcc/config/mips/mips.c @@ -22439,12 +22439,12 @@ mips_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update) tree get_fcsr = mips_builtin_decls[MIPS_GET_FCSR]; tree set_fcsr = mips_builtin_decls[MIPS_SET_FCSR]; tree get_fcsr_hold_call = build_call_expr (get_fcsr, 0); - tree hold_assign_orig = build2 (MODIFY_EXPR, MIPS_ATYPE_USI, - fcsr_orig_var, get_fcsr_hold_call); + tree hold_assign_orig = build4 (TARGET_EXPR, MIPS_ATYPE_USI, + fcsr_orig_var, get_fcsr_hold_call, NULL, NULL); tree hold_mod_val = build2 (BIT_AND_EXPR, MIPS_ATYPE_USI, fcsr_orig_var, build_int_cst (MIPS_ATYPE_USI, 0xfffff003)); - tree hold_assign_mod = build2 (MODIFY_EXPR, MIPS_ATYPE_USI, - fcsr_mod_var, hold_mod_val); + tree hold_assign_mod = build4 (TARGET_EXPR, MIPS_ATYPE_USI, + fcsr_mod_var, hold_mod_val, NULL, NULL); tree set_fcsr_hold_call = build_call_expr (set_fcsr, 1, fcsr_mod_var); tree hold_all = build2 (COMPOUND_EXPR, MIPS_ATYPE_USI, hold_assign_orig, hold_assign_mod); @@ -22454,8 +22454,8 @@ mips_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update) *clear = build_call_expr (set_fcsr, 1, fcsr_mod_var); tree get_fcsr_update_call = build_call_expr (get_fcsr, 0); - *update = build2 (MODIFY_EXPR, MIPS_ATYPE_USI, - exceptions_var, get_fcsr_update_call); + *update = build4 (TARGET_EXPR, MIPS_ATYPE_USI, + exceptions_var, get_fcsr_update_call, NULL, NULL); tree set_fcsr_update_call = build_call_expr (set_fcsr, 1, fcsr_orig_var); *update = build2 (COMPOUND_EXPR, void_type_node, *update, set_fcsr_update_call); -- cgit v1.1 From ee189a7327565cfcc2441aa308333ad3b6525e6e Mon Sep 17 00:00:00 2001 From: Hans-Peter Nilsson Date: Sun, 18 Jul 2021 03:40:11 +0200 Subject: doc: correct documentation of "call" (et al) operand 2. An old itch being scratched: the documentation lies; it's not "the number of registers used as operands", unless the target makes a special arrangement to that effect, and there's nothing in the guts of gcc setting up or assuming those semantics. Instead, see calls.c:expand_call, variable next_arg_reg. Or just consider the variable name. The text is somewhat transcribed from the head comment of emit_call_1 for parameter next_arg_reg. Most important is to document the relation to function_arg_info::end_marker() and the TARGET_FUNCTION_ARG hook. The "normally" in the head comment, in "normally it is the first arg-register beyond those used for args in this call, or 0 if all the arg-registers are used in this call" means "by default", unless the target tests end_marker_p and does something special, but the port is free to return whatever it likes when it sees the end-marker. 
And, I do mean "whatever it likes" because if the port doesn't actually mention that operand in the RTX emitted for its "call" or "call_value" patterns ("usually" define_expands), it can be any mumbo-jumbo, such as a VOIDmode register, which seems like it happens for some targets, or NULL, that happens for others. Returning a VOIDmode register until recently included MMIX, where it made it into the emitted RTL, confusing later passes, recently exposed as an ICE. Tested by inspecting the info and generated pdf for sanity. gcc: * doc/md.texi (call): Correct information about operand 2. * config/mmix/mmix.md ("call", "call_value"): Remove fixed FIXMEs. --- gcc/config/mmix/mmix.md | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/mmix/mmix.md b/gcc/config/mmix/mmix.md index a6d7608..33e9c60 100644 --- a/gcc/config/mmix/mmix.md +++ b/gcc/config/mmix/mmix.md @@ -999,10 +999,8 @@ DIVU %1,%1,%2\;GET %0,:rR\;NEGU %2,0,%0\;CSNN %0,$255,%2") = mmix_get_hard_reg_initial_val (Pmode, MMIX_INCOMING_RETURN_ADDRESS_REGNUM); - /* FIXME: There's a bug in gcc which causes NULL to be passed as - operand[2] when we get out of registers, which later confuses gcc. - Work around it by replacing it with const_int 0. Possibly documentation - error too. */ + /* NULL gets passed as operand[2] when we get out of registers, + which later confuses gcc. Replace it with const_int 0. */ if (operands[2] == NULL_RTX) operands[2] = const0_rtx; @@ -1036,14 +1034,10 @@ DIVU %1,%1,%2\;GET %0,:rR\;NEGU %2,0,%0\;CSNN %0,$255,%2") = mmix_get_hard_reg_initial_val (Pmode, MMIX_INCOMING_RETURN_ADDRESS_REGNUM); - /* FIXME: See 'call'. */ + /* See 'call'. */ if (operands[3] == NULL_RTX) operands[3] = const0_rtx; - /* FIXME: Documentation bug: operands[3] (operands[2] for 'call') is the - *next* argument register, not the number of arguments in registers. - (There used to be code here where that mattered.) */ - operands[5] = gen_rtx_REG (DImode, MMIX_INCOMING_RETURN_ADDRESS_REGNUM); }") -- cgit v1.1 From 5b2515f5ae86c21fc13021e1a8728a48e68aa7d6 Mon Sep 17 00:00:00 2001 From: Hans-Peter Nilsson Date: Sun, 18 Jul 2021 04:59:30 +0200 Subject: MMIX: remove generic placeholders parameters in call insn patterns. I guess the best way to describe these operands, at least for MMIX, is "ballast". Some targets seem to drag along one or two of the incoming pattern operands through the rtl passes and not dropping them until assembly output. Let's stop doing that for MMIX. There really are *two* unused parameters: one is a number corresponding to the stack-size of arguments as a const_int and the other is whatever the target yields for targetm.calls.function_arg (args_so_far, function_arg_info::end_marker ()). There's a mandatory second argument to the "call" RTX, but the target doesn't have to keep it a variable number; it can be replaced by (const_int 0) early, like this. Astute readers may object that as the MMIX call-type insns (PUSHJ, PUSHGO) have a parameter in addition to the address of the called function, so should the emitted RTL. But, that parameter depends only on the local function, not the called function (IOW, it's the same for all calls in a function), and its value isn't known until frame layout time. Having it a parameter in the emitted RTL for the call would just be confusing. (Maybe this will be amended later, if/when improving "shrink-wrapping".) 
gcc: * config/mmix/mmix.md ("call", "call_value", "*call_real") ("*call_value_real"): Don't generate rtx mentioning the generic operands 1 and 2 to "call", and similarly for "call_value". * config/mmix/mmix.c (mmix_print_operand_punct_valid_p) (mmix_print_operand): Use '!' instead of 'p'. --- gcc/config/mmix/mmix.c | 20 +++++++++--------- gcc/config/mmix/mmix.md | 56 ++++++++++++++++++++----------------------------- 2 files changed, 33 insertions(+), 43 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/mmix/mmix.c b/gcc/config/mmix/mmix.c index db7af7b..010cd47 100644 --- a/gcc/config/mmix/mmix.c +++ b/gcc/config/mmix/mmix.c @@ -1624,6 +1624,12 @@ mmix_print_operand (FILE *stream, rtx x, int code) fprintf (stream, "%d", MMIX_POP_ARGUMENT ()); return; + case '!': + /* The number of registers we want to save. This was setup by the + prologue. */ + fprintf (stream, "%d", cfun->machine->highest_saved_stack_register + 1); + return; + case 'B': if (GET_CODE (x) != CONST_INT) fatal_insn ("MMIX Internal: Expected a CONST_INT, not this", x); @@ -1712,15 +1718,6 @@ mmix_print_operand (FILE *stream, rtx x, int code) (int64_t) (mmix_intval (x) - 1)); return; - case 'p': - /* Store the number of registers we want to save. This was setup - by the prologue. The actual operand contains the number of - registers to pass, but we don't use it currently. Anyway, we - need to output the number of saved registers here. */ - fprintf (stream, "%d", - cfun->machine->highest_saved_stack_register + 1); - return; - case 'r': /* Store the register to output a constant to. */ if (! REG_P (x)) @@ -1830,7 +1827,10 @@ mmix_print_operand_punct_valid_p (unsigned char code) /* A '+' is used for branch prediction, similar to other ports. */ return code == '+' /* A '.' is used for the %d in the POP %d,0 return insn. */ - || code == '.'; + || code == '.' + /* A '!' is used for the number of saved registers, like when outputting + PUSHJ and PUSHGO. */ + || code == '!'; } /* TARGET_PRINT_OPERAND_ADDRESS. */ diff --git a/gcc/config/mmix/mmix.md b/gcc/config/mmix/mmix.md index 33e9c60..99be826 100644 --- a/gcc/config/mmix/mmix.md +++ b/gcc/config/mmix/mmix.md @@ -974,11 +974,9 @@ DIVU %1,%1,%2\;GET %0,:rR\;NEGU %2,0,%0\;CSNN %0,$255,%2") "%+B%D1 %2,%0") (define_expand "call" - [(parallel [(call (match_operand:QI 0 "memory_operand" "") - (match_operand 1 "general_operand" "")) - (use (match_operand 2 "general_operand" "")) - (clobber (match_dup 4))]) - (set (match_dup 4) (match_dup 3))] + [(parallel [(call (match_operand:QI 0 "memory_operand" "") (const_int 0)) + (clobber (match_dup 1))]) + (set (match_dup 1) (match_dup 2))] "" " { @@ -992,28 +990,24 @@ DIVU %1,%1,%2\;GET %0,:rR\;NEGU %2,0,%0\;CSNN %0,$255,%2") = replace_equiv_address (operands[0], force_reg (Pmode, XEXP (operands[0], 0))); + /* Note that we overwrite the generic operands[1] and operands[2]; we + don't use those values. */ + operands[1] = gen_rtx_REG (DImode, MMIX_INCOMING_RETURN_ADDRESS_REGNUM); + /* Since the epilogue 'uses' the return address, and it is clobbered in the call, and we set it back after every call (all but one setting will be optimized away), integrity is maintained. */ - operands[3] + operands[2] = mmix_get_hard_reg_initial_val (Pmode, MMIX_INCOMING_RETURN_ADDRESS_REGNUM); - - /* NULL gets passed as operand[2] when we get out of registers, - which later confuses gcc. Replace it with const_int 0. 
*/ - if (operands[2] == NULL_RTX) - operands[2] = const0_rtx; - - operands[4] = gen_rtx_REG (DImode, MMIX_INCOMING_RETURN_ADDRESS_REGNUM); }") (define_expand "call_value" [(parallel [(set (match_operand 0 "" "") (call (match_operand:QI 1 "memory_operand" "") - (match_operand 2 "general_operand" ""))) - (use (match_operand 3 "general_operand" "")) - (clobber (match_dup 5))]) - (set (match_dup 5) (match_dup 4))] + (const_int 0))) + (clobber (match_dup 2))]) + (set (match_dup 2) (match_dup 3))] "" " { @@ -1027,18 +1021,16 @@ DIVU %1,%1,%2\;GET %0,:rR\;NEGU %2,0,%0\;CSNN %0,$255,%2") = replace_equiv_address (operands[1], force_reg (Pmode, XEXP (operands[1], 0))); + /* Note that we overwrite the generic operands[2] and operands[3]; we + don't use those values. */ + operands[2] = gen_rtx_REG (DImode, MMIX_INCOMING_RETURN_ADDRESS_REGNUM); + /* Since the epilogue 'uses' the return address, and it is clobbered in the call, and we set it back after every call (all but one setting will be optimized away), integrity is maintained. */ - operands[4] + operands[3] = mmix_get_hard_reg_initial_val (Pmode, MMIX_INCOMING_RETURN_ADDRESS_REGNUM); - - /* See 'call'. */ - if (operands[3] == NULL_RTX) - operands[3] = const0_rtx; - - operands[5] = gen_rtx_REG (DImode, MMIX_INCOMING_RETURN_ADDRESS_REGNUM); }") ;; Don't use 'p' here. A 'p' must stand first in constraints, or reload @@ -1059,25 +1051,23 @@ DIVU %1,%1,%2\;GET %0,:rR\;NEGU %2,0,%0\;CSNN %0,$255,%2") (define_insn "*call_real" [(call (mem:QI (match_operand:DI 0 "mmix_symbolic_or_address_operand" "s,rU")) - (match_operand 1 "" "")) - (use (match_operand 2 "" "")) + (const_int 0)) (clobber (reg:DI MMIX_rJ_REGNUM))] "" "@ - PUSHJ $%p2,%0 - PUSHGO $%p2,%a0") + PUSHJ $%!,%0 + PUSHGO $%!,%a0") (define_insn "*call_value_real" [(set (match_operand 0 "register_operand" "=r,r") (call (mem:QI (match_operand:DI 1 "mmix_symbolic_or_address_operand" "s,rU")) - (match_operand 2 "" ""))) - (use (match_operand 3 "" "")) - (clobber (reg:DI MMIX_rJ_REGNUM))] + (const_int 0))) + (clobber (reg:DI MMIX_rJ_REGNUM))] "" "@ - PUSHJ $%p3,%1 - PUSHGO $%p3,%a1") + PUSHJ $%!,%1 + PUSHGO $%!,%a1") ;; I hope untyped_call and untyped_return are not needed for MMIX. ;; Users of Objective-C will notice. -- cgit v1.1 From f7bf03cf69ccb7dcfa0320774aa7f3c51344dada Mon Sep 17 00:00:00 2001 From: Roger Sayle Date: Fri, 30 Jul 2021 22:46:32 +0100 Subject: Decrement followed by cmov improvements. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The following patch to the x86_64 backend improves the code generated for a decrement followed by a conditional move. The primary change is to recognize that after subtracting one, checking the result is -1 (or equivalently that the original value was zero) can be implemented using the borrow/carry flag instead of requiring an explicit test instruction. This is achieved by a new define_insn_and_split that allows combine to split the desired sequence/composite into a *subsi_3 and *movsicc_noc. The other change with this patch is/are a pair of peephole2 optimizations to eliminate register-to-register moves generated during register allocation. During reload, the compiler doesn't know that inverting the condition of a conditional cmove can sometimes reduce register pressure, but this is easy to tidy up during the peephole2 pass (where swapping the order of the insn's operands performs the required logic inversion). 
Both improvements are demonstrated by the case below: int foo(int x) { if (x == 0) x = 16; else x--; return x; } Before: foo: leal -1(%rdi), %eax testl %edi, %edi movl $16, %edx cmove %edx, %eax ret After: foo: subl $1, %edi movl $16, %eax cmovnc %edi, %eax ret And the value of the peephole2 clean-up can be seen on its own in: int bar(int x) { x--; if (x == 0) x = 16; return x; } Before: bar: movl %edi, %eax movl $16, %edx subl $1, %eax cmove %edx, %eax ret After: bar: subl $1, %edi movl $16, %eax cmovne %edi, %eax ret These idioms were inspired by the source code of NIST SciMark4's Random_nextDouble function, where the tweaks above result in a ~1% improvement in the MonteCarlo benchmark kernel. 2021-07-30 Roger Sayle Uroš Bizjak gcc/ChangeLog * config/i386/i386.md (*dec_cmov): New define_insn_and_split to generate a conditional move using the carry flag after sub $1. (peephole2): Eliminate a register-to-register move by inverting the condition of a conditional move. gcc/testsuite/ChangeLog * gcc.target/i386/dec-cmov-1.c: New test. * gcc.target/i386/dec-cmov-2.c: New test. --- gcc/config/i386/i386.md | 87 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 44ae18e..73a4953 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -6756,6 +6756,29 @@ ? GEU : LTU, VOIDmode, cc, const0_rtx); }) +;; Help combine use borrow flag to test for -1 after dec (add $-1). +(define_insn_and_split "*dec_cmov" + [(set (match_operand:SWI248 0 "register_operand" "=r") + (if_then_else:SWI248 + (match_operator 1 "bt_comparison_operator" + [(match_operand:SWI248 2 "register_operand" "0") (const_int 0)]) + (plus:SWI248 (match_dup 2) (const_int -1)) + (match_operand:SWI248 3 "nonimmediate_operand" "rm"))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_CMOVE" + "#" + "&& reload_completed" + [(parallel [(set (reg:CC FLAGS_REG) + (compare:CC (match_dup 2) (const_int 1))) + (set (match_dup 0) (minus:SWI248 (match_dup 2) (const_int 1)))]) + (set (match_dup 0) + (if_then_else:SWI248 (match_dup 4) (match_dup 0) (match_dup 3)))] +{ + rtx cc = gen_rtx_REG (CCCmode, FLAGS_REG); + operands[4] = gen_rtx_fmt_ee (GET_CODE (operands[1]) == NE + ? GEU : LTU, VOIDmode, cc, const0_rtx); +}) + (define_insn "*subsi_3_zext" [(set (reg FLAGS_REG) (compare (match_operand:SI 1 "register_operand" "0") @@ -19182,6 +19205,70 @@ gcc_unreachable (); }) +;; Eliminate a reg-reg mov by inverting the condition of a cmov (#1). 
+;; mov r0,r1; dec r0; mov r2,r3; cmov r0,r2 -> dec r1; mov r0,r3; cmov r0, r1 +(define_peephole2 + [(set (match_operand:SWI248 0 "register_operand") + (match_operand:SWI248 1 "register_operand")) + (parallel [(set (reg FLAGS_REG) (match_operand 5)) + (set (match_dup 0) (match_operand:SWI248 6))]) + (set (match_operand:SWI248 2 "register_operand") + (match_operand:SWI248 3)) + (set (match_dup 0) + (if_then_else:SWI248 (match_operator 4 "ix86_comparison_operator" + [(reg FLAGS_REG) (const_int 0)]) + (match_dup 0) + (match_dup 2)))] + "TARGET_CMOVE + && REGNO (operands[2]) != REGNO (operands[0]) + && REGNO (operands[2]) != REGNO (operands[1]) + && peep2_reg_dead_p (1, operands[1]) + && peep2_reg_dead_p (4, operands[2]) + && !reg_overlap_mentioned_p (operands[0], operands[3])" + [(parallel [(set (match_dup 7) (match_dup 8)) + (set (match_dup 1) (match_dup 9))]) + (set (match_dup 0) (match_dup 3)) + (set (match_dup 0) (if_then_else:SWI248 (match_dup 4) + (match_dup 1) + (match_dup 0)))] +{ + operands[7] = SET_DEST (XVECEXP (PATTERN (peep2_next_insn (1)), 0, 0)); + operands[8] = replace_rtx (operands[5], operands[0], operands[1]); + operands[9] = replace_rtx (operands[6], operands[0], operands[1]); +}) + +;; Eliminate a reg-reg mov by inverting the condition of a cmov (#2). +;; mov r2,r3; mov r0,r1; dec r0; cmov r0,r2 -> dec r1; mov r0,r3; cmov r0, r1 +(define_peephole2 + [(set (match_operand:SWI248 2 "register_operand") + (match_operand:SWI248 3)) + (set (match_operand:SWI248 0 "register_operand") + (match_operand:SWI248 1 "register_operand")) + (parallel [(set (reg FLAGS_REG) (match_operand 5)) + (set (match_dup 0) (match_operand:SWI248 6))]) + (set (match_dup 0) + (if_then_else:SWI248 (match_operator 4 "ix86_comparison_operator" + [(reg FLAGS_REG) (const_int 0)]) + (match_dup 0) + (match_dup 2)))] + "TARGET_CMOVE + && REGNO (operands[2]) != REGNO (operands[0]) + && REGNO (operands[2]) != REGNO (operands[1]) + && peep2_reg_dead_p (2, operands[1]) + && peep2_reg_dead_p (4, operands[2]) + && !reg_overlap_mentioned_p (operands[0], operands[3])" + [(parallel [(set (match_dup 7) (match_dup 8)) + (set (match_dup 1) (match_dup 9))]) + (set (match_dup 0) (match_dup 3)) + (set (match_dup 0) (if_then_else:SWI248 (match_dup 4) + (match_dup 1) + (match_dup 0)))] +{ + operands[7] = SET_DEST (XVECEXP (PATTERN (peep2_next_insn (2)), 0, 0)); + operands[8] = replace_rtx (operands[5], operands[0], operands[1]); + operands[9] = replace_rtx (operands[6], operands[0], operands[1]); +}) + (define_expand "movcc" [(set (match_operand:X87MODEF 0 "register_operand") (if_then_else:X87MODEF -- cgit v1.1 From 9d352c68e8c8b642a36a6bcfc7f6b5dba11ac748 Mon Sep 17 00:00:00 2001 From: "Paul A. Clarke" Date: Mon, 12 Jul 2021 12:06:18 -0500 Subject: rs6000: Add support for SSE4.1 "blend" intrinsics _mm_blend_epi16 and _mm_blendv_epi8 were added earlier. Add these four to complete the set. 2021-07-30 Paul A. Clarke gcc * config/rs6000/smmintrin.h (_mm_blend_pd, _mm_blendv_pd, _mm_blend_ps, _mm_blendv_ps): New. 
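These four follow the usual SSE4.1 semantics: _mm_blend_ps and _mm_blend_pd
take element i from __B when bit i of the immediate is set and from __A
otherwise, while _mm_blendv_ps and _mm_blendv_pd select per element on the
sign bit of the corresponding mask element.  A minimal usage sketch
(illustrative only; the function names are not from the patch or its tests):

  #include <smmintrin.h>

  __m128 pick02 (__m128 a, __m128 b)
  {
    /* imm8 = 0x5: elements 0 and 2 from b, elements 1 and 3 from a.  */
    return _mm_blend_ps (a, b, 0x5);
  }

  __m128d pick_by_sign (__m128d a, __m128d b, __m128d m)
  {
    /* Per-element select on the sign bit of m.  */
    return _mm_blendv_pd (a, b, m);
  }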
--- gcc/config/rs6000/smmintrin.h | 60 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/smmintrin.h b/gcc/config/rs6000/smmintrin.h index 16fd34d..382f50c 100644 --- a/gcc/config/rs6000/smmintrin.h +++ b/gcc/config/rs6000/smmintrin.h @@ -116,6 +116,66 @@ _mm_blendv_epi8 (__m128i __A, __m128i __B, __m128i __mask) return (__m128i) vec_sel ((__v16qu) __A, (__v16qu) __B, __lmask); } +__inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blend_ps (__m128 __A, __m128 __B, const int __imm8) +{ + __v16qu __pcv[] = + { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15 }, + { 16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15 }, + { 0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15 }, + { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31 }, + { 16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31 }, + { 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31 }, + { 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 }, + { 16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 }, + { 0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }, + { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }, + }; + __v16qu __r = vec_perm ((__v16qu) __A, (__v16qu)__B, __pcv[__imm8]); + return (__m128) __r; +} + +__inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blendv_ps (__m128 __A, __m128 __B, __m128 __mask) +{ + const __v4si __zero = {0}; + const __vector __bool int __boolmask = vec_cmplt ((__v4si) __mask, __zero); + return (__m128) vec_sel ((__v4su) __A, (__v4su) __B, (__v4su) __boolmask); +} + +__inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blend_pd (__m128d __A, __m128d __B, const int __imm8) +{ + __v16qu __pcv[] = + { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 }, + { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 } + }; + __v16qu __r = vec_perm ((__v16qu) __A, (__v16qu)__B, __pcv[__imm8]); + return (__m128d) __r; +} + +__inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blendv_pd (__m128d __A, __m128d __B, __m128d __mask) +{ + const __v2di __zero = {0}; + const __vector __bool long long __boolmask = vec_cmplt ((__v2di) __mask, __zero); + return (__m128d) vec_sel ((__v2du) __A, (__v2du) __B, (__v2du) __boolmask); +} + __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_testz_si128 (__m128i __A, __m128i __B) -- cgit v1.1 From bd9a8737d478f7f1d01a9d5f1cc4309ffbb53103 Mon Sep 17 00:00:00 2001 From: "Paul A. Clarke" Date: Thu, 1 Jul 2021 17:04:51 -0500 Subject: rs6000: Add support for SSE4.1 "ceil" intrinsics 2021-07-30 Paul A. Clarke gcc * config/rs6000/smmintrin.h (_mm_ceil_pd, _mm_ceil_ps, _mm_ceil_sd, _mm_ceil_ss): New. 
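Like the blend intrinsics above, these follow the SSE4.1 semantics: the
packed forms (_mm_ceil_pd, _mm_ceil_ps) round every element up to an
integral value, while the scalar forms (_mm_ceil_sd, _mm_ceil_ss) round
only element 0 of the second argument and copy the remaining element(s)
from the first.  A short illustrative use (not taken from the patch):

  #include <smmintrin.h>

  __m128d ceil_low (__m128d a, __m128d b)
  {
    /* Result is { ceil (b[0]), a[1] }.  */
    return _mm_ceil_sd (a, b);
  }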
--- gcc/config/rs6000/smmintrin.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/smmintrin.h b/gcc/config/rs6000/smmintrin.h index 382f50c..919627b 100644 --- a/gcc/config/rs6000/smmintrin.h +++ b/gcc/config/rs6000/smmintrin.h @@ -232,4 +232,36 @@ _mm_test_mix_ones_zeros (__m128i __A, __m128i __mask) return any_ones * any_zeros; } +__inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ceil_pd (__m128d __A) +{ + return (__m128d) vec_ceil ((__v2df) __A); +} + +__inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ceil_sd (__m128d __A, __m128d __B) +{ + __v2df __r = vec_ceil ((__v2df) __B); + __r[1] = ((__v2df) __A)[1]; + return (__m128d) __r; +} + +__inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ceil_ps (__m128 __A) +{ + return (__m128) vec_ceil ((__v4sf) __A); +} + +__inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ceil_ss (__m128 __A, __m128 __B) +{ + __v4sf __r = (__v4sf) __A; + __r[0] = __builtin_ceil (((__v4sf) __B)[0]); + return __r; +} + #endif -- cgit v1.1 From 5f500715438761f59de5fb992267748c5d4dc4b6 Mon Sep 17 00:00:00 2001 From: "Paul A. Clarke" Date: Tue, 6 Jul 2021 17:31:21 -0500 Subject: rs6000: Add support for SSE4.1 "floor" intrinsics 2021-07-30 Paul A. Clarke gcc * config/rs6000/smmintrin.h (_mm_floor_pd, _mm_floor_ps, _mm_floor_sd, _mm_floor_ss): New. --- gcc/config/rs6000/smmintrin.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/smmintrin.h b/gcc/config/rs6000/smmintrin.h index 919627b..0145b92 100644 --- a/gcc/config/rs6000/smmintrin.h +++ b/gcc/config/rs6000/smmintrin.h @@ -248,6 +248,22 @@ _mm_ceil_sd (__m128d __A, __m128d __B) return (__m128d) __r; } +__inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_floor_pd (__m128d __A) +{ + return (__m128d) vec_floor ((__v2df) __A); +} + +__inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_floor_sd (__m128d __A, __m128d __B) +{ + __v2df __r = vec_floor ((__v2df) __B); + __r[1] = ((__v2df) __A)[1]; + return (__m128d) __r; +} + __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_ceil_ps (__m128 __A) @@ -264,4 +280,20 @@ _mm_ceil_ss (__m128 __A, __m128 __B) return __r; } +__inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_floor_ps (__m128 __A) +{ + return (__m128) vec_floor ((__v4sf) __A); +} + +__inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_floor_ss (__m128 __A, __m128 __B) +{ + __v4sf __r = (__v4sf) __A; + __r[0] = __builtin_floor (((__v4sf) __B)[0]); + return __r; +} + #endif -- cgit v1.1 From 91425e2adecd00091d7443104ecb367686e88663 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Sat, 31 Jul 2021 09:19:32 +0200 Subject: i386: Improve extensions of __builtin_clz and constant - __builtin_clz for -mno-lzcnt [PR78103] This patch improves emitted code for the non-TARGET_LZCNT case. As __builtin_clz* is UB on 0 argument and for !TARGET_LZCNT CLZ_VALUE_DEFINED_AT_ZERO is 0, it is UB even at RTL time and so we can take advantage of that and assume the result will be 0 to 31 or 0 to 63. Given that, sign or zero extension of that result are the same and are actually already performed by bsrl or xorl instructions. 
And constant - __builtin_clz* can be simplified into bsr + constant - bitmask. For TARGET_LZCNT, a lot of this is already fine as is (e.g. the sign or zero extensions), and other optimizations are IMHO not possible (if we have lzcnt, we've lost information on whether it is UB at zero or not and so can't transform it into bsr even when that is 1-2 insns shorter). The changes on the 3 testcases between unpatched and patched gcc are for -m64: pr78103-1.s: bsrq %rdi, %rax - xorq $63, %rax - cltq + xorl $63, %eax ... bsrq %rdi, %rax - xorq $63, %rax - cltq + xorl $63, %eax ... bsrl %edi, %eax xorl $31, %eax - cltq ... bsrl %edi, %eax xorl $31, %eax - cltq pr78103-2.s: bsrl %edi, %edi - movl $32, %eax - xorl $31, %edi - subl %edi, %eax + leal 1(%rdi), %eax ... - bsrl %edi, %edi - movl $31, %eax - xorl $31, %edi - subl %edi, %eax + bsrl %edi, %eax ... bsrq %rdi, %rdi - movl $64, %eax - xorq $63, %rdi - subl %edi, %eax + leal 1(%rdi), %eax ... - bsrq %rdi, %rdi - movl $63, %eax - xorq $63, %rdi - subl %edi, %eax + bsrq %rdi, %rax pr78103-3.s: bsrl %edi, %edi - movl $32, %eax - xorl $31, %edi - movslq %edi, %rdi - subq %rdi, %rax + leaq 1(%rdi), %rax ... - bsrl %edi, %edi - movl $31, %eax - xorl $31, %edi - movslq %edi, %rdi - subq %rdi, %rax + bsrl %edi, %eax ... bsrq %rdi, %rdi - movl $64, %eax - xorq $63, %rdi - movslq %edi, %rdi - subq %rdi, %rax + leaq 1(%rdi), %rax ... - bsrq %rdi, %rdi - movl $63, %eax - xorq $63, %rdi - movslq %edi, %rdi - subq %rdi, %rax + bsrq %rdi, %rax Most of the changes are done with combine splitters, but for *bsr_rex64_2 and *bsr_2 I had to use define_insn_and_split, because as mentioned in the PR the combiner unfortunately doesn't create LOG_LINKS in between the two insns created by combine splitter, so it can't be combined further with following instructions. 2021-07-31 Jakub Jelinek PR target/78103 * config/i386/i386.md (bsr_rex64_1, bsr_1, bsr_zext_1): New define_insn patterns. (*bsr_rex64_2, *bsr_2): New define_insn_and_split patterns. Add combine splitters for constant - clz. (clz2): Use a temporary pseudo for bsr result. * gcc.target/i386/pr78103-1.c: New test. * gcc.target/i386/pr78103-2.c: New test. * gcc.target/i386/pr78103-3.c: New test. 
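The new tests are not reproduced here; the source-level idioms the
splitters target look roughly like the following (hypothetical examples,
undefined for a zero argument exactly as the patterns assume):

  long f1 (unsigned long x) { return __builtin_clzll (x); }     /* extension of the bsr result folded away */
  int  f2 (unsigned int x)  { return 32 - __builtin_clz (x); }   /* bsr plus lea/add, no xor/sub sequence */
  long f3 (unsigned long x) { return 64 - __builtin_clzll (x); }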
--- gcc/config/i386/i386.md | 210 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 205 insertions(+), 5 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 73a4953..c9787d7 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -14784,6 +14784,18 @@ (set_attr "znver1_decode" "vector") (set_attr "mode" "DI")]) +(define_insn "bsr_rex64_1" + [(set (match_operand:DI 0 "register_operand" "=r") + (minus:DI (const_int 63) + (clz:DI (match_operand:DI 1 "nonimmediate_operand" "rm")))) + (clobber (reg:CC FLAGS_REG))] + "!TARGET_LZCNT && TARGET_64BIT" + "bsr{q}\t{%1, %0|%0, %1}" + [(set_attr "type" "alu1") + (set_attr "prefix_0f" "1") + (set_attr "znver1_decode" "vector") + (set_attr "mode" "DI")]) + (define_insn "bsr" [(set (reg:CCZ FLAGS_REG) (compare:CCZ (match_operand:SI 1 "nonimmediate_operand" "rm") @@ -14798,17 +14810,204 @@ (set_attr "znver1_decode" "vector") (set_attr "mode" "SI")]) +(define_insn "bsr_1" + [(set (match_operand:SI 0 "register_operand" "=r") + (minus:SI (const_int 31) + (clz:SI (match_operand:SI 1 "nonimmediate_operand" "rm")))) + (clobber (reg:CC FLAGS_REG))] + "!TARGET_LZCNT" + "bsr{l}\t{%1, %0|%0, %1}" + [(set_attr "type" "alu1") + (set_attr "prefix_0f" "1") + (set_attr "znver1_decode" "vector") + (set_attr "mode" "SI")]) + +(define_insn "bsr_zext_1" + [(set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI + (minus:SI + (const_int 31) + (clz:SI (match_operand:SI 1 "nonimmediate_operand" "rm"))))) + (clobber (reg:CC FLAGS_REG))] + "!TARGET_LZCNT && TARGET_64BIT" + "bsr{l}\t{%1, %k0|%k0, %1}" + [(set_attr "type" "alu1") + (set_attr "prefix_0f" "1") + (set_attr "znver1_decode" "vector") + (set_attr "mode" "SI")]) + +; As bsr is undefined behavior on zero and for other input +; values it is in range 0 to 63, we can optimize away sign-extends. +(define_insn_and_split "*bsr_rex64_2" + [(set (match_operand:DI 0 "register_operand") + (xor:DI + (sign_extend:DI + (minus:SI + (const_int 63) + (subreg:SI (clz:DI (match_operand:DI 1 "nonimmediate_operand")) + 0))) + (const_int 63))) + (clobber (reg:CC FLAGS_REG))] + "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()" + "#" + "&& 1" + [(parallel [(set (reg:CCZ FLAGS_REG) + (compare:CCZ (match_dup 1) (const_int 0))) + (set (match_dup 2) + (minus:DI (const_int 63) (clz:DI (match_dup 1))))]) + (parallel [(set (match_dup 0) + (zero_extend:DI (xor:SI (match_dup 3) (const_int 63)))) + (clobber (reg:CC FLAGS_REG))])] +{ + operands[2] = gen_reg_rtx (DImode); + operands[3] = lowpart_subreg (SImode, operands[2], DImode); +}) + +(define_insn_and_split "*bsr_2" + [(set (match_operand:DI 0 "register_operand") + (sign_extend:DI + (xor:SI + (minus:SI + (const_int 31) + (clz:SI (match_operand:SI 1 "nonimmediate_operand"))) + (const_int 31)))) + (clobber (reg:CC FLAGS_REG))] + "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()" + "#" + "&& 1" + [(parallel [(set (reg:CCZ FLAGS_REG) + (compare:CCZ (match_dup 1) (const_int 0))) + (set (match_dup 2) + (minus:SI (const_int 31) (clz:SI (match_dup 1))))]) + (parallel [(set (match_dup 0) + (zero_extend:DI (xor:SI (match_dup 2) (const_int 31)))) + (clobber (reg:CC FLAGS_REG))])] + "operands[2] = gen_reg_rtx (SImode);") + +; Splitters to optimize 64 - __builtin_clzl (x) or 32 - __builtin_clz (x). +; Again, as for !TARGET_LZCNT CLZ is UB at zero, CLZ is guaranteed to be +; in [0, 63] or [0, 31] range. 
+(define_split + [(set (match_operand:SI 0 "register_operand") + (minus:SI + (match_operand:SI 2 "const_int_operand") + (xor:SI + (minus:SI (const_int 63) + (subreg:SI + (clz:DI (match_operand:DI 1 "nonimmediate_operand")) + 0)) + (const_int 63))))] + "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()" + [(set (match_dup 3) + (minus:DI (const_int 63) (clz:DI (match_dup 1)))) + (set (match_dup 0) + (plus:SI (match_dup 5) (match_dup 4)))] +{ + operands[3] = gen_reg_rtx (DImode); + operands[5] = lowpart_subreg (SImode, operands[3], DImode); + if (INTVAL (operands[2]) == 63) + { + emit_insn (gen_bsr_rex64_1 (operands[3], operands[1])); + emit_move_insn (operands[0], operands[5]); + DONE; + } + operands[4] = gen_int_mode (UINTVAL (operands[2]) - 63, SImode); +}) + +(define_split + [(set (match_operand:SI 0 "register_operand") + (minus:SI + (match_operand:SI 2 "const_int_operand") + (xor:SI + (minus:SI (const_int 31) + (clz:SI (match_operand:SI 1 "nonimmediate_operand"))) + (const_int 31))))] + "!TARGET_LZCNT && ix86_pre_reload_split ()" + [(set (match_dup 3) + (minus:SI (const_int 31) (clz:SI (match_dup 1)))) + (set (match_dup 0) + (plus:SI (match_dup 3) (match_dup 4)))] +{ + if (INTVAL (operands[2]) == 31) + { + emit_insn (gen_bsr_1 (operands[0], operands[1])); + DONE; + } + operands[3] = gen_reg_rtx (SImode); + operands[4] = gen_int_mode (UINTVAL (operands[2]) - 31, SImode); +}) + +(define_split + [(set (match_operand:DI 0 "register_operand") + (minus:DI + (match_operand:DI 2 "const_int_operand") + (xor:DI + (sign_extend:DI + (minus:SI (const_int 63) + (subreg:SI + (clz:DI (match_operand:DI 1 "nonimmediate_operand")) + 0))) + (const_int 63))))] + "!TARGET_LZCNT + && TARGET_64BIT + && ix86_pre_reload_split () + && ((unsigned HOST_WIDE_INT) + trunc_int_for_mode (UINTVAL (operands[2]) - 63, SImode) + == UINTVAL (operands[2]) - 63)" + [(set (match_dup 3) + (minus:DI (const_int 63) (clz:DI (match_dup 1)))) + (set (match_dup 0) + (plus:DI (match_dup 3) (match_dup 4)))] +{ + if (INTVAL (operands[2]) == 63) + { + emit_insn (gen_bsr_rex64_1 (operands[0], operands[1])); + DONE; + } + operands[3] = gen_reg_rtx (DImode); + operands[4] = GEN_INT (UINTVAL (operands[2]) - 63); +}) + +(define_split + [(set (match_operand:DI 0 "register_operand") + (minus:DI + (match_operand:DI 2 "const_int_operand") + (sign_extend:DI + (xor:SI + (minus:SI (const_int 31) + (clz:SI (match_operand:SI 1 "nonimmediate_operand"))) + (const_int 31)))))] + "!TARGET_LZCNT + && TARGET_64BIT + && ix86_pre_reload_split () + && ((unsigned HOST_WIDE_INT) + trunc_int_for_mode (UINTVAL (operands[2]) - 31, SImode) + == UINTVAL (operands[2]) - 31)" + [(set (match_dup 3) + (zero_extend:DI (minus:SI (const_int 31) (clz:SI (match_dup 1))))) + (set (match_dup 0) + (plus:DI (match_dup 3) (match_dup 4)))] +{ + if (INTVAL (operands[2]) == 31) + { + emit_insn (gen_bsr_zext_1 (operands[0], operands[1])); + DONE; + } + operands[3] = gen_reg_rtx (DImode); + operands[4] = GEN_INT (UINTVAL (operands[2]) - 31); +}) + (define_expand "clz2" [(parallel [(set (reg:CCZ FLAGS_REG) (compare:CCZ (match_operand:SWI48 1 "nonimmediate_operand" "rm") (const_int 0))) - (set (match_operand:SWI48 0 "register_operand") - (minus:SWI48 - (match_dup 2) - (clz:SWI48 (match_dup 1))))]) + (set (match_dup 3) (minus:SWI48 + (match_dup 2) + (clz:SWI48 (match_dup 1))))]) (parallel - [(set (match_dup 0) (xor:SWI48 (match_dup 0) (match_dup 2))) + [(set (match_operand:SWI48 0 "register_operand") + (xor:SWI48 (match_dup 3) (match_dup 2))) (clobber (reg:CC FLAGS_REG))])] "" { @@ 
-14818,6 +15017,7 @@ DONE; } operands[2] = GEN_INT (GET_MODE_BITSIZE (mode)-1); + operands[3] = gen_reg_rtx (mode); }) (define_insn_and_split "clz2_lzcnt" -- cgit v1.1 From 6f0c43e97825ee54e3779afbedcd0def12443001 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Sun, 1 Aug 2021 09:55:33 -0700 Subject: i386: Improve SImode constant - __builtin_clzll for -mno-lzcnt Add a zero_extend patten for bsr_rex64_1 and use it to split SImode constant - __builtin_clzll to avoid unncessary zero_extend. gcc/ PR target/78103 * config/i386/i386.md (bsr_rex64_1_zext): New. (combine splitter for constant - clzll): Replace gen_bsr_rex64_1 with gen_bsr_rex64_1_zext. gcc/testsuite/ PR target/78103 * gcc.target/i386/pr78103-2.c: Also scan incl. * gcc.target/i386/pr78103-3.c: Scan leal|addl|incl for x32. Also scan incq. --- gcc/config/i386/i386.md | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index c9787d7..0c23ddb 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -14796,6 +14796,21 @@ (set_attr "znver1_decode" "vector") (set_attr "mode" "DI")]) +(define_insn "bsr_rex64_1_zext" + [(set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI + (minus:SI (const_int 63) + (subreg:SI + (clz:DI (match_operand:DI 1 "nonimmediate_operand" "rm")) + 0)))) + (clobber (reg:CC FLAGS_REG))] + "!TARGET_LZCNT && TARGET_64BIT" + "bsr{q}\t{%1, %0|%0, %1}" + [(set_attr "type" "alu1") + (set_attr "prefix_0f" "1") + (set_attr "znver1_decode" "vector") + (set_attr "mode" "DI")]) + (define_insn "bsr" [(set (reg:CCZ FLAGS_REG) (compare:CCZ (match_operand:SI 1 "nonimmediate_operand" "rm") @@ -14907,7 +14922,7 @@ operands[5] = lowpart_subreg (SImode, operands[3], DImode); if (INTVAL (operands[2]) == 63) { - emit_insn (gen_bsr_rex64_1 (operands[3], operands[1])); + emit_insn (gen_bsr_rex64_1_zext (operands[3], operands[1])); emit_move_insn (operands[0], operands[5]); DONE; } -- cgit v1.1 From 1bee034e012d1146d34b0d767fe28a485c210e4b Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Mon, 2 Aug 2021 10:01:46 -0700 Subject: x86: Add TARGET_GEN_MEMSET_SCRATCH_RTX Define TARGET_GEN_MEMSET_SCRATCH_RTX to ix86_gen_scratch_sse_rtx to return a scratch SSE register for memset. gcc/ PR middle-end/90773 * config/i386/i386.c (TARGET_GEN_MEMSET_SCRATCH_RTX): New. gcc/testsuite/ PR middle-end/90773 * gcc.target/i386/pr90773-5.c: Updated to expect XMM register. * gcc.target/i386/pr90773-14.c: Likewise. * gcc.target/i386/pr90773-15.c: New test. * gcc.target/i386/pr90773-16.c: Likewise. * gcc.target/i386/pr90773-17.c: Likewise. * gcc.target/i386/pr90773-18.c: Likewise. * gcc.target/i386/pr90773-19.c: Likewise. --- gcc/config/i386/i386.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index a0285e6..5d20ca2 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -23313,7 +23313,8 @@ ix86_optab_supported_p (int op, machine_mode mode1, machine_mode, } } -/* Return a scratch register in MODE for vector load and store. */ +/* Implement the TARGET_GEN_MEMSET_SCRATCH_RTX hook. Return a scratch + register in MODE for vector load and store. 
*/ rtx ix86_gen_scratch_sse_rtx (machine_mode mode) @@ -24232,6 +24233,9 @@ static bool ix86_libc_has_fast_function (int fcode ATTRIBUTE_UNUSED) #undef TARGET_LIBC_HAS_FAST_FUNCTION #define TARGET_LIBC_HAS_FAST_FUNCTION ix86_libc_has_fast_function +#undef TARGET_GEN_MEMSET_SCRATCH_RTX +#define TARGET_GEN_MEMSET_SCRATCH_RTX ix86_gen_scratch_sse_rtx + #if CHECKING_P #undef TARGET_RUN_TARGET_SELFTESTS #define TARGET_RUN_TARGET_SELFTESTS selftest::ix86_run_selftests -- cgit v1.1 From 7f4c3943f795fda33df648d2196b678bada1ba81 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Mon, 2 Aug 2021 10:01:46 -0700 Subject: x86: Avoid stack realignment when copying data To avoid stack realignment, use SCRATCH_SSE_REG to copy data from one memory location to another. gcc/ * config/i386/i386-expand.c (ix86_expand_vector_move): Call ix86_gen_scratch_sse_rtx to get a scratch SSE register to copy data from one memory location to another. gcc/testsuite/ * gcc.target/i386/eh_return-1.c: New test. --- gcc/config/i386/i386-expand.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 896bd68..1d469bf 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -625,7 +625,9 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[]) && !register_operand (op0, mode) && !register_operand (op1, mode)) { - emit_move_insn (op0, force_reg (GET_MODE (op0), op1)); + rtx tmp = ix86_gen_scratch_sse_rtx (GET_MODE (op0)); + emit_move_insn (tmp, op1); + emit_move_insn (op0, tmp); return; } -- cgit v1.1 From 29f0e955c97da002b5adb4e8c9dfd2ea9709e207 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Mon, 2 Aug 2021 10:01:46 -0700 Subject: x86: Update piecewise move and store We can use TImode/OImode/XImode integers for piecewise move and store. 1. Define MAX_MOVE_MAX to 64, which is the constant maximum number of bytes that a single instruction can move quickly between memory and registers or between two memory locations. 2. Define MOVE_MAX to the maximum number of bytes we can move from memory to memory in one reasonably fast instruction. The difference between MAX_MOVE_MAX and MOVE_MAX is that MAX_MOVE_MAX must be a constant, independent of compiler options, since it is used in reload.h to define struct target_reload and MOVE_MAX can vary, depending on compiler options. 3. When vector register is used for piecewise move and store, we don't increase stack_alignment_needed since vector register spill isn't required for piecewise move and store. Since stack_realign_needed is set to true by checking stack_alignment_estimated set by pseudo vector register usage, we also need to check stack_realign_needed to eliminate frame pointer. gcc/ * config/i386/i386.c (ix86_finalize_stack_frame_flags): Also check stack_realign_needed for stack realignment. (ix86_legitimate_constant_p): Always allow CONST_WIDE_INT smaller than the largest integer supported by vector register. * config/i386/i386.h (MAX_MOVE_MAX): New. Set to 64. (MOVE_MAX): Set to bytes of the largest integer supported by vector register. (STORE_MAX_PIECES): New. gcc/testsuite/ * gcc.target/i386/pr90773-1.c: Adjust to expect movq for 32-bit. * gcc.target/i386/pr90773-4.c: Also run for 32-bit. * gcc.target/i386/pr90773-15.c: Likewise. * gcc.target/i386/pr90773-16.c: Likewise. * gcc.target/i386/pr90773-17.c: Likewise. * gcc.target/i386/pr90773-24.c: Likewise. * gcc.target/i386/pr90773-25.c: Likewise. * gcc.target/i386/pr100865-1.c: Likewise. 
* gcc.target/i386/pr100865-2.c: Likewise. * gcc.target/i386/pr100865-3.c: Likewise. * gcc.target/i386/pr90773-14.c: Also run for 32-bit and expect XMM movd to store 4 bytes. * gcc.target/i386/pr100865-4a.c: Also run for 32-bit and expect YMM registers. * gcc.target/i386/pr100865-4b.c: Likewise. * gcc.target/i386/pr100865-10a.c: Expect YMM registers. * gcc.target/i386/pr100865-10b.c: Likewise. --- gcc/config/i386/i386.c | 21 +++++++++++++++++--- gcc/config/i386/i386.h | 53 +++++++++++++++++++++++++++++++++----------------- 2 files changed, 53 insertions(+), 21 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 5d20ca2..842eb0e 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -7953,8 +7953,17 @@ ix86_finalize_stack_frame_flags (void) assumed stack realignment might be needed or -fno-omit-frame-pointer is used, but in the end nothing that needed the stack alignment had been spilled nor stack access, clear frame_pointer_needed and say we - don't need stack realignment. */ - if ((stack_realign || (!flag_omit_frame_pointer && optimize)) + don't need stack realignment. + + When vector register is used for piecewise move and store, we don't + increase stack_alignment_needed as there is no register spill for + piecewise move and store. Since stack_realign_needed is set to true + by checking stack_alignment_estimated which is updated by pseudo + vector register usage, we also need to check stack_realign_needed to + eliminate frame pointer. */ + if ((stack_realign + || (!flag_omit_frame_pointer && optimize) + || crtl->stack_realign_needed) && frame_pointer_needed && crtl->is_leaf && crtl->sp_is_unchanging @@ -10418,7 +10427,13 @@ ix86_legitimate_constant_p (machine_mode mode, rtx x) /* FALLTHRU */ case E_OImode: case E_XImode: - if (!standard_sse_constant_p (x, mode)) + if (!standard_sse_constant_p (x, mode) + && GET_MODE_SIZE (TARGET_AVX512F + ? XImode + : (TARGET_AVX + ? OImode + : (TARGET_SSE2 + ? TImode : DImode))) < GET_MODE_SIZE (mode)) return false; default: break; diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index d1e1c22..bed9cd9 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -1757,24 +1757,41 @@ typedef struct ix86_args { /* Define this as 1 if `char' should by default be signed; else as 0. */ #define DEFAULT_SIGNED_CHAR 1 -/* Max number of bytes we can move from memory to memory - in one reasonably fast instruction. */ -#define MOVE_MAX 16 - -/* MOVE_MAX_PIECES is the number of bytes at a time which we can - move efficiently, as opposed to MOVE_MAX which is the maximum - number of bytes we can move with a single instruction. - - ??? We should use TImode in 32-bit mode and use OImode or XImode - if they are available. But since by_pieces_ninsns determines the - widest mode with MAX_FIXED_MODE_SIZE, we can only use TImode in - 64-bit mode. */ -#define MOVE_MAX_PIECES \ - ((TARGET_64BIT \ - && TARGET_SSE2 \ - && TARGET_SSE_UNALIGNED_LOAD_OPTIMAL \ - && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \ - ? GET_MODE_SIZE (TImode) : UNITS_PER_WORD) +/* The constant maximum number of bytes that a single instruction can + move quickly between memory and registers or between two memory + locations. */ +#define MAX_MOVE_MAX 64 + +/* Max number of bytes we can move from memory to memory in one + reasonably fast instruction, as opposed to MOVE_MAX_PIECES which + is the number of bytes at a time which we can move efficiently. + MOVE_MAX_PIECES defaults to MOVE_MAX. 
*/ + +#define MOVE_MAX \ + ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \ + ? 64 \ + : ((TARGET_AVX \ + && !TARGET_PREFER_AVX128 \ + && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD \ + && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \ + ? 32 \ + : ((TARGET_SSE2 \ + && TARGET_SSE_UNALIGNED_LOAD_OPTIMAL \ + && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \ + ? 16 : UNITS_PER_WORD))) + +/* STORE_MAX_PIECES is the number of bytes at a time that we can + store efficiently. */ +#define STORE_MAX_PIECES \ + ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \ + ? 64 \ + : ((TARGET_AVX \ + && !TARGET_PREFER_AVX128 \ + && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \ + ? 32 \ + : ((TARGET_SSE2 \ + && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \ + ? 16 : UNITS_PER_WORD))) /* If a memory-to-memory move would take MOVE_RATIO or more simple move-instruction pairs, we will do a cpymem or libcall instead. -- cgit v1.1 From 724adffe65e1268e8b3f2ab538660020f7572114 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Tue, 27 Jul 2021 18:08:38 +0800 Subject: Support cond_add/sub/mul/div for vector float/double. gcc/ChangeLog: * config/i386/sse.md (cond_):New expander. (cond_mul): Ditto. (cond_div): Ditto. gcc/testsuite/ChangeLog: * gcc.target/i386/cond_op_addsubmuldiv_double-1.c: New test. * gcc.target/i386/cond_op_addsubmuldiv_double-2.c: New test. * gcc.target/i386/cond_op_addsubmuldiv_float-1.c: New test. * gcc.target/i386/cond_op_addsubmuldiv_float-2.c: New test. --- gcc/config/i386/sse.md | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index b5a0898..8bf1764 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -1891,6 +1891,24 @@ } [(set_attr "isa" "noavx,noavx,avx,avx")]) +(define_expand "cond_" + [(set (match_operand:VF 0 "register_operand") + (vec_merge:VF + (plusminus:VF + (match_operand:VF 2 "vector_operand") + (match_operand:VF 3 "vector_operand")) + (match_operand:VF 4 "nonimm_or_0_operand") + (match_operand: 1 "register_operand")))] + " == 64 || TARGET_AVX512VL" +{ + emit_insn (gen_3_mask (operands[0], + operands[2], + operands[3], + operands[4], + operands[1])); + DONE; +}) + (define_expand "3" [(set (match_operand:VF 0 "register_operand") (plusminus:VF @@ -1953,6 +1971,24 @@ (set_attr "prefix" "") (set_attr "mode" "")]) +(define_expand "cond_mul" + [(set (match_operand:VF 0 "register_operand") + (vec_merge:VF + (mult:VF + (match_operand:VF 2 "vector_operand") + (match_operand:VF 3 "vector_operand")) + (match_operand:VF 4 "nonimm_or_0_operand") + (match_operand: 1 "register_operand")))] + " == 64 || TARGET_AVX512VL" +{ + emit_insn (gen_mul3_mask (operands[0], + operands[2], + operands[3], + operands[4], + operands[1])); + DONE; +}) + (define_expand "mul3" [(set (match_operand:VF 0 "register_operand") (mult:VF @@ -2041,6 +2077,24 @@ } }) +(define_expand "cond_div" + [(set (match_operand:VF 0 "register_operand") + (vec_merge:VF + (div:VF + (match_operand:VF 2 "register_operand") + (match_operand:VF 3 "vector_operand")) + (match_operand:VF 4 "nonimm_or_0_operand") + (match_operand: 1 "register_operand")))] + " == 64 || TARGET_AVX512VL" +{ + emit_insn (gen__div3_mask (operands[0], + operands[2], + operands[3], + operands[4], + operands[1])); + DONE; +}) + (define_insn "_div3" [(set (match_operand:VF 0 "register_operand" "=x,v") (div:VF -- cgit v1.1 From d0b952edd3f5753332ea234ef261711a81e87229 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Tue, 3 Aug 2021 13:22:11 +0800 Subject: Add cond_add/sub/mul for vector 
integer modes. gcc/ChangeLog: * config/i386/sse.md (cond_): New expander. (cond_mul): Ditto. gcc/testsuite/ChangeLog: * gcc.target/i386/cond_op_addsubmul_d-1.c: New test. * gcc.target/i386/cond_op_addsubmul_d-2.c: New test. * gcc.target/i386/cond_op_addsubmul_q-1.c: New test. * gcc.target/i386/cond_op_addsubmul_q-2.c: New test. * gcc.target/i386/cond_op_addsubmul_w-1.c: New test. * gcc.target/i386/cond_op_addsubmul_w-2.c: New test. --- gcc/config/i386/sse.md | 88 +++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 80 insertions(+), 8 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 8bf1764..52b2b42 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -333,6 +333,14 @@ [V16SI (V8SI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL") V8DI (V4DI "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL")]) +(define_mode_iterator VI1248_AVX512VLBW + [(V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX512VL && TARGET_AVX512BW") + (V16QI "TARGET_AVX512VL && TARGET_AVX512BW") + (V32HI "TARGET_AVX512BW") (V16HI "TARGET_AVX512VL && TARGET_AVX512BW") + (V8HI "TARGET_AVX512VL && TARGET_AVX512BW") + V16SI (V8SI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL") + V8DI (V4DI "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL")]) + (define_mode_iterator VF_AVX512VL [V16SF (V8SF "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL") V8DF (V4DF "TARGET_AVX512VL") (V2DF "TARGET_AVX512VL")]) @@ -11803,6 +11811,24 @@ "TARGET_SSE2" "ix86_fixup_binary_operands_no_copy (, mode, operands);") +(define_expand "cond_" + [(set (match_operand:VI1248_AVX512VLBW 0 "register_operand") + (vec_merge:VI1248_AVX512VLBW + (plusminus:VI1248_AVX512VLBW + (match_operand:VI1248_AVX512VLBW 2 "nonimmediate_operand") + (match_operand:VI1248_AVX512VLBW 3 "nonimmediate_operand")) + (match_operand:VI1248_AVX512VLBW 4 "nonimm_or_0_operand") + (match_operand: 1 "register_operand")))] + "TARGET_AVX512F" +{ + emit_insn (gen_3_mask (operands[0], + operands[2], + operands[3], + operands[4], + operands[1])); + DONE; +}) + (define_expand "3_mask" [(set (match_operand:VI48_AVX512VL 0 "register_operand") (vec_merge:VI48_AVX512VL @@ -11929,6 +11955,24 @@ DONE; }) +(define_expand "cond_mul" + [(set (match_operand:VI2_AVX512VL 0 "register_operand") + (vec_merge:VI2_AVX512VL + (mult:VI2_AVX512VL + (match_operand:VI2_AVX512VL 2 "vector_operand") + (match_operand:VI2_AVX512VL 3 "vector_operand")) + (match_operand:VI2_AVX512VL 4 "nonimm_or_0_operand") + (match_operand: 1 "register_operand")))] + "TARGET_AVX512BW" +{ + emit_insn (gen_mul3_mask (operands[0], + operands[2], + operands[3], + operands[4], + operands[1])); + DONE; +}) + (define_expand "mul3" [(set (match_operand:VI2_AVX2 0 "register_operand") (mult:VI2_AVX2 (match_operand:VI2_AVX2 1 "vector_operand") @@ -12363,6 +12407,24 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "TI")]) +(define_expand "cond_mul" + [(set (match_operand:VI8_AVX512VL 0 "register_operand") + (vec_merge:VI8_AVX512VL + (mult:VI8_AVX512VL + (match_operand:VI8_AVX512VL 2 "vector_operand") + (match_operand:VI8_AVX512VL 3 "vector_operand")) + (match_operand:VI8_AVX512VL 4 "nonimm_or_0_operand") + (match_operand: 1 "register_operand")))] + "TARGET_AVX512DQ" +{ + emit_insn (gen_avx512dq_mul3_mask (operands[0], + operands[2], + operands[3], + operands[4], + operands[1])); + DONE; +}) + (define_insn "avx512dq_mul3" [(set (match_operand:VI8_AVX512VL 0 "register_operand" "=v") (mult:VI8_AVX512VL @@ -12375,6 +12437,24 @@ (set_attr "prefix" "evex") (set_attr "mode" "")]) +(define_expand "cond_mul" + 
[(set (match_operand:VI4_AVX512VL 0 "register_operand") + (vec_merge:VI4_AVX512VL + (mult:VI4_AVX512VL + (match_operand:VI4_AVX512VL 2 "vector_operand") + (match_operand:VI4_AVX512VL 3 "vector_operand")) + (match_operand:VI4_AVX512VL 4 "nonimm_or_0_operand") + (match_operand: 1 "register_operand")))] + "TARGET_AVX512F" +{ + emit_insn (gen_mul3_mask (operands[0], + operands[2], + operands[3], + operands[4], + operands[1])); + DONE; +}) + (define_expand "mul3" [(set (match_operand:VI4_AVX512F 0 "register_operand") (mult:VI4_AVX512F @@ -14043,14 +14123,6 @@ ] (const_string "")))]) -(define_mode_iterator VI1248_AVX512VLBW - [(V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX512VL && TARGET_AVX512BW") - (V16QI "TARGET_AVX512VL && TARGET_AVX512BW") - (V32HI "TARGET_AVX512BW") (V16HI "TARGET_AVX512VL && TARGET_AVX512BW") - (V8HI "TARGET_AVX512VL && TARGET_AVX512BW") - V16SI (V8SI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL") - V8DI (V4DI "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL")]) - (define_mode_iterator AVX512ZEXTMASK [(DI "TARGET_AVX512BW") (SI "TARGET_AVX512BW") HI]) -- cgit v1.1 From fa3ca6151ccac0e215727641eee36abf6e437b26 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Tue, 3 Aug 2021 13:00:43 +0100 Subject: aarch64: Turn sve_width tuning field into a bitmask The tuning structures have an sve_width field that specifies the number of bits in an SVE vector (or SVE_NOT_IMPLEMENTED if not applicable). This patch turns the field into a bitmask so that it can specify multiple widths at the same time. For now we always treat the mininum width as the likely width. An alternative would have been to add extra fields, which would have coped correctly with non-power-of-2 widths. However, we're very far from supporting constant non-power-of-2 vectors in GCC, so I think the non-power-of-2 case will in reality always have to be hidden behind VLA. gcc/ * config/aarch64/aarch64-protos.h (tune_params::sve_width): Turn into a bitmask. * config/aarch64/aarch64.c (aarch64_cmp_autovec_modes): Update accordingly. (aarch64_estimated_poly_value): Likewise. Use the least significant set bit for the minimum and likely values. Use the most significant set bit for the maximum value. --- gcc/config/aarch64/aarch64-protos.h | 8 ++++---- gcc/config/aarch64/aarch64.c | 15 ++++++++++----- 2 files changed, 14 insertions(+), 9 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index c203338..fb4ce8e 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -506,10 +506,10 @@ struct tune_params const struct cpu_vector_cost *vec_costs; const struct cpu_branch_cost *branch_costs; const struct cpu_approx_modes *approx_modes; - /* Width of the SVE registers or SVE_NOT_IMPLEMENTED if not applicable. - Only used for tuning decisions, does not disable VLA - vectorization. */ - enum aarch64_sve_vector_bits_enum sve_width; + /* A bitmask of the possible SVE register widths in bits, + or SVE_NOT_IMPLEMENTED if not applicable. Only used for tuning + decisions, does not disable VLA vectorization. 
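Decoding the bitmask is simple bit manipulation; the patch uses GCC's
least_bit_hwi and floor_log2 helpers, and a stand-alone sketch of the
equivalent computation (illustrative names, assuming a non-zero mask of
power-of-two widths) is:

  unsigned int likely_bits (unsigned int mask)
  { return mask & -mask; }                        /* lowest set bit */

  unsigned int max_bits (unsigned int mask)
  { return 1u << (31 - __builtin_clz (mask)); }   /* highest set bit */

So a tuning entry of, say, (128 | 256) is treated as 128 bits for the
minimum and likely estimates and as 256 bits for the maximum.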
*/ + unsigned int sve_width; int memmov_cost; int issue_rate; unsigned int fusible_ops; diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index e211460..1a8cd13 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -19144,14 +19144,12 @@ aarch64_cmp_autovec_modes (machine_mode sve_m, machine_mode asimd_m) bool prefer_asimd = aarch64_autovec_preference == 3; bool prefer_sve = aarch64_autovec_preference == 4; - aarch64_sve_vector_bits_enum tune_width = aarch64_tune_params.sve_width; - poly_int64 nunits_sve = GET_MODE_NUNITS (sve_m); poly_int64 nunits_asimd = GET_MODE_NUNITS (asimd_m); /* If the CPU information does not have an SVE width registered use the generic poly_int comparison that prefers SVE. If a preference is explicitly requested avoid this path. */ - if (tune_width == SVE_SCALABLE + if (aarch64_tune_params.sve_width == SVE_SCALABLE && !prefer_asimd && !prefer_sve) return maybe_gt (nunits_sve, nunits_asimd); @@ -24980,8 +24978,7 @@ aarch64_estimated_poly_value (poly_int64 val, poly_value_estimate_kind kind = POLY_VALUE_LIKELY) { - enum aarch64_sve_vector_bits_enum width_source - = aarch64_tune_params.sve_width; + unsigned int width_source = aarch64_tune_params.sve_width; /* If there is no core-specific information then the minimum and likely values are based on 128-bit vectors and the maximum is based on @@ -24996,6 +24993,14 @@ aarch64_estimated_poly_value (poly_int64 val, return val.coeffs[0] + val.coeffs[1] * 15; } + /* Allow sve_width to be a bitmask of different VL, treating the lowest + as likely. This could be made more general if future -mtune options + need it to be. */ + if (kind == POLY_VALUE_MAX) + width_source = 1 << floor_log2 (width_source); + else + width_source = least_bit_hwi (width_source); + /* If the core provides width information, use that. */ HOST_WIDE_INT over_128 = width_source - 128; return val.coeffs[0] + val.coeffs[1] * over_128 / 128; -- cgit v1.1 From 83d796d3e58badcb864d179b882979f714ffd162 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Tue, 3 Aug 2021 13:00:44 +0100 Subject: aarch64: Add a simple fixed-point class for costing This patch adds a simple fixed-point class for holding fractional cost values. It can exactly represent the reciprocal of any single-vector SVE element count (including the non-power-of-2 ones). This means that it can also hold 1/N for all N in [1, 16], which should be enough for the various *_per_cycle fields. For now the assumption is that the number of possible reciprocals is fixed at compile time and so the class should always be able to hold an exact value. The class uses a uint64_t to hold the fixed-point value, which means that it can hold any scaled uint32_t cost. Normally we don't worry about overflow when manipulating raw uint32_t costs, but just to be on the safe side, the class uses saturating arithmetic for all operations. As far as the changes to the cost routines themselves go: - The changes to aarch64_add_stmt_cost and its subroutines are just laying groundwork for future patches; no functional change intended. - The changes to aarch64_adjust_body_cost mean that we now take fractional differences into account. gcc/ * config/aarch64/fractional-cost.h: New file. * config/aarch64/aarch64.c: Include (indirectly) and cost_fraction.h. (vec_cost_fraction): New typedef. (aarch64_detect_scalar_stmt_subtype): Use it for statement costs. (aarch64_detect_vector_stmt_subtype): Likewise. (aarch64_sve_adjust_stmt_cost, aarch64_adjust_stmt_cost): Likewise. 
(aarch64_estimate_min_cycles_per_iter): Use vec_cost_fraction for cycle counts. (aarch64_adjust_body_cost): Likewise. (aarch64_test_cost_fraction): New function. (aarch64_run_selftests): Call it. --- gcc/config/aarch64/aarch64.c | 179 ++++++++++++++++++++------ gcc/config/aarch64/fractional-cost.h | 236 +++++++++++++++++++++++++++++++++++ 2 files changed, 377 insertions(+), 38 deletions(-) create mode 100644 gcc/config/aarch64/fractional-cost.h (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 1a8cd13..17fcb34 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -20,8 +20,9 @@ #define IN_TARGET_CODE 1 -#include "config.h" #define INCLUDE_STRING +#define INCLUDE_ALGORITHM +#include "config.h" #include "system.h" #include "coretypes.h" #include "backend.h" @@ -76,6 +77,7 @@ #include "function-abi.h" #include "gimple-pretty-print.h" #include "tree-ssa-loop-niter.h" +#include "fractional-cost.h" /* This file should be included last. */ #include "target-def.h" @@ -14912,10 +14914,10 @@ aarch64_in_loop_reduction_latency (vec_info *vinfo, stmt_vec_info stmt_info, for STMT_INFO, which has cost kind KIND. If this is a scalar operation, try to subdivide the target-independent categorization provided by KIND to get a more accurate cost. */ -static unsigned int +static fractional_cost aarch64_detect_scalar_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind, stmt_vec_info stmt_info, - unsigned int stmt_cost) + fractional_cost stmt_cost) { /* Detect an extension of a loaded value. In general, we'll be able to fuse the extension with the load. */ @@ -14931,11 +14933,11 @@ aarch64_detect_scalar_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind, the target-independent categorization provided by KIND to get a more accurate cost. WHERE specifies where the cost associated with KIND occurs. */ -static unsigned int +static fractional_cost aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind, stmt_vec_info stmt_info, tree vectype, enum vect_cost_model_location where, - unsigned int stmt_cost) + fractional_cost stmt_cost) { const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype); const sve_vec_cost *sve_costs = nullptr; @@ -15016,10 +15018,10 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind, for STMT_INFO, which has cost kind KIND and which when vectorized would operate on vector type VECTYPE. Adjust the cost as necessary for SVE targets. */ -static unsigned int +static fractional_cost aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind, stmt_vec_info stmt_info, tree vectype, - unsigned int stmt_cost) + fractional_cost stmt_cost) { /* Unlike vec_promote_demote, vector_stmt conversions do not change the vector register size or number of units. Integer promotions of this @@ -15083,9 +15085,9 @@ aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind, /* STMT_COST is the cost calculated for STMT_INFO, which has cost kind KIND and which when vectorized would operate on vector type VECTYPE. Add the cost of any embedded operations. 
*/ -static unsigned int +static fractional_cost aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info, - tree vectype, unsigned int stmt_cost) + tree vectype, fractional_cost stmt_cost) { if (vectype) { @@ -15339,7 +15341,7 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count, if (flag_vect_cost_model) { - int stmt_cost + fractional_cost stmt_cost = aarch64_builtin_vectorization_cost (kind, vectype, misalign); /* Do one-time initialization based on the vinfo. */ @@ -15440,7 +15442,7 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count, count *= LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo); /* FIXME */ } - retval = (unsigned) (count * stmt_cost); + retval = (count * stmt_cost).ceil (); costs->region[where] += retval; } @@ -15472,17 +15474,17 @@ aarch64_sve_op_count::dump () const /* Use ISSUE_INFO to estimate the minimum number of cycles needed to issue the operations described by OPS. This is a very simplistic model! */ -static unsigned int +static fractional_cost aarch64_estimate_min_cycles_per_iter (const aarch64_vec_op_count *ops, const aarch64_base_vec_issue_info *issue_info) { - unsigned int cycles = MAX (ops->reduction_latency, 1); - cycles = MAX (cycles, CEIL (ops->stores, issue_info->stores_per_cycle)); - cycles = MAX (cycles, CEIL (ops->loads + ops->stores, - issue_info->loads_stores_per_cycle)); - cycles = MAX (cycles, CEIL (ops->general_ops, - issue_info->general_ops_per_cycle)); + fractional_cost cycles = MAX (ops->reduction_latency, 1); + cycles = std::max (cycles, { ops->stores, issue_info->stores_per_cycle }); + cycles = std::max (cycles, { ops->loads + ops->stores, + issue_info->loads_stores_per_cycle }); + cycles = std::max (cycles, { ops->general_ops, + issue_info->general_ops_per_cycle }); return cycles; } @@ -15536,12 +15538,14 @@ aarch64_adjust_body_cost (aarch64_vector_costs *costs, unsigned int body_cost) if (!issue_info) return body_cost; - unsigned int scalar_cycles_per_iter + fractional_cost scalar_cycles_per_iter = aarch64_estimate_min_cycles_per_iter (&costs->scalar_ops, issue_info->scalar); - unsigned int advsimd_cycles_per_iter + + fractional_cost advsimd_cycles_per_iter = aarch64_estimate_min_cycles_per_iter (&costs->advsimd_ops, issue_info->advsimd); + bool could_use_advsimd = ((costs->vec_flags & VEC_ADVSIMD) || (aarch64_autovec_preference != 2 @@ -15558,36 +15562,37 @@ aarch64_adjust_body_cost (aarch64_vector_costs *costs, unsigned int body_cost) dump_printf_loc (MSG_NOTE, vect_location, "Scalar issue estimate:\n"); costs->scalar_ops.dump (); dump_printf_loc (MSG_NOTE, vect_location, - " estimated cycles per iteration = %d\n", - scalar_cycles_per_iter); + " estimated cycles per iteration = %f\n", + scalar_cycles_per_iter.as_double ()); if (could_use_advsimd) { dump_printf_loc (MSG_NOTE, vect_location, "Advanced SIMD issue estimate:\n"); costs->advsimd_ops.dump (); dump_printf_loc (MSG_NOTE, vect_location, - " estimated cycles per iteration = %d\n", - advsimd_cycles_per_iter); + " estimated cycles per iteration = %f\n", + advsimd_cycles_per_iter.as_double ()); } else dump_printf_loc (MSG_NOTE, vect_location, "Loop could not use Advanced SIMD\n"); } - uint64_t vector_cycles_per_iter = advsimd_cycles_per_iter; + fractional_cost vector_cycles_per_iter = advsimd_cycles_per_iter; unsigned int vector_reduction_latency = costs->advsimd_ops.reduction_latency; + if ((costs->vec_flags & VEC_ANY_SVE) && issue_info->sve) { /* Estimate the minimum number of cycles per iteration needed to issue non-predicate 
operations. */ - unsigned int sve_cycles_per_iter + fractional_cost sve_cycles_per_iter = aarch64_estimate_min_cycles_per_iter (&costs->sve_ops, issue_info->sve); /* Separately estimate the minimum number of cycles per iteration needed to issue the predicate operations. */ - unsigned int pred_cycles_per_iter - = CEIL (costs->sve_ops.pred_ops, issue_info->sve->pred_ops_per_cycle); + fractional_cost pred_cycles_per_iter + = { costs->sve_ops.pred_ops, issue_info->sve->pred_ops_per_cycle }; if (dump_enabled_p ()) { @@ -15595,14 +15600,16 @@ aarch64_adjust_body_cost (aarch64_vector_costs *costs, unsigned int body_cost) costs->sve_ops.dump (); dump_printf_loc (MSG_NOTE, vect_location, " estimated cycles per iteration for non-predicate" - " operations = %d\n", sve_cycles_per_iter); + " operations = %f\n", + sve_cycles_per_iter.as_double ()); if (costs->sve_ops.pred_ops) dump_printf_loc (MSG_NOTE, vect_location, " estimated cycles per" " iteration for predicate operations = %d\n", - pred_cycles_per_iter); + pred_cycles_per_iter.as_double ()); } - vector_cycles_per_iter = MAX (sve_cycles_per_iter, pred_cycles_per_iter); + vector_cycles_per_iter = std::max (sve_cycles_per_iter, + pred_cycles_per_iter); vector_reduction_latency = costs->sve_ops.reduction_latency; /* If the scalar version of the loop could issue at least as @@ -15616,7 +15623,7 @@ aarch64_adjust_body_cost (aarch64_vector_costs *costs, unsigned int body_cost) too drastic for scalar_cycles_per_iter vs. sve_cycles_per_iter; code later in the function handles that case in a more conservative way. */ - uint64_t sve_estimate = pred_cycles_per_iter + 1; + fractional_cost sve_estimate = pred_cycles_per_iter + 1; if (scalar_cycles_per_iter < sve_estimate) { unsigned int min_cost @@ -15656,8 +15663,10 @@ aarch64_adjust_body_cost (aarch64_vector_costs *costs, unsigned int body_cost) if (could_use_advsimd && advsimd_cycles_per_iter < sve_estimate) { /* This ensures that min_cost > orig_body_cost * 2. */ - unsigned int min_cost - = orig_body_cost * CEIL (sve_estimate, advsimd_cycles_per_iter) + 1; + unsigned int factor + = fractional_cost::scale (1, sve_estimate, + advsimd_cycles_per_iter); + unsigned int min_cost = orig_body_cost * factor + 1; if (body_cost < min_cost) { if (dump_enabled_p ()) @@ -15690,8 +15699,8 @@ aarch64_adjust_body_cost (aarch64_vector_costs *costs, unsigned int body_cost) so minor differences should only result in minor changes. */ else if (scalar_cycles_per_iter < vector_cycles_per_iter) { - body_cost = CEIL (body_cost * vector_cycles_per_iter, - scalar_cycles_per_iter); + body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter, + scalar_cycles_per_iter); if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, "Increasing body cost to %d because scalar code" @@ -15716,8 +15725,8 @@ aarch64_adjust_body_cost (aarch64_vector_costs *costs, unsigned int body_cost) && scalar_cycles_per_iter > vector_cycles_per_iter && !should_disparage) { - body_cost = CEIL (body_cost * vector_cycles_per_iter, - scalar_cycles_per_iter); + body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter, + scalar_cycles_per_iter); if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, "Decreasing body cost to %d account for smaller" @@ -25589,12 +25598,106 @@ aarch64_test_loading_full_dump () ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx)); } +/* Test the fractional_cost class. 
*/ + +static void +aarch64_test_fractional_cost () +{ + using cf = fractional_cost; + + ASSERT_EQ (cf (0, 20), 0); + + ASSERT_EQ (cf (4, 2), 2); + ASSERT_EQ (3, cf (9, 3)); + + ASSERT_NE (cf (5, 2), 2); + ASSERT_NE (3, cf (8, 3)); + + ASSERT_EQ (cf (7, 11) + cf (15, 11), 2); + ASSERT_EQ (cf (2, 3) + cf (3, 5), cf (19, 15)); + ASSERT_EQ (cf (2, 3) + cf (1, 6) + cf (1, 6), 1); + + ASSERT_EQ (cf (14, 15) - cf (4, 15), cf (2, 3)); + ASSERT_EQ (cf (1, 4) - cf (1, 2), 0); + ASSERT_EQ (cf (3, 5) - cf (1, 10), cf (1, 2)); + ASSERT_EQ (cf (11, 3) - 3, cf (2, 3)); + ASSERT_EQ (3 - cf (7, 3), cf (2, 3)); + ASSERT_EQ (3 - cf (10, 3), 0); + + ASSERT_EQ (cf (2, 3) * 5, cf (10, 3)); + ASSERT_EQ (14 * cf (11, 21), cf (22, 3)); + + ASSERT_TRUE (cf (4, 15) < cf (5, 15)); + ASSERT_FALSE (cf (5, 15) < cf (5, 15)); + ASSERT_FALSE (cf (6, 15) < cf (5, 15)); + ASSERT_TRUE (cf (1, 3) < cf (2, 5)); + ASSERT_TRUE (cf (1, 12) < cf (1, 6)); + ASSERT_FALSE (cf (5, 3) < cf (5, 3)); + ASSERT_TRUE (cf (239, 240) < 1); + ASSERT_FALSE (cf (240, 240) < 1); + ASSERT_FALSE (cf (241, 240) < 1); + ASSERT_FALSE (2 < cf (207, 104)); + ASSERT_FALSE (2 < cf (208, 104)); + ASSERT_TRUE (2 < cf (209, 104)); + + ASSERT_TRUE (cf (4, 15) < cf (5, 15)); + ASSERT_FALSE (cf (5, 15) < cf (5, 15)); + ASSERT_FALSE (cf (6, 15) < cf (5, 15)); + ASSERT_TRUE (cf (1, 3) < cf (2, 5)); + ASSERT_TRUE (cf (1, 12) < cf (1, 6)); + ASSERT_FALSE (cf (5, 3) < cf (5, 3)); + ASSERT_TRUE (cf (239, 240) < 1); + ASSERT_FALSE (cf (240, 240) < 1); + ASSERT_FALSE (cf (241, 240) < 1); + ASSERT_FALSE (2 < cf (207, 104)); + ASSERT_FALSE (2 < cf (208, 104)); + ASSERT_TRUE (2 < cf (209, 104)); + + ASSERT_FALSE (cf (4, 15) >= cf (5, 15)); + ASSERT_TRUE (cf (5, 15) >= cf (5, 15)); + ASSERT_TRUE (cf (6, 15) >= cf (5, 15)); + ASSERT_FALSE (cf (1, 3) >= cf (2, 5)); + ASSERT_FALSE (cf (1, 12) >= cf (1, 6)); + ASSERT_TRUE (cf (5, 3) >= cf (5, 3)); + ASSERT_FALSE (cf (239, 240) >= 1); + ASSERT_TRUE (cf (240, 240) >= 1); + ASSERT_TRUE (cf (241, 240) >= 1); + ASSERT_TRUE (2 >= cf (207, 104)); + ASSERT_TRUE (2 >= cf (208, 104)); + ASSERT_FALSE (2 >= cf (209, 104)); + + ASSERT_FALSE (cf (4, 15) > cf (5, 15)); + ASSERT_FALSE (cf (5, 15) > cf (5, 15)); + ASSERT_TRUE (cf (6, 15) > cf (5, 15)); + ASSERT_FALSE (cf (1, 3) > cf (2, 5)); + ASSERT_FALSE (cf (1, 12) > cf (1, 6)); + ASSERT_FALSE (cf (5, 3) > cf (5, 3)); + ASSERT_FALSE (cf (239, 240) > 1); + ASSERT_FALSE (cf (240, 240) > 1); + ASSERT_TRUE (cf (241, 240) > 1); + ASSERT_TRUE (2 > cf (207, 104)); + ASSERT_FALSE (2 > cf (208, 104)); + ASSERT_FALSE (2 > cf (209, 104)); + + ASSERT_EQ (cf (1, 2).ceil (), 1); + ASSERT_EQ (cf (11, 7).ceil (), 2); + ASSERT_EQ (cf (20, 1).ceil (), 20); + ASSERT_EQ ((cf (0xfffffffd) + 1).ceil (), 0xfffffffe); + ASSERT_EQ ((cf (0xfffffffd) + 2).ceil (), 0xffffffff); + ASSERT_EQ ((cf (0xfffffffd) + 3).ceil (), 0xffffffff); + ASSERT_EQ ((cf (0x7fffffff) * 2).ceil (), 0xfffffffe); + ASSERT_EQ ((cf (0x80000000) * 2).ceil (), 0xffffffff); + + ASSERT_EQ (cf (1, 2).as_double (), 0.5); +} + /* Run all target-specific selftests. */ static void aarch64_run_selftests (void) { aarch64_test_loading_full_dump (); + aarch64_test_fractional_cost (); } } // namespace selftest diff --git a/gcc/config/aarch64/fractional-cost.h b/gcc/config/aarch64/fractional-cost.h new file mode 100644 index 0000000..6a01634 --- /dev/null +++ b/gcc/config/aarch64/fractional-cost.h @@ -0,0 +1,236 @@ +// Simple fixed-point representation of fractional costs +// Copyright (C) 2021 Free Software Foundation, Inc. +// +// This file is part of GCC. 
+// +// GCC is free software; you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 3, or (at your option) any later +// version. +// +// GCC is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License +// along with GCC; see the file COPYING3. If not see +// . + +// A simple saturating fixed-point type for representing fractional +// intermediate results in cost calculations. The input and result +// costs are assumed to be uint32_ts. Unlike sreal, the class can +// represent most values that we care about exactly (without rounding). +// See the comment above the SCALE field for the current set of +// exactly-representable reciprocals. +class fractional_cost +{ +public: + // Construct an object equal to INT_VALUE. + constexpr fractional_cost (uint32_t int_value = 0) + : m_value (uint64_t (int_value) * SCALE) {} + + fractional_cost (uint32_t a, uint32_t b); + + fractional_cost operator+ (const fractional_cost &) const; + fractional_cost operator- (const fractional_cost &) const; + fractional_cost operator* (uint32_t) const; + + fractional_cost &operator+= (const fractional_cost &); + fractional_cost &operator-= (const fractional_cost &); + fractional_cost &operator*= (uint32_t); + + bool operator== (const fractional_cost &) const; + bool operator!= (const fractional_cost &) const; + bool operator< (const fractional_cost &) const; + bool operator<= (const fractional_cost &) const; + bool operator>= (const fractional_cost &) const; + bool operator> (const fractional_cost &) const; + + uint32_t ceil () const; + + static uint32_t scale (uint32_t, fractional_cost, fractional_cost); + + explicit operator bool () const { return m_value != 0; } + + // Convert the value to a double. + double as_double () const { return double (m_value) / SCALE; } + +private: + enum raw { RAW }; + constexpr fractional_cost (uint64_t value, raw) : m_value (value) {} + + // A multiple of [1, 16] * 16. This ensures that 1/N is representable + // for every every possible SVE element count N, or for any "X per cycle" + // value N in the range [1, 16]. + static const uint32_t SCALE = 11531520; + + // The value multiplied by BIAS. + uint64_t m_value; +}; + +// Construct a representation of A / B, rounding up if (contrary to +// expectations) we can't represent the value exactly. For now we +// treat inexact values as a bug, since all values of B should come +// from a small set of values that are known at compile time. +inline fractional_cost::fractional_cost (uint32_t a, uint32_t b) + : m_value (CEIL (uint64_t (a) * SCALE, uint64_t (b))) +{ + gcc_checking_assert (SCALE % b == 0); +} + +inline fractional_cost +fractional_cost::operator+ (const fractional_cost &other) const +{ + uint64_t sum = m_value + other.m_value; + return { sum >= m_value ? sum : ~uint64_t (0), RAW }; +} + +inline fractional_cost & +fractional_cost::operator+= (const fractional_cost &other) +{ + *this = *this + other; + return *this; +} + +inline fractional_cost +fractional_cost::operator- (const fractional_cost &other) const +{ + uint64_t diff = m_value - other.m_value; + return { diff <= m_value ? 
diff : 0, RAW }; +} + +inline fractional_cost & +fractional_cost::operator-= (const fractional_cost &other) +{ + *this = *this - other; + return *this; +} + +inline fractional_cost +fractional_cost::operator* (uint32_t other) const +{ + if (other == 0) + return 0; + + uint64_t max = ~uint64_t (0); + return { m_value <= max / other ? m_value * other : max, RAW }; +} + +inline fractional_cost & +fractional_cost::operator*= (uint32_t other) +{ + *this = *this * other; + return *this; +} + +inline bool +fractional_cost::operator== (const fractional_cost &other) const +{ + return m_value == other.m_value; +} + +inline bool +fractional_cost::operator!= (const fractional_cost &other) const +{ + return m_value != other.m_value; +} + +inline bool +fractional_cost::operator< (const fractional_cost &other) const +{ + return m_value < other.m_value; +} + +inline bool +fractional_cost::operator<= (const fractional_cost &other) const +{ + return m_value <= other.m_value; +} + +inline bool +fractional_cost::operator>= (const fractional_cost &other) const +{ + return m_value >= other.m_value; +} + +inline bool +fractional_cost::operator> (const fractional_cost &other) const +{ + return m_value > other.m_value; +} + +// Round the value up to the nearest integer and saturate to a uint32_t. +inline uint32_t +fractional_cost::ceil () const +{ + uint32_t max = ~uint32_t (0); + if (m_value <= uint64_t (max - 1) * SCALE) + return (m_value + SCALE - 1) / SCALE; + return max; +} + +// Round (COST * A) / B up to the nearest integer and saturate to a uint32_t. +inline uint32_t +fractional_cost::scale (uint32_t cost, fractional_cost a, fractional_cost b) +{ + widest_int result = wi::div_ceil (widest_int (cost) * a.m_value, + b.m_value, SIGNED); + if (result < ~uint32_t (0)) + return result.to_shwi (); + return ~uint32_t (0); +} + +inline fractional_cost +operator+ (uint32_t a, const fractional_cost &b) +{ + return b.operator+ (a); +} + +inline fractional_cost +operator- (uint32_t a, const fractional_cost &b) +{ + return fractional_cost (a).operator- (b); +} + +inline fractional_cost +operator* (uint32_t a, const fractional_cost &b) +{ + return b.operator* (a); +} + +inline bool +operator== (uint32_t a, const fractional_cost &b) +{ + return b.operator== (a); +} + +inline bool +operator!= (uint32_t a, const fractional_cost &b) +{ + return b.operator!= (a); +} + +inline bool +operator< (uint32_t a, const fractional_cost &b) +{ + return b.operator> (a); +} + +inline bool +operator<= (uint32_t a, const fractional_cost &b) +{ + return b.operator>= (a); +} + +inline bool +operator>= (uint32_t a, const fractional_cost &b) +{ + return b.operator<= (a); +} + +inline bool +operator> (uint32_t a, const fractional_cost &b) +{ + return b.operator< (a); +} -- cgit v1.1 From b585f0112f293ace8fadc0c7ace59230140b7472 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Tue, 3 Aug 2021 13:00:45 +0100 Subject: aarch64: Split out aarch64_adjust_body_cost_sve This patch splits the SVE-specific part of aarch64_adjust_body_cost out into its own subroutine, so that a future patch can call it more than once. I wondered about using a lambda to avoid having to pass all the arguments, but in the end this way seemed clearer. gcc/ * config/aarch64/aarch64.c (aarch64_adjust_body_cost_sve): New function, split out from... (aarch64_adjust_body_cost): ...here. 
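
As a point of reference, the calculation performed by the new subroutine is
built on the fractional_cost class added by the previous patch.  A minimal
sketch of that style of arithmetic (illustrative values only, not code from
either patch; it assumes fractional-cost.h and <algorithm> are available, as
they are in aarch64.c):

  fractional_cost loads_per_iter = { 3, 2 };   /* 3 loads, 2 per cycle -> 1.5 cycles.  */
  fractional_cost stores_per_iter = { 2, 2 };  /* 2 stores, 2 per cycle -> 1.0 cycles.  */
  /* The issue bottleneck is the larger of the two estimates.  */
  fractional_cost cycles = std::max (loads_per_iter, stores_per_iter);
  /* Scale a body cost of 10 by the ratio 1.5 / 1.0, rounding up: 15.
     The previous CEIL-based code would already have rounded 1.5 up to 2
     before the comparison, losing the fractional difference.  */
  unsigned int scaled = fractional_cost::scale (10, cycles, fractional_cost (1));
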
--- gcc/config/aarch64/aarch64.c | 220 +++++++++++++++++++++++++------------------ 1 file changed, 127 insertions(+), 93 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 17fcb34..b14b6f2 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -15488,6 +15488,126 @@ aarch64_estimate_min_cycles_per_iter return cycles; } +/* Subroutine of aarch64_adjust_body_cost for handling SVE. + Use ISSUE_INFO to work out how fast the SVE code can be issued and compare + it to the equivalent value for scalar code (SCALAR_CYCLES_PER_ITER). + If COULD_USE_ADVSIMD is true, also compare it to the issue rate of + Advanced SIMD code (ADVSIMD_CYCLES_PER_ITER). + + COSTS is as for aarch64_adjust_body_cost. ORIG_BODY_COST is the cost + originally passed to aarch64_adjust_body_cost and *BODY_COST is the current + value of the adjusted cost. *SHOULD_DISPARAGE is true if we think the loop + body is too expensive. */ + +static fractional_cost +aarch64_adjust_body_cost_sve (const aarch64_vector_costs *costs, + const aarch64_vec_issue_info *issue_info, + fractional_cost scalar_cycles_per_iter, + fractional_cost advsimd_cycles_per_iter, + bool could_use_advsimd, + unsigned int orig_body_cost, + unsigned int *body_cost, + bool *should_disparage) +{ + /* Estimate the minimum number of cycles per iteration needed to issue + non-predicate operations. */ + fractional_cost sve_nonpred_cycles_per_iter + = aarch64_estimate_min_cycles_per_iter (&costs->sve_ops, + issue_info->sve); + + /* Separately estimate the minimum number of cycles per iteration needed + to issue the predicate operations. */ + fractional_cost sve_pred_issue_cycles_per_iter + = { costs->sve_ops.pred_ops, issue_info->sve->pred_ops_per_cycle }; + + /* Calculate the overall limit on the number of cycles per iteration. */ + fractional_cost sve_cycles_per_iter + = std::max (sve_nonpred_cycles_per_iter, sve_pred_issue_cycles_per_iter); + + if (dump_enabled_p ()) + { + costs->sve_ops.dump (); + dump_printf_loc (MSG_NOTE, vect_location, + " estimated cycles per iteration = %f\n", + sve_cycles_per_iter.as_double ()); + dump_printf_loc (MSG_NOTE, vect_location, + " estimated cycles per iteration for non-predicate" + " operations = %f\n", + sve_nonpred_cycles_per_iter.as_double ()); + if (costs->sve_ops.pred_ops) + dump_printf_loc (MSG_NOTE, vect_location, " estimated cycles per" + " iteration for predicate operations = %d\n", + sve_pred_issue_cycles_per_iter.as_double ()); + } + + /* If the scalar version of the loop could issue at least as + quickly as the predicate parts of the SVE loop, make the SVE loop + prohibitively expensive. In this case vectorization is adding an + overhead that the original scalar code didn't have. + + This is mostly intended to detect cases in which WHILELOs dominate + for very tight loops, which is something that normal latency-based + costs would not model. Adding this kind of cliffedge would be + too drastic for scalar_cycles_per_iter vs. sve_cycles_per_iter; + code in the caller handles that case in a more conservative way. 
*/ + fractional_cost sve_estimate = sve_pred_issue_cycles_per_iter + 1; + if (scalar_cycles_per_iter < sve_estimate) + { + unsigned int min_cost + = orig_body_cost * estimated_poly_value (BYTES_PER_SVE_VECTOR); + if (*body_cost < min_cost) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Increasing body cost to %d because the" + " scalar code could issue within the limit" + " imposed by predicate operations\n", + min_cost); + *body_cost = min_cost; + *should_disparage = true; + } + } + + /* If it appears that the Advanced SIMD version of a loop could issue + more quickly than the SVE one, increase the SVE cost in proportion + to the difference. The intention is to make Advanced SIMD preferable + in cases where an Advanced SIMD version exists, without increasing + the costs so much that SVE won't be used at all. + + The reasoning is similar to the scalar vs. predicate comparison above: + if the issue rate of the SVE code is limited by predicate operations + (i.e. if sve_pred_issue_cycles_per_iter > sve_nonpred_cycles_per_iter), + and if the Advanced SIMD code could issue within the limit imposed + by the predicate operations, the predicate operations are adding an + overhead that the original code didn't have and so we should prefer + the Advanced SIMD version. However, if the predicate operations + do not dominate in this way, we should only increase the cost of + the SVE code if sve_cycles_per_iter is strictly greater than + advsimd_cycles_per_iter. Given rounding effects, this should mean + that Advanced SIMD is either better or at least no worse. */ + if (sve_nonpred_cycles_per_iter >= sve_pred_issue_cycles_per_iter) + sve_estimate = sve_cycles_per_iter; + if (could_use_advsimd && advsimd_cycles_per_iter < sve_estimate) + { + /* This ensures that min_cost > orig_body_cost * 2. */ + unsigned int factor = fractional_cost::scale (1, sve_estimate, + advsimd_cycles_per_iter); + unsigned int min_cost = orig_body_cost * factor + 1; + if (*body_cost < min_cost) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Increasing body cost to %d because Advanced" + " SIMD code could issue as quickly\n", + min_cost); + *body_cost = min_cost; + *should_disparage = true; + } + } + + return sve_cycles_per_iter; +} + /* BODY_COST is the cost of a vector loop body recorded in COSTS. Adjust the cost as necessary and return the new cost. */ static unsigned int @@ -15583,101 +15703,15 @@ aarch64_adjust_body_cost (aarch64_vector_costs *costs, unsigned int body_cost) if ((costs->vec_flags & VEC_ANY_SVE) && issue_info->sve) { - /* Estimate the minimum number of cycles per iteration needed to issue - non-predicate operations. */ - fractional_cost sve_cycles_per_iter - = aarch64_estimate_min_cycles_per_iter (&costs->sve_ops, - issue_info->sve); - - /* Separately estimate the minimum number of cycles per iteration needed - to issue the predicate operations. 
*/ - fractional_cost pred_cycles_per_iter - = { costs->sve_ops.pred_ops, issue_info->sve->pred_ops_per_cycle }; - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_NOTE, vect_location, "SVE issue estimate:\n"); - costs->sve_ops.dump (); - dump_printf_loc (MSG_NOTE, vect_location, - " estimated cycles per iteration for non-predicate" - " operations = %f\n", - sve_cycles_per_iter.as_double ()); - if (costs->sve_ops.pred_ops) - dump_printf_loc (MSG_NOTE, vect_location, " estimated cycles per" - " iteration for predicate operations = %d\n", - pred_cycles_per_iter.as_double ()); - } - - vector_cycles_per_iter = std::max (sve_cycles_per_iter, - pred_cycles_per_iter); + dump_printf_loc (MSG_NOTE, vect_location, "SVE issue estimate:\n"); vector_reduction_latency = costs->sve_ops.reduction_latency; - - /* If the scalar version of the loop could issue at least as - quickly as the predicate parts of the SVE loop, make the SVE loop - prohibitively expensive. In this case vectorization is adding an - overhead that the original scalar code didn't have. - - This is mostly intended to detect cases in which WHILELOs dominate - for very tight loops, which is something that normal latency-based - costs would not model. Adding this kind of cliffedge would be - too drastic for scalar_cycles_per_iter vs. sve_cycles_per_iter; - code later in the function handles that case in a more - conservative way. */ - fractional_cost sve_estimate = pred_cycles_per_iter + 1; - if (scalar_cycles_per_iter < sve_estimate) - { - unsigned int min_cost - = orig_body_cost * estimated_poly_value (BYTES_PER_SVE_VECTOR); - if (body_cost < min_cost) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "Increasing body cost to %d because the" - " scalar code could issue within the limit" - " imposed by predicate operations\n", - min_cost); - body_cost = min_cost; - should_disparage = true; - } - } - - /* If it appears that the Advanced SIMD version of a loop could issue - more quickly than the SVE one, increase the SVE cost in proportion - to the difference. The intention is to make Advanced SIMD preferable - in cases where an Advanced SIMD version exists, without increasing - the costs so much that SVE won't be used at all. - - The reasoning is similar to the scalar vs. predicate comparison above: - if the issue rate of the SVE code is limited by predicate operations - (i.e. if pred_cycles_per_iter > sve_cycles_per_iter), and if the - Advanced SIMD code could issue within the limit imposed by the - predicate operations, the predicate operations are adding an - overhead that the original code didn't have and so we should prefer - the Advanced SIMD version. However, if the predicate operations - do not dominate in this way, we should only increase the cost of - the SVE code if sve_cycles_per_iter is strictly greater than - advsimd_cycles_per_iter. Given rounding effects, this should mean - that Advanced SIMD is either better or at least no worse. */ - if (sve_cycles_per_iter >= pred_cycles_per_iter) - sve_estimate = sve_cycles_per_iter; - if (could_use_advsimd && advsimd_cycles_per_iter < sve_estimate) - { - /* This ensures that min_cost > orig_body_cost * 2. 
*/ - unsigned int factor - = fractional_cost::scale (1, sve_estimate, - advsimd_cycles_per_iter); - unsigned int min_cost = orig_body_cost * factor + 1; - if (body_cost < min_cost) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "Increasing body cost to %d because Advanced" - " SIMD code could issue as quickly\n", - min_cost); - body_cost = min_cost; - should_disparage = true; - } - } + vector_cycles_per_iter + = aarch64_adjust_body_cost_sve (costs, issue_info, + scalar_cycles_per_iter, + advsimd_cycles_per_iter, + could_use_advsimd, orig_body_cost, + &body_cost, &should_disparage); } /* Decide whether to stick to latency-based costs or whether to try to -- cgit v1.1 From 78770e0e5d9fef70679e1db4eb2fb06596fbb2f8 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Tue, 3 Aug 2021 13:00:45 +0100 Subject: aarch64: Add gather_load_xNN_cost tuning fields This patch adds tuning fields for the total cost of a gather load instruction. Until now, we've costed them as one scalar load per element instead. Those scalar_load-based values are also what the patch uses to fill in the new fields for existing cost structures. gcc/ * config/aarch64/aarch64-protos.h (sve_vec_cost): Add gather_load_x32_cost and gather_load_x64_cost. * config/aarch64/aarch64.c (generic_sve_vector_cost) (a64fx_sve_vector_cost, neoversev1_sve_vector_cost): Update accordingly, using the values given by the scalar_load * number of elements calculation that we used previously. (aarch64_detect_vector_stmt_subtype): Use the new fields. --- gcc/config/aarch64/aarch64-protos.h | 9 +++++++++ gcc/config/aarch64/aarch64.c | 19 +++++++++++++++++++ 2 files changed, 28 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index fb4ce8e..b91eeeb 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -259,12 +259,16 @@ struct sve_vec_cost : simd_vec_cost unsigned int fadda_f16_cost, unsigned int fadda_f32_cost, unsigned int fadda_f64_cost, + unsigned int gather_load_x32_cost, + unsigned int gather_load_x64_cost, unsigned int scatter_store_elt_cost) : simd_vec_cost (base), clast_cost (clast_cost), fadda_f16_cost (fadda_f16_cost), fadda_f32_cost (fadda_f32_cost), fadda_f64_cost (fadda_f64_cost), + gather_load_x32_cost (gather_load_x32_cost), + gather_load_x64_cost (gather_load_x64_cost), scatter_store_elt_cost (scatter_store_elt_cost) {} @@ -279,6 +283,11 @@ struct sve_vec_cost : simd_vec_cost const int fadda_f32_cost; const int fadda_f64_cost; + /* The cost of a gather load instruction. The x32 value is for loads + of 32-bit elements and the x64 value is for loads of 64-bit elements. */ + const int gather_load_x32_cost; + const int gather_load_x64_cost; + /* The per-element cost of a scatter store. 
*/ const int scatter_store_elt_cost; }; diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index b14b6f2..36f1180 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -675,6 +675,8 @@ static const sve_vec_cost generic_sve_vector_cost = 2, /* fadda_f16_cost */ 2, /* fadda_f32_cost */ 2, /* fadda_f64_cost */ + 4, /* gather_load_x32_cost */ + 2, /* gather_load_x64_cost */ 1 /* scatter_store_elt_cost */ }; @@ -744,6 +746,8 @@ static const sve_vec_cost a64fx_sve_vector_cost = 13, /* fadda_f16_cost */ 13, /* fadda_f32_cost */ 13, /* fadda_f64_cost */ + 64, /* gather_load_x32_cost */ + 32, /* gather_load_x64_cost */ 1 /* scatter_store_elt_cost */ }; @@ -1739,6 +1743,8 @@ static const sve_vec_cost neoversev1_sve_vector_cost = 19, /* fadda_f16_cost */ 11, /* fadda_f32_cost */ 8, /* fadda_f64_cost */ + 32, /* gather_load_x32_cost */ + 16, /* gather_load_x64_cost */ 3 /* scatter_store_elt_cost */ }; @@ -14958,6 +14964,19 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind, && DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info))) return simd_costs->store_elt_extra_cost; + /* Detect SVE gather loads, which are costed as a single scalar_load + for each element. We therefore need to divide the full-instruction + cost by the number of elements in the vector. */ + if (kind == scalar_load + && sve_costs + && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER) + { + unsigned int nunits = vect_nunits_for_cost (vectype); + if (GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype)) == 64) + return { sve_costs->gather_load_x64_cost, nunits }; + return { sve_costs->gather_load_x32_cost, nunits }; + } + /* Detect cases in which a scalar_store is really storing one element in a scatter operation. */ if (kind == scalar_store -- cgit v1.1 From 537afb0857c8f60c2b60a09fad4660420cd13e8f Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Tue, 3 Aug 2021 13:00:46 +0100 Subject: aarch64: Tweak the cost of elementwise stores When the vectoriser scalarises a strided store, it counts one scalar_store for each element plus one vec_to_scalar extraction for each element. However, extracting element 0 is free on AArch64, so it should have zero cost. I don't have a testcase that requires this for existing -mtune options, but it becomes more important with a later patch. gcc/ * config/aarch64/aarch64.c (aarch64_is_store_elt_extraction): New function, split out from... (aarch64_detect_vector_stmt_subtype): ...here. (aarch64_add_stmt_cost): Treat extracting element 0 as free. --- gcc/config/aarch64/aarch64.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 36f1180..084f8ca 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -14622,6 +14622,18 @@ aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, } } +/* Return true if an operaton of kind KIND for STMT_INFO represents + the extraction of an element from a vector in preparation for + storing the element to memory. */ +static bool +aarch64_is_store_elt_extraction (vect_cost_for_stmt kind, + stmt_vec_info stmt_info) +{ + return (kind == vec_to_scalar + && STMT_VINFO_DATA_REF (stmt_info) + && DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info))); +} + /* Return true if STMT_INFO represents part of a reduction. 
*/ static bool aarch64_is_reduction (stmt_vec_info stmt_info) @@ -14959,9 +14971,7 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind, /* Detect cases in which vec_to_scalar is describing the extraction of a vector element in preparation for a scalar store. The store itself is costed separately. */ - if (kind == vec_to_scalar - && STMT_VINFO_DATA_REF (stmt_info) - && DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info))) + if (aarch64_is_store_elt_extraction (kind, stmt_info)) return simd_costs->store_elt_extra_cost; /* Detect SVE gather loads, which are costed as a single scalar_load @@ -15382,6 +15392,12 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count, if (vectype && aarch64_sve_only_stmt_p (stmt_info, vectype)) costs->saw_sve_only_op = true; + /* If we scalarize a strided store, the vectorizer costs one + vec_to_scalar for each element. However, we can store the first + element using an FP store without a separate extract step. */ + if (aarch64_is_store_elt_extraction (kind, stmt_info)) + count -= 1; + stmt_cost = aarch64_detect_scalar_stmt_subtype (vinfo, kind, stmt_info, stmt_cost); -- cgit v1.1 From 028059b46ec9aef7dd447792c579f35396751068 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Tue, 3 Aug 2021 13:00:47 +0100 Subject: aarch64: Tweak MLA vector costs The issue-based vector costs currently assume that a multiply-add sequence can be implemented using a single instruction. This is generally true for scalars (which have a 4-operand instruction) and SVE (which allows the output to be tied to any input). However, for Advanced SIMD, multiplying two values and adding an invariant will end up being a move and an MLA. The only target to use the issue-based vector costs is Neoverse V1, which would generally prefer SVE in this case anyway. I therefore don't have a self-contained testcase. However, the distinction becomes more important with a later patch. gcc/ * config/aarch64/aarch64.c (aarch64_multiply_add_p): Add a vec_flags parameter. Detect cases in which an Advanced SIMD MLA would almost certainly require a MOV. (aarch64_count_ops): Update accordingly. --- gcc/config/aarch64/aarch64.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 084f8ca..19045ef 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -14767,9 +14767,12 @@ aarch64_integer_truncation_p (stmt_vec_info stmt_info) /* Return true if STMT_INFO is the second part of a two-statement multiply-add or multiply-subtract sequence that might be suitable for fusing into a - single instruction. */ + single instruction. If VEC_FLAGS is zero, analyze the operation as + a scalar one, otherwise analyze it as an operation on vectors with those + VEC_* flags. */ static bool -aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info) +aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info, + unsigned int vec_flags) { gassign *assign = dyn_cast (stmt_info->stmt); if (!assign) @@ -14797,6 +14800,22 @@ aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info) if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR) continue; + if (vec_flags & VEC_ADVSIMD) + { + /* Scalar and SVE code can tie the result to any FMLA input (or none, + although that requires a MOVPRFX for SVE). However, Advanced SIMD + only supports MLA forms, so will require a move if the result + cannot be tied to the accumulator. 
The most important case in + which this is true is when the accumulator input is invariant. */ + rhs = gimple_op (assign, 3 - i); + if (TREE_CODE (rhs) != SSA_NAME) + return false; + def_stmt_info = vinfo->lookup_def (rhs); + if (!def_stmt_info + || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def) + return false; + } + return true; } return false; @@ -15232,7 +15251,7 @@ aarch64_count_ops (class vec_info *vinfo, aarch64_vector_costs *costs, } /* Assume that multiply-adds will become a single operation. */ - if (stmt_info && aarch64_multiply_add_p (vinfo, stmt_info)) + if (stmt_info && aarch64_multiply_add_p (vinfo, stmt_info, vec_flags)) return; /* When costing scalar statements in vector code, the count already -- cgit v1.1 From 9690309baf8294b0512b55b133bc102dc0dac5b5 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Tue, 3 Aug 2021 13:00:48 +0100 Subject: aarch64: Restrict issue heuristics to inner vector loop The AArch64 vector costs try to take issue rates into account. However, when vectorising an outer loop, we lumped the inner and outer operations together, which is somewhat meaningless. This patch restricts the heuristic to the inner loop. gcc/ * config/aarch64/aarch64.c (aarch64_add_stmt_cost): Only record issue information for operations that occur in the innermost loop. --- gcc/config/aarch64/aarch64.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 19045ef..19625eb 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -15392,6 +15392,10 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count, fractional_cost stmt_cost = aarch64_builtin_vectorization_cost (kind, vectype, misalign); + bool in_inner_loop_p = (where == vect_body + && stmt_info + && stmt_in_inner_loop_p (vinfo, stmt_info)); + /* Do one-time initialization based on the vinfo. */ loop_vec_info loop_vinfo = dyn_cast (vinfo); bb_vec_info bb_vinfo = dyn_cast (vinfo); @@ -15438,14 +15442,15 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count, stmt_cost = aarch64_adjust_stmt_cost (kind, stmt_info, vectype, stmt_cost); - /* If we're recording a nonzero vector loop body cost, also estimate - the operations that would need to be issued by all relevant - implementations of the loop. */ + /* If we're recording a nonzero vector loop body cost for the + innermost loop, also estimate the operations that would need + to be issued by all relevant implementations of the loop. */ auto *issue_info = aarch64_tune_params.vec_costs->issue_info; if (loop_vinfo && issue_info && costs->vec_flags && where == vect_body + && (!LOOP_VINFO_LOOP (loop_vinfo)->inner || in_inner_loop_p) && vectype && stmt_cost != 0) { @@ -15489,8 +15494,7 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count, /* Statements in an inner loop relative to the loop being vectorized are weighted more heavily. The value here is arbitrary and could potentially be improved with analysis. 
*/ - if (where == vect_body && stmt_info - && stmt_in_inner_loop_p (vinfo, stmt_info)) + if (in_inner_loop_p) { gcc_assert (loop_vinfo); count *= LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo); /* FIXME */ -- cgit v1.1 From 048039c49b96875144f67e7789fdea54abf7710b Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Tue, 3 Aug 2021 13:00:49 +0100 Subject: aarch64: Add -mtune=neoverse-512tvb This patch adds an option to tune for Neoverse cores that have a total vector bandwidth of 512 bits (4x128 for Advanced SIMD and a vector-length-dependent equivalent for SVE). This is intended to be a compromise between tuning aggressively for a single core like Neoverse V1 (which can be too narrow) and tuning for AArch64 cores in general (which can be too wide). -mcpu=neoverse-512tvb is equivalent to -mcpu=neoverse-v1 -mtune=neoverse-512tvb. gcc/ * doc/invoke.texi: Document -mtune=neoverse-512tvb and -mcpu=neoverse-512tvb. * config/aarch64/aarch64-cores.def (neoverse-512tvb): New entry. * config/aarch64/aarch64-tune.md: Regenerate. * config/aarch64/aarch64.c (neoverse512tvb_sve_vector_cost) (neoverse512tvb_sve_issue_info, neoverse512tvb_vec_issue_info) (neoverse512tvb_vector_cost, neoverse512tvb_tunings): New structures. (aarch64_adjust_body_cost_sve): Handle -mtune=neoverse-512tvb. (aarch64_adjust_body_cost): Likewise. --- gcc/config/aarch64/aarch64-cores.def | 1 + gcc/config/aarch64/aarch64-tune.md | 2 +- gcc/config/aarch64/aarch64.c | 184 +++++++++++++++++++++++++++++++++-- 3 files changed, 179 insertions(+), 8 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def index de8fe9b..b2aa167 100644 --- a/gcc/config/aarch64/aarch64-cores.def +++ b/gcc/config/aarch64/aarch64-cores.def @@ -139,6 +139,7 @@ AARCH64_CORE("thunderx3t110", thunderx3t110, thunderx3t110, 8_3A, AARCH64_FL_ /* Arm ('A') cores. */ AARCH64_CORE("zeus", zeus, cortexa57, 8_4A, AARCH64_FL_FOR_ARCH8_4 | AARCH64_FL_SVE | AARCH64_FL_RCPC | AARCH64_FL_I8MM | AARCH64_FL_BF16 | AARCH64_FL_F16 | AARCH64_FL_PROFILE | AARCH64_FL_SSBS | AARCH64_FL_RNG, neoversev1, 0x41, 0xd40, -1) AARCH64_CORE("neoverse-v1", neoversev1, cortexa57, 8_4A, AARCH64_FL_FOR_ARCH8_4 | AARCH64_FL_SVE | AARCH64_FL_RCPC | AARCH64_FL_I8MM | AARCH64_FL_BF16 | AARCH64_FL_F16 | AARCH64_FL_PROFILE | AARCH64_FL_SSBS | AARCH64_FL_RNG, neoversev1, 0x41, 0xd40, -1) +AARCH64_CORE("neoverse-512tvb", neoverse512tvb, cortexa57, 8_4A, AARCH64_FL_FOR_ARCH8_4 | AARCH64_FL_SVE | AARCH64_FL_RCPC | AARCH64_FL_I8MM | AARCH64_FL_BF16 | AARCH64_FL_F16 | AARCH64_FL_PROFILE | AARCH64_FL_SSBS | AARCH64_FL_RNG, neoverse512tvb, INVALID_IMP, INVALID_CORE, -1) /* Qualcomm ('Q') cores. 
*/ AARCH64_CORE("saphira", saphira, saphira, 8_4A, AARCH64_FL_FOR_ARCH8_4 | AARCH64_FL_CRYPTO | AARCH64_FL_RCPC, saphira, 0x51, 0xC01, -1) diff --git a/gcc/config/aarch64/aarch64-tune.md b/gcc/config/aarch64/aarch64-tune.md index af66c11..e491c29 100644 --- a/gcc/config/aarch64/aarch64-tune.md +++ b/gcc/config/aarch64/aarch64-tune.md @@ -1,5 +1,5 @@ ;; -*- buffer-read-only: t -*- ;; Generated automatically by gentune.sh from aarch64-cores.def (define_attr "tune" - "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,zeus,neoversev1,saphira,neoversen2,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82" + "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,zeus,neoversev1,neoverse512tvb,saphira,neoversen2,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82" (const (symbol_ref "((enum attr_tune) aarch64_tune)"))) diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 19625eb..f80de2c 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -1842,6 +1842,136 @@ static const struct tune_params neoversev1_tunings = &generic_prefetch_tune }; +static const sve_vec_cost neoverse512tvb_sve_vector_cost = +{ + { + 2, /* int_stmt_cost */ + 2, /* fp_stmt_cost */ + 4, /* ld2_st2_permute_cost */ + 5, /* ld3_st3_permute_cost */ + 5, /* ld4_st4_permute_cost */ + 3, /* permute_cost */ + /* Theoretically, a reduction involving 15 scalar ADDs could + complete in ~5 cycles and would have a cost of 15. Assume that + [SU]ADDV completes in 11 cycles and so give it a cost of 15 + 6. */ + 21, /* reduc_i8_cost */ + /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6. */ + 13, /* reduc_i16_cost */ + /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6. */ + 9, /* reduc_i32_cost */ + /* Likewise for 1 scalar ADD (1 cycle) vs. 8: 1 + 7. */ + 8, /* reduc_i64_cost */ + /* Theoretically, a reduction involving 7 scalar FADDs could + complete in ~6 cycles and would have a cost of 14. Assume that + FADDV completes in 8 cycles and so give it a cost of 14 + 2. */ + 16, /* reduc_f16_cost */ + /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2. */ + 8, /* reduc_f32_cost */ + /* Likewise for 1 scalar FADD (2 cycles) vs. 4: 2 + 2. */ + 4, /* reduc_f64_cost */ + 2, /* store_elt_extra_cost */ + /* This value is just inherited from the Cortex-A57 table. */ + 8, /* vec_to_scalar_cost */ + /* This depends very much on what the scalar value is and + where it comes from. E.g. some constants take two dependent + instructions or a load, while others might be moved from a GPR. 
+ 4 seems to be a reasonable compromise in practice. */ + 4, /* scalar_to_vec_cost */ + 4, /* align_load_cost */ + 4, /* unalign_load_cost */ + /* Although stores generally have a latency of 2 and compete for the + vector pipes, in practice it's better not to model that. */ + 1, /* unalign_store_cost */ + 1 /* store_cost */ + }, + 3, /* clast_cost */ + 10, /* fadda_f16_cost */ + 6, /* fadda_f32_cost */ + 4, /* fadda_f64_cost */ + /* A strided Advanced SIMD x64 load would take two parallel FP loads + (6 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather + is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads + (cost 8) and a vec_construct (cost 2). Add a full vector operation + (cost 2) to that, to avoid the difference being lost in rounding. + + There is no easy comparison between a strided Advanced SIMD x32 load + and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector + operation more than a 64-bit gather. */ + 14, /* gather_load_x32_cost */ + 12, /* gather_load_x64_cost */ + 3 /* scatter_store_elt_cost */ +}; + +static const aarch64_sve_vec_issue_info neoverse512tvb_sve_issue_info = +{ + { + { + 3, /* loads_per_cycle */ + 2, /* stores_per_cycle */ + 4, /* general_ops_per_cycle */ + 0, /* fp_simd_load_general_ops */ + 1 /* fp_simd_store_general_ops */ + }, + 2, /* ld2_st2_general_ops */ + 2, /* ld3_st3_general_ops */ + 3 /* ld4_st4_general_ops */ + }, + 2, /* pred_ops_per_cycle */ + 2, /* while_pred_ops */ + 2, /* int_cmp_pred_ops */ + 1, /* fp_cmp_pred_ops */ + 1, /* gather_scatter_pair_general_ops */ + 1 /* gather_scatter_pair_pred_ops */ +}; + +static const aarch64_vec_issue_info neoverse512tvb_vec_issue_info = +{ + &neoversev1_scalar_issue_info, + &neoversev1_advsimd_issue_info, + &neoverse512tvb_sve_issue_info +}; + +static const struct cpu_vector_cost neoverse512tvb_vector_cost = +{ + 1, /* scalar_int_stmt_cost */ + 2, /* scalar_fp_stmt_cost */ + 4, /* scalar_load_cost */ + 1, /* scalar_store_cost */ + 1, /* cond_taken_branch_cost */ + 1, /* cond_not_taken_branch_cost */ + &neoversev1_advsimd_vector_cost, /* advsimd */ + &neoverse512tvb_sve_vector_cost, /* sve */ + &neoverse512tvb_vec_issue_info /* issue_info */ +}; + +static const struct tune_params neoverse512tvb_tunings = +{ + &cortexa76_extra_costs, + &neoversev1_addrcost_table, + &generic_regmove_cost, + &neoverse512tvb_vector_cost, + &generic_branch_cost, + &generic_approx_modes, + SVE_128 | SVE_256, /* sve_width */ + 4, /* memmov_cost */ + 3, /* issue_rate */ + (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */ + "32:16", /* function_align. */ + "4", /* jump_align. */ + "32:16", /* loop_align. */ + 2, /* int_reassoc_width. */ + 4, /* fp_reassoc_width. */ + 2, /* vec_reassoc_width. */ + 2, /* min_div_recip_mul_sf. */ + 2, /* min_div_recip_mul_df. */ + 0, /* max_case_values. */ + tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ + (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS + | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS + | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */ + &generic_prefetch_tune +}; + static const struct tune_params neoversen2_tunings = { &cortexa76_extra_costs, @@ -15569,10 +15699,32 @@ aarch64_adjust_body_cost_sve (const aarch64_vector_costs *costs, { /* Estimate the minimum number of cycles per iteration needed to issue non-predicate operations. 
*/ - fractional_cost sve_nonpred_cycles_per_iter + fractional_cost sve_nonpred_issue_cycles_per_iter = aarch64_estimate_min_cycles_per_iter (&costs->sve_ops, issue_info->sve); + /* Estimate the minimum number of cycles per iteration needed to rename + SVE instructions. + + ??? For now this is done inline rather than via cost tables, since it + isn't clear how it should be parameterized for the general case. */ + fractional_cost sve_rename_cycles_per_iter = 0; + if (issue_info == &neoverse512tvb_vec_issue_info) + /* + 1 for an addition. We've already counted a general op for each + store, so we don't need to account for stores separately. The branch + reads no registers and so does not need to be counted either. + + ??? This value is very much on the pessimistic side, but seems to work + pretty well in practice. */ + sve_rename_cycles_per_iter + = { costs->sve_ops.general_ops + + costs->sve_ops.loads + + costs->sve_ops.pred_ops + 1, 5 }; + + /* Combine the rename and non-predicate issue limits into a single value. */ + fractional_cost sve_nonpred_cycles_per_iter + = std::max (sve_nonpred_issue_cycles_per_iter, sve_rename_cycles_per_iter); + /* Separately estimate the minimum number of cycles per iteration needed to issue the predicate operations. */ fractional_cost sve_pred_issue_cycles_per_iter @@ -15588,14 +15740,17 @@ aarch64_adjust_body_cost_sve (const aarch64_vector_costs *costs, dump_printf_loc (MSG_NOTE, vect_location, " estimated cycles per iteration = %f\n", sve_cycles_per_iter.as_double ()); - dump_printf_loc (MSG_NOTE, vect_location, - " estimated cycles per iteration for non-predicate" - " operations = %f\n", - sve_nonpred_cycles_per_iter.as_double ()); if (costs->sve_ops.pred_ops) - dump_printf_loc (MSG_NOTE, vect_location, " estimated cycles per" - " iteration for predicate operations = %d\n", + dump_printf_loc (MSG_NOTE, vect_location, + " predicate issue = %f\n", sve_pred_issue_cycles_per_iter.as_double ()); + if (costs->sve_ops.pred_ops || sve_rename_cycles_per_iter) + dump_printf_loc (MSG_NOTE, vect_location, + " non-predicate issue = %f\n", + sve_nonpred_issue_cycles_per_iter.as_double ()); + if (sve_rename_cycles_per_iter) + dump_printf_loc (MSG_NOTE, vect_location, " rename = %f\n", + sve_rename_cycles_per_iter.as_double ()); } /* If the scalar version of the loop could issue at least as @@ -15770,6 +15925,21 @@ aarch64_adjust_body_cost (aarch64_vector_costs *costs, unsigned int body_cost) advsimd_cycles_per_iter, could_use_advsimd, orig_body_cost, &body_cost, &should_disparage); + + if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost) + { + /* Also take Neoverse V1 tuning into account, doubling the + scalar and Advanced SIMD estimates to account for the + doubling in SVE vector length. */ + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Neoverse V1 estimate:\n"); + aarch64_adjust_body_cost_sve (costs, &neoversev1_vec_issue_info, + scalar_cycles_per_iter * 2, + advsimd_cycles_per_iter * 2, + could_use_advsimd, orig_body_cost, + &body_cost, &should_disparage); + } } /* Decide whether to stick to latency-based costs or whether to try to -- cgit v1.1 From 98d7f305d5081bc91c16b9d2b4d62196b86bca86 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 16 Jul 2021 10:29:46 -0700 Subject: x86: Use XMM31 for scratch SSE register In 64-bit mode, use XMM31 for scratch SSE register to avoid vzeroupper if possible. gcc/ * config/i386/i386.c (ix86_gen_scratch_sse_rtx): In 64-bit mode, try XMM31 to avoid vzeroupper. 
gcc/testsuite/ * gcc.target/i386/avx-vzeroupper-14.c: Pass -mno-avx512f to disable XMM31. * gcc.target/i386/avx-vzeroupper-15.c: Likewise. * gcc.target/i386/pr82941-1.c: Updated. Check for vzeroupper. * gcc.target/i386/pr82942-1.c: Likewise. * gcc.target/i386/pr82990-1.c: Likewise. * gcc.target/i386/pr82990-3.c: Likewise. * gcc.target/i386/pr82990-5.c: Likewise. * gcc.target/i386/pr100865-4b.c: Likewise. * gcc.target/i386/pr100865-6b.c: Likewise. * gcc.target/i386/pr100865-7b.c: Likewise. * gcc.target/i386/pr100865-10b.c: Likewise. * gcc.target/i386/pr100865-8b.c: Updated. * gcc.target/i386/pr100865-9b.c: Likewise. * gcc.target/i386/pr100865-11b.c: Likewise. * gcc.target/i386/pr100865-12b.c: Likewise. --- gcc/config/i386/i386.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 842eb0e..ec06908 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -23335,9 +23335,21 @@ rtx ix86_gen_scratch_sse_rtx (machine_mode mode) { if (TARGET_SSE && !lra_in_progress) - return gen_rtx_REG (mode, (TARGET_64BIT - ? LAST_REX_SSE_REG - : LAST_SSE_REG)); + { + unsigned int regno; + if (TARGET_64BIT) + { + /* In 64-bit mode, use XMM31 to avoid vzeroupper and always + use XMM31 for CSE. */ + if (ix86_hard_regno_mode_ok (LAST_EXT_REX_SSE_REG, mode)) + regno = LAST_EXT_REX_SSE_REG; + else + regno = LAST_REX_SSE_REG; + } + else + regno = LAST_SSE_REG; + return gen_rtx_REG (mode, regno); + } else return gen_reg_rtx (mode); } -- cgit v1.1 From eaa93a0f3d9f67c8cbc1dc849ea6feba432ff412 Mon Sep 17 00:00:00 2001 From: "Paul A. Clarke" Date: Mon, 22 Feb 2021 19:13:28 -0600 Subject: rs6000: Add support for _mm_minpos_epu16 Add a naive implementation of the subject x86 intrinsic to ease porting. 2021-08-03 Paul A. Clarke gcc * config/rs6000/smmintrin.h (_mm_minpos_epu16): New. --- gcc/config/rs6000/smmintrin.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/smmintrin.h b/gcc/config/rs6000/smmintrin.h index 0145b92..3767a67 100644 --- a/gcc/config/rs6000/smmintrin.h +++ b/gcc/config/rs6000/smmintrin.h @@ -296,4 +296,31 @@ _mm_floor_ss (__m128 __A, __m128 __B) return __r; } +/* Return horizontal packed word minimum and its index in bits [15:0] + and bits [18:16] respectively. */ +__inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_minpos_epu16 (__m128i __A) +{ + union __u + { + __m128i __m; + __v8hu __uh; + }; + union __u __u = { .__m = __A }, __r = { .__m = {0} }; + unsigned short __ridx = 0; + unsigned short __rmin = __u.__uh[__ridx]; + for (unsigned long __i = 1; __i < 8; __i++) + { + if (__u.__uh[__i] < __rmin) + { + __rmin = __u.__uh[__i]; + __ridx = __i; + } + } + __r.__uh[0] = __rmin; + __r.__uh[1] = __ridx; + return __r.__m; +} + #endif -- cgit v1.1 From ebff536cf401f94129a50e50b69beeb09080a68a Mon Sep 17 00:00:00 2001 From: Segher Boessenkool Date: Tue, 3 Aug 2021 22:22:37 +0000 Subject: rs6000: "e" is not a free constraint letter It is the prefix of the "es" and "eI" constraints. 2021-08-03 Segher Boessenkool * config/rs6000/constraints.md: Remove "e" from the list of available constraint characters. 
--- gcc/config/rs6000/constraints.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/constraints.md b/gcc/config/rs6000/constraints.md index 561ce97..c8cff1a 100644 --- a/gcc/config/rs6000/constraints.md +++ b/gcc/config/rs6000/constraints.md @@ -17,7 +17,7 @@ ;; along with GCC; see the file COPYING3. If not see ;; . -;; Available constraint letters: e k q t u A B C D S T +;; Available constraint letters: k q t u A B C D S T ;; Register constraints -- cgit v1.1 From 3a7794b469f897e0141817785738e2faa73119b5 Mon Sep 17 00:00:00 2001 From: Segher Boessenkool Date: Fri, 4 Jun 2021 19:10:38 +0000 Subject: rs6000: Replace & by && 2021-08-03 Segher Boessenkool * config/rs6000/vsx.md (*vsx_le_perm_store_): Use && instead of &. --- gcc/config/rs6000/vsx.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md index 6f6fc0b..441735d 100644 --- a/gcc/config/rs6000/vsx.md +++ b/gcc/config/rs6000/vsx.md @@ -1014,7 +1014,7 @@ [(set (match_operand:VSX_LE_128 0 "memory_operand" "=Z,Q") (match_operand:VSX_LE_128 1 "vsx_register_operand" "+wa,r"))] "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR - & !altivec_indexed_or_indirect_operand (operands[0], mode)" + && !altivec_indexed_or_indirect_operand (operands[0], mode)" "@ # #" -- cgit v1.1 From 2fc2e3917f9c8fd94f5d101477971d16c483ef88 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Wed, 4 Aug 2021 11:41:37 +0800 Subject: Support cond_{fma,fms,fnma,fnms} for vector float/double under AVX512. gcc/ChangeLog: * config/i386/sse.md (cond_fma): New expander. (cond_fms): Ditto. (cond_fnma): Ditto. (cond_fnms): Ditto. gcc/testsuite/ChangeLog: * gcc.target/i386/cond_op_fma_double-1.c: New test. * gcc.target/i386/cond_op_fma_double-2.c: New test. * gcc.target/i386/cond_op_fma_float-1.c: New test. * gcc.target/i386/cond_op_fma_float-2.c: New test. 
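
For reference, a rough sketch (not one of the new testcases, which are not
reproduced here) of the kind of source loop that these cond_* expanders let
the vectorizer implement with a masked FMA; the function name and options are
illustrative only:

/* With options along the lines of -O3 -mavx512vl, the vectorizer can now
   select the cond_fma pattern and emit a masked vfmadd instead of a
   separate multiply, add and blend.  */
void
foo (double *r, const double *a, const double *b, const double *c,
     const long long *m, int n)
{
  for (int i = 0; i < n; i++)
    r[i] = m[i] ? __builtin_fma (a[i], b[i], c[i]) : c[i];
}
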
--- gcc/config/i386/sse.md | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 52b2b42..f5968e0 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -4438,6 +4438,29 @@ [(set_attr "type" "ssemuladd") (set_attr "mode" "")]) +(define_expand "cond_fma" + [(set (match_operand:VF_AVX512VL 0 "register_operand") + (vec_merge:VF_AVX512VL + (fma:VF_AVX512VL + (match_operand:VF_AVX512VL 2 "vector_operand") + (match_operand:VF_AVX512VL 3 "vector_operand") + (match_operand:VF_AVX512VL 4 "vector_operand")) + (match_operand:VF_AVX512VL 5 "nonimm_or_0_operand") + (match_operand: 1 "register_operand")))] + "TARGET_AVX512F" +{ + rtx tmp = gen_reg_rtx (mode); + emit_insn (gen_fma4 (tmp, + operands[2], + operands[3], + operands[4])); + emit_move_insn (operands[0], gen_rtx_VEC_MERGE (mode, + tmp, + operands[5], + operands[1])); + DONE; +}) + (define_insn "_fmadd__mask" [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v,v") (vec_merge:VF_AVX512VL @@ -4515,6 +4538,30 @@ [(set_attr "type" "ssemuladd") (set_attr "mode" "")]) +(define_expand "cond_fms" + [(set (match_operand:VF_AVX512VL 0 "register_operand") + (vec_merge:VF_AVX512VL + (fma:VF_AVX512VL + (match_operand:VF_AVX512VL 2 "vector_operand") + (match_operand:VF_AVX512VL 3 "vector_operand") + (neg:VF_AVX512VL + (match_operand:VF_AVX512VL 4 "vector_operand"))) + (match_operand:VF_AVX512VL 5 "nonimm_or_0_operand") + (match_operand: 1 "register_operand")))] + "TARGET_AVX512F" +{ + rtx tmp = gen_reg_rtx (mode); + emit_insn (gen_fms4 (tmp, + operands[2], + operands[3], + operands[4])); + emit_move_insn (operands[0], gen_rtx_VEC_MERGE (mode, + tmp, + operands[5], + operands[1])); + DONE; +}) + (define_insn "_fmsub__mask" [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v,v") (vec_merge:VF_AVX512VL @@ -4594,6 +4641,30 @@ [(set_attr "type" "ssemuladd") (set_attr "mode" "")]) +(define_expand "cond_fnma" + [(set (match_operand:VF_AVX512VL 0 "register_operand") + (vec_merge:VF_AVX512VL + (fma:VF_AVX512VL + (neg:VF_AVX512VL + (match_operand:VF_AVX512VL 2 "vector_operand")) + (match_operand:VF_AVX512VL 3 "vector_operand") + (match_operand:VF_AVX512VL 4 "vector_operand")) + (match_operand:VF_AVX512VL 5 "nonimm_or_0_operand") + (match_operand: 1 "register_operand")))] + "TARGET_AVX512F" +{ + rtx tmp = gen_reg_rtx (mode); + emit_insn (gen_fnma4 (tmp, + operands[2], + operands[3], + operands[4])); + emit_move_insn (operands[0], gen_rtx_VEC_MERGE (mode, + tmp, + operands[5], + operands[1])); + DONE; +}) + (define_insn "_fnmadd__mask" [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v,v") (vec_merge:VF_AVX512VL @@ -4675,6 +4746,31 @@ [(set_attr "type" "ssemuladd") (set_attr "mode" "")]) +(define_expand "cond_fnms" + [(set (match_operand:VF_AVX512VL 0 "register_operand") + (vec_merge:VF_AVX512VL + (fma:VF_AVX512VL + (neg:VF_AVX512VL + (match_operand:VF_AVX512VL 2 "vector_operand")) + (match_operand:VF_AVX512VL 3 "vector_operand") + (neg:VF_AVX512VL + (match_operand:VF_AVX512VL 4 "vector_operand"))) + (match_operand:VF_AVX512VL 5 "nonimm_or_0_operand") + (match_operand: 1 "register_operand")))] + "TARGET_AVX512F" +{ + rtx tmp = gen_reg_rtx (mode); + emit_insn (gen_fnms4 (tmp, + operands[2], + operands[3], + operands[4])); + emit_move_insn (operands[0], gen_rtx_VEC_MERGE (mode, + tmp, + operands[5], + operands[1])); + DONE; +}) + (define_insn "_fnmsub__mask" [(set (match_operand:VF_AVX512VL 0 "register_operand" 
"=v,v") (vec_merge:VF_AVX512VL -- cgit v1.1 From 9f26640f7b89c771b0ebffd7e7f5019d0709a955 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Wed, 4 Aug 2021 10:50:28 +0800 Subject: Refine predicate of peephole2 to general_reg_operand. [PR target/101743] The define_peephole2 which is added by r12-2640-gf7bf03cf69ccb7dc should only work on general registers, considering that x86 also supports mov instructions between gpr, sse reg, mask reg, limiting the peephole2 predicate to general_reg_operand. gcc/ChangeLog: PR target/101743 * config/i386/i386.md (peephole2): Refine predicate from register_operand to general_reg_operand. --- gcc/config/i386/i386.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 0c23ddb..51e8b47 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -19423,11 +19423,11 @@ ;; Eliminate a reg-reg mov by inverting the condition of a cmov (#1). ;; mov r0,r1; dec r0; mov r2,r3; cmov r0,r2 -> dec r1; mov r0,r3; cmov r0, r1 (define_peephole2 - [(set (match_operand:SWI248 0 "register_operand") - (match_operand:SWI248 1 "register_operand")) + [(set (match_operand:SWI248 0 "general_reg_operand") + (match_operand:SWI248 1 "general_reg_operand")) (parallel [(set (reg FLAGS_REG) (match_operand 5)) (set (match_dup 0) (match_operand:SWI248 6))]) - (set (match_operand:SWI248 2 "register_operand") + (set (match_operand:SWI248 2 "general_reg_operand") (match_operand:SWI248 3)) (set (match_dup 0) (if_then_else:SWI248 (match_operator 4 "ix86_comparison_operator" @@ -19455,10 +19455,10 @@ ;; Eliminate a reg-reg mov by inverting the condition of a cmov (#2). ;; mov r2,r3; mov r0,r1; dec r0; cmov r0,r2 -> dec r1; mov r0,r3; cmov r0, r1 (define_peephole2 - [(set (match_operand:SWI248 2 "register_operand") + [(set (match_operand:SWI248 2 "general_reg_operand") (match_operand:SWI248 3)) - (set (match_operand:SWI248 0 "register_operand") - (match_operand:SWI248 1 "register_operand")) + (set (match_operand:SWI248 0 "general_reg_operand") + (match_operand:SWI248 1 "general_reg_operand")) (parallel [(set (reg FLAGS_REG) (match_operand 5)) (set (match_dup 0) (match_operand:SWI248 6))]) (set (match_dup 0) -- cgit v1.1 From eb55b5b0df26e95c98ab59d34e69189d4f61bc0c Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Wed, 4 Aug 2021 16:52:07 +0100 Subject: aarch64: Fix a typo gcc/ * config/aarch64/aarch64.c: Fix a typo. --- gcc/config/aarch64/aarch64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index f80de2c..81c002b 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -15032,7 +15032,7 @@ aarch64_sve_in_loop_reduction_latency (vec_info *vinfo, scalar operation. - If VEC_FLAGS & VEC_ADVSIMD, return the loop carry latency of the - the Advanced SIMD implementation. + Advanced SIMD implementation. - If VEC_FLAGS & VEC_ANY_SVE, return the loop carry latency of the SVE implementation. -- cgit v1.1 From 1d65c9d25199264bc8909018df1b0dca71c0b32d Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Mon, 19 Jul 2021 14:01:52 +0100 Subject: aarch64: Don't include vec_select element in SIMD multiply cost The Neon multiply/multiply-accumulate/multiply-subtract instructions can take various forms - multiplying full vector registers of values or multiplying one vector by a single element of another. 
Regardless of the form used, these instructions have the same cost, and this should be reflected by the RTL cost function. This patch adds RTL tree traversal in the Neon multiply cost function to match the vec_select used by the lane-referencing forms of the instructions already mentioned. This traversal prevents the cost of the vec_select from being added into the cost of the multiply - meaning that these instructions can now be emitted in the combine pass as they are no longer deemed prohibitively expensive. gcc/ChangeLog: 2021-07-19 Jonathan Wright * config/aarch64/aarch64.c (aarch64_strip_duplicate_vec_elt): Define. (aarch64_rtx_mult_cost): Traverse RTL tree to prevent vec_select cost from being added into Neon multiply cost. gcc/testsuite/ChangeLog: * gcc.target/aarch64/vmul_element_cost.c: New test. --- gcc/config/aarch64/aarch64.c | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 81c002b..23829bb 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -12046,6 +12046,26 @@ aarch64_strip_extend (rtx x, bool strip_shift) return x; } + +/* Helper function for rtx cost calculation. Strip VEC_DUPLICATE as well as + any subsequent extend and VEC_SELECT from X. Returns the inner scalar + operand if successful, or the original expression on failure. */ +static rtx +aarch64_strip_duplicate_vec_elt (rtx x) +{ + if (GET_CODE (x) == VEC_DUPLICATE + && is_a (GET_MODE (XEXP (x, 0)))) + { + x = XEXP (x, 0); + if (GET_CODE (x) == VEC_SELECT) + x = XEXP (x, 0); + else if ((GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND) + && GET_CODE (XEXP (x, 0)) == VEC_SELECT) + x = XEXP (XEXP (x, 0), 0); + } + return x; +} + /* Return true iff CODE is a shift supported in combination with arithmetic instructions. */ @@ -12114,15 +12134,11 @@ aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed) if (vec_flags & VEC_ADVSIMD) { /* The by-element versions of the instruction have the same costs as - the normal 3-vector version. So don't add the costs of the - duplicate into the costs of the multiply. We make an assumption - that the input to the VEC_DUPLICATE is already on the FP & SIMD - side. This means costing of a MUL by element pre RA is a bit - optimistic. */ - if (GET_CODE (op0) == VEC_DUPLICATE) - op0 = XEXP (op0, 0); - else if (GET_CODE (op1) == VEC_DUPLICATE) - op1 = XEXP (op1, 0); + the normal 3-vector version. We make an assumption that the input + to the VEC_DUPLICATE is already on the FP & SIMD side. This means + costing of a MUL by element pre RA is a bit optimistic. */ + op0 = aarch64_strip_duplicate_vec_elt (op0); + op1 = aarch64_strip_duplicate_vec_elt (op1); } cost += rtx_cost (op0, mode, MULT, 0, speed); cost += rtx_cost (op1, mode, MULT, 1, speed); -- cgit v1.1 From 63834c84d43fc2eeeaa054c5e24d1e468e9eddab Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Mon, 19 Jul 2021 10:19:30 +0100 Subject: aarch64: Don't include vec_select high-half in SIMD multiply cost The Neon multiply/multiply-accumulate/multiply-subtract instructions can select the top or bottom half of the operand registers. This selection does not change the cost of the underlying instruction and this should be reflected by the RTL cost function. This patch adds RTL tree traversal in the Neon multiply cost function to match vec_select high-half of its operands. 
This traversal prevents the cost of the vec_select from being added into the cost of the multiply - meaning that these instructions can now be emitted in the combine pass as they are no longer deemed prohibitively expensive. gcc/ChangeLog: 2021-07-19 Jonathan Wright * config/aarch64/aarch64.c (aarch64_strip_extend_vec_half): Define. (aarch64_rtx_mult_cost): Traverse RTL tree to prevent cost of vec_select high-half from being added into Neon multiply cost. * rtlanal.c (vec_series_highpart_p): Define. * rtlanal.h (vec_series_highpart_p): Declare. gcc/testsuite/ChangeLog: * gcc.target/aarch64/vmul_high_cost.c: New test. --- gcc/config/aarch64/aarch64.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 23829bb..e02cbcb 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -78,6 +78,7 @@ #include "gimple-pretty-print.h" #include "tree-ssa-loop-niter.h" #include "fractional-cost.h" +#include "rtlanal.h" /* This file should be included last. */ #include "target-def.h" @@ -12046,6 +12047,22 @@ aarch64_strip_extend (rtx x, bool strip_shift) return x; } +/* Helper function for rtx cost calculation. Strip extension as well as any + inner VEC_SELECT high-half from X. Returns the inner vector operand if + successful, or the original expression on failure. */ +static rtx +aarch64_strip_extend_vec_half (rtx x) +{ + if (GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND) + { + x = XEXP (x, 0); + if (GET_CODE (x) == VEC_SELECT + && vec_series_highpart_p (GET_MODE (x), GET_MODE (XEXP (x, 0)), + XEXP (x, 1))) + x = XEXP (x, 0); + } + return x; +} /* Helper function for rtx cost calculation. Strip VEC_DUPLICATE as well as any subsequent extend and VEC_SELECT from X. Returns the inner scalar @@ -12133,6 +12150,11 @@ aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed) unsigned int vec_flags = aarch64_classify_vector_mode (mode); if (vec_flags & VEC_ADVSIMD) { + /* The select-operand-high-half versions of the instruction have the + same cost as the three vector version - don't add the costs of the + extension or selection into the costs of the multiply. */ + op0 = aarch64_strip_extend_vec_half (op0); + op1 = aarch64_strip_extend_vec_half (op1); /* The by-element versions of the instruction have the same costs as the normal 3-vector version. We make an assumption that the input to the VEC_DUPLICATE is already on the FP & SIMD side. This means -- cgit v1.1 From 5391688acc997e26375e42340cea885fa6ad0d7d Mon Sep 17 00:00:00 2001 From: Andreas Krebbel Date: Wed, 4 Aug 2021 18:40:09 +0200 Subject: IBM Z: Get rid of vec merge unspec This patch gets rid of the unspecs we were using for the vector merge instruction and replaces it with generic rtx. gcc/ChangeLog: * config/s390/s390-modes.def: Add more vector modes to support concatenation of two vectors. * config/s390/s390-protos.h (s390_expand_merge_perm_const): Add prototype. (s390_expand_merge): Likewise. * config/s390/s390.c (s390_expand_merge_perm_const): New function. (s390_expand_merge): New function. * config/s390/s390.md (UNSPEC_VEC_MERGEH, UNSPEC_VEC_MERGEL): Remove constant definitions. * config/s390/vector.md (V_HW_2): Add mode iterators. (VI_HW_4, V_HW_4): Rename VI_HW_4 to V_HW_4. (vec_2x_nelts, vec_2x_wide): New mode attributes. (*vmrhb, *vmrlb, *vmrhh, *vmrlh, *vmrhf, *vmrlf, *vmrhg, *vmrlg): New pattern definitions. 
(vec_widen_umult_lo_, vec_widen_umult_hi_) (vec_widen_smult_lo_, vec_widen_smult_hi_) (vec_unpacks_lo_v4sf, vec_unpacks_hi_v4sf, vec_unpacks_lo_v2df) (vec_unpacks_hi_v2df): Adjust expanders to emit non-unspec RTX for vec merge. * config/s390/vx-builtins.md (V_HW_4): Remove mode iterator. Now in vector.md. (vec_mergeh, vec_mergel): Use s390_expand_merge to emit vec merge pattern. gcc/testsuite/ChangeLog: * gcc.target/s390/vector/long-double-asm-in-out-hard-fp-reg.c: Instead of vpdi with 0 and 5 vmrlg and vmrhg are used now. * gcc.target/s390/vector/long-double-asm-inout-hard-fp-reg.c: Likewise. * gcc.target/s390/zvector/vec-types.h: New test. * gcc.target/s390/zvector/vec_merge.c: New test. --- gcc/config/s390/s390-modes.def | 11 ++- gcc/config/s390/s390-protos.h | 2 + gcc/config/s390/s390.c | 36 +++++++ gcc/config/s390/s390.md | 2 - gcc/config/s390/vector.md | 208 ++++++++++++++++++++++++++++++++++------- gcc/config/s390/vx-builtins.md | 35 ++++--- 6 files changed, 237 insertions(+), 57 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/s390/s390-modes.def b/gcc/config/s390/s390-modes.def index 6d814fc..245c2b8 100644 --- a/gcc/config/s390/s390-modes.def +++ b/gcc/config/s390/s390-modes.def @@ -259,14 +259,17 @@ CC_MODE (CCVFANY); /* Vector modes. */ -VECTOR_MODES (INT, 2); /* V2QI */ -VECTOR_MODES (INT, 4); /* V4QI V2HI */ -VECTOR_MODES (INT, 8); /* V8QI V4HI V2SI */ -VECTOR_MODES (INT, 16); /* V16QI V8HI V4SI V2DI */ +VECTOR_MODES (INT, 2); /* V2QI */ +VECTOR_MODES (INT, 4); /* V4QI V2HI */ +VECTOR_MODES (INT, 8); /* V8QI V4HI V2SI */ +VECTOR_MODES (INT, 16); /* V16QI V8HI V4SI V2DI */ +VECTOR_MODES (INT, 32); /* V32QI V16HI V8SI V4DI V2TI */ VECTOR_MODE (FLOAT, SF, 2); /* V2SF */ VECTOR_MODE (FLOAT, SF, 4); /* V4SF */ +VECTOR_MODE (FLOAT, SF, 8); /* V8SF */ VECTOR_MODE (FLOAT, DF, 2); /* V2DF */ +VECTOR_MODE (FLOAT, DF, 4); /* V4DF */ VECTOR_MODE (INT, QI, 1); /* V1QI */ VECTOR_MODE (INT, HI, 1); /* V1HI */ diff --git a/gcc/config/s390/s390-protos.h b/gcc/config/s390/s390-protos.h index 289e018..4b03c6e 100644 --- a/gcc/config/s390/s390-protos.h +++ b/gcc/config/s390/s390-protos.h @@ -122,6 +122,8 @@ extern void s390_expand_vec_compare_cc (rtx, enum rtx_code, rtx, rtx, bool); extern enum rtx_code s390_reverse_condition (machine_mode, enum rtx_code); extern void s390_expand_vcond (rtx, rtx, rtx, enum rtx_code, rtx, rtx); extern void s390_expand_vec_init (rtx, rtx); +extern rtx s390_expand_merge_perm_const (machine_mode, bool); +extern void s390_expand_merge (rtx, rtx, rtx, bool); extern rtx s390_build_signbit_mask (machine_mode); extern rtx s390_return_addr_rtx (int, rtx); extern rtx s390_back_chain_rtx (void); diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c index 8c7d366..3f4521e 100644 --- a/gcc/config/s390/s390.c +++ b/gcc/config/s390/s390.c @@ -7014,6 +7014,42 @@ s390_expand_vec_init (rtx target, rtx vals) } } +/* Return a parallel of constant integers to be used as permutation + vector for a vector merge operation in MODE. If HIGH_P is true the + left-most elements of the source vectors are merged otherwise the + right-most elements. */ +rtx +s390_expand_merge_perm_const (machine_mode mode, bool high_p) +{ + int nelts = GET_MODE_NUNITS (mode); + rtx perm[16]; + int addend = high_p ? 
0 : nelts; + + for (int i = 0; i < nelts; i++) + perm[i] = GEN_INT ((i + addend) / 2 + (i % 2) * nelts); + + return gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (nelts, perm)); +} + +/* Emit RTL to implement a vector merge operation of SRC1 and SRC2 + which creates the result in TARGET. HIGH_P determines whether a + merge hi or lo will be generated. */ +void +s390_expand_merge (rtx target, rtx src1, rtx src2, bool high_p) +{ + machine_mode mode = GET_MODE (target); + opt_machine_mode opt_mode_2x = mode_for_vector (GET_MODE_INNER (mode), + 2 * GET_MODE_NUNITS (mode)); + gcc_assert (opt_mode_2x.exists ()); + machine_mode mode_double_nelts = opt_mode_2x.require (); + rtx constv = s390_expand_merge_perm_const (mode, high_p); + src1 = force_reg (GET_MODE (src1), src1); + src2 = force_reg (GET_MODE (src2), src2); + rtx x = gen_rtx_VEC_CONCAT (mode_double_nelts, src1, src2); + x = gen_rtx_VEC_SELECT (mode, x, constv); + emit_insn (gen_rtx_SET (target, x)); +} + /* Emit a vector constant that contains 1s in each element's sign bit position and 0s in other positions. MODE is the desired constant's mode. */ extern rtx diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md index 8ad21b0..d896fae 100644 --- a/gcc/config/s390/s390.md +++ b/gcc/config/s390/s390.md @@ -158,8 +158,6 @@ UNSPEC_VEC_LOAD_BNDRY UNSPEC_VEC_LOAD_LEN UNSPEC_VEC_LOAD_LEN_R - UNSPEC_VEC_MERGEH - UNSPEC_VEC_MERGEL UNSPEC_VEC_PACK UNSPEC_VEC_PACK_SATURATE UNSPEC_VEC_PACK_SATURATE_CC diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md index ab605b3..51c6332 100644 --- a/gcc/config/s390/vector.md +++ b/gcc/config/s390/vector.md @@ -50,7 +50,10 @@ (define_mode_iterator VI_HW_HSD [V8HI V4SI V2DI]) (define_mode_iterator VI_HW_HS [V8HI V4SI]) (define_mode_iterator VI_HW_QH [V16QI V8HI]) -(define_mode_iterator VI_HW_4 [V4SI V4SF]) + +; Directly supported vector modes with a certain number of elements +(define_mode_iterator V_HW_2 [V2DI V2DF]) +(define_mode_iterator V_HW_4 [V4SI V4SF]) ; All integer vector modes supported in a vector register + TImode (define_mode_iterator VIT [V1QI V2QI V4QI V8QI V16QI V1HI V2HI V4HI V8HI V1SI V2SI V4SI V1DI V2DI V1TI TI]) @@ -163,14 +166,14 @@ (DF "d") (V1DF "d") (V2DF "d") (TF "x") (V1TF "x")]) -; Vector with doubled element size. +; Vector with widened element size but half the number of elements. (define_mode_attr vec_double [(V1QI "V1HI") (V2QI "V1HI") (V4QI "V2HI") (V8QI "V4HI") (V16QI "V8HI") (V1HI "V1SI") (V2HI "V1SI") (V4HI "V2SI") (V8HI "V4SI") (V1SI "V1DI") (V2SI "V1DI") (V4SI "V2DI") (V1DI "V1TI") (V2DI "V1TI") (V1SF "V1DF") (V2SF "V1DF") (V4SF "V2DF")]) -; Vector with half the element size. +; Vector with shrinked element size but twice the number of elements. (define_mode_attr vec_half [(V1HI "V2QI") (V2HI "V4QI") (V4HI "V8QI") (V8HI "V16QI") (V1SI "V2HI") (V2SI "V4HI") (V4SI "V8HI") (V1DI "V2SI") (V2DI "V4SI") @@ -178,6 +181,22 @@ (V1DF "V2SF") (V2DF "V4SF") (V1TF "V1DF")]) +; Vector with twice the number of elements but same element size. +(define_mode_attr vec_2x_nelts [(V1QI "V2QI") (V2QI "V4QI") (V4QI "V8QI") (V8QI "V16QI") (V16QI "V32QI") + (V1HI "V2HI") (V2HI "V4HI") (V4HI "V8HI") (V8HI "V16HI") + (V1SI "V2SI") (V2SI "V4SI") (V4SI "V8SI") + (V1DI "V2DI") (V2DI "V4DI") + (V1SF "V2SF") (V2SF "V4SF") (V4SF "V8SF") + (V1DF "V2DF") (V2DF "V4DF")]) + +; Vector with widened element size and the same number of elements. 
+(define_mode_attr vec_2x_wide [(V1QI "V1HI") (V2QI "V2HI") (V4QI "V4HI") (V8QI "V8HI") (V16QI "V16HI") + (V1HI "V1SI") (V2HI "V2SI") (V4HI "V4SI") (V8HI "V8SI") + (V1SI "V1DI") (V2SI "V2DI") (V4SI "V4DI") + (V1DI "V1TI") (V2DI "V2TI") + (V1SF "V1DF") (V2SF "V2DF") (V4SF "V4DF") + (V1DF "V1TF") (V2DF "V2TF")]) + ; Vector with half the element size AND half the number of elements. (define_mode_attr vec_halfhalf [(V2HI "V2QI") (V4HI "V4QI") (V8HI "V8QI") @@ -532,8 +551,8 @@ }) (define_insn "*vec_vllezlf" - [(set (match_operand:VI_HW_4 0 "register_operand" "=v") - (vec_concat:VI_HW_4 + [(set (match_operand:V_HW_4 0 "register_operand" "=v") + (vec_concat:V_HW_4 (vec_concat: (match_operand: 1 "memory_operand" "R") (const_int 0)) @@ -748,6 +767,109 @@ "vperm\t%v0,%v1,%v2,%v3" [(set_attr "op_type" "VRR")]) +(define_insn "*vmrhb" + [(set (match_operand:V16QI 0 "register_operand" "=v") + (vec_select:V16QI + (vec_concat:V32QI (match_operand:V16QI 1 "register_operand" "v") + (match_operand:V16QI 2 "register_operand" "v")) + (parallel [(const_int 0) (const_int 16) + (const_int 1) (const_int 17) + (const_int 2) (const_int 18) + (const_int 3) (const_int 19) + (const_int 4) (const_int 20) + (const_int 5) (const_int 21) + (const_int 6) (const_int 22) + (const_int 7) (const_int 23)])))] + "TARGET_VX" + "vmrhb\t%0,%1,%2"; + [(set_attr "op_type" "VRR")]) + +(define_insn "*vmrlb" + [(set (match_operand:V16QI 0 "register_operand" "=v") + (vec_select:V16QI + (vec_concat:V32QI (match_operand:V16QI 1 "register_operand" "v") + (match_operand:V16QI 2 "register_operand" "v")) + (parallel [(const_int 8) (const_int 24) + (const_int 9) (const_int 25) + (const_int 10) (const_int 26) + (const_int 11) (const_int 27) + (const_int 12) (const_int 28) + (const_int 13) (const_int 29) + (const_int 14) (const_int 30) + (const_int 15) (const_int 31)])))] + "TARGET_VX" + "vmrlb\t%0,%1,%2"; + [(set_attr "op_type" "VRR")]) + +(define_insn "*vmrhh" + [(set (match_operand:V8HI 0 "register_operand" "=v") + (vec_select:V8HI + (vec_concat:V16HI (match_operand:V8HI 1 "register_operand" "v") + (match_operand:V8HI 2 "register_operand" "v")) + (parallel [(const_int 0) (const_int 8) + (const_int 1) (const_int 9) + (const_int 2) (const_int 10) + (const_int 3) (const_int 11)])))] + "TARGET_VX" + "vmrhh\t%0,%1,%2"; + [(set_attr "op_type" "VRR")]) + +(define_insn "*vmrlh" + [(set (match_operand:V8HI 0 "register_operand" "=v") + (vec_select:V8HI + (vec_concat:V16HI (match_operand:V8HI 1 "register_operand" "v") + (match_operand:V8HI 2 "register_operand" "v")) + (parallel [(const_int 4) (const_int 12) + (const_int 5) (const_int 13) + (const_int 6) (const_int 14) + (const_int 7) (const_int 15)])))] + "TARGET_VX" + "vmrlh\t%0,%1,%2"; + [(set_attr "op_type" "VRR")]) + +(define_insn "*vmrhf" + [(set (match_operand:V_HW_4 0 "register_operand" "=v") + (vec_select:V_HW_4 + (vec_concat: (match_operand:V_HW_4 1 "register_operand" "v") + (match_operand:V_HW_4 2 "register_operand" "v")) + (parallel [(const_int 0) (const_int 4) + (const_int 1) (const_int 5)])))] + "TARGET_VX" + "vmrhf\t%0,%1,%2"; + [(set_attr "op_type" "VRR")]) + +(define_insn "*vmrlf" + [(set (match_operand:V_HW_4 0 "register_operand" "=v") + (vec_select:V_HW_4 + (vec_concat: (match_operand:V_HW_4 1 "register_operand" "v") + (match_operand:V_HW_4 2 "register_operand" "v")) + (parallel [(const_int 2) (const_int 6) + (const_int 3) (const_int 7)])))] + "TARGET_VX" + "vmrlf\t%0,%1,%2"; + [(set_attr "op_type" "VRR")]) + +(define_insn "*vmrhg" + [(set (match_operand:V_HW_2 0 "register_operand" "=v") 
+ (vec_select:V_HW_2 + (vec_concat: (match_operand:V_HW_2 1 "register_operand" "v") + (match_operand:V_HW_2 2 "register_operand" "v")) + (parallel [(const_int 0) (const_int 2)])))] + "TARGET_VX" + "vmrhg\t%0,%1,%2"; + [(set_attr "op_type" "VRR")]) + +(define_insn "*vmrlg" + [(set (match_operand:V_HW_2 0 "register_operand" "=v") + (vec_select:V_HW_2 + (vec_concat: (match_operand:V_HW_2 1 "register_operand" "v") + (match_operand:V_HW_2 2 "register_operand" "v")) + (parallel [(const_int 1) (const_int 3)])))] + "TARGET_VX" + "vmrlg\t%0,%1,%2"; + [(set_attr "op_type" "VRR")]) + + (define_insn "*tf_to_fprx2_0" [(set (subreg:DF (match_operand:FPRX2 0 "nonimmediate_operand" "+f") 0) (subreg:DF (match_operand:TF 1 "general_operand" "v") 0))] @@ -1271,12 +1393,14 @@ (unspec: [(match_dup 1) (match_dup 2)] UNSPEC_VEC_UMULT_ODD)) (set (match_operand: 0 "register_operand" "") - (unspec: [(match_dup 3) (match_dup 4)] - UNSPEC_VEC_MERGEL))] + (vec_select: + (vec_concat: (match_dup 3) (match_dup 4)) + (match_dup 5)))] "TARGET_VX" { operands[3] = gen_reg_rtx (mode); operands[4] = gen_reg_rtx (mode); + operands[5] = s390_expand_merge_perm_const (mode, false); }) (define_expand "vec_widen_umult_hi_" @@ -1288,12 +1412,14 @@ (unspec: [(match_dup 1) (match_dup 2)] UNSPEC_VEC_UMULT_ODD)) (set (match_operand: 0 "register_operand" "") - (unspec: [(match_dup 3) (match_dup 4)] - UNSPEC_VEC_MERGEH))] + (vec_select: + (vec_concat: (match_dup 3) (match_dup 4)) + (match_dup 5)))] "TARGET_VX" { operands[3] = gen_reg_rtx (mode); operands[4] = gen_reg_rtx (mode); + operands[5] = s390_expand_merge_perm_const (mode, true); }) (define_expand "vec_widen_smult_lo_" @@ -1305,12 +1431,14 @@ (unspec: [(match_dup 1) (match_dup 2)] UNSPEC_VEC_SMULT_ODD)) (set (match_operand: 0 "register_operand" "") - (unspec: [(match_dup 3) (match_dup 4)] - UNSPEC_VEC_MERGEL))] + (vec_select: + (vec_concat: (match_dup 3) (match_dup 4)) + (match_dup 5)))] "TARGET_VX" { operands[3] = gen_reg_rtx (mode); operands[4] = gen_reg_rtx (mode); + operands[5] = s390_expand_merge_perm_const (mode, false); }) (define_expand "vec_widen_smult_hi_" @@ -1322,12 +1450,14 @@ (unspec: [(match_dup 1) (match_dup 2)] UNSPEC_VEC_SMULT_ODD)) (set (match_operand: 0 "register_operand" "") - (unspec: [(match_dup 3) (match_dup 4)] - UNSPEC_VEC_MERGEH))] + (vec_select: + (vec_concat: (match_dup 3) (match_dup 4)) + (match_dup 5)))] "TARGET_VX" { operands[3] = gen_reg_rtx (mode); operands[4] = gen_reg_rtx (mode); + operands[5] = s390_expand_merge_perm_const (mode, true); }) ; vec_widen_ushiftl_hi @@ -2166,29 +2296,35 @@ (define_expand "vec_unpacks_lo_v4sf" [(set (match_dup 2) - (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "v") - (match_dup 1)] - UNSPEC_VEC_MERGEL)) - (set (match_operand:V2DF 0 "register_operand" "=v") + (vec_select:V4SF + (vec_concat:V8SF (match_operand:V4SF 1 "register_operand" "") (match_dup 1)) + (match_dup 3))) + (set (match_operand:V2DF 0 "register_operand" "") (float_extend:V2DF (vec_select:V2SF (match_dup 2) (parallel [(const_int 0) (const_int 2)]))))] "TARGET_VX" -{ operands[2] = gen_reg_rtx(V4SFmode); }) +{ + operands[2] = gen_reg_rtx(V4SFmode); + operands[3] = s390_expand_merge_perm_const (V4SFmode, false); +}) (define_expand "vec_unpacks_hi_v4sf" [(set (match_dup 2) - (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "v") - (match_dup 1)] - UNSPEC_VEC_MERGEH)) - (set (match_operand:V2DF 0 "register_operand" "=v") + (vec_select:V4SF + (vec_concat:V8SF (match_operand:V4SF 1 "register_operand" "") (match_dup 1)) + (match_dup 3))) + (set 
(match_operand:V2DF 0 "register_operand" "") (float_extend:V2DF (vec_select:V2SF (match_dup 2) (parallel [(const_int 0) (const_int 2)]))))] "TARGET_VX" -{ operands[2] = gen_reg_rtx(V4SFmode); }) +{ + operands[2] = gen_reg_rtx(V4SFmode); + operands[3] = s390_expand_merge_perm_const (V4SFmode, true); +}) ; double -> long double @@ -2204,29 +2340,35 @@ (define_expand "vec_unpacks_lo_v2df" [(set (match_dup 2) - (unspec:V2DF [(match_operand:V2DF 1 "register_operand" "v") - (match_dup 1)] - UNSPEC_VEC_MERGEL)) - (set (match_operand:V1TF 0 "register_operand" "=v") + (vec_select:V2DF + (vec_concat:V4DF (match_operand:V2DF 1 "register_operand" "") (match_dup 1)) + (match_dup 3))) + (set (match_operand:V1TF 0 "register_operand" "") (float_extend:V1TF (vec_select:V1DF (match_dup 2) (parallel [(const_int 0)]))))] "TARGET_VXE" -{ operands[2] = gen_reg_rtx (V2DFmode); }) +{ + operands[2] = gen_reg_rtx (V2DFmode); + operands[3] = s390_expand_merge_perm_const (V2DFmode, false); +}) (define_expand "vec_unpacks_hi_v2df" [(set (match_dup 2) - (unspec:V2DF [(match_operand:V2DF 1 "register_operand" "v") - (match_dup 1)] - UNSPEC_VEC_MERGEH)) - (set (match_operand:V1TF 0 "register_operand" "=v") + (vec_select:V2DF + (vec_concat:V4DF (match_operand:V2DF 1 "register_operand" "") (match_dup 1)) + (match_dup 3))) + (set (match_operand:V1TF 0 "register_operand" "") (float_extend:V1TF (vec_select:V1DF (match_dup 2) (parallel [(const_int 0)]))))] "TARGET_VXE" -{ operands[2] = gen_reg_rtx (V2DFmode); }) +{ + operands[2] = gen_reg_rtx (V2DFmode); + operands[3] = s390_expand_merge_perm_const (V2DFmode, true); +}) ; 2 x v2df -> 1 x v4sf diff --git a/gcc/config/s390/vx-builtins.md b/gcc/config/s390/vx-builtins.md index 3df501b..5abe43b 100644 --- a/gcc/config/s390/vx-builtins.md +++ b/gcc/config/s390/vx-builtins.md @@ -22,7 +22,7 @@ (define_mode_iterator V_HW_32_64 [V4SI V2DI V2DF (V4SF "TARGET_VXE")]) (define_mode_iterator VI_HW_SD [V4SI V2DI]) -(define_mode_iterator V_HW_4 [V4SI V4SF]) + ; Full size vector modes with more than one element which are directly supported in vector registers by the hardware. (define_mode_iterator VEC_HW [V16QI V8HI V4SI V2DI V2DF (V4SF "TARGET_VXE")]) (define_mode_iterator VECF_HW [(V4SF "TARGET_VXE") V2DF]) @@ -232,28 +232,27 @@ [(set_attr "op_type" "VRS,VRX,VSI")]) -; FIXME: The following two patterns might using vec_merge. 
But what is -; the canonical form: (vec_select (vec_merge op0 op1)) or (vec_merge -; (vec_select op0) (vec_select op1) ; vmrhb, vmrhh, vmrhf, vmrhg -(define_insn "vec_mergeh" - [(set (match_operand:V_128_NOSINGLE 0 "register_operand" "=v") - (unspec:V_128_NOSINGLE [(match_operand:V_128_NOSINGLE 1 "register_operand" "v") - (match_operand:V_128_NOSINGLE 2 "register_operand" "v")] - UNSPEC_VEC_MERGEH))] +(define_expand "vec_mergeh" + [(match_operand:V_128_NOSINGLE 0 "register_operand" "") + (match_operand:V_128_NOSINGLE 1 "register_operand" "") + (match_operand:V_128_NOSINGLE 2 "register_operand" "")] "TARGET_VX" - "vmrh\t%v0,%1,%2" - [(set_attr "op_type" "VRR")]) +{ + s390_expand_merge (operands[0], operands[1], operands[2], true); + DONE; +}) ; vmrlb, vmrlh, vmrlf, vmrlg -(define_insn "vec_mergel" - [(set (match_operand:V_128_NOSINGLE 0 "register_operand" "=v") - (unspec:V_128_NOSINGLE [(match_operand:V_128_NOSINGLE 1 "register_operand" "v") - (match_operand:V_128_NOSINGLE 2 "register_operand" "v")] - UNSPEC_VEC_MERGEL))] +(define_expand "vec_mergel" + [(match_operand:V_128_NOSINGLE 0 "register_operand" "") + (match_operand:V_128_NOSINGLE 1 "register_operand" "") + (match_operand:V_128_NOSINGLE 2 "register_operand" "")] "TARGET_VX" - "vmrl\t%v0,%1,%2" - [(set_attr "op_type" "VRR")]) +{ + s390_expand_merge (operands[0], operands[1], operands[2], false); + DONE; +}) ; Vector pack -- cgit v1.1 From 0aa7091befa9fdb67f7013dbd454d336a31ef71d Mon Sep 17 00:00:00 2001 From: Andreas Krebbel Date: Wed, 4 Aug 2021 18:40:09 +0200 Subject: IBM Z: Get rid of vpdi unspec The patch gets rid of the unspec used for the vector permute double immediate instruction and replaces it with generic rtx. gcc/ChangeLog: * config/s390/s390.md (UNSPEC_VEC_PERMI): Remove constant definition. * config/s390/vector.md (*vpdi1, *vpdi4): New pattern definitions. * config/s390/vx-builtins.md (*vec_permi): Emit generic rtx instead of an unspec. gcc/testsuite/ChangeLog: * gcc.target/s390/zvector/vec-permi.c: Removed. * gcc.target/s390/zvector/vec_permi.c: New test. 
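For reference, a minimal sketch (the helper name is invented; it is not part of
the patch) of how the reworked vec_permi expander below turns the 2-bit
intrinsic immediate into the two element indices of the vec_select over the
concatenated operands: bit 1 picks the doubleword of operand 1, bit 0 picks the
doubleword of operand 2, whose elements sit at indices 2 and 3 of the
concatenation.

    /* Sketch only, mirroring the GEN_INT computations in the expander.  */
    static inline void
    vec_permi_indices (int imm, int *first, int *second)
    {
      *first  = (imm & 2) >> 1;   /* element 0 or 1 of operand 1 */
      *second = (imm & 1) + 2;    /* element 0 or 1 of operand 2 */
    }

With this, immediates 0 and 3 of vec_permi resolve to the {0,2} and {1,3}
selections already matched by *vmrhg and *vmrlg, while the new *vpdi1 and
*vpdi4 patterns cover the remaining {0,3} and {1,2} selections.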
--- gcc/config/s390/s390.md | 1 - gcc/config/s390/vector.md | 26 ++++++++++++++++++++++++++ gcc/config/s390/vx-builtins.md | 26 ++++++++++---------------- 3 files changed, 36 insertions(+), 17 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md index d896fae..1b894a9 100644 --- a/gcc/config/s390/s390.md +++ b/gcc/config/s390/s390.md @@ -166,7 +166,6 @@ UNSPEC_VEC_PACK_UNSIGNED_SATURATE_CC UNSPEC_VEC_PACK_UNSIGNED_SATURATE_GENCC UNSPEC_VEC_PERM - UNSPEC_VEC_PERMI UNSPEC_VEC_EXTEND UNSPEC_VEC_STORE_LEN UNSPEC_VEC_STORE_LEN_R diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md index 51c6332..48dc564 100644 --- a/gcc/config/s390/vector.md +++ b/gcc/config/s390/vector.md @@ -767,6 +767,32 @@ "vperm\t%v0,%v1,%v2,%v3" [(set_attr "op_type" "VRR")]) + +; First DW of op1 and second DW of op2 +(define_insn "*vpdi1" + [(set (match_operand:V_HW_2 0 "register_operand" "=v") + (vec_select:V_HW_2 + (vec_concat: + (match_operand:V_HW_2 1 "register_operand" "v") + (match_operand:V_HW_2 2 "register_operand" "v")) + (parallel [(const_int 0) (const_int 3)])))] + "TARGET_VX" + "vpdi\t%v0,%v1,%v2,1" + [(set_attr "op_type" "VRR")]) + +; Second DW of op1 and first of op2 +(define_insn "*vpdi4" + [(set (match_operand:V_HW_2 0 "register_operand" "=v") + (vec_select:V_HW_2 + (vec_concat: + (match_operand:V_HW_2 1 "register_operand" "v") + (match_operand:V_HW_2 2 "register_operand" "v")) + (parallel [(const_int 1) (const_int 2)])))] + "TARGET_VX" + "vpdi\t%v0,%v1,%v2,4" + [(set_attr "op_type" "VRR")]) + + (define_insn "*vmrhb" [(set (match_operand:V16QI 0 "register_operand" "=v") (vec_select:V16QI diff --git a/gcc/config/s390/vx-builtins.md b/gcc/config/s390/vx-builtins.md index 5abe43b..3799e83 100644 --- a/gcc/config/s390/vx-builtins.md +++ b/gcc/config/s390/vx-builtins.md @@ -403,28 +403,22 @@ "vperm\t%v0,%v1,%v2,%v3" [(set_attr "op_type" "VRR")]) +; Incoming op3 is in vec_permi format and will we turned into a +; permute vector consisting of op3 and op4. (define_expand "vec_permi" - [(set (match_operand:V_HW_64 0 "register_operand" "") - (unspec:V_HW_64 [(match_operand:V_HW_64 1 "register_operand" "") - (match_operand:V_HW_64 2 "register_operand" "") - (match_operand:QI 3 "const_mask_operand" "")] - UNSPEC_VEC_PERMI))] + [(set (match_operand:V_HW_2 0 "register_operand" "") + (vec_select:V_HW_2 + (vec_concat: + (match_operand:V_HW_2 1 "register_operand" "") + (match_operand:V_HW_2 2 "register_operand" "")) + (parallel [(match_operand:QI 3 "const_mask_operand" "") (match_dup 4)])))] "TARGET_VX" { HOST_WIDE_INT val = INTVAL (operands[3]); - operands[3] = GEN_INT ((val & 1) | (val & 2) << 1); + operands[3] = GEN_INT ((val & 2) >> 1); + operands[4] = GEN_INT ((val & 1) + 2); }) -(define_insn "*vec_permi" - [(set (match_operand:V_HW_64 0 "register_operand" "=v") - (unspec:V_HW_64 [(match_operand:V_HW_64 1 "register_operand" "v") - (match_operand:V_HW_64 2 "register_operand" "v") - (match_operand:QI 3 "const_mask_operand" "C")] - UNSPEC_VEC_PERMI))] - "TARGET_VX && (UINTVAL (operands[3]) & 10) == 0" - "vpdi\t%v0,%v1,%v2,%b3" - [(set_attr "op_type" "VRR")]) - ; Vector replicate -- cgit v1.1 From 4e34925ef1aeab73e022d80149be8cec92c48667 Mon Sep 17 00:00:00 2001 From: Andreas Krebbel Date: Wed, 4 Aug 2021 18:40:10 +0200 Subject: IBM Z: Remove redundant V_HW_64 mode iterator. gcc/ChangeLog: * config/s390/vector.md (V_HW_64): Remove mode iterator. (*vec_load_pair): Use V_HW_2 instead of V_HW_64. 
* config/s390/vx-builtins.md (vec_scatter_element_SI): Use V_HW_2 instead of V_HW_64. --- gcc/config/s390/vector.md | 7 +++---- gcc/config/s390/vx-builtins.md | 14 +++++++------- 2 files changed, 10 insertions(+), 11 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md index 48dc564..d224165 100644 --- a/gcc/config/s390/vector.md +++ b/gcc/config/s390/vector.md @@ -36,7 +36,6 @@ (define_mode_iterator V_HW2 [V16QI V8HI V4SI V2DI V2DF (V4SF "TARGET_VXE") (V1TF "TARGET_VXE") (TF "TARGET_VXE")]) -(define_mode_iterator V_HW_64 [V2DI V2DF]) (define_mode_iterator VT_HW_HSDT [V8HI V4SI V4SF V2DI V2DF V1TI V1TF TI TF]) (define_mode_iterator V_HW_HSD [V8HI V4SI (V4SF "TARGET_VXE") V2DI V2DF]) @@ -1972,9 +1971,9 @@ }) (define_insn "*vec_load_pair" - [(set (match_operand:V_HW_64 0 "register_operand" "=v,v") - (vec_concat:V_HW_64 (match_operand: 1 "register_operand" "d,v") - (match_operand: 2 "register_operand" "d,v")))] + [(set (match_operand:V_HW_2 0 "register_operand" "=v,v") + (vec_concat:V_HW_2 (match_operand: 1 "register_operand" "d,v") + (match_operand: 2 "register_operand" "d,v")))] "TARGET_VX" "@ vlvgp\t%v0,%1,%2 diff --git a/gcc/config/s390/vx-builtins.md b/gcc/config/s390/vx-builtins.md index 3799e83..3e7b854 100644 --- a/gcc/config/s390/vx-builtins.md +++ b/gcc/config/s390/vx-builtins.md @@ -452,17 +452,17 @@ ; A 31 bit target address is generated from 64 bit elements ; vsceg -(define_insn "vec_scatter_element_SI" +(define_insn "vec_scatter_element_SI" [(set (mem: (plus:SI (subreg:SI - (unspec: [(match_operand:V_HW_64 1 "register_operand" "v") - (match_operand:QI 3 "const_mask_operand" "C")] + (unspec: [(match_operand:V_HW_2 1 "register_operand" "v") + (match_operand:QI 3 "const_mask_operand" "C")] UNSPEC_VEC_EXTRACT) 4) - (match_operand:SI 2 "address_operand" "ZQ"))) - (unspec: [(match_operand:V_HW_64 0 "register_operand" "v") + (match_operand:SI 2 "address_operand" "ZQ"))) + (unspec: [(match_operand:V_HW_2 0 "register_operand" "v") (match_dup 3)] UNSPEC_VEC_EXTRACT))] - "TARGET_VX && !TARGET_64BIT && UINTVAL (operands[3]) < GET_MODE_NUNITS (mode)" - "vsce\t%v0,%O2(%v1,%R2),%3" + "TARGET_VX && !TARGET_64BIT && UINTVAL (operands[3]) < GET_MODE_NUNITS (mode)" + "vsce\t%v0,%O2(%v1,%R2),%3" [(set_attr "op_type" "VRV")]) ; Element size and target address size is the same -- cgit v1.1 From 6dc8c4656444153c9e2f98d382de39728a849672 Mon Sep 17 00:00:00 2001 From: Andreas Krebbel Date: Wed, 4 Aug 2021 18:40:10 +0200 Subject: IBM Z: Implement TARGET_VECTORIZE_VEC_PERM_CONST for vector merge This patch implements the TARGET_VECTORIZE_VEC_PERM_CONST in the IBM Z backend. The initial implementation only exploits the vector merge instruction but there is more to come. gcc/ChangeLog: * config/s390/s390.c (MAX_VECT_LEN): Define macro. (struct expand_vec_perm_d): Define struct. (expand_perm_with_merge): New function. (vectorize_vec_perm_const_1): New function. (s390_vectorize_vec_perm_const): New function. (TARGET_VECTORIZE_VEC_PERM_CONST): Define target macro. gcc/testsuite/ChangeLog: * gcc.target/s390/vector/perm-merge.c: New test. * gcc.target/s390/vector/vec-types.h: New test. 
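A hedged example (the type and function names are invented) of the kind of
constant permutation the new hook lets the middle end map onto a single vector
merge instruction; the index vectors correspond to the { 0, 4, 1, 5 } and
{ 2, 6, 3, 7 } checks in expand_perm_with_merge below:

    /* Sketch only: generic GCC vector shuffles that the new
       TARGET_VECTORIZE_VEC_PERM_CONST hook can expand to vmrhf / vmrlf.  */
    typedef int v4si __attribute__ ((vector_size (16)));

    v4si
    merge_high (v4si a, v4si b)
    {
      return __builtin_shuffle (a, b, (v4si) { 0, 4, 1, 5 });
    }

    v4si
    merge_low (v4si a, v4si b)
    {
      return __builtin_shuffle (a, b, (v4si) { 2, 6, 3, 7 });
    }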
--- gcc/config/s390/s390.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c index 3f4521e..8dc805f 100644 --- a/gcc/config/s390/s390.c +++ b/gcc/config/s390/s390.c @@ -16926,6 +16926,107 @@ s390_md_asm_adjust (vec &outputs, vec &inputs, return after_md_seq; } +#define MAX_VECT_LEN 16 + +struct expand_vec_perm_d +{ + rtx target, op0, op1; + unsigned char perm[MAX_VECT_LEN]; + machine_mode vmode; + unsigned char nelt; + bool testing_p; +}; + +/* Try to expand the vector permute operation described by D using the + vector merge instructions vml and vmh. Return true if vector merge + could be used. */ +static bool +expand_perm_with_merge (const struct expand_vec_perm_d &d) +{ + bool merge_lo_p = true; + bool merge_hi_p = true; + + if (d.nelt % 2) + return false; + + // For V4SI this checks for: { 0, 4, 1, 5 } + for (int telt = 0; telt < d.nelt; telt++) + if (d.perm[telt] != telt / 2 + (telt % 2) * d.nelt) + { + merge_hi_p = false; + break; + } + + if (!merge_hi_p) + { + // For V4SI this checks for: { 2, 6, 3, 7 } + for (int telt = 0; telt < d.nelt; telt++) + if (d.perm[telt] != (telt + d.nelt) / 2 + (telt % 2) * d.nelt) + { + merge_lo_p = false; + break; + } + } + else + merge_lo_p = false; + + if (d.testing_p) + return merge_lo_p || merge_hi_p; + + if (merge_lo_p || merge_hi_p) + s390_expand_merge (d.target, d.op0, d.op1, merge_hi_p); + + return merge_lo_p || merge_hi_p; +} + +/* Try to find the best sequence for the vector permute operation + described by D. Return true if the operation could be + expanded. */ +static bool +vectorize_vec_perm_const_1 (const struct expand_vec_perm_d &d) +{ + if (expand_perm_with_merge (d)) + return true; + + return false; +} + +/* Return true if we can emit instructions for the constant + permutation vector in SEL. If OUTPUT, IN0, IN1 are non-null the + hook is supposed to emit the required INSNs. */ + +bool +s390_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, rtx op1, + const vec_perm_indices &sel) +{ + struct expand_vec_perm_d d; + unsigned int i, nelt; + + if (!s390_vector_mode_supported_p (vmode) || GET_MODE_SIZE (vmode) != 16) + return false; + + d.target = target; + d.op0 = op0; + d.op1 = op1; + + d.vmode = vmode; + gcc_assert (VECTOR_MODE_P (d.vmode)); + d.nelt = nelt = GET_MODE_NUNITS (d.vmode); + d.testing_p = target == NULL_RTX; + + gcc_assert (target == NULL_RTX || REG_P (target)); + gcc_assert (sel.length () == nelt); + + for (i = 0; i < nelt; i++) + { + unsigned char e = sel[i]; + gcc_assert (e < 2 * nelt); + d.perm[i] = e; + } + + return vectorize_vec_perm_const_1 (d); +} + /* Initialize GCC target structure. */ #undef TARGET_ASM_ALIGNED_HI_OP @@ -17236,6 +17337,10 @@ s390_md_asm_adjust (vec &outputs, vec &inputs, #undef TARGET_MD_ASM_ADJUST #define TARGET_MD_ASM_ADJUST s390_md_asm_adjust +#undef TARGET_VECTORIZE_VEC_PERM_CONST +#define TARGET_VECTORIZE_VEC_PERM_CONST s390_vectorize_vec_perm_const + + struct gcc_target targetm = TARGET_INITIALIZER; #include "gt-s390.h" -- cgit v1.1 From 361da782a25031c6ae3967bf8c10a8119845255c Mon Sep 17 00:00:00 2001 From: Andreas Krebbel Date: Wed, 4 Aug 2021 18:40:11 +0200 Subject: IBM Z: Implement TARGET_VECTORIZE_VEC_PERM_CONST for vpdi This patch makes use of the vector permute double immediate instruction for constant permute vectors. gcc/ChangeLog: * config/s390/s390.c (expand_perm_with_vpdi): New function. 
(vectorize_vec_perm_const_1): Call expand_perm_with_vpdi. * config/s390/vector.md (*vpdi1, @vpdi1): Enable a parameterized expander. (*vpdi4, @vpdi4): Likewise. gcc/testsuite/ChangeLog: * gcc.target/s390/vector/perm-vpdi.c: New test. --- gcc/config/s390/s390.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ gcc/config/s390/vector.md | 5 ++--- 2 files changed, 49 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c index 8dc805f..673a134 100644 --- a/gcc/config/s390/s390.c +++ b/gcc/config/s390/s390.c @@ -16979,6 +16979,50 @@ expand_perm_with_merge (const struct expand_vec_perm_d &d) return merge_lo_p || merge_hi_p; } +/* Try to expand the vector permute operation described by D using the + vector permute doubleword immediate instruction vpdi. Return true + if vpdi could be used. + + VPDI allows 4 different immediate values (0, 1, 4, 5). The 0 and 5 + cases are covered by vmrhg and vmrlg already. So we only care + about the 1, 4 cases here. + 1 - First element of src1 and second of src2 + 4 - Second element of src1 and first of src2 */ +static bool +expand_perm_with_vpdi (const struct expand_vec_perm_d &d) +{ + bool vpdi1_p = false; + bool vpdi4_p = false; + rtx op0_reg, op1_reg; + + // Only V2DI and V2DF are supported here. + if (d.nelt != 2) + return false; + + if (d.perm[0] == 0 && d.perm[1] == 3) + vpdi1_p = true; + + if (d.perm[0] == 1 && d.perm[1] == 2) + vpdi4_p = true; + + if (!vpdi1_p && !vpdi4_p) + return false; + + if (d.testing_p) + return true; + + op0_reg = force_reg (GET_MODE (d.op0), d.op0); + op1_reg = force_reg (GET_MODE (d.op1), d.op1); + + if (vpdi1_p) + emit_insn (gen_vpdi1 (d.vmode, d.target, op0_reg, op1_reg)); + + if (vpdi4_p) + emit_insn (gen_vpdi4 (d.vmode, d.target, op0_reg, op1_reg)); + + return true; +} + /* Try to find the best sequence for the vector permute operation described by D. Return true if the operation could be expanded. */ @@ -16988,6 +17032,9 @@ vectorize_vec_perm_const_1 (const struct expand_vec_perm_d &d) if (expand_perm_with_merge (d)) return true; + if (expand_perm_with_vpdi (d)) + return true; + return false; } diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md index d224165..70274a6 100644 --- a/gcc/config/s390/vector.md +++ b/gcc/config/s390/vector.md @@ -768,7 +768,7 @@ ; First DW of op1 and second DW of op2 -(define_insn "*vpdi1" +(define_insn "@vpdi1" [(set (match_operand:V_HW_2 0 "register_operand" "=v") (vec_select:V_HW_2 (vec_concat: @@ -780,7 +780,7 @@ [(set_attr "op_type" "VRR")]) ; Second DW of op1 and first of op2 -(define_insn "*vpdi4" +(define_insn "@vpdi4" [(set (match_operand:V_HW_2 0 "register_operand" "=v") (vec_select:V_HW_2 (vec_concat: @@ -926,7 +926,6 @@ operands[5] = simplify_gen_subreg (DFmode, operands[1], TFmode, 8); }) -; vec_perm_const for V2DI using vpdi? ;; ;; Vector integer arithmetic instructions -- cgit v1.1 From 09dba016db937e61be21ef1e9581065a9ed2847d Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Wed, 4 Aug 2021 06:15:04 -0700 Subject: x86: Avoid stack realignment when copying data with SSE register To avoid stack realignment, call ix86_gen_scratch_sse_rtx to get a scratch SSE register to copy data with with SSE register from one memory location to another. gcc/ PR target/101772 * config/i386/i386-expand.c (ix86_expand_vector_move): Call ix86_gen_scratch_sse_rtx to get a scratch SSE register to copy data with SSE register from one memory location to another. 
gcc/testsuite/ PR target/101772 * gcc.target/i386/eh_return-2.c: New test. --- gcc/config/i386/i386-expand.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 1d469bf..bd21efa 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -613,7 +613,11 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[]) arguments in memory. */ if (!register_operand (op0, mode) && !register_operand (op1, mode)) - op1 = force_reg (mode, op1); + { + rtx scratch = ix86_gen_scratch_sse_rtx (mode); + emit_move_insn (scratch, op1); + op1 = scratch; + } tmp[0] = op0; tmp[1] = op1; ix86_expand_vector_move_misalign (mode, tmp); -- cgit v1.1 From 5738a64f8b3cf132b88b39af84b9f5f5a9a1554c Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Tue, 3 Aug 2021 06:17:22 -0700 Subject: x86: Update STORE_MAX_PIECES Update STORE_MAX_PIECES to allow 16/32/64 bytes only if inter-unit move is enabled since vec_duplicate enabled by inter-unit move is used to implement store_by_pieces of 16/32/64 bytes. gcc/ PR target/101742 * config/i386/i386.h (STORE_MAX_PIECES): Allow 16/32/64 bytes only if TARGET_INTER_UNIT_MOVES_TO_VEC is true. gcc/testsuite/ PR target/101742 * gcc.target/i386/pr101742a.c: New test. * gcc.target/i386/pr101742b.c: Likewise. --- gcc/config/i386/i386.h | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index bed9cd9..21fe51b 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -1780,18 +1780,22 @@ typedef struct ix86_args { && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \ ? 16 : UNITS_PER_WORD))) -/* STORE_MAX_PIECES is the number of bytes at a time that we can - store efficiently. */ +/* STORE_MAX_PIECES is the number of bytes at a time that we can store + efficiently. Allow 16/32/64 bytes only if inter-unit move is enabled + since vec_duplicate enabled by inter-unit move is used to implement + store_by_pieces of 16/32/64 bytes. */ #define STORE_MAX_PIECES \ - ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \ - ? 64 \ - : ((TARGET_AVX \ - && !TARGET_PREFER_AVX128 \ - && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \ - ? 32 \ - : ((TARGET_SSE2 \ - && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \ - ? 16 : UNITS_PER_WORD))) + (TARGET_INTER_UNIT_MOVES_TO_VEC \ + ? ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \ + ? 64 \ + : ((TARGET_AVX \ + && !TARGET_PREFER_AVX128 \ + && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \ + ? 32 \ + : ((TARGET_SSE2 \ + && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \ + ? 16 : UNITS_PER_WORD))) \ + : UNITS_PER_WORD) /* If a memory-to-memory move would take MOVE_RATIO or more simple move-instruction pairs, we will do a cpymem or libcall instead. -- cgit v1.1 From 9a8c3fc2b2cc6d73b2e3006625fca2b588ebc1b0 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Wed, 4 Aug 2021 16:03:58 +0800 Subject: Support cond_{smax,smin,umax,umin} for vector integer modes under AVX512. gcc/ChangeLog: * config/i386/sse.md (cond_): New expander. gcc/testsuite/ChangeLog: * gcc.target/i386/cond_op_maxmin_b-1.c: New test. * gcc.target/i386/cond_op_maxmin_b-2.c: New test. * gcc.target/i386/cond_op_maxmin_d-1.c: New test. * gcc.target/i386/cond_op_maxmin_d-2.c: New test. * gcc.target/i386/cond_op_maxmin_q-1.c: New test. * gcc.target/i386/cond_op_maxmin_q-2.c: New test. * gcc.target/i386/cond_op_maxmin_ub-1.c: New test. * gcc.target/i386/cond_op_maxmin_ub-2.c: New test. 
* gcc.target/i386/cond_op_maxmin_ud-1.c: New test. * gcc.target/i386/cond_op_maxmin_ud-2.c: New test. * gcc.target/i386/cond_op_maxmin_uq-1.c: New test. * gcc.target/i386/cond_op_maxmin_uq-2.c: New test. * gcc.target/i386/cond_op_maxmin_uw-1.c: New test. * gcc.target/i386/cond_op_maxmin_uw-2.c: New test. * gcc.target/i386/cond_op_maxmin_w-1.c: New test. * gcc.target/i386/cond_op_maxmin_w-2.c: New test. --- gcc/config/i386/sse.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index f5968e0..6035411 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -13070,6 +13070,24 @@ (set_attr "prefix" "vex") (set_attr "mode" "OI")]) +(define_expand "cond_" + [(set (match_operand:VI1248_AVX512VLBW 0 "register_operand") + (vec_merge:VI1248_AVX512VLBW + (maxmin:VI1248_AVX512VLBW + (match_operand:VI1248_AVX512VLBW 2 "nonimmediate_operand") + (match_operand:VI1248_AVX512VLBW 3 "nonimmediate_operand")) + (match_operand:VI1248_AVX512VLBW 4 "nonimm_or_0_operand") + (match_operand: 1 "register_operand")))] + "TARGET_AVX512F" +{ + emit_insn (gen_3_mask (operands[0], + operands[2], + operands[3], + operands[4], + operands[1])); + DONE; +}) + (define_expand "3_mask" [(set (match_operand:VI48_AVX512VL 0 "register_operand") (vec_merge:VI48_AVX512VL -- cgit v1.1 From f7aa81892eb54bc040ee6f7fd6134d800a5ee89c Mon Sep 17 00:00:00 2001 From: liuhongt Date: Wed, 4 Aug 2021 18:15:43 +0800 Subject: Support cond_{smax,smin} for vector float/double modes under AVX512. gcc/ChangeLog: * config/i386/sse.md (cond_): New expander. gcc/testsuite/ChangeLog: * gcc.target/i386/cond_op_maxmin_double-1.c: New test. * gcc.target/i386/cond_op_maxmin_double-2.c: New test. * gcc.target/i386/cond_op_maxmin_float-1.c: New test. * gcc.target/i386/cond_op_maxmin_float-2.c: New test. --- gcc/config/i386/sse.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 6035411..51733a3 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -2376,6 +2376,24 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "SF")]) +(define_expand "cond_" + [(set (match_operand:VF 0 "register_operand") + (vec_merge:VF + (smaxmin:VF + (match_operand:VF 2 "vector_operand") + (match_operand:VF 3 "vector_operand")) + (match_operand:VF 4 "nonimm_or_0_operand") + (match_operand: 1 "register_operand")))] + " == 64 || TARGET_AVX512VL" +{ + emit_insn (gen_3_mask (operands[0], + operands[2], + operands[3], + operands[4], + operands[1])); + DONE; +}) + (define_expand "3" [(set (match_operand:VF 0 "register_operand") (smaxmin:VF -- cgit v1.1 From c16f21c7cf97ce48967e42d3b5d22ea169a9c2c8 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Wed, 4 Aug 2021 18:43:22 +0800 Subject: Support cond_{xor,ior,and} for vector integer mode under AVX512. gcc/ChangeLog: * config/i386/sse.md (cond_): New expander. gcc/testsuite/ChangeLog: * gcc.target/i386/cond_op_anylogic_d-1.c: New test. * gcc.target/i386/cond_op_anylogic_d-2.c: New test. * gcc.target/i386/cond_op_anylogic_q-1.c: New test. * gcc.target/i386/cond_op_anylogic_q-2.c: New test. 
--- gcc/config/i386/sse.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 51733a3..a46a237 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -14063,6 +14063,24 @@ DONE; }) +(define_expand "cond_" + [(set (match_operand:VI48_AVX512VL 0 "register_operand") + (vec_merge:VI48_AVX512VL + (any_logic:VI48_AVX512VL + (match_operand:VI48_AVX512VL 2 "vector_operand") + (match_operand:VI48_AVX512VL 3 "vector_operand")) + (match_operand:VI48_AVX512VL 4 "nonimm_or_0_operand") + (match_operand: 1 "register_operand")))] + "TARGET_AVX512F" +{ + emit_insn (gen_3_mask (operands[0], + operands[2], + operands[3], + operands[4], + operands[1])); + DONE; +}) + (define_insn "3" [(set (match_operand:VI48_AVX_AVX512F 0 "register_operand" "=x,x,v") (any_logic:VI48_AVX_AVX512F -- cgit v1.1 From 8cd27a3b25558e5be7f8595fc1c828bc46641671 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Wed, 28 Jul 2021 15:49:29 +0100 Subject: aarch64: Don't include vec_select high-half in SIMD add cost The Neon add-long/add-widen instructions can select the top or bottom half of the operand registers. This selection does not change the cost of the underlying instruction and this should be reflected by the RTL cost function. This patch adds RTL tree traversal in the Neon add cost function to match vec_select high-half of its operands. This traversal prevents the cost of the vec_select from being added into the cost of the subtract - meaning that these instructions can now be emitted in the combine pass as they are no longer deemed prohibitively expensive. gcc/ChangeLog: 2021-07-28 Jonathan Wright * config/aarch64/aarch64.c: Traverse RTL tree to prevent cost of vec_select high-half from being added into Neon add cost. gcc/testsuite/ChangeLog: * gcc.target/aarch64/vaddX_high_cost.c: New test. --- gcc/config/aarch64/aarch64.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index e02cbcb..aa687c5 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -13161,6 +13161,21 @@ cost_minus: op1 = XEXP (x, 1); cost_plus: + if (VECTOR_MODE_P (mode)) + { + /* ADDL2 and ADDW2. */ + unsigned int vec_flags = aarch64_classify_vector_mode (mode); + if (vec_flags & VEC_ADVSIMD) + { + /* The select-operand-high-half versions of the add instruction + have the same cost as the regular three vector version - + don't add the costs of the select into the costs of the add. + */ + op0 = aarch64_strip_extend_vec_half (op0); + op1 = aarch64_strip_extend_vec_half (op1); + } + } + if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE) { -- cgit v1.1 From 0c3aab7f2a394b69a0cfd4852e33f11d9eb7e737 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Wed, 28 Jul 2021 17:45:36 +0100 Subject: aarch64: Don't include vec_select high-half in SIMD subtract cost The Neon subtract-long/subract-widen instructions can select the top or bottom half of the operand registers. This selection does not change the cost of the underlying instruction and this should be reflected by the RTL cost function. This patch adds RTL tree traversal in the Neon subtract cost function to match vec_select high-half of its operands. 
This traversal prevents the cost of the vec_select from being added into the cost of the subtract - meaning that these instructions can now be emitted in the combine pass as they are no longer deemed prohibitively expensive. gcc/ChangeLog: 2021-07-28 Jonathan Wright * config/aarch64/aarch64.c: Traverse RTL tree to prevent cost of vec_select high-half from being added into Neon subtract cost. gcc/testsuite/ChangeLog: * gcc.target/aarch64/vsubX_high_cost.c: New test. --- gcc/config/aarch64/aarch64.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index aa687c5..30f8365 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -13089,6 +13089,21 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED, op1 = XEXP (x, 1); cost_minus: + if (VECTOR_MODE_P (mode)) + { + /* SUBL2 and SUBW2. */ + unsigned int vec_flags = aarch64_classify_vector_mode (mode); + if (vec_flags & VEC_ADVSIMD) + { + /* The select-operand-high-half versions of the sub instruction + have the same cost as the regular three vector version - + don't add the costs of the select into the costs of the sub. + */ + op0 = aarch64_strip_extend_vec_half (op0); + op1 = aarch64_strip_extend_vec_half (op1); + } + } + *cost += rtx_cost (op0, mode, MINUS, 0, speed); /* Detect valid immediates. */ -- cgit v1.1 From 62e66c6a6cc52dc0e014141d369cff52757cd7ae Mon Sep 17 00:00:00 2001 From: Richard Earnshaw Date: Mon, 26 Jul 2021 17:07:14 +0100 Subject: arm: ensure the arch_name is always set for the build target This should never happen now if GCC is invoked by the driver, but in the unusual case of calling cc1 (or its ilk) directly from the command line the build target's arch_name string can remain NULL. This can complicate later processing meaning that we need to check for this case explicitly in some circumstances. Nothing should rely on this behaviour, so it's simpler to always set the arch_name when configuring the build target and be done with it. gcc: * config/arm/arm.c (arm_configure_build_target): Ensure the target's arch_name is always set. --- gcc/config/arm/arm.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 6d781e2..b2dd58d 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -3432,6 +3432,8 @@ arm_configure_build_target (struct arm_build_target *target, const cpu_tune *tune_data = &all_tunes[arm_selected_tune - all_cores]; /* Finish initializing the target structure. */ + if (!target->arch_name) + target->arch_name = arm_selected_arch->common.name; target->arch_pp_name = arm_selected_arch->arch; target->base_arch = arm_selected_arch->base_arch; target->profile = arm_selected_arch->profile; -- cgit v1.1 From 6a37d0331c25f23628d4308e5a75624005c223b2 Mon Sep 17 00:00:00 2001 From: Richard Earnshaw Date: Tue, 27 Jul 2021 15:44:57 +0100 Subject: arm: Don't reconfigure globals in arm_configure_build_target arm_configure_build_target is usually used to reconfigure the arm_active_target structure, which is then used to reconfigure a number of other global variables describing the current target. Occasionally, however, we need to use arm_configure_build_target to construct a temporary target structure and in that case it is wrong to try to reconfigure the global variables (although probably harmless, since arm_option_reconfigure_globals() only looks at arm_active_target). 
At the very least, however, this is wasted work, so it is best not to do it unless needed. What's more, several callers of arm_configure_build_target call arm_option_reconfigure_globals themselves within a few lines, making the call from within arm_configure_build_target completely redundant. So this patch moves the responsibility for calling arm_option_reconfigure_globals to the callers of arm_configure_build_target (only two places needed updating). gcc: * config/arm/arm.c (arm_configure_build_target): Don't call arm_option_reconfigure_globals. (arm_option_restore): Call arm_option_reconfigure_globals after reconfiguring the target. * config/arm/arm-c.c (arm_pragma_target_parse): Likewise. --- gcc/config/arm/arm-c.c | 1 + gcc/config/arm/arm.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm-c.c b/gcc/config/arm/arm-c.c index ae2139c..cc7901b 100644 --- a/gcc/config/arm/arm-c.c +++ b/gcc/config/arm/arm-c.c @@ -409,6 +409,7 @@ arm_pragma_target_parse (tree args, tree pop_target) target_option_current_node = cur_tree; arm_configure_build_target (&arm_active_target, TREE_TARGET_OPTION (cur_tree), false); + arm_option_reconfigure_globals (); } /* Update macros if target_node changes. The global state will be restored diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index b2dd58d..273202a 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -3058,6 +3058,7 @@ arm_option_restore (struct gcc_options */* opts */, struct cl_target_option *ptr) { arm_configure_build_target (&arm_active_target, ptr, false); + arm_option_reconfigure_globals (); } /* Reset options between modes that the user has specified. */ @@ -3441,7 +3442,6 @@ arm_configure_build_target (struct arm_build_target *target, target->tune_flags = tune_data->tune_flags; target->tune = tune_data->tune; target->tune_core = tune_data->scheduler; - arm_option_reconfigure_globals (); } /* Fix up any incompatible options that the user has specified. */ -- cgit v1.1 From c1cdabe3aab817d95a8db00a8b5e9f6bcdea936f Mon Sep 17 00:00:00 2001 From: Richard Earnshaw Date: Thu, 29 Jul 2021 11:00:31 +0100 Subject: arm: reorder assembler architecture directives [PR101723] A change to the way gas interprets the .fpu directive in binutils-2.34 means that issuing .fpu will clear any features set by .arch_extension that apply to the floating point or simd units. This unfortunately causes problems for more recent versions of the architecture because we currently emit .arch, .arch_extension and .fpu directives at different times and try to suppress redundant changes. This change addresses this by firstly unifying all the places where we emit these directives to a single block of code and secondly (re)emitting all the directives if any changes have been made to the target options. Whilst this is slightly more than the strict minimum it should be enough to catch all cases where a change could have happened. The new code also emits the directives in the order: .arch, .fpu, .arch_extension. This ensures that the additional architectural extensions are not removed by a later .fpu directive. Whilst writing this patch I also noticed that in the corner case where the last function to be compiled had a non-standard set of architecture flags, the assembler would add an incorrect set of derived attributes for the file as a whole. Instead of reflecting the command-line options it would reflect the flags from the last function in the file.
To address this I've also added a call to re-emit the flags from the asm_file_end callback so the assembler will be in the correct state when it finishes processing the input. There's some slight churn to the testsuite as a consequence of this, because previously we had a hack to suppress emitting a .fpu directive for one specific case, but with the new order this is no longer necessary. gcc/ChangeLog: PR target/101723 * config/arm/arm-cpus.in (generic-armv7-a): Add quirk to suppress writing .cpu directive in asm output. * config/arm/arm.c (arm_identify_fpu_from_isa): New variable. (arm_last_printed_arch_string): Delete. (arm_last_printed_fpu_string): Delete. (arm_configure_build_target): If use of floating-point/SIMD is disabled, remove all fp/simd related features from the target ISA. (last_arm_targ_options): New variable. (arm_print_asm_arch_directives): Add new parameters. Change order of emitted directives and handle all cases here. (arm_file_start): Always call arm_print_asm_arch_directives, move all generation of .arch/.arch_extension here. (arm_file_end): Call arm_print_asm_arch. (arm_declare_function_name): Call arm_print_asm_arch_directives instead of printing .arch/.fpu directives directly. gcc/testsuite/ChangeLog: PR target/101723 * gcc.target/arm/cortex-m55-nofp-flag-hard.c: Update expected output. * gcc.target/arm/cortex-m55-nofp-flag-softfp.c: Likewise. * gcc.target/arm/cortex-m55-nofp-nomve-flag-softfp.c: Likewise. * gcc.target/arm/mve/intrinsics/mve_fpu1.c: Convert to dg-do assemble. Add a non-no-op function body. * gcc.target/arm/mve/intrinsics/mve_fpu2.c: Likewise. * gcc.target/arm/pr98636.c (dg-options): Add -mfloat-abi=softfp. * gcc.target/arm/attr-neon.c: Tighten scan-assembler tests. * gcc.target/arm/attr-neon2.c: Use -Ofast, convert test to use check-function-bodies. * gcc.target/arm/attr-neon3.c: Likewise. * gcc.target/arm/pr69245.c: Tighten scan-assembler match, but allow multiple instances. * gcc.target/arm/pragma_fpu_attribute.c: Likewise. * gcc.target/arm/pragma_fpu_attribute_2.c: Likewise. --- gcc/config/arm/arm-cpus.in | 1 + gcc/config/arm/arm.c | 186 +++++++++++++++++++-------------------------- 2 files changed, 78 insertions(+), 109 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm-cpus.in b/gcc/config/arm/arm-cpus.in index ab4b6ac..249995a 100644 --- a/gcc/config/arm/arm-cpus.in +++ b/gcc/config/arm/arm-cpus.in @@ -1080,6 +1080,7 @@ begin cpu generic-armv7-a cname genericv7a tune flags LDSCHED architecture armv7-a+fp + isa quirk_no_asmcpu option mp add mp option sec add sec option vfpv3-d16 add VFPv3 FP_DBL diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 273202a..11dafc7 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -79,10 +79,6 @@ typedef struct minipool_node Mnode; typedef struct minipool_fixup Mfix; -/* The last .arch and .fpu assembly strings that we printed. */ -static std::string arm_last_printed_arch_string; -static std::string arm_last_printed_fpu_string; - void (*arm_lang_output_object_attributes_hook)(void); struct four_ints @@ -334,6 +330,7 @@ static rtx_insn *thumb1_md_asm_adjust (vec &, vec &, vec &, vec &, vec &, HARD_REG_SET &, location_t); +static const char *arm_identify_fpu_from_isa (sbitmap); /* Table of machine attributes.
*/ static const struct attribute_spec arm_attribute_table[] = @@ -3411,6 +3408,11 @@ arm_configure_build_target (struct arm_build_target *target, bitmap_ior (target->isa, target->isa, fpu_bits); } + /* If we have the soft-float ABI, clear any feature bits relating to use of + floating-point operations. They'll just confuse things later on. */ + if (arm_float_abi == ARM_FLOAT_ABI_SOFT) + bitmap_and_compl (target->isa, target->isa, isa_all_fpbits); + /* There may be implied bits which we still need to enable. These are non-named features which are needed to complete other sets of features, but cannot be enabled from arm-cpus.in due to being shared between @@ -28096,20 +28098,65 @@ arm_print_tune_info (void) (int) current_tune->sched_autopref); } +/* The last set of target options used to emit .arch directives, etc. This + could be a function-local static if it were not required to expose it as a + root to the garbage collector. */ +static GTY(()) cl_target_option *last_asm_targ_options = NULL; + /* Print .arch and .arch_extension directives corresponding to the current architecture configuration. */ static void -arm_print_asm_arch_directives () +arm_print_asm_arch_directives (FILE *stream, cl_target_option *targ_options) { + arm_build_target build_target; + /* If the target options haven't changed since the last time we were called + there is nothing to do. This should be sufficient to suppress the + majority of redundant work. */ + if (last_asm_targ_options == targ_options) + return; + + last_asm_targ_options = targ_options; + + build_target.isa = sbitmap_alloc (isa_num_bits); + arm_configure_build_target (&build_target, targ_options, false); + + if (build_target.core_name + && !bitmap_bit_p (build_target.isa, isa_bit_quirk_no_asmcpu)) + { + const char* truncated_name + = arm_rewrite_selected_cpu (build_target.core_name); + asm_fprintf (stream, "\t.cpu %s\n", truncated_name); + } + const arch_option *arch = arm_parse_arch_option_name (all_architectures, "-march", - arm_active_target.arch_name); + build_target.arch_name); auto_sbitmap opt_bits (isa_num_bits); gcc_assert (arch); - asm_fprintf (asm_out_file, "\t.arch %s\n", arm_active_target.arch_name); - arm_last_printed_arch_string = arm_active_target.arch_name; + if (strcmp (build_target.arch_name, "armv7ve") == 0) + { + /* Keep backward compatability for assemblers which don't support + armv7ve. Fortunately, none of the following extensions are reset + by a .fpu directive. */ + asm_fprintf (stream, "\t.arch armv7-a\n"); + asm_fprintf (stream, "\t.arch_extension virt\n"); + asm_fprintf (stream, "\t.arch_extension idiv\n"); + asm_fprintf (stream, "\t.arch_extension sec\n"); + asm_fprintf (stream, "\t.arch_extension mp\n"); + } + else + asm_fprintf (stream, "\t.arch %s\n", build_target.arch_name); + + /* The .fpu directive will reset any architecture extensions from the + assembler that relate to the fp/vector extensions. So put this out before + any .arch_extension directives. */ + const char *fpu_name = (TARGET_SOFT_FLOAT + ? "softvfp" + : arm_identify_fpu_from_isa (build_target.isa)); + asm_fprintf (stream, "\t.fpu %s\n", fpu_name); + if (!arch->common.extensions) return; @@ -28135,13 +28182,12 @@ arm_print_asm_arch_directives () && !TARGET_HAVE_MVE_FLOAT)) continue; - /* If every feature bit of this option is set in the target - ISA specification, print out the option name. However, - don't print anything if all the bits are part of the - FPU specification. 
*/ - if (bitmap_subset_p (opt_bits, arm_active_target.isa) + /* If every feature bit of this option is set in the target ISA + specification, print out the option name. However, don't print + anything if all the bits are part of the FPU specification. */ + if (bitmap_subset_p (opt_bits, build_target.isa) && !bitmap_subset_p (opt_bits, isa_all_fpubits_internal)) - asm_fprintf (asm_out_file, "\t.arch_extension %s\n", opt->name); + asm_fprintf (stream, "\t.arch_extension %s\n", opt->name); } } } @@ -28151,46 +28197,23 @@ arm_file_start (void) { int val; + arm_print_asm_arch_directives + (asm_out_file, TREE_TARGET_OPTION (target_option_default_node)); + if (TARGET_BPABI) { - /* We don't have a specified CPU. Use the architecture to - generate the tags. - - Note: it might be better to do this unconditionally, then the - assembler would not need to know about all new CPU names as - they are added. */ - if (!arm_active_target.core_name) - { - /* armv7ve doesn't support any extensions. */ - if (strcmp (arm_active_target.arch_name, "armv7ve") == 0) - { - /* Keep backward compatability for assemblers - which don't support armv7ve. */ - asm_fprintf (asm_out_file, "\t.arch armv7-a\n"); - asm_fprintf (asm_out_file, "\t.arch_extension virt\n"); - asm_fprintf (asm_out_file, "\t.arch_extension idiv\n"); - asm_fprintf (asm_out_file, "\t.arch_extension sec\n"); - asm_fprintf (asm_out_file, "\t.arch_extension mp\n"); - arm_last_printed_arch_string = "armv7ve"; - } - else - arm_print_asm_arch_directives (); - } - else if (startswith (arm_active_target.core_name, "generic")) - { - asm_fprintf (asm_out_file, "\t.arch %s\n", - arm_active_target.core_name + 8); - arm_last_printed_arch_string = arm_active_target.core_name + 8; - } - else + /* If we have a named cpu, but we the assembler does not support that + name via .cpu, put out a cpu name attribute; but don't do this if the + name starts with the fictitious prefix, 'generic'. */ + if (arm_active_target.core_name + && bitmap_bit_p (arm_active_target.isa, isa_bit_quirk_no_asmcpu) + && !startswith (arm_active_target.core_name, "generic")) { const char* truncated_name = arm_rewrite_selected_cpu (arm_active_target.core_name); if (bitmap_bit_p (arm_active_target.isa, isa_bit_quirk_no_asmcpu)) asm_fprintf (asm_out_file, "\t.eabi_attribute 5, \"%s\"\n", truncated_name); - else - asm_fprintf (asm_out_file, "\t.cpu %s\n", truncated_name); } if (print_tune_info) @@ -28255,6 +28278,13 @@ arm_file_end (void) { int regno; + /* Just in case the last function output in the assembler had non-default + architecture directives, we force the assembler state back to the default + set, so that any 'calculated' build attributes are based on the default + options rather than the special options for that function. */ + arm_print_asm_arch_directives + (asm_out_file, TREE_TARGET_OPTION (target_option_default_node)); + if (NEED_INDICATE_EXEC_STACK) /* Add .note.GNU-stack. */ file_end_indicate_exec_stack (); @@ -33265,58 +33295,7 @@ arm_declare_function_name (FILE *stream, const char *name, tree decl) targ_options = TREE_TARGET_OPTION (target_option_current_node); gcc_assert (targ_options); - /* Only update the assembler .arch string if it is distinct from the last - such string we printed. arch_to_print is set conditionally in case - targ_options->x_arm_arch_string is NULL which can be the case - when cc1 is invoked directly without passing -march option. 
*/ - std::string arch_to_print; - if (targ_options->x_arm_arch_string) - arch_to_print = targ_options->x_arm_arch_string; - - if (arch_to_print != arm_last_printed_arch_string) - { - std::string arch_name - = arch_to_print.substr (0, arch_to_print.find ("+")); - asm_fprintf (asm_out_file, "\t.arch %s\n", arch_name.c_str ()); - const arch_option *arch - = arm_parse_arch_option_name (all_architectures, "-march", - targ_options->x_arm_arch_string); - auto_sbitmap opt_bits (isa_num_bits); - - gcc_assert (arch); - if (arch->common.extensions) - { - for (const struct cpu_arch_extension *opt = arch->common.extensions; - opt->name != NULL; - opt++) - { - if (!opt->remove) - { - arm_initialize_isa (opt_bits, opt->isa_bits); - /* For the cases "-march=armv8.1-m.main+mve -mfloat-abi=soft" - and "-march=armv8.1-m.main+mve.fp -mfloat-abi=soft" MVE and - MVE with floating point instructions is disabled. So the - following check restricts the printing of ".arch_extension - mve" and ".arch_extension fp" (for mve.fp) in the assembly - file. MVE needs this special behaviour because the - feature bit "mve" and "mve_float" are not part of - "fpu bits", so they are not cleared when -mfloat-abi=soft - (i.e nofp) but the marco TARGET_HAVE_MVE and - TARGET_HAVE_MVE_FLOAT are disabled. */ - if ((bitmap_bit_p (opt_bits, isa_bit_mve) && !TARGET_HAVE_MVE) - || (bitmap_bit_p (opt_bits, isa_bit_mve_float) - && !TARGET_HAVE_MVE_FLOAT)) - continue; - if (bitmap_subset_p (opt_bits, arm_active_target.isa) - && !bitmap_subset_p (opt_bits, isa_all_fpubits_internal)) - asm_fprintf (asm_out_file, "\t.arch_extension %s\n", - opt->name); - } - } - } - - arm_last_printed_arch_string = arch_to_print; - } + arm_print_asm_arch_directives (stream, targ_options); fprintf (stream, "\t.syntax unified\n"); @@ -33334,17 +33313,6 @@ arm_declare_function_name (FILE *stream, const char *name, tree decl) else fprintf (stream, "\t.arm\n"); - std::string fpu_to_print - = TARGET_SOFT_FLOAT - ? "softvfp" : arm_identify_fpu_from_isa (arm_active_target.isa); - - if (!(!strcmp (fpu_to_print.c_str (), "softvfp") && TARGET_VFP_BASE) - && (fpu_to_print != arm_last_printed_arch_string)) - { - asm_fprintf (asm_out_file, "\t.fpu %s\n", fpu_to_print.c_str ()); - arm_last_printed_fpu_string = fpu_to_print; - } - if (TARGET_POKE_FUNCTION_NAME) arm_poke_function_name (stream, (const char *) name); } -- cgit v1.1 From 783d809f0bb13a9f50139d03c328f59f9e3840c7 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Thu, 5 Aug 2021 14:03:23 +0100 Subject: vect: Move costing helpers from aarch64 code aarch64.c has various routines to test for specific kinds of vector statement cost. The routines aren't really target-specific, so following a suggestion from Richi, this patch moves them to a new section of tree-vectorizer.h. gcc/ * tree-vectorizer.h (vect_is_store_elt_extraction, vect_is_reduction) (vect_reduc_type, vect_embedded_comparison_type, vect_comparison_type) (vect_is_extending_load, vect_is_integer_truncation): New functions, moved from aarch64.c but given different names. * config/aarch64/aarch64.c (aarch64_is_store_elt_extraction) (aarch64_is_reduction, aarch64_reduc_type) (aarch64_embedded_comparison_type, aarch64_comparison_type) (aarch64_extending_load_p, aarch64_integer_truncation_p): Delete in favor of the above. Update callers accordingly. 
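The tree-vectorizer.h side of this change falls outside the gcc/config filter of the diff below. As a rough sketch (not the committed text), and assuming the bodies are carried over from the deleted aarch64 routines unchanged apart from the rename, two of the new helpers would look like:

/* Return true if an operation of kind KIND for STMT_INFO represents
   the extraction of an element from a vector in preparation for
   storing the element to memory.  */
inline bool
vect_is_store_elt_extraction (vect_cost_for_stmt kind, stmt_vec_info stmt_info)
{
  return (kind == vec_to_scalar
          && STMT_VINFO_DATA_REF (stmt_info)
          && DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)));
}

/* Return true if STMT_INFO represents part of a reduction.  */
inline bool
vect_is_reduction (stmt_vec_info stmt_info)
{
  return (STMT_VINFO_REDUC_DEF (stmt_info)
          || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)));
}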
--- gcc/config/aarch64/aarch64.c | 125 +++++-------------------------------------- 1 file changed, 14 insertions(+), 111 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 30f8365..4cd4b03 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -14820,40 +14820,6 @@ aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, } } -/* Return true if an operaton of kind KIND for STMT_INFO represents - the extraction of an element from a vector in preparation for - storing the element to memory. */ -static bool -aarch64_is_store_elt_extraction (vect_cost_for_stmt kind, - stmt_vec_info stmt_info) -{ - return (kind == vec_to_scalar - && STMT_VINFO_DATA_REF (stmt_info) - && DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info))); -} - -/* Return true if STMT_INFO represents part of a reduction. */ -static bool -aarch64_is_reduction (stmt_vec_info stmt_info) -{ - return (STMT_VINFO_REDUC_DEF (stmt_info) - || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))); -} - -/* If STMT_INFO describes a reduction, return the type of reduction - it describes, otherwise return -1. */ -static int -aarch64_reduc_type (vec_info *vinfo, stmt_vec_info stmt_info) -{ - if (loop_vec_info loop_vinfo = dyn_cast (vinfo)) - if (STMT_VINFO_REDUC_DEF (stmt_info)) - { - stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info); - return int (STMT_VINFO_REDUC_TYPE (reduc_info)); - } - return -1; -} - /* Return true if an access of kind KIND for STMT_INFO represents one vector of an LD[234] or ST[234] operation. Return the total number of vectors (2, 3 or 4) if so, otherwise return a value outside that range. */ @@ -14874,32 +14840,6 @@ aarch64_ld234_st234_vectors (vect_cost_for_stmt kind, stmt_vec_info stmt_info) return 0; } -/* If STMT_INFO is a COND_EXPR that includes an embedded comparison, return the - scalar type of the values being compared. Return null otherwise. */ -static tree -aarch64_embedded_comparison_type (stmt_vec_info stmt_info) -{ - if (auto *assign = dyn_cast (stmt_info->stmt)) - if (gimple_assign_rhs_code (assign) == COND_EXPR) - { - tree cond = gimple_assign_rhs1 (assign); - if (COMPARISON_CLASS_P (cond)) - return TREE_TYPE (TREE_OPERAND (cond, 0)); - } - return NULL_TREE; -} - -/* If STMT_INFO is a comparison or contains an embedded comparison, return the - scalar type of the values being compared. Return null otherwise. */ -static tree -aarch64_comparison_type (stmt_vec_info stmt_info) -{ - if (auto *assign = dyn_cast (stmt_info->stmt)) - if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison) - return TREE_TYPE (gimple_assign_rhs1 (assign)); - return aarch64_embedded_comparison_type (stmt_info); -} - /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD vectors would produce a series of LDP or STP operations. KIND is the kind of statement that STMT_INFO represents. */ @@ -14926,43 +14866,6 @@ aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind, return is_gimple_assign (stmt_info->stmt); } -/* Return true if STMT_INFO extends the result of a load. 
*/ -static bool -aarch64_extending_load_p (class vec_info *vinfo, stmt_vec_info stmt_info) -{ - gassign *assign = dyn_cast (stmt_info->stmt); - if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign))) - return false; - - tree rhs = gimple_assign_rhs1 (stmt_info->stmt); - tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign)); - tree rhs_type = TREE_TYPE (rhs); - if (!INTEGRAL_TYPE_P (lhs_type) - || !INTEGRAL_TYPE_P (rhs_type) - || TYPE_PRECISION (lhs_type) <= TYPE_PRECISION (rhs_type)) - return false; - - stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs); - return (def_stmt_info - && STMT_VINFO_DATA_REF (def_stmt_info) - && DR_IS_READ (STMT_VINFO_DATA_REF (def_stmt_info))); -} - -/* Return true if STMT_INFO is an integer truncation. */ -static bool -aarch64_integer_truncation_p (stmt_vec_info stmt_info) -{ - gassign *assign = dyn_cast (stmt_info->stmt); - if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign))) - return false; - - tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign)); - tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign)); - return (INTEGRAL_TYPE_P (lhs_type) - && INTEGRAL_TYPE_P (rhs_type) - && TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type)); -} - /* Return true if STMT_INFO is the second part of a two-statement multiply-add or multiply-subtract sequence that might be suitable for fusing into a single instruction. If VEC_FLAGS is zero, analyze the operation as @@ -15065,7 +14968,7 @@ aarch64_sve_in_loop_reduction_latency (vec_info *vinfo, tree vectype, const sve_vec_cost *sve_costs) { - switch (aarch64_reduc_type (vinfo, stmt_info)) + switch (vect_reduc_type (vinfo, stmt_info)) { case EXTRACT_LAST_REDUCTION: return sve_costs->clast_cost; @@ -15156,7 +15059,7 @@ aarch64_detect_scalar_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind, { /* Detect an extension of a loaded value. In general, we'll be able to fuse the extension with the load. */ - if (kind == scalar_stmt && aarch64_extending_load_p (vinfo, stmt_info)) + if (kind == scalar_stmt && vect_is_extending_load (vinfo, stmt_info)) return 0; return stmt_cost; @@ -15188,7 +15091,7 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind, /* Detect cases in which vec_to_scalar is describing the extraction of a vector element in preparation for a scalar store. The store itself is costed separately. */ - if (aarch64_is_store_elt_extraction (kind, stmt_info)) + if (vect_is_store_elt_extraction (kind, stmt_info)) return simd_costs->store_elt_extra_cost; /* Detect SVE gather loads, which are costed as a single scalar_load @@ -15227,7 +15130,7 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind, instruction like FADDP or MAXV. */ if (kind == vec_to_scalar && where == vect_epilogue - && aarch64_is_reduction (stmt_info)) + && vect_is_reduction (stmt_info)) switch (GET_MODE_INNER (TYPE_MODE (vectype))) { case E_QImode: @@ -15277,12 +15180,12 @@ aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind, on the fly. Optimistically assume that a load followed by an extension will fold to this form during combine, and that the extension therefore comes for free. */ - if (kind == vector_stmt && aarch64_extending_load_p (vinfo, stmt_info)) + if (kind == vector_stmt && vect_is_extending_load (vinfo, stmt_info)) stmt_cost = 0; /* For similar reasons, vector_stmt integer truncations are a no-op, because we can just ignore the unused upper bits of the source. 
*/ - if (kind == vector_stmt && aarch64_integer_truncation_p (stmt_info)) + if (kind == vector_stmt && vect_is_integer_truncation (stmt_info)) stmt_cost = 0; /* Advanced SIMD can load and store pairs of registers using LDP and STP, @@ -15357,7 +15260,7 @@ aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info, } if (kind == vector_stmt || kind == vec_to_scalar) - if (tree cmp_type = aarch64_embedded_comparison_type (stmt_info)) + if (tree cmp_type = vect_embedded_comparison_type (stmt_info)) { if (FLOAT_TYPE_P (cmp_type)) stmt_cost += simd_costs->fp_stmt_cost; @@ -15367,7 +15270,7 @@ aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info, } if (kind == scalar_stmt) - if (tree cmp_type = aarch64_embedded_comparison_type (stmt_info)) + if (tree cmp_type = vect_embedded_comparison_type (stmt_info)) { if (FLOAT_TYPE_P (cmp_type)) stmt_cost += aarch64_tune_params.vec_costs->scalar_fp_stmt_cost; @@ -15417,12 +15320,12 @@ aarch64_count_ops (class vec_info *vinfo, aarch64_vector_costs *costs, /* Calculate the minimum cycles per iteration imposed by a reduction operation. */ if ((kind == vector_stmt || kind == vec_to_scalar) - && aarch64_is_reduction (stmt_info)) + && vect_is_reduction (stmt_info)) { unsigned int base = aarch64_in_loop_reduction_latency (vinfo, stmt_info, vectype, vec_flags); - if (aarch64_reduc_type (vinfo, stmt_info) == FOLD_LEFT_REDUCTION) + if (vect_reduc_type (vinfo, stmt_info) == FOLD_LEFT_REDUCTION) { if (aarch64_sve_mode_p (TYPE_MODE (vectype))) { @@ -15521,7 +15424,7 @@ aarch64_count_ops (class vec_info *vinfo, aarch64_vector_costs *costs, /* Add any embedded comparison operations. */ if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar) - && aarch64_embedded_comparison_type (stmt_info)) + && vect_embedded_comparison_type (stmt_info)) ops->general_ops += num_copies; /* Detect COND_REDUCTIONs and things that would need to become @@ -15530,7 +15433,7 @@ aarch64_count_ops (class vec_info *vinfo, aarch64_vector_costs *costs, have only accounted for one. */ if (vec_flags && (kind == vector_stmt || kind == vec_to_scalar)) { - int reduc_type = aarch64_reduc_type (vinfo, stmt_info); + int reduc_type = vect_reduc_type (vinfo, stmt_info); if ((reduc_type == EXTRACT_LAST_REDUCTION && (vec_flags & VEC_ADVSIMD)) || reduc_type == COND_REDUCTION) ops->general_ops += num_copies; @@ -15538,7 +15441,7 @@ aarch64_count_ops (class vec_info *vinfo, aarch64_vector_costs *costs, /* Count the predicate operations needed by an SVE comparison. */ if (sve_issue && (kind == vector_stmt || kind == vec_to_scalar)) - if (tree type = aarch64_comparison_type (stmt_info)) + if (tree type = vect_comparison_type (stmt_info)) { unsigned int base = (FLOAT_TYPE_P (type) ? sve_issue->fp_cmp_pred_ops @@ -15616,7 +15519,7 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count, /* If we scalarize a strided store, the vectorizer costs one vec_to_scalar for each element. However, we can store the first element using an FP store without a separate extract step. */ - if (aarch64_is_store_elt_extraction (kind, stmt_info)) + if (vect_is_store_elt_extraction (kind, stmt_info)) count -= 1; stmt_cost = aarch64_detect_scalar_stmt_subtype -- cgit v1.1 From 72264a639729a5dcc21dbee304717ce22b338bfd Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Sat, 17 Jul 2021 07:44:45 -0700 Subject: : Add pragma GCC target("general-regs-only") 1. Intrinsics in only require GPR ISAs. 
Add #if defined __MMX__ || defined __SSE__ #pragma GCC push_options #pragma GCC target("general-regs-only") #define __DISABLE_GENERAL_REGS_ONLY__ #endif and #ifdef __DISABLE_GENERAL_REGS_ONLY__ #undef __DISABLE_GENERAL_REGS_ONLY__ #pragma GCC pop_options #endif /* __DISABLE_GENERAL_REGS_ONLY__ */ to to disable non-GPR ISAs so that they can be used in functions with __attribute__ ((target("general-regs-only"))). 2. When checking always_inline attribute, if callee only uses GPRs, ignore MASK_80387 since enable MASK_80387 in caller has no impact on callee inline. gcc/ PR target/99744 * config/i386/i386.c (ix86_can_inline_p): Ignore MASK_80387 if callee only uses GPRs. * config/i386/ia32intrin.h: Revert commit 5463cee2770. * config/i386/serializeintrin.h: Revert commit 71958f740f1. * config/i386/x86gprintrin.h: Add #pragma GCC target("general-regs-only") and #pragma GCC pop_options to disable non-GPR ISAs. gcc/testsuite/ PR target/99744 * gcc.target/i386/pr99744-3.c: New test. * gcc.target/i386/pr99744-4.c: Likewise. * gcc.target/i386/pr99744-5.c: Likewise. * gcc.target/i386/pr99744-6.c: Likewise. * gcc.target/i386/pr99744-7.c: Likewise. * gcc.target/i386/pr99744-8.c: Likewise. --- gcc/config/i386/i386.c | 6 +++++- gcc/config/i386/ia32intrin.h | 14 ++++++++++++-- gcc/config/i386/serializeintrin.h | 7 ++++++- gcc/config/i386/x86gprintrin.h | 11 +++++++++++ 4 files changed, 34 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index ec06908..aea224a 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -554,7 +554,7 @@ ix86_can_inline_p (tree caller, tree callee) /* Changes of those flags can be tolerated for always inlines. Lets hope user knows what he is doing. */ - const unsigned HOST_WIDE_INT always_inline_safe_mask + unsigned HOST_WIDE_INT always_inline_safe_mask = (MASK_USE_8BIT_IDIV | MASK_ACCUMULATE_OUTGOING_ARGS | MASK_NO_ALIGN_STRINGOPS | MASK_AVX256_SPLIT_UNALIGNED_LOAD | MASK_AVX256_SPLIT_UNALIGNED_STORE | MASK_CLD @@ -579,6 +579,10 @@ ix86_can_inline_p (tree caller, tree callee) && lookup_attribute ("always_inline", DECL_ATTRIBUTES (callee))); + /* If callee only uses GPRs, ignore MASK_80387. */ + if (TARGET_GENERAL_REGS_ONLY_P (callee_opts->x_ix86_target_flags)) + always_inline_safe_mask |= MASK_80387; + cgraph_node *callee_node = cgraph_node::get (callee); /* Callee's isa options should be a subset of the caller's, i.e. 
a SSE4 function can inline a SSE2 function but a SSE2 function can't inline diff --git a/gcc/config/i386/ia32intrin.h b/gcc/config/i386/ia32intrin.h index 5422b0f..df99220 100644 --- a/gcc/config/i386/ia32intrin.h +++ b/gcc/config/i386/ia32intrin.h @@ -107,12 +107,22 @@ __rdpmc (int __S) #endif /* __iamcu__ */ /* rdtsc */ -#define __rdtsc() __builtin_ia32_rdtsc () +extern __inline unsigned long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__rdtsc (void) +{ + return __builtin_ia32_rdtsc (); +} #ifndef __iamcu__ /* rdtscp */ -#define __rdtscp(a) __builtin_ia32_rdtscp (a) +extern __inline unsigned long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__rdtscp (unsigned int *__A) +{ + return __builtin_ia32_rdtscp (__A); +} #endif /* __iamcu__ */ diff --git a/gcc/config/i386/serializeintrin.h b/gcc/config/i386/serializeintrin.h index e280250..89b5b94 100644 --- a/gcc/config/i386/serializeintrin.h +++ b/gcc/config/i386/serializeintrin.h @@ -34,7 +34,12 @@ #define __DISABLE_SERIALIZE__ #endif /* __SERIALIZE__ */ -#define _serialize() __builtin_ia32_serialize () +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_serialize (void) +{ + __builtin_ia32_serialize (); +} #ifdef __DISABLE_SERIALIZE__ #undef __DISABLE_SERIALIZE__ diff --git a/gcc/config/i386/x86gprintrin.h b/gcc/config/i386/x86gprintrin.h index 7793032..b7fefa7 100644 --- a/gcc/config/i386/x86gprintrin.h +++ b/gcc/config/i386/x86gprintrin.h @@ -24,6 +24,12 @@ #ifndef _X86GPRINTRIN_H_INCLUDED #define _X86GPRINTRIN_H_INCLUDED +#if defined __MMX__ || defined __SSE__ +#pragma GCC push_options +#pragma GCC target("general-regs-only") +#define __DISABLE_GENERAL_REGS_ONLY__ +#endif + #include #ifndef __iamcu__ @@ -255,4 +261,9 @@ _ptwrite32 (unsigned __B) #endif /* __iamcu__ */ +#ifdef __DISABLE_GENERAL_REGS_ONLY__ +#undef __DISABLE_GENERAL_REGS_ONLY__ +#pragma GCC pop_options +#endif /* __DISABLE_GENERAL_REGS_ONLY__ */ + #endif /* _X86GPRINTRIN_H_INCLUDED. */ -- cgit v1.1 From 318113a961220c8da79d8d29619138827ccc69f1 Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Tue, 1 Jun 2021 15:39:14 +0200 Subject: rs6000: Fix restored rs6000_long_double_type_size As mentioned in the "Fallout: save/restore target options in handle_optimize_attribute" thread, we need to support target option restore of rs6000_long_double_type_size == FLOAT_PRECISION_TFmode. gcc/ChangeLog: * config/rs6000/rs6000.c (rs6000_option_override_internal): When a target option is restored, it can have rs6000_long_double_type_size set to FLOAT_PRECISION_TFmode and error should not be emitted. gcc/testsuite/ChangeLog: * gcc.target/powerpc/pragma-optimize.c: New test. --- gcc/config/rs6000/rs6000.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 2de5a96..5b1c06b 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -4189,6 +4189,8 @@ rs6000_option_override_internal (bool global_init_p) else rs6000_long_double_type_size = default_long_double_size; } + else if (rs6000_long_double_type_size == FLOAT_PRECISION_TFmode) + ; /* The option value can be seen when cl_target_option_restore is called. 
*/ else if (rs6000_long_double_type_size == 128) rs6000_long_double_type_size = FLOAT_PRECISION_TFmode; else if (global_options_set.x_rs6000_ieeequad) -- cgit v1.1 From a6075926947be9bcbf7016bf4b29f549102ad91d Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Thu, 29 Jul 2021 12:24:17 +0100 Subject: aarch64: Use memcpy to copy structures in vst4[q]_lane intrinsics Use __builtin_memcpy to copy vector structures instead of using a union - or constructing a new opaque structure one vector at a time - in each of the vst4[q]_lane Neon intrinsics in arm_neon.h. Add new code generation tests to verify that superfluous move instructions are not generated for the vst4q_lane intrinsics. gcc/ChangeLog: 2021-07-29 Jonathan Wright * config/aarch64/arm_neon.h (__ST4_LANE_FUNC): Delete. (__ST4Q_LANE_FUNC): Delete. (vst4_lane_f16): Use __builtin_memcpy to copy vector structure instead of constructing __builtin_aarch64_simd_xi one vector at a time. (vst4_lane_f32): Likewise. (vst4_lane_f64): Likewise. (vst4_lane_p8): Likewise. (vst4_lane_p16): Likewise. (vst4_lane_p64): Likewise. (vst4_lane_s8): Likewise. (vst4_lane_s16): Likewise. (vst4_lane_s32): Likewise. (vst4_lane_s64): Likewise. (vst4_lane_u8): Likewise. (vst4_lane_u16): Likewise. (vst4_lane_u32): Likewise. (vst4_lane_u64): Likewise. (vst4_lane_bf16): Likewise. (vst4q_lane_f16): Use __builtin_memcpy to copy vector structure instead of using a union. (vst4q_lane_f32): Likewise. (vst4q_lane_f64): Likewise. (vst4q_lane_p8): Likewise. (vst4q_lane_p16): Likewise. (vst4q_lane_p64): Likewise. (vst4q_lane_s8): Likewise. (vst4q_lane_s16): Likewise. (vst4q_lane_s32): Likewise. (vst4q_lane_s64): Likewise. (vst4q_lane_u8): Likewise. (vst4q_lane_u16): Likewise. (vst4q_lane_u32): Likewise. (vst4q_lane_u64): Likewise. (vst4q_lane_bf16): Likewise. gcc/testsuite/ChangeLog: * gcc.target/aarch64/vector_structure_intrinsics.c: Add new tests. 
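As a usage sketch (illustrative code, not part of the patch; function and variable names are invented here), the intent is that the structure copy introduced by __builtin_memcpy folds away, so a caller such as the following compiles to a single ST4 lane store with no superfluous register moves at -O2:

#include <arm_neon.h>

/* Store lane 1 of each of the four input vectors to out[0..3]
   with one st4 (single-structure, single-lane) instruction.  */
void
store_lane1 (float32_t *out, float32x4x4_t val)
{
  vst4q_lane_f32 (out, val, 1);
}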
--- gcc/config/aarch64/arm_neon.h | 517 +++++++++++++++++++++++++++++++++++------- 1 file changed, 429 insertions(+), 88 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 313b35f..6999b81 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -9369,94 +9369,411 @@ __ST3Q_LANE_FUNC (uint16x8x3_t, uint16_t, v8hi, hi, u16) __ST3Q_LANE_FUNC (uint32x4x3_t, uint32_t, v4si, si, u32) __ST3Q_LANE_FUNC (uint64x2x3_t, uint64_t, v2di, di, u64) -#define __ST4_LANE_FUNC(intype, largetype, ptrtype, mode, \ - qmode, ptr_mode, funcsuffix, signedtype) \ -__extension__ extern __inline void \ -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ -vst4_lane_ ## funcsuffix (ptrtype *__ptr, \ - intype __b, const int __c) \ -{ \ - __builtin_aarch64_simd_xi __o; \ - largetype __temp; \ - __temp.val[0] \ - = vcombine_##funcsuffix (__b.val[0], \ - vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ - __temp.val[1] \ - = vcombine_##funcsuffix (__b.val[1], \ - vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ - __temp.val[2] \ - = vcombine_##funcsuffix (__b.val[2], \ - vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ - __temp.val[3] \ - = vcombine_##funcsuffix (__b.val[3], \ - vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ - __o = __builtin_aarch64_set_qregxi##qmode (__o, \ - (signedtype) __temp.val[0], 0); \ - __o = __builtin_aarch64_set_qregxi##qmode (__o, \ - (signedtype) __temp.val[1], 1); \ - __o = __builtin_aarch64_set_qregxi##qmode (__o, \ - (signedtype) __temp.val[2], 2); \ - __o = __builtin_aarch64_set_qregxi##qmode (__o, \ - (signedtype) __temp.val[3], 3); \ - __builtin_aarch64_st4_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \ - __ptr, __o, __c); \ +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_lane_f16 (float16_t *__ptr, float16x4x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + float16x8x4_t __temp; + __temp.val[0] = vcombine_f16 (__val.val[0], + vcreate_f16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f16 (__val.val[1], + vcreate_f16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_f16 (__val.val[2], + vcreate_f16 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_f16 (__val.val[3], + vcreate_f16 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st4_lanev4hf ((__builtin_aarch64_simd_hf *) __ptr, __o, + __lane); } -__ST4_LANE_FUNC (float16x4x4_t, float16x8x4_t, float16_t, v4hf, v8hf, hf, f16, - float16x8_t) -__ST4_LANE_FUNC (float32x2x4_t, float32x4x4_t, float32_t, v2sf, v4sf, sf, f32, - float32x4_t) -__ST4_LANE_FUNC (float64x1x4_t, float64x2x4_t, float64_t, df, v2df, df, f64, - float64x2_t) -__ST4_LANE_FUNC (poly8x8x4_t, poly8x16x4_t, poly8_t, v8qi, v16qi, qi, p8, - int8x16_t) -__ST4_LANE_FUNC (poly16x4x4_t, poly16x8x4_t, poly16_t, v4hi, v8hi, hi, p16, - int16x8_t) -__ST4_LANE_FUNC (poly64x1x4_t, poly64x2x4_t, poly64_t, di, v2di_ssps, di, p64, - poly64x2_t) -__ST4_LANE_FUNC (int8x8x4_t, int8x16x4_t, int8_t, v8qi, v16qi, qi, s8, - int8x16_t) -__ST4_LANE_FUNC (int16x4x4_t, int16x8x4_t, int16_t, v4hi, v8hi, hi, s16, - int16x8_t) -__ST4_LANE_FUNC (int32x2x4_t, int32x4x4_t, int32_t, v2si, v4si, si, s32, - int32x4_t) -__ST4_LANE_FUNC (int64x1x4_t, int64x2x4_t, int64_t, di, v2di, di, s64, - int64x2_t) -__ST4_LANE_FUNC (uint8x8x4_t, uint8x16x4_t, uint8_t, v8qi, v16qi, qi, u8, - int8x16_t) -__ST4_LANE_FUNC (uint16x4x4_t, uint16x8x4_t, uint16_t, 
v4hi, v8hi, hi, u16, - int16x8_t) -__ST4_LANE_FUNC (uint32x2x4_t, uint32x4x4_t, uint32_t, v2si, v4si, si, u32, - int32x4_t) -__ST4_LANE_FUNC (uint64x1x4_t, uint64x2x4_t, uint64_t, di, v2di, di, u64, - int64x2_t) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_lane_f32 (float32_t *__ptr, float32x2x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + float32x4x4_t __temp; + __temp.val[0] = vcombine_f32 (__val.val[0], + vcreate_f32 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f32 (__val.val[1], + vcreate_f32 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_f32 (__val.val[2], + vcreate_f32 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_f32 (__val.val[3], + vcreate_f32 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st4_lanev2sf ((__builtin_aarch64_simd_sf *) __ptr, __o, + __lane); +} -#define __ST4Q_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \ -__extension__ extern __inline void \ -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ -vst4q_lane_ ## funcsuffix (ptrtype *__ptr, \ - intype __b, const int __c) \ -{ \ - union { intype __i; \ - __builtin_aarch64_simd_xi __o; } __temp = { __b }; \ - __builtin_aarch64_st4_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \ - __ptr, __temp.__o, __c); \ +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_lane_f64 (float64_t *__ptr, float64x1x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + float64x2x4_t __temp; + __temp.val[0] = vcombine_f64 (__val.val[0], + vcreate_f64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f64 (__val.val[1], + vcreate_f64 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_f64 (__val.val[2], + vcreate_f64 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_f64 (__val.val[3], + vcreate_f64 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st4_lanedf ((__builtin_aarch64_simd_df *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_lane_p8 (poly8_t *__ptr, poly8x8x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + poly8x16x4_t __temp; + __temp.val[0] = vcombine_p8 (__val.val[0], + vcreate_p8 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p8 (__val.val[1], + vcreate_p8 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_p8 (__val.val[2], + vcreate_p8 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_p8 (__val.val[3], + vcreate_p8 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st4_lanev8qi ((__builtin_aarch64_simd_qi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_lane_p16 (poly16_t *__ptr, poly16x4x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + poly16x8x4_t __temp; + __temp.val[0] = vcombine_p16 (__val.val[0], + vcreate_p16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p16 (__val.val[1], + vcreate_p16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_p16 (__val.val[2], + vcreate_p16 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_p16 (__val.val[3], + vcreate_p16 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st4_lanev4hi ((__builtin_aarch64_simd_hi *) __ptr, __o, + 
__lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_lane_p64 (poly64_t *__ptr, poly64x1x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + poly64x2x4_t __temp; + __temp.val[0] = vcombine_p64 (__val.val[0], + vcreate_p64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p64 (__val.val[1], + vcreate_p64 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_p64 (__val.val[2], + vcreate_p64 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_p64 (__val.val[3], + vcreate_p64 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st4_lanedi ((__builtin_aarch64_simd_di *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_lane_s8 (int8_t *__ptr, int8x8x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + int8x16x4_t __temp; + __temp.val[0] = vcombine_s8 (__val.val[0], + vcreate_s8 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_s8 (__val.val[1], + vcreate_s8 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_s8 (__val.val[2], + vcreate_s8 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_s8 (__val.val[3], + vcreate_s8 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st4_lanev8qi ((__builtin_aarch64_simd_qi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_lane_s16 (int16_t *__ptr, int16x4x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + int16x8x4_t __temp; + __temp.val[0] = vcombine_s16 (__val.val[0], + vcreate_s16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_s16 (__val.val[1], + vcreate_s16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_s16 (__val.val[2], + vcreate_s16 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_s16 (__val.val[3], + vcreate_s16 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st4_lanev4hi ((__builtin_aarch64_simd_hi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_lane_s32 (int32_t *__ptr, int32x2x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + int32x4x4_t __temp; + __temp.val[0] = vcombine_s32 (__val.val[0], + vcreate_s32 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_s32 (__val.val[1], + vcreate_s32 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_s32 (__val.val[2], + vcreate_s32 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_s32 (__val.val[3], + vcreate_s32 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st4_lanev2si ((__builtin_aarch64_simd_si *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_lane_s64 (int64_t *__ptr, int64x1x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + int64x2x4_t __temp; + __temp.val[0] = vcombine_s64 (__val.val[0], + vcreate_s64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_s64 (__val.val[1], + vcreate_s64 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_s64 (__val.val[2], + vcreate_s64 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_s64 (__val.val[3], + vcreate_s64 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + 
__builtin_aarch64_st4_lanedi ((__builtin_aarch64_simd_di *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_lane_u8 (uint8_t *__ptr, uint8x8x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + uint8x16x4_t __temp; + __temp.val[0] = vcombine_u8 (__val.val[0], + vcreate_u8 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u8 (__val.val[1], + vcreate_u8 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u8 (__val.val[2], + vcreate_u8 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_u8 (__val.val[3], + vcreate_u8 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st4_lanev8qi ((__builtin_aarch64_simd_qi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_lane_u16 (uint16_t *__ptr, uint16x4x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + uint16x8x4_t __temp; + __temp.val[0] = vcombine_u16 (__val.val[0], + vcreate_u16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u16 (__val.val[1], + vcreate_u16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u16 (__val.val[2], + vcreate_u16 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_u16 (__val.val[3], + vcreate_u16 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st4_lanev4hi ((__builtin_aarch64_simd_hi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_lane_u32 (uint32_t *__ptr, uint32x2x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + uint32x4x4_t __temp; + __temp.val[0] = vcombine_u32 (__val.val[0], + vcreate_u32 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u32 (__val.val[1], + vcreate_u32 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u32 (__val.val[2], + vcreate_u32 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_u32 (__val.val[3], + vcreate_u32 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st4_lanev2si ((__builtin_aarch64_simd_si *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_lane_u64 (uint64_t *__ptr, uint64x1x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + uint64x2x4_t __temp; + __temp.val[0] = vcombine_u64 (__val.val[0], + vcreate_u64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u64 (__val.val[1], + vcreate_u64 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u64 (__val.val[2], + vcreate_u64 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_u64 (__val.val[3], + vcreate_u64 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st4_lanedi ((__builtin_aarch64_simd_di *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_lane_f16 (float16_t *__ptr, float16x8x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st4_lanev8hf ((__builtin_aarch64_simd_hf *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_lane_f32 (float32_t *__ptr, float32x4x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi 
__o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st4_lanev4sf ((__builtin_aarch64_simd_sf *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_lane_f64 (float64_t *__ptr, float64x2x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st4_lanev2df ((__builtin_aarch64_simd_df *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_lane_p8 (poly8_t *__ptr, poly8x16x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st4_lanev16qi ((__builtin_aarch64_simd_qi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_lane_p16 (poly16_t *__ptr, poly16x8x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st4_lanev8hi ((__builtin_aarch64_simd_hi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_lane_p64 (poly64_t *__ptr, poly64x2x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st4_lanev2di ((__builtin_aarch64_simd_di *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_lane_s8 (int8_t *__ptr, int8x16x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st4_lanev16qi ((__builtin_aarch64_simd_qi *) __ptr, __o, + __lane); } -__ST4Q_LANE_FUNC (float16x8x4_t, float16_t, v8hf, hf, f16) -__ST4Q_LANE_FUNC (float32x4x4_t, float32_t, v4sf, sf, f32) -__ST4Q_LANE_FUNC (float64x2x4_t, float64_t, v2df, df, f64) -__ST4Q_LANE_FUNC (poly8x16x4_t, poly8_t, v16qi, qi, p8) -__ST4Q_LANE_FUNC (poly16x8x4_t, poly16_t, v8hi, hi, p16) -__ST4Q_LANE_FUNC (poly64x2x4_t, poly64_t, v2di, di, p64) -__ST4Q_LANE_FUNC (int8x16x4_t, int8_t, v16qi, qi, s8) -__ST4Q_LANE_FUNC (int16x8x4_t, int16_t, v8hi, hi, s16) -__ST4Q_LANE_FUNC (int32x4x4_t, int32_t, v4si, si, s32) -__ST4Q_LANE_FUNC (int64x2x4_t, int64_t, v2di, di, s64) -__ST4Q_LANE_FUNC (uint8x16x4_t, uint8_t, v16qi, qi, u8) -__ST4Q_LANE_FUNC (uint16x8x4_t, uint16_t, v8hi, hi, u16) -__ST4Q_LANE_FUNC (uint32x4x4_t, uint32_t, v4si, si, u32) -__ST4Q_LANE_FUNC (uint64x2x4_t, uint64_t, v2di, di, u64) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_lane_s16 (int16_t *__ptr, int16x8x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st4_lanev8hi ((__builtin_aarch64_simd_hi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_lane_s32 (int32_t *__ptr, int32x4x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st4_lanev4si ((__builtin_aarch64_simd_si *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) 
+vst4q_lane_s64 (int64_t *__ptr, int64x2x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st4_lanev2di ((__builtin_aarch64_simd_di *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_lane_u8 (uint8_t *__ptr, uint8x16x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st4_lanev16qi ((__builtin_aarch64_simd_qi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_lane_u16 (uint16_t *__ptr, uint16x8x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st4_lanev8hi ((__builtin_aarch64_simd_hi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_lane_u32 (uint32_t *__ptr, uint32x4x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st4_lanev4si ((__builtin_aarch64_simd_si *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_lane_u64 (uint64_t *__ptr, uint64x2x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st4_lanev2di ((__builtin_aarch64_simd_di *) __ptr, __o, + __lane); +} __extension__ extern __inline int64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) @@ -33729,9 +34046,35 @@ __ST2Q_LANE_FUNC (bfloat16x8x2_t, bfloat16_t, v8bf, bf, bf16) __ST3_LANE_FUNC (bfloat16x4x3_t, bfloat16x8x3_t, bfloat16_t, v4bf, v8bf, bf, bf16, bfloat16x8_t) __ST3Q_LANE_FUNC (bfloat16x8x3_t, bfloat16_t, v8bf, bf, bf16) -__ST4_LANE_FUNC (bfloat16x4x4_t, bfloat16x8x4_t, bfloat16_t, v4bf, v8bf, bf, - bf16, bfloat16x8_t) -__ST4Q_LANE_FUNC (bfloat16x8x4_t, bfloat16_t, v8bf, bf, bf16) + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_lane_bf16 (bfloat16_t *__ptr, bfloat16x4x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + bfloat16x8x4_t __temp; + __temp.val[0] = vcombine_bf16 (__val.val[0], + vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_bf16 (__val.val[1], + vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_bf16 (__val.val[2], + vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_bf16 (__val.val[3], + vcreate_bf16 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st4_lanev4bf ((__builtin_aarch64_simd_bf *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_lane_bf16 (bfloat16_t *__ptr, bfloat16x8x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st4_lanev8bf ((__builtin_aarch64_simd_bf *) __ptr, __o, + __lane); +} #pragma GCC pop_options @@ -33956,7 +34299,5 @@ vaddq_p128 (poly128_t __a, poly128_t __b) #undef __ST2Q_LANE_FUNC #undef __ST3_LANE_FUNC #undef __ST3Q_LANE_FUNC -#undef __ST4_LANE_FUNC -#undef __ST4Q_LANE_FUNC #endif -- cgit v1.1 From 
344f879c66d308226ad2621dc208586cb8856c51 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Fri, 30 Jul 2021 10:33:08 +0100 Subject: aarch64: Use memcpy to copy structures in vst3[q]_lane intrinsics Use __builtin_memcpy to copy vector structures instead of using a union - or constructing a new opaque structure one vector at a time - in each of the vst3[q]_lane Neon intrinsics in arm_neon.h. Add new code generation tests to verify that superfluous move instructions are not generated for the vst3q_lane intrinsics. gcc/ChangeLog: 2021-07-30 Jonathan Wright * config/aarch64/arm_neon.h (__ST3_LANE_FUNC): Delete. (__ST3Q_LANE_FUNC): Delete. (vst3_lane_f16): Use __builtin_memcpy to copy vector structure instead of constructing __builtin_aarch64_simd_ci one vector at a time. (vst3_lane_f32): Likewise. (vst3_lane_f64): Likewise. (vst3_lane_p8): Likewise. (vst3_lane_p16): Likewise. (vst3_lane_p64): Likewise. (vst3_lane_s8): Likewise. (vst3_lane_s16): Likewise. (vst3_lane_s32): Likewise. (vst3_lane_s64): Likewise. (vst3_lane_u8): Likewise. (vst3_lane_u16): Likewise. (vst3_lane_u32): Likewise. (vst3_lane_u64): Likewise. (vst3_lane_bf16): Likewise. (vst3q_lane_f16): Use __builtin_memcpy to copy vector structure instead of using a union. (vst3q_lane_f32): Likewise. (vst3q_lane_f64): Likewise. (vst3q_lane_p8): Likewise. (vst3q_lane_p16): Likewise. (vst3q_lane_p64): Likewise. (vst3q_lane_s8): Likewise. (vst3q_lane_s16): Likewise. (vst3q_lane_s32): Likewise. (vst3q_lane_s64): Likewise. (vst3q_lane_u8): Likewise. (vst3q_lane_u16): Likewise. (vst3q_lane_u32): Likewise. (vst3q_lane_u64): Likewise. (vst3q_lane_bf16): Likewise. gcc/testsuite/ChangeLog: * gcc.target/aarch64/vector_structure_intrinsics.c: Add new tests. --- gcc/config/aarch64/arm_neon.h | 482 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 399 insertions(+), 83 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 6999b81..d9a833a 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -9285,89 +9285,383 @@ __ST2Q_LANE_FUNC (uint16x8x2_t, uint16_t, v8hi, hi, u16) __ST2Q_LANE_FUNC (uint32x4x2_t, uint32_t, v4si, si, u32) __ST2Q_LANE_FUNC (uint64x2x2_t, uint64_t, v2di, di, u64) -#define __ST3_LANE_FUNC(intype, largetype, ptrtype, mode, \ - qmode, ptr_mode, funcsuffix, signedtype) \ -__extension__ extern __inline void \ -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ -vst3_lane_ ## funcsuffix (ptrtype *__ptr, \ - intype __b, const int __c) \ -{ \ - __builtin_aarch64_simd_ci __o; \ - largetype __temp; \ - __temp.val[0] \ - = vcombine_##funcsuffix (__b.val[0], \ - vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ - __temp.val[1] \ - = vcombine_##funcsuffix (__b.val[1], \ - vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ - __temp.val[2] \ - = vcombine_##funcsuffix (__b.val[2], \ - vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ - __o = __builtin_aarch64_set_qregci##qmode (__o, \ - (signedtype) __temp.val[0], 0); \ - __o = __builtin_aarch64_set_qregci##qmode (__o, \ - (signedtype) __temp.val[1], 1); \ - __o = __builtin_aarch64_set_qregci##qmode (__o, \ - (signedtype) __temp.val[2], 2); \ - __builtin_aarch64_st3_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \ - __ptr, __o, __c); \ +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3_lane_f16 (float16_t *__ptr, float16x4x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + float16x8x3_t __temp; + 
__temp.val[0] = vcombine_f16 (__val.val[0], + vcreate_f16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f16 (__val.val[1], + vcreate_f16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_f16 (__val.val[2], + vcreate_f16 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st3_lanev4hf ((__builtin_aarch64_simd_hf *) __ptr, __o, + __lane); } -__ST3_LANE_FUNC (float16x4x3_t, float16x8x3_t, float16_t, v4hf, v8hf, hf, f16, - float16x8_t) -__ST3_LANE_FUNC (float32x2x3_t, float32x4x3_t, float32_t, v2sf, v4sf, sf, f32, - float32x4_t) -__ST3_LANE_FUNC (float64x1x3_t, float64x2x3_t, float64_t, df, v2df, df, f64, - float64x2_t) -__ST3_LANE_FUNC (poly8x8x3_t, poly8x16x3_t, poly8_t, v8qi, v16qi, qi, p8, - int8x16_t) -__ST3_LANE_FUNC (poly16x4x3_t, poly16x8x3_t, poly16_t, v4hi, v8hi, hi, p16, - int16x8_t) -__ST3_LANE_FUNC (poly64x1x3_t, poly64x2x3_t, poly64_t, di, v2di_ssps, di, p64, - poly64x2_t) -__ST3_LANE_FUNC (int8x8x3_t, int8x16x3_t, int8_t, v8qi, v16qi, qi, s8, - int8x16_t) -__ST3_LANE_FUNC (int16x4x3_t, int16x8x3_t, int16_t, v4hi, v8hi, hi, s16, - int16x8_t) -__ST3_LANE_FUNC (int32x2x3_t, int32x4x3_t, int32_t, v2si, v4si, si, s32, - int32x4_t) -__ST3_LANE_FUNC (int64x1x3_t, int64x2x3_t, int64_t, di, v2di, di, s64, - int64x2_t) -__ST3_LANE_FUNC (uint8x8x3_t, uint8x16x3_t, uint8_t, v8qi, v16qi, qi, u8, - int8x16_t) -__ST3_LANE_FUNC (uint16x4x3_t, uint16x8x3_t, uint16_t, v4hi, v8hi, hi, u16, - int16x8_t) -__ST3_LANE_FUNC (uint32x2x3_t, uint32x4x3_t, uint32_t, v2si, v4si, si, u32, - int32x4_t) -__ST3_LANE_FUNC (uint64x1x3_t, uint64x2x3_t, uint64_t, di, v2di, di, u64, - int64x2_t) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3_lane_f32 (float32_t *__ptr, float32x2x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + float32x4x3_t __temp; + __temp.val[0] = vcombine_f32 (__val.val[0], + vcreate_f32 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f32 (__val.val[1], + vcreate_f32 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_f32 (__val.val[2], + vcreate_f32 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st3_lanev2sf ((__builtin_aarch64_simd_sf *) __ptr, __o, + __lane); +} -#define __ST3Q_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \ -__extension__ extern __inline void \ -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ -vst3q_lane_ ## funcsuffix (ptrtype *__ptr, \ - intype __b, const int __c) \ -{ \ - union { intype __i; \ - __builtin_aarch64_simd_ci __o; } __temp = { __b }; \ - __builtin_aarch64_st3_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \ - __ptr, __temp.__o, __c); \ +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3_lane_f64 (float64_t *__ptr, float64x1x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + float64x2x3_t __temp; + __temp.val[0] = vcombine_f64 (__val.val[0], + vcreate_f64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f64 (__val.val[1], + vcreate_f64 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_f64 (__val.val[2], + vcreate_f64 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st3_lanedf ((__builtin_aarch64_simd_df *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3_lane_p8 (poly8_t *__ptr, poly8x8x3_t __val, const int 
__lane) +{ + __builtin_aarch64_simd_ci __o; + poly8x16x3_t __temp; + __temp.val[0] = vcombine_p8 (__val.val[0], + vcreate_p8 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p8 (__val.val[1], + vcreate_p8 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_p8 (__val.val[2], + vcreate_p8 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st3_lanev8qi ((__builtin_aarch64_simd_qi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3_lane_p16 (poly16_t *__ptr, poly16x4x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + poly16x8x3_t __temp; + __temp.val[0] = vcombine_p16 (__val.val[0], + vcreate_p16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p16 (__val.val[1], + vcreate_p16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_p16 (__val.val[2], + vcreate_p16 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st3_lanev4hi ((__builtin_aarch64_simd_hi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3_lane_p64 (poly64_t *__ptr, poly64x1x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + poly64x2x3_t __temp; + __temp.val[0] = vcombine_p64 (__val.val[0], + vcreate_p64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p64 (__val.val[1], + vcreate_p64 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_p64 (__val.val[2], + vcreate_p64 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st3_lanedi ((__builtin_aarch64_simd_di *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3_lane_s8 (int8_t *__ptr, int8x8x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + int8x16x3_t __temp; + __temp.val[0] = vcombine_s8 (__val.val[0], + vcreate_s8 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_s8 (__val.val[1], + vcreate_s8 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_s8 (__val.val[2], + vcreate_s8 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st3_lanev8qi ((__builtin_aarch64_simd_qi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3_lane_s16 (int16_t *__ptr, int16x4x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + int16x8x3_t __temp; + __temp.val[0] = vcombine_s16 (__val.val[0], + vcreate_s16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_s16 (__val.val[1], + vcreate_s16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_s16 (__val.val[2], + vcreate_s16 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st3_lanev4hi ((__builtin_aarch64_simd_hi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3_lane_s32 (int32_t *__ptr, int32x2x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + int32x4x3_t __temp; + __temp.val[0] = vcombine_s32 (__val.val[0], + vcreate_s32 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_s32 (__val.val[1], + vcreate_s32 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_s32 (__val.val[2], + vcreate_s32 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof 
(__temp)); + __builtin_aarch64_st3_lanev2si ((__builtin_aarch64_simd_si *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3_lane_s64 (int64_t *__ptr, int64x1x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + int64x2x3_t __temp; + __temp.val[0] = vcombine_s64 (__val.val[0], + vcreate_s64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_s64 (__val.val[1], + vcreate_s64 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_s64 (__val.val[2], + vcreate_s64 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st3_lanedi ((__builtin_aarch64_simd_di *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3_lane_u8 (uint8_t *__ptr, uint8x8x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + uint8x16x3_t __temp; + __temp.val[0] = vcombine_u8 (__val.val[0], + vcreate_u8 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u8 (__val.val[1], + vcreate_u8 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u8 (__val.val[2], + vcreate_u8 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st3_lanev8qi ((__builtin_aarch64_simd_qi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3_lane_u16 (uint16_t *__ptr, uint16x4x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + uint16x8x3_t __temp; + __temp.val[0] = vcombine_u16 (__val.val[0], + vcreate_u16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u16 (__val.val[1], + vcreate_u16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u16 (__val.val[2], + vcreate_u16 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st3_lanev4hi ((__builtin_aarch64_simd_hi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3_lane_u32 (uint32_t *__ptr, uint32x2x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + uint32x4x3_t __temp; + __temp.val[0] = vcombine_u32 (__val.val[0], + vcreate_u32 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u32 (__val.val[1], + vcreate_u32 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u32 (__val.val[2], + vcreate_u32 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st3_lanev2si ((__builtin_aarch64_simd_si *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3_lane_u64 (uint64_t *__ptr, uint64x1x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + uint64x2x3_t __temp; + __temp.val[0] = vcombine_u64 (__val.val[0], + vcreate_u64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u64 (__val.val[1], + vcreate_u64 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u64 (__val.val[2], + vcreate_u64 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st3_lanedi ((__builtin_aarch64_simd_di *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3q_lane_f16 (float16_t *__ptr, float16x8x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + __builtin_memcpy (&__o, &__val, sizeof 
(__val)); + __builtin_aarch64_st3_lanev8hf ((__builtin_aarch64_simd_hf *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3q_lane_f32 (float32_t *__ptr, float32x4x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st3_lanev4sf ((__builtin_aarch64_simd_sf *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3q_lane_f64 (float64_t *__ptr, float64x2x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st3_lanev2df ((__builtin_aarch64_simd_df *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3q_lane_p8 (poly8_t *__ptr, poly8x16x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st3_lanev16qi ((__builtin_aarch64_simd_qi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3q_lane_p16 (poly16_t *__ptr, poly16x8x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st3_lanev8hi ((__builtin_aarch64_simd_hi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3q_lane_p64 (poly64_t *__ptr, poly64x2x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st3_lanev2di ((__builtin_aarch64_simd_di *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3q_lane_s8 (int8_t *__ptr, int8x16x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st3_lanev16qi ((__builtin_aarch64_simd_qi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3q_lane_s16 (int16_t *__ptr, int16x8x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st3_lanev8hi ((__builtin_aarch64_simd_hi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3q_lane_s32 (int32_t *__ptr, int32x4x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st3_lanev4si ((__builtin_aarch64_simd_si *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3q_lane_s64 (int64_t *__ptr, int64x2x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st3_lanev2di ((__builtin_aarch64_simd_di *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3q_lane_u8 (uint8_t *__ptr, uint8x16x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + __builtin_memcpy (&__o, &__val, sizeof 
(__val)); + __builtin_aarch64_st3_lanev16qi ((__builtin_aarch64_simd_qi *) __ptr, __o, + __lane); } -__ST3Q_LANE_FUNC (float16x8x3_t, float16_t, v8hf, hf, f16) -__ST3Q_LANE_FUNC (float32x4x3_t, float32_t, v4sf, sf, f32) -__ST3Q_LANE_FUNC (float64x2x3_t, float64_t, v2df, df, f64) -__ST3Q_LANE_FUNC (poly8x16x3_t, poly8_t, v16qi, qi, p8) -__ST3Q_LANE_FUNC (poly16x8x3_t, poly16_t, v8hi, hi, p16) -__ST3Q_LANE_FUNC (poly64x2x3_t, poly64_t, v2di, di, p64) -__ST3Q_LANE_FUNC (int8x16x3_t, int8_t, v16qi, qi, s8) -__ST3Q_LANE_FUNC (int16x8x3_t, int16_t, v8hi, hi, s16) -__ST3Q_LANE_FUNC (int32x4x3_t, int32_t, v4si, si, s32) -__ST3Q_LANE_FUNC (int64x2x3_t, int64_t, v2di, di, s64) -__ST3Q_LANE_FUNC (uint8x16x3_t, uint8_t, v16qi, qi, u8) -__ST3Q_LANE_FUNC (uint16x8x3_t, uint16_t, v8hi, hi, u16) -__ST3Q_LANE_FUNC (uint32x4x3_t, uint32_t, v4si, si, u32) -__ST3Q_LANE_FUNC (uint64x2x3_t, uint64_t, v2di, di, u64) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3q_lane_u16 (uint16_t *__ptr, uint16x8x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st3_lanev8hi ((__builtin_aarch64_simd_hi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3q_lane_u32 (uint32_t *__ptr, uint32x4x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st3_lanev4si ((__builtin_aarch64_simd_si *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3q_lane_u64 (uint64_t *__ptr, uint64x2x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st3_lanev2di ((__builtin_aarch64_simd_di *) __ptr, __o, + __lane); +} __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) @@ -34043,9 +34337,33 @@ __LD4Q_LANE_FUNC (bfloat16x8x4_t, bfloat16x8_t, bfloat16_t, v8bf, bf, bf16) __ST2_LANE_FUNC (bfloat16x4x2_t, bfloat16x8x2_t, bfloat16_t, v4bf, v8bf, bf, bf16, bfloat16x8_t) __ST2Q_LANE_FUNC (bfloat16x8x2_t, bfloat16_t, v8bf, bf, bf16) -__ST3_LANE_FUNC (bfloat16x4x3_t, bfloat16x8x3_t, bfloat16_t, v4bf, v8bf, bf, - bf16, bfloat16x8_t) -__ST3Q_LANE_FUNC (bfloat16x8x3_t, bfloat16_t, v8bf, bf, bf16) + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3_lane_bf16 (bfloat16_t *__ptr, bfloat16x4x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + bfloat16x8x3_t __temp; + __temp.val[0] = vcombine_bf16 (__val.val[0], + vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_bf16 (__val.val[1], + vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_bf16 (__val.val[2], + vcreate_bf16 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st3_lanev4bf ((__builtin_aarch64_simd_bf *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3q_lane_bf16 (bfloat16_t *__ptr, bfloat16x8x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st3_lanev8bf ((__builtin_aarch64_simd_bf *) __ptr, __o, + __lane); +} __extension__ extern __inline void 
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) @@ -34297,7 +34615,5 @@ vaddq_p128 (poly128_t __a, poly128_t __b) #undef __LD4Q_LANE_FUNC #undef __ST2_LANE_FUNC #undef __ST2Q_LANE_FUNC -#undef __ST3_LANE_FUNC -#undef __ST3Q_LANE_FUNC #endif -- cgit v1.1 From 1deb0818f4bca408994f666c1bd43289753507f5 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Fri, 30 Jul 2021 11:29:45 +0100 Subject: aarch64: Use memcpy to copy structures in vst2[q]_lane intrinsics Use __builtin_memcpy to copy vector structures instead of using a union - or constructing a new opaque structure one vector at a time - in each of the vst2[q]_lane Neon intrinsics in arm_neon.h. Add new code generation tests to verify that superfluous move instructions are not generated for the vst2q_lane intrinsics. gcc/ChangeLog: 2021-07-30 Jonathan Wright * config/aarch64/arm_neon.h (__ST2_LANE_FUNC): Delete. (__ST2Q_LANE_FUNC): Delete. (vst2_lane_f16): Use __builtin_memcpy to copy vector structure instead of constructing __builtin_aarch64_simd_oi one vector at a time. (vst2_lane_f32): Likewise. (vst2_lane_f64): Likewise. (vst2_lane_p8): Likewise. (vst2_lane_p16): Likewise. (vst2_lane_p64): Likewise. (vst2_lane_s8): Likewise. (vst2_lane_s16): Likewise. (vst2_lane_s32): Likewise. (vst2_lane_s64): Likewise. (vst2_lane_u8): Likewise. (vst2_lane_u16): Likewise. (vst2_lane_u32): Likewise. (vst2_lane_u64): Likewise. (vst2_lane_bf16): Likewise. (vst2q_lane_f16): Use __builtin_memcpy to copy vector structure instead of using a union. (vst2q_lane_f32): Likewise. (vst2q_lane_f64): Likewise. (vst2q_lane_p8): Likewise. (vst2q_lane_p16): Likewise. (vst2q_lane_p64): Likewise. (vst2q_lane_s8): Likewise. (vst2q_lane_s16): Likewise. (vst2q_lane_s32): Likewise. (vst2q_lane_s64): Likewise. (vst2q_lane_u8): Likewise. (vst2q_lane_u16): Likewise. (vst2q_lane_u32): Likewise. (vst2q_lane_u64): Likewise. (vst2q_lane_bf16): Likewise. gcc/testsuite/ChangeLog: * gcc.target/aarch64/vector_structure_intrinsics.c: Add new tests. 
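To make the shape of this conversion concrete, here is a sketch of one representative q-form intrinsic, vst2q_lane_s32, taken from the patch below (the other element types follow the same pattern; variable names are shown with the new __val/__lane spelling). The old macro-generated body punned the structure through a union:

  union { int32x4x2_t __i; __builtin_aarch64_simd_oi __o; } __temp = { __val };
  __builtin_aarch64_st2_lanev4si ((__builtin_aarch64_simd_si *) __ptr,
                                  __temp.__o, __lane);

The rewritten intrinsic copies the structure into the opaque register-tuple type directly:

  __builtin_aarch64_simd_oi __o;
  __builtin_memcpy (&__o, &__val, sizeof (__val));
  __builtin_aarch64_st2_lanev4si ((__builtin_aarch64_simd_si *) __ptr, __o,
                                  __lane);

The structure copy is expected to be folded away entirely, which is what the new code generation tests verify.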
--- gcc/config/aarch64/arm_neon.h | 454 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 372 insertions(+), 82 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index d9a833a..cbae61d 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -9206,84 +9206,355 @@ __STRUCTN (float, 64, 4) #undef __STRUCTN -#define __ST2_LANE_FUNC(intype, largetype, ptrtype, mode, \ - qmode, ptr_mode, funcsuffix, signedtype) \ -__extension__ extern __inline void \ -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ -vst2_lane_ ## funcsuffix (ptrtype *__ptr, \ - intype __b, const int __c) \ -{ \ - __builtin_aarch64_simd_oi __o; \ - largetype __temp; \ - __temp.val[0] \ - = vcombine_##funcsuffix (__b.val[0], \ - vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ - __temp.val[1] \ - = vcombine_##funcsuffix (__b.val[1], \ - vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ - __o = __builtin_aarch64_set_qregoi##qmode (__o, \ - (signedtype) __temp.val[0], 0); \ - __o = __builtin_aarch64_set_qregoi##qmode (__o, \ - (signedtype) __temp.val[1], 1); \ - __builtin_aarch64_st2_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \ - __ptr, __o, __c); \ -} - -__ST2_LANE_FUNC (float16x4x2_t, float16x8x2_t, float16_t, v4hf, v8hf, hf, f16, - float16x8_t) -__ST2_LANE_FUNC (float32x2x2_t, float32x4x2_t, float32_t, v2sf, v4sf, sf, f32, - float32x4_t) -__ST2_LANE_FUNC (float64x1x2_t, float64x2x2_t, float64_t, df, v2df, df, f64, - float64x2_t) -__ST2_LANE_FUNC (poly8x8x2_t, poly8x16x2_t, poly8_t, v8qi, v16qi, qi, p8, - int8x16_t) -__ST2_LANE_FUNC (poly16x4x2_t, poly16x8x2_t, poly16_t, v4hi, v8hi, hi, p16, - int16x8_t) -__ST2_LANE_FUNC (poly64x1x2_t, poly64x2x2_t, poly64_t, di, v2di_ssps, di, p64, - poly64x2_t) -__ST2_LANE_FUNC (int8x8x2_t, int8x16x2_t, int8_t, v8qi, v16qi, qi, s8, - int8x16_t) -__ST2_LANE_FUNC (int16x4x2_t, int16x8x2_t, int16_t, v4hi, v8hi, hi, s16, - int16x8_t) -__ST2_LANE_FUNC (int32x2x2_t, int32x4x2_t, int32_t, v2si, v4si, si, s32, - int32x4_t) -__ST2_LANE_FUNC (int64x1x2_t, int64x2x2_t, int64_t, di, v2di, di, s64, - int64x2_t) -__ST2_LANE_FUNC (uint8x8x2_t, uint8x16x2_t, uint8_t, v8qi, v16qi, qi, u8, - int8x16_t) -__ST2_LANE_FUNC (uint16x4x2_t, uint16x8x2_t, uint16_t, v4hi, v8hi, hi, u16, - int16x8_t) -__ST2_LANE_FUNC (uint32x2x2_t, uint32x4x2_t, uint32_t, v2si, v4si, si, u32, - int32x4_t) -__ST2_LANE_FUNC (uint64x1x2_t, uint64x2x2_t, uint64_t, di, v2di, di, u64, - int64x2_t) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_lane_f16 (float16_t *__ptr, float16x4x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + float16x8x2_t __temp; + __temp.val[0] = vcombine_f16 (__val.val[0], + vcreate_f16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f16 (__val.val[1], + vcreate_f16 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st2_lanev4hf ((__builtin_aarch64_simd_hf *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_lane_f32 (float32_t *__ptr, float32x2x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + float32x4x2_t __temp; + __temp.val[0] = vcombine_f32 (__val.val[0], + vcreate_f32 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f32 (__val.val[1], + vcreate_f32 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + 
__builtin_aarch64_st2_lanev2sf ((__builtin_aarch64_simd_sf *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_lane_f64 (float64_t *__ptr, float64x1x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + float64x2x2_t __temp; + __temp.val[0] = vcombine_f64 (__val.val[0], + vcreate_f64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f64 (__val.val[1], + vcreate_f64 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st2_lanedf ((__builtin_aarch64_simd_df *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_lane_p8 (poly8_t *__ptr, poly8x8x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + poly8x16x2_t __temp; + __temp.val[0] = vcombine_p8 (__val.val[0], + vcreate_p8 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p8 (__val.val[1], + vcreate_p8 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st2_lanev8qi ((__builtin_aarch64_simd_qi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_lane_p16 (poly16_t *__ptr, poly16x4x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + poly16x8x2_t __temp; + __temp.val[0] = vcombine_p16 (__val.val[0], + vcreate_p16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p16 (__val.val[1], + vcreate_p16 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st2_lanev4hi ((__builtin_aarch64_simd_hi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_lane_p64 (poly64_t *__ptr, poly64x1x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + poly64x2x2_t __temp; + __temp.val[0] = vcombine_p64 (__val.val[0], + vcreate_p64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p64 (__val.val[1], + vcreate_p64 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st2_lanedi ((__builtin_aarch64_simd_di *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_lane_s8 (int8_t *__ptr, int8x8x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + int8x16x2_t __temp; + __temp.val[0] = vcombine_s8 (__val.val[0], + vcreate_s8 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_s8 (__val.val[1], + vcreate_s8 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st2_lanev8qi ((__builtin_aarch64_simd_qi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_lane_s16 (int16_t *__ptr, int16x4x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + int16x8x2_t __temp; + __temp.val[0] = vcombine_s16 (__val.val[0], + vcreate_s16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_s16 (__val.val[1], + vcreate_s16 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st2_lanev4hi ((__builtin_aarch64_simd_hi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_lane_s32 (int32_t *__ptr, int32x2x2_t __val, const 
int __lane) +{ + __builtin_aarch64_simd_oi __o; + int32x4x2_t __temp; + __temp.val[0] = vcombine_s32 (__val.val[0], + vcreate_s32 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_s32 (__val.val[1], + vcreate_s32 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st2_lanev2si ((__builtin_aarch64_simd_si *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_lane_s64 (int64_t *__ptr, int64x1x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + int64x2x2_t __temp; + __temp.val[0] = vcombine_s64 (__val.val[0], + vcreate_s64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_s64 (__val.val[1], + vcreate_s64 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st2_lanedi ((__builtin_aarch64_simd_di *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_lane_u8 (uint8_t *__ptr, uint8x8x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + uint8x16x2_t __temp; + __temp.val[0] = vcombine_u8 (__val.val[0], + vcreate_u8 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u8 (__val.val[1], + vcreate_u8 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st2_lanev8qi ((__builtin_aarch64_simd_qi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_lane_u16 (uint16_t *__ptr, uint16x4x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + uint16x8x2_t __temp; + __temp.val[0] = vcombine_u16 (__val.val[0], + vcreate_u16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u16 (__val.val[1], + vcreate_u16 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st2_lanev4hi ((__builtin_aarch64_simd_hi *) __ptr, __o, + __lane); +} -#define __ST2Q_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \ -__extension__ extern __inline void \ -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ -vst2q_lane_ ## funcsuffix (ptrtype *__ptr, \ - intype __b, const int __c) \ -{ \ - union { intype __i; \ - __builtin_aarch64_simd_oi __o; } __temp = { __b }; \ - __builtin_aarch64_st2_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \ - __ptr, __temp.__o, __c); \ -} - -__ST2Q_LANE_FUNC (float16x8x2_t, float16_t, v8hf, hf, f16) -__ST2Q_LANE_FUNC (float32x4x2_t, float32_t, v4sf, sf, f32) -__ST2Q_LANE_FUNC (float64x2x2_t, float64_t, v2df, df, f64) -__ST2Q_LANE_FUNC (poly8x16x2_t, poly8_t, v16qi, qi, p8) -__ST2Q_LANE_FUNC (poly16x8x2_t, poly16_t, v8hi, hi, p16) -__ST2Q_LANE_FUNC (poly64x2x2_t, poly64_t, v2di, di, p64) -__ST2Q_LANE_FUNC (int8x16x2_t, int8_t, v16qi, qi, s8) -__ST2Q_LANE_FUNC (int16x8x2_t, int16_t, v8hi, hi, s16) -__ST2Q_LANE_FUNC (int32x4x2_t, int32_t, v4si, si, s32) -__ST2Q_LANE_FUNC (int64x2x2_t, int64_t, v2di, di, s64) -__ST2Q_LANE_FUNC (uint8x16x2_t, uint8_t, v16qi, qi, u8) -__ST2Q_LANE_FUNC (uint16x8x2_t, uint16_t, v8hi, hi, u16) -__ST2Q_LANE_FUNC (uint32x4x2_t, uint32_t, v4si, si, u32) -__ST2Q_LANE_FUNC (uint64x2x2_t, uint64_t, v2di, di, u64) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_lane_u32 (uint32_t *__ptr, uint32x2x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + uint32x4x2_t __temp; + __temp.val[0] = 
vcombine_u32 (__val.val[0], + vcreate_u32 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u32 (__val.val[1], + vcreate_u32 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st2_lanev2si ((__builtin_aarch64_simd_si *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_lane_u64 (uint64_t *__ptr, uint64x1x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + uint64x2x2_t __temp; + __temp.val[0] = vcombine_u64 (__val.val[0], + vcreate_u64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u64 (__val.val[1], + vcreate_u64 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st2_lanedi ((__builtin_aarch64_simd_di *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_lane_f16 (float16_t *__ptr, float16x8x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st2_lanev8hf ((__builtin_aarch64_simd_hf *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_lane_f32 (float32_t *__ptr, float32x4x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st2_lanev4sf ((__builtin_aarch64_simd_sf *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_lane_f64 (float64_t *__ptr, float64x2x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st2_lanev2df ((__builtin_aarch64_simd_df *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_lane_p8 (poly8_t *__ptr, poly8x16x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st2_lanev16qi ((__builtin_aarch64_simd_qi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_lane_p16 (poly16_t *__ptr, poly16x8x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st2_lanev8hi ((__builtin_aarch64_simd_hi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_lane_p64 (poly64_t *__ptr, poly64x2x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st2_lanev2di ((__builtin_aarch64_simd_di *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_lane_s8 (int8_t *__ptr, int8x16x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st2_lanev16qi ((__builtin_aarch64_simd_qi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_lane_s16 (int16_t *__ptr, int16x8x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi 
__o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st2_lanev8hi ((__builtin_aarch64_simd_hi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_lane_s32 (int32_t *__ptr, int32x4x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st2_lanev4si ((__builtin_aarch64_simd_si *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_lane_s64 (int64_t *__ptr, int64x2x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st2_lanev2di ((__builtin_aarch64_simd_di *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_lane_u8 (uint8_t *__ptr, uint8x16x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st2_lanev16qi ((__builtin_aarch64_simd_qi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_lane_u16 (uint16_t *__ptr, uint16x8x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st2_lanev8hi ((__builtin_aarch64_simd_hi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_lane_u32 (uint32_t *__ptr, uint32x4x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st2_lanev4si ((__builtin_aarch64_simd_si *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_lane_u64 (uint64_t *__ptr, uint64x2x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st2_lanev2di ((__builtin_aarch64_simd_di *) __ptr, __o, + __lane); +} __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) @@ -34334,9 +34605,30 @@ __LD4_LANE_FUNC (bfloat16x4x4_t, bfloat16x4_t, bfloat16x8x4_t, bfloat16_t, v4bf, v8bf, bf, bf16, bfloat16x8_t) __LD4Q_LANE_FUNC (bfloat16x8x4_t, bfloat16x8_t, bfloat16_t, v8bf, bf, bf16) -__ST2_LANE_FUNC (bfloat16x4x2_t, bfloat16x8x2_t, bfloat16_t, v4bf, v8bf, bf, - bf16, bfloat16x8_t) -__ST2Q_LANE_FUNC (bfloat16x8x2_t, bfloat16_t, v8bf, bf, bf16) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_lane_bf16 (bfloat16_t *__ptr, bfloat16x4x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + bfloat16x8x2_t __temp; + __temp.val[0] = vcombine_bf16 (__val.val[0], + vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_bf16 (__val.val[1], + vcreate_bf16 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st2_lanev4bf ((__builtin_aarch64_simd_bf *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_lane_bf16 (bfloat16_t *__ptr, bfloat16x8x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + 
__builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st2_lanev8bf ((__builtin_aarch64_simd_bf *) __ptr, __o, + __lane); +} __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) @@ -34613,7 +34905,5 @@ vaddq_p128 (poly128_t __a, poly128_t __b) #undef __LD3Q_LANE_FUNC #undef __LD4_LANE_FUNC #undef __LD4Q_LANE_FUNC -#undef __ST2_LANE_FUNC -#undef __ST2Q_LANE_FUNC #endif -- cgit v1.1 From bc181adf26eae77eacb73d4397ac479dac114d2d Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Fri, 30 Jul 2021 15:30:19 +0100 Subject: aarch64: Use memcpy to copy structures in bfloat vst* intrinsics Use __builtin_memcpy to copy vector structures instead of using a union - or constructing a new opaque structure one vector at a time - in each of the vst[234][q] and vst1[q]_x[234] bfloat Neon intrinsics in arm_neon.h. Add new code generation tests to verify that superfluous move instructions are not generated for the vst[234]q or vst1q_x[234] bfloat intrinsics. gcc/ChangeLog: 2021-07-30 Jonathan Wright * config/aarch64/arm_neon.h (vst1_bf16_x2): Use __builtin_memcpy instead of constructing an additional __builtin_aarch64_simd_oi one vector at a time. (vst1q_bf16_x2): Likewise. (vst1_bf16_x3): Use __builtin_memcpy instead of constructing an additional __builtin_aarch64_simd_ci one vector at a time. (vst1q_bf16_x3): Likewise. (vst1_bf16_x4): Use __builtin_memcpy instead of a union. (vst1q_bf16_x4): Likewise. (vst2_bf16): Use __builtin_memcpy instead of constructing an additional __builtin_aarch64_simd_oi one vector at a time. (vst2q_bf16): Likewise. (vst3_bf16): Use __builtin_memcpy instead of constructing an additional __builtin_aarch64_simd_ci mode one vector at a time. (vst3q_bf16): Likewise. (vst4_bf16): Use __builtin_memcpy instead of constructing an additional __builtin_aarch64_simd_xi one vector at a time. (vst4q_bf16): Likewise. gcc/testsuite/ChangeLog: * gcc.target/aarch64/vector_structure_intrinsics.c: Add new tests. 
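The bfloat variants follow the same idea; as a concrete sketch (vst2q_bf16, taken from the patch below), the opaque value was previously assembled one vector at a time:

  __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[0], 0);
  __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[1], 1);
  __builtin_aarch64_st2v8bf (__a, __o);

and is now produced by a single structure copy:

  __builtin_memcpy (&__o, &__val, sizeof (__val));
  __builtin_aarch64_st2v8bf (__a, __o);

The 64-bit ("d" register) forms still widen each half vector with vcombine_bf16/vcreate_bf16 into a temporary before the copy, as they did before.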
--- gcc/config/aarch64/arm_neon.h | 57 +++++++++++++++++-------------------------- 1 file changed, 23 insertions(+), 34 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index cbae61d..390cf9a 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -33839,8 +33839,7 @@ vst1_bf16_x2 (bfloat16_t * __a, bfloat16x4x2_t __val) bfloat16x8x2_t __temp; __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st1x2v4bf (__a, __o); } @@ -33849,8 +33848,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_bf16_x2 (bfloat16_t * __a, bfloat16x8x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[1], 1); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st1x2v8bf (__a, __o); } @@ -33863,9 +33861,7 @@ vst1_bf16_x3 (bfloat16_t * __a, bfloat16x4x3_t __val) __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); __temp.val[2] = vcombine_bf16 (__val.val[2], vcreate_bf16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[2], 2); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st1x3v4bf ((__builtin_aarch64_simd_bf *) __a, __o); } @@ -33874,26 +33870,31 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_bf16_x3 (bfloat16_t * __a, bfloat16x8x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[2], 2); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st1x3v8bf ((__builtin_aarch64_simd_bf *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_bf16_x4 (bfloat16_t * __a, bfloat16x4x4_t val) +vst1_bf16_x4 (bfloat16_t * __a, bfloat16x4x4_t __val) { - union { bfloat16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4v4bf ((__builtin_aarch64_simd_bf *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + bfloat16x8x4_t __temp; + __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_bf16 (__val.val[2], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_bf16 (__val.val[3], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st1x4v4bf ((__builtin_aarch64_simd_bf *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_bf16_x4 (bfloat16_t * __a, bfloat16x8x4_t val) +vst1q_bf16_x4 
(bfloat16_t * __a, bfloat16x8x4_t __val) { - union { bfloat16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4v8bf ((__builtin_aarch64_simd_bf *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st1x4v8bf ((__builtin_aarch64_simd_bf *) __a, __o); } __extension__ extern __inline void @@ -33925,8 +33926,7 @@ vst2_bf16 (bfloat16_t * __a, bfloat16x4x2_t __val) bfloat16x8x2_t __temp; __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st2v4bf (__a, __o); } @@ -33935,8 +33935,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst2q_bf16 (bfloat16_t * __a, bfloat16x8x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[1], 1); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st2v8bf (__a, __o); } @@ -33949,9 +33948,7 @@ vst3_bf16 (bfloat16_t * __a, bfloat16x4x3_t __val) __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); __temp.val[2] = vcombine_bf16 (__val.val[2], vcreate_bf16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[2], 2); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st3v4bf ((__builtin_aarch64_simd_bf *) __a, __o); } @@ -33960,9 +33957,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst3q_bf16 (bfloat16_t * __a, bfloat16x8x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[2], 2); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st3v8bf ((__builtin_aarch64_simd_bf *) __a, __o); } @@ -33976,10 +33971,7 @@ vst4_bf16 (bfloat16_t * __a, bfloat16x4x4_t __val) __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); __temp.val[2] = vcombine_bf16 (__val.val[2], vcreate_bf16 (__AARCH64_UINT64_C (0))); __temp.val[3] = vcombine_bf16 (__val.val[3], vcreate_bf16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[3], 3); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st4v4bf ((__builtin_aarch64_simd_bf *) __a, __o); } @@ -33988,10 +33980,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4q_bf16 (bfloat16_t * __a, bfloat16x8x4_t __val) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[0], 0); - 
__o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[3], 3); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st4v8bf ((__builtin_aarch64_simd_bf *) __a, __o); } -- cgit v1.1 From 6b0bde7eef492843426c3f6b2da229b3c1526eaa Mon Sep 17 00:00:00 2001 From: Tamar Christina Date: Fri, 6 Aug 2021 12:21:05 +0100 Subject: middle-end/AArch64: Fix bootstrap after vec changes The build is broken since a3d3e8c362c2, which deleted the ability to pass vec<> by value; vec<> must now be passed by reference. However, some language hooks used by AArch64 were not updated, which breaks the build on AArch64. This patch updates these hooks. gcc/c/ChangeLog: * c-decl.c (c_simulate_enum_decl): Pass vec<> by pointer. * c-tree.h (c_simulate_enum_decl): Likewise. gcc/ChangeLog: * config/aarch64/aarch64-sve-builtins.cc (register_svpattern, register_svprfop): Pass vec<> by pointer. * langhooks-def.h (lhd_simulate_enum_decl): Likewise. * langhooks.c (lhd_simulate_enum_decl): Likewise. * langhooks.h (struct lang_hooks_for_types): Likewise. gcc/cp/ChangeLog: * cp-objcp-common.h (cxx_simulate_enum_decl): Pass vec<> by pointer. * decl.c (cxx_simulate_enum_decl): Likewise. --- gcc/config/aarch64/aarch64-sve-builtins.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc b/gcc/config/aarch64/aarch64-sve-builtins.cc index f44f81f..f71b287 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins.cc +++ b/gcc/config/aarch64/aarch64-sve-builtins.cc @@ -3499,7 +3499,7 @@ register_svpattern () #undef PUSH acle_svpattern = lang_hooks.types.simulate_enum_decl (input_location, - "svpattern", values); + "svpattern", &values); } /* Register the svprfop enum. */ @@ -3513,7 +3513,7 @@ register_svprfop () #undef PUSH acle_svprfop = lang_hooks.types.simulate_enum_decl (input_location, - "svprfop", values); + "svprfop", &values); } /* Implement #pragma GCC aarch64 "arm_sve.h". */ -- cgit v1.1 From cd04e829c3ae244abd711e2597f8b72d6c58c713 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Fri, 6 Aug 2021 14:21:27 +0200 Subject: i386: Fix conditional move reg-to-reg move elimination peepholes [PR101797] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add missing operand predicate, otherwise any RTX will match. 2021-08-06 Uroš Bizjak gcc/ PR target/101797 * config/i386/i386.md (cmove reg-to-reg move elimination peephole2s): Add general_gr_operand predicate to operand 3. gcc/testsuite/ PR target/101797 * gcc.target/i386/pr101797.c: New test.

--- gcc/config/i386/i386.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 51e8b47..bc1c30b 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -19428,7 +19428,7 @@ (parallel [(set (reg FLAGS_REG) (match_operand 5)) (set (match_dup 0) (match_operand:SWI248 6))]) (set (match_operand:SWI248 2 "general_reg_operand") - (match_operand:SWI248 3)) + (match_operand:SWI248 3 "general_gr_operand")) (set (match_dup 0) (if_then_else:SWI248 (match_operator 4 "ix86_comparison_operator" [(reg FLAGS_REG) (const_int 0)]) @@ -19456,7 +19456,7 @@ ;; mov r2,r3; mov r0,r1; dec r0; cmov r0,r2 -> dec r1; mov r0,r3; cmov r0, r1 (define_peephole2 [(set (match_operand:SWI248 2 "general_reg_operand") - (match_operand:SWI248 3)) + (match_operand:SWI248 3 "general_gr_operand")) (set (match_operand:SWI248 0 "general_reg_operand") (match_operand:SWI248 1 "general_reg_operand")) (parallel [(set (reg FLAGS_REG) (match_operand 5)) -- cgit v1.1 From a5e78ee60cd54dcceb9e7cfa42edd0c29c280f5c Mon Sep 17 00:00:00 2001 From: Bin Cheng Date: Mon, 9 Aug 2021 17:21:03 +0800 Subject: aarch64: Expand % correctly according to mode iterator Pattern "*extend2_aarch64" is duplicated from the corresponding zero_extend pattern, however % needs to be expanded according to its mode iterator because the smov instruction is different to umov. 2021-08-09 Bin Cheng gcc/ * config/aarch64/aarch64.md (*extend2_aarch64): Use %0. --- gcc/config/aarch64/aarch64.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index eb8ccd4..7085cd4 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -1880,7 +1880,7 @@ "@ sxt\t%0, %w1 ldrs\t%0, %1 - smov\t%w0, %1.[0]" + smov\t%0, %1.[0]" [(set_attr "type" "extend,load_4,neon_to_gp") (set_attr "arch" "*,*,fp")] ) -- cgit v1.1 From e2e0b85c1e7cb53fd720df0d09278e3d485c733e Mon Sep 17 00:00:00 2001 From: Tejas Belagod Date: Mon, 9 Aug 2021 11:33:30 +0100 Subject: PR101609: Use the correct iterator for AArch64 vector right shift pattern Loops containing long long shifts fail to vectorize due to the vectorizer not being able to recognize long long right shifts. This is due to a bug in the iterator used for the vashr and vlshr patterns in aarch64-simd.md. 2021-08-09 Tejas Belagod gcc/ChangeLog PR target/101609 * config/aarch64/aarch64-simd.md (vlshr3, vashr3): Use the right iterator. gcc/testsuite/ChangeLog * gcc.target/aarch64/vect-shr-reg.c: New testcase. * gcc.target/aarch64/vect-shr-reg-run.c: Likewise. --- gcc/config/aarch64/aarch64-simd.md | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index c5638d0..48eddf6 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -1299,13 +1299,10 @@ DONE; }) -;; Using mode VDQ_BHSI as there is no V2DImode neg! -;; Negating individual lanes most certainly offsets the -;; gain from vectorization. 
(define_expand "vashr3" - [(match_operand:VDQ_BHSI 0 "register_operand") - (match_operand:VDQ_BHSI 1 "register_operand") - (match_operand:VDQ_BHSI 2 "register_operand")] + [(match_operand:VDQ_I 0 "register_operand") + (match_operand:VDQ_I 1 "register_operand") + (match_operand:VDQ_I 2 "register_operand")] "TARGET_SIMD" { rtx neg = gen_reg_rtx (mode); @@ -1333,9 +1330,9 @@ ) (define_expand "vlshr3" - [(match_operand:VDQ_BHSI 0 "register_operand") - (match_operand:VDQ_BHSI 1 "register_operand") - (match_operand:VDQ_BHSI 2 "register_operand")] + [(match_operand:VDQ_I 0 "register_operand") + (match_operand:VDQ_I 1 "register_operand") + (match_operand:VDQ_I 2 "register_operand")] "TARGET_SIMD" { rtx neg = gen_reg_rtx (mode); -- cgit v1.1 From e2a58ed6dc5293602d0d168475109caa81ad0f0d Mon Sep 17 00:00:00 2001 From: Julian Brown Date: Tue, 2 Mar 2021 04:20:11 -0800 Subject: openacc: Middle-end worker-partitioning support This patch implements worker-partitioning support in the middle end, by rewriting gimple. The OpenACC execution model requires that code can run in either "worker single" mode where only a single worker per gang is active, or "worker partitioned" mode, where multiple workers per gang are active. This means we need to do something equivalent to spawning additional workers when transitioning from worker-single to worker-partitioned mode. However, GPUs typically fix the number of threads of invoked kernels at launch time, so we need to do something with the "extra" threads when they are not wanted. The scheme used is to conditionalise each basic block that executes in "worker single" mode for worker 0 only. Conditional branches are handled specially so "idle" (non-0) workers follow along with worker 0. On transitioning to "worker partitioned" mode, any variables modified by worker 0 are propagated to the other workers via GPU shared memory. Special care is taken for routine calls, writes through pointers, and so forth, as follows: - There are two types of function calls to consider in worker-single mode: "normal" calls to maths library routines, etc. are called from worker 0 only. OpenACC routines may contain worker-partitioned loops themselves, so are called from all workers, including "idle" ones. - SSA names set in worker-single mode, but used in worker-partitioned mode, are copied to shared memory in worker 0. Other workers retrieve the value from the appropriate shared-memory location after a barrier, and new phi nodes are introduced at the convergence point to resolve the worker 0/other worker copies of the value. - Local scalar variables (on the stack) also need special handling. We broadcast any variables that are written in the current worker-single block, and that are read in any worker-partitioned block. (This is believed to be safe, and is flow-insensitive to ease analysis.) - Local aggregates (arrays and composites) on the stack are *not* broadcast. Instead we force gimple stmts modifying elements/fields of local aggregates into fully-partitioned mode. The RHS of the assignment is a scalar, and is thus subject to broadcasting as above. - Writes through pointers may affect any local variable that has its address taken. We use points-to analysis to determine the set of potentially-affected variables for a given pointer indirection. We broadcast any such variable which is used in worker-partitioned mode, on a per-block basis for any block containing a write through a pointer. 
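As a rough illustration of the scheme described above, the following hand-written C sketch shows the intent of the transformation. It is not the GIMPLE the pass actually emits, and the names __worker_id, __worker_barrier and __broadcast are invented stand-ins for the target-specific builtins and the shared-memory propagation record:

  extern unsigned __worker_id (void);   /* hypothetical: index of this worker */
  extern void __worker_barrier (void);  /* hypothetical: gang-local barrier */

  static struct { float x; } __broadcast;  /* stand-in for the shared-memory
                                              propagation record */

  void
  example (float a, float *out, unsigned n)
  {
    float x;
    /* Worker-single block: only worker 0 executes the statements; the
       "idle" workers are neutered but follow the same control flow.  */
    if (__worker_id () == 0)
      {
        x = a * a;              /* value computed in worker-single mode */
        __broadcast.x = x;      /* worker 0 stages it in shared memory */
      }
    __worker_barrier ();
    x = __broadcast.x;          /* every worker picks the value back up */
    /* Worker-partitioned mode: all (e.g. 16) workers run their share.  */
    for (unsigned i = __worker_id (); i < n; i += 16)
      out[i] = x + i;
  }

In the real pass the re-convergence is expressed with new phi nodes at the point where worker-single and worker-partitioned code meet, rather than with the explicit reload written here.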
Some slides about the implementation (from 2018) are available at: https://jtb20.github.io/gcnworkers.pdf gcc/ * Makefile.in (OBJS): Add omp-oacc-neuter-broadcast.o. * doc/tm.texi.in (TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD): Add documentation hook. * doc/tm.texi: Regenerate. * omp-oacc-neuter-broadcast.cc: New file. * omp-builtins.def (BUILT_IN_GOACC_BARRIER) (BUILT_IN_GOACC_SINGLE_START, BUILT_IN_GOACC_SINGLE_COPY_START) (BUILT_IN_GOACC_SINGLE_COPY_END): New builtins. * passes.def (pass_omp_oacc_neuter_broadcast): Add pass. * target.def (goacc.create_worker_broadcast_record): Add target hook. * tree-pass.h (make_pass_omp_oacc_neuter_broadcast): Add prototype. * config/gcn/gcn-protos.h (gcn_goacc_adjust_propagation_record): Rename prototype to... (gcn_goacc_create_worker_broadcast_record): ... this. * config/gcn/gcn-tree.c (gcn_goacc_adjust_propagation_record): Rename function to... (gcn_goacc_create_worker_broadcast_record): ... this. * config/gcn/gcn.c (TARGET_GOACC_ADJUST_PROPAGATION_RECORD): Rename to... (TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD): ... this. Co-Authored-By: Nathan Sidwell (via 'gcc/config/nvptx/nvptx.c' master) Co-Authored-By: Kwok Cheung Yeung Co-Authored-By: Thomas Schwinge --- gcc/config/gcn/gcn-protos.h | 5 ++-- gcc/config/gcn/gcn-tree.c | 58 ++++++++++++++++++++++----------------------- gcc/config/gcn/gcn.c | 6 ++--- 3 files changed, 35 insertions(+), 34 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/gcn/gcn-protos.h b/gcc/config/gcn/gcn-protos.h index 8bd0b43..5d62a84 100644 --- a/gcc/config/gcn/gcn-protos.h +++ b/gcc/config/gcn/gcn-protos.h @@ -38,9 +38,10 @@ extern rtx gcn_full_exec (); extern rtx gcn_full_exec_reg (); extern rtx gcn_gen_undef (machine_mode); extern bool gcn_global_address_p (rtx); -extern tree gcn_goacc_adjust_propagation_record (tree record_type, bool sender, - const char *name); extern tree gcn_goacc_adjust_private_decl (location_t, tree var, int level); +extern tree gcn_goacc_create_worker_broadcast_record (tree record_type, + bool sender, + const char *name); extern void gcn_goacc_reduction (gcall *call); extern bool gcn_hard_regno_rename_ok (unsigned int from_reg, unsigned int to_reg); diff --git a/gcc/config/gcn/gcn-tree.c b/gcc/config/gcn/gcn-tree.c index 1eb8882..f722d2d 100644 --- a/gcc/config/gcn/gcn-tree.c +++ b/gcc/config/gcn/gcn-tree.c @@ -548,35 +548,6 @@ gcn_goacc_reduction (gcall *call) } } -/* Implement TARGET_GOACC_ADJUST_PROPAGATION_RECORD. - - Tweak (worker) propagation record, e.g. to put it in shared memory. */ - -tree -gcn_goacc_adjust_propagation_record (tree record_type, bool sender, - const char *name) -{ - tree type = record_type; - - TYPE_ADDR_SPACE (type) = ADDR_SPACE_LDS; - - if (!sender) - type = build_pointer_type (type); - - tree decl = create_tmp_var_raw (type, name); - - if (sender) - { - DECL_CONTEXT (decl) = NULL_TREE; - TREE_STATIC (decl) = 1; - } - - if (sender) - varpool_node::finalize_decl (decl); - - return decl; -} - tree gcn_goacc_adjust_private_decl (location_t, tree var, int level) { @@ -604,4 +575,33 @@ gcn_goacc_adjust_private_decl (location_t, tree var, int level) return var; } +/* Implement TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD. + + Create OpenACC worker state propagation record in shared memory. 
*/ + +tree +gcn_goacc_create_worker_broadcast_record (tree record_type, bool sender, + const char *name) +{ + tree type = record_type; + + TYPE_ADDR_SPACE (type) = ADDR_SPACE_LDS; + + if (!sender) + type = build_pointer_type (type); + + tree decl = create_tmp_var_raw (type, name); + + if (sender) + { + DECL_CONTEXT (decl) = NULL_TREE; + TREE_STATIC (decl) = 1; + } + + if (sender) + varpool_node::finalize_decl (decl); + + return decl; +} + /* }}} */ diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c index d25c4e5..87af5d1 100644 --- a/gcc/config/gcn/gcn.c +++ b/gcc/config/gcn/gcn.c @@ -6513,11 +6513,11 @@ gcn_dwarf_register_span (rtx rtl) #define TARGET_GIMPLIFY_VA_ARG_EXPR gcn_gimplify_va_arg_expr #undef TARGET_OMP_DEVICE_KIND_ARCH_ISA #define TARGET_OMP_DEVICE_KIND_ARCH_ISA gcn_omp_device_kind_arch_isa -#undef TARGET_GOACC_ADJUST_PROPAGATION_RECORD -#define TARGET_GOACC_ADJUST_PROPAGATION_RECORD \ - gcn_goacc_adjust_propagation_record #undef TARGET_GOACC_ADJUST_PRIVATE_DECL #define TARGET_GOACC_ADJUST_PRIVATE_DECL gcn_goacc_adjust_private_decl +#undef TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD +#define TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD \ + gcn_goacc_create_worker_broadcast_record #undef TARGET_GOACC_FORK_JOIN #define TARGET_GOACC_FORK_JOIN gcn_fork_join #undef TARGET_GOACC_REDUCTION -- cgit v1.1 From c408512e1f7ca07e07794dc13fd6dfd9d2d7e998 Mon Sep 17 00:00:00 2001 From: Julian Brown Date: Tue, 2 Mar 2021 04:20:13 -0800 Subject: amdgcn: Enable OpenACC worker partitioning for AMD GCN gcc/ * config/gcn/gcn.c (gcn_init_builtins): Override decls for BUILT_IN_GOACC_SINGLE_START, BUILT_IN_GOACC_SINGLE_COPY_START, BUILT_IN_GOACC_SINGLE_COPY_END and BUILT_IN_GOACC_BARRIER. (gcn_goacc_validate_dims): Turn on worker partitioning unconditionally. (gcn_fork_join): Update comment. * config/gcn/gcn.opt (flag_worker_partitioning): Remove. (macc_experimental_workers): Remove unused option. libgomp/ * plugin/plugin-gcn.c (gcn_exec): Change default number of workers to 16. * testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c [acc_device_radeon]: Update. * testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c [ACC_DEVICE_TYPE_radeon]: Likewise. * testsuite/libgomp.oacc-c-c++-common/parallel-dims.c [acc_device_radeon]: Likewise. * testsuite/libgomp.oacc-c-c++-common/routine-wv-2.c [ACC_DEVICE_TYPE_radeon]: Likewise. * testsuite/libgomp.oacc-fortran/optional-reduction.f90: XFAIL for 'openacc_radeon_accel_selected' and '-O0'. * testsuite/libgomp.oacc-fortran/reduction-7.f90: Likewise. Co-Authored-By: Kwok Cheung Yeung Co-Authored-By: Thomas Schwinge --- gcc/config/gcn/gcn.c | 15 +++------------ gcc/config/gcn/gcn.opt | 5 ----- 2 files changed, 3 insertions(+), 17 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c index 87af5d1..9df2827 100644 --- a/gcc/config/gcn/gcn.c +++ b/gcc/config/gcn/gcn.c @@ -3712,8 +3712,6 @@ gcn_init_builtins (void) TREE_NOTHROW (gcn_builtin_decls[i]) = 1; } -/* FIXME: remove the ifdef once OpenACC support is merged upstream. */ -#ifdef BUILT_IN_GOACC_SINGLE_START /* These builtins need to take/return an LDS pointer: override the generic versions here. */ @@ -3730,7 +3728,6 @@ gcn_init_builtins (void) set_builtin_decl (BUILT_IN_GOACC_BARRIER, gcn_builtin_decls[GCN_BUILTIN_ACC_BARRIER], false); -#endif } /* Implement TARGET_INIT_LIBFUNCS. 
*/ @@ -5019,11 +5016,7 @@ gcn_goacc_validate_dims (tree decl, int dims[], int fn_level, unsigned /*used*/) { bool changed = false; - - /* FIXME: remove -facc-experimental-workers when they're ready. */ - int max_workers = flag_worker_partitioning ? 16 : 1; - - gcc_assert (!flag_worker_partitioning); + const int max_workers = 16; /* The vector size must appear to be 64, to the user, unless this is a SEQ routine. The real, internal value is always 1, which means use @@ -5060,8 +5053,7 @@ gcn_goacc_validate_dims (tree decl, int dims[], int fn_level, { dims[GOMP_DIM_VECTOR] = GCN_DEFAULT_VECTORS; if (dims[GOMP_DIM_WORKER] < 0) - dims[GOMP_DIM_WORKER] = (flag_worker_partitioning - ? GCN_DEFAULT_WORKERS : 1); + dims[GOMP_DIM_WORKER] = GCN_DEFAULT_WORKERS; if (dims[GOMP_DIM_GANG] < 0) dims[GOMP_DIM_GANG] = GCN_DEFAULT_GANGS; changed = true; @@ -5126,8 +5118,7 @@ static bool gcn_fork_join (gcall *ARG_UNUSED (call), const int *ARG_UNUSED (dims), bool ARG_UNUSED (is_fork)) { - /* GCN does not use the fork/join concept invented for NVPTX. - Instead we use standard autovectorization. */ + /* GCN does not need to expand fork/join markers at the RTL level. */ return false; } diff --git a/gcc/config/gcn/gcn.opt b/gcc/config/gcn/gcn.opt index b2b10b0..6faacca 100644 --- a/gcc/config/gcn/gcn.opt +++ b/gcc/config/gcn/gcn.opt @@ -62,11 +62,6 @@ bool flag_bypass_init_error = false mbypass-init-error Target RejectNegative Var(flag_bypass_init_error) -bool flag_worker_partitioning = false - -macc-experimental-workers -Target Var(flag_worker_partitioning) Init(0) - int stack_size_opt = -1 mstack-size= -- cgit v1.1 From 62f01243fb27030b8d99c671f27349c2e7465edc Mon Sep 17 00:00:00 2001 From: Thomas Schwinge Date: Mon, 9 Aug 2021 12:21:43 +0200 Subject: Cross-reference parts adapted in 'gcc/omp-oacc-neuter-broadcast.cc' gcc/ * config/nvptx/nvptx.c: Cross-reference parts adapted in 'gcc/omp-oacc-neuter-broadcast.cc'. * omp-low.c: Likewise. * omp-oacc-neuter-broadcast.cc: Cross-reference parts adapted from the above files. --- gcc/config/nvptx/nvptx.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index 6642bdf..4e4909e 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -3205,6 +3205,7 @@ nvptx_mach_vector_length () /* Loop structure of the function. The entire function is described as a NULL loop. */ +/* See also 'gcc/omp-oacc-neuter-broadcast.cc:struct parallel_g'. */ struct parallel { @@ -3282,6 +3283,7 @@ typedef auto_vec insn_bb_vec_t; partitioning mode of the function as a whole. Populate MAP with head and tail blocks. We also clear the BB visited flag, which is used when finding partitions. */ +/* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_split_blocks'. */ static void nvptx_split_blocks (bb_insn_map_t *map) @@ -3383,6 +3385,7 @@ nvptx_discover_pre (basic_block block, int expected) } /* Dump this parallel and all its inner parallels. */ +/* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_dump_pars'. */ static void nvptx_dump_pars (parallel *par, unsigned depth) @@ -3408,6 +3411,7 @@ nvptx_dump_pars (parallel *par, unsigned depth) /* If BLOCK contains a fork/join marker, process it to create or terminate a loop structure. Add this block to the current loop, and then walk successor blocks. */ +/* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_find_par'. 
*/ static parallel * nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block) @@ -3488,6 +3492,7 @@ nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block) to head & tail markers, discovered when splitting blocks. This speeds up the discovery. We rely on the BB visited flag having been cleared when splitting blocks. */ +/* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_discover_pars'. */ static parallel * nvptx_discover_pars (bb_insn_map_t *map) -- cgit v1.1 From 9d2d660aab2f332b1e3f69a2fb3419cf3cc33b47 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 9 Aug 2021 16:38:54 +0200 Subject: i386: Name V2SF logic insns [PR101812] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Name V2SF logic insns, so expand_simple_binop works with V2SF modes. 2021-08-09 Uroš Bizjak gcc/ PR target/101812 * config/i386/mmx.md (v2sf3): Rename from *mmx_v2sf3 gcc/testsuite/ PR target/101812 * gcc.target/i386/pr101812.c: New test. --- gcc/config/i386/mmx.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 0984f7c..2d3b63f 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -952,7 +952,7 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "V4SF")]) -(define_insn "*mmx_v2sf3" +(define_insn "v2sf3" [(set (match_operand:V2SF 0 "register_operand" "=x,x") (any_logic:V2SF (match_operand:V2SF 1 "register_operand" "%0,x") -- cgit v1.1 From 00eab082e9f6ac2a7c4b38323829be29f092abcb Mon Sep 17 00:00:00 2001 From: Pat Haugen Date: Mon, 9 Aug 2021 10:05:49 -0500 Subject: Verify destination[source] of a load[store] instruction is a register. gcc/ChangeLog: * config/rs6000/rs6000.c (is_load_insn1): Verify destination is a register. (is_store_insn1): Verify source is a register. --- gcc/config/rs6000/rs6000.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 5b1c06b..60f406a 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -18363,7 +18363,12 @@ is_load_insn1 (rtx pat, rtx *load_mem) return false; if (GET_CODE (pat) == SET) - return find_mem_ref (SET_SRC (pat), load_mem); + { + if (REG_P (SET_DEST (pat))) + return find_mem_ref (SET_SRC (pat), load_mem); + else + return false; + } if (GET_CODE (pat) == PARALLEL) { @@ -18400,7 +18405,12 @@ is_store_insn1 (rtx pat, rtx *str_mem) return false; if (GET_CODE (pat) == SET) - return find_mem_ref (SET_DEST (pat), str_mem); + { + if (REG_P (SET_SRC (pat)) || SUBREG_P (SET_SRC (pat))) + return find_mem_ref (SET_DEST (pat), str_mem); + else + return false; + } if (GET_CODE (pat) == PARALLEL) { -- cgit v1.1 From 813ccbe9d272cd67a8f075beea280de95f807492 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Thu, 5 Aug 2021 17:51:48 +0800 Subject: Support cond_ashr/lshr/ashl for vector integer modes under AVX512. gcc/ChangeLog: * config/i386/sse.md (cond_): New expander. (VI248_AVX512VLBW): New mode iterator. * config/i386/predicates.md (nonimmediate_or_const_vec_dup_operand): New predicate. gcc/testsuite/ChangeLog: * gcc.target/i386/cond_op_shift_d-1.c: New test. * gcc.target/i386/cond_op_shift_d-2.c: New test. * gcc.target/i386/cond_op_shift_q-1.c: New test. * gcc.target/i386/cond_op_shift_q-2.c: New test. * gcc.target/i386/cond_op_shift_ud-1.c: New test. * gcc.target/i386/cond_op_shift_ud-2.c: New test. * gcc.target/i386/cond_op_shift_uq-1.c: New test. 
* gcc.target/i386/cond_op_shift_uq-2.c: New test. * gcc.target/i386/cond_op_shift_uw-1.c: New test. * gcc.target/i386/cond_op_shift_uw-2.c: New test. * gcc.target/i386/cond_op_shift_w-1.c: New test. * gcc.target/i386/cond_op_shift_w-2.c: New test. --- gcc/config/i386/predicates.md | 4 ++++ gcc/config/i386/sse.md | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index 6aa1ea3..129205a 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1161,6 +1161,10 @@ (ior (match_operand 0 "nonimmediate_operand") (match_code "const_vector"))) +(define_predicate "nonimmediate_or_const_vec_dup_operand" + (ior (match_operand 0 "nonimmediate_operand") + (match_test "const_vec_duplicate_p (op)"))) + ;; Return true when OP is either register operand, or any ;; CONST_VECTOR. (define_predicate "reg_or_const_vector_operand" diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index a46a237..45b1ec2 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -506,6 +506,13 @@ (V4DI "TARGET_AVX512VL") (V8HI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL")]) +(define_mode_iterator VI248_AVX512VLBW + [(V32HI "TARGET_AVX512BW") + (V16HI "TARGET_AVX512VL && TARGET_AVX512BW") + (V8HI "TARGET_AVX512VL && TARGET_AVX512BW") + V16SI (V8SI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL") + V8DI (V4DI "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL")]) + (define_mode_iterator VI48_AVX2 [(V8SI "TARGET_AVX2") V4SI (V4DI "TARGET_AVX2") V2DI]) @@ -22786,6 +22793,35 @@ DONE; }) +(define_expand "cond_" + [(set (match_operand:VI248_AVX512VLBW 0 "register_operand") + (vec_merge:VI248_AVX512VLBW + (any_shift:VI248_AVX512VLBW + (match_operand:VI248_AVX512VLBW 2 "register_operand") + (match_operand:VI248_AVX512VLBW 3 "nonimmediate_or_const_vec_dup_operand")) + (match_operand:VI248_AVX512VLBW 4 "nonimm_or_0_operand") + (match_operand: 1 "register_operand")))] + "TARGET_AVX512F" +{ + if (const_vec_duplicate_p (operands[3])) + { + operands[3] = unwrap_const_vec_duplicate (operands[3]); + operands[3] = lowpart_subreg (DImode, operands[3], mode); + emit_insn (gen_3_mask (operands[0], + operands[2], + operands[3], + operands[4], + operands[1])); + } + else + emit_insn (gen__v_mask (operands[0], + operands[2], + operands[3], + operands[4], + operands[1])); + DONE; +}) + (define_insn "_ashrv" [(set (match_operand:VI48_AVX512F_AVX512VL 0 "register_operand" "=v") (ashiftrt:VI48_AVX512F_AVX512VL -- cgit v1.1 From 3d7ccbc1efbd475031a9a4a6110c531f71fbf631 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 6 Aug 2021 12:32:01 -0700 Subject: x86: Optimize load of const FP all bits set vectors Check float_vector_all_ones_operand for vector floating-point modes to optimize load of const floating-point all bits set vectors. gcc/ PR target/101804 * config/i386/constraints.md (BC): Document for integer SSE constant all bits set operand. (BF): New constraint for const floating-point all bits set vectors. * config/i386/i386.c (standard_sse_constant_p): Likewise. (standard_sse_constant_opcode): Likewise. * config/i386/sse.md (sseconstm1): New mode attribute. (mov_internal): Replace BC with . gcc/testsuite/ PR target/101804 * gcc.target/i386/avx2-gather-2.c: Pass -march=skylake instead of "-mavx2 -mtune=skylake". Scan vpcmpeqd. 
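For illustration (not part of the patch), the kind of constant this change targets can be written in GNU C as below; with the new BF constraint such a vector can be materialized with a [v]pcmpeqd all-ones idiom instead of a constant-pool load, given SSE2 or later and optimization enabled:

typedef int v4si __attribute__ ((vector_size (16)));
typedef float v4sf __attribute__ ((vector_size (16)));

v4sf
all_bits_set (void)
{
  /* Every 32-bit lane is 0xffffffff, which read as float is a negative
     quiet NaN; only the bit pattern matters here.  */
  return (v4sf) (v4si) { -1, -1, -1, -1 };
}
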
--- gcc/config/i386/constraints.md | 10 ++++++++-- gcc/config/i386/i386.c | 11 +++++++++-- gcc/config/i386/sse.md | 11 ++++++++++- 3 files changed, 27 insertions(+), 5 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/constraints.md b/gcc/config/i386/constraints.md index 4aa28a5..87cceac 100644 --- a/gcc/config/i386/constraints.md +++ b/gcc/config/i386/constraints.md @@ -166,7 +166,8 @@ ;; s Sibcall memory operand, not valid for TARGET_X32 ;; w Call memory operand, not valid for TARGET_X32 ;; z Constant call address operand. -;; C SSE constant operand. +;; C Integer SSE constant with all bits set operand. +;; F Floating-point SSE constant with all bits set operand. (define_constraint "Bf" "@internal Flags register operand." @@ -216,11 +217,16 @@ (match_operand 0 "constant_call_address_operand")) (define_constraint "BC" - "@internal SSE constant -1 operand." + "@internal integer SSE constant with all bits set operand." (and (match_test "TARGET_SSE") (ior (match_test "op == constm1_rtx") (match_operand 0 "vector_all_ones_operand")))) +(define_constraint "BF" + "@internal floating-point SSE constant with all bits set operand." + (and (match_test "TARGET_SSE") + (match_operand 0 "float_vector_all_ones_operand"))) + ;; Integer constant constraints. (define_constraint "Wb" "Integer constant in the range 0 @dots{} 7, for 8-bit shifts." diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index aea224a..4d4ab6a 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -5073,7 +5073,11 @@ standard_sse_constant_p (rtx x, machine_mode pred_mode) if (x == const0_rtx || const0_operand (x, mode)) return 1; - if (x == constm1_rtx || vector_all_ones_operand (x, mode)) + if (x == constm1_rtx + || vector_all_ones_operand (x, mode) + || ((GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT + || GET_MODE_CLASS (pred_mode) == MODE_VECTOR_FLOAT) + && float_vector_all_ones_operand (x, mode))) { /* VOIDmode integer constant, get mode from the predicate. 
*/ if (mode == VOIDmode) @@ -5171,7 +5175,10 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands) gcc_unreachable (); } } - else if (x == constm1_rtx || vector_all_ones_operand (x, mode)) + else if (x == constm1_rtx + || vector_all_ones_operand (x, mode) + || (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT + && float_vector_all_ones_operand (x, mode))) { enum attr_mode insn_mode = get_attr_mode (insn); diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 45b1ec2..2b0d10e 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -784,6 +784,15 @@ (V4SF "V4SF") (V2DF "V2DF") (TI "TI")]) +;; SSE constant -1 constraint +(define_mode_attr sseconstm1 + [(V64QI "BC") (V32HI "BC") (V16SI "BC") (V8DI "BC") (V4TI "BC") + (V32QI "BC") (V16HI "BC") (V8SI "BC") (V4DI "BC") (V2TI "BC") + (V16QI "BC") (V8HI "BC") (V4SI "BC") (V2DI "BC") (V1TI "BC") + (V16SF "BF") (V8DF "BF") + (V8SF "BF") (V4DF "BF") + (V4SF "BF") (V2DF "BF")]) + ;; Mapping of vector modes to corresponding mask size (define_mode_attr avx512fmaskmode [(V64QI "DI") (V32QI "SI") (V16QI "HI") @@ -1063,7 +1072,7 @@ [(set (match_operand:VMOVE 0 "nonimmediate_operand" "=v,v ,v ,m") (match_operand:VMOVE 1 "nonimmediate_or_sse_const_operand" - " C,BC,vm,v"))] + " C,,vm,v"))] "TARGET_SSE && (register_operand (operands[0], mode) || register_operand (operands[1], mode))" -- cgit v1.1 From 7665af0b1a964b1baae3a59b22fcc420369c63cf Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Tue, 10 Aug 2021 11:34:53 +0200 Subject: i386: Improve single operand AVX512F permutations [PR80355] On the following testcase we emit vmovdqa32 .LC0(%rip), %zmm1 vpermd %zmm0, %zmm1, %zmm0 and vmovdqa64 .LC1(%rip), %zmm1 vpermq %zmm0, %zmm1, %zmm0 instead of vshufi32x4 $78, %zmm0, %zmm0, %zmm0 and vshufi64x2 $78, %zmm0, %zmm0, %zmm0 we can emit with the patch. We have patterns that match two argument permutations for vshuf[if]*, but for one argument it doesn't trigger. Either we can add two patterns for that, or we would need to add another routine to i386-expand.c that would transform under certain condition these cases to the two argument vshuf*, doing it in sse.md looked simpler. We don't need this for 32-byte vectors, we already emit single insn permutation that doesn't need memory op there. 2021-08-10 Jakub Jelinek PR target/80355 * config/i386/sse.md (*avx512f_shuf_64x2_1_1, *avx512f_shuf_32x4_1_1): New define_insn patterns. * gcc.target/i386/avx512f-pr80355-1.c: New test. 
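For reference, this kind of single-operand permutation can be written with __builtin_shuffle (illustrative example, not the committed testcase); with -O2 -mavx512f the intent is that it now assembles to the single vshufi32x4 $78 (0x4e) shown above:

typedef int v16si __attribute__ ((vector_size (64)));

v16si
swap_256bit_halves (v16si x)
{
  /* Element order 8..15,0..7 swaps the two 256-bit halves, i.e. the
     128-bit lane order 2,3,0,1 encoded by the immediate 0x4e.  */
  v16si mask = { 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 };
  return __builtin_shuffle (x, mask);
}
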
--- gcc/config/i386/sse.md | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 2b0d10e..3957c86 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -15336,6 +15336,42 @@ (set_attr "prefix" "evex") (set_attr "mode" "")]) +(define_insn "*avx512f_shuf_64x2_1_1" + [(set (match_operand:V8FI 0 "register_operand" "=v") + (vec_select:V8FI + (match_operand:V8FI 1 "register_operand" "v") + (parallel [(match_operand 2 "const_0_to_7_operand") + (match_operand 3 "const_0_to_7_operand") + (match_operand 4 "const_0_to_7_operand") + (match_operand 5 "const_0_to_7_operand") + (match_operand 6 "const_0_to_7_operand") + (match_operand 7 "const_0_to_7_operand") + (match_operand 8 "const_0_to_7_operand") + (match_operand 9 "const_0_to_7_operand")])))] + "TARGET_AVX512F + && (INTVAL (operands[2]) & 1) == 0 + && INTVAL (operands[2]) == INTVAL (operands[3]) - 1 + && (INTVAL (operands[4]) & 1) == 0 + && INTVAL (operands[4]) == INTVAL (operands[5]) - 1 + && (INTVAL (operands[6]) & 1) == 0 + && INTVAL (operands[6]) == INTVAL (operands[7]) - 1 + && (INTVAL (operands[8]) & 1) == 0 + && INTVAL (operands[8]) == INTVAL (operands[9]) - 1" +{ + int mask; + mask = INTVAL (operands[2]) / 2; + mask |= INTVAL (operands[4]) / 2 << 2; + mask |= INTVAL (operands[6]) / 2 << 4; + mask |= INTVAL (operands[8]) / 2 << 6; + operands[2] = GEN_INT (mask); + + return "vshuf64x2\t{%2, %1, %1, %0|%0, %1, %1, %2}"; +} + [(set_attr "type" "sselog") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "")]) + (define_expand "avx512vl_shuf_32x4_mask" [(match_operand:VI4F_256 0 "register_operand") (match_operand:VI4F_256 1 "register_operand") @@ -15482,6 +15518,58 @@ (set_attr "prefix" "evex") (set_attr "mode" "")]) +(define_insn "*avx512f_shuf_32x4_1_1" + [(set (match_operand:V16FI 0 "register_operand" "=v") + (vec_select:V16FI + (match_operand:V16FI 1 "register_operand" "v") + (parallel [(match_operand 2 "const_0_to_15_operand") + (match_operand 3 "const_0_to_15_operand") + (match_operand 4 "const_0_to_15_operand") + (match_operand 5 "const_0_to_15_operand") + (match_operand 6 "const_0_to_15_operand") + (match_operand 7 "const_0_to_15_operand") + (match_operand 8 "const_0_to_15_operand") + (match_operand 9 "const_0_to_15_operand") + (match_operand 10 "const_0_to_15_operand") + (match_operand 11 "const_0_to_15_operand") + (match_operand 12 "const_0_to_15_operand") + (match_operand 13 "const_0_to_15_operand") + (match_operand 14 "const_0_to_15_operand") + (match_operand 15 "const_0_to_15_operand") + (match_operand 16 "const_0_to_15_operand") + (match_operand 17 "const_0_to_15_operand")])))] + "TARGET_AVX512F + && (INTVAL (operands[2]) & 3) == 0 + && INTVAL (operands[2]) == INTVAL (operands[3]) - 1 + && INTVAL (operands[2]) == INTVAL (operands[4]) - 2 + && INTVAL (operands[2]) == INTVAL (operands[5]) - 3 + && (INTVAL (operands[6]) & 3) == 0 + && INTVAL (operands[6]) == INTVAL (operands[7]) - 1 + && INTVAL (operands[6]) == INTVAL (operands[8]) - 2 + && INTVAL (operands[6]) == INTVAL (operands[9]) - 3 + && (INTVAL (operands[10]) & 3) == 0 + && INTVAL (operands[10]) == INTVAL (operands[11]) - 1 + && INTVAL (operands[10]) == INTVAL (operands[12]) - 2 + && INTVAL (operands[10]) == INTVAL (operands[13]) - 3 + && (INTVAL (operands[14]) & 3) == 0 + && INTVAL (operands[14]) == INTVAL (operands[15]) - 1 + && INTVAL (operands[14]) == INTVAL (operands[16]) - 2 + && INTVAL 
(operands[14]) == INTVAL (operands[17]) - 3" +{ + int mask; + mask = INTVAL (operands[2]) / 4; + mask |= INTVAL (operands[6]) / 4 << 2; + mask |= INTVAL (operands[10]) / 4 << 4; + mask |= INTVAL (operands[14]) / 4 << 6; + operands[2] = GEN_INT (mask); + + return "vshuf32x4\t{%2, %1, %1, %0|%0, %1, %1, %2}"; +} + [(set_attr "type" "sselog") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "")]) + (define_expand "avx512f_pshufdv3_mask" [(match_operand:V16SI 0 "register_operand") (match_operand:V16SI 1 "nonimmediate_operand") -- cgit v1.1 From 50b5877925ef5ae8e9f913d6d2b5ce0204ebc588 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Tue, 10 Aug 2021 12:38:00 +0200 Subject: i386: Allow some V32HImode and V64QImode permutations even without AVX512BW [PR80355] When working on the PR, I've noticed we generate terrible code for V32HImode or V64QImode permutations for -mavx512f -mno-avx512bw. Generally we can't do much with such permutations, but since PR68655 we can handle at least some, those expressible using V16SImode or V8DImode permutations, but that wasn't reachable, because ix86_vectorize_vec_perm_const didn't even try, it said without TARGET_AVX512BW it can't do anything, and with it can do everything, no d.testing_p attempts. This patch makes it try it for TARGET_AVX512F && !TARGET_AVX512BW. The first hunk is to avoid ICE, expand_vec_perm_even_odd_1 asserts d->vmode isn't V32HImode because expand_vec_perm_1 for AVX512BW handles already all permutations, but when we let it through without !TARGET_AVX512BW, expand_vec_perm_1 doesn't handle it. If we want, that hunk can be dropped if we implement in expand_vec_perm_even_odd_1 and its helper the even permutation as vpmovdw + vpmovdw + vinserti64x4 and odd permutation as vpsrld $16 + vpsrld $16 + vpmovdw + vpmovdw + vinserti64x4. 2021-08-10 Jakub Jelinek PR target/80355 * config/i386/i386-expand.c (expand_vec_perm_even_odd): Return false for V32HImode if !TARGET_AVX512BW. (ix86_vectorize_vec_perm_const) : If !TARGET_AVX512BW and TARGET_AVX512F and d.testing_p, don't fail early, but actually check the permutation. * gcc.target/i386/avx512f-pr80355-2.c: New test. --- gcc/config/i386/i386-expand.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index bd21efa..c708b33 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -20337,6 +20337,11 @@ expand_vec_perm_even_odd (struct expand_vec_perm_d *d) if (d->perm[i] != 2 * i + odd) return false; + if (d->vmode == E_V32HImode + && d->testing_p + && !TARGET_AVX512BW) + return false; + return expand_vec_perm_even_odd_1 (d, odd); } @@ -20877,16 +20882,16 @@ ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, return true; break; case E_V32HImode: - if (!TARGET_AVX512BW) + if (!TARGET_AVX512F) return false; - if (d.testing_p) + if (d.testing_p && TARGET_AVX512BW) /* All implementable with a single vperm[it]2 insn. */ return true; break; case E_V64QImode: - if (!TARGET_AVX512BW) + if (!TARGET_AVX512F) return false; - if (d.testing_p) + if (d.testing_p && TARGET_AVX512BW) /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */ return true; break; -- cgit v1.1 From 05a03f3986db25cb5076b409f4048e9dbb5dbfdf Mon Sep 17 00:00:00 2001 From: liuhongt Date: Tue, 10 Aug 2021 19:00:18 +0800 Subject: Extend ldexp{s,d}f3 to vscalefs{s,d} when TARGET_AVX512F and TARGET_SSE_MATH. 
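For example (illustrative, not the committed testcase), with -O2 -mavx512f -ffast-math the ldexp call below can now expand through the AVX-512F path instead of the x87 ldexpxf3 sequence:

double
scale (double x, int e)
{
  /* The SImode exponent is converted to double and the scaling can then
     be done with vscalefsd (avx512f_vmscalefdf).  */
  return __builtin_ldexp (x, e);
}
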
gcc/ChangeLog: PR target/98309 * config/i386/i386.md (ldexp3): Extend to vscalefs[sd] when TARGET_AVX512F and TARGET_SSE_MATH. gcc/testsuite/ChangeLog: PR target/98309 * gcc.target/i386/pr98309-1.c: New test. * gcc.target/i386/pr98309-2.c: New test. --- gcc/config/i386/i386.md | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index bc1c30b..56b09c5 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -17914,17 +17914,35 @@ [(use (match_operand:MODEF 0 "register_operand")) (use (match_operand:MODEF 1 "general_operand")) (use (match_operand:SI 2 "register_operand"))] - "TARGET_USE_FANCY_MATH_387 - && (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) - || TARGET_MIX_SSE_I387) + "((TARGET_USE_FANCY_MATH_387 + && (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || TARGET_MIX_SSE_I387)) + || (TARGET_AVX512F && TARGET_SSE_MATH)) && flag_unsafe_math_optimizations" { - rtx op0 = gen_reg_rtx (XFmode); - rtx op1 = gen_reg_rtx (XFmode); + /* Prefer avx512f version. */ + if (TARGET_AVX512F && TARGET_SSE_MATH) + { + rtx op2 = gen_reg_rtx (mode); + emit_insn (gen_floatsi2 (op2, operands[2])); + operands[0] = lowpart_subreg (mode, operands[0], mode); + if (MEM_P (operands[1])) + operands[1] = force_reg (mode, operands[1]); + operands[1] = lowpart_subreg (mode, operands[1], mode); + op2 = lowpart_subreg (mode, op2, mode); + emit_insn (gen_avx512f_vmscalef (operands[0], + operands[1], + op2)); + } + else + { + rtx op0 = gen_reg_rtx (XFmode); + rtx op1 = gen_reg_rtx (XFmode); - emit_insn (gen_extendxf2 (op1, operands[1])); - emit_insn (gen_ldexpxf3 (op0, op1, operands[2])); - emit_insn (gen_truncxf2 (operands[0], op0)); + emit_insn (gen_extendxf2 (op1, operands[1])); + emit_insn (gen_ldexpxf3 (op0, op1, operands[2])); + emit_insn (gen_truncxf2 (operands[0], op0)); + } DONE; }) -- cgit v1.1 From b8f604da25bfe0fd4dadbc338293885819fe8018 Mon Sep 17 00:00:00 2001 From: "prathamesh.kulkarni" Date: Wed, 11 Aug 2021 15:30:14 +0530 Subject: arm/66791: Replace builtins for vdup_n and vmov_n intrinsics. gcc/ChangeLog: PR target/66791 * config/arm/arm_neon.h (vdup_n_s8): Replace call to builtin with constructor. (vdup_n_s16): Likewise. (vdup_n_s32): Likewise. (vdup_n_s64): Likewise. (vdup_n_u8): Likewise. (vdup_n_u16): Likewise. (vdup_n_u32): Likewise. (vdup_n_u64): Likewise. (vdup_n_p8): Likewise. (vdup_n_p16): Likewise. (vdup_n_p64): Likewise. (vdup_n_f16): Likewise. (vdup_n_f32): Likewise. (vdupq_n_s8): Likewise. (vdupq_n_s16): Likewise. (vdupq_n_s32): Likewise. (vdupq_n_s64): Likewise. (vdupq_n_u8): Likewise. (vdupq_n_u16): Likewise. (vdupq_n_u32): Likewise. (vdupq_n_u64): Likewise. (vdupq_n_p8): Likewise. (vdupq_n_p16): Likewise. (vdupq_n_p64): Likewise. (vdupq_n_f16): Likewise. (vdupq_n_f32): Likewise. (vmov_n_s8): Replace call to builtin with call to corresponding vdup_n intrinsic. (vmov_n_s16): Likewise. (vmov_n_s32): Likewise. (vmov_n_s64): Likewise. (vmov_n_u8): Likewise. (vmov_n_u16): Likewise. (vmov_n_u32): Likewise. (vmov_n_u64): Likewise. (vmov_n_p8): Likewise. (vmov_n_p16): Likewise. (vmov_n_f16): Likewise. (vmov_n_f32): Likewise. (vmovq_n_s8): Likewise. (vmovq_n_s16): Likewise. (vmovq_n_s32): Likewise. (vmovq_n_s64): Likewise. (vmovq_n_u8): Likewise. (vmovq_n_u16): Likewise. (vmovq_n_u32): Likewise. (vmovq_n_u64): Likewise. (vmovq_n_p8): Likewise. (vmovq_n_p16): Likewise. (vmovq_n_f16): Likewise. (vmovq_n_f32): Likewise. 
* config/arm/arm_neon_builtins.def: Remove entries for vdup_n. gcc/testsuite/ChangeLog: PR target/66791 * gcc.target/arm/pr51534.c: Adjust test. --- gcc/config/arm/arm_neon.h | 107 ++++++++++++++++++----------------- gcc/config/arm/arm_neon_builtins.def | 3 - 2 files changed, 55 insertions(+), 55 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index 5a91d15..3364b37 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -6664,63 +6664,63 @@ __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdup_n_s8 (int8_t __a) { - return (int8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a); + return (int8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdup_n_s16 (int16_t __a) { - return (int16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a); + return (int16x4_t) {__a, __a, __a, __a}; } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdup_n_s32 (int32_t __a) { - return (int32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a); + return (int32x2_t) {__a, __a}; } __extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdup_n_f32 (float32_t __a) { - return (float32x2_t)__builtin_neon_vdup_nv2sf ((__builtin_neon_sf) __a); + return (float32x2_t) {__a, __a}; } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdup_n_u8 (uint8_t __a) { - return (uint8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a); + return (uint8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdup_n_u16 (uint16_t __a) { - return (uint16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a); + return (uint16x4_t) {__a, __a, __a, __a}; } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdup_n_u32 (uint32_t __a) { - return (uint32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a); + return (uint32x2_t) {__a, __a}; } __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdup_n_p8 (poly8_t __a) { - return (poly8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a); + return (poly8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } __extension__ extern __inline poly16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdup_n_p16 (poly16_t __a) { - return (poly16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a); + return (poly16x4_t) {__a, __a, __a, __a}; } #pragma GCC push_options @@ -6729,7 +6729,7 @@ __extension__ extern __inline poly64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdup_n_p64 (poly64_t __a) { - return (poly64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a); + return (poly64x1_t) {__a}; } #pragma GCC pop_options @@ -6737,14 +6737,14 @@ __extension__ extern __inline int64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdup_n_s64 (int64_t __a) { - return (int64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a); + return (int64x1_t) {__a}; } __extension__ extern __inline uint64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdup_n_u64 (uint64_t __a) { - 
return (uint64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a); + return (uint64x1_t) {__a}; } #pragma GCC push_options @@ -6753,7 +6753,7 @@ __extension__ extern __inline poly64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdupq_n_p64 (poly64_t __a) { - return (poly64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a); + return (poly64x2_t) {__a, __a}; } #pragma GCC pop_options @@ -6761,231 +6761,234 @@ __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdupq_n_s8 (int8_t __a) { - return (int8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a); + return (int8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a, + __a, __a, __a, __a, __a, __a, __a, __a}; } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdupq_n_s16 (int16_t __a) { - return (int16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a); + return (int16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdupq_n_s32 (int32_t __a) { - return (int32x4_t)__builtin_neon_vdup_nv4si ((__builtin_neon_si) __a); + return (int32x4_t) {__a, __a, __a, __a}; } __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdupq_n_f32 (float32_t __a) { - return (float32x4_t)__builtin_neon_vdup_nv4sf ((__builtin_neon_sf) __a); + return (float32x4_t) {__a, __a, __a, __a}; } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdupq_n_u8 (uint8_t __a) { - return (uint8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a); + return (uint8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a, + __a, __a, __a, __a, __a, __a, __a, __a}; } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdupq_n_u16 (uint16_t __a) { - return (uint16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a); + return (uint16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdupq_n_u32 (uint32_t __a) { - return (uint32x4_t)__builtin_neon_vdup_nv4si ((__builtin_neon_si) __a); + return (uint32x4_t) {__a, __a, __a, __a}; } __extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdupq_n_p8 (poly8_t __a) { - return (poly8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a); + return (poly8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a, + __a, __a, __a, __a, __a, __a, __a, __a}; } __extension__ extern __inline poly16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdupq_n_p16 (poly16_t __a) { - return (poly16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a); + return (poly16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdupq_n_s64 (int64_t __a) { - return (int64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a); + return (int64x2_t) {__a, __a}; } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdupq_n_u64 (uint64_t __a) { - return (uint64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a); + return (uint64x2_t) {__a, __a}; } __extension__ extern __inline int8x8_t __attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) vmov_n_s8 (int8_t __a) { - return (int8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a); + return vdup_n_s8 (__a); } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmov_n_s16 (int16_t __a) { - return (int16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a); + return vdup_n_s16 (__a); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmov_n_s32 (int32_t __a) { - return (int32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a); + return vdup_n_s32 (__a); } __extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmov_n_f32 (float32_t __a) { - return (float32x2_t)__builtin_neon_vdup_nv2sf ((__builtin_neon_sf) __a); + return vdup_n_f32 (__a); } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmov_n_u8 (uint8_t __a) { - return (uint8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a); + return vdup_n_u8 (__a); } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmov_n_u16 (uint16_t __a) { - return (uint16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a); + return vdup_n_u16 (__a); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmov_n_u32 (uint32_t __a) { - return (uint32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a); + return vdup_n_u32 (__a); } __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmov_n_p8 (poly8_t __a) { - return (poly8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a); + return vdup_n_p8 (__a); } __extension__ extern __inline poly16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmov_n_p16 (poly16_t __a) { - return (poly16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a); + return vdup_n_p16 (__a); } __extension__ extern __inline int64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmov_n_s64 (int64_t __a) { - return (int64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a); + return vdup_n_s64 (__a); } __extension__ extern __inline uint64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmov_n_u64 (uint64_t __a) { - return (uint64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a); + return vdup_n_u64 (__a); } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovq_n_s8 (int8_t __a) { - return (int8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a); + return vdupq_n_s8 (__a); } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovq_n_s16 (int16_t __a) { - return (int16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a); + return vdupq_n_s16 (__a); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovq_n_s32 (int32_t __a) { - return (int32x4_t)__builtin_neon_vdup_nv4si ((__builtin_neon_si) __a); + return vdupq_n_s32 (__a); } __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovq_n_f32 (float32_t __a) { - return (float32x4_t)__builtin_neon_vdup_nv4sf ((__builtin_neon_sf) __a); + return vdupq_n_f32 (__a); } __extension__ extern __inline 
uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovq_n_u8 (uint8_t __a) { - return (uint8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a); + return vdupq_n_u8 (__a); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovq_n_u16 (uint16_t __a) { - return (uint16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a); + return vdupq_n_u16 (__a); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovq_n_u32 (uint32_t __a) { - return (uint32x4_t)__builtin_neon_vdup_nv4si ((__builtin_neon_si) __a); + return vdupq_n_u32 (__a); } __extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovq_n_p8 (poly8_t __a) { - return (poly8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a); + return vdupq_n_p8 (__a); } __extension__ extern __inline poly16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovq_n_p16 (poly16_t __a) { - return (poly16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a); + return vdupq_n_p16 (__a); } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovq_n_s64 (int64_t __a) { - return (int64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a); + return vdupq_n_s64 (__a); } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovq_n_u64 (uint64_t __a) { - return (uint64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a); + return vdupq_n_u64 (__a); } __extension__ extern __inline int8x8_t @@ -18005,14 +18008,14 @@ __extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdup_n_f16 (float16_t __a) { - return __builtin_neon_vdup_nv4hf (__a); + return (float16x4_t) {__a, __a, __a, __a}; } __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdupq_n_f16 (float16_t __a) { - return __builtin_neon_vdup_nv8hf (__a); + return (float16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } __extension__ extern __inline float16x4_t @@ -18047,14 +18050,14 @@ __extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmov_n_f16 (float16_t __a) { - return __builtin_neon_vdup_nv4hf (__a); + return vdup_n_f16 (__a); } __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovq_n_f16 (float16_t __a) { - return __builtin_neon_vdup_nv8hf (__a); + return vdupq_n_f16 (__a); } __extension__ extern __inline float16x4_t @@ -18978,14 +18981,14 @@ __extension__ extern __inline bfloat16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdup_n_bf16 (bfloat16_t __a) { - return __builtin_neon_vdup_nv4bf (__a); + return (bfloat16x4_t) {__a, __a, __a, __a}; } __extension__ extern __inline bfloat16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdupq_n_bf16 (bfloat16_t __a) { - return __builtin_neon_vdup_nv8bf (__a); + return (bfloat16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } __extension__ extern __inline bfloat16x4_t diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def index fb6d66e..fafb5c6 100644 --- a/gcc/config/arm/arm_neon_builtins.def +++ b/gcc/config/arm/arm_neon_builtins.def @@ -211,9 +211,6 @@ VAR10 (GETLANE, vget_lane, VAR6 (GETLANE, 
vget_laneu, v8qi, v4hi, v2si, v16qi, v8hi, v4si) VAR10 (SETLANE, vset_lane, v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di) -VAR10 (UNOP, vdup_n, - v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di) -VAR4 (UNOP, vdup_n, v8hf, v4hf, v8bf, v4bf) VAR10 (GETLANE, vdup_lane, v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di) VAR4 (GETLANE, vdup_lane, v8hf, v4hf, v8bf, v4bf) -- cgit v1.1 From 6cc92e946edab03b26f8aaca23064adf664433f9 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Thu, 10 Jun 2021 11:14:51 -0500 Subject: rs6000: Add the rest of the [altivec] stanza to the builtins file 2021-06-10 Bill Schmidt gcc/ * config/rs6000/rs6000-builtin-new.def: Finish altivec stanza. * config/rs6000/rs6000-call.c (rs6000_init_builtins): Move initialization of pcvoid_type_node here... (altivec_init_builtins): ...from here. * config/rs6000/rs6000.h (rs6000_builtin_type_index): Add RS6000_BTI_const_ptr_void. (pcvoid_type_node): New macro. --- gcc/config/rs6000/rs6000-builtin-new.def | 831 +++++++++++++++++++++++++++++++ gcc/config/rs6000/rs6000-call.c | 7 +- gcc/config/rs6000/rs6000.h | 2 + 3 files changed, 836 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-builtin-new.def b/gcc/config/rs6000/rs6000-builtin-new.def index a84a3de..f1aa552 100644 --- a/gcc/config/rs6000/rs6000-builtin-new.def +++ b/gcc/config/rs6000/rs6000-builtin-new.def @@ -197,3 +197,834 @@ const vss __builtin_altivec_abs_v8hi (vss); ABS_V8HI absv8hi2 {} + + const vsc __builtin_altivec_abss_v16qi (vsc); + ABSS_V16QI altivec_abss_v16qi {} + + const vsi __builtin_altivec_abss_v4si (vsi); + ABSS_V4SI altivec_abss_v4si {} + + const vss __builtin_altivec_abss_v8hi (vss); + ABSS_V8HI altivec_abss_v8hi {} + + const vf __builtin_altivec_copysignfp (vf, vf); + COPYSIGN_V4SF vector_copysignv4sf3 {} + + void __builtin_altivec_dss (const int<2>); + DSS altivec_dss {} + + void __builtin_altivec_dssall (); + DSSALL altivec_dssall {} + + void __builtin_altivec_dst (void *, const int, const int<2>); + DST altivec_dst {} + + void __builtin_altivec_dstst (void *, const int, const int<2>); + DSTST altivec_dstst {} + + void __builtin_altivec_dststt (void *, const int, const int<2>); + DSTSTT altivec_dststt {} + + void __builtin_altivec_dstt (void *, const int, const int<2>); + DSTT altivec_dstt {} + + fpmath vsi __builtin_altivec_fix_sfsi (vf); + FIX_V4SF_V4SI fix_truncv4sfv4si2 {} + + fpmath vui __builtin_altivec_fixuns_sfsi (vf); + FIXUNS_V4SF_V4SI fixuns_truncv4sfv4si2 {} + + fpmath vf __builtin_altivec_float_sisf (vsi); + FLOAT_V4SI_V4SF floatv4siv4sf2 {} + + pure vsc __builtin_altivec_lvebx (signed long, const void *); + LVEBX altivec_lvebx {ldvec} + + pure vss __builtin_altivec_lvehx (signed long, const void *); + LVEHX altivec_lvehx {ldvec} + + pure vsi __builtin_altivec_lvewx (signed long, const void *); + LVEWX altivec_lvewx {ldvec} + + pure vuc __builtin_altivec_lvsl (signed long, const void *); + LVSL altivec_lvsl {ldvec} + + pure vuc __builtin_altivec_lvsr (signed long, const void *); + LVSR altivec_lvsr {ldvec} + + pure vsi __builtin_altivec_lvx (signed long, const void *); + LVX altivec_lvx_v4si {ldvec} + + pure vsq __builtin_altivec_lvx_v1ti (signed long, const void *); + LVX_V1TI altivec_lvx_v1ti {ldvec} + + pure vsc __builtin_altivec_lvx_v16qi (signed long, const void *); + LVX_V16QI altivec_lvx_v16qi {ldvec} + + pure vf __builtin_altivec_lvx_v4sf (signed long, const void *); + LVX_V4SF altivec_lvx_v4sf {ldvec} + + pure vsi __builtin_altivec_lvx_v4si (signed long, 
const void *); + LVX_V4SI altivec_lvx_v4si {ldvec} + + pure vss __builtin_altivec_lvx_v8hi (signed long, const void *); + LVX_V8HI altivec_lvx_v8hi {ldvec} + + pure vsi __builtin_altivec_lvxl (signed long, const void *); + LVXL altivec_lvxl_v4si {ldvec} + + pure vsc __builtin_altivec_lvxl_v16qi (signed long, const void *); + LVXL_V16QI altivec_lvxl_v16qi {ldvec} + + pure vf __builtin_altivec_lvxl_v4sf (signed long, const void *); + LVXL_V4SF altivec_lvxl_v4sf {ldvec} + + pure vsi __builtin_altivec_lvxl_v4si (signed long, const void *); + LVXL_V4SI altivec_lvxl_v4si {ldvec} + + pure vss __builtin_altivec_lvxl_v8hi (signed long, const void *); + LVXL_V8HI altivec_lvxl_v8hi {ldvec} + + const vsc __builtin_altivec_mask_for_load (const void *); + MASK_FOR_LOAD altivec_lvsr_direct {ldstmask} + + vss __builtin_altivec_mfvscr (); + MFVSCR altivec_mfvscr {} + + void __builtin_altivec_mtvscr (vsi); + MTVSCR altivec_mtvscr {} + + const vsll __builtin_altivec_vmulesw (vsi, vsi); + VMULESW vec_widen_smult_even_v4si {} + + const vull __builtin_altivec_vmuleuw (vui, vui); + VMULEUW vec_widen_umult_even_v4si {} + + const vsll __builtin_altivec_vmulosw (vsi, vsi); + VMULOSW vec_widen_smult_odd_v4si {} + + const vull __builtin_altivec_vmulouw (vui, vui); + VMULOUW vec_widen_umult_odd_v4si {} + + const vsc __builtin_altivec_nabs_v16qi (vsc); + NABS_V16QI nabsv16qi2 {} + + const vf __builtin_altivec_nabs_v4sf (vf); + NABS_V4SF vsx_nabsv4sf2 {} + + const vsi __builtin_altivec_nabs_v4si (vsi); + NABS_V4SI nabsv4si2 {} + + const vss __builtin_altivec_nabs_v8hi (vss); + NABS_V8HI nabsv8hi2 {} + + void __builtin_altivec_stvebx (vsc, signed long, void *); + STVEBX altivec_stvebx {stvec} + + void __builtin_altivec_stvehx (vss, signed long, void *); + STVEHX altivec_stvehx {stvec} + + void __builtin_altivec_stvewx (vsi, signed long, void *); + STVEWX altivec_stvewx {stvec} + + void __builtin_altivec_stvx (vsi, signed long, void *); + STVX altivec_stvx_v4si {stvec} + + void __builtin_altivec_stvx_v16qi (vsc, signed long, void *); + STVX_V16QI altivec_stvx_v16qi {stvec} + + void __builtin_altivec_stvx_v4sf (vf, signed long, void *); + STVX_V4SF altivec_stvx_v4sf {stvec} + + void __builtin_altivec_stvx_v4si (vsi, signed long, void *); + STVX_V4SI altivec_stvx_v4si {stvec} + + void __builtin_altivec_stvx_v8hi (vss, signed long, void *); + STVX_V8HI altivec_stvx_v8hi {stvec} + + void __builtin_altivec_stvxl (vsi, signed long, void *); + STVXL altivec_stvxl_v4si {stvec} + + void __builtin_altivec_stvxl_v16qi (vsc, signed long, void *); + STVXL_V16QI altivec_stvxl_v16qi {stvec} + + void __builtin_altivec_stvxl_v4sf (vf, signed long, void *); + STVXL_V4SF altivec_stvxl_v4sf {stvec} + + void __builtin_altivec_stvxl_v4si (vsi, signed long, void *); + STVXL_V4SI altivec_stvxl_v4si {stvec} + + void __builtin_altivec_stvxl_v8hi (vss, signed long, void *); + STVXL_V8HI altivec_stvxl_v8hi {stvec} + + fpmath vf __builtin_altivec_uns_float_sisf (vui); + UNSFLOAT_V4SI_V4SF floatunsv4siv4sf2 {} + + const vui __builtin_altivec_vaddcuw (vui, vui); + VADDCUW altivec_vaddcuw {} + + const vf __builtin_altivec_vaddfp (vf, vf); + VADDFP addv4sf3 {} + + const vsc __builtin_altivec_vaddsbs (vsc, vsc); + VADDSBS altivec_vaddsbs {} + + const vss __builtin_altivec_vaddshs (vss, vss); + VADDSHS altivec_vaddshs {} + + const vsi __builtin_altivec_vaddsws (vsi, vsi); + VADDSWS altivec_vaddsws {} + + const vuc __builtin_altivec_vaddubm (vuc, vuc); + VADDUBM addv16qi3 {} + + const vuc __builtin_altivec_vaddubs (vuc, vuc); + VADDUBS altivec_vaddubs {} + 
+ const vus __builtin_altivec_vadduhm (vus, vus); + VADDUHM addv8hi3 {} + + const vus __builtin_altivec_vadduhs (vus, vus); + VADDUHS altivec_vadduhs {} + + const vsi __builtin_altivec_vadduwm (vsi, vsi); + VADDUWM addv4si3 {} + + const vui __builtin_altivec_vadduws (vui, vui); + VADDUWS altivec_vadduws {} + + const vsc __builtin_altivec_vand_v16qi (vsc, vsc); + VAND_V16QI andv16qi3 {} + + const vuc __builtin_altivec_vand_v16qi_uns (vuc, vuc); + VAND_V16QI_UNS andv16qi3 {} + + const vf __builtin_altivec_vand_v4sf (vf, vf); + VAND_V4SF andv4sf3 {} + + const vsi __builtin_altivec_vand_v4si (vsi, vsi); + VAND_V4SI andv4si3 {} + + const vui __builtin_altivec_vand_v4si_uns (vui, vui); + VAND_V4SI_UNS andv4si3 {} + + const vss __builtin_altivec_vand_v8hi (vss, vss); + VAND_V8HI andv8hi3 {} + + const vus __builtin_altivec_vand_v8hi_uns (vus, vus); + VAND_V8HI_UNS andv8hi3 {} + + const vsc __builtin_altivec_vandc_v16qi (vsc, vsc); + VANDC_V16QI andcv16qi3 {} + + const vuc __builtin_altivec_vandc_v16qi_uns (vuc, vuc); + VANDC_V16QI_UNS andcv16qi3 {} + + const vf __builtin_altivec_vandc_v4sf (vf, vf); + VANDC_V4SF andcv4sf3 {} + + const vsi __builtin_altivec_vandc_v4si (vsi, vsi); + VANDC_V4SI andcv4si3 {} + + const vui __builtin_altivec_vandc_v4si_uns (vui, vui); + VANDC_V4SI_UNS andcv4si3 {} + + const vss __builtin_altivec_vandc_v8hi (vss, vss); + VANDC_V8HI andcv8hi3 {} + + const vus __builtin_altivec_vandc_v8hi_uns (vus, vus); + VANDC_V8HI_UNS andcv8hi3 {} + + const vsc __builtin_altivec_vavgsb (vsc, vsc); + VAVGSB avgv16qi3_ceil {} + + const vss __builtin_altivec_vavgsh (vss, vss); + VAVGSH avgv8hi3_ceil {} + + const vsi __builtin_altivec_vavgsw (vsi, vsi); + VAVGSW avgv4si3_ceil {} + + const vuc __builtin_altivec_vavgub (vuc, vuc); + VAVGUB uavgv16qi3_ceil {} + + const vus __builtin_altivec_vavguh (vus, vus); + VAVGUH uavgv8hi3_ceil {} + + const vui __builtin_altivec_vavguw (vui, vui); + VAVGUW uavgv4si3_ceil {} + + const vf __builtin_altivec_vcfsx (vsi, const int<5>); + VCFSX altivec_vcfsx {} + + const vf __builtin_altivec_vcfux (vui, const int<5>); + VCFUX altivec_vcfux {} + + const vsi __builtin_altivec_vcmpbfp (vf, vf); + VCMPBFP altivec_vcmpbfp {} + + const int __builtin_altivec_vcmpbfp_p (int, vf, vf); + VCMPBFP_P altivec_vcmpbfp_p {pred} + + const vf __builtin_altivec_vcmpeqfp (vf, vf); + VCMPEQFP vector_eqv4sf {} + + const int __builtin_altivec_vcmpeqfp_p (int, vf, vf); + VCMPEQFP_P vector_eq_v4sf_p {pred} + + const vsc __builtin_altivec_vcmpequb (vuc, vuc); + VCMPEQUB vector_eqv16qi {} + + const int __builtin_altivec_vcmpequb_p (int, vsc, vsc); + VCMPEQUB_P vector_eq_v16qi_p {pred} + + const vss __builtin_altivec_vcmpequh (vus, vus); + VCMPEQUH vector_eqv8hi {} + + const int __builtin_altivec_vcmpequh_p (int, vss, vss); + VCMPEQUH_P vector_eq_v8hi_p {pred} + + const vsi __builtin_altivec_vcmpequw (vui, vui); + VCMPEQUW vector_eqv4si {} + + const int __builtin_altivec_vcmpequw_p (int, vsi, vsi); + VCMPEQUW_P vector_eq_v4si_p {pred} + + const vf __builtin_altivec_vcmpgefp (vf, vf); + VCMPGEFP vector_gev4sf {} + + const int __builtin_altivec_vcmpgefp_p (int, vf, vf); + VCMPGEFP_P vector_ge_v4sf_p {pred} + + const vf __builtin_altivec_vcmpgtfp (vf, vf); + VCMPGTFP vector_gtv4sf {} + + const int __builtin_altivec_vcmpgtfp_p (int, vf, vf); + VCMPGTFP_P vector_gt_v4sf_p {pred} + + const vsc __builtin_altivec_vcmpgtsb (vsc, vsc); + VCMPGTSB vector_gtv16qi {} + + const int __builtin_altivec_vcmpgtsb_p (int, vsc, vsc); + VCMPGTSB_P vector_gt_v16qi_p {pred} + + const vss 
__builtin_altivec_vcmpgtsh (vss, vss); + VCMPGTSH vector_gtv8hi {} + + const int __builtin_altivec_vcmpgtsh_p (int, vss, vss); + VCMPGTSH_P vector_gt_v8hi_p {pred} + + const vsi __builtin_altivec_vcmpgtsw (vsi, vsi); + VCMPGTSW vector_gtv4si {} + + const int __builtin_altivec_vcmpgtsw_p (int, vsi, vsi); + VCMPGTSW_P vector_gt_v4si_p {pred} + + const vsc __builtin_altivec_vcmpgtub (vuc, vuc); + VCMPGTUB vector_gtuv16qi {} + + const int __builtin_altivec_vcmpgtub_p (int, vsc, vsc); + VCMPGTUB_P vector_gtu_v16qi_p {pred} + + const vss __builtin_altivec_vcmpgtuh (vus, vus); + VCMPGTUH vector_gtuv8hi {} + + const int __builtin_altivec_vcmpgtuh_p (int, vss, vss); + VCMPGTUH_P vector_gtu_v8hi_p {pred} + + const vsi __builtin_altivec_vcmpgtuw (vui, vui); + VCMPGTUW vector_gtuv4si {} + + const int __builtin_altivec_vcmpgtuw_p (int, vsi, vsi); + VCMPGTUW_P vector_gtu_v4si_p {pred} + + const vsi __builtin_altivec_vctsxs (vf, const int<5>); + VCTSXS altivec_vctsxs {} + + const vui __builtin_altivec_vctuxs (vf, const int<5>); + VCTUXS altivec_vctuxs {} + + fpmath vf __builtin_altivec_vexptefp (vf); + VEXPTEFP altivec_vexptefp {} + + fpmath vf __builtin_altivec_vlogefp (vf); + VLOGEFP altivec_vlogefp {} + + fpmath vf __builtin_altivec_vmaddfp (vf, vf, vf); + VMADDFP fmav4sf4 {} + + const vf __builtin_altivec_vmaxfp (vf, vf); + VMAXFP smaxv4sf3 {} + + const vsc __builtin_altivec_vmaxsb (vsc, vsc); + VMAXSB smaxv16qi3 {} + + const vuc __builtin_altivec_vmaxub (vuc, vuc); + VMAXUB umaxv16qi3 {} + + const vss __builtin_altivec_vmaxsh (vss, vss); + VMAXSH smaxv8hi3 {} + + const vsi __builtin_altivec_vmaxsw (vsi, vsi); + VMAXSW smaxv4si3 {} + + const vus __builtin_altivec_vmaxuh (vus, vus); + VMAXUH umaxv8hi3 {} + + const vui __builtin_altivec_vmaxuw (vui, vui); + VMAXUW umaxv4si3 {} + + vss __builtin_altivec_vmhaddshs (vss, vss, vss); + VMHADDSHS altivec_vmhaddshs {} + + vss __builtin_altivec_vmhraddshs (vss, vss, vss); + VMHRADDSHS altivec_vmhraddshs {} + + const vf __builtin_altivec_vminfp (vf, vf); + VMINFP sminv4sf3 {} + + const vsc __builtin_altivec_vminsb (vsc, vsc); + VMINSB sminv16qi3 {} + + const vss __builtin_altivec_vminsh (vss, vss); + VMINSH sminv8hi3 {} + + const vsi __builtin_altivec_vminsw (vsi, vsi); + VMINSW sminv4si3 {} + + const vuc __builtin_altivec_vminub (vuc, vuc); + VMINUB uminv16qi3 {} + + const vus __builtin_altivec_vminuh (vus, vus); + VMINUH uminv8hi3 {} + + const vui __builtin_altivec_vminuw (vui, vui); + VMINUW uminv4si3 {} + + const vss __builtin_altivec_vmladduhm (vss, vss, vss); + VMLADDUHM fmav8hi4 {} + + const vsc __builtin_altivec_vmrghb (vsc, vsc); + VMRGHB altivec_vmrghb {} + + const vss __builtin_altivec_vmrghh (vss, vss); + VMRGHH altivec_vmrghh {} + + const vsi __builtin_altivec_vmrghw (vsi, vsi); + VMRGHW altivec_vmrghw {} + + const vsc __builtin_altivec_vmrglb (vsc, vsc); + VMRGLB altivec_vmrglb {} + + const vss __builtin_altivec_vmrglh (vss, vss); + VMRGLH altivec_vmrglh {} + + const vsi __builtin_altivec_vmrglw (vsi, vsi); + VMRGLW altivec_vmrglw {} + + const vsi __builtin_altivec_vmsummbm (vsc, vuc, vsi); + VMSUMMBM altivec_vmsummbm {} + + const vsi __builtin_altivec_vmsumshm (vss, vss, vsi); + VMSUMSHM altivec_vmsumshm {} + + vsi __builtin_altivec_vmsumshs (vss, vss, vsi); + VMSUMSHS altivec_vmsumshs {} + + const vui __builtin_altivec_vmsumubm (vuc, vuc, vui); + VMSUMUBM altivec_vmsumubm {} + + const vui __builtin_altivec_vmsumuhm (vus, vus, vui); + VMSUMUHM altivec_vmsumuhm {} + + vui __builtin_altivec_vmsumuhs (vus, vus, vui); + VMSUMUHS altivec_vmsumuhs {} + 
+ const vss __builtin_altivec_vmulesb (vsc, vsc); + VMULESB vec_widen_smult_even_v16qi {} + + const vsi __builtin_altivec_vmulesh (vss, vss); + VMULESH vec_widen_smult_even_v8hi {} + + const vus __builtin_altivec_vmuleub (vuc, vuc); + VMULEUB vec_widen_umult_even_v16qi {} + + const vui __builtin_altivec_vmuleuh (vus, vus); + VMULEUH vec_widen_umult_even_v8hi {} + + const vss __builtin_altivec_vmulosb (vsc, vsc); + VMULOSB vec_widen_smult_odd_v16qi {} + + const vus __builtin_altivec_vmuloub (vuc, vuc); + VMULOUB vec_widen_umult_odd_v16qi {} + + const vsi __builtin_altivec_vmulosh (vss, vss); + VMULOSH vec_widen_smult_odd_v8hi {} + + const vui __builtin_altivec_vmulouh (vus, vus); + VMULOUH vec_widen_umult_odd_v8hi {} + + fpmath vf __builtin_altivec_vnmsubfp (vf, vf, vf); + VNMSUBFP nfmsv4sf4 {} + + const vsc __builtin_altivec_vnor_v16qi (vsc, vsc); + VNOR_V16QI norv16qi3 {} + + const vuc __builtin_altivec_vnor_v16qi_uns (vuc, vuc); + VNOR_V16QI_UNS norv16qi3 {} + + const vf __builtin_altivec_vnor_v4sf (vf, vf); + VNOR_V4SF norv4sf3 {} + + const vsi __builtin_altivec_vnor_v4si (vsi, vsi); + VNOR_V4SI norv4si3 {} + + const vui __builtin_altivec_vnor_v4si_uns (vui, vui); + VNOR_V4SI_UNS norv4si3 {} + + const vss __builtin_altivec_vnor_v8hi (vss, vss); + VNOR_V8HI norv8hi3 {} + + const vus __builtin_altivec_vnor_v8hi_uns (vus, vus); + VNOR_V8HI_UNS norv8hi3 {} + + const vsc __builtin_altivec_vor_v16qi (vsc, vsc); + VOR_V16QI iorv16qi3 {} + + const vuc __builtin_altivec_vor_v16qi_uns (vuc, vuc); + VOR_V16QI_UNS iorv16qi3 {} + + const vf __builtin_altivec_vor_v4sf (vf, vf); + VOR_V4SF iorv4sf3 {} + + const vsi __builtin_altivec_vor_v4si (vsi, vsi); + VOR_V4SI iorv4si3 {} + + const vui __builtin_altivec_vor_v4si_uns (vui, vui); + VOR_V4SI_UNS iorv4si3 {} + + const vss __builtin_altivec_vor_v8hi (vss, vss); + VOR_V8HI iorv8hi3 {} + + const vus __builtin_altivec_vor_v8hi_uns (vus, vus); + VOR_V8HI_UNS iorv8hi3 {} + + const vsc __builtin_altivec_vperm_16qi (vsc, vsc, vuc); + VPERM_16QI altivec_vperm_v16qi {} + + const vuc __builtin_altivec_vperm_16qi_uns (vuc, vuc, vuc); + VPERM_16QI_UNS altivec_vperm_v16qi_uns {} + + const vsq __builtin_altivec_vperm_1ti (vsq, vsq, vuc); + VPERM_1TI altivec_vperm_v1ti {} + + const vuq __builtin_altivec_vperm_1ti_uns (vuq, vuq, vuc); + VPERM_1TI_UNS altivec_vperm_v1ti_uns {} + + const vf __builtin_altivec_vperm_4sf (vf, vf, vuc); + VPERM_4SF altivec_vperm_v4sf {} + + const vsi __builtin_altivec_vperm_4si (vsi, vsi, vuc); + VPERM_4SI altivec_vperm_v4si {} + + const vui __builtin_altivec_vperm_4si_uns (vui, vui, vuc); + VPERM_4SI_UNS altivec_vperm_v4si_uns {} + + const vss __builtin_altivec_vperm_8hi (vss, vss, vuc); + VPERM_8HI altivec_vperm_v8hi {} + + const vus __builtin_altivec_vperm_8hi_uns (vus, vus, vuc); + VPERM_8HI_UNS altivec_vperm_v8hi_uns {} + + const vp __builtin_altivec_vpkpx (vui, vui); + VPKPX altivec_vpkpx {} + + const vsc __builtin_altivec_vpkshss (vss, vss); + VPKSHSS altivec_vpkshss {} + + const vuc __builtin_altivec_vpkshus (vss, vss); + VPKSHUS altivec_vpkshus {} + + const vss __builtin_altivec_vpkswss (vsi, vsi); + VPKSWSS altivec_vpkswss {} + + const vus __builtin_altivec_vpkswus (vsi, vsi); + VPKSWUS altivec_vpkswus {} + + const vsc __builtin_altivec_vpkuhum (vss, vss); + VPKUHUM altivec_vpkuhum {} + + const vuc __builtin_altivec_vpkuhus (vus, vus); + VPKUHUS altivec_vpkuhus {} + + const vss __builtin_altivec_vpkuwum (vsi, vsi); + VPKUWUM altivec_vpkuwum {} + + const vus __builtin_altivec_vpkuwus (vui, vui); + VPKUWUS altivec_vpkuwus {} + + 
const vf __builtin_altivec_vrecipdivfp (vf, vf); + VRECIPFP recipv4sf3 {} + + fpmath vf __builtin_altivec_vrefp (vf); + VREFP rev4sf2 {} + + const vsc __builtin_altivec_vreve_v16qi (vsc); + VREVE_V16QI altivec_vrevev16qi2 {} + + const vf __builtin_altivec_vreve_v4sf (vf); + VREVE_V4SF altivec_vrevev4sf2 {} + + const vsi __builtin_altivec_vreve_v4si (vsi); + VREVE_V4SI altivec_vrevev4si2 {} + + const vss __builtin_altivec_vreve_v8hi (vss); + VREVE_V8HI altivec_vrevev8hi2 {} + + fpmath vf __builtin_altivec_vrfim (vf); + VRFIM vector_floorv4sf2 {} + + fpmath vf __builtin_altivec_vrfin (vf); + VRFIN altivec_vrfin {} + + fpmath vf __builtin_altivec_vrfip (vf); + VRFIP vector_ceilv4sf2 {} + + fpmath vf __builtin_altivec_vrfiz (vf); + VRFIZ vector_btruncv4sf2 {} + + const vsc __builtin_altivec_vrlb (vsc, vsc); + VRLB vrotlv16qi3 {} + + const vss __builtin_altivec_vrlh (vss, vss); + VRLH vrotlv8hi3 {} + + const vsi __builtin_altivec_vrlw (vsi, vsi); + VRLW vrotlv4si3 {} + + fpmath vf __builtin_altivec_vrsqrtefp (vf); + VRSQRTEFP rsqrtev4sf2 {} + + fpmath vf __builtin_altivec_vrsqrtfp (vf); + VRSQRTFP rsqrtv4sf2 {} + + const vsc __builtin_altivec_vsel_16qi (vsc, vsc, vuc); + VSEL_16QI vector_select_v16qi {} + + const vuc __builtin_altivec_vsel_16qi_uns (vuc, vuc, vuc); + VSEL_16QI_UNS vector_select_v16qi_uns {} + + const vsq __builtin_altivec_vsel_1ti (vsq, vsq, vuq); + VSEL_1TI vector_select_v1ti {} + + const vuq __builtin_altivec_vsel_1ti_uns (vuq, vuq, vuq); + VSEL_1TI_UNS vector_select_v1ti_uns {} + + const vf __builtin_altivec_vsel_4sf (vf, vf, vf); + VSEL_4SF vector_select_v4sf {} + + const vsi __builtin_altivec_vsel_4si (vsi, vsi, vui); + VSEL_4SI vector_select_v4si {} + + const vui __builtin_altivec_vsel_4si_uns (vui, vui, vui); + VSEL_4SI_UNS vector_select_v4si_uns {} + + const vss __builtin_altivec_vsel_8hi (vss, vss, vus); + VSEL_8HI vector_select_v8hi {} + + const vus __builtin_altivec_vsel_8hi_uns (vus, vus, vus); + VSEL_8HI_UNS vector_select_v8hi_uns {} + + const vsi __builtin_altivec_vsl (vsi, vsi); + VSL altivec_vsl {} + + const vsc __builtin_altivec_vslb (vsc, vuc); + VSLB vashlv16qi3 {} + + const vsc __builtin_altivec_vsldoi_16qi (vsc, vsc, const int<4>); + VSLDOI_16QI altivec_vsldoi_v16qi {} + + const vf __builtin_altivec_vsldoi_4sf (vf, vf, const int<4>); + VSLDOI_4SF altivec_vsldoi_v4sf {} + + const vsi __builtin_altivec_vsldoi_4si (vsi, vsi, const int<4>); + VSLDOI_4SI altivec_vsldoi_v4si {} + + const vss __builtin_altivec_vsldoi_8hi (vss, vss, const int<4>); + VSLDOI_8HI altivec_vsldoi_v8hi {} + + const vss __builtin_altivec_vslh (vss, vus); + VSLH vashlv8hi3 {} + + const vsi __builtin_altivec_vslo (vsi, vsi); + VSLO altivec_vslo {} + + const vsi __builtin_altivec_vslw (vsi, vui); + VSLW vashlv4si3 {} + + const vsc __builtin_altivec_vspltb (vsc, const int<4>); + VSPLTB altivec_vspltb {} + + const vss __builtin_altivec_vsplth (vss, const int<3>); + VSPLTH altivec_vsplth {} + + const vsc __builtin_altivec_vspltisb (const int<-16,15>); + VSPLTISB altivec_vspltisb {} + + const vss __builtin_altivec_vspltish (const int<-16,15>); + VSPLTISH altivec_vspltish {} + + const vsi __builtin_altivec_vspltisw (const int<-16,15>); + VSPLTISW altivec_vspltisw {} + + const vsi __builtin_altivec_vspltw (vsi, const int<2>); + VSPLTW altivec_vspltw {} + + const vsi __builtin_altivec_vsr (vsi, vsi); + VSR altivec_vsr {} + + const vsc __builtin_altivec_vsrab (vsc, vuc); + VSRAB vashrv16qi3 {} + + const vss __builtin_altivec_vsrah (vss, vus); + VSRAH vashrv8hi3 {} + + const vsi 
__builtin_altivec_vsraw (vsi, vui); + VSRAW vashrv4si3 {} + + const vsc __builtin_altivec_vsrb (vsc, vuc); + VSRB vlshrv16qi3 {} + + const vss __builtin_altivec_vsrh (vss, vus); + VSRH vlshrv8hi3 {} + + const vsi __builtin_altivec_vsro (vsi, vsi); + VSRO altivec_vsro {} + + const vsi __builtin_altivec_vsrw (vsi, vui); + VSRW vlshrv4si3 {} + + const vsi __builtin_altivec_vsubcuw (vsi, vsi); + VSUBCUW altivec_vsubcuw {} + + const vf __builtin_altivec_vsubfp (vf, vf); + VSUBFP subv4sf3 {} + + const vsc __builtin_altivec_vsubsbs (vsc, vsc); + VSUBSBS altivec_vsubsbs {} + + const vss __builtin_altivec_vsubshs (vss, vss); + VSUBSHS altivec_vsubshs {} + + const vsi __builtin_altivec_vsubsws (vsi, vsi); + VSUBSWS altivec_vsubsws {} + + const vuc __builtin_altivec_vsububm (vuc, vuc); + VSUBUBM subv16qi3 {} + + const vuc __builtin_altivec_vsububs (vuc, vuc); + VSUBUBS altivec_vsububs {} + + const vus __builtin_altivec_vsubuhm (vus, vus); + VSUBUHM subv8hi3 {} + + const vus __builtin_altivec_vsubuhs (vus, vus); + VSUBUHS altivec_vsubuhs {} + + const vui __builtin_altivec_vsubuwm (vui, vui); + VSUBUWM subv4si3 {} + + const vui __builtin_altivec_vsubuws (vui, vui); + VSUBUWS altivec_vsubuws {} + + const vsi __builtin_altivec_vsum2sws (vsi, vsi); + VSUM2SWS altivec_vsum2sws {} + + const vsi __builtin_altivec_vsum4sbs (vsc, vsi); + VSUM4SBS altivec_vsum4sbs {} + + const vsi __builtin_altivec_vsum4shs (vss, vsi); + VSUM4SHS altivec_vsum4shs {} + + const vui __builtin_altivec_vsum4ubs (vuc, vui); + VSUM4UBS altivec_vsum4ubs {} + + const vsi __builtin_altivec_vsumsws (vsi, vsi); + VSUMSWS altivec_vsumsws {} + + const vsi __builtin_altivec_vsumsws_be (vsi, vsi); + VSUMSWS_BE altivec_vsumsws_direct {} + + const vui __builtin_altivec_vupkhpx (vp); + VUPKHPX altivec_vupkhpx {} + + const vss __builtin_altivec_vupkhsb (vsc); + VUPKHSB altivec_vupkhsb {} + + const vsi __builtin_altivec_vupkhsh (vss); + VUPKHSH altivec_vupkhsh {} + + const vui __builtin_altivec_vupklpx (vp); + VUPKLPX altivec_vupklpx {} + + const vss __builtin_altivec_vupklsb (vsc); + VUPKLSB altivec_vupklsb {} + + const vsi __builtin_altivec_vupklsh (vss); + VUPKLSH altivec_vupklsh {} + + const vsc __builtin_altivec_vxor_v16qi (vsc, vsc); + VXOR_V16QI xorv16qi3 {} + + const vuc __builtin_altivec_vxor_v16qi_uns (vuc, vuc); + VXOR_V16QI_UNS xorv16qi3 {} + + const vf __builtin_altivec_vxor_v4sf (vf, vf); + VXOR_V4SF xorv4sf3 {} + + const vsi __builtin_altivec_vxor_v4si (vsi, vsi); + VXOR_V4SI xorv4si3 {} + + const vui __builtin_altivec_vxor_v4si_uns (vui, vui); + VXOR_V4SI_UNS xorv4si3 {} + + const vss __builtin_altivec_vxor_v8hi (vss, vss); + VXOR_V8HI xorv8hi3 {} + + const vus __builtin_altivec_vxor_v8hi_uns (vus, vus); + VXOR_V8HI_UNS xorv8hi3 {} + + const signed char __builtin_vec_ext_v16qi (vsc, signed int); + VEC_EXT_V16QI nothing {extract} + + const float __builtin_vec_ext_v4sf (vf, signed int); + VEC_EXT_V4SF nothing {extract} + + const signed int __builtin_vec_ext_v4si (vsi, signed int); + VEC_EXT_V4SI nothing {extract} + + const signed short __builtin_vec_ext_v8hi (vss, signed int); + VEC_EXT_V8HI nothing {extract} + + const vsc __builtin_vec_init_v16qi (signed char, signed char, signed char, signed char, signed char, signed char, signed char, signed char, signed char, signed char, signed char, signed char, signed char, signed char, signed char, signed char); + VEC_INIT_V16QI nothing {init} + + const vf __builtin_vec_init_v4sf (float, float, float, float); + VEC_INIT_V4SF nothing {init} + + const vsi __builtin_vec_init_v4si (signed int, 
signed int, signed int, signed int); + VEC_INIT_V4SI nothing {init} + + const vss __builtin_vec_init_v8hi (signed short, signed short, signed short, signed short, signed short, signed short, signed short, signed short); + VEC_INIT_V8HI nothing {init} + + const vsc __builtin_vec_set_v16qi (vsc, signed char, const int<4>); + VEC_SET_V16QI nothing {set} + + const vf __builtin_vec_set_v4sf (vf, float, const int<2>); + VEC_SET_V4SF nothing {set} + + const vsi __builtin_vec_set_v4si (vsi, signed int, const int<2>); + VEC_SET_V4SI nothing {set} + + const vss __builtin_vec_set_v8hi (vss, signed short, const int<3>); + VEC_SET_V8HI nothing {set} diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c index 904e104..8b16d65 100644 --- a/gcc/config/rs6000/rs6000-call.c +++ b/gcc/config/rs6000/rs6000-call.c @@ -13493,6 +13493,9 @@ rs6000_init_builtins (void) intTI_type_node, 1); pixel_V8HI_type_node = rs6000_vector_type ("__vector __pixel", pixel_type_node, 8); + pcvoid_type_node + = build_pointer_type (build_qualified_type (void_type_node, + TYPE_QUAL_CONST)); /* Create Altivec, VSX and MMA builtins on machines with at least the general purpose extensions (970 and newer) to allow the use of @@ -13652,10 +13655,6 @@ altivec_init_builtins (void) tree pvoid_type_node = build_pointer_type (void_type_node); - tree pcvoid_type_node - = build_pointer_type (build_qualified_type (void_type_node, - TYPE_QUAL_CONST)); - tree int_ftype_opaque = build_function_type_list (integer_type_node, opaque_V4SI_type_node, NULL_TREE); diff --git a/gcc/config/rs6000/rs6000.h b/gcc/config/rs6000/rs6000.h index 4ca6372..c5d20d2 100644 --- a/gcc/config/rs6000/rs6000.h +++ b/gcc/config/rs6000/rs6000.h @@ -2460,6 +2460,7 @@ enum rs6000_builtin_type_index RS6000_BTI_const_str, /* pointer to const char * */ RS6000_BTI_vector_pair, /* unsigned 256-bit types (vector pair). */ RS6000_BTI_vector_quad, /* unsigned 512-bit types (vector quad). */ + RS6000_BTI_const_ptr_void, /* const pointer to void */ RS6000_BTI_MAX }; @@ -2515,6 +2516,7 @@ enum rs6000_builtin_type_index #define const_str_type_node (rs6000_builtin_types[RS6000_BTI_const_str]) #define vector_pair_type_node (rs6000_builtin_types[RS6000_BTI_vector_pair]) #define vector_quad_type_node (rs6000_builtin_types[RS6000_BTI_vector_quad]) +#define pcvoid_type_node (rs6000_builtin_types[RS6000_BTI_const_ptr_void]) extern GTY(()) tree rs6000_builtin_types[RS6000_BTI_MAX]; extern GTY(()) tree rs6000_builtin_decls[RS6000_BUILTIN_COUNT]; -- cgit v1.1 From ba6aa47470550065d1ff8a8acb40654cdc85fbd9 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Wed, 11 Aug 2021 14:56:26 -0500 Subject: rs6000: Add VSX builtins 2021-08-11 Bill Schmidt gcc/ * config/rs6000/rs6000-builtin-new.def: Add vsx stanza. --- gcc/config/rs6000/rs6000-builtin-new.def | 857 +++++++++++++++++++++++++++++++ 1 file changed, 857 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-builtin-new.def b/gcc/config/rs6000/rs6000-builtin-new.def index f1aa552..b5d3570 100644 --- a/gcc/config/rs6000/rs6000-builtin-new.def +++ b/gcc/config/rs6000/rs6000-builtin-new.def @@ -1028,3 +1028,860 @@ const vss __builtin_vec_set_v8hi (vss, signed short, const int<3>); VEC_SET_V8HI nothing {set} + + +; VSX builtins. 
+[vsx] + pure vd __builtin_altivec_lvx_v2df (signed long, const void *); + LVX_V2DF altivec_lvx_v2df {ldvec} + + pure vsll __builtin_altivec_lvx_v2di (signed long, const void *); + LVX_V2DI altivec_lvx_v2di {ldvec} + + pure vd __builtin_altivec_lvxl_v2df (signed long, const void *); + LVXL_V2DF altivec_lvxl_v2df {ldvec} + + pure vsll __builtin_altivec_lvxl_v2di (signed long, const void *); + LVXL_V2DI altivec_lvxl_v2di {ldvec} + + const vd __builtin_altivec_nabs_v2df (vd); + NABS_V2DF vsx_nabsv2df2 {} + + const vsll __builtin_altivec_nabs_v2di (vsll); + NABS_V2DI nabsv2di2 {} + + void __builtin_altivec_stvx_v2df (vd, signed long, void *); + STVX_V2DF altivec_stvx_v2df {stvec} + + void __builtin_altivec_stvx_v2di (vsll, signed long, void *); + STVX_V2DI altivec_stvx_v2di {stvec} + + void __builtin_altivec_stvxl_v2df (vd, signed long, void *); + STVXL_V2DF altivec_stvxl_v2df {stvec} + + void __builtin_altivec_stvxl_v2di (vsll, signed long, void *); + STVXL_V2DI altivec_stvxl_v2di {stvec} + + const vd __builtin_altivec_vand_v2df (vd, vd); + VAND_V2DF andv2df3 {} + + const vsll __builtin_altivec_vand_v2di (vsll, vsll); + VAND_V2DI andv2di3 {} + + const vull __builtin_altivec_vand_v2di_uns (vull, vull); + VAND_V2DI_UNS andv2di3 {} + + const vd __builtin_altivec_vandc_v2df (vd, vd); + VANDC_V2DF andcv2df3 {} + + const vsll __builtin_altivec_vandc_v2di (vsll, vsll); + VANDC_V2DI andcv2di3 {} + + const vull __builtin_altivec_vandc_v2di_uns (vull, vull); + VANDC_V2DI_UNS andcv2di3 {} + + const vsll __builtin_altivec_vcmpequd (vull, vull); + VCMPEQUD vector_eqv2di {} + + const int __builtin_altivec_vcmpequd_p (int, vsll, vsll); + VCMPEQUD_P vector_eq_v2di_p {pred} + + const vsll __builtin_altivec_vcmpgtsd (vsll, vsll); + VCMPGTSD vector_gtv2di {} + + const int __builtin_altivec_vcmpgtsd_p (int, vsll, vsll); + VCMPGTSD_P vector_gt_v2di_p {pred} + + const vsll __builtin_altivec_vcmpgtud (vull, vull); + VCMPGTUD vector_gtuv2di {} + + const int __builtin_altivec_vcmpgtud_p (int, vsll, vsll); + VCMPGTUD_P vector_gtu_v2di_p {pred} + + const vd __builtin_altivec_vnor_v2df (vd, vd); + VNOR_V2DF norv2df3 {} + + const vsll __builtin_altivec_vnor_v2di (vsll, vsll); + VNOR_V2DI norv2di3 {} + + const vull __builtin_altivec_vnor_v2di_uns (vull, vull); + VNOR_V2DI_UNS norv2di3 {} + + const vd __builtin_altivec_vor_v2df (vd, vd); + VOR_V2DF iorv2df3 {} + + const vsll __builtin_altivec_vor_v2di (vsll, vsll); + VOR_V2DI iorv2di3 {} + + const vull __builtin_altivec_vor_v2di_uns (vull, vull); + VOR_V2DI_UNS iorv2di3 {} + + const vd __builtin_altivec_vperm_2df (vd, vd, vuc); + VPERM_2DF altivec_vperm_v2df {} + + const vsll __builtin_altivec_vperm_2di (vsll, vsll, vuc); + VPERM_2DI altivec_vperm_v2di {} + + const vull __builtin_altivec_vperm_2di_uns (vull, vull, vuc); + VPERM_2DI_UNS altivec_vperm_v2di_uns {} + + const vd __builtin_altivec_vreve_v2df (vd); + VREVE_V2DF altivec_vrevev2df2 {} + + const vsll __builtin_altivec_vreve_v2di (vsll); + VREVE_V2DI altivec_vrevev2di2 {} + + const vd __builtin_altivec_vsel_2df (vd, vd, vd); + VSEL_2DF vector_select_v2df {} + + const vsll __builtin_altivec_vsel_2di (vsll, vsll, vsll); + VSEL_2DI_B vector_select_v2di {} + + const vull __builtin_altivec_vsel_2di_uns (vull, vull, vull); + VSEL_2DI_UNS vector_select_v2di_uns {} + + const vd __builtin_altivec_vsldoi_2df (vd, vd, const int<4>); + VSLDOI_2DF altivec_vsldoi_v2df {} + + const vsll __builtin_altivec_vsldoi_2di (vsll, vsll, const int<4>); + VSLDOI_2DI altivec_vsldoi_v2di {} + + const vd __builtin_altivec_vxor_v2df (vd, vd); + 
VXOR_V2DF xorv2df3 {} + + const vsll __builtin_altivec_vxor_v2di (vsll, vsll); + VXOR_V2DI xorv2di3 {} + + const vull __builtin_altivec_vxor_v2di_uns (vull, vull); + VXOR_V2DI_UNS xorv2di3 {} + + const signed __int128 __builtin_vec_ext_v1ti (vsq, signed int); + VEC_EXT_V1TI nothing {extract} + + const double __builtin_vec_ext_v2df (vd, signed int); + VEC_EXT_V2DF nothing {extract} + + const signed long long __builtin_vec_ext_v2di (vsll, signed int); + VEC_EXT_V2DI nothing {extract} + + const vsq __builtin_vec_init_v1ti (signed __int128); + VEC_INIT_V1TI nothing {init} + + const vd __builtin_vec_init_v2df (double, double); + VEC_INIT_V2DF nothing {init} + + const vsll __builtin_vec_init_v2di (signed long long, signed long long); + VEC_INIT_V2DI nothing {init} + + const vsq __builtin_vec_set_v1ti (vsq, signed __int128, const int<0,0>); + VEC_SET_V1TI nothing {set} + + const vd __builtin_vec_set_v2df (vd, double, const int<1>); + VEC_SET_V2DF nothing {set} + + const vsll __builtin_vec_set_v2di (vsll, signed long long, const int<1>); + VEC_SET_V2DI nothing {set} + + const vsc __builtin_vsx_cmpge_16qi (vsc, vsc); + CMPGE_16QI vector_nltv16qi {} + + const vsll __builtin_vsx_cmpge_2di (vsll, vsll); + CMPGE_2DI vector_nltv2di {} + + const vsi __builtin_vsx_cmpge_4si (vsi, vsi); + CMPGE_4SI vector_nltv4si {} + + const vss __builtin_vsx_cmpge_8hi (vss, vss); + CMPGE_8HI vector_nltv8hi {} + + const vsc __builtin_vsx_cmpge_u16qi (vuc, vuc); + CMPGE_U16QI vector_nltuv16qi {} + + const vsll __builtin_vsx_cmpge_u2di (vull, vull); + CMPGE_U2DI vector_nltuv2di {} + + const vsi __builtin_vsx_cmpge_u4si (vui, vui); + CMPGE_U4SI vector_nltuv4si {} + + const vss __builtin_vsx_cmpge_u8hi (vus, vus); + CMPGE_U8HI vector_nltuv8hi {} + + const vsc __builtin_vsx_cmple_16qi (vsc, vsc); + CMPLE_16QI vector_ngtv16qi {} + + const vsll __builtin_vsx_cmple_2di (vsll, vsll); + CMPLE_2DI vector_ngtv2di {} + + const vsi __builtin_vsx_cmple_4si (vsi, vsi); + CMPLE_4SI vector_ngtv4si {} + + const vss __builtin_vsx_cmple_8hi (vss, vss); + CMPLE_8HI vector_ngtv8hi {} + + const vsc __builtin_vsx_cmple_u16qi (vsc, vsc); + CMPLE_U16QI vector_ngtuv16qi {} + + const vsll __builtin_vsx_cmple_u2di (vsll, vsll); + CMPLE_U2DI vector_ngtuv2di {} + + const vsi __builtin_vsx_cmple_u4si (vsi, vsi); + CMPLE_U4SI vector_ngtuv4si {} + + const vss __builtin_vsx_cmple_u8hi (vss, vss); + CMPLE_U8HI vector_ngtuv8hi {} + + const vd __builtin_vsx_concat_2df (double, double); + CONCAT_2DF vsx_concat_v2df {} + + const vsll __builtin_vsx_concat_2di (signed long long, signed long long); + CONCAT_2DI vsx_concat_v2di {} + + const vd __builtin_vsx_cpsgndp (vd, vd); + CPSGNDP vector_copysignv2df3 {} + + const vf __builtin_vsx_cpsgnsp (vf, vf); + CPSGNSP vector_copysignv4sf3 {} + + const vsll __builtin_vsx_div_2di (vsll, vsll); + DIV_V2DI vsx_div_v2di {} + + const vd __builtin_vsx_doublee_v4sf (vf); + DOUBLEE_V4SF doubleev4sf2 {} + + const vd __builtin_vsx_doublee_v4si (vsi); + DOUBLEE_V4SI doubleev4si2 {} + + const vd __builtin_vsx_doubleh_v4sf (vf); + DOUBLEH_V4SF doublehv4sf2 {} + + const vd __builtin_vsx_doubleh_v4si (vsi); + DOUBLEH_V4SI doublehv4si2 {} + + const vd __builtin_vsx_doublel_v4sf (vf); + DOUBLEL_V4SF doublelv4sf2 {} + + const vd __builtin_vsx_doublel_v4si (vsi); + DOUBLEL_V4SI doublelv4si2 {} + + const vd __builtin_vsx_doubleo_v4sf (vf); + DOUBLEO_V4SF doubleov4sf2 {} + + const vd __builtin_vsx_doubleo_v4si (vsi); + DOUBLEO_V4SI doubleov4si2 {} + + const vf __builtin_vsx_floate_v2df (vd); + FLOATE_V2DF floatev2df {} + + const vf 
__builtin_vsx_floate_v2di (vsll); + FLOATE_V2DI floatev2di {} + + const vf __builtin_vsx_floato_v2df (vd); + FLOATO_V2DF floatov2df {} + + const vf __builtin_vsx_floato_v2di (vsll); + FLOATO_V2DI floatov2di {} + + pure vsq __builtin_vsx_ld_elemrev_v1ti (signed long, const void *); + LD_ELEMREV_V1TI vsx_ld_elemrev_v1ti {ldvec,endian} + + pure vd __builtin_vsx_ld_elemrev_v2df (signed long, const void *); + LD_ELEMREV_V2DF vsx_ld_elemrev_v2df {ldvec,endian} + + pure vsll __builtin_vsx_ld_elemrev_v2di (signed long, const void *); + LD_ELEMREV_V2DI vsx_ld_elemrev_v2di {ldvec,endian} + + pure vf __builtin_vsx_ld_elemrev_v4sf (signed long, const void *); + LD_ELEMREV_V4SF vsx_ld_elemrev_v4sf {ldvec,endian} + + pure vsi __builtin_vsx_ld_elemrev_v4si (signed long, const void *); + LD_ELEMREV_V4SI vsx_ld_elemrev_v4si {ldvec,endian} + + pure vss __builtin_vsx_ld_elemrev_v8hi (signed long, const void *); + LD_ELEMREV_V8HI vsx_ld_elemrev_v8hi {ldvec,endian} + + pure vsc __builtin_vsx_ld_elemrev_v16qi (signed long, const void *); + LD_ELEMREV_V16QI vsx_ld_elemrev_v16qi {ldvec,endian} + +; TODO: There is apparent intent in rs6000-builtin.def to have +; RS6000_BTC_SPECIAL processing for LXSDX, LXVDSX, and STXSDX, but there are +; no def_builtin calls for any of them. At some point, we may want to add a +; set of built-ins for whichever vector types make sense for these. + + pure vsq __builtin_vsx_lxvd2x_v1ti (signed long, const void *); + LXVD2X_V1TI vsx_load_v1ti {ldvec} + + pure vd __builtin_vsx_lxvd2x_v2df (signed long, const void *); + LXVD2X_V2DF vsx_load_v2df {ldvec} + + pure vsll __builtin_vsx_lxvd2x_v2di (signed long, const void *); + LXVD2X_V2DI vsx_load_v2di {ldvec} + + pure vsc __builtin_vsx_lxvw4x_v16qi (signed long, const void *); + LXVW4X_V16QI vsx_load_v16qi {ldvec} + + pure vf __builtin_vsx_lxvw4x_v4sf (signed long, const void *); + LXVW4X_V4SF vsx_load_v4sf {ldvec} + + pure vsi __builtin_vsx_lxvw4x_v4si (signed long, const void *); + LXVW4X_V4SI vsx_load_v4si {ldvec} + + pure vss __builtin_vsx_lxvw4x_v8hi (signed long, const void *); + LXVW4X_V8HI vsx_load_v8hi {ldvec} + + const vd __builtin_vsx_mergeh_2df (vd, vd); + VEC_MERGEH_V2DF vsx_mergeh_v2df {} + + const vsll __builtin_vsx_mergeh_2di (vsll, vsll); + VEC_MERGEH_V2DI vsx_mergeh_v2di {} + + const vd __builtin_vsx_mergel_2df (vd, vd); + VEC_MERGEL_V2DF vsx_mergel_v2df {} + + const vsll __builtin_vsx_mergel_2di (vsll, vsll); + VEC_MERGEL_V2DI vsx_mergel_v2di {} + + const vsll __builtin_vsx_mul_2di (vsll, vsll); + MUL_V2DI vsx_mul_v2di {} + + const vsq __builtin_vsx_set_1ti (vsq, signed __int128, const int<0,0>); + SET_1TI vsx_set_v1ti {set} + + const vd __builtin_vsx_set_2df (vd, double, const int<0,1>); + SET_2DF vsx_set_v2df {set} + + const vsll __builtin_vsx_set_2di (vsll, signed long long, const int<0,1>); + SET_2DI vsx_set_v2di {set} + + const vd __builtin_vsx_splat_2df (double); + SPLAT_2DF vsx_splat_v2df {} + + const vsll __builtin_vsx_splat_2di (signed long long); + SPLAT_2DI vsx_splat_v2di {} + + void __builtin_vsx_st_elemrev_v1ti (vsq, signed long, void *); + ST_ELEMREV_V1TI vsx_st_elemrev_v1ti {stvec,endian} + + void __builtin_vsx_st_elemrev_v2df (vd, signed long, void *); + ST_ELEMREV_V2DF vsx_st_elemrev_v2df {stvec,endian} + + void __builtin_vsx_st_elemrev_v2di (vsll, signed long, void *); + ST_ELEMREV_V2DI vsx_st_elemrev_v2di {stvec,endian} + + void __builtin_vsx_st_elemrev_v4sf (vf, signed long, void *); + ST_ELEMREV_V4SF vsx_st_elemrev_v4sf {stvec,endian} + + void __builtin_vsx_st_elemrev_v4si (vsi, signed long, void 
*); + ST_ELEMREV_V4SI vsx_st_elemrev_v4si {stvec,endian} + + void __builtin_vsx_st_elemrev_v8hi (vss, signed long, void *); + ST_ELEMREV_V8HI vsx_st_elemrev_v8hi {stvec,endian} + + void __builtin_vsx_st_elemrev_v16qi (vsc, signed long, void *); + ST_ELEMREV_V16QI vsx_st_elemrev_v16qi {stvec,endian} + + void __builtin_vsx_stxvd2x_v1ti (vsq, signed long, void *); + STXVD2X_V1TI vsx_store_v1ti {stvec} + + void __builtin_vsx_stxvd2x_v2df (vd, signed long, void *); + STXVD2X_V2DF vsx_store_v2df {stvec} + + void __builtin_vsx_stxvd2x_v2di (vsll, signed long, void *); + STXVD2X_V2DI vsx_store_v2di {stvec} + + void __builtin_vsx_stxvw4x_v4sf (vf, signed long, void *); + STXVW4X_V4SF vsx_store_v4sf {stvec} + + void __builtin_vsx_stxvw4x_v4si (vsi, signed long, void *); + STXVW4X_V4SI vsx_store_v4si {stvec} + + void __builtin_vsx_stxvw4x_v8hi (vss, signed long, void *); + STXVW4X_V8HI vsx_store_v8hi {stvec} + + void __builtin_vsx_stxvw4x_v16qi (vsc, signed long, void *); + STXVW4X_V16QI vsx_store_v16qi {stvec} + + const vull __builtin_vsx_udiv_2di (vull, vull); + UDIV_V2DI vsx_udiv_v2di {} + + const vd __builtin_vsx_uns_doublee_v4si (vsi); + UNS_DOUBLEE_V4SI unsdoubleev4si2 {} + + const vd __builtin_vsx_uns_doubleh_v4si (vsi); + UNS_DOUBLEH_V4SI unsdoublehv4si2 {} + + const vd __builtin_vsx_uns_doublel_v4si (vsi); + UNS_DOUBLEL_V4SI unsdoublelv4si2 {} + + const vd __builtin_vsx_uns_doubleo_v4si (vsi); + UNS_DOUBLEO_V4SI unsdoubleov4si2 {} + + const vf __builtin_vsx_uns_floate_v2di (vsll); + UNS_FLOATE_V2DI unsfloatev2di {} + + const vf __builtin_vsx_uns_floato_v2di (vsll); + UNS_FLOATO_V2DI unsfloatov2di {} + +; These are duplicates of __builtin_altivec_* counterparts, and are being +; kept for backwards compatibility. The reason for their existence is +; unclear. TODO: Consider deprecation/removal at some point. 
+ const vsc __builtin_vsx_vperm_16qi (vsc, vsc, vuc); + VPERM_16QI_X altivec_vperm_v16qi {} + + const vuc __builtin_vsx_vperm_16qi_uns (vuc, vuc, vuc); + VPERM_16QI_UNS_X altivec_vperm_v16qi_uns {} + + const vsq __builtin_vsx_vperm_1ti (vsq, vsq, vsc); + VPERM_1TI_X altivec_vperm_v1ti {} + + const vsq __builtin_vsx_vperm_1ti_uns (vsq, vsq, vsc); + VPERM_1TI_UNS_X altivec_vperm_v1ti_uns {} + + const vd __builtin_vsx_vperm_2df (vd, vd, vuc); + VPERM_2DF_X altivec_vperm_v2df {} + + const vsll __builtin_vsx_vperm_2di (vsll, vsll, vuc); + VPERM_2DI_X altivec_vperm_v2di {} + + const vull __builtin_vsx_vperm_2di_uns (vull, vull, vuc); + VPERM_2DI_UNS_X altivec_vperm_v2di_uns {} + + const vf __builtin_vsx_vperm_4sf (vf, vf, vuc); + VPERM_4SF_X altivec_vperm_v4sf {} + + const vsi __builtin_vsx_vperm_4si (vsi, vsi, vuc); + VPERM_4SI_X altivec_vperm_v4si {} + + const vui __builtin_vsx_vperm_4si_uns (vui, vui, vuc); + VPERM_4SI_UNS_X altivec_vperm_v4si_uns {} + + const vss __builtin_vsx_vperm_8hi (vss, vss, vuc); + VPERM_8HI_X altivec_vperm_v8hi {} + + const vus __builtin_vsx_vperm_8hi_uns (vus, vus, vuc); + VPERM_8HI_UNS_X altivec_vperm_v8hi_uns {} + + const vsll __builtin_vsx_vsigned_v2df (vd); + VEC_VSIGNED_V2DF vsx_xvcvdpsxds {} + + const vsi __builtin_vsx_vsigned_v4sf (vf); + VEC_VSIGNED_V4SF vsx_xvcvspsxws {} + + const vsi __builtin_vsx_vsignede_v2df (vd); + VEC_VSIGNEDE_V2DF vsignede_v2df {} + + const vsi __builtin_vsx_vsignedo_v2df (vd); + VEC_VSIGNEDO_V2DF vsignedo_v2df {} + + const vsll __builtin_vsx_vunsigned_v2df (vd); + VEC_VUNSIGNED_V2DF vsx_xvcvdpsxds {} + + const vsi __builtin_vsx_vunsigned_v4sf (vf); + VEC_VUNSIGNED_V4SF vsx_xvcvspsxws {} + + const vsi __builtin_vsx_vunsignede_v2df (vd); + VEC_VUNSIGNEDE_V2DF vunsignede_v2df {} + + const vsi __builtin_vsx_vunsignedo_v2df (vd); + VEC_VUNSIGNEDO_V2DF vunsignedo_v2df {} + + const vf __builtin_vsx_xscvdpsp (double); + XSCVDPSP vsx_xscvdpsp {} + + const double __builtin_vsx_xscvspdp (vf); + XSCVSPDP vsx_xscvspdp {} + + const double __builtin_vsx_xsmaxdp (double, double); + XSMAXDP smaxdf3 {} + + const double __builtin_vsx_xsmindp (double, double); + XSMINDP smindf3 {} + + const double __builtin_vsx_xsrdpi (double); + XSRDPI vsx_xsrdpi {} + + const double __builtin_vsx_xsrdpic (double); + XSRDPIC vsx_xsrdpic {} + + const double __builtin_vsx_xsrdpim (double); + XSRDPIM floordf2 {} + + const double __builtin_vsx_xsrdpip (double); + XSRDPIP ceildf2 {} + + const double __builtin_vsx_xsrdpiz (double); + XSRDPIZ btruncdf2 {} + + const signed int __builtin_vsx_xstdivdp_fe (double, double); + XSTDIVDP_FE vsx_tdivdf3_fe {} + + const signed int __builtin_vsx_xstdivdp_fg (double, double); + XSTDIVDP_FG vsx_tdivdf3_fg {} + + const signed int __builtin_vsx_xstsqrtdp_fe (double); + XSTSQRTDP_FE vsx_tsqrtdf2_fe {} + + const signed int __builtin_vsx_xstsqrtdp_fg (double); + XSTSQRTDP_FG vsx_tsqrtdf2_fg {} + + const vd __builtin_vsx_xvabsdp (vd); + XVABSDP absv2df2 {} + + const vf __builtin_vsx_xvabssp (vf); + XVABSSP absv4sf2 {} + + fpmath vd __builtin_vsx_xvadddp (vd, vd); + XVADDDP addv2df3 {} + + fpmath vf __builtin_vsx_xvaddsp (vf, vf); + XVADDSP addv4sf3 {} + + const vd __builtin_vsx_xvcmpeqdp (vd, vd); + XVCMPEQDP vector_eqv2df {} + + const signed int __builtin_vsx_xvcmpeqdp_p (signed int, vd, vd); + XVCMPEQDP_P vector_eq_v2df_p {pred} + + const vf __builtin_vsx_xvcmpeqsp (vf, vf); + XVCMPEQSP vector_eqv4sf {} + + const signed int __builtin_vsx_xvcmpeqsp_p (signed int, vf, vf); + XVCMPEQSP_P vector_eq_v4sf_p {pred} + + const vd 
__builtin_vsx_xvcmpgedp (vd, vd); + XVCMPGEDP vector_gev2df {} + + const signed int __builtin_vsx_xvcmpgedp_p (signed int, vd, vd); + XVCMPGEDP_P vector_ge_v2df_p {pred} + + const vf __builtin_vsx_xvcmpgesp (vf, vf); + XVCMPGESP vector_gev4sf {} + + const signed int __builtin_vsx_xvcmpgesp_p (signed int, vf, vf); + XVCMPGESP_P vector_ge_v4sf_p {pred} + + const vd __builtin_vsx_xvcmpgtdp (vd, vd); + XVCMPGTDP vector_gtv2df {} + + const signed int __builtin_vsx_xvcmpgtdp_p (signed int, vd, vd); + XVCMPGTDP_P vector_gt_v2df_p {pred} + + const vf __builtin_vsx_xvcmpgtsp (vf, vf); + XVCMPGTSP vector_gtv4sf {} + + const signed int __builtin_vsx_xvcmpgtsp_p (signed int, vf, vf); + XVCMPGTSP_P vector_gt_v4sf_p {pred} + + const vf __builtin_vsx_xvcvdpsp (vd); + XVCVDPSP vsx_xvcvdpsp {} + + const vsll __builtin_vsx_xvcvdpsxds (vd); + XVCVDPSXDS vsx_fix_truncv2dfv2di2 {} + + const vsll __builtin_vsx_xvcvdpsxds_scale (vd, const int); + XVCVDPSXDS_SCALE vsx_xvcvdpsxds_scale {} + + const vsi __builtin_vsx_xvcvdpsxws (vd); + XVCVDPSXWS vsx_xvcvdpsxws {} + + const vsll __builtin_vsx_xvcvdpuxds (vd); + XVCVDPUXDS vsx_fixuns_truncv2dfv2di2 {} + + const vsll __builtin_vsx_xvcvdpuxds_scale (vd, const int); + XVCVDPUXDS_SCALE vsx_xvcvdpuxds_scale {} + + const vull __builtin_vsx_xvcvdpuxds_uns (vd); + XVCVDPUXDS_UNS vsx_fixuns_truncv2dfv2di2 {} + + const vsi __builtin_vsx_xvcvdpuxws (vd); + XVCVDPUXWS vsx_xvcvdpuxws {} + + const vd __builtin_vsx_xvcvspdp (vf); + XVCVSPDP vsx_xvcvspdp {} + + const vsll __builtin_vsx_xvcvspsxds (vf); + XVCVSPSXDS vsx_xvcvspsxds {} + + const vsi __builtin_vsx_xvcvspsxws (vf); + XVCVSPSXWS vsx_fix_truncv4sfv4si2 {} + + const vsll __builtin_vsx_xvcvspuxds (vf); + XVCVSPUXDS vsx_xvcvspuxds {} + + const vsi __builtin_vsx_xvcvspuxws (vf); + XVCVSPUXWS vsx_fixuns_truncv4sfv4si2 {} + + const vd __builtin_vsx_xvcvsxddp (vsll); + XVCVSXDDP vsx_floatv2div2df2 {} + + const vd __builtin_vsx_xvcvsxddp_scale (vsll, const int<5>); + XVCVSXDDP_SCALE vsx_xvcvsxddp_scale {} + + const vf __builtin_vsx_xvcvsxdsp (vsll); + XVCVSXDSP vsx_xvcvsxdsp {} + + const vd __builtin_vsx_xvcvsxwdp (vsi); + XVCVSXWDP vsx_xvcvsxwdp {} + + const vf __builtin_vsx_xvcvsxwsp (vsi); + XVCVSXWSP vsx_floatv4siv4sf2 {} + + const vd __builtin_vsx_xvcvuxddp (vsll); + XVCVUXDDP vsx_floatunsv2div2df2 {} + + const vd __builtin_vsx_xvcvuxddp_scale (vsll, const int<5>); + XVCVUXDDP_SCALE vsx_xvcvuxddp_scale {} + + const vd __builtin_vsx_xvcvuxddp_uns (vull); + XVCVUXDDP_UNS vsx_floatunsv2div2df2 {} + + const vf __builtin_vsx_xvcvuxdsp (vull); + XVCVUXDSP vsx_xvcvuxdsp {} + + const vd __builtin_vsx_xvcvuxwdp (vsi); + XVCVUXWDP vsx_xvcvuxwdp {} + + const vf __builtin_vsx_xvcvuxwsp (vsi); + XVCVUXWSP vsx_floatunsv4siv4sf2 {} + + fpmath vd __builtin_vsx_xvdivdp (vd, vd); + XVDIVDP divv2df3 {} + + fpmath vf __builtin_vsx_xvdivsp (vf, vf); + XVDIVSP divv4sf3 {} + + const vd __builtin_vsx_xvmadddp (vd, vd, vd); + XVMADDDP fmav2df4 {} + + const vf __builtin_vsx_xvmaddsp (vf, vf, vf); + XVMADDSP fmav4sf4 {} + + const vd __builtin_vsx_xvmaxdp (vd, vd); + XVMAXDP smaxv2df3 {} + + const vf __builtin_vsx_xvmaxsp (vf, vf); + XVMAXSP smaxv4sf3 {} + + const vd __builtin_vsx_xvmindp (vd, vd); + XVMINDP sminv2df3 {} + + const vf __builtin_vsx_xvminsp (vf, vf); + XVMINSP sminv4sf3 {} + + const vd __builtin_vsx_xvmsubdp (vd, vd, vd); + XVMSUBDP fmsv2df4 {} + + const vf __builtin_vsx_xvmsubsp (vf, vf, vf); + XVMSUBSP fmsv4sf4 {} + + fpmath vd __builtin_vsx_xvmuldp (vd, vd); + XVMULDP mulv2df3 {} + + fpmath vf __builtin_vsx_xvmulsp (vf, vf); + XVMULSP 
mulv4sf3 {} + + const vd __builtin_vsx_xvnabsdp (vd); + XVNABSDP vsx_nabsv2df2 {} + + const vf __builtin_vsx_xvnabssp (vf); + XVNABSSP vsx_nabsv4sf2 {} + + const vd __builtin_vsx_xvnegdp (vd); + XVNEGDP negv2df2 {} + + const vf __builtin_vsx_xvnegsp (vf); + XVNEGSP negv4sf2 {} + + const vd __builtin_vsx_xvnmadddp (vd, vd, vd); + XVNMADDDP nfmav2df4 {} + + const vf __builtin_vsx_xvnmaddsp (vf, vf, vf); + XVNMADDSP nfmav4sf4 {} + + const vd __builtin_vsx_xvnmsubdp (vd, vd, vd); + XVNMSUBDP nfmsv2df4 {} + + const vf __builtin_vsx_xvnmsubsp (vf, vf, vf); + XVNMSUBSP nfmsv4sf4 {} + + const vd __builtin_vsx_xvrdpi (vd); + XVRDPI vsx_xvrdpi {} + + const vd __builtin_vsx_xvrdpic (vd); + XVRDPIC vsx_xvrdpic {} + + const vd __builtin_vsx_xvrdpim (vd); + XVRDPIM vsx_floorv2df2 {} + + const vd __builtin_vsx_xvrdpip (vd); + XVRDPIP vsx_ceilv2df2 {} + + const vd __builtin_vsx_xvrdpiz (vd); + XVRDPIZ vsx_btruncv2df2 {} + + fpmath vd __builtin_vsx_xvrecipdivdp (vd, vd); + RECIP_V2DF recipv2df3 {} + + fpmath vf __builtin_vsx_xvrecipdivsp (vf, vf); + RECIP_V4SF recipv4sf3 {} + + const vd __builtin_vsx_xvredp (vd); + XVREDP vsx_frev2df2 {} + + const vf __builtin_vsx_xvresp (vf); + XVRESP vsx_frev4sf2 {} + + const vf __builtin_vsx_xvrspi (vf); + XVRSPI vsx_xvrspi {} + + const vf __builtin_vsx_xvrspic (vf); + XVRSPIC vsx_xvrspic {} + + const vf __builtin_vsx_xvrspim (vf); + XVRSPIM vsx_floorv4sf2 {} + + const vf __builtin_vsx_xvrspip (vf); + XVRSPIP vsx_ceilv4sf2 {} + + const vf __builtin_vsx_xvrspiz (vf); + XVRSPIZ vsx_btruncv4sf2 {} + + const vd __builtin_vsx_xvrsqrtdp (vd); + RSQRT_2DF rsqrtv2df2 {} + + const vf __builtin_vsx_xvrsqrtsp (vf); + RSQRT_4SF rsqrtv4sf2 {} + + const vd __builtin_vsx_xvrsqrtedp (vd); + XVRSQRTEDP rsqrtev2df2 {} + + const vf __builtin_vsx_xvrsqrtesp (vf); + XVRSQRTESP rsqrtev4sf2 {} + + const vd __builtin_vsx_xvsqrtdp (vd); + XVSQRTDP sqrtv2df2 {} + + const vf __builtin_vsx_xvsqrtsp (vf); + XVSQRTSP sqrtv4sf2 {} + + fpmath vd __builtin_vsx_xvsubdp (vd, vd); + XVSUBDP subv2df3 {} + + fpmath vf __builtin_vsx_xvsubsp (vf, vf); + XVSUBSP subv4sf3 {} + + const signed int __builtin_vsx_xvtdivdp_fe (vd, vd); + XVTDIVDP_FE vsx_tdivv2df3_fe {} + + const signed int __builtin_vsx_xvtdivdp_fg (vd, vd); + XVTDIVDP_FG vsx_tdivv2df3_fg {} + + const signed int __builtin_vsx_xvtdivsp_fe (vf, vf); + XVTDIVSP_FE vsx_tdivv4sf3_fe {} + + const signed int __builtin_vsx_xvtdivsp_fg (vf, vf); + XVTDIVSP_FG vsx_tdivv4sf3_fg {} + + const signed int __builtin_vsx_xvtsqrtdp_fe (vd); + XVTSQRTDP_FE vsx_tsqrtv2df2_fe {} + + const signed int __builtin_vsx_xvtsqrtdp_fg (vd); + XVTSQRTDP_FG vsx_tsqrtv2df2_fg {} + + const signed int __builtin_vsx_xvtsqrtsp_fe (vf); + XVTSQRTSP_FE vsx_tsqrtv4sf2_fe {} + + const signed int __builtin_vsx_xvtsqrtsp_fg (vf); + XVTSQRTSP_FG vsx_tsqrtv4sf2_fg {} + + const vf __builtin_vsx_xxmrghw (vf, vf); + XXMRGHW_4SF vsx_xxmrghw_v4sf {} + + const vsi __builtin_vsx_xxmrghw_4si (vsi, vsi); + XXMRGHW_4SI vsx_xxmrghw_v4si {} + + const vf __builtin_vsx_xxmrglw (vf, vf); + XXMRGLW_4SF vsx_xxmrglw_v4sf {} + + const vsi __builtin_vsx_xxmrglw_4si (vsi, vsi); + XXMRGLW_4SI vsx_xxmrglw_v4si {} + + const vsc __builtin_vsx_xxpermdi_16qi (vsc, vsc, const int<2>); + XXPERMDI_16QI vsx_xxpermdi_v16qi {} + + const vsq __builtin_vsx_xxpermdi_1ti (vsq, vsq, const int<2>); + XXPERMDI_1TI vsx_xxpermdi_v1ti {} + + const vd __builtin_vsx_xxpermdi_2df (vd, vd, const int<2>); + XXPERMDI_2DF vsx_xxpermdi_v2df {} + + const vsll __builtin_vsx_xxpermdi_2di (vsll, vsll, const int<2>); + XXPERMDI_2DI 
vsx_xxpermdi_v2di {} + + const vf __builtin_vsx_xxpermdi_4sf (vf, vf, const int<2>); + XXPERMDI_4SF vsx_xxpermdi_v4sf {} + + const vsi __builtin_vsx_xxpermdi_4si (vsi, vsi, const int<2>); + XXPERMDI_4SI vsx_xxpermdi_v4si {} + + const vss __builtin_vsx_xxpermdi_8hi (vss, vss, const int<2>); + XXPERMDI_8HI vsx_xxpermdi_v8hi {} + + const vsc __builtin_vsx_xxsel_16qi (vsc, vsc, vsc); + XXSEL_16QI vector_select_v16qi {} + + const vuc __builtin_vsx_xxsel_16qi_uns (vuc, vuc, vuc); + XXSEL_16QI_UNS vector_select_v16qi_uns {} + + const vsq __builtin_vsx_xxsel_1ti (vsq, vsq, vsq); + XXSEL_1TI vector_select_v1ti {} + + const vsq __builtin_vsx_xxsel_1ti_uns (vsq, vsq, vsq); + XXSEL_1TI_UNS vector_select_v1ti_uns {} + + const vd __builtin_vsx_xxsel_2df (vd, vd, vd); + XXSEL_2DF vector_select_v2df {} + + const vsll __builtin_vsx_xxsel_2di (vsll, vsll, vsll); + XXSEL_2DI vector_select_v2di {} + + const vull __builtin_vsx_xxsel_2di_uns (vull, vull, vull); + XXSEL_2DI_UNS vector_select_v2di_uns {} + + const vf __builtin_vsx_xxsel_4sf (vf, vf, vf); + XXSEL_4SF vector_select_v4sf {} + + const vsi __builtin_vsx_xxsel_4si (vsi, vsi, vsi); + XXSEL_4SI vector_select_v4si {} + + const vui __builtin_vsx_xxsel_4si_uns (vui, vui, vui); + XXSEL_4SI_UNS vector_select_v4si_uns {} + + const vss __builtin_vsx_xxsel_8hi (vss, vss, vss); + XXSEL_8HI vector_select_v8hi {} + + const vus __builtin_vsx_xxsel_8hi_uns (vus, vus, vus); + XXSEL_8HI_UNS vector_select_v8hi_uns {} + + const vsc __builtin_vsx_xxsldwi_16qi (vsc, vsc, const int<2>); + XXSLDWI_16QI vsx_xxsldwi_v16qi {} + + const vd __builtin_vsx_xxsldwi_2df (vd, vd, const int<2>); + XXSLDWI_2DF vsx_xxsldwi_v2df {} + + const vsll __builtin_vsx_xxsldwi_2di (vsll, vsll, const int<2>); + XXSLDWI_2DI vsx_xxsldwi_v2di {} + + const vf __builtin_vsx_xxsldwi_4sf (vf, vf, const int<2>); + XXSLDWI_4SF vsx_xxsldwi_v4sf {} + + const vsi __builtin_vsx_xxsldwi_4si (vsi, vsi, const int<2>); + XXSLDWI_4SI vsx_xxsldwi_v4si {} + + const vss __builtin_vsx_xxsldwi_8hi (vss, vss, const int<2>); + XXSLDWI_8HI vsx_xxsldwi_v8hi {} + + const vd __builtin_vsx_xxspltd_2df (vd, const int<1>); + XXSPLTD_V2DF vsx_xxspltd_v2df {} + + const vsll __builtin_vsx_xxspltd_2di (vsll, const int<1>); + XXSPLTD_V2DI vsx_xxspltd_v2di {} -- cgit v1.1 From 873273449a944d7796d08ce6fae06eabcab7bf65 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Wed, 11 Aug 2021 14:59:17 -0500 Subject: rs6000: Add available-everywhere and ancient builtins 2021-08-11 Bill Schmidt gcc/ * config/rs6000/rs6000-builtin-new.def: Add always, power5, and power6 stanzas. --- gcc/config/rs6000/rs6000-builtin-new.def | 76 ++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-builtin-new.def b/gcc/config/rs6000/rs6000-builtin-new.def index b5d3570..61f5b94 100644 --- a/gcc/config/rs6000/rs6000-builtin-new.def +++ b/gcc/config/rs6000/rs6000-builtin-new.def @@ -184,6 +184,82 @@ +; Builtins that have been around since time immemorial or are just +; considered available everywhere. +[always] + void __builtin_cpu_init (); + CPU_INIT nothing {cpu} + + bool __builtin_cpu_is (string); + CPU_IS nothing {cpu} + + bool __builtin_cpu_supports (string); + CPU_SUPPORTS nothing {cpu} + + unsigned long long __builtin_ppc_get_timebase (); + GET_TB rs6000_get_timebase {} + + double __builtin_mffs (); + MFFS rs6000_mffs {} + +; This thing really assumes long double == __ibm128, and I'm told it has +; been used as such within libgcc. 
Given that __builtin_pack_ibm128 +; exists for the same purpose, this should really not be used at all. +; TODO: Consider adding special handling for this to warn whenever +; long double is not __ibm128. + const long double __builtin_pack_longdouble (double, double); + PACK_TF packtf {} + + unsigned long __builtin_ppc_mftb (); + MFTB rs6000_mftb_di {32bit} + + void __builtin_mtfsb0 (const int<5>); + MTFSB0 rs6000_mtfsb0 {} + + void __builtin_mtfsb1 (const int<5>); + MTFSB1 rs6000_mtfsb1 {} + + void __builtin_mtfsf (const int<8>, double); + MTFSF rs6000_mtfsf {} + + const __ibm128 __builtin_pack_ibm128 (double, double); + PACK_IF packif {} + + void __builtin_set_fpscr_rn (const int[0,3]); + SET_FPSCR_RN rs6000_set_fpscr_rn {} + + const double __builtin_unpack_ibm128 (__ibm128, const int<1>); + UNPACK_IF unpackif {} + +; See above comments for __builtin_pack_longdouble. + const double __builtin_unpack_longdouble (long double, const int<1>); + UNPACK_TF unpacktf {} + + +; Builtins that have been around just about forever, but not quite. +[power5] + fpmath double __builtin_recipdiv (double, double); + RECIP recipdf3 {} + + fpmath float __builtin_recipdivf (float, float); + RECIPF recipsf3 {} + + fpmath double __builtin_rsqrt (double); + RSQRT rsqrtdf2 {} + + fpmath float __builtin_rsqrtf (float); + RSQRTF rsqrtsf2 {} + + +; Power6 builtins (ISA 2.05). +[power6] + const signed long __builtin_p6_cmpb (signed long, signed long); + CMPB cmpbdi3 {} + + const signed int __builtin_p6_cmpb_32 (signed int, signed int); + CMPB_32 cmpbsi3 {} + + ; AltiVec builtins. [altivec] const vsc __builtin_altivec_abs_v16qi (vsc); -- cgit v1.1 From 95e1eca43d106d821720744ac6ff1f5df41a1e78 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Wed, 11 Aug 2021 14:00:00 +0800 Subject: Combine avx_vec_concatv16si and avx512f_zero_extendv16hiv16si2_1 to avx512f_zero_extendv16hiv16si2_2. Add define_insn_and_split to combine avx_vec_concatv16si/2 and avx512f_zero_extendv16hiv16si2_1 since the latter already zero_extend the upper bits, similar for other patterns which are related to pmovzx{bw,wd,dq}. It will do optimization like - vmovdqa %ymm0, %ymm0 # 7 [c=4 l=6] avx_vec_concatv16si/2 vpmovzxwd %ymm0, %zmm0 # 22 [c=4 l=6] avx512f_zero_extendv16hiv16si2 ret # 25 [c=0 l=1] simple_return_internal gcc/ChangeLog: PR target/101846 * config/i386/sse.md (*avx2_zero_extendv16qiv16hi2_2): New post_reload define_insn_and_split. (*avx512bw_zero_extendv32qiv32hi2_2): Ditto. (*sse4_1_zero_extendv8qiv8hi2_4): Ditto. (*avx512f_zero_extendv16hiv16si2_2): Ditto. (*avx2_zero_extendv8hiv8si2_2): Ditto. (*sse4_1_zero_extendv4hiv4si2_4): Ditto. (*avx512f_zero_extendv8siv8di2_2): Ditto. (*avx2_zero_extendv4siv4di2_2): Ditto. (*sse4_1_zero_extendv2siv2di2_4): Ditto. (VI248_256, VI248_512, VI148_512, VI148_256, VI148_128): New mode iterator. gcc/testsuite/ChangeLog: PR target/101846 * gcc.target/i386/pr101846-1.c: New test. 
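As an illustration only (this is not the pr101846-1.c testcase, and it assumes the standard AVX-512F _mm512_cvtepu16_epi32 intrinsic from immintrin.h), a minimal C sketch of the zero extension that the vpmovzxwd form above implements:

#include <immintrin.h>

/* Zero-extend 16 unsigned 16-bit lanes of a 256-bit vector into the 32-bit
   lanes of a 512-bit vector (vpmovzxwd %ymm, %zmm).  The new
   define_insn_and_split patterns let combine drop a redundant
   vec_concat-with-zero, i.e. the "vmovdqa %ymm0, %ymm0" shown above, that
   could otherwise precede such a zero extension.  */
__m512i
widen_u16_to_u32 (__m256i x)
{
  return _mm512_cvtepu16_epi32 (x);
}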
--- gcc/config/i386/sse.md | 219 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 219 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 3957c86..3a7bbae 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -681,7 +681,12 @@ (define_mode_iterator VI124_128 [V16QI V8HI V4SI]) (define_mode_iterator VI24_128 [V8HI V4SI]) (define_mode_iterator VI248_128 [V8HI V4SI V2DI]) +(define_mode_iterator VI248_256 [V16HI V8SI V4DI]) +(define_mode_iterator VI248_512 [V32HI V16SI V8DI]) (define_mode_iterator VI48_128 [V4SI V2DI]) +(define_mode_iterator VI148_512 [V64QI V16SI V8DI]) +(define_mode_iterator VI148_256 [V32QI V8SI V4DI]) +(define_mode_iterator VI148_128 [V16QI V4SI V2DI]) ;; Various 256bit and 512 vector integer mode combinations (define_mode_iterator VI124_256 [V32QI V16HI V8SI]) @@ -18603,6 +18608,26 @@ operands[1] = lowpart_subreg (V16QImode, operands[1], V32QImode); }) +(define_insn_and_split "*avx2_zero_extendv16qiv16hi2_2" + [(set (match_operand:V32QI 0 "register_operand" "=v") + (vec_select:V32QI + (vec_concat:V64QI + (subreg:V32QI + (vec_concat:VI248_256 + (match_operand: 1 "nonimmediate_operand" "vm") + (match_operand: 2 "const0_operand" "C")) 0) + (match_operand:V32QI 3 "const0_operand" "C")) + (match_parallel 4 "pmovzx_parallel" + [(match_operand 5 "const_int_operand" "n")])))] + "TARGET_AVX2" + "#" + "&& reload_completed" + [(set (match_dup 0) (zero_extend:V16HI (match_dup 1)))] +{ + operands[0] = lowpart_subreg (V16HImode, operands[0], V32QImode); + operands[1] = lowpart_subreg (V16QImode, operands[1], mode); +}) + (define_expand "v16qiv16hi2" [(set (match_operand:V16HI 0 "register_operand") (any_extend:V16HI @@ -18637,6 +18662,26 @@ operands[1] = lowpart_subreg (V32QImode, operands[1], V64QImode); }) +(define_insn_and_split "*avx512bw_zero_extendv32qiv32hi2_2" + [(set (match_operand:V64QI 0 "register_operand" "=v") + (vec_select:V64QI + (vec_concat:V128QI + (subreg:V64QI + (vec_concat:VI248_512 + (match_operand: 1 "nonimmediate_operand" "vm") + (match_operand: 2 "const0_operand" "C")) 0) + (match_operand:V64QI 3 "const0_operand" "C")) + (match_parallel 4 "pmovzx_parallel" + [(match_operand 5 "const_int_operand" "n")])))] + "TARGET_AVX512BW" + "#" + "&& reload_completed" + [(set (match_dup 0) (zero_extend:V32HI (match_dup 1)))] +{ + operands[0] = lowpart_subreg (V32HImode, operands[0], V64QImode); + operands[1] = lowpart_subreg (V32QImode, operands[1], mode); +}) + (define_expand "v32qiv32hi2" [(set (match_operand:V32HI 0 "register_operand") (any_extend:V32HI @@ -18723,6 +18768,41 @@ } [(set_attr "isa" "noavx,noavx,avx")]) +(define_insn_and_split "*sse4_1_zero_extendv8qiv8hi2_4" + [(set (match_operand:V16QI 0 "register_operand" "=Yr,*x,Yw") + (vec_select:V16QI + (vec_concat:V32QI + (subreg:V16QI + (vec_concat:VI248_128 + (match_operand: 1 "vector_operand" "YrBm,*xBm,Ywm") + (match_operand: 2 "const0_operand" "C,C,C")) 0) + (match_operand:V16QI 3 "const0_operand" "C,C,C")) + (match_parallel 4 "pmovzx_parallel" + [(match_operand 5 "const_int_operand" "n,n,n")])))] + "TARGET_SSE4_1" + "#" + "&& reload_completed" + [(set (match_dup 0) + (zero_extend:V8HI + (vec_select:V8QI + (match_dup 1) + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3) + (const_int 4) (const_int 5) + (const_int 6) (const_int 7)]))))] +{ + operands[0] = lowpart_subreg (V8HImode, operands[0], V16QImode); + if (MEM_P (operands[1])) + { + operands[1] = lowpart_subreg (V8QImode, operands[1], mode); + operands[1] = 
gen_rtx_ZERO_EXTEND (V8HImode, operands[1]); + emit_insn (gen_rtx_SET (operands[0], operands[1])); + DONE; + } + operands[1] = lowpart_subreg (V16QImode, operands[1], mode); +} + [(set_attr "isa" "noavx,noavx,avx")]) + (define_expand "v8qiv8hi2" [(set (match_operand:V8HI 0 "register_operand") (any_extend:V8HI @@ -18913,6 +18993,26 @@ operands[1] = lowpart_subreg (V16HImode, operands[1], V32HImode); }) +(define_insn_and_split "*avx512f_zero_extendv16hiv16si2_2" + [(set (match_operand:V32HI 0 "register_operand" "=v") + (vec_select:V32HI + (vec_concat:V64HI + (subreg:V32HI + (vec_concat:VI148_512 + (match_operand: 1 "nonimmediate_operand" "vm") + (match_operand: 2 "const0_operand" "C")) 0) + (match_operand:V32HI 3 "const0_operand" "C")) + (match_parallel 4 "pmovzx_parallel" + [(match_operand 5 "const_int_operand" "n")])))] + "TARGET_AVX512F" + "#" + "&& reload_completed" + [(set (match_dup 0) (zero_extend:V16SI (match_dup 1)))] +{ + operands[0] = lowpart_subreg (V16SImode, operands[0], V32HImode); + operands[1] = lowpart_subreg (V16HImode, operands[1], mode); +}) + (define_insn "avx2_v8hiv8si2" [(set (match_operand:V8SI 0 "register_operand" "=v") (any_extend:V8SI @@ -18947,6 +19047,27 @@ operands[1] = lowpart_subreg (V8HImode, operands[1], V16HImode); }) +(define_insn_and_split "*avx2_zero_extendv8hiv8si2_2" + [(set (match_operand:V16HI 0 "register_operand" "=v") + (vec_select:V16HI + (vec_concat:V32HI + (subreg:V16HI + (vec_concat:VI148_256 + (match_operand: 1 "nonimmediate_operand" "vm") + (match_operand: 2 "const0_operand" "C")) 0) + (match_operand:V16HI 3 "const0_operand" "C")) + (match_parallel 4 "pmovzx_parallel" + [(match_operand 5 "const_int_operand" "n")])))] + "TARGET_AVX2" + "#" + "&& reload_completed" + [(set (match_dup 0) (zero_extend:V8SI (match_dup 1)))] +{ + operands[0] = lowpart_subreg (V8SImode, operands[0], V16HImode); + operands[1] = lowpart_subreg (V8HImode, operands[1], mode); +}) + + (define_insn "sse4_1_v4hiv4si2" [(set (match_operand:V4SI 0 "register_operand" "=Yr,*x,v") (any_extend:V4SI @@ -19036,6 +19157,39 @@ } [(set_attr "isa" "noavx,noavx,avx")]) +(define_insn_and_split "*sse4_1_zero_extendv4hiv4si2_4" + [(set (match_operand:V8HI 0 "register_operand" "=Yr,*x,v") + (vec_select:V8HI + (vec_concat:V16HI + (subreg:V8HI + (vec_concat:VI148_128 + (match_operand: 1 "vector_operand" "YrBm,*xBm,vm") + (match_operand: 2 "const0_operand" "C,C,C")) 0) + (match_operand:V8HI 3 "const0_operand" "C,C,C")) + (match_parallel 4 "pmovzx_parallel" + [(match_operand 5 "const_int_operand" "n,n,n")])))] + "TARGET_SSE4_1" + "#" + "&& reload_completed" + [(set (match_dup 0) + (zero_extend:V4SI + (vec_select:V4HI + (match_dup 1) + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3)]))))] +{ + operands[0] = lowpart_subreg (V4SImode, operands[0], V8HImode); + if (MEM_P (operands[1])) + { + operands[1] = lowpart_subreg (V4HImode, operands[1], mode); + operands[1] = gen_rtx_ZERO_EXTEND (V4SImode, operands[1]); + emit_insn (gen_rtx_SET (operands[0], operands[1])); + DONE; + } + operands[1] = lowpart_subreg (V8HImode, operands[1], mode); +} + [(set_attr "isa" "noavx,noavx,avx")]) + (define_insn "avx512f_v8qiv8di2" [(set (match_operand:V8DI 0 "register_operand" "=v") (any_extend:V8DI @@ -19346,6 +19500,24 @@ operands[1] = lowpart_subreg (V8SImode, operands[1], V16SImode); }) +(define_insn_and_split "*avx512f_zero_extendv8siv8di2_2" + [(set (match_operand:V16SI 0 "register_operand" "=v") + (vec_select:V16SI + (vec_concat:V32SI + (vec_concat:V16SI + (match_operand:V8SI 1 
"nonimmediate_operand" "vm") + (match_operand:V8SI 2 "const0_operand" "C")) + (match_operand:V16SI 3 "const0_operand" "C")) + (match_parallel 4 "pmovzx_parallel" + [(match_operand 5 "const_int_operand" "n")])))] + "TARGET_AVX512F" + "#" + "&& reload_completed" + [(set (match_dup 0) (zero_extend:V8DI (match_dup 1)))] +{ + operands[0] = lowpart_subreg (V8DImode, operands[0], V16SImode); +}) + (define_expand "v8siv8di2" [(set (match_operand:V8DI 0 "register_operand" "=v") (any_extend:V8DI @@ -19380,6 +19552,24 @@ operands[1] = lowpart_subreg (V4SImode, operands[1], V8SImode); }) +(define_insn_and_split "*avx2_zero_extendv4siv4di2_2" + [(set (match_operand:V8SI 0 "register_operand" "=v") + (vec_select:V8SI + (vec_concat:V16SI + (vec_concat:V8SI + (match_operand:V4SI 1 "nonimmediate_operand" "vm") + (match_operand:V4SI 2 "const0_operand" "C")) + (match_operand:V8SI 3 "const0_operand" "C")) + (match_parallel 4 "pmovzx_parallel" + [(match_operand 5 "const_int_operand" "n")])))] + "TARGET_AVX2" + "#" + "&& reload_completed" + [(set (match_dup 0) (zero_extend:V4DI (match_dup 1)))] +{ + operands[0] = lowpart_subreg (V4DImode, operands[0], V8SImode); +}) + (define_expand "v4siv4di2" [(set (match_operand:V4DI 0 "register_operand") (any_extend:V4DI @@ -19456,6 +19646,35 @@ } [(set_attr "isa" "noavx,noavx,avx")]) +(define_insn_and_split "*sse4_1_zero_extendv2siv2di2_4" + [(set (match_operand:V4SI 0 "register_operand" "=Yr,*x,v") + (vec_select:V4SI + (vec_concat:V8SI + (vec_concat:V4SI + (match_operand:V2SI 1 "vector_operand" "YrBm, *xBm, vm") + (match_operand:V2SI 2 "const0_operand" "C,C,C")) + (match_operand:V4SI 3 "const0_operand" "C,C,C")) + (match_parallel 4 "pmovzx_parallel" + [(match_operand 5 "const_int_operand" "n,n,n")])))] + "TARGET_SSE4_1" + "#" + "&& reload_completed" + [(set (match_dup 0) + (zero_extend:V2DI + (vec_select:V2SI (match_dup 1) + (parallel [(const_int 0) (const_int 1)]))))] +{ + operands[0] = lowpart_subreg (V2DImode, operands[0], V4SImode); + if (MEM_P (operands[1])) + { + operands[1] = gen_rtx_ZERO_EXTEND (V2DImode, operands[1]); + emit_insn (gen_rtx_SET (operands[0], operands[1])); + DONE; + } + operands[1] = lowpart_subreg (V4SImode, operands[1], V2SImode); +} + [(set_attr "isa" "noavx,noavx,avx")]) + (define_expand "v2siv2di2" [(set (match_operand:V2DI 0 "register_operand") (any_extend:V2DI -- cgit v1.1 From 2bdf17de1d0ad7a75d3474e672a3a2110919862f Mon Sep 17 00:00:00 2001 From: Eric Botcazou Date: Thu, 12 Aug 2021 09:30:31 +0200 Subject: Make -no-pie option work for native Windows Binutils 2.36/2.37 generate PIE executables by default on native Windows (because --dynamicbase is the default) so it makes sense to have a simple way to counter that and -no-pie seems appropriate, all the more so that it is automatically passed when building the compiler itself. gcc/ * configure.ac (PE linker --disable-dynamicbase support): New check. * configure: Regenerate. * config.in: Likewise. * config/i386/mingw32.h (LINK_SPEC_DISABLE_DYNAMICBASE): New define. (LINK_SPEC): Use it. * config/i386/mingw-w64.h (LINK_SPEC_DISABLE_DYNAMICBASE): Likewise. (LINK_SPEC): Likewise. --- gcc/config/i386/mingw-w64.h | 9 +++++++++ gcc/config/i386/mingw32.h | 8 ++++++++ 2 files changed, 17 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/mingw-w64.h b/gcc/config/i386/mingw-w64.h index 0cec6b0..6cc7ac5 100644 --- a/gcc/config/i386/mingw-w64.h +++ b/gcc/config/i386/mingw-w64.h @@ -89,6 +89,14 @@ along with GCC; see the file COPYING3. 
If not see # define LINK_SPEC_LARGE_ADDR_AWARE "" #endif +#undef LINK_SPEC_DISABLE_DYNAMICBASE +#if HAVE_LD_PE_DISABLE_DYNAMICBASE +# define LINK_SPEC_DISABLE_DYNAMICBASE \ + "%{!shared:%{!mdll:%{no-pie:--disable-dynamicbase}}}" +#else +# define LINK_SPEC_DISABLE_DYNAMICBASE "" +#endif + #undef LINK_SPEC #define LINK_SPEC SUB_LINK_SPEC " %{mwindows:--subsystem windows} \ %{mconsole:--subsystem console} \ @@ -97,6 +105,7 @@ along with GCC; see the file COPYING3. If not see %{static:-Bstatic} %{!static:-Bdynamic} \ %{shared|mdll: " SUB_LINK_ENTRY " --enable-auto-image-base} \ " LINK_SPEC_LARGE_ADDR_AWARE "\ + " LINK_SPEC_DISABLE_DYNAMICBASE "\ %(shared_libgcc_undefs)" /* Enable sincos optimization, overriding cygming.h. sincos, sincosf diff --git a/gcc/config/i386/mingw32.h b/gcc/config/i386/mingw32.h index 36e7bae..779c933 100644 --- a/gcc/config/i386/mingw32.h +++ b/gcc/config/i386/mingw32.h @@ -148,6 +148,13 @@ along with GCC; see the file COPYING3. If not see "%{!shared:%{!mdll:%{!m64:--large-address-aware}}}" #endif +#if HAVE_LD_PE_DISABLE_DYNAMICBASE +# define LINK_SPEC_DISABLE_DYNAMICBASE \ + "%{!shared:%{!mdll:%{no-pie:--disable-dynamicbase}}}" +#else +# define LINK_SPEC_DISABLE_DYNAMICBASE "" +#endif + #define LINK_SPEC "%{mwindows:--subsystem windows} \ %{mconsole:--subsystem console} \ %{shared: %{mdll: %eshared and mdll are not compatible}} \ @@ -155,6 +162,7 @@ along with GCC; see the file COPYING3. If not see %{static:-Bstatic} %{!static:-Bdynamic} \ %{shared|mdll: " SUB_LINK_ENTRY " --enable-auto-image-base} \ " LINK_SPEC_LARGE_ADDR_AWARE "\ + " LINK_SPEC_DISABLE_DYNAMICBASE "\ %(shared_libgcc_undefs)" /* Include in the mingw32 libraries with libgcc */ -- cgit v1.1 From 04b4f3152593f85b05974528d1607619dd77d702 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Thu, 12 Aug 2021 11:26:57 +0200 Subject: i386: Fix up V32HImode permutations with -mno-avx512bw [PR101860] My patch from yesterday apparently broke some V32HImode permutations as the testcase shows. The first function assumed it would never be called in d->testing_p mode and so went right away into emitting the code. And the second one assumed V32HImode would never reach it, which now can for the !TARGET_AVX512BW case. We don't have a instruction in that case though. 2021-08-12 Jakub Jelinek PR target/101860 * config/i386/i386-expand.c (ix86_expand_vec_one_operand_perm_avx512): If d->testing_p, return true after performing checks instead of actually expanding the insn. (expand_vec_perm_broadcast_1): Handle V32HImode - assert !TARGET_AVX512BW and return false. * gcc.target/i386/avx512f-pr101860.c: New test. 
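A hypothetical reproducer sketch (the names are invented and this is not the actual avx512f-pr101860.c testcase) of the kind of V32HImode permutation involved, compiled with -O2 -mavx512f -mno-avx512bw:

typedef unsigned short v32hi __attribute__ ((vector_size (64)));

/* Broadcast element 1 of a 32 x 16-bit vector.  Without AVX512BW there is
   no single V32HImode broadcast instruction, so the expander has to reject
   the permutation cleanly (return false) instead of asserting, and must not
   emit code while only testing feasibility (d->testing_p).  */
v32hi
broadcast_elt1 (v32hi x)
{
  return __builtin_shuffle (x, (v32hi) { 1, 1, 1, 1, 1, 1, 1, 1,
                                         1, 1, 1, 1, 1, 1, 1, 1,
                                         1, 1, 1, 1, 1, 1, 1, 1,
                                         1, 1, 1, 1, 1, 1, 1, 1 });
}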
--- gcc/config/i386/i386-expand.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index c708b33..a652b25 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -18116,6 +18116,9 @@ ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d) return false; } + if (d->testing_p) + return true; + target = d->target; op0 = d->op0; for (int i = 0; i < d->nelt; ++i) @@ -20481,6 +20484,10 @@ expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d) gcc_assert (!TARGET_AVX2 || d->perm[0]); return false; + case E_V32HImode: + gcc_assert (!TARGET_AVX512BW); + return false; + default: gcc_unreachable (); } -- cgit v1.1 From 2d7967a10c2f3b4652f77a1a2119ba03b3472266 Mon Sep 17 00:00:00 2001 From: Claudiu Zissulescu Date: Thu, 12 Aug 2021 14:21:22 +0300 Subject: arc: Small data doesn't need fcommon option ARC backend is defaulting to -fcommon. This is no longer needed; remove it. gcc/ 2021-08-12 Claudiu Zissulescu * common/config/arc/arc-common.c (arc_option_init_struct): Remove fno-common reference. * config/arc/arc.c (arc_override_options): Remove overriding of flag_no_common. Signed-off-by: Claudiu Zissulescu --- gcc/config/arc/arc.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arc/arc.c b/gcc/config/arc/arc.c index 69f6ae4..92797db 100644 --- a/gcc/config/arc/arc.c +++ b/gcc/config/arc/arc.c @@ -1440,9 +1440,6 @@ arc_override_options (void) if (flag_pic) target_flags |= MASK_NO_SDATA_SET; - if (flag_no_common == 255) - flag_no_common = !TARGET_NO_SDATA_SET; - /* Check for small data option */ if (!global_options_set.x_g_switch_value && !TARGET_NO_SDATA_SET) g_switch_value = TARGET_LL64 ? 8 : 4; -- cgit v1.1 From 8c8df06e46493f6cb55333db72fa1802279b48b4 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 12 Aug 2021 21:18:46 +0200 Subject: [i386] Introduce scalar version of avx512f_vmscalef. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 2021-08-12 Uroš Bizjak gcc/ PR target/98309 * config/i386/i386.md (avx512f_scalef2): New insn pattern. (ldexp3): Use avx512f_scalef2. (UNSPEC_SCALEF): Move from ... * config/i386/sse.md (UNSPEC_SCALEF): ... here.
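An illustrative scalar ldexp that can now use the new pattern (a sketch only; whether a given compilation actually selects this path depends on the ldexp expander's guards, e.g. SSE math and any unsafe-math requirements, which are assumptions here), compiled for an AVX-512F target:

#include <math.h>

/* Scale x by 2**e.  With the new avx512f_scalef pattern this can expand to
   a single vscalefsd on the scalar operands, rather than routing them
   through vector-mode subregs as the old vmscalef-based expansion did.  */
double
scale_by_pow2 (double x, int e)
{
  return ldexp (x, e);
}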
--- gcc/config/i386/i386.md | 27 +++++++++++++++++++-------- gcc/config/i386/sse.md | 1 - 2 files changed, 19 insertions(+), 9 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 56b09c5..4a8e8fe 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -125,6 +125,9 @@ UNSPEC_RSQRT UNSPEC_PSADBW + ;; For AVX512F support + UNSPEC_SCALEF + ;; Generic math support UNSPEC_COPYSIGN UNSPEC_XORSIGN @@ -17894,6 +17897,17 @@ DONE; }) +(define_insn "avx512f_scalef2" + [(set (match_operand:MODEF 0 "register_operand" "=v") + (unspec:MODEF + [(match_operand:MODEF 1 "register_operand" "v") + (match_operand:MODEF 2 "nonimmediate_operand" "vm")] + UNSPEC_SCALEF))] + "TARGET_AVX512F" + "vscalef\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "prefix" "evex") + (set_attr "mode" "")]) + (define_expand "ldexpxf3" [(match_operand:XF 0 "register_operand") (match_operand:XF 1 "register_operand") @@ -17924,15 +17938,12 @@ if (TARGET_AVX512F && TARGET_SSE_MATH) { rtx op2 = gen_reg_rtx (mode); - emit_insn (gen_floatsi2 (op2, operands[2])); - operands[0] = lowpart_subreg (mode, operands[0], mode); - if (MEM_P (operands[1])) + + if (!nonimmediate_operand (operands[1], mode)) operands[1] = force_reg (mode, operands[1]); - operands[1] = lowpart_subreg (mode, operands[1], mode); - op2 = lowpart_subreg (mode, op2, mode); - emit_insn (gen_avx512f_vmscalef (operands[0], - operands[1], - op2)); + + emit_insn (gen_floatsi2 (op2, operands[2])); + emit_insn (gen_avx512f_scalef2 (operands[0], operands[1], op2)); } else { diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 3a7bbae..60e69a4 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -92,7 +92,6 @@ UNSPEC_RCP14 UNSPEC_RSQRT14 UNSPEC_FIXUPIMM - UNSPEC_SCALEF UNSPEC_VTERNLOG UNSPEC_GETEXP UNSPEC_GETMANT -- cgit v1.1 From 58eec9908c01e2f5a6eb9cd76bbf037bbe2cf5e6 Mon Sep 17 00:00:00 2001 From: Michael Meissner Date: Fri, 13 Aug 2021 19:43:27 -0400 Subject: Fix xxeval predicates (PR 99921). I noticed that the xxeval built-in function used the altivec_register_operand predicate. Since it takes vsx registers, this might force the register allocate to issue a move when it could use a traditional floating point register. This patch fixes that. 2021-08-13 Michael Meissner gcc/ PR target/99921 * config/rs6000/altivec.md (xxeval): Use register_predicate instead of altivec_register_predicate. 
--- gcc/config/rs6000/altivec.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index d70c17e..fd86c300 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -3875,9 +3875,9 @@ (define_insn "xxeval" [(set (match_operand:V2DI 0 "register_operand" "=wa") - (unspec:V2DI [(match_operand:V2DI 1 "altivec_register_operand" "wa") - (match_operand:V2DI 2 "altivec_register_operand" "wa") - (match_operand:V2DI 3 "altivec_register_operand" "wa") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "wa") + (match_operand:V2DI 2 "register_operand" "wa") + (match_operand:V2DI 3 "register_operand" "wa") (match_operand:QI 4 "u8bit_cint_operand" "n")] UNSPEC_XXEVAL))] "TARGET_POWER10" -- cgit v1.1 From 240f07805db27cfc746276039c5edccb4c031070 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Sat, 14 Aug 2021 11:44:46 +0200 Subject: i386: Fix ICE with V64QImode broadcast permutation with -mavx512f -mno-avx512bw The testcase shows another problem, for TARGET_AVX512BW we have a single insn doing broadcast from the first element, but don't have one for broadcast of 2nd+ element (so for d->perm[0] we must return false), but for TARGET_AVX512F && !TARGET_AVX512BW we don't even have support for that other broadcast. V64QImode case was just added to the AVX2 cases which had gcc_assert (!TARGET_AVX2 || d->perm[0]); but for V64QImode we actually need gcc_assert (!TARGET_AVX512BW || d->perm[0]); 2021-08-14 Jakub Jelinek PR target/101896 * config/i386/i386-expand.c (expand_vec_perm_broadcast_1) : For this mode assert !TARGET_AVX512BW || d->perm[0] rather than !TARGET_AVX2 || d->perm[0]. * gcc.target/i386/avx512f-pr101896.c: New test. --- gcc/config/i386/i386-expand.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index a652b25..4d7349c 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -20474,7 +20474,6 @@ expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d) emit_move_insn (d->target, gen_lowpart (d->vmode, dest)); return true; - case E_V64QImode: case E_V32QImode: case E_V16HImode: case E_V8SImode: @@ -20484,6 +20483,10 @@ expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d) gcc_assert (!TARGET_AVX2 || d->perm[0]); return false; + case E_V64QImode: + gcc_assert (!TARGET_AVX512BW || d->perm[0]); + return false; + case E_V32HImode: gcc_assert (!TARGET_AVX512BW); return false; -- cgit v1.1 From eff8110674ef193481d3657456a262beeb9951ff Mon Sep 17 00:00:00 2001 From: Stafford Horne Date: Tue, 6 Apr 2021 05:47:17 +0900 Subject: or1k: Add mcmodel option to handle large GOTs When building libgeos we get an error with: linux-uclibc/9.3.0/crtbeginS.o: in function `__do_global_dtors_aux': crtstuff.c:(.text+0x118): relocation truncated to fit: R_OR1K_GOT16 against symbol `__cxa_finalize' defined in .text section in /home/shorne/work/openrisc/3eb9f9d0f6d8274b2d19753c006bd83f7d536e3c/output/host/or1k-buildroot-linux-uclibc/sysroot/lib/libc.so. This is caused by GOT code having a limit of 64k. In OpenRISC this looks to be the only relocation code pattern to be limited to 64k. This patch allows specifying a new option -mcmodel=large which can be used to generate 2 more instructions to construct 32-bit addresses for up to 4G GOTs. gcc/ChangeLog: PR target/99783 * config/or1k/or1k-opts.h: New file. 
* config/or1k/or1k.c (or1k_legitimize_address_1, print_reloc): Support generating gotha relocations if -mcmodel=large is specified. * config/or1k/or1k.h (TARGET_CMODEL_SMALL, TARGET_CMODEL_LARGE): New macros. * config/or1k/or1k.opt (mcmodel=): New option. * doc/invoke.texi (OpenRISC Options): Document mcmodel. --- gcc/config/or1k/or1k-opts.h | 30 ++++++++++++++++++++++++++++++ gcc/config/or1k/or1k.c | 11 +++++++++-- gcc/config/or1k/or1k.h | 7 +++++++ gcc/config/or1k/or1k.opt | 19 +++++++++++++++++++ 4 files changed, 65 insertions(+), 2 deletions(-) create mode 100644 gcc/config/or1k/or1k-opts.h (limited to 'gcc/config') diff --git a/gcc/config/or1k/or1k-opts.h b/gcc/config/or1k/or1k-opts.h new file mode 100644 index 0000000..f791b89 --- /dev/null +++ b/gcc/config/or1k/or1k-opts.h @@ -0,0 +1,30 @@ +/* Definitions for option handling for OpenRISC. + Copyright (C) 2021 Free Software Foundation, Inc. + Contributed by Stafford Horne. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +#ifndef GCC_OR1K_OPTS_H +#define GCC_OR1K_OPTS_H + +/* The OpenRISC code generation models available. */ +enum or1k_cmodel_type { + CMODEL_SMALL, + CMODEL_LARGE +}; + +#endif /* GCC_OR1K_OPTS_H */ diff --git a/gcc/config/or1k/or1k.c b/gcc/config/or1k/or1k.c index e772a7a..27d3fa1 100644 --- a/gcc/config/or1k/or1k.c +++ b/gcc/config/or1k/or1k.c @@ -750,7 +750,14 @@ or1k_legitimize_address_1 (rtx x, rtx scratch) { base = gen_sym_unspec (base, UNSPEC_GOT); crtl->uses_pic_offset_table = 1; - t2 = gen_rtx_LO_SUM (Pmode, pic_offset_table_rtx, base); + if (TARGET_CMODEL_LARGE) + { + emit_insn (gen_rtx_SET (t1, gen_rtx_HIGH (Pmode, base))); + emit_insn (gen_add3_insn (t1, t1, pic_offset_table_rtx)); + t2 = gen_rtx_LO_SUM (Pmode, t1, base); + } + else + t2 = gen_rtx_LO_SUM (Pmode, pic_offset_table_rtx, base); t2 = gen_const_mem (Pmode, t2); emit_insn (gen_rtx_SET (t1, t2)); base = t1; @@ -1089,7 +1096,7 @@ print_reloc (FILE *stream, rtx x, HOST_WIDE_INT add, reloc_kind kind) no special markup. */ static const char * const relocs[RKIND_MAX][RTYPE_MAX] = { { "lo", "got", "gotofflo", "tpofflo", "gottpofflo", "tlsgdlo" }, - { "ha", NULL, "gotoffha", "tpoffha", "gottpoffha", "tlsgdhi" }, + { "ha", "gotha", "gotoffha", "tpoffha", "gottpoffha", "tlsgdhi" }, }; reloc_type type = RTYPE_DIRECT; diff --git a/gcc/config/or1k/or1k.h b/gcc/config/or1k/or1k.h index fe01ab8..669907e 100644 --- a/gcc/config/or1k/or1k.h +++ b/gcc/config/or1k/or1k.h @@ -21,6 +21,8 @@ #ifndef GCC_OR1K_H #define GCC_OR1K_H +#include "config/or1k/or1k-opts.h" + /* Names to predefine in the preprocessor for this target machine. */ #define TARGET_CPU_CPP_BUILTINS() \ do \ @@ -37,6 +39,11 @@ } \ while (0) +#define TARGET_CMODEL_SMALL \ + (or1k_code_model == CMODEL_SMALL) +#define TARGET_CMODEL_LARGE \ + (or1k_code_model == CMODEL_LARGE) + /* Storage layout. 
*/ #define DEFAULT_SIGNED_CHAR 1 diff --git a/gcc/config/or1k/or1k.opt b/gcc/config/or1k/or1k.opt index 6bd0f3e..cc23e3b 100644 --- a/gcc/config/or1k/or1k.opt +++ b/gcc/config/or1k/or1k.opt @@ -21,6 +21,9 @@ ; See the GCC internals manual (options.texi) for a description of ; this file's format. +HeaderInclude +config/or1k/or1k-opts.h + mhard-div Target RejectNegative InverseMask(SOFT_DIV) Enable generation of hardware divide (l.div, l.divu) instructions. This is the @@ -63,6 +66,22 @@ When -mhard-float is selected, enables generation of unordered floating point compare and set flag (lf.sfun*) instructions. By default functions from libgcc are used to perform unordered floating point compare and set flag operations. +mcmodel= +Target RejectNegative Joined Enum(or1k_cmodel_type) Var(or1k_code_model) Init(CMODEL_SMALL) +Specify the code model used for accessing memory addresses. Specifying large +enables generating binaries with large global offset tables. By default the +value is small. + +Enum +Name(or1k_cmodel_type) Type(enum or1k_cmodel_type) +Known code model types (for use with the -mcmodel= option): + +EnumValue +Enum(or1k_cmodel_type) String(small) Value(CMODEL_SMALL) + +EnumValue +Enum(or1k_cmodel_type) String(large) Value(CMODEL_LARGE) + mcmov Target RejectNegative Mask(CMOV) Enable generation of conditional move (l.cmov) instructions. By default the -- cgit v1.1 From 882f1d58bfa56737ff2de84c3cd1e0acfc318b86 Mon Sep 17 00:00:00 2001 From: Jeff Law Date: Sun, 15 Aug 2021 00:13:23 -0400 Subject: Improve many SImode shifts on the H8/300H As I've mentioned before, the H8/300H can only shift a single bit position at a time. Naturally this means many shifts are implemented as loops. There's a variety of special cases that we can do without loops by using rotates, sub-word moves, etc. The general guidance for the port has been to only use inline or special sequences if they're shorter or just one instruction longer than the loop. This was pretty reasonable guidance for QI/HI mode. It was relaxed a bit about 10 years ago for HImode in particular where the kpit team realized they could save 50-100 cycles for some shifts by allowing 2 instructions of code growth over the loop implementation. But they only re-tuned HImode shifts. There's even bigger benefits for re-tuning SImode shifts. There's cases where we can save close to 200 cycles by allowing 2 additional instructions. This patch re-tunes SImode shifts on the H8/300H primarily by inlining more often or using a special sequence + inlining for residuals. Both cases were already supported and this just uses those existing capabilities more often, so it was trivial to implement. I think there's some cases were entirely new special sequences could be used, but I haven't tried those yet. gcc/ * config/h8300/h8300.c (shift_alg_si): Retune H8/300H shifts to allow a bit more code growth, saving many dozens of cycles. (h8300_option_override): Adjus shift_alg_si if optimizing for code size. (get_shift_alg): Use special + inline shifts for residuals in more cases. 
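To make the effect concrete, here is a hypothetical example (not taken from the patch) of a shift that changes category: on the H8/300H, where long is 32 bits, a shift like this was previously emitted as a loop, and with the retuned tables it can use the special shift-by-16 sequence plus a short inline residual, at the cost of a couple of extra instructions:

unsigned long
shift_left_21 (unsigned long x)
{
  /* Shift count 21: previously SHIFT_LOOP on the H8/300H, now a
     special shift-by-16 sequence followed by 5 inline single-bit shifts.  */
  return x << 21;
}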
--- gcc/config/h8300/h8300.c | 52 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 41 insertions(+), 11 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/h8300/h8300.c b/gcc/config/h8300/h8300.c index d2f6548..7959ad1 100644 --- a/gcc/config/h8300/h8300.c +++ b/gcc/config/h8300/h8300.c @@ -228,18 +228,18 @@ static enum shift_alg shift_alg_si[2][3][32] = { /* 8 9 10 11 12 13 14 15 */ /* 16 17 18 19 20 21 22 23 */ /* 24 25 26 27 28 29 30 31 */ - { INL, INL, INL, INL, INL, LOP, LOP, LOP, + { INL, INL, INL, INL, INL, INL, INL, LOP, SPC, LOP, LOP, LOP, LOP, LOP, LOP, SPC, - SPC, SPC, SPC, SPC, LOP, LOP, LOP, LOP, - SPC, LOP, LOP, LOP, SPC, SPC, SPC, SPC }, /* SHIFT_ASHIFT */ - { INL, INL, INL, INL, INL, LOP, LOP, LOP, + SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC, + SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC }, /* SHIFT_ASHIFT */ + { INL, INL, INL, INL, INL, INL, INL, LOP, SPC, LOP, LOP, LOP, LOP, LOP, LOP, SPC, - SPC, SPC, SPC, SPC, LOP, LOP, LOP, LOP, - SPC, LOP, LOP, LOP, SPC, SPC, SPC, SPC }, /* SHIFT_LSHIFTRT */ - { INL, INL, INL, INL, INL, LOP, LOP, LOP, + SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC, + SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC }, /* SHIFT_LSHIFTRT */ + { INL, INL, INL, INL, INL, INL, INL, LOP, SPC, LOP, LOP, LOP, LOP, LOP, LOP, LOP, - SPC, SPC, SPC, SPC, LOP, LOP, LOP, LOP, - SPC, LOP, LOP, LOP, LOP, LOP, LOP, SPC }, /* SHIFT_ASHIFTRT */ + SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC, + SPC, SPC, SPC, SPC, LOP, LOP, LOP, SPC }, /* SHIFT_ASHIFTRT */ }, { /* TARGET_H8300S */ @@ -343,6 +343,36 @@ h8300_option_override (void) shift_alg_hi[H8_300H][SHIFT_ASHIFTRT][13] = SHIFT_LOOP; shift_alg_hi[H8_300H][SHIFT_ASHIFTRT][14] = SHIFT_LOOP; + shift_alg_si[H8_300H][SHIFT_ASHIFT][5] = SHIFT_LOOP; + shift_alg_si[H8_300H][SHIFT_ASHIFT][6] = SHIFT_LOOP; + shift_alg_si[H8_300H][SHIFT_ASHIFT][20] = SHIFT_LOOP; + shift_alg_si[H8_300H][SHIFT_ASHIFT][21] = SHIFT_LOOP; + shift_alg_si[H8_300H][SHIFT_ASHIFT][22] = SHIFT_LOOP; + shift_alg_si[H8_300H][SHIFT_ASHIFT][23] = SHIFT_LOOP; + shift_alg_si[H8_300H][SHIFT_ASHIFT][25] = SHIFT_LOOP; + shift_alg_si[H8_300H][SHIFT_ASHIFT][26] = SHIFT_LOOP; + shift_alg_si[H8_300H][SHIFT_ASHIFT][27] = SHIFT_LOOP; + + shift_alg_si[H8_300H][SHIFT_LSHIFTRT][5] = SHIFT_LOOP; + shift_alg_si[H8_300H][SHIFT_LSHIFTRT][6] = SHIFT_LOOP; + shift_alg_si[H8_300H][SHIFT_LSHIFTRT][20] = SHIFT_LOOP; + shift_alg_si[H8_300H][SHIFT_LSHIFTRT][21] = SHIFT_LOOP; + shift_alg_si[H8_300H][SHIFT_LSHIFTRT][22] = SHIFT_LOOP; + shift_alg_si[H8_300H][SHIFT_LSHIFTRT][23] = SHIFT_LOOP; + shift_alg_si[H8_300H][SHIFT_LSHIFTRT][25] = SHIFT_LOOP; + shift_alg_si[H8_300H][SHIFT_LSHIFTRT][26] = SHIFT_LOOP; + shift_alg_si[H8_300H][SHIFT_LSHIFTRT][27] = SHIFT_LOOP; + + shift_alg_si[H8_300H][SHIFT_ASHIFTRT][5] = SHIFT_LOOP; + shift_alg_si[H8_300H][SHIFT_ASHIFTRT][6] = SHIFT_LOOP; + shift_alg_si[H8_300H][SHIFT_ASHIFTRT][20] = SHIFT_LOOP; + shift_alg_si[H8_300H][SHIFT_ASHIFTRT][21] = SHIFT_LOOP; + shift_alg_si[H8_300H][SHIFT_ASHIFTRT][22] = SHIFT_LOOP; + shift_alg_si[H8_300H][SHIFT_ASHIFTRT][23] = SHIFT_LOOP; + shift_alg_si[H8_300H][SHIFT_ASHIFTRT][25] = SHIFT_LOOP; + shift_alg_si[H8_300H][SHIFT_ASHIFTRT][26] = SHIFT_LOOP; + shift_alg_si[H8_300H][SHIFT_ASHIFTRT][27] = SHIFT_LOOP; + /* H8S */ shift_alg_hi[H8_S][SHIFT_ASHIFTRT][14] = SHIFT_LOOP; } @@ -3784,7 +3814,7 @@ get_shift_alg (enum shift_type shift_type, enum shift_mode shift_mode, gcc_unreachable (); } } - else if ((TARGET_H8300H && count >= 16 && count <= 19) + else if ((TARGET_H8300H && count >= 16 && count <= 23) || (TARGET_H8300S && count 
>= 16 && count <= 21)) { info->remainder = count - 16; @@ -3834,7 +3863,7 @@ get_shift_alg (enum shift_type shift_type, enum shift_mode shift_mode, goto end; } } - else if ((TARGET_H8300H && count == 24) + else if ((TARGET_H8300H && count >= 24 || count <= 27) || (TARGET_H8300S && count >= 24 && count <= 25)) { info->remainder = count - 24; -- cgit v1.1 From 34ce7f7a9a64dd69dd6a77dfd4a77406c3c71014 Mon Sep 17 00:00:00 2001 From: Clément Chigot Date: Thu, 12 Aug 2021 13:17:15 -0400 Subject: aix: 64 bit AIX TLS libpthread dependency. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 64-bit XCOFF files will generate TLS accesses, with the local-exec or global-exec models, through an access to R13. Thus, there isn't any reference to a TLS symbol. The problem is that this allows programs with TLS to be compiled and linked even without -pthread. Most of the time, it will result in a segfault when trying to access a TLS variable. But sometimes, it might cause memory corruption. This patch forces a reference to __tls_get_addr() to ensure the link will fail without -pthread. gcc/ChangeLog: 2021-08-11 Clément Chigot * config/rs6000/rs6000.c (xcoff_tls_exec_model_detected): New. (rs6000_legitimize_tls_address_aix): Use it. (rs6000_xcoff_file_end): Add ".ref __tls_get_addr" when xcoff_tls_exec_model_detected is true. --- gcc/config/rs6000/rs6000.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 60f406a..e073b26 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -127,6 +127,9 @@ scalar_int_mode rs6000_pmode; bool rs6000_passes_ieee128 = false; #endif +/* Track use of r13 in 64bit AIX TLS. */ +static bool xcoff_tls_exec_model_detected = false; + /* Generate the manged name (i.e. U10__float128) used in GCC 8.1, and not the name used in current releases (i.e. u9__ieee128). */ static bool ieee128_mangling_gcc_8_1; @@ -9397,7 +9400,10 @@ rs6000_legitimize_tls_address_aix (rtx addr, enum tls_model model) emit_insn (gen_tls_get_tpointer (tlsreg)); } else - tlsreg = gen_rtx_REG (DImode, 13); + { + tlsreg = gen_rtx_REG (DImode, 13); + xcoff_tls_exec_model_detected = true; + } /* Load the TOC value into temporary register. */ tmpreg = gen_reg_rtx (Pmode); @@ -21122,6 +21128,12 @@ rs6000_xcoff_file_end (void) fputs (TARGET_32BIT ? "\t.long _section_.text\n" : "\t.llong _section_.text\n", asm_out_file); + + if (xcoff_tls_exec_model_detected) + { + /* Add a .ref to __tls_get_addr to force libpthread dependency. */ + fputs ("\t.extern __tls_get_addr\n\t.ref __tls_get_addr\n", asm_out_file); + } } struct declare_alias_data -- cgit v1.1 From fdd40498d1981fde0720a0886d6f59ea5fb7ab40 Mon Sep 17 00:00:00 2001 From: Kito Cheng Date: Tue, 20 Jul 2021 10:53:18 +0800 Subject: RISC-V: Allow multi-lib build with different code model --with-multilib-generator previously only supported different ISA/ABI combinations; however, the code model affects code generation significantly, so it should also be handled by the multilib mechanism. Add a `--cmodel=` option to `--with-multilib-generator` for generating multilib combinations with different code models. E.g. --with-multilib-generator="rv64ima-lp64--;--cmodel=medlow,medany" will generate 3 multilib configurations: 1) rv64ima with lp64 2) rv64ima with lp64 and medlow code model 3) rv64ima with lp64 and medany code model gcc/ * config/riscv/multilib-generator: Support code model option for multi-lib.
* doc/install.texi: Add document of new option for --with-multilib-generator. --- gcc/config/riscv/multilib-generator | 86 ++++++++++++++++++++++++------------- 1 file changed, 56 insertions(+), 30 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/riscv/multilib-generator b/gcc/config/riscv/multilib-generator index a204543..358bda9 100755 --- a/gcc/config/riscv/multilib-generator +++ b/gcc/config/riscv/multilib-generator @@ -40,6 +40,7 @@ import collections import itertools from functools import reduce import subprocess +import argparse # # TODO: Add test for this script. @@ -127,44 +128,69 @@ def expand_combination(ext): return ext -for cfg in sys.argv[1:]: - try: - (arch, abi, extra, ext) = cfg.split('-') - except: - print ("Invalid configure string %s, ---\n" - " and can be empty, " - "e.g. rv32imafd-ilp32--" % cfg) - sys.exit(1) - - arch = arch_canonicalize (arch) - arches[arch] = 1 - abis[abi] = 1 - extra = list(filter(None, extra.split(','))) - ext_combs = expand_combination(ext) - alts = sum([[x] + [x + y for y in ext_combs] for x in [arch] + extra], []) - alts = list(map(arch_canonicalize, alts)) +multilib_cfgs = filter(lambda x:not x.startswith("--"), sys.argv[1:]) +options = filter(lambda x:x.startswith("--"), sys.argv[1:]) + +parser = argparse.ArgumentParser() +parser.add_argument("--cmodel", type=str) +parser.add_argument("cfgs", type=str, nargs='*') +args = parser.parse_args() + +if args.cmodel: + cmodels = [None] + args.cmodel.split(",") +else: + cmodels = [None] + +cmodel_options = '/'.join(['mcmodel=%s' % x for x in cmodels[1:]]) +cmodel_dirnames = ' \\\n'.join(cmodels[1:]) + +for cmodel in cmodels: + for cfg in args.cfgs: + try: + (arch, abi, extra, ext) = cfg.split('-') + except: + print ("Invalid configure string %s, ---\n" + " and can be empty, " + "e.g. rv32imafd-ilp32--" % cfg) + sys.exit(1) + + # Compact code model only support rv64. + if cmodel == "compact" and arch.startswith("rv32"): + continue - # Drop duplicated entry. - alts = unique(alts) + arch = arch_canonicalize (arch) + arches[arch] = 1 + abis[abi] = 1 + extra = list(filter(None, extra.split(','))) + ext_combs = expand_combination(ext) + alts = sum([[x] + [x + y for y in ext_combs] for x in [arch] + extra], []) + alts = list(map(arch_canonicalize, alts)) - for alt in alts: - if alt == arch: - continue - arches[alt] = 1 - reuse.append('march.%s/mabi.%s=march.%s/mabi.%s' % (arch, abi, alt, abi)) - required.append('march=%s/mabi=%s' % (arch, abi)) + # Drop duplicated entry. 
+ alts = unique(alts) + + for alt in alts[1:]: + if alt == arch: + continue + arches[alt] = 1 + reuse.append('march.%s/mabi.%s=march.%s/mabi.%s' % (arch, abi, alt, abi)) + + if cmodel: + required.append('march=%s/mabi=%s/mcmodel=%s' % (arch, abi, cmodel)) + else: + required.append('march=%s/mabi=%s' % (arch, abi)) -arch_options = '/'.join(['march=%s' % x for x in arches.keys()]) -arch_dirnames = ' \\\n'.join(arches.keys()) + arch_options = '/'.join(['march=%s' % x for x in arches.keys()]) + arch_dirnames = ' \\\n'.join(arches.keys()) -abi_options = '/'.join(['mabi=%s' % x for x in abis.keys()]) -abi_dirnames = ' \\\n'.join(abis.keys()) + abi_options = '/'.join(['mabi=%s' % x for x in abis.keys()]) + abi_dirnames = ' \\\n'.join(abis.keys()) prog = sys.argv[0].split('/')[-1] print('# This file was generated by %s with the command:' % prog) print('# %s' % ' '.join(sys.argv)) -print('MULTILIB_OPTIONS = %s %s' % (arch_options, abi_options)) -print('MULTILIB_DIRNAMES = %s %s' % (arch_dirnames, abi_dirnames)) +print('MULTILIB_OPTIONS = %s %s %s' % (arch_options, abi_options, cmodel_options)) +print('MULTILIB_DIRNAMES = %s %s %s' % (arch_dirnames, abi_dirnames, cmodel_dirnames)) print('MULTILIB_REQUIRED = %s' % ' \\\n'.join(required)) print('MULTILIB_REUSE = %s' % ' \\\n'.join(reuse)) -- cgit v1.1 From faf2b6bc527dff31725dde5538ffff1c92688047 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Mon, 16 Aug 2021 11:16:52 +0800 Subject: Optimize __builtin_shuffle_vector. 1. Support vpermw/vpermb in ix86_expand_vec_one_operand_perm_avx512. 2. Support 256/128-bits vpermi2b ix86_expand_vec_perm_vpermt2. 3. Add define_insn_and_split to optimize specific vector permutation to opmov{dw,wb,qd}. gcc/ChangeLog: PR target/101846 * config/i386/i386-expand.c (ix86_expand_vec_perm_vpermt2): Support vpermi2b for V32QI/V16QImode. (ix86_extract_perm_from_pool_constant): New function. (ix86_expand_vec_one_operand_perm_avx512): Support vpermw/vpermb under TARGET_AVX512BW/TARGET_AVX512VBMI. (expand_vec_perm_1): Adjust comments for upper. * config/i386/i386-protos.h (ix86_extract_perm_from_pool_constant): New declare. * config/i386/predicates.md (permvar_truncate_operand): New predicate. (pshufb_truncv4siv4hi_operand): Ditto. (pshufb_truncv8hiv8qi_operand): Ditto. * config/i386/sse.md (*avx512bw_permvar_truncv16siv16hi_1): New pre_reload define_insn_and_split. (*avx512f_permvar_truncv8siv8hi_1): Ditto. (*avx512f_vpermvar_truncv8div8si_1): Ditto. (*avx512f_permvar_truncv32hiv32qi_1): Ditto. (*avx512f_permvar_truncv16hiv16qi_1): Ditto. (*avx512f_permvar_truncv4div4si_1): Ditto. (*avx512f_pshufb_truncv8hiv8qi_1): Ditto. (*avx512f_pshufb_truncv4siv4hi_1): Ditto. (*avx512f_pshufd_truncv2div2si_1): Ditto. gcc/testsuite/ChangeLog: PR target/101846 * gcc.target/i386/pr101846-2.c: New test. * gcc.target/i386/pr101846-3.c: New test. * gcc.target/i386/pr101846-4.c: New test. 
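As a rough illustration of item 1 above (a sketch, not one of the committed pr101846-* tests), a constant one-operand shuffle of 16-bit elements such as the reversal below should now be expandable as a single vpermw with -O2 -mavx512vl -mavx512bw; the byte-element analogue maps to vpermb under AVX512VBMI:

typedef unsigned short v16hi __attribute__ ((vector_size (32)));

v16hi
reverse_elements (v16hi x)
{
  /* A lane-crossing one-operand permutation;
     ix86_expand_vec_one_operand_perm_avx512 can now pick vpermw for it.  */
  v16hi idx = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
  return __builtin_shuffle (x, idx);
}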
--- gcc/config/i386/i386-expand.c | 89 ++++++++++++++++++-- gcc/config/i386/i386-protos.h | 1 + gcc/config/i386/predicates.md | 90 ++++++++++++++++++++ gcc/config/i386/sse.md | 190 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 365 insertions(+), 5 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 4d7349c..9bf13db 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -4778,6 +4778,18 @@ ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1, switch (mode) { + case E_V16QImode: + if (TARGET_AVX512VL && TARGET_AVX512VBMI) + gen = gen_avx512vl_vpermt2varv16qi3; + break; + case E_V32QImode: + if (TARGET_AVX512VL && TARGET_AVX512VBMI) + gen = gen_avx512vl_vpermt2varv32qi3; + break; + case E_V64QImode: + if (TARGET_AVX512VBMI) + gen = gen_avx512bw_vpermt2varv64qi3; + break; case E_V8HImode: if (TARGET_AVX512VL && TARGET_AVX512BW) gen = gen_avx512vl_vpermt2varv8hi3; @@ -4786,10 +4798,6 @@ ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1, if (TARGET_AVX512VL && TARGET_AVX512BW) gen = gen_avx512vl_vpermt2varv16hi3; break; - case E_V64QImode: - if (TARGET_AVX512VBMI) - gen = gen_avx512bw_vpermt2varv64qi3; - break; case E_V32HImode: if (TARGET_AVX512BW) gen = gen_avx512bw_vpermt2varv32hi3; @@ -5487,6 +5495,45 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p) } } +/* Return true if mem is pool constant which contains a const_vector + perm index, assign the index to PERM. */ +bool +ix86_extract_perm_from_pool_constant (int* perm, rtx mem) +{ + machine_mode mode = GET_MODE (mem); + int nelt = GET_MODE_NUNITS (mode); + + if (!INTEGRAL_MODE_P (mode)) + return false; + + /* Needs to be constant pool. */ + if (!(MEM_P (mem)) + || !SYMBOL_REF_P (XEXP (mem, 0)) + || !CONSTANT_POOL_ADDRESS_P (XEXP (mem, 0))) + return false; + + rtx constant = get_pool_constant (XEXP (mem, 0)); + + if (GET_CODE (constant) != CONST_VECTOR) + return false; + + /* There could be some rtx like + (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1"))) + but with "*.LC1" refer to V2DI constant vector. */ + if (GET_MODE (constant) != mode) + { + constant = simplify_subreg (mode, constant, GET_MODE (constant), 0); + + if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR) + return false; + } + + for (int i = 0; i != nelt; i++) + perm[i] = UINTVAL (XVECEXP (constant, 0, i)); + + return true; +} + /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode, but works for floating pointer parameters and nonoffsetable memories. For pushes, it returns just stack offsets; the values will be saved @@ -18086,6 +18133,7 @@ ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d) { machine_mode mode = GET_MODE (d->op0); machine_mode maskmode = mode; + unsigned inner_size = GET_MODE_SIZE (GET_MODE_INNER (mode)); rtx (*gen) (rtx, rtx, rtx) = NULL; rtx target, op0, mask; rtx vec[64]; @@ -18096,6 +18144,18 @@ ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d) if (!TARGET_AVX512F) return false; + /* Accept VNxHImode and VNxQImode now. */ + if (!TARGET_AVX512VL && GET_MODE_SIZE (mode) < 64) + return false; + + /* vpermw. */ + if (!TARGET_AVX512BW && inner_size == 2) + return false; + + /* vpermb. 
*/ + if (!TARGET_AVX512VBMI && inner_size == 1) + return false; + switch (mode) { case E_V16SImode: @@ -18112,6 +18172,25 @@ ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d) gen = gen_avx512f_permvarv8df; maskmode = V8DImode; break; + case E_V32HImode: + gen = gen_avx512bw_permvarv32hi; + break; + case E_V16HImode: + gen = gen_avx512vl_permvarv16hi; + break; + case E_V8HImode: + gen = gen_avx512vl_permvarv8hi; + break; + case E_V64QImode: + gen = gen_avx512bw_permvarv64qi; + break; + case E_V32QImode: + gen = gen_avx512vl_permvarv32qi; + break; + case E_V16QImode: + gen = gen_avx512vl_permvarv16qi; + break; + default: return false; } @@ -18301,7 +18380,7 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) if (expand_vec_perm_palignr (d, true)) return true; - /* Try the AVX512F vperm{s,d} instructions. */ + /* Try the AVX512F vperm{w,b,s,d} instructions */ if (ix86_expand_vec_one_operand_perm_avx512 (d)) return true; diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 07ac02a..2fd1307 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -260,6 +260,7 @@ extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx); extern void ix86_expand_sse2_abs (rtx, rtx); extern bool ix86_expand_vector_init_duplicate (bool, machine_mode, rtx, rtx); +extern bool ix86_extract_perm_from_pool_constant (int*, rtx); /* In i386-c.c */ extern void ix86_target_macros (void); diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index 129205a..9321f33 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1713,6 +1713,96 @@ return true; }) +;; Return true if OP is a constant pool in perm{w,d,b} which constains index +;; match pmov{dw,wb,qd}. +(define_predicate "permvar_truncate_operand" + (match_code "mem") +{ + int nelt = GET_MODE_NUNITS (mode); + int perm[128]; + int id; + + if (!INTEGRAL_MODE_P (mode) || !VECTOR_MODE_P (mode)) + return false; + + if (nelt < 2) + return false; + + if (!ix86_extract_perm_from_pool_constant (&perm[0], op)) + return false; + + id = exact_log2 (nelt); + + /* Check that the permutation is suitable for pmovz{bw,wd,dq}. + For example V16HImode to V8HImode + { 0 2 4 6 8 10 12 14 * * * * * * * * }. */ + for (int i = 0; i != nelt / 2; i++) + if ((perm[i] & ((1 << id) - 1)) != i * 2) + return false; + + return true; +}) + +;; Return true if OP is a constant pool in shufb which constains index +;; match pmovdw. +(define_predicate "pshufb_truncv4siv4hi_operand" + (match_code "mem") +{ + int perm[128]; + + if (mode != E_V16QImode) + return false; + + if (!ix86_extract_perm_from_pool_constant (&perm[0], op)) + return false; + + /* Check that the permutation is suitable for pmovdw. + For example V4SImode to V4HImode + { 0 1 4 5 8 9 12 13 * * * * * * * * }. + index = i % 2 + (i / 2) * 4. */ + for (int i = 0; i != 8; i++) + { + /* if (SRC2[(i * 8)+7] = 1) then DEST[(i*8)+7..(i*8)+0] := 0; */ + if (perm[i] & 128) + return false; + + if ((perm[i] & 15) != ((i & 1) + (i & 0xFE) * 2)) + return false; + } + + return true; +}) + +;; Return true if OP is a constant pool in shufb which constains index +;; match pmovdw. +(define_predicate "pshufb_truncv8hiv8qi_operand" + (match_code "mem") +{ + int perm[128]; + + if (mode != E_V16QImode) + return false; + + if (!ix86_extract_perm_from_pool_constant (&perm[0], op)) + return false; + + /* Check that the permutation is suitable for pmovwb. + For example V16QImode to V8QImode + { 0 2 4 6 8 10 12 14 * * * * * * * * }. 
+ index = i % 2 + (i / 2) * 4. */ + for (int i = 0; i != 8; i++) + { + /* if (SRC2[(i * 8)+7] = 1) then DEST[(i*8)+7..(i*8)+0] := 0; */ + if (perm[i] & 128) + return false; + + if ((perm[i] & 15) != i * 2) + return false; + } + + return true; +}) + ;; Return true if OP is a parallel for an pmovz{bw,wd,dq} vec_select, ;; where one of the two operands of the vec_concat is const0_operand. (define_predicate "pmovzx_parallel" diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 60e69a4..27e25cc 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -10977,6 +10977,64 @@ (set_attr "prefix" "evex") (set_attr "mode" "")]) +(define_insn_and_split "*avx512bw_permvar_truncv16siv16hi_1" + [(set (match_operand:V16HI 0 "nonimmediate_operand") + (vec_select:V16HI + (unspec:V32HI + [(match_operand:V32HI 1 "register_operand") + (match_operand:V32HI 2 "permvar_truncate_operand")] + UNSPEC_VPERMVAR) + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3) + (const_int 4) (const_int 5) + (const_int 6) (const_int 7) + (const_int 8) (const_int 9) + (const_int 10) (const_int 11) + (const_int 12) (const_int 13) + (const_int 14) (const_int 15)])))] + "TARGET_AVX512BW && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 0) + (truncate:V16HI (match_dup 1)))] + "operands[1] = lowpart_subreg (V16SImode, operands[1], V32HImode);") + +(define_insn_and_split "*avx512f_permvar_truncv8siv8hi_1" + [(set (match_operand:V8HI 0 "nonimmediate_operand") + (vec_select:V8HI + (unspec:V16HI + [(match_operand:V16HI 1 "register_operand") + (match_operand:V16HI 2 "permvar_truncate_operand")] + UNSPEC_VPERMVAR) + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3) + (const_int 4) (const_int 5) + (const_int 6) (const_int 7)])))] + "TARGET_AVX512VL && TARGET_AVX512BW && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 0) + (truncate:V8HI (match_dup 1)))] + "operands[1] = lowpart_subreg (V8SImode, operands[1], V16HImode);") + +(define_insn_and_split "*avx512f_vpermvar_truncv8div8si_1" + [(set (match_operand:V8SI 0 "nonimmediate_operand") + (vec_select:V8SI + (unspec:V16SI + [(match_operand:V16SI 1 "register_operand") + (match_operand:V16SI 2 "permvar_truncate_operand")] + UNSPEC_VPERMVAR) + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3) + (const_int 4) (const_int 5) + (const_int 6) (const_int 7)])))] + "TARGET_AVX512F && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 0) + (truncate:V8SI (match_dup 1)))] + "operands[1] = lowpart_subreg (V8DImode, operands[1], V16SImode);") + (define_insn "avx512f_2_mask" [(set (match_operand:PMOV_DST_MODE_1 0 "nonimmediate_operand" "=v,m") (vec_merge:PMOV_DST_MODE_1 @@ -11017,6 +11075,36 @@ (set_attr "prefix" "evex") (set_attr "mode" "XI")]) +(define_insn_and_split "*avx512f_permvar_truncv32hiv32qi_1" + [(set (match_operand:V32QI 0 "nonimmediate_operand") + (vec_select:V32QI + (unspec:V64QI + [(match_operand:V64QI 1 "register_operand") + (match_operand:V64QI 2 "permvar_truncate_operand")] + UNSPEC_VPERMVAR) + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3) + (const_int 4) (const_int 5) + (const_int 6) (const_int 7) + (const_int 8) (const_int 9) + (const_int 10) (const_int 11) + (const_int 12) (const_int 13) + (const_int 14) (const_int 15) + (const_int 16) (const_int 17) + (const_int 18) (const_int 19) + (const_int 20) (const_int 21) + (const_int 22) (const_int 23) + (const_int 24) (const_int 25) + (const_int 26) (const_int 27) + (const_int 28) (const_int 29) + (const_int 
30) (const_int 31)])))] + "TARGET_AVX512VBMI && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 0) + (truncate:V32QI (match_dup 1)))] + "operands[1] = lowpart_subreg (V32HImode, operands[1], V64QImode);") + (define_insn "avx512bw_v32hiv32qi2_mask" [(set (match_operand:V32QI 0 "nonimmediate_operand" "=v,m") (vec_merge:V32QI @@ -11062,6 +11150,45 @@ (set_attr "prefix" "evex") (set_attr "mode" "")]) +(define_insn_and_split "*avx512f_permvar_truncv16hiv16qi_1" + [(set (match_operand:V16QI 0 "nonimmediate_operand") + (vec_select:V16QI + (unspec:V32QI + [(match_operand:V32QI 1 "register_operand") + (match_operand:V32QI 2 "permvar_truncate_operand")] + UNSPEC_VPERMVAR) + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3) + (const_int 4) (const_int 5) + (const_int 6) (const_int 7) + (const_int 8) (const_int 9) + (const_int 10) (const_int 11) + (const_int 12) (const_int 13) + (const_int 14) (const_int 15)])))] + "TARGET_AVX512VL && TARGET_AVX512VBMI + && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 0) + (truncate:V16QI (match_dup 1)))] + "operands[1] = lowpart_subreg (V16HImode, operands[1], V32QImode);") + +(define_insn_and_split "*avx512f_permvar_truncv4div4si_1" + [(set (match_operand:V4SI 0 "nonimmediate_operand") + (vec_select:V4SI + (unspec:V8SI + [(match_operand:V8SI 1 "register_operand") + (match_operand:V8SI 2 "permvar_truncate_operand")] + UNSPEC_VPERMVAR) + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3)])))] + "TARGET_AVX512VL && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 0) + (truncate:V4SI (match_dup 1)))] + "operands[1] = lowpart_subreg (V4DImode, operands[1], V8SImode);") + (define_insn "_2_mask" [(set (match_operand:PMOV_DST_MODE_2 0 "nonimmediate_operand" "=v,m") (vec_merge:PMOV_DST_MODE_2 @@ -11120,6 +11247,27 @@ (set_attr "prefix" "evex") (set_attr "mode" "TI")]) +(define_insn_and_split "*avx512f_pshufb_truncv8hiv8qi_1" + [(set (match_operand:DI 0 "register_operand") + (vec_select:DI + (subreg:V2DI + (unspec:V16QI + [(match_operand:V16QI 1 "register_operand") + (match_operand:V16QI 2 "pshufb_truncv8hiv8qi_operand")] + UNSPEC_PSHUFB) 0) + (parallel [(const_int 0)])))] + "TARGET_AVX512VL && ix86_pre_reload_split ()" + "#" + "&& 1" + [(const_int 0)] +{ + rtx op1 = gen_reg_rtx (V8QImode); + operands[1] = lowpart_subreg (V8HImode, operands[1], V16QImode); + emit_insn (gen_truncv8hiv8qi2 (op1, operands[1])); + emit_move_insn (operands[0], lowpart_subreg (DImode, op1, V8QImode)); + DONE; +}) + (define_insn "*avx512vl_v2div2qi2_store_1" [(set (match_operand:V2QI 0 "memory_operand" "=m") (any_truncate:V2QI @@ -11475,6 +11623,27 @@ (set_attr "prefix" "evex") (set_attr "mode" "TI")]) +(define_insn_and_split "*avx512f_pshufb_truncv4siv4hi_1" + [(set (match_operand:DI 0 "register_operand") + (vec_select:DI + (subreg:V2DI + (unspec:V16QI + [(match_operand:V16QI 1 "register_operand") + (match_operand:V16QI 2 "pshufb_truncv4siv4hi_operand")] + UNSPEC_PSHUFB) 0) + (parallel [(const_int 0)])))] + "TARGET_AVX512VL && ix86_pre_reload_split ()" + "#" + "&& 1" + [(const_int 0)] +{ + rtx op1 = gen_reg_rtx (V4HImode); + operands[1] = lowpart_subreg (V4SImode, operands[1], V16QImode); + emit_insn (gen_truncv4siv4hi2 (op1, operands[1])); + emit_move_insn (operands[0], lowpart_subreg (DImode, op1, V4HImode)); + DONE; +}) + (define_insn "*avx512vl_v4hi2_store_1" [(set (match_operand:V4HI 0 "memory_operand" "=m") (any_truncate:V4HI @@ -11698,6 +11867,27 @@ (set_attr "prefix" "evex") (set_attr "mode" "TI")]) 
+(define_insn_and_split "*avx512f_pshufd_truncv2div2si_1" + [(set (match_operand:DI 0 "register_operand") + (vec_select:DI + (subreg:V2DI + (vec_select:V4SI + (match_operand:V4SI 1 "register_operand") + (parallel [(const_int 0) (const_int 2) + (const_int 2) (const_int 3)])) 0) + (parallel [(const_int 0)])))] + "TARGET_AVX512VL && ix86_pre_reload_split ()" + "#" + "&& 1" + [(const_int 0)] +{ + rtx op1 = gen_reg_rtx (V2SImode); + operands[1] = lowpart_subreg (V2DImode, operands[1], V4SImode); + emit_insn (gen_truncv2div2si2 (op1, operands[1])); + emit_move_insn (operands[0], lowpart_subreg (DImode, op1, V2SImode)); + DONE; +}) + (define_insn "*avx512vl_v2div2si2_store_1" [(set (match_operand:V2SI 0 "memory_operand" "=m") (any_truncate:V2SI -- cgit v1.1 From 1afcf5705a0becdca96111a6cb67670217c335b3 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Mon, 16 Aug 2021 17:12:21 +0800 Subject: Fix ICE. gcc/ChangeLog: PR target/101930 * config/i386/i386.md (ldexp3): Force operands[1] to reg. gcc/testsuite/ChangeLog: PR target/101930 * gcc.target/i386/pr101930.c: New test. --- gcc/config/i386/i386.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 4a8e8fe..41d8562 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -17938,9 +17938,7 @@ if (TARGET_AVX512F && TARGET_SSE_MATH) { rtx op2 = gen_reg_rtx (mode); - - if (!nonimmediate_operand (operands[1], mode)) - operands[1] = force_reg (mode, operands[1]); + operands[1] = force_reg (mode, operands[1]); emit_insn (gen_floatsi2 (op2, operands[2])); emit_insn (gen_avx512f_scalef2 (operands[0], operands[1], op2)); -- cgit v1.1 From 8cdcea51c0fd753e6a652c9b236e91b3a6e0911c Mon Sep 17 00:00:00 2001 From: Sebastian Huber Date: Mon, 9 Aug 2021 09:06:14 +0200 Subject: gcov: Add TARGET_GCOV_TYPE_SIZE target hook If -fprofile-update=atomic is used, then the target must provide atomic operations for the counters of the type returned by get_gcov_type(). This is a 64-bit type for targets which have a 64-bit long long type. On 32-bit targets this could be an issue since they may not provide 64-bit atomic operations. Allow targets to override the default type size with the new TARGET_GCOV_TYPE_SIZE target hook. If a 32-bit gcov type size is used, then there is currently a warning in libgcov-driver.c in a dead code block due to sizeof (counter) == sizeof (gcov_unsigned_t): libgcc/libgcov-driver.c: In function 'dump_counter': libgcc/libgcov-driver.c:401:46: warning: right shift count >= width of type [-Wshift-count-overflow] 401 | dump_unsigned ((gcov_unsigned_t)(counter >> 32), dump_fn, arg); | ^~ gcc/c-family/ * c-cppbuiltin.c (c_cpp_builtins): Define __LIBGCC_GCOV_TYPE_SIZE if flag_building_libgcc is true. gcc/ * config/sparc/rtemself.h (SPARC_GCOV_TYPE_SIZE): Define. * config/sparc/sparc.c (sparc_gcov_type_size): New. (TARGET_GCOV_TYPE_SIZE): Redefine if SPARC_GCOV_TYPE_SIZE is defined. * coverage.c (get_gcov_type): Use targetm.gcov_type_size(). * doc/tm.texi (TARGET_GCOV_TYPE_SIZE): Add hook under "Misc". * doc/tm.texi.in: Regenerate. * target.def (gcov_type_size): New target hook. * targhooks.c (default_gcov_type_size): New. * targhooks.h (default_gcov_type_size): Declare. * tree-profile.c (gimple_gen_edge_profiler): Use precision of gcov_type_node. (gimple_gen_time_profiler): Likewise. libgcc/ * libgcov.h (gcov_type): Define using __LIBGCC_GCOV_TYPE_SIZE. (gcov_type_unsigned): Likewise. 
--- gcc/config/sparc/rtemself.h | 2 ++ gcc/config/sparc/sparc.c | 11 +++++++++++ 2 files changed, 13 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/sparc/rtemself.h b/gcc/config/sparc/rtemself.h index fa972af..d64ce90 100644 --- a/gcc/config/sparc/rtemself.h +++ b/gcc/config/sparc/rtemself.h @@ -40,3 +40,5 @@ /* Use the default */ #undef LINK_GCC_C_SEQUENCE_SPEC + +#define SPARC_GCOV_TYPE_SIZE 32 diff --git a/gcc/config/sparc/sparc.c b/gcc/config/sparc/sparc.c index 04fc80f..06f41d7 100644 --- a/gcc/config/sparc/sparc.c +++ b/gcc/config/sparc/sparc.c @@ -944,6 +944,17 @@ char sparc_hard_reg_printed[8]; #undef TARGET_ZERO_CALL_USED_REGS #define TARGET_ZERO_CALL_USED_REGS sparc_zero_call_used_regs +#ifdef SPARC_GCOV_TYPE_SIZE +static HOST_WIDE_INT +sparc_gcov_type_size (void) +{ + return SPARC_GCOV_TYPE_SIZE; +} + +#undef TARGET_GCOV_TYPE_SIZE +#define TARGET_GCOV_TYPE_SIZE sparc_gcov_type_size +#endif + struct gcc_target targetm = TARGET_INITIALIZER; /* Return the memory reference contained in X if any, zero otherwise. */ -- cgit v1.1 From 75a7176575c409940b66020def23508f5701f5fb Mon Sep 17 00:00:00 2001 From: Jeff Law Date: Mon, 16 Aug 2021 22:23:30 -0400 Subject: Improve SImode shifts for H8 Similar to the H8/300H patch, this improves SImode shifts for the H8/S. It's not as big a win on the H8/S since we can shift two positions at a time. But that also means that we can handle more residuals with minimal ode growth after a special shift-by-16 or shift-by-24 sequence. I think there's more to do here, but this seemed like as good a checkpoint as any. Tested without regressions. gcc/ * config/h8300/h8300.c (shift_alg_si): Avoid loops for most SImode shifts on the H8/S. (h8300_option_override): Use loops on H8/S more often when optimizing for size. (get_shift_alg): Handle new "special" cases on H8/S. Simplify accordingly. Handle various arithmetic right shifts with special sequences that we couldn't handle before. 
--- gcc/config/h8300/h8300.c | 69 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 53 insertions(+), 16 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/h8300/h8300.c b/gcc/config/h8300/h8300.c index 7959ad1..0c4e508 100644 --- a/gcc/config/h8300/h8300.c +++ b/gcc/config/h8300/h8300.c @@ -248,17 +248,17 @@ static enum shift_alg shift_alg_si[2][3][32] = { /* 16 17 18 19 20 21 22 23 */ /* 24 25 26 27 28 29 30 31 */ { INL, INL, INL, INL, INL, INL, INL, INL, - INL, INL, INL, LOP, LOP, LOP, LOP, SPC, - SPC, SPC, SPC, SPC, SPC, SPC, LOP, LOP, - SPC, SPC, LOP, LOP, SPC, SPC, SPC, SPC }, /* SHIFT_ASHIFT */ + INL, INL, INL, INL, INL, INL, INL, SPC, + SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC, + SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC }, /* SHIFT_ASHIFT */ { INL, INL, INL, INL, INL, INL, INL, INL, - INL, INL, INL, LOP, LOP, LOP, LOP, SPC, - SPC, SPC, SPC, SPC, SPC, SPC, LOP, LOP, - SPC, SPC, LOP, LOP, SPC, SPC, SPC, SPC }, /* SHIFT_LSHIFTRT */ + INL, INL, INL, INL, INL, INL, INL, SPC, + SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC, + SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC }, /* SHIFT_LSHIFTRT */ { INL, INL, INL, INL, INL, INL, INL, INL, - INL, INL, INL, LOP, LOP, LOP, LOP, LOP, - SPC, SPC, SPC, SPC, SPC, SPC, LOP, LOP, - SPC, SPC, LOP, LOP, LOP, LOP, LOP, SPC }, /* SHIFT_ASHIFTRT */ + INL, INL, INL, INL, INL, INL, INL, LOP, + SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC, + SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC }, /* SHIFT_ASHIFTRT */ } }; @@ -375,6 +375,36 @@ h8300_option_override (void) /* H8S */ shift_alg_hi[H8_S][SHIFT_ASHIFTRT][14] = SHIFT_LOOP; + + shift_alg_si[H8_S][SHIFT_ASHIFT][11] = SHIFT_LOOP; + shift_alg_si[H8_S][SHIFT_ASHIFT][12] = SHIFT_LOOP; + shift_alg_si[H8_S][SHIFT_ASHIFT][13] = SHIFT_LOOP; + shift_alg_si[H8_S][SHIFT_ASHIFT][14] = SHIFT_LOOP; + shift_alg_si[H8_S][SHIFT_ASHIFT][22] = SHIFT_LOOP; + shift_alg_si[H8_S][SHIFT_ASHIFT][23] = SHIFT_LOOP; + shift_alg_si[H8_S][SHIFT_ASHIFT][26] = SHIFT_LOOP; + shift_alg_si[H8_S][SHIFT_ASHIFT][27] = SHIFT_LOOP; + + shift_alg_si[H8_S][SHIFT_LSHIFTRT][11] = SHIFT_LOOP; + shift_alg_si[H8_S][SHIFT_LSHIFTRT][12] = SHIFT_LOOP; + shift_alg_si[H8_S][SHIFT_LSHIFTRT][13] = SHIFT_LOOP; + shift_alg_si[H8_S][SHIFT_LSHIFTRT][14] = SHIFT_LOOP; + shift_alg_si[H8_S][SHIFT_LSHIFTRT][22] = SHIFT_LOOP; + shift_alg_si[H8_S][SHIFT_LSHIFTRT][23] = SHIFT_LOOP; + shift_alg_si[H8_S][SHIFT_LSHIFTRT][26] = SHIFT_LOOP; + shift_alg_si[H8_S][SHIFT_LSHIFTRT][27] = SHIFT_LOOP; + + shift_alg_si[H8_S][SHIFT_ASHIFTRT][11] = SHIFT_LOOP; + shift_alg_si[H8_S][SHIFT_ASHIFTRT][12] = SHIFT_LOOP; + shift_alg_si[H8_S][SHIFT_ASHIFTRT][13] = SHIFT_LOOP; + shift_alg_si[H8_S][SHIFT_ASHIFTRT][14] = SHIFT_LOOP; + shift_alg_si[H8_S][SHIFT_ASHIFTRT][22] = SHIFT_LOOP; + shift_alg_si[H8_S][SHIFT_ASHIFTRT][23] = SHIFT_LOOP; + shift_alg_si[H8_S][SHIFT_ASHIFTRT][26] = SHIFT_LOOP; + shift_alg_si[H8_S][SHIFT_ASHIFTRT][27] = SHIFT_LOOP; + shift_alg_si[H8_S][SHIFT_ASHIFTRT][28] = SHIFT_LOOP; + shift_alg_si[H8_S][SHIFT_ASHIFTRT][29] = SHIFT_LOOP; + shift_alg_si[H8_S][SHIFT_ASHIFTRT][30] = SHIFT_LOOP; } /* Work out a value for MOVE_RATIO. 
*/ @@ -3814,8 +3844,7 @@ get_shift_alg (enum shift_type shift_type, enum shift_mode shift_mode, gcc_unreachable (); } } - else if ((TARGET_H8300H && count >= 16 && count <= 23) - || (TARGET_H8300S && count >= 16 && count <= 21)) + else if (count >= 16 && count <= 23) { info->remainder = count - 16; @@ -3834,8 +3863,7 @@ get_shift_alg (enum shift_type shift_type, enum shift_mode shift_mode, goto end; } } - else if ((TARGET_H8300H && count >= 24 || count <= 27) - || (TARGET_H8300S && count >= 24 && count <= 25)) + else if (count >= 24 && count <= 27) { info->remainder = count - 24; @@ -3874,7 +3902,10 @@ get_shift_alg (enum shift_type shift_type, enum shift_mode shift_mode, info->special = "sub.w\t%f0,%f0\n\trotl.l\t#2,%S0\n\trotl.l\t#2,%S0\n\textu.l\t%S0"; goto end; case SHIFT_ASHIFTRT: - gcc_unreachable (); + info->remainder = count - 24; + info->special = "mov.w\t%e0,%f0\n\tmov.b\t%t0,%s0\n\texts.w\t%f0\n\texts.l\t%S0"; + info->cc_special = OLD_CC_SET_ZNV; + goto end; } } else if (count == 29) @@ -3900,7 +3931,10 @@ get_shift_alg (enum shift_type shift_type, enum shift_mode shift_mode, } goto end; case SHIFT_ASHIFTRT: - gcc_unreachable (); + info->remainder = count - 24; + info->special = "mov.w\t%e0,%f0\n\tmov.b\t%t0,%s0\n\texts.w\t%f0\n\texts.l\t%S0"; + info->cc_special = OLD_CC_SET_ZNV; + goto end; } } else if (count == 30) @@ -3920,7 +3954,10 @@ get_shift_alg (enum shift_type shift_type, enum shift_mode shift_mode, info->special = "sub.w\t%f0,%f0\n\trotl.l\t#2,%S0\n\textu.l\t%S0"; goto end; case SHIFT_ASHIFTRT: - gcc_unreachable (); + info->remainder = count - 24; + info->special = "mov.w\t%e0,%f0\n\tmov.b\t%t0,%s0\n\texts.w\t%f0\n\texts.l\t%S0"; + info->cc_special = OLD_CC_SET_ZNV; + goto end; } } else if (count == 31) -- cgit v1.1 From 568b9c0e8ee482228f6c565730447de5b18e7cb3 Mon Sep 17 00:00:00 2001 From: Alistair Lee Date: Tue, 17 Aug 2021 10:49:35 +0100 Subject: aarch64: Replace some uses of GET_CODE with RTL predicate macros gcc/ 2021-08-17 Alistair_Lee * rtl.h (CONST_VECTOR_P): New macro. * config/aarch64/aarch64.c (aarch64_get_sve_pred_bits): Use RTL code testing macros. (aarch64_ptrue_all_mode): Likewise. (aarch64_expand_mov_immediate): Likewise. (aarch64_const_vec_all_in_range_p): Likewise. (aarch64_rtx_costs): Likewise. (aarch64_legitimate_constant_p): Likewise. (aarch64_simd_valid_immediate): Likewise. (aarch64_simd_make_constant): Likewise. (aarch64_convert_mult_to_shift): Likewise. (aarch64_expand_sve_vec_perm): Likewise. (aarch64_vec_fpconst_pow_of_2): Likewise. 
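The rtl.h change itself falls outside this gcc/config-limited diff; presumably the new predicate follows the existing RTL code-testing macros (REG_P, MEM_P, SUBREG_P), roughly:

/* Sketch of the new rtl.h macro; the committed definition may differ
   slightly in placement or comment but tests the same RTX code.  */
#define CONST_VECTOR_P(X) (GET_CODE (X) == CONST_VECTOR)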
--- gcc/config/aarch64/aarch64.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 4cd4b03..3213585 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -4174,7 +4174,7 @@ aarch64_force_temporary (machine_mode mode, rtx x, rtx value) static bool aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x) { - if (GET_CODE (x) != CONST_VECTOR) + if (!CONST_VECTOR_P (x)) return false; unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode), @@ -4230,7 +4230,7 @@ opt_machine_mode aarch64_ptrue_all_mode (rtx x) { gcc_assert (GET_MODE (x) == VNx16BImode); - if (GET_CODE (x) != CONST_VECTOR + if (!CONST_VECTOR_P (x) || !CONST_VECTOR_DUPLICATE_P (x) || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0)) || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0) @@ -5930,7 +5930,7 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm) return; } - if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode)) + if (CONST_VECTOR_P (imm) && aarch64_sve_data_mode_p (mode)) if (rtx res = aarch64_expand_sve_const_vector (dest, imm)) { if (dest != res) @@ -10634,7 +10634,7 @@ aarch64_const_vec_all_in_range_p (rtx vec, HOST_WIDE_INT minval, HOST_WIDE_INT maxval) { - if (GET_CODE (vec) != CONST_VECTOR + if (!CONST_VECTOR_P (vec) || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT) return false; @@ -12771,7 +12771,7 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED, case SIGN_EXTRACT: /* Bit-field insertion. Strip any redundant widening of the RHS to meet the width of the target. */ - if (GET_CODE (op1) == SUBREG) + if (SUBREG_P (op1)) op1 = SUBREG_REG (op1); if ((GET_CODE (op1) == ZERO_EXTEND || GET_CODE (op1) == SIGN_EXTEND) @@ -13044,7 +13044,7 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED, But the integer MINUS logic expects the shift/extend operation in op1. */ if (! (REG_P (op0) - || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0))))) + || (SUBREG_P (op0) && REG_P (SUBREG_REG (op0))))) { op0 = XEXP (x, 1); op1 = XEXP (x, 0); @@ -18239,7 +18239,7 @@ aarch64_legitimate_constant_p (machine_mode mode, rtx x) /* Otherwise, accept any CONST_VECTOR that, if all else fails, can at least be forced to memory and loaded from there. */ - if (GET_CODE (x) == CONST_VECTOR) + if (CONST_VECTOR_P (x)) return !targetm.cannot_force_const_mem (mode, x); /* Do not allow vector struct mode constants for Advanced SIMD. @@ -20044,7 +20044,7 @@ aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info, scalar_mode elt_mode = GET_MODE_INNER (mode); rtx base, step; unsigned int n_elts; - if (GET_CODE (op) == CONST_VECTOR + if (CONST_VECTOR_P (op) && CONST_VECTOR_DUPLICATE_P (op)) n_elts = CONST_VECTOR_NPATTERNS (op); else if ((vec_flags & VEC_SVE_DATA) @@ -20066,7 +20066,7 @@ aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info, } return true; } - else if (GET_CODE (op) == CONST_VECTOR + else if (CONST_VECTOR_P (op) && CONST_VECTOR_NUNITS (op).is_constant (&n_elts)) /* N_ELTS set above. 
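User-visible behaviour is unchanged by the rewrite; for instance, an illustrative call like the one below still lowers to the same __builtin_aarch64_ld2_lanev8qi sequence shown in the new function bodies, the intrinsic is simply an ordinary inline function now rather than a macro expansion:

#include <arm_neon.h>

uint8x8x2_t
load_pair_lane3 (const uint8_t *p, uint8x8x2_t acc)
{
  /* Loads two bytes from p, de-interleaved into lane 3 of
     acc.val[0] and acc.val[1].  */
  return vld2_lane_u8 (p, acc, 3);
}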
*/; else @@ -20666,7 +20666,7 @@ aarch64_simd_make_constant (rtx vals) int n_const = 0; int i; - if (GET_CODE (vals) == CONST_VECTOR) + if (CONST_VECTOR_P (vals)) const_vec = vals; else if (GET_CODE (vals) == PARALLEL) { @@ -21207,7 +21207,7 @@ aarch64_sve_expand_vector_init (rtx target, rtx vals) static rtx aarch64_convert_mult_to_shift (rtx value, rtx_code &code) { - if (GET_CODE (value) != CONST_VECTOR) + if (!CONST_VECTOR_P (value)) return NULL_RTX; rtx_vector_builder builder; @@ -22371,7 +22371,7 @@ aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel) rtx sel_reg = force_reg (sel_mode, sel); /* Check if the sel only references the first values vector. */ - if (GET_CODE (sel) == CONST_VECTOR + if (CONST_VECTOR_P (sel) && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1)) { emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg); @@ -22393,7 +22393,7 @@ aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel) rtx res0 = gen_reg_rtx (data_mode); rtx res1 = gen_reg_rtx (data_mode); rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits); - if (GET_CODE (sel) != CONST_VECTOR + if (!CONST_VECTOR_P (sel) || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1)) { rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, @@ -24925,7 +24925,7 @@ int aarch64_vec_fpconst_pow_of_2 (rtx x) { int nelts; - if (GET_CODE (x) != CONST_VECTOR + if (!CONST_VECTOR_P (x) || !CONST_VECTOR_NUNITS (x).is_constant (&nelts)) return -1; -- cgit v1.1 From 5ed35a9874ba8c3aa2bbbd720e46783db264b684 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Thu, 12 Aug 2021 12:27:15 +0100 Subject: aarch64: Remove macros for vld2[q]_lane Neon intrinsics Remove macros for vld2[q]_lane Neon intrinsics. This is a preparatory step before adding new modes for structures of Advanced SIMD vectors. gcc/ChangeLog: 2021-08-12 Jonathan Wright * config/aarch64/arm_neon.h (__LD2_LANE_FUNC): Delete. (__LD2Q_LANE_FUNC): Likewise. (vld2_lane_u8): Define without macro. (vld2_lane_u16): Likewise. (vld2_lane_u32): Likewise. (vld2_lane_u64): Likewise. (vld2_lane_s8): Likewise. (vld2_lane_s16): Likewise. (vld2_lane_s32): Likewise. (vld2_lane_s64): Likewise. (vld2_lane_f16): Likewise. (vld2_lane_f32): Likewise. (vld2_lane_f64): Likewise. (vld2_lane_p8): Likewise. (vld2_lane_p16): Likewise. (vld2_lane_p64): Likewise. (vld2q_lane_u8): Likewise. (vld2q_lane_u16): Likewise. (vld2q_lane_u32): Likewise. (vld2q_lane_u64): Likewise. (vld2q_lane_s8): Likewise. (vld2q_lane_s16): Likewise. (vld2q_lane_s32): Likewise. (vld2q_lane_s64): Likewise. (vld2q_lane_f16): Likewise. (vld2q_lane_f32): Likewise. (vld2q_lane_f64): Likewise. (vld2q_lane_p8): Likewise. (vld2q_lane_p16): Likewise. (vld2q_lane_p64): Likewise. (vld2_lane_bf16): Likewise. (vld2q_lane_bf16): Likewise. 
--- gcc/config/aarch64/arm_neon.h | 558 +++++++++++++++++++++++++++++++++++------- 1 file changed, 474 insertions(+), 84 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 390cf9a..91c072f 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -19882,92 +19882,455 @@ vld4q_dup_p64 (const poly64_t * __a) /* vld2_lane */ -#define __LD2_LANE_FUNC(intype, vectype, largetype, ptrtype, mode, \ - qmode, ptrmode, funcsuffix, signedtype) \ -__extension__ extern __inline intype \ -__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \ -vld2_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ -{ \ - __builtin_aarch64_simd_oi __o; \ - largetype __temp; \ - __temp.val[0] = \ - vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \ - __temp.val[1] = \ - vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \ - __o = __builtin_aarch64_set_qregoi##qmode (__o, \ - (signedtype) __temp.val[0], \ - 0); \ - __o = __builtin_aarch64_set_qregoi##qmode (__o, \ - (signedtype) __temp.val[1], \ - 1); \ - __o = __builtin_aarch64_ld2_lane##mode ( \ - (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \ - __b.val[0] = (vectype) __builtin_aarch64_get_dregoidi (__o, 0); \ - __b.val[1] = (vectype) __builtin_aarch64_get_dregoidi (__o, 1); \ - return __b; \ +__extension__ extern __inline uint8x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld2_lane_u8 (const uint8_t * __ptr, uint8x8x2_t __b, const int __c) +{ + __builtin_aarch64_simd_oi __o; + uint8x16x2_t __temp; + __temp.val[0] = vcombine_u8 (__b.val[0], vcreate_u8 (0)); + __temp.val[1] = vcombine_u8 (__b.val[1], vcreate_u8 (0)); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[1], 1); + __o = __builtin_aarch64_ld2_lanev8qi ( + (__builtin_aarch64_simd_qi *) __ptr, __o, __c); + __b.val[0] = (uint8x8_t) __builtin_aarch64_get_dregoidi (__o, 0); + __b.val[1] = (uint8x8_t) __builtin_aarch64_get_dregoidi (__o, 1); + return __b; } -__LD2_LANE_FUNC (float16x4x2_t, float16x4_t, float16x8x2_t, float16_t, v4hf, - v8hf, hf, f16, float16x8_t) -__LD2_LANE_FUNC (float32x2x2_t, float32x2_t, float32x4x2_t, float32_t, v2sf, v4sf, - sf, f32, float32x4_t) -__LD2_LANE_FUNC (float64x1x2_t, float64x1_t, float64x2x2_t, float64_t, df, v2df, - df, f64, float64x2_t) -__LD2_LANE_FUNC (poly8x8x2_t, poly8x8_t, poly8x16x2_t, poly8_t, v8qi, v16qi, qi, p8, - int8x16_t) -__LD2_LANE_FUNC (poly16x4x2_t, poly16x4_t, poly16x8x2_t, poly16_t, v4hi, v8hi, hi, - p16, int16x8_t) -__LD2_LANE_FUNC (poly64x1x2_t, poly64x1_t, poly64x2x2_t, poly64_t, di, - v2di_ssps, di, p64, poly64x2_t) -__LD2_LANE_FUNC (int8x8x2_t, int8x8_t, int8x16x2_t, int8_t, v8qi, v16qi, qi, s8, - int8x16_t) -__LD2_LANE_FUNC (int16x4x2_t, int16x4_t, int16x8x2_t, int16_t, v4hi, v8hi, hi, s16, - int16x8_t) -__LD2_LANE_FUNC (int32x2x2_t, int32x2_t, int32x4x2_t, int32_t, v2si, v4si, si, s32, - int32x4_t) -__LD2_LANE_FUNC (int64x1x2_t, int64x1_t, int64x2x2_t, int64_t, di, v2di, di, s64, - int64x2_t) -__LD2_LANE_FUNC (uint8x8x2_t, uint8x8_t, uint8x16x2_t, uint8_t, v8qi, v16qi, qi, u8, - int8x16_t) -__LD2_LANE_FUNC (uint16x4x2_t, uint16x4_t, uint16x8x2_t, uint16_t, v4hi, v8hi, hi, - u16, int16x8_t) -__LD2_LANE_FUNC (uint32x2x2_t, uint32x2_t, uint32x4x2_t, uint32_t, v2si, v4si, si, - u32, int32x4_t) -__LD2_LANE_FUNC (uint64x1x2_t, uint64x1_t, uint64x2x2_t, uint64_t, di, v2di, di, 
- u64, int64x2_t) +__extension__ extern __inline uint16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld2_lane_u16 (const uint16_t * __ptr, uint16x4x2_t __b, const int __c) +{ + __builtin_aarch64_simd_oi __o; + uint16x8x2_t __temp; + __temp.val[0] = vcombine_u16 (__b.val[0], vcreate_u16 (0)); + __temp.val[1] = vcombine_u16 (__b.val[1], vcreate_u16 (0)); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_ld2_lanev4hi ( + (__builtin_aarch64_simd_hi *) __ptr, __o, __c); + __b.val[0] = (uint16x4_t) __builtin_aarch64_get_dregoidi (__o, 0); + __b.val[1] = (uint16x4_t) __builtin_aarch64_get_dregoidi (__o, 1); + return __b; +} + +__extension__ extern __inline uint32x2x2_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld2_lane_u32 (const uint32_t * __ptr, uint32x2x2_t __b, const int __c) +{ + __builtin_aarch64_simd_oi __o; + uint32x4x2_t __temp; + __temp.val[0] = vcombine_u32 (__b.val[0], vcreate_u32 (0)); + __temp.val[1] = vcombine_u32 (__b.val[1], vcreate_u32 (0)); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __temp.val[1], 1); + __o = __builtin_aarch64_ld2_lanev2si ( + (__builtin_aarch64_simd_si *) __ptr, __o, __c); + __b.val[0] = (uint32x2_t) __builtin_aarch64_get_dregoidi (__o, 0); + __b.val[1] = (uint32x2_t) __builtin_aarch64_get_dregoidi (__o, 1); + return __b; +} + +__extension__ extern __inline uint64x1x2_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld2_lane_u64 (const uint64_t * __ptr, uint64x1x2_t __b, const int __c) +{ + __builtin_aarch64_simd_oi __o; + uint64x2x2_t __temp; + __temp.val[0] = vcombine_u64 (__b.val[0], vcreate_u64 (0)); + __temp.val[1] = vcombine_u64 (__b.val[1], vcreate_u64 (0)); + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_ld2_lanedi ( + (__builtin_aarch64_simd_di *) __ptr, __o, __c); + __b.val[0] = (uint64x1_t) __builtin_aarch64_get_dregoidi (__o, 0); + __b.val[1] = (uint64x1_t) __builtin_aarch64_get_dregoidi (__o, 1); + return __b; +} + +__extension__ extern __inline int8x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld2_lane_s8 (const int8_t * __ptr, int8x8x2_t __b, const int __c) +{ + __builtin_aarch64_simd_oi __o; + int8x16x2_t __temp; + __temp.val[0] = vcombine_s8 (__b.val[0], vcreate_s8 (0)); + __temp.val[1] = vcombine_s8 (__b.val[1], vcreate_s8 (0)); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[1], 1); + __o = __builtin_aarch64_ld2_lanev8qi ( + (__builtin_aarch64_simd_qi *) __ptr, __o, __c); + __b.val[0] = (int8x8_t) __builtin_aarch64_get_dregoidi (__o, 0); + __b.val[1] = (int8x8_t) __builtin_aarch64_get_dregoidi (__o, 1); + return __b; +} + +__extension__ extern __inline int16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld2_lane_s16 (const int16_t * __ptr, int16x4x2_t __b, const int __c) +{ + __builtin_aarch64_simd_oi __o; + int16x8x2_t __temp; + __temp.val[0] = vcombine_s16 (__b.val[0], vcreate_s16 (0)); + __temp.val[1] = vcombine_s16 (__b.val[1], vcreate_s16 (0)); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[0], 0); + __o = 
__builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_ld2_lanev4hi ( + (__builtin_aarch64_simd_hi *) __ptr, __o, __c); + __b.val[0] = (int16x4_t) __builtin_aarch64_get_dregoidi (__o, 0); + __b.val[1] = (int16x4_t) __builtin_aarch64_get_dregoidi (__o, 1); + return __b; +} + +__extension__ extern __inline int32x2x2_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld2_lane_s32 (const int32_t * __ptr, int32x2x2_t __b, const int __c) +{ + __builtin_aarch64_simd_oi __o; + int32x4x2_t __temp; + __temp.val[0] = vcombine_s32 (__b.val[0], vcreate_s32 (0)); + __temp.val[1] = vcombine_s32 (__b.val[1], vcreate_s32 (0)); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __temp.val[1], 1); + __o = __builtin_aarch64_ld2_lanev2si ( + (__builtin_aarch64_simd_si *) __ptr, __o, __c); + __b.val[0] = (int32x2_t) __builtin_aarch64_get_dregoidi (__o, 0); + __b.val[1] = (int32x2_t) __builtin_aarch64_get_dregoidi (__o, 1); + return __b; +} + +__extension__ extern __inline int64x1x2_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld2_lane_s64 (const int64_t * __ptr, int64x1x2_t __b, const int __c) +{ + __builtin_aarch64_simd_oi __o; + int64x2x2_t __temp; + __temp.val[0] = vcombine_s64 (__b.val[0], vcreate_s64 (0)); + __temp.val[1] = vcombine_s64 (__b.val[1], vcreate_s64 (0)); + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_ld2_lanedi ( + (__builtin_aarch64_simd_di *) __ptr, __o, __c); + __b.val[0] = (int64x1_t) __builtin_aarch64_get_dregoidi (__o, 0); + __b.val[1] = (int64x1_t) __builtin_aarch64_get_dregoidi (__o, 1); + return __b; +} + +__extension__ extern __inline float16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld2_lane_f16 (const float16_t * __ptr, float16x4x2_t __b, const int __c) +{ + __builtin_aarch64_simd_oi __o; + float16x8x2_t __temp; + __temp.val[0] = vcombine_f16 (__b.val[0], vcreate_f16 (0)); + __temp.val[1] = vcombine_f16 (__b.val[1], vcreate_f16 (0)); + __o = __builtin_aarch64_set_qregoiv8hf (__o, (float16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hf (__o, (float16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_ld2_lanev4hf ( + (__builtin_aarch64_simd_hf *) __ptr, __o, __c); + __b.val[0] = (float16x4_t) __builtin_aarch64_get_dregoidi (__o, 0); + __b.val[1] = (float16x4_t) __builtin_aarch64_get_dregoidi (__o, 1); + return __b; +} + +__extension__ extern __inline float32x2x2_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld2_lane_f32 (const float32_t * __ptr, float32x2x2_t __b, const int __c) +{ + __builtin_aarch64_simd_oi __o; + float32x4x2_t __temp; + __temp.val[0] = vcombine_f32 (__b.val[0], vcreate_f32 (0)); + __temp.val[1] = vcombine_f32 (__b.val[1], vcreate_f32 (0)); + __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) __temp.val[1], 1); + __o = __builtin_aarch64_ld2_lanev2sf ( + (__builtin_aarch64_simd_sf *) __ptr, __o, __c); + __b.val[0] = (float32x2_t) __builtin_aarch64_get_dregoidi (__o, 0); + __b.val[1] = (float32x2_t) __builtin_aarch64_get_dregoidi (__o, 1); + return __b; +} + +__extension__ extern __inline float64x1x2_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld2_lane_f64 (const float64_t 
* __ptr, float64x1x2_t __b, const int __c) +{ + __builtin_aarch64_simd_oi __o; + float64x2x2_t __temp; + __temp.val[0] = vcombine_f64 (__b.val[0], vcreate_f64 (0)); + __temp.val[1] = vcombine_f64 (__b.val[1], vcreate_f64 (0)); + __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_ld2_lanedf ( + (__builtin_aarch64_simd_df *) __ptr, __o, __c); + __b.val[0] = (float64x1_t) __builtin_aarch64_get_dregoidi (__o, 0); + __b.val[1] = (float64x1_t) __builtin_aarch64_get_dregoidi (__o, 1); + return __b; +} + +__extension__ extern __inline poly8x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld2_lane_p8 (const poly8_t * __ptr, poly8x8x2_t __b, const int __c) +{ + __builtin_aarch64_simd_oi __o; + poly8x16x2_t __temp; + __temp.val[0] = vcombine_p8 (__b.val[0], vcreate_p8 (0)); + __temp.val[1] = vcombine_p8 (__b.val[1], vcreate_p8 (0)); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[1], 1); + __o = __builtin_aarch64_ld2_lanev8qi ( + (__builtin_aarch64_simd_qi *) __ptr, __o, __c); + __b.val[0] = (poly8x8_t) __builtin_aarch64_get_dregoidi (__o, 0); + __b.val[1] = (poly8x8_t) __builtin_aarch64_get_dregoidi (__o, 1); + return __b; +} + +__extension__ extern __inline poly16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld2_lane_p16 (const poly16_t * __ptr, poly16x4x2_t __b, const int __c) +{ + __builtin_aarch64_simd_oi __o; + poly16x8x2_t __temp; + __temp.val[0] = vcombine_p16 (__b.val[0], vcreate_p16 (0)); + __temp.val[1] = vcombine_p16 (__b.val[1], vcreate_p16 (0)); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_ld2_lanev4hi ( + (__builtin_aarch64_simd_hi *) __ptr, __o, __c); + __b.val[0] = (poly16x4_t) __builtin_aarch64_get_dregoidi (__o, 0); + __b.val[1] = (poly16x4_t) __builtin_aarch64_get_dregoidi (__o, 1); + return __b; +} + +__extension__ extern __inline poly64x1x2_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld2_lane_p64 (const poly64_t * __ptr, poly64x1x2_t __b, const int __c) +{ + __builtin_aarch64_simd_oi __o; + poly64x2x2_t __temp; + __temp.val[0] = vcombine_p64 (__b.val[0], vcreate_p64 (0)); + __temp.val[1] = vcombine_p64 (__b.val[1], vcreate_p64 (0)); + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_ld2_lanedi ( + (__builtin_aarch64_simd_di *) __ptr, __o, __c); + __b.val[0] = (poly64x1_t) __builtin_aarch64_get_dregoidi (__o, 0); + __b.val[1] = (poly64x1_t) __builtin_aarch64_get_dregoidi (__o, 1); + return __b; +} /* vld2q_lane */ -#define __LD2Q_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \ -__extension__ extern __inline intype \ -__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \ -vld2q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ -{ \ - __builtin_aarch64_simd_oi __o; \ - intype ret; \ - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[0], 0); \ - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[1], 1); \ - __o = __builtin_aarch64_ld2_lane##mode ( \ - (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \ - ret.val[0] = 
(vtype) __builtin_aarch64_get_qregoiv4si (__o, 0); \ - ret.val[1] = (vtype) __builtin_aarch64_get_qregoiv4si (__o, 1); \ - return ret; \ +__extension__ extern __inline uint8x16x2_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld2q_lane_u8 (const uint8_t * __ptr, uint8x16x2_t __b, const int __c) +{ + __builtin_aarch64_simd_oi __o; + uint8x16x2_t ret; + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_ld2_lanev16qi ( + (__builtin_aarch64_simd_qi *) __ptr, __o, __c); + ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregoiv4si (__o, 0); + ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregoiv4si (__o, 1); + return ret; } -__LD2Q_LANE_FUNC (float16x8x2_t, float16x8_t, float16_t, v8hf, hf, f16) -__LD2Q_LANE_FUNC (float32x4x2_t, float32x4_t, float32_t, v4sf, sf, f32) -__LD2Q_LANE_FUNC (float64x2x2_t, float64x2_t, float64_t, v2df, df, f64) -__LD2Q_LANE_FUNC (poly8x16x2_t, poly8x16_t, poly8_t, v16qi, qi, p8) -__LD2Q_LANE_FUNC (poly16x8x2_t, poly16x8_t, poly16_t, v8hi, hi, p16) -__LD2Q_LANE_FUNC (poly64x2x2_t, poly64x2_t, poly64_t, v2di, di, p64) -__LD2Q_LANE_FUNC (int8x16x2_t, int8x16_t, int8_t, v16qi, qi, s8) -__LD2Q_LANE_FUNC (int16x8x2_t, int16x8_t, int16_t, v8hi, hi, s16) -__LD2Q_LANE_FUNC (int32x4x2_t, int32x4_t, int32_t, v4si, si, s32) -__LD2Q_LANE_FUNC (int64x2x2_t, int64x2_t, int64_t, v2di, di, s64) -__LD2Q_LANE_FUNC (uint8x16x2_t, uint8x16_t, uint8_t, v16qi, qi, u8) -__LD2Q_LANE_FUNC (uint16x8x2_t, uint16x8_t, uint16_t, v8hi, hi, u16) -__LD2Q_LANE_FUNC (uint32x4x2_t, uint32x4_t, uint32_t, v4si, si, u32) -__LD2Q_LANE_FUNC (uint64x2x2_t, uint64x2_t, uint64_t, v2di, di, u64) +__extension__ extern __inline uint16x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld2q_lane_u16 (const uint16_t * __ptr, uint16x8x2_t __b, const int __c) +{ + __builtin_aarch64_simd_oi __o; + uint16x8x2_t ret; + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_ld2_lanev8hi ( + (__builtin_aarch64_simd_hi *) __ptr, __o, __c); + ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregoiv4si (__o, 0); + ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregoiv4si (__o, 1); + return ret; +} + +__extension__ extern __inline uint32x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld2q_lane_u32 (const uint32_t * __ptr, uint32x4x2_t __b, const int __c) +{ + __builtin_aarch64_simd_oi __o; + uint32x4x2_t ret; + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_ld2_lanev4si ( + (__builtin_aarch64_simd_si *) __ptr, __o, __c); + ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 0); + ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 1); + return ret; +} + +__extension__ extern __inline uint64x2x2_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld2q_lane_u64 (const uint64_t * __ptr, uint64x2x2_t __b, const int __c) +{ + __builtin_aarch64_simd_oi __o; + uint64x2x2_t ret; + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_ld2_lanev2di ( + (__builtin_aarch64_simd_di *) __ptr, __o, __c); + ret.val[0] = (uint64x2_t) 
__builtin_aarch64_get_qregoiv4si (__o, 0); + ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregoiv4si (__o, 1); + return ret; +} + +__extension__ extern __inline int8x16x2_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld2q_lane_s8 (const int8_t * __ptr, int8x16x2_t __b, const int __c) +{ + __builtin_aarch64_simd_oi __o; + int8x16x2_t ret; + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_ld2_lanev16qi ( + (__builtin_aarch64_simd_qi *) __ptr, __o, __c); + ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregoiv4si (__o, 0); + ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregoiv4si (__o, 1); + return ret; +} + +__extension__ extern __inline int16x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld2q_lane_s16 (const int16_t * __ptr, int16x8x2_t __b, const int __c) +{ + __builtin_aarch64_simd_oi __o; + int16x8x2_t ret; + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_ld2_lanev8hi ( + (__builtin_aarch64_simd_hi *) __ptr, __o, __c); + ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregoiv4si (__o, 0); + ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregoiv4si (__o, 1); + return ret; +} + +__extension__ extern __inline int32x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld2q_lane_s32 (const int32_t * __ptr, int32x4x2_t __b, const int __c) +{ + __builtin_aarch64_simd_oi __o; + int32x4x2_t ret; + __o = __builtin_aarch64_set_qregoiv4si (__o, __b.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4si (__o, __b.val[1], 1); + __o = __builtin_aarch64_ld2_lanev4si ( + (__builtin_aarch64_simd_si *) __ptr, __o, __c); + ret.val[0] = __builtin_aarch64_get_qregoiv4si (__o, 0); + ret.val[1] = __builtin_aarch64_get_qregoiv4si (__o, 1); + return ret; +} + +__extension__ extern __inline int64x2x2_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld2q_lane_s64 (const int64_t * __ptr, int64x2x2_t __b, const int __c) +{ + __builtin_aarch64_simd_oi __o; + int64x2x2_t ret; + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_ld2_lanev2di ( + (__builtin_aarch64_simd_di *) __ptr, __o, __c); + ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregoiv4si (__o, 0); + ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregoiv4si (__o, 1); + return ret; +} + +__extension__ extern __inline float16x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld2q_lane_f16 (const float16_t * __ptr, float16x8x2_t __b, const int __c) +{ + __builtin_aarch64_simd_oi __o; + float16x8x2_t ret; + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_ld2_lanev8hf ( + (__builtin_aarch64_simd_hf *) __ptr, __o, __c); + ret.val[0] = (float16x8_t) __builtin_aarch64_get_qregoiv4si (__o, 0); + ret.val[1] = (float16x8_t) __builtin_aarch64_get_qregoiv4si (__o, 1); + return ret; +} + +__extension__ extern __inline float32x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld2q_lane_f32 (const float32_t * __ptr, float32x4x2_t __b, const int __c) +{ + __builtin_aarch64_simd_oi __o; + float32x4x2_t ret; + __o = 
__builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_ld2_lanev4sf ( + (__builtin_aarch64_simd_sf *) __ptr, __o, __c); + ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 0); + ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 1); + return ret; +} + +__extension__ extern __inline float64x2x2_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld2q_lane_f64 (const float64_t * __ptr, float64x2x2_t __b, const int __c) +{ + __builtin_aarch64_simd_oi __o; + float64x2x2_t ret; + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_ld2_lanev2df ( + (__builtin_aarch64_simd_df *) __ptr, __o, __c); + ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregoiv4si (__o, 0); + ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregoiv4si (__o, 1); + return ret; +} + +__extension__ extern __inline poly8x16x2_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld2q_lane_p8 (const poly8_t * __ptr, poly8x16x2_t __b, const int __c) +{ + __builtin_aarch64_simd_oi __o; + poly8x16x2_t ret; + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_ld2_lanev16qi ( + (__builtin_aarch64_simd_qi *) __ptr, __o, __c); + ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregoiv4si (__o, 0); + ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregoiv4si (__o, 1); + return ret; +} + +__extension__ extern __inline poly16x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld2q_lane_p16 (const poly16_t * __ptr, poly16x8x2_t __b, const int __c) +{ + __builtin_aarch64_simd_oi __o; + poly16x8x2_t ret; + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_ld2_lanev8hi ( + (__builtin_aarch64_simd_hi *) __ptr, __o, __c); + ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregoiv4si (__o, 0); + ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregoiv4si (__o, 1); + return ret; +} + +__extension__ extern __inline poly64x2x2_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld2q_lane_p64 (const poly64_t * __ptr, poly64x2x2_t __b, const int __c) +{ + __builtin_aarch64_simd_oi __o; + poly64x2x2_t ret; + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_ld2_lanev2di ( + (__builtin_aarch64_simd_di *) __ptr, __o, __c); + ret.val[0] = (poly64x2_t) __builtin_aarch64_get_qregoiv4si (__o, 0); + ret.val[1] = (poly64x2_t) __builtin_aarch64_get_qregoiv4si (__o, 1); + return ret; +} /* vld3_lane */ @@ -34584,9 +34947,38 @@ vcopyq_laneq_bf16 (bfloat16x8_t __a, const int __lane1, __a, __lane1); } -__LD2_LANE_FUNC (bfloat16x4x2_t, bfloat16x4_t, bfloat16x8x2_t, bfloat16_t, v4bf, - v8bf, bf, bf16, bfloat16x8_t) -__LD2Q_LANE_FUNC (bfloat16x8x2_t, bfloat16x8_t, bfloat16_t, v8bf, bf, bf16) +__extension__ extern __inline bfloat16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld2_lane_bf16 (const bfloat16_t * __ptr, bfloat16x4x2_t __b, const int __c) +{ + __builtin_aarch64_simd_oi __o; + bfloat16x8x2_t __temp; + __temp.val[0] = 
vcombine_bf16 (__b.val[0], vcreate_bf16 (0)); + __temp.val[1] = vcombine_bf16 (__b.val[1], vcreate_bf16 (0)); + __o = __builtin_aarch64_set_qregoiv8bf (__o, (bfloat16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8bf (__o, (bfloat16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_ld2_lanev4bf ( + (__builtin_aarch64_simd_bf *) __ptr, __o, __c); + __b.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregoidi (__o, 0); + __b.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregoidi (__o, 1); + return __b; +} + +__extension__ extern __inline bfloat16x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld2q_lane_bf16 (const bfloat16_t * __ptr, bfloat16x8x2_t __b, const int __c) +{ + __builtin_aarch64_simd_oi __o; + bfloat16x8x2_t ret; + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_ld2_lanev8bf ( + (__builtin_aarch64_simd_bf *) __ptr, __o, __c); + ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv4si (__o, 0); + ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv4si (__o, 1); + return ret; +} + __LD3_LANE_FUNC (bfloat16x4x3_t, bfloat16x4_t, bfloat16x8x3_t, bfloat16_t, v4bf, v8bf, bf, bf16, bfloat16x8_t) __LD3Q_LANE_FUNC (bfloat16x8x3_t, bfloat16x8_t, bfloat16_t, v8bf, bf, bf16) @@ -34888,8 +35280,6 @@ vaddq_p128 (poly128_t __a, poly128_t __b) #undef __aarch64_vdupq_laneq_u32 #undef __aarch64_vdupq_laneq_u64 -#undef __LD2_LANE_FUNC -#undef __LD2Q_LANE_FUNC #undef __LD3_LANE_FUNC #undef __LD3Q_LANE_FUNC #undef __LD4_LANE_FUNC -- cgit v1.1 From 08f83812e5c5fdd9a7a4a1b9e46bb33725185c5a Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Mon, 16 Aug 2021 09:59:44 +0100 Subject: aarch64: Remove macros for vld3[q]_lane Neon intrinsics Remove macros for vld3[q]_lane Neon intrinsics. This is a preparatory step before adding new modes for structures of Advanced SIMD vectors. gcc/ChangeLog: 2021-08-16 Jonathan Wright * config/aarch64/arm_neon.h (__LD3_LANE_FUNC): Delete. (__LD3Q_LANE_FUNC): Delete. (vld3_lane_u8): Define without macro. (vld3_lane_u16): Likewise. (vld3_lane_u32): Likewise. (vld3_lane_u64): Likewise. (vld3_lane_s8): Likewise. (vld3_lane_s16): Likewise. (vld3_lane_s32): Likewise. (vld3_lane_s64): Likewise. (vld3_lane_f16): Likewise. (vld3_lane_f32): Likewise. (vld3_lane_f64): Likewise. (vld3_lane_p8): Likewise. (vld3_lane_p16): Likewise. (vld3_lane_p64): Likewise. (vld3q_lane_u8): Likewise. (vld3q_lane_u16): Likewise. (vld3q_lane_u32): Likewise. (vld3q_lane_u64): Likewise. (vld3q_lane_s8): Likewise. (vld3q_lane_s16): Likewise. (vld3q_lane_s32): Likewise. (vld3q_lane_s64): Likewise. (vld3q_lane_f16): Likewise. (vld3q_lane_f32): Likewise. (vld3q_lane_f64): Likewise. (vld3q_lane_p8): Likewise. (vld3q_lane_p16): Likewise. (vld3q_lane_p64): Likewise. (vld3_lane_bf16): Likewise. (vld3q_lane_bf16): Likewise. 
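For readers unfamiliar with the lane-load intrinsics being expanded by this patch, a minimal usage sketch follows. It is not part of the patch: the wrapper function name, the lane index, and the data layout are illustrative assumptions; only the intrinsic vld3_lane_u16 itself comes from the definitions added below.

/* Illustrative usage sketch (not part of the patch).  vld3_lane_u16 loads
   one 3-element structure -- three consecutive uint16_t values -- from
   memory and inserts them into the requested lane of each of the three
   vectors, leaving the other lanes unchanged.  Assumes an AArch64 target
   providing <arm_neon.h>.  */
#include <arm_neon.h>

uint16x4x3_t
load_lane2_example (const uint16_t *ptr, uint16x4x3_t acc)
{
  /* ptr[0], ptr[1] and ptr[2] go into lane 2 of acc.val[0], acc.val[1]
     and acc.val[2] respectively; the lane index must be a constant in
     the range 0..3 for uint16x4_t vectors.  */
  return vld3_lane_u16 (ptr, acc, 2);
}

The explicit inline definitions below provide exactly this interface; replacing the generator macros with plain functions keeps the user-visible behaviour unchanged while making room for the new structure modes mentioned above.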
--- gcc/config/aarch64/arm_neon.h | 641 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 549 insertions(+), 92 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 91c072f..29b6298 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -20334,100 +20334,525 @@ vld2q_lane_p64 (const poly64_t * __ptr, poly64x2x2_t __b, const int __c) /* vld3_lane */ -#define __LD3_LANE_FUNC(intype, vectype, largetype, ptrtype, mode, \ - qmode, ptrmode, funcsuffix, signedtype) \ -__extension__ extern __inline intype \ -__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \ -vld3_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ -{ \ - __builtin_aarch64_simd_ci __o; \ - largetype __temp; \ - __temp.val[0] = \ - vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \ - __temp.val[1] = \ - vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \ - __temp.val[2] = \ - vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0)); \ - __o = __builtin_aarch64_set_qregci##qmode (__o, \ - (signedtype) __temp.val[0], \ - 0); \ - __o = __builtin_aarch64_set_qregci##qmode (__o, \ - (signedtype) __temp.val[1], \ - 1); \ - __o = __builtin_aarch64_set_qregci##qmode (__o, \ - (signedtype) __temp.val[2], \ - 2); \ - __o = __builtin_aarch64_ld3_lane##mode ( \ - (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \ - __b.val[0] = (vectype) __builtin_aarch64_get_dregcidi (__o, 0); \ - __b.val[1] = (vectype) __builtin_aarch64_get_dregcidi (__o, 1); \ - __b.val[2] = (vectype) __builtin_aarch64_get_dregcidi (__o, 2); \ - return __b; \ +__extension__ extern __inline uint8x8x3_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld3_lane_u8 (const uint8_t * __ptr, uint8x8x3_t __b, const int __c) +{ + __builtin_aarch64_simd_ci __o; + uint8x16x3_t __temp; + __temp.val[0] = vcombine_u8 (__b.val[0], vcreate_u8 (0)); + __temp.val[1] = vcombine_u8 (__b.val[1], vcreate_u8 (0)); + __temp.val[2] = vcombine_u8 (__b.val[2], vcreate_u8 (0)); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[2], 2); + __o = __builtin_aarch64_ld3_lanev8qi ( + (__builtin_aarch64_simd_qi *) __ptr, __o, __c); + __b.val[0] = (uint8x8_t) __builtin_aarch64_get_dregcidi (__o, 0); + __b.val[1] = (uint8x8_t) __builtin_aarch64_get_dregcidi (__o, 1); + __b.val[2] = (uint8x8_t) __builtin_aarch64_get_dregcidi (__o, 2); + return __b; } -__LD3_LANE_FUNC (float16x4x3_t, float16x4_t, float16x8x3_t, float16_t, v4hf, - v8hf, hf, f16, float16x8_t) -__LD3_LANE_FUNC (float32x2x3_t, float32x2_t, float32x4x3_t, float32_t, v2sf, v4sf, - sf, f32, float32x4_t) -__LD3_LANE_FUNC (float64x1x3_t, float64x1_t, float64x2x3_t, float64_t, df, v2df, - df, f64, float64x2_t) -__LD3_LANE_FUNC (poly8x8x3_t, poly8x8_t, poly8x16x3_t, poly8_t, v8qi, v16qi, qi, p8, - int8x16_t) -__LD3_LANE_FUNC (poly16x4x3_t, poly16x4_t, poly16x8x3_t, poly16_t, v4hi, v8hi, hi, - p16, int16x8_t) -__LD3_LANE_FUNC (poly64x1x3_t, poly64x1_t, poly64x2x3_t, poly64_t, di, - v2di_ssps, di, p64, poly64x2_t) -__LD3_LANE_FUNC (int8x8x3_t, int8x8_t, int8x16x3_t, int8_t, v8qi, v16qi, qi, s8, - int8x16_t) -__LD3_LANE_FUNC (int16x4x3_t, int16x4_t, int16x8x3_t, int16_t, v4hi, v8hi, hi, s16, - int16x8_t) -__LD3_LANE_FUNC (int32x2x3_t, int32x2_t, int32x4x3_t, int32_t, v2si, v4si, si, s32, - 
int32x4_t) -__LD3_LANE_FUNC (int64x1x3_t, int64x1_t, int64x2x3_t, int64_t, di, v2di, di, s64, - int64x2_t) -__LD3_LANE_FUNC (uint8x8x3_t, uint8x8_t, uint8x16x3_t, uint8_t, v8qi, v16qi, qi, u8, - int8x16_t) -__LD3_LANE_FUNC (uint16x4x3_t, uint16x4_t, uint16x8x3_t, uint16_t, v4hi, v8hi, hi, - u16, int16x8_t) -__LD3_LANE_FUNC (uint32x2x3_t, uint32x2_t, uint32x4x3_t, uint32_t, v2si, v4si, si, - u32, int32x4_t) -__LD3_LANE_FUNC (uint64x1x3_t, uint64x1_t, uint64x2x3_t, uint64_t, di, v2di, di, - u64, int64x2_t) +__extension__ extern __inline uint16x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld3_lane_u16 (const uint16_t * __ptr, uint16x4x3_t __b, const int __c) +{ + __builtin_aarch64_simd_ci __o; + uint16x8x3_t __temp; + __temp.val[0] = vcombine_u16 (__b.val[0], vcreate_u16 (0)); + __temp.val[1] = vcombine_u16 (__b.val[1], vcreate_u16 (0)); + __temp.val[2] = vcombine_u16 (__b.val[2], vcreate_u16 (0)); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[2], 2); + __o = __builtin_aarch64_ld3_lanev4hi ( + (__builtin_aarch64_simd_hi *) __ptr, __o, __c); + __b.val[0] = (uint16x4_t) __builtin_aarch64_get_dregcidi (__o, 0); + __b.val[1] = (uint16x4_t) __builtin_aarch64_get_dregcidi (__o, 1); + __b.val[2] = (uint16x4_t) __builtin_aarch64_get_dregcidi (__o, 2); + return __b; +} + +__extension__ extern __inline uint32x2x3_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld3_lane_u32 (const uint32_t * __ptr, uint32x2x3_t __b, const int __c) +{ + __builtin_aarch64_simd_ci __o; + uint32x4x3_t __temp; + __temp.val[0] = vcombine_u32 (__b.val[0], vcreate_u32 (0)); + __temp.val[1] = vcombine_u32 (__b.val[1], vcreate_u32 (0)); + __temp.val[2] = vcombine_u32 (__b.val[2], vcreate_u32 (0)); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[2], 2); + __o = __builtin_aarch64_ld3_lanev2si ( + (__builtin_aarch64_simd_si *) __ptr, __o, __c); + __b.val[0] = (uint32x2_t) __builtin_aarch64_get_dregcidi (__o, 0); + __b.val[1] = (uint32x2_t) __builtin_aarch64_get_dregcidi (__o, 1); + __b.val[2] = (uint32x2_t) __builtin_aarch64_get_dregcidi (__o, 2); + return __b; +} + +__extension__ extern __inline uint64x1x3_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld3_lane_u64 (const uint64_t * __ptr, uint64x1x3_t __b, const int __c) +{ + __builtin_aarch64_simd_ci __o; + uint64x2x3_t __temp; + __temp.val[0] = vcombine_u64 (__b.val[0], vcreate_u64 (0)); + __temp.val[1] = vcombine_u64 (__b.val[1], vcreate_u64 (0)); + __temp.val[2] = vcombine_u64 (__b.val[2], vcreate_u64 (0)); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[2], 2); + __o = __builtin_aarch64_ld3_lanedi ( + (__builtin_aarch64_simd_di *) __ptr, __o, __c); + __b.val[0] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 0); + __b.val[1] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 1); + __b.val[2] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 2); + return __b; +} + +__extension__ extern __inline int8x8x3_t +__attribute__ ((__always_inline__, 
__gnu_inline__,__artificial__)) +vld3_lane_s8 (const int8_t * __ptr, int8x8x3_t __b, const int __c) +{ + __builtin_aarch64_simd_ci __o; + int8x16x3_t __temp; + __temp.val[0] = vcombine_s8 (__b.val[0], vcreate_s8 (0)); + __temp.val[1] = vcombine_s8 (__b.val[1], vcreate_s8 (0)); + __temp.val[2] = vcombine_s8 (__b.val[2], vcreate_s8 (0)); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[2], 2); + __o = __builtin_aarch64_ld3_lanev8qi ( + (__builtin_aarch64_simd_qi *) __ptr, __o, __c); + __b.val[0] = (int8x8_t) __builtin_aarch64_get_dregcidi (__o, 0); + __b.val[1] = (int8x8_t) __builtin_aarch64_get_dregcidi (__o, 1); + __b.val[2] = (int8x8_t) __builtin_aarch64_get_dregcidi (__o, 2); + return __b; +} + +__extension__ extern __inline int16x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld3_lane_s16 (const int16_t * __ptr, int16x4x3_t __b, const int __c) +{ + __builtin_aarch64_simd_ci __o; + int16x8x3_t __temp; + __temp.val[0] = vcombine_s16 (__b.val[0], vcreate_s16 (0)); + __temp.val[1] = vcombine_s16 (__b.val[1], vcreate_s16 (0)); + __temp.val[2] = vcombine_s16 (__b.val[2], vcreate_s16 (0)); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[2], 2); + __o = __builtin_aarch64_ld3_lanev4hi ( + (__builtin_aarch64_simd_hi *) __ptr, __o, __c); + __b.val[0] = (int16x4_t) __builtin_aarch64_get_dregcidi (__o, 0); + __b.val[1] = (int16x4_t) __builtin_aarch64_get_dregcidi (__o, 1); + __b.val[2] = (int16x4_t) __builtin_aarch64_get_dregcidi (__o, 2); + return __b; +} + +__extension__ extern __inline int32x2x3_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld3_lane_s32 (const int32_t * __ptr, int32x2x3_t __b, const int __c) +{ + __builtin_aarch64_simd_ci __o; + int32x4x3_t __temp; + __temp.val[0] = vcombine_s32 (__b.val[0], vcreate_s32 (0)); + __temp.val[1] = vcombine_s32 (__b.val[1], vcreate_s32 (0)); + __temp.val[2] = vcombine_s32 (__b.val[2], vcreate_s32 (0)); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[2], 2); + __o = __builtin_aarch64_ld3_lanev2si ( + (__builtin_aarch64_simd_si *) __ptr, __o, __c); + __b.val[0] = (int32x2_t) __builtin_aarch64_get_dregcidi (__o, 0); + __b.val[1] = (int32x2_t) __builtin_aarch64_get_dregcidi (__o, 1); + __b.val[2] = (int32x2_t) __builtin_aarch64_get_dregcidi (__o, 2); + return __b; +} + +__extension__ extern __inline int64x1x3_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld3_lane_s64 (const int64_t * __ptr, int64x1x3_t __b, const int __c) +{ + __builtin_aarch64_simd_ci __o; + int64x2x3_t __temp; + __temp.val[0] = vcombine_s64 (__b.val[0], vcreate_s64 (0)); + __temp.val[1] = vcombine_s64 (__b.val[1], vcreate_s64 (0)); + __temp.val[2] = vcombine_s64 (__b.val[2], vcreate_s64 (0)); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[2], 2); + __o = __builtin_aarch64_ld3_lanedi ( + 
(__builtin_aarch64_simd_di *) __ptr, __o, __c); + __b.val[0] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 0); + __b.val[1] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 1); + __b.val[2] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 2); + return __b; +} + +__extension__ extern __inline float16x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld3_lane_f16 (const float16_t * __ptr, float16x4x3_t __b, const int __c) +{ + __builtin_aarch64_simd_ci __o; + float16x8x3_t __temp; + __temp.val[0] = vcombine_f16 (__b.val[0], vcreate_f16 (0)); + __temp.val[1] = vcombine_f16 (__b.val[1], vcreate_f16 (0)); + __temp.val[2] = vcombine_f16 (__b.val[2], vcreate_f16 (0)); + __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __temp.val[2], 2); + __o = __builtin_aarch64_ld3_lanev4hf ( + (__builtin_aarch64_simd_hf *) __ptr, __o, __c); + __b.val[0] = (float16x4_t) __builtin_aarch64_get_dregcidi (__o, 0); + __b.val[1] = (float16x4_t) __builtin_aarch64_get_dregcidi (__o, 1); + __b.val[2] = (float16x4_t) __builtin_aarch64_get_dregcidi (__o, 2); + return __b; +} + +__extension__ extern __inline float32x2x3_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld3_lane_f32 (const float32_t * __ptr, float32x2x3_t __b, const int __c) +{ + __builtin_aarch64_simd_ci __o; + float32x4x3_t __temp; + __temp.val[0] = vcombine_f32 (__b.val[0], vcreate_f32 (0)); + __temp.val[1] = vcombine_f32 (__b.val[1], vcreate_f32 (0)); + __temp.val[2] = vcombine_f32 (__b.val[2], vcreate_f32 (0)); + __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __temp.val[2], 2); + __o = __builtin_aarch64_ld3_lanev2sf ( + (__builtin_aarch64_simd_sf *) __ptr, __o, __c); + __b.val[0] = (float32x2_t) __builtin_aarch64_get_dregcidi (__o, 0); + __b.val[1] = (float32x2_t) __builtin_aarch64_get_dregcidi (__o, 1); + __b.val[2] = (float32x2_t) __builtin_aarch64_get_dregcidi (__o, 2); + return __b; +} + +__extension__ extern __inline float64x1x3_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld3_lane_f64 (const float64_t * __ptr, float64x1x3_t __b, const int __c) +{ + __builtin_aarch64_simd_ci __o; + float64x2x3_t __temp; + __temp.val[0] = vcombine_f64 (__b.val[0], vcreate_f64 (0)); + __temp.val[1] = vcombine_f64 (__b.val[1], vcreate_f64 (0)); + __temp.val[2] = vcombine_f64 (__b.val[2], vcreate_f64 (0)); + __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __temp.val[2], 2); + __o = __builtin_aarch64_ld3_lanedi ( + (__builtin_aarch64_simd_di *) __ptr, __o, __c); + __b.val[0] = (float64x1_t) __builtin_aarch64_get_dregcidi (__o, 0); + __b.val[1] = (float64x1_t) __builtin_aarch64_get_dregcidi (__o, 1); + __b.val[2] = (float64x1_t) __builtin_aarch64_get_dregcidi (__o, 2); + return __b; +} + +__extension__ extern __inline poly8x8x3_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld3_lane_p8 (const poly8_t * __ptr, poly8x8x3_t __b, const int __c) +{ + __builtin_aarch64_simd_ci __o; + poly8x16x3_t __temp; + __temp.val[0] = vcombine_p8 (__b.val[0], vcreate_p8 (0)); + 
__temp.val[1] = vcombine_p8 (__b.val[1], vcreate_p8 (0)); + __temp.val[2] = vcombine_p8 (__b.val[2], vcreate_p8 (0)); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[2], 2); + __o = __builtin_aarch64_ld3_lanev8qi ( + (__builtin_aarch64_simd_qi *) __ptr, __o, __c); + __b.val[0] = (poly8x8_t) __builtin_aarch64_get_dregcidi (__o, 0); + __b.val[1] = (poly8x8_t) __builtin_aarch64_get_dregcidi (__o, 1); + __b.val[2] = (poly8x8_t) __builtin_aarch64_get_dregcidi (__o, 2); + return __b; +} + +__extension__ extern __inline poly16x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld3_lane_p16 (const poly16_t * __ptr, poly16x4x3_t __b, const int __c) +{ + __builtin_aarch64_simd_ci __o; + poly16x8x3_t __temp; + __temp.val[0] = vcombine_p16 (__b.val[0], vcreate_p16 (0)); + __temp.val[1] = vcombine_p16 (__b.val[1], vcreate_p16 (0)); + __temp.val[2] = vcombine_p16 (__b.val[2], vcreate_p16 (0)); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[2], 2); + __o = __builtin_aarch64_ld3_lanev4hi ( + (__builtin_aarch64_simd_hi *) __ptr, __o, __c); + __b.val[0] = (poly16x4_t) __builtin_aarch64_get_dregcidi (__o, 0); + __b.val[1] = (poly16x4_t) __builtin_aarch64_get_dregcidi (__o, 1); + __b.val[2] = (poly16x4_t) __builtin_aarch64_get_dregcidi (__o, 2); + return __b; +} + +__extension__ extern __inline poly64x1x3_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld3_lane_p64 (const poly64_t * __ptr, poly64x1x3_t __b, const int __c) +{ + __builtin_aarch64_simd_ci __o; + poly64x2x3_t __temp; + __temp.val[0] = vcombine_p64 (__b.val[0], vcreate_p64 (0)); + __temp.val[1] = vcombine_p64 (__b.val[1], vcreate_p64 (0)); + __temp.val[2] = vcombine_p64 (__b.val[2], vcreate_p64 (0)); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[2], 2); + __o = __builtin_aarch64_ld3_lanedi ( + (__builtin_aarch64_simd_di *) __ptr, __o, __c); + __b.val[0] = (poly64x1_t) __builtin_aarch64_get_dregcidi (__o, 0); + __b.val[1] = (poly64x1_t) __builtin_aarch64_get_dregcidi (__o, 1); + __b.val[2] = (poly64x1_t) __builtin_aarch64_get_dregcidi (__o, 2); + return __b; +} /* vld3q_lane */ -#define __LD3Q_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \ -__extension__ extern __inline intype \ -__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \ -vld3q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ -{ \ - __builtin_aarch64_simd_ci __o; \ - intype ret; \ - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0); \ - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1); \ - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2); \ - __o = __builtin_aarch64_ld3_lane##mode ( \ - (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \ - ret.val[0] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 0); \ - ret.val[1] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 1); \ - ret.val[2] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 2); \ - return ret; \ +__extension__ extern __inline 
uint8x16x3_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld3q_lane_u8 (const uint8_t * __ptr, uint8x16x3_t __b, const int __c) +{ + __builtin_aarch64_simd_ci __o; + uint8x16x3_t ret; + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_ld3_lanev16qi ( + (__builtin_aarch64_simd_qi *) __ptr, __o, __c); + ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregciv4si (__o, 0); + ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregciv4si (__o, 1); + ret.val[2] = (uint8x16_t) __builtin_aarch64_get_qregciv4si (__o, 2); + return ret; +} + +__extension__ extern __inline uint16x8x3_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld3q_lane_u16 (const uint16_t * __ptr, uint16x8x3_t __b, const int __c) +{ + __builtin_aarch64_simd_ci __o; + uint16x8x3_t ret; + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_ld3_lanev8hi ( + (__builtin_aarch64_simd_hi *) __ptr, __o, __c); + ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregciv4si (__o, 0); + ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregciv4si (__o, 1); + ret.val[2] = (uint16x8_t) __builtin_aarch64_get_qregciv4si (__o, 2); + return ret; +} + +__extension__ extern __inline uint32x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld3q_lane_u32 (const uint32_t * __ptr, uint32x4x3_t __b, const int __c) +{ + __builtin_aarch64_simd_ci __o; + uint32x4x3_t ret; + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_ld3_lanev4si ( + (__builtin_aarch64_simd_si *) __ptr, __o, __c); + ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 0); + ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1); + ret.val[2] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2); + return ret; +} + +__extension__ extern __inline uint64x2x3_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld3q_lane_u64 (const uint64_t * __ptr, uint64x2x3_t __b, const int __c) +{ + __builtin_aarch64_simd_ci __o; + uint64x2x3_t ret; + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_ld3_lanev2di ( + (__builtin_aarch64_simd_di *) __ptr, __o, __c); + ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregciv4si (__o, 0); + ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregciv4si (__o, 1); + ret.val[2] = (uint64x2_t) __builtin_aarch64_get_qregciv4si (__o, 2); + return ret; +} + +__extension__ extern __inline int8x16x3_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld3q_lane_s8 (const int8_t * __ptr, int8x16x3_t __b, const int __c) +{ + __builtin_aarch64_simd_ci __o; + int8x16x3_t ret; + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregciv4si 
(__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_ld3_lanev16qi ( + (__builtin_aarch64_simd_qi *) __ptr, __o, __c); + ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregciv4si (__o, 0); + ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregciv4si (__o, 1); + ret.val[2] = (int8x16_t) __builtin_aarch64_get_qregciv4si (__o, 2); + return ret; +} + +__extension__ extern __inline int16x8x3_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld3q_lane_s16 (const int16_t * __ptr, int16x8x3_t __b, const int __c) +{ + __builtin_aarch64_simd_ci __o; + int16x8x3_t ret; + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_ld3_lanev8hi ( + (__builtin_aarch64_simd_hi *) __ptr, __o, __c); + ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregciv4si (__o, 0); + ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregciv4si (__o, 1); + ret.val[2] = (int16x8_t) __builtin_aarch64_get_qregciv4si (__o, 2); + return ret; +} + +__extension__ extern __inline int32x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld3q_lane_s32 (const int32_t * __ptr, int32x4x3_t __b, const int __c) +{ + __builtin_aarch64_simd_ci __o; + int32x4x3_t ret; + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_ld3_lanev4si ( + (__builtin_aarch64_simd_si *) __ptr, __o, __c); + ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 0); + ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1); + ret.val[2] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2); + return ret; +} + +__extension__ extern __inline int64x2x3_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld3q_lane_s64 (const int64_t * __ptr, int64x2x3_t __b, const int __c) +{ + __builtin_aarch64_simd_ci __o; + int64x2x3_t ret; + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_ld3_lanev2di ( + (__builtin_aarch64_simd_di *) __ptr, __o, __c); + ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregciv4si (__o, 0); + ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregciv4si (__o, 1); + ret.val[2] = (int64x2_t) __builtin_aarch64_get_qregciv4si (__o, 2); + return ret; } -__LD3Q_LANE_FUNC (float16x8x3_t, float16x8_t, float16_t, v8hf, hf, f16) -__LD3Q_LANE_FUNC (float32x4x3_t, float32x4_t, float32_t, v4sf, sf, f32) -__LD3Q_LANE_FUNC (float64x2x3_t, float64x2_t, float64_t, v2df, df, f64) -__LD3Q_LANE_FUNC (poly8x16x3_t, poly8x16_t, poly8_t, v16qi, qi, p8) -__LD3Q_LANE_FUNC (poly16x8x3_t, poly16x8_t, poly16_t, v8hi, hi, p16) -__LD3Q_LANE_FUNC (poly64x2x3_t, poly64x2_t, poly64_t, v2di, di, p64) -__LD3Q_LANE_FUNC (int8x16x3_t, int8x16_t, int8_t, v16qi, qi, s8) -__LD3Q_LANE_FUNC (int16x8x3_t, int16x8_t, int16_t, v8hi, hi, s16) -__LD3Q_LANE_FUNC (int32x4x3_t, int32x4_t, int32_t, v4si, si, s32) -__LD3Q_LANE_FUNC (int64x2x3_t, int64x2_t, int64_t, v2di, di, s64) -__LD3Q_LANE_FUNC (uint8x16x3_t, uint8x16_t, uint8_t, v16qi, qi, u8) -__LD3Q_LANE_FUNC (uint16x8x3_t, uint16x8_t, uint16_t, v8hi, hi, u16) -__LD3Q_LANE_FUNC 
(uint32x4x3_t, uint32x4_t, uint32_t, v4si, si, u32) -__LD3Q_LANE_FUNC (uint64x2x3_t, uint64x2_t, uint64_t, v2di, di, u64) +__extension__ extern __inline float16x8x3_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld3q_lane_f16 (const float16_t * __ptr, float16x8x3_t __b, const int __c) +{ + __builtin_aarch64_simd_ci __o; + float16x8x3_t ret; + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_ld3_lanev8hf ( + (__builtin_aarch64_simd_hf *) __ptr, __o, __c); + ret.val[0] = (float16x8_t) __builtin_aarch64_get_qregciv4si (__o, 0); + ret.val[1] = (float16x8_t) __builtin_aarch64_get_qregciv4si (__o, 1); + ret.val[2] = (float16x8_t) __builtin_aarch64_get_qregciv4si (__o, 2); + return ret; +} + +__extension__ extern __inline float32x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld3q_lane_f32 (const float32_t * __ptr, float32x4x3_t __b, const int __c) +{ + __builtin_aarch64_simd_ci __o; + float32x4x3_t ret; + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_ld3_lanev4sf ( + (__builtin_aarch64_simd_sf *) __ptr, __o, __c); + ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregciv4si (__o, 0); + ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1); + ret.val[2] = (float32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2); + return ret; +} + +__extension__ extern __inline float64x2x3_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld3q_lane_f64 (const float64_t * __ptr, float64x2x3_t __b, const int __c) +{ + __builtin_aarch64_simd_ci __o; + float64x2x3_t ret; + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_ld3_lanev2df ( + (__builtin_aarch64_simd_df *) __ptr, __o, __c); + ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregciv4si (__o, 0); + ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregciv4si (__o, 1); + ret.val[2] = (float64x2_t) __builtin_aarch64_get_qregciv4si (__o, 2); + return ret; +} + +__extension__ extern __inline poly8x16x3_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld3q_lane_p8 (const poly8_t * __ptr, poly8x16x3_t __b, const int __c) +{ + __builtin_aarch64_simd_ci __o; + poly8x16x3_t ret; + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_ld3_lanev16qi ( + (__builtin_aarch64_simd_qi *) __ptr, __o, __c); + ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregciv4si (__o, 0); + ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregciv4si (__o, 1); + ret.val[2] = (poly8x16_t) __builtin_aarch64_get_qregciv4si (__o, 2); + return ret; +} + +__extension__ extern __inline poly16x8x3_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld3q_lane_p16 (const poly16_t * __ptr, poly16x8x3_t __b, const int __c) +{ + __builtin_aarch64_simd_ci __o; + poly16x8x3_t ret; + __o = 
__builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_ld3_lanev8hi ( + (__builtin_aarch64_simd_hi *) __ptr, __o, __c); + ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregciv4si (__o, 0); + ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregciv4si (__o, 1); + ret.val[2] = (poly16x8_t) __builtin_aarch64_get_qregciv4si (__o, 2); + return ret; +} + +__extension__ extern __inline poly64x2x3_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld3q_lane_p64 (const poly64_t * __ptr, poly64x2x3_t __b, const int __c) +{ + __builtin_aarch64_simd_ci __o; + poly64x2x3_t ret; + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_ld3_lanev2di ( + (__builtin_aarch64_simd_di *) __ptr, __o, __c); + ret.val[0] = (poly64x2_t) __builtin_aarch64_get_qregciv4si (__o, 0); + ret.val[1] = (poly64x2_t) __builtin_aarch64_get_qregciv4si (__o, 1); + ret.val[2] = (poly64x2_t) __builtin_aarch64_get_qregciv4si (__o, 2); + return ret; +} /* vld4_lane */ @@ -34979,9 +35404,43 @@ vld2q_lane_bf16 (const bfloat16_t * __ptr, bfloat16x8x2_t __b, const int __c) return ret; } -__LD3_LANE_FUNC (bfloat16x4x3_t, bfloat16x4_t, bfloat16x8x3_t, bfloat16_t, v4bf, - v8bf, bf, bf16, bfloat16x8_t) -__LD3Q_LANE_FUNC (bfloat16x8x3_t, bfloat16x8_t, bfloat16_t, v8bf, bf, bf16) +__extension__ extern __inline bfloat16x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld3_lane_bf16 (const bfloat16_t * __ptr, bfloat16x4x3_t __b, const int __c) +{ + __builtin_aarch64_simd_ci __o; + bfloat16x8x3_t __temp; + __temp.val[0] = vcombine_bf16 (__b.val[0], vcreate_bf16 (0)); + __temp.val[1] = vcombine_bf16 (__b.val[1], vcreate_bf16 (0)); + __temp.val[2] = vcombine_bf16 (__b.val[2], vcreate_bf16 (0)); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[2], 2); + __o = __builtin_aarch64_ld3_lanev4bf ( + (__builtin_aarch64_simd_bf *) __ptr, __o, __c); + __b.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregcidi (__o, 0); + __b.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregcidi (__o, 1); + __b.val[2] = (bfloat16x4_t) __builtin_aarch64_get_dregcidi (__o, 2); + return __b; +} + +__extension__ extern __inline bfloat16x8x3_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld3q_lane_bf16 (const bfloat16_t * __ptr, bfloat16x8x3_t __b, const int __c) +{ + __builtin_aarch64_simd_ci __o; + bfloat16x8x3_t ret; + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_ld3_lanev8bf ( + (__builtin_aarch64_simd_bf *) __ptr, __o, __c); + ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregciv4si (__o, 0); + ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregciv4si (__o, 1); + ret.val[2] = (bfloat16x8_t) __builtin_aarch64_get_qregciv4si (__o, 2); + return ret; +} + __LD4_LANE_FUNC (bfloat16x4x4_t, bfloat16x4_t, bfloat16x8x4_t, bfloat16_t, v4bf, v8bf, 
bf, bf16, bfloat16x8_t) __LD4Q_LANE_FUNC (bfloat16x8x4_t, bfloat16x8_t, bfloat16_t, v8bf, bf, bf16) @@ -35280,8 +35739,6 @@ vaddq_p128 (poly128_t __a, poly128_t __b) #undef __aarch64_vdupq_laneq_u32 #undef __aarch64_vdupq_laneq_u64 -#undef __LD3_LANE_FUNC -#undef __LD3Q_LANE_FUNC #undef __LD4_LANE_FUNC #undef __LD4Q_LANE_FUNC -- cgit v1.1 From d1819df86fbe42125cccb2fc2959a0bf51e524d6 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Mon, 16 Aug 2021 14:37:18 +0100 Subject: aarch64: Remove macros for vld4[q]_lane Neon intrinsics Remove macros for vld4[q]_lane Neon intrinsics. This is a preparatory step before adding new modes for structures of Advanced SIMD vectors. gcc/ChangeLog: 2021-08-16 Jonathan Wright * config/aarch64/arm_neon.h (__LD4_LANE_FUNC): Delete. (__LD4Q_LANE_FUNC): Likewise. (vld4_lane_u8): Define without macro. (vld4_lane_u16): Likewise. (vld4_lane_u32): Likewise. (vld4_lane_u64): Likewise. (vld4_lane_s8): Likewise. (vld4_lane_s16): Likewise. (vld4_lane_s32): Likewise. (vld4_lane_s64): Likewise. (vld4_lane_f16): Likewise. (vld4_lane_f32): Likewise. (vld4_lane_f64): Likewise. (vld4_lane_p8): Likewise. (vld4_lane_p16): Likewise. (vld4_lane_p64): Likewise. (vld4q_lane_u8): Likewise. (vld4q_lane_u16): Likewise. (vld4q_lane_u32): Likewise. (vld4q_lane_u64): Likewise. (vld4q_lane_s8): Likewise. (vld4q_lane_s16): Likewise. (vld4q_lane_s32): Likewise. (vld4q_lane_s64): Likewise. (vld4q_lane_f16): Likewise. (vld4q_lane_f32): Likewise. (vld4q_lane_f64): Likewise. (vld4q_lane_p8): Likewise. (vld4q_lane_p16): Likewise. (vld4q_lane_p64): Likewise. (vld4_lane_bf16): Likewise. (vld4q_lane_bf16): Likewise. --- gcc/config/aarch64/arm_neon.h | 728 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 624 insertions(+), 104 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 29b6298..d8b2970 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -20856,110 +20856,595 @@ vld3q_lane_p64 (const poly64_t * __ptr, poly64x2x3_t __b, const int __c) /* vld4_lane */ -#define __LD4_LANE_FUNC(intype, vectype, largetype, ptrtype, mode, \ - qmode, ptrmode, funcsuffix, signedtype) \ -__extension__ extern __inline intype \ -__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \ -vld4_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ -{ \ - __builtin_aarch64_simd_xi __o; \ - largetype __temp; \ - __temp.val[0] = \ - vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \ - __temp.val[1] = \ - vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \ - __temp.val[2] = \ - vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0)); \ - __temp.val[3] = \ - vcombine_##funcsuffix (__b.val[3], vcreate_##funcsuffix (0)); \ - __o = __builtin_aarch64_set_qregxi##qmode (__o, \ - (signedtype) __temp.val[0], \ - 0); \ - __o = __builtin_aarch64_set_qregxi##qmode (__o, \ - (signedtype) __temp.val[1], \ - 1); \ - __o = __builtin_aarch64_set_qregxi##qmode (__o, \ - (signedtype) __temp.val[2], \ - 2); \ - __o = __builtin_aarch64_set_qregxi##qmode (__o, \ - (signedtype) __temp.val[3], \ - 3); \ - __o = __builtin_aarch64_ld4_lane##mode ( \ - (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \ - __b.val[0] = (vectype) __builtin_aarch64_get_dregxidi (__o, 0); \ - __b.val[1] = (vectype) __builtin_aarch64_get_dregxidi (__o, 1); \ - __b.val[2] = (vectype) __builtin_aarch64_get_dregxidi (__o, 2); \ - __b.val[3] = (vectype) __builtin_aarch64_get_dregxidi (__o, 3); \ - return 
__b; \ +__extension__ extern __inline uint8x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4_lane_u8 (const uint8_t * __ptr, uint8x8x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + uint8x16x4_t __temp; + __temp.val[0] = vcombine_u8 (__b.val[0], vcreate_u8 (0)); + __temp.val[1] = vcombine_u8 (__b.val[1], vcreate_u8 (0)); + __temp.val[2] = vcombine_u8 (__b.val[2], vcreate_u8 (0)); + __temp.val[3] = vcombine_u8 (__b.val[3], vcreate_u8 (0)); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[3], 3); + __o = __builtin_aarch64_ld4_lanev8qi ( + (__builtin_aarch64_simd_qi *) __ptr, __o, __c); + __b.val[0] = (uint8x8_t) __builtin_aarch64_get_dregxidi (__o, 0); + __b.val[1] = (uint8x8_t) __builtin_aarch64_get_dregxidi (__o, 1); + __b.val[2] = (uint8x8_t) __builtin_aarch64_get_dregxidi (__o, 2); + __b.val[3] = (uint8x8_t) __builtin_aarch64_get_dregxidi (__o, 3); + return __b; } -/* vld4q_lane */ +__extension__ extern __inline uint16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4_lane_u16 (const uint16_t * __ptr, uint16x4x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + uint16x8x4_t __temp; + __temp.val[0] = vcombine_u16 (__b.val[0], vcreate_u16 (0)); + __temp.val[1] = vcombine_u16 (__b.val[1], vcreate_u16 (0)); + __temp.val[2] = vcombine_u16 (__b.val[2], vcreate_u16 (0)); + __temp.val[3] = vcombine_u16 (__b.val[3], vcreate_u16 (0)); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[3], 3); + __o = __builtin_aarch64_ld4_lanev4hi ( + (__builtin_aarch64_simd_hi *) __ptr, __o, __c); + __b.val[0] = (uint16x4_t) __builtin_aarch64_get_dregxidi (__o, 0); + __b.val[1] = (uint16x4_t) __builtin_aarch64_get_dregxidi (__o, 1); + __b.val[2] = (uint16x4_t) __builtin_aarch64_get_dregxidi (__o, 2); + __b.val[3] = (uint16x4_t) __builtin_aarch64_get_dregxidi (__o, 3); + return __b; +} + +__extension__ extern __inline uint32x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4_lane_u32 (const uint32_t * __ptr, uint32x2x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + uint32x4x4_t __temp; + __temp.val[0] = vcombine_u32 (__b.val[0], vcreate_u32 (0)); + __temp.val[1] = vcombine_u32 (__b.val[1], vcreate_u32 (0)); + __temp.val[2] = vcombine_u32 (__b.val[2], vcreate_u32 (0)); + __temp.val[3] = vcombine_u32 (__b.val[3], vcreate_u32 (0)); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[3], 3); + __o = __builtin_aarch64_ld4_lanev2si ( + (__builtin_aarch64_simd_si *) __ptr, __o, __c); + __b.val[0] = (uint32x2_t) __builtin_aarch64_get_dregxidi (__o, 0); + __b.val[1] = (uint32x2_t) __builtin_aarch64_get_dregxidi (__o, 1); + __b.val[2] = (uint32x2_t) __builtin_aarch64_get_dregxidi (__o, 2); + __b.val[3] = (uint32x2_t) __builtin_aarch64_get_dregxidi 
(__o, 3); + return __b; +} + +__extension__ extern __inline uint64x1x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4_lane_u64 (const uint64_t * __ptr, uint64x1x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + uint64x2x4_t __temp; + __temp.val[0] = vcombine_u64 (__b.val[0], vcreate_u64 (0)); + __temp.val[1] = vcombine_u64 (__b.val[1], vcreate_u64 (0)); + __temp.val[2] = vcombine_u64 (__b.val[2], vcreate_u64 (0)); + __temp.val[3] = vcombine_u64 (__b.val[3], vcreate_u64 (0)); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[3], 3); + __o = __builtin_aarch64_ld4_lanedi ( + (__builtin_aarch64_simd_di *) __ptr, __o, __c); + __b.val[0] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 0); + __b.val[1] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 1); + __b.val[2] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 2); + __b.val[3] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 3); + return __b; +} + +__extension__ extern __inline int8x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4_lane_s8 (const int8_t * __ptr, int8x8x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + int8x16x4_t __temp; + __temp.val[0] = vcombine_s8 (__b.val[0], vcreate_s8 (0)); + __temp.val[1] = vcombine_s8 (__b.val[1], vcreate_s8 (0)); + __temp.val[2] = vcombine_s8 (__b.val[2], vcreate_s8 (0)); + __temp.val[3] = vcombine_s8 (__b.val[3], vcreate_s8 (0)); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[3], 3); + __o = __builtin_aarch64_ld4_lanev8qi ( + (__builtin_aarch64_simd_qi *) __ptr, __o, __c); + __b.val[0] = (int8x8_t) __builtin_aarch64_get_dregxidi (__o, 0); + __b.val[1] = (int8x8_t) __builtin_aarch64_get_dregxidi (__o, 1); + __b.val[2] = (int8x8_t) __builtin_aarch64_get_dregxidi (__o, 2); + __b.val[3] = (int8x8_t) __builtin_aarch64_get_dregxidi (__o, 3); + return __b; +} + +__extension__ extern __inline int16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4_lane_s16 (const int16_t * __ptr, int16x4x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + int16x8x4_t __temp; + __temp.val[0] = vcombine_s16 (__b.val[0], vcreate_s16 (0)); + __temp.val[1] = vcombine_s16 (__b.val[1], vcreate_s16 (0)); + __temp.val[2] = vcombine_s16 (__b.val[2], vcreate_s16 (0)); + __temp.val[3] = vcombine_s16 (__b.val[3], vcreate_s16 (0)); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[3], 3); + __o = __builtin_aarch64_ld4_lanev4hi ( + (__builtin_aarch64_simd_hi *) __ptr, __o, __c); + __b.val[0] = (int16x4_t) __builtin_aarch64_get_dregxidi (__o, 0); + __b.val[1] = (int16x4_t) __builtin_aarch64_get_dregxidi (__o, 1); + __b.val[2] = (int16x4_t) __builtin_aarch64_get_dregxidi (__o, 2); + __b.val[3] = (int16x4_t) __builtin_aarch64_get_dregxidi (__o, 3); + 
return __b; +} + +__extension__ extern __inline int32x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4_lane_s32 (const int32_t * __ptr, int32x2x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + int32x4x4_t __temp; + __temp.val[0] = vcombine_s32 (__b.val[0], vcreate_s32 (0)); + __temp.val[1] = vcombine_s32 (__b.val[1], vcreate_s32 (0)); + __temp.val[2] = vcombine_s32 (__b.val[2], vcreate_s32 (0)); + __temp.val[3] = vcombine_s32 (__b.val[3], vcreate_s32 (0)); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[3], 3); + __o = __builtin_aarch64_ld4_lanev2si ( + (__builtin_aarch64_simd_si *) __ptr, __o, __c); + __b.val[0] = (int32x2_t) __builtin_aarch64_get_dregxidi (__o, 0); + __b.val[1] = (int32x2_t) __builtin_aarch64_get_dregxidi (__o, 1); + __b.val[2] = (int32x2_t) __builtin_aarch64_get_dregxidi (__o, 2); + __b.val[3] = (int32x2_t) __builtin_aarch64_get_dregxidi (__o, 3); + return __b; +} + +__extension__ extern __inline int64x1x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4_lane_s64 (const int64_t * __ptr, int64x1x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + int64x2x4_t __temp; + __temp.val[0] = vcombine_s64 (__b.val[0], vcreate_s64 (0)); + __temp.val[1] = vcombine_s64 (__b.val[1], vcreate_s64 (0)); + __temp.val[2] = vcombine_s64 (__b.val[2], vcreate_s64 (0)); + __temp.val[3] = vcombine_s64 (__b.val[3], vcreate_s64 (0)); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[3], 3); + __o = __builtin_aarch64_ld4_lanedi ( + (__builtin_aarch64_simd_di *) __ptr, __o, __c); + __b.val[0] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 0); + __b.val[1] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 1); + __b.val[2] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 2); + __b.val[3] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 3); + return __b; +} + +__extension__ extern __inline float16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4_lane_f16 (const float16_t * __ptr, float16x4x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + float16x8x4_t __temp; + __temp.val[0] = vcombine_f16 (__b.val[0], vcreate_f16 (0)); + __temp.val[1] = vcombine_f16 (__b.val[1], vcreate_f16 (0)); + __temp.val[2] = vcombine_f16 (__b.val[2], vcreate_f16 (0)); + __temp.val[3] = vcombine_f16 (__b.val[3], vcreate_f16 (0)); + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __temp.val[3], 3); + __o = __builtin_aarch64_ld4_lanev4hf ( + (__builtin_aarch64_simd_hf *) __ptr, __o, __c); + __b.val[0] = (float16x4_t) __builtin_aarch64_get_dregxidi (__o, 0); + __b.val[1] = (float16x4_t) __builtin_aarch64_get_dregxidi (__o, 1); + __b.val[2] = (float16x4_t) __builtin_aarch64_get_dregxidi (__o, 2); + __b.val[3] = (float16x4_t) 
__builtin_aarch64_get_dregxidi (__o, 3); + return __b; +} + +__extension__ extern __inline float32x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4_lane_f32 (const float32_t * __ptr, float32x2x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + float32x4x4_t __temp; + __temp.val[0] = vcombine_f32 (__b.val[0], vcreate_f32 (0)); + __temp.val[1] = vcombine_f32 (__b.val[1], vcreate_f32 (0)); + __temp.val[2] = vcombine_f32 (__b.val[2], vcreate_f32 (0)); + __temp.val[3] = vcombine_f32 (__b.val[3], vcreate_f32 (0)); + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __temp.val[3], 3); + __o = __builtin_aarch64_ld4_lanev2si ( + (__builtin_aarch64_simd_sf *) __ptr, __o, __c); + __b.val[0] = (float32x2_t) __builtin_aarch64_get_dregxidi (__o, 0); + __b.val[1] = (float32x2_t) __builtin_aarch64_get_dregxidi (__o, 1); + __b.val[2] = (float32x2_t) __builtin_aarch64_get_dregxidi (__o, 2); + __b.val[3] = (float32x2_t) __builtin_aarch64_get_dregxidi (__o, 3); + return __b; +} + +__extension__ extern __inline float64x1x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4_lane_f64 (const float64_t * __ptr, float64x1x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + float64x2x4_t __temp; + __temp.val[0] = vcombine_f64 (__b.val[0], vcreate_f64 (0)); + __temp.val[1] = vcombine_f64 (__b.val[1], vcreate_f64 (0)); + __temp.val[2] = vcombine_f64 (__b.val[2], vcreate_f64 (0)); + __temp.val[3] = vcombine_f64 (__b.val[3], vcreate_f64 (0)); + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __temp.val[3], 3); + __o = __builtin_aarch64_ld4_lanedf ( + (__builtin_aarch64_simd_df *) __ptr, __o, __c); + __b.val[0] = (float64x1_t) __builtin_aarch64_get_dregxidi (__o, 0); + __b.val[1] = (float64x1_t) __builtin_aarch64_get_dregxidi (__o, 1); + __b.val[2] = (float64x1_t) __builtin_aarch64_get_dregxidi (__o, 2); + __b.val[3] = (float64x1_t) __builtin_aarch64_get_dregxidi (__o, 3); + return __b; +} + +__extension__ extern __inline poly8x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4_lane_p8 (const poly8_t * __ptr, poly8x8x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + poly8x16x4_t __temp; + __temp.val[0] = vcombine_p8 (__b.val[0], vcreate_p8 (0)); + __temp.val[1] = vcombine_p8 (__b.val[1], vcreate_p8 (0)); + __temp.val[2] = vcombine_p8 (__b.val[2], vcreate_p8 (0)); + __temp.val[3] = vcombine_p8 (__b.val[3], vcreate_p8 (0)); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[3], 3); + __o = __builtin_aarch64_ld4_lanev8qi ( + (__builtin_aarch64_simd_qi *) __ptr, __o, __c); + __b.val[0] = (poly8x8_t) __builtin_aarch64_get_dregxidi (__o, 0); + __b.val[1] = (poly8x8_t) __builtin_aarch64_get_dregxidi (__o, 1); + __b.val[2] = (poly8x8_t) __builtin_aarch64_get_dregxidi 
(__o, 2); + __b.val[3] = (poly8x8_t) __builtin_aarch64_get_dregxidi (__o, 3); + return __b; +} -__LD4_LANE_FUNC (float16x4x4_t, float16x4_t, float16x8x4_t, float16_t, v4hf, - v8hf, hf, f16, float16x8_t) -__LD4_LANE_FUNC (float32x2x4_t, float32x2_t, float32x4x4_t, float32_t, v2sf, v4sf, - sf, f32, float32x4_t) -__LD4_LANE_FUNC (float64x1x4_t, float64x1_t, float64x2x4_t, float64_t, df, v2df, - df, f64, float64x2_t) -__LD4_LANE_FUNC (poly8x8x4_t, poly8x8_t, poly8x16x4_t, poly8_t, v8qi, v16qi, qi, p8, - int8x16_t) -__LD4_LANE_FUNC (poly16x4x4_t, poly16x4_t, poly16x8x4_t, poly16_t, v4hi, v8hi, hi, - p16, int16x8_t) -__LD4_LANE_FUNC (poly64x1x4_t, poly64x1_t, poly64x2x4_t, poly64_t, di, - v2di_ssps, di, p64, poly64x2_t) -__LD4_LANE_FUNC (int8x8x4_t, int8x8_t, int8x16x4_t, int8_t, v8qi, v16qi, qi, s8, - int8x16_t) -__LD4_LANE_FUNC (int16x4x4_t, int16x4_t, int16x8x4_t, int16_t, v4hi, v8hi, hi, s16, - int16x8_t) -__LD4_LANE_FUNC (int32x2x4_t, int32x2_t, int32x4x4_t, int32_t, v2si, v4si, si, s32, - int32x4_t) -__LD4_LANE_FUNC (int64x1x4_t, int64x1_t, int64x2x4_t, int64_t, di, v2di, di, s64, - int64x2_t) -__LD4_LANE_FUNC (uint8x8x4_t, uint8x8_t, uint8x16x4_t, uint8_t, v8qi, v16qi, qi, u8, - int8x16_t) -__LD4_LANE_FUNC (uint16x4x4_t, uint16x4_t, uint16x8x4_t, uint16_t, v4hi, v8hi, hi, - u16, int16x8_t) -__LD4_LANE_FUNC (uint32x2x4_t, uint32x2_t, uint32x4x4_t, uint32_t, v2si, v4si, si, - u32, int32x4_t) -__LD4_LANE_FUNC (uint64x1x4_t, uint64x1_t, uint64x2x4_t, uint64_t, di, v2di, di, - u64, int64x2_t) +__extension__ extern __inline poly16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4_lane_p16 (const poly16_t * __ptr, poly16x4x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + poly16x8x4_t __temp; + __temp.val[0] = vcombine_p16 (__b.val[0], vcreate_p16 (0)); + __temp.val[1] = vcombine_p16 (__b.val[1], vcreate_p16 (0)); + __temp.val[2] = vcombine_p16 (__b.val[2], vcreate_p16 (0)); + __temp.val[3] = vcombine_p16 (__b.val[3], vcreate_p16 (0)); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[3], 3); + __o = __builtin_aarch64_ld4_lanev4hi ( + (__builtin_aarch64_simd_hi *) __ptr, __o, __c); + __b.val[0] = (poly16x4_t) __builtin_aarch64_get_dregxidi (__o, 0); + __b.val[1] = (poly16x4_t) __builtin_aarch64_get_dregxidi (__o, 1); + __b.val[2] = (poly16x4_t) __builtin_aarch64_get_dregxidi (__o, 2); + __b.val[3] = (poly16x4_t) __builtin_aarch64_get_dregxidi (__o, 3); + return __b; +} + +__extension__ extern __inline poly64x1x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4_lane_p64 (const poly64_t * __ptr, poly64x1x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + poly64x2x4_t __temp; + __temp.val[0] = vcombine_p64 (__b.val[0], vcreate_p64 (0)); + __temp.val[1] = vcombine_p64 (__b.val[1], vcreate_p64 (0)); + __temp.val[2] = vcombine_p64 (__b.val[2], vcreate_p64 (0)); + __temp.val[3] = vcombine_p64 (__b.val[3], vcreate_p64 (0)); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[3], 3); + __o = __builtin_aarch64_ld4_lanedi ( + 
(__builtin_aarch64_simd_di *) __ptr, __o, __c); + __b.val[0] = (poly64x1_t) __builtin_aarch64_get_dregxidi (__o, 0); + __b.val[1] = (poly64x1_t) __builtin_aarch64_get_dregxidi (__o, 1); + __b.val[2] = (poly64x1_t) __builtin_aarch64_get_dregxidi (__o, 2); + __b.val[3] = (poly64x1_t) __builtin_aarch64_get_dregxidi (__o, 3); + return __b; +} /* vld4q_lane */ -#define __LD4Q_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \ -__extension__ extern __inline intype \ -__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \ -vld4q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ -{ \ - __builtin_aarch64_simd_xi __o; \ - intype ret; \ - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); \ - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); \ - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); \ - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); \ - __o = __builtin_aarch64_ld4_lane##mode ( \ - (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \ - ret.val[0] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 0); \ - ret.val[1] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 1); \ - ret.val[2] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 2); \ - ret.val[3] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 3); \ - return ret; \ -} - -__LD4Q_LANE_FUNC (float16x8x4_t, float16x8_t, float16_t, v8hf, hf, f16) -__LD4Q_LANE_FUNC (float32x4x4_t, float32x4_t, float32_t, v4sf, sf, f32) -__LD4Q_LANE_FUNC (float64x2x4_t, float64x2_t, float64_t, v2df, df, f64) -__LD4Q_LANE_FUNC (poly8x16x4_t, poly8x16_t, poly8_t, v16qi, qi, p8) -__LD4Q_LANE_FUNC (poly16x8x4_t, poly16x8_t, poly16_t, v8hi, hi, p16) -__LD4Q_LANE_FUNC (poly64x2x4_t, poly64x2_t, poly64_t, v2di, di, p64) -__LD4Q_LANE_FUNC (int8x16x4_t, int8x16_t, int8_t, v16qi, qi, s8) -__LD4Q_LANE_FUNC (int16x8x4_t, int16x8_t, int16_t, v8hi, hi, s16) -__LD4Q_LANE_FUNC (int32x4x4_t, int32x4_t, int32_t, v4si, si, s32) -__LD4Q_LANE_FUNC (int64x2x4_t, int64x2_t, int64_t, v2di, di, s64) -__LD4Q_LANE_FUNC (uint8x16x4_t, uint8x16_t, uint8_t, v16qi, qi, u8) -__LD4Q_LANE_FUNC (uint16x8x4_t, uint16x8_t, uint16_t, v8hi, hi, u16) -__LD4Q_LANE_FUNC (uint32x4x4_t, uint32x4_t, uint32_t, v4si, si, u32) -__LD4Q_LANE_FUNC (uint64x2x4_t, uint64x2_t, uint64_t, v2di, di, u64) +__extension__ extern __inline uint8x16x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4q_lane_u8 (const uint8_t * __ptr, uint8x16x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + uint8x16x4_t ret; + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); + __o = __builtin_aarch64_ld4_lanev16qi ( + (__builtin_aarch64_simd_qi *) __ptr, __o, __c); + ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregxiv4si (__o, 0); + ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregxiv4si (__o, 1); + ret.val[2] = (uint8x16_t) __builtin_aarch64_get_qregxiv4si (__o, 2); + ret.val[3] = (uint8x16_t) __builtin_aarch64_get_qregxiv4si (__o, 3); + return ret; +} + +__extension__ extern __inline uint16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4q_lane_u16 (const uint16_t * __ptr, uint16x8x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + uint16x8x4_t ret; + __o = 
__builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); + __o = __builtin_aarch64_ld4_lanev8hi ( + (__builtin_aarch64_simd_hi *) __ptr, __o, __c); + ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregxiv4si (__o, 0); + ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregxiv4si (__o, 1); + ret.val[2] = (uint16x8_t) __builtin_aarch64_get_qregxiv4si (__o, 2); + ret.val[3] = (uint16x8_t) __builtin_aarch64_get_qregxiv4si (__o, 3); + return ret; +} + +__extension__ extern __inline uint32x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4q_lane_u32 (const uint32_t * __ptr, uint32x4x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + uint32x4x4_t ret; + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); + __o = __builtin_aarch64_ld4_lanev4si ( + (__builtin_aarch64_simd_si *) __ptr, __o, __c); + ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 0); + ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 1); + ret.val[2] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 2); + ret.val[3] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 3); + return ret; +} + +__extension__ extern __inline uint64x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4q_lane_u64 (const uint64_t * __ptr, uint64x2x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + uint64x2x4_t ret; + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); + __o = __builtin_aarch64_ld4_lanev2di ( + (__builtin_aarch64_simd_di *) __ptr, __o, __c); + ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregxiv4si (__o, 0); + ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregxiv4si (__o, 1); + ret.val[2] = (uint64x2_t) __builtin_aarch64_get_qregxiv4si (__o, 2); + ret.val[3] = (uint64x2_t) __builtin_aarch64_get_qregxiv4si (__o, 3); + return ret; +} + +__extension__ extern __inline int8x16x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4q_lane_s8 (const int8_t * __ptr, int8x16x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + int8x16x4_t ret; + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); + __o = __builtin_aarch64_ld4_lanev16qi ( + (__builtin_aarch64_simd_qi *) __ptr, __o, __c); + ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregxiv4si (__o, 0); + ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregxiv4si (__o, 1); + ret.val[2] = (int8x16_t) __builtin_aarch64_get_qregxiv4si (__o, 2); + ret.val[3] = (int8x16_t) __builtin_aarch64_get_qregxiv4si (__o, 3); + return ret; +} + +__extension__ extern __inline int16x8x4_t +__attribute__ 
((__always_inline__, __gnu_inline__,__artificial__)) +vld4q_lane_s16 (const int16_t * __ptr, int16x8x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + int16x8x4_t ret; + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); + __o = __builtin_aarch64_ld4_lanev8hi ( + (__builtin_aarch64_simd_hi *) __ptr, __o, __c); + ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregxiv4si (__o, 0); + ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregxiv4si (__o, 1); + ret.val[2] = (int16x8_t) __builtin_aarch64_get_qregxiv4si (__o, 2); + ret.val[3] = (int16x8_t) __builtin_aarch64_get_qregxiv4si (__o, 3); + return ret; +} + +__extension__ extern __inline int32x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4q_lane_s32 (const int32_t * __ptr, int32x4x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + int32x4x4_t ret; + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); + __o = __builtin_aarch64_ld4_lanev4si ( + (__builtin_aarch64_simd_si *) __ptr, __o, __c); + ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 0); + ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 1); + ret.val[2] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 2); + ret.val[3] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 3); + return ret; +} + +__extension__ extern __inline int64x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4q_lane_s64 (const int64_t * __ptr, int64x2x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + int64x2x4_t ret; + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); + __o = __builtin_aarch64_ld4_lanev2di ( + (__builtin_aarch64_simd_di *) __ptr, __o, __c); + ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregxiv4si (__o, 0); + ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregxiv4si (__o, 1); + ret.val[2] = (int64x2_t) __builtin_aarch64_get_qregxiv4si (__o, 2); + ret.val[3] = (int64x2_t) __builtin_aarch64_get_qregxiv4si (__o, 3); + return ret; +} + +__extension__ extern __inline float16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4q_lane_f16 (const float16_t * __ptr, float16x8x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + float16x8x4_t ret; + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); + __o = __builtin_aarch64_ld4_lanev8hf ( + (__builtin_aarch64_simd_hf *) __ptr, __o, __c); + ret.val[0] = (float16x8_t) __builtin_aarch64_get_qregxiv4si (__o, 0); + ret.val[1] = (float16x8_t) __builtin_aarch64_get_qregxiv4si (__o, 1); + ret.val[2] = (float16x8_t) 
__builtin_aarch64_get_qregxiv4si (__o, 2); + ret.val[3] = (float16x8_t) __builtin_aarch64_get_qregxiv4si (__o, 3); + return ret; +} + +__extension__ extern __inline float32x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4q_lane_f32 (const float32_t * __ptr, float32x4x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + float32x4x4_t ret; + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); + __o = __builtin_aarch64_ld4_lanev4sf ( + (__builtin_aarch64_simd_sf *) __ptr, __o, __c); + ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 0); + ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 1); + ret.val[2] = (float32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 2); + ret.val[3] = (float32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 3); + return ret; +} + +__extension__ extern __inline float64x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4q_lane_f64 (const float64_t * __ptr, float64x2x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + float64x2x4_t ret; + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); + __o = __builtin_aarch64_ld4_lanev2df ( + (__builtin_aarch64_simd_di *) __ptr, __o, __c); + ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregxiv4si (__o, 0); + ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregxiv4si (__o, 1); + ret.val[2] = (float64x2_t) __builtin_aarch64_get_qregxiv4si (__o, 2); + ret.val[3] = (float64x2_t) __builtin_aarch64_get_qregxiv4si (__o, 3); + return ret; +} + +__extension__ extern __inline poly8x16x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4q_lane_p8 (const poly8_t * __ptr, poly8x16x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + poly8x16x4_t ret; + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); + __o = __builtin_aarch64_ld4_lanev16qi ( + (__builtin_aarch64_simd_qi *) __ptr, __o, __c); + ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregxiv4si (__o, 0); + ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregxiv4si (__o, 1); + ret.val[2] = (poly8x16_t) __builtin_aarch64_get_qregxiv4si (__o, 2); + ret.val[3] = (poly8x16_t) __builtin_aarch64_get_qregxiv4si (__o, 3); + return ret; +} + +__extension__ extern __inline poly16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4q_lane_p16 (const poly16_t * __ptr, poly16x8x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + poly16x8x4_t ret; + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); + __o = __builtin_aarch64_ld4_lanev8hi ( + 
(__builtin_aarch64_simd_hi *) __ptr, __o, __c); + ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregxiv4si (__o, 0); + ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregxiv4si (__o, 1); + ret.val[2] = (poly16x8_t) __builtin_aarch64_get_qregxiv4si (__o, 2); + ret.val[3] = (poly16x8_t) __builtin_aarch64_get_qregxiv4si (__o, 3); + return ret; +} + +__extension__ extern __inline poly64x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4q_lane_p64 (const poly64_t * __ptr, poly64x2x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + poly64x2x4_t ret; + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); + __o = __builtin_aarch64_ld4_lanev2di ( + (__builtin_aarch64_simd_di *) __ptr, __o, __c); + ret.val[0] = (poly64x2_t) __builtin_aarch64_get_qregxiv4si (__o, 0); + ret.val[1] = (poly64x2_t) __builtin_aarch64_get_qregxiv4si (__o, 1); + ret.val[2] = (poly64x2_t) __builtin_aarch64_get_qregxiv4si (__o, 2); + ret.val[3] = (poly64x2_t) __builtin_aarch64_get_qregxiv4si (__o, 3); + return ret; +} /* vmax */ @@ -35441,9 +35926,47 @@ vld3q_lane_bf16 (const bfloat16_t * __ptr, bfloat16x8x3_t __b, const int __c) return ret; } -__LD4_LANE_FUNC (bfloat16x4x4_t, bfloat16x4_t, bfloat16x8x4_t, bfloat16_t, v4bf, - v8bf, bf, bf16, bfloat16x8_t) -__LD4Q_LANE_FUNC (bfloat16x8x4_t, bfloat16x8_t, bfloat16_t, v8bf, bf, bf16) +__extension__ extern __inline bfloat16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4_lane_bf16 (const bfloat16_t * __ptr, bfloat16x4x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + bfloat16x8x4_t __temp; + __temp.val[0] = vcombine_bf16 (__b.val[0], vcreate_bf16 (0)); + __temp.val[1] = vcombine_bf16 (__b.val[1], vcreate_bf16 (0)); + __temp.val[2] = vcombine_bf16 (__b.val[2], vcreate_bf16 (0)); + __temp.val[3] = vcombine_bf16 (__b.val[3], vcreate_bf16 (0)); + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[3], 3); + __o = __builtin_aarch64_ld4_lanev4bf ( + (__builtin_aarch64_simd_bf *) __ptr, __o, __c); + __b.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregxidi (__o, 0); + __b.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregxidi (__o, 1); + __b.val[2] = (bfloat16x4_t) __builtin_aarch64_get_dregxidi (__o, 2); + __b.val[3] = (bfloat16x4_t) __builtin_aarch64_get_dregxidi (__o, 3); + return __b; +} + +__extension__ extern __inline bfloat16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4q_lane_bf16 (const bfloat16_t * __ptr, bfloat16x8x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + bfloat16x8x4_t ret; + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); + __o = __builtin_aarch64_ld4_lanev8bf ( + (__builtin_aarch64_simd_bf *) __ptr, __o, __c); + ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv4si (__o, 0); + 
ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv4si (__o, 1); + ret.val[2] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv4si (__o, 2); + ret.val[3] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv4si (__o, 3); + return ret; +} __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) @@ -35739,7 +36262,4 @@ vaddq_p128 (poly128_t __a, poly128_t __b) #undef __aarch64_vdupq_laneq_u32 #undef __aarch64_vdupq_laneq_u64 -#undef __LD4_LANE_FUNC -#undef __LD4Q_LANE_FUNC - #endif -- cgit v1.1 From 743b8dd6fd757e997eb060d70fd4ae8e04fb56cd Mon Sep 17 00:00:00 2001 From: Iain Sandoe Date: Sat, 31 Jul 2021 16:29:03 +0100 Subject: Darwin, X86, config: Adjust 'as' command lines [PR100340]. Versions of the assembler using clang from XCode 12.5/12.5.1 have a bug which produces different code layout between debug and non-debug input, leading to a compare fail for default configure parameters. This is a workaround fix to disable the optimisation that is responsible for the bug. Signed-off-by: Iain Sandoe PR target/100340 - Bootstrap fails with Clang 12.0.5 (XCode 12.5) PR target/100340 gcc/ChangeLog: * config.in: Regenerate. * config/i386/darwin.h (EXTRA_ASM_OPTS): New (ASM_SPEC): Pass options to disable branch shortening where needed. * configure: Regenerate. * configure.ac: Detect versions of 'as' that support the optimisation which has the bug. --- gcc/config/i386/darwin.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/darwin.h b/gcc/config/i386/darwin.h index bac3219..73b06e2 100644 --- a/gcc/config/i386/darwin.h +++ b/gcc/config/i386/darwin.h @@ -125,10 +125,18 @@ along with GCC; see the file COPYING3. If not see %{mfentry*:%eDarwin does not support -mfentry or associated options}" \ DARWIN_CC1_SPEC +/* This is a workaround for a tool bug: see PR100340. */ + +#ifdef HAVE_AS_MLLVM_X86_PAD_FOR_ALIGN +#define EXTRA_ASM_OPTS " -mllvm -x86-pad-for-align=false" +#else +#define EXTRA_ASM_OPTS "" +#endif + #undef ASM_SPEC #define ASM_SPEC "-arch %(darwin_arch) \ " ASM_OPTIONS " -force_cpusubtype_ALL \ - %{static}" ASM_MMACOSX_VERSION_MIN_SPEC + %{static}" ASM_MMACOSX_VERSION_MIN_SPEC EXTRA_ASM_OPTS #undef ENDFILE_SPEC #define ENDFILE_SPEC \ -- cgit v1.1 From 2d9da1c89778be1d6604cc1465b0dd50f241a352 Mon Sep 17 00:00:00 2001 From: Iain Sandoe Date: Fri, 13 Aug 2021 20:20:04 +0100 Subject: Darwin: Reset section names table at the end of compile. For a single use (typical compile) this vector will be reclaimed as GGC. For JIT this is not sufficient since it does not reset the pointer to NULL (and thus we think the the vector is already allocated when a context is reused). The clears the vector and sets the pointer to NULL at the end of object output. Signed-off-by: Iain Sandoe gcc/ChangeLog: * config/darwin.c (darwin_file_end): Reset and reclaim the section names table at the end of compile. --- gcc/config/darwin.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/darwin.c b/gcc/config/darwin.c index b160c23..5d1d13c 100644 --- a/gcc/config/darwin.c +++ b/gcc/config/darwin.c @@ -3129,6 +3129,14 @@ darwin_file_end (void) re-arranging data. */ if (!DARWIN_SECTION_ANCHORS || !flag_section_anchors) fprintf (asm_out_file, "\t.subsections_via_symbols\n"); + + /* We rely on this being NULL at the start of compilation; reset it here + so that JIT can reuse a context. 
*/ + if (dwarf_sect_names_table != NULL) + { + dwarf_sect_names_table->truncate (0); + dwarf_sect_names_table = NULL; + } } /* TODO: Add a language hook for identifying if a decl is a vtable. */ -- cgit v1.1 From 97d51c1764554fcef05fe94ee6445f5d2252bcff Mon Sep 17 00:00:00 2001 From: liuhongt Date: Tue, 17 Aug 2021 13:11:26 +0800 Subject: Add x86 tune to enable v2df vector reduction by paddpd. The tune is disabled by default. gcc/ChangeLog: PR target/97147 * config/i386/i386.h (TARGET_V2DF_REDUCTION_PREFER_HADDPD): New macro. * config/i386/sse.md (*sse3_haddv2df3_low): Add TARGET_V2DF_REDUCTION_PREFER_HADDPD. (*sse3_hsubv2df3_low): Ditto. * config/i386/x86-tune.def (X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD): New tune. gcc/testsuite/ChangeLog: PR target/97147 * gcc.target/i386/pr54400.c: Adjust testcase. * gcc.target/i386/pr94147.c: New test. --- gcc/config/i386/i386.h | 2 ++ gcc/config/i386/sse.md | 4 ++-- gcc/config/i386/x86-tune.def | 5 +++++ 3 files changed, 9 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 21fe51b..b3e57a8 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -418,6 +418,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; ix86_tune_features[X86_TUNE_EMIT_VZEROUPPER] #define TARGET_EXPAND_ABS \ ix86_tune_features[X86_TUNE_EXPAND_ABS] +#define TARGET_V2DF_REDUCTION_PREFER_HADDPD \ + ix86_tune_features[X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD] /* Feature tests against the various architecture variations. */ enum ix86_arch_indices { diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 27e25cc..1388968 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -2771,7 +2771,7 @@ (vec_select:DF (match_dup 1) (parallel [(match_operand:SI 3 "const_0_to_1_operand")]))))] - "TARGET_SSE3 + "TARGET_SSE3 && TARGET_V2DF_REDUCTION_PREFER_HADDPD && INTVAL (operands[2]) != INTVAL (operands[3])" "@ haddpd\t{%0, %0|%0, %0} @@ -2790,7 +2790,7 @@ (vec_select:DF (match_dup 1) (parallel [(const_int 1)]))))] - "TARGET_SSE3" + "TARGET_SSE3 && TARGET_V2DF_REDUCTION_PREFER_HADDPD" "@ hsubpd\t{%0, %0|%0, %0} vhsubpd\t{%1, %1, %0|%0, %1, %1}" diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index eb057a6..8f55da8 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -452,6 +452,11 @@ DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER) smaller FMA chain. */ DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | m_ZNVER3) +/* X86_TUNE_V2DF_REDUCTION_PREFER_PHADDPD: Prefer haddpd + for v2df vector reduction. */ +DEF_TUNE (X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD, + "v2df_reduction_prefer_haddpd", m_NONE) + /*****************************************************************************/ /* AVX instruction selection tuning (some of SSE flags affects AVX, too) */ /*****************************************************************************/ -- cgit v1.1 From 640df4ef815aa35fedf1d724ab31d8eed8817f82 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Wed, 18 Aug 2021 09:10:22 +0100 Subject: aarch64: Fix float <-> int errors in vld4[q]_lane intrinsics A previous commit "aarch64: Remove macros for vld4[q]_lane Neon intrinsics" introduced some float <-> int type conversion errors. This patch fixes those errors. gcc/ChangeLog: 2021-08-18 Jonathan Wright * config/aarch64/arm_neon.h (vld3_lane_f64): Use float RTL pattern and type cast. (vld4_lane_f32): Use float RTL pattern. (vld4q_lane_f64): Use float type cast. 
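As a minimal sketch of one affected intrinsic (illustrative only: the wrapper function name is made up, while the vld3_lane_f64 signature is the one defined earlier in this header):

#include <arm_neon.h>

/* Before this fix, vld3_lane_f64 went through the integer (DI-mode)
   lane-load builtin and an integer pointer cast instead of the DF-mode
   ones; the user-visible interface itself is unchanged.  */
float64x1x3_t
load3_lane (const float64_t *ptr, float64x1x3_t acc)
{
  return vld3_lane_f64 (ptr, acc, 0);
}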
--- gcc/config/aarch64/arm_neon.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index d8b2970..635a223 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -20546,8 +20546,8 @@ vld3_lane_f64 (const float64_t * __ptr, float64x1x3_t __b, const int __c) __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __temp.val[0], 0); __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __temp.val[1], 1); __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __temp.val[2], 2); - __o = __builtin_aarch64_ld3_lanedi ( - (__builtin_aarch64_simd_di *) __ptr, __o, __c); + __o = __builtin_aarch64_ld3_lanedf ( + (__builtin_aarch64_simd_df *) __ptr, __o, __c); __b.val[0] = (float64x1_t) __builtin_aarch64_get_dregcidi (__o, 0); __b.val[1] = (float64x1_t) __builtin_aarch64_get_dregcidi (__o, 1); __b.val[2] = (float64x1_t) __builtin_aarch64_get_dregcidi (__o, 2); @@ -21077,7 +21077,7 @@ vld4_lane_f32 (const float32_t * __ptr, float32x2x4_t __b, const int __c) __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __temp.val[1], 1); __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __temp.val[2], 2); __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __temp.val[3], 3); - __o = __builtin_aarch64_ld4_lanev2si ( + __o = __builtin_aarch64_ld4_lanev2sf ( (__builtin_aarch64_simd_sf *) __ptr, __o, __c); __b.val[0] = (float32x2_t) __builtin_aarch64_get_dregxidi (__o, 0); __b.val[1] = (float32x2_t) __builtin_aarch64_get_dregxidi (__o, 1); @@ -21381,7 +21381,7 @@ vld4q_lane_f64 (const float64_t * __ptr, float64x2x4_t __b, const int __c) __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); __o = __builtin_aarch64_ld4_lanev2df ( - (__builtin_aarch64_simd_di *) __ptr, __o, __c); + (__builtin_aarch64_simd_df *) __ptr, __o, __c); ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregxiv4si (__o, 0); ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregxiv4si (__o, 1); ret.val[2] = (float64x2_t) __builtin_aarch64_get_qregxiv4si (__o, 2); -- cgit v1.1 From 15bdae016654f63a36e49a37c9d26282bebb1da9 Mon Sep 17 00:00:00 2001 From: Iain Sandoe Date: Sun, 28 Mar 2021 14:48:17 +0100 Subject: Darwin: Handle the -rpath command line option. This handles the command line '-rpath' option by passing it through to the static linker. Signed-off-by: Iain Sandoe gcc/ChangeLog: * config.gcc: Include rpath.opt for Darwin. * config/darwin.h (DRIVER_SELF_SPECS): Handle -rpath. --- gcc/config/darwin.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/darwin.h b/gcc/config/darwin.h index 20d6b1e..b1be561 100644 --- a/gcc/config/darwin.h +++ b/gcc/config/darwin.h @@ -132,6 +132,7 @@ extern GTY(()) int darwin_ms_struct; "%{gsplit-dwarf:%ngsplit-dwarf is not supported on this platform} \ % Date: Tue, 17 Aug 2021 17:29:06 +0800 Subject: Revert "Add the member integer_to_sse to processor_cost as a cost simulation for movd/pinsrd. It will be used to calculate the cost of vec_construct." This reverts commit 872da9a6f664a06d73c987aa0cb2e5b830158a10. 
PR target/101936 PR target/101929 --- gcc/config/i386/i386.c | 6 +----- gcc/config/i386/i386.h | 1 - gcc/config/i386/x86-tune-costs.h | 26 -------------------------- 3 files changed, 1 insertion(+), 32 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 4d4ab6a..46844fa 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -22203,11 +22203,7 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, case vec_construct: { /* N element inserts into SSE vectors. */ - int cost - = TYPE_VECTOR_SUBPARTS (vectype) * (fp ? - ix86_cost->sse_op - : ix86_cost->integer_to_sse); - + int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op; /* One vinserti128 for combining two SSE vectors for AVX256. */ if (GET_MODE_BITSIZE (mode) == 256) cost += ix86_vec_cost (mode, ix86_cost->addss); diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index b3e57a8..8aba86d 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -165,7 +165,6 @@ struct processor_costs { const int xmm_move, ymm_move, /* cost of moving XMM and YMM register. */ zmm_move; const int sse_to_integer; /* cost of moving SSE register to integer. */ - const int integer_to_sse; /* cost of moving integer to SSE register. */ const int gather_static, gather_per_elt; /* Cost of gather load is computed as static + per_item * nelts. */ const int scatter_static, scatter_per_elt; /* Cost of gather store is diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h index 67cfa00..ffe810f 100644 --- a/gcc/config/i386/x86-tune-costs.h +++ b/gcc/config/i386/x86-tune-costs.h @@ -102,7 +102,6 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */ in 128bit, 256bit and 512bit */ 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */ 3, /* cost of moving SSE register to integer. */ - COSTS_N_BYTES (2), /* cost of moving integer to sse register. */ 5, 0, /* Gather load static, per_elt. */ 5, 0, /* Gather store static, per_elt. */ 0, /* size of l1 cache */ @@ -212,7 +211,6 @@ struct processor_costs i386_cost = { /* 386 specific costs */ {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 3, /* cost of moving SSE register to integer. */ - COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ 4, 4, /* Gather load static, per_elt. */ 4, 4, /* Gather store static, per_elt. */ 0, /* size of l1 cache */ @@ -321,7 +319,6 @@ struct processor_costs i486_cost = { /* 486 specific costs */ {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 3, /* cost of moving SSE register to integer. */ - COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ 4, 4, /* Gather load static, per_elt. */ 4, 4, /* Gather store static, per_elt. */ 4, /* size of l1 cache. 486 has 8kB cache @@ -432,7 +429,6 @@ struct processor_costs pentium_cost = { {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 3, /* cost of moving SSE register to integer. */ - COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ 4, 4, /* Gather load static, per_elt. */ 4, 4, /* Gather store static, per_elt. */ 8, /* size of l1 cache. */ @@ -534,7 +530,6 @@ struct processor_costs lakemont_cost = { {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 3, /* cost of moving SSE register to integer. 
*/ - COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ 4, 4, /* Gather load static, per_elt. */ 4, 4, /* Gather store static, per_elt. */ 8, /* size of l1 cache. */ @@ -651,7 +646,6 @@ struct processor_costs pentiumpro_cost = { {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 3, /* cost of moving SSE register to integer. */ - COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ 4, 4, /* Gather load static, per_elt. */ 4, 4, /* Gather store static, per_elt. */ 8, /* size of l1 cache. */ @@ -759,7 +753,6 @@ struct processor_costs geode_cost = { {2, 2, 8, 16, 32}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 6, /* cost of moving SSE register to integer. */ - COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ 2, 2, /* Gather load static, per_elt. */ 2, 2, /* Gather store static, per_elt. */ 64, /* size of l1 cache. */ @@ -867,7 +860,6 @@ struct processor_costs k6_cost = { {2, 2, 8, 16, 32}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 6, /* cost of moving SSE register to integer. */ - COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ 2, 2, /* Gather load static, per_elt. */ 2, 2, /* Gather store static, per_elt. */ 32, /* size of l1 cache. */ @@ -981,7 +973,6 @@ struct processor_costs athlon_cost = { {4, 4, 10, 10, 20}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 5, /* cost of moving SSE register to integer. */ - COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ 4, 4, /* Gather load static, per_elt. */ 4, 4, /* Gather store static, per_elt. */ 64, /* size of l1 cache. */ @@ -1097,7 +1088,6 @@ struct processor_costs k8_cost = { {4, 4, 10, 10, 20}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 5, /* cost of moving SSE register to integer. */ - COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ 4, 4, /* Gather load static, per_elt. */ 4, 4, /* Gather store static, per_elt. */ 64, /* size of l1 cache. */ @@ -1226,7 +1216,6 @@ struct processor_costs amdfam10_cost = { {4, 4, 5, 10, 20}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 3, /* cost of moving SSE register to integer. */ - COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ 4, 4, /* Gather load static, per_elt. */ 4, 4, /* Gather store static, per_elt. */ 64, /* size of l1 cache. */ @@ -1347,7 +1336,6 @@ const struct processor_costs bdver_cost = { {10, 10, 10, 40, 60}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 16, /* cost of moving SSE register to integer. */ - COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ 12, 12, /* Gather load static, per_elt. */ 10, 10, /* Gather store static, per_elt. */ 16, /* size of l1 cache. */ @@ -1489,7 +1477,6 @@ struct processor_costs znver1_cost = { {8, 8, 8, 16, 32}, /* cost of unaligned stores. */ 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */ 6, /* cost of moving SSE register to integer. */ - COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops, throughput 12. Approx 9 uops do not depend on vector size and every load is 7 uops. */ @@ -1646,7 +1633,6 @@ struct processor_costs znver2_cost = { 2, 2, 3, /* cost of moving XMM,YMM,ZMM register. */ 6, /* cost of moving SSE register to integer. 
*/ - COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops, throughput 12. Approx 9 uops do not depend on vector size and every load is 7 uops. */ @@ -1779,7 +1765,6 @@ struct processor_costs znver3_cost = { 2, 2, 3, /* cost of moving XMM,YMM,ZMM register. */ 6, /* cost of moving SSE register to integer. */ - COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ /* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops, throughput 9. Approx 7 uops do not depend on vector size and every load is 4 uops. */ @@ -1924,7 +1909,6 @@ struct processor_costs skylake_cost = { {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ 6, /* cost of moving SSE register to integer. */ - COSTS_N_INSNS (2)+1, /* cost of moving integer to sse register. */ 20, 8, /* Gather load static, per_elt. */ 22, 10, /* Gather store static, per_elt. */ 64, /* size of l1 cache. */ @@ -2051,7 +2035,6 @@ struct processor_costs icelake_cost = { {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ 6, /* cost of moving SSE register to integer. */ - COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ 20, 8, /* Gather load static, per_elt. */ 22, 10, /* Gather store static, per_elt. */ 64, /* size of l1 cache. */ @@ -2165,7 +2148,6 @@ const struct processor_costs btver1_cost = { {10, 10, 12, 48, 96}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 14, /* cost of moving SSE register to integer. */ - COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ 10, 10, /* Gather load static, per_elt. */ 10, 10, /* Gather store static, per_elt. */ 32, /* size of l1 cache. */ @@ -2276,7 +2258,6 @@ const struct processor_costs btver2_cost = { {10, 10, 12, 48, 96}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 14, /* cost of moving SSE register to integer. */ - COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ 10, 10, /* Gather load static, per_elt. */ 10, 10, /* Gather store static, per_elt. */ 32, /* size of l1 cache. */ @@ -2386,7 +2367,6 @@ struct processor_costs pentium4_cost = { {32, 32, 32, 64, 128}, /* cost of unaligned stores. */ 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */ 20, /* cost of moving SSE register to integer. */ - COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ 16, 16, /* Gather load static, per_elt. */ 16, 16, /* Gather store static, per_elt. */ 8, /* size of l1 cache. */ @@ -2499,7 +2479,6 @@ struct processor_costs nocona_cost = { {24, 24, 24, 48, 96}, /* cost of unaligned stores. */ 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */ 20, /* cost of moving SSE register to integer. */ - COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ 12, 12, /* Gather load static, per_elt. */ 12, 12, /* Gather store static, per_elt. */ 8, /* size of l1 cache. */ @@ -2610,7 +2589,6 @@ struct processor_costs atom_cost = { {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 8, /* cost of moving SSE register to integer. */ - COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ 8, 8, /* Gather load static, per_elt. */ 8, 8, /* Gather store static, per_elt. */ 32, /* size of l1 cache. */ @@ -2721,7 +2699,6 @@ struct processor_costs slm_cost = { {16, 16, 16, 32, 64}, /* cost of unaligned stores. 
*/ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 8, /* cost of moving SSE register to integer. */ - COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ 8, 8, /* Gather load static, per_elt. */ 8, 8, /* Gather store static, per_elt. */ 32, /* size of l1 cache. */ @@ -2832,7 +2809,6 @@ struct processor_costs intel_cost = { {10, 10, 10, 10, 10}, /* cost of unaligned loads. */ 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */ 4, /* cost of moving SSE register to integer. */ - COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ 6, 6, /* Gather load static, per_elt. */ 6, 6, /* Gather store static, per_elt. */ 32, /* size of l1 cache. */ @@ -2950,7 +2926,6 @@ struct processor_costs generic_cost = { {6, 6, 6, 10, 15}, /* cost of unaligned storess. */ 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */ 6, /* cost of moving SSE register to integer. */ - COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ 18, 6, /* Gather load static, per_elt. */ 18, 6, /* Gather store static, per_elt. */ 32, /* size of l1 cache. */ @@ -3074,7 +3049,6 @@ struct processor_costs core_cost = { {6, 6, 6, 6, 12}, /* cost of unaligned stores. */ 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ 2, /* cost of moving SSE register to integer. */ - COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops, rec. throughput 6. So 5 uops statically and one uops per load. */ -- cgit v1.1 From 18e9e7db7afb8635316414b560c10852db13c4c1 Mon Sep 17 00:00:00 2001 From: Jeff Law Date: Thu, 19 Aug 2021 14:15:03 -0400 Subject: Drop stabs from h8/300 and v850 ports gcc/ * config.gcc (h8300-*-elf*): Do not include dbxelf.h. (h8300-*-linux*, v850-*-rtems*, v850*-elf*): Likewise. * config/v850/v850.h (DEFAULT_GDB_EXTENSIONS): Remove. --- gcc/config/v850/v850.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/v850/v850.h b/gcc/config/v850/v850.h index 386f9f5..5162268 100644 --- a/gcc/config/v850/v850.h +++ b/gcc/config/v850/v850.h @@ -694,9 +694,6 @@ typedef enum if ((LOG) != 0) \ fprintf (FILE, "\t.align %d\n", (LOG)) -/* We don't have to worry about dbx compatibility for the v850. */ -#define DEFAULT_GDB_EXTENSIONS 1 - /* Use dwarf2 debugging info by default. */ #undef PREFERRED_DEBUGGING_TYPE #define PREFERRED_DEBUGGING_TYPE DWARF2_DEBUG -- cgit v1.1 From b0963c4379becafaebd8e52b0b42698ff151c293 Mon Sep 17 00:00:00 2001 From: Peter Bergner Date: Thu, 19 Aug 2021 17:33:29 -0500 Subject: rs6000: Fix ICE expanding lxvp and stxvp gimple built-ins [PR101849] PR101849 shows we ICE on a test case when we pass a non __vector_pair * pointer to the __builtin_vsx_lxvp and __builtin_vsx_stxvp built-ins that is cast to __vector_pair *. The problem is that when we expand the built-in, the cast has already been removed from gimple and we are only given the base pointer. The solution used here (which fixes the ICE) is to catch this case and convert the pointer to a __vector_pair * pointer when expanding the built-in. 2021-08-19 Peter Bergner gcc/ PR target/101849 * config/rs6000/rs6000-call.c (rs6000_gimple_fold_mma_builtin): Cast pointer to __vector_pair *. gcc/testsuite/ PR target/101849 * gcc.target/powerpc/pr101849.c: New test. 
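For illustration, a hand-reduced sketch of the failing pattern (not the committed testcase; the argument order, offset first and pointer second, matches the folding code below, while the exact built-in prototypes are assumed):

/* The pointer handed to the built-ins is a double *, cast to
   __vector_pair * at the call site.  By the time the built-in is
   expanded from gimple the cast has been stripped and only the
   double * base pointer remains, which previously triggered the ICE.
   Requires a Power10/MMA-enabled compilation.  */
void
copy_pair (double *from, double *to)
{
  __vector_pair p = __builtin_vsx_lxvp (0, (__vector_pair *) from);
  __builtin_vsx_stxvp (p, 0, (__vector_pair *) to);
}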
--- gcc/config/rs6000/rs6000-call.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c index 8b16d65..3c3108a 100644 --- a/gcc/config/rs6000/rs6000-call.c +++ b/gcc/config/rs6000/rs6000-call.c @@ -11919,6 +11919,9 @@ rs6000_gimple_fold_mma_builtin (gimple_stmt_iterator *gsi) tree offset = gimple_call_arg (stmt, 0); tree ptr = gimple_call_arg (stmt, 1); tree lhs = gimple_call_lhs (stmt); + if (TREE_TYPE (TREE_TYPE (ptr)) != vector_pair_type_node) + ptr = build1 (VIEW_CONVERT_EXPR, + build_pointer_type (vector_pair_type_node), ptr); tree mem = build_simple_mem_ref (build2 (POINTER_PLUS_EXPR, TREE_TYPE (ptr), ptr, offset)); gimplify_assign (lhs, mem, &new_seq); @@ -11932,6 +11935,9 @@ rs6000_gimple_fold_mma_builtin (gimple_stmt_iterator *gsi) tree src = gimple_call_arg (stmt, 0); tree offset = gimple_call_arg (stmt, 1); tree ptr = gimple_call_arg (stmt, 2); + if (TREE_TYPE (TREE_TYPE (ptr)) != vector_pair_type_node) + ptr = build1 (VIEW_CONVERT_EXPR, + build_pointer_type (vector_pair_type_node), ptr); tree mem = build_simple_mem_ref (build2 (POINTER_PLUS_EXPR, TREE_TYPE (ptr), ptr, offset)); gimplify_assign (mem, src, &new_seq); -- cgit v1.1 From d2883be3c8e7b5fd17925ea67b99b7330e1a4f72 Mon Sep 17 00:00:00 2001 From: Michael Meissner Date: Fri, 20 Aug 2021 00:37:49 -0400 Subject: Move xx* builtins to vsx.md. I noticed that the xx built-in functions (xxspltiw, xxspltidp, xxsplti32dx, xxeval, xxblend, and xxpermx) were all defined in altivec.md. However, since the XX instructions can take both traditional floating point and Altivec registers, these built-in functions should be in vsx.md. This patch just moves the insns from altivec.md to vsx.md. I also moved the VM3 mode iterator and VM3_char mode attribute from altivec.md to vsx.md, since the only use of these were for the XXBLEND insns. 2021-08-20 Michael Meissner gcc/ * config/rs6000/altivec.md (UNSPEC_XXEVAL): Move to vsx.md. (UNSPEC_XXSPLTIW): Move to vsx.md. (UNSPEC_XXSPLTID): Move to vsx.md. (UNSPEC_XXSPLTI32DX): Move to vsx.md. (UNSPEC_XXBLEND): Move to vsx.md. (UNSPEC_XXPERMX): Move to vsx.md. (VM3): Move to vsx.md. (VM3_char): Move to vsx.md. (xxspltiw_v4si): Move to vsx.md. (xxspltiw_v4sf): Move to vsx.md. (xxspltiw_v4sf_inst): Move to vsx.md. (xxspltidp_v2df): Move to vsx.md. (xxspltidp_v2df_inst): Move to vsx.md. (xxsplti32dx_v4si_inst): Move to vsx.md. (xxsplti32dx_v4sf): Move to vsx.md. (xxsplti32dx_v4sf_inst): Move to vsx.md. (xxblend_): Move to vsx.md. (xxpermx): Move to vsx.md. (xxpermx_inst): Move to vsx.md. * config/rs6000/vsx.md (UNSPEC_XXEVAL): Move from altivec.md. (UNSPEC_XXSPLTIW): Move from altivec.md. (UNSPEC_XXSPLTID): Move from altivec.md. (UNSPEC_XXSPLTI32DX): Move from altivec.md. (UNSPEC_XXBLEND): Move from altivec.md. (UNSPEC_XXPERMX): Move from altivec.md. (VM3): Move from altivec.md. (VM3_char): Move from altivec.md. (xxspltiw_v4si): Move from altivec.md. (xxspltiw_v4sf): Move from altivec.md. (xxspltiw_v4sf_inst): Move from altivec.md. (xxspltidp_v2df): Move from altivec.md. (xxspltidp_v2df_inst): Move from altivec.md. (xxsplti32dx_v4si_inst): Move from altivec.md. (xxsplti32dx_v4sf): Move from altivec.md. (xxsplti32dx_v4sf_inst): Move from altivec.md. (xxblend_): Move from altivec.md. (xxpermx): Move from altivec.md. (xxpermx_inst): Move from altivec.md. 
--- gcc/config/rs6000/altivec.md | 197 ----------------------------------------- gcc/config/rs6000/vsx.md | 206 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 206 insertions(+), 197 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index fd86c300..2c73dde 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -175,16 +175,10 @@ UNSPEC_VPEXTD UNSPEC_VCLRLB UNSPEC_VCLRRB - UNSPEC_XXEVAL UNSPEC_VSTRIR UNSPEC_VSTRIL UNSPEC_SLDB UNSPEC_SRDB - UNSPEC_XXSPLTIW - UNSPEC_XXSPLTID - UNSPEC_XXSPLTI32DX - UNSPEC_XXBLEND - UNSPEC_XXPERMX ]) (define_c_enum "unspecv" @@ -225,21 +219,6 @@ (KF "FLOAT128_VECTOR_P (KFmode)") (TF "FLOAT128_VECTOR_P (TFmode)")]) -;; Like VM2, just do char, short, int, long, float and double -(define_mode_iterator VM3 [V4SI - V8HI - V16QI - V4SF - V2DF - V2DI]) - -(define_mode_attr VM3_char [(V2DI "d") - (V4SI "w") - (V8HI "h") - (V16QI "b") - (V2DF "d") - (V4SF "w")]) - ;; Map the Vector convert single precision to double precision for integer ;; versus floating point (define_mode_attr VS_sxwsp [(V4SI "sxw") (V4SF "sp")]) @@ -859,170 +838,6 @@ "vsdbi %0,%1,%2,%3" [(set_attr "type" "vecsimple")]) -(define_insn "xxspltiw_v4si" - [(set (match_operand:V4SI 0 "register_operand" "=wa") - (unspec:V4SI [(match_operand:SI 1 "s32bit_cint_operand" "n")] - UNSPEC_XXSPLTIW))] - "TARGET_POWER10" - "xxspltiw %x0,%1" - [(set_attr "type" "vecsimple") - (set_attr "prefixed" "yes")]) - -(define_expand "xxspltiw_v4sf" - [(set (match_operand:V4SF 0 "register_operand" "=wa") - (unspec:V4SF [(match_operand:SF 1 "const_double_operand" "n")] - UNSPEC_XXSPLTIW))] - "TARGET_POWER10" -{ - long value = rs6000_const_f32_to_i32 (operands[1]); - emit_insn (gen_xxspltiw_v4sf_inst (operands[0], GEN_INT (value))); - DONE; -}) - -(define_insn "xxspltiw_v4sf_inst" - [(set (match_operand:V4SF 0 "register_operand" "=wa") - (unspec:V4SF [(match_operand:SI 1 "c32bit_cint_operand" "n")] - UNSPEC_XXSPLTIW))] - "TARGET_POWER10" - "xxspltiw %x0,%1" - [(set_attr "type" "vecsimple") - (set_attr "prefixed" "yes")]) - -(define_expand "xxspltidp_v2df" - [(set (match_operand:V2DF 0 "register_operand" ) - (unspec:V2DF [(match_operand:SF 1 "const_double_operand")] - UNSPEC_XXSPLTID))] - "TARGET_POWER10" -{ - long value = rs6000_const_f32_to_i32 (operands[1]); - rs6000_emit_xxspltidp_v2df (operands[0], value); - DONE; -}) - -(define_insn "xxspltidp_v2df_inst" - [(set (match_operand:V2DF 0 "register_operand" "=wa") - (unspec:V2DF [(match_operand:SI 1 "c32bit_cint_operand" "n")] - UNSPEC_XXSPLTID))] - "TARGET_POWER10" - "xxspltidp %x0,%1" - [(set_attr "type" "vecsimple") - (set_attr "prefixed" "yes")]) - -(define_expand "xxsplti32dx_v4si" - [(set (match_operand:V4SI 0 "register_operand" "=wa") - (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "0") - (match_operand:QI 2 "u1bit_cint_operand" "n") - (match_operand:SI 3 "s32bit_cint_operand" "n")] - UNSPEC_XXSPLTI32DX))] - "TARGET_POWER10" -{ - int index = INTVAL (operands[2]); - - if (!BYTES_BIG_ENDIAN) - index = 1 - index; - - emit_insn (gen_xxsplti32dx_v4si_inst (operands[0], operands[1], - GEN_INT (index), operands[3])); - DONE; -} - [(set_attr "type" "vecsimple")]) - -(define_insn "xxsplti32dx_v4si_inst" - [(set (match_operand:V4SI 0 "register_operand" "=wa") - (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "0") - (match_operand:QI 2 "u1bit_cint_operand" "n") - (match_operand:SI 3 "s32bit_cint_operand" "n")] - UNSPEC_XXSPLTI32DX))] - "TARGET_POWER10" - "xxsplti32dx 
%x0,%2,%3" - [(set_attr "type" "vecsimple") - (set_attr "prefixed" "yes")]) - -(define_expand "xxsplti32dx_v4sf" - [(set (match_operand:V4SF 0 "register_operand" "=wa") - (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "0") - (match_operand:QI 2 "u1bit_cint_operand" "n") - (match_operand:SF 3 "const_double_operand" "n")] - UNSPEC_XXSPLTI32DX))] - "TARGET_POWER10" -{ - int index = INTVAL (operands[2]); - long value = rs6000_const_f32_to_i32 (operands[3]); - if (!BYTES_BIG_ENDIAN) - index = 1 - index; - - emit_insn (gen_xxsplti32dx_v4sf_inst (operands[0], operands[1], - GEN_INT (index), GEN_INT (value))); - DONE; -}) - -(define_insn "xxsplti32dx_v4sf_inst" - [(set (match_operand:V4SF 0 "register_operand" "=wa") - (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "0") - (match_operand:QI 2 "u1bit_cint_operand" "n") - (match_operand:SI 3 "s32bit_cint_operand" "n")] - UNSPEC_XXSPLTI32DX))] - "TARGET_POWER10" - "xxsplti32dx %x0,%2,%3" - [(set_attr "type" "vecsimple") - (set_attr "prefixed" "yes")]) - -(define_insn "xxblend_" - [(set (match_operand:VM3 0 "register_operand" "=wa") - (unspec:VM3 [(match_operand:VM3 1 "register_operand" "wa") - (match_operand:VM3 2 "register_operand" "wa") - (match_operand:VM3 3 "register_operand" "wa")] - UNSPEC_XXBLEND))] - "TARGET_POWER10" - "xxblendv %x0,%x1,%x2,%x3" - [(set_attr "type" "vecsimple") - (set_attr "prefixed" "yes")]) - -(define_expand "xxpermx" - [(set (match_operand:V2DI 0 "register_operand" "+wa") - (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "wa") - (match_operand:V2DI 2 "register_operand" "wa") - (match_operand:V16QI 3 "register_operand" "wa") - (match_operand:QI 4 "u8bit_cint_operand" "n")] - UNSPEC_XXPERMX))] - "TARGET_POWER10" -{ - if (BYTES_BIG_ENDIAN) - emit_insn (gen_xxpermx_inst (operands[0], operands[1], - operands[2], operands[3], - operands[4])); - else - { - /* Reverse value of byte element indexes by XORing with 0xFF. - Reverse the 32-byte section identifier match by subracting bits [0:2] - of elemet from 7. 
*/ - int value = INTVAL (operands[4]); - rtx vreg = gen_reg_rtx (V16QImode); - - emit_insn (gen_xxspltib_v16qi (vreg, GEN_INT (-1))); - emit_insn (gen_xorv16qi3 (operands[3], operands[3], vreg)); - value = 7 - value; - emit_insn (gen_xxpermx_inst (operands[0], operands[2], - operands[1], operands[3], - GEN_INT (value))); - } - - DONE; -} - [(set_attr "type" "vecsimple")]) - -(define_insn "xxpermx_inst" - [(set (match_operand:V2DI 0 "register_operand" "+v") - (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "v") - (match_operand:V2DI 2 "register_operand" "v") - (match_operand:V16QI 3 "register_operand" "v") - (match_operand:QI 4 "u3bit_cint_operand" "n")] - UNSPEC_XXPERMX))] - "TARGET_POWER10" - "xxpermx %x0,%x1,%x2,%x3,%4" - [(set_attr "type" "vecsimple") - (set_attr "prefixed" "yes")]) - (define_expand "vstrir_" [(set (match_operand:VIshort 0 "altivec_register_operand") (unspec:VIshort [(match_operand:VIshort 1 "altivec_register_operand")] @@ -3873,18 +3688,6 @@ [(set_attr "type" "vecperm") (set_attr "isa" "p9v,*")]) -(define_insn "xxeval" - [(set (match_operand:V2DI 0 "register_operand" "=wa") - (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "wa") - (match_operand:V2DI 2 "register_operand" "wa") - (match_operand:V2DI 3 "register_operand" "wa") - (match_operand:QI 4 "u8bit_cint_operand" "n")] - UNSPEC_XXEVAL))] - "TARGET_POWER10" - "xxeval %0,%1,%2,%3,%4" - [(set_attr "type" "vecsimple") - (set_attr "prefixed" "yes")]) - (define_expand "vec_unpacku_hi_v16qi" [(set (match_operand:V8HI 0 "register_operand" "=v") (unspec:V8HI [(match_operand:V16QI 1 "register_operand" "v")] diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md index 441735d..e4ca6e9 100644 --- a/gcc/config/rs6000/vsx.md +++ b/gcc/config/rs6000/vsx.md @@ -372,6 +372,12 @@ UNSPEC_REPLACE_UN UNSPEC_VDIVES UNSPEC_VDIVEU + UNSPEC_XXEVAL + UNSPEC_XXSPLTIW + UNSPEC_XXSPLTID + UNSPEC_XXSPLTI32DX + UNSPEC_XXBLEND + UNSPEC_XXPERMX ]) (define_int_iterator XVCVBF16 [UNSPEC_VSX_XVCVSPBF16 @@ -392,6 +398,22 @@ (define_mode_attr REPLACE_ELT_max [(V4SI "12") (V4SF "12") (V2DI "8") (V2DF "8")]) +;; Like VM2 in altivec.md, just do char, short, int, long, float and double +(define_mode_iterator VM3 [V4SI + V8HI + V16QI + V4SF + V2DF + V2DI]) + +(define_mode_attr VM3_char [(V2DI "d") + (V4SI "w") + (V8HI "h") + (V16QI "b") + (V2DF "d") + (V4SF "w")]) + + ;; VSX moves ;; The patterns for LE permuted loads and stores come before the general @@ -6383,3 +6405,187 @@ "TARGET_POWER10" "vmulld %0,%1,%2" [(set_attr "type" "veccomplex")]) + + +;; XXSPLTIW built-in function support +(define_insn "xxspltiw_v4si" + [(set (match_operand:V4SI 0 "register_operand" "=wa") + (unspec:V4SI [(match_operand:SI 1 "s32bit_cint_operand" "n")] + UNSPEC_XXSPLTIW))] + "TARGET_POWER10" + "xxspltiw %x0,%1" + [(set_attr "type" "vecsimple") + (set_attr "prefixed" "yes")]) + +(define_expand "xxspltiw_v4sf" + [(set (match_operand:V4SF 0 "register_operand" "=wa") + (unspec:V4SF [(match_operand:SF 1 "const_double_operand" "n")] + UNSPEC_XXSPLTIW))] + "TARGET_POWER10" +{ + long value = rs6000_const_f32_to_i32 (operands[1]); + emit_insn (gen_xxspltiw_v4sf_inst (operands[0], GEN_INT (value))); + DONE; +}) + +(define_insn "xxspltiw_v4sf_inst" + [(set (match_operand:V4SF 0 "register_operand" "=wa") + (unspec:V4SF [(match_operand:SI 1 "c32bit_cint_operand" "n")] + UNSPEC_XXSPLTIW))] + "TARGET_POWER10" + "xxspltiw %x0,%1" + [(set_attr "type" "vecsimple") + (set_attr "prefixed" "yes")]) + +;; XXSPLTIDP built-in function support +(define_expand "xxspltidp_v2df" + 
[(set (match_operand:V2DF 0 "register_operand" ) + (unspec:V2DF [(match_operand:SF 1 "const_double_operand")] + UNSPEC_XXSPLTID))] + "TARGET_POWER10" +{ + long value = rs6000_const_f32_to_i32 (operands[1]); + rs6000_emit_xxspltidp_v2df (operands[0], value); + DONE; +}) + +(define_insn "xxspltidp_v2df_inst" + [(set (match_operand:V2DF 0 "register_operand" "=wa") + (unspec:V2DF [(match_operand:SI 1 "c32bit_cint_operand" "n")] + UNSPEC_XXSPLTID))] + "TARGET_POWER10" + "xxspltidp %x0,%1" + [(set_attr "type" "vecsimple") + (set_attr "prefixed" "yes")]) + +;; XXSPLTI32DX built-in function support +(define_expand "xxsplti32dx_v4si" + [(set (match_operand:V4SI 0 "register_operand" "=wa") + (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "0") + (match_operand:QI 2 "u1bit_cint_operand" "n") + (match_operand:SI 3 "s32bit_cint_operand" "n")] + UNSPEC_XXSPLTI32DX))] + "TARGET_POWER10" +{ + int index = INTVAL (operands[2]); + + if (!BYTES_BIG_ENDIAN) + index = 1 - index; + + emit_insn (gen_xxsplti32dx_v4si_inst (operands[0], operands[1], + GEN_INT (index), operands[3])); + DONE; +} + [(set_attr "type" "vecsimple")]) + +(define_insn "xxsplti32dx_v4si_inst" + [(set (match_operand:V4SI 0 "register_operand" "=wa") + (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "0") + (match_operand:QI 2 "u1bit_cint_operand" "n") + (match_operand:SI 3 "s32bit_cint_operand" "n")] + UNSPEC_XXSPLTI32DX))] + "TARGET_POWER10" + "xxsplti32dx %x0,%2,%3" + [(set_attr "type" "vecsimple") + (set_attr "prefixed" "yes")]) + +(define_expand "xxsplti32dx_v4sf" + [(set (match_operand:V4SF 0 "register_operand" "=wa") + (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "0") + (match_operand:QI 2 "u1bit_cint_operand" "n") + (match_operand:SF 3 "const_double_operand" "n")] + UNSPEC_XXSPLTI32DX))] + "TARGET_POWER10" +{ + int index = INTVAL (operands[2]); + long value = rs6000_const_f32_to_i32 (operands[3]); + if (!BYTES_BIG_ENDIAN) + index = 1 - index; + + emit_insn (gen_xxsplti32dx_v4sf_inst (operands[0], operands[1], + GEN_INT (index), GEN_INT (value))); + DONE; +}) + +(define_insn "xxsplti32dx_v4sf_inst" + [(set (match_operand:V4SF 0 "register_operand" "=wa") + (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "0") + (match_operand:QI 2 "u1bit_cint_operand" "n") + (match_operand:SI 3 "s32bit_cint_operand" "n")] + UNSPEC_XXSPLTI32DX))] + "TARGET_POWER10" + "xxsplti32dx %x0,%2,%3" + [(set_attr "type" "vecsimple") + (set_attr "prefixed" "yes")]) + +;; XXBLEND built-in function support +(define_insn "xxblend_" + [(set (match_operand:VM3 0 "register_operand" "=wa") + (unspec:VM3 [(match_operand:VM3 1 "register_operand" "wa") + (match_operand:VM3 2 "register_operand" "wa") + (match_operand:VM3 3 "register_operand" "wa")] + UNSPEC_XXBLEND))] + "TARGET_POWER10" + "xxblendv %x0,%x1,%x2,%x3" + [(set_attr "type" "vecsimple") + (set_attr "prefixed" "yes")]) + +;; XXPERMX built-in function support +(define_expand "xxpermx" + [(set (match_operand:V2DI 0 "register_operand" "+wa") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "wa") + (match_operand:V2DI 2 "register_operand" "wa") + (match_operand:V16QI 3 "register_operand" "wa") + (match_operand:QI 4 "u8bit_cint_operand" "n")] + UNSPEC_XXPERMX))] + "TARGET_POWER10" +{ + if (BYTES_BIG_ENDIAN) + emit_insn (gen_xxpermx_inst (operands[0], operands[1], + operands[2], operands[3], + operands[4])); + else + { + /* Reverse value of byte element indexes by XORing with 0xFF. + Reverse the 32-byte section identifier match by subracting bits [0:2] + of elemet from 7. 
*/ + int value = INTVAL (operands[4]); + rtx vreg = gen_reg_rtx (V16QImode); + + emit_insn (gen_xxspltib_v16qi (vreg, GEN_INT (-1))); + emit_insn (gen_xorv16qi3 (operands[3], operands[3], vreg)); + value = 7 - value; + emit_insn (gen_xxpermx_inst (operands[0], operands[2], + operands[1], operands[3], + GEN_INT (value))); + } + + DONE; +} + [(set_attr "type" "vecsimple")]) + +(define_insn "xxpermx_inst" + [(set (match_operand:V2DI 0 "register_operand" "+v") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "v") + (match_operand:V2DI 2 "register_operand" "v") + (match_operand:V16QI 3 "register_operand" "v") + (match_operand:QI 4 "u3bit_cint_operand" "n")] + UNSPEC_XXPERMX))] + "TARGET_POWER10" + "xxpermx %x0,%x1,%x2,%x3,%4" + [(set_attr "type" "vecsimple") + (set_attr "prefixed" "yes")]) + +;; XXEVAL built-in function support +(define_insn "xxeval" + [(set (match_operand:V2DI 0 "register_operand" "=wa") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "wa") + (match_operand:V2DI 2 "register_operand" "wa") + (match_operand:V2DI 3 "register_operand" "wa") + (match_operand:QI 4 "u8bit_cint_operand" "n")] + UNSPEC_XXEVAL))] + "TARGET_POWER10" + "xxeval %0,%1,%2,%3,%4" + [(set_attr "type" "vecsimple") + (set_attr "prefixed" "yes")]) + -- cgit v1.1 From 5f80c6270de6ac79d819de50048b32351a6b97c3 Mon Sep 17 00:00:00 2001 From: Jeff Law Date: Fri, 20 Aug 2021 11:19:05 -0400 Subject: Further improvements to constant shifts for the H8 gcc/ * config/h8300/h8300.c (shift_alg_hi): Improve arithmetic shift right by 15 bits for H8/300H and H8/S. Improve logical shifts by 12 bits for H8/S. (shift_alg_si): Improve arithmetic right shift by 28-30 bits for H8/300H. Improve arithmetic shift right by 15 bits for H8/S. Improve logical shifts by 27 bits for H8/S. (get_shift_alg): Corresponding changes. (h8300_option_override): Revert to loops for -Os when profitable. 
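For illustration, a hypothetical example of a shift the new table entries cover (assuming 32-bit long on these targets): on the H8/S an arithmetic right shift of a 32-bit value by 15 can now be emitted as a short special-case sequence rather than a shift loop.

  long
  asr15 (long x)
  {
    return x >> 15;
  }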
--- gcc/config/h8300/h8300.c | 38 ++++++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/h8300/h8300.c b/gcc/config/h8300/h8300.c index 0c4e508..8ccacec 100644 --- a/gcc/config/h8300/h8300.c +++ b/gcc/config/h8300/h8300.c @@ -213,9 +213,9 @@ static enum shift_alg shift_alg_hi[2][3][16] = { /* 0 1 2 3 4 5 6 7 */ /* 8 9 10 11 12 13 14 15 */ { INL, INL, INL, INL, INL, INL, INL, INL, - SPC, SPC, SPC, SPC, SPC, ROT, ROT, ROT }, /* SHIFT_ASHIFT */ + SPC, SPC, SPC, SPC, ROT, ROT, ROT, ROT }, /* SHIFT_ASHIFT */ { INL, INL, INL, INL, INL, INL, INL, INL, - SPC, SPC, SPC, SPC, SPC, ROT, ROT, ROT }, /* SHIFT_LSHIFTRT */ + SPC, SPC, SPC, SPC, ROT, ROT, ROT, ROT }, /* SHIFT_LSHIFTRT */ { INL, INL, INL, INL, INL, INL, INL, INL, SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC }, /* SHIFT_ASHIFTRT */ } @@ -237,9 +237,9 @@ static enum shift_alg shift_alg_si[2][3][32] = { SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC }, /* SHIFT_LSHIFTRT */ { INL, INL, INL, INL, INL, INL, INL, LOP, - SPC, LOP, LOP, LOP, LOP, LOP, LOP, LOP, + SPC, LOP, LOP, LOP, LOP, LOP, LOP, SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC, - SPC, SPC, SPC, SPC, LOP, LOP, LOP, SPC }, /* SHIFT_ASHIFTRT */ + SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC }, /* SHIFT_ASHIFTRT */ }, { /* TARGET_H8300S */ @@ -256,7 +256,7 @@ static enum shift_alg shift_alg_si[2][3][32] = { SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC }, /* SHIFT_LSHIFTRT */ { INL, INL, INL, INL, INL, INL, INL, INL, - INL, INL, INL, INL, INL, INL, INL, LOP, + INL, INL, INL, INL, INL, INL, INL, SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC, SPC }, /* SHIFT_ASHIFTRT */ } @@ -372,6 +372,9 @@ h8300_option_override (void) shift_alg_si[H8_300H][SHIFT_ASHIFTRT][25] = SHIFT_LOOP; shift_alg_si[H8_300H][SHIFT_ASHIFTRT][26] = SHIFT_LOOP; shift_alg_si[H8_300H][SHIFT_ASHIFTRT][27] = SHIFT_LOOP; + shift_alg_si[H8_300H][SHIFT_ASHIFTRT][28] = SHIFT_LOOP; + shift_alg_si[H8_300H][SHIFT_ASHIFTRT][29] = SHIFT_LOOP; + shift_alg_si[H8_300H][SHIFT_ASHIFTRT][30] = SHIFT_LOOP; /* H8S */ shift_alg_hi[H8_S][SHIFT_ASHIFTRT][14] = SHIFT_LOOP; @@ -3830,6 +3833,10 @@ get_shift_alg (enum shift_type shift_type, enum shift_mode shift_mode, } else if (count == 15) { + /* The basic idea here is to use the shift-by-16 idiom to make things + small and efficient. Of course, that loses one bit that we need, + so we stuff the bit into C, shift by 16, then rotate the bit + back in. 
*/ switch (shift_type) { case SHIFT_ASHIFT: @@ -3841,7 +3848,9 @@ get_shift_alg (enum shift_type shift_type, enum shift_mode shift_mode, info->cc_special = OLD_CC_SET_ZNV; goto end; case SHIFT_ASHIFTRT: - gcc_unreachable (); + info->special = "shll.w\t%f0\n\tmov.w\t%e0,%f0\n\texts.l\t%S0\n\trotxl.l\t%S0"; + info->cc_special = OLD_CC_SET_ZNV; + goto end; } } else if (count >= 16 && count <= 23) @@ -3863,6 +3872,23 @@ get_shift_alg (enum shift_type shift_type, enum shift_mode shift_mode, goto end; } } + else if (TARGET_H8300S && count == 27) + { + switch (shift_type) + { + case SHIFT_ASHIFT: + info->special = "sub.w\t%e0,%e0\n\trotr.l\t#2,%S0\n\trotr.l\t#2,%S0\n\trotr.l\t%S0\n\tsub.w\t%f0,%f0"; + goto end; + case SHIFT_LSHIFTRT: + info->special = "sub.w\t%f0,%f0\n\trotl.l\t#2,%S0\n\trotl.l\t#2,%S0\n\trotl.l\t%S0\n\textu.l\t%S0"; + goto end; + case SHIFT_ASHIFTRT: + info->remainder = count - 24; + info->special = "mov.w\t%e0,%f0\n\tmov.b\t%t0,%s0\n\texts.w\t%f0\n\texts.l\t%S0"; + info->cc_special = OLD_CC_SET_ZNV; + goto end; + } + } else if (count >= 24 && count <= 27) { info->remainder = count - 24; -- cgit v1.1 From 5aae6fd9f4bd61030c79762b9474f52e8fa00dd8 Mon Sep 17 00:00:00 2001 From: John David Anglin Date: Sat, 21 Aug 2021 17:25:13 +0000 Subject: Don't warn when alignment of global common data exceeds maximum alignment. 2021-08-21 John David Anglin gcc/ChangeLog: * config/pa/pa.c (pa_asm_output_aligned_common): Remove warning. --- gcc/config/pa/pa.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/pa/pa.c b/gcc/config/pa/pa.c index 46194ba..0614302 100644 --- a/gcc/config/pa/pa.c +++ b/gcc/config/pa/pa.c @@ -9080,9 +9080,7 @@ pa_asm_output_aligned_common (FILE *stream, max_common_align = TARGET_64BIT ? 128 : (size >= 4096 ? 256 : 64); if (align > max_common_align) { - warning (0, "alignment (%u) for %s exceeds maximum alignment " - "for global common data. Using %u", - align / BITS_PER_UNIT, name, max_common_align / BITS_PER_UNIT); + /* Alignment exceeds maximum alignment for global common data. */ align = max_common_align; } -- cgit v1.1 From 304ec0d1d9f177e059e695fbe11d93f99f6f14e0 Mon Sep 17 00:00:00 2001 From: Dragan Mladjenovic Date: Tue, 24 Jul 2018 20:05:08 +0200 Subject: [MIPS] Remove TARGET_ASM_FUNCTION_RODATA_SECTION Since 'Remove obsolete IRIX 6.5 support' [1] we only use gp-relative jump-tables for PIC code. We can fall back to default behaviour for asm_function_rodata_section. [1] https://gcc.gnu.org/ml/libstdc++/2012-03/msg00067.html 2018-06-04 Dragan Mladjenovic gcc/ * config/mips/mips.c (mips_function_rodata_section, TARGET_ASM_FUNCTION_RODATA_SECTION): Removed. --- gcc/config/mips/mips.c | 38 -------------------------------------- 1 file changed, 38 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c index 89d1be6..39666d6 100644 --- a/gcc/config/mips/mips.c +++ b/gcc/config/mips/mips.c @@ -9306,42 +9306,6 @@ mips_select_rtx_section (machine_mode mode, rtx x, return default_elf_select_rtx_section (mode, x, align); } -/* Implement TARGET_ASM_FUNCTION_RODATA_SECTION. - - The complication here is that, with the combination TARGET_ABICALLS - && !TARGET_ABSOLUTE_ABICALLS && !TARGET_GPWORD, jump tables will use - absolute addresses, and should therefore not be included in the - read-only part of a DSO. Handle such cases by selecting a normal - data section instead of a read-only one. The logic apes that in - default_function_rodata_section. 
*/ - -static section * -mips_function_rodata_section (tree decl, bool) -{ - if (!TARGET_ABICALLS || TARGET_ABSOLUTE_ABICALLS || TARGET_GPWORD) - return default_function_rodata_section (decl, false); - - if (decl && DECL_SECTION_NAME (decl)) - { - const char *name = DECL_SECTION_NAME (decl); - if (DECL_COMDAT_GROUP (decl) && startswith (name, ".gnu.linkonce.t.")) - { - char *rname = ASTRDUP (name); - rname[14] = 'd'; - return get_section (rname, SECTION_LINKONCE | SECTION_WRITE, decl); - } - else if (flag_function_sections - && flag_data_sections - && startswith (name, ".text.")) - { - char *rname = ASTRDUP (name); - memcpy (rname + 1, "data", 4); - return get_section (rname, SECTION_WRITE, decl); - } - } - return data_section; -} - /* Implement TARGET_IN_SMALL_DATA_P. */ static bool @@ -22606,8 +22570,6 @@ mips_asm_file_end (void) #define TARGET_ASM_FUNCTION_EPILOGUE mips_output_function_epilogue #undef TARGET_ASM_SELECT_RTX_SECTION #define TARGET_ASM_SELECT_RTX_SECTION mips_select_rtx_section -#undef TARGET_ASM_FUNCTION_RODATA_SECTION -#define TARGET_ASM_FUNCTION_RODATA_SECTION mips_function_rodata_section #undef TARGET_SCHED_INIT #define TARGET_SCHED_INIT mips_sched_init -- cgit v1.1 From f93f0868919ab32bfbc24adb40158298031a4d58 Mon Sep 17 00:00:00 2001 From: Xi Ruoyao Date: Fri, 20 Aug 2021 22:52:57 +0800 Subject: mips: msa: truncate immediate shift amount [PR101922] When -mloongson-mmi is enabled, SHIFT_COUNT_TRUNCATED is turned off. This causes untruncated immediate shift amount outputed into the asm, and the GNU assembler refuses to assemble it. Truncate immediate shift amount when outputing the asm instruction to make GAS happy again. gcc/ PR target/101922 * config/mips/mips-protos.h (mips_msa_output_shift_immediate): Declare. * config/mips/mips.c (mips_msa_output_shift_immediate): New function. * config/mips/mips-msa.md (vashl3, vashr3, vlshr3): Call it. gcc/testsuite/ PR target/101922 * gcc.target/mips/pr101922.c: New test. 
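For example (illustrative numbers, not taken from the new test): for a V16QI shift whose constant amount reaches the pattern as 9, the element width is 8 bits, so the helper added below masks the amount to 9 & 7 = 1 before printing srai.b/srli.b/slli.b; if the masked amount is 0 it prints nothing at all, rather than handing GAS an out-of-range immediate.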
--- gcc/config/mips/mips-msa.md | 27 ++++++++++++++++++--------- gcc/config/mips/mips-protos.h | 1 + gcc/config/mips/mips.c | 21 +++++++++++++++++++++ 3 files changed, 40 insertions(+), 9 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/mips/mips-msa.md b/gcc/config/mips/mips-msa.md index 3a67f25..d3b27d1 100644 --- a/gcc/config/mips/mips-msa.md +++ b/gcc/config/mips/mips-msa.md @@ -870,9 +870,12 @@ (match_operand:IMSA 1 "register_operand" "f,f") (match_operand:IMSA 2 "reg_or_vector_same_uimm6_operand" "f,Uuv6")))] "ISA_HAS_MSA" - "@ - srl.\t%w0,%w1,%w2 - srli.\t%w0,%w1,%E2" +{ + if (which_alternative == 0) + return "srl.\t%w0,%w1,%w2"; + + return mips_msa_output_shift_immediate("srli.\t%w0,%w1,%E2", operands); +} [(set_attr "type" "simd_shift") (set_attr "mode" "")]) @@ -882,9 +885,12 @@ (match_operand:IMSA 1 "register_operand" "f,f") (match_operand:IMSA 2 "reg_or_vector_same_uimm6_operand" "f,Uuv6")))] "ISA_HAS_MSA" - "@ - sra.\t%w0,%w1,%w2 - srai.\t%w0,%w1,%E2" +{ + if (which_alternative == 0) + return "sra.\t%w0,%w1,%w2"; + + return mips_msa_output_shift_immediate("srai.\t%w0,%w1,%E2", operands); +} [(set_attr "type" "simd_shift") (set_attr "mode" "")]) @@ -894,9 +900,12 @@ (match_operand:IMSA 1 "register_operand" "f,f") (match_operand:IMSA 2 "reg_or_vector_same_uimm6_operand" "f,Uuv6")))] "ISA_HAS_MSA" - "@ - sll.\t%w0,%w1,%w2 - slli.\t%w0,%w1,%E2" +{ + if (which_alternative == 0) + return "sll.\t%w0,%w1,%w2"; + + return mips_msa_output_shift_immediate("slli.\t%w0,%w1,%E2", operands); +} [(set_attr "type" "simd_shift") (set_attr "mode" "")]) diff --git a/gcc/config/mips/mips-protos.h b/gcc/config/mips/mips-protos.h index a5e4151..8d97eb3 100644 --- a/gcc/config/mips/mips-protos.h +++ b/gcc/config/mips/mips-protos.h @@ -317,6 +317,7 @@ extern const char *mips_output_sync_loop (rtx_insn *, rtx *); extern unsigned int mips_sync_loop_insns (rtx_insn *, rtx *); extern const char *mips_output_division (const char *, rtx *); extern const char *mips_msa_output_division (const char *, rtx *); +extern const char *mips_msa_output_shift_immediate (const char *, rtx *); extern const char *mips_output_probe_stack_range (rtx, rtx); extern bool mips_hard_regno_rename_ok (unsigned int, unsigned int); extern bool mips_linked_madd_p (rtx_insn *, rtx_insn *); diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c index 39666d6..2f7ffe8 100644 --- a/gcc/config/mips/mips.c +++ b/gcc/config/mips/mips.c @@ -14459,6 +14459,27 @@ mips_msa_output_division (const char *division, rtx *operands) } return s; } + +/* Return the assembly code for MSA immediate shift instructions, + which has the operands given by OPERANDS. Truncate the shift amount + to make GAS happy. */ + +const char * +mips_msa_output_shift_immediate (const char *shift, rtx *operands) +{ + rtx amount = operands[2]; + machine_mode mode = amount->mode; + + unsigned val = UINTVAL (CONST_VECTOR_ELT (amount, 0)); + val &= GET_MODE_UNIT_BITSIZE (mode) - 1; + if (!val) + return ""; + + rtx c = gen_int_mode (val, GET_MODE_INNER (mode)); + operands[2] = gen_const_vec_duplicate (mode, c); + + return shift; +} /* Return true if destination of IN_INSN is used as add source in OUT_INSN. Both IN_INSN and OUT_INSN are of type fmadd. Example: -- cgit v1.1 From 63f68535550f77998ad907ba3d3be05a4324fbe8 Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Thu, 15 Jul 2021 12:57:18 +0000 Subject: arm: Fix multilib mapping for CDE extensions [PR100856] This is a followup to Srinath's recent patch: the newly added test is failing e.g. 
on arm-linux-gnueabihf without R/M profile multilibs. It is also failing on arm-eabi with R/M profile multilibs if the execution engine does not support v8.1-M instructions. The patch avoids this by adding check_effective_target_FUNC_multilib in target-supports.exp which effectively checks whether the target supports linking and execution, like what is already done for other ARM effective targets. pr100856.c is updated to use it instead of arm_v8_1m_main_cde_mve_ok (which makes the testcase a bit of a duplicate with check_effective_target_FUNC_multilib). In addition, I noticed that requiring MVE does not seem necessary and this enables the test to pass even when targeting a CPU without MVE: since the test does not involve actual CDE instructions, it can pass on other architecture versions. For instance, when requiring MVE, we have to use cortex-m55 under QEMU for the test to pass because the memset() that comes from v8.1-m.main+mve multilib uses LOB instructions (DLS) (memset is used during startup). Keeping arm_v8_1m_main_cde_mve_ok would mean we would enable the test provided we have the right multilibs, causing a runtime error if the simulator does not support LOB instructions (e.g. when targeting cortex-m7). I do not update sourcebuild.texi since the CDE effective targets are already collectively documented. Finally, the patch fixes two typos in comments. 2021-07-15 Christophe Lyon PR target/100856 gcc/ * config/arm/arm.opt: Fix typo. * config/arm/t-rmprofile: Fix typo. gcc/testsuite/ * gcc.target/arm/acle/pr100856.c: Use arm_v8m_main_cde_multilib and arm_v8m_main_cde. * lib/target-supports.exp: Add check_effective_target_FUNC_multilib for ARM CDE. --- gcc/config/arm/arm.opt | 2 +- gcc/config/arm/t-rmprofile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm.opt b/gcc/config/arm/arm.opt index af478a9..7417b55 100644 --- a/gcc/config/arm/arm.opt +++ b/gcc/config/arm/arm.opt @@ -82,7 +82,7 @@ EnumValue Enum(arm_arch) String(native) Value(-1) DriverOnly ; Set to the name of target architecture which is required for -; multilib linking. This option is undocumented becuase it +; multilib linking. This option is undocumented because it ; should not be used by the users. mlibarch= Target RejectNegative JoinedOrMissing NoDWARFRecord DriverOnly Undocumented diff --git a/gcc/config/arm/t-rmprofile b/gcc/config/arm/t-rmprofile index 3e75fcc..a6036bf 100644 --- a/gcc/config/arm/t-rmprofile +++ b/gcc/config/arm/t-rmprofile @@ -54,7 +54,7 @@ MULTILIB_REQUIRED += mthumb/march=armv8.1-m.main+mve/mfloat-abi=hard MULTILIB_MATCHES += march?armv6s-m=march?armv6-m # For all MULITIB_MATCHES for v8-m and above add mlibarch? on the right hand side -# of = in the variant string instead of march?. This is needed becuase all the +# of = in the variant string instead of march?. This is needed because all the # MULITIB_MATCHES variant strings are compared with mlibarch option for multilib # linking. -- cgit v1.1 From ac3bcc813f26e6dc4c3b037d9d5c5a84f7f62cf4 Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Mon, 23 Aug 2021 09:47:14 +0000 Subject: arm: Fix __arm_vctp16q return type in arm_mve.h __arm_vctp16q actually returns mve_pred16_t rather than int64_t. 2021-08-23 Christophe Lyon gcc/ * config/arm/arm_mve.h: Fix __arm_vctp16q return type. 
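For reference, a usage sketch (hypothetical code, assuming an MVE-enabled target such as -march=armv8.1-m.main+mve): with the corrected prototype the result feeds a predicated intrinsic directly, with no implied narrowing from int64_t.

  #include <arm_mve.h>

  uint16x8_t
  add_first_n (uint16x8_t a, uint16x8_t b, uint32_t n)
  {
    /* Predicate covering the first n lanes.  */
    mve_pred16_t p = __arm_vctp16q (n);
    return __arm_vaddq_m_u16 (a, a, b, p);
  }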
--- gcc/config/arm/arm_mve.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h index 83f1003..e04d462 100644 --- a/gcc/config/arm/arm_mve.h +++ b/gcc/config/arm/arm_mve.h @@ -3524,7 +3524,7 @@ __arm_vaddlvq_u32 (uint32x4_t __a) return __builtin_mve_vaddlvq_uv4si (__a); } -__extension__ extern __inline int64_t +__extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vctp16q (uint32_t __a) { -- cgit v1.1 From 70c7ab5c487f392e04907ce8f22eb454b8d3c4ff Mon Sep 17 00:00:00 2001 From: liuhongt Date: Mon, 23 Aug 2021 17:00:36 +0800 Subject: Fix ICE. gcc/ChangeLog: PR target/102016 * config/i386/sse.md (*avx512f_pshufb_truncv8hiv8qi_1): Add TARGET_AVX512BW to condition. gcc/testsuite/ChangeLog: PR target/102016 * gcc.target/i386/pr102016.c: New test. --- gcc/config/i386/sse.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 1388968..95f9582 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -11256,7 +11256,7 @@ (match_operand:V16QI 2 "pshufb_truncv8hiv8qi_operand")] UNSPEC_PSHUFB) 0) (parallel [(const_int 0)])))] - "TARGET_AVX512VL && ix86_pre_reload_split ()" + "TARGET_AVX512VL && TARGET_AVX512BW && ix86_pre_reload_split ()" "#" "&& 1" [(const_int 0)] -- cgit v1.1 From fedadb60b6fc6425387faf4d514b4e8b0e24180e Mon Sep 17 00:00:00 2001 From: Jeff Law Date: Mon, 23 Aug 2021 10:33:35 -0400 Subject: Add tailcall/sibcall support to the H8 gcc/ * config/h8300/h8300-protos.h (h8300_expand_epilogue): Add new argument. * config/h8300/jumpcall.md (call, call_value): Restrict to !SIBLING_CALL_P cases. (subcall, sibcall_value): New patterns & expanders. * config/h8300/proepi.md (epilogue): Pass new argument to h8300_expand_epilogue. (sibcall_epilogue): New expander. * config/h8300/h8300.c (h8300_expand_epilogue): Handle sibcall epilogues too. (h8300_ok_for_sibcall_p): New function. (TARGET_FUNCTION_OK_FOR_SIBCALL): define. --- gcc/config/h8300/h8300-protos.h | 2 +- gcc/config/h8300/h8300.c | 29 ++++++++++++++-- gcc/config/h8300/jumpcall.md | 74 +++++++++++++++++++++++++++++++++++++++-- gcc/config/h8300/proepi.md | 10 +++++- 4 files changed, 108 insertions(+), 7 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/h8300/h8300-protos.h b/gcc/config/h8300/h8300-protos.h index 744337d..3d34401 100644 --- a/gcc/config/h8300/h8300-protos.h +++ b/gcc/config/h8300/h8300-protos.h @@ -94,7 +94,7 @@ extern int h8300_tiny_data_p (tree); extern int h8300_can_use_return_insn_p (void); extern void h8300_expand_prologue (void); -extern void h8300_expand_epilogue (void); +extern void h8300_expand_epilogue (bool); extern int h8300_current_function_interrupt_function_p (void); extern int h8300_current_function_monitor_function_p (void); extern int h8300_initial_elimination_offset (int, int); diff --git a/gcc/config/h8300/h8300.c b/gcc/config/h8300/h8300.c index 8ccacec..5f7251a 100644 --- a/gcc/config/h8300/h8300.c +++ b/gcc/config/h8300/h8300.c @@ -874,7 +874,7 @@ h8300_can_use_return_insn_p (void) /* Generate RTL code for the function epilogue. */ void -h8300_expand_epilogue (void) +h8300_expand_epilogue (bool sibcall_p) { int regno; int saved_regs; @@ -919,6 +919,7 @@ h8300_expand_epilogue (void) /* See if this pop would be the last insn before the return. If so, use rte/l or rts/l instead of pop or ldm.l. 
*/ if (TARGET_H8300SX + && !sibcall_p && !frame_pointer_needed && frame_size == 0 && (saved_regs & ((1 << (regno - n_regs + 1)) - 1)) == 0) @@ -931,12 +932,12 @@ h8300_expand_epilogue (void) /* Pop frame pointer if we had one. */ if (frame_pointer_needed) { - if (TARGET_H8300SX) + if (TARGET_H8300SX && !sibcall_p) returned_p = true; h8300_push_pop (HARD_FRAME_POINTER_REGNUM, 1, true, returned_p); } - if (!returned_p) + if (!returned_p && !sibcall_p) emit_jump_insn (ret_rtx); } @@ -5533,6 +5534,25 @@ h8300_push_rounding (poly_int64 bytes) { return ((bytes + PARM_BOUNDARY / 8 - 1) & (-PARM_BOUNDARY / 8)); } + +static bool +h8300_ok_for_sibcall_p (tree fndecl, tree) +{ + /* If either the caller or target are special, then assume sibling + calls are not OK. */ + if (!fndecl + || h8300_os_task_function_p (fndecl) + || h8300_monitor_function_p (fndecl) + || h8300_interrupt_function_p (fndecl) + || h8300_saveall_function_p (fndecl) + || h8300_os_task_function_p (current_function_decl) + || h8300_monitor_function_p (current_function_decl) + || h8300_interrupt_function_p (current_function_decl) + || h8300_saveall_function_p (current_function_decl)) + return false; + + return 1; +} /* Initialize the GCC target structure. */ #undef TARGET_ATTRIBUTE_TABLE @@ -5628,4 +5648,7 @@ h8300_push_rounding (poly_int64 bytes) #undef TARGET_FLAGS_REGNUM #define TARGET_FLAGS_REGNUM 12 +#undef TARGET_FUNCTION_OK_FOR_SIBCALL +#define TARGET_FUNCTION_OK_FOR_SIBCALL h8300_ok_for_sibcall_p + struct gcc_target targetm = TARGET_INITIALIZER; diff --git a/gcc/config/h8300/jumpcall.md b/gcc/config/h8300/jumpcall.md index 3e59fee..b596399 100644 --- a/gcc/config/h8300/jumpcall.md +++ b/gcc/config/h8300/jumpcall.md @@ -290,7 +290,7 @@ (define_insn "call_insn_" [(call (mem:QI (match_operand 0 "call_insn_operand" "Cr")) (match_operand:P 1 "general_operand" "g"))] - "" + "!SIBLING_CALL_P (insn)" { rtx xoperands[1]; xoperands[0] = gen_rtx_MEM (QImode, operands[0]); @@ -328,7 +328,7 @@ [(set (match_operand 0 "" "=r") (call (mem:QI (match_operand 1 "call_insn_operand" "Cr")) (match_operand:P 2 "general_operand" "g")))] - "" + "!SIBLING_CALL_P (insn)" { rtx xoperands[2]; gcc_assert (GET_MODE (operands[1]) == Pmode); @@ -347,3 +347,73 @@ (const_int 2) (const_int 4)))]) +(define_expand "sibcall" + [(call (match_operand:QI 0 "call_expander_operand" "") + (match_operand 1 "general_operand" ""))] + "" + { + if (!register_operand (XEXP (operands[0], 0), Pmode) + && GET_CODE (XEXP (operands[0], 0)) != SYMBOL_REF) + XEXP (operands[0], 0) = force_reg (Pmode, XEXP (operands[0], 0)); + }) + +(define_insn "sibcall_insn_" + [(call (mem:QI (match_operand 0 "call_insn_operand" "Cr")) + (match_operand:P 1 "general_operand" "g"))] + "SIBLING_CALL_P (insn)" +{ + rtx xoperands[1]; + xoperands[0] = gen_rtx_MEM (QImode, operands[0]); + gcc_assert (GET_MODE (operands[0]) == Pmode); + if (GET_CODE (XEXP (xoperands[0], 0)) == SYMBOL_REF + && (SYMBOL_REF_FLAGS (XEXP (xoperands[0], 0)) & SYMBOL_FLAG_FUNCVEC_FUNCTION)) + output_asm_insn ("jmp\\t@%0:8", xoperands); + else + output_asm_insn ("jmp\\t%0", xoperands); + return ""; +} + [(set_attr "type" "call") + (set (attr "length") + (if_then_else (match_operand:QI 0 "small_call_insn_operand" "") + (const_int 2) + (const_int 4)))]) + +;; Call subroutine, returning value in operand 0 +;; (which must be a hard register). + +;; ??? Even though we use HImode here, this works on the H8/300H and H8S. 
+ +(define_expand "sibcall_value" + [(set (match_operand 0 "" "") + (call (match_operand:QI 1 "call_expander_operand" "") + (match_operand 2 "general_operand" "")))] + "" + { + if (!register_operand (XEXP (operands[1], 0), Pmode) + && GET_CODE (XEXP (operands[1], 0)) != SYMBOL_REF) + XEXP (operands[1], 0) = force_reg (Pmode, XEXP (operands[1], 0)); + }) + +(define_insn "sibcall_value_insn_" + [(set (match_operand 0 "" "=r") + (call (mem:QI (match_operand 1 "call_insn_operand" "Cr")) + (match_operand:P 2 "general_operand" "g")))] + "SIBLING_CALL_P (insn)" +{ + rtx xoperands[2]; + gcc_assert (GET_MODE (operands[1]) == Pmode); + xoperands[0] = operands[0]; + xoperands[1] = gen_rtx_MEM (QImode, operands[1]); + if (GET_CODE (XEXP (xoperands[1], 0)) == SYMBOL_REF + && (SYMBOL_REF_FLAGS (XEXP (xoperands[1], 0)) & SYMBOL_FLAG_FUNCVEC_FUNCTION)) + output_asm_insn ("jmp\\t@%1:8", xoperands); + else + output_asm_insn ("jmp\\t%1", xoperands); + return ""; +} + [(set_attr "type" "call") + (set (attr "length") + (if_then_else (match_operand:QI 0 "small_call_insn_operand" "") + (const_int 2) + (const_int 4)))]) + diff --git a/gcc/config/h8300/proepi.md b/gcc/config/h8300/proepi.md index 44d5968..ab58d02 100644 --- a/gcc/config/h8300/proepi.md +++ b/gcc/config/h8300/proepi.md @@ -98,7 +98,7 @@ [(return)] "" { - h8300_expand_epilogue (); + h8300_expand_epilogue (false); DONE; }) @@ -121,3 +121,11 @@ gcc_unreachable (); } [(set_attr "length" "20")]) + +(define_expand "sibcall_epilogue" + [(const_int 0)] + "" + { + h8300_expand_epilogue (true); + DONE; + }) -- cgit v1.1 From bb75b22aba254e8ff144db27b1c8b4804bad73bb Mon Sep 17 00:00:00 2001 From: Thomas Schwinge Date: Mon, 2 Aug 2021 17:38:05 +0200 Subject: Allow matching Intel MIC in OpenMP 'declare variant' ..., and use that to improve XFAILing for Intel MIC offloading execution instead of compilation in 'libgomp.c-c++-common/target-45.c', 'libgomp.fortran/target10.f90'. gcc/ * config/i386/i386-options.c (ix86_omp_device_kind_arch_isa) [ACCEL_COMPILER]: Match "intel_mic". * config/i386/t-omp-device (omp-device-properties-i386) : Add "intel_mic". libgomp/ * testsuite/lib/libgomp.exp (check_effective_target_offload_target_intelmic): Remove 'proc'. (check_effective_target_offload_device_intel_mic): New 'proc'. * testsuite/libgomp.c-c++-common/on_device_arch.h (device_arch_intel_mic, on_device_arch_intel_mic): New. * testsuite/libgomp.c-c++-common/target-45.c: Use that for 'dg-xfail-run-if'. * testsuite/libgomp.fortran/target10.f90: Likewise. 
--- gcc/config/i386/i386-options.c | 4 ++++ gcc/config/i386/t-omp-device | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c index 6b78998..fee5a48 100644 --- a/gcc/config/i386/i386-options.c +++ b/gcc/config/i386/i386-options.c @@ -304,6 +304,10 @@ ix86_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait, case omp_device_kind: return strcmp (name, "cpu") == 0; case omp_device_arch: +#ifdef ACCEL_COMPILER + if (strcmp (name, "intel_mic") == 0) + return 1; +#endif if (strcmp (name, "x86") == 0) return 1; if (TARGET_64BIT) diff --git a/gcc/config/i386/t-omp-device b/gcc/config/i386/t-omp-device index 037ae5e..29350a1 100644 --- a/gcc/config/i386/t-omp-device +++ b/gcc/config/i386/t-omp-device @@ -1,6 +1,6 @@ omp-device-properties-i386: $(srcdir)/config/i386/i386-options.c echo kind: cpu > $@ - echo arch: x86 x86_64 i386 i486 i586 i686 ia32 >> $@ + echo arch: intel_mic x86 x86_64 i386 i486 i586 i686 ia32 >> $@ echo isa: sse4 `sed -n '/^static struct ix86_target_opts isa2\?_opts\[\] =/,/^};/p' \ $(srcdir)/config/i386/i386-options.c | \ sed -n 's/",.*$$//;s/^ { "-m//p'` >> $@ -- cgit v1.1 From 30c335ac44ecb4f17645925360177618763d7c48 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Thu, 19 Aug 2021 16:07:55 -0500 Subject: rs6000: Avoid buffer overruns 2021-08-19 Bill Schmidt gcc/ PR target/101830 * config/rs6000/rs6000-gen-builtins.c (consume_whitespace): Diagnose buffer overrun. (safe_inc_pos): Fix overrun detection. (match_identifier): Diagnose buffer overrun. (match_integer): Likewise. (match_to_right_bracket): Likewise. --- gcc/config/rs6000/rs6000-gen-builtins.c | 34 +++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-gen-builtins.c b/gcc/config/rs6000/rs6000-gen-builtins.c index e5d3b71..05b2d29 100644 --- a/gcc/config/rs6000/rs6000-gen-builtins.c +++ b/gcc/config/rs6000/rs6000-gen-builtins.c @@ -597,6 +597,13 @@ consume_whitespace (void) { while (pos < LINELEN && isspace(linebuf[pos]) && linebuf[pos] != '\n') pos++; + + if (pos >= LINELEN) + { + diag ("line length overrun at %d.\n", pos); + exit (1); + } + return; } @@ -623,7 +630,7 @@ advance_line (FILE *file) static inline void safe_inc_pos (void) { - if (pos++ >= LINELEN) + if (++pos >= LINELEN) { (*diag) ("line length overrun.\n"); exit (1); @@ -636,9 +643,16 @@ static char * match_identifier (void) { int lastpos = pos - 1; - while (isalnum (linebuf[lastpos + 1]) || linebuf[lastpos + 1] == '_') + while (lastpos < LINELEN - 1 + && (isalnum (linebuf[lastpos + 1]) || linebuf[lastpos + 1] == '_')) ++lastpos; + if (lastpos >= LINELEN - 1) + { + diag ("line length overrun at %d.\n", lastpos); + exit (1); + } + if (lastpos < pos) return 0; @@ -660,9 +674,15 @@ match_integer (void) safe_inc_pos (); int lastpos = pos - 1; - while (isdigit (linebuf[lastpos + 1])) + while (lastpos < LINELEN - 1 && isdigit (linebuf[lastpos + 1])) ++lastpos; + if (lastpos >= LINELEN - 1) + { + diag ("line length overrun at %d.\n", lastpos); + exit (1); + } + if (lastpos < pos) return NULL; @@ -680,7 +700,7 @@ static const char * match_to_right_bracket (void) { int lastpos = pos - 1; - while (linebuf[lastpos + 1] != ']') + while (lastpos < LINELEN - 1 && linebuf[lastpos + 1] != ']') { if (linebuf[lastpos + 1] == '\n') { @@ -690,6 +710,12 @@ match_to_right_bracket (void) ++lastpos; } + if (lastpos >= LINELEN - 1) + { + diag ("line length overrun at %d.\n", 
lastpos); + exit (1); + } + if (lastpos < pos) return 0; -- cgit v1.1 From 34ad198138f7a64355c92090e1db260ee135495d Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Tue, 27 Jul 2021 14:43:57 -0400 Subject: rs6000: Incorporate new builtins code into the build machinery 2021-07-27 Bill Schmidt gcc/ * config.gcc (powerpc*-*-*): Add rs6000-builtins.o to extra_objs. * config/rs6000/rs6000-gen-builtins.c (main): Close init_file last. * config/rs6000/t-rs6000 (rs6000-gen-builtins.o): New target. (rbtree.o): Likewise. (rs6000-gen-builtins): Likewise. (rs6000-builtins.c): Likewise. (rs6000-builtins.h): Likewise. (rs6000.o): Add dependency. (EXTRA_HEADERS): Add rs6000-vecdefines.h. (rs6000-vecdefines.h): New target. (rs6000-builtins.o): Likewise. (rs6000-call.o): Add rs6000-builtins.h as a dependency. (rs6000-c.o): Likewise. --- gcc/config/rs6000/rs6000-gen-builtins.c | 4 ++- gcc/config/rs6000/t-rs6000 | 46 +++++++++++++++++++++++++++++---- 2 files changed, 44 insertions(+), 6 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-gen-builtins.c b/gcc/config/rs6000/rs6000-gen-builtins.c index 05b2d29..8a7505b 100644 --- a/gcc/config/rs6000/rs6000-gen-builtins.c +++ b/gcc/config/rs6000/rs6000-gen-builtins.c @@ -3005,9 +3005,11 @@ main (int argc, const char **argv) exit (1); } + /* Always close init_file last. This avoids race conditions in the + build machinery. See comments in t-rs6000. */ fclose (header_file); - fclose (init_file); fclose (defines_file); + fclose (init_file); return 0; } diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000 index 44f7ffb..e0e8ab8 100644 --- a/gcc/config/rs6000/t-rs6000 +++ b/gcc/config/rs6000/t-rs6000 @@ -27,10 +27,6 @@ rs6000-pcrel-opt.o: $(srcdir)/config/rs6000/rs6000-pcrel-opt.c $(COMPILE) $< $(POSTCOMPILE) -rs6000-c.o: $(srcdir)/config/rs6000/rs6000-c.c - $(COMPILE) $< - $(POSTCOMPILE) - rs6000-string.o: $(srcdir)/config/rs6000/rs6000-string.c $(COMPILE) $< $(POSTCOMPILE) @@ -47,7 +43,47 @@ rs6000-logue.o: $(srcdir)/config/rs6000/rs6000-logue.c $(COMPILE) $< $(POSTCOMPILE) -rs6000-call.o: $(srcdir)/config/rs6000/rs6000-call.c +rs6000-gen-builtins.o: $(srcdir)/config/rs6000/rs6000-gen-builtins.c + $(COMPILE) $< + $(POSTCOMPILE) + +rbtree.o: $(srcdir)/config/rs6000/rbtree.c + $(COMPILE) $< + $(POSTCOMPILE) + +rs6000-gen-builtins: rs6000-gen-builtins.o rbtree.o + $(LINKER_FOR_BUILD) $(BUILD_LINKERFLAGS) $(BUILD_LDFLAGS) -o $@ \ + $(filter-out $(BUILD_LIBDEPS), $^) $(BUILD_LIBS) + +# TODO: Whenever GNU make 4.3 is the minimum required, we should use +# grouped targets on this: +# rs6000-builtins.c rs6000-builtins.h rs6000-vecdefines.h &: +# +# For now, the header files depend on rs6000-builtins.c, which avoids +# races because the .c file is closed last in rs6000-gen-builtins.c. 
+rs6000-builtins.c: rs6000-gen-builtins \ + $(srcdir)/config/rs6000/rs6000-builtin-new.def \ + $(srcdir)/config/rs6000/rs6000-overload.def + ./rs6000-gen-builtins $(srcdir)/config/rs6000/rs6000-builtin-new.def \ + $(srcdir)/config/rs6000/rs6000-overload.def rs6000-builtins.h \ + rs6000-builtins.c rs6000-vecdefines.h + +rs6000-builtins.h: rs6000-builtins.c + +rs6000.o: rs6000-builtins.h + +EXTRA_HEADERS += rs6000-vecdefines.h +rs6000-vecdefines.h: rs6000-builtins.c + +rs6000-builtins.o: rs6000-builtins.c + $(COMPILE) $< + $(POSTCOMPILE) + +rs6000-call.o: $(srcdir)/config/rs6000/rs6000-call.c rs6000-builtins.h + $(COMPILE) $< + $(POSTCOMPILE) + +rs6000-c.o: $(srcdir)/config/rs6000/rs6000-c.c rs6000-builtins.h $(COMPILE) $< $(POSTCOMPILE) -- cgit v1.1 From 596f964f3272081a2320c1220e8aff06ee44fe91 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Tue, 27 Jul 2021 14:46:08 -0400 Subject: rs6000: Add gengtype handling to the build machinery 2021-06-07 Bill Schmidt gcc/ * config.gcc (target_gtfiles): Add ./rs6000-builtins.h. * config/rs6000/t-rs6000 (EXTRA_GTYPE_DEPS): Set. --- gcc/config/rs6000/t-rs6000 | 1 + 1 file changed, 1 insertion(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000 index e0e8ab8..92766d8 100644 --- a/gcc/config/rs6000/t-rs6000 +++ b/gcc/config/rs6000/t-rs6000 @@ -22,6 +22,7 @@ TM_H += $(srcdir)/config/rs6000/rs6000-builtin.def TM_H += $(srcdir)/config/rs6000/rs6000-cpus.def TM_H += $(srcdir)/config/rs6000/rs6000-modes.h PASSES_EXTRA += $(srcdir)/config/rs6000/rs6000-passes.def +EXTRA_GTYPE_DEPS += $(srcdir)/config/rs6000/rs6000-builtin-new.def rs6000-pcrel-opt.o: $(srcdir)/config/rs6000/rs6000-pcrel-opt.c $(COMPILE) $< -- cgit v1.1 From 192d4edd15cabf1f0e88e5a62142cd252542ea0c Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Mon, 23 Aug 2021 17:26:43 -0500 Subject: rs6000: Fix AIX bootstrap (don't call asprintf) 2021-08-23 Bill Schmidt gcc/ * config/rs6000/rs6000-gen-builtins.c (parse_bif_entry): Don't call asprintf, which is not available on AIX. --- gcc/config/rs6000/rs6000-gen-builtins.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-gen-builtins.c b/gcc/config/rs6000/rs6000-gen-builtins.c index 8a7505b..000e5f9 100644 --- a/gcc/config/rs6000/rs6000-gen-builtins.c +++ b/gcc/config/rs6000/rs6000-gen-builtins.c @@ -1794,8 +1794,9 @@ parse_bif_entry (void) /* Append a number representing the order in which this function was encountered to its name, and save in another lookup structure. */ - char *buf; - asprintf (&buf, "%s:%05d", bifs[curr_bif].idname, curr_bif); + int orig_len = strlen (bifs[curr_bif].idname); + char *buf = (char *) malloc (orig_len + 7); + sprintf (buf, "%s:%05d", bifs[curr_bif].idname, curr_bif); if (!rbt_insert (&bifo_rbt, buf)) { -- cgit v1.1 From f8977166135de09fe36a3b57cc11daa67587604e Mon Sep 17 00:00:00 2001 From: Roger Sayle Date: Tue, 24 Aug 2021 03:04:48 +0100 Subject: Tweak -Os costs for scalar-to-vector pass. Back in June I briefly mentioned in one of my gcc-patches posts that a change that should have always reduced code size, would mysteriously occasionally result in slightly larger code (according to CSiBE): https://gcc.gnu.org/pipermail/gcc-patches/2021-June/573233.html Investigating further, the cause turns out to be that x86_64's scalar-to-vector (stv) pass is relying on poor estimates of the size costs/benefits. 
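(A purely illustrative sketch, not taken from the patch or from CSiBE: with -m32 -Os, DImode code of this shape is a typical scalar-to-vector candidate, and whether converting the chain shrinks or grows the function depends on how constants such as the one below are costed.)

  long long
  mask_halves (long long x)
  {
    /* The 64-bit immediate is what the CONST_INT size costing must judge.  */
    return x & 0x00ff00ff00ff00ffLL;
  }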
This patch tweaks the backend's compute_convert_gain method to provide slightly more accurate values when compiling with -Os. Compilation without -Os is (should be) unaffected. And for completeness, I'll mention that the stv pass is a net win for code size so it's much better to improve its heuristics than simply gate the pass on !optimize_for_size. The net effect of this change is to save 1399 bytes on the CSiBE code size benchmark when compiling with -Os. 2021-08-24 Roger Sayle Richard Biener gcc/ChangeLog * config/i386/i386-features.c (compute_convert_gain): Provide more accurate values for CONST_INT, when optimizing for size. * config/i386/i386.c (COSTS_N_BYTES): Move definition from here... * config/i386/i386.h (COSTS_N_BYTES): to here. --- gcc/config/i386/i386-features.c | 38 +++++++++++++++++++++++++++++++++----- gcc/config/i386/i386.c | 2 -- gcc/config/i386/i386.h | 5 +++++ 3 files changed, 38 insertions(+), 7 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c index d9c6652..5a99ea7 100644 --- a/gcc/config/i386/i386-features.c +++ b/gcc/config/i386/i386-features.c @@ -610,12 +610,40 @@ general_scalar_chain::compute_convert_gain () case CONST_INT: if (REG_P (dst)) - /* DImode can be immediate for TARGET_64BIT and SImode always. */ - igain += m * COSTS_N_INSNS (1); + { + if (optimize_insn_for_size_p ()) + { + /* xor (2 bytes) vs. xorps (3 bytes). */ + if (src == const0_rtx) + igain -= COSTS_N_BYTES (1); + /* movdi_internal vs. movv2di_internal. */ + /* => mov (5 bytes) vs. movaps (7 bytes). */ + else if (x86_64_immediate_operand (src, SImode)) + igain -= COSTS_N_BYTES (2); + else + /* ??? Larger immediate constants are placed in the + constant pool, where the size benefit/impact of + STV conversion is affected by whether and how + often each constant pool entry is shared/reused. + The value below is empirically derived from the + CSiBE benchmark (and the optimal value may drift + over time). */ + igain += COSTS_N_BYTES (0); + } + else + { + /* DImode can be immediate for TARGET_64BIT + and SImode always. */ + igain += m * COSTS_N_INSNS (1); + igain -= vector_const_cost (src); + } + } else if (MEM_P (dst)) - igain += (m * ix86_cost->int_store[2] - - ix86_cost->sse_store[sse_cost_idx]); - igain -= vector_const_cost (src); + { + igain += (m * ix86_cost->int_store[2] + - ix86_cost->sse_store[sse_cost_idx]); + igain -= vector_const_cost (src); + } break; default: diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 46844fa..5bff131 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -19982,8 +19982,6 @@ ix86_division_cost (const struct processor_costs *cost, return cost->divide[MODE_INDEX (mode)]; } -#define COSTS_N_BYTES(N) ((N) * 2) - /* Return cost of shift in MODE. If CONSTANT_OP1 is true, the op1 value is known and set in OP1_VAL. AND_IN_OP1 specify in op1 is result of and and SHIFT_AND_TRUNCATE diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 8aba86d..11ac8d0 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -88,6 +88,11 @@ struct stringop_algs } size [MAX_STRINGOP_ALGS]; }; +/* Analog of COSTS_N_INSNS when optimizing for size. */ +#ifndef COSTS_N_BYTES +#define COSTS_N_BYTES(N) ((N) * 2) +#endif + /* Define the specific costs for a given cpu. NB: hard_register is used by TARGET_REGISTER_MOVE_COST and TARGET_MEMORY_MOVE_COST to compute hard register move costs by register allocator. 
Relative costs of -- cgit v1.1 From 6ddb30f941a44bd528904558673ab35394565f08 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Fri, 20 Aug 2021 15:30:40 +0800 Subject: Optimize (a & b) | (c & ~b) to vpternlog instruction. Also optimize below 3 forms to vpternlog, op1, op2, op3 are register_operand or unary_p as (not reg) A: (any_logic (any_logic op1 op2) op3) B: (any_logic (any_logic op1 op2) (any_logic op3 op4)) op3/op4 should be equal to op1/op2 C: (any_logic (any_logic (any_logic:op1 op2) op3) op4) op3/op4 should be equal to op1/op2 gcc/ChangeLog: PR target/101989 * config/i386/i386.c (ix86_rtx_costs): Define cost for UNSPEC_VTERNLOG. * config/i386/i386.h (STRIP_UNARY): New macro. * config/i386/predicates.md (reg_or_notreg_operand): New predicate. * config/i386/sse.md (*_vternlog_all): New define_insn. (*_vternlog_1): New pre_reload define_insn_and_split. (*_vternlog_2): Ditto. (*_vternlog_3): Ditto. (any_logic1,any_logic2): New code iterator. (logic_op): New code attribute. (ternlogsuffix): Extend to VNxDF and VNxSF. gcc/testsuite/ChangeLog: PR target/101989 * gcc.target/i386/pr101989-1.c: New test. * gcc.target/i386/pr101989-2.c: New test. * gcc.target/i386/avx512bw-shiftqihi-constant-1.c: Adjust testcase. --- gcc/config/i386/i386.c | 5 + gcc/config/i386/i386.h | 2 + gcc/config/i386/predicates.md | 7 ++ gcc/config/i386/sse.md | 234 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 248 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 5bff131..ebec866 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -20542,6 +20542,11 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, case UNSPEC: if (XINT (x, 1) == UNSPEC_TP) *total = 0; + else if (XINT(x, 1) == UNSPEC_VTERNLOG) + { + *total = cost->sse_op; + return true; + } return false; case VEC_SELECT: diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 11ac8d0..6511422 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -1716,6 +1716,8 @@ typedef struct ix86_args { #define LEGITIMATE_PIC_OPERAND_P(X) legitimate_pic_operand_p (X) +#define STRIP_UNARY(X) (UNARY_P (X) ? XEXP (X, 0) : X) + #define SYMBOLIC_CONST(X) \ (GET_CODE (X) == SYMBOL_REF \ || GET_CODE (X) == LABEL_REF \ diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index 9321f33..df5acb4 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1044,6 +1044,13 @@ (ior (match_test "op == const1_rtx") (match_test "op == constm1_rtx"))))) +;; True for registers, or (not: registers). Used to optimize 3-operand +;; bitwise operation. +(define_predicate "reg_or_notreg_operand" + (ior (match_operand 0 "register_operand") + (and (match_code "not") + (match_test "register_operand (XEXP (op, 0), mode)")))) + ;; True if OP is acceptable as operand of DImode shift expander. 
(define_predicate "shiftdi_operand" (if_then_else (match_test "TARGET_64BIT") diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 95f9582..25ca9a5 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -933,7 +933,9 @@ ;; Mapping of vector modes to VPTERNLOG suffix (define_mode_attr ternlogsuffix [(V8DI "q") (V4DI "q") (V2DI "q") + (V8DF "q") (V4DF "q") (V2DF "q") (V16SI "d") (V8SI "d") (V4SI "d") + (V16SF "d") (V8SF "d") (V4SF "d") (V32HI "d") (V16HI "d") (V8HI "d") (V64QI "d") (V32QI "d") (V16QI "d")]) @@ -10041,6 +10043,238 @@ (set_attr "prefix" "evex") (set_attr "mode" "")]) +(define_insn "*_vternlog_all" + [(set (match_operand:V 0 "register_operand" "=v") + (unspec:V + [(match_operand:V 1 "register_operand" "0") + (match_operand:V 2 "register_operand" "v") + (match_operand:V 3 "nonimmediate_operand" "vm") + (match_operand:SI 4 "const_0_to_255_operand")] + UNSPEC_VTERNLOG))] + "TARGET_AVX512F" + "vpternlog\t{%4, %3, %2, %0|%0, %2, %3, %4}" + [(set_attr "type" "sselog") + (set_attr "prefix" "evex") + (set_attr "mode" "")]) + +;; There must be lots of other combinations like +;; +;; (any_logic:V +;; (any_logic:V op1 op2) +;; (any_logic:V op1 op3)) +;; +;; (any_logic:V +;; (any_logic:V +;; (any_logic:V op1, op2) +;; op3) +;; op1) +;; +;; and so on. + +(define_code_iterator any_logic1 [and ior xor]) +(define_code_iterator any_logic2 [and ior xor]) +(define_code_attr logic_op [(and "&") (ior "|") (xor "^")]) + +(define_insn_and_split "*_vpternlog_1" + [(set (match_operand:V 0 "register_operand") + (any_logic:V + (any_logic1:V + (match_operand:V 1 "reg_or_notreg_operand") + (match_operand:V 2 "reg_or_notreg_operand")) + (any_logic2:V + (match_operand:V 3 "reg_or_notreg_operand") + (match_operand:V 4 "reg_or_notreg_operand"))))] + "( == 64 || TARGET_AVX512VL) + && ix86_pre_reload_split () + && (rtx_equal_p (STRIP_UNARY (operands[1]), + STRIP_UNARY (operands[4])) + || rtx_equal_p (STRIP_UNARY (operands[2]), + STRIP_UNARY (operands[4])) + || rtx_equal_p (STRIP_UNARY (operands[1]), + STRIP_UNARY (operands[3])) + || rtx_equal_p (STRIP_UNARY (operands[2]), + STRIP_UNARY (operands[3])))" + "#" + "&& 1" + [(set (match_dup 0) + (unspec:V + [(match_dup 6) + (match_dup 2) + (match_dup 1) + (match_dup 5)] + UNSPEC_VTERNLOG))] +{ + /* VPTERNLOGD reg6, reg2, reg1, imm8. */ + int reg6 = 0xF0; + int reg2 = 0xCC; + int reg1 = 0xAA; + int reg3 = 0; + int reg4 = 0; + int reg_mask, tmp1, tmp2; + if (rtx_equal_p (STRIP_UNARY (operands[1]), + STRIP_UNARY (operands[4]))) + { + reg4 = reg1; + reg3 = reg6; + operands[6] = operands[3]; + } + else if (rtx_equal_p (STRIP_UNARY (operands[2]), + STRIP_UNARY (operands[4]))) + { + reg4 = reg2; + reg3 = reg6; + operands[6] = operands[3]; + } + else if (rtx_equal_p (STRIP_UNARY (operands[1]), + STRIP_UNARY (operands[3]))) + { + reg4 = reg6; + reg3 = reg1; + operands[6] = operands[4]; + } + else + { + reg4 = reg6; + reg3 = reg2; + operands[6] = operands[4]; + } + + reg1 = UNARY_P (operands[1]) ? ~reg1 : reg1; + reg2 = UNARY_P (operands[2]) ? ~reg2 : reg2; + reg3 = UNARY_P (operands[3]) ? ~reg3 : reg3; + reg4 = UNARY_P (operands[4]) ? 
~reg4 : reg4; + + tmp1 = reg1 reg2; + tmp2 = reg3 reg4; + reg_mask = tmp1 tmp2; + reg_mask &= 0xFF; + + operands[1] = STRIP_UNARY (operands[1]); + operands[2] = STRIP_UNARY (operands[2]); + operands[6] = STRIP_UNARY (operands[6]); + operands[5] = GEN_INT (reg_mask); +}) + +(define_insn_and_split "*_vpternlog_2" + [(set (match_operand:V 0 "register_operand") + (any_logic:V + (any_logic1:V + (any_logic2:V + (match_operand:V 1 "reg_or_notreg_operand") + (match_operand:V 2 "reg_or_notreg_operand")) + (match_operand:V 3 "reg_or_notreg_operand")) + (match_operand:V 4 "reg_or_notreg_operand")))] + "( == 64 || TARGET_AVX512VL) + && ix86_pre_reload_split () + && (rtx_equal_p (STRIP_UNARY (operands[1]), + STRIP_UNARY (operands[4])) + || rtx_equal_p (STRIP_UNARY (operands[2]), + STRIP_UNARY (operands[4])) + || rtx_equal_p (STRIP_UNARY (operands[1]), + STRIP_UNARY (operands[3])) + || rtx_equal_p (STRIP_UNARY (operands[2]), + STRIP_UNARY (operands[3])))" + "#" + "&& 1" + [(set (match_dup 0) + (unspec:V + [(match_dup 6) + (match_dup 2) + (match_dup 1) + (match_dup 5)] + UNSPEC_VTERNLOG))] +{ + /* VPTERNLOGD reg6, reg2, reg1, imm8. */ + int reg6 = 0xF0; + int reg2 = 0xCC; + int reg1 = 0xAA; + int reg3 = 0; + int reg4 = 0; + int reg_mask, tmp1, tmp2; + if (rtx_equal_p (STRIP_UNARY (operands[1]), + STRIP_UNARY (operands[4]))) + { + reg4 = reg1; + reg3 = reg6; + operands[6] = operands[3]; + } + else if (rtx_equal_p (STRIP_UNARY (operands[2]), + STRIP_UNARY (operands[4]))) + { + reg4 = reg2; + reg3 = reg6; + operands[6] = operands[3]; + } + else if (rtx_equal_p (STRIP_UNARY (operands[1]), + STRIP_UNARY (operands[3]))) + { + reg4 = reg6; + reg3 = reg1; + operands[6] = operands[4]; + } + else + { + reg4 = reg6; + reg3 = reg2; + operands[6] = operands[4]; + } + + reg1 = UNARY_P (operands[1]) ? ~reg1 : reg1; + reg2 = UNARY_P (operands[2]) ? ~reg2 : reg2; + reg3 = UNARY_P (operands[3]) ? ~reg3 : reg3; + reg4 = UNARY_P (operands[4]) ? ~reg4 : reg4; + + tmp1 = reg1 reg2; + tmp2 = tmp1 reg3; + reg_mask = tmp2 reg4; + reg_mask &= 0xFF; + + operands[1] = STRIP_UNARY (operands[1]); + operands[2] = STRIP_UNARY (operands[2]); + operands[6] = STRIP_UNARY (operands[6]); + operands[5] = GEN_INT (reg_mask); +}) + +(define_insn_and_split "*_vpternlog_3" + [(set (match_operand:V 0 "register_operand") + (any_logic:V + (any_logic1:V + (match_operand:V 1 "reg_or_notreg_operand") + (match_operand:V 2 "reg_or_notreg_operand")) + (match_operand:V 3 "reg_or_notreg_operand")))] + "( == 64 || TARGET_AVX512VL) + && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 0) + (unspec:V + [(match_dup 3) + (match_dup 2) + (match_dup 1) + (match_dup 4)] + UNSPEC_VTERNLOG))] +{ + /* VPTERNLOGD reg3, reg2, reg1, imm8. */ + int reg3 = 0xF0; + int reg2 = 0xCC; + int reg1 = 0xAA; + int reg_mask, tmp1; + + reg1 = UNARY_P (operands[1]) ? ~reg1 : reg1; + reg2 = UNARY_P (operands[2]) ? ~reg2 : reg2; + reg3 = UNARY_P (operands[3]) ? ~reg3 : reg3; + + tmp1 = reg1 reg2; + reg_mask = tmp1 reg3; + reg_mask &= 0xFF; + + operands[1] = STRIP_UNARY (operands[1]); + operands[2] = STRIP_UNARY (operands[2]); + operands[3] = STRIP_UNARY (operands[3]); + operands[4] = GEN_INT (reg_mask); +}) + + (define_insn "_vternlog_mask" [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v") (vec_merge:VI48_AVX512VL -- cgit v1.1 From 8da9b4f73c2c878b48f45fa2ed47d8a9edd31262 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Tue, 24 Aug 2021 18:09:33 +0800 Subject: Enable avx512 embedde broadcast for vpternlog. 
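As an illustration (a sketch only, not one of the new test cases; the function and names below are invented), a 256-bit ternary-logic expression whose third operand is a splat of a scalar loaded from memory should now be able to fold that splat into the vpternlogd memory operand as an embedded {1to8} broadcast, instead of materialising it with a separate vpbroadcastd:

/* Hypothetical example; with -O2 -mavx512vl the (a & b) | (vc & ~b)
   pattern below may now be emitted as a single vpternlogd with an
   embedded broadcast of *c.  */
typedef int v8si __attribute__ ((vector_size (32)));

v8si
tern_bcst (v8si a, v8si b, int *c)
{
  v8si vc = { *c, *c, *c, *c, *c, *c, *c, *c };
  return (a & b) | (vc & ~b);
}
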
gcc/ChangeLog: PR target/101989 * config/i386/sse.md (_vternlog): Enable avx512 embedded broadcast. (*_vternlog_all): Ditto. (_vternlog_mask): Ditto. gcc/testsuite/ChangeLog: PR target/101989 * gcc.target/i386/pr101989-broadcast-1.c: New test. --- gcc/config/i386/sse.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 25ca9a5..03fc2df 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -10034,7 +10034,7 @@ (unspec:VI48_AVX512VL [(match_operand:VI48_AVX512VL 1 "register_operand" "0") (match_operand:VI48_AVX512VL 2 "register_operand" "v") - (match_operand:VI48_AVX512VL 3 "nonimmediate_operand" "vm") + (match_operand:VI48_AVX512VL 3 "bcst_vector_operand" "vmBr") (match_operand:SI 4 "const_0_to_255_operand")] UNSPEC_VTERNLOG))] "TARGET_AVX512F" @@ -10048,7 +10048,7 @@ (unspec:V [(match_operand:V 1 "register_operand" "0") (match_operand:V 2 "register_operand" "v") - (match_operand:V 3 "nonimmediate_operand" "vm") + (match_operand:V 3 "bcst_vector_operand" "vmBr") (match_operand:SI 4 "const_0_to_255_operand")] UNSPEC_VTERNLOG))] "TARGET_AVX512F" @@ -10281,7 +10281,7 @@ (unspec:VI48_AVX512VL [(match_operand:VI48_AVX512VL 1 "register_operand" "0") (match_operand:VI48_AVX512VL 2 "register_operand" "v") - (match_operand:VI48_AVX512VL 3 "nonimmediate_operand" "vm") + (match_operand:VI48_AVX512VL 3 "bcst_vector_operand" "vmBr") (match_operand:SI 4 "const_0_to_255_operand")] UNSPEC_VTERNLOG) (match_dup 1) -- cgit v1.1 From 4702d3cf044924970a9a00142542da1edacfd76c Mon Sep 17 00:00:00 2001 From: Richard Earnshaw Date: Fri, 11 Jun 2021 17:18:12 +0100 Subject: arm: Fix general issues with patterns for VLLDM and VLSTM Both lazy_store_multiple_insn and lazy_load_multiple_insn contain invalid RTL (eg they contain a post_inc statement outside of a mem). What's more, the instructions concerned do not modify their input address register. We probably got away with this because they are generated so late in the compilation that no subsequent pass needed to understand them. Nevertheless, this could cause problems someday, so fixed to use a simple legal unspec. gcc: * config/arm/vfp.md (lazy_store_multiple_insn): Rewrite as valid RTL. (lazy_load_multiple_insn): Likewise. --- gcc/config/arm/vfp.md | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/vfp.md b/gcc/config/arm/vfp.md index 93e96369..9961f93 100644 --- a/gcc/config/arm/vfp.md +++ b/gcc/config/arm/vfp.md @@ -1703,12 +1703,15 @@ (set_attr "type" "mov_reg")] ) +;; Both this and the next instruction are treated by GCC in the same +;; way as a blockage pattern. That's perhaps stronger than it needs +;; to be, but we do not want accesses to the VFP register bank to be +;; moved across either instruction. 
+ (define_insn "lazy_store_multiple_insn" - [(set (match_operand:SI 0 "s_register_operand" "+&rk") - (post_dec:SI (match_dup 0))) - (unspec_volatile [(const_int 0) - (mem:SI (post_dec:SI (match_dup 0)))] - VUNSPEC_VLSTM)] + [(unspec_volatile + [(mem:BLK (match_operand:SI 0 "s_register_operand" "rk"))] + VUNSPEC_VLSTM)] "use_cmse && reload_completed" "vlstm%?\\t%0" [(set_attr "predicable" "yes") @@ -1716,11 +1719,9 @@ ) (define_insn "lazy_load_multiple_insn" - [(set (match_operand:SI 0 "s_register_operand" "+&rk") - (post_inc:SI (match_dup 0))) - (unspec_volatile:SI [(const_int 0) - (mem:SI (match_dup 0))] - VUNSPEC_VLLDM)] + [(unspec_volatile + [(mem:BLK (match_operand:SI 0 "s_register_operand" "rk"))] + VUNSPEC_VLLDM)] "use_cmse && reload_completed" "vlldm%?\\t%0" [(set_attr "predicable" "yes") -- cgit v1.1 From 3929bca9ca95de9d35e82ae8828b188029e3eb70 Mon Sep 17 00:00:00 2001 From: Richard Earnshaw Date: Fri, 11 Jun 2021 16:02:05 +0100 Subject: arm: Add command-line option for enabling CVE-2021-35465 mitigation [PR102035] Add a new option, -mfix-cmse-cve-2021-35465 and document it. Enable it automatically for cortex-m33, cortex-m35p and cortex-m55. gcc: PR target/102035 * config/arm/arm.opt (mfix-cmse-cve-2021-35465): New option. * doc/invoke.texi (Arm Options): Document it. * config/arm/arm-cpus.in (quirk_vlldm): New feature bit. (ALL_QUIRKS): Add quirk_vlldm. (cortex-m33): Add quirk_vlldm. (cortex-m35p, cortex-m55): Likewise. * config/arm/arm.c (arm_option_override): Enable fix_vlldm if targetting an affected CPU and not explicitly controlled on the command line. --- gcc/config/arm/arm-cpus.in | 9 +++++++-- gcc/config/arm/arm.c | 9 +++++++++ gcc/config/arm/arm.opt | 4 ++++ 3 files changed, 20 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm-cpus.in b/gcc/config/arm/arm-cpus.in index 249995a..bcc9ebe 100644 --- a/gcc/config/arm/arm-cpus.in +++ b/gcc/config/arm/arm-cpus.in @@ -186,6 +186,9 @@ define feature quirk_armv6kz # Cortex-M3 LDRD quirk. define feature quirk_cm3_ldrd +# v8-m/v8.1-m VLLDM errata. +define feature quirk_vlldm + # Don't use .cpu assembly directive define feature quirk_no_asmcpu @@ -322,7 +325,7 @@ define implied vfp_base MVE MVE_FP ALL_FP # architectures. # xscale isn't really a 'quirk', but it isn't an architecture either and we # need to ignore it for matching purposes. -define fgroup ALL_QUIRKS quirk_no_volatile_ce quirk_armv6kz quirk_cm3_ldrd xscale quirk_no_asmcpu +define fgroup ALL_QUIRKS quirk_no_volatile_ce quirk_armv6kz quirk_cm3_ldrd quirk_vlldm xscale quirk_no_asmcpu define fgroup IGNORE_FOR_MULTILIB cdecp0 cdecp1 cdecp2 cdecp3 cdecp4 cdecp5 cdecp6 cdecp7 @@ -1571,6 +1574,7 @@ begin cpu cortex-m33 architecture armv8-m.main+dsp+fp option nofp remove ALL_FP option nodsp remove armv7em + isa quirk_vlldm costs v7m end cpu cortex-m33 @@ -1580,6 +1584,7 @@ begin cpu cortex-m35p architecture armv8-m.main+dsp+fp option nofp remove ALL_FP option nodsp remove armv7em + isa quirk_vlldm costs v7m end cpu cortex-m35p @@ -1591,7 +1596,7 @@ begin cpu cortex-m55 option nomve remove mve mve_float option nofp remove ALL_FP mve_float option nodsp remove MVE mve_float - isa quirk_no_asmcpu + isa quirk_no_asmcpu quirk_vlldm costs v7m vendor 41 end cpu cortex-m55 diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 11dafc7..5c92941 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -3616,6 +3616,15 @@ arm_option_override (void) fix_cm3_ldrd = 0; } + /* Enable fix_vlldm by default if required. 
*/ + if (fix_vlldm == 2) + { + if (bitmap_bit_p (arm_active_target.isa, isa_bit_quirk_vlldm)) + fix_vlldm = 1; + else + fix_vlldm = 0; + } + /* Hot/Cold partitioning is not currently supported, since we can't handle literal pool placement in that case. */ if (flag_reorder_blocks_and_partition) diff --git a/gcc/config/arm/arm.opt b/gcc/config/arm/arm.opt index 7417b55..a7677ee 100644 --- a/gcc/config/arm/arm.opt +++ b/gcc/config/arm/arm.opt @@ -268,6 +268,10 @@ Target Var(fix_cm3_ldrd) Init(2) Avoid overlapping destination and address registers on LDRD instructions that may trigger Cortex-M3 errata. +mfix-cmse-cve-2021-35465 +Target Var(fix_vlldm) Init(2) +Mitigate issues with VLLDM on some M-profile devices (CVE-2021-35465). + munaligned-access Target Var(unaligned_access) Init(2) Save Enable unaligned word and halfword accesses to packed data. -- cgit v1.1 From 30461cf8dba3d3adb15a125e4da48800eb2b9b8f Mon Sep 17 00:00:00 2001 From: Richard Earnshaw Date: Fri, 18 Jun 2021 17:18:37 +0100 Subject: arm: fix vlldm erratum for Armv8.1-m [PR102035] For Armv8.1-m we generate code that emits VLLDM directly and do not rely on support code in the library, so emit the mitigation directly as well, when required. In this case, we can use the compiler options to determine when to apply the fix and when it is safe to omit it. gcc: PR target/102035 * config/arm/arm.md (attribute arch): Add fix_vlldm. (arch_enabled): Use it. * config/arm/vfp.md (lazy_store_multiple_insn): Add alternative to use when erratum mitigation is needed. --- gcc/config/arm/arm.md | 11 +++++++++-- gcc/config/arm/vfp.md | 10 +++++++--- 2 files changed, 16 insertions(+), 5 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md index 0646048..5d3f21b 100644 --- a/gcc/config/arm/arm.md +++ b/gcc/config/arm/arm.md @@ -132,9 +132,12 @@ ; TARGET_32BIT, "t1" or "t2" to specify a specific Thumb mode. "v6" ; for ARM or Thumb-2 with arm_arch6, and nov6 for ARM without ; arm_arch6. "v6t2" for Thumb-2 with arm_arch6 and "v8mb" for ARMv8-M -; Baseline. This attribute is used to compute attribute "enabled", +; Baseline. "fix_vlldm" is for fixing the v8-m/v8.1-m VLLDM erratum. +; This attribute is used to compute attribute "enabled", ; use type "any" to enable an alternative in all cases. 
-(define_attr "arch" "any,a,t,32,t1,t2,v6,nov6,v6t2,v8mb,iwmmxt,iwmmxt2,armv6_or_vfpv3,neon,mve" +(define_attr "arch" "any, a, t, 32, t1, t2, v6,nov6, v6t2, \ + v8mb, fix_vlldm, iwmmxt, iwmmxt2, armv6_or_vfpv3, \ + neon, mve" (const_string "any")) (define_attr "arch_enabled" "no,yes" @@ -177,6 +180,10 @@ (match_test "TARGET_THUMB1 && arm_arch8")) (const_string "yes") + (and (eq_attr "arch" "fix_vlldm") + (match_test "fix_vlldm")) + (const_string "yes") + (and (eq_attr "arch" "iwmmxt2") (match_test "TARGET_REALLY_IWMMXT2")) (const_string "yes") diff --git a/gcc/config/arm/vfp.md b/gcc/config/arm/vfp.md index 9961f93..f0030a8 100644 --- a/gcc/config/arm/vfp.md +++ b/gcc/config/arm/vfp.md @@ -1720,11 +1720,15 @@ (define_insn "lazy_load_multiple_insn" [(unspec_volatile - [(mem:BLK (match_operand:SI 0 "s_register_operand" "rk"))] + [(mem:BLK (match_operand:SI 0 "s_register_operand" "rk,rk"))] VUNSPEC_VLLDM)] "use_cmse && reload_completed" - "vlldm%?\\t%0" - [(set_attr "predicable" "yes") + "@ + vscclrm\\t{vpr}\;vlldm\\t%0 + vlldm\\t%0" + [(set_attr "arch" "fix_vlldm,*") + (set_attr "predicable" "no") + (set_attr "length" "8,4") (set_attr "type" "load_4")] ) -- cgit v1.1 From 6e5401e87d02919b0594e04f828892deef956407 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Mon, 23 Aug 2021 14:47:03 -0700 Subject: x86: Broadcast from integer to a pseudo vector register Broadcast from integer to a pseudo vector register instead of a hard vector register to allow LRA to remove redundant move instruction after broadcast. gcc/ PR target/102021 * config/i386/i386-expand.c (ix86_expand_vector_move): Broadcast from integer to a pseudo vector register. gcc/testsuite/ PR target/102021 * gcc.target/i386/pr100865-10b.c: Expect vzeroupper. * gcc.target/i386/pr100865-4b.c: Likewise. * gcc.target/i386/pr100865-6b.c: Expect vmovdqu and vzeroupper. * gcc.target/i386/pr100865-7b.c: Likewise. * gcc.target/i386/pr102021.c: New test. --- gcc/config/i386/i386-expand.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 9bf13db..2500dbf 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -579,19 +579,10 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[]) { /* Broadcast to XMM/YMM/ZMM register from an integer constant or scalar mem. */ - /* Hard registers are used for 2 purposes: - 1. Prevent stack realignment when the original code - doesn't use vector registers, which is the same for - memcpy and memset. - 2. Prevent combine to convert constant broadcast to - load from constant pool. */ - op1 = ix86_gen_scratch_sse_rtx (mode); + op1 = gen_reg_rtx (mode); if (FLOAT_MODE_P (mode) || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode)) - { - first = force_const_mem (GET_MODE_INNER (mode), first); - op1 = gen_reg_rtx (mode); - } + first = force_const_mem (GET_MODE_INNER (mode), first); bool ok = ix86_expand_vector_init_duplicate (false, mode, op1, first); gcc_assert (ok); -- cgit v1.1 From fce8a52d0aef5f0ef393f68d31669058e0ddfd71 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Fri, 2 Apr 2021 16:48:36 -0500 Subject: rs6000: Add power7 and power7-64 builtins 2021-04-02 Bill Schmidt gcc/ * config/rs6000/rs6000-builtin-new.def: Add power7 and power7-64 stanzas. 
--- gcc/config/rs6000/rs6000-builtin-new.def | 39 ++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-builtin-new.def b/gcc/config/rs6000/rs6000-builtin-new.def index 61f5b94..a310bf4 100644 --- a/gcc/config/rs6000/rs6000-builtin-new.def +++ b/gcc/config/rs6000/rs6000-builtin-new.def @@ -1961,3 +1961,42 @@ const vsll __builtin_vsx_xxspltd_2di (vsll, const int<1>); XXSPLTD_V2DI vsx_xxspltd_v2di {} + + +; Power7 builtins (ISA 2.06). +[power7] + const unsigned int __builtin_addg6s (unsigned int, unsigned int); + ADDG6S addg6s {} + + const signed long __builtin_bpermd (signed long, signed long); + BPERMD bpermd_di {} + + const unsigned int __builtin_cbcdtd (unsigned int); + CBCDTD cbcdtd {} + + const unsigned int __builtin_cdtbcd (unsigned int); + CDTBCD cdtbcd {} + + const signed int __builtin_divwe (signed int, signed int); + DIVWE dive_si {} + + const unsigned int __builtin_divweu (unsigned int, unsigned int); + DIVWEU diveu_si {} + + const vsq __builtin_pack_vector_int128 (unsigned long long, unsigned long long); + PACK_V1TI packv1ti {} + + void __builtin_ppc_speculation_barrier (); + SPECBARR speculation_barrier {} + + const unsigned long __builtin_unpack_vector_int128 (vsq, const int<1>); + UNPACK_V1TI unpackv1ti {} + + +; Power7 builtins requiring 64-bit GPRs (even with 32-bit addressing). +[power7-64] + const signed long long __builtin_divde (signed long long, signed long long); + DIVDE dive_di {} + + const unsigned long long __builtin_divdeu (unsigned long long, unsigned long long); + DIVDEU diveu_di {} -- cgit v1.1 From 8ce18a29ef717f5920ebf5dc1d9e84570a1827d4 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Thu, 1 Apr 2021 13:57:44 -0500 Subject: rs6000: Add power8-vector builtins 2021-04-01 Bill Schmidt gcc/ * config/rs6000/rs6000-builtin-new.def: Add power8-vector stanza. --- gcc/config/rs6000/rs6000-builtin-new.def | 438 +++++++++++++++++++++++++++++++ 1 file changed, 438 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-builtin-new.def b/gcc/config/rs6000/rs6000-builtin-new.def index a310bf4..2a2c913 100644 --- a/gcc/config/rs6000/rs6000-builtin-new.def +++ b/gcc/config/rs6000/rs6000-builtin-new.def @@ -2000,3 +2000,441 @@ const unsigned long long __builtin_divdeu (unsigned long long, unsigned long long); DIVDEU diveu_di {} + + +; Power8 vector built-ins. 
+[power8-vector] + const vsll __builtin_altivec_abs_v2di (vsll); + ABS_V2DI absv2di2 {} + + const vsc __builtin_altivec_bcddiv10_v16qi (vsc); + BCDDIV10_V16QI bcddiv10_v16qi {} + + const vsc __builtin_altivec_bcdmul10_v16qi (vsc); + BCDMUL10_V16QI bcdmul10_v16qi {} + + const vsc __builtin_altivec_eqv_v16qi (vsc, vsc); + EQV_V16QI eqvv16qi3 {} + + const vuc __builtin_altivec_eqv_v16qi_uns (vuc, vuc); + EQV_V16QI_UNS eqvv16qi3 {} + + const vsq __builtin_altivec_eqv_v1ti (vsq, vsq); + EQV_V1TI eqvv1ti3 {} + + const vuq __builtin_altivec_eqv_v1ti_uns (vuq, vuq); + EQV_V1TI_UNS eqvv1ti3 {} + + const vd __builtin_altivec_eqv_v2df (vd, vd); + EQV_V2DF eqvv2df3 {} + + const vsll __builtin_altivec_eqv_v2di (vsll, vsll); + EQV_V2DI eqvv2di3 {} + + const vull __builtin_altivec_eqv_v2di_uns (vull, vull); + EQV_V2DI_UNS eqvv2di3 {} + + const vf __builtin_altivec_eqv_v4sf (vf, vf); + EQV_V4SF eqvv4sf3 {} + + const vsi __builtin_altivec_eqv_v4si (vsi, vsi); + EQV_V4SI eqvv4si3 {} + + const vui __builtin_altivec_eqv_v4si_uns (vui, vui); + EQV_V4SI_UNS eqvv4si3 {} + + const vss __builtin_altivec_eqv_v8hi (vss, vss); + EQV_V8HI eqvv8hi3 {} + + const vus __builtin_altivec_eqv_v8hi_uns (vus, vus); + EQV_V8HI_UNS eqvv8hi3 {} + + const vsc __builtin_altivec_nand_v16qi (vsc, vsc); + NAND_V16QI nandv16qi3 {} + + const vuc __builtin_altivec_nand_v16qi_uns (vuc, vuc); + NAND_V16QI_UNS nandv16qi3 {} + + const vsq __builtin_altivec_nand_v1ti (vsq, vsq); + NAND_V1TI nandv1ti3 {} + + const vuq __builtin_altivec_nand_v1ti_uns (vuq, vuq); + NAND_V1TI_UNS nandv1ti3 {} + + const vd __builtin_altivec_nand_v2df (vd, vd); + NAND_V2DF nandv2df3 {} + + const vsll __builtin_altivec_nand_v2di (vsll, vsll); + NAND_V2DI nandv2di3 {} + + const vull __builtin_altivec_nand_v2di_uns (vull, vull); + NAND_V2DI_UNS nandv2di3 {} + + const vf __builtin_altivec_nand_v4sf (vf, vf); + NAND_V4SF nandv4sf3 {} + + const vsi __builtin_altivec_nand_v4si (vsi, vsi); + NAND_V4SI nandv4si3 {} + + const vui __builtin_altivec_nand_v4si_uns (vui, vui); + NAND_V4SI_UNS nandv4si3 {} + + const vss __builtin_altivec_nand_v8hi (vss, vss); + NAND_V8HI nandv8hi3 {} + + const vus __builtin_altivec_nand_v8hi_uns (vus, vus); + NAND_V8HI_UNS nandv8hi3 {} + + const vsc __builtin_altivec_neg_v16qi (vsc); + NEG_V16QI negv16qi2 {} + + const vd __builtin_altivec_neg_v2df (vd); + NEG_V2DF negv2df2 {} + + const vsll __builtin_altivec_neg_v2di (vsll); + NEG_V2DI negv2di2 {} + + const vf __builtin_altivec_neg_v4sf (vf); + NEG_V4SF negv4sf2 {} + + const vsi __builtin_altivec_neg_v4si (vsi); + NEG_V4SI negv4si2 {} + + const vss __builtin_altivec_neg_v8hi (vss); + NEG_V8HI negv8hi2 {} + + const vsc __builtin_altivec_orc_v16qi (vsc, vsc); + ORC_V16QI orcv16qi3 {} + + const vuc __builtin_altivec_orc_v16qi_uns (vuc, vuc); + ORC_V16QI_UNS orcv16qi3 {} + + const vsq __builtin_altivec_orc_v1ti (vsq, vsq); + ORC_V1TI orcv1ti3 {} + + const vuq __builtin_altivec_orc_v1ti_uns (vuq, vuq); + ORC_V1TI_UNS orcv1ti3 {} + + const vd __builtin_altivec_orc_v2df (vd, vd); + ORC_V2DF orcv2df3 {} + + const vsll __builtin_altivec_orc_v2di (vsll, vsll); + ORC_V2DI orcv2di3 {} + + const vull __builtin_altivec_orc_v2di_uns (vull, vull); + ORC_V2DI_UNS orcv2di3 {} + + const vf __builtin_altivec_orc_v4sf (vf, vf); + ORC_V4SF orcv4sf3 {} + + const vsi __builtin_altivec_orc_v4si (vsi, vsi); + ORC_V4SI orcv4si3 {} + + const vui __builtin_altivec_orc_v4si_uns (vui, vui); + ORC_V4SI_UNS orcv4si3 {} + + const vss __builtin_altivec_orc_v8hi (vss, vss); + ORC_V8HI orcv8hi3 {} + + const vus 
__builtin_altivec_orc_v8hi_uns (vus, vus); + ORC_V8HI_UNS orcv8hi3 {} + + const vsc __builtin_altivec_vclzb (vsc); + VCLZB clzv16qi2 {} + + const vsll __builtin_altivec_vclzd (vsll); + VCLZD clzv2di2 {} + + const vss __builtin_altivec_vclzh (vss); + VCLZH clzv8hi2 {} + + const vsi __builtin_altivec_vclzw (vsi); + VCLZW clzv4si2 {} + + const vuc __builtin_altivec_vgbbd (vuc); + VGBBD p8v_vgbbd {} + + const vsq __builtin_altivec_vaddcuq (vsq, vsq); + VADDCUQ altivec_vaddcuq {} + + const vsq __builtin_altivec_vaddecuq (vsq, vsq, vsq); + VADDECUQ altivec_vaddecuq {} + + const vsq __builtin_altivec_vaddeuqm (vsq, vsq, vsq); + VADDEUQM altivec_vaddeuqm {} + + const vsll __builtin_altivec_vaddudm (vsll, vsll); + VADDUDM addv2di3 {} + + const vsq __builtin_altivec_vadduqm (vsq, vsq); + VADDUQM altivec_vadduqm {} + + const vsll __builtin_altivec_vbpermq (vsc, vsc); + VBPERMQ altivec_vbpermq {} + + const vsc __builtin_altivec_vbpermq2 (vsc, vsc); + VBPERMQ2 altivec_vbpermq2 {} + + const vsll __builtin_altivec_vmaxsd (vsll, vsll); + VMAXSD smaxv2di3 {} + + const vull __builtin_altivec_vmaxud (vull, vull); + VMAXUD umaxv2di3 {} + + const vsll __builtin_altivec_vminsd (vsll, vsll); + VMINSD sminv2di3 {} + + const vull __builtin_altivec_vminud (vull, vull); + VMINUD uminv2di3 {} + + const vd __builtin_altivec_vmrgew_v2df (vd, vd); + VMRGEW_V2DF p8_vmrgew_v2df {} + + const vsll __builtin_altivec_vmrgew_v2di (vsll, vsll); + VMRGEW_V2DI p8_vmrgew_v2di {} + + const vf __builtin_altivec_vmrgew_v4sf (vf, vf); + VMRGEW_V4SF p8_vmrgew_v4sf {} + + const vsi __builtin_altivec_vmrgew_v4si (vsi, vsi); + VMRGEW_V4SI p8_vmrgew_v4si {} + + const vd __builtin_altivec_vmrgow_v2df (vd, vd); + VMRGOW_V2DF p8_vmrgow_v2df {} + + const vsll __builtin_altivec_vmrgow_v2di (vsll, vsll); + VMRGOW_V2DI p8_vmrgow_v2di {} + + const vf __builtin_altivec_vmrgow_v4sf (vf, vf); + VMRGOW_V4SF p8_vmrgow_v4sf {} + + const vsi __builtin_altivec_vmrgow_v4si (vsi, vsi); + VMRGOW_V4SI p8_vmrgow_v4si {} + + const vsc __builtin_altivec_vpermxor (vsc, vsc, vsc); + VPERMXOR altivec_vpermxor {} + + const vsi __builtin_altivec_vpksdss (vsll, vsll); + VPKSDSS altivec_vpksdss {} + + const vsi __builtin_altivec_vpksdus (vsll, vsll); + VPKSDUS altivec_vpksdus {} + + const vsi __builtin_altivec_vpkudum (vsll, vsll); + VPKUDUM altivec_vpkudum {} + + const vsi __builtin_altivec_vpkudus (vsll, vsll); + VPKUDUS altivec_vpkudus {} + + const vsc __builtin_altivec_vpmsumb (vsc, vsc); + VPMSUMB_A crypto_vpmsumb {} + + const vsll __builtin_altivec_vpmsumd (vsll, vsll); + VPMSUMD_A crypto_vpmsumd {} + + const vss __builtin_altivec_vpmsumh (vss, vss); + VPMSUMH_A crypto_vpmsumh {} + + const vsi __builtin_altivec_vpmsumw (vsi, vsi); + VPMSUMW_A crypto_vpmsumw {} + + const vsc __builtin_altivec_vpopcntb (vsc); + VPOPCNTB popcountv16qi2 {} + + const vsll __builtin_altivec_vpopcntd (vsll); + VPOPCNTD popcountv2di2 {} + + const vss __builtin_altivec_vpopcnth (vss); + VPOPCNTH popcountv8hi2 {} + + const vsc __builtin_altivec_vpopcntub (vsc); + VPOPCNTUB popcountv16qi2 {} + + const vsll __builtin_altivec_vpopcntud (vsll); + VPOPCNTUD popcountv2di2 {} + + const vss __builtin_altivec_vpopcntuh (vss); + VPOPCNTUH popcountv8hi2 {} + + const vsi __builtin_altivec_vpopcntuw (vsi); + VPOPCNTUW popcountv4si2 {} + + const vsi __builtin_altivec_vpopcntw (vsi); + VPOPCNTW popcountv4si2 {} + + const vsll __builtin_altivec_vrld (vsll, vsll); + VRLD vrotlv2di3 {} + + const vsll __builtin_altivec_vsld (vsll, vsll); + VSLD vashlv2di3 {} + + const vsll __builtin_altivec_vsrad (vsll, 
vsll); + VSRAD vashrv2di3 {} + + const vsll __builtin_altivec_vsrd (vsll, vull); + VSRD vlshrv2di3 {} + + const vsq __builtin_altivec_vsubcuq (vsq, vsq); + VSUBCUQ altivec_vsubcuq {} + + const vsq __builtin_altivec_vsubecuq (vsq, vsq, vsq); + VSUBECUQ altivec_vsubecuq {} + + const vsq __builtin_altivec_vsubeuqm (vsq, vsq, vsq); + VSUBEUQM altivec_vsubeuqm {} + + const vsll __builtin_altivec_vsubudm (vsll, vsll); + VSUBUDM subv2di3 {} + + const vsq __builtin_altivec_vsubuqm (vsq, vsq); + VSUBUQM altivec_vsubuqm {} + + const vsll __builtin_altivec_vupkhsw (vsi); + VUPKHSW altivec_vupkhsw {} + + const vsll __builtin_altivec_vupklsw (vsi); + VUPKLSW altivec_vupklsw {} + + const vsq __builtin_bcdadd_v1ti (vsq, vsq, const int<1>); + BCDADD_V1TI bcdadd_v1ti {} + + const vsc __builtin_bcdadd_v16qi (vsc, vsc, const int<1>); + BCDADD_V16QI bcdadd_v16qi {} + + const signed int __builtin_bcdadd_eq_v1ti (vsq, vsq, const int<1>); + BCDADD_EQ_V1TI bcdadd_eq_v1ti {} + + const signed int __builtin_bcdadd_eq_v16qi (vsc, vsc, const int<1>); + BCDADD_EQ_V16QI bcdadd_eq_v16qi {} + + const signed int __builtin_bcdadd_gt_v1ti (vsq, vsq, const int<1>); + BCDADD_GT_V1TI bcdadd_gt_v1ti {} + + const signed int __builtin_bcdadd_gt_v16qi (vsc, vsc, const int<1>); + BCDADD_GT_V16QI bcdadd_gt_v16qi {} + + const signed int __builtin_bcdadd_lt_v1ti (vsq, vsq, const int<1>); + BCDADD_LT_V1TI bcdadd_lt_v1ti {} + + const signed int __builtin_bcdadd_lt_v16qi (vsc, vsc, const int<1>); + BCDADD_LT_V16QI bcdadd_lt_v16qi {} + + const signed int __builtin_bcdadd_ov_v1ti (vsq, vsq, const int<1>); + BCDADD_OV_V1TI bcdadd_unordered_v1ti {} + + const signed int __builtin_bcdadd_ov_v16qi (vsc, vsc, const int<1>); + BCDADD_OV_V16QI bcdadd_unordered_v16qi {} + + const signed int __builtin_bcdinvalid_v1ti (vsq); + BCDINVALID_V1TI bcdinvalid_v1ti {} + + const signed int __builtin_bcdinvalid_v16qi (vsc); + BCDINVALID_V16QI bcdinvalid_v16qi {} + + const vsq __builtin_bcdsub_v1ti (vsq, vsq, const int<1>); + BCDSUB_V1TI bcdsub_v1ti {} + + const vsc __builtin_bcdsub_v16qi (vsc, vsc, const int<1>); + BCDSUB_V16QI bcdsub_v16qi {} + + const signed int __builtin_bcdsub_eq_v1ti (vsq, vsq, const int<1>); + BCDSUB_EQ_V1TI bcdsub_eq_v1ti {} + + const signed int __builtin_bcdsub_eq_v16qi (vsc, vsc, const int<1>); + BCDSUB_EQ_V16QI bcdsub_eq_v16qi {} + + const signed int __builtin_bcdsub_ge_v1ti (vsq, vsq, const int<1>); + BCDSUB_GE_V1TI bcdsub_ge_v1ti {} + + const signed int __builtin_bcdsub_ge_v16qi (vsc, vsc, const int<1>); + BCDSUB_GE_V16QI bcdsub_ge_v16qi {} + + const signed int __builtin_bcdsub_gt_v1ti (vsq, vsq, const int<1>); + BCDSUB_GT_V1TI bcdsub_gt_v1ti {} + + const signed int __builtin_bcdsub_gt_v16qi (vsc, vsc, const int<1>); + BCDSUB_GT_V16QI bcdsub_gt_v16qi {} + + const signed int __builtin_bcdsub_le_v1ti (vsq, vsq, const int<1>); + BCDSUB_LE_V1TI bcdsub_le_v1ti {} + + const signed int __builtin_bcdsub_le_v16qi (vsc, vsc, const int<1>); + BCDSUB_LE_V16QI bcdsub_le_v16qi {} + + const signed int __builtin_bcdsub_lt_v1ti (vsq, vsq, const int<1>); + BCDSUB_LT_V1TI bcdsub_lt_v1ti {} + + const signed int __builtin_bcdsub_lt_v16qi (vsc, vsc, const int<1>); + BCDSUB_LT_V16QI bcdsub_lt_v16qi {} + + const signed int __builtin_bcdsub_ov_v1ti (vsq, vsq, const int<1>); + BCDSUB_OV_V1TI bcdsub_unordered_v1ti {} + + const signed int __builtin_bcdsub_ov_v16qi (vsc, vsc, const int<1>); + BCDSUB_OV_V16QI bcdsub_unordered_v16qi {} + + const vuc __builtin_crypto_vpermxor_v16qi (vuc, vuc, vuc); + VPERMXOR_V16QI crypto_vpermxor_v16qi {} + + const vull 
__builtin_crypto_vpermxor_v2di (vull, vull, vull); + VPERMXOR_V2DI crypto_vpermxor_v2di {} + + const vui __builtin_crypto_vpermxor_v4si (vui, vui, vui); + VPERMXOR_V4SI crypto_vpermxor_v4si {} + + const vus __builtin_crypto_vpermxor_v8hi (vus, vus, vus); + VPERMXOR_V8HI crypto_vpermxor_v8hi {} + + const vuc __builtin_crypto_vpmsumb (vuc, vuc); + VPMSUMB crypto_vpmsumb {} + + const vull __builtin_crypto_vpmsumd (vull, vull); + VPMSUMD crypto_vpmsumd {} + + const vus __builtin_crypto_vpmsumh (vus, vus); + VPMSUMH crypto_vpmsumh {} + + const vui __builtin_crypto_vpmsumw (vui, vui); + VPMSUMW crypto_vpmsumw {} + + const vf __builtin_vsx_float2_v2df (vd, vd); + FLOAT2_V2DF float2_v2df {} + + const vf __builtin_vsx_float2_v2di (vsll, vsll); + FLOAT2_V2DI float2_v2di {} + + const vsc __builtin_vsx_revb_v16qi (vsc); + REVB_V16QI revb_v16qi {} + + const vsq __builtin_vsx_revb_v1ti (vsq); + REVB_V1TI revb_v1ti {} + + const vd __builtin_vsx_revb_v2df (vd); + REVB_V2DF revb_v2df {} + + const vsll __builtin_vsx_revb_v2di (vsll); + REVB_V2DI revb_v2di {} + + const vf __builtin_vsx_revb_v4sf (vf); + REVB_V4SF revb_v4sf {} + + const vsi __builtin_vsx_revb_v4si (vsi); + REVB_V4SI revb_v4si {} + + const vss __builtin_vsx_revb_v8hi (vss); + REVB_V8HI revb_v8hi {} + + const vf __builtin_vsx_uns_float2_v2di (vsll, vsll); + UNS_FLOAT2_V2DI uns_float2_v2di {} + + const vsi __builtin_vsx_vsigned2_v2df (vd, vd); + VEC_VSIGNED2_V2DF vsigned2_v2df {} + + const vsi __builtin_vsx_vunsigned2_v2df (vd, vd); + VEC_VUNSIGNED2_V2DF vunsigned2_v2df {} + + const vf __builtin_vsx_xscvdpspn (double); + XSCVDPSPN vsx_xscvdpspn {} + + const double __builtin_vsx_xscvspdpn (vf); + XSCVSPDPN vsx_xscvspdpn {} -- cgit v1.1 From 3c496e92d795a8fe5c527e3c5b5a6606669ae50d Mon Sep 17 00:00:00 2001 From: Roger Sayle Date: Tue, 24 Aug 2021 18:02:18 +0100 Subject: nvptx: Add a __PTX_SM__ predefined macro based on target ISA. This patch adds a __PTX_SM__ predefined macro to the nvptx backend that allows code to check the compute model being targeted by the compiler. This is equivalent to the __CUDA_ARCH__ macro defined by CUDA's nvcc compiler, but to avoid causing problems for source code that checks for that compiler, this macro uses GCC's nomenclature; it's easy enough for users to "#define __CUDA_ARCH__ __PTX_SM__". What might have been a four line patch is actually a little more complicated, as this patch takes the opportunity to upgrade the nvptx backend to use the now preferred nvptx-c.c idiom. 2021-08-24 Roger Sayle Tom de Vries gcc/ChangeLog * config.gcc (nvptx-*-*): Define {c,c++}_target_objs. * config/nvptx/nvptx-protos.h (nvptx_cpu_cpp_builtins): Prototype. * config/nvptx/nvptx.h (TARGET_CPU_CPP_BUILTINS): Implement with a call to the new nvptx_cpu_cpp_builtins function in nvptx-c.c. * config/nvptx/t-nvptx (nvptx-c.o): New rule. * config/nvptx/nvptx-c.c: New source file. (nvptx_cpu_cpp_builtins): Move implementation here. --- gcc/config/nvptx/nvptx-c.c | 47 +++++++++++++++++++++++++++++++++++++++++ gcc/config/nvptx/nvptx-protos.h | 1 + gcc/config/nvptx/nvptx.h | 12 +---------- gcc/config/nvptx/t-nvptx | 4 ++++ 4 files changed, 53 insertions(+), 11 deletions(-) create mode 100644 gcc/config/nvptx/nvptx-c.c (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx-c.c b/gcc/config/nvptx/nvptx-c.c new file mode 100644 index 0000000..72594a82e --- /dev/null +++ b/gcc/config/nvptx/nvptx-c.c @@ -0,0 +1,47 @@ +/* Subroutines for the C front end on the NVPTX architecture. + * Copyright (C) 2021 Free Software Foundation, Inc. 
+ * + * This file is part of GCC. + * + * GCC is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 3, or (at your + * option) any later version. + * + * GCC is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + * License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GCC; see the file COPYING3. If not see + * . */ + +#define IN_TARGET_CODE 1 + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "target.h" +#include "c-family/c-common.h" +#include "memmodel.h" +#include "tm_p.h" +#include "c-family/c-pragma.h" + +/* Function to tell the preprocessor about the defines for the target. */ +void +nvptx_cpu_cpp_builtins (void) +{ + cpp_assert (parse_in, "machine=nvptx"); + cpp_assert (parse_in, "cpu=nvptx"); + cpp_define (parse_in, "__nvptx__"); + if (TARGET_SOFT_STACK) + cpp_define (parse_in, "__nvptx_softstack__"); + if (TARGET_UNIFORM_SIMT) + cpp_define (parse_in,"__nvptx_unisimt__"); + if (TARGET_SM35) + cpp_define (parse_in, "__PTX_SM__=350"); + else + cpp_define (parse_in,"__PTX_SM__=300"); +} + diff --git a/gcc/config/nvptx/nvptx-protos.h b/gcc/config/nvptx/nvptx-protos.h index b7e6ae2..b29ddc9 100644 --- a/gcc/config/nvptx/nvptx-protos.h +++ b/gcc/config/nvptx/nvptx-protos.h @@ -40,6 +40,7 @@ extern void nvptx_output_aligned_decl (FILE *file, const char *name, extern void nvptx_function_end (FILE *); extern void nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT); extern void nvptx_output_ascii (FILE *, const char *, unsigned HOST_WIDE_INT); +extern void nvptx_cpu_cpp_builtins (void); extern void nvptx_register_pragmas (void); extern unsigned int nvptx_data_alignment (const_tree, unsigned int); diff --git a/gcc/config/nvptx/nvptx.h b/gcc/config/nvptx/nvptx.h index fdaacdd..d367174 100644 --- a/gcc/config/nvptx/nvptx.h +++ b/gcc/config/nvptx/nvptx.h @@ -34,17 +34,7 @@ nvptx-as. */ #define ASM_SPEC "%{misa=*:-m %*; :-m sm_35}" -#define TARGET_CPU_CPP_BUILTINS() \ - do \ - { \ - builtin_assert ("machine=nvptx"); \ - builtin_assert ("cpu=nvptx"); \ - builtin_define ("__nvptx__"); \ - if (TARGET_SOFT_STACK) \ - builtin_define ("__nvptx_softstack__"); \ - if (TARGET_UNIFORM_SIMT) \ - builtin_define ("__nvptx_unisimt__"); \ - } while (0) +#define TARGET_CPU_CPP_BUILTINS() nvptx_cpu_cpp_builtins () /* Avoid the default in ../../gcc.c, which adds "-pthread", which is not supported for nvptx. */ diff --git a/gcc/config/nvptx/t-nvptx b/gcc/config/nvptx/t-nvptx index 6c1010d..d33bacd 100644 --- a/gcc/config/nvptx/t-nvptx +++ b/gcc/config/nvptx/t-nvptx @@ -1,3 +1,7 @@ +nvptx-c.o: $(srcdir)/config/nvptx/nvptx-c.c + $(COMPILE) $< + $(POSTCOMPILE) + CFLAGS-mkoffload.o += $(DRIVER_DEFINES) \ -DGCC_INSTALL_NAME=\"$(GCC_INSTALL_NAME)\" mkoffload.o: $(srcdir)/config/nvptx/mkoffload.c -- cgit v1.1 From 2ed356a4c9af0629d9d5fd30969e432de6302cb3 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Tue, 24 Aug 2021 10:01:47 -0500 Subject: rs6000: Add Power9 builtins 2021-08-24 Bill Schmidt gcc/ * config/rs6000/rs6000-builtin-new.def: Add power9-vector, power9, and power9-64 stanzas. 
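For example (an illustrative fragment, not one of the existing tests), the new [power9] stanza includes the DARN random-number builtins, which could be used along these lines when targeting -mcpu=power9:

/* Sketch only: __builtin_darn returns a conditioned 64-bit random
   number; a result of all ones conventionally signals a hardware
   failure, so fall back to the 32-bit form here.  */
long long
hw_random (void)
{
  long long r = __builtin_darn ();
  if (r == -1)
    r = __builtin_darn_32 ();
  return r;
}
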
--- gcc/config/rs6000/rs6000-builtin-new.def | 368 +++++++++++++++++++++++++++++++ 1 file changed, 368 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-builtin-new.def b/gcc/config/rs6000/rs6000-builtin-new.def index 2a2c913..0462797 100644 --- a/gcc/config/rs6000/rs6000-builtin-new.def +++ b/gcc/config/rs6000/rs6000-builtin-new.def @@ -2438,3 +2438,371 @@ const double __builtin_vsx_xscvspdpn (vf); XSCVSPDPN vsx_xscvspdpn {} + + +; Power9 vector builtins. +[power9-vector] + const vss __builtin_altivec_convert_4f32_8f16 (vf, vf); + CONVERT_4F32_8F16 convert_4f32_8f16 {} + + const vss __builtin_altivec_convert_4f32_8i16 (vf, vf); + CONVERT_4F32_8I16 convert_4f32_8i16 {} + + const signed int __builtin_altivec_first_match_index_v16qi (vsc, vsc); + VFIRSTMATCHINDEX_V16QI first_match_index_v16qi {} + + const signed int __builtin_altivec_first_match_index_v8hi (vss, vss); + VFIRSTMATCHINDEX_V8HI first_match_index_v8hi {} + + const signed int __builtin_altivec_first_match_index_v4si (vsi, vsi); + VFIRSTMATCHINDEX_V4SI first_match_index_v4si {} + + const signed int __builtin_altivec_first_match_or_eos_index_v16qi (vsc, vsc); + VFIRSTMATCHOREOSINDEX_V16QI first_match_or_eos_index_v16qi {} + + const signed int __builtin_altivec_first_match_or_eos_index_v8hi (vss, vss); + VFIRSTMATCHOREOSINDEX_V8HI first_match_or_eos_index_v8hi {} + + const signed int __builtin_altivec_first_match_or_eos_index_v4si (vsi, vsi); + VFIRSTMATCHOREOSINDEX_V4SI first_match_or_eos_index_v4si {} + + const signed int __builtin_altivec_first_mismatch_index_v16qi (vsc, vsc); + VFIRSTMISMATCHINDEX_V16QI first_mismatch_index_v16qi {} + + const signed int __builtin_altivec_first_mismatch_index_v8hi (vss, vss); + VFIRSTMISMATCHINDEX_V8HI first_mismatch_index_v8hi {} + + const signed int __builtin_altivec_first_mismatch_index_v4si (vsi, vsi); + VFIRSTMISMATCHINDEX_V4SI first_mismatch_index_v4si {} + + const signed int __builtin_altivec_first_mismatch_or_eos_index_v16qi (vsc, vsc); + VFIRSTMISMATCHOREOSINDEX_V16QI first_mismatch_or_eos_index_v16qi {} + + const signed int __builtin_altivec_first_mismatch_or_eos_index_v8hi (vss, vss); + VFIRSTMISMATCHOREOSINDEX_V8HI first_mismatch_or_eos_index_v8hi {} + + const signed int __builtin_altivec_first_mismatch_or_eos_index_v4si (vsi, vsi); + VFIRSTMISMATCHOREOSINDEX_V4SI first_mismatch_or_eos_index_v4si {} + + const vsc __builtin_altivec_vadub (vsc, vsc); + VADUB vaduv16qi3 {} + + const vss __builtin_altivec_vaduh (vss, vss); + VADUH vaduv8hi3 {} + + const vsi __builtin_altivec_vaduw (vsi, vsi); + VADUW vaduv4si3 {} + + const vsll __builtin_altivec_vbpermd (vsll, vsc); + VBPERMD altivec_vbpermd {} + + const signed int __builtin_altivec_vclzlsbb_v16qi (vsc); + VCLZLSBB_V16QI vclzlsbb_v16qi {} + + const signed int __builtin_altivec_vclzlsbb_v4si (vsi); + VCLZLSBB_V4SI vclzlsbb_v4si {} + + const signed int __builtin_altivec_vclzlsbb_v8hi (vss); + VCLZLSBB_V8HI vclzlsbb_v8hi {} + + const vsc __builtin_altivec_vctzb (vsc); + VCTZB ctzv16qi2 {} + + const vsll __builtin_altivec_vctzd (vsll); + VCTZD ctzv2di2 {} + + const vss __builtin_altivec_vctzh (vss); + VCTZH ctzv8hi2 {} + + const vsi __builtin_altivec_vctzw (vsi); + VCTZW ctzv4si2 {} + + const signed int __builtin_altivec_vctzlsbb_v16qi (vsc); + VCTZLSBB_V16QI vctzlsbb_v16qi {} + + const signed int __builtin_altivec_vctzlsbb_v4si (vsi); + VCTZLSBB_V4SI vctzlsbb_v4si {} + + const signed int __builtin_altivec_vctzlsbb_v8hi (vss); + VCTZLSBB_V8HI vctzlsbb_v8hi {} + + const signed int __builtin_altivec_vcmpaeb_p (vsc, 
vsc); + VCMPAEB_P vector_ae_v16qi_p {} + + const signed int __builtin_altivec_vcmpaed_p (vsll, vsll); + VCMPAED_P vector_ae_v2di_p {} + + const signed int __builtin_altivec_vcmpaedp_p (vd, vd); + VCMPAEDP_P vector_ae_v2df_p {} + + const signed int __builtin_altivec_vcmpaefp_p (vf, vf); + VCMPAEFP_P vector_ae_v4sf_p {} + + const signed int __builtin_altivec_vcmpaeh_p (vss, vss); + VCMPAEH_P vector_ae_v8hi_p {} + + const signed int __builtin_altivec_vcmpaew_p (vsi, vsi); + VCMPAEW_P vector_ae_v4si_p {} + + const vsc __builtin_altivec_vcmpneb (vsc, vsc); + VCMPNEB vcmpneb {} + + const signed int __builtin_altivec_vcmpneb_p (vsc, vsc); + VCMPNEB_P vector_ne_v16qi_p {} + + const signed int __builtin_altivec_vcmpned_p (vsll, vsll); + VCMPNED_P vector_ne_v2di_p {} + + const signed int __builtin_altivec_vcmpnedp_p (vd, vd); + VCMPNEDP_P vector_ne_v2df_p {} + + const signed int __builtin_altivec_vcmpnefp_p (vf, vf); + VCMPNEFP_P vector_ne_v4sf_p {} + + const vss __builtin_altivec_vcmpneh (vss, vss); + VCMPNEH vcmpneh {} + + const signed int __builtin_altivec_vcmpneh_p (vss, vss); + VCMPNEH_P vector_ne_v8hi_p {} + + const vsi __builtin_altivec_vcmpnew (vsi, vsi); + VCMPNEW vcmpnew {} + + const signed int __builtin_altivec_vcmpnew_p (vsi, vsi); + VCMPNEW_P vector_ne_v4si_p {} + + const vsc __builtin_altivec_vcmpnezb (vsc, vsc); + CMPNEZB vcmpnezb {} + + const signed int __builtin_altivec_vcmpnezb_p (signed int, vsc, vsc); + VCMPNEZB_P vector_nez_v16qi_p {pred} + + const vss __builtin_altivec_vcmpnezh (vss, vss); + CMPNEZH vcmpnezh {} + + const signed int __builtin_altivec_vcmpnezh_p (signed int, vss, vss); + VCMPNEZH_P vector_nez_v8hi_p {pred} + + const vsi __builtin_altivec_vcmpnezw (vsi, vsi); + CMPNEZW vcmpnezw {} + + const signed int __builtin_altivec_vcmpnezw_p (signed int, vsi, vsi); + VCMPNEZW_P vector_nez_v4si_p {pred} + + const signed int __builtin_altivec_vextublx (signed int, vsc); + VEXTUBLX vextublx {} + + const signed int __builtin_altivec_vextubrx (signed int, vsc); + VEXTUBRX vextubrx {} + + const signed int __builtin_altivec_vextuhlx (signed int, vss); + VEXTUHLX vextuhlx {} + + const signed int __builtin_altivec_vextuhrx (signed int, vss); + VEXTUHRX vextuhrx {} + + const signed int __builtin_altivec_vextuwlx (signed int, vsi); + VEXTUWLX vextuwlx {} + + const signed int __builtin_altivec_vextuwrx (signed int, vsi); + VEXTUWRX vextuwrx {} + + const vsq __builtin_altivec_vmsumudm (vsll, vsll, vsq); + VMSUMUDM altivec_vmsumudm {} + + const vsll __builtin_altivec_vprtybd (vsll); + VPRTYBD parityv2di2 {} + + const vsq __builtin_altivec_vprtybq (vsq); + VPRTYBQ parityv1ti2 {} + + const vsi __builtin_altivec_vprtybw (vsi); + VPRTYBW parityv4si2 {} + + const vsll __builtin_altivec_vrldmi (vsll, vsll, vsll); + VRLDMI altivec_vrldmi {} + + const vsll __builtin_altivec_vrldnm (vsll, vsll); + VRLDNM altivec_vrldnm {} + + const vsi __builtin_altivec_vrlwmi (vsi, vsi, vsi); + VRLWMI altivec_vrlwmi {} + + const vsi __builtin_altivec_vrlwnm (vsi, vsi); + VRLWNM altivec_vrlwnm {} + + const vsll __builtin_altivec_vsignextsb2d (vsc); + VSIGNEXTSB2D vsignextend_qi_v2di {} + + const vsi __builtin_altivec_vsignextsb2w (vsc); + VSIGNEXTSB2W vsignextend_qi_v4si {} + + const vsll __builtin_altivec_visgnextsh2d (vss); + VSIGNEXTSH2D vsignextend_hi_v2di {} + + const vsi __builtin_altivec_vsignextsh2w (vss); + VSIGNEXTSH2W vsignextend_hi_v4si {} + + const vsll __builtin_altivec_vsignextsw2d (vsi); + VSIGNEXTSW2D vsignextend_si_v2di {} + + const vsc __builtin_altivec_vslv (vsc, vsc); + VSLV vslv {} + + const 
vsc __builtin_altivec_vsrv (vsc, vsc); + VSRV vsrv {} + + const signed int __builtin_scalar_byte_in_range (signed int, signed int); + CMPRB cmprb {} + + const signed int __builtin_scalar_byte_in_either_range (signed int, signed int); + CMPRB2 cmprb2 {} + + const vsll __builtin_vsx_extract4b (vsc, const int[0,12]); + EXTRACT4B extract4b {} + + const vd __builtin_vsx_extract_exp_dp (vd); + VEEDP xvxexpdp {} + + const vf __builtin_vsx_extract_exp_sp (vf); + VEESP xvxexpsp {} + + const vd __builtin_vsx_extract_sig_dp (vd); + VESDP xvxsigdp {} + + const vf __builtin_vsx_extract_sig_sp (vf); + VESSP xvxsigsp {} + + const vsc __builtin_vsx_insert4b (vsi, vsc, const int[0,12]); + INSERT4B insert4b {} + + const vd __builtin_vsx_insert_exp_dp (vd, vd); + VIEDP xviexpdp {} + + const vf __builtin_vsx_insert_exp_sp (vf, vf); + VIESP xviexpsp {} + + const signed int __builtin_vsx_scalar_cmp_exp_dp_eq (double, double); + VSCEDPEQ xscmpexpdp_eq {} + + const signed int __builtin_vsx_scalar_cmp_exp_dp_gt (double, double); + VSCEDPGT xscmpexpdp_gt {} + + const signed int __builtin_vsx_scalar_cmp_exp_dp_lt (double, double); + VSCEDPLT xscmpexpdp_lt {} + + const signed int __builtin_vsx_scalar_cmp_exp_dp_unordered (double, double); + VSCEDPUO xscmpexpdp_unordered {} + + const signed int __builtin_vsx_scalar_test_data_class_dp (double, const int<7>); + VSTDCDP xststdcdp {} + + const signed int __builtin_vsx_scalar_test_data_class_sp (float, const int<7>); + VSTDCSP xststdcsp {} + + const signed int __builtin_vsx_scalar_test_neg_dp (double); + VSTDCNDP xststdcnegdp {} + + const signed int __builtin_vsx_scalar_test_neg_sp (float); + VSTDCNSP xststdcnegsp {} + + const vsll __builtin_vsx_test_data_class_dp (vd, const int<7>); + VTDCDP xvtstdcdp {} + + const vsi __builtin_vsx_test_data_class_sp (vf, const int<7>); + VTDCSP xvtstdcsp {} + + const vf __builtin_vsx_vextract_fp_from_shorth (vss); + VEXTRACT_FP_FROM_SHORTH vextract_fp_from_shorth {} + + const vf __builtin_vsx_vextract_fp_from_shortl (vss); + VEXTRACT_FP_FROM_SHORTL vextract_fp_from_shortl {} + + const vd __builtin_vsx_xxbrd_v2df (vd); + XXBRD_V2DF p9_xxbrd_v2df {} + + const vsll __builtin_vsx_xxbrd_v2di (vsll); + XXBRD_V2DI p9_xxbrd_v2di {} + + const vss __builtin_vsx_xxbrh_v8hi (vss); + XXBRH_V8HI p9_xxbrh_v8hi {} + + const vsc __builtin_vsx_xxbrq_v16qi (vsc); + XXBRQ_V16QI p9_xxbrq_v16qi {} + + const vsq __builtin_vsx_xxbrq_v1ti (vsq); + XXBRQ_V1TI p9_xxbrq_v1ti {} + + const vf __builtin_vsx_xxbrw_v4sf (vf); + XXBRW_V4SF p9_xxbrw_v4sf {} + + const vsi __builtin_vsx_xxbrw_v4si (vsi); + XXBRW_V4SI p9_xxbrw_v4si {} + + +; Miscellaneous P9 functions +[power9] + signed long long __builtin_darn (); + DARN darn {} + + signed int __builtin_darn_32 (); + DARN_32 darn_32 {} + + signed long long __builtin_darn_raw (); + DARN_RAW darn_raw {} + + double __builtin_mffsl (); + MFFSL rs6000_mffsl {} + + const signed int __builtin_dtstsfi_eq_dd (const int<6>, _Decimal64); + TSTSFI_EQ_DD dfptstsfi_eq_dd {} + + const signed int __builtin_dtstsfi_eq_td (const int<6>, _Decimal128); + TSTSFI_EQ_TD dfptstsfi_eq_td {} + + const signed int __builtin_dtstsfi_gt_dd (const int<6>, _Decimal64); + TSTSFI_GT_DD dfptstsfi_gt_dd {} + + const signed int __builtin_dtstsfi_gt_td (const int<6>, _Decimal128); + TSTSFI_GT_TD dfptstsfi_gt_td {} + + const signed int __builtin_dtstsfi_lt_dd (const int<6>, _Decimal64); + TSTSFI_LT_DD dfptstsfi_lt_dd {} + + const signed int __builtin_dtstsfi_lt_td (const int<6>, _Decimal128); + TSTSFI_LT_TD dfptstsfi_lt_td {} + + const signed int 
__builtin_dtstsfi_ov_dd (const int<6>, _Decimal64); + TSTSFI_OV_DD dfptstsfi_unordered_dd {} + + const signed int __builtin_dtstsfi_ov_td (const int<6>, _Decimal128); + TSTSFI_OV_TD dfptstsfi_unordered_td {} + + +[power9-64] + void __builtin_altivec_xst_len_r (vsc, void *, long); + XST_LEN_R xst_len_r {} + + void __builtin_altivec_stxvl (vsc, void *, long); + STXVL stxvl {} + + const signed int __builtin_scalar_byte_in_set (signed int, signed long long); + CMPEQB cmpeqb {} + + pure vsc __builtin_vsx_lxvl (const void *, signed long); + LXVL lxvl {} + + const signed long __builtin_vsx_scalar_extract_exp (double); + VSEEDP xsxexpdp {} + + const signed long __builtin_vsx_scalar_extract_sig (double); + VSESDP xsxsigdp {} + + const double __builtin_vsx_scalar_insert_exp (unsigned long long, unsigned long long); + VSIEDP xsiexpdp {} + + const double __builtin_vsx_scalar_insert_exp_dp (double, unsigned long long); + VSIEDPF xsiexpdpf {} + + pure vsc __builtin_vsx_xl_len_r (void *, signed long); + XL_LEN_R xl_len_r {} -- cgit v1.1 From 19b7bf620cd4e610bb91b5d2ae5446a2b73b6308 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Tue, 24 Aug 2021 11:50:09 -0500 Subject: rs6000: Add more type nodes to support builtin processing 2021-08-24 Bill Schmidt gcc/ * config/rs6000/rs6000-call.c (rs6000_init_builtins): Initialize various pointer type nodes. * config/rs6000/rs6000.h (rs6000_builtin_type_index): Add enum values for various pointer types. (ptr_V16QI_type_node): New macro. (ptr_V1TI_type_node): New macro. (ptr_V2DI_type_node): New macro. (ptr_V2DF_type_node): New macro. (ptr_V4SI_type_node): New macro. (ptr_V4SF_type_node): New macro. (ptr_V8HI_type_node): New macro. (ptr_unsigned_V16QI_type_node): New macro. (ptr_unsigned_V1TI_type_node): New macro. (ptr_unsigned_V8HI_type_node): New macro. (ptr_unsigned_V4SI_type_node): New macro. (ptr_unsigned_V2DI_type_node): New macro. (ptr_bool_V16QI_type_node): New macro. (ptr_bool_V8HI_type_node): New macro. (ptr_bool_V4SI_type_node): New macro. (ptr_bool_V2DI_type_node): New macro. (ptr_bool_V1TI_type_node): New macro. (ptr_pixel_type_node): New macro. (ptr_intQI_type_node): New macro. (ptr_uintQI_type_node): New macro. (ptr_intHI_type_node): New macro. (ptr_uintHI_type_node): New macro. (ptr_intSI_type_node): New macro. (ptr_uintSI_type_node): New macro. (ptr_intDI_type_node): New macro. (ptr_uintDI_type_node): New macro. (ptr_intTI_type_node): New macro. (ptr_uintTI_type_node): New macro. (ptr_long_integer_type_node): New macro. (ptr_long_unsigned_type_node): New macro. (ptr_float_type_node): New macro. (ptr_double_type_node): New macro. (ptr_long_double_type_node): New macro. (ptr_dfloat64_type_node): New macro. (ptr_dfloat128_type_node): New macro. (ptr_ieee128_type_node): New macro. (ptr_ibm128_type_node): New macro. (ptr_vector_pair_type_node): New macro. (ptr_vector_quad_type_node): New macro. (ptr_long_long_integer_type_node): New macro. (ptr_long_long_unsigned_type_node): New macro. 
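Every one of the new pointer type nodes is built with the same two-step idiom (const-qualify the base type, then take a pointer to it), visible throughout the diff below. A hypothetical helper capturing that idiom, purely for illustration and not part of the patch, would be:

/* Hypothetical helper: pointer to a const-qualified variant of TYPE,
   the construction repeated for each new ptr_*_type_node.  */
static tree
build_const_ptr_type (tree type)
{
  return build_pointer_type (build_qualified_type (type, TYPE_QUAL_CONST));
}
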
--- gcc/config/rs6000/rs6000-call.c | 148 +++++++++++++++++++++++++++++++++++++++- gcc/config/rs6000/rs6000.h | 82 ++++++++++++++++++++++ 2 files changed, 228 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c index 3c3108a..fd7f24d 100644 --- a/gcc/config/rs6000/rs6000-call.c +++ b/gcc/config/rs6000/rs6000-call.c @@ -13294,6 +13294,7 @@ rs6000_init_builtins (void) { tree tdecl; tree ftype; + tree t; machine_mode mode; if (TARGET_DEBUG_BUILTIN) @@ -13304,25 +13305,63 @@ rs6000_init_builtins (void) V2DI_type_node = rs6000_vector_type (TARGET_POWERPC64 ? "__vector long" : "__vector long long", long_long_integer_type_node, 2); + ptr_V2DI_type_node + = build_pointer_type (build_qualified_type (V2DI_type_node, + TYPE_QUAL_CONST)); + V2DF_type_node = rs6000_vector_type ("__vector double", double_type_node, 2); + ptr_V2DF_type_node + = build_pointer_type (build_qualified_type (V2DF_type_node, + TYPE_QUAL_CONST)); + V4SI_type_node = rs6000_vector_type ("__vector signed int", intSI_type_node, 4); + ptr_V4SI_type_node + = build_pointer_type (build_qualified_type (V4SI_type_node, + TYPE_QUAL_CONST)); + V4SF_type_node = rs6000_vector_type ("__vector float", float_type_node, 4); + ptr_V4SF_type_node + = build_pointer_type (build_qualified_type (V4SF_type_node, + TYPE_QUAL_CONST)); + V8HI_type_node = rs6000_vector_type ("__vector signed short", intHI_type_node, 8); + ptr_V8HI_type_node + = build_pointer_type (build_qualified_type (V8HI_type_node, + TYPE_QUAL_CONST)); + V16QI_type_node = rs6000_vector_type ("__vector signed char", intQI_type_node, 16); + ptr_V16QI_type_node + = build_pointer_type (build_qualified_type (V16QI_type_node, + TYPE_QUAL_CONST)); unsigned_V16QI_type_node = rs6000_vector_type ("__vector unsigned char", unsigned_intQI_type_node, 16); + ptr_unsigned_V16QI_type_node + = build_pointer_type (build_qualified_type (unsigned_V16QI_type_node, + TYPE_QUAL_CONST)); + unsigned_V8HI_type_node = rs6000_vector_type ("__vector unsigned short", unsigned_intHI_type_node, 8); + ptr_unsigned_V8HI_type_node + = build_pointer_type (build_qualified_type (unsigned_V8HI_type_node, + TYPE_QUAL_CONST)); + unsigned_V4SI_type_node = rs6000_vector_type ("__vector unsigned int", unsigned_intSI_type_node, 4); + ptr_unsigned_V4SI_type_node + = build_pointer_type (build_qualified_type (unsigned_V4SI_type_node, + TYPE_QUAL_CONST)); + unsigned_V2DI_type_node = rs6000_vector_type (TARGET_POWERPC64 ? "__vector unsigned long" : "__vector unsigned long long", long_long_unsigned_type_node, 2); + ptr_unsigned_V2DI_type_node + = build_pointer_type (build_qualified_type (unsigned_V2DI_type_node, + TYPE_QUAL_CONST)); opaque_V4SI_type_node = build_opaque_vector_type (intSI_type_node, 4); @@ -13336,9 +13375,15 @@ rs6000_init_builtins (void) { V1TI_type_node = rs6000_vector_type ("__vector __int128", intTI_type_node, 1); + ptr_V1TI_type_node + = build_pointer_type (build_qualified_type (V1TI_type_node, + TYPE_QUAL_CONST)); unsigned_V1TI_type_node = rs6000_vector_type ("__vector unsigned __int128", unsigned_intTI_type_node, 1); + ptr_unsigned_V1TI_type_node + = build_pointer_type (build_qualified_type (unsigned_V1TI_type_node, + TYPE_QUAL_CONST)); } /* The 'vector bool ...' types must be kept distinct from 'vector unsigned ...' 
@@ -13372,6 +13417,76 @@ rs6000_init_builtins (void) dfloat128_type_internal_node = dfloat128_type_node; void_type_internal_node = void_type_node; + ptr_intQI_type_node + = build_pointer_type (build_qualified_type (intQI_type_internal_node, + TYPE_QUAL_CONST)); + ptr_uintQI_type_node + = build_pointer_type (build_qualified_type (uintQI_type_internal_node, + TYPE_QUAL_CONST)); + ptr_intHI_type_node + = build_pointer_type (build_qualified_type (intHI_type_internal_node, + TYPE_QUAL_CONST)); + ptr_uintHI_type_node + = build_pointer_type (build_qualified_type (uintHI_type_internal_node, + TYPE_QUAL_CONST)); + ptr_intSI_type_node + = build_pointer_type (build_qualified_type (intSI_type_internal_node, + TYPE_QUAL_CONST)); + ptr_uintSI_type_node + = build_pointer_type (build_qualified_type (uintSI_type_internal_node, + TYPE_QUAL_CONST)); + ptr_intDI_type_node + = build_pointer_type (build_qualified_type (intDI_type_internal_node, + TYPE_QUAL_CONST)); + ptr_uintDI_type_node + = build_pointer_type (build_qualified_type (uintDI_type_internal_node, + TYPE_QUAL_CONST)); + ptr_intTI_type_node + = build_pointer_type (build_qualified_type (intTI_type_internal_node, + TYPE_QUAL_CONST)); + ptr_uintTI_type_node + = build_pointer_type (build_qualified_type (uintTI_type_internal_node, + TYPE_QUAL_CONST)); + + t = build_qualified_type (long_integer_type_internal_node, TYPE_QUAL_CONST); + ptr_long_integer_type_node = build_pointer_type (t); + + t = build_qualified_type (long_unsigned_type_internal_node, TYPE_QUAL_CONST); + ptr_long_unsigned_type_node = build_pointer_type (t); + + ptr_float_type_node + = build_pointer_type (build_qualified_type (float_type_internal_node, + TYPE_QUAL_CONST)); + ptr_double_type_node + = build_pointer_type (build_qualified_type (double_type_internal_node, + TYPE_QUAL_CONST)); + ptr_long_double_type_node + = build_pointer_type (build_qualified_type (long_double_type_internal_node, + TYPE_QUAL_CONST)); + if (dfloat64_type_node) + { + t = build_qualified_type (dfloat64_type_internal_node, TYPE_QUAL_CONST); + ptr_dfloat64_type_node = build_pointer_type (t); + } + else + ptr_dfloat64_type_node = NULL; + + if (dfloat128_type_node) + { + t = build_qualified_type (dfloat128_type_internal_node, TYPE_QUAL_CONST); + ptr_dfloat128_type_node = build_pointer_type (t); + } + else + ptr_dfloat128_type_node = NULL; + + t = build_qualified_type (long_long_integer_type_internal_node, + TYPE_QUAL_CONST); + ptr_long_long_integer_type_node = build_pointer_type (t); + + t = build_qualified_type (long_long_unsigned_type_internal_node, + TYPE_QUAL_CONST); + ptr_long_long_unsigned_type_node = build_pointer_type (t); + /* 128-bit floating point support. KFmode is IEEE 128-bit floating point. IFmode is the IBM extended 128-bit format that is a pair of doubles. 
TFmode will be either IEEE 128-bit floating point or the IBM double-double @@ -13399,7 +13514,8 @@ rs6000_init_builtins (void) SET_TYPE_MODE (ibm128_float_type_node, IFmode); layout_type (ibm128_float_type_node); } - + t = build_qualified_type (ibm128_float_type_node, TYPE_QUAL_CONST); + ptr_ibm128_float_type_node = build_pointer_type (t); lang_hooks.types.register_builtin_type (ibm128_float_type_node, "__ibm128"); @@ -13407,7 +13523,8 @@ rs6000_init_builtins (void) ieee128_float_type_node = long_double_type_node; else ieee128_float_type_node = float128_type_node; - + t = build_qualified_type (ieee128_float_type_node, TYPE_QUAL_CONST); + ptr_ieee128_float_type_node = build_pointer_type (t); lang_hooks.types.register_builtin_type (ieee128_float_type_node, "__ieee128"); } @@ -13427,6 +13544,8 @@ rs6000_init_builtins (void) TYPE_USER_ALIGN (vector_pair_type_node) = 0; lang_hooks.types.register_builtin_type (vector_pair_type_node, "__vector_pair"); + t = build_qualified_type (vector_pair_type_node, TYPE_QUAL_CONST); + ptr_vector_pair_type_node = build_pointer_type (t); vector_quad_type_node = make_node (OPAQUE_TYPE); SET_TYPE_MODE (vector_quad_type_node, XOmode); @@ -13437,6 +13556,8 @@ rs6000_init_builtins (void) TYPE_USER_ALIGN (vector_quad_type_node) = 0; lang_hooks.types.register_builtin_type (vector_quad_type_node, "__vector_quad"); + t = build_qualified_type (vector_quad_type_node, TYPE_QUAL_CONST); + ptr_vector_quad_type_node = build_pointer_type (t); } /* Initialize the modes for builtin_function_type, mapping a machine mode to @@ -13487,18 +13608,41 @@ rs6000_init_builtins (void) bool_V16QI_type_node = rs6000_vector_type ("__vector __bool char", bool_char_type_node, 16); + ptr_bool_V16QI_type_node + = build_pointer_type (build_qualified_type (bool_V16QI_type_node, + TYPE_QUAL_CONST)); + bool_V8HI_type_node = rs6000_vector_type ("__vector __bool short", bool_short_type_node, 8); + ptr_bool_V8HI_type_node + = build_pointer_type (build_qualified_type (bool_V8HI_type_node, + TYPE_QUAL_CONST)); + bool_V4SI_type_node = rs6000_vector_type ("__vector __bool int", bool_int_type_node, 4); + ptr_bool_V4SI_type_node + = build_pointer_type (build_qualified_type (bool_V4SI_type_node, + TYPE_QUAL_CONST)); + bool_V2DI_type_node = rs6000_vector_type (TARGET_POWERPC64 ? "__vector __bool long" : "__vector __bool long long", bool_long_long_type_node, 2); + ptr_bool_V2DI_type_node + = build_pointer_type (build_qualified_type (bool_V2DI_type_node, + TYPE_QUAL_CONST)); + bool_V1TI_type_node = rs6000_vector_type ("__vector __bool __int128", intTI_type_node, 1); + ptr_bool_V1TI_type_node + = build_pointer_type (build_qualified_type (bool_V1TI_type_node, + TYPE_QUAL_CONST)); + pixel_V8HI_type_node = rs6000_vector_type ("__vector __pixel", pixel_type_node, 8); + ptr_pixel_V8HI_type_node + = build_pointer_type (build_qualified_type (pixel_V8HI_type_node, + TYPE_QUAL_CONST)); pcvoid_type_node = build_pointer_type (build_qualified_type (void_type_node, TYPE_QUAL_CONST)); diff --git a/gcc/config/rs6000/rs6000.h b/gcc/config/rs6000/rs6000.h index c5d20d2..3eba1c0 100644 --- a/gcc/config/rs6000/rs6000.h +++ b/gcc/config/rs6000/rs6000.h @@ -2461,6 +2461,47 @@ enum rs6000_builtin_type_index RS6000_BTI_vector_pair, /* unsigned 256-bit types (vector pair). */ RS6000_BTI_vector_quad, /* unsigned 512-bit types (vector quad). 
*/ RS6000_BTI_const_ptr_void, /* const pointer to void */ + RS6000_BTI_ptr_V16QI, + RS6000_BTI_ptr_V1TI, + RS6000_BTI_ptr_V2DI, + RS6000_BTI_ptr_V2DF, + RS6000_BTI_ptr_V4SI, + RS6000_BTI_ptr_V4SF, + RS6000_BTI_ptr_V8HI, + RS6000_BTI_ptr_unsigned_V16QI, + RS6000_BTI_ptr_unsigned_V1TI, + RS6000_BTI_ptr_unsigned_V8HI, + RS6000_BTI_ptr_unsigned_V4SI, + RS6000_BTI_ptr_unsigned_V2DI, + RS6000_BTI_ptr_bool_V16QI, + RS6000_BTI_ptr_bool_V8HI, + RS6000_BTI_ptr_bool_V4SI, + RS6000_BTI_ptr_bool_V2DI, + RS6000_BTI_ptr_bool_V1TI, + RS6000_BTI_ptr_pixel_V8HI, + RS6000_BTI_ptr_INTQI, + RS6000_BTI_ptr_UINTQI, + RS6000_BTI_ptr_INTHI, + RS6000_BTI_ptr_UINTHI, + RS6000_BTI_ptr_INTSI, + RS6000_BTI_ptr_UINTSI, + RS6000_BTI_ptr_INTDI, + RS6000_BTI_ptr_UINTDI, + RS6000_BTI_ptr_INTTI, + RS6000_BTI_ptr_UINTTI, + RS6000_BTI_ptr_long_integer, + RS6000_BTI_ptr_long_unsigned, + RS6000_BTI_ptr_float, + RS6000_BTI_ptr_double, + RS6000_BTI_ptr_long_double, + RS6000_BTI_ptr_dfloat64, + RS6000_BTI_ptr_dfloat128, + RS6000_BTI_ptr_ieee128_float, + RS6000_BTI_ptr_ibm128_float, + RS6000_BTI_ptr_vector_pair, + RS6000_BTI_ptr_vector_quad, + RS6000_BTI_ptr_long_long, + RS6000_BTI_ptr_long_long_unsigned, RS6000_BTI_MAX }; @@ -2517,6 +2558,47 @@ enum rs6000_builtin_type_index #define vector_pair_type_node (rs6000_builtin_types[RS6000_BTI_vector_pair]) #define vector_quad_type_node (rs6000_builtin_types[RS6000_BTI_vector_quad]) #define pcvoid_type_node (rs6000_builtin_types[RS6000_BTI_const_ptr_void]) +#define ptr_V16QI_type_node (rs6000_builtin_types[RS6000_BTI_ptr_V16QI]) +#define ptr_V1TI_type_node (rs6000_builtin_types[RS6000_BTI_ptr_V1TI]) +#define ptr_V2DI_type_node (rs6000_builtin_types[RS6000_BTI_ptr_V2DI]) +#define ptr_V2DF_type_node (rs6000_builtin_types[RS6000_BTI_ptr_V2DF]) +#define ptr_V4SI_type_node (rs6000_builtin_types[RS6000_BTI_ptr_V4SI]) +#define ptr_V4SF_type_node (rs6000_builtin_types[RS6000_BTI_ptr_V4SF]) +#define ptr_V8HI_type_node (rs6000_builtin_types[RS6000_BTI_ptr_V8HI]) +#define ptr_unsigned_V16QI_type_node (rs6000_builtin_types[RS6000_BTI_ptr_unsigned_V16QI]) +#define ptr_unsigned_V1TI_type_node (rs6000_builtin_types[RS6000_BTI_ptr_unsigned_V1TI]) +#define ptr_unsigned_V8HI_type_node (rs6000_builtin_types[RS6000_BTI_ptr_unsigned_V8HI]) +#define ptr_unsigned_V4SI_type_node (rs6000_builtin_types[RS6000_BTI_ptr_unsigned_V4SI]) +#define ptr_unsigned_V2DI_type_node (rs6000_builtin_types[RS6000_BTI_ptr_unsigned_V2DI]) +#define ptr_bool_V16QI_type_node (rs6000_builtin_types[RS6000_BTI_ptr_bool_V16QI]) +#define ptr_bool_V8HI_type_node (rs6000_builtin_types[RS6000_BTI_ptr_bool_V8HI]) +#define ptr_bool_V4SI_type_node (rs6000_builtin_types[RS6000_BTI_ptr_bool_V4SI]) +#define ptr_bool_V2DI_type_node (rs6000_builtin_types[RS6000_BTI_ptr_bool_V2DI]) +#define ptr_bool_V1TI_type_node (rs6000_builtin_types[RS6000_BTI_ptr_bool_V1TI]) +#define ptr_pixel_V8HI_type_node (rs6000_builtin_types[RS6000_BTI_ptr_pixel_V8HI]) +#define ptr_intQI_type_node (rs6000_builtin_types[RS6000_BTI_ptr_INTQI]) +#define ptr_uintQI_type_node (rs6000_builtin_types[RS6000_BTI_ptr_UINTQI]) +#define ptr_intHI_type_node (rs6000_builtin_types[RS6000_BTI_ptr_INTHI]) +#define ptr_uintHI_type_node (rs6000_builtin_types[RS6000_BTI_ptr_UINTHI]) +#define ptr_intSI_type_node (rs6000_builtin_types[RS6000_BTI_ptr_INTSI]) +#define ptr_uintSI_type_node (rs6000_builtin_types[RS6000_BTI_ptr_UINTSI]) +#define ptr_intDI_type_node (rs6000_builtin_types[RS6000_BTI_ptr_INTDI]) +#define ptr_uintDI_type_node (rs6000_builtin_types[RS6000_BTI_ptr_UINTDI]) +#define 
ptr_intTI_type_node (rs6000_builtin_types[RS6000_BTI_ptr_INTTI]) +#define ptr_uintTI_type_node (rs6000_builtin_types[RS6000_BTI_ptr_UINTTI]) +#define ptr_long_integer_type_node (rs6000_builtin_types[RS6000_BTI_ptr_long_integer]) +#define ptr_long_unsigned_type_node (rs6000_builtin_types[RS6000_BTI_ptr_long_unsigned]) +#define ptr_float_type_node (rs6000_builtin_types[RS6000_BTI_ptr_float]) +#define ptr_double_type_node (rs6000_builtin_types[RS6000_BTI_ptr_double]) +#define ptr_long_double_type_node (rs6000_builtin_types[RS6000_BTI_ptr_long_double]) +#define ptr_dfloat64_type_node (rs6000_builtin_types[RS6000_BTI_ptr_dfloat64]) +#define ptr_dfloat128_type_node (rs6000_builtin_types[RS6000_BTI_ptr_dfloat128]) +#define ptr_ieee128_float_type_node (rs6000_builtin_types[RS6000_BTI_ptr_ieee128_float]) +#define ptr_ibm128_float_type_node (rs6000_builtin_types[RS6000_BTI_ptr_ibm128_float]) +#define ptr_vector_pair_type_node (rs6000_builtin_types[RS6000_BTI_ptr_vector_pair]) +#define ptr_vector_quad_type_node (rs6000_builtin_types[RS6000_BTI_ptr_vector_quad]) +#define ptr_long_long_integer_type_node (rs6000_builtin_types[RS6000_BTI_ptr_long_long]) +#define ptr_long_long_unsigned_type_node (rs6000_builtin_types[RS6000_BTI_ptr_long_long_unsigned]) extern GTY(()) tree rs6000_builtin_types[RS6000_BTI_MAX]; extern GTY(()) tree rs6000_builtin_decls[RS6000_BUILTIN_COUNT]; -- cgit v1.1 From 50cb8300d3bf0b487063784fbbe394301b6c79b2 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Wed, 28 Jul 2021 13:22:57 -0400 Subject: rs6000: Add Power10 builtins 2021-07-28 Bill Schmidt gcc/ * config/rs6000/rs6000-builtin-new.def: Add power10 and power10-64 stanzas. --- gcc/config/rs6000/rs6000-builtin-new.def | 523 +++++++++++++++++++++++++++++++ 1 file changed, 523 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-builtin-new.def b/gcc/config/rs6000/rs6000-builtin-new.def index 0462797..b6fc994 100644 --- a/gcc/config/rs6000/rs6000-builtin-new.def +++ b/gcc/config/rs6000/rs6000-builtin-new.def @@ -2806,3 +2806,526 @@ pure vsc __builtin_vsx_xl_len_r (void *, signed long); XL_LEN_R xl_len_r {} + + +[power10] + const vbq __builtin_altivec_cmpge_1ti (vsq, vsq); + CMPGE_1TI vector_nltv1ti {} + + const vbq __builtin_altivec_cmpge_u1ti (vuq, vuq); + CMPGE_U1TI vector_nltuv1ti {} + + const vbq __builtin_altivec_cmple_1ti (vsq, vsq); + CMPLE_1TI vector_ngtv1ti {} + + const vbq __builtin_altivec_cmple_u1ti (vuq, vuq); + CMPLE_U1TI vector_ngtuv1ti {} + + const unsigned long long __builtin_altivec_cntmbb (vuc, const int<1>); + VCNTMBB vec_cntmb_v16qi {} + + const unsigned long long __builtin_altivec_cntmbd (vull, const int<1>); + VCNTMBD vec_cntmb_v2di {} + + const unsigned long long __builtin_altivec_cntmbh (vus, const int<1>); + VCNTMBH vec_cntmb_v8hi {} + + const unsigned long long __builtin_altivec_cntmbw (vui, const int<1>); + VCNTMBW vec_cntmb_v4si {} + + const vsq __builtin_altivec_div_v1ti (vsq, vsq); + DIV_V1TI vsx_div_v1ti {} + + const vsq __builtin_altivec_dives (vsq, vsq); + DIVES_V1TI vsx_dives_v1ti {} + + const vuq __builtin_altivec_diveu (vuq, vuq); + DIVEU_V1TI vsx_diveu_v1ti {} + + const vsq __builtin_altivec_mods (vsq, vsq); + MODS_V1TI vsx_mods_v1ti {} + + const vuq __builtin_altivec_modu (vuq, vuq); + MODU_V1TI vsx_modu_v1ti {} + + const vuc __builtin_altivec_mtvsrbm (unsigned long long); + MTVSRBM vec_mtvsr_v16qi {} + + const vull __builtin_altivec_mtvsrdm (unsigned long long); + MTVSRDM vec_mtvsr_v2di {} + + const vus __builtin_altivec_mtvsrhm (unsigned long long); + MTVSRHM 
vec_mtvsr_v8hi {} + + const vuq __builtin_altivec_mtvsrqm (unsigned long long); + MTVSRQM vec_mtvsr_v1ti {} + + const vui __builtin_altivec_mtvsrwm (unsigned long long); + MTVSRWM vec_mtvsr_v4si {} + + pure signed __int128 __builtin_altivec_se_lxvrbx (signed long, const signed char *); + SE_LXVRBX vsx_lxvrbx {lxvrse} + + pure signed __int128 __builtin_altivec_se_lxvrhx (signed long, const signed short *); + SE_LXVRHX vsx_lxvrhx {lxvrse} + + pure signed __int128 __builtin_altivec_se_lxvrwx (signed long, const signed int *); + SE_LXVRWX vsx_lxvrwx {lxvrse} + + pure signed __int128 __builtin_altivec_se_lxvrdx (signed long, const signed long long *); + SE_LXVRDX vsx_lxvrdx {lxvrse} + + void __builtin_altivec_tr_stxvrbx (vsq, signed long, signed char *); + TR_STXVRBX vsx_stxvrbx {stvec} + + void __builtin_altivec_tr_stxvrhx (vsq, signed long, signed int *); + TR_STXVRHX vsx_stxvrhx {stvec} + + void __builtin_altivec_tr_stxvrwx (vsq, signed long, signed short *); + TR_STXVRWX vsx_stxvrwx {stvec} + + void __builtin_altivec_tr_stxvrdx (vsq, signed long, signed long long *); + TR_STXVRDX vsx_stxvrdx {stvec} + + const vuq __builtin_altivec_udiv_v1ti (vuq, vuq); + UDIV_V1TI vsx_udiv_v1ti {} + + const vull __builtin_altivec_vcfuged (vull, vull); + VCFUGED vcfuged {} + + const vsc __builtin_altivec_vclrlb (vsc, signed int); + VCLRLB vclrlb {} + + const vsc __builtin_altivec_vclrrb (vsc, signed int); + VCLRRB vclrrb {} + + const signed int __builtin_altivec_vcmpaet_p (vsq, vsq); + VCMPAET_P vector_ae_v1ti_p {} + + const vbq __builtin_altivec_vcmpequt (vsq, vsq); + VCMPEQUT vector_eqv1ti {} + + const signed int __builtin_altivec_vcmpequt_p (signed int, vsq, vsq); + VCMPEQUT_P vector_eq_v1ti_p {pred} + + const vbq __builtin_altivec_vcmpgtst (vsq, vsq); + VCMPGTST vector_gtv1ti {} + + const signed int __builtin_altivec_vcmpgtst_p (signed int, vsq, vsq); + VCMPGTST_P vector_gt_v1ti_p {pred} + + const vbq __builtin_altivec_vcmpgtut (vuq, vuq); + VCMPGTUT vector_gtuv1ti {} + + const signed int __builtin_altivec_vcmpgtut_p (signed int, vuq, vuq); + VCMPGTUT_P vector_gtu_v1ti_p {pred} + + const vbq __builtin_altivec_vcmpnet (vsq, vsq); + VCMPNET vcmpnet {} + + const signed int __builtin_altivec_vcmpnet_p (vsq, vsq); + VCMPNET_P vector_ne_v1ti_p {} + + const vull __builtin_altivec_vclzdm (vull, vull); + VCLZDM vclzdm {} + + const vull __builtin_altivec_vctzdm (vull, vull); + VCTZDM vctzdm {} + + const vsll __builtin_altivec_vdivesd (vsll, vsll); + VDIVESD dives_v2di {} + + const vsi __builtin_altivec_vdivesw (vsi, vsi); + VDIVESW dives_v4si {} + + const vull __builtin_altivec_vdiveud (vull, vull); + VDIVEUD diveu_v2di {} + + const vui __builtin_altivec_vdiveuw (vui, vui); + VDIVEUW diveu_v4si {} + + const vsll __builtin_altivec_vdivsd (vsll, vsll); + VDIVSD divv2di3 {} + + const vsi __builtin_altivec_vdivsw (vsi, vsi); + VDIVSW divv4si3 {} + + const vull __builtin_altivec_vdivud (vull, vull); + VDIVUD udivv2di3 {} + + const vui __builtin_altivec_vdivuw (vui, vui); + VDIVUW udivv4si3 {} + + const vuc __builtin_altivec_vexpandmb (vuc); + VEXPANDMB vec_expand_v16qi {} + + const vull __builtin_altivec_vexpandmd (vull); + VEXPANDMD vec_expand_v2di {} + + const vus __builtin_altivec_vexpandmh (vus); + VEXPANDMH vec_expand_v8hi {} + + const vuq __builtin_altivec_vexpandmq (vuq); + VEXPANDMQ vec_expand_v1ti {} + + const vui __builtin_altivec_vexpandmw (vui); + VEXPANDMW vec_expand_v4si {} + + const vull __builtin_altivec_vextddvhx (vull, vull, unsigned int); + VEXTRACTDR vextractrv2di {} + + const vull 
__builtin_altivec_vextddvlx (vull, vull, unsigned int); + VEXTRACTDL vextractlv2di {} + + const vull __builtin_altivec_vextdubvhx (vuc, vuc, unsigned int); + VEXTRACTBR vextractrv16qi {} + + const vull __builtin_altivec_vextdubvlx (vuc, vuc, unsigned int); + VEXTRACTBL vextractlv16qi {} + + const vull __builtin_altivec_vextduhvhx (vus, vus, unsigned int); + VEXTRACTHR vextractrv8hi {} + + const vull __builtin_altivec_vextduhvlx (vus, vus, unsigned int); + VEXTRACTHL vextractlv8hi {} + + const vull __builtin_altivec_vextduwvhx (vui, vui, unsigned int); + VEXTRACTWR vextractrv4si {} + + const vull __builtin_altivec_vextduwvlx (vui, vui, unsigned int); + VEXTRACTWL vextractlv4si {} + + const signed int __builtin_altivec_vextractmb (vsc); + VEXTRACTMB vec_extract_v16qi {} + + const signed int __builtin_altivec_vextractmd (vsll); + VEXTRACTMD vec_extract_v2di {} + + const signed int __builtin_altivec_vextractmh (vss); + VEXTRACTMH vec_extract_v8hi {} + + const signed int __builtin_altivec_vextractmq (vsq); + VEXTRACTMQ vec_extract_v1ti {} + + const signed int __builtin_altivec_vextractmw (vsi); + VEXTRACTMW vec_extract_v4si {} + + const unsigned long long __builtin_altivec_vgnb (vull, const int <2,7>); + VGNB vgnb {} + + const vuc __builtin_altivec_vinsgubvlx (unsigned int, vuc, unsigned int); + VINSERTGPRBL vinsertgl_v16qi {} + + const vsc __builtin_altivec_vinsgubvrx (signed int, vsc, signed int); + VINSERTGPRBR vinsertgr_v16qi {} + + const vull __builtin_altivec_vinsgudvlx (unsigned int, vull, unsigned int); + VINSERTGPRDL vinsertgl_v2di {} + + const vsll __builtin_altivec_vinsgudvrx (signed int, vsll, signed int); + VINSERTGPRDR vinsertgr_v2di {} + + const vus __builtin_altivec_vinsguhvlx (unsigned int, vus, unsigned int); + VINSERTGPRHL vinsertgl_v8hi {} + + const vss __builtin_altivec_vinsguhvrx (signed int, vss, signed int); + VINSERTGPRHR vinsertgr_v8hi {} + + const vui __builtin_altivec_vinsguwvlx (unsigned int, vui, unsigned int); + VINSERTGPRWL vinsertgl_v4si {} + + const vsi __builtin_altivec_vinsguwvrx (signed int, vsi, signed int); + VINSERTGPRWR vinsertgr_v4si {} + + const vuc __builtin_altivec_vinsvubvlx (vuc, vuc, unsigned int); + VINSERTVPRBL vinsertvl_v16qi {} + + const vsc __builtin_altivec_vinsvubvrx (vsc, vsc, signed int); + VINSERTVPRBR vinsertvr_v16qi {} + + const vus __builtin_altivec_vinsvuhvlx (vus, vus, unsigned int); + VINSERTVPRHL vinsertvl_v8hi {} + + const vss __builtin_altivec_vinsvuhvrx (vss, vss, signed int); + VINSERTVPRHR vinsertvr_v8hi {} + + const vui __builtin_altivec_vinsvuwvlx (vui, vui, unsigned int); + VINSERTVPRWL vinsertvl_v4si {} + + const vsi __builtin_altivec_vinsvuwvrx (vsi, vsi, signed int); + VINSERTVPRWR vinsertvr_v4si {} + + const vsll __builtin_altivec_vmodsd (vsll, vsll); + VMODSD modv2di3 {} + + const vsi __builtin_altivec_vmodsw (vsi, vsi); + VMODSW modv4si3 {} + + const vull __builtin_altivec_vmodud (vull, vull); + VMODUD umodv2di3 {} + + const vui __builtin_altivec_vmoduw (vui, vui); + VMODUW umodv4si3 {} + + const vsq __builtin_altivec_vmulesd (vsll, vsll); + VMULESD vec_widen_smult_even_v2di {} + + const vuq __builtin_altivec_vmuleud (vull, vull); + VMULEUD vec_widen_umult_even_v2di {} + + const vsll __builtin_altivec_vmulhsd (vsll, vsll); + VMULHSD smulv2di3_highpart {} + + const vsi __builtin_altivec_vmulhsw (vsi, vsi); + VMULHSW smulv4si3_highpart {} + + const vull __builtin_altivec_vmulhud (vull, vull); + VMULHUD umulv2di3_highpart {} + + const vui __builtin_altivec_vmulhuw (vui, vui); + VMULHUW umulv4si3_highpart {} + + const 
vsll __builtin_altivec_vmulld (vsll, vsll); + VMULLD mulv2di3 {} + + const vsq __builtin_altivec_vmulosd (vsll, vsll); + VMULOSD vec_widen_smult_odd_v2di {} + + const vuq __builtin_altivec_vmuloud (vull, vull); + VMULOUD vec_widen_umult_odd_v2di {} + + const vsq __builtin_altivec_vnor_v1ti (vsq, vsq); + VNOR_V1TI norv1ti3 {} + + const vuq __builtin_altivec_vnor_v1ti_uns (vuq, vuq); + VNOR_V1TI_UNS norv1ti3 {} + + const vull __builtin_altivec_vpdepd (vull, vull); + VPDEPD vpdepd {} + + const vull __builtin_altivec_vpextd (vull, vull); + VPEXTD vpextd {} + + const vull __builtin_altivec_vreplace_un_uv2di (vull, unsigned long long, const int<4>); + VREPLACE_UN_UV2DI vreplace_un_v2di {} + + const vui __builtin_altivec_vreplace_un_uv4si (vui, unsigned int, const int<4>); + VREPLACE_UN_UV4SI vreplace_un_v4si {} + + const vd __builtin_altivec_vreplace_un_v2df (vd, double, const int<4>); + VREPLACE_UN_V2DF vreplace_un_v2df {} + + const vsll __builtin_altivec_vreplace_un_v2di (vsll, signed long long, const int<4>); + VREPLACE_UN_V2DI vreplace_un_v2di {} + + const vf __builtin_altivec_vreplace_un_v4sf (vf, float, const int<4>); + VREPLACE_UN_V4SF vreplace_un_v4sf {} + + const vsi __builtin_altivec_vreplace_un_v4si (vsi, signed int, const int<4>); + VREPLACE_UN_V4SI vreplace_un_v4si {} + + const vull __builtin_altivec_vreplace_uv2di (vull, unsigned long long, const int<1>); + VREPLACE_ELT_UV2DI vreplace_elt_v2di {} + + const vui __builtin_altivec_vreplace_uv4si (vui, unsigned int, const int<2>); + VREPLACE_ELT_UV4SI vreplace_elt_v4si {} + + const vd __builtin_altivec_vreplace_v2df (vd, double, const int<1>); + VREPLACE_ELT_V2DF vreplace_elt_v2df {} + + const vsll __builtin_altivec_vreplace_v2di (vsll, signed long long, const int<1>); + VREPLACE_ELT_V2DI vreplace_elt_v2di {} + + const vf __builtin_altivec_vreplace_v4sf (vf, float, const int<2>); + VREPLACE_ELT_V4SF vreplace_elt_v4sf {} + + const vsi __builtin_altivec_vreplace_v4si (vsi, signed int, const int<2>); + VREPLACE_ELT_V4SI vreplace_elt_v4si {} + + const vsq __builtin_altivec_vrlq (vsq, vuq); + VRLQ vrotlv1ti3 {} + + const vsq __builtin_altivec_vrlqmi (vsq, vsq, vuq); + VRLQMI altivec_vrlqmi {} + + const vsq __builtin_altivec_vrlqnm (vsq, vuq); + VRLQNM altivec_vrlqnm {} + + const vsq __builtin_altivec_vsignext (vsll); + VSIGNEXTSD2Q vsignextend_v2di_v1ti {} + + const vsc __builtin_altivec_vsldb_v16qi (vsc, vsc, const int<3>); + VSLDB_V16QI vsldb_v16qi {} + + const vsll __builtin_altivec_vsldb_v2di (vsll, vsll, const int<3>); + VSLDB_V2DI vsldb_v2di {} + + const vsi __builtin_altivec_vsldb_v4si (vsi, vsi, const int<3>); + VSLDB_V4SI vsldb_v4si {} + + const vss __builtin_altivec_vsldb_v8hi (vss, vss, const int<3>); + VSLDB_V8HI vsldb_v8hi {} + + const vsq __builtin_altivec_vslq (vsq, vuq); + VSLQ vashlv1ti3 {} + + const vsq __builtin_altivec_vsraq (vsq, vuq); + VSRAQ vashrv1ti3 {} + + const vsc __builtin_altivec_vsrdb_v16qi (vsc, vsc, const int<3>); + VSRDB_V16QI vsrdb_v16qi {} + + const vsll __builtin_altivec_vsrdb_v2di (vsll, vsll, const int<3>); + VSRDB_V2DI vsrdb_v2di {} + + const vsi __builtin_altivec_vsrdb_v4si (vsi, vsi, const int<3>); + VSRDB_V4SI vsrdb_v4si {} + + const vss __builtin_altivec_vsrdb_v8hi (vss, vss, const int<3>); + VSRDB_V8HI vsrdb_v8hi {} + + const vsq __builtin_altivec_vsrq (vsq, vuq); + VSRQ vlshrv1ti3 {} + + const vsc __builtin_altivec_vstribl (vsc); + VSTRIBL vstril_v16qi {} + + const signed int __builtin_altivec_vstribl_p (vsc); + VSTRIBL_P vstril_p_v16qi {} + + const vsc __builtin_altivec_vstribr (vsc); + VSTRIBR 
vstrir_v16qi {} + + const signed int __builtin_altivec_vstribr_p (vsc); + VSTRIBR_P vstrir_p_v16qi {} + + const vss __builtin_altivec_vstrihl (vss); + VSTRIHL vstril_v8hi {} + + const signed int __builtin_altivec_vstrihl_p (vss); + VSTRIHL_P vstril_p_v8hi {} + + const vss __builtin_altivec_vstrihr (vss); + VSTRIHR vstrir_v8hi {} + + const signed int __builtin_altivec_vstrihr_p (vss); + VSTRIHR_P vstrir_p_v8hi {} + + const signed int __builtin_vsx_xvtlsbb_all_ones (vsc); + XVTLSBB_ONES xvtlsbbo {} + + const signed int __builtin_vsx_xvtlsbb_all_zeros (vsc); + XVTLSBB_ZEROS xvtlsbbz {} + + const vf __builtin_vsx_vxxsplti32dx_v4sf (vf, const int<1>, float); + VXXSPLTI32DX_V4SF xxsplti32dx_v4sf {} + + const vsi __builtin_vsx_vxxsplti32dx_v4si (vsi, const int<1>, signed int); + VXXSPLTI32DX_V4SI xxsplti32dx_v4si {} + + const vd __builtin_vsx_vxxspltidp (float); + VXXSPLTIDP xxspltidp_v2df {} + + const vf __builtin_vsx_vxxspltiw_v4sf (float); + VXXSPLTIW_V4SF xxspltiw_v4sf {} + + const vsi __builtin_vsx_vxxspltiw_v4si (signed int); + VXXSPLTIW_V4SI xxspltiw_v4si {} + + const vuc __builtin_vsx_xvcvbf16spn (vuc); + XVCVBF16SPN vsx_xvcvbf16spn {} + + const vuc __builtin_vsx_xvcvspbf16 (vuc); + XVCVSPBF16 vsx_xvcvspbf16 {} + + const vuc __builtin_vsx_xxblend_v16qi (vuc, vuc, vuc); + VXXBLEND_V16QI xxblend_v16qi {} + + const vd __builtin_vsx_xxblend_v2df (vd, vd, vd); + VXXBLEND_V2DF xxblend_v2df {} + + const vull __builtin_vsx_xxblend_v2di (vull, vull, vull); + VXXBLEND_V2DI xxblend_v2di {} + + const vf __builtin_vsx_xxblend_v4sf (vf, vf, vf); + VXXBLEND_V4SF xxblend_v4sf {} + + const vui __builtin_vsx_xxblend_v4si (vui, vui, vui); + VXXBLEND_V4SI xxblend_v4si {} + + const vus __builtin_vsx_xxblend_v8hi (vus, vus, vus); + VXXBLEND_V8HI xxblend_v8hi {} + + const vull __builtin_vsx_xxeval (vull, vull, vull, const int <8>); + XXEVAL xxeval {} + + const vuc __builtin_vsx_xxgenpcvm_v16qi (vuc, const int <2>); + XXGENPCVM_V16QI xxgenpcvm_v16qi {} + + const vull __builtin_vsx_xxgenpcvm_v2di (vull, const int <2>); + XXGENPCVM_V2DI xxgenpcvm_v2di {} + + const vui __builtin_vsx_xxgenpcvm_v4si (vui, const int <2>); + XXGENPCVM_V4SI xxgenpcvm_v4si {} + + const vus __builtin_vsx_xxgenpcvm_v8hi (vus, const int <2>); + XXGENPCVM_V8HI xxgenpcvm_v8hi {} + + const vuc __builtin_vsx_xxpermx_uv16qi (vuc, vuc, vuc, const int<3>); + XXPERMX_UV16QI xxpermx {} + + const vull __builtin_vsx_xxpermx_uv2di (vull, vull, vuc, const int<3>); + XXPERMX_UV2DI xxpermx {} + + const vui __builtin_vsx_xxpermx_uv4si (vui, vui, vuc, const int<3>); + XXPERMX_UV4SI xxpermx {} + + const vus __builtin_vsx_xxpermx_uv8hi (vus, vus, vuc, const int<3>); + XXPERMX_UV8HI xxpermx {} + + const vsc __builtin_vsx_xxpermx_v16qi (vsc, vsc, vuc, const int<3>); + XXPERMX_V16QI xxpermx {} + + const vd __builtin_vsx_xxpermx_v2df (vd, vd, vuc, const int<3>); + XXPERMX_V2DF xxpermx {} + + const vsll __builtin_vsx_xxpermx_v2di (vsll, vsll, vuc, const int<3>); + XXPERMX_V2DI xxpermx {} + + const vf __builtin_vsx_xxpermx_v4sf (vf, vf, vuc, const int<3>); + XXPERMX_V4SF xxpermx {} + + const vsi __builtin_vsx_xxpermx_v4si (vsi, vsi, vuc, const int<3>); + XXPERMX_V4SI xxpermx {} + + const vss __builtin_vsx_xxpermx_v8hi (vss, vss, vuc, const int<3>); + XXPERMX_V8HI xxpermx {} + + pure unsigned __int128 __builtin_altivec_ze_lxvrbx (signed long, const unsigned char *); + ZE_LXVRBX vsx_lxvrbx {lxvrze} + + pure unsigned __int128 __builtin_altivec_ze_lxvrhx (signed long, const unsigned short *); + ZE_LXVRHX vsx_lxvrhx {lxvrze} + + pure unsigned __int128 
__builtin_altivec_ze_lxvrwx (signed long, const unsigned int *); + ZE_LXVRWX vsx_lxvrwx {lxvrze} + + pure unsigned __int128 __builtin_altivec_ze_lxvrdx (signed long, const unsigned long long *); + ZE_LXVRDX vsx_lxvrdx {lxvrze} + + +[power10-64] + const unsigned long long __builtin_cfuged (unsigned long long, unsigned long long); + CFUGED cfuged {} + + const unsigned long long __builtin_cntlzdm (unsigned long long, unsigned long long); + CNTLZDM cntlzdm {} + + const unsigned long long __builtin_cnttzdm (unsigned long long, unsigned long long); + CNTTZDM cnttzdm {} + + const unsigned long long __builtin_pdepd (unsigned long long, unsigned long long); + PDEPD pdepd {} + + const unsigned long long __builtin_pextd (unsigned long long, unsigned long long); + PEXTD pextd {} -- cgit v1.1 From 9cf3f026e281f5eb978a78055d8949a3295b7f10 Mon Sep 17 00:00:00 2001 From: David Edelsohn Date: Tue, 24 Aug 2021 16:48:57 -0400 Subject: aix: SYSTEM_IMPLICIT_EXTERN_C AIX 7.3 system headers are C++ safe and GCC no longer needs to define SYSTEM_IMPLICIT_EXTERN_C for AIX 7.3. This patch moves the definition from aix.h to the individual OS-level configuration files and does not define the macro for AIX 7.3. The patch also corrects the definition of TARGET_AIX_VERSION to 73. gcc/ChangeLog: * config/rs6000/aix.h (SYSTEM_IMPLICIT_EXTERN_C): Delete. * config/rs6000/aix71.h (SYSTEM_IMPLICIT_EXTERN_C): Define. * config/rs6000/aix72.h (SYSTEM_IMPLICIT_EXTERN_C): Define. * config/rs6000/aix73.h (TARGET_AIX_VERSION): Increase to 73. --- gcc/config/rs6000/aix.h | 4 +--- gcc/config/rs6000/aix71.h | 4 ++++ gcc/config/rs6000/aix72.h | 3 +++ gcc/config/rs6000/aix73.h | 2 +- 4 files changed, 9 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/aix.h b/gcc/config/rs6000/aix.h index 662785c..0f4d8cb 100644 --- a/gcc/config/rs6000/aix.h +++ b/gcc/config/rs6000/aix.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-3.0-or-later /* Definitions of target machine for GNU compiler, for IBM RS/6000 POWER running AIX. Copyright (C) 2000-2021 Free Software Foundation, Inc. @@ -23,9 +24,6 @@ #undef TARGET_AIX #define TARGET_AIX 1 -/* System headers are not C++-aware. */ -#define SYSTEM_IMPLICIT_EXTERN_C 1 - /* Linux64.h wants to redefine TARGET_AIX based on -m64, but it can't be used in the #if conditional in options-default.h, so provide another macro. */ #undef TARGET_AIX_OS diff --git a/gcc/config/rs6000/aix71.h b/gcc/config/rs6000/aix71.h index 38cfa9e..1bc1560 100644 --- a/gcc/config/rs6000/aix71.h +++ b/gcc/config/rs6000/aix71.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-3.0-or-later /* Definitions of target machine for GNU compiler, for IBM RS/6000 POWER running AIX V7.1. Copyright (C) 2002-2021 Free Software Foundation, Inc. @@ -268,6 +269,9 @@ extern long long int atoll(const char *); #define SET_CMODEL(opt) do {} while (0) #endif +/* System headers are not C++-aware. */ +#define SYSTEM_IMPLICIT_EXTERN_C 1 + /* This target defines SUPPORTS_WEAK and TARGET_ASM_NAMED_SECTION, but does not have crtbegin/end. */ diff --git a/gcc/config/rs6000/aix72.h b/gcc/config/rs6000/aix72.h index a497a7d..cca64f1 100644 --- a/gcc/config/rs6000/aix72.h +++ b/gcc/config/rs6000/aix72.h @@ -270,6 +270,9 @@ extern long long int atoll(const char *); #define SET_CMODEL(opt) do {} while (0) #endif +/* System headers are not C++-aware. */ +#define SYSTEM_IMPLICIT_EXTERN_C 1 + /* This target defines SUPPORTS_WEAK and TARGET_ASM_NAMED_SECTION, but does not have crtbegin/end. 
*/ diff --git a/gcc/config/rs6000/aix73.h b/gcc/config/rs6000/aix73.h index c707c7e..f0ca1a5 100644 --- a/gcc/config/rs6000/aix73.h +++ b/gcc/config/rs6000/aix73.h @@ -274,7 +274,7 @@ extern long long int atoll(const char *); /* This target defines SUPPORTS_WEAK and TARGET_ASM_NAMED_SECTION, but does not have crtbegin/end. */ -#define TARGET_AIX_VERSION 72 +#define TARGET_AIX_VERSION 73 /* AIX 7.2 supports DWARF3+ debugging. */ #define DWARF2_DEBUGGING_INFO 1 -- cgit v1.1 From a20be0cdc068d9ffab7bf0c9d2a8702162746bd8 Mon Sep 17 00:00:00 2001 From: Kewen Lin Date: Tue, 24 Aug 2021 21:58:14 -0500 Subject: rs6000: Add vec_unpacku_{hi,lo}_v4si The existing vec_unpacku_{hi,lo} supports emulated unsigned unpacking for short and char but misses the support for int. This patch adds the support of vec_unpacku_{hi,lo}_v4si. Meanwhile, the current implementation uses vector permutation way, which requires one extra customized constant vector as the permutation control vector. It's better to use vector merge high/low with zero constant vector, to save the space in constant area as well as the cost to initialize pcv in prologue. This patch updates it with vector merging and simplify it with iterators. gcc/ChangeLog: * config/rs6000/altivec.md (vec_unpacku_hi_v16qi): Remove. (vec_unpacku_hi_v8hi): Likewise. (vec_unpacku_lo_v16qi): Likewise. (vec_unpacku_lo_v8hi): Likewise. (vec_unpacku_hi_): New define_expand. (vec_unpacku_lo_): Likewise. gcc/testsuite/ChangeLog: * gcc.target/powerpc/unpack-vectorize-1.c: New test. * gcc.target/powerpc/unpack-vectorize-1.h: New test. * gcc.target/powerpc/unpack-vectorize-2.c: New test. * gcc.target/powerpc/unpack-vectorize-2.h: New test. * gcc.target/powerpc/unpack-vectorize-3.c: New test. * gcc.target/powerpc/unpack-vectorize-3.h: New test. * gcc.target/powerpc/unpack-vectorize-run-1.c: New test. * gcc.target/powerpc/unpack-vectorize-run-2.c: New test. * gcc.target/powerpc/unpack-vectorize-run-3.c: New test. * gcc.target/powerpc/unpack-vectorize.h: New test. --- gcc/config/rs6000/altivec.md | 158 ++++++++----------------------------------- 1 file changed, 29 insertions(+), 129 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index 2c73dde..93d23715 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -134,10 +134,8 @@ UNSPEC_VMULWLUH UNSPEC_VMULWHSH UNSPEC_VMULWLSH - UNSPEC_VUPKHUB - UNSPEC_VUPKHUH - UNSPEC_VUPKLUB - UNSPEC_VUPKLUH + UNSPEC_VUPKHU + UNSPEC_VUPKLU UNSPEC_VPERMSI UNSPEC_VPERMHI UNSPEC_INTERHI @@ -3688,143 +3686,45 @@ [(set_attr "type" "vecperm") (set_attr "isa" "p9v,*")]) -(define_expand "vec_unpacku_hi_v16qi" - [(set (match_operand:V8HI 0 "register_operand" "=v") - (unspec:V8HI [(match_operand:V16QI 1 "register_operand" "v")] - UNSPEC_VUPKHUB))] - "TARGET_ALTIVEC" -{ - rtx vzero = gen_reg_rtx (V8HImode); - rtx mask = gen_reg_rtx (V16QImode); - rtvec v = rtvec_alloc (16); - bool be = BYTES_BIG_ENDIAN; - - emit_insn (gen_altivec_vspltish (vzero, const0_rtx)); - - RTVEC_ELT (v, 0) = gen_rtx_CONST_INT (QImode, be ? 16 : 7); - RTVEC_ELT (v, 1) = gen_rtx_CONST_INT (QImode, be ? 0 : 16); - RTVEC_ELT (v, 2) = gen_rtx_CONST_INT (QImode, be ? 16 : 6); - RTVEC_ELT (v, 3) = gen_rtx_CONST_INT (QImode, be ? 1 : 16); - RTVEC_ELT (v, 4) = gen_rtx_CONST_INT (QImode, be ? 16 : 5); - RTVEC_ELT (v, 5) = gen_rtx_CONST_INT (QImode, be ? 2 : 16); - RTVEC_ELT (v, 6) = gen_rtx_CONST_INT (QImode, be ? 16 : 4); - RTVEC_ELT (v, 7) = gen_rtx_CONST_INT (QImode, be ? 
3 : 16); - RTVEC_ELT (v, 8) = gen_rtx_CONST_INT (QImode, be ? 16 : 3); - RTVEC_ELT (v, 9) = gen_rtx_CONST_INT (QImode, be ? 4 : 16); - RTVEC_ELT (v, 10) = gen_rtx_CONST_INT (QImode, be ? 16 : 2); - RTVEC_ELT (v, 11) = gen_rtx_CONST_INT (QImode, be ? 5 : 16); - RTVEC_ELT (v, 12) = gen_rtx_CONST_INT (QImode, be ? 16 : 1); - RTVEC_ELT (v, 13) = gen_rtx_CONST_INT (QImode, be ? 6 : 16); - RTVEC_ELT (v, 14) = gen_rtx_CONST_INT (QImode, be ? 16 : 0); - RTVEC_ELT (v, 15) = gen_rtx_CONST_INT (QImode, be ? 7 : 16); - - emit_insn (gen_vec_initv16qiqi (mask, gen_rtx_PARALLEL (V16QImode, v))); - emit_insn (gen_vperm_v16qiv8hi (operands[0], operands[1], vzero, mask)); - DONE; -}) - -(define_expand "vec_unpacku_hi_v8hi" - [(set (match_operand:V4SI 0 "register_operand" "=v") - (unspec:V4SI [(match_operand:V8HI 1 "register_operand" "v")] - UNSPEC_VUPKHUH))] +(define_expand "vec_unpacku_hi_" + [(set (match_operand:VP 0 "register_operand" "=v") + (unspec:VP [(match_operand: 1 "register_operand" "v")] + UNSPEC_VUPKHU))] "TARGET_ALTIVEC" { - rtx vzero = gen_reg_rtx (V4SImode); - rtx mask = gen_reg_rtx (V16QImode); - rtvec v = rtvec_alloc (16); - bool be = BYTES_BIG_ENDIAN; + rtx vzero = gen_reg_rtx (mode); + emit_insn (gen_altivec_vspltis (vzero, const0_rtx)); - emit_insn (gen_altivec_vspltisw (vzero, const0_rtx)); - - RTVEC_ELT (v, 0) = gen_rtx_CONST_INT (QImode, be ? 16 : 7); - RTVEC_ELT (v, 1) = gen_rtx_CONST_INT (QImode, be ? 17 : 6); - RTVEC_ELT (v, 2) = gen_rtx_CONST_INT (QImode, be ? 0 : 17); - RTVEC_ELT (v, 3) = gen_rtx_CONST_INT (QImode, be ? 1 : 16); - RTVEC_ELT (v, 4) = gen_rtx_CONST_INT (QImode, be ? 16 : 5); - RTVEC_ELT (v, 5) = gen_rtx_CONST_INT (QImode, be ? 17 : 4); - RTVEC_ELT (v, 6) = gen_rtx_CONST_INT (QImode, be ? 2 : 17); - RTVEC_ELT (v, 7) = gen_rtx_CONST_INT (QImode, be ? 3 : 16); - RTVEC_ELT (v, 8) = gen_rtx_CONST_INT (QImode, be ? 16 : 3); - RTVEC_ELT (v, 9) = gen_rtx_CONST_INT (QImode, be ? 17 : 2); - RTVEC_ELT (v, 10) = gen_rtx_CONST_INT (QImode, be ? 4 : 17); - RTVEC_ELT (v, 11) = gen_rtx_CONST_INT (QImode, be ? 5 : 16); - RTVEC_ELT (v, 12) = gen_rtx_CONST_INT (QImode, be ? 16 : 1); - RTVEC_ELT (v, 13) = gen_rtx_CONST_INT (QImode, be ? 17 : 0); - RTVEC_ELT (v, 14) = gen_rtx_CONST_INT (QImode, be ? 6 : 17); - RTVEC_ELT (v, 15) = gen_rtx_CONST_INT (QImode, be ? 7 : 16); - - emit_insn (gen_vec_initv16qiqi (mask, gen_rtx_PARALLEL (V16QImode, v))); - emit_insn (gen_vperm_v8hiv4si (operands[0], operands[1], vzero, mask)); - DONE; -}) + rtx res = gen_reg_rtx (mode); + rtx op1 = operands[1]; -(define_expand "vec_unpacku_lo_v16qi" - [(set (match_operand:V8HI 0 "register_operand" "=v") - (unspec:V8HI [(match_operand:V16QI 1 "register_operand" "v")] - UNSPEC_VUPKLUB))] - "TARGET_ALTIVEC" -{ - rtx vzero = gen_reg_rtx (V8HImode); - rtx mask = gen_reg_rtx (V16QImode); - rtvec v = rtvec_alloc (16); - bool be = BYTES_BIG_ENDIAN; - - emit_insn (gen_altivec_vspltish (vzero, const0_rtx)); - - RTVEC_ELT (v, 0) = gen_rtx_CONST_INT (QImode, be ? 16 : 15); - RTVEC_ELT (v, 1) = gen_rtx_CONST_INT (QImode, be ? 8 : 16); - RTVEC_ELT (v, 2) = gen_rtx_CONST_INT (QImode, be ? 16 : 14); - RTVEC_ELT (v, 3) = gen_rtx_CONST_INT (QImode, be ? 9 : 16); - RTVEC_ELT (v, 4) = gen_rtx_CONST_INT (QImode, be ? 16 : 13); - RTVEC_ELT (v, 5) = gen_rtx_CONST_INT (QImode, be ? 10 : 16); - RTVEC_ELT (v, 6) = gen_rtx_CONST_INT (QImode, be ? 16 : 12); - RTVEC_ELT (v, 7) = gen_rtx_CONST_INT (QImode, be ? 11 : 16); - RTVEC_ELT (v, 8) = gen_rtx_CONST_INT (QImode, be ? 16 : 11); - RTVEC_ELT (v, 9) = gen_rtx_CONST_INT (QImode, be ? 
12 : 16); - RTVEC_ELT (v, 10) = gen_rtx_CONST_INT (QImode, be ? 16 : 10); - RTVEC_ELT (v, 11) = gen_rtx_CONST_INT (QImode, be ? 13 : 16); - RTVEC_ELT (v, 12) = gen_rtx_CONST_INT (QImode, be ? 16 : 9); - RTVEC_ELT (v, 13) = gen_rtx_CONST_INT (QImode, be ? 14 : 16); - RTVEC_ELT (v, 14) = gen_rtx_CONST_INT (QImode, be ? 16 : 8); - RTVEC_ELT (v, 15) = gen_rtx_CONST_INT (QImode, be ? 15 : 16); + if (BYTES_BIG_ENDIAN) + emit_insn (gen_altivec_vmrgh (res, vzero, op1)); + else + emit_insn (gen_altivec_vmrgl (res, op1, vzero)); - emit_insn (gen_vec_initv16qiqi (mask, gen_rtx_PARALLEL (V16QImode, v))); - emit_insn (gen_vperm_v16qiv8hi (operands[0], operands[1], vzero, mask)); + emit_insn (gen_move_insn (operands[0], gen_lowpart (mode, res))); DONE; }) -(define_expand "vec_unpacku_lo_v8hi" - [(set (match_operand:V4SI 0 "register_operand" "=v") - (unspec:V4SI [(match_operand:V8HI 1 "register_operand" "v")] - UNSPEC_VUPKLUH))] +(define_expand "vec_unpacku_lo_" + [(set (match_operand:VP 0 "register_operand" "=v") + (unspec:VP [(match_operand: 1 "register_operand" "v")] + UNSPEC_VUPKLU))] "TARGET_ALTIVEC" { - rtx vzero = gen_reg_rtx (V4SImode); - rtx mask = gen_reg_rtx (V16QImode); - rtvec v = rtvec_alloc (16); - bool be = BYTES_BIG_ENDIAN; + rtx vzero = gen_reg_rtx (mode); + emit_insn (gen_altivec_vspltis (vzero, const0_rtx)); - emit_insn (gen_altivec_vspltisw (vzero, const0_rtx)); - - RTVEC_ELT (v, 0) = gen_rtx_CONST_INT (QImode, be ? 16 : 15); - RTVEC_ELT (v, 1) = gen_rtx_CONST_INT (QImode, be ? 17 : 14); - RTVEC_ELT (v, 2) = gen_rtx_CONST_INT (QImode, be ? 8 : 17); - RTVEC_ELT (v, 3) = gen_rtx_CONST_INT (QImode, be ? 9 : 16); - RTVEC_ELT (v, 4) = gen_rtx_CONST_INT (QImode, be ? 16 : 13); - RTVEC_ELT (v, 5) = gen_rtx_CONST_INT (QImode, be ? 17 : 12); - RTVEC_ELT (v, 6) = gen_rtx_CONST_INT (QImode, be ? 10 : 17); - RTVEC_ELT (v, 7) = gen_rtx_CONST_INT (QImode, be ? 11 : 16); - RTVEC_ELT (v, 8) = gen_rtx_CONST_INT (QImode, be ? 16 : 11); - RTVEC_ELT (v, 9) = gen_rtx_CONST_INT (QImode, be ? 17 : 10); - RTVEC_ELT (v, 10) = gen_rtx_CONST_INT (QImode, be ? 12 : 17); - RTVEC_ELT (v, 11) = gen_rtx_CONST_INT (QImode, be ? 13 : 16); - RTVEC_ELT (v, 12) = gen_rtx_CONST_INT (QImode, be ? 16 : 9); - RTVEC_ELT (v, 13) = gen_rtx_CONST_INT (QImode, be ? 17 : 8); - RTVEC_ELT (v, 14) = gen_rtx_CONST_INT (QImode, be ? 14 : 17); - RTVEC_ELT (v, 15) = gen_rtx_CONST_INT (QImode, be ? 15 : 16); + rtx res = gen_reg_rtx (mode); + rtx op1 = operands[1]; - emit_insn (gen_vec_initv16qiqi (mask, gen_rtx_PARALLEL (V16QImode, v))); - emit_insn (gen_vperm_v8hiv4si (operands[0], operands[1], vzero, mask)); + if (BYTES_BIG_ENDIAN) + emit_insn (gen_altivec_vmrgl (res, vzero, op1)); + else + emit_insn (gen_altivec_vmrgh (res, op1, vzero)); + + emit_insn (gen_move_insn (operands[0], gen_lowpart (mode, res))); DONE; }) -- cgit v1.1 From db3d4129b6f4cff685713da514b64ff7bbc401fc Mon Sep 17 00:00:00 2001 From: konglin1 Date: Mon, 9 Aug 2021 10:58:24 +0800 Subject: i386: Fix _mm512_fpclass_ps_mask in O0 [PR 101471] gcc/ChangeLog: PR target/101471 * config/i386/avx512dqintrin.h (_mm512_fpclass_ps_mask): Fix macro define in O0. (_mm512_mask_fpclass_ps_mask): Ditto. gcc/testsuite/ChangeLog: PR target/101471 * gcc.target/i386/avx512f-pr101471.c: New test. 
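For illustration, here is a minimal sketch of the kind of code the fix
affects (this is not the committed avx512f-pr101471.c testcase).  At -O0
the macro forms of these intrinsics are used, and the old (__mmask8)
casts truncated the 16-lane mask, so only the low 8 floats were
classified:

  /* Hedged sketch; compile with -mavx512dq -O0.  The fpclass immediate
     0x81 selects the QNaN (0x01) and SNaN (0x80) categories.  */
  #include <immintrin.h>

  __mmask16
  classify_nans (__m512 v)
  {
    return _mm512_fpclass_ps_mask (v, 0x81);
  }

  __mmask16
  classify_nans_masked (__mmask16 m, __m512 v)
  {
    return _mm512_mask_fpclass_ps_mask (m, v, 0x81);
  }

With the corrected (__mmask16) casts both functions cover all 16 lanes
at -O0, matching the behaviour of the inline-function path used when
optimization is enabled.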
--- gcc/config/i386/avx512dqintrin.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/avx512dqintrin.h b/gcc/config/i386/avx512dqintrin.h index 51c0b12..9794f5d 100644 --- a/gcc/config/i386/avx512dqintrin.h +++ b/gcc/config/i386/avx512dqintrin.h @@ -2814,7 +2814,7 @@ _mm512_fpclass_ps_mask (__m512 __A, const int __imm) #define _mm512_mask_fpclass_ps_mask(u, x, c) \ ((__mmask16) __builtin_ia32_fpclassps512_mask ((__v16sf) (__m512) (x),\ - (int) (c),(__mmask8)(u))) + (int) (c),(__mmask16)(u))) #define _mm512_fpclass_pd_mask(X, C) \ ((__mmask8) __builtin_ia32_fpclasspd512_mask ((__v8df) (__m512d) (X), \ @@ -2822,7 +2822,7 @@ _mm512_fpclass_ps_mask (__m512 __A, const int __imm) #define _mm512_fpclass_ps_mask(x, c) \ ((__mmask16) __builtin_ia32_fpclassps512_mask ((__v16sf) (__m512) (x),\ - (int) (c),(__mmask8)-1)) + (int) (c),(__mmask16)-1)) #define _mm_reduce_sd(A, B, C) \ ((__m128d) __builtin_ia32_reducesd_mask ((__v2df)(__m128d)(A), \ -- cgit v1.1 From 87afc7b81cd44d04997add383856b2504af3afe6 Mon Sep 17 00:00:00 2001 From: Hongyu Wang Date: Tue, 17 Aug 2021 16:53:46 +0800 Subject: i386: Optimize lea with zero-extend. [PR 101716] For ASHIFT + ZERO_EXTEND pattern, combine pass failed to match it to lea since it will generate non-canonical zero-extend. Adjust predicate and cost_model to allow combine for lea. gcc/ChangeLog: PR target/101716 * config/i386/i386.c (ix86_live_on_entry): Adjust comment. (ix86_decompose_address): Remove retval check for ASHIFT, allow non-canonical zero extend if AND mask covers ASHIFT count. (ix86_legitimate_address_p): Adjust condition for decompose. (ix86_rtx_costs): Adjust cost for lea with non-canonical zero-extend. Co-Authored by: Uros Bizjak gcc/testsuite/ChangeLog: PR target/101716 * gcc.target/i386/pr101716.c: New test. --- gcc/config/i386/i386.c | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index ebec866..ddbbbce 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -10018,8 +10018,7 @@ ix86_live_on_entry (bitmap regs) /* Extract the parts of an RTL expression that is a valid memory address for an instruction. Return 0 if the structure of the address is - grossly off. Return -1 if the address contains ASHIFT, so it is not - strictly valid, but still used for computing length of lea instruction. */ + grossly off. */ int ix86_decompose_address (rtx addr, struct ix86_address *out) @@ -10029,7 +10028,6 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) HOST_WIDE_INT scale = 1; rtx scale_rtx = NULL_RTX; rtx tmp; - int retval = 1; addr_space_t seg = ADDR_SPACE_GENERIC; /* Allow zero-extended SImode addresses, @@ -10053,6 +10051,27 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) if (CONST_INT_P (addr)) return 0; } + else if (GET_CODE (addr) == AND) + { + /* For ASHIFT inside AND, combine will not generate + canonical zero-extend. Merge mask for AND and shift_count + to check if it is canonical zero-extend. 
*/ + tmp = XEXP (addr, 0); + rtx mask = XEXP (addr, 1); + if (tmp && GET_CODE(tmp) == ASHIFT) + { + rtx shift_val = XEXP (tmp, 1); + if (CONST_INT_P (mask) && CONST_INT_P (shift_val) + && (((unsigned HOST_WIDE_INT) INTVAL(mask) + | ((HOST_WIDE_INT_1U << INTVAL(shift_val)) - 1)) + == 0xffffffff)) + { + addr = lowpart_subreg (SImode, XEXP (addr, 0), + DImode); + } + } + + } } /* Allow SImode subregs of DImode addresses, @@ -10179,7 +10198,6 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) if ((unsigned HOST_WIDE_INT) scale > 3) return 0; scale = 1 << scale; - retval = -1; } else disp = addr; /* displacement */ @@ -10252,7 +10270,7 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) out->scale = scale; out->seg = seg; - return retval; + return 1; } /* Return cost of the memory address x. @@ -10765,7 +10783,7 @@ ix86_legitimate_address_p (machine_mode, rtx addr, bool strict) HOST_WIDE_INT scale; addr_space_t seg; - if (ix86_decompose_address (addr, &parts) <= 0) + if (ix86_decompose_address (addr, &parts) == 0) /* Decomposition failed. */ return false; @@ -20419,6 +20437,12 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, << (GET_MODE (XEXP (x, 1)) != DImode))); return true; } + else if (code == AND + && address_no_seg_operand (x, mode)) + { + *total = cost->lea; + return true; + } /* FALLTHRU */ case NEG: -- cgit v1.1 From bb24717e5042b6e8a3847e780a8d215edb9c62f6 Mon Sep 17 00:00:00 2001 From: Michael Meissner Date: Wed, 25 Aug 2021 15:11:47 -0400 Subject: Make xxsplti*, xpermx, xxeval be vecperm type. I noticed that the built-functions for xxspltiw, xxspltidp, xxsplti32dx, xxpermx, and xxeval all used the 'vecsimple' type. These instructions are permute instructions (3 cycle latency) and should use 'vecperm' instead. While I was at it, I changed the UNSPEC name for xxspltidp to be UNSPEC_XXSPLTIDP instead of UNSPEC_XXSPLTID. 2021-08-25 Michael Meissner gcc/ * config/rs6000/vsx.md (UNSPEC_XXSPLTIDP): Rename from UNSPEC_XXSPLTID. (xxspltiw_v4si): Use vecperm type attribute. (xxspltiw_v4si_inst): Use vecperm type attribute. (xxspltiw_v4sf_inst): Likewise. (xxspltidp_v2df): Use vecperm type attribute. Use UNSPEC_XXSPLTIDP instead of UNSPEC_XXSPLTID. (xxspltidp_v2df_inst): Likewise. (xxsplti32dx_v4si): Use vecperm type attribute. (xxsplti32dx_v4si_inst): Likewise. (xxsplti32dx_v4sf_inst): Likewise. (xxblend_): Likewise. (xxpermx): Likewise. (xxpermx_inst): Likewise. (xxeval): Likewise. 
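For context, an illustrative sketch of why the "type" attribute matters
here: the pipeline descriptions key their latencies off it, roughly along
these lines.  The automaton, unit name and the vecsimple latency below
are made up for illustration and this is not the real power10.md
description; only the 3-cycle vecperm figure comes from the message
above:

  ;; Illustrative sketch only, not the actual power10 scheduling model.
  (define_automaton "p10_sketch")
  (define_cpu_unit "p10_sketch_vsu" "p10_sketch")

  (define_insn_reservation "p10-sketch-vecsimple" 2
    (and (eq_attr "type" "vecsimple")
         (eq_attr "cpu" "power10"))
    "p10_sketch_vsu")

  (define_insn_reservation "p10-sketch-vecperm" 3
    (and (eq_attr "type" "vecperm")
         (eq_attr "cpu" "power10"))
    "p10_sketch_vsu")

So with the wrong "vecsimple" classification the scheduler would assume a
shorter latency for these permute-class instructions than the hardware
actually has.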
--- gcc/config/rs6000/vsx.md | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md index e4ca6e9..bf033e3 100644 --- a/gcc/config/rs6000/vsx.md +++ b/gcc/config/rs6000/vsx.md @@ -374,7 +374,7 @@ UNSPEC_VDIVEU UNSPEC_XXEVAL UNSPEC_XXSPLTIW - UNSPEC_XXSPLTID + UNSPEC_XXSPLTIDP UNSPEC_XXSPLTI32DX UNSPEC_XXBLEND UNSPEC_XXPERMX @@ -6414,7 +6414,7 @@ UNSPEC_XXSPLTIW))] "TARGET_POWER10" "xxspltiw %x0,%1" - [(set_attr "type" "vecsimple") + [(set_attr "type" "vecperm") (set_attr "prefixed" "yes")]) (define_expand "xxspltiw_v4sf" @@ -6434,14 +6434,14 @@ UNSPEC_XXSPLTIW))] "TARGET_POWER10" "xxspltiw %x0,%1" - [(set_attr "type" "vecsimple") + [(set_attr "type" "vecperm") (set_attr "prefixed" "yes")]) ;; XXSPLTIDP built-in function support (define_expand "xxspltidp_v2df" [(set (match_operand:V2DF 0 "register_operand" ) (unspec:V2DF [(match_operand:SF 1 "const_double_operand")] - UNSPEC_XXSPLTID))] + UNSPEC_XXSPLTIDP))] "TARGET_POWER10" { long value = rs6000_const_f32_to_i32 (operands[1]); @@ -6452,10 +6452,10 @@ (define_insn "xxspltidp_v2df_inst" [(set (match_operand:V2DF 0 "register_operand" "=wa") (unspec:V2DF [(match_operand:SI 1 "c32bit_cint_operand" "n")] - UNSPEC_XXSPLTID))] + UNSPEC_XXSPLTIDP))] "TARGET_POWER10" "xxspltidp %x0,%1" - [(set_attr "type" "vecsimple") + [(set_attr "type" "vecperm") (set_attr "prefixed" "yes")]) ;; XXSPLTI32DX built-in function support @@ -6476,7 +6476,7 @@ GEN_INT (index), operands[3])); DONE; } - [(set_attr "type" "vecsimple")]) + [(set_attr "type" "vecperm")]) (define_insn "xxsplti32dx_v4si_inst" [(set (match_operand:V4SI 0 "register_operand" "=wa") @@ -6486,7 +6486,7 @@ UNSPEC_XXSPLTI32DX))] "TARGET_POWER10" "xxsplti32dx %x0,%2,%3" - [(set_attr "type" "vecsimple") + [(set_attr "type" "vecperm") (set_attr "prefixed" "yes")]) (define_expand "xxsplti32dx_v4sf" @@ -6515,7 +6515,7 @@ UNSPEC_XXSPLTI32DX))] "TARGET_POWER10" "xxsplti32dx %x0,%2,%3" - [(set_attr "type" "vecsimple") + [(set_attr "type" "vecperm") (set_attr "prefixed" "yes")]) ;; XXBLEND built-in function support @@ -6527,7 +6527,7 @@ UNSPEC_XXBLEND))] "TARGET_POWER10" "xxblendv %x0,%x1,%x2,%x3" - [(set_attr "type" "vecsimple") + [(set_attr "type" "vecperm") (set_attr "prefixed" "yes")]) ;; XXPERMX built-in function support @@ -6562,7 +6562,7 @@ DONE; } - [(set_attr "type" "vecsimple")]) + [(set_attr "type" "vecperm")]) (define_insn "xxpermx_inst" [(set (match_operand:V2DI 0 "register_operand" "+v") @@ -6573,7 +6573,7 @@ UNSPEC_XXPERMX))] "TARGET_POWER10" "xxpermx %x0,%x1,%x2,%x3,%4" - [(set_attr "type" "vecsimple") + [(set_attr "type" "vecperm") (set_attr "prefixed" "yes")]) ;; XXEVAL built-in function support @@ -6586,6 +6586,6 @@ UNSPEC_XXEVAL))] "TARGET_POWER10" "xxeval %0,%1,%2,%3,%4" - [(set_attr "type" "vecsimple") + [(set_attr "type" "vecperm") (set_attr "prefixed" "yes")]) -- cgit v1.1 From 226eb7ff5d73669184a4a56cd348fa6c46b3ec52 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 26 Aug 2021 17:26:06 +0200 Subject: [i386] Set all_regs to true in the call to replace_rtx [PR102057] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We want to replace all REGs equal to FROM. 2021-08-26 Uroš Bizjak gcc/ PR target/102057 * config/i386/i386.md (cmove reg-reg move elimination peephole2s): Set all_regs to true in the call to replace_rtx. 
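For reference, a hedged sketch of the replace_rtx behaviour being relied
on (my reading of rtlanal.c, not a quotation from it):

  /* Hedged sketch of the two forms; X, FROM and TO are arbitrary rtxes.

       replace_rtx (x, from, to)        replaces only the rtxes that are
                                        pointer-identical to FROM.
       replace_rtx (x, from, to, true)  with ALL_REGS, additionally
                                        replaces any REG that carries
                                        FROM's register number (and a
                                        matching mode).  */

As the message above says, every REG equal to FROM has to be replaced
here, not just the single pointer-identical occurrence, hence the new
fourth argument in the peephole2s below.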
--- gcc/config/i386/i386.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 41d8562..c80dcb5 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -19475,8 +19475,8 @@ (match_dup 0)))] { operands[7] = SET_DEST (XVECEXP (PATTERN (peep2_next_insn (1)), 0, 0)); - operands[8] = replace_rtx (operands[5], operands[0], operands[1]); - operands[9] = replace_rtx (operands[6], operands[0], operands[1]); + operands[8] = replace_rtx (operands[5], operands[0], operands[1], true); + operands[9] = replace_rtx (operands[6], operands[0], operands[1], true); }) ;; Eliminate a reg-reg mov by inverting the condition of a cmov (#2). @@ -19507,8 +19507,8 @@ (match_dup 0)))] { operands[7] = SET_DEST (XVECEXP (PATTERN (peep2_next_insn (2)), 0, 0)); - operands[8] = replace_rtx (operands[5], operands[0], operands[1]); - operands[9] = replace_rtx (operands[6], operands[0], operands[1]); + operands[8] = replace_rtx (operands[5], operands[0], operands[1], true); + operands[9] = replace_rtx (operands[6], operands[0], operands[1], true); }) (define_expand "movcc" -- cgit v1.1 From 344951b639a5f6f41c21c6cad04b0580c58aa12e Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Thu, 17 Jun 2021 09:05:24 -0500 Subject: rs6000: Add MMA builtins 2021-06-16 Bill Schmidt gcc/ * config/rs6000/rs6000-builtin-new.def: Add mma stanza. --- gcc/config/rs6000/rs6000-builtin-new.def | 416 +++++++++++++++++++++++++++++++ 1 file changed, 416 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-builtin-new.def b/gcc/config/rs6000/rs6000-builtin-new.def index b6fc994..d6174fe 100644 --- a/gcc/config/rs6000/rs6000-builtin-new.def +++ b/gcc/config/rs6000/rs6000-builtin-new.def @@ -3329,3 +3329,419 @@ const unsigned long long __builtin_pextd (unsigned long long, unsigned long long); PEXTD pextd {} + + +[mma] + void __builtin_mma_assemble_acc (v512 *, vuc, vuc, vuc, vuc); + ASSEMBLE_ACC nothing {mma} + + v512 __builtin_mma_assemble_acc_internal (vuc, vuc, vuc, vuc); + ASSEMBLE_ACC_INTERNAL mma_assemble_acc {mma} + + void __builtin_mma_assemble_pair (v256 *, vuc, vuc); + ASSEMBLE_PAIR nothing {mma} + + v256 __builtin_mma_assemble_pair_internal (vuc, vuc); + ASSEMBLE_PAIR_INTERNAL vsx_assemble_pair {mma} + + void __builtin_mma_build_acc (v512 *, vuc, vuc, vuc, vuc); + BUILD_ACC nothing {mma} + + v512 __builtin_mma_build_acc_internal (vuc, vuc, vuc, vuc); + BUILD_ACC_INTERNAL mma_assemble_acc {mma} + + void __builtin_mma_disassemble_acc (void *, v512 *); + DISASSEMBLE_ACC nothing {mma,quad} + + vuc __builtin_mma_disassemble_acc_internal (v512, const int<2>); + DISASSEMBLE_ACC_INTERNAL mma_disassemble_acc {mma} + + void __builtin_mma_disassemble_pair (void *, v256 *); + DISASSEMBLE_PAIR nothing {mma,pair} + + vuc __builtin_mma_disassemble_pair_internal (v256, const int<2>); + DISASSEMBLE_PAIR_INTERNAL vsx_disassemble_pair {mma} + + void __builtin_mma_pmxvbf16ger2 (v512 *, vuc, vuc, const int<4>, const int<4>, const int<2>); + PMXVBF16GER2 nothing {mma} + + v512 __builtin_mma_pmxvbf16ger2_internal (vuc, vuc, const int<4>, const int<4>, const int<2>); + PMXVBF16GER2_INTERNAL mma_pmxvbf16ger2 {mma} + + void __builtin_mma_pmxvbf16ger2nn (v512 *, vuc, vuc, const int<4>, const int<4>, const int<2>); + PMXVBF16GER2NN nothing {mma,quad} + + v512 __builtin_mma_pmxvbf16ger2nn_internal (v512, vuc, vuc, const int<4>, const int<4>, const int<2>); + PMXVBF16GER2NN_INTERNAL mma_pmxvbf16ger2nn {mma,quad} + + void 
__builtin_mma_pmxvbf16ger2np (v512 *, vuc, vuc, const int<4>, const int<4>, const int<2>); + PMXVBF16GER2NP nothing {mma,quad} + + v512 __builtin_mma_pmxvbf16ger2np_internal (v512, vuc, vuc, const int<4>, const int<4>, const int<2>); + PMXVBF16GER2NP_INTERNAL mma_pmxvbf16ger2np {mma,quad} + + void __builtin_mma_pmxvbf16ger2pn (v512 *, vuc, vuc, const int<4>, const int<4>, const int<2>); + PMXVBF16GER2PN nothing {mma,quad} + + v512 __builtin_mma_pmxvbf16ger2pn_internal (v512, vuc, vuc, const int<4>, const int<4>, const int<2>); + PMXVBF16GER2PN_INTERNAL mma_pmxvbf16ger2pn {mma,quad} + + void __builtin_mma_pmxvbf16ger2pp (v512 *, vuc, vuc, const int<4>, const int<4>, const int<2>); + PMXVBF16GER2PP nothing {mma,quad} + + v512 __builtin_mma_pmxvbf16ger2pp_internal (v512, vuc, vuc, const int<4>, const int<4>, const int<2>); + PMXVBF16GER2PP_INTERNAL mma_pmxvbf16ger2pp {mma,quad} + + void __builtin_mma_pmxvf16ger2 (v512 *, vuc, vuc, const int<4>, const int<4>, const int<2>); + PMXVF16GER2 nothing {mma} + + v512 __builtin_mma_pmxvf16ger2_internal (vuc, vuc, const int<4>, const int<4>, const int<2>); + PMXVF16GER2_INTERNAL mma_pmxvf16ger2 {mma} + + void __builtin_mma_pmxvf16ger2nn (v512 *, vuc, vuc, const int<4>, const int<4>, const int<2>); + PMXVF16GER2NN nothing {mma,quad} + + v512 __builtin_mma_pmxvf16ger2nn_internal (v512, vuc, vuc, const int<4>, const int<4>, const int<2>); + PMXVF16GER2NN_INTERNAL mma_pmxvf16ger2nn {mma,quad} + + void __builtin_mma_pmxvf16ger2np (v512 *, vuc, vuc, const int<4>, const int<4>, const int<2>); + PMXVF16GER2NP nothing {mma,quad} + + v512 __builtin_mma_pmxvf16ger2np_internal (v512, vuc, vuc, const int<4>, const int<4>, const int<2>); + PMXVF16GER2NP_INTERNAL mma_pmxvf16ger2np {mma,quad} + + void __builtin_mma_pmxvf16ger2pn (v512 *, vuc, vuc, const int<4>, const int<4>, const int<2>); + PMXVF16GER2PN nothing {mma,quad} + + v512 __builtin_mma_pmxvf16ger2pn_internal (v512, vuc, vuc, const int<4>, const int<4>, const int<2>); + PMXVF16GER2PN_INTERNAL mma_pmxvf16ger2pn {mma,quad} + + void __builtin_mma_pmxvf16ger2pp (v512 *, vuc, vuc, const int<4>, const int<4>, const int<2>); + PMXVF16GER2PP nothing {mma,quad} + + v512 __builtin_mma_pmxvf16ger2pp_internal (v512, vuc, vuc, const int<4>, const int<4>, const int<2>); + PMXVF16GER2PP_INTERNAL mma_pmxvf16ger2pp {mma,quad} + + void __builtin_mma_pmxvf32ger (v512 *, vuc, vuc, const int<4>, const int<4>); + PMXVF32GER nothing {mma} + + v512 __builtin_mma_pmxvf32ger_internal (vuc, vuc, const int<4>, const int<4>); + PMXVF32GER_INTERNAL mma_pmxvf32ger {mma} + + void __builtin_mma_pmxvf32gernn (v512 *, vuc, vuc, const int<4>, const int<4>); + PMXVF32GERNN nothing {mma,quad} + + v512 __builtin_mma_pmxvf32gernn_internal (v512, vuc, vuc, const int<4>, const int<4>); + PMXVF32GERNN_INTERNAL mma_pmxvf32gernn {mma,quad} + + void __builtin_mma_pmxvf32gernp (v512 *, vuc, vuc, const int<4>, const int<4>); + PMXVF32GERNP nothing {mma,quad} + + v512 __builtin_mma_pmxvf32gernp_internal (v512, vuc, vuc, const int<4>, const int<4>); + PMXVF32GERNP_INTERNAL mma_pmxvf32gernp {mma,quad} + + void __builtin_mma_pmxvf32gerpn (v512 *, vuc, vuc, const int<4>, const int<4>); + PMXVF32GERPN nothing {mma,quad} + + v512 __builtin_mma_pmxvf32gerpn_internal (v512, vuc, vuc, const int<4>, const int<4>); + PMXVF32GERPN_INTERNAL mma_pmxvf32gerpn {mma,quad} + + void __builtin_mma_pmxvf32gerpp (v512 *, vuc, vuc, const int<4>, const int<4>); + PMXVF32GERPP nothing {mma,quad} + + v512 __builtin_mma_pmxvf32gerpp_internal (v512, vuc, vuc, const int<4>, const 
int<4>); + PMXVF32GERPP_INTERNAL mma_pmxvf32gerpp {mma,quad} + + void __builtin_mma_pmxvf64ger (v512 *, v256, vuc, const int<4>, const int<2>); + PMXVF64GER nothing {mma,pair} + + v512 __builtin_mma_pmxvf64ger_internal (v256, vuc, const int<4>, const int<2>); + PMXVF64GER_INTERNAL mma_pmxvf64ger {mma,pair} + + void __builtin_mma_pmxvf64gernn (v512 *, v256, vuc, const int<4>, const int<2>); + PMXVF64GERNN nothing {mma,pair,quad} + + v512 __builtin_mma_pmxvf64gernn_internal (v512, v256, vuc, const int<4>, const int<2>); + PMXVF64GERNN_INTERNAL mma_pmxvf64gernn {mma,pair,quad} + + void __builtin_mma_pmxvf64gernp (v512 *, v256, vuc, const int<4>, const int<2>); + PMXVF64GERNP nothing {mma,pair,quad} + + v512 __builtin_mma_pmxvf64gernp_internal (v512, v256, vuc, const int<4>, const int<2>); + PMXVF64GERNP_INTERNAL mma_pmxvf64gernp {mma,pair,quad} + + void __builtin_mma_pmxvf64gerpn (v512 *, v256, vuc, const int<4>, const int<2>); + PMXVF64GERPN nothing {mma,pair,quad} + + v512 __builtin_mma_pmxvf64gerpn_internal (v512, v256, vuc, const int<4>, const int<2>); + PMXVF64GERPN_INTERNAL mma_pmxvf64gerpn {mma,pair,quad} + + void __builtin_mma_pmxvf64gerpp (v512 *, v256, vuc, const int<4>, const int<2>); + PMXVF64GERPP nothing {mma,pair,quad} + + v512 __builtin_mma_pmxvf64gerpp_internal (v512, v256, vuc, const int<4>, const int<2>); + PMXVF64GERPP_INTERNAL mma_pmxvf64gerpp {mma,pair,quad} + + void __builtin_mma_pmxvi16ger2 (v512 *, vuc, vuc, const int<4>, const int<4>, const int<2>); + PMXVI16GER2 nothing {mma} + + v512 __builtin_mma_pmxvi16ger2_internal (vuc, vuc, const int<4>, const int<4>, const int<2>); + PMXVI16GER2_INTERNAL mma_pmxvi16ger2 {mma} + + void __builtin_mma_pmxvi16ger2pp (v512 *, vuc, vuc, const int<4>, const int<4>, const int<2>); + PMXVI16GER2PP nothing {mma,quad} + + v512 __builtin_mma_pmxvi16ger2pp_internal (v512, vuc, vuc, const int<4>, const int<4>, const int<2>); + PMXVI16GER2PP_INTERNAL mma_pmxvi16ger2pp {mma,quad} + + void __builtin_mma_pmxvi16ger2s (v512 *, vuc, vuc, const int<4>, const int<4>, const int<2>); + PMXVI16GER2S nothing {mma} + + v512 __builtin_mma_pmxvi16ger2s_internal (vuc, vuc, const int<4>, const int<4>, const int<2>); + PMXVI16GER2S_INTERNAL mma_pmxvi16ger2s {mma} + + void __builtin_mma_pmxvi16ger2spp (v512 *, vuc, vuc, const int<4>, const int<4>, const int<2>); + PMXVI16GER2SPP nothing {mma,quad} + + v512 __builtin_mma_pmxvi16ger2spp_internal (v512, vuc, vuc, const int<4>, const int<4>, const int<2>); + PMXVI16GER2SPP_INTERNAL mma_pmxvi16ger2spp {mma,quad} + + void __builtin_mma_pmxvi4ger8 (v512 *, vuc, vuc, const int<4>, const int<4>, const int<8>); + PMXVI4GER8 nothing {mma} + + v512 __builtin_mma_pmxvi4ger8_internal (vuc, vuc, const int<4>, const int<4>, const int<8>); + PMXVI4GER8_INTERNAL mma_pmxvi4ger8 {mma} + + void __builtin_mma_pmxvi4ger8pp (v512 *, vuc, vuc, const int<4>, const int<4>, const int<4>); + PMXVI4GER8PP nothing {mma,quad} + + v512 __builtin_mma_pmxvi4ger8pp_internal (v512, vuc, vuc, const int<4>, const int<4>, const int<4>); + PMXVI4GER8PP_INTERNAL mma_pmxvi4ger8pp {mma,quad} + + void __builtin_mma_pmxvi8ger4 (v512 *, vuc, vuc, const int<4>, const int<4>, const int<4>); + PMXVI8GER4 nothing {mma} + + v512 __builtin_mma_pmxvi8ger4_internal (vuc, vuc, const int<4>, const int<4>, const int<4>); + PMXVI8GER4_INTERNAL mma_pmxvi8ger4 {mma} + + void __builtin_mma_pmxvi8ger4pp (v512 *, vuc, vuc, const int<4>, const int<4>, const int<4>); + PMXVI8GER4PP nothing {mma,quad} + + v512 __builtin_mma_pmxvi8ger4pp_internal (v512, vuc, vuc, const 
int<4>, const int<4>, const int<4>); + PMXVI8GER4PP_INTERNAL mma_pmxvi8ger4pp {mma,quad} + + void __builtin_mma_pmxvi8ger4spp (v512 *, vuc, vuc, const int<4>, const int<4>, const int<4>); + PMXVI8GER4SPP nothing {mma,quad} + + v512 __builtin_mma_pmxvi8ger4spp_internal (v512, vuc, vuc, const int<4>, const int<4>, const int<4>); + PMXVI8GER4SPP_INTERNAL mma_pmxvi8ger4spp {mma,quad} + + void __builtin_mma_xvbf16ger2 (v512 *, vuc, vuc); + XVBF16GER2 nothing {mma} + + v512 __builtin_mma_xvbf16ger2_internal (vuc, vuc); + XVBF16GER2_INTERNAL mma_xvbf16ger2 {mma} + + void __builtin_mma_xvbf16ger2nn (v512 *, vuc, vuc); + XVBF16GER2NN nothing {mma,quad} + + v512 __builtin_mma_xvbf16ger2nn_internal (v512, vuc, vuc); + XVBF16GER2NN_INTERNAL mma_xvbf16ger2nn {mma,quad} + + void __builtin_mma_xvbf16ger2np (v512 *, vuc, vuc); + XVBF16GER2NP nothing {mma,quad} + + v512 __builtin_mma_xvbf16ger2np_internal (v512, vuc, vuc); + XVBF16GER2NP_INTERNAL mma_xvbf16ger2np {mma,quad} + + void __builtin_mma_xvbf16ger2pn (v512 *, vuc, vuc); + XVBF16GER2PN nothing {mma,quad} + + v512 __builtin_mma_xvbf16ger2pn_internal (v512, vuc, vuc); + XVBF16GER2PN_INTERNAL mma_xvbf16ger2pn {mma,quad} + + void __builtin_mma_xvbf16ger2pp (v512 *, vuc, vuc); + XVBF16GER2PP nothing {mma,quad} + + v512 __builtin_mma_xvbf16ger2pp_internal (v512, vuc, vuc); + XVBF16GER2PP_INTERNAL mma_xvbf16ger2pp {mma,quad} + + void __builtin_mma_xvf16ger2 (v512 *, vuc, vuc); + XVF16GER2 nothing {mma} + + v512 __builtin_mma_xvf16ger2_internal (vuc, vuc); + XVF16GER2_INTERNAL mma_xvf16ger2 {mma} + + void __builtin_mma_xvf16ger2nn (v512 *, vuc, vuc); + XVF16GER2NN nothing {mma,quad} + + v512 __builtin_mma_xvf16ger2nn_internal (v512, vuc, vuc); + XVF16GER2NN_INTERNAL mma_xvf16ger2nn {mma,quad} + + void __builtin_mma_xvf16ger2np (v512 *, vuc, vuc); + XVF16GER2NP nothing {mma,quad} + + v512 __builtin_mma_xvf16ger2np_internal (v512, vuc, vuc); + XVF16GER2NP_INTERNAL mma_xvf16ger2np {mma,quad} + + void __builtin_mma_xvf16ger2pn (v512 *, vuc, vuc); + XVF16GER2PN nothing {mma,quad} + + v512 __builtin_mma_xvf16ger2pn_internal (v512, vuc, vuc); + XVF16GER2PN_INTERNAL mma_xvf16ger2pn {mma,quad} + + void __builtin_mma_xvf16ger2pp (v512 *, vuc, vuc); + XVF16GER2PP nothing {mma,quad} + + v512 __builtin_mma_xvf16ger2pp_internal (v512, vuc, vuc); + XVF16GER2PP_INTERNAL mma_xvf16ger2pp {mma,quad} + + void __builtin_mma_xvf32ger (v512 *, vuc, vuc); + XVF32GER nothing {mma} + + v512 __builtin_mma_xvf32ger_internal (vuc, vuc); + XVF32GER_INTERNAL mma_xvf32ger {mma} + + void __builtin_mma_xvf32gernn (v512 *, vuc, vuc); + XVF32GERNN nothing {mma,quad} + + v512 __builtin_mma_xvf32gernn_internal (v512, vuc, vuc); + XVF32GERNN_INTERNAL mma_xvf32gernn {mma,quad} + + void __builtin_mma_xvf32gernp (v512 *, vuc, vuc); + XVF32GERNP nothing {mma,quad} + + v512 __builtin_mma_xvf32gernp_internal (v512, vuc, vuc); + XVF32GERNP_INTERNAL mma_xvf32gernp {mma,quad} + + void __builtin_mma_xvf32gerpn (v512 *, vuc, vuc); + XVF32GERPN nothing {mma,quad} + + v512 __builtin_mma_xvf32gerpn_internal (v512, vuc, vuc); + XVF32GERPN_INTERNAL mma_xvf32gerpn {mma,quad} + + void __builtin_mma_xvf32gerpp (v512 *, vuc, vuc); + XVF32GERPP nothing {mma,quad} + + v512 __builtin_mma_xvf32gerpp_internal (v512, vuc, vuc); + XVF32GERPP_INTERNAL mma_xvf32gerpp {mma,quad} + + void __builtin_mma_xvf64ger (v512 *, v256, vuc); + XVF64GER nothing {mma,pair} + + v512 __builtin_mma_xvf64ger_internal (v256, vuc); + XVF64GER_INTERNAL mma_xvf64ger {mma,pair} + + void __builtin_mma_xvf64gernn (v512 *, v256, vuc); + XVF64GERNN 
nothing {mma,pair,quad} + + v512 __builtin_mma_xvf64gernn_internal (v512, v256, vuc); + XVF64GERNN_INTERNAL mma_xvf64gernn {mma,pair,quad} + + void __builtin_mma_xvf64gernp (v512 *, v256, vuc); + XVF64GERNP nothing {mma,pair,quad} + + v512 __builtin_mma_xvf64gernp_internal (v512, v256, vuc); + XVF64GERNP_INTERNAL mma_xvf64gernp {mma,pair,quad} + + void __builtin_mma_xvf64gerpn (v512 *, v256, vuc); + XVF64GERPN nothing {mma,pair,quad} + + v512 __builtin_mma_xvf64gerpn_internal (v512, v256, vuc); + XVF64GERPN_INTERNAL mma_xvf64gerpn {mma,pair,quad} + + void __builtin_mma_xvf64gerpp (v512 *, v256, vuc); + XVF64GERPP nothing {mma,pair,quad} + + v512 __builtin_mma_xvf64gerpp_internal (v512, v256, vuc); + XVF64GERPP_INTERNAL mma_xvf64gerpp {mma,pair,quad} + + void __builtin_mma_xvi16ger2 (v512 *, vuc, vuc); + XVI16GER2 nothing {mma} + + v512 __builtin_mma_xvi16ger2_internal (vuc, vuc); + XVI16GER2_INTERNAL mma_xvi16ger2 {mma} + + void __builtin_mma_xvi16ger2pp (v512 *, vuc, vuc); + XVI16GER2PP nothing {mma,quad} + + v512 __builtin_mma_xvi16ger2pp_internal (v512, vuc, vuc); + XVI16GER2PP_INTERNAL mma_xvi16ger2pp {mma,quad} + + void __builtin_mma_xvi16ger2s (v512 *, vuc, vuc); + XVI16GER2S nothing {mma} + + v512 __builtin_mma_xvi16ger2s_internal (vuc, vuc); + XVI16GER2S_INTERNAL mma_xvi16ger2s {mma} + + void __builtin_mma_xvi16ger2spp (v512 *, vuc, vuc); + XVI16GER2SPP nothing {mma,quad} + + v512 __builtin_mma_xvi16ger2spp_internal (v512, vuc, vuc); + XVI16GER2SPP_INTERNAL mma_xvi16ger2spp {mma,quad} + + void __builtin_mma_xvi4ger8 (v512 *, vuc, vuc); + XVI4GER8 nothing {mma} + + v512 __builtin_mma_xvi4ger8_internal (vuc, vuc); + XVI4GER8_INTERNAL mma_xvi4ger8 {mma} + + void __builtin_mma_xvi4ger8pp (v512 *, vuc, vuc); + XVI4GER8PP nothing {mma,quad} + + v512 __builtin_mma_xvi4ger8pp_internal (v512, vuc, vuc); + XVI4GER8PP_INTERNAL mma_xvi4ger8pp {mma,quad} + + void __builtin_mma_xvi8ger4 (v512 *, vuc, vuc); + XVI8GER4 nothing {mma} + + v512 __builtin_mma_xvi8ger4_internal (vuc, vuc); + XVI8GER4_INTERNAL mma_xvi8ger4 {mma} + + void __builtin_mma_xvi8ger4pp (v512 *, vuc, vuc); + XVI8GER4PP nothing {mma,quad} + + v512 __builtin_mma_xvi8ger4pp_internal (v512, vuc, vuc); + XVI8GER4PP_INTERNAL mma_xvi8ger4pp {mma,quad} + + void __builtin_mma_xvi8ger4spp (v512 *, vuc, vuc); + XVI8GER4SPP nothing {mma,quad} + + v512 __builtin_mma_xvi8ger4spp_internal (v512, vuc, vuc); + XVI8GER4SPP_INTERNAL mma_xvi8ger4spp {mma,quad} + + void __builtin_mma_xxmfacc (v512 *); + XXMFACC nothing {mma,quad} + + v512 __builtin_mma_xxmfacc_internal (v512); + XXMFACC_INTERNAL mma_xxmfacc {mma,quad} + + void __builtin_mma_xxmtacc (v512 *); + XXMTACC nothing {mma,quad} + + v512 __builtin_mma_xxmtacc_internal (v512); + XXMTACC_INTERNAL mma_xxmtacc {mma,quad} + + void __builtin_mma_xxsetaccz (v512 *); + XXSETACCZ nothing {mma} + + v512 __builtin_mma_xxsetaccz_internal (); + XXSETACCZ_INTERNAL mma_xxsetaccz {mma} + + void __builtin_vsx_assemble_pair (v256 *, vuc, vuc); + ASSEMBLE_PAIR_V nothing {mma} + + v256 __builtin_vsx_assemble_pair_internal (vuc, vuc); + ASSEMBLE_PAIR_V_INTERNAL vsx_assemble_pair {mma} + + void __builtin_vsx_build_pair (v256 *, vuc, vuc); + BUILD_PAIR nothing {mma} + + v256 __builtin_vsx_build_pair_internal (vuc, vuc); + BUILD_PAIR_INTERNAL vsx_assemble_pair {mma} + + void __builtin_vsx_disassemble_pair (void *, v256 *); + DISASSEMBLE_PAIR_V nothing {mma,pair} + + vuc __builtin_vsx_disassemble_pair_internal (v256, const int<2>); + DISASSEMBLE_PAIR_V_INTERNAL vsx_disassemble_pair {mma} -- cgit v1.1 From 
2f9489a1009d98b448d216c39493b6b2d454aa20 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Tue, 15 Jun 2021 08:43:11 -0500 Subject: rs6000: Add miscellaneous builtins 2021-06-15 Bill Schmidt gcc/ * config/rs6000/rs6000-builtin-new.def: Add ieee128-hw, dfp, crypto, and htm stanzas. --- gcc/config/rs6000/rs6000-builtin-new.def | 215 +++++++++++++++++++++++++++++++ 1 file changed, 215 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-builtin-new.def b/gcc/config/rs6000/rs6000-builtin-new.def index d6174fe..7bfe9bc 100644 --- a/gcc/config/rs6000/rs6000-builtin-new.def +++ b/gcc/config/rs6000/rs6000-builtin-new.def @@ -2808,6 +2808,221 @@ XL_LEN_R xl_len_r {} +; Builtins requiring hardware support for IEEE-128 floating-point. +[ieee128-hw] + fpmath _Float128 __builtin_addf128_round_to_odd (_Float128, _Float128); + ADDF128_ODD addkf3_odd {} + + fpmath _Float128 __builtin_divf128_round_to_odd (_Float128, _Float128); + DIVF128_ODD divkf3_odd {} + + fpmath _Float128 __builtin_fmaf128_round_to_odd (_Float128, _Float128, _Float128); + FMAF128_ODD fmakf4_odd {} + + fpmath _Float128 __builtin_mulf128_round_to_odd (_Float128, _Float128); + MULF128_ODD mulkf3_odd {} + + const signed int __builtin_vsx_scalar_cmp_exp_qp_eq (_Float128, _Float128); + VSCEQPEQ xscmpexpqp_eq_kf {} + + const signed int __builtin_vsx_scalar_cmp_exp_qp_gt (_Float128, _Float128); + VSCEQPGT xscmpexpqp_gt_kf {} + + const signed int __builtin_vsx_scalar_cmp_exp_qp_lt (_Float128, _Float128); + VSCEQPLT xscmpexpqp_lt_kf {} + + const signed int __builtin_vsx_scalar_cmp_exp_qp_unordered (_Float128, _Float128); + VSCEQPUO xscmpexpqp_unordered_kf {} + + fpmath _Float128 __builtin_sqrtf128_round_to_odd (_Float128); + SQRTF128_ODD sqrtkf2_odd {} + + fpmath _Float128 __builtin_subf128_round_to_odd (_Float128, _Float128); + SUBF128_ODD subkf3_odd {} + + fpmath double __builtin_truncf128_round_to_odd (_Float128); + TRUNCF128_ODD trunckfdf2_odd {} + + const signed long long __builtin_vsx_scalar_extract_expq (_Float128); + VSEEQP xsxexpqp_kf {} + + const signed __int128 __builtin_vsx_scalar_extract_sigq (_Float128); + VSESQP xsxsigqp_kf {} + + const _Float128 __builtin_vsx_scalar_insert_exp_q (unsigned __int128, unsigned long long); + VSIEQP xsiexpqp_kf {} + + const _Float128 __builtin_vsx_scalar_insert_exp_qp (_Float128, unsigned long long); + VSIEQPF xsiexpqpf_kf {} + + const signed int __builtin_vsx_scalar_test_data_class_qp (_Float128, const int<7>); + VSTDCQP xststdcqp_kf {} + + const signed int __builtin_vsx_scalar_test_neg_qp (_Float128); + VSTDCNQP xststdcnegqp_kf {} + + + +; Decimal floating-point builtins. 
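(Usage sketch, not part of the patch: the [ieee128-hw] prototypes above can be called directly from C on a target with IEEE-128 hardware support. The build flags -mcpu=power9 -mfloat128 and the function name below are assumptions for illustration only.)

/* Hedged sketch: fused multiply-add in _Float128 with round-to-odd, then
   narrow to double with round-to-odd, following the prototypes declared in
   the [ieee128-hw] stanza above.  Assumed flags: -mcpu=power9 -mfloat128;
   the identifier names are illustrative.  */
double fma_then_narrow (_Float128 a, _Float128 b, _Float128 c)
{
  _Float128 wide = __builtin_fmaf128_round_to_odd (a, b, c);
  return __builtin_truncf128_round_to_odd (wide);
}

(Round-to-odd variants are typically used in such widen-compute-narrow sequences to avoid double rounding; normally rounded arithmetic remains available through the usual operators.)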
+[dfp] + const _Decimal64 __builtin_ddedpd (const int<2>, _Decimal64); + DDEDPD dfp_ddedpd_dd {} + + const _Decimal128 __builtin_ddedpdq (const int<2>, _Decimal128); + DDEDPDQ dfp_ddedpd_td {} + + const _Decimal64 __builtin_denbcd (const int<1>, _Decimal64); + DENBCD dfp_denbcd_dd {} + + const _Decimal128 __builtin_denbcdq (const int<1>, _Decimal128); + DENBCDQ dfp_denbcd_td {} + + const _Decimal128 __builtin_denb2dfp_v16qi (vsc); + DENB2DFP_V16QI dfp_denbcd_v16qi {} + + const _Decimal64 __builtin_diex (signed long long, _Decimal64); + DIEX dfp_diex_dd {} + + const _Decimal128 __builtin_diexq (signed long long, _Decimal128); + DIEXQ dfp_diex_td {} + + const _Decimal64 __builtin_dscli (_Decimal64, const int<6>); + DSCLI dfp_dscli_dd {} + + const _Decimal128 __builtin_dscliq (_Decimal128, const int<6>); + DSCLIQ dfp_dscli_td {} + + const _Decimal64 __builtin_dscri (_Decimal64, const int<6>); + DSCRI dfp_dscri_dd {} + + const _Decimal128 __builtin_dscriq (_Decimal128, const int<6>); + DSCRIQ dfp_dscri_td {} + + const signed long long __builtin_dxex (_Decimal64); + DXEX dfp_dxex_dd {} + + const signed long long __builtin_dxexq (_Decimal128); + DXEXQ dfp_dxex_td {} + + const _Decimal128 __builtin_pack_dec128 (unsigned long long, unsigned long long); + PACK_TD packtd {} + + void __builtin_set_fpscr_drn (const int[0,7]); + SET_FPSCR_DRN rs6000_set_fpscr_drn {} + + const unsigned long __builtin_unpack_dec128 (_Decimal128, const int<1>); + UNPACK_TD unpacktd {} + + +[crypto] + const vull __builtin_crypto_vcipher (vull, vull); + VCIPHER crypto_vcipher_v2di {} + + const vuc __builtin_crypto_vcipher_be (vuc, vuc); + VCIPHER_BE crypto_vcipher_v16qi {} + + const vull __builtin_crypto_vcipherlast (vull, vull); + VCIPHERLAST crypto_vcipherlast_v2di {} + + const vuc __builtin_crypto_vcipherlast_be (vuc, vuc); + VCIPHERLAST_BE crypto_vcipherlast_v16qi {} + + const vull __builtin_crypto_vncipher (vull, vull); + VNCIPHER crypto_vncipher_v2di {} + + const vuc __builtin_crypto_vncipher_be (vuc, vuc); + VNCIPHER_BE crypto_vncipher_v16qi {} + + const vull __builtin_crypto_vncipherlast (vull, vull); + VNCIPHERLAST crypto_vncipherlast_v2di {} + + const vuc __builtin_crypto_vncipherlast_be (vuc, vuc); + VNCIPHERLAST_BE crypto_vncipherlast_v16qi {} + + const vull __builtin_crypto_vsbox (vull); + VSBOX crypto_vsbox_v2di {} + + const vuc __builtin_crypto_vsbox_be (vuc); + VSBOX_BE crypto_vsbox_v16qi {} + + const vull __builtin_crypto_vshasigmad (vull, const int<1>, const int<4>); + VSHASIGMAD crypto_vshasigmad {} + + const vui __builtin_crypto_vshasigmaw (vui, const int<1>, const int<4>); + VSHASIGMAW crypto_vshasigmaw {} + + +[htm] + unsigned long long __builtin_get_texasr (); + GET_TEXASR nothing {htm,htmspr} + + unsigned long long __builtin_get_texasru (); + GET_TEXASRU nothing {htm,htmspr} + + unsigned long long __builtin_get_tfhar (); + GET_TFHAR nothing {htm,htmspr} + + unsigned long long __builtin_get_tfiar (); + GET_TFIAR nothing {htm,htmspr} + + void __builtin_set_texasr (unsigned long long); + SET_TEXASR nothing {htm,htmspr} + + void __builtin_set_texasru (unsigned long long); + SET_TEXASRU nothing {htm,htmspr} + + void __builtin_set_tfhar (unsigned long long); + SET_TFHAR nothing {htm,htmspr} + + void __builtin_set_tfiar (unsigned long long); + SET_TFIAR nothing {htm,htmspr} + + unsigned int __builtin_tabort (unsigned int); + TABORT tabort {htm,htmcr} + + unsigned int __builtin_tabortdc (unsigned long long, unsigned long long, unsigned long long); + TABORTDC tabortdc {htm,htmcr} + + unsigned int 
__builtin_tabortdci (unsigned long long, unsigned long long, unsigned long long); + TABORTDCI tabortdci {htm,htmcr} + + unsigned int __builtin_tabortwc (unsigned int, unsigned int, unsigned int); + TABORTWC tabortwc {htm,htmcr} + + unsigned int __builtin_tabortwci (unsigned int, unsigned int, unsigned int); + TABORTWCI tabortwci {htm,htmcr} + + unsigned int __builtin_tbegin (unsigned int); + TBEGIN tbegin {htm,htmcr} + + unsigned int __builtin_tcheck (); + TCHECK tcheck {htm,htmcr} + + unsigned int __builtin_tend (unsigned int); + TEND tend {htm,htmcr} + + unsigned int __builtin_tendall (); + TENDALL tend {htm,htmcr} + + unsigned int __builtin_trechkpt (); + TRECHKPT trechkpt {htm,htmcr} + + unsigned int __builtin_treclaim (unsigned int); + TRECLAIM treclaim {htm,htmcr} + + unsigned int __builtin_tresume (); + TRESUME tsr {htm,htmcr} + + unsigned int __builtin_tsr (unsigned int); + TSR tsr {htm,htmcr} + + unsigned int __builtin_tsuspend (); + TSUSPEND tsr {htm,htmcr} + + unsigned int __builtin_ttest (); + TTEST ttest {htm,htmcr} + + [power10] const vbq __builtin_altivec_cmpge_1ti (vsq, vsq); CMPGE_1TI vector_nltv1ti {} -- cgit v1.1 From a7f13a51990770702618d2fc3f311cd5214e130d Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Mon, 7 Jun 2021 16:09:15 -0500 Subject: rs6000: Add Cell builtins 2021-06-07 Bill Schmidt gcc/ * config/rs6000/rs6000-builtin-new.def: Add cell stanza. --- gcc/config/rs6000/rs6000-builtin-new.def | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-builtin-new.def b/gcc/config/rs6000/rs6000-builtin-new.def index 7bfe9bc..3e732ce 100644 --- a/gcc/config/rs6000/rs6000-builtin-new.def +++ b/gcc/config/rs6000/rs6000-builtin-new.def @@ -1106,6 +1106,33 @@ VEC_SET_V8HI nothing {set} +; Cell builtins. +[cell] + pure vsc __builtin_altivec_lvlx (signed long, const void *); + LVLX altivec_lvlx {ldvec} + + pure vsc __builtin_altivec_lvlxl (signed long, const void *); + LVLXL altivec_lvlxl {ldvec} + + pure vsc __builtin_altivec_lvrx (signed long, const void *); + LVRX altivec_lvrx {ldvec} + + pure vsc __builtin_altivec_lvrxl (signed long, const void *); + LVRXL altivec_lvrxl {ldvec} + + void __builtin_altivec_stvlx (vsc, signed long, void *); + STVLX altivec_stvlx {stvec} + + void __builtin_altivec_stvlxl (vsc, signed long, void *); + STVLXL altivec_stvlxl {stvec} + + void __builtin_altivec_stvrx (vsc, signed long, void *); + STVRX altivec_stvrx {stvec} + + void __builtin_altivec_stvrxl (vsc, signed long, void *); + STVRXL altivec_stvrxl {stvec} + + ; VSX builtins. [vsx] pure vd __builtin_altivec_lvx_v2df (signed long, const void *); -- cgit v1.1 From e2693a72701fe55995c0cca5a014b733a9f0aa64 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Thu, 26 Aug 2021 14:34:00 -0500 Subject: rs6000: Add remaining overloads 2021-08-26 Bill Schmidt gcc/ * config/rs6000/rs6000-overload.def: Add remaining overloads. --- gcc/config/rs6000/rs6000-overload.def | 6083 +++++++++++++++++++++++++++++++++ 1 file changed, 6083 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-overload.def b/gcc/config/rs6000/rs6000-overload.def index d8028c9..141f831 100644 --- a/gcc/config/rs6000/rs6000-overload.def +++ b/gcc/config/rs6000/rs6000-overload.def @@ -75,8 +75,6091 @@ ; a semicolon are also treated as blank lines. 
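(Usage sketch, not part of the patch: the overload stanzas that follow map one generic name onto type-specific builtin instances, so the same <altivec.h> call resolves differently by argument type. Assumed flags -maltivec -mvsx; the function names are illustrative only.)

#include <altivec.h>

/* One generic name, different instances picked by operand type, as listed
   in the VEC_ADD and VEC_CMPEQ stanzas below.  Illustrative identifiers.  */
vector signed int add_ints (vector signed int a, vector signed int b)
{
  return vec_add (a, b);        /* VADDUWM instance */
}

vector double add_doubles (vector double a, vector double b)
{
  return vec_add (a, b);        /* XVADDDP instance (needs VSX) */
}

vector bool char eq_chars (vector unsigned char a, vector unsigned char b)
{
  return vec_cmpeq (a, b);      /* VCMPEQUB instance */
}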
+[BCDADD, __builtin_bcdadd, __builtin_vec_bcdadd] + vsq __builtin_vec_bcdadd (vsq, vsq, const int); + BCDADD_V1TI + vuc __builtin_vec_bcdadd (vuc, vuc, const int); + BCDADD_V16QI + +[BCDADD_EQ, __builtin_bcdadd_eq, __builtin_vec_bcdadd_eq] + signed int __builtin_vec_bcdadd_eq (vsq, vsq, const int); + BCDADD_EQ_V1TI + signed int __builtin_vec_bcdadd_eq (vuc, vuc, const int); + BCDADD_EQ_V16QI + +[BCDADD_GT, __builtin_bcdadd_gt, __builtin_vec_bcdadd_gt] + signed int __builtin_vec_bcdadd_gt (vsq, vsq, const int); + BCDADD_GT_V1TI + signed int __builtin_vec_bcdadd_gt (vuc, vuc, const int); + BCDADD_GT_V16QI + +[BCDADD_LT, __builtin_bcdadd_lt, __builtin_vec_bcdadd_lt] + signed int __builtin_vec_bcdadd_lt (vsq, vsq, const int); + BCDADD_LT_V1TI + signed int __builtin_vec_bcdadd_lt (vuc, vuc, const int); + BCDADD_LT_V16QI + +[BCDADD_OV, __builtin_bcdadd_ov, __builtin_vec_bcdadd_ov] + signed int __builtin_vec_bcdadd_ov (vsq, vsq, const int); + BCDADD_OV_V1TI + signed int __builtin_vec_bcdadd_ov (vuc, vuc, const int); + BCDADD_OV_V16QI + +[BCDDIV10, __builtin_bcddiv10, __builtin_vec_bcddiv10] + vuc __builtin_vec_bcddiv10 (vuc); + BCDDIV10_V16QI + +[BCDINVALID, __builtin_bcdinvalid, __builtin_vec_bcdinvalid] + signed int __builtin_vec_bcdinvalid (vsq); + BCDINVALID_V1TI + signed int __builtin_vec_bcdinvalid (vuc); + BCDINVALID_V16QI + +[BCDMUL10, __builtin_bcdmul10, __builtin_vec_bcdmul10] + vuc __builtin_vec_bcdmul10 (vuc); + BCDMUL10_V16QI + +[BCDSUB, __builtin_bcdsub, __builtin_vec_bcdsub] + vsq __builtin_vec_bcdsub (vsq, vsq, const int); + BCDSUB_V1TI + vuc __builtin_vec_bcdsub (vuc, vuc, const int); + BCDSUB_V16QI + +[BCDSUB_EQ, __builtin_bcdsub_eq, __builtin_vec_bcdsub_eq] + signed int __builtin_vec_bcdsub_eq (vsq, vsq, const int); + BCDSUB_EQ_V1TI + signed int __builtin_vec_bcdsub_eq (vuc, vuc, const int); + BCDSUB_EQ_V16QI + +[BCDSUB_GE, __builtin_bcdsub_ge, __builtin_vec_bcdsub_ge] + signed int __builtin_vec_bcdsub_ge (vsq, vsq, const int); + BCDSUB_GE_V1TI + signed int __builtin_vec_bcdsub_ge (vuc, vuc, const int); + BCDSUB_GE_V16QI + +[BCDSUB_GT, __builtin_bcdsub_gt, __builtin_vec_bcdsub_gt] + signed int __builtin_vec_bcdsub_gt (vsq, vsq, const int); + BCDSUB_GT_V1TI + signed int __builtin_vec_bcdsub_gt (vuc, vuc, const int); + BCDSUB_GT_V16QI + +[BCDSUB_LE, __builtin_bcdsub_le, __builtin_vec_bcdsub_le] + signed int __builtin_vec_bcdsub_le (vsq, vsq, const int); + BCDSUB_LE_V1TI + signed int __builtin_vec_bcdsub_le (vuc, vuc, const int); + BCDSUB_LE_V16QI + +[BCDSUB_LT, __builtin_bcdsub_lt, __builtin_vec_bcdsub_lt] + signed int __builtin_vec_bcdsub_lt (vsq, vsq, const int); + BCDSUB_LT_V1TI + signed int __builtin_vec_bcdsub_lt (vuc, vuc, const int); + BCDSUB_LT_V16QI + +[BCDSUB_OV, __builtin_bcdsub_ov, __builtin_vec_bcdsub_ov] + signed int __builtin_vec_bcdsub_ov (vsq, vsq, const int); + BCDSUB_OV_V1TI + signed int __builtin_vec_bcdsub_ov (vuc, vuc, const int); + BCDSUB_OV_V16QI + +[BCD2DFP, __builtin_bcd2dfp, __builtin_vec_denb2dfp] + _Decimal128 __builtin_vec_denb2dfp (vuc); + DENB2DFP_V16QI + +[CRYPTO_PERMXOR, SKIP, __builtin_crypto_vpermxor] + vuc __builtin_crypto_vpermxor (vuc, vuc, vuc); + VPERMXOR_V16QI + vus __builtin_crypto_vpermxor (vus, vus, vus); + VPERMXOR_V8HI + vui __builtin_crypto_vpermxor (vui, vui, vui); + VPERMXOR_V4SI + vull __builtin_crypto_vpermxor (vull, vull, vull); + VPERMXOR_V2DI + +[CRYPTO_PMSUM, SKIP, __builtin_crypto_vpmsum] + vuc __builtin_crypto_vpmsum (vuc, vuc); + VPMSUMB VPMSUMB_C + vus __builtin_crypto_vpmsum (vus, vus); + VPMSUMH VPMSUMH_C + vui 
__builtin_crypto_vpmsum (vui, vui); + VPMSUMW VPMSUMW_C + vull __builtin_crypto_vpmsum (vull, vull); + VPMSUMD VPMSUMD_C + +[SCAL_CMPB, SKIP, __builtin_cmpb] + unsigned int __builtin_cmpb (unsigned int, unsigned int); + CMPB_32 + unsigned long long __builtin_cmpb (unsigned long long, unsigned long long); + CMPB + [VEC_ABS, vec_abs, __builtin_vec_abs] vsc __builtin_vec_abs (vsc); ABS_V16QI vss __builtin_vec_abs (vss); ABS_V8HI + vsi __builtin_vec_abs (vsi); + ABS_V4SI + vsll __builtin_vec_abs (vsll); + ABS_V2DI + vf __builtin_vec_abs (vf); + ABS_V4SF + vd __builtin_vec_abs (vd); + XVABSDP + +[VEC_ABSD, vec_absd, __builtin_vec_vadu, _ARCH_PWR9] + vuc __builtin_vec_vadu (vuc, vuc); + VADUB + vus __builtin_vec_vadu (vus, vus); + VADUH + vui __builtin_vec_vadu (vui, vui); + VADUW + +[VEC_ABSS, vec_abss, __builtin_vec_abss] + vsc __builtin_vec_abss (vsc); + ABSS_V16QI + vss __builtin_vec_abss (vss); + ABSS_V8HI + vsi __builtin_vec_abss (vsi); + ABSS_V4SI + +[VEC_ADD, vec_add, __builtin_vec_add] + vsc __builtin_vec_add (vsc, vsc); + VADDUBM VADDUBM_VSC + vuc __builtin_vec_add (vuc, vuc); + VADDUBM VADDUBM_VUC + vss __builtin_vec_add (vss, vss); + VADDUHM VADDUHM_VSS + vus __builtin_vec_add (vus, vus); + VADDUHM VADDUHM_VUS + vsi __builtin_vec_add (vsi, vsi); + VADDUWM VADDUWM_VSI + vui __builtin_vec_add (vui, vui); + VADDUWM VADDUWM_VUI + vsll __builtin_vec_add (vsll, vsll); + VADDUDM VADDUDM_VSLL + vull __builtin_vec_add (vull, vull); + VADDUDM VADDUDM_VULL + vsq __builtin_vec_add (vsq, vsq); + VADDUQM VADDUQM_VSQ + vuq __builtin_vec_add (vuq, vuq); + VADDUQM VADDUQM_VUQ + vf __builtin_vec_add (vf, vf); + VADDFP + vd __builtin_vec_add (vd, vd); + XVADDDP +; The following variants are deprecated. + vsc __builtin_vec_add (vbc, vsc); + VADDUBM VADDUBM_VBC_VSC + vsc __builtin_vec_add (vsc, vbc); + VADDUBM VADDUBM_VSC_VBC + vuc __builtin_vec_add (vbc, vuc); + VADDUBM VADDUBM_VBC_VUC + vuc __builtin_vec_add (vuc, vbc); + VADDUBM VADDUBM_VUC_VBC + vss __builtin_vec_add (vbs, vss); + VADDUHM VADDUHM_VBS_VSS + vss __builtin_vec_add (vss, vbs); + VADDUHM VADDUHM_VSS_VBS + vus __builtin_vec_add (vbs, vus); + VADDUHM VADDUHM_VBS_VUS + vus __builtin_vec_add (vus, vbs); + VADDUHM VADDUHM_VUS_VBS + vsi __builtin_vec_add (vbi, vsi); + VADDUWM VADDUWM_VBI_VSI + vsi __builtin_vec_add (vsi, vbi); + VADDUWM VADDUWM_VSI_VBI + vui __builtin_vec_add (vbi, vui); + VADDUWM VADDUWM_VBI_VUI + vui __builtin_vec_add (vui, vbi); + VADDUWM VADDUWM_VUI_VBI + vsll __builtin_vec_add (vbll, vsll); + VADDUDM VADDUDM_VBLL_VSLL + vsll __builtin_vec_add (vsll, vbll); + VADDUDM VADDUDM_VSLL_VBLL + vull __builtin_vec_add (vbll, vull); + VADDUDM VADDUDM_VBLL_VULL + vull __builtin_vec_add (vull, vbll); + VADDUDM VADDUDM_VULL_VBLL + +[VEC_ADDC, vec_addc, __builtin_vec_addc] + vsi __builtin_vec_addc (vsi, vsi); + VADDCUW VADDCUW_VSI + vui __builtin_vec_addc (vui, vui); + VADDCUW VADDCUW_VUI + vsq __builtin_vec_addc (vsq, vsq); + VADDCUQ VADDCUQ_VSQ + vuq __builtin_vec_addc (vuq, vuq); + VADDCUQ VADDCUQ_VUQ + +; TODO: Note that the entry for VEC_ADDE currently gets ignored in +; altivec_resolve_overloaded_builtin. Revisit whether we can remove +; that. We still need to register the legal builtin forms here. +[VEC_ADDE, vec_adde, __builtin_vec_adde] + vsq __builtin_vec_adde (vsq, vsq, vsq); + VADDEUQM VADDEUQM_VSQ + vuq __builtin_vec_adde (vuq, vuq, vuq); + VADDEUQM VADDEUQM_VUQ + +; TODO: Note that the entry for VEC_ADDEC currently gets ignored in +; altivec_resolve_overloaded_builtin. Revisit whether we can remove +; that. 
We still need to register the legal builtin forms here. +[VEC_ADDEC, vec_addec, __builtin_vec_addec] + vsq __builtin_vec_addec (vsq, vsq, vsq); + VADDECUQ VADDECUQ_VSQ + vuq __builtin_vec_addec (vuq, vuq, vuq); + VADDECUQ VADDECUQ_VUQ + +[VEC_ADDS, vec_adds, __builtin_vec_adds] + vuc __builtin_vec_adds (vuc, vuc); + VADDUBS + vsc __builtin_vec_adds (vsc, vsc); + VADDSBS + vus __builtin_vec_adds (vus, vus); + VADDUHS + vss __builtin_vec_adds (vss, vss); + VADDSHS + vui __builtin_vec_adds (vui, vui); + VADDUWS + vsi __builtin_vec_adds (vsi, vsi); + VADDSWS +; The following variants are deprecated. + vuc __builtin_vec_adds (vbc, vuc); + VADDUBS VADDUBS_BU + vuc __builtin_vec_adds (vuc, vbc); + VADDUBS VADDUBS_UB + vsc __builtin_vec_adds (vbc, vsc); + VADDSBS VADDSBS_BS + vsc __builtin_vec_adds (vsc, vbc); + VADDSBS VADDSBS_SB + vus __builtin_vec_adds (vbs, vus); + VADDUHS VADDUHS_BU + vus __builtin_vec_adds (vus, vbs); + VADDUHS VADDUHS_UB + vss __builtin_vec_adds (vbs, vss); + VADDSHS VADDSHS_BS + vss __builtin_vec_adds (vss, vbs); + VADDSHS VADDSHS_SB + vui __builtin_vec_adds (vbi, vui); + VADDUWS VADDUWS_BU + vui __builtin_vec_adds (vui, vbi); + VADDUWS VADDUWS_UB + vsi __builtin_vec_adds (vbi, vsi); + VADDSWS VADDSWS_BS + vsi __builtin_vec_adds (vsi, vbi); + VADDSWS VADDSWS_SB + +[VEC_AND, vec_and, __builtin_vec_and] + vsc __builtin_vec_and (vsc, vsc); + VAND_V16QI + vuc __builtin_vec_and (vuc, vuc); + VAND_V16QI_UNS VAND_VUC + vbc __builtin_vec_and (vbc, vbc); + VAND_V16QI_UNS VAND_VBC + vss __builtin_vec_and (vss, vss); + VAND_V8HI + vus __builtin_vec_and (vus, vus); + VAND_V8HI_UNS VAND_VUS + vbs __builtin_vec_and (vbs, vbs); + VAND_V8HI_UNS VAND_VBS + vsi __builtin_vec_and (vsi, vsi); + VAND_V4SI + vui __builtin_vec_and (vui, vui); + VAND_V4SI_UNS VAND_VUI + vbi __builtin_vec_and (vbi, vbi); + VAND_V4SI_UNS VAND_VBI + vsll __builtin_vec_and (vsll, vsll); + VAND_V2DI + vull __builtin_vec_and (vull, vull); + VAND_V2DI_UNS VAND_VULL + vbll __builtin_vec_and (vbll, vbll); + VAND_V2DI_UNS VAND_VBLL + vf __builtin_vec_and (vf, vf); + VAND_V4SF + vd __builtin_vec_and (vd, vd); + VAND_V2DF +; The following variants are deprecated. 
+ vsc __builtin_vec_and (vsc, vbc); + VAND_V16QI VAND_VSC_VBC + vsc __builtin_vec_and (vbc, vsc); + VAND_V16QI VAND_VBC_VSC + vuc __builtin_vec_and (vuc, vbc); + VAND_V16QI_UNS VAND_VUC_VBC + vuc __builtin_vec_and (vbc, vuc); + VAND_V16QI_UNS VAND_VBC_VUC + vss __builtin_vec_and (vss, vbs); + VAND_V8HI VAND_VSS_VBS + vss __builtin_vec_and (vbs, vss); + VAND_V8HI VAND_VBS_VSS + vus __builtin_vec_and (vus, vbs); + VAND_V8HI_UNS VAND_VUS_VBS + vus __builtin_vec_and (vbs, vus); + VAND_V8HI_UNS VAND_VBS_VUS + vsi __builtin_vec_and (vsi, vbi); + VAND_V4SI VAND_VSI_VBI + vsi __builtin_vec_and (vbi, vsi); + VAND_V4SI VAND_VBI_VSI + vui __builtin_vec_and (vui, vbi); + VAND_V4SI_UNS VAND_VUI_VBI + vui __builtin_vec_and (vbi, vui); + VAND_V4SI_UNS VAND_VBI_VUI + vsll __builtin_vec_and (vsll, vbll); + VAND_V2DI VAND_VSLL_VBLL + vsll __builtin_vec_and (vbll, vsll); + VAND_V2DI VAND_VBLL_VSLL + vull __builtin_vec_and (vull, vbll); + VAND_V2DI_UNS VAND_VULL_VBLL + vull __builtin_vec_and (vbll, vull); + VAND_V2DI_UNS VAND_VBLL_VULL + vf __builtin_vec_and (vf, vbi); + VAND_V4SF VAND_VF_VBI + vf __builtin_vec_and (vbi, vf); + VAND_V4SF VAND_VBI_VF + vd __builtin_vec_and (vd, vbll); + VAND_V2DF VAND_VD_VBLL + vd __builtin_vec_and (vbll, vd); + VAND_V2DF VAND_VBLL_VD + +[VEC_ANDC, vec_andc, __builtin_vec_andc] + vbc __builtin_vec_andc (vbc, vbc); + VANDC_V16QI_UNS VANDC_VBC + vsc __builtin_vec_andc (vsc, vsc); + VANDC_V16QI + vuc __builtin_vec_andc (vuc, vuc); + VANDC_V16QI_UNS VANDC_VUC + vbs __builtin_vec_andc (vbs, vbs); + VANDC_V8HI_UNS VANDC_VBS + vss __builtin_vec_andc (vss, vss); + VANDC_V8HI + vus __builtin_vec_andc (vus, vus); + VANDC_V8HI_UNS VANDC_VUS + vbi __builtin_vec_andc (vbi, vbi); + VANDC_V4SI_UNS VANDC_VBI + vsi __builtin_vec_andc (vsi, vsi); + VANDC_V4SI + vui __builtin_vec_andc (vui, vui); + VANDC_V4SI_UNS VANDC_VUI + vbll __builtin_vec_andc (vbll, vbll); + VANDC_V2DI_UNS VANDC_VBLL + vsll __builtin_vec_andc (vsll, vsll); + VANDC_V2DI + vull __builtin_vec_andc (vull, vull); + VANDC_V2DI_UNS VANDC_VULL + vf __builtin_vec_andc (vf, vf); + VANDC_V4SF + vd __builtin_vec_andc (vd, vd); + VANDC_V2DF +; The following variants are deprecated. 
+ vsc __builtin_vec_andc (vsc, vbc); + VANDC_V16QI VANDC_VSC_VBC + vsc __builtin_vec_andc (vbc, vsc); + VANDC_V16QI VANDC_VBC_VSC + vuc __builtin_vec_andc (vuc, vbc); + VANDC_V16QI_UNS VANDC_VUC_VBC + vuc __builtin_vec_andc (vbc, vuc); + VANDC_V16QI_UNS VANDC_VBC_VUC + vss __builtin_vec_andc (vss, vbs); + VANDC_V8HI VANDC_VSS_VBS + vss __builtin_vec_andc (vbs, vss); + VANDC_V8HI VANDC_VBS_VSS + vus __builtin_vec_andc (vus, vbs); + VANDC_V8HI_UNS VANDC_VUS_VBS + vus __builtin_vec_andc (vbs, vus); + VANDC_V8HI_UNS VANDC_VBS_VUS + vsi __builtin_vec_andc (vsi, vbi); + VANDC_V4SI VANDC_VSI_VBI + vsi __builtin_vec_andc (vbi, vsi); + VANDC_V4SI VANDC_VBI_VSI + vui __builtin_vec_andc (vui, vbi); + VANDC_V4SI_UNS VANDC_VUI_VBI + vui __builtin_vec_andc (vbi, vui); + VANDC_V4SI_UNS VANDC_VBI_VUI + vsll __builtin_vec_andc (vsll, vbll); + VANDC_V2DI VANDC_VSLL_VBLL + vsll __builtin_vec_andc (vbll, vsll); + VANDC_V2DI VANDC_VBLL_VSLL + vull __builtin_vec_andc (vull, vbll); + VANDC_V2DI_UNS VANDC_VULL_VBLL + vull __builtin_vec_andc (vbll, vull); + VANDC_V2DI_UNS VANDC_VBLL_VULL + vf __builtin_vec_andc (vf, vbi); + VANDC_V4SF VANDC_VF_VBI + vf __builtin_vec_andc (vbi, vf); + VANDC_V4SF VANDC_VBI_VF + vd __builtin_vec_andc (vd, vbll); + VANDC_V2DF VANDC_VD_VBLL + vd __builtin_vec_andc (vbll, vd); + VANDC_V2DF VANDC_VBLL_VD + +[VEC_AVG, vec_avg, __builtin_vec_avg] + vsc __builtin_vec_avg (vsc, vsc); + VAVGSB + vuc __builtin_vec_avg (vuc, vuc); + VAVGUB + vss __builtin_vec_avg (vss, vss); + VAVGSH + vus __builtin_vec_avg (vus, vus); + VAVGUH + vsi __builtin_vec_avg (vsi, vsi); + VAVGSW + vui __builtin_vec_avg (vui, vui); + VAVGUW + +[VEC_BLENDV, vec_blendv, __builtin_vec_xxblend, _ARCH_PWR10] + vsc __builtin_vec_xxblend (vsc, vsc, vuc); + VXXBLEND_V16QI VXXBLEND_VSC + vuc __builtin_vec_xxblend (vuc, vuc, vuc); + VXXBLEND_V16QI VXXBLEND_VUC + vss __builtin_vec_xxblend (vss, vss, vus); + VXXBLEND_V8HI VXXBLEND_VSS + vus __builtin_vec_xxblend (vus, vus, vus); + VXXBLEND_V8HI VXXBLEND_VUS + vsi __builtin_vec_xxblend (vsi, vsi, vui); + VXXBLEND_V4SI VXXBLEND_VSI + vui __builtin_vec_xxblend (vui, vui, vui); + VXXBLEND_V4SI VXXBLEND_VUI + vsll __builtin_vec_xxblend (vsll, vsll, vull); + VXXBLEND_V2DI VXXBLEND_VSLL + vull __builtin_vec_xxblend (vull, vull, vull); + VXXBLEND_V2DI VXXBLEND_VULL + vf __builtin_vec_xxblend (vf, vf, vui); + VXXBLEND_V4SF + vd __builtin_vec_xxblend (vd, vd, vull); + VXXBLEND_V2DF + +[VEC_BPERM, vec_bperm, __builtin_vec_vbperm_api, _ARCH_PWR8] + vull __builtin_vec_vbperm_api (vull, vuc); + VBPERMD VBPERMD_VULL + vull __builtin_vec_vbperm_api (vuq, vuc); + VBPERMQ VBPERMQ_VUQ + vuc __builtin_vec_vbperm_api (vuc, vuc); + VBPERMQ2 VBPERMQ2_U + vsc __builtin_vec_vbperm_api (vsc, vsc); + VBPERMQ2 VBPERMQ2_S + +[VEC_CEIL, vec_ceil, __builtin_vec_ceil] + vf __builtin_vec_ceil (vf); + VRFIP + vd __builtin_vec_ceil (vd); + XVRDPIP + +[VEC_CFUGE, vec_cfuge, __builtin_vec_cfuge, _ARCH_PWR10] + vull __builtin_vec_cfuge (vull, vull); + VCFUGED + +[VEC_CIPHER_BE, vec_cipher_be, __builtin_vec_vcipher_be, _ARCH_PWR8] + vuc __builtin_vec_vcipher_be (vuc, vuc); + VCIPHER_BE + +[VEC_CIPHERLAST_BE, vec_cipherlast_be, __builtin_vec_vcipherlast_be, _ARCH_PWR8] + vuc __builtin_vec_vcipherlast_be (vuc, vuc); + VCIPHERLAST_BE + +[VEC_CLRL, vec_clrl, __builtin_vec_clrl, _ARCH_PWR10] + vsc __builtin_vec_clrl (vsc, unsigned int); + VCLRLB VCLRLB_S + vuc __builtin_vec_clrl (vuc, unsigned int); + VCLRLB VCLRLB_U + +[VEC_CLRR, vec_clrr, __builtin_vec_clrr, _ARCH_PWR10] + vsc __builtin_vec_clrr (vsc, unsigned int); + 
VCLRRB VCLRRB_S + vuc __builtin_vec_clrr (vuc, unsigned int); + VCLRRB VCLRRB_U + +; We skip generating a #define because of the C-versus-C++ complexity +; in altivec.h. Look there for the template-y details. +[VEC_CMPAE_P, SKIP, __builtin_vec_vcmpae_p] + signed int __builtin_vec_vcmpae_p (vsc, vsc); + VCMPAEB_P VCMPAEB_VSC_P + signed int __builtin_vec_vcmpae_p (vuc, vuc); + VCMPAEB_P VCMPAEB_VUC_P + signed int __builtin_vec_vcmpae_p (vbc, vbc); + VCMPAEB_P VCMPAEB_VBC_P + signed int __builtin_vec_vcmpae_p (vss, vss); + VCMPAEH_P VCMPAEH_VSS_P + signed int __builtin_vec_vcmpae_p (vus, vus); + VCMPAEH_P VCMPAEH_VUS_P + signed int __builtin_vec_vcmpae_p (vbs, vbs); + VCMPAEH_P VCMPAEH_VBS_P + signed int __builtin_vec_vcmpae_p (vp, vp); + VCMPAEH_P VCMPAEH_VP_P + signed int __builtin_vec_vcmpae_p (vsi, vsi); + VCMPAEW_P VCMPAEW_VSI_P + signed int __builtin_vec_vcmpae_p (vui, vui); + VCMPAEW_P VCMPAEW_VUI_P + signed int __builtin_vec_vcmpae_p (vbi, vbi); + VCMPAEW_P VCMPAEW_VBI_P + signed int __builtin_vec_vcmpae_p (vsll, vsll); + VCMPAED_P VCMPAED_VSLL_P + signed int __builtin_vec_vcmpae_p (vull, vull); + VCMPAED_P VCMPAED_VULL_P + signed int __builtin_vec_vcmpae_p (vbll, vbll); + VCMPAED_P VCMPAED_VBLL_P + signed int __builtin_vec_vcmpae_p (vsq, vsq); + VCMPAET_P VCMPAET_VSQ_P + signed int __builtin_vec_vcmpae_p (vuq, vuq); + VCMPAET_P VCMPAET_VUQ_P + signed int __builtin_vec_vcmpae_p (vf, vf); + VCMPAEFP_P + signed int __builtin_vec_vcmpae_p (vd, vd); + VCMPAEDP_P +; The following variants are deprecated. + signed int __builtin_vec_vcmpae_p (signed int, vbc, vuc); + VCMPAEB_P VCMPAEB_P_BU + signed int __builtin_vec_vcmpae_p (signed int, vuc, vbc); + VCMPAEB_P VCMPAEB_P_UB + signed int __builtin_vec_vcmpae_p (signed int, vbc, vsc); + VCMPAEB_P VCMPAEB_P_BS + signed int __builtin_vec_vcmpae_p (signed int, vsc, vbc); + VCMPAEB_P VCMPAEB_P_SB + signed int __builtin_vec_vcmpae_p (signed int, vbs, vus); + VCMPAEH_P VCMPAEH_P_BU + signed int __builtin_vec_vcmpae_p (signed int, vus, vbs); + VCMPAEH_P VCMPAEH_P_UB + signed int __builtin_vec_vcmpae_p (signed int, vbs, vss); + VCMPAEH_P VCMPAEH_P_BS + signed int __builtin_vec_vcmpae_p (signed int, vss, vbs); + VCMPAEH_P VCMPAEH_P_SB + signed int __builtin_vec_vcmpae_p (signed int, vbi, vui); + VCMPAEW_P VCMPAEW_P_BU + signed int __builtin_vec_vcmpae_p (signed int, vui, vbi); + VCMPAEW_P VCMPAEW_P_UB + signed int __builtin_vec_vcmpae_p (signed int, vbi, vsi); + VCMPAEW_P VCMPAEW_P_BS + signed int __builtin_vec_vcmpae_p (signed int, vsi, vbi); + VCMPAEW_P VCMPAEW_P_SB + signed int __builtin_vec_vcmpae_p (signed int, vbll, vull); + VCMPAED_P VCMPAED_P_BU + signed int __builtin_vec_vcmpae_p (signed int, vull, vbll); + VCMPAED_P VCMPAED_P_UB + signed int __builtin_vec_vcmpae_p (signed int, vbll, vsll); + VCMPAED_P VCMPAED_P_BS + signed int __builtin_vec_vcmpae_p (signed int, vbll, vsll); + VCMPAED_P VCMPAED_P_SB + +[VEC_CMPB, vec_cmpb, __builtin_vec_cmpb] + vsi __builtin_vec_cmpb (vf, vf); + VCMPBFP + +[VEC_CMPEQ, vec_cmpeq, __builtin_vec_cmpeq] + vbc __builtin_vec_cmpeq (vsc, vsc); + VCMPEQUB VCMPEQUB_VSC + vbc __builtin_vec_cmpeq (vuc, vuc); + VCMPEQUB VCMPEQUB_VUC + vbc __builtin_vec_cmpeq (vbc, vbc); + VCMPEQUB VCMPEQUB_VBC + vbs __builtin_vec_cmpeq (vss, vss); + VCMPEQUH VCMPEQUH_VSS + vbs __builtin_vec_cmpeq (vus, vus); + VCMPEQUH VCMPEQUH_VUS + vbs __builtin_vec_cmpeq (vbs, vbs); + VCMPEQUH VCMPEQUH_VBS + vbi __builtin_vec_cmpeq (vsi, vsi); + VCMPEQUW VCMPEQUW_VSI + vbi __builtin_vec_cmpeq (vui, vui); + VCMPEQUW VCMPEQUW_VUI + vbi 
__builtin_vec_cmpeq (vbi, vbi); + VCMPEQUW VCMPEQUW_VBI + vbll __builtin_vec_cmpeq (vsll, vsll); + VCMPEQUD VCMPEQUD_VSLL + vbll __builtin_vec_cmpeq (vull, vull); + VCMPEQUD VCMPEQUD_VULL + vbll __builtin_vec_cmpeq (vbll, vbll); + VCMPEQUD VCMPEQUD_VBLL + vbq __builtin_vec_cmpeq (vsq, vsq); + VCMPEQUT VCMPEQUT_VSQ + vbq __builtin_vec_cmpeq (vuq, vuq); + VCMPEQUT VCMPEQUT_VUQ + vbi __builtin_vec_cmpeq (vf, vf); + VCMPEQFP + vbll __builtin_vec_cmpeq (vd, vd); + XVCMPEQDP + +; We skip generating a #define because of the C-versus-C++ complexity +; in altivec.h. Look there for the template-y details. +[VEC_CMPEQ_P, SKIP, __builtin_vec_vcmpeq_p] + signed int __builtin_vec_vcmpeq_p (signed int, vuc, vuc); + VCMPEQUB_P VCMPEQUB_PU + signed int __builtin_vec_vcmpeq_p (signed int, vsc, vsc); + VCMPEQUB_P VCMPEQUB_PS + signed int __builtin_vec_vcmpeq_p (signed int, vbc, vbc); + VCMPEQUB_P VCMPEQUB_PB + signed int __builtin_vec_vcmpeq_p (signed int, vus, vus); + VCMPEQUH_P VCMPEQUH_PU + signed int __builtin_vec_vcmpeq_p (signed int, vss, vss); + VCMPEQUH_P VCMPEQUH_PS + signed int __builtin_vec_vcmpeq_p (signed int, vbs, vbs); + VCMPEQUH_P VCMPEQUH_PB + signed int __builtin_vec_vcmpeq_p (signed int, vp, vp); + VCMPEQUH_P VCMPEQUH_PP + signed int __builtin_vec_vcmpeq_p (signed int, vui, vui); + VCMPEQUW_P VCMPEQUW_PU + signed int __builtin_vec_vcmpeq_p (signed int, vsi, vsi); + VCMPEQUW_P VCMPEQUW_PS + signed int __builtin_vec_vcmpeq_p (signed int, vbi, vbi); + VCMPEQUW_P VCMPEQUW_PB + signed int __builtin_vec_vcmpeq_p (signed int, vull, vull); + VCMPEQUD_P VCMPEQUD_PU + signed int __builtin_vec_vcmpeq_p (signed int, vsll, vsll); + VCMPEQUD_P VCMPEQUD_PS + signed int __builtin_vec_vcmpeq_p (signed int, vbll, vbll); + VCMPEQUD_P VCMPEQUD_PB + signed int __builtin_vec_vcmpeq_p (signed int, vsq, vsq); + VCMPEQUT_P VCMPEQUT_P_VSQ + signed int __builtin_vec_vcmpeq_p (signed int, vuq, vuq); + VCMPEQUT_P VCMPEQUT_P_VUQ + signed int __builtin_vec_vcmpeq_p (signed int, vf, vf); + VCMPEQFP_P + signed int __builtin_vec_vcmpeq_p (signed int, vd, vd); + XVCMPEQDP_P +; The following variants are deprecated. 
+ signed int __builtin_vec_vcmpeq_p (signed int, vbc, vuc); + VCMPEQUB_P VCMPEQUB_P_BU + signed int __builtin_vec_vcmpeq_p (signed int, vuc, vbc); + VCMPEQUB_P VCMPEQUB_P_UB + signed int __builtin_vec_vcmpeq_p (signed int, vbc, vsc); + VCMPEQUB_P VCMPEQUB_P_BS + signed int __builtin_vec_vcmpeq_p (signed int, vsc, vbc); + VCMPEQUB_P VCMPEQUB_P_SB + signed int __builtin_vec_vcmpeq_p (signed int, vbs, vus); + VCMPEQUH_P VCMPEQUH_P_BU + signed int __builtin_vec_vcmpeq_p (signed int, vus, vbs); + VCMPEQUH_P VCMPEQUH_P_UB + signed int __builtin_vec_vcmpeq_p (signed int, vbs, vss); + VCMPEQUH_P VCMPEQUH_P_BS + signed int __builtin_vec_vcmpeq_p (signed int, vss, vbs); + VCMPEQUH_P VCMPEQUH_P_SB + signed int __builtin_vec_vcmpeq_p (signed int, vbi, vui); + VCMPEQUW_P VCMPEQUW_P_BU + signed int __builtin_vec_vcmpeq_p (signed int, vui, vbi); + VCMPEQUW_P VCMPEQUW_P_UB + signed int __builtin_vec_vcmpeq_p (signed int, vbi, vsi); + VCMPEQUW_P VCMPEQUW_P_BS + signed int __builtin_vec_vcmpeq_p (signed int, vsi, vbi); + VCMPEQUW_P VCMPEQUW_P_SB + signed int __builtin_vec_vcmpeq_p (signed int, vbll, vull); + VCMPEQUD_P VCMPEQUD_P_BU + signed int __builtin_vec_vcmpeq_p (signed int, vull, vbll); + VCMPEQUD_P VCMPEQUD_P_UB + signed int __builtin_vec_vcmpeq_p (signed int, vbll, vsll); + VCMPEQUD_P VCMPEQUD_P_BS + signed int __builtin_vec_vcmpeq_p (signed int, vbll, vsll); + VCMPEQUD_P VCMPEQUD_P_SB + +[VEC_CMPEQB, SKIP, __builtin_byte_in_set] + signed int __builtin_byte_in_set (unsigned int, unsigned long long); + CMPEQB + +[VEC_CMPGE, vec_cmpge, __builtin_vec_cmpge] + vbc __builtin_vec_cmpge (vsc, vsc); + CMPGE_16QI CMPGE_16QI_VSC + vbc __builtin_vec_cmpge (vuc, vuc); + CMPGE_U16QI CMPGE_16QI_VUC + vbs __builtin_vec_cmpge (vss, vss); + CMPGE_8HI CMPGE_8HI_VSS + vbs __builtin_vec_cmpge (vus, vus); + CMPGE_U8HI CMPGE_8HI_VUS + vbi __builtin_vec_cmpge (vsi, vsi); + CMPGE_4SI CMPGE_4SI_VSI + vbi __builtin_vec_cmpge (vui, vui); + CMPGE_U4SI CMPGE_4SI_VUI + vbll __builtin_vec_cmpge (vsll, vsll); + CMPGE_2DI CMPGE_2DI_VSLL + vbll __builtin_vec_cmpge (vull, vull); + CMPGE_U2DI CMPGE_2DI_VULL + vbq __builtin_vec_cmpge (vsq, vsq); + CMPGE_1TI + vbq __builtin_vec_cmpge (vuq, vuq); + CMPGE_U1TI + vbi __builtin_vec_cmpge (vf, vf); + VCMPGEFP + vbll __builtin_vec_cmpge (vd, vd); + XVCMPGEDP + +; We skip generating a #define because of the C-versus-C++ complexity +; in altivec.h. Look there for the template-y details. +; See altivec_build_resolved_builtin for how we deal with VEC_CMPGE_P. +; It's quite strange and horrible! 
+[VEC_CMPGE_P, SKIP, __builtin_vec_vcmpge_p] + signed int __builtin_vec_vcmpge_p (signed int, vuc, vuc); + VCMPGTUB_P VCMPGTUB_PR + signed int __builtin_vec_vcmpge_p (signed int, vsc, vsc); + VCMPGTSB_P VCMPGTSB_PR + signed int __builtin_vec_vcmpge_p (signed int, vus, vus); + VCMPGTUH_P VCMPGTUH_PR + signed int __builtin_vec_vcmpge_p (signed int, vss, vss); + VCMPGTSH_P VCMPGTSH_PR + signed int __builtin_vec_vcmpge_p (signed int, vui, vui); + VCMPGTUW_P VCMPGTUW_PR + signed int __builtin_vec_vcmpge_p (signed int, vsi, vsi); + VCMPGTSW_P VCMPGTSW_PR + signed int __builtin_vec_vcmpge_p (signed int, vull, vull); + VCMPGTUD_P VCMPGTUD_PR + signed int __builtin_vec_vcmpge_p (signed int, vsll, vsll); + VCMPGTSD_P VCMPGTSD_PR + signed int __builtin_vec_vcmpge_p (signed int, vuq, vuq); + VCMPGTUT_P VCMPGTUT_PR + signed int __builtin_vec_vcmpge_p (signed int, vsq, vsq); + VCMPGTST_P VCMPGTST_PR + signed int __builtin_vec_vcmpge_p (signed int, vf, vf); + VCMPGEFP_P + signed int __builtin_vec_vcmpge_p (signed int, vd, vd); + XVCMPGEDP_P +; The following variants are deprecated. + signed int __builtin_vec_vcmpge_p (signed int, vbc, vuc); + VCMPGTUB_P VCMPGTUB_PR_BU + signed int __builtin_vec_vcmpge_p (signed int, vuc, vbc); + VCMPGTUB_P VCMPGTUB_PR_UB + signed int __builtin_vec_vcmpge_p (signed int, vbc, vsc); + VCMPGTSB_P VCMPGTSB_PR_BS + signed int __builtin_vec_vcmpge_p (signed int, vsc, vbc); + VCMPGTSB_P VCMPGTSB_PR_SB + signed int __builtin_vec_vcmpge_p (signed int, vbs, vus); + VCMPGTUH_P VCMPGTUH_PR_BU + signed int __builtin_vec_vcmpge_p (signed int, vus, vbs); + VCMPGTUH_P VCMPGTUH_PR_UB + signed int __builtin_vec_vcmpge_p (signed int, vbs, vss); + VCMPGTSH_P VCMPGTSH_PR_BS + signed int __builtin_vec_vcmpge_p (signed int, vss, vbs); + VCMPGTSH_P VCMPGTSH_PR_SB + signed int __builtin_vec_vcmpge_p (signed int, vbi, vui); + VCMPGTUW_P VCMPGTUW_PR_BU + signed int __builtin_vec_vcmpge_p (signed int, vui, vbi); + VCMPGTUW_P VCMPGTUW_PR_UB + signed int __builtin_vec_vcmpge_p (signed int, vbi, vsi); + VCMPGTSW_P VCMPGTSW_PR_BS + signed int __builtin_vec_vcmpge_p (signed int, vsi, vbi); + VCMPGTSW_P VCMPGTSW_PR_SB + signed int __builtin_vec_vcmpge_p (signed int, vbll, vull); + VCMPGTUD_P VCMPGTUD_PR_BU + signed int __builtin_vec_vcmpge_p (signed int, vull, vbll); + VCMPGTUD_P VCMPGTUD_PR_UB + signed int __builtin_vec_vcmpge_p (signed int, vbll, vsll); + VCMPGTSD_P VCMPGTSD_PR_BS + signed int __builtin_vec_vcmpge_p (signed int, vsll, vbll); + VCMPGTSD_P VCMPGTSD_PR_SB + +[VEC_CMPGT, vec_cmpgt, __builtin_vec_cmpgt] + vbc __builtin_vec_cmpgt (vsc, vsc); + VCMPGTSB + vbc __builtin_vec_cmpgt (vuc, vuc); + VCMPGTUB + vbs __builtin_vec_cmpgt (vss, vss); + VCMPGTSH + vbs __builtin_vec_cmpgt (vus, vus); + VCMPGTUH + vbi __builtin_vec_cmpgt (vsi, vsi); + VCMPGTSW + vbi __builtin_vec_cmpgt (vui, vui); + VCMPGTUW + vbll __builtin_vec_cmpgt (vsll, vsll); + VCMPGTSD + vbll __builtin_vec_cmpgt (vull, vull); + VCMPGTUD + vbq __builtin_vec_cmpgt (vsq, vsq); + VCMPGTST + vbq __builtin_vec_cmpgt (vuq, vuq); + VCMPGTUT + vbi __builtin_vec_cmpgt (vf, vf); + VCMPGTFP + vbll __builtin_vec_cmpgt (vd, vd); + XVCMPGTDP + +; We skip generating a #define because of the C-versus-C++ complexity +; in altivec.h. Look there for the template-y details. 
+[VEC_CMPGT_P, SKIP, __builtin_vec_vcmpgt_p] + signed int __builtin_vec_vcmpgt_p (signed int, vuc, vuc); + VCMPGTUB_P + signed int __builtin_vec_vcmpgt_p (signed int, vsc, vsc); + VCMPGTSB_P + signed int __builtin_vec_vcmpgt_p (signed int, vus, vus); + VCMPGTUH_P + signed int __builtin_vec_vcmpgt_p (signed int, vss, vss); + VCMPGTSH_P + signed int __builtin_vec_vcmpgt_p (signed int, vui, vui); + VCMPGTUW_P + signed int __builtin_vec_vcmpgt_p (signed int, vsi, vsi); + VCMPGTSW_P + signed int __builtin_vec_vcmpgt_p (signed int, vull, vull); + VCMPGTUD_P + signed int __builtin_vec_vcmpgt_p (signed int, vsll, vsll); + VCMPGTSD_P + signed int __builtin_vec_vcmpgt_p (signed int, vuq, vuq); + VCMPGTUT_P + signed int __builtin_vec_vcmpgt_p (signed int, vsq, vsq); + VCMPGTST_P + signed int __builtin_vec_vcmpgt_p (signed int, vf, vf); + VCMPGTFP_P + signed int __builtin_vec_vcmpgt_p (signed int, vd, vd); + XVCMPGTDP_P +; The following variants are deprecated. + signed int __builtin_vec_vcmpgt_p (signed int, vbc, vuc); + VCMPGTUB_P VCMPGTUB_P_BU + signed int __builtin_vec_vcmpgt_p (signed int, vuc, vbc); + VCMPGTUB_P VCMPGTUB_P_UB + signed int __builtin_vec_vcmpgt_p (signed int, vbc, vsc); + VCMPGTSB_P VCMPGTSB_P_BS + signed int __builtin_vec_vcmpgt_p (signed int, vsc, vbc); + VCMPGTSB_P VCMPGTSB_P_SB + signed int __builtin_vec_vcmpgt_p (signed int, vbs, vus); + VCMPGTUH_P VCMPGTUH_P_BU + signed int __builtin_vec_vcmpgt_p (signed int, vus, vbs); + VCMPGTUH_P VCMPGTUH_P_UB + signed int __builtin_vec_vcmpgt_p (signed int, vbs, vss); + VCMPGTSH_P VCMPGTSH_P_BS + signed int __builtin_vec_vcmpgt_p (signed int, vss, vbs); + VCMPGTSH_P VCMPGTSH_P_SB + signed int __builtin_vec_vcmpgt_p (signed int, vbi, vui); + VCMPGTUW_P VCMPGTUW_P_BU + signed int __builtin_vec_vcmpgt_p (signed int, vui, vbi); + VCMPGTUW_P VCMPGTUW_P_UB + signed int __builtin_vec_vcmpgt_p (signed int, vbi, vsi); + VCMPGTSW_P VCMPGTSW_P_BS + signed int __builtin_vec_vcmpgt_p (signed int, vsi, vbi); + VCMPGTSW_P VCMPGTSW_P_SB + signed int __builtin_vec_vcmpgt_p (signed int, vbll, vull); + VCMPGTUD_P VCMPGTUD_P_BU + signed int __builtin_vec_vcmpgt_p (signed int, vull, vbll); + VCMPGTUD_P VCMPGTUD_P_UB + signed int __builtin_vec_vcmpgt_p (signed int, vbll, vsll); + VCMPGTSD_P VCMPGTSD_P_BS + signed int __builtin_vec_vcmpgt_p (signed int, vsll, vbll); + VCMPGTSD_P VCMPGTSD_P_SB + +; Note that there is no entry for VEC_CMPLE. VEC_CMPLE is implemented +; using VEC_CMPGE with reversed arguments in altivec.h. + +; Note that there is no entry for VEC_CMPLT. VEC_CMPLT is implemented +; using VEC_CMPGT with reversed arguments in altivec.h. + +[VEC_CMPNE, vec_cmpne, __builtin_vec_cmpne] + vbc __builtin_vec_cmpne (vbc, vbc); + VCMPNEB VCMPNEB_VBC + vbc __builtin_vec_cmpne (vsc, vsc); + VCMPNEB VCMPNEB_VSC + vbc __builtin_vec_cmpne (vuc, vuc); + VCMPNEB VCMPNEB_VUC + vbs __builtin_vec_cmpne (vbs, vbs); + VCMPNEH VCMPNEH_VBS + vbs __builtin_vec_cmpne (vss, vss); + VCMPNEH VCMPNEH_VSS + vbs __builtin_vec_cmpne (vus, vus); + VCMPNEH VCMPNEH_VUS + vbi __builtin_vec_cmpne (vbi, vbi); + VCMPNEW VCMPNEW_VBI + vbi __builtin_vec_cmpne (vsi, vsi); + VCMPNEW VCMPNEW_VSI + vbi __builtin_vec_cmpne (vui, vui); + VCMPNEW VCMPNEW_VUI + vbq __builtin_vec_cmpne (vsq, vsq); + VCMPNET VCMPNET_VSQ + vbq __builtin_vec_cmpne (vuq, vuq); + VCMPNET VCMPNET_VUQ + +; We skip generating a #define because of the C-versus-C++ complexity +; in altivec.h. Look there for the template-y details. 
+[VEC_CMPNE_P, SKIP, __builtin_vec_vcmpne_p] + signed int __builtin_vec_vcmpne_p (vsc, vsc); + VCMPNEB_P VCMPNEB_VSC_P + signed int __builtin_vec_vcmpne_p (vuc, vuc); + VCMPNEB_P VCMPNEB_VUC_P + signed int __builtin_vec_vcmpne_p (vbc, vbc); + VCMPNEB_P VCMPNEB_VBC_P + signed int __builtin_vec_vcmpne_p (vss, vss); + VCMPNEH_P VCMPNEH_VSS_P + signed int __builtin_vec_vcmpne_p (vus, vus); + VCMPNEH_P VCMPNEH_VUS_P + signed int __builtin_vec_vcmpne_p (vbs, vbs); + VCMPNEH_P VCMPNEH_VBS_P + signed int __builtin_vec_vcmpne_p (vp, vp); + VCMPNEH_P VCMPNEH_VP_P + signed int __builtin_vec_vcmpne_p (vsi, vsi); + VCMPNEW_P VCMPNEW_VSI_P + signed int __builtin_vec_vcmpne_p (vui, vui); + VCMPNEW_P VCMPNEW_VUI_P + signed int __builtin_vec_vcmpne_p (vbi, vbi); + VCMPNEW_P VCMPNEW_VBI_P + signed int __builtin_vec_vcmpne_p (vsll, vsll); + VCMPNED_P VCMPNED_VSLL_P + signed int __builtin_vec_vcmpne_p (vull, vull); + VCMPNED_P VCMPNED_VULL_P + signed int __builtin_vec_vcmpne_p (vbll, vbll); + VCMPNED_P VCMPNED_VBLL_P + signed int __builtin_vec_vcmpne_p (vsq, vsq); + VCMPNET_P VCMPNET_VSQ_P + signed int __builtin_vec_vcmpne_p (vuq, vuq); + VCMPNET_P VCMPNET_VUQ_P + signed int __builtin_vec_vcmpne_p (vf, vf); + VCMPNEFP_P + signed int __builtin_vec_vcmpne_p (vd, vd); + VCMPNEDP_P +; The following variants are deprecated. + signed int __builtin_vec_vcmpne_p (signed int, vbc, vuc); + VCMPNEB_P VCMPNEB_P_BU + signed int __builtin_vec_vcmpne_p (signed int, vuc, vbc); + VCMPNEB_P VCMPNEB_P_UB + signed int __builtin_vec_vcmpne_p (signed int, vbc, vsc); + VCMPNEB_P VCMPNEB_P_BS + signed int __builtin_vec_vcmpne_p (signed int, vsc, vbc); + VCMPNEB_P VCMPNEB_P_SB + signed int __builtin_vec_vcmpne_p (signed int, vbs, vus); + VCMPNEH_P VCMPNEH_P_BU + signed int __builtin_vec_vcmpne_p (signed int, vus, vbs); + VCMPNEH_P VCMPNEH_P_UB + signed int __builtin_vec_vcmpne_p (signed int, vbs, vss); + VCMPNEH_P VCMPNEH_P_BS + signed int __builtin_vec_vcmpne_p (signed int, vss, vbs); + VCMPNEH_P VCMPNEH_P_SB + signed int __builtin_vec_vcmpne_p (signed int, vbi, vui); + VCMPNEW_P VCMPNEW_P_BU + signed int __builtin_vec_vcmpne_p (signed int, vui, vbi); + VCMPNEW_P VCMPNEW_P_UB + signed int __builtin_vec_vcmpne_p (signed int, vbi, vsi); + VCMPNEW_P VCMPNEW_P_BS + signed int __builtin_vec_vcmpne_p (signed int, vsi, vbi); + VCMPNEW_P VCMPNEW_P_SB + signed int __builtin_vec_vcmpne_p (signed int, vbll, vull); + VCMPNED_P VCMPNED_P_BU + signed int __builtin_vec_vcmpne_p (signed int, vull, vbll); + VCMPNED_P VCMPNED_P_UB + signed int __builtin_vec_vcmpne_p (signed int, vbll, vsll); + VCMPNED_P VCMPNED_P_BS + signed int __builtin_vec_vcmpne_p (signed int, vbll, vsll); + VCMPNED_P VCMPNED_P_SB + +[VEC_CMPNEZ, vec_cmpnez, __builtin_vec_vcmpnez, _ARCH_PWR9] + vbc __builtin_vec_cmpnez (vsc, vsc); + CMPNEZB CMPNEZB_S + vbc __builtin_vec_cmpnez (vuc, vuc); + CMPNEZB CMPNEZB_U + vbs __builtin_vec_cmpnez (vss, vss); + CMPNEZH CMPNEZH_S + vbs __builtin_vec_cmpnez (vus, vus); + CMPNEZH CMPNEZH_U + vbi __builtin_vec_cmpnez (vsi, vsi); + CMPNEZW CMPNEZW_S + vbi __builtin_vec_cmpnez (vui, vui); + CMPNEZW CMPNEZW_U + +; We skip generating a #define because of the C-versus-C++ complexity +; in altivec.h. Look there for the template-y details. 
+[VEC_CMPNEZ_P, SKIP, __builtin_vec_vcmpnez_p] + signed int __builtin_vec_vcmpnez_p (signed int, vsc, vsc); + VCMPNEZB_P VCMPNEZB_VSC_P + signed int __builtin_vec_vcmpnez_p (signed int, vuc, vuc); + VCMPNEZB_P VCMPNEZB_VUC_P + signed int __builtin_vec_vcmpnez_p (signed int, vss, vss); + VCMPNEZH_P VCMPNEZH_VSS_P + signed int __builtin_vec_vcmpnez_p (signed int, vus, vus); + VCMPNEZH_P VCMPNEZH_VUS_P + signed int __builtin_vec_vcmpnez_p (signed int, vsi, vsi); + VCMPNEZW_P VCMPNEZW_VSI_P + signed int __builtin_vec_vcmpnez_p (signed int, vui, vui); + VCMPNEZW_P VCMPNEZW_VUI_P + +[VEC_CMPRB, SKIP, __builtin_byte_in_range] + signed int __builtin_byte_in_range (unsigned int, unsigned int); + CMPRB + +[VEC_CMPRB2, SKIP, __builtin_byte_in_either_range] + signed int __builtin_byte_in_range (unsigned int, unsigned int); + CMPRB2 + +[VEC_CNTLZ, vec_cntlz, __builtin_vec_vclz, _ARCH_PWR8] + vsc __builtin_vec_vclz (vsc); + VCLZB VCLZB_S + vuc __builtin_vec_vclz (vuc); + VCLZB VCLZB_U + vss __builtin_vec_vclz (vss); + VCLZH VCLZH_S + vus __builtin_vec_vclz (vus); + VCLZH VCLZH_U + vsi __builtin_vec_vclz (vsi); + VCLZW VCLZW_S + vui __builtin_vec_vclz (vui); + VCLZW VCLZW_U + vsll __builtin_vec_vclz (vsll); + VCLZD VCLZD_S + vull __builtin_vec_vclz (vull); + VCLZD VCLZD_U + +[VEC_CNTLZM, vec_cntlzm, __builtin_vec_vclzdm, _ARCH_PWR10] + vull __builtin_vec_vclzdm (vull, vull); + VCLZDM + +[VEC_CNTTZM, vec_cnttzm, __builtin_vec_vctzdm, _ARCH_PWR10] + vull __builtin_vec_vctzdm (vull, vull); + VCTZDM + +[VEC_CNTLZ_LSBB, vec_cntlz_lsbb, __builtin_vec_vclzlsbb, _ARCH_PWR9] + signed int __builtin_vec_vclzlsbb (vsc); + VCLZLSBB_V16QI VCLZLSBB_VSC + signed int __builtin_vec_vclzlsbb (vuc); + VCLZLSBB_V16QI VCLZLSBB_VUC + signed int __builtin_vec_vclzlsbb (vss); + VCLZLSBB_V8HI VCLZLSBB_VSS + signed int __builtin_vec_vclzlsbb (vus); + VCLZLSBB_V8HI VCLZLSBB_VUS + signed int __builtin_vec_vclzlsbb (vsi); + VCLZLSBB_V4SI VCLZLSBB_VSI + signed int __builtin_vec_vclzlsbb (vui); + VCLZLSBB_V4SI VCLZLSBB_VUI + +[VEC_CNTM, vec_cntm, __builtin_vec_cntm, _ARCH_PWR10] + unsigned long long __builtin_vec_cntm (vuc, const int); + VCNTMBB + unsigned long long __builtin_vec_cntm (vus, const int); + VCNTMBH + unsigned long long __builtin_vec_cntm (vui, const int); + VCNTMBW + unsigned long long __builtin_vec_cntm (vull, const int); + VCNTMBD + +[VEC_CNTTZ, vec_cnttz, __builtin_vec_vctz, _ARCH_PWR9] + vsc __builtin_vec_vctz (vsc); + VCTZB VCTZB_S + vuc __builtin_vec_vctz (vuc); + VCTZB VCTZB_U + vss __builtin_vec_vctz (vss); + VCTZH VCTZH_S + vus __builtin_vec_vctz (vus); + VCTZH VCTZH_U + vsi __builtin_vec_vctz (vsi); + VCTZW VCTZW_S + vui __builtin_vec_vctz (vui); + VCTZW VCTZW_U + vsll __builtin_vec_vctz (vsll); + VCTZD VCTZD_S + vull __builtin_vec_vctz (vull); + VCTZD VCTZD_U + +[VEC_CNTTZ_LSBB, vec_cnttz_lsbb, __builtin_vec_vctzlsbb, _ARCH_PWR9] + signed int __builtin_vec_vctzlsbb (vsc); + VCTZLSBB_V16QI VCTZLSBB_VSC + signed int __builtin_vec_vctzlsbb (vuc); + VCTZLSBB_V16QI VCTZLSBB_VUC + signed int __builtin_vec_vctzlsbb (vss); + VCTZLSBB_V8HI VCTZLSBB_VSS + signed int __builtin_vec_vctzlsbb (vus); + VCTZLSBB_V8HI VCTZLSBB_VUS + signed int __builtin_vec_vctzlsbb (vsi); + VCTZLSBB_V4SI VCTZLSBB_VSI + signed int __builtin_vec_vctzlsbb (vui); + VCTZLSBB_V4SI VCTZLSBB_VUI + +[VEC_CONVERT_4F32_8I16, SKIP, __builtin_vec_convert_4f32_8i16] + vus __builtin_vec_convert_4f32_8i16 (vf, vf); + CONVERT_4F32_8I16 + +[VEC_CONVERT_4F32_8F16, vec_pack_to_short_fp32, __builtin_vec_convert_4f32_8f16, _ARCH_PWR9] + vus 
__builtin_vec_convert_4f32_8f16 (vf, vf); + CONVERT_4F32_8F16 + +[VEC_COPYSIGN, vec_cpsgn, __builtin_vec_copysign] + vf __builtin_vec_copysign (vf, vf); + CPSGNSP + vd __builtin_vec_copysign (vd, vd); + CPSGNDP + +[VEC_CTF, vec_ctf, __builtin_vec_ctf] + vf __builtin_vec_ctf (vsi, const int); + VCFSX + vf __builtin_vec_ctf (vui, const int); + VCFUX + vd __builtin_vec_ctf (vsll, const int); + XVCVSXDDP_SCALE + vd __builtin_vec_ctf (vull, const int); + XVCVUXDDP_SCALE + +[VEC_CTS, vec_cts, __builtin_vec_cts] + vsi __builtin_vec_cts (vf, const int); + VCTSXS + vsll __builtin_vec_cts (vd, const int); + XVCVDPSXDS_SCALE + +[VEC_CTU, vec_ctu, __builtin_vec_ctu] + vui __builtin_vec_ctu (vf, const int); + VCTUXS + vull __builtin_vec_ctu (vd, const int); + XVCVDPUXDS_SCALE + +[VEC_DIV, vec_div, __builtin_vec_div, __VSX__] + vsi __builtin_vec_div (vsi, vsi); + VDIVSW + vui __builtin_vec_div (vui, vui); + VDIVUW + vsll __builtin_vec_div (vsll, vsll); + DIV_V2DI + vull __builtin_vec_div (vull, vull); + UDIV_V2DI + vsq __builtin_vec_div (vsq, vsq); + DIV_V1TI + vuq __builtin_vec_div (vuq, vuq); + UDIV_V1TI + vf __builtin_vec_div (vf, vf); + XVDIVSP + vd __builtin_vec_div (vd, vd); + XVDIVDP + +[VEC_DIVE, vec_dive, __builtin_vec_dive, _ARCH_PWR10] + vsi __builtin_vec_dive (vsi, vsi); + VDIVESW + vui __builtin_vec_dive (vui, vui); + VDIVEUW + vsll __builtin_vec_dive (vsll, vsll); + VDIVESD + vull __builtin_vec_dive (vull, vull); + VDIVEUD + vsq __builtin_vec_dive (vsq, vsq); + DIVES_V1TI + vuq __builtin_vec_dive (vuq, vuq); + DIVEU_V1TI + +[VEC_DOUBLE, vec_double, __builtin_vec_double] + vd __builtin_vec_double (vsll); + XVCVSXDDP + vd __builtin_vec_double (vull); + XVCVUXDDP + +[VEC_DOUBLEE, vec_doublee, __builtin_vec_doublee] + vd __builtin_vec_doublee (vsi); + DOUBLEE_V4SI + vd __builtin_vec_doublee (vui); + UNS_DOUBLEE_V4SI + vd __builtin_vec_doublee (vf); + DOUBLEE_V4SF + +[VEC_DOUBLEH, vec_doubleh, __builtin_vec_doubleh] + vd __builtin_vec_doubleh (vsi); + DOUBLEH_V4SI + vd __builtin_vec_doubleh (vui); + UNS_DOUBLEH_V4SI + vd __builtin_vec_doubleh (vf); + DOUBLEH_V4SF + +[VEC_DOUBLEL, vec_doublel, __builtin_vec_doublel] + vd __builtin_vec_doublel (vsi); + DOUBLEL_V4SI + vd __builtin_vec_doublel (vui); + UNS_DOUBLEL_V4SI + vd __builtin_vec_doublel (vf); + DOUBLEL_V4SF + +[VEC_DOUBLEO, vec_doubleo, __builtin_vec_doubleo] + vd __builtin_vec_doubleo (vsi); + DOUBLEO_V4SI + vd __builtin_vec_doubleo (vui); + UNS_DOUBLEO_V4SI + vd __builtin_vec_doubleo (vf); + DOUBLEO_V4SF + +[VEC_DST, vec_dst, __builtin_vec_dst] + void __builtin_vec_dst (unsigned char *, const int, const int); + DST DST_UC + void __builtin_vec_dst (signed char *, const int, const int); + DST DST_SC + void __builtin_vec_dst (unsigned short *, const int, const int); + DST DST_US + void __builtin_vec_dst (signed short *, const int, const int); + DST DST_SS + void __builtin_vec_dst (unsigned int *, const int, const int); + DST DST_UI + void __builtin_vec_dst (signed int *, const int, const int); + DST DST_SI + void __builtin_vec_dst (unsigned long *, const int, const int); + DST DST_UL + void __builtin_vec_dst (signed long *, const int, const int); + DST DST_SL + void __builtin_vec_dst (unsigned long long *, const int, const int); + DST DST_ULL + void __builtin_vec_dst (signed long long *, const int, const int); + DST DST_SLL + void __builtin_vec_dst (float *, const int, const int); + DST DST_F + void __builtin_vec_dst (vuc *, const int, const int); + DST DST_VUC + void __builtin_vec_dst (vsc *, const int, const int); + DST DST_VSC + void 
__builtin_vec_dst (vbc *, const int, const int); + DST DST_VBC + void __builtin_vec_dst (vus *, const int, const int); + DST DST_VUS + void __builtin_vec_dst (vss *, const int, const int); + DST DST_VSS + void __builtin_vec_dst (vbs *, const int, const int); + DST DST_VBS + void __builtin_vec_dst (vp *, const int, const int); + DST DST_VP + void __builtin_vec_dst (vui *, const int, const int); + DST DST_VUI + void __builtin_vec_dst (vsi *, const int, const int); + DST DST_VSI + void __builtin_vec_dst (vbi *, const int, const int); + DST DST_VBI + void __builtin_vec_dst (vf *, const int, const int); + DST DST_VF + +[VEC_DSTST, vec_dstst, __builtin_vec_dstst] + void __builtin_vec_dstst (unsigned char *, const int, const int); + DSTST DSTST_UC + void __builtin_vec_dstst (signed char *, const int, const int); + DSTST DSTST_SC + void __builtin_vec_dstst (unsigned short *, const int, const int); + DSTST DSTST_US + void __builtin_vec_dstst (signed short *, const int, const int); + DSTST DSTST_SS + void __builtin_vec_dstst (unsigned int *, const int, const int); + DSTST DSTST_UI + void __builtin_vec_dstst (signed int *, const int, const int); + DSTST DSTST_SI + void __builtin_vec_dstst (unsigned long *, const int, const int); + DSTST DSTST_UL + void __builtin_vec_dstst (signed long *, const int, const int); + DSTST DSTST_SL + void __builtin_vec_dstst (unsigned long long *, const int, const int); + DSTST DSTST_ULL + void __builtin_vec_dstst (signed long long *, const int, const int); + DSTST DSTST_SLL + void __builtin_vec_dstst (float *, const int, const int); + DSTST DSTST_F + void __builtin_vec_dstst (vuc *, const int, const int); + DSTST DSTST_VUC + void __builtin_vec_dstst (vsc *, const int, const int); + DSTST DSTST_VSC + void __builtin_vec_dstst (vbc *, const int, const int); + DSTST DSTST_VBC + void __builtin_vec_dstst (vus *, const int, const int); + DSTST DSTST_VUS + void __builtin_vec_dstst (vss *, const int, const int); + DSTST DSTST_VSS + void __builtin_vec_dstst (vbs *, const int, const int); + DSTST DSTST_VBS + void __builtin_vec_dstst (vp *, const int, const int); + DSTST DSTST_VP + void __builtin_vec_dstst (vui *, const int, const int); + DSTST DSTST_VUI + void __builtin_vec_dstst (vsi *, const int, const int); + DSTST DSTST_VSI + void __builtin_vec_dstst (vbi *, const int, const int); + DSTST DSTST_VBI + void __builtin_vec_dstst (vf *, const int, const int); + DSTST DSTST_VF + +[VEC_DSTSTT, vec_dststt, __builtin_vec_dststt] + void __builtin_vec_dststt (unsigned char *, const int, const int); + DSTSTT DSTSTT_UC + void __builtin_vec_dststt (signed char *, const int, const int); + DSTSTT DSTSTT_SC + void __builtin_vec_dststt (unsigned short *, const int, const int); + DSTSTT DSTSTT_US + void __builtin_vec_dststt (signed short *, const int, const int); + DSTSTT DSTSTT_SS + void __builtin_vec_dststt (unsigned int *, const int, const int); + DSTSTT DSTSTT_UI + void __builtin_vec_dststt (signed int *, const int, const int); + DSTSTT DSTSTT_SI + void __builtin_vec_dststt (unsigned long *, const int, const int); + DSTSTT DSTSTT_UL + void __builtin_vec_dststt (signed long *, const int, const int); + DSTSTT DSTSTT_SL + void __builtin_vec_dststt (unsigned long long *, const int, const int); + DSTSTT DSTSTT_ULL + void __builtin_vec_dststt (signed long long *, const int, const int); + DSTSTT DSTSTT_SLL + void __builtin_vec_dststt (float *, const int, const int); + DSTSTT DSTSTT_F + void __builtin_vec_dststt (vuc *, const int, const int); + DSTSTT DSTSTT_VUC + void __builtin_vec_dststt (vsc *, 
const int, const int); + DSTSTT DSTSTT_VSC + void __builtin_vec_dststt (vbc *, const int, const int); + DSTSTT DSTSTT_VBC + void __builtin_vec_dststt (vus *, const int, const int); + DSTSTT DSTSTT_VUS + void __builtin_vec_dststt (vss *, const int, const int); + DSTSTT DSTSTT_VSS + void __builtin_vec_dststt (vbs *, const int, const int); + DSTSTT DSTSTT_VBS + void __builtin_vec_dststt (vp *, const int, const int); + DSTSTT DSTSTT_VP + void __builtin_vec_dststt (vui *, const int, const int); + DSTSTT DSTSTT_VUI + void __builtin_vec_dststt (vsi *, const int, const int); + DSTSTT DSTSTT_VSI + void __builtin_vec_dststt (vbi *, const int, const int); + DSTSTT DSTSTT_VBI + void __builtin_vec_dststt (vf *, const int, const int); + DSTSTT DSTSTT_VF + +[VEC_DSTT, vec_dstt, __builtin_vec_dstt] + void __builtin_vec_dstt (unsigned char *, const int, const int); + DSTT DSTT_UC + void __builtin_vec_dstt (signed char *, const int, const int); + DSTT DSTT_SC + void __builtin_vec_dstt (unsigned short *, const int, const int); + DSTT DSTT_US + void __builtin_vec_dstt (signed short *, const int, const int); + DSTT DSTT_SS + void __builtin_vec_dstt (unsigned int *, const int, const int); + DSTT DSTT_UI + void __builtin_vec_dstt (signed int *, const int, const int); + DSTT DSTT_SI + void __builtin_vec_dstt (unsigned long *, const int, const int); + DSTT DSTT_UL + void __builtin_vec_dstt (signed long *, const int, const int); + DSTT DSTT_SL + void __builtin_vec_dstt (unsigned long long *, const int, const int); + DSTT DSTT_ULL + void __builtin_vec_dstt (signed long long *, const int, const int); + DSTT DSTT_SLL + void __builtin_vec_dstt (float *, const int, const int); + DSTT DSTT_F + void __builtin_vec_dstt (vuc *, const int, const int); + DSTT DSTT_VUC + void __builtin_vec_dstt (vsc *, const int, const int); + DSTT DSTT_VSC + void __builtin_vec_dstt (vbc *, const int, const int); + DSTT DSTT_VBC + void __builtin_vec_dstt (vus *, const int, const int); + DSTT DSTT_VUS + void __builtin_vec_dstt (vss *, const int, const int); + DSTT DSTT_VSS + void __builtin_vec_dstt (vbs *, const int, const int); + DSTT DSTT_VBS + void __builtin_vec_dstt (vp *, const int, const int); + DSTT DSTT_VP + void __builtin_vec_dstt (vui *, const int, const int); + DSTT DSTT_VUI + void __builtin_vec_dstt (vsi *, const int, const int); + DSTT DSTT_VSI + void __builtin_vec_dstt (vbi *, const int, const int); + DSTT DSTT_VBI + void __builtin_vec_dstt (vf *, const int, const int); + DSTT DSTT_VF + +[VEC_EQV, vec_eqv, __builtin_vec_eqv, _ARCH_PWR8] + vsc __builtin_vec_eqv (vsc, vsc); + EQV_V16QI + vuc __builtin_vec_eqv (vuc, vuc); + EQV_V16QI_UNS EQV_V16QI_VUC + vbc __builtin_vec_eqv (vbc, vbc); + EQV_V16QI_UNS EQV_V16QI_VBC + vss __builtin_vec_eqv (vss, vss); + EQV_V8HI + vus __builtin_vec_eqv (vus, vus); + EQV_V8HI_UNS EQV_V8HI_VUS + vbs __builtin_vec_eqv (vbs, vbs); + EQV_V8HI_UNS EQV_V8HI_VBS + vsi __builtin_vec_eqv (vsi, vsi); + EQV_V4SI + vui __builtin_vec_eqv (vui, vui); + EQV_V4SI_UNS EQV_V4SI_VUI + vbi __builtin_vec_eqv (vbi, vbi); + EQV_V4SI_UNS EQV_V4SI_VBI + vsll __builtin_vec_eqv (vsll, vsll); + EQV_V2DI + vull __builtin_vec_eqv (vull, vull); + EQV_V2DI_UNS EQV_V2DI_VULL + vbll __builtin_vec_eqv (vbll, vbll); + EQV_V2DI_UNS EQV_V2DI_VBLL + vf __builtin_vec_eqv (vf, vf); + EQV_V4SF + vd __builtin_vec_eqv (vd, vd); + EQV_V2DF +; The following variants are deprecated. 
+ vsc __builtin_vec_eqv (vbc, vsc); + EQV_V16QI EQV_VBC_VSC + vsc __builtin_vec_eqv (vsc, vbc); + EQV_V16QI EQV_VSC_VBC + vuc __builtin_vec_eqv (vbc, vuc); + EQV_V16QI_UNS EQV_VBC_VUC + vuc __builtin_vec_eqv (vuc, vbc); + EQV_V16QI_UNS EQV_VUC_VBC + vss __builtin_vec_eqv (vbs, vss); + EQV_V8HI EQV_VBS_VSS + vss __builtin_vec_eqv (vss, vbs); + EQV_V8HI EQV_VSS_VBS + vus __builtin_vec_eqv (vbs, vus); + EQV_V8HI_UNS EQV_VBS_VUS + vus __builtin_vec_eqv (vus, vbs); + EQV_V8HI_UNS EQV_VUS_VBS + vsi __builtin_vec_eqv (vbi, vsi); + EQV_V4SI EQV_VBI_VSI + vsi __builtin_vec_eqv (vsi, vbi); + EQV_V4SI EQV_VSI_VBI + vui __builtin_vec_eqv (vbi, vui); + EQV_V4SI_UNS EQV_VBI_VUI + vui __builtin_vec_eqv (vui, vbi); + EQV_V4SI_UNS EQV_VUI_VBI + vsll __builtin_vec_eqv (vbll, vsll); + EQV_V2DI EQV_VBLL_VSLL + vsll __builtin_vec_eqv (vsll, vbll); + EQV_V2DI EQV_VSLL_VBLL + vull __builtin_vec_eqv (vbll, vull); + EQV_V2DI_UNS EQV_VBLL_VULL + vull __builtin_vec_eqv (vull, vbll); + EQV_V2DI_UNS EQV_VULL_VBLL + +[VEC_EXPANDM, vec_expandm, __builtin_vec_vexpandm, _ARCH_PWR10] + vuc __builtin_vec_vexpandm (vuc); + VEXPANDMB + vus __builtin_vec_vexpandm (vus); + VEXPANDMH + vui __builtin_vec_vexpandm (vui); + VEXPANDMW + vull __builtin_vec_vexpandm (vull); + VEXPANDMD + vuq __builtin_vec_vexpandm (vuq); + VEXPANDMQ + +[VEC_EXPTE, vec_expte, __builtin_vec_expte] + vf __builtin_vec_expte (vf); + VEXPTEFP + +; There are no actual builtins for vec_extract. There is special handling for +; this in altivec_resolve_overloaded_builtin in rs6000-c.c, where the call +; is replaced by "pointer tricks." The single overload here causes +; __builtin_vec_extract to be registered with the front end so this can +; happen. +[VEC_EXTRACT, vec_extract, __builtin_vec_extract] + vsi __builtin_vec_extract (vsi, signed int); + VSPLTW EXTRACT_FAKERY + +[VEC_EXTRACT_FP_FROM_SHORTH, vec_extract_fp32_from_shorth, __builtin_vec_vextract_fp_from_shorth, _ARCH_PWR9] + vf __builtin_vec_vextract_fp_from_shorth (vus); + VEXTRACT_FP_FROM_SHORTH + +[VEC_EXTRACT_FP_FROM_SHORTL, vec_extract_fp32_from_shortl, __builtin_vec_vextract_fp_from_shortl, _ARCH_PWR9] + vf __builtin_vec_vextract_fp_from_shortl (vus); + VEXTRACT_FP_FROM_SHORTL + +[VEC_EXTRACTH, vec_extracth, __builtin_vec_extracth, _ARCH_PWR10] + vull __builtin_vec_extracth (vuc, vuc, unsigned char); + VEXTRACTBR + vull __builtin_vec_extracth (vus, vus, unsigned char); + VEXTRACTHR + vull __builtin_vec_extracth (vui, vui, unsigned char); + VEXTRACTWR + vull __builtin_vec_extracth (vull, vull, unsigned char); + VEXTRACTDR + +[VEC_EXTRACTL, vec_extractl, __builtin_vec_extractl, _ARCH_PWR10] + vull __builtin_vec_extractl (vuc, vuc, unsigned char); + VEXTRACTBL + vull __builtin_vec_extractl (vus, vus, unsigned char); + VEXTRACTHL + vull __builtin_vec_extractl (vui, vui, unsigned char); + VEXTRACTWL + vull __builtin_vec_extractl (vull, vull, unsigned char); + VEXTRACTDL + +[VEC_EXTRACTM, vec_extractm, __builtin_vec_vextractm, _ARCH_PWR10] + signed int __builtin_vec_vextractm (vuc); + VEXTRACTMB + signed int __builtin_vec_vextractm (vus); + VEXTRACTMH + signed int __builtin_vec_vextractm (vui); + VEXTRACTMW + signed int __builtin_vec_vextractm (vull); + VEXTRACTMD + signed int __builtin_vec_vextractm (vuq); + VEXTRACTMQ + +[VEC_EXTRACT4B, vec_extract4b, __builtin_vec_extract4b, _ARCH_PWR9] + vull __builtin_vec_extract4b (vuc, const int); + EXTRACT4B + +[VEC_EXTULX, vec_xlx, __builtin_vec_vextulx, _ARCH_PWR9] + signed char __builtin_vec_vextulx (unsigned int, vsc); + VEXTUBLX VEXTUBLX_S + unsigned char 
__builtin_vec_vextulx (unsigned int, vuc); + VEXTUBLX VEXTUBLX_U + signed short __builtin_vec_vextulx (unsigned int, vss); + VEXTUHLX VEXTUHLX_S + unsigned short __builtin_vec_vextulx (unsigned int, vus); + VEXTUHLX VEXTUHLX_U + signed int __builtin_vec_vextulx (unsigned int, vsi); + VEXTUWLX VEXTUWLX_S + unsigned int __builtin_vec_vextulx (unsigned int, vui); + VEXTUWLX VEXTUWLX_U + float __builtin_vec_vextulx (unsigned int, vf); + VEXTUWLX VEXTUWLX_F + +[VEC_EXTURX, vec_xrx, __builtin_vec_vexturx, _ARCH_PWR9] + signed char __builtin_vec_vexturx (unsigned int, vsc); + VEXTUBRX VEXTUBRX_S + unsigned char __builtin_vec_vexturx (unsigned int, vuc); + VEXTUBRX VEXTUBRX_U + signed short __builtin_vec_vexturx (unsigned int, vss); + VEXTUHRX VEXTUHRX_S + unsigned short __builtin_vec_vexturx (unsigned int, vus); + VEXTUHRX VEXTUHRX_U + signed int __builtin_vec_vexturx (unsigned int, vsi); + VEXTUWRX VEXTUWRX_S + unsigned int __builtin_vec_vexturx (unsigned int, vui); + VEXTUWRX VEXTUWRX_U + float __builtin_vec_vexturx (unsigned int, vf); + VEXTUWRX VEXTUWRX_F + +[VEC_FIRSTMATCHINDEX, vec_first_match_index, __builtin_vec_first_match_index, _ARCH_PWR9] + unsigned int __builtin_vec_first_match_index (vsc, vsc); + VFIRSTMATCHINDEX_V16QI FIRSTMATCHINDEX_VSC + unsigned int __builtin_vec_first_match_index (vuc, vuc); + VFIRSTMATCHINDEX_V16QI FIRSTMATCHINDEX_VUC + unsigned int __builtin_vec_first_match_index (vss, vss); + VFIRSTMATCHINDEX_V8HI FIRSTMATCHINDEX_VSS + unsigned int __builtin_vec_first_match_index (vus, vus); + VFIRSTMATCHINDEX_V8HI FIRSTMATCHINDEX_VUS + unsigned int __builtin_vec_first_match_index (vsi, vsi); + VFIRSTMATCHINDEX_V4SI FIRSTMATCHINDEX_VSI + unsigned int __builtin_vec_first_match_index (vui, vui); + VFIRSTMATCHINDEX_V4SI FIRSTMATCHINDEX_VUI + +[VEC_FIRSTMATCHOREOSINDEX, vec_first_match_or_eos_index, __builtin_vec_first_match_or_eos_index, _ARCH_PWR9] + unsigned int __builtin_vec_first_match_or_eos_index (vsc, vsc); + VFIRSTMATCHOREOSINDEX_V16QI FIRSTMATCHOREOSINDEX_VSC + unsigned int __builtin_vec_first_match_or_eos_index (vuc, vuc); + VFIRSTMATCHOREOSINDEX_V16QI FIRSTMATCHOREOSINDEX_VUC + unsigned int __builtin_vec_first_match_or_eos_index (vss, vss); + VFIRSTMATCHOREOSINDEX_V8HI FIRSTMATCHOREOSINDEX_VSS + unsigned int __builtin_vec_first_match_or_eos_index (vus, vus); + VFIRSTMATCHOREOSINDEX_V8HI FIRSTMATCHOREOSINDEX_VUS + unsigned int __builtin_vec_first_match_or_eos_index (vsi, vsi); + VFIRSTMATCHOREOSINDEX_V4SI FIRSTMATCHOREOSINDEX_VSI + unsigned int __builtin_vec_first_match_or_eos_index (vui, vui); + VFIRSTMATCHOREOSINDEX_V4SI FIRSTMATCHOREOSINDEX_VUI + +[VEC_FIRSTMISMATCHINDEX, vec_first_mismatch_index, __builtin_vec_first_mismatch_index, _ARCH_PWR9] + unsigned int __builtin_vec_first_mismatch_index (vsc, vsc); + VFIRSTMISMATCHINDEX_V16QI FIRSTMISMATCHINDEX_VSC + unsigned int __builtin_vec_first_mismatch_index (vuc, vuc); + VFIRSTMISMATCHINDEX_V16QI FIRSTMISMATCHINDEX_VUC + unsigned int __builtin_vec_first_mismatch_index (vss, vss); + VFIRSTMISMATCHINDEX_V8HI FIRSTMISMATCHINDEX_VSS + unsigned int __builtin_vec_first_mismatch_index (vus, vus); + VFIRSTMISMATCHINDEX_V8HI FIRSTMISMATCHINDEX_VUS + unsigned int __builtin_vec_first_mismatch_index (vsi, vsi); + VFIRSTMISMATCHINDEX_V4SI FIRSTMISMATCHINDEX_VSI + unsigned int __builtin_vec_first_mismatch_index (vui, vui); + VFIRSTMISMATCHINDEX_V4SI FIRSTMISMATCHINDEX_VUI + +[VEC_FIRSTMISMATCHOREOSINDEX, vec_first_mismatch_or_eos_index, __builtin_vec_first_mismatch_or_eos_index, _ARCH_PWR9] + unsigned int 
__builtin_vec_first_mismatch_or_eos_index (vsc, vsc); + VFIRSTMISMATCHOREOSINDEX_V16QI FIRSTMISMATCHOREOSINDEX_VSC + unsigned int __builtin_vec_first_mismatch_or_eos_index (vuc, vuc); + VFIRSTMISMATCHOREOSINDEX_V16QI FIRSTMISMATCHOREOSINDEX_VUC + unsigned int __builtin_vec_first_mismatch_or_eos_index (vss, vss); + VFIRSTMISMATCHOREOSINDEX_V8HI FIRSTMISMATCHOREOSINDEX_VSS + unsigned int __builtin_vec_first_mismatch_or_eos_index (vus, vus); + VFIRSTMISMATCHOREOSINDEX_V8HI FIRSTMISMATCHOREOSINDEX_VUS + unsigned int __builtin_vec_first_mismatch_or_eos_index (vsi, vsi); + VFIRSTMISMATCHOREOSINDEX_V4SI FIRSTMISMATCHOREOSINDEX_VSI + unsigned int __builtin_vec_first_mismatch_or_eos_index (vui, vui); + VFIRSTMISMATCHOREOSINDEX_V4SI FIRSTMISMATCHOREOSINDEX_VUI + +[VEC_FLOAT, vec_float, __builtin_vec_float] + vf __builtin_vec_float (vsi); + XVCVSXWSP + vf __builtin_vec_float (vui); + XVCVUXWSP + +[VEC_FLOAT2, vec_float2, __builtin_vec_float2] + vf __builtin_vec_float2 (vsll, vsll); + FLOAT2_V2DI + vf __builtin_vec_float2 (vull, vull); + UNS_FLOAT2_V2DI + vf __builtin_vec_float2 (vd, vd); + FLOAT2_V2DF + +[VEC_FLOATE, vec_floate, __builtin_vec_floate] + vf __builtin_vec_floate (vsll); + FLOATE_V2DI + vf __builtin_vec_floate (vull); + UNS_FLOATE_V2DI + vf __builtin_vec_floate (vd); + FLOATE_V2DF + +[VEC_FLOATO, vec_floato, __builtin_vec_floato] + vf __builtin_vec_floato (vsll); + FLOATO_V2DI + vf __builtin_vec_floato (vull); + UNS_FLOATO_V2DI + vf __builtin_vec_floato (vd); + FLOATO_V2DF + +[VEC_FLOOR, vec_floor, __builtin_vec_floor] + vf __builtin_vec_floor (vf); + VRFIM + vd __builtin_vec_floor (vd); + XVRDPIM + +[VEC_GB, vec_gb, __builtin_vec_vgbbd, _ARCH_PWR8] + vsc __builtin_vec_vgbbd (vsc); + VGBBD VGBBD_S + vuc __builtin_vec_vgbbd (vuc); + VGBBD VGBBD_U + +[VEC_GENBM, vec_genbm, __builtin_vec_mtvsrbm, _ARCH_PWR10] + vuc __builtin_vec_mtvsrbm (unsigned long long); + MTVSRBM + +[VEC_GENHM, vec_genhm, __builtin_vec_mtvsrhm, _ARCH_PWR10] + vus __builtin_vec_mtvsrhm (unsigned long long); + MTVSRHM + +[VEC_GENWM, vec_genwm, __builtin_vec_mtvsrwm, _ARCH_PWR10] + vui __builtin_vec_mtvsrwm (unsigned long long); + MTVSRWM + +[VEC_GENDM, vec_gendm, __builtin_vec_mtvsrdm, _ARCH_PWR10] + vull __builtin_vec_mtvsrdm (unsigned long long); + MTVSRDM + +[VEC_GENQM, vec_genqm, __builtin_vec_mtvsrqm, _ARCH_PWR10] + vuq __builtin_vec_mtvsrqm (unsigned long long); + MTVSRQM + +[VEC_GENPCVM, vec_genpcvm, __builtin_vec_xxgenpcvm, _ARCH_PWR10] + vuc __builtin_vec_xxgenpcvm (vuc, const int); + XXGENPCVM_V16QI + vus __builtin_vec_xxgenpcvm (vus, const int); + XXGENPCVM_V8HI + vui __builtin_vec_xxgenpcvm (vui, const int); + XXGENPCVM_V4SI + vull __builtin_vec_xxgenpcvm (vull, const int); + XXGENPCVM_V2DI + +[VEC_GNB, vec_gnb, __builtin_vec_gnb, _ARCH_PWR10] + unsigned long long __builtin_vec_gnb (vuq, const int); + VGNB + +; There are no actual builtins for vec_insert. There is special handling for +; this in altivec_resolve_overloaded_builtin in rs6000-c.c, where the call +; is replaced by "pointer tricks." The single overload here causes +; __builtin_vec_insert to be registered with the front end so this can happen. 
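
The notes for vec_extract above and vec_insert just before this stanza both defer to "pointer tricks" in altivec_resolve_overloaded_builtin. As a behavioral sketch only (this is not the rs6000-c.c code, and the modulo-element-count handling of the index is assumed from the usual AltiVec semantics), the resolved calls act roughly as if the vector were an array of its elements:

    /* Behavioral sketch only -- not the rs6000-c.c implementation.  The
       "pointer tricks" resolution treats the vector as an array of its
       elements; the index is assumed to wrap modulo the element count.  */
    #include <altivec.h>
    #include <string.h>

    signed int
    extract_like (vector signed int v, unsigned int n)
    {
      signed int tmp[4];
      memcpy (tmp, &v, sizeof (v));	/* stands in for the pointer cast */
      return tmp[n & 3];
    }

    vector signed int
    insert_like (vector signed int v, signed int x, unsigned int n)
    {
      signed int tmp[4];
      memcpy (tmp, &v, sizeof (v));
      tmp[n & 3] = x;
      memcpy (&v, tmp, sizeof (v));
      return v;
    }
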
+[VEC_INSERT, vec_insert, __builtin_vec_insert] + vsi __builtin_vec_insert (vsi, vsi, signed int); + XXPERMDI_4SI INSERT_FAKERY + +[VEC_INSERTH, vec_inserth, __builtin_vec_inserth, _ARCH_PWR10] + vuc __builtin_vec_inserth (unsigned char, vuc, unsigned int); + VINSERTGPRBR + vuc __builtin_vec_inserth (vuc, vuc, unsigned int); + VINSERTVPRBR + vus __builtin_vec_inserth (unsigned short, vus, unsigned int); + VINSERTGPRHR + vus __builtin_vec_inserth (vus, vus, unsigned int); + VINSERTVPRHR + vui __builtin_vec_inserth (unsigned int, vui, unsigned int); + VINSERTGPRWR + vui __builtin_vec_inserth (vui, vui, unsigned int); + VINSERTVPRWR + vull __builtin_vec_inserth (unsigned long long, vull, unsigned int); + VINSERTGPRDR + +[VEC_INSERTL, vec_insertl, __builtin_vec_insertl, _ARCH_PWR10] + vuc __builtin_vec_insertl (unsigned char, vuc, unsigned int); + VINSERTGPRBL + vuc __builtin_vec_insertl (vuc, vuc, unsigned int); + VINSERTVPRBL + vus __builtin_vec_insertl (unsigned short, vus, unsigned int); + VINSERTGPRHL + vus __builtin_vec_insertl (vus, vus, unsigned int); + VINSERTVPRHL + vui __builtin_vec_insertl (unsigned int, vui, unsigned int); + VINSERTGPRWL + vui __builtin_vec_insertl (vui, vui, unsigned int); + VINSERTVPRWL + vull __builtin_vec_insertl (unsigned long long, vull, unsigned int); + VINSERTGPRDL + +[VEC_INSERT4B, vec_insert4b, __builtin_vec_insert4b, _ARCH_PWR9] + vuc __builtin_vec_insert4b (vsi, vuc, const int); + INSERT4B INSERT4B_S + vuc __builtin_vec_insert4b (vui, vuc, const int); + INSERT4B INSERT4B_U + +[VEC_LD, vec_ld, __builtin_vec_ld] + vsc __builtin_vec_ld (signed long, const vsc *); + LVX_V16QI LVX_V16QI_VSC + vsc __builtin_vec_ld (signed long, const signed char *); + LVX_V16QI LVX_V16QI_SC + vuc __builtin_vec_ld (signed long, const vuc *); + LVX_V16QI LVX_V16QI_VUC + vuc __builtin_vec_ld (signed long, const unsigned char *); + LVX_V16QI LVX_V16QI_UC + vbc __builtin_vec_ld (signed long, const vbc *); + LVX_V16QI LVX_V16QI_VBC + vss __builtin_vec_ld (signed long, const vss *); + LVX_V8HI LVX_V8HI_VSS + vss __builtin_vec_ld (signed long, const signed short *); + LVX_V8HI LVX_V8HI_SS + vus __builtin_vec_ld (signed long, const vus *); + LVX_V8HI LVX_V8HI_VUS + vus __builtin_vec_ld (signed long, const unsigned short *); + LVX_V8HI LVX_V8HI_US + vbs __builtin_vec_ld (signed long, const vbs *); + LVX_V8HI LVX_V8HI_VBS + vp __builtin_vec_ld (signed long, const vp *); + LVX_V8HI LVX_V8HI_VP + vsi __builtin_vec_ld (signed long, const vsi *); + LVX_V4SI LVX_V4SI_VSI + vsi __builtin_vec_ld (signed long, const signed int *); + LVX_V4SI LVX_V4SI_SI + vui __builtin_vec_ld (signed long, const vui *); + LVX_V4SI LVX_V4SI_VUI + vui __builtin_vec_ld (signed long, const unsigned int *); + LVX_V4SI LVX_V4SI_UI + vbi __builtin_vec_ld (signed long, const vbi *); + LVX_V4SI LVX_V4SI_VBI + vsll __builtin_vec_ld (signed long, const vsll *); + LVX_V2DI LVX_V2DI_VSLL + vsll __builtin_vec_ld (signed long, const signed long long *); + LVX_V2DI LVX_V2DI_SLL + vull __builtin_vec_ld (signed long, const vull *); + LVX_V2DI LVX_V2DI_VULL + vull __builtin_vec_ld (signed long, const unsigned long long *); + LVX_V2DI LVX_V2DI_ULL + vbll __builtin_vec_ld (signed long, const vbll *); + LVX_V2DI LVX_V2DI_VBLL + vsq __builtin_vec_ld (signed long, const vsq *); + LVX_V1TI LVX_V1TI_VSQ + vuq __builtin_vec_ld (signed long, const vuq *); + LVX_V1TI LVX_V1TI_VUQ + vsq __builtin_vec_ld (signed long, const __int128 *); + LVX_V1TI LVX_V1TI_TI + vuq __builtin_vec_ld (signed long, const unsigned __int128 *); + LVX_V1TI 
LVX_V1TI_UTI + vf __builtin_vec_ld (signed long, const vf *); + LVX_V4SF LVX_V4SF_VF + vf __builtin_vec_ld (signed long, const float *); + LVX_V4SF LVX_V4SF_F + vd __builtin_vec_ld (signed long, const vd *); + LVX_V2DF LVX_V2DF_VD + vd __builtin_vec_ld (signed long, const double *); + LVX_V2DF LVX_V2DF_D +; The following variants are deprecated. + vsi __builtin_vec_ld (signed long, const long *); + LVX_V4SI LVX_V4SI_SL + vui __builtin_vec_ld (signed long, const unsigned long *); + LVX_V4SI LVX_V4SI_UL + +[VEC_LDE, vec_lde, __builtin_vec_lde] + vsc __builtin_vec_lde (signed long, const signed char *); + LVEBX LVEBX_SC + vuc __builtin_vec_lde (signed long, const unsigned char *); + LVEBX LVEBX_UC + vss __builtin_vec_lde (signed long, const signed short *); + LVEHX LVEHX_SS + vus __builtin_vec_lde (signed long, const unsigned short *); + LVEHX LVEHX_US + vsi __builtin_vec_lde (signed long, const signed int *); + LVEWX LVEWX_SI + vui __builtin_vec_lde (signed long, const unsigned int *); + LVEWX LVEWX_UI + vf __builtin_vec_lde (signed long, const float *); + LVEWX LVEWX_F +; The following variants are deprecated. + vsi __builtin_vec_lde (signed long, const long *); + LVEWX LVEWX_SL + vui __builtin_vec_lde (signed long, const unsigned long *); + LVEWX LVEWX_UL + +[VEC_LDL, vec_ldl, __builtin_vec_ldl] + vsc __builtin_vec_ldl (signed long, const vsc *); + LVXL_V16QI LVXL_V16QI_VSC + vsc __builtin_vec_ldl (signed long, const signed char *); + LVXL_V16QI LVXL_V16QI_SC + vuc __builtin_vec_ldl (signed long, const vuc *); + LVXL_V16QI LVXL_V16QI_VUC + vuc __builtin_vec_ldl (signed long, const unsigned char *); + LVXL_V16QI LVXL_V16QI_UC + vbc __builtin_vec_ldl (signed long, const vbc *); + LVXL_V16QI LVXL_V16QI_VBC + vss __builtin_vec_ldl (signed long, const vss *); + LVXL_V8HI LVXL_V8HI_VSS + vss __builtin_vec_ldl (signed long, const signed short *); + LVXL_V8HI LVXL_V8HI_SS + vus __builtin_vec_ldl (signed long, const vus *); + LVXL_V8HI LVXL_V8HI_VUS + vus __builtin_vec_ldl (signed long, const unsigned short *); + LVXL_V8HI LVXL_V8HI_US + vbs __builtin_vec_ldl (signed long, const vbs *); + LVXL_V8HI LVXL_V8HI_VBS + vp __builtin_vec_ldl (signed long, const vp *); + LVXL_V8HI LVXL_V8HI_VP + vsi __builtin_vec_ldl (signed long, const vsi *); + LVXL_V4SI LVXL_V4SI_VSI + vsi __builtin_vec_ldl (signed long, const signed int *); + LVXL_V4SI LVXL_V4SI_SI + vui __builtin_vec_ldl (signed long, const vui *); + LVXL_V4SI LVXL_V4SI_VUI + vui __builtin_vec_ldl (signed long, const unsigned int *); + LVXL_V4SI LVXL_V4SI_UI + vbi __builtin_vec_ldl (signed long, const vbi *); + LVXL_V4SI LVXL_V4SI_VBI + vsll __builtin_vec_ldl (signed long, const vsll *); + LVXL_V2DI LVXL_V2DI_VSLL + vsll __builtin_vec_ldl (signed long, const signed long long *); + LVXL_V2DI LVXL_V2DI_SLL + vull __builtin_vec_ldl (signed long, const vull *); + LVXL_V2DI LVXL_V2DI_VULL + vull __builtin_vec_ldl (signed long, const unsigned long long *); + LVXL_V2DI LVXL_V2DI_ULL + vbll __builtin_vec_ldl (signed long, const vbll *); + LVXL_V2DI LVXL_V2DI_VBLL + vf __builtin_vec_ldl (signed long, const vf *); + LVXL_V4SF LVXL_V4SF_VF + vf __builtin_vec_ldl (signed long, const float *); + LVXL_V4SF LVXL_V4SF_F + vd __builtin_vec_ldl (signed long, const vd *); + LVXL_V2DF LVXL_V2DF_VD + vd __builtin_vec_ldl (signed long, const double *); + LVXL_V2DF LVXL_V2DF_D + +[VEC_LOGE, vec_loge, __builtin_vec_loge] + vf __builtin_vec_loge (vf); + VLOGEFP + +[VEC_LVLX, vec_lvlx, __builtin_vec_lvlx, __PPU__] + vbc __builtin_vec_lvlx (signed long, const vbc *); + LVLX 
LVLX_VBC + vsc __builtin_vec_lvlx (signed long, const vsc *); + LVLX LVLX_VSC + vsc __builtin_vec_lvlx (signed long, const signed char *); + LVLX LVLX_SC + vuc __builtin_vec_lvlx (signed long, const vuc *); + LVLX LVLX_VUC + vuc __builtin_vec_lvlx (signed long, const unsigned char *); + LVLX LVLX_UC + vbs __builtin_vec_lvlx (signed long, const vbs *); + LVLX LVLX_VBS + vss __builtin_vec_lvlx (signed long, const vss *); + LVLX LVLX_VSS + vss __builtin_vec_lvlx (signed long, const signed short *); + LVLX LVLX_SS + vus __builtin_vec_lvlx (signed long, const vus *); + LVLX LVLX_VUS + vus __builtin_vec_lvlx (signed long, const unsigned short *); + LVLX LVLX_US + vp __builtin_vec_lvlx (signed long, const vp *); + LVLX LVLX_VP + vbi __builtin_vec_lvlx (signed long, const vbi *); + LVLX LVLX_VBI + vsi __builtin_vec_lvlx (signed long, const vsi *); + LVLX LVLX_VSI + vsi __builtin_vec_lvlx (signed long, const signed int *); + LVLX LVLX_SI + vui __builtin_vec_lvlx (signed long, const vui *); + LVLX LVLX_VUI + vui __builtin_vec_lvlx (signed long, const unsigned int *); + LVLX LVLX_UI + vf __builtin_vec_lvlx (signed long, const vf *); + LVLX LVLX_VF + vf __builtin_vec_lvlx (signed long, const float *); + LVLX LVLX_F + +[VEC_LVLXL, vec_lvlxl, __builtin_vec_lvlxl, __PPU__] + vbc __builtin_vec_lvlxl (signed long, const vbc *); + LVLXL LVLXL_VBC + vsc __builtin_vec_lvlxl (signed long, const vsc *); + LVLXL LVLXL_VSC + vsc __builtin_vec_lvlxl (signed long, const signed char *); + LVLXL LVLXL_SC + vuc __builtin_vec_lvlxl (signed long, const vuc *); + LVLXL LVLXL_VUC + vuc __builtin_vec_lvlxl (signed long, const unsigned char *); + LVLXL LVLXL_UC + vbs __builtin_vec_lvlxl (signed long, const vbs *); + LVLXL LVLXL_VBS + vss __builtin_vec_lvlxl (signed long, const vss *); + LVLXL LVLXL_VSS + vss __builtin_vec_lvlxl (signed long, const signed short *); + LVLXL LVLXL_SS + vus __builtin_vec_lvlxl (signed long, const vus *); + LVLXL LVLXL_VUS + vus __builtin_vec_lvlxl (signed long, const unsigned short *); + LVLXL LVLXL_US + vp __builtin_vec_lvlxl (signed long, const vp *); + LVLXL LVLXL_VP + vbi __builtin_vec_lvlxl (signed long, const vbi *); + LVLXL LVLXL_VBI + vsi __builtin_vec_lvlxl (signed long, const vsi *); + LVLXL LVLXL_VSI + vsi __builtin_vec_lvlxl (signed long, const signed int *); + LVLXL LVLXL_SI + vui __builtin_vec_lvlxl (signed long, const vui *); + LVLXL LVLXL_VUI + vui __builtin_vec_lvlxl (signed long, const unsigned int *); + LVLXL LVLXL_UI + vf __builtin_vec_lvlxl (signed long, const vf *); + LVLXL LVLXL_VF + vf __builtin_vec_lvlxl (signed long, const float *); + LVLXL LVLXL_F + +[VEC_LVRX, vec_lvrx, __builtin_vec_lvrx, __PPU__] + vbc __builtin_vec_lvrx (signed long, const vbc *); + LVRX LVRX_VBC + vsc __builtin_vec_lvrx (signed long, const vsc *); + LVRX LVRX_VSC + vsc __builtin_vec_lvrx (signed long, const signed char *); + LVRX LVRX_SC + vuc __builtin_vec_lvrx (signed long, const vuc *); + LVRX LVRX_VUC + vuc __builtin_vec_lvrx (signed long, const unsigned char *); + LVRX LVRX_UC + vbs __builtin_vec_lvrx (signed long, const vbs *); + LVRX LVRX_VBS + vss __builtin_vec_lvrx (signed long, const vss *); + LVRX LVRX_VSS + vss __builtin_vec_lvrx (signed long, const signed short *); + LVRX LVRX_SS + vus __builtin_vec_lvrx (signed long, const vus *); + LVRX LVRX_VUS + vus __builtin_vec_lvrx (signed long, const unsigned short *); + LVRX LVRX_US + vp __builtin_vec_lvrx (signed long, const vp *); + LVRX LVRX_VP + vbi __builtin_vec_lvrx (signed long, const vbi *); + LVRX LVRX_VBI + vsi __builtin_vec_lvrx 
(signed long, const vsi *); + LVRX LVRX_VSI + vsi __builtin_vec_lvrx (signed long, const signed int *); + LVRX LVRX_SI + vui __builtin_vec_lvrx (signed long, const vui *); + LVRX LVRX_VUI + vui __builtin_vec_lvrx (signed long, const unsigned int *); + LVRX LVRX_UI + vf __builtin_vec_lvrx (signed long, const vf *); + LVRX LVRX_VF + vf __builtin_vec_lvrx (signed long, const float *); + LVRX LVRX_F + +[VEC_LVRXL, vec_lvrxl, __builtin_vec_lvrxl, __PPU__] + vbc __builtin_vec_lvrxl (signed long, const vbc *); + LVRXL LVRXL_VBC + vsc __builtin_vec_lvrxl (signed long, const vsc *); + LVRXL LVRXL_VSC + vsc __builtin_vec_lvrxl (signed long, const signed char *); + LVRXL LVRXL_SC + vuc __builtin_vec_lvrxl (signed long, const vuc *); + LVRXL LVRXL_VUC + vuc __builtin_vec_lvrxl (signed long, const unsigned char *); + LVRXL LVRXL_UC + vbs __builtin_vec_lvrxl (signed long, const vbs *); + LVRXL LVRXL_VBS + vss __builtin_vec_lvrxl (signed long, const vss *); + LVRXL LVRXL_VSS + vss __builtin_vec_lvrxl (signed long, const signed short *); + LVRXL LVRXL_SS + vus __builtin_vec_lvrxl (signed long, const vus *); + LVRXL LVRXL_VUS + vus __builtin_vec_lvrxl (signed long, const unsigned short *); + LVRXL LVRXL_US + vp __builtin_vec_lvrxl (signed long, const vp *); + LVRXL LVRXL_VP + vbi __builtin_vec_lvrxl (signed long, const vbi *); + LVRXL LVRXL_VBI + vsi __builtin_vec_lvrxl (signed long, const vsi *); + LVRXL LVRXL_VSI + vsi __builtin_vec_lvrxl (signed long, const signed int *); + LVRXL LVRXL_SI + vui __builtin_vec_lvrxl (signed long, const vui *); + LVRXL LVRXL_VUI + vui __builtin_vec_lvrxl (signed long, const unsigned int *); + LVRXL LVRXL_UI + vf __builtin_vec_lvrxl (signed long, const vf *); + LVRXL LVRXL_VF + vf __builtin_vec_lvrxl (signed long, const float *); + LVRXL LVRXL_F + +[VEC_LVSL, vec_lvsl, __builtin_vec_lvsl] + vuc __builtin_vec_lvsl (signed long, const unsigned char *); + LVSL LVSL_UC + vuc __builtin_vec_lvsl (signed long, const signed char *); + LVSL LVSL_SC + vuc __builtin_vec_lvsl (signed long, const char *); + LVSL LVSL_STR + vuc __builtin_vec_lvsl (signed long, const unsigned short *); + LVSL LVSL_US + vuc __builtin_vec_lvsl (signed long, const signed short *); + LVSL LVSL_SS + vuc __builtin_vec_lvsl (signed long, const unsigned int *); + LVSL LVSL_UI + vuc __builtin_vec_lvsl (signed long, const signed int *); + LVSL LVSL_SI + vuc __builtin_vec_lvsl (signed long, const unsigned long *); + LVSL LVSL_UL + vuc __builtin_vec_lvsl (signed long, const signed long *); + LVSL LVSL_SL + vuc __builtin_vec_lvsl (signed long, const unsigned long long *); + LVSL LVSL_ULL + vuc __builtin_vec_lvsl (signed long, const signed long long *); + LVSL LVSL_SLL + vuc __builtin_vec_lvsl (signed long, const float *); + LVSL LVSL_F + vuc __builtin_vec_lvsl (signed long, const double *); + LVSL LVSL_D + +[VEC_LVSR, vec_lvsr, __builtin_vec_lvsr] + vuc __builtin_vec_lvsr (signed long, const unsigned char *); + LVSR LVSR_UC + vuc __builtin_vec_lvsr (signed long, const signed char *); + LVSR LVSR_SC + vuc __builtin_vec_lvsr (signed long, const char *); + LVSR LVSR_STR + vuc __builtin_vec_lvsr (signed long, const unsigned short *); + LVSR LVSR_US + vuc __builtin_vec_lvsr (signed long, const signed short *); + LVSR LVSR_SS + vuc __builtin_vec_lvsr (signed long, const unsigned int *); + LVSR LVSR_UI + vuc __builtin_vec_lvsr (signed long, const signed int *); + LVSR LVSR_SI + vuc __builtin_vec_lvsr (signed long, const unsigned long *); + LVSR LVSR_UL + vuc __builtin_vec_lvsr (signed long, const signed long *); + LVSR 
LVSR_SL + vuc __builtin_vec_lvsr (signed long, const unsigned long long *); + LVSR LVSR_ULL + vuc __builtin_vec_lvsr (signed long, const signed long long *); + LVSR LVSR_SLL + vuc __builtin_vec_lvsr (signed long, const float *); + LVSR LVSR_F + vuc __builtin_vec_lvsr (signed long, const double *); + LVSR LVSR_D + +[VEC_LXVL, vec_xl_len, __builtin_vec_lxvl, _ARCH_PPC64_PWR9] + vsc __builtin_vec_lxvl (const signed char *, unsigned int); + LXVL LXVL_VSC + vuc __builtin_vec_lxvl (const unsigned char *, unsigned int); + LXVL LXVL_VUC + vss __builtin_vec_lxvl (const signed short *, unsigned int); + LXVL LXVL_VSS + vus __builtin_vec_lxvl (const unsigned short *, unsigned int); + LXVL LXVL_VUS + vsi __builtin_vec_lxvl (const signed int *, unsigned int); + LXVL LXVL_VSI + vui __builtin_vec_lxvl (const unsigned int *, unsigned int); + LXVL LXVL_VUI + vsll __builtin_vec_lxvl (const signed long long *, unsigned int); + LXVL LXVL_VSLL + vull __builtin_vec_lxvl (const unsigned long long *, unsigned int); + LXVL LXVL_VULL + vsq __builtin_vec_lxvl (const signed __int128 *, unsigned int); + LXVL LXVL_VSQ + vuq __builtin_vec_lxvl (const unsigned __int128 *, unsigned int); + LXVL LXVL_VUQ + vf __builtin_vec_lxvl (const float *, unsigned int); + LXVL LXVL_VF + vd __builtin_vec_lxvl (const double *, unsigned int); + LXVL LXVL_VD + +[VEC_MADD, vec_madd, __builtin_vec_madd] + vss __builtin_vec_madd (vss, vss, vss); + VMLADDUHM VMLADDUHM_VSS + vss __builtin_vec_madd (vss, vus, vus); + VMLADDUHM VMLADDUHM_VSSVUS + vss __builtin_vec_madd (vus, vss, vss); + VMLADDUHM VMLADDUHM_VUSVSS + vus __builtin_vec_madd (vus, vus, vus); + VMLADDUHM VMLADDUHM_VUS + vf __builtin_vec_madd (vf, vf, vf); + VMADDFP + vd __builtin_vec_madd (vd, vd, vd); + XVMADDDP + +[VEC_MADDS, vec_madds, __builtin_vec_madds] + vss __builtin_vec_madds (vss, vss, vss); + VMHADDSHS + +[VEC_MAX, vec_max, __builtin_vec_max] + vsc __builtin_vec_max (vsc, vsc); + VMAXSB + vuc __builtin_vec_max (vuc, vuc); + VMAXUB + vss __builtin_vec_max (vss, vss); + VMAXSH + vus __builtin_vec_max (vus, vus); + VMAXUH + vsi __builtin_vec_max (vsi, vsi); + VMAXSW + vui __builtin_vec_max (vui, vui); + VMAXUW + vsll __builtin_vec_max (vsll, vsll); + VMAXSD + vull __builtin_vec_max (vull, vull); + VMAXUD + vf __builtin_vec_max (vf, vf); + VMAXFP + vd __builtin_vec_max (vd, vd); + XVMAXDP +; The following variants are deprecated. 
+ vsc __builtin_vec_max (vsc, vbc); + VMAXSB VMAXSB_SB + vsc __builtin_vec_max (vbc, vsc); + VMAXSB VMAXSB_BS + vuc __builtin_vec_max (vuc, vbc); + VMAXUB VMAXUB_UB + vuc __builtin_vec_max (vbc, vuc); + VMAXUB VMAXUB_BU + vss __builtin_vec_max (vss, vbs); + VMAXSH VMAXSH_SB + vss __builtin_vec_max (vbs, vss); + VMAXSH VMAXSH_BS + vus __builtin_vec_max (vus, vbs); + VMAXUH VMAXUH_UB + vus __builtin_vec_max (vbs, vus); + VMAXUH VMAXUH_BU + vsi __builtin_vec_max (vsi, vbi); + VMAXSW VMAXSW_SB + vsi __builtin_vec_max (vbi, vsi); + VMAXSW VMAXSW_BS + vui __builtin_vec_max (vui, vbi); + VMAXUW VMAXUW_UB + vui __builtin_vec_max (vbi, vui); + VMAXUW VMAXUW_BU + vsll __builtin_vec_max (vsll, vbll); + VMAXSD VMAXSD_SB + vsll __builtin_vec_max (vbll, vsll); + VMAXSD VMAXSD_BS + vull __builtin_vec_max (vull, vbll); + VMAXUD VMAXUD_UB + vull __builtin_vec_max (vbll, vull); + VMAXUD VMAXUD_BU + +[VEC_MERGEE, vec_mergee, __builtin_vec_vmrgew, _ARCH_PWR8] + vsi __builtin_vec_vmrgew (vsi, vsi); + VMRGEW_V4SI VMRGEW_VSI + vui __builtin_vec_vmrgew (vui, vui); + VMRGEW_V4SI VMRGEW_VUI + vbi __builtin_vec_vmrgew (vbi, vbi); + VMRGEW_V4SI VMRGEW_VBI + vsll __builtin_vec_vmrgew (vsll, vsll); + VMRGEW_V2DI VMRGEW_VSLL + vull __builtin_vec_vmrgew (vull, vull); + VMRGEW_V2DI VMRGEW_VULL + vbll __builtin_vec_vmrgew (vbll, vbll); + VMRGEW_V2DI VMRGEW_VBLL + vf __builtin_vec_vmrgew (vf, vf); + VMRGEW_V4SF + vd __builtin_vec_vmrgew (vd, vd); + VMRGEW_V2DF + +[VEC_MERGEH, vec_mergeh, __builtin_vec_mergeh] + vbc __builtin_vec_mergeh (vbc, vbc); + VMRGHB VMRGHB_VBC + vsc __builtin_vec_mergeh (vsc, vsc); + VMRGHB VMRGHB_VSC + vuc __builtin_vec_mergeh (vuc, vuc); + VMRGHB VMRGHB_VUC + vbs __builtin_vec_mergeh (vbs, vbs); + VMRGHH VMRGHH_VBS + vss __builtin_vec_mergeh (vss, vss); + VMRGHH VMRGHH_VSS + vus __builtin_vec_mergeh (vus, vus); + VMRGHH VMRGHH_VUS + vp __builtin_vec_mergeh (vp, vp); + VMRGHH VMRGHH_VP + vbi __builtin_vec_mergeh (vbi, vbi); + VMRGHW VMRGHW_VBI + vsi __builtin_vec_mergeh (vsi, vsi); + VMRGHW VMRGHW_VSI + vui __builtin_vec_mergeh (vui, vui); + VMRGHW VMRGHW_VUI + vbll __builtin_vec_mergeh (vbll, vbll); + VEC_MERGEH_V2DI VEC_MERGEH_VBLL + vsll __builtin_vec_mergeh (vsll, vsll); + VEC_MERGEH_V2DI VEC_MERGEH_VSLL + vull __builtin_vec_mergeh (vull, vull); + VEC_MERGEH_V2DI VEC_MERGEH_VULL + vf __builtin_vec_mergeh (vf, vf); + VMRGHW VMRGHW_VF + vd __builtin_vec_mergeh (vd, vd); + VEC_MERGEH_V2DF +; The following variants are deprecated. 
+ vsll __builtin_vec_mergeh (vsll, vbll); + VEC_MERGEH_V2DI VEC_MERGEH_VSLL_VBLL + vsll __builtin_vec_mergeh (vbll, vsll); + VEC_MERGEH_V2DI VEC_MERGEH_VBLL_VSLL + vull __builtin_vec_mergeh (vull, vbll); + VEC_MERGEH_V2DI VEC_MERGEH_VULL_VBLL + vull __builtin_vec_mergeh (vbll, vull); + VEC_MERGEH_V2DI VEC_MERGEH_VBLL_VULL + +[VEC_MERGEL, vec_mergel, __builtin_vec_mergel] + vbc __builtin_vec_mergel (vbc, vbc); + VMRGLB VMRGLB_VBC + vsc __builtin_vec_mergel (vsc, vsc); + VMRGLB VMRGLB_VSC + vuc __builtin_vec_mergel (vuc, vuc); + VMRGLB VMRGLB_VUC + vbs __builtin_vec_mergel (vbs, vbs); + VMRGLH VMRGLH_VBS + vss __builtin_vec_mergel (vss, vss); + VMRGLH VMRGLH_VSS + vus __builtin_vec_mergel (vus, vus); + VMRGLH VMRGLH_VUS + vp __builtin_vec_mergel (vp, vp); + VMRGLH VMRGLH_VP + vbi __builtin_vec_mergel (vbi, vbi); + VMRGLW VMRGLW_VBI + vsi __builtin_vec_mergel (vsi, vsi); + VMRGLW VMRGLW_VSI + vui __builtin_vec_mergel (vui, vui); + VMRGLW VMRGLW_VUI + vbll __builtin_vec_mergel (vbll, vbll); + VEC_MERGEL_V2DI VEC_MERGEL_VBLL + vsll __builtin_vec_mergel (vsll, vsll); + VEC_MERGEL_V2DI VEC_MERGEL_VSLL + vull __builtin_vec_mergel (vull, vull); + VEC_MERGEL_V2DI VEC_MERGEL_VULL + vf __builtin_vec_mergel (vf, vf); + VMRGLW VMRGLW_VF + vd __builtin_vec_mergel (vd, vd); + VEC_MERGEL_V2DF +; The following variants are deprecated. + vsll __builtin_vec_mergel (vsll, vbll); + VEC_MERGEL_V2DI VEC_MERGEL_VSLL_VBLL + vsll __builtin_vec_mergel (vbll, vsll); + VEC_MERGEL_V2DI VEC_MERGEL_VBLL_VSLL + vull __builtin_vec_mergel (vull, vbll); + VEC_MERGEL_V2DI VEC_MERGEL_VULL_VBLL + vull __builtin_vec_mergel (vbll, vull); + VEC_MERGEL_V2DI VEC_MERGEL_VBLL_VULL + +[VEC_MERGEO, vec_mergeo, __builtin_vec_vmrgow, _ARCH_PWR8] + vsi __builtin_vec_vmrgow (vsi, vsi); + VMRGOW_V4SI VMRGOW_VSI + vui __builtin_vec_vmrgow (vui, vui); + VMRGOW_V4SI VMRGOW_VUI + vbi __builtin_vec_vmrgow (vbi, vbi); + VMRGOW_V4SI VMRGOW_VBI + vsll __builtin_vec_vmrgow (vsll, vsll); + VMRGOW_V2DI VMRGOW_VSLL + vull __builtin_vec_vmrgow (vull, vull); + VMRGOW_V2DI VMRGOW_VULL + vbll __builtin_vec_vmrgow (vbll, vbll); + VMRGOW_V2DI VMRGOW_VBLL + vf __builtin_vec_vmrgow (vf, vf); + VMRGOW_V4SF + vd __builtin_vec_vmrgow (vd, vd); + VMRGOW_V2DF + +[VEC_MFVSCR, vec_mfvscr, __builtin_vec_mfvscr] + vus __builtin_vec_mfvscr (); + MFVSCR + +[VEC_MIN, vec_min, __builtin_vec_min] + vsc __builtin_vec_min (vsc, vsc); + VMINSB + vuc __builtin_vec_min (vuc, vuc); + VMINUB + vss __builtin_vec_min (vss, vss); + VMINSH + vus __builtin_vec_min (vus, vus); + VMINUH + vsi __builtin_vec_min (vsi, vsi); + VMINSW + vui __builtin_vec_min (vui, vui); + VMINUW + vsll __builtin_vec_min (vsll, vsll); + VMINSD + vull __builtin_vec_min (vull, vull); + VMINUD + vf __builtin_vec_min (vf, vf); + VMINFP + vd __builtin_vec_min (vd, vd); + XVMINDP +; The following variants are deprecated. 
+ vsc __builtin_vec_min (vsc, vbc); + VMINSB VMINSB_SB + vsc __builtin_vec_min (vbc, vsc); + VMINSB VMINSB_BS + vuc __builtin_vec_min (vuc, vbc); + VMINUB VMINUB_UB + vuc __builtin_vec_min (vbc, vuc); + VMINUB VMINUB_BU + vss __builtin_vec_min (vss, vbs); + VMINSH VMINSH_SB + vss __builtin_vec_min (vbs, vss); + VMINSH VMINSH_BS + vus __builtin_vec_min (vus, vbs); + VMINUH VMINUH_UB + vus __builtin_vec_min (vbs, vus); + VMINUH VMINUH_BU + vsi __builtin_vec_min (vsi, vbi); + VMINSW VMINSW_SB + vsi __builtin_vec_min (vbi, vsi); + VMINSW VMINSW_BS + vui __builtin_vec_min (vui, vbi); + VMINUW VMINUW_UB + vui __builtin_vec_min (vbi, vui); + VMINUW VMINUW_BU + vsll __builtin_vec_min (vsll, vbll); + VMINSD VMINSD_SB + vsll __builtin_vec_min (vbll, vsll); + VMINSD VMINSD_BS + vull __builtin_vec_min (vull, vbll); + VMINUD VMINUD_UB + vull __builtin_vec_min (vbll, vull); + VMINUD VMINUD_BU + +[VEC_MLADD, vec_mladd, __builtin_vec_mladd] + vss __builtin_vec_mladd (vss, vss, vss); + VMLADDUHM VMLADDUHM_VSS2 + vss __builtin_vec_mladd (vss, vus, vus); + VMLADDUHM VMLADDUHM_VSSVUS2 + vss __builtin_vec_mladd (vus, vss, vss); + VMLADDUHM VMLADDUHM_VUSVSS2 + vus __builtin_vec_mladd (vus, vus, vus); + VMLADDUHM VMLADDUHM_VUS2 + +[VEC_MOD, vec_mod, __builtin_vec_mod, _ARCH_PWR10] + vsi __builtin_vec_mod (vsi, vsi); + VMODSW + vui __builtin_vec_mod (vui, vui); + VMODUW + vsll __builtin_vec_mod (vsll, vsll); + VMODSD + vull __builtin_vec_mod (vull, vull); + VMODUD + vsq __builtin_vec_mod (vsq, vsq); + MODS_V1TI + vuq __builtin_vec_mod (vuq, vuq); + MODU_V1TI + +[VEC_MRADDS, vec_mradds, __builtin_vec_mradds] + vss __builtin_vec_mradds (vss, vss, vss); + VMHRADDSHS + +[VEC_MSUB, vec_msub, __builtin_vec_msub, __VSX__] + vf __builtin_vec_msub (vf, vf, vf); + XVMSUBSP + vd __builtin_vec_msub (vd, vd, vd); + XVMSUBDP + +[VEC_MSUM, vec_msum, __builtin_vec_msum] + vui __builtin_vec_msum (vuc, vuc, vui); + VMSUMUBM + vsi __builtin_vec_msum (vsc, vuc, vsi); + VMSUMMBM + vui __builtin_vec_msum (vus, vus, vui); + VMSUMUHM + vsi __builtin_vec_msum (vss, vss, vsi); + VMSUMSHM + vsq __builtin_vec_msum (vsll, vsll, vsq); + VMSUMUDM VMSUMUDM_S + vuq __builtin_vec_msum (vull, vull, vuq); + VMSUMUDM VMSUMUDM_U + +[VEC_MSUMS, vec_msums, __builtin_vec_msums] + vui __builtin_vec_msums (vus, vus, vui); + VMSUMUHS + vsi __builtin_vec_msums (vss, vss, vsi); + VMSUMSHS + +[VEC_MTVSCR, vec_mtvscr, __builtin_vec_mtvscr] + void __builtin_vec_mtvscr (vbc); + MTVSCR MTVSCR_VBC + void __builtin_vec_mtvscr (vsc); + MTVSCR MTVSCR_VSC + void __builtin_vec_mtvscr (vuc); + MTVSCR MTVSCR_VUC + void __builtin_vec_mtvscr (vbs); + MTVSCR MTVSCR_VBS + void __builtin_vec_mtvscr (vss); + MTVSCR MTVSCR_VSS + void __builtin_vec_mtvscr (vus); + MTVSCR MTVSCR_VUS + void __builtin_vec_mtvscr (vp); + MTVSCR MTVSCR_VP + void __builtin_vec_mtvscr (vbi); + MTVSCR MTVSCR_VBI + void __builtin_vec_mtvscr (vsi); + MTVSCR MTVSCR_VSI + void __builtin_vec_mtvscr (vui); + MTVSCR MTVSCR_VUI + +; Note that the entries for VEC_MUL are currently ignored. See rs6000-c.c: +; altivec_resolve_overloaded_builtin, where there is special-case code for +; VEC_MUL. TODO: Is this really necessary? Investigate. Seven missing +; prototypes here...no corresponding builtins. Also added "vmulld" in P10 +; which could be used instead of MUL_V2DI, conditionally? 
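
The note above says the VEC_MUL entries are currently ignored because of special-case code in altivec_resolve_overloaded_builtin. A hedged sketch of what that special case amounts to for the integer vector types (behavioral only, not the front-end code): the overloaded vec_mul call resolves to an ordinary element-wise multiply via GCC's generic vector arithmetic rather than a named builtin.

    /* Illustrative only: roughly the effect of the vec_mul special case
       for integer element types -- a plain element-wise multiply instead
       of a call to a named builtin.  */
    #include <altivec.h>

    vector signed int
    mul_like (vector signed int a, vector signed int b)
    {
      return a * b;	/* GCC vector extension: element-wise MULT_EXPR */
    }
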
+[VEC_MUL, vec_mul, __builtin_vec_mul] + vsll __builtin_vec_mul (vsll, vsll); + MUL_V2DI + vf __builtin_vec_mul (vf, vf); + XVMULSP + vd __builtin_vec_mul (vd, vd); + XVMULDP + +[VEC_MULE, vec_mule, __builtin_vec_mule] + vss __builtin_vec_mule (vsc, vsc); + VMULESB + vus __builtin_vec_mule (vuc, vuc); + VMULEUB + vsi __builtin_vec_mule (vss, vss); + VMULESH + vui __builtin_vec_mule (vus, vus); + VMULEUH + vsll __builtin_vec_mule (vsi, vsi); + VMULESW + vull __builtin_vec_mule (vui, vui); + VMULEUW + vsq __builtin_vec_mule (vsll, vsll); + VMULESD + vuq __builtin_vec_mule (vull, vull); + VMULEUD + +[VEC_MULH, vec_mulh, __builtin_vec_mulh, _ARCH_PWR10] + vsi __builtin_vec_mulh (vsi, vsi); + VMULHSW + vui __builtin_vec_mulh (vui, vui); + VMULHUW + vsll __builtin_vec_mulh (vsll, vsll); + VMULHSD + vull __builtin_vec_mulh (vull, vull); + VMULHUD + +[VEC_MULO, vec_mulo, __builtin_vec_mulo] + vss __builtin_vec_mulo (vsc, vsc); + VMULOSB + vus __builtin_vec_mulo (vuc, vuc); + VMULOUB + vsi __builtin_vec_mulo (vss, vss); + VMULOSH + vui __builtin_vec_mulo (vus, vus); + VMULOUH + vsll __builtin_vec_mulo (vsi, vsi); + VMULOSW + vull __builtin_vec_mulo (vui, vui); + VMULOUW + vsq __builtin_vec_mulo (vsll, vsll); + VMULOSD + vuq __builtin_vec_mulo (vull, vull); + VMULOUD + +[VEC_NABS, vec_nabs, __builtin_vec_nabs] + vsc __builtin_vec_nabs (vsc); + NABS_V16QI + vss __builtin_vec_nabs (vss); + NABS_V8HI + vsi __builtin_vec_nabs (vsi); + NABS_V4SI + vsll __builtin_vec_nabs (vsll); + NABS_V2DI + vf __builtin_vec_nabs (vf); + NABS_V4SF + vd __builtin_vec_nabs (vd); + NABS_V2DF + +[VEC_NAND, vec_nand, __builtin_vec_nand, _ARCH_PWR8] + vsc __builtin_vec_nand (vsc, vsc); + NAND_V16QI + vuc __builtin_vec_nand (vuc, vuc); + NAND_V16QI_UNS NAND_VUC + vbc __builtin_vec_nand (vbc, vbc); + NAND_V16QI_UNS NAND_VBC + vss __builtin_vec_nand (vss, vss); + NAND_V8HI + vus __builtin_vec_nand (vus, vus); + NAND_V8HI_UNS NAND_VUS + vbs __builtin_vec_nand (vbs, vbs); + NAND_V8HI_UNS NAND_VBS + vsi __builtin_vec_nand (vsi, vsi); + NAND_V4SI + vui __builtin_vec_nand (vui, vui); + NAND_V4SI_UNS NAND_VUI + vbi __builtin_vec_nand (vbi, vbi); + NAND_V4SI_UNS NAND_VBI + vsll __builtin_vec_nand (vsll, vsll); + NAND_V2DI + vull __builtin_vec_nand (vull, vull); + NAND_V2DI_UNS NAND_VULL + vbll __builtin_vec_nand (vbll, vbll); + NAND_V2DI_UNS NAND_VBLL + vf __builtin_vec_nand (vf, vf); + NAND_V4SF + vd __builtin_vec_nand (vd, vd); + NAND_V2DF +; The following variants are deprecated. 
+ vsc __builtin_vec_nand (vbc, vsc); + NAND_V16QI NAND_VBC_VSC + vsc __builtin_vec_nand (vsc, vbc); + NAND_V16QI NAND_VSC_VBC + vuc __builtin_vec_nand (vbc, vuc); + NAND_V16QI_UNS NAND_VBC_VUC + vuc __builtin_vec_nand (vuc, vbc); + NAND_V16QI_UNS NAND_VUC_VBC + vss __builtin_vec_nand (vbs, vss); + NAND_V8HI NAND_VBS_VSS + vss __builtin_vec_nand (vss, vbs); + NAND_V8HI NAND_VSS_VBS + vus __builtin_vec_nand (vbs, vus); + NAND_V8HI_UNS NAND_VBS_VUS + vus __builtin_vec_nand (vus, vbs); + NAND_V8HI_UNS NAND_VUS_VBS + vsi __builtin_vec_nand (vbi, vsi); + NAND_V4SI NAND_VBI_VSI + vsi __builtin_vec_nand (vsi, vbi); + NAND_V4SI NAND_VSI_VBI + vui __builtin_vec_nand (vbi, vui); + NAND_V4SI_UNS NAND_VBI_VUI + vui __builtin_vec_nand (vui, vbi); + NAND_V4SI_UNS NAND_VUI_VBI + vsll __builtin_vec_nand (vbll, vsll); + NAND_V2DI NAND_VBLL_VSLL + vsll __builtin_vec_nand (vsll, vbll); + NAND_V2DI NAND_VSLL_VBLL + vull __builtin_vec_nand (vbll, vull); + NAND_V2DI_UNS NAND_VBLL_VULL + vull __builtin_vec_nand (vull, vbll); + NAND_V2DI_UNS NAND_VULL_VBLL + +[VEC_NCIPHER_BE, vec_ncipher_be, __builtin_vec_vncipher_be, _ARCH_PWR8] + vuc __builtin_vec_vncipher_be (vuc, vuc); + VNCIPHER_BE + +[VEC_NCIPHERLAST_BE, vec_ncipherlast_be, __builtin_vec_vncipherlast_be, _ARCH_PWR8] + vuc __builtin_vec_vncipherlast_be (vuc, vuc); + VNCIPHERLAST_BE + +[VEC_NEARBYINT, vec_nearbyint, __builtin_vec_nearbyint, __VSX__] + vf __builtin_vec_nearbyint (vf); + XVRSPI XVRSPI_NBI + vd __builtin_vec_nearbyint (vd); + XVRDPI XVRDPI_NBI + +[VEC_NEG, vec_neg, __builtin_vec_neg] + vsc __builtin_vec_neg (vsc); + NEG_V16QI + vss __builtin_vec_neg (vss); + NEG_V8HI + vsi __builtin_vec_neg (vsi); + NEG_V4SI + vsll __builtin_vec_neg (vsll); + NEG_V2DI + vf __builtin_vec_neg (vf); + NEG_V4SF + vd __builtin_vec_neg (vd); + NEG_V2DF + +[VEC_NMADD, vec_nmadd, __builtin_vec_nmadd, __VSX__] + vf __builtin_vec_nmadd (vf, vf, vf); + XVNMADDSP + vd __builtin_vec_nmadd (vd, vd, vd); + XVNMADDDP + +[VEC_NMSUB, vec_nmsub, __builtin_vec_nmsub] + vf __builtin_vec_nmsub (vf, vf, vf); + VNMSUBFP + vd __builtin_vec_nmsub (vd, vd, vd); + XVNMSUBDP + +[VEC_NOR, vec_nor, __builtin_vec_nor] + vsc __builtin_vec_nor (vsc, vsc); + VNOR_V16QI + vuc __builtin_vec_nor (vuc, vuc); + VNOR_V16QI_UNS VNOR_V16QI_U + vbc __builtin_vec_nor (vbc, vbc); + VNOR_V16QI_UNS VNOR_V16QI_B + vss __builtin_vec_nor (vss, vss); + VNOR_V8HI + vus __builtin_vec_nor (vus, vus); + VNOR_V8HI_UNS VNOR_V8HI_U + vbs __builtin_vec_nor (vbs, vbs); + VNOR_V8HI_UNS VNOR_V8HI_B + vsi __builtin_vec_nor (vsi, vsi); + VNOR_V4SI + vui __builtin_vec_nor (vui, vui); + VNOR_V4SI_UNS VNOR_V4SI_U + vbi __builtin_vec_nor (vbi, vbi); + VNOR_V4SI_UNS VNOR_V4SI_B + vsll __builtin_vec_nor (vsll, vsll); + VNOR_V2DI + vull __builtin_vec_nor (vull, vull); + VNOR_V2DI_UNS VNOR_V2DI_U + vbll __builtin_vec_nor (vbll, vbll); + VNOR_V2DI_UNS VNOR_V2DI_B + vsq __builtin_vec_nor (vsq, vsq); + VNOR_V1TI VNOR_V1TI_S + vuq __builtin_vec_nor (vuq, vuq); + VNOR_V1TI_UNS VNOR_V1TI_U + vf __builtin_vec_nor (vf, vf); + VNOR_V4SF + vd __builtin_vec_nor (vd, vd); + VNOR_V2DF +; The following variants are deprecated. 
+ vsll __builtin_vec_nor (vsll, vbll); + VNOR_V2DI VNOR_VSLL_VBLL + vsll __builtin_vec_nor (vbll, vsll); + VNOR_V2DI VNOR_VBLL_VSLL + vull __builtin_vec_nor (vull, vbll); + VNOR_V2DI_UNS VNOR_VULL_VBLL + vull __builtin_vec_nor (vbll, vull); + VNOR_V2DI_UNS VNOR_VBLL_VULL + vsq __builtin_vec_nor (vsq, vbq); + VNOR_V1TI VNOR_VSQ_VBQ + vsq __builtin_vec_nor (vbq, vsq); + VNOR_V1TI VNOR_VBQ_VSQ + vuq __builtin_vec_nor (vuq, vbq); + VNOR_V1TI_UNS VNOR_VUQ_VBQ + vuq __builtin_vec_nor (vbq, vuq); + VNOR_V1TI_UNS VNOR_VBQ_VUQ + +[VEC_OR, vec_or, __builtin_vec_or] + vsc __builtin_vec_or (vsc, vsc); + VOR_V16QI + vuc __builtin_vec_or (vuc, vuc); + VOR_V16QI_UNS VOR_V16QI_U + vbc __builtin_vec_or (vbc, vbc); + VOR_V16QI_UNS VOR_V16QI_B + vss __builtin_vec_or (vss, vss); + VOR_V8HI + vus __builtin_vec_or (vus, vus); + VOR_V8HI_UNS VOR_V8HI_U + vbs __builtin_vec_or (vbs, vbs); + VOR_V8HI_UNS VOR_V8HI_B + vsi __builtin_vec_or (vsi, vsi); + VOR_V4SI + vui __builtin_vec_or (vui, vui); + VOR_V4SI_UNS VOR_V4SI_U + vbi __builtin_vec_or (vbi, vbi); + VOR_V4SI_UNS VOR_V4SI_B + vsll __builtin_vec_or (vsll, vsll); + VOR_V2DI + vull __builtin_vec_or (vull, vull); + VOR_V2DI_UNS VOR_V2DI_U + vbll __builtin_vec_or (vbll, vbll); + VOR_V2DI_UNS VOR_V2DI_B + vf __builtin_vec_or (vf, vf); + VOR_V4SF + vd __builtin_vec_or (vd, vd); + VOR_V2DF +; The following variants are deprecated. + vsc __builtin_vec_or (vsc, vbc); + VOR_V16QI VOR_VSC_VBC + vsc __builtin_vec_or (vbc, vsc); + VOR_V16QI VOR_VBC_VSC + vuc __builtin_vec_or (vuc, vbc); + VOR_V16QI_UNS VOR_V16QI_UB + vuc __builtin_vec_or (vbc, vuc); + VOR_V16QI_UNS VOR_V16QI_BU + vss __builtin_vec_or (vss, vbs); + VOR_V8HI VOR_VSS_VBS + vss __builtin_vec_or (vbs, vss); + VOR_V8HI VOR_VBS_VSS + vus __builtin_vec_or (vus, vbs); + VOR_V8HI_UNS VOR_V8HI_UB + vus __builtin_vec_or (vbs, vus); + VOR_V8HI_UNS VOR_V8HI_BU + vsi __builtin_vec_or (vsi, vbi); + VOR_V4SI VOR_VSI_VBI + vsi __builtin_vec_or (vbi, vsi); + VOR_V4SI VOR_VBI_VSI + vui __builtin_vec_or (vui, vbi); + VOR_V4SI_UNS VOR_V4SI_UB + vui __builtin_vec_or (vbi, vui); + VOR_V4SI_UNS VOR_V4SI_BU + vsll __builtin_vec_or (vsll, vbll); + VOR_V2DI VOR_VSLL_VBLL + vsll __builtin_vec_or (vbll, vsll); + VOR_V2DI VOR_VBLL_VSLL + vull __builtin_vec_or (vull, vbll); + VOR_V2DI_UNS VOR_V2DI_UB + vull __builtin_vec_or (vbll, vull); + VOR_V2DI_UNS VOR_V2DI_BU + vf __builtin_vec_or (vf, vbi); + VOR_V4SF VOR_VF_VBI + vf __builtin_vec_or (vbi, vf); + VOR_V4SF VOR_VBI_VF + vd __builtin_vec_or (vd, vbll); + VOR_V2DF VOR_VD_VBLL + vd __builtin_vec_or (vbll, vd); + VOR_V2DF VOR_VBLL_VD + +[VEC_ORC, vec_orc, __builtin_vec_orc, _ARCH_PWR8] + vsc __builtin_vec_orc (vsc, vsc); + ORC_V16QI + vuc __builtin_vec_orc (vuc, vuc); + ORC_V16QI_UNS ORC_VUC + vbc __builtin_vec_orc (vbc, vbc); + ORC_V16QI_UNS ORC_VBC + vss __builtin_vec_orc (vss, vss); + ORC_V8HI + vus __builtin_vec_orc (vus, vus); + ORC_V8HI_UNS ORC_VUS + vbs __builtin_vec_orc (vbs, vbs); + ORC_V8HI_UNS ORC_VBS + vsi __builtin_vec_orc (vsi, vsi); + ORC_V4SI + vui __builtin_vec_orc (vui, vui); + ORC_V4SI_UNS ORC_VUI + vbi __builtin_vec_orc (vbi, vbi); + ORC_V4SI_UNS ORC_VBI + vsll __builtin_vec_orc (vsll, vsll); + ORC_V2DI + vull __builtin_vec_orc (vull, vull); + ORC_V2DI_UNS ORC_VULL + vbll __builtin_vec_orc (vbll, vbll); + ORC_V2DI_UNS ORC_VBLL + vf __builtin_vec_orc (vf, vf); + ORC_V4SF + vd __builtin_vec_orc (vd, vd); + ORC_V2DF +; The following variants are deprecated. 
+ vsc __builtin_vec_orc (vbc, vsc); + ORC_V16QI ORC_VBC_VSC + vsc __builtin_vec_orc (vsc, vbc); + ORC_V16QI ORC_VSC_VBC + vuc __builtin_vec_orc (vbc, vuc); + ORC_V16QI_UNS ORC_VBC_VUC + vuc __builtin_vec_orc (vuc, vbc); + ORC_V16QI_UNS ORC_VUC_VBC + vss __builtin_vec_orc (vbs, vss); + ORC_V8HI ORC_VBS_VSS + vss __builtin_vec_orc (vss, vbs); + ORC_V8HI ORC_VSS_VBS + vus __builtin_vec_orc (vbs, vus); + ORC_V8HI_UNS ORC_VBS_VUS + vus __builtin_vec_orc (vus, vbs); + ORC_V8HI_UNS ORC_VUS_VBS + vsi __builtin_vec_orc (vbi, vsi); + ORC_V4SI ORC_VBI_VSI + vsi __builtin_vec_orc (vsi, vbi); + ORC_V4SI ORC_VSI_VBI + vui __builtin_vec_orc (vbi, vui); + ORC_V4SI_UNS ORC_VBI_VUI + vui __builtin_vec_orc (vui, vbi); + ORC_V4SI_UNS ORC_VUI_VBI + vsll __builtin_vec_orc (vbll, vsll); + ORC_V2DI ORC_VBLL_VSLL + vsll __builtin_vec_orc (vsll, vbll); + ORC_V2DI ORC_VSLL_VBLL + vull __builtin_vec_orc (vbll, vull); + ORC_V2DI_UNS ORC_VBLL_VULL + vull __builtin_vec_orc (vull, vbll); + ORC_V2DI_UNS ORC_VULL_VBLL + +[VEC_PACK, vec_pack, __builtin_vec_pack] + vsc __builtin_vec_pack (vss, vss); + VPKUHUM VPKUHUM_VSS + vuc __builtin_vec_pack (vus, vus); + VPKUHUM VPKUHUM_VUS + vbc __builtin_vec_pack (vbs, vbs); + VPKUHUM VPKUHUM_VBS + vss __builtin_vec_pack (vsi, vsi); + VPKUWUM VPKUWUM_VSI + vus __builtin_vec_pack (vui, vui); + VPKUWUM VPKUWUM_VUI + vbs __builtin_vec_pack (vbi, vbi); + VPKUWUM VPKUWUM_VBI + vsi __builtin_vec_pack (vsll, vsll); + VPKUDUM VPKUDUM_VSLL + vui __builtin_vec_pack (vull, vull); + VPKUDUM VPKUDUM_VULL + vbi __builtin_vec_pack (vbll, vbll); + VPKUDUM VPKUDUM_VBLL + vf __builtin_vec_pack (vd, vd); + FLOAT2_V2DF FLOAT2_V2DF_PACK + +[VEC_PACKPX, vec_packpx, __builtin_vec_packpx] + vp __builtin_vec_packpx (vui, vui); + VPKPX + +[VEC_PACKS, vec_packs, __builtin_vec_packs] + vuc __builtin_vec_packs (vus, vus); + VPKUHUS VPKUHUS_S + vsc __builtin_vec_packs (vss, vss); + VPKSHSS + vus __builtin_vec_packs (vui, vui); + VPKUWUS VPKUWUS_S + vss __builtin_vec_packs (vsi, vsi); + VPKSWSS + vui __builtin_vec_packs (vull, vull); + VPKUDUS VPKUDUS_S + vsi __builtin_vec_packs (vsll, vsll); + VPKSDSS + +[VEC_PACKSU, vec_packsu, __builtin_vec_packsu] + vuc __builtin_vec_packsu (vus, vus); + VPKUHUS VPKUHUS_U + vuc __builtin_vec_packsu (vss, vss); + VPKSHUS + vus __builtin_vec_packsu (vui, vui); + VPKUWUS VPKUWUS_U + vus __builtin_vec_packsu (vsi, vsi); + VPKSWUS + vui __builtin_vec_packsu (vull, vull); + VPKUDUS VPKUDUS_U + vui __builtin_vec_packsu (vsll, vsll); + VPKSDUS + +[VEC_PDEP, vec_pdep, __builtin_vec_vpdepd, _ARCH_PWR10] + vull __builtin_vec_vpdepd (vull, vull); + VPDEPD + +[VEC_PERM, vec_perm, __builtin_vec_perm] + vsc __builtin_vec_perm (vsc, vsc, vuc); + VPERM_16QI + vuc __builtin_vec_perm (vuc, vuc, vuc); + VPERM_16QI_UNS VPERM_16QI_VUC + vbc __builtin_vec_perm (vbc, vbc, vuc); + VPERM_16QI_UNS VPERM_16QI_VBC + vss __builtin_vec_perm (vss, vss, vuc); + VPERM_8HI + vus __builtin_vec_perm (vus, vus, vuc); + VPERM_8HI_UNS VPERM_8HI_VUS + vbs __builtin_vec_perm (vbs, vbs, vuc); + VPERM_8HI_UNS VPERM_8HI_VBS + vp __builtin_vec_perm (vp, vp, vuc); + VPERM_8HI_UNS VPERM_8HI_VP + vsi __builtin_vec_perm (vsi, vsi, vuc); + VPERM_4SI + vui __builtin_vec_perm (vui, vui, vuc); + VPERM_4SI_UNS VPERM_4SI_VUI + vbi __builtin_vec_perm (vbi, vbi, vuc); + VPERM_4SI_UNS VPERM_4SI_VBI + vsll __builtin_vec_perm (vsll, vsll, vuc); + VPERM_2DI + vull __builtin_vec_perm (vull, vull, vuc); + VPERM_2DI_UNS VPERM_2DI_VULL + vbll __builtin_vec_perm (vbll, vbll, vuc); + VPERM_2DI_UNS VPERM_2DI_VBLL + vf __builtin_vec_perm (vf, vf, 
vuc); + VPERM_4SF + vd __builtin_vec_perm (vd, vd, vuc); + VPERM_2DF + vsq __builtin_vec_perm (vsq, vsq, vuc); + VPERM_1TI + vuq __builtin_vec_perm (vuq, vuq, vuc); + VPERM_1TI_UNS +; The following variants are deprecated. + vsc __builtin_vec_perm (vsc, vuc, vuc); + VPERM_16QI VPERM_VSC_VUC_VUC + vbc __builtin_vec_perm (vbc, vbc, vbc); + VPERM_16QI VPERM_VBC_VBC_VBC + +[VEC_PERMX, vec_permx, __builtin_vec_xxpermx, _ARCH_PWR10] + vsc __builtin_vec_xxpermx (vsc, vsc, vuc, const int); + XXPERMX_UV2DI XXPERMX_VSC + vuc __builtin_vec_xxpermx (vuc, vuc, vuc, const int); + XXPERMX_UV2DI XXPERMX_VUC + vss __builtin_vec_xxpermx (vss, vss, vuc, const int); + XXPERMX_UV2DI XXPERMX_VSS + vus __builtin_vec_xxpermx (vus, vus, vuc, const int); + XXPERMX_UV2DI XXPERMX_VUS + vsi __builtin_vec_xxpermx (vsi, vsi, vuc, const int); + XXPERMX_UV2DI XXPERMX_VSI + vui __builtin_vec_xxpermx (vui, vui, vuc, const int); + XXPERMX_UV2DI XXPERMX_VUI + vsll __builtin_vec_xxpermx (vsll, vsll, vuc, const int); + XXPERMX_UV2DI XXPERMX_VSLL + vull __builtin_vec_xxpermx (vull, vull, vuc, const int); + XXPERMX_UV2DI XXPERMX_VULL + vf __builtin_vec_xxpermx (vf, vf, vuc, const int); + XXPERMX_UV2DI XXPERMX_VF + vd __builtin_vec_xxpermx (vd, vd, vuc, const int); + XXPERMX_UV2DI XXPERMX_VD + +[VEC_PERMXOR, vec_permxor, __builtin_vec_vpermxor] + vsc __builtin_vec_vpermxor (vsc, vsc, vsc); + VPERMXOR VPERMXOR_VSC + vuc __builtin_vec_vpermxor (vuc, vuc, vuc); + VPERMXOR VPERMXOR_VUC + vbc __builtin_vec_vpermxor (vbc, vbc, vbc); + VPERMXOR VPERMXOR_VBC + +[VEC_PEXT, vec_pext, __builtin_vec_vpextd, _ARCH_PWR10] + vull __builtin_vec_vpextd (vull, vull); + VPEXTD + +[VEC_PMSUM, vec_pmsum_be, __builtin_vec_vpmsum] + vus __builtin_vec_vpmsum (vuc, vuc); + VPMSUMB VPMSUMB_V + vui __builtin_vec_vpmsum (vus, vus); + VPMSUMH VPMSUMH_V + vull __builtin_vec_vpmsum (vui, vui); + VPMSUMW VPMSUMW_V + vuq __builtin_vec_vpmsum (vull, vull); + VPMSUMD VPMSUMD_V + +[VEC_POPCNT, vec_popcnt, __builtin_vec_vpopcntu, _ARCH_PWR8] + vuc __builtin_vec_vpopcntu (vsc); + VPOPCNTB + vuc __builtin_vec_vpopcntu (vuc); + VPOPCNTUB + vus __builtin_vec_vpopcntu (vss); + VPOPCNTH + vus __builtin_vec_vpopcntu (vus); + VPOPCNTUH + vui __builtin_vec_vpopcntu (vsi); + VPOPCNTW + vui __builtin_vec_vpopcntu (vui); + VPOPCNTUW + vull __builtin_vec_vpopcntu (vsll); + VPOPCNTD + vull __builtin_vec_vpopcntu (vull); + VPOPCNTUD + +[VEC_PARITY_LSBB, vec_parity_lsbb, __builtin_vec_vparity_lsbb, _ARCH_PWR9] + vui __builtin_vec_vparity_lsbb (vsi); + VPRTYBW VPRTYBW_S + vui __builtin_vec_vparity_lsbb (vui); + VPRTYBW VPRTYBW_U + vull __builtin_vec_vparity_lsbb (vsll); + VPRTYBD VPRTYBD_S + vull __builtin_vec_vparity_lsbb (vull); + VPRTYBD VPRTYBD_U + vuq __builtin_vec_vparity_lsbb (vsq); + VPRTYBQ VPRTYBQ_S + vuq __builtin_vec_vparity_lsbb (vuq); + VPRTYBQ VPRTYBQ_U + +; There are no actual builtins for vec_promote. There is special handling for +; this in altivec_resolve_overloaded_builtin in rs6000-c.c, where the call +; is replaced by a constructor. The single overload here causes +; __builtin_vec_promote to be registered with the front end so that can happen. 
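
The vec_promote note above says the call is replaced by a constructor. As a hedged illustration of the user-visible effect (not the rs6000-c.c code, and the modulo treatment of the position argument is an assumption): the scalar lands in the requested element and the remaining elements are unspecified.

    /* Sketch of what the constructor replacement behaves like; the real
       vec_promote leaves the other elements undefined.  */
    #include <altivec.h>

    vector signed int
    promote_like (signed int x, unsigned int n)
    {
      vector signed int v = { 0, 0, 0, 0 };	/* placeholder contents */
      v[n & 3] = x;		/* GCC allows subscripting vector values */
      return v;
    }
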
+[VEC_PROMOTE, vec_promote, __builtin_vec_promote] + vsi __builtin_vec_promote (vsi); + ABS_V4SI PROMOTE_FAKERY + +[VEC_RE, vec_re, __builtin_vec_re] + vf __builtin_vec_re (vf); + VREFP + vd __builtin_vec_re (vd); + XVREDP + +[VEC_RECIP, vec_recipdiv, __builtin_vec_recipdiv] + vf __builtin_vec_recipdiv (vf, vf); + RECIP_V4SF + vd __builtin_vec_recipdiv (vd, vd); + RECIP_V2DF + +[VEC_REPLACE_ELT, vec_replace_elt, __builtin_vec_replace_elt, _ARCH_PWR10] + vui __builtin_vec_replace_elt (vui, unsigned int, const int); + VREPLACE_ELT_UV4SI + vsi __builtin_vec_replace_elt (vsi, signed int, const int); + VREPLACE_ELT_V4SI + vull __builtin_vec_replace_elt (vull, unsigned long long, const int); + VREPLACE_ELT_UV2DI + vsll __builtin_vec_replace_elt (vsll, signed long long, const int); + VREPLACE_ELT_V2DI + vf __builtin_vec_replace_elt (vf, float, const int); + VREPLACE_ELT_V4SF + vd __builtin_vec_replace_elt (vd, double, const int); + VREPLACE_ELT_V2DF + +[VEC_REPLACE_UN, vec_replace_unaligned, __builtin_vec_replace_un, _ARCH_PWR10] + vui __builtin_vec_replace_un (vui, unsigned int, const int); + VREPLACE_UN_UV4SI + vsi __builtin_vec_replace_un (vsi, signed int, const int); + VREPLACE_UN_V4SI + vull __builtin_vec_replace_un (vull, unsigned long long, const int); + VREPLACE_UN_UV2DI + vsll __builtin_vec_replace_un (vsll, signed long long, const int); + VREPLACE_UN_V2DI + vf __builtin_vec_replace_un (vf, float, const int); + VREPLACE_UN_V4SF + vd __builtin_vec_replace_un (vd, double, const int); + VREPLACE_UN_V2DF + +[VEC_REVB, vec_revb, __builtin_vec_revb, _ARCH_PWR8] + vss __builtin_vec_revb (vss); + REVB_V8HI REVB_VSS + vus __builtin_vec_revb (vus); + REVB_V8HI REVB_VUS + vsi __builtin_vec_revb (vsi); + REVB_V4SI REVB_VSI + vui __builtin_vec_revb (vui); + REVB_V4SI REVB_VUI + vsll __builtin_vec_revb (vsll); + REVB_V2DI REVB_VSLL + vull __builtin_vec_revb (vull); + REVB_V2DI REVB_VULL + vsq __builtin_vec_revb (vsq); + REVB_V1TI REVB_VSQ + vuq __builtin_vec_revb (vuq); + REVB_V1TI REVB_VUQ + vf __builtin_vec_revb (vf); + REVB_V4SF + vd __builtin_vec_revb (vd); + REVB_V2DF +; The following variants are deprecated. 
+ vsc __builtin_vec_revb (vsc); + REVB_V16QI REVB_VSC + vuc __builtin_vec_revb (vuc); + REVB_V16QI REVB_VUC + vbc __builtin_vec_revb (vbc); + REVB_V16QI REVB_VBC + vbs __builtin_vec_revb (vbs); + REVB_V8HI REVB_VBS + vbi __builtin_vec_revb (vbi); + REVB_V4SI REVB_VBI + vbll __builtin_vec_revb (vbll); + REVB_V2DI REVB_VBLL + +[VEC_REVE, vec_reve, __builtin_vec_vreve] + vsc __builtin_vec_vreve (vsc); + VREVE_V16QI VREVE_VSC + vuc __builtin_vec_vreve (vuc); + VREVE_V16QI VREVE_VUC + vbc __builtin_vec_vreve (vbc); + VREVE_V16QI VREVE_VBC + vss __builtin_vec_vreve (vss); + VREVE_V8HI VREVE_VSS + vus __builtin_vec_vreve (vus); + VREVE_V8HI VREVE_VUS + vbs __builtin_vec_vreve (vbs); + VREVE_V8HI VREVE_VBS + vsi __builtin_vec_vreve (vsi); + VREVE_V4SI VREVE_VSI + vui __builtin_vec_vreve (vui); + VREVE_V4SI VREVE_VUI + vbi __builtin_vec_vreve (vbi); + VREVE_V4SI VREVE_VBI + vsll __builtin_vec_vreve (vsll); + VREVE_V2DI VREVE_VSLL + vull __builtin_vec_vreve (vull); + VREVE_V2DI VREVE_VULL + vbll __builtin_vec_vreve (vbll); + VREVE_V2DI VREVE_VBLL + vf __builtin_vec_vreve (vf); + VREVE_V4SF + vd __builtin_vec_vreve (vd); + VREVE_V2DF + +[VEC_RINT, vec_rint, __builtin_vec_rint, __VSX__] + vf __builtin_vec_rint (vf); + XVRSPIC + vd __builtin_vec_rint (vd); + XVRDPIC + +[VEC_RL, vec_rl, __builtin_vec_rl] + vsc __builtin_vec_rl (vsc, vuc); + VRLB VRLB_VSC + vuc __builtin_vec_rl (vuc, vuc); + VRLB VRLB_VUC + vss __builtin_vec_rl (vss, vus); + VRLH VRLH_VSS + vus __builtin_vec_rl (vus, vus); + VRLH VRLH_VUS + vsi __builtin_vec_rl (vsi, vui); + VRLW VRLW_VSI + vui __builtin_vec_rl (vui, vui); + VRLW VRLW_VUI + vsll __builtin_vec_rl (vsll, vull); + VRLD VRLD_VSLL + vull __builtin_vec_rl (vull, vull); + VRLD VRLD_VULL + vsq __builtin_vec_rl (vsq, vuq); + VRLQ VRLQ_VSQ + vuq __builtin_vec_rl (vuq, vuq); + VRLQ VRLQ_VUQ + +[VEC_RLMI, vec_rlmi, __builtin_vec_rlmi, _ARCH_PWR9] + vui __builtin_vec_rlmi (vui, vui, vui); + VRLWMI + vull __builtin_vec_rlmi (vull, vull, vull); + VRLDMI + vsq __builtin_vec_rlmi (vsq, vsq, vuq); + VRLQMI VRLQMI_VSQ + vuq __builtin_vec_rlmi (vuq, vuq, vuq); + VRLQMI VRLQMI_VUQ + +[VEC_RLNM, vec_vrlnm, __builtin_vec_rlnm, _ARCH_PWR9] + vui __builtin_vec_rlnm (vui, vui); + VRLWNM + vull __builtin_vec_rlnm (vull, vull); + VRLDNM + vsq __builtin_vec_rlnm (vsq, vuq); + VRLQNM VRLQNM_VSQ + vuq __builtin_vec_rlnm (vuq, vuq); + VRLQNM VRLQNM_VUQ + +[VEC_ROUND, vec_round, __builtin_vec_round] + vf __builtin_vec_round (vf); + VRFIN + vd __builtin_vec_round (vd); + XVRDPI + +[VEC_RSQRT, vec_rsqrt, __builtin_vec_rsqrt] + vf __builtin_vec_rsqrt (vf); + RSQRT_4SF + vd __builtin_vec_rsqrt (vd); + RSQRT_2DF + +[VEC_RSQRTE, vec_rsqrte, __builtin_vec_rsqrte] + vf __builtin_vec_rsqrte (vf); + VRSQRTEFP + vd __builtin_vec_rsqrte (vd); + XVRSQRTEDP + +[VEC_SBOX_BE, vec_sbox_be, __builtin_vec_sbox_be, _ARCH_PWR8] + vuc __builtin_vec_sbox_be (vuc); + VSBOX_BE + +[VEC_SEL, vec_sel, __builtin_vec_sel] + vsc __builtin_vec_sel (vsc, vsc, vbc); + VSEL_16QI VSEL_16QI_B + vsc __builtin_vec_sel (vsc, vsc, vuc); + VSEL_16QI VSEL_16QI_U + vuc __builtin_vec_sel (vuc, vuc, vbc); + VSEL_16QI_UNS VSEL_16QI_UB + vuc __builtin_vec_sel (vuc, vuc, vuc); + VSEL_16QI_UNS VSEL_16QI_UU + vbc __builtin_vec_sel (vbc, vbc, vbc); + VSEL_16QI_UNS VSEL_16QI_BB + vbc __builtin_vec_sel (vbc, vbc, vuc); + VSEL_16QI_UNS VSEL_16QI_BU + vss __builtin_vec_sel (vss, vss, vbs); + VSEL_8HI VSEL_8HI_B + vss __builtin_vec_sel (vss, vss, vus); + VSEL_8HI VSEL_8HI_U + vus __builtin_vec_sel (vus, vus, vbs); + VSEL_8HI_UNS VSEL_8HI_UB + vus 
__builtin_vec_sel (vus, vus, vus); + VSEL_8HI_UNS VSEL_8HI_UU + vbs __builtin_vec_sel (vbs, vbs, vbs); + VSEL_8HI_UNS VSEL_8HI_BB + vbs __builtin_vec_sel (vbs, vbs, vus); + VSEL_8HI_UNS VSEL_8HI_BU + vsi __builtin_vec_sel (vsi, vsi, vbi); + VSEL_4SI VSEL_4SI_B + vsi __builtin_vec_sel (vsi, vsi, vui); + VSEL_4SI VSEL_4SI_U + vui __builtin_vec_sel (vui, vui, vbi); + VSEL_4SI_UNS VSEL_4SI_UB + vui __builtin_vec_sel (vui, vui, vui); + VSEL_4SI_UNS VSEL_4SI_UU + vbi __builtin_vec_sel (vbi, vbi, vbi); + VSEL_4SI_UNS VSEL_4SI_BB + vbi __builtin_vec_sel (vbi, vbi, vui); + VSEL_4SI_UNS VSEL_4SI_BU + vsll __builtin_vec_sel (vsll, vsll, vbll); + VSEL_2DI_B VSEL_2DI_B + vsll __builtin_vec_sel (vsll, vsll, vull); + VSEL_2DI_B VSEL_2DI_U + vull __builtin_vec_sel (vull, vull, vbll); + VSEL_2DI_UNS VSEL_2DI_UB + vull __builtin_vec_sel (vull, vull, vull); + VSEL_2DI_UNS VSEL_2DI_UU + vbll __builtin_vec_sel (vbll, vbll, vbll); + VSEL_2DI_UNS VSEL_2DI_BB + vbll __builtin_vec_sel (vbll, vbll, vull); + VSEL_2DI_UNS VSEL_2DI_BU + vf __builtin_vec_sel (vf, vf, vbi); + VSEL_4SF VSEL_4SF_B + vf __builtin_vec_sel (vf, vf, vui); + VSEL_4SF VSEL_4SF_U + vd __builtin_vec_sel (vd, vd, vbll); + VSEL_2DF VSEL_2DF_B + vd __builtin_vec_sel (vd, vd, vull); + VSEL_2DF VSEL_2DF_U +; The following variants are deprecated. + vsll __builtin_vec_sel (vsll, vsll, vsll); + VSEL_2DI_B VSEL_2DI_S + vull __builtin_vec_sel (vull, vull, vsll); + VSEL_2DI_UNS VSEL_2DI_US + vf __builtin_vec_sel (vf, vf, vf); + VSEL_4SF VSEL_4SF_F + vf __builtin_vec_sel (vf, vf, vsi); + VSEL_4SF VSEL_4SF_S + vd __builtin_vec_sel (vd, vd, vsll); + VSEL_2DF VSEL_2DF_S + vd __builtin_vec_sel (vd, vd, vd); + VSEL_2DF VSEL_2DF_D + +[VEC_SHASIGMA_BE, vec_shasigma_be, __builtin_crypto_vshasigma] + vui __builtin_crypto_vshasigma (vui, const int, const int); + VSHASIGMAW + vull __builtin_crypto_vshasigma (vull, const int, const int); + VSHASIGMAD + +[VEC_SIGNED, vec_signed, __builtin_vec_vsigned] + vsi __builtin_vec_vsigned (vf); + VEC_VSIGNED_V4SF + vsll __builtin_vec_vsigned (vd); + VEC_VSIGNED_V2DF + +[VEC_SIGNED2, vec_signed2, __builtin_vec_vsigned2] + vsi __builtin_vec_vsigned2 (vd, vd); + VEC_VSIGNED2_V2DF + +[VEC_SIGNEDE, vec_signede, __builtin_vec_vsignede] + vsi __builtin_vec_vsignede (vd); + VEC_VSIGNEDE_V2DF + +[VEC_SIGNEDO, vec_signedo, __builtin_vec_vsignedo] + vsi __builtin_vec_vsignedo (vd); + VEC_VSIGNEDO_V2DF + +[VEC_SIGNEXTI, vec_signexti, __builtin_vec_signexti, _ARCH_PWR9] + vsi __builtin_vec_signexti (vsc); + VSIGNEXTSB2W + vsi __builtin_vec_signexti (vss); + VSIGNEXTSH2W + +[VEC_SIGNEXTLL, vec_signextll, __builtin_vec_signextll, _ARCH_PWR9] + vsll __builtin_vec_signextll (vsc); + VSIGNEXTSB2D + vsll __builtin_vec_signextll (vss); + VSIGNEXTSH2D + vsll __builtin_vec_signextll (vsi); + VSIGNEXTSW2D + +[VEC_SIGNEXTQ, vec_signextq, __builtin_vec_signextq, _ARCH_PWR10] + vsq __builtin_vec_signextq (vsll); + VSIGNEXTSD2Q + +[VEC_SL, vec_sl, __builtin_vec_sl] + vsc __builtin_vec_sl (vsc, vuc); + VSLB VSLB_VSC + vuc __builtin_vec_sl (vuc, vuc); + VSLB VSLB_VUC + vss __builtin_vec_sl (vss, vus); + VSLH VSLH_VSS + vus __builtin_vec_sl (vus, vus); + VSLH VSLH_VUS + vsi __builtin_vec_sl (vsi, vui); + VSLW VSLW_VSI + vui __builtin_vec_sl (vui, vui); + VSLW VSLW_VUI + vsll __builtin_vec_sl (vsll, vull); + VSLD VSLD_VSLL + vull __builtin_vec_sl (vull, vull); + VSLD VSLD_VULL + vsq __builtin_vec_sl (vsq, vuq); + VSLQ VSLQ_VSQ + vuq __builtin_vec_sl (vuq, vuq); + VSLQ VSLQ_VUQ + +[VEC_SLD, vec_sld, __builtin_vec_sld] + vsc __builtin_vec_sld (vsc, vsc, const 
int); + VSLDOI_16QI VSLDOI_VSC + vbc __builtin_vec_sld (vbc, vbc, const int); + VSLDOI_16QI VSLDOI_VBC + vuc __builtin_vec_sld (vuc, vuc, const int); + VSLDOI_16QI VSLDOI_VUC + vss __builtin_vec_sld (vss, vss, const int); + VSLDOI_8HI VSLDOI_VSS + vbs __builtin_vec_sld (vbs, vbs, const int); + VSLDOI_8HI VSLDOI_VBS + vus __builtin_vec_sld (vus, vus, const int); + VSLDOI_8HI VSLDOI_VUS + vp __builtin_vec_sld (vp, vp, const int); + VSLDOI_8HI VSLDOI_VP + vsi __builtin_vec_sld (vsi, vsi, const int); + VSLDOI_4SI VSLDOI_VSI + vbi __builtin_vec_sld (vbi, vbi, const int); + VSLDOI_4SI VSLDOI_VBI + vui __builtin_vec_sld (vui, vui, const int); + VSLDOI_4SI VSLDOI_VUI + vsll __builtin_vec_sld (vsll, vsll, const int); + VSLDOI_2DI VSLDOI_VSLL + vbll __builtin_vec_sld (vbll, vbll, const int); + VSLDOI_2DI VSLDOI_VBLL + vull __builtin_vec_sld (vull, vull, const int); + VSLDOI_2DI VSLDOI_VULL + vf __builtin_vec_sld (vf, vf, const int); + VSLDOI_4SF + vd __builtin_vec_sld (vd, vd, const int); + VSLDOI_2DF + +[VEC_SLDB, vec_sldb, __builtin_vec_sldb, _ARCH_PWR10] + vsc __builtin_vec_sldb (vsc, vsc, const int); + VSLDB_V16QI VSLDB_VSC + vuc __builtin_vec_sldb (vuc, vuc, const int); + VSLDB_V16QI VSLDB_VUC + vss __builtin_vec_sldb (vss, vss, const int); + VSLDB_V8HI VSLDB_VSS + vus __builtin_vec_sldb (vus, vus, const int); + VSLDB_V8HI VSLDB_VUS + vsi __builtin_vec_sldb (vsi, vsi, const int); + VSLDB_V4SI VSLDB_VSI + vui __builtin_vec_sldb (vui, vui, const int); + VSLDB_V4SI VSLDB_VUI + vsll __builtin_vec_sldb (vsll, vsll, const int); + VSLDB_V2DI VSLDB_VSLL + vull __builtin_vec_sldb (vull, vull, const int); + VSLDB_V2DI VSLDB_VULL + +[VEC_SLDW, vec_sldw, __builtin_vec_sldw] + vsc __builtin_vec_sldw (vsc, vsc, const int); + XXSLDWI_16QI XXSLDWI_VSC + vuc __builtin_vec_sldw (vuc, vuc, const int); + XXSLDWI_16QI XXSLDWI_VUC + vss __builtin_vec_sldw (vss, vss, const int); + XXSLDWI_8HI XXSLDWI_VSS + vus __builtin_vec_sldw (vus, vus, const int); + XXSLDWI_8HI XXSLDWI_VUS + vsi __builtin_vec_sldw (vsi, vsi, const int); + XXSLDWI_4SI XXSLDWI_VSI + vui __builtin_vec_sldw (vui, vui, const int); + XXSLDWI_4SI XXSLDWI_VUI + vsll __builtin_vec_sldw (vsll, vsll, const int); + XXSLDWI_2DI XXSLDWI_VSLL + vull __builtin_vec_sldw (vull, vull, const int); + XXSLDWI_2DI XXSLDWI_VULL + +[VEC_SLL, vec_sll, __builtin_vec_sll] + vsc __builtin_vec_sll (vsc, vuc); + VSL VSL_VSC + vuc __builtin_vec_sll (vuc, vuc); + VSL VSL_VUC + vss __builtin_vec_sll (vss, vuc); + VSL VSL_VSS + vus __builtin_vec_sll (vus, vuc); + VSL VSL_VUS + vp __builtin_vec_sll (vp, vuc); + VSL VSL_VP + vsi __builtin_vec_sll (vsi, vuc); + VSL VSL_VSI + vui __builtin_vec_sll (vui, vuc); + VSL VSL_VUI + vsll __builtin_vec_sll (vsll, vuc); + VSL VSL_VSLL + vull __builtin_vec_sll (vull, vuc); + VSL VSL_VULL +; The following variants are deprecated. 
+ vsc __builtin_vec_sll (vsc, vus); + VSL VSL_VSC_VUS + vsc __builtin_vec_sll (vsc, vui); + VSL VSL_VSC_VUI + vuc __builtin_vec_sll (vuc, vus); + VSL VSL_VUC_VUS + vuc __builtin_vec_sll (vuc, vui); + VSL VSL_VUC_VUI + vbc __builtin_vec_sll (vbc, vuc); + VSL VSL_VBC_VUC + vbc __builtin_vec_sll (vbc, vus); + VSL VSL_VBC_VUS + vbc __builtin_vec_sll (vbc, vui); + VSL VSL_VBC_VUI + vss __builtin_vec_sll (vss, vus); + VSL VSL_VSS_VUS + vss __builtin_vec_sll (vss, vui); + VSL VSL_VSS_VUI + vus __builtin_vec_sll (vus, vus); + VSL VSL_VUS_VUS + vus __builtin_vec_sll (vus, vui); + VSL VSL_VUS_VUI + vbs __builtin_vec_sll (vbs, vuc); + VSL VSL_VBS_VUC + vbs __builtin_vec_sll (vbs, vus); + VSL VSL_VBS_VUS + vbs __builtin_vec_sll (vbs, vui); + VSL VSL_VBS_VUI + vp __builtin_vec_sll (vp, vus); + VSL VSL_VP_VUS + vp __builtin_vec_sll (vp, vui); + VSL VSL_VP_VUI + vsi __builtin_vec_sll (vsi, vus); + VSL VSL_VSI_VUS + vsi __builtin_vec_sll (vsi, vui); + VSL VSL_VSI_VUI + vui __builtin_vec_sll (vui, vus); + VSL VSL_VUI_VUS + vui __builtin_vec_sll (vui, vui); + VSL VSL_VUI_VUI + vbi __builtin_vec_sll (vbi, vuc); + VSL VSL_VBI_VUC + vbi __builtin_vec_sll (vbi, vus); + VSL VSL_VBI_VUS + vbi __builtin_vec_sll (vbi, vui); + VSL VSL_VBI_VUI + vbll __builtin_vec_sll (vbll, vuc); + VSL VSL_VBLL_VUC + vbll __builtin_vec_sll (vbll, vus); + VSL VSL_VBLL_VUS + vbll __builtin_vec_sll (vbll, vull); + VSL VSL_VBLL_VULL + +[VEC_SLO, vec_slo, __builtin_vec_slo] + vsc __builtin_vec_slo (vsc, vsc); + VSLO VSLO_VSCS + vsc __builtin_vec_slo (vsc, vuc); + VSLO VSLO_VSCU + vuc __builtin_vec_slo (vuc, vsc); + VSLO VSLO_VUCS + vuc __builtin_vec_slo (vuc, vuc); + VSLO VSLO_VUCU + vss __builtin_vec_slo (vss, vsc); + VSLO VSLO_VSSS + vss __builtin_vec_slo (vss, vuc); + VSLO VSLO_VSSU + vus __builtin_vec_slo (vus, vsc); + VSLO VSLO_VUSS + vus __builtin_vec_slo (vus, vuc); + VSLO VSLO_VUSU + vp __builtin_vec_slo (vp, vsc); + VSLO VSLO_VPS + vp __builtin_vec_slo (vp, vuc); + VSLO VSLO_VPU + vsi __builtin_vec_slo (vsi, vsc); + VSLO VSLO_VSIS + vsi __builtin_vec_slo (vsi, vuc); + VSLO VSLO_VSIU + vui __builtin_vec_slo (vui, vsc); + VSLO VSLO_VUIS + vui __builtin_vec_slo (vui, vuc); + VSLO VSLO_VUIU + vsll __builtin_vec_slo (vsll, vsc); + VSLO VSLO_VSLLS + vsll __builtin_vec_slo (vsll, vuc); + VSLO VSLO_VSLLU + vull __builtin_vec_slo (vull, vsc); + VSLO VSLO_VULLS + vull __builtin_vec_slo (vull, vuc); + VSLO VSLO_VULLU + vf __builtin_vec_slo (vf, vsc); + VSLO VSLO_VFS + vf __builtin_vec_slo (vf, vuc); + VSLO VSLO_VFU + +[VEC_SLV, vec_slv, __builtin_vec_vslv, _ARCH_PWR9] + vuc __builtin_vec_vslv (vuc, vuc); + VSLV + +[VEC_SPLAT, vec_splat, __builtin_vec_splat] + vsc __builtin_vec_splat (vsc, signed int); + VSPLTB VSPLTB_VSC + vuc __builtin_vec_splat (vuc, signed int); + VSPLTB VSPLTB_VUC + vbc __builtin_vec_splat (vbc, signed int); + VSPLTB VSPLTB_VBC + vss __builtin_vec_splat (vss, signed int); + VSPLTH VSPLTH_VSS + vus __builtin_vec_splat (vus, signed int); + VSPLTH VSPLTH_VUS + vbs __builtin_vec_splat (vbs, signed int); + VSPLTH VSPLTH_VBS + vp __builtin_vec_splat (vp, signed int); + VSPLTH VSPLTH_VP + vf __builtin_vec_splat (vf, signed int); + VSPLTW VSPLTW_VF + vsi __builtin_vec_splat (vsi, signed int); + VSPLTW VSPLTW_VSI + vui __builtin_vec_splat (vui, signed int); + VSPLTW VSPLTW_VUI + vbi __builtin_vec_splat (vbi, signed int); + VSPLTW VSPLTW_VBI + vd __builtin_vec_splat (vd, signed int); + XXSPLTD_V2DF + vsll __builtin_vec_splat (vsll, signed int); + XXSPLTD_V2DI XXSPLTD_VSLL + vull __builtin_vec_splat (vull, signed int); + 
XXSPLTD_V2DI XXSPLTD_VULL + vbll __builtin_vec_splat (vbll, signed int); + XXSPLTD_V2DI XXSPLTD_VBLL + +[VEC_SPLAT_S8, vec_splat_s8, __builtin_vec_splat_s8] + vsc __builtin_vec_splat_s8 (signed int); + VSPLTISB + +[VEC_SPLAT_S16, vec_splat_s16, __builtin_vec_splat_s16] + vss __builtin_vec_splat_s16 (signed int); + VSPLTISH + +[VEC_SPLAT_S32, vec_splat_s32, __builtin_vec_splat_s32] + vsi __builtin_vec_splat_s32 (signed int); + VSPLTISW + +; There are no entries for vec_splat_u{8,16,32}. These are handled +; in altivec.h with a #define and a cast. + +[VEC_SPLATI, vec_splati, __builtin_vec_xxspltiw, _ARCH_PWR10] + vsi __builtin_vec_xxspltiw (signed int); + VXXSPLTIW_V4SI + vf __builtin_vec_xxspltiw (float); + VXXSPLTIW_V4SF + +[VEC_SPLATID, vec_splatid, __builtin_vec_xxspltid, _ARCH_PWR10] + vd __builtin_vec_xxspltid (float); + VXXSPLTIDP + +[VEC_SPLATI_INS, vec_splati_ins, __builtin_vec_xxsplti32dx, _ARCH_PWR10] + vsi __builtin_vec_xxsplti32dx (vsi, const int, signed int); + VXXSPLTI32DX_V4SI VXXSPLTI32DX_VSI + vui __builtin_vec_xxsplti32dx (vui, const int, unsigned int); + VXXSPLTI32DX_V4SI VXXSPLTI32DX_VUI + vf __builtin_vec_xxsplti32dx (vf, const int, float); + VXXSPLTI32DX_V4SF + +; There are no actual builtins for vec_splats. There is special handling for +; this in altivec_resolve_overloaded_builtin in rs6000-c.c, where the call +; is replaced by a constructor. The single overload here causes +; __builtin_vec_splats to be registered with the front end so that can happen. +[VEC_SPLATS, vec_splats, __builtin_vec_splats] + vsi __builtin_vec_splats (vsi); + ABS_V4SI SPLATS_FAKERY + +[VEC_SQRT, vec_sqrt, __builtin_vec_sqrt, __VSX__] + vf __builtin_vec_sqrt (vf); + XVSQRTSP + vd __builtin_vec_sqrt (vd); + XVSQRTDP + +[VEC_SR, vec_sr, __builtin_vec_sr] + vsc __builtin_vec_sr (vsc, vuc); + VSRB VSRB_VSC + vuc __builtin_vec_sr (vuc, vuc); + VSRB VSRB_VUC + vss __builtin_vec_sr (vss, vus); + VSRH VSRH_VSS + vus __builtin_vec_sr (vus, vus); + VSRH VSRH_VUS + vsi __builtin_vec_sr (vsi, vui); + VSRW VSRW_VSI + vui __builtin_vec_sr (vui, vui); + VSRW VSRW_VUI + vsll __builtin_vec_sr (vsll, vull); + VSRD VSRD_VSLL + vull __builtin_vec_sr (vull, vull); + VSRD VSRD_VULL + vsq __builtin_vec_sr (vsq, vuq); + VSRQ VSRQ_VSQ + vuq __builtin_vec_sr (vuq, vuq); + VSRQ VSRQ_VUQ + +[VEC_SRA, vec_sra, __builtin_vec_sra] + vsc __builtin_vec_sra (vsc, vuc); + VSRAB VSRAB_VSC + vuc __builtin_vec_sra (vuc, vuc); + VSRAB VSRAB_VUC + vss __builtin_vec_sra (vss, vus); + VSRAH VSRAH_VSS + vus __builtin_vec_sra (vus, vus); + VSRAH VSRAH_VUS + vsi __builtin_vec_sra (vsi, vui); + VSRAW VSRAW_VSI + vui __builtin_vec_sra (vui, vui); + VSRAW VSRAW_VUI + vsll __builtin_vec_sra (vsll, vull); + VSRAD VSRAD_VSLL + vull __builtin_vec_sra (vull, vull); + VSRAD VSRAD_VULL + vsq __builtin_vec_sra (vsq, vuq); + VSRAQ VSRAQ_VSQ + vuq __builtin_vec_sra (vuq, vuq); + VSRAQ VSRAQ_VUQ + +[VEC_SRDB, vec_srdb, __builtin_vec_srdb, _ARCH_PWR10] + vsc __builtin_vec_srdb (vsc, vsc, const int); + VSRDB_V16QI VSRDB_VSC + vuc __builtin_vec_srdb (vuc, vuc, const int); + VSRDB_V16QI VSRDB_VUC + vss __builtin_vec_srdb (vss, vss, const int); + VSRDB_V8HI VSRDB_VSS + vus __builtin_vec_srdb (vus, vus, const int); + VSRDB_V8HI VSRDB_VUS + vsi __builtin_vec_srdb (vsi, vsi, const int); + VSRDB_V4SI VSRDB_VSI + vui __builtin_vec_srdb (vui, vui, const int); + VSRDB_V4SI VSRDB_VUI + vsll __builtin_vec_srdb (vsll, vsll, const int); + VSRDB_V2DI VSRDB_VSLL + vull __builtin_vec_srdb (vull, vull, const int); + VSRDB_V2DI VSRDB_VULL + +[VEC_SRL, vec_srl, 
__builtin_vec_srl] + vsc __builtin_vec_srl (vsc, vuc); + VSR VSR_VSC + vuc __builtin_vec_srl (vuc, vuc); + VSR VSR_VUC + vss __builtin_vec_srl (vss, vuc); + VSR VSR_VSS + vus __builtin_vec_srl (vus, vuc); + VSR VSR_VUS + vp __builtin_vec_srl (vp, vuc); + VSR VSR_VP + vsi __builtin_vec_srl (vsi, vuc); + VSR VSR_VSI + vui __builtin_vec_srl (vui, vuc); + VSR VSR_VUI + vsll __builtin_vec_srl (vsll, vuc); + VSR VSR_VSLL + vull __builtin_vec_srl (vull, vuc); + VSR VSR_VULL +; The following variants are deprecated. + vsc __builtin_vec_srl (vsc, vus); + VSR VSR_VSC_VUS + vsc __builtin_vec_srl (vsc, vui); + VSR VSR_VSC_VUI + vuc __builtin_vec_srl (vuc, vus); + VSR VSR_VUC_VUS + vuc __builtin_vec_srl (vuc, vui); + VSR VSR_VUC_VUI + vbc __builtin_vec_srl (vbc, vuc); + VSR VSR_VBC_VUC + vbc __builtin_vec_srl (vbc, vus); + VSR VSR_VBC_VUS + vbc __builtin_vec_srl (vbc, vui); + VSR VSR_VBC_VUI + vss __builtin_vec_srl (vss, vus); + VSR VSR_VSS_VUS + vss __builtin_vec_srl (vss, vui); + VSR VSR_VSS_VUI + vus __builtin_vec_srl (vus, vus); + VSR VSR_VUS_VUS + vus __builtin_vec_srl (vus, vui); + VSR VSR_VUS_VUI + vbs __builtin_vec_srl (vbs, vuc); + VSR VSR_VBS_VUC + vbs __builtin_vec_srl (vbs, vus); + VSR VSR_VBS_VUS + vbs __builtin_vec_srl (vbs, vui); + VSR VSR_VBS_VUI + vp __builtin_vec_srl (vp, vus); + VSR VSR_VP_VUS + vp __builtin_vec_srl (vp, vui); + VSR VSR_VP_VUI + vsi __builtin_vec_srl (vsi, vus); + VSR VSR_VSI_VUS + vsi __builtin_vec_srl (vsi, vui); + VSR VSR_VSI_VUI + vui __builtin_vec_srl (vui, vus); + VSR VSR_VUI_VUS + vui __builtin_vec_srl (vui, vui); + VSR VSR_VUI_VUI + vbi __builtin_vec_srl (vbi, vuc); + VSR VSR_VBI_VUC + vbi __builtin_vec_srl (vbi, vus); + VSR VSR_VBI_VUS + vbi __builtin_vec_srl (vbi, vui); + VSR VSR_VBI_VUI + +[VEC_SRO, vec_sro, __builtin_vec_sro] + vsc __builtin_vec_sro (vsc, vsc); + VSRO VSRO_VSCS + vsc __builtin_vec_sro (vsc, vuc); + VSRO VSRO_VSCU + vuc __builtin_vec_sro (vuc, vsc); + VSRO VSRO_VUCS + vuc __builtin_vec_sro (vuc, vuc); + VSRO VSRO_VUCU + vss __builtin_vec_sro (vss, vsc); + VSRO VSRO_VSSS + vss __builtin_vec_sro (vss, vuc); + VSRO VSRO_VSSU + vus __builtin_vec_sro (vus, vsc); + VSRO VSRO_VUSS + vus __builtin_vec_sro (vus, vuc); + VSRO VSRO_VUSU + vp __builtin_vec_sro (vp, vsc); + VSRO VSRO_VPS + vp __builtin_vec_sro (vp, vuc); + VSRO VSRO_VPU + vsi __builtin_vec_sro (vsi, vsc); + VSRO VSRO_VSIS + vsi __builtin_vec_sro (vsi, vuc); + VSRO VSRO_VSIU + vui __builtin_vec_sro (vui, vsc); + VSRO VSRO_VUIS + vui __builtin_vec_sro (vui, vuc); + VSRO VSRO_VUIU + vsll __builtin_vec_sro (vsll, vsc); + VSRO VSRO_VSLLS + vsll __builtin_vec_sro (vsll, vuc); + VSRO VSRO_VSLLU + vull __builtin_vec_sro (vull, vsc); + VSRO VSRO_VULLS + vull __builtin_vec_sro (vull, vuc); + VSRO VSRO_VULLU + vf __builtin_vec_sro (vf, vsc); + VSRO VSRO_VFS + vf __builtin_vec_sro (vf, vuc); + VSRO VSRO_VFU + +[VEC_SRV, vec_srv, __builtin_vec_vsrv, _ARCH_PWR9] + vuc __builtin_vec_vsrv (vuc, vuc); + VSRV + +[VEC_ST, vec_st, __builtin_vec_st] + void __builtin_vec_st (vsc, signed long long, vsc *); + STVX_V16QI STVX_VSC + void __builtin_vec_st (vsc, signed long long, signed char *); + STVX_V16QI STVX_SC + void __builtin_vec_st (vuc, signed long long, vuc *); + STVX_V16QI STVX_VUC + void __builtin_vec_st (vuc, signed long long, unsigned char *); + STVX_V16QI STVX_UC + void __builtin_vec_st (vbc, signed long long, vbc *); + STVX_V16QI STVX_VBC + void __builtin_vec_st (vbc, signed long long, signed char *); + STVX_V16QI STVX_SC_B + void __builtin_vec_st (vbc, signed long long, unsigned char *); + 
STVX_V16QI STVX_UC_B + void __builtin_vec_st (vss, signed long long, vss *); + STVX_V8HI STVX_VSS + void __builtin_vec_st (vss, signed long long, signed short *); + STVX_V8HI STVX_SS + void __builtin_vec_st (vus, signed long long, vus *); + STVX_V8HI STVX_VUS + void __builtin_vec_st (vus, signed long long, unsigned short *); + STVX_V8HI STVX_US + void __builtin_vec_st (vbs, signed long long, vbs *); + STVX_V8HI STVX_VBS + void __builtin_vec_st (vbs, signed long long, signed short *); + STVX_V8HI STVX_SS_B + void __builtin_vec_st (vbs, signed long long, unsigned short *); + STVX_V8HI STVX_US_B + void __builtin_vec_st (vp, signed long long, vp *); + STVX_V8HI STVX_P + void __builtin_vec_st (vsi, signed long long, vsi *); + STVX_V4SI STVX_VSI + void __builtin_vec_st (vsi, signed long long, signed int *); + STVX_V4SI STVX_SI + void __builtin_vec_st (vui, signed long long, vui *); + STVX_V4SI STVX_VUI + void __builtin_vec_st (vui, signed long long, unsigned int *); + STVX_V4SI STVX_UI + void __builtin_vec_st (vbi, signed long long, vbi *); + STVX_V4SI STVX_VBI + void __builtin_vec_st (vbi, signed long long, signed int *); + STVX_V4SI STVX_SI_B + void __builtin_vec_st (vbi, signed long long, unsigned int *); + STVX_V4SI STVX_UI_B + void __builtin_vec_st (vsll, signed long long, vsll *); + STVX_V2DI STVX_VSLL + void __builtin_vec_st (vsll, signed long long, signed long long *); + STVX_V2DI STVX_SLL + void __builtin_vec_st (vull, signed long long, vull *); + STVX_V2DI STVX_VULL + void __builtin_vec_st (vull, signed long long, unsigned long long *); + STVX_V2DI STVX_ULL + void __builtin_vec_st (vbll, signed long long, vbll *); + STVX_V2DI STVX_VBLL + void __builtin_vec_st (vf, signed long long, vf *); + STVX_V4SF STVX_VF + void __builtin_vec_st (vf, signed long long, float *); + STVX_V4SF STVX_F + void __builtin_vec_st (vd, signed long long, vd *); + STVX_V2DF STVX_VD + void __builtin_vec_st (vd, signed long long, double *); + STVX_V2DF STVX_D +; The following variants are deprecated. 
+ void __builtin_vec_st (vbll, signed long long, signed long long *); + STVX_V2DI STVX_SLL_B + void __builtin_vec_st (vbll, signed long long, unsigned long long *); + STVX_V2DI STVX_ULL_B + +[VEC_STE, vec_ste, __builtin_vec_ste] + void __builtin_vec_ste (vsc, signed long long, signed char *); + STVEBX STVEBX_S + void __builtin_vec_ste (vuc, signed long long, unsigned char *); + STVEBX STVEBX_U + void __builtin_vec_ste (vbc, signed long long, signed char *); + STVEBX STVEBX_BS + void __builtin_vec_ste (vbc, signed long long, unsigned char *); + STVEBX STVEBX_BU + void __builtin_vec_ste (vss, signed long long, signed short *); + STVEHX STVEHX_S + void __builtin_vec_ste (vus, signed long long, unsigned short *); + STVEHX STVEHX_U + void __builtin_vec_ste (vbs, signed long long, signed short *); + STVEHX STVEHX_BS + void __builtin_vec_ste (vbs, signed long long, unsigned short *); + STVEHX STVEHX_BU + void __builtin_vec_ste (vp, signed long long, signed short *); + STVEHX STVEHX_PS + void __builtin_vec_ste (vp, signed long long, unsigned short *); + STVEHX STVEHX_PU + void __builtin_vec_ste (vsi, signed long long, signed int *); + STVEWX STVEWX_S + void __builtin_vec_ste (vui, signed long long, unsigned int *); + STVEWX STVEWX_U + void __builtin_vec_ste (vbi, signed long long, signed int *); + STVEWX STVEWX_BS + void __builtin_vec_ste (vbi, signed long long, unsigned int *); + STVEWX STVEWX_BU + void __builtin_vec_ste (vf, signed long long, float *); + STVEWX STVEWX_F + +; There are no builtins for VEC_STEP; this is handled directly +; with a constant replacement in rs6000_resolve_overloaded_builtin. +; The single overload registers __builtin_vec_step with the front end +; so this can happen. +[VEC_STEP, vec_step, __builtin_vec_step] + signed int __builtin_vec_step (vsi); + VCLZLSBB_V4SI STEP_FAKERY + +[VEC_STL, vec_stl, __builtin_vec_stl] + void __builtin_vec_stl (vsc, signed long long, vsc *); + STVXL_V16QI STVXL_VSC + void __builtin_vec_stl (vsc, signed long long, signed char *); + STVXL_V16QI STVXL_SC + void __builtin_vec_stl (vuc, signed long long, vuc *); + STVXL_V16QI STVXL_VUC + void __builtin_vec_stl (vuc, signed long long, unsigned char *); + STVXL_V16QI STVXL_UC + void __builtin_vec_stl (vbc, signed long long, vbc *); + STVXL_V16QI STVXL_VBC + void __builtin_vec_stl (vbc, signed long long, signed char *); + STVXL_V16QI STVXL_SC_B + void __builtin_vec_stl (vbc, signed long long, unsigned char *); + STVXL_V16QI STVXL_UC_B + void __builtin_vec_stl (vss, signed long long, vss *); + STVXL_V8HI STVXL_VSS + void __builtin_vec_stl (vss, signed long long, signed short *); + STVXL_V8HI STVXL_SS + void __builtin_vec_stl (vus, signed long long, vus *); + STVXL_V8HI STVXL_VUS + void __builtin_vec_stl (vus, signed long long, unsigned short *); + STVXL_V8HI STVXL_US + void __builtin_vec_stl (vbs, signed long long, vbs *); + STVXL_V8HI STVXL_VBS + void __builtin_vec_stl (vbs, signed long long, signed short *); + STVXL_V8HI STVXL_SS_B + void __builtin_vec_stl (vbs, signed long long, unsigned short *); + STVXL_V8HI STVXL_US_B + void __builtin_vec_stl (vp, signed long long, vp *); + STVXL_V8HI STVXL_P + void __builtin_vec_stl (vsi, signed long long, vsi *); + STVXL_V4SI STVXL_VSI + void __builtin_vec_stl (vsi, signed long long, signed int *); + STVXL_V4SI STVXL_SI + void __builtin_vec_stl (vui, signed long long, vui *); + STVXL_V4SI STVXL_VUI + void __builtin_vec_stl (vui, signed long long, unsigned int *); + STVXL_V4SI STVXL_UI + void __builtin_vec_stl (vbi, signed long long, vbi *); + STVXL_V4SI
STVXL_VBI + void __builtin_vec_stl (vbi, signed long long, signed int *); + STVXL_V4SI STVXL_SI_B + void __builtin_vec_stl (vbi, signed long long, unsigned int *); + STVXL_V4SI STVXL_UI_B + void __builtin_vec_stl (vsll, signed long long, vsll *); + STVXL_V2DI STVXL_VSLL + void __builtin_vec_stl (vsll, signed long long, signed long long *); + STVXL_V2DI STVXL_SLL + void __builtin_vec_stl (vull, signed long long, vull *); + STVXL_V2DI STVXL_VULL + void __builtin_vec_stl (vull, signed long long, unsigned long long *); + STVXL_V2DI STVXL_ULL + void __builtin_vec_stl (vbll, signed long long, vbll *); + STVXL_V2DI STVXL_VBLL + void __builtin_vec_stl (vbll, signed long long, signed long long *); + STVXL_V2DI STVXL_SLL_B + void __builtin_vec_stl (vbll, signed long long, unsigned long long *); + STVXL_V2DI STVXL_ULL_B + void __builtin_vec_stl (vf, signed long long, vf *); + STVXL_V4SF STVXL_VF + void __builtin_vec_stl (vf, signed long long, float *); + STVXL_V4SF STVXL_F + void __builtin_vec_stl (vd, signed long long, vd *); + STVXL_V2DF STVXL_VD + void __builtin_vec_stl (vd, signed long long, double *); + STVXL_V2DF STVXL_D + +[VEC_STRIL, vec_stril, __builtin_vec_stril, _ARCH_PWR10] + vuc __builtin_vec_stril (vuc); + VSTRIBL VSTRIBL_U + vsc __builtin_vec_stril (vsc); + VSTRIBL VSTRIBL_S + vus __builtin_vec_stril (vus); + VSTRIHL VSTRIHL_U + vss __builtin_vec_stril (vss); + VSTRIHL VSTRIHL_S + +[VEC_STRIL_P, vec_stril_p, __builtin_vec_stril_p, _ARCH_PWR10] + signed int __builtin_vec_stril_p (vuc); + VSTRIBL_P VSTRIBL_PU + signed int __builtin_vec_stril_p (vsc); + VSTRIBL_P VSTRIBL_PS + signed int __builtin_vec_stril_p (vus); + VSTRIHL_P VSTRIHL_PU + signed int __builtin_vec_stril_p (vss); + VSTRIHL_P VSTRIHL_PS + +[VEC_STRIR, vec_strir, __builtin_vec_strir, _ARCH_PWR10] + vuc __builtin_vec_strir (vuc); + VSTRIBR VSTRIBR_U + vsc __builtin_vec_strir (vsc); + VSTRIBR VSTRIBR_S + vus __builtin_vec_strir (vus); + VSTRIHR VSTRIHR_U + vss __builtin_vec_strir (vss); + VSTRIHR VSTRIHR_S + +[VEC_STRIR_P, vec_strir_p, __builtin_vec_strir_p, _ARCH_PWR10] + signed int __builtin_vec_strir_p (vuc); + VSTRIBR_P VSTRIBR_PU + signed int __builtin_vec_strir_p (vsc); + VSTRIBR_P VSTRIBR_PS + signed int __builtin_vec_strir_p (vus); + VSTRIHR_P VSTRIHR_PU + signed int __builtin_vec_strir_p (vss); + VSTRIHR_P VSTRIHR_PS + +[VEC_STVLX, vec_stvlx, __builtin_vec_stvlx, __PPU__] + void __builtin_vec_stvlx (vbc, signed long long, vbc *); + STVLX STVLX_VBC + void __builtin_vec_stvlx (vsc, signed long long, vsc *); + STVLX STVLX_VSC + void __builtin_vec_stvlx (vsc, signed long long, signed char *); + STVLX STVLX_SC + void __builtin_vec_stvlx (vuc, signed long long, vuc *); + STVLX STVLX_VUC + void __builtin_vec_stvlx (vuc, signed long long, unsigned char *); + STVLX STVLX_UC + void __builtin_vec_stvlx (vbs, signed long long, vbs *); + STVLX STVLX_VBS + void __builtin_vec_stvlx (vss, signed long long, vss *); + STVLX STVLX_VSS + void __builtin_vec_stvlx (vss, signed long long, signed short *); + STVLX STVLX_SS + void __builtin_vec_stvlx (vus, signed long long, vus *); + STVLX STVLX_VUS + void __builtin_vec_stvlx (vus, signed long long, unsigned short *); + STVLX STVLX_US + void __builtin_vec_stvlx (vp, signed long long, vp *); + STVLX STVLX_VP + void __builtin_vec_stvlx (vbi, signed long long, vbi *); + STVLX STVLX_VBI + void __builtin_vec_stvlx (vsi, signed long long, vsi *); + STVLX STVLX_VSI + void __builtin_vec_stvlx (vsi, signed long long, signed int *); + STVLX STVLX_SI + void __builtin_vec_stvlx (vui, signed long long, 
vui *); + STVLX STVLX_VUI + void __builtin_vec_stvlx (vui, signed long long, unsigned int *); + STVLX STVLX_UI + void __builtin_vec_stvlx (vf, signed long long, vf *); + STVLX STVLX_VF + void __builtin_vec_stvlx (vf, signed long long, float *); + STVLX STVLX_F + +[VEC_STVLXL, vec_stvlxl, __builtin_vec_stvlxl, __PPU__] + void __builtin_vec_stvlxl (vbc, signed long long, vbc *); + STVLXL STVLXL_VBC + void __builtin_vec_stvlxl (vsc, signed long long, vsc *); + STVLXL STVLXL_VSC + void __builtin_vec_stvlxl (vsc, signed long long, signed char *); + STVLXL STVLXL_SC + void __builtin_vec_stvlxl (vuc, signed long long, vuc *); + STVLXL STVLXL_VUC + void __builtin_vec_stvlxl (vuc, signed long long, unsigned char *); + STVLXL STVLXL_UC + void __builtin_vec_stvlxl (vbs, signed long long, vbs *); + STVLXL STVLXL_VBS + void __builtin_vec_stvlxl (vss, signed long long, vss *); + STVLXL STVLXL_VSS + void __builtin_vec_stvlxl (vss, signed long long, signed short *); + STVLXL STVLXL_SS + void __builtin_vec_stvlxl (vus, signed long long, vus *); + STVLXL STVLXL_VUS + void __builtin_vec_stvlxl (vus, signed long long, unsigned short *); + STVLXL STVLXL_US + void __builtin_vec_stvlxl (vp, signed long long, vp *); + STVLXL STVLXL_VP + void __builtin_vec_stvlxl (vbi, signed long long, vbi *); + STVLXL STVLXL_VBI + void __builtin_vec_stvlxl (vsi, signed long long, vsi *); + STVLXL STVLXL_VSI + void __builtin_vec_stvlxl (vsi, signed long long, signed int *); + STVLXL STVLXL_SI + void __builtin_vec_stvlxl (vui, signed long long, vui *); + STVLXL STVLXL_VUI + void __builtin_vec_stvlxl (vui, signed long long, unsigned int *); + STVLXL STVLXL_UI + void __builtin_vec_stvlxl (vf, signed long long, vf *); + STVLXL STVLXL_VF + void __builtin_vec_stvlxl (vf, signed long long, float *); + STVLXL STVLXL_F + +[VEC_STVRX, vec_stvrx, __builtin_vec_stvrx, __PPU__] + void __builtin_vec_stvrx (vbc, signed long long, vbc *); + STVRX STVRX_VBC + void __builtin_vec_stvrx (vsc, signed long long, vsc *); + STVRX STVRX_VSC + void __builtin_vec_stvrx (vsc, signed long long, signed char *); + STVRX STVRX_SC + void __builtin_vec_stvrx (vuc, signed long long, vuc *); + STVRX STVRX_VUC + void __builtin_vec_stvrx (vuc, signed long long, unsigned char *); + STVRX STVRX_UC + void __builtin_vec_stvrx (vbs, signed long long, vbs *); + STVRX STVRX_VBS + void __builtin_vec_stvrx (vss, signed long long, vss *); + STVRX STVRX_VSS + void __builtin_vec_stvrx (vss, signed long long, signed short *); + STVRX STVRX_SS + void __builtin_vec_stvrx (vus, signed long long, vus *); + STVRX STVRX_VUS + void __builtin_vec_stvrx (vus, signed long long, unsigned short *); + STVRX STVRX_US + void __builtin_vec_stvrx (vp, signed long long, vp *); + STVRX STVRX_VP + void __builtin_vec_stvrx (vbi, signed long long, vbi *); + STVRX STVRX_VBI + void __builtin_vec_stvrx (vsi, signed long long, vsi *); + STVRX STVRX_VSI + void __builtin_vec_stvrx (vsi, signed long long, signed int *); + STVRX STVRX_SI + void __builtin_vec_stvrx (vui, signed long long, vui *); + STVRX STVRX_VUI + void __builtin_vec_stvrx (vui, signed long long, unsigned int *); + STVRX STVRX_UI + void __builtin_vec_stvrx (vf, signed long long, vf *); + STVRX STVRX_VF + void __builtin_vec_stvrx (vf, signed long long, float *); + STVRX STVRX_F + +[VEC_STVRXL, vec_stvrxl, __builtin_vec_stvrxl, __PPU__] + void __builtin_vec_stvrxl (vbc, signed long long, vbc *); + STVRXL STVRXL_VBC + void __builtin_vec_stvrxl (vsc, signed long long, vsc *); + STVRXL STVRXL_VSC + void __builtin_vec_stvrxl (vsc, signed long long, 
signed char *); + STVRXL STVRXL_SC + void __builtin_vec_stvrxl (vuc, signed long long, vuc *); + STVRXL STVRXL_VUC + void __builtin_vec_stvrxl (vuc, signed long long, unsigned char *); + STVRXL STVRXL_UC + void __builtin_vec_stvrxl (vbs, signed long long, vbs *); + STVRXL STVRXL_VBS + void __builtin_vec_stvrxl (vss, signed long long, vss *); + STVRXL STVRXL_VSS + void __builtin_vec_stvrxl (vss, signed long long, signed short *); + STVRXL STVRXL_SS + void __builtin_vec_stvrxl (vus, signed long long, vus *); + STVRXL STVRXL_VUS + void __builtin_vec_stvrxl (vus, signed long long, unsigned short *); + STVRXL STVRXL_US + void __builtin_vec_stvrxl (vp, signed long long, vp *); + STVRXL STVRXL_VP + void __builtin_vec_stvrxl (vbi, signed long long, vbi *); + STVRXL STVRXL_VBI + void __builtin_vec_stvrxl (vsi, signed long long, vsi *); + STVRXL STVRXL_VSI + void __builtin_vec_stvrxl (vsi, signed long long, signed int *); + STVRXL STVRXL_SI + void __builtin_vec_stvrxl (vui, signed long long, vui *); + STVRXL STVRXL_VUI + void __builtin_vec_stvrxl (vui, signed long long, unsigned int *); + STVRXL STVRXL_UI + void __builtin_vec_stvrxl (vf, signed long long, vf *); + STVRXL STVRXL_VF + void __builtin_vec_stvrxl (vf, signed long long, float *); + STVRXL STVRXL_F + +[VEC_STXVL, vec_xst_len, __builtin_vec_stxvl, _ARCH_PPC64_PWR9] + void __builtin_vec_stxvl (vsc, signed char *, unsigned int); + STXVL STXVL_VSC + void __builtin_vec_stxvl (vuc, unsigned char *, unsigned int); + STXVL STXVL_VUC + void __builtin_vec_stxvl (vss, signed short *, unsigned int); + STXVL STXVL_VSS + void __builtin_vec_stxvl (vus, unsigned short *, unsigned int); + STXVL STXVL_VUS + void __builtin_vec_stxvl (vsi, signed int *, unsigned int); + STXVL STXVL_VSI + void __builtin_vec_stxvl (vui, unsigned int *, unsigned int); + STXVL STXVL_VUI + void __builtin_vec_stxvl (vsll, signed long long *, unsigned int); + STXVL STXVL_VSLL + void __builtin_vec_stxvl (vull, unsigned long long *, unsigned int); + STXVL STXVL_VULL + void __builtin_vec_stxvl (vsq, signed __int128 *, unsigned int); + STXVL STXVL_VSQ + void __builtin_vec_stxvl (vuq, unsigned __int128 *, unsigned int); + STXVL STXVL_VUQ + void __builtin_vec_stxvl (vf, float *, unsigned int); + STXVL STXVL_VF + void __builtin_vec_stxvl (vd, double *, unsigned int); + STXVL STXVL_VD + +[VEC_SUB, vec_sub, __builtin_vec_sub] + vsc __builtin_vec_sub (vsc, vsc); + VSUBUBM VSUBUBM_VSC + vuc __builtin_vec_sub (vuc, vuc); + VSUBUBM VSUBUBM_VUC + vss __builtin_vec_sub (vss, vss); + VSUBUHM VSUBUHM_VSS + vus __builtin_vec_sub (vus, vus); + VSUBUHM VSUBUHM_VUS + vsi __builtin_vec_sub (vsi, vsi); + VSUBUWM VSUBUWM_VSI + vui __builtin_vec_sub (vui, vui); + VSUBUWM VSUBUWM_VUI + vsll __builtin_vec_sub (vsll, vsll); + VSUBUDM VSUBUDM_VSLL + vull __builtin_vec_sub (vull, vull); + VSUBUDM VSUBUDM_VULL + vsq __builtin_vec_sub (vsq, vsq); + VSUBUQM VSUBUQM_VSQ + vuq __builtin_vec_sub (vuq, vuq); + VSUBUQM VSUBUQM_VUQ + vf __builtin_vec_sub (vf, vf); + VSUBFP + vd __builtin_vec_sub (vd, vd); + XVSUBDP +; The following variants are deprecated. 
+ vsc __builtin_vec_sub (vsc, vbc); + VSUBUBM VSUBUBM_VSC_VBC + vsc __builtin_vec_sub (vbc, vsc); + VSUBUBM VSUBUBM_VBC_VSC + vuc __builtin_vec_sub (vuc, vbc); + VSUBUBM VSUBUBM_VUC_VBC + vuc __builtin_vec_sub (vbc, vuc); + VSUBUBM VSUBUBM_VBC_VUC + vss __builtin_vec_sub (vss, vbs); + VSUBUHM VSUBUHM_VSS_VBS + vss __builtin_vec_sub (vbs, vss); + VSUBUHM VSUBUHM_VBS_VSS + vus __builtin_vec_sub (vus, vbs); + VSUBUHM VSUBUHM_VUS_VBS + vus __builtin_vec_sub (vbs, vus); + VSUBUHM VSUBUHM_VBS_VUS + vsi __builtin_vec_sub (vsi, vbi); + VSUBUWM VSUBUWM_VSI_VBI + vsi __builtin_vec_sub (vbi, vsi); + VSUBUWM VSUBUWM_VBI_VSI + vui __builtin_vec_sub (vui, vbi); + VSUBUWM VSUBUWM_VUI_VBI + vui __builtin_vec_sub (vbi, vui); + VSUBUWM VSUBUWM_VBI_VUI + vsll __builtin_vec_sub (vsll, vbll); + VSUBUDM VSUBUDM_VSLL_VBLL + vsll __builtin_vec_sub (vbll, vsll); + VSUBUDM VSUBUDM_VBLL_VSLL + vull __builtin_vec_sub (vull, vbll); + VSUBUDM VSUBUDM_VULL_VBLL + vull __builtin_vec_sub (vbll, vull); + VSUBUDM VSUBUDM_VBLL_VULL + +[VEC_SUBC, vec_subc, __builtin_vec_subc] + vsi __builtin_vec_subc (vsi, vsi); + VSUBCUW VSUBCUW_VSI + vui __builtin_vec_subc (vui, vui); + VSUBCUW VSUBCUW_VUI + vsq __builtin_vec_subc (vsq, vsq); + VSUBCUQ VSUBCUQ_VSQ + vuq __builtin_vec_subc (vuq, vuq); + VSUBCUQ VSUBCUQ_VUQ + +; TODO: Note that the entry for VEC_SUBE currently gets ignored in +; altivec_resolve_overloaded_builtin. Revisit whether we can remove +; that. We still need to register the legal builtin forms here. +[VEC_SUBE, vec_sube, __builtin_vec_sube] + vsq __builtin_vec_sube (vsq, vsq, vsq); + VSUBEUQM VSUBEUQM_VSQ + vuq __builtin_vec_sube (vuq, vuq, vuq); + VSUBEUQM VSUBEUQM_VUQ + +; TODO: Note that the entry for VEC_SUBEC currently gets ignored in +; altivec_resolve_overloaded_builtin. Revisit whether we can remove +; that. We still need to register the legal builtin forms here. +[VEC_SUBEC, vec_subec, __builtin_vec_subec] + vsq __builtin_vec_subec (vsq, vsq, vsq); + VSUBECUQ VSUBECUQ_VSQ + vuq __builtin_vec_subec (vuq, vuq, vuq); + VSUBECUQ VSUBECUQ_VUQ + +[VEC_SUBS, vec_subs, __builtin_vec_subs] + vuc __builtin_vec_subs (vuc, vuc); + VSUBUBS + vsc __builtin_vec_subs (vsc, vsc); + VSUBSBS + vus __builtin_vec_subs (vus, vus); + VSUBUHS + vss __builtin_vec_subs (vss, vss); + VSUBSHS + vui __builtin_vec_subs (vui, vui); + VSUBUWS + vsi __builtin_vec_subs (vsi, vsi); + VSUBSWS +; The following variants are deprecated. 
+ vuc __builtin_vec_subs (vuc, vbc); + VSUBUBS VSUBUBS_UB + vuc __builtin_vec_subs (vbc, vuc); + VSUBUBS VSUBUBS_BU + vsc __builtin_vec_subs (vsc, vbc); + VSUBSBS VSUBSBS_SB + vsc __builtin_vec_subs (vbc, vsc); + VSUBSBS VSUBSBS_BS + vus __builtin_vec_subs (vus, vbs); + VSUBUHS VSUBUHS_UB + vus __builtin_vec_subs (vbs, vus); + VSUBUHS VSUBUHS_BU + vss __builtin_vec_subs (vss, vbs); + VSUBSHS VSUBSHS_SB + vss __builtin_vec_subs (vbs, vss); + VSUBSHS VSUBSHS_BS + vui __builtin_vec_subs (vui, vbi); + VSUBUWS VSUBUWS_UB + vui __builtin_vec_subs (vbi, vui); + VSUBUWS VSUBUWS_BU + vsi __builtin_vec_subs (vsi, vbi); + VSUBSWS VSUBSWS_SB + vsi __builtin_vec_subs (vbi, vsi); + VSUBSWS VSUBSWS_BS + +[VEC_SUM2S, vec_sum2s, __builtin_vec_sum2s] + vsi __builtin_vec_sum2s (vsi, vsi); + VSUM2SWS + +[VEC_SUM4S, vec_sum4s, __builtin_vec_sum4s] + vui __builtin_vec_sum4s (vuc, vui); + VSUM4UBS + vsi __builtin_vec_sum4s (vsc, vsi); + VSUM4SBS + vsi __builtin_vec_sum4s (vss, vsi); + VSUM4SHS + +[VEC_SUMS, vec_sums, __builtin_vec_sums] + vsi __builtin_vec_sums (vsi, vsi); + VSUMSWS + +[VEC_TERNARYLOGIC, vec_ternarylogic, __builtin_vec_xxeval, _ARCH_PWR10] + vuc __builtin_vec_xxeval (vuc, vuc, vuc, const int); + XXEVAL XXEVAL_VUC + vus __builtin_vec_xxeval (vus, vus, vus, const int); + XXEVAL XXEVAL_VUS + vui __builtin_vec_xxeval (vui, vui, vui, const int); + XXEVAL XXEVAL_VUI + vull __builtin_vec_xxeval (vull, vull, vull, const int); + XXEVAL XXEVAL_VULL + vuq __builtin_vec_xxeval (vuq, vuq, vuq, const int); + XXEVAL XXEVAL_VUQ + +[VEC_TEST_LSBB_ALL_ONES, vec_test_lsbb_all_ones, __builtin_vec_xvtlsbb_all_ones, _ARCH_PWR9] + signed int __builtin_vec_xvtlsbb_all_ones (vuc); + XVTLSBB_ONES + +[VEC_TEST_LSBB_ALL_ZEROS, vec_test_lsbb_all_zeros, __builtin_vec_xvtlsbb_all_zeros, _ARCH_PWR9] + signed int __builtin_vec_xvtlsbb_all_zeros (vuc); + XVTLSBB_ZEROS + +[VEC_TRUNC, vec_trunc, __builtin_vec_trunc] + vf __builtin_vec_trunc (vf); + VRFIZ + vd __builtin_vec_trunc (vd); + XVRDPIZ + +[VEC_TSTSFI_GT, SKIP, __builtin_dfp_dtstsfi_gt] + signed int __builtin_dfp_dtstsfi_gt (const int, _Decimal64); + TSTSFI_GT_DD + signed int __builtin_dfp_dtstsfi_gt (const int, _Decimal128); + TSTSFI_GT_TD + +[VEC_TSTSFI_EQ, SKIP, __builtin_dfp_dtstsfi_eq] + signed int __builtin_dfp_dtstsfi_eq (const int, _Decimal64); + TSTSFI_EQ_DD + signed int __builtin_dfp_dtstsfi_eq (const int, _Decimal128); + TSTSFI_EQ_TD + +[VEC_TSTSFI_LT, SKIP, __builtin_dfp_dtstsfi_lt] + signed int __builtin_dfp_dtstsfi_lt (const int, _Decimal64); + TSTSFI_LT_DD + signed int __builtin_dfp_dtstsfi_lt (const int, _Decimal128); + TSTSFI_LT_TD + +[VEC_TSTSFI_OV, SKIP, __builtin_dfp_dtstsfi_ov] + signed int __builtin_dfp_dtstsfi_ov (const int, _Decimal64); + TSTSFI_OV_DD + signed int __builtin_dfp_dtstsfi_ov (const int, _Decimal128); + TSTSFI_OV_TD + +[VEC_UNPACKH, vec_unpackh, __builtin_vec_unpackh] + vss __builtin_vec_unpackh (vsc); + VUPKHSB VUPKHSB_VSC + vbs __builtin_vec_unpackh (vbc); + VUPKHSB VUPKHSB_VBC + vsi __builtin_vec_unpackh (vss); + VUPKHSH VUPKHSH_VSS + vbi __builtin_vec_unpackh (vbs); + VUPKHSH VUPKHSH_VBS + vui __builtin_vec_unpackh (vp); + VUPKHPX + vsll __builtin_vec_unpackh (vsi); + VUPKHSW VUPKHSW_VSI + vbll __builtin_vec_unpackh (vbi); + VUPKHSW VUPKHSW_VBI + vd __builtin_vec_unpackh (vf); + DOUBLEH_V4SF VUPKHF + +[VEC_UNPACKL, vec_unpackl, __builtin_vec_unpackl] + vss __builtin_vec_unpackl (vsc); + VUPKLSB VUPKLSB_VSC + vbs __builtin_vec_unpackl (vbc); + VUPKLSB VUPKLSB_VBC + vsi __builtin_vec_unpackl (vss); + VUPKLSH VUPKLSH_VSS + vbi 
__builtin_vec_unpackl (vbs); + VUPKLSH VUPKLSH_VBS + vui __builtin_vec_unpackl (vp); + VUPKLPX + vsll __builtin_vec_unpackl (vsi); + VUPKLSW VUPKLSW_VSI + vbll __builtin_vec_unpackl (vbi); + VUPKLSW VUPKLSW_VBI + vd __builtin_vec_unpackl (vf); + DOUBLEL_V4SF VUPKLF + +[VEC_UNSIGNED, vec_unsigned, __builtin_vec_vunsigned] + vui __builtin_vec_vunsigned (vf); + VEC_VUNSIGNED_V4SF + vull __builtin_vec_vunsigned (vd); + VEC_VUNSIGNED_V2DF + +[VEC_UNSIGNED2, vec_unsigned2, __builtin_vec_vunsigned2] + vui __builtin_vec_vunsigned2 (vd, vd); + VEC_VUNSIGNED2_V2DF + +[VEC_UNSIGNEDE, vec_unsignede, __builtin_vec_vunsignede] + vui __builtin_vec_vunsignede (vd); + VEC_VUNSIGNEDE_V2DF + +[VEC_UNSIGNEDO, vec_unsignedo, __builtin_vec_vunsignedo] + vui __builtin_vec_vunsignedo (vd); + VEC_VUNSIGNEDO_V2DF + +[VEC_VEE, vec_extract_exp, __builtin_vec_extract_exp, _ARCH_PWR9] + vui __builtin_vec_extract_exp (vf); + VEESP + vull __builtin_vec_extract_exp (vd); + VEEDP + +[VEC_VES, vec_extract_sig, __builtin_vec_extract_sig, _ARCH_PWR9] + vui __builtin_vec_extract_sig (vf); + VESSP + vull __builtin_vec_extract_sig (vd); + VESDP + +[VEC_VIE, vec_insert_exp, __builtin_vec_insert_exp, _ARCH_PWR9] + vf __builtin_vec_insert_exp (vf, vui); + VIESP VIESP_VF + vf __builtin_vec_insert_exp (vui, vui); + VIESP VIESP_VUI + vd __builtin_vec_insert_exp (vd, vull); + VIEDP VIEDP_VD + vd __builtin_vec_insert_exp (vull, vull); + VIEDP VIEDP_VULL + +; It is truly unfortunate that vec_vprtyb has an incompatible set of +; interfaces with vec_parity_lsbb. So we can't even deprecate this. +[VEC_VPRTYB, vec_vprtyb, __builtin_vec_vprtyb, _ARCH_PWR9] + vsi __builtin_vec_vprtyb (vsi); + VPRTYBW VPRTYB_VSI + vui __builtin_vec_vprtyb (vui); + VPRTYBW VPRTYB_VUI + vsll __builtin_vec_vprtyb (vsll); + VPRTYBD VPRTYB_VSLL + vull __builtin_vec_vprtyb (vull); + VPRTYBD VPRTYB_VULL + vsq __builtin_vec_vprtyb (vsq); + VPRTYBQ VPRTYB_VSQ + vuq __builtin_vec_vprtyb (vuq); + VPRTYBQ VPRTYB_VUQ + signed __int128 __builtin_vec_vprtyb (signed __int128); + VPRTYBQ VPRTYB_SQ + unsigned __int128 __builtin_vec_vprtyb (unsigned __int128); + VPRTYBQ VPRTYB_UQ + +[VEC_VSCEEQ, scalar_cmp_exp_eq, __builtin_vec_scalar_cmp_exp_eq, _ARCH_PWR9] + signed int __builtin_vec_scalar_cmp_exp_eq (double, double); + VSCEDPEQ + signed int __builtin_vec_scalar_cmp_exp_eq (_Float128, _Float128); + VSCEQPEQ + +[VEC_VSCEGT, scalar_cmp_exp_gt, __builtin_vec_scalar_cmp_exp_gt, _ARCH_PWR9] + signed int __builtin_vec_scalar_cmp_exp_gt (double, double); + VSCEDPGT + signed int __builtin_vec_scalar_cmp_exp_gt (_Float128, _Float128); + VSCEQPGT + +[VEC_VSCELT, scalar_cmp_exp_lt, __builtin_vec_scalar_cmp_exp_lt, _ARCH_PWR9] + signed int __builtin_vec_scalar_cmp_exp_lt (double, double); + VSCEDPLT + signed int __builtin_vec_scalar_cmp_exp_lt (_Float128, _Float128); + VSCEQPLT + +[VEC_VSCEUO, scalar_cmp_exp_unordered, __builtin_vec_scalar_cmp_exp_unordered, _ARCH_PWR9] + signed int __builtin_vec_scalar_cmp_exp_unordered (double, double); + VSCEDPUO + signed int __builtin_vec_scalar_cmp_exp_unordered (_Float128, _Float128); + VSCEQPUO + +[VEC_VSEE, scalar_extract_exp, __builtin_vec_scalar_extract_exp, _ARCH_PWR9] + unsigned int __builtin_vec_scalar_extract_exp (double); + VSEEDP + unsigned int __builtin_vec_scalar_extract_exp (_Float128); + VSEEQP + +[VEC_VSES, scalar_extract_sig, __builtin_vec_scalar_extract_sig, _ARCH_PWR9] + unsigned long long __builtin_vec_scalar_extract_sig (double); + VSESDP + unsigned __int128 __builtin_vec_scalar_extract_sig (_Float128); + VSESQP + +[VEC_VSIE, 
scalar_insert_exp, __builtin_vec_scalar_insert_exp, _ARCH_PWR9] + double __builtin_vec_scalar_insert_exp (unsigned long long, unsigned long long); + VSIEDP + double __builtin_vec_scalar_insert_exp (double, unsigned long long); + VSIEDPF + _Float128 __builtin_vec_scalar_insert_exp (unsigned __int128, unsigned long long); + VSIEQP + _Float128 __builtin_vec_scalar_insert_exp (_Float128, unsigned long long); + VSIEQPF + +[VEC_VSTDC, scalar_test_data_class, __builtin_vec_scalar_test_data_class, _ARCH_PWR9] + unsigned int __builtin_vec_scalar_test_data_class (float, const int); + VSTDCSP + unsigned int __builtin_vec_scalar_test_data_class (double, const int); + VSTDCDP + unsigned int __builtin_vec_scalar_test_data_class (_Float128, const int); + VSTDCQP + +[VEC_VSTDCN, scalar_test_neg, __builtin_vec_scalar_test_neg, _ARCH_PWR9] + unsigned int __builtin_vec_scalar_test_neg (float); + VSTDCNSP + unsigned int __builtin_vec_scalar_test_neg (double); + VSTDCNDP + unsigned int __builtin_vec_scalar_test_neg (_Float128); + VSTDCNQP + +[VEC_VTDC, vec_test_data_class, __builtin_vec_test_data_class, _ARCH_PWR9] + vbi __builtin_vec_test_data_class (vf, const int); + VTDCSP + vbll __builtin_vec_test_data_class (vd, const int); + VTDCDP + +[VEC_XL, vec_xl, __builtin_vec_vsx_ld, __VSX__] + vsc __builtin_vec_vsx_ld (signed long long, const vsc *); + LXVW4X_V16QI LXVW4X_VSC + vsc __builtin_vec_vsx_ld (signed long long, const signed char *); + LXVW4X_V16QI LXVW4X_SC + vuc __builtin_vec_vsx_ld (signed long long, const vuc *); + LXVW4X_V16QI LXVW4X_VUC + vuc __builtin_vec_vsx_ld (signed long long, const unsigned char *); + LXVW4X_V16QI LXVW4X_UC + vbc __builtin_vec_vsx_ld (signed long long, const vbc *); + LXVW4X_V16QI LXVW4X_VBC + vss __builtin_vec_vsx_ld (signed long long, const vss *); + LXVW4X_V8HI LXVW4X_VSS + vss __builtin_vec_vsx_ld (signed long long, const signed short *); + LXVW4X_V8HI LXVW4X_SS + vus __builtin_vec_vsx_ld (signed long long, const vus *); + LXVW4X_V8HI LXVW4X_VUS + vus __builtin_vec_vsx_ld (signed long long, const unsigned short *); + LXVW4X_V8HI LXVW4X_US + vbs __builtin_vec_vsx_ld (signed long long, const vbs *); + LXVW4X_V8HI LXVW4X_VBS + vp __builtin_vec_vsx_ld (signed long long, const vp *); + LXVW4X_V8HI LXVW4X_P + vsi __builtin_vec_vsx_ld (signed long long, const vsi *); + LXVW4X_V4SI LXVW4X_VSI + vsi __builtin_vec_vsx_ld (signed long long, const signed int *); + LXVW4X_V4SI LXVW4X_SI + vui __builtin_vec_vsx_ld (signed long long, const vui *); + LXVW4X_V4SI LXVW4X_VUI + vui __builtin_vec_vsx_ld (signed long long, const unsigned int *); + LXVW4X_V4SI LXVW4X_UI + vbi __builtin_vec_vsx_ld (signed long long, const vbi *); + LXVW4X_V4SI LXVW4X_VBI + vsll __builtin_vec_vsx_ld (signed long long, const vsll *); + LXVD2X_V2DI LXVD2X_VSLL + vsll __builtin_vec_vsx_ld (signed long long, const signed long long *); + LXVD2X_V2DI LXVD2X_SLL + vull __builtin_vec_vsx_ld (signed long long, const vull *); + LXVD2X_V2DI LXVD2X_VULL + vull __builtin_vec_vsx_ld (signed long long, const unsigned long long *); + LXVD2X_V2DI LXVD2X_ULL + vbll __builtin_vec_vsx_ld (signed long long, const vbll *); + LXVD2X_V2DI LXVD2X_VBLL + vsq __builtin_vec_vsx_ld (signed long long, const vsq *); + LXVD2X_V1TI LXVD2X_VSQ + vsq __builtin_vec_vsx_ld (signed long long, const signed __int128 *); + LXVD2X_V1TI LXVD2X_SQ + vuq __builtin_vec_vsx_ld (signed long long, const unsigned __int128 *); + LXVD2X_V1TI LXVD2X_UQ + vf __builtin_vec_vsx_ld (signed long long, const vf *); + LXVW4X_V4SF LXVW4X_VF + vf __builtin_vec_vsx_ld 
(signed long long, const float *); + LXVW4X_V4SF LXVW4X_F + vd __builtin_vec_vsx_ld (signed long long, const vd *); + LXVD2X_V2DF LXVD2X_VD + vd __builtin_vec_vsx_ld (signed long long, const double *); + LXVD2X_V2DF LXVD2X_D + +[VEC_XL_BE, vec_xl_be, __builtin_vec_xl_be, __VSX__] + vsc __builtin_vec_xl_be (signed long long, const vsc *); + LD_ELEMREV_V16QI LD_ELEMREV_VSC + vsc __builtin_vec_xl_be (signed long long, const signed char *); + LD_ELEMREV_V16QI LD_ELEMREV_SC + vuc __builtin_vec_xl_be (signed long long, const vuc *); + LD_ELEMREV_V16QI LD_ELEMREV_VUC + vuc __builtin_vec_xl_be (signed long long, const unsigned char *); + LD_ELEMREV_V16QI LD_ELEMREV_UC + vss __builtin_vec_xl_be (signed long long, const vss *); + LD_ELEMREV_V8HI LD_ELEMREV_VSS + vss __builtin_vec_xl_be (signed long long, const signed short *); + LD_ELEMREV_V8HI LD_ELEMREV_SS + vus __builtin_vec_xl_be (signed long long, const vus *); + LD_ELEMREV_V8HI LD_ELEMREV_VUS + vus __builtin_vec_xl_be (signed long long, const unsigned short *); + LD_ELEMREV_V8HI LD_ELEMREV_US + vsi __builtin_vec_xl_be (signed long long, const vsi *); + LD_ELEMREV_V4SI LD_ELEMREV_VSI + vsi __builtin_vec_xl_be (signed long long, const signed int *); + LD_ELEMREV_V4SI LD_ELEMREV_SI + vui __builtin_vec_xl_be (signed long long, const vui *); + LD_ELEMREV_V4SI LD_ELEMREV_VUI + vui __builtin_vec_xl_be (signed long long, const unsigned int *); + LD_ELEMREV_V4SI LD_ELEMREV_UI + vsll __builtin_vec_xl_be (signed long long, const vsll *); + LD_ELEMREV_V2DI LD_ELEMREV_VSLL + vsll __builtin_vec_xl_be (signed long long, const signed long long *); + LD_ELEMREV_V2DI LD_ELEMREV_SLL + vull __builtin_vec_xl_be (signed long long, const vull *); + LD_ELEMREV_V2DI LD_ELEMREV_VULL + vull __builtin_vec_xl_be (signed long long, const unsigned long long *); + LD_ELEMREV_V2DI LD_ELEMREV_ULL + vsq __builtin_vec_xl_be (signed long long, const signed __int128 *); + LD_ELEMREV_V1TI LD_ELEMREV_SQ + vuq __builtin_vec_xl_be (signed long long, const unsigned __int128 *); + LD_ELEMREV_V1TI LD_ELEMREV_UQ + vf __builtin_vec_xl_be (signed long long, const vf *); + LD_ELEMREV_V4SF LD_ELEMREV_VF + vf __builtin_vec_xl_be (signed long long, const float *); + LD_ELEMREV_V4SF LD_ELEMREV_F + vd __builtin_vec_xl_be (signed long long, const vd *); + LD_ELEMREV_V2DF LD_ELEMREV_VD + vd __builtin_vec_xl_be (signed long long, const double *); + LD_ELEMREV_V2DF LD_ELEMREV_DD + +[VEC_XL_LEN_R, vec_xl_len_r, __builtin_vec_xl_len_r, _ARCH_PPC64_PWR9] + vuc __builtin_vsx_xl_len_r (const unsigned char *, unsigned int); + XL_LEN_R + +[VEC_XL_SEXT, vec_xl_sext, __builtin_vec_xl_sext, _ARCH_PWR10] + vsq __builtin_vec_xl_sext (signed long long, const signed char *); + SE_LXVRBX + vsq __builtin_vec_xl_sext (signed long long, const signed short *); + SE_LXVRHX + vsq __builtin_vec_xl_sext (signed long long, const signed int *); + SE_LXVRWX + vsq __builtin_vec_xl_sext (signed long long, const signed long long *); + SE_LXVRDX + +[VEC_XL_ZEXT, vec_xl_zext, __builtin_vec_xl_zext, _ARCH_PWR10] + vuq __builtin_vec_xl_zext (signed long long, const unsigned char *); + ZE_LXVRBX + vuq __builtin_vec_xl_zext (signed long long, const unsigned short *); + ZE_LXVRHX + vuq __builtin_vec_xl_zext (signed long long, const unsigned int *); + ZE_LXVRWX + vuq __builtin_vec_xl_zext (signed long long, const unsigned long long *); + ZE_LXVRDX + +[VEC_XOR, vec_xor, __builtin_vec_xor] + vsc __builtin_vec_xor (vsc, vsc); + VXOR_V16QI + vuc __builtin_vec_xor (vuc, vuc); + VXOR_V16QI_UNS VXOR_VUC + vbc __builtin_vec_xor (vbc, vbc); + 
VXOR_V16QI_UNS VXOR_VBC + vss __builtin_vec_xor (vss, vss); + VXOR_V8HI + vus __builtin_vec_xor (vus, vus); + VXOR_V8HI_UNS VXOR_VUS + vbs __builtin_vec_xor (vbs, vbs); + VXOR_V8HI_UNS VXOR_VBS + vsi __builtin_vec_xor (vsi, vsi); + VXOR_V4SI + vui __builtin_vec_xor (vui, vui); + VXOR_V4SI_UNS VXOR_VUI + vbi __builtin_vec_xor (vbi, vbi); + VXOR_V4SI_UNS VXOR_VBI + vsll __builtin_vec_xor (vsll, vsll); + VXOR_V2DI + vull __builtin_vec_xor (vull, vull); + VXOR_V2DI_UNS VXOR_VULL + vbll __builtin_vec_xor (vbll, vbll); + VXOR_V2DI_UNS VXOR_VBLL + vf __builtin_vec_xor (vf, vf); + VXOR_V4SF + vd __builtin_vec_xor (vd, vd); + VXOR_V2DF +; The following variants are deprecated. + vsc __builtin_vec_xor (vsc, vbc); + VXOR_V16QI VXOR_VSC_VBC + vsc __builtin_vec_xor (vbc, vsc); + VXOR_V16QI VXOR_VBC_VSC + vsc __builtin_vec_xor (vsc, vuc); + VXOR_V16QI VXOR_VSC_VUC + vuc __builtin_vec_xor (vuc, vbc); + VXOR_V16QI_UNS VXOR_VUC_VBC + vuc __builtin_vec_xor (vbc, vuc); + VXOR_V16QI_UNS VXOR_VBC_VUC + vuc __builtin_vec_xor (vuc, vsc); + VXOR_V16QI_UNS VXOR_VUC_VSC + vss __builtin_vec_xor (vss, vbs); + VXOR_V8HI VXOR_VSS_VBS + vss __builtin_vec_xor (vbs, vss); + VXOR_V8HI VXOR_VBS_VSS + vus __builtin_vec_xor (vus, vbs); + VXOR_V8HI_UNS VXOR_VUS_VBS + vus __builtin_vec_xor (vbs, vus); + VXOR_V8HI_UNS VXOR_VBS_VUS + vsi __builtin_vec_xor (vsi, vbi); + VXOR_V4SI VXOR_VSI_VBI + vsi __builtin_vec_xor (vbi, vsi); + VXOR_V4SI VXOR_VBI_VSI + vui __builtin_vec_xor (vui, vbi); + VXOR_V4SI_UNS VXOR_VUI_VBI + vui __builtin_vec_xor (vbi, vui); + VXOR_V4SI_UNS VXOR_VBI_VUI + vsll __builtin_vec_xor (vsll, vbll); + VXOR_V2DI VXOR_VSLL_VBLL + vsll __builtin_vec_xor (vbll, vsll); + VXOR_V2DI VXOR_VBLL_VSLL + vull __builtin_vec_xor (vull, vbll); + VXOR_V2DI_UNS VXOR_VULL_VBLL + vull __builtin_vec_xor (vbll, vull); + VXOR_V2DI_UNS VXOR_VBLL_VULL + vf __builtin_vec_xor (vf, vbi); + VXOR_V4SF VXOR_VF_VBI + vf __builtin_vec_xor (vbi, vf); + VXOR_V4SF VXOR_VBI_VF + vd __builtin_vec_xor (vd, vbll); + VXOR_V2DF VXOR_VD_VBLL + vd __builtin_vec_xor (vbll, vd); + VXOR_V2DF VXOR_VBLL_VD + +[VEC_XST, vec_xst, __builtin_vec_vsx_st, __VSX__] + void __builtin_vec_vsx_st (vsc, signed long long, vsc *); + STXVW4X_V16QI STXVW4X_VSC + void __builtin_vec_vsx_st (vsc, signed long long, signed char *); + STXVW4X_V16QI STXVW4X_SC + void __builtin_vec_vsx_st (vuc, signed long long, vuc *); + STXVW4X_V16QI STXVW4X_VUC + void __builtin_vec_vsx_st (vuc, signed long long, unsigned char *); + STXVW4X_V16QI STXVW4X_UC + void __builtin_vec_vsx_st (vbc, signed long long, vbc *); + STXVW4X_V16QI STXVW4X_VBC + void __builtin_vec_vsx_st (vbc, signed long long, signed char *); + STXVW4X_V16QI STXVW4X_VBC_S + void __builtin_vec_vsx_st (vbc, signed long long, unsigned char *); + STXVW4X_V16QI STXVW4X_VBC_U + void __builtin_vec_vsx_st (vss, signed long long, vss *); + STXVW4X_V8HI STXVW4X_VSS + void __builtin_vec_vsx_st (vss, signed long long, signed short *); + STXVW4X_V8HI STXVW4X_SS + void __builtin_vec_vsx_st (vus, signed long long, vus *); + STXVW4X_V8HI STXVW4X_VUS + void __builtin_vec_vsx_st (vus, signed long long, unsigned short *); + STXVW4X_V8HI STXVW4X_US + void __builtin_vec_vsx_st (vbs, signed long long, vbs *); + STXVW4X_V8HI STXVW4X_VBS + void __builtin_vec_vsx_st (vbs, signed long long, signed short *); + STXVW4X_V8HI STXVW4X_VBS_S + void __builtin_vec_vsx_st (vbs, signed long long, unsigned short *); + STXVW4X_V8HI STXVW4X_VBS_U + void __builtin_vec_vsx_st (vp, signed long long, vp *); + STXVW4X_V8HI STXVW4X_VP + void __builtin_vec_vsx_st (vsi, 
signed long long, vsi *); + STXVW4X_V4SI STXVW4X_VSI + void __builtin_vec_vsx_st (vsi, signed long long, signed int *); + STXVW4X_V4SI STXVW4X_SI + void __builtin_vec_vsx_st (vui, signed long long, vui *); + STXVW4X_V4SI STXVW4X_VUI + void __builtin_vec_vsx_st (vui, signed long long, unsigned int *); + STXVW4X_V4SI STXVW4X_UI + void __builtin_vec_vsx_st (vbi, signed long long, vbi *); + STXVW4X_V4SI STXVW4X_VBI + void __builtin_vec_vsx_st (vbi, signed long long, signed int *); + STXVW4X_V4SI STXVW4X_VBI_S + void __builtin_vec_vsx_st (vbi, signed long long, unsigned int *); + STXVW4X_V4SI STXVW4X_VBI_U + void __builtin_vec_vsx_st (vsll, signed long long, vsll *); + STXVD2X_V2DI STXVD2X_VSLL + void __builtin_vec_vsx_st (vsll, signed long long, signed long long *); + STXVD2X_V2DI STXVD2X_SLL + void __builtin_vec_vsx_st (vull, signed long long, vull *); + STXVD2X_V2DI STXVD2X_VULL + void __builtin_vec_vsx_st (vull, signed long long, unsigned long long *); + STXVD2X_V2DI STXVD2X_ULL + void __builtin_vec_vsx_st (vbll, signed long long, vbll *); + STXVD2X_V2DI STXVD2X_VBLL + void __builtin_vec_vsx_st (vsq, signed long long, signed __int128 *); + STXVD2X_V1TI STXVD2X_SQ + void __builtin_vec_vsx_st (vuq, signed long long, unsigned __int128 *); + STXVD2X_V1TI STXVD2X_UQ + void __builtin_vec_vsx_st (vf, signed long long, vf *); + STXVW4X_V4SF STXVW4X_VF + void __builtin_vec_vsx_st (vf, signed long long, float *); + STXVW4X_V4SF STXVW4X_F + void __builtin_vec_vsx_st (vd, signed long long, vd *); + STXVD2X_V2DF STXVD2X_VD + void __builtin_vec_vsx_st (vd, signed long long, double *); + STXVD2X_V2DF STXVD2X_D + +[VEC_XST_BE, vec_xst_be, __builtin_vec_xst_be, __VSX__] + void __builtin_vec_xst_be (vsc, signed long long, vsc *); + ST_ELEMREV_V16QI ST_ELEMREV_VSC + void __builtin_vec_xst_be (vsc, signed long long, signed char *); + ST_ELEMREV_V16QI ST_ELEMREV_SC_ + void __builtin_vec_xst_be (vuc, signed long long, vuc *); + ST_ELEMREV_V16QI ST_ELEMREV_VUC + void __builtin_vec_xst_be (vuc, signed long long, unsigned char *); + ST_ELEMREV_V16QI ST_ELEMREV_UC + void __builtin_vec_xst_be (vss, signed long long, vss *); + ST_ELEMREV_V8HI ST_ELEMREV_VSS + void __builtin_vec_xst_be (vss, signed long long, signed short *); + ST_ELEMREV_V8HI ST_ELEMREV_SS + void __builtin_vec_xst_be (vus, signed long long, vus *); + ST_ELEMREV_V8HI ST_ELEMREV_VUS + void __builtin_vec_xst_be (vus, signed long long, unsigned short *); + ST_ELEMREV_V8HI ST_ELEMREV_US + void __builtin_vec_xst_be (vsi, signed long long, vsi *); + ST_ELEMREV_V4SI ST_ELEMREV_VSI + void __builtin_vec_xst_be (vsi, signed long long, signed int *); + ST_ELEMREV_V4SI ST_ELEMREV_SI + void __builtin_vec_xst_be (vui, signed long long, vui *); + ST_ELEMREV_V4SI ST_ELEMREV_VUI + void __builtin_vec_xst_be (vui, signed long long, unsigned int *); + ST_ELEMREV_V4SI ST_ELEMREV_UI + void __builtin_vec_xst_be (vsll, signed long long, vsll *); + ST_ELEMREV_V2DI ST_ELEMREV_VSLL + void __builtin_vec_xst_be (vsll, signed long long, signed long long *); + ST_ELEMREV_V2DI ST_ELEMREV_SLL + void __builtin_vec_xst_be (vull, signed long long, vull *); + ST_ELEMREV_V2DI ST_ELEMREV_VULL + void __builtin_vec_xst_be (vull, signed long long, unsigned long long *); + ST_ELEMREV_V2DI ST_ELEMREV_ULL + void __builtin_vec_xst_be (vsq, signed long long, signed __int128 *); + ST_ELEMREV_V1TI ST_ELEMREV_SQ + void __builtin_vec_xst_be (vuq, signed long long, unsigned __int128 *); + ST_ELEMREV_V1TI ST_ELEMREV_UQ + void __builtin_vec_xst_be (vf, signed long long, vf *); + ST_ELEMREV_V4SF 
ST_ELEMREV_VF + void __builtin_vec_xst_be (vf, signed long long, float *); + ST_ELEMREV_V4SF ST_ELEMREV_F + void __builtin_vec_xst_be (vd, signed long long, vd *); + ST_ELEMREV_V2DF ST_ELEMREV_VD + void __builtin_vec_xst_be (vd, signed long long, double *); + ST_ELEMREV_V2DF ST_ELEMREV_D + +[VEC_XST_LEN_R, vec_xst_len_r, __builtin_vec_xst_len_r, _ARCH_PPC64_PWR9] + void __builtin_vsx_xst_len_r (vuc, unsigned char *, unsigned int); + XST_LEN_R + +[VEC_XST_TRUNC, vec_xst_trunc, __builtin_vec_xst_trunc, _ARCH_PWR10] + void __builtin_vec_xst_trunc (vsq, signed long long, signed char *); + TR_STXVRBX TR_STXVRBX_S + void __builtin_vec_xst_trunc (vuq, signed long long, unsigned char *); + TR_STXVRBX TR_STXVRBX_U + void __builtin_vec_xst_trunc (vsq, signed long long, signed short *); + TR_STXVRHX TR_STXVRHX_S + void __builtin_vec_xst_trunc (vuq, signed long long, unsigned short *); + TR_STXVRHX TR_STXVRHX_U + void __builtin_vec_xst_trunc (vsq, signed long long, signed int *); + TR_STXVRWX TR_STXVRWX_S + void __builtin_vec_xst_trunc (vuq, signed long long, unsigned int *); + TR_STXVRWX TR_STXVRWX_U + void __builtin_vec_xst_trunc (vsq, signed long long, signed long long *); + TR_STXVRDX TR_STXVRDX_S + void __builtin_vec_xst_trunc (vuq, signed long long, unsigned long long *); + TR_STXVRDX TR_STXVRDX_U + +[VEC_XXPERMDI, vec_xxpermdi, __builtin_vsx_xxpermdi, __VSX__] + vsc __builtin_vsx_xxpermdi (vsc, vsc, const int); + XXPERMDI_16QI XXPERMDI_VSC + vuc __builtin_vsx_xxpermdi (vuc, vuc, const int); + XXPERMDI_16QI XXPERMDI_VUC + vss __builtin_vsx_xxpermdi (vss, vss, const int); + XXPERMDI_8HI XXPERMDI_VSS + vus __builtin_vsx_xxpermdi (vus, vus, const int); + XXPERMDI_8HI XXPERMDI_VUS + vsi __builtin_vsx_xxpermdi (vsi, vsi, const int); + XXPERMDI_4SI XXPERMDI_VSI + vui __builtin_vsx_xxpermdi (vui, vui, const int); + XXPERMDI_4SI XXPERMDI_VUI + vsll __builtin_vsx_xxpermdi (vsll, vsll, const int); + XXPERMDI_2DI XXPERMDI_VSLL + vull __builtin_vsx_xxpermdi (vull, vull, const int); + XXPERMDI_2DI XXPERMDI_VULL + vf __builtin_vsx_xxpermdi (vf, vf, const int); + XXPERMDI_4SF XXPERMDI_VF + vd __builtin_vsx_xxpermdi (vd, vd, const int); + XXPERMDI_2DF XXPERMDI_VD + +[VEC_XXSLDWI, vec_xxsldwi, __builtin_vsx_xxsldwi, __VSX__] + vsc __builtin_vsx_xxsldwi (vsc, vsc, const int); + XXSLDWI_16QI XXSLDWI_VSC2 + vuc __builtin_vsx_xxsldwi (vuc, vuc, const int); + XXSLDWI_16QI XXSLDWI_VUC2 + vss __builtin_vsx_xxsldwi (vss, vss, const int); + XXSLDWI_8HI XXSLDWI_VSS2 + vus __builtin_vsx_xxsldwi (vus, vus, const int); + XXSLDWI_8HI XXSLDWI_VUS2 + vsi __builtin_vsx_xxsldwi (vsi, vsi, const int); + XXSLDWI_4SI XXSLDWI_VSI2 + vui __builtin_vsx_xxsldwi (vui, vui, const int); + XXSLDWI_4SI XXSLDWI_VUI2 + vsll __builtin_vsx_xxsldwi (vsll, vsll, const int); + XXSLDWI_2DI XXSLDWI_VSLL2 + vull __builtin_vsx_xxsldwi (vull, vull, const int); + XXSLDWI_2DI XXSLDWI_VULL2 + vf __builtin_vsx_xxsldwi (vf, vf, const int); + XXSLDWI_4SF XXSLDWI_VF2 + vd __builtin_vsx_xxsldwi (vd, vd, const int); + XXSLDWI_2DF XXSLDWI_VD2 + + +; ************************************************************************** +; ************************************************************************** +; **** Deprecated overloads that should never have existed at all **** +; ************************************************************************** +; ************************************************************************** + +[VEC_LVEBX, vec_lvebx, __builtin_vec_lvebx] + vsc __builtin_vec_lvebx (signed long, signed char *); + LVEBX LVEBX_DEPR1 + vuc 
__builtin_vec_lvebx (signed long, unsigned char *); + LVEBX LVEBX_DEPR2 + +[VEC_LVEHX, vec_lvehx, __builtin_vec_lvehx] + vss __builtin_vec_lvehx (signed long, signed short *); + LVEHX LVEHX_DEPR1 + vus __builtin_vec_lvehx (signed long, unsigned short *); + LVEHX LVEHX_DEPR2 + +[VEC_LVEWX, vec_lvewx, __builtin_vec_lvewx] + vf __builtin_vec_lvewx (signed long, float *); + LVEWX LVEWX_DEPR1 + vsi __builtin_vec_lvewx (signed long, signed int *); + LVEWX LVEWX_DEPR2 + vui __builtin_vec_lvewx (signed long, unsigned int *); + LVEWX LVEWX_DEPR3 + vsi __builtin_vec_lvewx (signed long, signed long *); + LVEWX LVEWX_DEPR4 + vui __builtin_vec_lvewx (signed long, unsigned long *); + LVEWX LVEWX_DEPR5 + +[VEC_STVEBX, vec_stvebx, __builtin_vec_stvebx] + void __builtin_vec_stvebx (vsc, signed long, signed char *); + STVEBX STVEBX_DEPR1 + void __builtin_vec_stvebx (vuc, signed long, unsigned char *); + STVEBX STVEBX_DEPR2 + void __builtin_vec_stvebx (vbc, signed long, signed char *); + STVEBX STVEBX_DEPR3 + void __builtin_vec_stvebx (vbc, signed long, signed char *); + STVEBX STVEBX_DEPR4 + void __builtin_vec_stvebx (vsc, signed long, void *); + STVEBX STVEBX_DEPR5 + void __builtin_vec_stvebx (vuc, signed long, void *); + STVEBX STVEBX_DEPR6 + +[VEC_STVEHX, vec_stvehx, __builtin_vec_stvehx] + void __builtin_vec_stvehx (vss, signed long, signed short *); + STVEHX STVEHX_DEPR1 + void __builtin_vec_stvehx (vus, signed long, unsigned short *); + STVEHX STVEHX_DEPR2 + void __builtin_vec_stvehx (vbs, signed long, signed short *); + STVEHX STVEHX_DEPR3 + void __builtin_vec_stvehx (vbs, signed long, signed short *); + STVEHX STVEHX_DEPR4 + void __builtin_vec_stvehx (vss, signed long, void *); + STVEHX STVEHX_DEPR5 + void __builtin_vec_stvehx (vus, signed long, void *); + STVEHX STVEHX_DEPR6 + +[VEC_STVEWX, vec_stvewx, __builtin_vec_stvewx] + void __builtin_vec_stvewx (vf, signed long, float *); + STVEWX STVEWX_DEPR1 + void __builtin_vec_stvewx (vsi, signed long, signed int *); + STVEWX STVEWX_DEPR2 + void __builtin_vec_stvewx (vui, signed long, unsigned int *); + STVEWX STVEWX_DEPR3 + void __builtin_vec_stvewx (vbi, signed long, signed int *); + STVEWX STVEWX_DEPR4 + void __builtin_vec_stvewx (vbi, signed long, unsigned int *); + STVEWX STVEWX_DEPR5 + void __builtin_vec_stvewx (vf, signed long, void *); + STVEWX STVEWX_DEPR6 + void __builtin_vec_stvewx (vsi, signed long, void *); + STVEWX STVEWX_DEPR7 + void __builtin_vec_stvewx (vui, signed long, void *); + STVEWX STVEWX_DEPR8 + +[VEC_TSTSFI_EQ_DD, SKIP, __builtin_dfp_dtstsfi_eq_dd, _ARCH_PWR9] + signed int __builtin_dfp_dtstsfi_eq_dd (const int, _Decimal64); + TSTSFI_EQ_DD TSTSFI_EQ_DD_DEPR1 + +[VEC_TSTSFI_EQ_TD, SKIP, __builtin_dfp_dtstsfi_eq_td, _ARCH_PWR9] + signed int __builtin_dfp_dtstsfi_eq_td (const int, _Decimal128); + TSTSFI_EQ_TD TSTSFI_EQ_TD_DEPR1 + +[VEC_TSTSFI_GT_DD, SKIP, __builtin_dfp_dtstsfi_gt_dd, _ARCH_PWR9] + signed int __builtin_dfp_dtstsfi_gt_dd (const int, _Decimal64); + TSTSFI_GT_DD TSTSFI_GT_DD_DEPR1 + +[VEC_TSTSFI_GT_TD, SKIP, __builtin_dfp_dtstsfi_gt_td, _ARCH_PWR9] + signed int __builtin_dfp_dtstsfi_gt_td (const int, _Decimal128); + TSTSFI_GT_TD TSTSFI_GT_TD_DEPR1 + +[VEC_TSTSFI_LT_DD, SKIP, __builtin_dfp_dtstsfi_lt_dd, _ARCH_PWR9] + signed int __builtin_dfp_dtstsfi_lt_dd (const int, _Decimal64); + TSTSFI_LT_DD TSTSFI_LT_DD_DEPR1 + +[VEC_TSTSFI_LT_TD, SKIP, __builtin_dfp_dtstsfi_lt_td, _ARCH_PWR9] + signed int __builtin_dfp_dtstsfi_lt_td (const int, _Decimal128); + TSTSFI_LT_TD TSTSFI_LT_TD_DEPR1 + +[VEC_TSTSFI_OV_DD, SKIP, 
__builtin_dfp_dtstsfi_ov_dd, _ARCH_PWR9] + signed int __builtin_dfp_dtstsfi_ov_dd (const int, _Decimal64); + TSTSFI_OV_DD TSTSFI_OV_DD_DEPR1 + +[VEC_TSTSFI_OV_TD, SKIP, __builtin_dfp_dtstsfi_ov_td, _ARCH_PWR9] + signed int __builtin_dfp_dtstsfi_ov_td (const int, _Decimal128); + TSTSFI_OV_TD TSTSFI_OV_TD_DEPR1 + +[VEC_VADDCUQ, vec_vaddcuq, __builtin_vec_vaddcuq, _ARCH_PWR8] + vsq __builtin_vec_vaddcuq (vsq, vsq); + VADDCUQ VADDCUQ_DEPR1 + vuq __builtin_vec_vaddcuq (vuq, vuq); + VADDCUQ VADDCUQ_DEPR2 + +[VEC_VADDECUQ, vec_vaddecuq, __builtin_vec_vaddecuq, _ARCH_PWR8] + vsq __builtin_vec_vaddecuq (vsq, vsq, vsq); + VADDECUQ VADDECUQ_DEPR1 + vuq __builtin_vec_vaddecuq (vuq, vuq, vuq); + VADDECUQ VADDECUQ_DEPR2 + +[VEC_VADDEUQM, vec_vaddeuqm, __builtin_vec_vaddeuqm, _ARCH_PWR8] + vsq __builtin_vec_vaddeuqm (vsq, vsq, vsq); + VADDEUQM VADDEUQM_DEPR1 + vuq __builtin_vec_vaddeuqm (vuq, vuq, vuq); + VADDEUQM VADDEUQM_DEPR2 + +[VEC_VADDFP, vec_vaddfp, __builtin_vec_vaddfp] + vf __builtin_vec_vaddfp (vf, vf); + VADDFP VADDFP_DEPR1 + +[VEC_VADDSBS, vec_vaddsbs, __builtin_vec_vaddsbs] + vsc __builtin_vec_vaddsbs (vsc, vsc); + VADDSBS VADDSBS_DEPR1 + vsc __builtin_vec_vaddsbs (vbc, vsc); + VADDSBS VADDSBS_DEPR2 + vsc __builtin_vec_vaddsbs (vsc, vbc); + VADDSBS VADDSBS_DEPR3 + +[VEC_VADDSHS, vec_vaddshs, __builtin_vec_vaddshs] + vss __builtin_vec_vaddshs (vss, vss); + VADDSHS VADDSHS_DEPR1 + vss __builtin_vec_vaddshs (vbs, vss); + VADDSHS VADDSHS_DEPR2 + vss __builtin_vec_vaddshs (vss, vbs); + VADDSHS VADDSHS_DEPR3 + +[VEC_VADDSWS, vec_vaddsws, __builtin_vec_vaddsws] + vsi __builtin_vec_vaddsws (vsi, vsi); + VADDSWS VADDSWS_DEPR1 + vsi __builtin_vec_vaddsws (vbi, vsi); + VADDSWS VADDSWS_DEPR2 + vsi __builtin_vec_vaddsws (vsi, vbi); + VADDSWS VADDSWS_DEPR3 + +[VEC_VADDUBM, vec_vaddubm, __builtin_vec_vaddubm] + vsc __builtin_vec_vaddubm (vsc, vsc); + VADDUBM VADDUBM_DEPR1 + vuc __builtin_vec_vaddubm (vsc, vuc); + VADDUBM VADDUBM_DEPR2 + vuc __builtin_vec_vaddubm (vuc, vsc); + VADDUBM VADDUBM_DEPR3 + vuc __builtin_vec_vaddubm (vuc, vuc); + VADDUBM VADDUBM_DEPR4 + vsc __builtin_vec_vaddubm (vbc, vsc); + VADDUBM VADDUBM_DEPR5 + vsc __builtin_vec_vaddubm (vsc, vbc); + VADDUBM VADDUBM_DEPR6 + vuc __builtin_vec_vaddubm (vbc, vuc); + VADDUBM VADDUBM_DEPR7 + vuc __builtin_vec_vaddubm (vuc, vbc); + VADDUBM VADDUBM_DEPR8 + +[VEC_VADDUBS, vec_vaddubs, __builtin_vec_vaddubs] + vuc __builtin_vec_vaddubs (vsc, vuc); + VADDUBS VADDUBS_DEPR1 + vuc __builtin_vec_vaddubs (vuc, vsc); + VADDUBS VADDUBS_DEPR2 + vuc __builtin_vec_vaddubs (vuc, vuc); + VADDUBS VADDUBS_DEPR3 + vuc __builtin_vec_vaddubs (vbc, vuc); + VADDUBS VADDUBS_DEPR4 + vuc __builtin_vec_vaddubs (vuc, vbc); + VADDUBS VADDUBS_DEPR5 + +[VEC_VADDUDM, vec_vaddudm, __builtin_vec_vaddudm, _ARCH_PWR8] + vsll __builtin_vec_vaddudm (vbll, vsll); + VADDUDM VADDUDM_DEPR1 + vsll __builtin_vec_vaddudm (vsll, vbll); + VADDUDM VADDUDM_DEPR2 + vsll __builtin_vec_vaddudm (vsll, vsll); + VADDUDM VADDUDM_DEPR3 + vull __builtin_vec_vaddudm (vbll, vull); + VADDUDM VADDUDM_DEPR4 + vull __builtin_vec_vaddudm (vull, vbll); + VADDUDM VADDUDM_DEPR5 + vull __builtin_vec_vaddudm (vull, vull); + VADDUDM VADDUDM_DEPR6 + +[VEC_VADDUHM, vec_vadduhm, __builtin_vec_vadduhm] + vss __builtin_vec_vadduhm (vss, vss); + VADDUHM VADDUHM_DEPR1 + vus __builtin_vec_vadduhm (vss, vus); + VADDUHM VADDUHM_DEPR2 + vus __builtin_vec_vadduhm (vus, vss); + VADDUHM VADDUHM_DEPR3 + vus __builtin_vec_vadduhm (vus, vus); + VADDUHM VADDUHM_DEPR4 + vss __builtin_vec_vadduhm (vbs, vss); + VADDUHM VADDUHM_DEPR5 + vss 
__builtin_vec_vadduhm (vss, vbs); + VADDUHM VADDUHM_DEPR6 + vus __builtin_vec_vadduhm (vbs, vus); + VADDUHM VADDUHM_DEPR7 + vus __builtin_vec_vadduhm (vus, vbs); + VADDUHM VADDUHM_DEPR8 + +[VEC_VADDUHS, vec_vadduhs, __builtin_vec_vadduhs] + vus __builtin_vec_vadduhs (vss, vus); + VADDUHS VADDUHS_DEPR1 + vus __builtin_vec_vadduhs (vus, vss); + VADDUHS VADDUHS_DEPR2 + vus __builtin_vec_vadduhs (vus, vus); + VADDUHS VADDUHS_DEPR3 + vus __builtin_vec_vadduhs (vbs, vus); + VADDUHS VADDUHS_DEPR4 + vus __builtin_vec_vadduhs (vus, vbs); + VADDUHS VADDUHS_DEPR5 + +[VEC_VADDUQM, vec_vadduqm, __builtin_vec_vadduqm, _ARCH_PWR8] + vsq __builtin_vec_vadduqm (vsq, vsq); + VADDUQM VADDUQM_DEPR1 + vuq __builtin_vec_vadduqm (vuq, vuq); + VADDUQM VADDUQM_DEPR2 + +[VEC_VADDUWM, vec_vadduwm, __builtin_vec_vadduwm] + vsi __builtin_vec_vadduwm (vsi, vsi); + VADDUWM VADDUWM_DEPR1 + vui __builtin_vec_vadduwm (vsi, vui); + VADDUWM VADDUWM_DEPR2 + vui __builtin_vec_vadduwm (vui, vsi); + VADDUWM VADDUWM_DEPR3 + vui __builtin_vec_vadduwm (vui, vui); + VADDUWM VADDUWM_DEPR4 + vsi __builtin_vec_vadduwm (vbi, vsi); + VADDUWM VADDUWM_DEPR5 + vsi __builtin_vec_vadduwm (vsi, vbi); + VADDUWM VADDUWM_DEPR6 + vui __builtin_vec_vadduwm (vbi, vui); + VADDUWM VADDUWM_DEPR7 + vui __builtin_vec_vadduwm (vui, vbi); + VADDUWM VADDUWM_DEPR8 + +[VEC_VADDUWS, vec_vadduws, __builtin_vec_vadduws] + vui __builtin_vec_vadduws (vsi, vui); + VADDUWS VADDUWS_DEPR1 + vui __builtin_vec_vadduws (vui, vsi); + VADDUWS VADDUWS_DEPR2 + vui __builtin_vec_vadduws (vui, vui); + VADDUWS VADDUWS_DEPR3 + vui __builtin_vec_vadduws (vbi, vui); + VADDUWS VADDUWS_DEPR4 + vui __builtin_vec_vadduws (vui, vbi); + VADDUWS VADDUWS_DEPR5 + +[VEC_VADUB, vec_absdb, __builtin_vec_vadub] + vuc __builtin_vec_vadub (vuc, vuc); + VADUB VADUB_DEPR1 + +[VEC_VADUH, vec_absdh, __builtin_vec_vaduh] + vus __builtin_vec_vaduh (vus, vus); + VADUH VADUH_DEPR1 + +[VEC_VADUW, vec_absdw, __builtin_vec_vaduw] + vui __builtin_vec_vaduw (vui, vui); + VADUW VADUW_DEPR1 + +[VEC_VAVGSB, vec_vavgsb, __builtin_vec_vavgsb] + vsc __builtin_vec_vavgsb (vsc, vsc); + VAVGSB VAVGSB_DEPR1 + +[VEC_VAVGSH, vec_vavgsh, __builtin_vec_vavgsh] + vss __builtin_vec_vavgsh (vss, vss); + VAVGSH VAVGSH_DEPR1 + +[VEC_VAVGSW, vec_vavgsw, __builtin_vec_vavgsw] + vsi __builtin_vec_vavgsw (vsi, vsi); + VAVGSW VAVGSW_DEPR1 + +[VEC_VAVGUB, vec_vavgub, __builtin_vec_vavgub] + vuc __builtin_vec_vavgub (vuc, vuc); + VAVGUB VAVGUB_DEPR1 + +[VEC_VAVGUH, vec_vavguh, __builtin_vec_vavguh] + vus __builtin_vec_vavguh (vus, vus); + VAVGUH VAVGUH_DEPR1 + +[VEC_VAVGUW, vec_vavguw, __builtin_vec_vavguw] + vui __builtin_vec_vavguw (vui, vui); + VAVGUW VAVGUW_DEPR1 + +[VEC_VBPERMQ, vec_vbpermq, __builtin_vec_vbpermq, _ARCH_PWR8] + vull __builtin_vec_vbpermq (vull, vuc); + VBPERMQ VBPERMQ_DEPR1 + vsll __builtin_vec_vbpermq (vsc, vsc); + VBPERMQ VBPERMQ_DEPR2 + vull __builtin_vec_vbpermq (vuc, vuc); + VBPERMQ VBPERMQ_DEPR3 + vull __builtin_vec_vbpermq (vuq, vuc); + VBPERMQ VBPERMQ_DEPR4 + +[VEC_VCFSX, vec_vcfsx, __builtin_vec_vcfsx] + vf __builtin_vec_vcfsx (vsi, const int); + VCFSX VCFSX_DEPR1 + +[VEC_VCFUX, vec_vcfux, __builtin_vec_vcfux] + vf __builtin_vec_vcfux (vui, const int); + VCFUX VCFUX_DEPR1 + +[VEC_VCLZB, vec_vclzb, __builtin_vec_vclzb, _ARCH_PWR8] + vsc __builtin_vec_vclzb (vsc); + VCLZB VCLZB_DEPR1 + vuc __builtin_vec_vclzb (vuc); + VCLZB VCLZB_DEPR2 + +[VEC_VCLZD, vec_vclzd, __builtin_vec_vclzd, _ARCH_PWR8] + vsll __builtin_vec_vclzd (vsll); + VCLZD VCLZD_DEPR1 + vull __builtin_vec_vclzd (vull); + VCLZD VCLZD_DEPR2 + 
+[VEC_VCLZH, vec_vclzh, __builtin_vec_vclzh, _ARCH_PWR8] + vss __builtin_vec_vclzh (vss); + VCLZH VCLZH_DEPR1 + vus __builtin_vec_vclzh (vus); + VCLZH VCLZH_DEPR2 + +[VEC_VCLZW, vec_vclzw, __builtin_vec_vclzw, _ARCH_PWR8] + vsi __builtin_vec_vclzw (vsi); + VCLZW VCLZW_DEPR1 + vui __builtin_vec_vclzw (vui); + VCLZW VCLZW_DEPR2 + +[VEC_VCMPEQFP, vec_vcmpeqfp, __builtin_vec_vcmpeqfp] + vbi __builtin_vec_vcmpeqfp (vf, vf); + VCMPEQFP VCMPEQFP_DEPR1 + +[VEC_VCMPEQUB, vec_vcmpequb, __builtin_vec_vcmpequb] + vbc __builtin_vec_vcmpequb (vsc, vsc); + VCMPEQUB VCMPEQUB_DEPR1 + vbc __builtin_vec_vcmpequb (vuc, vuc); + VCMPEQUB VCMPEQUB_DEPR2 + +[VEC_VCMPEQUH, vec_vcmpequh, __builtin_vec_vcmpequh] + vbs __builtin_vec_vcmpequh (vss, vss); + VCMPEQUH VCMPEQUH_DEPR1 + vbs __builtin_vec_vcmpequh (vus, vus); + VCMPEQUH VCMPEQUH_DEPR2 + +[VEC_VCMPEQUW, vec_vcmpequw, __builtin_vec_vcmpequw] + vbi __builtin_vec_vcmpequw (vsi, vsi); + VCMPEQUW VCMPEQUW_DEPR1 + vbi __builtin_vec_vcmpequw (vui, vui); + VCMPEQUW VCMPEQUW_DEPR2 + +[VEC_VCMPGTFP, vec_vcmpgtfp, __builtin_vec_vcmpgtfp] + vbi __builtin_vec_vcmpgtfp (vf, vf); + VCMPGTFP VCMPGTFP_DEPR1 + +[VEC_VCMPGTSB, vec_vcmpgtsb, __builtin_vec_vcmpgtsb] + vbc __builtin_vec_vcmpgtsb (vsc, vsc); + VCMPGTSB VCMPGTSB_DEPR1 + +[VEC_VCMPGTSH, vec_vcmpgtsh, __builtin_vec_vcmpgtsh] + vbs __builtin_vec_vcmpgtsh (vss, vss); + VCMPGTSH VCMPGTSH_DEPR1 + +[VEC_VCMPGTSW, vec_vcmpgtsw, __builtin_vec_vcmpgtsw] + vbi __builtin_vec_vcmpgtsw (vsi, vsi); + VCMPGTSW VCMPGTSW_DEPR1 + +[VEC_VCMPGTUB, vec_vcmpgtub, __builtin_vec_vcmpgtub] + vbc __builtin_vec_vcmpgtub (vuc, vuc); + VCMPGTUB VCMPGTUB_DEPR1 + +[VEC_VCMPGTUH, vec_vcmpgtuh, __builtin_vec_vcmpgtuh] + vbs __builtin_vec_vcmpgtuh (vus, vus); + VCMPGTUH VCMPGTUH_DEPR1 + +[VEC_VCMPGTUW, vec_vcmpgtuw, __builtin_vec_vcmpgtuw] + vbi __builtin_vec_vcmpgtuw (vui, vui); + VCMPGTUW VCMPGTUW_DEPR1 + +[VEC_VCTZB, vec_vctzb, __builtin_vec_vctzb, _ARCH_PWR9] + vsc __builtin_vec_vctzb (vsc); + VCTZB VCTZB_DEPR1 + vuc __builtin_vec_vctzb (vuc); + VCTZB VCTZB_DEPR2 + +[VEC_VCTZD, vec_vctzd, __builtin_vec_vctzd, _ARCH_PWR9] + vsll __builtin_vec_vctzd (vsll); + VCTZD VCTZD_DEPR1 + vull __builtin_vec_vctzd (vull); + VCTZD VCTZD_DEPR2 + +[VEC_VCTZH, vec_vctzh, __builtin_vec_vctzh, _ARCH_PWR9] + vss __builtin_vec_vctzh (vss); + VCTZH VCTZH_DEPR1 + vus __builtin_vec_vctzh (vus); + VCTZH VCTZH_DEPR2 + +[VEC_VCTZW, vec_vctzw, __builtin_vec_vctzw, _ARCH_PWR9] + vsi __builtin_vec_vctzw (vsi); + VCTZW VCTZW_DEPR1 + vui __builtin_vec_vctzw (vui); + VCTZW VCTZW_DEPR2 + +[VEC_VEEDP, vec_extract_exp_dp, __builtin_vec_extract_exp_dp, _ARCH_PWR9] + vull __builtin_vec_extract_exp_dp (vd); + VEEDP VEEDP_DEPR1 + +[VEC_VEESP, vec_extract_exp_sp, __builtin_vec_extract_exp_sp, _ARCH_PWR9] + vui __builtin_vec_extract_exp_sp (vf); + VEESP VEESP_DEPR1 + +[VEC_VESDP, vec_extract_sig_dp, __builtin_vec_extract_sig_dp, _ARCH_PWR9] + vull __builtin_vec_extract_sig_dp (vd); + VESDP VESDP_DEPR1 + +[VEC_VESSP, vec_extract_sig_sp, __builtin_vec_extract_sig_sp, _ARCH_PWR9] + vui __builtin_vec_extract_sig_sp (vf); + VESSP VESSP_DEPR1 + +[VEC_VIEDP, vec_insert_exp_dp, __builtin_vec_insert_exp_dp, _ARCH_PWR9] + vd __builtin_vec_insert_exp_dp (vd, vull); + VIEDP VIEDP_DEPR1 + vd __builtin_vec_insert_exp_dp (vull, vull); + VIEDP VIEDP_DEPR2 + +[VEC_VIESP, vec_insert_exp_sp, __builtin_vec_insert_exp_sp, _ARCH_PWR9] + vf __builtin_vec_insert_exp_sp (vf, vui); + VIESP VIESP_DEPR1 + vf __builtin_vec_insert_exp_sp (vui, vui); + VIESP VIESP_DEPR2 + +[VEC_VMAXFP, vec_vmaxfp, 
__builtin_vec_vmaxfp] + vf __builtin_vec_vmaxfp (vf, vf); + VMAXFP VMAXFP_DEPR1 + +[VEC_VMAXSB, vec_vmaxsb, __builtin_vec_vmaxsb] + vsc __builtin_vec_vmaxsb (vsc, vsc); + VMAXSB VMAXSB_DEPR1 + vsc __builtin_vec_vmaxsb (vbc, vsc); + VMAXSB VMAXSB_DEPR2 + vsc __builtin_vec_vmaxsb (vsc, vbc); + VMAXSB VMAXSB_DEPR3 + +[VEC_VMAXSD, vec_vmaxsd, __builtin_vec_vmaxsd] + vsll __builtin_vec_vmaxsd (vsll, vsll); + VMAXSD VMAXSD_DEPR1 + vsll __builtin_vec_vmaxsd (vbll, vsll); + VMAXSD VMAXSD_DEPR2 + vsll __builtin_vec_vmaxsd (vsll, vbll); + VMAXSD VMAXSD_DEPR3 + +[VEC_VMAXSH, vec_vmaxsh, __builtin_vec_vmaxsh] + vss __builtin_vec_vmaxsh (vss, vss); + VMAXSH VMAXSH_DEPR1 + vss __builtin_vec_vmaxsh (vbs, vss); + VMAXSH VMAXSH_DEPR2 + vss __builtin_vec_vmaxsh (vss, vbs); + VMAXSH VMAXSH_DEPR3 + +[VEC_VMAXSW, vec_vmaxsw, __builtin_vec_vmaxsw] + vsi __builtin_vec_vmaxsw (vsi, vsi); + VMAXSW VMAXSW_DEPR1 + vsi __builtin_vec_vmaxsw (vbi, vsi); + VMAXSW VMAXSW_DEPR2 + vsi __builtin_vec_vmaxsw (vsi, vbi); + VMAXSW VMAXSW_DEPR3 + +[VEC_VMAXUB, vec_vmaxub, __builtin_vec_vmaxub] + vuc __builtin_vec_vmaxub (vsc, vuc); + VMAXUB VMAXUB_DEPR1 + vuc __builtin_vec_vmaxub (vuc, vsc); + VMAXUB VMAXUB_DEPR2 + vuc __builtin_vec_vmaxub (vuc, vuc); + VMAXUB VMAXUB_DEPR3 + vuc __builtin_vec_vmaxub (vbc, vuc); + VMAXUB VMAXUB_DEPR4 + vuc __builtin_vec_vmaxub (vuc, vbc); + VMAXUB VMAXUB_DEPR5 + +[VEC_VMAXUD, vec_vmaxud, __builtin_vec_vmaxud] + vull __builtin_vec_vmaxud (vull, vull); + VMAXUD VMAXUD_DEPR1 + vull __builtin_vec_vmaxud (vbll, vull); + VMAXUD VMAXUD_DEPR2 + vull __builtin_vec_vmaxud (vull, vbll); + VMAXUD VMAXUD_DEPR3 + +[VEC_VMAXUH, vec_vmaxuh, __builtin_vec_vmaxuh] + vus __builtin_vec_vmaxuh (vss, vus); + VMAXUH VMAXUH_DEPR1 + vus __builtin_vec_vmaxuh (vus, vss); + VMAXUH VMAXUH_DEPR2 + vus __builtin_vec_vmaxuh (vus, vus); + VMAXUH VMAXUH_DEPR3 + vus __builtin_vec_vmaxuh (vbs, vus); + VMAXUH VMAXUH_DEPR4 + vus __builtin_vec_vmaxuh (vus, vbs); + VMAXUH VMAXUH_DEPR5 + +[VEC_VMAXUW, vec_vmaxuw, __builtin_vec_vmaxuw] + vui __builtin_vec_vmaxuw (vsi, vui); + VMAXUW VMAXUW_DEPR1 + vui __builtin_vec_vmaxuw (vui, vsi); + VMAXUW VMAXUW_DEPR2 + vui __builtin_vec_vmaxuw (vui, vui); + VMAXUW VMAXUW_DEPR3 + vui __builtin_vec_vmaxuw (vbi, vui); + VMAXUW VMAXUW_DEPR4 + vui __builtin_vec_vmaxuw (vui, vbi); + VMAXUW VMAXUW_DEPR5 + +[VEC_VMINFP, vec_vminfp, __builtin_vec_vminfp] + vf __builtin_vec_vminfp (vf, vf); + VMINFP VMINFP_DEPR1 + +[VEC_VMINSB, vec_vminsb, __builtin_vec_vminsb] + vsc __builtin_vec_vminsb (vsc, vsc); + VMINSB VMINSB_DEPR1 + vsc __builtin_vec_vminsb (vbc, vsc); + VMINSB VMINSB_DEPR2 + vsc __builtin_vec_vminsb (vsc, vbc); + VMINSB VMINSB_DEPR3 + +[VEC_VMINSD, vec_vminsd, __builtin_vec_vminsd] + vsll __builtin_vec_vminsd (vsll, vsll); + VMINSD VMINSD_DEPR1 + vsll __builtin_vec_vminsd (vbll, vsll); + VMINSD VMINSD_DEPR2 + vsll __builtin_vec_vminsd (vsll, vbll); + VMINSD VMINSD_DEPR3 + +[VEC_VMINSH, vec_vminsh, __builtin_vec_vminsh] + vss __builtin_vec_vminsh (vss, vss); + VMINSH VMINSH_DEPR1 + vss __builtin_vec_vminsh (vbs, vss); + VMINSH VMINSH_DEPR2 + vss __builtin_vec_vminsh (vss, vbs); + VMINSH VMINSH_DEPR3 + +[VEC_VMINSW, vec_vminsw, __builtin_vec_vminsw] + vsi __builtin_vec_vminsw (vsi, vsi); + VMINSW VMINSW_DEPR1 + vsi __builtin_vec_vminsw (vbi, vsi); + VMINSW VMINSW_DEPR2 + vsi __builtin_vec_vminsw (vsi, vbi); + VMINSW VMINSW_DEPR3 + +[VEC_VMINUB, vec_vminub, __builtin_vec_vminub] + vuc __builtin_vec_vminub (vsc, vuc); + VMINUB VMINUB_DEPR1 + vuc __builtin_vec_vminub (vuc, vsc); + VMINUB VMINUB_DEPR2 + vuc 
__builtin_vec_vminub (vuc, vuc); + VMINUB VMINUB_DEPR3 + vuc __builtin_vec_vminub (vbc, vuc); + VMINUB VMINUB_DEPR4 + vuc __builtin_vec_vminub (vuc, vbc); + VMINUB VMINUB_DEPR5 + +[VEC_VMINUD, vec_vminud, __builtin_vec_vminud] + vull __builtin_vec_vminud (vull, vull); + VMINUD VMINUD_DEPR1 + vull __builtin_vec_vminud (vbll, vull); + VMINUD VMINUD_DEPR2 + vull __builtin_vec_vminud (vull, vbll); + VMINUD VMINUD_DEPR3 + +[VEC_VMINUH, vec_vminuh, __builtin_vec_vminuh] + vus __builtin_vec_vminuh (vss, vus); + VMINUH VMINUH_DEPR1 + vus __builtin_vec_vminuh (vus, vss); + VMINUH VMINUH_DEPR2 + vus __builtin_vec_vminuh (vus, vus); + VMINUH VMINUH_DEPR3 + vus __builtin_vec_vminuh (vbs, vus); + VMINUH VMINUH_DEPR4 + vus __builtin_vec_vminuh (vus, vbs); + VMINUH VMINUH_DEPR5 + +[VEC_VMINUW, vec_vminuw, __builtin_vec_vminuw] + vui __builtin_vec_vminuw (vsi, vui); + VMINUW VMINUW_DEPR1 + vui __builtin_vec_vminuw (vui, vsi); + VMINUW VMINUW_DEPR2 + vui __builtin_vec_vminuw (vui, vui); + VMINUW VMINUW_DEPR3 + vui __builtin_vec_vminuw (vbi, vui); + VMINUW VMINUW_DEPR4 + vui __builtin_vec_vminuw (vui, vbi); + VMINUW VMINUW_DEPR5 + +[VEC_VMRGHB, vec_vmrghb, __builtin_vec_vmrghb] + vsc __builtin_vec_vmrghb (vsc, vsc); + VMRGHB VMRGHB_DEPR1 + vuc __builtin_vec_vmrghb (vuc, vuc); + VMRGHB VMRGHB_DEPR2 + vbc __builtin_vec_vmrghb (vbc, vbc); + VMRGHB VMRGHB_DEPR3 + +[VEC_VMRGHH, vec_vmrghh, __builtin_vec_vmrghh] + vss __builtin_vec_vmrghh (vss, vss); + VMRGHH VMRGHH_DEPR1 + vus __builtin_vec_vmrghh (vus, vus); + VMRGHH VMRGHH_DEPR2 + vbs __builtin_vec_vmrghh (vbs, vbs); + VMRGHH VMRGHH_DEPR3 + vp __builtin_vec_vmrghh (vp, vp); + VMRGHH VMRGHH_DEPR4 + +[VEC_VMRGHW, vec_vmrghw, __builtin_vec_vmrghw] + vf __builtin_vec_vmrghw (vf, vf); + VMRGHW VMRGHW_DEPR1 + vsi __builtin_vec_vmrghw (vsi, vsi); + VMRGHW VMRGHW_DEPR2 + vui __builtin_vec_vmrghw (vui, vui); + VMRGHW VMRGHW_DEPR3 + vbi __builtin_vec_vmrghw (vbi, vbi); + VMRGHW VMRGHW_DEPR4 + +[VEC_VMRGLB, vec_vmrglb, __builtin_vec_vmrglb] + vsc __builtin_vec_vmrglb (vsc, vsc); + VMRGLB VMRGLB_DEPR1 + vuc __builtin_vec_vmrglb (vuc, vuc); + VMRGLB VMRGLB_DEPR2 + vbc __builtin_vec_vmrglb (vbc, vbc); + VMRGLB VMRGLB_DEPR3 + +[VEC_VMRGLH, vec_vmrglh, __builtin_vec_vmrglh] + vss __builtin_vec_vmrglh (vss, vss); + VMRGLH VMRGLH_DEPR1 + vus __builtin_vec_vmrglh (vus, vus); + VMRGLH VMRGLH_DEPR2 + vbs __builtin_vec_vmrglh (vbs, vbs); + VMRGLH VMRGLH_DEPR3 + vp __builtin_vec_vmrglh (vp, vp); + VMRGLH VMRGLH_DEPR4 + +[VEC_VMRGLW, vec_vmrglw, __builtin_vec_vmrglw] + vf __builtin_vec_vmrglw (vf, vf); + VMRGLW VMRGLW_DEPR1 + vsi __builtin_vec_vmrglw (vsi, vsi); + VMRGLW VMRGLW_DEPR2 + vui __builtin_vec_vmrglw (vui, vui); + VMRGLW VMRGLW_DEPR3 + vbi __builtin_vec_vmrglw (vbi, vbi); + VMRGLW VMRGLW_DEPR4 + +[VEC_VMSUMMBM, vec_vmsummbm, __builtin_vec_vmsummbm] + vsi __builtin_vec_vmsummbm (vsc, vuc, vsi); + VMSUMMBM VMSUMMBM_DEPR1 + +[VEC_VMSUMSHM, vec_vmsumshm, __builtin_vec_vmsumshm] + vsi __builtin_vec_vmsumshm (vss, vss, vsi); + VMSUMSHM VMSUMSHM_DEPR1 + +[VEC_VMSUMSHS, vec_vmsumshs, __builtin_vec_vmsumshs] + vsi __builtin_vec_vmsumshs (vss, vss, vsi); + VMSUMSHS VMSUMSHS_DEPR1 + +[VEC_VMSUMUBM, vec_vmsumubm, __builtin_vec_vmsumubm] + vui __builtin_vec_vmsumubm (vuc, vuc, vui); + VMSUMUBM VMSUMUBM_DEPR1 + +[VEC_VMSUMUDM, vec_vmsumudm, __builtin_vec_vmsumudm] + vuq __builtin_vec_vmsumudm (vull, vull, vuq); + VMSUMUDM VMSUMUDM_DEPR1 + +[VEC_VMSUMUHM, vec_vmsumuhm, __builtin_vec_vmsumuhm] + vui __builtin_vec_vmsumuhm (vus, vus, vui); + VMSUMUHM VMSUMUHM_DEPR1 + +[VEC_VMSUMUHS, 
vec_vmsumuhs, __builtin_vec_vmsumuhs] + vui __builtin_vec_vmsumuhs (vus, vus, vui); + VMSUMUHS VMSUMUHS_DEPR1 + +[VEC_VMULESB, vec_vmulesb, __builtin_vec_vmulesb] + vss __builtin_vec_vmulesb (vsc, vsc); + VMULESB VMULESB_DEPR1 + +[VEC_VMULESH, vec_vmulesh, __builtin_vec_vmulesh] + vsi __builtin_vec_vmulesh (vss, vss); + VMULESH VMULESH_DEPR1 + +[VEC_VMULESW, SKIP, __builtin_vec_vmulesw] + vsll __builtin_vec_vmulesw (vsi, vsi); + VMULESW VMULESW_DEPR1 + +[VEC_VMULEUB, vec_vmuleub, __builtin_vec_vmuleub] + vus __builtin_vec_vmuleub (vuc, vuc); + VMULEUB VMULEUB_DEPR1 + +[VEC_VMULEUH, vec_vmuleuh, __builtin_vec_vmuleuh] + vui __builtin_vec_vmuleuh (vus, vus); + VMULEUH VMULEUH_DEPR1 + +[VEC_VMULEUW, SKIP, __builtin_vec_vmuleuw] + vull __builtin_vec_vmuleuw (vui, vui); + VMULEUW VMULEUW_DEPR1 + +[VEC_VMULOSB, vec_vmulosb, __builtin_vec_vmulosb] + vss __builtin_vec_vmulosb (vsc, vsc); + VMULOSB VMULOSB_DEPR1 + +[VEC_VMULOSH, vec_vmulosh, __builtin_vec_vmulosh] + vsi __builtin_vec_vmulosh (vss, vss); + VMULOSH VMULOSH_DEPR1 + +[VEC_VMULOSW, SKIP, __builtin_vec_vmulosw] + vsll __builtin_vec_vmulosw (vsi, vsi); + VMULOSW VMULOSW_DEPR1 + +[VEC_VMULOUB, vec_vmuloub, __builtin_vec_vmuloub] + vus __builtin_vec_vmuloub (vuc, vuc); + VMULOUB VMULOUB_DEPR1 + +[VEC_VMULOUH, vec_vmulouh, __builtin_vec_vmulouh] + vui __builtin_vec_vmulouh (vus, vus); + VMULOUH VMULOUH_DEPR1 + +[VEC_VMULOUW, SKIP, __builtin_vec_vmulouw] + vull __builtin_vec_vmulouw (vui, vui); + VMULOUW VMULOUW_DEPR1 + +[VEC_VPKSDSS, vec_vpksdss, __builtin_vec_vpksdss, _ARCH_PWR8] + vsi __builtin_vec_vpksdss (vsll, vsll); + VPKSDSS VPKSDSS_DEPR1 + +[VEC_VPKSDUS, vec_vpksdus, __builtin_vec_vpksdus, _ARCH_PWR8] + vui __builtin_vec_vpksdus (vsll, vsll); + VPKSDUS VPKSDUS_DEPR1 + +[VEC_VPKSHSS, vec_vpkshss, __builtin_vec_vpkshss] + vsc __builtin_vec_vpkshss (vss, vss); + VPKSHSS VPKSHSS_DEPR1 + +[VEC_VPKSHUS, vec_vpkshus, __builtin_vec_vpkshus] + vuc __builtin_vec_vpkshus (vss, vss); + VPKSHUS VPKSHUS_DEPR1 + +[VEC_VPKSWSS, vec_vpkswss, __builtin_vec_vpkswss] + vss __builtin_vec_vpkswss (vsi, vsi); + VPKSWSS VPKSWSS_DEPR1 + +[VEC_VPKSWUS, vec_vpkswus, __builtin_vec_vpkswus] + vus __builtin_vec_vpkswus (vsi, vsi); + VPKSWUS VPKSWUS_DEPR1 + +[VEC_VPKUDUM, vec_vpkudum, __builtin_vec_vpkudum, _ARCH_PWR8] + vsi __builtin_vec_vpkudum (vsll, vsll); + VPKUDUM VPKUDUM_DEPR1 + vui __builtin_vec_vpkudum (vull, vull); + VPKUDUM VPKUDUM_DEPR2 + vbi __builtin_vec_vpkudum (vbll, vbll); + VPKUDUM VPKUDUM_DEPR3 + +[VEC_VPKUDUS, vec_vpkudus, __builtin_vec_vpkudus, _ARCH_PWR8] + vui __builtin_vec_vpkudus (vull, vull); + VPKUDUS VPKUDUS_DEPR1 + +[VEC_VPKUHUM, vec_vpkuhum, __builtin_vec_vpkuhum] + vsc __builtin_vec_vpkuhum (vss, vss); + VPKUHUM VPKUHUM_DEPR1 + vuc __builtin_vec_vpkuhum (vus, vus); + VPKUHUM VPKUHUM_DEPR2 + vbc __builtin_vec_vpkuhum (vbs, vbs); + VPKUHUM VPKUHUM_DEPR3 + +[VEC_VPKUHUS, vec_vpkuhus, __builtin_vec_vpkuhus] + vuc __builtin_vec_vpkuhus (vus, vus); + VPKUHUS VPKUHUS_DEPR1 + +[VEC_VPKUWUM, vec_vpkuwum, __builtin_vec_vpkuwum] + vss __builtin_vec_vpkuwum (vsi, vsi); + VPKUWUM VPKUWUM_DEPR1 + vus __builtin_vec_vpkuwum (vui, vui); + VPKUWUM VPKUWUM_DEPR2 + vbs __builtin_vec_vpkuwum (vbi, vbi); + VPKUWUM VPKUWUM_DEPR3 + +[VEC_VPKUWUS, vec_vpkuwus, __builtin_vec_vpkuwus] + vus __builtin_vec_vpkuwus (vui, vui); + VPKUWUS VPKUWUS_DEPR1 + +[VEC_VPOPCNT, vec_vpopcnt, __builtin_vec_vpopcnt, _ARCH_PWR8] + vsc __builtin_vec_vpopcnt (vsc); + VPOPCNTB VPOPCNT_DEPR1 + vuc __builtin_vec_vpopcnt (vuc); + VPOPCNTB VPOPCNT_DEPR2 + vss __builtin_vec_vpopcnt 
(vss); + VPOPCNTH VPOPCNT_DEPR3 + vus __builtin_vec_vpopcnt (vus); + VPOPCNTH VPOPCNT_DEPR4 + vsi __builtin_vec_vpopcnt (vsi); + VPOPCNTW VPOPCNT_DEPR5 + vui __builtin_vec_vpopcnt (vui); + VPOPCNTW VPOPCNT_DEPR6 + vsll __builtin_vec_vpopcnt (vsll); + VPOPCNTD VPOPCNT_DEPR7 + vull __builtin_vec_vpopcnt (vull); + VPOPCNTD VPOPCNT_DEPR8 + +[VEC_VPOPCNTB, vec_vpopcntb, __builtin_vec_vpopcntb, _ARCH_PWR8] + vsc __builtin_vec_vpopcntb (vsc); + VPOPCNTB VPOPCNTB_DEPR1 + vuc __builtin_vec_vpopcntb (vuc); + VPOPCNTB VPOPCNTB_DEPR2 + +[VEC_VPOPCNTD, vec_vpopcntd, __builtin_vec_vpopcntd, _ARCH_PWR8] + vsll __builtin_vec_vpopcntd (vsll); + VPOPCNTD VPOPCNTD_DEPR1 + vull __builtin_vec_vpopcntd (vull); + VPOPCNTD VPOPCNTD_DEPR2 + +[VEC_VPOPCNTH, vec_vpopcnth, __builtin_vec_vpopcnth, _ARCH_PWR8] + vss __builtin_vec_vpopcnth (vss); + VPOPCNTH VPOPCNTH_DEPR1 + vus __builtin_vec_vpopcnth (vus); + VPOPCNTH VPOPCNTH_DEPR2 + +[VEC_VPOPCNTW, vec_vpopcntw, __builtin_vec_vpopcntw, _ARCH_PWR8] + vsi __builtin_vec_vpopcntw (vsi); + VPOPCNTW VPOPCNTW_DEPR1 + vui __builtin_vec_vpopcntw (vui); + VPOPCNTW VPOPCNTW_DEPR2 + +[VEC_VPRTYBD, vec_vprtybd, __builtin_vec_vprtybd, _ARCH_PWR9] + vsll __builtin_vec_vprtybd (vsll); + VPRTYBD VPRTYBD_DEPR1 + vull __builtin_vec_vprtybd (vull); + VPRTYBD VPRTYBD_DEPR2 + +[VEC_VPRTYBQ, vec_vprtybq, __builtin_vec_vprtybq, _ARCH_PPC64_PWR9] + vsq __builtin_vec_vprtybq (vsq); + VPRTYBQ VPRTYBQ_DEPR1 + vuq __builtin_vec_vprtybq (vuq); + VPRTYBQ VPRTYBQ_DEPR2 + signed __int128 __builtin_vec_vprtybq (signed __int128); + VPRTYBQ VPRTYBQ_DEPR3 + unsigned __int128 __builtin_vec_vprtybq (unsigned __int128); + VPRTYBQ VPRTYBQ_DEPR4 + +[VEC_VPRTYBW, vec_vprtybw, __builtin_vec_vprtybw, _ARCH_PWR9] + vsi __builtin_vec_vprtybw (vsi); + VPRTYBW VPRTYBW_DEPR1 + vui __builtin_vec_vprtybw (vui); + VPRTYBW VPRTYBW_DEPR2 + +[VEC_VRLB, vec_vrlb, __builtin_vec_vrlb] + vsc __builtin_vec_vrlb (vsc, vuc); + VRLB VRLB_DEPR1 + vuc __builtin_vec_vrlb (vuc, vuc); + VRLB VRLB_DEPR2 + +[VEC_VRLD, SKIP, __builtin_vec_vrld, _ARCH_PWR8] + vsll __builtin_vec_vrld (vsll, vull); + VRLD VRLD_DEPR1 + vull __builtin_vec_vrld (vull, vull); + VRLD VRLD_DEPR2 + +[VEC_VRLH, vec_vrlh, __builtin_vec_vrlh] + vss __builtin_vec_vrlh (vss, vus); + VRLH VRLH_DEPR1 + vus __builtin_vec_vrlh (vus, vus); + VRLH VRLH_DEPR2 + +[VEC_VRLW, vec_vrlw, __builtin_vec_vrlw] + vsi __builtin_vec_vrlw (vsi, vui); + VRLW VRLW_DEPR1 + vui __builtin_vec_vrlw (vui, vui); + VRLW VRLW_DEPR2 + +[VEC_VSLB, vec_vslb, __builtin_vec_vslb] + vsc __builtin_vec_vslb (vsc, vuc); + VSLB VSLB_DEPR1 + vuc __builtin_vec_vslb (vuc, vuc); + VSLB VSLB_DEPR2 + +[VEC_VSLD, SKIP, __builtin_vec_vsld, _ARCH_PWR8] + vsll __builtin_vec_vsld (vsll, vull); + VSLD VSLD_DEPR1 + vull __builtin_vec_vsld (vull, vull); + VSLD VSLD_DEPR2 + +[VEC_VSLH, vec_vslh, __builtin_vec_vslh] + vss __builtin_vec_vslh (vss, vus); + VSLH VSLH_DEPR1 + vus __builtin_vec_vslh (vus, vus); + VSLH VSLH_DEPR2 + +[VEC_VSLW, vec_vslw, __builtin_vec_vslw] + vsi __builtin_vec_vslw (vsi, vui); + VSLW VSLW_DEPR1 + vui __builtin_vec_vslw (vui, vui); + VSLW VSLW_DEPR2 + +[VEC_VSPLTB, vec_vspltb, __builtin_vec_vspltb] + vsc __builtin_vec_vspltb (vsc, const int); + VSPLTB VSPLTB_DEPR1 + vuc __builtin_vec_vspltb (vuc, const int); + VSPLTB VSPLTB_DEPR2 + vbc __builtin_vec_vspltb (vbc, const int); + VSPLTB VSPLTB_DEPR3 + +[VEC_VSPLTH, vec_vsplth, __builtin_vec_vsplth] + vss __builtin_vec_vsplth (vss, const int); + VSPLTH VSPLTH_DEPR1 + vus __builtin_vec_vsplth (vus, const int); + VSPLTH VSPLTH_DEPR2 + vbs 
__builtin_vec_vsplth (vbs, const int); + VSPLTH VSPLTH_DEPR3 + vp __builtin_vec_vsplth (vp, const int); + VSPLTH VSPLTH_DEPR4 + +[VEC_VSPLTW, vec_vspltw, __builtin_vec_vspltw] + vsi __builtin_vec_vspltw (vsi, const int); + VSPLTW VSPLTW_DEPR1 + vui __builtin_vec_vspltw (vui, const int); + VSPLTW VSPLTW_DEPR2 + vbi __builtin_vec_vspltw (vbi, const int); + VSPLTW VSPLTW_DEPR3 + vf __builtin_vec_vspltw (vf, const int); + VSPLTW VSPLTW_DEPR4 + +[VEC_VSRAB, vec_vsrab, __builtin_vec_vsrab] + vsc __builtin_vec_vsrab (vsc, vuc); + VSRAB VSRAB_DEPR1 + vuc __builtin_vec_vsrab (vuc, vuc); + VSRAB VSRAB_DEPR2 + +[VEC_VSRAD, SKIP, __builtin_vec_vsrad, _ARCH_PWR8] + vsll __builtin_vec_vsrad (vsll, vull); + VSRAD VSRAD_DEPR1 + vull __builtin_vec_vsrad (vull, vull); + VSRAD VSRAD_DEPR2 + +[VEC_VSRAH, vec_vsrah, __builtin_vec_vsrah] + vss __builtin_vec_vsrah (vss, vus); + VSRAH VSRAH_DEPR1 + vus __builtin_vec_vsrah (vus, vus); + VSRAH VSRAH_DEPR2 + +[VEC_VSRAW, vec_vsraw, __builtin_vec_vsraw] + vsi __builtin_vec_vsraw (vsi, vui); + VSRAW VSRAW_DEPR1 + vui __builtin_vec_vsraw (vui, vui); + VSRAW VSRAW_DEPR2 + +[VEC_VSRB, vec_vsrb, __builtin_vec_vsrb] + vsc __builtin_vec_vsrb (vsc, vuc); + VSRB VSRB_DEPR1 + vuc __builtin_vec_vsrb (vuc, vuc); + VSRB VSRB_DEPR2 + +[VEC_VSRD, SKIP, __builtin_vec_vsrd, _ARCH_PWR8] + vsll __builtin_vec_vsrd (vsll, vull); + VSRD VSRD_DEPR1 + vull __builtin_vec_vsrd (vull, vull); + VSRD VSRD_DEPR2 + +[VEC_VSRH, vec_vsrh, __builtin_vec_vsrh] + vss __builtin_vec_vsrh (vss, vus); + VSRH VSRH_DEPR1 + vus __builtin_vec_vsrh (vus, vus); + VSRH VSRH_DEPR2 + +[VEC_VSRW, vec_vsrw, __builtin_vec_vsrw] + vsi __builtin_vec_vsrw (vsi, vui); + VSRW VSRW_DEPR1 + vui __builtin_vec_vsrw (vui, vui); + VSRW VSRW_DEPR2 + +[VEC_VSTDCDP, scalar_test_data_class_dp, __builtin_vec_scalar_test_data_class_dp, _ARCH_PWR9] + unsigned int __builtin_vec_scalar_test_data_class_dp (double, const int); + VSTDCDP VSTDCDP_DEPR1 + +[VEC_VSTDCNDP, scalar_test_neg_dp, __builtin_vec_scalar_test_neg_dp, _ARCH_PWR9] + unsigned int __builtin_vec_scalar_test_neg_dp (double); + VSTDCNDP VSTDCNDP_DEPR1 + +[VEC_VSTDCNQP, scalar_test_neg_qp, __builtin_vec_scalar_test_neg_qp, _ARCH_PWR9] + unsigned int __builtin_vec_scalar_test_neg_qp (_Float128); + VSTDCNQP VSTDCNQP_DEPR1 + +[VEC_VSTDCNSP, scalar_test_neg_sp, __builtin_vec_scalar_test_neg_sp, _ARCH_PWR9] + unsigned int __builtin_vec_scalar_test_neg_sp (float); + VSTDCNSP VSTDCNSP_DEPR1 + +[VEC_VSTDCQP, scalar_test_data_class_qp, __builtin_vec_scalar_test_data_class_qp, _ARCH_PWR9] + unsigned int __builtin_vec_scalar_test_data_class_qp (_Float128, const int); + VSTDCQP VSTDCQP_DEPR1 + +[VEC_VSTDCSP, scalar_test_data_class_sp, __builtin_vec_scalar_test_data_class_sp, _ARCH_PWR9] + unsigned int __builtin_vec_scalar_test_data_class_sp (float, const int); + VSTDCSP VSTDCSP_DEPR1 + +[VEC_VSUBCUQ, vec_vsubcuqP, __builtin_vec_vsubcuq] + vsq __builtin_vec_vsubcuq (vsq, vsq); + VSUBCUQ VSUBCUQ_DEPR1 + vuq __builtin_vec_vsubcuq (vuq, vuq); + VSUBCUQ VSUBCUQ_DEPR2 + +[VEC_VSUBECUQ, vec_vsubecuq, __builtin_vec_vsubecuq, ARCH_PWR8] + vsq __builtin_vec_vsubecuq (vsq, vsq, vsq); + VSUBECUQ VSUBECUQ_DEPR1 + vuq __builtin_vec_vsubecuq (vuq, vuq, vuq); + VSUBECUQ VSUBECUQ_DEPR2 + +[VEC_VSUBEUQM, vec_vsubeuqm, __builtin_vec_vsubeuqm, _ARCH_PWR8] + vsq __builtin_vec_vsubeuqm (vsq, vsq, vsq); + VSUBEUQM VSUBEUQM_DEPR1 + vuq __builtin_vec_vsubeuqm (vuq, vuq, vuq); + VSUBEUQM VSUBEUQM_DEPR2 + +[VEC_VSUBFP, vec_vsubfp, __builtin_vec_vsubfp] + vf __builtin_vec_vsubfp (vf, vf); + VSUBFP VSUBFP_DEPR1 + 
+[VEC_VSUBSBS, vec_vsubsbs, __builtin_vec_vsubsbs] + vsc __builtin_vec_vsubsbs (vsc, vsc); + VSUBSBS VSUBSBS_DEPR1 + vsc __builtin_vec_vsubsbs (vbc, vsc); + VSUBSBS VSUBSBS_DEPR2 + vsc __builtin_vec_vsubsbs (vsc, vbc); + VSUBSBS VSUBSBS_DEPR3 + +[VEC_VSUBSHS, vec_vsubshs, __builtin_vec_vsubshs] + vss __builtin_vec_vsubshs (vss, vss); + VSUBSHS VSUBSHS_DEPR1 + vss __builtin_vec_vsubshs (vbs, vss); + VSUBSHS VSUBSHS_DEPR2 + vss __builtin_vec_vsubshs (vss, vbs); + VSUBSHS VSUBSHS_DEPR3 + +[VEC_VSUBSWS, vec_vsubsws, __builtin_vec_vsubsws] + vsi __builtin_vec_vsubsws (vsi, vsi); + VSUBSWS VSUBSWS_DEPR1 + vsi __builtin_vec_vsubsws (vbi, vsi); + VSUBSWS VSUBSWS_DEPR2 + vsi __builtin_vec_vsubsws (vsi, vbi); + VSUBSWS VSUBSWS_DEPR3 + +[VEC_VSUBUBM, vec_vsububm, __builtin_vec_vsububm] + vsc __builtin_vec_vsububm (vsc, vsc); + VSUBUBM VSUBUBM_DEPR1 + vuc __builtin_vec_vsububm (vsc, vuc); + VSUBUBM VSUBUBM_DEPR2 + vuc __builtin_vec_vsububm (vuc, vsc); + VSUBUBM VSUBUBM_DEPR3 + vuc __builtin_vec_vsububm (vuc, vuc); + VSUBUBM VSUBUBM_DEPR4 + vsc __builtin_vec_vsububm (vbc, vsc); + VSUBUBM VSUBUBM_DEPR5 + vsc __builtin_vec_vsububm (vsc, vbc); + VSUBUBM VSUBUBM_DEPR6 + vuc __builtin_vec_vsububm (vbc, vuc); + VSUBUBM VSUBUBM_DEPR7 + vuc __builtin_vec_vsububm (vuc, vbc); + VSUBUBM VSUBUBM_DEPR8 + +[VEC_VSUBUBS, vec_vsububs, __builtin_vec_vsububs] + vsc __builtin_vec_vsububs (vsc, vsc); + VSUBUBS VSUBUBS_DEPR1 + vsc __builtin_vec_vsububs (vbc, vsc); + VSUBUBS VSUBUBS_DEPR2 + vsc __builtin_vec_vsububs (vsc, vbc); + VSUBUBS VSUBUBS_DEPR3 + vuc __builtin_vec_vsububs (vsc, vuc); + VSUBUBS VSUBUBS_DEPR4 + vuc __builtin_vec_vsububs (vuc, vsc); + VSUBUBS VSUBUBS_DEPR5 + vuc __builtin_vec_vsububs (vuc, vuc); + VSUBUBS VSUBUBS_DEPR6 + vuc __builtin_vec_vsububs (vbc, vuc); + VSUBUBS VSUBUBS_DEPR7 + vuc __builtin_vec_vsububs (vuc, vbc); + VSUBUBS VSUBUBS_DEPR8 + +[VEC_VSUBUDM, vec_vsubudm, __builtin_vec_vsubudm, _ARCH_PWR8] + vsll __builtin_vec_vsubudm (vbll, vsll); + VSUBUDM VSUBUDM_DEPR1 + vsll __builtin_vec_vsubudm (vsll, vbll); + VSUBUDM VSUBUDM_DEPR2 + vsll __builtin_vec_vsubudm (vsll, vsll); + VSUBUDM VSUBUDM_DEPR3 + vull __builtin_vec_vsubudm (vbll, vull); + VSUBUDM VSUBUDM_DEPR4 + vull __builtin_vec_vsubudm (vull, vbll); + VSUBUDM VSUBUDM_DEPR5 + vull __builtin_vec_vsubudm (vull, vull); + VSUBUDM VSUBUDM_DEPR6 + +[VEC_VSUBUHM, vec_vsubuhm, __builtin_vec_vsubuhm] + vss __builtin_vec_vsubuhm (vss, vss); + VSUBUHM VUSBUHM_DEPR1 + vus __builtin_vec_vsubuhm (vss, vus); + VSUBUHM VUSBUHM_DEPR2 + vus __builtin_vec_vsubuhm (vus, vss); + VSUBUHM VUSBUHM_DEPR3 + vus __builtin_vec_vsubuhm (vus, vus); + VSUBUHM VUSBUHM_DEPR4 + vss __builtin_vec_vsubuhm (vbs, vss); + VSUBUHM VUSBUHM_DEPR5 + vss __builtin_vec_vsubuhm (vss, vbs); + VSUBUHM VUSBUHM_DEPR6 + vus __builtin_vec_vsubuhm (vbs, vus); + VSUBUHM VUSBUHM_DEPR7 + vus __builtin_vec_vsubuhm (vus, vbs); + VSUBUHM VUSBUHM_DEPR8 + +[VEC_VSUBUHS, vec_vsubuhs, __builtin_vec_vsubuhs] + vus __builtin_vec_vsubuhs (vss, vus); + VSUBUHS VSUBUHS_DEPR1 + vus __builtin_vec_vsubuhs (vus, vss); + VSUBUHS VSUBUHS_DEPR2 + vus __builtin_vec_vsubuhs (vus, vus); + VSUBUHS VSUBUHS_DEPR3 + vus __builtin_vec_vsubuhs (vbs, vus); + VSUBUHS VSUBUHS_DEPR4 + vus __builtin_vec_vsubuhs (vus, vbs); + VSUBUHS VSUBUHS_DEPR5 + +[VEC_VSUBUQM, vec_vsubuqm, __builtin_vec_vsubuqm, _ARCH_PWR8] + vsq __builtin_vec_vsubuqm (vsq, vsq); + VSUBUQM VSUBUQM_DEPR1 + vuq __builtin_vec_vsubuqm (vuq, vuq); + VSUBUQM VSUBUQM_DEPR2 + +[VEC_VSUBUWM, vec_vsubuwm, __builtin_vec_vsubuwm] + vsi __builtin_vec_vsubuwm (vbi, vsi); 
+ VSUBUWM VSUBUWM_DEPR1 + vsi __builtin_vec_vsubuwm (vsi, vbi); + VSUBUWM VSUBUWM_DEPR2 + vui __builtin_vec_vsubuwm (vbi, vui); + VSUBUWM VSUBUWM_DEPR3 + vui __builtin_vec_vsubuwm (vui, vbi); + VSUBUWM VSUBUWM_DEPR4 + vsi __builtin_vec_vsubuwm (vsi, vsi); + VSUBUWM VSUBUWM_DEPR5 + vui __builtin_vec_vsubuwm (vsi, vui); + VSUBUWM VSUBUWM_DEPR6 + vui __builtin_vec_vsubuwm (vui, vsi); + VSUBUWM VSUBUWM_DEPR7 + vui __builtin_vec_vsubuwm (vui, vui); + VSUBUWM VSUBUWM_DEPR8 + +[VEC_VSUBUWS, vec_vsubuws, __builtin_vec_vsubuws] + vui __builtin_vec_vsubuws (vsi, vui); + VSUBUWS VSUBUWS_DEPR1 + vui __builtin_vec_vsubuws (vui, vsi); + VSUBUWS VSUBUWS_DEPR2 + vui __builtin_vec_vsubuws (vui, vui); + VSUBUWS VSUBUWS_DEPR3 + vui __builtin_vec_vsubuws (vbi, vui); + VSUBUWS VSUBUWS_DEPR4 + vui __builtin_vec_vsubuws (vui, vbi); + VSUBUWS VSUBUWS_DEPR5 + +[VEC_VSUM4SBS, vec_vsum4sbs, __builtin_vec_vsum4sbs] + vsi __builtin_vec_vsum4sbs (vsc, vsi); + VSUM4SBS VSUM4SBS_DEPR1 + +[VEC_VSUM4SHS, vec_vsum4shs, __builtin_vec_vsum4shs] + vsi __builtin_vec_vsum4shs (vss, vsi); + VSUM4SHS VSUM4SHS_DEPR1 + +[VEC_VSUM4UBS, vec_vsum4ubs, __builtin_vec_vsum4ubs] + vui __builtin_vec_vsum4ubs (vuc, vui); + VSUM4UBS VSUM4UBS_DEPR1 + +[VEC_VTDCDP, vec_test_data_class_dp, __builtin_vec_test_data_class_dp, _ARCH_PWR9] + vbll __builtin_vec_test_data_class_dp (vd, const int); + VTDCDP VTDCDP_DEPR1 + +[VEC_VTDCSP, vec_test_data_class_sp, __builtin_vec_test_data_class_sp, _ARCH_PWR9] + vbi __builtin_vec_test_data_class_sp (vf, const int); + VTDCSP VTDCSP_DEPR1 + +[VEC_UNS_DOUBLEE, vec_uns_doublee, __builtin_vec_uns_doublee] + vd __builtin_vec_uns_doublee (vui); + UNS_DOUBLEE_V4SI UNS_DOUBLEE_DEPR1 + +[VEC_UNS_DOUBLEH, vec_uns_doubleh, __builtin_vec_uns_doubleh] + vd __builtin_vec_uns_doubleh (vui); + UNS_DOUBLEH_V4SI UNS_DOUBLEH_DEPR1 + +[VEC_UNS_DOUBLEL, vec_uns_doublel, __builtin_vec_uns_doublel] + vd __builtin_vec_uns_doublel (vui); + UNS_DOUBLEL_V4SI UNS_DOUBLEL_DEPR1 + +[VEC_UNS_DOUBLEO, vec_uns_doubleo, __builtin_vec_uns_doubleo] + vd __builtin_vec_uns_doubleo (vui); + UNS_DOUBLEO_V4SI UNS_DOUBLEO_DEPR1 + +[VEC_VUPKHPX, vec_vupkhpx, __builtin_vec_vupkhpx] + vui __builtin_vec_vupkhpx (vus); + VUPKHPX VUPKHPX_DEPR1 + vui __builtin_vec_vupkhpx (vp); + VUPKHPX VUPKHPX_DEPR2 + +[VEC_VUPKHSB, vec_vupkhsb, __builtin_vec_vupkhsb] + vss __builtin_vec_vupkhsb (vsc); + VUPKHSB VUPKHSB_DEPR1 + vbs __builtin_vec_vupkhsb (vbc); + VUPKHSB VUPKHSB_DEPR2 + +[VEC_VUPKHSH, vec_vupkhsh, __builtin_vec_vupkhsh] + vsi __builtin_vec_vupkhsh (vss); + VUPKHSH VUPKHSH_DEPR1 + vbi __builtin_vec_vupkhsh (vbs); + VUPKHSH VUPKHSH_DEPR2 + +[VEC_VUPKHSW, vec_vupkhsw, __builtin_vec_vupkhsw, _ARCH_PWR8] + vsll __builtin_vec_vupkhsw (vsi); + VUPKHSW VUPKHSW_DEPR1 + vbll __builtin_vec_vupkhsw (vbi); + VUPKHSW VUPKHSW_DEPR2 + +[VEC_VUPKLPX, vec_vupklpx, __builtin_vec_vupklpx] + vui __builtin_vec_vupklpx (vus); + VUPKLPX VUPKLPX_DEPR1 + vui __builtin_vec_vupklpx (vp); + VUPKLPX VUPKLPX_DEPR2 + +[VEC_VUPKLSB, vec_vupklsb, __builtin_vec_vupklsb] + vss __builtin_vec_vupklsb (vsc); + VUPKLSB VUPKLSB_DEPR1 + vbs __builtin_vec_vupklsb (vbc); + VUPKLSB VUPKLSB_DEPR2 + +[VEC_VUPKLSH, vec_vupklsh, __builtin_vec_vupklsh] + vsi __builtin_vec_vupklsh (vss); + VUPKLSH VUPKLSH_DEPR1 + vbi __builtin_vec_vupklsh (vbs); + VUPKLSH VUPKLSH_DEPR2 + +[VEC_VUPKLSW, vec_vupklsw, __builtin_vec_vupklsw, _ARCH_PWR8] + vsll __builtin_vec_vupklsw (vsi); + VUPKLSW VUPKLSW_DEPR1 + vbll __builtin_vec_vupklsw (vbi); + VUPKLSW VUPKLSW_DEPR2 -- cgit v1.1 From 
5a6c626710ad2ac4baa2dba02fac0750177e3305 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 26 Aug 2021 22:08:25 +0200 Subject: [i386] Call force_reg unconditionally. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is no point to check RTXes before calling force_reg, force_reg checks for REG RTX by itself. 2021-08-26 Uroš Bizjak gcc/ * config/i386/i386.md (*btr_1): Call force_reg unconditionally. (conditional moves with memory inputs splitters): Ditto. * config/i386/sse.md (one_cmpl2): Simplify. --- gcc/config/i386/i386.md | 15 +++++---------- gcc/config/i386/sse.md | 6 +++--- 2 files changed, 8 insertions(+), 13 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index c80dcb5..528116d 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -12728,8 +12728,7 @@ (clobber (reg:CC FLAGS_REG))])] { operands[0] = lowpart_subreg (SImode, operands[0], mode); - if (MEM_P (operands[1])) - operands[1] = force_reg (mode, operands[1]); + operands[1] = force_reg (mode, operands[1]); operands[1] = lowpart_subreg (SImode, operands[1], mode); }) @@ -19352,10 +19351,8 @@ [(set (match_dup 0) (if_then_else:SWI248 (match_dup 1) (match_dup 2) (match_dup 3)))] { - if (MEM_P (operands[2])) - operands[2] = force_reg (mode, operands[2]); - if (MEM_P (operands[3])) - operands[3] = force_reg (mode, operands[3]); + operands[2] = force_reg (mode, operands[2]); + operands[3] = force_reg (mode, operands[3]); }) (define_insn "*movqicc_noc" @@ -19603,10 +19600,8 @@ [(set (match_dup 0) (if_then_else:MODEF (match_dup 1) (match_dup 2) (match_dup 3)))] { - if (MEM_P (operands[2])) - operands[2] = force_reg (mode, operands[2]); - if (MEM_P (operands[3])) - operands[3] = force_reg (mode, operands[3]); + operands[2] = force_reg (mode, operands[2]); + operands[3] = force_reg (mode, operands[3]); }) ;; Don't do conditional moves with memory inputs diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 03fc2df..0ca3229 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -14318,10 +14318,10 @@ (match_dup 2)))] "TARGET_SSE" { + operands[2] = CONSTM1_RTX (mode); + if (!TARGET_AVX512F) - operands[2] = force_reg (mode, CONSTM1_RTX (mode)); - else - operands[2] = CONSTM1_RTX (mode); + operands[2] = force_reg (mode, operands[2]); }) (define_insn "one_cmpl2" -- cgit v1.1 From 0fa4787bf34b173ce6f198e99b6f6dd8a3f98014 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Fri, 11 Dec 2020 19:02:43 +0800 Subject: Fold more shuffle builtins to VEC_PERM_EXPR. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A follow-up to https://gcc.gnu.org/pipermail/gcc-patches/2019-May/521983.html gcc/ PR target/98167 PR target/43147 * config/i386/i386.c (ix86_gimple_fold_builtin): Fold IX86_BUILTIN_SHUFPD512, IX86_BUILTIN_SHUFPS512, IX86_BUILTIN_SHUFPD256, IX86_BUILTIN_SHUFPS, IX86_BUILTIN_SHUFPS256. (ix86_masked_all_ones): New function. gcc/testsuite/ * gcc.target/i386/avx512f-vshufpd-1.c: Adjust testcase. * gcc.target/i386/avx512f-vshufps-1.c: Adjust testcase. * gcc.target/i386/pr43147.c: New test. 
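The following sketch is illustrative only (the function name and flags are not taken from the patch or its testcases); it shows the kind of source this fold targets. Compiled at -O2 on x86-64, the builtin call behind _mm_shuffle_ps is now replaced by a VEC_PERM_EXPR in gimple, which later passes can simplify further:

#include <xmmintrin.h>

__m128
splat_lane0 (__m128 x)
{
  /* imm8 = 0 selects element 0 for every lane; per the new folding code
     this becomes a VEC_PERM_EXPR with the constant mask {0, 0, 4, 4},
     and since both inputs are x it amounts to a splat of element 0.  */
  return _mm_shuffle_ps (x, x, 0);
}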
--- gcc/config/i386/i386.c | 91 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 69 insertions(+), 22 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index ddbbbce..3bb2cab 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -17559,6 +17559,21 @@ ix86_vector_shift_count (tree arg1) return NULL_TREE; } +/* Return true if arg_mask is all ones, ELEMS is elements number of + corresponding vector. */ +static bool +ix86_masked_all_ones (unsigned HOST_WIDE_INT elems, tree arg_mask) +{ + if (TREE_CODE (arg_mask) != INTEGER_CST) + return false; + + unsigned HOST_WIDE_INT mask = TREE_INT_CST_LOW (arg_mask); + if ((mask | (HOST_WIDE_INT_M1U << elems)) != HOST_WIDE_INT_M1U) + return false; + + return true; +} + static tree ix86_fold_builtin (tree fndecl, int n_args, tree *args, bool ignore ATTRIBUTE_UNUSED) @@ -18044,6 +18059,7 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi) enum tree_code tcode; unsigned HOST_WIDE_INT count; bool is_vshift; + unsigned HOST_WIDE_INT elems; switch (fn_code) { @@ -18367,17 +18383,11 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi) gcc_assert (n_args >= 2); arg0 = gimple_call_arg (stmt, 0); arg1 = gimple_call_arg (stmt, 1); - if (n_args > 2) - { - /* This is masked shift. Only optimize if the mask is all ones. */ - tree argl = gimple_call_arg (stmt, n_args - 1); - if (!tree_fits_uhwi_p (argl)) - break; - unsigned HOST_WIDE_INT mask = tree_to_uhwi (argl); - unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); - if ((mask | (HOST_WIDE_INT_M1U << elems)) != HOST_WIDE_INT_M1U) - break; - } + elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); + /* For masked shift, only optimize if the mask is all ones. */ + if (n_args > 2 + && !ix86_masked_all_ones (elems, gimple_call_arg (stmt, n_args - 1))) + break; if (is_vshift) { if (TREE_CODE (arg1) != VECTOR_CST) @@ -18426,25 +18436,62 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi) } break; + case IX86_BUILTIN_SHUFPD512: + case IX86_BUILTIN_SHUFPS512: case IX86_BUILTIN_SHUFPD: + case IX86_BUILTIN_SHUFPD256: + case IX86_BUILTIN_SHUFPS: + case IX86_BUILTIN_SHUFPS256: + arg0 = gimple_call_arg (stmt, 0); + elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); + /* This is masked shuffle. Only optimize if the mask is all ones. */ + if (n_args > 3 + && !ix86_masked_all_ones (elems, + gimple_call_arg (stmt, n_args - 1))) + break; arg2 = gimple_call_arg (stmt, 2); if (TREE_CODE (arg2) == INTEGER_CST) { + unsigned HOST_WIDE_INT shuffle_mask = TREE_INT_CST_LOW (arg2); + /* Check valid imm, refer to gcc.target/i386/testimm-10.c. */ + if (shuffle_mask > 255) + return false; + + machine_mode imode = GET_MODE_INNER (TYPE_MODE (TREE_TYPE (arg0))); location_t loc = gimple_location (stmt); - unsigned HOST_WIDE_INT imask = TREE_INT_CST_LOW (arg2); - arg0 = gimple_call_arg (stmt, 0); + tree itype = (imode == E_DFmode + ? long_long_integer_type_node : integer_type_node); + tree vtype = build_vector_type (itype, elems); + tree_vector_builder elts (vtype, elems, 1); + + + /* Transform integer shuffle_mask to vector perm_mask which + is used by vec_perm_expr, refer to shuflp[sd]256/512 in sse.md. */ + for (unsigned i = 0; i != elems; i++) + { + unsigned sel_idx; + /* Imm[1:0](if VL > 128, then use Imm[3:2],Imm[5:4],Imm[7:6]) + provide 2 select constrols for each element of the + destination. 
*/ + if (imode == E_DFmode) + sel_idx = (i & 1) * elems + (i & ~1) + + ((shuffle_mask >> i) & 1); + else + { + /* Imm[7:0](if VL > 128, also use Imm[7:0]) provide 4 select + controls for each element of the destination. */ + unsigned j = i % 4; + sel_idx = ((i >> 1) & 1) * elems + (i & ~3) + + ((shuffle_mask >> 2 * j) & 3); + } + elts.quick_push (build_int_cst (itype, sel_idx)); + } + + tree perm_mask = elts.build (); arg1 = gimple_call_arg (stmt, 1); - tree itype = long_long_integer_type_node; - tree vtype = build_vector_type (itype, 2); /* V2DI */ - tree_vector_builder elts (vtype, 2, 1); - /* Ignore bits other than the lowest 2. */ - elts.quick_push (build_int_cst (itype, imask & 1)); - imask >>= 1; - elts.quick_push (build_int_cst (itype, 2 + (imask & 1))); - tree omask = elts.build (); gimple *g = gimple_build_assign (gimple_call_lhs (stmt), VEC_PERM_EXPR, - arg0, arg1, omask); + arg0, arg1, perm_mask); gimple_set_location (g, loc); gsi_replace (gsi, g, false); return true; -- cgit v1.1 From 5faf7120398c9bf290758891a975da1f727d631a Mon Sep 17 00:00:00 2001 From: David Edelsohn Date: Thu, 26 Aug 2021 17:14:18 -0400 Subject: aix: packed struct alignment [PR102068] Further fixes to structure alignment when the structure is packed and contains double. This patch checks for packed attribute at the top level. gcc/ChangeLog: PR target/102068 * config/rs6000/rs6000.c (rs6000_adjust_field_align): Use computed alignment if the entire struct has attribute packed. --- gcc/config/rs6000/rs6000.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index e073b26..05fb6aa 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -7955,7 +7955,7 @@ rs6000_slow_unaligned_access (machine_mode mode, unsigned int align) unsigned int rs6000_special_adjust_field_align (tree type, unsigned int computed) { - if (computed <= 32) + if (computed <= 32 || TYPE_PACKED (type)) return computed; /* Strip initial arrays. */ -- cgit v1.1 From 2e64eec6719e596e7f095c977edcc63812be18a4 Mon Sep 17 00:00:00 2001 From: Kewen Lin Date: Thu, 26 Aug 2021 20:23:17 -0500 Subject: rs6000: Add missing unsigned info for some P10 bifs This patch is to make prototypes of some Power10 built-in functions consistent with what's in the documentation, as well as the vector version. Otherwise, useless conversions can be generated in gimple IR, and the vectorized versions will have inconsistent types. gcc/ChangeLog: * config/rs6000/rs6000-call.c (builtin_function_type): Add unsigned signedness for some Power10 bifs. 
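A minimal sketch of the effect (illustrative only, not from the patch or its testsuite; the function name is made up and a Power10 target such as -mcpu=power10 is assumed):

unsigned long long
centrifuge (unsigned long long src, unsigned long long mask)
{
  /* With the builtin prototyped as signed, the gimple around this call
     carries redundant signed/unsigned conversions; with the corrected
     unsigned prototype the operands feed the call directly, so the
     scalar and vector forms have consistent types.  */
  return __builtin_cfuged (src, mask);
}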
--- gcc/config/rs6000/rs6000-call.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c index fd7f24d..3a07118 100644 --- a/gcc/config/rs6000/rs6000-call.c +++ b/gcc/config/rs6000/rs6000-call.c @@ -14823,6 +14823,11 @@ builtin_function_type (machine_mode mode_ret, machine_mode mode_arg0, case P8V_BUILTIN_ORC_V4SI_UNS: case P8V_BUILTIN_ORC_V2DI_UNS: case P8V_BUILTIN_ORC_V1TI_UNS: + case P10_BUILTIN_CFUGED: + case P10_BUILTIN_CNTLZDM: + case P10_BUILTIN_CNTTZDM: + case P10_BUILTIN_PDEPD: + case P10_BUILTIN_PEXTD: case P10V_BUILTIN_VCFUGED: case P10V_BUILTIN_VCLZDM: case P10V_BUILTIN_VCTZDM: -- cgit v1.1 From 26f5ea5e141cf1e40289dbc73ac21e85ad39fa57 Mon Sep 17 00:00:00 2001 From: Kewen Lin Date: Thu, 26 Aug 2021 20:23:58 -0500 Subject: rs6000: Make some BIFs vectorized on P10 This patch is to add the support to make vectorizer able to vectorize some built-in function scalar versions on Power10. gcc/ChangeLog: * config/rs6000/rs6000.c (rs6000_builtin_md_vectorized_function): Add support for built-in functions MISC_BUILTIN_DIVWE, MISC_BUILTIN_DIVWEU, MISC_BUILTIN_DIVDE, MISC_BUILTIN_DIVDEU, P10_BUILTIN_CFUGED, P10_BUILTIN_CNTLZDM, P10_BUILTIN_CNTTZDM, P10_BUILTIN_PDEPD and P10_BUILTIN_PEXTD on Power10. gcc/testsuite/ChangeLog: * gcc.target/powerpc/dive-vectorize-1.c: New test. * gcc.target/powerpc/dive-vectorize-1.h: New test. * gcc.target/powerpc/dive-vectorize-2.c: New test. * gcc.target/powerpc/dive-vectorize-2.h: New test. * gcc.target/powerpc/dive-vectorize-run-1.c: New test. * gcc.target/powerpc/dive-vectorize-run-2.c: New test. * gcc.target/powerpc/p10-bifs-vectorize-1.c: New test. * gcc.target/powerpc/p10-bifs-vectorize-1.h: New test. * gcc.target/powerpc/p10-bifs-vectorize-run-1.c: New test. --- gcc/config/rs6000/rs6000.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 05fb6aa..d02c1b6 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -5793,6 +5793,59 @@ rs6000_builtin_md_vectorized_function (tree fndecl, tree type_out, default: break; } + + machine_mode in_vmode = TYPE_MODE (type_in); + machine_mode out_vmode = TYPE_MODE (type_out); + + /* Power10 supported vectorized built-in functions. 
*/ + if (TARGET_POWER10 + && in_vmode == out_vmode + && VECTOR_UNIT_ALTIVEC_OR_VSX_P (in_vmode)) + { + machine_mode exp_mode = DImode; + machine_mode exp_vmode = V2DImode; + enum rs6000_builtins bif; + switch (fn) + { + case MISC_BUILTIN_DIVWE: + case MISC_BUILTIN_DIVWEU: + exp_mode = SImode; + exp_vmode = V4SImode; + if (fn == MISC_BUILTIN_DIVWE) + bif = P10V_BUILTIN_DIVES_V4SI; + else + bif = P10V_BUILTIN_DIVEU_V4SI; + break; + case MISC_BUILTIN_DIVDE: + case MISC_BUILTIN_DIVDEU: + if (fn == MISC_BUILTIN_DIVDE) + bif = P10V_BUILTIN_DIVES_V2DI; + else + bif = P10V_BUILTIN_DIVEU_V2DI; + break; + case P10_BUILTIN_CFUGED: + bif = P10V_BUILTIN_VCFUGED; + break; + case P10_BUILTIN_CNTLZDM: + bif = P10V_BUILTIN_VCLZDM; + break; + case P10_BUILTIN_CNTTZDM: + bif = P10V_BUILTIN_VCTZDM; + break; + case P10_BUILTIN_PDEPD: + bif = P10V_BUILTIN_VPDEPD; + break; + case P10_BUILTIN_PEXTD: + bif = P10V_BUILTIN_VPEXTD; + break; + default: + return NULL_TREE; + } + + if (in_mode == exp_mode && in_vmode == exp_vmode) + return rs6000_builtin_decls[bif]; + } + return NULL_TREE; } -- cgit v1.1 From 44a545a6abdd330083c1d12ad70092defbba702a Mon Sep 17 00:00:00 2001 From: konglin1 Date: Mon, 9 Aug 2021 11:37:52 +0800 Subject: i386: Fix wrong optimization for consecutive masked scatters [PR 101472] gcc/ChangeLog: PR target/101472 * config/i386/sse.md: (scattersi): Add mask operand to UNSPEC_VSIBADDR. (scattersi): Likewise. (*avx512f_scattersi): Merge mask operand to set_dest. (*avx512f_scatterdi): Likewise gcc/testsuite/ChangeLog: PR target/101472 * gcc.target/i386/avx512f-pr101472.c: New test. * gcc.target/i386/avx512vl-pr101472.c: New test. --- gcc/config/i386/sse.md | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 0ca3229..ac0c463 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -24205,8 +24205,9 @@ "TARGET_AVX512F" { operands[5] - = gen_rtx_UNSPEC (Pmode, gen_rtvec (3, operands[0], operands[2], - operands[4]), UNSPEC_VSIBADDR); + = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, operands[0], operands[2], + operands[4], operands[1]), + UNSPEC_VSIBADDR); }) (define_insn "*avx512f_scattersi" @@ -24214,10 +24215,11 @@ [(unspec:P [(match_operand:P 0 "vsib_address_operand" "Tv") (match_operand: 2 "register_operand" "v") - (match_operand:SI 4 "const1248_operand" "n")] + (match_operand:SI 4 "const1248_operand" "n") + (match_operand: 6 "register_operand" "1")] UNSPEC_VSIBADDR)]) (unspec:VI48F - [(match_operand: 6 "register_operand" "1") + [(match_dup 6) (match_operand:VI48F 3 "register_operand" "v")] UNSPEC_SCATTER)) (clobber (match_scratch: 1 "=&Yk"))] @@ -24243,8 +24245,9 @@ "TARGET_AVX512F" { operands[5] - = gen_rtx_UNSPEC (Pmode, gen_rtvec (3, operands[0], operands[2], - operands[4]), UNSPEC_VSIBADDR); + = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, operands[0], operands[2], + operands[4], operands[1]), + UNSPEC_VSIBADDR); }) (define_insn "*avx512f_scatterdi" @@ -24252,10 +24255,11 @@ [(unspec:P [(match_operand:P 0 "vsib_address_operand" "Tv") (match_operand: 2 "register_operand" "v") - (match_operand:SI 4 "const1248_operand" "n")] + (match_operand:SI 4 "const1248_operand" "n") + (match_operand:QI 6 "register_operand" "1")] UNSPEC_VSIBADDR)]) (unspec:VI48F - [(match_operand:QI 6 "register_operand" "1") + [(match_dup 6) (match_operand: 3 "register_operand" "v")] UNSPEC_SCATTER)) (clobber (match_scratch:QI 1 "=&Yk"))] -- cgit v1.1 From ff6bb9dde10ab665a35bb75527313cd9f7d52f8e Mon Sep 17 00:00:00 
2001 From: Bill Schmidt Date: Fri, 27 Aug 2021 09:55:29 -0500 Subject: rs6000: Execute the automatic built-in initialization code 2021-08-27 Bill Schmidt gcc/ * config/rs6000/rs6000-call.c (rs6000-builtins.h): New #include. (rs6000_init_builtins): Call rs6000_init_generated_builtins. Skip the old initialization logic when new builtins are enabled. * config/rs6000/rs6000-gen-builtins.c (write_decls): Rename rs6000_autoinit_builtins to rs6000_init_generated_builtins. (write_init_file): Likewise. --- gcc/config/rs6000/rs6000-call.c | 12 ++++++++++++ gcc/config/rs6000/rs6000-gen-builtins.c | 4 ++-- 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c index 3a07118..87f9709 100644 --- a/gcc/config/rs6000/rs6000-call.c +++ b/gcc/config/rs6000/rs6000-call.c @@ -69,6 +69,7 @@ #include "opts.h" #include "rs6000-internal.h" +#include "rs6000-builtins.h" #if TARGET_MACHO #include "gstab.h" /* for N_SLINE */ @@ -13647,6 +13648,17 @@ rs6000_init_builtins (void) = build_pointer_type (build_qualified_type (void_type_node, TYPE_QUAL_CONST)); + /* Execute the autogenerated initialization code for builtins. */ + rs6000_init_generated_builtins (); + + if (new_builtins_are_live) + { +#ifdef SUBTARGET_INIT_BUILTINS + SUBTARGET_INIT_BUILTINS; +#endif + return; + } + /* Create Altivec, VSX and MMA builtins on machines with at least the general purpose extensions (970 and newer) to allow the use of the target attribute. */ diff --git a/gcc/config/rs6000/rs6000-gen-builtins.c b/gcc/config/rs6000/rs6000-gen-builtins.c index 000e5f9..f3d6156 100644 --- a/gcc/config/rs6000/rs6000-gen-builtins.c +++ b/gcc/config/rs6000/rs6000-gen-builtins.c @@ -2312,7 +2312,7 @@ write_decls (void) "extern ovlddata rs6000_instance_info[RS6000_INST_MAX];\n"); fprintf (header_file, "extern ovldrecord rs6000_overload_info[];\n\n"); - fprintf (header_file, "extern void rs6000_autoinit_builtins ();\n\n"); + fprintf (header_file, "extern void rs6000_init_generated_builtins ();\n\n"); fprintf (header_file, "extern bool rs6000_new_builtin_is_supported_p " "(rs6000_gen_builtins);\n"); @@ -2794,7 +2794,7 @@ write_init_file (void) fprintf (init_file, "\n"); fprintf (init_file, "void\n"); - fprintf (init_file, "rs6000_autoinit_builtins ()\n"); + fprintf (init_file, "rs6000_init_generated_builtins ()\n"); fprintf (init_file, "{\n"); fprintf (init_file, " tree t;\n"); rbt_inorder_callback (&fntype_rbt, fntype_rbt.rbt_root, write_fntype_init); -- cgit v1.1 From 66526c6a1c01b6110eaeda94ecc885177c636605 Mon Sep 17 00:00:00 2001 From: Iain Sandoe Date: Sat, 29 Aug 2020 22:05:30 +0100 Subject: Darwin : Mark the mod init/term section starts with a linker-visible sym. Some newer assemblers emit section start temp symbols for mod init and term sections if there is no suitable symbol present already. The temp symbols are linker visible and therefore appear in the symbol tables. Since the temp symbol number can vary when debug is enabled, that causes compare-debug fails. The solution is to provide a stable linker-visible symbol. Signed-off-by: Iain Sandoe gcc/ChangeLog: * config/darwin.c (finalize_ctors): Add a section-start linker- visible symbol. (finalize_dtors): Likewise. * config/darwin.h (MIN_LD64_INIT_TERM_START_LABELS): New. 
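A minimal sketch of the situation being fixed (illustrative only; the function name is made up and the exact section chosen depends on MACHOPIC_INDIRECT):

__attribute__ ((constructor))
static void
run_at_load_time (void)
{
}

Any translation unit with such a static constructor emits an entry into the mod-init (or constructor) section; with a new enough ld64 the patch starts that section with a stable "_Mod.init" / "_CTOR.sect" label instead of relying on an assembler-generated temp symbol, which keeps -g and non -g output comparable.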
--- gcc/config/darwin.c | 37 ++++++++++++++++++++++++++++++++----- gcc/config/darwin.h | 3 +++ 2 files changed, 35 insertions(+), 5 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/darwin.c b/gcc/config/darwin.c index 5d1d13c..667fda7 100644 --- a/gcc/config/darwin.c +++ b/gcc/config/darwin.c @@ -109,6 +109,9 @@ static bool ld_uses_coal_sects = false; each FDE. */ static bool ld_needs_eh_markers = false; +/* Emit a section-start symbol for mod init and term sections. */ +static bool ld_init_term_start_labels = false; + /* Section names. */ section * darwin_sections[NUM_DARWIN_SECTIONS]; @@ -1838,6 +1841,11 @@ finalize_ctors () else switch_to_section (darwin_sections[constructor_section]); + /* Where needed, provide a linker-visible section-start symbol so that we + have stable output between debug and non-debug. */ + if (ld_init_term_start_labels) + fputs (MACHOPIC_INDIRECT ? "_Mod.init:\n" : "_CTOR.sect:\n", asm_out_file); + if (vec_safe_length (ctors) > 1) ctors->qsort (sort_cdtor_records); FOR_EACH_VEC_SAFE_ELT (ctors, i, elt) @@ -1858,6 +1866,11 @@ finalize_dtors () else switch_to_section (darwin_sections[destructor_section]); + /* Where needed, provide a linker-visible section-start symbol so that we + have stable output between debug and non-debug. */ + if (ld_init_term_start_labels) + fputs (MACHOPIC_INDIRECT ? "_Mod.term:\n" : "_DTOR.sect:\n", asm_out_file); + if (vec_safe_length (dtors) > 1) dtors->qsort (sort_cdtor_records); FOR_EACH_VEC_SAFE_ELT (dtors, i, elt) @@ -3228,11 +3241,25 @@ darwin_override_options (void) /* Earlier versions are not specifically accounted, until required. */ } - /* Older Darwin ld could not coalesce weak entities without them being - placed in special sections. */ - if (darwin_target_linker - && (strverscmp (darwin_target_linker, MIN_LD64_NO_COAL_SECTS) < 0)) - ld_uses_coal_sects = true; + /* Some codegen needs to account for the capabilities of the target + linker. */ + if (darwin_target_linker) + { + /* Older Darwin ld could not coalesce weak entities without them being + placed in special sections. */ + if (strverscmp (darwin_target_linker, MIN_LD64_NO_COAL_SECTS) < 0) + ld_uses_coal_sects = true; + + /* Some newer assemblers emit section start temp symbols for mod init + and term sections if there is no suitable symbol present already. + The temp symbols are linker visible and therefore appear in the + symbol tables. Since the temp symbol number can vary when debug is + enabled, that causes compare-debug fails. The solution is to provide + a stable linker-visible symbol. */ + if (strverscmp (darwin_target_linker, + MIN_LD64_INIT_TERM_START_LABELS) >= 0) + ld_init_term_start_labels = true; + } /* In principle, this should be c-family only. However, we really need to set sensible defaults for LTO as well, since the section selection stuff diff --git a/gcc/config/darwin.h b/gcc/config/darwin.h index b1be561..f1d92f8 100644 --- a/gcc/config/darwin.h +++ b/gcc/config/darwin.h @@ -1104,6 +1104,9 @@ extern void darwin_driver_init (unsigned int *,struct cl_decoded_option **); needed, and there is no need for the compiler to emit them. */ #define MIN_LD64_OMIT_STUBS "62.1" +/* Emit start labels for init and term sections from this version. */ +#define MIN_LD64_INIT_TERM_START_LABELS "136.0" + /* If we have no definition for the linker version, pick the minimum version that will bootstrap the compiler. 
*/ #ifndef LD64_VERSION -- cgit v1.1 From ee914ec4f811243ad72aceea4748687c74f38bc6 Mon Sep 17 00:00:00 2001 From: Jeff Law Date: Fri, 27 Aug 2021 17:01:37 -0400 Subject: Support limited setcc for H8 gcc/ * config/h8300/bitfield.md (cstore4): Remove expander. * config/h8300/h8300.c (h8300_expand_branch): Remove function. * config/h8300/h8300-protos.h (h8300_expadn_branch): Remove prototype. * config/h8300/h8300.md (eqne): New code iterator. (geultu, geultu_to_c): Similarly. * config/h8300/testcompare.md (cstore4): Dummy expander. (store_c_, store_c_i_): New define_insn_and_splits (cmp_c): New pattern --- gcc/config/h8300/bitfield.md | 11 ------ gcc/config/h8300/h8300-protos.h | 1 - gcc/config/h8300/h8300.c | 24 +------------ gcc/config/h8300/h8300.md | 8 +++++ gcc/config/h8300/testcompare.md | 80 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 89 insertions(+), 35 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/h8300/bitfield.md b/gcc/config/h8300/bitfield.md index 82cb161..0d28c75 100644 --- a/gcc/config/h8300/bitfield.md +++ b/gcc/config/h8300/bitfield.md @@ -338,17 +338,6 @@ } [(set_attr "length_table" "bitfield")]) -;;(define_expand "cstore4" -;; [(use (match_operator 1 "eqne_operator" -;; [(match_operand:QHSI 2 "h8300_dst_operand" "") -;; (match_operand:QHSI 3 "h8300_src_operand" "")])) -;; (clobber (match_operand:QHSI 0 "register_operand"))] -;; "TARGET_H8300SX" -;; { -;; h8300_expand_store (operands); -;; DONE; -;; }) - ;;(define_insn "*bstzhireg" ;; [(set (match_operand:HI 0 "register_operand" "=r") ;; (match_operator:HI 1 "eqne_operator" [(cc0) (const_int 0)]))] diff --git a/gcc/config/h8300/h8300-protos.h b/gcc/config/h8300/h8300-protos.h index 3d34401..4a9624f 100644 --- a/gcc/config/h8300/h8300-protos.h +++ b/gcc/config/h8300/h8300-protos.h @@ -45,7 +45,6 @@ extern int compute_a_shift_cc (rtx *, rtx_code); #ifdef HAVE_ATTR_cc extern enum attr_cc compute_plussi_cc (rtx *); #endif -extern void h8300_expand_branch (rtx[]); extern void h8300_expand_store (rtx[]); extern bool expand_a_shift (machine_mode, enum rtx_code, rtx[]); extern int h8300_shift_needs_scratch_p (int, machine_mode, rtx_code); diff --git a/gcc/config/h8300/h8300.c b/gcc/config/h8300/h8300.c index 5f7251a..a63c322 100644 --- a/gcc/config/h8300/h8300.c +++ b/gcc/config/h8300/h8300.c @@ -3256,30 +3256,8 @@ compute_logical_op_length (machine_mode mode, rtx_code code, rtx *operands, rtx_ return length; } - #if 0 -/* Expand a conditional branch. */ - -void -h8300_expand_branch (rtx operands[]) -{ - enum rtx_code code = GET_CODE (operands[0]); - rtx op0 = operands[1]; - rtx op1 = operands[2]; - rtx label = operands[3]; - rtx tmp; - - tmp = gen_rtx_COMPARE (VOIDmode, op0, op1); - emit_insn (gen_rtx_SET (cc0_rtx, tmp)); - - tmp = gen_rtx_fmt_ee (code, VOIDmode, cc0_rtx, const0_rtx); - tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, - gen_rtx_LABEL_REF (VOIDmode, label), - pc_rtx); - emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); -} - - + /* Expand a conditional store. */ void diff --git a/gcc/config/h8300/h8300.md b/gcc/config/h8300/h8300.md index 7f49e42..89bfcf1 100644 --- a/gcc/config/h8300/h8300.md +++ b/gcc/config/h8300/h8300.md @@ -233,6 +233,14 @@ (define_code_iterator logicals [ior xor and]) (define_code_iterator ors [ior xor]) + +(define_code_iterator eqne [eq ne]) + +;; For storing the C flag, map from the unsigned comparison to the right +;; code for testing the C bit. 
+(define_code_iterator geultu [geu ltu]) +(define_code_attr geultu_to_c [(geu "eq") (ltu "ne")]) + (include "movepush.md") (include "mova.md") diff --git a/gcc/config/h8300/testcompare.md b/gcc/config/h8300/testcompare.md index 2919053..9ff7a51 100644 --- a/gcc/config/h8300/testcompare.md +++ b/gcc/config/h8300/testcompare.md @@ -70,6 +70,22 @@ "mov.w %e0,%e0" [(set_attr "length" "2")]) +(define_insn "*cmp_c" + [(set (reg:CCC CC_REG) + (ltu (match_operand:QHSI 0 "h8300_dst_operand" "rQ") + (match_operand:QHSI 1 "h8300_src_operand" "rQi")))] + "reload_completed" + { + if (mode == QImode) + return "cmp.b %X1,%X0"; + else if (mode == HImode) + return "cmp.w %T1,%T0"; + else if (mode == SImode) + return "cmp.l %S1,%S0"; + gcc_unreachable (); + } + [(set_attr "length_table" "add")]) + (define_insn "*cmpqi" [(set (reg:CC CC_REG) (compare (match_operand:QI 0 "h8300_dst_operand" "rQ") @@ -144,3 +160,67 @@ [(parallel [(set (reg:CCZN CC_REG) (compare:CCZN (match_dup 1) (const_int 0))) (set (match_dup 0) (match_dup 1))])]) +;; This exists solely to convince ifcvt to try some store-flag sequences. +;; +;; Essentially we don't want to expose a general store-flag capability. +;; The only generally useful/profitable case is when we want to test the +;; C bit. In that case we can use addx, subx, bst, or bist to get the bit +;; into a GPR. +;; +;; Others could be handled with stc, shifts and masking, but it likely isn't +;; profitable. +;; +(define_expand "cstore4" + [(use (match_operator 1 "eqne_operator" + [(match_operand:QHSI 2 "h8300_dst_operand" "") + (match_operand:QHSI 3 "h8300_src_operand" "")])) + (clobber (match_operand:QHSI 0 "register_operand"))] + "" + { + FAIL; + }) + +;; Storing the C bit is pretty simple since there are many ways to +;; introduce it into a GPR. addx, subx and a variety of bit manipulation +;; instructions +;; +(define_insn "*store_c_" + [(set (match_operand:QHSI 0 "register_operand" "=r") + (eqne:QHSI (reg:CCC CC_REG) (const_int 0)))] + "reload_completed" + { + if ( == NE) + { + if (mode == QImode) + return "xor.b\t%X0,%X0\;bst\t#0,%X0"; + else if (mode == HImode) + return "xor.w\t%T0,%T0\;bst\t#0,%s0"; + else if (mode == SImode) + return "xor.l\t%S0,%S0\;bst\t#0,%w0"; + gcc_unreachable (); + } + else if ( == EQ) + { + if (mode == QImode) + return "xor.b\t%X0,%X0\;bist\t#0,%X0"; + else if (mode == HImode) + return "xor.w\t%T0,%T0\;bist\t#0,%s0"; + else if (mode == SImode) + return "xor.l\t%S0,%S0\;bist\t#0,%w0"; + gcc_unreachable (); + } + } + [(set (attr "length") (symbol_ref "mode == SImode ? 6 : 4"))]) + +;; Recognize this scc and generate code we can match +(define_insn_and_split "*store_c_i_" + [(set (match_operand:QHSI 0 "register_operand" "=r") + (geultu:QHSI (match_operand:QHSI 1 "register_operand" "r") + (match_operand:QHSI 2 "register_operand" "r")))] + "" + "#" + "&& reload_completed" + [(set (reg:CCC CC_REG) + (ltu:CCC (match_dup 1) (match_dup 2))) + (set (match_dup 0) + (:QHSI (reg:CCC CC_REG) (const_int 0)))]) -- cgit v1.1 From fc3e9f58ec18154027aadc6aa055490acb8c2920 Mon Sep 17 00:00:00 2001 From: Iain Sandoe Date: Fri, 4 Sep 2020 14:05:12 +0100 Subject: Darwin, X86 : Implement __cache_clear. We had a NOP cache clear, but there is a suitable mechanism provided by a system call. This connects it up. Signed-off-by: Iain Sandoe gcc/ChangeLog: * config/i386/darwin.h (CLEAR_INSN_CACHE): New. 
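A rough usage sketch of why this matters (assumptions: the buffer comes from an executable mapping such as mmap with PROT_EXEC, and the helper name is invented): __builtin___clear_cache, and libgcc's __clear_cache, sit on top of the CLEAR_INSN_CACHE macro, so with this change a JIT-style caller on Darwin/x86 reaches sys_icache_invalidate instead of a no-op.

#include <string.h>

typedef int (*generated_fn) (void);

/* Copy freshly generated machine code into an executable buffer and make
   sure the instruction stream is coherent before jumping to it.  */
int
run_generated (unsigned char *exec_buf, const unsigned char *code, size_t len)
{
  memcpy (exec_buf, code, len);
  /* Expands via CLEAR_INSN_CACHE (or a libgcc __clear_cache call).  */
  __builtin___clear_cache ((char *) exec_buf, (char *) exec_buf + len);
  return ((generated_fn) exec_buf) ();
}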
--- gcc/config/i386/darwin.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/darwin.h b/gcc/config/i386/darwin.h index 73b06e2..da0ae5b 100644 --- a/gcc/config/i386/darwin.h +++ b/gcc/config/i386/darwin.h @@ -344,3 +344,8 @@ along with GCC; see the file COPYING3. If not see #undef SUBTARGET_SHADOW_OFFSET #define SUBTARGET_SHADOW_OFFSET \ (TARGET_LP64 ? HOST_WIDE_INT_1 << 44 : HOST_WIDE_INT_1 << 29) + +#undef CLEAR_INSN_CACHE +#define CLEAR_INSN_CACHE(beg, end) \ + extern void sys_icache_invalidate(void *start, size_t len); \ + sys_icache_invalidate ((beg), (size_t)((end)-(beg))) -- cgit v1.1 From c3c669ac811429033c0151f910b38fc009e21ca8 Mon Sep 17 00:00:00 2001 From: Iain Sandoe Date: Fri, 27 Aug 2021 19:49:05 +0100 Subject: Darwin: Fixes for darwin_libc_has_function. Firstly, the checks for availability need not be run for any currently supported Darwin version (or for any version of Darwin on x86). In fact, the only test that is needed that differs from the default is for the availbaility of sincos. Test that and then fall back to the default implementation. Secondly, the funtion appears to be called from the Jit library before the value of darwin_macosx_version_min has been set up - at present we work around this by guarding the checks on having a non-null pointer for darwin_macosx_version_min. Signed-off-by: Iain Sandoe gcc/ChangeLog: * config/darwin.c (darwin_libc_has_function): Do not run the checks for x86 or modern Darwin. Make sure that there is a value set for darwin_macosx_version_min before testing. --- gcc/config/darwin.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/darwin.c b/gcc/config/darwin.c index 667fda7..781742f 100644 --- a/gcc/config/darwin.c +++ b/gcc/config/darwin.c @@ -42,6 +42,7 @@ along with GCC; see the file COPYING3. If not see #include "explow.h" #include "expr.h" #include "langhooks.h" +#include "targhooks.h" #include "toplev.h" #include "lto-section-names.h" #include "intl.h" @@ -3661,19 +3662,22 @@ darwin_rename_builtins (void) } } +/* Implementation for the TARGET_LIBC_HAS_FUNCTION hook. */ + bool darwin_libc_has_function (enum function_class fn_class, tree type ATTRIBUTE_UNUSED) { - if (fn_class == function_sincos) + if (fn_class == function_sincos && darwin_macosx_version_min) return (strverscmp (darwin_macosx_version_min, "10.9") >= 0); - +#if DARWIN_PPC && SUPPORT_DARWIN_LEGACY if (fn_class == function_c99_math_complex || fn_class == function_c99_misc) return (TARGET_64BIT - || strverscmp (darwin_macosx_version_min, "10.3") >= 0); - - return true; + || (darwin_macosx_version_min && + strverscmp (darwin_macosx_version_min, "10.3") >= 0)); +#endif + return default_libc_has_function (fn_class, type); } hashval_t -- cgit v1.1 From d73c44800b53c7e130da29e4eff1960b8311ffcd Mon Sep 17 00:00:00 2001 From: Pat Haugen Date: Mon, 30 Aug 2021 10:58:21 -0500 Subject: Enable store fusion on Power10. gcc/ChangeLog: * config/rs6000/rs6000-cpus.def (ISA_3_1_MASKS_SERVER): Add OPTION_MASK_P10_FUSION_2STORE. (POWERPC_MASKS): Likewise. * config/rs6000/rs6000.c (rs6000_option_override_internal): Enable store fusion for Power10. (is_fusable_store): New. (power10_sched_reorder): Likewise. (rs6000_sched_reorder): Do Power10 specific reordering. (rs6000_sched_reorder2): Likewise. * config/rs6000/rs6000.opt: Add new option. gcc/testsuite/ChangeLog: * gcc.target/powerpc/fusion-p10-stst.c: New test. * gcc.target/powerpc/fusion-p10-stst2.c: New test. 
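A sketch of the kind of code the new sched2 reordering targets (illustrative field names, not taken from the testcases; the assembly in the comments is schematic): two non-prefixed base plus displacement stores to adjacent word-sized slots.  When both stores sit in the ready list, power10_sched_reorder moves the second one so it issues immediately after the first, letting the hardware fuse the pair into a single store operation.

struct pair { int lo; int hi; };

void
store_adjacent (struct pair *p, int a, int b)
{
  p->lo = a;   /* stw a,0(rP)                          */
  p->hi = b;   /* stw b,4(rP): adjacent, hence fusable */
}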
--- gcc/config/rs6000/rs6000-cpus.def | 4 +- gcc/config/rs6000/rs6000.c | 95 +++++++++++++++++++++++++++++++++++++++ gcc/config/rs6000/rs6000.opt | 4 ++ 3 files changed, 102 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-cpus.def b/gcc/config/rs6000/rs6000-cpus.def index 6758296..f5812da 100644 --- a/gcc/config/rs6000/rs6000-cpus.def +++ b/gcc/config/rs6000/rs6000-cpus.def @@ -90,7 +90,8 @@ | OPTION_MASK_P10_FUSION_2LOGICAL \ | OPTION_MASK_P10_FUSION_LOGADD \ | OPTION_MASK_P10_FUSION_ADDLOG \ - | OPTION_MASK_P10_FUSION_2ADD) + | OPTION_MASK_P10_FUSION_2ADD \ + | OPTION_MASK_P10_FUSION_2STORE) /* Flags that need to be turned off if -mno-power9-vector. */ #define OTHER_P9_VECTOR_MASKS (OPTION_MASK_FLOAT128_HW \ @@ -143,6 +144,7 @@ | OPTION_MASK_P10_FUSION_LOGADD \ | OPTION_MASK_P10_FUSION_ADDLOG \ | OPTION_MASK_P10_FUSION_2ADD \ + | OPTION_MASK_P10_FUSION_2STORE \ | OPTION_MASK_HTM \ | OPTION_MASK_ISEL \ | OPTION_MASK_MFCRF \ diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index d02c1b6..b7ea148 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -4498,6 +4498,10 @@ rs6000_option_override_internal (bool global_init_p) && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_2ADD) == 0) rs6000_isa_flags |= OPTION_MASK_P10_FUSION_2ADD; + if (TARGET_POWER10 + && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_2STORE) == 0) + rs6000_isa_flags |= OPTION_MASK_P10_FUSION_2STORE; + /* Turn off vector pair/mma options on non-power10 systems. */ else if (!TARGET_POWER10 && TARGET_MMA) { @@ -18933,6 +18937,89 @@ power9_sched_reorder2 (rtx_insn **ready, int lastpos) return cached_can_issue_more; } +/* Determine if INSN is a store to memory that can be fused with a similar + adjacent store. */ + +static bool +is_fusable_store (rtx_insn *insn, rtx *str_mem) +{ + /* Insn must be a non-prefixed base+disp form store. */ + if (is_store_insn (insn, str_mem) + && get_attr_prefixed (insn) == PREFIXED_NO + && get_attr_update (insn) == UPDATE_NO + && get_attr_indexed (insn) == INDEXED_NO) + { + /* Further restrictions by mode and size. */ + if (!MEM_SIZE_KNOWN_P (*str_mem)) + return false; + + machine_mode mode = GET_MODE (*str_mem); + HOST_WIDE_INT size = MEM_SIZE (*str_mem); + + if (INTEGRAL_MODE_P (mode)) + /* Must be word or dword size. */ + return (size == 4 || size == 8); + else if (FLOAT_MODE_P (mode)) + /* Must be dword size. */ + return (size == 8); + } + + return false; +} + +/* Do Power10 specific reordering of the ready list. */ + +static int +power10_sched_reorder (rtx_insn **ready, int lastpos) +{ + rtx mem1; + + /* Do store fusion during sched2 only. */ + if (!reload_completed) + return cached_can_issue_more; + + /* If the prior insn finished off a store fusion pair then simply + reset the counter and return, nothing more to do. */ + if (load_store_pendulum != 0) + { + load_store_pendulum = 0; + return cached_can_issue_more; + } + + /* Try to pair certain store insns to adjacent memory locations + so that the hardware will fuse them to a single operation. */ + if (TARGET_P10_FUSION && TARGET_P10_FUSION_2STORE + && is_fusable_store (last_scheduled_insn, &mem1)) + { + + /* A fusable store was just scheduled. Scan the ready list for another + store that it can fuse with. */ + int pos = lastpos; + while (pos >= 0) + { + rtx mem2; + /* GPR stores can be ascending or descending offsets, FPR/VSR stores + must be ascending only. 
*/ + if (is_fusable_store (ready[pos], &mem2) + && ((INTEGRAL_MODE_P (GET_MODE (mem1)) + && adjacent_mem_locations (mem1, mem2)) + || (FLOAT_MODE_P (GET_MODE (mem1)) + && (adjacent_mem_locations (mem1, mem2) == mem1)))) + { + /* Found a fusable store. Move it to the end of the ready list + so it is scheduled next. */ + move_to_end_of_ready (ready, pos, lastpos); + + load_store_pendulum = -1; + break; + } + pos--; + } + } + + return cached_can_issue_more; +} + /* We are about to begin issuing insns for this clock cycle. */ static int @@ -18959,6 +19046,10 @@ rs6000_sched_reorder (FILE *dump ATTRIBUTE_UNUSED, int sched_verbose, if (rs6000_tune == PROCESSOR_POWER6) load_store_pendulum = 0; + /* Do Power10 dependent reordering. */ + if (rs6000_tune == PROCESSOR_POWER10 && last_scheduled_insn) + power10_sched_reorder (ready, n_ready - 1); + return rs6000_issue_rate (); } @@ -18980,6 +19071,10 @@ rs6000_sched_reorder2 (FILE *dump, int sched_verbose, rtx_insn **ready, && recog_memoized (last_scheduled_insn) >= 0) return power9_sched_reorder2 (ready, *pn_ready - 1); + /* Do Power10 dependent reordering. */ + if (rs6000_tune == PROCESSOR_POWER10 && last_scheduled_insn) + return power10_sched_reorder (ready, *pn_ready - 1); + return cached_can_issue_more; } diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt index 0538db3..3753de1 100644 --- a/gcc/config/rs6000/rs6000.opt +++ b/gcc/config/rs6000/rs6000.opt @@ -514,6 +514,10 @@ mpower10-fusion-2add Target Undocumented Mask(P10_FUSION_2ADD) Var(rs6000_isa_flags) Fuse dependent pairs of add or vaddudm instructions for better performance on power10. +mpower10-fusion-2store +Target Undocumented Mask(P10_FUSION_2STORE) Var(rs6000_isa_flags) +Fuse certain store operations together for better performance on power10. + mcrypto Target Mask(CRYPTO) Var(rs6000_isa_flags) Use ISA 2.07 Category:Vector.AES and Category:Vector.SHA2 instructions. -- cgit v1.1 From b5bc39317bf4da1d51cb6eccf8afcacc90514602 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Thu, 4 Mar 2021 19:54:00 -0600 Subject: rs6000: Darwin builtin support 2021-03-04 Bill Schmidt gcc/ * config/rs6000/darwin.h (SUBTARGET_INIT_BUILTINS): Use the new decl when new_builtins_are_live. * config/rs6000/rs6000-builtin-new.def (__builtin_cfstring): New built-in. 
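A rough usage sketch (Darwin targets only; the string literal and function name are arbitrary): CoreFoundation's CFSTR() macro expands to the constant-CFString builtin, and the enumeration slot reserved by this patch is what lets the Darwin subtarget hook install the real decl for it under the new builtin numbering.

/* Returns a pointer to a compile-time constant CFString object.  */
const void *
greeting (void)
{
  return __builtin___CFStringMakeConstantString ("constant CFString example");
}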
--- gcc/config/rs6000/darwin.h | 8 ++++++-- gcc/config/rs6000/rs6000-builtin-new.def | 6 ++++++ 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/darwin.h b/gcc/config/rs6000/darwin.h index 42f39e6..6abf8e8 100644 --- a/gcc/config/rs6000/darwin.h +++ b/gcc/config/rs6000/darwin.h @@ -504,8 +504,12 @@ #define SUBTARGET_INIT_BUILTINS \ do { \ darwin_patch_builtins (); \ - rs6000_builtin_decls[(unsigned) (RS6000_BUILTIN_CFSTRING)] \ - = darwin_init_cfstring_builtins ((unsigned) (RS6000_BUILTIN_CFSTRING)); \ + if (new_builtins_are_live) \ + rs6000_builtin_decls_x[(unsigned) (RS6000_BIF_CFSTRING)] \ + = darwin_init_cfstring_builtins ((unsigned) (RS6000_BIF_CFSTRING)); \ + else \ + rs6000_builtin_decls[(unsigned) (RS6000_BUILTIN_CFSTRING)] \ + = darwin_init_cfstring_builtins ((unsigned) (RS6000_BUILTIN_CFSTRING)); \ } while(0) /* So far, there is no rs6000_fold_builtin, if one is introduced, then diff --git a/gcc/config/rs6000/rs6000-builtin-new.def b/gcc/config/rs6000/rs6000-builtin-new.def index 3e732ce..6a28d51 100644 --- a/gcc/config/rs6000/rs6000-builtin-new.def +++ b/gcc/config/rs6000/rs6000-builtin-new.def @@ -187,6 +187,12 @@ ; Builtins that have been around since time immemorial or are just ; considered available everywhere. [always] +; __builtin_cfstring is for Darwin, which will replace the decl we +; create here with another one during subtarget processing. We just +; need to ensure it has a slot in the builtin enumeration. + void __builtin_cfstring (); + CFSTRING nothing {} + void __builtin_cpu_init (); CPU_INIT nothing {cpu} -- cgit v1.1 From a5027ea2ad22c82adc68e02dae8be7f69131503a Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Mon, 30 Aug 2021 14:55:11 -0500 Subject: rs6000: Add sanity to V2DI_type_node definitions 2021-08-30 Bill Schmidt gcc/ * config/rs6000/rs6000-call.c (rs6000_init_builtins): Change initialization of V2DI_type_node and unsigned_V2DI_type_node. --- gcc/config/rs6000/rs6000-call.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c index 87f9709..df405e1 100644 --- a/gcc/config/rs6000/rs6000-call.c +++ b/gcc/config/rs6000/rs6000-call.c @@ -13297,15 +13297,23 @@ rs6000_init_builtins (void) tree ftype; tree t; machine_mode mode; + const char *str; if (TARGET_DEBUG_BUILTIN) fprintf (stderr, "rs6000_init_builtins%s%s\n", (TARGET_ALTIVEC) ? ", altivec" : "", (TARGET_VSX) ? ", vsx" : ""); - V2DI_type_node = rs6000_vector_type (TARGET_POWERPC64 ? "__vector long" - : "__vector long long", - long_long_integer_type_node, 2); + if (new_builtins_are_live) + V2DI_type_node = rs6000_vector_type ("__vector long long", + long_long_integer_type_node, 2); + else + { + str = TARGET_POWERPC64 ? "__vector long" : "__vector long long"; + V2DI_type_node = rs6000_vector_type (str, + long_long_integer_type_node, + 2); + } ptr_V2DI_type_node = build_pointer_type (build_qualified_type (V2DI_type_node, TYPE_QUAL_CONST)); @@ -13356,10 +13364,19 @@ rs6000_init_builtins (void) = build_pointer_type (build_qualified_type (unsigned_V4SI_type_node, TYPE_QUAL_CONST)); - unsigned_V2DI_type_node = rs6000_vector_type (TARGET_POWERPC64 - ? 
"__vector unsigned long" - : "__vector unsigned long long", - long_long_unsigned_type_node, 2); + if (new_builtins_are_live) + unsigned_V2DI_type_node + = rs6000_vector_type ("__vector unsigned long long", + long_long_unsigned_type_node, 2); + else + { + str = TARGET_POWERPC64 + ? "__vector unsigned long" + : "__vector unsigned long long"; + unsigned_V2DI_type_node + = rs6000_vector_type (str, long_long_unsigned_type_node, 2); + } + ptr_unsigned_V2DI_type_node = build_pointer_type (build_qualified_type (unsigned_V2DI_type_node, TYPE_QUAL_CONST)); -- cgit v1.1 From 7465c2ed6f1a1dfb2bec4b18bcad5fe3210b3f4e Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Tue, 24 Aug 2021 12:10:19 -0500 Subject: rs6000: Always initialize vector_pair and vector_quad nodes 2021-08-24 Bill Schmidt gcc/ * config/rs6000/rs6000-call.c (rs6000_init_builtins): Remove TARGET_EXTRA_BUILTINS guard. --- gcc/config/rs6000/rs6000-call.c | 49 +++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 26 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c index df405e1..e8625d1 100644 --- a/gcc/config/rs6000/rs6000-call.c +++ b/gcc/config/rs6000/rs6000-call.c @@ -13551,32 +13551,29 @@ rs6000_init_builtins (void) ieee128_float_type_node = ibm128_float_type_node = long_double_type_node; /* Vector pair and vector quad support. */ - if (TARGET_EXTRA_BUILTINS) - { - vector_pair_type_node = make_node (OPAQUE_TYPE); - SET_TYPE_MODE (vector_pair_type_node, OOmode); - TYPE_SIZE (vector_pair_type_node) = bitsize_int (GET_MODE_BITSIZE (OOmode)); - TYPE_PRECISION (vector_pair_type_node) = GET_MODE_BITSIZE (OOmode); - TYPE_SIZE_UNIT (vector_pair_type_node) = size_int (GET_MODE_SIZE (OOmode)); - SET_TYPE_ALIGN (vector_pair_type_node, 256); - TYPE_USER_ALIGN (vector_pair_type_node) = 0; - lang_hooks.types.register_builtin_type (vector_pair_type_node, - "__vector_pair"); - t = build_qualified_type (vector_pair_type_node, TYPE_QUAL_CONST); - ptr_vector_pair_type_node = build_pointer_type (t); - - vector_quad_type_node = make_node (OPAQUE_TYPE); - SET_TYPE_MODE (vector_quad_type_node, XOmode); - TYPE_SIZE (vector_quad_type_node) = bitsize_int (GET_MODE_BITSIZE (XOmode)); - TYPE_PRECISION (vector_quad_type_node) = GET_MODE_BITSIZE (XOmode); - TYPE_SIZE_UNIT (vector_quad_type_node) = size_int (GET_MODE_SIZE (XOmode)); - SET_TYPE_ALIGN (vector_quad_type_node, 512); - TYPE_USER_ALIGN (vector_quad_type_node) = 0; - lang_hooks.types.register_builtin_type (vector_quad_type_node, - "__vector_quad"); - t = build_qualified_type (vector_quad_type_node, TYPE_QUAL_CONST); - ptr_vector_quad_type_node = build_pointer_type (t); - } + vector_pair_type_node = make_node (OPAQUE_TYPE); + SET_TYPE_MODE (vector_pair_type_node, OOmode); + TYPE_SIZE (vector_pair_type_node) = bitsize_int (GET_MODE_BITSIZE (OOmode)); + TYPE_PRECISION (vector_pair_type_node) = GET_MODE_BITSIZE (OOmode); + TYPE_SIZE_UNIT (vector_pair_type_node) = size_int (GET_MODE_SIZE (OOmode)); + SET_TYPE_ALIGN (vector_pair_type_node, 256); + TYPE_USER_ALIGN (vector_pair_type_node) = 0; + lang_hooks.types.register_builtin_type (vector_pair_type_node, + "__vector_pair"); + t = build_qualified_type (vector_pair_type_node, TYPE_QUAL_CONST); + ptr_vector_pair_type_node = build_pointer_type (t); + + vector_quad_type_node = make_node (OPAQUE_TYPE); + SET_TYPE_MODE (vector_quad_type_node, XOmode); + TYPE_SIZE (vector_quad_type_node) = bitsize_int (GET_MODE_BITSIZE (XOmode)); + TYPE_PRECISION (vector_quad_type_node) = 
GET_MODE_BITSIZE (XOmode); + TYPE_SIZE_UNIT (vector_quad_type_node) = size_int (GET_MODE_SIZE (XOmode)); + SET_TYPE_ALIGN (vector_quad_type_node, 512); + TYPE_USER_ALIGN (vector_quad_type_node) = 0; + lang_hooks.types.register_builtin_type (vector_quad_type_node, + "__vector_quad"); + t = build_qualified_type (vector_quad_type_node, TYPE_QUAL_CONST); + ptr_vector_quad_type_node = build_pointer_type (t); /* Initialize the modes for builtin_function_type, mapping a machine mode to tree type node. */ -- cgit v1.1 From f4b41701382885fc6219045bed207ec645fded2e Mon Sep 17 00:00:00 2001 From: liuhongt Date: Mon, 30 Aug 2021 15:05:14 +0800 Subject: Unify UNSPEC_MASKED_EQ/GT to the form of UNSPEC_PCMP. Currently for evex vpcmpeqb instruction, we have two forms of rtl template representation, one is (unspec [op1 op2] UNSPEC_MASK_EQ), the other is (unspec [op1, op2, const_int 0] UNSPEC_PCMP), which increases the maintenance burden, such as optimization (not: vpcmpeqb) to (vpcmpneqb) requires two define_insn_and_split to match the two forms respectively, this patch removes UNSPEC_MASK_EQ/GT, unifying them into the form of UNSPEC_PCMP. gcc/ChangeLog: * config/i386/sse.md (*_ucmp3_1): Change from define_split to define_insn_and_split. (*avx2_eq3): Removed. (_eq3): Adjust pattern (_eq3_1): Rename to .. (*_eq3_1): .. this, and adjust pattern. (*avx2_gt3): Removed. (_gt3): Change from define_insn to define_expand, and adjust pattern. (UNSPEC_MASKED_EQ, UNSPEC_MASKED_GT): Removed. gcc/testsuite/ChangeLog: * gcc.target/i386/avx512bw-vpcmpeqb-1.c: Adjust testcase. * gcc.target/i386/avx512bw-vpcmpeqw-1.c: Ditto. * gcc.target/i386/avx512bw-vpcmpgtb-1.c: Ditto. * gcc.target/i386/avx512bw-vpcmpgtw-1.c: Ditto. * gcc.target/i386/avx512f-vpcmpeqd-1.c: Ditto. * gcc.target/i386/avx512f-vpcmpeqq-1.c: Ditto. * gcc.target/i386/avx512f-vpcmpgtd-1.c: Ditto. * gcc.target/i386/avx512f-vpcmpgtq-1.c: Ditto. * gcc.target/i386/avx512vl-vpcmpeqd-1.c: Ditto. * gcc.target/i386/avx512vl-vpcmpeqq-1.c: Ditto. * gcc.target/i386/avx512vl-vpcmpgtd-1.c: Ditto. * gcc.target/i386/avx512vl-vpcmpgtq-1.c: Ditto. * gcc.target/i386/bitwise_mask_op-1.c: Ditto. * gcc.target/i386/bitwise_mask_op-2.c: Ditto. --- gcc/config/i386/sse.md | 100 +++++++++++++++---------------------------------- 1 file changed, 30 insertions(+), 70 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index ac0c463..5785e73 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -100,8 +100,6 @@ UNSPEC_COMPRESS UNSPEC_COMPRESS_STORE UNSPEC_EXPAND - UNSPEC_MASKED_EQ - UNSPEC_MASKED_GT ;; Mask operations UNSPEC_MASKOP @@ -3426,7 +3424,7 @@ ;; For signed comparison, handle EQ 0: NEQ 4, ;; for unsigned comparison extra handle LE:2, NLE:6, equivalent to EQ and NEQ. 
-(define_split +(define_insn_and_split "*_ucmp3_1" [(set (match_operand: 0 "register_operand") (unspec: [(us_minus:VI12_AVX512VL @@ -3435,9 +3433,11 @@ (match_operand:VI12_AVX512VL 3 "const0_operand") (match_operand:SI 4 "const_0_to_7_operand")] UNSPEC_PCMP_ITER))] - "TARGET_AVX512BW + "TARGET_AVX512BW && ix86_pre_reload_split () && ix86_binary_operator_ok (US_MINUS, mode, operands) && (INTVAL (operands[4]) & ) == 0" + "#" + "&& 1" [(const_int 0)] { /* LE: 2, NLT: 5, NLE: 6, LT: 1 */ @@ -13801,24 +13801,6 @@ (set_attr "prefix" "vex") (set_attr "mode" "OI")]) -(define_insn_and_split "*avx2_eq3" - [(set (match_operand:VI_128_256 0 "register_operand") - (vec_merge:VI_128_256 - (match_operand:VI_128_256 1 "vector_all_ones_operand") - (match_operand:VI_128_256 2 "const0_operand") - (unspec: - [(match_operand:VI_128_256 3 "nonimmediate_operand") - (match_operand:VI_128_256 4 "nonimmediate_operand")] - UNSPEC_MASKED_EQ)))] - "TARGET_AVX512VL && ix86_pre_reload_split () - && !(MEM_P (operands[3]) && MEM_P (operands[4]))" - "#" - "&& 1" - [(set (match_dup 0) - (eq:VI_128_256 - (match_dup 3) - (match_dup 4)))]) - (define_insn_and_split "*avx2_pcmp3_1" [(set (match_operand:VI_128_256 0 "register_operand") (vec_merge:VI_128_256 @@ -13978,8 +13960,9 @@ [(set (match_operand: 0 "register_operand") (unspec: [(match_operand:VI12_AVX512VL 1 "nonimmediate_operand") - (match_operand:VI12_AVX512VL 2 "nonimmediate_operand")] - UNSPEC_MASKED_EQ))] + (match_operand:VI12_AVX512VL 2 "nonimmediate_operand") + (const_int 0)] + UNSPEC_PCMP))] "TARGET_AVX512BW" "ix86_fixup_binary_operands_no_copy (EQ, mode, operands);") @@ -13987,17 +13970,19 @@ [(set (match_operand: 0 "register_operand") (unspec: [(match_operand:VI48_AVX512VL 1 "nonimmediate_operand") - (match_operand:VI48_AVX512VL 2 "nonimmediate_operand")] - UNSPEC_MASKED_EQ))] + (match_operand:VI48_AVX512VL 2 "nonimmediate_operand") + (const_int 0)] + UNSPEC_PCMP))] "TARGET_AVX512F" "ix86_fixup_binary_operands_no_copy (EQ, mode, operands);") -(define_insn "_eq3_1" +(define_insn "*_eq3_1" [(set (match_operand: 0 "register_operand" "=k,k") (unspec: [(match_operand:VI12_AVX512VL 1 "nonimm_or_0_operand" "%v,v") - (match_operand:VI12_AVX512VL 2 "nonimm_or_0_operand" "vm,C")] - UNSPEC_MASKED_EQ))] + (match_operand:VI12_AVX512VL 2 "nonimm_or_0_operand" "vm,C") + (const_int 0)] + UNSPEC_PCMP))] "TARGET_AVX512BW && !(MEM_P (operands[1]) && MEM_P (operands[2]))" "@ vpcmpeq\t{%2, %1, %0|%0, %1, %2} @@ -14007,12 +13992,13 @@ (set_attr "prefix" "evex") (set_attr "mode" "")]) -(define_insn "_eq3_1" +(define_insn "*_eq3_1" [(set (match_operand: 0 "register_operand" "=k,k") (unspec: [(match_operand:VI48_AVX512VL 1 "nonimm_or_0_operand" "%v,v") - (match_operand:VI48_AVX512VL 2 "nonimm_or_0_operand" "vm,C")] - UNSPEC_MASKED_EQ))] + (match_operand:VI48_AVX512VL 2 "nonimm_or_0_operand" "vm,C") + (const_int 0)] + UNSPEC_PCMP))] "TARGET_AVX512F && !(MEM_P (operands[1]) && MEM_P (operands[2]))" "@ vpcmpeq\t{%2, %1, %0|%0, %1, %2} @@ -14082,47 +14068,21 @@ (set_attr "prefix" "vex") (set_attr "mode" "OI")]) -(define_insn_and_split "*avx2_gt3" - [(set (match_operand:VI_128_256 0 "register_operand") - (vec_merge:VI_128_256 - (match_operand:VI_128_256 1 "vector_all_ones_operand") - (match_operand:VI_128_256 2 "const0_operand") - (unspec: - [(match_operand:VI_128_256 3 "register_operand") - (match_operand:VI_128_256 4 "nonimmediate_operand")] - UNSPEC_MASKED_GT)))] - "TARGET_AVX512VL - && ix86_pre_reload_split ()" - "#" - "&& 1" - [(set (match_dup 0) - (gt:VI_128_256 - (match_dup 3) - 
(match_dup 4)))]) - -(define_insn "_gt3" - [(set (match_operand: 0 "register_operand" "=k") +(define_expand "_gt3" + [(set (match_operand: 0 "register_operand") (unspec: - [(match_operand:VI48_AVX512VL 1 "register_operand" "v") - (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm")] UNSPEC_MASKED_GT))] - "TARGET_AVX512F" - "vpcmpgt\t{%2, %1, %0|%0, %1, %2}" - [(set_attr "type" "ssecmp") - (set_attr "prefix_extra" "1") - (set_attr "prefix" "evex") - (set_attr "mode" "")]) + [(match_operand:VI48_AVX512VL 1 "register_operand") + (match_operand:VI48_AVX512VL 2 "nonimmediate_operand") + (const_int 6)] UNSPEC_PCMP))] + "TARGET_AVX512F") -(define_insn "_gt3" - [(set (match_operand: 0 "register_operand" "=k") +(define_expand "_gt3" + [(set (match_operand: 0 "register_operand") (unspec: - [(match_operand:VI12_AVX512VL 1 "register_operand" "v") - (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")] UNSPEC_MASKED_GT))] - "TARGET_AVX512BW" - "vpcmpgt\t{%2, %1, %0|%0, %1, %2}" - [(set_attr "type" "ssecmp") - (set_attr "prefix_extra" "1") - (set_attr "prefix" "evex") - (set_attr "mode" "")]) + [(match_operand:VI12_AVX512VL 1 "register_operand") + (match_operand:VI12_AVX512VL 2 "nonimmediate_operand") + (const_int 6)] UNSPEC_PCMP))] + "TARGET_AVX512BW") (define_insn "*sse2_gt3" [(set (match_operand:VI124_128 0 "register_operand" "=x,x") -- cgit v1.1 From 4ecfc7e3debac53a30558d7ae33e8cdfdf351466 Mon Sep 17 00:00:00 2001 From: YunQiang Su Date: Sat, 28 Aug 2021 07:28:55 -0400 Subject: MIPS: use mips_isa enum instead hardcoded numbers Currently mips-cpu.defs, mips.h, netbsd.h and config.gcc are using hardcoded numbers for isa level. Let's replace them with more readable enum mips_isa. gcc/ChangeLog: * config/mips/mips.h (struct mips_cpu_info): define enum mips_isa; use enum instead of int for 'isa' member. * config.gcc, config/mips/mips.c, config/mips/mips-cpus.def, config/mips/netbsd.h: replace hardcoded numbers with enum. --- gcc/config/mips/mips-cpus.def | 228 +++++++++++++++++++++--------------------- gcc/config/mips/mips.c | 5 +- gcc/config/mips/mips.h | 84 ++++++++++------ gcc/config/mips/netbsd.h | 5 +- 4 files changed, 172 insertions(+), 150 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/mips/mips-cpus.def b/gcc/config/mips/mips-cpus.def index b02294b..45fb6bc 100644 --- a/gcc/config/mips/mips-cpus.def +++ b/gcc/config/mips/mips-cpus.def @@ -33,146 +33,146 @@ along with GCC; see the file COPYING3. If not see where the arguments are the fields of struct mips_cpu_info. */ /* Entries for generic ISAs. */ -MIPS_CPU ("mips1", PROCESSOR_R3000, 1, 0) -MIPS_CPU ("mips2", PROCESSOR_R6000, 2, PTF_AVOID_BRANCHLIKELY_SIZE) -MIPS_CPU ("mips3", PROCESSOR_R4000, 3, PTF_AVOID_BRANCHLIKELY_SIZE) -MIPS_CPU ("mips4", PROCESSOR_R10000, 4, PTF_AVOID_BRANCHLIKELY_SIZE) +MIPS_CPU ("mips1", PROCESSOR_R3000, MIPS_ISA_MIPS1, 0) +MIPS_CPU ("mips2", PROCESSOR_R6000, MIPS_ISA_MIPS2, PTF_AVOID_BRANCHLIKELY_SIZE) +MIPS_CPU ("mips3", PROCESSOR_R4000, MIPS_ISA_MIPS3, PTF_AVOID_BRANCHLIKELY_SIZE) +MIPS_CPU ("mips4", PROCESSOR_R10000, MIPS_ISA_MIPS4, PTF_AVOID_BRANCHLIKELY_SIZE) /* Prefer not to use branch-likely instructions for generic MIPS32rX and MIPS64rX code. The instructions were officially deprecated in revisions 2 and earlier, but revision 3 is likely to downgrade that to a recommendation to avoid the instructions in code that isn't tuned to a specific processor. 
*/ -MIPS_CPU ("mips32", PROCESSOR_4KC, 32, PTF_AVOID_BRANCHLIKELY_ALWAYS) -MIPS_CPU ("mips32r2", PROCESSOR_74KF2_1, 33, PTF_AVOID_BRANCHLIKELY_ALWAYS) +MIPS_CPU ("mips32", PROCESSOR_4KC, MIPS_ISA_MIPS32, PTF_AVOID_BRANCHLIKELY_ALWAYS) +MIPS_CPU ("mips32r2", PROCESSOR_74KF2_1, MIPS_ISA_MIPS32R2, PTF_AVOID_BRANCHLIKELY_ALWAYS) /* mips32r3 is micromips hense why it uses the M4K processor. */ -MIPS_CPU ("mips32r3", PROCESSOR_M4K, 34, PTF_AVOID_BRANCHLIKELY_ALWAYS) -MIPS_CPU ("mips32r5", PROCESSOR_P5600, 36, PTF_AVOID_BRANCHLIKELY_ALWAYS) -MIPS_CPU ("mips32r6", PROCESSOR_I6400, 37, 0) -MIPS_CPU ("mips64", PROCESSOR_5KC, 64, PTF_AVOID_BRANCHLIKELY_ALWAYS) +MIPS_CPU ("mips32r3", PROCESSOR_M4K, MIPS_ISA_MIPS32R3, PTF_AVOID_BRANCHLIKELY_ALWAYS) +MIPS_CPU ("mips32r5", PROCESSOR_P5600, MIPS_ISA_MIPS32R5, PTF_AVOID_BRANCHLIKELY_ALWAYS) +MIPS_CPU ("mips32r6", PROCESSOR_I6400, MIPS_ISA_MIPS32R6, 0) +MIPS_CPU ("mips64", PROCESSOR_5KC, MIPS_ISA_MIPS64, PTF_AVOID_BRANCHLIKELY_ALWAYS) /* ??? For now just tune the generic MIPS64r2 and above for 5KC as well. */ -MIPS_CPU ("mips64r2", PROCESSOR_5KC, 65, PTF_AVOID_BRANCHLIKELY_ALWAYS) -MIPS_CPU ("mips64r3", PROCESSOR_5KC, 66, PTF_AVOID_BRANCHLIKELY_ALWAYS) -MIPS_CPU ("mips64r5", PROCESSOR_5KC, 68, PTF_AVOID_BRANCHLIKELY_ALWAYS) -MIPS_CPU ("mips64r6", PROCESSOR_I6400, 69, 0) +MIPS_CPU ("mips64r2", PROCESSOR_5KC, MIPS_ISA_MIPS64R2, PTF_AVOID_BRANCHLIKELY_ALWAYS) +MIPS_CPU ("mips64r3", PROCESSOR_5KC, MIPS_ISA_MIPS64R3, PTF_AVOID_BRANCHLIKELY_ALWAYS) +MIPS_CPU ("mips64r5", PROCESSOR_5KC, MIPS_ISA_MIPS64R5, PTF_AVOID_BRANCHLIKELY_ALWAYS) +MIPS_CPU ("mips64r6", PROCESSOR_I6400, MIPS_ISA_MIPS64R6, 0) /* MIPS I processors. */ -MIPS_CPU ("r3000", PROCESSOR_R3000, 1, 0) -MIPS_CPU ("r2000", PROCESSOR_R3000, 1, 0) -MIPS_CPU ("r3900", PROCESSOR_R3900, 1, 0) +MIPS_CPU ("r3000", PROCESSOR_R3000, MIPS_ISA_MIPS1, 0) +MIPS_CPU ("r2000", PROCESSOR_R3000, MIPS_ISA_MIPS1, 0) +MIPS_CPU ("r3900", PROCESSOR_R3900, MIPS_ISA_MIPS1, 0) /* MIPS II processors. */ -MIPS_CPU ("r6000", PROCESSOR_R6000, 2, 0) +MIPS_CPU ("r6000", PROCESSOR_R6000, MIPS_ISA_MIPS2, 0) /* MIPS III processors. */ -MIPS_CPU ("r4000", PROCESSOR_R4000, 3, 0) -MIPS_CPU ("vr4100", PROCESSOR_R4100, 3, 0) -MIPS_CPU ("vr4111", PROCESSOR_R4111, 3, 0) -MIPS_CPU ("vr4120", PROCESSOR_R4120, 3, 0) -MIPS_CPU ("vr4130", PROCESSOR_R4130, 3, 0) -MIPS_CPU ("vr4300", PROCESSOR_R4300, 3, 0) -MIPS_CPU ("r4400", PROCESSOR_R4000, 3, 0) -MIPS_CPU ("r4600", PROCESSOR_R4600, 3, 0) -MIPS_CPU ("orion", PROCESSOR_R4600, 3, 0) -MIPS_CPU ("r4650", PROCESSOR_R4650, 3, 0) -MIPS_CPU ("r4700", PROCESSOR_R4700, 3, 0) -MIPS_CPU ("r5900", PROCESSOR_R5900, 3, 0) +MIPS_CPU ("r4000", PROCESSOR_R4000, MIPS_ISA_MIPS3, 0) +MIPS_CPU ("vr4100", PROCESSOR_R4100, MIPS_ISA_MIPS3, 0) +MIPS_CPU ("vr4111", PROCESSOR_R4111, MIPS_ISA_MIPS3, 0) +MIPS_CPU ("vr4120", PROCESSOR_R4120, MIPS_ISA_MIPS3, 0) +MIPS_CPU ("vr4130", PROCESSOR_R4130, MIPS_ISA_MIPS3, 0) +MIPS_CPU ("vr4300", PROCESSOR_R4300, MIPS_ISA_MIPS3, 0) +MIPS_CPU ("r4400", PROCESSOR_R4000, MIPS_ISA_MIPS3, 0) +MIPS_CPU ("r4600", PROCESSOR_R4600, MIPS_ISA_MIPS3, 0) +MIPS_CPU ("orion", PROCESSOR_R4600, MIPS_ISA_MIPS3, 0) +MIPS_CPU ("r4650", PROCESSOR_R4650, MIPS_ISA_MIPS3, 0) +MIPS_CPU ("r4700", PROCESSOR_R4700, MIPS_ISA_MIPS3, 0) +MIPS_CPU ("r5900", PROCESSOR_R5900, MIPS_ISA_MIPS3, 0) /* ST Loongson 2E/2F processors. 
*/ -MIPS_CPU ("loongson2e", PROCESSOR_LOONGSON_2E, 3, PTF_AVOID_BRANCHLIKELY_SPEED) -MIPS_CPU ("loongson2f", PROCESSOR_LOONGSON_2F, 3, PTF_AVOID_BRANCHLIKELY_SPEED) +MIPS_CPU ("loongson2e", PROCESSOR_LOONGSON_2E, MIPS_ISA_MIPS3, PTF_AVOID_BRANCHLIKELY_SPEED) +MIPS_CPU ("loongson2f", PROCESSOR_LOONGSON_2F, MIPS_ISA_MIPS3, PTF_AVOID_BRANCHLIKELY_SPEED) /* MIPS IV processors. */ -MIPS_CPU ("r8000", PROCESSOR_R8000, 4, 0) -MIPS_CPU ("r10000", PROCESSOR_R10000, 4, 0) -MIPS_CPU ("r12000", PROCESSOR_R10000, 4, 0) -MIPS_CPU ("r14000", PROCESSOR_R10000, 4, 0) -MIPS_CPU ("r16000", PROCESSOR_R10000, 4, 0) -MIPS_CPU ("vr5000", PROCESSOR_R5000, 4, 0) -MIPS_CPU ("vr5400", PROCESSOR_R5400, 4, 0) -MIPS_CPU ("vr5500", PROCESSOR_R5500, 4, PTF_AVOID_BRANCHLIKELY_SPEED) -MIPS_CPU ("rm7000", PROCESSOR_R7000, 4, 0) -MIPS_CPU ("rm9000", PROCESSOR_R9000, 4, 0) +MIPS_CPU ("r8000", PROCESSOR_R8000, MIPS_ISA_MIPS4, 0) +MIPS_CPU ("r10000", PROCESSOR_R10000, MIPS_ISA_MIPS4, 0) +MIPS_CPU ("r12000", PROCESSOR_R10000, MIPS_ISA_MIPS4, 0) +MIPS_CPU ("r14000", PROCESSOR_R10000, MIPS_ISA_MIPS4, 0) +MIPS_CPU ("r16000", PROCESSOR_R10000, MIPS_ISA_MIPS4, 0) +MIPS_CPU ("vr5000", PROCESSOR_R5000, MIPS_ISA_MIPS4, 0) +MIPS_CPU ("vr5400", PROCESSOR_R5400, MIPS_ISA_MIPS4, 0) +MIPS_CPU ("vr5500", PROCESSOR_R5500, MIPS_ISA_MIPS4, PTF_AVOID_BRANCHLIKELY_SPEED) +MIPS_CPU ("rm7000", PROCESSOR_R7000, MIPS_ISA_MIPS4, 0) +MIPS_CPU ("rm9000", PROCESSOR_R9000, MIPS_ISA_MIPS4, 0) /* MIPS32 processors. */ -MIPS_CPU ("4kc", PROCESSOR_4KC, 32, 0) -MIPS_CPU ("4km", PROCESSOR_4KC, 32, 0) -MIPS_CPU ("4kp", PROCESSOR_4KP, 32, 0) -MIPS_CPU ("4ksc", PROCESSOR_4KC, 32, 0) +MIPS_CPU ("4kc", PROCESSOR_4KC, MIPS_ISA_MIPS32, 0) +MIPS_CPU ("4km", PROCESSOR_4KC, MIPS_ISA_MIPS32, 0) +MIPS_CPU ("4kp", PROCESSOR_4KP, MIPS_ISA_MIPS32, 0) +MIPS_CPU ("4ksc", PROCESSOR_4KC, MIPS_ISA_MIPS32, 0) /* MIPS32 Release 2 processors. */ -MIPS_CPU ("m4k", PROCESSOR_M4K, 33, 0) -MIPS_CPU ("m14kc", PROCESSOR_M4K, 33, 0) -MIPS_CPU ("m14k", PROCESSOR_M4K, 33, 0) -MIPS_CPU ("m14ke", PROCESSOR_M4K, 33, 0) -MIPS_CPU ("m14kec", PROCESSOR_M4K, 33, 0) -MIPS_CPU ("4kec", PROCESSOR_4KC, 33, 0) -MIPS_CPU ("4kem", PROCESSOR_4KC, 33, 0) -MIPS_CPU ("4kep", PROCESSOR_4KP, 33, 0) -MIPS_CPU ("4ksd", PROCESSOR_4KC, 33, 0) - -MIPS_CPU ("24kc", PROCESSOR_24KC, 33, 0) -MIPS_CPU ("24kf2_1", PROCESSOR_24KF2_1, 33, 0) -MIPS_CPU ("24kf", PROCESSOR_24KF2_1, 33, 0) -MIPS_CPU ("24kf1_1", PROCESSOR_24KF1_1, 33, 0) -MIPS_CPU ("24kfx", PROCESSOR_24KF1_1, 33, 0) -MIPS_CPU ("24kx", PROCESSOR_24KF1_1, 33, 0) - -MIPS_CPU ("24kec", PROCESSOR_24KC, 33, 0) /* 24K with DSP. */ -MIPS_CPU ("24kef2_1", PROCESSOR_24KF2_1, 33, 0) -MIPS_CPU ("24kef", PROCESSOR_24KF2_1, 33, 0) -MIPS_CPU ("24kef1_1", PROCESSOR_24KF1_1, 33, 0) -MIPS_CPU ("24kefx", PROCESSOR_24KF1_1, 33, 0) -MIPS_CPU ("24kex", PROCESSOR_24KF1_1, 33, 0) - -MIPS_CPU ("34kc", PROCESSOR_24KC, 33, 0) /* 34K with MT/DSP. */ -MIPS_CPU ("34kf2_1", PROCESSOR_24KF2_1, 33, 0) -MIPS_CPU ("34kf", PROCESSOR_24KF2_1, 33, 0) -MIPS_CPU ("34kf1_1", PROCESSOR_24KF1_1, 33, 0) -MIPS_CPU ("34kfx", PROCESSOR_24KF1_1, 33, 0) -MIPS_CPU ("34kx", PROCESSOR_24KF1_1, 33, 0) -MIPS_CPU ("34kn", PROCESSOR_24KC, 33, 0) /* 34K with MT but no DSP. */ - -MIPS_CPU ("74kc", PROCESSOR_74KC, 33, PTF_AVOID_IMADD) /* 74K with DSPr2. 
*/ -MIPS_CPU ("74kf2_1", PROCESSOR_74KF2_1, 33, PTF_AVOID_IMADD) -MIPS_CPU ("74kf", PROCESSOR_74KF2_1, 33, PTF_AVOID_IMADD) -MIPS_CPU ("74kf1_1", PROCESSOR_74KF1_1, 33, PTF_AVOID_IMADD) -MIPS_CPU ("74kfx", PROCESSOR_74KF1_1, 33, PTF_AVOID_IMADD) -MIPS_CPU ("74kx", PROCESSOR_74KF1_1, 33, PTF_AVOID_IMADD) -MIPS_CPU ("74kf3_2", PROCESSOR_74KF3_2, 33, PTF_AVOID_IMADD) - -MIPS_CPU ("1004kc", PROCESSOR_24KC, 33, 0) /* 1004K with MT/DSP. */ -MIPS_CPU ("1004kf2_1", PROCESSOR_24KF2_1, 33, 0) -MIPS_CPU ("1004kf", PROCESSOR_24KF2_1, 33, 0) -MIPS_CPU ("1004kf1_1", PROCESSOR_24KF1_1, 33, 0) - -MIPS_CPU ("interaptiv", PROCESSOR_24KF2_1, 33, 0) +MIPS_CPU ("m4k", PROCESSOR_M4K, MIPS_ISA_MIPS32R2, 0) +MIPS_CPU ("m14kc", PROCESSOR_M4K, MIPS_ISA_MIPS32R2, 0) +MIPS_CPU ("m14k", PROCESSOR_M4K, MIPS_ISA_MIPS32R2, 0) +MIPS_CPU ("m14ke", PROCESSOR_M4K, MIPS_ISA_MIPS32R2, 0) +MIPS_CPU ("m14kec", PROCESSOR_M4K, MIPS_ISA_MIPS32R2, 0) +MIPS_CPU ("4kec", PROCESSOR_4KC, MIPS_ISA_MIPS32R2, 0) +MIPS_CPU ("4kem", PROCESSOR_4KC, MIPS_ISA_MIPS32R2, 0) +MIPS_CPU ("4kep", PROCESSOR_4KP, MIPS_ISA_MIPS32R2, 0) +MIPS_CPU ("4ksd", PROCESSOR_4KC, MIPS_ISA_MIPS32R2, 0) + +MIPS_CPU ("24kc", PROCESSOR_24KC, MIPS_ISA_MIPS32R2, 0) +MIPS_CPU ("24kf2_1", PROCESSOR_24KF2_1, MIPS_ISA_MIPS32R2, 0) +MIPS_CPU ("24kf", PROCESSOR_24KF2_1, MIPS_ISA_MIPS32R2, 0) +MIPS_CPU ("24kf1_1", PROCESSOR_24KF1_1, MIPS_ISA_MIPS32R2, 0) +MIPS_CPU ("24kfx", PROCESSOR_24KF1_1, MIPS_ISA_MIPS32R2, 0) +MIPS_CPU ("24kx", PROCESSOR_24KF1_1, MIPS_ISA_MIPS32R2, 0) + +MIPS_CPU ("24kec", PROCESSOR_24KC, MIPS_ISA_MIPS32R2, 0) /* 24K with DSP. */ +MIPS_CPU ("24kef2_1", PROCESSOR_24KF2_1, MIPS_ISA_MIPS32R2, 0) +MIPS_CPU ("24kef", PROCESSOR_24KF2_1, MIPS_ISA_MIPS32R2, 0) +MIPS_CPU ("24kef1_1", PROCESSOR_24KF1_1, MIPS_ISA_MIPS32R2, 0) +MIPS_CPU ("24kefx", PROCESSOR_24KF1_1, MIPS_ISA_MIPS32R2, 0) +MIPS_CPU ("24kex", PROCESSOR_24KF1_1, MIPS_ISA_MIPS32R2, 0) + +MIPS_CPU ("34kc", PROCESSOR_24KC, MIPS_ISA_MIPS32R2, 0) /* 34K with MT/DSP. */ +MIPS_CPU ("34kf2_1", PROCESSOR_24KF2_1, MIPS_ISA_MIPS32R2, 0) +MIPS_CPU ("34kf", PROCESSOR_24KF2_1, MIPS_ISA_MIPS32R2, 0) +MIPS_CPU ("34kf1_1", PROCESSOR_24KF1_1, MIPS_ISA_MIPS32R2, 0) +MIPS_CPU ("34kfx", PROCESSOR_24KF1_1, MIPS_ISA_MIPS32R2, 0) +MIPS_CPU ("34kx", PROCESSOR_24KF1_1, MIPS_ISA_MIPS32R2, 0) +MIPS_CPU ("34kn", PROCESSOR_24KC, MIPS_ISA_MIPS32R2, 0) /* 34K with MT but no DSP. */ + +MIPS_CPU ("74kc", PROCESSOR_74KC, MIPS_ISA_MIPS32R2, PTF_AVOID_IMADD) /* 74K with DSPr2. */ +MIPS_CPU ("74kf2_1", PROCESSOR_74KF2_1, MIPS_ISA_MIPS32R2, PTF_AVOID_IMADD) +MIPS_CPU ("74kf", PROCESSOR_74KF2_1, MIPS_ISA_MIPS32R2, PTF_AVOID_IMADD) +MIPS_CPU ("74kf1_1", PROCESSOR_74KF1_1, MIPS_ISA_MIPS32R2, PTF_AVOID_IMADD) +MIPS_CPU ("74kfx", PROCESSOR_74KF1_1, MIPS_ISA_MIPS32R2, PTF_AVOID_IMADD) +MIPS_CPU ("74kx", PROCESSOR_74KF1_1, MIPS_ISA_MIPS32R2, PTF_AVOID_IMADD) +MIPS_CPU ("74kf3_2", PROCESSOR_74KF3_2, MIPS_ISA_MIPS32R2, PTF_AVOID_IMADD) + +MIPS_CPU ("1004kc", PROCESSOR_24KC, MIPS_ISA_MIPS32R2, 0) /* 1004K with MT/DSP. */ +MIPS_CPU ("1004kf2_1", PROCESSOR_24KF2_1, MIPS_ISA_MIPS32R2, 0) +MIPS_CPU ("1004kf", PROCESSOR_24KF2_1, MIPS_ISA_MIPS32R2, 0) +MIPS_CPU ("1004kf1_1", PROCESSOR_24KF1_1, MIPS_ISA_MIPS32R2, 0) + +MIPS_CPU ("interaptiv", PROCESSOR_24KF2_1, MIPS_ISA_MIPS32R2, 0) /* MIPS32 Release 5 processors. 
*/ -MIPS_CPU ("p5600", PROCESSOR_P5600, 36, (PTF_AVOID_BRANCHLIKELY_SPEED +MIPS_CPU ("p5600", PROCESSOR_P5600, MIPS_ISA_MIPS32R5, (PTF_AVOID_BRANCHLIKELY_SPEED | PTF_AVOID_IMADD)) -MIPS_CPU ("m5100", PROCESSOR_M5100, 36, PTF_AVOID_BRANCHLIKELY_SPEED) -MIPS_CPU ("m5101", PROCESSOR_M5100, 36, PTF_AVOID_BRANCHLIKELY_SPEED) +MIPS_CPU ("m5100", PROCESSOR_M5100, MIPS_ISA_MIPS32R5, PTF_AVOID_BRANCHLIKELY_SPEED) +MIPS_CPU ("m5101", PROCESSOR_M5100, MIPS_ISA_MIPS32R5, PTF_AVOID_BRANCHLIKELY_SPEED) /* MIPS64 processors. */ -MIPS_CPU ("5kc", PROCESSOR_5KC, 64, 0) -MIPS_CPU ("5kf", PROCESSOR_5KF, 64, 0) -MIPS_CPU ("20kc", PROCESSOR_20KC, 64, PTF_AVOID_BRANCHLIKELY_SPEED) -MIPS_CPU ("sb1", PROCESSOR_SB1, 64, PTF_AVOID_BRANCHLIKELY_SPEED) -MIPS_CPU ("sb1a", PROCESSOR_SB1A, 64, PTF_AVOID_BRANCHLIKELY_SPEED) -MIPS_CPU ("sr71000", PROCESSOR_SR71000, 64, PTF_AVOID_BRANCHLIKELY_SPEED) -MIPS_CPU ("xlr", PROCESSOR_XLR, 64, PTF_AVOID_BRANCHLIKELY_SPEED) +MIPS_CPU ("5kc", PROCESSOR_5KC, MIPS_ISA_MIPS64, 0) +MIPS_CPU ("5kf", PROCESSOR_5KF, MIPS_ISA_MIPS64, 0) +MIPS_CPU ("20kc", PROCESSOR_20KC, MIPS_ISA_MIPS64, PTF_AVOID_BRANCHLIKELY_SPEED) +MIPS_CPU ("sb1", PROCESSOR_SB1, MIPS_ISA_MIPS64, PTF_AVOID_BRANCHLIKELY_SPEED) +MIPS_CPU ("sb1a", PROCESSOR_SB1A, MIPS_ISA_MIPS64, PTF_AVOID_BRANCHLIKELY_SPEED) +MIPS_CPU ("sr71000", PROCESSOR_SR71000, MIPS_ISA_MIPS64, PTF_AVOID_BRANCHLIKELY_SPEED) +MIPS_CPU ("xlr", PROCESSOR_XLR, MIPS_ISA_MIPS64, PTF_AVOID_BRANCHLIKELY_SPEED) /* MIPS64 Release 2 processors. */ -MIPS_CPU ("loongson3a", PROCESSOR_GS464, 65, PTF_AVOID_BRANCHLIKELY_SPEED) -MIPS_CPU ("gs464", PROCESSOR_GS464, 65, PTF_AVOID_BRANCHLIKELY_SPEED) -MIPS_CPU ("gs464e", PROCESSOR_GS464E, 65, PTF_AVOID_BRANCHLIKELY_SPEED) -MIPS_CPU ("gs264e", PROCESSOR_GS264E, 65, PTF_AVOID_BRANCHLIKELY_SPEED) -MIPS_CPU ("octeon", PROCESSOR_OCTEON, 65, PTF_AVOID_BRANCHLIKELY_SPEED) -MIPS_CPU ("octeon+", PROCESSOR_OCTEON, 65, PTF_AVOID_BRANCHLIKELY_SPEED) -MIPS_CPU ("octeon2", PROCESSOR_OCTEON2, 65, PTF_AVOID_BRANCHLIKELY_SPEED) -MIPS_CPU ("octeon3", PROCESSOR_OCTEON3, 65, PTF_AVOID_BRANCHLIKELY_SPEED) -MIPS_CPU ("xlp", PROCESSOR_XLP, 65, PTF_AVOID_BRANCHLIKELY_SPEED) +MIPS_CPU ("loongson3a", PROCESSOR_GS464, MIPS_ISA_MIPS64R2, PTF_AVOID_BRANCHLIKELY_SPEED) +MIPS_CPU ("gs464", PROCESSOR_GS464, MIPS_ISA_MIPS64R2, PTF_AVOID_BRANCHLIKELY_SPEED) +MIPS_CPU ("gs464e", PROCESSOR_GS464E, MIPS_ISA_MIPS64R2, PTF_AVOID_BRANCHLIKELY_SPEED) +MIPS_CPU ("gs264e", PROCESSOR_GS264E, MIPS_ISA_MIPS64R2, PTF_AVOID_BRANCHLIKELY_SPEED) +MIPS_CPU ("octeon", PROCESSOR_OCTEON, MIPS_ISA_MIPS64R2, PTF_AVOID_BRANCHLIKELY_SPEED) +MIPS_CPU ("octeon+", PROCESSOR_OCTEON, MIPS_ISA_MIPS64R2, PTF_AVOID_BRANCHLIKELY_SPEED) +MIPS_CPU ("octeon2", PROCESSOR_OCTEON2, MIPS_ISA_MIPS64R2, PTF_AVOID_BRANCHLIKELY_SPEED) +MIPS_CPU ("octeon3", PROCESSOR_OCTEON3, MIPS_ISA_MIPS64R2, PTF_AVOID_BRANCHLIKELY_SPEED) +MIPS_CPU ("xlp", PROCESSOR_XLP, MIPS_ISA_MIPS64R2, PTF_AVOID_BRANCHLIKELY_SPEED) /* MIPS64 Release 6 processors. 
*/ -MIPS_CPU ("i6400", PROCESSOR_I6400, 69, 0) -MIPS_CPU ("i6500", PROCESSOR_I6400, 69, 0) -MIPS_CPU ("p6600", PROCESSOR_P6600, 69, 0) +MIPS_CPU ("i6400", PROCESSOR_I6400, MIPS_ISA_MIPS64R6, 0) +MIPS_CPU ("i6500", PROCESSOR_I6400, MIPS_ISA_MIPS64R6, 0) +MIPS_CPU ("p6600", PROCESSOR_P6600, MIPS_ISA_MIPS64R6, 0) diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c index 2f7ffe8..493d3de 100644 --- a/gcc/config/mips/mips.c +++ b/gcc/config/mips/mips.c @@ -19817,9 +19817,12 @@ mips_set_architecture (const struct mips_cpu_info *info) mips_arch_info = info; mips_arch = info->cpu; mips_isa = info->isa; - if (mips_isa < 32) + if (mips_isa < MIPS_ISA_MIPS32) mips_isa_rev = 0; else + /* we can do this is due to the + * enum of MIPS32rN is from 32 to 37 + * enum of MIPS64rN is from 64 to 69 */ mips_isa_rev = (mips_isa & 31) + 1; } } diff --git a/gcc/config/mips/mips.h b/gcc/config/mips/mips.h index 47aac9d..973372e 100644 --- a/gcc/config/mips/mips.h +++ b/gcc/config/mips/mips.h @@ -42,6 +42,23 @@ extern int target_flags_explicit; #define ABI_EABI 3 #define ABI_O64 4 +enum mips_isa { + MIPS_ISA_MIPS1 = 1, + MIPS_ISA_MIPS2 = 2, + MIPS_ISA_MIPS3 = 3, + MIPS_ISA_MIPS4 = 4, + MIPS_ISA_MIPS32 = 32, + MIPS_ISA_MIPS32R2 = 33, + MIPS_ISA_MIPS32R3 = 34, + MIPS_ISA_MIPS32R5 = 36, + MIPS_ISA_MIPS32R6 = 37, + MIPS_ISA_MIPS64 = 64, + MIPS_ISA_MIPS64R2 = 65, + MIPS_ISA_MIPS64R3 = 66, + MIPS_ISA_MIPS64R5 = 68, + MIPS_ISA_MIPS64R6 = 69 +}; + /* Masks that affect tuning. PTF_AVOID_BRANCHLIKELY_SPEED @@ -81,7 +98,7 @@ struct mips_cpu_info { enum processor cpu; /* The ISA level that the processor implements. */ - int isa; + enum mips_isa isa; /* A mask of PTF_* values. */ unsigned int tune_flags; @@ -232,7 +249,7 @@ struct mips_cpu_info { /* Generate mips16 code */ #define TARGET_MIPS16 ((target_flags & MASK_MIPS16) != 0) /* Generate mips16e code. Default 16bit ASE for mips32* and mips64* */ -#define GENERATE_MIPS16E (TARGET_MIPS16 && mips_isa >= 32) +#define GENERATE_MIPS16E (TARGET_MIPS16 && mips_isa >= MIPS_ISA_MIPS32) /* Generate mips16e register save/restore sequences. */ #define GENERATE_MIPS16E_SAVE_RESTORE (GENERATE_MIPS16E && mips_abi == ABI_32) @@ -247,20 +264,20 @@ struct mips_cpu_info { (TARGET_MIPS16 && mips_code_readable >= CODE_READABLE_PCREL) /* Generic ISA defines. 
*/ -#define ISA_MIPS1 (mips_isa == 1) -#define ISA_MIPS2 (mips_isa == 2) -#define ISA_MIPS3 (mips_isa == 3) -#define ISA_MIPS4 (mips_isa == 4) -#define ISA_MIPS32 (mips_isa == 32) -#define ISA_MIPS32R2 (mips_isa == 33) -#define ISA_MIPS32R3 (mips_isa == 34) -#define ISA_MIPS32R5 (mips_isa == 36) -#define ISA_MIPS32R6 (mips_isa == 37) -#define ISA_MIPS64 (mips_isa == 64) -#define ISA_MIPS64R2 (mips_isa == 65) -#define ISA_MIPS64R3 (mips_isa == 66) -#define ISA_MIPS64R5 (mips_isa == 68) -#define ISA_MIPS64R6 (mips_isa == 69) +#define ISA_MIPS1 (mips_isa == MIPS_ISA_MIPS1) +#define ISA_MIPS2 (mips_isa == MIPS_ISA_MIPS2) +#define ISA_MIPS3 (mips_isa == MIPS_ISA_MIPS3) +#define ISA_MIPS4 (mips_isa == MIPS_ISA_MIPS4) +#define ISA_MIPS32 (mips_isa == MIPS_ISA_MIPS32) +#define ISA_MIPS32R2 (mips_isa == MIPS_ISA_MIPS32R2) +#define ISA_MIPS32R3 (mips_isa == MIPS_ISA_MIPS32R3) +#define ISA_MIPS32R5 (mips_isa == MIPS_ISA_MIPS32R5) +#define ISA_MIPS32R6 (mips_isa == MIPS_ISA_MIPS32R6) +#define ISA_MIPS64 (mips_isa == MIPS_ISA_MIPS64) +#define ISA_MIPS64R2 (mips_isa == MIPS_ISA_MIPS64R2) +#define ISA_MIPS64R3 (mips_isa == MIPS_ISA_MIPS64R3) +#define ISA_MIPS64R5 (mips_isa == MIPS_ISA_MIPS64R5) +#define ISA_MIPS64R6 (mips_isa == MIPS_ISA_MIPS64R6) /* Architecture target defines. */ #define TARGET_LOONGSON_2E (mips_arch == PROCESSOR_LOONGSON_2E) @@ -511,12 +528,13 @@ struct mips_cpu_info { builtin_define ("__mips=4"); \ builtin_define ("_MIPS_ISA=_MIPS_ISA_MIPS4"); \ } \ - else if (mips_isa >= 32 && mips_isa < 64) \ + else if (mips_isa >= MIPS_ISA_MIPS32 \ + && mips_isa < MIPS_ISA_MIPS64) \ { \ builtin_define ("__mips=32"); \ builtin_define ("_MIPS_ISA=_MIPS_ISA_MIPS32"); \ } \ - else if (mips_isa >= 64) \ + else if (mips_isa >= MIPS_ISA_MIPS64) \ { \ builtin_define ("__mips=64"); \ builtin_define ("_MIPS_ISA=_MIPS_ISA_MIPS64"); \ @@ -708,25 +726,25 @@ struct mips_cpu_info { #endif #ifndef MULTILIB_ISA_DEFAULT -#if MIPS_ISA_DEFAULT == 1 +#if MIPS_ISA_DEFAULT == MIPS_ISA_MIPS1 #define MULTILIB_ISA_DEFAULT "mips1" -#elif MIPS_ISA_DEFAULT == 2 +#elif MIPS_ISA_DEFAULT == MIPS_ISA_MIPS2 #define MULTILIB_ISA_DEFAULT "mips2" -#elif MIPS_ISA_DEFAULT == 3 +#elif MIPS_ISA_DEFAULT == MIPS_ISA_MIPS3 #define MULTILIB_ISA_DEFAULT "mips3" -#elif MIPS_ISA_DEFAULT == 4 +#elif MIPS_ISA_DEFAULT == MIPS_ISA_MIPS4 #define MULTILIB_ISA_DEFAULT "mips4" -#elif MIPS_ISA_DEFAULT == 32 +#elif MIPS_ISA_DEFAULT == MIPS_ISA_MIPS32 #define MULTILIB_ISA_DEFAULT "mips32" -#elif MIPS_ISA_DEFAULT == 33 +#elif MIPS_ISA_DEFAULT == MIPS_ISA_MIPS32R2 #define MULTILIB_ISA_DEFAULT "mips32r2" -#elif MIPS_ISA_DEFAULT == 37 +#elif MIPS_ISA_DEFAULT == MIPS_ISA_MIPS32R6 #define MULTILIB_ISA_DEFAULT "mips32r6" -#elif MIPS_ISA_DEFAULT == 64 +#elif MIPS_ISA_DEFAULT == MIPS_ISA_MIPS64 #define MULTILIB_ISA_DEFAULT "mips64" -#elif MIPS_ISA_DEFAULT == 65 +#elif MIPS_ISA_DEFAULT == MIPS_ISA_MIPS64R2 #define MULTILIB_ISA_DEFAULT "mips64r2" -#elif MIPS_ISA_DEFAULT == 69 +#elif MIPS_ISA_DEFAULT == MIPS_ISA_MIPS64R6 #define MULTILIB_ISA_DEFAULT "mips64r6" #else #define MULTILIB_ISA_DEFAULT "mips1" @@ -1275,12 +1293,12 @@ struct mips_cpu_info { && !TARGET_MICROMIPS) /* Likewise mtc1 and mfc1. */ -#define ISA_HAS_XFER_DELAY (mips_isa <= 3 \ +#define ISA_HAS_XFER_DELAY (mips_isa <= MIPS_ISA_MIPS3 \ && !TARGET_MIPS5900 \ && !TARGET_LOONGSON_2EF) /* Likewise floating-point comparisons. 
*/ -#define ISA_HAS_FCMP_DELAY (mips_isa <= 3 \ +#define ISA_HAS_FCMP_DELAY (mips_isa <= MIPS_ISA_MIPS3 \ && !TARGET_MIPS5900 \ && !TARGET_LOONGSON_2EF) @@ -1305,7 +1323,7 @@ struct mips_cpu_info { #define ISA_HAS_SYNCI (mips_isa_rev >= 2 && !TARGET_MIPS16) /* ISA includes sync. */ -#define ISA_HAS_SYNC ((mips_isa >= 2 || TARGET_MIPS3900) && !TARGET_MIPS16) +#define ISA_HAS_SYNC ((mips_isa >= MIPS_ISA_MIPS2 || TARGET_MIPS3900) && !TARGET_MIPS16) #define GENERATE_SYNC \ (target_flags_explicit & MASK_LLSC \ ? TARGET_LLSC && !TARGET_MIPS16 \ @@ -1314,7 +1332,7 @@ struct mips_cpu_info { /* ISA includes ll and sc. Note that this implies ISA_HAS_SYNC because the expanders use both ISA_HAS_SYNC and ISA_HAS_LL_SC instructions. */ -#define ISA_HAS_LL_SC (mips_isa >= 2 && !TARGET_MIPS5900 && !TARGET_MIPS16) +#define ISA_HAS_LL_SC (mips_isa >= MIPS_ISA_MIPS2 && !TARGET_MIPS5900 && !TARGET_MIPS16) #define GENERATE_LL_SC \ (target_flags_explicit & MASK_LLSC \ ? TARGET_LLSC && !TARGET_MIPS16 \ @@ -1342,7 +1360,7 @@ struct mips_cpu_info { #define ISA_HAS_POP (TARGET_OCTEON && !TARGET_MIPS16) /* The CACHE instruction is available in non-MIPS16 code. */ -#define TARGET_CACHE_BUILTIN (mips_isa >= 3) +#define TARGET_CACHE_BUILTIN (mips_isa >= MIPS_ISA_MIPS3) /* The CACHE instruction is available. */ #define ISA_HAS_CACHE (TARGET_CACHE_BUILTIN && !TARGET_MIPS16) diff --git a/gcc/config/mips/netbsd.h b/gcc/config/mips/netbsd.h index 5844f00..85c2779 100644 --- a/gcc/config/mips/netbsd.h +++ b/gcc/config/mips/netbsd.h @@ -84,9 +84,10 @@ along with GCC; see the file COPYING3. If not see builtin_define ("__mips=3"); \ else if (ISA_MIPS4) \ builtin_define ("__mips=4"); \ - else if (mips_isa >= 32 && mips_isa < 64) \ + else if (mips_isa >= MIPS_ISA_MIPS32 \ + && mips_isa < MIPS_ISA_MIPS64) \ builtin_define ("__mips=32"); \ - else if (mips_isa >= 64) \ + else if (mips_isa >= MIPS_ISA_64) \ builtin_define ("__mips=64"); \ if (mips_isa_rev > 0) \ builtin_define_with_int_value ("__mips_isa_rev", \ -- cgit v1.1 From d904008df267cbcc01bd6edf98fa0789fb6e94da Mon Sep 17 00:00:00 2001 From: YunQiang Su Date: Sat, 28 Aug 2021 07:39:04 -0400 Subject: MIPS: add .module mipsREV to all output asm file Currently, the asm output file for MIPS has no rev info. It can make some trouble, for example: assembler is mips1 by default, gcc is fpxx by default. To assemble the output of gcc -S, we have to pass -mips2 to assembler. gcc/ChangeLog: * config/mips/mips.c (mips_module_isa_name): New. 
mips_file_start: add .module mipsREV to all asm output --- gcc/config/mips/mips.c | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c index 493d3de..a7087ec 100644 --- a/gcc/config/mips/mips.c +++ b/gcc/config/mips/mips.c @@ -9841,6 +9841,44 @@ mips_mdebug_abi_name (void) } } +static const char * +mips_module_isa_name () +{ + switch (mips_isa) + { + case MIPS_ISA_MIPS1: + return "mips1"; + case MIPS_ISA_MIPS2: + return "mips2"; + case MIPS_ISA_MIPS3: + return "mips3"; + case MIPS_ISA_MIPS4: + return "mips4"; + case MIPS_ISA_MIPS32: + return "mips32"; + case MIPS_ISA_MIPS32R2: + return "mips32r2"; + case MIPS_ISA_MIPS32R3: + return "mips32r3"; + case MIPS_ISA_MIPS32R5: + return "mips32r5"; + case MIPS_ISA_MIPS32R6: + return "mips32r6"; + case MIPS_ISA_MIPS64: + return "mips64"; + case MIPS_ISA_MIPS64R2: + return "mips64r2"; + case MIPS_ISA_MIPS64R3: + return "mips64r3"; + case MIPS_ISA_MIPS64R5: + return "mips64r5"; + case MIPS_ISA_MIPS64R6: + return "mips64r6"; + default: + gcc_unreachable (); + } +} + /* Implement TARGET_ASM_FILE_START. */ static void @@ -9873,6 +9911,9 @@ mips_file_start (void) mips_nan == MIPS_IEEE_754_2008 ? "2008" : "legacy"); #ifdef HAVE_AS_DOT_MODULE + fprintf (asm_out_file, "\t.module\t%s\n", + mips_module_isa_name ()); + /* Record the FP ABI. See below for comments. */ if (TARGET_NO_FLOAT) #ifdef HAVE_AS_GNU_ATTRIBUTE -- cgit v1.1 From e4d86078881bb7bb57bc6e68c22211707d2b3dc7 Mon Sep 17 00:00:00 2001 From: Andrew Pinski Date: Fri, 23 Jul 2021 17:21:06 +0000 Subject: Fix x86/56337 : 1<<28 alignment is broken The problem here is the x86_64 back-end uses a signed integer for alignment and then divides by BITS_PER_UNIT so if we had INT_MIN (which is what 1<<28*8 is), we would get the wrong result. This fixes the problem by using unsigned for the argument to x86_output_aligned_bss and x86_output_aligned_bss. OK? Bootstrapped and tested on x86_64-linux-gnu. gcc/ChangeLog: PR target/56337 * config/i386/i386-protos.h (x86_output_aligned_bss): Change align argument to unsigned type. (x86_elf_aligned_decl_common): Likewise. * config/i386/i386.c (x86_elf_aligned_decl_common): Likewise. (x86_output_aligned_bss): Likewise. 
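For illustration (hypothetical reproducer; the actual PR 56337 testcase may differ), a
declaration along these lines requests a 256 MiB (1 << 28 byte) alignment.  Expressed in
bits that is 1 << 31, which wraps to INT_MIN in a signed int, so the later division by
BITS_PER_UNIT produced a bogus alignment:

  /* Hypothetical reproducer: request a 1 << 28 byte alignment.  */
  char buf[64] __attribute__ ((aligned (1 << 28)));

Passing the alignment as unsigned avoids the overflow.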
--- gcc/config/i386/i386-protos.h | 4 ++-- gcc/config/i386/i386.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 2fd1307..355df11 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -326,9 +326,9 @@ struct ix86_address extern int ix86_decompose_address (rtx, struct ix86_address *); extern int memory_address_length (rtx, bool); extern void x86_output_aligned_bss (FILE *, tree, const char *, - unsigned HOST_WIDE_INT, int); + unsigned HOST_WIDE_INT, unsigned); extern void x86_elf_aligned_decl_common (FILE *, tree, const char *, - unsigned HOST_WIDE_INT, int); + unsigned HOST_WIDE_INT, unsigned); #ifdef RTX_CODE extern void ix86_fp_comparison_codes (enum rtx_code code, enum rtx_code *, diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 3bb2cab..4681b66 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -841,7 +841,7 @@ x86_64_elf_unique_section (tree decl, int reloc) void x86_elf_aligned_decl_common (FILE *file, tree decl, const char *name, unsigned HOST_WIDE_INT size, - int align) + unsigned align) { if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC) && size > (unsigned int)ix86_section_threshold) @@ -862,7 +862,7 @@ x86_elf_aligned_decl_common (FILE *file, tree decl, void x86_output_aligned_bss (FILE *file, tree decl, const char *name, - unsigned HOST_WIDE_INT size, int align) + unsigned HOST_WIDE_INT size, unsigned align) { if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC) && size > (unsigned int)ix86_section_threshold) -- cgit v1.1 From d158c3f77738e1d44aa117c1674e9ec8dee38661 Mon Sep 17 00:00:00 2001 From: Jeff Law Date: Tue, 31 Aug 2021 09:48:02 -0400 Subject: More stabs removal. gcc/ * config.gcc (cris-*-elf, cris-*-none): Remove dbxelf.h from tm_file. (m32r-*-elf, m32rle-*-elf, m32r-*-linux): Likewise. (mn10300-*-*, am33_2.0-*-linux*): Likewise. (xtensa*-*-elf, xtensa*-*-linux, xtensa*-*-uclinux): Likewise. (m32c-*-elf*, m32c-*-rtems*): Likewise. * config/cris/cris.h (DBX_NO_XREFS): Remove. (DBX_CONTIN_LENGTH, DBX_CONTIN_CHAR): Likewise. * config/m32r/m32r.h (DBXOUT_SOURCE_LINE): Likewise. (DBX_DEBUGGING_INFO, DBX_CONTIN_LENGTH): Likewise. * config/mn10300/mn10300.h (DEFAULT_GDB_EXTENSIONS): Likewise. * config/mn10300/linux.h (DBX_REGISTER_NAMES): Likewise. --- gcc/config/cris/cris.h | 18 ------------------ gcc/config/m32r/m32r.h | 28 +--------------------------- gcc/config/mn10300/linux.h | 2 -- gcc/config/mn10300/mn10300.h | 3 --- 4 files changed, 1 insertion(+), 50 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/cris/cris.h b/gcc/config/cris/cris.h index 1ab830e..4276b6a7 100644 --- a/gcc/config/cris/cris.h +++ b/gcc/config/cris/cris.h @@ -901,24 +901,6 @@ struct cum_args {int regs;}; /* FIXME: Investigate DEBUGGER_AUTO_OFFSET, DEBUGGER_ARG_OFFSET. */ -/* Node: DBX Options */ - -/* Is this correct? Check later. */ -#define DBX_NO_XREFS - -#define DBX_CONTIN_LENGTH 0 - -/* FIXME: Is this needed when we have 0 DBX_CONTIN_LENGTH? */ -#define DBX_CONTIN_CHAR '?' - - -/* Node: DBX Hooks */ -/* (no definitions) */ - -/* Node: File names and DBX */ -/* (no definitions) */ - - /* Node: DWARF */ /* (no definitions) */ diff --git a/gcc/config/m32r/m32r.h b/gcc/config/m32r/m32r.h index 83a4b0b..047805f 100644 --- a/gcc/config/m32r/m32r.h +++ b/gcc/config/m32r/m32r.h @@ -769,29 +769,6 @@ L2: .word STATIC /* Globalizing directive for a label. 
*/ #define GLOBAL_ASM_OP "\t.global\t" -/* We do not use DBX_LINES_FUNCTION_RELATIVE or - dbxout_stab_value_internal_label_diff here because - we need to use .debugsym for the line label. */ - -#define DBX_OUTPUT_SOURCE_LINE(file, line, counter) \ - do \ - { \ - const char * begin_label = \ - XSTR (XEXP (DECL_RTL (current_function_decl), 0), 0); \ - char label[64]; \ - ASM_GENERATE_INTERNAL_LABEL (label, "LM", counter); \ - \ - dbxout_begin_stabn_sline (line); \ - assemble_name (file, label); \ - putc ('-', file); \ - assemble_name (file, begin_label); \ - fputs ("\n\t.debugsym ", file); \ - assemble_name (file, label); \ - putc ('\n', file); \ - counter += 1; \ - } \ - while (0) - /* How to refer to registers in assembler output. This sequence is indexed by compiler's hard-register-number (see above). */ #ifndef SUBTARGET_REGISTER_NAMES @@ -930,16 +907,13 @@ L2: .word STATIC /* Debugging information. */ -/* Generate DBX and DWARF debugging information. */ -#define DBX_DEBUGGING_INFO 1 +/* Generate DWARF debugging information. */ #define DWARF2_DEBUGGING_INFO 1 /* Use DWARF2 debugging info by default. */ #undef PREFERRED_DEBUGGING_TYPE #define PREFERRED_DEBUGGING_TYPE DWARF2_DEBUG -/* Turn off splitting of long stabs. */ -#define DBX_CONTIN_LENGTH 0 /* Miscellaneous. */ diff --git a/gcc/config/mn10300/linux.h b/gcc/config/mn10300/linux.h index 657bfe4..44825eb 100644 --- a/gcc/config/mn10300/linux.h +++ b/gcc/config/mn10300/linux.h @@ -44,8 +44,6 @@ #undef PROCESSOR_DEFAULT #define PROCESSOR_DEFAULT PROCESSOR_AM33_2 -#define DBX_REGISTER_NUMBER(REGNO) (REGNO) - extern int mn10300_protect_label; #undef PRINT_OPERAND diff --git a/gcc/config/mn10300/mn10300.h b/gcc/config/mn10300/mn10300.h index d94d8e6..cb0d072 100644 --- a/gcc/config/mn10300/mn10300.h +++ b/gcc/config/mn10300/mn10300.h @@ -649,9 +649,6 @@ do { \ if ((LOG) != 0) \ fprintf (FILE, "\t.align %d\n", (LOG)) -/* We don't have to worry about dbx compatibility for the mn10300. */ -#define DEFAULT_GDB_EXTENSIONS 1 - /* Use dwarf2 debugging info by default. */ #undef PREFERRED_DEBUGGING_TYPE #define PREFERRED_DEBUGGING_TYPE DWARF2_DEBUG -- cgit v1.1 From a45786e9a31f995087d8cb42bc3a4fe06911e588 Mon Sep 17 00:00:00 2001 From: Andrew Pinski Date: Tue, 31 Aug 2021 04:41:14 +0000 Subject: Fix target/101934: aarch64 memset code creates unaligned stores for -mstrict-align The problem here is the aarch64_expand_setmem code did not check STRICT_ALIGNMENT if it is creating an overlapping store. This patch adds that check and the testcase works. gcc/ChangeLog: PR target/101934 * config/aarch64/aarch64.c (aarch64_expand_setmem): Check STRICT_ALIGNMENT before creating an overlapping store. gcc/testsuite/ChangeLog: PR target/101934 * gcc.target/aarch64/memset-strict-align-1.c: New test. --- gcc/config/aarch64/aarch64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 3213585..26d59ba 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -23566,8 +23566,8 @@ aarch64_expand_setmem (rtx *operands) /* Do certain trailing copies as overlapping if it's going to be cheaper. i.e. less instructions to do so. For instance doing a 15 byte copy it's more efficient to do two overlapping 8 byte copies than - 8 + 4 + 2 + 1. */ - if (n > 0 && n < copy_limit / 2) + 8 + 4 + 2 + 1. Only do this when -mstrict-align is not supplied. 
*/ + if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT) { next_mode = smallest_mode_for_size (n, MODE_INT); int n_bits = GET_MODE_BITSIZE (next_mode).to_constant (); -- cgit v1.1 From 6e16b2123dd1cf5bab23cda0ce65223e5d55eded Mon Sep 17 00:00:00 2001 From: YunQiang Su Date: Thu, 2 Sep 2021 10:17:18 +0800 Subject: Revert "MIPS: add .module mipsREV to all output asm file" This reverts commit d904008df267cbcc01bd6edf98fa0789fb6e94da. ".module MIPSRev" has higher priority than -march=octeon or like. It makes assembler cannot recognize the extension instructions of octeon (See pr62030-octeon.c). --- gcc/config/mips/mips.c | 41 ----------------------------------------- 1 file changed, 41 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c index a7087ec..493d3de 100644 --- a/gcc/config/mips/mips.c +++ b/gcc/config/mips/mips.c @@ -9841,44 +9841,6 @@ mips_mdebug_abi_name (void) } } -static const char * -mips_module_isa_name () -{ - switch (mips_isa) - { - case MIPS_ISA_MIPS1: - return "mips1"; - case MIPS_ISA_MIPS2: - return "mips2"; - case MIPS_ISA_MIPS3: - return "mips3"; - case MIPS_ISA_MIPS4: - return "mips4"; - case MIPS_ISA_MIPS32: - return "mips32"; - case MIPS_ISA_MIPS32R2: - return "mips32r2"; - case MIPS_ISA_MIPS32R3: - return "mips32r3"; - case MIPS_ISA_MIPS32R5: - return "mips32r5"; - case MIPS_ISA_MIPS32R6: - return "mips32r6"; - case MIPS_ISA_MIPS64: - return "mips64"; - case MIPS_ISA_MIPS64R2: - return "mips64r2"; - case MIPS_ISA_MIPS64R3: - return "mips64r3"; - case MIPS_ISA_MIPS64R5: - return "mips64r5"; - case MIPS_ISA_MIPS64R6: - return "mips64r6"; - default: - gcc_unreachable (); - } -} - /* Implement TARGET_ASM_FILE_START. */ static void @@ -9911,9 +9873,6 @@ mips_file_start (void) mips_nan == MIPS_IEEE_754_2008 ? "2008" : "legacy"); #ifdef HAVE_AS_DOT_MODULE - fprintf (asm_out_file, "\t.module\t%s\n", - mips_module_isa_name ()); - /* Record the FP ABI. See below for comments. */ if (TARGET_NO_FLOAT) #ifdef HAVE_AS_GNU_ATTRIBUTE -- cgit v1.1 From 7cbc870c495cebc61f5d0ebb975856c207a42fab Mon Sep 17 00:00:00 2001 From: liuhongt Date: Mon, 5 Jul 2021 17:05:45 +0800 Subject: Enable _Float16 type for TARGET_SSE2 and above. gcc/ChangeLog: * config/i386/i386-modes.def (FLOAT_MODE): Define ieee HFmode. * config/i386/i386.c (enum x86_64_reg_class): Add X86_64_SSEHF_CLASS. (merge_classes): Handle X86_64_SSEHF_CLASS. (examine_argument): Ditto. (construct_container): Ditto. (classify_argument): Ditto, and set HFmode/HCmode to X86_64_SSEHF_CLASS. (function_value_32): Return _FLoat16/Complex Float16 by %xmm0. (function_value_64): Return _Float16/Complex Float16 by SSE register. (ix86_print_operand): Handle CONST_DOUBLE HFmode. (ix86_secondary_reload): Require gpr as intermediate register to store _Float16 from sse register when sse4 is not available. (ix86_libgcc_floating_mode_supported_p): Enable _FLoat16 under sse2. (ix86_scalar_mode_supported_p): Ditto. (TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P): Defined. * config/i386/i386.h (VALID_SSE2_REG_MODE): Add HFmode. (VALID_INT_MODE_P): Add HFmode and HCmode. * config/i386/i386.md (*pushhf_rex64): New define_insn. (*pushhf): Ditto. (*movhf_internal): Ditto. * doc/extend.texi (Half-Precision Floating Point): Documemt _Float16 for x86. gcc/lto/ChangeLog: * lto-lang.c (lto_type_for_mode): Return float16_type_node when mode == TYPE_MODE (float16_type_node). gcc/testsuite/ChangeLog * gcc.target/i386/sse2-float16-1.c: New test. * gcc.target/i386/sse2-float16-2.c: Ditto. 
* gcc.target/i386/sse2-float16-3.c: Ditto. * gcc.target/i386/float16-5.c: New test. --- gcc/config/i386/i386-modes.def | 1 + gcc/config/i386/i386.c | 91 +++++++++++++++++++++++++++++-- gcc/config/i386/i386.h | 3 +- gcc/config/i386/i386.md | 118 +++++++++++++++++++++++++++++++++++++++-- 4 files changed, 205 insertions(+), 8 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-modes.def b/gcc/config/i386/i386-modes.def index 4e7014b..9232f59 100644 --- a/gcc/config/i386/i386-modes.def +++ b/gcc/config/i386/i386-modes.def @@ -23,6 +23,7 @@ along with GCC; see the file COPYING3. If not see FRACTIONAL_FLOAT_MODE (XF, 80, 12, ieee_extended_intel_96_format); FLOAT_MODE (TF, 16, ieee_quad_format); +FLOAT_MODE (HF, 2, ieee_half_format); /* In ILP32 mode, XFmode has size 12 and alignment 4. In LP64 mode, XFmode has size and alignment 16. */ diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 4681b66..bfefbd7 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -387,6 +387,7 @@ enum x86_64_reg_class X86_64_INTEGER_CLASS, X86_64_INTEGERSI_CLASS, X86_64_SSE_CLASS, + X86_64_SSEHF_CLASS, X86_64_SSESF_CLASS, X86_64_SSEDF_CLASS, X86_64_SSEUP_CLASS, @@ -2027,8 +2028,10 @@ merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2) return X86_64_MEMORY_CLASS; /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */ - if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS) - || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS)) + if ((class1 == X86_64_INTEGERSI_CLASS + && (class2 == X86_64_SSESF_CLASS || class2 == X86_64_SSEHF_CLASS)) + || (class2 == X86_64_INTEGERSI_CLASS + && (class1 == X86_64_SSESF_CLASS || class1 == X86_64_SSEHF_CLASS))) return X86_64_INTEGERSI_CLASS; if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS) @@ -2182,6 +2185,8 @@ classify_argument (machine_mode mode, const_tree type, /* The partial classes are now full classes. */ if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4) subclasses[0] = X86_64_SSE_CLASS; + if (subclasses[0] == X86_64_SSEHF_CLASS && bytes != 2) + subclasses[0] = X86_64_SSE_CLASS; if (subclasses[0] == X86_64_INTEGERSI_CLASS && !((bit_offset % 64) == 0 && bytes == 4)) subclasses[0] = X86_64_INTEGER_CLASS; @@ -2354,6 +2359,12 @@ classify_argument (machine_mode mode, const_tree type, gcc_unreachable (); case E_CTImode: return 0; + case E_HFmode: + if (!(bit_offset % 64)) + classes[0] = X86_64_SSEHF_CLASS; + else + classes[0] = X86_64_SSE_CLASS; + return 1; case E_SFmode: if (!(bit_offset % 64)) classes[0] = X86_64_SSESF_CLASS; @@ -2371,6 +2382,15 @@ classify_argument (machine_mode mode, const_tree type, classes[0] = X86_64_SSE_CLASS; classes[1] = X86_64_SSEUP_CLASS; return 2; + case E_HCmode: + classes[0] = X86_64_SSE_CLASS; + if (!(bit_offset % 64)) + return 1; + else + { + classes[1] = X86_64_SSEHF_CLASS; + return 2; + } case E_SCmode: classes[0] = X86_64_SSE_CLASS; if (!(bit_offset % 64)) @@ -2485,6 +2505,7 @@ examine_argument (machine_mode mode, const_tree type, int in_return, (*int_nregs)++; break; case X86_64_SSE_CLASS: + case X86_64_SSEHF_CLASS: case X86_64_SSESF_CLASS: case X86_64_SSEDF_CLASS: (*sse_nregs)++; @@ -2584,13 +2605,14 @@ construct_container (machine_mode mode, machine_mode orig_mode, /* First construct simple cases. Avoid SCmode, since we want to use single register to pass this type. 
*/ - if (n == 1 && mode != SCmode) + if (n == 1 && mode != SCmode && mode != HCmode) switch (regclass[0]) { case X86_64_INTEGER_CLASS: case X86_64_INTEGERSI_CLASS: return gen_rtx_REG (mode, intreg[0]); case X86_64_SSE_CLASS: + case X86_64_SSEHF_CLASS: case X86_64_SSESF_CLASS: case X86_64_SSEDF_CLASS: if (mode != BLKmode) @@ -2687,6 +2709,14 @@ construct_container (machine_mode mode, machine_mode orig_mode, GEN_INT (i*8)); intreg++; break; + case X86_64_SSEHF_CLASS: + exp [nexps++] + = gen_rtx_EXPR_LIST (VOIDmode, + gen_rtx_REG (HFmode, + GET_SSE_REGNO (sse_regno)), + GEN_INT (i*8)); + sse_regno++; + break; case X86_64_SSESF_CLASS: exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode, @@ -3907,6 +3937,19 @@ function_value_32 (machine_mode orig_mode, machine_mode mode, /* Most things go in %eax. */ regno = AX_REG; + /* Return _Float16/_Complex _Foat16 by sse register. */ + if (mode == HFmode) + regno = FIRST_SSE_REG; + if (mode == HCmode) + { + rtx ret = gen_rtx_PARALLEL (mode, rtvec_alloc(1)); + XVECEXP (ret, 0, 0) + = gen_rtx_EXPR_LIST (VOIDmode, + gen_rtx_REG (SImode, FIRST_SSE_REG), + GEN_INT (0)); + return ret; + } + /* Override FP return register with %xmm0 for local functions when SSE math is enabled or for functions with sseregparm attribute. */ if ((fn || fntype) && (mode == SFmode || mode == DFmode)) @@ -3943,6 +3986,8 @@ function_value_64 (machine_mode orig_mode, machine_mode mode, switch (mode) { + case E_HFmode: + case E_HCmode: case E_SFmode: case E_SCmode: case E_DFmode: @@ -13455,6 +13500,15 @@ ix86_print_operand (FILE *file, rtx x, int code) (file, addr, MEM_ADDR_SPACE (x), code == 'p' || code == 'P'); } + else if (CONST_DOUBLE_P (x) && GET_MODE (x) == HFmode) + { + long l = real_to_target (NULL, CONST_DOUBLE_REAL_VALUE (x), + REAL_MODE_FORMAT (HFmode)); + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('$', file); + fprintf (file, "0x%04x", (unsigned int) l); + } + else if (CONST_DOUBLE_P (x) && GET_MODE (x) == SFmode) { long l; @@ -19107,6 +19161,16 @@ ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass, return NO_REGS; } + /* Require movement to gpr, and then store to memory. */ + if (mode == HFmode + && !TARGET_SSE4_1 + && SSE_CLASS_P (rclass) + && !in_p && MEM_P (x)) + { + sri->extra_cost = 1; + return GENERAL_REGS; + } + /* This condition handles corner case where an expression involving pointers gets vectorized. We're trying to use the address of a stack slot as a vector initializer. @@ -21781,10 +21845,27 @@ ix86_scalar_mode_supported_p (scalar_mode mode) return default_decimal_float_supported_p (); else if (mode == TFmode) return true; + else if (mode == HFmode && TARGET_SSE2) + return true; else return default_scalar_mode_supported_p (mode); } +/* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE + if MODE is HFmode, and punt to the generic implementation otherwise. */ + +static bool +ix86_libgcc_floating_mode_supported_p (scalar_float_mode mode) +{ + /* NB: Always return TRUE for HFmode so that the _Float16 type will + be defined by the C front-end for AVX512FP16 intrinsics. We will + issue an error in ix86_expand_move for HFmode if AVX512FP16 isn't + enabled. */ + return ((mode == HFmode && TARGET_SSE2) + ? true + : default_libgcc_floating_mode_supported_p (mode)); +} + /* Implements target hook vector_mode_supported_p. 
*/ static bool ix86_vector_mode_supported_p (machine_mode mode) @@ -24067,6 +24148,10 @@ ix86_run_selftests (void) #undef TARGET_SCALAR_MODE_SUPPORTED_P #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p +#undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P +#define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \ +ix86_libgcc_floating_mode_supported_p + #undef TARGET_VECTOR_MODE_SUPPORTED_P #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 6511422..f671dae 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -1025,7 +1025,7 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); #define VALID_SSE2_REG_MODE(MODE) \ ((MODE) == V16QImode || (MODE) == V8HImode || (MODE) == V2DFmode \ || (MODE) == V4QImode || (MODE) == V2HImode || (MODE) == V1SImode \ - || (MODE) == V2DImode || (MODE) == DFmode) + || (MODE) == V2DImode || (MODE) == DFmode || (MODE) == HFmode) #define VALID_SSE_REG_MODE(MODE) \ ((MODE) == V1TImode || (MODE) == TImode \ @@ -1054,6 +1054,7 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); || (MODE) == CQImode || (MODE) == CHImode \ || (MODE) == CSImode || (MODE) == CDImode \ || (MODE) == SDmode || (MODE) == DDmode \ + || (MODE) == HFmode || (MODE) == HCmode \ || (MODE) == V4QImode || (MODE) == V2HImode || (MODE) == V1SImode \ || (TARGET_64BIT \ && ((MODE) == TImode || (MODE) == CTImode \ diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 528116d..0cd151c 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -1226,6 +1226,9 @@ ;; All x87 floating point modes (define_mode_iterator X87MODEF [SF DF XF]) +;; All x87 floating point modes plus HF +(define_mode_iterator X87MODEFH [SF DF XF HF]) + ;; All SSE floating point modes (define_mode_iterator SSEMODEF [SF DF TF]) (define_mode_attr ssevecmodef [(SF "V4SF") (DF "V2DF") (TF "TF")]) @@ -3134,6 +3137,32 @@ operands[0] = replace_equiv_address (operands[0], stack_pointer_rtx); }) +(define_insn "*pushhf_rex64" + [(set (match_operand:HF 0 "push_operand" "=X,X") + (match_operand:HF 1 "nonmemory_no_elim_operand" "r,x"))] + "TARGET_64BIT" +{ + /* Anything else should be already split before reg-stack. */ + gcc_assert (which_alternative == 0); + return "push{q}\t%q1"; +} + [(set_attr "isa" "*,sse4") + (set_attr "type" "push,multi") + (set_attr "mode" "DI,TI")]) + +(define_insn "*pushhf" + [(set (match_operand:HF 0 "push_operand" "=X,X") + (match_operand:HF 1 "general_no_elim_operand" "rmF,x"))] + "!TARGET_64BIT" +{ + /* Anything else should be already split before reg-stack. */ + gcc_assert (which_alternative == 0); + return "push{l}\t%k1"; +} + [(set_attr "isa" "*,sse4") + (set_attr "type" "push,multi") + (set_attr "mode" "SI,TI")]) + (define_insn "*pushsf_rex64" [(set (match_operand:SF 0 "push_operand" "=X,X,X") (match_operand:SF 1 "nonmemory_no_elim_operand" "f,rF,v"))] @@ -3162,10 +3191,11 @@ (set_attr "unit" "i387,*,*") (set_attr "mode" "SF,SI,SF")]) +(define_mode_iterator MODESH [SF HF]) ;; %%% Kill this when call knows how to work this out. 
(define_split - [(set (match_operand:SF 0 "push_operand") - (match_operand:SF 1 "any_fp_register_operand"))] + [(set (match_operand:MODESH 0 "push_operand") + (match_operand:MODESH 1 "any_fp_register_operand"))] "reload_completed" [(set (reg:P SP_REG) (plus:P (reg:P SP_REG) (match_dup 2))) (set (match_dup 0) (match_dup 1))] @@ -3213,8 +3243,8 @@ "ix86_expand_move (TFmode, operands); DONE;") (define_expand "mov" - [(set (match_operand:X87MODEF 0 "nonimmediate_operand") - (match_operand:X87MODEF 1 "general_operand"))] + [(set (match_operand:X87MODEFH 0 "nonimmediate_operand") + (match_operand:X87MODEFH 1 "general_operand"))] "" "ix86_expand_move (mode, operands); DONE;") @@ -3650,6 +3680,86 @@ ] (const_string "*")))]) +(define_insn "*movhf_internal" + [(set (match_operand:HF 0 "nonimmediate_operand" + "=?r,?m,v,v,?r,m,?v,v") + (match_operand:HF 1 "general_operand" + "rmF,rF,C,v, v,v, r,m"))] + "!(MEM_P (operands[0]) && MEM_P (operands[1])) + && (lra_in_progress + || reload_completed + || !CONST_DOUBLE_P (operands[1]) + || (TARGET_SSE && TARGET_SSE_MATH + && standard_sse_constant_p (operands[1], HFmode) == 1) + || memory_operand (operands[0], HFmode))" +{ + switch (get_attr_type (insn)) + { + case TYPE_IMOV: + return "mov{w}\t{%1, %0|%0, %1}"; + + case TYPE_SSELOG1: + return standard_sse_constant_opcode (insn, operands); + + case TYPE_SSEMOV: + return ix86_output_ssemov (insn, operands); + + case TYPE_SSELOG: + if (SSE_REG_P (operands[0])) + return MEM_P (operands[1]) + ? "pinsrw\t{$0, %1, %0|%0, %1, 0}" + : "pinsrw\t{$0, %k1, %0|%0, %k1, 0}"; + else + return MEM_P (operands[1]) + ? "pextrw\t{$0, %1, %0|%0, %1, 0}" + : "pextrw\t{$0, %1, %k0|%k0, %k1, 0}"; + + default: + gcc_unreachable (); + } +} + [(set (attr "isa") + (cond [(eq_attr "alternative" "2,3,4,6,7") + (const_string "sse2") + (eq_attr "alternative" "5") + (const_string "sse4") + ] + (const_string "*"))) + (set (attr "type") + (cond [(eq_attr "alternative" "0,1") + (const_string "imov") + (eq_attr "alternative" "2") + (const_string "sselog1") + (eq_attr "alternative" "4,5,6,7") + (const_string "sselog") + ] + (const_string "ssemov"))) + (set (attr "memory") + (cond [(eq_attr "alternative" "4,6") + (const_string "none") + (eq_attr "alternative" "5") + (const_string "store") + (eq_attr "alternative" "7") + (const_string "load") + ] + (const_string "*"))) + (set (attr "prefix") + (cond [(eq_attr "alternative" "0,1") + (const_string "orig") + ] + (const_string "maybe_vex"))) + (set (attr "mode") + (cond [(eq_attr "alternative" "0,1") + (const_string "HI") + (eq_attr "alternative" "2") + (const_string "V4SF") + (eq_attr "alternative" "4,5,6,7") + (const_string "TI") + (eq_attr "alternative" "3") + (const_string "SF") + ] + (const_string "*")))]) + (define_split [(set (match_operand 0 "any_fp_register_operand") (match_operand 1 "memory_operand"))] -- cgit v1.1 From de6795bbf58c7085933a1f86a88d8193ea72e26b Mon Sep 17 00:00:00 2001 From: liuhongt Date: Thu, 2 Sep 2021 12:49:46 +0800 Subject: Remove macro check for __AMX_BF16/INT8/TILE__ in header file. gcc/ChangeLog: PR target/102166 * config/i386/amxbf16intrin.h : Remove macro check for __AMX_BF16__. * config/i386/amxint8intrin.h : Remove macro check for __AMX_INT8__. * config/i386/amxtileintrin.h : Remove macro check for __AMX_TILE__. gcc/testsuite/ChangeLog: PR target/102166 * g++.target/i386/pr102166.C: New test. 
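For illustration, the pattern this change enables looks roughly like the following
(hypothetical snippet; the committed pr102166.C test may differ).  Previously, when the
translation unit was compiled without -mamx-tile, the removed macro check hid the
intrinsic definitions completely, so they could not be enabled on a per-function basis:

  /* Whole TU compiled without -mamx-tile; the target attribute
     supplies the ISA for this one function.  */
  #include <immintrin.h>

  __attribute__ ((target ("amx-tile")))
  void
  load_cfg (const void *cfg)
  {
    _tile_loadconfig (cfg);
  }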
--- gcc/config/i386/amxbf16intrin.h | 2 +- gcc/config/i386/amxint8intrin.h | 2 +- gcc/config/i386/amxtileintrin.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/amxbf16intrin.h b/gcc/config/i386/amxbf16intrin.h index 8c24cdd..1d60e8e 100644 --- a/gcc/config/i386/amxbf16intrin.h +++ b/gcc/config/i386/amxbf16intrin.h @@ -34,7 +34,7 @@ #define __DISABLE_AMX_BF16__ #endif /* __AMX_BF16__ */ -#if defined(__x86_64__) && defined(__AMX_BF16__) +#if defined(__x86_64__) #define _tile_dpbf16ps_internal(dst,src1,src2) \ __asm__ volatile\ ("{tdpbf16ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdpbf16ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::) diff --git a/gcc/config/i386/amxint8intrin.h b/gcc/config/i386/amxint8intrin.h index 180c243..dbb7b6c 100644 --- a/gcc/config/i386/amxint8intrin.h +++ b/gcc/config/i386/amxint8intrin.h @@ -34,7 +34,7 @@ #define __DISABLE_AMX_INT8__ #endif /* __AMX_INT8__ */ -#if defined(__x86_64__) && defined(__AMX_INT8__) +#if defined(__x86_64__) #define _tile_int8_dp_internal(name,dst,src1,src2) \ __asm__ volatile \ ("{"#name"\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|"#name"\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::) diff --git a/gcc/config/i386/amxtileintrin.h b/gcc/config/i386/amxtileintrin.h index 16c8b6e..75d784a 100644 --- a/gcc/config/i386/amxtileintrin.h +++ b/gcc/config/i386/amxtileintrin.h @@ -34,7 +34,7 @@ #define __DISABLE_AMX_TILE__ #endif /* __AMX_TILE__ */ -#if defined(__x86_64__) && defined(__AMX_TILE__) +#if defined(__x86_64__) extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _tile_loadconfig (const void *__config) -- cgit v1.1 From 98f1dd0212d57659d1234914791f51ca9f3aba89 Mon Sep 17 00:00:00 2001 From: Andrew Pinski Date: Fri, 3 Sep 2021 02:12:29 +0000 Subject: Fix some GC issues in the aarch64 back-end. I got some ICEs in my latest testsing while running the libstdc++ testsuite. I had noticed the problem was connected to types and had just touched the builtins code but nothing which could have caused this and I looked for some types/variables that were not being marked with GTY. OK? Bootstrapped and tested on aarch64-linux-gnu with no regressions. gcc/ChangeLog: * config/aarch64/aarch64-builtins.c (struct aarch64_simd_type_info): Mark with GTY. (aarch64_simd_types): Likewise. (aarch64_simd_intOI_type_node): Likewise. (aarch64_simd_intCI_type_node): Likewise. (aarch64_simd_intXI_type_node): Likewise. * config/aarch64/aarch64.h (aarch64_fp16_type_node): Likewise. (aarch64_fp16_ptr_type_node): Likewise. (aarch64_bf16_type_node): Likewise. (aarch64_bf16_ptr_type_node): Likewise. 
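A minimal sketch of the underlying problem (illustrative, not code from the patch): any
file-scope cache of trees in the back end must be registered as a garbage-collector
root, otherwise ggc_collect () can free the nodes it still references:

  /* Without the GTY(()) marker this cache is invisible to the GC, so the
     cached type may be collected while the back end still holds a
     pointer to it.  */
  static GTY (()) tree cached_type_node = NULL_TREE;

Marking the aarch64_simd_types table and the static type nodes this way keeps them
alive across collections.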
--- gcc/config/aarch64/aarch64-builtins.c | 10 +++++----- gcc/config/aarch64/aarch64.h | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c index f6b41d9..eef9fc0 100644 --- a/gcc/config/aarch64/aarch64-builtins.c +++ b/gcc/config/aarch64/aarch64-builtins.c @@ -593,7 +593,7 @@ enum aarch64_simd_type }; #undef ENTRY -struct aarch64_simd_type_info +struct GTY(()) aarch64_simd_type_info { enum aarch64_simd_type type; @@ -625,14 +625,14 @@ struct aarch64_simd_type_info #define ENTRY(E, M, Q, G) \ {E, "__" #E, #G "__" #E, NULL_TREE, NULL_TREE, E_##M##mode, qualifier_##Q}, -static struct aarch64_simd_type_info aarch64_simd_types [] = { +static GTY(()) struct aarch64_simd_type_info aarch64_simd_types [] = { #include "aarch64-simd-builtin-types.def" }; #undef ENTRY -static tree aarch64_simd_intOI_type_node = NULL_TREE; -static tree aarch64_simd_intCI_type_node = NULL_TREE; -static tree aarch64_simd_intXI_type_node = NULL_TREE; +static GTY(()) tree aarch64_simd_intOI_type_node = NULL_TREE; +static GTY(()) tree aarch64_simd_intCI_type_node = NULL_TREE; +static GTY(()) tree aarch64_simd_intXI_type_node = NULL_TREE; /* The user-visible __fp16 type, and a pointer to that type. Used across the back-end. */ diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h index bfffbcd..a5ba6c2 100644 --- a/gcc/config/aarch64/aarch64.h +++ b/gcc/config/aarch64/aarch64.h @@ -1262,13 +1262,13 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); /* This type is the user-visible __fp16, and a pointer to that type. We need it in many places in the backend. Defined in aarch64-builtins.c. */ -extern tree aarch64_fp16_type_node; -extern tree aarch64_fp16_ptr_type_node; +extern GTY(()) tree aarch64_fp16_type_node; +extern GTY(()) tree aarch64_fp16_ptr_type_node; /* This type is the user-visible __bf16, and a pointer to that type. Defined in aarch64-builtins.c. */ -extern tree aarch64_bf16_type_node; -extern tree aarch64_bf16_ptr_type_node; +extern GTY(()) tree aarch64_bf16_type_node; +extern GTY(()) tree aarch64_bf16_ptr_type_node; /* The generic unwind code in libgcc does not initialize the frame pointer. So in order to unwind a function using a frame pointer, the very first -- cgit v1.1 From 6b69bf5729852d0510abbe8e04078ddecef0cf17 Mon Sep 17 00:00:00 2001 From: Andrew Pinski Date: Thu, 2 Sep 2021 20:59:04 +0000 Subject: Fix target/102173 ICE after error recovery After the recent r12-3278-823685221de986a change, the testcase gcc.target/aarch64/sve/acle/general-c/type_redef_1.c started to ICE as the code was not ready for error_mark_node in the type. This fixes that and the testcase now passes. gcc/ChangeLog: * config/aarch64/aarch64-sve-builtins.cc (register_vector_type): Handle error_mark_node as the type of the type_decl. --- gcc/config/aarch64/aarch64-sve-builtins.cc | 1 + 1 file changed, 1 insertion(+) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc b/gcc/config/aarch64/aarch64-sve-builtins.cc index f71b287..bc92213 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins.cc +++ b/gcc/config/aarch64/aarch64-sve-builtins.cc @@ -3416,6 +3416,7 @@ register_vector_type (vector_type_index type) installing an incorrect type. 
*/ if (decl && TREE_CODE (decl) == TYPE_DECL + && TREE_TYPE (decl) != error_mark_node && TYPE_MAIN_VARIANT (TREE_TYPE (decl)) == vectype) vectype = TREE_TYPE (decl); acle_vector_types[0][type] = vectype; -- cgit v1.1 From 2484f7a4b0f52e6ed04754be336f1fa6fde47f6b Mon Sep 17 00:00:00 2001 From: Segher Boessenkool Date: Thu, 2 Sep 2021 16:38:24 +0000 Subject: rs6000: Don't use r12 for CR save on ELFv2 (PR102107) CR is saved and/or restored on some paths where GPR12 is already live since it has a meaning in the calling convention in the ELFv2 ABI. It is not completely clear to me that we can always use r11 here, but it does seem save, there is checking code (to detect conflicts here), and it is stage 1. So here goes. 2021-09-03 Segher Boessenkool PR target/102107 * config/rs6000/rs6000-logue.c (rs6000_emit_prologue): On ELFv2 use r11 instead of r12 for CR save, in all cases. --- gcc/config/rs6000/rs6000-logue.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-logue.c b/gcc/config/rs6000/rs6000-logue.c index 07337c4..e363d56 100644 --- a/gcc/config/rs6000/rs6000-logue.c +++ b/gcc/config/rs6000/rs6000-logue.c @@ -3293,10 +3293,13 @@ rs6000_emit_prologue (void) /* If we need to save CR, put it into r12 or r11. Choose r12 except when r12 will be needed by out-of-line gpr save. */ - cr_save_regno = ((DEFAULT_ABI == ABI_AIX || DEFAULT_ABI == ABI_ELFv2) - && !(strategy & (SAVE_INLINE_GPRS - | SAVE_NOINLINE_GPRS_SAVES_LR)) - ? 11 : 12); + if (DEFAULT_ABI == ABI_AIX + && !(strategy & (SAVE_INLINE_GPRS | SAVE_NOINLINE_GPRS_SAVES_LR))) + cr_save_regno = 11; + else if (DEFAULT_ABI == ABI_ELFv2) + cr_save_regno = 11; + else + cr_save_regno = 12; if (!WORLD_SAVE_P (info) && info->cr_save_p && REGNO (frame_reg_rtx) != cr_save_regno -- cgit v1.1 From b27416a7a91b7e6b6b018411ac85cad556ff9903 Mon Sep 17 00:00:00 2001 From: Jeff Law Date: Sun, 5 Sep 2021 00:08:34 -0400 Subject: Improve handling of C bit for setcc insns gcc/ * config/h8300/h8300.md (QHSI2 mode iterator): New mode iterator. * config/h8300/testcompare.md (store_c): Update name, use new QHSI2 iterator. (store_neg_c, store_shifted_c): New patterns. --- gcc/config/h8300/h8300.md | 1 + gcc/config/h8300/testcompare.md | 122 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 120 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/h8300/h8300.md b/gcc/config/h8300/h8300.md index 89bfcf1..e81e21b 100644 --- a/gcc/config/h8300/h8300.md +++ b/gcc/config/h8300/h8300.md @@ -223,6 +223,7 @@ (define_mode_iterator HSI [HI SI]) (define_mode_iterator QHSI [QI HI SI]) +(define_mode_iterator QHSI2 [QI HI SI]) (define_mode_iterator QHSIF [QI HI SI SF]) diff --git a/gcc/config/h8300/testcompare.md b/gcc/config/h8300/testcompare.md index 9ff7a51..0ee3e36 100644 --- a/gcc/config/h8300/testcompare.md +++ b/gcc/config/h8300/testcompare.md @@ -212,11 +212,96 @@ } [(set (attr "length") (symbol_ref "mode == SImode ? 6 : 4"))]) +;; Similarly, but with a negated result +(define_insn "*store_neg_c_" + [(set (match_operand:QHSI 0 "register_operand" "=r") + (neg:QHSI (ne:QHSI (reg:CCC CC_REG) (const_int 0))))] + "reload_completed" + { + if (mode == QImode) + return "subx\t%X0,%X0"; + else if (mode == HImode) + return "subx\t%X0,%X0\;exts.w\t%T0"; + else if (mode == SImode) + return "subx\t%X0,%X0\;exts.w\t%T0\;exts.l\t%S0"; + gcc_unreachable (); + } + [(set + (attr "length") + (symbol_ref "(mode == SImode ? 6 : mode == HImode ? 
4 : 2)"))]) + +;; Using b[i]st we can store the C bit into any of the low 16 bits of +;; a destination. We can also rotate it up into the high bit of a 32 bit +;; destination. +(define_insn "*store_shifted_c" + [(set (match_operand:QHSI 0 "register_operand" "=r") + (ashift:QHSI (eqne:QHSI (reg:CCC CC_REG) (const_int 0)) + (match_operand 1 "immediate_operand" "n")))] + "(reload_completed + && (INTVAL (operands[1]) == 31 || INTVAL (operands[1]) <= 15))" + { + if ( == NE) + { + if (mode == QImode) + return "xor.b\t%X0,%X0\;bst\t%1,%X0"; + else if (mode == HImode && INTVAL (operands[1]) < 8) + return "xor.w\t%T0,%T0\;bst\t%1,%X0"; + else if (mode == HImode) + { + operands[1] = GEN_INT (INTVAL (operands[1]) - 8); + output_asm_insn ("xor.w\t%T0,%T0\;bst\t%1,%t0", operands); + return ""; + } + else if (mode == SImode && INTVAL (operands[1]) == 31) + return "xor.l\t%S0,%S0\;rotxr.l\t%S0"; + else if (mode == SImode && INTVAL (operands[1]) < 8) + return "xor.l\t%S0,%S0\;bst\t%1,%X0"; + else if (mode == SImode) + { + operands[1] = GEN_INT (INTVAL (operands[1]) - 8); + output_asm_insn ("xor.l\t%S0,%S0\;bst\t%1,%t0", operands); + return ""; + } + gcc_unreachable (); + } + else if ( == EQ) + { + if (mode == QImode) + return "xor.b\t%X0,%X0\;bist\t%1,%X0"; + else if (mode == HImode && INTVAL (operands[1]) < 8) + return "xor.w\t%T0,%T0\;bist\t%1,%X0"; + else if (mode == HImode) + { + operands[1] = GEN_INT (INTVAL (operands[1]) - 8); + output_asm_insn ("xor.w\t%T0,%T0\;bist\t%1,%t0", operands); + return ""; + } + else if (mode == SImode && INTVAL (operands[1]) == 31) + return "xor.l\t%S0,%S0\;bixor\t#0,%X0\;rotxr.l\t%S0"; + else if (mode == SImode && INTVAL (operands[1]) < 8) + return "xor.l\t%S0,%S0\;bist\t%1,%X0"; + else if (mode == SImode) + { + operands[1] = GEN_INT (INTVAL (operands[1]) - 8); + output_asm_insn ("xor.l\t%S0,%S0\;bist\t%1,%t0", operands); + return ""; + } + gcc_unreachable (); + } + gcc_unreachable (); + } + [(set + (attr "length") + (symbol_ref "(mode == QImode ? 4 + : mode == HImode ? 4 + : == NE ? 6 + : INTVAL (operands[1]) == 31 ? 8 : 6)"))]) + ;; Recognize this scc and generate code we can match -(define_insn_and_split "*store_c_i_" +(define_insn_and_split "*store_c" [(set (match_operand:QHSI 0 "register_operand" "=r") - (geultu:QHSI (match_operand:QHSI 1 "register_operand" "r") - (match_operand:QHSI 2 "register_operand" "r")))] + (geultu:QHSI (match_operand:QHSI2 1 "register_operand" "r") + (match_operand:QHSI2 2 "register_operand" "r")))] "" "#" "&& reload_completed" @@ -224,3 +309,34 @@ (ltu:CCC (match_dup 1) (match_dup 2))) (set (match_dup 0) (:QHSI (reg:CCC CC_REG) (const_int 0)))]) + +;; We can fold in negation of the result and generate better code than +;; what the generic bits would do when testing for C == 1 +(define_insn_and_split "*store_neg_c" + [(set (match_operand:QHSI 0 "register_operand" "=r") + (neg:QHSI + (ltu:QHSI (match_operand:QHSI2 1 "register_operand" "r") + (match_operand:QHSI2 2 "register_operand" "r"))))] + "" + "#" + "&& reload_completed" + [(set (reg:CCC CC_REG) + (ltu:CCC (match_dup 1) (match_dup 2))) + (set (match_dup 0) + (neg:QHSI (ne:QHSI (reg:CCC CC_REG) (const_int 0))))]) + +;; We can use rotates and bst/bist to put the C bit into various places +;; in the destination. 
+(define_insn_and_split "*store_shifted_c" + [(set (match_operand:QHSI 0 "register_operand" "=r") + (ashift:QHSI (geultu:QHSI (match_operand:QHSI2 1 "register_operand" "r") + (match_operand:QHSI2 2 "register_operand" "r")) + (match_operand 3 "immediate_operand" "n")))] + "INTVAL (operands[3]) == 31 || INTVAL (operands[3]) <= 15" + "#" + "&& reload_completed" + [(set (reg:CCC CC_REG) (ltu:CCC (match_dup 1) (match_dup 2))) + (set (match_dup 0) + (ashift:QHSI (:QHSI (reg:CCC CC_REG) (const_int 0)) + (match_dup 3)))]) + -- cgit v1.1 From 652bef70d392f9541b12ef65b461009c8c8fd54a Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Sat, 4 Sep 2021 08:28:00 -0700 Subject: x86: Add non-destructive source to @xorsign3_1 Add non-destructive source alternative to @xorsign3_1 for AVX. gcc/ PR target/89984 * config/i386/i386-expand.c (ix86_split_xorsign): Use operands[2]. * config/i386/i386.md (@xorsign3_1): Add non-destructive source alternative for AVX. gcc/testsuite/ PR target/89984 * gcc.target/i386/pr89984-1.c: New test. * gcc.target/i386/pr89984-2.c: Likewise. * gcc.target/i386/xorsign-avx.c: Likewise. --- gcc/config/i386/i386-expand.c | 13 ++++++++----- gcc/config/i386/i386.md | 11 ++++++----- 2 files changed, 14 insertions(+), 10 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 2500dbf..273a0ba 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -2279,21 +2279,24 @@ void ix86_split_xorsign (rtx operands[]) { machine_mode mode, vmode; - rtx dest, op0, mask, x; + rtx dest, op0, op1, mask, x; dest = operands[0]; op0 = operands[1]; + op1 = operands[2]; mask = operands[3]; mode = GET_MODE (dest); vmode = GET_MODE (mask); - dest = lowpart_subreg (vmode, dest, mode); - x = gen_rtx_AND (vmode, dest, mask); - emit_insn (gen_rtx_SET (dest, x)); + op1 = lowpart_subreg (vmode, op1, mode); + x = gen_rtx_AND (vmode, op1, mask); + emit_insn (gen_rtx_SET (op1, x)); op0 = lowpart_subreg (vmode, op0, mode); - x = gen_rtx_XOR (vmode, dest, op0); + x = gen_rtx_XOR (vmode, op1, op0); + + dest = lowpart_subreg (vmode, dest, mode); emit_insn (gen_rtx_SET (dest, x)); } diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 0cd151c..18b91c7 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -10806,17 +10806,18 @@ "ix86_expand_xorsign (operands); DONE;") (define_insn_and_split "@xorsign3_1" - [(set (match_operand:MODEF 0 "register_operand" "=Yv") + [(set (match_operand:MODEF 0 "register_operand" "=Yv,Yv") (unspec:MODEF - [(match_operand:MODEF 1 "register_operand" "Yv") - (match_operand:MODEF 2 "register_operand" "0") - (match_operand: 3 "nonimmediate_operand" "Yvm")] + [(match_operand:MODEF 1 "register_operand" "Yv,Yv") + (match_operand:MODEF 2 "register_operand" "0,Yv") + (match_operand: 3 "nonimmediate_operand" "Yvm,Yvm")] UNSPEC_XORSIGN))] "SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH" "#" "&& reload_completed" [(const_int 0)] - "ix86_split_xorsign (operands); DONE;") + "ix86_split_xorsign (operands); DONE;" + [(set_attr "isa" "noavx,avx")]) ;; One complement instructions -- cgit v1.1 From 58572bbb62c9588c658fd7656ee359d27c306fb2 Mon Sep 17 00:00:00 2001 From: YunQiang Su Date: Fri, 3 Sep 2021 03:32:26 -0400 Subject: MIPS: add .module arch and ase to all output asm Currently, the asm output file for MIPS has no rev info. It can make some trouble, for example: assembler is mips1 by default, gcc is fpxx by default. To assemble the output of gcc -S, we have to pass -mips2 to assembler. 
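For example (illustrative output; the exact directives depend on the configured
defaults and options), with this change the assembly for a mips32r2/fpxx configuration
starts with something like:

	.module	fp=xx
	.module	arch=mips32r2

so the assembler no longer needs an explicit ISA option.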
The same situation is for some CPU has extension insn. Octeon is an example. So we can just add ".set arch=octeon". If an ASE is enabled, .module ase will also be used. gcc/ChangeLog: * config/mips/mips.c (mips_file_start): add .module for arch and ase. --- gcc/config/mips/mips.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c index 493d3de..ade5d70 100644 --- a/gcc/config/mips/mips.c +++ b/gcc/config/mips/mips.c @@ -9896,6 +9896,43 @@ mips_file_start (void) else fputs ("\t.module\tnooddspreg\n", asm_out_file); + fprintf (asm_out_file, "\t.module\tarch=%s\n", mips_arch_info->name); + /* FIXME: DSPR3 is not supported by GCC? gas does support it */ + if (TARGET_DSPR2) + fputs ("\t.module\tdspr2\n", asm_out_file); + else if (TARGET_DSP) + fputs ("\t.module\tdsp\n", asm_out_file); + if (TARGET_EVA) + fputs ("\t.module\teva\n", asm_out_file); + if (TARGET_MCU) + fputs ("\t.module\tmcu\n", asm_out_file); + if (TARGET_MDMX) + fputs ("\t.module\tmdmx\n", asm_out_file); + if (TARGET_MIPS3D) + fputs ("\t.module\tmips3d\n", asm_out_file); + if (TARGET_MT) + fputs ("\t.module\tmt\n", asm_out_file); + if (TARGET_SMARTMIPS) + fputs ("\t.module\tsmartmips\n", asm_out_file); + if (TARGET_VIRT) + fputs ("\t.module\tvirt\n", asm_out_file); + if (TARGET_MSA) + fputs ("\t.module\tmsa\n", asm_out_file); + if (TARGET_XPA) + fputs ("\t.module\txpa\n", asm_out_file); + /* FIXME: MIPS16E2 is not supported by GCC? gas does support it */ + if (TARGET_CRC) + fputs ("\t.module\tcrc\n", asm_out_file); + if (TARGET_GINV) + fputs ("\t.module\tginv\n", asm_out_file); + if (TARGET_LOONGSON_MMI) + fputs ("\t.module\tloongson-mmi\n", asm_out_file); + /* FIXME: LOONGSON-CAM is not supported by GCC? gas does support it */ + if (TARGET_LOONGSON_EXT2) + fputs ("\t.module\tloongson-ext2\n", asm_out_file); + else if (TARGET_LOONGSON_EXT) + fputs ("\t.module\tloongson-ext\n", asm_out_file); + #else #ifdef HAVE_AS_GNU_ATTRIBUTE { -- cgit v1.1 From 546ecb0054af302acf0839c7f3eb78598f8c0672 Mon Sep 17 00:00:00 2001 From: Xionghu Luo Date: Mon, 6 Sep 2021 20:22:50 -0500 Subject: rs6000: Expand fmod and remainder when built with fast-math [PR97142] fmod/fmodf and remainder/remainderf could be expanded instead of library call when fast-math build, which is much faster. fmodf: fdivs f0,f1,f2 friz f0,f0 fnmsubs f1,f2,f0,f1 remainderf: fdivs f0,f1,f2 frin f0,f0 fnmsubs f1,f2,f0,f1 SPEC2017 Ofast P8LE: 511.povray_r +1.14%, 526.blender_r +1.72% gcc/ChangeLog: 2021-09-07 Xionghu Luo PR target/97142 * config/rs6000/rs6000.md (fmod3): New define_expand. (remainder3): Likewise. gcc/testsuite/ChangeLog: 2021-09-07 Xionghu Luo PR target/97142 * gcc.target/powerpc/pr97142.c: New test. 
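For illustration (hypothetical snippet, not the committed pr97142.c test), a wrapper
such as:

  /* Compile with -Ofast (or -funsafe-math-optimizations) for a CPU
     with friz/frin, e.g. POWER8.  */
  float
  wrap_fmodf (float x, float y)
  {
    return __builtin_fmodf (x, y);
  }

is now expected to expand inline to the fdivs/friz/fnmsubs sequence shown above instead
of calling fmodf in libm.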
--- gcc/config/rs6000/rs6000.md | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index a84438f..6bec2bd 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -4986,6 +4986,42 @@ [(set_attr "type" "fp") (set_attr "isa" "*,")]) +(define_expand "fmod3" + [(use (match_operand:SFDF 0 "gpc_reg_operand")) + (use (match_operand:SFDF 1 "gpc_reg_operand")) + (use (match_operand:SFDF 2 "gpc_reg_operand"))] + "TARGET_HARD_FLOAT + && TARGET_FPRND + && flag_unsafe_math_optimizations" +{ + rtx div = gen_reg_rtx (mode); + emit_insn (gen_div3 (div, operands[1], operands[2])); + + rtx friz = gen_reg_rtx (mode); + emit_insn (gen_btrunc2 (friz, div)); + + emit_insn (gen_nfms4 (operands[0], operands[2], friz, operands[1])); + DONE; + }) + +(define_expand "remainder3" + [(use (match_operand:SFDF 0 "gpc_reg_operand")) + (use (match_operand:SFDF 1 "gpc_reg_operand")) + (use (match_operand:SFDF 2 "gpc_reg_operand"))] + "TARGET_HARD_FLOAT + && TARGET_FPRND + && flag_unsafe_math_optimizations" +{ + rtx div = gen_reg_rtx (mode); + emit_insn (gen_div3 (div, operands[1], operands[2])); + + rtx frin = gen_reg_rtx (mode); + emit_insn (gen_round2 (frin, div)); + + emit_insn (gen_nfms4 (operands[0], operands[2], frin, operands[1])); + DONE; + }) + (define_insn "*rsqrt2" [(set (match_operand:SFDF 0 "gpc_reg_operand" "=,wa") (unspec:SFDF [(match_operand:SFDF 1 "gpc_reg_operand" ",wa")] -- cgit v1.1 From ad9fcb961c0705f56907a728c3748c011a0a8048 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Sat, 4 Sep 2021 07:48:43 -0700 Subject: x86: Enable FMA in unsigned SI to SF expanders Enable FMA in scalar/vector unsigned SI to SF expanders. Don't check TARGET_AVX512F which has vcvtusi2ss and vcvtudq2ps instructions. gcc/ PR target/85819 * config/i386/i386-expand.c (ix86_expand_convert_uns_sisf_sse): Enable FMA. (ix86_expand_vector_convert_uns_vsivsf): Likewise. gcc/testsuite/ PR target/85819 * gcc.target/i386/pr85819-1a.c: New test. * gcc.target/i386/pr85819-1b.c: Likewise. * gcc.target/i386/pr85819-2a.c: Likewise. * gcc.target/i386/pr85819-2b.c: Likewise. * gcc.target/i386/pr85819-2c.c: Likewise. * gcc.target/i386/pr85819-3.c: Likewise. --- gcc/config/i386/i386-expand.c | 41 +++++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 12 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 273a0ba..3f90f67 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -1851,12 +1851,21 @@ ix86_expand_convert_uns_sisf_sse (rtx target, rtx input) fp_lo = gen_reg_rtx (SFmode); emit_insn (gen_floatsisf2 (fp_hi, int_hi)); emit_insn (gen_floatsisf2 (fp_lo, int_lo)); - fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi, - 0, OPTAB_DIRECT); - fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target, - 0, OPTAB_DIRECT); - if (!rtx_equal_p (target, fp_hi)) - emit_move_insn (target, fp_hi); + if (TARGET_FMA) + { + x = validize_mem (force_const_mem (SFmode, x)); + fp_hi = gen_rtx_FMA (SFmode, fp_hi, x, fp_lo); + emit_move_insn (target, fp_hi); + } + else + { + fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi, + 0, OPTAB_DIRECT); + fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target, + 0, OPTAB_DIRECT); + if (!rtx_equal_p (target, fp_hi)) + emit_move_insn (target, fp_hi); + } } /* floatunsv{4,8}siv{4,8}sf2 expander. 
Expand code to convert @@ -1888,12 +1897,20 @@ ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val) real_ldexp (&TWO16r, &dconst1, 16); tmp[5] = const_double_from_real_value (TWO16r, SFmode); tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5])); - tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1, - OPTAB_DIRECT); - tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1, - OPTAB_DIRECT); - if (tmp[7] != target) - emit_move_insn (target, tmp[7]); + if (TARGET_FMA) + { + tmp[6] = gen_rtx_FMA (fltmode, tmp[4], tmp[5], tmp[3]); + emit_move_insn (target, tmp[6]); + } + else + { + tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], + NULL_RTX, 1, OPTAB_DIRECT); + tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], + target, 1, OPTAB_DIRECT); + if (tmp[7] != target) + emit_move_insn (target, tmp[7]); + } } /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc* -- cgit v1.1 From e29a9607faae320a92f19b38f0424037ac3bdbfe Mon Sep 17 00:00:00 2001 From: Indu Bhagat Date: Tue, 7 Sep 2021 11:17:55 -0700 Subject: bpf: Add new -mco-re option for BPF CO-RE -mco-re in the BPF backend enables code generation for the CO-RE usecase. LTO is disabled for CO-RE compilations. gcc/ChangeLog: * config/bpf/bpf.c (bpf_option_override): For BPF backend, disable LTO support when compiling for CO-RE. * config/bpf/bpf.opt: Add new command line option -mco-re. gcc/testsuite/ChangeLog: * gcc.target/bpf/core-lto-1.c: New test. --- gcc/config/bpf/bpf.c | 25 +++++++++++++++++++++++++ gcc/config/bpf/bpf.opt | 4 ++++ 2 files changed, 29 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/bpf/bpf.c b/gcc/config/bpf/bpf.c index e635f9e..7228978 100644 --- a/gcc/config/bpf/bpf.c +++ b/gcc/config/bpf/bpf.c @@ -54,6 +54,7 @@ along with GCC; see the file COPYING3. If not see #include "builtins.h" #include "predict.h" #include "langhooks.h" +#include "flags.h" /* Per-function machine data. */ struct GTY(()) machine_function @@ -158,6 +159,30 @@ bpf_option_override (void) { /* Set the initializer for the per-function status structure. */ init_machine_status = bpf_init_machine_status; + + /* BPF CO-RE support requires BTF debug info generation. */ + if (TARGET_BPF_CORE && !btf_debuginfo_p ()) + error ("BPF CO-RE requires BTF debugging information, use %<-gbtf%>"); + + /* To support the portability needs of BPF CO-RE approach, BTF debug + information includes the BPF CO-RE relocations. */ + if (TARGET_BPF_CORE) + write_symbols |= BTF_WITH_CORE_DEBUG; + + /* Unlike much of the other BTF debug information, the information necessary + for CO-RE relocations is added to the CTF container by the BPF backend. + Enabling LTO adds some complications in the generation of the BPF CO-RE + relocations because if LTO is in effect, the relocations need to be + generated late in the LTO link phase. This poses a new challenge for the + compiler to now provide means to combine the early BTF and late BTF CO-RE + debug info, similar to DWARF debug info. BTF/CO-RE debug info is not + amenable to such a split generation and a later merging. + + In any case, in absence of linker support for BTF sections at this time, + it is acceptable to simply disallow LTO for BPF CO-RE compilations. 
*/ + + if (flag_lto && TARGET_BPF_CORE) + sorry ("BPF CO-RE does not support LTO"); } #undef TARGET_OPTION_OVERRIDE diff --git a/gcc/config/bpf/bpf.opt b/gcc/config/bpf/bpf.opt index 916b53c..4493067 100644 --- a/gcc/config/bpf/bpf.opt +++ b/gcc/config/bpf/bpf.opt @@ -127,3 +127,7 @@ Generate little-endian eBPF. mframe-limit= Target Joined RejectNegative UInteger IntegerRange(0, 32767) Var(bpf_frame_limit) Init(512) Set a hard limit for the size of each stack frame, in bytes. + +mco-re +Target Mask(BPF_CORE) +Generate all necessary information for BPF Compile Once - Run Everywhere. -- cgit v1.1 From 8bdabb37549f12ce727800a1c8aa182c0b1dd42a Mon Sep 17 00:00:00 2001 From: David Faust Date: Tue, 3 Aug 2021 10:27:44 -0700 Subject: bpf: BPF CO-RE support This commit introduces support for BPF Compile Once - Run Everywhere (CO-RE) in GCC. gcc/ChangeLog: * config/bpf/bpf.c: Adjust includes. (bpf_handle_preserve_access_index_attribute): New function. (bpf_attribute_table): Use it here. (bpf_builtins): Add BPF_BUILTIN_PRESERVE_ACCESS_INDEX. (bpf_option_override): Handle "-mco-re" option. (bpf_asm_init_sections): New. (TARGET_ASM_INIT_SECTIONS): Redefine. (bpf_file_end): New. (TARGET_ASM_FILE_END): Redefine. (bpf_init_builtins): Add "__builtin_preserve_access_index". (bpf_core_compute, bpf_core_get_index): New. (is_attr_preserve_access): New. (bpf_expand_builtin): Handle new builtins. (bpf_core_newdecl, bpf_core_is_maybe_aggregate_access): New. (bpf_core_walk): New. (bpf_resolve_overloaded_builtin): New. (TARGET_RESOLVE_OVERLOADED_BUILTIN): Redefine. (handle_attr): New. (pass_bpf_core_attr): New RTL pass. * config/bpf/bpf-passes.def: New file. * config/bpf/bpf-protos.h (make_pass_bpf_core_attr): New. * config/bpf/coreout.c: New file. * config/bpf/coreout.h: Likewise. * config/bpf/t-bpf (TM_H): Add $(srcdir)/config/bpf/coreout.h. (coreout.o): New rule. (PASSES_EXTRA): Add $(srcdir)/config/bpf/bpf-passes.def. * config.gcc (bpf): Add coreout.h to extra_headers. Add coreout.o to extra_objs. Add $(srcdir)/config/bpf/coreout.c to target_gtfiles. --- gcc/config/bpf/bpf-passes.def | 20 ++ gcc/config/bpf/bpf-protos.h | 2 + gcc/config/bpf/bpf.c | 591 ++++++++++++++++++++++++++++++++++++++++++ gcc/config/bpf/coreout.c | 356 +++++++++++++++++++++++++ gcc/config/bpf/coreout.h | 114 ++++++++ gcc/config/bpf/t-bpf | 8 + 6 files changed, 1091 insertions(+) create mode 100644 gcc/config/bpf/bpf-passes.def create mode 100644 gcc/config/bpf/coreout.c create mode 100644 gcc/config/bpf/coreout.h (limited to 'gcc/config') diff --git a/gcc/config/bpf/bpf-passes.def b/gcc/config/bpf/bpf-passes.def new file mode 100644 index 0000000..3e96165 --- /dev/null +++ b/gcc/config/bpf/bpf-passes.def @@ -0,0 +1,20 @@ +/* Declaration of target-specific passes for eBPF. + Copyright (C) 2021 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . 
*/ + +INSERT_PASS_AFTER (pass_df_initialize_opt, 1, pass_bpf_core_attr); diff --git a/gcc/config/bpf/bpf-protos.h b/gcc/config/bpf/bpf-protos.h index aeb5126..7ce3386 100644 --- a/gcc/config/bpf/bpf-protos.h +++ b/gcc/config/bpf/bpf-protos.h @@ -30,4 +30,6 @@ extern void bpf_print_operand_address (FILE *, rtx); extern void bpf_expand_prologue (void); extern void bpf_expand_epilogue (void); +rtl_opt_pass * make_pass_bpf_core_attr (gcc::context *); + #endif /* ! GCC_BPF_PROTOS_H */ diff --git a/gcc/config/bpf/bpf.c b/gcc/config/bpf/bpf.c index 7228978..01d9c03 100644 --- a/gcc/config/bpf/bpf.c +++ b/gcc/config/bpf/bpf.c @@ -56,6 +56,24 @@ along with GCC; see the file COPYING3. If not see #include "langhooks.h" #include "flags.h" +#include "cfg.h" /* needed for struct control_flow_graph used in BB macros */ +#include "gimple.h" +#include "gimple-iterator.h" +#include "gimple-walk.h" +#include "tree-pass.h" +#include "tree-iterator.h" + +#include "context.h" +#include "pass_manager.h" + +#include "gimplify.h" +#include "gimplify-me.h" + +#include "ctfc.h" +#include "btf.h" + +#include "coreout.h" + /* Per-function machine data. */ struct GTY(()) machine_function { @@ -105,6 +123,27 @@ bpf_handle_fndecl_attribute (tree *node, tree name, return NULL_TREE; } +/* Handle preserve_access_index attribute, which can be applied to structs, + unions and classes. Actually adding the attribute to the TYPE_DECL is + taken care of for us, so just warn for types that aren't supported. */ + +static tree +bpf_handle_preserve_access_index_attribute (tree *node, tree name, + tree args, + int flags, + bool *no_add_attrs) +{ + if (TREE_CODE (*node) != RECORD_TYPE && TREE_CODE (*node) != UNION_TYPE) + { + warning (OPT_Wattributes, + "%qE attribute only applies to structure, union and class types", + name); + *no_add_attrs = true; + } + + return NULL_TREE; +} + /* Target-specific attributes. */ static const struct attribute_spec bpf_attribute_table[] = @@ -117,6 +156,11 @@ static const struct attribute_spec bpf_attribute_table[] = { "kernel_helper", 1, 1, true, false, false, false, bpf_handle_fndecl_attribute, NULL }, + /* CO-RE support: attribute to mark that all accesses to the declared + struct/union/array should be recorded. */ + { "preserve_access_index", 0, -1, false, true, false, true, + bpf_handle_preserve_access_index_attribute, NULL }, + /* The last attribute spec is set to be NULL. */ { NULL, 0, 0, false, false, false, false, NULL, NULL } }; @@ -137,11 +181,18 @@ enum bpf_builtins BPF_BUILTIN_LOAD_BYTE, BPF_BUILTIN_LOAD_HALF, BPF_BUILTIN_LOAD_WORD, + + /* Compile Once - Run Everywhere (CO-RE) support. */ + BPF_BUILTIN_PRESERVE_ACCESS_INDEX, + BPF_BUILTIN_MAX, }; static GTY (()) tree bpf_builtins[(int) BPF_BUILTIN_MAX]; + +void bpf_register_coreattr_pass (void); + /* Initialize the per-function machine status. */ static struct machine_function * @@ -183,11 +234,57 @@ bpf_option_override (void) if (flag_lto && TARGET_BPF_CORE) sorry ("BPF CO-RE does not support LTO"); + + /* -gbtf implies -mcore when using the BPF backend, unless -mno-co-re + is specified. */ + if (btf_debuginfo_p () && !(target_flags_explicit & MASK_BPF_CORE)) + { + target_flags |= MASK_BPF_CORE; + write_symbols |= BTF_WITH_CORE_DEBUG; + } } #undef TARGET_OPTION_OVERRIDE #define TARGET_OPTION_OVERRIDE bpf_option_override +/* Return FALSE iff -mcore has been specified. 
*/ + +static bool +ctfc_debuginfo_early_finish_p (void) +{ + if (TARGET_BPF_CORE) + return false; + else + return true; +} + +#undef TARGET_CTFC_DEBUGINFO_EARLY_FINISH_P +#define TARGET_CTFC_DEBUGINFO_EARLY_FINISH_P ctfc_debuginfo_early_finish_p + +/* Implement TARGET_ASM_INIT_SECTIONS. */ + +static void +bpf_asm_init_sections (void) +{ + if (TARGET_BPF_CORE) + btf_ext_init (); +} + +#undef TARGET_ASM_INIT_SECTIONS +#define TARGET_ASM_INIT_SECTIONS bpf_asm_init_sections + +/* Implement TARGET_ASM_FILE_END. */ + +static void +bpf_file_end (void) +{ + if (TARGET_BPF_CORE) + btf_ext_output (); +} + +#undef TARGET_ASM_FILE_END +#define TARGET_ASM_FILE_END bpf_file_end + /* Define target-specific CPP macros. This function in used in the definition of TARGET_CPU_CPP_BUILTINS in bpf.h */ @@ -837,11 +934,18 @@ bpf_init_builtins (void) build_function_type_list (ullt, ullt, 0)); def_builtin ("__builtin_bpf_load_word", BPF_BUILTIN_LOAD_WORD, build_function_type_list (ullt, ullt, 0)); + def_builtin ("__builtin_preserve_access_index", + BPF_BUILTIN_PRESERVE_ACCESS_INDEX, + build_function_type_list (ptr_type_node, ptr_type_node, 0)); } #undef TARGET_INIT_BUILTINS #define TARGET_INIT_BUILTINS bpf_init_builtins +static tree bpf_core_compute (tree, vec *); +static int bpf_core_get_index (const tree); +static bool is_attr_preserve_access (tree); + /* Expand a call to a BPF-specific built-in function that was set up with bpf_init_builtins. */ @@ -892,7 +996,75 @@ bpf_expand_builtin (tree exp, rtx target ATTRIBUTE_UNUSED, /* The result of the load is in R0. */ return gen_rtx_REG (ops[0].mode, BPF_R0); } + else if (code == -1) + { + /* A resolved overloaded builtin, e.g. __bpf_preserve_access_index_si */ + tree arg = CALL_EXPR_ARG (exp, 0); + + if (arg == NULL_TREE) + return NULL_RTX; + + auto_vec accessors; + tree container; + if (TREE_CODE (arg) == SSA_NAME) + { + gimple *def_stmt = SSA_NAME_DEF_STMT (arg); + + if (is_gimple_assign (def_stmt)) + arg = gimple_assign_rhs1 (def_stmt); + else + return expand_normal (arg); + } + + /* Avoid double-recording information if the argument is an access to + a struct/union marked __attribute__((preserve_access_index)). This + Will be handled by the attribute handling pass. */ + if (is_attr_preserve_access (arg)) + return expand_normal (arg); + + container = bpf_core_compute (arg, &accessors); + + /* Any valid use of the builtin must have at least one access. Otherwise, + there is nothing to record and nothing to do. This is primarily a + guard against optimizations leading to unexpected expressions in the + argument of the builtin. For example, if the builtin is used to read + a field of a structure which can be statically determined to hold a + constant value, the argument to the builtin will be optimized to that + constant. This is OK, and means the builtin call is superfluous. + e.g. + struct S foo; + foo.a = 5; + int x = __preserve_access_index (foo.a); + ... do stuff with x + 'foo.a' in the builtin argument will be optimized to '5' with -01+. + This sequence does not warrant recording a CO-RE relocation. */ + + if (accessors.length () < 1) + return expand_normal (arg); + + accessors.reverse (); + + container = TREE_TYPE (container); + + rtx_code_label *label = gen_label_rtx (); + LABEL_PRESERVE_P (label) = 1; + emit_label (label); + + /* Determine what output section this relocation will apply to. + If this function is associated with a section, use that. Otherwise, + fall back on '.text'. 
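A hedged usage sketch of the builtin form handled above (the struct and function names are invented). As the expansion code describes, the builtin acts as an identity marker: it returns its argument unchanged at run time while a CO-RE relocation for the access is recorded in .BTF.ext.

  struct pkt { unsigned int len; unsigned char data[64]; };

  unsigned int
  get_len (struct pkt *p)
  {
    /* The access to p->len is recorded for relocation; the value returned
       is simply p->len.  */
    return __builtin_preserve_access_index (p->len);
  }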
*/ + const char * section_name; + if (current_function_decl && DECL_SECTION_NAME (current_function_decl)) + section_name = DECL_SECTION_NAME (current_function_decl); + else + section_name = ".text"; + + /* Add the CO-RE relocation information to the BTF container. */ + bpf_core_reloc_add (container, section_name, &accessors, label); + + return expand_normal (arg); + } gcc_unreachable (); } @@ -946,6 +1118,425 @@ bpf_debug_unwind_info () #undef TARGET_ASM_ALIGNED_DI_OP #define TARGET_ASM_ALIGNED_DI_OP "\t.dword\t" + +/* BPF Compile Once - Run Everywhere (CO-RE) support routines. + + BPF CO-RE is supported in two forms: + - A target builtin, __builtin_preserve_access_index + + This builtin accepts a single argument. Any access to an aggregate data + structure (struct, union or array) within the argument will be recorded by + the CO-RE machinery, resulting in a relocation record being placed in the + .BTF.ext section of the output. + + It is implemented in bpf_resolve_overloaded_builtin () and + bpf_expand_builtin (), using the supporting routines below. + + - An attribute, __attribute__((preserve_access_index)) + + This attribute can be applied to struct and union types. Any access to a + type with this attribute will be recorded by the CO-RE machinery. + + The pass pass_bpf_core_attr, below, implements support for + this attribute. */ + +/* Traverse the subtree under NODE, which is expected to be some form of + aggregate access the CO-RE machinery cares about (like a read of a member of + a struct or union), collecting access indices for the components and storing + them in the vector referenced by ACCESSORS. + + Return the ultimate (top-level) container of the aggregate access. In general, + this will be a VAR_DECL or some kind of REF. + + Note that the accessors are computed *in reverse order* of how the BPF + CO-RE machinery defines them. The vector needs to be reversed (or simply + output in reverse order) for the .BTF.ext relocation information. */ + +static tree +bpf_core_compute (tree node, vec *accessors) +{ + + if (TREE_CODE (node) == ADDR_EXPR) + node = TREE_OPERAND (node, 0); + + else if (TREE_CODE (node) == INDIRECT_REF + || TREE_CODE (node) == POINTER_PLUS_EXPR) + { + accessors->safe_push (0); + return TREE_OPERAND (node, 0); + } + + while (1) + { + switch (TREE_CODE (node)) + { + case COMPONENT_REF: + accessors->safe_push (bpf_core_get_index (TREE_OPERAND (node, 1))); + break; + + case ARRAY_REF: + case ARRAY_RANGE_REF: + accessors->safe_push (bpf_core_get_index (node)); + break; + + case MEM_REF: + accessors->safe_push (bpf_core_get_index (node)); + if (TREE_CODE (TREE_OPERAND (node, 0)) == ADDR_EXPR) + node = TREE_OPERAND (TREE_OPERAND (node, 0), 0); + goto done; + + default: + goto done; + } + node = TREE_OPERAND (node, 0); + } + done: + return node; + +} + +/* Compute the index of the NODE in its immediate container. + NODE should be a FIELD_DECL (i.e. of struct or union), or an ARRAY_REF. */ +static int +bpf_core_get_index (const tree node) +{ + enum tree_code code = TREE_CODE (node); + + if (code == FIELD_DECL) + { + /* Lookup the index from the BTF information. Some struct/union members + may not be emitted in BTF; only the BTF container has enough + information to compute the correct index. */ + int idx = bpf_core_get_sou_member_index (ctf_get_tu_ctfc (), node); + if (idx >= 0) + return idx; + } + + else if (code == ARRAY_REF || code == ARRAY_RANGE_REF || code == MEM_REF) + { + /* For array accesses, the index is operand 1. 
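To make the index collection concrete, a sketch (with an invented struct layout) of the accessors bpf_core_compute and bpf_core_get_index would be expected to gather for a nested access, and the access string they yield once reversed (the string itself is built later, in bpf_core_reloc_add):

  struct inner { int x; int y; };
  struct outer { int tag; struct inner rows[4]; };

  int
  read_y (struct outer *o)
  {
    /* Walking o->rows[2].y from the final member back toward the base
       pointer pushes:
         1  (member 'y' of struct inner)
         2  (array index)
         1  (member 'rows' of struct outer)
         0  (the initial pointer dereference)
       so after reversing, the CO-RE access string should be "0:1:2:1".  */
    return __builtin_preserve_access_index (o->rows[2].y);
  }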
*/ + tree index = TREE_OPERAND (node, 1); + + /* If the indexing operand is a constant, extracting is trivial. */ + if (TREE_CODE (index) == INTEGER_CST && tree_fits_shwi_p (index)) + return tree_to_shwi (index); + } + + return -1; +} + +/* Synthesize a new builtin function declaration at LOC with signature TYPE. + Used by bpf_resolve_overloaded_builtin to resolve calls to + __builtin_preserve_access_index. */ + +static tree +bpf_core_newdecl (location_t loc, tree type) +{ + tree rettype = build_function_type_list (type, type, NULL); + tree newdecl = NULL_TREE; + char name[80]; + int len = snprintf (name, sizeof (name), "%s", "__builtin_pai_"); + + static unsigned long cnt = 0; + len = snprintf (name + len, sizeof (name) - len, "%lu", cnt++); + + return add_builtin_function_ext_scope (name, rettype, -1, BUILT_IN_MD, NULL, + NULL_TREE); +} + +/* Return whether EXPR could access some aggregate data structure that + BPF CO-RE support needs to know about. */ + +static int +bpf_core_is_maybe_aggregate_access (tree expr) +{ + enum tree_code code = TREE_CODE (expr); + if (code == COMPONENT_REF || code == ARRAY_REF) + return 1; + + if (code == ADDR_EXPR) + return bpf_core_is_maybe_aggregate_access (TREE_OPERAND (expr, 0)); + + return 0; +} + +/* Callback function used with walk_tree from bpf_resolve_overloaded_builtin. */ + +static tree +bpf_core_walk (tree *tp, int *walk_subtrees, void *data) +{ + location_t loc = *((location_t *) data); + + /* If this is a type, don't do anything. */ + if (TYPE_P (*tp)) + { + *walk_subtrees = 0; + return NULL_TREE; + } + + if (bpf_core_is_maybe_aggregate_access (*tp)) + { + tree newdecl = bpf_core_newdecl (loc, TREE_TYPE (*tp)); + tree newcall = build_call_expr_loc (loc, newdecl, 1, *tp); + *tp = newcall; + *walk_subtrees = 0; + } + + return NULL_TREE; +} + + +/* Implement TARGET_RESOLVE_OVERLOADED_BUILTIN (see gccint manual section + Target Macros::Misc.). + We use this for the __builtin_preserve_access_index builtin for CO-RE + support. + + FNDECL is the declaration of the builtin, and ARGLIST is the list of + arguments passed to it, and is really a vec *. + + In this case, the 'operation' implemented by the builtin is a no-op; + the builtin is just a marker. So, the result is simply the argument. */ + +static tree +bpf_resolve_overloaded_builtin (location_t loc, tree fndecl, void *arglist) +{ + if (DECL_MD_FUNCTION_CODE (fndecl) != BPF_BUILTIN_PRESERVE_ACCESS_INDEX) + return NULL_TREE; + + /* We only expect one argument, but it may be an arbitrarily-complicated + statement-expression. */ + vec *params = static_cast *> (arglist); + unsigned n_params = params ? params->length() : 0; + + if (n_params != 1) + { + error_at (loc, "expected exactly 1 argument"); + return NULL_TREE; + } + + tree param = (*params)[0]; + + /* If not generating BPF_CORE information, the builtin does nothing. */ + if (!TARGET_BPF_CORE) + return param; + + /* Do remove_c_maybe_const_expr for the arg. + TODO: WHY do we have to do this here? Why doesn't c-typeck take care + of it before or after this hook? */ + if (TREE_CODE (param) == C_MAYBE_CONST_EXPR) + param = C_MAYBE_CONST_EXPR_EXPR (param); + + /* Construct a new function declaration with the correct type, and return + a call to it. + + Calls with statement-expressions, for example: + _(({ foo->a = 1; foo->u[2].b = 2; })) + require special handling. 
+ + We rearrange this into a new block scope in which each statement + becomes a unique builtin call: + { + _ ({ foo->a = 1;}); + _ ({ foo->u[2].b = 2;}); + } + + This ensures that all the relevant information remains within the + expression trees the builtin finally gets. */ + + walk_tree (¶m, bpf_core_walk, (void *) &loc, NULL); + + return param; +} + +#undef TARGET_RESOLVE_OVERLOADED_BUILTIN +#define TARGET_RESOLVE_OVERLOADED_BUILTIN bpf_resolve_overloaded_builtin + + +/* Handling for __attribute__((preserve_access_index)) for BPF CO-RE support. + + This attribute marks a structure/union/array type as "preseve", so that + every access to that type should be recorded and replayed by the BPF loader; + this is just the same functionality as __builtin_preserve_access_index, + but in the form of an attribute for an entire aggregate type. + + Note also that nested structs behave as though they all have the attribute. + For example: + struct X { int a; }; + struct Y { struct X bar} __attribute__((preserve_access_index)); + struct Y foo; + foo.bar.a; + will record access all the way to 'a', even though struct X does not have + the preserve_access_index attribute. + + This is to follow LLVM behavior. + + This pass finds all accesses to objects of types marked with the attribute, + and wraps them in the same "low-level" builtins used by the builtin version. + All logic afterwards is therefore identical to the builtin version of + preserve_access_index. */ + +/* True iff tree T accesses any member of a struct/union/class which is marked + with the PRESERVE_ACCESS_INDEX attribute. */ + +static bool +is_attr_preserve_access (tree t) +{ + if (t == NULL_TREE) + return false; + + poly_int64 bitsize, bitpos; + tree var_off; + machine_mode mode; + int sign, reverse, vol; + + tree base = get_inner_reference (t, &bitsize, &bitpos, &var_off, &mode, + &sign, &reverse, &vol); + + if (TREE_CODE (base) == MEM_REF) + { + return lookup_attribute ("preserve_access_index", + TYPE_ATTRIBUTES (TREE_TYPE (base))); + } + + if (TREE_CODE (t) == COMPONENT_REF) + { + /* preserve_access_index propegates into nested structures, + so check whether this is a component of another component + which in turn is part of such a struct. */ + + const tree op = TREE_OPERAND (t, 0); + + if (TREE_CODE (op) == COMPONENT_REF) + return is_attr_preserve_access (op); + + const tree container = DECL_CONTEXT (TREE_OPERAND (t, 1)); + + return lookup_attribute ("preserve_access_index", + TYPE_ATTRIBUTES (container)); + } + + else if (TREE_CODE (t) == ADDR_EXPR) + return is_attr_preserve_access (TREE_OPERAND (t, 0)); + + return false; +} + +/* The body of pass_bpf_core_attr. Scan RTL for accesses to structs/unions + marked with __attribute__((preserve_access_index)) and generate a CO-RE + relocation for any such access. 
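As a counterpart to the builtin form, a small hedged sketch of the attribute-driven path this pass implements (type and field names invented): with the attribute on the type, plain member accesses are picked up from the MEM_EXPRs during the RTL scan, with no builtin or wrapper macro needed, and nested aggregates are covered by the propagation rule described above.

  struct task_ctx {
    unsigned int cpu;
    struct { unsigned long start_ns; } stats;
  } __attribute__((preserve_access_index));

  unsigned long
  start_time (struct task_ctx *t)
  {
    /* Both loads are expected to be recorded as CO-RE relocations.  */
    return t->stats.start_ns + t->cpu;
  }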
*/ + +static void +handle_attr_preserve (function *fn) +{ + basic_block bb; + rtx_insn *insn; + rtx_code_label *label; + FOR_EACH_BB_FN (bb, fn) + { + FOR_BB_INSNS (bb, insn) + { + if (!NONJUMP_INSN_P (insn)) + continue; + rtx pat = PATTERN (insn); + if (GET_CODE (pat) != SET) + continue; + + start_sequence(); + + for (int i = 0; i < 2; i++) + { + rtx mem = XEXP (pat, i); + if (MEM_P (mem)) + { + tree expr = MEM_EXPR (mem); + if (!expr) + continue; + + if (TREE_CODE (expr) == MEM_REF + && TREE_CODE (TREE_OPERAND (expr, 0)) == SSA_NAME) + { + gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (expr, 0)); + if (is_gimple_assign (def_stmt)) + expr = gimple_assign_rhs1 (def_stmt); + } + + if (is_attr_preserve_access (expr)) + { + auto_vec accessors; + tree container = bpf_core_compute (expr, &accessors); + if (accessors.length () < 1) + continue; + accessors.reverse (); + + container = TREE_TYPE (container); + const char * section_name; + if (DECL_SECTION_NAME (fn->decl)) + section_name = DECL_SECTION_NAME (fn->decl); + else + section_name = ".text"; + + label = gen_label_rtx (); + LABEL_PRESERVE_P (label) = 1; + emit_label (label); + + /* Add the CO-RE relocation information to the BTF container. */ + bpf_core_reloc_add (container, section_name, &accessors, label); + } + } + } + rtx_insn *seq = get_insns (); + end_sequence (); + emit_insn_before (seq, insn); + } + } +} + + +/* This pass finds accesses to structures marked with the BPF target attribute + __attribute__((preserve_access_index)). For every such access, a CO-RE + relocation record is generated, to be output in the .BTF.ext section. */ + +namespace { + +const pass_data pass_data_bpf_core_attr = +{ + RTL_PASS, /* type */ + "bpf_core_attr", /* name */ + OPTGROUP_NONE, /* optinfo_flags */ + TV_NONE, /* tv_id */ + 0, /* properties_required */ + 0, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + 0, /* todo_flags_finish */ +}; + +class pass_bpf_core_attr : public rtl_opt_pass +{ +public: + pass_bpf_core_attr (gcc::context *ctxt) + : rtl_opt_pass (pass_data_bpf_core_attr, ctxt) + {} + + virtual bool gate (function *) { return TARGET_BPF_CORE; } + virtual unsigned int execute (function *); +}; + +unsigned int +pass_bpf_core_attr::execute (function *fn) +{ + handle_attr_preserve (fn); + return 0; +} + +} /* Anonymous namespace. */ + +rtl_opt_pass * +make_pass_bpf_core_attr (gcc::context *ctxt) +{ + return new pass_bpf_core_attr (ctxt); +} + /* Finally, build the GCC target. */ struct gcc_target targetm = TARGET_INITIALIZER; diff --git a/gcc/config/bpf/coreout.c b/gcc/config/bpf/coreout.c new file mode 100644 index 0000000..d5486b4 --- /dev/null +++ b/gcc/config/bpf/coreout.c @@ -0,0 +1,356 @@ +/* BPF Compile Once - Run Everywhere (CO-RE) support. + Copyright (C) 2021 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . 
*/ + +#define IN_TARGET_CODE 1 + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "target.h" +#include "memmodel.h" +#include "tm_p.h" +#include "output.h" +#include "dwarf2asm.h" +#include "ctfc.h" +#include "btf.h" +#include "rtl.h" + +#include "coreout.h" + +/* This file contains data structures and routines for construction and output + of BPF Compile Once - Run Everywhere (BPF CO-RE) information. + + eBPF programs written in C usually include Linux kernel headers, so that + they may interact with kernel data structures in a useful way. This + intrudces two major portability issues: + + 1. Kernel data structures regularly change, with fields added, moved or + deleted between versions. An eBPF program cannot in general be expected + to run on any systems which does not share an identical kernel version to + the system on which it was compiled. + + 2. Included kernel headers (and used data structures) may be internal, not + exposed in an userspace API, and therefore target-specific. An eBPF + program compiled on an x86_64 machine will include x86_64 kernel headers. + The resulting program may not run well (or at all) in machines of + another architecture. + + BPF CO-RE is designed to solve the first issue by leveraging the BPF loader + to adjust references to kernel data structures made by the program as-needed + according to versions of structures actually present on the host kernel. + + To achieve this, additional information is placed in a ".BTF.ext" section. + This information tells the loader which references will require adjusting, + and how to perform each necessary adjustment. + + For any access to a data structure which may require load-time adjustment, + the following information is recorded (making up a CO-RE relocation record): + - The BTF type ID of the outermost structure which is accessed. + - An access string encoding the accessed member via a series of member and + array indexes. These indexes are used to look up detailed BTF information + about the member. + - The offset of the appropriate instruction to patch in the BPF program. + - An integer specifying what kind of relocation to perform. + + A CO-RE-capable BPF loader reads this information together with the BTF + information of the program, compares it against BTF information of the host + kernel, and determines the appropriate way to patch the specified + instruction. + + Once all CO-RE relocations are resolved, the program is loaded and verified + as usual. The process can be summarized with the following diagram: + + +------------+ + | C compiler | + +-----+------+ + | BPF + BTF + CO-RE relocations + v + +------------+ + +--->| BPF loader | + | +-----+------+ + | | BPF (adapted) + BTF | v + | +------------+ + +----+ Kernel | + +------------+ + + Note that a single ELF object may contain multiple eBPF programs. As a + result, a single .BTF.ext section can contain CO-RE relocations for multiple + programs in distinct sections. */ + +/* Internal representation of a BPF CO-RE relocation record. */ + +typedef struct GTY (()) bpf_core_reloc { + unsigned int bpfcr_type; /* BTF type ID of container. */ + unsigned int bpfcr_astr_off; /* Offset of access string in .BTF + string table. */ + rtx_code_label * bpfcr_insn_label; /* RTX label attached to instruction + to patch. */ + enum btf_core_reloc_kind bpfcr_kind; /* Kind of relocation to perform. 
*/ +} bpf_core_reloc_t; + +typedef bpf_core_reloc_t * bpf_core_reloc_ref; + +/* Internal representation of a CO-RE relocation (sub)section of the + .BTF.ext information. One such section is generated for each ELF section + in the output object having relocations that a BPF loader must resolve. */ + +typedef struct GTY (()) bpf_core_section { + /* Name of ELF section to which these CO-RE relocations apply. */ + const char * name; + + /* Offset of section name in .BTF string table. */ + uint32_t name_offset; + + /* Relocations in the section. */ + vec * GTY (()) relocs; +} bpf_core_section_t; + +typedef bpf_core_section_t * bpf_core_section_ref; + +/* BTF.ext debug info section. */ + +static GTY (()) section * btf_ext_info_section; + +static int btf_ext_label_num; + +#ifndef BTF_EXT_INFO_SECTION_NAME +#define BTF_EXT_INFO_SECTION_NAME ".BTF.ext" +#endif + +#define BTF_EXT_INFO_SECTION_FLAGS (SECTION_DEBUG) + +#define MAX_BTF_EXT_LABEL_BYTES 40 + +static char btf_ext_info_section_label[MAX_BTF_EXT_LABEL_BYTES]; + +#ifndef BTF_EXT_INFO_SECTION_LABEL +#define BTF_EXT_INFO_SECTION_LABEL "Lbtfext" +#endif + +static GTY (()) vec *bpf_core_sections; + + +/* Create a new BPF CO-RE relocation record, and add it to the appropriate + CO-RE section. */ + +void +bpf_core_reloc_add (const tree type, const char * section_name, + vec *accessors, rtx_code_label *label) +{ + char buf[40]; + unsigned int i, n = 0; + + /* A valid CO-RE access must have at least one accessor. */ + if (accessors->length () < 1) + return; + + for (i = 0; i < accessors->length () - 1; i++) + n += snprintf (buf + n, sizeof (buf) - n, "%u:", (*accessors)[i]); + snprintf (buf + n, sizeof (buf) - n, "%u", (*accessors)[i]); + + bpf_core_reloc_ref bpfcr = ggc_cleared_alloc (); + ctf_container_ref ctfc = ctf_get_tu_ctfc (); + + /* Buffer the access string in the auxiliary strtab. Since the two string + tables are concatenated, add the length of the first to the offset. */ + size_t strtab_len = ctfc_get_strtab_len (ctfc, CTF_STRTAB); + ctf_add_string (ctfc, buf, &(bpfcr->bpfcr_astr_off), CTF_AUX_STRTAB); + bpfcr->bpfcr_astr_off += strtab_len; + + bpfcr->bpfcr_type = get_btf_id (ctf_lookup_tree_type (ctfc, type)); + bpfcr->bpfcr_insn_label = label; + bpfcr->bpfcr_kind = BPF_RELO_FIELD_BYTE_OFFSET; + + /* Add the CO-RE reloc to the appropriate section. */ + bpf_core_section_ref sec; + FOR_EACH_VEC_ELT (*bpf_core_sections, i, sec) + if (strcmp (sec->name, section_name) == 0) + { + vec_safe_push (sec->relocs, bpfcr); + return; + } + + /* If the CO-RE section does not yet exist, create it. */ + sec = ggc_cleared_alloc (); + + ctf_add_string (ctfc, section_name, &sec->name_offset, CTF_AUX_STRTAB); + sec->name_offset += strtab_len; + if (strcmp (section_name, "")) + ctfc->ctfc_aux_strlen += strlen (section_name) + 1; + + sec->name = section_name; + vec_alloc (sec->relocs, 1); + vec_safe_push (sec->relocs, bpfcr); + + vec_safe_push (bpf_core_sections, sec); +} + +/* Return the 0-based index of the field NODE in its containing struct or union + type. */ + +int +bpf_core_get_sou_member_index (ctf_container_ref ctfc, const tree node) +{ + if (TREE_CODE (node) == FIELD_DECL) + { + const tree container = DECL_CONTEXT (node); + const char * name = IDENTIFIER_POINTER (DECL_NAME (node)); + + /* Lookup the CTF type info for the containing type. 
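A hedged illustration of the per-section grouping done by bpf_core_reloc_add (section and type names are invented): relocations are keyed by the ELF section containing the function, so a program placed in its own section gets its own CO-RE record list in .BTF.ext, while functions without an explicit section fall under ".text".

  struct cfg { int verbose; } __attribute__((preserve_access_index));

  /* Relocations for this access are expected to be grouped under a CO-RE
     sub-section named "tc"; without the section attribute they would land
     in the ".text" group.  */
  __attribute__((section("tc")))
  int handler (struct cfg *c)
  {
    return c->verbose;
  }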
*/ + dw_die_ref die = lookup_type_die (container); + if (die == NULL) + return -1; + + ctf_dtdef_ref dtd = ctf_dtd_lookup (ctfc, die); + if (dtd == NULL) + return -1; + + unsigned int kind = CTF_V2_INFO_KIND (dtd->dtd_data.ctti_info); + if (kind != CTF_K_STRUCT && kind != CTF_K_UNION) + return -1; + + int i = 0; + ctf_dmdef_t * dmd; + for (dmd = dtd->dtd_u.dtu_members; + dmd != NULL; dmd = (ctf_dmdef_t *) ctf_dmd_list_next (dmd)) + { + if (get_btf_id (dmd->dmd_type) > BTF_MAX_TYPE) + continue; + if (strcmp (dmd->dmd_name, name) == 0) + return i; + i++; + } + } + return -1; +} + +/* Compute and output the header of a .BTF.ext debug info section. */ + +static void +output_btfext_header (void) +{ + switch_to_section (btf_ext_info_section); + ASM_OUTPUT_LABEL (asm_out_file, btf_ext_info_section_label); + + dw2_asm_output_data (2, BTF_MAGIC, "btf_magic"); + dw2_asm_output_data (1, BTF_VERSION, "btfext_version"); + dw2_asm_output_data (1, 0, "btfext_flags"); + dw2_asm_output_data (4, sizeof (struct btf_ext_header), "btfext_hdr_len"); + + uint32_t func_info_off = 0, func_info_len = 0; + uint32_t line_info_off = 0, line_info_len = 0; + uint32_t core_relo_off = 0, core_relo_len = 0; + + /* Header core_relo_len is the sum total length in bytes of all CO-RE + relocation sections. */ + size_t i; + bpf_core_section_ref sec; + core_relo_len += vec_safe_length (bpf_core_sections) + * sizeof (struct btf_ext_section_header); + + FOR_EACH_VEC_ELT (*bpf_core_sections, i, sec) + core_relo_len += + vec_safe_length (sec->relocs) * sizeof (struct btf_ext_reloc); + + dw2_asm_output_data (4, func_info_off, "func_info_offset"); + dw2_asm_output_data (4, func_info_len, "func_info_len"); + + dw2_asm_output_data (4, line_info_off, "line_info_offset"); + dw2_asm_output_data (4, line_info_len, "line_info_len"); + + dw2_asm_output_data (4, core_relo_off, "core_relo_offset"); + dw2_asm_output_data (4, core_relo_len, "core_relo_len"); +} + +/* Output a single CO-RE relocation record. */ + +static void +output_asm_btfext_core_reloc (bpf_core_reloc_ref bpfcr) +{ + dw2_assemble_integer (4, gen_rtx_LABEL_REF (Pmode, bpfcr->bpfcr_insn_label)); + fprintf (asm_out_file, "\t%s bpfcr_insn\n", ASM_COMMENT_START); + + dw2_asm_output_data (4, bpfcr->bpfcr_type, "bpfcr_type"); + dw2_asm_output_data (4, bpfcr->bpfcr_astr_off, "bpfcr_astr_off"); + dw2_asm_output_data (4, bpfcr->bpfcr_kind, "bpfcr_kind"); +} + +/* Output all CO-RE relocation records for a section. */ + +static void +output_btfext_core_relocs (bpf_core_section_ref sec) +{ + size_t i; + bpf_core_reloc_ref bpfcr; + FOR_EACH_VEC_ELT (*(sec->relocs), i, bpfcr) + output_asm_btfext_core_reloc (bpfcr); +} + +/* Output all CO-RE relocation sections. */ + +static void +output_btfext_core_sections (void) +{ + size_t i; + bpf_core_section_ref sec; + FOR_EACH_VEC_ELT (*bpf_core_sections, i, sec) + { + /* BTF Ext section info. */ + dw2_asm_output_data (4, sizeof (struct btf_ext_reloc), + "btfext_secinfo_rec_size"); + + /* Section name offset, refers to the offset of a string with the name of + the section to which these CORE relocations refer, e.g. '.text'. + The string is buffered in the BTF strings table. */ + dw2_asm_output_data (4, sec->name_offset, "btfext_secinfo_sec_name_off"); + dw2_asm_output_data (4, vec_safe_length (sec->relocs), + "btfext_secinfo_num_recs"); + + output_btfext_core_relocs (sec); + } +} + +/* Initialize sections, labels, and data structures for BTF.ext output. 
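A quick arithmetic sketch of the core_relo_len bookkeeping above, using local mirrors of the fixed-size records declared in coreout.h below (the counts are illustrative, not taken from a real build):

  #include <stdint.h>

  /* Local mirrors of the .BTF.ext record layouts, for size arithmetic.  */
  struct btf_ext_section_header { uint32_t kind, sec_name_off, num_records; };
  struct btf_ext_reloc { uint32_t insn_off, type_id, access_str_off, kind; };

  /* Two CO-RE sections holding 3 and 5 relocation records:
     2 * 12 + (3 + 5) * 16 = 152 bytes, the value the loop above would
     accumulate into core_relo_len.  */
  enum { EXAMPLE_CORE_RELO_LEN
         = 2 * sizeof (struct btf_ext_section_header)
           + (3 + 5) * sizeof (struct btf_ext_reloc) };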
*/ + +void +btf_ext_init (void) +{ + btf_ext_info_section = get_section (BTF_EXT_INFO_SECTION_NAME, + BTF_EXT_INFO_SECTION_FLAGS, NULL); + + ASM_GENERATE_INTERNAL_LABEL (btf_ext_info_section_label, + BTF_EXT_INFO_SECTION_LABEL, + btf_ext_label_num++); + + vec_alloc (bpf_core_sections, 1); +} + +/* Output the entire .BTF.ext section. */ + +void +btf_ext_output (void) +{ + output_btfext_header (); + output_btfext_core_sections (); + + bpf_core_sections = NULL; +} + +#include "gt-coreout.h" diff --git a/gcc/config/bpf/coreout.h b/gcc/config/bpf/coreout.h new file mode 100644 index 0000000..82c203d --- /dev/null +++ b/gcc/config/bpf/coreout.h @@ -0,0 +1,114 @@ +/* coreout.h - Declarations and definitions related to + BPF Compile Once - Run Everywhere (CO-RE) support. + Copyright (C) 2021 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + + +#ifndef __COREOUT_H +#define __COREOUT_H + +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + +/* .BTF.ext information. */ + +struct btf_ext_section_header +{ + uint32_t kind; + uint32_t sec_name_off; + uint32_t num_records; +}; + +/* A funcinfo record, in the .BTF.ext funcinfo section. */ +struct btf_ext_funcinfo +{ + uint32_t insn_off; /* Offset of the first instruction of the function. */ + uint32_t type; /* Type ID of a BTF_KIND_FUNC type. */ +}; + +/* A lineinfo record, in the .BTF.ext lineinfo section. */ +struct btf_ext_lineinfo +{ + uint32_t insn_off; /* Offset of the instruction. */ + uint32_t file_name_off; /* Offset of file name in BTF string table. */ + uint32_t line_off; /* Offset of source line in BTF string table. */ + uint32_t line_col; /* Line number (bits 31-11) and column (11-0). */ +}; + +enum btf_core_reloc_kind +{ + BPF_RELO_FIELD_BYTE_OFFSET = 0, + BPF_RELO_FIELD_BYTE_SIZE = 1, + BPF_RELO_FIELD_EXISTS = 2, + BPF_RELO_FIELD_SIGNED = 3, + BPF_RELO_FIELD_LSHIFT_U64 = 4, + BPF_RELO_FIELD_RSHIFT_U64 = 5, + BPF_RELO_TYPE_ID_LOCAL = 6, + BPF_RELO_TYPE_ID_TARGET = 7, + BPF_RELO_TYPE_EXISTS = 8, + BPF_RELO_TYPE_SIZE = 9, + BPF_RELO_ENUMVAL_EXISTS = 10, + BPF_RELO_ENUMVAL_VALUE = 11 +}; + +struct btf_ext_reloc +{ + uint32_t insn_off; /* Offset of instruction to be patched. A + section-relative label at compile time. */ + uint32_t type_id; /* Type ID of the outermost containing entity, e.g. + the containing structure. */ + uint32_t access_str_off; /* Offset of CO-RE accessor string in .BTF strings + section. */ + uint32_t kind; /* An enum btf_core_reloc_kind. Note that it always + takes 32 bits. */ +}; + +struct btf_ext_header +{ + uint16_t magic; /* Magic number (BTF_MAGIC). */ + uint8_t version; /* Data format version (BTF_VERSION). */ + uint8_t flags; /* Flags. Currently unused. */ + uint32_t hdr_len; /* Length of this header in bytes. */ + + /* Following offsets are relative to the end of this header, in bytes. + Following lengths are in bytes. */ + uint32_t func_info_off; /* Offset of funcinfo section. 
*/ + uint32_t func_info_len; /* Length of funcinfo section. */ + uint32_t line_info_off; /* Offset of lineinfo section. */ + uint32_t line_info_len; /* Length of lineinfo section. */ + + uint32_t core_relo_off; /* Offset of CO-RE relocation section. */ + uint32_t core_relo_len; /* Length of CO-RE relocation section. */ +}; + +extern void btf_ext_init (void); +extern void btf_ext_output (void); + +extern void bpf_core_reloc_add (const tree, const char *, vec *, + rtx_code_label *); +extern int bpf_core_get_sou_member_index (ctf_container_ref, const tree); + +#ifdef __cplusplus +} +#endif + +#endif /* __COREOUT_H */ diff --git a/gcc/config/bpf/t-bpf b/gcc/config/bpf/t-bpf index e69de29..b37bf85 100644 --- a/gcc/config/bpf/t-bpf +++ b/gcc/config/bpf/t-bpf @@ -0,0 +1,8 @@ + +TM_H += $(srcdir)/config/bpf/coreout.h + +coreout.o: $(srcdir)/config/bpf/coreout.c + $(COMPILE) $< + $(POSTCOMPILE) + +PASSES_EXTRA += $(srcdir)/config/bpf/bpf-passes.def -- cgit v1.1 From b552c4e601c7fdc4d341e29cc1ed6081d42b00d0 Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Tue, 7 Sep 2021 15:40:00 -0700 Subject: gcc: xtensa: fix PR target/102115 2021-09-07 Takayuki 'January June' Suwa gcc/ PR target/102115 * config/xtensa/xtensa.c (xtensa_emit_move_sequence): Add 'CONST_INT_P (src)' to the condition of the block that tries to eliminate literal when loading integer contant. --- gcc/config/xtensa/xtensa.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/xtensa/xtensa.c b/gcc/config/xtensa/xtensa.c index f4f8f19..8d67551 100644 --- a/gcc/config/xtensa/xtensa.c +++ b/gcc/config/xtensa/xtensa.c @@ -1084,7 +1084,8 @@ xtensa_emit_move_sequence (rtx *operands, machine_mode mode) { /* Try to emit MOVI + SLLI sequence, that is smaller than L32R + literal. */ - if (optimize_size && mode == SImode && register_operand (dst, mode)) + if (optimize_size && mode == SImode && CONST_INT_P (src) + && register_operand (dst, mode)) { HOST_WIDE_INT srcval = INTVAL (src); int shift = ctz_hwi (srcval); -- cgit v1.1 From f19a327077ecc34a51487761378b9edb43c82997 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Mon, 2 Aug 2021 10:56:45 +0800 Subject: Support -fexcess-precision=16 which will enable FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16 when backend supports _Float16. gcc/ada/ChangeLog: * gcc-interface/misc.c (gnat_post_options): Issue an error for -fexcess-precision=16. gcc/c-family/ChangeLog: * c-common.c (excess_precision_mode_join): Update below comments. (c_ts18661_flt_eval_method): Set excess_precision_type to EXCESS_PRECISION_TYPE_FLOAT16 when -fexcess-precision=16. * c-cppbuiltin.c (cpp_atomic_builtins): Update below comments. (c_cpp_flt_eval_method_iec_559): Set excess_precision_type to EXCESS_PRECISION_TYPE_FLOAT16 when -fexcess-precision=16. gcc/ChangeLog: * common.opt: Support -fexcess-precision=16. * config/aarch64/aarch64.c (aarch64_excess_precision): Return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16 when EXCESS_PRECISION_TYPE_FLOAT16. * config/arm/arm.c (arm_excess_precision): Ditto. * config/i386/i386.c (ix86_get_excess_precision): Ditto. * config/m68k/m68k.c (m68k_excess_precision): Issue an error when EXCESS_PRECISION_TYPE_FLOAT16. * config/s390/s390.c (s390_excess_precision): Ditto. * coretypes.h (enum excess_precision_type): Add EXCESS_PRECISION_TYPE_FLOAT16. * doc/tm.texi (TARGET_C_EXCESS_PRECISION): Update documents. * doc/tm.texi.in (TARGET_C_EXCESS_PRECISION): Ditto. * doc/extend.texi (Half-Precision): Document -fexcess-precision=16. 
* flag-types.h (enum excess_precision): Add EXCESS_PRECISION_FLOAT16. * target.def (excess_precision): Update document. * tree.c (excess_precision_type): Set excess_precision_type to EXCESS_PRECISION_FLOAT16 when -fexcess-precision=16. gcc/fortran/ChangeLog: * options.c (gfc_post_options): Issue an error for -fexcess-precision=16. gcc/testsuite/ChangeLog: * gcc.target/i386/float16-6.c: New test. * gcc.target/i386/float16-7.c: New test. --- gcc/config/aarch64/aarch64.c | 1 + gcc/config/arm/arm.c | 1 + gcc/config/i386/i386.c | 5 +++++ gcc/config/m68k/m68k.c | 3 +++ gcc/config/s390/s390.c | 3 +++ 5 files changed, 13 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 26d59ba..1fbe9e0 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -25045,6 +25045,7 @@ aarch64_excess_precision (enum excess_precision_type type) ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT); case EXCESS_PRECISION_TYPE_IMPLICIT: + case EXCESS_PRECISION_TYPE_FLOAT16: return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16; default: gcc_unreachable (); diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 5c92941..f1e6282 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -25612,6 +25612,7 @@ arm_excess_precision (enum excess_precision_type type) ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT); case EXCESS_PRECISION_TYPE_IMPLICIT: + case EXCESS_PRECISION_TYPE_FLOAT16: return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16; default: gcc_unreachable (); diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index bfefbd7..210fc42 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -23599,6 +23599,11 @@ ix86_get_excess_precision (enum excess_precision_type type) return (type == EXCESS_PRECISION_TYPE_STANDARD ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT : FLT_EVAL_METHOD_UNPREDICTABLE); + case EXCESS_PRECISION_TYPE_FLOAT16: + if (TARGET_80387 + && !(TARGET_SSE_MATH && TARGET_SSE)) + error ("%<-fexcess-precision=16%> is not compatible with %<-mfpmath=387%>"); + return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16; default: gcc_unreachable (); } diff --git a/gcc/config/m68k/m68k.c b/gcc/config/m68k/m68k.c index 3f63c60..0248eb7 100644 --- a/gcc/config/m68k/m68k.c +++ b/gcc/config/m68k/m68k.c @@ -7115,6 +7115,9 @@ m68k_excess_precision (enum excess_precision_type type) return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT; return FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE; + case EXCESS_PRECISION_TYPE_FLOAT16: + error ("%<-fexcess-precision=16%> is not supported on this target"); + break; default: gcc_unreachable (); } diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c index 673a134..54dd633 100644 --- a/gcc/config/s390/s390.c +++ b/gcc/config/s390/s390.c @@ -16549,6 +16549,9 @@ s390_excess_precision (enum excess_precision_type type) ensure consistency with the implementation in glibc, report that float is evaluated to the range and precision of double. */ return FLT_EVAL_METHOD_PROMOTE_TO_DOUBLE; + case EXCESS_PRECISION_TYPE_FLOAT16: + error ("%<-fexcess-precision=16%> is not supported on this target"); + break; default: gcc_unreachable (); } -- cgit v1.1 From a68412117fa47786bd82ab79b009ec7933aef476 Mon Sep 17 00:00:00 2001 From: "Guo, Xuepeng" Date: Mon, 24 Dec 2018 19:39:26 -0800 Subject: AVX512FP16: Initial support for AVX512FP16 feature and scalar _Float16 instructions. gcc/ChangeLog: * common/config/i386/cpuinfo.h (get_available_features): Detect FEATURE_AVX512FP16. 
* common/config/i386/i386-common.c (OPTION_MASK_ISA_AVX512FP16_SET, OPTION_MASK_ISA_AVX512FP16_UNSET, OPTION_MASK_ISA2_AVX512FP16_SET, OPTION_MASK_ISA2_AVX512FP16_UNSET): New. (OPTION_MASK_ISA2_AVX512BW_UNSET, OPTION_MASK_ISA2_AVX512BF16_UNSET): Add AVX512FP16. (ix86_handle_option): Handle -mavx512fp16. * common/config/i386/i386-cpuinfo.h (enum processor_features): Add FEATURE_AVX512FP16. * common/config/i386/i386-isas.h: Add entry for AVX512FP16. * config.gcc: Add avx512fp16intrin.h. * config/i386/avx512fp16intrin.h: New intrinsic header. * config/i386/cpuid.h: Add bit_AVX512FP16. * config/i386/i386-builtin-types.def: (FLOAT16): New primitive type. * config/i386/i386-builtins.c: Support _Float16 type for i386 backend. (ix86_register_float16_builtin_type): New function. (ix86_float16_type_node): New. * config/i386/i386-c.c (ix86_target_macros_internal): Define __AVX512FP16__. * config/i386/i386-expand.c (ix86_expand_branch): Support HFmode. (ix86_prepare_fp_compare_args): Adjust TARGET_SSE_MATH && SSE_FLOAT_MODE_P to SSE_FLOAT_MODE_SSEMATH_OR_HF_P. (ix86_expand_fp_movcc): Ditto. * config/i386/i386-isa.def: Add PTA define for AVX512FP16. * config/i386/i386-options.c (isa2_opts): Add -mavx512fp16. (ix86_valid_target_attribute_inner_p): Add avx512fp16 attribute. * config/i386/i386.c (ix86_get_ssemov): Use vmovdqu16/vmovw/vmovsh for HFmode/HImode scalar or vector. (ix86_get_excess_precision): Use FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16 when TARGET_AVX512FP16 existed. (sse_store_index): Use SFmode cost for HFmode cost. (inline_memory_move_cost): Add HFmode, and perfer SSE cost over GPR cost for HFmode. (ix86_hard_regno_mode_ok): Allow HImode in sse register. (ix86_mangle_type): Add manlging for _Float16 type. (inline_secondary_memory_needed): No memory is needed for 16bit movement between gpr and sse reg under TARGET_AVX512FP16. (ix86_multiplication_cost): Adjust TARGET_SSE_MATH && SSE_FLOAT_MODE_P to SSE_FLOAT_MODE_SSEMATH_OR_HF_P. (ix86_division_cost): Ditto. (ix86_rtx_costs): Ditto. (ix86_add_stmt_cost): Ditto. (ix86_optab_supported_p): Ditto. * config/i386/i386.h (VALID_AVX512F_SCALAR_MODE): Add HFmode. (SSE_FLOAT_MODE_SSEMATH_OR_HF_P): Add HFmode. (PTA_SAPPHIRERAPIDS): Add PTA_AVX512FP16. * config/i386/i386.md (mode): Add HFmode. (MODE_SIZE): Add HFmode. (isa): Add avx512fp16. (enabled): Handle avx512fp16. (ssemodesuffix): Add sh suffix for HFmode. (comm): Add mult, div. (plusminusmultdiv): New code iterator. (insn): Add mult, div. (*movhf_internal): Adjust for avx512fp16 instruction. (*movhi_internal): Ditto. (*cmpihf): New define_insn for HFmode. (*ieee_shf3): Likewise. (extendhf2): Likewise. (trunchf2): Likewise. (floathf2): Likewise. (*hf): Likewise. (cbranchhf4): New expander. (movhfcc): Likewise. (hf3): Likewise. (mulhf3): Likewise. (divhf3): Likewise. * config/i386/i386.opt: Add mavx512fp16. * config/i386/immintrin.h: Include avx512fp16intrin.h. * doc/invoke.texi: Add mavx512fp16. * doc/extend.texi: Add avx512fp16 Usage Notes. gcc/testsuite/ChangeLog: * gcc.target/i386/avx-1.c: Add -mavx512fp16 in dg-options. * gcc.target/i386/avx-2.c: Ditto. * gcc.target/i386/avx512-check.h: Check cpuid for AVX512FP16. * gcc.target/i386/funcspec-56.inc: Add new target attribute check. * gcc.target/i386/sse-13.c: Add -mavx512fp16. * gcc.target/i386/sse-14.c: Ditto. * gcc.target/i386/sse-22.c: Ditto. * gcc.target/i386/sse-23.c: Ditto. * lib/target-supports.exp: (check_effective_target_avx512fp16): New. * g++.target/i386/float16-1.C: New test. * g++.target/i386/float16-2.C: Ditto. 
* g++.target/i386/float16-3.C: Ditto. * gcc.target/i386/avx512fp16-12a.c: Ditto. * gcc.target/i386/avx512fp16-12b.c: Ditto. * gcc.target/i386/float16-3a.c: Ditto. * gcc.target/i386/float16-3b.c: Ditto. * gcc.target/i386/float16-4a.c: Ditto. * gcc.target/i386/float16-4b.c: Ditto. * gcc.target/i386/pr54855-12.c: Ditto. * g++.dg/other/i386-2.C: Ditto. * g++.dg/other/i386-3.C: Ditto. Co-Authored-By: H.J. Lu Co-Authored-By: Liu Hongtao Co-Authored-By: Wang Hongyu Co-Authored-By: Xu Dianhong --- gcc/config/i386/avx512fp16intrin.h | 53 ++++++++++ gcc/config/i386/cpuid.h | 1 + gcc/config/i386/i386-builtin-types.def | 1 + gcc/config/i386/i386-builtins.c | 23 +++++ gcc/config/i386/i386-c.c | 2 + gcc/config/i386/i386-expand.c | 5 +- gcc/config/i386/i386-isa.def | 1 + gcc/config/i386/i386-options.c | 4 +- gcc/config/i386/i386.c | 136 +++++++++++++++++++------- gcc/config/i386/i386.h | 11 ++- gcc/config/i386/i386.md | 172 +++++++++++++++++++++++++++++---- gcc/config/i386/i386.opt | 4 + gcc/config/i386/immintrin.h | 4 + 13 files changed, 357 insertions(+), 60 deletions(-) create mode 100644 gcc/config/i386/avx512fp16intrin.h (limited to 'gcc/config') diff --git a/gcc/config/i386/avx512fp16intrin.h b/gcc/config/i386/avx512fp16intrin.h new file mode 100644 index 0000000..38d6316 --- /dev/null +++ b/gcc/config/i386/avx512fp16intrin.h @@ -0,0 +1,53 @@ +/* Copyright (C) 2019 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use directly; include instead." +#endif + +#ifndef __AVX512FP16INTRIN_H_INCLUDED +#define __AVX512FP16INTRIN_H_INCLUDED + +#ifndef __AVX512FP16__ +#pragma GCC push_options +#pragma GCC target("avx512fp16") +#define __DISABLE_AVX512FP16__ +#endif /* __AVX512FP16__ */ + +/* Internal data types for implementing the intrinsics. */ +typedef _Float16 __v8hf __attribute__ ((__vector_size__ (16))); +typedef _Float16 __v16hf __attribute__ ((__vector_size__ (32))); +typedef _Float16 __v32hf __attribute__ ((__vector_size__ (64))); + +/* The Intel API is flexible enough that we must allow aliasing with other + vector types, and their scalar components. 
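A hedged usage sketch of the types this header introduces, relying only on the typedefs shown in this patch (no FP16 arithmetic intrinsics exist yet at this point in the series). Compiling with -mavx512fp16 makes _Float16 and the f16 literal suffix available:

  #include <immintrin.h>

  /* Build and return an 8-element vector of _Float16 (__m128h).  */
  __m128h
  splat_one_h (void)
  {
    __m128h v = { 1.0f16, 1.0f16, 1.0f16, 1.0f16,
                  1.0f16, 1.0f16, 1.0f16, 1.0f16 };
    return v;
  }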
*/ +typedef _Float16 __m128h __attribute__ ((__vector_size__ (16), __may_alias__)); +typedef _Float16 __m256h __attribute__ ((__vector_size__ (32), __may_alias__)); +typedef _Float16 __m512h __attribute__ ((__vector_size__ (64), __may_alias__)); + +#ifdef __DISABLE_AVX512FP16__ +#undef __DISABLE_AVX512FP16__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512FP16__ */ + +#endif /* __AVX512FP16INTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/cpuid.h b/gcc/config/i386/cpuid.h index aebc17c..82b8050 100644 --- a/gcc/config/i386/cpuid.h +++ b/gcc/config/i386/cpuid.h @@ -126,6 +126,7 @@ #define bit_AVX5124VNNIW (1 << 2) #define bit_AVX5124FMAPS (1 << 3) #define bit_AVX512VP2INTERSECT (1 << 8) +#define bit_AVX512FP16 (1 << 23) #define bit_IBT (1 << 20) #define bit_UINTR (1 << 5) #define bit_PCONFIG (1 << 18) diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def index 3ca313c..1768b88 100644 --- a/gcc/config/i386/i386-builtin-types.def +++ b/gcc/config/i386/i386-builtin-types.def @@ -68,6 +68,7 @@ DEF_PRIMITIVE_TYPE (UINT8, unsigned_char_type_node) DEF_PRIMITIVE_TYPE (UINT16, short_unsigned_type_node) DEF_PRIMITIVE_TYPE (INT64, long_long_integer_type_node) DEF_PRIMITIVE_TYPE (UINT64, long_long_unsigned_type_node) +DEF_PRIMITIVE_TYPE (FLOAT16, ix86_float16_type_node) DEF_PRIMITIVE_TYPE (FLOAT, float_type_node) DEF_PRIMITIVE_TYPE (DOUBLE, double_type_node) DEF_PRIMITIVE_TYPE (FLOAT80, float80_type_node) diff --git a/gcc/config/i386/i386-builtins.c b/gcc/config/i386/i386-builtins.c index 204e290..1799701 100644 --- a/gcc/config/i386/i386-builtins.c +++ b/gcc/config/i386/i386-builtins.c @@ -125,6 +125,7 @@ BDESC_VERIFYS (IX86_BUILTIN_MAX, /* Table for the ix86 builtin non-function types. */ static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1]; +tree ix86_float16_type_node = NULL_TREE; /* Retrieve an element from the above table, building some of the types lazily. */ @@ -1344,6 +1345,26 @@ ix86_init_builtins_va_builtins_abi (void) } static void +ix86_register_float16_builtin_type (void) +{ + /* Provide the _Float16 type and float16_type_node if needed so that + it can be used in AVX512FP16 intrinsics and builtins. */ + if (!float16_type_node) + { + ix86_float16_type_node = make_node (REAL_TYPE); + TYPE_PRECISION (ix86_float16_type_node) = 16; + SET_TYPE_MODE (ix86_float16_type_node, HFmode); + layout_type (ix86_float16_type_node); + } + else + ix86_float16_type_node = float16_type_node; + + if (!maybe_get_identifier ("_Float16") && TARGET_SSE2) + lang_hooks.types.register_builtin_type (ix86_float16_type_node, + "_Float16"); +} + +static void ix86_init_builtin_types (void) { tree float80_type_node, const_string_type_node; @@ -1371,6 +1392,8 @@ ix86_init_builtin_types (void) it. 
*/ lang_hooks.types.register_builtin_type (float128_type_node, "__float128"); + ix86_register_float16_builtin_type (); + const_string_type_node = build_pointer_type (build_qualified_type (char_type_node, TYPE_QUAL_CONST)); diff --git a/gcc/config/i386/i386-c.c b/gcc/config/i386/i386-c.c index 5ed0de0..cc64f85 100644 --- a/gcc/config/i386/i386-c.c +++ b/gcc/config/i386/i386-c.c @@ -598,6 +598,8 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag, def_or_undef (parse_in, "__PTWRITE__"); if (isa_flag2 & OPTION_MASK_ISA2_AVX512BF16) def_or_undef (parse_in, "__AVX512BF16__"); + if (isa_flag2 & OPTION_MASK_ISA2_AVX512FP16) + def_or_undef (parse_in, "__AVX512FP16__"); if (TARGET_MMX_WITH_SSE) def_or_undef (parse_in, "__MMX_WITH_SSE__"); if (isa_flag2 & OPTION_MASK_ISA2_ENQCMD) diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 3f90f67..fb3873b 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -2351,6 +2351,7 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label) switch (mode) { + case E_HFmode: case E_SFmode: case E_DFmode: case E_XFmode: @@ -2664,7 +2665,7 @@ ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1) bool unordered_compare = ix86_unordered_fp_compare (code); rtx op0 = *pop0, op1 = *pop1; machine_mode op_mode = GET_MODE (op0); - bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode); + bool is_sse = SSE_FLOAT_MODE_SSEMATH_OR_HF_P (op_mode); /* All of the unordered compare instructions only work on registers. The same is true of the fcomi compare instructions. The XFmode @@ -4149,7 +4150,7 @@ ix86_expand_fp_movcc (rtx operands[]) rtx op0 = XEXP (operands[1], 0); rtx op1 = XEXP (operands[1], 1); - if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode)) + if (SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode)) { machine_mode cmode; diff --git a/gcc/config/i386/i386-isa.def b/gcc/config/i386/i386-isa.def index a0d46cb..83d9302 100644 --- a/gcc/config/i386/i386-isa.def +++ b/gcc/config/i386/i386-isa.def @@ -108,3 +108,4 @@ DEF_PTA(HRESET) DEF_PTA(KL) DEF_PTA(WIDEKL) DEF_PTA(AVXVNNI) +DEF_PTA(AVX512FP16) diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c index fee5a48..2cb87ce 100644 --- a/gcc/config/i386/i386-options.c +++ b/gcc/config/i386/i386-options.c @@ -223,7 +223,8 @@ static struct ix86_target_opts isa2_opts[] = { "-mhreset", OPTION_MASK_ISA2_HRESET }, { "-mkl", OPTION_MASK_ISA2_KL }, { "-mwidekl", OPTION_MASK_ISA2_WIDEKL }, - { "-mavxvnni", OPTION_MASK_ISA2_AVXVNNI } + { "-mavxvnni", OPTION_MASK_ISA2_AVXVNNI }, + { "-mavx512fp16", OPTION_MASK_ISA2_AVX512FP16 } }; static struct ix86_target_opts isa_opts[] = { @@ -1049,6 +1050,7 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[], IX86_ATTR_ISA ("amx-bf16", OPT_mamx_bf16), IX86_ATTR_ISA ("hreset", OPT_mhreset), IX86_ATTR_ISA ("avxvnni", OPT_mavxvnni), + IX86_ATTR_ISA ("avx512fp16", OPT_mavx512fp16), /* enum options */ IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_), diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 210fc42..b2a58b0 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -5508,6 +5508,14 @@ ix86_output_ssemov (rtx_insn *insn, rtx *operands) case MODE_SI: return "%vmovd\t{%1, %0|%0, %1}"; + case MODE_HI: + if (GENERAL_REG_P (operands[0])) + return "vmovw\t{%1, %k0|%k0, %1}"; + else if (GENERAL_REG_P (operands[1])) + return "vmovw\t{%k1, %0|%0, %k1}"; + else + return "vmovw\t{%1, %0|%0, %1}"; + case MODE_DF: if (TARGET_AVX && REG_P (operands[0]) && REG_P 
(operands[1])) return "vmovsd\t{%d1, %0|%0, %d1}"; @@ -5520,6 +5528,12 @@ ix86_output_ssemov (rtx_insn *insn, rtx *operands) else return "%vmovss\t{%1, %0|%0, %1}"; + case MODE_HF: + if (REG_P (operands[0]) && REG_P (operands[1])) + return "vmovsh\t{%d1, %0|%0, %d1}"; + else + return "vmovsh\t{%1, %0|%0, %1}"; + case MODE_V1DF: gcc_assert (!TARGET_AVX); return "movlpd\t{%1, %0|%0, %1}"; @@ -13999,7 +14013,7 @@ output_387_binary_op (rtx_insn *insn, rtx *operands) if (is_sse) { - p = (GET_MODE (operands[0]) == SFmode) ? "ss" : "sd"; + p = GET_MODE (operands[0]) == SFmode ? "ss" : "sd"; strcat (buf, p); if (TARGET_AVX) @@ -19311,10 +19325,19 @@ inline_secondary_memory_needed (machine_mode mode, reg_class_t class1, if (!TARGET_SSE2) return true; + if (!(INTEGER_CLASS_P (class1) || INTEGER_CLASS_P (class2))) + return true; + + int msize = GET_MODE_SIZE (mode); + /* Between SSE and general, we have moves no larger than word size. */ - if (!(INTEGER_CLASS_P (class1) || INTEGER_CLASS_P (class2)) - || GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode) - || GET_MODE_SIZE (mode) > UNITS_PER_WORD) + if (msize > UNITS_PER_WORD) + return true; + + /* In addition to SImode moves, AVX512FP16 also enables HImode moves. */ + int minsize = GET_MODE_SIZE (TARGET_AVX512FP16 ? HImode : SImode); + + if (msize < minsize) return true; /* If the target says that inter-unit moves are more expensive @@ -19408,21 +19431,27 @@ ix86_can_change_mode_class (machine_mode from, machine_mode to, static inline int sse_store_index (machine_mode mode) { - switch (GET_MODE_SIZE (mode)) - { - case 4: - return 0; - case 8: - return 1; - case 16: - return 2; - case 32: - return 3; - case 64: - return 4; - default: - return -1; - } + /* NB: Use SFmode cost for HFmode instead of adding HFmode load/store + costs to processor_costs, which requires changes to all entries in + processor cost table. */ + if (mode == E_HFmode) + mode = E_SFmode; + + switch (GET_MODE_SIZE (mode)) + { + case 4: + return 0; + case 8: + return 1; + case 16: + return 2; + case 32: + return 3; + case 64: + return 4; + default: + return -1; + } } /* Return the cost of moving data of mode M between a @@ -19444,11 +19473,13 @@ static inline int inline_memory_move_cost (machine_mode mode, enum reg_class regclass, int in) { int cost; + if (FLOAT_CLASS_P (regclass)) { int index; switch (mode) { + case E_HFmode: case E_SFmode: index = 0; break; @@ -19549,11 +19580,32 @@ inline_memory_move_cost (machine_mode mode, enum reg_class regclass, int in) } break; case 2: - if (in == 2) - return MAX (ix86_cost->hard_register.int_load[1], - ix86_cost->hard_register.int_store[1]); - return in ? ix86_cost->hard_register.int_load[1] - : ix86_cost->hard_register.int_store[1]; + { + int cost; + if (in == 2) + cost = MAX (ix86_cost->hard_register.int_load[1], + ix86_cost->hard_register.int_store[1]); + else + cost = in ? ix86_cost->hard_register.int_load[1] + : ix86_cost->hard_register.int_store[1]; + + if (mode == E_HFmode) + { + /* Prefer SSE over GPR for HFmode. */ + int sse_cost; + int index = sse_store_index (mode); + if (in == 2) + sse_cost = MAX (ix86_cost->hard_register.sse_load[index], + ix86_cost->hard_register.sse_store[index]); + else + sse_cost = (in + ? 
ix86_cost->hard_register.sse_load [index] + : ix86_cost->hard_register.sse_store [index]); + if (sse_cost >= cost) + cost = sse_cost + 1; + } + return cost; + } default: if (in == 2) cost = MAX (ix86_cost->hard_register.int_load[2], @@ -19727,6 +19779,8 @@ ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode) - XI mode - any of 512-bit wide vector mode - any scalar mode. */ + /* For AVX512FP16, vmovw supports movement of HImode + between gpr and sse registser. */ if (TARGET_AVX512F && (mode == XImode || VALID_AVX512F_REG_MODE (mode) @@ -20048,7 +20102,7 @@ ix86_multiplication_cost (const struct processor_costs *cost, if (VECTOR_MODE_P (mode)) inner_mode = GET_MODE_INNER (mode); - if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + if (SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode)) return inner_mode == DFmode ? cost->mulsd : cost->mulss; else if (X87_FLOAT_MODE_P (mode)) return cost->fmul; @@ -20100,7 +20154,7 @@ ix86_division_cost (const struct processor_costs *cost, if (VECTOR_MODE_P (mode)) inner_mode = GET_MODE_INNER (mode); - if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + if (SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode)) return inner_mode == DFmode ? cost->divsd : cost->divss; else if (X87_FLOAT_MODE_P (mode)) return cost->fdiv; @@ -20518,7 +20572,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, return true; } - if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + if (SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode)) { *total = cost->addss; return false; @@ -20557,7 +20611,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, /* FALLTHRU */ case NEG: - if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + if (SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode)) { *total = cost->sse_op; return false; @@ -20639,14 +20693,14 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, return false; case FLOAT_EXTEND: - if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)) + if (!SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode)) *total = 0; else *total = ix86_vec_cost (mode, cost->addss); return false; case FLOAT_TRUNCATE: - if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)) + if (!SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode)) *total = cost->fadd; else *total = ix86_vec_cost (mode, cost->addss); @@ -20656,7 +20710,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, /* SSE requires memory load for the constant operand. It may make sense to account for this. Of course the constant operand may or may not be reused. */ - if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + if (SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode)) *total = cost->sse_op; else if (X87_FLOAT_MODE_P (mode)) *total = cost->fabs; @@ -20665,7 +20719,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, return false; case SQRT: - if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + if (SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode)) *total = mode == SFmode ? cost->sqrtss : cost->sqrtsd; else if (X87_FLOAT_MODE_P (mode)) *total = cost->fsqrt; @@ -22154,6 +22208,10 @@ ix86_mangle_type (const_tree type) switch (TYPE_MODE (type)) { + case E_HFmode: + /* _Float16 is "DF16_". + Align with clang's decision in https://reviews.llvm.org/D33719. */ + return "DF16_"; case E_TFmode: /* __float128 is "g". 
*/ return "g"; @@ -22777,7 +22835,7 @@ ix86_add_stmt_cost (class vec_info *vinfo, void *data, int count, case MINUS_EXPR: if (kind == scalar_stmt) { - if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + if (SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode)) stmt_cost = ix86_cost->addss; else if (X87_FLOAT_MODE_P (mode)) stmt_cost = ix86_cost->fadd; @@ -22803,7 +22861,7 @@ ix86_add_stmt_cost (class vec_info *vinfo, void *data, int count, break; case NEGATE_EXPR: - if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + if (SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode)) stmt_cost = ix86_cost->sse_op; else if (X87_FLOAT_MODE_P (mode)) stmt_cost = ix86_cost->fchs; @@ -22859,7 +22917,7 @@ ix86_add_stmt_cost (class vec_info *vinfo, void *data, int count, case BIT_XOR_EXPR: case BIT_AND_EXPR: case BIT_NOT_EXPR: - if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + if (SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode)) stmt_cost = ix86_cost->sse_op; else if (VECTOR_MODE_P (mode)) stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op); @@ -23574,14 +23632,18 @@ ix86_get_excess_precision (enum excess_precision_type type) /* The fastest type to promote to will always be the native type, whether that occurs with implicit excess precision or otherwise. */ - return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT; + return TARGET_AVX512FP16 + ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16 + : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT; case EXCESS_PRECISION_TYPE_STANDARD: case EXCESS_PRECISION_TYPE_IMPLICIT: /* Otherwise, the excess precision we want when we are in a standards compliant mode, and the implicit precision we provide would be identical were it not for the unpredictable cases. */ - if (!TARGET_80387) + if (TARGET_AVX512FP16 && TARGET_SSE_MATH) + return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16; + else if (!TARGET_80387) return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT; else if (!TARGET_MIX_SSE_I387) { diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index f671dae..2ac8f3e 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -1007,7 +1007,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); #define VALID_AVX512F_SCALAR_MODE(MODE) \ ((MODE) == DImode || (MODE) == DFmode || (MODE) == SImode \ - || (MODE) == SFmode) + || (MODE) == SFmode \ + || (TARGET_AVX512FP16 && ((MODE) == HImode || (MODE) == HFmode))) #define VALID_AVX512F_REG_MODE(MODE) \ ((MODE) == V8DImode || (MODE) == V8DFmode || (MODE) == V64QImode \ @@ -1046,7 +1047,7 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); #define VALID_FP_MODE_P(MODE) \ ((MODE) == SFmode || (MODE) == DFmode || (MODE) == XFmode \ - || (MODE) == SCmode || (MODE) == DCmode || (MODE) == XCmode) \ + || (MODE) == SCmode || (MODE) == DCmode || (MODE) == XCmode) #define VALID_INT_MODE_P(MODE) \ ((MODE) == QImode || (MODE) == HImode \ @@ -1079,6 +1080,10 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); #define SSE_FLOAT_MODE_P(MODE) \ ((TARGET_SSE && (MODE) == SFmode) || (TARGET_SSE2 && (MODE) == DFmode)) +#define SSE_FLOAT_MODE_SSEMATH_OR_HF_P(MODE) \ + ((SSE_FLOAT_MODE_P (MODE) && TARGET_SSE_MATH) \ + || (TARGET_AVX512FP16 && (MODE) == HFmode)) + #define FMA4_VEC_FLOAT_MODE_P(MODE) \ (TARGET_FMA4 && ((MODE) == V4SFmode || (MODE) == V2DFmode \ || (MODE) == V8SFmode || (MODE) == V4DFmode)) @@ -2295,7 +2300,7 @@ constexpr wide_int_bitmask PTA_TIGERLAKE = PTA_ICELAKE_CLIENT | PTA_MOVDIRI constexpr wide_int_bitmask PTA_SAPPHIRERAPIDS = PTA_COOPERLAKE | PTA_MOVDIRI | PTA_MOVDIR64B | PTA_AVX512VP2INTERSECT | PTA_ENQCMD | PTA_CLDEMOTE | PTA_PTWRITE | PTA_WAITPKG | 
PTA_SERIALIZE | PTA_TSXLDTRK | PTA_AMX_TILE - | PTA_AMX_INT8 | PTA_AMX_BF16 | PTA_UINTR | PTA_AVXVNNI; + | PTA_AMX_INT8 | PTA_AMX_BF16 | PTA_UINTR | PTA_AVXVNNI | PTA_AVX512FP16; constexpr wide_int_bitmask PTA_KNL = PTA_BROADWELL | PTA_AVX512PF | PTA_AVX512ER | PTA_AVX512F | PTA_AVX512CD | PTA_PREFETCHWT1; constexpr wide_int_bitmask PTA_BONNELL = PTA_CORE2 | PTA_MOVBE; diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 18b91c7..dcbbf2b 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -499,7 +499,7 @@ ;; Main data type used by the insn (define_attr "mode" - "unknown,none,QI,HI,SI,DI,TI,OI,XI,SF,DF,XF,TF,V16SF,V8SF,V4DF,V4SF, + "unknown,none,QI,HI,SI,DI,TI,OI,XI,HF,SF,DF,XF,TF,V16SF,V8SF,V4DF,V4SF, V2DF,V2SF,V1DF,V8DF" (const_string "unknown")) @@ -835,8 +835,7 @@ sse_noavx,sse2,sse2_noavx,sse3,sse3_noavx,sse4,sse4_noavx, avx,noavx,avx2,noavx2,bmi,bmi2,fma4,fma,avx512f,noavx512f, avx512bw,noavx512bw,avx512dq,noavx512dq, - avx512vl,noavx512vl, - avxvnni,avx512vnnivl" + avx512vl,noavx512vl,avxvnni,avx512vnnivl,avx512fp16" (const_string "base")) ;; Define instruction set of MMX instructions @@ -888,6 +887,8 @@ (eq_attr "isa" "avxvnni") (symbol_ref "TARGET_AVXVNNI") (eq_attr "isa" "avx512vnnivl") (symbol_ref "TARGET_AVX512VNNI && TARGET_AVX512VL") + (eq_attr "isa" "avx512fp16") + (symbol_ref "TARGET_AVX512FP16") (eq_attr "mmx_isa" "native") (symbol_ref "!TARGET_MMX_WITH_SSE") @@ -909,6 +910,7 @@ (set_attr "type" "multi")]) (define_code_iterator plusminus [plus minus]) +(define_code_iterator plusminusmultdiv [plus minus mult div]) (define_code_iterator sat_plusminus [ss_plus us_plus ss_minus us_minus]) @@ -924,7 +926,8 @@ ;; Mark commutative operators as such in constraints. (define_code_attr comm [(plus "%") (ss_plus "%") (us_plus "%") - (minus "") (ss_minus "") (us_minus "")]) + (minus "") (ss_minus "") (us_minus "") + (mult "%") (div "")]) ;; Mapping of max and min (define_code_iterator maxmin [smax smin umax umin]) @@ -1024,7 +1027,8 @@ (minus "sub") (ss_minus "sssub") (us_minus "ussub") (sign_extend "extend") (zero_extend "zero_extend") (ashift "ashl") (lshiftrt "lshr") (ashiftrt "ashr") - (rotate "rotl") (rotatert "rotr")]) + (rotate "rotl") (rotatert "rotr") + (mult "mul") (div "div")]) ;; All integer modes. (define_mode_iterator SWI1248x [QI HI SI DI]) @@ -1092,8 +1096,9 @@ ;; compile time constant, it is faster to use than ;; GET_MODE_SIZE (mode). For XFmode which depends on ;; command line options just use GET_MODE_SIZE macro. 
-(define_mode_attr MODE_SIZE [(QI "1") (HI "2") (SI "4") (DI "8") (TI "16") - (SF "4") (DF "8") (XF "GET_MODE_SIZE (XFmode)") +(define_mode_attr MODE_SIZE [(QI "1") (HI "2") (SI "4") (DI "8") + (TI "16") (HF "2") (SF "4") (DF "8") + (XF "GET_MODE_SIZE (XFmode)") (V16QI "16") (V32QI "32") (V64QI "64") (V8HI "16") (V16HI "32") (V32HI "64") (V4SI "16") (V8SI "32") (V16SI "64") @@ -1226,8 +1231,8 @@ ;; All x87 floating point modes (define_mode_iterator X87MODEF [SF DF XF]) -;; All x87 floating point modes plus HF -(define_mode_iterator X87MODEFH [SF DF XF HF]) +;; All x87 floating point modes plus HFmode +(define_mode_iterator X87MODEFH [HF SF DF XF]) ;; All SSE floating point modes (define_mode_iterator SSEMODEF [SF DF TF]) @@ -1235,7 +1240,7 @@ ;; SSE instruction suffix for various modes (define_mode_attr ssemodesuffix - [(SF "ss") (DF "sd") + [(HF "sh") (SF "ss") (DF "sd") (V16SF "ps") (V8DF "pd") (V8SF "ps") (V4DF "pd") (V4SF "ps") (V2DF "pd") @@ -1500,6 +1505,23 @@ DONE; }) +(define_expand "cbranchhf4" + [(set (reg:CC FLAGS_REG) + (compare:CC (match_operand:HF 1 "cmp_fp_expander_operand") + (match_operand:HF 2 "cmp_fp_expander_operand"))) + (set (pc) (if_then_else + (match_operator 0 "ix86_fp_comparison_operator" + [(reg:CC FLAGS_REG) + (const_int 0)]) + (label_ref (match_operand 3)) + (pc)))] + "TARGET_AVX512FP16" +{ + ix86_expand_branch (GET_CODE (operands[0]), + operands[1], operands[2], operands[3]); + DONE; +}) + (define_expand "cbranch4" [(set (reg:CC FLAGS_REG) (compare:CC (match_operand:MODEF 1 "cmp_fp_expander_operand") @@ -1709,6 +1731,17 @@ (eq_attr "alternative" "0") (symbol_ref "true") (symbol_ref "false"))))]) + +(define_insn "*cmpihf" + [(set (reg:CCFP FLAGS_REG) + (compare:CCFP + (match_operand:HF 0 "register_operand" "v") + (match_operand:HF 1 "nonimmediate_operand" "vm")))] + "TARGET_AVX512FP16" + "vcomish\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecomi") + (set_attr "prefix" "evex") + (set_attr "mode" "HF")]) ;; Push/pop instructions. 
@@ -2440,8 +2473,8 @@ (symbol_ref "true")))]) (define_insn "*movhi_internal" - [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r ,r ,m ,*k,*k ,*r,*m,*k") - (match_operand:HI 1 "general_operand" "r ,rn,rm,rn,*r,*km,*k,*k,CBC"))] + [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r ,r ,m ,*k,*k ,*r,*m,*k,?r,?v,*v,*v,*m") + (match_operand:HI 1 "general_operand" "r ,rn,rm,rn,*r,*km,*k,*k,CBC,v, r, v, m, v"))] "!(MEM_P (operands[0]) && MEM_P (operands[1])) && ix86_hardreg_mov_ok (operands[0], operands[1])" @@ -2467,6 +2500,9 @@ gcc_unreachable (); } + case TYPE_SSEMOV: + return ix86_output_ssemov (insn, operands); + case TYPE_MSKLOG: if (operands[1] == const0_rtx) return "kxorw\t%0, %0, %0"; @@ -2481,8 +2517,15 @@ return "mov{w}\t{%1, %0|%0, %1}"; } } - [(set (attr "type") - (cond [(eq_attr "alternative" "4,5,6,7") + [(set (attr "isa") + (cond [(eq_attr "alternative" "9,10,11,12,13") + (const_string "avx512fp16") + ] + (const_string "*"))) + (set (attr "type") + (cond [(eq_attr "alternative" "9,10,11,12,13") + (const_string "ssemov") + (eq_attr "alternative" "4,5,6,7") (const_string "mskmov") (eq_attr "alternative" "8") (const_string "msklog") @@ -2507,6 +2550,8 @@ (set (attr "mode") (cond [(eq_attr "type" "imovx") (const_string "SI") + (eq_attr "alternative" "11") + (const_string "HF") (and (eq_attr "alternative" "1,2") (match_operand:HI 1 "aligned_operand")) (const_string "SI") @@ -3731,7 +3776,10 @@ (eq_attr "alternative" "2") (const_string "sselog1") (eq_attr "alternative" "4,5,6,7") - (const_string "sselog") + (if_then_else + (match_test ("TARGET_AVX512FP16")) + (const_string "ssemov") + (const_string "sselog")) ] (const_string "ssemov"))) (set (attr "memory") @@ -3754,9 +3802,15 @@ (eq_attr "alternative" "2") (const_string "V4SF") (eq_attr "alternative" "4,5,6,7") - (const_string "TI") + (if_then_else + (match_test "TARGET_AVX512FP16") + (const_string "HI") + (const_string "TI")) (eq_attr "alternative" "3") - (const_string "SF") + (if_then_else + (match_test "TARGET_AVX512FP16") + (const_string "HF") + (const_string "SF")) ] (const_string "*")))]) @@ -4497,6 +4551,17 @@ emit_move_insn (operands[0], CONST0_RTX (V2DFmode)); }) +(define_insn "extendhf2" + [(set (match_operand:MODEF 0 "nonimm_ssenomem_operand" "=v") + (float_extend:MODEF + (match_operand:HF 1 "nonimmediate_operand" "vm")))] + "TARGET_AVX512FP16" + "vcvtsh2\t{%1, %0, %0|%0, %0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "")]) + + (define_expand "extendxf2" [(set (match_operand:XF 0 "nonimmediate_operand") (float_extend:XF (match_operand:MODEF 1 "general_operand")))] @@ -4674,6 +4739,18 @@ (symbol_ref "flag_unsafe_math_optimizations") ] (symbol_ref "true")))]) + +;; Conversion from {SF,DF}mode to HFmode. + +(define_insn "trunchf2" + [(set (match_operand:HF 0 "register_operand" "=v") + (float_truncate:HF + (match_operand:MODEF 1 "nonimmediate_operand" "vm")))] + "TARGET_AVX512FP16" + "vcvt2sh\t{%1, %d0|%d0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "HF")]) ;; Signed conversion to DImode. 
@@ -5050,6 +5127,16 @@ (symbol_ref "TARGET_INTER_UNIT_CONVERSIONS")] (symbol_ref "true")))]) +(define_insn "floathf2" + [(set (match_operand:HF 0 "register_operand" "=v") + (any_float:HF + (match_operand:SWI48 1 "nonimmediate_operand" "rm")))] + "TARGET_AVX512FP16" + "vcvtsi2sh\t{%1, %d0|%d0, %1}" + [(set_attr "type" "sseicvt") + (set_attr "prefix" "evex") + (set_attr "mode" "HF")]) + (define_insn "*floatdi2_i387" [(set (match_operand:MODEF 0 "register_operand" "=f") (float:MODEF (match_operand:DI 1 "nonimmediate_operand" "m")))] @@ -7653,6 +7740,13 @@ (match_operand:XF 2 "register_operand")))] "TARGET_80387") +(define_expand "hf3" + [(set (match_operand:HF 0 "register_operand") + (plusminus:HF + (match_operand:HF 1 "register_operand") + (match_operand:HF 2 "nonimmediate_operand")))] + "TARGET_AVX512FP16") + (define_expand "3" [(set (match_operand:MODEF 0 "register_operand") (plusminus:MODEF @@ -8230,6 +8324,12 @@ (match_operand:XF 2 "register_operand")))] "TARGET_80387") +(define_expand "mulhf3" + [(set (match_operand:HF 0 "register_operand") + (mult:HF (match_operand:HF 1 "register_operand") + (match_operand:HF 2 "nonimmediate_operand")))] + "TARGET_AVX512FP16") + (define_expand "mul3" [(set (match_operand:MODEF 0 "register_operand") (mult:MODEF (match_operand:MODEF 1 "register_operand") @@ -8247,6 +8347,12 @@ (match_operand:XF 2 "register_operand")))] "TARGET_80387") +(define_expand "divhf3" + [(set (match_operand:HF 0 "register_operand") + (div:HF (match_operand:HF 1 "register_operand") + (match_operand:HF 2 "nonimmediate_operand")))] + "TARGET_AVX512FP16") + (define_expand "div3" [(set (match_operand:MODEF 0 "register_operand") (div:MODEF (match_operand:MODEF 1 "register_operand") @@ -16667,6 +16773,17 @@ (symbol_ref "true") (symbol_ref "false"))))]) +(define_insn "*hf" + [(set (match_operand:HF 0 "register_operand" "=v") + (plusminusmultdiv:HF + (match_operand:HF 1 "nonimmediate_operand" "v") + (match_operand:HF 2 "nonimmediate_operand" "vm")))] + "TARGET_AVX512FP16 + && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + "vsh\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "prefix" "evex") + (set_attr "mode" "HF")]) + (define_insn "*rcpsf2_sse" [(set (match_operand:SF 0 "register_operand" "=x,x,x") (unspec:SF [(match_operand:SF 1 "nonimmediate_operand" "0,x,m")] @@ -19619,6 +19736,15 @@ operands[9] = replace_rtx (operands[6], operands[0], operands[1], true); }) +(define_expand "movhfcc" + [(set (match_operand:HF 0 "register_operand") + (if_then_else:HF + (match_operand 1 "comparison_operator") + (match_operand:HF 2 "register_operand") + (match_operand:HF 3 "register_operand")))] + "TARGET_AVX512FP16" + "if (ix86_expand_fp_movcc (operands)) DONE; else FAIL;") + (define_expand "movcc" [(set (match_operand:X87MODEF 0 "register_operand") (if_then_else:X87MODEF @@ -19785,6 +19911,18 @@ ;; Their operands are not commutative, and thus they may be used in the ;; presence of -0.0 and NaN. 
+(define_insn "*ieee_shf3" + [(set (match_operand:HF 0 "register_operand" "=v") + (unspec:HF + [(match_operand:HF 1 "register_operand" "v") + (match_operand:HF 2 "nonimmediate_operand" "vm")] + IEEE_MAXMIN))] + "TARGET_AVX512FP16" + "vsh\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "prefix" "evex") + (set_attr "type" "sseadd") + (set_attr "mode" "HF")]) + (define_insn "*ieee_s3" [(set (match_operand:MODEF 0 "register_operand" "=x,v") (unspec:MODEF diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt index 7b8547b..ad36697 100644 --- a/gcc/config/i386/i386.opt +++ b/gcc/config/i386/i386.opt @@ -1166,3 +1166,7 @@ Emit GNU_PROPERTY_X86_ISA_1_NEEDED GNU property. mmwait Target Mask(ISA2_MWAIT) Var(ix86_isa_flags2) Save Support MWAIT and MONITOR built-in functions and code generation. + +mavx512fp16 +Target Mask(ISA2_AVX512FP16) Var(ix86_isa_flags2) Save +Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, AVX512F and AVX512FP16 built-in functions and code generation. diff --git a/gcc/config/i386/immintrin.h b/gcc/config/i386/immintrin.h index f129de4..2421a78 100644 --- a/gcc/config/i386/immintrin.h +++ b/gcc/config/i386/immintrin.h @@ -94,6 +94,10 @@ #include +#ifdef __SSE2__ +#include +#endif + #include #include -- cgit v1.1 From 9e2a82e1f9d2c4afc62b864a9cd9efe2e8ecce7d Mon Sep 17 00:00:00 2001 From: liuhongt Date: Fri, 9 Jul 2021 11:24:45 +0800 Subject: AVX512FP16: Support vector init/broadcast/set/extract for FP16. gcc/ChangeLog: * config/i386/avx512fp16intrin.h (_mm_set_ph): New intrinsic. (_mm256_set_ph): Likewise. (_mm512_set_ph): Likewise. (_mm_setr_ph): Likewise. (_mm256_setr_ph): Likewise. (_mm512_setr_ph): Likewise. (_mm_set1_ph): Likewise. (_mm256_set1_ph): Likewise. (_mm512_set1_ph): Likewise. (_mm_setzero_ph): Likewise. (_mm256_setzero_ph): Likewise. (_mm512_setzero_ph): Likewise. (_mm_set_sh): Likewise. (_mm_load_sh): Likewise. (_mm_store_sh): Likewise. * config/i386/i386-builtin-types.def (V8HF): New type. (DEF_FUNCTION_TYPE (V8HF, V8HI)): New builtin function type * config/i386/i386-expand.c (ix86_expand_vector_init_duplicate): Support vector HFmodes. (ix86_expand_vector_init_one_nonzero): Likewise. (ix86_expand_vector_init_one_var): Likewise. (ix86_expand_vector_init_interleave): Likewise. (ix86_expand_vector_init_general): Likewise. (ix86_expand_vector_set): Likewise. (ix86_expand_vector_extract): Likewise. (ix86_expand_vector_init_concat): Likewise. (ix86_expand_sse_movcc): Handle vector HFmodes. (ix86_expand_vector_set_var): Ditto. * config/i386/i386-modes.def: Add HF vector modes in comment. * config/i386/i386.c (classify_argument): Add HF vector modes. (ix86_hard_regno_mode_ok): Allow HF vector modes for AVX512FP16. (ix86_vector_mode_supported_p): Likewise. (ix86_set_reg_reg_cost): Handle vector HFmode. (ix86_get_ssemov): Handle vector HFmode. (function_arg_advance_64): Pass unamed V16HFmode and V32HFmode by stack. (function_arg_advance_32): Pass V8HF/V16HF/V32HF by sse reg for 32bit mode. (function_arg_advance_32): Ditto. * config/i386/i386.h (VALID_AVX512FP16_REG_MODE): New. (VALID_AVX256_REG_OR_OI_MODE): Rename to .. (VALID_AVX256_REG_OR_OI_VHF_MODE): .. this, and add V16HF. (VALID_SSE2_REG_VHF_MODE): New. (VALID_AVX512VL_128_REG_MODE): Add V8HF and TImode. (SSE_REG_MODE_P): Add vector HFmode. * config/i386/i386.md (mode): Add HF vector modes. (MODE_SIZE): Likewise. (ssemodesuffix): Add ph suffix for HF vector modes. * config/i386/sse.md (VFH_128): New mode iterator. (VMOVE): Adjust for HF vector modes. (V): Likewise. (V_256_512): Likewise. 
(avx512): Likewise. (avx512fmaskmode): Likewise. (shuffletype): Likewise. (sseinsnmode): Likewise. (ssedoublevecmode): Likewise. (ssehalfvecmode): Likewise. (ssehalfvecmodelower): Likewise. (ssePScmode): Likewise. (ssescalarmode): Likewise. (ssescalarmodelower): Likewise. (sseintprefix): Likewise. (i128): Likewise. (bcstscalarsuff): Likewise. (xtg_mode): Likewise. (VI12HF_AVX512VL): New mode_iterator. (VF_AVX512FP16): Likewise. (VIHF): Likewise. (VIHF_256): Likewise. (VIHF_AVX512BW): Likewise. (V16_256): Likewise. (V32_512): Likewise. (sseintmodesuffix): New mode_attr. (sse): Add scalar and vector HFmodes. (ssescalarmode): Add vector HFmode mapping. (ssescalarmodesuffix): Add sh suffix for HFmode. (*_vm3): Use VFH_128. (*_vm3): Likewise. (*ieee_3): Likewise. (_blendm): New define_insn. (vec_setv8hf): New define_expand. (vec_set_0): New define_insn for HF vector set. (*avx512fp16_movsh): Likewise. (avx512fp16_movsh): Likewise. (vec_extract_lo_v32hi): Rename to ... (vec_extract_lo_): ... this, and adjust to allow HF vector modes. (vec_extract_hi_v32hi): Likewise. (vec_extract_hi_): Likewise. (vec_extract_lo_v16hi): Likewise. (vec_extract_lo_): Likewise. (vec_extract_hi_v16hi): Likewise. (vec_extract_hi_): Likewise. (vec_set_hi_v16hi): Likewise. (vec_set_hi_): Likewise. (vec_set_lo_v16hi): Likewise. (vec_set_lo_): Likewise. (*vec_extract_0): New define_insn_and_split for HF vector extract. (*vec_extracthf): New define_insn. (VEC_EXTRACT_MODE): Add HF vector modes. (PINSR_MODE): Add V8HF. (sse2p4_1): Likewise. (pinsr_evex_isa): Likewise. (_pinsr): Adjust to support insert for V8HFmode. (pbroadcast_evex_isa): Add HF vector modes. (AVX2_VEC_DUP_MODE): Likewise. (VEC_INIT_MODE): Likewise. (VEC_INIT_HALF_MODE): Likewise. (avx2_pbroadcast): Adjust to support HF vector mode broadcast. (avx2_pbroadcast_1): Likewise. (_vec_dup_1): Likewise. (_vec_dup): Likewise. (_vec_dup_gpr): Likewise. 
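A minimal usage sketch of the new init/broadcast and scalar load/store intrinsics (assuming a toolchain containing this series, compilation with -mavx512fp16, and <immintrin.h> exposing the new avx512fp16intrin.h header; fp16_roundtrip is only an illustrative name):

#include <immintrin.h>

/* Broadcast X to all eight lanes, build an all-zero vector, and
   round-trip the scalar through memory with the new scalar
   load/store intrinsics.  Returns the value stored from lane 0,
   which should equal X.  */
_Float16
fp16_roundtrip (_Float16 x)
{
  __m128h bcast = _mm_set1_ph (x);    /* x in all 8 lanes */
  __m128h zero  = _mm_setzero_ph ();  /* all lanes 0.0 */
  __m128h lane0 = _mm_set_sh (x);     /* lane 0 = x, lanes 1..7 = 0 */

  _Float16 mem;
  _mm_store_sh (&mem, lane0);         /* spill lane 0 to memory */
  __m128h back = _mm_load_sh (&mem);  /* reload; upper lanes zeroed */
  _mm_store_sh (&mem, back);

  (void) bcast;
  (void) zero;
  return mem;
}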
--- gcc/config/i386/avx512fp16intrin.h | 172 ++++++++++++++ gcc/config/i386/i386-builtin-types.def | 6 +- gcc/config/i386/i386-expand.c | 124 +++++++++- gcc/config/i386/i386-modes.def | 12 +- gcc/config/i386/i386.c | 44 +++- gcc/config/i386/i386.h | 15 +- gcc/config/i386/i386.md | 13 +- gcc/config/i386/sse.md | 404 +++++++++++++++++++++++++-------- 8 files changed, 658 insertions(+), 132 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/avx512fp16intrin.h b/gcc/config/i386/avx512fp16intrin.h index 38d6316..3fc0770 100644 --- a/gcc/config/i386/avx512fp16intrin.h +++ b/gcc/config/i386/avx512fp16intrin.h @@ -45,6 +45,178 @@ typedef _Float16 __m128h __attribute__ ((__vector_size__ (16), __may_alias__)); typedef _Float16 __m256h __attribute__ ((__vector_size__ (32), __may_alias__)); typedef _Float16 __m512h __attribute__ ((__vector_size__ (64), __may_alias__)); +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_ph (_Float16 __A7, _Float16 __A6, _Float16 __A5, + _Float16 __A4, _Float16 __A3, _Float16 __A2, + _Float16 __A1, _Float16 __A0) +{ + return __extension__ (__m128h)(__v8hf){ __A0, __A1, __A2, __A3, + __A4, __A5, __A6, __A7 }; +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set_ph (_Float16 __A15, _Float16 __A14, _Float16 __A13, + _Float16 __A12, _Float16 __A11, _Float16 __A10, + _Float16 __A9, _Float16 __A8, _Float16 __A7, + _Float16 __A6, _Float16 __A5, _Float16 __A4, + _Float16 __A3, _Float16 __A2, _Float16 __A1, + _Float16 __A0) +{ + return __extension__ (__m256h)(__v16hf){ __A0, __A1, __A2, __A3, + __A4, __A5, __A6, __A7, + __A8, __A9, __A10, __A11, + __A12, __A13, __A14, __A15 }; +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_set_ph (_Float16 __A31, _Float16 __A30, _Float16 __A29, + _Float16 __A28, _Float16 __A27, _Float16 __A26, + _Float16 __A25, _Float16 __A24, _Float16 __A23, + _Float16 __A22, _Float16 __A21, _Float16 __A20, + _Float16 __A19, _Float16 __A18, _Float16 __A17, + _Float16 __A16, _Float16 __A15, _Float16 __A14, + _Float16 __A13, _Float16 __A12, _Float16 __A11, + _Float16 __A10, _Float16 __A9, _Float16 __A8, + _Float16 __A7, _Float16 __A6, _Float16 __A5, + _Float16 __A4, _Float16 __A3, _Float16 __A2, + _Float16 __A1, _Float16 __A0) +{ + return __extension__ (__m512h)(__v32hf){ __A0, __A1, __A2, __A3, + __A4, __A5, __A6, __A7, + __A8, __A9, __A10, __A11, + __A12, __A13, __A14, __A15, + __A16, __A17, __A18, __A19, + __A20, __A21, __A22, __A23, + __A24, __A25, __A26, __A27, + __A28, __A29, __A30, __A31 }; +} + +/* Create vectors of elements in the reversed order from _mm_set_ph, + _mm256_set_ph and _mm512_set_ph functions. 
*/ + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setr_ph (_Float16 __A0, _Float16 __A1, _Float16 __A2, + _Float16 __A3, _Float16 __A4, _Float16 __A5, + _Float16 __A6, _Float16 __A7) +{ + return _mm_set_ph (__A7, __A6, __A5, __A4, __A3, __A2, __A1, __A0); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setr_ph (_Float16 __A0, _Float16 __A1, _Float16 __A2, + _Float16 __A3, _Float16 __A4, _Float16 __A5, + _Float16 __A6, _Float16 __A7, _Float16 __A8, + _Float16 __A9, _Float16 __A10, _Float16 __A11, + _Float16 __A12, _Float16 __A13, _Float16 __A14, + _Float16 __A15) +{ + return _mm256_set_ph (__A15, __A14, __A13, __A12, __A11, __A10, __A9, + __A8, __A7, __A6, __A5, __A4, __A3, __A2, __A1, + __A0); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_setr_ph (_Float16 __A0, _Float16 __A1, _Float16 __A2, + _Float16 __A3, _Float16 __A4, _Float16 __A5, + _Float16 __A6, _Float16 __A7, _Float16 __A8, + _Float16 __A9, _Float16 __A10, _Float16 __A11, + _Float16 __A12, _Float16 __A13, _Float16 __A14, + _Float16 __A15, _Float16 __A16, _Float16 __A17, + _Float16 __A18, _Float16 __A19, _Float16 __A20, + _Float16 __A21, _Float16 __A22, _Float16 __A23, + _Float16 __A24, _Float16 __A25, _Float16 __A26, + _Float16 __A27, _Float16 __A28, _Float16 __A29, + _Float16 __A30, _Float16 __A31) + +{ + return _mm512_set_ph (__A31, __A30, __A29, __A28, __A27, __A26, __A25, + __A24, __A23, __A22, __A21, __A20, __A19, __A18, + __A17, __A16, __A15, __A14, __A13, __A12, __A11, + __A10, __A9, __A8, __A7, __A6, __A5, __A4, __A3, + __A2, __A1, __A0); +} + +/* Broadcast _Float16 to vector. */ + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_ph (_Float16 __A) +{ + return _mm_set_ph (__A, __A, __A, __A, __A, __A, __A, __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set1_ph (_Float16 __A) +{ + return _mm256_set_ph (__A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_set1_ph (_Float16 __A) +{ + return _mm512_set_ph (__A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A); +} + +/* Create a vector with all zeros. */ + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setzero_ph (void) +{ + return _mm_set1_ph (0.0f); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setzero_ph (void) +{ + return _mm256_set1_ph (0.0f); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_setzero_ph (void) +{ + return _mm512_set1_ph (0.0f); +} + +/* Create a vector with element 0 as F and the rest zero. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_sh (_Float16 __F) +{ + return _mm_set_ph (0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, __F); +} + +/* Create a vector with element 0 as *P and the rest zero. 
*/ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_sh (void const *__P) +{ + return _mm_set_ph (0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + *(_Float16 const *) __P); +} + +/* Stores the lower _Float16 value. */ +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store_sh (void *__P, __m128h __A) +{ + *(_Float16 *) __P = ((__v8hf)__A)[0]; +} + #ifdef __DISABLE_AVX512FP16__ #undef __DISABLE_AVX512FP16__ #pragma GCC pop_options diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def index 1768b88..4df6ee1 100644 --- a/gcc/config/i386/i386-builtin-types.def +++ b/gcc/config/i386/i386-builtin-types.def @@ -85,6 +85,7 @@ DEF_VECTOR_TYPE (V8QI, QI) # SSE vectors DEF_VECTOR_TYPE (V2DF, DOUBLE) DEF_VECTOR_TYPE (V4SF, FLOAT) +DEF_VECTOR_TYPE (V8HF, FLOAT16) DEF_VECTOR_TYPE (V2DI, DI) DEF_VECTOR_TYPE (V4SI, SI) DEF_VECTOR_TYPE (V8HI, HI) @@ -1297,4 +1298,7 @@ DEF_FUNCTION_TYPE (UINT, UINT, V2DI, V2DI, PVOID) DEF_FUNCTION_TYPE (UINT, UINT, V2DI, PVOID) DEF_FUNCTION_TYPE (VOID, V2DI, V2DI, V2DI, UINT) DEF_FUNCTION_TYPE (UINT8, PV2DI, V2DI, PCVOID) -DEF_FUNCTION_TYPE (UINT8, PV2DI, PCV2DI, PCVOID) \ No newline at end of file +DEF_FUNCTION_TYPE (UINT8, PV2DI, PCV2DI, PCVOID) + +# FP16 builtins +DEF_FUNCTION_TYPE (V8HF, V8HI) diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index fb3873b..dfffbe5 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -3989,6 +3989,7 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) break; case E_V16QImode: case E_V8HImode: + case E_V8HFmode: case E_V4SImode: case E_V2DImode: if (TARGET_SSE4_1) @@ -4011,6 +4012,7 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) break; case E_V32QImode: case E_V16HImode: + case E_V16HFmode: case E_V8SImode: case E_V4DImode: if (TARGET_AVX2) @@ -4030,6 +4032,9 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) case E_V32HImode: gen = gen_avx512bw_blendmv32hi; break; + case E_V32HFmode: + gen = gen_avx512bw_blendmv32hf; + break; case E_V16SImode: gen = gen_avx512f_blendmv16si; break; @@ -14228,6 +14233,11 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode, } return true; + case E_V8HFmode: + case E_V16HFmode: + case E_V32HFmode: + return ix86_vector_duplicate_value (mode, target, val); + default: return false; } @@ -14312,6 +14322,18 @@ ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode, use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0; gen_vec_set_0 = gen_vec_setv8di_0; break; + case E_V8HFmode: + use_vector_set = TARGET_AVX512FP16 && one_var == 0; + gen_vec_set_0 = gen_vec_setv8hf_0; + break; + case E_V16HFmode: + use_vector_set = TARGET_AVX512FP16 && one_var == 0; + gen_vec_set_0 = gen_vec_setv16hf_0; + break; + case E_V32HFmode: + use_vector_set = TARGET_AVX512FP16 && one_var == 0; + gen_vec_set_0 = gen_vec_setv32hf_0; + break; default: break; } @@ -14461,6 +14483,8 @@ ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode, if (!TARGET_64BIT) return false; /* FALLTHRU */ + case E_V8HFmode: + case E_V16HFmode: case E_V4DFmode: case E_V8SFmode: case E_V8SImode: @@ -14541,6 +14565,9 @@ ix86_expand_vector_init_concat (machine_mode mode, case 2: switch (mode) { + case E_V32HFmode: + half_mode = V16HFmode; + break; case E_V16SImode: half_mode = V8SImode; break; @@ -14553,6 +14580,9 @@ ix86_expand_vector_init_concat (machine_mode mode, 
case E_V8DFmode: half_mode = V4DFmode; break; + case E_V16HFmode: + half_mode = V8HFmode; + break; case E_V8SImode: half_mode = V4SImode; break; @@ -14695,13 +14725,22 @@ ix86_expand_vector_init_interleave (machine_mode mode, { machine_mode first_imode, second_imode, third_imode, inner_mode; int i, j; - rtx op0, op1; + rtx op, op0, op1; rtx (*gen_load_even) (rtx, rtx, rtx); rtx (*gen_interleave_first_low) (rtx, rtx, rtx); rtx (*gen_interleave_second_low) (rtx, rtx, rtx); switch (mode) { + case E_V8HFmode: + gen_load_even = gen_vec_setv8hf; + gen_interleave_first_low = gen_vec_interleave_lowv4si; + gen_interleave_second_low = gen_vec_interleave_lowv2di; + inner_mode = HFmode; + first_imode = V4SImode; + second_imode = V2DImode; + third_imode = VOIDmode; + break; case E_V8HImode: gen_load_even = gen_vec_setv8hi; gen_interleave_first_low = gen_vec_interleave_lowv4si; @@ -14726,9 +14765,19 @@ ix86_expand_vector_init_interleave (machine_mode mode, for (i = 0; i < n; i++) { + op = ops [i + i]; + if (inner_mode == HFmode) + { + /* Convert HFmode to HImode. */ + op1 = gen_reg_rtx (HImode); + op1 = gen_rtx_SUBREG (HImode, force_reg (HFmode, op), 0); + op = gen_reg_rtx (HImode); + emit_move_insn (op, op1); + } + /* Extend the odd elment to SImode using a paradoxical SUBREG. */ op0 = gen_reg_rtx (SImode); - emit_move_insn (op0, gen_lowpart (SImode, ops [i + i])); + emit_move_insn (op0, gen_lowpart (SImode, op)); /* Insert the SImode value as low element of V4SImode vector. */ op1 = gen_reg_rtx (V4SImode); @@ -14865,6 +14914,10 @@ ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode, half_mode = V8HImode; goto half; + case E_V16HFmode: + half_mode = V8HFmode; + goto half; + half: n = GET_MODE_NUNITS (mode); for (i = 0; i < n; i++) @@ -14888,6 +14941,11 @@ half: half_mode = V16HImode; goto quarter; + case E_V32HFmode: + quarter_mode = V8HFmode; + half_mode = V16HFmode; + goto quarter; + quarter: n = GET_MODE_NUNITS (mode); for (i = 0; i < n; i++) @@ -14924,6 +14982,9 @@ quarter: move from GPR to SSE register directly. */ if (!TARGET_INTER_UNIT_MOVES_TO_VEC) break; + /* FALLTHRU */ + + case E_V8HFmode: n = GET_MODE_NUNITS (mode); for (i = 0; i < n; i++) @@ -15171,6 +15232,16 @@ ix86_expand_vector_set_var (rtx target, rtx val, rtx idx) case E_V16SFmode: cmp_mode = V16SImode; break; + /* TARGET_AVX512FP16 implies TARGET_AVX512BW. 
*/ + case E_V8HFmode: + cmp_mode = V8HImode; + break; + case E_V16HFmode: + cmp_mode = V16HImode; + break; + case E_V32HFmode: + cmp_mode = V32HImode; + break; default: gcc_unreachable (); } @@ -15207,23 +15278,25 @@ ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt) machine_mode half_mode; bool use_vec_merge = false; rtx tmp; - static rtx (*gen_extract[6][2]) (rtx, rtx) + static rtx (*gen_extract[7][2]) (rtx, rtx) = { { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi }, { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi }, { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si }, { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di }, { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf }, - { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df } + { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }, + { gen_vec_extract_lo_v16hf, gen_vec_extract_hi_v16hf } }; - static rtx (*gen_insert[6][2]) (rtx, rtx, rtx) + static rtx (*gen_insert[7][2]) (rtx, rtx, rtx) = { { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi }, { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi }, { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si }, { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di }, { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf }, - { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df } + { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }, + { gen_vec_set_lo_v16hf, gen_vec_set_hi_v16hf }, }; int i, j, n; machine_mode mmode = VOIDmode; @@ -15390,6 +15463,10 @@ ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt) } return; + case E_V8HFmode: + use_vec_merge = true; + break; + case E_V8HImode: case E_V2HImode: use_vec_merge = TARGET_SSE2; @@ -15413,6 +15490,12 @@ ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt) n = 16; goto half; + case E_V16HFmode: + half_mode = V8HFmode; + j = 6; + n = 8; + goto half; + case E_V16HImode: half_mode = V8HImode; j = 1; @@ -15493,6 +15576,13 @@ half: } break; + case E_V32HFmode: + if (TARGET_AVX512BW) + { + mmode = SImode; + gen_blendm = gen_avx512bw_blendmv32hf; + } + break; case E_V32HImode: if (TARGET_AVX512BW) { @@ -15864,6 +15954,28 @@ ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt) ix86_expand_vector_extract (false, target, tmp, elt & 3); return; + case E_V32HFmode: + tmp = gen_reg_rtx (V16HFmode); + if (elt < 16) + emit_insn (gen_vec_extract_lo_v32hf (tmp, vec)); + else + emit_insn (gen_vec_extract_hi_v32hf (tmp, vec)); + ix86_expand_vector_extract (false, target, tmp, elt & 15); + return; + + case E_V16HFmode: + tmp = gen_reg_rtx (V8HFmode); + if (elt < 8) + emit_insn (gen_vec_extract_lo_v16hf (tmp, vec)); + else + emit_insn (gen_vec_extract_hi_v16hf (tmp, vec)); + ix86_expand_vector_extract (false, target, tmp, elt & 7); + return; + + case E_V8HFmode: + use_vec_extr = true; + break; + case E_V8QImode: use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1; /* ??? Could extract the appropriate HImode element and shift. 
*/ diff --git a/gcc/config/i386/i386-modes.def b/gcc/config/i386/i386-modes.def index 9232f59..fcadfcd 100644 --- a/gcc/config/i386/i386-modes.def +++ b/gcc/config/i386/i386-modes.def @@ -84,12 +84,12 @@ VECTOR_MODES (INT, 16); /* V16QI V8HI V4SI V2DI */ VECTOR_MODES (INT, 32); /* V32QI V16HI V8SI V4DI */ VECTOR_MODES (INT, 64); /* V64QI V32HI V16SI V8DI */ VECTOR_MODES (INT, 128); /* V128QI V64HI V32SI V16DI */ -VECTOR_MODES (FLOAT, 8); /* V2SF */ -VECTOR_MODES (FLOAT, 16); /* V4SF V2DF */ -VECTOR_MODES (FLOAT, 32); /* V8SF V4DF V2TF */ -VECTOR_MODES (FLOAT, 64); /* V16SF V8DF V4TF */ -VECTOR_MODES (FLOAT, 128); /* V32SF V16DF V8TF */ -VECTOR_MODES (FLOAT, 256); /* V64SF V32DF V16TF */ +VECTOR_MODES (FLOAT, 8); /* V4HF V2SF */ +VECTOR_MODES (FLOAT, 16); /* V8HF V4SF V2DF */ +VECTOR_MODES (FLOAT, 32); /* V16HF V8SF V4DF V2TF */ +VECTOR_MODES (FLOAT, 64); /* V32HF V16SF V8DF V4TF */ +VECTOR_MODES (FLOAT, 128); /* V64HF V32SF V16DF V8TF */ +VECTOR_MODES (FLOAT, 256); /* V128HF V64SF V32DF V16TF */ VECTOR_MODE (INT, TI, 1); /* V1TI */ VECTOR_MODE (INT, DI, 1); /* V1DI */ VECTOR_MODE (INT, SI, 1); /* V1SI */ diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index b2a58b0..dc649f9 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -2422,6 +2422,7 @@ classify_argument (machine_mode mode, const_tree type, case E_V8SFmode: case E_V8SImode: case E_V32QImode: + case E_V16HFmode: case E_V16HImode: case E_V4DFmode: case E_V4DImode: @@ -2432,6 +2433,7 @@ classify_argument (machine_mode mode, const_tree type, return 4; case E_V8DFmode: case E_V16SFmode: + case E_V32HFmode: case E_V8DImode: case E_V16SImode: case E_V32HImode: @@ -2449,6 +2451,7 @@ classify_argument (machine_mode mode, const_tree type, case E_V4SImode: case E_V16QImode: case E_V8HImode: + case E_V8HFmode: case E_V2DFmode: case E_V2DImode: classes[0] = X86_64_SSE_CLASS; @@ -2862,12 +2865,14 @@ pass_in_reg: break; /* FALLTHRU */ + case E_V16HFmode: case E_V8SFmode: case E_V8SImode: case E_V64QImode: case E_V32HImode: case E_V16SImode: case E_V8DImode: + case E_V32HFmode: case E_V16SFmode: case E_V8DFmode: case E_V32QImode: @@ -2879,6 +2884,7 @@ pass_in_reg: case E_V8HImode: case E_V4SImode: case E_V2DImode: + case E_V8HFmode: case E_V4SFmode: case E_V2DFmode: if (!type || !AGGREGATE_TYPE_P (type)) @@ -2933,7 +2939,9 @@ function_arg_advance_64 (CUMULATIVE_ARGS *cum, machine_mode mode, /* Unnamed 512 and 256bit vector mode parameters are passed on stack. 
*/ if (!named && (VALID_AVX512F_REG_MODE (mode) - || VALID_AVX256_REG_MODE (mode))) + || VALID_AVX256_REG_MODE (mode) + || mode == V16HFmode + || mode == V32HFmode)) return 0; if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs) @@ -3101,6 +3109,7 @@ pass_in_reg: case E_V8HImode: case E_V4SImode: case E_V2DImode: + case E_V8HFmode: case E_V4SFmode: case E_V2DFmode: if (!type || !AGGREGATE_TYPE_P (type)) @@ -3120,8 +3129,10 @@ pass_in_reg: case E_V32HImode: case E_V16SImode: case E_V8DImode: + case E_V32HFmode: case E_V16SFmode: case E_V8DFmode: + case E_V16HFmode: case E_V8SFmode: case E_V8SImode: case E_V32QImode: @@ -3180,12 +3191,14 @@ function_arg_64 (const CUMULATIVE_ARGS *cum, machine_mode mode, default: break; + case E_V16HFmode: case E_V8SFmode: case E_V8SImode: case E_V32QImode: case E_V16HImode: case E_V4DFmode: case E_V4DImode: + case E_V32HFmode: case E_V16SFmode: case E_V16SImode: case E_V64QImode: @@ -4680,12 +4693,14 @@ ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p, nat_mode = type_natural_mode (type, NULL, false); switch (nat_mode) { + case E_V16HFmode: case E_V8SFmode: case E_V8SImode: case E_V32QImode: case E_V16HImode: case E_V4DFmode: case E_V4DImode: + case E_V32HFmode: case E_V16SFmode: case E_V16SImode: case E_V64QImode: @@ -5359,7 +5374,12 @@ ix86_get_ssemov (rtx *operands, unsigned size, switch (type) { case opcode_int: - opcode = misaligned_p ? "vmovdqu32" : "vmovdqa32"; + if (scalar_mode == E_HFmode) + opcode = (misaligned_p + ? (TARGET_AVX512BW ? "vmovdqu16" : "vmovdqu64") + : "vmovdqa64"); + else + opcode = misaligned_p ? "vmovdqu32" : "vmovdqa32"; break; case opcode_float: opcode = misaligned_p ? "vmovups" : "vmovaps"; @@ -5373,6 +5393,11 @@ ix86_get_ssemov (rtx *operands, unsigned size, { switch (scalar_mode) { + case E_HFmode: + opcode = (misaligned_p + ? (TARGET_AVX512BW ? "vmovdqu16" : "vmovdqu64") + : "vmovdqa64"); + break; case E_SFmode: opcode = misaligned_p ? "%vmovups" : "%vmovaps"; break; @@ -19479,7 +19504,6 @@ inline_memory_move_cost (machine_mode mode, enum reg_class regclass, int in) int index; switch (mode) { - case E_HFmode: case E_SFmode: index = 0; break; @@ -19783,6 +19807,7 @@ ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode) between gpr and sse registser. */ if (TARGET_AVX512F && (mode == XImode + || mode == V32HFmode || VALID_AVX512F_REG_MODE (mode) || VALID_AVX512F_SCALAR_MODE (mode))) return true; @@ -19797,9 +19822,7 @@ ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode) /* TODO check for QI/HI scalars. */ /* AVX512VL allows sse regs16+ for 128/256 bit modes. */ if (TARGET_AVX512VL - && (mode == OImode - || mode == TImode - || VALID_AVX256_REG_MODE (mode) + && (VALID_AVX256_REG_OR_OI_VHF_MODE (mode) || VALID_AVX512VL_128_REG_MODE (mode))) return true; @@ -19809,9 +19832,9 @@ ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode) /* OImode and AVX modes are available only when AVX is enabled. 
*/ return ((TARGET_AVX - && VALID_AVX256_REG_OR_OI_MODE (mode)) + && VALID_AVX256_REG_OR_OI_VHF_MODE (mode)) || VALID_SSE_REG_MODE (mode) - || VALID_SSE2_REG_MODE (mode) + || VALID_SSE2_REG_VHF_MODE (mode) || VALID_MMX_REG_MODE (mode) || VALID_MMX_REG_MODE_3DNOW (mode)); } @@ -20022,7 +20045,8 @@ ix86_set_reg_reg_cost (machine_mode mode) case MODE_VECTOR_INT: case MODE_VECTOR_FLOAT: - if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode)) + if ((TARGET_AVX512FP16 && VALID_AVX512FP16_REG_MODE (mode)) + || (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode)) || (TARGET_AVX && VALID_AVX256_REG_MODE (mode)) || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode)) || (TARGET_SSE && VALID_SSE_REG_MODE (mode)) @@ -21935,6 +21959,8 @@ ix86_vector_mode_supported_p (machine_mode mode) if ((TARGET_MMX || TARGET_MMX_WITH_SSE) && VALID_MMX_REG_MODE (mode)) return true; + if (TARGET_AVX512FP16 && VALID_AVX512FP16_REG_MODE (mode)) + return true; if ((TARGET_3DNOW || TARGET_MMX_WITH_SSE) && VALID_MMX_REG_MODE_3DNOW (mode)) return true; diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 2ac8f3e..73237b8 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -1002,8 +1002,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); || (MODE) == V4DImode || (MODE) == V2TImode || (MODE) == V8SFmode \ || (MODE) == V4DFmode) -#define VALID_AVX256_REG_OR_OI_MODE(MODE) \ - (VALID_AVX256_REG_MODE (MODE) || (MODE) == OImode) +#define VALID_AVX256_REG_OR_OI_VHF_MODE(MODE) \ + (VALID_AVX256_REG_MODE (MODE) || (MODE) == OImode || (MODE) == V16HFmode) #define VALID_AVX512F_SCALAR_MODE(MODE) \ ((MODE) == DImode || (MODE) == DFmode || (MODE) == SImode \ @@ -1021,13 +1021,20 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); #define VALID_AVX512VL_128_REG_MODE(MODE) \ ((MODE) == V2DImode || (MODE) == V2DFmode || (MODE) == V16QImode \ || (MODE) == V4SImode || (MODE) == V4SFmode || (MODE) == V8HImode \ - || (MODE) == TFmode || (MODE) == V1TImode) + || (MODE) == TFmode || (MODE) == V1TImode || (MODE) == V8HFmode \ + || (MODE) == TImode) + +#define VALID_AVX512FP16_REG_MODE(MODE) \ + ((MODE) == V8HFmode || (MODE) == V16HFmode || (MODE) == V32HFmode) #define VALID_SSE2_REG_MODE(MODE) \ ((MODE) == V16QImode || (MODE) == V8HImode || (MODE) == V2DFmode \ || (MODE) == V4QImode || (MODE) == V2HImode || (MODE) == V1SImode \ || (MODE) == V2DImode || (MODE) == DFmode || (MODE) == HFmode) +#define VALID_SSE2_REG_VHF_MODE(MODE) \ + (VALID_SSE2_REG_MODE (MODE) || (MODE) == V8HFmode) + #define VALID_SSE_REG_MODE(MODE) \ ((MODE) == V1TImode || (MODE) == TImode \ || (MODE) == V4SFmode || (MODE) == V4SImode \ @@ -1072,7 +1079,7 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); || (MODE) == V4DImode || (MODE) == V8SFmode || (MODE) == V4DFmode \ || (MODE) == V2TImode || (MODE) == V8DImode || (MODE) == V64QImode \ || (MODE) == V16SImode || (MODE) == V32HImode || (MODE) == V8DFmode \ - || (MODE) == V16SFmode) + || (MODE) == V16SFmode || VALID_AVX512FP16_REG_MODE (MODE)) #define X87_FLOAT_MODE_P(MODE) \ (TARGET_80387 && ((MODE) == SFmode || (MODE) == DFmode || (MODE) == XFmode)) diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index dcbbf2b..fe36d7e 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -499,8 +499,8 @@ ;; Main data type used by the insn (define_attr "mode" - "unknown,none,QI,HI,SI,DI,TI,OI,XI,HF,SF,DF,XF,TF,V16SF,V8SF,V4DF,V4SF, - V2DF,V2SF,V1DF,V8DF" + "unknown,none,QI,HI,SI,DI,TI,OI,XI,HF,SF,DF,XF,TF,V32HF,V16HF,V8HF, + 
V16SF,V8SF,V4DF,V4SF,V2DF,V2SF,V1DF,V8DF" (const_string "unknown")) ;; The CPU unit operations uses. @@ -1105,7 +1105,8 @@ (V2DI "16") (V4DI "32") (V8DI "64") (V1TI "16") (V2TI "32") (V4TI "64") (V2DF "16") (V4DF "32") (V8DF "64") - (V4SF "16") (V8SF "32") (V16SF "64")]) + (V4SF "16") (V8SF "32") (V16SF "64") + (V8HF "16") (V16HF "32") (V32HF "64")]) ;; Double word integer modes as mode attribute. (define_mode_attr DWI [(QI "HI") (HI "SI") (SI "DI") (DI "TI") (TI "OI")]) @@ -1241,9 +1242,9 @@ ;; SSE instruction suffix for various modes (define_mode_attr ssemodesuffix [(HF "sh") (SF "ss") (DF "sd") - (V16SF "ps") (V8DF "pd") - (V8SF "ps") (V4DF "pd") - (V4SF "ps") (V2DF "pd") + (V32HF "ph") (V16SF "ps") (V8DF "pd") + (V16HF "ph") (V8SF "ps") (V4DF "pd") + (V8HF "ph") (V4SF "ps") (V2DF "pd") (V16QI "b") (V8HI "w") (V4SI "d") (V2DI "q") (V32QI "b") (V16HI "w") (V8SI "d") (V4DI "q") (V64QI "b") (V32HI "w") (V16SI "d") (V8DI "q")]) diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 5785e73..a1ad410 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -222,6 +222,7 @@ (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI (V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX") V2DI (V4TI "TARGET_AVX512F") (V2TI "TARGET_AVX") V1TI + (V32HF "TARGET_AVX512F") (V16HF "TARGET_AVX") V8HF (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") V2DF]) @@ -237,6 +238,13 @@ [V64QI (V16QI "TARGET_AVX512VL") (V32QI "TARGET_AVX512VL") V32HI (V16HI "TARGET_AVX512VL") (V8HI "TARGET_AVX512VL")]) +(define_mode_iterator VI12HF_AVX512VL + [V64QI (V16QI "TARGET_AVX512VL") (V32QI "TARGET_AVX512VL") + V32HI (V16HI "TARGET_AVX512VL") (V8HI "TARGET_AVX512VL") + (V32HF "TARGET_AVX512FP16") + (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL") + (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL")]) + ;; Same iterator, but without supposed TARGET_AVX512BW (define_mode_iterator VI12_AVX512VLBW [(V64QI "TARGET_AVX512BW") (V16QI "TARGET_AVX512VL") @@ -252,6 +260,8 @@ (V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX") V8HI (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI (V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX") V2DI + (V32HF "TARGET_AVX512FP16") (V16HF "TARGET_AVX512FP16") + (V8HF "TARGET_AVX512FP16") (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")]) @@ -274,7 +284,8 @@ (define_mode_iterator V_256_512 [V32QI V16HI V8SI V4DI V8SF V4DF (V64QI "TARGET_AVX512F") (V32HI "TARGET_AVX512F") (V16SI "TARGET_AVX512F") - (V8DI "TARGET_AVX512F") (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")]) + (V8DI "TARGET_AVX512F") (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F") + (V16HF "TARGET_AVX512FP16") (V32HF "TARGET_AVX512FP16")]) ;; All vector float modes (define_mode_iterator VF @@ -318,6 +329,11 @@ (define_mode_iterator VF_128 [V4SF (V2DF "TARGET_SSE2")]) +;; All 128bit vector HF/SF/DF modes +(define_mode_iterator VFH_128 + [(V8HF "TARGET_AVX512FP16") + V4SF (V2DF "TARGET_SSE2")]) + ;; All 256bit vector float modes (define_mode_iterator VF_256 [V8SF V4DF]) @@ -352,6 +368,9 @@ (define_mode_iterator VF1_AVX512VL [V16SF (V8SF "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL")]) +(define_mode_iterator VF_AVX512FP16 + [V32HF V16HF V8HF]) + ;; All vector integer modes (define_mode_iterator VI [(V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F") @@ -360,6 +379,16 @@ (V8SI "TARGET_AVX") V4SI (V4DI "TARGET_AVX") V2DI]) +;; All vector integer and HF modes +(define_mode_iterator VIHF + [(V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F") + (V64QI 
"TARGET_AVX512BW") (V32QI "TARGET_AVX") V16QI + (V32HI "TARGET_AVX512BW") (V16HI "TARGET_AVX") V8HI + (V8SI "TARGET_AVX") V4SI + (V4DI "TARGET_AVX") V2DI + (V32HF "TARGET_AVX512FP16") (V16HF "TARGET_AVX512FP16") + (V8HF "TARGET_AVX512FP16")]) + (define_mode_iterator VI_AVX2 [(V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX2") V16QI (V32HI "TARGET_AVX512BW") (V16HI "TARGET_AVX2") V8HI @@ -569,6 +598,7 @@ (V8HI "avx512vl") (V16HI "avx512vl") (V32HI "avx512bw") (V4SI "avx512vl") (V8SI "avx512vl") (V16SI "avx512f") (V2DI "avx512vl") (V4DI "avx512vl") (V8DI "avx512f") + (V8HF "avx512fp16") (V16HF "avx512vl") (V32HF "avx512bw") (V4SF "avx512vl") (V8SF "avx512vl") (V16SF "avx512f") (V2DF "avx512vl") (V4DF "avx512vl") (V8DF "avx512f")]) @@ -629,12 +659,13 @@ (V8HI "avx512vl") (V16HI "avx512vl") (V32HI "avx512bw")]) (define_mode_attr shuffletype - [(V16SF "f") (V16SI "i") (V8DF "f") (V8DI "i") - (V8SF "f") (V8SI "i") (V4DF "f") (V4DI "i") - (V4SF "f") (V4SI "i") (V2DF "f") (V2DI "i") - (V32HI "i") (V16HI "i") (V8HI "i") - (V64QI "i") (V32QI "i") (V16QI "i") - (V4TI "i") (V2TI "i") (V1TI "i")]) + [(V32HF "f") (V16HF "f") (V8HF "f") + (V16SF "f") (V16SI "i") (V8DF "f") (V8DI "i") + (V8SF "f") (V8SI "i") (V4DF "f") (V4DI "i") + (V4SF "f") (V4SI "i") (V2DF "f") (V2DI "i") + (V32HI "i") (V16HI "i") (V8HI "i") + (V64QI "i") (V32QI "i") (V16QI "i") + (V4TI "i") (V2TI "i") (V1TI "i")]) (define_mode_attr ssequartermode [(V16SF "V4SF") (V8DF "V2DF") (V16SI "V4SI") (V8DI "V2DI")]) @@ -671,6 +702,8 @@ ;; All 128 and 256bit vector integer modes (define_mode_iterator VI_128_256 [V16QI V8HI V4SI V2DI V32QI V16HI V8SI V4DI]) +;; All 256bit vector integer and HF modes +(define_mode_iterator VIHF_256 [V32QI V16HI V8SI V4DI V16HF]) ;; Various 128bit vector integer mode combinations (define_mode_iterator VI12_128 [V16QI V8HI]) @@ -697,6 +730,9 @@ (define_mode_iterator VI4_256_8_512 [V8SI V8DI]) (define_mode_iterator VI_AVX512BW [V16SI V8DI (V32HI "TARGET_AVX512BW") (V64QI "TARGET_AVX512BW")]) +(define_mode_iterator VIHF_AVX512BW + [V16SI V8DI (V32HI "TARGET_AVX512BW") (V64QI "TARGET_AVX512BW") + (V32HF "TARGET_AVX512FP16")]) ;; Int-float size matches (define_mode_iterator VI4F_128 [V4SI V4SF]) @@ -737,6 +773,9 @@ (V8SF "TARGET_AVX512VL") (V4DF "TARGET_AVX512VL") V16SF V8DF]) +(define_mode_iterator V16_256 [V16HI V16HF]) +(define_mode_iterator V32_512 [V32HI V32HF]) + (define_mode_attr avx512bcst [(V4SI "%{1to4%}") (V2DI "%{1to2%}") (V8SI "%{1to8%}") (V4DI "%{1to4%}") @@ -747,8 +786,10 @@ ;; Mapping from float mode to required SSE level (define_mode_attr sse - [(SF "sse") (DF "sse2") + [(SF "sse") (DF "sse2") (HF "avx512fp16") (V4SF "sse") (V2DF "sse2") + (V32HF "avx512fp16") (V16HF "avx512fp16") + (V8HF "avx512fp16") (V16SF "avx512f") (V8SF "avx") (V8DF "avx512f") (V4DF "avx")]) @@ -784,6 +825,7 @@ (V16SF "V16SF") (V8DF "V8DF") (V8SF "V8SF") (V4DF "V4DF") (V4SF "V4SF") (V2DF "V2DF") + (V8HF "TI") (V16HF "OI") (V32HF "XI") (TI "TI")]) ;; SSE constant -1 constraint @@ -791,9 +833,16 @@ [(V64QI "BC") (V32HI "BC") (V16SI "BC") (V8DI "BC") (V4TI "BC") (V32QI "BC") (V16HI "BC") (V8SI "BC") (V4DI "BC") (V2TI "BC") (V16QI "BC") (V8HI "BC") (V4SI "BC") (V2DI "BC") (V1TI "BC") - (V16SF "BF") (V8DF "BF") - (V8SF "BF") (V4DF "BF") - (V4SF "BF") (V2DF "BF")]) + (V32HF "BF") (V16SF "BF") (V8DF "BF") + (V16HF "BF") (V8SF "BF") (V4DF "BF") + (V8HF "BF") (V4SF "BF") (V2DF "BF")]) + +;; SSE integer instruction suffix for various modes +(define_mode_attr sseintmodesuffix + [(V16QI "b") (V8HI "w") (V4SI "d") (V2DI "q") + (V32QI "b") (V16HI 
"w") (V8SI "d") (V4DI "q") + (V64QI "b") (V32HI "w") (V16SI "d") (V8DI "q") + (V8HF "w") (V16HF "w") (V32HF "w")]) ;; Mapping of vector modes to corresponding mask size (define_mode_attr avx512fmaskmode @@ -801,6 +850,7 @@ (V32HI "SI") (V16HI "HI") (V8HI "QI") (V4HI "QI") (V16SI "HI") (V8SI "QI") (V4SI "QI") (V8DI "QI") (V4DI "QI") (V2DI "QI") + (V32HF "SI") (V16HF "HI") (V8HF "QI") (V16SF "HI") (V8SF "QI") (V4SF "QI") (V8DF "QI") (V4DF "QI") (V2DF "QI")]) @@ -810,6 +860,7 @@ (V32HI "si") (V16HI "hi") (V8HI "qi") (V4HI "qi") (V16SI "hi") (V8SI "qi") (V4SI "qi") (V8DI "qi") (V4DI "qi") (V2DI "qi") + (V32HF "si") (V16HF "hi") (V8HF "qi") (V16SF "hi") (V8SF "qi") (V4SF "qi") (V8DF "qi") (V4DF "qi") (V2DF "qi")]) @@ -854,7 +905,8 @@ (V16QI "V32QI") (V8HI "V16HI") (V4SI "V8SI") (V2DI "V4DI") (V16SF "V32SF") (V8DF "V16DF") (V8SF "V16SF") (V4DF "V8DF") - (V4SF "V8SF") (V2DF "V4DF")]) + (V4SF "V8SF") (V2DF "V4DF") + (V32HF "V64HF") (V16HF "V32HF") (V8HF "V16HF")]) ;; Mapping of vector modes to a vector mode of half size ;; instead of V1DI/V1DF, DI/DF are used for V2DI/V2DF although they are scalar. @@ -864,7 +916,8 @@ (V16QI "V8QI") (V8HI "V4HI") (V4SI "V2SI") (V2DI "DI") (V16SF "V8SF") (V8DF "V4DF") (V8SF "V4SF") (V4DF "V2DF") - (V4SF "V2SF") (V2DF "DF")]) + (V4SF "V2SF") (V2DF "DF") + (V32HF "V16HF") (V16HF "V8HF") (V8HF "V4HF")]) (define_mode_attr ssehalfvecmodelower [(V64QI "v32qi") (V32HI "v16hi") (V16SI "v8si") (V8DI "v4di") (V4TI "v2ti") @@ -872,9 +925,10 @@ (V16QI "v8qi") (V8HI "v4hi") (V4SI "v2si") (V16SF "v8sf") (V8DF "v4df") (V8SF "v4sf") (V4DF "v2df") - (V4SF "v2sf")]) + (V4SF "v2sf") + (V32HF "v16hf") (V16HF "v8hf") (V8HF "v4hf")]) -;; Mapping of vector modes ti packed single mode of the same size +;; Mapping of vector modes to packed single mode of the same size (define_mode_attr ssePSmode [(V16SI "V16SF") (V8DF "V16SF") (V16SF "V16SF") (V8DI "V16SF") @@ -884,7 +938,8 @@ (V4DI "V8SF") (V2DI "V4SF") (V4TI "V16SF") (V2TI "V8SF") (V1TI "V4SF") (V8SF "V8SF") (V4SF "V4SF") - (V4DF "V8SF") (V2DF "V4SF")]) + (V4DF "V8SF") (V2DF "V4SF") + (V32HF "V16SF") (V16HF "V8SF") (V8HF "V4SF")]) (define_mode_attr ssePSmode2 [(V8DI "V8SF") (V4DI "V4SF")]) @@ -895,6 +950,7 @@ (V32HI "HI") (V16HI "HI") (V8HI "HI") (V16SI "SI") (V8SI "SI") (V4SI "SI") (V8DI "DI") (V4DI "DI") (V2DI "DI") + (V32HF "HF") (V16HF "HF") (V8HF "HF") (V16SF "SF") (V8SF "SF") (V4SF "SF") (V8DF "DF") (V4DF "DF") (V2DF "DF") (V4TI "TI") (V2TI "TI")]) @@ -905,6 +961,7 @@ (V32HI "hi") (V16HI "hi") (V8HI "hi") (V16SI "si") (V8SI "si") (V4SI "si") (V8DI "di") (V4DI "di") (V2DI "di") + (V32HF "hf") (V16HF "hf") (V8HF "hf") (V16SF "sf") (V8SF "sf") (V4SF "sf") (V8DF "df") (V4DF "df") (V2DF "df") (V4TI "ti") (V2TI "ti")]) @@ -915,6 +972,7 @@ (V32HI "V8HI") (V16HI "V8HI") (V8HI "V8HI") (V16SI "V4SI") (V8SI "V4SI") (V4SI "V4SI") (V8DI "V2DI") (V4DI "V2DI") (V2DI "V2DI") + (V32HF "V8HF") (V16HF "V8HF") (V8HF "V8HF") (V16SF "V4SF") (V8SF "V4SF") (V4SF "V4SF") (V8DF "V2DF") (V4DF "V2DF") (V2DF "V2DF")]) @@ -935,6 +993,7 @@ (V16SI "d") (V8SI "d") (V4SI "d") (V16SF "d") (V8SF "d") (V4SF "d") (V32HI "d") (V16HI "d") (V8HI "d") + (V32HF "d") (V16HF "d") (V8HF "d") (V64QI "d") (V32QI "d") (V16QI "d")]) ;; Number of scalar elements in each vector type @@ -959,10 +1018,11 @@ (V64QI "8") (V32QI "8") (V16QI "8") (V32HI "16") (V16HI "16") (V8HI "16") (V16SI "32") (V8SI "32") (V4SI "32") + (V32HF "16") (V16HF "16") (V8HF "16") (V16SF "32") (V8SF "32") (V4SF "32") (V8DF "64") (V4DF "64") (V2DF "64")]) -;; SSE prefix for integer vector modes +;; SSE prefix 
for integer and HF vector modes (define_mode_attr sseintprefix [(V2DI "p") (V2DF "") (V4DI "p") (V4DF "") @@ -970,16 +1030,16 @@ (V4SI "p") (V4SF "") (V8SI "p") (V8SF "") (V16SI "p") (V16SF "") - (V16QI "p") (V8HI "p") - (V32QI "p") (V16HI "p") - (V64QI "p") (V32HI "p")]) + (V16QI "p") (V8HI "p") (V8HF "p") + (V32QI "p") (V16HI "p") (V16HF "p") + (V64QI "p") (V32HI "p") (V32HF "p")]) ;; SSE scalar suffix for vector modes (define_mode_attr ssescalarmodesuffix - [(SF "ss") (DF "sd") - (V16SF "ss") (V8DF "sd") - (V8SF "ss") (V4DF "sd") - (V4SF "ss") (V2DF "sd") + [(HF "sh") (SF "ss") (DF "sd") + (V32HF "sh") (V16SF "ss") (V8DF "sd") + (V16HF "sh") (V8SF "ss") (V4DF "sd") + (V8HF "sh") (V4SF "ss") (V2DF "sd") (V16SI "d") (V8DI "q") (V8SI "d") (V4DI "q") (V4SI "d") (V2DI "q")]) @@ -1007,7 +1067,8 @@ ;; i128 for integer vectors and TARGET_AVX2, f128 otherwise. ;; i64x4 or f64x4 for 512bit modes. (define_mode_attr i128 - [(V16SF "f64x4") (V8SF "f128") (V8DF "f64x4") (V4DF "f128") + [(V16HF "%~128") (V32HF "i64x4") (V16SF "f64x4") (V8SF "f128") + (V8DF "f64x4") (V4DF "f128") (V64QI "i64x4") (V32QI "%~128") (V32HI "i64x4") (V16HI "%~128") (V16SI "i64x4") (V8SI "%~128") (V8DI "i64x4") (V4DI "%~128")]) @@ -1031,14 +1092,18 @@ (V32HI "w") (V16HI "w") (V8HI "w") (V16SI "d") (V8SI "d") (V4SI "d") (V8DI "q") (V4DI "q") (V2DI "q") + (V32HF "w") (V16HF "w") (V8HF "w") (V16SF "ss") (V8SF "ss") (V4SF "ss") (V8DF "sd") (V4DF "sd") (V2DF "sd")]) ;; Tie mode of assembler operand to mode iterator (define_mode_attr xtg_mode - [(V16QI "x") (V8HI "x") (V4SI "x") (V2DI "x") (V4SF "x") (V2DF "x") - (V32QI "t") (V16HI "t") (V8SI "t") (V4DI "t") (V8SF "t") (V4DF "t") - (V64QI "g") (V32HI "g") (V16SI "g") (V8DI "g") (V16SF "g") (V8DF "g")]) + [(V16QI "x") (V8HI "x") (V4SI "x") (V2DI "x") + (V8HF "x") (V4SF "x") (V2DF "x") + (V32QI "t") (V16HI "t") (V8SI "t") (V4DI "t") + (V16HF "t") (V8SF "t") (V4DF "t") + (V64QI "g") (V32HI "g") (V16SI "g") (V8DI "g") + (V32HF "g") (V16SF "g") (V8DF "g")]) ;; Half mask mode for unpacks (define_mode_attr HALFMASKMODE @@ -1334,6 +1399,20 @@ (set_attr "prefix" "evex") (set_attr "mode" "")]) +(define_insn "_blendm" + [(set (match_operand:VF_AVX512FP16 0 "register_operand" "=v,v") + (vec_merge:VF_AVX512FP16 + (match_operand:VF_AVX512FP16 2 "nonimmediate_operand" "vm,vm") + (match_operand:VF_AVX512FP16 1 "nonimm_or_0_operand" "0C,v") + (match_operand: 3 "register_operand" "Yk,Yk")))] + "TARGET_AVX512BW" + "@ + vmovdqu\t{%2, %0%{%3%}%N1|%0%{%3%}%N1, %2} + vpblendmw\t{%2, %1, %0%{%3%}|%0%{%3%}, %1, %2}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "evex") + (set_attr "mode" "")]) + (define_insn "_store_mask" [(set (match_operand:V48_AVX512VL 0 "memory_operand" "=m") (vec_merge:V48_AVX512VL @@ -1963,12 +2042,12 @@ ;; Standard scalar operation patterns which preserve the rest of the ;; vector for combiner. 
(define_insn "*_vm3" - [(set (match_operand:VF_128 0 "register_operand" "=x,v") - (vec_merge:VF_128 - (vec_duplicate:VF_128 + [(set (match_operand:VFH_128 0 "register_operand" "=x,v") + (vec_merge:VFH_128 + (vec_duplicate:VFH_128 (plusminus: (vec_select: - (match_operand:VF_128 1 "register_operand" "0,v") + (match_operand:VFH_128 1 "register_operand" "0,v") (parallel [(const_int 0)])) (match_operand: 2 "nonimmediate_operand" "xm,vm"))) (match_dup 1) @@ -1979,7 +2058,16 @@ v\t{%2, %1, %0|%0, %1, %2}" [(set_attr "isa" "noavx,avx") (set_attr "type" "sseadd") - (set_attr "prefix" "orig,vex") + (set (attr "prefix") + (cond [(eq_attr "alternative" "0") + (const_string "orig") + (eq_attr "alternative" "1") + (if_then_else + (match_test "mode == V8HFmode") + (const_string "evex") + (const_string "vex")) + ] + (const_string "*"))) (set_attr "mode" "")]) (define_insn "_vm3" @@ -2044,12 +2132,12 @@ ;; Standard scalar operation patterns which preserve the rest of the ;; vector for combiner. (define_insn "*_vm3" - [(set (match_operand:VF_128 0 "register_operand" "=x,v") - (vec_merge:VF_128 - (vec_duplicate:VF_128 + [(set (match_operand:VFH_128 0 "register_operand" "=x,v") + (vec_merge:VFH_128 + (vec_duplicate:VFH_128 (multdiv: (vec_select: - (match_operand:VF_128 1 "register_operand" "0,v") + (match_operand:VFH_128 1 "register_operand" "0,v") (parallel [(const_int 0)])) (match_operand: 2 "nonimmediate_operand" "xm,vm"))) (match_dup 1) @@ -2060,7 +2148,16 @@ v\t{%2, %1, %0|%0, %1, %2}" [(set_attr "isa" "noavx,avx") (set_attr "type" "sse") - (set_attr "prefix" "orig,vex") + (set (attr "prefix") + (cond [(eq_attr "alternative" "0") + (const_string "orig") + (eq_attr "alternative" "1") + (if_then_else + (match_test "mode == V8HFmode") + (const_string "evex") + (const_string "vex")) + ] + (const_string "*"))) (set_attr "btver2_decode" "direct,double") (set_attr "mode" "")]) @@ -2482,12 +2579,12 @@ ;; Standard scalar operation patterns which preserve the rest of the ;; vector for combiner. 
(define_insn "*ieee_3" - [(set (match_operand:VF_128 0 "register_operand" "=x,v") - (vec_merge:VF_128 - (vec_duplicate:VF_128 + [(set (match_operand:VFH_128 0 "register_operand" "=x,v") + (vec_merge:VFH_128 + (vec_duplicate:VFH_128 (unspec: [(vec_select: - (match_operand:VF_128 1 "register_operand" "0,v") + (match_operand:VFH_128 1 "register_operand" "0,v") (parallel [(const_int 0)])) (match_operand: 2 "nonimmediate_operand" "xm,vm")] IEEE_MAXMIN)) @@ -2500,7 +2597,16 @@ [(set_attr "isa" "noavx,avx") (set_attr "type" "sseadd") (set_attr "btver2_sse_attr" "maxmin") - (set_attr "prefix" "orig,vex") + (set (attr "prefix") + (cond [(eq_attr "alternative" "0") + (const_string "orig") + (eq_attr "alternative" "1") + (if_then_else + (match_test "mode == V8HFmode") + (const_string "evex") + (const_string "vex")) + ] + (const_string "*"))) (set_attr "mode" "")]) (define_insn "_vm3" @@ -8576,6 +8682,47 @@ ] (symbol_ref "true")))]) +;; vmovw clears also the higer bits +(define_insn "vec_set_0" + [(set (match_operand:VF_AVX512FP16 0 "register_operand" "=v,v") + (vec_merge:VF_AVX512FP16 + (vec_duplicate:VF_AVX512FP16 + (match_operand:HF 2 "nonimmediate_operand" "r,m")) + (match_operand:VF_AVX512FP16 1 "const0_operand" "C,C") + (const_int 1)))] + "TARGET_AVX512FP16" + "@ + vmovw\t{%k2, %x0|%x0, %k2} + vmovw\t{%2, %x0|%x0, %2}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "evex") + (set_attr "mode" "HF")]) + +(define_insn "*avx512fp16_movsh" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_merge:V8HF + (vec_duplicate:V8HF + (match_operand:HF 2 "register_operand" "v")) + (match_operand:V8HF 1 "register_operand" "v") + (const_int 1)))] + "TARGET_AVX512FP16" + "vmovsh\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "evex") + (set_attr "mode" "HF")]) + +(define_insn "avx512fp16_movsh" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_merge:V8HF + (match_operand:V8HF 2 "register_operand" "v") + (match_operand:V8HF 1 "register_operand" "v") + (const_int 1)))] + "TARGET_AVX512FP16" + "vmovsh\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "evex") + (set_attr "mode" "HF")]) + ;; A subset is vec_setv4sf. 
(define_insn "*vec_setv4sf_sse4_1" [(set (match_operand:V4SF 0 "register_operand" "=Yr,*x,v") @@ -8711,6 +8858,20 @@ DONE; }) +(define_expand "vec_setv8hf" + [(match_operand:V8HF 0 "register_operand") + (match_operand:HF 1 "register_operand") + (match_operand 2 "vec_setm_sse41_operand")] + "TARGET_SSE" +{ + if (CONST_INT_P (operands[2])) + ix86_expand_vector_set (false, operands[0], operands[1], + INTVAL (operands[2])); + else + ix86_expand_vector_set_var (operands[0], operands[1], operands[2]); + DONE; +}) + (define_expand "vec_set" [(match_operand:V_256_512 0 "register_operand") (match_operand: 1 "register_operand") @@ -9426,10 +9587,10 @@ (set_attr "length_immediate" "1") (set_attr "mode" "")]) -(define_insn_and_split "vec_extract_lo_v32hi" - [(set (match_operand:V16HI 0 "nonimmediate_operand" "=v,v,m") - (vec_select:V16HI - (match_operand:V32HI 1 "nonimmediate_operand" "v,m,v") +(define_insn_and_split "vec_extract_lo_" + [(set (match_operand: 0 "nonimmediate_operand" "=v,v,m") + (vec_select: + (match_operand:V32_512 1 "nonimmediate_operand" "v,m,v") (parallel [(const_int 0) (const_int 1) (const_int 2) (const_int 3) (const_int 4) (const_int 5) @@ -9456,9 +9617,10 @@ if (!TARGET_AVX512VL && REG_P (operands[0]) && EXT_REX_SSE_REG_P (operands[1])) - operands[0] = lowpart_subreg (V32HImode, operands[0], V16HImode); + operands[0] = lowpart_subreg (mode, operands[0], + mode); else - operands[1] = gen_lowpart (V16HImode, operands[1]); + operands[1] = gen_lowpart (mode, operands[1]); } [(set_attr "type" "sselog1") (set_attr "prefix_extra" "1") @@ -9467,10 +9629,10 @@ (set_attr "prefix" "evex") (set_attr "mode" "XI")]) -(define_insn "vec_extract_hi_v32hi" - [(set (match_operand:V16HI 0 "nonimmediate_operand" "=vm") - (vec_select:V16HI - (match_operand:V32HI 1 "register_operand" "v") +(define_insn "vec_extract_hi_" + [(set (match_operand: 0 "nonimmediate_operand" "=vm") + (vec_select: + (match_operand:V32_512 1 "register_operand" "v") (parallel [(const_int 16) (const_int 17) (const_int 18) (const_int 19) (const_int 20) (const_int 21) @@ -9487,10 +9649,10 @@ (set_attr "prefix" "evex") (set_attr "mode" "XI")]) -(define_insn_and_split "vec_extract_lo_v16hi" - [(set (match_operand:V8HI 0 "nonimmediate_operand" "=v,m") - (vec_select:V8HI - (match_operand:V16HI 1 "nonimmediate_operand" "vm,v") +(define_insn_and_split "vec_extract_lo_" + [(set (match_operand: 0 "nonimmediate_operand" "=v,m") + (vec_select: + (match_operand:V16_256 1 "nonimmediate_operand" "vm,v") (parallel [(const_int 0) (const_int 1) (const_int 2) (const_int 3) (const_int 4) (const_int 5) @@ -9499,12 +9661,12 @@ "#" "&& reload_completed" [(set (match_dup 0) (match_dup 1))] - "operands[1] = gen_lowpart (V8HImode, operands[1]);") + "operands[1] = gen_lowpart (mode, operands[1]);") -(define_insn "vec_extract_hi_v16hi" - [(set (match_operand:V8HI 0 "nonimmediate_operand" "=xm,vm,vm") - (vec_select:V8HI - (match_operand:V16HI 1 "register_operand" "x,v,v") +(define_insn "vec_extract_hi_" + [(set (match_operand: 0 "nonimmediate_operand" "=xm,vm,vm") + (vec_select: + (match_operand:V16_256 1 "register_operand" "x,v,v") (parallel [(const_int 8) (const_int 9) (const_int 10) (const_int 11) (const_int 12) (const_int 13) @@ -9640,12 +9802,41 @@ (set_attr "prefix" "vex,evex,evex") (set_attr "mode" "OI")]) +;; NB: *vec_extract_0 must be placed before *vec_extracthf. +;; Otherwise, it will be ignored. 
+(define_insn_and_split "*vec_extract_0" + [(set (match_operand:HF 0 "nonimmediate_operand" "=v,m,r") + (vec_select:HF + (match_operand:VF_AVX512FP16 1 "nonimmediate_operand" "vm,v,m") + (parallel [(const_int 0)])))] + "TARGET_AVX512FP16 && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "#" + "&& reload_completed" + [(set (match_dup 0) (match_dup 1))] + "operands[1] = gen_lowpart (HFmode, operands[1]);") + +(define_insn "*vec_extracthf" + [(set (match_operand:HF 0 "register_sse4nonimm_operand" "=r,m") + (vec_select:HF + (match_operand:V8HF 1 "register_operand" "v,v") + (parallel + [(match_operand:SI 2 "const_0_to_7_operand")])))] + "TARGET_AVX512FP16" + "@ + vpextrw\t{%2, %1, %k0|%k0, %1, %2} + vpextrw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog1") + (set_attr "prefix" "maybe_evex") + (set_attr "mode" "TI")]) + ;; Modes handled by vec_extract patterns. (define_mode_iterator VEC_EXTRACT_MODE [(V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX") V16QI (V32HI "TARGET_AVX512BW") (V16HI "TARGET_AVX") V8HI (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI (V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX") V2DI + (V32HF "TARGET_AVX512FP16") (V16HF "TARGET_AVX512FP16") + (V8HF "TARGET_AVX512FP16") (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") V2DF (V4TI "TARGET_AVX512F") (V2TI "TARGET_AVX")]) @@ -15360,16 +15551,16 @@ ;; Modes handled by pinsr patterns. (define_mode_iterator PINSR_MODE - [(V16QI "TARGET_SSE4_1") V8HI + [(V16QI "TARGET_SSE4_1") V8HI (V8HF "TARGET_AVX512FP16") (V4SI "TARGET_SSE4_1") (V2DI "TARGET_SSE4_1 && TARGET_64BIT")]) (define_mode_attr sse2p4_1 - [(V16QI "sse4_1") (V8HI "sse2") + [(V16QI "sse4_1") (V8HI "sse2") (V8HF "sse4_1") (V4SI "sse4_1") (V2DI "sse4_1")]) (define_mode_attr pinsr_evex_isa - [(V16QI "avx512bw") (V8HI "avx512bw") + [(V16QI "avx512bw") (V8HI "avx512bw") (V8HF "avx512bw") (V4SI "avx512dq") (V2DI "avx512dq")]) ;; sse4_1_pinsrd must come before sse2_loadld since it is preferred. 
@@ -15397,11 +15588,19 @@ case 2: case 4: if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode)) - return "vpinsr\t{%3, %k2, %1, %0|%0, %1, %k2, %3}"; + { + if (mode == V8HFmode) + return "vpinsrw\t{%3, %k2, %1, %0|%0, %1, %k2, %3}"; + else + return "vpinsr\t{%3, %k2, %1, %0|%0, %1, %k2, %3}"; + } /* FALLTHRU */ case 3: case 5: - return "vpinsr\t{%3, %2, %1, %0|%0, %1, %2, %3}"; + if (mode == V8HFmode) + return "vpinsrw\t{%3, %2, %1, %0|%0, %1, %2, %3}"; + else + return "vpinsr\t{%3, %2, %1, %0|%0, %1, %2, %3}"; default: gcc_unreachable (); } @@ -22151,16 +22350,17 @@ [(V64QI "avx512bw") (V32QI "avx512bw") (V16QI "avx512bw") (V32HI "avx512bw") (V16HI "avx512bw") (V8HI "avx512bw") (V16SI "avx512f") (V8SI "avx512f") (V4SI "avx512f") - (V8DI "avx512f") (V4DI "avx512f") (V2DI "avx512f")]) + (V8DI "avx512f") (V4DI "avx512f") (V2DI "avx512f") + (V32HF "avx512bw") (V16HF "avx512bw") (V8HF "avx512bw")]) (define_insn "avx2_pbroadcast" - [(set (match_operand:VI 0 "register_operand" "=x,v") - (vec_duplicate:VI + [(set (match_operand:VIHF 0 "register_operand" "=x,v") + (vec_duplicate:VIHF (vec_select: (match_operand: 1 "nonimmediate_operand" "xm,vm") (parallel [(const_int 0)]))))] "TARGET_AVX2" - "vpbroadcast\t{%1, %0|%0, %1}" + "vpbroadcast\t{%1, %0|%0, %1}" [(set_attr "isa" "*,") (set_attr "type" "ssemov") (set_attr "prefix_extra" "1") @@ -22168,17 +22368,17 @@ (set_attr "mode" "")]) (define_insn "avx2_pbroadcast_1" - [(set (match_operand:VI_256 0 "register_operand" "=x,x,v,v") - (vec_duplicate:VI_256 + [(set (match_operand:VIHF_256 0 "register_operand" "=x,x,v,v") + (vec_duplicate:VIHF_256 (vec_select: - (match_operand:VI_256 1 "nonimmediate_operand" "m,x,m,v") + (match_operand:VIHF_256 1 "nonimmediate_operand" "m,x,m,v") (parallel [(const_int 0)]))))] "TARGET_AVX2" "@ - vpbroadcast\t{%1, %0|%0, %1} - vpbroadcast\t{%x1, %0|%0, %x1} - vpbroadcast\t{%1, %0|%0, %1} - vpbroadcast\t{%x1, %0|%0, %x1}" + vpbroadcast\t{%1, %0|%0, %1} + vpbroadcast\t{%x1, %0|%0, %x1} + vpbroadcast\t{%1, %0|%0, %1} + vpbroadcast\t{%x1, %0|%0, %x1}" [(set_attr "isa" "*,*,,") (set_attr "type" "ssemov") (set_attr "prefix_extra" "1") @@ -22532,15 +22732,15 @@ (set_attr "mode" "V4DF")]) (define_insn "_vec_dup_1" - [(set (match_operand:VI_AVX512BW 0 "register_operand" "=v,v") - (vec_duplicate:VI_AVX512BW + [(set (match_operand:VIHF_AVX512BW 0 "register_operand" "=v,v") + (vec_duplicate:VIHF_AVX512BW (vec_select: - (match_operand:VI_AVX512BW 1 "nonimmediate_operand" "v,m") + (match_operand:VIHF_AVX512BW 1 "nonimmediate_operand" "v,m") (parallel [(const_int 0)]))))] "TARGET_AVX512F" "@ - vpbroadcast\t{%x1, %0|%0, %x1} - vpbroadcast\t{%x1, %0|%0, %1}" + vpbroadcast\t{%x1, %0|%0, %x1} + vpbroadcast\t{%x1, %0|%0, %1}" [(set_attr "type" "ssemov") (set_attr "prefix" "evex") (set_attr "mode" "")]) @@ -22565,8 +22765,8 @@ (set_attr "mode" "")]) (define_insn "_vec_dup" - [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v") - (vec_duplicate:VI12_AVX512VL + [(set (match_operand:VI12HF_AVX512VL 0 "register_operand" "=v") + (vec_duplicate:VI12HF_AVX512VL (vec_select: (match_operand: 1 "nonimmediate_operand" "vm") (parallel [(const_int 0)]))))] @@ -22601,8 +22801,8 @@ (set_attr "mode" "")]) (define_insn "_vec_dup_gpr" - [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v,v") - (vec_duplicate:VI12_AVX512VL + [(set (match_operand:VI12HF_AVX512VL 0 "register_operand" "=v,v") + (vec_duplicate:VI12HF_AVX512VL (match_operand: 1 "nonimmediate_operand" "vm,r")))] "TARGET_AVX512BW" "@ @@ -22697,7 +22897,7 @@ [(V8SF "ss") (V4DF "sd") (V8SI 
"ss") (V4DI "sd")]) ;; Modes handled by AVX2 vec_dup patterns. (define_mode_iterator AVX2_VEC_DUP_MODE - [V32QI V16QI V16HI V8HI V8SI V4SI]) + [V32QI V16QI V16HI V8HI V8SI V4SI V16HF V8HF]) (define_insn "*vec_dup" [(set (match_operand:AVX2_VEC_DUP_MODE 0 "register_operand" "=x,x,v") @@ -23253,12 +23453,12 @@ (set_attr "prefix" "vex") (set_attr "mode" "")]) -(define_insn "vec_set_lo_v16hi" - [(set (match_operand:V16HI 0 "register_operand" "=x,v") - (vec_concat:V16HI - (match_operand:V8HI 2 "nonimmediate_operand" "xm,vm") - (vec_select:V8HI - (match_operand:V16HI 1 "register_operand" "x,v") +(define_insn "vec_set_lo_" + [(set (match_operand:V16_256 0 "register_operand" "=x,v") + (vec_concat:V16_256 + (match_operand: 2 "nonimmediate_operand" "xm,vm") + (vec_select: + (match_operand:V16_256 1 "register_operand" "x,v") (parallel [(const_int 8) (const_int 9) (const_int 10) (const_int 11) (const_int 12) (const_int 13) @@ -23273,16 +23473,16 @@ (set_attr "prefix" "vex,evex") (set_attr "mode" "OI")]) -(define_insn "vec_set_hi_v16hi" - [(set (match_operand:V16HI 0 "register_operand" "=x,v") - (vec_concat:V16HI - (vec_select:V8HI - (match_operand:V16HI 1 "register_operand" "x,v") +(define_insn "vec_set_hi_" + [(set (match_operand:V16_256 0 "register_operand" "=x,v") + (vec_concat:V16_256 + (vec_select: + (match_operand:V16_256 1 "register_operand" "x,v") (parallel [(const_int 0) (const_int 1) (const_int 2) (const_int 3) (const_int 4) (const_int 5) (const_int 6) (const_int 7)])) - (match_operand:V8HI 2 "nonimmediate_operand" "xm,vm")))] + (match_operand: 2 "nonimmediate_operand" "xm,vm")))] "TARGET_AVX" "@ vinsert%~128\t{$0x1, %2, %1, %0|%0, %1, %2, 0x1} @@ -23459,6 +23659,8 @@ (V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX") V8HI (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI (V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX") V2DI + (V32HF "TARGET_AVX512FP16") (V16HF "TARGET_AVX512FP16") + (V8HF "TARGET_AVX512FP16") (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2") (V4TI "TARGET_AVX512F") (V2TI "TARGET_AVX")]) @@ -23470,6 +23672,8 @@ (V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX") V8HI (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI (V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX") + (V32HF "TARGET_AVX512FP16") (V16HF "TARGET_AVX512FP16") + (V8HF "TARGET_AVX512FP16") (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") (V4TI "TARGET_AVX512F")]) -- cgit v1.1 From a7b626d98a9a821ffb33466818d6aa86cac1d6fd Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Wed, 8 Sep 2021 11:25:31 +0200 Subject: i386: Fix up @xorsign3_1 [PR102224] As the testcase shows, we miscompile @xorsign3_1 if both input operands are in the same register, because the splitter overwrites op1 before with op1 & mask before using op0. For dest = xorsign op0, op0 we can actually simplify it from dest = (op0 & mask) ^ op0 to dest = op0 & ~mask (aka abs). The expander change is an optimization improvement, if we at expansion time know it is xorsign op0, op0, we can emit abs right away and get better code through that. The @xorsign3_1 is a fix for the case where xorsign wouldn't be known to have same operands during expansion, but during RTL optimizations they would appear. For non-AVX we need to use earlyclobber, we require dest and op1 to be the same but op0 must be different because we overwrite op1 first. For AVX the constraints ensure that at most 2 of the 3 operands may be the same register and if both inputs are the same, handles that case. 
This case can be easily tested with the xorsign3 expander change reverted. Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? Thinking about it more this morning, while this patch fixes the problems revealed in the testcase, the recent PR89984 change was buggy too, but perhaps that can be fixed incrementally. Because for AVX the new code destructively modifies op1. If that is different from dest, say on: float foo (float x, float y) { return x * __builtin_copysignf (1.0f, y) + y; } then we get after RA: (insn 8 7 9 2 (set (reg:SF 20 xmm0 [orig:82 _2 ] [82]) (unspec:SF [ (reg:SF 20 xmm0 [88]) (reg:SF 21 xmm1 [89]) (mem/u/c:V4SF (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S16 A128]) ] UNSPEC_XORSIGN)) "hohoho.c":4:12 649 {xorsignsf3_1} (nil)) (insn 9 8 15 2 (set (reg:SF 20 xmm0 [87]) (plus:SF (reg:SF 20 xmm0 [orig:82 _2 ] [82]) (reg:SF 21 xmm1 [89]))) "hohoho.c":4:44 1021 {*fop_sf_comm} (nil)) but split the xorsign into: vandps .LC0(%rip), %xmm1, %xmm1 vxorps %xmm0, %xmm1, %xmm0 and then the addition: vaddss %xmm1, %xmm0, %xmm0 which means we miscompile it - instead of adding y in the end we add __builtin_copysignf (0.0f, y). So, wonder if we don't want instead in addition to the &Yv <- Yv, 0 alternative (enabled for both pre-AVX and AVX as in this patch) the &Yv <- Yv, Yv where destination must be different from inputs and another Yv <- Yv, Yv where it can be the same but then need a match_scratch (with X for the other alternatives and =Yv for the last one). That way we'd always have a safe register we can store the op1 & mask value into, either the destination (in the first alternative known to be equal to op1 which is needed for non-AVX but ok for AVX too), in the second alternative known to be different from both inputs and in the third which could be used for those float bar (float x, float y) { return x * __builtin_copysignf (1.0f, y); } cases where op1 is naturally xmm1 and dest == op0 naturally xmm0 we'd use some other register like xmm2. 2021-09-08 Jakub Jelinek PR target/102224 * config/i386/i386.md (xorsign3): If operands[1] is equal to operands[2], emit abs2 instead. (@xorsign3_1): Add early-clobbers for output operand, enable first alternative even for avx, add another alternative with =&Yv <- 0, Yv, Yvm constraints. * config/i386/i386-expand.c (ix86_split_xorsign): If op0 is equal to op1, emit vpandn instead. * gcc.dg/pr102224.c: New test. * gcc.target/i386/avx-pr102224.c: New test. --- gcc/config/i386/i386-expand.c | 37 ++++++++++++++++++++++++++++++++----- gcc/config/i386/i386.md | 18 ++++++++++++------ 2 files changed, 44 insertions(+), 11 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index dfffbe5..0cc572c 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -2306,12 +2306,39 @@ ix86_split_xorsign (rtx operands[]) mode = GET_MODE (dest); vmode = GET_MODE (mask); - op1 = lowpart_subreg (vmode, op1, mode); - x = gen_rtx_AND (vmode, op1, mask); - emit_insn (gen_rtx_SET (op1, x)); + /* The constraints ensure that for non-AVX dest == op1 is + different from op0, and for AVX that at most two of + dest, op0 and op1 are the same register but the third one + is different. 
*/ + if (rtx_equal_p (op0, op1)) + { + gcc_assert (TARGET_AVX && !rtx_equal_p (op0, dest)); + if (vmode == V4SFmode) + vmode = V4SImode; + else + { + gcc_assert (vmode == V2DFmode); + vmode = V2DImode; + } + mask = lowpart_subreg (vmode, mask, GET_MODE (mask)); + if (MEM_P (mask)) + { + rtx msk = lowpart_subreg (vmode, dest, mode); + emit_insn (gen_rtx_SET (msk, mask)); + mask = msk; + } + op0 = lowpart_subreg (vmode, op0, mode); + x = gen_rtx_AND (vmode, gen_rtx_NOT (vmode, mask), op0); + } + else + { + op1 = lowpart_subreg (vmode, op1, mode); + x = gen_rtx_AND (vmode, op1, mask); + emit_insn (gen_rtx_SET (op1, x)); - op0 = lowpart_subreg (vmode, op0, mode); - x = gen_rtx_XOR (vmode, op1, op0); + op0 = lowpart_subreg (vmode, op0, mode); + x = gen_rtx_XOR (vmode, op1, op0); + } dest = lowpart_subreg (vmode, dest, mode); emit_insn (gen_rtx_SET (dest, x)); diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index fe36d7e..0414f24 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -10910,21 +10910,27 @@ (match_operand:MODEF 1 "register_operand") (match_operand:MODEF 2 "register_operand")] "SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH" - "ix86_expand_xorsign (operands); DONE;") +{ + if (rtx_equal_p (operands[1], operands[2])) + emit_insn (gen_abs2 (operands[0], operands[1])); + else + ix86_expand_xorsign (operands); + DONE; +}) (define_insn_and_split "@xorsign3_1" - [(set (match_operand:MODEF 0 "register_operand" "=Yv,Yv") + [(set (match_operand:MODEF 0 "register_operand" "=&Yv,&Yv,&Yv") (unspec:MODEF - [(match_operand:MODEF 1 "register_operand" "Yv,Yv") - (match_operand:MODEF 2 "register_operand" "0,Yv") - (match_operand: 3 "nonimmediate_operand" "Yvm,Yvm")] + [(match_operand:MODEF 1 "register_operand" "Yv,0,Yv") + (match_operand:MODEF 2 "register_operand" "0,Yv,Yv") + (match_operand: 3 "nonimmediate_operand" "Yvm,Yvm,Yvm")] UNSPEC_XORSIGN))] "SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH" "#" "&& reload_completed" [(const_int 0)] "ix86_split_xorsign (operands); DONE;" - [(set_attr "isa" "noavx,avx")]) + [(set_attr "isa" "*,avx,avx")]) ;; One complement instructions -- cgit v1.1 From 7485a52551d71db2e8bbfc4c484196bcc321a1cd Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Wed, 8 Sep 2021 14:06:10 +0200 Subject: i386: Fix up xorsign for AVX [PR89984] Thinking about it more this morning, while this patch fixes the problems revealed in the testcase, the recent PR89984 change was buggy too, but perhaps that can be fixed incrementally. Because for AVX the new code destructively modifies op1. If that is different from dest, say on: float foo (float x, float y) { return x * __builtin_copysignf (1.0f, y) + y; } then we get after RA: (insn 8 7 9 2 (set (reg:SF 20 xmm0 [orig:82 _2 ] [82]) (unspec:SF [ (reg:SF 20 xmm0 [88]) (reg:SF 21 xmm1 [89]) (mem/u/c:V4SF (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S16 A128]) ] UNSPEC_XORSIGN)) "hohoho.c":4:12 649 {xorsignsf3_1} (nil)) (insn 9 8 15 2 (set (reg:SF 20 xmm0 [87]) (plus:SF (reg:SF 20 xmm0 [orig:82 _2 ] [82]) (reg:SF 21 xmm1 [89]))) "hohoho.c":4:44 1021 {*fop_sf_comm} (nil)) but split the xorsign into: vandps .LC0(%rip), %xmm1, %xmm1 vxorps %xmm0, %xmm1, %xmm0 and then the addition: vaddss %xmm1, %xmm0, %xmm0 which means we miscompile it - instead of adding y in the end we add __builtin_copysignf (0.0f, y). 
So, wonder if we don't want instead in addition to the &Yv <- Yv, 0 alternative (enabled for both pre-AVX and AVX as in this patch) the &Yv <- Yv, Yv where destination must be different from inputs and another Yv <- Yv, Yv where it can be the same but then need a match_scratch (with X for the other alternatives and =Yv for the last one). That way we'd always have a safe register we can store the op1 & mask value into, either the destination (in the first alternative known to be equal to op1 which is needed for non-AVX but ok for AVX too), in the second alternative known to be different from both inputs and in the third which could be used for those float bar (float x, float y) { return x * __builtin_copysignf (1.0f, y); } cases where op1 is naturally xmm1 and dest == op0 naturally xmm0 we'd use some other register like xmm2. On Wed, Sep 08, 2021 at 05:23:40PM +0800, Hongtao Liu wrote: > I'm curious why we need the post_reload splitter @xorsign3_1 > for scalar mode, can't we just expand them into and/xor operations in > the expander, just like vector modes did. Following seems to work for all the testcases I've tried (and in some generates better code than the post-reload splitter). 2021-09-08 Jakub Jelinek liuhongt PR target/89984 * config/i386/i386.md (@xorsign3_1): Remove. * config/i386/i386-expand.c (ix86_expand_xorsign): Expand right away into AND with mask and XOR, using paradoxical subregs. (ix86_split_xorsign): Remove. * config/i386/i386-protos.h (ix86_split_xorsign): Remove. * gcc.target/i386/avx-pr102224.c: Fix up PR number. * gcc.dg/pr89984.c: New test. * gcc.target/i386/avx-pr89984.c: New test. --- gcc/config/i386/i386-expand.c | 59 +++++-------------------------------------- gcc/config/i386/i386-protos.h | 1 - gcc/config/i386/i386.md | 14 ---------- 3 files changed, 7 insertions(+), 67 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 0cc572c..badbacc 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -2270,7 +2270,7 @@ void ix86_expand_xorsign (rtx operands[]) { machine_mode mode, vmode; - rtx dest, op0, op1, mask; + rtx dest, op0, op1, mask, x, temp; dest = operands[0]; op0 = operands[1]; @@ -2285,60 +2285,15 @@ ix86_expand_xorsign (rtx operands[]) else gcc_unreachable (); + temp = gen_reg_rtx (vmode); mask = ix86_build_signbit_mask (vmode, 0, 0); - emit_insn (gen_xorsign3_1 (mode, dest, op0, op1, mask)); -} - -/* Deconstruct an xorsign operation into bit masks. */ - -void -ix86_split_xorsign (rtx operands[]) -{ - machine_mode mode, vmode; - rtx dest, op0, op1, mask, x; - - dest = operands[0]; - op0 = operands[1]; - op1 = operands[2]; - mask = operands[3]; - - mode = GET_MODE (dest); - vmode = GET_MODE (mask); + op1 = lowpart_subreg (vmode, op1, mode); + x = gen_rtx_AND (vmode, op1, mask); + emit_insn (gen_rtx_SET (temp, x)); - /* The constraints ensure that for non-AVX dest == op1 is - different from op0, and for AVX that at most two of - dest, op0 and op1 are the same register but the third one - is different. 
*/ - if (rtx_equal_p (op0, op1)) - { - gcc_assert (TARGET_AVX && !rtx_equal_p (op0, dest)); - if (vmode == V4SFmode) - vmode = V4SImode; - else - { - gcc_assert (vmode == V2DFmode); - vmode = V2DImode; - } - mask = lowpart_subreg (vmode, mask, GET_MODE (mask)); - if (MEM_P (mask)) - { - rtx msk = lowpart_subreg (vmode, dest, mode); - emit_insn (gen_rtx_SET (msk, mask)); - mask = msk; - } - op0 = lowpart_subreg (vmode, op0, mode); - x = gen_rtx_AND (vmode, gen_rtx_NOT (vmode, mask), op0); - } - else - { - op1 = lowpart_subreg (vmode, op1, mode); - x = gen_rtx_AND (vmode, op1, mask); - emit_insn (gen_rtx_SET (op1, x)); - - op0 = lowpart_subreg (vmode, op0, mode); - x = gen_rtx_XOR (vmode, op1, op0); - } + op0 = lowpart_subreg (vmode, op0, mode); + x = gen_rtx_XOR (vmode, temp, op0); dest = lowpart_subreg (vmode, dest, mode); emit_insn (gen_rtx_SET (dest, x)); diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 355df11..72644e3 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -138,7 +138,6 @@ extern void ix86_expand_copysign (rtx []); extern void ix86_split_copysign_const (rtx []); extern void ix86_split_copysign_var (rtx []); extern void ix86_expand_xorsign (rtx []); -extern void ix86_split_xorsign (rtx []); extern bool ix86_unary_operator_ok (enum rtx_code, machine_mode, rtx[]); extern bool ix86_match_ccmode (rtx, machine_mode); extern void ix86_expand_branch (enum rtx_code, rtx, rtx, rtx); diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 0414f24..6b4ceb2 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -10917,20 +10917,6 @@ ix86_expand_xorsign (operands); DONE; }) - -(define_insn_and_split "@xorsign3_1" - [(set (match_operand:MODEF 0 "register_operand" "=&Yv,&Yv,&Yv") - (unspec:MODEF - [(match_operand:MODEF 1 "register_operand" "Yv,0,Yv") - (match_operand:MODEF 2 "register_operand" "0,Yv,Yv") - (match_operand: 3 "nonimmediate_operand" "Yvm,Yvm,Yvm")] - UNSPEC_XORSIGN))] - "SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH" - "#" - "&& reload_completed" - [(const_int 0)] - "ix86_split_xorsign (operands); DONE;" - [(set_attr "isa" "*,avx,avx")]) ;; One complement instructions -- cgit v1.1 From 86e6268cff328e27ee6f90e2afc35b6f437a25cd Mon Sep 17 00:00:00 2001 From: Segher Boessenkool Date: Wed, 8 Sep 2021 13:10:30 +0000 Subject: rs6000: Fix ELFv2 r12 use in epilogue We cannot use r12 here, it is already in use as the GEP (for sibling calls). 2021-09-08 Segher Boessenkool PR target/102107 * config/rs6000/rs6000-logue.c (rs6000_emit_epilogue): For ELFv2 use r11 instead of r12 for restoring CR. --- gcc/config/rs6000/rs6000-logue.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-logue.c b/gcc/config/rs6000/rs6000-logue.c index e363d56..9965a8a 100644 --- a/gcc/config/rs6000/rs6000-logue.c +++ b/gcc/config/rs6000/rs6000-logue.c @@ -4815,6 +4815,10 @@ rs6000_emit_epilogue (enum epilogue_type epilogue_type) else if (REGNO (frame_reg_rtx) == 12) cr_save_regno = 11; + /* For ELFv2 r12 is already in use as the GEP. */ + if (DEFAULT_ABI == ABI_ELFv2) + cr_save_regno = 11; + cr_save_reg = load_cr_save (cr_save_regno, frame_reg_rtx, info->cr_save_offset + frame_off, exit_func); -- cgit v1.1 From 60eec23b5eda0f350e572586eee738eab0804a74 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Wed, 8 Sep 2021 16:19:37 +0800 Subject: Optimize vec_extract for 256/512-bit vector when index exceeds the lower 128 bits. 
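For illustration, the before/after assembly below corresponds to element extracts of roughly this shape (an assumed sketch using GNU C vector subscripts at -O2 -mavx512f, not the committed gcc.target/i386/pr91103-1.c/-2.c testcases):

typedef int v16si __attribute__ ((vector_size (64)));

int
extract_8 (v16si x)
{
  /* Element 8 sits above the low 128 bits: previously extracted via
     vextracti32x8 + vmovd, now a single valignd $8 rotates it into
     the low lane before the vmovd.  */
  return x[8];
}

int
extract_15 (v16si x)
{
  /* Likewise for the last element: valignd $15 instead of
     vextracti32x8 + vextracti128 + vpextrd.  */
  return x[15];
}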
- vextracti32x8 $0x1, %zmm0, %ymm0 - vmovd %xmm0, %eax + valignd $8, %zmm0, %zmm0, %zmm1 + vmovd %xmm1, %eax - vextracti32x8 $0x1, %zmm0, %ymm0 - vextracti128 $0x1, %ymm0, %xmm0 - vpextrd $3, %xmm0, %eax + valignd $15, %zmm0, %zmm0, %zmm1 + vmovd %xmm1, %eax - vextractf64x2 $0x1, %ymm0, %xmm0 + valignq $2, %ymm0, %ymm0, %ymm0 - vextractf64x4 $0x1, %zmm0, %ymm0 - vextractf64x2 $0x1, %ymm0, %xmm0 - vunpckhpd %xmm0, %xmm0, %xmm0 + valignq $7, %zmm0, %zmm0, %zmm0 gcc/ChangeLog: PR target/91103 * config/i386/sse.md (*vec_extract_valign): New define_insn. gcc/testsuite/ChangeLog: PR target/91103 * gcc.target/i386/pr91103-1.c: New test. * gcc.target/i386/pr91103-2.c: New test. --- gcc/config/i386/sse.md | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index a1ad410..ee81fdb 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -233,6 +233,12 @@ V16SF (V8SF "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL") V8DF (V4DF "TARGET_AVX512VL") (V2DF "TARGET_AVX512VL")]) +(define_mode_iterator V48_256_512_AVX512VL + [V16SI (V8SI "TARGET_AVX512VL") + V8DI (V4DI "TARGET_AVX512VL") + V16SF (V8SF "TARGET_AVX512VL") + V8DF (V4DF "TARGET_AVX512VL")]) + ;; 1,2 byte AVX-512{BW,VL} vector modes. Supposed TARGET_AVX512BW baseline. (define_mode_iterator VI12_AVX512VL [V64QI (V16QI "TARGET_AVX512VL") (V32QI "TARGET_AVX512VL") @@ -828,6 +834,15 @@ (V8HF "TI") (V16HF "OI") (V32HF "XI") (TI "TI")]) +(define_mode_attr sseintvecinsnmode + [(V64QI "XI") (V32HI "XI") (V16SI "XI") (V8DI "XI") (V4TI "XI") + (V32QI "OI") (V16HI "OI") (V8SI "OI") (V4DI "OI") (V2TI "OI") + (V16QI "TI") (V8HI "TI") (V4SI "TI") (V2DI "TI") (V1TI "TI") + (V16SF "XI") (V8DF "XI") + (V8SF "OI") (V4DF "OI") + (V4SF "TI") (V2DF "TI") + (TI "TI")]) + ;; SSE constant -1 constraint (define_mode_attr sseconstm1 [(V64QI "BC") (V32HI "BC") (V16SI "BC") (V8DI "BC") (V4TI "BC") @@ -10517,6 +10532,23 @@ [(set_attr "prefix" "evex") (set_attr "mode" "")]) +(define_mode_attr vec_extract_imm_predicate + [(V16SF "const_0_to_15_operand") (V8SF "const_0_to_7_operand") + (V16SI "const_0_to_15_operand") (V8SI "const_0_to_7_operand") + (V8DF "const_0_to_7_operand") (V4DF "const_0_to_3_operand") + (V8DI "const_0_to_7_operand") (V4DI "const_0_to_3_operand")]) + +(define_insn "*vec_extract_valign" + [(set (match_operand: 0 "register_operand" "=v") + (vec_select: + (match_operand:V48_256_512_AVX512VL 1 "register_operand" "v") + (parallel [(match_operand 2 "")])))] + "TARGET_AVX512F + && INTVAL(operands[2]) >= 16 / GET_MODE_SIZE (mode)" + "valign\t{%2, %1, %1, %0|%0, %1, %1, %2}"; + [(set_attr "prefix" "evex") + (set_attr "mode" "")]) + (define_expand "avx512f_shufps512_mask" [(match_operand:V16SF 0 "register_operand") (match_operand:V16SF 1 "register_operand") -- cgit v1.1 From 8f323c712ea76cc4506b03895e9b991e4e4b2baf Mon Sep 17 00:00:00 2001 From: liuhongt Date: Tue, 7 Sep 2021 12:39:04 +0800 Subject: Optimize v4sf reduction. gcc/ChangeLog: PR target/101059 * config/i386/sse.md (reduc_plus_scal_): Split to .. (reduc_plus_scal_v4sf): .. this, New define_expand. (reduc_plus_scal_v2df): .. and this, New define_expand. gcc/testsuite/ChangeLog: PR target/101059 * gcc.target/i386/sse2-pr101059.c: New test. * gcc.target/i386/sse3-pr101059.c: New test. 
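For illustration, a hedged sketch of the kind of reduction this targets (an assumed example, not the committed pr101059 testcases):

float
sum (const float *p, int n)
{
  /* With -Ofast -msse3 (fast-math so the FP reduction may be
     vectorized) the loop epilogue goes through reduc_plus_scal_v4sf;
     the new expander below builds the horizontal sum as roughly
     movshdup / addps / movhlps / addss instead of calling the generic
     ix86_expand_reduc shuffle path.  */
  float s = 0.0f;
  for (int i = 0; i < n; i++)
    s += p[i];
  return s;
}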
--- gcc/config/i386/sse.md | 39 ++++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 11 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index ee81fdb..9c67750 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -2995,19 +2995,36 @@ (set_attr "prefix_rep" "1,*") (set_attr "mode" "V4SF")]) -(define_mode_iterator REDUC_SSE_PLUS_MODE - [(V2DF "TARGET_SSE") (V4SF "TARGET_SSE")]) +(define_expand "reduc_plus_scal_v4sf" + [(plus:V4SF + (match_operand:SF 0 "register_operand") + (match_operand:V4SF 1 "register_operand"))] + "TARGET_SSE" +{ + rtx vtmp = gen_reg_rtx (V4SFmode); + rtx stmp = gen_reg_rtx (SFmode); + if (TARGET_SSE3) + emit_insn (gen_sse3_movshdup (vtmp, operands[1])); + else + emit_insn (gen_sse_shufps (vtmp, operands[1], operands[1], GEN_INT(177))); -(define_expand "reduc_plus_scal_" - [(plus:REDUC_SSE_PLUS_MODE - (match_operand: 0 "register_operand") - (match_operand:REDUC_SSE_PLUS_MODE 1 "register_operand"))] - "" + emit_insn (gen_addv4sf3 (operands[1], operands[1], vtmp)); + emit_insn (gen_sse_movhlps (vtmp, vtmp, operands[1])); + emit_insn (gen_vec_extractv4sfsf (stmp, vtmp, const0_rtx)); + emit_insn (gen_vec_extractv4sfsf (operands[0], operands[1], const0_rtx)); + emit_insn (gen_addsf3 (operands[0], operands[0], stmp)); + DONE; +}) + +(define_expand "reduc_plus_scal_v2df" + [(plus:V2DF + (match_operand:DF 0 "register_operand") + (match_operand:V2DF 1 "register_operand"))] + "TARGET_SSE" { - rtx tmp = gen_reg_rtx (mode); - ix86_expand_reduc (gen_add3, tmp, operands[1]); - emit_insn (gen_vec_extract (operands[0], tmp, - const0_rtx)); + rtx tmp = gen_reg_rtx (V2DFmode); + ix86_expand_reduc (gen_addv2df3, tmp, operands[1]); + emit_insn (gen_vec_extractv2dfdf (operands[0], tmp, const0_rtx)); DONE; }) -- cgit v1.1 From bd7a34ef5564f4240c3839c89d7e695c9ef4e49d Mon Sep 17 00:00:00 2001 From: liuhongt Date: Fri, 18 Jan 2019 14:09:24 -0800 Subject: AVX512FP16: Add vaddph/vsubph/vdivph/vmulph. gcc/ChangeLog: * config.gcc: Add avx512fp16vlintrin.h. * config/i386/avx512fp16intrin.h: (_mm512_add_ph): New intrinsic. (_mm512_mask_add_ph): Likewise. (_mm512_maskz_add_ph): Likewise. (_mm512_sub_ph): Likewise. (_mm512_mask_sub_ph): Likewise. (_mm512_maskz_sub_ph): Likewise. (_mm512_mul_ph): Likewise. (_mm512_mask_mul_ph): Likewise. (_mm512_maskz_mul_ph): Likewise. (_mm512_div_ph): Likewise. (_mm512_mask_div_ph): Likewise. (_mm512_maskz_div_ph): Likewise. (_mm512_add_round_ph): Likewise. (_mm512_mask_add_round_ph): Likewise. (_mm512_maskz_add_round_ph): Likewise. (_mm512_sub_round_ph): Likewise. (_mm512_mask_sub_round_ph): Likewise. (_mm512_maskz_sub_round_ph): Likewise. (_mm512_mul_round_ph): Likewise. (_mm512_mask_mul_round_ph): Likewise. (_mm512_maskz_mul_round_ph): Likewise. (_mm512_div_round_ph): Likewise. (_mm512_mask_div_round_ph): Likewise. (_mm512_maskz_div_round_ph): Likewise. * config/i386/avx512fp16vlintrin.h: New header. * config/i386/i386-builtin-types.def (V16HF, V8HF, V32HF): Add new builtin types. * config/i386/i386-builtin.def: Add corresponding builtins. * config/i386/i386-expand.c (ix86_expand_args_builtin): Handle new builtin types. (ix86_expand_round_builtin): Likewise. * config/i386/immintrin.h: Include avx512fp16vlintrin.h * config/i386/sse.md (VFH): New mode_iterator. (VF2H): Likewise. (avx512fmaskmode): Add HF vector modes. (avx512fmaskhalfmode): Likewise. (3): Adjust to for HF vector modes. (*3): Likewise. (mul3): Likewise. (*mul3): Likewise. (div3): Likewise. (_div3): Likewise. 
* config/i386/subst.md (SUBST_V): Add HF vector modes. (SUBST_A): Likewise. (round_mode512bit_condition): Adjust for V32HFmode. gcc/testsuite/ChangeLog: * gcc.target/i386/avx-1.c: Add -mavx512vl and test for new intrinsics. * gcc.target/i386/avx-2.c: Add -mavx512vl. * gcc.target/i386/avx512fp16-11a.c: New test. * gcc.target/i386/avx512fp16-11b.c: Ditto. * gcc.target/i386/avx512vlfp16-11a.c: Ditto. * gcc.target/i386/avx512vlfp16-11b.c: Ditto. * gcc.target/i386/sse-13.c: Add test for new builtins. * gcc.target/i386/sse-23.c: Ditto. * gcc.target/i386/sse-14.c: Add test for new intrinsics. * gcc.target/i386/sse-22.c: Ditto. --- gcc/config/i386/avx512fp16intrin.h | 251 +++++++++++++++++++++++++++++++++ gcc/config/i386/avx512fp16vlintrin.h | 219 ++++++++++++++++++++++++++++ gcc/config/i386/i386-builtin-types.def | 7 + gcc/config/i386/i386-builtin.def | 20 +++ gcc/config/i386/i386-expand.c | 5 + gcc/config/i386/immintrin.h | 2 + gcc/config/i386/sse.md | 61 +++++--- gcc/config/i386/subst.md | 6 +- 8 files changed, 547 insertions(+), 24 deletions(-) create mode 100644 gcc/config/i386/avx512fp16vlintrin.h (limited to 'gcc/config') diff --git a/gcc/config/i386/avx512fp16intrin.h b/gcc/config/i386/avx512fp16intrin.h index 3fc0770..3e9d676 100644 --- a/gcc/config/i386/avx512fp16intrin.h +++ b/gcc/config/i386/avx512fp16intrin.h @@ -217,6 +217,257 @@ _mm_store_sh (void *__P, __m128h __A) *(_Float16 *) __P = ((__v8hf)__A)[0]; } +/* Intrinsics v[add,sub,mul,div]ph. */ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_add_ph (__m512h __A, __m512h __B) +{ + return (__m512h) ((__v32hf) __A + (__v32hf) __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_add_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D) +{ + return __builtin_ia32_vaddph_v32hf_mask (__C, __D, __A, __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_add_ph (__mmask32 __A, __m512h __B, __m512h __C) +{ + return __builtin_ia32_vaddph_v32hf_mask (__B, __C, + _mm512_setzero_ph (), __A); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sub_ph (__m512h __A, __m512h __B) +{ + return (__m512h) ((__v32hf) __A - (__v32hf) __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sub_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D) +{ + return __builtin_ia32_vsubph_v32hf_mask (__C, __D, __A, __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sub_ph (__mmask32 __A, __m512h __B, __m512h __C) +{ + return __builtin_ia32_vsubph_v32hf_mask (__B, __C, + _mm512_setzero_ph (), __A); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mul_ph (__m512h __A, __m512h __B) +{ + return (__m512h) ((__v32hf) __A * (__v32hf) __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mul_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D) +{ + return __builtin_ia32_vmulph_v32hf_mask (__C, __D, __A, __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mul_ph (__mmask32 __A, __m512h __B, __m512h __C) +{ + return __builtin_ia32_vmulph_v32hf_mask (__B, __C, + _mm512_setzero_ph (), __A); +} + +extern __inline 
__m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_div_ph (__m512h __A, __m512h __B) +{ + return (__m512h) ((__v32hf) __A / (__v32hf) __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_div_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D) +{ + return __builtin_ia32_vdivph_v32hf_mask (__C, __D, __A, __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_div_ph (__mmask32 __A, __m512h __B, __m512h __C) +{ + return __builtin_ia32_vdivph_v32hf_mask (__B, __C, + _mm512_setzero_ph (), __A); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_add_round_ph (__m512h __A, __m512h __B, const int __C) +{ + return __builtin_ia32_vaddph_v32hf_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, __C); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_add_round_ph (__m512h __A, __mmask32 __B, __m512h __C, + __m512h __D, const int __E) +{ + return __builtin_ia32_vaddph_v32hf_mask_round (__C, __D, __A, __B, __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_add_round_ph (__mmask32 __A, __m512h __B, __m512h __C, + const int __D) +{ + return __builtin_ia32_vaddph_v32hf_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, __D); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sub_round_ph (__m512h __A, __m512h __B, const int __C) +{ + return __builtin_ia32_vsubph_v32hf_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, __C); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sub_round_ph (__m512h __A, __mmask32 __B, __m512h __C, + __m512h __D, const int __E) +{ + return __builtin_ia32_vsubph_v32hf_mask_round (__C, __D, __A, __B, __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sub_round_ph (__mmask32 __A, __m512h __B, __m512h __C, + const int __D) +{ + return __builtin_ia32_vsubph_v32hf_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, __D); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mul_round_ph (__m512h __A, __m512h __B, const int __C) +{ + return __builtin_ia32_vmulph_v32hf_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, __C); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mul_round_ph (__m512h __A, __mmask32 __B, __m512h __C, + __m512h __D, const int __E) +{ + return __builtin_ia32_vmulph_v32hf_mask_round (__C, __D, __A, __B, __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mul_round_ph (__mmask32 __A, __m512h __B, __m512h __C, + const int __D) +{ + return __builtin_ia32_vmulph_v32hf_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, __D); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_div_round_ph (__m512h __A, __m512h __B, const int __C) +{ + return __builtin_ia32_vdivph_v32hf_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, __C); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm512_mask_div_round_ph (__m512h __A, __mmask32 __B, __m512h __C, + __m512h __D, const int __E) +{ + return __builtin_ia32_vdivph_v32hf_mask_round (__C, __D, __A, __B, __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_div_round_ph (__mmask32 __A, __m512h __B, __m512h __C, + const int __D) +{ + return __builtin_ia32_vdivph_v32hf_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, __D); +} +#else +#define _mm512_add_round_ph(A, B, C) \ + ((__m512h)__builtin_ia32_vaddph_v32hf_mask_round((A), (B), \ + _mm512_setzero_ph (),\ + (__mmask32)-1, (C))) + +#define _mm512_mask_add_round_ph(A, B, C, D, E) \ + ((__m512h)__builtin_ia32_vaddph_v32hf_mask_round((C), (D), (A), (B), (E))) + +#define _mm512_maskz_add_round_ph(A, B, C, D) \ + ((__m512h)__builtin_ia32_vaddph_v32hf_mask_round((B), (C), \ + _mm512_setzero_ph (),\ + (A), (D))) + +#define _mm512_sub_round_ph(A, B, C) \ + ((__m512h)__builtin_ia32_vsubph_v32hf_mask_round((A), (B), \ + _mm512_setzero_ph (),\ + (__mmask32)-1, (C))) + +#define _mm512_mask_sub_round_ph(A, B, C, D, E) \ + ((__m512h)__builtin_ia32_vsubph_v32hf_mask_round((C), (D), (A), (B), (E))) + +#define _mm512_maskz_sub_round_ph(A, B, C, D) \ + ((__m512h)__builtin_ia32_vsubph_v32hf_mask_round((B), (C), \ + _mm512_setzero_ph (),\ + (A), (D))) + +#define _mm512_mul_round_ph(A, B, C) \ + ((__m512h)__builtin_ia32_vmulph_v32hf_mask_round((A), (B), \ + _mm512_setzero_ph (),\ + (__mmask32)-1, (C))) + +#define _mm512_mask_mul_round_ph(A, B, C, D, E) \ + ((__m512h)__builtin_ia32_vmulph_v32hf_mask_round((C), (D), (A), (B), (E))) + +#define _mm512_maskz_mul_round_ph(A, B, C, D) \ + ((__m512h)__builtin_ia32_vmulph_v32hf_mask_round((B), (C), \ + _mm512_setzero_ph (),\ + (A), (D))) + +#define _mm512_div_round_ph(A, B, C) \ + ((__m512h)__builtin_ia32_vdivph_v32hf_mask_round((A), (B), \ + _mm512_setzero_ph (),\ + (__mmask32)-1, (C))) + +#define _mm512_mask_div_round_ph(A, B, C, D, E) \ + ((__m512h)__builtin_ia32_vdivph_v32hf_mask_round((C), (D), (A), (B), (E))) + +#define _mm512_maskz_div_round_ph(A, B, C, D) \ + ((__m512h)__builtin_ia32_vdivph_v32hf_mask_round((B), (C), \ + _mm512_setzero_ph (),\ + (A), (D))) +#endif /* __OPTIMIZE__ */ + #ifdef __DISABLE_AVX512FP16__ #undef __DISABLE_AVX512FP16__ #pragma GCC pop_options diff --git a/gcc/config/i386/avx512fp16vlintrin.h b/gcc/config/i386/avx512fp16vlintrin.h new file mode 100644 index 0000000..75fa9eb --- /dev/null +++ b/gcc/config/i386/avx512fp16vlintrin.h @@ -0,0 +1,219 @@ +/* Copyright (C) 2019 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . 
*/ + +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use directly; include instead." +#endif + +#ifndef __AVX512FP16VLINTRIN_H_INCLUDED +#define __AVX512FP16VLINTRIN_H_INCLUDED + +#if !defined(__AVX512VL__) || !defined(__AVX512FP16__) +#pragma GCC push_options +#pragma GCC target("avx512fp16,avx512vl") +#define __DISABLE_AVX512FP16VL__ +#endif /* __AVX512FP16VL__ */ + +/* Intrinsics v[add,sub,mul,div]ph. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_ph (__m128h __A, __m128h __B) +{ + return (__m128h) ((__v8hf) __A + (__v8hf) __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_add_ph (__m256h __A, __m256h __B) +{ + return (__m256h) ((__v16hf) __A + (__v16hf) __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_add_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_vaddph_v8hf_mask (__C, __D, __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_add_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D) +{ + return __builtin_ia32_vaddph_v16hf_mask (__C, __D, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_add_ph (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_vaddph_v8hf_mask (__B, __C, _mm_setzero_ph (), + __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_add_ph (__mmask16 __A, __m256h __B, __m256h __C) +{ + return __builtin_ia32_vaddph_v16hf_mask (__B, __C, + _mm256_setzero_ph (), __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_ph (__m128h __A, __m128h __B) +{ + return (__m128h) ((__v8hf) __A - (__v8hf) __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sub_ph (__m256h __A, __m256h __B) +{ + return (__m256h) ((__v16hf) __A - (__v16hf) __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sub_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_vsubph_v8hf_mask (__C, __D, __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sub_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D) +{ + return __builtin_ia32_vsubph_v16hf_mask (__C, __D, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sub_ph (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_vsubph_v8hf_mask (__B, __C, _mm_setzero_ph (), + __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sub_ph (__mmask16 __A, __m256h __B, __m256h __C) +{ + return __builtin_ia32_vsubph_v16hf_mask (__B, __C, + _mm256_setzero_ph (), __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_ph (__m128h __A, __m128h __B) +{ + return (__m128h) ((__v8hf) __A * (__v8hf) __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mul_ph (__m256h __A, __m256h __B) +{ + return (__m256h) ((__v16hf) __A * (__v16hf) __B); +} + +extern __inline __m128h +__attribute__ 
((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mul_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_vmulph_v8hf_mask (__C, __D, __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_mul_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D) +{ + return __builtin_ia32_vmulph_v16hf_mask (__C, __D, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mul_ph (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_vmulph_v8hf_mask (__B, __C, _mm_setzero_ph (), + __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_mul_ph (__mmask16 __A, __m256h __B, __m256h __C) +{ + return __builtin_ia32_vmulph_v16hf_mask (__B, __C, + _mm256_setzero_ph (), __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_div_ph (__m128h __A, __m128h __B) +{ + return (__m128h) ((__v8hf) __A / (__v8hf) __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_div_ph (__m256h __A, __m256h __B) +{ + return (__m256h) ((__v16hf) __A / (__v16hf) __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_div_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_vdivph_v8hf_mask (__C, __D, __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_div_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D) +{ + return __builtin_ia32_vdivph_v16hf_mask (__C, __D, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_div_ph (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_vdivph_v8hf_mask (__B, __C, _mm_setzero_ph (), + __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_div_ph (__mmask16 __A, __m256h __B, __m256h __C) +{ + return __builtin_ia32_vdivph_v16hf_mask (__B, __C, + _mm256_setzero_ph (), __A); +} + +#ifdef __DISABLE_AVX512FP16VL__ +#undef __DISABLE_AVX512FP16VL__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512FP16VL__ */ + +#endif /* __AVX512FP16VLINTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def index 4df6ee1..fdc46bd 100644 --- a/gcc/config/i386/i386-builtin-types.def +++ b/gcc/config/i386/i386-builtin-types.def @@ -98,6 +98,7 @@ DEF_VECTOR_TYPE (V16UQI, UQI, V16QI) # AVX vectors DEF_VECTOR_TYPE (V4DF, DOUBLE) DEF_VECTOR_TYPE (V8SF, FLOAT) +DEF_VECTOR_TYPE (V16HF, FLOAT16) DEF_VECTOR_TYPE (V4DI, DI) DEF_VECTOR_TYPE (V8SI, SI) DEF_VECTOR_TYPE (V16HI, HI) @@ -108,6 +109,7 @@ DEF_VECTOR_TYPE (V16UHI, UHI, V16HI) # AVX512F vectors DEF_VECTOR_TYPE (V32SF, FLOAT) +DEF_VECTOR_TYPE (V32HF, FLOAT16) DEF_VECTOR_TYPE (V16SF, FLOAT) DEF_VECTOR_TYPE (V8DF, DOUBLE) DEF_VECTOR_TYPE (V8DI, DI) @@ -1302,3 +1304,8 @@ DEF_FUNCTION_TYPE (UINT8, PV2DI, PCV2DI, PCVOID) # FP16 builtins DEF_FUNCTION_TYPE (V8HF, V8HI) +DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF, V8HF, UQI) +DEF_FUNCTION_TYPE (V16HF, V16HF, V16HF, V16HF, UHI) +DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, INT) +DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, USI) +DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, USI, INT) diff --git 
a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def index 4b1ae0e..2f152096 100644 --- a/gcc/config/i386/i386-builtin.def +++ b/gcc/config/i386/i386-builtin.def @@ -2774,6 +2774,20 @@ BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf, "__builti BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_mask, "__builtin_ia32_dpbf16ps_v4sf_mask", IX86_BUILTIN_DPHI16PS_V4SF_MASK, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8HI_V8HI_UQI) BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_maskz, "__builtin_ia32_dpbf16ps_v4sf_maskz", IX86_BUILTIN_DPHI16PS_V4SF_MASKZ, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8HI_V8HI_UQI) +/* AVX512FP16. */ +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv8hf3_mask, "__builtin_ia32_vaddph_v8hf_mask", IX86_BUILTIN_VADDPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv16hf3_mask, "__builtin_ia32_vaddph_v16hf_mask", IX86_BUILTIN_VADDPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv32hf3_mask, "__builtin_ia32_vaddph_v32hf_mask", IX86_BUILTIN_VADDPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv8hf3_mask, "__builtin_ia32_vsubph_v8hf_mask", IX86_BUILTIN_VSUBPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv16hf3_mask, "__builtin_ia32_vsubph_v16hf_mask", IX86_BUILTIN_VSUBPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv32hf3_mask, "__builtin_ia32_vsubph_v32hf_mask", IX86_BUILTIN_VSUBPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv8hf3_mask, "__builtin_ia32_vmulph_v8hf_mask", IX86_BUILTIN_VMULPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv16hf3_mask, "__builtin_ia32_vmulph_v16hf_mask", IX86_BUILTIN_VMULPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv32hf3_mask, "__builtin_ia32_vmulph_v32hf_mask", IX86_BUILTIN_VMULPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv8hf3_mask, "__builtin_ia32_vdivph_v8hf_mask", IX86_BUILTIN_VDIVPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv16hf3_mask, "__builtin_ia32_vdivph_v16hf_mask", IX86_BUILTIN_VDIVPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv32hf3_mask, "__builtin_ia32_vdivph_v32hf_mask", IX86_BUILTIN_VDIVPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) + /* Builtins with rounding support. 
*/ BDESC_END (ARGS, ROUND_ARGS) @@ -2973,6 +2987,12 @@ BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_fixuns_truncv8dfv8di2_mask_round, " BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_rangepv16sf_mask_round, "__builtin_ia32_rangeps512_mask", IX86_BUILTIN_RANGEPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT) BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_rangepv8df_mask_round, "__builtin_ia32_rangepd512_mask", IX86_BUILTIN_RANGEPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT) +/* AVX512FP16. */ +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv32hf3_mask_round, "__builtin_ia32_vaddph_v32hf_mask_round", IX86_BUILTIN_VADDPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv32hf3_mask_round, "__builtin_ia32_vsubph_v32hf_mask_round", IX86_BUILTIN_VSUBPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv32hf3_mask_round, "__builtin_ia32_vmulph_v32hf_mask_round", IX86_BUILTIN_VMULPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv32hf3_mask_round, "__builtin_ia32_vdivph_v32hf_mask_round", IX86_BUILTIN_VDIVPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) + BDESC_END (ROUND_ARGS, MULTI_ARG) /* FMA4 and XOP. */ diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index badbacc..ad9c672 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -10038,6 +10038,7 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI: case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI: case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI: + case V32HF_FTYPE_V32HF_V32HF_V32HF_USI: case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI: case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI: case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI: @@ -10055,6 +10056,7 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI: case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI: case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI: + case V16HF_FTYPE_V16HF_V16HF_V16HF_UHI: case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI: case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI: case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI: @@ -10062,6 +10064,7 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI: case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI: case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI: + case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI: case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI: case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI: case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI: @@ -10738,6 +10741,7 @@ ix86_expand_round_builtin (const struct builtin_description *d, case INT_FTYPE_V4SF_INT: nargs = 2; break; + case V32HF_FTYPE_V32HF_V32HF_INT: case V4SF_FTYPE_V4SF_UINT_INT: case V4SF_FTYPE_V4SF_UINT64_INT: case V2DF_FTYPE_V2DF_UINT64_INT: @@ -10778,6 +10782,7 @@ ix86_expand_round_builtin (const struct builtin_description *d, case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT: case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT: case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT: + case V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT: case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT: case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT: case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT: diff --git a/gcc/config/i386/immintrin.h b/gcc/config/i386/immintrin.h index 2421a78..1761c75 100644 --- a/gcc/config/i386/immintrin.h +++ b/gcc/config/i386/immintrin.h @@ -96,6 +96,8 @@ #ifdef __SSE2__ #include + +#include #endif #include diff --git 
a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 9c67750..0633916 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -298,6 +298,13 @@ [(V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")]) +(define_mode_iterator VFH + [(V32HF "TARGET_AVX512FP16") + (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL") + (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL") + (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF + (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")]) + ;; 128- and 256-bit float vector modes (define_mode_iterator VF_128_256 [(V8SF "TARGET_AVX") V4SF @@ -321,6 +328,13 @@ (define_mode_iterator VF2 [(V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") V2DF]) +;; All DFmode & HFmode vector float modes +(define_mode_iterator VF2H + [(V32HF "TARGET_AVX512FP16") + (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL") + (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL") + (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") V2DF]) + ;; 128- and 256-bit DF vector modes (define_mode_iterator VF2_128_256 [(V4DF "TARGET_AVX") V2DF]) @@ -885,6 +899,7 @@ (V32HI "HI") (V16HI "QI") (V8HI "QI") (V4HI "QI") (V16SI "QI") (V8SI "QI") (V4SI "QI") (V8DI "QI") (V4DI "QI") (V2DI "QI") + (V32HF "HI") (V16HF "QI") (V8HF "QI") (V16SF "QI") (V8SF "QI") (V4SF "QI") (V8DF "QI") (V4DF "QI") (V2DF "QI")]) @@ -2032,18 +2047,18 @@ }) (define_expand "3" - [(set (match_operand:VF 0 "register_operand") - (plusminus:VF - (match_operand:VF 1 "") - (match_operand:VF 2 "")))] + [(set (match_operand:VFH 0 "register_operand") + (plusminus:VFH + (match_operand:VFH 1 "") + (match_operand:VFH 2 "")))] "TARGET_SSE && && " "ix86_fixup_binary_operands_no_copy (, mode, operands);") (define_insn "*3" - [(set (match_operand:VF 0 "register_operand" "=x,v") - (plusminus:VF - (match_operand:VF 1 "" "0,v") - (match_operand:VF 2 "" "xBm,")))] + [(set (match_operand:VFH 0 "register_operand" "=x,v") + (plusminus:VFH + (match_operand:VFH 1 "" "0,v") + (match_operand:VFH 2 "" "xBm,")))] "TARGET_SSE && ix86_binary_operator_ok (, mode, operands) && && " "@ @@ -2121,18 +2136,18 @@ }) (define_expand "mul3" - [(set (match_operand:VF 0 "register_operand") - (mult:VF - (match_operand:VF 1 "") - (match_operand:VF 2 "")))] + [(set (match_operand:VFH 0 "register_operand") + (mult:VFH + (match_operand:VFH 1 "") + (match_operand:VFH 2 "")))] "TARGET_SSE && && " "ix86_fixup_binary_operands_no_copy (MULT, mode, operands);") (define_insn "*mul3" - [(set (match_operand:VF 0 "register_operand" "=x,v") - (mult:VF - (match_operand:VF 1 "" "%0,v") - (match_operand:VF 2 "" "xBm,")))] + [(set (match_operand:VFH 0 "register_operand" "=x,v") + (mult:VFH + (match_operand:VFH 1 "" "%0,v") + (match_operand:VFH 2 "" "xBm,")))] "TARGET_SSE && ix86_binary_operator_ok (MULT, mode, operands) && && " "@ @@ -2195,9 +2210,9 @@ (set_attr "mode" "")]) (define_expand "div3" - [(set (match_operand:VF2 0 "register_operand") - (div:VF2 (match_operand:VF2 1 "register_operand") - (match_operand:VF2 2 "vector_operand")))] + [(set (match_operand:VF2H 0 "register_operand") + (div:VF2H (match_operand:VF2H 1 "register_operand") + (match_operand:VF2H 2 "vector_operand")))] "TARGET_SSE2") (define_expand "div3" @@ -2236,10 +2251,10 @@ }) (define_insn "_div3" - [(set (match_operand:VF 0 "register_operand" "=x,v") - (div:VF - (match_operand:VF 1 "register_operand" "0,v") - (match_operand:VF 2 "" "xBm,")))] + [(set (match_operand:VFH 0 "register_operand" "=x,v") + (div:VFH + (match_operand:VFH 1 "register_operand" "0,v") + 
(match_operand:VFH 2 "" "xBm,")))] "TARGET_SSE && && " "@ div\t{%2, %0|%0, %2} diff --git a/gcc/config/i386/subst.md b/gcc/config/i386/subst.md index 6614e04..94426a5 100644 --- a/gcc/config/i386/subst.md +++ b/gcc/config/i386/subst.md @@ -24,6 +24,7 @@ V32HI V16HI V8HI V16SI V8SI V4SI V8DI V4DI V2DI + V32HF V16HF V8HF V16SF V8SF V4SF V8DF V4DF V2DF]) @@ -35,6 +36,7 @@ V32HI V16HI V8HI V16SI V8SI V4SI V8DI V4DI V2DI + V32HF V16HF V8HF V16SF V8SF V4SF V8DF V4DF V2DF QI HI SI DI SF DF]) @@ -161,7 +163,9 @@ (define_subst_attr "round_mode512bit_condition" "round" "1" "(mode == V16SFmode || mode == V8DFmode || mode == V8DImode - || mode == V16SImode)") + || mode == V16SImode + || mode == V32HFmode)") + (define_subst_attr "round_modev8sf_condition" "round" "1" "(mode == V8SFmode)") (define_subst_attr "round_modev4sf_condition" "round" "1" "(mode == V4SFmode)") (define_subst_attr "round_codefor" "round" "*" "") -- cgit v1.1 From 60efb1fee9b8985829abf4ed9958691783c2604e Mon Sep 17 00:00:00 2001 From: liuhongt Date: Thu, 9 Sep 2021 14:49:16 +0800 Subject: Remove copysign post_reload splitter for scalar modes. It can generate better code just like avx512dq-abs-copysign-1.c shows. gcc/ChangeLog: * config/i386/i386-expand.c (ix86_expand_copysign): Expand right into ANDNOT + AND + IOR, using paradoxical subregs. (ix86_split_copysign_const): Remove. (ix86_split_copysign_var): Ditto. * config/i386/i386-protos.h (ix86_split_copysign_const): Dotto. (ix86_split_copysign_var): Ditto. * config/i386/i386.md (@copysign3_const): Ditto. (@copysign3_var): Ditto. gcc/testsuite/ChangeLog: * gcc.target/i386/avx512dq-abs-copysign-1.c: Adjust testcase. * gcc.target/i386/avx512vl-abs-copysign-1.c: Adjust testcase. --- gcc/config/i386/i386-expand.c | 152 ++++++++---------------------------------- gcc/config/i386/i386-protos.h | 2 - gcc/config/i386/i386.md | 44 ------------ 3 files changed, 26 insertions(+), 172 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index ad9c672..0c1aec5 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -2115,13 +2115,9 @@ void ix86_expand_copysign (rtx operands[]) { machine_mode mode, vmode; - rtx dest, op0, op1, mask; + rtx dest, op0, op1, mask, op2, op3; - dest = operands[0]; - op0 = operands[1]; - op1 = operands[2]; - - mode = GET_MODE (dest); + mode = GET_MODE (operands[0]); if (mode == SFmode) vmode = V4SFmode; @@ -2132,136 +2128,40 @@ ix86_expand_copysign (rtx operands[]) else gcc_unreachable (); - mask = ix86_build_signbit_mask (vmode, 0, 0); - - if (CONST_DOUBLE_P (op0)) + if (rtx_equal_p (operands[1], operands[2])) { - if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0))) - op0 = simplify_unary_operation (ABS, mode, op0, mode); - - if (mode == SFmode || mode == DFmode) - { - if (op0 == CONST0_RTX (mode)) - op0 = CONST0_RTX (vmode); - else - { - rtx v = ix86_build_const_vector (vmode, false, op0); - - op0 = force_reg (vmode, v); - } - } - else if (op0 != CONST0_RTX (mode)) - op0 = force_reg (mode, op0); - - emit_insn (gen_copysign3_const (mode, dest, op0, op1, mask)); - } - else - { - rtx nmask = ix86_build_signbit_mask (vmode, 0, 1); - - emit_insn (gen_copysign3_var - (mode, dest, NULL_RTX, op0, op1, nmask, mask)); - } -} - -/* Deconstruct a copysign operation into bit masks. Operand 0 is known to - be a constant, and so has already been expanded into a vector constant. 
*/ - -void -ix86_split_copysign_const (rtx operands[]) -{ - machine_mode mode, vmode; - rtx dest, op0, mask, x; - - dest = operands[0]; - op0 = operands[1]; - mask = operands[3]; - - mode = GET_MODE (dest); - vmode = GET_MODE (mask); - - dest = lowpart_subreg (vmode, dest, mode); - x = gen_rtx_AND (vmode, dest, mask); - emit_insn (gen_rtx_SET (dest, x)); - - if (op0 != CONST0_RTX (vmode)) - { - x = gen_rtx_IOR (vmode, dest, op0); - emit_insn (gen_rtx_SET (dest, x)); - } -} - -/* Deconstruct a copysign operation into bit masks. Operand 0 is variable, - so we have to do two masks. */ - -void -ix86_split_copysign_var (rtx operands[]) -{ - machine_mode mode, vmode; - rtx dest, scratch, op0, op1, mask, nmask, x; - - dest = operands[0]; - scratch = operands[1]; - op0 = operands[2]; - op1 = operands[3]; - nmask = operands[4]; - mask = operands[5]; - - mode = GET_MODE (dest); - vmode = GET_MODE (mask); - - if (rtx_equal_p (op0, op1)) - { - /* Shouldn't happen often (it's useless, obviously), but when it does - we'd generate incorrect code if we continue below. */ - emit_move_insn (dest, op0); + emit_move_insn (operands[0], operands[1]); return; } - if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */ - { - gcc_assert (REGNO (op1) == REGNO (scratch)); - - x = gen_rtx_AND (vmode, scratch, mask); - emit_insn (gen_rtx_SET (scratch, x)); + dest = lowpart_subreg (vmode, operands[0], mode); + op1 = lowpart_subreg (vmode, operands[2], mode); + mask = ix86_build_signbit_mask (vmode, 0, 0); - dest = mask; - op0 = lowpart_subreg (vmode, op0, mode); - x = gen_rtx_NOT (vmode, dest); - x = gen_rtx_AND (vmode, x, op0); - emit_insn (gen_rtx_SET (dest, x)); - } - else + if (CONST_DOUBLE_P (operands[1])) { - if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */ - { - x = gen_rtx_AND (vmode, scratch, mask); - } - else /* alternative 2,4 */ + op0 = simplify_unary_operation (ABS, mode, operands[1], mode); + /* Optimize for 0, simplify b = copy_signf (0.0f, a) to b = mask & a. */ + if (op0 == CONST0_RTX (mode)) { - gcc_assert (REGNO (mask) == REGNO (scratch)); - op1 = lowpart_subreg (vmode, op1, mode); - x = gen_rtx_AND (vmode, scratch, op1); + emit_move_insn (dest, gen_rtx_AND (vmode, mask, op1)); + return; } - emit_insn (gen_rtx_SET (scratch, x)); - if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */ - { - dest = lowpart_subreg (vmode, op0, mode); - x = gen_rtx_AND (vmode, dest, nmask); - } - else /* alternative 3,4 */ - { - gcc_assert (REGNO (nmask) == REGNO (dest)); - dest = nmask; - op0 = lowpart_subreg (vmode, op0, mode); - x = gen_rtx_AND (vmode, dest, op0); - } - emit_insn (gen_rtx_SET (dest, x)); + if (GET_MODE_SIZE (mode) < 16) + op0 = ix86_build_const_vector (vmode, false, op0); + op0 = force_reg (vmode, op0); } - - x = gen_rtx_IOR (vmode, dest, scratch); - emit_insn (gen_rtx_SET (dest, x)); + else + op0 = lowpart_subreg (vmode, operands[1], mode); + + op2 = gen_reg_rtx (vmode); + op3 = gen_reg_rtx (vmode); + emit_move_insn (op2, gen_rtx_AND (vmode, + gen_rtx_NOT (vmode, mask), + op0)); + emit_move_insn (op3, gen_rtx_AND (vmode, mask, op1)); + emit_move_insn (dest, gen_rtx_IOR (vmode, op2, op3)); } /* Expand an xorsign operation. 
*/ diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 72644e3..dcae34b 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -135,8 +135,6 @@ extern void ix86_expand_fp_absneg_operator (enum rtx_code, machine_mode, extern void ix86_split_fp_absneg_operator (enum rtx_code, machine_mode, rtx[]); extern void ix86_expand_copysign (rtx []); -extern void ix86_split_copysign_const (rtx []); -extern void ix86_split_copysign_var (rtx []); extern void ix86_expand_xorsign (rtx []); extern bool ix86_unary_operator_ok (enum rtx_code, machine_mode, rtx[]); extern bool ix86_match_ccmode (rtx, machine_mode); diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 6b4ceb2..ba0058d 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -10861,50 +10861,6 @@ || (TARGET_SSE && (mode == TFmode))" "ix86_expand_copysign (operands); DONE;") -(define_insn_and_split "@copysign3_const" - [(set (match_operand:SSEMODEF 0 "register_operand" "=Yv") - (unspec:SSEMODEF - [(match_operand: 1 "nonimm_or_0_operand" "YvmC") - (match_operand:SSEMODEF 2 "register_operand" "0") - (match_operand: 3 "nonimmediate_operand" "Yvm")] - UNSPEC_COPYSIGN))] - "(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) - || (TARGET_SSE && (mode == TFmode))" - "#" - "&& reload_completed" - [(const_int 0)] - "ix86_split_copysign_const (operands); DONE;") - -(define_insn "@copysign3_var" - [(set (match_operand:SSEMODEF 0 "register_operand" "=Yv,Yv,Yv,Yv,Yv") - (unspec:SSEMODEF - [(match_operand:SSEMODEF 2 "register_operand" "Yv,0,0,Yv,Yv") - (match_operand:SSEMODEF 3 "register_operand" "1,1,Yv,1,Yv") - (match_operand: 4 - "nonimmediate_operand" "X,Yvm,Yvm,0,0") - (match_operand: 5 - "nonimmediate_operand" "0,Yvm,1,Yvm,1")] - UNSPEC_COPYSIGN)) - (clobber (match_scratch: 1 "=Yv,Yv,Yv,Yv,Yv"))] - "(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) - || (TARGET_SSE && (mode == TFmode))" - "#") - -(define_split - [(set (match_operand:SSEMODEF 0 "register_operand") - (unspec:SSEMODEF - [(match_operand:SSEMODEF 2 "register_operand") - (match_operand:SSEMODEF 3 "register_operand") - (match_operand: 4) - (match_operand: 5)] - UNSPEC_COPYSIGN)) - (clobber (match_scratch: 1))] - "((SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) - || (TARGET_SSE && (mode == TFmode))) - && reload_completed" - [(const_int 0)] - "ix86_split_copysign_var (operands); DONE;") - (define_expand "xorsign3" [(match_operand:MODEF 0 "register_operand") (match_operand:MODEF 1 "register_operand") -- cgit v1.1 From 0458154caafc5438cecf1db8cf96076e384244ab Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Thu, 9 Sep 2021 15:08:22 +0200 Subject: Remove dbx.h, do not set PREFERRED_DEBUGGING_TYPE from dbxcoff.h, lynx.h The following removes the unused config/dbx.h file and removes the setting of PREFERRED_DEBUGGING_TYPE from dbxcoff.h which is overridden by all users (djgpp/mingw/cygwin) via either including config/i386/djgpp.h or config/i386/cygming.h There are still circumstances where mingw and cygwin default to STABS, namely when HAVE_GAS_PE_SECREL32_RELOC is not defined and the target defaults to 32bit code generation. The new style handling DBX_DEBUGGING_INFO is in line with dbxelf.h which does not define PREFERRED_DEBUGGING_TYPE either. The patch also removes the PREFERRED_DEBUGGING_TYPE define from lynx.h which always follows elfos.h already defaulting to DWARF, so the comment about STABS being the default is misleading and outdated. 2021-09-09 Richard Biener PR target/102255 * config/dbx.h: Remove. 
* config/dbxcoff.h: Do not define PREFERRED_DEBUGGING_TYPE. * config/lynx.h: Likewise. --- gcc/config/dbx.h | 32 -------------------------------- gcc/config/dbxcoff.h | 6 ------ gcc/config/lynx.h | 7 ------- 3 files changed, 45 deletions(-) delete mode 100644 gcc/config/dbx.h (limited to 'gcc/config') diff --git a/gcc/config/dbx.h b/gcc/config/dbx.h deleted file mode 100644 index b270a07..0000000 --- a/gcc/config/dbx.h +++ /dev/null @@ -1,32 +0,0 @@ -/* Prefer DBX (stabs) debugging information. - Copyright (C) 1996-2021 Free Software Foundation, Inc. - -This file is part of GCC. - -GCC is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 3, or (at your option) -any later version. - -GCC is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -Under Section 7 of GPL version 3, you are granted additional -permissions described in the GCC Runtime Library Exception, version -3.1, as published by the Free Software Foundation. - -You should have received a copy of the GNU General Public License and -a copy of the GCC Runtime Library Exception along with this program; -see the files COPYING3 and COPYING.RUNTIME respectively. If not, see -. */ - -/* This file causes gcc to prefer using DBX (stabs) debugging - information. The configure script will add a #include of this file - to tm.h when --with-stabs is used for certain targets. */ - -#define DBX_DEBUGGING_INFO 1 - -#undef PREFERRED_DEBUGGING_TYPE -#define PREFERRED_DEBUGGING_TYPE DBX_DEBUG diff --git a/gcc/config/dbxcoff.h b/gcc/config/dbxcoff.h index d491cff..bd705f9 100644 --- a/gcc/config/dbxcoff.h +++ b/gcc/config/dbxcoff.h @@ -25,12 +25,6 @@ along with GCC; see the file COPYING3. If not see #define DBX_DEBUGGING_INFO 1 -/* Generate DBX debugging information by default. */ - -#ifndef PREFERRED_DEBUGGING_TYPE -#define PREFERRED_DEBUGGING_TYPE DBX_DEBUG -#endif - /* Be function-relative for block and source line stab directives. */ #define DBX_BLOCKS_FUNCTION_RELATIVE 1 diff --git a/gcc/config/lynx.h b/gcc/config/lynx.h index 020cc97..1438704 100644 --- a/gcc/config/lynx.h +++ b/gcc/config/lynx.h @@ -158,13 +158,6 @@ along with GCC; see the file COPYING3. If not see while (0) #endif -/* Keep the *_DEBUGGING_INFO defines from elfos.h except that stabs is - the default on LynxOS. */ - -#ifndef PREFERRED_DEBUGGING_TYPE -# define PREFERRED_DEBUGGING_TYPE DBX_DEBUG -#endif - #ifndef TARGET_POSIX_IO # define TARGET_POSIX_IO #endif -- cgit v1.1 From d959312b429971d69521e91506e304f8fa271a2d Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Sun, 27 Jan 2019 19:38:02 -0800 Subject: AVX512FP16: Enable _Float16 autovectorization gcc/ChangeLog: * config/i386/i386-expand.c (ix86_avx256_split_vector_move_misalign): Handle V16HF mode. * config/i386/i386.c (ix86_preferred_simd_mode): Handle HF mode. * config/i386/sse.md (V_256H): New mode iterator. (avx_vextractf128): Use it. (VEC_INIT_MODE): Align vector HFmode condition to vector HImodes since there're no real HF instruction used. (VEC_INIT_HALF_MODE): Ditto. (VIHF): Ditto. (VIHF_AVX512BW): Ditto. (*vec_extracthf): Ditto. (VEC_EXTRACT_MODE): Ditto. gcc/testsuite/ChangeLog: * gcc.target/i386/vect-float16-1.c: New test. * gcc.target/i386/vect-float16-10.c: Ditto. * gcc.target/i386/vect-float16-11.c: Ditto. 
* gcc.target/i386/vect-float16-12.c: Ditto. * gcc.target/i386/vect-float16-2.c: Ditto. * gcc.target/i386/vect-float16-3.c: Ditto. * gcc.target/i386/vect-float16-4.c: Ditto. * gcc.target/i386/vect-float16-5.c: Ditto. * gcc.target/i386/vect-float16-6.c: Ditto. * gcc.target/i386/vect-float16-7.c: Ditto. * gcc.target/i386/vect-float16-8.c: Ditto. * gcc.target/i386/vect-float16-9.c: Ditto. --- gcc/config/i386/i386-expand.c | 4 ++++ gcc/config/i386/i386.c | 14 ++++++++++++++ gcc/config/i386/sse.md | 24 ++++++++++++------------ 3 files changed, 30 insertions(+), 12 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 0c1aec5..cac8354 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -678,6 +678,10 @@ ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1) extract = gen_avx_vextractf128v32qi; mode = V16QImode; break; + case E_V16HFmode: + extract = gen_avx_vextractf128v16hf; + mode = V8HFmode; + break; case E_V8SFmode: extract = gen_avx_vextractf128v8sf; mode = V4SFmode; diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index dc649f9..7b173bc 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -22641,6 +22641,20 @@ ix86_preferred_simd_mode (scalar_mode mode) else return V2DImode; + case E_HFmode: + if (TARGET_AVX512FP16) + { + if (TARGET_AVX512VL) + { + if (TARGET_PREFER_AVX128) + return V8HFmode; + else if (TARGET_PREFER_AVX256) + return V16HFmode; + } + return V32HFmode; + } + return word_mode; + case E_SFmode: if (TARGET_AVX512F && !TARGET_PREFER_AVX256) return V16SFmode; diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 0633916..2602460 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -279,6 +279,10 @@ (define_mode_iterator V_256 [V32QI V16HI V8SI V4DI V8SF V4DF]) +;; All 256bit vector modes including HF vector mode +(define_mode_iterator V_256H + [V32QI V16HI V8SI V4DI V8SF V4DF V16HF]) + ;; All 128bit and 256bit vector modes (define_mode_iterator V_128_256 [V32QI V16QI V16HI V8HI V8SI V4SI V4DI V2DI V8SF V4SF V4DF V2DF]) @@ -406,8 +410,7 @@ (V32HI "TARGET_AVX512BW") (V16HI "TARGET_AVX") V8HI (V8SI "TARGET_AVX") V4SI (V4DI "TARGET_AVX") V2DI - (V32HF "TARGET_AVX512FP16") (V16HF "TARGET_AVX512FP16") - (V8HF "TARGET_AVX512FP16")]) + (V32HF "TARGET_AVX512BW") (V16HF "TARGET_AVX") V8HF]) (define_mode_iterator VI_AVX2 [(V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX2") V16QI @@ -752,7 +755,7 @@ [V16SI V8DI (V32HI "TARGET_AVX512BW") (V64QI "TARGET_AVX512BW")]) (define_mode_iterator VIHF_AVX512BW [V16SI V8DI (V32HI "TARGET_AVX512BW") (V64QI "TARGET_AVX512BW") - (V32HF "TARGET_AVX512FP16")]) + (V32HF "TARGET_AVX512BW")]) ;; Int-float size matches (define_mode_iterator VI4F_128 [V4SI V4SF]) @@ -9381,7 +9384,7 @@ (define_expand "avx_vextractf128" [(match_operand: 0 "nonimmediate_operand") - (match_operand:V_256 1 "register_operand") + (match_operand:V_256H 1 "register_operand") (match_operand:SI 2 "const_0_to_1_operand")] "TARGET_AVX" { @@ -9868,7 +9871,7 @@ (match_operand:V8HF 1 "register_operand" "v,v") (parallel [(match_operand:SI 2 "const_0_to_7_operand")])))] - "TARGET_AVX512FP16" + "TARGET_SSE2" "@ vpextrw\t{%2, %1, %k0|%k0, %1, %2} vpextrw\t{%2, %1, %0|%0, %1, %2}" @@ -9882,8 +9885,7 @@ (V32HI "TARGET_AVX512BW") (V16HI "TARGET_AVX") V8HI (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI (V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX") V2DI - (V32HF "TARGET_AVX512FP16") (V16HF "TARGET_AVX512FP16") - (V8HF "TARGET_AVX512FP16") + (V32HF 
"TARGET_AVX512BW") (V16HF "TARGET_AVX") V8HF (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") V2DF (V4TI "TARGET_AVX512F") (V2TI "TARGET_AVX")]) @@ -15615,7 +15617,7 @@ ;; Modes handled by pinsr patterns. (define_mode_iterator PINSR_MODE - [(V16QI "TARGET_SSE4_1") V8HI (V8HF "TARGET_AVX512FP16") + [(V16QI "TARGET_SSE4_1") V8HI V8HF (V4SI "TARGET_SSE4_1") (V2DI "TARGET_SSE4_1 && TARGET_64BIT")]) @@ -23723,8 +23725,7 @@ (V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX") V8HI (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI (V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX") V2DI - (V32HF "TARGET_AVX512FP16") (V16HF "TARGET_AVX512FP16") - (V8HF "TARGET_AVX512FP16") + (V32HF "TARGET_AVX512F") (V16HF "TARGET_AVX") V8HF (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2") (V4TI "TARGET_AVX512F") (V2TI "TARGET_AVX")]) @@ -23736,8 +23737,7 @@ (V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX") V8HI (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI (V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX") - (V32HF "TARGET_AVX512FP16") (V16HF "TARGET_AVX512FP16") - (V8HF "TARGET_AVX512FP16") + (V32HF "TARGET_AVX512F") (V16HF "TARGET_AVX") V8HF (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") (V4TI "TARGET_AVX512F")]) -- cgit v1.1 From 71838266e7857381ec1ea9d0cf0066e7992a73c5 Mon Sep 17 00:00:00 2001 From: "Liu, Hongtao" Date: Mon, 28 Jan 2019 00:05:04 -0800 Subject: AVX512FP16: Add vaddsh/vsubsh/vmulsh/vdivsh. gcc/ChangeLog: * config/i386/avx512fp16intrin.h (_mm_add_sh): New intrinsic. (_mm_mask_add_sh): Likewise. (_mm_maskz_add_sh): Likewise. (_mm_sub_sh): Likewise. (_mm_mask_sub_sh): Likewise. (_mm_maskz_sub_sh): Likewise. (_mm_mul_sh): Likewise. (_mm_mask_mul_sh): Likewise. (_mm_maskz_mul_sh): Likewise. (_mm_div_sh): Likewise. (_mm_mask_div_sh): Likewise. (_mm_maskz_div_sh): Likewise. (_mm_add_round_sh): Likewise. (_mm_mask_add_round_sh): Likewise. (_mm_maskz_add_round_sh): Likewise. (_mm_sub_round_sh): Likewise. (_mm_mask_sub_round_sh): Likewise. (_mm_maskz_sub_round_sh): Likewise. (_mm_mul_round_sh): Likewise. (_mm_mask_mul_round_sh): Likewise. (_mm_maskz_mul_round_sh): Likewise. (_mm_div_round_sh): Likewise. (_mm_mask_div_round_sh): Likewise. (_mm_maskz_div_round_sh): Likewise. * config/i386/i386-builtin-types.def: Add corresponding builtin types. * config/i386/i386-builtin.def: Add corresponding new builtins. * config/i386/i386-expand.c (ix86_expand_round_builtin): Handle new builtins. * config/i386/sse.md (VF_128): Change description. (_vm3): Adjust to support HF vector modes. (_vm3): Likewise. gcc/testsuite/ChangeLog: * gcc.target/i386/avx-1.c: Add test for new builtins. * gcc.target/i386/sse-13.c: Ditto. * gcc.target/i386/sse-23.c: Ditto. * gcc.target/i386/sse-14.c: Add test for new intrinsics. * gcc.target/i386/sse-22.c: Ditto. 
--- gcc/config/i386/avx512fp16intrin.h | 254 +++++++++++++++++++++++++++++++++ gcc/config/i386/i386-builtin-types.def | 2 + gcc/config/i386/i386-builtin.def | 8 ++ gcc/config/i386/i386-expand.c | 2 + gcc/config/i386/sse.md | 22 +-- 5 files changed, 277 insertions(+), 11 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/avx512fp16intrin.h b/gcc/config/i386/avx512fp16intrin.h index 3e9d676..6ae12eb 100644 --- a/gcc/config/i386/avx512fp16intrin.h +++ b/gcc/config/i386/avx512fp16intrin.h @@ -468,6 +468,260 @@ _mm512_maskz_div_round_ph (__mmask32 __A, __m512h __B, __m512h __C, (A), (D))) #endif /* __OPTIMIZE__ */ +/* Intrinsics of v[add,sub,mul,div]sh. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_sh (__m128h __A, __m128h __B) +{ + __A[0] += __B[0]; + return __A; +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_add_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_vaddsh_v8hf_mask (__C, __D, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_add_sh (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_vaddsh_v8hf_mask (__B, __C, _mm_setzero_ph (), + __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_sh (__m128h __A, __m128h __B) +{ + __A[0] -= __B[0]; + return __A; +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sub_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_vsubsh_v8hf_mask (__C, __D, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sub_sh (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_vsubsh_v8hf_mask (__B, __C, _mm_setzero_ph (), + __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_sh (__m128h __A, __m128h __B) +{ + __A[0] *= __B[0]; + return __A; +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mul_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_vmulsh_v8hf_mask (__C, __D, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mul_sh (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_vmulsh_v8hf_mask (__B, __C, _mm_setzero_ph (), __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_div_sh (__m128h __A, __m128h __B) +{ + __A[0] /= __B[0]; + return __A; +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_div_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_vdivsh_v8hf_mask (__C, __D, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_div_sh (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_vdivsh_v8hf_mask (__B, __C, _mm_setzero_ph (), + __A); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_round_sh (__m128h __A, __m128h __B, const int __C) +{ + return __builtin_ia32_vaddsh_v8hf_mask_round (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1, 
__C); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_add_round_sh (__m128h __A, __mmask8 __B, __m128h __C, + __m128h __D, const int __E) +{ + return __builtin_ia32_vaddsh_v8hf_mask_round (__C, __D, __A, __B, __E); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_add_round_sh (__mmask8 __A, __m128h __B, __m128h __C, + const int __D) +{ + return __builtin_ia32_vaddsh_v8hf_mask_round (__B, __C, + _mm_setzero_ph (), + __A, __D); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_round_sh (__m128h __A, __m128h __B, const int __C) +{ + return __builtin_ia32_vsubsh_v8hf_mask_round (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1, __C); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sub_round_sh (__m128h __A, __mmask8 __B, __m128h __C, + __m128h __D, const int __E) +{ + return __builtin_ia32_vsubsh_v8hf_mask_round (__C, __D, __A, __B, __E); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sub_round_sh (__mmask8 __A, __m128h __B, __m128h __C, + const int __D) +{ + return __builtin_ia32_vsubsh_v8hf_mask_round (__B, __C, + _mm_setzero_ph (), + __A, __D); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_round_sh (__m128h __A, __m128h __B, const int __C) +{ + return __builtin_ia32_vmulsh_v8hf_mask_round (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1, __C); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mul_round_sh (__m128h __A, __mmask8 __B, __m128h __C, + __m128h __D, const int __E) +{ + return __builtin_ia32_vmulsh_v8hf_mask_round (__C, __D, __A, __B, __E); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mul_round_sh (__mmask8 __A, __m128h __B, __m128h __C, + const int __D) +{ + return __builtin_ia32_vmulsh_v8hf_mask_round (__B, __C, + _mm_setzero_ph (), + __A, __D); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_div_round_sh (__m128h __A, __m128h __B, const int __C) +{ + return __builtin_ia32_vdivsh_v8hf_mask_round (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1, __C); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_div_round_sh (__m128h __A, __mmask8 __B, __m128h __C, + __m128h __D, const int __E) +{ + return __builtin_ia32_vdivsh_v8hf_mask_round (__C, __D, __A, __B, __E); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_div_round_sh (__mmask8 __A, __m128h __B, __m128h __C, + const int __D) +{ + return __builtin_ia32_vdivsh_v8hf_mask_round (__B, __C, + _mm_setzero_ph (), + __A, __D); +} +#else +#define _mm_add_round_sh(A, B, C) \ + ((__m128h)__builtin_ia32_vaddsh_v8hf_mask_round ((A), (B), \ + _mm_setzero_ph (), \ + (__mmask8)-1, (C))) + +#define _mm_mask_add_round_sh(A, B, C, D, E) \ + ((__m128h)__builtin_ia32_vaddsh_v8hf_mask_round ((C), (D), (A), (B), (E))) + +#define _mm_maskz_add_round_sh(A, B, C, D) \ + ((__m128h)__builtin_ia32_vaddsh_v8hf_mask_round ((B), (C), \ + _mm_setzero_ph (), \ + (A), (D))) + +#define _mm_sub_round_sh(A, B, C) \ + ((__m128h)__builtin_ia32_vsubsh_v8hf_mask_round ((A), (B), \ + _mm_setzero_ph (), \ + (__mmask8)-1, 
(C))) + +#define _mm_mask_sub_round_sh(A, B, C, D, E) \ + ((__m128h)__builtin_ia32_vsubsh_v8hf_mask_round ((C), (D), (A), (B), (E))) + +#define _mm_maskz_sub_round_sh(A, B, C, D) \ + ((__m128h)__builtin_ia32_vsubsh_v8hf_mask_round ((B), (C), \ + _mm_setzero_ph (), \ + (A), (D))) + +#define _mm_mul_round_sh(A, B, C) \ + ((__m128h)__builtin_ia32_vmulsh_v8hf_mask_round ((A), (B), \ + _mm_setzero_ph (), \ + (__mmask8)-1, (C))) + +#define _mm_mask_mul_round_sh(A, B, C, D, E) \ + ((__m128h)__builtin_ia32_vmulsh_v8hf_mask_round ((C), (D), (A), (B), (E))) + +#define _mm_maskz_mul_round_sh(A, B, C, D) \ + ((__m128h)__builtin_ia32_vmulsh_v8hf_mask_round ((B), (C), \ + _mm_setzero_ph (), \ + (A), (D))) + +#define _mm_div_round_sh(A, B, C) \ + ((__m128h)__builtin_ia32_vdivsh_v8hf_mask_round ((A), (B), \ + _mm_setzero_ph (), \ + (__mmask8)-1, (C))) + +#define _mm_mask_div_round_sh(A, B, C, D, E) \ + ((__m128h)__builtin_ia32_vdivsh_v8hf_mask_round ((C), (D), (A), (B), (E))) + +#define _mm_maskz_div_round_sh(A, B, C, D) \ + ((__m128h)__builtin_ia32_vdivsh_v8hf_mask_round ((B), (C), \ + _mm_setzero_ph (), \ + (A), (D))) +#endif /* __OPTIMIZE__ */ + #ifdef __DISABLE_AVX512FP16__ #undef __DISABLE_AVX512FP16__ #pragma GCC pop_options diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def index fdc46bd..86cf825 100644 --- a/gcc/config/i386/i386-builtin-types.def +++ b/gcc/config/i386/i386-builtin-types.def @@ -1304,7 +1304,9 @@ DEF_FUNCTION_TYPE (UINT8, PV2DI, PCV2DI, PCVOID) # FP16 builtins DEF_FUNCTION_TYPE (V8HF, V8HI) +DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF, INT) DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF, V8HF, UQI) +DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF, V8HF, UQI, INT) DEF_FUNCTION_TYPE (V16HF, V16HF, V16HF, V16HF, UHI) DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, INT) DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, USI) diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def index 2f152096..85ad235 100644 --- a/gcc/config/i386/i386-builtin.def +++ b/gcc/config/i386/i386-builtin.def @@ -2787,6 +2787,10 @@ BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv32hf3_mask, "__builtin_ia32_ BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv8hf3_mask, "__builtin_ia32_vdivph_v8hf_mask", IX86_BUILTIN_VDIVPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv16hf3_mask, "__builtin_ia32_vdivph_v16hf_mask", IX86_BUILTIN_VDIVPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv32hf3_mask, "__builtin_ia32_vdivph_v32hf_mask", IX86_BUILTIN_VDIVPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmaddv8hf3_mask, "__builtin_ia32_vaddsh_v8hf_mask", IX86_BUILTIN_VADDSH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsubv8hf3_mask, "__builtin_ia32_vsubsh_v8hf_mask", IX86_BUILTIN_VSUBSH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmmulv8hf3_mask, "__builtin_ia32_vmulsh_v8hf_mask", IX86_BUILTIN_VMULSH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmdivv8hf3_mask, "__builtin_ia32_vdivsh_v8hf_mask", IX86_BUILTIN_VDIVSH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) /* Builtins with rounding 
support. */ BDESC_END (ARGS, ROUND_ARGS) @@ -2992,6 +2996,10 @@ BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv32hf3_mask_round, "__builtin BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv32hf3_mask_round, "__builtin_ia32_vsubph_v32hf_mask_round", IX86_BUILTIN_VSUBPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv32hf3_mask_round, "__builtin_ia32_vmulph_v32hf_mask_round", IX86_BUILTIN_VMULPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv32hf3_mask_round, "__builtin_ia32_vdivph_v32hf_mask_round", IX86_BUILTIN_VDIVPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmaddv8hf3_mask_round, "__builtin_ia32_vaddsh_v8hf_mask_round", IX86_BUILTIN_VADDSH_V8HF_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsubv8hf3_mask_round, "__builtin_ia32_vsubsh_v8hf_mask_round", IX86_BUILTIN_VSUBSH_V8HF_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmmulv8hf3_mask_round, "__builtin_ia32_vmulsh_v8hf_mask_round", IX86_BUILTIN_VMULSH_V8HF_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmdivv8hf3_mask_round, "__builtin_ia32_vdivsh_v8hf_mask_round", IX86_BUILTIN_VDIVSH_V8HF_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) BDESC_END (ROUND_ARGS, MULTI_ARG) diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index cac8354..29a4374 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -10646,6 +10646,7 @@ ix86_expand_round_builtin (const struct builtin_description *d, nargs = 2; break; case V32HF_FTYPE_V32HF_V32HF_INT: + case V8HF_FTYPE_V8HF_V8HF_INT: case V4SF_FTYPE_V4SF_UINT_INT: case V4SF_FTYPE_V4SF_UINT64_INT: case V2DF_FTYPE_V2DF_UINT64_INT: @@ -10693,6 +10694,7 @@ ix86_expand_round_builtin (const struct builtin_description *d, case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT: case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT: case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT: + case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT: nargs = 5; break; case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT: diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 2602460..6f2072c 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -349,7 +349,7 @@ (define_mode_iterator VF2_512_256VL [V8DF (V4DF "TARGET_AVX512VL")]) -;; All 128bit vector float modes +;; All 128bit vector SF/DF modes (define_mode_iterator VF_128 [V4SF (V2DF "TARGET_SSE2")]) @@ -2104,11 +2104,11 @@ (set_attr "mode" "")]) (define_insn "_vm3" - [(set (match_operand:VF_128 0 "register_operand" "=x,v") - (vec_merge:VF_128 - (plusminus:VF_128 - (match_operand:VF_128 1 "register_operand" "0,v") - (match_operand:VF_128 2 "nonimmediate_operand" "xm,")) + [(set (match_operand:VFH_128 0 "register_operand" "=x,v") + (vec_merge:VFH_128 + (plusminus:VFH_128 + (match_operand:VFH_128 1 "register_operand" "0,v") + (match_operand:VFH_128 2 "nonimmediate_operand" "xm,")) (match_dup 1) (const_int 1)))] "TARGET_SSE" @@ -2195,11 +2195,11 @@ (set_attr "mode" "")]) (define_insn "_vm3" - [(set (match_operand:VF_128 0 "register_operand" "=x,v") - (vec_merge:VF_128 - (multdiv:VF_128 - (match_operand:VF_128 1 "register_operand" "0,v") - (match_operand:VF_128 2 
"nonimmediate_operand" "xm,")) + [(set (match_operand:VFH_128 0 "register_operand" "=x,v") + (vec_merge:VFH_128 + (multdiv:VFH_128 + (match_operand:VFH_128 1 "register_operand" "0,v") + (match_operand:VFH_128 2 "nonimmediate_operand" "xm,")) (match_dup 1) (const_int 1)))] "TARGET_SSE" -- cgit v1.1 From b96cb2caa973d26c4f27da91c44e35796f411e4a Mon Sep 17 00:00:00 2001 From: liuhongt Date: Wed, 23 Jan 2019 16:06:48 -0800 Subject: AVX512FP16: Add vmaxph/vminph/vmaxsh/vminsh. gcc/ChangeLog: * config/i386/avx512fp16intrin.h: (_mm512_max_ph): New intrinsic. (_mm512_mask_max_ph): Likewise. (_mm512_maskz_max_ph): Likewise. (_mm512_min_ph): Likewise. (_mm512_mask_min_ph): Likewise. (_mm512_maskz_min_ph): Likewise. (_mm512_max_round_ph): Likewise. (_mm512_mask_max_round_ph): Likewise. (_mm512_maskz_max_round_ph): Likewise. (_mm512_min_round_ph): Likewise. (_mm512_mask_min_round_ph): Likewise. (_mm512_maskz_min_round_ph): Likewise. (_mm_max_sh): Likewise. (_mm_mask_max_sh): Likewise. (_mm_maskz_max_sh): Likewise. (_mm_min_sh): Likewise. (_mm_mask_min_sh): Likewise. (_mm_maskz_min_sh): Likewise. (_mm_max_round_sh): Likewise. (_mm_mask_max_round_sh): Likewise. (_mm_maskz_max_round_sh): Likewise. (_mm_min_round_sh): Likewise. (_mm_mask_min_round_sh): Likewise. (_mm_maskz_min_round_sh): Likewise. * config/i386/avx512fp16vlintrin.h (_mm_max_ph): New intrinsic. (_mm256_max_ph): Likewise. (_mm_mask_max_ph): Likewise. (_mm256_mask_max_ph): Likewise. (_mm_maskz_max_ph): Likewise. (_mm256_maskz_max_ph): Likewise. (_mm_min_ph): Likewise. (_mm256_min_ph): Likewise. (_mm_mask_min_ph): Likewise. (_mm256_mask_min_ph): Likewise. (_mm_maskz_min_ph): Likewise. (_mm256_maskz_min_ph): Likewise. * config/i386/i386-builtin-types.def: Add corresponding builtin types. * config/i386/i386-builtin.def: Add corresponding new builtins. * config/i386/i386-expand.c (ix86_expand_args_builtin): Handle new builtin types. * config/i386/sse.md (3): Adjust to support HF vector modes. (*3): Likewise. (ieee_3): Likewise. (_vm3): Likewise. * config/i386/subst.md (round_saeonly_mode512bit_condition): Adjust for HF vector modes. gcc/testsuite/ChangeLog: * gcc.target/i386/avx-1.c: Add test for new builtins. * gcc.target/i386/sse-13.c: Ditto. * gcc.target/i386/sse-23.c: Ditto. * gcc.target/i386/sse-14.c: Add test for new intrinsics. * gcc.target/i386/sse-22.c: Ditto. --- gcc/config/i386/avx512fp16intrin.h | 263 +++++++++++++++++++++++++++++++++ gcc/config/i386/avx512fp16vlintrin.h | 97 ++++++++++++ gcc/config/i386/i386-builtin-types.def | 2 + gcc/config/i386/i386-builtin.def | 12 ++ gcc/config/i386/i386-expand.c | 2 + gcc/config/i386/sse.md | 43 +++--- gcc/config/i386/subst.md | 4 +- 7 files changed, 402 insertions(+), 21 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/avx512fp16intrin.h b/gcc/config/i386/avx512fp16intrin.h index 6ae12eb..c232419 100644 --- a/gcc/config/i386/avx512fp16intrin.h +++ b/gcc/config/i386/avx512fp16intrin.h @@ -722,6 +722,269 @@ _mm_maskz_div_round_sh (__mmask8 __A, __m128h __B, __m128h __C, (A), (D))) #endif /* __OPTIMIZE__ */ +/* Intrinsic vmaxph vminph. 
*/ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_max_ph (__m512h __A, __m512h __B) +{ + return __builtin_ia32_vmaxph_v32hf_mask (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_max_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D) +{ + return __builtin_ia32_vmaxph_v32hf_mask (__C, __D, __A, __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_max_ph (__mmask32 __A, __m512h __B, __m512h __C) +{ + return __builtin_ia32_vmaxph_v32hf_mask (__B, __C, + _mm512_setzero_ph (), __A); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_min_ph (__m512h __A, __m512h __B) +{ + return __builtin_ia32_vminph_v32hf_mask (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_min_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D) +{ + return __builtin_ia32_vminph_v32hf_mask (__C, __D, __A, __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_min_ph (__mmask32 __A, __m512h __B, __m512h __C) +{ + return __builtin_ia32_vminph_v32hf_mask (__B, __C, + _mm512_setzero_ph (), __A); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_max_round_ph (__m512h __A, __m512h __B, const int __C) +{ + return __builtin_ia32_vmaxph_v32hf_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, __C); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_max_round_ph (__m512h __A, __mmask32 __B, __m512h __C, + __m512h __D, const int __E) +{ + return __builtin_ia32_vmaxph_v32hf_mask_round (__C, __D, __A, __B, __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_max_round_ph (__mmask32 __A, __m512h __B, __m512h __C, + const int __D) +{ + return __builtin_ia32_vmaxph_v32hf_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, __D); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_min_round_ph (__m512h __A, __m512h __B, const int __C) +{ + return __builtin_ia32_vminph_v32hf_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, __C); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_min_round_ph (__m512h __A, __mmask32 __B, __m512h __C, + __m512h __D, const int __E) +{ + return __builtin_ia32_vminph_v32hf_mask_round (__C, __D, __A, __B, __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_min_round_ph (__mmask32 __A, __m512h __B, __m512h __C, + const int __D) +{ + return __builtin_ia32_vminph_v32hf_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, __D); +} + +#else +#define _mm512_max_round_ph(A, B, C) \ + (__builtin_ia32_vmaxph_v32hf_mask_round ((A), (B), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, (C))) + +#define _mm512_mask_max_round_ph(A, B, C, D, E) \ + (__builtin_ia32_vmaxph_v32hf_mask_round ((C), (D), (A), (B), (E))) + +#define _mm512_maskz_max_round_ph(A, B, C, D) \ + (__builtin_ia32_vmaxph_v32hf_mask_round ((B), (C), \ + _mm512_setzero_ph (), \ + 
(A), (D))) + +#define _mm512_min_round_ph(A, B, C) \ + (__builtin_ia32_vminph_v32hf_mask_round ((A), (B), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, (C))) + +#define _mm512_mask_min_round_ph(A, B, C, D, E) \ + (__builtin_ia32_vminph_v32hf_mask_round ((C), (D), (A), (B), (E))) + +#define _mm512_maskz_min_round_ph(A, B, C, D) \ + (__builtin_ia32_vminph_v32hf_mask_round ((B), (C), \ + _mm512_setzero_ph (), \ + (A), (D))) +#endif /* __OPTIMIZE__ */ + +/* Intrinsic vmaxsh vminsh. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_sh (__m128h __A, __m128h __B) +{ + __A[0] = __A[0] > __B[0] ? __A[0] : __B[0]; + return __A; +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_max_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_vmaxsh_v8hf_mask (__C, __D, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_max_sh (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_vmaxsh_v8hf_mask (__B, __C, _mm_setzero_ph (), + __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_sh (__m128h __A, __m128h __B) +{ + __A[0] = __A[0] < __B[0] ? __A[0] : __B[0]; + return __A; +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_min_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_vminsh_v8hf_mask (__C, __D, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_min_sh (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_vminsh_v8hf_mask (__B, __C, _mm_setzero_ph (), + __A); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_round_sh (__m128h __A, __m128h __B, const int __C) +{ + return __builtin_ia32_vmaxsh_v8hf_mask_round (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1, __C); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_max_round_sh (__m128h __A, __mmask8 __B, __m128h __C, + __m128h __D, const int __E) +{ + return __builtin_ia32_vmaxsh_v8hf_mask_round (__C, __D, __A, __B, __E); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_max_round_sh (__mmask8 __A, __m128h __B, __m128h __C, + const int __D) +{ + return __builtin_ia32_vmaxsh_v8hf_mask_round (__B, __C, + _mm_setzero_ph (), + __A, __D); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_round_sh (__m128h __A, __m128h __B, const int __C) +{ + return __builtin_ia32_vminsh_v8hf_mask_round (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1, __C); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_min_round_sh (__m128h __A, __mmask8 __B, __m128h __C, + __m128h __D, const int __E) +{ + return __builtin_ia32_vminsh_v8hf_mask_round (__C, __D, __A, __B, __E); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_min_round_sh (__mmask8 __A, __m128h __B, __m128h __C, + const int __D) +{ + return __builtin_ia32_vminsh_v8hf_mask_round (__B, __C, + _mm_setzero_ph (), + __A, __D); +} + +#else +#define _mm_max_round_sh(A, B, C) \ + 
(__builtin_ia32_vmaxsh_v8hf_mask_round ((A), (B), \ + _mm_setzero_ph (), \ + (__mmask8)-1, (C))) + +#define _mm_mask_max_round_sh(A, B, C, D, E) \ + (__builtin_ia32_vmaxsh_v8hf_mask_round ((C), (D), (A), (B), (E))) + +#define _mm_maskz_max_round_sh(A, B, C, D) \ + (__builtin_ia32_vmaxsh_v8hf_mask_round ((B), (C), \ + _mm_setzero_ph (), \ + (A), (D))) + +#define _mm_min_round_sh(A, B, C) \ + (__builtin_ia32_vminsh_v8hf_mask_round ((A), (B), \ + _mm_setzero_ph (), \ + (__mmask8)-1, (C))) + +#define _mm_mask_min_round_sh(A, B, C, D, E) \ + (__builtin_ia32_vminsh_v8hf_mask_round ((C), (D), (A), (B), (E))) + +#define _mm_maskz_min_round_sh(A, B, C, D) \ + (__builtin_ia32_vminsh_v8hf_mask_round ((B), (C), \ + _mm_setzero_ph (), \ + (A), (D))) + +#endif /* __OPTIMIZE__ */ + #ifdef __DISABLE_AVX512FP16__ #undef __DISABLE_AVX512FP16__ #pragma GCC pop_options diff --git a/gcc/config/i386/avx512fp16vlintrin.h b/gcc/config/i386/avx512fp16vlintrin.h index 75fa9eb..bd60b4c 100644 --- a/gcc/config/i386/avx512fp16vlintrin.h +++ b/gcc/config/i386/avx512fp16vlintrin.h @@ -211,6 +211,103 @@ _mm256_maskz_div_ph (__mmask16 __A, __m256h __B, __m256h __C) _mm256_setzero_ph (), __A); } +/* Intrinsics v[max,min]ph. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_ph (__m128h __A, __m128h __B) +{ + return __builtin_ia32_vmaxph_v8hf_mask (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_max_ph (__m256h __A, __m256h __B) +{ + return __builtin_ia32_vmaxph_v16hf_mask (__A, __B, + _mm256_setzero_ph (), + (__mmask16) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_max_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_vmaxph_v8hf_mask (__C, __D, __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_max_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D) +{ + return __builtin_ia32_vmaxph_v16hf_mask (__C, __D, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_max_ph (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_vmaxph_v8hf_mask (__B, __C, _mm_setzero_ph (), + __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_max_ph (__mmask16 __A, __m256h __B, __m256h __C) +{ + return __builtin_ia32_vmaxph_v16hf_mask (__B, __C, + _mm256_setzero_ph (), __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_ph (__m128h __A, __m128h __B) +{ + return __builtin_ia32_vminph_v8hf_mask (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_min_ph (__m256h __A, __m256h __B) +{ + return __builtin_ia32_vminph_v16hf_mask (__A, __B, + _mm256_setzero_ph (), + (__mmask16) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_min_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_vminph_v8hf_mask (__C, __D, __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_min_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D) +{ + return 
__builtin_ia32_vminph_v16hf_mask (__C, __D, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_min_ph (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_vminph_v8hf_mask (__B, __C, _mm_setzero_ph (), + __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_min_ph (__mmask16 __A, __m256h __B, __m256h __C) +{ + return __builtin_ia32_vminph_v16hf_mask (__B, __C, + _mm256_setzero_ph (), __A); +} + #ifdef __DISABLE_AVX512FP16VL__ #undef __DISABLE_AVX512FP16VL__ #pragma GCC pop_options diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def index 86cf825..658fb69 100644 --- a/gcc/config/i386/i386-builtin-types.def +++ b/gcc/config/i386/i386-builtin-types.def @@ -1304,9 +1304,11 @@ DEF_FUNCTION_TYPE (UINT8, PV2DI, PCV2DI, PCVOID) # FP16 builtins DEF_FUNCTION_TYPE (V8HF, V8HI) +DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF) DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF, INT) DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF, V8HF, UQI) DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF, V8HF, UQI, INT) +DEF_FUNCTION_TYPE (V16HF, V16HF, V16HF) DEF_FUNCTION_TYPE (V16HF, V16HF, V16HF, V16HF, UHI) DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, INT) DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, USI) diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def index 85ad235..df134d6 100644 --- a/gcc/config/i386/i386-builtin.def +++ b/gcc/config/i386/i386-builtin.def @@ -2791,6 +2791,14 @@ BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmaddv8hf3_mask, "__b BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsubv8hf3_mask, "__builtin_ia32_vsubsh_v8hf_mask", IX86_BUILTIN_VSUBSH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmmulv8hf3_mask, "__builtin_ia32_vmulsh_v8hf_mask", IX86_BUILTIN_VMULSH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmdivv8hf3_mask, "__builtin_ia32_vdivsh_v8hf_mask", IX86_BUILTIN_VDIVSH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_smaxv8hf3_mask, "__builtin_ia32_vmaxph_v8hf_mask", IX86_BUILTIN_VMAXPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_smaxv16hf3_mask, "__builtin_ia32_vmaxph_v16hf_mask", IX86_BUILTIN_VMAXPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_smaxv32hf3_mask, "__builtin_ia32_vmaxph_v32hf_mask", IX86_BUILTIN_VMAXPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_sminv8hf3_mask, "__builtin_ia32_vminph_v8hf_mask", IX86_BUILTIN_VMINPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_sminv16hf3_mask, "__builtin_ia32_vminph_v16hf_mask", IX86_BUILTIN_VMINPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_sminv32hf3_mask, "__builtin_ia32_vminph_v32hf_mask", IX86_BUILTIN_VMINPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsmaxv8hf3_mask, "__builtin_ia32_vmaxsh_v8hf_mask", IX86_BUILTIN_VMAXSH_V8HF_MASK, UNKNOWN, (int) 
V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsminv8hf3_mask, "__builtin_ia32_vminsh_v8hf_mask", IX86_BUILTIN_VMINSH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) /* Builtins with rounding support. */ BDESC_END (ARGS, ROUND_ARGS) @@ -3000,6 +3008,10 @@ BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmaddv8hf3_mask_round BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsubv8hf3_mask_round, "__builtin_ia32_vsubsh_v8hf_mask_round", IX86_BUILTIN_VSUBSH_V8HF_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmmulv8hf3_mask_round, "__builtin_ia32_vmulsh_v8hf_mask_round", IX86_BUILTIN_VMULSH_V8HF_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmdivv8hf3_mask_round, "__builtin_ia32_vdivsh_v8hf_mask_round", IX86_BUILTIN_VDIVSH_V8HF_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_smaxv32hf3_mask_round, "__builtin_ia32_vmaxph_v32hf_mask_round", IX86_BUILTIN_VMAXPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_sminv32hf3_mask_round, "__builtin_ia32_vminph_v32hf_mask_round", IX86_BUILTIN_VMINPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsmaxv8hf3_mask_round, "__builtin_ia32_vmaxsh_v8hf_mask_round", IX86_BUILTIN_VMAXSH_V8HF_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsminv8hf3_mask_round, "__builtin_ia32_vminsh_v8hf_mask_round", IX86_BUILTIN_VMINSH_V8HF_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) BDESC_END (ROUND_ARGS, MULTI_ARG) diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 29a4374..9b6648d 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -9527,12 +9527,14 @@ ix86_expand_args_builtin (const struct builtin_description *d, case FLOAT128_FTYPE_FLOAT128_FLOAT128: case V16QI_FTYPE_V16QI_V16QI: case V16QI_FTYPE_V8HI_V8HI: + case V16HF_FTYPE_V16HF_V16HF: case V16SF_FTYPE_V16SF_V16SF: case V8QI_FTYPE_V8QI_V8QI: case V8QI_FTYPE_V4HI_V4HI: case V8HI_FTYPE_V8HI_V8HI: case V8HI_FTYPE_V16QI_V16QI: case V8HI_FTYPE_V4SI_V4SI: + case V8HF_FTYPE_V8HF_V8HF: case V8SF_FTYPE_V8SF_V8SF: case V8SF_FTYPE_V8SF_V8SI: case V8DF_FTYPE_V8DF_V8DF: diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 6f2072c..1aab06d 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -2545,11 +2545,12 @@ }) (define_expand "3" - [(set (match_operand:VF 0 "register_operand") - (smaxmin:VF - (match_operand:VF 1 "") - (match_operand:VF 2 "")))] - "TARGET_SSE && && " + [(set (match_operand:VFH 0 "register_operand") + (smaxmin:VFH + (match_operand:VFH 1 "") + (match_operand:VFH 2 "")))] + "TARGET_SSE && + && " { if (!flag_finite_math_only || flag_signed_zeros) { @@ -2570,13 +2571,14 @@ ;; are undefined in this condition, we're certain this is correct. 
(define_insn "*3" - [(set (match_operand:VF 0 "register_operand" "=x,v") - (smaxmin:VF - (match_operand:VF 1 "" "%0,v") - (match_operand:VF 2 "" "xBm,")))] + [(set (match_operand:VFH 0 "register_operand" "=x,v") + (smaxmin:VFH + (match_operand:VFH 1 "" "%0,v") + (match_operand:VFH 2 "" "xBm,")))] "TARGET_SSE && !(MEM_P (operands[1]) && MEM_P (operands[2])) - && && " + && + && " "@ \t{%2, %0|%0, %2} v\t{%2, %1, %0|%0, %1, %2}" @@ -2593,13 +2595,14 @@ ;; presence of -0.0 and NaN. (define_insn "ieee_3" - [(set (match_operand:VF 0 "register_operand" "=x,v") - (unspec:VF - [(match_operand:VF 1 "register_operand" "0,v") - (match_operand:VF 2 "" "xBm,")] + [(set (match_operand:VFH 0 "register_operand" "=x,v") + (unspec:VFH + [(match_operand:VFH 1 "register_operand" "0,v") + (match_operand:VFH 2 "" "xBm,")] IEEE_MAXMIN))] "TARGET_SSE - && && " + && + && " "@ \t{%2, %0|%0, %2} v\t{%2, %1, %0|%0, %1, %2}" @@ -2643,11 +2646,11 @@ (set_attr "mode" "")]) (define_insn "_vm3" - [(set (match_operand:VF_128 0 "register_operand" "=x,v") - (vec_merge:VF_128 - (smaxmin:VF_128 - (match_operand:VF_128 1 "register_operand" "0,v") - (match_operand:VF_128 2 "nonimmediate_operand" "xm,")) + [(set (match_operand:VFH_128 0 "register_operand" "=x,v") + (vec_merge:VFH_128 + (smaxmin:VFH_128 + (match_operand:VFH_128 1 "register_operand" "0,v") + (match_operand:VFH_128 2 "nonimmediate_operand" "xm,")) (match_dup 1) (const_int 1)))] "TARGET_SSE" diff --git a/gcc/config/i386/subst.md b/gcc/config/i386/subst.md index 94426a5..717561a 100644 --- a/gcc/config/i386/subst.md +++ b/gcc/config/i386/subst.md @@ -208,7 +208,9 @@ (define_subst_attr "round_saeonly_mode512bit_condition" "round_saeonly" "1" "(mode == V16SFmode || mode == V8DFmode || mode == V8DImode - || mode == V16SImode)") + || mode == V16SImode + || mode == V32HFmode)") + (define_subst_attr "round_saeonly_modev8sf_condition" "round_saeonly" "1" "(mode == V8SFmode)") (define_subst "round_saeonly" -- cgit v1.1 From 0f200733fe863c7ed4d33ab3fda16471d5d69981 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Mon, 18 Feb 2019 18:04:02 -0800 Subject: AVX512FP16: Add vcmpph/vcmpsh/vcomish/vucomish. gcc/ChangeLog: * config/i386/avx512fp16intrin.h: (_mm512_cmp_ph_mask): New intrinsic. (_mm512_mask_cmp_ph_mask): Likewise. (_mm512_cmp_round_ph_mask): Likewise. (_mm512_mask_cmp_round_ph_mask): Likewise. (_mm_cmp_sh_mask): Likewise. (_mm_mask_cmp_sh_mask): Likewise. (_mm_cmp_round_sh_mask): Likewise. (_mm_mask_cmp_round_sh_mask): Likewise. (_mm_comieq_sh): Likewise. (_mm_comilt_sh): Likewise. (_mm_comile_sh): Likewise. (_mm_comigt_sh): Likewise. (_mm_comige_sh): Likewise. (_mm_comineq_sh): Likewise. (_mm_ucomieq_sh): Likewise. (_mm_ucomilt_sh): Likewise. (_mm_ucomile_sh): Likewise. (_mm_ucomigt_sh): Likewise. (_mm_ucomige_sh): Likewise. (_mm_ucomineq_sh): Likewise. (_mm_comi_round_sh): Likewise. (_mm_comi_sh): Likewise. * config/i386/avx512fp16vlintrin.h (_mm_cmp_ph_mask): New intrinsic. (_mm_mask_cmp_ph_mask): Likewise. (_mm256_cmp_ph_mask): Likewise. (_mm256_mask_cmp_ph_mask): Likewise. * config/i386/i386-builtin-types.def: Add corresponding builtin types. * config/i386/i386-builtin.def: Add corresponding new builtins. * config/i386/i386-expand.c (ix86_expand_args_builtin): Handle new builtin types. (ix86_expand_round_builtin): Ditto. * config/i386/i386.md (ssevecmode): Add HF mode. (MODEFH): New mode iterator. * config/i386/sse.md (V48H_AVX512VL): New mode iterator to support HF vector modes. Ajdust corresponding description. (ssecmpintprefix): New. 
(VI12_AVX512VL): Adjust to support HF vector modes. (cmp_imm_predicate): Likewise. (_cmp3): Likewise. (avx512f_vmcmp3): Likewise. (avx512f_vmcmp3_mask): Likewise. (_comi): Likewise. gcc/testsuite/ChangeLog: * gcc.target/i386/avx-1.c: Add test for new builtins. * gcc.target/i386/sse-13.c: Ditto. * gcc.target/i386/sse-23.c: Ditto. * gcc.target/i386/sse-14.c: Add test for new intrinsics. * gcc.target/i386/sse-22.c: Ditto. --- gcc/config/i386/avx512fp16intrin.h | 250 +++++++++++++++++++++++++++++++++ gcc/config/i386/avx512fp16vlintrin.h | 50 +++++++ gcc/config/i386/i386-builtin-types.def | 5 + gcc/config/i386/i386-builtin.def | 5 + gcc/config/i386/i386-expand.c | 10 ++ gcc/config/i386/i386.md | 5 +- gcc/config/i386/sse.md | 56 +++++--- 7 files changed, 363 insertions(+), 18 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/avx512fp16intrin.h b/gcc/config/i386/avx512fp16intrin.h index c232419..ed8ad84 100644 --- a/gcc/config/i386/avx512fp16intrin.h +++ b/gcc/config/i386/avx512fp16intrin.h @@ -985,6 +985,256 @@ _mm_maskz_min_round_sh (__mmask8 __A, __m128h __B, __m128h __C, #endif /* __OPTIMIZE__ */ +/* vcmpph */ +#ifdef __OPTIMIZE +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmp_ph_mask (__m512h __A, __m512h __B, const int __C) +{ + return (__mmask32) __builtin_ia32_vcmpph_v32hf_mask (__A, __B, __C, + (__mmask32) -1); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmp_ph_mask (__mmask32 __A, __m512h __B, __m512h __C, + const int __D) +{ + return (__mmask32) __builtin_ia32_vcmpph_v32hf_mask (__B, __C, __D, + __A); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmp_round_ph_mask (__m512h __A, __m512h __B, const int __C, + const int __D) +{ + return (__mmask32) __builtin_ia32_vcmpph_v32hf_mask_round (__A, __B, + __C, (__mmask32) -1, + __D); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmp_round_ph_mask (__mmask32 __A, __m512h __B, __m512h __C, + const int __D, const int __E) +{ + return (__mmask32) __builtin_ia32_vcmpph_v32hf_mask_round (__B, __C, + __D, __A, + __E); +} + +#else +#define _mm512_cmp_ph_mask(A, B, C) \ + (__builtin_ia32_vcmpph_v32hf_mask ((A), (B), (C), (-1))) + +#define _mm512_mask_cmp_ph_mask(A, B, C, D) \ + (__builtin_ia32_vcmpph_v32hf_mask ((B), (C), (D), (A))) + +#define _mm512_cmp_round_ph_mask(A, B, C, D) \ + (__builtin_ia32_vcmpph_v32hf_mask_round ((A), (B), (C), (-1), (D))) + +#define _mm512_mask_cmp_round_ph_mask(A, B, C, D, E) \ + (__builtin_ia32_vcmpph_v32hf_mask_round ((B), (C), (D), (A), (E))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcmpsh. 
*/ +#ifdef __OPTIMIZE__ +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_sh_mask (__m128h __A, __m128h __B, const int __C) +{ + return (__mmask8) + __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, + __C, (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmp_sh_mask (__mmask8 __A, __m128h __B, __m128h __C, + const int __D) +{ + return (__mmask8) + __builtin_ia32_vcmpsh_v8hf_mask_round (__B, __C, + __D, __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_round_sh_mask (__m128h __A, __m128h __B, const int __C, + const int __D) +{ + return (__mmask8) __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, + __C, (__mmask8) -1, + __D); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmp_round_sh_mask (__mmask8 __A, __m128h __B, __m128h __C, + const int __D, const int __E) +{ + return (__mmask8) __builtin_ia32_vcmpsh_v8hf_mask_round (__B, __C, + __D, __A, + __E); +} + +#else +#define _mm_cmp_sh_mask(A, B, C) \ + (__builtin_ia32_vcmpsh_v8hf_mask_round ((A), (B), (C), (-1), \ + (_MM_FROUND_CUR_DIRECTION))) + +#define _mm_mask_cmp_sh_mask(A, B, C, D) \ + (__builtin_ia32_vcmpsh_v8hf_mask_round ((B), (C), (D), (A), \ + (_MM_FROUND_CUR_DIRECTION))) + +#define _mm_cmp_round_sh_mask(A, B, C, D) \ + (__builtin_ia32_vcmpsh_v8hf_mask_round ((A), (B), (C), (-1), (D))) + +#define _mm_mask_cmp_round_sh_mask(A, B, C, D, E) \ + (__builtin_ia32_vcmpsh_v8hf_mask_round ((B), (C), (D), (A), (E))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcomish. */ +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comieq_sh (__m128h __A, __m128h __B) +{ + return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_EQ_OS, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comilt_sh (__m128h __A, __m128h __B) +{ + return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_LT_OS, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comile_sh (__m128h __A, __m128h __B) +{ + return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_LE_OS, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comigt_sh (__m128h __A, __m128h __B) +{ + return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_GT_OS, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comige_sh (__m128h __A, __m128h __B) +{ + return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_GE_OS, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comineq_sh (__m128h __A, __m128h __B) +{ + return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_NEQ_US, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomieq_sh (__m128h __A, __m128h __B) +{ + return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_EQ_OQ, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + 
+extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomilt_sh (__m128h __A, __m128h __B) +{ + return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_LT_OQ, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomile_sh (__m128h __A, __m128h __B) +{ + return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_LE_OQ, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomigt_sh (__m128h __A, __m128h __B) +{ + return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_GT_OQ, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomige_sh (__m128h __A, __m128h __B) +{ + return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_GE_OQ, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomineq_sh (__m128h __A, __m128h __B) +{ + return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_NEQ_UQ, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comi_sh (__m128h __A, __m128h __B, const int __P) +{ + return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, __P, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comi_round_sh (__m128h __A, __m128h __B, const int __P, const int __R) +{ + return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, __P, + (__mmask8) -1,__R); +} + +#else +#define _mm_comi_round_sh(A, B, P, R) \ + (__builtin_ia32_vcmpsh_v8hf_mask_round ((A), (B), (P), (__mmask8) (-1), (R))) +#define _mm_comi_sh(A, B, P) \ + (__builtin_ia32_vcmpsh_v8hf_mask_round ((A), (B), (P), (__mmask8) (-1), \ + _MM_FROUND_CUR_DIRECTION)) + +#endif /* __OPTIMIZE__ */ + #ifdef __DISABLE_AVX512FP16__ #undef __DISABLE_AVX512FP16__ #pragma GCC pop_options diff --git a/gcc/config/i386/avx512fp16vlintrin.h b/gcc/config/i386/avx512fp16vlintrin.h index bd60b4c..1787ed5 100644 --- a/gcc/config/i386/avx512fp16vlintrin.h +++ b/gcc/config/i386/avx512fp16vlintrin.h @@ -308,6 +308,56 @@ _mm256_maskz_min_ph (__mmask16 __A, __m256h __B, __m256h __C) _mm256_setzero_ph (), __A); } +/* vcmpph */ +#ifdef __OPTIMIZE +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_ph_mask (__m128h __A, __m128h __B, const int __C) +{ + return (__mmask8) __builtin_ia32_vcmpph_v8hf_mask (__A, __B, __C, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmp_ph_mask (__mmask8 __A, __m128h __B, __m128h __C, + const int __D) +{ + return (__mmask8) __builtin_ia32_vcmpph_v8hf_mask (__B, __C, __D, __A); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_ph_mask (__m256h __A, __m256h __B, const int __C) +{ + return (__mmask16) __builtin_ia32_vcmpph_v16hf_mask (__A, __B, __C, + (__mmask16) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmp_ph_mask (__mmask16 __A, __m256h __B, __m256h __C, + const int __D) +{ + return (__mmask16) __builtin_ia32_vcmpph_v16hf_mask 
(__B, __C, __D, + __A); +} + +#else +#define _mm_cmp_ph_mask(A, B, C) \ + (__builtin_ia32_vcmpph_v8hf_mask ((A), (B), (C), (-1))) + +#define _mm_mask_cmp_ph_mask(A, B, C, D) \ + (__builtin_ia32_vcmpph_v8hf_mask ((B), (C), (D), (A))) + +#define _mm256_cmp_ph_mask(A, B, C) \ + (__builtin_ia32_vcmpph_v16hf_mask ((A), (B), (C), (-1))) + +#define _mm256_mask_cmp_ph_mask(A, B, C, D) \ + (__builtin_ia32_vcmpph_v16hf_mask ((B), (C), (D), (A))) + +#endif /* __OPTIMIZE__ */ + #ifdef __DISABLE_AVX512FP16VL__ #undef __DISABLE_AVX512FP16VL__ #pragma GCC pop_options diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def index 658fb69..d11c02b 100644 --- a/gcc/config/i386/i386-builtin-types.def +++ b/gcc/config/i386/i386-builtin-types.def @@ -1306,10 +1306,15 @@ DEF_FUNCTION_TYPE (UINT8, PV2DI, PCV2DI, PCVOID) DEF_FUNCTION_TYPE (V8HF, V8HI) DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF) DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF, INT) +DEF_FUNCTION_TYPE (UQI, V8HF, V8HF, INT, UQI) DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF, V8HF, UQI) +DEF_FUNCTION_TYPE (UQI, V8HF, V8HF, INT, UQI, INT) DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF, V8HF, UQI, INT) DEF_FUNCTION_TYPE (V16HF, V16HF, V16HF) +DEF_FUNCTION_TYPE (UHI, V16HF, V16HF, INT, UHI) DEF_FUNCTION_TYPE (V16HF, V16HF, V16HF, V16HF, UHI) DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, INT) +DEF_FUNCTION_TYPE (USI, V32HF, V32HF, INT, USI) DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, USI) +DEF_FUNCTION_TYPE (USI, V32HF, V32HF, INT, USI, INT) DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, USI, INT) diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def index df134d6..c9d80cb 100644 --- a/gcc/config/i386/i386-builtin.def +++ b/gcc/config/i386/i386-builtin.def @@ -2799,6 +2799,9 @@ BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_sminv16hf BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_sminv32hf3_mask, "__builtin_ia32_vminph_v32hf_mask", IX86_BUILTIN_VMINPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsmaxv8hf3_mask, "__builtin_ia32_vmaxsh_v8hf_mask", IX86_BUILTIN_VMAXSH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsminv8hf3_mask, "__builtin_ia32_vminsh_v8hf_mask", IX86_BUILTIN_VMINSH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_cmpv8hf3_mask, "__builtin_ia32_vcmpph_v8hf_mask", IX86_BUILTIN_VCMPPH_V8HF_MASK, UNKNOWN, (int) UQI_FTYPE_V8HF_V8HF_INT_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_cmpv16hf3_mask, "__builtin_ia32_vcmpph_v16hf_mask", IX86_BUILTIN_VCMPPH_V16HF_MASK, UNKNOWN, (int) UHI_FTYPE_V16HF_V16HF_INT_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_cmpv32hf3_mask, "__builtin_ia32_vcmpph_v32hf_mask", IX86_BUILTIN_VCMPPH_V32HF_MASK, UNKNOWN, (int) USI_FTYPE_V32HF_V32HF_INT_USI) /* Builtins with rounding support. 
*/ BDESC_END (ARGS, ROUND_ARGS) @@ -3012,6 +3015,8 @@ BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_smaxv32hf3_mask_round, "__builti BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_sminv32hf3_mask_round, "__builtin_ia32_vminph_v32hf_mask_round", IX86_BUILTIN_VMINPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsmaxv8hf3_mask_round, "__builtin_ia32_vmaxsh_v8hf_mask_round", IX86_BUILTIN_VMAXSH_V8HF_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsminv8hf3_mask_round, "__builtin_ia32_vminsh_v8hf_mask_round", IX86_BUILTIN_VMINSH_V8HF_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_cmpv32hf3_mask_round, "__builtin_ia32_vcmpph_v32hf_mask_round", IX86_BUILTIN_VCMPPH_V32HF_MASK_ROUND, UNKNOWN, (int) USI_FTYPE_V32HF_V32HF_INT_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_vmcmpv8hf3_mask_round, "__builtin_ia32_vcmpsh_v8hf_mask_round", IX86_BUILTIN_VCMPSH_V8HF_MASK_ROUND, UNKNOWN, (int) UQI_FTYPE_V8HF_V8HF_INT_UQI_INT) BDESC_END (ROUND_ARGS, MULTI_ARG) diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 9b6648d..e117afb 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -9999,14 +9999,17 @@ ix86_expand_args_builtin (const struct builtin_description *d, case UQI_FTYPE_V8SI_V8SI_INT_UQI: case QI_FTYPE_V4DF_V4DF_INT_UQI: case QI_FTYPE_V8SF_V8SF_INT_UQI: + case UHI_FTYPE_V16HF_V16HF_INT_UHI: case UQI_FTYPE_V2DI_V2DI_INT_UQI: case UQI_FTYPE_V4SI_V4SI_INT_UQI: case UQI_FTYPE_V2DF_V2DF_INT_UQI: case UQI_FTYPE_V4SF_V4SF_INT_UQI: + case UQI_FTYPE_V8HF_V8HF_INT_UQI: case UDI_FTYPE_V64QI_V64QI_INT_UDI: case USI_FTYPE_V32QI_V32QI_INT_USI: case UHI_FTYPE_V16QI_V16QI_INT_UHI: case USI_FTYPE_V32HI_V32HI_INT_USI: + case USI_FTYPE_V32HF_V32HF_INT_USI: case UHI_FTYPE_V16HI_V16HI_INT_UHI: case UQI_FTYPE_V8HI_V8HI_INT_UQI: nargs = 4; @@ -10290,6 +10293,9 @@ ix86_expand_args_builtin (const struct builtin_description *d, case CODE_FOR_avx512f_cmpv16sf3_mask: case CODE_FOR_avx512f_vmcmpv2df3_mask: case CODE_FOR_avx512f_vmcmpv4sf3_mask: + case CODE_FOR_avx512bw_cmpv32hf3_mask: + case CODE_FOR_avx512vl_cmpv16hf3_mask: + case CODE_FOR_avx512fp16_cmpv8hf3_mask: error ("the last argument must be a 5-bit immediate"); return const0_rtx; @@ -10710,6 +10716,8 @@ ix86_expand_round_builtin (const struct builtin_description *d, case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT: case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT: case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT: + case USI_FTYPE_V32HF_V32HF_INT_USI_INT: + case UQI_FTYPE_V8HF_V8HF_INT_UQI_INT: nargs_constant = 3; nargs = 5; break; @@ -10765,6 +10773,8 @@ ix86_expand_round_builtin (const struct builtin_description *d, case CODE_FOR_avx512f_cmpv16sf3_mask_round: case CODE_FOR_avx512f_vmcmpv2df3_mask_round: case CODE_FOR_avx512f_vmcmpv4sf3_mask_round: + case CODE_FOR_avx512f_vmcmpv8hf3_mask_round: + case CODE_FOR_avx512bw_cmpv32hf3_mask_round: error ("the immediate argument must be a 5-bit immediate"); return const0_rtx; default: diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index ba0058d..c415487 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -1229,6 +1229,9 @@ ;; SSE and x87 SFmode and DFmode floating point modes (define_mode_iterator MODEF [SF DF]) +;; SSE floating point modes +(define_mode_iterator MODEFH [(HF "TARGET_AVX512FP16") SF DF]) + ;; All x87 
floating point modes (define_mode_iterator X87MODEF [SF DF XF]) @@ -1254,7 +1257,7 @@ ;; SSE vector mode corresponding to a scalar mode (define_mode_attr ssevecmode - [(QI "V16QI") (HI "V8HI") (SI "V4SI") (DI "V2DI") (SF "V4SF") (DF "V2DF")]) + [(QI "V16QI") (HI "V8HI") (SI "V4SI") (DI "V2DI") (HF "V8HF") (SF "V4SF") (DF "V2DF")]) (define_mode_attr ssevecmodelower [(QI "v16qi") (HI "v8hi") (SI "v4si") (DI "v2di") (SF "v4sf") (DF "v2df")]) diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 1aab06d..516eb45 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -226,7 +226,7 @@ (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") V2DF]) -;; All AVX-512{F,VL} vector modes. Supposed TARGET_AVX512F baseline. +;; All AVX-512{F,VL} vector modes without HF. Supposed TARGET_AVX512F baseline. (define_mode_iterator V48_AVX512VL [V16SI (V8SI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL") V8DI (V4DI "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL") @@ -239,6 +239,16 @@ V16SF (V8SF "TARGET_AVX512VL") V8DF (V4DF "TARGET_AVX512VL")]) +;; All AVX-512{F,VL} vector modes. Supposed TARGET_AVX512F baseline. +(define_mode_iterator V48H_AVX512VL + [V16SI (V8SI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL") + V8DI (V4DI "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL") + (V32HF "TARGET_AVX512FP16") + (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL") + (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL") + V16SF (V8SF "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL") + V8DF (V4DF "TARGET_AVX512VL") (V2DF "TARGET_AVX512VL")]) + ;; 1,2 byte AVX-512{BW,VL} vector modes. Supposed TARGET_AVX512BW baseline. (define_mode_iterator VI12_AVX512VL [V64QI (V16QI "TARGET_AVX512VL") (V32QI "TARGET_AVX512VL") @@ -1014,10 +1024,10 @@ [(V64QI "b") (V32HI "w") (V16SI "k") (V8DI "q") (V32QI "b") (V16HI "w") (V8SI "k") (V4DI "q") (V16QI "b") (V8HI "w") (V4SI "k") (V2DI "q") - (V16SF "k") (V8DF "q") - (V8SF "k") (V4DF "q") - (V4SF "k") (V2DF "q") - (SF "k") (DF "q")]) + (V32HF "w") (V16SF "k") (V8DF "q") + (V16HF "w") (V8SF "k") (V4DF "q") + (V8HF "w") (V4SF "k") (V2DF "q") + (HF "w") (SF "k") (DF "q")]) ;; Mapping of vector modes to VPTERNLOG suffix (define_mode_attr ternlogsuffix @@ -1067,6 +1077,18 @@ (V32QI "p") (V16HI "p") (V16HF "p") (V64QI "p") (V32HI "p") (V32HF "p")]) +;; SSE prefix for integer and HF vector comparison. 
+(define_mode_attr ssecmpintprefix + [(V2DI "p") (V2DF "") + (V4DI "p") (V4DF "") + (V8DI "p") (V8DF "") + (V4SI "p") (V4SF "") + (V8SI "p") (V8SF "") + (V16SI "p") (V16SF "") + (V16QI "p") (V8HI "p") (V8HF "") + (V32QI "p") (V16HI "p") (V16HF "") + (V64QI "p") (V32HI "p") (V32HF "")]) + ;; SSE scalar suffix for vector modes (define_mode_attr ssescalarmodesuffix [(HF "sh") (SF "ss") (DF "sd") @@ -3450,11 +3472,11 @@ (set_attr "mode" "")]) (define_mode_attr cmp_imm_predicate - [(V16SF "const_0_to_31_operand") (V8DF "const_0_to_31_operand") + [(V32HF "const_0_to_31_operand") (V16SF "const_0_to_31_operand") (V8DF "const_0_to_31_operand") (V16SI "const_0_to_7_operand") (V8DI "const_0_to_7_operand") - (V8SF "const_0_to_31_operand") (V4DF "const_0_to_31_operand") + (V16HF "const_0_to_31_operand") (V8SF "const_0_to_31_operand") (V4DF "const_0_to_31_operand") (V8SI "const_0_to_7_operand") (V4DI "const_0_to_7_operand") - (V4SF "const_0_to_31_operand") (V2DF "const_0_to_31_operand") + (V8HF "const_0_to_31_operand") (V4SF "const_0_to_31_operand") (V2DF "const_0_to_31_operand") (V4SI "const_0_to_7_operand") (V2DI "const_0_to_7_operand") (V32HI "const_0_to_7_operand") (V64QI "const_0_to_7_operand") (V16HI "const_0_to_7_operand") (V32QI "const_0_to_7_operand") @@ -3463,12 +3485,12 @@ (define_insn "_cmp3" [(set (match_operand: 0 "register_operand" "=k") (unspec: - [(match_operand:V48_AVX512VL 1 "register_operand" "v") - (match_operand:V48_AVX512VL 2 "nonimmediate_operand" "") + [(match_operand:V48H_AVX512VL 1 "register_operand" "v") + (match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "") (match_operand:SI 3 "" "n")] UNSPEC_PCMP))] "TARGET_AVX512F && " - "vcmp\t{%3, %2, %1, %0|%0, %1, %2, %3}" + "vcmp\t{%3, %2, %1, %0|%0, %1, %2, %3}" [(set_attr "type" "ssecmp") (set_attr "length_immediate" "1") (set_attr "prefix" "evex") @@ -3617,8 +3639,8 @@ [(set (match_operand: 0 "register_operand" "=k") (and: (unspec: - [(match_operand:VF_128 1 "register_operand" "v") - (match_operand:VF_128 2 "" "") + [(match_operand:VFH_128 1 "register_operand" "v") + (match_operand:VFH_128 2 "" "") (match_operand:SI 3 "const_0_to_31_operand" "n")] UNSPEC_PCMP) (const_int 1)))] @@ -3633,8 +3655,8 @@ [(set (match_operand: 0 "register_operand" "=k") (and: (unspec: - [(match_operand:VF_128 1 "register_operand" "v") - (match_operand:VF_128 2 "" "") + [(match_operand:VFH_128 1 "register_operand" "v") + (match_operand:VFH_128 2 "" "") (match_operand:SI 3 "const_0_to_31_operand" "n")] UNSPEC_PCMP) (and: @@ -3650,10 +3672,10 @@ (define_insn "_comi" [(set (reg:CCFP FLAGS_REG) (compare:CCFP - (vec_select:MODEF + (vec_select:MODEFH (match_operand: 0 "register_operand" "v") (parallel [(const_int 0)])) - (vec_select:MODEF + (vec_select:MODEFH (match_operand: 1 "" "") (parallel [(const_int 0)]))))] "SSE_FLOAT_MODE_P (mode)" -- cgit v1.1 From 4f0f696fea17cd91b184181abcf596df0e857304 Mon Sep 17 00:00:00 2001 From: David Faust Date: Fri, 20 Aug 2021 14:54:42 -0700 Subject: bpf: correct zero_extend output templates The output templates for zero_extendhidi2 and zero_extendqidi2 could lead to incorrect code generation when zero-extending one register into another. This patch adds a new output template to the define_insns to handle such cases and produce correct asm. gcc/ChangeLog: * config/bpf/bpf.md (zero_extendhidi2): Add new output template for register-to-register extensions. (zero_extendqidi2): Likewise. 
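As a sketch of how the bug can be hit (illustrative only, not the test case from the patch), consider code that widens a narrow value already held in a register into a different register:

  /* Illustrative only: returning a register-held unsigned short as a
     64-bit value goes through zero_extendhidi2 with a register source.  */
  unsigned long
  widen (unsigned short x)
  {
    return x;
  }

With the old templates the register alternative emitted only the "and", which is correct only when source and destination happen to be the same register; the new alternatives keep that form for the tied case and emit "mov" followed by "and" when the two registers differ.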
--- gcc/config/bpf/bpf.md | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/bpf/bpf.md b/gcc/config/bpf/bpf.md index 03830cc..c51add7 100644 --- a/gcc/config/bpf/bpf.md +++ b/gcc/config/bpf/bpf.md @@ -241,22 +241,24 @@ ;; the ldx{bhwdw} instructions to load the values in registers. (define_insn "zero_extendhidi2" - [(set (match_operand:DI 0 "register_operand" "=r,r") - (zero_extend:DI (match_operand:HI 1 "nonimmediate_operand" "r,m")))] + [(set (match_operand:DI 0 "register_operand" "=r,r,r") + (zero_extend:DI (match_operand:HI 1 "nonimmediate_operand" "0,r,m")))] "" "@ and\t%0,0xffff + mov\t%0,%1\;and\t%0,0xffff ldxh\t%0,%1" - [(set_attr "type" "alu,ldx")]) + [(set_attr "type" "alu,alu,ldx")]) (define_insn "zero_extendqidi2" - [(set (match_operand:DI 0 "register_operand" "=r,r") - (zero_extend:DI (match_operand:QI 1 "nonimmediate_operand" "r,m")))] + [(set (match_operand:DI 0 "register_operand" "=r,r,r") + (zero_extend:DI (match_operand:QI 1 "nonimmediate_operand" "0,r,m")))] "" "@ and\t%0,0xff + mov\t%0,%1\;and\t%0,0xff ldxb\t%0,%1" - [(set_attr "type" "alu,ldx")]) + [(set_attr "type" "alu,alu,ldx")]) (define_insn "zero_extendsidi2" [(set (match_operand:DI 0 "register_operand" "=r,r") -- cgit v1.1 From 5b2ab1d35e41528ea844c6f5ee030f8e032f4c18 Mon Sep 17 00:00:00 2001 From: David Faust Date: Wed, 8 Sep 2021 10:26:15 -0700 Subject: bpf: add -mcpu and related feature options New instructions have been added over time to the eBPF ISA, but previously there has been no good method to select which version to target in GCC. This patch adds the following options to the BPF backend: -mcpu={v1, v2, v3} Select which version of the eBPF ISA to target. This enables or disables generation of certain instructions. The default is v3. -mjmpext Enable extra conditional branch instructions. Enabled for CPU v2 and above. -mjmp32 Enable 32-bit jump/branch instructions. Enabled for CPU v3 and above. -malu32 Enable 32-bit ALU instructions. Enabled for CPU v3 and above. gcc/ChangeLog: * config/bpf/bpf-opts.h (bpf_isa_version): New enum. * config/bpf/bpf-protos.h (bpf_expand_cbranch): New. * config/bpf/bpf.c (bpf_option_override): Handle -mcpu option. (bpf_expand_cbranch): New function. * config/bpf/bpf.md (AM mode iterator): Conditionalize support for SI mode. (zero_extendsidi2): Only use mov32 instruction if it is available. (SIM mode iterator): Conditionalize support for SI mode. (JM mode iterator): New. (cbranchdi4): Update name, use new JM iterator. Use bpf_expand_cbranch. (*branch_on_di): Update name, use new JM iterator. * config/bpf/bpf.opt: (mjmpext): New option. (malu32): Likewise. (mjmp32): Likewise. (mcpu): Likewise. (bpf_isa): New enum. --- gcc/config/bpf/bpf-opts.h | 7 +++++++ gcc/config/bpf/bpf-protos.h | 1 + gcc/config/bpf/bpf.c | 41 +++++++++++++++++++++++++++++++++++++++++ gcc/config/bpf/bpf.md | 44 ++++++++++++++++++++++++-------------------- gcc/config/bpf/bpf.opt | 29 +++++++++++++++++++++++++++++ 5 files changed, 102 insertions(+), 20 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/bpf/bpf-opts.h b/gcc/config/bpf/bpf-opts.h index 1bc930c..456e39c 100644 --- a/gcc/config/bpf/bpf-opts.h +++ b/gcc/config/bpf/bpf-opts.h @@ -53,4 +53,11 @@ enum bpf_kernel_version LINUX_NATIVE, }; +enum bpf_isa_version +{ + ISA_V1, + ISA_V2, + ISA_V3, +}; + #endif /* ! 
BPF_OPTS_H */ diff --git a/gcc/config/bpf/bpf-protos.h b/gcc/config/bpf/bpf-protos.h index 7ce3386..e6bb901 100644 --- a/gcc/config/bpf/bpf-protos.h +++ b/gcc/config/bpf/bpf-protos.h @@ -29,6 +29,7 @@ extern void bpf_print_operand (FILE *, rtx, int); extern void bpf_print_operand_address (FILE *, rtx); extern void bpf_expand_prologue (void); extern void bpf_expand_epilogue (void); +extern void bpf_expand_cbranch (machine_mode, rtx *); rtl_opt_pass * make_pass_bpf_core_attr (gcc::context *); diff --git a/gcc/config/bpf/bpf.c b/gcc/config/bpf/bpf.c index 01d9c03..82bb698 100644 --- a/gcc/config/bpf/bpf.c +++ b/gcc/config/bpf/bpf.c @@ -242,6 +242,17 @@ bpf_option_override (void) target_flags |= MASK_BPF_CORE; write_symbols |= BTF_WITH_CORE_DEBUG; } + + /* Determine available features from ISA setting (-mcpu=). */ + if (bpf_has_jmpext == -1) + bpf_has_jmpext = (bpf_isa >= ISA_V2); + + if (bpf_has_alu32 == -1) + bpf_has_alu32 = (bpf_isa >= ISA_V3); + + if (bpf_has_jmp32 == -1) + bpf_has_jmp32 = (bpf_isa >= ISA_V3); + } #undef TARGET_OPTION_OVERRIDE @@ -540,6 +551,36 @@ bpf_expand_epilogue (void) emit_jump_insn (gen_exit ()); } +/* Expand to the instructions for a conditional branch. This function + is called when expanding the 'cbranch4' pattern in bpf.md. */ + +void +bpf_expand_cbranch (machine_mode mode, rtx *operands) +{ + /* If all jump instructions are available, nothing special to do here. */ + if (bpf_has_jmpext) + return; + + enum rtx_code code = GET_CODE (operands[0]); + + /* Without the conditional branch instructions jslt, jsle, jlt, jle, we need + to convert conditional branches that would use them to an available + operation instead by reversing the comparison. */ + if ((code == LT || code == LE || code == LTU || code == LEU)) + { + /* Reverse the condition. */ + PUT_CODE (operands[0], reverse_condition (code)); + + /* Swap the operands, and ensure that the first is a register. */ + if (!register_operand (operands[2], mode)) + operands[2] = force_reg (mode, operands[2]); + + rtx tmp = operands[1]; + operands[1] = operands[2]; + operands[2] = tmp; + } +} + /* Return the initial difference between the specified pair of registers. The registers that can figure in FROM, and TO, are specified by ELIMINABLE_REGS in bpf.h. diff --git a/gcc/config/bpf/bpf.md b/gcc/config/bpf/bpf.md index c51add7..436c8df 100644 --- a/gcc/config/bpf/bpf.md +++ b/gcc/config/bpf/bpf.md @@ -100,9 +100,9 @@ ;; insns, with the proper modes. ;; ;; 32-bit arithmetic (for SI modes) is implemented using the alu32 -;; instructions. +;; instructions, if available. -(define_mode_iterator AM [SI DI]) +(define_mode_iterator AM [(SI "bpf_has_alu32") DI]) ;;; Addition (define_insn "add3" @@ -266,7 +266,7 @@ (match_operand:SI 1 "nonimmediate_operand" "r,m")))] "" "@ - mov32\t%0,%1 + * return bpf_has_alu32 ? \"mov32\t%0,%1\" : \"mov\t%0,%1\;and\t%0,0xffffffff\"; ldxw\t%0,%1" [(set_attr "type" "alu,ldx")]) @@ -315,7 +315,7 @@ ;;;; Shifts -(define_mode_iterator SIM [SI DI]) +(define_mode_iterator SIM [(SI "bpf_has_alu32") DI]) (define_insn "ashr3" [(set (match_operand:SIM 0 "register_operand" "=r,r") @@ -346,24 +346,28 @@ ;; The eBPF jump instructions use 64-bit arithmetic when evaluating ;; the jump conditions. Therefore we use DI modes below. 
-(define_expand "cbranchdi4" +(define_mode_iterator JM [(SI "bpf_has_jmp32") DI]) + +(define_expand "cbranch4" [(set (pc) (if_then_else (match_operator 0 "comparison_operator" - [(match_operand:DI 1 "register_operand") - (match_operand:DI 2 "reg_or_imm_operand")]) + [(match_operand:JM 1 "register_operand") + (match_operand:JM 2 "reg_or_imm_operand")]) (label_ref (match_operand 3 "" "")) (pc)))] "" { if (!ordered_comparison_operator (operands[0], VOIDmode)) FAIL; + + bpf_expand_cbranch (mode, operands); }) -(define_insn "*branch_on_di" +(define_insn "*branch_on_" [(set (pc) (if_then_else (match_operator 3 "ordered_comparison_operator" - [(match_operand:DI 0 "register_operand" "r") - (match_operand:DI 1 "reg_or_imm_operand" "rI")]) + [(match_operand:JM 0 "register_operand" "r") + (match_operand:JM 1 "reg_or_imm_operand" "rI")]) (label_ref (match_operand 2 "" "")) (pc)))] "" @@ -372,16 +376,16 @@ switch (code) { - case EQ: return "jeq\t%0,%1,%2"; break; - case NE: return "jne\t%0,%1,%2"; break; - case LT: return "jslt\t%0,%1,%2"; break; - case LE: return "jsle\t%0,%1,%2"; break; - case GT: return "jsgt\t%0,%1,%2"; break; - case GE: return "jsge\t%0,%1,%2"; break; - case LTU: return "jlt\t%0,%1,%2"; break; - case LEU: return "jle\t%0,%1,%2"; break; - case GTU: return "jgt\t%0,%1,%2"; break; - case GEU: return "jge\t%0,%1,%2"; break; + case EQ: return "jeq\t%0,%1,%2"; break; + case NE: return "jne\t%0,%1,%2"; break; + case LT: return "jslt\t%0,%1,%2"; break; + case LE: return "jsle\t%0,%1,%2"; break; + case GT: return "jsgt\t%0,%1,%2"; break; + case GE: return "jsge\t%0,%1,%2"; break; + case LTU: return "jlt\t%0,%1,%2"; break; + case LEU: return "jle\t%0,%1,%2"; break; + case GTU: return "jgt\t%0,%1,%2"; break; + case GEU: return "jge\t%0,%1,%2"; break; default: gcc_unreachable (); return ""; diff --git a/gcc/config/bpf/bpf.opt b/gcc/config/bpf/bpf.opt index 4493067..e8b728c 100644 --- a/gcc/config/bpf/bpf.opt +++ b/gcc/config/bpf/bpf.opt @@ -131,3 +131,32 @@ Set a hard limit for the size of each stack frame, in bytes. mco-re Target Mask(BPF_CORE) Generate all necessary information for BPF Compile Once - Run Everywhere. + +; Selecting BPF ISA features and versions + +mjmpext +Target Var(bpf_has_jmpext) Init(-1) +Enable extra conditional-branch instructions j(s)lt and j(s)le. + +malu32 +Target Var(bpf_has_alu32) Init(-1) +Enable 32-bit ALU instructions. + +mjmp32 +Target Var(bpf_has_jmp32) Init(-1) +Enable 32-bit jump instructions. + +mcpu= +Target RejectNegative Joined Var(bpf_isa) Enum(bpf_isa) Init(ISA_V3) + +Enum +Name(bpf_isa) Type(enum bpf_isa_version) + +EnumValue +Enum(bpf_isa) String(v1) Value(ISA_V1) + +EnumValue +Enum(bpf_isa) String(v2) Value(ISA_V2) + +EnumValue +Enum(bpf_isa) String(v3) Value(ISA_V3) -- cgit v1.1 From 7f8ee895349f4e14aa315ac4de2889c511a84c91 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Mon, 13 Sep 2021 13:04:19 +0800 Subject: [i386] Remove UNSPEC_{COPYSIGN,XORSIGN}. gcc/ChangeLog: * config/i386/i386.md: (UNSPEC_COPYSIGN): Remove. (UNSPEC_XORSIGN): Ditto. 
--- gcc/config/i386/i386.md | 2 -- 1 file changed, 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index c415487..13f6f57 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -129,8 +129,6 @@ UNSPEC_SCALEF ;; Generic math support - UNSPEC_COPYSIGN - UNSPEC_XORSIGN UNSPEC_IEEE_MIN ; not commutative UNSPEC_IEEE_MAX ; not commutative -- cgit v1.1 From b70e2541fec8751d287b24b9e3681a41e420d4c4 Mon Sep 17 00:00:00 2001 From: Kewen Lin Date: Sun, 12 Sep 2021 21:04:31 -0500 Subject: rs6000: Remove typedef for struct rs6000_cost_data As Segher pointed out, to typedef struct _rs6000_cost_data as rs6000_cost_data is useless, so rewrite it without typedef. gcc/ChangeLog: * config/rs6000/rs6000.c (struct rs6000_cost_data): Remove typedef. (rs6000_init_cost): Adjust. --- gcc/config/rs6000/rs6000.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index b7ea148..39d428d 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -5262,7 +5262,7 @@ rs6000_preferred_simd_mode (scalar_mode mode) return word_mode; } -typedef struct _rs6000_cost_data +struct rs6000_cost_data { struct loop *loop_info; unsigned cost[3]; @@ -5271,7 +5271,7 @@ typedef struct _rs6000_cost_data bool vect_nonmem; /* Indicates this is costing for the scalar version of a loop or block. */ bool costing_for_scalar; -} rs6000_cost_data; +}; /* Test for likely overcommitment of vector hardware resources. If a loop iteration is relatively large, and too large a percentage of @@ -5337,7 +5337,7 @@ rs6000_density_test (rs6000_cost_data *data) static void * rs6000_init_cost (struct loop *loop_info, bool costing_for_scalar) { - rs6000_cost_data *data = XNEW (struct _rs6000_cost_data); + rs6000_cost_data *data = XNEW (rs6000_cost_data); data->loop_info = loop_info; data->cost[vect_prologue] = 0; data->cost[vect_body] = 0; -- cgit v1.1 From fbeead55e03711fb39429ab564a0d8a0b43e9a74 Mon Sep 17 00:00:00 2001 From: Kewen Lin Date: Sun, 12 Sep 2021 21:57:34 -0500 Subject: rs6000: Add load density heuristic We noticed that SPEC2017 503.bwaves_r run time degrades by about 8% on P8 and P9 if we enabled vectorization at O2 fast-math (with cheap vect cost model). Comparing to Ofast, compiler doesn't do the loop interchange on the innermost loop, it's not profitable to vectorize it then. As Richi's comments [1], this follows the similar idea to over price the vector construction fed by VMAT_ELEMENTWISE or VMAT_STRIDED_SLP. Instead of adding the extra cost on vector construction costing immediately, it firstly records how many loads and vectorized statements in the given loop, later in rs6000_density_test (called by finish_cost) it computes the load density ratio against all vectorized statements, and check with the corresponding thresholds DENSITY_LOAD_NUM_THRESHOLD and DENSITY_LOAD_PCT_THRESHOLD, do the actual extra pricing if both thresholds are exceeded. Note that this new load density heuristic check is based on some fields in target cost which are updated as needed when scanning each add_stmt_cost entry, it's independent of the current function rs6000_density_test which requires to scan non_vect stmts. Since it's checking the load stmts count vs. all vectorized stmts, it's kind of density, so I put it in function rs6000_density_test. 
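In rough outline, the extra check described above amounts to the following sketch (the helper name here is made up for illustration; the thresholds of 20 loads and a 45% load share are the ones used in the patch, and the real code with its surrounding conditions is in rs6000_density_test in the diff below):

  /* Sketch only: return the extra penalty to apply to the vector body
     cost, given the counts gathered while costing the loop.  */
  static unsigned int
  density_load_penalty (unsigned int nstmts, unsigned int nloads,
                        unsigned int extra_ctor_cost)
  {
    unsigned int load_pct = (nloads * 100) / nstmts;
    if (extra_ctor_cost > 0 && nloads > 20 && load_pct > 45)
      return extra_ctor_cost;
    return 0;
  }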
With the same reason to keep it independent, I didn't put it as an else arm of the current existing density threshold check hunk or before this hunk. In the investigation of -1.04% degradation from 526.blender_r on Power8, I noticed that the extra penalized cost 320 on one single vector construction for mode V16QI is much exaggerated, which makes the final body cost unreliable, so this patch adds one maximum bound for the extra penalized cost for each vector construction statement. Full SPEC2017 performance evaluation on Power8/Power9 with option combinations: * -O2 -ftree-vectorize {,-fvect-cost-model=very-cheap} {,-ffast-math} * {-O3, -Ofast} {,-funroll-loops} bwaves_r degradations on P8/P9 have been fixed, nothing else remarkable was observed. Power10 -Ofast -funroll-loops run shows it's neutral, while -O2 -ftree-vectorize run shows the bwaves_r degradation is fixed expectedly. [1] https://gcc.gnu.org/pipermail/gcc-patches/2021-May/570076.html gcc/ChangeLog: * config/rs6000/rs6000.c (struct rs6000_cost_data): New members nstmts, nloads and extra_ctor_cost. (rs6000_density_test): Add load density related heuristics. Do extra costing on vector construction statements if need. (rs6000_init_cost): Init new members. (rs6000_update_target_cost_per_stmt): New function. (rs6000_add_stmt_cost): Factor vect_nonmem hunk out to function rs6000_update_target_cost_per_stmt and call it. --- gcc/config/rs6000/rs6000.c | 125 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 116 insertions(+), 9 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 39d428d..2570937 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -5266,6 +5266,12 @@ struct rs6000_cost_data { struct loop *loop_info; unsigned cost[3]; + /* Total number of vectorized stmts (loop only). */ + unsigned nstmts; + /* Total number of loads (loop only). */ + unsigned nloads; + /* Possible extra penalized cost on vector construction (loop only). */ + unsigned extra_ctor_cost; /* For each vectorized loop, this var holds TRUE iff a non-memory vector instruction is needed by the vectorization. */ bool vect_nonmem; @@ -5327,9 +5333,48 @@ rs6000_density_test (rs6000_cost_data *data) if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, "density %d%%, cost %d exceeds threshold, penalizing " - "loop body cost by %d%%", density_pct, + "loop body cost by %d%%\n", density_pct, vec_cost + not_vec_cost, DENSITY_PENALTY); } + + /* Check whether we need to penalize the body cost to account + for excess strided or elementwise loads. */ + if (data->extra_ctor_cost > 0) + { + /* Threshold for load stmts percentage in all vectorized stmts. */ + const int DENSITY_LOAD_PCT_THRESHOLD = 45; + /* Threshold for total number of load stmts. */ + const int DENSITY_LOAD_NUM_THRESHOLD = 20; + + gcc_assert (data->nloads <= data->nstmts); + unsigned int load_pct = (data->nloads * 100) / data->nstmts; + + /* It's likely to be bounded by latency and execution resources + from many scalar loads which are strided or elementwise loads + into a vector if both conditions below are found: + 1. there are many loads, it's easy to result in a long wait + for load units; + 2. load has a big proportion of all vectorized statements, + it's not easy to schedule other statements to spread among + the loads. + One typical case is the innermost loop of the hotspot of SPEC2017 + 503.bwaves_r without loop interchange. 
*/ + if (data->nloads > DENSITY_LOAD_NUM_THRESHOLD + && load_pct > DENSITY_LOAD_PCT_THRESHOLD) + { + data->cost[vect_body] += data->extra_ctor_cost; + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Found %u loads and " + "load pct. %u%% exceed " + "the threshold, " + "penalizing loop body " + "cost by extra cost %u " + "for ctor.\n", + data->nloads, load_pct, + data->extra_ctor_cost); + } + } } /* Implement targetm.vectorize.init_cost. */ @@ -5343,6 +5388,9 @@ rs6000_init_cost (struct loop *loop_info, bool costing_for_scalar) data->cost[vect_body] = 0; data->cost[vect_epilogue] = 0; data->vect_nonmem = false; + data->nstmts = 0; + data->nloads = 0; + data->extra_ctor_cost = 0; data->costing_for_scalar = costing_for_scalar; return data; } @@ -5370,6 +5418,70 @@ rs6000_adjust_vect_cost_per_stmt (enum vect_cost_for_stmt kind, return 0; } +/* Helper function for add_stmt_cost. Check each statement cost + entry, gather information and update the target_cost fields + accordingly. */ +static void +rs6000_update_target_cost_per_stmt (rs6000_cost_data *data, + enum vect_cost_for_stmt kind, + struct _stmt_vec_info *stmt_info, + enum vect_cost_model_location where, + int stmt_cost, + unsigned int orig_count) +{ + + /* Check whether we're doing something other than just a copy loop. + Not all such loops may be profitably vectorized; see + rs6000_finish_cost. */ + if (kind == vec_to_scalar + || kind == vec_perm + || kind == vec_promote_demote + || kind == vec_construct + || kind == scalar_to_vec + || (where == vect_body && kind == vector_stmt)) + data->vect_nonmem = true; + + /* Gather some information when we are costing the vectorized instruction + for the statements located in a loop body. */ + if (!data->costing_for_scalar && data->loop_info && where == vect_body) + { + data->nstmts += orig_count; + + if (kind == scalar_load || kind == vector_load + || kind == unaligned_load || kind == vector_gather_load) + data->nloads += orig_count; + + /* Power processors do not currently have instructions for strided + and elementwise loads, and instead we must generate multiple + scalar loads. This leads to undercounting of the cost. We + account for this by scaling the construction cost by the number + of elements involved, and saving this as extra cost that we may + or may not need to apply. When finalizing the cost of the loop, + the extra penalty is applied when the load density heuristics + are satisfied. */ + if (kind == vec_construct && stmt_info + && STMT_VINFO_TYPE (stmt_info) == load_vec_info_type + && (STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE + || STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_STRIDED_SLP)) + { + tree vectype = STMT_VINFO_VECTYPE (stmt_info); + unsigned int nunits = vect_nunits_for_cost (vectype); + unsigned int extra_cost = nunits * stmt_cost; + /* As function rs6000_builtin_vectorization_cost shows, we have + priced much on V16QI/V8HI vector construction as their units, + if we penalize them with nunits * stmt_cost, it can result in + an unreliable body cost, eg: for V16QI on Power8, stmt_cost + is 20 and nunits is 16, the extra cost is 320 which looks + much exaggerated. So let's use one maximum bound for the + extra penalized cost for vector construction here. */ + const unsigned int MAX_PENALIZED_COST_FOR_CTOR = 12; + if (extra_cost > MAX_PENALIZED_COST_FOR_CTOR) + extra_cost = MAX_PENALIZED_COST_FOR_CTOR; + data->extra_ctor_cost += extra_cost; + } + } +} + /* Implement targetm.vectorize.add_stmt_cost. 
*/ static unsigned @@ -5389,6 +5501,7 @@ rs6000_add_stmt_cost (class vec_info *vinfo, void *data, int count, /* Statements in an inner loop relative to the loop being vectorized are weighted more heavily. The value here is arbitrary and could potentially be improved with analysis. */ + unsigned int orig_count = count; if (where == vect_body && stmt_info && stmt_in_inner_loop_p (vinfo, stmt_info)) { @@ -5400,14 +5513,8 @@ rs6000_add_stmt_cost (class vec_info *vinfo, void *data, int count, retval = (unsigned) (count * stmt_cost); cost_data->cost[where] += retval; - /* Check whether we're doing something other than just a copy loop. - Not all such loops may be profitably vectorized; see - rs6000_finish_cost. */ - if ((kind == vec_to_scalar || kind == vec_perm - || kind == vec_promote_demote || kind == vec_construct - || kind == scalar_to_vec) - || (where == vect_body && kind == vector_stmt)) - cost_data->vect_nonmem = true; + rs6000_update_target_cost_per_stmt (cost_data, kind, stmt_info, where, + stmt_cost, orig_count); } return retval; -- cgit v1.1 From 2ebb6f6e5162f2c759a883a30c3a49fbedbf5892 Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Fri, 10 Sep 2021 08:18:45 +0200 Subject: Always default to DWARF2 debug for cygwin and mingw This removes the fallback to STABS as default for cygwin and mingw when the assembler does not support .secrel32 and the default is to emit 32bit code. Support for .secrel32 was added to binutils 2.16 released in 2005 so instead document that as requirement. I left the now unused check for .secrel32 in configure around in case somebody wants to turn that into an error or warning. 2021-09-10 Richard Biener * config/i386/cygming.h: Always default to DWARF2 debugging. Do not define DBX_DEBUGGING_INFO, that's done via dbxcoff.h already. * doc/install.texi: Document binutils 2.16 as minimum requirement for mingw. --- gcc/config/i386/cygming.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/cygming.h b/gcc/config/i386/cygming.h index ac458cd..da872d1 100644 --- a/gcc/config/i386/cygming.h +++ b/gcc/config/i386/cygming.h @@ -18,17 +18,10 @@ You should have received a copy of the GNU General Public License along with GCC; see the file COPYING3. If not see . */ -#define DBX_DEBUGGING_INFO 1 -#if TARGET_64BIT_DEFAULT || defined (HAVE_GAS_PE_SECREL32_RELOC) #define DWARF2_DEBUGGING_INFO 1 -#endif #undef PREFERRED_DEBUGGING_TYPE -#if (DWARF2_DEBUGGING_INFO) #define PREFERRED_DEBUGGING_TYPE DWARF2_DEBUG -#else -#define PREFERRED_DEBUGGING_TYPE DBX_DEBUG -#endif #undef TARGET_SEH #define TARGET_SEH (TARGET_64BIT_MS_ABI && flag_unwind_tables) @@ -97,7 +90,6 @@ along with GCC; see the file COPYING3. If not see #undef DWARF_FRAME_REGISTERS #define DWARF_FRAME_REGISTERS (TARGET_64BIT ? 33 : 17) -#ifdef HAVE_GAS_PE_SECREL32_RELOC /* Use section relative relocations for debugging offsets. Unlike other targets that fake this by putting the section VMA at 0, PE won't allow it. */ @@ -129,7 +121,6 @@ along with GCC; see the file COPYING3. If not see gcc_unreachable (); \ } \ } while (0) -#endif #define TARGET_EXECUTABLE_SUFFIX ".exe" -- cgit v1.1 From 113ff252170bd4eb096d0b30b9e9add0922d8be5 Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Fri, 10 Sep 2021 08:47:47 +0200 Subject: Default Alpha/VMS to DWARF2 debugging only This changes the default debug format for Alpha/VMS to DWARF2 only, skipping emission of VMS debug info which is going do be deprecated for GCC 12 alongside the support for STABS. 
2021-09-10 Richard Biener * config/alpha/vms.h (PREFERRED_DEBUGGING_TYPE): Define to DWARF2_DEBUG. --- gcc/config/alpha/vms.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/alpha/vms.h b/gcc/config/alpha/vms.h index b8673b6..2a9917c 100644 --- a/gcc/config/alpha/vms.h +++ b/gcc/config/alpha/vms.h @@ -244,7 +244,7 @@ typedef struct {int num_args; enum avms_arg_type atypes[6];} avms_arg_info; while (0) #undef PREFERRED_DEBUGGING_TYPE -#define PREFERRED_DEBUGGING_TYPE VMS_AND_DWARF2_DEBUG +#define PREFERRED_DEBUGGING_TYPE DWARF2_DEBUG #define ASM_PN_FORMAT "%s___%lu" -- cgit v1.1 From d399e43a91e7e35e169cdbcabb7a792b9671ccc1 Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Fri, 10 Sep 2021 08:59:18 +0200 Subject: Always default to DWARF2 debugging for RX, even with -mas100-syntax The RX port defaults to STABS when -mas100-syntax is used because the AS100 assembler does not support some of the pseudo-ops used by DWARF2 debug emission. Since STABS is going to be deprecated that has to change. The following simply always uses DWARF2, likely leaving -mas100-syntax broken when debug info is generated. Can the RX port maintainer please sort out the situation? 2021-09-10 Richard Biener * config/rx/rx.h (PREFERRED_DEBUGGING_TYPE): Always define to DWARF2_DEBUG. --- gcc/config/rx/rx.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rx/rx.h b/gcc/config/rx/rx.h index 4078440..3cb411d 100644 --- a/gcc/config/rx/rx.h +++ b/gcc/config/rx/rx.h @@ -620,14 +620,8 @@ typedef unsigned int CUMULATIVE_ARGS; /* Like REG_P except that this macro is true for SET expressions. */ #define SET_P(rtl) (GET_CODE (rtl) == SET) -/* The AS100 assembler does not support .leb128 and .uleb128, but - the compiler-build-time configure tests will have enabled their - use because GAS supports them. So default to generating STABS - debug information instead of DWARF2 when generating AS100 - compatible output. */ #undef PREFERRED_DEBUGGING_TYPE -#define PREFERRED_DEBUGGING_TYPE (TARGET_AS100_SYNTAX \ - ? DBX_DEBUG : DWARF2_DEBUG) +#define PREFERRED_DEBUGGING_TYPE DWARF2_DEBUG #define DBX_DEBUGGING_INFO 1 #define DWARF2_DEBUGGING_INFO 1 -- cgit v1.1 From 716e03f9f3d30a4077c487fbb5f558562796217f Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Fri, 10 Sep 2021 09:09:53 +0200 Subject: Default AVR to DWARF2 debug This switches the AVR port to generate DWARF2 debugging info by default since the support for STABS is going to be deprecated for GCC 12. 
2021-09-10 Richard Biener * config/avr/elf.h (PREFERRED_DEBUGGING_TYPE): Remove override, pick up DWARF2_DEBUG define from elfos.h --- gcc/config/avr/elf.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/avr/elf.h b/gcc/config/avr/elf.h index 2f0eb0d..5f2d1e0 100644 --- a/gcc/config/avr/elf.h +++ b/gcc/config/avr/elf.h @@ -22,9 +22,6 @@ #undef PCC_BITFIELD_TYPE_MATTERS -#undef PREFERRED_DEBUGGING_TYPE -#define PREFERRED_DEBUGGING_TYPE DBX_DEBUG - #undef MAX_OFILE_ALIGNMENT #define MAX_OFILE_ALIGNMENT (32768 * 8) -- cgit v1.1 From 2071a0ed778596927253fd128e1ffa8f18089175 Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Mon, 13 Sep 2021 11:28:24 +0200 Subject: Fix i686-lynx build breakage With the last adjustment I failed to remove a stray undef of PREFERRED_DEBUGGING_TYPE from config/i386/lynx.h 2021-09-13 Richard Biener * config/i386/lynx.h: Remove undef of PREFERRED_DEBUGGING_TYPE to inherit from elfos.h --- gcc/config/i386/lynx.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/lynx.h b/gcc/config/i386/lynx.h index 70b2587..65fc6a7 100644 --- a/gcc/config/i386/lynx.h +++ b/gcc/config/i386/lynx.h @@ -60,10 +60,6 @@ along with GCC; see the file COPYING3. If not see #undef ASM_OUTPUT_ALIGN -/* Undefine the definition from elfos.h to enable our default. */ - -#undef PREFERRED_DEBUGGING_TYPE - /* The file i386.c defines TARGET_HAVE_TLS unconditionally if HAVE_AS_TLS is defined. HAVE_AS_TLS is defined as gas support for TLS is detected by configure. We undefine it here. */ -- cgit v1.1 From c86de344f81f7a3368c2327477429c13a3746783 Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Fri, 10 Sep 2021 09:17:00 +0200 Subject: Remove DARWIN_PREFER_DWARF and dead code This removes the always defined DARWIN_PREFER_DWARF and the code guarded by it being not defined, removing the possibility to default some i386 darwin configurations to STABS when it would not be defined. 2021-09-10 Richard Biener * config/darwin.h (DARWIN_PREFER_DWARF): Do not define. * config/i386/darwin.h (PREFERRED_DEBUGGING_TYPE): Do not change based on DARWIN_PREFER_DWARF not being defined. --- gcc/config/darwin.h | 3 +-- gcc/config/i386/darwin.h | 11 ----------- 2 files changed, 1 insertion(+), 13 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/darwin.h b/gcc/config/darwin.h index f1d92f8..6396586 100644 --- a/gcc/config/darwin.h +++ b/gcc/config/darwin.h @@ -499,9 +499,8 @@ extern GTY(()) int darwin_ms_struct; /* We now require C++11 to bootstrap and newer tools than those based on stabs, so require DWARF-2, even if stabs is supported by the assembler. */ -#define PREFERRED_DEBUGGING_TYPE DWARF2_DEBUG -#define DARWIN_PREFER_DWARF #define DWARF2_DEBUGGING_INFO 1 +#define PREFERRED_DEBUGGING_TYPE DWARF2_DEBUG #ifdef HAVE_AS_STABS_DIRECTIVE #define DBX_DEBUGGING_INFO 1 diff --git a/gcc/config/i386/darwin.h b/gcc/config/i386/darwin.h index da0ae5b..c4a6f4d 100644 --- a/gcc/config/i386/darwin.h +++ b/gcc/config/i386/darwin.h @@ -264,17 +264,6 @@ along with GCC; see the file COPYING3. If not see target_flags &= ~MASK_MACHO_DYNAMIC_NO_PIC; \ } while (0) -/* Darwin on x86_64 uses dwarf-2 by default. Pre-darwin9 32-bit - compiles default to stabs+. darwin9+ defaults to dwarf-2. */ -#ifndef DARWIN_PREFER_DWARF -#undef PREFERRED_DEBUGGING_TYPE -#ifdef HAVE_AS_STABS_DIRECTIVE -#define PREFERRED_DEBUGGING_TYPE (TARGET_64BIT ? 
DWARF2_DEBUG : DBX_DEBUG) -#else -#define PREFERRED_DEBUGGING_TYPE DWARF2_DEBUG -#endif -#endif - /* Darwin uses the standard DWARF register numbers but the default register numbers for STABS. Fortunately for 64-bit code the default and the standard are the same. */ -- cgit v1.1 From f42e95a830ab48e59389065ce79a013a519646f1 Mon Sep 17 00:00:00 2001 From: Jan-Benedict Glaw Date: Mon, 13 Sep 2021 12:08:25 +0200 Subject: Fix multi-statment macro INIT_CUMULATIVE_ARGS() expands to multiple statements, which will break right after an `if` statement. Wrap it into a block. gcc/ChangeLog: * config/alpha/vms.h (INIT_CUMULATIVE_ARGS): Wrap multi-statment define into a block. --- gcc/config/alpha/vms.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/alpha/vms.h b/gcc/config/alpha/vms.h index 2a9917c..0033b00 100644 --- a/gcc/config/alpha/vms.h +++ b/gcc/config/alpha/vms.h @@ -145,9 +145,13 @@ typedef struct {int num_args; enum avms_arg_type atypes[6];} avms_arg_info; #undef INIT_CUMULATIVE_ARGS #define INIT_CUMULATIVE_ARGS(CUM, FNTYPE, LIBNAME, INDIRECT, N_NAMED_ARGS) \ - (CUM).num_args = 0; \ - (CUM).atypes[0] = (CUM).atypes[1] = (CUM).atypes[2] = I64; \ - (CUM).atypes[3] = (CUM).atypes[4] = (CUM).atypes[5] = I64; + do \ + { \ + (CUM).num_args = 0; \ + (CUM).atypes[0] = (CUM).atypes[1] = (CUM).atypes[2] = I64; \ + (CUM).atypes[3] = (CUM).atypes[4] = (CUM).atypes[5] = I64; \ + } \ + while (0) #define DEFAULT_PCC_STRUCT_RETURN 0 -- cgit v1.1 From f0cfd070b68772eaaa19a3b711fbd9e85b244240 Mon Sep 17 00:00:00 2001 From: Richard Earnshaw Date: Fri, 3 Sep 2021 16:53:13 +0100 Subject: arm: expand handling of movmisalign for DImode [PR102125] DImode is currently handled only for machines with vector modes enabled, but this is unduly restrictive and is generally better done in core registers. gcc/ChangeLog: PR target/102125 * config/arm/arm.md (movmisaligndi): New define_expand. * config/arm/vec-common.md (movmisalign): Iterate over VDQ mode. --- gcc/config/arm/arm.md | 16 ++++++++++++++++ gcc/config/arm/vec-common.md | 4 ++-- 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md index 5d3f21b..4adc976 100644 --- a/gcc/config/arm/arm.md +++ b/gcc/config/arm/arm.md @@ -12617,6 +12617,22 @@ }" ) +;; movmisalign for DImode +(define_expand "movmisaligndi" + [(match_operand:DI 0 "general_operand") + (match_operand:DI 1 "general_operand")] + "unaligned_access" +{ + rtx lo_op0 = gen_lowpart (SImode, operands[0]); + rtx lo_op1 = gen_lowpart (SImode, operands[1]); + rtx hi_op0 = gen_highpart_mode (SImode, DImode, operands[0]); + rtx hi_op1 = gen_highpart_mode (SImode, DImode, operands[1]); + + emit_insn (gen_movmisalignsi (lo_op0, lo_op1)); + emit_insn (gen_movmisalignsi (hi_op0, hi_op1)); + DONE; +}) + ;; movmisalign patterns for HImode and SImode. 
(define_expand "movmisalign" [(match_operand:HSI 0 "general_operand") diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md index 68de4f0..e71d9b3 100644 --- a/gcc/config/arm/vec-common.md +++ b/gcc/config/arm/vec-common.md @@ -281,8 +281,8 @@ }) (define_expand "movmisalign" - [(set (match_operand:VDQX 0 "neon_perm_struct_or_reg_operand") - (unspec:VDQX [(match_operand:VDQX 1 "neon_perm_struct_or_reg_operand")] + [(set (match_operand:VDQ 0 "neon_perm_struct_or_reg_operand") + (unspec:VDQ [(match_operand:VDQ 1 "neon_perm_struct_or_reg_operand")] UNSPEC_MISALIGNED_ACCESS))] "ARM_HAVE__LDST && !BYTES_BIG_ENDIAN && unaligned_access && !TARGET_REALLY_IWMMXT" -- cgit v1.1 From 5b01bfeb8703c264ad402b77741f06f41d7fceac Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Thu, 26 Aug 2021 05:31:50 -0700 Subject: x86: Add TARGET_AVX256_[MOVE|STORE]_BY_PIECES 1. Add TARGET_AVX256_MOVE_BY_PIECES to perform move by-pieces operation with 256-bit AVX instructions. 2. Add TARGET_AVX256_STORE_BY_PIECES to perform move and store by-pieces operations with 256-bit AVX instructions. They are enabled only for Intel Alder Lake and Intel processors with AVX512. gcc/ PR target/101935 * config/i386/i386.h (TARGET_AVX256_MOVE_BY_PIECES): New. (TARGET_AVX256_STORE_BY_PIECES): Likewise. (MOVE_MAX): Check TARGET_AVX256_MOVE_BY_PIECES and TARGET_AVX256_STORE_BY_PIECES instead of TARGET_AVX256_SPLIT_UNALIGNED_LOAD and TARGET_AVX256_SPLIT_UNALIGNED_STORE. (STORE_MAX_PIECES): Check TARGET_AVX256_STORE_BY_PIECES instead of TARGET_AVX256_SPLIT_UNALIGNED_STORE. * config/i386/x86-tune.def (X86_TUNE_AVX256_MOVE_BY_PIECES): New. (X86_TUNE_AVX256_STORE_BY_PIECES): Likewise. gcc/testsuite/ PR target/101935 * g++.target/i386/pr80566-1.C: Add -mtune-ctrl=avx256_store_by_pieces. * gcc.target/i386/pr100865-4a.c: Likewise. * gcc.target/i386/pr100865-10a.c: Likewise. * gcc.target/i386/pr90773-20.c: Likewise. * gcc.target/i386/pr90773-21.c: Likewise. * gcc.target/i386/pr90773-22.c: Likewise. * gcc.target/i386/pr90773-23.c: Likewise. * g++.target/i386/pr80566-2.C: Add -mtune-ctrl=avx256_move_by_pieces. * gcc.target/i386/eh_return-1.c: Likewise. * gcc.target/i386/pr90773-26.c: Likewise. * gcc.target/i386/pieces-memcpy-12.c: Replace -mtune=haswell with -mtune-ctrl=avx256_move_by_pieces. * gcc.target/i386/pieces-memcpy-15.c: Likewise. * gcc.target/i386/pieces-memset-2.c: Replace -mtune=haswell with -mtune-ctrl=avx256_store_by_pieces. * gcc.target/i386/pieces-memset-5.c: Likewise. * gcc.target/i386/pieces-memset-11.c: Likewise. * gcc.target/i386/pieces-memset-14.c: Likewise. * gcc.target/i386/pieces-memset-20.c: Likewise. * gcc.target/i386/pieces-memset-23.c: Likewise. * gcc.target/i386/pieces-memset-29.c: Likewise. * gcc.target/i386/pieces-memset-30.c: Likewise. * gcc.target/i386/pieces-memset-33.c: Likewise. * gcc.target/i386/pieces-memset-34.c: Likewise. * gcc.target/i386/pieces-memset-44.c: Likewise. * gcc.target/i386/pieces-memset-37.c: Replace -mtune=generic with -mtune-ctrl=avx256_store_by_pieces. 
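As a rough illustration (not one of the test cases listed above), the move/store by-pieces expanders handle small fixed-size copies and clears such as:

  /* Illustrative only: on targets where the new tunings are enabled,
     fixed-size operations like these can be expanded with 256-bit AVX
     moves rather than being split into 128-bit halves.  */
  #include <string.h>

  void
  clear64 (char *p)
  {
    memset (p, 0, 64);
  }

  void
  copy32 (char *dst, const char *src)
  {
    memcpy (dst, src, 32);
  }

With TARGET_AVX256_MOVE_BY_PIECES / TARGET_AVX256_STORE_BY_PIECES set (Alder Lake and the AVX-512 cores listed in x86-tune.def), MOVE_MAX and STORE_MAX_PIECES become 32, so such expansions can use full-width YMM accesses.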
---
 gcc/config/i386/i386.h       | 10 +++++++---
 gcc/config/i386/x86-tune.def | 11 +++++++++++
 2 files changed, 18 insertions(+), 3 deletions(-)

(limited to 'gcc/config')

diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 73237b8..e76bb55 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -403,6 +403,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
         ix86_tune_features[X86_TUNE_AVOID_LEA_FOR_ADDR]
 #define TARGET_SOFTWARE_PREFETCHING_BENEFICIAL \
         ix86_tune_features[X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL]
+#define TARGET_AVX256_MOVE_BY_PIECES \
+        ix86_tune_features[X86_TUNE_AVX256_MOVE_BY_PIECES]
+#define TARGET_AVX256_STORE_BY_PIECES \
+        ix86_tune_features[X86_TUNE_AVX256_STORE_BY_PIECES]
 #define TARGET_AVX256_SPLIT_REGS \
         ix86_tune_features[X86_TUNE_AVX256_SPLIT_REGS]
 #define TARGET_GENERAL_REGS_SSE_SPILL \
@@ -1793,8 +1797,8 @@ typedef struct ix86_args {
            ? 64 \
            : ((TARGET_AVX \
                && !TARGET_PREFER_AVX128 \
-               && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD \
-               && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
+               && (TARGET_AVX256_MOVE_BY_PIECES \
+                   || TARGET_AVX256_STORE_BY_PIECES)) \
               ? 32 \
               : ((TARGET_SSE2 \
                   && TARGET_SSE_UNALIGNED_LOAD_OPTIMAL \
@@ -1811,7 +1815,7 @@ typedef struct ix86_args {
            ? 64 \
            : ((TARGET_AVX \
                && !TARGET_PREFER_AVX128 \
-               && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
+               && TARGET_AVX256_STORE_BY_PIECES) \
               ? 32 \
               : ((TARGET_SSE2 \
                   && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 8f55da8..2f221b1 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -484,6 +484,17 @@ DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER2
    instructions in the auto-vectorizer.  */
 DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_CORE_AVX512)
 
+/* X86_TUNE_AVX256_MOVE_BY_PIECES: Optimize move_by_pieces with 256-bit
+   AVX instructions.  */
+DEF_TUNE (X86_TUNE_AVX256_MOVE_BY_PIECES, "avx256_move_by_pieces",
+          m_ALDERLAKE | m_CORE_AVX512)
+
+/* X86_TUNE_AVX256_STORE_BY_PIECES: Optimize store_by_pieces with 256-bit
+   AVX instructions.  */
+DEF_TUNE (X86_TUNE_AVX256_STORE_BY_PIECES, "avx256_store_by_pieces",
+          m_ALDERLAKE | m_CORE_AVX512)
+
+/*****************************************************************************/
 /*****************************************************************************/
 /* Historical relics: tuning flags that helps a specific old CPU designs     */
 /*****************************************************************************/
-- 
cgit v1.1


From 512b383534785f9fc021e700a1fdda86cf0f3fe7 Mon Sep 17 00:00:00 2001
From: Kyrylo Tkachov
Date: Mon, 13 Sep 2021 15:40:28 +0100
Subject: aarch64: PR target/102252 Invalid addressing mode for SVE load predicate

In the testcase we generate invalid assembly for an SVE load predicate
instruction.  The RTL for the insn is:

(insn 9 8 10 (set (reg:VNx16BI 68 p0)
        (mem:VNx16BI (plus:DI (mult:DI (reg:DI 1 x1 [93])
                    (const_int 8 [0x8]))
                (reg/f:DI 0 x0 [92])) [2 work_3(D)->array[offset_4(D)]+0 S8 A16]))

That addressing mode is not valid for the instruction [1], as it only accepts
the addressing mode: [<Xn|SP>{, #<imm>, MUL VL}]

This patch rejects the register index form for SVE predicate modes.
Bootstrapped and tested on aarch64-none-linux-gnu.

[1] https://developer.arm.com/documentation/ddi0602/2021-06/SVE-Instructions/LDR--predicate---Load-predicate-register-

gcc/ChangeLog:

        PR target/102252
        * config/aarch64/aarch64.c (aarch64_classify_address): Don't allow
        register index for SVE predicate modes.
gcc/testsuite/ChangeLog: PR target/102252 * g++.target/aarch64/sve/pr102252.C: New test. --- gcc/config/aarch64/aarch64.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 1fbe9e0..30d9a0b 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -9770,7 +9770,6 @@ aarch64_classify_address (struct aarch64_address_info *info, || mode == TImode || mode == TFmode || (BYTES_BIG_ENDIAN && advsimd_struct_p)); - /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode corresponds to the actual size of the memory being loaded/stored and the mode of the corresponding addressing mode is half of that. */ @@ -9779,12 +9778,14 @@ aarch64_classify_address (struct aarch64_address_info *info, mode = DFmode; bool allow_reg_index_p = (!load_store_pair_p - && (known_lt (GET_MODE_SIZE (mode), 16) + && ((vec_flags == 0 + && known_lt (GET_MODE_SIZE (mode), 16)) || vec_flags == VEC_ADVSIMD || vec_flags & VEC_SVE_DATA)); - /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and - [Rn, #offset, MUL VL]. */ + /* For SVE, only accept [Rn], [Rn, #offset, MUL VL] and [Rn, Rm, LSL #shift]. + The latter is not valid for SVE predicates, and that's rejected through + allow_reg_index_p above. */ if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0 && (code != REG && code != PLUS)) return false; -- cgit v1.1 From 20f3c168205cc7b0a97ecd54ffc54bed7637be74 Mon Sep 17 00:00:00 2001 From: Andrew Pinski Date: Mon, 13 Sep 2021 06:56:57 +0000 Subject: Remove m32r{,le}-*-linux* support from GCC m32r support never made it to glibc and the support for the Linux kernel was removed with 4.18. It does not remove much but no reason to keep around a port which never worked or one which the support in other projects is gone. OK? Checked to make sure m32r-linux and m32rle-linux were rejected when building. contrib/ChangeLog: * config-list.mk: Remove m32r-linux and m32rle-linux from the list. gcc/ChangeLog: * config.gcc: Add m32r-*-linux* and m32rle-*-linux* to the Unsupported targets list. Remove support for m32r-*-linux* and m32rle-*-linux*. * config/m32r/linux.h: Removed. * config/m32r/t-linux: Removed. libgcc/ChangeLog: * config.host: Remove m32r-*-linux* and m32rle-*-linux*. * config/m32r/libgcc-glibc.ver: Removed. * config/m32r/t-linux: Removed. --- gcc/config/m32r/linux.h | 91 ------------------------------------------------- gcc/config/m32r/t-linux | 20 ----------- 2 files changed, 111 deletions(-) delete mode 100644 gcc/config/m32r/linux.h delete mode 100644 gcc/config/m32r/t-linux (limited to 'gcc/config') diff --git a/gcc/config/m32r/linux.h b/gcc/config/m32r/linux.h deleted file mode 100644 index 4fdebbc..0000000 --- a/gcc/config/m32r/linux.h +++ /dev/null @@ -1,91 +0,0 @@ -/* Definitions for Renesas M32R running Linux-based GNU systems using ELF. - Copyright (C) 2003-2021 Free Software Foundation, Inc. - - This file is part of GCC. - - GCC is free software; you can redistribute it and/or modify it - under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3, or (at your - option) any later version. - - GCC is distributed in the hope that it will be useful, but WITHOUT - ANY WARRANTY; without even the implied warranty of MERCHANTABILITY - or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public - License for more details. 
- - You should have received a copy of the GNU General Public License - along with GCC; see the file COPYING3. If not see - . */ - -#undef SIZE_TYPE -#define SIZE_TYPE "unsigned int" - -#undef PTRDIFF_TYPE -#define PTRDIFF_TYPE "int" - -#undef WCHAR_TYPE -#define WCHAR_TYPE "long int" - -#undef WCHAR_TYPE_SIZE -#define WCHAR_TYPE_SIZE BITS_PER_WORD - -/* Provide a LINK_SPEC appropriate for Linux. Here we provide support - for the special GCC options -static and -shared, which allow us to - link things in one of these three modes by applying the appropriate - combinations of options at link-time. - - When the -shared link option is used a final link is not being - done. */ - -#define GLIBC_DYNAMIC_LINKER "/lib/ld-linux.so.2" - -#undef LINK_SPEC -#if TARGET_LITTLE_ENDIAN -#define LINK_SPEC "%(link_cpu) -m m32rlelf_linux %{shared:-shared} \ - %{!shared: \ - %{!static: \ - %{rdynamic:-export-dynamic} \ - -dynamic-linker " GNU_USER_DYNAMIC_LINKER "} \ - %{static:-static}}" -#else -#define LINK_SPEC "%(link_cpu) -m m32relf_linux %{shared:-shared} \ - %{!shared: \ - %{!static: \ - %{rdynamic:-export-dynamic} \ - -dynamic-linker " GNU_USER_DYNAMIC_LINKER "} \ - %{static:-static}}" -#endif - -#undef LIB_SPEC -#define LIB_SPEC \ - "%{pthread:-lpthread} \ - %{shared: -lc} \ - %{!shared: \ - %{mieee-fp:-lieee} \ - %{profile:-lc_p} %{!profile: -lc}}" - -#undef STARTFILE_SPEC -#if defined HAVE_LD_PIE -#define STARTFILE_SPEC \ - "%{!shared: %{pg|p|profile:gcrt1.o%s;pie:Scrt1.o%s;:crt1.o%s}} \ - crti.o%s %{static:crtbeginT.o%s;shared|pie:crtbeginS.o%s;:crtbegin.o%s}" -#else -#define STARTFILE_SPEC \ - "%{!shared: \ - %{pg:gcrt1.o%s} %{!pg:%{p:gcrt1.o%s} %{!p:crt1.o%s}}}\ - crti.o%s %{!shared:crtbegin.o%s} %{shared:crtbeginS.o%s}" -#endif - -#undef ENDFILE_SPEC -#define ENDFILE_SPEC \ - "%{shared|pie:crtendS.o%s;:crtend.o%s} crtn.o%s" - -#undef SUBTARGET_CPP_SPEC -#define SUBTARGET_CPP_SPEC "\ - %{posix:-D_POSIX_SOURCE} \ - %{pthread:-D_REENTRANT -D_PTHREADS} \ -" - -#define TARGET_OS_CPP_BUILTINS() GNU_USER_TARGET_OS_CPP_BUILTINS() - -#define TARGET_ASM_FILE_END file_end_indicate_exec_stack diff --git a/gcc/config/m32r/t-linux b/gcc/config/m32r/t-linux deleted file mode 100644 index 3384b8a..0000000 --- a/gcc/config/m32r/t-linux +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (C) 2003-2021 Free Software Foundation, Inc. -# -# This file is part of GCC. -# -# GCC is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 3, or (at your option) -# any later version. -# -# GCC is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with GCC; see the file COPYING3. If not see -# . - -# Don't install "assert.h" in gcc. We use the one in glibc. -INSTALL_ASSERT_H = -- cgit v1.1 From 03312cbd54f337dfb25be356a1d1abc9925c6c03 Mon Sep 17 00:00:00 2001 From: Andrew Pinski Date: Thu, 2 Sep 2021 07:08:22 +0000 Subject: [aarch64] Fix target/95969: __builtin_aarch64_im_lane_boundsi interferes with gimple This patch adds simple folding of __builtin_aarch64_im_lane_boundsi where we are not going to error out. It fixes the problem by the removal of the function from the IR. OK? Bootstrapped and tested on aarch64-linux-gnu with no regressions. 
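A hypothetical example (not one of the new tests): when the lane index is a compile-time constant that is known to be in range, the lane-bounds builtin emitted for the intrinsic can now be folded away in gimple, so nothing of it survives into the IL:

#include <arm_neon.h>

/* Lane 2 is a constant inside [0, 4), so the implicit
   __builtin_aarch64_im_lane_boundsi check folds to nothing.  */
float
get_lane_2 (float32x4_t v)
{
  return vgetq_lane_f32 (v, 2);
}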
gcc/ChangeLog:

        PR target/95969
        * config/aarch64/aarch64-builtins.c (aarch64_fold_builtin_lane_check):
        New function.
        (aarch64_general_fold_builtin): Handle AARCH64_SIMD_BUILTIN_LANE_CHECK.
        (aarch64_general_gimple_fold_builtin): Likewise.

gcc/testsuite/ChangeLog:

        PR target/95969
        * gcc.target/aarch64/lane-bound-1.c: New test.
        * gcc.target/aarch64/lane-bound-2.c: New test.
---
 gcc/config/aarch64/aarch64-builtins.c | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

(limited to 'gcc/config')

diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index eef9fc0..119f67d 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -29,6 +29,7 @@
 #include "rtl.h"
 #include "tree.h"
 #include "gimple.h"
+#include "ssa.h"
 #include "memmodel.h"
 #include "tm_p.h"
 #include "expmed.h"
@@ -2333,6 +2334,27 @@ aarch64_general_builtin_rsqrt (unsigned int fn)
   return NULL_TREE;
 }
 
+/* Return true if the lane check can be removed as there is no
+   error going to be emitted.  */
+static bool
+aarch64_fold_builtin_lane_check (tree arg0, tree arg1, tree arg2)
+{
+  if (TREE_CODE (arg0) != INTEGER_CST)
+    return false;
+  if (TREE_CODE (arg1) != INTEGER_CST)
+    return false;
+  if (TREE_CODE (arg2) != INTEGER_CST)
+    return false;
+
+  auto totalsize = wi::to_widest (arg0);
+  auto elementsize = wi::to_widest (arg1);
+  if (totalsize == 0 || elementsize == 0)
+    return false;
+  auto lane = wi::to_widest (arg2);
+  auto high = wi::udiv_trunc (totalsize, elementsize);
+  return wi::ltu_p (lane, high);
+}
+
 #undef VAR1
 #define VAR1(T, N, MAP, FLAG, A) \
   case AARCH64_SIMD_BUILTIN_##T##_##N##A:
@@ -2353,6 +2375,11 @@ aarch64_general_fold_builtin (unsigned int fcode, tree type,
       VAR1 (UNOP, floatv4si, 2, ALL, v4sf)
       VAR1 (UNOP, floatv2di, 2, ALL, v2df)
         return fold_build1 (FLOAT_EXPR, type, args[0]);
+      case AARCH64_SIMD_BUILTIN_LANE_CHECK:
+        gcc_assert (n_args == 3);
+        if (aarch64_fold_builtin_lane_check (args[0], args[1], args[2]))
+          return void_node;
+        break;
       default:
         break;
     }
@@ -2440,6 +2467,14 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt)
           }
         break;
       }
+    case AARCH64_SIMD_BUILTIN_LANE_CHECK:
+      if (aarch64_fold_builtin_lane_check (args[0], args[1], args[2]))
+        {
+          unlink_stmt_vdef (stmt);
+          release_defs (stmt);
+          new_stmt = gimple_build_nop ();
+        }
+      break;
     default:
       break;
     }
-- 
cgit v1.1


From 8ea292591e42aa4d52b4b7a00b86335bfd2e2e85 Mon Sep 17 00:00:00 2001
From: Martin Liska
Date: Thu, 12 Aug 2021 15:20:43 +0200
Subject: i386: support micro-levels in target{,_clone} attrs [PR101696]

As mentioned in the PR, we do not currently support the x86-64
micro-architecture levels in the target and target_clone attributes.
While the levels x86-64, x86-64-v2, x86-64-v3 and x86-64-v4 are supported
values for the -march option, they are actually only aliases for the k8 CPU.
That said, they map more closely to the __builtin_cpu_supports function,
so we decided to implement the new support there.

        PR target/101696

gcc/ChangeLog:

        * common/config/i386/cpuinfo.h (cpu_indicator_init): Add support
        for x86-64 micro levels for __builtin_cpu_supports.
        * common/config/i386/i386-cpuinfo.h (enum feature_priority):
        Add priorities for the micro-arch levels.
        (enum processor_features): Add new features.
        * common/config/i386/i386-isas.h: Add micro-arch features.
        * config/i386/i386-builtins.c (get_builtin_code_for_version):
        Support the micro-arch levels by calling __builtin_cpu_supports.
        * doc/extend.texi: Document that the levels are supported by
        __builtin_cpu_supports.
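An illustrative use of the new support (example made up, not from this patch): the micro-architecture levels can now name clones directly, and the generated resolver tests them with __builtin_cpu_supports rather than __builtin_cpu_is:

/* Hypothetical example: the v3 clone is selected on CPUs for which
   __builtin_cpu_supports ("x86-64-v3") is true.  */
__attribute__ ((target_clones ("default", "arch=x86-64-v2", "arch=x86-64-v3")))
int
dot (const int *a, const int *b, int n)
{
  int sum = 0;
  for (int i = 0; i < n; i++)
    sum += a[i] * b[i];
  return sum;
}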
gcc/testsuite/ChangeLog: * g++.target/i386/mv30.C: New test. * gcc.target/i386/mvc16.c: New test. * gcc.target/i386/builtin_target.c (CHECK___builtin_cpu_supports): New. Co-Authored-By: H.J. Lu --- gcc/config/i386/i386-builtins.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-builtins.c b/gcc/config/i386/i386-builtins.c index 1799701..11ce58b 100644 --- a/gcc/config/i386/i386-builtins.c +++ b/gcc/config/i386/i386-builtins.c @@ -1927,8 +1927,24 @@ get_builtin_code_for_version (tree decl, tree *predicate_list) return 0; new_target = TREE_TARGET_OPTION (target_node); gcc_assert (new_target); - - if (new_target->arch_specified && new_target->arch > 0) + enum ix86_builtins builtin_fn = IX86_BUILTIN_CPU_IS; + + /* Special case x86-64 micro-level architectures. */ + const char *arch_name = attrs_str + strlen ("arch="); + if (startswith (arch_name, "x86-64")) + { + arg_str = arch_name; + builtin_fn = IX86_BUILTIN_CPU_SUPPORTS; + if (strcmp (arch_name, "x86-64") == 0) + priority = P_X86_64_BASELINE; + else if (strcmp (arch_name, "x86-64-v2") == 0) + priority = P_X86_64_V2; + else if (strcmp (arch_name, "x86-64-v3") == 0) + priority = P_X86_64_V3; + else if (strcmp (arch_name, "x86-64-v4") == 0) + priority = P_X86_64_V4; + } + else if (new_target->arch_specified && new_target->arch > 0) for (i = 0; i < pta_size; i++) if (processor_alias_table[i].processor == new_target->arch) { @@ -1998,7 +2014,7 @@ get_builtin_code_for_version (tree decl, tree *predicate_list) if (predicate_list) { - predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS]; + predicate_decl = ix86_builtins [(int) builtin_fn]; /* For a C string literal the length includes the trailing NULL. */ predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str); predicate_chain = tree_cons (predicate_decl, predicate_arg, -- cgit v1.1 From 76b75018b3d053a890ebe155e47814de14b3c9fb Mon Sep 17 00:00:00 2001 From: Jason Merrill Date: Thu, 15 Jul 2021 15:30:17 -0400 Subject: c++: implement C++17 hardware interference size The last missing piece of the C++17 standard library is the hardware intereference size constants. Much of the delay in implementing these has been due to uncertainty about what the right values are, and even whether there is a single constant value that is suitable; the destructive interference size is intended to be used in structure layout, so program ABIs will depend on it. In principle, both of these values should be the same as the target's L1 cache line size. When compiling for a generic target that is intended to support a range of target CPUs with different cache line sizes, the constructive size should probably be the minimum size, and the destructive size the maximum, unless you are constrained by ABI compatibility with previous code. From discussion on gcc-patches, I've come to the conclusion that the solution to the difficulty of choosing stable values is to give up on it, and instead encourage only uses where ABI stability is unimportant: in particular, uses where the ABI is shared at most between translation units built at the same time with the same flags. To that end, I've added a warning for any use of the constant value of std::hardware_destructive_interference_size in a header or module export. Appropriate uses within a project can disable the warning. 
A previous iteration of this patch included an -finterference-tune flag to make the value vary with -mtune; this iteration makes that the default behavior, which should be appropriate for all reasonable uses of the variable. The previous default of "stable-ish" seems to me likely to have been more of an attractive nuisance; since we can't promise actual stability, we should instead make proper uses more convenient. JF Bastien's implementation proposal is summarized at https://github.com/itanium-cxx-abi/cxx-abi/issues/74 I implement this by adding new --params for the two sizes. Targets can override these values in targetm.target_option.override() to support a range of values for the generic target; otherwise, both will default to the L1 cache line size. 64 bytes still seems correct for all x86. I'm not sure why he proposed 64/64 for generic 32-bit ARM, since the Cortex A9 has a 32-byte cache line, so I'd think 32/64 would make more sense. He proposed 64/128 for generic AArch64, but since the A64FX now has a 256B cache line, I've changed that to 64/256. Other arch maintainers are invited to set ranges for their generic targets if that seems better than using the default cache line size for both values. With the above choice to reject stability as a goal, getting these values "right" is now just a matter of what we want the default optimization to be, and we can feel free to adjust them as CPUs with different cache lines become more and less common. gcc/ChangeLog: * params.opt: Add destructive-interference-size and constructive-interference-size. * doc/invoke.texi: Document them. * config/aarch64/aarch64.c (aarch64_override_options_internal): Set them. * config/arm/arm.c (arm_option_override): Set them. * config/i386/i386-options.c (ix86_option_override_internal): Set them. gcc/c-family/ChangeLog: * c.opt: Add -Winterference-size. * c-cppbuiltin.c (cpp_atomic_builtins): Add __GCC_DESTRUCTIVE_SIZE and __GCC_CONSTRUCTIVE_SIZE. gcc/cp/ChangeLog: * constexpr.c (maybe_warn_about_constant_value): Complain about std::hardware_destructive_interference_size. (cxx_eval_constant_expression): Call it. * decl.c (cxx_init_decl_processing): Check --param *-interference-size values. libstdc++-v3/ChangeLog: * include/std/version: Define __cpp_lib_hardware_interference_size. * libsupc++/new: Define hardware interference size variables. gcc/testsuite/ChangeLog: * g++.dg/warn/Winterference.H: New file. * g++.dg/warn/Winterference.C: New test. * g++.target/aarch64/interference.C: New test. * g++.target/arm/interference.C: New test. * g++.target/i386/interference.C: New test. 
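A minimal usage sketch (not part of the patch): with the new macros wired through to libstdc++, code can pad shared data to the destructive interference size to avoid false sharing; note that naming the constant in a header would now trigger the -Winterference-size warning described above.

// Illustrative only; requires C++17 and the constants declared in <new>.
#include <new>

struct alignas (std::hardware_destructive_interference_size) padded_counter
{
  unsigned long value;   // each counter gets its own cache line
};

padded_counter counters[4];  // one per writer thread, no false sharing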
--- gcc/config/aarch64/aarch64.c | 22 ++++++++++++++++++++++ gcc/config/arm/arm.c | 22 ++++++++++++++++++++++ gcc/config/i386/i386-options.c | 6 ++++++ 3 files changed, 50 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 30d9a0b..36519cc 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -16540,6 +16540,28 @@ aarch64_override_options_internal (struct gcc_options *opts) SET_OPTION_IF_UNSET (opts, &global_options_set, param_l1_cache_line_size, aarch64_tune_params.prefetch->l1_cache_line_size); + + if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0) + { + SET_OPTION_IF_UNSET (opts, &global_options_set, + param_destruct_interfere_size, + aarch64_tune_params.prefetch->l1_cache_line_size); + SET_OPTION_IF_UNSET (opts, &global_options_set, + param_construct_interfere_size, + aarch64_tune_params.prefetch->l1_cache_line_size); + } + else + { + /* For a generic AArch64 target, cover the current range of cache line + sizes. */ + SET_OPTION_IF_UNSET (opts, &global_options_set, + param_destruct_interfere_size, + 256); + SET_OPTION_IF_UNSET (opts, &global_options_set, + param_construct_interfere_size, + 64); + } + if (aarch64_tune_params.prefetch->l2_cache_size >= 0) SET_OPTION_IF_UNSET (opts, &global_options_set, param_l2_cache_size, diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index f1e6282..6c6e77f 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -3669,6 +3669,28 @@ arm_option_override (void) SET_OPTION_IF_UNSET (&global_options, &global_options_set, param_l1_cache_line_size, current_tune->prefetch.l1_cache_line_size); + if (current_tune->prefetch.l1_cache_line_size >= 0) + { + SET_OPTION_IF_UNSET (&global_options, &global_options_set, + param_destruct_interfere_size, + current_tune->prefetch.l1_cache_line_size); + SET_OPTION_IF_UNSET (&global_options, &global_options_set, + param_construct_interfere_size, + current_tune->prefetch.l1_cache_line_size); + } + else + { + /* For a generic ARM target, JF Bastien proposed using 64 for both. */ + /* ??? Cortex A9 has a 32-byte cache line, so why not 32 for + constructive? */ + /* More recent Cortex chips have a 64-byte cache line, but are marked + ARM_PREFETCH_NOT_BENEFICIAL, so they get these defaults. */ + SET_OPTION_IF_UNSET (&global_options, &global_options_set, + param_destruct_interfere_size, 64); + SET_OPTION_IF_UNSET (&global_options, &global_options_set, + param_construct_interfere_size, 64); + } + if (current_tune->prefetch.l1_cache_size >= 0) SET_OPTION_IF_UNSET (&global_options, &global_options_set, param_l1_cache_size, diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c index 2cb87ce..c0006b3 100644 --- a/gcc/config/i386/i386-options.c +++ b/gcc/config/i386/i386-options.c @@ -2579,6 +2579,12 @@ ix86_option_override_internal (bool main_args_p, SET_OPTION_IF_UNSET (opts, opts_set, param_l2_cache_size, ix86_tune_cost->l2_cache_size); + /* 64B is the accepted value for these for all x86. */ + SET_OPTION_IF_UNSET (&global_options, &global_options_set, + param_destruct_interfere_size, 64); + SET_OPTION_IF_UNSET (&global_options, &global_options_set, + param_construct_interfere_size, 64); + /* Enable sw prefetching at -O3 for CPUS that prefetching is helpful. */ if (opts->x_flag_prefetch_loop_arrays < 0 && HAVE_prefetch -- cgit v1.1