From 206222e0ce349c1205b8c07c367cdaa62e4f7382 Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Thu, 27 Jan 2022 13:37:04 +0100 Subject: internal_error - do not use leading capital letter gcc/ChangeLog: * config/rs6000/host-darwin.cc (segv_crash_handler): Do not use leading capital letter. (segv_handler): Likewise. * ipa-sra.cc (verify_splitting_accesses): Likewise. * varasm.cc (get_section): Likewise. gcc/d/ChangeLog: * decl.cc (d_finish_decl): Do not use leading capital letter. --- gcc/config/rs6000/host-darwin.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/host-darwin.cc b/gcc/config/rs6000/host-darwin.cc index 541f7e1..6072a6c 100644 --- a/gcc/config/rs6000/host-darwin.cc +++ b/gcc/config/rs6000/host-darwin.cc @@ -58,7 +58,7 @@ extern int sigaltstack(const struct sigaltstack *, struct sigaltstack *); static void segv_crash_handler (int sig ATTRIBUTE_UNUSED) { - internal_error ("Segmentation Fault (code)"); + internal_error ("segmentation fault (code)"); } static void @@ -128,7 +128,7 @@ segv_handler (int sig ATTRIBUTE_UNUSED, fprintf (stderr, "[address=%08lx pc=%08x]\n", uc->uc_mcontext->MC_FLD(es).MC_FLD(dar), uc->uc_mcontext->MC_FLD(ss).MC_FLD(srr0)); - internal_error ("Segmentation Fault"); + internal_error ("segmentation fault"); exit (FATAL_EXIT_CODE); } -- cgit v1.1 From 3a5fdf986dc6ebb6e244087b462132590ad0a184 Mon Sep 17 00:00:00 2001 From: Iain Sandoe Date: Fri, 28 Jan 2022 19:17:16 +0000 Subject: Darwin, PPC: Fix bootstrap after GLIBC version changes. A recent patch added tests for OPTION_GLIBC that is defined in linux.h and linux64.h. This broke bootstrap for powerpc Darwin. Fixed by adding a definition to 0 for OPTION_GLIBC. Signed-off-by: Iain Sandoe gcc/ChangeLog: * config/rs6000/darwin.h (OPTION_GLIBC): Define to 0. --- gcc/config/rs6000/darwin.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/darwin.h b/gcc/config/rs6000/darwin.h index b5cef42..210c606 100644 --- a/gcc/config/rs6000/darwin.h +++ b/gcc/config/rs6000/darwin.h @@ -34,6 +34,8 @@ #endif #endif +#define OPTION_GLIBC 0 + /* The object file format is Mach-O. */ #define TARGET_OBJECT_FORMAT OBJECT_MACHO -- cgit v1.1 From 06995c2958aaae7e1f60b7d8aa5f07ffda10880a Mon Sep 17 00:00:00 2001 From: Yoshinori Sato Date: Fri, 28 Jan 2022 17:16:47 -0500 Subject: sh-linux fix target cpu sh-linux not supported any SH1 and SH2a little-endian. gcc * config/sh/t-linux (MULTILIB_EXCEPTIONS): Add m1, mb/m1 and m2a. --- gcc/config/sh/t-linux | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/sh/t-linux b/gcc/config/sh/t-linux index d33c638..4866dac 100644 --- a/gcc/config/sh/t-linux +++ b/gcc/config/sh/t-linux @@ -1,2 +1,3 @@ -MULTILIB_DIRNAMES= -MULTILIB_MATCHES = +MULTILIB_DIRNAMES= +MULTILIB_MATCHES= +MULTILIB_EXCEPTIONS=m1 mb/m1 m2a -- cgit v1.1 From 23987912ddb4207de0714d81237f93f613557d1f Mon Sep 17 00:00:00 2001 From: Eric Botcazou Date: Mon, 31 Jan 2022 09:21:48 +0100 Subject: Use V8+ default in 32-bit mode on SPARC64/Linux This is what has been done for ages on SPARC/Solaris and makes it possible to use 64-bit atomic instructions even in 32-bit mode. gcc/ PR target/104189 * config/sparc/linux64.h (TARGET_DEFAULT): Add MASK_V8PLUS. 
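As a rough illustration of the effect (not part of the patch; the flags, the built-in and the code below are only an assumed example), having V8+ in the 32-bit default lets GCC expand 64-bit __atomic built-ins inline (e.g. as a casx loop) rather than falling back to library calls:

  /* Sketch: 64-bit atomic update in 32-bit code on sparc64-linux.
     Illustrative build: gcc -m32 -O2 counter.c -- with V8+ enabled by
     default, the update below can be expanded inline.  */
  #include <stdint.h>

  static uint64_t counter;

  uint64_t
  bump (void)
  {
    return __atomic_add_fetch (&counter, 1, __ATOMIC_SEQ_CST);
  }
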
--- gcc/config/sparc/linux64.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/sparc/linux64.h b/gcc/config/sparc/linux64.h index 46823b6..d08a2ef 100644 --- a/gcc/config/sparc/linux64.h +++ b/gcc/config/sparc/linux64.h @@ -35,8 +35,8 @@ along with GCC; see the file COPYING3. If not see #if defined(TARGET_64BIT_DEFAULT) && TARGET_CPU_DEFAULT >= TARGET_CPU_v9 #undef TARGET_DEFAULT #define TARGET_DEFAULT \ - (MASK_V9 + MASK_PTR64 + MASK_64BIT + MASK_STACK_BIAS + \ - MASK_APP_REGS + MASK_FPU + MASK_LONG_DOUBLE_128) + (MASK_V9 + MASK_64BIT + MASK_PTR64 + MASK_STACK_BIAS + \ + MASK_V8PLUS + MASK_APP_REGS + MASK_FPU + MASK_LONG_DOUBLE_128) #endif /* This must be v9a not just v9 because by default we enable -- cgit v1.1 From 2cbe5dd54f15e88e0b42567319aa9c8e7bad7946 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Mon, 31 Jan 2022 20:08:18 +0100 Subject: rs6000: Fix up build of non-glibc/aix/darwin powerpc* targets [PR104298] As reported by Martin, while David has added OPTION_GLIBC define to aix and Iain to darwin, all the other non-linux targets now fail because rs6000.md macro isn't defined. One possibility is to define this macro in option-defaults.h which on rs6000 targets is included last, then we don't need to define it in aix/darwin headers and for targets using linux.h or linux64.h it will DTRT too. The other option is the first 2 hunks + changing the 3 if (!OPTION_GLIBC) FAIL; cases in rs6000.md to e.g. #ifdef OPTION_GLIBC if (!OPTION_GLIBC) #endif FAIL; or to: #ifdef OPTION_GLIBC if (!OPTION_GLIBC) #else if (true) #endif FAIL; (the latter case if Richi wants to push the -Wunreachable-code changes for GCC 13). 2022-01-31 Jakub Jelinek PR target/104298 * config/rs6000/aix.h (OPTION_GLIBC): Remove. * config/rs6000/darwin.h (OPTION_GLIBC): Likewise. * config/rs6000/option-defaults.h (OPTION_GLIBC): Define to 0 if not already defined. --- gcc/config/rs6000/aix.h | 1 - gcc/config/rs6000/darwin.h | 2 -- gcc/config/rs6000/option-defaults.h | 6 ++++++ 3 files changed, 6 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/aix.h b/gcc/config/rs6000/aix.h index eb7a0c0..ad3238b 100644 --- a/gcc/config/rs6000/aix.h +++ b/gcc/config/rs6000/aix.h @@ -23,7 +23,6 @@ #define DEFAULT_ABI ABI_AIX #undef TARGET_AIX #define TARGET_AIX 1 -#define OPTION_GLIBC 0 /* Linux64.h wants to redefine TARGET_AIX based on -m64, but it can't be used in the #if conditional in options-default.h, so provide another macro. */ diff --git a/gcc/config/rs6000/darwin.h b/gcc/config/rs6000/darwin.h index 210c606..b5cef42 100644 --- a/gcc/config/rs6000/darwin.h +++ b/gcc/config/rs6000/darwin.h @@ -34,8 +34,6 @@ #endif #endif -#define OPTION_GLIBC 0 - /* The object file format is Mach-O. */ #define TARGET_OBJECT_FORMAT OBJECT_MACHO diff --git a/gcc/config/rs6000/option-defaults.h b/gcc/config/rs6000/option-defaults.h index f03694e..2123bfd 100644 --- a/gcc/config/rs6000/option-defaults.h +++ b/gcc/config/rs6000/option-defaults.h @@ -62,3 +62,9 @@ {"cpu_32", "%{" OPT_ARCH32 ":%{!mcpu=*:-mcpu=%(VALUE)}}" }, \ {"cpu_64", "%{" OPT_ARCH64 ":%{!mcpu=*:-mcpu=%(VALUE)}}" }, \ {"float", "%{!msoft-float:%{!mhard-float:-m%(VALUE)-float}}" } + +/* rs6000.md uses OPTION_GLIBC unconditionally, while it is defined only in + linux{,64}.h. Define fallback for other targets here. 
*/ +#ifndef OPTION_GLIBC +#define OPTION_GLIBC 0 +#endif -- cgit v1.1 From 7e83607907151d5fbb3d2a7bceb7dcc6125c6c15 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Mon, 31 Jan 2022 12:28:12 -0600 Subject: rs6000: Don't #ifdef "short" built-in names It was recently pointed out that we get anomalous behavior when using __attribute__((target)) to select a CPU. As an example, when building for -mcpu=power8 but using __attribute__((target("mcpu=power10")), it is legal to call __builtin_vec_mod, but not vec_mod, even though these are equivalent. This is because the equivalence is established with a #define that is guarded by #ifdef _ARCH_PWR10. This goofy behavior occurs with both the old builtins support and the new. One of the goals of the new builtins support was to make sure all appropriate interfaces are available using __attribute__((target)), so I failed in this respect. This patch corrects the problem by removing the ifdef. Note that in a few cases we use an ifdef in a way that can't be overridden by __attribute__((target)), and we need to keep those. For example, #ifdef __PPU__ is still appropriate. 2022-01-06 Bill Schmidt gcc/ * config/rs6000/rs6000-overload.def (VEC_ABSD): Remove #ifdef token. (VEC_BLENDV): Likewise. (VEC_BPERM): Likewise. (VEC_CFUGE): Likewise. (VEC_CIPHER_BE): Likewise. (VEC_CIPHERLAST_BE): Likewise. (VEC_CLRL): Likewise. (VEC_CLRR): Likewise. (VEC_CMPNEZ): Likewise. (VEC_CNTLZ): Likewise. (VEC_CNTLZM): Likewise. (VEC_CNTTZM): Likewise. (VEC_CNTLZ_LSBB): Likewise. (VEC_CNTM): Likewise. (VEC_CNTTZ): Likewise. (VEC_CNTTZ_LSBB): Likewise. (VEC_CONVERT_4F32_8F16): Likewise. (VEC_DIV): Likewise. (VEC_DIVE): Likewise. (VEC_EQV): Likewise. (VEC_EXPANDM): Likewise. (VEC_EXTRACT_FP_FROM_SHORTH): Likewise. (VEC_EXTRACT_FP_FROM_SHORTL): Likewise. (VEC_EXTRACTH): Likewise. (VEC_EXTRACTL): Likewise. (VEC_EXTRACTM): Likewise. (VEC_EXTRACT4B): Likewise. (VEC_EXTULX): Likewise. (VEC_EXTURX): Likewise. (VEC_FIRSTMATCHINDEX): Likewise. (VEC_FIRSTMACHOREOSINDEX): Likewise. (VEC_FIRSTMISMATCHINDEX): Likewise. (VEC_FIRSTMISMATCHOREOSINDEX): Likewise. (VEC_GB): Likewise. (VEC_GENBM): Likewise. (VEC_GENHM): Likewise. (VEC_GENWM): Likewise. (VEC_GENDM): Likewise. (VEC_GENQM): Likewise. (VEC_GENPCVM): Likewise. (VEC_GNB): Likewise. (VEC_INSERTH): Likewise. (VEC_INSERTL): Likewise. (VEC_INSERT4B): Likewise. (VEC_LXVL): Likewise. (VEC_MERGEE): Likewise. (VEC_MERGEO): Likewise. (VEC_MOD): Likewise. (VEC_MSUB): Likewise. (VEC_MULH): Likewise. (VEC_NAND): Likewise. (VEC_NCIPHER_BE): Likewise. (VEC_NCIPHERLAST_BE): Likewise. (VEC_NEARBYINT): Likewise. (VEC_NMADD): Likewise. (VEC_ORC): Likewise. (VEC_PDEP): Likewise. (VEC_PERMX): Likewise. (VEC_PEXT): Likewise. (VEC_POPCNT): Likewise. (VEC_PARITY_LSBB): Likewise. (VEC_REPLACE_ELT): Likewise. (VEC_REPLACE_UN): Likewise. (VEC_REVB): Likewise. (VEC_RINT): Likewise. (VEC_RLMI): Likewise. (VEC_RLNM): Likewise. (VEC_SBOX_BE): Likewise. (VEC_SIGNEXTI): Likewise. (VEC_SIGNEXTLL): Likewise. (VEC_SIGNEXTQ): Likewise. (VEC_SLDB): Likewise. (VEC_SLV): Likewise. (VEC_SPLATI): Likewise. (VEC_SPLATID): Likewise. (VEC_SPLATI_INS): Likewise. (VEC_SQRT): Likewise. (VEC_SRDB): Likewise. (VEC_SRV): Likewise. (VEC_STRIL): Likewise. (VEC_STRIL_P): Likewise. (VEC_STRIR): Likewise. (VEC_STRIR_P): Likewise. (VEC_STXVL): Likewise. (VEC_TERNARYLOGIC): Likewise. (VEC_TEST_LSBB_ALL_ONES): Likewise. (VEC_TEST_LSBB_ALL_ZEROS): Likewise. (VEC_VEE): Likewise. (VEC_VES): Likewise. (VEC_VIE): Likewise. (VEC_VPRTYB): Likewise. (VEC_VSCEEQ): Likewise. (VEC_VSCEGT): Likewise. 
(VEC_VSCELT): Likewise. (VEC_VSCEUO): Likewise. (VEC_VSEE): Likewise. (VEC_VSES): Likewise. (VEC_VSIE): Likewise. (VEC_VSTDC): Likewise. (VEC_VSTDCN): Likewise. (VEC_VTDC): Likewise. (VEC_XL): Likewise. (VEC_XL_BE): Likewise. (VEC_XL_LEN_R): Likewise. (VEC_XL_SEXT): Likewise. (VEC_XL_ZEXT): Likewise. (VEC_XST): Likewise. (VEC_XST_BE): Likewise. (VEC_XST_LEN_R): Likewise. (VEC_XST_TRUNC): Likewise. (VEC_XXPERMDI): Likewise. (VEC_XXSLDWI): Likewise. (VEC_TSTSFI_EQ_DD): Likewise. (VEC_TSTSFI_EQ_TD): Likewise. (VEC_TSTSFI_GT_DD): Likewise. (VEC_TSTSFI_GT_TD): Likewise. (VEC_TSTSFI_LT_DD): Likewise. (VEC_TSTSFI_LT_TD): Likewise. (VEC_TSTSFI_OV_DD): Likewise. (VEC_TSTSFI_OV_TD): Likewise. (VEC_VADDCUQ): Likewise. (VEC_VADDECUQ): Likewise. (VEC_VADDEUQM): Likewise. (VEC_VADDUDM): Likewise. (VEC_VADDUQM): Likewise. (VEC_VBPERMQ): Likewise. (VEC_VCLZB): Likewise. (VEC_VCLZD): Likewise. (VEC_VCLZH): Likewise. (VEC_VCLZW): Likewise. (VEC_VCTZB): Likewise. (VEC_VCTZD): Likewise. (VEC_VCTZH): Likewise. (VEC_VCTZW): Likewise. (VEC_VEEDP): Likewise. (VEC_VEESP): Likewise. (VEC_VESDP): Likewise. (VEC_VESSP): Likewise. (VEC_VIEDP): Likewise. (VEC_VIESP): Likewise. (VEC_VPKSDSS): Likewise. (VEC_VPKSDUS): Likewise. (VEC_VPKUDUM): Likewise. (VEC_VPKUDUS): Likewise. (VEC_VPOPCNT): Likewise. (VEC_VPOPCNTB): Likewise. (VEC_VPOPCNTD): Likewise. (VEC_VPOPCNTH): Likewise. (VEC_VPOPCNTW): Likewise. (VEC_VPRTYBD): Likewise. (VEC_VPRTYBQ): Likewise. (VEC_VPRTYBW): Likewise. (VEC_VRLD): Likewise. (VEC_VSLD): Likewise. (VEC_VSRAD): Likewise. (VEC_VSRD): Likewise. (VEC_VSTDCDP): Likewise. (VEC_VSTDCNDP): Likewise. (VEC_VSTDCNQP): Likewise. (VEC_VSTDCNSP): Likewise. (VEC_VSTDCQP): Likewise. (VEC_VSTDCSP): Likewise. (VEC_VSUBECUQ): Likewise. (VEC_VSUBEUQM): Likewise. (VEC_VSUBUDM): Likewise. (VEC_VSUBUQM): Likewise. (VEC_VTDCDP): Likewise. (VEC_VTDCSP): Likewise. (VEC_VUPKHSW): Likewise. (VEC_VUPKLSW): Likewise. --- gcc/config/rs6000/rs6000-overload.def | 344 +++++++++++++++++----------------- 1 file changed, 174 insertions(+), 170 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-overload.def b/gcc/config/rs6000/rs6000-overload.def index 7d030ab..cdc703e 100644 --- a/gcc/config/rs6000/rs6000-overload.def +++ b/gcc/config/rs6000/rs6000-overload.def @@ -34,6 +34,10 @@ ; in rs6000-vecdefines.h. If no #define is desired, the should ; be replaced with the token SKIP. ; +; The token should be used sparingly, because a #define can't be +; overridden by __attribute__((target)). It is appropriate for cases +; where a target override isn't a possibility, like __PPU__. +; ; Each function entry has two lines. The first line is a prototype line. ; See rs6000-builtin-new.def for a description of the prototype line. 
; A prototype line in this file differs in that it doesn't have an @@ -205,7 +209,7 @@ vd __builtin_vec_abs (vd); XVABSDP -[VEC_ABSD, vec_absd, __builtin_vec_vadu, _ARCH_PWR9] +[VEC_ABSD, vec_absd, __builtin_vec_vadu] vuc __builtin_vec_vadu (vuc, vuc); VADUB vus __builtin_vec_vadu (vus, vus); @@ -503,7 +507,7 @@ vui __builtin_vec_avg (vui, vui); VAVGUW -[VEC_BLENDV, vec_blendv, __builtin_vec_xxblend, _ARCH_PWR10] +[VEC_BLENDV, vec_blendv, __builtin_vec_xxblend] vsc __builtin_vec_xxblend (vsc, vsc, vuc); VXXBLEND_V16QI VXXBLEND_VSC vuc __builtin_vec_xxblend (vuc, vuc, vuc); @@ -525,7 +529,7 @@ vd __builtin_vec_xxblend (vd, vd, vull); VXXBLEND_V2DF -[VEC_BPERM, vec_bperm, __builtin_vec_vbperm_api, _ARCH_PWR8] +[VEC_BPERM, vec_bperm, __builtin_vec_vbperm_api] vull __builtin_vec_vbperm_api (vull, vuc); VBPERMD VBPERMD_VULL vull __builtin_vec_vbperm_api (vuq, vuc); @@ -541,25 +545,25 @@ vd __builtin_vec_ceil (vd); XVRDPIP -[VEC_CFUGE, vec_cfuge, __builtin_vec_cfuge, _ARCH_PWR10] +[VEC_CFUGE, vec_cfuge, __builtin_vec_cfuge] vull __builtin_vec_cfuge (vull, vull); VCFUGED -[VEC_CIPHER_BE, vec_cipher_be, __builtin_vec_vcipher_be, _ARCH_PWR8] +[VEC_CIPHER_BE, vec_cipher_be, __builtin_vec_vcipher_be] vuc __builtin_vec_vcipher_be (vuc, vuc); VCIPHER_BE -[VEC_CIPHERLAST_BE, vec_cipherlast_be, __builtin_vec_vcipherlast_be, _ARCH_PWR8] +[VEC_CIPHERLAST_BE, vec_cipherlast_be, __builtin_vec_vcipherlast_be] vuc __builtin_vec_vcipherlast_be (vuc, vuc); VCIPHERLAST_BE -[VEC_CLRL, vec_clrl, __builtin_vec_clrl, _ARCH_PWR10] +[VEC_CLRL, vec_clrl, __builtin_vec_clrl] vsc __builtin_vec_clrl (vsc, unsigned int); VCLRLB VCLRLB_S vuc __builtin_vec_clrl (vuc, unsigned int); VCLRLB VCLRLB_U -[VEC_CLRR, vec_clrr, __builtin_vec_clrr, _ARCH_PWR10] +[VEC_CLRR, vec_clrr, __builtin_vec_clrr] vsc __builtin_vec_clrr (vsc, unsigned int); VCLRRB VCLRRB_S vuc __builtin_vec_clrr (vuc, unsigned int); @@ -1026,7 +1030,7 @@ signed int __builtin_vec_vcmpne_p (signed int, vbll, vsll); VCMPNED_P VCMPNED_P_SB -[VEC_CMPNEZ, vec_cmpnez, __builtin_vec_vcmpnez, _ARCH_PWR9] +[VEC_CMPNEZ, vec_cmpnez, __builtin_vec_vcmpnez] vbc __builtin_vec_cmpnez (vsc, vsc); CMPNEZB CMPNEZB_S vbc __builtin_vec_cmpnez (vuc, vuc); @@ -1064,7 +1068,7 @@ signed int __builtin_byte_in_range (unsigned int, unsigned int); CMPRB2 -[VEC_CNTLZ, vec_cntlz, __builtin_vec_vclz, _ARCH_PWR8] +[VEC_CNTLZ, vec_cntlz, __builtin_vec_vclz] vsc __builtin_vec_vclz (vsc); VCLZB VCLZB_S vuc __builtin_vec_vclz (vuc); @@ -1082,15 +1086,15 @@ vull __builtin_vec_vclz (vull); VCLZD VCLZD_U -[VEC_CNTLZM, vec_cntlzm, __builtin_vec_vclzdm, _ARCH_PWR10] +[VEC_CNTLZM, vec_cntlzm, __builtin_vec_vclzdm] vull __builtin_vec_vclzdm (vull, vull); VCLZDM -[VEC_CNTTZM, vec_cnttzm, __builtin_vec_vctzdm, _ARCH_PWR10] +[VEC_CNTTZM, vec_cnttzm, __builtin_vec_vctzdm] vull __builtin_vec_vctzdm (vull, vull); VCTZDM -[VEC_CNTLZ_LSBB, vec_cntlz_lsbb, __builtin_vec_vclzlsbb, _ARCH_PWR9] +[VEC_CNTLZ_LSBB, vec_cntlz_lsbb, __builtin_vec_vclzlsbb] signed int __builtin_vec_vclzlsbb (vsc); VCLZLSBB_V16QI VCLZLSBB_VSC signed int __builtin_vec_vclzlsbb (vuc); @@ -1104,7 +1108,7 @@ signed int __builtin_vec_vclzlsbb (vui); VCLZLSBB_V4SI VCLZLSBB_VUI -[VEC_CNTM, vec_cntm, __builtin_vec_cntm, _ARCH_PWR10] +[VEC_CNTM, vec_cntm, __builtin_vec_cntm] unsigned long long __builtin_vec_cntm (vuc, const int); VCNTMBB unsigned long long __builtin_vec_cntm (vus, const int); @@ -1114,7 +1118,7 @@ unsigned long long __builtin_vec_cntm (vull, const int); VCNTMBD -[VEC_CNTTZ, vec_cnttz, __builtin_vec_vctz, _ARCH_PWR9] +[VEC_CNTTZ, 
vec_cnttz, __builtin_vec_vctz] vsc __builtin_vec_vctz (vsc); VCTZB VCTZB_S vuc __builtin_vec_vctz (vuc); @@ -1132,7 +1136,7 @@ vull __builtin_vec_vctz (vull); VCTZD VCTZD_U -[VEC_CNTTZ_LSBB, vec_cnttz_lsbb, __builtin_vec_vctzlsbb, _ARCH_PWR9] +[VEC_CNTTZ_LSBB, vec_cnttz_lsbb, __builtin_vec_vctzlsbb] signed int __builtin_vec_vctzlsbb (vsc); VCTZLSBB_V16QI VCTZLSBB_VSC signed int __builtin_vec_vctzlsbb (vuc); @@ -1150,7 +1154,7 @@ vus __builtin_vec_convert_4f32_8i16 (vf, vf); CONVERT_4F32_8I16 -[VEC_CONVERT_4F32_8F16, vec_pack_to_short_fp32, __builtin_vec_convert_4f32_8f16, _ARCH_PWR9] +[VEC_CONVERT_4F32_8F16, vec_pack_to_short_fp32, __builtin_vec_convert_4f32_8f16] vus __builtin_vec_convert_4f32_8f16 (vf, vf); CONVERT_4F32_8F16 @@ -1182,7 +1186,7 @@ vull __builtin_vec_ctu (vd, const int); XVCVDPUXDS_SCALE -[VEC_DIV, vec_div, __builtin_vec_div, __VSX__] +[VEC_DIV, vec_div, __builtin_vec_div] vsi __builtin_vec_div (vsi, vsi); VDIVSW vui __builtin_vec_div (vui, vui); @@ -1200,7 +1204,7 @@ vd __builtin_vec_div (vd, vd); XVDIVDP -[VEC_DIVE, vec_dive, __builtin_vec_dive, _ARCH_PWR10] +[VEC_DIVE, vec_dive, __builtin_vec_dive] vsi __builtin_vec_dive (vsi, vsi); VDIVESW vui __builtin_vec_dive (vui, vui); @@ -1436,7 +1440,7 @@ void __builtin_vec_dstt (vf *, const int, const int); DSTT DSTT_VF -[VEC_EQV, vec_eqv, __builtin_vec_eqv, _ARCH_PWR8] +[VEC_EQV, vec_eqv, __builtin_vec_eqv] vsc __builtin_vec_eqv (vsc, vsc); EQV_V16QI vuc __builtin_vec_eqv (vuc, vuc); @@ -1499,7 +1503,7 @@ vull __builtin_vec_eqv (vull, vbll); EQV_V2DI_UNS EQV_VULL_VBLL -[VEC_EXPANDM, vec_expandm, __builtin_vec_vexpandm, _ARCH_PWR10] +[VEC_EXPANDM, vec_expandm, __builtin_vec_vexpandm] vuc __builtin_vec_vexpandm (vuc); VEXPANDMB vus __builtin_vec_vexpandm (vus); @@ -1524,15 +1528,15 @@ vsi __builtin_vec_extract (vsi, signed int); VSPLTW EXTRACT_FAKERY -[VEC_EXTRACT_FP_FROM_SHORTH, vec_extract_fp32_from_shorth, __builtin_vec_vextract_fp_from_shorth, _ARCH_PWR9] +[VEC_EXTRACT_FP_FROM_SHORTH, vec_extract_fp32_from_shorth, __builtin_vec_vextract_fp_from_shorth] vf __builtin_vec_vextract_fp_from_shorth (vus); VEXTRACT_FP_FROM_SHORTH -[VEC_EXTRACT_FP_FROM_SHORTL, vec_extract_fp32_from_shortl, __builtin_vec_vextract_fp_from_shortl, _ARCH_PWR9] +[VEC_EXTRACT_FP_FROM_SHORTL, vec_extract_fp32_from_shortl, __builtin_vec_vextract_fp_from_shortl] vf __builtin_vec_vextract_fp_from_shortl (vus); VEXTRACT_FP_FROM_SHORTL -[VEC_EXTRACTH, vec_extracth, __builtin_vec_extracth, _ARCH_PWR10] +[VEC_EXTRACTH, vec_extracth, __builtin_vec_extracth] vull __builtin_vec_extracth (vuc, vuc, unsigned char); VEXTRACTBR vull __builtin_vec_extracth (vus, vus, unsigned char); @@ -1542,7 +1546,7 @@ vull __builtin_vec_extracth (vull, vull, unsigned char); VEXTRACTDR -[VEC_EXTRACTL, vec_extractl, __builtin_vec_extractl, _ARCH_PWR10] +[VEC_EXTRACTL, vec_extractl, __builtin_vec_extractl] vull __builtin_vec_extractl (vuc, vuc, unsigned char); VEXTRACTBL vull __builtin_vec_extractl (vus, vus, unsigned char); @@ -1552,7 +1556,7 @@ vull __builtin_vec_extractl (vull, vull, unsigned char); VEXTRACTDL -[VEC_EXTRACTM, vec_extractm, __builtin_vec_vextractm, _ARCH_PWR10] +[VEC_EXTRACTM, vec_extractm, __builtin_vec_vextractm] signed int __builtin_vec_vextractm (vuc); VEXTRACTMB signed int __builtin_vec_vextractm (vus); @@ -1564,11 +1568,11 @@ signed int __builtin_vec_vextractm (vuq); VEXTRACTMQ -[VEC_EXTRACT4B, vec_extract4b, __builtin_vec_extract4b, _ARCH_PWR9] +[VEC_EXTRACT4B, vec_extract4b, __builtin_vec_extract4b] vull __builtin_vec_extract4b (vuc, const int); EXTRACT4B 
-[VEC_EXTULX, vec_xlx, __builtin_vec_vextulx, _ARCH_PWR9] +[VEC_EXTULX, vec_xlx, __builtin_vec_vextulx] signed char __builtin_vec_vextulx (unsigned int, vsc); VEXTUBLX VEXTUBLX_S unsigned char __builtin_vec_vextulx (unsigned int, vuc); @@ -1584,7 +1588,7 @@ float __builtin_vec_vextulx (unsigned int, vf); VEXTUWLX VEXTUWLX_F -[VEC_EXTURX, vec_xrx, __builtin_vec_vexturx, _ARCH_PWR9] +[VEC_EXTURX, vec_xrx, __builtin_vec_vexturx] signed char __builtin_vec_vexturx (unsigned int, vsc); VEXTUBRX VEXTUBRX_S unsigned char __builtin_vec_vexturx (unsigned int, vuc); @@ -1600,7 +1604,7 @@ float __builtin_vec_vexturx (unsigned int, vf); VEXTUWRX VEXTUWRX_F -[VEC_FIRSTMATCHINDEX, vec_first_match_index, __builtin_vec_first_match_index, _ARCH_PWR9] +[VEC_FIRSTMATCHINDEX, vec_first_match_index, __builtin_vec_first_match_index] unsigned int __builtin_vec_first_match_index (vsc, vsc); VFIRSTMATCHINDEX_V16QI FIRSTMATCHINDEX_VSC unsigned int __builtin_vec_first_match_index (vuc, vuc); @@ -1614,7 +1618,7 @@ unsigned int __builtin_vec_first_match_index (vui, vui); VFIRSTMATCHINDEX_V4SI FIRSTMATCHINDEX_VUI -[VEC_FIRSTMATCHOREOSINDEX, vec_first_match_or_eos_index, __builtin_vec_first_match_or_eos_index, _ARCH_PWR9] +[VEC_FIRSTMATCHOREOSINDEX, vec_first_match_or_eos_index, __builtin_vec_first_match_or_eos_index] unsigned int __builtin_vec_first_match_or_eos_index (vsc, vsc); VFIRSTMATCHOREOSINDEX_V16QI FIRSTMATCHOREOSINDEX_VSC unsigned int __builtin_vec_first_match_or_eos_index (vuc, vuc); @@ -1628,7 +1632,7 @@ unsigned int __builtin_vec_first_match_or_eos_index (vui, vui); VFIRSTMATCHOREOSINDEX_V4SI FIRSTMATCHOREOSINDEX_VUI -[VEC_FIRSTMISMATCHINDEX, vec_first_mismatch_index, __builtin_vec_first_mismatch_index, _ARCH_PWR9] +[VEC_FIRSTMISMATCHINDEX, vec_first_mismatch_index, __builtin_vec_first_mismatch_index] unsigned int __builtin_vec_first_mismatch_index (vsc, vsc); VFIRSTMISMATCHINDEX_V16QI FIRSTMISMATCHINDEX_VSC unsigned int __builtin_vec_first_mismatch_index (vuc, vuc); @@ -1642,7 +1646,7 @@ unsigned int __builtin_vec_first_mismatch_index (vui, vui); VFIRSTMISMATCHINDEX_V4SI FIRSTMISMATCHINDEX_VUI -[VEC_FIRSTMISMATCHOREOSINDEX, vec_first_mismatch_or_eos_index, __builtin_vec_first_mismatch_or_eos_index, _ARCH_PWR9] +[VEC_FIRSTMISMATCHOREOSINDEX, vec_first_mismatch_or_eos_index, __builtin_vec_first_mismatch_or_eos_index] unsigned int __builtin_vec_first_mismatch_or_eos_index (vsc, vsc); VFIRSTMISMATCHOREOSINDEX_V16QI FIRSTMISMATCHOREOSINDEX_VSC unsigned int __builtin_vec_first_mismatch_or_eos_index (vuc, vuc); @@ -1692,33 +1696,33 @@ vd __builtin_vec_floor (vd); XVRDPIM -[VEC_GB, vec_gb, __builtin_vec_vgbbd, _ARCH_PWR8] +[VEC_GB, vec_gb, __builtin_vec_vgbbd] vsc __builtin_vec_vgbbd (vsc); VGBBD VGBBD_S vuc __builtin_vec_vgbbd (vuc); VGBBD VGBBD_U -[VEC_GENBM, vec_genbm, __builtin_vec_mtvsrbm, _ARCH_PWR10] +[VEC_GENBM, vec_genbm, __builtin_vec_mtvsrbm] vuc __builtin_vec_mtvsrbm (unsigned long long); MTVSRBM -[VEC_GENHM, vec_genhm, __builtin_vec_mtvsrhm, _ARCH_PWR10] +[VEC_GENHM, vec_genhm, __builtin_vec_mtvsrhm] vus __builtin_vec_mtvsrhm (unsigned long long); MTVSRHM -[VEC_GENWM, vec_genwm, __builtin_vec_mtvsrwm, _ARCH_PWR10] +[VEC_GENWM, vec_genwm, __builtin_vec_mtvsrwm] vui __builtin_vec_mtvsrwm (unsigned long long); MTVSRWM -[VEC_GENDM, vec_gendm, __builtin_vec_mtvsrdm, _ARCH_PWR10] +[VEC_GENDM, vec_gendm, __builtin_vec_mtvsrdm] vull __builtin_vec_mtvsrdm (unsigned long long); MTVSRDM -[VEC_GENQM, vec_genqm, __builtin_vec_mtvsrqm, _ARCH_PWR10] +[VEC_GENQM, vec_genqm, __builtin_vec_mtvsrqm] vuq 
__builtin_vec_mtvsrqm (unsigned long long); MTVSRQM -[VEC_GENPCVM, vec_genpcvm, __builtin_vec_xxgenpcvm, _ARCH_PWR10] +[VEC_GENPCVM, vec_genpcvm, __builtin_vec_xxgenpcvm] vuc __builtin_vec_xxgenpcvm (vuc, const int); XXGENPCVM_V16QI vus __builtin_vec_xxgenpcvm (vus, const int); @@ -1728,7 +1732,7 @@ vull __builtin_vec_xxgenpcvm (vull, const int); XXGENPCVM_V2DI -[VEC_GNB, vec_gnb, __builtin_vec_gnb, _ARCH_PWR10] +[VEC_GNB, vec_gnb, __builtin_vec_gnb] unsigned long long __builtin_vec_gnb (vuq, const int); VGNB @@ -1740,7 +1744,7 @@ vsi __builtin_vec_insert (vsi, vsi, signed int); XXPERMDI_4SI INSERT_FAKERY -[VEC_INSERTH, vec_inserth, __builtin_vec_inserth, _ARCH_PWR10] +[VEC_INSERTH, vec_inserth, __builtin_vec_inserth] vuc __builtin_vec_inserth (unsigned char, vuc, unsigned int); VINSERTGPRBR vuc __builtin_vec_inserth (vuc, vuc, unsigned int); @@ -1756,7 +1760,7 @@ vull __builtin_vec_inserth (unsigned long long, vull, unsigned int); VINSERTGPRDR -[VEC_INSERTL, vec_insertl, __builtin_vec_insertl, _ARCH_PWR10] +[VEC_INSERTL, vec_insertl, __builtin_vec_insertl] vuc __builtin_vec_insertl (unsigned char, vuc, unsigned int); VINSERTGPRBL vuc __builtin_vec_insertl (vuc, vuc, unsigned int); @@ -1772,7 +1776,7 @@ vull __builtin_vec_insertl (unsigned long long, vull, unsigned int); VINSERTGPRDL -[VEC_INSERT4B, vec_insert4b, __builtin_vec_insert4b, _ARCH_PWR9] +[VEC_INSERT4B, vec_insert4b, __builtin_vec_insert4b] vuc __builtin_vec_insert4b (vsi, vuc, const int); INSERT4B INSERT4B_S vuc __builtin_vec_insert4b (vui, vuc, const int); @@ -2128,7 +2132,7 @@ vuc __builtin_vec_lvsr (signed long, const double *); LVSR LVSR_D -[VEC_LXVL, vec_xl_len, __builtin_vec_lxvl, _ARCH_PPC64_PWR9] +[VEC_LXVL, vec_xl_len, __builtin_vec_lxvl] vsc __builtin_vec_lxvl (const signed char *, unsigned int); LXVL LXVL_VSC vuc __builtin_vec_lxvl (const unsigned char *, unsigned int); @@ -2227,7 +2231,7 @@ vull __builtin_vec_max (vbll, vull); VMAXUD VMAXUD_BU -[VEC_MERGEE, vec_mergee, __builtin_vec_vmrgew, _ARCH_PWR8] +[VEC_MERGEE, vec_mergee, __builtin_vec_vmrgew] vsi __builtin_vec_vmrgew (vsi, vsi); VMRGEW_V4SI VMRGEW_VSI vui __builtin_vec_vmrgew (vui, vui); @@ -2327,7 +2331,7 @@ vull __builtin_vec_mergel (vbll, vull); VEC_MERGEL_V2DI VEC_MERGEL_VBLL_VULL -[VEC_MERGEO, vec_mergeo, __builtin_vec_vmrgow, _ARCH_PWR8] +[VEC_MERGEO, vec_mergeo, __builtin_vec_vmrgow] vsi __builtin_vec_vmrgow (vsi, vsi); VMRGOW_V4SI VMRGOW_VSI vui __builtin_vec_vmrgow (vui, vui); @@ -2414,7 +2418,7 @@ vus __builtin_vec_mladd (vus, vus, vus); VMLADDUHM VMLADDUHM_VUS2 -[VEC_MOD, vec_mod, __builtin_vec_mod, _ARCH_PWR10] +[VEC_MOD, vec_mod, __builtin_vec_mod] vsi __builtin_vec_mod (vsi, vsi); VMODSW vui __builtin_vec_mod (vui, vui); @@ -2432,7 +2436,7 @@ vss __builtin_vec_mradds (vss, vss, vss); VMHRADDSHS -[VEC_MSUB, vec_msub, __builtin_vec_msub, __VSX__] +[VEC_MSUB, vec_msub, __builtin_vec_msub] vf __builtin_vec_msub (vf, vf, vf); XVMSUBSP vd __builtin_vec_msub (vd, vd, vd); @@ -2511,7 +2515,7 @@ vuq __builtin_vec_mule (vull, vull); VMULEUD -[VEC_MULH, vec_mulh, __builtin_vec_mulh, _ARCH_PWR10] +[VEC_MULH, vec_mulh, __builtin_vec_mulh] vsi __builtin_vec_mulh (vsi, vsi); VMULHSW vui __builtin_vec_mulh (vui, vui); @@ -2553,7 +2557,7 @@ vd __builtin_vec_nabs (vd); NABS_V2DF -[VEC_NAND, vec_nand, __builtin_vec_nand, _ARCH_PWR8] +[VEC_NAND, vec_nand, __builtin_vec_nand] vsc __builtin_vec_nand (vsc, vsc); NAND_V16QI vuc __builtin_vec_nand (vuc, vuc); @@ -2616,15 +2620,15 @@ vull __builtin_vec_nand (vull, vbll); NAND_V2DI_UNS NAND_VULL_VBLL -[VEC_NCIPHER_BE, 
vec_ncipher_be, __builtin_vec_vncipher_be, _ARCH_PWR8] +[VEC_NCIPHER_BE, vec_ncipher_be, __builtin_vec_vncipher_be] vuc __builtin_vec_vncipher_be (vuc, vuc); VNCIPHER_BE -[VEC_NCIPHERLAST_BE, vec_ncipherlast_be, __builtin_vec_vncipherlast_be, _ARCH_PWR8] +[VEC_NCIPHERLAST_BE, vec_ncipherlast_be, __builtin_vec_vncipherlast_be] vuc __builtin_vec_vncipherlast_be (vuc, vuc); VNCIPHERLAST_BE -[VEC_NEARBYINT, vec_nearbyint, __builtin_vec_nearbyint, __VSX__] +[VEC_NEARBYINT, vec_nearbyint, __builtin_vec_nearbyint] vf __builtin_vec_nearbyint (vf); XVRSPI XVRSPI_NBI vd __builtin_vec_nearbyint (vd); @@ -2644,7 +2648,7 @@ vd __builtin_vec_neg (vd); NEG_V2DF -[VEC_NMADD, vec_nmadd, __builtin_vec_nmadd, __VSX__] +[VEC_NMADD, vec_nmadd, __builtin_vec_nmadd] vf __builtin_vec_nmadd (vf, vf, vf); XVNMADDSP vd __builtin_vec_nmadd (vd, vd, vd); @@ -2778,7 +2782,7 @@ vd __builtin_vec_or (vbll, vd); VOR_V2DF VOR_VBLL_VD -[VEC_ORC, vec_orc, __builtin_vec_orc, _ARCH_PWR8] +[VEC_ORC, vec_orc, __builtin_vec_orc] vsc __builtin_vec_orc (vsc, vsc); ORC_V16QI vuc __builtin_vec_orc (vuc, vuc); @@ -2895,7 +2899,7 @@ vui __builtin_vec_packsu (vsll, vsll); VPKSDUS -[VEC_PDEP, vec_pdep, __builtin_vec_vpdepd, _ARCH_PWR10] +[VEC_PDEP, vec_pdep, __builtin_vec_vpdepd] vull __builtin_vec_vpdepd (vull, vull); VPDEPD @@ -2940,7 +2944,7 @@ vbc __builtin_vec_perm (vbc, vbc, vbc); VPERM_16QI VPERM_VBC_VBC_VBC -[VEC_PERMX, vec_permx, __builtin_vec_xxpermx, _ARCH_PWR10] +[VEC_PERMX, vec_permx, __builtin_vec_xxpermx] vsc __builtin_vec_xxpermx (vsc, vsc, vuc, const int); XXPERMX_UV2DI XXPERMX_VSC vuc __builtin_vec_xxpermx (vuc, vuc, vuc, const int); @@ -2970,7 +2974,7 @@ vbc __builtin_vec_vpermxor (vbc, vbc, vbc); VPERMXOR VPERMXOR_VBC -[VEC_PEXT, vec_pext, __builtin_vec_vpextd, _ARCH_PWR10] +[VEC_PEXT, vec_pext, __builtin_vec_vpextd] vull __builtin_vec_vpextd (vull, vull); VPEXTD @@ -2984,7 +2988,7 @@ vuq __builtin_vec_vpmsum (vull, vull); VPMSUMD VPMSUMD_V -[VEC_POPCNT, vec_popcnt, __builtin_vec_vpopcntu, _ARCH_PWR8] +[VEC_POPCNT, vec_popcnt, __builtin_vec_vpopcntu] vuc __builtin_vec_vpopcntu (vsc); VPOPCNTB vuc __builtin_vec_vpopcntu (vuc); @@ -3002,7 +3006,7 @@ vull __builtin_vec_vpopcntu (vull); VPOPCNTUD -[VEC_PARITY_LSBB, vec_parity_lsbb, __builtin_vec_vparity_lsbb, _ARCH_PWR9] +[VEC_PARITY_LSBB, vec_parity_lsbb, __builtin_vec_vparity_lsbb] vui __builtin_vec_vparity_lsbb (vsi); VPRTYBW VPRTYBW_S vui __builtin_vec_vparity_lsbb (vui); @@ -3036,7 +3040,7 @@ vd __builtin_vec_recipdiv (vd, vd); RECIP_V2DF -[VEC_REPLACE_ELT, vec_replace_elt, __builtin_vec_replace_elt, _ARCH_PWR10] +[VEC_REPLACE_ELT, vec_replace_elt, __builtin_vec_replace_elt] vui __builtin_vec_replace_elt (vui, unsigned int, const int); VREPLACE_ELT_UV4SI vsi __builtin_vec_replace_elt (vsi, signed int, const int); @@ -3050,7 +3054,7 @@ vd __builtin_vec_replace_elt (vd, double, const int); VREPLACE_ELT_V2DF -[VEC_REPLACE_UN, vec_replace_unaligned, __builtin_vec_replace_un, _ARCH_PWR10] +[VEC_REPLACE_UN, vec_replace_unaligned, __builtin_vec_replace_un] vui __builtin_vec_replace_un (vui, unsigned int, const int); VREPLACE_UN_UV4SI vsi __builtin_vec_replace_un (vsi, signed int, const int); @@ -3064,7 +3068,7 @@ vd __builtin_vec_replace_un (vd, double, const int); VREPLACE_UN_V2DF -[VEC_REVB, vec_revb, __builtin_vec_revb, _ARCH_PWR8] +[VEC_REVB, vec_revb, __builtin_vec_revb] vss __builtin_vec_revb (vss); REVB_V8HI REVB_VSS vus __builtin_vec_revb (vus); @@ -3129,7 +3133,7 @@ vd __builtin_vec_vreve (vd); VREVE_V2DF -[VEC_RINT, vec_rint, __builtin_vec_rint, __VSX__] 
+[VEC_RINT, vec_rint, __builtin_vec_rint] vf __builtin_vec_rint (vf); XVRSPIC vd __builtin_vec_rint (vd); @@ -3157,7 +3161,7 @@ vuq __builtin_vec_rl (vuq, vuq); VRLQ VRLQ_VUQ -[VEC_RLMI, vec_rlmi, __builtin_vec_rlmi, _ARCH_PWR9] +[VEC_RLMI, vec_rlmi, __builtin_vec_rlmi] vui __builtin_vec_rlmi (vui, vui, vui); VRLWMI vull __builtin_vec_rlmi (vull, vull, vull); @@ -3167,7 +3171,7 @@ vuq __builtin_vec_rlmi (vuq, vuq, vuq); VRLQMI VRLQMI_VUQ -[VEC_RLNM, vec_vrlnm, __builtin_vec_rlnm, _ARCH_PWR9] +[VEC_RLNM, vec_vrlnm, __builtin_vec_rlnm] vui __builtin_vec_rlnm (vui, vui); VRLWNM vull __builtin_vec_rlnm (vull, vull); @@ -3195,7 +3199,7 @@ vd __builtin_vec_rsqrte (vd); XVRSQRTEDP -[VEC_SBOX_BE, vec_sbox_be, __builtin_vec_sbox_be, _ARCH_PWR8] +[VEC_SBOX_BE, vec_sbox_be, __builtin_vec_sbox_be] vuc __builtin_vec_sbox_be (vuc); VSBOX_BE @@ -3294,13 +3298,13 @@ vsi __builtin_vec_vsignedo (vd); VEC_VSIGNEDO_V2DF -[VEC_SIGNEXTI, vec_signexti, __builtin_vec_signexti, _ARCH_PWR9] +[VEC_SIGNEXTI, vec_signexti, __builtin_vec_signexti] vsi __builtin_vec_signexti (vsc); VSIGNEXTSB2W vsi __builtin_vec_signexti (vss); VSIGNEXTSH2W -[VEC_SIGNEXTLL, vec_signextll, __builtin_vec_signextll, _ARCH_PWR9] +[VEC_SIGNEXTLL, vec_signextll, __builtin_vec_signextll] vsll __builtin_vec_signextll (vsc); VSIGNEXTSB2D vsll __builtin_vec_signextll (vss); @@ -3308,7 +3312,7 @@ vsll __builtin_vec_signextll (vsi); VSIGNEXTSW2D -[VEC_SIGNEXTQ, vec_signextq, __builtin_vec_signextq, _ARCH_PWR10] +[VEC_SIGNEXTQ, vec_signextq, __builtin_vec_signextq] vsq __builtin_vec_signextq (vsll); VSIGNEXTSD2Q @@ -3366,7 +3370,7 @@ vd __builtin_vec_sld (vd, vd, const int); VSLDOI_2DF -[VEC_SLDB, vec_sldb, __builtin_vec_sldb, _ARCH_PWR10] +[VEC_SLDB, vec_sldb, __builtin_vec_sldb] vsc __builtin_vec_sldb (vsc, vsc, const int); VSLDB_V16QI VSLDB_VSC vuc __builtin_vec_sldb (vuc, vuc, const int); @@ -3521,7 +3525,7 @@ vf __builtin_vec_slo (vf, vuc); VSLO VSLO_VFU -[VEC_SLV, vec_slv, __builtin_vec_vslv, _ARCH_PWR9] +[VEC_SLV, vec_slv, __builtin_vec_vslv] vuc __builtin_vec_vslv (vuc, vuc); VSLV @@ -3572,17 +3576,17 @@ ; There are no entries for vec_splat_u{8,16,32}. These are handled ; in altivec.h with a #define and a cast. 
-[VEC_SPLATI, vec_splati, __builtin_vec_xxspltiw, _ARCH_PWR10] +[VEC_SPLATI, vec_splati, __builtin_vec_xxspltiw] vsi __builtin_vec_xxspltiw (signed int); VXXSPLTIW_V4SI vf __builtin_vec_xxspltiw (float); VXXSPLTIW_V4SF -[VEC_SPLATID, vec_splatid, __builtin_vec_xxspltid, _ARCH_PWR10] +[VEC_SPLATID, vec_splatid, __builtin_vec_xxspltid] vd __builtin_vec_xxspltid (float); VXXSPLTIDP -[VEC_SPLATI_INS, vec_splati_ins, __builtin_vec_xxsplti32dx, _ARCH_PWR10] +[VEC_SPLATI_INS, vec_splati_ins, __builtin_vec_xxsplti32dx] vsi __builtin_vec_xxsplti32dx (vsi, const int, signed int); VXXSPLTI32DX_V4SI VXXSPLTI32DX_VSI vui __builtin_vec_xxsplti32dx (vui, const int, unsigned int); @@ -3598,7 +3602,7 @@ vsi __builtin_vec_splats (vsi); ABS_V4SI SPLATS_FAKERY -[VEC_SQRT, vec_sqrt, __builtin_vec_sqrt, __VSX__] +[VEC_SQRT, vec_sqrt, __builtin_vec_sqrt] vf __builtin_vec_sqrt (vf); XVSQRTSP vd __builtin_vec_sqrt (vd); @@ -3648,7 +3652,7 @@ vuq __builtin_vec_sra (vuq, vuq); VSRAQ VSRAQ_VUQ -[VEC_SRDB, vec_srdb, __builtin_vec_srdb, _ARCH_PWR10] +[VEC_SRDB, vec_srdb, __builtin_vec_srdb] vsc __builtin_vec_srdb (vsc, vsc, const int); VSRDB_V16QI VSRDB_VSC vuc __builtin_vec_srdb (vuc, vuc, const int); @@ -3775,7 +3779,7 @@ vf __builtin_vec_sro (vf, vuc); VSRO VSRO_VFU -[VEC_SRV, vec_srv, __builtin_vec_vsrv, _ARCH_PWR9] +[VEC_SRV, vec_srv, __builtin_vec_vsrv] vuc __builtin_vec_vsrv (vuc, vuc); VSRV @@ -3956,7 +3960,7 @@ void __builtin_vec_stl (vd, signed long long, double *); STVXL_V2DF STVXL_D -[VEC_STRIL, vec_stril, __builtin_vec_stril, _ARCH_PWR10] +[VEC_STRIL, vec_stril, __builtin_vec_stril] vuc __builtin_vec_stril (vuc); VSTRIBL VSTRIBL_U vsc __builtin_vec_stril (vsc); @@ -3966,7 +3970,7 @@ vss __builtin_vec_stril (vss); VSTRIHL VSTRIHL_S -[VEC_STRIL_P, vec_stril_p, __builtin_vec_stril_p, _ARCH_PWR10] +[VEC_STRIL_P, vec_stril_p, __builtin_vec_stril_p] signed int __builtin_vec_stril_p (vuc); VSTRIBL_P VSTRIBL_PU signed int __builtin_vec_stril_p (vsc); @@ -3976,7 +3980,7 @@ signed int __builtin_vec_stril_p (vss); VSTRIHL_P VSTRIHL_PS -[VEC_STRIR, vec_strir, __builtin_vec_strir, _ARCH_PWR10] +[VEC_STRIR, vec_strir, __builtin_vec_strir] vuc __builtin_vec_strir (vuc); VSTRIBR VSTRIBR_U vsc __builtin_vec_strir (vsc); @@ -3986,7 +3990,7 @@ vss __builtin_vec_strir (vss); VSTRIHR VSTRIHR_S -[VEC_STRIR_P, vec_strir_p, __builtin_vec_strir_p, _ARCH_PWR10] +[VEC_STRIR_P, vec_strir_p, __builtin_vec_strir_p] signed int __builtin_vec_strir_p (vuc); VSTRIBR_P VSTRIBR_PU signed int __builtin_vec_strir_p (vsc); @@ -4148,7 +4152,7 @@ void __builtin_vec_stvrxl (vf, signed long long, float *); STVRXL STVRXL_F -[VEC_STXVL, vec_xst_len, __builtin_vec_stxvl, _ARCH_PPC64_PWR9] +[VEC_STXVL, vec_xst_len, __builtin_vec_stxvl] void __builtin_vec_stxvl (vsc, signed char *, unsigned int); STXVL STXVL_VSC void __builtin_vec_stxvl (vuc, unsigned char *, unsigned int); @@ -4316,7 +4320,7 @@ vsi __builtin_vec_sums (vsi, vsi); VSUMSWS -[VEC_TERNARYLOGIC, vec_ternarylogic, __builtin_vec_xxeval, _ARCH_PWR10] +[VEC_TERNARYLOGIC, vec_ternarylogic, __builtin_vec_xxeval] vuc __builtin_vec_xxeval (vuc, vuc, vuc, const int); XXEVAL XXEVAL_VUC vus __builtin_vec_xxeval (vus, vus, vus, const int); @@ -4328,11 +4332,11 @@ vuq __builtin_vec_xxeval (vuq, vuq, vuq, const int); XXEVAL XXEVAL_VUQ -[VEC_TEST_LSBB_ALL_ONES, vec_test_lsbb_all_ones, __builtin_vec_xvtlsbb_all_ones, _ARCH_PWR9] +[VEC_TEST_LSBB_ALL_ONES, vec_test_lsbb_all_ones, __builtin_vec_xvtlsbb_all_ones] signed int __builtin_vec_xvtlsbb_all_ones (vuc); XVTLSBB_ONES -[VEC_TEST_LSBB_ALL_ZEROS, 
vec_test_lsbb_all_zeros, __builtin_vec_xvtlsbb_all_zeros, _ARCH_PWR9] +[VEC_TEST_LSBB_ALL_ZEROS, vec_test_lsbb_all_zeros, __builtin_vec_xvtlsbb_all_zeros] signed int __builtin_vec_xvtlsbb_all_zeros (vuc); XVTLSBB_ZEROS @@ -4420,19 +4424,19 @@ vui __builtin_vec_vunsignedo (vd); VEC_VUNSIGNEDO_V2DF -[VEC_VEE, vec_extract_exp, __builtin_vec_extract_exp, _ARCH_PWR9] +[VEC_VEE, vec_extract_exp, __builtin_vec_extract_exp] vui __builtin_vec_extract_exp (vf); VEESP vull __builtin_vec_extract_exp (vd); VEEDP -[VEC_VES, vec_extract_sig, __builtin_vec_extract_sig, _ARCH_PWR9] +[VEC_VES, vec_extract_sig, __builtin_vec_extract_sig] vui __builtin_vec_extract_sig (vf); VESSP vull __builtin_vec_extract_sig (vd); VESDP -[VEC_VIE, vec_insert_exp, __builtin_vec_insert_exp, _ARCH_PWR9] +[VEC_VIE, vec_insert_exp, __builtin_vec_insert_exp] vf __builtin_vec_insert_exp (vf, vui); VIESP VIESP_VF vf __builtin_vec_insert_exp (vui, vui); @@ -4444,7 +4448,7 @@ ; It is truly unfortunate that vec_vprtyb has an incompatible set of ; interfaces with vec_parity_lsbb. So we can't even deprecate this. -[VEC_VPRTYB, vec_vprtyb, __builtin_vec_vprtyb, _ARCH_PWR9] +[VEC_VPRTYB, vec_vprtyb, __builtin_vec_vprtyb] vsi __builtin_vec_vprtyb (vsi); VPRTYBW VPRTYB_VSI vui __builtin_vec_vprtyb (vui); @@ -4462,43 +4466,43 @@ unsigned __int128 __builtin_vec_vprtyb (unsigned __int128); VPRTYBQ VPRTYB_UQ -[VEC_VSCEEQ, scalar_cmp_exp_eq, __builtin_vec_scalar_cmp_exp_eq, _ARCH_PWR9] +[VEC_VSCEEQ, scalar_cmp_exp_eq, __builtin_vec_scalar_cmp_exp_eq] signed int __builtin_vec_scalar_cmp_exp_eq (double, double); VSCEDPEQ signed int __builtin_vec_scalar_cmp_exp_eq (_Float128, _Float128); VSCEQPEQ -[VEC_VSCEGT, scalar_cmp_exp_gt, __builtin_vec_scalar_cmp_exp_gt, _ARCH_PWR9] +[VEC_VSCEGT, scalar_cmp_exp_gt, __builtin_vec_scalar_cmp_exp_gt] signed int __builtin_vec_scalar_cmp_exp_gt (double, double); VSCEDPGT signed int __builtin_vec_scalar_cmp_exp_gt (_Float128, _Float128); VSCEQPGT -[VEC_VSCELT, scalar_cmp_exp_lt, __builtin_vec_scalar_cmp_exp_lt, _ARCH_PWR9] +[VEC_VSCELT, scalar_cmp_exp_lt, __builtin_vec_scalar_cmp_exp_lt] signed int __builtin_vec_scalar_cmp_exp_lt (double, double); VSCEDPLT signed int __builtin_vec_scalar_cmp_exp_lt (_Float128, _Float128); VSCEQPLT -[VEC_VSCEUO, scalar_cmp_exp_unordered, __builtin_vec_scalar_cmp_exp_unordered, _ARCH_PWR9] +[VEC_VSCEUO, scalar_cmp_exp_unordered, __builtin_vec_scalar_cmp_exp_unordered] signed int __builtin_vec_scalar_cmp_exp_unordered (double, double); VSCEDPUO signed int __builtin_vec_scalar_cmp_exp_unordered (_Float128, _Float128); VSCEQPUO -[VEC_VSEE, scalar_extract_exp, __builtin_vec_scalar_extract_exp, _ARCH_PWR9] +[VEC_VSEE, scalar_extract_exp, __builtin_vec_scalar_extract_exp] unsigned int __builtin_vec_scalar_extract_exp (double); VSEEDP unsigned int __builtin_vec_scalar_extract_exp (_Float128); VSEEQP -[VEC_VSES, scalar_extract_sig, __builtin_vec_scalar_extract_sig, _ARCH_PWR9] +[VEC_VSES, scalar_extract_sig, __builtin_vec_scalar_extract_sig] unsigned long long __builtin_vec_scalar_extract_sig (double); VSESDP unsigned __int128 __builtin_vec_scalar_extract_sig (_Float128); VSESQP -[VEC_VSIE, scalar_insert_exp, __builtin_vec_scalar_insert_exp, _ARCH_PWR9] +[VEC_VSIE, scalar_insert_exp, __builtin_vec_scalar_insert_exp] double __builtin_vec_scalar_insert_exp (unsigned long long, unsigned long long); VSIEDP double __builtin_vec_scalar_insert_exp (double, unsigned long long); @@ -4508,7 +4512,7 @@ _Float128 __builtin_vec_scalar_insert_exp (_Float128, unsigned long long); VSIEQPF -[VEC_VSTDC, 
scalar_test_data_class, __builtin_vec_scalar_test_data_class, _ARCH_PWR9] +[VEC_VSTDC, scalar_test_data_class, __builtin_vec_scalar_test_data_class] unsigned int __builtin_vec_scalar_test_data_class (float, const int); VSTDCSP unsigned int __builtin_vec_scalar_test_data_class (double, const int); @@ -4516,7 +4520,7 @@ unsigned int __builtin_vec_scalar_test_data_class (_Float128, const int); VSTDCQP -[VEC_VSTDCN, scalar_test_neg, __builtin_vec_scalar_test_neg, _ARCH_PWR9] +[VEC_VSTDCN, scalar_test_neg, __builtin_vec_scalar_test_neg] unsigned int __builtin_vec_scalar_test_neg (float); VSTDCNSP unsigned int __builtin_vec_scalar_test_neg (double); @@ -4524,13 +4528,13 @@ unsigned int __builtin_vec_scalar_test_neg (_Float128); VSTDCNQP -[VEC_VTDC, vec_test_data_class, __builtin_vec_test_data_class, _ARCH_PWR9] +[VEC_VTDC, vec_test_data_class, __builtin_vec_test_data_class] vbi __builtin_vec_test_data_class (vf, const int); VTDCSP vbll __builtin_vec_test_data_class (vd, const int); VTDCDP -[VEC_XL, vec_xl, __builtin_vec_vsx_ld, __VSX__] +[VEC_XL, vec_xl, __builtin_vec_vsx_ld] vsc __builtin_vec_vsx_ld (signed long long, const vsc *); LXVW4X_V16QI LXVW4X_VSC vsc __builtin_vec_vsx_ld (signed long long, const signed char *); @@ -4588,7 +4592,7 @@ vd __builtin_vec_vsx_ld (signed long long, const double *); LXVD2X_V2DF LXVD2X_D -[VEC_XL_BE, vec_xl_be, __builtin_vec_xl_be, __VSX__] +[VEC_XL_BE, vec_xl_be, __builtin_vec_xl_be] vsc __builtin_vec_xl_be (signed long long, const vsc *); LD_ELEMREV_V16QI LD_ELEMREV_VSC vsc __builtin_vec_xl_be (signed long long, const signed char *); @@ -4634,11 +4638,11 @@ vd __builtin_vec_xl_be (signed long long, const double *); LD_ELEMREV_V2DF LD_ELEMREV_DD -[VEC_XL_LEN_R, vec_xl_len_r, __builtin_vec_xl_len_r, _ARCH_PPC64_PWR9] +[VEC_XL_LEN_R, vec_xl_len_r, __builtin_vec_xl_len_r] vuc __builtin_vsx_xl_len_r (const unsigned char *, unsigned int); XL_LEN_R -[VEC_XL_SEXT, vec_xl_sext, __builtin_vec_xl_sext, _ARCH_PWR10] +[VEC_XL_SEXT, vec_xl_sext, __builtin_vec_xl_sext] vsq __builtin_vec_xl_sext (signed long long, const signed char *); SE_LXVRBX vsq __builtin_vec_xl_sext (signed long long, const signed short *); @@ -4648,7 +4652,7 @@ vsq __builtin_vec_xl_sext (signed long long, const signed long long *); SE_LXVRDX -[VEC_XL_ZEXT, vec_xl_zext, __builtin_vec_xl_zext, _ARCH_PWR10] +[VEC_XL_ZEXT, vec_xl_zext, __builtin_vec_xl_zext] vuq __builtin_vec_xl_zext (signed long long, const unsigned char *); ZE_LXVRBX vuq __builtin_vec_xl_zext (signed long long, const unsigned short *); @@ -4733,7 +4737,7 @@ vd __builtin_vec_xor (vbll, vd); VXOR_V2DF VXOR_VBLL_VD -[VEC_XST, vec_xst, __builtin_vec_vsx_st, __VSX__] +[VEC_XST, vec_xst, __builtin_vec_vsx_st] void __builtin_vec_vsx_st (vsc, signed long long, vsc *); STXVW4X_V16QI STXVW4X_VSC void __builtin_vec_vsx_st (vsc, signed long long, signed char *); @@ -4801,7 +4805,7 @@ void __builtin_vec_vsx_st (vd, signed long long, double *); STXVD2X_V2DF STXVD2X_D -[VEC_XST_BE, vec_xst_be, __builtin_vec_xst_be, __VSX__] +[VEC_XST_BE, vec_xst_be, __builtin_vec_xst_be] void __builtin_vec_xst_be (vsc, signed long long, vsc *); ST_ELEMREV_V16QI ST_ELEMREV_VSC void __builtin_vec_xst_be (vsc, signed long long, signed char *); @@ -4847,11 +4851,11 @@ void __builtin_vec_xst_be (vd, signed long long, double *); ST_ELEMREV_V2DF ST_ELEMREV_D -[VEC_XST_LEN_R, vec_xst_len_r, __builtin_vec_xst_len_r, _ARCH_PPC64_PWR9] +[VEC_XST_LEN_R, vec_xst_len_r, __builtin_vec_xst_len_r] void __builtin_vsx_xst_len_r (vuc, unsigned char *, unsigned int); XST_LEN_R 
-[VEC_XST_TRUNC, vec_xst_trunc, __builtin_vec_xst_trunc, _ARCH_PWR10] +[VEC_XST_TRUNC, vec_xst_trunc, __builtin_vec_xst_trunc] void __builtin_vec_xst_trunc (vsq, signed long long, signed char *); TR_STXVRBX TR_STXVRBX_S void __builtin_vec_xst_trunc (vuq, signed long long, unsigned char *); @@ -4869,7 +4873,7 @@ void __builtin_vec_xst_trunc (vuq, signed long long, unsigned long long *); TR_STXVRDX TR_STXVRDX_U -[VEC_XXPERMDI, vec_xxpermdi, __builtin_vsx_xxpermdi, __VSX__] +[VEC_XXPERMDI, vec_xxpermdi, __builtin_vsx_xxpermdi] vsc __builtin_vsx_xxpermdi (vsc, vsc, const int); XXPERMDI_16QI XXPERMDI_VSC vuc __builtin_vsx_xxpermdi (vuc, vuc, const int); @@ -4891,7 +4895,7 @@ vd __builtin_vsx_xxpermdi (vd, vd, const int); XXPERMDI_2DF XXPERMDI_VD -[VEC_XXSLDWI, vec_xxsldwi, __builtin_vsx_xxsldwi, __VSX__] +[VEC_XXSLDWI, vec_xxsldwi, __builtin_vsx_xxsldwi] vsc __builtin_vsx_xxsldwi (vsc, vsc, const int); XXSLDWI_16QI XXSLDWI_VSC2 vuc __builtin_vsx_xxsldwi (vuc, vuc, const int); @@ -4990,51 +4994,51 @@ void __builtin_vec_stvewx (vui, signed long, void *); STVEWX STVEWX_DEPR8 -[VEC_TSTSFI_EQ_DD, SKIP, __builtin_dfp_dtstsfi_eq_dd, _ARCH_PWR9] +[VEC_TSTSFI_EQ_DD, SKIP, __builtin_dfp_dtstsfi_eq_dd] signed int __builtin_dfp_dtstsfi_eq_dd (const int, _Decimal64); TSTSFI_EQ_DD TSTSFI_EQ_DD_DEPR1 -[VEC_TSTSFI_EQ_TD, SKIP, __builtin_dfp_dtstsfi_eq_td, _ARCH_PWR9] +[VEC_TSTSFI_EQ_TD, SKIP, __builtin_dfp_dtstsfi_eq_td] signed int __builtin_dfp_dtstsfi_eq_td (const int, _Decimal128); TSTSFI_EQ_TD TSTSFI_EQ_TD_DEPR1 -[VEC_TSTSFI_GT_DD, SKIP, __builtin_dfp_dtstsfi_gt_dd, _ARCH_PWR9] +[VEC_TSTSFI_GT_DD, SKIP, __builtin_dfp_dtstsfi_gt_dd] signed int __builtin_dfp_dtstsfi_gt_dd (const int, _Decimal64); TSTSFI_GT_DD TSTSFI_GT_DD_DEPR1 -[VEC_TSTSFI_GT_TD, SKIP, __builtin_dfp_dtstsfi_gt_td, _ARCH_PWR9] +[VEC_TSTSFI_GT_TD, SKIP, __builtin_dfp_dtstsfi_gt_td] signed int __builtin_dfp_dtstsfi_gt_td (const int, _Decimal128); TSTSFI_GT_TD TSTSFI_GT_TD_DEPR1 -[VEC_TSTSFI_LT_DD, SKIP, __builtin_dfp_dtstsfi_lt_dd, _ARCH_PWR9] +[VEC_TSTSFI_LT_DD, SKIP, __builtin_dfp_dtstsfi_lt_dd] signed int __builtin_dfp_dtstsfi_lt_dd (const int, _Decimal64); TSTSFI_LT_DD TSTSFI_LT_DD_DEPR1 -[VEC_TSTSFI_LT_TD, SKIP, __builtin_dfp_dtstsfi_lt_td, _ARCH_PWR9] +[VEC_TSTSFI_LT_TD, SKIP, __builtin_dfp_dtstsfi_lt_td] signed int __builtin_dfp_dtstsfi_lt_td (const int, _Decimal128); TSTSFI_LT_TD TSTSFI_LT_TD_DEPR1 -[VEC_TSTSFI_OV_DD, SKIP, __builtin_dfp_dtstsfi_ov_dd, _ARCH_PWR9] +[VEC_TSTSFI_OV_DD, SKIP, __builtin_dfp_dtstsfi_ov_dd] signed int __builtin_dfp_dtstsfi_ov_dd (const int, _Decimal64); TSTSFI_OV_DD TSTSFI_OV_DD_DEPR1 -[VEC_TSTSFI_OV_TD, SKIP, __builtin_dfp_dtstsfi_ov_td, _ARCH_PWR9] +[VEC_TSTSFI_OV_TD, SKIP, __builtin_dfp_dtstsfi_ov_td] signed int __builtin_dfp_dtstsfi_ov_td (const int, _Decimal128); TSTSFI_OV_TD TSTSFI_OV_TD_DEPR1 -[VEC_VADDCUQ, vec_vaddcuq, __builtin_vec_vaddcuq, _ARCH_PWR8] +[VEC_VADDCUQ, vec_vaddcuq, __builtin_vec_vaddcuq] vsq __builtin_vec_vaddcuq (vsq, vsq); VADDCUQ VADDCUQ_DEPR1 vuq __builtin_vec_vaddcuq (vuq, vuq); VADDCUQ VADDCUQ_DEPR2 -[VEC_VADDECUQ, vec_vaddecuq, __builtin_vec_vaddecuq, _ARCH_PWR8] +[VEC_VADDECUQ, vec_vaddecuq, __builtin_vec_vaddecuq] vsq __builtin_vec_vaddecuq (vsq, vsq, vsq); VADDECUQ VADDECUQ_DEPR1 vuq __builtin_vec_vaddecuq (vuq, vuq, vuq); VADDECUQ VADDECUQ_DEPR2 -[VEC_VADDEUQM, vec_vaddeuqm, __builtin_vec_vaddeuqm, _ARCH_PWR8] +[VEC_VADDEUQM, vec_vaddeuqm, __builtin_vec_vaddeuqm] vsq __builtin_vec_vaddeuqm (vsq, vsq, vsq); VADDEUQM VADDEUQM_DEPR1 vuq __builtin_vec_vaddeuqm (vuq, vuq, vuq); 
@@ -5098,7 +5102,7 @@ vuc __builtin_vec_vaddubs (vuc, vbc); VADDUBS VADDUBS_DEPR5 -[VEC_VADDUDM, vec_vaddudm, __builtin_vec_vaddudm, _ARCH_PWR8] +[VEC_VADDUDM, vec_vaddudm, __builtin_vec_vaddudm] vsll __builtin_vec_vaddudm (vbll, vsll); VADDUDM VADDUDM_DEPR1 vsll __builtin_vec_vaddudm (vsll, vbll); @@ -5142,7 +5146,7 @@ vus __builtin_vec_vadduhs (vus, vbs); VADDUHS VADDUHS_DEPR5 -[VEC_VADDUQM, vec_vadduqm, __builtin_vec_vadduqm, _ARCH_PWR8] +[VEC_VADDUQM, vec_vadduqm, __builtin_vec_vadduqm] vsq __builtin_vec_vadduqm (vsq, vsq); VADDUQM VADDUQM_DEPR1 vuq __builtin_vec_vadduqm (vuq, vuq); @@ -5214,7 +5218,7 @@ vui __builtin_vec_vavguw (vui, vui); VAVGUW VAVGUW_DEPR1 -[VEC_VBPERMQ, vec_vbpermq, __builtin_vec_vbpermq, _ARCH_PWR8] +[VEC_VBPERMQ, vec_vbpermq, __builtin_vec_vbpermq] vull __builtin_vec_vbpermq (vull, vuc); VBPERMQ VBPERMQ_DEPR1 vsll __builtin_vec_vbpermq (vsc, vsc); @@ -5232,25 +5236,25 @@ vf __builtin_vec_vcfux (vui, const int); VCFUX VCFUX_DEPR1 -[VEC_VCLZB, vec_vclzb, __builtin_vec_vclzb, _ARCH_PWR8] +[VEC_VCLZB, vec_vclzb, __builtin_vec_vclzb] vsc __builtin_vec_vclzb (vsc); VCLZB VCLZB_DEPR1 vuc __builtin_vec_vclzb (vuc); VCLZB VCLZB_DEPR2 -[VEC_VCLZD, vec_vclzd, __builtin_vec_vclzd, _ARCH_PWR8] +[VEC_VCLZD, vec_vclzd, __builtin_vec_vclzd] vsll __builtin_vec_vclzd (vsll); VCLZD VCLZD_DEPR1 vull __builtin_vec_vclzd (vull); VCLZD VCLZD_DEPR2 -[VEC_VCLZH, vec_vclzh, __builtin_vec_vclzh, _ARCH_PWR8] +[VEC_VCLZH, vec_vclzh, __builtin_vec_vclzh] vss __builtin_vec_vclzh (vss); VCLZH VCLZH_DEPR1 vus __builtin_vec_vclzh (vus); VCLZH VCLZH_DEPR2 -[VEC_VCLZW, vec_vclzw, __builtin_vec_vclzw, _ARCH_PWR8] +[VEC_VCLZW, vec_vclzw, __builtin_vec_vclzw] vsi __builtin_vec_vclzw (vsi); VCLZW VCLZW_DEPR1 vui __builtin_vec_vclzw (vui); @@ -5306,53 +5310,53 @@ vbi __builtin_vec_vcmpgtuw (vui, vui); VCMPGTUW VCMPGTUW_DEPR1 -[VEC_VCTZB, vec_vctzb, __builtin_vec_vctzb, _ARCH_PWR9] +[VEC_VCTZB, vec_vctzb, __builtin_vec_vctzb] vsc __builtin_vec_vctzb (vsc); VCTZB VCTZB_DEPR1 vuc __builtin_vec_vctzb (vuc); VCTZB VCTZB_DEPR2 -[VEC_VCTZD, vec_vctzd, __builtin_vec_vctzd, _ARCH_PWR9] +[VEC_VCTZD, vec_vctzd, __builtin_vec_vctzd] vsll __builtin_vec_vctzd (vsll); VCTZD VCTZD_DEPR1 vull __builtin_vec_vctzd (vull); VCTZD VCTZD_DEPR2 -[VEC_VCTZH, vec_vctzh, __builtin_vec_vctzh, _ARCH_PWR9] +[VEC_VCTZH, vec_vctzh, __builtin_vec_vctzh] vss __builtin_vec_vctzh (vss); VCTZH VCTZH_DEPR1 vus __builtin_vec_vctzh (vus); VCTZH VCTZH_DEPR2 -[VEC_VCTZW, vec_vctzw, __builtin_vec_vctzw, _ARCH_PWR9] +[VEC_VCTZW, vec_vctzw, __builtin_vec_vctzw] vsi __builtin_vec_vctzw (vsi); VCTZW VCTZW_DEPR1 vui __builtin_vec_vctzw (vui); VCTZW VCTZW_DEPR2 -[VEC_VEEDP, vec_extract_exp_dp, __builtin_vec_extract_exp_dp, _ARCH_PWR9] +[VEC_VEEDP, vec_extract_exp_dp, __builtin_vec_extract_exp_dp] vull __builtin_vec_extract_exp_dp (vd); VEEDP VEEDP_DEPR1 -[VEC_VEESP, vec_extract_exp_sp, __builtin_vec_extract_exp_sp, _ARCH_PWR9] +[VEC_VEESP, vec_extract_exp_sp, __builtin_vec_extract_exp_sp] vui __builtin_vec_extract_exp_sp (vf); VEESP VEESP_DEPR1 -[VEC_VESDP, vec_extract_sig_dp, __builtin_vec_extract_sig_dp, _ARCH_PWR9] +[VEC_VESDP, vec_extract_sig_dp, __builtin_vec_extract_sig_dp] vull __builtin_vec_extract_sig_dp (vd); VESDP VESDP_DEPR1 -[VEC_VESSP, vec_extract_sig_sp, __builtin_vec_extract_sig_sp, _ARCH_PWR9] +[VEC_VESSP, vec_extract_sig_sp, __builtin_vec_extract_sig_sp] vui __builtin_vec_extract_sig_sp (vf); VESSP VESSP_DEPR1 -[VEC_VIEDP, vec_insert_exp_dp, __builtin_vec_insert_exp_dp, _ARCH_PWR9] +[VEC_VIEDP, vec_insert_exp_dp, 
__builtin_vec_insert_exp_dp] vd __builtin_vec_insert_exp_dp (vd, vull); VIEDP VIEDP_DEPR1 vd __builtin_vec_insert_exp_dp (vull, vull); VIEDP VIEDP_DEPR2 -[VEC_VIESP, vec_insert_exp_sp, __builtin_vec_insert_exp_sp, _ARCH_PWR9] +[VEC_VIESP, vec_insert_exp_sp, __builtin_vec_insert_exp_sp] vf __builtin_vec_insert_exp_sp (vf, vui); VIESP VIESP_DEPR1 vf __builtin_vec_insert_exp_sp (vui, vui); @@ -5650,11 +5654,11 @@ vull __builtin_vec_vmulouw (vui, vui); VMULOUW VMULOUW_DEPR1 -[VEC_VPKSDSS, vec_vpksdss, __builtin_vec_vpksdss, _ARCH_PWR8] +[VEC_VPKSDSS, vec_vpksdss, __builtin_vec_vpksdss] vsi __builtin_vec_vpksdss (vsll, vsll); VPKSDSS VPKSDSS_DEPR1 -[VEC_VPKSDUS, vec_vpksdus, __builtin_vec_vpksdus, _ARCH_PWR8] +[VEC_VPKSDUS, vec_vpksdus, __builtin_vec_vpksdus] vui __builtin_vec_vpksdus (vsll, vsll); VPKSDUS VPKSDUS_DEPR1 @@ -5674,7 +5678,7 @@ vus __builtin_vec_vpkswus (vsi, vsi); VPKSWUS VPKSWUS_DEPR1 -[VEC_VPKUDUM, vec_vpkudum, __builtin_vec_vpkudum, _ARCH_PWR8] +[VEC_VPKUDUM, vec_vpkudum, __builtin_vec_vpkudum] vsi __builtin_vec_vpkudum (vsll, vsll); VPKUDUM VPKUDUM_DEPR1 vui __builtin_vec_vpkudum (vull, vull); @@ -5682,7 +5686,7 @@ vbi __builtin_vec_vpkudum (vbll, vbll); VPKUDUM VPKUDUM_DEPR3 -[VEC_VPKUDUS, vec_vpkudus, __builtin_vec_vpkudus, _ARCH_PWR8] +[VEC_VPKUDUS, vec_vpkudus, __builtin_vec_vpkudus] vui __builtin_vec_vpkudus (vull, vull); VPKUDUS VPKUDUS_DEPR1 @@ -5710,7 +5714,7 @@ vus __builtin_vec_vpkuwus (vui, vui); VPKUWUS VPKUWUS_DEPR1 -[VEC_VPOPCNT, vec_vpopcnt, __builtin_vec_vpopcnt, _ARCH_PWR8] +[VEC_VPOPCNT, vec_vpopcnt, __builtin_vec_vpopcnt] vsc __builtin_vec_vpopcnt (vsc); VPOPCNTB VPOPCNT_DEPR1 vuc __builtin_vec_vpopcnt (vuc); @@ -5728,37 +5732,37 @@ vull __builtin_vec_vpopcnt (vull); VPOPCNTD VPOPCNT_DEPR8 -[VEC_VPOPCNTB, vec_vpopcntb, __builtin_vec_vpopcntb, _ARCH_PWR8] +[VEC_VPOPCNTB, vec_vpopcntb, __builtin_vec_vpopcntb] vsc __builtin_vec_vpopcntb (vsc); VPOPCNTB VPOPCNTB_DEPR1 vuc __builtin_vec_vpopcntb (vuc); VPOPCNTB VPOPCNTB_DEPR2 -[VEC_VPOPCNTD, vec_vpopcntd, __builtin_vec_vpopcntd, _ARCH_PWR8] +[VEC_VPOPCNTD, vec_vpopcntd, __builtin_vec_vpopcntd] vsll __builtin_vec_vpopcntd (vsll); VPOPCNTD VPOPCNTD_DEPR1 vull __builtin_vec_vpopcntd (vull); VPOPCNTD VPOPCNTD_DEPR2 -[VEC_VPOPCNTH, vec_vpopcnth, __builtin_vec_vpopcnth, _ARCH_PWR8] +[VEC_VPOPCNTH, vec_vpopcnth, __builtin_vec_vpopcnth] vss __builtin_vec_vpopcnth (vss); VPOPCNTH VPOPCNTH_DEPR1 vus __builtin_vec_vpopcnth (vus); VPOPCNTH VPOPCNTH_DEPR2 -[VEC_VPOPCNTW, vec_vpopcntw, __builtin_vec_vpopcntw, _ARCH_PWR8] +[VEC_VPOPCNTW, vec_vpopcntw, __builtin_vec_vpopcntw] vsi __builtin_vec_vpopcntw (vsi); VPOPCNTW VPOPCNTW_DEPR1 vui __builtin_vec_vpopcntw (vui); VPOPCNTW VPOPCNTW_DEPR2 -[VEC_VPRTYBD, vec_vprtybd, __builtin_vec_vprtybd, _ARCH_PWR9] +[VEC_VPRTYBD, vec_vprtybd, __builtin_vec_vprtybd] vsll __builtin_vec_vprtybd (vsll); VPRTYBD VPRTYBD_DEPR1 vull __builtin_vec_vprtybd (vull); VPRTYBD VPRTYBD_DEPR2 -[VEC_VPRTYBQ, vec_vprtybq, __builtin_vec_vprtybq, _ARCH_PPC64_PWR9] +[VEC_VPRTYBQ, vec_vprtybq, __builtin_vec_vprtybq] vsq __builtin_vec_vprtybq (vsq); VPRTYBQ VPRTYBQ_DEPR1 vuq __builtin_vec_vprtybq (vuq); @@ -5768,7 +5772,7 @@ unsigned __int128 __builtin_vec_vprtybq (unsigned __int128); VPRTYBQ VPRTYBQ_DEPR4 -[VEC_VPRTYBW, vec_vprtybw, __builtin_vec_vprtybw, _ARCH_PWR9] +[VEC_VPRTYBW, vec_vprtybw, __builtin_vec_vprtybw] vsi __builtin_vec_vprtybw (vsi); VPRTYBW VPRTYBW_DEPR1 vui __builtin_vec_vprtybw (vui); @@ -5780,7 +5784,7 @@ vuc __builtin_vec_vrlb (vuc, vuc); VRLB VRLB_DEPR2 -[VEC_VRLD, SKIP, __builtin_vec_vrld, 
_ARCH_PWR8] +[VEC_VRLD, SKIP, __builtin_vec_vrld] vsll __builtin_vec_vrld (vsll, vull); VRLD VRLD_DEPR1 vull __builtin_vec_vrld (vull, vull); @@ -5804,7 +5808,7 @@ vuc __builtin_vec_vslb (vuc, vuc); VSLB VSLB_DEPR2 -[VEC_VSLD, SKIP, __builtin_vec_vsld, _ARCH_PWR8] +[VEC_VSLD, SKIP, __builtin_vec_vsld] vsll __builtin_vec_vsld (vsll, vull); VSLD VSLD_DEPR1 vull __builtin_vec_vsld (vull, vull); @@ -5856,7 +5860,7 @@ vuc __builtin_vec_vsrab (vuc, vuc); VSRAB VSRAB_DEPR2 -[VEC_VSRAD, SKIP, __builtin_vec_vsrad, _ARCH_PWR8] +[VEC_VSRAD, SKIP, __builtin_vec_vsrad] vsll __builtin_vec_vsrad (vsll, vull); VSRAD VSRAD_DEPR1 vull __builtin_vec_vsrad (vull, vull); @@ -5880,7 +5884,7 @@ vuc __builtin_vec_vsrb (vuc, vuc); VSRB VSRB_DEPR2 -[VEC_VSRD, SKIP, __builtin_vec_vsrd, _ARCH_PWR8] +[VEC_VSRD, SKIP, __builtin_vec_vsrd] vsll __builtin_vec_vsrd (vsll, vull); VSRD VSRD_DEPR1 vull __builtin_vec_vsrd (vull, vull); @@ -5898,27 +5902,27 @@ vui __builtin_vec_vsrw (vui, vui); VSRW VSRW_DEPR2 -[VEC_VSTDCDP, scalar_test_data_class_dp, __builtin_vec_scalar_test_data_class_dp, _ARCH_PWR9] +[VEC_VSTDCDP, scalar_test_data_class_dp, __builtin_vec_scalar_test_data_class_dp] unsigned int __builtin_vec_scalar_test_data_class_dp (double, const int); VSTDCDP VSTDCDP_DEPR1 -[VEC_VSTDCNDP, scalar_test_neg_dp, __builtin_vec_scalar_test_neg_dp, _ARCH_PWR9] +[VEC_VSTDCNDP, scalar_test_neg_dp, __builtin_vec_scalar_test_neg_dp] unsigned int __builtin_vec_scalar_test_neg_dp (double); VSTDCNDP VSTDCNDP_DEPR1 -[VEC_VSTDCNQP, scalar_test_neg_qp, __builtin_vec_scalar_test_neg_qp, _ARCH_PWR9] +[VEC_VSTDCNQP, scalar_test_neg_qp, __builtin_vec_scalar_test_neg_qp] unsigned int __builtin_vec_scalar_test_neg_qp (_Float128); VSTDCNQP VSTDCNQP_DEPR1 -[VEC_VSTDCNSP, scalar_test_neg_sp, __builtin_vec_scalar_test_neg_sp, _ARCH_PWR9] +[VEC_VSTDCNSP, scalar_test_neg_sp, __builtin_vec_scalar_test_neg_sp] unsigned int __builtin_vec_scalar_test_neg_sp (float); VSTDCNSP VSTDCNSP_DEPR1 -[VEC_VSTDCQP, scalar_test_data_class_qp, __builtin_vec_scalar_test_data_class_qp, _ARCH_PWR9] +[VEC_VSTDCQP, scalar_test_data_class_qp, __builtin_vec_scalar_test_data_class_qp] unsigned int __builtin_vec_scalar_test_data_class_qp (_Float128, const int); VSTDCQP VSTDCQP_DEPR1 -[VEC_VSTDCSP, scalar_test_data_class_sp, __builtin_vec_scalar_test_data_class_sp, _ARCH_PWR9] +[VEC_VSTDCSP, scalar_test_data_class_sp, __builtin_vec_scalar_test_data_class_sp] unsigned int __builtin_vec_scalar_test_data_class_sp (float, const int); VSTDCSP VSTDCSP_DEPR1 @@ -5928,13 +5932,13 @@ vuq __builtin_vec_vsubcuq (vuq, vuq); VSUBCUQ VSUBCUQ_DEPR2 -[VEC_VSUBECUQ, vec_vsubecuq, __builtin_vec_vsubecuq, ARCH_PWR8] +[VEC_VSUBECUQ, vec_vsubecuq, __builtin_vec_vsubecuq] vsq __builtin_vec_vsubecuq (vsq, vsq, vsq); VSUBECUQ VSUBECUQ_DEPR1 vuq __builtin_vec_vsubecuq (vuq, vuq, vuq); VSUBECUQ VSUBECUQ_DEPR2 -[VEC_VSUBEUQM, vec_vsubeuqm, __builtin_vec_vsubeuqm, _ARCH_PWR8] +[VEC_VSUBEUQM, vec_vsubeuqm, __builtin_vec_vsubeuqm] vsq __builtin_vec_vsubeuqm (vsq, vsq, vsq); VSUBEUQM VSUBEUQM_DEPR1 vuq __builtin_vec_vsubeuqm (vuq, vuq, vuq); @@ -6004,7 +6008,7 @@ vuc __builtin_vec_vsububs (vuc, vbc); VSUBUBS VSUBUBS_DEPR8 -[VEC_VSUBUDM, vec_vsubudm, __builtin_vec_vsubudm, _ARCH_PWR8] +[VEC_VSUBUDM, vec_vsubudm, __builtin_vec_vsubudm] vsll __builtin_vec_vsubudm (vbll, vsll); VSUBUDM VSUBUDM_DEPR1 vsll __builtin_vec_vsubudm (vsll, vbll); @@ -6048,7 +6052,7 @@ vus __builtin_vec_vsubuhs (vus, vbs); VSUBUHS VSUBUHS_DEPR5 -[VEC_VSUBUQM, vec_vsubuqm, __builtin_vec_vsubuqm, _ARCH_PWR8] +[VEC_VSUBUQM, vec_vsubuqm, 
__builtin_vec_vsubuqm] vsq __builtin_vec_vsubuqm (vsq, vsq); VSUBUQM VSUBUQM_DEPR1 vuq __builtin_vec_vsubuqm (vuq, vuq); @@ -6096,11 +6100,11 @@ vui __builtin_vec_vsum4ubs (vuc, vui); VSUM4UBS VSUM4UBS_DEPR1 -[VEC_VTDCDP, vec_test_data_class_dp, __builtin_vec_test_data_class_dp, _ARCH_PWR9] +[VEC_VTDCDP, vec_test_data_class_dp, __builtin_vec_test_data_class_dp] vbll __builtin_vec_test_data_class_dp (vd, const int); VTDCDP VTDCDP_DEPR1 -[VEC_VTDCSP, vec_test_data_class_sp, __builtin_vec_test_data_class_sp, _ARCH_PWR9] +[VEC_VTDCSP, vec_test_data_class_sp, __builtin_vec_test_data_class_sp] vbi __builtin_vec_test_data_class_sp (vf, const int); VTDCSP VTDCSP_DEPR1 @@ -6138,7 +6142,7 @@ vbi __builtin_vec_vupkhsh (vbs); VUPKHSH VUPKHSH_DEPR2 -[VEC_VUPKHSW, vec_vupkhsw, __builtin_vec_vupkhsw, _ARCH_PWR8] +[VEC_VUPKHSW, vec_vupkhsw, __builtin_vec_vupkhsw] vsll __builtin_vec_vupkhsw (vsi); VUPKHSW VUPKHSW_DEPR1 vbll __builtin_vec_vupkhsw (vbi); @@ -6162,7 +6166,7 @@ vbi __builtin_vec_vupklsh (vbs); VUPKLSH VUPKLSH_DEPR2 -[VEC_VUPKLSW, vec_vupklsw, __builtin_vec_vupklsw, _ARCH_PWR8] +[VEC_VUPKLSW, vec_vupklsw, __builtin_vec_vupklsw] vsll __builtin_vec_vupklsw (vsi); VUPKLSW VUPKLSW_DEPR1 vbll __builtin_vec_vupklsw (vbi); -- cgit v1.1 From ca902055d056773bd0ca80f68bca4b20ad0e183f Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Fri, 21 Jan 2022 10:57:43 +0100 Subject: [nvptx] Fix reduction lock When I run the libgomp test-case reduction-cplx-dbl.c on an nvptx accelerator (T400, driver version 470.86), I run into: ... FAIL: libgomp.oacc-c/../libgomp.oacc-c-c++-common/reduction-cplx-dbl.c \ -DACC_DEVICE_TYPE_nvidia=1 -DACC_MEM_SHARED=0 -foffload=nvptx-none -O0 \ execution test FAIL: libgomp.oacc-c/../libgomp.oacc-c-c++-common/reduction-cplx-dbl.c \ -DACC_DEVICE_TYPE_nvidia=1 -DACC_MEM_SHARED=0 -foffload=nvptx-none -O2 \ execution test ... The problem is in this code generated for a gang reduction: ... $L39: atom.global.cas.b32 %r59, [__reduction_lock], 0, 1; setp.ne.u32 %r116, %r59, 0; @%r116 bra $L39; ld.f64 %r60, [%r44]; ld.f64 %r61, [%r44+8]; ld.f64 %r64, [%r44]; ld.f64 %r65, [%r44+8]; add.f64 %r117, %r64, %r22; add.f64 %r118, %r65, %r41; st.f64 [%r44], %r117; st.f64 [%r44+8], %r118; atom.global.cas.b32 %r119, [__reduction_lock], 1, 0; ... which is taking and releasing a lock, but missing the appropriate barriers to protect the loads and store inside the lock. Fix this by adding membar.gl barriers. Likewise, add membar.cta barriers if we protect shared memory loads and stores (even though the worker-partitioning part of the test-case is not failing). Tested on x86_64 with nvptx accelerator. gcc/ChangeLog: 2022-01-27 Tom de Vries * config/nvptx/nvptx.cc (enum nvptx_builtins): Add NVPTX_BUILTIN_MEMBAR_GL and NVPTX_BUILTIN_MEMBAR_CTA. (VOID): New macro. (nvptx_init_builtins): Add MEMBAR_GL and MEMBAR_CTA. (nvptx_expand_builtin): Handle NVPTX_BUILTIN_MEMBAR_GL and NVPTX_BUILTIN_MEMBAR_CTA. (nvptx_lockfull_update): Add level parameter. Emit barriers. (nvptx_reduction_update, nvptx_goacc_reduction_fini): Update call to nvptx_lockfull_update. * config/nvptx/nvptx.md (define_c_enum "unspecv"): Add UNSPECV_MEMBAR_GL. (define_expand "nvptx_membar_gl"): New expand. (define_insn "*nvptx_membar_gl"): New insn. 
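As an illustration only (a minimal C sketch, not part of the patch): the two
new membars act like the fences below, which keep the protected load and
store of the reduction variable inside the lock-protected region.

...
/* Sketch with GCC's __atomic builtins; the names here are made up and the
   real sequence is emitted as ptx by nvptx_lockfull_update.  */
static int reduction_lock;

static void
locked_add (double *accum, double contrib)
{
  while (__atomic_exchange_n (&reduction_lock, 1, __ATOMIC_RELAXED))
    ;                                        /* spin until the lock is ours */
  __atomic_thread_fence (__ATOMIC_ACQUIRE);  /* like the new pre-barrier */
  *accum += contrib;                         /* the protected update */
  __atomic_thread_fence (__ATOMIC_RELEASE);  /* like the new post-barrier */
  __atomic_store_n (&reduction_lock, 0, __ATOMIC_RELAXED);
}
...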
--- gcc/config/nvptx/nvptx.cc | 37 ++++++++++++++++++++++++++++++++----- gcc/config/nvptx/nvptx.md | 17 +++++++++++++++++ 2 files changed, 49 insertions(+), 5 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index db6a405..ceea4d3 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -5622,6 +5622,8 @@ enum nvptx_builtins NVPTX_BUILTIN_VECTOR_ADDR, NVPTX_BUILTIN_CMP_SWAP, NVPTX_BUILTIN_CMP_SWAPLL, + NVPTX_BUILTIN_MEMBAR_GL, + NVPTX_BUILTIN_MEMBAR_CTA, NVPTX_BUILTIN_MAX }; @@ -5652,6 +5654,7 @@ nvptx_init_builtins (void) #define UINT unsigned_type_node #define LLUINT long_long_unsigned_type_node #define PTRVOID ptr_type_node +#define VOID void_type_node DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE)); DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE)); @@ -5661,6 +5664,8 @@ nvptx_init_builtins (void) (PTRVOID, ST, UINT, UINT, NULL_TREE)); DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE)); DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE)); + DEF (MEMBAR_GL, "membar_gl", (VOID, VOID, NULL_TREE)); + DEF (MEMBAR_CTA, "membar_cta", (VOID, VOID, NULL_TREE)); #undef DEF #undef ST @@ -5696,6 +5701,14 @@ nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget), case NVPTX_BUILTIN_CMP_SWAPLL: return nvptx_expand_cmp_swap (exp, target, mode, ignore); + case NVPTX_BUILTIN_MEMBAR_GL: + emit_insn (gen_nvptx_membar_gl ()); + return NULL_RTX; + + case NVPTX_BUILTIN_MEMBAR_CTA: + emit_insn (gen_nvptx_membar_cta ()); + return NULL_RTX; + default: gcc_unreachable (); } } @@ -6243,7 +6256,7 @@ nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi, static tree nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi, - tree ptr, tree var, tree_code op) + tree ptr, tree var, tree_code op, int level) { tree var_type = TREE_TYPE (var); tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true); @@ -6295,8 +6308,17 @@ nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi, lock_loop->any_estimate = true; add_loop (lock_loop, entry_bb->loop_father); - /* Build and insert the reduction calculation. */ + /* Build the pre-barrier. */ gimple_seq red_seq = NULL; + enum nvptx_builtins barrier_builtin + = (level == GOMP_DIM_GANG + ? NVPTX_BUILTIN_MEMBAR_GL + : NVPTX_BUILTIN_MEMBAR_CTA); + tree barrier_fn = nvptx_builtin_decl (barrier_builtin, true); + tree barrier_expr = build_call_expr_loc (loc, barrier_fn, 0); + gimplify_stmt (&barrier_expr, &red_seq); + + /* Build the reduction calculation. */ tree acc_in = make_ssa_name (var_type); tree ref_in = build_simple_mem_ref (ptr); TREE_THIS_VOLATILE (ref_in) = 1; @@ -6310,6 +6332,11 @@ nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi, TREE_THIS_VOLATILE (ref_out) = 1; gimplify_assign (ref_out, acc_out, &red_seq); + /* Build the post-barrier. */ + barrier_expr = build_call_expr_loc (loc, barrier_fn, 0); + gimplify_stmt (&barrier_expr, &red_seq); + + /* Insert the reduction calculation. */ gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT); /* Build & insert the unlock sequence. 
*/ @@ -6330,7 +6357,7 @@ nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi, static tree nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi, - tree ptr, tree var, tree_code op) + tree ptr, tree var, tree_code op, int level) { tree type = TREE_TYPE (var); tree size = TYPE_SIZE (type); @@ -6339,7 +6366,7 @@ nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi, || size == TYPE_SIZE (long_long_unsigned_type_node)) return nvptx_lockless_update (loc, gsi, ptr, var, op); else - return nvptx_lockfull_update (loc, gsi, ptr, var, op); + return nvptx_lockfull_update (loc, gsi, ptr, var, op, level); } /* NVPTX implementation of GOACC_REDUCTION_SETUP. */ @@ -6531,7 +6558,7 @@ nvptx_goacc_reduction_fini (gcall *call, offload_attrs *oa) gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT); seq = NULL; r = nvptx_reduction_update (gimple_location (call), &gsi, - accum, var, op); + accum, var, op, level); } } diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index 5cf190a..773ae8f 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -58,6 +58,7 @@ UNSPECV_BARSYNC UNSPECV_MEMBAR UNSPECV_MEMBAR_CTA + UNSPECV_MEMBAR_GL UNSPECV_DIM_POS UNSPECV_FORK @@ -1932,6 +1933,22 @@ "\\tmembar.cta;" [(set_attr "predicable" "false")]) +(define_expand "nvptx_membar_gl" + [(set (match_dup 0) + (unspec_volatile:BLK [(match_dup 0)] UNSPECV_MEMBAR_GL))] + "" +{ + operands[0] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode)); + MEM_VOLATILE_P (operands[0]) = 1; +}) + +(define_insn "*nvptx_membar_gl" + [(set (match_operand:BLK 0 "" "") + (unspec_volatile:BLK [(match_dup 0)] UNSPECV_MEMBAR_GL))] + "" + "\\tmembar.gl;" + [(set_attr "predicable" "false")]) + (define_insn "nvptx_nounroll" [(unspec_volatile [(const_int 0)] UNSPECV_NOUNROLL)] "" -- cgit v1.1 From e0451f93d9faa13495132f4e246e9bef30b51417 Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Fri, 21 Jan 2022 21:46:05 +0100 Subject: [nvptx] Add some support for .local atomics The ptx insn atom doesn't support local memory. In case of doing an atomic operation on local memory, we run into: ... operation not supported on global/shared address space ... This is the cuGetErrorString message for CUDA_ERROR_INVALID_ADDRESS_SPACE. The message is somewhat confusing given that actually the operation is not supported on local address space. Fix this by falling back on a non-atomic version when detecting a frame-related memory operand. This only solves some cases that are detected at compile-time. It does however fix the openacc private-atomic-* test-cases. Tested on x86_64 with nvptx accelerator. gcc/ChangeLog: 2022-01-27 Tom de Vries * config/nvptx/nvptx.md (define_insn "atomic_compare_and_swap_1") (define_insn "atomic_exchange") (define_insn "atomic_fetch_add") (define_insn "atomic_fetch_addsf") (define_insn "atomic_fetch_"): Output non-atomic version if memory operands is frame-relative. gcc/testsuite/ChangeLog: 2022-01-31 Tom de Vries * gcc.target/nvptx/stack-atomics-run.c: New test. libgomp/ChangeLog: 2022-01-27 Tom de Vries * testsuite/libgomp.oacc-c-c++-common/private-atomic-1.c: Remove PR83812 workaround. * testsuite/libgomp.oacc-fortran/private-atomic-1-vector.f90: Same. * testsuite/libgomp.oacc-fortran/private-atomic-1-worker.f90: Same. 
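An illustrative test shape (a sketch only, not the actual new
gcc.target/nvptx/stack-atomics-run.c test): the compile-time detectable case
is an atomic operation whose memory operand is a frame-related, i.e. .local,
object, for which the new non-atomic fallback is emitted instead of "atom".

...
int
main (void)
{
  int v = 40;                     /* stack object: a .local memory operand */
  __atomic_fetch_add (&v, 2, __ATOMIC_RELAXED);
  if (v != 42)
    __builtin_abort ();
  return 0;
}
...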
--- gcc/config/nvptx/nvptx.md | 82 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 80 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index 773ae8f..9cbbd95 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -1790,11 +1790,28 @@ (unspec_volatile:SDIM [(const_int 0)] UNSPECV_CAS))] "" { + struct address_info info; + decompose_mem_address (&info, operands[1]); + if (info.base != NULL && REG_P (*info.base) + && REGNO_PTR_FRAME_P (REGNO (*info.base))) + { + output_asm_insn ("{", NULL); + output_asm_insn ("\\t" ".reg.pred" "\\t" "%%eq_p;", NULL); + output_asm_insn ("\\t" ".reg%t0" "\\t" "%%val;", operands); + output_asm_insn ("\\t" "ld%A1%t0" "\\t" "%%val,%1;", operands); + output_asm_insn ("\\t" "setp.eq%t0" "\\t" "%%eq_p, %%val, %2;", + operands); + output_asm_insn ("@%%eq_p\\t" "st%A1%t0" "\\t" "%1,%3;", operands); + output_asm_insn ("\\t" "mov%t0" "\\t" "%0,%%val;", operands); + output_asm_insn ("}", NULL); + return ""; + } const char *t - = "%.\\tatom%A1.cas.b%T0\\t%0, %1, %2, %3;"; + = "\\tatom%A1.cas.b%T0\\t%0, %1, %2, %3;"; return nvptx_output_atomic_insn (t, operands, 1, 4); } - [(set_attr "atomic" "true")]) + [(set_attr "atomic" "true") + (set_attr "predicable" "false")]) (define_insn "atomic_exchange" [(set (match_operand:SDIM 0 "nvptx_register_operand" "=R") ;; output @@ -1806,6 +1823,19 @@ (match_operand:SDIM 2 "nvptx_nonmemory_operand" "Ri"))] ;; input "" { + struct address_info info; + decompose_mem_address (&info, operands[1]); + if (info.base != NULL && REG_P (*info.base) + && REGNO_PTR_FRAME_P (REGNO (*info.base))) + { + output_asm_insn ("{", NULL); + output_asm_insn ("\\t" ".reg%t0" "\\t" "%%val;", operands); + output_asm_insn ("%.\\t" "ld%A1%t0" "\\t" "%%val,%1;", operands); + output_asm_insn ("%.\\t" "st%A1%t0" "\\t" "%1,%2;", operands); + output_asm_insn ("%.\\t" "mov%t0" "\\t" "%0,%%val;", operands); + output_asm_insn ("}", NULL); + return ""; + } const char *t = "%.\tatom%A1.exch.b%T0\t%0, %1, %2;"; return nvptx_output_atomic_insn (t, operands, 1, 3); @@ -1823,6 +1853,22 @@ (match_dup 1))] "" { + struct address_info info; + decompose_mem_address (&info, operands[1]); + if (info.base != NULL && REG_P (*info.base) + && REGNO_PTR_FRAME_P (REGNO (*info.base))) + { + output_asm_insn ("{", NULL); + output_asm_insn ("\\t" ".reg%t0" "\\t" "%%val;", operands); + output_asm_insn ("\\t" ".reg%t0" "\\t" "%%update;", operands); + output_asm_insn ("%.\\t" "ld%A1%t0" "\\t" "%%val,%1;", operands); + output_asm_insn ("%.\\t" "add%t0" "\\t" "%%update,%%val,%2;", + operands); + output_asm_insn ("%.\\t" "st%A1%t0" "\\t" "%1,%%update;", operands); + output_asm_insn ("%.\\t" "mov%t0" "\\t" "%0,%%val;", operands); + output_asm_insn ("}", NULL); + return ""; + } const char *t = "%.\\tatom%A1.add%t0\\t%0, %1, %2;"; return nvptx_output_atomic_insn (t, operands, 1, 3); @@ -1840,6 +1886,22 @@ (match_dup 1))] "" { + struct address_info info; + decompose_mem_address (&info, operands[1]); + if (info.base != NULL && REG_P (*info.base) + && REGNO_PTR_FRAME_P (REGNO (*info.base))) + { + output_asm_insn ("{", NULL); + output_asm_insn ("\\t" ".reg%t0" "\\t" "%%val;", operands); + output_asm_insn ("\\t" ".reg%t0" "\\t" "%%update;", operands); + output_asm_insn ("%.\\t" "ld%A1%t0" "\\t" "%%val,%1;", operands); + output_asm_insn ("%.\\t" "add%t0" "\\t" "%%update,%%val,%2;", + operands); + output_asm_insn ("%.\\t" "st%A1%t0" "\\t" "%1,%%update;", operands); + output_asm_insn ("%.\\t" "mov%t0" "\\t" 
"%0,%%val;", operands); + output_asm_insn ("}", NULL); + return ""; + } const char *t = "%.\\tatom%A1.add%t0\\t%0, %1, %2;"; return nvptx_output_atomic_insn (t, operands, 1, 3); @@ -1860,6 +1922,22 @@ (match_dup 1))] "mode == SImode || TARGET_SM35" { + struct address_info info; + decompose_mem_address (&info, operands[1]); + if (info.base != NULL && REG_P (*info.base) + && REGNO_PTR_FRAME_P (REGNO (*info.base))) + { + output_asm_insn ("{", NULL); + output_asm_insn ("\\t" ".reg.b%T0" "\\t" "%%val;", operands); + output_asm_insn ("\\t" ".reg.b%T0" "\\t" "%%update;", operands); + output_asm_insn ("%.\\t" "ld%A1%t0" "\\t" "%%val,%1;", operands); + output_asm_insn ("%.\\t" ".b%T0" "\\t" "%%update,%%val,%2;", + operands); + output_asm_insn ("%.\\t" "st%A1%t0" "\\t" "%1,%%update;", operands); + output_asm_insn ("%.\\t" "mov%t0" "\\t" "%0,%%val;", operands); + output_asm_insn ("}", NULL); + return ""; + } const char *t = "%.\\tatom%A1.b%T0.\\t%0, %1, %2;"; return nvptx_output_atomic_insn (t, operands, 1, 3); -- cgit v1.1 From 456de10c549379b74d4858f00d4b8817035a73fc Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Sun, 23 Jan 2022 06:42:24 +0100 Subject: [nvptx] Handle nop in prevent_branch_around_nothing When running libgomp test-case reduction-7.c on an nvptx accelerator (T400, driver version 470.86) and GOMP_NVPTX_JIT=-O0, I run into: ... reduction-7.exe:reduction-7.c:312: v_p_2: \ Assertion `out[j * 32 + i] == (i + j) * 2' failed. FAIL: libgomp.oacc-c/../libgomp.oacc-c-c++-common/reduction-7.c \ -DACC_DEVICE_TYPE_nvidia=1 -DACC_MEM_SHARED=0 -foffload=nvptx-none \ -O0 execution test ... During investigation I found ptx code like this: ... @ %r163 bra $L262; $L262: ... There's a known problem with executing this type of code, and a workaround is in place to address this: prevent_branch_around_nothing. The workaround does not trigger though because it doesn't handle the nop insn. Fix this by handling the nop insn in prevent_branch_around_nothing. Tested libgomp on x86_64 with nvptx accelerator. gcc/ChangeLog: 2022-01-27 Tom de Vries PR target/100428 * config/nvptx/nvptx.cc (prevent_branch_around_nothing): Handle nop insn. --- gcc/config/nvptx/nvptx.cc | 1 + 1 file changed, 1 insertion(+) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index ceea4d3..262e8f9 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -5103,6 +5103,7 @@ prevent_branch_around_nothing (void) case CODE_FOR_nvptx_forked: case CODE_FOR_nvptx_joining: case CODE_FOR_nvptx_join: + case CODE_FOR_nop: continue; default: seen_label = NULL; -- cgit v1.1 From 57f971f99209cc950d7e706b7b52f4c9ef1d10b0 Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Wed, 26 Jan 2022 14:16:42 +0100 Subject: [nvptx] Update bar.sync for ptx isa 6.0 In ptx isa 6.0, a new barrier instruction was added, and bar.sync was redefined as barrier.sync.aligned. The aligned modifier indicates that all threads in a CTA will execute the same barrier instruction. The seems fine for a form "bar.sync 0". But a "bar.sync %rx,64" (as used for vector length > 32) may execute a diffferent barrier depending on the value of %rx, so we can't assume it's aligned. Fix this by using "barrier.sync %rx,64" instead. Tested on x86_64 with nvptx accelerator. gcc/ChangeLog: 2022-01-27 Tom de Vries * config/nvptx/nvptx-opts.h (enum ptx_version): Add PTX_VERSION_6_0. * config/nvptx/nvptx.h (TARGET_PTX_6_0): New macro. * config/nvptx/nvptx.md (define_insn "nvptx_barsync"): Use barrier insn for TARGET_PTX_6_0. 
--- gcc/config/nvptx/nvptx-opts.h | 1 + gcc/config/nvptx/nvptx.h | 1 + gcc/config/nvptx/nvptx.md | 8 ++++++-- 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx-opts.h b/gcc/config/nvptx/nvptx-opts.h index daae72f..c754a51 100644 --- a/gcc/config/nvptx/nvptx-opts.h +++ b/gcc/config/nvptx/nvptx-opts.h @@ -32,6 +32,7 @@ enum ptx_isa enum ptx_version { PTX_VERSION_3_1, + PTX_VERSION_6_0, PTX_VERSION_6_3, PTX_VERSION_7_0 }; diff --git a/gcc/config/nvptx/nvptx.h b/gcc/config/nvptx/nvptx.h index 9fda2f0..065d7aa 100644 --- a/gcc/config/nvptx/nvptx.h +++ b/gcc/config/nvptx/nvptx.h @@ -91,6 +91,7 @@ #define TARGET_SM75 (ptx_isa_option >= PTX_ISA_SM75) #define TARGET_SM80 (ptx_isa_option >= PTX_ISA_SM80) +#define TARGET_PTX_6_0 (ptx_version_option >= PTX_VERSION_6_0) #define TARGET_PTX_6_3 (ptx_version_option >= PTX_VERSION_6_3) #define TARGET_PTX_7_0 (ptx_version_option >= PTX_VERSION_7_0) diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index 9cbbd95..b391165 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -1968,9 +1968,13 @@ "" { if (INTVAL (operands[1]) == 0) - return "\\tbar.sync\\t%0;"; + return (TARGET_PTX_6_0 + ? "\\tbarrier.sync.aligned\\t%0;" + : "\\tbar.sync\\t%0;"); else - return "\\tbar.sync\\t%0, %1;"; + return (TARGET_PTX_6_0 + ? "\\tbarrier.sync\\t%0, %1;" + : "\\tbar.sync\\t%0, %1;"); } [(set_attr "predicable" "false")]) -- cgit v1.1 From 8ff0669f6d1d6126b7c010da02fa6532abb5e1ca Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Wed, 26 Jan 2022 14:17:40 +0100 Subject: [nvptx] Update default ptx isa to 6.3 With the following example, minimized from parallel-dims.c: ... int main (void) { int vectors_max = -1; #pragma acc parallel num_gangs (1) num_workers (1) copy (vectors_max) { for (int i = 0; i < 2; i++) for (int j = 0; j < 2; j++) #pragma acc loop vector reduction (max: vectors_max) for (int k = 0; k < 32; k++) vectors_max = k; } if (vectors_max != 31) __builtin_abort (); return 0; } ... I run into (T400, driver version 470.94): ... FAIL: libgomp.oacc-c/../libgomp.oacc-c-c++-common/parallel-dims.c \ -DACC_DEVICE_TYPE_nvidia=1 -DACC_MEM_SHARED=0 -foffload=nvptx-none -O2 \ execution test ... The FAIL does not happen with GOMP_NVPTX_JIT=-O0. The problem seems to be that the shfl insns for the vector reduction are not executed uniformly by the warp. Enforcing this by using shfl.sync fixes the problem. Fix this by setting the ptx isa to 6.3 by default, which allows the use of shfl.sync. Tested on x86_64 with nvptx accelerator. gcc/ChangeLog: 2022-01-27 Tom de Vries * config/nvptx/nvptx.opt (mptx): Set to PTX_VERSION_6_3 by default. --- gcc/config/nvptx/nvptx.opt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt index 6514dd3..6e12b1f 100644 --- a/gcc/config/nvptx/nvptx.opt +++ b/gcc/config/nvptx/nvptx.opt @@ -89,5 +89,5 @@ EnumValue Enum(ptx_version) String(7.0) Value(PTX_VERSION_7_0) mptx= -Target RejectNegative ToLower Joined Enum(ptx_version) Var(ptx_version_option) Init(PTX_VERSION_3_1) +Target RejectNegative ToLower Joined Enum(ptx_version) Var(ptx_version_option) Init(PTX_VERSION_6_3) Specify the version of the ptx version to use. -- cgit v1.1 From bba61d403d05202deb698b352a4faef3feb1f04d Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Thu, 27 Jan 2022 15:03:59 +0100 Subject: [nvptx] Add bar.warp.sync On a GT 1030 (sm_61), with driver version 470.94 I run into: ... 
FAIL: libgomp.oacc-c/../libgomp.oacc-c-c++-common/parallel-dims.c \ -DACC_DEVICE_TYPE_nvidia=1 -DACC_MEM_SHARED=0 -foffload=nvptx-none \ -O2 execution test ... which minimizes to the same test-case as listed in commit "[nvptx] Update default ptx isa to 6.3". The first divergent branch looks like: ... { .reg .u32 %x; mov.u32 %x,%tid.x; setp.ne.u32 %r59,%x,0; } @ %r59 bra $L15; mov.u64 %r48,%ar0; mov.u32 %r22,2; ld.u64 %r53,[%r48]; mov.u32 %r55,%r22; mov.u32 %r54,1; $L15: ... and when inspecting the generated SASS, the branch is not setup as a divergent branch, but instead as a regular branch. This causes us to execute a shfl.sync insn in divergent mode, which is likely to cause trouble given a remark in the ptx isa version 6.3, which mentions that for .target sm_6x or below, all threads must excute the same shfl.sync instruction in convergence. Fix this by placing a "bar.warp.sync 0xffffffff" at the desired convergence point (in the example above, after $L15). Tested on x86_64 with nvptx accelerator. gcc/ChangeLog: 2022-01-31 Tom de Vries * config/nvptx/nvptx.cc (nvptx_single): Use nvptx_warpsync. * config/nvptx/nvptx.md (define_c_enum "unspecv"): Add UNSPECV_WARPSYNC. (define_insn "nvptx_warpsync"): New define_insn. --- gcc/config/nvptx/nvptx.cc | 7 +++++++ gcc/config/nvptx/nvptx.md | 7 +++++++ 2 files changed, 14 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index 262e8f9..1b91990 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -4598,6 +4598,7 @@ nvptx_single (unsigned mask, basic_block from, basic_block to) rtx_insn *neuter_start = NULL; rtx_insn *worker_label = NULL, *vector_label = NULL; rtx_insn *worker_jump = NULL, *vector_jump = NULL; + rtx_insn *warp_sync = NULL; for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++) if (GOMP_DIM_MASK (mode) & skip_mask) { @@ -4630,11 +4631,15 @@ nvptx_single (unsigned mask, basic_block from, basic_block to) if (tail_branch) { label_insn = emit_label_before (label, before); + if (TARGET_PTX_6_0 && mode == GOMP_DIM_VECTOR) + warp_sync = emit_insn_after (gen_nvptx_warpsync (), label_insn); before = label_insn; } else { label_insn = emit_label_after (label, tail); + if (TARGET_PTX_6_0 && mode == GOMP_DIM_VECTOR) + warp_sync = emit_insn_after (gen_nvptx_warpsync (), label_insn); if ((mode == GOMP_DIM_VECTOR || mode == GOMP_DIM_WORKER) && CALL_P (tail) && find_reg_note (tail, REG_NORETURN, NULL)) emit_insn_after (gen_exit (), label_insn); @@ -4702,6 +4707,8 @@ nvptx_single (unsigned mask, basic_block from, basic_block to) setp.ne.u32 %rcond,%rcondu32,0; */ rtx_insn *label = PREV_INSN (tail); + if (label == warp_sync) + label = PREV_INSN (label); gcc_assert (label && LABEL_P (label)); rtx tmp = gen_reg_rtx (BImode); emit_insn_before (gen_movbi (tmp, const0_rtx), diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index b391165..b4c7cd6 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -56,6 +56,7 @@ UNSPECV_CAS UNSPECV_XCHG UNSPECV_BARSYNC + UNSPECV_WARPSYNC UNSPECV_MEMBAR UNSPECV_MEMBAR_CTA UNSPECV_MEMBAR_GL @@ -1978,6 +1979,12 @@ } [(set_attr "predicable" "false")]) +(define_insn "nvptx_warpsync" + [(unspec_volatile [(const_int 0)] UNSPECV_WARPSYNC)] + "TARGET_PTX_6_0" + "\\tbar.warp.sync\\t0xffffffff;" + [(set_attr "predicable" "false")]) + (define_expand "memory_barrier" [(set (match_dup 0) (unspec_volatile:BLK [(match_dup 0)] UNSPECV_MEMBAR))] -- cgit v1.1 From f32f74c2e8cef5fe37af6d4e8d7e8f6b4c8ae9a8 Mon Sep 17 
00:00:00 2001 From: Tom de Vries Date: Fri, 28 Jan 2022 10:28:59 +0100 Subject: [nvptx] Add uniform_warp_check insn On a GT 1030, with driver version 470.94 and -mptx=3.1 I run into: ... FAIL: libgomp.oacc-c/../libgomp.oacc-c-c++-common/parallel-dims.c \ -DACC_DEVICE_TYPE_nvidia=1 -DACC_MEM_SHARED=0 -foffload=nvptx-none \ -O2 execution test ... which minimizes to the same test-case as listed in commit "[nvptx] Update default ptx isa to 6.3". The problem is again that the first diverging branch is not handled as such in SASS, which causes problems with a subsequent shfl insn, but given that we have -mptx=3.1 we can't use the bar.warp.sync insn. Given that the default is now -mptx=6.3, and consequently -mptx=3.1 is of a lesser importance, implement the next best thing: abort when detecting non-convergence using this insn: ... { .reg.b32 act; vote.ballot.b32 act,1; .reg.pred uni; setp.eq.b32 uni,act,0xffffffff; @ !uni trap; @ !uni exit; } ... Interestingly, the effect of this is that rather than aborting, the test-case now passes. Tested on x86_64 with nvptx accelerator. gcc/ChangeLog: 2022-01-31 Tom de Vries * config/nvptx/nvptx.cc (nvptx_single): Use nvptx_uniform_warp_check. * config/nvptx/nvptx.md (define_c_enum "unspecv"): Add UNSPECV_UNIFORM_WARP_CHECK. (define_insn "nvptx_uniform_warp_check"): New define_insn. --- gcc/config/nvptx/nvptx.cc | 22 ++++++++++++++++++---- gcc/config/nvptx/nvptx.md | 18 ++++++++++++++++++ 2 files changed, 36 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index 1b91990..b3bb97c 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -4631,15 +4631,29 @@ nvptx_single (unsigned mask, basic_block from, basic_block to) if (tail_branch) { label_insn = emit_label_before (label, before); - if (TARGET_PTX_6_0 && mode == GOMP_DIM_VECTOR) - warp_sync = emit_insn_after (gen_nvptx_warpsync (), label_insn); + if (mode == GOMP_DIM_VECTOR) + { + if (TARGET_PTX_6_0) + warp_sync = emit_insn_after (gen_nvptx_warpsync (), + label_insn); + else + warp_sync = emit_insn_after (gen_nvptx_uniform_warp_check (), + label_insn); + } before = label_insn; } else { label_insn = emit_label_after (label, tail); - if (TARGET_PTX_6_0 && mode == GOMP_DIM_VECTOR) - warp_sync = emit_insn_after (gen_nvptx_warpsync (), label_insn); + if (mode == GOMP_DIM_VECTOR) + { + if (TARGET_PTX_6_0) + warp_sync = emit_insn_after (gen_nvptx_warpsync (), + label_insn); + else + warp_sync = emit_insn_after (gen_nvptx_uniform_warp_check (), + label_insn); + } if ((mode == GOMP_DIM_VECTOR || mode == GOMP_DIM_WORKER) && CALL_P (tail) && find_reg_note (tail, REG_NORETURN, NULL)) emit_insn_after (gen_exit (), label_insn); diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index b4c7cd6..92768dd 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -57,6 +57,7 @@ UNSPECV_XCHG UNSPECV_BARSYNC UNSPECV_WARPSYNC + UNSPECV_UNIFORM_WARP_CHECK UNSPECV_MEMBAR UNSPECV_MEMBAR_CTA UNSPECV_MEMBAR_GL @@ -1985,6 +1986,23 @@ "\\tbar.warp.sync\\t0xffffffff;" [(set_attr "predicable" "false")]) +(define_insn "nvptx_uniform_warp_check" + [(unspec_volatile [(const_int 0)] UNSPECV_UNIFORM_WARP_CHECK)] + "" + { + output_asm_insn ("{", NULL); + output_asm_insn ("\\t" ".reg.b32" "\\t" "act;", NULL); + output_asm_insn ("\\t" "vote.ballot.b32" "\\t" "act,1;", NULL); + output_asm_insn ("\\t" ".reg.pred" "\\t" "uni;", NULL); + output_asm_insn ("\\t" "setp.eq.b32" "\\t" "uni,act,0xffffffff;", + NULL); + output_asm_insn 
("@ !uni\\t" "trap;", NULL); + output_asm_insn ("@ !uni\\t" "exit;", NULL); + output_asm_insn ("}", NULL); + return ""; + } + [(set_attr "predicable" "false")]) + (define_expand "memory_barrier" [(set (match_dup 0) (unspec_volatile:BLK [(match_dup 0)] UNSPECV_MEMBAR))] -- cgit v1.1 From fa882c3e3bf642e0ef30772e4b54a2851497db96 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Tue, 1 Feb 2022 20:22:14 +0100 Subject: rs6000: Fix up PCH on powerpc* [PR104323] As mentioned in the PR and as can be seen on: --- gcc/testsuite/gcc.dg/pch/pr104323-1.c.jj 2022-02-01 13:06:00.163192414 +0100 +++ gcc/testsuite/gcc.dg/pch/pr104323-1.c 2022-02-01 13:13:41.226712735 +0100 @@ -0,0 +1,16 @@ +/* PR target/104323 */ +/* { dg-require-effective-target powerpc_altivec_ok } */ +/* { dg-options "-maltivec" } */ + +#include "pr104323-1.h" + +__vector int a1 = { 100, 200, 300, 400 }; +__vector int a2 = { 500, 600, 700, 800 }; +__vector int r; + +int +main () +{ + r = vec_add (a1, a2); + return 0; +} --- gcc/testsuite/gcc.dg/pch/pr104323-1.hs.jj 2022-02-01 13:06:03.180149978 +0100 +++ gcc/testsuite/gcc.dg/pch/pr104323-1.hs 2022-02-01 13:12:30.175706620 +0100 @@ -0,0 +1,5 @@ +/* PR target/104323 */ +/* { dg-require-effective-target powerpc_altivec_ok } */ +/* { dg-options "-maltivec" } */ + +#include testcase which I'm not including into testsuite because for some reason the test fails on non-powerpc* targets (is done even on those and fails because of missing altivec.h etc.), PCH is broken on powerpc*-*-* since the new builtin generator has been introduced. The generator contains or emits comments like: /* #### Cannot mark this as a GC root because only pointer types can be marked as GTY((user)) and be GC roots. All trees in here are kept alive by other globals, so not a big deal. Alternatively, we could change the enum fields to ints and cast them in and out to avoid requiring a GTY((user)) designation, but that seems unnecessarily gross. */ Having the fntypes stored in other GC roots can work fine for GC, ggc_collect will then always mark them and so they won't disappear from the tables, but it definitely doesn't work for PCH, which when the arrays with fntype members aren't GTY marked means on PCH write we create copies of those FUNCTION_TYPEs and store in *.gch that the GC roots should be updated, but don't store that rs6000_builtin_info[?].fntype etc. should be updated. When PCH is read again, the blob is read at some other address, GC roots are updated, rs6000_builtin_info[?].fntype contains garbage pointers (GC freed pointers with random data, or random unrelated types or other trees). The following patch fixes that. It stops any user markings because that is totally unnecessary, just skips fields we don't need to mark and adds GTY(()) to the 2 array variables. We can get rid of all those global vars for the fn types, they can be now automatic vars. 
With the patch we get { &rs6000_instance_info[0].fntype, 1 * (RS6000_INST_MAX), sizeof (rs6000_instance_info[0]), >_ggc_mx_tree_node, >_pch_nx_tree_node }, { &rs6000_builtin_info[0].fntype, 1 * (RS6000_BIF_MAX), sizeof (rs6000_builtin_info[0]), >_ggc_mx_tree_node, >_pch_nx_tree_node }, as the new roots which is exactly what we want and significantly more compact than countless { &uv2di_ftype_pudi_usi, 1, sizeof (uv2di_ftype_pudi_usi), >_ggc_mx_tree_node, >_pch_nx_tree_node }, { &uv2di_ftype_lg_puv2di, 1, sizeof (uv2di_ftype_lg_puv2di), >_ggc_mx_tree_node, >_pch_nx_tree_node }, { &uv2di_ftype_lg_pudi, 1, sizeof (uv2di_ftype_lg_pudi), >_ggc_mx_tree_node, >_pch_nx_tree_node }, { &uv2di_ftype_di_puv2di, 1, sizeof (uv2di_ftype_di_puv2di), >_ggc_mx_tree_node, >_pch_nx_tree_node }, cases (822 of these instead of just those 4 shown). 2022-02-01 Jakub Jelinek PR target/104323 * config/rs6000/t-rs6000 (EXTRA_GTYPE_DEPS): Append rs6000-builtins.h rather than $(srcdir)/config/rs6000/rs6000-builtins.def. * config/rs6000/rs6000-gen-builtins.cc (write_decls): Don't use GTY((user)) for struct bifdata and struct ovlddata. Instead add GTY((skip(""))) to members with pointer and enum types that don't need to be tracked. Add GTY(()) to rs6000_builtin_info and rs6000_instance_info declarations. Don't emit gt_ggc_mx and gt_pch_nx declarations. (write_extern_fntype, write_fntype): Remove. (write_fntype_init): Emit the fntype vars as automatic vars instead of file scope ones. (write_header_file): Don't iterate with write_extern_fntype. (write_init_file): Don't iterate with write_fntype. Don't emit gt_ggc_mx and gt_pch_nx definitions. --- gcc/config/rs6000/rs6000-gen-builtins.cc | 109 ++++++------------------------- gcc/config/rs6000/t-rs6000 | 2 +- 2 files changed, 22 insertions(+), 89 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-gen-builtins.cc b/gcc/config/rs6000/rs6000-gen-builtins.cc index 6a0858a..629ead9 100644 --- a/gcc/config/rs6000/rs6000-gen-builtins.cc +++ b/gcc/config/rs6000/rs6000-gen-builtins.cc @@ -2255,20 +2255,20 @@ write_decls (void) fprintf (header_file, "};\n\n"); fprintf (header_file, "#define PPC_MAXRESTROPNDS 3\n"); - fprintf (header_file, "struct GTY((user)) bifdata\n"); + fprintf (header_file, "struct GTY(()) bifdata\n"); fprintf (header_file, "{\n"); - fprintf (header_file, " const char *bifname;\n"); - fprintf (header_file, " bif_enable enable;\n"); + fprintf (header_file, " const char *GTY((skip(\"\"))) bifname;\n"); + fprintf (header_file, " bif_enable GTY((skip(\"\"))) enable;\n"); fprintf (header_file, " tree fntype;\n"); - fprintf (header_file, " insn_code icode;\n"); + fprintf (header_file, " insn_code GTY((skip(\"\"))) icode;\n"); fprintf (header_file, " int nargs;\n"); fprintf (header_file, " int bifattrs;\n"); fprintf (header_file, " int restr_opnd[PPC_MAXRESTROPNDS];\n"); - fprintf (header_file, " restriction restr[PPC_MAXRESTROPNDS];\n"); + fprintf (header_file, " restriction GTY((skip(\"\"))) restr[PPC_MAXRESTROPNDS];\n"); fprintf (header_file, " int restr_val1[PPC_MAXRESTROPNDS];\n"); fprintf (header_file, " int restr_val2[PPC_MAXRESTROPNDS];\n"); - fprintf (header_file, " const char *attr_string;\n"); - fprintf (header_file, " rs6000_gen_builtins assoc_bif;\n"); + fprintf (header_file, " const char *GTY((skip(\"\"))) attr_string;\n"); + fprintf (header_file, " rs6000_gen_builtins GTY((skip(\"\"))) assoc_bif;\n"); fprintf (header_file, "};\n\n"); fprintf (header_file, "#define bif_init_bit\t\t(0x00000001)\n"); @@ -2343,21 +2343,15 @@ 
write_decls (void) "#define bif_is_ibmld(x)\t((x).bifattrs & bif_ibmld_bit)\n"); fprintf (header_file, "\n"); - /* #### Cannot mark this as a GC root because only pointer types can - be marked as GTY((user)) and be GC roots. All trees in here are - kept alive by other globals, so not a big deal. Alternatively, - we could change the enum fields to ints and cast them in and out - to avoid requiring a GTY((user)) designation, but that seems - unnecessarily gross. */ fprintf (header_file, - "extern bifdata rs6000_builtin_info[RS6000_BIF_MAX];\n\n"); + "extern GTY(()) bifdata rs6000_builtin_info[RS6000_BIF_MAX];\n\n"); - fprintf (header_file, "struct GTY((user)) ovlddata\n"); + fprintf (header_file, "struct GTY(()) ovlddata\n"); fprintf (header_file, "{\n"); - fprintf (header_file, " const char *bifname;\n"); - fprintf (header_file, " rs6000_gen_builtins bifid;\n"); + fprintf (header_file, " const char *GTY((skip(\"\"))) bifname;\n"); + fprintf (header_file, " rs6000_gen_builtins GTY((skip(\"\"))) bifid;\n"); fprintf (header_file, " tree fntype;\n"); - fprintf (header_file, " ovlddata *next;\n"); + fprintf (header_file, " ovlddata *GTY((skip(\"\"))) next;\n"); fprintf (header_file, "};\n\n"); fprintf (header_file, "struct ovldrecord\n"); @@ -2367,14 +2361,7 @@ write_decls (void) fprintf (header_file, "};\n\n"); fprintf (header_file, - "/* #### Cannot mark this as a GC root because only pointer\n" - " types can be marked as GTY((user)) and be GC roots. All\n" - " trees in here are kept alive by other globals, so not a big\n" - " deal. Alternatively, we could change the enum fields to ints\n" - " and cast them in and out to avoid requiring a GTY((user))\n" - " designation, but that seems unnecessarily gross. */\n"); - fprintf (header_file, - "extern ovlddata rs6000_instance_info[RS6000_INST_MAX];\n"); + "extern GTY(()) ovlddata rs6000_instance_info[RS6000_INST_MAX];\n"); fprintf (header_file, "extern ovldrecord rs6000_overload_info[];\n\n"); fprintf (header_file, "extern void rs6000_init_generated_builtins ();\n\n"); @@ -2383,33 +2370,6 @@ write_decls (void) fprintf (header_file, "extern tree rs6000_builtin_decl (unsigned, " "bool ATTRIBUTE_UNUSED);\n\n"); - fprintf (header_file, - "extern void gt_ggc_mx (bifdata *bd);\n"); - fprintf (header_file, - "extern void gt_pch_nx (bifdata *bd);\n"); - fprintf (header_file, - "extern void gt_pch_nx (bifdata *bd, gt_pointer_operator op, " - "void *cookie);\n"); - fprintf (header_file, - "extern void gt_ggc_mx (ovlddata *od);\n"); - fprintf (header_file, - "extern void gt_pch_nx (ovlddata *od);\n"); - fprintf (header_file, - "extern void gt_pch_nx (ovlddata *od, gt_pointer_operator op, " - "void *cookie);\n"); -} - -/* Callback functions used for generating trees for function types. */ -void -write_extern_fntype (char *str) -{ - fprintf (header_file, "extern GTY(()) tree %s;\n", str); -} - -void -write_fntype (char *str) -{ - fprintf (init_file, "tree %s;\n", str); } /* Comparator for bsearch on the type map. */ @@ -2452,12 +2412,17 @@ write_fntype_init (char *str) /* Avoid side effects of strtok on the original string by using a copy. 
*/ char *buf = strdup (str); + if (tf_found || dfp_found) + fprintf (init_file, " tree %s = NULL_TREE;\n", buf); + else + fprintf (init_file, " tree "); + if (tf_found) - fprintf (init_file, " if (float128_type_node)\n "); + fprintf (init_file, " if (float128_type_node)\n "); else if (dfp_found) - fprintf (init_file, " if (dfloat64_type_node)\n "); + fprintf (init_file, " if (dfloat64_type_node)\n "); - fprintf (init_file, " %s\n = build_function_type_list (", buf); + fprintf (init_file, "%s\n = build_function_type_list (", buf); tok = strtok (buf, "_"); write_type_node (tok, tf_found || dfp_found); tok = strtok (0, "_"); @@ -2491,8 +2456,6 @@ write_header_file (void) write_decls (); - /* Write function type list declarators to the header file. */ - rbt_inorder_callback (&fntype_rbt, fntype_rbt.rbt_root, write_extern_fntype); fprintf (header_file, "\n"); fprintf (header_file, "\n#endif\n"); @@ -2846,9 +2809,6 @@ write_init_file (void) write_bif_static_init (); write_ovld_static_init (); - rbt_inorder_callback (&fntype_rbt, fntype_rbt.rbt_root, write_fntype); - fprintf (init_file, "\n"); - fprintf (init_file, "void\n"); fprintf (init_file, "rs6000_init_generated_builtins ()\n"); fprintf (init_file, "{\n"); @@ -2868,33 +2828,6 @@ write_init_file (void) fprintf (init_file, "}\n\n"); - fprintf (init_file, - "void gt_ggc_mx (bifdata *bd)\n"); - fprintf (init_file, - "{\n gt_ggc_mx (bd->fntype);\n}\n\n"); - fprintf (init_file, - "void gt_pch_nx (bifdata *bd)\n"); - fprintf (init_file, - "{\n gt_pch_nx (bd->fntype);\n}\n\n"); - fprintf (init_file, - "void gt_pch_nx (bifdata *bd, gt_pointer_operator op, " - "void *cookie)\n"); - fprintf (init_file, - "{\n op(&(bd->fntype), NULL, cookie);\n}\n\n"); - fprintf (init_file, - "void gt_ggc_mx (ovlddata *od)\n"); - fprintf (init_file, - "{\n gt_ggc_mx (od->fntype);\n}\n\n"); - fprintf (init_file, - "void gt_pch_nx (ovlddata *od)\n"); - fprintf (init_file, - "{\n gt_pch_nx (od->fntype);\n}\n\n"); - fprintf (init_file, - "void gt_pch_nx (ovlddata *od, gt_pointer_operator op, " - "void *cookie)\n"); - fprintf (init_file, - "{\n op(&(od->fntype), NULL, cookie);\n}\n"); - return 1; } diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000 index 90079ce..1a460d9 100644 --- a/gcc/config/rs6000/t-rs6000 +++ b/gcc/config/rs6000/t-rs6000 @@ -21,7 +21,7 @@ TM_H += $(srcdir)/config/rs6000/rs6000-cpus.def TM_H += $(srcdir)/config/rs6000/rs6000-modes.h PASSES_EXTRA += $(srcdir)/config/rs6000/rs6000-passes.def -EXTRA_GTYPE_DEPS += $(srcdir)/config/rs6000/rs6000-builtins.def +EXTRA_GTYPE_DEPS += rs6000-builtins.h rs6000-pcrel-opt.o: $(srcdir)/config/rs6000/rs6000-pcrel-opt.cc $(COMPILE) $< -- cgit v1.1 From 8753b13a31c777cdab0265dae0b68534247908f7 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 28 Jan 2022 13:34:24 +0100 Subject: IBM Z: fix `section type conflict` with -mindirect-branch-table s390_code_end () puts indirect branch tables into separate sections and tries to switch back to wherever it was in the beginning by calling switch_to_section (current_function_section ()). First of all, this is unnecessary - the other backends don't do it. Furthermore, at this time there is no current function, but if the last processed function was cold, in_cold_section_p remains set. This causes targetm.asm_out.function_section () to call targetm.section_type_flags (), which in absence of current function decl classifies the section as SECTION_WRITE. This causes a section type conflict with the existing SECTION_CODE. 
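A hedged illustration of the shape involved (the reproducer proper is the new
gcc.target/s390/nobp-section-type-conflict.c test; the attribute and call
pattern below are assumptions, not its contents): the last function emitted
ends up in the cold section while -mindirect-branch-table asks for branch
tables to be emitted at code end.

...
/* Illustrative only.  */
static void noop (void) {}
void (*indirect_target) (void) = noop;

__attribute__ ((cold)) void
last_and_cold (void)
{
  indirect_target ();   /* indirect call, gets a branch-table entry */
}
...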
gcc/ChangeLog: * config/s390/s390.cc (s390_code_end): Do not switch back to code section. gcc/testsuite/ChangeLog: * gcc.target/s390/nobp-section-type-conflict.c: New test. --- gcc/config/s390/s390.cc | 1 - 1 file changed, 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc index 43c5c72..2db12d4 100644 --- a/gcc/config/s390/s390.cc +++ b/gcc/config/s390/s390.cc @@ -16809,7 +16809,6 @@ s390_code_end (void) assemble_name_raw (asm_out_file, label_start); fputs ("-.\n", asm_out_file); } - switch_to_section (current_function_section ()); } } } -- cgit v1.1 From 4c4d0af4c94ccf0cfa74c8b13b8ec1029f57cd63 Mon Sep 17 00:00:00 2001 From: Hans-Peter Nilsson Date: Wed, 2 Feb 2022 00:00:09 +0100 Subject: cris: Don't default to -mmul-bug-workaround This flips the default for the errata handling for an old version (TL;DR: workaround: no multiply instruction last on a cache-line). Newer versions of the CRIS cpu don't have that bug. While the impact of the workaround is very marginal (coremark: less than .05% larger, less than .0005% slower) it's an irritating pseudorandom factor when assessing the impact of other changes. Also, fix a wart requiring changes to more than TARGET_DEFAULT to flip the default. People building old kernels or operating systems to run on ETRAX 100 LX are advised to pass "-mmul-bug-workaround". gcc: * config/cris/cris.h (TARGET_DEFAULT): Don't include MASK_MUL_BUG. (MUL_BUG_ASM_DEFAULT): New macro. (MAYBE_AS_NO_MUL_BUG_ABORT): Define in terms of MUL_BUG_ASM_DEFAULT. * doc/invoke.texi (CRIS Options, -mmul-bug-workaround): Adjust accordingly. --- gcc/config/cris/cris.h | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/cris/cris.h b/gcc/config/cris/cris.h index b274e11..9245d78 100644 --- a/gcc/config/cris/cris.h +++ b/gcc/config/cris/cris.h @@ -153,7 +153,9 @@ extern int cris_cpu_version; #ifdef HAVE_AS_NO_MUL_BUG_ABORT_OPTION #define MAYBE_AS_NO_MUL_BUG_ABORT \ - "%{mno-mul-bug-workaround:-no-mul-bug-abort} " + "%{mno-mul-bug-workaround:-no-mul-bug-abort} " \ + "%{mmul-bug-workaround:-mul-bug-abort} " \ + "%{!mmul-bug-workaround:%{!mno-mul-bug-workaround:" MUL_BUG_ASM_DEFAULT "}} " #else #define MAYBE_AS_NO_MUL_BUG_ABORT #endif @@ -255,15 +257,26 @@ extern int cris_cpu_version; (MASK_SIDE_EFFECT_PREFIXES + MASK_STACK_ALIGN \ + MASK_CONST_ALIGN + MASK_DATA_ALIGN \ + MASK_ALIGN_BY_32 \ - + MASK_PROLOGUE_EPILOGUE + MASK_MUL_BUG) + + MASK_PROLOGUE_EPILOGUE) # else /* 0 */ # define TARGET_DEFAULT \ (MASK_SIDE_EFFECT_PREFIXES + MASK_STACK_ALIGN \ + MASK_CONST_ALIGN + MASK_DATA_ALIGN \ - + MASK_PROLOGUE_EPILOGUE + MASK_MUL_BUG) + + MASK_PROLOGUE_EPILOGUE) # endif #endif +/* Don't depend on the assembler default setting for the errata machinery; + always pass the option to turn it on or off explicitly. But, we have to + decide on which is the *GCC* default, and for that we should only need to + consider what's in TARGET_DEFAULT; no other changes should be necessary. */ + +#if (TARGET_DEFAULT & MASK_MUL_BUG) +#define MUL_BUG_ASM_DEFAULT "-mul-bug-abort" +#else +#define MUL_BUG_ASM_DEFAULT "-no-mul-bug-abort" +#endif + /* Local, providing a default for cris_cpu_version. 
*/ #define CRIS_DEFAULT_CPU_VERSION TARGET_CPU_DEFAULT -- cgit v1.1 From a58401d2e6d31eb8f0e4ded84b3dde28c98ba4da Mon Sep 17 00:00:00 2001 From: Hans-Peter Nilsson Date: Wed, 2 Feb 2022 00:00:10 +0100 Subject: cris: For expanded movsi, don't match operands we know will be reloaded In a session investigating unexpected fallout from a change, I noticed reload needs one operand being a register to make an informed decision. It can happen that there's just a constant and a memory operand, as in: (insn 668 667 42 104 (parallel [ (set (mem:SI (plus:SI (reg/v/f:SI 347 [ fs ]) (const_int 168 [0xa8])) \ [1 fs_126(D)->regs.cfa_how+0 S4 A8]) (const_int 2 [0x2])) (clobber (reg:CC 19 dccr)) ]) "<...>/gcc/libgcc/unwind-dw2.c":1121:21 22 {*movsi_internal} (expr_list:REG_UNUSED (reg:CC 19 dccr) (nil))) This was helpfully created by combine. When this happens, reload can't check for costs and preferred register classes, (both operands will start with NO_REGS as the preferred class) and will default to the constraints order in the insn in reload. (Which also does its own temporary merge in find_reloads, but that's a different story.) Better don't match the simple cases. Beware that subregs have to be matched. I'm doing this just for word_mode (SI) for now, but may repeat this for the other valid modes as well. In particular, that goes for DImode as I see the expanded movdi does *almost* this, but uses register_operand instead of REG_S_P (from cris.h). Using REG_S_P is the right choice here because register_operand also matches (subreg (mem ...) ...) *until* reload is done. By itself it's just a sub-0.1% performance win (coremark). Also removing a stale comment. gcc: * config/cris/cris.md ("*movsi_internal"): Conditionalize on (sub-)register operands or operand 1 being 0. --- gcc/config/cris/cris.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/cris/cris.md b/gcc/config/cris/cris.md index bc8d758..9d1c179 100644 --- a/gcc/config/cris/cris.md +++ b/gcc/config/cris/cris.md @@ -583,9 +583,10 @@ (match_operand:SI 1 "general_operand" "r,Q>,M,M, I,r, M,n,g,r,x, rQ>,x,gi")) (clobber (reg:CC CRIS_CC0_REGNUM))] - ;; Note that we prefer not to use the S alternative (if for some reason - ;; it competes with others) above, but g matches S. - "" + ;; Avoid matching insns we know must be reloaded. Without one + ;; operand being a (pseudo-)register, reload chooses + ;; reload-registers suboptimally. + "REG_S_P (operands[0]) || REG_S_P (operands[1]) || operands[1] == const0_rtx" { /* Better to have c-switch here; it is worth it to optimize the size of move insns. The alternative would be to try to find more constraint -- cgit v1.1 From 27e35bc4910e291d8676c69b08fb88fa51ba528e Mon Sep 17 00:00:00 2001 From: Hans-Peter Nilsson Date: Wed, 2 Feb 2022 00:00:10 +0100 Subject: cris: Remove CRIS v32 ACR artefacts This is the change to which I alluded to this in r11-220 / d0780379c1b6 as "causes extra register moves in libgcc". It has unfortunate side-effects due to the change in register-class topology. There's a slight improvement in coremark numbers (< 0.07%) though also increase in code size total (< 0.7%) but looking at the individual changes in functions, it's all-over (-7..+7%). Looking specifically at functions that improved in speed, it's also both plus and minus in code sizes. It's unworkable to separate improvements from regressions for this case. I'll follow up with patches to restore the previous code quality, in both size and speed. 
gcc: * config/cris/constraints.md (define_register_constraint "b"): Now GENERAL_REGS. * config/cris/cris.md (CRIS_ACR_REGNUM): Remove. * config/cris/cris.h: (reg_class, REG_CLASS_NAMES) (REG_CLASS_CONTENTS): Remove ACR_REGS, SPEC_ACR_REGS, GENNONACR_REGS, and SPEC_GENNONACR_REGS. * config/cris/cris.cc (cris_preferred_reload_class): Don't mention ACR_REGS and return GENERAL_REGS instead of GENNONACR_REGS. --- gcc/config/cris/constraints.md | 7 ++++++- gcc/config/cris/cris.cc | 5 ++--- gcc/config/cris/cris.h | 27 +++++---------------------- gcc/config/cris/cris.md | 1 - 4 files changed, 13 insertions(+), 27 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/cris/constraints.md b/gcc/config/cris/constraints.md index 01ec12c..83fab62 100644 --- a/gcc/config/cris/constraints.md +++ b/gcc/config/cris/constraints.md @@ -18,7 +18,12 @@ ;; . ;; Register constraints. -(define_register_constraint "b" "GENNONACR_REGS" + +;; Kept for compatibility. It used to exclude the CRIS v32 +;; register "ACR", which was like GENERAL_REGS except it +;; couldn't be used for autoincrement, and intended mainly +;; for use in user asm statements. +(define_register_constraint "b" "GENERAL_REGS" "@internal") (define_register_constraint "h" "MOF_REGS" diff --git a/gcc/config/cris/cris.cc b/gcc/config/cris/cris.cc index a7807b3..264439c 100644 --- a/gcc/config/cris/cris.cc +++ b/gcc/config/cris/cris.cc @@ -1663,13 +1663,12 @@ cris_reload_address_legitimized (rtx x, static reg_class_t cris_preferred_reload_class (rtx x ATTRIBUTE_UNUSED, reg_class_t rclass) { - if (rclass != ACR_REGS - && rclass != MOF_REGS + if (rclass != MOF_REGS && rclass != MOF_SRP_REGS && rclass != SRP_REGS && rclass != CC0_REGS && rclass != SPECIAL_REGS) - return GENNONACR_REGS; + return GENERAL_REGS; return rclass; } diff --git a/gcc/config/cris/cris.h b/gcc/config/cris/cris.h index 9245d78..6edfe13 100644 --- a/gcc/config/cris/cris.h +++ b/gcc/config/cris/cris.h @@ -436,19 +436,15 @@ extern int cris_cpu_version; /* Node: Register Classes */ -/* We need a separate register class to handle register allocation for - ACR, since it can't be used for post-increment. - - It's not obvious, but having subunions of all movable-between +/* It's not obvious, but having subunions of all movable-between register classes does really help register allocation (pre-IRA comment). 
*/ enum reg_class { NO_REGS, - ACR_REGS, MOF_REGS, SRP_REGS, CC0_REGS, + MOF_REGS, SRP_REGS, CC0_REGS, MOF_SRP_REGS, SPECIAL_REGS, - SPEC_ACR_REGS, GENNONACR_REGS, - SPEC_GENNONACR_REGS, GENERAL_REGS, + GENERAL_REGS, ALL_REGS, LIM_REG_CLASSES }; @@ -457,9 +453,8 @@ enum reg_class #define REG_CLASS_NAMES \ {"NO_REGS", \ - "ACR_REGS", "MOF_REGS", "SRP_REGS", "CC0_REGS", \ + "MOF_REGS", "SRP_REGS", "CC0_REGS", \ "MOF_SRP_REGS", "SPECIAL_REGS", \ - "SPEC_ACR_REGS", "GENNONACR_REGS", "SPEC_GENNONACR_REGS", \ "GENERAL_REGS", "ALL_REGS"} #define CRIS_SPECIAL_REGS_CONTENTS \ @@ -472,37 +467,25 @@ enum reg_class #define REG_CLASS_CONTENTS \ { \ {0}, \ - {1 << CRIS_ACR_REGNUM}, \ {1 << CRIS_MOF_REGNUM}, \ {1 << CRIS_SRP_REGNUM}, \ {1 << CRIS_CC0_REGNUM}, \ {(1 << CRIS_MOF_REGNUM) \ | (1 << CRIS_SRP_REGNUM)}, \ {CRIS_SPECIAL_REGS_CONTENTS}, \ - {CRIS_SPECIAL_REGS_CONTENTS \ - | (1 << CRIS_ACR_REGNUM)}, \ - {(0xffff | CRIS_FAKED_REGS_CONTENTS) \ - & ~(1 << CRIS_ACR_REGNUM)}, \ - {(0xffff | CRIS_FAKED_REGS_CONTENTS \ - | CRIS_SPECIAL_REGS_CONTENTS) \ - & ~(1 << CRIS_ACR_REGNUM)}, \ {0xffff | CRIS_FAKED_REGS_CONTENTS}, \ {0xffff | CRIS_FAKED_REGS_CONTENTS \ | CRIS_SPECIAL_REGS_CONTENTS} \ } #define REGNO_REG_CLASS(REGNO) \ - ((REGNO) == CRIS_ACR_REGNUM ? ACR_REGS : \ - (REGNO) == CRIS_MOF_REGNUM ? MOF_REGS : \ + ((REGNO) == CRIS_MOF_REGNUM ? MOF_REGS : \ (REGNO) == CRIS_SRP_REGNUM ? SRP_REGS : \ (REGNO) == CRIS_CC0_REGNUM ? CC0_REGS : \ GENERAL_REGS) #define BASE_REG_CLASS GENERAL_REGS -#define MODE_CODE_BASE_REG_CLASS(MODE, AS, OCODE, ICODE) \ - ((OCODE) != POST_INC ? BASE_REG_CLASS : GENNONACR_REGS) - #define INDEX_REG_CLASS GENERAL_REGS /* Since it uses reg_renumber, it is safe only once reg_renumber diff --git a/gcc/config/cris/cris.md b/gcc/config/cris/cris.md index 9d1c179..9d9eb8b 100644 --- a/gcc/config/cris/cris.md +++ b/gcc/config/cris/cris.md @@ -60,7 +60,6 @@ [(CRIS_STATIC_CHAIN_REGNUM 7) (CRIS_REAL_FP_REGNUM 8) (CRIS_SP_REGNUM 14) - (CRIS_ACR_REGNUM 15) (CRIS_SRP_REGNUM 16) (CRIS_MOF_REGNUM 17) (CRIS_AP_REGNUM 18) -- cgit v1.1 From 9a7f14ef9b6b287d99b8240cdb43e8fe089ea9b3 Mon Sep 17 00:00:00 2001 From: Hans-Peter Nilsson Date: Wed, 2 Feb 2022 00:00:10 +0100 Subject: cris: Don't discriminate against ALL_REGS in TARGET_REGISTER_MOVE_COST When the tightest class including both SPECIAL_REGS and GENERAL_REGS is ALL_REGS, artificially special-casing for *either* to or from, hits artificially hard. This gets the port back to the code quality before the previous patch ("cris: Remove CRIS v32 ACR artefacts") - except for_vfprintf_r and _vfiprintf_r in newlib (still .8 and .4% larger). gcc: * config/cris/cris.cc (cris_register_move_cost): Remove special pre-ira extra cost for ALL_REGS. --- gcc/config/cris/cris.cc | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/cris/cris.cc b/gcc/config/cris/cris.cc index 264439c..4f97722 100644 --- a/gcc/config/cris/cris.cc +++ b/gcc/config/cris/cris.cc @@ -1683,20 +1683,10 @@ cris_register_move_cost (machine_mode mode ATTRIBUTE_UNUSED, their move cost within that class is higher. How about 7? That's 3 for a move to a GENERAL_REGS register, 3 for the move from the GENERAL_REGS register, and 1 for the increased register pressure. - Also, it's higher than the memory move cost, as it should. - We also do this for ALL_REGS, since we don't want that class to be - preferred (even to memory) at all where GENERAL_REGS doesn't fit. - Whenever it's about to be used, it's for SPECIAL_REGS. 
If we don't - present a higher cost for ALL_REGS than memory, a SPECIAL_REGS may be - used when a GENERAL_REGS should be used, even if there are call-saved - GENERAL_REGS left to allocate. This is because the fall-back when - the most preferred register class isn't available, isn't the next - (or next good) wider register class, but the *most widest* register - class. FIXME: pre-IRA comment, perhaps obsolete now. */ - - if ((reg_classes_intersect_p (from, SPECIAL_REGS) - && reg_classes_intersect_p (to, SPECIAL_REGS)) - || from == ALL_REGS || to == ALL_REGS) + Also, it's higher than the memory move cost, as it should be. */ + + if (reg_classes_intersect_p (from, SPECIAL_REGS) + && reg_classes_intersect_p (to, SPECIAL_REGS)) return 7; /* Make moves to/from SPECIAL_REGS slightly more expensive, as we -- cgit v1.1 From 07a6c52c4cd145d20488c4823669a2d984ba2051 Mon Sep 17 00:00:00 2001 From: Hans-Peter Nilsson Date: Wed, 2 Feb 2022 00:00:10 +0100 Subject: cris: Reload using special-regs before general-regs On code where reload has an effect (i.e. quite rarely, just enough to be noticeable), this change gets code quality back to the situation prior to "Remove CRIS v32 ACR artefacts". We had from IRA a pseudoregister marked to be reloaded from a union of all allocatable registers (here: SPEC_GENNONACR_REGS) but where the register-class corresponding to the constraint for the register-type alternative (here: GENERAL_REGS) was *not* a subset of that class: SPEC_GENNONACR_REGS (and GENNONACR_REGS) had a one-register "hole" for the ACR register, a register present in GENERAL_REGS. Code in reload.cc:find_reloads adds 4 to the cost of a register-type alternative that is neither a subset of the preferred register class nor vice versa and thus reload thinks it can't use. It would be preferable to look for a non-empty intersection of the two, and use that intersection for that alternative, something that can't be expressed because a register class can't be formed from a random register set. The effect was here that the GENERAL_REGS to/from memory alternatives ("r") had their cost raised such that the SPECIAL_REGS alternatives ("x") looked better. This happened to improve code quality just a little bit compared to GENERAL_REGS being chosen. Anyway, with the improved CRIS register-class topology, the subset-checking code no longer has the GENERAL_REGS-demoting effect. To get the same quality, we have to adjust the port such that SPECIAL_REGS are specifically preferred when possible and advisible, i.e. when there's at least two of those registers as for the CPU variant with multiplication (which happens to be the variant maintained for performance). For the move-pattern, the obvious method may seem to simply "curse" the constraints of some alternatives (by prepending one of the "?!^$" characters) but that method can't be used, because we want the effect to be conditional on the CPU variant. It'd also be a shame to split the "*movsi_internal" into two CPU-variants (with different cursing). Iterators would help, but it still seems unwieldy. Instead, add copies of the GENERAL_REGS variants (to the SPECIAL_REGS alternatives) on the "other" side, and make use of the "enabled" attribute to activate just the desired order of alternatives. gcc: * config/cris/cris.cc (cris_preferred_reload_class): Reject "eliminated" registers and small-enough constants unless reloaded into a class that is a subset of GENERAL_REGS. * config/cris/cris.md (attribute "cpu_variant"): New. 
(attribute "enabled"): Conditionalize on a matching attribute cpu_variant, if specified. ("*movsi_internal"): For moves to and from memory, add cpu-variant-enabled variants for "r" alternatives on the far side of the "x" alternatives, preferring the "x" ones only for variants where MOF is present (in addition to SRP). --- gcc/config/cris/cris.cc | 13 ++++++++++++- gcc/config/cris/cris.md | 25 ++++++++++++++++++++----- 2 files changed, 32 insertions(+), 6 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/cris/cris.cc b/gcc/config/cris/cris.cc index 4f97722..f0017d6 100644 --- a/gcc/config/cris/cris.cc +++ b/gcc/config/cris/cris.cc @@ -1661,7 +1661,7 @@ cris_reload_address_legitimized (rtx x, a bug. */ static reg_class_t -cris_preferred_reload_class (rtx x ATTRIBUTE_UNUSED, reg_class_t rclass) +cris_preferred_reload_class (rtx x, reg_class_t rclass) { if (rclass != MOF_REGS && rclass != MOF_SRP_REGS @@ -1670,6 +1670,17 @@ cris_preferred_reload_class (rtx x ATTRIBUTE_UNUSED, reg_class_t rclass) && rclass != SPECIAL_REGS) return GENERAL_REGS; + /* We can't make use of something that's not a general register when + reloading an "eliminated" register (i.e. something that has turned into + e.g. sp + const_int). */ + if (GET_CODE (x) == PLUS && !reg_class_subset_p (rclass, GENERAL_REGS)) + return NO_REGS; + + /* Avoid putting constants into a special register, where the instruction is + shorter if loaded into a general register. */ + if (satisfies_constraint_P (x) && !reg_class_subset_p (rclass, GENERAL_REGS)) + return NO_REGS; + return rclass; } diff --git a/gcc/config/cris/cris.md b/gcc/config/cris/cris.md index 9d9eb8b..dd70941 100644 --- a/gcc/config/cris/cris.md +++ b/gcc/config/cris/cris.md @@ -153,9 +153,20 @@ (not (match_test "dead_or_set_regno_p (insn, CRIS_SRP_REGNUM)"))) (nil) (nil)]) +;; Enable choosing particular instructions. The discriminator choice +;; "v0" stands for "pre-v10", for brevity. +(define_attr "cpu_variant" "default,v0,v10" (const_string "default")) + (define_attr "enabled" "no,yes" (if_then_else - (eq_attr "cc_enabled" "normal") + (and + (eq_attr "cc_enabled" "normal") + (ior + (eq_attr "cpu_variant" "default") + (and (eq_attr "cpu_variant" "v10") + (match_test "TARGET_HAS_MUL_INSNS")) + (and (eq_attr "cpu_variant" "v0") + (not (match_test "TARGET_HAS_MUL_INSNS"))))) (const_string "yes") (const_string "no"))) @@ -578,9 +589,9 @@ (define_insn "*movsi_internal" [(set (match_operand:SI 0 "nonimmediate_operand" - "=r,r, r,Q>,r,Q>,g,r,r,g,rQ>,x, m,x") + "=r,r, r,Q>,r,Q>,g,r,r,g,rQ>,x, m,x, Q>,r,g") (match_operand:SI 1 "general_operand" - "r,Q>,M,M, I,r, M,n,g,r,x, rQ>,x,gi")) + "r,Q>,M,M, I,r, M,n,g,r,x, rQ>,x,gi,r, g,r")) (clobber (reg:CC CRIS_CC0_REGNUM))] ;; Avoid matching insns we know must be reloaded. Without one ;; operand being a (pseudo-)register, reload chooses @@ -597,6 +608,9 @@ case 5: case 8: case 9: + case 14: + case 15: + case 16: return "move.d %1,%0"; case 10: @@ -634,9 +648,10 @@ gcc_unreachable (); } } - [(set_attr "slottable" "yes,yes,yes,yes,yes,yes,no,no,no,no,yes,yes,no,no") + [(set_attr "cpu_variant" "*,*,*,*,*,v0,*,*,v0,v0,*,*,*,*,v10,v10,v10") + (set_attr "slottable" "yes,yes,yes,yes,yes,yes,no,no,no,no,yes,yes,no,no,yes,no,no") (set_attr "cc" - "*,*,none,none,*,none,none,*,*,none,none,none,none,none")]) + "*,*,none,none,*,none,none,*,*,none,none,none,none,none,none,*,none")]) ;; FIXME: See movsi. 
-- cgit v1.1 From ab95fe61fea38fbac7f4e00abd32c2530532351a Mon Sep 17 00:00:00 2001 From: Tamar Christina Date: Wed, 2 Feb 2022 10:51:38 +0000 Subject: AArch64: use canonical ordering for complex mul, fma and fms After the first patch in the series this updates the optabs to expect the canonical sequence. gcc/ChangeLog: PR tree-optimization/102819 PR tree-optimization/103169 * config/aarch64/aarch64-simd.md (cml4): Use canonical order. * config/aarch64/aarch64-sve.md (cml4): Likewise. --- gcc/config/aarch64/aarch64-simd.md | 14 +++++++------- gcc/config/aarch64/aarch64-sve.md | 6 +++--- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 71c429f..13255be 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -556,17 +556,17 @@ ;; remainder. Because of this, expand early. (define_expand "cml4" [(set (match_operand:VHSDF 0 "register_operand") - (plus:VHSDF (match_operand:VHSDF 1 "register_operand") - (unspec:VHSDF [(match_operand:VHSDF 2 "register_operand") - (match_operand:VHSDF 3 "register_operand")] - FCMLA_OP)))] + (plus:VHSDF (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand") + (match_operand:VHSDF 2 "register_operand")] + FCMLA_OP) + (match_operand:VHSDF 3 "register_operand")))] "TARGET_COMPLEX && !BYTES_BIG_ENDIAN" { rtx tmp = gen_reg_rtx (mode); - emit_insn (gen_aarch64_fcmla (tmp, operands[1], - operands[3], operands[2])); + emit_insn (gen_aarch64_fcmla (tmp, operands[3], + operands[2], operands[1])); emit_insn (gen_aarch64_fcmla (operands[0], tmp, - operands[3], operands[2])); + operands[2], operands[1])); DONE; }) diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index bd22fe5..bd60e65 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -7278,11 +7278,11 @@ rtx tmp = gen_reg_rtx (mode); emit_insn (gen_aarch64_pred_fcmla (tmp, operands[4], - operands[3], operands[2], - operands[1], operands[5])); + operands[2], operands[1], + operands[3], operands[5])); emit_insn (gen_aarch64_pred_fcmla (operands[0], operands[4], - operands[3], operands[2], + operands[2], operands[1], tmp, operands[5])); DONE; }) -- cgit v1.1 From 9f6f411f63f3aceddd846e4b0d27202a6e13d42c Mon Sep 17 00:00:00 2001 From: Tamar Christina Date: Wed, 2 Feb 2022 10:52:17 +0000 Subject: AArch32: use canonical ordering for complex mul, fma and fms After the first patch in the series this updates the optabs to expect the canonical sequence. gcc/ChangeLog: PR tree-optimization/102819 PR tree-optimization/103169 * config/arm/vec-common.md (cml4): Use canonical order. --- gcc/config/arm/vec-common.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md index cef358e..2718d82 100644 --- a/gcc/config/arm/vec-common.md +++ b/gcc/config/arm/vec-common.md @@ -265,18 +265,18 @@ ;; remainder. Because of this, expand early. 
(define_expand "cml4" [(set (match_operand:VF 0 "register_operand") - (plus:VF (match_operand:VF 1 "register_operand") - (unspec:VF [(match_operand:VF 2 "register_operand") - (match_operand:VF 3 "register_operand")] - VCMLA_OP)))] + (plus:VF (unspec:VF [(match_operand:VF 1 "register_operand") + (match_operand:VF 2 "register_operand")] + VCMLA_OP) + (match_operand:VF 3 "register_operand")))] "(TARGET_COMPLEX || (TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT && ARM_HAVE__ARITH)) && !BYTES_BIG_ENDIAN" { rtx tmp = gen_reg_rtx (mode); - emit_insn (gen_arm_vcmla (tmp, operands[1], - operands[3], operands[2])); + emit_insn (gen_arm_vcmla (tmp, operands[3], + operands[2], operands[1])); emit_insn (gen_arm_vcmla (operands[0], tmp, - operands[3], operands[2])); + operands[2], operands[1])); DONE; }) -- cgit v1.1 From cac2f69cdad434ad5cb60f5fe931d45cd82ef476 Mon Sep 17 00:00:00 2001 From: Bernd Kuhls Date: Fri, 27 Mar 2020 21:23:53 +0100 Subject: gcc: define _REENTRANT for OpenRISC when -pthread is passed The detection of pthread support fails on OpenRISC unless _REENTRANT is defined. Added the CPP_SPEC definition to correct this. gcc/ChangeLog: PR target/94372 * config/or1k/linux.h (CPP_SPEC): Define. Signed-off-by: Bernd Kuhls --- gcc/config/or1k/linux.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/or1k/linux.h b/gcc/config/or1k/linux.h index 52909af..80f77c7 100644 --- a/gcc/config/or1k/linux.h +++ b/gcc/config/or1k/linux.h @@ -32,6 +32,8 @@ #undef MUSL_DYNAMIC_LINKER #define MUSL_DYNAMIC_LINKER "/lib/ld-musl-or1k.so.1" +#define CPP_SPEC "%{pthread:-D_REENTRANT}" + #undef LINK_SPEC #define LINK_SPEC "%{h*} \ %{static:-Bstatic} \ -- cgit v1.1 From 0415470c8d66200f6ae8ffb5ff4342bafc06251b Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Thu, 3 Feb 2022 09:55:59 +0100 Subject: s390x: Fix one more -Wformat-diag. gcc/ChangeLog: * config/s390/s390.cc (s390_valid_target_attribute_inner_p): Use the error message for i386 target. --- gcc/config/s390/s390.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc index 2db12d4..63b78ab 100644 --- a/gcc/config/s390/s390.cc +++ b/gcc/config/s390/s390.cc @@ -15903,7 +15903,7 @@ s390_valid_target_attribute_inner_p (tree args, /* Process the option. */ if (!found) { - error ("attribute(target(\"%s\")) is unknown", orig_p); + error ("attribute %qs argument % is unknown", orig_p); return false; } else if (attrs[i].only_as_pragma && !force_pragma) @@ -15953,7 +15953,7 @@ s390_valid_target_attribute_inner_p (tree args, } else { - error ("attribute(target(\"%s\")) is unknown", orig_p); + error ("attribute %qs argument % is unknown", orig_p); ret = false; } } @@ -15970,7 +15970,7 @@ s390_valid_target_attribute_inner_p (tree args, global_dc); else { - error ("attribute(target(\"%s\")) is unknown", orig_p); + error ("attribute %qs argument % is unknown", orig_p); ret = false; } } -- cgit v1.1 From 9db03cd0caf6bbde1de302bf3509dc26ca8bff2b Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Thu, 3 Feb 2022 10:19:33 +0100 Subject: =?UTF-8?q?Fix=20wording=20for:=20attribute=20=E2=80=98-xyz?= =?UTF-8?q?=E2=80=99=20argument=20=E2=80=98target=E2=80=99=20is=20unknown?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gcc/ChangeLog: * config/i386/i386-options.cc (ix86_valid_target_attribute_inner_p): Change subject and object in the error message. 
* config/s390/s390.cc (s390_valid_target_attribute_inner_p): Likewise. --- gcc/config/i386/i386-options.cc | 2 +- gcc/config/s390/s390.cc | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc index 715d9a1..082abd2 100644 --- a/gcc/config/i386/i386-options.cc +++ b/gcc/config/i386/i386-options.cc @@ -1201,7 +1201,7 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[], if (opt == N_OPTS) { error_at (loc, "attribute %qs argument %qs is unknown", - orig_p, attr_name); + attr_name, orig_p); ret = false; } diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc index 63b78ab..5c2a830 100644 --- a/gcc/config/s390/s390.cc +++ b/gcc/config/s390/s390.cc @@ -15903,7 +15903,7 @@ s390_valid_target_attribute_inner_p (tree args, /* Process the option. */ if (!found) { - error ("attribute %qs argument % is unknown", orig_p); + error ("attribute % argument %qs is unknown", orig_p); return false; } else if (attrs[i].only_as_pragma && !force_pragma) @@ -15953,7 +15953,7 @@ s390_valid_target_attribute_inner_p (tree args, } else { - error ("attribute %qs argument % is unknown", orig_p); + error ("attribute % argument %qs is unknown", orig_p); ret = false; } } @@ -15970,7 +15970,7 @@ s390_valid_target_attribute_inner_p (tree args, global_dc); else { - error ("attribute %qs argument % is unknown", orig_p); + error ("attribute % argument %qs is unknown", orig_p); ret = false; } } -- cgit v1.1 From 6a770526600a7ffda1f288fa481a4322d5f149b4 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Thu, 3 Feb 2022 10:44:00 +0000 Subject: aarch64: Remove VALL_F16MOV iterator The VALL_F16MOV iterator now has the same modes as VALL_F16, in the same order. This patch removes the former in favour of the latter. This doesn't fix a bug as such, but it's ultra-safe (no change in object code) and it saves a follow-up patch from having to make a false choice between the iterators. gcc/ * config/aarch64/iterators.md (VALL_F16MOV): Delete. * config/aarch64/aarch64-simd.md (mov): Use VALL_F16 instead of VALL_F16MOV. --- gcc/config/aarch64/aarch64-simd.md | 4 ++-- gcc/config/aarch64/iterators.md | 5 ----- 2 files changed, 2 insertions(+), 7 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 13255be..f6d7b42 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -19,8 +19,8 @@ ;; . (define_expand "mov" - [(set (match_operand:VALL_F16MOV 0 "nonimmediate_operand") - (match_operand:VALL_F16MOV 1 "general_operand"))] + [(set (match_operand:VALL_F16 0 "nonimmediate_operand") + (match_operand:VALL_F16 1 "general_operand"))] "TARGET_SIMD" " /* Force the operand into a register if it is not an diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 9160ce3..a0c02e4 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -187,11 +187,6 @@ (define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V4HF V8HF V4BF V8BF V2SF V4SF V2DF]) -;; All Advanced SIMD modes suitable for moving, loading, and storing, -;; including special Bfloat vector types. -(define_mode_iterator VALL_F16MOV [V8QI V16QI V4HI V8HI V2SI V4SI V2DI - V4HF V8HF V4BF V8BF V2SF V4SF V2DF]) - ;; The VALL_F16 modes except the 128-bit 2-element ones. 
(define_mode_iterator VALL_F16_NO_V2Q [V8QI V16QI V4HI V8HI V2SI V4SI V4HF V8HF V2SF V4SF]) -- cgit v1.1 From 7e4f89a23e32604f71f8f6756c8856bf07bf7ac2 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Thu, 3 Feb 2022 10:44:00 +0000 Subject: aarch64: Add missing movmisalign patterns The Advanced SIMD movmisalign patterns didn't handle 16-bit FP modes, which meant that the vector loop for: void test (_Float16 *data) { _Pragma ("omp simd") for (int i = 0; i < 8; ++i) data[i] = 1.0; } would be versioned for alignment. This was causing some new failures in aarch64/sve/single_5.c: FAIL: gcc.target/aarch64/sve/single_5.c scan-assembler-not \\tb FAIL: gcc.target/aarch64/sve/single_5.c scan-assembler-not \\tcmp FAIL: gcc.target/aarch64/sve/single_5.c scan-assembler-times \\tstr\\tq[0-9]+, 10 but I didn't look into what changed from earlier releases. Adding the missing modes removes some existing xfails. gcc/ * config/aarch64/aarch64-simd.md (movmisalign): Extend from VALL to VALL_F16. gcc/testsuite/ * gcc.target/aarch64/sve/single_5.c: Remove some XFAILs. --- gcc/config/aarch64/aarch64-simd.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index f6d7b42..6646e06 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -50,8 +50,8 @@ ) (define_expand "movmisalign" - [(set (match_operand:VALL 0 "nonimmediate_operand") - (match_operand:VALL 1 "general_operand"))] + [(set (match_operand:VALL_F16 0 "nonimmediate_operand") + (match_operand:VALL_F16 1 "general_operand"))] "TARGET_SIMD && !STRICT_ALIGNMENT" { /* This pattern is not permitted to fail during expansion: if both arguments -- cgit v1.1 From 8439e866a38399f0d5e6aab16faaf10bdabc4b5f Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Thu, 3 Feb 2022 14:34:21 +0100 Subject: arm: Fix up help.exp regression On Thu, Jan 20, 2022 at 11:27:20AM +0000, Richard Earnshaw via Gcc-patches wrote: > gcc/ChangeLog: > > * config/arm/arm.opt (mfix-cortex-a57-aes-1742098): New command-line > option. > (mfix-cortex-a72-aes-1655431): New option alias. > --- a/gcc/config/arm/arm.opt > +++ b/gcc/config/arm/arm.opt > @@ -272,6 +272,16 @@ mfix-cmse-cve-2021-35465 > Target Var(fix_vlldm) Init(2) > Mitigate issues with VLLDM on some M-profile devices (CVE-2021-35465). > > +mfix-cortex-a57-aes-1742098 > +Target Var(fix_aes_erratum_1742098) Init(2) Save > +Mitigate issues with AES instructions on Cortex-A57 and Cortex-A72. > +Arm erratum #1742098 > + > +mfix-cortex-a72-aes-1655431 > +Target Alias(mfix-cortex-a57-aes-1742098) > +Mitigate issues with AES instructions on Cortex-A57 and Cortex-A72. > +Arm erratum #1655431 > + > munaligned-access > Target Var(unaligned_access) Init(2) Save > Enable unaligned word and halfword accesses to packed data. This breaks: Running /usr/src/gcc/gcc/testsuite/gcc.misc-tests/help.exp ... FAIL: compiler driver --help=target option(s): "^ +-.*[^:.]$" absent from output: " -mfix-cortex-a57-aes-1742098 Mitigate issues with AES instructions on Cortex-A57 and Cortex-A72. Arm erratum #1742098" help.exp with help of lib/options.exp tests whether all non-empty descriptions of options are terminated with . or :. 2022-02-03 Jakub Jelinek * config/arm/arm.opt (mfix-cortex-a57-aes-1742098, mfix-cortex-a72-aes-1655431): Ensure description ends with full stop. 
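For readers unfamiliar with this testsuite check, the rule quoted above (every non-empty option description printed by --help must end in '.' or ':') is simple to state as code. The following is a hypothetical C sketch of that rule only; the real check lives in the Tcl library lib/options.exp and is not reproduced here. The two sample strings are the before and after descriptions from the arm.opt hunk that follows.

/* Hypothetical illustration of the rule enforced by help.exp via
   lib/options.exp: a non-empty option description must end with a
   full stop or a colon.  Not the real (Tcl) implementation.  */
#include <stdio.h>
#include <string.h>

static int
description_ok (const char *desc)
{
  size_t len = strlen (desc);
  if (len == 0)
    return 1;                        /* empty descriptions are not checked */
  char last = desc[len - 1];
  return last == '.' || last == ':';
}

int
main (void)
{
  const char *before = "Mitigate issues with AES instructions on Cortex-A57 "
                       "and Cortex-A72. Arm erratum #1742098";
  const char *after  = "Mitigate issues with AES instructions on Cortex-A57 "
                       "and Cortex-A72 (Arm erratum #1742098).";

  printf ("before fix: %s\n", description_ok (before) ? "PASS" : "FAIL");
  printf ("after fix:  %s\n", description_ok (after) ? "PASS" : "FAIL");
  return 0;
}

Joining the two description lines of the old arm.opt entry yields help text ending in "#1742098", which is why help.exp reported the FAIL quoted in the message above.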
--- gcc/config/arm/arm.opt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm.opt b/gcc/config/arm/arm.opt index cc16534..3209b6c 100644 --- a/gcc/config/arm/arm.opt +++ b/gcc/config/arm/arm.opt @@ -274,13 +274,13 @@ Mitigate issues with VLLDM on some M-profile devices (CVE-2021-35465). mfix-cortex-a57-aes-1742098 Target Var(fix_aes_erratum_1742098) Init(2) Save -Mitigate issues with AES instructions on Cortex-A57 and Cortex-A72. -Arm erratum #1742098 +Mitigate issues with AES instructions on Cortex-A57 and Cortex-A72 +(Arm erratum #1742098). mfix-cortex-a72-aes-1655431 Target Alias(mfix-cortex-a57-aes-1742098) -Mitigate issues with AES instructions on Cortex-A57 and Cortex-A72. -Arm erratum #1655431 +Mitigate issues with AES instructions on Cortex-A57 and Cortex-A72 +(Arm erratum #1655431). munaligned-access Target Var(unaligned_access) Init(2) Save -- cgit v1.1 From a1b4d225d8cd07c79eea81fb6416e8ad5a07f018 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Wed, 2 Feb 2022 20:59:00 -0600 Subject: rs6000: Unify error messages for built-in constant restrictions We currently give different error messages for built-in functions that violate range restrictions on their arguments, depending on whether we record them as requiring an n-bit literal or a literal between two values. It's better to be consistent. Change the error message for the n-bit literal to look like the other one. 2022-02-02 Bill Schmidt gcc/ * config/rs6000/rs6000-call.cc (rs6000_expand_builtin): Revise error message for RES_BITS case. gcc/testsuite/ * gcc.target/powerpc/bfp/scalar-test-data-class-10.c: Adjust error messages. * gcc.target/powerpc/bfp/scalar-test-data-class-2.c: Likewise. * gcc.target/powerpc/bfp/scalar-test-data-class-3.c: Likewise. * gcc.target/powerpc/bfp/scalar-test-data-class-4.c: Likewise. * gcc.target/powerpc/bfp/scalar-test-data-class-5.c: Likewise. * gcc.target/powerpc/bfp/scalar-test-data-class-9.c: Likewise. * gcc.target/powerpc/bfp/vec-test-data-class-4.c: Likewise. * gcc.target/powerpc/bfp/vec-test-data-class-5.c: Likewise. * gcc.target/powerpc/bfp/vec-test-data-class-6.c: Likewise. * gcc.target/powerpc/bfp/vec-test-data-class-7.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-12.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-14.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-17.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-19.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-2.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-22.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-24.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-27.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-29.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-32.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-34.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-37.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-39.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-4.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-42.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-44.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-47.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-49.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-52.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-54.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-57.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-59.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-62.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-64.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-67.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-69.c: Likewise. 
* gcc.target/powerpc/dfp/dtstsfi-7.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-72.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-74.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-77.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-79.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-9.c: Likewise. * gcc.target/powerpc/pr80315-1.c: Likewise. * gcc.target/powerpc/pr80315-2.c: Likewise. * gcc.target/powerpc/pr80315-3.c: Likewise. * gcc.target/powerpc/pr80315-4.c: Likewise. * gcc.target/powerpc/pr82015.c: Likewise. * gcc.target/powerpc/pr91903.c: Likewise. * gcc.target/powerpc/test_fpscr_rn_builtin_error.c: Likewise. * gcc.target/powerpc/vec-ternarylogic-10.c: Likewise. --- gcc/config/rs6000/rs6000-call.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-call.cc b/gcc/config/rs6000/rs6000-call.cc index 5c870d4..d9bd5ca 100644 --- a/gcc/config/rs6000/rs6000-call.cc +++ b/gcc/config/rs6000/rs6000-call.cc @@ -5717,8 +5717,10 @@ rs6000_expand_builtin (tree exp, rtx target, rtx /* subtarget */, if (!(TREE_CODE (restr_arg) == INTEGER_CST && (TREE_INT_CST_LOW (restr_arg) & ~mask) == 0)) { - error ("argument %d must be a %d-bit unsigned literal", - bifaddr->restr_opnd[i], bifaddr->restr_val1[i]); + unsigned p = (1U << bifaddr->restr_val1[i]) - 1; + error ("argument %d must be a literal between 0 and %d," + " inclusive", + bifaddr->restr_opnd[i], p); return CONST0_RTX (mode[0]); } break; -- cgit v1.1 From eecee223f435fa01fb07a2fdba1615b89627d710 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Thu, 3 Feb 2022 10:26:29 -0600 Subject: rs6000: Consolidate target built-ins code Continuing with the refactoring effort, this patch moves as much of the target-specific built-in support code into a new file, rs6000-builtin.cc. However, we can't easily move the overloading support code out of rs6000-c.cc, because the build machinery understands that as a special file to be included with the C and C++ front ends. This patch is just a straightforward move, with one exception. I found that the builtin_mode_to_type[] array is no longer used, so I also removed all code having to do with it. The code in rs6000-builtin.cc is organized in related sections: - General support functions - Initialization support - GIMPLE folding support - Expansion support Overloading support remains in rs6000-c.cc. 2022-02-03 Bill Schmidt gcc/ * config.gcc (powerpc*-*-*): Add rs6000-builtin.o to extra_objs. * config/rs6000/rs6000-builtin.cc: New file, containing code moved from other files. * config/rs6000/rs6000-call.cc (cpu_is_info): Move to rs6000-builtin.cc. (cpu_supports_info): Likewise. (rs6000_type_string): Likewise. (altivec_expand_predicate_builtin): Likewise. (rs6000_htm_spr_icode): Likewise. (altivec_expand_vec_init_builtin): Likewise. (get_element_number): Likewise. (altivec_expand_vec_set_builtin): Likewise. (altivec_expand_vec_ext_builtin): Likewise. (rs6000_invalid_builtin): Likewise. (rs6000_fold_builtin): Likewise. (fold_build_vec_cmp): Likewise. (fold_compare_helper): Likewise. (map_to_integral_tree_type): Likewise. (fold_mergehl_helper): Likewise. (fold_mergeeo_helper): Likewise. (rs6000_builtin_valid_without_lhs): Likewise. (rs6000_builtin_is_supported): Likewise. (rs6000_gimple_fold_mma_builtin): Likewise. (rs6000_gimple_fold_builtin): Likewise. (rs6000_expand_ldst_mask): Likewise. (cpu_expand_builtin): Likewise. (elemrev_icode): Likewise. (ldv_expand_builtin): Likewise. (lxvrse_expand_builtin): Likewise. (lxvrze_expand_builtin): Likewise. 
(stv_expand_builtin): Likewise. (mma_expand_builtin): Likewise. (htm_spr_num): Likewise. (htm_expand_builtin): Likewise. (rs6000_expand_builtin): Likewise. (rs6000_vector_type): Likewise. (rs6000_init_builtins): Likewise. Remove initialization of builtin_mode_to_type entries. (rs6000_builtin_decl): Move to rs6000-builtin.cc. * config/rs6000/rs6000.cc (rs6000_builtin_mask_for_load): New external declaration. (rs6000_builtin_md_vectorized_function): Likewise. (rs6000_builtin_reciprocal): Likewise. (altivec_builtin_mask_for_load): Move to rs6000-builtin.cc. (rs6000_builtin_types): Likewise. (builtin_mode_to_type): Remove. (rs6000_builtin_mask_for_load): Move to rs6000-builtin.cc. Remove static qualifier. (rs6000_builtin_md_vectorized_function): Likewise. (rs6000_builtin_reciprocal): Likewise. * config/rs6000/rs6000.h (builtin_mode_to_type): Remove. * config/rs6000/t-rs6000 (rs6000-builtin.o): New target. --- gcc/config/rs6000/rs6000-builtin.cc | 3714 +++++++++++++++++++++++++++++++++++ gcc/config/rs6000/rs6000-call.cc | 3526 --------------------------------- gcc/config/rs6000/rs6000.cc | 163 +- gcc/config/rs6000/rs6000.h | 1 - gcc/config/rs6000/t-rs6000 | 4 + 5 files changed, 3722 insertions(+), 3686 deletions(-) create mode 100644 gcc/config/rs6000/rs6000-builtin.cc (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-builtin.cc b/gcc/config/rs6000/rs6000-builtin.cc new file mode 100644 index 0000000..005f936 --- /dev/null +++ b/gcc/config/rs6000/rs6000-builtin.cc @@ -0,0 +1,3714 @@ +/* Target-specific built-in function support for the Power architecture. + See also rs6000-c.c, rs6000-gen-builtins.c, rs6000-builtins.def, and + rs6000-overloads.def. + Note that "normal" builtins (generic math functions, etc.) are handled + in rs6000.c. + + Copyright (C) 2002-2022 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +#define IN_TARGET_CODE 1 + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "target.h" +#include "backend.h" +#include "rtl.h" +#include "tree.h" +#include "memmodel.h" +#include "gimple.h" +#include "tm_p.h" +#include "optabs.h" +#include "recog.h" +#include "diagnostic-core.h" +#include "fold-const.h" +#include "stor-layout.h" +#include "calls.h" +#include "varasm.h" +#include "explow.h" +#include "expr.h" +#include "langhooks.h" +#include "gimplify.h" +#include "gimple-fold.h" +#include "gimple-iterator.h" +#include "ssa.h" +#include "tree-ssa-propagate.h" +#include "builtins.h" +#include "tree-vector-builder.h" +#if TARGET_XCOFF +#include "xcoffout.h" /* get declarations of xcoff_*_section_name */ +#endif +#include "ppc-auxv.h" +#include "rs6000-internal.h" + +/* Built in types. */ +tree rs6000_builtin_types[RS6000_BTI_MAX]; + +/* Support targetm.vectorize.builtin_mask_for_load. 
*/ +tree altivec_builtin_mask_for_load; + +/* **** General support functions **** */ + +/* Raise an error message for a builtin function that is called without the + appropriate target options being set. */ + +void +rs6000_invalid_builtin (enum rs6000_gen_builtins fncode) +{ + size_t j = (size_t) fncode; + const char *name = rs6000_builtin_info[j].bifname; + + switch (rs6000_builtin_info[j].enable) + { + case ENB_P5: + error ("%qs requires the %qs option", name, "-mcpu=power5"); + break; + case ENB_P6: + error ("%qs requires the %qs option", name, "-mcpu=power6"); + break; + case ENB_P6_64: + error ("%qs requires the %qs option and either the %qs or %qs option", + name, "-mcpu=power6", "-m64", "-mpowerpc64"); + break; + case ENB_ALTIVEC: + error ("%qs requires the %qs option", name, "-maltivec"); + break; + case ENB_CELL: + error ("%qs requires the %qs option", name, "-mcpu=cell"); + break; + case ENB_VSX: + error ("%qs requires the %qs option", name, "-mvsx"); + break; + case ENB_P7: + error ("%qs requires the %qs option", name, "-mcpu=power7"); + break; + case ENB_P7_64: + error ("%qs requires the %qs option and either the %qs or %qs option", + name, "-mcpu=power7", "-m64", "-mpowerpc64"); + break; + case ENB_P8: + error ("%qs requires the %qs option", name, "-mcpu=power8"); + break; + case ENB_P8V: + error ("%qs requires the %qs and %qs options", name, "-mcpu=power8", + "-mvsx"); + break; + case ENB_P9: + error ("%qs requires the %qs option", name, "-mcpu=power9"); + break; + case ENB_P9_64: + error ("%qs requires the %qs option and either the %qs or %qs option", + name, "-mcpu=power9", "-m64", "-mpowerpc64"); + break; + case ENB_P9V: + error ("%qs requires the %qs and %qs options", name, "-mcpu=power9", + "-mvsx"); + break; + case ENB_IEEE128_HW: + error ("%qs requires quad-precision floating-point arithmetic", name); + break; + case ENB_DFP: + error ("%qs requires the %qs option", name, "-mhard-dfp"); + break; + case ENB_CRYPTO: + error ("%qs requires the %qs option", name, "-mcrypto"); + break; + case ENB_HTM: + error ("%qs requires the %qs option", name, "-mhtm"); + break; + case ENB_P10: + error ("%qs requires the %qs option", name, "-mcpu=power10"); + break; + case ENB_P10_64: + error ("%qs requires the %qs option and either the %qs or %qs option", + name, "-mcpu=power10", "-m64", "-mpowerpc64"); + break; + case ENB_MMA: + error ("%qs requires the %qs option", name, "-mmma"); + break; + default: + case ENB_ALWAYS: + gcc_unreachable (); + } +} + +/* Check whether a builtin function is supported in this target + configuration. 
*/ +bool +rs6000_builtin_is_supported (enum rs6000_gen_builtins fncode) +{ + switch (rs6000_builtin_info[(size_t) fncode].enable) + { + case ENB_ALWAYS: + return true; + case ENB_P5: + return TARGET_POPCNTB; + case ENB_P6: + return TARGET_CMPB; + case ENB_P6_64: + return TARGET_CMPB && TARGET_POWERPC64; + case ENB_P7: + return TARGET_POPCNTD; + case ENB_P7_64: + return TARGET_POPCNTD && TARGET_POWERPC64; + case ENB_P8: + return TARGET_DIRECT_MOVE; + case ENB_P8V: + return TARGET_P8_VECTOR; + case ENB_P9: + return TARGET_MODULO; + case ENB_P9_64: + return TARGET_MODULO && TARGET_POWERPC64; + case ENB_P9V: + return TARGET_P9_VECTOR; + case ENB_P10: + return TARGET_POWER10; + case ENB_P10_64: + return TARGET_POWER10 && TARGET_POWERPC64; + case ENB_ALTIVEC: + return TARGET_ALTIVEC; + case ENB_VSX: + return TARGET_VSX; + case ENB_CELL: + return TARGET_ALTIVEC && rs6000_cpu == PROCESSOR_CELL; + case ENB_IEEE128_HW: + return TARGET_FLOAT128_HW; + case ENB_DFP: + return TARGET_DFP; + case ENB_CRYPTO: + return TARGET_CRYPTO; + case ENB_HTM: + return TARGET_HTM; + case ENB_MMA: + return TARGET_MMA; + default: + gcc_unreachable (); + } + gcc_unreachable (); +} + +/* Target hook for early folding of built-ins, shamelessly stolen + from ia64.cc. */ + +tree +rs6000_fold_builtin (tree fndecl ATTRIBUTE_UNUSED, + int n_args ATTRIBUTE_UNUSED, + tree *args ATTRIBUTE_UNUSED, + bool ignore ATTRIBUTE_UNUSED) +{ +#ifdef SUBTARGET_FOLD_BUILTIN + return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore); +#else + return NULL_TREE; +#endif +} + +tree +rs6000_builtin_decl (unsigned code, bool /* initialize_p */) +{ + rs6000_gen_builtins fcode = (rs6000_gen_builtins) code; + + if (fcode >= RS6000_OVLD_MAX) + return error_mark_node; + + return rs6000_builtin_decls[code]; +} + +/* Implement targetm.vectorize.builtin_mask_for_load. */ +tree +rs6000_builtin_mask_for_load (void) +{ + /* Don't use lvsl/vperm for P8 and similarly efficient machines. */ + if ((TARGET_ALTIVEC && !TARGET_VSX) + || (TARGET_VSX && !TARGET_EFFICIENT_UNALIGNED_VSX)) + return altivec_builtin_mask_for_load; + else + return 0; +} + +/* Implement targetm.vectorize.builtin_md_vectorized_function. */ + +tree +rs6000_builtin_md_vectorized_function (tree fndecl, tree type_out, + tree type_in) +{ + machine_mode in_mode, out_mode; + int in_n, out_n; + + if (TARGET_DEBUG_BUILTIN) + fprintf (stderr, + "rs6000_builtin_md_vectorized_function (%s, %s, %s)\n", + IDENTIFIER_POINTER (DECL_NAME (fndecl)), + GET_MODE_NAME (TYPE_MODE (type_out)), + GET_MODE_NAME (TYPE_MODE (type_in))); + + /* TODO: Should this be gcc_assert? 
*/ + if (TREE_CODE (type_out) != VECTOR_TYPE + || TREE_CODE (type_in) != VECTOR_TYPE) + return NULL_TREE; + + out_mode = TYPE_MODE (TREE_TYPE (type_out)); + out_n = TYPE_VECTOR_SUBPARTS (type_out); + in_mode = TYPE_MODE (TREE_TYPE (type_in)); + in_n = TYPE_VECTOR_SUBPARTS (type_in); + + enum rs6000_gen_builtins fn + = (enum rs6000_gen_builtins) DECL_MD_FUNCTION_CODE (fndecl); + switch (fn) + { + case RS6000_BIF_RSQRTF: + if (VECTOR_UNIT_ALTIVEC_OR_VSX_P (V4SFmode) + && out_mode == SFmode && out_n == 4 + && in_mode == SFmode && in_n == 4) + return rs6000_builtin_decls[RS6000_BIF_VRSQRTFP]; + break; + case RS6000_BIF_RSQRT: + if (VECTOR_UNIT_VSX_P (V2DFmode) + && out_mode == DFmode && out_n == 2 + && in_mode == DFmode && in_n == 2) + return rs6000_builtin_decls[RS6000_BIF_RSQRT_2DF]; + break; + case RS6000_BIF_RECIPF: + if (VECTOR_UNIT_ALTIVEC_OR_VSX_P (V4SFmode) + && out_mode == SFmode && out_n == 4 + && in_mode == SFmode && in_n == 4) + return rs6000_builtin_decls[RS6000_BIF_VRECIPFP]; + break; + case RS6000_BIF_RECIP: + if (VECTOR_UNIT_VSX_P (V2DFmode) + && out_mode == DFmode && out_n == 2 + && in_mode == DFmode && in_n == 2) + return rs6000_builtin_decls[RS6000_BIF_RECIP_V2DF]; + break; + default: + break; + } + + machine_mode in_vmode = TYPE_MODE (type_in); + machine_mode out_vmode = TYPE_MODE (type_out); + + /* Power10 supported vectorized built-in functions. */ + if (TARGET_POWER10 + && in_vmode == out_vmode + && VECTOR_UNIT_ALTIVEC_OR_VSX_P (in_vmode)) + { + machine_mode exp_mode = DImode; + machine_mode exp_vmode = V2DImode; + enum rs6000_gen_builtins bif; + switch (fn) + { + case RS6000_BIF_DIVWE: + case RS6000_BIF_DIVWEU: + exp_mode = SImode; + exp_vmode = V4SImode; + if (fn == RS6000_BIF_DIVWE) + bif = RS6000_BIF_VDIVESW; + else + bif = RS6000_BIF_VDIVEUW; + break; + case RS6000_BIF_DIVDE: + case RS6000_BIF_DIVDEU: + if (fn == RS6000_BIF_DIVDE) + bif = RS6000_BIF_VDIVESD; + else + bif = RS6000_BIF_VDIVEUD; + break; + case RS6000_BIF_CFUGED: + bif = RS6000_BIF_VCFUGED; + break; + case RS6000_BIF_CNTLZDM: + bif = RS6000_BIF_VCLZDM; + break; + case RS6000_BIF_CNTTZDM: + bif = RS6000_BIF_VCTZDM; + break; + case RS6000_BIF_PDEPD: + bif = RS6000_BIF_VPDEPD; + break; + case RS6000_BIF_PEXTD: + bif = RS6000_BIF_VPEXTD; + break; + default: + return NULL_TREE; + } + + if (in_mode == exp_mode && in_vmode == exp_vmode) + return rs6000_builtin_decls[bif]; + } + + return NULL_TREE; +} + +/* Returns a code for a target-specific builtin that implements + reciprocal of the function, or NULL_TREE if not available. */ + +tree +rs6000_builtin_reciprocal (tree fndecl) +{ + switch (DECL_MD_FUNCTION_CODE (fndecl)) + { + case RS6000_BIF_XVSQRTDP: + if (!RS6000_RECIP_AUTO_RSQRTE_P (V2DFmode)) + return NULL_TREE; + + return rs6000_builtin_decls[RS6000_BIF_RSQRT_2DF]; + + case RS6000_BIF_XVSQRTSP: + if (!RS6000_RECIP_AUTO_RSQRTE_P (V4SFmode)) + return NULL_TREE; + + return rs6000_builtin_decls[RS6000_BIF_RSQRT_4SF]; + + default: + return NULL_TREE; + } +} + +/* **** Initialization support **** */ + +/* Create a builtin vector type with a name. Taking care not to give + the canonical type a name. */ + +static tree +rs6000_vector_type (const char *name, tree elt_type, unsigned num_elts) +{ + tree result = build_vector_type (elt_type, num_elts); + + /* Copy so we don't give the canonical type a name. */ + result = build_variant_type_copy (result); + + add_builtin_type (name, result); + + return result; +} + +/* Debug utility to translate a type node to a single textual token. 
*/ +static +const char *rs6000_type_string (tree type_node) +{ + if (type_node == void_type_node) + return "void"; + else if (type_node == long_integer_type_node) + return "long"; + else if (type_node == long_unsigned_type_node) + return "ulong"; + else if (type_node == long_long_integer_type_node) + return "longlong"; + else if (type_node == long_long_unsigned_type_node) + return "ulonglong"; + else if (type_node == bool_V2DI_type_node) + return "vbll"; + else if (type_node == bool_V4SI_type_node) + return "vbi"; + else if (type_node == bool_V8HI_type_node) + return "vbs"; + else if (type_node == bool_V16QI_type_node) + return "vbc"; + else if (type_node == bool_int_type_node) + return "bool"; + else if (type_node == dfloat64_type_node) + return "_Decimal64"; + else if (type_node == double_type_node) + return "double"; + else if (type_node == intDI_type_node) + return "sll"; + else if (type_node == intHI_type_node) + return "ss"; + else if (type_node == ibm128_float_type_node) + return "__ibm128"; + else if (type_node == opaque_V4SI_type_node) + return "opaque"; + else if (POINTER_TYPE_P (type_node)) + return "void*"; + else if (type_node == intQI_type_node || type_node == char_type_node) + return "sc"; + else if (type_node == dfloat32_type_node) + return "_Decimal32"; + else if (type_node == float_type_node) + return "float"; + else if (type_node == intSI_type_node || type_node == integer_type_node) + return "si"; + else if (type_node == dfloat128_type_node) + return "_Decimal128"; + else if (type_node == long_double_type_node) + return "longdouble"; + else if (type_node == intTI_type_node) + return "sq"; + else if (type_node == unsigned_intDI_type_node) + return "ull"; + else if (type_node == unsigned_intHI_type_node) + return "us"; + else if (type_node == unsigned_intQI_type_node) + return "uc"; + else if (type_node == unsigned_intSI_type_node) + return "ui"; + else if (type_node == unsigned_intTI_type_node) + return "uq"; + else if (type_node == unsigned_V1TI_type_node) + return "vuq"; + else if (type_node == unsigned_V2DI_type_node) + return "vull"; + else if (type_node == unsigned_V4SI_type_node) + return "vui"; + else if (type_node == unsigned_V8HI_type_node) + return "vus"; + else if (type_node == unsigned_V16QI_type_node) + return "vuc"; + else if (type_node == V16QI_type_node) + return "vsc"; + else if (type_node == V1TI_type_node) + return "vsq"; + else if (type_node == V2DF_type_node) + return "vd"; + else if (type_node == V2DI_type_node) + return "vsll"; + else if (type_node == V4SF_type_node) + return "vf"; + else if (type_node == V4SI_type_node) + return "vsi"; + else if (type_node == V8HI_type_node) + return "vss"; + else if (type_node == pixel_V8HI_type_node) + return "vp"; + else if (type_node == pcvoid_type_node) + return "voidc*"; + else if (type_node == float128_type_node) + return "_Float128"; + else if (type_node == vector_pair_type_node) + return "__vector_pair"; + else if (type_node == vector_quad_type_node) + return "__vector_quad"; + + return "unknown"; +} + +void +rs6000_init_builtins (void) +{ + tree tdecl; + tree t; + + if (TARGET_DEBUG_BUILTIN) + fprintf (stderr, "rs6000_init_builtins%s%s\n", + (TARGET_ALTIVEC) ? ", altivec" : "", + (TARGET_VSX) ? 
", vsx" : ""); + + V2DI_type_node = rs6000_vector_type ("__vector long long", + long_long_integer_type_node, 2); + ptr_V2DI_type_node + = build_pointer_type (build_qualified_type (V2DI_type_node, + TYPE_QUAL_CONST)); + + V2DF_type_node = rs6000_vector_type ("__vector double", double_type_node, 2); + ptr_V2DF_type_node + = build_pointer_type (build_qualified_type (V2DF_type_node, + TYPE_QUAL_CONST)); + + V4SI_type_node = rs6000_vector_type ("__vector signed int", + intSI_type_node, 4); + ptr_V4SI_type_node + = build_pointer_type (build_qualified_type (V4SI_type_node, + TYPE_QUAL_CONST)); + + V4SF_type_node = rs6000_vector_type ("__vector float", float_type_node, 4); + ptr_V4SF_type_node + = build_pointer_type (build_qualified_type (V4SF_type_node, + TYPE_QUAL_CONST)); + + V8HI_type_node = rs6000_vector_type ("__vector signed short", + intHI_type_node, 8); + ptr_V8HI_type_node + = build_pointer_type (build_qualified_type (V8HI_type_node, + TYPE_QUAL_CONST)); + + V16QI_type_node = rs6000_vector_type ("__vector signed char", + intQI_type_node, 16); + ptr_V16QI_type_node + = build_pointer_type (build_qualified_type (V16QI_type_node, + TYPE_QUAL_CONST)); + + unsigned_V16QI_type_node = rs6000_vector_type ("__vector unsigned char", + unsigned_intQI_type_node, 16); + ptr_unsigned_V16QI_type_node + = build_pointer_type (build_qualified_type (unsigned_V16QI_type_node, + TYPE_QUAL_CONST)); + + unsigned_V8HI_type_node = rs6000_vector_type ("__vector unsigned short", + unsigned_intHI_type_node, 8); + ptr_unsigned_V8HI_type_node + = build_pointer_type (build_qualified_type (unsigned_V8HI_type_node, + TYPE_QUAL_CONST)); + + unsigned_V4SI_type_node = rs6000_vector_type ("__vector unsigned int", + unsigned_intSI_type_node, 4); + ptr_unsigned_V4SI_type_node + = build_pointer_type (build_qualified_type (unsigned_V4SI_type_node, + TYPE_QUAL_CONST)); + + unsigned_V2DI_type_node + = rs6000_vector_type ("__vector unsigned long long", + long_long_unsigned_type_node, 2); + + ptr_unsigned_V2DI_type_node + = build_pointer_type (build_qualified_type (unsigned_V2DI_type_node, + TYPE_QUAL_CONST)); + + opaque_V4SI_type_node = build_opaque_vector_type (intSI_type_node, 4); + + const_str_type_node + = build_pointer_type (build_qualified_type (char_type_node, + TYPE_QUAL_CONST)); + + /* We use V1TI mode as a special container to hold __int128_t items that + must live in VSX registers. */ + if (intTI_type_node) + { + V1TI_type_node = rs6000_vector_type ("__vector __int128", + intTI_type_node, 1); + ptr_V1TI_type_node + = build_pointer_type (build_qualified_type (V1TI_type_node, + TYPE_QUAL_CONST)); + unsigned_V1TI_type_node + = rs6000_vector_type ("__vector unsigned __int128", + unsigned_intTI_type_node, 1); + ptr_unsigned_V1TI_type_node + = build_pointer_type (build_qualified_type (unsigned_V1TI_type_node, + TYPE_QUAL_CONST)); + } + + /* The 'vector bool ...' types must be kept distinct from 'vector unsigned ...' + types, especially in C++ land. Similarly, 'vector pixel' is distinct from + 'vector unsigned short'. 
*/ + + bool_char_type_node = build_distinct_type_copy (unsigned_intQI_type_node); + bool_short_type_node = build_distinct_type_copy (unsigned_intHI_type_node); + bool_int_type_node = build_distinct_type_copy (unsigned_intSI_type_node); + bool_long_long_type_node = build_distinct_type_copy (unsigned_intDI_type_node); + pixel_type_node = build_distinct_type_copy (unsigned_intHI_type_node); + + long_integer_type_internal_node = long_integer_type_node; + long_unsigned_type_internal_node = long_unsigned_type_node; + long_long_integer_type_internal_node = long_long_integer_type_node; + long_long_unsigned_type_internal_node = long_long_unsigned_type_node; + intQI_type_internal_node = intQI_type_node; + uintQI_type_internal_node = unsigned_intQI_type_node; + intHI_type_internal_node = intHI_type_node; + uintHI_type_internal_node = unsigned_intHI_type_node; + intSI_type_internal_node = intSI_type_node; + uintSI_type_internal_node = unsigned_intSI_type_node; + intDI_type_internal_node = intDI_type_node; + uintDI_type_internal_node = unsigned_intDI_type_node; + intTI_type_internal_node = intTI_type_node; + uintTI_type_internal_node = unsigned_intTI_type_node; + float_type_internal_node = float_type_node; + double_type_internal_node = double_type_node; + long_double_type_internal_node = long_double_type_node; + dfloat64_type_internal_node = dfloat64_type_node; + dfloat128_type_internal_node = dfloat128_type_node; + void_type_internal_node = void_type_node; + + ptr_intQI_type_node + = build_pointer_type (build_qualified_type (intQI_type_internal_node, + TYPE_QUAL_CONST)); + ptr_uintQI_type_node + = build_pointer_type (build_qualified_type (uintQI_type_internal_node, + TYPE_QUAL_CONST)); + ptr_intHI_type_node + = build_pointer_type (build_qualified_type (intHI_type_internal_node, + TYPE_QUAL_CONST)); + ptr_uintHI_type_node + = build_pointer_type (build_qualified_type (uintHI_type_internal_node, + TYPE_QUAL_CONST)); + ptr_intSI_type_node + = build_pointer_type (build_qualified_type (intSI_type_internal_node, + TYPE_QUAL_CONST)); + ptr_uintSI_type_node + = build_pointer_type (build_qualified_type (uintSI_type_internal_node, + TYPE_QUAL_CONST)); + ptr_intDI_type_node + = build_pointer_type (build_qualified_type (intDI_type_internal_node, + TYPE_QUAL_CONST)); + ptr_uintDI_type_node + = build_pointer_type (build_qualified_type (uintDI_type_internal_node, + TYPE_QUAL_CONST)); + ptr_intTI_type_node + = build_pointer_type (build_qualified_type (intTI_type_internal_node, + TYPE_QUAL_CONST)); + ptr_uintTI_type_node + = build_pointer_type (build_qualified_type (uintTI_type_internal_node, + TYPE_QUAL_CONST)); + + t = build_qualified_type (long_integer_type_internal_node, TYPE_QUAL_CONST); + ptr_long_integer_type_node = build_pointer_type (t); + + t = build_qualified_type (long_unsigned_type_internal_node, TYPE_QUAL_CONST); + ptr_long_unsigned_type_node = build_pointer_type (t); + + ptr_float_type_node + = build_pointer_type (build_qualified_type (float_type_internal_node, + TYPE_QUAL_CONST)); + ptr_double_type_node + = build_pointer_type (build_qualified_type (double_type_internal_node, + TYPE_QUAL_CONST)); + ptr_long_double_type_node + = build_pointer_type (build_qualified_type (long_double_type_internal_node, + TYPE_QUAL_CONST)); + if (dfloat64_type_node) + { + t = build_qualified_type (dfloat64_type_internal_node, TYPE_QUAL_CONST); + ptr_dfloat64_type_node = build_pointer_type (t); + } + else + ptr_dfloat64_type_node = NULL; + + if (dfloat128_type_node) + { + t = build_qualified_type 
(dfloat128_type_internal_node, TYPE_QUAL_CONST); + ptr_dfloat128_type_node = build_pointer_type (t); + } + else + ptr_dfloat128_type_node = NULL; + + t = build_qualified_type (long_long_integer_type_internal_node, + TYPE_QUAL_CONST); + ptr_long_long_integer_type_node = build_pointer_type (t); + + t = build_qualified_type (long_long_unsigned_type_internal_node, + TYPE_QUAL_CONST); + ptr_long_long_unsigned_type_node = build_pointer_type (t); + + /* 128-bit floating point support. KFmode is IEEE 128-bit floating point. + IFmode is the IBM extended 128-bit format that is a pair of doubles. + TFmode will be either IEEE 128-bit floating point or the IBM double-double + format that uses a pair of doubles, depending on the switches and + defaults. + + If we don't support for either 128-bit IBM double double or IEEE 128-bit + floating point, we need make sure the type is non-zero or else self-test + fails during bootstrap. + + Always create __ibm128 as a separate type, even if the current long double + format is IBM extended double. + + For IEEE 128-bit floating point, always create the type __ieee128. If the + user used -mfloat128, rs6000-c.cc will create a define from __float128 to + __ieee128. */ + if (TARGET_FLOAT128_TYPE) + { + if (!TARGET_IEEEQUAD && TARGET_LONG_DOUBLE_128) + ibm128_float_type_node = long_double_type_node; + else + { + ibm128_float_type_node = make_node (REAL_TYPE); + TYPE_PRECISION (ibm128_float_type_node) = 128; + SET_TYPE_MODE (ibm128_float_type_node, IFmode); + layout_type (ibm128_float_type_node); + } + t = build_qualified_type (ibm128_float_type_node, TYPE_QUAL_CONST); + ptr_ibm128_float_type_node = build_pointer_type (t); + lang_hooks.types.register_builtin_type (ibm128_float_type_node, + "__ibm128"); + + if (TARGET_IEEEQUAD && TARGET_LONG_DOUBLE_128) + ieee128_float_type_node = long_double_type_node; + else + ieee128_float_type_node = float128_type_node; + t = build_qualified_type (ieee128_float_type_node, TYPE_QUAL_CONST); + ptr_ieee128_float_type_node = build_pointer_type (t); + lang_hooks.types.register_builtin_type (ieee128_float_type_node, + "__ieee128"); + } + + else + ieee128_float_type_node = ibm128_float_type_node = long_double_type_node; + + /* Vector pair and vector quad support. 
*/ + vector_pair_type_node = make_node (OPAQUE_TYPE); + SET_TYPE_MODE (vector_pair_type_node, OOmode); + TYPE_SIZE (vector_pair_type_node) = bitsize_int (GET_MODE_BITSIZE (OOmode)); + TYPE_PRECISION (vector_pair_type_node) = GET_MODE_BITSIZE (OOmode); + TYPE_SIZE_UNIT (vector_pair_type_node) = size_int (GET_MODE_SIZE (OOmode)); + SET_TYPE_ALIGN (vector_pair_type_node, 256); + TYPE_USER_ALIGN (vector_pair_type_node) = 0; + lang_hooks.types.register_builtin_type (vector_pair_type_node, + "__vector_pair"); + t = build_qualified_type (vector_pair_type_node, TYPE_QUAL_CONST); + ptr_vector_pair_type_node = build_pointer_type (t); + + vector_quad_type_node = make_node (OPAQUE_TYPE); + SET_TYPE_MODE (vector_quad_type_node, XOmode); + TYPE_SIZE (vector_quad_type_node) = bitsize_int (GET_MODE_BITSIZE (XOmode)); + TYPE_PRECISION (vector_quad_type_node) = GET_MODE_BITSIZE (XOmode); + TYPE_SIZE_UNIT (vector_quad_type_node) = size_int (GET_MODE_SIZE (XOmode)); + SET_TYPE_ALIGN (vector_quad_type_node, 512); + TYPE_USER_ALIGN (vector_quad_type_node) = 0; + lang_hooks.types.register_builtin_type (vector_quad_type_node, + "__vector_quad"); + t = build_qualified_type (vector_quad_type_node, TYPE_QUAL_CONST); + ptr_vector_quad_type_node = build_pointer_type (t); + + tdecl = add_builtin_type ("__bool char", bool_char_type_node); + TYPE_NAME (bool_char_type_node) = tdecl; + + tdecl = add_builtin_type ("__bool short", bool_short_type_node); + TYPE_NAME (bool_short_type_node) = tdecl; + + tdecl = add_builtin_type ("__bool int", bool_int_type_node); + TYPE_NAME (bool_int_type_node) = tdecl; + + tdecl = add_builtin_type ("__pixel", pixel_type_node); + TYPE_NAME (pixel_type_node) = tdecl; + + bool_V16QI_type_node = rs6000_vector_type ("__vector __bool char", + bool_char_type_node, 16); + ptr_bool_V16QI_type_node + = build_pointer_type (build_qualified_type (bool_V16QI_type_node, + TYPE_QUAL_CONST)); + + bool_V8HI_type_node = rs6000_vector_type ("__vector __bool short", + bool_short_type_node, 8); + ptr_bool_V8HI_type_node + = build_pointer_type (build_qualified_type (bool_V8HI_type_node, + TYPE_QUAL_CONST)); + + bool_V4SI_type_node = rs6000_vector_type ("__vector __bool int", + bool_int_type_node, 4); + ptr_bool_V4SI_type_node + = build_pointer_type (build_qualified_type (bool_V4SI_type_node, + TYPE_QUAL_CONST)); + + bool_V2DI_type_node = rs6000_vector_type (TARGET_POWERPC64 + ? "__vector __bool long" + : "__vector __bool long long", + bool_long_long_type_node, 2); + ptr_bool_V2DI_type_node + = build_pointer_type (build_qualified_type (bool_V2DI_type_node, + TYPE_QUAL_CONST)); + + bool_V1TI_type_node = rs6000_vector_type ("__vector __bool __int128", + intTI_type_node, 1); + ptr_bool_V1TI_type_node + = build_pointer_type (build_qualified_type (bool_V1TI_type_node, + TYPE_QUAL_CONST)); + + pixel_V8HI_type_node = rs6000_vector_type ("__vector __pixel", + pixel_type_node, 8); + ptr_pixel_V8HI_type_node + = build_pointer_type (build_qualified_type (pixel_V8HI_type_node, + TYPE_QUAL_CONST)); + pcvoid_type_node + = build_pointer_type (build_qualified_type (void_type_node, + TYPE_QUAL_CONST)); + + /* Execute the autogenerated initialization code for builtins. 
*/ + rs6000_init_generated_builtins (); + + if (TARGET_DEBUG_BUILTIN) + { + fprintf (stderr, "\nAutogenerated built-in functions:\n\n"); + for (int i = 1; i < (int) RS6000_BIF_MAX; i++) + { + bif_enable e = rs6000_builtin_info[i].enable; + if (e == ENB_P5 && !TARGET_POPCNTB) + continue; + if (e == ENB_P6 && !TARGET_CMPB) + continue; + if (e == ENB_P6_64 && !(TARGET_CMPB && TARGET_POWERPC64)) + continue; + if (e == ENB_ALTIVEC && !TARGET_ALTIVEC) + continue; + if (e == ENB_VSX && !TARGET_VSX) + continue; + if (e == ENB_P7 && !TARGET_POPCNTD) + continue; + if (e == ENB_P7_64 && !(TARGET_POPCNTD && TARGET_POWERPC64)) + continue; + if (e == ENB_P8 && !TARGET_DIRECT_MOVE) + continue; + if (e == ENB_P8V && !TARGET_P8_VECTOR) + continue; + if (e == ENB_P9 && !TARGET_MODULO) + continue; + if (e == ENB_P9_64 && !(TARGET_MODULO && TARGET_POWERPC64)) + continue; + if (e == ENB_P9V && !TARGET_P9_VECTOR) + continue; + if (e == ENB_IEEE128_HW && !TARGET_FLOAT128_HW) + continue; + if (e == ENB_DFP && !TARGET_DFP) + continue; + if (e == ENB_CRYPTO && !TARGET_CRYPTO) + continue; + if (e == ENB_HTM && !TARGET_HTM) + continue; + if (e == ENB_P10 && !TARGET_POWER10) + continue; + if (e == ENB_P10_64 && !(TARGET_POWER10 && TARGET_POWERPC64)) + continue; + if (e == ENB_MMA && !TARGET_MMA) + continue; + tree fntype = rs6000_builtin_info[i].fntype; + tree t = TREE_TYPE (fntype); + fprintf (stderr, "%s %s (", rs6000_type_string (t), + rs6000_builtin_info[i].bifname); + t = TYPE_ARG_TYPES (fntype); + while (t && TREE_VALUE (t) != void_type_node) + { + fprintf (stderr, "%s", + rs6000_type_string (TREE_VALUE (t))); + t = TREE_CHAIN (t); + if (t && TREE_VALUE (t) != void_type_node) + fprintf (stderr, ", "); + } + fprintf (stderr, "); %s [%4d]\n", + rs6000_builtin_info[i].attr_string, (int) i); + } + fprintf (stderr, "\nEnd autogenerated built-in functions.\n\n\n"); + } + + if (TARGET_XCOFF) + { + /* AIX libm provides clog as __clog. */ + if ((tdecl = builtin_decl_explicit (BUILT_IN_CLOG)) != NULL_TREE) + set_user_assembler_name (tdecl, "__clog"); + + /* When long double is 64 bit, some long double builtins of libc + functions (like __builtin_frexpl) must call the double version + (frexp) not the long double version (frexpl) that expects a 128 bit + argument. */ + if (! TARGET_LONG_DOUBLE_128) + { + if ((tdecl = builtin_decl_explicit (BUILT_IN_FMODL)) != NULL_TREE) + set_user_assembler_name (tdecl, "fmod"); + if ((tdecl = builtin_decl_explicit (BUILT_IN_FREXPL)) != NULL_TREE) + set_user_assembler_name (tdecl, "frexp"); + if ((tdecl = builtin_decl_explicit (BUILT_IN_LDEXPL)) != NULL_TREE) + set_user_assembler_name (tdecl, "ldexp"); + if ((tdecl = builtin_decl_explicit (BUILT_IN_MODFL)) != NULL_TREE) + set_user_assembler_name (tdecl, "modf"); + } + } + + altivec_builtin_mask_for_load + = rs6000_builtin_decls[RS6000_BIF_MASK_FOR_LOAD]; + +#ifdef SUBTARGET_INIT_BUILTINS + SUBTARGET_INIT_BUILTINS; +#endif + + return; +} + +/* **** GIMPLE folding support **** */ + +/* Helper function to handle the gimple folding of a vector compare + operation. This sets up true/false vectors, and uses the + VEC_COND_EXPR operation. + CODE indicates which comparison is to be made. (EQ, GT, ...). + TYPE indicates the type of the result. + Code is inserted before GSI. 
*/ +static tree +fold_build_vec_cmp (tree_code code, tree type, tree arg0, tree arg1, + gimple_stmt_iterator *gsi) +{ + tree cmp_type = truth_type_for (type); + tree zero_vec = build_zero_cst (type); + tree minus_one_vec = build_minus_one_cst (type); + tree temp = create_tmp_reg_or_ssa_name (cmp_type); + gimple *g = gimple_build_assign (temp, code, arg0, arg1); + gsi_insert_before (gsi, g, GSI_SAME_STMT); + return fold_build3 (VEC_COND_EXPR, type, temp, minus_one_vec, zero_vec); +} + +/* Helper function to handle the in-between steps for the + vector compare built-ins. */ +static void +fold_compare_helper (gimple_stmt_iterator *gsi, tree_code code, gimple *stmt) +{ + tree arg0 = gimple_call_arg (stmt, 0); + tree arg1 = gimple_call_arg (stmt, 1); + tree lhs = gimple_call_lhs (stmt); + tree cmp = fold_build_vec_cmp (code, TREE_TYPE (lhs), arg0, arg1, gsi); + gimple *g = gimple_build_assign (lhs, cmp); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); +} + +/* Helper function to map V2DF and V4SF types to their + integral equivalents (V2DI and V4SI). */ +tree map_to_integral_tree_type (tree input_tree_type) +{ + if (INTEGRAL_TYPE_P (TREE_TYPE (input_tree_type))) + return input_tree_type; + else + { + if (types_compatible_p (TREE_TYPE (input_tree_type), + TREE_TYPE (V2DF_type_node))) + return V2DI_type_node; + else if (types_compatible_p (TREE_TYPE (input_tree_type), + TREE_TYPE (V4SF_type_node))) + return V4SI_type_node; + else + gcc_unreachable (); + } +} + +/* Helper function to handle the vector merge[hl] built-ins. The + implementation difference between h and l versions for this code are in + the values used when building of the permute vector for high word versus + low word merge. The variance is keyed off the use_high parameter. */ +static void +fold_mergehl_helper (gimple_stmt_iterator *gsi, gimple *stmt, int use_high) +{ + tree arg0 = gimple_call_arg (stmt, 0); + tree arg1 = gimple_call_arg (stmt, 1); + tree lhs = gimple_call_lhs (stmt); + tree lhs_type = TREE_TYPE (lhs); + int n_elts = TYPE_VECTOR_SUBPARTS (lhs_type); + int midpoint = n_elts / 2; + int offset = 0; + + if (use_high == 1) + offset = midpoint; + + /* The permute_type will match the lhs for integral types. For double and + float types, the permute type needs to map to the V2 or V4 type that + matches size. */ + tree permute_type; + permute_type = map_to_integral_tree_type (lhs_type); + tree_vector_builder elts (permute_type, VECTOR_CST_NELTS (arg0), 1); + + for (int i = 0; i < midpoint; i++) + { + elts.safe_push (build_int_cst (TREE_TYPE (permute_type), + offset + i)); + elts.safe_push (build_int_cst (TREE_TYPE (permute_type), + offset + n_elts + i)); + } + + tree permute = elts.build (); + + gimple *g = gimple_build_assign (lhs, VEC_PERM_EXPR, arg0, arg1, permute); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); +} + +/* Helper function to handle the vector merge[eo] built-ins. */ +static void +fold_mergeeo_helper (gimple_stmt_iterator *gsi, gimple *stmt, int use_odd) +{ + tree arg0 = gimple_call_arg (stmt, 0); + tree arg1 = gimple_call_arg (stmt, 1); + tree lhs = gimple_call_lhs (stmt); + tree lhs_type = TREE_TYPE (lhs); + int n_elts = TYPE_VECTOR_SUBPARTS (lhs_type); + + /* The permute_type will match the lhs for integral types. For double and + float types, the permute type needs to map to the V2 or V4 type that + matches size. 
*/ + tree permute_type; + permute_type = map_to_integral_tree_type (lhs_type); + + tree_vector_builder elts (permute_type, VECTOR_CST_NELTS (arg0), 1); + + /* Build the permute vector. */ + for (int i = 0; i < n_elts / 2; i++) + { + elts.safe_push (build_int_cst (TREE_TYPE (permute_type), + 2*i + use_odd)); + elts.safe_push (build_int_cst (TREE_TYPE (permute_type), + 2*i + use_odd + n_elts)); + } + + tree permute = elts.build (); + + gimple *g = gimple_build_assign (lhs, VEC_PERM_EXPR, arg0, arg1, permute); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); +} + +/* Helper function to sort out which built-ins may be valid without having + a LHS. */ +static bool +rs6000_builtin_valid_without_lhs (enum rs6000_gen_builtins fn_code, + tree fndecl) +{ + if (TREE_TYPE (TREE_TYPE (fndecl)) == void_type_node) + return true; + + switch (fn_code) + { + case RS6000_BIF_STVX_V16QI: + case RS6000_BIF_STVX_V8HI: + case RS6000_BIF_STVX_V4SI: + case RS6000_BIF_STVX_V4SF: + case RS6000_BIF_STVX_V2DI: + case RS6000_BIF_STVX_V2DF: + case RS6000_BIF_STXVW4X_V16QI: + case RS6000_BIF_STXVW4X_V8HI: + case RS6000_BIF_STXVW4X_V4SF: + case RS6000_BIF_STXVW4X_V4SI: + case RS6000_BIF_STXVD2X_V2DF: + case RS6000_BIF_STXVD2X_V2DI: + return true; + default: + return false; + } +} + +/* Expand the MMA built-ins early, so that we can convert the pass-by-reference + __vector_quad arguments into pass-by-value arguments, leading to more + efficient code generation. */ +static bool +rs6000_gimple_fold_mma_builtin (gimple_stmt_iterator *gsi, + rs6000_gen_builtins fn_code) +{ + gimple *stmt = gsi_stmt (*gsi); + size_t fncode = (size_t) fn_code; + + if (!bif_is_mma (rs6000_builtin_info[fncode])) + return false; + + /* Each call that can be gimple-expanded has an associated built-in + function that it will expand into. If this one doesn't, we have + already expanded it! Exceptions: lxvp and stxvp. */ + if (rs6000_builtin_info[fncode].assoc_bif == RS6000_BIF_NONE + && fncode != RS6000_BIF_LXVP + && fncode != RS6000_BIF_STXVP) + return false; + + bifdata *bd = &rs6000_builtin_info[fncode]; + unsigned nopnds = bd->nargs; + gimple_seq new_seq = NULL; + gimple *new_call; + tree new_decl; + + /* Compatibility built-ins; we used to call these + __builtin_mma_{dis,}assemble_pair, but now we call them + __builtin_vsx_{dis,}assemble_pair. Handle the old versions. */ + if (fncode == RS6000_BIF_ASSEMBLE_PAIR) + fncode = RS6000_BIF_ASSEMBLE_PAIR_V; + else if (fncode == RS6000_BIF_DISASSEMBLE_PAIR) + fncode = RS6000_BIF_DISASSEMBLE_PAIR_V; + + if (fncode == RS6000_BIF_DISASSEMBLE_ACC + || fncode == RS6000_BIF_DISASSEMBLE_PAIR_V) + { + /* This is an MMA disassemble built-in function. */ + push_gimplify_context (true); + unsigned nvec = (fncode == RS6000_BIF_DISASSEMBLE_ACC) ? 4 : 2; + tree dst_ptr = gimple_call_arg (stmt, 0); + tree src_ptr = gimple_call_arg (stmt, 1); + tree src_type = TREE_TYPE (src_ptr); + tree src = create_tmp_reg_or_ssa_name (TREE_TYPE (src_type)); + gimplify_assign (src, build_simple_mem_ref (src_ptr), &new_seq); + + /* If we are not disassembling an accumulator/pair or our destination is + another accumulator/pair, then just copy the entire thing as is. 
*/ + if ((fncode == RS6000_BIF_DISASSEMBLE_ACC + && TREE_TYPE (TREE_TYPE (dst_ptr)) == vector_quad_type_node) + || (fncode == RS6000_BIF_DISASSEMBLE_PAIR_V + && TREE_TYPE (TREE_TYPE (dst_ptr)) == vector_pair_type_node)) + { + tree dst = build_simple_mem_ref (build1 (VIEW_CONVERT_EXPR, + src_type, dst_ptr)); + gimplify_assign (dst, src, &new_seq); + pop_gimplify_context (NULL); + gsi_replace_with_seq (gsi, new_seq, true); + return true; + } + + /* If we're disassembling an accumulator into a different type, we need + to emit a xxmfacc instruction now, since we cannot do it later. */ + if (fncode == RS6000_BIF_DISASSEMBLE_ACC) + { + new_decl = rs6000_builtin_decls[RS6000_BIF_XXMFACC_INTERNAL]; + new_call = gimple_build_call (new_decl, 1, src); + src = create_tmp_reg_or_ssa_name (vector_quad_type_node); + gimple_call_set_lhs (new_call, src); + gimple_seq_add_stmt (&new_seq, new_call); + } + + /* Copy the accumulator/pair vector by vector. */ + new_decl + = rs6000_builtin_decls[rs6000_builtin_info[fncode].assoc_bif]; + tree dst_type = build_pointer_type_for_mode (unsigned_V16QI_type_node, + ptr_mode, true); + tree dst_base = build1 (VIEW_CONVERT_EXPR, dst_type, dst_ptr); + for (unsigned i = 0; i < nvec; i++) + { + unsigned index = WORDS_BIG_ENDIAN ? i : nvec - 1 - i; + tree dst = build2 (MEM_REF, unsigned_V16QI_type_node, dst_base, + build_int_cst (dst_type, index * 16)); + tree dstssa = create_tmp_reg_or_ssa_name (unsigned_V16QI_type_node); + new_call = gimple_build_call (new_decl, 2, src, + build_int_cstu (uint16_type_node, i)); + gimple_call_set_lhs (new_call, dstssa); + gimple_seq_add_stmt (&new_seq, new_call); + gimplify_assign (dst, dstssa, &new_seq); + } + pop_gimplify_context (NULL); + gsi_replace_with_seq (gsi, new_seq, true); + return true; + } + + /* TODO: Do some factoring on these two chunks. */ + if (fncode == RS6000_BIF_LXVP) + { + push_gimplify_context (true); + tree offset = gimple_call_arg (stmt, 0); + tree ptr = gimple_call_arg (stmt, 1); + tree lhs = gimple_call_lhs (stmt); + if (TREE_TYPE (TREE_TYPE (ptr)) != vector_pair_type_node) + ptr = build1 (VIEW_CONVERT_EXPR, + build_pointer_type (vector_pair_type_node), ptr); + tree mem = build_simple_mem_ref (build2 (POINTER_PLUS_EXPR, + TREE_TYPE (ptr), ptr, offset)); + gimplify_assign (lhs, mem, &new_seq); + pop_gimplify_context (NULL); + gsi_replace_with_seq (gsi, new_seq, true); + return true; + } + + if (fncode == RS6000_BIF_STXVP) + { + push_gimplify_context (true); + tree src = gimple_call_arg (stmt, 0); + tree offset = gimple_call_arg (stmt, 1); + tree ptr = gimple_call_arg (stmt, 2); + if (TREE_TYPE (TREE_TYPE (ptr)) != vector_pair_type_node) + ptr = build1 (VIEW_CONVERT_EXPR, + build_pointer_type (vector_pair_type_node), ptr); + tree mem = build_simple_mem_ref (build2 (POINTER_PLUS_EXPR, + TREE_TYPE (ptr), ptr, offset)); + gimplify_assign (mem, src, &new_seq); + pop_gimplify_context (NULL); + gsi_replace_with_seq (gsi, new_seq, true); + return true; + } + + /* Convert this built-in into an internal version that uses pass-by-value + arguments. The internal built-in is found in the assoc_bif field. */ + new_decl = rs6000_builtin_decls[rs6000_builtin_info[fncode].assoc_bif]; + tree lhs, op[MAX_MMA_OPERANDS]; + tree acc = gimple_call_arg (stmt, 0); + push_gimplify_context (true); + + if (bif_is_quad (*bd)) + { + /* This built-in has a pass-by-reference accumulator input, so load it + into a temporary accumulator for use as a pass-by-value input. 
*/ + op[0] = create_tmp_reg_or_ssa_name (vector_quad_type_node); + for (unsigned i = 1; i < nopnds; i++) + op[i] = gimple_call_arg (stmt, i); + gimplify_assign (op[0], build_simple_mem_ref (acc), &new_seq); + } + else + { + /* This built-in does not use its pass-by-reference accumulator argument + as an input argument, so remove it from the input list. */ + nopnds--; + for (unsigned i = 0; i < nopnds; i++) + op[i] = gimple_call_arg (stmt, i + 1); + } + + switch (nopnds) + { + case 0: + new_call = gimple_build_call (new_decl, 0); + break; + case 1: + new_call = gimple_build_call (new_decl, 1, op[0]); + break; + case 2: + new_call = gimple_build_call (new_decl, 2, op[0], op[1]); + break; + case 3: + new_call = gimple_build_call (new_decl, 3, op[0], op[1], op[2]); + break; + case 4: + new_call = gimple_build_call (new_decl, 4, op[0], op[1], op[2], op[3]); + break; + case 5: + new_call = gimple_build_call (new_decl, 5, op[0], op[1], op[2], op[3], + op[4]); + break; + case 6: + new_call = gimple_build_call (new_decl, 6, op[0], op[1], op[2], op[3], + op[4], op[5]); + break; + case 7: + new_call = gimple_build_call (new_decl, 7, op[0], op[1], op[2], op[3], + op[4], op[5], op[6]); + break; + default: + gcc_unreachable (); + } + + if (fncode == RS6000_BIF_BUILD_PAIR || fncode == RS6000_BIF_ASSEMBLE_PAIR_V) + lhs = create_tmp_reg_or_ssa_name (vector_pair_type_node); + else + lhs = create_tmp_reg_or_ssa_name (vector_quad_type_node); + gimple_call_set_lhs (new_call, lhs); + gimple_seq_add_stmt (&new_seq, new_call); + gimplify_assign (build_simple_mem_ref (acc), lhs, &new_seq); + pop_gimplify_context (NULL); + gsi_replace_with_seq (gsi, new_seq, true); + + return true; +} + +/* Fold a machine-dependent built-in in GIMPLE. (For folding into + a constant, use rs6000_fold_builtin.) */ +bool +rs6000_gimple_fold_builtin (gimple_stmt_iterator *gsi) +{ + gimple *stmt = gsi_stmt (*gsi); + tree fndecl = gimple_call_fndecl (stmt); + gcc_checking_assert (fndecl && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD); + enum rs6000_gen_builtins fn_code + = (enum rs6000_gen_builtins) DECL_MD_FUNCTION_CODE (fndecl); + tree arg0, arg1, lhs, temp; + enum tree_code bcode; + gimple *g; + + size_t uns_fncode = (size_t) fn_code; + enum insn_code icode = rs6000_builtin_info[uns_fncode].icode; + const char *fn_name1 = rs6000_builtin_info[uns_fncode].bifname; + const char *fn_name2 = (icode != CODE_FOR_nothing) + ? get_insn_name ((int) icode) + : "nothing"; + + if (TARGET_DEBUG_BUILTIN) + fprintf (stderr, "rs6000_gimple_fold_builtin %d %s %s\n", + fn_code, fn_name1, fn_name2); + + if (!rs6000_fold_gimple) + return false; + + /* Prevent gimple folding for code that does not have a LHS, unless it is + allowed per the rs6000_builtin_valid_without_lhs helper function. */ + if (!gimple_call_lhs (stmt) + && !rs6000_builtin_valid_without_lhs (fn_code, fndecl)) + return false; + + /* Don't fold invalid builtins, let rs6000_expand_builtin diagnose it. */ + if (!rs6000_builtin_is_supported (fn_code)) + return false; + + if (rs6000_gimple_fold_mma_builtin (gsi, fn_code)) + return true; + + switch (fn_code) + { + /* Flavors of vec_add. We deliberately don't expand + RS6000_BIF_VADDUQM as it gets lowered from V1TImode to + TImode, resulting in much poorer code generation. 
*/ + case RS6000_BIF_VADDUBM: + case RS6000_BIF_VADDUHM: + case RS6000_BIF_VADDUWM: + case RS6000_BIF_VADDUDM: + case RS6000_BIF_VADDFP: + case RS6000_BIF_XVADDDP: + case RS6000_BIF_XVADDSP: + bcode = PLUS_EXPR; + do_binary: + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + lhs = gimple_call_lhs (stmt); + if (INTEGRAL_TYPE_P (TREE_TYPE (TREE_TYPE (lhs))) + && !TYPE_OVERFLOW_WRAPS (TREE_TYPE (TREE_TYPE (lhs)))) + { + /* Ensure the binary operation is performed in a type + that wraps if it is integral type. */ + gimple_seq stmts = NULL; + tree type = unsigned_type_for (TREE_TYPE (lhs)); + tree uarg0 = gimple_build (&stmts, VIEW_CONVERT_EXPR, + type, arg0); + tree uarg1 = gimple_build (&stmts, VIEW_CONVERT_EXPR, + type, arg1); + tree res = gimple_build (&stmts, gimple_location (stmt), bcode, + type, uarg0, uarg1); + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); + g = gimple_build_assign (lhs, VIEW_CONVERT_EXPR, + build1 (VIEW_CONVERT_EXPR, + TREE_TYPE (lhs), res)); + gsi_replace (gsi, g, true); + return true; + } + g = gimple_build_assign (lhs, bcode, arg0, arg1); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + /* Flavors of vec_sub. We deliberately don't expand + RS6000_BIF_VSUBUQM. */ + case RS6000_BIF_VSUBUBM: + case RS6000_BIF_VSUBUHM: + case RS6000_BIF_VSUBUWM: + case RS6000_BIF_VSUBUDM: + case RS6000_BIF_VSUBFP: + case RS6000_BIF_XVSUBDP: + case RS6000_BIF_XVSUBSP: + bcode = MINUS_EXPR; + goto do_binary; + case RS6000_BIF_XVMULSP: + case RS6000_BIF_XVMULDP: + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + lhs = gimple_call_lhs (stmt); + g = gimple_build_assign (lhs, MULT_EXPR, arg0, arg1); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + /* Even element flavors of vec_mul (signed). */ + case RS6000_BIF_VMULESB: + case RS6000_BIF_VMULESH: + case RS6000_BIF_VMULESW: + /* Even element flavors of vec_mul (unsigned). */ + case RS6000_BIF_VMULEUB: + case RS6000_BIF_VMULEUH: + case RS6000_BIF_VMULEUW: + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + lhs = gimple_call_lhs (stmt); + g = gimple_build_assign (lhs, VEC_WIDEN_MULT_EVEN_EXPR, arg0, arg1); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + /* Odd element flavors of vec_mul (signed). */ + case RS6000_BIF_VMULOSB: + case RS6000_BIF_VMULOSH: + case RS6000_BIF_VMULOSW: + /* Odd element flavors of vec_mul (unsigned). */ + case RS6000_BIF_VMULOUB: + case RS6000_BIF_VMULOUH: + case RS6000_BIF_VMULOUW: + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + lhs = gimple_call_lhs (stmt); + g = gimple_build_assign (lhs, VEC_WIDEN_MULT_ODD_EXPR, arg0, arg1); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + /* Flavors of vec_div (Integer). */ + case RS6000_BIF_DIV_V2DI: + case RS6000_BIF_UDIV_V2DI: + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + lhs = gimple_call_lhs (stmt); + g = gimple_build_assign (lhs, TRUNC_DIV_EXPR, arg0, arg1); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + /* Flavors of vec_div (Float). 
*/ + case RS6000_BIF_XVDIVSP: + case RS6000_BIF_XVDIVDP: + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + lhs = gimple_call_lhs (stmt); + g = gimple_build_assign (lhs, RDIV_EXPR, arg0, arg1); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + /* Flavors of vec_and. */ + case RS6000_BIF_VAND_V16QI_UNS: + case RS6000_BIF_VAND_V16QI: + case RS6000_BIF_VAND_V8HI_UNS: + case RS6000_BIF_VAND_V8HI: + case RS6000_BIF_VAND_V4SI_UNS: + case RS6000_BIF_VAND_V4SI: + case RS6000_BIF_VAND_V2DI_UNS: + case RS6000_BIF_VAND_V2DI: + case RS6000_BIF_VAND_V4SF: + case RS6000_BIF_VAND_V2DF: + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + lhs = gimple_call_lhs (stmt); + g = gimple_build_assign (lhs, BIT_AND_EXPR, arg0, arg1); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + /* Flavors of vec_andc. */ + case RS6000_BIF_VANDC_V16QI_UNS: + case RS6000_BIF_VANDC_V16QI: + case RS6000_BIF_VANDC_V8HI_UNS: + case RS6000_BIF_VANDC_V8HI: + case RS6000_BIF_VANDC_V4SI_UNS: + case RS6000_BIF_VANDC_V4SI: + case RS6000_BIF_VANDC_V2DI_UNS: + case RS6000_BIF_VANDC_V2DI: + case RS6000_BIF_VANDC_V4SF: + case RS6000_BIF_VANDC_V2DF: + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + lhs = gimple_call_lhs (stmt); + temp = create_tmp_reg_or_ssa_name (TREE_TYPE (arg1)); + g = gimple_build_assign (temp, BIT_NOT_EXPR, arg1); + gimple_set_location (g, gimple_location (stmt)); + gsi_insert_before (gsi, g, GSI_SAME_STMT); + g = gimple_build_assign (lhs, BIT_AND_EXPR, arg0, temp); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + /* Flavors of vec_nand. */ + case RS6000_BIF_NAND_V16QI_UNS: + case RS6000_BIF_NAND_V16QI: + case RS6000_BIF_NAND_V8HI_UNS: + case RS6000_BIF_NAND_V8HI: + case RS6000_BIF_NAND_V4SI_UNS: + case RS6000_BIF_NAND_V4SI: + case RS6000_BIF_NAND_V2DI_UNS: + case RS6000_BIF_NAND_V2DI: + case RS6000_BIF_NAND_V4SF: + case RS6000_BIF_NAND_V2DF: + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + lhs = gimple_call_lhs (stmt); + temp = create_tmp_reg_or_ssa_name (TREE_TYPE (arg1)); + g = gimple_build_assign (temp, BIT_AND_EXPR, arg0, arg1); + gimple_set_location (g, gimple_location (stmt)); + gsi_insert_before (gsi, g, GSI_SAME_STMT); + g = gimple_build_assign (lhs, BIT_NOT_EXPR, temp); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + /* Flavors of vec_or. */ + case RS6000_BIF_VOR_V16QI_UNS: + case RS6000_BIF_VOR_V16QI: + case RS6000_BIF_VOR_V8HI_UNS: + case RS6000_BIF_VOR_V8HI: + case RS6000_BIF_VOR_V4SI_UNS: + case RS6000_BIF_VOR_V4SI: + case RS6000_BIF_VOR_V2DI_UNS: + case RS6000_BIF_VOR_V2DI: + case RS6000_BIF_VOR_V4SF: + case RS6000_BIF_VOR_V2DF: + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + lhs = gimple_call_lhs (stmt); + g = gimple_build_assign (lhs, BIT_IOR_EXPR, arg0, arg1); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + /* flavors of vec_orc. 
*/ + case RS6000_BIF_ORC_V16QI_UNS: + case RS6000_BIF_ORC_V16QI: + case RS6000_BIF_ORC_V8HI_UNS: + case RS6000_BIF_ORC_V8HI: + case RS6000_BIF_ORC_V4SI_UNS: + case RS6000_BIF_ORC_V4SI: + case RS6000_BIF_ORC_V2DI_UNS: + case RS6000_BIF_ORC_V2DI: + case RS6000_BIF_ORC_V4SF: + case RS6000_BIF_ORC_V2DF: + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + lhs = gimple_call_lhs (stmt); + temp = create_tmp_reg_or_ssa_name (TREE_TYPE (arg1)); + g = gimple_build_assign (temp, BIT_NOT_EXPR, arg1); + gimple_set_location (g, gimple_location (stmt)); + gsi_insert_before (gsi, g, GSI_SAME_STMT); + g = gimple_build_assign (lhs, BIT_IOR_EXPR, arg0, temp); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + /* Flavors of vec_xor. */ + case RS6000_BIF_VXOR_V16QI_UNS: + case RS6000_BIF_VXOR_V16QI: + case RS6000_BIF_VXOR_V8HI_UNS: + case RS6000_BIF_VXOR_V8HI: + case RS6000_BIF_VXOR_V4SI_UNS: + case RS6000_BIF_VXOR_V4SI: + case RS6000_BIF_VXOR_V2DI_UNS: + case RS6000_BIF_VXOR_V2DI: + case RS6000_BIF_VXOR_V4SF: + case RS6000_BIF_VXOR_V2DF: + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + lhs = gimple_call_lhs (stmt); + g = gimple_build_assign (lhs, BIT_XOR_EXPR, arg0, arg1); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + /* Flavors of vec_nor. */ + case RS6000_BIF_VNOR_V16QI_UNS: + case RS6000_BIF_VNOR_V16QI: + case RS6000_BIF_VNOR_V8HI_UNS: + case RS6000_BIF_VNOR_V8HI: + case RS6000_BIF_VNOR_V4SI_UNS: + case RS6000_BIF_VNOR_V4SI: + case RS6000_BIF_VNOR_V2DI_UNS: + case RS6000_BIF_VNOR_V2DI: + case RS6000_BIF_VNOR_V4SF: + case RS6000_BIF_VNOR_V2DF: + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + lhs = gimple_call_lhs (stmt); + temp = create_tmp_reg_or_ssa_name (TREE_TYPE (arg1)); + g = gimple_build_assign (temp, BIT_IOR_EXPR, arg0, arg1); + gimple_set_location (g, gimple_location (stmt)); + gsi_insert_before (gsi, g, GSI_SAME_STMT); + g = gimple_build_assign (lhs, BIT_NOT_EXPR, temp); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + /* flavors of vec_abs. */ + case RS6000_BIF_ABS_V16QI: + case RS6000_BIF_ABS_V8HI: + case RS6000_BIF_ABS_V4SI: + case RS6000_BIF_ABS_V4SF: + case RS6000_BIF_ABS_V2DI: + case RS6000_BIF_XVABSDP: + case RS6000_BIF_XVABSSP: + arg0 = gimple_call_arg (stmt, 0); + if (INTEGRAL_TYPE_P (TREE_TYPE (TREE_TYPE (arg0))) + && !TYPE_OVERFLOW_WRAPS (TREE_TYPE (TREE_TYPE (arg0)))) + return false; + lhs = gimple_call_lhs (stmt); + g = gimple_build_assign (lhs, ABS_EXPR, arg0); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + /* flavors of vec_min. */ + case RS6000_BIF_XVMINDP: + case RS6000_BIF_XVMINSP: + case RS6000_BIF_VMINFP: + { + lhs = gimple_call_lhs (stmt); + tree type = TREE_TYPE (lhs); + if (HONOR_NANS (type)) + return false; + gcc_fallthrough (); + } + case RS6000_BIF_VMINSD: + case RS6000_BIF_VMINUD: + case RS6000_BIF_VMINSB: + case RS6000_BIF_VMINSH: + case RS6000_BIF_VMINSW: + case RS6000_BIF_VMINUB: + case RS6000_BIF_VMINUH: + case RS6000_BIF_VMINUW: + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + lhs = gimple_call_lhs (stmt); + g = gimple_build_assign (lhs, MIN_EXPR, arg0, arg1); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + /* flavors of vec_max. 
*/ + case RS6000_BIF_XVMAXDP: + case RS6000_BIF_XVMAXSP: + case RS6000_BIF_VMAXFP: + { + lhs = gimple_call_lhs (stmt); + tree type = TREE_TYPE (lhs); + if (HONOR_NANS (type)) + return false; + gcc_fallthrough (); + } + case RS6000_BIF_VMAXSD: + case RS6000_BIF_VMAXUD: + case RS6000_BIF_VMAXSB: + case RS6000_BIF_VMAXSH: + case RS6000_BIF_VMAXSW: + case RS6000_BIF_VMAXUB: + case RS6000_BIF_VMAXUH: + case RS6000_BIF_VMAXUW: + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + lhs = gimple_call_lhs (stmt); + g = gimple_build_assign (lhs, MAX_EXPR, arg0, arg1); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + /* Flavors of vec_eqv. */ + case RS6000_BIF_EQV_V16QI: + case RS6000_BIF_EQV_V8HI: + case RS6000_BIF_EQV_V4SI: + case RS6000_BIF_EQV_V4SF: + case RS6000_BIF_EQV_V2DF: + case RS6000_BIF_EQV_V2DI: + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + lhs = gimple_call_lhs (stmt); + temp = create_tmp_reg_or_ssa_name (TREE_TYPE (arg1)); + g = gimple_build_assign (temp, BIT_XOR_EXPR, arg0, arg1); + gimple_set_location (g, gimple_location (stmt)); + gsi_insert_before (gsi, g, GSI_SAME_STMT); + g = gimple_build_assign (lhs, BIT_NOT_EXPR, temp); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + /* Flavors of vec_rotate_left. */ + case RS6000_BIF_VRLB: + case RS6000_BIF_VRLH: + case RS6000_BIF_VRLW: + case RS6000_BIF_VRLD: + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + lhs = gimple_call_lhs (stmt); + g = gimple_build_assign (lhs, LROTATE_EXPR, arg0, arg1); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + /* Flavors of vector shift right algebraic. + vec_sra{b,h,w} -> vsra{b,h,w}. */ + case RS6000_BIF_VSRAB: + case RS6000_BIF_VSRAH: + case RS6000_BIF_VSRAW: + case RS6000_BIF_VSRAD: + { + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + lhs = gimple_call_lhs (stmt); + tree arg1_type = TREE_TYPE (arg1); + tree unsigned_arg1_type = unsigned_type_for (TREE_TYPE (arg1)); + tree unsigned_element_type = unsigned_type_for (TREE_TYPE (arg1_type)); + location_t loc = gimple_location (stmt); + /* Force arg1 into the range valid matching the arg0 type. */ + /* Build a vector consisting of the max valid bit-size values. */ + int n_elts = VECTOR_CST_NELTS (arg1); + tree element_size = build_int_cst (unsigned_element_type, + 128 / n_elts); + tree_vector_builder elts (unsigned_arg1_type, n_elts, 1); + for (int i = 0; i < n_elts; i++) + elts.safe_push (element_size); + tree modulo_tree = elts.build (); + /* Modulo the provided shift value against that vector. */ + gimple_seq stmts = NULL; + tree unsigned_arg1 = gimple_build (&stmts, VIEW_CONVERT_EXPR, + unsigned_arg1_type, arg1); + tree new_arg1 = gimple_build (&stmts, loc, TRUNC_MOD_EXPR, + unsigned_arg1_type, unsigned_arg1, + modulo_tree); + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); + /* And finally, do the shift. */ + g = gimple_build_assign (lhs, RSHIFT_EXPR, arg0, new_arg1); + gimple_set_location (g, loc); + gsi_replace (gsi, g, true); + return true; + } + /* Flavors of vector shift left. + builtin_altivec_vsl{b,h,w} -> vsl{b,h,w}. 
*/ + case RS6000_BIF_VSLB: + case RS6000_BIF_VSLH: + case RS6000_BIF_VSLW: + case RS6000_BIF_VSLD: + { + location_t loc; + gimple_seq stmts = NULL; + arg0 = gimple_call_arg (stmt, 0); + tree arg0_type = TREE_TYPE (arg0); + if (INTEGRAL_TYPE_P (TREE_TYPE (arg0_type)) + && !TYPE_OVERFLOW_WRAPS (TREE_TYPE (arg0_type))) + return false; + arg1 = gimple_call_arg (stmt, 1); + tree arg1_type = TREE_TYPE (arg1); + tree unsigned_arg1_type = unsigned_type_for (TREE_TYPE (arg1)); + tree unsigned_element_type = unsigned_type_for (TREE_TYPE (arg1_type)); + loc = gimple_location (stmt); + lhs = gimple_call_lhs (stmt); + /* Force arg1 into the range valid matching the arg0 type. */ + /* Build a vector consisting of the max valid bit-size values. */ + int n_elts = VECTOR_CST_NELTS (arg1); + int tree_size_in_bits = TREE_INT_CST_LOW (size_in_bytes (arg1_type)) + * BITS_PER_UNIT; + tree element_size = build_int_cst (unsigned_element_type, + tree_size_in_bits / n_elts); + tree_vector_builder elts (unsigned_type_for (arg1_type), n_elts, 1); + for (int i = 0; i < n_elts; i++) + elts.safe_push (element_size); + tree modulo_tree = elts.build (); + /* Modulo the provided shift value against that vector. */ + tree unsigned_arg1 = gimple_build (&stmts, VIEW_CONVERT_EXPR, + unsigned_arg1_type, arg1); + tree new_arg1 = gimple_build (&stmts, loc, TRUNC_MOD_EXPR, + unsigned_arg1_type, unsigned_arg1, + modulo_tree); + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); + /* And finally, do the shift. */ + g = gimple_build_assign (lhs, LSHIFT_EXPR, arg0, new_arg1); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + } + /* Flavors of vector shift right. */ + case RS6000_BIF_VSRB: + case RS6000_BIF_VSRH: + case RS6000_BIF_VSRW: + case RS6000_BIF_VSRD: + { + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + lhs = gimple_call_lhs (stmt); + tree arg1_type = TREE_TYPE (arg1); + tree unsigned_arg1_type = unsigned_type_for (TREE_TYPE (arg1)); + tree unsigned_element_type = unsigned_type_for (TREE_TYPE (arg1_type)); + location_t loc = gimple_location (stmt); + gimple_seq stmts = NULL; + /* Convert arg0 to unsigned. */ + tree arg0_unsigned + = gimple_build (&stmts, VIEW_CONVERT_EXPR, + unsigned_type_for (TREE_TYPE (arg0)), arg0); + /* Force arg1 into the range valid matching the arg0 type. */ + /* Build a vector consisting of the max valid bit-size values. */ + int n_elts = VECTOR_CST_NELTS (arg1); + tree element_size = build_int_cst (unsigned_element_type, + 128 / n_elts); + tree_vector_builder elts (unsigned_arg1_type, n_elts, 1); + for (int i = 0; i < n_elts; i++) + elts.safe_push (element_size); + tree modulo_tree = elts.build (); + /* Modulo the provided shift value against that vector. */ + tree unsigned_arg1 = gimple_build (&stmts, VIEW_CONVERT_EXPR, + unsigned_arg1_type, arg1); + tree new_arg1 = gimple_build (&stmts, loc, TRUNC_MOD_EXPR, + unsigned_arg1_type, unsigned_arg1, + modulo_tree); + /* Do the shift. */ + tree res + = gimple_build (&stmts, RSHIFT_EXPR, + TREE_TYPE (arg0_unsigned), arg0_unsigned, new_arg1); + /* Convert result back to the lhs type. */ + res = gimple_build (&stmts, VIEW_CONVERT_EXPR, TREE_TYPE (lhs), res); + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); + replace_call_with_value (gsi, res); + return true; + } + /* Vector loads. 
*/ + case RS6000_BIF_LVX_V16QI: + case RS6000_BIF_LVX_V8HI: + case RS6000_BIF_LVX_V4SI: + case RS6000_BIF_LVX_V4SF: + case RS6000_BIF_LVX_V2DI: + case RS6000_BIF_LVX_V2DF: + case RS6000_BIF_LVX_V1TI: + { + arg0 = gimple_call_arg (stmt, 0); // offset + arg1 = gimple_call_arg (stmt, 1); // address + lhs = gimple_call_lhs (stmt); + location_t loc = gimple_location (stmt); + /* Since arg1 may be cast to a different type, just use ptr_type_node + here instead of trying to enforce TBAA on pointer types. */ + tree arg1_type = ptr_type_node; + tree lhs_type = TREE_TYPE (lhs); + /* POINTER_PLUS_EXPR wants the offset to be of type 'sizetype'. Create + the tree using the value from arg0. The resulting type will match + the type of arg1. */ + gimple_seq stmts = NULL; + tree temp_offset = gimple_convert (&stmts, loc, sizetype, arg0); + tree temp_addr = gimple_build (&stmts, loc, POINTER_PLUS_EXPR, + arg1_type, arg1, temp_offset); + /* Mask off any lower bits from the address. */ + tree aligned_addr = gimple_build (&stmts, loc, BIT_AND_EXPR, + arg1_type, temp_addr, + build_int_cst (arg1_type, -16)); + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); + if (!is_gimple_mem_ref_addr (aligned_addr)) + { + tree t = make_ssa_name (TREE_TYPE (aligned_addr)); + gimple *g = gimple_build_assign (t, aligned_addr); + gsi_insert_before (gsi, g, GSI_SAME_STMT); + aligned_addr = t; + } + /* Use the build2 helper to set up the mem_ref. The MEM_REF could also + take an offset, but since we've already incorporated the offset + above, here we just pass in a zero. */ + gimple *g + = gimple_build_assign (lhs, build2 (MEM_REF, lhs_type, aligned_addr, + build_int_cst (arg1_type, 0))); + gimple_set_location (g, loc); + gsi_replace (gsi, g, true); + return true; + } + /* Vector stores. */ + case RS6000_BIF_STVX_V16QI: + case RS6000_BIF_STVX_V8HI: + case RS6000_BIF_STVX_V4SI: + case RS6000_BIF_STVX_V4SF: + case RS6000_BIF_STVX_V2DI: + case RS6000_BIF_STVX_V2DF: + { + arg0 = gimple_call_arg (stmt, 0); /* Value to be stored. */ + arg1 = gimple_call_arg (stmt, 1); /* Offset. */ + tree arg2 = gimple_call_arg (stmt, 2); /* Store-to address. */ + location_t loc = gimple_location (stmt); + tree arg0_type = TREE_TYPE (arg0); + /* Use ptr_type_node (no TBAA) for the arg2_type. + FIXME: (Richard) "A proper fix would be to transition this type as + seen from the frontend to GIMPLE, for example in a similar way we + do for MEM_REFs by piggy-backing that on an extra argument, a + constant zero pointer of the alias pointer type to use (which would + also serve as a type indicator of the store itself). I'd use a + target specific internal function for this (not sure if we can have + those target specific, but I guess if it's folded away then that's + fine) and get away with the overload set." */ + tree arg2_type = ptr_type_node; + /* POINTER_PLUS_EXPR wants the offset to be of type 'sizetype'. Create + the tree using the value from arg0. The resulting type will match + the type of arg2. */ + gimple_seq stmts = NULL; + tree temp_offset = gimple_convert (&stmts, loc, sizetype, arg1); + tree temp_addr = gimple_build (&stmts, loc, POINTER_PLUS_EXPR, + arg2_type, arg2, temp_offset); + /* Mask off any lower bits from the address. 
*/ + tree aligned_addr = gimple_build (&stmts, loc, BIT_AND_EXPR, + arg2_type, temp_addr, + build_int_cst (arg2_type, -16)); + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); + if (!is_gimple_mem_ref_addr (aligned_addr)) + { + tree t = make_ssa_name (TREE_TYPE (aligned_addr)); + gimple *g = gimple_build_assign (t, aligned_addr); + gsi_insert_before (gsi, g, GSI_SAME_STMT); + aligned_addr = t; + } + /* The desired gimple result should be similar to: + MEM[(__vector floatD.1407 *)_1] = vf1D.2697; */ + gimple *g + = gimple_build_assign (build2 (MEM_REF, arg0_type, aligned_addr, + build_int_cst (arg2_type, 0)), arg0); + gimple_set_location (g, loc); + gsi_replace (gsi, g, true); + return true; + } + + /* unaligned Vector loads. */ + case RS6000_BIF_LXVW4X_V16QI: + case RS6000_BIF_LXVW4X_V8HI: + case RS6000_BIF_LXVW4X_V4SF: + case RS6000_BIF_LXVW4X_V4SI: + case RS6000_BIF_LXVD2X_V2DF: + case RS6000_BIF_LXVD2X_V2DI: + { + arg0 = gimple_call_arg (stmt, 0); // offset + arg1 = gimple_call_arg (stmt, 1); // address + lhs = gimple_call_lhs (stmt); + location_t loc = gimple_location (stmt); + /* Since arg1 may be cast to a different type, just use ptr_type_node + here instead of trying to enforce TBAA on pointer types. */ + tree arg1_type = ptr_type_node; + tree lhs_type = TREE_TYPE (lhs); + /* In GIMPLE the type of the MEM_REF specifies the alignment. The + required alignment (power) is 4 bytes regardless of data type. */ + tree align_ltype = build_aligned_type (lhs_type, 4); + /* POINTER_PLUS_EXPR wants the offset to be of type 'sizetype'. Create + the tree using the value from arg0. The resulting type will match + the type of arg1. */ + gimple_seq stmts = NULL; + tree temp_offset = gimple_convert (&stmts, loc, sizetype, arg0); + tree temp_addr = gimple_build (&stmts, loc, POINTER_PLUS_EXPR, + arg1_type, arg1, temp_offset); + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); + if (!is_gimple_mem_ref_addr (temp_addr)) + { + tree t = make_ssa_name (TREE_TYPE (temp_addr)); + gimple *g = gimple_build_assign (t, temp_addr); + gsi_insert_before (gsi, g, GSI_SAME_STMT); + temp_addr = t; + } + /* Use the build2 helper to set up the mem_ref. The MEM_REF could also + take an offset, but since we've already incorporated the offset + above, here we just pass in a zero. */ + gimple *g; + g = gimple_build_assign (lhs, build2 (MEM_REF, align_ltype, temp_addr, + build_int_cst (arg1_type, 0))); + gimple_set_location (g, loc); + gsi_replace (gsi, g, true); + return true; + } + + /* unaligned Vector stores. */ + case RS6000_BIF_STXVW4X_V16QI: + case RS6000_BIF_STXVW4X_V8HI: + case RS6000_BIF_STXVW4X_V4SF: + case RS6000_BIF_STXVW4X_V4SI: + case RS6000_BIF_STXVD2X_V2DF: + case RS6000_BIF_STXVD2X_V2DI: + { + arg0 = gimple_call_arg (stmt, 0); /* Value to be stored. */ + arg1 = gimple_call_arg (stmt, 1); /* Offset. */ + tree arg2 = gimple_call_arg (stmt, 2); /* Store-to address. */ + location_t loc = gimple_location (stmt); + tree arg0_type = TREE_TYPE (arg0); + /* Use ptr_type_node (no TBAA) for the arg2_type. */ + tree arg2_type = ptr_type_node; + /* In GIMPLE the type of the MEM_REF specifies the alignment. The + required alignment (power) is 4 bytes regardless of data type. */ + tree align_stype = build_aligned_type (arg0_type, 4); + /* POINTER_PLUS_EXPR wants the offset to be of type 'sizetype'. Create + the tree using the value from arg1. 
*/ + gimple_seq stmts = NULL; + tree temp_offset = gimple_convert (&stmts, loc, sizetype, arg1); + tree temp_addr = gimple_build (&stmts, loc, POINTER_PLUS_EXPR, + arg2_type, arg2, temp_offset); + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); + if (!is_gimple_mem_ref_addr (temp_addr)) + { + tree t = make_ssa_name (TREE_TYPE (temp_addr)); + gimple *g = gimple_build_assign (t, temp_addr); + gsi_insert_before (gsi, g, GSI_SAME_STMT); + temp_addr = t; + } + gimple *g; + g = gimple_build_assign (build2 (MEM_REF, align_stype, temp_addr, + build_int_cst (arg2_type, 0)), arg0); + gimple_set_location (g, loc); + gsi_replace (gsi, g, true); + return true; + } + + /* Vector Fused multiply-add (fma). */ + case RS6000_BIF_VMADDFP: + case RS6000_BIF_XVMADDDP: + case RS6000_BIF_XVMADDSP: + case RS6000_BIF_VMLADDUHM: + { + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + tree arg2 = gimple_call_arg (stmt, 2); + lhs = gimple_call_lhs (stmt); + gcall *g = gimple_build_call_internal (IFN_FMA, 3, arg0, arg1, arg2); + gimple_call_set_lhs (g, lhs); + gimple_call_set_nothrow (g, true); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + } + + /* Vector compares; EQ, NE, GE, GT, LE. */ + case RS6000_BIF_VCMPEQUB: + case RS6000_BIF_VCMPEQUH: + case RS6000_BIF_VCMPEQUW: + case RS6000_BIF_VCMPEQUD: + /* We deliberately omit RS6000_BIF_VCMPEQUT for now, because gimple + folding produces worse code for 128-bit compares. */ + fold_compare_helper (gsi, EQ_EXPR, stmt); + return true; + + case RS6000_BIF_VCMPNEB: + case RS6000_BIF_VCMPNEH: + case RS6000_BIF_VCMPNEW: + /* We deliberately omit RS6000_BIF_VCMPNET for now, because gimple + folding produces worse code for 128-bit compares. */ + fold_compare_helper (gsi, NE_EXPR, stmt); + return true; + + case RS6000_BIF_CMPGE_16QI: + case RS6000_BIF_CMPGE_U16QI: + case RS6000_BIF_CMPGE_8HI: + case RS6000_BIF_CMPGE_U8HI: + case RS6000_BIF_CMPGE_4SI: + case RS6000_BIF_CMPGE_U4SI: + case RS6000_BIF_CMPGE_2DI: + case RS6000_BIF_CMPGE_U2DI: + /* We deliberately omit RS6000_BIF_CMPGE_1TI and RS6000_BIF_CMPGE_U1TI + for now, because gimple folding produces worse code for 128-bit + compares. */ + fold_compare_helper (gsi, GE_EXPR, stmt); + return true; + + case RS6000_BIF_VCMPGTSB: + case RS6000_BIF_VCMPGTUB: + case RS6000_BIF_VCMPGTSH: + case RS6000_BIF_VCMPGTUH: + case RS6000_BIF_VCMPGTSW: + case RS6000_BIF_VCMPGTUW: + case RS6000_BIF_VCMPGTUD: + case RS6000_BIF_VCMPGTSD: + /* We deliberately omit RS6000_BIF_VCMPGTUT and RS6000_BIF_VCMPGTST + for now, because gimple folding produces worse code for 128-bit + compares. */ + fold_compare_helper (gsi, GT_EXPR, stmt); + return true; + + case RS6000_BIF_CMPLE_16QI: + case RS6000_BIF_CMPLE_U16QI: + case RS6000_BIF_CMPLE_8HI: + case RS6000_BIF_CMPLE_U8HI: + case RS6000_BIF_CMPLE_4SI: + case RS6000_BIF_CMPLE_U4SI: + case RS6000_BIF_CMPLE_2DI: + case RS6000_BIF_CMPLE_U2DI: + /* We deliberately omit RS6000_BIF_CMPLE_1TI and RS6000_BIF_CMPLE_U1TI + for now, because gimple folding produces worse code for 128-bit + compares. */ + fold_compare_helper (gsi, LE_EXPR, stmt); + return true; + + /* flavors of vec_splat_[us]{8,16,32}. */ + case RS6000_BIF_VSPLTISB: + case RS6000_BIF_VSPLTISH: + case RS6000_BIF_VSPLTISW: + { + arg0 = gimple_call_arg (stmt, 0); + lhs = gimple_call_lhs (stmt); + + /* Only fold the vec_splat_*() if the lower bits of arg 0 is a + 5-bit signed constant in range -16 to +15. 
*/ + if (TREE_CODE (arg0) != INTEGER_CST + || !IN_RANGE (TREE_INT_CST_LOW (arg0), -16, 15)) + return false; + gimple_seq stmts = NULL; + location_t loc = gimple_location (stmt); + tree splat_value = gimple_convert (&stmts, loc, + TREE_TYPE (TREE_TYPE (lhs)), arg0); + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); + tree splat_tree = build_vector_from_val (TREE_TYPE (lhs), splat_value); + g = gimple_build_assign (lhs, splat_tree); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + } + + /* Flavors of vec_splat. */ + /* a = vec_splat (b, 0x3) becomes a = { b[3],b[3],b[3],...}; */ + case RS6000_BIF_VSPLTB: + case RS6000_BIF_VSPLTH: + case RS6000_BIF_VSPLTW: + case RS6000_BIF_XXSPLTD_V2DI: + case RS6000_BIF_XXSPLTD_V2DF: + { + arg0 = gimple_call_arg (stmt, 0); /* input vector. */ + arg1 = gimple_call_arg (stmt, 1); /* index into arg0. */ + /* Only fold the vec_splat_*() if arg1 is both a constant value and + is a valid index into the arg0 vector. */ + unsigned int n_elts = VECTOR_CST_NELTS (arg0); + if (TREE_CODE (arg1) != INTEGER_CST + || TREE_INT_CST_LOW (arg1) > (n_elts -1)) + return false; + lhs = gimple_call_lhs (stmt); + tree lhs_type = TREE_TYPE (lhs); + tree arg0_type = TREE_TYPE (arg0); + tree splat; + if (TREE_CODE (arg0) == VECTOR_CST) + splat = VECTOR_CST_ELT (arg0, TREE_INT_CST_LOW (arg1)); + else + { + /* Determine (in bits) the length and start location of the + splat value for a call to the tree_vec_extract helper. */ + int splat_elem_size = TREE_INT_CST_LOW (size_in_bytes (arg0_type)) + * BITS_PER_UNIT / n_elts; + int splat_start_bit = TREE_INT_CST_LOW (arg1) * splat_elem_size; + tree len = build_int_cst (bitsizetype, splat_elem_size); + tree start = build_int_cst (bitsizetype, splat_start_bit); + splat = tree_vec_extract (gsi, TREE_TYPE (lhs_type), arg0, + len, start); + } + /* And finally, build the new vector. */ + tree splat_tree = build_vector_from_val (lhs_type, splat); + g = gimple_build_assign (lhs, splat_tree); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + } + + /* vec_mergel (integrals). */ + case RS6000_BIF_VMRGLH: + case RS6000_BIF_VMRGLW: + case RS6000_BIF_XXMRGLW_4SI: + case RS6000_BIF_VMRGLB: + case RS6000_BIF_VEC_MERGEL_V2DI: + case RS6000_BIF_XXMRGLW_4SF: + case RS6000_BIF_VEC_MERGEL_V2DF: + fold_mergehl_helper (gsi, stmt, 1); + return true; + /* vec_mergeh (integrals). */ + case RS6000_BIF_VMRGHH: + case RS6000_BIF_VMRGHW: + case RS6000_BIF_XXMRGHW_4SI: + case RS6000_BIF_VMRGHB: + case RS6000_BIF_VEC_MERGEH_V2DI: + case RS6000_BIF_XXMRGHW_4SF: + case RS6000_BIF_VEC_MERGEH_V2DF: + fold_mergehl_helper (gsi, stmt, 0); + return true; + + /* Flavors of vec_mergee. */ + case RS6000_BIF_VMRGEW_V4SI: + case RS6000_BIF_VMRGEW_V2DI: + case RS6000_BIF_VMRGEW_V4SF: + case RS6000_BIF_VMRGEW_V2DF: + fold_mergeeo_helper (gsi, stmt, 0); + return true; + /* Flavors of vec_mergeo. 
*/ + case RS6000_BIF_VMRGOW_V4SI: + case RS6000_BIF_VMRGOW_V2DI: + case RS6000_BIF_VMRGOW_V4SF: + case RS6000_BIF_VMRGOW_V2DF: + fold_mergeeo_helper (gsi, stmt, 1); + return true; + + /* d = vec_pack (a, b) */ + case RS6000_BIF_VPKUDUM: + case RS6000_BIF_VPKUHUM: + case RS6000_BIF_VPKUWUM: + { + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + lhs = gimple_call_lhs (stmt); + gimple *g = gimple_build_assign (lhs, VEC_PACK_TRUNC_EXPR, arg0, arg1); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + } + + /* d = vec_unpackh (a) */ + /* Note that the UNPACK_{HI,LO}_EXPR used in the gimple_build_assign call + in this code is sensitive to endian-ness, and needs to be inverted to + handle both LE and BE targets. */ + case RS6000_BIF_VUPKHSB: + case RS6000_BIF_VUPKHSH: + case RS6000_BIF_VUPKHSW: + { + arg0 = gimple_call_arg (stmt, 0); + lhs = gimple_call_lhs (stmt); + if (BYTES_BIG_ENDIAN) + g = gimple_build_assign (lhs, VEC_UNPACK_HI_EXPR, arg0); + else + g = gimple_build_assign (lhs, VEC_UNPACK_LO_EXPR, arg0); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + } + /* d = vec_unpackl (a) */ + case RS6000_BIF_VUPKLSB: + case RS6000_BIF_VUPKLSH: + case RS6000_BIF_VUPKLSW: + { + arg0 = gimple_call_arg (stmt, 0); + lhs = gimple_call_lhs (stmt); + if (BYTES_BIG_ENDIAN) + g = gimple_build_assign (lhs, VEC_UNPACK_LO_EXPR, arg0); + else + g = gimple_build_assign (lhs, VEC_UNPACK_HI_EXPR, arg0); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + } + /* There is no gimple type corresponding with pixel, so just return. */ + case RS6000_BIF_VUPKHPX: + case RS6000_BIF_VUPKLPX: + return false; + + /* vec_perm. */ + case RS6000_BIF_VPERM_16QI: + case RS6000_BIF_VPERM_8HI: + case RS6000_BIF_VPERM_4SI: + case RS6000_BIF_VPERM_2DI: + case RS6000_BIF_VPERM_4SF: + case RS6000_BIF_VPERM_2DF: + case RS6000_BIF_VPERM_16QI_UNS: + case RS6000_BIF_VPERM_8HI_UNS: + case RS6000_BIF_VPERM_4SI_UNS: + case RS6000_BIF_VPERM_2DI_UNS: + { + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + tree permute = gimple_call_arg (stmt, 2); + lhs = gimple_call_lhs (stmt); + location_t loc = gimple_location (stmt); + gimple_seq stmts = NULL; + // convert arg0 and arg1 to match the type of the permute + // for the VEC_PERM_EXPR operation. + tree permute_type = (TREE_TYPE (permute)); + tree arg0_ptype = gimple_build (&stmts, loc, VIEW_CONVERT_EXPR, + permute_type, arg0); + tree arg1_ptype = gimple_build (&stmts, loc, VIEW_CONVERT_EXPR, + permute_type, arg1); + tree lhs_ptype = gimple_build (&stmts, loc, VEC_PERM_EXPR, + permute_type, arg0_ptype, arg1_ptype, + permute); + // Convert the result back to the desired lhs type upon completion. 
+          tree temp = gimple_build (&stmts, loc, VIEW_CONVERT_EXPR,
+                                    TREE_TYPE (lhs), lhs_ptype);
+          gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
+          g = gimple_build_assign (lhs, temp);
+          gimple_set_location (g, loc);
+          gsi_replace (gsi, g, true);
+          return true;
+        }
+
+    default:
+      if (TARGET_DEBUG_BUILTIN)
+        fprintf (stderr, "gimple builtin intrinsic not matched:%d %s %s\n",
+                 fn_code, fn_name1, fn_name2);
+      break;
+    }
+
+  return false;
+}
+
+/* **** Expansion support **** */
+
+static rtx
+altivec_expand_predicate_builtin (enum insn_code icode, tree exp, rtx target)
+{
+  rtx pat, scratch;
+  tree cr6_form = CALL_EXPR_ARG (exp, 0);
+  tree arg0 = CALL_EXPR_ARG (exp, 1);
+  tree arg1 = CALL_EXPR_ARG (exp, 2);
+  rtx op0 = expand_normal (arg0);
+  rtx op1 = expand_normal (arg1);
+  machine_mode tmode = SImode;
+  machine_mode mode0 = insn_data[icode].operand[1].mode;
+  machine_mode mode1 = insn_data[icode].operand[2].mode;
+  int cr6_form_int;
+
+  if (TREE_CODE (cr6_form) != INTEGER_CST)
+    {
+      error ("argument 1 of %qs must be a constant",
+             "__builtin_altivec_predicate");
+      return const0_rtx;
+    }
+  else
+    cr6_form_int = TREE_INT_CST_LOW (cr6_form);
+
+  gcc_assert (mode0 == mode1);
+
+  /* If we have invalid arguments, bail out before generating bad rtl.  */
+  if (arg0 == error_mark_node || arg1 == error_mark_node)
+    return const0_rtx;
+
+  if (target == 0
+      || GET_MODE (target) != tmode
+      || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
+    target = gen_reg_rtx (tmode);
+
+  if (! (*insn_data[icode].operand[1].predicate) (op0, mode0))
+    op0 = copy_to_mode_reg (mode0, op0);
+  if (! (*insn_data[icode].operand[2].predicate) (op1, mode1))
+    op1 = copy_to_mode_reg (mode1, op1);
+
+  /* Note that for many of the relevant operations (e.g. cmpne or
+     cmpeq) with float or double operands, it makes more sense for the
+     mode of the allocated scratch register to select a vector of
+     integer.  But the choice to copy the mode of operand 0 was made
+     long ago and there are no plans to change it.  */
+  scratch = gen_reg_rtx (mode0);
+
+  pat = GEN_FCN (icode) (scratch, op0, op1);
+  if (! pat)
+    return 0;
+  emit_insn (pat);
+
+  /* The vec_any* and vec_all* predicates use the same opcodes for two
+     different operations, but the bits in CR6 will be different
+     depending on what information we want.  So we have to play tricks
+     with CR6 to get the right bits out.
+
+     If you think this is disgusting, look at the specs for the
+     AltiVec predicates.  */
+
+  switch (cr6_form_int)
+    {
+    case 0:
+      emit_insn (gen_cr6_test_for_zero (target));
+      break;
+    case 1:
+      emit_insn (gen_cr6_test_for_zero_reverse (target));
+      break;
+    case 2:
+      emit_insn (gen_cr6_test_for_lt (target));
+      break;
+    case 3:
+      emit_insn (gen_cr6_test_for_lt_reverse (target));
+      break;
+    default:
+      error ("argument 1 of %qs is out of range",
+             "__builtin_altivec_predicate");
+      break;
+    }
+
+  return target;
+}
+
+/* Expand vec_init builtin.  */
+static rtx
+altivec_expand_vec_init_builtin (tree type, tree exp, rtx target)
+{
+  machine_mode tmode = TYPE_MODE (type);
+  machine_mode inner_mode = GET_MODE_INNER (tmode);
+  int i, n_elt = GET_MODE_NUNITS (tmode);
+
+  gcc_assert (VECTOR_MODE_P (tmode));
+  gcc_assert (n_elt == call_expr_nargs (exp));
+
+  if (!target || !register_operand (target, tmode))
+    target = gen_reg_rtx (tmode);
+
+  /* If we have a vector comprised of a single element, such as V1TImode, do
+     the initialization directly.  */
+  if (n_elt == 1 && GET_MODE_SIZE (tmode) == GET_MODE_SIZE (inner_mode))
+    {
+      rtx x = expand_normal (CALL_EXPR_ARG (exp, 0));
+      emit_move_insn (target, gen_lowpart (tmode, x));
+    }
+  else
+    {
+      rtvec v = rtvec_alloc (n_elt);
+
+      for (i = 0; i < n_elt; ++i)
+        {
+          rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
+          RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
+        }
+
+      rs6000_expand_vector_init (target, gen_rtx_PARALLEL (tmode, v));
+    }
+
+  return target;
+}
+
+/* Return the integer constant in ARG.  Constrain it to be in the range
+   of the subparts of VEC_TYPE; issue an error if not.  */
+
+static int
+get_element_number (tree vec_type, tree arg)
+{
+  unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
+
+  if (!tree_fits_uhwi_p (arg)
+      || (elt = tree_to_uhwi (arg), elt > max))
+    {
+      error ("selector must be an integer constant in the range [0, %wi]", max);
+      return 0;
+    }
+
+  return elt;
+}
+
+/* Expand vec_set builtin.  */
+static rtx
+altivec_expand_vec_set_builtin (tree exp)
+{
+  machine_mode tmode, mode1;
+  tree arg0, arg1, arg2;
+  int elt;
+  rtx op0, op1;
+
+  arg0 = CALL_EXPR_ARG (exp, 0);
+  arg1 = CALL_EXPR_ARG (exp, 1);
+  arg2 = CALL_EXPR_ARG (exp, 2);
+
+  tmode = TYPE_MODE (TREE_TYPE (arg0));
+  mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
+  gcc_assert (VECTOR_MODE_P (tmode));
+
+  op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
+  op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
+  elt = get_element_number (TREE_TYPE (arg0), arg2);
+
+  if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
+    op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
+
+  op0 = force_reg (tmode, op0);
+  op1 = force_reg (mode1, op1);
+
+  rs6000_expand_vector_set (op0, op1, GEN_INT (elt));
+
+  return op0;
+}
+
+/* Expand vec_ext builtin.  */
+static rtx
+altivec_expand_vec_ext_builtin (tree exp, rtx target)
+{
+  machine_mode tmode, mode0;
+  tree arg0, arg1;
+  rtx op0;
+  rtx op1;
+
+  arg0 = CALL_EXPR_ARG (exp, 0);
+  arg1 = CALL_EXPR_ARG (exp, 1);
+
+  op0 = expand_normal (arg0);
+  op1 = expand_normal (arg1);
+
+  if (TREE_CODE (arg1) == INTEGER_CST)
+    {
+      unsigned HOST_WIDE_INT elt;
+      unsigned HOST_WIDE_INT size = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
+      unsigned int truncated_selector;
+      /* Even if !tree_fits_uhwi_p (arg1), TREE_INT_CST_LOW (arg1) returns
+         the low-order bits of the INTEGER_CST for modulo indexing.  */
+      elt = TREE_INT_CST_LOW (arg1);
+      truncated_selector = elt % size;
+      op1 = GEN_INT (truncated_selector);
+    }
+
+  tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
+  mode0 = TYPE_MODE (TREE_TYPE (arg0));
+  gcc_assert (VECTOR_MODE_P (mode0));
+
+  op0 = force_reg (mode0, op0);
+
+  if (optimize || !target || !register_operand (target, tmode))
+    target = gen_reg_rtx (tmode);
+
+  rs6000_expand_vector_extract (target, op0, op1);
+
+  return target;
+}
+
+/* Expand ALTIVEC_BUILTIN_MASK_FOR_LOAD.  */
+rtx
+rs6000_expand_ldst_mask (rtx target, tree arg0)
+{
+  int icode2 = BYTES_BIG_ENDIAN ? (int) CODE_FOR_altivec_lvsr_direct
+                                : (int) CODE_FOR_altivec_lvsl_direct;
+  machine_mode tmode = insn_data[icode2].operand[0].mode;
+  machine_mode mode = insn_data[icode2].operand[1].mode;
+
+  gcc_assert (TARGET_ALTIVEC);
+
+  gcc_assert (POINTER_TYPE_P (TREE_TYPE (arg0)));
+  rtx op = expand_expr (arg0, NULL_RTX, Pmode, EXPAND_NORMAL);
+  rtx addr = memory_address (mode, op);
+  /* We need to negate the address.
*/ + op = gen_reg_rtx (GET_MODE (addr)); + emit_insn (gen_rtx_SET (op, gen_rtx_NEG (GET_MODE (addr), addr))); + op = gen_rtx_MEM (mode, op); + + if (target == 0 + || GET_MODE (target) != tmode + || !insn_data[icode2].operand[0].predicate (target, tmode)) + target = gen_reg_rtx (tmode); + + rtx pat = GEN_FCN (icode2) (target, op); + if (!pat) + return 0; + emit_insn (pat); + + return target; +} + +/* Used by __builtin_cpu_is(), mapping from PLATFORM names to values. */ +static const struct +{ + const char *cpu; + unsigned int cpuid; +} cpu_is_info[] = { + { "power10", PPC_PLATFORM_POWER10 }, + { "power9", PPC_PLATFORM_POWER9 }, + { "power8", PPC_PLATFORM_POWER8 }, + { "power7", PPC_PLATFORM_POWER7 }, + { "power6x", PPC_PLATFORM_POWER6X }, + { "power6", PPC_PLATFORM_POWER6 }, + { "power5+", PPC_PLATFORM_POWER5_PLUS }, + { "power5", PPC_PLATFORM_POWER5 }, + { "ppc970", PPC_PLATFORM_PPC970 }, + { "power4", PPC_PLATFORM_POWER4 }, + { "ppca2", PPC_PLATFORM_PPCA2 }, + { "ppc476", PPC_PLATFORM_PPC476 }, + { "ppc464", PPC_PLATFORM_PPC464 }, + { "ppc440", PPC_PLATFORM_PPC440 }, + { "ppc405", PPC_PLATFORM_PPC405 }, + { "ppc-cell-be", PPC_PLATFORM_CELL_BE } +}; + +/* Used by __builtin_cpu_supports(), mapping from HWCAP names to masks. */ +static const struct +{ + const char *hwcap; + int mask; + unsigned int id; +} cpu_supports_info[] = { + /* AT_HWCAP masks. */ + { "4xxmac", PPC_FEATURE_HAS_4xxMAC, 0 }, + { "altivec", PPC_FEATURE_HAS_ALTIVEC, 0 }, + { "arch_2_05", PPC_FEATURE_ARCH_2_05, 0 }, + { "arch_2_06", PPC_FEATURE_ARCH_2_06, 0 }, + { "archpmu", PPC_FEATURE_PERFMON_COMPAT, 0 }, + { "booke", PPC_FEATURE_BOOKE, 0 }, + { "cellbe", PPC_FEATURE_CELL_BE, 0 }, + { "dfp", PPC_FEATURE_HAS_DFP, 0 }, + { "efpdouble", PPC_FEATURE_HAS_EFP_DOUBLE, 0 }, + { "efpsingle", PPC_FEATURE_HAS_EFP_SINGLE, 0 }, + { "fpu", PPC_FEATURE_HAS_FPU, 0 }, + { "ic_snoop", PPC_FEATURE_ICACHE_SNOOP, 0 }, + { "mmu", PPC_FEATURE_HAS_MMU, 0 }, + { "notb", PPC_FEATURE_NO_TB, 0 }, + { "pa6t", PPC_FEATURE_PA6T, 0 }, + { "power4", PPC_FEATURE_POWER4, 0 }, + { "power5", PPC_FEATURE_POWER5, 0 }, + { "power5+", PPC_FEATURE_POWER5_PLUS, 0 }, + { "power6x", PPC_FEATURE_POWER6_EXT, 0 }, + { "ppc32", PPC_FEATURE_32, 0 }, + { "ppc601", PPC_FEATURE_601_INSTR, 0 }, + { "ppc64", PPC_FEATURE_64, 0 }, + { "ppcle", PPC_FEATURE_PPC_LE, 0 }, + { "smt", PPC_FEATURE_SMT, 0 }, + { "spe", PPC_FEATURE_HAS_SPE, 0 }, + { "true_le", PPC_FEATURE_TRUE_LE, 0 }, + { "ucache", PPC_FEATURE_UNIFIED_CACHE, 0 }, + { "vsx", PPC_FEATURE_HAS_VSX, 0 }, + + /* AT_HWCAP2 masks. */ + { "arch_2_07", PPC_FEATURE2_ARCH_2_07, 1 }, + { "dscr", PPC_FEATURE2_HAS_DSCR, 1 }, + { "ebb", PPC_FEATURE2_HAS_EBB, 1 }, + { "htm", PPC_FEATURE2_HAS_HTM, 1 }, + { "htm-nosc", PPC_FEATURE2_HTM_NOSC, 1 }, + { "htm-no-suspend", PPC_FEATURE2_HTM_NO_SUSPEND, 1 }, + { "isel", PPC_FEATURE2_HAS_ISEL, 1 }, + { "tar", PPC_FEATURE2_HAS_TAR, 1 }, + { "vcrypto", PPC_FEATURE2_HAS_VEC_CRYPTO, 1 }, + { "arch_3_00", PPC_FEATURE2_ARCH_3_00, 1 }, + { "ieee128", PPC_FEATURE2_HAS_IEEE128, 1 }, + { "darn", PPC_FEATURE2_DARN, 1 }, + { "scv", PPC_FEATURE2_SCV, 1 }, + { "arch_3_1", PPC_FEATURE2_ARCH_3_1, 1 }, + { "mma", PPC_FEATURE2_MMA, 1 }, +}; + +/* Expand the CPU builtin in FCODE and store the result in TARGET. */ +static rtx +cpu_expand_builtin (enum rs6000_gen_builtins fcode, + tree exp ATTRIBUTE_UNUSED, rtx target) +{ + /* __builtin_cpu_init () is a nop, so expand to nothing. 
*/ + if (fcode == RS6000_BIF_CPU_INIT) + return const0_rtx; + + if (target == 0 || GET_MODE (target) != SImode) + target = gen_reg_rtx (SImode); + + /* TODO: Factor the #ifdef'd code into a separate function. */ +#ifdef TARGET_LIBC_PROVIDES_HWCAP_IN_TCB + tree arg = TREE_OPERAND (CALL_EXPR_ARG (exp, 0), 0); + /* Target clones creates an ARRAY_REF instead of STRING_CST, convert it back + to a STRING_CST. */ + if (TREE_CODE (arg) == ARRAY_REF + && TREE_CODE (TREE_OPERAND (arg, 0)) == STRING_CST + && TREE_CODE (TREE_OPERAND (arg, 1)) == INTEGER_CST + && compare_tree_int (TREE_OPERAND (arg, 1), 0) == 0) + arg = TREE_OPERAND (arg, 0); + + if (TREE_CODE (arg) != STRING_CST) + { + error ("builtin %qs only accepts a string argument", + rs6000_builtin_info[(size_t) fcode].bifname); + return const0_rtx; + } + + if (fcode == RS6000_BIF_CPU_IS) + { + const char *cpu = TREE_STRING_POINTER (arg); + rtx cpuid = NULL_RTX; + for (size_t i = 0; i < ARRAY_SIZE (cpu_is_info); i++) + if (strcmp (cpu, cpu_is_info[i].cpu) == 0) + { + /* The CPUID value in the TCB is offset by _DL_FIRST_PLATFORM. */ + cpuid = GEN_INT (cpu_is_info[i].cpuid + _DL_FIRST_PLATFORM); + break; + } + if (cpuid == NULL_RTX) + { + /* Invalid CPU argument. */ + error ("cpu %qs is an invalid argument to builtin %qs", + cpu, rs6000_builtin_info[(size_t) fcode].bifname); + return const0_rtx; + } + + rtx platform = gen_reg_rtx (SImode); + rtx address = gen_rtx_PLUS (Pmode, + gen_rtx_REG (Pmode, TLS_REGNUM), + GEN_INT (TCB_PLATFORM_OFFSET)); + rtx tcbmem = gen_const_mem (SImode, address); + emit_move_insn (platform, tcbmem); + emit_insn (gen_eqsi3 (target, platform, cpuid)); + } + else if (fcode == RS6000_BIF_CPU_SUPPORTS) + { + const char *hwcap = TREE_STRING_POINTER (arg); + rtx mask = NULL_RTX; + int hwcap_offset; + for (size_t i = 0; i < ARRAY_SIZE (cpu_supports_info); i++) + if (strcmp (hwcap, cpu_supports_info[i].hwcap) == 0) + { + mask = GEN_INT (cpu_supports_info[i].mask); + hwcap_offset = TCB_HWCAP_OFFSET (cpu_supports_info[i].id); + break; + } + if (mask == NULL_RTX) + { + /* Invalid HWCAP argument. */ + error ("%s %qs is an invalid argument to builtin %qs", + "hwcap", hwcap, + rs6000_builtin_info[(size_t) fcode].bifname); + return const0_rtx; + } + + rtx tcb_hwcap = gen_reg_rtx (SImode); + rtx address = gen_rtx_PLUS (Pmode, + gen_rtx_REG (Pmode, TLS_REGNUM), + GEN_INT (hwcap_offset)); + rtx tcbmem = gen_const_mem (SImode, address); + emit_move_insn (tcb_hwcap, tcbmem); + rtx scratch1 = gen_reg_rtx (SImode); + emit_insn (gen_rtx_SET (scratch1, + gen_rtx_AND (SImode, tcb_hwcap, mask))); + rtx scratch2 = gen_reg_rtx (SImode); + emit_insn (gen_eqsi3 (scratch2, scratch1, const0_rtx)); + emit_insn (gen_rtx_SET (target, + gen_rtx_XOR (SImode, scratch2, const1_rtx))); + } + else + gcc_unreachable (); + + /* Record that we have expanded a CPU builtin, so that we can later + emit a reference to the special symbol exported by LIBC to ensure we + do not link against an old LIBC that doesn't support this feature. */ + cpu_builtin_p = true; + +#else + warning (0, "builtin %qs needs GLIBC (2.23 and newer) that exports hardware " + "capability bits", rs6000_builtin_info[(size_t) fcode].bifname); + + /* For old LIBCs, always return FALSE. */ + emit_move_insn (target, GEN_INT (0)); +#endif /* TARGET_LIBC_PROVIDES_HWCAP_IN_TCB */ + + return target; +} + +/* For the element-reversing load/store built-ins, produce the correct + insn_code depending on the target endianness. 
*/ +static insn_code +elemrev_icode (rs6000_gen_builtins fcode) +{ + switch (fcode) + { + case RS6000_BIF_ST_ELEMREV_V1TI: + return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_store_v1ti + : CODE_FOR_vsx_st_elemrev_v1ti; + + case RS6000_BIF_ST_ELEMREV_V2DF: + return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_store_v2df + : CODE_FOR_vsx_st_elemrev_v2df; + + case RS6000_BIF_ST_ELEMREV_V2DI: + return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_store_v2di + : CODE_FOR_vsx_st_elemrev_v2di; + + case RS6000_BIF_ST_ELEMREV_V4SF: + return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_store_v4sf + : CODE_FOR_vsx_st_elemrev_v4sf; + + case RS6000_BIF_ST_ELEMREV_V4SI: + return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_store_v4si + : CODE_FOR_vsx_st_elemrev_v4si; + + case RS6000_BIF_ST_ELEMREV_V8HI: + return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_store_v8hi + : CODE_FOR_vsx_st_elemrev_v8hi; + + case RS6000_BIF_ST_ELEMREV_V16QI: + return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_store_v16qi + : CODE_FOR_vsx_st_elemrev_v16qi; + + case RS6000_BIF_LD_ELEMREV_V2DF: + return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_load_v2df + : CODE_FOR_vsx_ld_elemrev_v2df; + + case RS6000_BIF_LD_ELEMREV_V1TI: + return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_load_v1ti + : CODE_FOR_vsx_ld_elemrev_v1ti; + + case RS6000_BIF_LD_ELEMREV_V2DI: + return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_load_v2di + : CODE_FOR_vsx_ld_elemrev_v2di; + + case RS6000_BIF_LD_ELEMREV_V4SF: + return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_load_v4sf + : CODE_FOR_vsx_ld_elemrev_v4sf; + + case RS6000_BIF_LD_ELEMREV_V4SI: + return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_load_v4si + : CODE_FOR_vsx_ld_elemrev_v4si; + + case RS6000_BIF_LD_ELEMREV_V8HI: + return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_load_v8hi + : CODE_FOR_vsx_ld_elemrev_v8hi; + + case RS6000_BIF_LD_ELEMREV_V16QI: + return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_load_v16qi + : CODE_FOR_vsx_ld_elemrev_v16qi; + default: + ; + } + + gcc_unreachable (); +} + +/* Expand an AltiVec vector load builtin, and return the expanded rtx. */ +static rtx +ldv_expand_builtin (rtx target, insn_code icode, rtx *op, machine_mode tmode) +{ + if (target == 0 + || GET_MODE (target) != tmode + || !insn_data[icode].operand[0].predicate (target, tmode)) + target = gen_reg_rtx (tmode); + + op[1] = copy_to_mode_reg (Pmode, op[1]); + + /* These CELL built-ins use BLKmode instead of tmode for historical + (i.e., unknown) reasons. TODO: Is this necessary? */ + bool blk = (icode == CODE_FOR_altivec_lvlx + || icode == CODE_FOR_altivec_lvlxl + || icode == CODE_FOR_altivec_lvrx + || icode == CODE_FOR_altivec_lvrxl); + + /* For LVX, express the RTL accurately by ANDing the address with -16. + LVXL and LVE*X expand to use UNSPECs to hide their special behavior, + so the raw address is fine. */ + /* TODO: That statement seems wrong, as the UNSPECs don't surround the + memory expression, so a latent bug may lie here. The &-16 is likely + needed for all VMX-style loads. */ + if (icode == CODE_FOR_altivec_lvx_v1ti + || icode == CODE_FOR_altivec_lvx_v2df + || icode == CODE_FOR_altivec_lvx_v2di + || icode == CODE_FOR_altivec_lvx_v4sf + || icode == CODE_FOR_altivec_lvx_v4si + || icode == CODE_FOR_altivec_lvx_v8hi + || icode == CODE_FOR_altivec_lvx_v16qi) + { + rtx rawaddr; + if (op[0] == const0_rtx) + rawaddr = op[1]; + else + { + op[0] = copy_to_mode_reg (Pmode, op[0]); + rawaddr = gen_rtx_PLUS (Pmode, op[1], op[0]); + } + rtx addr = gen_rtx_AND (Pmode, rawaddr, gen_rtx_CONST_INT (Pmode, -16)); + addr = gen_rtx_MEM (blk ? 
BLKmode : tmode, addr); + + emit_insn (gen_rtx_SET (target, addr)); + } + else + { + rtx addr; + if (op[0] == const0_rtx) + addr = gen_rtx_MEM (blk ? BLKmode : tmode, op[1]); + else + { + op[0] = copy_to_mode_reg (Pmode, op[0]); + addr = gen_rtx_MEM (blk ? BLKmode : tmode, + gen_rtx_PLUS (Pmode, op[1], op[0])); + } + + rtx pat = GEN_FCN (icode) (target, addr); + if (!pat) + return 0; + emit_insn (pat); + } + + return target; +} + +/* Expand a builtin function that loads a scalar into a vector register + with sign extension, and return the expanded rtx. */ +static rtx +lxvrse_expand_builtin (rtx target, insn_code icode, rtx *op, + machine_mode tmode, machine_mode smode) +{ + rtx pat, addr; + op[1] = copy_to_mode_reg (Pmode, op[1]); + + if (op[0] == const0_rtx) + addr = gen_rtx_MEM (tmode, op[1]); + else + { + op[0] = copy_to_mode_reg (Pmode, op[0]); + addr = gen_rtx_MEM (smode, + gen_rtx_PLUS (Pmode, op[1], op[0])); + } + + rtx discratch = gen_reg_rtx (V2DImode); + rtx tiscratch = gen_reg_rtx (TImode); + + /* Emit the lxvr*x insn. */ + pat = GEN_FCN (icode) (tiscratch, addr); + if (!pat) + return 0; + emit_insn (pat); + + /* Emit a sign extension from V16QI,V8HI,V4SI to V2DI. */ + rtx temp1; + if (icode == CODE_FOR_vsx_lxvrbx) + { + temp1 = simplify_gen_subreg (V16QImode, tiscratch, TImode, 0); + emit_insn (gen_vsx_sign_extend_qi_v2di (discratch, temp1)); + } + else if (icode == CODE_FOR_vsx_lxvrhx) + { + temp1 = simplify_gen_subreg (V8HImode, tiscratch, TImode, 0); + emit_insn (gen_vsx_sign_extend_hi_v2di (discratch, temp1)); + } + else if (icode == CODE_FOR_vsx_lxvrwx) + { + temp1 = simplify_gen_subreg (V4SImode, tiscratch, TImode, 0); + emit_insn (gen_vsx_sign_extend_si_v2di (discratch, temp1)); + } + else if (icode == CODE_FOR_vsx_lxvrdx) + discratch = simplify_gen_subreg (V2DImode, tiscratch, TImode, 0); + else + gcc_unreachable (); + + /* Emit the sign extension from V2DI (double) to TI (quad). */ + rtx temp2 = simplify_gen_subreg (TImode, discratch, V2DImode, 0); + emit_insn (gen_extendditi2_vector (target, temp2)); + + return target; +} + +/* Expand a builtin function that loads a scalar into a vector register + with zero extension, and return the expanded rtx. */ +static rtx +lxvrze_expand_builtin (rtx target, insn_code icode, rtx *op, + machine_mode tmode, machine_mode smode) +{ + rtx pat, addr; + op[1] = copy_to_mode_reg (Pmode, op[1]); + + if (op[0] == const0_rtx) + addr = gen_rtx_MEM (tmode, op[1]); + else + { + op[0] = copy_to_mode_reg (Pmode, op[0]); + addr = gen_rtx_MEM (smode, + gen_rtx_PLUS (Pmode, op[1], op[0])); + } + + pat = GEN_FCN (icode) (target, addr); + if (!pat) + return 0; + emit_insn (pat); + return target; +} + +/* Expand an AltiVec vector store builtin, and return the expanded rtx. */ +static rtx +stv_expand_builtin (insn_code icode, rtx *op, + machine_mode tmode, machine_mode smode) +{ + op[2] = copy_to_mode_reg (Pmode, op[2]); + + /* For STVX, express the RTL accurately by ANDing the address with -16. + STVXL and STVE*X expand to use UNSPECs to hide their special behavior, + so the raw address is fine. */ + /* TODO: That statement seems wrong, as the UNSPECs don't surround the + memory expression, so a latent bug may lie here. The &-16 is likely + needed for all VMX-style stores. 
*/ + if (icode == CODE_FOR_altivec_stvx_v2df + || icode == CODE_FOR_altivec_stvx_v2di + || icode == CODE_FOR_altivec_stvx_v4sf + || icode == CODE_FOR_altivec_stvx_v4si + || icode == CODE_FOR_altivec_stvx_v8hi + || icode == CODE_FOR_altivec_stvx_v16qi) + { + rtx rawaddr; + if (op[1] == const0_rtx) + rawaddr = op[2]; + else + { + op[1] = copy_to_mode_reg (Pmode, op[1]); + rawaddr = gen_rtx_PLUS (Pmode, op[2], op[1]); + } + + rtx addr = gen_rtx_AND (Pmode, rawaddr, gen_rtx_CONST_INT (Pmode, -16)); + addr = gen_rtx_MEM (tmode, addr); + op[0] = copy_to_mode_reg (tmode, op[0]); + emit_insn (gen_rtx_SET (addr, op[0])); + } + else if (icode == CODE_FOR_vsx_stxvrbx + || icode == CODE_FOR_vsx_stxvrhx + || icode == CODE_FOR_vsx_stxvrwx + || icode == CODE_FOR_vsx_stxvrdx) + { + rtx truncrtx = gen_rtx_TRUNCATE (tmode, op[0]); + op[0] = copy_to_mode_reg (E_TImode, truncrtx); + + rtx addr; + if (op[1] == const0_rtx) + addr = gen_rtx_MEM (Pmode, op[2]); + else + { + op[1] = copy_to_mode_reg (Pmode, op[1]); + addr = gen_rtx_MEM (tmode, gen_rtx_PLUS (Pmode, op[2], op[1])); + } + rtx pat = GEN_FCN (icode) (addr, op[0]); + if (pat) + emit_insn (pat); + } + else + { + if (!insn_data[icode].operand[1].predicate (op[0], smode)) + op[0] = copy_to_mode_reg (smode, op[0]); + + rtx addr; + if (op[1] == const0_rtx) + addr = gen_rtx_MEM (tmode, op[2]); + else + { + op[1] = copy_to_mode_reg (Pmode, op[1]); + addr = gen_rtx_MEM (tmode, gen_rtx_PLUS (Pmode, op[2], op[1])); + } + + rtx pat = GEN_FCN (icode) (addr, op[0]); + if (pat) + emit_insn (pat); + } + + return NULL_RTX; +} + +/* Expand the MMA built-in in EXP, and return it. */ +static rtx +mma_expand_builtin (tree exp, rtx target, insn_code icode, + rs6000_gen_builtins fcode) +{ + tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0); + bool void_func = TREE_TYPE (TREE_TYPE (fndecl)) == void_type_node; + machine_mode tmode = VOIDmode; + rtx op[MAX_MMA_OPERANDS]; + unsigned nopnds = 0; + + if (!void_func) + { + tmode = insn_data[icode].operand[0].mode; + if (!(target + && GET_MODE (target) == tmode + && insn_data[icode].operand[0].predicate (target, tmode))) + target = gen_reg_rtx (tmode); + op[nopnds++] = target; + } + else + target = const0_rtx; + + call_expr_arg_iterator iter; + tree arg; + FOR_EACH_CALL_EXPR_ARG (arg, iter, exp) + { + if (arg == error_mark_node) + return const0_rtx; + + rtx opnd; + const struct insn_operand_data *insn_op; + insn_op = &insn_data[icode].operand[nopnds]; + if (TREE_CODE (arg) == ADDR_EXPR + && MEM_P (DECL_RTL (TREE_OPERAND (arg, 0)))) + opnd = DECL_RTL (TREE_OPERAND (arg, 0)); + else + opnd = expand_normal (arg); + + if (!insn_op->predicate (opnd, insn_op->mode)) + { + /* TODO: This use of constraints needs explanation. */ + if (!strcmp (insn_op->constraint, "n")) + { + if (!CONST_INT_P (opnd)) + error ("argument %d must be an unsigned literal", nopnds); + else + error ("argument %d is an unsigned literal that is " + "out of range", nopnds); + return const0_rtx; + } + opnd = copy_to_mode_reg (insn_op->mode, opnd); + } + + /* Some MMA instructions have INOUT accumulator operands, so force + their target register to be the same as their input register. 
*/ + if (!void_func + && nopnds == 1 + && !strcmp (insn_op->constraint, "0") + && insn_op->mode == tmode + && REG_P (opnd) + && insn_data[icode].operand[0].predicate (opnd, tmode)) + target = op[0] = opnd; + + op[nopnds++] = opnd; + } + + rtx pat; + switch (nopnds) + { + case 1: + pat = GEN_FCN (icode) (op[0]); + break; + case 2: + pat = GEN_FCN (icode) (op[0], op[1]); + break; + case 3: + /* The ASSEMBLE builtin source operands are reversed in little-endian + mode, so reorder them. */ + if (fcode == RS6000_BIF_ASSEMBLE_PAIR_V_INTERNAL && !WORDS_BIG_ENDIAN) + std::swap (op[1], op[2]); + pat = GEN_FCN (icode) (op[0], op[1], op[2]); + break; + case 4: + pat = GEN_FCN (icode) (op[0], op[1], op[2], op[3]); + break; + case 5: + /* The ASSEMBLE builtin source operands are reversed in little-endian + mode, so reorder them. */ + if (fcode == RS6000_BIF_ASSEMBLE_ACC_INTERNAL && !WORDS_BIG_ENDIAN) + { + std::swap (op[1], op[4]); + std::swap (op[2], op[3]); + } + pat = GEN_FCN (icode) (op[0], op[1], op[2], op[3], op[4]); + break; + case 6: + pat = GEN_FCN (icode) (op[0], op[1], op[2], op[3], op[4], op[5]); + break; + case 7: + pat = GEN_FCN (icode) (op[0], op[1], op[2], op[3], op[4], op[5], op[6]); + break; + default: + gcc_unreachable (); + } + + if (!pat) + return NULL_RTX; + + emit_insn (pat); + return target; +} + +/* Return the correct ICODE value depending on whether we are + setting or reading the HTM SPRs. */ +static inline enum insn_code +rs6000_htm_spr_icode (bool nonvoid) +{ + if (nonvoid) + return (TARGET_POWERPC64) ? CODE_FOR_htm_mfspr_di : CODE_FOR_htm_mfspr_si; + else + return (TARGET_POWERPC64) ? CODE_FOR_htm_mtspr_di : CODE_FOR_htm_mtspr_si; +} + +/* Return the appropriate SPR number associated with the given builtin. */ +static inline HOST_WIDE_INT +htm_spr_num (enum rs6000_gen_builtins code) +{ + if (code == RS6000_BIF_GET_TFHAR + || code == RS6000_BIF_SET_TFHAR) + return TFHAR_SPR; + else if (code == RS6000_BIF_GET_TFIAR + || code == RS6000_BIF_SET_TFIAR) + return TFIAR_SPR; + else if (code == RS6000_BIF_GET_TEXASR + || code == RS6000_BIF_SET_TEXASR) + return TEXASR_SPR; + gcc_assert (code == RS6000_BIF_GET_TEXASRU + || code == RS6000_BIF_SET_TEXASRU); + return TEXASRU_SPR; +} + +/* Expand the HTM builtin in EXP and store the result in TARGET. + Return the expanded rtx. */ +static rtx +htm_expand_builtin (bifdata *bifaddr, rs6000_gen_builtins fcode, + tree exp, rtx target) +{ + if (!TARGET_POWERPC64 + && (fcode == RS6000_BIF_TABORTDC + || fcode == RS6000_BIF_TABORTDCI)) + { + error ("builtin %qs is only valid in 64-bit mode", bifaddr->bifname); + return const0_rtx; + } + + tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0); + bool nonvoid = TREE_TYPE (TREE_TYPE (fndecl)) != void_type_node; + bool uses_spr = bif_is_htmspr (*bifaddr); + insn_code icode = bifaddr->icode; + + if (uses_spr) + icode = rs6000_htm_spr_icode (nonvoid); + + rtx op[MAX_HTM_OPERANDS]; + int nopnds = 0; + const insn_operand_data *insn_op = &insn_data[icode].operand[0]; + + if (nonvoid) + { + machine_mode tmode = (uses_spr) ? 
insn_op->mode : E_SImode; + if (!target + || GET_MODE (target) != tmode + || (uses_spr && !insn_op->predicate (target, tmode))) + target = gen_reg_rtx (tmode); + if (uses_spr) + op[nopnds++] = target; + } + + tree arg; + call_expr_arg_iterator iter; + + FOR_EACH_CALL_EXPR_ARG (arg, iter, exp) + { + if (arg == error_mark_node || nopnds >= MAX_HTM_OPERANDS) + return const0_rtx; + + insn_op = &insn_data[icode].operand[nopnds]; + op[nopnds] = expand_normal (arg); + + if (!insn_op->predicate (op[nopnds], insn_op->mode)) + { + /* TODO: This use of constraints could use explanation. + This happens a couple of places, perhaps make that a + function to document what's happening. */ + if (!strcmp (insn_op->constraint, "n")) + { + int arg_num = nonvoid ? nopnds : nopnds + 1; + if (!CONST_INT_P (op[nopnds])) + error ("argument %d must be an unsigned literal", arg_num); + else + error ("argument %d is an unsigned literal that is " + "out of range", arg_num); + return const0_rtx; + } + op[nopnds] = copy_to_mode_reg (insn_op->mode, op[nopnds]); + } + + nopnds++; + } + + /* Handle the builtins for extended mnemonics. These accept + no arguments, but map to builtins that take arguments. */ + switch (fcode) + { + case RS6000_BIF_TENDALL: /* Alias for: tend. 1 */ + case RS6000_BIF_TRESUME: /* Alias for: tsr. 1 */ + op[nopnds++] = GEN_INT (1); + break; + case RS6000_BIF_TSUSPEND: /* Alias for: tsr. 0 */ + op[nopnds++] = GEN_INT (0); + break; + default: + break; + } + + /* If this builtin accesses SPRs, then pass in the appropriate + SPR number and SPR regno as the last two operands. */ + rtx cr = NULL_RTX; + if (uses_spr) + { + machine_mode mode = TARGET_POWERPC64 ? DImode : SImode; + op[nopnds++] = gen_rtx_CONST_INT (mode, htm_spr_num (fcode)); + } + /* If this builtin accesses a CR field, then pass in a scratch + CR field as the last operand. */ + else if (bif_is_htmcr (*bifaddr)) + { + cr = gen_reg_rtx (CCmode); + op[nopnds++] = cr; + } + + rtx pat; + switch (nopnds) + { + case 1: + pat = GEN_FCN (icode) (op[0]); + break; + case 2: + pat = GEN_FCN (icode) (op[0], op[1]); + break; + case 3: + pat = GEN_FCN (icode) (op[0], op[1], op[2]); + break; + case 4: + pat = GEN_FCN (icode) (op[0], op[1], op[2], op[3]); + break; + default: + gcc_unreachable (); + } + if (!pat) + return NULL_RTX; + emit_insn (pat); + + if (bif_is_htmcr (*bifaddr)) + { + if (fcode == RS6000_BIF_TBEGIN) + { + /* Emit code to set TARGET to true or false depending on + whether the tbegin. instruction succeeded or failed + to start a transaction. We do this by placing the 1's + complement of CR's EQ bit into TARGET. */ + rtx scratch = gen_reg_rtx (SImode); + emit_insn (gen_rtx_SET (scratch, + gen_rtx_EQ (SImode, cr, + const0_rtx))); + emit_insn (gen_rtx_SET (target, + gen_rtx_XOR (SImode, scratch, + GEN_INT (1)))); + } + else + { + /* Emit code to copy the 4-bit condition register field + CR into the least significant end of register TARGET. */ + rtx scratch1 = gen_reg_rtx (SImode); + rtx scratch2 = gen_reg_rtx (SImode); + rtx subreg = simplify_gen_subreg (CCmode, scratch1, SImode, 0); + emit_insn (gen_movcc (subreg, cr)); + emit_insn (gen_lshrsi3 (scratch2, scratch1, GEN_INT (28))); + emit_insn (gen_andsi3 (target, scratch2, GEN_INT (0xf))); + } + } + + if (nonvoid) + return target; + return const0_rtx; +} + +/* Expand an expression EXP that calls a built-in function, + with result going to TARGET if that's convenient + (and in mode MODE if that's convenient). + SUBTARGET may be used as the target for computing one of EXP's operands. 
+ IGNORE is nonzero if the value is to be ignored. + Use the new builtin infrastructure. */ +rtx +rs6000_expand_builtin (tree exp, rtx target, rtx /* subtarget */, + machine_mode /* mode */, int ignore) +{ + tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0); + enum rs6000_gen_builtins fcode + = (enum rs6000_gen_builtins) DECL_MD_FUNCTION_CODE (fndecl); + size_t uns_fcode = (size_t)fcode; + enum insn_code icode = rs6000_builtin_info[uns_fcode].icode; + + /* TODO: The following commentary and code is inherited from the original + builtin processing code. The commentary is a bit confusing, with the + intent being that KFmode is always IEEE-128, IFmode is always IBM + double-double, and TFmode is the current long double. The code is + confusing in that it converts from KFmode to TFmode pattern names, + when the other direction is more intuitive. Try to address this. */ + + /* We have two different modes (KFmode, TFmode) that are the IEEE + 128-bit floating point type, depending on whether long double is the + IBM extended double (KFmode) or long double is IEEE 128-bit (TFmode). + It is simpler if we only define one variant of the built-in function, + and switch the code when defining it, rather than defining two built- + ins and using the overload table in rs6000-c.cc to switch between the + two. If we don't have the proper assembler, don't do this switch + because CODE_FOR_*kf* and CODE_FOR_*tf* will be CODE_FOR_nothing. */ + if (FLOAT128_IEEE_P (TFmode)) + switch (icode) + { + case CODE_FOR_sqrtkf2_odd: + icode = CODE_FOR_sqrttf2_odd; + break; + case CODE_FOR_trunckfdf2_odd: + icode = CODE_FOR_trunctfdf2_odd; + break; + case CODE_FOR_addkf3_odd: + icode = CODE_FOR_addtf3_odd; + break; + case CODE_FOR_subkf3_odd: + icode = CODE_FOR_subtf3_odd; + break; + case CODE_FOR_mulkf3_odd: + icode = CODE_FOR_multf3_odd; + break; + case CODE_FOR_divkf3_odd: + icode = CODE_FOR_divtf3_odd; + break; + case CODE_FOR_fmakf4_odd: + icode = CODE_FOR_fmatf4_odd; + break; + case CODE_FOR_xsxexpqp_kf: + icode = CODE_FOR_xsxexpqp_tf; + break; + case CODE_FOR_xsxsigqp_kf: + icode = CODE_FOR_xsxsigqp_tf; + break; + case CODE_FOR_xststdcnegqp_kf: + icode = CODE_FOR_xststdcnegqp_tf; + break; + case CODE_FOR_xsiexpqp_kf: + icode = CODE_FOR_xsiexpqp_tf; + break; + case CODE_FOR_xsiexpqpf_kf: + icode = CODE_FOR_xsiexpqpf_tf; + break; + case CODE_FOR_xststdcqp_kf: + icode = CODE_FOR_xststdcqp_tf; + break; + case CODE_FOR_xscmpexpqp_eq_kf: + icode = CODE_FOR_xscmpexpqp_eq_tf; + break; + case CODE_FOR_xscmpexpqp_lt_kf: + icode = CODE_FOR_xscmpexpqp_lt_tf; + break; + case CODE_FOR_xscmpexpqp_gt_kf: + icode = CODE_FOR_xscmpexpqp_gt_tf; + break; + case CODE_FOR_xscmpexpqp_unordered_kf: + icode = CODE_FOR_xscmpexpqp_unordered_tf; + break; + default: + break; + } + + /* In case of "#pragma target" changes, we initialize all builtins + but check for actual availability now, during expand time. For + invalid builtins, generate a normal call. 
*/ + bifdata *bifaddr = &rs6000_builtin_info[uns_fcode]; + bif_enable e = bifaddr->enable; + + if (!(e == ENB_ALWAYS + || (e == ENB_P5 && TARGET_POPCNTB) + || (e == ENB_P6 && TARGET_CMPB) + || (e == ENB_P6_64 && TARGET_CMPB && TARGET_POWERPC64) + || (e == ENB_ALTIVEC && TARGET_ALTIVEC) + || (e == ENB_CELL && TARGET_ALTIVEC && rs6000_cpu == PROCESSOR_CELL) + || (e == ENB_VSX && TARGET_VSX) + || (e == ENB_P7 && TARGET_POPCNTD) + || (e == ENB_P7_64 && TARGET_POPCNTD && TARGET_POWERPC64) + || (e == ENB_P8 && TARGET_DIRECT_MOVE) + || (e == ENB_P8V && TARGET_P8_VECTOR) + || (e == ENB_P9 && TARGET_MODULO) + || (e == ENB_P9_64 && TARGET_MODULO && TARGET_POWERPC64) + || (e == ENB_P9V && TARGET_P9_VECTOR) + || (e == ENB_IEEE128_HW && TARGET_FLOAT128_HW) + || (e == ENB_DFP && TARGET_DFP) + || (e == ENB_CRYPTO && TARGET_CRYPTO) + || (e == ENB_HTM && TARGET_HTM) + || (e == ENB_P10 && TARGET_POWER10) + || (e == ENB_P10_64 && TARGET_POWER10 && TARGET_POWERPC64) + || (e == ENB_MMA && TARGET_MMA))) + { + rs6000_invalid_builtin (fcode); + return expand_call (exp, target, ignore); + } + + if (bif_is_nosoft (*bifaddr) + && rs6000_isa_flags & OPTION_MASK_SOFT_FLOAT) + { + error ("%qs not supported with %<-msoft-float%>", + bifaddr->bifname); + return const0_rtx; + } + + if (bif_is_no32bit (*bifaddr) && TARGET_32BIT) + { + error ("%qs is not supported in 32-bit mode", bifaddr->bifname); + return const0_rtx; + } + + if (bif_is_ibmld (*bifaddr) && !FLOAT128_2REG_P (TFmode)) + { + error ("%qs requires %<long double%> to be IBM 128-bit format", + bifaddr->bifname); + return const0_rtx; + } + + if (bif_is_cpu (*bifaddr)) + return cpu_expand_builtin (fcode, exp, target); + + if (bif_is_init (*bifaddr)) + return altivec_expand_vec_init_builtin (TREE_TYPE (exp), exp, target); + + if (bif_is_set (*bifaddr)) + return altivec_expand_vec_set_builtin (exp); + + if (bif_is_extract (*bifaddr)) + return altivec_expand_vec_ext_builtin (exp, target); + + if (bif_is_predicate (*bifaddr)) + return altivec_expand_predicate_builtin (icode, exp, target); + + if (bif_is_htm (*bifaddr)) + return htm_expand_builtin (bifaddr, fcode, exp, target); + + if (bif_is_32bit (*bifaddr) && TARGET_32BIT) + { + if (fcode == RS6000_BIF_MFTB) + icode = CODE_FOR_rs6000_mftb_si; + else if (fcode == RS6000_BIF_BPERMD) + icode = CODE_FOR_bpermd_si; + else if (fcode == RS6000_BIF_DARN) + icode = CODE_FOR_darn_64_si; + else if (fcode == RS6000_BIF_DARN_32) + icode = CODE_FOR_darn_32_si; + else if (fcode == RS6000_BIF_DARN_RAW) + icode = CODE_FOR_darn_raw_si; + else + gcc_unreachable (); + } + + if (bif_is_endian (*bifaddr) && BYTES_BIG_ENDIAN) + { + if (fcode == RS6000_BIF_LD_ELEMREV_V1TI) + icode = CODE_FOR_vsx_load_v1ti; + else if (fcode == RS6000_BIF_LD_ELEMREV_V2DF) + icode = CODE_FOR_vsx_load_v2df; + else if (fcode == RS6000_BIF_LD_ELEMREV_V2DI) + icode = CODE_FOR_vsx_load_v2di; + else if (fcode == RS6000_BIF_LD_ELEMREV_V4SF) + icode = CODE_FOR_vsx_load_v4sf; + else if (fcode == RS6000_BIF_LD_ELEMREV_V4SI) + icode = CODE_FOR_vsx_load_v4si; + else if (fcode == RS6000_BIF_LD_ELEMREV_V8HI) + icode = CODE_FOR_vsx_load_v8hi; + else if (fcode == RS6000_BIF_LD_ELEMREV_V16QI) + icode = CODE_FOR_vsx_load_v16qi; + else if (fcode == RS6000_BIF_ST_ELEMREV_V1TI) + icode = CODE_FOR_vsx_store_v1ti; + else if (fcode == RS6000_BIF_ST_ELEMREV_V2DF) + icode = CODE_FOR_vsx_store_v2df; + else if (fcode == RS6000_BIF_ST_ELEMREV_V2DI) + icode = CODE_FOR_vsx_store_v2di; + else if (fcode == RS6000_BIF_ST_ELEMREV_V4SF) + icode = CODE_FOR_vsx_store_v4sf; + else if (fcode == 
RS6000_BIF_ST_ELEMREV_V4SI) + icode = CODE_FOR_vsx_store_v4si; + else if (fcode == RS6000_BIF_ST_ELEMREV_V8HI) + icode = CODE_FOR_vsx_store_v8hi; + else if (fcode == RS6000_BIF_ST_ELEMREV_V16QI) + icode = CODE_FOR_vsx_store_v16qi; + else + gcc_unreachable (); + } + + + /* TRUE iff the built-in function returns void. */ + bool void_func = TREE_TYPE (TREE_TYPE (fndecl)) == void_type_node; + /* Position of first argument (0 for void-returning functions, else 1). */ + int k; + /* Modes for the return value, if any, and arguments. */ + const int MAX_BUILTIN_ARGS = 6; + machine_mode mode[MAX_BUILTIN_ARGS + 1]; + + if (void_func) + k = 0; + else + { + k = 1; + mode[0] = insn_data[icode].operand[0].mode; + } + + /* Tree expressions for each argument. */ + tree arg[MAX_BUILTIN_ARGS]; + /* RTL expressions for each argument. */ + rtx op[MAX_BUILTIN_ARGS]; + + int nargs = bifaddr->nargs; + gcc_assert (nargs <= MAX_BUILTIN_ARGS); + + + for (int i = 0; i < nargs; i++) + { + arg[i] = CALL_EXPR_ARG (exp, i); + if (arg[i] == error_mark_node) + return const0_rtx; + STRIP_NOPS (arg[i]); + op[i] = expand_normal (arg[i]); + /* We have a couple of pesky patterns that don't specify the mode... */ + mode[i+k] = insn_data[icode].operand[i+k].mode; + if (!mode[i+k]) + mode[i+k] = Pmode; + } + + /* Check for restricted constant arguments. */ + for (int i = 0; i < 2; i++) + { + switch (bifaddr->restr[i]) + { + case RES_BITS: + { + size_t mask = 1; + mask <<= bifaddr->restr_val1[i]; + mask--; + tree restr_arg = arg[bifaddr->restr_opnd[i] - 1]; + STRIP_NOPS (restr_arg); + if (!(TREE_CODE (restr_arg) == INTEGER_CST + && (TREE_INT_CST_LOW (restr_arg) & ~mask) == 0)) + { + unsigned p = (1U << bifaddr->restr_val1[i]) - 1; + error ("argument %d must be a literal between 0 and %d," + " inclusive", + bifaddr->restr_opnd[i], p); + return CONST0_RTX (mode[0]); + } + break; + } + case RES_RANGE: + { + tree restr_arg = arg[bifaddr->restr_opnd[i] - 1]; + STRIP_NOPS (restr_arg); + if (!(TREE_CODE (restr_arg) == INTEGER_CST + && IN_RANGE (tree_to_shwi (restr_arg), + bifaddr->restr_val1[i], + bifaddr->restr_val2[i]))) + { + error ("argument %d must be a literal between %d and %d," + " inclusive", + bifaddr->restr_opnd[i], bifaddr->restr_val1[i], + bifaddr->restr_val2[i]); + return CONST0_RTX (mode[0]); + } + break; + } + case RES_VAR_RANGE: + { + tree restr_arg = arg[bifaddr->restr_opnd[i] - 1]; + STRIP_NOPS (restr_arg); + if (TREE_CODE (restr_arg) == INTEGER_CST + && !IN_RANGE (tree_to_shwi (restr_arg), + bifaddr->restr_val1[i], + bifaddr->restr_val2[i])) + { + error ("argument %d must be a variable or a literal " + "between %d and %d, inclusive", + bifaddr->restr_opnd[i], bifaddr->restr_val1[i], + bifaddr->restr_val2[i]); + return CONST0_RTX (mode[0]); + } + break; + } + case RES_VALUES: + { + tree restr_arg = arg[bifaddr->restr_opnd[i] - 1]; + STRIP_NOPS (restr_arg); + if (!(TREE_CODE (restr_arg) == INTEGER_CST + && (tree_to_shwi (restr_arg) == bifaddr->restr_val1[i] + || tree_to_shwi (restr_arg) == bifaddr->restr_val2[i]))) + { + error ("argument %d must be either a literal %d or a " + "literal %d", + bifaddr->restr_opnd[i], bifaddr->restr_val1[i], + bifaddr->restr_val2[i]); + return CONST0_RTX (mode[0]); + } + break; + } + default: + case RES_NONE: + break; + } + } + + if (bif_is_ldstmask (*bifaddr)) + return rs6000_expand_ldst_mask (target, arg[0]); + + if (bif_is_stvec (*bifaddr)) + { + if (bif_is_reve (*bifaddr)) + icode = elemrev_icode (fcode); + return stv_expand_builtin (icode, op, mode[0], mode[1]); + } + + if 
(bif_is_ldvec (*bifaddr)) + { + if (bif_is_reve (*bifaddr)) + icode = elemrev_icode (fcode); + return ldv_expand_builtin (target, icode, op, mode[0]); + } + + if (bif_is_lxvrse (*bifaddr)) + return lxvrse_expand_builtin (target, icode, op, mode[0], mode[1]); + + if (bif_is_lxvrze (*bifaddr)) + return lxvrze_expand_builtin (target, icode, op, mode[0], mode[1]); + + if (bif_is_mma (*bifaddr)) + return mma_expand_builtin (exp, target, icode, fcode); + + if (fcode == RS6000_BIF_PACK_IF + && TARGET_LONG_DOUBLE_128 + && !TARGET_IEEEQUAD) + { + icode = CODE_FOR_packtf; + fcode = RS6000_BIF_PACK_TF; + uns_fcode = (size_t) fcode; + } + else if (fcode == RS6000_BIF_UNPACK_IF + && TARGET_LONG_DOUBLE_128 + && !TARGET_IEEEQUAD) + { + icode = CODE_FOR_unpacktf; + fcode = RS6000_BIF_UNPACK_TF; + uns_fcode = (size_t) fcode; + } + + if (TREE_TYPE (TREE_TYPE (fndecl)) == void_type_node) + target = NULL_RTX; + else if (target == 0 + || GET_MODE (target) != mode[0] + || !insn_data[icode].operand[0].predicate (target, mode[0])) + target = gen_reg_rtx (mode[0]); + + for (int i = 0; i < nargs; i++) + if (!insn_data[icode].operand[i+k].predicate (op[i], mode[i+k])) + op[i] = copy_to_mode_reg (mode[i+k], op[i]); + + rtx pat; + + switch (nargs) + { + case 0: + pat = (void_func + ? GEN_FCN (icode) () + : GEN_FCN (icode) (target)); + break; + case 1: + pat = (void_func + ? GEN_FCN (icode) (op[0]) + : GEN_FCN (icode) (target, op[0])); + break; + case 2: + pat = (void_func + ? GEN_FCN (icode) (op[0], op[1]) + : GEN_FCN (icode) (target, op[0], op[1])); + break; + case 3: + pat = (void_func + ? GEN_FCN (icode) (op[0], op[1], op[2]) + : GEN_FCN (icode) (target, op[0], op[1], op[2])); + break; + case 4: + pat = (void_func + ? GEN_FCN (icode) (op[0], op[1], op[2], op[3]) + : GEN_FCN (icode) (target, op[0], op[1], op[2], op[3])); + break; + case 5: + pat = (void_func + ? GEN_FCN (icode) (op[0], op[1], op[2], op[3], op[4]) + : GEN_FCN (icode) (target, op[0], op[1], op[2], op[3], op[4])); + break; + case 6: + pat = (void_func + ? GEN_FCN (icode) (op[0], op[1], op[2], op[3], op[4], op[5]) + : GEN_FCN (icode) (target, op[0], op[1], + op[2], op[3], op[4], op[5])); + break; + default: + gcc_assert (MAX_BUILTIN_ARGS == 6); + gcc_unreachable (); + } + + if (!pat) + return 0; + + emit_insn (pat); + return target; +} diff --git a/gcc/config/rs6000/rs6000-call.cc b/gcc/config/rs6000/rs6000-call.cc index d9bd5ca..f06c692 100644 --- a/gcc/config/rs6000/rs6000-call.cc +++ b/gcc/config/rs6000/rs6000-call.cc @@ -89,85 +89,6 @@ #define TARGET_NO_PROTOTYPE 0 #endif -/* Used by __builtin_cpu_is(), mapping from PLATFORM names to values. */ -static const struct -{ - const char *cpu; - unsigned int cpuid; -} cpu_is_info[] = { - { "power10", PPC_PLATFORM_POWER10 }, - { "power9", PPC_PLATFORM_POWER9 }, - { "power8", PPC_PLATFORM_POWER8 }, - { "power7", PPC_PLATFORM_POWER7 }, - { "power6x", PPC_PLATFORM_POWER6X }, - { "power6", PPC_PLATFORM_POWER6 }, - { "power5+", PPC_PLATFORM_POWER5_PLUS }, - { "power5", PPC_PLATFORM_POWER5 }, - { "ppc970", PPC_PLATFORM_PPC970 }, - { "power4", PPC_PLATFORM_POWER4 }, - { "ppca2", PPC_PLATFORM_PPCA2 }, - { "ppc476", PPC_PLATFORM_PPC476 }, - { "ppc464", PPC_PLATFORM_PPC464 }, - { "ppc440", PPC_PLATFORM_PPC440 }, - { "ppc405", PPC_PLATFORM_PPC405 }, - { "ppc-cell-be", PPC_PLATFORM_CELL_BE } -}; - -/* Used by __builtin_cpu_supports(), mapping from HWCAP names to masks. */ -static const struct -{ - const char *hwcap; - int mask; - unsigned int id; -} cpu_supports_info[] = { - /* AT_HWCAP masks. 
*/ - { "4xxmac", PPC_FEATURE_HAS_4xxMAC, 0 }, - { "altivec", PPC_FEATURE_HAS_ALTIVEC, 0 }, - { "arch_2_05", PPC_FEATURE_ARCH_2_05, 0 }, - { "arch_2_06", PPC_FEATURE_ARCH_2_06, 0 }, - { "archpmu", PPC_FEATURE_PERFMON_COMPAT, 0 }, - { "booke", PPC_FEATURE_BOOKE, 0 }, - { "cellbe", PPC_FEATURE_CELL_BE, 0 }, - { "dfp", PPC_FEATURE_HAS_DFP, 0 }, - { "efpdouble", PPC_FEATURE_HAS_EFP_DOUBLE, 0 }, - { "efpsingle", PPC_FEATURE_HAS_EFP_SINGLE, 0 }, - { "fpu", PPC_FEATURE_HAS_FPU, 0 }, - { "ic_snoop", PPC_FEATURE_ICACHE_SNOOP, 0 }, - { "mmu", PPC_FEATURE_HAS_MMU, 0 }, - { "notb", PPC_FEATURE_NO_TB, 0 }, - { "pa6t", PPC_FEATURE_PA6T, 0 }, - { "power4", PPC_FEATURE_POWER4, 0 }, - { "power5", PPC_FEATURE_POWER5, 0 }, - { "power5+", PPC_FEATURE_POWER5_PLUS, 0 }, - { "power6x", PPC_FEATURE_POWER6_EXT, 0 }, - { "ppc32", PPC_FEATURE_32, 0 }, - { "ppc601", PPC_FEATURE_601_INSTR, 0 }, - { "ppc64", PPC_FEATURE_64, 0 }, - { "ppcle", PPC_FEATURE_PPC_LE, 0 }, - { "smt", PPC_FEATURE_SMT, 0 }, - { "spe", PPC_FEATURE_HAS_SPE, 0 }, - { "true_le", PPC_FEATURE_TRUE_LE, 0 }, - { "ucache", PPC_FEATURE_UNIFIED_CACHE, 0 }, - { "vsx", PPC_FEATURE_HAS_VSX, 0 }, - - /* AT_HWCAP2 masks. */ - { "arch_2_07", PPC_FEATURE2_ARCH_2_07, 1 }, - { "dscr", PPC_FEATURE2_HAS_DSCR, 1 }, - { "ebb", PPC_FEATURE2_HAS_EBB, 1 }, - { "htm", PPC_FEATURE2_HAS_HTM, 1 }, - { "htm-nosc", PPC_FEATURE2_HTM_NOSC, 1 }, - { "htm-no-suspend", PPC_FEATURE2_HTM_NO_SUSPEND, 1 }, - { "isel", PPC_FEATURE2_HAS_ISEL, 1 }, - { "tar", PPC_FEATURE2_HAS_TAR, 1 }, - { "vcrypto", PPC_FEATURE2_HAS_VEC_CRYPTO, 1 }, - { "arch_3_00", PPC_FEATURE2_ARCH_3_00, 1 }, - { "ieee128", PPC_FEATURE2_HAS_IEEE128, 1 }, - { "darn", PPC_FEATURE2_DARN, 1 }, - { "scv", PPC_FEATURE2_SCV, 1 }, - { "arch_3_1", PPC_FEATURE2_ARCH_3_1, 1 }, - { "mma", PPC_FEATURE2_MMA, 1 }, -}; - /* Nonzero if we can use a floating-point register to pass this arg. */ #define USE_FP_FOR_ARG_P(CUM,MODE) \ (SCALAR_FLOAT_MODE_NOT_VECTOR_P (MODE) \ @@ -2880,188 +2801,6 @@ rs6000_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p, return build_va_arg_indirect_ref (addr); } -/* Debug utility to translate a type node to a single textual token. 
*/ -static -const char *rs6000_type_string (tree type_node) -{ - if (type_node == void_type_node) - return "void"; - else if (type_node == long_integer_type_node) - return "long"; - else if (type_node == long_unsigned_type_node) - return "ulong"; - else if (type_node == long_long_integer_type_node) - return "longlong"; - else if (type_node == long_long_unsigned_type_node) - return "ulonglong"; - else if (type_node == bool_V2DI_type_node) - return "vbll"; - else if (type_node == bool_V4SI_type_node) - return "vbi"; - else if (type_node == bool_V8HI_type_node) - return "vbs"; - else if (type_node == bool_V16QI_type_node) - return "vbc"; - else if (type_node == bool_int_type_node) - return "bool"; - else if (type_node == dfloat64_type_node) - return "_Decimal64"; - else if (type_node == double_type_node) - return "double"; - else if (type_node == intDI_type_node) - return "sll"; - else if (type_node == intHI_type_node) - return "ss"; - else if (type_node == ibm128_float_type_node) - return "__ibm128"; - else if (type_node == opaque_V4SI_type_node) - return "opaque"; - else if (POINTER_TYPE_P (type_node)) - return "void*"; - else if (type_node == intQI_type_node || type_node == char_type_node) - return "sc"; - else if (type_node == dfloat32_type_node) - return "_Decimal32"; - else if (type_node == float_type_node) - return "float"; - else if (type_node == intSI_type_node || type_node == integer_type_node) - return "si"; - else if (type_node == dfloat128_type_node) - return "_Decimal128"; - else if (type_node == long_double_type_node) - return "longdouble"; - else if (type_node == intTI_type_node) - return "sq"; - else if (type_node == unsigned_intDI_type_node) - return "ull"; - else if (type_node == unsigned_intHI_type_node) - return "us"; - else if (type_node == unsigned_intQI_type_node) - return "uc"; - else if (type_node == unsigned_intSI_type_node) - return "ui"; - else if (type_node == unsigned_intTI_type_node) - return "uq"; - else if (type_node == unsigned_V1TI_type_node) - return "vuq"; - else if (type_node == unsigned_V2DI_type_node) - return "vull"; - else if (type_node == unsigned_V4SI_type_node) - return "vui"; - else if (type_node == unsigned_V8HI_type_node) - return "vus"; - else if (type_node == unsigned_V16QI_type_node) - return "vuc"; - else if (type_node == V16QI_type_node) - return "vsc"; - else if (type_node == V1TI_type_node) - return "vsq"; - else if (type_node == V2DF_type_node) - return "vd"; - else if (type_node == V2DI_type_node) - return "vsll"; - else if (type_node == V4SF_type_node) - return "vf"; - else if (type_node == V4SI_type_node) - return "vsi"; - else if (type_node == V8HI_type_node) - return "vss"; - else if (type_node == pixel_V8HI_type_node) - return "vp"; - else if (type_node == pcvoid_type_node) - return "voidc*"; - else if (type_node == float128_type_node) - return "_Float128"; - else if (type_node == vector_pair_type_node) - return "__vector_pair"; - else if (type_node == vector_quad_type_node) - return "__vector_quad"; - - return "unknown"; -} - -static rtx -altivec_expand_predicate_builtin (enum insn_code icode, tree exp, rtx target) -{ - rtx pat, scratch; - tree cr6_form = CALL_EXPR_ARG (exp, 0); - tree arg0 = CALL_EXPR_ARG (exp, 1); - tree arg1 = CALL_EXPR_ARG (exp, 2); - rtx op0 = expand_normal (arg0); - rtx op1 = expand_normal (arg1); - machine_mode tmode = SImode; - machine_mode mode0 = insn_data[icode].operand[1].mode; - machine_mode mode1 = insn_data[icode].operand[2].mode; - int cr6_form_int; - - if (TREE_CODE (cr6_form) != INTEGER_CST) - { 
- error ("argument 1 of %qs must be a constant", - "__builtin_altivec_predicate"); - return const0_rtx; - } - else - cr6_form_int = TREE_INT_CST_LOW (cr6_form); - - gcc_assert (mode0 == mode1); - - /* If we have invalid arguments, bail out before generating bad rtl. */ - if (arg0 == error_mark_node || arg1 == error_mark_node) - return const0_rtx; - - if (target == 0 - || GET_MODE (target) != tmode - || ! (*insn_data[icode].operand[0].predicate) (target, tmode)) - target = gen_reg_rtx (tmode); - - if (! (*insn_data[icode].operand[1].predicate) (op0, mode0)) - op0 = copy_to_mode_reg (mode0, op0); - if (! (*insn_data[icode].operand[2].predicate) (op1, mode1)) - op1 = copy_to_mode_reg (mode1, op1); - - /* Note that for many of the relevant operations (e.g. cmpne or - cmpeq) with float or double operands, it makes more sense for the - mode of the allocated scratch register to select a vector of - integer. But the choice to copy the mode of operand 0 was made - long ago and there are no plans to change it. */ - scratch = gen_reg_rtx (mode0); - - pat = GEN_FCN (icode) (scratch, op0, op1); - if (! pat) - return 0; - emit_insn (pat); - - /* The vec_any* and vec_all* predicates use the same opcodes for two - different operations, but the bits in CR6 will be different - depending on what information we want. So we have to play tricks - with CR6 to get the right bits out. - - If you think this is disgusting, look at the specs for the - AltiVec predicates. */ - - switch (cr6_form_int) - { - case 0: - emit_insn (gen_cr6_test_for_zero (target)); - break; - case 1: - emit_insn (gen_cr6_test_for_zero_reverse (target)); - break; - case 2: - emit_insn (gen_cr6_test_for_lt (target)); - break; - case 3: - emit_insn (gen_cr6_test_for_lt_reverse (target)); - break; - default: - error ("argument 1 of %qs is out of range", - "__builtin_altivec_predicate"); - break; - } - - return target; -} - rtx swap_endian_selector_for_mode (machine_mode mode) { @@ -3100,3271 +2839,6 @@ swap_endian_selector_for_mode (machine_mode mode) gen_rtvec_v (16, perm))); } -/* Return the correct ICODE value depending on whether we are - setting or reading the HTM SPRs. */ -static inline enum insn_code -rs6000_htm_spr_icode (bool nonvoid) -{ - if (nonvoid) - return (TARGET_POWERPC64) ? CODE_FOR_htm_mfspr_di : CODE_FOR_htm_mfspr_si; - else - return (TARGET_POWERPC64) ? CODE_FOR_htm_mtspr_di : CODE_FOR_htm_mtspr_si; -} - -/* Expand vec_init builtin. */ -static rtx -altivec_expand_vec_init_builtin (tree type, tree exp, rtx target) -{ - machine_mode tmode = TYPE_MODE (type); - machine_mode inner_mode = GET_MODE_INNER (tmode); - int i, n_elt = GET_MODE_NUNITS (tmode); - - gcc_assert (VECTOR_MODE_P (tmode)); - gcc_assert (n_elt == call_expr_nargs (exp)); - - if (!target || !register_operand (target, tmode)) - target = gen_reg_rtx (tmode); - - /* If we have a vector compromised of a single element, such as V1TImode, do - the initialization directly. */ - if (n_elt == 1 && GET_MODE_SIZE (tmode) == GET_MODE_SIZE (inner_mode)) - { - rtx x = expand_normal (CALL_EXPR_ARG (exp, 0)); - emit_move_insn (target, gen_lowpart (tmode, x)); - } - else - { - rtvec v = rtvec_alloc (n_elt); - - for (i = 0; i < n_elt; ++i) - { - rtx x = expand_normal (CALL_EXPR_ARG (exp, i)); - RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x); - } - - rs6000_expand_vector_init (target, gen_rtx_PARALLEL (tmode, v)); - } - - return target; -} - -/* Return the integer constant in ARG. Constrain it to be in the range - of the subparts of VEC_TYPE; issue an error if not. 
*/ - -static int -get_element_number (tree vec_type, tree arg) -{ - unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1; - - if (!tree_fits_uhwi_p (arg) - || (elt = tree_to_uhwi (arg), elt > max)) - { - error ("selector must be an integer constant in the range [0, %wi]", max); - return 0; - } - - return elt; -} - -/* Expand vec_set builtin. */ -static rtx -altivec_expand_vec_set_builtin (tree exp) -{ - machine_mode tmode, mode1; - tree arg0, arg1, arg2; - int elt; - rtx op0, op1; - - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - arg2 = CALL_EXPR_ARG (exp, 2); - - tmode = TYPE_MODE (TREE_TYPE (arg0)); - mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0))); - gcc_assert (VECTOR_MODE_P (tmode)); - - op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL); - op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL); - elt = get_element_number (TREE_TYPE (arg0), arg2); - - if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode) - op1 = convert_modes (mode1, GET_MODE (op1), op1, true); - - op0 = force_reg (tmode, op0); - op1 = force_reg (mode1, op1); - - rs6000_expand_vector_set (op0, op1, GEN_INT (elt)); - - return op0; -} - -/* Expand vec_ext builtin. */ -static rtx -altivec_expand_vec_ext_builtin (tree exp, rtx target) -{ - machine_mode tmode, mode0; - tree arg0, arg1; - rtx op0; - rtx op1; - - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - - if (TREE_CODE (arg1) == INTEGER_CST) - { - unsigned HOST_WIDE_INT elt; - unsigned HOST_WIDE_INT size = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); - unsigned int truncated_selector; - /* Even if !tree_fits_uhwi_p (arg1)), TREE_INT_CST_LOW (arg0) - returns low-order bits of INTEGER_CST for modulo indexing. */ - elt = TREE_INT_CST_LOW (arg1); - truncated_selector = elt % size; - op1 = GEN_INT (truncated_selector); - } - - tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0))); - mode0 = TYPE_MODE (TREE_TYPE (arg0)); - gcc_assert (VECTOR_MODE_P (mode0)); - - op0 = force_reg (mode0, op0); - - if (optimize || !target || !register_operand (target, tmode)) - target = gen_reg_rtx (tmode); - - rs6000_expand_vector_extract (target, op0, op1); - - return target; -} - -/* Raise an error message for a builtin function that is called without the - appropriate target options being set. 
*/ - -void -rs6000_invalid_builtin (enum rs6000_gen_builtins fncode) -{ - size_t j = (size_t) fncode; - const char *name = rs6000_builtin_info[j].bifname; - - switch (rs6000_builtin_info[j].enable) - { - case ENB_P5: - error ("%qs requires the %qs option", name, "-mcpu=power5"); - break; - case ENB_P6: - error ("%qs requires the %qs option", name, "-mcpu=power6"); - break; - case ENB_P6_64: - error ("%qs requires the %qs option and either the %qs or %qs option", - name, "-mcpu=power6", "-m64", "-mpowerpc64"); - break; - case ENB_ALTIVEC: - error ("%qs requires the %qs option", name, "-maltivec"); - break; - case ENB_CELL: - error ("%qs requires the %qs option", name, "-mcpu=cell"); - break; - case ENB_VSX: - error ("%qs requires the %qs option", name, "-mvsx"); - break; - case ENB_P7: - error ("%qs requires the %qs option", name, "-mcpu=power7"); - break; - case ENB_P7_64: - error ("%qs requires the %qs option and either the %qs or %qs option", - name, "-mcpu=power7", "-m64", "-mpowerpc64"); - break; - case ENB_P8: - error ("%qs requires the %qs option", name, "-mcpu=power8"); - break; - case ENB_P8V: - error ("%qs requires the %qs and %qs options", name, "-mcpu=power8", - "-mvsx"); - break; - case ENB_P9: - error ("%qs requires the %qs option", name, "-mcpu=power9"); - break; - case ENB_P9_64: - error ("%qs requires the %qs option and either the %qs or %qs option", - name, "-mcpu=power9", "-m64", "-mpowerpc64"); - break; - case ENB_P9V: - error ("%qs requires the %qs and %qs options", name, "-mcpu=power9", - "-mvsx"); - break; - case ENB_IEEE128_HW: - error ("%qs requires quad-precision floating-point arithmetic", name); - break; - case ENB_DFP: - error ("%qs requires the %qs option", name, "-mhard-dfp"); - break; - case ENB_CRYPTO: - error ("%qs requires the %qs option", name, "-mcrypto"); - break; - case ENB_HTM: - error ("%qs requires the %qs option", name, "-mhtm"); - break; - case ENB_P10: - error ("%qs requires the %qs option", name, "-mcpu=power10"); - break; - case ENB_P10_64: - error ("%qs requires the %qs option and either the %qs or %qs option", - name, "-mcpu=power10", "-m64", "-mpowerpc64"); - break; - case ENB_MMA: - error ("%qs requires the %qs option", name, "-mmma"); - break; - default: - case ENB_ALWAYS: - gcc_unreachable (); - } -} - -/* Target hook for early folding of built-ins, shamelessly stolen - from ia64.cc. */ - -tree -rs6000_fold_builtin (tree fndecl ATTRIBUTE_UNUSED, - int n_args ATTRIBUTE_UNUSED, - tree *args ATTRIBUTE_UNUSED, - bool ignore ATTRIBUTE_UNUSED) -{ -#ifdef SUBTARGET_FOLD_BUILTIN - return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore); -#else - return NULL_TREE; -#endif -} - -/* Helper function to handle the gimple folding of a vector compare - operation. This sets up true/false vectors, and uses the - VEC_COND_EXPR operation. - CODE indicates which comparison is to be made. (EQ, GT, ...). - TYPE indicates the type of the result. - Code is inserted before GSI. */ -static tree -fold_build_vec_cmp (tree_code code, tree type, tree arg0, tree arg1, - gimple_stmt_iterator *gsi) -{ - tree cmp_type = truth_type_for (type); - tree zero_vec = build_zero_cst (type); - tree minus_one_vec = build_minus_one_cst (type); - tree temp = create_tmp_reg_or_ssa_name (cmp_type); - gimple *g = gimple_build_assign (temp, code, arg0, arg1); - gsi_insert_before (gsi, g, GSI_SAME_STMT); - return fold_build3 (VEC_COND_EXPR, type, temp, minus_one_vec, zero_vec); -} - -/* Helper function to handle the in-between steps for the - vector compare built-ins. 
*/ -static void -fold_compare_helper (gimple_stmt_iterator *gsi, tree_code code, gimple *stmt) -{ - tree arg0 = gimple_call_arg (stmt, 0); - tree arg1 = gimple_call_arg (stmt, 1); - tree lhs = gimple_call_lhs (stmt); - tree cmp = fold_build_vec_cmp (code, TREE_TYPE (lhs), arg0, arg1, gsi); - gimple *g = gimple_build_assign (lhs, cmp); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); -} - -/* Helper function to map V2DF and V4SF types to their - integral equivalents (V2DI and V4SI). */ -tree map_to_integral_tree_type (tree input_tree_type) -{ - if (INTEGRAL_TYPE_P (TREE_TYPE (input_tree_type))) - return input_tree_type; - else - { - if (types_compatible_p (TREE_TYPE (input_tree_type), - TREE_TYPE (V2DF_type_node))) - return V2DI_type_node; - else if (types_compatible_p (TREE_TYPE (input_tree_type), - TREE_TYPE (V4SF_type_node))) - return V4SI_type_node; - else - gcc_unreachable (); - } -} - -/* Helper function to handle the vector merge[hl] built-ins. The - implementation difference between h and l versions for this code are in - the values used when building of the permute vector for high word versus - low word merge. The variance is keyed off the use_high parameter. */ -static void -fold_mergehl_helper (gimple_stmt_iterator *gsi, gimple *stmt, int use_high) -{ - tree arg0 = gimple_call_arg (stmt, 0); - tree arg1 = gimple_call_arg (stmt, 1); - tree lhs = gimple_call_lhs (stmt); - tree lhs_type = TREE_TYPE (lhs); - int n_elts = TYPE_VECTOR_SUBPARTS (lhs_type); - int midpoint = n_elts / 2; - int offset = 0; - - if (use_high == 1) - offset = midpoint; - - /* The permute_type will match the lhs for integral types. For double and - float types, the permute type needs to map to the V2 or V4 type that - matches size. */ - tree permute_type; - permute_type = map_to_integral_tree_type (lhs_type); - tree_vector_builder elts (permute_type, VECTOR_CST_NELTS (arg0), 1); - - for (int i = 0; i < midpoint; i++) - { - elts.safe_push (build_int_cst (TREE_TYPE (permute_type), - offset + i)); - elts.safe_push (build_int_cst (TREE_TYPE (permute_type), - offset + n_elts + i)); - } - - tree permute = elts.build (); - - gimple *g = gimple_build_assign (lhs, VEC_PERM_EXPR, arg0, arg1, permute); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); -} - -/* Helper function to handle the vector merge[eo] built-ins. */ -static void -fold_mergeeo_helper (gimple_stmt_iterator *gsi, gimple *stmt, int use_odd) -{ - tree arg0 = gimple_call_arg (stmt, 0); - tree arg1 = gimple_call_arg (stmt, 1); - tree lhs = gimple_call_lhs (stmt); - tree lhs_type = TREE_TYPE (lhs); - int n_elts = TYPE_VECTOR_SUBPARTS (lhs_type); - - /* The permute_type will match the lhs for integral types. For double and - float types, the permute type needs to map to the V2 or V4 type that - matches size. */ - tree permute_type; - permute_type = map_to_integral_tree_type (lhs_type); - - tree_vector_builder elts (permute_type, VECTOR_CST_NELTS (arg0), 1); - - /* Build the permute vector. */ - for (int i = 0; i < n_elts / 2; i++) - { - elts.safe_push (build_int_cst (TREE_TYPE (permute_type), - 2*i + use_odd)); - elts.safe_push (build_int_cst (TREE_TYPE (permute_type), - 2*i + use_odd + n_elts)); - } - - tree permute = elts.build (); - - gimple *g = gimple_build_assign (lhs, VEC_PERM_EXPR, arg0, arg1, permute); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); -} - -/* Helper function to sort out which built-ins may be valid without having - a LHS. 
*/ -static bool -rs6000_builtin_valid_without_lhs (enum rs6000_gen_builtins fn_code, - tree fndecl) -{ - if (TREE_TYPE (TREE_TYPE (fndecl)) == void_type_node) - return true; - - switch (fn_code) - { - case RS6000_BIF_STVX_V16QI: - case RS6000_BIF_STVX_V8HI: - case RS6000_BIF_STVX_V4SI: - case RS6000_BIF_STVX_V4SF: - case RS6000_BIF_STVX_V2DI: - case RS6000_BIF_STVX_V2DF: - case RS6000_BIF_STXVW4X_V16QI: - case RS6000_BIF_STXVW4X_V8HI: - case RS6000_BIF_STXVW4X_V4SF: - case RS6000_BIF_STXVW4X_V4SI: - case RS6000_BIF_STXVD2X_V2DF: - case RS6000_BIF_STXVD2X_V2DI: - return true; - default: - return false; - } -} - -/* Check whether a builtin function is supported in this target - configuration. */ -bool -rs6000_builtin_is_supported (enum rs6000_gen_builtins fncode) -{ - switch (rs6000_builtin_info[(size_t) fncode].enable) - { - case ENB_ALWAYS: - return true; - case ENB_P5: - return TARGET_POPCNTB; - case ENB_P6: - return TARGET_CMPB; - case ENB_P6_64: - return TARGET_CMPB && TARGET_POWERPC64; - case ENB_P7: - return TARGET_POPCNTD; - case ENB_P7_64: - return TARGET_POPCNTD && TARGET_POWERPC64; - case ENB_P8: - return TARGET_DIRECT_MOVE; - case ENB_P8V: - return TARGET_P8_VECTOR; - case ENB_P9: - return TARGET_MODULO; - case ENB_P9_64: - return TARGET_MODULO && TARGET_POWERPC64; - case ENB_P9V: - return TARGET_P9_VECTOR; - case ENB_P10: - return TARGET_POWER10; - case ENB_P10_64: - return TARGET_POWER10 && TARGET_POWERPC64; - case ENB_ALTIVEC: - return TARGET_ALTIVEC; - case ENB_VSX: - return TARGET_VSX; - case ENB_CELL: - return TARGET_ALTIVEC && rs6000_cpu == PROCESSOR_CELL; - case ENB_IEEE128_HW: - return TARGET_FLOAT128_HW; - case ENB_DFP: - return TARGET_DFP; - case ENB_CRYPTO: - return TARGET_CRYPTO; - case ENB_HTM: - return TARGET_HTM; - case ENB_MMA: - return TARGET_MMA; - default: - gcc_unreachable (); - } - gcc_unreachable (); -} - -/* Expand the MMA built-ins early, so that we can convert the pass-by-reference - __vector_quad arguments into pass-by-value arguments, leading to more - efficient code generation. */ -static bool -rs6000_gimple_fold_mma_builtin (gimple_stmt_iterator *gsi, - rs6000_gen_builtins fn_code) -{ - gimple *stmt = gsi_stmt (*gsi); - size_t fncode = (size_t) fn_code; - - if (!bif_is_mma (rs6000_builtin_info[fncode])) - return false; - - /* Each call that can be gimple-expanded has an associated built-in - function that it will expand into. If this one doesn't, we have - already expanded it! Exceptions: lxvp and stxvp. */ - if (rs6000_builtin_info[fncode].assoc_bif == RS6000_BIF_NONE - && fncode != RS6000_BIF_LXVP - && fncode != RS6000_BIF_STXVP) - return false; - - bifdata *bd = &rs6000_builtin_info[fncode]; - unsigned nopnds = bd->nargs; - gimple_seq new_seq = NULL; - gimple *new_call; - tree new_decl; - - /* Compatibility built-ins; we used to call these - __builtin_mma_{dis,}assemble_pair, but now we call them - __builtin_vsx_{dis,}assemble_pair. Handle the old versions. */ - if (fncode == RS6000_BIF_ASSEMBLE_PAIR) - fncode = RS6000_BIF_ASSEMBLE_PAIR_V; - else if (fncode == RS6000_BIF_DISASSEMBLE_PAIR) - fncode = RS6000_BIF_DISASSEMBLE_PAIR_V; - - if (fncode == RS6000_BIF_DISASSEMBLE_ACC - || fncode == RS6000_BIF_DISASSEMBLE_PAIR_V) - { - /* This is an MMA disassemble built-in function. */ - push_gimplify_context (true); - unsigned nvec = (fncode == RS6000_BIF_DISASSEMBLE_ACC) ? 
4 : 2; - tree dst_ptr = gimple_call_arg (stmt, 0); - tree src_ptr = gimple_call_arg (stmt, 1); - tree src_type = TREE_TYPE (src_ptr); - tree src = create_tmp_reg_or_ssa_name (TREE_TYPE (src_type)); - gimplify_assign (src, build_simple_mem_ref (src_ptr), &new_seq); - - /* If we are not disassembling an accumulator/pair or our destination is - another accumulator/pair, then just copy the entire thing as is. */ - if ((fncode == RS6000_BIF_DISASSEMBLE_ACC - && TREE_TYPE (TREE_TYPE (dst_ptr)) == vector_quad_type_node) - || (fncode == RS6000_BIF_DISASSEMBLE_PAIR_V - && TREE_TYPE (TREE_TYPE (dst_ptr)) == vector_pair_type_node)) - { - tree dst = build_simple_mem_ref (build1 (VIEW_CONVERT_EXPR, - src_type, dst_ptr)); - gimplify_assign (dst, src, &new_seq); - pop_gimplify_context (NULL); - gsi_replace_with_seq (gsi, new_seq, true); - return true; - } - - /* If we're disassembling an accumulator into a different type, we need - to emit a xxmfacc instruction now, since we cannot do it later. */ - if (fncode == RS6000_BIF_DISASSEMBLE_ACC) - { - new_decl = rs6000_builtin_decls[RS6000_BIF_XXMFACC_INTERNAL]; - new_call = gimple_build_call (new_decl, 1, src); - src = create_tmp_reg_or_ssa_name (vector_quad_type_node); - gimple_call_set_lhs (new_call, src); - gimple_seq_add_stmt (&new_seq, new_call); - } - - /* Copy the accumulator/pair vector by vector. */ - new_decl - = rs6000_builtin_decls[rs6000_builtin_info[fncode].assoc_bif]; - tree dst_type = build_pointer_type_for_mode (unsigned_V16QI_type_node, - ptr_mode, true); - tree dst_base = build1 (VIEW_CONVERT_EXPR, dst_type, dst_ptr); - for (unsigned i = 0; i < nvec; i++) - { - unsigned index = WORDS_BIG_ENDIAN ? i : nvec - 1 - i; - tree dst = build2 (MEM_REF, unsigned_V16QI_type_node, dst_base, - build_int_cst (dst_type, index * 16)); - tree dstssa = create_tmp_reg_or_ssa_name (unsigned_V16QI_type_node); - new_call = gimple_build_call (new_decl, 2, src, - build_int_cstu (uint16_type_node, i)); - gimple_call_set_lhs (new_call, dstssa); - gimple_seq_add_stmt (&new_seq, new_call); - gimplify_assign (dst, dstssa, &new_seq); - } - pop_gimplify_context (NULL); - gsi_replace_with_seq (gsi, new_seq, true); - return true; - } - - /* TODO: Do some factoring on these two chunks. */ - if (fncode == RS6000_BIF_LXVP) - { - push_gimplify_context (true); - tree offset = gimple_call_arg (stmt, 0); - tree ptr = gimple_call_arg (stmt, 1); - tree lhs = gimple_call_lhs (stmt); - if (TREE_TYPE (TREE_TYPE (ptr)) != vector_pair_type_node) - ptr = build1 (VIEW_CONVERT_EXPR, - build_pointer_type (vector_pair_type_node), ptr); - tree mem = build_simple_mem_ref (build2 (POINTER_PLUS_EXPR, - TREE_TYPE (ptr), ptr, offset)); - gimplify_assign (lhs, mem, &new_seq); - pop_gimplify_context (NULL); - gsi_replace_with_seq (gsi, new_seq, true); - return true; - } - - if (fncode == RS6000_BIF_STXVP) - { - push_gimplify_context (true); - tree src = gimple_call_arg (stmt, 0); - tree offset = gimple_call_arg (stmt, 1); - tree ptr = gimple_call_arg (stmt, 2); - if (TREE_TYPE (TREE_TYPE (ptr)) != vector_pair_type_node) - ptr = build1 (VIEW_CONVERT_EXPR, - build_pointer_type (vector_pair_type_node), ptr); - tree mem = build_simple_mem_ref (build2 (POINTER_PLUS_EXPR, - TREE_TYPE (ptr), ptr, offset)); - gimplify_assign (mem, src, &new_seq); - pop_gimplify_context (NULL); - gsi_replace_with_seq (gsi, new_seq, true); - return true; - } - - /* Convert this built-in into an internal version that uses pass-by-value - arguments. The internal built-in is found in the assoc_bif field. 
*/ - new_decl = rs6000_builtin_decls[rs6000_builtin_info[fncode].assoc_bif]; - tree lhs, op[MAX_MMA_OPERANDS]; - tree acc = gimple_call_arg (stmt, 0); - push_gimplify_context (true); - - if (bif_is_quad (*bd)) - { - /* This built-in has a pass-by-reference accumulator input, so load it - into a temporary accumulator for use as a pass-by-value input. */ - op[0] = create_tmp_reg_or_ssa_name (vector_quad_type_node); - for (unsigned i = 1; i < nopnds; i++) - op[i] = gimple_call_arg (stmt, i); - gimplify_assign (op[0], build_simple_mem_ref (acc), &new_seq); - } - else - { - /* This built-in does not use its pass-by-reference accumulator argument - as an input argument, so remove it from the input list. */ - nopnds--; - for (unsigned i = 0; i < nopnds; i++) - op[i] = gimple_call_arg (stmt, i + 1); - } - - switch (nopnds) - { - case 0: - new_call = gimple_build_call (new_decl, 0); - break; - case 1: - new_call = gimple_build_call (new_decl, 1, op[0]); - break; - case 2: - new_call = gimple_build_call (new_decl, 2, op[0], op[1]); - break; - case 3: - new_call = gimple_build_call (new_decl, 3, op[0], op[1], op[2]); - break; - case 4: - new_call = gimple_build_call (new_decl, 4, op[0], op[1], op[2], op[3]); - break; - case 5: - new_call = gimple_build_call (new_decl, 5, op[0], op[1], op[2], op[3], - op[4]); - break; - case 6: - new_call = gimple_build_call (new_decl, 6, op[0], op[1], op[2], op[3], - op[4], op[5]); - break; - case 7: - new_call = gimple_build_call (new_decl, 7, op[0], op[1], op[2], op[3], - op[4], op[5], op[6]); - break; - default: - gcc_unreachable (); - } - - if (fncode == RS6000_BIF_BUILD_PAIR || fncode == RS6000_BIF_ASSEMBLE_PAIR_V) - lhs = create_tmp_reg_or_ssa_name (vector_pair_type_node); - else - lhs = create_tmp_reg_or_ssa_name (vector_quad_type_node); - gimple_call_set_lhs (new_call, lhs); - gimple_seq_add_stmt (&new_seq, new_call); - gimplify_assign (build_simple_mem_ref (acc), lhs, &new_seq); - pop_gimplify_context (NULL); - gsi_replace_with_seq (gsi, new_seq, true); - - return true; -} - -/* Fold a machine-dependent built-in in GIMPLE. (For folding into - a constant, use rs6000_fold_builtin.) */ -bool -rs6000_gimple_fold_builtin (gimple_stmt_iterator *gsi) -{ - gimple *stmt = gsi_stmt (*gsi); - tree fndecl = gimple_call_fndecl (stmt); - gcc_checking_assert (fndecl && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD); - enum rs6000_gen_builtins fn_code - = (enum rs6000_gen_builtins) DECL_MD_FUNCTION_CODE (fndecl); - tree arg0, arg1, lhs, temp; - enum tree_code bcode; - gimple *g; - - size_t uns_fncode = (size_t) fn_code; - enum insn_code icode = rs6000_builtin_info[uns_fncode].icode; - const char *fn_name1 = rs6000_builtin_info[uns_fncode].bifname; - const char *fn_name2 = (icode != CODE_FOR_nothing) - ? get_insn_name ((int) icode) - : "nothing"; - - if (TARGET_DEBUG_BUILTIN) - fprintf (stderr, "rs6000_gimple_fold_builtin %d %s %s\n", - fn_code, fn_name1, fn_name2); - - if (!rs6000_fold_gimple) - return false; - - /* Prevent gimple folding for code that does not have a LHS, unless it is - allowed per the rs6000_builtin_valid_without_lhs helper function. */ - if (!gimple_call_lhs (stmt) - && !rs6000_builtin_valid_without_lhs (fn_code, fndecl)) - return false; - - /* Don't fold invalid builtins, let rs6000_expand_builtin diagnose it. */ - if (!rs6000_builtin_is_supported (fn_code)) - return false; - - if (rs6000_gimple_fold_mma_builtin (gsi, fn_code)) - return true; - - switch (fn_code) - { - /* Flavors of vec_add. 
We deliberately don't expand - RS6000_BIF_VADDUQM as it gets lowered from V1TImode to - TImode, resulting in much poorer code generation. */ - case RS6000_BIF_VADDUBM: - case RS6000_BIF_VADDUHM: - case RS6000_BIF_VADDUWM: - case RS6000_BIF_VADDUDM: - case RS6000_BIF_VADDFP: - case RS6000_BIF_XVADDDP: - case RS6000_BIF_XVADDSP: - bcode = PLUS_EXPR; - do_binary: - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - lhs = gimple_call_lhs (stmt); - if (INTEGRAL_TYPE_P (TREE_TYPE (TREE_TYPE (lhs))) - && !TYPE_OVERFLOW_WRAPS (TREE_TYPE (TREE_TYPE (lhs)))) - { - /* Ensure the binary operation is performed in a type - that wraps if it is integral type. */ - gimple_seq stmts = NULL; - tree type = unsigned_type_for (TREE_TYPE (lhs)); - tree uarg0 = gimple_build (&stmts, VIEW_CONVERT_EXPR, - type, arg0); - tree uarg1 = gimple_build (&stmts, VIEW_CONVERT_EXPR, - type, arg1); - tree res = gimple_build (&stmts, gimple_location (stmt), bcode, - type, uarg0, uarg1); - gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); - g = gimple_build_assign (lhs, VIEW_CONVERT_EXPR, - build1 (VIEW_CONVERT_EXPR, - TREE_TYPE (lhs), res)); - gsi_replace (gsi, g, true); - return true; - } - g = gimple_build_assign (lhs, bcode, arg0, arg1); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - /* Flavors of vec_sub. We deliberately don't expand - RS6000_BIF_VSUBUQM. */ - case RS6000_BIF_VSUBUBM: - case RS6000_BIF_VSUBUHM: - case RS6000_BIF_VSUBUWM: - case RS6000_BIF_VSUBUDM: - case RS6000_BIF_VSUBFP: - case RS6000_BIF_XVSUBDP: - case RS6000_BIF_XVSUBSP: - bcode = MINUS_EXPR; - goto do_binary; - case RS6000_BIF_XVMULSP: - case RS6000_BIF_XVMULDP: - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - lhs = gimple_call_lhs (stmt); - g = gimple_build_assign (lhs, MULT_EXPR, arg0, arg1); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - /* Even element flavors of vec_mul (signed). */ - case RS6000_BIF_VMULESB: - case RS6000_BIF_VMULESH: - case RS6000_BIF_VMULESW: - /* Even element flavors of vec_mul (unsigned). */ - case RS6000_BIF_VMULEUB: - case RS6000_BIF_VMULEUH: - case RS6000_BIF_VMULEUW: - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - lhs = gimple_call_lhs (stmt); - g = gimple_build_assign (lhs, VEC_WIDEN_MULT_EVEN_EXPR, arg0, arg1); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - /* Odd element flavors of vec_mul (signed). */ - case RS6000_BIF_VMULOSB: - case RS6000_BIF_VMULOSH: - case RS6000_BIF_VMULOSW: - /* Odd element flavors of vec_mul (unsigned). */ - case RS6000_BIF_VMULOUB: - case RS6000_BIF_VMULOUH: - case RS6000_BIF_VMULOUW: - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - lhs = gimple_call_lhs (stmt); - g = gimple_build_assign (lhs, VEC_WIDEN_MULT_ODD_EXPR, arg0, arg1); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - /* Flavors of vec_div (Integer). */ - case RS6000_BIF_DIV_V2DI: - case RS6000_BIF_UDIV_V2DI: - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - lhs = gimple_call_lhs (stmt); - g = gimple_build_assign (lhs, TRUNC_DIV_EXPR, arg0, arg1); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - /* Flavors of vec_div (Float). 
*/ - case RS6000_BIF_XVDIVSP: - case RS6000_BIF_XVDIVDP: - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - lhs = gimple_call_lhs (stmt); - g = gimple_build_assign (lhs, RDIV_EXPR, arg0, arg1); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - /* Flavors of vec_and. */ - case RS6000_BIF_VAND_V16QI_UNS: - case RS6000_BIF_VAND_V16QI: - case RS6000_BIF_VAND_V8HI_UNS: - case RS6000_BIF_VAND_V8HI: - case RS6000_BIF_VAND_V4SI_UNS: - case RS6000_BIF_VAND_V4SI: - case RS6000_BIF_VAND_V2DI_UNS: - case RS6000_BIF_VAND_V2DI: - case RS6000_BIF_VAND_V4SF: - case RS6000_BIF_VAND_V2DF: - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - lhs = gimple_call_lhs (stmt); - g = gimple_build_assign (lhs, BIT_AND_EXPR, arg0, arg1); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - /* Flavors of vec_andc. */ - case RS6000_BIF_VANDC_V16QI_UNS: - case RS6000_BIF_VANDC_V16QI: - case RS6000_BIF_VANDC_V8HI_UNS: - case RS6000_BIF_VANDC_V8HI: - case RS6000_BIF_VANDC_V4SI_UNS: - case RS6000_BIF_VANDC_V4SI: - case RS6000_BIF_VANDC_V2DI_UNS: - case RS6000_BIF_VANDC_V2DI: - case RS6000_BIF_VANDC_V4SF: - case RS6000_BIF_VANDC_V2DF: - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - lhs = gimple_call_lhs (stmt); - temp = create_tmp_reg_or_ssa_name (TREE_TYPE (arg1)); - g = gimple_build_assign (temp, BIT_NOT_EXPR, arg1); - gimple_set_location (g, gimple_location (stmt)); - gsi_insert_before (gsi, g, GSI_SAME_STMT); - g = gimple_build_assign (lhs, BIT_AND_EXPR, arg0, temp); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - /* Flavors of vec_nand. */ - case RS6000_BIF_NAND_V16QI_UNS: - case RS6000_BIF_NAND_V16QI: - case RS6000_BIF_NAND_V8HI_UNS: - case RS6000_BIF_NAND_V8HI: - case RS6000_BIF_NAND_V4SI_UNS: - case RS6000_BIF_NAND_V4SI: - case RS6000_BIF_NAND_V2DI_UNS: - case RS6000_BIF_NAND_V2DI: - case RS6000_BIF_NAND_V4SF: - case RS6000_BIF_NAND_V2DF: - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - lhs = gimple_call_lhs (stmt); - temp = create_tmp_reg_or_ssa_name (TREE_TYPE (arg1)); - g = gimple_build_assign (temp, BIT_AND_EXPR, arg0, arg1); - gimple_set_location (g, gimple_location (stmt)); - gsi_insert_before (gsi, g, GSI_SAME_STMT); - g = gimple_build_assign (lhs, BIT_NOT_EXPR, temp); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - /* Flavors of vec_or. */ - case RS6000_BIF_VOR_V16QI_UNS: - case RS6000_BIF_VOR_V16QI: - case RS6000_BIF_VOR_V8HI_UNS: - case RS6000_BIF_VOR_V8HI: - case RS6000_BIF_VOR_V4SI_UNS: - case RS6000_BIF_VOR_V4SI: - case RS6000_BIF_VOR_V2DI_UNS: - case RS6000_BIF_VOR_V2DI: - case RS6000_BIF_VOR_V4SF: - case RS6000_BIF_VOR_V2DF: - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - lhs = gimple_call_lhs (stmt); - g = gimple_build_assign (lhs, BIT_IOR_EXPR, arg0, arg1); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - /* flavors of vec_orc. 
*/ - case RS6000_BIF_ORC_V16QI_UNS: - case RS6000_BIF_ORC_V16QI: - case RS6000_BIF_ORC_V8HI_UNS: - case RS6000_BIF_ORC_V8HI: - case RS6000_BIF_ORC_V4SI_UNS: - case RS6000_BIF_ORC_V4SI: - case RS6000_BIF_ORC_V2DI_UNS: - case RS6000_BIF_ORC_V2DI: - case RS6000_BIF_ORC_V4SF: - case RS6000_BIF_ORC_V2DF: - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - lhs = gimple_call_lhs (stmt); - temp = create_tmp_reg_or_ssa_name (TREE_TYPE (arg1)); - g = gimple_build_assign (temp, BIT_NOT_EXPR, arg1); - gimple_set_location (g, gimple_location (stmt)); - gsi_insert_before (gsi, g, GSI_SAME_STMT); - g = gimple_build_assign (lhs, BIT_IOR_EXPR, arg0, temp); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - /* Flavors of vec_xor. */ - case RS6000_BIF_VXOR_V16QI_UNS: - case RS6000_BIF_VXOR_V16QI: - case RS6000_BIF_VXOR_V8HI_UNS: - case RS6000_BIF_VXOR_V8HI: - case RS6000_BIF_VXOR_V4SI_UNS: - case RS6000_BIF_VXOR_V4SI: - case RS6000_BIF_VXOR_V2DI_UNS: - case RS6000_BIF_VXOR_V2DI: - case RS6000_BIF_VXOR_V4SF: - case RS6000_BIF_VXOR_V2DF: - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - lhs = gimple_call_lhs (stmt); - g = gimple_build_assign (lhs, BIT_XOR_EXPR, arg0, arg1); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - /* Flavors of vec_nor. */ - case RS6000_BIF_VNOR_V16QI_UNS: - case RS6000_BIF_VNOR_V16QI: - case RS6000_BIF_VNOR_V8HI_UNS: - case RS6000_BIF_VNOR_V8HI: - case RS6000_BIF_VNOR_V4SI_UNS: - case RS6000_BIF_VNOR_V4SI: - case RS6000_BIF_VNOR_V2DI_UNS: - case RS6000_BIF_VNOR_V2DI: - case RS6000_BIF_VNOR_V4SF: - case RS6000_BIF_VNOR_V2DF: - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - lhs = gimple_call_lhs (stmt); - temp = create_tmp_reg_or_ssa_name (TREE_TYPE (arg1)); - g = gimple_build_assign (temp, BIT_IOR_EXPR, arg0, arg1); - gimple_set_location (g, gimple_location (stmt)); - gsi_insert_before (gsi, g, GSI_SAME_STMT); - g = gimple_build_assign (lhs, BIT_NOT_EXPR, temp); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - /* flavors of vec_abs. */ - case RS6000_BIF_ABS_V16QI: - case RS6000_BIF_ABS_V8HI: - case RS6000_BIF_ABS_V4SI: - case RS6000_BIF_ABS_V4SF: - case RS6000_BIF_ABS_V2DI: - case RS6000_BIF_XVABSDP: - case RS6000_BIF_XVABSSP: - arg0 = gimple_call_arg (stmt, 0); - if (INTEGRAL_TYPE_P (TREE_TYPE (TREE_TYPE (arg0))) - && !TYPE_OVERFLOW_WRAPS (TREE_TYPE (TREE_TYPE (arg0)))) - return false; - lhs = gimple_call_lhs (stmt); - g = gimple_build_assign (lhs, ABS_EXPR, arg0); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - /* flavors of vec_min. */ - case RS6000_BIF_XVMINDP: - case RS6000_BIF_XVMINSP: - case RS6000_BIF_VMINFP: - { - lhs = gimple_call_lhs (stmt); - tree type = TREE_TYPE (lhs); - if (HONOR_NANS (type)) - return false; - gcc_fallthrough (); - } - case RS6000_BIF_VMINSD: - case RS6000_BIF_VMINUD: - case RS6000_BIF_VMINSB: - case RS6000_BIF_VMINSH: - case RS6000_BIF_VMINSW: - case RS6000_BIF_VMINUB: - case RS6000_BIF_VMINUH: - case RS6000_BIF_VMINUW: - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - lhs = gimple_call_lhs (stmt); - g = gimple_build_assign (lhs, MIN_EXPR, arg0, arg1); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - /* flavors of vec_max. 
*/ - case RS6000_BIF_XVMAXDP: - case RS6000_BIF_XVMAXSP: - case RS6000_BIF_VMAXFP: - { - lhs = gimple_call_lhs (stmt); - tree type = TREE_TYPE (lhs); - if (HONOR_NANS (type)) - return false; - gcc_fallthrough (); - } - case RS6000_BIF_VMAXSD: - case RS6000_BIF_VMAXUD: - case RS6000_BIF_VMAXSB: - case RS6000_BIF_VMAXSH: - case RS6000_BIF_VMAXSW: - case RS6000_BIF_VMAXUB: - case RS6000_BIF_VMAXUH: - case RS6000_BIF_VMAXUW: - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - lhs = gimple_call_lhs (stmt); - g = gimple_build_assign (lhs, MAX_EXPR, arg0, arg1); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - /* Flavors of vec_eqv. */ - case RS6000_BIF_EQV_V16QI: - case RS6000_BIF_EQV_V8HI: - case RS6000_BIF_EQV_V4SI: - case RS6000_BIF_EQV_V4SF: - case RS6000_BIF_EQV_V2DF: - case RS6000_BIF_EQV_V2DI: - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - lhs = gimple_call_lhs (stmt); - temp = create_tmp_reg_or_ssa_name (TREE_TYPE (arg1)); - g = gimple_build_assign (temp, BIT_XOR_EXPR, arg0, arg1); - gimple_set_location (g, gimple_location (stmt)); - gsi_insert_before (gsi, g, GSI_SAME_STMT); - g = gimple_build_assign (lhs, BIT_NOT_EXPR, temp); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - /* Flavors of vec_rotate_left. */ - case RS6000_BIF_VRLB: - case RS6000_BIF_VRLH: - case RS6000_BIF_VRLW: - case RS6000_BIF_VRLD: - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - lhs = gimple_call_lhs (stmt); - g = gimple_build_assign (lhs, LROTATE_EXPR, arg0, arg1); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - /* Flavors of vector shift right algebraic. - vec_sra{b,h,w} -> vsra{b,h,w}. */ - case RS6000_BIF_VSRAB: - case RS6000_BIF_VSRAH: - case RS6000_BIF_VSRAW: - case RS6000_BIF_VSRAD: - { - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - lhs = gimple_call_lhs (stmt); - tree arg1_type = TREE_TYPE (arg1); - tree unsigned_arg1_type = unsigned_type_for (TREE_TYPE (arg1)); - tree unsigned_element_type = unsigned_type_for (TREE_TYPE (arg1_type)); - location_t loc = gimple_location (stmt); - /* Force arg1 into the range valid matching the arg0 type. */ - /* Build a vector consisting of the max valid bit-size values. */ - int n_elts = VECTOR_CST_NELTS (arg1); - tree element_size = build_int_cst (unsigned_element_type, - 128 / n_elts); - tree_vector_builder elts (unsigned_arg1_type, n_elts, 1); - for (int i = 0; i < n_elts; i++) - elts.safe_push (element_size); - tree modulo_tree = elts.build (); - /* Modulo the provided shift value against that vector. */ - gimple_seq stmts = NULL; - tree unsigned_arg1 = gimple_build (&stmts, VIEW_CONVERT_EXPR, - unsigned_arg1_type, arg1); - tree new_arg1 = gimple_build (&stmts, loc, TRUNC_MOD_EXPR, - unsigned_arg1_type, unsigned_arg1, - modulo_tree); - gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); - /* And finally, do the shift. */ - g = gimple_build_assign (lhs, RSHIFT_EXPR, arg0, new_arg1); - gimple_set_location (g, loc); - gsi_replace (gsi, g, true); - return true; - } - /* Flavors of vector shift left. - builtin_altivec_vsl{b,h,w} -> vsl{b,h,w}. 
*/ - case RS6000_BIF_VSLB: - case RS6000_BIF_VSLH: - case RS6000_BIF_VSLW: - case RS6000_BIF_VSLD: - { - location_t loc; - gimple_seq stmts = NULL; - arg0 = gimple_call_arg (stmt, 0); - tree arg0_type = TREE_TYPE (arg0); - if (INTEGRAL_TYPE_P (TREE_TYPE (arg0_type)) - && !TYPE_OVERFLOW_WRAPS (TREE_TYPE (arg0_type))) - return false; - arg1 = gimple_call_arg (stmt, 1); - tree arg1_type = TREE_TYPE (arg1); - tree unsigned_arg1_type = unsigned_type_for (TREE_TYPE (arg1)); - tree unsigned_element_type = unsigned_type_for (TREE_TYPE (arg1_type)); - loc = gimple_location (stmt); - lhs = gimple_call_lhs (stmt); - /* Force arg1 into the range valid matching the arg0 type. */ - /* Build a vector consisting of the max valid bit-size values. */ - int n_elts = VECTOR_CST_NELTS (arg1); - int tree_size_in_bits = TREE_INT_CST_LOW (size_in_bytes (arg1_type)) - * BITS_PER_UNIT; - tree element_size = build_int_cst (unsigned_element_type, - tree_size_in_bits / n_elts); - tree_vector_builder elts (unsigned_type_for (arg1_type), n_elts, 1); - for (int i = 0; i < n_elts; i++) - elts.safe_push (element_size); - tree modulo_tree = elts.build (); - /* Modulo the provided shift value against that vector. */ - tree unsigned_arg1 = gimple_build (&stmts, VIEW_CONVERT_EXPR, - unsigned_arg1_type, arg1); - tree new_arg1 = gimple_build (&stmts, loc, TRUNC_MOD_EXPR, - unsigned_arg1_type, unsigned_arg1, - modulo_tree); - gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); - /* And finally, do the shift. */ - g = gimple_build_assign (lhs, LSHIFT_EXPR, arg0, new_arg1); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - } - /* Flavors of vector shift right. */ - case RS6000_BIF_VSRB: - case RS6000_BIF_VSRH: - case RS6000_BIF_VSRW: - case RS6000_BIF_VSRD: - { - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - lhs = gimple_call_lhs (stmt); - tree arg1_type = TREE_TYPE (arg1); - tree unsigned_arg1_type = unsigned_type_for (TREE_TYPE (arg1)); - tree unsigned_element_type = unsigned_type_for (TREE_TYPE (arg1_type)); - location_t loc = gimple_location (stmt); - gimple_seq stmts = NULL; - /* Convert arg0 to unsigned. */ - tree arg0_unsigned - = gimple_build (&stmts, VIEW_CONVERT_EXPR, - unsigned_type_for (TREE_TYPE (arg0)), arg0); - /* Force arg1 into the range valid matching the arg0 type. */ - /* Build a vector consisting of the max valid bit-size values. */ - int n_elts = VECTOR_CST_NELTS (arg1); - tree element_size = build_int_cst (unsigned_element_type, - 128 / n_elts); - tree_vector_builder elts (unsigned_arg1_type, n_elts, 1); - for (int i = 0; i < n_elts; i++) - elts.safe_push (element_size); - tree modulo_tree = elts.build (); - /* Modulo the provided shift value against that vector. */ - tree unsigned_arg1 = gimple_build (&stmts, VIEW_CONVERT_EXPR, - unsigned_arg1_type, arg1); - tree new_arg1 = gimple_build (&stmts, loc, TRUNC_MOD_EXPR, - unsigned_arg1_type, unsigned_arg1, - modulo_tree); - /* Do the shift. */ - tree res - = gimple_build (&stmts, RSHIFT_EXPR, - TREE_TYPE (arg0_unsigned), arg0_unsigned, new_arg1); - /* Convert result back to the lhs type. */ - res = gimple_build (&stmts, VIEW_CONVERT_EXPR, TREE_TYPE (lhs), res); - gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); - replace_call_with_value (gsi, res); - return true; - } - /* Vector loads. 
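A note on the vsl*/vsr* folding just above: the per-element shift count is first reduced modulo the element width (128 bits divided by the number of elements) and only then fed to a generic LSHIFT_EXPR/RSHIFT_EXPR, which preserves the hardware behaviour of the vector shift instructions. The following stand-alone sketch, in plain C and not part of the patch (the helper name vsrw_element is made up for illustration), shows the same arithmetic for one 32-bit lane:

#include <stdint.h>
#include <stdio.h>

/* Mimic what the folded gimple computes for one 32-bit element of vsrw:
   the shift amount is taken modulo the element width (32 bits here,
   i.e. 128 bits / 4 elements), then a logical shift is performed on the
   unsigned view of the element.  */
static uint32_t
vsrw_element (uint32_t x, uint32_t shift)
{
  shift %= 32;          /* The TRUNC_MOD_EXPR against the element size.  */
  return x >> shift;    /* The RSHIFT_EXPR on the unsigned value.  */
}

int
main (void)
{
  /* A shift count of 33 behaves exactly like a shift by 1.  */
  printf ("%u %u\n", vsrw_element (0x80u, 33), vsrw_element (0x80u, 1));
  return 0;
}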
*/ - case RS6000_BIF_LVX_V16QI: - case RS6000_BIF_LVX_V8HI: - case RS6000_BIF_LVX_V4SI: - case RS6000_BIF_LVX_V4SF: - case RS6000_BIF_LVX_V2DI: - case RS6000_BIF_LVX_V2DF: - case RS6000_BIF_LVX_V1TI: - { - arg0 = gimple_call_arg (stmt, 0); // offset - arg1 = gimple_call_arg (stmt, 1); // address - lhs = gimple_call_lhs (stmt); - location_t loc = gimple_location (stmt); - /* Since arg1 may be cast to a different type, just use ptr_type_node - here instead of trying to enforce TBAA on pointer types. */ - tree arg1_type = ptr_type_node; - tree lhs_type = TREE_TYPE (lhs); - /* POINTER_PLUS_EXPR wants the offset to be of type 'sizetype'. Create - the tree using the value from arg0. The resulting type will match - the type of arg1. */ - gimple_seq stmts = NULL; - tree temp_offset = gimple_convert (&stmts, loc, sizetype, arg0); - tree temp_addr = gimple_build (&stmts, loc, POINTER_PLUS_EXPR, - arg1_type, arg1, temp_offset); - /* Mask off any lower bits from the address. */ - tree aligned_addr = gimple_build (&stmts, loc, BIT_AND_EXPR, - arg1_type, temp_addr, - build_int_cst (arg1_type, -16)); - gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); - if (!is_gimple_mem_ref_addr (aligned_addr)) - { - tree t = make_ssa_name (TREE_TYPE (aligned_addr)); - gimple *g = gimple_build_assign (t, aligned_addr); - gsi_insert_before (gsi, g, GSI_SAME_STMT); - aligned_addr = t; - } - /* Use the build2 helper to set up the mem_ref. The MEM_REF could also - take an offset, but since we've already incorporated the offset - above, here we just pass in a zero. */ - gimple *g - = gimple_build_assign (lhs, build2 (MEM_REF, lhs_type, aligned_addr, - build_int_cst (arg1_type, 0))); - gimple_set_location (g, loc); - gsi_replace (gsi, g, true); - return true; - } - /* Vector stores. */ - case RS6000_BIF_STVX_V16QI: - case RS6000_BIF_STVX_V8HI: - case RS6000_BIF_STVX_V4SI: - case RS6000_BIF_STVX_V4SF: - case RS6000_BIF_STVX_V2DI: - case RS6000_BIF_STVX_V2DF: - { - arg0 = gimple_call_arg (stmt, 0); /* Value to be stored. */ - arg1 = gimple_call_arg (stmt, 1); /* Offset. */ - tree arg2 = gimple_call_arg (stmt, 2); /* Store-to address. */ - location_t loc = gimple_location (stmt); - tree arg0_type = TREE_TYPE (arg0); - /* Use ptr_type_node (no TBAA) for the arg2_type. - FIXME: (Richard) "A proper fix would be to transition this type as - seen from the frontend to GIMPLE, for example in a similar way we - do for MEM_REFs by piggy-backing that on an extra argument, a - constant zero pointer of the alias pointer type to use (which would - also serve as a type indicator of the store itself). I'd use a - target specific internal function for this (not sure if we can have - those target specific, but I guess if it's folded away then that's - fine) and get away with the overload set." */ - tree arg2_type = ptr_type_node; - /* POINTER_PLUS_EXPR wants the offset to be of type 'sizetype'. Create - the tree using the value from arg0. The resulting type will match - the type of arg2. */ - gimple_seq stmts = NULL; - tree temp_offset = gimple_convert (&stmts, loc, sizetype, arg1); - tree temp_addr = gimple_build (&stmts, loc, POINTER_PLUS_EXPR, - arg2_type, arg2, temp_offset); - /* Mask off any lower bits from the address. 
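The BIT_AND_EXPR with -16 used for the lvx/stvx folding above reflects the fact that the VMX load/store instructions ignore the low four bits of the effective address. A small illustrative sketch of that address computation in plain C follows; it is not part of the patch and the helper name vmx_effective_address is invented for the example:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* The address computation the folded MEM_REF is built from: add the byte
   offset, then clear the low four bits, because lvx/stvx ignore them.  */
static void *
vmx_effective_address (const void *base, ptrdiff_t offset)
{
  uintptr_t ea = (uintptr_t) base + (uintptr_t) offset;
  return (void *) (ea & ~(uintptr_t) 15);   /* The BIT_AND_EXPR with -16.  */
}

int
main (void)
{
  char buf[64];
  printf ("%p -> %p\n", (void *) (buf + 3), vmx_effective_address (buf, 3));
  return 0;
}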
*/ - tree aligned_addr = gimple_build (&stmts, loc, BIT_AND_EXPR, - arg2_type, temp_addr, - build_int_cst (arg2_type, -16)); - gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); - if (!is_gimple_mem_ref_addr (aligned_addr)) - { - tree t = make_ssa_name (TREE_TYPE (aligned_addr)); - gimple *g = gimple_build_assign (t, aligned_addr); - gsi_insert_before (gsi, g, GSI_SAME_STMT); - aligned_addr = t; - } - /* The desired gimple result should be similar to: - MEM[(__vector floatD.1407 *)_1] = vf1D.2697; */ - gimple *g - = gimple_build_assign (build2 (MEM_REF, arg0_type, aligned_addr, - build_int_cst (arg2_type, 0)), arg0); - gimple_set_location (g, loc); - gsi_replace (gsi, g, true); - return true; - } - - /* unaligned Vector loads. */ - case RS6000_BIF_LXVW4X_V16QI: - case RS6000_BIF_LXVW4X_V8HI: - case RS6000_BIF_LXVW4X_V4SF: - case RS6000_BIF_LXVW4X_V4SI: - case RS6000_BIF_LXVD2X_V2DF: - case RS6000_BIF_LXVD2X_V2DI: - { - arg0 = gimple_call_arg (stmt, 0); // offset - arg1 = gimple_call_arg (stmt, 1); // address - lhs = gimple_call_lhs (stmt); - location_t loc = gimple_location (stmt); - /* Since arg1 may be cast to a different type, just use ptr_type_node - here instead of trying to enforce TBAA on pointer types. */ - tree arg1_type = ptr_type_node; - tree lhs_type = TREE_TYPE (lhs); - /* In GIMPLE the type of the MEM_REF specifies the alignment. The - required alignment (power) is 4 bytes regardless of data type. */ - tree align_ltype = build_aligned_type (lhs_type, 4); - /* POINTER_PLUS_EXPR wants the offset to be of type 'sizetype'. Create - the tree using the value from arg0. The resulting type will match - the type of arg1. */ - gimple_seq stmts = NULL; - tree temp_offset = gimple_convert (&stmts, loc, sizetype, arg0); - tree temp_addr = gimple_build (&stmts, loc, POINTER_PLUS_EXPR, - arg1_type, arg1, temp_offset); - gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); - if (!is_gimple_mem_ref_addr (temp_addr)) - { - tree t = make_ssa_name (TREE_TYPE (temp_addr)); - gimple *g = gimple_build_assign (t, temp_addr); - gsi_insert_before (gsi, g, GSI_SAME_STMT); - temp_addr = t; - } - /* Use the build2 helper to set up the mem_ref. The MEM_REF could also - take an offset, but since we've already incorporated the offset - above, here we just pass in a zero. */ - gimple *g; - g = gimple_build_assign (lhs, build2 (MEM_REF, align_ltype, temp_addr, - build_int_cst (arg1_type, 0))); - gimple_set_location (g, loc); - gsi_replace (gsi, g, true); - return true; - } - - /* unaligned Vector stores. */ - case RS6000_BIF_STXVW4X_V16QI: - case RS6000_BIF_STXVW4X_V8HI: - case RS6000_BIF_STXVW4X_V4SF: - case RS6000_BIF_STXVW4X_V4SI: - case RS6000_BIF_STXVD2X_V2DF: - case RS6000_BIF_STXVD2X_V2DI: - { - arg0 = gimple_call_arg (stmt, 0); /* Value to be stored. */ - arg1 = gimple_call_arg (stmt, 1); /* Offset. */ - tree arg2 = gimple_call_arg (stmt, 2); /* Store-to address. */ - location_t loc = gimple_location (stmt); - tree arg0_type = TREE_TYPE (arg0); - /* Use ptr_type_node (no TBAA) for the arg2_type. */ - tree arg2_type = ptr_type_node; - /* In GIMPLE the type of the MEM_REF specifies the alignment. The - required alignment (power) is 4 bytes regardless of data type. */ - tree align_stype = build_aligned_type (arg0_type, 4); - /* POINTER_PLUS_EXPR wants the offset to be of type 'sizetype'. Create - the tree using the value from arg1. 
*/ - gimple_seq stmts = NULL; - tree temp_offset = gimple_convert (&stmts, loc, sizetype, arg1); - tree temp_addr = gimple_build (&stmts, loc, POINTER_PLUS_EXPR, - arg2_type, arg2, temp_offset); - gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); - if (!is_gimple_mem_ref_addr (temp_addr)) - { - tree t = make_ssa_name (TREE_TYPE (temp_addr)); - gimple *g = gimple_build_assign (t, temp_addr); - gsi_insert_before (gsi, g, GSI_SAME_STMT); - temp_addr = t; - } - gimple *g; - g = gimple_build_assign (build2 (MEM_REF, align_stype, temp_addr, - build_int_cst (arg2_type, 0)), arg0); - gimple_set_location (g, loc); - gsi_replace (gsi, g, true); - return true; - } - - /* Vector Fused multiply-add (fma). */ - case RS6000_BIF_VMADDFP: - case RS6000_BIF_XVMADDDP: - case RS6000_BIF_XVMADDSP: - case RS6000_BIF_VMLADDUHM: - { - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - tree arg2 = gimple_call_arg (stmt, 2); - lhs = gimple_call_lhs (stmt); - gcall *g = gimple_build_call_internal (IFN_FMA, 3, arg0, arg1, arg2); - gimple_call_set_lhs (g, lhs); - gimple_call_set_nothrow (g, true); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - } - - /* Vector compares; EQ, NE, GE, GT, LE. */ - case RS6000_BIF_VCMPEQUB: - case RS6000_BIF_VCMPEQUH: - case RS6000_BIF_VCMPEQUW: - case RS6000_BIF_VCMPEQUD: - /* We deliberately omit RS6000_BIF_VCMPEQUT for now, because gimple - folding produces worse code for 128-bit compares. */ - fold_compare_helper (gsi, EQ_EXPR, stmt); - return true; - - case RS6000_BIF_VCMPNEB: - case RS6000_BIF_VCMPNEH: - case RS6000_BIF_VCMPNEW: - /* We deliberately omit RS6000_BIF_VCMPNET for now, because gimple - folding produces worse code for 128-bit compares. */ - fold_compare_helper (gsi, NE_EXPR, stmt); - return true; - - case RS6000_BIF_CMPGE_16QI: - case RS6000_BIF_CMPGE_U16QI: - case RS6000_BIF_CMPGE_8HI: - case RS6000_BIF_CMPGE_U8HI: - case RS6000_BIF_CMPGE_4SI: - case RS6000_BIF_CMPGE_U4SI: - case RS6000_BIF_CMPGE_2DI: - case RS6000_BIF_CMPGE_U2DI: - /* We deliberately omit RS6000_BIF_CMPGE_1TI and RS6000_BIF_CMPGE_U1TI - for now, because gimple folding produces worse code for 128-bit - compares. */ - fold_compare_helper (gsi, GE_EXPR, stmt); - return true; - - case RS6000_BIF_VCMPGTSB: - case RS6000_BIF_VCMPGTUB: - case RS6000_BIF_VCMPGTSH: - case RS6000_BIF_VCMPGTUH: - case RS6000_BIF_VCMPGTSW: - case RS6000_BIF_VCMPGTUW: - case RS6000_BIF_VCMPGTUD: - case RS6000_BIF_VCMPGTSD: - /* We deliberately omit RS6000_BIF_VCMPGTUT and RS6000_BIF_VCMPGTST - for now, because gimple folding produces worse code for 128-bit - compares. */ - fold_compare_helper (gsi, GT_EXPR, stmt); - return true; - - case RS6000_BIF_CMPLE_16QI: - case RS6000_BIF_CMPLE_U16QI: - case RS6000_BIF_CMPLE_8HI: - case RS6000_BIF_CMPLE_U8HI: - case RS6000_BIF_CMPLE_4SI: - case RS6000_BIF_CMPLE_U4SI: - case RS6000_BIF_CMPLE_2DI: - case RS6000_BIF_CMPLE_U2DI: - /* We deliberately omit RS6000_BIF_CMPLE_1TI and RS6000_BIF_CMPLE_U1TI - for now, because gimple folding produces worse code for 128-bit - compares. */ - fold_compare_helper (gsi, LE_EXPR, stmt); - return true; - - /* flavors of vec_splat_[us]{8,16,32}. */ - case RS6000_BIF_VSPLTISB: - case RS6000_BIF_VSPLTISH: - case RS6000_BIF_VSPLTISW: - { - arg0 = gimple_call_arg (stmt, 0); - lhs = gimple_call_lhs (stmt); - - /* Only fold the vec_splat_*() if the lower bits of arg 0 is a - 5-bit signed constant in range -16 to +15. 
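The 5-bit restriction described in the comment above matches the source-level rule for the splat-immediate intrinsics, which only accept a signed literal in [-16, 15]. A hedged usage sketch, assuming a powerpc* target compiled with -maltivec (or an -mcpu level that implies it) and <altivec.h>; it is illustration only, not part of the patch:

#include <altivec.h>

vector signed char
all_minus_one (void)
{
  /* 5-bit signed immediates in [-16, 15] are accepted and can be folded
     to a constant vector; anything outside that range is rejected.  */
  return vec_splat_s8 (-1);
}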
*/ - if (TREE_CODE (arg0) != INTEGER_CST - || !IN_RANGE (TREE_INT_CST_LOW (arg0), -16, 15)) - return false; - gimple_seq stmts = NULL; - location_t loc = gimple_location (stmt); - tree splat_value = gimple_convert (&stmts, loc, - TREE_TYPE (TREE_TYPE (lhs)), arg0); - gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); - tree splat_tree = build_vector_from_val (TREE_TYPE (lhs), splat_value); - g = gimple_build_assign (lhs, splat_tree); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - } - - /* Flavors of vec_splat. */ - /* a = vec_splat (b, 0x3) becomes a = { b[3],b[3],b[3],...}; */ - case RS6000_BIF_VSPLTB: - case RS6000_BIF_VSPLTH: - case RS6000_BIF_VSPLTW: - case RS6000_BIF_XXSPLTD_V2DI: - case RS6000_BIF_XXSPLTD_V2DF: - { - arg0 = gimple_call_arg (stmt, 0); /* input vector. */ - arg1 = gimple_call_arg (stmt, 1); /* index into arg0. */ - /* Only fold the vec_splat_*() if arg1 is both a constant value and - is a valid index into the arg0 vector. */ - unsigned int n_elts = VECTOR_CST_NELTS (arg0); - if (TREE_CODE (arg1) != INTEGER_CST - || TREE_INT_CST_LOW (arg1) > (n_elts -1)) - return false; - lhs = gimple_call_lhs (stmt); - tree lhs_type = TREE_TYPE (lhs); - tree arg0_type = TREE_TYPE (arg0); - tree splat; - if (TREE_CODE (arg0) == VECTOR_CST) - splat = VECTOR_CST_ELT (arg0, TREE_INT_CST_LOW (arg1)); - else - { - /* Determine (in bits) the length and start location of the - splat value for a call to the tree_vec_extract helper. */ - int splat_elem_size = TREE_INT_CST_LOW (size_in_bytes (arg0_type)) - * BITS_PER_UNIT / n_elts; - int splat_start_bit = TREE_INT_CST_LOW (arg1) * splat_elem_size; - tree len = build_int_cst (bitsizetype, splat_elem_size); - tree start = build_int_cst (bitsizetype, splat_start_bit); - splat = tree_vec_extract (gsi, TREE_TYPE (lhs_type), arg0, - len, start); - } - /* And finally, build the new vector. */ - tree splat_tree = build_vector_from_val (lhs_type, splat); - g = gimple_build_assign (lhs, splat_tree); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - } - - /* vec_mergel (integrals). */ - case RS6000_BIF_VMRGLH: - case RS6000_BIF_VMRGLW: - case RS6000_BIF_XXMRGLW_4SI: - case RS6000_BIF_VMRGLB: - case RS6000_BIF_VEC_MERGEL_V2DI: - case RS6000_BIF_XXMRGLW_4SF: - case RS6000_BIF_VEC_MERGEL_V2DF: - fold_mergehl_helper (gsi, stmt, 1); - return true; - /* vec_mergeh (integrals). */ - case RS6000_BIF_VMRGHH: - case RS6000_BIF_VMRGHW: - case RS6000_BIF_XXMRGHW_4SI: - case RS6000_BIF_VMRGHB: - case RS6000_BIF_VEC_MERGEH_V2DI: - case RS6000_BIF_XXMRGHW_4SF: - case RS6000_BIF_VEC_MERGEH_V2DF: - fold_mergehl_helper (gsi, stmt, 0); - return true; - - /* Flavors of vec_mergee. */ - case RS6000_BIF_VMRGEW_V4SI: - case RS6000_BIF_VMRGEW_V2DI: - case RS6000_BIF_VMRGEW_V4SF: - case RS6000_BIF_VMRGEW_V2DF: - fold_mergeeo_helper (gsi, stmt, 0); - return true; - /* Flavors of vec_mergeo. 
*/ - case RS6000_BIF_VMRGOW_V4SI: - case RS6000_BIF_VMRGOW_V2DI: - case RS6000_BIF_VMRGOW_V4SF: - case RS6000_BIF_VMRGOW_V2DF: - fold_mergeeo_helper (gsi, stmt, 1); - return true; - - /* d = vec_pack (a, b) */ - case RS6000_BIF_VPKUDUM: - case RS6000_BIF_VPKUHUM: - case RS6000_BIF_VPKUWUM: - { - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - lhs = gimple_call_lhs (stmt); - gimple *g = gimple_build_assign (lhs, VEC_PACK_TRUNC_EXPR, arg0, arg1); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - } - - /* d = vec_unpackh (a) */ - /* Note that the UNPACK_{HI,LO}_EXPR used in the gimple_build_assign call - in this code is sensitive to endian-ness, and needs to be inverted to - handle both LE and BE targets. */ - case RS6000_BIF_VUPKHSB: - case RS6000_BIF_VUPKHSH: - case RS6000_BIF_VUPKHSW: - { - arg0 = gimple_call_arg (stmt, 0); - lhs = gimple_call_lhs (stmt); - if (BYTES_BIG_ENDIAN) - g = gimple_build_assign (lhs, VEC_UNPACK_HI_EXPR, arg0); - else - g = gimple_build_assign (lhs, VEC_UNPACK_LO_EXPR, arg0); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - } - /* d = vec_unpackl (a) */ - case RS6000_BIF_VUPKLSB: - case RS6000_BIF_VUPKLSH: - case RS6000_BIF_VUPKLSW: - { - arg0 = gimple_call_arg (stmt, 0); - lhs = gimple_call_lhs (stmt); - if (BYTES_BIG_ENDIAN) - g = gimple_build_assign (lhs, VEC_UNPACK_LO_EXPR, arg0); - else - g = gimple_build_assign (lhs, VEC_UNPACK_HI_EXPR, arg0); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - } - /* There is no gimple type corresponding with pixel, so just return. */ - case RS6000_BIF_VUPKHPX: - case RS6000_BIF_VUPKLPX: - return false; - - /* vec_perm. */ - case RS6000_BIF_VPERM_16QI: - case RS6000_BIF_VPERM_8HI: - case RS6000_BIF_VPERM_4SI: - case RS6000_BIF_VPERM_2DI: - case RS6000_BIF_VPERM_4SF: - case RS6000_BIF_VPERM_2DF: - case RS6000_BIF_VPERM_16QI_UNS: - case RS6000_BIF_VPERM_8HI_UNS: - case RS6000_BIF_VPERM_4SI_UNS: - case RS6000_BIF_VPERM_2DI_UNS: - { - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - tree permute = gimple_call_arg (stmt, 2); - lhs = gimple_call_lhs (stmt); - location_t loc = gimple_location (stmt); - gimple_seq stmts = NULL; - // convert arg0 and arg1 to match the type of the permute - // for the VEC_PERM_EXPR operation. - tree permute_type = (TREE_TYPE (permute)); - tree arg0_ptype = gimple_build (&stmts, loc, VIEW_CONVERT_EXPR, - permute_type, arg0); - tree arg1_ptype = gimple_build (&stmts, loc, VIEW_CONVERT_EXPR, - permute_type, arg1); - tree lhs_ptype = gimple_build (&stmts, loc, VEC_PERM_EXPR, - permute_type, arg0_ptype, arg1_ptype, - permute); - // Convert the result back to the desired lhs type upon completion. - tree temp = gimple_build (&stmts, loc, VIEW_CONVERT_EXPR, - TREE_TYPE (lhs), lhs_ptype); - gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); - g = gimple_build_assign (lhs, temp); - gimple_set_location (g, loc); - gsi_replace (gsi, g, true); - return true; - } - - default: - if (TARGET_DEBUG_BUILTIN) - fprintf (stderr, "gimple builtin intrinsic not matched:%d %s %s\n", - fn_code, fn_name1, fn_name2); - break; - } - - return false; -} - -/* Expand ALTIVEC_BUILTIN_MASK_FOR_LOAD. */ -rtx -rs6000_expand_ldst_mask (rtx target, tree arg0) -{ - int icode2 = BYTES_BIG_ENDIAN ? 
(int) CODE_FOR_altivec_lvsr_direct - : (int) CODE_FOR_altivec_lvsl_direct; - machine_mode tmode = insn_data[icode2].operand[0].mode; - machine_mode mode = insn_data[icode2].operand[1].mode; - - gcc_assert (TARGET_ALTIVEC); - - gcc_assert (POINTER_TYPE_P (TREE_TYPE (arg0))); - rtx op = expand_expr (arg0, NULL_RTX, Pmode, EXPAND_NORMAL); - rtx addr = memory_address (mode, op); - /* We need to negate the address. */ - op = gen_reg_rtx (GET_MODE (addr)); - emit_insn (gen_rtx_SET (op, gen_rtx_NEG (GET_MODE (addr), addr))); - op = gen_rtx_MEM (mode, op); - - if (target == 0 - || GET_MODE (target) != tmode - || !insn_data[icode2].operand[0].predicate (target, tmode)) - target = gen_reg_rtx (tmode); - - rtx pat = GEN_FCN (icode2) (target, op); - if (!pat) - return 0; - emit_insn (pat); - - return target; -} - -/* Expand the CPU builtin in FCODE and store the result in TARGET. */ -static rtx -cpu_expand_builtin (enum rs6000_gen_builtins fcode, - tree exp ATTRIBUTE_UNUSED, rtx target) -{ - /* __builtin_cpu_init () is a nop, so expand to nothing. */ - if (fcode == RS6000_BIF_CPU_INIT) - return const0_rtx; - - if (target == 0 || GET_MODE (target) != SImode) - target = gen_reg_rtx (SImode); - - /* TODO: Factor the #ifdef'd code into a separate function. */ -#ifdef TARGET_LIBC_PROVIDES_HWCAP_IN_TCB - tree arg = TREE_OPERAND (CALL_EXPR_ARG (exp, 0), 0); - /* Target clones creates an ARRAY_REF instead of STRING_CST, convert it back - to a STRING_CST. */ - if (TREE_CODE (arg) == ARRAY_REF - && TREE_CODE (TREE_OPERAND (arg, 0)) == STRING_CST - && TREE_CODE (TREE_OPERAND (arg, 1)) == INTEGER_CST - && compare_tree_int (TREE_OPERAND (arg, 1), 0) == 0) - arg = TREE_OPERAND (arg, 0); - - if (TREE_CODE (arg) != STRING_CST) - { - error ("builtin %qs only accepts a string argument", - rs6000_builtin_info[(size_t) fcode].bifname); - return const0_rtx; - } - - if (fcode == RS6000_BIF_CPU_IS) - { - const char *cpu = TREE_STRING_POINTER (arg); - rtx cpuid = NULL_RTX; - for (size_t i = 0; i < ARRAY_SIZE (cpu_is_info); i++) - if (strcmp (cpu, cpu_is_info[i].cpu) == 0) - { - /* The CPUID value in the TCB is offset by _DL_FIRST_PLATFORM. */ - cpuid = GEN_INT (cpu_is_info[i].cpuid + _DL_FIRST_PLATFORM); - break; - } - if (cpuid == NULL_RTX) - { - /* Invalid CPU argument. */ - error ("cpu %qs is an invalid argument to builtin %qs", - cpu, rs6000_builtin_info[(size_t) fcode].bifname); - return const0_rtx; - } - - rtx platform = gen_reg_rtx (SImode); - rtx address = gen_rtx_PLUS (Pmode, - gen_rtx_REG (Pmode, TLS_REGNUM), - GEN_INT (TCB_PLATFORM_OFFSET)); - rtx tcbmem = gen_const_mem (SImode, address); - emit_move_insn (platform, tcbmem); - emit_insn (gen_eqsi3 (target, platform, cpuid)); - } - else if (fcode == RS6000_BIF_CPU_SUPPORTS) - { - const char *hwcap = TREE_STRING_POINTER (arg); - rtx mask = NULL_RTX; - int hwcap_offset; - for (size_t i = 0; i < ARRAY_SIZE (cpu_supports_info); i++) - if (strcmp (hwcap, cpu_supports_info[i].hwcap) == 0) - { - mask = GEN_INT (cpu_supports_info[i].mask); - hwcap_offset = TCB_HWCAP_OFFSET (cpu_supports_info[i].id); - break; - } - if (mask == NULL_RTX) - { - /* Invalid HWCAP argument. 
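For context, these are the user-visible built-ins that cpu_expand_builtin implements; as the fallback warning further below notes, they depend on glibc 2.23 or newer exporting the hardware capability and platform bits in the TCB. A usage sketch, assuming a powerpc*-linux target with a suitable glibc:

#include <stdio.h>

int
main (void)
{
  /* On powerpc this is a no-op (see the CPU_INIT handling above), but it
     keeps the code portable to targets where initialization is needed.  */
  __builtin_cpu_init ();

  if (__builtin_cpu_is ("power9"))
    puts ("running on a POWER9");

  if (__builtin_cpu_supports ("vsx"))
    puts ("VSX is available");

  return 0;
}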
*/ - error ("%s %qs is an invalid argument to builtin %qs", - "hwcap", hwcap, - rs6000_builtin_info[(size_t) fcode].bifname); - return const0_rtx; - } - - rtx tcb_hwcap = gen_reg_rtx (SImode); - rtx address = gen_rtx_PLUS (Pmode, - gen_rtx_REG (Pmode, TLS_REGNUM), - GEN_INT (hwcap_offset)); - rtx tcbmem = gen_const_mem (SImode, address); - emit_move_insn (tcb_hwcap, tcbmem); - rtx scratch1 = gen_reg_rtx (SImode); - emit_insn (gen_rtx_SET (scratch1, - gen_rtx_AND (SImode, tcb_hwcap, mask))); - rtx scratch2 = gen_reg_rtx (SImode); - emit_insn (gen_eqsi3 (scratch2, scratch1, const0_rtx)); - emit_insn (gen_rtx_SET (target, - gen_rtx_XOR (SImode, scratch2, const1_rtx))); - } - else - gcc_unreachable (); - - /* Record that we have expanded a CPU builtin, so that we can later - emit a reference to the special symbol exported by LIBC to ensure we - do not link against an old LIBC that doesn't support this feature. */ - cpu_builtin_p = true; - -#else - warning (0, "builtin %qs needs GLIBC (2.23 and newer) that exports hardware " - "capability bits", rs6000_builtin_info[(size_t) fcode].bifname); - - /* For old LIBCs, always return FALSE. */ - emit_move_insn (target, GEN_INT (0)); -#endif /* TARGET_LIBC_PROVIDES_HWCAP_IN_TCB */ - - return target; -} - -/* For the element-reversing load/store built-ins, produce the correct - insn_code depending on the target endianness. */ -static insn_code -elemrev_icode (rs6000_gen_builtins fcode) -{ - switch (fcode) - { - case RS6000_BIF_ST_ELEMREV_V1TI: - return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_store_v1ti - : CODE_FOR_vsx_st_elemrev_v1ti; - - case RS6000_BIF_ST_ELEMREV_V2DF: - return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_store_v2df - : CODE_FOR_vsx_st_elemrev_v2df; - - case RS6000_BIF_ST_ELEMREV_V2DI: - return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_store_v2di - : CODE_FOR_vsx_st_elemrev_v2di; - - case RS6000_BIF_ST_ELEMREV_V4SF: - return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_store_v4sf - : CODE_FOR_vsx_st_elemrev_v4sf; - - case RS6000_BIF_ST_ELEMREV_V4SI: - return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_store_v4si - : CODE_FOR_vsx_st_elemrev_v4si; - - case RS6000_BIF_ST_ELEMREV_V8HI: - return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_store_v8hi - : CODE_FOR_vsx_st_elemrev_v8hi; - - case RS6000_BIF_ST_ELEMREV_V16QI: - return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_store_v16qi - : CODE_FOR_vsx_st_elemrev_v16qi; - - case RS6000_BIF_LD_ELEMREV_V2DF: - return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_load_v2df - : CODE_FOR_vsx_ld_elemrev_v2df; - - case RS6000_BIF_LD_ELEMREV_V1TI: - return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_load_v1ti - : CODE_FOR_vsx_ld_elemrev_v1ti; - - case RS6000_BIF_LD_ELEMREV_V2DI: - return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_load_v2di - : CODE_FOR_vsx_ld_elemrev_v2di; - - case RS6000_BIF_LD_ELEMREV_V4SF: - return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_load_v4sf - : CODE_FOR_vsx_ld_elemrev_v4sf; - - case RS6000_BIF_LD_ELEMREV_V4SI: - return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_load_v4si - : CODE_FOR_vsx_ld_elemrev_v4si; - - case RS6000_BIF_LD_ELEMREV_V8HI: - return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_load_v8hi - : CODE_FOR_vsx_ld_elemrev_v8hi; - - case RS6000_BIF_LD_ELEMREV_V16QI: - return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_load_v16qi - : CODE_FOR_vsx_ld_elemrev_v16qi; - default: - ; - } - - gcc_unreachable (); -} - -/* Expand an AltiVec vector load builtin, and return the expanded rtx. 
*/ -static rtx -ldv_expand_builtin (rtx target, insn_code icode, rtx *op, machine_mode tmode) -{ - if (target == 0 - || GET_MODE (target) != tmode - || !insn_data[icode].operand[0].predicate (target, tmode)) - target = gen_reg_rtx (tmode); - - op[1] = copy_to_mode_reg (Pmode, op[1]); - - /* These CELL built-ins use BLKmode instead of tmode for historical - (i.e., unknown) reasons. TODO: Is this necessary? */ - bool blk = (icode == CODE_FOR_altivec_lvlx - || icode == CODE_FOR_altivec_lvlxl - || icode == CODE_FOR_altivec_lvrx - || icode == CODE_FOR_altivec_lvrxl); - - /* For LVX, express the RTL accurately by ANDing the address with -16. - LVXL and LVE*X expand to use UNSPECs to hide their special behavior, - so the raw address is fine. */ - /* TODO: That statement seems wrong, as the UNSPECs don't surround the - memory expression, so a latent bug may lie here. The &-16 is likely - needed for all VMX-style loads. */ - if (icode == CODE_FOR_altivec_lvx_v1ti - || icode == CODE_FOR_altivec_lvx_v2df - || icode == CODE_FOR_altivec_lvx_v2di - || icode == CODE_FOR_altivec_lvx_v4sf - || icode == CODE_FOR_altivec_lvx_v4si - || icode == CODE_FOR_altivec_lvx_v8hi - || icode == CODE_FOR_altivec_lvx_v16qi) - { - rtx rawaddr; - if (op[0] == const0_rtx) - rawaddr = op[1]; - else - { - op[0] = copy_to_mode_reg (Pmode, op[0]); - rawaddr = gen_rtx_PLUS (Pmode, op[1], op[0]); - } - rtx addr = gen_rtx_AND (Pmode, rawaddr, gen_rtx_CONST_INT (Pmode, -16)); - addr = gen_rtx_MEM (blk ? BLKmode : tmode, addr); - - emit_insn (gen_rtx_SET (target, addr)); - } - else - { - rtx addr; - if (op[0] == const0_rtx) - addr = gen_rtx_MEM (blk ? BLKmode : tmode, op[1]); - else - { - op[0] = copy_to_mode_reg (Pmode, op[0]); - addr = gen_rtx_MEM (blk ? BLKmode : tmode, - gen_rtx_PLUS (Pmode, op[1], op[0])); - } - - rtx pat = GEN_FCN (icode) (target, addr); - if (!pat) - return 0; - emit_insn (pat); - } - - return target; -} - -/* Expand a builtin function that loads a scalar into a vector register - with sign extension, and return the expanded rtx. */ -static rtx -lxvrse_expand_builtin (rtx target, insn_code icode, rtx *op, - machine_mode tmode, machine_mode smode) -{ - rtx pat, addr; - op[1] = copy_to_mode_reg (Pmode, op[1]); - - if (op[0] == const0_rtx) - addr = gen_rtx_MEM (tmode, op[1]); - else - { - op[0] = copy_to_mode_reg (Pmode, op[0]); - addr = gen_rtx_MEM (smode, - gen_rtx_PLUS (Pmode, op[1], op[0])); - } - - rtx discratch = gen_reg_rtx (V2DImode); - rtx tiscratch = gen_reg_rtx (TImode); - - /* Emit the lxvr*x insn. */ - pat = GEN_FCN (icode) (tiscratch, addr); - if (!pat) - return 0; - emit_insn (pat); - - /* Emit a sign extension from V16QI,V8HI,V4SI to V2DI. */ - rtx temp1; - if (icode == CODE_FOR_vsx_lxvrbx) - { - temp1 = simplify_gen_subreg (V16QImode, tiscratch, TImode, 0); - emit_insn (gen_vsx_sign_extend_qi_v2di (discratch, temp1)); - } - else if (icode == CODE_FOR_vsx_lxvrhx) - { - temp1 = simplify_gen_subreg (V8HImode, tiscratch, TImode, 0); - emit_insn (gen_vsx_sign_extend_hi_v2di (discratch, temp1)); - } - else if (icode == CODE_FOR_vsx_lxvrwx) - { - temp1 = simplify_gen_subreg (V4SImode, tiscratch, TImode, 0); - emit_insn (gen_vsx_sign_extend_si_v2di (discratch, temp1)); - } - else if (icode == CODE_FOR_vsx_lxvrdx) - discratch = simplify_gen_subreg (V2DImode, tiscratch, TImode, 0); - else - gcc_unreachable (); - - /* Emit the sign extension from V2DI (double) to TI (quad). 
*/ - rtx temp2 = simplify_gen_subreg (TImode, discratch, V2DImode, 0); - emit_insn (gen_extendditi2_vector (target, temp2)); - - return target; -} - -/* Expand a builtin function that loads a scalar into a vector register - with zero extension, and return the expanded rtx. */ -static rtx -lxvrze_expand_builtin (rtx target, insn_code icode, rtx *op, - machine_mode tmode, machine_mode smode) -{ - rtx pat, addr; - op[1] = copy_to_mode_reg (Pmode, op[1]); - - if (op[0] == const0_rtx) - addr = gen_rtx_MEM (tmode, op[1]); - else - { - op[0] = copy_to_mode_reg (Pmode, op[0]); - addr = gen_rtx_MEM (smode, - gen_rtx_PLUS (Pmode, op[1], op[0])); - } - - pat = GEN_FCN (icode) (target, addr); - if (!pat) - return 0; - emit_insn (pat); - return target; -} - -/* Expand an AltiVec vector store builtin, and return the expanded rtx. */ -static rtx -stv_expand_builtin (insn_code icode, rtx *op, - machine_mode tmode, machine_mode smode) -{ - op[2] = copy_to_mode_reg (Pmode, op[2]); - - /* For STVX, express the RTL accurately by ANDing the address with -16. - STVXL and STVE*X expand to use UNSPECs to hide their special behavior, - so the raw address is fine. */ - /* TODO: That statement seems wrong, as the UNSPECs don't surround the - memory expression, so a latent bug may lie here. The &-16 is likely - needed for all VMX-style stores. */ - if (icode == CODE_FOR_altivec_stvx_v2df - || icode == CODE_FOR_altivec_stvx_v2di - || icode == CODE_FOR_altivec_stvx_v4sf - || icode == CODE_FOR_altivec_stvx_v4si - || icode == CODE_FOR_altivec_stvx_v8hi - || icode == CODE_FOR_altivec_stvx_v16qi) - { - rtx rawaddr; - if (op[1] == const0_rtx) - rawaddr = op[2]; - else - { - op[1] = copy_to_mode_reg (Pmode, op[1]); - rawaddr = gen_rtx_PLUS (Pmode, op[2], op[1]); - } - - rtx addr = gen_rtx_AND (Pmode, rawaddr, gen_rtx_CONST_INT (Pmode, -16)); - addr = gen_rtx_MEM (tmode, addr); - op[0] = copy_to_mode_reg (tmode, op[0]); - emit_insn (gen_rtx_SET (addr, op[0])); - } - else if (icode == CODE_FOR_vsx_stxvrbx - || icode == CODE_FOR_vsx_stxvrhx - || icode == CODE_FOR_vsx_stxvrwx - || icode == CODE_FOR_vsx_stxvrdx) - { - rtx truncrtx = gen_rtx_TRUNCATE (tmode, op[0]); - op[0] = copy_to_mode_reg (E_TImode, truncrtx); - - rtx addr; - if (op[1] == const0_rtx) - addr = gen_rtx_MEM (Pmode, op[2]); - else - { - op[1] = copy_to_mode_reg (Pmode, op[1]); - addr = gen_rtx_MEM (tmode, gen_rtx_PLUS (Pmode, op[2], op[1])); - } - rtx pat = GEN_FCN (icode) (addr, op[0]); - if (pat) - emit_insn (pat); - } - else - { - if (!insn_data[icode].operand[1].predicate (op[0], smode)) - op[0] = copy_to_mode_reg (smode, op[0]); - - rtx addr; - if (op[1] == const0_rtx) - addr = gen_rtx_MEM (tmode, op[2]); - else - { - op[1] = copy_to_mode_reg (Pmode, op[1]); - addr = gen_rtx_MEM (tmode, gen_rtx_PLUS (Pmode, op[2], op[1])); - } - - rtx pat = GEN_FCN (icode) (addr, op[0]); - if (pat) - emit_insn (pat); - } - - return NULL_RTX; -} - -/* Expand the MMA built-in in EXP, and return it. 
*/ -static rtx -mma_expand_builtin (tree exp, rtx target, insn_code icode, - rs6000_gen_builtins fcode) -{ - tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0); - bool void_func = TREE_TYPE (TREE_TYPE (fndecl)) == void_type_node; - machine_mode tmode = VOIDmode; - rtx op[MAX_MMA_OPERANDS]; - unsigned nopnds = 0; - - if (!void_func) - { - tmode = insn_data[icode].operand[0].mode; - if (!(target - && GET_MODE (target) == tmode - && insn_data[icode].operand[0].predicate (target, tmode))) - target = gen_reg_rtx (tmode); - op[nopnds++] = target; - } - else - target = const0_rtx; - - call_expr_arg_iterator iter; - tree arg; - FOR_EACH_CALL_EXPR_ARG (arg, iter, exp) - { - if (arg == error_mark_node) - return const0_rtx; - - rtx opnd; - const struct insn_operand_data *insn_op; - insn_op = &insn_data[icode].operand[nopnds]; - if (TREE_CODE (arg) == ADDR_EXPR - && MEM_P (DECL_RTL (TREE_OPERAND (arg, 0)))) - opnd = DECL_RTL (TREE_OPERAND (arg, 0)); - else - opnd = expand_normal (arg); - - if (!insn_op->predicate (opnd, insn_op->mode)) - { - /* TODO: This use of constraints needs explanation. */ - if (!strcmp (insn_op->constraint, "n")) - { - if (!CONST_INT_P (opnd)) - error ("argument %d must be an unsigned literal", nopnds); - else - error ("argument %d is an unsigned literal that is " - "out of range", nopnds); - return const0_rtx; - } - opnd = copy_to_mode_reg (insn_op->mode, opnd); - } - - /* Some MMA instructions have INOUT accumulator operands, so force - their target register to be the same as their input register. */ - if (!void_func - && nopnds == 1 - && !strcmp (insn_op->constraint, "0") - && insn_op->mode == tmode - && REG_P (opnd) - && insn_data[icode].operand[0].predicate (opnd, tmode)) - target = op[0] = opnd; - - op[nopnds++] = opnd; - } - - rtx pat; - switch (nopnds) - { - case 1: - pat = GEN_FCN (icode) (op[0]); - break; - case 2: - pat = GEN_FCN (icode) (op[0], op[1]); - break; - case 3: - /* The ASSEMBLE builtin source operands are reversed in little-endian - mode, so reorder them. */ - if (fcode == RS6000_BIF_ASSEMBLE_PAIR_V_INTERNAL && !WORDS_BIG_ENDIAN) - std::swap (op[1], op[2]); - pat = GEN_FCN (icode) (op[0], op[1], op[2]); - break; - case 4: - pat = GEN_FCN (icode) (op[0], op[1], op[2], op[3]); - break; - case 5: - /* The ASSEMBLE builtin source operands are reversed in little-endian - mode, so reorder them. */ - if (fcode == RS6000_BIF_ASSEMBLE_ACC_INTERNAL && !WORDS_BIG_ENDIAN) - { - std::swap (op[1], op[4]); - std::swap (op[2], op[3]); - } - pat = GEN_FCN (icode) (op[0], op[1], op[2], op[3], op[4]); - break; - case 6: - pat = GEN_FCN (icode) (op[0], op[1], op[2], op[3], op[4], op[5]); - break; - case 7: - pat = GEN_FCN (icode) (op[0], op[1], op[2], op[3], op[4], op[5], op[6]); - break; - default: - gcc_unreachable (); - } - - if (!pat) - return NULL_RTX; - - emit_insn (pat); - return target; -} - -/* Return the appropriate SPR number associated with the given builtin. */ -static inline HOST_WIDE_INT -htm_spr_num (enum rs6000_gen_builtins code) -{ - if (code == RS6000_BIF_GET_TFHAR - || code == RS6000_BIF_SET_TFHAR) - return TFHAR_SPR; - else if (code == RS6000_BIF_GET_TFIAR - || code == RS6000_BIF_SET_TFIAR) - return TFIAR_SPR; - else if (code == RS6000_BIF_GET_TEXASR - || code == RS6000_BIF_SET_TEXASR) - return TEXASR_SPR; - gcc_assert (code == RS6000_BIF_GET_TEXASRU - || code == RS6000_BIF_SET_TEXASRU); - return TEXASRU_SPR; -} - -/* Expand the HTM builtin in EXP and store the result in TARGET. - Return the expanded rtx. 
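The little-endian operand reordering for the ASSEMBLE built-ins handled above is invisible at the source level, where the MMA built-ins are used roughly as in the sketch below. The specific names shown (__builtin_mma_xxsetaccz, __builtin_mma_xvf32gerpp, __builtin_mma_disassemble_acc) follow the documented GCC MMA interface and assume a target built with -mcpu=power10 or -mmma; treat this as an illustration rather than part of the patch:

#include <altivec.h>

typedef vector unsigned char vec_t;

void
accumulate (vec_t a, vec_t b, vec_t result[4])
{
  __vector_quad acc;

  __builtin_mma_xxsetaccz (&acc);           /* Zero the accumulator.  */
  __builtin_mma_xvf32gerpp (&acc, a, b);    /* acc += a * b (fp32 GER).  */
  __builtin_mma_disassemble_acc (result, &acc);   /* Copy out four vectors.  */
}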
*/ -static rtx -htm_expand_builtin (bifdata *bifaddr, rs6000_gen_builtins fcode, - tree exp, rtx target) -{ - if (!TARGET_POWERPC64 - && (fcode == RS6000_BIF_TABORTDC - || fcode == RS6000_BIF_TABORTDCI)) - { - error ("builtin %qs is only valid in 64-bit mode", bifaddr->bifname); - return const0_rtx; - } - - tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0); - bool nonvoid = TREE_TYPE (TREE_TYPE (fndecl)) != void_type_node; - bool uses_spr = bif_is_htmspr (*bifaddr); - insn_code icode = bifaddr->icode; - - if (uses_spr) - icode = rs6000_htm_spr_icode (nonvoid); - - rtx op[MAX_HTM_OPERANDS]; - int nopnds = 0; - const insn_operand_data *insn_op = &insn_data[icode].operand[0]; - - if (nonvoid) - { - machine_mode tmode = (uses_spr) ? insn_op->mode : E_SImode; - if (!target - || GET_MODE (target) != tmode - || (uses_spr && !insn_op->predicate (target, tmode))) - target = gen_reg_rtx (tmode); - if (uses_spr) - op[nopnds++] = target; - } - - tree arg; - call_expr_arg_iterator iter; - - FOR_EACH_CALL_EXPR_ARG (arg, iter, exp) - { - if (arg == error_mark_node || nopnds >= MAX_HTM_OPERANDS) - return const0_rtx; - - insn_op = &insn_data[icode].operand[nopnds]; - op[nopnds] = expand_normal (arg); - - if (!insn_op->predicate (op[nopnds], insn_op->mode)) - { - /* TODO: This use of constraints could use explanation. - This happens a couple of places, perhaps make that a - function to document what's happening. */ - if (!strcmp (insn_op->constraint, "n")) - { - int arg_num = nonvoid ? nopnds : nopnds + 1; - if (!CONST_INT_P (op[nopnds])) - error ("argument %d must be an unsigned literal", arg_num); - else - error ("argument %d is an unsigned literal that is " - "out of range", arg_num); - return const0_rtx; - } - op[nopnds] = copy_to_mode_reg (insn_op->mode, op[nopnds]); - } - - nopnds++; - } - - /* Handle the builtins for extended mnemonics. These accept - no arguments, but map to builtins that take arguments. */ - switch (fcode) - { - case RS6000_BIF_TENDALL: /* Alias for: tend. 1 */ - case RS6000_BIF_TRESUME: /* Alias for: tsr. 1 */ - op[nopnds++] = GEN_INT (1); - break; - case RS6000_BIF_TSUSPEND: /* Alias for: tsr. 0 */ - op[nopnds++] = GEN_INT (0); - break; - default: - break; - } - - /* If this builtin accesses SPRs, then pass in the appropriate - SPR number and SPR regno as the last two operands. */ - rtx cr = NULL_RTX; - if (uses_spr) - { - machine_mode mode = TARGET_POWERPC64 ? DImode : SImode; - op[nopnds++] = gen_rtx_CONST_INT (mode, htm_spr_num (fcode)); - } - /* If this builtin accesses a CR field, then pass in a scratch - CR field as the last operand. */ - else if (bif_is_htmcr (*bifaddr)) - { - cr = gen_reg_rtx (CCmode); - op[nopnds++] = cr; - } - - rtx pat; - switch (nopnds) - { - case 1: - pat = GEN_FCN (icode) (op[0]); - break; - case 2: - pat = GEN_FCN (icode) (op[0], op[1]); - break; - case 3: - pat = GEN_FCN (icode) (op[0], op[1], op[2]); - break; - case 4: - pat = GEN_FCN (icode) (op[0], op[1], op[2], op[3]); - break; - default: - gcc_unreachable (); - } - if (!pat) - return NULL_RTX; - emit_insn (pat); - - if (bif_is_htmcr (*bifaddr)) - { - if (fcode == RS6000_BIF_TBEGIN) - { - /* Emit code to set TARGET to true or false depending on - whether the tbegin. instruction succeeded or failed - to start a transaction. We do this by placing the 1's - complement of CR's EQ bit into TARGET. 
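The CR0 handling described in the comment above is what makes __builtin_tbegin usable as a boolean at the source level: the complemented EQ bit means the built-in returns nonzero when the transaction actually starts. A hedged usage sketch, assuming -mhtm on a POWER8 or later target with transactional memory enabled; a real caller would also provide a non-transactional fallback path:

long counter;

void
transactional_increment (void)
{
  if (__builtin_tbegin (0))
    {
      /* Transactional state: tbegin. succeeded, so the built-in returned
         nonzero (the complemented CR0 EQ bit computed above).  */
      counter++;
      __builtin_tend (0);
    }
  else
    {
      /* Transaction failed to start or was aborted; retry or fall back.  */
    }
}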
*/ - rtx scratch = gen_reg_rtx (SImode); - emit_insn (gen_rtx_SET (scratch, - gen_rtx_EQ (SImode, cr, - const0_rtx))); - emit_insn (gen_rtx_SET (target, - gen_rtx_XOR (SImode, scratch, - GEN_INT (1)))); - } - else - { - /* Emit code to copy the 4-bit condition register field - CR into the least significant end of register TARGET. */ - rtx scratch1 = gen_reg_rtx (SImode); - rtx scratch2 = gen_reg_rtx (SImode); - rtx subreg = simplify_gen_subreg (CCmode, scratch1, SImode, 0); - emit_insn (gen_movcc (subreg, cr)); - emit_insn (gen_lshrsi3 (scratch2, scratch1, GEN_INT (28))); - emit_insn (gen_andsi3 (target, scratch2, GEN_INT (0xf))); - } - } - - if (nonvoid) - return target; - return const0_rtx; -} - -/* Expand an expression EXP that calls a built-in function, - with result going to TARGET if that's convenient - (and in mode MODE if that's convenient). - SUBTARGET may be used as the target for computing one of EXP's operands. - IGNORE is nonzero if the value is to be ignored. - Use the new builtin infrastructure. */ -rtx -rs6000_expand_builtin (tree exp, rtx target, rtx /* subtarget */, - machine_mode /* mode */, int ignore) -{ - tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0); - enum rs6000_gen_builtins fcode - = (enum rs6000_gen_builtins) DECL_MD_FUNCTION_CODE (fndecl); - size_t uns_fcode = (size_t)fcode; - enum insn_code icode = rs6000_builtin_info[uns_fcode].icode; - - /* TODO: The following commentary and code is inherited from the original - builtin processing code. The commentary is a bit confusing, with the - intent being that KFmode is always IEEE-128, IFmode is always IBM - double-double, and TFmode is the current long double. The code is - confusing in that it converts from KFmode to TFmode pattern names, - when the other direction is more intuitive. Try to address this. */ - - /* We have two different modes (KFmode, TFmode) that are the IEEE - 128-bit floating point type, depending on whether long double is the - IBM extended double (KFmode) or long double is IEEE 128-bit (TFmode). - It is simpler if we only define one variant of the built-in function, - and switch the code when defining it, rather than defining two built- - ins and using the overload table in rs6000-c.cc to switch between the - two. If we don't have the proper assembler, don't do this switch - because CODE_FOR_*kf* and CODE_FOR_*tf* will be CODE_FOR_nothing. 
*/ - if (FLOAT128_IEEE_P (TFmode)) - switch (icode) - { - case CODE_FOR_sqrtkf2_odd: - icode = CODE_FOR_sqrttf2_odd; - break; - case CODE_FOR_trunckfdf2_odd: - icode = CODE_FOR_trunctfdf2_odd; - break; - case CODE_FOR_addkf3_odd: - icode = CODE_FOR_addtf3_odd; - break; - case CODE_FOR_subkf3_odd: - icode = CODE_FOR_subtf3_odd; - break; - case CODE_FOR_mulkf3_odd: - icode = CODE_FOR_multf3_odd; - break; - case CODE_FOR_divkf3_odd: - icode = CODE_FOR_divtf3_odd; - break; - case CODE_FOR_fmakf4_odd: - icode = CODE_FOR_fmatf4_odd; - break; - case CODE_FOR_xsxexpqp_kf: - icode = CODE_FOR_xsxexpqp_tf; - break; - case CODE_FOR_xsxsigqp_kf: - icode = CODE_FOR_xsxsigqp_tf; - break; - case CODE_FOR_xststdcnegqp_kf: - icode = CODE_FOR_xststdcnegqp_tf; - break; - case CODE_FOR_xsiexpqp_kf: - icode = CODE_FOR_xsiexpqp_tf; - break; - case CODE_FOR_xsiexpqpf_kf: - icode = CODE_FOR_xsiexpqpf_tf; - break; - case CODE_FOR_xststdcqp_kf: - icode = CODE_FOR_xststdcqp_tf; - break; - case CODE_FOR_xscmpexpqp_eq_kf: - icode = CODE_FOR_xscmpexpqp_eq_tf; - break; - case CODE_FOR_xscmpexpqp_lt_kf: - icode = CODE_FOR_xscmpexpqp_lt_tf; - break; - case CODE_FOR_xscmpexpqp_gt_kf: - icode = CODE_FOR_xscmpexpqp_gt_tf; - break; - case CODE_FOR_xscmpexpqp_unordered_kf: - icode = CODE_FOR_xscmpexpqp_unordered_tf; - break; - default: - break; - } - - /* In case of "#pragma target" changes, we initialize all builtins - but check for actual availability now, during expand time. For - invalid builtins, generate a normal call. */ - bifdata *bifaddr = &rs6000_builtin_info[uns_fcode]; - bif_enable e = bifaddr->enable; - - if (!(e == ENB_ALWAYS - || (e == ENB_P5 && TARGET_POPCNTB) - || (e == ENB_P6 && TARGET_CMPB) - || (e == ENB_P6_64 && TARGET_CMPB && TARGET_POWERPC64) - || (e == ENB_ALTIVEC && TARGET_ALTIVEC) - || (e == ENB_CELL && TARGET_ALTIVEC && rs6000_cpu == PROCESSOR_CELL) - || (e == ENB_VSX && TARGET_VSX) - || (e == ENB_P7 && TARGET_POPCNTD) - || (e == ENB_P7_64 && TARGET_POPCNTD && TARGET_POWERPC64) - || (e == ENB_P8 && TARGET_DIRECT_MOVE) - || (e == ENB_P8V && TARGET_P8_VECTOR) - || (e == ENB_P9 && TARGET_MODULO) - || (e == ENB_P9_64 && TARGET_MODULO && TARGET_POWERPC64) - || (e == ENB_P9V && TARGET_P9_VECTOR) - || (e == ENB_IEEE128_HW && TARGET_FLOAT128_HW) - || (e == ENB_DFP && TARGET_DFP) - || (e == ENB_CRYPTO && TARGET_CRYPTO) - || (e == ENB_HTM && TARGET_HTM) - || (e == ENB_P10 && TARGET_POWER10) - || (e == ENB_P10_64 && TARGET_POWER10 && TARGET_POWERPC64) - || (e == ENB_MMA && TARGET_MMA))) - { - rs6000_invalid_builtin (fcode); - return expand_call (exp, target, ignore); - } - - if (bif_is_nosoft (*bifaddr) - && rs6000_isa_flags & OPTION_MASK_SOFT_FLOAT) - { - error ("%qs not supported with %<-msoft-float%>", - bifaddr->bifname); - return const0_rtx; - } - - if (bif_is_no32bit (*bifaddr) && TARGET_32BIT) - { - error ("%qs is not supported in 32-bit mode", bifaddr->bifname); - return const0_rtx; - } - - if (bif_is_ibmld (*bifaddr) && !FLOAT128_2REG_P (TFmode)) - { - error ("%qs requires % to be IBM 128-bit format", - bifaddr->bifname); - return const0_rtx; - } - - if (bif_is_cpu (*bifaddr)) - return cpu_expand_builtin (fcode, exp, target); - - if (bif_is_init (*bifaddr)) - return altivec_expand_vec_init_builtin (TREE_TYPE (exp), exp, target); - - if (bif_is_set (*bifaddr)) - return altivec_expand_vec_set_builtin (exp); - - if (bif_is_extract (*bifaddr)) - return altivec_expand_vec_ext_builtin (exp, target); - - if (bif_is_predicate (*bifaddr)) - return altivec_expand_predicate_builtin (icode, exp, target); - - if 
(bif_is_htm (*bifaddr)) - return htm_expand_builtin (bifaddr, fcode, exp, target); - - if (bif_is_32bit (*bifaddr) && TARGET_32BIT) - { - if (fcode == RS6000_BIF_MFTB) - icode = CODE_FOR_rs6000_mftb_si; - else if (fcode == RS6000_BIF_BPERMD) - icode = CODE_FOR_bpermd_si; - else if (fcode == RS6000_BIF_DARN) - icode = CODE_FOR_darn_64_si; - else if (fcode == RS6000_BIF_DARN_32) - icode = CODE_FOR_darn_32_si; - else if (fcode == RS6000_BIF_DARN_RAW) - icode = CODE_FOR_darn_raw_si; - else - gcc_unreachable (); - } - - if (bif_is_endian (*bifaddr) && BYTES_BIG_ENDIAN) - { - if (fcode == RS6000_BIF_LD_ELEMREV_V1TI) - icode = CODE_FOR_vsx_load_v1ti; - else if (fcode == RS6000_BIF_LD_ELEMREV_V2DF) - icode = CODE_FOR_vsx_load_v2df; - else if (fcode == RS6000_BIF_LD_ELEMREV_V2DI) - icode = CODE_FOR_vsx_load_v2di; - else if (fcode == RS6000_BIF_LD_ELEMREV_V4SF) - icode = CODE_FOR_vsx_load_v4sf; - else if (fcode == RS6000_BIF_LD_ELEMREV_V4SI) - icode = CODE_FOR_vsx_load_v4si; - else if (fcode == RS6000_BIF_LD_ELEMREV_V8HI) - icode = CODE_FOR_vsx_load_v8hi; - else if (fcode == RS6000_BIF_LD_ELEMREV_V16QI) - icode = CODE_FOR_vsx_load_v16qi; - else if (fcode == RS6000_BIF_ST_ELEMREV_V1TI) - icode = CODE_FOR_vsx_store_v1ti; - else if (fcode == RS6000_BIF_ST_ELEMREV_V2DF) - icode = CODE_FOR_vsx_store_v2df; - else if (fcode == RS6000_BIF_ST_ELEMREV_V2DI) - icode = CODE_FOR_vsx_store_v2di; - else if (fcode == RS6000_BIF_ST_ELEMREV_V4SF) - icode = CODE_FOR_vsx_store_v4sf; - else if (fcode == RS6000_BIF_ST_ELEMREV_V4SI) - icode = CODE_FOR_vsx_store_v4si; - else if (fcode == RS6000_BIF_ST_ELEMREV_V8HI) - icode = CODE_FOR_vsx_store_v8hi; - else if (fcode == RS6000_BIF_ST_ELEMREV_V16QI) - icode = CODE_FOR_vsx_store_v16qi; - else - gcc_unreachable (); - } - - - /* TRUE iff the built-in function returns void. */ - bool void_func = TREE_TYPE (TREE_TYPE (fndecl)) == void_type_node; - /* Position of first argument (0 for void-returning functions, else 1). */ - int k; - /* Modes for the return value, if any, and arguments. */ - const int MAX_BUILTIN_ARGS = 6; - machine_mode mode[MAX_BUILTIN_ARGS + 1]; - - if (void_func) - k = 0; - else - { - k = 1; - mode[0] = insn_data[icode].operand[0].mode; - } - - /* Tree expressions for each argument. */ - tree arg[MAX_BUILTIN_ARGS]; - /* RTL expressions for each argument. */ - rtx op[MAX_BUILTIN_ARGS]; - - int nargs = bifaddr->nargs; - gcc_assert (nargs <= MAX_BUILTIN_ARGS); - - - for (int i = 0; i < nargs; i++) - { - arg[i] = CALL_EXPR_ARG (exp, i); - if (arg[i] == error_mark_node) - return const0_rtx; - STRIP_NOPS (arg[i]); - op[i] = expand_normal (arg[i]); - /* We have a couple of pesky patterns that don't specify the mode... */ - mode[i+k] = insn_data[icode].operand[i+k].mode; - if (!mode[i+k]) - mode[i+k] = Pmode; - } - - /* Check for restricted constant arguments. 
*/ - for (int i = 0; i < 2; i++) - { - switch (bifaddr->restr[i]) - { - case RES_BITS: - { - size_t mask = 1; - mask <<= bifaddr->restr_val1[i]; - mask--; - tree restr_arg = arg[bifaddr->restr_opnd[i] - 1]; - STRIP_NOPS (restr_arg); - if (!(TREE_CODE (restr_arg) == INTEGER_CST - && (TREE_INT_CST_LOW (restr_arg) & ~mask) == 0)) - { - unsigned p = (1U << bifaddr->restr_val1[i]) - 1; - error ("argument %d must be a literal between 0 and %d," - " inclusive", - bifaddr->restr_opnd[i], p); - return CONST0_RTX (mode[0]); - } - break; - } - case RES_RANGE: - { - tree restr_arg = arg[bifaddr->restr_opnd[i] - 1]; - STRIP_NOPS (restr_arg); - if (!(TREE_CODE (restr_arg) == INTEGER_CST - && IN_RANGE (tree_to_shwi (restr_arg), - bifaddr->restr_val1[i], - bifaddr->restr_val2[i]))) - { - error ("argument %d must be a literal between %d and %d," - " inclusive", - bifaddr->restr_opnd[i], bifaddr->restr_val1[i], - bifaddr->restr_val2[i]); - return CONST0_RTX (mode[0]); - } - break; - } - case RES_VAR_RANGE: - { - tree restr_arg = arg[bifaddr->restr_opnd[i] - 1]; - STRIP_NOPS (restr_arg); - if (TREE_CODE (restr_arg) == INTEGER_CST - && !IN_RANGE (tree_to_shwi (restr_arg), - bifaddr->restr_val1[i], - bifaddr->restr_val2[i])) - { - error ("argument %d must be a variable or a literal " - "between %d and %d, inclusive", - bifaddr->restr_opnd[i], bifaddr->restr_val1[i], - bifaddr->restr_val2[i]); - return CONST0_RTX (mode[0]); - } - break; - } - case RES_VALUES: - { - tree restr_arg = arg[bifaddr->restr_opnd[i] - 1]; - STRIP_NOPS (restr_arg); - if (!(TREE_CODE (restr_arg) == INTEGER_CST - && (tree_to_shwi (restr_arg) == bifaddr->restr_val1[i] - || tree_to_shwi (restr_arg) == bifaddr->restr_val2[i]))) - { - error ("argument %d must be either a literal %d or a " - "literal %d", - bifaddr->restr_opnd[i], bifaddr->restr_val1[i], - bifaddr->restr_val2[i]); - return CONST0_RTX (mode[0]); - } - break; - } - default: - case RES_NONE: - break; - } - } - - if (bif_is_ldstmask (*bifaddr)) - return rs6000_expand_ldst_mask (target, arg[0]); - - if (bif_is_stvec (*bifaddr)) - { - if (bif_is_reve (*bifaddr)) - icode = elemrev_icode (fcode); - return stv_expand_builtin (icode, op, mode[0], mode[1]); - } - - if (bif_is_ldvec (*bifaddr)) - { - if (bif_is_reve (*bifaddr)) - icode = elemrev_icode (fcode); - return ldv_expand_builtin (target, icode, op, mode[0]); - } - - if (bif_is_lxvrse (*bifaddr)) - return lxvrse_expand_builtin (target, icode, op, mode[0], mode[1]); - - if (bif_is_lxvrze (*bifaddr)) - return lxvrze_expand_builtin (target, icode, op, mode[0], mode[1]); - - if (bif_is_mma (*bifaddr)) - return mma_expand_builtin (exp, target, icode, fcode); - - if (fcode == RS6000_BIF_PACK_IF - && TARGET_LONG_DOUBLE_128 - && !TARGET_IEEEQUAD) - { - icode = CODE_FOR_packtf; - fcode = RS6000_BIF_PACK_TF; - uns_fcode = (size_t) fcode; - } - else if (fcode == RS6000_BIF_UNPACK_IF - && TARGET_LONG_DOUBLE_128 - && !TARGET_IEEEQUAD) - { - icode = CODE_FOR_unpacktf; - fcode = RS6000_BIF_UNPACK_TF; - uns_fcode = (size_t) fcode; - } - - if (TREE_TYPE (TREE_TYPE (fndecl)) == void_type_node) - target = NULL_RTX; - else if (target == 0 - || GET_MODE (target) != mode[0] - || !insn_data[icode].operand[0].predicate (target, mode[0])) - target = gen_reg_rtx (mode[0]); - - for (int i = 0; i < nargs; i++) - if (!insn_data[icode].operand[i+k].predicate (op[i], mode[i+k])) - op[i] = copy_to_mode_reg (mode[i+k], op[i]); - - rtx pat; - - switch (nargs) - { - case 0: - pat = (void_func - ? 
GEN_FCN (icode) () - : GEN_FCN (icode) (target)); - break; - case 1: - pat = (void_func - ? GEN_FCN (icode) (op[0]) - : GEN_FCN (icode) (target, op[0])); - break; - case 2: - pat = (void_func - ? GEN_FCN (icode) (op[0], op[1]) - : GEN_FCN (icode) (target, op[0], op[1])); - break; - case 3: - pat = (void_func - ? GEN_FCN (icode) (op[0], op[1], op[2]) - : GEN_FCN (icode) (target, op[0], op[1], op[2])); - break; - case 4: - pat = (void_func - ? GEN_FCN (icode) (op[0], op[1], op[2], op[3]) - : GEN_FCN (icode) (target, op[0], op[1], op[2], op[3])); - break; - case 5: - pat = (void_func - ? GEN_FCN (icode) (op[0], op[1], op[2], op[3], op[4]) - : GEN_FCN (icode) (target, op[0], op[1], op[2], op[3], op[4])); - break; - case 6: - pat = (void_func - ? GEN_FCN (icode) (op[0], op[1], op[2], op[3], op[4], op[5]) - : GEN_FCN (icode) (target, op[0], op[1], - op[2], op[3], op[4], op[5])); - break; - default: - gcc_assert (MAX_BUILTIN_ARGS == 6); - gcc_unreachable (); - } - - if (!pat) - return 0; - - emit_insn (pat); - return target; -} - -/* Create a builtin vector type with a name. Taking care not to give - the canonical type a name. */ - -static tree -rs6000_vector_type (const char *name, tree elt_type, unsigned num_elts) -{ - tree result = build_vector_type (elt_type, num_elts); - - /* Copy so we don't give the canonical type a name. */ - result = build_variant_type_copy (result); - - add_builtin_type (name, result); - - return result; -} - -void -rs6000_init_builtins (void) -{ - tree tdecl; - tree t; - - if (TARGET_DEBUG_BUILTIN) - fprintf (stderr, "rs6000_init_builtins%s%s\n", - (TARGET_ALTIVEC) ? ", altivec" : "", - (TARGET_VSX) ? ", vsx" : ""); - - V2DI_type_node = rs6000_vector_type ("__vector long long", - long_long_integer_type_node, 2); - ptr_V2DI_type_node - = build_pointer_type (build_qualified_type (V2DI_type_node, - TYPE_QUAL_CONST)); - - V2DF_type_node = rs6000_vector_type ("__vector double", double_type_node, 2); - ptr_V2DF_type_node - = build_pointer_type (build_qualified_type (V2DF_type_node, - TYPE_QUAL_CONST)); - - V4SI_type_node = rs6000_vector_type ("__vector signed int", - intSI_type_node, 4); - ptr_V4SI_type_node - = build_pointer_type (build_qualified_type (V4SI_type_node, - TYPE_QUAL_CONST)); - - V4SF_type_node = rs6000_vector_type ("__vector float", float_type_node, 4); - ptr_V4SF_type_node - = build_pointer_type (build_qualified_type (V4SF_type_node, - TYPE_QUAL_CONST)); - - V8HI_type_node = rs6000_vector_type ("__vector signed short", - intHI_type_node, 8); - ptr_V8HI_type_node - = build_pointer_type (build_qualified_type (V8HI_type_node, - TYPE_QUAL_CONST)); - - V16QI_type_node = rs6000_vector_type ("__vector signed char", - intQI_type_node, 16); - ptr_V16QI_type_node - = build_pointer_type (build_qualified_type (V16QI_type_node, - TYPE_QUAL_CONST)); - - unsigned_V16QI_type_node = rs6000_vector_type ("__vector unsigned char", - unsigned_intQI_type_node, 16); - ptr_unsigned_V16QI_type_node - = build_pointer_type (build_qualified_type (unsigned_V16QI_type_node, - TYPE_QUAL_CONST)); - - unsigned_V8HI_type_node = rs6000_vector_type ("__vector unsigned short", - unsigned_intHI_type_node, 8); - ptr_unsigned_V8HI_type_node - = build_pointer_type (build_qualified_type (unsigned_V8HI_type_node, - TYPE_QUAL_CONST)); - - unsigned_V4SI_type_node = rs6000_vector_type ("__vector unsigned int", - unsigned_intSI_type_node, 4); - ptr_unsigned_V4SI_type_node - = build_pointer_type (build_qualified_type (unsigned_V4SI_type_node, - TYPE_QUAL_CONST)); - - unsigned_V2DI_type_node - = 
rs6000_vector_type ("__vector unsigned long long", - long_long_unsigned_type_node, 2); - - ptr_unsigned_V2DI_type_node - = build_pointer_type (build_qualified_type (unsigned_V2DI_type_node, - TYPE_QUAL_CONST)); - - opaque_V4SI_type_node = build_opaque_vector_type (intSI_type_node, 4); - - const_str_type_node - = build_pointer_type (build_qualified_type (char_type_node, - TYPE_QUAL_CONST)); - - /* We use V1TI mode as a special container to hold __int128_t items that - must live in VSX registers. */ - if (intTI_type_node) - { - V1TI_type_node = rs6000_vector_type ("__vector __int128", - intTI_type_node, 1); - ptr_V1TI_type_node - = build_pointer_type (build_qualified_type (V1TI_type_node, - TYPE_QUAL_CONST)); - unsigned_V1TI_type_node - = rs6000_vector_type ("__vector unsigned __int128", - unsigned_intTI_type_node, 1); - ptr_unsigned_V1TI_type_node - = build_pointer_type (build_qualified_type (unsigned_V1TI_type_node, - TYPE_QUAL_CONST)); - } - - /* The 'vector bool ...' types must be kept distinct from 'vector unsigned ...' - types, especially in C++ land. Similarly, 'vector pixel' is distinct from - 'vector unsigned short'. */ - - bool_char_type_node = build_distinct_type_copy (unsigned_intQI_type_node); - bool_short_type_node = build_distinct_type_copy (unsigned_intHI_type_node); - bool_int_type_node = build_distinct_type_copy (unsigned_intSI_type_node); - bool_long_long_type_node = build_distinct_type_copy (unsigned_intDI_type_node); - pixel_type_node = build_distinct_type_copy (unsigned_intHI_type_node); - - long_integer_type_internal_node = long_integer_type_node; - long_unsigned_type_internal_node = long_unsigned_type_node; - long_long_integer_type_internal_node = long_long_integer_type_node; - long_long_unsigned_type_internal_node = long_long_unsigned_type_node; - intQI_type_internal_node = intQI_type_node; - uintQI_type_internal_node = unsigned_intQI_type_node; - intHI_type_internal_node = intHI_type_node; - uintHI_type_internal_node = unsigned_intHI_type_node; - intSI_type_internal_node = intSI_type_node; - uintSI_type_internal_node = unsigned_intSI_type_node; - intDI_type_internal_node = intDI_type_node; - uintDI_type_internal_node = unsigned_intDI_type_node; - intTI_type_internal_node = intTI_type_node; - uintTI_type_internal_node = unsigned_intTI_type_node; - float_type_internal_node = float_type_node; - double_type_internal_node = double_type_node; - long_double_type_internal_node = long_double_type_node; - dfloat64_type_internal_node = dfloat64_type_node; - dfloat128_type_internal_node = dfloat128_type_node; - void_type_internal_node = void_type_node; - - ptr_intQI_type_node - = build_pointer_type (build_qualified_type (intQI_type_internal_node, - TYPE_QUAL_CONST)); - ptr_uintQI_type_node - = build_pointer_type (build_qualified_type (uintQI_type_internal_node, - TYPE_QUAL_CONST)); - ptr_intHI_type_node - = build_pointer_type (build_qualified_type (intHI_type_internal_node, - TYPE_QUAL_CONST)); - ptr_uintHI_type_node - = build_pointer_type (build_qualified_type (uintHI_type_internal_node, - TYPE_QUAL_CONST)); - ptr_intSI_type_node - = build_pointer_type (build_qualified_type (intSI_type_internal_node, - TYPE_QUAL_CONST)); - ptr_uintSI_type_node - = build_pointer_type (build_qualified_type (uintSI_type_internal_node, - TYPE_QUAL_CONST)); - ptr_intDI_type_node - = build_pointer_type (build_qualified_type (intDI_type_internal_node, - TYPE_QUAL_CONST)); - ptr_uintDI_type_node - = build_pointer_type (build_qualified_type (uintDI_type_internal_node, - TYPE_QUAL_CONST)); - 
ptr_intTI_type_node - = build_pointer_type (build_qualified_type (intTI_type_internal_node, - TYPE_QUAL_CONST)); - ptr_uintTI_type_node - = build_pointer_type (build_qualified_type (uintTI_type_internal_node, - TYPE_QUAL_CONST)); - - t = build_qualified_type (long_integer_type_internal_node, TYPE_QUAL_CONST); - ptr_long_integer_type_node = build_pointer_type (t); - - t = build_qualified_type (long_unsigned_type_internal_node, TYPE_QUAL_CONST); - ptr_long_unsigned_type_node = build_pointer_type (t); - - ptr_float_type_node - = build_pointer_type (build_qualified_type (float_type_internal_node, - TYPE_QUAL_CONST)); - ptr_double_type_node - = build_pointer_type (build_qualified_type (double_type_internal_node, - TYPE_QUAL_CONST)); - ptr_long_double_type_node - = build_pointer_type (build_qualified_type (long_double_type_internal_node, - TYPE_QUAL_CONST)); - if (dfloat64_type_node) - { - t = build_qualified_type (dfloat64_type_internal_node, TYPE_QUAL_CONST); - ptr_dfloat64_type_node = build_pointer_type (t); - } - else - ptr_dfloat64_type_node = NULL; - - if (dfloat128_type_node) - { - t = build_qualified_type (dfloat128_type_internal_node, TYPE_QUAL_CONST); - ptr_dfloat128_type_node = build_pointer_type (t); - } - else - ptr_dfloat128_type_node = NULL; - - t = build_qualified_type (long_long_integer_type_internal_node, - TYPE_QUAL_CONST); - ptr_long_long_integer_type_node = build_pointer_type (t); - - t = build_qualified_type (long_long_unsigned_type_internal_node, - TYPE_QUAL_CONST); - ptr_long_long_unsigned_type_node = build_pointer_type (t); - - /* 128-bit floating point support. KFmode is IEEE 128-bit floating point. - IFmode is the IBM extended 128-bit format that is a pair of doubles. - TFmode will be either IEEE 128-bit floating point or the IBM double-double - format that uses a pair of doubles, depending on the switches and - defaults. - - If we don't support for either 128-bit IBM double double or IEEE 128-bit - floating point, we need make sure the type is non-zero or else self-test - fails during bootstrap. - - Always create __ibm128 as a separate type, even if the current long double - format is IBM extended double. - - For IEEE 128-bit floating point, always create the type __ieee128. If the - user used -mfloat128, rs6000-c.cc will create a define from __float128 to - __ieee128. */ - if (TARGET_FLOAT128_TYPE) - { - if (!TARGET_IEEEQUAD && TARGET_LONG_DOUBLE_128) - ibm128_float_type_node = long_double_type_node; - else - { - ibm128_float_type_node = make_node (REAL_TYPE); - TYPE_PRECISION (ibm128_float_type_node) = 128; - SET_TYPE_MODE (ibm128_float_type_node, IFmode); - layout_type (ibm128_float_type_node); - } - t = build_qualified_type (ibm128_float_type_node, TYPE_QUAL_CONST); - ptr_ibm128_float_type_node = build_pointer_type (t); - lang_hooks.types.register_builtin_type (ibm128_float_type_node, - "__ibm128"); - - if (TARGET_IEEEQUAD && TARGET_LONG_DOUBLE_128) - ieee128_float_type_node = long_double_type_node; - else - ieee128_float_type_node = float128_type_node; - t = build_qualified_type (ieee128_float_type_node, TYPE_QUAL_CONST); - ptr_ieee128_float_type_node = build_pointer_type (t); - lang_hooks.types.register_builtin_type (ieee128_float_type_node, - "__ieee128"); - } - - else - ieee128_float_type_node = ibm128_float_type_node = long_double_type_node; - - /* Vector pair and vector quad support. 
*/ - vector_pair_type_node = make_node (OPAQUE_TYPE); - SET_TYPE_MODE (vector_pair_type_node, OOmode); - TYPE_SIZE (vector_pair_type_node) = bitsize_int (GET_MODE_BITSIZE (OOmode)); - TYPE_PRECISION (vector_pair_type_node) = GET_MODE_BITSIZE (OOmode); - TYPE_SIZE_UNIT (vector_pair_type_node) = size_int (GET_MODE_SIZE (OOmode)); - SET_TYPE_ALIGN (vector_pair_type_node, 256); - TYPE_USER_ALIGN (vector_pair_type_node) = 0; - lang_hooks.types.register_builtin_type (vector_pair_type_node, - "__vector_pair"); - t = build_qualified_type (vector_pair_type_node, TYPE_QUAL_CONST); - ptr_vector_pair_type_node = build_pointer_type (t); - - vector_quad_type_node = make_node (OPAQUE_TYPE); - SET_TYPE_MODE (vector_quad_type_node, XOmode); - TYPE_SIZE (vector_quad_type_node) = bitsize_int (GET_MODE_BITSIZE (XOmode)); - TYPE_PRECISION (vector_quad_type_node) = GET_MODE_BITSIZE (XOmode); - TYPE_SIZE_UNIT (vector_quad_type_node) = size_int (GET_MODE_SIZE (XOmode)); - SET_TYPE_ALIGN (vector_quad_type_node, 512); - TYPE_USER_ALIGN (vector_quad_type_node) = 0; - lang_hooks.types.register_builtin_type (vector_quad_type_node, - "__vector_quad"); - t = build_qualified_type (vector_quad_type_node, TYPE_QUAL_CONST); - ptr_vector_quad_type_node = build_pointer_type (t); - - /* Initialize the modes for builtin_function_type, mapping a machine mode to - tree type node. */ - builtin_mode_to_type[QImode][0] = integer_type_node; - builtin_mode_to_type[QImode][1] = unsigned_intSI_type_node; - builtin_mode_to_type[HImode][0] = integer_type_node; - builtin_mode_to_type[HImode][1] = unsigned_intSI_type_node; - builtin_mode_to_type[SImode][0] = intSI_type_node; - builtin_mode_to_type[SImode][1] = unsigned_intSI_type_node; - builtin_mode_to_type[DImode][0] = intDI_type_node; - builtin_mode_to_type[DImode][1] = unsigned_intDI_type_node; - builtin_mode_to_type[TImode][0] = intTI_type_node; - builtin_mode_to_type[TImode][1] = unsigned_intTI_type_node; - builtin_mode_to_type[SFmode][0] = float_type_node; - builtin_mode_to_type[DFmode][0] = double_type_node; - builtin_mode_to_type[IFmode][0] = ibm128_float_type_node; - builtin_mode_to_type[KFmode][0] = ieee128_float_type_node; - builtin_mode_to_type[TFmode][0] = long_double_type_node; - builtin_mode_to_type[DDmode][0] = dfloat64_type_node; - builtin_mode_to_type[TDmode][0] = dfloat128_type_node; - builtin_mode_to_type[V1TImode][0] = V1TI_type_node; - builtin_mode_to_type[V1TImode][1] = unsigned_V1TI_type_node; - builtin_mode_to_type[V2DImode][0] = V2DI_type_node; - builtin_mode_to_type[V2DImode][1] = unsigned_V2DI_type_node; - builtin_mode_to_type[V2DFmode][0] = V2DF_type_node; - builtin_mode_to_type[V4SImode][0] = V4SI_type_node; - builtin_mode_to_type[V4SImode][1] = unsigned_V4SI_type_node; - builtin_mode_to_type[V4SFmode][0] = V4SF_type_node; - builtin_mode_to_type[V8HImode][0] = V8HI_type_node; - builtin_mode_to_type[V8HImode][1] = unsigned_V8HI_type_node; - builtin_mode_to_type[V16QImode][0] = V16QI_type_node; - builtin_mode_to_type[V16QImode][1] = unsigned_V16QI_type_node; - builtin_mode_to_type[OOmode][1] = vector_pair_type_node; - builtin_mode_to_type[XOmode][1] = vector_quad_type_node; - - tdecl = add_builtin_type ("__bool char", bool_char_type_node); - TYPE_NAME (bool_char_type_node) = tdecl; - - tdecl = add_builtin_type ("__bool short", bool_short_type_node); - TYPE_NAME (bool_short_type_node) = tdecl; - - tdecl = add_builtin_type ("__bool int", bool_int_type_node); - TYPE_NAME (bool_int_type_node) = tdecl; - - tdecl = add_builtin_type ("__pixel", pixel_type_node); - 
TYPE_NAME (pixel_type_node) = tdecl; - - bool_V16QI_type_node = rs6000_vector_type ("__vector __bool char", - bool_char_type_node, 16); - ptr_bool_V16QI_type_node - = build_pointer_type (build_qualified_type (bool_V16QI_type_node, - TYPE_QUAL_CONST)); - - bool_V8HI_type_node = rs6000_vector_type ("__vector __bool short", - bool_short_type_node, 8); - ptr_bool_V8HI_type_node - = build_pointer_type (build_qualified_type (bool_V8HI_type_node, - TYPE_QUAL_CONST)); - - bool_V4SI_type_node = rs6000_vector_type ("__vector __bool int", - bool_int_type_node, 4); - ptr_bool_V4SI_type_node - = build_pointer_type (build_qualified_type (bool_V4SI_type_node, - TYPE_QUAL_CONST)); - - bool_V2DI_type_node = rs6000_vector_type (TARGET_POWERPC64 - ? "__vector __bool long" - : "__vector __bool long long", - bool_long_long_type_node, 2); - ptr_bool_V2DI_type_node - = build_pointer_type (build_qualified_type (bool_V2DI_type_node, - TYPE_QUAL_CONST)); - - bool_V1TI_type_node = rs6000_vector_type ("__vector __bool __int128", - intTI_type_node, 1); - ptr_bool_V1TI_type_node - = build_pointer_type (build_qualified_type (bool_V1TI_type_node, - TYPE_QUAL_CONST)); - - pixel_V8HI_type_node = rs6000_vector_type ("__vector __pixel", - pixel_type_node, 8); - ptr_pixel_V8HI_type_node - = build_pointer_type (build_qualified_type (pixel_V8HI_type_node, - TYPE_QUAL_CONST)); - pcvoid_type_node - = build_pointer_type (build_qualified_type (void_type_node, - TYPE_QUAL_CONST)); - - /* Execute the autogenerated initialization code for builtins. */ - rs6000_init_generated_builtins (); - - if (TARGET_DEBUG_BUILTIN) - { - fprintf (stderr, "\nAutogenerated built-in functions:\n\n"); - for (int i = 1; i < (int) RS6000_BIF_MAX; i++) - { - bif_enable e = rs6000_builtin_info[i].enable; - if (e == ENB_P5 && !TARGET_POPCNTB) - continue; - if (e == ENB_P6 && !TARGET_CMPB) - continue; - if (e == ENB_P6_64 && !(TARGET_CMPB && TARGET_POWERPC64)) - continue; - if (e == ENB_ALTIVEC && !TARGET_ALTIVEC) - continue; - if (e == ENB_VSX && !TARGET_VSX) - continue; - if (e == ENB_P7 && !TARGET_POPCNTD) - continue; - if (e == ENB_P7_64 && !(TARGET_POPCNTD && TARGET_POWERPC64)) - continue; - if (e == ENB_P8 && !TARGET_DIRECT_MOVE) - continue; - if (e == ENB_P8V && !TARGET_P8_VECTOR) - continue; - if (e == ENB_P9 && !TARGET_MODULO) - continue; - if (e == ENB_P9_64 && !(TARGET_MODULO && TARGET_POWERPC64)) - continue; - if (e == ENB_P9V && !TARGET_P9_VECTOR) - continue; - if (e == ENB_IEEE128_HW && !TARGET_FLOAT128_HW) - continue; - if (e == ENB_DFP && !TARGET_DFP) - continue; - if (e == ENB_CRYPTO && !TARGET_CRYPTO) - continue; - if (e == ENB_HTM && !TARGET_HTM) - continue; - if (e == ENB_P10 && !TARGET_POWER10) - continue; - if (e == ENB_P10_64 && !(TARGET_POWER10 && TARGET_POWERPC64)) - continue; - if (e == ENB_MMA && !TARGET_MMA) - continue; - tree fntype = rs6000_builtin_info[i].fntype; - tree t = TREE_TYPE (fntype); - fprintf (stderr, "%s %s (", rs6000_type_string (t), - rs6000_builtin_info[i].bifname); - t = TYPE_ARG_TYPES (fntype); - while (t && TREE_VALUE (t) != void_type_node) - { - fprintf (stderr, "%s", - rs6000_type_string (TREE_VALUE (t))); - t = TREE_CHAIN (t); - if (t && TREE_VALUE (t) != void_type_node) - fprintf (stderr, ", "); - } - fprintf (stderr, "); %s [%4d]\n", - rs6000_builtin_info[i].attr_string, (int) i); - } - fprintf (stderr, "\nEnd autogenerated built-in functions.\n\n\n"); - } - - if (TARGET_XCOFF) - { - /* AIX libm provides clog as __clog. 
*/ - if ((tdecl = builtin_decl_explicit (BUILT_IN_CLOG)) != NULL_TREE) - set_user_assembler_name (tdecl, "__clog"); - - /* When long double is 64 bit, some long double builtins of libc - functions (like __builtin_frexpl) must call the double version - (frexp) not the long double version (frexpl) that expects a 128 bit - argument. */ - if (! TARGET_LONG_DOUBLE_128) - { - if ((tdecl = builtin_decl_explicit (BUILT_IN_FMODL)) != NULL_TREE) - set_user_assembler_name (tdecl, "fmod"); - if ((tdecl = builtin_decl_explicit (BUILT_IN_FREXPL)) != NULL_TREE) - set_user_assembler_name (tdecl, "frexp"); - if ((tdecl = builtin_decl_explicit (BUILT_IN_LDEXPL)) != NULL_TREE) - set_user_assembler_name (tdecl, "ldexp"); - if ((tdecl = builtin_decl_explicit (BUILT_IN_MODFL)) != NULL_TREE) - set_user_assembler_name (tdecl, "modf"); - } - } - - altivec_builtin_mask_for_load - = rs6000_builtin_decls[RS6000_BIF_MASK_FOR_LOAD]; - -#ifdef SUBTARGET_INIT_BUILTINS - SUBTARGET_INIT_BUILTINS; -#endif - - return; -} - -tree -rs6000_builtin_decl (unsigned code, bool /* initialize_p */) -{ - rs6000_gen_builtins fcode = (rs6000_gen_builtins) code; - - if (fcode >= RS6000_OVLD_MAX) - return error_mark_node; - - return rs6000_builtin_decls[code]; -} - /* Return the internal arg pointer used for function incoming arguments. When -fsplit-stack, the arg pointer is r12 so we need to copy it to a pseudo in order for it to be preserved over calls diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index a5fd36b..ac6dd19 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -86,6 +86,10 @@ /* This file should be included last. */ #include "target-def.h" +extern tree rs6000_builtin_mask_for_load (void); +extern tree rs6000_builtin_md_vectorized_function (tree, tree, tree); +extern tree rs6000_builtin_reciprocal (tree); + /* Set -mabi=ieeelongdouble on some old targets. In the future, power server systems will also set long double to be IEEE 128-bit. AIX and Darwin explicitly redefine TARGET_IEEEQUAD and TARGET_IEEEQUAD_DEFAULT to 0, so @@ -105,9 +109,6 @@ #define PCREL_SUPPORTED_BY_OS 0 #endif -/* Support targetm.vectorize.builtin_mask_for_load. */ -tree altivec_builtin_mask_for_load; - #ifdef USING_ELFOS_H /* Counter for labels which are to be placed in .fixup. */ int fixuplabelno = 0; @@ -159,9 +160,6 @@ enum reg_class rs6000_regno_regclass[FIRST_PSEUDO_REGISTER]; static int dbg_cost_ctrl; -/* Built in types. */ -tree rs6000_builtin_types[RS6000_BTI_MAX]; - /* Flag to say the TOC is initialized */ int toc_initialized, need_toc_init; char toc_label_name[10]; @@ -190,9 +188,6 @@ enum reg_class rs6000_constraints[RS6000_CONSTRAINT_MAX]; /* Describe the alignment of a vector. */ int rs6000_vector_align[NUM_MACHINE_MODES]; -/* Map selected modes to types for builtins. */ -tree builtin_mode_to_type[MAX_MACHINE_MODE][2]; - /* What modes to automatically generate reciprocal divide estimate (fre) and reciprocal sqrt (frsqrte) for. */ unsigned char rs6000_recip_bits[MAX_MACHINE_MODE]; @@ -4969,18 +4964,6 @@ rs6000_option_override (void) } -/* Implement targetm.vectorize.builtin_mask_for_load. */ -static tree -rs6000_builtin_mask_for_load (void) -{ - /* Don't use lvsl/vperm for P8 and similarly efficient machines. */ - if ((TARGET_ALTIVEC && !TARGET_VSX) - || (TARGET_VSX && !TARGET_EFFICIENT_UNALIGNED_VSX)) - return altivec_builtin_mask_for_load; - else - return 0; -} - /* Implement LOOP_ALIGN. 
*/ align_flags rs6000_loop_align (rtx label) @@ -5689,119 +5672,6 @@ rs6000_builtin_vectorized_function (unsigned int fn, tree type_out, return NULL_TREE; } -/* Implement targetm.vectorize.builtin_md_vectorized_function. */ - -static tree -rs6000_builtin_md_vectorized_function (tree fndecl, tree type_out, - tree type_in) -{ - machine_mode in_mode, out_mode; - int in_n, out_n; - - if (TARGET_DEBUG_BUILTIN) - fprintf (stderr, - "rs6000_builtin_md_vectorized_function (%s, %s, %s)\n", - IDENTIFIER_POINTER (DECL_NAME (fndecl)), - GET_MODE_NAME (TYPE_MODE (type_out)), - GET_MODE_NAME (TYPE_MODE (type_in))); - - /* TODO: Should this be gcc_assert? */ - if (TREE_CODE (type_out) != VECTOR_TYPE - || TREE_CODE (type_in) != VECTOR_TYPE) - return NULL_TREE; - - out_mode = TYPE_MODE (TREE_TYPE (type_out)); - out_n = TYPE_VECTOR_SUBPARTS (type_out); - in_mode = TYPE_MODE (TREE_TYPE (type_in)); - in_n = TYPE_VECTOR_SUBPARTS (type_in); - - enum rs6000_gen_builtins fn - = (enum rs6000_gen_builtins) DECL_MD_FUNCTION_CODE (fndecl); - switch (fn) - { - case RS6000_BIF_RSQRTF: - if (VECTOR_UNIT_ALTIVEC_OR_VSX_P (V4SFmode) - && out_mode == SFmode && out_n == 4 - && in_mode == SFmode && in_n == 4) - return rs6000_builtin_decls[RS6000_BIF_VRSQRTFP]; - break; - case RS6000_BIF_RSQRT: - if (VECTOR_UNIT_VSX_P (V2DFmode) - && out_mode == DFmode && out_n == 2 - && in_mode == DFmode && in_n == 2) - return rs6000_builtin_decls[RS6000_BIF_RSQRT_2DF]; - break; - case RS6000_BIF_RECIPF: - if (VECTOR_UNIT_ALTIVEC_OR_VSX_P (V4SFmode) - && out_mode == SFmode && out_n == 4 - && in_mode == SFmode && in_n == 4) - return rs6000_builtin_decls[RS6000_BIF_VRECIPFP]; - break; - case RS6000_BIF_RECIP: - if (VECTOR_UNIT_VSX_P (V2DFmode) - && out_mode == DFmode && out_n == 2 - && in_mode == DFmode && in_n == 2) - return rs6000_builtin_decls[RS6000_BIF_RECIP_V2DF]; - break; - default: - break; - } - - machine_mode in_vmode = TYPE_MODE (type_in); - machine_mode out_vmode = TYPE_MODE (type_out); - - /* Power10 supported vectorized built-in functions. */ - if (TARGET_POWER10 - && in_vmode == out_vmode - && VECTOR_UNIT_ALTIVEC_OR_VSX_P (in_vmode)) - { - machine_mode exp_mode = DImode; - machine_mode exp_vmode = V2DImode; - enum rs6000_gen_builtins bif; - switch (fn) - { - case RS6000_BIF_DIVWE: - case RS6000_BIF_DIVWEU: - exp_mode = SImode; - exp_vmode = V4SImode; - if (fn == RS6000_BIF_DIVWE) - bif = RS6000_BIF_VDIVESW; - else - bif = RS6000_BIF_VDIVEUW; - break; - case RS6000_BIF_DIVDE: - case RS6000_BIF_DIVDEU: - if (fn == RS6000_BIF_DIVDE) - bif = RS6000_BIF_VDIVESD; - else - bif = RS6000_BIF_VDIVEUD; - break; - case RS6000_BIF_CFUGED: - bif = RS6000_BIF_VCFUGED; - break; - case RS6000_BIF_CNTLZDM: - bif = RS6000_BIF_VCLZDM; - break; - case RS6000_BIF_CNTTZDM: - bif = RS6000_BIF_VCTZDM; - break; - case RS6000_BIF_PDEPD: - bif = RS6000_BIF_VPDEPD; - break; - case RS6000_BIF_PEXTD: - bif = RS6000_BIF_VPEXTD; - break; - default: - return NULL_TREE; - } - - if (in_mode == exp_mode && in_vmode == exp_vmode) - return rs6000_builtin_decls[bif]; - } - - return NULL_TREE; -} - /* Handler for the Mathematical Acceleration Subsystem (mass) interface to a library with vectorized intrinsics. */ @@ -22543,31 +22413,6 @@ rs6000_ira_change_pseudo_allocno_class (int regno ATTRIBUTE_UNUSED, return allocno_class; } -/* Returns a code for a target-specific builtin that implements - reciprocal of the function, or NULL_TREE if not available. 
*/ - -static tree -rs6000_builtin_reciprocal (tree fndecl) -{ - switch (DECL_MD_FUNCTION_CODE (fndecl)) - { - case RS6000_BIF_XVSQRTDP: - if (!RS6000_RECIP_AUTO_RSQRTE_P (V2DFmode)) - return NULL_TREE; - - return rs6000_builtin_decls[RS6000_BIF_RSQRT_2DF]; - - case RS6000_BIF_XVSQRTSP: - if (!RS6000_RECIP_AUTO_RSQRTE_P (V4SFmode)) - return NULL_TREE; - - return rs6000_builtin_decls[RS6000_BIF_RSQRT_4SF]; - - default: - return NULL_TREE; - } -} - /* Load up a constant. If the mode is a vector mode, splat the value across all of the vector elements. */ diff --git a/gcc/config/rs6000/rs6000.h b/gcc/config/rs6000/rs6000.h index 5fdb8f2..17af314 100644 --- a/gcc/config/rs6000/rs6000.h +++ b/gcc/config/rs6000/rs6000.h @@ -2551,7 +2551,6 @@ enum rs6000_builtin_type_index extern GTY(()) tree rs6000_builtin_types[RS6000_BTI_MAX]; #ifndef USED_FOR_TARGET -extern GTY(()) tree builtin_mode_to_type[MAX_MACHINE_MODE][2]; extern GTY(()) tree altivec_builtin_mask_for_load; extern GTY(()) section *toc_section; diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000 index 1a460d9..597cea4 100644 --- a/gcc/config/rs6000/t-rs6000 +++ b/gcc/config/rs6000/t-rs6000 @@ -43,6 +43,10 @@ rs6000-logue.o: $(srcdir)/config/rs6000/rs6000-logue.cc $(COMPILE) $< $(POSTCOMPILE) +rs6000-builtin.o: $(srcdir)/config/rs6000/rs6000-builtin.cc + $(COMPILE) $< + $(POSTCOMPILE) + build/rs6000-gen-builtins.o: $(srcdir)/config/rs6000/rs6000-gen-builtins.cc build/rbtree.o: $(srcdir)/config/rs6000/rbtree.cc -- cgit v1.1 From 3f30f2d1dbb3228b8468b26239fe60c2974ce2ac Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Wed, 2 Feb 2022 21:24:22 -0600 Subject: rs6000: Fix LE code gen for vec_cnt[lt]z_lsbb [PR95082] These built-ins were misimplemented as always having big-endian semantics. 2022-01-18 Bill Schmidt gcc/ PR target/95082 * config/rs6000/rs6000-builtin.cc (rs6000_expand_builtin): Handle endianness for vclzlsbb and vctzlsbb. * config/rs6000/rs6000-builtins.def (VCLZLSBB_V16QI): Change default pattern and indicate a different pattern will be used for big endian. (VCLZLSBB_V4SI): Likewise. (VCLZLSBB_V8HI): Likewise. (VCTZLSBB_V16QI): Likewise. (VCTZLSBB_V4SI): Likewise. (VCTZLSBB_V8HI): Likewise. gcc/testsuite/ PR target/95082 * gcc.target/powerpc/vsu/vec-cntlz-lsbb-0.c: Restrict to -mbig. * gcc.target/powerpc/vsu/vec-cntlz-lsbb-1.c: Likewise. * gcc.target/powerpc/vsu/vec-cntlz-lsbb-3.c: New. * gcc.target/powerpc/vsu/vec-cntlz-lsbb-4.c: New. * gcc.target/powerpc/vsu/vec-cnttz-lsbb-0.c: Restrict to -mbig. * gcc.target/powerpc/vsu/vec-cnttz-lsbb-1.c: Likewise. * gcc.target/powerpc/vsu/vec-cnttz-lsbb-3.c: New. * gcc.target/powerpc/vsu/vec-cnttz-lsbb-4.c: New. 
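As a usage sketch only (assuming the Power9 vec_cntlz_lsbb intrinsic from <altivec.h>, compiled with -mcpu=power9 or later; this is not one of the new or adjusted tests), the key point is that the intrinsic is defined in terms of element 0 of the vector, which sits at opposite ends of the register on big and little endian, so the little-endian expansion must use the opposite hardware instruction:

  #include <altivec.h>

  /* Count leading byte elements, starting from element 0, whose least
     significant bit is zero.  With this fix the little-endian default
     expands to vctzlsbb and only big endian maps to vclzlsbb.  */
  int
  count_leading_lsbb_zeros (vector signed char v)
  {
    return vec_cntlz_lsbb (v);
  }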
--- gcc/config/rs6000/rs6000-builtin.cc | 12 ++++++++++++ gcc/config/rs6000/rs6000-builtins.def | 12 ++++++------ 2 files changed, 18 insertions(+), 6 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-builtin.cc b/gcc/config/rs6000/rs6000-builtin.cc index 005f936..69f8cee 100644 --- a/gcc/config/rs6000/rs6000-builtin.cc +++ b/gcc/config/rs6000/rs6000-builtin.cc @@ -3485,6 +3485,18 @@ rs6000_expand_builtin (tree exp, rtx target, rtx /* subtarget */, icode = CODE_FOR_vsx_store_v8hi; else if (fcode == RS6000_BIF_ST_ELEMREV_V16QI) icode = CODE_FOR_vsx_store_v16qi; + else if (fcode == RS6000_BIF_VCLZLSBB_V16QI) + icode = CODE_FOR_vclzlsbb_v16qi; + else if (fcode == RS6000_BIF_VCLZLSBB_V4SI) + icode = CODE_FOR_vclzlsbb_v4si; + else if (fcode == RS6000_BIF_VCLZLSBB_V8HI) + icode = CODE_FOR_vclzlsbb_v8hi; + else if (fcode == RS6000_BIF_VCTZLSBB_V16QI) + icode = CODE_FOR_vctzlsbb_v16qi; + else if (fcode == RS6000_BIF_VCTZLSBB_V4SI) + icode = CODE_FOR_vctzlsbb_v4si; + else if (fcode == RS6000_BIF_VCTZLSBB_V8HI) + icode = CODE_FOR_vctzlsbb_v8hi; else gcc_unreachable (); } diff --git a/gcc/config/rs6000/rs6000-builtins.def b/gcc/config/rs6000/rs6000-builtins.def index a8ebb4a..7f527b6 100644 --- a/gcc/config/rs6000/rs6000-builtins.def +++ b/gcc/config/rs6000/rs6000-builtins.def @@ -2550,13 +2550,13 @@ VBPERMD altivec_vbpermd {} const signed int __builtin_altivec_vclzlsbb_v16qi (vsc); - VCLZLSBB_V16QI vclzlsbb_v16qi {} + VCLZLSBB_V16QI vctzlsbb_v16qi {endian} const signed int __builtin_altivec_vclzlsbb_v4si (vsi); - VCLZLSBB_V4SI vclzlsbb_v4si {} + VCLZLSBB_V4SI vctzlsbb_v4si {endian} const signed int __builtin_altivec_vclzlsbb_v8hi (vss); - VCLZLSBB_V8HI vclzlsbb_v8hi {} + VCLZLSBB_V8HI vctzlsbb_v8hi {endian} const vsc __builtin_altivec_vctzb (vsc); VCTZB ctzv16qi2 {} @@ -2571,13 +2571,13 @@ VCTZW ctzv4si2 {} const signed int __builtin_altivec_vctzlsbb_v16qi (vsc); - VCTZLSBB_V16QI vctzlsbb_v16qi {} + VCTZLSBB_V16QI vclzlsbb_v16qi {endian} const signed int __builtin_altivec_vctzlsbb_v4si (vsi); - VCTZLSBB_V4SI vctzlsbb_v4si {} + VCTZLSBB_V4SI vclzlsbb_v4si {endian} const signed int __builtin_altivec_vctzlsbb_v8hi (vss); - VCTZLSBB_V8HI vctzlsbb_v8hi {} + VCTZLSBB_V8HI vclzlsbb_v8hi {endian} const signed int __builtin_altivec_vcmpaeb_p (vsc, vsc); VCMPAEB_P vector_ae_v16qi_p {} -- cgit v1.1 From 48bd780ee327c9ae6ffc0641e73cc1f4939fb204 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Wed, 2 Feb 2022 21:30:27 -0600 Subject: rs6000: Remove -m[no-]fold-gimple flag [PR103686] The -m[no-]fold-gimple flag was really intended primarily for internal testing while implementing GIMPLE folding for rs6000 vector built-in functions. It ended up leaking into other places, causing problems such as PR103686 identifies. Let's remove it. There are a number of tests in the testsuite that require adjustment. Some specify -mfold-gimple directly, which is the default, so that is handled by removing the option. Others unnecessarily specify -mno-fold-gimple, as the tests work fine without this. Again that is handled by removing the option. There are a couple of extra variants of tests specifically for -mno-fold-gimple; for those, we can just remove the whole test. gcc.target/powerpc/builtins-1.c was more problematic. It was written in such a way as to be extremely fragile. For this one, I rewrote the whole test in a different style, using individual functions to test each built-in function. 
These same tests are also largely covered by builtins-1-be-folded.c and builtins-1-le-folded.c, so I chose to explicitly make this test -mbig for simplicity, and use -O2 for clean code generation. I made some slight modifications to the expected instruction counts as a result, and tested on both 32- and 64-bit. 2022-02-02 Bill Schmidt gcc/ PR target/103686 * config/rs6000/rs6000-builtin.cc (rs6000_gimple_fold_builtin): Remove test for !rs6000_fold_gimple. * config/rs6000/rs6000.cc (rs6000_option_override_internal): Likewise. * config/rs6000/rs6000.opt (mfold-gimple): Remove. gcc/testsuite/ PR target/103686 * gcc.target/powerpc/builtins-1-be-folded.c: Remove -mfold-gimple option. * gcc.target/powerpc/builtins-1-le-folded.c: Likewise. * gcc.target/powerpc/builtins-1.c: Rewrite to use small functions and restrict to -O2 -mbig for predictability. Adjust instruction counts. * gcc.target/powerpc/builtins-5.c: Remove -mno-fold-gimple option. * gcc.target/powerpc/p8-vec-xl-xst.c: Likewise. * gcc.target/powerpc/pr83926.c: Likewise. * gcc.target/powerpc/pr86731-nogimplefold-longlong.c: Delete. * gcc.target/powerpc/pr86731-nogimplefold.c: Delete. * gcc.target/powerpc/swaps-p8-17.c: Remove -mno-fold-gimple option. --- gcc/config/rs6000/rs6000-builtin.cc | 3 --- gcc/config/rs6000/rs6000.cc | 4 ---- gcc/config/rs6000/rs6000.opt | 4 ---- 3 files changed, 11 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-builtin.cc b/gcc/config/rs6000/rs6000-builtin.cc index 69f8cee..5d34c1b 100644 --- a/gcc/config/rs6000/rs6000-builtin.cc +++ b/gcc/config/rs6000/rs6000-builtin.cc @@ -1299,9 +1299,6 @@ rs6000_gimple_fold_builtin (gimple_stmt_iterator *gsi) fprintf (stderr, "rs6000_gimple_fold_builtin %d %s %s\n", fn_code, fn_name1, fn_name2); - if (!rs6000_fold_gimple) - return false; - /* Prevent gimple folding for code that does not have a LHS, unless it is allowed per the rs6000_builtin_valid_without_lhs helper function. */ if (!gimple_call_lhs (stmt) diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index ac6dd19..b6f2309 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -3833,10 +3833,6 @@ rs6000_option_override_internal (bool global_init_p) & OPTION_MASK_DIRECT_MOVE)) rs6000_isa_flags |= ~rs6000_isa_flags_explicit & OPTION_MASK_STRICT_ALIGN; - if (!rs6000_fold_gimple) - fprintf (stderr, - "gimple folding of rs6000 builtins has been disabled.\n"); - /* Add some warnings for VSX. */ if (TARGET_VSX) { diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt index c2a7718..68c0cae 100644 --- a/gcc/config/rs6000/rs6000.opt +++ b/gcc/config/rs6000/rs6000.opt @@ -155,10 +155,6 @@ maltivec Target Mask(ALTIVEC) Var(rs6000_isa_flags) Use AltiVec instructions. -mfold-gimple -Target Var(rs6000_fold_gimple) Init(1) -Enable early gimple folding of builtins. - mhard-dfp Target Mask(DFP) Var(rs6000_isa_flags) Use decimal floating point instructions. -- cgit v1.1 From 599122fa690d55e5e14d74f4d514b2d8b6a98505 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 3 Feb 2022 22:24:21 +0100 Subject: i386: Do not use %ecx DRAP for functions that use __builtin_eh_return [PR104362] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit %ecx can't be used for both DRAP register and eh_return. Adjust find_drap_reg to choose %edi for functions that uses __builtin_eh_return to avoid the assert in ix86_expand_epilogue that enforces this rule. 
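To illustrate the combination being fixed, here is a hypothetical sketch (not the new gcc.target/i386/pr104362.c test, whose exact contents differ): a 32-bit function that uses __builtin_eh_return while also needing a dynamically realigned frame, so a DRAP register must be chosen and %ecx is unavailable for it. Whether realignment (and hence DRAP) is actually forced depends on options such as -m32 -mstackrealign:

  /* Hypothetical reproducer shape; the over-aligned local is only there
     to make dynamic stack realignment, and therefore a DRAP register,
     likely.  */
  void
  eh_return_with_realign (long offset, void *handler)
  {
    int big_align __attribute__ ((aligned (32))) = 0;
    __asm__ volatile ("" : : "r" (&big_align) : "memory");
    __builtin_eh_return (offset, handler);
  }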
2022-02-03 Uroš Bizjak gcc/ChangeLog: PR target/104362 * config/i386/i386.cc (find_drap_reg): For 32bit targets return DI_REG if function uses __builtin_eh_return. gcc/testsuite/ChangeLog: PR target/104362 * gcc.target/i386/pr104362.c: New test. --- gcc/config/i386/i386.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index ad5a5ca..dd5584f 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -7400,7 +7400,8 @@ find_drap_reg (void) register in such case. */ if (DECL_STATIC_CHAIN (decl) || cfun->machine->no_caller_saved_registers - || crtl->tail_call_emit) + || crtl->tail_call_emit + || crtl->calls_eh_return) return DI_REG; /* Reuse static chain register if it isn't used for parameter -- cgit v1.1 From 8d6fffc4bcd4afa0beb0efad4f3b95394aa15618 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Fri, 4 Feb 2022 18:30:59 +0100 Subject: rs6000: Fix up -D_FORTIFY_SOURCE* with -mabi=ieeelongdouble [PR104380] The following testcase FAILs when configured with --with-long-double-format=ieee . Only happens in the -std=c* modes, not the GNU modes; while the glibc headers have __asm redirects of vsnprintf and __vsnprinf_chk to __vsnprintfieee128 and __vsnprintf_chkieee128, the vsnprintf fortification extern inline gnu_inline always_inline wrapper calls __builtin_vsnprintf_chk and we actually emit a call to __vsnprinf_chk (i.e. with IBM extended long double) instead of __vsnprintf_chkieee128. rs6000_mangle_decl_assembler_name already had cases for *printf and *scanf, so this just adds another case for *printf_chk. *scanf_chk doesn't exist. __ prefixing isn't done because *printf_chk already starts with __. 2022-02-04 Jakub Jelinek PR target/104380 * config/rs6000/rs6000.cc (rs6000_mangle_decl_assembler_name): Also adjust mangling of __builtin*printf_chk. * gcc.dg/pr104380.c: New test. --- gcc/config/rs6000/rs6000.cc | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index b6f2309..d9fc67d 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -28069,6 +28069,7 @@ rs6000_mangle_decl_assembler_name (tree decl, tree id) { size_t printf_len = strlen ("printf"); size_t scanf_len = strlen ("scanf"); + size_t printf_chk_len = strlen ("printf_chk"); if (len >= printf_len && strcmp (name + len - printf_len, "printf") == 0) @@ -28078,6 +28079,10 @@ rs6000_mangle_decl_assembler_name (tree decl, tree id) && strcmp (name + len - scanf_len, "scanf") == 0) newname = xasprintf ("__isoc99_%sieee128", name); + else if (len >= printf_chk_len + && strcmp (name + len - printf_chk_len, "printf_chk") == 0) + newname = xasprintf ("%sieee128", name); + else if (name[len - 1] == 'l') { bool uses_ieee128_p = false; -- cgit v1.1 From b28b92bc008776c8b517841f99ba6a31bf7751d2 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Wed, 2 Feb 2022 20:55:36 -0600 Subject: rs6000: More factoring of overload processing This patch continues the refactoring started with r12-6014. I had previously noted that the resolve_vec* routines can be further simplified by processing the argument list earlier, so that all routines can use the arrays of arguments and types. I found that this was useful for some of the routines, but not for all of them. For several of the special-cased overloads, we don't specify all of the possible type combinations in rs6000-overload.def, because the types don't matter for the expansion we do. 
For these, we can't use generic error message handling when the number of arguments is incorrect, because the result is misleading error messages that indicate argument types are wrong. So this patch goes halfway and improves the factoring on the remaining special cases, but leaves vec_splats, vec_promote, vec_extract, vec_insert, and vec_step alone. 2022-02-02 Bill Schmidt gcc/ * config/rs6000/rs6000-c.cc (resolve_vec_mul): Accept args and types parameters instead of arglist and nargs. Simplify accordingly. Remove unnecessary test for argument count mismatch. (resolve_vec_cmpne): Likewise. (resolve_vec_adde_sube): Likewise. (resolve_vec_addec_subec): Likewise. (altivec_resolve_overloaded_builtin): Move overload special handling after the gathering of arguments into args[] and types[] and the test for correct number of arguments. Don't perform the test for correct number of arguments for certain special cases. Call the other special cases with args and types instead of arglist and nargs. --- gcc/config/rs6000/rs6000-c.cc | 304 ++++++++++++++++++------------------------ 1 file changed, 127 insertions(+), 177 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-c.cc b/gcc/config/rs6000/rs6000-c.cc index 145421a..15251ef 100644 --- a/gcc/config/rs6000/rs6000-c.cc +++ b/gcc/config/rs6000/rs6000-c.cc @@ -939,37 +939,25 @@ altivec_build_resolved_builtin (tree *args, int n, tree fntype, tree ret_type, enum resolution { unresolved, resolved, resolved_bad }; /* Resolve an overloaded vec_mul call and return a tree expression for the - resolved call if successful. NARGS is the number of arguments to the call. - ARGLIST contains the arguments. RES must be set to indicate the status of + resolved call if successful. ARGS contains the arguments to the call. + TYPES contains their types. RES must be set to indicate the status of the resolution attempt. LOC contains statement location information. */ static tree -resolve_vec_mul (resolution *res, vec *arglist, unsigned nargs, - location_t loc) +resolve_vec_mul (resolution *res, tree *args, tree *types, location_t loc) { /* vec_mul needs to be special cased because there are no instructions for it for the {un}signed char, {un}signed short, and {un}signed int types. */ - if (nargs != 2) - { - error ("builtin %qs only accepts 2 arguments", "vec_mul"); - *res = resolved; - return error_mark_node; - } - - tree arg0 = (*arglist)[0]; - tree arg0_type = TREE_TYPE (arg0); - tree arg1 = (*arglist)[1]; - tree arg1_type = TREE_TYPE (arg1); /* Both arguments must be vectors and the types must be compatible. */ - if (TREE_CODE (arg0_type) != VECTOR_TYPE - || !lang_hooks.types_compatible_p (arg0_type, arg1_type)) + if (TREE_CODE (types[0]) != VECTOR_TYPE + || !lang_hooks.types_compatible_p (types[0], types[1])) { *res = resolved_bad; return error_mark_node; } - switch (TYPE_MODE (TREE_TYPE (arg0_type))) + switch (TYPE_MODE (TREE_TYPE (types[0]))) { case E_QImode: case E_HImode: @@ -978,21 +966,21 @@ resolve_vec_mul (resolution *res, vec *arglist, unsigned nargs, case E_TImode: /* For scalar types just use a multiply expression. */ *res = resolved; - return fold_build2_loc (loc, MULT_EXPR, TREE_TYPE (arg0), arg0, - fold_convert (TREE_TYPE (arg0), arg1)); + return fold_build2_loc (loc, MULT_EXPR, types[0], args[0], + fold_convert (types[0], args[1])); case E_SFmode: { /* For floats use the xvmulsp instruction directly. 
*/ *res = resolved; tree call = rs6000_builtin_decls[RS6000_BIF_XVMULSP]; - return build_call_expr (call, 2, arg0, arg1); + return build_call_expr (call, 2, args[0], args[1]); } case E_DFmode: { /* For doubles use the xvmuldp instruction directly. */ *res = resolved; tree call = rs6000_builtin_decls[RS6000_BIF_XVMULDP]; - return build_call_expr (call, 2, arg0, arg1); + return build_call_expr (call, 2, args[0], args[1]); } /* Other types are errors. */ default: @@ -1002,37 +990,25 @@ resolve_vec_mul (resolution *res, vec *arglist, unsigned nargs, } /* Resolve an overloaded vec_cmpne call and return a tree expression for the - resolved call if successful. NARGS is the number of arguments to the call. - ARGLIST contains the arguments. RES must be set to indicate the status of + resolved call if successful. ARGS contains the arguments to the call. + TYPES contains their types. RES must be set to indicate the status of the resolution attempt. LOC contains statement location information. */ static tree -resolve_vec_cmpne (resolution *res, vec *arglist, unsigned nargs, - location_t loc) +resolve_vec_cmpne (resolution *res, tree *args, tree *types, location_t loc) { /* vec_cmpne needs to be special cased because there are no instructions for it (prior to power 9). */ - if (nargs != 2) - { - error ("builtin %qs only accepts 2 arguments", "vec_cmpne"); - *res = resolved; - return error_mark_node; - } - - tree arg0 = (*arglist)[0]; - tree arg0_type = TREE_TYPE (arg0); - tree arg1 = (*arglist)[1]; - tree arg1_type = TREE_TYPE (arg1); /* Both arguments must be vectors and the types must be compatible. */ - if (TREE_CODE (arg0_type) != VECTOR_TYPE - || !lang_hooks.types_compatible_p (arg0_type, arg1_type)) + if (TREE_CODE (types[0]) != VECTOR_TYPE + || !lang_hooks.types_compatible_p (types[0], types[1])) { *res = resolved_bad; return error_mark_node; } - machine_mode arg0_elt_mode = TYPE_MODE (TREE_TYPE (arg0_type)); + machine_mode arg0_elt_mode = TYPE_MODE (TREE_TYPE (types[0])); /* Power9 instructions provide the most efficient implementation of ALTIVEC_BUILTIN_VEC_CMPNE if the mode is not DImode or TImode @@ -1060,8 +1036,8 @@ resolve_vec_cmpne (resolution *res, vec *arglist, unsigned nargs, /* call = vec_cmpeq (va, vb) result = vec_nor (call, call). */ vec *params = make_tree_vector (); - vec_safe_push (params, arg0); - vec_safe_push (params, arg1); + vec_safe_push (params, args[0]); + vec_safe_push (params, args[1]); tree decl = rs6000_builtin_decls[RS6000_OVLD_VEC_CMPEQ]; tree call = altivec_resolve_overloaded_builtin (loc, decl, params); /* Use save_expr to ensure that operands used more than once @@ -1088,46 +1064,30 @@ resolve_vec_cmpne (resolution *res, vec *arglist, unsigned nargs, return error_mark_node; } -/* Resolve an overloaded vec_adde or vec_sube call and return a tree - expression for the resolved call if successful. NARGS is the number of - arguments to the call. ARGLIST contains the arguments. RES must be set - to indicate the status of the resolution attempt. LOC contains statement - location information. */ +/* Resolve an overloaded vec_adde or vec_sube call and return a tree expression + for the resolved call if successful. ARGS contains the arguments to the + call. TYPES contains their arguments. RES must be set to indicate the + status of the resolution attempt. LOC contains statement location + information. 
*/ static tree resolve_vec_adde_sube (resolution *res, rs6000_gen_builtins fcode, - vec *arglist, unsigned nargs, - location_t loc) + tree *args, tree *types, location_t loc) { /* vec_adde needs to be special cased because there is no instruction for the {un}signed int version. */ - if (nargs != 3) - { - const char *name; - name = fcode == RS6000_OVLD_VEC_ADDE ? "vec_adde" : "vec_sube"; - error ("builtin %qs only accepts 3 arguments", name); - *res = resolved; - return error_mark_node; - } - - tree arg0 = (*arglist)[0]; - tree arg0_type = TREE_TYPE (arg0); - tree arg1 = (*arglist)[1]; - tree arg1_type = TREE_TYPE (arg1); - tree arg2 = (*arglist)[2]; - tree arg2_type = TREE_TYPE (arg2); /* All 3 arguments must be vectors of (signed or unsigned) (int or __int128) and the types must be compatible. */ - if (TREE_CODE (arg0_type) != VECTOR_TYPE - || !lang_hooks.types_compatible_p (arg0_type, arg1_type) - || !lang_hooks.types_compatible_p (arg1_type, arg2_type)) + if (TREE_CODE (types[0]) != VECTOR_TYPE + || !lang_hooks.types_compatible_p (types[0], types[1]) + || !lang_hooks.types_compatible_p (types[1], types[2])) { *res = resolved_bad; return error_mark_node; } - switch (TYPE_MODE (TREE_TYPE (arg0_type))) + switch (TYPE_MODE (TREE_TYPE (types[0]))) { /* For {un}signed ints, vec_adde (va, vb, carryv) == vec_add (vec_add (va, vb), @@ -1137,8 +1097,8 @@ resolve_vec_adde_sube (resolution *res, rs6000_gen_builtins fcode, case E_SImode: { vec *params = make_tree_vector (); - vec_safe_push (params, arg0); - vec_safe_push (params, arg1); + vec_safe_push (params, args[0]); + vec_safe_push (params, args[1]); tree add_sub_builtin; if (fcode == RS6000_OVLD_VEC_ADDE) @@ -1148,10 +1108,10 @@ resolve_vec_adde_sube (resolution *res, rs6000_gen_builtins fcode, tree call = altivec_resolve_overloaded_builtin (loc, add_sub_builtin, params); - tree const1 = build_int_cstu (TREE_TYPE (arg0_type), 1); - tree ones_vector = build_vector_from_val (arg0_type, const1); - tree and_expr = fold_build2_loc (loc, BIT_AND_EXPR, arg0_type, - arg2, ones_vector); + tree const1 = build_int_cstu (TREE_TYPE (types[0]), 1); + tree ones_vector = build_vector_from_val (types[0], const1); + tree and_expr = fold_build2_loc (loc, BIT_AND_EXPR, types[0], + args[2], ones_vector); params = make_tree_vector (); vec_safe_push (params, call); vec_safe_push (params, and_expr); @@ -1175,45 +1135,29 @@ resolve_vec_adde_sube (resolution *res, rs6000_gen_builtins fcode, } /* Resolve an overloaded vec_addec or vec_subec call and return a tree - expression for the resolved call if successful. NARGS is the number of - arguments to the call. ARGLIST contains the arguments. RES must be set - to indicate the status of the resolution attempt. LOC contains statement - location information. */ + expression for the resolved call if successful. ARGS contains the arguments + to the call. TYPES contains their types. RES must be set to indicate the + status of the resolution attempt. LOC contains statement location + information. */ static tree resolve_vec_addec_subec (resolution *res, rs6000_gen_builtins fcode, - vec *arglist, unsigned nargs, - location_t loc) + tree *args, tree *types, location_t loc) { /* vec_addec and vec_subec needs to be special cased because there is no instruction for the (un)signed int version. */ - if (nargs != 3) - { - const char *name; - name = fcode == RS6000_OVLD_VEC_ADDEC ? 
"vec_addec" : "vec_subec"; - error ("builtin %qs only accepts 3 arguments", name); - *res = resolved; - return error_mark_node; - } - - tree arg0 = (*arglist)[0]; - tree arg0_type = TREE_TYPE (arg0); - tree arg1 = (*arglist)[1]; - tree arg1_type = TREE_TYPE (arg1); - tree arg2 = (*arglist)[2]; - tree arg2_type = TREE_TYPE (arg2); /* All 3 arguments must be vectors of (signed or unsigned) (int or __int128) and the types must be compatible. */ - if (TREE_CODE (arg0_type) != VECTOR_TYPE - || !lang_hooks.types_compatible_p (arg0_type, arg1_type) - || !lang_hooks.types_compatible_p (arg1_type, arg2_type)) + if (TREE_CODE (types[0]) != VECTOR_TYPE + || !lang_hooks.types_compatible_p (types[0], types[1]) + || !lang_hooks.types_compatible_p (types[1], types[2])) { *res = resolved_bad; return error_mark_node; } - switch (TYPE_MODE (TREE_TYPE (arg0_type))) + switch (TYPE_MODE (TREE_TYPE (types[0]))) { /* For {un}signed ints, vec_addec (va, vb, carryv) == @@ -1224,11 +1168,11 @@ resolve_vec_addec_subec (resolution *res, rs6000_gen_builtins fcode, { /* Use save_expr to ensure that operands used more than once that may have side effects (like calls) are only evaluated once. */ - arg0 = save_expr (arg0); - arg1 = save_expr (arg1); + args[0] = save_expr (args[0]); + args[1] = save_expr (args[1]); vec *params = make_tree_vector (); - vec_safe_push (params, arg0); - vec_safe_push (params, arg1); + vec_safe_push (params, args[0]); + vec_safe_push (params, args[1]); tree as_c_builtin; if (fcode == RS6000_OVLD_VEC_ADDEC) @@ -1239,8 +1183,8 @@ resolve_vec_addec_subec (resolution *res, rs6000_gen_builtins fcode, tree call1 = altivec_resolve_overloaded_builtin (loc, as_c_builtin, params); params = make_tree_vector (); - vec_safe_push (params, arg0); - vec_safe_push (params, arg1); + vec_safe_push (params, args[0]); + vec_safe_push (params, args[1]); tree as_builtin; if (fcode == RS6000_OVLD_VEC_ADDEC) @@ -1250,10 +1194,10 @@ resolve_vec_addec_subec (resolution *res, rs6000_gen_builtins fcode, tree call2 = altivec_resolve_overloaded_builtin (loc, as_builtin, params); - tree const1 = build_int_cstu (TREE_TYPE (arg0_type), 1); - tree ones_vector = build_vector_from_val (arg0_type, const1); - tree and_expr = fold_build2_loc (loc, BIT_AND_EXPR, arg0_type, - arg2, ones_vector); + tree const1 = build_int_cstu (TREE_TYPE (types[0]), 1); + tree ones_vector = build_vector_from_val (types[0], const1); + tree and_expr = fold_build2_loc (loc, BIT_AND_EXPR, types[0], + args[2], ones_vector); params = make_tree_vector (); vec_safe_push (params, call2); vec_safe_push (params, and_expr); @@ -1783,78 +1727,22 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl, "% is deprecated for little endian; use " "assignment for unaligned loads and stores"); - /* Some overloads require special handling. */ - /* FIXME: Could we simplify the helper functions if we gathered arguments - and types into arrays first? 
*/ - tree returned_expr = NULL; - resolution res = unresolved; - vec *arglist = static_cast *> (passed_arglist); - unsigned int nargs = vec_safe_length (arglist); - - switch (fcode) - { - case RS6000_OVLD_VEC_MUL: - returned_expr = resolve_vec_mul (&res, arglist, nargs, loc); - break; - - case RS6000_OVLD_VEC_CMPNE: - returned_expr = resolve_vec_cmpne (&res, arglist, nargs, loc); - break; - - case RS6000_OVLD_VEC_ADDE: - case RS6000_OVLD_VEC_SUBE: - returned_expr = resolve_vec_adde_sube (&res, fcode, arglist, nargs, loc); - break; - - case RS6000_OVLD_VEC_ADDEC: - case RS6000_OVLD_VEC_SUBEC: - returned_expr = resolve_vec_addec_subec (&res, fcode, arglist, nargs, - loc); - break; - - case RS6000_OVLD_VEC_SPLATS: - case RS6000_OVLD_VEC_PROMOTE: - returned_expr = resolve_vec_splats (&res, fcode, arglist, nargs); - break; - - case RS6000_OVLD_VEC_EXTRACT: - returned_expr = resolve_vec_extract (&res, arglist, nargs, loc); - break; - - case RS6000_OVLD_VEC_INSERT: - returned_expr = resolve_vec_insert (&res, arglist, nargs, loc); - break; - - case RS6000_OVLD_VEC_STEP: - returned_expr = resolve_vec_step (&res, arglist, nargs); - break; - - default: - ; - } - - if (res == resolved) - return returned_expr; - - /* "Regular" built-in functions and overloaded functions share a namespace - for some arrays, like rs6000_builtin_decls. But rs6000_overload_info - only has information for the overloaded functions, so we need an - adjusted index for that. */ - unsigned int adj_fcode = fcode - RS6000_OVLD_NONE; - - if (res == resolved_bad) - { - const char *name = rs6000_overload_info[adj_fcode].ovld_name; - error ("invalid parameter combination for AltiVec intrinsic %qs", name); - return error_mark_node; - } - /* Gather the arguments and their types into arrays for easier handling. */ tree fnargs = TYPE_ARG_TYPES (TREE_TYPE (fndecl)); tree types[MAX_OVLD_ARGS]; tree args[MAX_OVLD_ARGS]; unsigned int n; + /* Count the number of expected arguments. */ + unsigned expected_args = 0; + for (tree chain = fnargs; + chain && !VOID_TYPE_P (TREE_VALUE (chain)); + chain = TREE_CHAIN (chain)) + expected_args++; + + vec *arglist = static_cast *> (passed_arglist); + unsigned int nargs = vec_safe_length (arglist); + for (n = 0; !VOID_TYPE_P (TREE_VALUE (fnargs)) && n < nargs; fnargs = TREE_CHAIN (fnargs), n++) @@ -1915,10 +1803,72 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl, } /* If the number of arguments did not match the prototype, return NULL - and the generic code will issue the appropriate error message. */ - if (!VOID_TYPE_P (TREE_VALUE (fnargs)) || n < nargs) + and the generic code will issue the appropriate error message. Skip + this test for functions where we don't fully describe all the possible + overload signatures in rs6000-overload.def (because they aren't relevant + to the expansion here). If we don't, we get confusing error messages. */ + /* As an example, for vec_splats we have: + +; There are no actual builtins for vec_splats. There is special handling for +; this in altivec_resolve_overloaded_builtin in rs6000-c.cc, where the call +; is replaced by a constructor. The single overload here causes +; __builtin_vec_splats to be registered with the front end so that can happen. +[VEC_SPLATS, vec_splats, __builtin_vec_splats] + vsi __builtin_vec_splats (vsi); + ABS_V4SI SPLATS_FAKERY + + So even though __builtin_vec_splats accepts all vector types, the + infrastructure cheats and just records one prototype. 
We end up getting + an error message that refers to this specific prototype even when we + are handling a different argument type. That is completely confusing + to the user, so it's best to let these cases be handled individually + in the resolve_vec_splats, etc., helper functions. */ + + if (n != expected_args + && !(fcode == RS6000_OVLD_VEC_PROMOTE + || fcode == RS6000_OVLD_VEC_SPLATS + || fcode == RS6000_OVLD_VEC_EXTRACT + || fcode == RS6000_OVLD_VEC_INSERT + || fcode == RS6000_OVLD_VEC_STEP)) return NULL; + /* Some overloads require special handling. */ + tree returned_expr = NULL; + resolution res = unresolved; + + if (fcode == RS6000_OVLD_VEC_MUL) + returned_expr = resolve_vec_mul (&res, args, types, loc); + else if (fcode == RS6000_OVLD_VEC_CMPNE) + returned_expr = resolve_vec_cmpne (&res, args, types, loc); + else if (fcode == RS6000_OVLD_VEC_ADDE || fcode == RS6000_OVLD_VEC_SUBE) + returned_expr = resolve_vec_adde_sube (&res, fcode, args, types, loc); + else if (fcode == RS6000_OVLD_VEC_ADDEC || fcode == RS6000_OVLD_VEC_SUBEC) + returned_expr = resolve_vec_addec_subec (&res, fcode, args, types, loc); + else if (fcode == RS6000_OVLD_VEC_SPLATS || fcode == RS6000_OVLD_VEC_PROMOTE) + returned_expr = resolve_vec_splats (&res, fcode, arglist, nargs); + else if (fcode == RS6000_OVLD_VEC_EXTRACT) + returned_expr = resolve_vec_extract (&res, arglist, nargs, loc); + else if (fcode == RS6000_OVLD_VEC_INSERT) + returned_expr = resolve_vec_insert (&res, arglist, nargs, loc); + else if (fcode == RS6000_OVLD_VEC_STEP) + returned_expr = resolve_vec_step (&res, arglist, nargs); + + if (res == resolved) + return returned_expr; + + /* "Regular" built-in functions and overloaded functions share a namespace + for some arrays, like rs6000_builtin_decls. But rs6000_overload_info + only has information for the overloaded functions, so we need an + adjusted index for that. */ + unsigned int adj_fcode = fcode - RS6000_OVLD_NONE; + + if (res == resolved_bad) + { + const char *name = rs6000_overload_info[adj_fcode].ovld_name; + error ("invalid parameter combination for AltiVec intrinsic %qs", name); + return error_mark_node; + } + bool unsupported_builtin = false; rs6000_gen_builtins instance_code; bool supported = false; -- cgit v1.1 From 06e32a5ebf20c11dd31bc2677bede569fef84316 Mon Sep 17 00:00:00 2001 From: Kito Cheng Date: Tue, 25 Jan 2022 20:44:04 +0800 Subject: RISC-V: Always pass -misa-spec to assembler [PR104219] Add -misa-spec to OPTION_DEFAULT_SPECS to make sure -misa-spec will always pass that into assembler, that prevent GCC and binutils using different way to interpret the ISA string. gcc/ChangeLog: PR target/104219 * config.gcc (riscv*-*-*): Normalize the with_isa_spec value. (all_defaults): Add isa_spec. * config/riscv/riscv.h (OPTION_DEFAULT_SPECS): Add isa_spec. --- gcc/config/riscv/riscv.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/riscv/riscv.h b/gcc/config/riscv/riscv.h index 6956684..8a4d2cf 100644 --- a/gcc/config/riscv/riscv.h +++ b/gcc/config/riscv/riscv.h @@ -60,6 +60,7 @@ extern const char *riscv_default_mtune (int argc, const char **argv); --with-arch is ignored if -march or -mcpu is specified. --with-abi is ignored if -mabi is specified. --with-tune is ignored if -mtune or -mcpu is specified. + --with-isa-spec is ignored if -misa-spec is specified. But using default -march/-mtune value if -mcpu don't have valid option. 
*/ #define OPTION_DEFAULT_SPECS \ @@ -70,6 +71,7 @@ extern const char *riscv_default_mtune (int argc, const char **argv); " %{!mcpu=*:-march=%(VALUE)}" \ " %{mcpu=*:%:riscv_expand_arch_from_cpu(%* %(VALUE))}}" }, \ {"abi", "%{!mabi=*:-mabi=%(VALUE)}" }, \ + {"isa_spec", "%{!misa-spec=*:-misa-spec=%(VALUE)}" }, \ #ifdef IN_LIBGCC2 #undef TARGET_64BIT -- cgit v1.1 From 8103623923ac4ea19b97a369979d4bd5731aab57 Mon Sep 17 00:00:00 2001 From: Kewen Lin Date: Sun, 6 Feb 2022 21:29:32 -0600 Subject: rs6000: Disable MMA if no VSX support [PR103627] As PR103627 shows, there is an unexpected case where !TARGET_VSX and TARGET_MMA co-exist. As ISA3.1 claims, SIMD is a requirement for MMA. By looking into the ICE, I noticed that the current MMA implementation depends on vector pairs load/store which use VSX register, but we don't have a separated option to control Power10 vector support and Segher pointed out "-mpower9-vector is a workaround that should go away" and more explanations in [1]. So this patch makes MMA require VSX instead. [1] https://gcc.gnu.org/pipermail/gcc-patches/2022-January/589303.html gcc/ChangeLog: PR target/103627 * config/rs6000/rs6000.cc (rs6000_option_override_internal): Disable MMA if !TARGET_VSX. gcc/testsuite/ChangeLog: PR target/103627 * gcc.target/powerpc/pr103627-1.c: New test. * gcc.target/powerpc/pr103627-2.c: New test. --- gcc/config/rs6000/rs6000.cc | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index d9fc67d..a2843d1 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -4482,6 +4482,16 @@ rs6000_option_override_internal (bool global_init_p) rs6000_isa_flags &= ~OPTION_MASK_MMA; } + /* MMA requires SIMD support as ISA 3.1 claims and our implementation + such as "*movoo" uses vector pair access which use VSX registers. + So make MMA require VSX support here. */ + if (TARGET_MMA && !TARGET_VSX) + { + if ((rs6000_isa_flags_explicit & OPTION_MASK_MMA) != 0) + error ("%qs requires %qs", "-mmma", "-mvsx"); + rs6000_isa_flags &= ~OPTION_MASK_MMA; + } + if (!TARGET_PCREL && TARGET_PCREL_OPT) rs6000_isa_flags &= ~OPTION_MASK_PCREL_OPT; -- cgit v1.1 From e66ba0f55c000152df63fc67c11a64f79122ef86 Mon Sep 17 00:00:00 2001 From: Kewen Lin Date: Sun, 6 Feb 2022 21:30:02 -0600 Subject: rs6000: Move the hunk affecting VSX/ALTIVEC ahead [PR103627] The modified hunk can update VSX and ALTIVEC flag, we have some codes to check/warn for some flags related to VSX and ALTIVEC sitting where the hunk is proprosed to be moved to. Without this adjustment, the VSX and ALTIVEC update is too late, it can cause the incompatibility and result in unexpected behaviors, the associated test case is one typical case. Since we already have the code which sets TARGET_FLOAT128_TYPE and lays after the moved place, and OPTION_MASK_FLOAT128_KEYWORD will rely on TARGET_FLOAT128_TYPE, so it just simply remove them. gcc/ChangeLog: PR target/103627 * config/rs6000/rs6000.cc (rs6000_option_override_internal): Move the hunk affecting VSX and ALTIVEC to appropriate place. gcc/testsuite/ChangeLog: PR target/103627 * gcc.target/powerpc/pr103627-3.c: New test. 
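As a rough illustration of the new check added above, a compile-only sketch along these lines should now be rejected cleanly instead of ICEing on the "*movoo" vector-pair pattern. This is only a sketch with assumed options, not the contents of the new pr103627-*.c tests (which are not reproduced in this log).

/* Sketch, assuming -mdejagnu-cpu=power10 so that -mmma is otherwise valid;
   the explicit -mmma together with -mno-vsx should trigger the new error.  */
/* { dg-do compile } */
/* { dg-options "-mdejagnu-cpu=power10 -mno-vsx -mmma" } */

void
foo (void)
{
}

/* Expected diagnostic from rs6000_option_override_internal:
   error: '-mmma' requires '-mvsx'  */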
--- gcc/config/rs6000/rs6000.cc | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index a2843d1..e571a0b 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -3934,6 +3934,15 @@ rs6000_option_override_internal (bool global_init_p) else if (TARGET_ALTIVEC) rs6000_isa_flags |= (OPTION_MASK_PPC_GFXOPT & ~ignore_masks); + /* Disable VSX and Altivec silently if the user switched cpus to power7 in a + target attribute or pragma which automatically enables both options, + unless the altivec ABI was set. This is set by default for 64-bit, but + not for 32-bit. Don't move this before the above code using ignore_masks, + since it can reset the cleared VSX/ALTIVEC flag again. */ + if (main_target_opt && !main_target_opt->x_rs6000_altivec_abi) + rs6000_isa_flags &= ~((OPTION_MASK_VSX | OPTION_MASK_ALTIVEC) + & ~rs6000_isa_flags_explicit); + if (TARGET_CRYPTO && !TARGET_ALTIVEC) { if (rs6000_isa_flags_explicit & OPTION_MASK_CRYPTO) @@ -4350,18 +4359,6 @@ rs6000_option_override_internal (bool global_init_p) } } - /* Disable VSX and Altivec silently if the user switched cpus to power7 in a - target attribute or pragma which automatically enables both options, - unless the altivec ABI was set. This is set by default for 64-bit, but - not for 32-bit. */ - if (main_target_opt != NULL && !main_target_opt->x_rs6000_altivec_abi) - { - TARGET_FLOAT128_TYPE = 0; - rs6000_isa_flags &= ~((OPTION_MASK_VSX | OPTION_MASK_ALTIVEC - | OPTION_MASK_FLOAT128_KEYWORD) - & ~rs6000_isa_flags_explicit); - } - /* Enable Altivec ABI for AIX -maltivec. */ if (TARGET_XCOFF && (TARGET_ALTIVEC || TARGET_VSX) -- cgit v1.1 From db95441cf5399aabc46ca83df19f7290c3e23cb1 Mon Sep 17 00:00:00 2001 From: Andreas Krebbel Date: Sun, 6 Feb 2022 09:07:41 +0100 Subject: Check always_inline flag in s390_can_inline_p [PR104327] MASK_MVCLE is set for -Os but not for other optimization levels. In general it should not make much sense to inline across calls where the flag is different but we have to allow it for always_inline. The patch also rearranges the hook implementation a bit based on the recommendations from Jakub und Martin in the PR. Bootstrapped and regression tested on s390x with various arch flags. Will commit after giving a few days for comments. gcc/ChangeLog: PR target/104327 * config/s390/s390.cc (s390_can_inline_p): Accept a few more flags if always_inline is set. Don't inline when tune differs without always_inline. gcc/testsuite/ChangeLog: PR target/104327 * gcc.c-torture/compile/pr104327.c: New test. --- gcc/config/s390/s390.cc | 64 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 47 insertions(+), 17 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc index 5c2a830..c6cfe41 100644 --- a/gcc/config/s390/s390.cc +++ b/gcc/config/s390/s390.cc @@ -16091,6 +16091,23 @@ s390_valid_target_attribute_p (tree fndecl, static bool s390_can_inline_p (tree caller, tree callee) { + /* Flags which if present in the callee are required in the caller as well. */ + const unsigned HOST_WIDE_INT caller_required_masks = MASK_OPT_HTM; + + /* Flags which affect the ABI and in general prevent inlining. */ + unsigned HOST_WIDE_INT must_match_masks + = (MASK_64BIT | MASK_ZARCH | MASK_HARD_DFP | MASK_SOFT_FLOAT + | MASK_LONG_DOUBLE_128 | MASK_OPT_VX); + + /* Flags which we in general want to prevent inlining but accept for + always_inline. 
*/ + const unsigned HOST_WIDE_INT always_inline_safe_masks + = MASK_MVCLE | MASK_BACKCHAIN | MASK_SMALL_EXEC; + + const HOST_WIDE_INT all_masks + = (caller_required_masks | must_match_masks | always_inline_safe_masks + | MASK_DEBUG_ARG | MASK_PACKED_STACK | MASK_ZVECTOR); + tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller); tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee); @@ -16103,16 +16120,18 @@ s390_can_inline_p (tree caller, tree callee) struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree); struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree); - bool ret = true; - if ((caller_opts->x_target_flags & ~(MASK_SOFT_FLOAT | MASK_HARD_DFP)) - != (callee_opts->x_target_flags & ~(MASK_SOFT_FLOAT | MASK_HARD_DFP))) - ret = false; + /* If one of these triggers make sure to add proper handling of your + new flag to this hook. */ + gcc_assert (!(caller_opts->x_target_flags & ~all_masks)); + gcc_assert (!(callee_opts->x_target_flags & ~all_masks)); - /* Don't inline functions to be compiled for a more recent arch into a - function for an older arch. */ - else if (caller_opts->x_s390_arch < callee_opts->x_s390_arch) - ret = false; + bool always_inline + = (DECL_DISREGARD_INLINE_LIMITS (callee) + && lookup_attribute ("always_inline", DECL_ATTRIBUTES (callee))); + + if (!always_inline) + must_match_masks |= always_inline_safe_masks; /* Inlining a hard float function into a soft float function is only allowed if the hard float function doesn't actually make use of @@ -16120,16 +16139,27 @@ s390_can_inline_p (tree caller, tree callee) We are called from FEs for multi-versioning call optimization, so beware of ipa_fn_summaries not available. */ - else if (((TARGET_SOFT_FLOAT_P (caller_opts->x_target_flags) - && !TARGET_SOFT_FLOAT_P (callee_opts->x_target_flags)) - || (!TARGET_HARD_DFP_P (caller_opts->x_target_flags) - && TARGET_HARD_DFP_P (callee_opts->x_target_flags))) - && (! ipa_fn_summaries - || ipa_fn_summaries->get - (cgraph_node::get (callee))->fp_expressions)) - ret = false; + if (always_inline && ipa_fn_summaries + && !ipa_fn_summaries->get(cgraph_node::get (callee))->fp_expressions) + must_match_masks &= ~(MASK_HARD_DFP | MASK_SOFT_FLOAT); - return ret; + if ((caller_opts->x_target_flags & must_match_masks) + != (callee_opts->x_target_flags & must_match_masks)) + return false; + + if (~(caller_opts->x_target_flags & caller_required_masks) + & (callee_opts->x_target_flags & caller_required_masks)) + return false; + + /* Don't inline functions to be compiled for a more recent arch into a + function for an older arch. */ + if (caller_opts->x_s390_arch < callee_opts->x_s390_arch) + return false; + + if (!always_inline && caller_opts->x_s390_tune != callee_opts->x_s390_tune) + return false; + + return true; } #endif -- cgit v1.1 From 12aae3b93aeae50f5ced1bbef57fe207ecd12930 Mon Sep 17 00:00:00 2001 From: Tamar Christina Date: Mon, 7 Feb 2022 12:54:42 +0000 Subject: AArch32: correct dot-product RTL patterns. The previous fix for this problem was wrong due to a subtle difference between where NEON expects the RMW values and where intrinsics expects them. The insn pattern is modeled after the intrinsics and so needs an expand for the vectorizer optab to switch the RTL. However operand[3] is not expected to be written to so the current pattern is bogus. Instead we use the expand to shuffle around the RTL. 
The vectorizer expects operands[3] and operands[0] to be the same but the aarch64 intrinsics expanders expect operands[0] and operands[1] to be the same. This also fixes some issues with big-endian, each dot product performs 4 8-byte multiplications. However compared to AArch64 we don't enter lanes in GCC lane indexed in AArch32 aside from loads/stores. This means no lane remappings are done in arm-builtins.c and so none should be done at the instruction side. There are some other instructions that need inspections as I think there are more incorrect ones. Third there was a bug in the ACLE specication for dot product which has now been fixed[1]. This means some intrinsics were missing and are added by this patch. Bootstrapped and regtested on arm-none-linux-gnueabihf and no issues. Ok for master? and active branches after some stew? [1] https://github.com/ARM-software/acle/releases/tag/r2021Q3 gcc/ChangeLog: * config/arm/arm_neon.h (vdot_laneq_u32, vdotq_laneq_u32, vdot_laneq_s32, vdotq_laneq_s32): New. * config/arm/arm_neon_builtins.def (sdot_laneq, udot_laneq): New. * config/arm/neon.md (neon_dot): New. (dot_prod): Re-order rtl. (neon_dot_lane): Fix rtl order and endiannes. (neon_dot_laneq): New. gcc/testsuite/ChangeLog: * gcc.target/arm/simd/vdot-compile.c: Add new cases. * gcc.target/arm/simd/vdot-exec.c: Likewise. --- gcc/config/arm/arm_neon.h | 29 ++++++++ gcc/config/arm/arm_neon_builtins.def | 2 + gcc/config/arm/neon.md | 125 ++++++++++++++++++++--------------- 3 files changed, 101 insertions(+), 55 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index 9b6d599..fdfea33 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -18243,6 +18243,35 @@ vdotq_lane_s32 (int32x4_t __r, int8x16_t __a, int8x8_t __b, const int __index) return __builtin_neon_sdot_lanev16qi (__r, __a, __b, __index); } +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdot_laneq_u32 (uint32x2_t __r, uint8x8_t __a, uint8x16_t __b, const int __index) +{ + return __builtin_neon_udot_laneqv8qi_uuuus (__r, __a, __b, __index); +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdotq_laneq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b, + const int __index) +{ + return __builtin_neon_udot_laneqv16qi_uuuus (__r, __a, __b, __index); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdot_laneq_s32 (int32x2_t __r, int8x8_t __a, int8x16_t __b, const int __index) +{ + return __builtin_neon_sdot_laneqv8qi (__r, __a, __b, __index); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdotq_laneq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b, const int __index) +{ + return __builtin_neon_sdot_laneqv16qi (__r, __a, __b, __index); +} + #pragma GCC pop_options #endif diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def index 865de65..c29ae3a 100644 --- a/gcc/config/arm/arm_neon_builtins.def +++ b/gcc/config/arm/arm_neon_builtins.def @@ -342,6 +342,8 @@ VAR2 (TERNOP, sdot, v8qi, v16qi) VAR2 (UTERNOP, udot, v8qi, v16qi) VAR2 (MAC_LANE, sdot_lane, v8qi, v16qi) VAR2 (UMAC_LANE, udot_lane, v8qi, v16qi) +VAR2 (MAC_LANE, sdot_laneq, v8qi, v16qi) +VAR2 (UMAC_LANE, udot_laneq, v8qi, v16qi) VAR1 (USTERNOP, usdot, v8qi) VAR2 (USMAC_LANE_QUADTUP, usdot_lane, v8qi, v16qi) diff 
--git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index e06c824..4a8987b 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -2866,20 +2866,49 @@ }) -;; These instructions map to the __builtins for the Dot Product operations. -(define_insn "neon_dot" +;; These map to the auto-vectorizer Dot Product optab. +;; The auto-vectorizer expects a dot product builtin that also does an +;; accumulation into the provided register. +;; Given the following pattern +;; +;; for (i=0; idot_prod" [(set (match_operand:VCVTI 0 "register_operand" "=w") - (plus:VCVTI (match_operand:VCVTI 1 "register_operand" "0") - (unspec:VCVTI [(match_operand: 2 - "register_operand" "w") - (match_operand: 3 - "register_operand" "w")] - DOTPROD)))] + (plus:VCVTI + (unspec:VCVTI [(match_operand: 1 "register_operand" "w") + (match_operand: 2 "register_operand" "w")] + DOTPROD) + (match_operand:VCVTI 3 "register_operand" "0")))] "TARGET_DOTPROD" - "vdot.\\t%0, %2, %3" + "vdot.\\t%0, %1, %2" [(set_attr "type" "neon_dot")] ) +;; These instructions map to the __builtins for the Dot Product operations +(define_expand "neon_dot" + [(set (match_operand:VCVTI 0 "register_operand" "=w") + (plus:VCVTI + (unspec:VCVTI [(match_operand: 2 "register_operand") + (match_operand: 3 "register_operand")] + DOTPROD) + (match_operand:VCVTI 1 "register_operand")))] + "TARGET_DOTPROD" +) + ;; These instructions map to the __builtins for the Dot Product operations. (define_insn "neon_usdot" [(set (match_operand:VCVTI 0 "register_operand" "=w") @@ -2898,17 +2927,40 @@ ;; indexed operations. (define_insn "neon_dot_lane" [(set (match_operand:VCVTI 0 "register_operand" "=w") - (plus:VCVTI (match_operand:VCVTI 1 "register_operand" "0") - (unspec:VCVTI [(match_operand: 2 - "register_operand" "w") - (match_operand:V8QI 3 "register_operand" "t") - (match_operand:SI 4 "immediate_operand" "i")] - DOTPROD)))] + (plus:VCVTI + (unspec:VCVTI [(match_operand: 2 "register_operand" "w") + (match_operand:V8QI 3 "register_operand" "t") + (match_operand:SI 4 "immediate_operand" "i")] + DOTPROD) + (match_operand:VCVTI 1 "register_operand" "0")))] + "TARGET_DOTPROD" + "vdot.\\t%0, %2, %P3[%c4]"; + [(set_attr "type" "neon_dot")] +) + +;; These instructions map to the __builtins for the Dot Product +;; indexed operations. +(define_insn "neon_dot_laneq" + [(set (match_operand:VCVTI 0 "register_operand" "=w") + (plus:VCVTI + (unspec:VCVTI [(match_operand: 2 "register_operand" "w") + (match_operand:V16QI 3 "register_operand" "t") + (match_operand:SI 4 "immediate_operand" "i")] + DOTPROD) + (match_operand:VCVTI 1 "register_operand" "0")))] "TARGET_DOTPROD" { - operands[4] - = GEN_INT (NEON_ENDIAN_LANE_N (V8QImode, INTVAL (operands[4]))); - return "vdot.\\t%0, %2, %P3[%c4]"; + int lane = INTVAL (operands[4]); + if (lane > GET_MODE_NUNITS (V2SImode) - 1) + { + operands[4] = GEN_INT (lane - GET_MODE_NUNITS (V2SImode)); + return "vdot.\\t%0, %2, %f3[%c4]"; + } + else + { + operands[4] = GEN_INT (lane); + return "vdot.\\t%0, %2, %e3[%c4]"; + } } [(set_attr "type" "neon_dot")] ) @@ -2932,43 +2984,6 @@ [(set_attr "type" "neon_dot")] ) -;; These expands map to the Dot Product optab the vectorizer checks for. -;; The auto-vectorizer expects a dot product builtin that also does an -;; accumulation into the provided register. 
-;; Given the following pattern -;; -;; for (i=0; idot_prod" - [(set (match_operand:VCVTI 0 "register_operand") - (plus:VCVTI (unspec:VCVTI [(match_operand: 1 - "register_operand") - (match_operand: 2 - "register_operand")] - DOTPROD) - (match_operand:VCVTI 3 "register_operand")))] - "TARGET_DOTPROD" -{ - emit_insn ( - gen_neon_dot (operands[3], operands[3], operands[1], - operands[2])); - emit_insn (gen_rtx_SET (operands[0], operands[3])); - DONE; -}) - ;; Auto-vectorizer pattern for usdot (define_expand "usdot_prod" [(set (match_operand:VCVTI 0 "register_operand") -- cgit v1.1 From f2d131645114f14bd91a60107c941287370650ea Mon Sep 17 00:00:00 2001 From: Tamar Christina Date: Mon, 7 Feb 2022 12:55:12 +0000 Subject: AArch32: correct usdot-product RTL patterns. There was a bug in the ACLE specication for dot product which has now been fixed[1]. This means some intrinsics were missing and are added by this patch. Bootstrapped and regtested on arm-none-linux-gnueabihf and no issues. Ok for master? [1] https://github.com/ARM-software/acle/releases/tag/r2021Q3 gcc/ChangeLog: * config/arm/arm_neon.h (vusdotq_s32, vusdot_laneq_s32, vusdotq_laneq_s32, vsudot_laneq_s32, vsudotq_laneq_s32): New * config/arm/arm_neon_builtins.def (usdot): Add V16QI. (usdot_laneq, sudot_laneq): New. * config/arm/neon.md (neon_dot_laneq): New. (neon_dot_lane): Remote unneeded code. gcc/testsuite/ChangeLog: * gcc.target/arm/simd/vdot-2-1.c: Add new tests. * gcc.target/arm/simd/vdot-2-2.c: Likewise and fix output. --- gcc/config/arm/arm_neon.h | 39 ++++++++++++++++++++++++++++++++++++ gcc/config/arm/arm_neon_builtins.def | 4 +++- gcc/config/arm/neon.md | 28 ++++++++++++++++++++++++-- 3 files changed, 68 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index fdfea33..b30d04c 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -18930,6 +18930,13 @@ vusdot_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b) return __builtin_neon_usdotv8qi_ssus (__r, __a, __b); } +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vusdotq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b) +{ + return __builtin_neon_usdotv16qi_ssus (__r, __a, __b); +} + __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vusdot_lane_s32 (int32x2_t __r, uint8x8_t __a, @@ -18962,6 +18969,38 @@ vsudotq_lane_s32 (int32x4_t __r, int8x16_t __a, return __builtin_neon_sudot_lanev16qi_sssus (__r, __a, __b, __index); } +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vusdot_laneq_s32 (int32x2_t __r, uint8x8_t __a, + int8x16_t __b, const int __index) +{ + return __builtin_neon_usdot_laneqv8qi_ssuss (__r, __a, __b, __index); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vusdotq_laneq_s32 (int32x4_t __r, uint8x16_t __a, + int8x16_t __b, const int __index) +{ + return __builtin_neon_usdot_laneqv16qi_ssuss (__r, __a, __b, __index); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsudot_laneq_s32 (int32x2_t __r, int8x8_t __a, + uint8x16_t __b, const int __index) +{ + return __builtin_neon_sudot_laneqv8qi_sssus (__r, __a, __b, __index); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsudotq_laneq_s32 
(int32x4_t __r, int8x16_t __a, + uint8x16_t __b, const int __index) +{ + return __builtin_neon_sudot_laneqv16qi_sssus (__r, __a, __b, __index); +} + #pragma GCC pop_options #pragma GCC pop_options diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def index c29ae3a..445b2bf 100644 --- a/gcc/config/arm/arm_neon_builtins.def +++ b/gcc/config/arm/arm_neon_builtins.def @@ -345,9 +345,11 @@ VAR2 (UMAC_LANE, udot_lane, v8qi, v16qi) VAR2 (MAC_LANE, sdot_laneq, v8qi, v16qi) VAR2 (UMAC_LANE, udot_laneq, v8qi, v16qi) -VAR1 (USTERNOP, usdot, v8qi) +VAR2 (USTERNOP, usdot, v8qi, v16qi) VAR2 (USMAC_LANE_QUADTUP, usdot_lane, v8qi, v16qi) VAR2 (SUMAC_LANE_QUADTUP, sudot_lane, v8qi, v16qi) +VAR2 (USMAC_LANE_QUADTUP, usdot_laneq, v8qi, v16qi) +VAR2 (SUMAC_LANE_QUADTUP, sudot_laneq, v8qi, v16qi) VAR4 (BINOP, vcadd90, v4hf, v2sf, v8hf, v4sf) VAR4 (BINOP, vcadd270, v4hf, v2sf, v8hf, v4sf) diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 4a8987b..2b9a3de 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -2977,9 +2977,33 @@ DOTPROD_I8MM) (match_operand:VCVTI 1 "register_operand" "0")))] "TARGET_I8MM" + "vdot.\\t%0, %2, %P3[%c4]" + [(set_attr "type" "neon_dot")] +) + +;; These instructions map to the __builtins for the Dot Product +;; indexed operations in the v8.6 I8MM extension. +(define_insn "neon_dot_laneq" + [(set (match_operand:VCVTI 0 "register_operand" "=w") + (plus:VCVTI + (unspec:VCVTI [(match_operand: 2 "register_operand" "w") + (match_operand:V16QI 3 "register_operand" "t") + (match_operand:SI 4 "immediate_operand" "i")] + DOTPROD_I8MM) + (match_operand:VCVTI 1 "register_operand" "0")))] + "TARGET_I8MM" { - operands[4] = GEN_INT (INTVAL (operands[4])); - return "vdot.\\t%0, %2, %P3[%c4]"; + int lane = INTVAL (operands[4]); + if (lane > GET_MODE_NUNITS (V2SImode) - 1) + { + operands[4] = GEN_INT (lane - GET_MODE_NUNITS (V2SImode)); + return "vdot.\\t%0, %2, %f3[%c4]"; + } + else + { + operands[4] = GEN_INT (lane); + return "vdot.\\t%0, %2, %e3[%c4]"; + } } [(set_attr "type" "neon_dot")] ) -- cgit v1.1 From 04b54cc486cc6fcc40380445e500eaf46d7901dc Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Thu, 3 Feb 2022 14:00:02 +0100 Subject: [nvptx] Fix .local atomic regressions In PR target/104364, two problems were reported: - in muniform-simt mode, an atom.cas insn is no longer executed in the "master lane" only. - in msoft-stack mode, an __atomic_compare_exchange_n on stack memory is translated assuming it accesses local memory, while that's not the case. Fix these by: - ensuring that all insns with atomic attribute are also predicable, such that the validate_change in nvptx_reorg_uniform_simt will succeed, and asserting that it does, and - guarding the local atomics implementation with a new function nvptx_mem_local_p that correctly handles msoft-stack. Tested on x86_64 with nvptx accelerator. gcc/ChangeLog: 2022-02-04 Tom de Vries PR target/104364 * config/nvptx/nvptx-protos.h (nvptx_mem_local_p): Declare. * config/nvptx/nvptx.cc (nvptx_reorg_uniform_simt): Assert that change is validated. (nvptx_mem_local_p): New function. * config/nvptx/nvptx.md: Use nvptx_mem_local_p. (define_c_enum "unspecv"): Add UNSPECV_CAS_LOCAL. (define_insn "atomic_compare_and_swap_1_local"): New non-atomic, non-predicable define_insn, factored out of ... (define_insn "atomic_compare_and_swap_1"): ... here. Make predicable again. (define_expand "atomic_compare_and_swap"): Use atomic_compare_and_swap_1_local. 
gcc/testsuite/ChangeLog: 2022-02-04 Tom de Vries PR target/104364 * gcc.target/nvptx/softstack-2.c: New test. * gcc.target/nvptx/uniform-simt-1.c: New test. --- gcc/config/nvptx/nvptx-protos.h | 1 + gcc/config/nvptx/nvptx.cc | 25 +++++++++++++++- gcc/config/nvptx/nvptx.md | 63 +++++++++++++++++++++-------------------- 3 files changed, 58 insertions(+), 31 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx-protos.h b/gcc/config/nvptx/nvptx-protos.h index 3d6ad14..a846e34 100644 --- a/gcc/config/nvptx/nvptx-protos.h +++ b/gcc/config/nvptx/nvptx-protos.h @@ -59,5 +59,6 @@ extern const char *nvptx_output_simt_enter (rtx, rtx, rtx); extern const char *nvptx_output_simt_exit (rtx); extern const char *nvptx_output_red_partition (rtx, rtx); extern const char *nvptx_output_atomic_insn (const char *, rtx *, int, int); +extern bool nvptx_mem_local_p (rtx); #endif #endif diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index b3bb97c..2a69492 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -3150,7 +3150,8 @@ nvptx_reorg_uniform_simt () rtx pred = nvptx_get_unisimt_predicate (); pred = gen_rtx_NE (BImode, pred, const0_rtx); pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat); - validate_change (insn, &PATTERN (insn), pat, false); + bool changed_p = validate_change (insn, &PATTERN (insn), pat, false); + gcc_assert (changed_p); } } @@ -6894,6 +6895,28 @@ nvptx_libc_has_function (enum function_class fn_class, tree type) return default_libc_has_function (fn_class, type); } +bool +nvptx_mem_local_p (rtx mem) +{ + gcc_assert (GET_CODE (mem) == MEM); + + struct address_info info; + decompose_mem_address (&info, mem); + + if (info.base != NULL && REG_P (*info.base) + && REGNO_PTR_FRAME_P (REGNO (*info.base))) + { + if (TARGET_SOFT_STACK) + { + /* Frame-related doesn't mean local. 
*/ + } + else + return true; + } + + return false; +} + #undef TARGET_OPTION_OVERRIDE #define TARGET_OPTION_OVERRIDE nvptx_option_override diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index 92768dd..d64dbfd 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -54,6 +54,7 @@ (define_c_enum "unspecv" [ UNSPECV_LOCK UNSPECV_CAS + UNSPECV_CAS_LOCAL UNSPECV_XCHG UNSPECV_BARSYNC UNSPECV_WARPSYNC @@ -1771,8 +1772,14 @@ (match_operand:SI 7 "const_int_operand")] ;; failure model "" { - emit_insn (gen_atomic_compare_and_swap_1 - (operands[1], operands[2], operands[3], operands[4], operands[6])); + if (nvptx_mem_local_p (operands[2])) + emit_insn (gen_atomic_compare_and_swap_1_local + (operands[1], operands[2], operands[3], operands[4], + operands[6])); + else + emit_insn (gen_atomic_compare_and_swap_1 + (operands[1], operands[2], operands[3], operands[4], + operands[6])); rtx cond = gen_reg_rtx (BImode); emit_move_insn (cond, gen_rtx_EQ (BImode, operands[1], operands[3])); @@ -1780,23 +1787,18 @@ DONE; }) -(define_insn "atomic_compare_and_swap_1" +(define_insn "atomic_compare_and_swap_1_local" [(set (match_operand:SDIM 0 "nvptx_register_operand" "=R") (unspec_volatile:SDIM [(match_operand:SDIM 1 "memory_operand" "+m") (match_operand:SDIM 2 "nvptx_nonmemory_operand" "Ri") (match_operand:SDIM 3 "nvptx_nonmemory_operand" "Ri") (match_operand:SI 4 "const_int_operand")] - UNSPECV_CAS)) + UNSPECV_CAS_LOCAL)) (set (match_dup 1) - (unspec_volatile:SDIM [(const_int 0)] UNSPECV_CAS))] + (unspec_volatile:SDIM [(const_int 0)] UNSPECV_CAS_LOCAL))] "" { - struct address_info info; - decompose_mem_address (&info, operands[1]); - if (info.base != NULL && REG_P (*info.base) - && REGNO_PTR_FRAME_P (REGNO (*info.base))) - { output_asm_insn ("{", NULL); output_asm_insn ("\\t" ".reg.pred" "\\t" "%%eq_p;", NULL); output_asm_insn ("\\t" ".reg%t0" "\\t" "%%val;", operands); @@ -1807,13 +1809,26 @@ output_asm_insn ("\\t" "mov%t0" "\\t" "%0,%%val;", operands); output_asm_insn ("}", NULL); return ""; - } + } + [(set_attr "predicable" "false")]) + +(define_insn "atomic_compare_and_swap_1" + [(set (match_operand:SDIM 0 "nvptx_register_operand" "=R") + (unspec_volatile:SDIM + [(match_operand:SDIM 1 "memory_operand" "+m") + (match_operand:SDIM 2 "nvptx_nonmemory_operand" "Ri") + (match_operand:SDIM 3 "nvptx_nonmemory_operand" "Ri") + (match_operand:SI 4 "const_int_operand")] + UNSPECV_CAS)) + (set (match_dup 1) + (unspec_volatile:SDIM [(const_int 0)] UNSPECV_CAS))] + "" + { const char *t - = "\\tatom%A1.cas.b%T0\\t%0, %1, %2, %3;"; + = "%.\\tatom%A1.cas.b%T0\\t%0, %1, %2, %3;"; return nvptx_output_atomic_insn (t, operands, 1, 4); } - [(set_attr "atomic" "true") - (set_attr "predicable" "false")]) + [(set_attr "atomic" "true")]) (define_insn "atomic_exchange" [(set (match_operand:SDIM 0 "nvptx_register_operand" "=R") ;; output @@ -1825,10 +1840,7 @@ (match_operand:SDIM 2 "nvptx_nonmemory_operand" "Ri"))] ;; input "" { - struct address_info info; - decompose_mem_address (&info, operands[1]); - if (info.base != NULL && REG_P (*info.base) - && REGNO_PTR_FRAME_P (REGNO (*info.base))) + if (nvptx_mem_local_p (operands[1])) { output_asm_insn ("{", NULL); output_asm_insn ("\\t" ".reg%t0" "\\t" "%%val;", operands); @@ -1855,10 +1867,7 @@ (match_dup 1))] "" { - struct address_info info; - decompose_mem_address (&info, operands[1]); - if (info.base != NULL && REG_P (*info.base) - && REGNO_PTR_FRAME_P (REGNO (*info.base))) + if (nvptx_mem_local_p (operands[1])) { output_asm_insn ("{", NULL); 
output_asm_insn ("\\t" ".reg%t0" "\\t" "%%val;", operands); @@ -1888,10 +1897,7 @@ (match_dup 1))] "" { - struct address_info info; - decompose_mem_address (&info, operands[1]); - if (info.base != NULL && REG_P (*info.base) - && REGNO_PTR_FRAME_P (REGNO (*info.base))) + if (nvptx_mem_local_p (operands[1])) { output_asm_insn ("{", NULL); output_asm_insn ("\\t" ".reg%t0" "\\t" "%%val;", operands); @@ -1924,10 +1930,7 @@ (match_dup 1))] "mode == SImode || TARGET_SM35" { - struct address_info info; - decompose_mem_address (&info, operands[1]); - if (info.base != NULL && REG_P (*info.base) - && REGNO_PTR_FRAME_P (REGNO (*info.base))) + if (nvptx_mem_local_p (operands[1])) { output_asm_insn ("{", NULL); output_asm_insn ("\\t" ".reg.b%T0" "\\t" "%%val;", operands); -- cgit v1.1 From 73f4a989b7f8aeaf8bff37e7f33b65d26b8f179f Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Mon, 7 Feb 2022 14:50:13 +0100 Subject: [nvptx] Fix 'main (int argc)' compilation On nvptx, with test-case sso-12.c I run into: ... spawn nvptx-none-run ./sso-12.exe^M error: Prototype doesn't match for 'main' in 'input file 1 at offset 1796', \ first defined in 'input file 1 at offset 1796'^M nvptx-run: cuLinkAddData failed: device kernel image is invalid \ (CUDA_ERROR_INVALID_SOURCE, 300)^M FAIL: gcc.dg/sso-12.c execution test ... The problem is that the test case uses 'main (int)' prototype, while __main uses: ... extern int main (int, void **); ... There's code in write_fn_proto_1 to handle 'main (void)' as if 'main (int, void **)' was specified, but that's not active for 'main (int)'. Fix this in write_fn_proto_1 by handling 'main (int)' as if 'main (int, void **)' was specified. Tested on nvptx. gcc/ChangeLog: 2022-02-07 Tom de Vries * config/nvptx/nvptx.cc (write_fn_proto_1): Handle 'main (int)'. --- gcc/config/nvptx/nvptx.cc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index 2a69492..006fac8 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -938,10 +938,13 @@ write_fn_proto_1 (std::stringstream &s, bool is_defn, if (DECL_STATIC_CHAIN (decl)) argno = write_arg_type (s, -1, argno, ptr_type_node, true); - if (!argno && strcmp (name, "main") == 0) + if (argno < 2 && strcmp (name, "main") == 0) { - argno = write_arg_type (s, -1, argno, integer_type_node, true); - argno = write_arg_type (s, -1, argno, ptr_type_node, true); + if (argno == 0) + argno = write_arg_type (s, -1, argno, integer_type_node, true); + + if (argno == 1) + argno = write_arg_type (s, -1, argno, ptr_type_node, true); } if (argno) -- cgit v1.1 From 3faeba72cf93bdbf0b42d6b1b65fd4f0794f9d2a Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Tue, 8 Feb 2022 12:14:58 +0000 Subject: RISC-V: Add target machine headers as a dependency for riscv-sr.o Make riscv-sr.o depend on target machine headers, removing spurious test failures: FAIL: gcc.target/riscv/save-restore-3.c scan-assembler-not call[ \t]*t0,__riscv_save_0 FAIL: gcc.target/riscv/save-restore-3.c scan-assembler-not tail[ \t]*__riscv_restore_0 FAIL: gcc.target/riscv/save-restore-3.c scan-assembler tail[ \t]*foo FAIL: gcc.target/riscv/save-restore-6.c scan-assembler-not call[ \t]*t0,__riscv_save_0 FAIL: gcc.target/riscv/save-restore-6.c scan-assembler-not tail[ \t]*__riscv_restore_0 FAIL: gcc.target/riscv/save-restore-6.c scan-assembler tail[ \t]*other_func if the definitions of UNSPECs are locally changed and GCC rebuilt from a dirty tree. 
gcc/ * config/riscv/t-riscv (riscv-sr.o): Add $(TM_H) dependency. --- gcc/config/riscv/t-riscv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/riscv/t-riscv b/gcc/config/riscv/t-riscv index 096d70e..19736b3 100644 --- a/gcc/config/riscv/t-riscv +++ b/gcc/config/riscv/t-riscv @@ -6,7 +6,7 @@ riscv-builtins.o: $(srcdir)/config/riscv/riscv-builtins.cc $(CONFIG_H) \ $(srcdir)/config/riscv/riscv-builtins.cc riscv-sr.o: $(srcdir)/config/riscv/riscv-sr.cc $(CONFIG_H) \ - $(SYSTEM_H) + $(SYSTEM_H) $(TM_H) $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ $(srcdir)/config/riscv/riscv-sr.cc -- cgit v1.1 From decde11183bdccc46587d6614b75f3d56a2f2e4a Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Fri, 4 Feb 2022 08:53:52 +0100 Subject: [nvptx] Choose -mptx default based on -misa While testing with driver version 390.147 I ran into the problem that it doesn't support ptx isa version 6.3 (the new default), only 6.1. Furthermore, using the -mptx option is a bit user-unfriendly. Say we want to compile for sm_80. We can use -misa=sm_80 to specify that, but then run into errors because the default ptx version is 6.3, which doesn't support sm_80 yet. Address both these issues by: - picking a default -mptx based on the active -misa, and - ensuring that the default -mptx is at least 6.0 (instead of 6.3). Also add an error in case of incompatible options like "-misa=sm_80 -mptx=6.3": ... cc1: error: PTX version (-mptx) needs to be at least 7.0 to support \ selected -misa (sm_80) ... Tested on x86_64-linux with nvptx accelerator. gcc/ChangeLog: 2022-02-08 Tom de Vries PR target/104283 * config/nvptx/nvptx-opts.h (enum ptx_version): Add PTX_VERSION_3_0 and PTX_VERSION_4_2. * config/nvptx/nvptx.cc (first_ptx_version_supporting_sm) (default_ptx_version_option, ptx_version_to_string) (sm_version_to_string, handle_ptx_version_option): New function. (nvptx_option_override): Call handle_ptx_version_option. (nvptx_file_start): Use ptx_version_to_string and sm_version_to_string. * config/nvptx/nvptx.md (define_insn "nvptx_shuffle") (define_insn "nvptx_vote_ballot"): Use TARGET_PTX_6_0. * config/nvptx/nvptx.opt (mptx): Remove 'Init'. 
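The effect of the new -misa/-mptx handling can be sketched with a compile-only example; this is an assumed illustration rather than one of the committed nvptx tests, and the diagnostic text is the one quoted in the commit message above.

/* Sketch: an explicit -mptx too old for the selected -misa is now diagnosed
   up front instead of producing PTX the assembler rejects; -misa=sm_80 on
   its own now selects PTX 7.0 automatically.  */
/* { dg-do compile } */
/* { dg-options "-misa=sm_80 -mptx=6.3" } */

int
main (void)
{
  return 0;
}

/* Expected:
   cc1: error: PTX version (-mptx) needs to be at least 7.0 to support
   selected -misa (sm_80)  */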
--- gcc/config/nvptx/nvptx-opts.h | 2 + gcc/config/nvptx/nvptx.cc | 133 +++++++++++++++++++++++++++++++++++++----- gcc/config/nvptx/nvptx.md | 4 +- gcc/config/nvptx/nvptx.opt | 2 +- 4 files changed, 122 insertions(+), 19 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx-opts.h b/gcc/config/nvptx/nvptx-opts.h index c754a51..cc488b2 100644 --- a/gcc/config/nvptx/nvptx-opts.h +++ b/gcc/config/nvptx/nvptx-opts.h @@ -31,7 +31,9 @@ enum ptx_isa enum ptx_version { + PTX_VERSION_3_0, PTX_VERSION_3_1, + PTX_VERSION_4_2, PTX_VERSION_6_0, PTX_VERSION_6_3, PTX_VERSION_7_0 diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index 006fac8..1b0227a 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -205,6 +205,109 @@ diagnose_openacc_conflict (bool optval, const char *optname) error ("option %s is not supported together with %<-fopenacc%>", optname); } +static enum ptx_version +first_ptx_version_supporting_sm (enum ptx_isa sm) +{ + switch (sm) + { + case PTX_ISA_SM30: + return PTX_VERSION_3_0; + case PTX_ISA_SM35: + return PTX_VERSION_3_1; + case PTX_ISA_SM53: + return PTX_VERSION_4_2; + case PTX_ISA_SM75: + return PTX_VERSION_6_3; + case PTX_ISA_SM80: + return PTX_VERSION_7_0; + default: + gcc_unreachable (); + } +} + +static enum ptx_version +default_ptx_version_option (void) +{ + enum ptx_version first + = first_ptx_version_supporting_sm ((enum ptx_isa) ptx_isa_option); + + /* Pick a version that supports the sm. */ + enum ptx_version res = first; + + /* Pick at least 3.1. This has been the smallest version historically. */ + res = MAX (res, PTX_VERSION_3_1); + + /* Pick at least 6.0, to enable using bar.warp.sync to have a way to force + warp convergence. */ + res = MAX (res, PTX_VERSION_6_0); + + /* Verify that we pick a version that supports the sm. */ + gcc_assert (first <= res); + return res; +} + +static const char * +ptx_version_to_string (enum ptx_version v) +{ + switch (v) + { + case PTX_VERSION_3_0: + return "3.0"; + case PTX_VERSION_3_1: + return "3.1"; + case PTX_VERSION_4_2: + return "4.2"; + case PTX_VERSION_6_0: + return "6.0"; + case PTX_VERSION_6_3: + return "6.3"; + case PTX_VERSION_7_0: + return "7.0"; + default: + gcc_unreachable (); + } +} + +static const char * +sm_version_to_string (enum ptx_isa sm) +{ + switch (sm) + { + case PTX_ISA_SM30: + return "30"; + case PTX_ISA_SM35: + return "35"; + case PTX_ISA_SM53: + return "53"; + case PTX_ISA_SM70: + return "70"; + case PTX_ISA_SM75: + return "75"; + case PTX_ISA_SM80: + return "80"; + default: + gcc_unreachable (); + } +} + +static void +handle_ptx_version_option (void) +{ + if (!OPTION_SET_P (ptx_version_option)) + { + ptx_version_option = default_ptx_version_option (); + return; + } + + enum ptx_version first + = first_ptx_version_supporting_sm ((enum ptx_isa) ptx_isa_option); + + if (ptx_version_option < first) + error ("PTX version (-mptx) needs to be at least %s to support selected" + " -misa (sm_%s)", ptx_version_to_string (first), + sm_version_to_string ((enum ptx_isa)ptx_isa_option)); +} + /* Implement TARGET_OPTION_OVERRIDE. */ static void @@ -212,6 +315,8 @@ nvptx_option_override (void) { init_machine_status = nvptx_init_machine_status; + handle_ptx_version_option (); + /* Set toplevel_reorder, unless explicitly disabled. We need reordering so that we emit necessary assembler decls of undeclared variables. 
*/ @@ -5430,23 +5535,19 @@ static void nvptx_file_start (void) { fputs ("// BEGIN PREAMBLE\n", asm_out_file); - if (TARGET_PTX_7_0) - fputs ("\t.version\t7.0\n", asm_out_file); - else if (TARGET_PTX_6_3) - fputs ("\t.version\t6.3\n", asm_out_file); - else - fputs ("\t.version\t3.1\n", asm_out_file); - if (TARGET_SM80) - fputs ("\t.target\tsm_80\n", asm_out_file); - else if (TARGET_SM75) - fputs ("\t.target\tsm_75\n", asm_out_file); - else if (TARGET_SM53) - fputs ("\t.target\tsm_53\n", asm_out_file); - else if (TARGET_SM35) - fputs ("\t.target\tsm_35\n", asm_out_file); - else - fputs ("\t.target\tsm_30\n", asm_out_file); + + fputs ("\t.version\t", asm_out_file); + fputs (ptx_version_to_string ((enum ptx_version)ptx_version_option), + asm_out_file); + fputs ("\n", asm_out_file); + + fputs ("\t.target\tsm_", asm_out_file); + fputs (sm_version_to_string ((enum ptx_isa)ptx_isa_option), + asm_out_file); + fputs ("\n", asm_out_file); + fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode)); + fputs ("// END PREAMBLE\n", asm_out_file); } diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index d64dbfd..7463603 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -1603,7 +1603,7 @@ UNSPEC_SHUFFLE))] "" { - if (TARGET_PTX_6_3) + if (TARGET_PTX_6_0) return "%.\\tshfl.sync%S3.b32\\t%0, %1, %2, 31, 0xffffffff;"; else return "%.\\tshfl%S3.b32\\t%0, %1, %2, 31;"; @@ -1615,7 +1615,7 @@ UNSPEC_VOTE_BALLOT))] "" { - if (TARGET_PTX_6_3) + if (TARGET_PTX_6_0) return "%.\\tvote.sync.ballot.b32\\t%0, %1, 0xffffffff;"; else return "%.\\tvote.ballot.b32\\t%0, %1;"; diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt index 6e12b1f..e3f65b2 100644 --- a/gcc/config/nvptx/nvptx.opt +++ b/gcc/config/nvptx/nvptx.opt @@ -89,5 +89,5 @@ EnumValue Enum(ptx_version) String(7.0) Value(PTX_VERSION_7_0) mptx= -Target RejectNegative ToLower Joined Enum(ptx_version) Var(ptx_version_option) Init(PTX_VERSION_6_3) +Target RejectNegative ToLower Joined Enum(ptx_version) Var(ptx_version_option) Specify the version of the ptx version to use. -- cgit v1.1 From 1e3185e714e877b2b4d14ade0865322f71a8cbf6 Mon Sep 17 00:00:00 2001 From: Robin Dapp Date: Tue, 8 Feb 2022 14:56:29 +0100 Subject: s390: Increase costs for load on condition and change movqicc expander. This patch changes the costs for a load on condition from 5 to 6 in order to ensure that we only if-convert two and not three or more SETS like if (cond) { a = b; c = d; e = f; } In the movqicc expander we emit a paradoxical subreg directly that combine would otherwise try to create by using a non-optimal sequence (which would be too expensive). Also, fix two oversights in ifcvt testcases. gcc/ChangeLog: * config/s390/s390.cc (s390_rtx_costs): Increase costs for load on condition. * config/s390/s390.md: Use paradoxical subreg. gcc/testsuite/ChangeLog: * gcc.target/s390/ifcvt-two-insns-int.c: Fix array size. * gcc.target/s390/ifcvt-two-insns-long.c: Dito. --- gcc/config/s390/s390.cc | 2 +- gcc/config/s390/s390.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc index c6cfe41..d2af6d8 100644 --- a/gcc/config/s390/s390.cc +++ b/gcc/config/s390/s390.cc @@ -3636,7 +3636,7 @@ s390_rtx_costs (rtx x, machine_mode mode, int outer_code, /* It is going to be a load/store on condition. Make it slightly more expensive than a normal load. 
*/ - *total = COSTS_N_INSNS (1) + 1; + *total = COSTS_N_INSNS (1) + 2; rtx dst = SET_DEST (x); rtx then = XEXP (SET_SRC (x), 1); diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md index e3ccbac..5eee8e8 100644 --- a/gcc/config/s390/s390.md +++ b/gcc/config/s390/s390.md @@ -7003,9 +7003,9 @@ if (!CONSTANT_P (els)) els = simplify_gen_subreg (E_SImode, els, mode, 0); - rtx tmp_target = gen_reg_rtx (E_SImode); + rtx tmp_target = simplify_gen_subreg (E_SImode, operands[0], mode, 0); + emit_insn (gen_movsicc (tmp_target, operands[1], then, els)); - emit_move_insn (operands[0], gen_lowpart (mode, tmp_target)); DONE; }) -- cgit v1.1 From ab1355a4804f04700a6ad49c9cc90261334e9dc3 Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Tue, 8 Feb 2022 15:35:37 +0100 Subject: [nvptx] Unbreak build, add PTX_ISA_SM70 With the commit "[nvptx] Choose -mptx default based on -misa" I introduced a use of PTX_ISA_SM70, without adding it first. Add it, as well as the corresponding TARGET_SM70. Build for x86_64 with nvptx accelerator. gcc/ChangeLog: 2022-02-08 Tom de Vries * config/nvptx/nvptx-opts.h (enum ptx_isa): Add PTX_ISA_SM70. * config/nvptx/nvptx.h (TARGET_SM70): Define. --- gcc/config/nvptx/nvptx-opts.h | 1 + gcc/config/nvptx/nvptx.h | 1 + 2 files changed, 2 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx-opts.h b/gcc/config/nvptx/nvptx-opts.h index cc488b2..e918d43 100644 --- a/gcc/config/nvptx/nvptx-opts.h +++ b/gcc/config/nvptx/nvptx-opts.h @@ -25,6 +25,7 @@ enum ptx_isa PTX_ISA_SM30, PTX_ISA_SM35, PTX_ISA_SM53, + PTX_ISA_SM70, PTX_ISA_SM75, PTX_ISA_SM80 }; diff --git a/gcc/config/nvptx/nvptx.h b/gcc/config/nvptx/nvptx.h index 065d7aa..edffd08 100644 --- a/gcc/config/nvptx/nvptx.h +++ b/gcc/config/nvptx/nvptx.h @@ -88,6 +88,7 @@ #define TARGET_SM35 (ptx_isa_option >= PTX_ISA_SM35) #define TARGET_SM53 (ptx_isa_option >= PTX_ISA_SM53) +#define TARGET_SM70 (ptx_isa_option >= PTX_ISA_SM70) #define TARGET_SM75 (ptx_isa_option >= PTX_ISA_SM75) #define TARGET_SM80 (ptx_isa_option >= PTX_ISA_SM80) -- cgit v1.1 From 943d631abdd7be623cbf2b870d3d0cfef89f5f26 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Tue, 8 Feb 2022 10:36:14 -0600 Subject: rs6000: Add support for vmsumcud and vec_msumc 2022-02-08 Bill Schmidt gcc/ * config/rs6000/rs6000-builtins.def (VMSUMCUD): New. * config/rs6000/rs6000-overload.def (VEC_MSUMC): New. * config/rs6000/vsx.md (UNSPEC_VMSUMCUD): New constant. (vmsumcud): New define_insn. gcc/testsuite/ * gcc.target/powerpc/vec-msumc.c: New test. 
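A short usage sketch for the new overload: the argument and result types below are read off the overload entry added in rs6000-overload.def (vull, vull, vuq -> vuq), while the carry-out description is an assumption based on the vmsumcud mnemonic; this is not the committed vec-msumc.c test.

#include <altivec.h>

/* Sketch only: with -mcpu=power10 this is expected to compile to a single
   vmsumcud, accumulating the carries out of the unsigned doubleword
   multiply-sum into the 128-bit accumulator.  */
vector unsigned __int128
msum_carry (vector unsigned long long a, vector unsigned long long b,
            vector unsigned __int128 c)
{
  return vec_msumc (a, b, c);
}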
--- gcc/config/rs6000/rs6000-builtins.def | 3 +++ gcc/config/rs6000/rs6000-overload.def | 4 ++++ gcc/config/rs6000/vsx.md | 13 +++++++++++++ 3 files changed, 20 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-builtins.def b/gcc/config/rs6000/rs6000-builtins.def index 7f527b6..2d1e63fb 100644 --- a/gcc/config/rs6000/rs6000-builtins.def +++ b/gcc/config/rs6000/rs6000-builtins.def @@ -3497,6 +3497,9 @@ const signed int __builtin_altivec_vstrihr_p (vss); VSTRIHR_P vstrir_p_v8hi {} + const vuq __builtin_vsx_vmsumcud (vull, vull, vuq); + VMSUMCUD vmsumcud {} + const signed int __builtin_vsx_xvtlsbb_all_ones (vsc); XVTLSBB_ONES xvtlsbbo {} diff --git a/gcc/config/rs6000/rs6000-overload.def b/gcc/config/rs6000/rs6000-overload.def index cdc703e..49a6104 100644 --- a/gcc/config/rs6000/rs6000-overload.def +++ b/gcc/config/rs6000/rs6000-overload.def @@ -2456,6 +2456,10 @@ vuq __builtin_vec_msum (vull, vull, vuq); VMSUMUDM VMSUMUDM_U +[VEC_MSUMC, vec_msumc, __builtin_vec_msumc] + vuq __builtin_vec_msumc (vull, vull, vuq); + VMSUMCUD + [VEC_MSUMS, vec_msums, __builtin_vec_msums] vui __builtin_vec_msums (vus, vus, vui); VMSUMUHS diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md index c8c891e..2f5a2f7 100644 --- a/gcc/config/rs6000/vsx.md +++ b/gcc/config/rs6000/vsx.md @@ -372,6 +372,7 @@ UNSPEC_REPLACE_UN UNSPEC_VDIVES UNSPEC_VDIVEU + UNSPEC_VMSUMCUD UNSPEC_XXEVAL UNSPEC_XXSPLTIW UNSPEC_XXSPLTIDP @@ -6620,3 +6621,15 @@ emit_move_insn (operands[0], tmp4); DONE; }) + +;; vmsumcud +(define_insn "vmsumcud" +[(set (match_operand:V1TI 0 "register_operand" "+v") + (unspec:V1TI [(match_operand:V2DI 1 "register_operand" "v") + (match_operand:V2DI 2 "register_operand" "v") + (match_operand:V1TI 3 "register_operand" "v")] + UNSPEC_VMSUMCUD))] + "TARGET_POWER10" + "vmsumcud %0,%1,%2,%3" + [(set_attr "type" "veccomplex")] +) -- cgit v1.1 From 0c3e491a4e5ae74bfbed6d167d403d262b5a4adc Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Tue, 8 Feb 2022 20:14:30 +0100 Subject: rs6000: Fix up vspltis_shifted [PR102140] The following testcase ICEs, because (const_vector:V4SI [ (const_int 0 [0]) repeated x3 (const_int -2147483648 [0xffffffff80000000]) ]) is recognized as valid easy_vector_constant in between split1 pass and end of RA. The problem is that such constants need to be split, and the only splitter for that is: (define_split [(set (match_operand:VM 0 "altivec_register_operand") (match_operand:VM 1 "easy_vector_constant_vsldoi"))] "VECTOR_UNIT_ALTIVEC_OR_VSX_P (mode) && can_create_pseudo_p ()" There is only a single splitting pass before RA, so after that finishes, if something gets matched in between that and end of RA (after that can_create_pseudo_p () would be no longer true), it will never be successfully split and we ICE at final.cc time or earlier. The i386 backend (and a few others) already use (cfun->curr_properties & PROP_rtl_split_insns) as a test for split1 pass finished, so that some insns that should be split during split1 and shouldn't be matched afterwards are properly guarded. So, the following patch does that for vspltis_shifted too. 2022-02-08 Jakub Jelinek PR target/102140 * config/rs6000/rs6000.cc (vspltis_shifted): Return false also if split1 pass has finished already. * gcc.dg/pr102140.c: New test. 
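For reference, a C-level sketch of the kind of constant involved; this is illustrative only, not the committed gcc.dg/pr102140.c, and whether it reproduces the ICE depends on optimization flags and pass ordering.

/* Sketch: a V4SI constant shaped like { 0, 0, 0, 0x80000000 } is a
   "shifted vspltis" candidate; once split1 has run there is no later
   pre-RA splitting pass left to break it up, hence the new guard.  */
typedef int v4si __attribute__ ((vector_size (16)));

v4si
foo (void)
{
  return (v4si) { 0, 0, 0, (int) 0x80000000u };
}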
--- gcc/config/rs6000/rs6000.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index e571a0b..eaba9a2 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -6257,8 +6257,11 @@ vspltis_shifted (rtx op) return false; /* We need to create pseudo registers to do the shift, so don't recognize - shift vector constants after reload. */ - if (!can_create_pseudo_p ()) + shift vector constants after reload. Don't match it even before RA + after split1 is done, because there won't be further splitting pass + before RA to do the splitting. */ + if (!can_create_pseudo_p () + || (cfun->curr_properties & PROP_rtl_split_insns)) return false; nunits = GET_MODE_NUNITS (mode); -- cgit v1.1 From 1c827873ed283df282f2df11dfe0ff607e07dab3 Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Wed, 9 Feb 2022 08:48:35 +0100 Subject: target/104453 - guard call folding with NULL LHS This guards shift builtin folding to do nothing when there is no LHS, similar to what other foldings do. 2022-02-09 Richard Biener PR target/104453 * config/i386/i386.cc (ix86_gimple_fold_builtin): Guard shift folding for NULL LHS. * gcc.target/i386/pr104453.c: New testcase. --- gcc/config/i386/i386.cc | 2 ++ 1 file changed, 2 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index dd5584f..448c079 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -18642,6 +18642,8 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi) do_shift: gcc_assert (n_args >= 2); + if (!gimple_call_lhs (stmt)) + break; arg0 = gimple_call_arg (stmt, 0); arg1 = gimple_call_arg (stmt, 1); elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); -- cgit v1.1 From 59b31f0e2d187ebdb3d399661e22b28e4ebd8099 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Wed, 9 Feb 2022 13:14:43 +0800 Subject: ICE: QImode(not SImode) operand should be passed to gen_vec_initv16qiqi in ashlv16qi3. ix86_expand_vector_init expects vals to be a parallel containing values of individual fields which should be either element mode of the vector mode, or a vector mode with the same element mode and smaller number of elements. But in the expander ashlv16qi3, the second operand is SImode which can't be directly passed to gen_vec_initv16qiqi. gcc/ChangeLog: PR target/104451 * config/i386/sse.md (3): lowpart_subreg operands[2] from SImode to QImode. gcc/testsuite/ChangeLog: PR target/104451 * gcc.target/i386/pr104451.c: New test. --- gcc/config/i386/sse.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index d8cb7b6..36b35f6 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -24153,8 +24153,9 @@ negate = true; } par = gen_rtx_PARALLEL (V16QImode, rtvec_alloc (16)); + tmp = lowpart_subreg (QImode, operands[2], SImode); for (i = 0; i < 16; i++) - XVECEXP (par, 0, i) = operands[2]; + XVECEXP (par, 0, i) = tmp; tmp = gen_reg_rtx (V16QImode); emit_insn (gen_vec_initv16qiqi (tmp, par)); -- cgit v1.1 From 5390a2f191682dae3c6d1e1deac20e05be413514 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Sun, 30 Jan 2022 10:08:14 -0800 Subject: x86: Check each component of source operand for AVX_U128_DIRTY commit 9775e465c1fbfc32656de77c618c61acf5bd905d Author: H.J. 
Lu Date: Tue Jul 27 07:46:04 2021 -0700 x86: Don't set AVX_U128_DIRTY when zeroing YMM/ZMM register called ix86_check_avx_upper_register to check mode on source operand. But ix86_check_avx_upper_register doesn't work on source operand like (vec_select:V2DI (reg/v:V4DI 23 xmm3 [orig:91 ymm ] [91]) (parallel [ (const_int 2 [0x2]) (const_int 3 [0x3]) ])) Add ix86_avx_u128_mode_source to check mode for each component of source operand. gcc/ PR target/104441 * config/i386/i386.cc (ix86_avx_u128_mode_source): New function. (ix86_avx_u128_mode_needed): Return AVX_U128_ANY for debug INSN. Call ix86_avx_u128_mode_source to check mode for each component of source operand. gcc/testsuite/ PR target/104441 * gcc.target/i386/pr104441-1a.c: New test. * gcc.target/i386/pr104441-1b.c: Likewise. --- gcc/config/i386/i386.cc | 145 ++++++++++++++++++++++++++---------------------- 1 file changed, 79 insertions(+), 66 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 448c079..db5e168 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -14365,11 +14365,82 @@ ix86_check_avx_upper_stores (rtx dest, const_rtx, void *data) } } +/* For YMM/ZMM store or YMM/ZMM extract. Return mode for the source + operand of SRC DEFs in the same basic block before INSN. */ + +static int +ix86_avx_u128_mode_source (rtx_insn *insn, const_rtx src) +{ + basic_block bb = BLOCK_FOR_INSN (insn); + rtx_insn *end = BB_END (bb); + + /* Return AVX_U128_DIRTY if there is no DEF in the same basic + block. */ + int status = AVX_U128_DIRTY; + + for (df_ref def = DF_REG_DEF_CHAIN (REGNO (src)); + def; def = DF_REF_NEXT_REG (def)) + if (DF_REF_BB (def) == bb) + { + /* Ignore DEF from different basic blocks. */ + rtx_insn *def_insn = DF_REF_INSN (def); + + /* Check if DEF_INSN is before INSN. */ + rtx_insn *next; + for (next = NEXT_INSN (def_insn); + next != nullptr && next != end && next != insn; + next = NEXT_INSN (next)) + ; + + /* Skip if DEF_INSN isn't before INSN. */ + if (next != insn) + continue; + + /* Return AVX_U128_DIRTY if the source operand of DEF_INSN + isn't constant zero. */ + + if (CALL_P (def_insn)) + { + bool avx_upper_reg_found = false; + note_stores (def_insn, + ix86_check_avx_upper_stores, + &avx_upper_reg_found); + + /* Return AVX_U128_DIRTY if call returns AVX. */ + if (avx_upper_reg_found) + return AVX_U128_DIRTY; + + continue; + } + + rtx set = single_set (def_insn); + if (!set) + return AVX_U128_DIRTY; + + rtx dest = SET_DEST (set); + + /* Skip if DEF_INSN is not an AVX load. Return AVX_U128_DIRTY + if the source operand isn't constant zero. */ + if (ix86_check_avx_upper_register (dest) + && standard_sse_constant_p (SET_SRC (set), + GET_MODE (dest)) != 1) + return AVX_U128_DIRTY; + + /* We get here only if all AVX loads are from constant zero. */ + status = AVX_U128_ANY; + } + + return status; +} + /* Return needed mode for entity in optimize_mode_switching pass. */ static int ix86_avx_u128_mode_needed (rtx_insn *insn) { + if (DEBUG_INSN_P (insn)) + return AVX_U128_ANY; + if (CALL_P (insn)) { rtx link; @@ -14409,6 +14480,8 @@ ix86_avx_u128_mode_needed (rtx_insn *insn) return AVX_U128_CLEAN; } + subrtx_iterator::array_type array; + rtx set = single_set (insn); if (set) { @@ -14423,74 +14496,15 @@ ix86_avx_u128_mode_needed (rtx_insn *insn) else return AVX_U128_ANY; } - else if (ix86_check_avx_upper_register (src)) + else { - /* This is an YMM/ZMM store. Check for the source operand - of SRC DEFs in the same basic block before INSN. 
*/ - basic_block bb = BLOCK_FOR_INSN (insn); - rtx_insn *end = BB_END (bb); - - /* Return AVX_U128_DIRTY if there is no DEF in the same basic - block. */ - int status = AVX_U128_DIRTY; - - for (df_ref def = DF_REG_DEF_CHAIN (REGNO (src)); - def; def = DF_REF_NEXT_REG (def)) - if (DF_REF_BB (def) == bb) + FOR_EACH_SUBRTX (iter, array, src, NONCONST) + if (ix86_check_avx_upper_register (*iter)) { - /* Ignore DEF from different basic blocks. */ - rtx_insn *def_insn = DF_REF_INSN (def); - - /* Check if DEF_INSN is before INSN. */ - rtx_insn *next; - for (next = NEXT_INSN (def_insn); - next != nullptr && next != end && next != insn; - next = NEXT_INSN (next)) - ; - - /* Skip if DEF_INSN isn't before INSN. */ - if (next != insn) - continue; - - /* Return AVX_U128_DIRTY if the source operand of - DEF_INSN isn't constant zero. */ - - if (CALL_P (def_insn)) - { - bool avx_upper_reg_found = false; - note_stores (def_insn, ix86_check_avx_upper_stores, - &avx_upper_reg_found); - - /* Return AVX_U128_DIRTY if call returns AVX. */ - if (avx_upper_reg_found) - return AVX_U128_DIRTY; - - continue; - } - - set = single_set (def_insn); - if (!set) - return AVX_U128_DIRTY; - - dest = SET_DEST (set); - - /* Skip if DEF_INSN is not an AVX load. */ - if (ix86_check_avx_upper_register (dest)) - { - src = SET_SRC (set); - /* Return AVX_U128_DIRTY if the source operand isn't - constant zero. */ - if (standard_sse_constant_p (src, GET_MODE (dest)) - != 1) - return AVX_U128_DIRTY; - } - - /* We get here only if all AVX loads are from constant - zero. */ - status = AVX_U128_ANY; + int status = ix86_avx_u128_mode_source (insn, *iter); + if (status == AVX_U128_DIRTY) + return status; } - - return status; } /* This isn't YMM/ZMM load/store. */ @@ -14501,7 +14515,6 @@ ix86_avx_u128_mode_needed (rtx_insn *insn) Hardware changes state only when a 256bit register is written to, but we need to prevent the compiler from moving optimal insertion point above eventual read from 256bit or 512 bit register. */ - subrtx_iterator::array_type array; FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST) if (ix86_check_avx_upper_register (*iter)) return AVX_U128_DIRTY; -- cgit v1.1 From ab0b5fbfe90168d2e470aefb19e0cf31526290bc Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Sat, 19 Jun 2021 05:12:48 -0700 Subject: x86: Add -m[no-]direct-extern-access Add -m[no-]direct-extern-access and nodirect_extern_access attribute. -mdirect-extern-access is the default. With nodirect_extern_access attribute, GOT is always used to access undefined data and function symbols with nodirect_extern_access attribute, including in PIE and non-PIE. With -mno-direct-extern-access: 1. Always use GOT to access undefined data and function symbols, including in PIE and non-PIE. These will avoid copy relocations in executables. This is compatible with existing executables and shared libraries. 2. In executable and shared library, bind symbols with the STV_PROTECTED visibility locally: a. The address of data symbol is the address of data body. b. For systems without function descriptor, the function pointer is the address of function body. c. The resulting shared libraries may not be incompatible with executables which have copy relocations on protected symbols or use executable PLT entries as function addresses for protected functions in shared libraries. 3. Update asm_preferred_eh_data_format to select PC relative EH encoding format with -mno-direct-extern-access to avoid copy relocation. 4. 
Add ix86_reloc_rw_mask for TARGET_ASM_RELOC_RW_MASK to avoid copy relocation with -mno-direct-extern-access. gcc/ PR target/35513 PR target/100593 * config/i386/gnu-property.cc: Include "i386-protos.h". (file_end_indicate_exec_stack_and_gnu_property): Generate a GNU_PROPERTY_1_NEEDED note for -mno-direct-extern-access or nodirect_extern_access attribute. * config/i386/i386-options.cc (handle_nodirect_extern_access_attribute): New function. (ix86_attribute_table): Add nodirect_extern_access attribute. * config/i386/i386-protos.h (ix86_force_load_from_GOT_p): Add a bool argument. (ix86_has_no_direct_extern_access): New. * config/i386/i386.cc (ix86_has_no_direct_extern_access): New. (ix86_force_load_from_GOT_p): Add a bool argument to indicate call operand. Force non-call load from GOT for -mno-direct-extern-access or nodirect_extern_access attribute. (legitimate_pic_address_disp_p): Avoid copy relocation in PIE for -mno-direct-extern-access or nodirect_extern_access attribute. (ix86_print_operand): Pass true to ix86_force_load_from_GOT_p for call operand. (asm_preferred_eh_data_format): Use PC-relative format for -mno-direct-extern-access to avoid copy relocation. Check ptr_mode instead of TARGET_64BIT when selecting DW_EH_PE_sdata4. (ix86_binds_local_p): Set ix86_has_no_direct_extern_access to true for -mno-direct-extern-access or nodirect_extern_access attribute. Don't treat protected data as extern and avoid copy relocation on common symbol with -mno-direct-extern-access or nodirect_extern_access attribute. (ix86_reloc_rw_mask): New to avoid copy relocation for -mno-direct-extern-access. (TARGET_ASM_RELOC_RW_MASK): New. * config/i386/i386.opt: Add -mdirect-extern-access. * doc/extend.texi: Document nodirect_extern_access attribute. * doc/invoke.texi: Document -m[no-]direct-extern-access. gcc/testsuite/ PR target/35513 PR target/100593 * g++.target/i386/pr35513-1.C: New file. * g++.target/i386/pr35513-2.C: Likewise. * gcc.target/i386/pr35513-1a.c: Likewise. * gcc.target/i386/pr35513-1b.c: Likewise. * gcc.target/i386/pr35513-2a.c: Likewise. * gcc.target/i386/pr35513-2b.c: Likewise. * gcc.target/i386/pr35513-3a.c: Likewise. * gcc.target/i386/pr35513-3b.c: Likewise. * gcc.target/i386/pr35513-4a.c: Likewise. * gcc.target/i386/pr35513-4b.c: Likewise. * gcc.target/i386/pr35513-5a.c: Likewise. * gcc.target/i386/pr35513-5b.c: Likewise. * gcc.target/i386/pr35513-6a.c: Likewise. * gcc.target/i386/pr35513-6b.c: Likewise. * gcc.target/i386/pr35513-7a.c: Likewise. * gcc.target/i386/pr35513-7b.c: Likewise. * gcc.target/i386/pr35513-8.c: Likewise. * gcc.target/i386/pr35513-9a.c: Likewise. * gcc.target/i386/pr35513-9b.c: Likewise. * gcc.target/i386/pr35513-10a.c: Likewise. * gcc.target/i386/pr35513-10b.c: Likewise. * gcc.target/i386/pr35513-11a.c: Likewise. * gcc.target/i386/pr35513-11b.c: Likewise. * gcc.target/i386/pr35513-12a.c: Likewise. * gcc.target/i386/pr35513-12b.c: Likewise. --- gcc/config/i386/gnu-property.cc | 10 +++++- gcc/config/i386/i386-options.cc | 32 ++++++++++++++++++++ gcc/config/i386/i386-protos.h | 4 ++- gcc/config/i386/i386.cc | 67 ++++++++++++++++++++++++++++++++--------- gcc/config/i386/i386.opt | 4 +++ 5 files changed, 101 insertions(+), 16 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/gnu-property.cc b/gcc/config/i386/gnu-property.cc index f08984f..ea63c1e 100644 --- a/gcc/config/i386/gnu-property.cc +++ b/gcc/config/i386/gnu-property.cc @@ -23,6 +23,7 @@ along with GCC; see the file COPYING3. 
If not see #include "tm.h" #include "output.h" #include "linux-common.h" +#include "i386-protos.h" static void emit_gnu_property (unsigned int type, unsigned int data) @@ -60,7 +61,9 @@ file_end_indicate_exec_stack_and_gnu_property (void) { file_end_indicate_exec_stack (); - if (flag_cf_protection == CF_NONE && !ix86_needed) + if (flag_cf_protection == CF_NONE + && !ix86_needed + && !ix86_has_no_direct_extern_access) return; unsigned int feature_1 = 0; @@ -121,4 +124,9 @@ file_end_indicate_exec_stack_and_gnu_property (void) /* Generate GNU_PROPERTY_X86_ISA_1_NEEDED. */ if (isa_1) emit_gnu_property (0xc0008002, isa_1); + + if (ix86_has_no_direct_extern_access) + /* Emite a GNU_PROPERTY_1_NEEDED note with + GNU_PROPERTY_1_NEEDED_INDIRECT_EXTERN_ACCESS. */ + emit_gnu_property (0xb0008000, (1U << 0)); } diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc index 082abd2..8055393 100644 --- a/gcc/config/i386/i386-options.cc +++ b/gcc/config/i386/i386-options.cc @@ -3775,6 +3775,36 @@ ix86_handle_fentry_name (tree *node, tree name, tree args, return NULL_TREE; } +/* Handle a "nodirect_extern_access" attribute; arguments as in + struct attribute_spec.handler. */ + +static tree +handle_nodirect_extern_access_attribute (tree *pnode, tree name, + tree ARG_UNUSED (args), + int ARG_UNUSED (flags), + bool *no_add_attrs) +{ + tree node = *pnode; + + if (VAR_OR_FUNCTION_DECL_P (node)) + { + if ((!TREE_STATIC (node) && TREE_CODE (node) != FUNCTION_DECL + && !DECL_EXTERNAL (node)) || !TREE_PUBLIC (node)) + { + warning (OPT_Wattributes, + "%qE attribute have effect only on public objects", name); + *no_add_attrs = true; + } + } + else + { + warning (OPT_Wattributes, "%qE attribute ignored", name); + *no_add_attrs = true; + } + + return NULL_TREE; +} + /* Table of valid machine attributes. */ const struct attribute_spec ix86_attribute_table[] = { @@ -3855,6 +3885,8 @@ const struct attribute_spec ix86_attribute_table[] = ix86_handle_fentry_name, NULL }, { "cf_check", 0, 0, true, false, false, false, ix86_handle_fndecl_attribute, NULL }, + { "nodirect_extern_access", 0, 0, true, false, false, false, + handle_nodirect_extern_access_attribute, NULL }, /* End element. */ { NULL, 0, 0, false, false, false, false, NULL, NULL } diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 6b3c951..b7e9aa7 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -79,7 +79,7 @@ extern bool ix86_expand_cmpstrn_or_cmpmem (rtx, rtx, rtx, rtx, rtx, bool); extern bool constant_address_p (rtx); extern bool legitimate_pic_operand_p (rtx); extern bool legitimate_pic_address_disp_p (rtx); -extern bool ix86_force_load_from_GOT_p (rtx); +extern bool ix86_force_load_from_GOT_p (rtx, bool = false); extern void print_reg (rtx, int, FILE*); extern void ix86_print_operand (FILE *, rtx, int); @@ -401,3 +401,5 @@ extern rtl_opt_pass *make_pass_insert_endbr_and_patchable_area (gcc::context *); extern rtl_opt_pass *make_pass_remove_partial_avx_dependency (gcc::context *); + +extern bool ix86_has_no_direct_extern_access; diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index db5e168..6b97a2b 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -363,6 +363,9 @@ unsigned int ix86_default_incoming_stack_boundary; /* Alignment for incoming stack boundary in bits. */ unsigned int ix86_incoming_stack_boundary; +/* True if there is no direct access to extern symbols. 
*/ +bool ix86_has_no_direct_extern_access; + /* Calling abi specific va_list type nodes. */ tree sysv_va_list_type_node; tree ms_va_list_type_node; @@ -10514,13 +10517,17 @@ darwin_local_data_pic (rtx disp) } /* True if the function symbol operand X should be loaded from GOT. + If CALL_P is true, X is a call operand. + + NB: -mno-direct-extern-access doesn't force load from GOT for + call. NB: In 32-bit mode, only non-PIC is allowed in inline assembly statements, since a PIC register could not be available at the call site. */ bool -ix86_force_load_from_GOT_p (rtx x) +ix86_force_load_from_GOT_p (rtx x, bool call_p) { return ((TARGET_64BIT || (!flag_pic && HAVE_AS_IX86_GOT32X)) && !TARGET_PECOFF && !TARGET_MACHO @@ -10528,11 +10535,16 @@ ix86_force_load_from_GOT_p (rtx x) && ix86_cmodel != CM_LARGE && ix86_cmodel != CM_LARGE_PIC && GET_CODE (x) == SYMBOL_REF - && SYMBOL_REF_FUNCTION_P (x) - && (!flag_plt - || (SYMBOL_REF_DECL (x) - && lookup_attribute ("noplt", - DECL_ATTRIBUTES (SYMBOL_REF_DECL (x))))) + && ((!call_p + && (!ix86_direct_extern_access + || (SYMBOL_REF_DECL (x) + && lookup_attribute ("nodirect_extern_access", + DECL_ATTRIBUTES (SYMBOL_REF_DECL (x)))))) + || (SYMBOL_REF_FUNCTION_P (x) + && (!flag_plt + || (SYMBOL_REF_DECL (x) + && lookup_attribute ("noplt", + DECL_ATTRIBUTES (SYMBOL_REF_DECL (x))))))) && !SYMBOL_REF_LOCAL_P (x)); } @@ -10799,7 +10811,11 @@ legitimate_pic_address_disp_p (rtx disp) } else if (!SYMBOL_REF_FAR_ADDR_P (op0) && (SYMBOL_REF_LOCAL_P (op0) - || (HAVE_LD_PIE_COPYRELOC + || ((ix86_direct_extern_access + && !(SYMBOL_REF_DECL (op0) + && lookup_attribute ("nodirect_extern_access", + DECL_ATTRIBUTES (SYMBOL_REF_DECL (op0))))) + && HAVE_LD_PIE_COPYRELOC && flag_pie && !SYMBOL_REF_WEAK (op0) && !SYMBOL_REF_FUNCTION_P (op0))) @@ -13755,7 +13771,7 @@ ix86_print_operand (FILE *file, rtx x, int code) if (code == 'P') { - if (ix86_force_load_from_GOT_p (x)) + if (ix86_force_load_from_GOT_p (x, true)) { /* For inline assembly statement, load function address from GOT with 'P' operand modifier to avoid PLT. */ @@ -22536,10 +22552,10 @@ int asm_preferred_eh_data_format (int code, int global) { /* PE-COFF is effectively always -fPIC because of the .reloc section. */ - if (flag_pic || TARGET_PECOFF) + if (flag_pic || TARGET_PECOFF || !ix86_direct_extern_access) { int type = DW_EH_PE_sdata8; - if (!TARGET_64BIT + if (ptr_mode == SImode || ix86_cmodel == CM_SMALL_PIC || (ix86_cmodel == CM_MEDIUM_PIC && (global || code))) type = DW_EH_PE_sdata4; @@ -23629,10 +23645,28 @@ ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update) static bool ix86_binds_local_p (const_tree exp) { - return default_binds_local_p_3 (exp, flag_shlib != 0, true, true, - (!flag_pic - || (TARGET_64BIT - && HAVE_LD_PIE_COPYRELOC != 0))); + bool direct_extern_access + = (ix86_direct_extern_access + && !(VAR_OR_FUNCTION_DECL_P (exp) + && lookup_attribute ("nodirect_extern_access", + DECL_ATTRIBUTES (exp)))); + if (!direct_extern_access) + ix86_has_no_direct_extern_access = true; + return default_binds_local_p_3 (exp, flag_shlib != 0, true, + direct_extern_access, + (direct_extern_access + && (!flag_pic + || (TARGET_64BIT + && HAVE_LD_PIE_COPYRELOC != 0)))); +} + +/* If flag_pic or ix86_direct_extern_access is false, then neither + local nor global relocs should be placed in readonly memory. */ + +static int +ix86_reloc_rw_mask (void) +{ + return (flag_pic || !ix86_direct_extern_access) ? 
3 : 0; } #endif @@ -24697,6 +24731,11 @@ ix86_libgcc_floating_mode_supported_p #undef TARGET_IFUNC_REF_LOCAL_OK #define TARGET_IFUNC_REF_LOCAL_OK hook_bool_void_true +#if !TARGET_MACHO && !TARGET_DLLIMPORT_DECL_ATTRIBUTES +# undef TARGET_ASM_RELOC_RW_MASK +# define TARGET_ASM_RELOC_RW_MASK ix86_reloc_rw_mask +#endif + static bool ix86_libc_has_fast_function (int fcode ATTRIBUTE_UNUSED) { #ifdef OPTION_GLIBC diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt index eb829d1..d8e8656 100644 --- a/gcc/config/i386/i386.opt +++ b/gcc/config/i386/i386.opt @@ -1206,3 +1206,7 @@ Support MWAIT and MONITOR built-in functions and code generation. mavx512fp16 Target Mask(ISA2_AVX512FP16) Var(ix86_isa_flags2) Save Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, AVX512F and AVX512FP16 built-in functions and code generation. + +mdirect-extern-access +Target Var(ix86_direct_extern_access) Init(1) +Do not use GOT to access external symbols. -- cgit v1.1 From c48a6819d157fd648e77ef5be0dce887e047c734 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Wed, 9 Feb 2022 16:57:02 +0000 Subject: aarch64: Tighten general_operand predicates This patch fixes some case in which *general_operand was used over *nonimmediate_operand by patterns that don't accept immediates. This avoids some complication with later patches. gcc/ * config/aarch64/aarch64-simd.md (aarch64_simd_vec_set): Use aarch64_simd_nonimmediate_operand instead of aarch64_simd_general_operand. (@aarch64_combinez): Use nonimmediate_operand instead of general_operand. (@aarch64_combinez_be): Likewise. --- gcc/config/aarch64/aarch64-simd.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 6646e06..9529bdb 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -1039,7 +1039,7 @@ [(set (match_operand:VALL_F16 0 "register_operand" "=w,w,w") (vec_merge:VALL_F16 (vec_duplicate:VALL_F16 - (match_operand: 1 "aarch64_simd_general_operand" "w,?r,Utv")) + (match_operand: 1 "aarch64_simd_nonimmediate_operand" "w,?r,Utv")) (match_operand:VALL_F16 3 "register_operand" "0,0,0") (match_operand:SI 2 "immediate_operand" "i,i,i")))] "TARGET_SIMD" @@ -4380,7 +4380,7 @@ (define_insn "@aarch64_combinez" [(set (match_operand: 0 "register_operand" "=w,w,w") (vec_concat: - (match_operand:VDC 1 "general_operand" "w,?r,m") + (match_operand:VDC 1 "nonimmediate_operand" "w,?r,m") (match_operand:VDC 2 "aarch64_simd_or_scalar_imm_zero")))] "TARGET_SIMD && !BYTES_BIG_ENDIAN" "@ @@ -4395,7 +4395,7 @@ [(set (match_operand: 0 "register_operand" "=w,w,w") (vec_concat: (match_operand:VDC 2 "aarch64_simd_or_scalar_imm_zero") - (match_operand:VDC 1 "general_operand" "w,?r,m")))] + (match_operand:VDC 1 "nonimmediate_operand" "w,?r,m")))] "TARGET_SIMD && BYTES_BIG_ENDIAN" "@ mov\\t%0.8b, %1.8b -- cgit v1.1 From fabc5d9bceb0aec8db2147eb50ae375c711eea90 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Wed, 9 Feb 2022 16:57:02 +0000 Subject: aarch64: Generalise vec_set predicate The aarch64_simd_vec_set define_insn takes memory operands, so this patch makes the vec_set optab expander do the same. gcc/ * config/aarch64/aarch64-simd.md (vec_set): Allow the element to be an aarch64_simd_nonimmediate_operand. 
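A minimal intrinsics sketch of what the relaxed predicate permits; this is illustrative only, not part of the patch, and whether a single LD1 lane load is actually emitted still depends on register allocation:

#include <arm_neon.h>

/* The inserted element is a memory operand, so the vec_set expander can
   now pass it through directly instead of forcing it into a register.  */
float32x4_t
set_lane_from_mem (float32x4_t v, const float *p)
{
  return vsetq_lane_f32 (*p, v, 2);
}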
--- gcc/config/aarch64/aarch64-simd.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 9529bdb..872a3d7 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -1378,7 +1378,7 @@ (define_expand "vec_set" [(match_operand:VALL_F16 0 "register_operand") - (match_operand: 1 "register_operand") + (match_operand: 1 "aarch64_simd_nonimmediate_operand") (match_operand:SI 2 "immediate_operand")] "TARGET_SIMD" { -- cgit v1.1 From 958448a9441ee54e012c67cfc3cf88083f3d0e4a Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Wed, 9 Feb 2022 16:57:03 +0000 Subject: aarch64: Generalise adjacency check for load_pair_lanes This patch generalises the load_pair_lanes guard so that it uses aarch64_check_consecutive_mems to check for consecutive mems. It also allows the pattern to be used for STRICT_ALIGNMENT targets if the alignment is high enough. The main aim is to avoid an inline test, for the sake of a later patch that needs to repeat it. Reusing aarch64_check_consecutive_mems seemed simpler than writing an entirely new function. gcc/ * config/aarch64/aarch64-protos.h (aarch64_mergeable_load_pair_p): Declare. * config/aarch64/aarch64-simd.md (load_pair_lanes): Use aarch64_mergeable_load_pair_p instead of inline check. * config/aarch64/aarch64.cc (aarch64_expand_vector_init): Likewise. (aarch64_check_consecutive_mems): Allow the reversed parameter to be null. (aarch64_mergeable_load_pair_p): New function. --- gcc/config/aarch64/aarch64-protos.h | 1 + gcc/config/aarch64/aarch64-simd.md | 7 ++--- gcc/config/aarch64/aarch64.cc | 54 ++++++++++++++++++++++++------------- 3 files changed, 38 insertions(+), 24 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index 2636853..b75ed35 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -1000,6 +1000,7 @@ void aarch64_atomic_assign_expand_fenv (tree *, tree *, tree *); int aarch64_ccmp_mode_to_code (machine_mode mode); bool extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset); +bool aarch64_mergeable_load_pair_p (machine_mode, rtx, rtx); bool aarch64_operands_ok_for_ldpstp (rtx *, bool, machine_mode); bool aarch64_operands_adjust_ok_for_ldpstp (rtx *, bool, machine_mode); void aarch64_swap_ldrstr_operands (rtx *, bool); diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 872a3d7..c5bc2ea 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -4353,11 +4353,8 @@ (vec_concat: (match_operand:VDC 1 "memory_operand" "Utq") (match_operand:VDC 2 "memory_operand" "m")))] - "TARGET_SIMD && !STRICT_ALIGNMENT - && rtx_equal_p (XEXP (operands[2], 0), - plus_constant (Pmode, - XEXP (operands[1], 0), - GET_MODE_SIZE (mode)))" + "TARGET_SIMD + && aarch64_mergeable_load_pair_p (mode, operands[1], operands[2])" "ldr\\t%q0, %1" [(set_attr "type" "neon_load1_1reg_q")] ) diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 296145e..c47543a 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -21063,11 +21063,7 @@ aarch64_expand_vector_init (rtx target, rtx vals) for store_pair_lanes. 
*/ if (memory_operand (x0, inner_mode) && memory_operand (x1, inner_mode) - && !STRICT_ALIGNMENT - && rtx_equal_p (XEXP (x1, 0), - plus_constant (Pmode, - XEXP (x0, 0), - GET_MODE_SIZE (inner_mode)))) + && aarch64_mergeable_load_pair_p (mode, x0, x1)) { rtx t; if (inner_mode == DFmode) @@ -24687,14 +24683,20 @@ aarch64_sched_adjust_priority (rtx_insn *insn, int priority) return priority; } -/* Check if *MEM1 and *MEM2 are consecutive memory references and, +/* If REVERSED is null, return true if memory reference *MEM2 comes + immediately after memory reference *MEM1. Do not change the references + in this case. + + Otherwise, check if *MEM1 and *MEM2 are consecutive memory references and, if they are, try to make them use constant offsets from the same base register. Return true on success. When returning true, set *REVERSED to true if *MEM1 comes after *MEM2, false if *MEM1 comes before *MEM2. */ static bool aarch64_check_consecutive_mems (rtx *mem1, rtx *mem2, bool *reversed) { - *reversed = false; + if (reversed) + *reversed = false; + if (GET_RTX_CLASS (GET_CODE (XEXP (*mem1, 0))) == RTX_AUTOINC || GET_RTX_CLASS (GET_CODE (XEXP (*mem2, 0))) == RTX_AUTOINC) return false; @@ -24723,7 +24725,7 @@ aarch64_check_consecutive_mems (rtx *mem1, rtx *mem2, bool *reversed) if (known_eq (UINTVAL (offset1) + size1, UINTVAL (offset2))) return true; - if (known_eq (UINTVAL (offset2) + size2, UINTVAL (offset1))) + if (known_eq (UINTVAL (offset2) + size2, UINTVAL (offset1)) && reversed) { *reversed = true; return true; @@ -24756,22 +24758,25 @@ aarch64_check_consecutive_mems (rtx *mem1, rtx *mem2, bool *reversed) if (known_eq (expr_offset1 + size1, expr_offset2)) ; - else if (known_eq (expr_offset2 + size2, expr_offset1)) + else if (known_eq (expr_offset2 + size2, expr_offset1) && reversed) *reversed = true; else return false; - if (base2) + if (reversed) { - rtx addr1 = plus_constant (Pmode, XEXP (*mem2, 0), - expr_offset1 - expr_offset2); - *mem1 = replace_equiv_address_nv (*mem1, addr1); - } - else - { - rtx addr2 = plus_constant (Pmode, XEXP (*mem1, 0), - expr_offset2 - expr_offset1); - *mem2 = replace_equiv_address_nv (*mem2, addr2); + if (base2) + { + rtx addr1 = plus_constant (Pmode, XEXP (*mem2, 0), + expr_offset1 - expr_offset2); + *mem1 = replace_equiv_address_nv (*mem1, addr1); + } + else + { + rtx addr2 = plus_constant (Pmode, XEXP (*mem1, 0), + expr_offset2 - expr_offset1); + *mem2 = replace_equiv_address_nv (*mem2, addr2); + } } return true; } @@ -24779,6 +24784,17 @@ aarch64_check_consecutive_mems (rtx *mem1, rtx *mem2, bool *reversed) return false; } +/* Return true if MEM1 and MEM2 can be combined into a single access + of mode MODE, with the combined access having the same address as MEM1. */ + +bool +aarch64_mergeable_load_pair_p (machine_mode mode, rtx mem1, rtx mem2) +{ + if (STRICT_ALIGNMENT && MEM_ALIGN (mem1) < GET_MODE_ALIGNMENT (mode)) + return false; + return aarch64_check_consecutive_mems (&mem1, &mem2, nullptr); +} + /* Given OPERANDS of consecutive load/store, check if we can merge them into ldp/stp. LOAD is true if they are load instructions. MODE is the mode of memory operands. */ -- cgit v1.1 From aeef5c57f161ad0258c5ab066ade2274bef3271a Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Wed, 9 Feb 2022 16:57:04 +0000 Subject: aarch64: Remove redundant vec_concat patterns move_lo_quad_internal_ and move_lo_quad_internal_be_ partially duplicate the later aarch64_combinez{,_be} patterns. The duplication itself is a regression. 
The only substantive differences between the two are: * combinez uses vector MOV (ORR) instead of element MOV (DUP). The former seems more likely to be handled via renaming. * combinez disparages the GPR->FPR alternative whereas move_lo_quad gave it equal cost. The new test gives a token example of when the combinez behaviour helps. gcc/ * config/aarch64/aarch64-simd.md (move_lo_quad_internal_) (move_lo_quad_internal_be_): Delete. (move_lo_quad_): Use aarch64_combine instead of the above. gcc/testsuite/ * gcc.target/aarch64/vec-init-8.c: New test. --- gcc/config/aarch64/aarch64-simd.md | 37 ++----------------------------------- 1 file changed, 2 insertions(+), 35 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index c5bc2ea..d6cd4c7 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -1584,46 +1584,13 @@ ;; On little-endian this is { operand, zeroes } ;; On big-endian this is { zeroes, operand } -(define_insn "move_lo_quad_internal_" - [(set (match_operand:VQMOV 0 "register_operand" "=w,w,w") - (vec_concat:VQMOV - (match_operand: 1 "register_operand" "w,r,r") - (match_operand: 2 "aarch64_simd_or_scalar_imm_zero")))] - "TARGET_SIMD && !BYTES_BIG_ENDIAN" - "@ - dup\\t%d0, %1.d[0] - fmov\\t%d0, %1 - dup\\t%d0, %1" - [(set_attr "type" "neon_dup,f_mcr,neon_dup") - (set_attr "length" "4") - (set_attr "arch" "simd,fp,simd")] -) - -(define_insn "move_lo_quad_internal_be_" - [(set (match_operand:VQMOV 0 "register_operand" "=w,w,w") - (vec_concat:VQMOV - (match_operand: 2 "aarch64_simd_or_scalar_imm_zero") - (match_operand: 1 "register_operand" "w,r,r")))] - "TARGET_SIMD && BYTES_BIG_ENDIAN" - "@ - dup\\t%d0, %1.d[0] - fmov\\t%d0, %1 - dup\\t%d0, %1" - [(set_attr "type" "neon_dup,f_mcr,neon_dup") - (set_attr "length" "4") - (set_attr "arch" "simd,fp,simd")] -) - (define_expand "move_lo_quad_" [(match_operand:VQMOV 0 "register_operand") (match_operand: 1 "register_operand")] "TARGET_SIMD" { - rtx zs = CONST0_RTX (mode); - if (BYTES_BIG_ENDIAN) - emit_insn (gen_move_lo_quad_internal_be_ (operands[0], operands[1], zs)); - else - emit_insn (gen_move_lo_quad_internal_ (operands[0], operands[1], zs)); + emit_insn (gen_aarch64_combine (operands[0], operands[1], + CONST0_RTX (mode))); DONE; } ) -- cgit v1.1 From 85ac2fe44fd4acf8350dd74ccb003a2050baad2a Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Wed, 9 Feb 2022 16:57:05 +0000 Subject: aarch64: Add more vec_combine patterns vec_combine is really one instruction on aarch64, provided that the lowpart element is in the same register as the destination vector. This patch adds patterns for that. The patch fixes a regression from GCC 8. Before the patch: int64x2_t s64q_1(int64_t a0, int64_t a1) { if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) return (int64x2_t) { a1, a0 }; else return (int64x2_t) { a0, a1 }; } generated: fmov d0, x0 ins v0.d[1], x1 ins v0.d[1], x1 ret whereas GCC 8 generated the more respectable: dup v0.2d, x0 ins v0.d[1], x1 ret gcc/ * config/aarch64/predicates.md (aarch64_reg_or_mem_pair_operand): New predicate. * config/aarch64/aarch64-simd.md (*aarch64_combine_internal) (*aarch64_combine_internal_be): New patterns. gcc/testsuite/ * gcc.target/aarch64/vec-init-9.c: New test. * gcc.target/aarch64/vec-init-10.c: Likewise. * gcc.target/aarch64/vec-init-11.c: Likewise. 
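A hedged sketch of the other case the new patterns cover, a combine whose result goes straight to memory; this is illustrative only and the STP selection is not guaranteed:

#include <arm_neon.h>

/* The store of a vec_concat of two GPR values can be matched by the STP
   alternatives of the new *aarch64_combine_internal patterns.  */
void
store_combined (int64x2_t *dst, int64_t a0, int64_t a1)
{
  *dst = vcombine_s64 (vdup_n_s64 (a0), vdup_n_s64 (a1));
}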
--- gcc/config/aarch64/aarch64-simd.md | 62 ++++++++++++++++++++++++++++++++++++++ gcc/config/aarch64/predicates.md | 4 +++ 2 files changed, 66 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index d6cd4c7..ead8039 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -4326,6 +4326,25 @@ [(set_attr "type" "neon_load1_1reg_q")] ) +;; This STP pattern is a partial duplicate of the general vec_concat patterns +;; below. The reason for having both of them is that the alternatives of +;; the later patterns do not have consistent register preferences: the STP +;; alternatives have no preference between GPRs and FPRs (and if anything, +;; the GPR form is more natural for scalar integers) whereas the other +;; alternatives *require* an FPR for operand 1 and prefer one for operand 2. +;; +;; Using "*" to hide the STP alternatives from the RA penalizes cases in +;; which the destination was always memory. On the other hand, expressing +;; the true preferences makes GPRs seem more palatable than they really are +;; for register destinations. +;; +;; Despite that, we do still want the general form to have STP alternatives, +;; in order to handle cases where a register destination is spilled. +;; +;; The best compromise therefore seemed to be to have a dedicated STP +;; pattern to catch cases in which the destination was always memory. +;; This dedicated pattern must come first. + (define_insn "store_pair_lanes" [(set (match_operand: 0 "aarch64_mem_pair_lanes_operand" "=Umn, Umn") (vec_concat: @@ -4338,6 +4357,49 @@ [(set_attr "type" "neon_stp, store_16")] ) +;; Form a vector whose least significant half comes from operand 1 and whose +;; most significant half comes from operand 2. The register alternatives +;; tie the least significant half to the same register as the destination, +;; so that only the other half needs to be handled explicitly. For the +;; reasons given above, the STP alternatives use ? for constraints that +;; the register alternatives either don't accept or themselves disparage. + +(define_insn "*aarch64_combine_internal" + [(set (match_operand: 0 "aarch64_reg_or_mem_pair_operand" "=w, w, w, Umn, Umn") + (vec_concat: + (match_operand:VDC 1 "register_operand" "0, 0, 0, ?w, ?r") + (match_operand:VDC 2 "aarch64_simd_nonimmediate_operand" "w, ?r, Utv, w, ?r")))] + "TARGET_SIMD + && !BYTES_BIG_ENDIAN + && (register_operand (operands[0], mode) + || register_operand (operands[2], mode))" + "@ + ins\t%0.d[1], %2.d[0] + ins\t%0.d[1], %2 + ld1\t{%0.d}[1], %2 + stp\t%d1, %d2, %y0 + stp\t%x1, %x2, %y0" + [(set_attr "type" "neon_ins_q, neon_from_gp_q, neon_load1_one_lane_q, neon_stp, store_16")] +) + +(define_insn "*aarch64_combine_internal_be" + [(set (match_operand: 0 "aarch64_reg_or_mem_pair_operand" "=w, w, w, Umn, Umn") + (vec_concat: + (match_operand:VDC 2 "aarch64_simd_nonimmediate_operand" "w, ?r, Utv, ?w, ?r") + (match_operand:VDC 1 "register_operand" "0, 0, 0, ?w, ?r")))] + "TARGET_SIMD + && BYTES_BIG_ENDIAN + && (register_operand (operands[0], mode) + || register_operand (operands[2], mode))" + "@ + ins\t%0.d[1], %2.d[0] + ins\t%0.d[1], %2 + ld1\t{%0.d}[1], %2 + stp\t%d2, %d1, %y0 + stp\t%x2, %x1, %y0" + [(set_attr "type" "neon_ins_q, neon_from_gp_q, neon_load1_one_lane_q, neon_stp, store_16")] +) + ;; In this insn, operand 1 should be low, and operand 2 the high part of the ;; dest vector. 
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md index 7dc4c15..c308015 100644 --- a/gcc/config/aarch64/predicates.md +++ b/gcc/config/aarch64/predicates.md @@ -254,6 +254,10 @@ false, ADDR_QUERY_LDP_STP_N)"))) +(define_predicate "aarch64_reg_or_mem_pair_operand" + (ior (match_operand 0 "register_operand") + (match_operand 0 "aarch64_mem_pair_lanes_operand"))) + (define_predicate "aarch64_prefetch_operand" (match_test "aarch64_address_valid_for_prefetch_p (op, false)")) -- cgit v1.1 From 4057266ce5afc1fccd5d4e4971103afaa4be63d4 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Wed, 9 Feb 2022 16:57:05 +0000 Subject: aarch64: Add a general vec_concat expander After previous patches, we have a (mostly new) group of vec_concat patterns as well as vestiges of the old move_lo/hi_quad patterns. (A previous patch removed the move_lo_quad insns, but we still have the move_hi_quad insns and both sets of expanders.) This patch is the first of two to remove the old move_lo/hi_quad stuff. It isn't technically a regression fix, but it seemed better to make the changes now rather than leave things in a half-finished and inconsistent state. This patch defines an aarch64_vec_concat expander that coerces the element operands into a valid form, including the ones added by the previous patch. This in turn lets us get rid of one move_lo/hi_quad pair. As a side-effect, it also means that vcombines of 2 vectors make better use of the available forms, like vec_inits of 2 scalars already do. gcc/ * config/aarch64/aarch64-protos.h (aarch64_split_simd_combine): Delete. * config/aarch64/aarch64-simd.md (@aarch64_combinez): Rename to... (*aarch64_combinez): ...this. (@aarch64_combinez_be): Rename to... (*aarch64_combinez_be): ...this. (@aarch64_vec_concat): New expander. (aarch64_combine): Use it. (@aarch64_simd_combine): Delete. * config/aarch64/aarch64.cc (aarch64_split_simd_combine): Delete. (aarch64_expand_vector_init): Use aarch64_vec_concat. gcc/testsuite/ * gcc.target/aarch64/vec-init-12.c: New test. --- gcc/config/aarch64/aarch64-protos.h | 2 - gcc/config/aarch64/aarch64-simd.md | 76 ++++++++++++++++++++++++------------- gcc/config/aarch64/aarch64.cc | 55 ++++----------------------- 3 files changed, 57 insertions(+), 76 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index b75ed35..392efa0 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -925,8 +925,6 @@ bool aarch64_split_128bit_move_p (rtx, rtx); bool aarch64_mov128_immediate (rtx); -void aarch64_split_simd_combine (rtx, rtx, rtx); - void aarch64_split_simd_move (rtx, rtx); /* Check for a legitimate floating point constant for FMOV. */ diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index ead8039..7acde0d 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -4403,7 +4403,7 @@ ;; In this insn, operand 1 should be low, and operand 2 the high part of the ;; dest vector. 
-(define_insn "@aarch64_combinez" +(define_insn "*aarch64_combinez" [(set (match_operand: 0 "register_operand" "=w,w,w") (vec_concat: (match_operand:VDC 1 "nonimmediate_operand" "w,?r,m") @@ -4417,7 +4417,7 @@ (set_attr "arch" "simd,fp,simd")] ) -(define_insn "@aarch64_combinez_be" +(define_insn "*aarch64_combinez_be" [(set (match_operand: 0 "register_operand" "=w,w,w") (vec_concat: (match_operand:VDC 2 "aarch64_simd_or_scalar_imm_zero") @@ -4431,38 +4431,62 @@ (set_attr "arch" "simd,fp,simd")] ) -(define_expand "aarch64_combine" - [(match_operand: 0 "register_operand") - (match_operand:VDC 1 "register_operand") - (match_operand:VDC 2 "aarch64_simd_reg_or_zero")] +;; Form a vector whose first half (in array order) comes from operand 1 +;; and whose second half (in array order) comes from operand 2. +;; This operand order follows the RTL vec_concat operation. +(define_expand "@aarch64_vec_concat" + [(set (match_operand: 0 "register_operand") + (vec_concat: + (match_operand:VDC 1 "general_operand") + (match_operand:VDC 2 "general_operand")))] "TARGET_SIMD" { - if (operands[2] == CONST0_RTX (mode)) + int lo = BYTES_BIG_ENDIAN ? 2 : 1; + int hi = BYTES_BIG_ENDIAN ? 1 : 2; + + if (MEM_P (operands[1]) + && MEM_P (operands[2]) + && aarch64_mergeable_load_pair_p (mode, operands[1], operands[2])) + /* Use load_pair_lanes. */ + ; + else if (operands[hi] == CONST0_RTX (mode)) { - if (BYTES_BIG_ENDIAN) - emit_insn (gen_aarch64_combinez_be (operands[0], operands[1], - operands[2])); - else - emit_insn (gen_aarch64_combinez (operands[0], operands[1], - operands[2])); + /* Use *aarch64_combinez. */ + if (!nonimmediate_operand (operands[lo], mode)) + operands[lo] = force_reg (mode, operands[lo]); } else - aarch64_split_simd_combine (operands[0], operands[1], operands[2]); - DONE; -} -) + { + /* Use *aarch64_combine_general. */ + operands[lo] = force_reg (mode, operands[lo]); + if (!aarch64_simd_nonimmediate_operand (operands[hi], mode)) + { + if (MEM_P (operands[hi])) + { + rtx addr = force_reg (Pmode, XEXP (operands[hi], 0)); + operands[hi] = replace_equiv_address (operands[hi], addr); + } + else + operands[hi] = force_reg (mode, operands[hi]); + } + } +}) -(define_expand "@aarch64_simd_combine" +;; Form a vector whose least significant half comes from operand 1 and whose +;; most significant half comes from operand 2. This operand order follows +;; arm_neon.h vcombine* intrinsics. +(define_expand "aarch64_combine" [(match_operand: 0 "register_operand") - (match_operand:VDC 1 "register_operand") - (match_operand:VDC 2 "register_operand")] + (match_operand:VDC 1 "general_operand") + (match_operand:VDC 2 "general_operand")] "TARGET_SIMD" - { - emit_insn (gen_move_lo_quad_ (operands[0], operands[1])); - emit_insn (gen_move_hi_quad_ (operands[0], operands[2])); - DONE; - } -[(set_attr "type" "multiple")] +{ + if (BYTES_BIG_ENDIAN) + std::swap (operands[1], operands[2]); + emit_insn (gen_aarch64_vec_concat (operands[0], operands[1], + operands[2])); + DONE; +} ) ;; l. diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index c47543a..af42d1b 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -4239,23 +4239,6 @@ aarch64_split_128bit_move_p (rtx dst, rtx src) return true; } -/* Split a complex SIMD combine. 
*/ - -void -aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2) -{ - machine_mode src_mode = GET_MODE (src1); - machine_mode dst_mode = GET_MODE (dst); - - gcc_assert (VECTOR_MODE_P (dst_mode)); - gcc_assert (register_operand (dst, dst_mode) - && register_operand (src1, src_mode) - && register_operand (src2, src_mode)); - - emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2)); - return; -} - /* Split a complex SIMD move. */ void @@ -20941,37 +20924,13 @@ aarch64_expand_vector_init (rtx target, rtx vals) of mode N in VALS and we must put their concatentation into TARGET. */ if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0)))) { - gcc_assert (known_eq (GET_MODE_SIZE (mode), - 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0))))); - rtx lo = XVECEXP (vals, 0, 0); - rtx hi = XVECEXP (vals, 0, 1); - machine_mode narrow_mode = GET_MODE (lo); - gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode); - gcc_assert (narrow_mode == GET_MODE (hi)); - - /* When we want to concatenate a half-width vector with zeroes we can - use the aarch64_combinez[_be] patterns. Just make sure that the - zeroes are in the right half. */ - if (BYTES_BIG_ENDIAN - && aarch64_simd_imm_zero (lo, narrow_mode) - && general_operand (hi, narrow_mode)) - emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo)); - else if (!BYTES_BIG_ENDIAN - && aarch64_simd_imm_zero (hi, narrow_mode) - && general_operand (lo, narrow_mode)) - emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi)); - else - { - /* Else create the two half-width registers and combine them. */ - if (!REG_P (lo)) - lo = force_reg (GET_MODE (lo), lo); - if (!REG_P (hi)) - hi = force_reg (GET_MODE (hi), hi); - - if (BYTES_BIG_ENDIAN) - std::swap (lo, hi); - emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi)); - } + machine_mode narrow_mode = GET_MODE (XVECEXP (vals, 0, 0)); + gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode + && known_eq (GET_MODE_SIZE (mode), + 2 * GET_MODE_SIZE (narrow_mode))); + emit_insn (gen_aarch64_vec_concat (narrow_mode, target, + XVECEXP (vals, 0, 0), + XVECEXP (vals, 0, 1))); return; } -- cgit v1.1 From bce43c0493f65d2589776f0dafa396d5477a84c7 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Wed, 9 Feb 2022 16:57:06 +0000 Subject: aarch64: Remove move_lo/hi_quad expanders This patch is the second of two to remove the old move_lo/hi_quad expanders and move_hi_quad insns. gcc/ * config/aarch64/aarch64-simd.md (@aarch64_split_simd_mov): Use aarch64_combine instead of move_lo/hi_quad. Tabify. (move_lo_quad_, aarch64_simd_move_hi_quad_): Delete. (aarch64_simd_move_hi_quad_be_, move_hi_quad_): Delete. (vec_pack_trunc_): Take general_operand elements and use aarch64_combine rather than move_lo/hi_quad to combine them. (vec_pack_trunc_df): Likewise. 
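For context, vec_pack_trunc is the optab the vectoriser uses for narrowing conversions, so a loop like the one below is one way to exercise the rewritten expander; the sketch assumes something like -O3 on a little-endian aarch64 target and is not taken from the patch:

/* Double to float narrowing; when vectorised this goes through
   vec_pack_trunc_df, which now combines the two halves with the
   general aarch64_vec_concat expander.  */
void
narrow (float *restrict dst, const double *restrict src, int n)
{
  for (int i = 0; i < n; i++)
    dst[i] = (float) src[i];
}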
--- gcc/config/aarch64/aarch64-simd.md | 111 ++++++------------------------------- 1 file changed, 18 insertions(+), 93 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 7acde0d..ef6e772 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -272,7 +272,7 @@ (define_expand "@aarch64_split_simd_mov" [(set (match_operand:VQMOV 0) - (match_operand:VQMOV 1))] + (match_operand:VQMOV 1))] "TARGET_SIMD" { rtx dst = operands[0]; @@ -280,23 +280,22 @@ if (GP_REGNUM_P (REGNO (src))) { - rtx src_low_part = gen_lowpart (mode, src); - rtx src_high_part = gen_highpart (mode, src); + rtx src_low_part = gen_lowpart (mode, src); + rtx src_high_part = gen_highpart (mode, src); + rtx dst_low_part = gen_lowpart (mode, dst); - emit_insn - (gen_move_lo_quad_ (dst, src_low_part)); - emit_insn - (gen_move_hi_quad_ (dst, src_high_part)); + emit_move_insn (dst_low_part, src_low_part); + emit_insn (gen_aarch64_combine (dst, dst_low_part, + src_high_part)); } - else { - rtx dst_low_part = gen_lowpart (mode, dst); - rtx dst_high_part = gen_highpart (mode, dst); + rtx dst_low_part = gen_lowpart (mode, dst); + rtx dst_high_part = gen_highpart (mode, dst); rtx lo = aarch64_simd_vect_par_cnst_half (mode, , false); rtx hi = aarch64_simd_vect_par_cnst_half (mode, , true); - emit_insn (gen_aarch64_get_half (dst_low_part, src, lo)); - emit_insn (gen_aarch64_get_half (dst_high_part, src, hi)); + emit_insn (gen_aarch64_get_half (dst_low_part, src, lo)); + emit_insn (gen_aarch64_get_half (dst_high_part, src, hi)); } DONE; } @@ -1580,69 +1579,6 @@ ;; What that means, is that the RTL descriptions of the below patterns ;; need to change depending on endianness. -;; Move to the low architectural bits of the register. -;; On little-endian this is { operand, zeroes } -;; On big-endian this is { zeroes, operand } - -(define_expand "move_lo_quad_" - [(match_operand:VQMOV 0 "register_operand") - (match_operand: 1 "register_operand")] - "TARGET_SIMD" -{ - emit_insn (gen_aarch64_combine (operands[0], operands[1], - CONST0_RTX (mode))); - DONE; -} -) - -;; Move operand1 to the high architectural bits of the register, keeping -;; the low architectural bits of operand2. 
-;; For little-endian this is { operand2, operand1 } -;; For big-endian this is { operand1, operand2 } - -(define_insn "aarch64_simd_move_hi_quad_" - [(set (match_operand:VQMOV 0 "register_operand" "+w,w") - (vec_concat:VQMOV - (vec_select: - (match_dup 0) - (match_operand:VQMOV 2 "vect_par_cnst_lo_half" "")) - (match_operand: 1 "register_operand" "w,r")))] - "TARGET_SIMD && !BYTES_BIG_ENDIAN" - "@ - ins\\t%0.d[1], %1.d[0] - ins\\t%0.d[1], %1" - [(set_attr "type" "neon_ins")] -) - -(define_insn "aarch64_simd_move_hi_quad_be_" - [(set (match_operand:VQMOV 0 "register_operand" "+w,w") - (vec_concat:VQMOV - (match_operand: 1 "register_operand" "w,r") - (vec_select: - (match_dup 0) - (match_operand:VQMOV 2 "vect_par_cnst_lo_half" ""))))] - "TARGET_SIMD && BYTES_BIG_ENDIAN" - "@ - ins\\t%0.d[1], %1.d[0] - ins\\t%0.d[1], %1" - [(set_attr "type" "neon_ins")] -) - -(define_expand "move_hi_quad_" - [(match_operand:VQMOV 0 "register_operand") - (match_operand: 1 "register_operand")] - "TARGET_SIMD" -{ - rtx p = aarch64_simd_vect_par_cnst_half (mode, , false); - if (BYTES_BIG_ENDIAN) - emit_insn (gen_aarch64_simd_move_hi_quad_be_ (operands[0], - operands[1], p)); - else - emit_insn (gen_aarch64_simd_move_hi_quad_ (operands[0], - operands[1], p)); - DONE; -}) - ;; Narrowing operations. (define_insn "aarch64_xtn_insn_le" @@ -1743,16 +1679,12 @@ (define_expand "vec_pack_trunc_" [(match_operand: 0 "register_operand") - (match_operand:VDN 1 "register_operand") - (match_operand:VDN 2 "register_operand")] + (match_operand:VDN 1 "general_operand") + (match_operand:VDN 2 "general_operand")] "TARGET_SIMD" { rtx tempreg = gen_reg_rtx (mode); - int lo = BYTES_BIG_ENDIAN ? 2 : 1; - int hi = BYTES_BIG_ENDIAN ? 1 : 2; - - emit_insn (gen_move_lo_quad_ (tempreg, operands[lo])); - emit_insn (gen_move_hi_quad_ (tempreg, operands[hi])); + emit_insn (gen_aarch64_vec_concat (tempreg, operands[1], operands[2])); emit_insn (gen_trunc2 (operands[0], tempreg)); DONE; }) @@ -3402,20 +3334,13 @@ (define_expand "vec_pack_trunc_df" [(set (match_operand:V2SF 0 "register_operand") - (vec_concat:V2SF - (float_truncate:SF - (match_operand:DF 1 "register_operand")) - (float_truncate:SF - (match_operand:DF 2 "register_operand")) - ))] + (vec_concat:V2SF + (float_truncate:SF (match_operand:DF 1 "general_operand")) + (float_truncate:SF (match_operand:DF 2 "general_operand"))))] "TARGET_SIMD" { rtx tmp = gen_reg_rtx (V2SFmode); - int lo = BYTES_BIG_ENDIAN ? 2 : 1; - int hi = BYTES_BIG_ENDIAN ? 1 : 2; - - emit_insn (gen_move_lo_quad_v2df (tmp, operands[lo])); - emit_insn (gen_move_hi_quad_v2df (tmp, operands[hi])); + emit_insn (gen_aarch64_vec_concatdf (tmp, operands[1], operands[2])); emit_insn (gen_aarch64_float_truncate_lo_v2sf (operands[0], tmp)); DONE; } -- cgit v1.1 From 83d7e720cd1d075312e798c4ebd2e093f03465fb Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Wed, 9 Feb 2022 16:57:06 +0000 Subject: aarch64: Extend vec_concat patterns to 8-byte vectors This patch extends the previous support for 16-byte vec_concat so that it supports pairs of 4-byte elements. This too isn't strictly a regression fix, since the 8-byte forms weren't affected by the same problems as the 16-byte forms, but it leaves things in a more consistent state. gcc/ * config/aarch64/iterators.md (VDCSIF): New mode iterator. (VDBL): Handle SF. (single_wx, single_type, single_dtype, dblq): New mode attributes. * config/aarch64/aarch64-simd.md (load_pair_lanes): Extend from VDC to VDCSIF. (store_pair_lanes): Likewise. (*aarch64_combine_internal): Likewise. 
(*aarch64_combine_internal_be): Likewise. (*aarch64_combinez): Likewise. (*aarch64_combinez_be): Likewise. * config/aarch64/aarch64.cc (aarch64_classify_address): Handle 8-byte modes for ADDR_QUERY_LDP_STP_N. (aarch64_print_operand): Likewise for %y. gcc/testsuite/ * gcc.target/aarch64/vec-init-13.c: New test. * gcc.target/aarch64/vec-init-14.c: Likewise. * gcc.target/aarch64/vec-init-15.c: Likewise. * gcc.target/aarch64/vec-init-16.c: Likewise. * gcc.target/aarch64/vec-init-17.c: Likewise. --- gcc/config/aarch64/aarch64-simd.md | 72 +++++++++++++++++++------------------- gcc/config/aarch64/aarch64.cc | 16 ++++++--- gcc/config/aarch64/iterators.md | 38 ++++++++++++++++++-- 3 files changed, 84 insertions(+), 42 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index ef6e772..1873342 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -4243,12 +4243,12 @@ (define_insn "load_pair_lanes" [(set (match_operand: 0 "register_operand" "=w") (vec_concat: - (match_operand:VDC 1 "memory_operand" "Utq") - (match_operand:VDC 2 "memory_operand" "m")))] + (match_operand:VDCSIF 1 "memory_operand" "Utq") + (match_operand:VDCSIF 2 "memory_operand" "m")))] "TARGET_SIMD && aarch64_mergeable_load_pair_p (mode, operands[1], operands[2])" - "ldr\\t%q0, %1" - [(set_attr "type" "neon_load1_1reg_q")] + "ldr\\t%0, %1" + [(set_attr "type" "neon_load1_1reg")] ) ;; This STP pattern is a partial duplicate of the general vec_concat patterns @@ -4273,12 +4273,12 @@ (define_insn "store_pair_lanes" [(set (match_operand: 0 "aarch64_mem_pair_lanes_operand" "=Umn, Umn") (vec_concat: - (match_operand:VDC 1 "register_operand" "w, r") - (match_operand:VDC 2 "register_operand" "w, r")))] + (match_operand:VDCSIF 1 "register_operand" "w, r") + (match_operand:VDCSIF 2 "register_operand" "w, r")))] "TARGET_SIMD" "@ - stp\\t%d1, %d2, %y0 - stp\\t%x1, %x2, %y0" + stp\t%1, %2, %y0 + stp\t%1, %2, %y0" [(set_attr "type" "neon_stp, store_16")] ) @@ -4292,37 +4292,37 @@ (define_insn "*aarch64_combine_internal" [(set (match_operand: 0 "aarch64_reg_or_mem_pair_operand" "=w, w, w, Umn, Umn") (vec_concat: - (match_operand:VDC 1 "register_operand" "0, 0, 0, ?w, ?r") - (match_operand:VDC 2 "aarch64_simd_nonimmediate_operand" "w, ?r, Utv, w, ?r")))] + (match_operand:VDCSIF 1 "register_operand" "0, 0, 0, ?w, ?r") + (match_operand:VDCSIF 2 "aarch64_simd_nonimmediate_operand" "w, ?r, Utv, w, ?r")))] "TARGET_SIMD && !BYTES_BIG_ENDIAN && (register_operand (operands[0], mode) || register_operand (operands[2], mode))" "@ - ins\t%0.d[1], %2.d[0] - ins\t%0.d[1], %2 - ld1\t{%0.d}[1], %2 - stp\t%d1, %d2, %y0 - stp\t%x1, %x2, %y0" - [(set_attr "type" "neon_ins_q, neon_from_gp_q, neon_load1_one_lane_q, neon_stp, store_16")] + ins\t%0.[1], %2.[0] + ins\t%0.[1], %2 + ld1\t{%0.}[1], %2 + stp\t%1, %2, %y0 + stp\t%1, %2, %y0" + [(set_attr "type" "neon_ins, neon_from_gp, neon_load1_one_lane, neon_stp, store_16")] ) (define_insn "*aarch64_combine_internal_be" [(set (match_operand: 0 "aarch64_reg_or_mem_pair_operand" "=w, w, w, Umn, Umn") (vec_concat: - (match_operand:VDC 2 "aarch64_simd_nonimmediate_operand" "w, ?r, Utv, ?w, ?r") - (match_operand:VDC 1 "register_operand" "0, 0, 0, ?w, ?r")))] + (match_operand:VDCSIF 2 "aarch64_simd_nonimmediate_operand" "w, ?r, Utv, ?w, ?r") + (match_operand:VDCSIF 1 "register_operand" "0, 0, 0, ?w, ?r")))] "TARGET_SIMD && BYTES_BIG_ENDIAN && (register_operand (operands[0], mode) || register_operand (operands[2], mode))" "@ 
- ins\t%0.d[1], %2.d[0] - ins\t%0.d[1], %2 - ld1\t{%0.d}[1], %2 - stp\t%d2, %d1, %y0 - stp\t%x2, %x1, %y0" - [(set_attr "type" "neon_ins_q, neon_from_gp_q, neon_load1_one_lane_q, neon_stp, store_16")] + ins\t%0.[1], %2.[0] + ins\t%0.[1], %2 + ld1\t{%0.}[1], %2 + stp\t%2, %1, %y0 + stp\t%2, %1, %y0" + [(set_attr "type" "neon_ins, neon_from_gp, neon_load1_one_lane, neon_stp, store_16")] ) ;; In this insn, operand 1 should be low, and operand 2 the high part of the @@ -4331,13 +4331,13 @@ (define_insn "*aarch64_combinez" [(set (match_operand: 0 "register_operand" "=w,w,w") (vec_concat: - (match_operand:VDC 1 "nonimmediate_operand" "w,?r,m") - (match_operand:VDC 2 "aarch64_simd_or_scalar_imm_zero")))] + (match_operand:VDCSIF 1 "nonimmediate_operand" "w,?r,m") + (match_operand:VDCSIF 2 "aarch64_simd_or_scalar_imm_zero")))] "TARGET_SIMD && !BYTES_BIG_ENDIAN" "@ - mov\\t%0.8b, %1.8b - fmov\t%d0, %1 - ldr\\t%d0, %1" + fmov\\t%0, %1 + fmov\t%0, %1 + ldr\\t%0, %1" [(set_attr "type" "neon_move, neon_from_gp, neon_load1_1reg") (set_attr "arch" "simd,fp,simd")] ) @@ -4345,13 +4345,13 @@ (define_insn "*aarch64_combinez_be" [(set (match_operand: 0 "register_operand" "=w,w,w") (vec_concat: - (match_operand:VDC 2 "aarch64_simd_or_scalar_imm_zero") - (match_operand:VDC 1 "nonimmediate_operand" "w,?r,m")))] + (match_operand:VDCSIF 2 "aarch64_simd_or_scalar_imm_zero") + (match_operand:VDCSIF 1 "nonimmediate_operand" "w,?r,m")))] "TARGET_SIMD && BYTES_BIG_ENDIAN" "@ - mov\\t%0.8b, %1.8b - fmov\t%d0, %1 - ldr\\t%d0, %1" + fmov\\t%0, %1 + fmov\t%0, %1 + ldr\\t%0, %1" [(set_attr "type" "neon_move, neon_from_gp, neon_load1_1reg") (set_attr "arch" "simd,fp,simd")] ) @@ -4362,8 +4362,8 @@ (define_expand "@aarch64_vec_concat" [(set (match_operand: 0 "register_operand") (vec_concat: - (match_operand:VDC 1 "general_operand") - (match_operand:VDC 2 "general_operand")))] + (match_operand:VDCSIF 1 "general_operand") + (match_operand:VDCSIF 2 "general_operand")))] "TARGET_SIMD" { int lo = BYTES_BIG_ENDIAN ? 2 : 1; diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index af42d1b..7bb97bd 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -9922,9 +9922,15 @@ aarch64_classify_address (struct aarch64_address_info *info, /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode corresponds to the actual size of the memory being loaded/stored and the mode of the corresponding addressing mode is half of that. */ - if (type == ADDR_QUERY_LDP_STP_N - && known_eq (GET_MODE_SIZE (mode), 16)) - mode = DFmode; + if (type == ADDR_QUERY_LDP_STP_N) + { + if (known_eq (GET_MODE_SIZE (mode), 16)) + mode = DFmode; + else if (known_eq (GET_MODE_SIZE (mode), 8)) + mode = SFmode; + else + return false; + } bool allow_reg_index_p = (!load_store_pair_p && ((vec_flags == 0 @@ -11404,7 +11410,9 @@ aarch64_print_operand (FILE *f, rtx x, int code) machine_mode mode = GET_MODE (x); if (!MEM_P (x) - || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16))) + || (code == 'y' + && maybe_ne (GET_MODE_SIZE (mode), 8) + && maybe_ne (GET_MODE_SIZE (mode), 16))) { output_operand_lossage ("invalid operand for '%%%c'", code); return; diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index a0c02e4..88067a3 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -236,6 +236,9 @@ ;; Double vector modes for combines. (define_mode_iterator VDC [V8QI V4HI V4BF V4HF V2SI V2SF DI DF]) +;; VDC plus SI and SF. 
+(define_mode_iterator VDCSIF [V8QI V4HI V4BF V4HF V2SI V2SF SI SF DI DF]) + ;; Polynomial modes for vector combines. (define_mode_iterator VDC_P [V8QI V4HI DI]) @@ -1436,8 +1439,8 @@ (define_mode_attr VDBL [(V8QI "V16QI") (V4HI "V8HI") (V4HF "V8HF") (V4BF "V8BF") (V2SI "V4SI") (V2SF "V4SF") - (SI "V2SI") (DI "V2DI") - (DF "V2DF")]) + (SI "V2SI") (SF "V2SF") + (DI "V2DI") (DF "V2DF")]) ;; Register suffix for double-length mode. (define_mode_attr Vdtype [(V4HF "8h") (V2SF "4s")]) @@ -1557,6 +1560,30 @@ (V4SI "2s") (V8HF "4h") (V4SF "2s")]) +;; Whether a mode fits in W or X registers (i.e. "w" for 32-bit modes +;; and "x" for 64-bit modes). +(define_mode_attr single_wx [(SI "w") (SF "w") + (V8QI "x") (V4HI "x") + (V4HF "x") (V4BF "x") + (V2SI "x") (V2SF "x") + (DI "x") (DF "x")]) + +;; Whether a mode fits in S or D registers (i.e. "s" for 32-bit modes +;; and "d" for 64-bit modes). +(define_mode_attr single_type [(SI "s") (SF "s") + (V8QI "d") (V4HI "d") + (V4HF "d") (V4BF "d") + (V2SI "d") (V2SF "d") + (DI "d") (DF "d")]) + +;; Whether a double-width mode fits in D or Q registers (i.e. "d" for +;; 32-bit modes and "q" for 64-bit modes). +(define_mode_attr single_dtype [(SI "d") (SF "d") + (V8QI "q") (V4HI "q") + (V4HF "q") (V4BF "q") + (V2SI "q") (V2SF "q") + (DI "q") (DF "q")]) + ;; Define corresponding core/FP element mode for each vector mode. (define_mode_attr vw [(V8QI "w") (V16QI "w") (V4HI "w") (V8HI "w") @@ -1849,6 +1876,13 @@ (V4x1DF "") (V4x2DF "_q") (V4x4BF "") (V4x8BF "_q")]) +;; Equivalent of the "q" attribute for the mode. +(define_mode_attr dblq [(SI "") (SF "") + (V8QI "_q") (V4HI "_q") + (V4HF "_q") (V4BF "_q") + (V2SI "_q") (V2SF "_q") + (DI "_q") (DF "_q")]) + (define_mode_attr vp [(V8QI "v") (V16QI "v") (V4HI "v") (V8HI "v") (V2SI "p") (V4SI "v") -- cgit v1.1 From ed3fea09b18f67e757b5768b42cb6e816626f1db Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Fri, 4 Feb 2022 13:07:17 -0600 Subject: rs6000: Correct function prototypes for vec_replace_unaligned Due to a pasto error in the documentation, vec_replace_unaligned was implemented with the same function prototypes as vec_replace_elt. It was intended that vec_replace_unaligned always specify output vectors as having type vector unsigned char, to emphasize that elements are potentially misaligned by this built-in function. This patch corrects the misimplementation. 2022-02-04 Bill Schmidt gcc/ * config/rs6000/rs6000-builtins.def (VREPLACE_UN_UV2DI): Change function prototype. (VREPLACE_UN_UV4SI): Likewise. (VREPLACE_UN_V2DF): Likewise. (VREPLACE_UN_V2DI): Likewise. (VREPLACE_UN_V4SF): Likewise. (VREPLACE_UN_V4SI): Likewise. * config/rs6000/rs6000-overload.def (VEC_REPLACE_UN): Change all function prototypes. * config/rs6000/vsx.md (vreplace_un_): Remove define_expand. (vreplace_un_): New define_insn. gcc/testsuite/ * gcc.target/powerpc/vec-replace-word-runnable.c: Handle expected prototypes for each call to vec_replace_unaligned. 
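To make the corrected interface concrete, a small usage sketch (assumes -mcpu=power10 and <altivec.h>; the function name is illustrative):

#include <altivec.h>

/* With this fix the result type is always vector unsigned char,
   regardless of the element type of the first argument.  */
vector unsigned char
replace_word (vector unsigned int v, unsigned int x)
{
  return vec_replace_unaligned (v, x, 3);
}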
--- gcc/config/rs6000/rs6000-builtins.def | 16 ++++++++-------- gcc/config/rs6000/rs6000-overload.def | 12 ++++++------ gcc/config/rs6000/vsx.md | 25 ++++++++++--------------- 3 files changed, 24 insertions(+), 29 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-builtins.def b/gcc/config/rs6000/rs6000-builtins.def index 2d1e63fb..ae2760c 100644 --- a/gcc/config/rs6000/rs6000-builtins.def +++ b/gcc/config/rs6000/rs6000-builtins.def @@ -3387,25 +3387,25 @@ const vull __builtin_altivec_vpextd (vull, vull); VPEXTD vpextd {} - const vull __builtin_altivec_vreplace_un_uv2di (vull, unsigned long long, \ - const int<4>); + const vuc __builtin_altivec_vreplace_un_uv2di (vull, unsigned long long, \ + const int<4>); VREPLACE_UN_UV2DI vreplace_un_v2di {} - const vui __builtin_altivec_vreplace_un_uv4si (vui, unsigned int, \ + const vuc __builtin_altivec_vreplace_un_uv4si (vui, unsigned int, \ const int<4>); VREPLACE_UN_UV4SI vreplace_un_v4si {} - const vd __builtin_altivec_vreplace_un_v2df (vd, double, const int<4>); + const vuc __builtin_altivec_vreplace_un_v2df (vd, double, const int<4>); VREPLACE_UN_V2DF vreplace_un_v2df {} - const vsll __builtin_altivec_vreplace_un_v2di (vsll, signed long long, \ - const int<4>); + const vuc __builtin_altivec_vreplace_un_v2di (vsll, signed long long, \ + const int<4>); VREPLACE_UN_V2DI vreplace_un_v2di {} - const vf __builtin_altivec_vreplace_un_v4sf (vf, float, const int<4>); + const vuc __builtin_altivec_vreplace_un_v4sf (vf, float, const int<4>); VREPLACE_UN_V4SF vreplace_un_v4sf {} - const vsi __builtin_altivec_vreplace_un_v4si (vsi, signed int, const int<4>); + const vuc __builtin_altivec_vreplace_un_v4si (vsi, signed int, const int<4>); VREPLACE_UN_V4SI vreplace_un_v4si {} const vull __builtin_altivec_vreplace_uv2di (vull, unsigned long long, \ diff --git a/gcc/config/rs6000/rs6000-overload.def b/gcc/config/rs6000/rs6000-overload.def index 49a6104..44e2945 100644 --- a/gcc/config/rs6000/rs6000-overload.def +++ b/gcc/config/rs6000/rs6000-overload.def @@ -3059,17 +3059,17 @@ VREPLACE_ELT_V2DF [VEC_REPLACE_UN, vec_replace_unaligned, __builtin_vec_replace_un] - vui __builtin_vec_replace_un (vui, unsigned int, const int); + vuc __builtin_vec_replace_un (vui, unsigned int, const int); VREPLACE_UN_UV4SI - vsi __builtin_vec_replace_un (vsi, signed int, const int); + vuc __builtin_vec_replace_un (vsi, signed int, const int); VREPLACE_UN_V4SI - vull __builtin_vec_replace_un (vull, unsigned long long, const int); + vuc __builtin_vec_replace_un (vull, unsigned long long, const int); VREPLACE_UN_UV2DI - vsll __builtin_vec_replace_un (vsll, signed long long, const int); + vuc __builtin_vec_replace_un (vsll, signed long long, const int); VREPLACE_UN_V2DI - vf __builtin_vec_replace_un (vf, float, const int); + vuc __builtin_vec_replace_un (vf, float, const int); VREPLACE_UN_V4SF - vd __builtin_vec_replace_un (vd, double, const int); + vuc __builtin_vec_replace_un (vd, double, const int); VREPLACE_UN_V2DF [VEC_REVB, vec_revb, __builtin_vec_revb] diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md index 2f5a2f7..b53de10 100644 --- a/gcc/config/rs6000/vsx.md +++ b/gcc/config/rs6000/vsx.md @@ -4197,21 +4197,6 @@ } [(set_attr "type" "vecsimple")]) -(define_expand "vreplace_un_" - [(set (match_operand:REPLACE_ELT 0 "register_operand") - (unspec:REPLACE_ELT [(match_operand:REPLACE_ELT 1 "register_operand") - (match_operand: 2 "register_operand") - (match_operand:QI 3 "const_0_to_12_operand")] - UNSPEC_REPLACE_UN))] - "TARGET_POWER10" -{ - /* 
Immediate value is the byte index Big Endian numbering. */ - emit_insn (gen_vreplace_elt__inst (operands[0], operands[1], - operands[2], operands[3])); - DONE; - } -[(set_attr "type" "vecsimple")]) - (define_insn "vreplace_elt__inst" [(set (match_operand:REPLACE_ELT 0 "register_operand" "=v") (unspec:REPLACE_ELT [(match_operand:REPLACE_ELT 1 "register_operand" "0") @@ -4222,6 +4207,16 @@ "vins %0,%2,%3" [(set_attr "type" "vecsimple")]) +(define_insn "vreplace_un_" + [(set (match_operand:V16QI 0 "register_operand" "=v") + (unspec:V16QI [(match_operand:REPLACE_ELT 1 "register_operand" "0") + (match_operand: 2 "register_operand" "r") + (match_operand:QI 3 "const_0_to_12_operand" "n")] + UNSPEC_REPLACE_UN))] + "TARGET_POWER10" + "vins %0,%2,%3" + [(set_attr "type" "vecsimple")]) + ;; VSX_EXTRACT optimizations ;; Optimize double d = (double) vec_extract (vi, ) ;; Get the element into the top position and use XVCVSWDP/XVCVUWDP -- cgit v1.1 From eefec38c992e3622a69de9667e91f0cafbff03cc Mon Sep 17 00:00:00 2001 From: Jeff Law Date: Wed, 9 Feb 2022 14:10:53 -0500 Subject: Avoid using predefined insn name for instruction with different semantics This isn't technically a regression, but it only impacts the v850 target and fixes a long standing code correctness issue. As outlined in slightly more detail in the PR, the v850 is using the pattern name "fnmasf4" and "fnmssf4" to generate fnmaf.s and fnmsf.s instructions respectively. Unfortunately fnmasf4 is expected to produce (-a * b) + c and fnmssf4 (-a * b) - c. Those v850 instructions actually negate the entire result. The fix is trivial. Use a different pattern name so that the combiner can still generate those instructions, but prevent those instructions from being used to implement GCC's notion of what fnmas and fnmss should be. This fixes pr97040 as well as a handful of testsuite failures for the v3e5 multilib. gcc/ PR target/97040 * config/v850/v850.md (*v850_fnmasf4): Renamed from fnmasf4. (*v850_fnmssf4): Renamed from fnmssf4 --- gcc/config/v850/v850.md | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/v850/v850.md b/gcc/config/v850/v850.md index ed51157..6ca31e3 100644 --- a/gcc/config/v850/v850.md +++ b/gcc/config/v850/v850.md @@ -2601,7 +2601,12 @@ (set_attr "type" "fpu")]) ;;; negative-multiply-add -(define_insn "fnmasf4" +;; Note the name on this and the following insn were previously fnmasf4 +;; and fnmssf4. Those names are known to the gimple->rtl expanders and +;; must implement specific semantics (negating one of the inputs to the +;; multiplication). The v850 instructions actually negate the entire +;; result. Thus the names have been changed and hidden. 
+(define_insn "*v850_fnmasf4" [(set (match_operand:SF 0 "register_operand" "=r") (neg:SF (fma:SF (match_operand:SF 1 "register_operand" "r") (match_operand:SF 2 "register_operand" "r") @@ -2612,7 +2617,7 @@ (set_attr "type" "fpu")]) ;; negative-multiply-subtract -(define_insn "fnmssf4" +(define_insn "*v850_fnmssf4" [(set (match_operand:SF 0 "register_operand" "=r") (neg:SF (fma:SF (match_operand:SF 1 "register_operand" "r") (match_operand:SF 2 "register_operand" "r") -- cgit v1.1 From 2b399dbabd48639ab4daac462c9d82c6cf3f99cc Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 9 Feb 2022 20:18:10 +0100 Subject: i386: Force inputs to a register to avoid lowpart_subreg failure [PR104458] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Input operands can be in the form of: (subreg:DI (reg:V2SF 96) 0) which chokes lowpart_subreg. Force inputs to a register, which is preferable even when the input operand is from memory. 2022-02-09 Uroš Bizjak gcc/ChangeLog: PR target/104458 * config/i386/i386-expand.cc (ix86_split_idivmod): Force operands[2] and operands[3] into a register.. gcc/testsuite/ChangeLog: PR target/104458 * gcc.target/i386/pr104458.c: New test. --- gcc/config/i386/i386-expand.cc | 3 +++ 1 file changed, 3 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index eb1930b..ce9607e 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -1407,6 +1407,9 @@ ix86_split_idivmod (machine_mode mode, rtx operands[], rtx scratch, tmp0, tmp1, tmp2; rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx); + operands[2] = force_reg (mode, operands[2]); + operands[3] = force_reg (mode, operands[3]); + switch (mode) { case E_SImode: -- cgit v1.1 From 41582f88ec01c5ce2f85ebc4ac2743eb426d6e33 Mon Sep 17 00:00:00 2001 From: Andrew Pinski Date: Wed, 9 Feb 2022 14:56:58 -0800 Subject: [COMMITTED] Fix PR aarch64/104474: ICE with vector float initializers and non-consts. The problem here is that the aarch64 back-end was placing const0_rtx into the constant vector RTL even if the mode was a floating point mode. The fix is instead to use CONST0_RTX and pass the mode to select the correct zero (either const_int or const_double). Committed as obvious after a bootstrap/test on aarch64-linux-gnu with no regressions. PR target/104474 gcc/ChangeLog: * config/aarch64/aarch64.cc (aarch64_sve_expand_vector_init_handle_trailing_constants): Use CONST0_RTX instead of const0_rtx for the non-constant elements. gcc/testsuite/ChangeLog: * gcc.target/aarch64/sve/pr104474-1.c: New test. * gcc.target/aarch64/sve/pr104474-2.c: New test. * gcc.target/aarch64/sve/pr104474-3.c: New test. 
--- gcc/config/aarch64/aarch64.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 7bb97bd..e3f18fb 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -21164,7 +21164,7 @@ aarch64_sve_expand_vector_init_handle_trailing_constants { rtx x = builder.elt (i + nelts_reqd - n_trailing_constants); if (!valid_for_const_vector_p (elem_mode, x)) - x = const0_rtx; + x = CONST0_RTX (elem_mode); v.quick_push (x); } rtx const_vec = v.build (); -- cgit v1.1 From 91a7e1daa7520489fafc0001d03c68bad4304f15 Mon Sep 17 00:00:00 2001 From: Roger Sayle Date: Thu, 3 Feb 2022 09:07:22 +0100 Subject: nvptx: Improved support for HFMode including neghf2 and abshf2 This patch adds more support for _Float16 (HFmode) to the nvptx backend. Currently negation, absolute value and floating point comparisons are implemented by promoting to float (SFmode). This patch adds suitable define_insns to nvptx.md, most conditional on TARGET_SM53 (-misa=sm_53). This patch also adds support for HFmode fused multiply-add. One subtlety is that neghf2 and abshf2 are implemented by (HImode) bit manipulation operations to update the sign bit. The NVidia PTX ISA documentation for neg.f16 and abs.f16 contains the caution "Future implementations may comply with the IEEE 754 standard by preserving the (NaN) payload and modifying only the sign bit". Given the availability of suitable replacements, I thought it best to provide IEEE 754 compliant implementations. If anyone observes a performance penalty from this choice I'm happy to provide a -ffast-math variant (or revisit this decision). This patch has been tested on nvptx-none hosted on x86_64-pc-linux-gnu (including newlib) with a make and make -k check with no new failures. gcc/ChangeLog: * config/nvptx/nvptx.md (*cmpf): New define_insn. (cstorehf4): New define_expand. (fmahf4): New define_insn. (neghf2): New define_insn. (abshf2): New define_insn. gcc/testsuite/ChangeLog: * gcc.target/nvptx/float16-3.c: New test case for neghf2. * gcc.target/nvptx/float16-4.c: New test case for abshf2. * gcc.target/nvptx/float16-5.c: New test case for fmahf4. * gcc.target/nvptx/float16-6.c: New test case. 
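As an illustration of the kind of source that now reaches these patterns when compiled with -misa=sm_53 (a hypothetical sketch, not one of the committed float16-*.c tests):

    /* The multiply-add may be contracted into fmahf4; depending on
       optimization the second statement folds either to abshf2 or to an
       HFmode compare (the new setp pattern) plus neghf2, both of which now
       stay in 16-bit registers instead of being promoted to SFmode.  */
    _Float16
    fma_then_flip (_Float16 a, _Float16 b, _Float16 c)
    {
      _Float16 r = a * b + c;
      return r < (_Float16) 0.0f ? -r : r;
    }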
--- gcc/config/nvptx/nvptx.md | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index 7463603..e26d24e 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -783,6 +783,14 @@ "" "%.\\tsetp%c1\\t%0, %2, %3;") +(define_insn "*cmphf" + [(set (match_operand:BI 0 "nvptx_register_operand" "=R") + (match_operator:BI 1 "nvptx_float_comparison_operator" + [(match_operand:HF 2 "nvptx_register_operand" "R") + (match_operand:HF 3 "nvptx_nonmemory_operand" "RF")]))] + "TARGET_SM53" + "%.\\tsetp%c1\\t%0, %2, %3;") + (define_insn "jump" [(set (pc) (label_ref (match_operand 0 "" "")))] @@ -973,6 +981,21 @@ DONE; }) +(define_expand "cstorehf4" + [(set (match_operand:SI 0 "nvptx_register_operand") + (match_operator:SI 1 "nvptx_float_comparison_operator" + [(match_operand:HF 2 "nvptx_register_operand") + (match_operand:HF 3 "nvptx_nonmemory_operand")]))] + "TARGET_SM53" +{ + rtx reg = gen_reg_rtx (BImode); + rtx cmp = gen_rtx_fmt_ee (GET_CODE (operands[1]), BImode, + operands[2], operands[3]); + emit_move_insn (reg, cmp); + emit_insn (gen_setccsi_from_bi (operands[0], reg)); + DONE; +}) + ;; Calls (define_insn "call_insn_" @@ -1160,6 +1183,26 @@ "TARGET_SM53" "%.\\tmul.f16\\t%0, %1, %2;") +(define_insn "fmahf4" + [(set (match_operand:HF 0 "nvptx_register_operand" "=R") + (fma:HF (match_operand:HF 1 "nvptx_register_operand" "R") + (match_operand:HF 2 "nvptx_nonmemory_operand" "RF") + (match_operand:HF 3 "nvptx_nonmemory_operand" "RF")))] + "TARGET_SM53" + "%.\\tfma%#.f16\\t%0, %1, %2, %3;") + +(define_insn "neghf2" + [(set (match_operand:HF 0 "nvptx_register_operand" "=R") + (neg:HF (match_operand:HF 1 "nvptx_register_operand" "R")))] + "" + "%.\\txor.b16\\t%0, %1, -32768;") + +(define_insn "abshf2" + [(set (match_operand:HF 0 "nvptx_register_operand" "=R") + (abs:HF (match_operand:HF 1 "nvptx_register_operand" "R")))] + "" + "%.\\tand.b16\\t%0, %1, 32767;") + (define_insn "exp2hf2" [(set (match_operand:HF 0 "nvptx_register_operand" "=R") (unspec:HF [(match_operand:HF 1 "nvptx_register_operand" "R")] -- cgit v1.1 From de12b919c74307c5c2a4c79a29683d21e622422e Mon Sep 17 00:00:00 2001 From: Roger Sayle Date: Thu, 3 Feb 2022 09:21:58 +0100 Subject: nvptx: Expand QI mode operations using SI mode instructions One of the unusual target features of the Nvidia PTX ISA is that it doesn't provide QI mode (byte sized) operations or registers. Somewhat conventionally, 8-bit quantities are read from/written to memory using special instructions, but stored internally using SImode (32-bit) registers. GCC's middle-end accomodates targets without QImode optabs, by widening operations until suitable support is found, and with the current nvptx backend this means 16-bit HImode operations. The inconvenience is that nvptx is also a TARGET_TRULY_NOOP_TRUNCATION=false target, meaning that additional instructions are required to convert between the SImode registers used to hold QImode values, and the HImode registers used to operate on them (and back again). This results in a large amount of shuffling and type conversion in code dealing with bytes, i.e. using char or Boolean types. This patch improves the situation by providing expanders in the nvptx machine description to perform QImode operations natively in SImode instead of HImode. 
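A hypothetical byte-level example of the kind affected (not taken from the patch's testsuite) is:

    /* Saturating byte add: previously each QImode operation was widened to
       HImode and the result converted back to the SImode register that
       actually holds the byte; handling QImode directly in SImode removes
       that shuffling.  */
    unsigned char
    sat_add_u8 (unsigned char a, unsigned char b)
    {
      unsigned char s = a + b;
      return s < a ? 0xff : s;
    }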
An alternate implementation might be to provide some form of target hook to specify which fallback modes to use during RTL expansion, but I think this requirement is unusual, and a solution entirely in the nvptx backend doesn't disturb/affect other targets. The improvements can be quite dramatic, as shown in the example below: int foo(int x, int y) { return (x==21) && (y==69); } previously with -O2 required 15 instructions: mov.u32 %r26, %ar0; mov.u32 %r27, %ar1; setp.eq.u32 %r31, %r26, 21; selp.u32 %r30, 1, 0, %r31; mov.u32 %r29, %r30; setp.eq.u32 %r34, %r27, 69; selp.u32 %r33, 1, 0, %r34; mov.u32 %r32, %r33; cvt.u16.u8 %r39, %r29; mov.u16 %r36, %r39; cvt.u16.u8 %r39, %r32; mov.u16 %r37, %r39; and.b16 %r35, %r36, %r37; cvt.u32.u16 %r38, %r35; cvt.u32.u8 %value, %r38; with this patch, now requires only 7 instructions: mov.u32 %r26, %ar0; mov.u32 %r27, %ar1; setp.eq.u32 %r31, %r26, 21; setp.eq.u32 %r34, %r27, 69; selp.u32 %r37, 1, 0, %r31; selp.u32 %r38, 1, 0, %r34; and.b32 %value, %r37, %r38; This patch has been tested on nvptx-none hosted on x86_64-pc-linux-gnu (including newlib) with a make and make -k check with no new failures. gcc/ChangeLog: * config/nvptx/nvptx.md (cmp): Renamed from *cmp. (setcc_from_bi): Additionally support QImode. (extendbi2): Additionally support QImode. (zero_extendbi2): Additionally support QImode. (any_sbinary, any_ubinary, any_sunary, any_uunary): New code iterators for signed and unsigned, binary and unary operations. (qi3, qi3, qi2, qi2): New expanders to perform QImode operations using SImode instructions. (cstoreqi4): New define_expand. (*ext_truncsi2_qi): New define_insn. (*zext_truncsi2_qi): New define_insn. gcc/testsuite/ChangeLog: * gcc.target/nvptx/bool-1.c: New test case. --- gcc/config/nvptx/nvptx.md | 114 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 107 insertions(+), 7 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index e26d24e..f53809ea 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -767,7 +767,7 @@ ;; Comparisons and branches -(define_insn "*cmp" +(define_insn "cmp" [(set (match_operand:BI 0 "nvptx_register_operand" "=R") (match_operator:BI 1 "nvptx_comparison_operator" [(match_operand:HSDIM 2 "nvptx_register_operand" "R") @@ -879,22 +879,22 @@ ;; Conditional stores (define_insn "setcc_from_bi" - [(set (match_operand:HSDIM 0 "nvptx_register_operand" "=R") - (ne:HSDIM (match_operand:BI 1 "nvptx_register_operand" "R") + [(set (match_operand:QHSDIM 0 "nvptx_register_operand" "=R") + (ne:QHSDIM (match_operand:BI 1 "nvptx_register_operand" "R") (const_int 0)))] "" "%.\\tselp%t0\\t%0, 1, 0, %1;") (define_insn "extendbi2" - [(set (match_operand:HSDIM 0 "nvptx_register_operand" "=R") - (sign_extend:HSDIM + [(set (match_operand:QHSDIM 0 "nvptx_register_operand" "=R") + (sign_extend:QHSDIM (match_operand:BI 1 "nvptx_register_operand" "R")))] "" "%.\\tselp%t0\\t%0, -1, 0, %1;") (define_insn "zero_extendbi2" - [(set (match_operand:HSDIM 0 "nvptx_register_operand" "=R") - (zero_extend:HSDIM + [(set (match_operand:QHSDIM 0 "nvptx_register_operand" "=R") + (zero_extend:QHSDIM (match_operand:BI 1 "nvptx_register_operand" "R")))] "" "%.\\tselp%t0\\t%0, 1, 0, %1;") @@ -2117,3 +2117,103 @@ return nvptx_output_red_partition (operands[0], operands[1]); } [(set_attr "predicable" "false")]) + +;; Expand QI mode operations using SI mode instructions. 
+(define_code_iterator any_sbinary [plus minus smin smax]) +(define_code_attr sbinary [(plus "add") (minus "sub") (smin "smin") (smax "smax")]) + +(define_code_iterator any_ubinary [and ior xor umin umax]) +(define_code_attr ubinary [(and "and") (ior "ior") (xor "xor") (umin "umin") + (umax "umax")]) + +(define_code_iterator any_sunary [neg abs]) +(define_code_attr sunary [(neg "neg") (abs "abs")]) + +(define_code_iterator any_uunary [not]) +(define_code_attr uunary [(not "one_cmpl")]) + +(define_expand "qi3" + [(set (match_operand:QI 0 "nvptx_register_operand") + (any_sbinary:QI (match_operand:QI 1 "nvptx_nonmemory_operand") + (match_operand:QI 2 "nvptx_nonmemory_operand")))] + "" +{ + rtx reg = gen_reg_rtx (SImode); + rtx op0 = convert_modes (SImode, QImode, operands[1], 0); + rtx op1 = convert_modes (SImode, QImode, operands[2], 0); + if ( == MINUS) + op0 = force_reg (SImode, op0); + emit_insn (gen_si3 (reg, op0, op1)); + emit_insn (gen_truncsiqi2 (operands[0], reg)); + DONE; +}) + +(define_expand "qi3" + [(set (match_operand:QI 0 "nvptx_register_operand") + (any_ubinary:QI (match_operand:QI 1 "nvptx_nonmemory_operand") + (match_operand:QI 2 "nvptx_nonmemory_operand")))] + "" +{ + rtx reg = gen_reg_rtx (SImode); + rtx op0 = convert_modes (SImode, QImode, operands[1], 1); + rtx op1 = convert_modes (SImode, QImode, operands[2], 1); + emit_insn (gen_si3 (reg, op0, op1)); + emit_insn (gen_truncsiqi2 (operands[0], reg)); + DONE; +}) + +(define_expand "qi2" + [(set (match_operand:QI 0 "nvptx_register_operand") + (any_sunary:QI (match_operand:QI 1 "nvptx_nonmemory_operand")))] + "" +{ + rtx reg = gen_reg_rtx (SImode); + rtx op0 = convert_modes (SImode, QImode, operands[1], 0); + emit_insn (gen_si2 (reg, op0)); + emit_insn (gen_truncsiqi2 (operands[0], reg)); + DONE; +}) + +(define_expand "qi2" + [(set (match_operand:QI 0 "nvptx_register_operand") + (any_uunary:QI (match_operand:QI 1 "nvptx_nonmemory_operand")))] + "" +{ + rtx reg = gen_reg_rtx (SImode); + rtx op0 = convert_modes (SImode, QImode, operands[1], 1); + emit_insn (gen_si2 (reg, op0)); + emit_insn (gen_truncsiqi2 (operands[0], reg)); + DONE; +}) + +(define_expand "cstoreqi4" + [(set (match_operand:SI 0 "nvptx_register_operand") + (match_operator:SI 1 "nvptx_comparison_operator" + [(match_operand:QI 2 "nvptx_nonmemory_operand") + (match_operand:QI 3 "nvptx_nonmemory_operand")]))] + "" +{ + rtx reg = gen_reg_rtx (BImode); + enum rtx_code code = GET_CODE (operands[1]); + int unsignedp = unsigned_condition_p (code); + rtx op2 = convert_modes (SImode, QImode, operands[2], unsignedp); + rtx op3 = convert_modes (SImode, QImode, operands[3], unsignedp); + rtx cmp = gen_rtx_fmt_ee (code, SImode, op2, op3); + emit_insn (gen_cmpsi (reg, cmp, op2, op3)); + emit_insn (gen_setccsi_from_bi (operands[0], reg)); + DONE; +}) + +(define_insn "*ext_truncsi2_qi" + [(set (match_operand:SI 0 "nvptx_register_operand" "=R") + (sign_extend:SI + (truncate:QI (match_operand:SI 1 "nvptx_register_operand" "R"))))] + "" + "%.\\tcvt.s32.s8\\t%0, %1;") + +(define_insn "*zext_truncsi2_qi" + [(set (match_operand:SI 0 "nvptx_register_operand" "=R") + (zero_extend:SI + (truncate:QI (match_operand:SI 1 "nvptx_register_operand" "R"))))] + "" + "%.\\tcvt.u32.u8\\t%0, %1;") -- cgit v1.1 From 26d7b8f9bdf9ffb414beaa1133672f2d04c954eb Mon Sep 17 00:00:00 2001 From: Roger Sayle Date: Thu, 3 Feb 2022 14:41:01 +0100 Subject: nvptx: Add support for 64-bit mul.hi (and other) instructions Now that the middle-end MULT_HIGHPART_EXPR pieces are in place, this patch adds support for 
nvptx's mul.hi.s64 and mul.hi.u64 instructions, as previously reviewed (provisionally pre-approved) back in August 2020: https://gcc.gnu.org/pipermail/gcc-patches/2020-August/551373.html Since then a few things have changed, so this patch uses the new SMUL_HIGHPART and UMUL_HIGHPART RTX expressions, but the test cases remain the same. Like the x86_64 backend, this patch retains the "trunc" forms of these instructions (while the RTL optimizers/combine may still generate them). Given that we're rapidly approaching stage 4, I also took the liberty of including support in nvptx.md for a few other instructions. With the new 64-bit highpart multiplication instructions added above, we can now provide a define_expand for efficient 64-bit (to 128-bit) widening multiplications. This patch also adds support for nvptx's testp.infinite instruction (for implementing __builtin_isinf) and the not.pred instruction. As an example of the code generation improvements, the function int foo(double x) { return __builtin_isinf(x); } previously generated with -O2: mov.f64 %r26, %ar0; abs.f64 %r28, %r26; setp.leu.f64 %r31, %r28, 0d7fefffffffffffff; selp.u32 %r30, 1, 0, %r31; mov.u32 %r29, %r30; cvt.u16.u8 %r35, %r29; mov.u16 %r33, %r35; xor.b16 %r32, %r33, 1; cvt.u32.u16 %r34, %r32; cvt.u32.u8 %value, %r34; and with this patch now generates: mov.f64 %r23, %ar0; testp.infinite.f64 %r24, %r23; selp.u32 %value, 1, 0, %r24; This patch has been tested on nvptx-none hosted on x86_64-pc-linux-gnu (including newlib) with a make and make -k check with no new failures. gcc/ChangeLog: * config/nvptx/nvptx.md (UNSPEC_ISINF): New UNSPEC. (one_cmplbi2): New define_insn for not.pred. (mulditi3): New define_expand for signed widening multiply. (umulditi3): New define_expand for unsigned widening multiply. (smul3_highpart): New define_insn for signed highpart mult. (umul3_highpart): New define_insn for unsigned highpart mult. (*smulhi3_highpart_2): Renamed from smulhi3_highpart. (*smulsi3_highpart_2): Renamed from smulsi3_highpart. (*umulhi3_highpart_2): Renamed from umulhi3_highpart. (*umulsi3_highpart_2): Renamed from umulsi3_highpart. (*setcc_from_not_bi): New define_insn. (*setcc_isinf): New define_insn for testp.infinite. (isinf2): New define_expand. gcc/testsuite/ChangeLog: * gcc.target/nvptx/mul-hi64.c: New test case. * gcc.target/nvptx/umul-hi64.c: New test case. * gcc.target/nvptx/mul-wide64.c: New test case. * gcc.target/nvptx/umul-wide64.c: New test case. * gcc.target/nvptx/isinf.c: New test case. 
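For example, the high half of a 64x64-bit multiply (a hypothetical test, not one of the new testcases; it assumes __int128 support, which the 64-bit nvptx target provides) can now be emitted as a single mul.hi.u64:

    unsigned long long
    umul64_high (unsigned long long a, unsigned long long b)
    {
      return (unsigned long long) (((unsigned __int128) a * b) >> 64);
    }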
--- gcc/config/nvptx/nvptx.md | 91 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 87 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index f53809ea..d19a687 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -27,6 +27,7 @@ UNSPEC_SIN UNSPEC_COS UNSPEC_TANH + UNSPEC_ISINF UNSPEC_FPINT_FLOOR UNSPEC_FPINT_BTRUNC @@ -596,6 +597,12 @@ "" "%.\\tnot.b%T0\\t%0, %1;") +(define_insn "one_cmplbi2" + [(set (match_operand:BI 0 "nvptx_register_operand" "=R") + (not:BI (match_operand:BI 1 "nvptx_register_operand" "R")))] + "" + "%.\\tnot.pred\\t%0, %1;") + (define_insn "*cnot2" [(set (match_operand:HSDIM 0 "nvptx_register_operand" "=R") (eq:HSDIM (match_operand:HSDIM 1 "nvptx_register_operand" "R") @@ -671,7 +678,57 @@ "" "%.\\tmul.wide.u32\\t%0, %1, %2;") -(define_insn "smulhi3_highpart" +(define_expand "mulditi3" + [(set (match_operand:TI 0 "nvptx_register_operand") + (mult:TI (sign_extend:TI + (match_operand:DI 1 "nvptx_register_operand")) + (sign_extend:DI + (match_operand:DI 2 "nvptx_nonmemory_operand"))))] + "" +{ + rtx hi = gen_reg_rtx (DImode); + rtx lo = gen_reg_rtx (DImode); + emit_insn (gen_smuldi3_highpart (hi, operands[1], operands[2])); + emit_insn (gen_muldi3 (lo, operands[1], operands[2])); + emit_move_insn (gen_highpart (DImode, operands[0]), hi); + emit_move_insn (gen_lowpart (DImode, operands[0]), lo); + DONE; +}) + +(define_expand "umulditi3" + [(set (match_operand:TI 0 "nvptx_register_operand") + (mult:TI (zero_extend:TI + (match_operand:DI 1 "nvptx_register_operand")) + (zero_extend:DI + (match_operand:DI 2 "nvptx_nonmemory_operand"))))] + "" +{ + rtx hi = gen_reg_rtx (DImode); + rtx lo = gen_reg_rtx (DImode); + emit_insn (gen_umuldi3_highpart (hi, operands[1], operands[2])); + emit_insn (gen_muldi3 (lo, operands[1], operands[2])); + emit_move_insn (gen_highpart (DImode, operands[0]), hi); + emit_move_insn (gen_lowpart (DImode, operands[0]), lo); + DONE; +}) + +(define_insn "smul3_highpart" + [(set (match_operand:HSDIM 0 "nvptx_register_operand" "=R") + (smul_highpart:HSDIM + (match_operand:HSDIM 1 "nvptx_register_operand" "R") + (match_operand:HSDIM 2 "nvptx_nonmemory_operand" "Ri")))] + "" + "%.\\tmul.hi.s%T0\\t%0, %1, %2;") + +(define_insn "umul3_highpart" + [(set (match_operand:HSDIM 0 "nvptx_register_operand" "=R") + (umul_highpart:HSDIM + (match_operand:HSDIM 1 "nvptx_register_operand" "R") + (match_operand:HSDIM 2 "nvptx_nonmemory_operand" "Ri")))] + "" + "%.\\tmul.hi.u%T0\\t%0, %1, %2;") + +(define_insn "*smulhi3_highpart_2" [(set (match_operand:HI 0 "nvptx_register_operand" "=R") (truncate:HI (lshiftrt:SI @@ -683,7 +740,7 @@ "" "%.\\tmul.hi.s16\\t%0, %1, %2;") -(define_insn "smulsi3_highpart" +(define_insn "*smulsi3_highpart_2" [(set (match_operand:SI 0 "nvptx_register_operand" "=R") (truncate:SI (lshiftrt:DI @@ -695,7 +752,7 @@ "" "%.\\tmul.hi.s32\\t%0, %1, %2;") -(define_insn "umulhi3_highpart" +(define_insn "*umulhi3_highpart_2" [(set (match_operand:HI 0 "nvptx_register_operand" "=R") (truncate:HI (lshiftrt:SI @@ -707,7 +764,7 @@ "" "%.\\tmul.hi.u16\\t%0, %1, %2;") -(define_insn "umulsi3_highpart" +(define_insn "*umulsi3_highpart_2" [(set (match_operand:SI 0 "nvptx_register_operand" "=R") (truncate:SI (lshiftrt:DI @@ -885,6 +942,13 @@ "" "%.\\tselp%t0\\t%0, 1, 0, %1;") +(define_insn "*setcc_from_not_bi" + [(set (match_operand:HSDIM 0 "nvptx_register_operand" "=R") + (eq:HSDIM (match_operand:BI 1 "nvptx_register_operand" "R") + (const_int 0)))] + "" + 
"%.\\tselp%t0\\t%0, 0, 1, %1;") + (define_insn "extendbi2" [(set (match_operand:QHSDIM 0 "nvptx_register_operand" "=R") (sign_extend:QHSDIM @@ -1160,6 +1224,25 @@ "flag_unsafe_math_optimizations" "%.\\tex2.approx%t0\\t%0, %1;") +(define_insn "setcc_isinf" + [(set (match_operand:BI 0 "nvptx_register_operand" "=R") + (unspec:BI [(match_operand:SDFM 1 "nvptx_register_operand" "R")] + UNSPEC_ISINF))] + "" + "%.\\ttestp.infinite%t1\\t%0, %1;") + +(define_expand "isinf2" + [(set (match_operand:SI 0 "nvptx_register_operand" "=R") + (unspec:SI [(match_operand:SDFM 1 "nvptx_register_operand" "R")] + UNSPEC_ISINF))] + "" +{ + rtx pred = gen_reg_rtx (BImode); + emit_insn (gen_setcc_isinf (pred, operands[1])); + emit_insn (gen_setccsi_from_bi (operands[0], pred)); + DONE; +}) + ;; HFmode floating point arithmetic. (define_insn "addhf3" -- cgit v1.1 From f68c3de7fc9065d8c9ac75b3736ea27abffdce45 Mon Sep 17 00:00:00 2001 From: Roger Sayle Date: Thu, 3 Feb 2022 14:46:40 +0100 Subject: nvptx: Fix and use BI mode logic instructions (e.g. and.pred) This patch adds support for nvptx's BImode and.pred, or.pred and xor.pred instructions. Technically, nvptx.md previously defined andbi3, iorbi3 and xorbi3 instructions, but the assembly language mnemonic output for these was incorrect (e.g. and.b1) and would be rejected by the ptxas assembler. The most significant part of this patch is the new define_split which teaches the compiler to actually use these instructions when appropriate (exposing the latent bug above). After https://gcc.gnu.org/pipermail/gcc-patches/2022-January/587999.html, the function: int foo(int x, int y) { return (x==21) && (y==69); } when compiled with -O2 produces: mov.u32 %r26, %ar0; mov.u32 %r27, %ar1; setp.eq.u32 %r31, %r26, 21; setp.eq.u32 %r34, %r27, 69; selp.u32 %r37, 1, 0, %r31; selp.u32 %r38, 1, 0, %r34; and.b32 %value, %r37, %r38; with this patch we now save an extra instruction and generate: mov.u32 %r26, %ar0; mov.u32 %r27, %ar1; setp.eq.u32 %r31, %r26, 21; setp.eq.u32 %r34, %r27, 69; and.pred %r39, %r34, %r31; selp.u32 %value, 1, 0, %r39; This patch has been tested (on top of the patch mentioned above) on nvptx-none hosted on x86_64-pc-linux-gnu (including newlib) with a make and make -k check with no new failures. gcc/ChangeLog: * config/nvptx/nvptx.md (any_logic): Move code iterator earlier in machine description. (logic): Move code attribute earlier in machine description. (ilogic): New code attribute, like logic but "ior" for IOR. (and3, ior3, xor3): Delete. Replace with... (3): New define_insn for HSDIM logic operations. (bi3): New define_insn for BI mode logic operations. (define_split): Lower logic operations from integer modes to BI mode predicate operations. gcc/testsuite/ChangeLog: * gcc.target/nvptx/bool-1.c: Update. * gcc.target/nvptx/bool-2.c: New test case for and.pred. * gcc.target/nvptx/bool-3.c: New test case for or.pred. * gcc.target/nvptx/bool-4.c: New test case for xor.pred. 
--- gcc/config/nvptx/nvptx.md | 47 ++++++++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 19 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index d19a687..107df74 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -801,26 +801,38 @@ ;; Logical operations -(define_insn "and3" - [(set (match_operand:BHSDIM 0 "nvptx_register_operand" "=R") - (and:BHSDIM (match_operand:BHSDIM 1 "nvptx_register_operand" "R") - (match_operand:BHSDIM 2 "nvptx_nonmemory_operand" "Ri")))] - "" - "%.\\tand.b%T0\\t%0, %1, %2;") +(define_code_iterator any_logic [and ior xor]) +(define_code_attr logic [(and "and") (ior "or") (xor "xor")]) +(define_code_attr ilogic [(and "and") (ior "ior") (xor "xor")]) -(define_insn "ior3" - [(set (match_operand:BHSDIM 0 "nvptx_register_operand" "=R") - (ior:BHSDIM (match_operand:BHSDIM 1 "nvptx_register_operand" "R") - (match_operand:BHSDIM 2 "nvptx_nonmemory_operand" "Ri")))] +(define_insn "3" + [(set (match_operand:HSDIM 0 "nvptx_register_operand" "=R") + (any_logic:HSDIM + (match_operand:HSDIM 1 "nvptx_register_operand" "R") + (match_operand:HSDIM 2 "nvptx_nonmemory_operand" "Ri")))] "" - "%.\\tor.b%T0\\t%0, %1, %2;") + "%.\\t.b%T0\\t%0, %1, %2;") -(define_insn "xor3" - [(set (match_operand:BHSDIM 0 "nvptx_register_operand" "=R") - (xor:BHSDIM (match_operand:BHSDIM 1 "nvptx_register_operand" "R") - (match_operand:BHSDIM 2 "nvptx_nonmemory_operand" "Ri")))] +(define_insn "bi3" + [(set (match_operand:BI 0 "nvptx_register_operand" "=R") + (any_logic:BI (match_operand:BI 1 "nvptx_register_operand" "R") + (match_operand:BI 2 "nvptx_register_operand" "R")))] "" - "%.\\txor.b%T0\\t%0, %1, %2;") + "%.\\t.pred\\t%0, %1, %2;") + +(define_split + [(set (match_operand:HSDIM 0 "nvptx_register_operand") + (any_logic:HSDIM + (ne:HSDIM (match_operand:BI 1 "nvptx_register_operand") + (const_int 0)) + (ne:HSDIM (match_operand:BI 2 "nvptx_register_operand") + (const_int 0))))] + "can_create_pseudo_p ()" + [(set (match_dup 3) (any_logic:BI (match_dup 1) (match_dup 2))) + (set (match_dup 0) (ne:HSDIM (match_dup 3) (const_int 0)))] +{ + operands[3] = gen_reg_rtx (BImode); +}) ;; Comparisons and branches @@ -2042,9 +2054,6 @@ } [(set_attr "atomic" "true")]) -(define_code_iterator any_logic [and ior xor]) -(define_code_attr logic [(and "and") (ior "or") (xor "xor")]) - (define_insn "atomic_fetch_" [(set (match_operand:SDIM 1 "memory_operand" "+m") (unspec_volatile:SDIM -- cgit v1.1 From 9bacd7af2e3bba9ddad17e7de4e2d299419d819d Mon Sep 17 00:00:00 2001 From: Roger Sayle Date: Fri, 4 Feb 2022 04:13:53 +0100 Subject: PR target/104345: Use nvptx "set" instruction for cond ? -1 : 0 This patch addresses the "increased register pressure" regression on nvptx-none caused by my change to transition the backend to a STORE_FLAG_VALUE = 1 target. This improved code generation for the more common case of producing 0/1 Boolean values, but unfortunately made things marginally worse when a 0/-1 mask value is desired. Unfortunately, nvptx kernels are extremely sensitive to changes in register usage, which was observable in the reported PR. This patch provides optimizations for -(cond ? 1 : 0), effectively simplify this into cond ? -1 : 0, where these ternary operators are provided by nvptx's selp instruction, and for the specific case of SImode, using (restoring) nvptx's "set" instruction (which avoids the need for a predicate register). 
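A minimal illustration of the 0/-1 mask case being targeted (hypothetical, not the committed neg-selp.c test):

    /* -(cond ? 1 : 0) is now emitted as a single "set" instruction for
       SImode, or as a selp with -1/0 operands, instead of a selp producing
       0/1 followed by a separate negation.  */
    int
    mask_eq (int x, int y)
    {
      return -(x == y);
    }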
This patch has been tested on nvptx-none hosted on x86_64-pc-linux-gnu with a "make" and "make -k check" with no new failures. Unfortunately, the exact register usage of a nvptx kernel depends upon the version of the Cuda drivers being used (and the hardware), but I believe this change should resolve the PR (for Thomas) by improving code generation for the cases that regressed. gcc/ChangeLog: PR target/104345 * config/nvptx/nvptx.md (sel_true): Fix indentation. (sel_false): Likewise. (define_code_iterator eqne): New code iterator for EQ and NE. (*selp_neg_): New define_insn_and_split to optimize the negation of a selp instruction. (*selp_not_): New define_insn_and_split to optimize the bitwise not of a selp instruction. (*setcc_int): Use set instruction for neg:SI of a selp. gcc/testsuite/ChangeLog: PR target/104345 * gcc.target/nvptx/neg-selp.c: New test case. --- gcc/config/nvptx/nvptx.md | 58 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 54 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index 107df74..ad642e7 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -977,7 +977,7 @@ (define_insn "sel_true" [(set (match_operand:HSDIM 0 "nvptx_register_operand" "=R") - (if_then_else:HSDIM + (if_then_else:HSDIM (ne (match_operand:BI 1 "nvptx_register_operand" "R") (const_int 0)) (match_operand:HSDIM 2 "nvptx_nonmemory_operand" "Ri") (match_operand:HSDIM 3 "nvptx_nonmemory_operand" "Ri")))] @@ -986,7 +986,7 @@ (define_insn "sel_true" [(set (match_operand:SDFM 0 "nvptx_register_operand" "=R") - (if_then_else:SDFM + (if_then_else:SDFM (ne (match_operand:BI 1 "nvptx_register_operand" "R") (const_int 0)) (match_operand:SDFM 2 "nvptx_nonmemory_operand" "RF") (match_operand:SDFM 3 "nvptx_nonmemory_operand" "RF")))] @@ -995,7 +995,7 @@ (define_insn "sel_false" [(set (match_operand:HSDIM 0 "nvptx_register_operand" "=R") - (if_then_else:HSDIM + (if_then_else:HSDIM (eq (match_operand:BI 1 "nvptx_register_operand" "R") (const_int 0)) (match_operand:HSDIM 2 "nvptx_nonmemory_operand" "Ri") (match_operand:HSDIM 3 "nvptx_nonmemory_operand" "Ri")))] @@ -1004,13 +1004,63 @@ (define_insn "sel_false" [(set (match_operand:SDFM 0 "nvptx_register_operand" "=R") - (if_then_else:SDFM + (if_then_else:SDFM (eq (match_operand:BI 1 "nvptx_register_operand" "R") (const_int 0)) (match_operand:SDFM 2 "nvptx_nonmemory_operand" "RF") (match_operand:SDFM 3 "nvptx_nonmemory_operand" "RF")))] "" "%.\\tselp%t0\\t%0, %3, %2, %1;") +(define_code_iterator eqne [eq ne]) + +;; Split negation of a predicate into a conditional move. +(define_insn_and_split "*selp_neg_" + [(set (match_operand:HSDIM 0 "nvptx_register_operand" "=R") + (neg:HSDIM (eqne:HSDIM + (match_operand:BI 1 "nvptx_register_operand" "R") + (const_int 0))))] + "" + "#" + "&& 1" + [(set (match_dup 0) + (if_then_else:HSDIM + (eqne (match_dup 1) (const_int 0)) + (const_int -1) + (const_int 0)))]) + +;; Split bitwise not of a predicate into a conditional move. 
+(define_insn_and_split "*selp_not_" + [(set (match_operand:HSDIM 0 "nvptx_register_operand" "=R") + (not:HSDIM (eqne:HSDIM + (match_operand:BI 1 "nvptx_register_operand" "R") + (const_int 0))))] + "" + "#" + "&& 1" + [(set (match_dup 0) + (if_then_else:HSDIM + (eqne (match_dup 1) (const_int 0)) + (const_int -2) + (const_int -1)))]) + +(define_insn "*setcc_int" + [(set (match_operand:SI 0 "nvptx_register_operand" "=R") + (neg:SI + (match_operator:SI 1 "nvptx_comparison_operator" + [(match_operand:HSDIM 2 "nvptx_register_operand" "R") + (match_operand:HSDIM 3 "nvptx_nonmemory_operand" "Ri")])))] + "" + "%.\\tset%t0%c1\\t%0, %2, %3;") + +(define_insn "*setcc_int" + [(set (match_operand:SI 0 "nvptx_register_operand" "=R") + (neg:SI + (match_operator:SI 1 "nvptx_float_comparison_operator" + [(match_operand:SDFM 2 "nvptx_register_operand" "R") + (match_operand:SDFM 3 "nvptx_nonmemory_operand" "RF")])))] + "" + "%.\\tset%t0%c1\\t%0, %2, %3;") + (define_insn "setcc_float" [(set (match_operand:SF 0 "nvptx_register_operand" "=R") (match_operator:SF 1 "nvptx_comparison_operator" -- cgit v1.1 From 6d98e83b2c919bd9fba2c61333d613bafc37357f Mon Sep 17 00:00:00 2001 From: Roger Sayle Date: Tue, 8 Feb 2022 20:56:55 +0100 Subject: nvptx: Tweak constraints on copysign instructions Many thanks to Thomas Schwinge for confirming my hypothesis that the register usage regression, PR target/104345, is solely due to libgcc's _muldc3 function. In addition to the isinf functionality in the previously proposed nvptx patch at https://gcc.gnu.org/pipermail/gcc-patches/2022-January/588453.html which significantly reduces the number of instructions in _muldc3, the patch below further reduces both the number of instructions and the number of explicitly declared registers, by permitting floating point constant immediate operands in nvptx's copysign instruction. Fingers-crossed, the combination with all of the previous proposed nvptx patches improves things. Ultimately, increasing register usage from 50 to 51 registers, reducing the number of concurrent threads by ~2%, can easily be countered if we're now executing significantly fewer instructions in each kernel, for a net performance win. This patch has been tested on nvptx-none hosted on x86_64-pc-linux-gnu with a "make" and "make -k check" with no new failures. gcc/ChangeLog: * config/nvptx/nvptx.md (copysign3): Allow immediate floating point constants as operands 1 and/or 2. --- gcc/config/nvptx/nvptx.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index ad642e7..bb0c0b3 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -1209,8 +1209,8 @@ (define_insn "copysign3" [(set (match_operand:SDFM 0 "nvptx_register_operand" "=R") - (unspec:SDFM [(match_operand:SDFM 1 "nvptx_register_operand" "R") - (match_operand:SDFM 2 "nvptx_register_operand" "R")] + (unspec:SDFM [(match_operand:SDFM 1 "nvptx_nonmemory_operand" "RF") + (match_operand:SDFM 2 "nvptx_nonmemory_operand" "RF")] UNSPEC_COPYSIGN))] "" "%.\\tcopysign%t0\\t%0, %2, %1;") -- cgit v1.1 From 5b2d679bbbcc2b976c6e228ba63afdf67c33164e Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Mon, 7 Feb 2022 14:12:34 +0100 Subject: [nvptx] Workaround sub.u16 driver JIT bug There's a nvidia driver JIT bug that mishandles this code (minimized from builtin-arith-overflow-15.c): ... 
int main (void) { signed char r; unsigned char y = (unsigned char) 0x80; if (__builtin_sub_overflow ((unsigned char)0, (unsigned char)y, &r)) __builtin_abort (); return 0; } ... which at ptx level minimizes to: ... mov.u16 r22, 0x0080; st.local.u16 [frame_var],r22; ld.local.u16 r32,[frame_var]; sub.u16 r33,0x0000,r32; cvt.u32.u16 r35,r33; ... where we expect r35 == 0x0000ff80 but get instead 0xffffff80, and where using nvptx-none-run -O0 fixes the problem. [ See also https://github.com/vries/nvidia-bugs/tree/master/builtin-arith-overflow-15 . ] Try to workaround the bug by using sub.s16 instead of sub.u16. Tested on nvptx. gcc/ChangeLog: 2022-02-07 Tom de Vries PR target/97005 * config/nvptx/nvptx.md (define_insn "sub3"): Workaround driver JIT bug by using sub.s16 instead of sub.u16. --- gcc/config/nvptx/nvptx.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index bb0c0b3..cced68e 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -506,7 +506,14 @@ (minus:HSDIM (match_operand:HSDIM 1 "nvptx_register_operand" "R") (match_operand:HSDIM 2 "nvptx_register_operand" "R")))] "" - "%.\\tsub%t0\\t%0, %1, %2;") + { + if (GET_MODE (operands[0]) == HImode) + /* Workaround https://developer.nvidia.com/nvidia_bug/3527713. + See PR97005. */ + return "%.\\tsub.s16\\t%0, %1, %2;"; + + return "%.\\tsub%t0\\t%0, %1, %2;"; + }) (define_insn "mul3" [(set (match_operand:HSDIM 0 "nvptx_register_operand" "=R") -- cgit v1.1 From 3e7d4e82dc9fecb051e9ac422c312b26206d5ecd Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Thu, 13 Jan 2022 13:13:44 +0100 Subject: [nvptx] Handle pre-sm_7x shared atomic store using atomic exchange The ptx isa specifies (for pre-sm_7x) that atomic operations on shared memory locations do not guarantee atomicity with respect to normal store instructions to the same address. This can be fixed by: - inserting barriers between normal stores and atomic operations to a common address - using atom.exch to store to locations accessed by other atomic operations. It's not clearly spelled out which barriers are needed, and a barrier seem more expensive than atomic exchange. Implement the pre-sm_7x shared atomic store using atomic exchange. That includes stores using generic addressing, since those may also point to shared memory. Tested on x86-64 with nvptx accelerator. gcc/ChangeLog: 2022-02-02 Tom de Vries * config/nvptx/nvptx-protos.h (nvptx_mem_maybe_shared_p): Declare. * config/nvptx/nvptx.cc (nvptx_mem_data_area): New static function. (nvptx_mem_maybe_shared_p): New function. * config/nvptx/nvptx.md (define_expand "atomic_store"): New define_expand. gcc/testsuite/ChangeLog: 2022-02-02 Tom de Vries * gcc.target/nvptx/atomic-store-1.c: New test. * gcc.target/nvptx/atomic-store-3.c: New test. * gcc.target/nvptx/stack-atomics-run.c: Update. 
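A hypothetical example of an affected store (not one of the new atomic-store-*.c tests):

    /* On pre-sm_7x devices, if the pointer may refer to shared memory this
       is now emitted as atom.exch (with the result discarded) rather than a
       plain st, so it is atomic with respect to other atomic operations on
       the same location.  */
    void
    publish_flag (int *p)
    {
      __atomic_store_n (p, 1, __ATOMIC_RELAXED);
    }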
--- gcc/config/nvptx/nvptx-protos.h | 1 + gcc/config/nvptx/nvptx.cc | 22 ++++++++++++++++++++++ gcc/config/nvptx/nvptx.md | 30 ++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx-protos.h b/gcc/config/nvptx/nvptx-protos.h index a846e34..0bf9af4 100644 --- a/gcc/config/nvptx/nvptx-protos.h +++ b/gcc/config/nvptx/nvptx-protos.h @@ -60,5 +60,6 @@ extern const char *nvptx_output_simt_exit (rtx); extern const char *nvptx_output_red_partition (rtx, rtx); extern const char *nvptx_output_atomic_insn (const char *, rtx *, int, int); extern bool nvptx_mem_local_p (rtx); +extern bool nvptx_mem_maybe_shared_p (const_rtx); #endif #endif diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index 1b0227a..5b26c0f 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -76,6 +76,7 @@ #include "intl.h" #include "opts.h" #include "tree-pretty-print.h" +#include "rtl-iter.h" /* This file should be included last. */ #include "target-def.h" @@ -2787,6 +2788,27 @@ nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr) nvptx_print_address_operand (file, addr, mode); } +static nvptx_data_area +nvptx_mem_data_area (const_rtx x) +{ + gcc_assert (GET_CODE (x) == MEM); + + const_rtx addr = XEXP (x, 0); + subrtx_iterator::array_type array; + FOR_EACH_SUBRTX (iter, array, addr, ALL) + if (SYMBOL_REF_P (*iter)) + return SYMBOL_DATA_AREA (*iter); + + return DATA_AREA_GENERIC; +} + +bool +nvptx_mem_maybe_shared_p (const_rtx x) +{ + nvptx_data_area area = nvptx_mem_data_area (x); + return area == DATA_AREA_SHARED || area == DATA_AREA_GENERIC; +} + /* Print an operand, X, to FILE, with an optional modifier in CODE. Meaning of CODE: diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index cced68e..1a283b4 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -2051,6 +2051,36 @@ } [(set_attr "atomic" "true")]) +(define_expand "atomic_store" + [(match_operand:SDIM 0 "memory_operand" "=m") ;; memory + (match_operand:SDIM 1 "nvptx_nonmemory_operand" "Ri") ;; input + (match_operand:SI 2 "const_int_operand")] ;; model + "" +{ + struct address_info info; + decompose_mem_address (&info, operands[0]); + if (info.base != NULL && REG_P (*info.base) + && REGNO_PTR_FRAME_P (REGNO (*info.base))) + { + emit_insn (gen_mov (operands[0], operands[1])); + DONE; + } + + if (TARGET_SM70) + /* Fall back to expand_atomic_store. */ + FAIL; + + bool maybe_shared_p = nvptx_mem_maybe_shared_p (operands[0]); + if (!maybe_shared_p) + /* Fall back to expand_atomic_store. */ + FAIL; + + rtx tmpreg = gen_reg_rtx (mode); + emit_insn (gen_atomic_exchange (tmpreg, operands[0], operands[1], + operands[2])); + DONE; +}) + (define_insn "atomic_fetch_add" [(set (match_operand:SDIM 1 "memory_operand" "+m") (unspec_volatile:SDIM -- cgit v1.1 From 19a13d5a1d695465b3c3905b7c8ec888add1a39e Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Wed, 2 Feb 2022 16:23:37 +0100 Subject: [nvptx] Handle sm_7x shared atomic store more optimal For sm_7x atomic stores we fall back on expand_atomic_store, but this results in using membar.sys for shared stores. Fix this by adding an nvptx_atomic_store insn that adds a membar.cta for a shared store. Tested on x86_64 with nvptx accelerator. gcc/ChangeLog: 2022-02-02 Tom de Vries * config/nvptx/nvptx.md (define_insn "nvptx_atomic_store"): New define_insn. (define_expand "atomic_store"): Use nvptx_atomic_store for TARGET_SM70. (define_c_enum "unspecv"): Add UNSPECV_ST. 
gcc/testsuite/ChangeLog: 2022-02-02 Tom de Vries * gcc.target/nvptx/atomic-store-2.c: New test. --- gcc/config/nvptx/nvptx.md | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index 1a283b4..4c378ec 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -57,6 +57,7 @@ UNSPECV_CAS UNSPECV_CAS_LOCAL UNSPECV_XCHG + UNSPECV_ST UNSPECV_BARSYNC UNSPECV_WARPSYNC UNSPECV_UNIFORM_WARP_CHECK @@ -2067,8 +2068,11 @@ } if (TARGET_SM70) - /* Fall back to expand_atomic_store. */ - FAIL; + { + emit_insn (gen_nvptx_atomic_store (operands[0], operands[1], + operands[2])); + DONE; + } bool maybe_shared_p = nvptx_mem_maybe_shared_p (operands[0]); if (!maybe_shared_p) @@ -2081,6 +2085,20 @@ DONE; }) +(define_insn "nvptx_atomic_store" + [(set (match_operand:SDIM 0 "memory_operand" "+m") ;; memory + (unspec_volatile:SDIM + [(match_operand:SDIM 1 "nvptx_nonmemory_operand" "Ri") ;; input + (match_operand:SI 2 "const_int_operand")] ;; model + UNSPECV_ST))] + "TARGET_SM70" + { + const char *t + = "%.\tst%A0.b%T0\t%0, %1;"; + return nvptx_output_atomic_insn (t, operands, 0, 2); + } + [(set_attr "atomic" "true")]) + (define_insn "atomic_fetch_add" [(set (match_operand:SDIM 1 "memory_operand" "+m") (unspec_volatile:SDIM -- cgit v1.1 From 53fcc46339239c4958e2a15bb9e59274133bbcf7 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 10 Feb 2022 17:23:17 +0100 Subject: i386: Fix vec_unpacks_float_lo_v4si operand constraint [PR104469] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 2022-02-10 Uroš Bizjak gcc/ChangeLog: PR target/104469 * config/i386/sse.md (vec_unpacks_float_lo_v4si): Change operand 1 constraint to register_operand. gcc/testsuite/ChangeLog: PR target/104469 * gcc.target/i386/pr104469.c: New test. --- gcc/config/i386/sse.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 36b35f6..b2f5634 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -9223,7 +9223,7 @@ (define_expand "vec_unpacks_float_hi_v8si" [(set (match_dup 2) (vec_select:V4SI - (match_operand:V8SI 1 "vector_operand") + (match_operand:V8SI 1 "register_operand") (parallel [(const_int 4) (const_int 5) (const_int 6) (const_int 7)]))) (set (match_operand:V4DF 0 "register_operand") -- cgit v1.1 From fd64b09217fbe8fa33b559e61564071e8aca71e5 Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Thu, 10 Feb 2022 11:26:16 +0100 Subject: [nvptx] Handle asm insn in prevent_branch_around_nothing With GOMP_NVPTX_JIT=-00 and -mptx=3.1, I run into: ... FAIL: libgomp.oacc-c/../libgomp.oacc-c-c++-common/acc_prof-version-1.c \ -DACC_DEVICE_TYPE_nvidia=1 -DACC_MEM_SHARED=0 -foffload=nvptx-none -O2 \ execution test ... The problem is that we're generating a diverging branch around nothing: ... { .reg.u32 %x; mov.u32 %x, %tid.x; setp.ne.u32 %r23, %x, 0; } @%r23 bra $L2; $L2: ... which the driver JIT has problems with at -O0, so consequently we run into the nvptx_uniform_warp_check. Fix this by handling asm ("") and alike in prevent_branch_around_nothing. Tested on x86_64 with nvptx accelerator. gcc/ChangeLog: 2022-02-10 Tom de Vries PR target/104456 * config/nvptx/nvptx.cc (prevent_branch_around_nothing): Handle asm insn. 
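A rough, hypothetical reduction of the shape being handled (the original report came from an OpenACC profiling test):

    /* If the guarded region optimizes down to an insn that emits no code,
       such as an empty asm, the branch around it must still be recognized
       and neutralized by prevent_branch_around_nothing.  */
    void
    maybe_nothing (int cond)
    {
      if (cond)
        __asm__ ("" ::: "memory");
    }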
--- gcc/config/nvptx/nvptx.cc | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index 5b26c0f..afbad5b 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -5257,6 +5257,14 @@ prevent_branch_around_nothing (void) case CODE_FOR_nvptx_join: case CODE_FOR_nop: continue; + case -1: + /* Handle asm ("") and similar. */ + if (GET_CODE (PATTERN (insn)) == ASM_INPUT + || GET_CODE (PATTERN (insn)) == ASM_OPERANDS + || (GET_CODE (PATTERN (insn)) == PARALLEL + && asm_noperands (PATTERN (insn)) >= 0)) + continue; + /* FALLTHROUGH. */ default: seen_label = NULL; continue; -- cgit v1.1 From 4c3792d448964f7bd99e7eac2c29c9eb7c2bfb84 Mon Sep 17 00:00:00 2001 From: Iain Sandoe Date: Mon, 7 Feb 2022 15:36:35 +0000 Subject: LRA, rs6000, Darwin: Amend lo_sum use for forced constants [PR104117]. Two issues resulted in this PR, which manifests when we force a constant into memory in LRA (in PIC code on Darwin). The presence of such forced constants is quite dependent on other RTL optimisations, and it is easy for the issue to become latent for a specific case. First, in the Darwin-specific rs6000 backend code, we were not being careful enough in rejecting invalid symbolic addresses. Specifically, when generating PIC code, we require a SYMBOL_REF to be wrapped in an UNSPEC_MACHOPIC_OFFSET. Second, LRA was attempting to load a register using an invalid lo_sum address. Signed-off-by: Iain Sandoe Co-authored-by: Vladimir Makarov PR target/104117 gcc/ChangeLog: * config/rs6000/rs6000.cc (darwin_rs6000_legitimate_lo_sum_const_p): Check for UNSPEC_MACHOPIC_OFFSET wrappers on symbolic addresses when emitting PIC code. (legitimate_lo_sum_address_p): Likewise. * lra-constraints.cc (process_address_1): Do not attempt to emit a reg load from an invalid lo_sum address. --- gcc/config/rs6000/rs6000.cc | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index eaba9a2..bc3ef072 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -8317,8 +8317,14 @@ darwin_rs6000_legitimate_lo_sum_const_p (rtx x, machine_mode mode) if (GET_CODE (x) == CONST) x = XEXP (x, 0); + /* If we are building PIC code, then any symbol must be wrapped in an + UNSPEC_MACHOPIC_OFFSET so that it will get the picbase subtracted. */ + bool machopic_offs_p = false; if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_MACHOPIC_OFFSET) - x = XVECEXP (x, 0, 0); + { + x = XVECEXP (x, 0, 0); + machopic_offs_p = true; + } rtx sym = NULL_RTX; unsigned HOST_WIDE_INT offset = 0; @@ -8349,6 +8355,9 @@ darwin_rs6000_legitimate_lo_sum_const_p (rtx x, machine_mode mode) if (sym) { tree decl = SYMBOL_REF_DECL (sym); + /* As noted above, PIC code cannot use a bare SYMBOL_REF. 
*/ + if (TARGET_MACHO && flag_pic && !machopic_offs_p) + return false; #if TARGET_MACHO if (MACHO_SYMBOL_INDIRECTION_P (sym)) /* The decl in an indirection symbol is the original one, which might @@ -8936,7 +8945,7 @@ legitimate_lo_sum_address_p (machine_mode mode, rtx x, int strict) return false; x = XEXP (x, 1); - if (TARGET_ELF || TARGET_MACHO) + if (TARGET_ELF) { bool large_toc_ok; @@ -8962,7 +8971,32 @@ legitimate_lo_sum_address_p (machine_mode mode, rtx x, int strict) return CONSTANT_P (x) || large_toc_ok; } + else if (TARGET_MACHO) + { + if (GET_MODE_NUNITS (mode) != 1) + return false; + if (GET_MODE_SIZE (mode) > UNITS_PER_WORD + && !(/* see above */ + TARGET_HARD_FLOAT && (mode == DFmode || mode == DDmode))) + return false; +#if TARGET_MACHO + if (MACHO_DYNAMIC_NO_PIC_P || !flag_pic) + return CONSTANT_P (x); +#endif + /* Macho-O PIC code from here. */ + if (GET_CODE (x) == CONST) + x = XEXP (x, 0); + + /* SYMBOL_REFs need to be wrapped in an UNSPEC_MACHOPIC_OFFSET. */ + if (SYMBOL_REF_P (x)) + return false; + /* So this is OK if the wrapped object is const. */ + if (GET_CODE (x) == UNSPEC + && XINT (x, 1) == UNSPEC_MACHOPIC_OFFSET) + return CONSTANT_P (XVECEXP (x, 0, 0)); + return CONSTANT_P (x); + } return false; } -- cgit v1.1 From edadc7e0510b703d9727cf5ff68d55d84bb95def Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Sat, 12 Feb 2022 10:53:49 +0100 Subject: i386: Skip decimal float vector modes in type_natural_mode [PR79754] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 2022-02-12 Uroš Bizjak gcc/ChangeLog: PR target/79754 * config/i386/i386.cc (type_natural_mode): Skip decimal float vector modes. gcc/testsuite/ChangeLog: PR target/79754 * gcc.target/i386/pr79754.c: New test. --- gcc/config/i386/i386.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 6b97a2b..cf246e7 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -1876,10 +1876,14 @@ type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum, { machine_mode innermode = TYPE_MODE (TREE_TYPE (type)); - /* There are no XFmode vector modes. */ + /* There are no XFmode vector modes ... */ if (innermode == XFmode) return mode; + /* ... and no decimal float vector modes. */ + if (DECIMAL_FLOAT_MODE_P (innermode)) + return mode; + if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE) mode = MIN_MODE_VECTOR_FLOAT; else -- cgit v1.1 From 0538d42cdd68f6b65d72ed7768f1d00ba44f8631 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Sat, 12 Feb 2022 11:17:41 +0100 Subject: i386: Fix up cvtsd2ss splitter [PR104502] The following testcase ICEs, because AVX512F is enabled, AVX512VL is not, and the cvtsd2ss insn has %xmm0-15 as output operand and %xmm16-31 as input operand. For output operand %xmm16+ the splitter just gives up in such case, but for such input it just emits vmovddup which requires AVX512VL if either operand is EXT_REX_SSE_REG_P (when it is 128-bit). The following patch fixes it by treating that case like the pre-SSE3 output != input case - move the input to output and do everything on the output reg which is known to be < %xmm16. 2022-02-12 Jakub Jelinek PR target/104502 * config/i386/i386.md (cvtsd2ss splitter): If operands[1] is xmm16+ and AVX512VL isn't available, move operands[1] to operands[0] first. * gcc.target/i386/pr104502.c: New test. 
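Schematically, the affected conversion is the one below (a hypothetical sketch; triggering the bug additionally requires -mavx512f -mno-avx512vl and enough register pressure to place the DFmode input in %xmm16..%xmm31):

    float
    narrow_to_float (double x)
    {
      return (float) x;
    }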
--- gcc/config/i386/i386.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 74da0d4..8ffa641 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -4838,8 +4838,8 @@ movddup is available. */ if (REG_P (operands[1])) { - if (!TARGET_SSE3 - && REGNO (operands[0]) != REGNO (operands[1])) + if ((!TARGET_SSE3 && REGNO (operands[0]) != REGNO (operands[1])) + || (EXT_REX_SSE_REG_P (operands[1]) && !TARGET_AVX512VL)) { rtx tmp = lowpart_subreg (DFmode, operands[0], SFmode); emit_move_insn (tmp, operands[1]); -- cgit v1.1 From d51cad0b840a14c66732cb6a166c11ddf55d18b2 Mon Sep 17 00:00:00 2001 From: Andrew Stubbs Date: Sat, 12 Feb 2022 23:44:48 +0000 Subject: amdgcn: Allow vector reductions on constants Obviously it would be better if these reductions could be evaluated at compile time, but this will avoid an ICE. gcc/ChangeLog: * config/gcn/gcn.cc (gcn_expand_reduc_scalar): Use force_reg. --- gcc/config/gcn/gcn.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc index 74819c6..402f025 100644 --- a/gcc/config/gcn/gcn.cc +++ b/gcc/config/gcn/gcn.cc @@ -4460,7 +4460,7 @@ gcn_expand_reduc_scalar (machine_mode mode, rtx src, int unspec) pair of lanes, then on every pair of results from the previous iteration (thereby effectively reducing every 4 lanes) and so on until all lanes are reduced. */ - rtx in, out = src; + rtx in, out = force_reg (mode, src); for (int i = 0, shift = 1; i < 6; i++, shift <<= 1) { rtx shift_val = gen_rtx_CONST_INT (VOIDmode, shift); -- cgit v1.1 From 16b65b08484237cc2845c4f5c4f15efe3a43a32c Mon Sep 17 00:00:00 2001 From: Michael Meissner Date: Mon, 14 Feb 2022 17:42:14 -0500 Subject: Use correct names for __ibm128 if long double is IEEE 128-bit. If you are on a PowerPC system where the default long double is IEEE 128-bit (either through the compiler option -mabi=ieeelongdouble or via the configure option --with-long-double-format=ieee), GCC used the wrong names for some of the conversion functions for the __ibm128 type. Internally, GCC uses IFmode for __ibm128 if long double is IEEE 128-bit, instead of TFmode when long double is IBM 128-bit. This patch adds the missing conversions to prevent the 'if' name from being used. In particular, before the patch, the conversions used were: IFmode to DImode signed: __fixifdi instead of __fixtfdi IFmode to DImode unsigned __fixunsifti instead of __fixunstfti DImode to IFmode signed: __floatdiif instead of __floatditf DImode to IFmode unsigned: __floatundiif instead of __floatunditf 2022-02-14 Michael Meissner gcc/ PR target/104253 * config/rs6000/rs6000.cc (init_float128_ibm): Update the conversion functions used to convert IFmode types. gcc/testsuite/ PR target/104253 * gcc.target/powerpc/pr104253.c: New test. 
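A hypothetical illustration of conversions that must now call the TFmode entry points when long double is IEEE 128-bit:

    long long
    ibm128_to_ll (__ibm128 x)
    {
      return (long long) x;      /* __fixtfdi, not __fixifdi */
    }

    __ibm128
    ll_to_ibm128 (long long x)
    {
      return (__ibm128) x;       /* __floatditf, not __floatdiif */
    }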
--- gcc/config/rs6000/rs6000.cc | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index bc3ef072..e76c017 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -11018,6 +11018,12 @@ init_float128_ibm (machine_mode mode) set_conv_libfunc (trunc_optab, DDmode, mode, "__dpd_trunctfdd"); set_conv_libfunc (sext_optab, TDmode, mode, "__dpd_extendtftd"); + set_conv_libfunc (sfix_optab, DImode, mode, "__fixtfdi"); + set_conv_libfunc (ufix_optab, DImode, mode, "__fixunstfdi"); + + set_conv_libfunc (sfloat_optab, mode, DImode, "__floatditf"); + set_conv_libfunc (ufloat_optab, mode, DImode, "__floatunditf"); + if (TARGET_POWERPC64) { set_conv_libfunc (sfix_optab, TImode, mode, "__fixtfti"); -- cgit v1.1 From 0863d0ede34d21b2258686e6ccfd6dbb100bb754 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Tue, 15 Feb 2022 12:17:41 +0100 Subject: cygwin: Fix up -Werror=format-diag errors [PR104536] As the testcase reports, cygwin has 3 can%'t contractions in diagnostics, we use cannot everywhere else instead and -Wformat-diag enforces that. 2022-02-15 Jakub Jelinek PR target/104536 * config/i386/host-cygwin.cc (cygwin_gt_pch_get_address): Use cannot instead of can%'t in diagnostics. Formatting fixes. --- gcc/config/i386/host-cygwin.cc | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/host-cygwin.cc b/gcc/config/i386/host-cygwin.cc index fcf6333..05ad3a8 100644 --- a/gcc/config/i386/host-cygwin.cc +++ b/gcc/config/i386/host-cygwin.cc @@ -51,18 +51,18 @@ static void * cygwin_gt_pch_get_address (size_t sz, int fd) { void *base; - off_t p = lseek(fd, 0, SEEK_CUR); + off_t p = lseek (fd, 0, SEEK_CUR); if (p == (off_t) -1) - fatal_error (input_location, "can%'t get position in PCH file: %m"); + fatal_error (input_location, "cannot get position in PCH file: %m"); /* Cygwin requires that the underlying file be at least as large as the requested mapping. */ if ((size_t) p < sz) - { - if ( ftruncate (fd, sz) == -1 ) - fatal_error (input_location, "can%'t extend PCH file: %m"); - } + { + if (ftruncate (fd, sz) == -1) + fatal_error (input_location, "cannot extend PCH file: %m"); + } base = mmap (NULL, sz, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); @@ -71,8 +71,8 @@ cygwin_gt_pch_get_address (size_t sz, int fd) else munmap (base, sz); - if (lseek (fd, p, SEEK_SET) == (off_t) -1 ) - fatal_error (input_location, "can%'t set position in PCH file: %m"); + if (lseek (fd, p, SEEK_SET) == (off_t) -1) + fatal_error (input_location, "cannot set position in PCH file: %m"); return base; } -- cgit v1.1 From 4963079769c99c4073adfd799885410ad484cbbe Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Tue, 15 Feb 2022 18:09:33 +0000 Subject: vect+aarch64: Fix ldp_stp_* regressions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ldp_stp_1.c, ldp_stp_4.c and ldp_stp_5.c have been failing since vectorisation was enabled at -O2. In all three cases SLP is generating vector code when scalar code would be better. The problem is that the target costs do not model whether STP could be used for the scalar or vector code, so the normal latency-based costs for store-heavy code can be way off. It would be good to fix that “properly” at some point, but it isn't easy; see the existing discussion in aarch64_sve_adjust_stmt_cost for more details. 
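The shape at issue is essentially store-only code such as the hypothetical function below, where the scalar version is just two STPs but the latency-based costs alone cannot see that:

    void
    store_x4 (long *a, long x)
    {
      a[0] = x;
      a[1] = x;
      a[2] = x;
      a[3] = x;
    }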
This patch therefore adds an on-the-side check for whether the code is doing nothing more than set-up+stores. It then applies STP-based costs to those cases only, in addition to the normal latency-based costs. (That is, the vector code has to win on both counts rather than on one count individually.) However, at the moment, SLP costs one vector set-up instruction for every vector in an SLP node, even if the contents are the same as a previous vector in the same node. Fixing the STP costs without fixing that would regress other cases, tested in the patch. The patch therefore makes the SLP costing code check for duplicates within a node. Ideally we'd check for duplicates more globally, but that would require a more global approach to costs: the cost of an initialisation should be amoritised across all trees that use the initialisation, rather than fully counted against one arbitrarily-chosen subtree. Back on aarch64: an earlier version of the patch tried to apply the new heuristic to constant stores. However, that didn't work too well in practice; see the comments for details. The patch therefore just tests the status quo for constant cases, leaving out a match if the current choice is dubious. ldp_stp_5.c was affected by the same thing. The test would be worth vectorising if we generated better vector code, but: (1) We do a bad job of moving the { -1, 1 } constant, given that we have { -1, -1 } and { 1, 1 } to hand. (2) The vector code has 6 pairable stores to misaligned offsets. We have peephole patterns to handle such misalignment for 4 pairable stores, but not 6. So the SLP decision isn't wrong as such. It's just being let down by later codegen. The patch therefore adds -mstrict-align to preserve the original intention of the test while adding ldp_stp_19.c to check for the preferred vector code (XFAILed for now). gcc/ * tree-vectorizer.h (vect_scalar_ops_slice): New struct. (vect_scalar_ops_slice_hash): Likewise. (vect_scalar_ops_slice::op): New function. * tree-vect-slp.cc (vect_scalar_ops_slice::all_same_p): New function. (vect_scalar_ops_slice_hash::hash): Likewise. (vect_scalar_ops_slice_hash::equal): Likewise. (vect_prologue_cost_for_slp): Check for duplicate vectors. * config/aarch64/aarch64.cc (aarch64_vector_costs::m_stp_sequence_cost): New member variable. (aarch64_aligned_constant_offset_p): New function. (aarch64_stp_sequence_cost): Likewise. (aarch64_vector_costs::add_stmt_cost): Handle new STP heuristic. (aarch64_vector_costs::finish_cost): Likewise. gcc/testsuite/ * gcc.target/aarch64/ldp_stp_5.c: Require -mstrict-align. * gcc.target/aarch64/ldp_stp_14.h, * gcc.target/aarch64/ldp_stp_14.c: New test. * gcc.target/aarch64/ldp_stp_15.c: Likewise. * gcc.target/aarch64/ldp_stp_16.c: Likewise. * gcc.target/aarch64/ldp_stp_17.c: Likewise. * gcc.target/aarch64/ldp_stp_18.c: Likewise. * gcc.target/aarch64/ldp_stp_19.c: Likewise. --- gcc/config/aarch64/aarch64.cc | 140 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 140 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index e3f18fb..1a460d4 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -14932,6 +14932,31 @@ private: - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then we're costing SVE code. */ unsigned int m_vec_flags = 0; + /* At the moment, we do not model LDP and STP in the vector and scalar costs. 
+ This means that code such as: + + a[0] = x; + a[1] = x; + + will be costed as two scalar instructions and two vector instructions + (a scalar_to_vec and an unaligned_store). For SLP, the vector form + wins if the costs are equal, because of the fact that the vector costs + include constant initializations whereas the scalar costs don't. + We would therefore tend to vectorize the code above, even though + the scalar version can use a single STP. + + We should eventually fix this and model LDP and STP in the main costs; + see the comment in aarch64_sve_adjust_stmt_cost for some of the problems. + Until then, we look specifically for code that does nothing more than + STP-like operations. We cost them on that basis in addition to the + normal latency-based costs. + + If the scalar or vector code could be a sequence of STPs + + initialization, this variable counts the cost of the sequence, + with 2 units per instruction. The variable is ~0U for other + kinds of code. */ + unsigned int m_stp_sequence_cost = 0; + /* On some CPUs, SVE and Advanced SIMD provide the same theoretical vector throughput, such as 4x128 Advanced SIMD vs. 2x256 SVE. In those situations, we try to predict whether an Advanced SIMD implementation @@ -15724,6 +15749,104 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind, } } +/* Return true if STMT_INFO contains a memory access and if the constant + component of the memory address is aligned to SIZE bytes. */ +static bool +aarch64_aligned_constant_offset_p (stmt_vec_info stmt_info, + poly_uint64 size) +{ + if (!STMT_VINFO_DATA_REF (stmt_info)) + return false; + + if (auto first_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info)) + stmt_info = first_stmt; + tree constant_offset = DR_INIT (STMT_VINFO_DATA_REF (stmt_info)); + /* Needed for gathers & scatters, for example. */ + if (!constant_offset) + return false; + + return multiple_p (wi::to_poly_offset (constant_offset), size); +} + +/* Check if a scalar or vector stmt could be part of a region of code + that does nothing more than store values to memory, in the scalar + case using STP. Return the cost of the stmt if so, counting 2 for + one instruction. Return ~0U otherwise. + + The arguments are a subset of those passed to add_stmt_cost. */ +unsigned int +aarch64_stp_sequence_cost (unsigned int count, vect_cost_for_stmt kind, + stmt_vec_info stmt_info, tree vectype) +{ + /* Code that stores vector constants uses a vector_load to create + the constant. We don't apply the heuristic to that case for two + main reasons: + + - At the moment, STPs are only formed via peephole2, and the + constant scalar moves would often come between STRs and so + prevent STP formation. + + - The scalar code also has to load the constant somehow, and that + isn't costed. */ + switch (kind) + { + case scalar_to_vec: + /* Count 2 insns for a GPR->SIMD dup and 1 insn for a FPR->SIMD dup. */ + return (FLOAT_TYPE_P (vectype) ? 2 : 4) * count; + + case vec_construct: + if (FLOAT_TYPE_P (vectype)) + /* Count 1 insn for the maximum number of FP->SIMD INS + instructions. */ + return (vect_nunits_for_cost (vectype) - 1) * 2 * count; + + /* Count 2 insns for a GPR->SIMD move and 2 insns for the + maximum number of GPR->SIMD INS instructions. */ + return vect_nunits_for_cost (vectype) * 4 * count; + + case vector_store: + case unaligned_store: + /* Count 1 insn per vector if we can't form STP Q pairs. 
*/ + if (aarch64_sve_mode_p (TYPE_MODE (vectype))) + return count * 2; + if (aarch64_tune_params.extra_tuning_flags + & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) + return count * 2; + + if (stmt_info) + { + /* Assume we won't be able to use STP if the constant offset + component of the address is misaligned. ??? This could be + removed if we formed STP pairs earlier, rather than relying + on peephole2. */ + auto size = GET_MODE_SIZE (TYPE_MODE (vectype)); + if (!aarch64_aligned_constant_offset_p (stmt_info, size)) + return count * 2; + } + return CEIL (count, 2) * 2; + + case scalar_store: + if (stmt_info && STMT_VINFO_DATA_REF (stmt_info)) + { + /* Check for a mode in which STP pairs can be formed. */ + auto size = GET_MODE_SIZE (TYPE_MODE (aarch64_dr_type (stmt_info))); + if (maybe_ne (size, 4) && maybe_ne (size, 8)) + return ~0U; + + /* Assume we won't be able to use STP if the constant offset + component of the address is misaligned. ??? This could be + removed if we formed STP pairs earlier, rather than relying + on peephole2. */ + if (!aarch64_aligned_constant_offset_p (stmt_info, size)) + return ~0U; + } + return count; + + default: + return ~0U; + } +} + unsigned aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, stmt_vec_info stmt_info, tree vectype, @@ -15747,6 +15870,14 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, m_analyzed_vinfo = true; } + /* Apply the heuristic described above m_stp_sequence_cost. */ + if (m_stp_sequence_cost != ~0U) + { + uint64_t cost = aarch64_stp_sequence_cost (count, kind, + stmt_info, vectype); + m_stp_sequence_cost = MIN (m_stp_sequence_cost + cost, ~0U); + } + /* Try to get a more accurate cost by looking at STMT_INFO instead of just looking at KIND. */ if (stmt_info && aarch64_use_new_vector_costs_p ()) @@ -16017,6 +16148,15 @@ aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs) m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs, m_costs[vect_body]); + /* Apply the heuristic described above m_stp_sequence_cost. Prefer + the scalar code in the event of a tie, since there is more chance + of scalar code being optimized with surrounding operations. */ + if (!loop_vinfo + && scalar_costs + && m_stp_sequence_cost != ~0U + && m_stp_sequence_cost >= scalar_costs->m_stp_sequence_cost) + m_costs[vect_body] = 2 * scalar_costs->total_cost (); + vector_costs::finish_cost (scalar_costs); } -- cgit v1.1 From 8e84b2b37a541b27feea69769fc314d534464ebd Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Tue, 15 Feb 2022 18:09:35 +0000 Subject: aarch64: Fix subs_compare_2.c regression [PR100874] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit subs_compare_2.c tests that we can use a SUBS+CSEL sequence for: unsigned int foo (unsigned int a, unsigned int b) { unsigned int x = a - 4; if (a < 4) return x; else return 0; } As Andrew notes in the PR, this is effectively MIN (x, 4) - 4, and it is now recognised as such by phiopt. Previously it was if-converted in RTL instead. I tried to look for ways to generalise this to other situations and to other ?:-style operations, not just max and min. However, for general ?: we tend to push an outer “- CST” into the arms of the ?: -- at least if one of them simplifies -- so I didn't find any useful abstraction. This patch therefore adds a pattern specifically for max/min(a,cst)-cst. I'm not thrilled at having to do this, but it seems like the least worst fix in the circumstances. 
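A sketch of the max/min(a,cst)-cst shape the new pattern targets, with a made-up function name (not taken from the max_plus_*/min_plus_* tests added by this patch); the exact instruction and condition choice is up to the compiler:

```
/* MAX (a, 16) - 16, i.e. an unsigned saturating subtraction.  */
unsigned int
sat_sub16 (unsigned int a)
{
  return a > 16 ? a - 16 : 0;
}

/* One possible aarch64 sequence after the patch:
     subs  w1, w0, #16
     csel  w0, w1, wzr, hi  */
```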
Also, max(a,cst)-cst for unsigned a is a useful saturating subtraction idiom and so is arguably worth its own code for that reason. gcc/ PR target/100874 * config/aarch64/aarch64-protos.h (aarch64_maxmin_plus_const): Declare. * config/aarch64/aarch64.cc (aarch64_maxmin_plus_const): New function. * config/aarch64/aarch64.md (*aarch64_minmax_plus): New pattern. gcc/testsuite/ * gcc.target/aarch64/max_plus_1.c: New test. * gcc.target/aarch64/max_plus_2.c: Likewise. * gcc.target/aarch64/max_plus_3.c: Likewise. * gcc.target/aarch64/max_plus_4.c: Likewise. * gcc.target/aarch64/max_plus_5.c: Likewise. * gcc.target/aarch64/max_plus_6.c: Likewise. * gcc.target/aarch64/max_plus_7.c: Likewise. * gcc.target/aarch64/min_plus_1.c: Likewise. * gcc.target/aarch64/min_plus_2.c: Likewise. * gcc.target/aarch64/min_plus_3.c: Likewise. * gcc.target/aarch64/min_plus_4.c: Likewise. * gcc.target/aarch64/min_plus_5.c: Likewise. * gcc.target/aarch64/min_plus_6.c: Likewise. * gcc.target/aarch64/min_plus_7.c: Likewise. --- gcc/config/aarch64/aarch64-protos.h | 1 + gcc/config/aarch64/aarch64.cc | 104 ++++++++++++++++++++++++++++++++++++ gcc/config/aarch64/aarch64.md | 27 ++++++++++ 3 files changed, 132 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index 392efa0..d0e78d6 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -939,6 +939,7 @@ bool aarch64_legitimate_address_p (machine_mode, rtx, bool, aarch64_addr_query_type = ADDR_QUERY_M); machine_mode aarch64_select_cc_mode (RTX_CODE, rtx, rtx); rtx aarch64_gen_compare_reg (RTX_CODE, rtx, rtx); +bool aarch64_maxmin_plus_const (rtx_code, rtx *, bool); rtx aarch64_load_tp (rtx); void aarch64_expand_compare_and_swap (rtx op[]); diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 1a460d4..37ed22bc 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -3781,6 +3781,110 @@ aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y, return aarch64_gen_compare_reg (code, x, y); } +/* Consider the operation: + + OPERANDS[0] = CODE (OPERANDS[1], OPERANDS[2]) + OPERANDS[3] + + where: + + - CODE is [SU]MAX or [SU]MIN + - OPERANDS[2] and OPERANDS[3] are constant integers + - OPERANDS[3] is a positive or negative shifted 12-bit immediate + - all operands have mode MODE + + Decide whether it is possible to implement the operation using: + + SUBS , OPERANDS[1], -OPERANDS[3] + or + ADDS , OPERANDS[1], OPERANDS[3] + + followed by: + + OPERANDS[0], , [wx]zr, + + where is one of CSEL, CSINV or CSINC. Return true if so. + If GENERATE_P is true, also update OPERANDS as follows: + + OPERANDS[4] = -OPERANDS[3] + OPERANDS[5] = the rtl condition representing + OPERANDS[6] = + OPERANDS[7] = 0 for CSEL, -1 for CSINV or 1 for CSINC. */ +bool +aarch64_maxmin_plus_const (rtx_code code, rtx *operands, bool generate_p) +{ + signop sgn = (code == UMAX || code == UMIN ? UNSIGNED : SIGNED); + rtx dst = operands[0]; + rtx maxmin_op = operands[2]; + rtx add_op = operands[3]; + machine_mode mode = GET_MODE (dst); + + /* max (x, y) - z == (x >= y + 1 ? x : y) - z + == (x >= y ? x : y) - z + == (x > y ? x : y) - z + == (x > y - 1 ? x : y) - z + + min (x, y) - z == (x <= y - 1 ? x : y) - z + == (x <= y ? x : y) - z + == (x < y ? x : y) - z + == (x < y + 1 ? x : y) - z + + Check whether z is in { y - 1, y, y + 1 } and pick the form(s) for + which x is compared with z. Set DIFF to y - z. 
Thus the supported + combinations are as follows, with DIFF being the value after the ":": + + max (x, y) - z == x >= y + 1 ? x - (y + 1) : -1 [z == y + 1] + == x >= y ? x - y : 0 [z == y] + == x > y ? x - y : 0 [z == y] + == x > y - 1 ? x - (y - 1) : 1 [z == y - 1] + + min (x, y) - z == x <= y - 1 ? x - (y - 1) : 1 [z == y - 1] + == x <= y ? x - y : 0 [z == y] + == x < y ? x - y : 0 [z == y] + == x < y + 1 ? x - (y + 1) : -1 [z == y + 1]. */ + auto maxmin_val = rtx_mode_t (maxmin_op, mode); + auto add_val = rtx_mode_t (add_op, mode); + auto sub_val = wi::neg (add_val); + auto diff = wi::sub (maxmin_val, sub_val); + if (!(diff == 0 + || (diff == 1 && wi::gt_p (maxmin_val, sub_val, sgn)) + || (diff == -1 && wi::lt_p (maxmin_val, sub_val, sgn)))) + return false; + + if (!generate_p) + return true; + + rtx_code cmp; + switch (code) + { + case SMAX: + cmp = diff == 1 ? GT : GE; + break; + case UMAX: + cmp = diff == 1 ? GTU : GEU; + break; + case SMIN: + cmp = diff == -1 ? LT : LE; + break; + case UMIN: + cmp = diff == -1 ? LTU : LEU; + break; + default: + gcc_unreachable (); + } + rtx cc = gen_rtx_REG (CCmode, CC_REGNUM); + + operands[4] = immed_wide_int_const (sub_val, mode); + operands[5] = gen_rtx_fmt_ee (cmp, VOIDmode, cc, const0_rtx); + if (can_create_pseudo_p ()) + operands[6] = gen_reg_rtx (mode); + else + operands[6] = dst; + operands[7] = immed_wide_int_const (diff, mode); + + return true; +} + + /* Build the SYMBOL_REF for __tls_get_addr. */ static GTY(()) rtx tls_get_addr_libfunc; diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 3c72bda..64cc21d 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -4405,6 +4405,33 @@ } ) +;; Implement MAX/MIN (A, B) - C using SUBS/ADDS followed by CSEL/CSINV/CSINC. +;; See aarch64_maxmin_plus_const for details about the supported cases. +(define_insn_and_split "*aarch64_minmax_plus" + [(set (match_operand:GPI 0 "register_operand" "=r") + (plus:GPI + (MAXMIN:GPI + (match_operand:GPI 1 "register_operand" "r") + (match_operand:GPI 2 "const_int_operand")) + (match_operand:GPI 3 "aarch64_plus_immediate"))) + (clobber (reg:CC CC_REGNUM))] + "aarch64_maxmin_plus_const (, operands, false)" + "#" + "&& 1" + [(parallel + [(set (reg:CC CC_REGNUM) + (compare:CC (match_dup 1) (match_dup 4))) + (set (match_dup 6) + (plus:GPI (match_dup 1) (match_dup 3)))]) + (set (match_dup 0) + (if_then_else:GPI (match_dup 5) (match_dup 6) (match_dup 7)))] + { + if (!aarch64_maxmin_plus_const (, operands, true)) + gcc_unreachable (); + } + [(set_attr "length" "8")] +) + ;; ------------------------------------------------------------------- ;; Logical operations ;; ------------------------------------------------------------------- -- cgit v1.1 From 25332d2325c720f584444c3858efdb85b8a3c06a Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Wed, 16 Feb 2022 10:21:13 +0000 Subject: aarch64: Extend PR100056 patterns to + MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pr100056.c contains things like: int or_shift_u3a (unsigned i) { i &= 7; return i | (i << 11); } After g:96146e61cd7aee62c21c2845916ec42152918ab7, the preferred gimple representation of this is a multiplication: i_2 = i_1(D) & 7; _5 = i_2 * 2049; Expand then open-codes the multiplication back to individual shifts, but (of course) it uses + rather than | to combine the shifts. 
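For reference, a made-up variant of the testcase (not part of pr100056.c) showing why the PLUS form is interchangeable with IOR whenever the operands have no set bits in common:

```
unsigned int
or_shift_u3a_plus (unsigned int i)
{
  i &= 7;                  /* i now occupies bits 0..2 only.        */
  return i + (i << 11);    /* (i << 11) occupies bits 11..13, so no
                              carries occur: equals i | (i << 11).  */
}
```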
This means that we end up with the RTL equivalent of: i + (i << 11) I wondered about canonicalising the + to | (*back* to | in this case) when the operands have no set bits in common and when one of the operands is &, | or ^, but that didn't seem to be a popular idea when I asked on IRC. The feeling seemed to be that + is inherently simpler than |, so we shouldn't be “simplifying” the other way. This patch therefore adjusts the PR100056 patterns to handle + as well as |, in cases where the operands are provably disjoint. For: int or_shift_u8 (unsigned char i) { return i | (i << 11); } the instructions: 2: r95:SI=zero_extend(x0:QI) REG_DEAD x0:QI 7: r98:SI=r95:SI<<0xb are combined into: (parallel [ (set (reg:SI 98) (and:SI (ashift:SI (reg:SI 0 x0 [ i ]) (const_int 11 [0xb])) (const_int 522240 [0x7f800]))) (set (reg/v:SI 95 [ i ]) (zero_extend:SI (reg:QI 0 x0 [ i ]))) ]) which fails to match, but which is then split into its individual (independent) sets. Later the zero_extend is combined with the add to get an ADD UXTB: (set (reg:SI 99) (plus:SI (zero_extend:SI (reg:QI 0 x0 [ i ])) (reg:SI 98))) This means that there is never a 3-insn combo to match the split against. The end result is therefore: ubfiz w1, w0, 11, 8 add w0, w1, w0, uxtb This is a bit redundant, since it's doing the zero_extend twice. It is at least 2 instructions though, rather than the 3 that we had before the original patch for PR100056. or_shift_u8_asm is affected similarly. The net effect is that we do still have 2 UBFIZs, but we're at least back down to 2 instructions per function, as for GCC 11. I think that's good enough for now. There are probably other instructions that should be extended to support + as well as | (e.g. the EXTR ones), but those aren't regressions and so are GCC 13 material. gcc/ PR target/100056 * config/aarch64/iterators.md (LOGICAL_OR_PLUS): New iterator. * config/aarch64/aarch64.md: Extend the PR100056 patterns to handle plus in the same way as ior, if the operands have no set bits in common. gcc/testsuite/ PR target/100056 * gcc.target/aarch64/pr100056.c: XFAIL the original UBFIZ test and instead expect two UBFIZs + two ADD UXTBs. --- gcc/config/aarch64/aarch64.md | 33 +++++++++++++++++++++++---------- gcc/config/aarch64/iterators.md | 3 +++ 2 files changed, 26 insertions(+), 10 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 64cc21d..5909184 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -4558,7 +4558,7 @@ (define_split [(set (match_operand:GPI 0 "register_operand") - (LOGICAL:GPI + (LOGICAL_OR_PLUS:GPI (and:GPI (ashift:GPI (match_operand:GPI 1 "register_operand") (match_operand:QI 2 "aarch64_shift_imm_")) (match_operand:GPI 3 "const_int_operand")) @@ -4571,16 +4571,23 @@ && REGNO (operands[1]) == REGNO (operands[4]))) && (trunc_int_for_mode (GET_MODE_MASK (GET_MODE (operands[4])) << INTVAL (operands[2]), mode) - == INTVAL (operands[3]))" + == INTVAL (operands[3])) + && ( != PLUS + || (GET_MODE_MASK (GET_MODE (operands[4])) + & INTVAL (operands[3])) == 0)" [(set (match_dup 5) (zero_extend:GPI (match_dup 4))) - (set (match_dup 0) (LOGICAL:GPI (ashift:GPI (match_dup 5) (match_dup 2)) - (match_dup 5)))] - "operands[5] = gen_reg_rtx (mode);" + (set (match_dup 0) (match_dup 6))] + { + operands[5] = gen_reg_rtx (mode); + rtx shift = gen_rtx_ASHIFT (mode, operands[5], operands[2]); + rtx_code new_code = ( == PLUS ? 
IOR : ); + operands[6] = gen_rtx_fmt_ee (new_code, mode, shift, operands[5]); + } ) (define_split [(set (match_operand:GPI 0 "register_operand") - (LOGICAL:GPI + (LOGICAL_OR_PLUS:GPI (and:GPI (ashift:GPI (match_operand:GPI 1 "register_operand") (match_operand:QI 2 "aarch64_shift_imm_")) (match_operand:GPI 4 "const_int_operand")) @@ -4589,11 +4596,17 @@ && pow2_or_zerop (UINTVAL (operands[3]) + 1) && (trunc_int_for_mode (UINTVAL (operands[3]) << INTVAL (operands[2]), mode) - == INTVAL (operands[4]))" + == INTVAL (operands[4])) + && ( != PLUS + || (INTVAL (operands[4]) & INTVAL (operands[3])) == 0)" [(set (match_dup 5) (and:GPI (match_dup 1) (match_dup 3))) - (set (match_dup 0) (LOGICAL:GPI (ashift:GPI (match_dup 5) (match_dup 2)) - (match_dup 5)))] - "operands[5] = gen_reg_rtx (mode);" + (set (match_dup 0) (match_dup 6))] + { + operands[5] = gen_reg_rtx (mode); + rtx shift = gen_rtx_ASHIFT (mode, operands[5], operands[2]); + rtx_code new_code = ( == PLUS ? IOR : ); + operands[6] = gen_rtx_fmt_ee (new_code, mode, shift, operands[5]); + } ) (define_split diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 88067a3..e72fdf35 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -2122,6 +2122,9 @@ ;; Code iterator for logical operations (define_code_iterator LOGICAL [and ior xor]) +;; LOGICAL with plus, for when | gets converted to +. +(define_code_iterator LOGICAL_OR_PLUS [and ior xor plus]) + ;; LOGICAL without AND. (define_code_iterator LOGICAL_OR [ior xor]) -- cgit v1.1 From 687e57d7ac741d1c48ac030f87041aa56b888532 Mon Sep 17 00:00:00 2001 From: Michael Meissner Date: Wed, 16 Feb 2022 22:00:00 -0500 Subject: Define __SIZEOF_FLOAT128__ and __SIZEOF_IBM128__. Define the sizes of the PowerPC specific types __float128 and __ibm128 if those types are enabled. This patch will define __SIZEOF_IBM128__ and __SIZEOF_FLOAT128__ if their respective types are created in the compiler. Currently, this means both of these will be defined if float128 support is enabled. But at some point in the future, __ibm128 could be enabled without enabling float128 support and __SIZEOF_IBM128__ would be defined. 2022-02-16 Michael Meissner gcc/ PR target/99708 * config/rs6000/rs6000-c.cc (rs6000_cpu_cpp_builtins): Define __SIZEOF_IBM128__ if the IBM 128-bit long double type is created. Define __SIZEOF_FLOAT128__ if the IEEE 128-bit floating point type is created. gcc/testsuite/ PR target/99708 * gcc.target/powerpc/pr99708.c: New test. --- gcc/config/rs6000/rs6000-c.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-c.cc b/gcc/config/rs6000/rs6000-c.cc index 15251ef..d2e480a 100644 --- a/gcc/config/rs6000/rs6000-c.cc +++ b/gcc/config/rs6000/rs6000-c.cc @@ -623,7 +623,11 @@ rs6000_cpu_cpp_builtins (cpp_reader *pfile) if (TARGET_FRSQRTES) builtin_define ("__RSQRTEF__"); if (TARGET_FLOAT128_TYPE) - builtin_define ("__FLOAT128_TYPE__"); + builtin_define ("__FLOAT128_TYPE__"); + if (ibm128_float_type_node) + builtin_define ("__SIZEOF_IBM128__=16"); + if (ieee128_float_type_node) + builtin_define ("__SIZEOF_FLOAT128__=16"); #ifdef TARGET_LIBC_PROVIDES_HWCAP_IN_TCB builtin_define ("__BUILTIN_CPU_SUPPORTS__"); #endif -- cgit v1.1 From 550cabd00238a8e74783ba6ad05a7580d074aabd Mon Sep 17 00:00:00 2001 From: liuhongt Date: Wed, 16 Feb 2022 15:00:59 +0800 Subject: Clean up MPX-related bit_{MPX,BNDREGS,BNDCSR}. gcc/ChangeLog: * config/i386/cpuid.h (bit_MPX): Removed. (bit_BNDREGS): Ditto. 
(bit_BNDCSR): Ditto. --- gcc/config/i386/cpuid.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/cpuid.h b/gcc/config/i386/cpuid.h index ed61130..8b3dc2b 100644 --- a/gcc/config/i386/cpuid.h +++ b/gcc/config/i386/cpuid.h @@ -86,7 +86,6 @@ #define bit_AVX2 (1 << 5) #define bit_BMI2 (1 << 8) #define bit_RTM (1 << 11) -#define bit_MPX (1 << 14) #define bit_AVX512F (1 << 16) #define bit_AVX512DQ (1 << 17) #define bit_RDSEED (1 << 18) @@ -136,10 +135,6 @@ #define bit_AMX_TILE (1 << 24) #define bit_AMX_INT8 (1 << 25) -/* XFEATURE_ENABLED_MASK register bits (%eax == 0xd, %ecx == 0) */ -#define bit_BNDREGS (1 << 3) -#define bit_BNDCSR (1 << 4) - /* Extended State Enumeration Sub-leaf (%eax == 0xd, %ecx == 1) */ #define bit_XSAVEOPT (1 << 0) #define bit_XSAVEC (1 << 1) -- cgit v1.1 From fac15bf84807a58f83c741b1034c1bc96348319d Mon Sep 17 00:00:00 2001 From: Robin Dapp Date: Thu, 17 Feb 2022 19:59:51 +0100 Subject: rs6000: Workaround for new ifcvt behavior [PR104335]. Since r12-6747-gaa8cfe785953a0 ifcvt passes a "cc comparison" i.e. the representation of the result of a comparison to the backend. rs6000_emit_int_cmove () is not prepared to handle this. Therefore, this patch makes it return false in such a case. PR target/104335 gcc/ChangeLog: * config/rs6000/rs6000.cc (rs6000_emit_int_cmove): Return false if the expected comparison's first operand is of mode MODE_CC. --- gcc/config/rs6000/rs6000.cc | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index e76c017..32a13cd 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -16215,6 +16215,12 @@ rs6000_emit_int_cmove (rtx dest, rtx op, rtx true_cond, rtx false_cond) if (mode != SImode && (!TARGET_POWERPC64 || mode != DImode)) return false; + /* PR104335: We now need to expect CC-mode "comparisons" + coming from ifcvt. The following code expects proper + comparisons so better abort here. */ + if (GET_MODE_CLASS (GET_MODE (XEXP (op, 0))) == MODE_CC) + return false; + /* We still have to do the compare, because isel doesn't do a compare, it just looks at the CRx bits set by a previous compare instruction. */ -- cgit v1.1 From efbb17db52afd802300c4dcce208fab326ec2915 Mon Sep 17 00:00:00 2001 From: "Paul A. Clarke" Date: Wed, 16 Feb 2022 20:01:41 -0600 Subject: rs6000: __Uglify non-uglified local variables in headers Properly prefix (with "__") all local variables in shipped headers for x86 compatibility intrinsics implementations. This avoids possible problems with usages like: ``` ``` 2022-02-16 Paul A. Clarke gcc PR target/104257 * config/rs6000/bmi2intrin.h: Uglify local variables. * config/rs6000/emmintrin.h: Likewise. * config/rs6000/mm_malloc.h: Likewise. * config/rs6000/mmintrin.h: Likewise. * config/rs6000/pmmintrin.h: Likewise. * config/rs6000/smmintrin.h: Likewise. * config/rs6000/tmmintrin.h: Likewise. * config/rs6000/xmmintrin.h: Likewise. 
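A hypothetical reproducer for the class of problem being fixed (the macro is illustrative user code, not taken from the PR):

```
/* Legal user code: an object-like macro whose name matched a local
   variable previously used inside the headers.  Before the renaming,
   declarations such as "__v2df result = (__v2df) __A;" in emmintrin.h
   would expand to "__v2df (1 + 1) = (__v2df) __A;" and fail to compile.  */
#define result (1 + 1)
#include <emmintrin.h>
```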
--- gcc/config/rs6000/bmi2intrin.h | 68 +-- gcc/config/rs6000/emmintrin.h | 908 ++++++++++++++++++++--------------------- gcc/config/rs6000/mm_malloc.h | 26 +- gcc/config/rs6000/mmintrin.h | 768 +++++++++++++++++----------------- gcc/config/rs6000/pmmintrin.h | 28 +- gcc/config/rs6000/smmintrin.h | 18 +- gcc/config/rs6000/tmmintrin.h | 4 +- gcc/config/rs6000/xmmintrin.h | 861 +++++++++++++++++++------------------- 8 files changed, 1340 insertions(+), 1341 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/bmi2intrin.h b/gcc/config/rs6000/bmi2intrin.h index f2d7eb5..b7a7ded 100644 --- a/gcc/config/rs6000/bmi2intrin.h +++ b/gcc/config/rs6000/bmi2intrin.h @@ -77,39 +77,39 @@ extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _pdep_u64 (unsigned long long __X, unsigned long long __M) { - unsigned long result = 0x0UL; - const unsigned long mask = 0x8000000000000000UL; - unsigned long m = __M; - unsigned long c, t; - unsigned long p; + unsigned long __result = 0x0UL; + const unsigned long __mask = 0x8000000000000000UL; + unsigned long __m = __M; + unsigned long __c, __t; + unsigned long __p; /* The pop-count of the mask gives the number of the bits from source to process. This is also needed to shift bits from the source into the correct position for the result. */ - p = 64 - __builtin_popcountl (__M); + __p = 64 - __builtin_popcountl (__M); /* The loop is for the number of '1' bits in the mask and clearing each mask bit as it is processed. */ - while (m != 0) + while (__m != 0) { - c = __builtin_clzl (m); - t = __X << (p - c); - m ^= (mask >> c); - result |= (t & (mask >> c)); - p++; + __c = __builtin_clzl (__m); + __t = __X << (__p - __c); + __m ^= (__mask >> __c); + __result |= (__t & (__mask >> __c)); + __p++; } - return (result); + return __result; } extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _pext_u64 (unsigned long long __X, unsigned long long __M) { - unsigned long p = 0x4040404040404040UL; // initial bit permute control - const unsigned long mask = 0x8000000000000000UL; - unsigned long m = __M; - unsigned long c; - unsigned long result; + unsigned long __p = 0x4040404040404040UL; // initial bit permute control + const unsigned long __mask = 0x8000000000000000UL; + unsigned long __m = __M; + unsigned long __c; + unsigned long __result; /* if the mask is constant and selects 8 bits or less we can use the Power8 Bit permute instruction. */ @@ -118,35 +118,35 @@ _pext_u64 (unsigned long long __X, unsigned long long __M) /* Also if the pext mask is constant, then the popcount is constant, we can evaluate the following loop at compile time and use a constant bit permute vector. */ - long i; - for (i = 0; i < __builtin_popcountl (__M); i++) + long __i; + for (__i = 0; __i < __builtin_popcountl (__M); __i++) { - c = __builtin_clzl (m); - p = (p << 8) | c; - m ^= (mask >> c); + __c = __builtin_clzl (__m); + __p = (__p << 8) | __c; + __m ^= (__mask >> __c); } - result = __builtin_bpermd (p, __X); + __result = __builtin_bpermd (__p, __X); } else { - p = 64 - __builtin_popcountl (__M); - result = 0; + __p = 64 - __builtin_popcountl (__M); + __result = 0; /* We could a use a for loop here, but that combined with -funroll-loops can expand to a lot of code. The while loop avoids unrolling and the compiler commons the xor from clearing the mask bit with the (m != 0) test. The result is a more compact loop setup and body. 
*/ - while (m != 0) + while (__m != 0) { - unsigned long t; - c = __builtin_clzl (m); - t = (__X & (mask >> c)) >> (p - c); - m ^= (mask >> c); - result |= (t); - p++; + unsigned long __t; + __c = __builtin_clzl (__m); + __t = (__X & (__mask >> __c)) >> (__p - __c); + __m ^= (__mask >> __c); + __result |= (__t); + __p++; } } - return (result); + return __result; } /* these 32-bit implementations depend on 64-bit pdep/pext diff --git a/gcc/config/rs6000/emmintrin.h b/gcc/config/rs6000/emmintrin.h index 71abcca..8329679 100644 --- a/gcc/config/rs6000/emmintrin.h +++ b/gcc/config/rs6000/emmintrin.h @@ -141,9 +141,9 @@ _mm_setzero_pd (void) extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_move_sd (__m128d __A, __m128d __B) { - __v2df result = (__v2df) __A; - result [0] = ((__v2df) __B)[0]; - return (__m128d) result; + __v2df __result = (__v2df) __A; + __result [0] = ((__v2df) __B)[0]; + return (__m128d) __result; } /* Load two DPFP values from P. The address must be 16-byte aligned. */ @@ -329,9 +329,9 @@ _mm_sqrt_pd (__m128d __A) extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sqrt_sd (__m128d __A, __m128d __B) { - __v2df c; - c = vec_sqrt ((__v2df) _mm_set1_pd (__B[0])); - return (__m128d) _mm_setr_pd (c[0], __A[1]); + __v2df __c; + __c = vec_sqrt ((__v2df) _mm_set1_pd (__B[0])); + return (__m128d) _mm_setr_pd (__c[0], __A[1]); } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -343,11 +343,11 @@ _mm_min_pd (__m128d __A, __m128d __B) extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_min_sd (__m128d __A, __m128d __B) { - __v2df a, b, c; - a = vec_splats (__A[0]); - b = vec_splats (__B[0]); - c = vec_min (a, b); - return (__m128d) _mm_setr_pd (c[0], __A[1]); + __v2df __a, __b, __c; + __a = vec_splats (__A[0]); + __b = vec_splats (__B[0]); + __c = vec_min (__a, __b); + return (__m128d) _mm_setr_pd (__c[0], __A[1]); } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -359,11 +359,11 @@ _mm_max_pd (__m128d __A, __m128d __B) extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_max_sd (__m128d __A, __m128d __B) { - __v2df a, b, c; - a = vec_splats (__A[0]); - b = vec_splats (__B[0]); - c = vec_max (a, b); - return (__m128d) _mm_setr_pd (c[0], __A[1]); + __v2df __a, __b, __c; + __a = vec_splats (__A[0]); + __b = vec_splats (__B[0]); + __c = vec_max (__a, __b); + return (__m128d) _mm_setr_pd (__c[0], __A[1]); } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -399,8 +399,8 @@ _mm_cmpge_pd (__m128d __A, __m128d __B) extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpneq_pd (__m128d __A, __m128d __B) { - __v2df temp = (__v2df) vec_cmpeq ((__v2df) __A, (__v2df)__B); - return ((__m128d)vec_nor (temp, temp)); + __v2df __temp = (__v2df) vec_cmpeq ((__v2df) __A, (__v2df)__B); + return ((__m128d)vec_nor (__temp, __temp)); } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -430,163 +430,163 @@ _mm_cmpnge_pd (__m128d __A, __m128d __B) extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpord_pd (__m128d __A, __m128d __B) { - __v2du c, d; + __v2du __c, __d; /* Compare against self will return false (0's) if NAN. 
*/ - c = (__v2du)vec_cmpeq (__A, __A); - d = (__v2du)vec_cmpeq (__B, __B); + __c = (__v2du)vec_cmpeq (__A, __A); + __d = (__v2du)vec_cmpeq (__B, __B); /* A != NAN and B != NAN. */ - return ((__m128d)vec_and(c, d)); + return ((__m128d)vec_and(__c, __d)); } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpunord_pd (__m128d __A, __m128d __B) { #if _ARCH_PWR8 - __v2du c, d; + __v2du __c, __d; /* Compare against self will return false (0's) if NAN. */ - c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A); - d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B); + __c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A); + __d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B); /* A == NAN OR B == NAN converts too: NOT(A != NAN) OR NOT(B != NAN). */ - c = vec_nor (c, c); - return ((__m128d)vec_orc(c, d)); + __c = vec_nor (__c, __c); + return ((__m128d)vec_orc(__c, __d)); #else - __v2du c, d; + __v2du __c, __d; /* Compare against self will return false (0's) if NAN. */ - c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A); - d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B); + __c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A); + __d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B); /* Convert the true ('1's) is NAN. */ - c = vec_nor (c, c); - d = vec_nor (d, d); - return ((__m128d)vec_or(c, d)); + __c = vec_nor (__c, __c); + __d = vec_nor (__d, __d); + return ((__m128d)vec_or(__c, __d)); #endif } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpeq_sd(__m128d __A, __m128d __B) { - __v2df a, b, c; + __v2df __a, __b, __c; /* PowerISA VSX does not allow partial (for just lower double) results. So to insure we don't generate spurious exceptions (from the upper double values) we splat the lower double before we do the operation. */ - a = vec_splats (__A[0]); - b = vec_splats (__B[0]); - c = (__v2df) vec_cmpeq(a, b); + __a = vec_splats (__A[0]); + __b = vec_splats (__B[0]); + __c = (__v2df) vec_cmpeq(__a, __b); /* Then we merge the lower double result with the original upper double from __A. 
*/ - return (__m128d) _mm_setr_pd (c[0], __A[1]); + return (__m128d) _mm_setr_pd (__c[0], __A[1]); } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmplt_sd (__m128d __A, __m128d __B) { - __v2df a, b, c; - a = vec_splats (__A[0]); - b = vec_splats (__B[0]); - c = (__v2df) vec_cmplt(a, b); - return (__m128d) _mm_setr_pd (c[0], __A[1]); + __v2df __a, __b, __c; + __a = vec_splats (__A[0]); + __b = vec_splats (__B[0]); + __c = (__v2df) vec_cmplt(__a, __b); + return (__m128d) _mm_setr_pd (__c[0], __A[1]); } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmple_sd (__m128d __A, __m128d __B) { - __v2df a, b, c; - a = vec_splats (__A[0]); - b = vec_splats (__B[0]); - c = (__v2df) vec_cmple(a, b); - return (__m128d) _mm_setr_pd (c[0], __A[1]); + __v2df __a, __b, __c; + __a = vec_splats (__A[0]); + __b = vec_splats (__B[0]); + __c = (__v2df) vec_cmple(__a, __b); + return (__m128d) _mm_setr_pd (__c[0], __A[1]); } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpgt_sd (__m128d __A, __m128d __B) { - __v2df a, b, c; - a = vec_splats (__A[0]); - b = vec_splats (__B[0]); - c = (__v2df) vec_cmpgt(a, b); - return (__m128d) _mm_setr_pd (c[0], __A[1]); + __v2df __a, __b, __c; + __a = vec_splats (__A[0]); + __b = vec_splats (__B[0]); + __c = (__v2df) vec_cmpgt(__a, __b); + return (__m128d) _mm_setr_pd (__c[0], __A[1]); } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpge_sd (__m128d __A, __m128d __B) { - __v2df a, b, c; - a = vec_splats (__A[0]); - b = vec_splats (__B[0]); - c = (__v2df) vec_cmpge(a, b); - return (__m128d) _mm_setr_pd (c[0], __A[1]); + __v2df __a, __b, __c; + __a = vec_splats (__A[0]); + __b = vec_splats (__B[0]); + __c = (__v2df) vec_cmpge(__a, __b); + return (__m128d) _mm_setr_pd (__c[0], __A[1]); } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpneq_sd (__m128d __A, __m128d __B) { - __v2df a, b, c; - a = vec_splats (__A[0]); - b = vec_splats (__B[0]); - c = (__v2df) vec_cmpeq(a, b); - c = vec_nor (c, c); - return (__m128d) _mm_setr_pd (c[0], __A[1]); + __v2df __a, __b, __c; + __a = vec_splats (__A[0]); + __b = vec_splats (__B[0]); + __c = (__v2df) vec_cmpeq(__a, __b); + __c = vec_nor (__c, __c); + return (__m128d) _mm_setr_pd (__c[0], __A[1]); } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpnlt_sd (__m128d __A, __m128d __B) { - __v2df a, b, c; - a = vec_splats (__A[0]); - b = vec_splats (__B[0]); + __v2df __a, __b, __c; + __a = vec_splats (__A[0]); + __b = vec_splats (__B[0]); /* Not less than is just greater than or equal. */ - c = (__v2df) vec_cmpge(a, b); - return (__m128d) _mm_setr_pd (c[0], __A[1]); + __c = (__v2df) vec_cmpge(__a, __b); + return (__m128d) _mm_setr_pd (__c[0], __A[1]); } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpnle_sd (__m128d __A, __m128d __B) { - __v2df a, b, c; - a = vec_splats (__A[0]); - b = vec_splats (__B[0]); + __v2df __a, __b, __c; + __a = vec_splats (__A[0]); + __b = vec_splats (__B[0]); /* Not less than or equal is just greater than. 
*/ - c = (__v2df) vec_cmpge(a, b); - return (__m128d) _mm_setr_pd (c[0], __A[1]); + __c = (__v2df) vec_cmpge(__a, __b); + return (__m128d) _mm_setr_pd (__c[0], __A[1]); } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpngt_sd (__m128d __A, __m128d __B) { - __v2df a, b, c; - a = vec_splats (__A[0]); - b = vec_splats (__B[0]); + __v2df __a, __b, __c; + __a = vec_splats (__A[0]); + __b = vec_splats (__B[0]); /* Not greater than is just less than or equal. */ - c = (__v2df) vec_cmple(a, b); - return (__m128d) _mm_setr_pd (c[0], __A[1]); + __c = (__v2df) vec_cmple(__a, __b); + return (__m128d) _mm_setr_pd (__c[0], __A[1]); } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpnge_sd (__m128d __A, __m128d __B) { - __v2df a, b, c; - a = vec_splats (__A[0]); - b = vec_splats (__B[0]); + __v2df __a, __b, __c; + __a = vec_splats (__A[0]); + __b = vec_splats (__B[0]); /* Not greater than or equal is just less than. */ - c = (__v2df) vec_cmplt(a, b); - return (__m128d) _mm_setr_pd (c[0], __A[1]); + __c = (__v2df) vec_cmplt(__a, __b); + return (__m128d) _mm_setr_pd (__c[0], __A[1]); } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpord_sd (__m128d __A, __m128d __B) { - __v2df r; - r = (__v2df)_mm_cmpord_pd (vec_splats (__A[0]), vec_splats (__B[0])); - return (__m128d) _mm_setr_pd (r[0], ((__v2df)__A)[1]); + __v2df __r; + __r = (__v2df)_mm_cmpord_pd (vec_splats (__A[0]), vec_splats (__B[0])); + return (__m128d) _mm_setr_pd (__r[0], ((__v2df)__A)[1]); } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpunord_sd (__m128d __A, __m128d __B) { - __v2df r; - r = _mm_cmpunord_pd (vec_splats (__A[0]), vec_splats (__B[0])); - return (__m128d) _mm_setr_pd (r[0], __A[1]); + __v2df __r; + __r = _mm_cmpunord_pd (vec_splats (__A[0]), vec_splats (__B[0])); + return (__m128d) _mm_setr_pd (__r[0], __A[1]); } /* FIXME @@ -845,12 +845,12 @@ _mm_setzero_si128 (void) extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtepi32_pd (__m128i __A) { - __v2di val; + __v2di __val; /* For LE need to generate Vector Unpack Low Signed Word. Which is generated from unpackh. */ - val = (__v2di)vec_unpackh ((__v4si)__A); + __val = (__v2di)vec_unpackh ((__v4si)__A); - return (__m128d)vec_ctf (val, 0); + return (__m128d)vec_ctf (__val, 0); } #endif @@ -863,116 +863,116 @@ _mm_cvtepi32_ps (__m128i __A) extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtpd_epi32 (__m128d __A) { - __v2df rounded = vec_rint (__A); - __v4si result, temp; - const __v4si vzero = + __v2df __rounded = vec_rint (__A); + __v4si __result, __temp; + const __v4si __vzero = { 0, 0, 0, 0 }; /* VSX Vector truncate Double-Precision to integer and Convert to Signed Integer Word format with Saturate. 
*/ __asm__( "xvcvdpsxws %x0,%x1" - : "=wa" (temp) - : "wa" (rounded) + : "=wa" (__temp) + : "wa" (__rounded) : ); #ifdef _ARCH_PWR8 #ifdef __LITTLE_ENDIAN__ - temp = vec_mergeo (temp, temp); + __temp = vec_mergeo (__temp, __temp); #else - temp = vec_mergee (temp, temp); + __temp = vec_mergee (__temp, __temp); #endif - result = (__v4si) vec_vpkudum ((__vector long long) temp, - (__vector long long) vzero); + __result = (__v4si) vec_vpkudum ((__vector long long) __temp, + (__vector long long) __vzero); #else { - const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, + const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f }; - result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm); + __result = (__v4si) vec_perm ((__v16qu) __temp, (__v16qu) __vzero, __pkperm); } #endif - return (__m128i) result; + return (__m128i) __result; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtpd_pi32 (__m128d __A) { - __m128i result = _mm_cvtpd_epi32(__A); + __m128i __result = _mm_cvtpd_epi32(__A); - return (__m64) result[0]; + return (__m64) __result[0]; } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtpd_ps (__m128d __A) { - __v4sf result; - __v4si temp; - const __v4si vzero = { 0, 0, 0, 0 }; + __v4sf __result; + __v4si __temp; + const __v4si __vzero = { 0, 0, 0, 0 }; __asm__( "xvcvdpsp %x0,%x1" - : "=wa" (temp) + : "=wa" (__temp) : "wa" (__A) : ); #ifdef _ARCH_PWR8 #ifdef __LITTLE_ENDIAN__ - temp = vec_mergeo (temp, temp); + __temp = vec_mergeo (__temp, __temp); #else - temp = vec_mergee (temp, temp); + __temp = vec_mergee (__temp, __temp); #endif - result = (__v4sf) vec_vpkudum ((__vector long long) temp, - (__vector long long) vzero); + __result = (__v4sf) vec_vpkudum ((__vector long long) __temp, + (__vector long long) __vzero); #else { - const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, + const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f }; - result = (__v4sf) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm); + __result = (__v4sf) vec_perm ((__v16qu) __temp, (__v16qu) __vzero, __pkperm); } #endif - return ((__m128)result); + return ((__m128)__result); } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvttpd_epi32 (__m128d __A) { - __v4si result; - __v4si temp; - const __v4si vzero = { 0, 0, 0, 0 }; + __v4si __result; + __v4si __temp; + const __v4si __vzero = { 0, 0, 0, 0 }; /* VSX Vector truncate Double-Precision to integer and Convert to Signed Integer Word format with Saturate. 
*/ __asm__( "xvcvdpsxws %x0,%x1" - : "=wa" (temp) + : "=wa" (__temp) : "wa" (__A) : ); #ifdef _ARCH_PWR8 #ifdef __LITTLE_ENDIAN__ - temp = vec_mergeo (temp, temp); + __temp = vec_mergeo (__temp, __temp); #else - temp = vec_mergee (temp, temp); + __temp = vec_mergee (__temp, __temp); #endif - result = (__v4si) vec_vpkudum ((__vector long long) temp, - (__vector long long) vzero); + __result = (__v4si) vec_vpkudum ((__vector long long) __temp, + (__vector long long) __vzero); #else { - const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, + const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f }; - result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm); + __result = (__v4si) vec_perm ((__v16qu) __temp, (__v16qu) __vzero, __pkperm); } #endif - return ((__m128i) result); + return ((__m128i) __result); } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvttpd_pi32 (__m128d __A) { - __m128i result = _mm_cvttpd_epi32 (__A); + __m128i __result = _mm_cvttpd_epi32 (__A); - return (__m64) result[0]; + return (__m64) __result[0]; } extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -985,35 +985,35 @@ _mm_cvtsi128_si32 (__m128i __A) extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtpi32_pd (__m64 __A) { - __v4si temp; - __v2di tmp2; - __v2df result; + __v4si __temp; + __v2di __tmp2; + __v2df __result; - temp = (__v4si)vec_splats (__A); - tmp2 = (__v2di)vec_unpackl (temp); - result = vec_ctf ((__vector signed long long) tmp2, 0); - return (__m128d)result; + __temp = (__v4si)vec_splats (__A); + __tmp2 = (__v2di)vec_unpackl (__temp); + __result = vec_ctf ((__vector signed long long) __tmp2, 0); + return (__m128d)__result; } #endif extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtps_epi32 (__m128 __A) { - __v4sf rounded; - __v4si result; + __v4sf __rounded; + __v4si __result; - rounded = vec_rint((__v4sf) __A); - result = vec_cts (rounded, 0); - return (__m128i) result; + __rounded = vec_rint((__v4sf) __A); + __result = vec_cts (__rounded, 0); + return (__m128i) __result; } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvttps_epi32 (__m128 __A) { - __v4si result; + __v4si __result; - result = vec_cts ((__v4sf) __A, 0); - return (__m128i) result; + __result = vec_cts ((__v4sf) __A, 0); + return (__m128i) __result; } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1025,48 +1025,48 @@ _mm_cvtps_pd (__m128 __A) #else /* Otherwise the compiler is not current and so need to generate the equivalent code. */ - __v4sf a = (__v4sf)__A; - __v4sf temp; - __v2df result; + __v4sf __a = (__v4sf)__A; + __v4sf __temp; + __v2df __result; #ifdef __LITTLE_ENDIAN__ /* The input float values are in elements {[0], [1]} but the convert instruction needs them in elements {[1], [3]}, So we use two shift left double vector word immediates to get the elements lined up. */ - temp = __builtin_vsx_xxsldwi (a, a, 3); - temp = __builtin_vsx_xxsldwi (a, temp, 2); + __temp = __builtin_vsx_xxsldwi (__a, __a, 3); + __temp = __builtin_vsx_xxsldwi (__a, __temp, 2); #else /* The input float values are in elements {[0], [1]} but the convert instruction needs them in elements {[0], [2]}, So we use two shift left double vector word immediates to get the elements lined up. 
*/ - temp = vec_vmrghw (a, a); + __temp = vec_vmrghw (__a, __a); #endif __asm__( " xvcvspdp %x0,%x1" - : "=wa" (result) - : "wa" (temp) + : "=wa" (__result) + : "wa" (__temp) : ); - return (__m128d) result; + return (__m128d) __result; #endif } extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtsd_si32 (__m128d __A) { - __v2df rounded = vec_rint((__v2df) __A); - int result = ((__v2df)rounded)[0]; + __v2df __rounded = vec_rint((__v2df) __A); + int __result = ((__v2df)__rounded)[0]; - return result; + return __result; } /* Intel intrinsic. */ extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtsd_si64 (__m128d __A) { - __v2df rounded = vec_rint ((__v2df) __A ); - long long result = ((__v2df) rounded)[0]; + __v2df __rounded = vec_rint ((__v2df) __A ); + long long __result = ((__v2df) __rounded)[0]; - return result; + return __result; } /* Microsoft intrinsic. */ @@ -1079,18 +1079,18 @@ _mm_cvtsd_si64x (__m128d __A) extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvttsd_si32 (__m128d __A) { - int result = ((__v2df)__A)[0]; + int __result = ((__v2df)__A)[0]; - return result; + return __result; } /* Intel intrinsic. */ extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvttsd_si64 (__m128d __A) { - long long result = ((__v2df)__A)[0]; + long long __result = ((__v2df)__A)[0]; - return result; + return __result; } /* Microsoft intrinsic. */ @@ -1103,46 +1103,46 @@ _mm_cvttsd_si64x (__m128d __A) extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtsd_ss (__m128 __A, __m128d __B) { - __v4sf result = (__v4sf)__A; + __v4sf __result = (__v4sf)__A; #ifdef __LITTLE_ENDIAN__ - __v4sf temp_s; + __v4sf __temp_s; /* Copy double element[0] to element [1] for conversion. */ - __v2df temp_b = vec_splat((__v2df)__B, 0); + __v2df __temp_b = vec_splat((__v2df)__B, 0); /* Pre-rotate __A left 3 (logically right 1) elements. */ - result = __builtin_vsx_xxsldwi (result, result, 3); + __result = __builtin_vsx_xxsldwi (__result, __result, 3); /* Convert double to single float scalar in a vector. */ __asm__( "xscvdpsp %x0,%x1" - : "=wa" (temp_s) - : "wa" (temp_b) + : "=wa" (__temp_s) + : "wa" (__temp_b) : ); /* Shift the resulting scalar into vector element [0]. */ - result = __builtin_vsx_xxsldwi (result, temp_s, 1); + __result = __builtin_vsx_xxsldwi (__result, __temp_s, 1); #else - result [0] = ((__v2df)__B)[0]; + __result [0] = ((__v2df)__B)[0]; #endif - return (__m128) result; + return (__m128) __result; } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtsi32_sd (__m128d __A, int __B) { - __v2df result = (__v2df)__A; - double db = __B; - result [0] = db; - return (__m128d)result; + __v2df __result = (__v2df)__A; + double __db = __B; + __result [0] = __db; + return (__m128d)__result; } /* Intel intrinsic. */ extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtsi64_sd (__m128d __A, long long __B) { - __v2df result = (__v2df)__A; - double db = __B; - result [0] = db; - return (__m128d)result; + __v2df __result = (__v2df)__A; + double __db = __B; + __result [0] = __db; + return (__m128d)__result; } /* Microsoft intrinsic. */ @@ -1157,45 +1157,45 @@ _mm_cvtss_sd (__m128d __A, __m128 __B) { #ifdef __LITTLE_ENDIAN__ /* Use splat to move element [0] into position for the convert. 
*/ - __v4sf temp = vec_splat ((__v4sf)__B, 0); - __v2df res; + __v4sf __temp = vec_splat ((__v4sf)__B, 0); + __v2df __res; /* Convert single float scalar to double in a vector. */ __asm__( "xscvspdp %x0,%x1" - : "=wa" (res) - : "wa" (temp) + : "=wa" (__res) + : "wa" (__temp) : ); - return (__m128d) vec_mergel (res, (__v2df)__A); + return (__m128d) vec_mergel (__res, (__v2df)__A); #else - __v2df res = (__v2df)__A; - res [0] = ((__v4sf)__B) [0]; - return (__m128d) res; + __v2df __res = (__v2df)__A; + __res [0] = ((__v4sf)__B) [0]; + return (__m128d) __res; #endif } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) { - __vector double result; - const int litmsk = __mask & 0x3; + __vector double __result; + const int __litmsk = __mask & 0x3; - if (litmsk == 0) - result = vec_mergeh (__A, __B); + if (__litmsk == 0) + __result = vec_mergeh (__A, __B); #if __GNUC__ < 6 - else if (litmsk == 1) - result = vec_xxpermdi (__B, __A, 2); - else if (litmsk == 2) - result = vec_xxpermdi (__B, __A, 1); + else if (__litmsk == 1) + __result = vec_xxpermdi (__B, __A, 2); + else if (__litmsk == 2) + __result = vec_xxpermdi (__B, __A, 1); #else - else if (litmsk == 1) - result = vec_xxpermdi (__A, __B, 2); - else if (litmsk == 2) - result = vec_xxpermdi (__A, __B, 1); + else if (__litmsk == 1) + __result = vec_xxpermdi (__A, __B, 2); + else if (__litmsk == 2) + __result = vec_xxpermdi (__A, __B, 1); #endif else - result = vec_mergel (__A, __B); + __result = vec_mergel (__A, __B); - return result; + return __result; } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1213,17 +1213,17 @@ _mm_unpacklo_pd (__m128d __A, __m128d __B) extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_loadh_pd (__m128d __A, double const *__B) { - __v2df result = (__v2df)__A; - result [1] = *__B; - return (__m128d)result; + __v2df __result = (__v2df)__A; + __result [1] = *__B; + return (__m128d)__result; } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_loadl_pd (__m128d __A, double const *__B) { - __v2df result = (__v2df)__A; - result [0] = *__B; - return (__m128d)result; + __v2df __result = (__v2df)__A; + __result [0] = *__B; + return (__m128d)__result; } #ifdef _ARCH_PWR8 @@ -1236,8 +1236,8 @@ _mm_movemask_pd (__m128d __A) #ifdef _ARCH_PWR10 return vec_extractm ((__v2du) __A); #else - __vector unsigned long long result; - static const __vector unsigned int perm_mask = + __vector unsigned long long __result; + static const __vector unsigned int __perm_mask = { #ifdef __LITTLE_ENDIAN__ 0x80800040, 0x80808080, 0x80808080, 0x80808080 @@ -1246,14 +1246,14 @@ _mm_movemask_pd (__m128d __A) #endif }; - result = ((__vector unsigned long long) + __result = ((__vector unsigned long long) vec_vbpermq ((__vector unsigned char) __A, - (__vector unsigned char) perm_mask)); + (__vector unsigned char) __perm_mask)); #ifdef __LITTLE_ENDIAN__ - return result[1]; + return __result[1]; #else - return result[0]; + return __result[0]; #endif #endif /* !_ARCH_PWR10 */ } @@ -1426,17 +1426,17 @@ _mm_subs_epu16 (__m128i __A, __m128i __B) extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_madd_epi16 (__m128i __A, __m128i __B) { - __vector signed int zero = {0, 0, 0, 0}; + __vector signed int __zero = {0, 0, 0, 0}; - return (__m128i) vec_vmsumshm ((__v8hi)__A, (__v8hi)__B, zero); + 
return (__m128i) vec_vmsumshm ((__v8hi)__A, (__v8hi)__B, __zero); } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mulhi_epi16 (__m128i __A, __m128i __B) { - __vector signed int w0, w1; + __vector signed int __w0, __w1; - __vector unsigned char xform1 = { + __vector unsigned char __xform1 = { #ifdef __LITTLE_ENDIAN__ 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F @@ -1446,9 +1446,9 @@ _mm_mulhi_epi16 (__m128i __A, __m128i __B) #endif }; - w0 = vec_vmulesh ((__v8hi)__A, (__v8hi)__B); - w1 = vec_vmulosh ((__v8hi)__A, (__v8hi)__B); - return (__m128i) vec_perm (w0, w1, xform1); + __w0 = vec_vmulesh ((__v8hi)__A, (__v8hi)__B); + __w1 = vec_vmulosh ((__v8hi)__A, (__v8hi)__B); + return (__m128i) vec_perm (__w0, __w1, __xform1); } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1460,10 +1460,10 @@ _mm_mullo_epi16 (__m128i __A, __m128i __B) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mul_su32 (__m64 __A, __m64 __B) { - unsigned int a = __A; - unsigned int b = __B; + unsigned int __a = __A; + unsigned int __b = __B; - return ((__m64)a * (__m64)b); + return ((__m64)__a * (__m64)__b); } #ifdef _ARCH_PWR8 @@ -1471,24 +1471,24 @@ extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __arti _mm_mul_epu32 (__m128i __A, __m128i __B) { #if __GNUC__ < 8 - __v2du result; + __v2du __result; #ifdef __LITTLE_ENDIAN__ /* VMX Vector Multiply Odd Unsigned Word. */ __asm__( "vmulouw %0,%1,%2" - : "=v" (result) + : "=v" (__result) : "v" (__A), "v" (__B) : ); #else /* VMX Vector Multiply Even Unsigned Word. */ __asm__( "vmuleuw %0,%1,%2" - : "=v" (result) + : "=v" (__result) : "v" (__A), "v" (__B) : ); #endif - return (__m128i) result; + return (__m128i) __result; #else return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B); #endif @@ -1498,122 +1498,122 @@ _mm_mul_epu32 (__m128i __A, __m128i __B) extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_slli_epi16 (__m128i __A, int __B) { - __v8hu lshift; - __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 }; + __v8hu __lshift; + __v8hi __result = { 0, 0, 0, 0, 0, 0, 0, 0 }; if (__B >= 0 && __B < 16) { if (__builtin_constant_p(__B)) - lshift = (__v8hu) vec_splat_s16(__B); + __lshift = (__v8hu) vec_splat_s16(__B); else - lshift = vec_splats ((unsigned short) __B); + __lshift = vec_splats ((unsigned short) __B); - result = vec_sl ((__v8hi) __A, lshift); + __result = vec_sl ((__v8hi) __A, __lshift); } - return (__m128i) result; + return (__m128i) __result; } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_slli_epi32 (__m128i __A, int __B) { - __v4su lshift; - __v4si result = { 0, 0, 0, 0 }; + __v4su __lshift; + __v4si __result = { 0, 0, 0, 0 }; if (__B >= 0 && __B < 32) { if (__builtin_constant_p(__B) && __B < 16) - lshift = (__v4su) vec_splat_s32(__B); + __lshift = (__v4su) vec_splat_s32(__B); else - lshift = vec_splats ((unsigned int) __B); + __lshift = vec_splats ((unsigned int) __B); - result = vec_sl ((__v4si) __A, lshift); + __result = vec_sl ((__v4si) __A, __lshift); } - return (__m128i) result; + return (__m128i) __result; } #ifdef _ARCH_PWR8 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_slli_epi64 (__m128i __A, int __B) { - __v2du lshift; - __v2di result = { 0, 0 }; + __v2du __lshift; + __v2di __result = { 0, 0 }; if (__B >= 0 && __B 
< 64) { if (__builtin_constant_p(__B) && __B < 16) - lshift = (__v2du) vec_splat_s32(__B); + __lshift = (__v2du) vec_splat_s32(__B); else - lshift = (__v2du) vec_splats ((unsigned int) __B); + __lshift = (__v2du) vec_splats ((unsigned int) __B); - result = vec_sl ((__v2di) __A, lshift); + __result = vec_sl ((__v2di) __A, __lshift); } - return (__m128i) result; + return (__m128i) __result; } #endif extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_srai_epi16 (__m128i __A, int __B) { - __v8hu rshift = { 15, 15, 15, 15, 15, 15, 15, 15 }; - __v8hi result; + __v8hu __rshift = { 15, 15, 15, 15, 15, 15, 15, 15 }; + __v8hi __result; if (__B < 16) { if (__builtin_constant_p(__B)) - rshift = (__v8hu) vec_splat_s16(__B); + __rshift = (__v8hu) vec_splat_s16(__B); else - rshift = vec_splats ((unsigned short) __B); + __rshift = vec_splats ((unsigned short) __B); } - result = vec_sra ((__v8hi) __A, rshift); + __result = vec_sra ((__v8hi) __A, __rshift); - return (__m128i) result; + return (__m128i) __result; } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_srai_epi32 (__m128i __A, int __B) { - __v4su rshift = { 31, 31, 31, 31 }; - __v4si result; + __v4su __rshift = { 31, 31, 31, 31 }; + __v4si __result; if (__B < 32) { if (__builtin_constant_p(__B)) { if (__B < 16) - rshift = (__v4su) vec_splat_s32(__B); + __rshift = (__v4su) vec_splat_s32(__B); else - rshift = (__v4su) vec_splats((unsigned int)__B); + __rshift = (__v4su) vec_splats((unsigned int)__B); } else - rshift = vec_splats ((unsigned int) __B); + __rshift = vec_splats ((unsigned int) __B); } - result = vec_sra ((__v4si) __A, rshift); + __result = vec_sra ((__v4si) __A, __rshift); - return (__m128i) result; + return (__m128i) __result; } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_bslli_si128 (__m128i __A, const int __N) { - __v16qu result; - const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + __v16qu __result; + const __v16qu __zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; if (__N < 16) - result = vec_sld ((__v16qu) __A, zeros, __N); + __result = vec_sld ((__v16qu) __A, __zeros, __N); else - result = zeros; + __result = __zeros; - return (__m128i) result; + return (__m128i) __result; } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_bsrli_si128 (__m128i __A, const int __N) { - __v16qu result; - const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + __v16qu __result; + const __v16qu __zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; if (__N < 16) #ifdef __LITTLE_ENDIAN__ @@ -1621,21 +1621,21 @@ _mm_bsrli_si128 (__m128i __A, const int __N) /* Would like to use Vector Shift Left Double by Octet Immediate here to use the immediate form and avoid load of __N * 8 value into a separate VR. 
*/ - result = vec_sld (zeros, (__v16qu) __A, (16 - __N)); + __result = vec_sld (__zeros, (__v16qu) __A, (16 - __N)); else #endif { - __v16qu shift = vec_splats((unsigned char)(__N*8)); + __v16qu __shift = vec_splats((unsigned char)(__N*8)); #ifdef __LITTLE_ENDIAN__ - result = vec_sro ((__v16qu)__A, shift); + __result = vec_sro ((__v16qu)__A, __shift); #else - result = vec_slo ((__v16qu)__A, shift); + __result = vec_slo ((__v16qu)__A, __shift); #endif } else - result = zeros; + __result = __zeros; - return (__m128i) result; + return (__m128i) __result; } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1647,239 +1647,239 @@ _mm_srli_si128 (__m128i __A, const int __N) extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_slli_si128 (__m128i __A, const int _imm5) { - __v16qu result; - const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + __v16qu __result; + const __v16qu __zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; if (_imm5 < 16) #ifdef __LITTLE_ENDIAN__ - result = vec_sld ((__v16qu) __A, zeros, _imm5); + __result = vec_sld ((__v16qu) __A, __zeros, _imm5); #else - result = vec_sld (zeros, (__v16qu) __A, (16 - _imm5)); + __result = vec_sld (__zeros, (__v16qu) __A, (16 - _imm5)); #endif else - result = zeros; + __result = __zeros; - return (__m128i) result; + return (__m128i) __result; } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_srli_epi16 (__m128i __A, int __B) { - __v8hu rshift; - __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 }; + __v8hu __rshift; + __v8hi __result = { 0, 0, 0, 0, 0, 0, 0, 0 }; if (__B < 16) { if (__builtin_constant_p(__B)) - rshift = (__v8hu) vec_splat_s16(__B); + __rshift = (__v8hu) vec_splat_s16(__B); else - rshift = vec_splats ((unsigned short) __B); + __rshift = vec_splats ((unsigned short) __B); - result = vec_sr ((__v8hi) __A, rshift); + __result = vec_sr ((__v8hi) __A, __rshift); } - return (__m128i) result; + return (__m128i) __result; } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_srli_epi32 (__m128i __A, int __B) { - __v4su rshift; - __v4si result = { 0, 0, 0, 0 }; + __v4su __rshift; + __v4si __result = { 0, 0, 0, 0 }; if (__B < 32) { if (__builtin_constant_p(__B)) { if (__B < 16) - rshift = (__v4su) vec_splat_s32(__B); + __rshift = (__v4su) vec_splat_s32(__B); else - rshift = (__v4su) vec_splats((unsigned int)__B); + __rshift = (__v4su) vec_splats((unsigned int)__B); } else - rshift = vec_splats ((unsigned int) __B); + __rshift = vec_splats ((unsigned int) __B); - result = vec_sr ((__v4si) __A, rshift); + __result = vec_sr ((__v4si) __A, __rshift); } - return (__m128i) result; + return (__m128i) __result; } #ifdef _ARCH_PWR8 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_srli_epi64 (__m128i __A, int __B) { - __v2du rshift; - __v2di result = { 0, 0 }; + __v2du __rshift; + __v2di __result = { 0, 0 }; if (__B < 64) { if (__builtin_constant_p(__B)) { if (__B < 16) - rshift = (__v2du) vec_splat_s32(__B); + __rshift = (__v2du) vec_splat_s32(__B); else - rshift = (__v2du) vec_splats((unsigned long long)__B); + __rshift = (__v2du) vec_splats((unsigned long long)__B); } else - rshift = (__v2du) vec_splats ((unsigned int) __B); + __rshift = (__v2du) vec_splats ((unsigned int) __B); - result = vec_sr ((__v2di) __A, rshift); + __result = vec_sr ((__v2di) __A, __rshift); } - return (__m128i) result; + return 
(__m128i) __result; } #endif extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sll_epi16 (__m128i __A, __m128i __B) { - __v8hu lshift; - __vector __bool short shmask; - const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 }; - __v8hu result; + __v8hu __lshift; + __vector __bool short __shmask; + const __v8hu __shmax = { 15, 15, 15, 15, 15, 15, 15, 15 }; + __v8hu __result; #ifdef __LITTLE_ENDIAN__ - lshift = vec_splat ((__v8hu) __B, 0); + __lshift = vec_splat ((__v8hu) __B, 0); #else - lshift = vec_splat ((__v8hu) __B, 3); + __lshift = vec_splat ((__v8hu) __B, 3); #endif - shmask = vec_cmple (lshift, shmax); - result = vec_sl ((__v8hu) __A, lshift); - result = vec_sel ((__v8hu) shmask, result, shmask); + __shmask = vec_cmple (__lshift, __shmax); + __result = vec_sl ((__v8hu) __A, __lshift); + __result = vec_sel ((__v8hu) __shmask, __result, __shmask); - return (__m128i) result; + return (__m128i) __result; } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sll_epi32 (__m128i __A, __m128i __B) { - __v4su lshift; - __vector __bool int shmask; - const __v4su shmax = { 32, 32, 32, 32 }; - __v4su result; + __v4su __lshift; + __vector __bool int __shmask; + const __v4su __shmax = { 32, 32, 32, 32 }; + __v4su __result; #ifdef __LITTLE_ENDIAN__ - lshift = vec_splat ((__v4su) __B, 0); + __lshift = vec_splat ((__v4su) __B, 0); #else - lshift = vec_splat ((__v4su) __B, 1); + __lshift = vec_splat ((__v4su) __B, 1); #endif - shmask = vec_cmplt (lshift, shmax); - result = vec_sl ((__v4su) __A, lshift); - result = vec_sel ((__v4su) shmask, result, shmask); + __shmask = vec_cmplt (__lshift, __shmax); + __result = vec_sl ((__v4su) __A, __lshift); + __result = vec_sel ((__v4su) __shmask, __result, __shmask); - return (__m128i) result; + return (__m128i) __result; } #ifdef _ARCH_PWR8 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sll_epi64 (__m128i __A, __m128i __B) { - __v2du lshift; - __vector __bool long long shmask; - const __v2du shmax = { 64, 64 }; - __v2du result; + __v2du __lshift; + __vector __bool long long __shmask; + const __v2du __shmax = { 64, 64 }; + __v2du __result; - lshift = vec_splat ((__v2du) __B, 0); - shmask = vec_cmplt (lshift, shmax); - result = vec_sl ((__v2du) __A, lshift); - result = vec_sel ((__v2du) shmask, result, shmask); + __lshift = vec_splat ((__v2du) __B, 0); + __shmask = vec_cmplt (__lshift, __shmax); + __result = vec_sl ((__v2du) __A, __lshift); + __result = vec_sel ((__v2du) __shmask, __result, __shmask); - return (__m128i) result; + return (__m128i) __result; } #endif extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sra_epi16 (__m128i __A, __m128i __B) { - const __v8hu rshmax = { 15, 15, 15, 15, 15, 15, 15, 15 }; - __v8hu rshift; - __v8hi result; + const __v8hu __rshmax = { 15, 15, 15, 15, 15, 15, 15, 15 }; + __v8hu __rshift; + __v8hi __result; #ifdef __LITTLE_ENDIAN__ - rshift = vec_splat ((__v8hu)__B, 0); + __rshift = vec_splat ((__v8hu)__B, 0); #else - rshift = vec_splat ((__v8hu)__B, 3); + __rshift = vec_splat ((__v8hu)__B, 3); #endif - rshift = vec_min (rshift, rshmax); - result = vec_sra ((__v8hi) __A, rshift); + __rshift = vec_min (__rshift, __rshmax); + __result = vec_sra ((__v8hi) __A, __rshift); - return (__m128i) result; + return (__m128i) __result; } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sra_epi32 (__m128i __A, __m128i 
__B) { - const __v4su rshmax = { 31, 31, 31, 31 }; - __v4su rshift; - __v4si result; + const __v4su __rshmax = { 31, 31, 31, 31 }; + __v4su __rshift; + __v4si __result; #ifdef __LITTLE_ENDIAN__ - rshift = vec_splat ((__v4su)__B, 0); + __rshift = vec_splat ((__v4su)__B, 0); #else - rshift = vec_splat ((__v4su)__B, 1); + __rshift = vec_splat ((__v4su)__B, 1); #endif - rshift = vec_min (rshift, rshmax); - result = vec_sra ((__v4si) __A, rshift); + __rshift = vec_min (__rshift, __rshmax); + __result = vec_sra ((__v4si) __A, __rshift); - return (__m128i) result; + return (__m128i) __result; } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_srl_epi16 (__m128i __A, __m128i __B) { - __v8hu rshift; - __vector __bool short shmask; - const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 }; - __v8hu result; + __v8hu __rshift; + __vector __bool short __shmask; + const __v8hu __shmax = { 15, 15, 15, 15, 15, 15, 15, 15 }; + __v8hu __result; #ifdef __LITTLE_ENDIAN__ - rshift = vec_splat ((__v8hu) __B, 0); + __rshift = vec_splat ((__v8hu) __B, 0); #else - rshift = vec_splat ((__v8hu) __B, 3); + __rshift = vec_splat ((__v8hu) __B, 3); #endif - shmask = vec_cmple (rshift, shmax); - result = vec_sr ((__v8hu) __A, rshift); - result = vec_sel ((__v8hu) shmask, result, shmask); + __shmask = vec_cmple (__rshift, __shmax); + __result = vec_sr ((__v8hu) __A, __rshift); + __result = vec_sel ((__v8hu) __shmask, __result, __shmask); - return (__m128i) result; + return (__m128i) __result; } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_srl_epi32 (__m128i __A, __m128i __B) { - __v4su rshift; - __vector __bool int shmask; - const __v4su shmax = { 32, 32, 32, 32 }; - __v4su result; + __v4su __rshift; + __vector __bool int __shmask; + const __v4su __shmax = { 32, 32, 32, 32 }; + __v4su __result; #ifdef __LITTLE_ENDIAN__ - rshift = vec_splat ((__v4su) __B, 0); + __rshift = vec_splat ((__v4su) __B, 0); #else - rshift = vec_splat ((__v4su) __B, 1); + __rshift = vec_splat ((__v4su) __B, 1); #endif - shmask = vec_cmplt (rshift, shmax); - result = vec_sr ((__v4su) __A, rshift); - result = vec_sel ((__v4su) shmask, result, shmask); + __shmask = vec_cmplt (__rshift, __shmax); + __result = vec_sr ((__v4su) __A, __rshift); + __result = vec_sel ((__v4su) __shmask, __result, __shmask); - return (__m128i) result; + return (__m128i) __result; } #ifdef _ARCH_PWR8 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_srl_epi64 (__m128i __A, __m128i __B) { - __v2du rshift; - __vector __bool long long shmask; - const __v2du shmax = { 64, 64 }; - __v2du result; + __v2du __rshift; + __vector __bool long long __shmask; + const __v2du __shmax = { 64, 64 }; + __v2du __result; - rshift = vec_splat ((__v2du) __B, 0); - shmask = vec_cmplt (rshift, shmax); - result = vec_sr ((__v2du) __A, rshift); - result = vec_sel ((__v2du) shmask, result, shmask); + __rshift = vec_splat ((__v2du) __B, 0); + __shmask = vec_cmplt (__rshift, __shmax); + __result = vec_sr ((__v2du) __A, __rshift); + __result = vec_sel ((__v2du) __shmask, __result, __shmask); - return (__m128i) result; + return (__m128i) __result; } #endif @@ -1994,11 +1994,11 @@ _mm_extract_epi16 (__m128i const __A, int const __N) extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_insert_epi16 (__m128i const __A, int const __D, int const __N) { - __v8hi result = (__v8hi)__A; + __v8hi __result = (__v8hi)__A; - result [(__N 
& 7)] = __D; + __result [(__N & 7)] = __D; - return (__m128i) result; + return (__m128i) __result; } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -2037,21 +2037,21 @@ _mm_movemask_epi8 (__m128i __A) #ifdef _ARCH_PWR10 return vec_extractm ((__v16qu) __A); #else - __vector unsigned long long result; - static const __vector unsigned char perm_mask = + __vector unsigned long long __result; + static const __vector unsigned char __perm_mask = { 0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40, 0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00 }; - result = ((__vector unsigned long long) + __result = ((__vector unsigned long long) vec_vbpermq ((__vector unsigned char) __A, - (__vector unsigned char) perm_mask)); + (__vector unsigned char) __perm_mask)); #ifdef __LITTLE_ENDIAN__ - return result[1]; + return __result[1]; #else - return result[0]; + return __result[0]; #endif #endif /* !_ARCH_PWR10 */ } @@ -2060,8 +2060,8 @@ _mm_movemask_epi8 (__m128i __A) extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mulhi_epu16 (__m128i __A, __m128i __B) { - __v4su w0, w1; - __v16qu xform1 = { + __v4su __w0, __w1; + __v16qu __xform1 = { #ifdef __LITTLE_ENDIAN__ 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F @@ -2071,19 +2071,19 @@ _mm_mulhi_epu16 (__m128i __A, __m128i __B) #endif }; - w0 = vec_vmuleuh ((__v8hu)__A, (__v8hu)__B); - w1 = vec_vmulouh ((__v8hu)__A, (__v8hu)__B); - return (__m128i) vec_perm (w0, w1, xform1); + __w0 = vec_vmuleuh ((__v8hu)__A, (__v8hu)__B); + __w1 = vec_vmulouh ((__v8hu)__A, (__v8hu)__B); + return (__m128i) vec_perm (__w0, __w1, __xform1); } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_shufflehi_epi16 (__m128i __A, const int __mask) { - unsigned long element_selector_98 = __mask & 0x03; - unsigned long element_selector_BA = (__mask >> 2) & 0x03; - unsigned long element_selector_DC = (__mask >> 4) & 0x03; - unsigned long element_selector_FE = (__mask >> 6) & 0x03; - static const unsigned short permute_selectors[4] = + unsigned long __element_selector_98 = __mask & 0x03; + unsigned long __element_selector_BA = (__mask >> 2) & 0x03; + unsigned long __element_selector_DC = (__mask >> 4) & 0x03; + unsigned long __element_selector_FE = (__mask >> 6) & 0x03; + static const unsigned short __permute_selectors[4] = { #ifdef __LITTLE_ENDIAN__ 0x0908, 0x0B0A, 0x0D0C, 0x0F0E @@ -2091,33 +2091,33 @@ _mm_shufflehi_epi16 (__m128i __A, const int __mask) 0x0809, 0x0A0B, 0x0C0D, 0x0E0F #endif }; - __v2du pmask = + __v2du __pmask = #ifdef __LITTLE_ENDIAN__ { 0x1716151413121110UL, 0UL}; #else { 0x1011121314151617UL, 0UL}; #endif - __m64_union t; - __v2du a, r; + __m64_union __t; + __v2du __a, __r; - t.as_short[0] = permute_selectors[element_selector_98]; - t.as_short[1] = permute_selectors[element_selector_BA]; - t.as_short[2] = permute_selectors[element_selector_DC]; - t.as_short[3] = permute_selectors[element_selector_FE]; - pmask[1] = t.as_m64; - a = (__v2du)__A; - r = vec_perm (a, a, (__vector unsigned char)pmask); - return (__m128i) r; + __t.as_short[0] = __permute_selectors[__element_selector_98]; + __t.as_short[1] = __permute_selectors[__element_selector_BA]; + __t.as_short[2] = __permute_selectors[__element_selector_DC]; + __t.as_short[3] = __permute_selectors[__element_selector_FE]; + __pmask[1] = __t.as_m64; + __a = (__v2du)__A; + __r = vec_perm (__a, __a, (__vector unsigned char)__pmask); + return (__m128i) 
__r; } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_shufflelo_epi16 (__m128i __A, const int __mask) { - unsigned long element_selector_10 = __mask & 0x03; - unsigned long element_selector_32 = (__mask >> 2) & 0x03; - unsigned long element_selector_54 = (__mask >> 4) & 0x03; - unsigned long element_selector_76 = (__mask >> 6) & 0x03; - static const unsigned short permute_selectors[4] = + unsigned long __element_selector_10 = __mask & 0x03; + unsigned long __element_selector_32 = (__mask >> 2) & 0x03; + unsigned long __element_selector_54 = (__mask >> 4) & 0x03; + unsigned long __element_selector_76 = (__mask >> 6) & 0x03; + static const unsigned short __permute_selectors[4] = { #ifdef __LITTLE_ENDIAN__ 0x0100, 0x0302, 0x0504, 0x0706 @@ -2125,32 +2125,32 @@ _mm_shufflelo_epi16 (__m128i __A, const int __mask) 0x0001, 0x0203, 0x0405, 0x0607 #endif }; - __v2du pmask = + __v2du __pmask = #ifdef __LITTLE_ENDIAN__ { 0UL, 0x1f1e1d1c1b1a1918UL}; #else { 0UL, 0x18191a1b1c1d1e1fUL}; #endif - __m64_union t; - __v2du a, r; - t.as_short[0] = permute_selectors[element_selector_10]; - t.as_short[1] = permute_selectors[element_selector_32]; - t.as_short[2] = permute_selectors[element_selector_54]; - t.as_short[3] = permute_selectors[element_selector_76]; - pmask[0] = t.as_m64; - a = (__v2du)__A; - r = vec_perm (a, a, (__vector unsigned char)pmask); - return (__m128i) r; + __m64_union __t; + __v2du __a, __r; + __t.as_short[0] = __permute_selectors[__element_selector_10]; + __t.as_short[1] = __permute_selectors[__element_selector_32]; + __t.as_short[2] = __permute_selectors[__element_selector_54]; + __t.as_short[3] = __permute_selectors[__element_selector_76]; + __pmask[0] = __t.as_m64; + __a = (__v2du)__A; + __r = vec_perm (__a, __a, (__vector unsigned char)__pmask); + return (__m128i) __r; } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_shuffle_epi32 (__m128i __A, const int __mask) { - unsigned long element_selector_10 = __mask & 0x03; - unsigned long element_selector_32 = (__mask >> 2) & 0x03; - unsigned long element_selector_54 = (__mask >> 4) & 0x03; - unsigned long element_selector_76 = (__mask >> 6) & 0x03; - static const unsigned int permute_selectors[4] = + unsigned long __element_selector_10 = __mask & 0x03; + unsigned long __element_selector_32 = (__mask >> 2) & 0x03; + unsigned long __element_selector_54 = (__mask >> 4) & 0x03; + unsigned long __element_selector_76 = (__mask >> 6) & 0x03; + static const unsigned int __permute_selectors[4] = { #ifdef __LITTLE_ENDIAN__ 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C @@ -2158,26 +2158,26 @@ _mm_shuffle_epi32 (__m128i __A, const int __mask) 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F #endif }; - __v4su t; + __v4su __t; - t[0] = permute_selectors[element_selector_10]; - t[1] = permute_selectors[element_selector_32]; - t[2] = permute_selectors[element_selector_54] + 0x10101010; - t[3] = permute_selectors[element_selector_76] + 0x10101010; - return (__m128i)vec_perm ((__v4si) __A, (__v4si)__A, (__vector unsigned char)t); + __t[0] = __permute_selectors[__element_selector_10]; + __t[1] = __permute_selectors[__element_selector_32]; + __t[2] = __permute_selectors[__element_selector_54] + 0x10101010; + __t[3] = __permute_selectors[__element_selector_76] + 0x10101010; + return (__m128i)vec_perm ((__v4si) __A, (__v4si)__A, (__vector unsigned char)__t); } extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskmoveu_si128 
(__m128i __A, __m128i __B, char *__C) { - __v2du hibit = { 0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL}; - __v16qu mask, tmp; - __m128i_u *p = (__m128i_u*)__C; + __v2du __hibit = { 0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL}; + __v16qu __mask, __tmp; + __m128i_u *__p = (__m128i_u*)__C; - tmp = (__v16qu)_mm_loadu_si128(p); - mask = (__v16qu)vec_cmpgt ((__v16qu)__B, (__v16qu)hibit); - tmp = vec_sel (tmp, (__v16qu)__A, mask); - _mm_storeu_si128 (p, (__m128i)tmp); + __tmp = (__v16qu)_mm_loadu_si128(__p); + __mask = (__v16qu)vec_cmpgt ((__v16qu)__B, (__v16qu)__hibit); + __tmp = vec_sel (__tmp, (__v16qu)__A, __mask); + _mm_storeu_si128 (__p, (__m128i)__tmp); } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -2196,26 +2196,26 @@ _mm_avg_epu16 (__m128i __A, __m128i __B) extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sad_epu8 (__m128i __A, __m128i __B) { - __v16qu a, b; - __v16qu vabsdiff; - __v4si vsum; - const __v4su zero = { 0, 0, 0, 0 }; - __v4si result; + __v16qu __a, __b; + __v16qu __vabsdiff; + __v4si __vsum; + const __v4su __zero = { 0, 0, 0, 0 }; + __v4si __result; - a = (__v16qu) __A; - b = (__v16qu) __B; + __a = (__v16qu) __A; + __b = (__v16qu) __B; #ifndef _ARCH_PWR9 - __v16qu vmin = vec_min (a, b); - __v16qu vmax = vec_max (a, b); - vabsdiff = vec_sub (vmax, vmin); + __v16qu __vmin = vec_min (__a, __b); + __v16qu __vmax = vec_max (__a, __b); + __vabsdiff = vec_sub (__vmax, __vmin); #else - vabsdiff = vec_absd (a, b); + __vabsdiff = vec_absd (__a, __b); #endif /* Sum four groups of bytes into integers. */ - vsum = (__vector signed int) vec_sum4s (vabsdiff, zero); + __vsum = (__vector signed int) vec_sum4s (__vabsdiff, __zero); #ifdef __LITTLE_ENDIAN__ /* Sum across four integers with two integer results. */ - __asm__ ("vsum2sws %0,%1,%2" : "=v" (result) : "v" (vsum), "v" (zero)); + __asm__ ("vsum2sws %0,%1,%2" : "=v" (__result) : "v" (__vsum), "v" (__zero)); /* Note: vec_sum2s could be used here, but on little-endian, vector shifts are added that are not needed for this use-case. A vector shift to correctly position the 32-bit integer results @@ -2224,11 +2224,11 @@ _mm_sad_epu8 (__m128i __A, __m128i __B) integers ([1]|[0] and [3]|[2]). Thus, no shift is performed. */ #else /* Sum across four integers with two integer results. */ - result = vec_sum2s (vsum, (__vector signed int) zero); + __result = vec_sum2s (__vsum, (__vector signed int) __zero); /* Rotate the sums into the correct position. */ - result = vec_sld (result, result, 6); + __result = vec_sld (__result, __result, 6); #endif - return (__m128i) result; + return (__m128i) __result; } extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) diff --git a/gcc/config/rs6000/mm_malloc.h b/gcc/config/rs6000/mm_malloc.h index 3d2e09e..721f756 100644 --- a/gcc/config/rs6000/mm_malloc.h +++ b/gcc/config/rs6000/mm_malloc.h @@ -35,28 +35,28 @@ extern "C" int posix_memalign (void **, size_t, size_t) throw (); #endif static __inline void * -_mm_malloc (size_t size, size_t alignment) +_mm_malloc (size_t __size, size_t __alignment) { /* PowerPC64 ELF V2 ABI requires quadword alignment. */ - size_t vec_align = sizeof (__vector float); + size_t __vec_align = sizeof (__vector float); /* Linux GLIBC malloc alignment is at least 2 X ptr size. 
*/ - size_t malloc_align = (sizeof (void *) + sizeof (void *)); - void *ptr; - - if (alignment == malloc_align && alignment == vec_align) - return malloc (size); - if (alignment < vec_align) - alignment = vec_align; - if (posix_memalign (&ptr, alignment, size) == 0) - return ptr; + size_t __malloc_align = (sizeof (void *) + sizeof (void *)); + void *__ptr; + + if (__alignment == __malloc_align && __alignment == __vec_align) + return malloc (__size); + if (__alignment < __vec_align) + __alignment = __vec_align; + if (__posix_memalign (&__ptr, __alignment, __size) == 0) + return __ptr; else return NULL; } static __inline void -_mm_free (void * ptr) +_mm_free (void * __ptr) { - free (ptr); + free (__ptr); } #endif /* _MM_MALLOC_H_INCLUDED */ diff --git a/gcc/config/rs6000/mmintrin.h b/gcc/config/rs6000/mmintrin.h index da4f7d5..bf7f3b1 100644 --- a/gcc/config/rs6000/mmintrin.h +++ b/gcc/config/rs6000/mmintrin.h @@ -170,17 +170,17 @@ _mm_cvtsi64_si64x (__m64 __i) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_packs_pi16 (__m64 __m1, __m64 __m2) { - __vector signed short vm1; - __vector signed char vresult; + __vector signed short __vm1; + __vector signed char __vresult; - vm1 = (__vector signed short) (__vector unsigned long long) + __vm1 = (__vector signed short) (__vector unsigned long long) #ifdef __LITTLE_ENDIAN__ { __m1, __m2 }; #else { __m2, __m1 }; #endif - vresult = vec_packs (vm1, vm1); - return (__m64) ((__vector long long) vresult)[0]; + __vresult = vec_packs (__vm1, __vm1); + return (__m64) ((__vector long long) __vresult)[0]; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -195,17 +195,17 @@ _m_packsswb (__m64 __m1, __m64 __m2) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_packs_pi32 (__m64 __m1, __m64 __m2) { - __vector signed int vm1; - __vector signed short vresult; + __vector signed int __vm1; + __vector signed short __vresult; - vm1 = (__vector signed int) (__vector unsigned long long) + __vm1 = (__vector signed int) (__vector unsigned long long) #ifdef __LITTLE_ENDIAN__ { __m1, __m2 }; #else { __m2, __m1 }; #endif - vresult = vec_packs (vm1, vm1); - return (__m64) ((__vector long long) vresult)[0]; + __vresult = vec_packs (__vm1, __vm1); + return (__m64) ((__vector long long) __vresult)[0]; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -220,19 +220,19 @@ _m_packssdw (__m64 __m1, __m64 __m2) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_packs_pu16 (__m64 __m1, __m64 __m2) { - __vector unsigned char r; - __vector signed short vm1 = (__vector signed short) (__vector long long) + __vector unsigned char __r; + __vector signed short __vm1 = (__vector signed short) (__vector long long) #ifdef __LITTLE_ENDIAN__ { __m1, __m2 }; #else { __m2, __m1 }; #endif const __vector signed short __zero = { 0 }; - __vector __bool short __select = vec_cmplt (vm1, __zero); - r = vec_packs ((__vector unsigned short) vm1, (__vector unsigned short) vm1); - __vector __bool char packsel = vec_pack (__select, __select); - r = vec_sel (r, (const __vector unsigned char) __zero, packsel); - return (__m64) ((__vector long long) r)[0]; + __vector __bool short __select = vec_cmplt (__vm1, __zero); + __r = vec_packs ((__vector unsigned short) __vm1, (__vector unsigned short) __vm1); + __vector __bool char __packsel = vec_pack (__select, __select); + __r = vec_sel (__r, (const 
__vector unsigned char) __zero, __packsel); + return (__m64) ((__vector long long) __r)[0]; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -248,28 +248,28 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artifi _mm_unpackhi_pi8 (__m64 __m1, __m64 __m2) { #if _ARCH_PWR8 - __vector unsigned char a, b, c; + __vector unsigned char __a, __b, __c; - a = (__vector unsigned char)vec_splats (__m1); - b = (__vector unsigned char)vec_splats (__m2); - c = vec_mergel (a, b); - return (__m64) ((__vector long long) c)[1]; + __a = (__vector unsigned char)vec_splats (__m1); + __b = (__vector unsigned char)vec_splats (__m2); + __c = vec_mergel (__a, __b); + return (__m64) ((__vector long long) __c)[1]; #else - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_char[0] = m1.as_char[4]; - res.as_char[1] = m2.as_char[4]; - res.as_char[2] = m1.as_char[5]; - res.as_char[3] = m2.as_char[5]; - res.as_char[4] = m1.as_char[6]; - res.as_char[5] = m2.as_char[6]; - res.as_char[6] = m1.as_char[7]; - res.as_char[7] = m2.as_char[7]; + __res.as_char[0] = __mu1.as_char[4]; + __res.as_char[1] = __mu2.as_char[4]; + __res.as_char[2] = __mu1.as_char[5]; + __res.as_char[3] = __mu2.as_char[5]; + __res.as_char[4] = __mu1.as_char[6]; + __res.as_char[5] = __mu2.as_char[6]; + __res.as_char[6] = __mu1.as_char[7]; + __res.as_char[7] = __mu2.as_char[7]; - return (__m64) res.as_m64; + return (__m64) __res.as_m64; #endif } @@ -284,17 +284,17 @@ _m_punpckhbw (__m64 __m1, __m64 __m2) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_unpackhi_pi16 (__m64 __m1, __m64 __m2) { - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_short[0] = m1.as_short[2]; - res.as_short[1] = m2.as_short[2]; - res.as_short[2] = m1.as_short[3]; - res.as_short[3] = m2.as_short[3]; + __res.as_short[0] = __mu1.as_short[2]; + __res.as_short[1] = __mu2.as_short[2]; + __res.as_short[2] = __mu1.as_short[3]; + __res.as_short[3] = __mu2.as_short[3]; - return (__m64) res.as_m64; + return (__m64) __res.as_m64; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -307,15 +307,15 @@ _m_punpckhwd (__m64 __m1, __m64 __m2) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_unpackhi_pi32 (__m64 __m1, __m64 __m2) { - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_int[0] = m1.as_int[1]; - res.as_int[1] = m2.as_int[1]; + __res.as_int[0] = __mu1.as_int[1]; + __res.as_int[1] = __mu2.as_int[1]; - return (__m64) res.as_m64; + return (__m64) __res.as_m64; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -329,28 +329,28 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artifi _mm_unpacklo_pi8 (__m64 __m1, __m64 __m2) { #if _ARCH_PWR8 - __vector unsigned char a, b, c; + __vector unsigned char __a, __b, __c; - a = (__vector unsigned char)vec_splats (__m1); - b = (__vector unsigned char)vec_splats (__m2); - c = vec_mergel (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector unsigned char)vec_splats (__m1); + __b = (__vector unsigned char)vec_splats (__m2); + __c = vec_mergel (__a, __b); + return 
(__m64) ((__vector long long) __c)[0]; #else - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_char[0] = m1.as_char[0]; - res.as_char[1] = m2.as_char[0]; - res.as_char[2] = m1.as_char[1]; - res.as_char[3] = m2.as_char[1]; - res.as_char[4] = m1.as_char[2]; - res.as_char[5] = m2.as_char[2]; - res.as_char[6] = m1.as_char[3]; - res.as_char[7] = m2.as_char[3]; + __res.as_char[0] = __mu1.as_char[0]; + __res.as_char[1] = __mu2.as_char[0]; + __res.as_char[2] = __mu1.as_char[1]; + __res.as_char[3] = __mu2.as_char[1]; + __res.as_char[4] = __mu1.as_char[2]; + __res.as_char[5] = __mu2.as_char[2]; + __res.as_char[6] = __mu1.as_char[3]; + __res.as_char[7] = __mu2.as_char[3]; - return (__m64) res.as_m64; + return (__m64) __res.as_m64; #endif } @@ -364,17 +364,17 @@ _m_punpcklbw (__m64 __m1, __m64 __m2) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_unpacklo_pi16 (__m64 __m1, __m64 __m2) { - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_short[0] = m1.as_short[0]; - res.as_short[1] = m2.as_short[0]; - res.as_short[2] = m1.as_short[1]; - res.as_short[3] = m2.as_short[1]; + __res.as_short[0] = __mu1.as_short[0]; + __res.as_short[1] = __mu2.as_short[0]; + __res.as_short[2] = __mu1.as_short[1]; + __res.as_short[3] = __mu2.as_short[1]; - return (__m64) res.as_m64; + return (__m64) __res.as_m64; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -388,15 +388,15 @@ _m_punpcklwd (__m64 __m1, __m64 __m2) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_unpacklo_pi32 (__m64 __m1, __m64 __m2) { - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_int[0] = m1.as_int[0]; - res.as_int[1] = m2.as_int[0]; + __res.as_int[0] = __mu1.as_int[0]; + __res.as_int[1] = __mu2.as_int[0]; - return (__m64) res.as_m64; + return (__m64) __res.as_m64; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -410,28 +410,28 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artifi _mm_add_pi8 (__m64 __m1, __m64 __m2) { #if _ARCH_PWR8 - __vector signed char a, b, c; + __vector signed char __a, __b, __c; - a = (__vector signed char)vec_splats (__m1); - b = (__vector signed char)vec_splats (__m2); - c = vec_add (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector signed char)vec_splats (__m1); + __b = (__vector signed char)vec_splats (__m2); + __c = vec_add (__a, __b); + return (__m64) ((__vector long long) __c)[0]; #else - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_char[0] = m1.as_char[0] + m2.as_char[0]; - res.as_char[1] = m1.as_char[1] + m2.as_char[1]; - res.as_char[2] = m1.as_char[2] + m2.as_char[2]; - res.as_char[3] = m1.as_char[3] + m2.as_char[3]; - res.as_char[4] = m1.as_char[4] + m2.as_char[4]; - res.as_char[5] = m1.as_char[5] + m2.as_char[5]; - res.as_char[6] = m1.as_char[6] + m2.as_char[6]; - res.as_char[7] = m1.as_char[7] + m2.as_char[7]; + __res.as_char[0] = __mu1.as_char[0] + __mu2.as_char[0]; + __res.as_char[1] = __mu1.as_char[1] + __mu2.as_char[1]; + __res.as_char[2] = __mu1.as_char[2] + 
__mu2.as_char[2]; + __res.as_char[3] = __mu1.as_char[3] + __mu2.as_char[3]; + __res.as_char[4] = __mu1.as_char[4] + __mu2.as_char[4]; + __res.as_char[5] = __mu1.as_char[5] + __mu2.as_char[5]; + __res.as_char[6] = __mu1.as_char[6] + __mu2.as_char[6]; + __res.as_char[7] = __mu1.as_char[7] + __mu2.as_char[7]; - return (__m64) res.as_m64; + return (__m64) __res.as_m64; #endif } @@ -446,24 +446,24 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artifi _mm_add_pi16 (__m64 __m1, __m64 __m2) { #if _ARCH_PWR8 - __vector signed short a, b, c; + __vector signed short __a, __b, __c; - a = (__vector signed short)vec_splats (__m1); - b = (__vector signed short)vec_splats (__m2); - c = vec_add (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector signed short)vec_splats (__m1); + __b = (__vector signed short)vec_splats (__m2); + __c = vec_add (__a, __b); + return (__m64) ((__vector long long) __c)[0]; #else - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_short[0] = m1.as_short[0] + m2.as_short[0]; - res.as_short[1] = m1.as_short[1] + m2.as_short[1]; - res.as_short[2] = m1.as_short[2] + m2.as_short[2]; - res.as_short[3] = m1.as_short[3] + m2.as_short[3]; + __res.as_short[0] = __mu1.as_short[0] + __mu2.as_short[0]; + __res.as_short[1] = __mu1.as_short[1] + __mu2.as_short[1]; + __res.as_short[2] = __mu1.as_short[2] + __mu2.as_short[2]; + __res.as_short[3] = __mu1.as_short[3] + __mu2.as_short[3]; - return (__m64) res.as_m64; + return (__m64) __res.as_m64; #endif } @@ -478,22 +478,22 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artifi _mm_add_pi32 (__m64 __m1, __m64 __m2) { #if _ARCH_PWR9 - __vector signed int a, b, c; + __vector signed int __a, __b, __c; - a = (__vector signed int)vec_splats (__m1); - b = (__vector signed int)vec_splats (__m2); - c = vec_add (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector signed int)vec_splats (__m1); + __b = (__vector signed int)vec_splats (__m2); + __c = vec_add (__a, __b); + return (__m64) ((__vector long long) __c)[0]; #else - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_int[0] = m1.as_int[0] + m2.as_int[0]; - res.as_int[1] = m1.as_int[1] + m2.as_int[1]; + __res.as_int[0] = __mu1.as_int[0] + __mu2.as_int[0]; + __res.as_int[1] = __mu1.as_int[1] + __mu2.as_int[1]; - return (__m64) res.as_m64; + return (__m64) __res.as_m64; #endif } @@ -508,28 +508,28 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artifi _mm_sub_pi8 (__m64 __m1, __m64 __m2) { #if _ARCH_PWR8 - __vector signed char a, b, c; + __vector signed char __a, __b, __c; - a = (__vector signed char)vec_splats (__m1); - b = (__vector signed char)vec_splats (__m2); - c = vec_sub (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector signed char)vec_splats (__m1); + __b = (__vector signed char)vec_splats (__m2); + __c = vec_sub (__a, __b); + return (__m64) ((__vector long long) __c)[0]; #else - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_char[0] = m1.as_char[0] - m2.as_char[0]; - res.as_char[1] = m1.as_char[1] - m2.as_char[1]; - res.as_char[2] = m1.as_char[2] - m2.as_char[2]; - res.as_char[3] = m1.as_char[3] - m2.as_char[3]; - res.as_char[4] = 
m1.as_char[4] - m2.as_char[4]; - res.as_char[5] = m1.as_char[5] - m2.as_char[5]; - res.as_char[6] = m1.as_char[6] - m2.as_char[6]; - res.as_char[7] = m1.as_char[7] - m2.as_char[7]; + __res.as_char[0] = __mu1.as_char[0] - __mu2.as_char[0]; + __res.as_char[1] = __mu1.as_char[1] - __mu2.as_char[1]; + __res.as_char[2] = __mu1.as_char[2] - __mu2.as_char[2]; + __res.as_char[3] = __mu1.as_char[3] - __mu2.as_char[3]; + __res.as_char[4] = __mu1.as_char[4] - __mu2.as_char[4]; + __res.as_char[5] = __mu1.as_char[5] - __mu2.as_char[5]; + __res.as_char[6] = __mu1.as_char[6] - __mu2.as_char[6]; + __res.as_char[7] = __mu1.as_char[7] - __mu2.as_char[7]; - return (__m64) res.as_m64; + return (__m64) __res.as_m64; #endif } @@ -544,24 +544,24 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artifi _mm_sub_pi16 (__m64 __m1, __m64 __m2) { #if _ARCH_PWR8 - __vector signed short a, b, c; + __vector signed short __a, __b, __c; - a = (__vector signed short)vec_splats (__m1); - b = (__vector signed short)vec_splats (__m2); - c = vec_sub (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector signed short)vec_splats (__m1); + __b = (__vector signed short)vec_splats (__m2); + __c = vec_sub (__a, __b); + return (__m64) ((__vector long long) __c)[0]; #else - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_short[0] = m1.as_short[0] - m2.as_short[0]; - res.as_short[1] = m1.as_short[1] - m2.as_short[1]; - res.as_short[2] = m1.as_short[2] - m2.as_short[2]; - res.as_short[3] = m1.as_short[3] - m2.as_short[3]; + __res.as_short[0] = __mu1.as_short[0] - __mu2.as_short[0]; + __res.as_short[1] = __mu1.as_short[1] - __mu2.as_short[1]; + __res.as_short[2] = __mu1.as_short[2] - __mu2.as_short[2]; + __res.as_short[3] = __mu1.as_short[3] - __mu2.as_short[3]; - return (__m64) res.as_m64; + return (__m64) __res.as_m64; #endif } @@ -576,22 +576,22 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artifi _mm_sub_pi32 (__m64 __m1, __m64 __m2) { #if _ARCH_PWR9 - __vector signed int a, b, c; + __vector signed int __a, __b, __c; - a = (__vector signed int)vec_splats (__m1); - b = (__vector signed int)vec_splats (__m2); - c = vec_sub (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector signed int)vec_splats (__m1); + __b = (__vector signed int)vec_splats (__m2); + __c = vec_sub (__a, __b); + return (__m64) ((__vector long long) __c)[0]; #else - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_int[0] = m1.as_int[0] - m2.as_int[0]; - res.as_int[1] = m1.as_int[1] - m2.as_int[1]; + __res.as_int[0] = __mu1.as_int[0] - __mu2.as_int[0]; + __res.as_int[1] = __mu1.as_int[1] - __mu2.as_int[1]; - return (__m64) res.as_m64; + return (__m64) __res.as_m64; #endif } @@ -729,30 +729,30 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artifi _mm_cmpeq_pi8 (__m64 __m1, __m64 __m2) { #if defined(_ARCH_PWR6) && defined(__powerpc64__) - __m64 res; + __m64 __res; __asm__( "cmpb %0,%1,%2;\n" - : "=r" (res) + : "=r" (__res) : "r" (__m1), "r" (__m2) : ); - return (res); + return (__res); #else - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_char[0] = (m1.as_char[0] == m2.as_char[0])? 
-1: 0; - res.as_char[1] = (m1.as_char[1] == m2.as_char[1])? -1: 0; - res.as_char[2] = (m1.as_char[2] == m2.as_char[2])? -1: 0; - res.as_char[3] = (m1.as_char[3] == m2.as_char[3])? -1: 0; - res.as_char[4] = (m1.as_char[4] == m2.as_char[4])? -1: 0; - res.as_char[5] = (m1.as_char[5] == m2.as_char[5])? -1: 0; - res.as_char[6] = (m1.as_char[6] == m2.as_char[6])? -1: 0; - res.as_char[7] = (m1.as_char[7] == m2.as_char[7])? -1: 0; + __res.as_char[0] = (__mu1.as_char[0] == __mu2.as_char[0])? -1: 0; + __res.as_char[1] = (__mu1.as_char[1] == __mu2.as_char[1])? -1: 0; + __res.as_char[2] = (__mu1.as_char[2] == __mu2.as_char[2])? -1: 0; + __res.as_char[3] = (__mu1.as_char[3] == __mu2.as_char[3])? -1: 0; + __res.as_char[4] = (__mu1.as_char[4] == __mu2.as_char[4])? -1: 0; + __res.as_char[5] = (__mu1.as_char[5] == __mu2.as_char[5])? -1: 0; + __res.as_char[6] = (__mu1.as_char[6] == __mu2.as_char[6])? -1: 0; + __res.as_char[7] = (__mu1.as_char[7] == __mu2.as_char[7])? -1: 0; - return (__m64) res.as_m64; + return (__m64) __res.as_m64; #endif } @@ -766,28 +766,28 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artifi _mm_cmpgt_pi8 (__m64 __m1, __m64 __m2) { #if _ARCH_PWR8 - __vector signed char a, b, c; + __vector signed char __a, __b, __c; - a = (__vector signed char)vec_splats (__m1); - b = (__vector signed char)vec_splats (__m2); - c = (__vector signed char)vec_cmpgt (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector signed char)vec_splats (__m1); + __b = (__vector signed char)vec_splats (__m2); + __c = (__vector signed char)vec_cmpgt (__a, __b); + return (__m64) ((__vector long long) __c)[0]; #else - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_char[0] = (m1.as_char[0] > m2.as_char[0])? -1: 0; - res.as_char[1] = (m1.as_char[1] > m2.as_char[1])? -1: 0; - res.as_char[2] = (m1.as_char[2] > m2.as_char[2])? -1: 0; - res.as_char[3] = (m1.as_char[3] > m2.as_char[3])? -1: 0; - res.as_char[4] = (m1.as_char[4] > m2.as_char[4])? -1: 0; - res.as_char[5] = (m1.as_char[5] > m2.as_char[5])? -1: 0; - res.as_char[6] = (m1.as_char[6] > m2.as_char[6])? -1: 0; - res.as_char[7] = (m1.as_char[7] > m2.as_char[7])? -1: 0; + __res.as_char[0] = (__mu1.as_char[0] > __mu2.as_char[0])? -1: 0; + __res.as_char[1] = (__mu1.as_char[1] > __mu2.as_char[1])? -1: 0; + __res.as_char[2] = (__mu1.as_char[2] > __mu2.as_char[2])? -1: 0; + __res.as_char[3] = (__mu1.as_char[3] > __mu2.as_char[3])? -1: 0; + __res.as_char[4] = (__mu1.as_char[4] > __mu2.as_char[4])? -1: 0; + __res.as_char[5] = (__mu1.as_char[5] > __mu2.as_char[5])? -1: 0; + __res.as_char[6] = (__mu1.as_char[6] > __mu2.as_char[6])? -1: 0; + __res.as_char[7] = (__mu1.as_char[7] > __mu2.as_char[7])? 
-1: 0; - return (__m64) res.as_m64; + return (__m64) __res.as_m64; #endif } @@ -803,24 +803,24 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artifi _mm_cmpeq_pi16 (__m64 __m1, __m64 __m2) { #if _ARCH_PWR8 - __vector signed short a, b, c; + __vector signed short __a, __b, __c; - a = (__vector signed short)vec_splats (__m1); - b = (__vector signed short)vec_splats (__m2); - c = (__vector signed short)vec_cmpeq (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector signed short)vec_splats (__m1); + __b = (__vector signed short)vec_splats (__m2); + __c = (__vector signed short)vec_cmpeq (__a, __b); + return (__m64) ((__vector long long) __c)[0]; #else - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_short[0] = (m1.as_short[0] == m2.as_short[0])? -1: 0; - res.as_short[1] = (m1.as_short[1] == m2.as_short[1])? -1: 0; - res.as_short[2] = (m1.as_short[2] == m2.as_short[2])? -1: 0; - res.as_short[3] = (m1.as_short[3] == m2.as_short[3])? -1: 0; + __res.as_short[0] = (__mu1.as_short[0] == __mu2.as_short[0])? -1: 0; + __res.as_short[1] = (__mu1.as_short[1] == __mu2.as_short[1])? -1: 0; + __res.as_short[2] = (__mu1.as_short[2] == __mu2.as_short[2])? -1: 0; + __res.as_short[3] = (__mu1.as_short[3] == __mu2.as_short[3])? -1: 0; - return (__m64) res.as_m64; + return (__m64) __res.as_m64; #endif } @@ -834,24 +834,24 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artifi _mm_cmpgt_pi16 (__m64 __m1, __m64 __m2) { #if _ARCH_PWR8 - __vector signed short a, b, c; + __vector signed short __a, __b, __c; - a = (__vector signed short)vec_splats (__m1); - b = (__vector signed short)vec_splats (__m2); - c = (__vector signed short)vec_cmpgt (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector signed short)vec_splats (__m1); + __b = (__vector signed short)vec_splats (__m2); + __c = (__vector signed short)vec_cmpgt (__a, __b); + return (__m64) ((__vector long long) __c)[0]; #else - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_short[0] = (m1.as_short[0] > m2.as_short[0])? -1: 0; - res.as_short[1] = (m1.as_short[1] > m2.as_short[1])? -1: 0; - res.as_short[2] = (m1.as_short[2] > m2.as_short[2])? -1: 0; - res.as_short[3] = (m1.as_short[3] > m2.as_short[3])? -1: 0; + __res.as_short[0] = (__mu1.as_short[0] > __mu2.as_short[0])? -1: 0; + __res.as_short[1] = (__mu1.as_short[1] > __mu2.as_short[1])? -1: 0; + __res.as_short[2] = (__mu1.as_short[2] > __mu2.as_short[2])? -1: 0; + __res.as_short[3] = (__mu1.as_short[3] > __mu2.as_short[3])? 
-1: 0; - return (__m64) res.as_m64; + return (__m64) __res.as_m64; #endif } @@ -867,22 +867,22 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artifi _mm_cmpeq_pi32 (__m64 __m1, __m64 __m2) { #if _ARCH_PWR9 - __vector signed int a, b, c; + __vector signed int __a, __b, __c; - a = (__vector signed int)vec_splats (__m1); - b = (__vector signed int)vec_splats (__m2); - c = (__vector signed int)vec_cmpeq (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector signed int)vec_splats (__m1); + __b = (__vector signed int)vec_splats (__m2); + __c = (__vector signed int)vec_cmpeq (__a, __b); + return (__m64) ((__vector long long) __c)[0]; #else - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_int[0] = (m1.as_int[0] == m2.as_int[0])? -1: 0; - res.as_int[1] = (m1.as_int[1] == m2.as_int[1])? -1: 0; + __res.as_int[0] = (__mu1.as_int[0] == __mu2.as_int[0])? -1: 0; + __res.as_int[1] = (__mu1.as_int[1] == __mu2.as_int[1])? -1: 0; - return (__m64) res.as_m64; + return (__m64) __res.as_m64; #endif } @@ -896,22 +896,22 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artifi _mm_cmpgt_pi32 (__m64 __m1, __m64 __m2) { #if _ARCH_PWR9 - __vector signed int a, b, c; + __vector signed int __a, __b, __c; - a = (__vector signed int)vec_splats (__m1); - b = (__vector signed int)vec_splats (__m2); - c = (__vector signed int)vec_cmpgt (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector signed int)vec_splats (__m1); + __b = (__vector signed int)vec_splats (__m2); + __c = (__vector signed int)vec_cmpgt (__a, __b); + return (__m64) ((__vector long long) __c)[0]; #else - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_int[0] = (m1.as_int[0] > m2.as_int[0])? -1: 0; - res.as_int[1] = (m1.as_int[1] > m2.as_int[1])? -1: 0; + __res.as_int[0] = (__mu1.as_int[0] > __mu2.as_int[0])? -1: 0; + __res.as_int[1] = (__mu1.as_int[1] > __mu2.as_int[1])? 
-1: 0; - return (__m64) res.as_m64; + return (__m64) __res.as_m64; #endif } @@ -927,12 +927,12 @@ _m_pcmpgtd (__m64 __m1, __m64 __m2) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_adds_pi8 (__m64 __m1, __m64 __m2) { - __vector signed char a, b, c; + __vector signed char __a, __b, __c; - a = (__vector signed char)vec_splats (__m1); - b = (__vector signed char)vec_splats (__m2); - c = vec_adds (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector signed char)vec_splats (__m1); + __b = (__vector signed char)vec_splats (__m2); + __c = vec_adds (__a, __b); + return (__m64) ((__vector long long) __c)[0]; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -945,12 +945,12 @@ _m_paddsb (__m64 __m1, __m64 __m2) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_adds_pi16 (__m64 __m1, __m64 __m2) { - __vector signed short a, b, c; + __vector signed short __a, __b, __c; - a = (__vector signed short)vec_splats (__m1); - b = (__vector signed short)vec_splats (__m2); - c = vec_adds (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector signed short)vec_splats (__m1); + __b = (__vector signed short)vec_splats (__m2); + __c = vec_adds (__a, __b); + return (__m64) ((__vector long long) __c)[0]; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -963,12 +963,12 @@ _m_paddsw (__m64 __m1, __m64 __m2) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_adds_pu8 (__m64 __m1, __m64 __m2) { - __vector unsigned char a, b, c; + __vector unsigned char __a, __b, __c; - a = (__vector unsigned char)vec_splats (__m1); - b = (__vector unsigned char)vec_splats (__m2); - c = vec_adds (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector unsigned char)vec_splats (__m1); + __b = (__vector unsigned char)vec_splats (__m2); + __c = vec_adds (__a, __b); + return (__m64) ((__vector long long) __c)[0]; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -982,12 +982,12 @@ _m_paddusb (__m64 __m1, __m64 __m2) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_adds_pu16 (__m64 __m1, __m64 __m2) { - __vector unsigned short a, b, c; + __vector unsigned short __a, __b, __c; - a = (__vector unsigned short)vec_splats (__m1); - b = (__vector unsigned short)vec_splats (__m2); - c = vec_adds (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector unsigned short)vec_splats (__m1); + __b = (__vector unsigned short)vec_splats (__m2); + __c = vec_adds (__a, __b); + return (__m64) ((__vector long long) __c)[0]; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1001,12 +1001,12 @@ _m_paddusw (__m64 __m1, __m64 __m2) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_subs_pi8 (__m64 __m1, __m64 __m2) { - __vector signed char a, b, c; + __vector signed char __a, __b, __c; - a = (__vector signed char)vec_splats (__m1); - b = (__vector signed char)vec_splats (__m2); - c = vec_subs (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector signed char)vec_splats (__m1); + __b = (__vector signed char)vec_splats (__m2); + __c = vec_subs (__a, __b); + return (__m64) ((__vector long long) __c)[0]; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ 
-1020,12 +1020,12 @@ _m_psubsb (__m64 __m1, __m64 __m2) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_subs_pi16 (__m64 __m1, __m64 __m2) { - __vector signed short a, b, c; + __vector signed short __a, __b, __c; - a = (__vector signed short)vec_splats (__m1); - b = (__vector signed short)vec_splats (__m2); - c = vec_subs (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector signed short)vec_splats (__m1); + __b = (__vector signed short)vec_splats (__m2); + __c = vec_subs (__a, __b); + return (__m64) ((__vector long long) __c)[0]; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1039,12 +1039,12 @@ _m_psubsw (__m64 __m1, __m64 __m2) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_subs_pu8 (__m64 __m1, __m64 __m2) { - __vector unsigned char a, b, c; + __vector unsigned char __a, __b, __c; - a = (__vector unsigned char)vec_splats (__m1); - b = (__vector unsigned char)vec_splats (__m2); - c = vec_subs (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector unsigned char)vec_splats (__m1); + __b = (__vector unsigned char)vec_splats (__m2); + __c = vec_subs (__a, __b); + return (__m64) ((__vector long long) __c)[0]; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1058,12 +1058,12 @@ _m_psubusb (__m64 __m1, __m64 __m2) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_subs_pu16 (__m64 __m1, __m64 __m2) { - __vector unsigned short a, b, c; + __vector unsigned short __a, __b, __c; - a = (__vector unsigned short)vec_splats (__m1); - b = (__vector unsigned short)vec_splats (__m2); - c = vec_subs (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector unsigned short)vec_splats (__m1); + __b = (__vector unsigned short)vec_splats (__m2); + __c = vec_subs (__a, __b); + return (__m64) ((__vector long long) __c)[0]; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1078,14 +1078,14 @@ _m_psubusw (__m64 __m1, __m64 __m2) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_madd_pi16 (__m64 __m1, __m64 __m2) { - __vector signed short a, b; - __vector signed int c; - __vector signed int zero = {0, 0, 0, 0}; + __vector signed short __a, __b; + __vector signed int __c; + __vector signed int __zero = {0, 0, 0, 0}; - a = (__vector signed short)vec_splats (__m1); - b = (__vector signed short)vec_splats (__m2); - c = vec_vmsumshm (a, b, zero); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector signed short)vec_splats (__m1); + __b = (__vector signed short)vec_splats (__m2); + __c = vec_vmsumshm (__a, __b, __zero); + return (__m64) ((__vector long long) __c)[0]; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1098,10 +1098,10 @@ _m_pmaddwd (__m64 __m1, __m64 __m2) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mulhi_pi16 (__m64 __m1, __m64 __m2) { - __vector signed short a, b; - __vector signed short c; - __vector signed int w0, w1; - __vector unsigned char xform1 = { + __vector signed short __a, __b; + __vector signed short __c; + __vector signed int __w0, __w1; + __vector unsigned char __xform1 = { #ifdef __LITTLE_ENDIAN__ 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F @@ -1111,14 +1111,14 @@ _mm_mulhi_pi16 
(__m64 __m1, __m64 __m2) #endif }; - a = (__vector signed short)vec_splats (__m1); - b = (__vector signed short)vec_splats (__m2); + __a = (__vector signed short)vec_splats (__m1); + __b = (__vector signed short)vec_splats (__m2); - w0 = vec_vmulesh (a, b); - w1 = vec_vmulosh (a, b); - c = (__vector signed short)vec_perm (w0, w1, xform1); + __w0 = vec_vmulesh (__a, __b); + __w1 = vec_vmulosh (__a, __b); + __c = (__vector signed short)vec_perm (__w0, __w1, __xform1); - return (__m64) ((__vector long long) c)[0]; + return (__m64) ((__vector long long) __c)[0]; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1132,12 +1132,12 @@ _m_pmulhw (__m64 __m1, __m64 __m2) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mullo_pi16 (__m64 __m1, __m64 __m2) { - __vector signed short a, b, c; + __vector signed short __a, __b, __c; - a = (__vector signed short)vec_splats (__m1); - b = (__vector signed short)vec_splats (__m2); - c = a * b; - return (__m64) ((__vector long long) c)[0]; + __a = (__vector signed short)vec_splats (__m1); + __b = (__vector signed short)vec_splats (__m2); + __c = __a * __b; + return (__m64) ((__vector long long) __c)[0]; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1150,15 +1150,15 @@ _m_pmullw (__m64 __m1, __m64 __m2) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sll_pi16 (__m64 __m, __m64 __count) { - __vector signed short m, r; - __vector unsigned short c; + __vector signed short __r; + __vector unsigned short __c; if (__count <= 15) { - m = (__vector signed short)vec_splats (__m); - c = (__vector unsigned short)vec_splats ((unsigned short)__count); - r = vec_sl (m, (__vector unsigned short)c); - return (__m64) ((__vector long long) r)[0]; + __r = (__vector signed short)vec_splats (__m); + __c = (__vector unsigned short)vec_splats ((unsigned short)__count); + __r = vec_sl (__r, (__vector unsigned short)__c); + return (__m64) ((__vector long long) __r)[0]; } else return (0); @@ -1187,13 +1187,13 @@ _m_psllwi (__m64 __m, int __count) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sll_pi32 (__m64 __m, __m64 __count) { - __m64_union m, res; + __m64_union __res; - m.as_m64 = __m; + __res.as_m64 = __m; - res.as_int[0] = m.as_int[0] << __count; - res.as_int[1] = m.as_int[1] << __count; - return (res.as_m64); + __res.as_int[0] = __res.as_int[0] << __count; + __res.as_int[1] = __res.as_int[1] << __count; + return (__res.as_m64); } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1219,15 +1219,15 @@ _m_pslldi (__m64 __m, int __count) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sra_pi16 (__m64 __m, __m64 __count) { - __vector signed short m, r; - __vector unsigned short c; + __vector signed short __r; + __vector unsigned short __c; if (__count <= 15) { - m = (__vector signed short)vec_splats (__m); - c = (__vector unsigned short)vec_splats ((unsigned short)__count); - r = vec_sra (m, (__vector unsigned short)c); - return (__m64) ((__vector long long) r)[0]; + __r = (__vector signed short)vec_splats (__m); + __c = (__vector unsigned short)vec_splats ((unsigned short)__count); + __r = vec_sra (__r, (__vector unsigned short)__c); + return (__m64) ((__vector long long) __r)[0]; } else return (0); @@ -1256,13 +1256,13 @@ _m_psrawi (__m64 __m, int __count) extern __inline 
__m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sra_pi32 (__m64 __m, __m64 __count) { - __m64_union m, res; + __m64_union __res; - m.as_m64 = __m; + __res.as_m64 = __m; - res.as_int[0] = m.as_int[0] >> __count; - res.as_int[1] = m.as_int[1] >> __count; - return (res.as_m64); + __res.as_int[0] = __res.as_int[0] >> __count; + __res.as_int[1] = __res.as_int[1] >> __count; + return (__res.as_m64); } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1288,15 +1288,15 @@ _m_psradi (__m64 __m, int __count) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_srl_pi16 (__m64 __m, __m64 __count) { - __vector unsigned short m, r; - __vector unsigned short c; + __vector unsigned short __r; + __vector unsigned short __c; if (__count <= 15) { - m = (__vector unsigned short)vec_splats (__m); - c = (__vector unsigned short)vec_splats ((unsigned short)__count); - r = vec_sr (m, (__vector unsigned short)c); - return (__m64) ((__vector long long) r)[0]; + __r = (__vector unsigned short)vec_splats (__m); + __c = (__vector unsigned short)vec_splats ((unsigned short)__count); + __r = vec_sr (__r, (__vector unsigned short)__c); + return (__m64) ((__vector long long) __r)[0]; } else return (0); @@ -1325,13 +1325,13 @@ _m_psrlwi (__m64 __m, int __count) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_srl_pi32 (__m64 __m, __m64 __count) { - __m64_union m, res; + __m64_union __res; - m.as_m64 = __m; + __res.as_m64 = __m; - res.as_int[0] = (unsigned int)m.as_int[0] >> __count; - res.as_int[1] = (unsigned int)m.as_int[1] >> __count; - return (res.as_m64); + __res.as_int[0] = (unsigned int)__res.as_int[0] >> __count; + __res.as_int[1] = (unsigned int)__res.as_int[1] >> __count; + return (__res.as_m64); } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1358,24 +1358,24 @@ _m_psrldi (__m64 __m, int __count) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_set_pi32 (int __i1, int __i0) { - __m64_union res; + __m64_union __res; - res.as_int[0] = __i0; - res.as_int[1] = __i1; - return (res.as_m64); + __res.as_int[0] = __i0; + __res.as_int[1] = __i1; + return (__res.as_m64); } /* Creates a vector of four 16-bit values; W0 is least significant. */ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_set_pi16 (short __w3, short __w2, short __w1, short __w0) { - __m64_union res; + __m64_union __res; - res.as_short[0] = __w0; - res.as_short[1] = __w1; - res.as_short[2] = __w2; - res.as_short[3] = __w3; - return (res.as_m64); + __res.as_short[0] = __w0; + __res.as_short[1] = __w1; + __res.as_short[2] = __w2; + __res.as_short[3] = __w3; + return (__res.as_m64); } /* Creates a vector of eight 8-bit values; B0 is least significant. 
*/ @@ -1383,28 +1383,28 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artifi _mm_set_pi8 (char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0) { - __m64_union res; + __m64_union __res; - res.as_char[0] = __b0; - res.as_char[1] = __b1; - res.as_char[2] = __b2; - res.as_char[3] = __b3; - res.as_char[4] = __b4; - res.as_char[5] = __b5; - res.as_char[6] = __b6; - res.as_char[7] = __b7; - return (res.as_m64); + __res.as_char[0] = __b0; + __res.as_char[1] = __b1; + __res.as_char[2] = __b2; + __res.as_char[3] = __b3; + __res.as_char[4] = __b4; + __res.as_char[5] = __b5; + __res.as_char[6] = __b6; + __res.as_char[7] = __b7; + return (__res.as_m64); } /* Similar, but with the arguments in reverse order. */ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_setr_pi32 (int __i0, int __i1) { - __m64_union res; + __m64_union __res; - res.as_int[0] = __i0; - res.as_int[1] = __i1; - return (res.as_m64); + __res.as_int[0] = __i0; + __res.as_int[1] = __i1; + return (__res.as_m64); } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1424,11 +1424,11 @@ _mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3, extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_set1_pi32 (int __i) { - __m64_union res; + __m64_union __res; - res.as_int[0] = __i; - res.as_int[1] = __i; - return (res.as_m64); + __res.as_int[0] = __i; + __res.as_int[1] = __i; + return (__res.as_m64); } /* Creates a vector of four 16-bit values, all elements containing W. */ @@ -1441,13 +1441,13 @@ _mm_set1_pi16 (short __w) w = (__vector signed short)vec_splats (__w); return (__m64) ((__vector long long) w)[0]; #else - __m64_union res; + __m64_union __res; - res.as_short[0] = __w; - res.as_short[1] = __w; - res.as_short[2] = __w; - res.as_short[3] = __w; - return (res.as_m64); + __res.as_short[0] = __w; + __res.as_short[1] = __w; + __res.as_short[2] = __w; + __res.as_short[3] = __w; + return (__res.as_m64); #endif } @@ -1456,22 +1456,22 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artifi _mm_set1_pi8 (signed char __b) { #if _ARCH_PWR8 - __vector signed char b; + __vector signed char __res; - b = (__vector signed char)vec_splats (__b); - return (__m64) ((__vector long long) b)[0]; + __res = (__vector signed char)vec_splats (__b); + return (__m64) ((__vector long long) __res)[0]; #else - __m64_union res; - - res.as_char[0] = __b; - res.as_char[1] = __b; - res.as_char[2] = __b; - res.as_char[3] = __b; - res.as_char[4] = __b; - res.as_char[5] = __b; - res.as_char[6] = __b; - res.as_char[7] = __b; - return (res.as_m64); + __m64_union __res; + + __res.as_char[0] = __b; + __res.as_char[1] = __b; + __res.as_char[2] = __b; + __res.as_char[3] = __b; + __res.as_char[4] = __b; + __res.as_char[5] = __b; + __res.as_char[6] = __b; + __res.as_char[7] = __b; + return (__res.as_m64); #endif } #endif /* _MMINTRIN_H_INCLUDED */ diff --git a/gcc/config/rs6000/pmmintrin.h b/gcc/config/rs6000/pmmintrin.h index bcbca15..e1b5426 100644 --- a/gcc/config/rs6000/pmmintrin.h +++ b/gcc/config/rs6000/pmmintrin.h @@ -58,55 +58,55 @@ extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_addsub_ps (__m128 __X, __m128 __Y) { - const __v4sf even_n0 = {-0.0, 0.0, -0.0, 0.0}; - __v4sf even_neg_Y = vec_xor(__Y, even_n0); - return (__m128) vec_add (__X, even_neg_Y); + const __v4sf __even_n0 = {-0.0, 0.0, -0.0, 0.0}; + __v4sf 
__even_neg_Y = vec_xor(__Y, __even_n0); + return (__m128) vec_add (__X, __even_neg_Y); } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_addsub_pd (__m128d __X, __m128d __Y) { - const __v2df even_n0 = {-0.0, 0.0}; - __v2df even_neg_Y = vec_xor(__Y, even_n0); - return (__m128d) vec_add (__X, even_neg_Y); + const __v2df __even_n0 = {-0.0, 0.0}; + __v2df __even_neg_Y = vec_xor(__Y, __even_n0); + return (__m128d) vec_add (__X, __even_neg_Y); } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_hadd_ps (__m128 __X, __m128 __Y) { - __vector unsigned char xform2 = { + __vector unsigned char __xform2 = { 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B }; - __vector unsigned char xform1 = { + __vector unsigned char __xform1 = { 0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F, 0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F }; - return (__m128) vec_add (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2), - vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1)); + return (__m128) vec_add (vec_perm ((__v4sf) __X, (__v4sf) __Y, __xform2), + vec_perm ((__v4sf) __X, (__v4sf) __Y, __xform1)); } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_hsub_ps (__m128 __X, __m128 __Y) { - __vector unsigned char xform2 = { + __vector unsigned char __xform2 = { 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B }; - __vector unsigned char xform1 = { + __vector unsigned char __xform1 = { 0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F, 0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F }; - return (__m128) vec_sub (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2), - vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1)); + return (__m128) vec_sub (vec_perm ((__v4sf) __X, (__v4sf) __Y, __xform2), + vec_perm ((__v4sf) __X, (__v4sf) __Y, __xform1)); } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) diff --git a/gcc/config/rs6000/smmintrin.h b/gcc/config/rs6000/smmintrin.h index cca2f7d..3628c88 100644 --- a/gcc/config/rs6000/smmintrin.h +++ b/gcc/config/rs6000/smmintrin.h @@ -273,31 +273,31 @@ _mm_round_ss (__m128 __A, __m128 __B, int __rounding) extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_insert_epi8 (__m128i const __A, int const __D, int const __N) { - __v16qi result = (__v16qi)__A; + __v16qi __result = (__v16qi)__A; - result [__N & 0xf] = __D; + __result [__N & 0xf] = __D; - return (__m128i) result; + return (__m128i) __result; } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_insert_epi32 (__m128i const __A, int const __D, int const __N) { - __v4si result = (__v4si)__A; + __v4si __result = (__v4si)__A; - result [__N & 3] = __D; + __result [__N & 3] = __D; - return (__m128i) result; + return (__m128i) __result; } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_insert_epi64 (__m128i const __A, long long const __D, int const __N) { - __v2di result = (__v2di)__A; + __v2di __result = (__v2di)__A; - result [__N & 1] = __D; + __result [__N & 1] = __D; - return (__m128i) result; + return (__m128i) __result; } extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) diff --git a/gcc/config/rs6000/tmmintrin.h b/gcc/config/rs6000/tmmintrin.h index c06a643..05b985b 100644 --- a/gcc/config/rs6000/tmmintrin.h +++ 
b/gcc/config/rs6000/tmmintrin.h @@ -112,8 +112,8 @@ _mm_alignr_epi8 (__m128i __A, __m128i __B, const unsigned int __count) { if (__count >= 32) { - const __v16qu zero = { 0 }; - return (__m128i) zero; + const __v16qu __zero = { 0 }; + return (__m128i) __zero; } else { diff --git a/gcc/config/rs6000/xmmintrin.h b/gcc/config/rs6000/xmmintrin.h index 5867431..c602011 100644 --- a/gcc/config/rs6000/xmmintrin.h +++ b/gcc/config/rs6000/xmmintrin.h @@ -127,14 +127,14 @@ extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artif _mm_loadr_ps (float const *__P) { __v4sf __tmp; - __m128 result; - static const __vector unsigned char permute_vector = + __m128 __result; + static const __vector unsigned char __permute_vector = { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16, 0x17, 0x10, 0x11, 0x12, 0x13 }; __tmp = vec_ld (0, (__v4sf *) __P); - result = (__m128) vec_perm (__tmp, __tmp, permute_vector); - return result; + __result = (__m128) vec_perm (__tmp, __tmp, __permute_vector); + return __result; } /* Create a vector with all four elements equal to F. */ @@ -184,11 +184,11 @@ extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artific _mm_storer_ps (float *__P, __m128 __A) { __v4sf __tmp; - static const __vector unsigned char permute_vector = + static const __vector unsigned char __permute_vector = { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16, 0x17, 0x10, 0x11, 0x12, 0x13 }; - __tmp = (__m128) vec_perm (__A, __A, permute_vector); + __tmp = (__m128) vec_perm (__A, __A, __permute_vector); _mm_store_ps (__P, __tmp); } @@ -218,9 +218,9 @@ _mm_set_ss (float __F) extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_move_ss (__m128 __A, __m128 __B) { - static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; - return (vec_sel ((__v4sf)__A, (__v4sf)__B, mask)); + return (vec_sel ((__v4sf)__A, (__v4sf)__B, __mask)); } /* Create a vector with element 0 as *P and the rest zero. */ @@ -245,18 +245,18 @@ extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artif _mm_add_ss (__m128 __A, __m128 __B) { #ifdef _ARCH_PWR7 - __m128 a, b, c; - static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + __m128 __a, __b, __c; + static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; /* PowerISA VSX does not allow partial (for just lower double) results. So to insure we don't generate spurious exceptions (from the upper double values) we splat the lower double before we to the operation. */ - a = vec_splat (__A, 0); - b = vec_splat (__B, 0); - c = a + b; + __a = vec_splat (__A, 0); + __b = vec_splat (__B, 0); + __c = __a + __b; /* Then we merge the lower float result with the original upper float elements from __A. */ - return (vec_sel (__A, c, mask)); + return (vec_sel (__A, __c, __mask)); #else __A[0] = __A[0] + __B[0]; return (__A); @@ -267,18 +267,18 @@ extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artif _mm_sub_ss (__m128 __A, __m128 __B) { #ifdef _ARCH_PWR7 - __m128 a, b, c; - static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + __m128 __a, __b, __c; + static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; /* PowerISA VSX does not allow partial (for just lower double) results. So to insure we don't generate spurious exceptions (from the upper double values) we splat the lower double before we to the operation. 
*/ - a = vec_splat (__A, 0); - b = vec_splat (__B, 0); - c = a - b; + __a = vec_splat (__A, 0); + __b = vec_splat (__B, 0); + __c = __a - __b; /* Then we merge the lower float result with the original upper float elements from __A. */ - return (vec_sel (__A, c, mask)); + return (vec_sel (__A, __c, __mask)); #else __A[0] = __A[0] - __B[0]; return (__A); @@ -289,18 +289,18 @@ extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artif _mm_mul_ss (__m128 __A, __m128 __B) { #ifdef _ARCH_PWR7 - __m128 a, b, c; - static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + __m128 __a, __b, __c; + static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; /* PowerISA VSX does not allow partial (for just lower double) results. So to insure we don't generate spurious exceptions (from the upper double values) we splat the lower double before we to the operation. */ - a = vec_splat (__A, 0); - b = vec_splat (__B, 0); - c = a * b; + __a = vec_splat (__A, 0); + __b = vec_splat (__B, 0); + __c = __a * __b; /* Then we merge the lower float result with the original upper float elements from __A. */ - return (vec_sel (__A, c, mask)); + return (vec_sel (__A, __c, __mask)); #else __A[0] = __A[0] * __B[0]; return (__A); @@ -311,18 +311,18 @@ extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artif _mm_div_ss (__m128 __A, __m128 __B) { #ifdef _ARCH_PWR7 - __m128 a, b, c; - static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + __m128 __a, __b, __c; + static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; /* PowerISA VSX does not allow partial (for just lower double) results. So to insure we don't generate spurious exceptions (from the upper double values) we splat the lower double before we to the operation. */ - a = vec_splat (__A, 0); - b = vec_splat (__B, 0); - c = a / b; + __a = vec_splat (__A, 0); + __b = vec_splat (__B, 0); + __c = __a / __b; /* Then we merge the lower float result with the original upper float elements from __A. */ - return (vec_sel (__A, c, mask)); + return (vec_sel (__A, __c, __mask)); #else __A[0] = __A[0] / __B[0]; return (__A); @@ -332,17 +332,17 @@ _mm_div_ss (__m128 __A, __m128 __B) extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sqrt_ss (__m128 __A) { - __m128 a, c; - static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + __m128 __a, __c; + static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; /* PowerISA VSX does not allow partial (for just lower double) * results. So to insure we don't generate spurious exceptions * (from the upper double values) we splat the lower double * before we to the operation. */ - a = vec_splat (__A, 0); - c = vec_sqrt (a); + __a = vec_splat (__A, 0); + __c = vec_sqrt (__a); /* Then we merge the lower float result with the original upper * float elements from __A. */ - return (vec_sel (__A, c, mask)); + return (vec_sel (__A, __c, __mask)); } /* Perform the respective operation on the four SPFP values in A and B. */ @@ -391,81 +391,81 @@ _mm_rsqrt_ps (__m128 __A) extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_rcp_ss (__m128 __A) { - __m128 a, c; - static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + __m128 __a, __c; + static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; /* PowerISA VSX does not allow partial (for just lower double) * results. 
So to insure we don't generate spurious exceptions * (from the upper double values) we splat the lower double * before we to the operation. */ - a = vec_splat (__A, 0); - c = _mm_rcp_ps (a); + __a = vec_splat (__A, 0); + __c = _mm_rcp_ps (__a); /* Then we merge the lower float result with the original upper * float elements from __A. */ - return (vec_sel (__A, c, mask)); + return (vec_sel (__A, __c, __mask)); } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_rsqrt_ss (__m128 __A) { - __m128 a, c; - static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + __m128 __a, __c; + static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; /* PowerISA VSX does not allow partial (for just lower double) * results. So to insure we don't generate spurious exceptions * (from the upper double values) we splat the lower double * before we to the operation. */ - a = vec_splat (__A, 0); - c = vec_rsqrte (a); + __a = vec_splat (__A, 0); + __c = vec_rsqrte (__a); /* Then we merge the lower float result with the original upper * float elements from __A. */ - return (vec_sel (__A, c, mask)); + return (vec_sel (__A, __c, __mask)); } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_min_ss (__m128 __A, __m128 __B) { - __v4sf a, b, c; - static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + __v4sf __a, __b, __c; + static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; /* PowerISA VSX does not allow partial (for just lower float) * results. So to insure we don't generate spurious exceptions * (from the upper float values) we splat the lower float * before we to the operation. */ - a = vec_splat ((__v4sf)__A, 0); - b = vec_splat ((__v4sf)__B, 0); - c = vec_min (a, b); + __a = vec_splat ((__v4sf)__A, 0); + __b = vec_splat ((__v4sf)__B, 0); + __c = vec_min (__a, __b); /* Then we merge the lower float result with the original upper * float elements from __A. */ - return (vec_sel ((__v4sf)__A, c, mask)); + return (vec_sel ((__v4sf)__A, __c, __mask)); } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_max_ss (__m128 __A, __m128 __B) { - __v4sf a, b, c; - static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + __v4sf __a, __b, __c; + static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; /* PowerISA VSX does not allow partial (for just lower float) * results. So to insure we don't generate spurious exceptions * (from the upper float values) we splat the lower float * before we to the operation. */ - a = vec_splat (__A, 0); - b = vec_splat (__B, 0); - c = vec_max (a, b); + __a = vec_splat (__A, 0); + __b = vec_splat (__B, 0); + __c = vec_max (__a, __b); /* Then we merge the lower float result with the original upper * float elements from __A. 
*/ - return (vec_sel ((__v4sf)__A, c, mask)); + return (vec_sel ((__v4sf)__A, __c, __mask)); } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_min_ps (__m128 __A, __m128 __B) { - __vector __bool int m = vec_cmpgt ((__v4sf) __B, (__v4sf) __A); - return vec_sel (__B, __A, m); + __vector __bool int __m = vec_cmpgt ((__v4sf) __B, (__v4sf) __A); + return vec_sel (__B, __A, __m); } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_max_ps (__m128 __A, __m128 __B) { - __vector __bool int m = vec_cmpgt ((__v4sf) __A, (__v4sf) __B); - return vec_sel (__B, __A, m); + __vector __bool int __m = vec_cmpgt ((__v4sf) __A, (__v4sf) __B); + return vec_sel (__B, __A, __m); } /* Perform logical bit-wise operations on 128-bit values. */ @@ -530,8 +530,8 @@ _mm_cmpge_ps (__m128 __A, __m128 __B) extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpneq_ps (__m128 __A, __m128 __B) { - __v4sf temp = (__v4sf ) vec_cmpeq ((__v4sf) __A, (__v4sf)__B); - return ((__m128)vec_nor (temp, temp)); + __v4sf __temp = (__v4sf ) vec_cmpeq ((__v4sf) __A, (__v4sf)__B); + return ((__m128)vec_nor (__temp, __temp)); } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -561,31 +561,31 @@ _mm_cmpnge_ps (__m128 __A, __m128 __B) extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpord_ps (__m128 __A, __m128 __B) { - __vector unsigned int a, b; - __vector unsigned int c, d; - static const __vector unsigned int float_exp_mask = + __vector unsigned int __a, __b; + __vector unsigned int __c, __d; + static const __vector unsigned int __float_exp_mask = { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 }; - a = (__vector unsigned int) vec_abs ((__v4sf)__A); - b = (__vector unsigned int) vec_abs ((__v4sf)__B); - c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a); - d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b); - return ((__m128 ) vec_and (c, d)); + __a = (__vector unsigned int) vec_abs ((__v4sf)__A); + __b = (__vector unsigned int) vec_abs ((__v4sf)__B); + __c = (__vector unsigned int) vec_cmpgt (__float_exp_mask, __a); + __d = (__vector unsigned int) vec_cmpgt (__float_exp_mask, __b); + return ((__m128 ) vec_and (__c, __d)); } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpunord_ps (__m128 __A, __m128 __B) { - __vector unsigned int a, b; - __vector unsigned int c, d; - static const __vector unsigned int float_exp_mask = + __vector unsigned int __a, __b; + __vector unsigned int __c, __d; + static const __vector unsigned int __float_exp_mask = { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 }; - a = (__vector unsigned int) vec_abs ((__v4sf)__A); - b = (__vector unsigned int) vec_abs ((__v4sf)__B); - c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask); - d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask); - return ((__m128 ) vec_or (c, d)); + __a = (__vector unsigned int) vec_abs ((__v4sf)__A); + __b = (__vector unsigned int) vec_abs ((__v4sf)__B); + __c = (__vector unsigned int) vec_cmpgt (__a, __float_exp_mask); + __d = (__vector unsigned int) vec_cmpgt (__b, __float_exp_mask); + return ((__m128 ) vec_or (__c, __d)); } /* Perform a comparison on the lower SPFP values of A and B. 
If the @@ -594,222 +594,222 @@ _mm_cmpunord_ps (__m128 __A, __m128 __B) extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpeq_ss (__m128 __A, __m128 __B) { - static const __vector unsigned int mask = + static const __vector unsigned int __mask = { 0xffffffff, 0, 0, 0 }; - __v4sf a, b, c; + __v4sf __a, __b, __c; /* PowerISA VMX does not allow partial (for just element 0) * results. So to insure we don't generate spurious exceptions * (from the upper elements) we splat the lower float * before we to the operation. */ - a = vec_splat ((__v4sf) __A, 0); - b = vec_splat ((__v4sf) __B, 0); - c = (__v4sf) vec_cmpeq(a, b); + __a = vec_splat ((__v4sf) __A, 0); + __b = vec_splat ((__v4sf) __B, 0); + __c = (__v4sf) vec_cmpeq (__a, __b); /* Then we merge the lower float result with the original upper * float elements from __A. */ - return ((__m128)vec_sel ((__v4sf)__A, c, mask)); + return ((__m128)vec_sel ((__v4sf)__A, __c, __mask)); } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmplt_ss (__m128 __A, __m128 __B) { - static const __vector unsigned int mask = + static const __vector unsigned int __mask = { 0xffffffff, 0, 0, 0 }; - __v4sf a, b, c; + __v4sf __a, __b, __c; /* PowerISA VMX does not allow partial (for just element 0) * results. So to insure we don't generate spurious exceptions * (from the upper elements) we splat the lower float * before we to the operation. */ - a = vec_splat ((__v4sf) __A, 0); - b = vec_splat ((__v4sf) __B, 0); - c = (__v4sf) vec_cmplt(a, b); + __a = vec_splat ((__v4sf) __A, 0); + __b = vec_splat ((__v4sf) __B, 0); + __c = (__v4sf) vec_cmplt(__a, __b); /* Then we merge the lower float result with the original upper * float elements from __A. */ - return ((__m128)vec_sel ((__v4sf)__A, c, mask)); + return ((__m128)vec_sel ((__v4sf)__A, __c, __mask)); } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmple_ss (__m128 __A, __m128 __B) { - static const __vector unsigned int mask = + static const __vector unsigned int __mask = { 0xffffffff, 0, 0, 0 }; - __v4sf a, b, c; + __v4sf __a, __b, __c; /* PowerISA VMX does not allow partial (for just element 0) * results. So to insure we don't generate spurious exceptions * (from the upper elements) we splat the lower float * before we to the operation. */ - a = vec_splat ((__v4sf) __A, 0); - b = vec_splat ((__v4sf) __B, 0); - c = (__v4sf) vec_cmple(a, b); + __a = vec_splat ((__v4sf) __A, 0); + __b = vec_splat ((__v4sf) __B, 0); + __c = (__v4sf) vec_cmple(__a, __b); /* Then we merge the lower float result with the original upper * float elements from __A. */ - return ((__m128)vec_sel ((__v4sf)__A, c, mask)); + return ((__m128)vec_sel ((__v4sf)__A, __c, __mask)); } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpgt_ss (__m128 __A, __m128 __B) { - static const __vector unsigned int mask = + static const __vector unsigned int __mask = { 0xffffffff, 0, 0, 0 }; - __v4sf a, b, c; + __v4sf __a, __b, __c; /* PowerISA VMX does not allow partial (for just element 0) * results. So to insure we don't generate spurious exceptions * (from the upper elements) we splat the lower float * before we to the operation. 
*/ - a = vec_splat ((__v4sf) __A, 0); - b = vec_splat ((__v4sf) __B, 0); - c = (__v4sf) vec_cmpgt(a, b); + __a = vec_splat ((__v4sf) __A, 0); + __b = vec_splat ((__v4sf) __B, 0); + __c = (__v4sf) vec_cmpgt(__a, __b); /* Then we merge the lower float result with the original upper * float elements from __A. */ - return ((__m128)vec_sel ((__v4sf)__A, c, mask)); + return ((__m128)vec_sel ((__v4sf)__A, __c, __mask)); } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpge_ss (__m128 __A, __m128 __B) { - static const __vector unsigned int mask = + static const __vector unsigned int __mask = { 0xffffffff, 0, 0, 0 }; - __v4sf a, b, c; + __v4sf __a, __b, __c; /* PowerISA VMX does not allow partial (for just element 0) * results. So to insure we don't generate spurious exceptions * (from the upper elements) we splat the lower float * before we to the operation. */ - a = vec_splat ((__v4sf) __A, 0); - b = vec_splat ((__v4sf) __B, 0); - c = (__v4sf) vec_cmpge(a, b); + __a = vec_splat ((__v4sf) __A, 0); + __b = vec_splat ((__v4sf) __B, 0); + __c = (__v4sf) vec_cmpge(__a, __b); /* Then we merge the lower float result with the original upper * float elements from __A. */ - return ((__m128)vec_sel ((__v4sf)__A, c, mask)); + return ((__m128)vec_sel ((__v4sf)__A, __c, __mask)); } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpneq_ss (__m128 __A, __m128 __B) { - static const __vector unsigned int mask = + static const __vector unsigned int __mask = { 0xffffffff, 0, 0, 0 }; - __v4sf a, b, c; + __v4sf __a, __b, __c; /* PowerISA VMX does not allow partial (for just element 0) * results. So to insure we don't generate spurious exceptions * (from the upper elements) we splat the lower float * before we to the operation. */ - a = vec_splat ((__v4sf) __A, 0); - b = vec_splat ((__v4sf) __B, 0); - c = (__v4sf) vec_cmpeq(a, b); - c = vec_nor (c, c); + __a = vec_splat ((__v4sf) __A, 0); + __b = vec_splat ((__v4sf) __B, 0); + __c = (__v4sf) vec_cmpeq(__a, __b); + __c = vec_nor (__c, __c); /* Then we merge the lower float result with the original upper * float elements from __A. */ - return ((__m128)vec_sel ((__v4sf)__A, c, mask)); + return ((__m128)vec_sel ((__v4sf)__A, __c, __mask)); } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpnlt_ss (__m128 __A, __m128 __B) { - static const __vector unsigned int mask = + static const __vector unsigned int __mask = { 0xffffffff, 0, 0, 0 }; - __v4sf a, b, c; + __v4sf __a, __b, __c; /* PowerISA VMX does not allow partial (for just element 0) * results. So to insure we don't generate spurious exceptions * (from the upper elements) we splat the lower float * before we to the operation. */ - a = vec_splat ((__v4sf) __A, 0); - b = vec_splat ((__v4sf) __B, 0); - c = (__v4sf) vec_cmpge(a, b); + __a = vec_splat ((__v4sf) __A, 0); + __b = vec_splat ((__v4sf) __B, 0); + __c = (__v4sf) vec_cmpge(__a, __b); /* Then we merge the lower float result with the original upper * float elements from __A. 
*/ - return ((__m128)vec_sel ((__v4sf)__A, c, mask)); + return ((__m128)vec_sel ((__v4sf)__A, __c, __mask)); } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpnle_ss (__m128 __A, __m128 __B) { - static const __vector unsigned int mask = + static const __vector unsigned int __mask = { 0xffffffff, 0, 0, 0 }; - __v4sf a, b, c; + __v4sf __a, __b, __c; /* PowerISA VMX does not allow partial (for just element 0) * results. So to insure we don't generate spurious exceptions * (from the upper elements) we splat the lower float * before we to the operation. */ - a = vec_splat ((__v4sf) __A, 0); - b = vec_splat ((__v4sf) __B, 0); - c = (__v4sf) vec_cmpgt(a, b); + __a = vec_splat ((__v4sf) __A, 0); + __b = vec_splat ((__v4sf) __B, 0); + __c = (__v4sf) vec_cmpgt(__a, __b); /* Then we merge the lower float result with the original upper * float elements from __A. */ - return ((__m128)vec_sel ((__v4sf)__A, c, mask)); + return ((__m128)vec_sel ((__v4sf)__A, __c, __mask)); } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpngt_ss (__m128 __A, __m128 __B) { - static const __vector unsigned int mask = + static const __vector unsigned int __mask = { 0xffffffff, 0, 0, 0 }; - __v4sf a, b, c; + __v4sf __a, __b, __c; /* PowerISA VMX does not allow partial (for just element 0) * results. So to insure we don't generate spurious exceptions * (from the upper elements) we splat the lower float * before we to the operation. */ - a = vec_splat ((__v4sf) __A, 0); - b = vec_splat ((__v4sf) __B, 0); - c = (__v4sf) vec_cmple(a, b); + __a = vec_splat ((__v4sf) __A, 0); + __b = vec_splat ((__v4sf) __B, 0); + __c = (__v4sf) vec_cmple(__a, __b); /* Then we merge the lower float result with the original upper * float elements from __A. */ - return ((__m128)vec_sel ((__v4sf)__A, c, mask)); + return ((__m128)vec_sel ((__v4sf)__A, __c, __mask)); } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpnge_ss (__m128 __A, __m128 __B) { - static const __vector unsigned int mask = + static const __vector unsigned int __mask = { 0xffffffff, 0, 0, 0 }; - __v4sf a, b, c; + __v4sf __a, __b, __c; /* PowerISA VMX does not allow partial (for just element 0) * results. So to insure we don't generate spurious exceptions * (from the upper elements) we splat the lower float * before we do the operation. */ - a = vec_splat ((__v4sf) __A, 0); - b = vec_splat ((__v4sf) __B, 0); - c = (__v4sf) vec_cmplt(a, b); + __a = vec_splat ((__v4sf) __A, 0); + __b = vec_splat ((__v4sf) __B, 0); + __c = (__v4sf) vec_cmplt(__a, __b); /* Then we merge the lower float result with the original upper * float elements from __A. 
*/ - return ((__m128)vec_sel ((__v4sf)__A, c, mask)); + return ((__m128)vec_sel ((__v4sf)__A, __c, __mask)); } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpord_ss (__m128 __A, __m128 __B) { - __vector unsigned int a, b; - __vector unsigned int c, d; - static const __vector unsigned int float_exp_mask = + __vector unsigned int __a, __b; + __vector unsigned int __c, __d; + static const __vector unsigned int __float_exp_mask = { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 }; - static const __vector unsigned int mask = + static const __vector unsigned int __mask = { 0xffffffff, 0, 0, 0 }; - a = (__vector unsigned int) vec_abs ((__v4sf)__A); - b = (__vector unsigned int) vec_abs ((__v4sf)__B); - c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a); - d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b); - c = vec_and (c, d); + __a = (__vector unsigned int) vec_abs ((__v4sf)__A); + __b = (__vector unsigned int) vec_abs ((__v4sf)__B); + __c = (__vector unsigned int) vec_cmpgt (__float_exp_mask, __a); + __d = (__vector unsigned int) vec_cmpgt (__float_exp_mask, __b); + __c = vec_and (__c, __d); /* Then we merge the lower float result with the original upper * float elements from __A. */ - return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask)); + return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)__c, __mask)); } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpunord_ss (__m128 __A, __m128 __B) { - __vector unsigned int a, b; - __vector unsigned int c, d; - static const __vector unsigned int float_exp_mask = + __vector unsigned int __a, __b; + __vector unsigned int __c, __d; + static const __vector unsigned int __float_exp_mask = { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 }; - static const __vector unsigned int mask = + static const __vector unsigned int __mask = { 0xffffffff, 0, 0, 0 }; - a = (__vector unsigned int) vec_abs ((__v4sf)__A); - b = (__vector unsigned int) vec_abs ((__v4sf)__B); - c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask); - d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask); - c = vec_or (c, d); + __a = (__vector unsigned int) vec_abs ((__v4sf)__A); + __b = (__vector unsigned int) vec_abs ((__v4sf)__B); + __c = (__vector unsigned int) vec_cmpgt (__a, __float_exp_mask); + __d = (__vector unsigned int) vec_cmpgt (__b, __float_exp_mask); + __c = vec_or (__c, __d); /* Then we merge the lower float result with the original upper * float elements from __A. 
*/ - return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask)); + return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)__c, __mask)); } /* Compare the lower SPFP values of A and B and return 1 if true @@ -905,9 +905,9 @@ _mm_cvtss_f32 (__m128 __A) extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtss_si32 (__m128 __A) { - int res; + int __res; #ifdef _ARCH_PWR8 - double dtmp; + double __dtmp; __asm__( #ifdef __LITTLE_ENDIAN__ "xxsldwi %x0,%x0,%x0,3;\n" @@ -916,13 +916,13 @@ _mm_cvtss_si32 (__m128 __A) "fctiw %2,%2;\n" "mfvsrd %1,%x2;\n" : "+wa" (__A), - "=r" (res), - "=f" (dtmp) + "=r" (__res), + "=f" (__dtmp) : ); #else - res = __builtin_rint(__A[0]); + __res = __builtin_rint(__A[0]); #endif - return (res); + return __res; } extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -938,9 +938,9 @@ _mm_cvt_ss2si (__m128 __A) extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtss_si64 (__m128 __A) { - long long res; + long long __res; #if defined (_ARCH_PWR8) && defined (__powerpc64__) - double dtmp; + double __dtmp; __asm__( #ifdef __LITTLE_ENDIAN__ "xxsldwi %x0,%x0,%x0,3;\n" @@ -949,13 +949,13 @@ _mm_cvtss_si64 (__m128 __A) "fctid %2,%2;\n" "mfvsrd %1,%x2;\n" : "+wa" (__A), - "=r" (res), - "=f" (dtmp) + "=r" (__res), + "=f" (__dtmp) : ); #else - res = __builtin_llrint(__A[0]); + __res = __builtin_llrint(__A[0]); #endif - return (res); + return __res; } /* Microsoft intrinsic. */ @@ -992,15 +992,15 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artifi _mm_cvtps_pi32 (__m128 __A) { /* Splat two lower SPFP values to both halves. */ - __v4sf temp, rounded; - __vector unsigned long long result; + __v4sf __temp, __rounded; + __vector unsigned long long __result; /* Splat two lower SPFP values to both halves. */ - temp = (__v4sf) vec_splat ((__vector long long)__A, 0); - rounded = vec_rint(temp); - result = (__vector unsigned long long) vec_cts (rounded, 0); + __temp = (__v4sf) vec_splat ((__vector long long)__A, 0); + __rounded = vec_rint (__temp); + __result = (__vector unsigned long long) vec_cts (__rounded, 0); - return (__m64) ((__vector long long) result)[0]; + return (__m64) ((__vector long long) __result)[0]; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1014,9 +1014,9 @@ extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artifici _mm_cvttss_si32 (__m128 __A) { /* Extract the lower float element. */ - float temp = __A[0]; + float __temp = __A[0]; /* truncate to 32-bit integer and return. */ - return temp; + return __temp; } extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1030,9 +1030,9 @@ extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __ar _mm_cvttss_si64 (__m128 __A) { /* Extract the lower float element. */ - float temp = __A[0]; + float __temp = __A[0]; /* truncate to 32-bit integer and return. */ - return temp; + return __temp; } /* Microsoft intrinsic. */ @@ -1040,9 +1040,9 @@ extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __ar _mm_cvttss_si64x (__m128 __A) { /* Extract the lower float element. */ - float temp = __A[0]; + float __temp = __A[0]; /* truncate to 32-bit integer and return. */ - return temp; + return __temp; } /* Truncate the two lower SPFP values to 32-bit integers. 
Return the @@ -1050,14 +1050,14 @@ _mm_cvttss_si64x (__m128 __A) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvttps_pi32 (__m128 __A) { - __v4sf temp; - __vector unsigned long long result; + __v4sf __temp; + __vector unsigned long long __result; /* Splat two lower SPFP values to both halves. */ - temp = (__v4sf) vec_splat ((__vector long long)__A, 0); - result = (__vector unsigned long long) vec_cts (temp, 0); + __temp = (__v4sf) vec_splat ((__vector long long)__A, 0); + __result = (__vector unsigned long long) vec_cts (__temp, 0); - return (__m64) ((__vector long long) result)[0]; + return (__m64) ((__vector long long) __result)[0]; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1070,8 +1070,8 @@ _mm_cvtt_ps2pi (__m128 __A) extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtsi32_ss (__m128 __A, int __B) { - float temp = __B; - __A[0] = temp; + float __temp = __B; + __A[0] = __temp; return __A; } @@ -1087,8 +1087,8 @@ _mm_cvt_si2ss (__m128 __A, int __B) extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtsi64_ss (__m128 __A, long long __B) { - float temp = __B; - __A[0] = temp; + float __temp = __B; + __A[0] = __temp; return __A; } @@ -1105,14 +1105,14 @@ _mm_cvtsi64x_ss (__m128 __A, long long __B) extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtpi32_ps (__m128 __A, __m64 __B) { - __vector signed int vm1; - __vector float vf1; + __vector signed int __vm1; + __vector float __vf1; - vm1 = (__vector signed int) (__vector unsigned long long) {__B, __B}; - vf1 = (__vector float) vec_ctf (vm1, 0); + __vm1 = (__vector signed int) (__vector unsigned long long) {__B, __B}; + __vf1 = (__vector float) vec_ctf (__vm1, 0); return ((__m128) (__vector unsigned long long) - { ((__vector unsigned long long)vf1) [0], + { ((__vector unsigned long long)__vf1) [0], ((__vector unsigned long long)__A) [1]}); } @@ -1126,54 +1126,54 @@ _mm_cvt_pi2ps (__m128 __A, __m64 __B) extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtpi16_ps (__m64 __A) { - __vector signed short vs8; - __vector signed int vi4; - __vector float vf1; + __vector signed short __vs8; + __vector signed int __vi4; + __vector float __vf1; - vs8 = (__vector signed short) (__vector unsigned long long) { __A, __A }; - vi4 = vec_vupklsh (vs8); - vf1 = (__vector float) vec_ctf (vi4, 0); + __vs8 = (__vector signed short) (__vector unsigned long long) { __A, __A }; + __vi4 = vec_vupklsh (__vs8); + __vf1 = (__vector float) vec_ctf (__vi4, 0); - return (__m128) vf1; + return (__m128) __vf1; } /* Convert the four unsigned 16-bit values in A to SPFP form. 
*/ extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtpu16_ps (__m64 __A) { - const __vector unsigned short zero = + const __vector unsigned short __zero = { 0, 0, 0, 0, 0, 0, 0, 0 }; - __vector unsigned short vs8; - __vector unsigned int vi4; - __vector float vf1; + __vector unsigned short __vs8; + __vector unsigned int __vi4; + __vector float __vf1; - vs8 = (__vector unsigned short) (__vector unsigned long long) { __A, __A }; - vi4 = (__vector unsigned int) vec_mergel + __vs8 = (__vector unsigned short) (__vector unsigned long long) { __A, __A }; + __vi4 = (__vector unsigned int) vec_mergel #ifdef __LITTLE_ENDIAN__ - (vs8, zero); + (__vs8, __zero); #else - (zero, vs8); + (__zero, __vs8); #endif - vf1 = (__vector float) vec_ctf (vi4, 0); + __vf1 = (__vector float) vec_ctf (__vi4, 0); - return (__m128) vf1; + return (__m128) __vf1; } /* Convert the low four signed 8-bit values in A to SPFP form. */ extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtpi8_ps (__m64 __A) { - __vector signed char vc16; - __vector signed short vs8; - __vector signed int vi4; - __vector float vf1; + __vector signed char __vc16; + __vector signed short __vs8; + __vector signed int __vi4; + __vector float __vf1; - vc16 = (__vector signed char) (__vector unsigned long long) { __A, __A }; - vs8 = vec_vupkhsb (vc16); - vi4 = vec_vupkhsh (vs8); - vf1 = (__vector float) vec_ctf (vi4, 0); + __vc16 = (__vector signed char) (__vector unsigned long long) { __A, __A }; + __vs8 = vec_vupkhsb (__vc16); + __vi4 = vec_vupkhsh (__vs8); + __vf1 = (__vector float) vec_ctf (__vi4, 0); - return (__m128) vf1; + return (__m128) __vf1; } /* Convert the low four unsigned 8-bit values in A to SPFP form. */ @@ -1181,70 +1181,70 @@ extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __art _mm_cvtpu8_ps (__m64 __A) { - const __vector unsigned char zero = + const __vector unsigned char __zero = { 0, 0, 0, 0, 0, 0, 0, 0 }; - __vector unsigned char vc16; - __vector unsigned short vs8; - __vector unsigned int vi4; - __vector float vf1; + __vector unsigned char __vc16; + __vector unsigned short __vs8; + __vector unsigned int __vi4; + __vector float __vf1; - vc16 = (__vector unsigned char) (__vector unsigned long long) { __A, __A }; + __vc16 = (__vector unsigned char) (__vector unsigned long long) { __A, __A }; #ifdef __LITTLE_ENDIAN__ - vs8 = (__vector unsigned short) vec_mergel (vc16, zero); - vi4 = (__vector unsigned int) vec_mergeh (vs8, - (__vector unsigned short) zero); + __vs8 = (__vector unsigned short) vec_mergel (__vc16, __zero); + __vi4 = (__vector unsigned int) vec_mergeh (__vs8, + (__vector unsigned short) __zero); #else - vs8 = (__vector unsigned short) vec_mergel (zero, vc16); - vi4 = (__vector unsigned int) vec_mergeh ((__vector unsigned short) zero, - vs8); + __vs8 = (__vector unsigned short) vec_mergel (__zero, __vc16); + __vi4 = (__vector unsigned int) vec_mergeh ((__vector unsigned short) __zero, + __vs8); #endif - vf1 = (__vector float) vec_ctf (vi4, 0); + __vf1 = (__vector float) vec_ctf (__vi4, 0); - return (__m128) vf1; + return (__m128) __vf1; } /* Convert the four signed 32-bit values in A and B to SPFP form. 
*/ extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtpi32x2_ps (__m64 __A, __m64 __B) { - __vector signed int vi4; - __vector float vf4; + __vector signed int __vi4; + __vector float __vf4; - vi4 = (__vector signed int) (__vector unsigned long long) { __A, __B }; - vf4 = (__vector float) vec_ctf (vi4, 0); - return (__m128) vf4; + __vi4 = (__vector signed int) (__vector unsigned long long) { __A, __B }; + __vf4 = (__vector float) vec_ctf (__vi4, 0); + return (__m128) __vf4; } /* Convert the four SPFP values in A to four signed 16-bit integers. */ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtps_pi16 (__m128 __A) { - __v4sf rounded; - __vector signed int temp; - __vector unsigned long long result; + __v4sf __rounded; + __vector signed int __temp; + __vector unsigned long long __result; - rounded = vec_rint(__A); - temp = vec_cts (rounded, 0); - result = (__vector unsigned long long) vec_pack (temp, temp); + __rounded = vec_rint(__A); + __temp = vec_cts (__rounded, 0); + __result = (__vector unsigned long long) vec_pack (__temp, __temp); - return (__m64) ((__vector long long) result)[0]; + return (__m64) ((__vector long long) __result)[0]; } /* Convert the four SPFP values in A to four signed 8-bit integers. */ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtps_pi8 (__m128 __A) { - __v4sf rounded; - __vector signed int tmp_i; - static const __vector signed int zero = {0, 0, 0, 0}; - __vector signed short tmp_s; - __vector signed char res_v; + __v4sf __rounded; + __vector signed int __tmp_i; + static const __vector signed int __zero = {0, 0, 0, 0}; + __vector signed short __tmp_s; + __vector signed char __res_v; - rounded = vec_rint(__A); - tmp_i = vec_cts (rounded, 0); - tmp_s = vec_pack (tmp_i, zero); - res_v = vec_pack (tmp_s, tmp_s); - return (__m64) ((__vector long long) res_v)[0]; + __rounded = vec_rint(__A); + __tmp_i = vec_cts (__rounded, 0); + __tmp_s = vec_pack (__tmp_i, __zero); + __res_v = vec_pack (__tmp_s, __tmp_s); + return (__m64) ((__vector long long) __res_v)[0]; } /* Selects four specific SPFP values from A and B based on MASK. 
*/ @@ -1252,11 +1252,11 @@ extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __art _mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask) { - unsigned long element_selector_10 = __mask & 0x03; - unsigned long element_selector_32 = (__mask >> 2) & 0x03; - unsigned long element_selector_54 = (__mask >> 4) & 0x03; - unsigned long element_selector_76 = (__mask >> 6) & 0x03; - static const unsigned int permute_selectors[4] = + unsigned long __element_selector_10 = __mask & 0x03; + unsigned long __element_selector_32 = (__mask >> 2) & 0x03; + unsigned long __element_selector_54 = (__mask >> 4) & 0x03; + unsigned long __element_selector_76 = (__mask >> 6) & 0x03; + static const unsigned int __permute_selectors[4] = { #ifdef __LITTLE_ENDIAN__ 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C @@ -1264,13 +1264,13 @@ _mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask) 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F #endif }; - __vector unsigned int t; + __vector unsigned int __t; - t[0] = permute_selectors[element_selector_10]; - t[1] = permute_selectors[element_selector_32]; - t[2] = permute_selectors[element_selector_54] + 0x10101010; - t[3] = permute_selectors[element_selector_76] + 0x10101010; - return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)t); + __t[0] = __permute_selectors[__element_selector_10]; + __t[1] = __permute_selectors[__element_selector_32]; + __t[2] = __permute_selectors[__element_selector_54] + 0x10101010; + __t[3] = __permute_selectors[__element_selector_76] + 0x10101010; + return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)__t); } /* Selects and interleaves the upper two SPFP values from A and B. */ @@ -1355,8 +1355,8 @@ _mm_movemask_ps (__m128 __A) #ifdef _ARCH_PWR10 return vec_extractm ((__vector unsigned int) __A); #else - __vector unsigned long long result; - static const __vector unsigned int perm_mask = + __vector unsigned long long __result; + static const __vector unsigned int __perm_mask = { #ifdef __LITTLE_ENDIAN__ 0x00204060, 0x80808080, 0x80808080, 0x80808080 @@ -1365,14 +1365,14 @@ _mm_movemask_ps (__m128 __A) #endif }; - result = ((__vector unsigned long long) + __result = ((__vector unsigned long long) vec_vbpermq ((__vector unsigned char) __A, - (__vector unsigned char) perm_mask)); + (__vector unsigned char) __perm_mask)); #ifdef __LITTLE_ENDIAN__ - return result[1]; + return __result[1]; #else - return result[0]; + return __result[0]; #endif #endif /* !_ARCH_PWR10 */ } @@ -1395,12 +1395,12 @@ _mm_load_ps1 (float const *__P) extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_extract_pi16 (__m64 const __A, int const __N) { - unsigned int shiftr = __N & 3; + unsigned int __shiftr = __N & 3; #ifdef __BIG_ENDIAN__ - shiftr = 3 - shiftr; + __shiftr = 3 - __shiftr; #endif - return ((__A >> (shiftr * 16)) & 0xffff); + return ((__A >> (__shiftr * 16)) & 0xffff); } extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1414,12 +1414,12 @@ _m_pextrw (__m64 const __A, int const __N) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_insert_pi16 (__m64 const __A, int const __D, int const __N) { - const int shiftl = (__N & 3) * 16; - const __m64 shiftD = (const __m64) __D << shiftl; - const __m64 mask = 0xffffUL << shiftl; - __m64 result = (__A & (~mask)) | (shiftD & mask); + const int __shiftl = (__N & 3) * 16; + const __m64 __shiftD = (const __m64) __D << __shiftl; + const __m64 __mask = 
0xffffUL << __shiftl; + __m64 __result = (__A & (~__mask)) | (__shiftD & __mask); - return (result); + return __result; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1434,30 +1434,30 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artifi _mm_max_pi16 (__m64 __A, __m64 __B) { #if _ARCH_PWR8 - __vector signed short a, b, r; - __vector __bool short c; - - a = (__vector signed short)vec_splats (__A); - b = (__vector signed short)vec_splats (__B); - c = (__vector __bool short)vec_cmpgt (a, b); - r = vec_sel (b, a, c); - return (__m64) ((__vector long long) r)[0]; + __vector signed short __a, __b, __r; + __vector __bool short __c; + + __a = (__vector signed short)vec_splats (__A); + __b = (__vector signed short)vec_splats (__B); + __c = (__vector __bool short)vec_cmpgt (__a, __b); + __r = vec_sel (__b, __a, __c); + return (__m64) ((__vector long long) __r)[0]; #else - __m64_union m1, m2, res; + __m64_union __m1, __m2, __res; - m1.as_m64 = __A; - m2.as_m64 = __B; + __m1.as_m64 = __A; + __m2.as_m64 = __B; - res.as_short[0] = - (m1.as_short[0] > m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0]; - res.as_short[1] = - (m1.as_short[1] > m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1]; - res.as_short[2] = - (m1.as_short[2] > m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2]; - res.as_short[3] = - (m1.as_short[3] > m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3]; + __res.as_short[0] = + (__m1.as_short[0] > __m2.as_short[0]) ? __m1.as_short[0] : __m2.as_short[0]; + __res.as_short[1] = + (__m1.as_short[1] > __m2.as_short[1]) ? __m1.as_short[1] : __m2.as_short[1]; + __res.as_short[2] = + (__m1.as_short[2] > __m2.as_short[2]) ? __m1.as_short[2] : __m2.as_short[2]; + __res.as_short[3] = + (__m1.as_short[3] > __m2.as_short[3]) ? __m1.as_short[3] : __m2.as_short[3]; - return (__m64) res.as_m64; + return (__m64) __res.as_m64; #endif } @@ -1472,28 +1472,27 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artifi _mm_max_pu8 (__m64 __A, __m64 __B) { #if _ARCH_PWR8 - __vector unsigned char a, b, r; - __vector __bool char c; - - a = (__vector unsigned char)vec_splats (__A); - b = (__vector unsigned char)vec_splats (__B); - c = (__vector __bool char)vec_cmpgt (a, b); - r = vec_sel (b, a, c); - return (__m64) ((__vector long long) r)[0]; + __vector unsigned char __a, __b, __r; + __vector __bool char __c; + + __a = (__vector unsigned char)vec_splats (__A); + __b = (__vector unsigned char)vec_splats (__B); + __c = (__vector __bool char)vec_cmpgt (__a, __b); + __r = vec_sel (__b, __a, __c); + return (__m64) ((__vector long long) __r)[0]; #else - __m64_union m1, m2, res; - long i; + __m64_union __m1, __m2, __res; + long __i; - m1.as_m64 = __A; - m2.as_m64 = __B; + __m1.as_m64 = __A; + __m2.as_m64 = __B; + for (__i = 0; __i < 8; __i++) + __res.as_char[__i] = + ((unsigned char) __m1.as_char[__i] > (unsigned char) __m2.as_char[__i]) ? + __m1.as_char[__i] : __m2.as_char[__i]; - for (i = 0; i < 8; i++) - res.as_char[i] = - ((unsigned char) m1.as_char[i] > (unsigned char) m2.as_char[i]) ? 
- m1.as_char[i] : m2.as_char[i]; - - return (__m64) res.as_m64; + return (__m64) __res.as_m64; #endif } @@ -1508,30 +1507,30 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artifi _mm_min_pi16 (__m64 __A, __m64 __B) { #if _ARCH_PWR8 - __vector signed short a, b, r; - __vector __bool short c; - - a = (__vector signed short)vec_splats (__A); - b = (__vector signed short)vec_splats (__B); - c = (__vector __bool short)vec_cmplt (a, b); - r = vec_sel (b, a, c); - return (__m64) ((__vector long long) r)[0]; + __vector signed short __a, __b, __r; + __vector __bool short __c; + + __a = (__vector signed short)vec_splats (__A); + __b = (__vector signed short)vec_splats (__B); + __c = (__vector __bool short)vec_cmplt (__a, __b); + __r = vec_sel (__b, __a, __c); + return (__m64) ((__vector long long) __r)[0]; #else - __m64_union m1, m2, res; + __m64_union __m1, __m2, __res; - m1.as_m64 = __A; - m2.as_m64 = __B; + __m1.as_m64 = __A; + __m2.as_m64 = __B; - res.as_short[0] = - (m1.as_short[0] < m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0]; - res.as_short[1] = - (m1.as_short[1] < m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1]; - res.as_short[2] = - (m1.as_short[2] < m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2]; - res.as_short[3] = - (m1.as_short[3] < m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3]; + __res.as_short[0] = + (__m1.as_short[0] < __m2.as_short[0]) ? __m1.as_short[0] : __m2.as_short[0]; + __res.as_short[1] = + (__m1.as_short[1] < __m2.as_short[1]) ? __m1.as_short[1] : __m2.as_short[1]; + __res.as_short[2] = + (__m1.as_short[2] < __m2.as_short[2]) ? __m1.as_short[2] : __m2.as_short[2]; + __res.as_short[3] = + (__m1.as_short[3] < __m2.as_short[3]) ? __m1.as_short[3] : __m2.as_short[3]; - return (__m64) res.as_m64; + return (__m64) __res.as_m64; #endif } @@ -1546,28 +1545,28 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artifi _mm_min_pu8 (__m64 __A, __m64 __B) { #if _ARCH_PWR8 - __vector unsigned char a, b, r; - __vector __bool char c; - - a = (__vector unsigned char)vec_splats (__A); - b = (__vector unsigned char)vec_splats (__B); - c = (__vector __bool char)vec_cmplt (a, b); - r = vec_sel (b, a, c); - return (__m64) ((__vector long long) r)[0]; + __vector unsigned char __a, __b, __r; + __vector __bool char __c; + + __a = (__vector unsigned char)vec_splats (__A); + __b = (__vector unsigned char)vec_splats (__B); + __c = (__vector __bool char)vec_cmplt (__a, __b); + __r = vec_sel (__b, __a, __c); + return (__m64) ((__vector long long) __r)[0]; #else - __m64_union m1, m2, res; - long i; + __m64_union __m1, __m2, __res; + long __i; - m1.as_m64 = __A; - m2.as_m64 = __B; + __m1.as_m64 = __A; + __m2.as_m64 = __B; - for (i = 0; i < 8; i++) - res.as_char[i] = - ((unsigned char) m1.as_char[i] < (unsigned char) m2.as_char[i]) ? - m1.as_char[i] : m2.as_char[i]; + for (__i = 0; __i < 8; __i++) + __res.as_char[__i] = + ((unsigned char) __m1.as_char[__i] < (unsigned char) __m2.as_char[__i]) ? 
+ __m1.as_char[__i] : __m2.as_char[__i]; - return (__m64) res.as_m64; + return (__m64) __res.as_m64; #endif } @@ -1582,24 +1581,24 @@ extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artifici _mm_movemask_pi8 (__m64 __A) { #ifdef __powerpc64__ - unsigned long long p = + unsigned long long __p = #ifdef __LITTLE_ENDIAN__ 0x0008101820283038UL; // permute control for sign bits #else 0x3830282018100800UL; // permute control for sign bits #endif - return __builtin_bpermd (p, __A); + return __builtin_bpermd (__p, __A); #else #ifdef __LITTLE_ENDIAN__ - unsigned int mask = 0x20283038UL; - unsigned int r1 = __builtin_bpermd (mask, __A) & 0xf; - unsigned int r2 = __builtin_bpermd (mask, __A >> 32) & 0xf; + unsigned int __mask = 0x20283038UL; + unsigned int __r1 = __builtin_bpermd (__mask, __A) & 0xf; + unsigned int __r2 = __builtin_bpermd (__mask, __A >> 32) & 0xf; #else - unsigned int mask = 0x38302820UL; - unsigned int r1 = __builtin_bpermd (mask, __A >> 32) & 0xf; - unsigned int r2 = __builtin_bpermd (mask, __A) & 0xf; + unsigned int __mask = 0x38302820UL; + unsigned int __r1 = __builtin_bpermd (__mask, __A >> 32) & 0xf; + unsigned int __r2 = __builtin_bpermd (__mask, __A) & 0xf; #endif - return (r2 << 4) | r1; + return (__r2 << 4) | __r1; #endif } @@ -1614,10 +1613,10 @@ _m_pmovmskb (__m64 __A) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mulhi_pu16 (__m64 __A, __m64 __B) { - __vector unsigned short a, b; - __vector unsigned short c; - __vector unsigned int w0, w1; - __vector unsigned char xform1 = { + __vector unsigned short __a, __b; + __vector unsigned short __c; + __vector unsigned int __w0, __w1; + __vector unsigned char __xform1 = { #ifdef __LITTLE_ENDIAN__ 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F @@ -1627,14 +1626,14 @@ _mm_mulhi_pu16 (__m64 __A, __m64 __B) #endif }; - a = (__vector unsigned short)vec_splats (__A); - b = (__vector unsigned short)vec_splats (__B); + __a = (__vector unsigned short)vec_splats (__A); + __b = (__vector unsigned short)vec_splats (__B); - w0 = vec_vmuleuh (a, b); - w1 = vec_vmulouh (a, b); - c = (__vector unsigned short)vec_perm (w0, w1, xform1); + __w0 = vec_vmuleuh (__a, __b); + __w1 = vec_vmulouh (__a, __b); + __c = (__vector unsigned short)vec_perm (__w0, __w1, __xform1); - return (__m64) ((__vector long long) c)[0]; + return (__m64) ((__vector long long) __c)[0]; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1648,11 +1647,11 @@ _m_pmulhuw (__m64 __A, __m64 __B) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_shuffle_pi16 (__m64 __A, int const __N) { - unsigned long element_selector_10 = __N & 0x03; - unsigned long element_selector_32 = (__N >> 2) & 0x03; - unsigned long element_selector_54 = (__N >> 4) & 0x03; - unsigned long element_selector_76 = (__N >> 6) & 0x03; - static const unsigned short permute_selectors[4] = + unsigned long __element_selector_10 = __N & 0x03; + unsigned long __element_selector_32 = (__N >> 2) & 0x03; + unsigned long __element_selector_54 = (__N >> 4) & 0x03; + unsigned long __element_selector_76 = (__N >> 6) & 0x03; + static const unsigned short __permute_selectors[4] = { #ifdef __LITTLE_ENDIAN__ 0x0908, 0x0B0A, 0x0D0C, 0x0F0E @@ -1660,24 +1659,24 @@ _mm_shuffle_pi16 (__m64 __A, int const __N) 0x0607, 0x0405, 0x0203, 0x0001 #endif }; - __m64_union t; - __vector unsigned long long a, p, r; + __m64_union __t; + 
__vector unsigned long long __a, __p, __r; #ifdef __LITTLE_ENDIAN__ - t.as_short[0] = permute_selectors[element_selector_10]; - t.as_short[1] = permute_selectors[element_selector_32]; - t.as_short[2] = permute_selectors[element_selector_54]; - t.as_short[3] = permute_selectors[element_selector_76]; + __t.as_short[0] = __permute_selectors[__element_selector_10]; + __t.as_short[1] = __permute_selectors[__element_selector_32]; + __t.as_short[2] = __permute_selectors[__element_selector_54]; + __t.as_short[3] = __permute_selectors[__element_selector_76]; #else - t.as_short[3] = permute_selectors[element_selector_10]; - t.as_short[2] = permute_selectors[element_selector_32]; - t.as_short[1] = permute_selectors[element_selector_54]; - t.as_short[0] = permute_selectors[element_selector_76]; + __t.as_short[3] = __permute_selectors[__element_selector_10]; + __t.as_short[2] = __permute_selectors[__element_selector_32]; + __t.as_short[1] = __permute_selectors[__element_selector_54]; + __t.as_short[0] = __permute_selectors[__element_selector_76]; #endif - p = vec_splats (t.as_m64); - a = vec_splats (__A); - r = vec_perm (a, a, (__vector unsigned char)p); - return (__m64) ((__vector long long) r)[0]; + __p = vec_splats (__t.as_m64); + __a = vec_splats (__A); + __r = vec_perm (__a, __a, (__vector unsigned char)__p); + return (__m64) ((__vector long long) __r)[0]; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1692,14 +1691,14 @@ _m_pshufw (__m64 __A, int const __N) extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P) { - __m64 hibit = 0x8080808080808080UL; - __m64 mask, tmp; - __m64 *p = (__m64*)__P; + __m64 __hibit = 0x8080808080808080UL; + __m64 __mask, __tmp; + __m64 *__p = (__m64*)__P; - tmp = *p; - mask = _mm_cmpeq_pi8 ((__N & hibit), hibit); - tmp = (tmp & (~mask)) | (__A & mask); - *p = tmp; + __tmp = *__p; + __mask = _mm_cmpeq_pi8 ((__N & __hibit), __hibit); + __tmp = (__tmp & (~__mask)) | (__A & __mask); + *__p = __tmp; } extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1712,12 +1711,12 @@ _m_maskmovq (__m64 __A, __m64 __N, char *__P) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_avg_pu8 (__m64 __A, __m64 __B) { - __vector unsigned char a, b, c; + __vector unsigned char __a, __b, __c; - a = (__vector unsigned char)vec_splats (__A); - b = (__vector unsigned char)vec_splats (__B); - c = vec_avg (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector unsigned char)vec_splats (__A); + __b = (__vector unsigned char)vec_splats (__B); + __c = vec_avg (__a, __b); + return (__m64) ((__vector long long) __c)[0]; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1730,12 +1729,12 @@ _m_pavgb (__m64 __A, __m64 __B) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_avg_pu16 (__m64 __A, __m64 __B) { - __vector unsigned short a, b, c; + __vector unsigned short __a, __b, __c; - a = (__vector unsigned short)vec_splats (__A); - b = (__vector unsigned short)vec_splats (__B); - c = vec_avg (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector unsigned short)vec_splats (__A); + __b = (__vector unsigned short)vec_splats (__B); + __c = vec_avg (__a, __b); + return (__m64) ((__vector long long) __c)[0]; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) @@ -1750,26 +1749,26 @@ _m_pavgw (__m64 __A, __m64 __B) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sad_pu8 (__m64 __A, __m64 __B) { - __vector unsigned char a, b; - __vector unsigned char vmin, vmax, vabsdiff; - __vector signed int vsum; - const __vector unsigned int zero = + __vector unsigned char __a, __b; + __vector unsigned char __vmin, __vmax, __vabsdiff; + __vector signed int __vsum; + const __vector unsigned int __zero = { 0, 0, 0, 0 }; - __m64_union result = {0}; + __m64_union __result = {0}; - a = (__vector unsigned char) (__vector unsigned long long) { 0UL, __A }; - b = (__vector unsigned char) (__vector unsigned long long) { 0UL, __B }; - vmin = vec_min (a, b); - vmax = vec_max (a, b); - vabsdiff = vec_sub (vmax, vmin); + __a = (__vector unsigned char) (__vector unsigned long long) { 0UL, __A }; + __b = (__vector unsigned char) (__vector unsigned long long) { 0UL, __B }; + __vmin = vec_min (__a, __b); + __vmax = vec_max (__a, __b); + __vabsdiff = vec_sub (__vmax, __vmin); /* Sum four groups of bytes into integers. */ - vsum = (__vector signed int) vec_sum4s (vabsdiff, zero); + __vsum = (__vector signed int) vec_sum4s (__vabsdiff, __zero); /* Sum across four integers with integer result. */ - vsum = vec_sums (vsum, (__vector signed int) zero); + __vsum = vec_sums (__vsum, (__vector signed int) __zero); /* The sum is in the right most 32-bits of the vector result. Transfer to a GPR and truncate to 16 bits. */ - result.as_short[0] = vsum[3]; - return result.as_m64; + __result.as_short[0] = __vsum[3]; + return __result.as_m64; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -- cgit v1.1 From fe79d652c96b53384ddfa43e312cb0010251391b Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Thu, 17 Feb 2022 14:40:16 +0100 Subject: target/104581 - compile-time regression in mode-switching The x86 backend piggy-backs on mode-switching for insertion of vzeroupper. A recent improvement there was implemented in a way to walk possibly the whole basic-block for all DF reg def definitions in its mode_needed hook which is called for each instruction in a basic-block during mode-switching local analysis. The following mostly reverts this improvement. It needs to be re-done in a way more consistent with a local dataflow which probably means making targets aware of the state of the local dataflow analysis. 2022-02-17 Richard Biener PR target/104581 * config/i386/i386.cc (ix86_avx_u128_mode_source): Remove. (ix86_avx_u128_mode_needed): Return AVX_U128_DIRTY instead of calling ix86_avx_u128_mode_source which would eventually have returned AVX_U128_ANY in some very special case. * gcc.target/i386/pr101456-1.c: XFAIL. --- gcc/config/i386/i386.cc | 78 ++----------------------------------------------- 1 file changed, 3 insertions(+), 75 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index cf246e7..e4b42fb 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -14377,80 +14377,12 @@ ix86_check_avx_upper_register (const_rtx exp) static void ix86_check_avx_upper_stores (rtx dest, const_rtx, void *data) - { - if (ix86_check_avx_upper_register (dest)) +{ + if (ix86_check_avx_upper_register (dest)) { bool *used = (bool *) data; *used = true; } - } - -/* For YMM/ZMM store or YMM/ZMM extract. Return mode for the source - operand of SRC DEFs in the same basic block before INSN. 
*/ - -static int -ix86_avx_u128_mode_source (rtx_insn *insn, const_rtx src) -{ - basic_block bb = BLOCK_FOR_INSN (insn); - rtx_insn *end = BB_END (bb); - - /* Return AVX_U128_DIRTY if there is no DEF in the same basic - block. */ - int status = AVX_U128_DIRTY; - - for (df_ref def = DF_REG_DEF_CHAIN (REGNO (src)); - def; def = DF_REF_NEXT_REG (def)) - if (DF_REF_BB (def) == bb) - { - /* Ignore DEF from different basic blocks. */ - rtx_insn *def_insn = DF_REF_INSN (def); - - /* Check if DEF_INSN is before INSN. */ - rtx_insn *next; - for (next = NEXT_INSN (def_insn); - next != nullptr && next != end && next != insn; - next = NEXT_INSN (next)) - ; - - /* Skip if DEF_INSN isn't before INSN. */ - if (next != insn) - continue; - - /* Return AVX_U128_DIRTY if the source operand of DEF_INSN - isn't constant zero. */ - - if (CALL_P (def_insn)) - { - bool avx_upper_reg_found = false; - note_stores (def_insn, - ix86_check_avx_upper_stores, - &avx_upper_reg_found); - - /* Return AVX_U128_DIRTY if call returns AVX. */ - if (avx_upper_reg_found) - return AVX_U128_DIRTY; - - continue; - } - - rtx set = single_set (def_insn); - if (!set) - return AVX_U128_DIRTY; - - rtx dest = SET_DEST (set); - - /* Skip if DEF_INSN is not an AVX load. Return AVX_U128_DIRTY - if the source operand isn't constant zero. */ - if (ix86_check_avx_upper_register (dest) - && standard_sse_constant_p (SET_SRC (set), - GET_MODE (dest)) != 1) - return AVX_U128_DIRTY; - - /* We get here only if all AVX loads are from constant zero. */ - status = AVX_U128_ANY; - } - - return status; } /* Return needed mode for entity in optimize_mode_switching pass. */ @@ -14520,11 +14452,7 @@ ix86_avx_u128_mode_needed (rtx_insn *insn) { FOR_EACH_SUBRTX (iter, array, src, NONCONST) if (ix86_check_avx_upper_register (*iter)) - { - int status = ix86_avx_u128_mode_source (insn, *iter); - if (status == AVX_U128_DIRTY) - return status; - } + return AVX_U128_DIRTY; } /* This isn't YMM/ZMM load/store. */ -- cgit v1.1 From df5ed150ee5fbcb8255e05eed978c4af2b3d9bcc Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Fri, 18 Feb 2022 17:21:43 +0100 Subject: rs6000: Fix up posix_memalign call in _mm_malloc [PR104598] The uglification changes went one spot too far and also uglified the name of the function itself; posix_memalign should be called by that name, not replaced with a non-existent function. 2022-02-18 Jakub Jelinek PR target/104257 PR target/104598 * config/rs6000/mm_malloc.h (_mm_malloc): Call posix_memalign rather than __posix_memalign. --- gcc/config/rs6000/mm_malloc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/mm_malloc.h b/gcc/config/rs6000/mm_malloc.h index 721f756..ae47cac 100644 --- a/gcc/config/rs6000/mm_malloc.h +++ b/gcc/config/rs6000/mm_malloc.h @@ -47,7 +47,7 @@ _mm_malloc (size_t __size, size_t __alignment) return malloc (__size); if (__alignment < __vec_align) __alignment = __vec_align; - if (__posix_memalign (&__ptr, __alignment, __size) == 0) + if (posix_memalign (&__ptr, __alignment, __size) == 0) return __ptr; else return NULL; -- cgit v1.1 From 4984f882f41be1472df6ce7c439c98c4bc4e6f08 Mon Sep 17 00:00:00 2001 From: Pat Haugen Date: Fri, 18 Feb 2022 15:38:23 -0600 Subject: Mark Power10 fusion option undocumented and remove sub-options. gcc/ * config/rs6000/rs6000.opt (mpower10-fusion): Mark Undocumented. (mpower10-fusion-ld-cmpi, mpower10-fusion-2logical, mpower10-fusion-logical-add, mpower10-fusion-add-logical, mpower10-fusion-2add, mpower10-fusion-2store): Remove.
* config/rs6000/rs6000-cpus.def (ISA_3_1_MASKS_SERVER, OTHER_P9_VECTOR_MASKS): Remove Power10 fusion sub-options. * config/rs6000/rs6000.cc (rs6000_option_override_internal, power10_sched_reorder): Likewise. * config/rs6000/genfusion.pl (gen_ld_cmpi_p10, gen_logical_addsubf, gen_addadd): Likewise * config/rs6000/fusion.md: Regenerate. --- gcc/config/rs6000/fusion.md | 332 +++++++++++++++++++------------------- gcc/config/rs6000/genfusion.pl | 13 +- gcc/config/rs6000/rs6000-cpus.def | 14 +- gcc/config/rs6000/rs6000.cc | 27 +--- gcc/config/rs6000/rs6000.opt | 26 +-- 5 files changed, 174 insertions(+), 238 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/fusion.md b/gcc/config/rs6000/fusion.md index 6f9f534..15f0c16 100644 --- a/gcc/config/rs6000/fusion.md +++ b/gcc/config/rs6000/fusion.md @@ -25,7 +25,7 @@ (compare:CC (match_operand:DI 1 "ds_form_mem_operand" "m") (match_operand:DI 3 "const_m1_to_1_operand" "n"))) (clobber (match_scratch:DI 0 "=r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LD_CMPI)" + "(TARGET_P10_FUSION)" "ld%X1 %0,%1\;cmpdi %2,%0,%3" "&& reload_completed && (cc_reg_not_cr0_operand (operands[2], CCmode) @@ -46,7 +46,7 @@ (compare:CCUNS (match_operand:DI 1 "ds_form_mem_operand" "m") (match_operand:DI 3 "const_0_to_1_operand" "n"))) (clobber (match_scratch:DI 0 "=r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LD_CMPI)" + "(TARGET_P10_FUSION)" "ld%X1 %0,%1\;cmpldi %2,%0,%3" "&& reload_completed && (cc_reg_not_cr0_operand (operands[2], CCmode) @@ -67,7 +67,7 @@ (compare:CC (match_operand:DI 1 "ds_form_mem_operand" "m") (match_operand:DI 3 "const_m1_to_1_operand" "n"))) (set (match_operand:DI 0 "gpc_reg_operand" "=r") (match_dup 1))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LD_CMPI)" + "(TARGET_P10_FUSION)" "ld%X1 %0,%1\;cmpdi %2,%0,%3" "&& reload_completed && (cc_reg_not_cr0_operand (operands[2], CCmode) @@ -88,7 +88,7 @@ (compare:CCUNS (match_operand:DI 1 "ds_form_mem_operand" "m") (match_operand:DI 3 "const_0_to_1_operand" "n"))) (set (match_operand:DI 0 "gpc_reg_operand" "=r") (match_dup 1))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LD_CMPI)" + "(TARGET_P10_FUSION)" "ld%X1 %0,%1\;cmpldi %2,%0,%3" "&& reload_completed && (cc_reg_not_cr0_operand (operands[2], CCmode) @@ -109,7 +109,7 @@ (compare:CC (match_operand:SI 1 "ds_form_mem_operand" "m") (match_operand:SI 3 "const_m1_to_1_operand" "n"))) (clobber (match_scratch:SI 0 "=r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LD_CMPI)" + "(TARGET_P10_FUSION)" "lwa%X1 %0,%1\;cmpdi %2,%0,%3" "&& reload_completed && (cc_reg_not_cr0_operand (operands[2], CCmode) @@ -130,7 +130,7 @@ (compare:CCUNS (match_operand:SI 1 "non_update_memory_operand" "m") (match_operand:SI 3 "const_0_to_1_operand" "n"))) (clobber (match_scratch:SI 0 "=r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LD_CMPI)" + "(TARGET_P10_FUSION)" "lwz%X1 %0,%1\;cmpldi %2,%0,%3" "&& reload_completed && (cc_reg_not_cr0_operand (operands[2], CCmode) @@ -151,7 +151,7 @@ (compare:CC (match_operand:SI 1 "ds_form_mem_operand" "m") (match_operand:SI 3 "const_m1_to_1_operand" "n"))) (set (match_operand:SI 0 "gpc_reg_operand" "=r") (match_dup 1))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LD_CMPI)" + "(TARGET_P10_FUSION)" "lwa%X1 %0,%1\;cmpdi %2,%0,%3" "&& reload_completed && (cc_reg_not_cr0_operand (operands[2], CCmode) @@ -172,7 +172,7 @@ (compare:CCUNS (match_operand:SI 1 "non_update_memory_operand" "m") (match_operand:SI 3 "const_0_to_1_operand" "n"))) (set (match_operand:SI 0 "gpc_reg_operand" "=r") (match_dup 1))] - "(TARGET_P10_FUSION && 
TARGET_P10_FUSION_LD_CMPI)" + "(TARGET_P10_FUSION)" "lwz%X1 %0,%1\;cmpldi %2,%0,%3" "&& reload_completed && (cc_reg_not_cr0_operand (operands[2], CCmode) @@ -193,7 +193,7 @@ (compare:CC (match_operand:SI 1 "ds_form_mem_operand" "m") (match_operand:SI 3 "const_m1_to_1_operand" "n"))) (set (match_operand:EXTSI 0 "gpc_reg_operand" "=r") (sign_extend:EXTSI (match_dup 1)))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LD_CMPI)" + "(TARGET_P10_FUSION)" "lwa%X1 %0,%1\;cmpdi %2,%0,%3" "&& reload_completed && (cc_reg_not_cr0_operand (operands[2], CCmode) @@ -214,7 +214,7 @@ (compare:CCUNS (match_operand:SI 1 "non_update_memory_operand" "m") (match_operand:SI 3 "const_0_to_1_operand" "n"))) (set (match_operand:EXTSI 0 "gpc_reg_operand" "=r") (zero_extend:EXTSI (match_dup 1)))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LD_CMPI)" + "(TARGET_P10_FUSION)" "lwz%X1 %0,%1\;cmpldi %2,%0,%3" "&& reload_completed && (cc_reg_not_cr0_operand (operands[2], CCmode) @@ -235,7 +235,7 @@ (compare:CC (match_operand:HI 1 "non_update_memory_operand" "m") (match_operand:HI 3 "const_m1_to_1_operand" "n"))) (clobber (match_scratch:GPR 0 "=r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LD_CMPI)" + "(TARGET_P10_FUSION)" "lha%X1 %0,%1\;cmpdi %2,%0,%3" "&& reload_completed && (cc_reg_not_cr0_operand (operands[2], CCmode) @@ -256,7 +256,7 @@ (compare:CCUNS (match_operand:HI 1 "non_update_memory_operand" "m") (match_operand:HI 3 "const_0_to_1_operand" "n"))) (clobber (match_scratch:GPR 0 "=r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LD_CMPI)" + "(TARGET_P10_FUSION)" "lhz%X1 %0,%1\;cmpldi %2,%0,%3" "&& reload_completed && (cc_reg_not_cr0_operand (operands[2], CCmode) @@ -277,7 +277,7 @@ (compare:CC (match_operand:HI 1 "non_update_memory_operand" "m") (match_operand:HI 3 "const_m1_to_1_operand" "n"))) (set (match_operand:EXTHI 0 "gpc_reg_operand" "=r") (sign_extend:EXTHI (match_dup 1)))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LD_CMPI)" + "(TARGET_P10_FUSION)" "lha%X1 %0,%1\;cmpdi %2,%0,%3" "&& reload_completed && (cc_reg_not_cr0_operand (operands[2], CCmode) @@ -298,7 +298,7 @@ (compare:CCUNS (match_operand:HI 1 "non_update_memory_operand" "m") (match_operand:HI 3 "const_0_to_1_operand" "n"))) (set (match_operand:EXTHI 0 "gpc_reg_operand" "=r") (zero_extend:EXTHI (match_dup 1)))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LD_CMPI)" + "(TARGET_P10_FUSION)" "lhz%X1 %0,%1\;cmpldi %2,%0,%3" "&& reload_completed && (cc_reg_not_cr0_operand (operands[2], CCmode) @@ -319,7 +319,7 @@ (compare:CCUNS (match_operand:QI 1 "non_update_memory_operand" "m") (match_operand:QI 3 "const_0_to_1_operand" "n"))) (clobber (match_scratch:GPR 0 "=r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LD_CMPI)" + "(TARGET_P10_FUSION)" "lbz%X1 %0,%1\;cmpldi %2,%0,%3" "&& reload_completed && (cc_reg_not_cr0_operand (operands[2], CCmode) @@ -340,7 +340,7 @@ (compare:CCUNS (match_operand:QI 1 "non_update_memory_operand" "m") (match_operand:QI 3 "const_0_to_1_operand" "n"))) (set (match_operand:GPR 0 "gpc_reg_operand" "=r") (zero_extend:GPR (match_dup 1)))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LD_CMPI)" + "(TARGET_P10_FUSION)" "lbz%X1 %0,%1\;cmpldi %2,%0,%3" "&& reload_completed && (cc_reg_not_cr0_operand (operands[2], CCmode) @@ -363,7 +363,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "%r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ and %3,%1,%0\;and %3,%3,%2 and %3,%1,%0\;and %3,%3,%2 @@ -381,7 +381,7 @@ 
(match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ andc %3,%1,%0\;and %3,%3,%2 andc %3,%1,%0\;and %3,%3,%2 @@ -399,7 +399,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ eqv %3,%1,%0\;and %3,%3,%2 eqv %3,%1,%0\;and %3,%3,%2 @@ -417,7 +417,7 @@ (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ nand %3,%1,%0\;and %3,%3,%2 nand %3,%1,%0\;and %3,%3,%2 @@ -435,7 +435,7 @@ (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ nor %3,%1,%0\;and %3,%3,%2 nor %3,%1,%0\;and %3,%3,%2 @@ -453,7 +453,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ or %3,%1,%0\;and %3,%3,%2 or %3,%1,%0\;and %3,%3,%2 @@ -471,7 +471,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ orc %3,%1,%0\;and %3,%3,%2 orc %3,%1,%0\;and %3,%3,%2 @@ -489,7 +489,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ xor %3,%1,%0\;and %3,%3,%2 xor %3,%1,%0\;and %3,%3,%2 @@ -507,7 +507,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_ADDLOG)" + "(TARGET_P10_FUSION)" "@ add %3,%1,%0\;and %3,%3,%2 add %3,%1,%0\;and %3,%3,%2 @@ -525,7 +525,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_ADDLOG)" + "(TARGET_P10_FUSION)" "@ subf %3,%1,%0\;and %3,%3,%2 subf %3,%1,%0\;and %3,%3,%2 @@ -543,7 +543,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ and %3,%1,%0\;andc %3,%3,%2 and %3,%1,%0\;andc %3,%3,%2 @@ -561,7 +561,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ andc %3,%1,%0\;andc %3,%3,%2 andc %3,%1,%0\;andc %3,%3,%2 @@ -579,7 +579,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - 
"(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ eqv %3,%1,%0\;andc %3,%3,%2 eqv %3,%1,%0\;andc %3,%3,%2 @@ -597,7 +597,7 @@ (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ nand %3,%1,%0\;andc %3,%3,%2 nand %3,%1,%0\;andc %3,%3,%2 @@ -615,7 +615,7 @@ (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ nor %3,%1,%0\;andc %3,%3,%2 nor %3,%1,%0\;andc %3,%3,%2 @@ -633,7 +633,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ or %3,%1,%0\;andc %3,%3,%2 or %3,%1,%0\;andc %3,%3,%2 @@ -651,7 +651,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ orc %3,%1,%0\;andc %3,%3,%2 orc %3,%1,%0\;andc %3,%3,%2 @@ -669,7 +669,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ xor %3,%1,%0\;andc %3,%3,%2 xor %3,%1,%0\;andc %3,%3,%2 @@ -687,7 +687,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ and %3,%1,%0\;eqv %3,%3,%2 and %3,%1,%0\;eqv %3,%3,%2 @@ -705,7 +705,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ andc %3,%1,%0\;eqv %3,%3,%2 andc %3,%1,%0\;eqv %3,%3,%2 @@ -723,7 +723,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "%r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ eqv %3,%1,%0\;eqv %3,%3,%2 eqv %3,%1,%0\;eqv %3,%3,%2 @@ -741,7 +741,7 @@ (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ nand %3,%1,%0\;eqv %3,%3,%2 nand %3,%1,%0\;eqv %3,%3,%2 @@ -759,7 +759,7 @@ (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ nor %3,%1,%0\;eqv %3,%3,%2 nor %3,%1,%0\;eqv %3,%3,%2 @@ -777,7 +777,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ or %3,%1,%0\;eqv %3,%3,%2 or 
%3,%1,%0\;eqv %3,%3,%2 @@ -795,7 +795,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ orc %3,%1,%0\;eqv %3,%3,%2 orc %3,%1,%0\;eqv %3,%3,%2 @@ -813,7 +813,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ xor %3,%1,%0\;eqv %3,%3,%2 xor %3,%1,%0\;eqv %3,%3,%2 @@ -831,7 +831,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ and %3,%1,%0\;nand %3,%3,%2 and %3,%1,%0\;nand %3,%3,%2 @@ -849,7 +849,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ andc %3,%1,%0\;nand %3,%3,%2 andc %3,%1,%0\;nand %3,%3,%2 @@ -867,7 +867,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ eqv %3,%1,%0\;nand %3,%3,%2 eqv %3,%1,%0\;nand %3,%3,%2 @@ -885,7 +885,7 @@ (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ nand %3,%1,%0\;nand %3,%3,%2 nand %3,%1,%0\;nand %3,%3,%2 @@ -903,7 +903,7 @@ (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ nor %3,%1,%0\;nand %3,%3,%2 nor %3,%1,%0\;nand %3,%3,%2 @@ -921,7 +921,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ or %3,%1,%0\;nand %3,%3,%2 or %3,%1,%0\;nand %3,%3,%2 @@ -939,7 +939,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ orc %3,%1,%0\;nand %3,%3,%2 orc %3,%1,%0\;nand %3,%3,%2 @@ -957,7 +957,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ xor %3,%1,%0\;nand %3,%3,%2 xor %3,%1,%0\;nand %3,%3,%2 @@ -975,7 +975,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_ADDLOG)" + "(TARGET_P10_FUSION)" "@ add %3,%1,%0\;nand %3,%3,%2 add %3,%1,%0\;nand %3,%3,%2 @@ -993,7 +993,7 @@ (match_operand:GPR 1 "gpc_reg_operand" 
"r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_ADDLOG)" + "(TARGET_P10_FUSION)" "@ subf %3,%1,%0\;nand %3,%3,%2 subf %3,%1,%0\;nand %3,%3,%2 @@ -1011,7 +1011,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ and %3,%1,%0\;nor %3,%3,%2 and %3,%1,%0\;nor %3,%3,%2 @@ -1029,7 +1029,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ andc %3,%1,%0\;nor %3,%3,%2 andc %3,%1,%0\;nor %3,%3,%2 @@ -1047,7 +1047,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ eqv %3,%1,%0\;nor %3,%3,%2 eqv %3,%1,%0\;nor %3,%3,%2 @@ -1065,7 +1065,7 @@ (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ nand %3,%1,%0\;nor %3,%3,%2 nand %3,%1,%0\;nor %3,%3,%2 @@ -1083,7 +1083,7 @@ (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ nor %3,%1,%0\;nor %3,%3,%2 nor %3,%1,%0\;nor %3,%3,%2 @@ -1101,7 +1101,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ or %3,%1,%0\;nor %3,%3,%2 or %3,%1,%0\;nor %3,%3,%2 @@ -1119,7 +1119,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ orc %3,%1,%0\;nor %3,%3,%2 orc %3,%1,%0\;nor %3,%3,%2 @@ -1137,7 +1137,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ xor %3,%1,%0\;nor %3,%3,%2 xor %3,%1,%0\;nor %3,%3,%2 @@ -1155,7 +1155,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_ADDLOG)" + "(TARGET_P10_FUSION)" "@ add %3,%1,%0\;nor %3,%3,%2 add %3,%1,%0\;nor %3,%3,%2 @@ -1173,7 +1173,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_ADDLOG)" + "(TARGET_P10_FUSION)" "@ subf %3,%1,%0\;nor %3,%3,%2 subf %3,%1,%0\;nor %3,%3,%2 @@ -1191,7 +1191,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" 
"r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ and %3,%1,%0\;or %3,%3,%2 and %3,%1,%0\;or %3,%3,%2 @@ -1209,7 +1209,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ andc %3,%1,%0\;or %3,%3,%2 andc %3,%1,%0\;or %3,%3,%2 @@ -1227,7 +1227,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ eqv %3,%1,%0\;or %3,%3,%2 eqv %3,%1,%0\;or %3,%3,%2 @@ -1245,7 +1245,7 @@ (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ nand %3,%1,%0\;or %3,%3,%2 nand %3,%1,%0\;or %3,%3,%2 @@ -1263,7 +1263,7 @@ (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ nor %3,%1,%0\;or %3,%3,%2 nor %3,%1,%0\;or %3,%3,%2 @@ -1281,7 +1281,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "%r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ or %3,%1,%0\;or %3,%3,%2 or %3,%1,%0\;or %3,%3,%2 @@ -1299,7 +1299,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ orc %3,%1,%0\;or %3,%3,%2 orc %3,%1,%0\;or %3,%3,%2 @@ -1317,7 +1317,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ xor %3,%1,%0\;or %3,%3,%2 xor %3,%1,%0\;or %3,%3,%2 @@ -1335,7 +1335,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_ADDLOG)" + "(TARGET_P10_FUSION)" "@ add %3,%1,%0\;or %3,%3,%2 add %3,%1,%0\;or %3,%3,%2 @@ -1353,7 +1353,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_ADDLOG)" + "(TARGET_P10_FUSION)" "@ subf %3,%1,%0\;or %3,%3,%2 subf %3,%1,%0\;or %3,%3,%2 @@ -1371,7 +1371,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ and %3,%1,%0\;orc %3,%3,%2 and %3,%1,%0\;orc %3,%3,%2 @@ -1389,7 +1389,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ andc %3,%1,%0\;orc %3,%3,%2 andc 
%3,%1,%0\;orc %3,%3,%2 @@ -1407,7 +1407,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ eqv %3,%1,%0\;orc %3,%3,%2 eqv %3,%1,%0\;orc %3,%3,%2 @@ -1425,7 +1425,7 @@ (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ nand %3,%1,%0\;orc %3,%3,%2 nand %3,%1,%0\;orc %3,%3,%2 @@ -1443,7 +1443,7 @@ (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ nor %3,%1,%0\;orc %3,%3,%2 nor %3,%1,%0\;orc %3,%3,%2 @@ -1461,7 +1461,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ or %3,%1,%0\;orc %3,%3,%2 or %3,%1,%0\;orc %3,%3,%2 @@ -1479,7 +1479,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ orc %3,%1,%0\;orc %3,%3,%2 orc %3,%1,%0\;orc %3,%3,%2 @@ -1497,7 +1497,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ xor %3,%1,%0\;orc %3,%3,%2 xor %3,%1,%0\;orc %3,%3,%2 @@ -1515,7 +1515,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ and %3,%1,%0\;xor %3,%3,%2 and %3,%1,%0\;xor %3,%3,%2 @@ -1533,7 +1533,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ andc %3,%1,%0\;xor %3,%3,%2 andc %3,%1,%0\;xor %3,%3,%2 @@ -1551,7 +1551,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ eqv %3,%1,%0\;xor %3,%3,%2 eqv %3,%1,%0\;xor %3,%3,%2 @@ -1569,7 +1569,7 @@ (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ nand %3,%1,%0\;xor %3,%3,%2 nand %3,%1,%0\;xor %3,%3,%2 @@ -1587,7 +1587,7 @@ (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ nor %3,%1,%0\;xor %3,%3,%2 nor %3,%1,%0\;xor %3,%3,%2 @@ -1605,7 +1605,7 @@ (match_operand:GPR 1 "gpc_reg_operand" 
"r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ or %3,%1,%0\;xor %3,%3,%2 or %3,%1,%0\;xor %3,%3,%2 @@ -1623,7 +1623,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ orc %3,%1,%0\;xor %3,%3,%2 orc %3,%1,%0\;xor %3,%3,%2 @@ -1641,7 +1641,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "%r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ xor %3,%1,%0\;xor %3,%3,%2 xor %3,%1,%0\;xor %3,%3,%2 @@ -1659,7 +1659,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)" + "(TARGET_P10_FUSION)" "@ and %3,%1,%0\;add %3,%3,%2 and %3,%1,%0\;add %3,%3,%2 @@ -1677,7 +1677,7 @@ (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)" + "(TARGET_P10_FUSION)" "@ nand %3,%1,%0\;add %3,%3,%2 nand %3,%1,%0\;add %3,%3,%2 @@ -1695,7 +1695,7 @@ (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)" + "(TARGET_P10_FUSION)" "@ nor %3,%1,%0\;add %3,%3,%2 nor %3,%1,%0\;add %3,%3,%2 @@ -1713,7 +1713,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)" + "(TARGET_P10_FUSION)" "@ or %3,%1,%0\;add %3,%3,%2 or %3,%1,%0\;add %3,%3,%2 @@ -1731,7 +1731,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)" + "(TARGET_P10_FUSION)" "@ and %3,%1,%0\;subf %3,%2,%3 and %3,%1,%0\;subf %3,%2,%3 @@ -1749,7 +1749,7 @@ (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)" + "(TARGET_P10_FUSION)" "@ nand %3,%1,%0\;subf %3,%2,%3 nand %3,%1,%0\;subf %3,%2,%3 @@ -1767,7 +1767,7 @@ (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)" + "(TARGET_P10_FUSION)" "@ nor %3,%1,%0\;subf %3,%2,%3 nor %3,%1,%0\;subf %3,%2,%3 @@ -1785,7 +1785,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)" + "(TARGET_P10_FUSION)" "@ or %3,%1,%0\;subf %3,%2,%3 or %3,%1,%0\;subf %3,%2,%3 @@ -1803,7 +1803,7 @@ (and:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && 
TARGET_P10_FUSION_LOGADD)" + "(TARGET_P10_FUSION)" "@ and %3,%1,%0\;subf %3,%3,%2 and %3,%1,%0\;subf %3,%3,%2 @@ -1821,7 +1821,7 @@ (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)" + "(TARGET_P10_FUSION)" "@ nand %3,%1,%0\;subf %3,%3,%2 nand %3,%1,%0\;subf %3,%3,%2 @@ -1839,7 +1839,7 @@ (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)" + "(TARGET_P10_FUSION)" "@ nor %3,%1,%0\;subf %3,%3,%2 nor %3,%1,%0\;subf %3,%3,%2 @@ -1857,7 +1857,7 @@ (ior:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)" + "(TARGET_P10_FUSION)" "@ or %3,%1,%0\;subf %3,%3,%2 or %3,%1,%0\;subf %3,%3,%2 @@ -1875,7 +1875,7 @@ (match_operand:VM 1 "altivec_register_operand" "%v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vand %3,%1,%0\;vand %3,%3,%2 vand %3,%1,%0\;vand %3,%3,%2 @@ -1893,7 +1893,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vandc %3,%1,%0\;vand %3,%3,%2 vandc %3,%1,%0\;vand %3,%3,%2 @@ -1911,7 +1911,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ veqv %3,%1,%0\;vand %3,%3,%2 veqv %3,%1,%0\;vand %3,%3,%2 @@ -1929,7 +1929,7 @@ (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vnand %3,%1,%0\;vand %3,%3,%2 vnand %3,%1,%0\;vand %3,%3,%2 @@ -1947,7 +1947,7 @@ (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vnor %3,%1,%0\;vand %3,%3,%2 vnor %3,%1,%0\;vand %3,%3,%2 @@ -1965,7 +1965,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vor %3,%1,%0\;vand %3,%3,%2 vor %3,%1,%0\;vand %3,%3,%2 @@ -1983,7 +1983,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vorc %3,%1,%0\;vand %3,%3,%2 vorc %3,%1,%0\;vand %3,%3,%2 @@ -2001,7 +2001,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 
"=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vxor %3,%1,%0\;vand %3,%3,%2 vxor %3,%1,%0\;vand %3,%3,%2 @@ -2019,7 +2019,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vand %3,%1,%0\;vandc %3,%3,%2 vand %3,%1,%0\;vandc %3,%3,%2 @@ -2037,7 +2037,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vandc %3,%1,%0\;vandc %3,%3,%2 vandc %3,%1,%0\;vandc %3,%3,%2 @@ -2055,7 +2055,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ veqv %3,%1,%0\;vandc %3,%3,%2 veqv %3,%1,%0\;vandc %3,%3,%2 @@ -2073,7 +2073,7 @@ (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vnand %3,%1,%0\;vandc %3,%3,%2 vnand %3,%1,%0\;vandc %3,%3,%2 @@ -2091,7 +2091,7 @@ (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vnor %3,%1,%0\;vandc %3,%3,%2 vnor %3,%1,%0\;vandc %3,%3,%2 @@ -2109,7 +2109,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vor %3,%1,%0\;vandc %3,%3,%2 vor %3,%1,%0\;vandc %3,%3,%2 @@ -2127,7 +2127,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vorc %3,%1,%0\;vandc %3,%3,%2 vorc %3,%1,%0\;vandc %3,%3,%2 @@ -2145,7 +2145,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vxor %3,%1,%0\;vandc %3,%3,%2 vxor %3,%1,%0\;vandc %3,%3,%2 @@ -2163,7 +2163,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vand %3,%1,%0\;veqv %3,%3,%2 vand %3,%1,%0\;veqv %3,%3,%2 @@ -2181,7 +2181,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vandc %3,%1,%0\;veqv %3,%3,%2 vandc %3,%1,%0\;veqv %3,%3,%2 @@ -2199,7 +2199,7 @@ (match_operand:VM 1 
"altivec_register_operand" "%v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ veqv %3,%1,%0\;veqv %3,%3,%2 veqv %3,%1,%0\;veqv %3,%3,%2 @@ -2217,7 +2217,7 @@ (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vnand %3,%1,%0\;veqv %3,%3,%2 vnand %3,%1,%0\;veqv %3,%3,%2 @@ -2235,7 +2235,7 @@ (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vnor %3,%1,%0\;veqv %3,%3,%2 vnor %3,%1,%0\;veqv %3,%3,%2 @@ -2253,7 +2253,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vor %3,%1,%0\;veqv %3,%3,%2 vor %3,%1,%0\;veqv %3,%3,%2 @@ -2271,7 +2271,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vorc %3,%1,%0\;veqv %3,%3,%2 vorc %3,%1,%0\;veqv %3,%3,%2 @@ -2289,7 +2289,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vxor %3,%1,%0\;veqv %3,%3,%2 vxor %3,%1,%0\;veqv %3,%3,%2 @@ -2307,7 +2307,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vand %3,%1,%0\;vnand %3,%3,%2 vand %3,%1,%0\;vnand %3,%3,%2 @@ -2325,7 +2325,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vandc %3,%1,%0\;vnand %3,%3,%2 vandc %3,%1,%0\;vnand %3,%3,%2 @@ -2343,7 +2343,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ veqv %3,%1,%0\;vnand %3,%3,%2 veqv %3,%1,%0\;vnand %3,%3,%2 @@ -2361,7 +2361,7 @@ (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vnand %3,%1,%0\;vnand %3,%3,%2 vnand %3,%1,%0\;vnand %3,%3,%2 @@ -2379,7 +2379,7 @@ (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + 
"(TARGET_P10_FUSION)" "@ vnor %3,%1,%0\;vnand %3,%3,%2 vnor %3,%1,%0\;vnand %3,%3,%2 @@ -2397,7 +2397,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vor %3,%1,%0\;vnand %3,%3,%2 vor %3,%1,%0\;vnand %3,%3,%2 @@ -2415,7 +2415,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vorc %3,%1,%0\;vnand %3,%3,%2 vorc %3,%1,%0\;vnand %3,%3,%2 @@ -2433,7 +2433,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vxor %3,%1,%0\;vnand %3,%3,%2 vxor %3,%1,%0\;vnand %3,%3,%2 @@ -2451,7 +2451,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vand %3,%1,%0\;vnor %3,%3,%2 vand %3,%1,%0\;vnor %3,%3,%2 @@ -2469,7 +2469,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vandc %3,%1,%0\;vnor %3,%3,%2 vandc %3,%1,%0\;vnor %3,%3,%2 @@ -2487,7 +2487,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ veqv %3,%1,%0\;vnor %3,%3,%2 veqv %3,%1,%0\;vnor %3,%3,%2 @@ -2505,7 +2505,7 @@ (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vnand %3,%1,%0\;vnor %3,%3,%2 vnand %3,%1,%0\;vnor %3,%3,%2 @@ -2523,7 +2523,7 @@ (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vnor %3,%1,%0\;vnor %3,%3,%2 vnor %3,%1,%0\;vnor %3,%3,%2 @@ -2541,7 +2541,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vor %3,%1,%0\;vnor %3,%3,%2 vor %3,%1,%0\;vnor %3,%3,%2 @@ -2559,7 +2559,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vorc %3,%1,%0\;vnor %3,%3,%2 vorc %3,%1,%0\;vnor %3,%3,%2 @@ -2577,7 +2577,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM 
(match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vxor %3,%1,%0\;vnor %3,%3,%2 vxor %3,%1,%0\;vnor %3,%3,%2 @@ -2595,7 +2595,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vand %3,%1,%0\;vor %3,%3,%2 vand %3,%1,%0\;vor %3,%3,%2 @@ -2613,7 +2613,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vandc %3,%1,%0\;vor %3,%3,%2 vandc %3,%1,%0\;vor %3,%3,%2 @@ -2631,7 +2631,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ veqv %3,%1,%0\;vor %3,%3,%2 veqv %3,%1,%0\;vor %3,%3,%2 @@ -2649,7 +2649,7 @@ (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vnand %3,%1,%0\;vor %3,%3,%2 vnand %3,%1,%0\;vor %3,%3,%2 @@ -2667,7 +2667,7 @@ (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vnor %3,%1,%0\;vor %3,%3,%2 vnor %3,%1,%0\;vor %3,%3,%2 @@ -2685,7 +2685,7 @@ (match_operand:VM 1 "altivec_register_operand" "%v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vor %3,%1,%0\;vor %3,%3,%2 vor %3,%1,%0\;vor %3,%3,%2 @@ -2703,7 +2703,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vorc %3,%1,%0\;vor %3,%3,%2 vorc %3,%1,%0\;vor %3,%3,%2 @@ -2721,7 +2721,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vxor %3,%1,%0\;vor %3,%3,%2 vxor %3,%1,%0\;vor %3,%3,%2 @@ -2739,7 +2739,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vand %3,%1,%0\;vorc %3,%3,%2 vand %3,%1,%0\;vorc %3,%3,%2 @@ -2757,7 +2757,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vandc %3,%1,%0\;vorc %3,%3,%2 vandc %3,%1,%0\;vorc %3,%3,%2 @@ -2775,7 +2775,7 @@ (match_operand:VM 1 
"altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ veqv %3,%1,%0\;vorc %3,%3,%2 veqv %3,%1,%0\;vorc %3,%3,%2 @@ -2793,7 +2793,7 @@ (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vnand %3,%1,%0\;vorc %3,%3,%2 vnand %3,%1,%0\;vorc %3,%3,%2 @@ -2811,7 +2811,7 @@ (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vnor %3,%1,%0\;vorc %3,%3,%2 vnor %3,%1,%0\;vorc %3,%3,%2 @@ -2829,7 +2829,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vor %3,%1,%0\;vorc %3,%3,%2 vor %3,%1,%0\;vorc %3,%3,%2 @@ -2847,7 +2847,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vorc %3,%1,%0\;vorc %3,%3,%2 vorc %3,%1,%0\;vorc %3,%3,%2 @@ -2865,7 +2865,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vxor %3,%1,%0\;vorc %3,%3,%2 vxor %3,%1,%0\;vorc %3,%3,%2 @@ -2883,7 +2883,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vand %3,%1,%0\;vxor %3,%3,%2 vand %3,%1,%0\;vxor %3,%3,%2 @@ -2901,7 +2901,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vandc %3,%1,%0\;vxor %3,%3,%2 vandc %3,%1,%0\;vxor %3,%3,%2 @@ -2919,7 +2919,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ veqv %3,%1,%0\;vxor %3,%3,%2 veqv %3,%1,%0\;vxor %3,%3,%2 @@ -2937,7 +2937,7 @@ (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vnand %3,%1,%0\;vxor %3,%3,%2 vnand %3,%1,%0\;vxor %3,%3,%2 @@ -2955,7 +2955,7 @@ (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + 
"(TARGET_P10_FUSION)" "@ vnor %3,%1,%0\;vxor %3,%3,%2 vnor %3,%1,%0\;vxor %3,%3,%2 @@ -2973,7 +2973,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vor %3,%1,%0\;vxor %3,%3,%2 vor %3,%1,%0\;vxor %3,%3,%2 @@ -2991,7 +2991,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vorc %3,%1,%0\;vxor %3,%3,%2 vorc %3,%1,%0\;vxor %3,%3,%2 @@ -3009,7 +3009,7 @@ (match_operand:VM 1 "altivec_register_operand" "%v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vxor %3,%1,%0\;vxor %3,%3,%2 vxor %3,%1,%0\;vxor %3,%3,%2 @@ -3027,7 +3027,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "%r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2ADD)" + "(TARGET_P10_FUSION)" "@ add %3,%1,%0\;add %3,%3,%2 add %3,%1,%0\;add %3,%3,%2 @@ -3045,7 +3045,7 @@ (match_operand:V2DI 1 "altivec_register_operand" "%v,v,v,v")) (match_operand:V2DI 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:V2DI 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2ADD)" + "(TARGET_P10_FUSION)" "@ vaddudm %3,%1,%0\;vaddudm %3,%3,%2 vaddudm %3,%1,%0\;vaddudm %3,%3,%2 diff --git a/gcc/config/rs6000/genfusion.pl b/gcc/config/rs6000/genfusion.pl index 7e201f7..81cc225 100755 --- a/gcc/config/rs6000/genfusion.pl +++ b/gcc/config/rs6000/genfusion.pl @@ -118,7 +118,7 @@ sub gen_ld_cmpi_p10 } else { print " (set (match_operand:${result} 0 \"gpc_reg_operand\" \"=r\") (${extend}_extend:${result} (match_dup 1)))]\n"; } - print " \"(TARGET_P10_FUSION && TARGET_P10_FUSION_LD_CMPI)\"\n"; + print " \"(TARGET_P10_FUSION)\"\n"; print " \"l${ldst}${echr}%X1 %0,%1\\;cmp${cmpl}di %2,%0,%3\"\n"; print " \"&& reload_completed\n"; print " && (cc_reg_not_cr0_operand (operands[2], CCmode)\n"; @@ -166,8 +166,8 @@ sub gen_logical_addsubf $outer_op, $outer_comp, $outer_inv, $outer_rtl, $inner, @inner_ops, $inner_comp, $inner_inv, $inner_rtl, $inner_op, $both_commute, $c4, $bc, $inner_arg0, $inner_arg1, $inner_exp, $outer_arg2, $outer_exp, - $target_flag, $ftype, $insn, $is_subf, $is_rsubf, $outer_32, $outer_42, - $outer_name, $fuse_type); + $ftype, $insn, $is_subf, $is_rsubf, $outer_32, $outer_42,$outer_name, + $fuse_type); KIND: foreach $kind ('scalar','vector') { @outer_ops = @logicals; if ( $kind eq 'vector' ) { @@ -199,18 +199,15 @@ sub gen_logical_addsubf $outer_rtl = $rtlop{$outer}; @inner_ops = @logicals; $ftype = "logical-logical"; - $target_flag = "TARGET_P10_FUSION_2LOGICAL"; if ( exists $isaddsub{$outer} ) { @inner_ops = sort keys %logicals_addsub; $ftype = "logical-add"; - $target_flag = "TARGET_P10_FUSION_LOGADD"; } elsif ( $kind ne 'vector' && exists $logicals_addsub{$outer} ) { push (@inner_ops, @addsub); } INNER: foreach $inner ( @inner_ops ) { if ( exists $isaddsub{$inner} ) { $ftype = "add-logical"; - $target_flag = "TARGET_P10_FUSION_ADDLOG"; } $inner_comp = $complement{$inner}; $inner_inv = $invert{$inner}; @@ -266,7 +263,7 @@ sub gen_logical_addsubf [(set (match_operand:${mode} 3 "${pred}" 
"=&0,&1,&${constraint},${constraint}") ${outer_exp}) (clobber (match_scratch:${mode} 4 "=X,X,X,&${constraint}"))] - "(TARGET_P10_FUSION && $target_flag)" + "(TARGET_P10_FUSION)" "@ ${inner_op} %3,%1,%0\\;${outer_op} %3,${outer_32} ${inner_op} %3,%1,%0\\;${outer_op} %3,${outer_32} @@ -313,7 +310,7 @@ sub gen_addadd (match_operand:${mode} 1 "${pred}" "%${c4}")) (match_operand:${mode} 2 "${pred}" "${c4}"))) (clobber (match_scratch:${mode} 4 "=X,X,X,&${constraint}"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2ADD)" + "(TARGET_P10_FUSION)" "@ ${op} %3,%1,%0\\;${op} %3,%3,%2 ${op} %3,%1,%0\\;${op} %3,%3,%2 diff --git a/gcc/config/rs6000/rs6000-cpus.def b/gcc/config/rs6000/rs6000-cpus.def index 325b219..963947f 100644 --- a/gcc/config/rs6000/rs6000-cpus.def +++ b/gcc/config/rs6000/rs6000-cpus.def @@ -85,13 +85,7 @@ #define ISA_3_1_MASKS_SERVER (ISA_3_0_MASKS_SERVER \ | OPTION_MASK_POWER10 \ | OTHER_POWER10_MASKS \ - | OPTION_MASK_P10_FUSION \ - | OPTION_MASK_P10_FUSION_LD_CMPI \ - | OPTION_MASK_P10_FUSION_2LOGICAL \ - | OPTION_MASK_P10_FUSION_LOGADD \ - | OPTION_MASK_P10_FUSION_ADDLOG \ - | OPTION_MASK_P10_FUSION_2ADD \ - | OPTION_MASK_P10_FUSION_2STORE) + | OPTION_MASK_P10_FUSION) /* Flags that need to be turned off if -mno-power9-vector. */ #define OTHER_P9_VECTOR_MASKS (OPTION_MASK_FLOAT128_HW \ @@ -139,12 +133,6 @@ | OPTION_MASK_FPRND \ | OPTION_MASK_POWER10 \ | OPTION_MASK_P10_FUSION \ - | OPTION_MASK_P10_FUSION_LD_CMPI \ - | OPTION_MASK_P10_FUSION_2LOGICAL \ - | OPTION_MASK_P10_FUSION_LOGADD \ - | OPTION_MASK_P10_FUSION_ADDLOG \ - | OPTION_MASK_P10_FUSION_2ADD \ - | OPTION_MASK_P10_FUSION_2STORE \ | OPTION_MASK_HTM \ | OPTION_MASK_ISEL \ | OPTION_MASK_MFCRF \ diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index 32a13cd..d7a7cfe 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -4446,30 +4446,6 @@ rs6000_option_override_internal (bool global_init_p) && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION) == 0) rs6000_isa_flags |= OPTION_MASK_P10_FUSION; - if (TARGET_POWER10 && - (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_LD_CMPI) == 0) - rs6000_isa_flags |= OPTION_MASK_P10_FUSION_LD_CMPI; - - if (TARGET_POWER10 - && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_2LOGICAL) == 0) - rs6000_isa_flags |= OPTION_MASK_P10_FUSION_2LOGICAL; - - if (TARGET_POWER10 - && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_LOGADD) == 0) - rs6000_isa_flags |= OPTION_MASK_P10_FUSION_LOGADD; - - if (TARGET_POWER10 - && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_ADDLOG) == 0) - rs6000_isa_flags |= OPTION_MASK_P10_FUSION_ADDLOG; - - if (TARGET_POWER10 - && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_2ADD) == 0) - rs6000_isa_flags |= OPTION_MASK_P10_FUSION_2ADD; - - if (TARGET_POWER10 - && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_2STORE) == 0) - rs6000_isa_flags |= OPTION_MASK_P10_FUSION_2STORE; - /* Turn off vector pair/mma options on non-power10 systems. */ else if (!TARGET_POWER10 && TARGET_MMA) { @@ -19032,8 +19008,7 @@ power10_sched_reorder (rtx_insn **ready, int lastpos) /* Try to pair certain store insns to adjacent memory locations so that the hardware will fuse them to a single operation. */ - if (TARGET_P10_FUSION && TARGET_P10_FUSION_2STORE - && is_fusable_store (last_scheduled_insn, &mem1)) + if (TARGET_P10_FUSION && is_fusable_store (last_scheduled_insn, &mem1)) { /* A fusable store was just scheduled. 
Scan the ready list for another diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt index 68c0cae..4931d78 100644 --- a/gcc/config/rs6000/rs6000.opt +++ b/gcc/config/rs6000/rs6000.opt @@ -487,33 +487,9 @@ Target Mask(P8_VECTOR) Var(rs6000_isa_flags) Use vector and scalar instructions added in ISA 2.07. mpower10-fusion -Target Mask(P10_FUSION) Var(rs6000_isa_flags) +Target Undocumented Mask(P10_FUSION) Var(rs6000_isa_flags) Fuse certain integer operations together for better performance on power10. -mpower10-fusion-ld-cmpi -Target Undocumented Mask(P10_FUSION_LD_CMPI) Var(rs6000_isa_flags) -Fuse certain integer operations together for better performance on power10. - -mpower10-fusion-2logical -Target Undocumented Mask(P10_FUSION_2LOGICAL) Var(rs6000_isa_flags) -Fuse pairs of scalar or vector logical operations together for better performance on power10. - -mpower10-fusion-logical-add -Target Undocumented Mask(P10_FUSION_LOGADD) Var(rs6000_isa_flags) -Fuse scalar logical op with add/subf for better performance on power10. - -mpower10-fusion-add-logical -Target Undocumented Mask(P10_FUSION_ADDLOG) Var(rs6000_isa_flags) -Fuse scalar add/subf with logical op for better performance on power10. - -mpower10-fusion-2add -Target Undocumented Mask(P10_FUSION_2ADD) Var(rs6000_isa_flags) -Fuse dependent pairs of add or vaddudm instructions for better performance on power10. - -mpower10-fusion-2store -Target Undocumented Mask(P10_FUSION_2STORE) Var(rs6000_isa_flags) -Fuse certain store operations together for better performance on power10. - mcrypto Target Mask(CRYPTO) Var(rs6000_isa_flags) Use ISA 2.07 Category:Vector.AES and Category:Vector.SHA2 instructions. -- cgit v1.1 From 8e5c34ab45f34aadea65c5ba33ec685264b6ec66 Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Fri, 18 Feb 2022 16:50:03 +0100 Subject: [nvptx] Use nvptx_warpsync / nvptx_uniform_warp_check for -muniform-simt With the default ptx isa 6.0, we have for uniform-simt-1.c: ... @%r33 atom.global.cas.b32 %r26, [a], %r28, %r29; shfl.sync.idx.b32 %r26, %r26, %r32, 31, 0xffffffff; ... The atomic insn is predicated by -muniform-simt, and the subsequent insn does a warp sync, at which point the warp is uniform again. But with -mptx=3.1, we have instead: ... @%r33 atom.global.cas.b32 %r26, [a], %r28, %r29; shfl.idx.b32 %r26, %r26, %r32, 31; ... The shfl does not sync the warp, and we want the warp to go back to executing uniformly asap. We cannot enforce this, but at least check this using nvptx_uniform_warp_check, similar to how that is done for openacc. Likewise, detect the case that no shfl insn is emitted, and add a nvptx_uniform_warp_check or nvptx_warpsync. gcc/ChangeLog: 2022-02-19 Tom de Vries * config/nvptx/nvptx.cc (nvptx_unisimt_handle_set): Change return type to bool. (nvptx_reorg_uniform_simt): Insert nvptx_uniform_warp_check or nvptx_warpsync, if necessary. gcc/testsuite/ChangeLog: 2022-02-19 Tom de Vries * gcc.target/nvptx/uniform-simt-1.c: Add scan-assembler test. * gcc.target/nvptx/uniform-simt-2.c: New test. --- gcc/config/nvptx/nvptx.cc | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index afbad5b..4942f11 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -3248,12 +3248,18 @@ nvptx_call_insn_is_syscall_p (rtx_insn *insn) /* If SET subexpression of INSN sets a register, emit a shuffle instruction to propagate its value from lane MASTER to current lane. 
*/ -static void +static bool nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master) { rtx reg; if (GET_CODE (set) == SET && REG_P (reg = SET_DEST (set))) - emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX), insn); + { + emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX), + insn); + return true; + } + + return false; } /* Adjust code for uniform-simt code generation variant by making atomics and @@ -3275,8 +3281,30 @@ nvptx_reorg_uniform_simt () continue; rtx pat = PATTERN (insn); rtx master = nvptx_get_unisimt_master (); + bool shuffle_p = false; for (int i = 0; i < XVECLEN (pat, 0); i++) - nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master); + shuffle_p + |= nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master); + if (shuffle_p && TARGET_PTX_6_0) + { + /* The shuffle is a sync, so uniformity is guaranteed. */ + } + else + { + if (TARGET_PTX_6_0) + { + gcc_assert (!shuffle_p); + /* Emit after the insn, to guarantee uniformity. */ + emit_insn_after (gen_nvptx_warpsync (), insn); + } + else + { + /* Emit after the insn (and before the shuffle, if there are any) + to check uniformity. */ + emit_insn_after (gen_nvptx_uniform_warp_check (), insn); + } + } + rtx pred = nvptx_get_unisimt_predicate (); pred = gen_rtx_NE (BImode, pred, const0_rtx); pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat); -- cgit v1.1 From 9ed52438b8ca99a0dffe74da96c2281cbc9cbb4b Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Fri, 18 Feb 2022 17:38:50 +0100 Subject: [nvptx] Don't skip atomic insns in nvptx_reorg_uniform_simt In nvptx_reorg_uniform_simt we have a loop: ... for (insn = get_insns (); insn; insn = next) { next = NEXT_INSN (insn); if (!(CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn)) && !(NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PARALLEL && get_attr_atomic (insn))) continue; ... that intends to handle syscalls and atomic insns. However, this also silently skips the atomic insn nvptx_atomic_store, which has GET_CODE (PATTERN (insn)) == SET. This does not cause problems, because the nvptx_atomic_store actually maps onto a "st" insn, and therefore is not atomic and doesn't need to be handled by nvptx_reorg_uniform_simt. Fix this by: - explicitly setting nvptx_atomic_store's atomic attribute to false, - rewriting the skip condition to make sure all insn with atomic attribute are handled, and - asserting that all handled insns are PARALLEL. Tested on nvptx. gcc/ChangeLog: 2022-02-19 Tom de Vries * config/nvptx/nvptx.cc (nvptx_reorg_uniform_simt): Handle all insns with atomic attribute. Assert that all handled insns are PARALLELs. * config/nvptx/nvptx.md (define_insn "nvptx_atomic_store"): Set atomic attribute to false. gcc/testsuite/ChangeLog: 2022-02-19 Tom de Vries * gcc.target/nvptx/uniform-simt-3.c: New test. --- gcc/config/nvptx/nvptx.cc | 20 ++++++++++++++++---- gcc/config/nvptx/nvptx.md | 2 +- 2 files changed, 17 insertions(+), 5 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index 4942f11..55fab3e 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -3274,12 +3274,24 @@ nvptx_reorg_uniform_simt () for (insn = get_insns (); insn; insn = next) { next = NEXT_INSN (insn); - if (!(CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn)) - && !(NONJUMP_INSN_P (insn) - && GET_CODE (PATTERN (insn)) == PARALLEL - && get_attr_atomic (insn))) + + /* Skip NOTE, USE, etc. 
*/ + if (!INSN_P (insn) || recog_memoized (insn) == -1) continue; + + if (CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn)) + { + /* Handle syscall. */ + } + else if (get_attr_atomic (insn)) + { + /* Handle atomic insn. */ + } + else + continue; + rtx pat = PATTERN (insn); + gcc_assert (GET_CODE (pat) == PARALLEL); rtx master = nvptx_get_unisimt_master (); bool shuffle_p = false; for (int i = 0; i < XVECLEN (pat, 0); i++) diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index 4c378ec..132ef2f 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -2097,7 +2097,7 @@ = "%.\tst%A0.b%T0\t%0, %1;"; return nvptx_output_atomic_insn (t, operands, 0, 2); } - [(set_attr "atomic" "true")]) + [(set_attr "atomic" "false")]) ;; Note: st is not an atomic insn. (define_insn "atomic_fetch_add" [(set (match_operand:SDIM 1 "memory_operand" "+m") -- cgit v1.1 From 69cb3f2abb911acebfc7ffede2ee7151a3e14a59 Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Tue, 15 Feb 2022 14:36:26 +0100 Subject: [nvptx] Use _ as destination operand of atom.exch We currently generate this code for an atomic store: ... .reg.u32 %r21; atom.exch.b32 %r21,[%r22],%r23; ... where %r21 is set but unused. Use the ptx bit bucket operand '_' instead, such that we have: ... atom.exch.b32 _,[%r22],%r23; ... [ Note that the same problem still occurs for this code: ... void atomic_store (int *ptr, int val) { __atomic_exchange_n (ptr, val, MEMMODEL_RELAXED); } ... ] Tested on nvptx. gcc/ChangeLog: 2022-02-19 Tom de Vries * config/nvptx/nvptx.cc (nvptx_reorg_uniform_simt): Handle SET insn. * config/nvptx/nvptx.md (define_insn "nvptx_atomic_store"): Rename to ... (define_insn "nvptx_atomic_store_sm70"): This. (define_insn "nvptx_atomic_store"): New define_insn. (define_expand "atomic_store"): Handle rename. Use nvptx_atomic_store instead of atomic_exchange. gcc/testsuite/ChangeLog: 2022-02-19 Tom de Vries * gcc.target/nvptx/atomic-store-1.c: Update. --- gcc/config/nvptx/nvptx.cc | 18 ++++++++++++++---- gcc/config/nvptx/nvptx.md | 25 +++++++++++++++++++------ 2 files changed, 33 insertions(+), 10 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index 55fab3e..ed347ca 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -3291,12 +3291,22 @@ nvptx_reorg_uniform_simt () continue; rtx pat = PATTERN (insn); - gcc_assert (GET_CODE (pat) == PARALLEL); rtx master = nvptx_get_unisimt_master (); bool shuffle_p = false; - for (int i = 0; i < XVECLEN (pat, 0); i++) - shuffle_p - |= nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master); + switch (GET_CODE (pat)) + { + case PARALLEL: + for (int i = 0; i < XVECLEN (pat, 0); i++) + shuffle_p + |= nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master); + break; + case SET: + shuffle_p |= nvptx_unisimt_handle_set (pat, insn, master); + break; + default: + gcc_unreachable (); + } + if (shuffle_p && TARGET_PTX_6_0) { /* The shuffle is a sync, so uniformity is guaranteed. */ diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index 132ef2f..f6dc817 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -2069,8 +2069,8 @@ if (TARGET_SM70) { - emit_insn (gen_nvptx_atomic_store (operands[0], operands[1], - operands[2])); + emit_insn (gen_nvptx_atomic_store_sm70 (operands[0], operands[1], + operands[2])); DONE; } @@ -2079,13 +2079,12 @@ /* Fall back to expand_atomic_store. 
*/ FAIL; - rtx tmpreg = gen_reg_rtx (mode); - emit_insn (gen_atomic_exchange (tmpreg, operands[0], operands[1], - operands[2])); + emit_insn (gen_nvptx_atomic_store (operands[0], operands[1], + operands[2])); DONE; }) -(define_insn "nvptx_atomic_store" +(define_insn "nvptx_atomic_store_sm70" [(set (match_operand:SDIM 0 "memory_operand" "+m") ;; memory (unspec_volatile:SDIM [(match_operand:SDIM 1 "nvptx_nonmemory_operand" "Ri") ;; input @@ -2099,6 +2098,20 @@ } [(set_attr "atomic" "false")]) ;; Note: st is not an atomic insn. +(define_insn "nvptx_atomic_store" + [(set (match_operand:SDIM 0 "memory_operand" "+m") ;; memory + (unspec_volatile:SDIM + [(match_operand:SDIM 1 "nvptx_nonmemory_operand" "Ri") ;; input + (match_operand:SI 2 "const_int_operand")] ;; model + UNSPECV_ST))] + "!TARGET_SM70" + { + const char *t + = "%.\tatom%A0.exch.b%T0\t_, %0, %1;"; + return nvptx_output_atomic_insn (t, operands, 0, 2); + } + [(set_attr "atomic" "true")]) + (define_insn "atomic_fetch_add" [(set (match_operand:SDIM 1 "memory_operand" "+m") (unspec_volatile:SDIM -- cgit v1.1 From 02aedc6f269b5e3c1f354edcf5b84d27b0a15946 Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Wed, 16 Feb 2022 17:09:11 +0100 Subject: [nvptx] Initialize ptx regs With nvptx target, driver version 510.47.03 and board GT 1030 I, we run into: ... FAIL: gcc.c-torture/execute/pr53465.c -O1 execution test FAIL: gcc.c-torture/execute/pr53465.c -O2 execution test FAIL: gcc.c-torture/execute/pr53465.c -O3 -g execution test ... while the test-cases pass with nvptx-none-run -O0. The problem is that the generated ptx contains a read from an uninitialized ptx register, and the driver JIT doesn't handle this well. For -O2 and -O3, we can get rid of the FAIL using --param logical-op-non-short-circuit=0. But not for -O1. At -O1, the test-case minimizes to: ... void __attribute__((noinline, noclone)) foo (int y) { int c; for (int i = 0; i < y; i++) { int d = i + 1; if (i && d <= c) __builtin_abort (); c = d; } } int main () { foo (2); return 0; } ... Note that the test-case does not contain an uninitialized use. In the first iteration, i is 0 and consequently c is not read. In the second iteration, c is read, but by that time it's already initialized by 'c = d' from the first iteration. AFAICT the problem is introduced as follows: the conditional use of c in the loop body is translated into an unconditional use of c in the loop header: ... # c_1 = PHI ... which forwprop1 propagates the 'c_9 = d_7' assignment into: ... # c_1 = PHI ... which ends up being translated by expand into an unconditional: ... (insn 13 12 0 (set (reg/v:SI 22 [ c ]) (reg/v:SI 23 [ d ])) -1 (nil)) ... at the start of the loop body, creating an uninitialized read of d on the path from loop entry. By disabling coalesce_ssa_name, we get the more usual copies on the incoming edges. The copy on the loop entry path still does an uninitialized read, but that one's now initialized by init-regs. The test-case passes, also when disabling init-regs, so it's possible that the JIT driver doesn't object to this type of uninitialized read. Now that we characterized the problem to some degree, we need to fix this, because either: - we're violating an undocumented ptx invariant, and this is a compiler bug, or - this is is a driver JIT bug and we need to work around it. 
There are essentially two strategies to address this: - stop the compiler from creating uninitialized reads - patch up uninitialized reads using additional initialization The former will probably involve: - making some optimizations more conservative in the presence of uninitialized reads, and - disabling some other optimizations (where making them more conservative is not possible, or cannot easily be achieved). This will probably will have a cost penalty for code that does not suffer from the original problem. The latter has the problem that it may paper over uninitialized reads in the source code, or indeed over ones that were incorrectly introduced by the compiler. But it has the advantage that it allows for the problem to be addressed at a single location. There's an existing pass, init-regs, which implements a form of the latter, but it doesn't work for this example because it only inserts additional initialization for uses that have not a single reaching definition. Fix this by adding initialization of uninitialized ptx regs in reorg. Control the new functionality using -minit-regs=<0|1|2|3>, meaning: - 0: disabled. - 1: add initialization of all regs at the entry bb - 2: add initialization of uninitialized regs at the entry bb - 3: add initialization of uninitialized regs close to the use and defaulting to 3. Tested on nvptx. gcc/ChangeLog: 2022-02-17 Tom de Vries PR target/104440 * config/nvptx/nvptx.cc (workaround_uninit_method_1) (workaround_uninit_method_2, workaround_uninit_method_3) (workaround_uninit): New function. (nvptx_reorg): Use workaround_uninit. * config/nvptx/nvptx.opt (minit-regs): New option. --- gcc/config/nvptx/nvptx.cc | 188 +++++++++++++++++++++++++++++++++++++++++++++ gcc/config/nvptx/nvptx.opt | 4 + 2 files changed, 192 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index ed347ca..a37a6c7 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -5372,6 +5372,190 @@ workaround_barsyncs (void) } #endif +/* Initialize all declared regs at function entry. + Advantage : Fool-proof. + Disadvantage: Potentially creates a lot of long live ranges and adds a lot + of insns. */ + +static void +workaround_uninit_method_1 (void) +{ + rtx_insn *first = get_insns (); + rtx_insn *insert_here = NULL; + + for (int ix = LAST_VIRTUAL_REGISTER + 1; ix < max_reg_num (); ix++) + { + rtx reg = regno_reg_rtx[ix]; + + /* Skip undeclared registers. */ + if (reg == const0_rtx) + continue; + + gcc_assert (CONST0_RTX (GET_MODE (reg))); + + start_sequence (); + emit_move_insn (reg, CONST0_RTX (GET_MODE (reg))); + rtx_insn *inits = get_insns (); + end_sequence (); + + if (dump_file && (dump_flags & TDF_DETAILS)) + for (rtx_insn *init = inits; init != NULL; init = NEXT_INSN (init)) + fprintf (dump_file, "Default init of reg %u inserted: insn %u\n", + ix, INSN_UID (init)); + + if (first != NULL) + { + insert_here = emit_insn_before (inits, first); + first = NULL; + } + else + insert_here = emit_insn_after (inits, insert_here); + } +} + +/* Find uses of regs that are not defined on all incoming paths, and insert a + corresponding def at function entry. + Advantage : Simple. + Disadvantage: Potentially creates long live ranges. + May not catch all cases. F.i. a clobber cuts a live range in + the compiler and may prevent entry_lr_in from being set for a + reg, but the clobber does not translate to a ptx insn, so in + ptx there still may be an uninitialized ptx reg. See f.i. + gcc.c-torture/compile/20020926-1.c. 
*/ + +static void +workaround_uninit_method_2 (void) +{ + auto_bitmap entry_pseudo_uninit; + { + auto_bitmap not_pseudo; + bitmap_set_range (not_pseudo, 0, LAST_VIRTUAL_REGISTER); + + bitmap entry_lr_in = DF_LR_IN (ENTRY_BLOCK_PTR_FOR_FN (cfun)); + bitmap_and_compl (entry_pseudo_uninit, entry_lr_in, not_pseudo); + } + + rtx_insn *first = get_insns (); + rtx_insn *insert_here = NULL; + + bitmap_iterator iterator; + unsigned ix; + EXECUTE_IF_SET_IN_BITMAP (entry_pseudo_uninit, 0, ix, iterator) + { + rtx reg = regno_reg_rtx[ix]; + gcc_assert (CONST0_RTX (GET_MODE (reg))); + + start_sequence (); + emit_move_insn (reg, CONST0_RTX (GET_MODE (reg))); + rtx_insn *inits = get_insns (); + end_sequence (); + + if (dump_file && (dump_flags & TDF_DETAILS)) + for (rtx_insn *init = inits; init != NULL; init = NEXT_INSN (init)) + fprintf (dump_file, "Missing init of reg %u inserted: insn %u\n", + ix, INSN_UID (init)); + + if (first != NULL) + { + insert_here = emit_insn_before (inits, first); + first = NULL; + } + else + insert_here = emit_insn_after (inits, insert_here); + } +} + +/* Find uses of regs that are not defined on all incoming paths, and insert a + corresponding def on those. + Advantage : Doesn't create long live ranges. + Disadvantage: More complex, and potentially also more defs. */ + +static void +workaround_uninit_method_3 (void) +{ + auto_bitmap not_pseudo; + bitmap_set_range (not_pseudo, 0, LAST_VIRTUAL_REGISTER); + + basic_block bb; + FOR_EACH_BB_FN (bb, cfun) + { + if (single_pred_p (bb)) + continue; + + auto_bitmap bb_pseudo_uninit; + bitmap_and_compl (bb_pseudo_uninit, DF_LIVE_IN (bb), DF_MIR_IN (bb)); + bitmap_and_compl_into (bb_pseudo_uninit, not_pseudo); + + bitmap_iterator iterator; + unsigned ix; + EXECUTE_IF_SET_IN_BITMAP (bb_pseudo_uninit, 0, ix, iterator) + { + bool have_false = false; + bool have_true = false; + + edge e; + edge_iterator ei; + FOR_EACH_EDGE (e, ei, bb->preds) + { + if (bitmap_bit_p (DF_LIVE_OUT (e->src), ix)) + have_true = true; + else + have_false = true; + } + if (have_false ^ have_true) + continue; + + FOR_EACH_EDGE (e, ei, bb->preds) + { + if (bitmap_bit_p (DF_LIVE_OUT (e->src), ix)) + continue; + + rtx reg = regno_reg_rtx[ix]; + gcc_assert (CONST0_RTX (GET_MODE (reg))); + + start_sequence (); + emit_move_insn (reg, CONST0_RTX (GET_MODE (reg))); + rtx_insn *inits = get_insns (); + end_sequence (); + + if (dump_file && (dump_flags & TDF_DETAILS)) + for (rtx_insn *init = inits; init != NULL; + init = NEXT_INSN (init)) + fprintf (dump_file, + "Missing init of reg %u inserted on edge: %d -> %d:" + " insn %u\n", ix, e->src->index, e->dest->index, + INSN_UID (init)); + + insert_insn_on_edge (inits, e); + } + } + } + + commit_edge_insertions (); +} + +static void +workaround_uninit (void) +{ + switch (nvptx_init_regs) + { + case 0: + /* Skip. 
*/ + break; + case 1: + workaround_uninit_method_1 (); + break; + case 2: + workaround_uninit_method_2 (); + break; + case 3: + workaround_uninit_method_3 (); + break; + default: + gcc_unreachable (); + } +} + /* PTX-specific reorganization - Split blocks at fork and join instructions - Compute live registers @@ -5401,6 +5585,8 @@ nvptx_reorg (void) df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS); df_live_add_problem (); df_live_set_all_dirty (); + if (nvptx_init_regs == 3) + df_mir_add_problem (); df_analyze (); regstat_init_n_sets_and_refs (); @@ -5413,6 +5599,8 @@ nvptx_reorg (void) if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0) regno_reg_rtx[i] = const0_rtx; + workaround_uninit (); + /* Determine launch dimensions of the function. If it is not an offloaded function (i.e. this is a regular compiler), the function has no neutering. */ diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt index e3f65b2..0858007 100644 --- a/gcc/config/nvptx/nvptx.opt +++ b/gcc/config/nvptx/nvptx.opt @@ -91,3 +91,7 @@ Enum(ptx_version) String(7.0) Value(PTX_VERSION_7_0) mptx= Target RejectNegative ToLower Joined Enum(ptx_version) Var(ptx_version_option) Specify the version of the ptx version to use. + +minit-regs= +Target Var(nvptx_init_regs) IntegerRange(0, 3) Joined UInteger Init(3) +Initialize ptx registers. -- cgit v1.1 From ce09ab17ddd21f73ff2caf6eec3b0ee9b0e1a11e Mon Sep 17 00:00:00 2001 From: Dan Li Date: Mon, 21 Feb 2022 20:01:14 +0000 Subject: aarch64: Add compiler support for Shadow Call Stack Shadow Call Stack can be used to protect the return address of a function at runtime, and clang already supports this feature[1]. To enable SCS in user mode, in addition to compiler, other support is also required (as discussed in [2]). This patch only adds basic support for SCS from the compiler side, and provides convenience for users to enable SCS. For linux kernel, only the support of the compiler is required. [1] https://clang.llvm.org/docs/ShadowCallStack.html [2] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102768 Signed-off-by: Dan Li gcc/ChangeLog: * config/aarch64/aarch64.cc (SLOT_REQUIRED): Change wb_candidate[12] to wb_push_candidate[12]. (aarch64_layout_frame): Likewise, and change callee_adjust when scs is enabled. (aarch64_save_callee_saves): Change wb_candidate[12] to wb_push_candidate[12]. (aarch64_restore_callee_saves): Change wb_candidate[12] to wb_pop_candidate[12]. (aarch64_get_separate_components): Change wb_candidate[12] to wb_push_candidate[12]. (aarch64_expand_prologue): Push x30 onto SCS before it's pushed onto stack. (aarch64_expand_epilogue): Pop x30 frome SCS, while preventing it from being popped from the regular stack again. (aarch64_override_options_internal): Add SCS compile option check. (TARGET_HAVE_SHADOW_CALL_STACK): New hook. * config/aarch64/aarch64.h (struct GTY): Add is_scs_enabled, wb_pop_candidate[12], and rename wb_candidate[12] to wb_push_candidate[12]. * config/aarch64/aarch64.md (scs_push): New template. (scs_pop): Likewise. * doc/invoke.texi: Document -fsanitize=shadow-call-stack. * doc/tm.texi: Regenerate. * doc/tm.texi.in: Add hook have_shadow_call_stack. * flag-types.h (enum sanitize_code): Add SANITIZE_SHADOW_CALL_STACK. * opts.cc (parse_sanitizer_options): Add shadow-call-stack and exclude SANITIZE_SHADOW_CALL_STACK. * target.def: New hook. * toplev.cc (process_options): Add SCS compile option check. * ubsan.cc (ubsan_expand_null_ifn): Enum type conversion. gcc/testsuite/ChangeLog: * gcc.target/aarch64/shadow_call_stack_1.c: New test. 
* gcc.target/aarch64/shadow_call_stack_2.c: New test. * gcc.target/aarch64/shadow_call_stack_3.c: New test. * gcc.target/aarch64/shadow_call_stack_4.c: New test. * gcc.target/aarch64/shadow_call_stack_5.c: New test. * gcc.target/aarch64/shadow_call_stack_6.c: New test. * gcc.target/aarch64/shadow_call_stack_7.c: New test. * gcc.target/aarch64/shadow_call_stack_8.c: New test. --- gcc/config/aarch64/aarch64.cc | 113 +++++++++++++++++++++++++++++++----------- gcc/config/aarch64/aarch64.h | 21 ++++++-- gcc/config/aarch64/aarch64.md | 10 ++++ 3 files changed, 113 insertions(+), 31 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 37ed22bc..8bcee8b 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -80,6 +80,7 @@ #include "fractional-cost.h" #include "rtlanal.h" #include "tree-dfa.h" +#include "asan.h" /* This file should be included last. */ #include "target-def.h" @@ -7547,8 +7548,8 @@ aarch64_layout_frame (void) #define SLOT_NOT_REQUIRED (-2) #define SLOT_REQUIRED (-1) - frame.wb_candidate1 = INVALID_REGNUM; - frame.wb_candidate2 = INVALID_REGNUM; + frame.wb_push_candidate1 = INVALID_REGNUM; + frame.wb_push_candidate2 = INVALID_REGNUM; frame.spare_pred_reg = INVALID_REGNUM; /* First mark all the registers that really need to be saved... */ @@ -7663,9 +7664,9 @@ aarch64_layout_frame (void) { /* FP and LR are placed in the linkage record. */ frame.reg_offset[R29_REGNUM] = offset; - frame.wb_candidate1 = R29_REGNUM; + frame.wb_push_candidate1 = R29_REGNUM; frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD; - frame.wb_candidate2 = R30_REGNUM; + frame.wb_push_candidate2 = R30_REGNUM; offset += 2 * UNITS_PER_WORD; } @@ -7673,10 +7674,10 @@ aarch64_layout_frame (void) if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED)) { frame.reg_offset[regno] = offset; - if (frame.wb_candidate1 == INVALID_REGNUM) - frame.wb_candidate1 = regno; - else if (frame.wb_candidate2 == INVALID_REGNUM) - frame.wb_candidate2 = regno; + if (frame.wb_push_candidate1 == INVALID_REGNUM) + frame.wb_push_candidate1 = regno; + else if (frame.wb_push_candidate2 == INVALID_REGNUM) + frame.wb_push_candidate2 = regno; offset += UNITS_PER_WORD; } @@ -7699,11 +7700,11 @@ aarch64_layout_frame (void) } frame.reg_offset[regno] = offset; - if (frame.wb_candidate1 == INVALID_REGNUM) - frame.wb_candidate1 = regno; - else if (frame.wb_candidate2 == INVALID_REGNUM - && frame.wb_candidate1 >= V0_REGNUM) - frame.wb_candidate2 = regno; + if (frame.wb_push_candidate1 == INVALID_REGNUM) + frame.wb_push_candidate1 = regno; + else if (frame.wb_push_candidate2 == INVALID_REGNUM + && frame.wb_push_candidate1 >= V0_REGNUM) + frame.wb_push_candidate2 = regno; offset += vector_save_size; } @@ -7734,10 +7735,38 @@ aarch64_layout_frame (void) frame.sve_callee_adjust = 0; frame.callee_offset = 0; + frame.wb_pop_candidate1 = frame.wb_push_candidate1; + frame.wb_pop_candidate2 = frame.wb_push_candidate2; + + /* Shadow call stack only deals with functions where the LR is pushed + onto the stack and without specifying the "no_sanitize" attribute + with the argument "shadow-call-stack". */ + frame.is_scs_enabled + = (!crtl->calls_eh_return + && sanitize_flags_p (SANITIZE_SHADOW_CALL_STACK) + && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)); + + /* When shadow call stack is enabled, the scs_pop in the epilogue will + restore x30, and we don't need to pop x30 again in the traditional + way. 
Pop candidates record the registers that need to be popped + eventually. */ + if (frame.is_scs_enabled) + { + if (frame.wb_pop_candidate2 == R30_REGNUM) + frame.wb_pop_candidate2 = INVALID_REGNUM; + else if (frame.wb_pop_candidate1 == R30_REGNUM) + frame.wb_pop_candidate1 = INVALID_REGNUM; + } + + /* If candidate2 is INVALID_REGNUM, we need to adjust max_push_offset to + 256 to ensure that the offset meets the requirements of emit_move_insn. + Similarly, if candidate1 is INVALID_REGNUM, we need to set + max_push_offset to 0, because no registers are popped at this time, + so callee_adjust cannot be adjusted. */ HOST_WIDE_INT max_push_offset = 0; - if (frame.wb_candidate2 != INVALID_REGNUM) + if (frame.wb_pop_candidate2 != INVALID_REGNUM) max_push_offset = 512; - else if (frame.wb_candidate1 != INVALID_REGNUM) + else if (frame.wb_pop_candidate1 != INVALID_REGNUM) max_push_offset = 256; HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset; @@ -7827,8 +7856,8 @@ aarch64_layout_frame (void) { /* We've decided not to associate any register saves with the initial stack allocation. */ - frame.wb_candidate1 = INVALID_REGNUM; - frame.wb_candidate2 = INVALID_REGNUM; + frame.wb_pop_candidate1 = frame.wb_push_candidate1 = INVALID_REGNUM; + frame.wb_pop_candidate2 = frame.wb_push_candidate2 = INVALID_REGNUM; } frame.laid_out = true; @@ -8141,8 +8170,8 @@ aarch64_save_callee_saves (poly_int64 start_offset, bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno); if (skip_wb - && (regno == cfun->machine->frame.wb_candidate1 - || regno == cfun->machine->frame.wb_candidate2)) + && (regno == cfun->machine->frame.wb_push_candidate1 + || regno == cfun->machine->frame.wb_push_candidate2)) continue; if (cfun->machine->reg_is_wrapped_separately[regno]) @@ -8252,8 +8281,8 @@ aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start, rtx reg, mem; if (skip_wb - && (regno == cfun->machine->frame.wb_candidate1 - || regno == cfun->machine->frame.wb_candidate2)) + && (regno == cfun->machine->frame.wb_pop_candidate1 + || regno == cfun->machine->frame.wb_pop_candidate2)) continue; machine_mode mode = aarch64_reg_save_mode (regno); @@ -8424,8 +8453,8 @@ aarch64_get_separate_components (void) if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM) bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg); - unsigned reg1 = cfun->machine->frame.wb_candidate1; - unsigned reg2 = cfun->machine->frame.wb_candidate2; + unsigned reg1 = cfun->machine->frame.wb_push_candidate1; + unsigned reg2 = cfun->machine->frame.wb_push_candidate2; /* If registers have been chosen to be stored/restored with writeback don't interfere with them to avoid having to output explicit stack adjustment instructions. */ @@ -9034,8 +9063,8 @@ aarch64_expand_prologue (void) poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust; poly_int64 below_hard_fp_saved_regs_size = cfun->machine->frame.below_hard_fp_saved_regs_size; - unsigned reg1 = cfun->machine->frame.wb_candidate1; - unsigned reg2 = cfun->machine->frame.wb_candidate2; + unsigned reg1 = cfun->machine->frame.wb_push_candidate1; + unsigned reg2 = cfun->machine->frame.wb_push_candidate2; bool emit_frame_chain = cfun->machine->frame.emit_frame_chain; rtx_insn *insn; @@ -9066,6 +9095,10 @@ aarch64_expand_prologue (void) RTX_FRAME_RELATED_P (insn) = 1; } + /* Push return address to shadow call stack. 
*/ + if (cfun->machine->frame.is_scs_enabled) + emit_insn (gen_scs_push ()); + if (flag_stack_usage_info) current_function_static_stack_size = constant_lower_bound (frame_size); @@ -9212,8 +9245,10 @@ aarch64_expand_epilogue (bool for_sibcall) poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust; poly_int64 below_hard_fp_saved_regs_size = cfun->machine->frame.below_hard_fp_saved_regs_size; - unsigned reg1 = cfun->machine->frame.wb_candidate1; - unsigned reg2 = cfun->machine->frame.wb_candidate2; + unsigned reg1 = cfun->machine->frame.wb_pop_candidate1; + unsigned reg2 = cfun->machine->frame.wb_pop_candidate2; + unsigned int last_gpr = (cfun->machine->frame.is_scs_enabled + ? R29_REGNUM : R30_REGNUM); rtx cfi_ops = NULL; rtx_insn *insn; /* A stack clash protection prologue may not have left EP0_REGNUM or @@ -9283,8 +9318,12 @@ aarch64_expand_epilogue (bool for_sibcall) false, &cfi_ops); if (maybe_ne (sve_callee_adjust, 0)) aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true); + + /* When shadow call stack is enabled, the scs_pop in the epilogue will + restore x30, we don't need to restore x30 again in the traditional + way. */ aarch64_restore_callee_saves (callee_offset - sve_callee_adjust, - R0_REGNUM, R30_REGNUM, + R0_REGNUM, last_gpr, callee_adjust != 0, &cfi_ops); if (need_barrier_p) @@ -9322,6 +9361,17 @@ aarch64_expand_epilogue (bool for_sibcall) RTX_FRAME_RELATED_P (insn) = 1; } + /* Pop return address from shadow call stack. */ + if (cfun->machine->frame.is_scs_enabled) + { + machine_mode mode = aarch64_reg_save_mode (R30_REGNUM); + rtx reg = gen_rtx_REG (mode, R30_REGNUM); + + insn = emit_insn (gen_scs_pop ()); + add_reg_note (insn, REG_CFA_RESTORE, reg); + RTX_FRAME_RELATED_P (insn) = 1; + } + /* We prefer to emit the combined return/authenticate instruction RETAA, however there are three cases in which we must instead emit an explicit authentication instruction. @@ -16878,6 +16928,10 @@ aarch64_override_options_internal (struct gcc_options *opts) aarch64_stack_protector_guard_offset = offs; } + if ((flag_sanitize & SANITIZE_SHADOW_CALL_STACK) + && !fixed_regs[R18_REGNUM]) + error ("%<-fsanitize=shadow-call-stack%> requires %<-ffixed-x18%>"); + initialize_aarch64_code_model (opts); initialize_aarch64_tls_size (opts); @@ -27084,6 +27138,9 @@ aarch64_libgcc_floating_mode_supported_p #undef TARGET_ASM_FUNCTION_EPILOGUE #define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks +#undef TARGET_HAVE_SHADOW_CALL_STACK +#define TARGET_HAVE_SHADOW_CALL_STACK true + struct gcc_target targetm = TARGET_INITIALIZER; #include "gt-aarch64.h" diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h index dddf133..27ba4f4 100644 --- a/gcc/config/aarch64/aarch64.h +++ b/gcc/config/aarch64/aarch64.h @@ -922,9 +922,21 @@ struct GTY (()) aarch64_frame Indicated by CALLEE_ADJUST == 0 && EMIT_FRAME_CHAIN. These fields indicate which registers we've decided to handle using - (1) or (2), or INVALID_REGNUM if none. */ - unsigned wb_candidate1; - unsigned wb_candidate2; + (1) or (2), or INVALID_REGNUM if none. + + In some cases we don't always need to pop all registers in the push + candidates, pop candidates record which registers need to be popped + eventually. The initial value of a pop candidate is copied from its + corresponding push candidate. + + Currently, different pop candidates are only used for shadow call + stack. 
When "-fsanitize=shadow-call-stack" is specified, we replace + x30 in the pop candidate with INVALID_REGNUM to ensure that x30 is + not popped twice. */ + unsigned wb_push_candidate1; + unsigned wb_push_candidate2; + unsigned wb_pop_candidate1; + unsigned wb_pop_candidate2; /* Big-endian SVE frames need a spare predicate register in order to save vector registers in the correct layout for unwinding. @@ -932,6 +944,9 @@ struct GTY (()) aarch64_frame unsigned spare_pred_reg; bool laid_out; + + /* True if shadow call stack should be enabled for the current function. */ + bool is_scs_enabled; }; typedef struct GTY (()) machine_function diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 5909184..c985250 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -7093,6 +7093,16 @@ "hint\t7 // xpaclri" ) +;; Save X30 in the X18-based POST_INC stack (consistent with clang). +(define_expand "scs_push" + [(set (mem:DI (post_inc:DI (reg:DI R18_REGNUM))) + (reg:DI R30_REGNUM))]) + +;; Load X30 form the X18-based PRE_DEC stack (consistent with clang). +(define_expand "scs_pop" + [(set (reg:DI R30_REGNUM) + (mem:DI (pre_dec:DI (reg:DI R18_REGNUM))))]) + ;; UNSPEC_VOLATILE is considered to use and clobber all hard registers and ;; all of memory. This blocks insns from being moved across this point. -- cgit v1.1 From 0435b978f95971e139882549f5a1765c50682216 Mon Sep 17 00:00:00 2001 From: Hongyu Wang Date: Fri, 11 Feb 2022 14:44:15 +0800 Subject: i386: Relax cmpxchg instruction under -mrelax-cmpxchg-loop [PR103069] For cmpxchg, it is commonly used in spin loop, and several user code such as pthread directly takes cmpxchg as loop condition, which cause huge cache bouncing. This patch extends previous implementation to relax all cmpxchg instruction under -mrelax-cmpxchg-loop with an extra atomic load, compare and emulate the failed cmpxchg behavior. For original spin loop which looks like loop: mov %eax,%r8d or $1,%r8d lock cmpxchg %r8d,(%rdi) jne loop It will now truns to loop: mov %eax,%r8d or $1,%r8d mov (%r8),%rsi <--- load lock first cmp %rsi,%rax <--- compare with expected input jne .L2 <--- lock ne expected lock cmpxchg %r8d,(%rdi) jne loop L2: mov %rsi,%rax <--- perform the behavior of failed cmpxchg jne loop under -mrelax-cmpxchg-loop. gcc/ChangeLog: PR target/103069 * config/i386/i386-expand.cc (ix86_expand_atomic_fetch_op_loop): Split atomic fetch and loop part. (ix86_expand_cmpxchg_loop): New expander for cmpxchg loop. * config/i386/i386-protos.h (ix86_expand_cmpxchg_loop): New prototype. * config/i386/sync.md (atomic_compare_and_swap): Call new expander under TARGET_RELAX_CMPXCHG_LOOP. (atomic_compare_and_swap): Likewise for doubleword modes. gcc/testsuite/ChangeLog: PR target/103069 * gcc.target/i386/pr103069-2.c: Adjust result check. * gcc.target/i386/pr103069-3.c: New test. * gcc.target/i386/pr103069-4.c: Likewise. 
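For reference, the spin loops this change targets are ordinary compare-and-swap retry loops at the source level. The sketch below is an illustrative example only, not taken from the patch or the PR103069 testcases, and the names lock_word/spin_lock_or are made up; it uses GCC's __atomic_compare_exchange_n builtin, which expands through the atomic_compare_and_swap patterns adjusted above, so with -mrelax-cmpxchg-loop the emitted loop gets the extra atomic load and compare in front of the lock cmpxchg.

/* Illustrative sketch only (hypothetical names); compile for x86 with
   -mrelax-cmpxchg-loop to see the relaxed sequence.  */
static int lock_word;

void
spin_lock_or (int mask)
{
  int expected = __atomic_load_n (&lock_word, __ATOMIC_RELAXED);
  for (;;)
    {
      int desired = expected | mask;
      /* Expands to a cmpxchg loop; the relaxed form re-reads and
         compares the memory word first and skips lock cmpxchg while
         it cannot succeed.  */
      if (__atomic_compare_exchange_n (&lock_word, &expected, desired,
                                       0 /* strong */, __ATOMIC_SEQ_CST,
                                       __ATOMIC_RELAXED))
        break;
      /* On failure, expected now holds the value observed in memory.  */
    }
}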
--- gcc/config/i386/i386-expand.cc | 153 ++++++++++++++++++++++++++++++----------- gcc/config/i386/i386-protos.h | 2 + gcc/config/i386/sync.md | 65 ++++++++++------- 3 files changed, 157 insertions(+), 63 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index ce9607e..6cf1a0b 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -23203,16 +23203,14 @@ void ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val, enum rtx_code code, bool after, bool doubleword) { - rtx old_reg, new_reg, old_mem, success, oldval, new_mem; - rtx_code_label *loop_label, *pause_label, *done_label; + rtx old_reg, new_reg, old_mem, success; machine_mode mode = GET_MODE (target); + rtx_code_label *loop_label = NULL; old_reg = gen_reg_rtx (mode); new_reg = old_reg; - loop_label = gen_label_rtx (); - pause_label = gen_label_rtx (); - done_label = gen_label_rtx (); old_mem = copy_to_reg (mem); + loop_label = gen_label_rtx (); emit_label (loop_label); emit_move_insn (old_reg, old_mem); @@ -23234,50 +23232,125 @@ void ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val, if (after) emit_move_insn (target, new_reg); - /* Load memory again inside loop. */ - new_mem = copy_to_reg (mem); - /* Compare mem value with expected value. */ + success = NULL_RTX; + + ix86_expand_cmpxchg_loop (&success, old_mem, mem, old_reg, new_reg, + gen_int_mode (MEMMODEL_SYNC_SEQ_CST, + SImode), + doubleword, loop_label); +} + +/* Relax cmpxchg instruction, param loop_label indicates whether + the instruction should be relaxed with a pause loop. If not, + it will be relaxed to an atomic load + compare, and skip + cmpxchg instruction if mem != exp_input. */ + +void ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val, + rtx mem, rtx exp_input, rtx new_input, + rtx mem_model, bool doubleword, + rtx_code_label *loop_label) +{ + rtx_code_label *cmp_label = NULL; + rtx_code_label *done_label = NULL; + rtx target_bool = NULL_RTX, new_mem = NULL_RTX; + rtx (*gen) (rtx, rtx, rtx, rtx, rtx) = NULL; + rtx (*gendw) (rtx, rtx, rtx, rtx, rtx, rtx) = NULL; + machine_mode mode = GET_MODE (target_val), hmode = mode; + + if (*ptarget_bool == NULL) + target_bool = gen_reg_rtx (QImode); + else + target_bool = *ptarget_bool; + + cmp_label = gen_label_rtx (); + done_label = gen_label_rtx (); + + new_mem = gen_reg_rtx (mode); + /* Load memory first. */ + expand_atomic_load (new_mem, mem, MEMMODEL_SEQ_CST); + + switch (mode) + { + case TImode: + gendw = gen_atomic_compare_and_swapti_doubleword; + hmode = DImode; + break; + case DImode: + if (doubleword) + { + gendw = gen_atomic_compare_and_swapdi_doubleword; + hmode = SImode; + } + else + gen = gen_atomic_compare_and_swapdi_1; + break; + case SImode: + gen = gen_atomic_compare_and_swapsi_1; break; + case HImode: + gen = gen_atomic_compare_and_swaphi_1; break; + case QImode: + gen = gen_atomic_compare_and_swapqi_1; break; + default: + gcc_unreachable (); + } + /* Compare mem value with expected value. */ if (doubleword) { - machine_mode half_mode = (mode == DImode)? 
SImode : DImode; - rtx low_new_mem = gen_lowpart (half_mode, new_mem); - rtx low_old_mem = gen_lowpart (half_mode, old_mem); - rtx high_new_mem = gen_highpart (half_mode, new_mem); - rtx high_old_mem = gen_highpart (half_mode, old_mem); - emit_cmp_and_jump_insns (low_new_mem, low_old_mem, NE, NULL_RTX, - half_mode, 1, pause_label, + rtx low_new_mem = gen_lowpart (hmode, new_mem); + rtx low_exp_input = gen_lowpart (hmode, exp_input); + rtx high_new_mem = gen_highpart (hmode, new_mem); + rtx high_exp_input = gen_highpart (hmode, exp_input); + emit_cmp_and_jump_insns (low_new_mem, low_exp_input, NE, NULL_RTX, + hmode, 1, cmp_label, profile_probability::guessed_never ()); - emit_cmp_and_jump_insns (high_new_mem, high_old_mem, NE, NULL_RTX, - half_mode, 1, pause_label, + emit_cmp_and_jump_insns (high_new_mem, high_exp_input, NE, NULL_RTX, + hmode, 1, cmp_label, profile_probability::guessed_never ()); } else - emit_cmp_and_jump_insns (new_mem, old_mem, NE, NULL_RTX, - GET_MODE (old_mem), 1, pause_label, + emit_cmp_and_jump_insns (new_mem, exp_input, NE, NULL_RTX, + GET_MODE (exp_input), 1, cmp_label, profile_probability::guessed_never ()); - success = NULL_RTX; - oldval = old_mem; - expand_atomic_compare_and_swap (&success, &oldval, mem, old_reg, - new_reg, false, MEMMODEL_SYNC_SEQ_CST, - MEMMODEL_RELAXED); - if (oldval != old_mem) - emit_move_insn (old_mem, oldval); - - emit_cmp_and_jump_insns (success, const0_rtx, EQ, const0_rtx, - GET_MODE (success), 1, loop_label, - profile_probability::guessed_never ()); - - emit_jump_insn (gen_jump (done_label)); - emit_barrier (); - - /* If mem is not expected, pause and loop back. */ - emit_label (pause_label); - emit_insn (gen_pause ()); - emit_jump_insn (gen_jump (loop_label)); - emit_barrier (); - emit_label (done_label); + /* Directly emits cmpxchg here. */ + if (doubleword) + emit_insn (gendw (target_val, mem, exp_input, + gen_lowpart (hmode, new_input), + gen_highpart (hmode, new_input), + mem_model)); + else + emit_insn (gen (target_val, mem, exp_input, new_input, mem_model)); + + if (!loop_label) + { + emit_jump_insn (gen_jump (done_label)); + emit_barrier (); + emit_label (cmp_label); + emit_move_insn (target_val, new_mem); + emit_label (done_label); + ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG), + const0_rtx); + } + else + { + ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG), + const0_rtx); + emit_cmp_and_jump_insns (target_bool, const0_rtx, EQ, const0_rtx, + GET_MODE (target_bool), 1, loop_label, + profile_probability::guessed_never ()); + emit_jump_insn (gen_jump (done_label)); + emit_barrier (); + + /* If mem is not expected, pause and loop back. 
*/ + emit_label (cmp_label); + emit_insn (gen_pause ()); + emit_jump_insn (gen_jump (loop_label)); + emit_barrier (); + emit_label (done_label); + } + + *ptarget_bool = target_bool; } #include "gt-i386-expand.h" diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index b7e9aa7..d5e1125 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -221,6 +221,8 @@ extern void ix86_split_mmx_punpck (rtx[], bool); extern void ix86_expand_avx_vzeroupper (void); extern void ix86_expand_atomic_fetch_op_loop (rtx, rtx, rtx, enum rtx_code, bool, bool); +extern void ix86_expand_cmpxchg_loop (rtx *, rtx, rtx, rtx, rtx, rtx, + bool, rtx_code_label *); #ifdef TREE_CODE extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int); diff --git a/gcc/config/i386/sync.md b/gcc/config/i386/sync.md index 36417c5..820e9ca 100644 --- a/gcc/config/i386/sync.md +++ b/gcc/config/i386/sync.md @@ -373,11 +373,20 @@ (match_operand:SI 7 "const_int_operand")] ;; failure model "TARGET_CMPXCHG" { - emit_insn - (gen_atomic_compare_and_swap_1 - (operands[1], operands[2], operands[3], operands[4], operands[6])); - ix86_expand_setcc (operands[0], EQ, gen_rtx_REG (CCZmode, FLAGS_REG), - const0_rtx); + if (TARGET_RELAX_CMPXCHG_LOOP) + { + ix86_expand_cmpxchg_loop (&operands[0], operands[1], operands[2], + operands[3], operands[4], operands[6], + false, NULL); + } + else + { + emit_insn + (gen_atomic_compare_and_swap_1 + (operands[1], operands[2], operands[3], operands[4], operands[6])); + ix86_expand_setcc (operands[0], EQ, gen_rtx_REG (CCZmode, FLAGS_REG), + const0_rtx); + } DONE; }) @@ -397,25 +406,35 @@ (match_operand:SI 7 "const_int_operand")] ;; failure model "TARGET_CMPXCHG" { - if (mode == DImode && TARGET_64BIT) - { - emit_insn - (gen_atomic_compare_and_swapdi_1 - (operands[1], operands[2], operands[3], operands[4], operands[6])); - } + int doubleword = !(mode == DImode && TARGET_64BIT); + if (TARGET_RELAX_CMPXCHG_LOOP) + { + ix86_expand_cmpxchg_loop (&operands[0], operands[1], operands[2], + operands[3], operands[4], operands[6], + doubleword, NULL); + } else - { - machine_mode hmode = mode; - - emit_insn - (gen_atomic_compare_and_swap_doubleword - (operands[1], operands[2], operands[3], - gen_lowpart (hmode, operands[4]), gen_highpart (hmode, operands[4]), - operands[6])); - } - - ix86_expand_setcc (operands[0], EQ, gen_rtx_REG (CCZmode, FLAGS_REG), - const0_rtx); + { + if (!doubleword) + { + emit_insn + (gen_atomic_compare_and_swapdi_1 + (operands[1], operands[2], operands[3], operands[4], operands[6])); + } + else + { + machine_mode hmode = mode; + + emit_insn + (gen_atomic_compare_and_swap_doubleword + (operands[1], operands[2], operands[3], + gen_lowpart (hmode, operands[4]), gen_highpart (hmode, operands[4]), + operands[6])); + } + + ix86_expand_setcc (operands[0], EQ, gen_rtx_REG (CCZmode, FLAGS_REG), + const0_rtx); + } DONE; }) -- cgit v1.1 From f24dfc76177b3994434c8beb287cde1a9976b5ce Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Fri, 18 Feb 2022 11:50:44 +0100 Subject: tree-optimization/104582 - make SLP node available in vector cost hook This adjusts the vectorizer costing API to allow passing down the SLP node the vector stmt is created from. 2022-02-18 Richard Biener PR tree-optimization/104582 * tree-vectorizer.h (stmt_info_for_cost::node): New field. (vector_costs::add_stmt_cost): Add SLP node parameter. (dump_stmt_cost): Likewise. (add_stmt_cost): Likewise, new overload and adjust. (add_stmt_costs): Adjust. 
(record_stmt_cost): New overload. * tree-vectorizer.cc (dump_stmt_cost): Dump the SLP node. (vector_costs::add_stmt_cost): Adjust. * tree-vect-loop.cc (vect_estimate_min_profitable_iters): Adjust. * tree-vect-slp.cc (vect_prologue_cost_for_slp): Record the SLP node for costing. (vectorizable_slp_permutation): Likewise. * tree-vect-stmts.cc (record_stmt_cost): Adjust and add new overloads. * config/i386/i386.cc (ix86_vector_costs::add_stmt_cost): Adjust. * config/aarch64/aarch64.cc (aarch64_vector_costs::add_stmt_cost): Adjust. * config/rs6000/rs6000.cc (rs6000_vector_costs::add_stmt_cost): Adjust. (rs6000_cost_data::adjust_vect_cost_per_loop): Likewise. --- gcc/config/aarch64/aarch64.cc | 6 +++--- gcc/config/i386/i386.cc | 9 +++++---- gcc/config/rs6000/rs6000.cc | 10 ++++++---- 3 files changed, 14 insertions(+), 11 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 8bcee8b..dbeaaf4 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -15058,7 +15058,7 @@ public: aarch64_vector_costs (vec_info *, bool); unsigned int add_stmt_cost (int count, vect_cost_for_stmt kind, - stmt_vec_info stmt_info, tree vectype, + stmt_vec_info stmt_info, slp_tree, tree vectype, int misalign, vect_cost_model_location where) override; void finish_cost (const vector_costs *) override; @@ -16003,8 +16003,8 @@ aarch64_stp_sequence_cost (unsigned int count, vect_cost_for_stmt kind, unsigned aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, - stmt_vec_info stmt_info, tree vectype, - int misalign, + stmt_vec_info stmt_info, slp_tree, + tree vectype, int misalign, vect_cost_model_location where) { fractional_cost stmt_cost diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index e4b42fb..0830dbd 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -22982,8 +22982,8 @@ class ix86_vector_costs : public vector_costs using vector_costs::vector_costs; unsigned int add_stmt_cost (int count, vect_cost_for_stmt kind, - stmt_vec_info stmt_info, tree vectype, - int misalign, + stmt_vec_info stmt_info, slp_tree node, + tree vectype, int misalign, vect_cost_model_location where) override; }; @@ -22997,8 +22997,9 @@ ix86_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar) unsigned ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, - stmt_vec_info stmt_info, tree vectype, - int misalign, vect_cost_model_location where) + stmt_vec_info stmt_info, slp_tree, + tree vectype, int misalign, + vect_cost_model_location where) { unsigned retval = 0; bool scalar_p diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index d7a7cfe..ca9e7b8 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -5212,7 +5212,7 @@ public: using vector_costs::vector_costs; unsigned int add_stmt_cost (int count, vect_cost_for_stmt kind, - stmt_vec_info stmt_info, tree vectype, + stmt_vec_info stmt_info, slp_tree, tree vectype, int misalign, vect_cost_model_location where) override; void finish_cost (const vector_costs *) override; @@ -5428,8 +5428,9 @@ rs6000_cost_data::update_target_cost_per_stmt (vect_cost_for_stmt kind, unsigned rs6000_cost_data::add_stmt_cost (int count, vect_cost_for_stmt kind, - stmt_vec_info stmt_info, tree vectype, - int misalign, vect_cost_model_location where) + stmt_vec_info stmt_info, slp_tree, + tree vectype, int misalign, + vect_cost_model_location where) { unsigned retval = 0; @@ -5470,7 +5471,8 @@ 
rs6000_cost_data::adjust_vect_cost_per_loop (loop_vec_info loop_vinfo) /* Each length needs one shift to fill into bits 0-7. */ shift_cnt += num_vectors_m1 + 1; - add_stmt_cost (shift_cnt, scalar_stmt, NULL, NULL_TREE, 0, vect_body); + add_stmt_cost (shift_cnt, scalar_stmt, NULL, NULL, + NULL_TREE, 0, vect_body); } } -- cgit v1.1 From 90d693bdc9d71841f51d68826ffa5bd685d7f0bc Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Fri, 18 Feb 2022 14:32:14 +0100 Subject: target/99881 - x86 vector cost of CTOR from integer regs This uses the now passed SLP node to the vectorizer costing hook to adjust vector construction costs for the cost of moving an integer component from a GPR to a vector register when that's required for building a vector from components. A crucial difference here is whether the component is loaded from memory or extracted from a vector register, as in those cases no intermediate GPR is involved. The pr99881.c testcase can be un-XFAILed with this patch; the pr91446.c testcase now produces scalar code which looks superior to me, so I've adjusted it as well. 2022-02-18 Richard Biener PR tree-optimization/104582 PR target/99881 * config/i386/i386.cc (ix86_vector_costs::add_stmt_cost): Cost GPR to vector register moves for integer vector construction. * gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-1.c: New. * gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-2.c: Likewise. * gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-3.c: Likewise. * gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-4.c: Likewise. * gcc.target/i386/pr99881.c: Un-XFAIL. * gcc.target/i386/pr91446.c: Adjust to not expect vectorization. --- gcc/config/i386/i386.cc | 45 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 0830dbd..b2bf905 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -22997,7 +22997,7 @@ ix86_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar) unsigned ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, - stmt_vec_info stmt_info, slp_tree, + stmt_vec_info stmt_info, slp_tree node, tree vectype, int misalign, vect_cost_model_location where) { @@ -23160,6 +23160,49 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); stmt_cost *= (TYPE_VECTOR_SUBPARTS (vectype) + 1); } + else if (kind == vec_construct + && node + && SLP_TREE_DEF_TYPE (node) == vect_external_def + && INTEGRAL_TYPE_P (TREE_TYPE (vectype))) + { + stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); + unsigned i; + tree op; + FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op) + if (TREE_CODE (op) == SSA_NAME) + TREE_VISITED (op) = 0; + FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op) + { + if (TREE_CODE (op) != SSA_NAME + || TREE_VISITED (op)) + continue; + TREE_VISITED (op) = 1; + gimple *def = SSA_NAME_DEF_STMT (op); + tree tem; + if (is_gimple_assign (def) + && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)) + && ((tem = gimple_assign_rhs1 (def)), true) + && TREE_CODE (tem) == SSA_NAME + /* A sign-change expands to nothing. */ + && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (def)), + TREE_TYPE (tem))) + def = SSA_NAME_DEF_STMT (tem); + /* When the component is loaded from memory we can directly + move it to a vector register, otherwise we have to go + via a GPR or via vpinsr which involves similar cost.
+ Likewise with a BIT_FIELD_REF extracting from a vector + register we can hope to avoid using a GPR. */ + if (!is_gimple_assign (def) + || (!gimple_assign_load_p (def) + && (gimple_assign_rhs_code (def) != BIT_FIELD_REF + || !VECTOR_TYPE_P (TREE_TYPE + (TREE_OPERAND (gimple_assign_rhs1 (def), 0)))))) + stmt_cost += ix86_cost->sse_to_integer; + } + FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op) + if (TREE_CODE (op) == SSA_NAME) + TREE_VISITED (op) = 0; + } if (stmt_cost == -1) stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); -- cgit v1.1 From 7e691189ca9c04fdba71ceada1faba62afbc1463 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Tue, 22 Feb 2022 10:38:37 +0100 Subject: i386: Fix up copysign/xorsign expansion [PR104612] We ICE on the following testcase for -m32 since r12-3435, because operands[2] is (subreg:SF (reg:DI ...) 0) and lowpart_subreg (V4SFmode, operands[2], SFmode) returns NULL, and that is what we use in AND etc. insns we emit. My earlier version of the patch fixes that by calling force_reg for the input operands, to make sure they are really REGs and so lowpart_subreg will succeed on them - even for theoretical MEMs, using REGs there seems desirable, as we don't want to read following memory slots for the paradoxical subreg. For the outputs, I thought we'd get better code by always computing the result into a new pseudo and then moving the lowpart of that pseudo into dest. Unfortunately it regressed FAIL: gcc.target/i386/pr89984-2.c scan-assembler-not vmovaps, on which the patch changes: vandps .LC0(%rip), %xmm1, %xmm1 - vxorps %xmm0, %xmm1, %xmm0 + vxorps %xmm0, %xmm1, %xmm1 + vmovaps %xmm1, %xmm0 ret The RA sees: (insn 8 4 9 2 (set (reg:V4SF 85) (and:V4SF (subreg:V4SF (reg:SF 90) 0) (mem/u/c:V4SF (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S16 A128]))) "pr89984-2.c":7:12 2838 {*andv4sf3} (expr_list:REG_DEAD (reg:SF 90) (nil))) (insn 9 8 10 2 (set (reg:V4SF 87) (xor:V4SF (reg:V4SF 85) (subreg:V4SF (reg:SF 89) 0))) "pr89984-2.c":7:12 2842 {*xorv4sf3} (expr_list:REG_DEAD (reg:SF 89) (expr_list:REG_DEAD (reg:V4SF 85) (nil)))) (insn 10 9 14 2 (set (reg:SF 82 [ ]) (subreg:SF (reg:V4SF 87) 0)) "pr89984-2.c":7:12 142 {*movsf_internal} (expr_list:REG_DEAD (reg:V4SF 87) (nil))) (insn 14 10 15 2 (set (reg/i:SF 20 xmm0) (reg:SF 82 [ ])) "pr89984-2.c":8:1 142 {*movsf_internal} (expr_list:REG_DEAD (reg:SF 82 [ ]) (nil))) (insn 15 14 0 2 (use (reg/i:SF 20 xmm0)) "pr89984-2.c":8:1 -1 (nil)) and doesn't know that if it would use xmm0 not just for pseudo 82 but also for pseudo 87, it could create a noop move in insn 10 and so could avoid an extra register copy and nothing later on is able to figure that out either. I don't know how the RA should know that though. So that we don't regress, this version of the patch will do this stuff (i.e. use fresh vector pseudo as destination and then move lowpart of that to dest) over what it used before (i.e. use paradoxical subreg of the dest) only if lowpart_subreg returns NULL. 2022-02-22 Jakub Jelinek PR target/104612 * config/i386/i386-expand.cc (ix86_expand_copysign): Call force_reg on input operands before calling lowpart_subreg on it. For output operand, use a vmode pseudo as destination and then move its lowpart subreg into operands[0] if lowpart_subreg fails on dest. (ix86_expand_xorsign): Likewise. * gcc.dg/pr104612.c: New test.
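As an illustration of the failure mode (a hypothetical minimal sketch, not the committed gcc.dg/pr104612.c testcase): when the sign source is a float living in the low half of a wider 64-bit object, the expander can see (subreg:SF (reg:DI ...) 0) at -O2 -m32, and lowpart_subreg to V4SFmode then returns NULL.

/* Hypothetical reproducer sketch; the union only serves to get the
   float packed into a 64-bit value so that its SFmode use becomes a
   subreg of a DImode pseudo.  */
union u64f { unsigned long long i; float f[2]; };

float
foo (union u64f x, float y)
{
  return __builtin_copysignf (y, x.f[0]);
}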
--- gcc/config/i386/i386-expand.cc | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 6cf1a0b..7f7055b 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -2153,7 +2153,7 @@ void ix86_expand_copysign (rtx operands[]) { machine_mode mode, vmode; - rtx dest, op0, op1, mask, op2, op3; + rtx dest, vdest, op0, op1, mask, op2, op3; mode = GET_MODE (operands[0]); @@ -2174,8 +2174,13 @@ ix86_expand_copysign (rtx operands[]) return; } - dest = lowpart_subreg (vmode, operands[0], mode); - op1 = lowpart_subreg (vmode, operands[2], mode); + dest = operands[0]; + vdest = lowpart_subreg (vmode, dest, mode); + if (vdest == NULL_RTX) + vdest = gen_reg_rtx (vmode); + else + dest = NULL_RTX; + op1 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode); mask = ix86_build_signbit_mask (vmode, 0, 0); if (CONST_DOUBLE_P (operands[1])) @@ -2184,7 +2189,9 @@ ix86_expand_copysign (rtx operands[]) /* Optimize for 0, simplify b = copy_signf (0.0f, a) to b = mask & a. */ if (op0 == CONST0_RTX (mode)) { - emit_move_insn (dest, gen_rtx_AND (vmode, mask, op1)); + emit_move_insn (vdest, gen_rtx_AND (vmode, mask, op1)); + if (dest) + emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode)); return; } @@ -2193,7 +2200,7 @@ ix86_expand_copysign (rtx operands[]) op0 = force_reg (vmode, op0); } else - op0 = lowpart_subreg (vmode, operands[1], mode); + op0 = lowpart_subreg (vmode, force_reg (mode, operands[1]), mode); op2 = gen_reg_rtx (vmode); op3 = gen_reg_rtx (vmode); @@ -2201,7 +2208,9 @@ ix86_expand_copysign (rtx operands[]) gen_rtx_NOT (vmode, mask), op0)); emit_move_insn (op3, gen_rtx_AND (vmode, mask, op1)); - emit_move_insn (dest, gen_rtx_IOR (vmode, op2, op3)); + emit_move_insn (vdest, gen_rtx_IOR (vmode, op2, op3)); + if (dest) + emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode)); } /* Expand an xorsign operation. */ @@ -2210,7 +2219,7 @@ void ix86_expand_xorsign (rtx operands[]) { machine_mode mode, vmode; - rtx dest, op0, op1, mask, x, temp; + rtx dest, vdest, op0, op1, mask, x, temp; dest = operands[0]; op0 = operands[1]; @@ -2230,15 +2239,22 @@ ix86_expand_xorsign (rtx operands[]) temp = gen_reg_rtx (vmode); mask = ix86_build_signbit_mask (vmode, 0, 0); - op1 = lowpart_subreg (vmode, op1, mode); + op1 = lowpart_subreg (vmode, force_reg (mode, op1), mode); x = gen_rtx_AND (vmode, op1, mask); emit_insn (gen_rtx_SET (temp, x)); - op0 = lowpart_subreg (vmode, op0, mode); + op0 = lowpart_subreg (vmode, force_reg (mode, op0), mode); x = gen_rtx_XOR (vmode, temp, op0); - dest = lowpart_subreg (vmode, dest, mode); - emit_insn (gen_rtx_SET (dest, x)); + vdest = lowpart_subreg (vmode, dest, mode); + if (vdest == NULL_RTX) + vdest = gen_reg_rtx (vmode); + else + dest = NULL_RTX; + emit_insn (gen_rtx_SET (vdest, x)); + + if (dest) + emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode)); } static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1); -- cgit v1.1 From c2b23aaaf4457278403c01cd145cd3936683384e Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Fri, 18 Feb 2022 12:31:02 +0100 Subject: [nvptx] Add -mptx-comment Add functionality that indicates which insns are added by -minit-regs, such that for instance we have for pr53465.s: ... 
// #APP // 9 "gcc/testsuite/gcc.c-torture/execute/pr53465.c" 1 // Start: Added by -minit-regs=3: // #NO_APP mov.u32 %r26, 0; // #APP // 9 "gcc/testsuite/gcc.c-torture/execute/pr53465.c" 1 // End: Added by -minit-regs=3: // #NO_APP ... Can be switched off using -mno-ptx-comment. Tested on nvptx. gcc/ChangeLog: 2022-02-21 Tom de Vries * config/nvptx/nvptx.cc (gen_comment): New function. (workaround_uninit_method_1, workaround_uninit_method_2) (workaround_uninit_method_3): : Use gen_comment. * config/nvptx/nvptx.opt (mptx-comment): New option. --- gcc/config/nvptx/nvptx.cc | 42 ++++++++++++++++++++++++++++++++++++++++++ gcc/config/nvptx/nvptx.opt | 3 +++ 2 files changed, 45 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index a37a6c7..981b91f 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -5372,6 +5372,17 @@ workaround_barsyncs (void) } #endif +static rtx +gen_comment (const char *s) +{ + const char *sep = " "; + size_t len = strlen (ASM_COMMENT_START) + strlen (sep) + strlen (s) + 1; + char *comment = (char *) alloca (len); + snprintf (comment, len, "%s%s%s", ASM_COMMENT_START, sep, s); + return gen_rtx_ASM_INPUT_loc (VOIDmode, ggc_strdup (comment), + cfun->function_start_locus); +} + /* Initialize all declared regs at function entry. Advantage : Fool-proof. Disadvantage: Potentially creates a lot of long live ranges and adds a lot @@ -5394,6 +5405,8 @@ workaround_uninit_method_1 (void) gcc_assert (CONST0_RTX (GET_MODE (reg))); start_sequence (); + if (nvptx_comment && first != NULL) + emit_insn (gen_comment ("Start: Added by -minit-regs=1")); emit_move_insn (reg, CONST0_RTX (GET_MODE (reg))); rtx_insn *inits = get_insns (); end_sequence (); @@ -5411,6 +5424,9 @@ workaround_uninit_method_1 (void) else insert_here = emit_insn_after (inits, insert_here); } + + if (nvptx_comment && insert_here != NULL) + emit_insn_after (gen_comment ("End: Added by -minit-regs=1"), insert_here); } /* Find uses of regs that are not defined on all incoming paths, and insert a @@ -5446,6 +5462,8 @@ workaround_uninit_method_2 (void) gcc_assert (CONST0_RTX (GET_MODE (reg))); start_sequence (); + if (nvptx_comment && first != NULL) + emit_insn (gen_comment ("Start: Added by -minit-regs=2:")); emit_move_insn (reg, CONST0_RTX (GET_MODE (reg))); rtx_insn *inits = get_insns (); end_sequence (); @@ -5463,6 +5481,9 @@ workaround_uninit_method_2 (void) else insert_here = emit_insn_after (inits, insert_here); } + + if (nvptx_comment && insert_here != NULL) + emit_insn_after (gen_comment ("End: Added by -minit-regs=2"), insert_here); } /* Find uses of regs that are not defined on all incoming paths, and insert a @@ -5531,6 +5552,27 @@ workaround_uninit_method_3 (void) } } + if (nvptx_comment) + FOR_EACH_BB_FN (bb, cfun) + { + if (single_pred_p (bb)) + continue; + + edge e; + edge_iterator ei; + FOR_EACH_EDGE (e, ei, bb->preds) + { + if (e->insns.r == NULL_RTX) + continue; + start_sequence (); + emit_insn (gen_comment ("Start: Added by -minit-regs=3:")); + emit_insn (e->insns.r); + emit_insn (gen_comment ("End: Added by -minit-regs=3:")); + e->insns.r = get_insns (); + end_sequence (); + } + } + commit_edge_insertions (); } diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt index 0858007..e56ec92 100644 --- a/gcc/config/nvptx/nvptx.opt +++ b/gcc/config/nvptx/nvptx.opt @@ -95,3 +95,6 @@ Specify the version of the ptx version to use. 
minit-regs= Target Var(nvptx_init_regs) IntegerRange(0, 3) Joined UInteger Init(3) Initialize ptx registers. + +mptx-comment +Target Var(nvptx_comment) Init(1) Undocumented -- cgit v1.1 From bc91cb8d8cf1d4abbb74fb69d918071e1801fd77 Mon Sep 17 00:00:00 2001 From: Tobias Burnus Date: Sat, 19 Feb 2022 23:28:49 +0100 Subject: nvptx: Add -mptx=6.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently supported internally are 3.1, 6.0, 6.3 and 7.0. However, -mptx= supports 3.1, 6.3, 7.0 – but not the internal default 6.0. Add -mptx=6.0 for consistency. Tested on nvptx. gcc/ChangeLog: * config/nvptx/nvptx.opt (mptx): Add 6.0 alias PTX_VERSION_6_0. * doc/invoke.texi (-mptx): Update for new values and defaults. Co-Authored-By: Tom de Vries --- gcc/config/nvptx/nvptx.opt | 3 +++ 1 file changed, 3 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt index e56ec92..97e127c 100644 --- a/gcc/config/nvptx/nvptx.opt +++ b/gcc/config/nvptx/nvptx.opt @@ -83,6 +83,9 @@ EnumValue Enum(ptx_version) String(3.1) Value(PTX_VERSION_3_1) EnumValue +Enum(ptx_version) String(6.0) Value(PTX_VERSION_6_0) + +EnumValue Enum(ptx_version) String(6.3) Value(PTX_VERSION_6_3) EnumValue -- cgit v1.1 From bd73d8dd312c759ee505b401d6b4fd7be07a3f1a Mon Sep 17 00:00:00 2001 From: Tobias Burnus Date: Sun, 20 Feb 2022 00:25:33 +0100 Subject: nvptx: Add -misa=sm_70 Add -misa=sm_70, and use it to specify the misa value in test-case gcc.target/nvptx/atomic-store-2.c. Tested on nvptx. gcc/ChangeLog: * config/nvptx/nvptx-c.cc (nvptx_cpu_cpp_builtins): Handle SM70. * config/nvptx/nvptx.cc (first_ptx_version_supporting_sm): Likewise. * config/nvptx/nvptx.opt (misa): Add sm_70 alias PTX_ISA_SM70. gcc/testsuite/ChangeLog: 2022-02-22 Tom de Vries * gcc.target/nvptx/atomic-store-2.c: Use -misa=sm_70. * gcc.target/nvptx/uniform-simt-3.c: Same. 
Co-Authored-By: Tom de Vries --- gcc/config/nvptx/nvptx-c.cc | 2 ++ gcc/config/nvptx/nvptx.cc | 2 ++ gcc/config/nvptx/nvptx.opt | 3 +++ 3 files changed, 7 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx-c.cc b/gcc/config/nvptx/nvptx-c.cc index d68b991..b2375fb 100644 --- a/gcc/config/nvptx/nvptx-c.cc +++ b/gcc/config/nvptx/nvptx-c.cc @@ -43,6 +43,8 @@ nvptx_cpu_cpp_builtins (void) cpp_define (parse_in, "__PTX_SM__=800"); else if (TARGET_SM75) cpp_define (parse_in, "__PTX_SM__=750"); + else if (TARGET_SM70) + cpp_define (parse_in, "__PTX_SM__=700"); else if (TARGET_SM53) cpp_define (parse_in, "__PTX_SM__=530"); else if (TARGET_SM35) diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index 981b91f..858789e 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -217,6 +217,8 @@ first_ptx_version_supporting_sm (enum ptx_isa sm) return PTX_VERSION_3_1; case PTX_ISA_SM53: return PTX_VERSION_4_2; + case PTX_ISA_SM70: + return PTX_VERSION_6_0; case PTX_ISA_SM75: return PTX_VERSION_6_3; case PTX_ISA_SM80: diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt index 97e127c..9776c3b 100644 --- a/gcc/config/nvptx/nvptx.opt +++ b/gcc/config/nvptx/nvptx.opt @@ -65,6 +65,9 @@ EnumValue Enum(ptx_isa) String(sm_53) Value(PTX_ISA_SM53) EnumValue +Enum(ptx_isa) String(sm_70) Value(PTX_ISA_SM70) + +EnumValue Enum(ptx_isa) String(sm_75) Value(PTX_ISA_SM75) EnumValue -- cgit v1.1 From bf3e36fbf13f0db44a79988036cb9c042288841a Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Wed, 13 Oct 2021 09:16:09 +0000 Subject: arm: Add GENERAL_AND_VPR_REGS regclass MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At some point during the development of this patch series, it appeared that in some cases the register allocator wants “VPR or general” rather than “VPR or general or FP” (which is the same thing as ALL_REGS). The series does not seem to require this anymore, but it seems to be a good thing to do anyway, to give the register allocator more freedom. CLASS_MAX_NREGS and arm_hard_regno_nregs need adjustment to avoid a regression in gcc.dg/stack-usage-1.c when compiled with -mthumb -mfloat-abi=hard -march=armv8.1-m.main+mve.fp+fp.dp. Most of the work of this patch series was carried out while I was working at STMicroelectronics as a Linaro assignee. 2022-02-22 Christophe Lyon gcc/ * config/arm/arm.h (reg_class): Add GENERAL_AND_VPR_REGS. (REG_CLASS_NAMES): Likewise. (REG_CLASS_CONTENTS): Likewise. (CLASS_MAX_NREGS): Handle VPR. * config/arm/arm.cc (arm_hard_regno_nregs): Handle VPR. 
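For a worked view of the new VPR handling (an editorial sketch, not GCC source): both CLASS_MAX_NREGS and arm_hard_regno_nregs compute CEIL (GET_MODE_SIZE (mode), 2), so any mode of at most two bytes (which covers the 16-bit MVE predicate modes) fits in a single VPR register.

#include <stdio.h>

/* Mirrors the CEIL (GET_MODE_SIZE (mode), 2) arithmetic used for VPR;
   the 2-byte size assumed below corresponds to a 16-bit predicate.  */
static unsigned
vpr_nregs (unsigned mode_size_bytes)
{
  return (mode_size_bytes + 1) / 2;
}

int
main (void)
{
  printf ("2-byte predicate mode -> %u VPR register(s)\n", vpr_nregs (2));
  return 0;
}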
--- gcc/config/arm/arm.cc | 3 +++ gcc/config/arm/arm.h | 7 ++++++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc index 663f459..9c19589 100644 --- a/gcc/config/arm/arm.cc +++ b/gcc/config/arm/arm.cc @@ -25339,6 +25339,9 @@ thumb2_asm_output_opcode (FILE * stream) static unsigned int arm_hard_regno_nregs (unsigned int regno, machine_mode mode) { + if (IS_VPR_REGNUM (regno)) + return CEIL (GET_MODE_SIZE (mode), 2); + if (TARGET_32BIT && regno > PC_REGNUM && regno != FRAME_POINTER_REGNUM diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h index f52724d..61c0221 100644 --- a/gcc/config/arm/arm.h +++ b/gcc/config/arm/arm.h @@ -1287,6 +1287,7 @@ enum reg_class SFP_REG, AFP_REG, VPR_REG, + GENERAL_AND_VPR_REGS, ALL_REGS, LIM_REG_CLASSES }; @@ -1316,6 +1317,7 @@ enum reg_class "SFP_REG", \ "AFP_REG", \ "VPR_REG", \ + "GENERAL_AND_VPR_REGS", \ "ALL_REGS" \ } @@ -1344,6 +1346,7 @@ enum reg_class { 0x00000000, 0x00000000, 0x00000000, 0x00000040 }, /* SFP_REG */ \ { 0x00000000, 0x00000000, 0x00000000, 0x00000080 }, /* AFP_REG */ \ { 0x00000000, 0x00000000, 0x00000000, 0x00000400 }, /* VPR_REG. */ \ + { 0x00005FFF, 0x00000000, 0x00000000, 0x00000400 }, /* GENERAL_AND_VPR_REGS. */ \ { 0xFFFF7FFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0000000F } /* ALL_REGS. */ \ } @@ -1453,7 +1456,9 @@ extern const char *fp_sysreg_names[NB_FP_SYSREGS]; ARM regs are UNITS_PER_WORD bits. FIXME: Is this true for iWMMX? */ #define CLASS_MAX_NREGS(CLASS, MODE) \ - (ARM_NUM_REGS (MODE)) + (CLASS == VPR_REG) \ + ? CEIL (GET_MODE_SIZE (MODE), 2) \ + : (ARM_NUM_REGS (MODE)) /* If defined, gives a class of registers that cannot be used as the operand of a SUBREG that changes the mode of the object illegally. */ -- cgit v1.1 From 6769084fdf159fb5c0fd20c8d28cfef5b2126cb0 Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Wed, 13 Oct 2021 09:16:14 +0000 Subject: arm: Add support for VPR_REG in arm_class_likely_spilled_p VPR_REG is the only register in its class, so it should be handled by TARGET_CLASS_LIKELY_SPILLED_P, which is achieved by calling default_class_likely_spilled_p. No test fails without this patch, but it seems it should be implemented. Most of the work of this patch series was carried out while I was working at STMicroelectronics as a Linaro assignee. 2022-02-22 Christophe Lyon gcc/ * config/arm/arm.cc (arm_class_likely_spilled_p): Handle VPR_REG. --- gcc/config/arm/arm.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc index 9c19589..8d7f095 100644 --- a/gcc/config/arm/arm.cc +++ b/gcc/config/arm/arm.cc @@ -29369,7 +29369,7 @@ arm_class_likely_spilled_p (reg_class_t rclass) || rclass == CC_REG) return true; - return false; + return default_class_likely_spilled_p (rclass); } /* Implements target hook small_register_classes_for_mode_p. */ -- cgit v1.1 From 0d0aaea105f6b5ddd9b4763e4cbd16ef65a74cb9 Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Wed, 13 Oct 2021 09:16:17 +0000 Subject: arm: Fix mve_vmvnq_n_ argument mode The vmvnq_n* intrinsics and have [u]int[16|32]_t arguments, so use iterator instead of HI in mve_vmvnq_n_. Most of the work of this patch series was carried out while I was working at STMicroelectronics as a Linaro assignee. 2022-02-22 Christophe Lyon gcc/ * config/arm/mve.md (mve_vmvnq_n_): Use V_elem mode for operand 1. 
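A usage sketch (assuming arm_mve.h and an MVE-enabled -march; the immediate values are arbitrary but encodable) showing that the intrinsic's immediate is a 16- or 32-bit element value, which is why the pattern needs the per-mode element mode rather than a fixed HImode operand:

#include <arm_mve.h>

uint16x8_t
not_imm_u16 (void)
{
  /* Per-lane bitwise NOT of a 16-bit immediate.  */
  return vmvnq_n_u16 (0x5500);
}

uint32x4_t
not_imm_u32 (void)
{
  /* Per-lane bitwise NOT of a 32-bit immediate.  */
  return vmvnq_n_u32 (0xff00);
}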
--- gcc/config/arm/mve.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index 171dd38..5c3b34d 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -617,7 +617,7 @@ (define_insn "mve_vmvnq_n_" [ (set (match_operand:MVE_5 0 "s_register_operand" "=w") - (unspec:MVE_5 [(match_operand:HI 1 "immediate_operand" "i")] + (unspec:MVE_5 [(match_operand: 1 "immediate_operand" "i")] VMVNQ_N)) ] "TARGET_HAVE_MVE" -- cgit v1.1 From 884f77b4222289510e1df9db2889b60c5df6fcda Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Wed, 13 Oct 2021 09:16:22 +0000 Subject: arm: Implement MVE predicates as vectors of booleans This patch implements support for vectors of booleans to support MVE predicates, instead of HImode. Since the ABI mandates pred16_t (aka uint16_t) to represent predicates in intrinsics prototypes, we introduce a new "predicate" type qualifier so that we can map relevant builtins HImode arguments and return value to the appropriate vector of booleans (VxBI). We have to update test_vector_ops_duplicate, because it iterates using an offset in bytes, where we would need to iterate in bits: we stop iterating when we reach the end of the vector of booleans. In addition, we have to fix the underlying definition of vectors of booleans because ARM/MVE needs a different representation than AArch64/SVE. With ARM/MVE the 'true' bit is duplicated over the element size, so that a true element of V4BI is represented by '0b1111'. This patch updates the aarch64 definition of VNx*BI as needed. Most of the work of this patch series was carried out while I was working at STMicroelectronics as a Linaro assignee. 2022-02-22 Christophe Lyon Richard Sandiford gcc/ PR target/100757 PR target/101325 * config/aarch64/aarch64-modes.def (VNx16BI, VNx8BI, VNx4BI, VNx2BI): Update definition. * config/arm/arm-builtins.cc (arm_init_simd_builtin_types): Add new simd types. (arm_init_builtin): Map predicate vectors arguments to HImode. (arm_expand_builtin_args): Move HImode predicate arguments to VxBI rtx. Move return value to HImode rtx. * config/arm/arm-builtins.h (arm_type_qualifiers): Add qualifier_predicate. * config/arm/arm-modes.def (B2I, B4I, V16BI, V8BI, V4BI): New modes. * config/arm/arm-simd-builtin-types.def (Pred1x16_t, Pred2x8_t,Pred4x4_t): New. * emit-rtl.cc (init_emit_once): Handle all boolean modes. * genmodes.cc (mode_data): Add boolean field. (blank_mode): Initialize it. (make_complex_modes): Fix handling of boolean modes. (make_vector_modes): Likewise. (VECTOR_BOOL_MODE): Use new COMPONENT parameter. (make_vector_bool_mode): Likewise. (BOOL_MODE): New. (make_bool_mode): New. (emit_insn_modes_h): Fix generation of boolean modes. (emit_class_narrowest_mode): Likewise. * machmode.def: (VECTOR_BOOL_MODE): Document new COMPONENT parameter. Use new BOOL_MODE instead of FRACTIONAL_INT_MODE to define BImode. * rtx-vector-builder.cc (rtx_vector_builder::find_cached_value): Fix handling of constm1_rtx for VECTOR_BOOL. * simplify-rtx.cc (native_encode_rtx): Fix support for VECTOR_BOOL. (native_decode_vector_rtx): Likewise. (test_vector_ops_duplicate): Skip vec_merge test with vectors of booleans. * varasm.cc (output_constant_pool_2): Likewise. 
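A small usage sketch of the ABI-level view described above (assumes arm_mve.h and MVE enabled): intrinsics keep exchanging predicates as mve_pred16_t (an unsigned 16-bit integer), and for 32-bit lanes each lane's truth value is replicated over 4 bits, so selecting lanes 0 and 2 uses the mask 0x0f0f rather than 0x5.

#include <arm_mve.h>

int32x4_t
pick_lanes_0_and_2 (int32x4_t a, int32x4_t b)
{
  /* Lane 0 occupies bits 0-3 and lane 2 bits 8-11 of the 16-bit mask.  */
  mve_pred16_t p = 0x0f0f;
  return vpselq_s32 (a, b, p);
}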
--- gcc/config/aarch64/aarch64-modes.def | 8 +++---- gcc/config/arm/arm-builtins.cc | 39 +++++++++++++++++++++++++++++-- gcc/config/arm/arm-builtins.h | 4 +++- gcc/config/arm/arm-modes.def | 8 +++++++ gcc/config/arm/arm-simd-builtin-types.def | 4 ++++ 5 files changed, 56 insertions(+), 7 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-modes.def b/gcc/config/aarch64/aarch64-modes.def index 976bf9b..8f39922 100644 --- a/gcc/config/aarch64/aarch64-modes.def +++ b/gcc/config/aarch64/aarch64-modes.def @@ -47,10 +47,10 @@ ADJUST_FLOAT_FORMAT (HF, &ieee_half_format); /* Vector modes. */ -VECTOR_BOOL_MODE (VNx16BI, 16, 2); -VECTOR_BOOL_MODE (VNx8BI, 8, 2); -VECTOR_BOOL_MODE (VNx4BI, 4, 2); -VECTOR_BOOL_MODE (VNx2BI, 2, 2); +VECTOR_BOOL_MODE (VNx16BI, 16, BI, 2); +VECTOR_BOOL_MODE (VNx8BI, 8, BI, 2); +VECTOR_BOOL_MODE (VNx4BI, 4, BI, 2); +VECTOR_BOOL_MODE (VNx2BI, 2, BI, 2); ADJUST_NUNITS (VNx16BI, aarch64_sve_vg * 8); ADJUST_NUNITS (VNx8BI, aarch64_sve_vg * 4); diff --git a/gcc/config/arm/arm-builtins.cc b/gcc/config/arm/arm-builtins.cc index e6bbda2..993a2f7 100644 --- a/gcc/config/arm/arm-builtins.cc +++ b/gcc/config/arm/arm-builtins.cc @@ -1553,11 +1553,28 @@ arm_init_simd_builtin_types (void) tree eltype = arm_simd_types[i].eltype; machine_mode mode = arm_simd_types[i].mode; - if (eltype == NULL) + if (eltype == NULL + /* VECTOR_BOOL is not supported unless MVE is activated, + this would make build_truth_vector_type_for_mode + crash. */ + && ((GET_MODE_CLASS (mode) != MODE_VECTOR_BOOL) + || !TARGET_HAVE_MVE)) continue; if (arm_simd_types[i].itype == NULL) { - tree type = build_vector_type (eltype, GET_MODE_NUNITS (mode)); + tree type; + if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL) + { + /* Handle MVE predicates: they are internally stored as + 16 bits, but are used as vectors of 1, 2 or 4-bit + elements. */ + type = build_truth_vector_type_for_mode (GET_MODE_NUNITS (mode), + mode); + eltype = TREE_TYPE (type); + } + else + type = build_vector_type (eltype, GET_MODE_NUNITS (mode)); + type = build_distinct_type_copy (type); SET_TYPE_STRUCTURAL_EQUALITY (type); @@ -1695,6 +1712,11 @@ arm_init_builtin (unsigned int fcode, arm_builtin_datum *d, if (qualifiers & qualifier_map_mode) op_mode = d->mode; + /* MVE Predicates use HImode as mandated by the ABI: pred16_t is + unsigned short. */ + if (qualifiers & qualifier_predicate) + op_mode = HImode; + /* For pointers, we want a pointer to the basic type of the vector. */ if (qualifiers & qualifier_pointer && VECTOR_MODE_P (op_mode)) @@ -2939,6 +2961,12 @@ arm_expand_builtin_args (rtx target, machine_mode map_mode, int fcode, case ARG_BUILTIN_COPY_TO_REG: if (POINTER_TYPE_P (TREE_TYPE (arg[argc]))) op[argc] = convert_memory_address (Pmode, op[argc]); + + /* MVE uses mve_pred16_t (aka HImode) for vectors of + predicates. 
*/ + if (GET_MODE_CLASS (mode[argc]) == MODE_VECTOR_BOOL) + op[argc] = gen_lowpart (mode[argc], op[argc]); + /*gcc_assert (GET_MODE (op[argc]) == mode[argc]); */ if (!(*insn_data[icode].operand[opno].predicate) (op[argc], mode[argc])) @@ -3144,6 +3172,13 @@ constant_arg: else emit_insn (insn); + if (GET_MODE_CLASS (tmode) == MODE_VECTOR_BOOL) + { + rtx HItarget = gen_reg_rtx (HImode); + emit_move_insn (HItarget, gen_lowpart (HImode, target)); + return HItarget; + } + return target; } diff --git a/gcc/config/arm/arm-builtins.h b/gcc/config/arm/arm-builtins.h index e5130d6..a8ef8ae 100644 --- a/gcc/config/arm/arm-builtins.h +++ b/gcc/config/arm/arm-builtins.h @@ -84,7 +84,9 @@ enum arm_type_qualifiers qualifier_lane_pair_index = 0x1000, /* Lane indices selected in quadtuplets - must be within range of previous argument = a vector. */ - qualifier_lane_quadtup_index = 0x2000 + qualifier_lane_quadtup_index = 0x2000, + /* MVE vector predicates. */ + qualifier_predicate = 0x4000 }; struct arm_simd_type_info diff --git a/gcc/config/arm/arm-modes.def b/gcc/config/arm/arm-modes.def index de689c8..9ed0cd0 100644 --- a/gcc/config/arm/arm-modes.def +++ b/gcc/config/arm/arm-modes.def @@ -84,6 +84,14 @@ VECTOR_MODE (FLOAT, BF, 2); /* V2BF. */ VECTOR_MODE (FLOAT, BF, 4); /* V4BF. */ VECTOR_MODE (FLOAT, BF, 8); /* V8BF. */ +/* Predicates for MVE. */ +BOOL_MODE (B2I, 2, 1); +BOOL_MODE (B4I, 4, 1); + +VECTOR_BOOL_MODE (V16BI, 16, BI, 2); +VECTOR_BOOL_MODE (V8BI, 8, B2I, 2); +VECTOR_BOOL_MODE (V4BI, 4, B4I, 2); + /* Fraction and accumulator vector modes. */ VECTOR_MODES (FRACT, 4); /* V4QQ V2HQ */ VECTOR_MODES (UFRACT, 4); /* V4UQQ V2UHQ */ diff --git a/gcc/config/arm/arm-simd-builtin-types.def b/gcc/config/arm/arm-simd-builtin-types.def index 6ba6f21..d1d6416 100644 --- a/gcc/config/arm/arm-simd-builtin-types.def +++ b/gcc/config/arm/arm-simd-builtin-types.def @@ -51,3 +51,7 @@ ENTRY (Bfloat16x2_t, V2BF, none, 32, bfloat16, 20) ENTRY (Bfloat16x4_t, V4BF, none, 64, bfloat16, 20) ENTRY (Bfloat16x8_t, V8BF, none, 128, bfloat16, 20) + + ENTRY (Pred1x16_t, V16BI, predicate, 16, pred1, 16) + ENTRY (Pred2x8_t, V8BI, predicate, 8, pred1, 15) + ENTRY (Pred4x4_t, V4BI, predicate, 4, pred1, 15) -- cgit v1.1 From 91224cf625dc90304bb515a0cc602beed48fe3da Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Wed, 13 Oct 2021 09:16:27 +0000 Subject: arm: Implement auto-vectorized MVE comparisons with vectors of boolean predicates We make use of qualifier_predicate to describe MVE builtins prototypes, restricting to auto-vectorizable vcmp* and vpsel builtins, as they are exercised by the tests added earlier in the series. Special handling is needed for mve_vpselq because it has a v2di variant, which has no natural VPR.P0 representation: we keep HImode for it. The vector_compare expansion code is updated to use the right VxBI mode instead of HI for the result. We extend the existing thumb2_movhi_vfp and thumb2_movhi_fp16 patterns to use the new MVE_7_HI iterator which covers HI and the new VxBI modes, in conjunction with the new DB constraint for a constant vector of booleans. This patch also adds tests derived from the one provided in PR target/101325: there is a compile-only test because I did not have access to anything that could execute MVE code until recently. I have been able to add an executable test since QEMU supports MVE. Instead of adding arm_v8_1m_mve_hw, I update arm_mve_hw so that it uses add_options_for_arm_v8_1m_mve_fp, like arm_neon_hw does. 
This ensures arm_mve_hw passes even if the toolchain does not generate MVE code by default. Most of the work of this patch series was carried out while I was working at STMicroelectronics as a Linaro assignee. 2022-02-22 Christophe Lyon Richard Sandiford gcc/ PR target/100757 PR target/101325 * config/arm/arm-builtins.cc (BINOP_PRED_UNONE_UNONE_QUALIFIERS) (BINOP_PRED_NONE_NONE_QUALIFIERS) (TERNOP_NONE_NONE_NONE_PRED_QUALIFIERS) (TERNOP_UNONE_UNONE_UNONE_PRED_QUALIFIERS): New. * config/arm/arm-protos.h (mve_bool_vec_to_const): New. * config/arm/arm.cc (arm_hard_regno_mode_ok): Handle new VxBI modes. (arm_mode_to_pred_mode): New. (arm_expand_vector_compare): Use the right VxBI mode instead of HI. (arm_expand_vcond): Likewise. (simd_valid_immediate): Handle MODE_VECTOR_BOOL. (mve_bool_vec_to_const): New. (neon_make_constant): Call mve_bool_vec_to_const when needed. * config/arm/arm_mve_builtins.def (vcmpneq_, vcmphiq_, vcmpcsq_) (vcmpltq_, vcmpleq_, vcmpgtq_, vcmpgeq_, vcmpeqq_, vcmpneq_f) (vcmpltq_f, vcmpleq_f, vcmpgtq_f, vcmpgeq_f, vcmpeqq_f, vpselq_u) (vpselq_s, vpselq_f): Use new predicated qualifiers. * config/arm/constraints.md (DB): New. * config/arm/iterators.md (MVE_7, MVE_7_HI): New mode iterators. (MVE_VPRED, MVE_vpred): New attribute iterators. * config/arm/mve.md (@mve_vcmpq_) (@mve_vcmpq_f, @mve_vpselq_) (@mve_vpselq_f): Use MVE_VPRED instead of HI. (@mve_vpselq_v2di): Define separately. (mov): New expander for VxBI modes. * config/arm/vfp.md (thumb2_movhi_vfp, thumb2_movhi_fp16): Use MVE_7_HI iterator and add support for DB constraint. gcc/testsuite/ PR target/100757 PR target/101325 * gcc.dg/rtl/arm/mve-vxbi.c: New test. * gcc.target/arm/simd/pr101325.c: New. * gcc.target/arm/simd/pr101325-2.c: New. * lib/target-supports.exp (check_effective_target_arm_mve_hw): Use add_options_for_arm_v8_1m_mve_fp. 
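For illustration, a loop in the spirit of the PR target/101325 reproducer (a hedged sketch, not the committed test): with -O3 and an MVE-enabled -march the comparison is expected to become a VCMP writing the VPR.P0 predicate and the select a VPSEL, exercising the predicated qualifiers added here.

void
f (const unsigned int *a, const unsigned int *b, unsigned int *c, int n)
{
  for (int i = 0; i < n; i++)
    c[i] = a[i] > b[i] ? 2u : 5u;
}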
--- gcc/config/arm/arm-builtins.cc | 25 +++++++++++++++++ gcc/config/arm/arm-protos.h | 1 + gcc/config/arm/arm.cc | 56 +++++++++++++++++++++++++++++++++---- gcc/config/arm/arm_mve_builtins.def | 34 +++++++++++----------- gcc/config/arm/constraints.md | 6 ++++ gcc/config/arm/iterators.md | 6 ++++ gcc/config/arm/mve.md | 23 +++++++++++---- gcc/config/arm/vfp.md | 34 ++++++++++++++-------- 8 files changed, 144 insertions(+), 41 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm-builtins.cc b/gcc/config/arm/arm-builtins.cc index 993a2f7..1c6b9c9 100644 --- a/gcc/config/arm/arm-builtins.cc +++ b/gcc/config/arm/arm-builtins.cc @@ -421,6 +421,12 @@ arm_binop_unone_unone_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] (arm_binop_unone_unone_unone_qualifiers) static enum arm_type_qualifiers +arm_binop_pred_unone_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_predicate, qualifier_unsigned, qualifier_unsigned }; +#define BINOP_PRED_UNONE_UNONE_QUALIFIERS \ + (arm_binop_pred_unone_unone_qualifiers) + +static enum arm_type_qualifiers arm_binop_unone_none_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_unsigned, qualifier_none, qualifier_immediate }; #define BINOP_UNONE_NONE_IMM_QUALIFIERS \ @@ -439,6 +445,12 @@ arm_binop_unone_none_none_qualifiers[SIMD_MAX_BUILTIN_ARGS] (arm_binop_unone_none_none_qualifiers) static enum arm_type_qualifiers +arm_binop_pred_none_none_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_predicate, qualifier_none, qualifier_none }; +#define BINOP_PRED_NONE_NONE_QUALIFIERS \ + (arm_binop_pred_none_none_qualifiers) + +static enum arm_type_qualifiers arm_binop_unone_unone_none_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_unsigned, qualifier_unsigned, qualifier_none }; #define BINOP_UNONE_UNONE_NONE_QUALIFIERS \ @@ -510,6 +522,12 @@ arm_ternop_none_none_none_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] (arm_ternop_none_none_none_unone_qualifiers) static enum arm_type_qualifiers +arm_ternop_none_none_none_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_none, qualifier_none, qualifier_none, qualifier_predicate }; +#define TERNOP_NONE_NONE_NONE_PRED_QUALIFIERS \ + (arm_ternop_none_none_none_pred_qualifiers) + +static enum arm_type_qualifiers arm_ternop_none_none_imm_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_none, qualifier_none, qualifier_immediate, qualifier_unsigned }; #define TERNOP_NONE_NONE_IMM_UNONE_QUALIFIERS \ @@ -529,6 +547,13 @@ arm_ternop_unone_unone_unone_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] (arm_ternop_unone_unone_unone_unone_qualifiers) static enum arm_type_qualifiers +arm_ternop_unone_unone_unone_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_unsigned, qualifier_unsigned, qualifier_unsigned, + qualifier_predicate }; +#define TERNOP_UNONE_UNONE_UNONE_PRED_QUALIFIERS \ + (arm_ternop_unone_unone_unone_pred_qualifiers) + +static enum arm_type_qualifiers arm_ternop_none_none_none_none_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_none, qualifier_none, qualifier_none, qualifier_none }; #define TERNOP_NONE_NONE_NONE_NONE_QUALIFIERS \ diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h index 881c72c..f2f7ca6 100644 --- a/gcc/config/arm/arm-protos.h +++ b/gcc/config/arm/arm-protos.h @@ -101,6 +101,7 @@ extern char *neon_output_shift_immediate (const char *, char, rtx *, machine_mode, int, bool); extern void neon_pairwise_reduce (rtx, rtx, machine_mode, rtx (*) (rtx, rtx, rtx)); +extern rtx mve_bool_vec_to_const (rtx const_vec); extern rtx neon_make_constant (rtx, bool generate = true); extern tree 
arm_builtin_vectorized_function (unsigned int, tree, tree); extern void neon_expand_vector_init (rtx, rtx); diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc index 8d7f095..df43c67 100644 --- a/gcc/config/arm/arm.cc +++ b/gcc/config/arm/arm.cc @@ -12802,7 +12802,10 @@ simd_valid_immediate (rtx op, machine_mode mode, int inverse, innersize = GET_MODE_UNIT_SIZE (mode); /* Only support 128-bit vectors for MVE. */ - if (TARGET_HAVE_MVE && (!vector || n_elts * innersize != 16)) + if (TARGET_HAVE_MVE + && (!vector + || (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL) + || n_elts * innersize != 16)) return -1; /* Vectors of float constants. */ @@ -13167,6 +13170,29 @@ neon_vdup_constant (rtx vals, bool generate) return gen_vec_duplicate (mode, x); } +/* Return a HI representation of CONST_VEC suitable for MVE predicates. */ +rtx +mve_bool_vec_to_const (rtx const_vec) +{ + int n_elts = GET_MODE_NUNITS ( GET_MODE (const_vec)); + int repeat = 16 / n_elts; + int i; + int hi_val = 0; + + for (i = 0; i < n_elts; i++) + { + rtx el = CONST_VECTOR_ELT (const_vec, i); + unsigned HOST_WIDE_INT elpart; + + gcc_assert (CONST_INT_P (el)); + elpart = INTVAL (el); + + for (int j = 0; j < repeat; j++) + hi_val |= elpart << (i * repeat + j); + } + return gen_int_mode (hi_val, HImode); +} + /* Return a non-NULL RTX iff VALS, which is a PARALLEL containing only constants (for vec_init) or CONST_VECTOR, can be effeciently loaded into a register. @@ -13207,6 +13233,8 @@ neon_make_constant (rtx vals, bool generate) && simd_immediate_valid_for_move (const_vec, mode, NULL, NULL)) /* Load using VMOV. On Cortex-A8 this takes one cycle. */ return const_vec; + else if (TARGET_HAVE_MVE && (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)) + return mve_bool_vec_to_const (const_vec); else if ((target = neon_vdup_constant (vals, generate)) != NULL_RTX) /* Loaded using VDUP. On Cortex-A8 the VDUP takes one NEON pipeline cycle; creating the constant takes one or two ARM @@ -25365,7 +25393,10 @@ arm_hard_regno_mode_ok (unsigned int regno, machine_mode mode) return false; if (IS_VPR_REGNUM (regno)) - return mode == HImode; + return mode == HImode + || mode == V16BImode + || mode == V8BImode + || mode == V4BImode; if (TARGET_THUMB1) /* For the Thumb we only allow values bigger than SImode in @@ -31053,6 +31084,19 @@ arm_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem, arm_post_atomic_barrier (model); } +/* Return the mode for the MVE vector of predicates corresponding to MODE. */ +machine_mode +arm_mode_to_pred_mode (machine_mode mode) +{ + switch (GET_MODE_NUNITS (mode)) + { + case 16: return V16BImode; + case 8: return V8BImode; + case 4: return V4BImode; + } + gcc_unreachable (); +} + /* Expand code to compare vectors OP0 and OP1 using condition CODE. If CAN_INVERT, store either the result or its inverse in TARGET and return true if TARGET contains the inverse. 
If !CAN_INVERT, @@ -31136,7 +31180,7 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1, if (vcond_mve) vpr_p0 = target; else - vpr_p0 = gen_reg_rtx (HImode); + vpr_p0 = gen_reg_rtx (arm_mode_to_pred_mode (cmp_mode)); switch (GET_MODE_CLASS (cmp_mode)) { @@ -31178,7 +31222,7 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1, if (vcond_mve) vpr_p0 = target; else - vpr_p0 = gen_reg_rtx (HImode); + vpr_p0 = gen_reg_rtx (arm_mode_to_pred_mode (cmp_mode)); emit_insn (gen_mve_vcmpq (code, cmp_mode, vpr_p0, op0, force_reg (cmp_mode, op1))); if (!vcond_mve) @@ -31205,7 +31249,7 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1, if (vcond_mve) vpr_p0 = target; else - vpr_p0 = gen_reg_rtx (HImode); + vpr_p0 = gen_reg_rtx (arm_mode_to_pred_mode (cmp_mode)); emit_insn (gen_mve_vcmpq (swap_condition (code), cmp_mode, vpr_p0, force_reg (cmp_mode, op1), op0)); if (!vcond_mve) @@ -31258,7 +31302,7 @@ arm_expand_vcond (rtx *operands, machine_mode cmp_result_mode) if (TARGET_HAVE_MVE) { vcond_mve=true; - mask = gen_reg_rtx (HImode); + mask = gen_reg_rtx (arm_mode_to_pred_mode (cmp_result_mode)); } else mask = gen_reg_rtx (cmp_result_mode); diff --git a/gcc/config/arm/arm_mve_builtins.def b/gcc/config/arm/arm_mve_builtins.def index c3ae407..44b41ea 100644 --- a/gcc/config/arm/arm_mve_builtins.def +++ b/gcc/config/arm/arm_mve_builtins.def @@ -89,7 +89,7 @@ VAR3 (BINOP_UNONE_UNONE_IMM, vshrq_n_u, v16qi, v8hi, v4si) VAR3 (BINOP_NONE_NONE_IMM, vshrq_n_s, v16qi, v8hi, v4si) VAR1 (BINOP_NONE_NONE_UNONE, vaddlvq_p_s, v4si) VAR1 (BINOP_UNONE_UNONE_UNONE, vaddlvq_p_u, v4si) -VAR3 (BINOP_UNONE_NONE_NONE, vcmpneq_, v16qi, v8hi, v4si) +VAR3 (BINOP_PRED_NONE_NONE, vcmpneq_, v16qi, v8hi, v4si) VAR3 (BINOP_NONE_NONE_NONE, vshlq_s, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_NONE, vshlq_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vsubq_u, v16qi, v8hi, v4si) @@ -117,9 +117,9 @@ VAR3 (BINOP_UNONE_UNONE_UNONE, vhsubq_n_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vhaddq_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vhaddq_n_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, veorq_u, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_UNONE_UNONE, vcmphiq_, v16qi, v8hi, v4si) +VAR3 (BINOP_PRED_UNONE_UNONE, vcmphiq_, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vcmphiq_n_, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpcsq_, v16qi, v8hi, v4si) +VAR3 (BINOP_PRED_UNONE_UNONE, vcmpcsq_, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpcsq_n_, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vbicq_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vandq_u, v16qi, v8hi, v4si) @@ -143,15 +143,15 @@ VAR3 (BINOP_UNONE_UNONE_IMM, vshlq_n_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_IMM, vrshrq_n_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_IMM, vqshlq_n_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_NONE_NONE, vcmpneq_n_, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_NONE_NONE, vcmpltq_, v16qi, v8hi, v4si) +VAR3 (BINOP_PRED_NONE_NONE, vcmpltq_, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_NONE_NONE, vcmpltq_n_, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_NONE_NONE, vcmpleq_, v16qi, v8hi, v4si) +VAR3 (BINOP_PRED_NONE_NONE, vcmpleq_, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_NONE_NONE, vcmpleq_n_, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_NONE_NONE, vcmpgtq_, v16qi, v8hi, v4si) +VAR3 (BINOP_PRED_NONE_NONE, vcmpgtq_, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_NONE_NONE, vcmpgtq_n_, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_NONE_NONE, vcmpgeq_, v16qi, v8hi, v4si) +VAR3 
(BINOP_PRED_NONE_NONE, vcmpgeq_, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_NONE_NONE, vcmpgeq_n_, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_NONE_NONE, vcmpeqq_, v16qi, v8hi, v4si) +VAR3 (BINOP_PRED_NONE_NONE, vcmpeqq_, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_NONE_NONE, vcmpeqq_n_, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_NONE_IMM, vqshluq_n_s, v16qi, v8hi, v4si) VAR3 (BINOP_NONE_NONE_UNONE, vaddvq_p_s, v16qi, v8hi, v4si) @@ -219,17 +219,17 @@ VAR2 (BINOP_UNONE_UNONE_IMM, vshllbq_n_u, v16qi, v8hi) VAR2 (BINOP_UNONE_UNONE_IMM, vorrq_n_u, v8hi, v4si) VAR2 (BINOP_UNONE_UNONE_IMM, vbicq_n_u, v8hi, v4si) VAR2 (BINOP_UNONE_NONE_NONE, vcmpneq_n_f, v8hf, v4sf) -VAR2 (BINOP_UNONE_NONE_NONE, vcmpneq_f, v8hf, v4sf) +VAR2 (BINOP_PRED_NONE_NONE, vcmpneq_f, v8hf, v4sf) VAR2 (BINOP_UNONE_NONE_NONE, vcmpltq_n_f, v8hf, v4sf) -VAR2 (BINOP_UNONE_NONE_NONE, vcmpltq_f, v8hf, v4sf) +VAR2 (BINOP_PRED_NONE_NONE, vcmpltq_f, v8hf, v4sf) VAR2 (BINOP_UNONE_NONE_NONE, vcmpleq_n_f, v8hf, v4sf) -VAR2 (BINOP_UNONE_NONE_NONE, vcmpleq_f, v8hf, v4sf) +VAR2 (BINOP_PRED_NONE_NONE, vcmpleq_f, v8hf, v4sf) VAR2 (BINOP_UNONE_NONE_NONE, vcmpgtq_n_f, v8hf, v4sf) -VAR2 (BINOP_UNONE_NONE_NONE, vcmpgtq_f, v8hf, v4sf) +VAR2 (BINOP_PRED_NONE_NONE, vcmpgtq_f, v8hf, v4sf) VAR2 (BINOP_UNONE_NONE_NONE, vcmpgeq_n_f, v8hf, v4sf) -VAR2 (BINOP_UNONE_NONE_NONE, vcmpgeq_f, v8hf, v4sf) +VAR2 (BINOP_PRED_NONE_NONE, vcmpgeq_f, v8hf, v4sf) VAR2 (BINOP_UNONE_NONE_NONE, vcmpeqq_n_f, v8hf, v4sf) -VAR2 (BINOP_UNONE_NONE_NONE, vcmpeqq_f, v8hf, v4sf) +VAR2 (BINOP_PRED_NONE_NONE, vcmpeqq_f, v8hf, v4sf) VAR2 (BINOP_NONE_NONE_NONE, vsubq_f, v8hf, v4sf) VAR2 (BINOP_NONE_NONE_NONE, vqmovntq_s, v8hi, v4si) VAR2 (BINOP_NONE_NONE_NONE, vqmovnbq_s, v8hi, v4si) @@ -295,8 +295,8 @@ VAR2 (TERNOP_UNONE_UNONE_NONE_UNONE, vcvtaq_m_u, v8hi, v4si) VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vcvtaq_m_s, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_IMM, vshlcq_vec_u, v16qi, v8hi, v4si) VAR3 (TERNOP_NONE_NONE_UNONE_IMM, vshlcq_vec_s, v16qi, v8hi, v4si) -VAR4 (TERNOP_UNONE_UNONE_UNONE_UNONE, vpselq_u, v16qi, v8hi, v4si, v2di) -VAR4 (TERNOP_NONE_NONE_NONE_UNONE, vpselq_s, v16qi, v8hi, v4si, v2di) +VAR4 (TERNOP_UNONE_UNONE_UNONE_PRED, vpselq_u, v16qi, v8hi, v4si, v2di) +VAR4 (TERNOP_NONE_NONE_NONE_PRED, vpselq_s, v16qi, v8hi, v4si, v2di) VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vrev64q_m_u, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vmvnq_m_u, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vmlasq_n_u, v16qi, v8hi, v4si) @@ -426,7 +426,7 @@ VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vrev64q_m_f, v8hf, v4sf) VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vrev32q_m_s, v16qi, v8hi) VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vqmovntq_m_s, v8hi, v4si) VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vqmovnbq_m_s, v8hi, v4si) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vpselq_f, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vpselq_f, v8hf, v4sf) VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vnegq_m_f, v8hf, v4sf) VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vmovntq_m_s, v8hi, v4si) VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vmovnbq_m_s, v8hi, v4si) diff --git a/gcc/config/arm/constraints.md b/gcc/config/arm/constraints.md index 1920004..2b411b0 100644 --- a/gcc/config/arm/constraints.md +++ b/gcc/config/arm/constraints.md @@ -312,6 +312,12 @@ (and (match_code "const_vector") (match_test "(TARGET_NEON || TARGET_HAVE_MVE) && op == CONST0_RTX (mode)"))) +(define_constraint "DB" + "@internal + In ARM/Thumb-2 state with MVE a constant vector of booleans." 
+ (and (match_code "const_vector") + (match_test "TARGET_HAVE_MVE && GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL"))) + (define_constraint "Da" "@internal In ARM/Thumb-2 state a const_int, const_double or const_vector that can diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index 8202c27..37cf797 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -272,6 +272,8 @@ (define_mode_iterator MVE_2 [V16QI V8HI V4SI]) (define_mode_iterator MVE_5 [V8HI V4SI]) (define_mode_iterator MVE_6 [V8HI V4SI]) +(define_mode_iterator MVE_7 [V16BI V8BI V4BI]) +(define_mode_iterator MVE_7_HI [HI V16BI V8BI V4BI]) ;;---------------------------------------------------------------------------- ;; Code iterators @@ -946,6 +948,10 @@ (V8HF "u16") (V4SF "32")]) (define_mode_attr earlyclobber_32 [(V16QI "=w") (V8HI "=w") (V4SI "=&w") (V8HF "=w") (V4SF "=&w")]) +(define_mode_attr MVE_VPRED [(V16QI "V16BI") (V8HI "V8BI") (V4SI "V4BI") + (V2DI "HI") (V8HF "V8BI") (V4SF "V4BI")]) +(define_mode_attr MVE_vpred [(V16QI "v16bi") (V8HI "v8bi") (V4SI "v4bi") + (V2DI "hi") (V8HF "v8bi") (V4SF "v4bi")]) ;;---------------------------------------------------------------------------- ;; Code attributes diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index 5c3b34d..983aa10 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -839,8 +839,8 @@ ;; (define_insn "@mve_vcmpq_" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (MVE_COMPARISONS:HI (match_operand:MVE_2 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (MVE_COMPARISONS: (match_operand:MVE_2 1 "s_register_operand" "w") (match_operand:MVE_2 2 "s_register_operand" "w"))) ] "TARGET_HAVE_MVE" @@ -1929,8 +1929,8 @@ ;; (define_insn "@mve_vcmpq_f" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (MVE_FP_COMPARISONS:HI (match_operand:MVE_0 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (MVE_FP_COMPARISONS: (match_operand:MVE_0 1 "s_register_operand" "w") (match_operand:MVE_0 2 "s_register_operand" "w"))) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -3324,7 +3324,7 @@ (set (match_operand:MVE_1 0 "s_register_operand" "=w") (unspec:MVE_1 [(match_operand:MVE_1 1 "s_register_operand" "w") (match_operand:MVE_1 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VPSELQ)) ] "TARGET_HAVE_MVE" @@ -4419,7 +4419,7 @@ (set (match_operand:MVE_0 0 "s_register_operand" "=w") (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VPSELQ_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -10516,3 +10516,14 @@ "vldr.\t%q0, %E1" [(set_attr "type" "mve_load")] ) + +;; Expander for VxBI moves +(define_expand "mov" + [(set (match_operand:MVE_7 0 "nonimmediate_operand") + (match_operand:MVE_7 1 "general_operand"))] + "TARGET_HAVE_MVE" + { + if (!register_operand (operands[0], mode)) + operands[1] = force_reg (mode, operands[1]); + } +) diff --git a/gcc/config/arm/vfp.md b/gcc/config/arm/vfp.md index f5ccb92..f00d1ca 100644 --- a/gcc/config/arm/vfp.md +++ b/gcc/config/arm/vfp.md @@ -73,21 +73,26 @@ (define_insn "*thumb2_movhi_vfp" [(set - (match_operand:HI 0 "nonimmediate_operand" + (match_operand:MVE_7_HI 0 "nonimmediate_operand" "=rk, r, l, r, m, r, *t, r, *t, Up, r") - (match_operand:HI 1 "general_operand" - 
"rk, I, Py, n, r, m, r, *t, *t, r, Up"))] + (match_operand:MVE_7_HI 1 "general_operand" + "rk, IDB, Py, n, r, m, r, *t, *t, r, Up"))] "TARGET_THUMB2 && TARGET_VFP_BASE && !TARGET_VFP_FP16INST - && (register_operand (operands[0], HImode) - || register_operand (operands[1], HImode))" + && (register_operand (operands[0], mode) + || register_operand (operands[1], mode))" { switch (which_alternative) { case 0: - case 1: case 2: return "mov%?\t%0, %1\t%@ movhi"; + case 1: + if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_VECTOR_BOOL) + operands[1] = mve_const_bool_vec_to_hi (operands[1]); + else + operands[1] = gen_lowpart (HImode, operands[1]); + return "mov%?\t%0, %1\t%@ movhi"; case 3: return "movw%?\t%0, %L1\t%@ movhi"; case 4: @@ -173,20 +178,25 @@ (define_insn "*thumb2_movhi_fp16" [(set - (match_operand:HI 0 "nonimmediate_operand" + (match_operand:MVE_7_HI 0 "nonimmediate_operand" "=rk, r, l, r, m, r, *t, r, *t, Up, r") - (match_operand:HI 1 "general_operand" - "rk, I, Py, n, r, m, r, *t, *t, r, Up"))] + (match_operand:MVE_7_HI 1 "general_operand" + "rk, IDB, Py, n, r, m, r, *t, *t, r, Up"))] "TARGET_THUMB2 && (TARGET_VFP_FP16INST || TARGET_HAVE_MVE) - && (register_operand (operands[0], HImode) - || register_operand (operands[1], HImode))" + && (register_operand (operands[0], mode) + || register_operand (operands[1], mode))" { switch (which_alternative) { case 0: - case 1: case 2: return "mov%?\t%0, %1\t%@ movhi"; + case 1: + if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_VECTOR_BOOL) + operands[1] = mve_const_bool_vec_to_hi (operands[1]); + else + operands[1] = gen_lowpart (HImode, operands[1]); + return "mov%?\t%0, %1\t%@ movhi"; case 3: return "movw%?\t%0, %L1\t%@ movhi"; case 4: -- cgit v1.1 From df0e57c2c032cea0f77f2e68231c035f282b26d6 Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Wed, 20 Oct 2021 15:30:16 +0000 Subject: arm: Fix vcond_mask expander for MVE (PR target/100757) The problem in this PR is that we call VPSEL with a mask of vector type instead of HImode. This happens because operand 3 in vcond_mask is the pre-computed vector comparison and has vector type. This patch fixes it by implementing TARGET_VECTORIZE_GET_MASK_MODE, returning the appropriate VxBI mode when targeting MVE. In turn, this implies implementing vec_cmp, vec_cmpu and vcond_mask_, and we can move vec_cmp, vec_cmpu and vcond_mask_ back to neon.md since they are not used by MVE anymore. The new * patterns listed above are implemented in mve.md since they are only valid for MVE. However this may make maintenance/comparison more painful than having all of them in vec-common.md. In the process, we can get rid of the recently added vcond_mve parameter of arm_expand_vector_compare. Compared to neon.md's vcond_mask_ before my "arm: Auto-vectorization for MVE: vcmp" patch (r12-834), it keeps the VDQWH iterator added in r12-835 (to have V4HF/V8HF support), as well as the (! || flag_unsafe_math_optimizations) condition which was not present before r12-834 although SF modes were enabled by VDQW (I think this was a bug). Using TARGET_VECTORIZE_GET_MASK_MODE has the advantage that we no longer need to generate vpsel with vectors of 0 and 1: the masks are now merged via scalar 'ands' instructions operating on 16-bit masks after converting the boolean vectors. In addition, this patch fixes a problem in arm_expand_vcond() where the result would be a vector of 0 or 1 instead of operand 1 or 2. Since we want to skip gcc.dg/signbit-2.c for MVE, we also add a new arm_mve effective target. 
Reducing the number of iterations in pr100757-3.c from 32 to 8, we generate the code below: float a[32]; float fn1(int d) { float c = 4.0f; for (int b = 0; b < 8; b++) if (a[b] != 2.0f) c = 5.0f; return c; } fn1: ldr r3, .L3+48 vldr.64 d4, .L3 // q2=(2.0,2.0,2.0,2.0) vldr.64 d5, .L3+8 vldrw.32 q0, [r3] // q0=a(0..3) adds r3, r3, #16 vcmp.f32 eq, q0, q2 // cmp a(0..3) == (2.0,2.0,2.0,2.0) vldrw.32 q1, [r3] // q1=a(4..7) vmrs r3, P0 vcmp.f32 eq, q1, q2 // cmp a(4..7) == (2.0,2.0,2.0,2.0) vmrs r2, P0 @ movhi ands r3, r3, r2 // r3=select(a(0..3]) & select(a(4..7)) vldr.64 d4, .L3+16 // q2=(5.0,5.0,5.0,5.0) vldr.64 d5, .L3+24 vmsr P0, r3 vldr.64 d6, .L3+32 // q3=(4.0,4.0,4.0,4.0) vldr.64 d7, .L3+40 vpsel q3, q3, q2 // q3=vcond_mask(4.0,5.0) vmov.32 r2, q3[1] // keep the scalar max vmov.32 r0, q3[3] vmov.32 r3, q3[2] vmov.f32 s11, s12 vmov s15, r2 vmov s14, r3 vmaxnm.f32 s15, s11, s15 vmaxnm.f32 s15, s15, s14 vmov s14, r0 vmaxnm.f32 s15, s15, s14 vmov r0, s15 bx lr .L4: .align 3 .L3: .word 1073741824 // 2.0f .word 1073741824 .word 1073741824 .word 1073741824 .word 1084227584 // 5.0f .word 1084227584 .word 1084227584 .word 1084227584 .word 1082130432 // 4.0f .word 1082130432 .word 1082130432 .word 1082130432 This patch adds tests that trigger an ICE without this fix. The pr100757*.c testcases are derived from gcc.c-torture/compile/20160205-1.c, forcing the use of MVE, and using various types and return values different from 0 and 1 to avoid commonalization with boolean masks. In addition, since we should not need these masks, the tests make sure they are not present. Most of the work of this patch series was carried out while I was working at STMicroelectronics as a Linaro assignee. 2022-02-22 Christophe Lyon PR target/100757 gcc/ * config/arm/arm-protos.h (arm_get_mask_mode): New prototype. (arm_expand_vector_compare): Update prototype. * config/arm/arm.cc (TARGET_VECTORIZE_GET_MASK_MODE): New. (arm_vector_mode_supported_p): Add support for VxBI modes. (arm_expand_vector_compare): Remove useless generation of vpsel. (arm_expand_vcond): Fix select operands. (arm_get_mask_mode): New. * config/arm/mve.md (vec_cmp): New. (vec_cmpu): New. (vcond_mask_): New. * config/arm/vec-common.md (vec_cmp) (vec_cmpu): Move to ... * config/arm/neon.md (vec_cmp) (vec_cmpu): ... here and disable for MVE. * doc/sourcebuild.texi (arm_mve): Document new effective-target. gcc/testsuite/ PR target/100757 * gcc.target/arm/simd/pr100757-2.c: New. * gcc.target/arm/simd/pr100757-3.c: New. * gcc.target/arm/simd/pr100757-4.c: New. * gcc.target/arm/simd/pr100757.c: New. * gcc.dg/signbit-2.c: Skip when targeting ARM/MVE. * lib/target-supports.exp (check_effective_target_arm_mve): New. 
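The lane-count-to-predicate-mode mapping that arm_get_mask_mode relies on can be summarized by the following stand-alone sketch (illustrative C only; the real hook returns an opt_machine_mode and falls back to default_get_mask_mode when MVE is not enabled):

#include <stdio.h>

static const char *
mve_pred_mode (int nunits)
{
  switch (nunits)
    {
    case 16: return "V16BI";
    case 8:  return "V8BI";
    case 4:  return "V4BI";
    default: return "(no MVE predicate mode)";
    }
}

int
main (void)
{
  printf ("V16QI -> %s\n", mve_pred_mode (16));
  printf ("V8HI  -> %s\n", mve_pred_mode (8));
  printf ("V4SF  -> %s\n", mve_pred_mode (4));
  return 0;
}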
--- gcc/config/arm/arm-protos.h | 3 +- gcc/config/arm/arm.cc | 117 +++++++++++++++---------------------------- gcc/config/arm/mve.md | 51 +++++++++++++++++++ gcc/config/arm/neon.md | 39 +++++++++++++++ gcc/config/arm/vec-common.md | 52 ------------------- 5 files changed, 132 insertions(+), 130 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h index f2f7ca6..9d14209 100644 --- a/gcc/config/arm/arm-protos.h +++ b/gcc/config/arm/arm-protos.h @@ -204,6 +204,7 @@ extern void arm_init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree); extern bool arm_pad_reg_upward (machine_mode, tree, int); #endif extern int arm_apply_result_size (void); +extern opt_machine_mode arm_get_mask_mode (machine_mode mode); #endif /* RTX_CODE */ @@ -380,7 +381,7 @@ extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx, extern bool arm_fusion_enabled_p (tune_params::fuse_ops); extern bool arm_valid_symbolic_address_p (rtx); extern bool arm_validize_comparison (rtx *, rtx *, rtx *); -extern bool arm_expand_vector_compare (rtx, rtx_code, rtx, rtx, bool, bool); +extern bool arm_expand_vector_compare (rtx, rtx_code, rtx, rtx, bool); #endif /* RTX_CODE */ extern bool arm_gen_setmem (rtx *); diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc index df43c67..c1103d9 100644 --- a/gcc/config/arm/arm.cc +++ b/gcc/config/arm/arm.cc @@ -832,6 +832,9 @@ static const struct attribute_spec arm_attribute_table[] = #undef TARGET_STACK_PROTECT_GUARD #define TARGET_STACK_PROTECT_GUARD arm_stack_protect_guard + +#undef TARGET_VECTORIZE_GET_MASK_MODE +#define TARGET_VECTORIZE_GET_MASK_MODE arm_get_mask_mode /* Obstack for minipool constant handling. */ static struct obstack minipool_obstack; @@ -29286,7 +29289,8 @@ arm_vector_mode_supported_p (machine_mode mode) if (TARGET_HAVE_MVE && (mode == V2DImode || mode == V4SImode || mode == V8HImode - || mode == V16QImode)) + || mode == V16QImode + || mode == V16BImode || mode == V8BImode || mode == V4BImode)) return true; if (TARGET_HAVE_MVE_FLOAT @@ -31085,7 +31089,7 @@ arm_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem, } /* Return the mode for the MVE vector of predicates corresponding to MODE. */ -machine_mode +opt_machine_mode arm_mode_to_pred_mode (machine_mode mode) { switch (GET_MODE_NUNITS (mode)) @@ -31094,7 +31098,7 @@ arm_mode_to_pred_mode (machine_mode mode) case 8: return V8BImode; case 4: return V4BImode; } - gcc_unreachable (); + return opt_machine_mode (); } /* Expand code to compare vectors OP0 and OP1 using condition CODE. @@ -31102,16 +31106,12 @@ arm_mode_to_pred_mode (machine_mode mode) and return true if TARGET contains the inverse. If !CAN_INVERT, always store the result in TARGET, never its inverse. - If VCOND_MVE, do not emit the vpsel instruction here, let arm_expand_vcond do - it with the right destination type to avoid emiting two vpsel, one here and - one in arm_expand_vcond. - Note that the handling of floating-point comparisons is not IEEE compliant. */ bool arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1, - bool can_invert, bool vcond_mve) + bool can_invert) { machine_mode cmp_result_mode = GET_MODE (target); machine_mode cmp_mode = GET_MODE (op0); @@ -31140,7 +31140,7 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1, and then store its inverse in TARGET. This avoids reusing TARGET (which for integer NE could be one of the inputs). 
*/ rtx tmp = gen_reg_rtx (cmp_result_mode); - if (arm_expand_vector_compare (tmp, code, op0, op1, true, vcond_mve)) + if (arm_expand_vector_compare (tmp, code, op0, op1, true)) gcc_unreachable (); emit_insn (gen_rtx_SET (target, gen_rtx_NOT (cmp_result_mode, tmp))); return false; @@ -31176,36 +31176,22 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1, case NE: if (TARGET_HAVE_MVE) { - rtx vpr_p0; - if (vcond_mve) - vpr_p0 = target; - else - vpr_p0 = gen_reg_rtx (arm_mode_to_pred_mode (cmp_mode)); - switch (GET_MODE_CLASS (cmp_mode)) { case MODE_VECTOR_INT: - emit_insn (gen_mve_vcmpq (code, cmp_mode, vpr_p0, op0, force_reg (cmp_mode, op1))); + emit_insn (gen_mve_vcmpq (code, cmp_mode, target, + op0, force_reg (cmp_mode, op1))); break; case MODE_VECTOR_FLOAT: if (TARGET_HAVE_MVE_FLOAT) - emit_insn (gen_mve_vcmpq_f (code, cmp_mode, vpr_p0, op0, force_reg (cmp_mode, op1))); + emit_insn (gen_mve_vcmpq_f (code, cmp_mode, target, + op0, force_reg (cmp_mode, op1))); else gcc_unreachable (); break; default: gcc_unreachable (); } - - /* If we are not expanding a vcond, build the result here. */ - if (!vcond_mve) - { - rtx zero = gen_reg_rtx (cmp_result_mode); - rtx one = gen_reg_rtx (cmp_result_mode); - emit_move_insn (zero, CONST0_RTX (cmp_result_mode)); - emit_move_insn (one, CONST1_RTX (cmp_result_mode)); - emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, target, one, zero, vpr_p0)); - } } else emit_insn (gen_neon_vc (code, cmp_mode, target, op0, op1)); @@ -31217,23 +31203,8 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1, case GEU: case GTU: if (TARGET_HAVE_MVE) - { - rtx vpr_p0; - if (vcond_mve) - vpr_p0 = target; - else - vpr_p0 = gen_reg_rtx (arm_mode_to_pred_mode (cmp_mode)); - - emit_insn (gen_mve_vcmpq (code, cmp_mode, vpr_p0, op0, force_reg (cmp_mode, op1))); - if (!vcond_mve) - { - rtx zero = gen_reg_rtx (cmp_result_mode); - rtx one = gen_reg_rtx (cmp_result_mode); - emit_move_insn (zero, CONST0_RTX (cmp_result_mode)); - emit_move_insn (one, CONST1_RTX (cmp_result_mode)); - emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, target, one, zero, vpr_p0)); - } - } + emit_insn (gen_mve_vcmpq (code, cmp_mode, target, + op0, force_reg (cmp_mode, op1))); else emit_insn (gen_neon_vc (code, cmp_mode, target, op0, force_reg (cmp_mode, op1))); @@ -31244,23 +31215,8 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1, case LEU: case LTU: if (TARGET_HAVE_MVE) - { - rtx vpr_p0; - if (vcond_mve) - vpr_p0 = target; - else - vpr_p0 = gen_reg_rtx (arm_mode_to_pred_mode (cmp_mode)); - - emit_insn (gen_mve_vcmpq (swap_condition (code), cmp_mode, vpr_p0, force_reg (cmp_mode, op1), op0)); - if (!vcond_mve) - { - rtx zero = gen_reg_rtx (cmp_result_mode); - rtx one = gen_reg_rtx (cmp_result_mode); - emit_move_insn (zero, CONST0_RTX (cmp_result_mode)); - emit_move_insn (one, CONST1_RTX (cmp_result_mode)); - emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, target, one, zero, vpr_p0)); - } - } + emit_insn (gen_mve_vcmpq (swap_condition (code), cmp_mode, target, + force_reg (cmp_mode, op1), op0)); else emit_insn (gen_neon_vc (swap_condition (code), cmp_mode, target, force_reg (cmp_mode, op1), op0)); @@ -31275,8 +31231,8 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1, rtx gt_res = gen_reg_rtx (cmp_result_mode); rtx alt_res = gen_reg_rtx (cmp_result_mode); rtx_code alt_code = (code == LTGT ? 
LT : LE); - if (arm_expand_vector_compare (gt_res, GT, op0, op1, true, vcond_mve) - || arm_expand_vector_compare (alt_res, alt_code, op0, op1, true, vcond_mve)) + if (arm_expand_vector_compare (gt_res, GT, op0, op1, true) + || arm_expand_vector_compare (alt_res, alt_code, op0, op1, true)) gcc_unreachable (); emit_insn (gen_rtx_SET (target, gen_rtx_IOR (cmp_result_mode, gt_res, alt_res))); @@ -31296,19 +31252,15 @@ arm_expand_vcond (rtx *operands, machine_mode cmp_result_mode) { /* When expanding for MVE, we do not want to emit a (useless) vpsel in arm_expand_vector_compare, and another one here. */ - bool vcond_mve=false; rtx mask; if (TARGET_HAVE_MVE) - { - vcond_mve=true; - mask = gen_reg_rtx (arm_mode_to_pred_mode (cmp_result_mode)); - } + mask = gen_reg_rtx (arm_mode_to_pred_mode (cmp_result_mode).require ()); else mask = gen_reg_rtx (cmp_result_mode); bool inverted = arm_expand_vector_compare (mask, GET_CODE (operands[3]), - operands[4], operands[5], true, vcond_mve); + operands[4], operands[5], true); if (inverted) std::swap (operands[1], operands[2]); if (TARGET_NEON) @@ -31316,20 +31268,20 @@ arm_expand_vcond (rtx *operands, machine_mode cmp_result_mode) mask, operands[1], operands[2])); else { - machine_mode cmp_mode = GET_MODE (operands[4]); - rtx vpr_p0 = mask; - rtx zero = gen_reg_rtx (cmp_mode); - rtx one = gen_reg_rtx (cmp_mode); - emit_move_insn (zero, CONST0_RTX (cmp_mode)); - emit_move_insn (one, CONST1_RTX (cmp_mode)); + machine_mode cmp_mode = GET_MODE (operands[0]); + switch (GET_MODE_CLASS (cmp_mode)) { case MODE_VECTOR_INT: - emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, operands[0], one, zero, vpr_p0)); + emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_mode, operands[0], + operands[1], operands[2], mask)); break; case MODE_VECTOR_FLOAT: if (TARGET_HAVE_MVE_FLOAT) - emit_insn (gen_mve_vpselq_f (cmp_mode, operands[0], one, zero, vpr_p0)); + emit_insn (gen_mve_vpselq_f (cmp_mode, operands[0], + operands[1], operands[2], mask)); + else + gcc_unreachable (); break; default: gcc_unreachable (); @@ -34251,4 +34203,15 @@ arm_mode_base_reg_class (machine_mode mode) struct gcc_target targetm = TARGET_INITIALIZER; +/* Implement TARGET_VECTORIZE_GET_MASK_MODE. */ + +opt_machine_mode +arm_get_mask_mode (machine_mode mode) +{ + if (TARGET_HAVE_MVE) + return arm_mode_to_pred_mode (mode); + + return default_get_mask_mode (mode); +} + #include "gt-arm.h" diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index 983aa10..d0c3100 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -10527,3 +10527,54 @@ operands[1] = force_reg (mode, operands[1]); } ) + +;; Expanders for vec_cmp and vcond + +(define_expand "vec_cmp" + [(set (match_operand: 0 "s_register_operand") + (match_operator: 1 "comparison_operator" + [(match_operand:MVE_VLD_ST 2 "s_register_operand") + (match_operand:MVE_VLD_ST 3 "reg_or_zero_operand")]))] + "TARGET_HAVE_MVE + && (! 
|| flag_unsafe_math_optimizations)" +{ + arm_expand_vector_compare (operands[0], GET_CODE (operands[1]), + operands[2], operands[3], false); + DONE; +}) + +(define_expand "vec_cmpu" + [(set (match_operand: 0 "s_register_operand") + (match_operator: 1 "comparison_operator" + [(match_operand:MVE_2 2 "s_register_operand") + (match_operand:MVE_2 3 "reg_or_zero_operand")]))] + "TARGET_HAVE_MVE" +{ + arm_expand_vector_compare (operands[0], GET_CODE (operands[1]), + operands[2], operands[3], false); + DONE; +}) + +(define_expand "vcond_mask_" + [(set (match_operand:MVE_VLD_ST 0 "s_register_operand") + (if_then_else:MVE_VLD_ST + (match_operand: 3 "s_register_operand") + (match_operand:MVE_VLD_ST 1 "s_register_operand") + (match_operand:MVE_VLD_ST 2 "s_register_operand")))] + "TARGET_HAVE_MVE" +{ + switch (GET_MODE_CLASS (mode)) + { + case MODE_VECTOR_INT: + emit_insn (gen_mve_vpselq (VPSELQ_S, mode, operands[0], + operands[1], operands[2], operands[3])); + break; + case MODE_VECTOR_FLOAT: + emit_insn (gen_mve_vpselq_f (mode, operands[0], + operands[1], operands[2], operands[3])); + break; + default: + gcc_unreachable (); + } + DONE; +}) diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 2b9a3de..f270ded 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -1394,6 +1394,45 @@ [(set_attr "type" "neon_qsub")] ) +(define_expand "vec_cmp" + [(set (match_operand: 0 "s_register_operand") + (match_operator: 1 "comparison_operator" + [(match_operand:VDQWH 2 "s_register_operand") + (match_operand:VDQWH 3 "reg_or_zero_operand")]))] + "TARGET_NEON + && (! || flag_unsafe_math_optimizations)" +{ + arm_expand_vector_compare (operands[0], GET_CODE (operands[1]), + operands[2], operands[3], false); + DONE; +}) + +(define_expand "vec_cmpu" + [(set (match_operand:VDQIW 0 "s_register_operand") + (match_operator:VDQIW 1 "comparison_operator" + [(match_operand:VDQIW 2 "s_register_operand") + (match_operand:VDQIW 3 "reg_or_zero_operand")]))] + "TARGET_NEON" +{ + arm_expand_vector_compare (operands[0], GET_CODE (operands[1]), + operands[2], operands[3], false); + DONE; +}) + +(define_expand "vcond_mask_" + [(set (match_operand:VDQWH 0 "s_register_operand") + (if_then_else:VDQWH + (match_operand: 3 "s_register_operand") + (match_operand:VDQWH 1 "s_register_operand") + (match_operand:VDQWH 2 "s_register_operand")))] + "TARGET_NEON + && (! || flag_unsafe_math_optimizations)" +{ + emit_insn (gen_neon_vbsl (operands[0], operands[3], operands[1], + operands[2])); + DONE; +}) + ;; Patterns for builtins. ; good for plain vadd, vaddq. diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md index 2718d82..f130090 100644 --- a/gcc/config/arm/vec-common.md +++ b/gcc/config/arm/vec-common.md @@ -363,33 +363,6 @@ } }) -(define_expand "vec_cmp" - [(set (match_operand: 0 "s_register_operand") - (match_operator: 1 "comparison_operator" - [(match_operand:VDQWH 2 "s_register_operand") - (match_operand:VDQWH 3 "reg_or_zero_operand")]))] - "ARM_HAVE__ARITH - && !TARGET_REALLY_IWMMXT - && (! 
|| flag_unsafe_math_optimizations)"
-{
-  arm_expand_vector_compare (operands[0], GET_CODE (operands[1]),
-                             operands[2], operands[3], false, false);
-  DONE;
-})
-
-(define_expand "vec_cmpu"
-  [(set (match_operand:VDQIW 0 "s_register_operand")
-        (match_operator:VDQIW 1 "comparison_operator"
-          [(match_operand:VDQIW 2 "s_register_operand")
-           (match_operand:VDQIW 3 "reg_or_zero_operand")]))]
-  "ARM_HAVE__ARITH
-   && !TARGET_REALLY_IWMMXT"
-{
-  arm_expand_vector_compare (operands[0], GET_CODE (operands[1]),
-                             operands[2], operands[3], false, false);
-  DONE;
-})
-
 ;; Conditional instructions.  These are comparisons with conditional moves for
 ;; vectors.  They perform the assignment:
 ;;
@@ -461,31 +434,6 @@
   DONE;
 })
 
-(define_expand "vcond_mask_"
-  [(set (match_operand:VDQWH 0 "s_register_operand")
-        (if_then_else:VDQWH
-          (match_operand: 3 "s_register_operand")
-          (match_operand:VDQWH 1 "s_register_operand")
-          (match_operand:VDQWH 2 "s_register_operand")))]
-  "ARM_HAVE__ARITH
-   && !TARGET_REALLY_IWMMXT
-   && (! || flag_unsafe_math_optimizations)"
-{
-  if (TARGET_NEON)
-    {
-      emit_insn (gen_neon_vbsl (mode, operands[0], operands[3],
-                                operands[1], operands[2]));
-    }
-  else if (TARGET_HAVE_MVE)
-    {
-      emit_insn (gen_mve_vpselq (VPSELQ_S, mode, operands[0],
-                                 operands[1], operands[2], operands[3]));
-    }
-  else
-    gcc_unreachable ();
-  DONE;
-})
-
 (define_expand "vec_load_lanesoi"
   [(set (match_operand:OI 0 "s_register_operand")
         (unspec:OI [(match_operand:OI 1 "neon_struct_operand")
-- 
cgit v1.1

From e6a4aefce8e47a7d3ba781066a1410ebfa963e59 Mon Sep 17 00:00:00 2001
From: Christophe Lyon
Date: Wed, 13 Oct 2021 09:16:35 +0000
Subject: arm: Convert remaining MVE vcmp builtins to predicate qualifiers

This is mostly a mechanical change, only tested by the intrinsics
expansion tests.

Most of the work of this patch series was carried out while I was
working at STMicroelectronics as a Linaro assignee.

2022-02-22  Christophe Lyon

gcc/
        PR target/100757
        PR target/101325
        * config/arm/arm-builtins.cc (BINOP_UNONE_NONE_NONE_QUALIFIERS):
        Delete.
        (TERNOP_UNONE_NONE_NONE_UNONE_QUALIFIERS): Change to ...
        (TERNOP_PRED_NONE_NONE_PRED_QUALIFIERS): ... this.
        (TERNOP_PRED_UNONE_UNONE_PRED_QUALIFIERS): New.
        * config/arm/arm_mve_builtins.def (vcmp*q_n_, vcmp*q_m_f): Use new
        predicated qualifiers.
        * config/arm/mve.md (mve_vcmpq_n_)
        (mve_vcmp*q_m_f): Use MVE_VPRED instead of HI.
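At the source level, the builtins converted here back the predicated
ACLE MVE intrinsics, where both the incoming mask and the comparison
result are predicates rather than plain unsigned integers, which is
exactly what the new *_PRED qualifiers describe.  A minimal usage
sketch (an illustration, not part of the patch; it assumes arm_mve.h
and an MVE-enabled compiler, e.g. -march=armv8.1-m.main+mve.fp
-mfloat-abi=hard):

    /* Minimal sketch of the intrinsics backed by the retyped builtins.  */
    #include <arm_mve.h>

    float32x4_t
    select_eq (float32x4_t a, float32x4_t b,
               float32x4_t x, float32x4_t y, mve_pred16_t p)
    {
      /* Predicated compare: mask input and result are both predicates,
         now modelled with predicate qualifiers instead of unsigned HImode.  */
      mve_pred16_t m = vcmpeqq_m_f32 (a, b, p);
      /* Lane select under the predicate.  */
      return vpselq_f32 (x, y, m);
    }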
--- gcc/config/arm/arm-builtins.cc | 21 +++-- gcc/config/arm/arm_mve_builtins.def | 92 +++++++++---------- gcc/config/arm/mve.md | 176 ++++++++++++++++++------------------ 3 files changed, 145 insertions(+), 144 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm-builtins.cc b/gcc/config/arm/arm-builtins.cc index 1c6b9c9..02411c6 100644 --- a/gcc/config/arm/arm-builtins.cc +++ b/gcc/config/arm/arm-builtins.cc @@ -439,12 +439,6 @@ arm_binop_none_none_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] (arm_binop_none_none_unone_qualifiers) static enum arm_type_qualifiers -arm_binop_unone_none_none_qualifiers[SIMD_MAX_BUILTIN_ARGS] - = { qualifier_unsigned, qualifier_none, qualifier_none }; -#define BINOP_UNONE_NONE_NONE_QUALIFIERS \ - (arm_binop_unone_none_none_qualifiers) - -static enum arm_type_qualifiers arm_binop_pred_none_none_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_predicate, qualifier_none, qualifier_none }; #define BINOP_PRED_NONE_NONE_QUALIFIERS \ @@ -504,10 +498,10 @@ arm_ternop_unone_unone_imm_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] (arm_ternop_unone_unone_imm_unone_qualifiers) static enum arm_type_qualifiers -arm_ternop_unone_none_none_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] - = { qualifier_unsigned, qualifier_none, qualifier_none, qualifier_unsigned }; -#define TERNOP_UNONE_NONE_NONE_UNONE_QUALIFIERS \ - (arm_ternop_unone_none_none_unone_qualifiers) +arm_ternop_pred_none_none_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_predicate, qualifier_none, qualifier_none, qualifier_predicate }; +#define TERNOP_PRED_NONE_NONE_PRED_QUALIFIERS \ + (arm_ternop_pred_none_none_pred_qualifiers) static enum arm_type_qualifiers arm_ternop_none_none_none_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS] @@ -554,6 +548,13 @@ arm_ternop_unone_unone_unone_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] (arm_ternop_unone_unone_unone_pred_qualifiers) static enum arm_type_qualifiers +arm_ternop_pred_unone_unone_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_predicate, qualifier_unsigned, qualifier_unsigned, + qualifier_predicate }; +#define TERNOP_PRED_UNONE_UNONE_PRED_QUALIFIERS \ + (arm_ternop_pred_unone_unone_pred_qualifiers) + +static enum arm_type_qualifiers arm_ternop_none_none_none_none_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_none, qualifier_none, qualifier_none, qualifier_none }; #define TERNOP_NONE_NONE_NONE_NONE_QUALIFIERS \ diff --git a/gcc/config/arm/arm_mve_builtins.def b/gcc/config/arm/arm_mve_builtins.def index 44b41ea..b7ebbca 100644 --- a/gcc/config/arm/arm_mve_builtins.def +++ b/gcc/config/arm/arm_mve_builtins.def @@ -118,9 +118,9 @@ VAR3 (BINOP_UNONE_UNONE_UNONE, vhaddq_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vhaddq_n_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, veorq_u, v16qi, v8hi, v4si) VAR3 (BINOP_PRED_UNONE_UNONE, vcmphiq_, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_UNONE_UNONE, vcmphiq_n_, v16qi, v8hi, v4si) +VAR3 (BINOP_PRED_UNONE_UNONE, vcmphiq_n_, v16qi, v8hi, v4si) VAR3 (BINOP_PRED_UNONE_UNONE, vcmpcsq_, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpcsq_n_, v16qi, v8hi, v4si) +VAR3 (BINOP_PRED_UNONE_UNONE, vcmpcsq_n_, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vbicq_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vandq_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vaddvq_p_u, v16qi, v8hi, v4si) @@ -142,17 +142,17 @@ VAR3 (BINOP_UNONE_UNONE_NONE, vbrsrq_n_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_IMM, vshlq_n_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_IMM, vrshrq_n_u, v16qi, v8hi, v4si) VAR3 
(BINOP_UNONE_UNONE_IMM, vqshlq_n_u, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_NONE_NONE, vcmpneq_n_, v16qi, v8hi, v4si) +VAR3 (BINOP_PRED_NONE_NONE, vcmpneq_n_, v16qi, v8hi, v4si) VAR3 (BINOP_PRED_NONE_NONE, vcmpltq_, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_NONE_NONE, vcmpltq_n_, v16qi, v8hi, v4si) +VAR3 (BINOP_PRED_NONE_NONE, vcmpltq_n_, v16qi, v8hi, v4si) VAR3 (BINOP_PRED_NONE_NONE, vcmpleq_, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_NONE_NONE, vcmpleq_n_, v16qi, v8hi, v4si) +VAR3 (BINOP_PRED_NONE_NONE, vcmpleq_n_, v16qi, v8hi, v4si) VAR3 (BINOP_PRED_NONE_NONE, vcmpgtq_, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_NONE_NONE, vcmpgtq_n_, v16qi, v8hi, v4si) +VAR3 (BINOP_PRED_NONE_NONE, vcmpgtq_n_, v16qi, v8hi, v4si) VAR3 (BINOP_PRED_NONE_NONE, vcmpgeq_, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_NONE_NONE, vcmpgeq_n_, v16qi, v8hi, v4si) +VAR3 (BINOP_PRED_NONE_NONE, vcmpgeq_n_, v16qi, v8hi, v4si) VAR3 (BINOP_PRED_NONE_NONE, vcmpeqq_, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_NONE_NONE, vcmpeqq_n_, v16qi, v8hi, v4si) +VAR3 (BINOP_PRED_NONE_NONE, vcmpeqq_n_, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_NONE_IMM, vqshluq_n_s, v16qi, v8hi, v4si) VAR3 (BINOP_NONE_NONE_UNONE, vaddvq_p_s, v16qi, v8hi, v4si) VAR3 (BINOP_NONE_NONE_NONE, vsubq_s, v16qi, v8hi, v4si) @@ -218,17 +218,17 @@ VAR2 (BINOP_UNONE_UNONE_IMM, vshlltq_n_u, v16qi, v8hi) VAR2 (BINOP_UNONE_UNONE_IMM, vshllbq_n_u, v16qi, v8hi) VAR2 (BINOP_UNONE_UNONE_IMM, vorrq_n_u, v8hi, v4si) VAR2 (BINOP_UNONE_UNONE_IMM, vbicq_n_u, v8hi, v4si) -VAR2 (BINOP_UNONE_NONE_NONE, vcmpneq_n_f, v8hf, v4sf) +VAR2 (BINOP_PRED_NONE_NONE, vcmpneq_n_f, v8hf, v4sf) VAR2 (BINOP_PRED_NONE_NONE, vcmpneq_f, v8hf, v4sf) -VAR2 (BINOP_UNONE_NONE_NONE, vcmpltq_n_f, v8hf, v4sf) +VAR2 (BINOP_PRED_NONE_NONE, vcmpltq_n_f, v8hf, v4sf) VAR2 (BINOP_PRED_NONE_NONE, vcmpltq_f, v8hf, v4sf) -VAR2 (BINOP_UNONE_NONE_NONE, vcmpleq_n_f, v8hf, v4sf) +VAR2 (BINOP_PRED_NONE_NONE, vcmpleq_n_f, v8hf, v4sf) VAR2 (BINOP_PRED_NONE_NONE, vcmpleq_f, v8hf, v4sf) -VAR2 (BINOP_UNONE_NONE_NONE, vcmpgtq_n_f, v8hf, v4sf) +VAR2 (BINOP_PRED_NONE_NONE, vcmpgtq_n_f, v8hf, v4sf) VAR2 (BINOP_PRED_NONE_NONE, vcmpgtq_f, v8hf, v4sf) -VAR2 (BINOP_UNONE_NONE_NONE, vcmpgeq_n_f, v8hf, v4sf) +VAR2 (BINOP_PRED_NONE_NONE, vcmpgeq_n_f, v8hf, v4sf) VAR2 (BINOP_PRED_NONE_NONE, vcmpgeq_f, v8hf, v4sf) -VAR2 (BINOP_UNONE_NONE_NONE, vcmpeqq_n_f, v8hf, v4sf) +VAR2 (BINOP_PRED_NONE_NONE, vcmpeqq_n_f, v8hf, v4sf) VAR2 (BINOP_PRED_NONE_NONE, vcmpeqq_f, v8hf, v4sf) VAR2 (BINOP_NONE_NONE_NONE, vsubq_f, v8hf, v4sf) VAR2 (BINOP_NONE_NONE_NONE, vqmovntq_s, v8hi, v4si) @@ -285,7 +285,7 @@ VAR1 (TERNOP_NONE_NONE_NONE_NONE, vrmlaldavhaq_s, v4si) VAR1 (TERNOP_UNONE_UNONE_UNONE_UNONE, vrmlaldavhaq_u, v4si) VAR2 (TERNOP_NONE_NONE_UNONE_UNONE, vcvtq_m_to_f_u, v8hf, v4sf) VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vcvtq_m_to_f_s, v8hf, v4sf) -VAR2 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpeqq_m_f, v8hf, v4sf) +VAR2 (TERNOP_PRED_NONE_NONE_PRED, vcmpeqq_m_f, v8hf, v4sf) VAR3 (TERNOP_UNONE_NONE_UNONE_IMM, vshlcq_carry_s, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_IMM, vshlcq_carry_u, v16qi, v8hi, v4si) VAR2 (TERNOP_UNONE_UNONE_NONE_IMM, vqrshrunbq_n_s, v8hi, v4si) @@ -306,14 +306,14 @@ VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vmladavaq_u, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vminvq_p_u, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vmaxvq_p_u, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vdupq_m_n_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vcmpneq_m_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vcmpneq_m_n_u, 
v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vcmphiq_m_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vcmphiq_m_n_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vcmpeqq_m_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vcmpeqq_m_n_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vcmpcsq_m_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vcmpcsq_m_n_u, v16qi, v8hi, v4si) +VAR3 (TERNOP_PRED_UNONE_UNONE_PRED, vcmpneq_m_u, v16qi, v8hi, v4si) +VAR3 (TERNOP_PRED_UNONE_UNONE_PRED, vcmpneq_m_n_u, v16qi, v8hi, v4si) +VAR3 (TERNOP_PRED_UNONE_UNONE_PRED, vcmphiq_m_u, v16qi, v8hi, v4si) +VAR3 (TERNOP_PRED_UNONE_UNONE_PRED, vcmphiq_m_n_u, v16qi, v8hi, v4si) +VAR3 (TERNOP_PRED_UNONE_UNONE_PRED, vcmpeqq_m_u, v16qi, v8hi, v4si) +VAR3 (TERNOP_PRED_UNONE_UNONE_PRED, vcmpeqq_m_n_u, v16qi, v8hi, v4si) +VAR3 (TERNOP_PRED_UNONE_UNONE_PRED, vcmpcsq_m_u, v16qi, v8hi, v4si) +VAR3 (TERNOP_PRED_UNONE_UNONE_PRED, vcmpcsq_m_n_u, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vclzq_m_u, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vaddvaq_p_u, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_IMM, vsriq_n_u, v16qi, v8hi, v4si) @@ -326,18 +326,18 @@ VAR3 (TERNOP_UNONE_UNONE_NONE_UNONE, vminavq_p_s, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_NONE_UNONE, vminaq_m_s, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_NONE_UNONE, vmaxavq_p_s, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_NONE_UNONE, vmaxaq_m_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpneq_m_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpneq_m_n_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpltq_m_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpltq_m_n_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpleq_m_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpleq_m_n_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpgtq_m_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpgtq_m_n_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpgeq_m_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpgeq_m_n_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpeqq_m_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpeqq_m_n_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_PRED_NONE_NONE_PRED, vcmpneq_m_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_PRED_NONE_NONE_PRED, vcmpneq_m_n_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_PRED_NONE_NONE_PRED, vcmpltq_m_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_PRED_NONE_NONE_PRED, vcmpltq_m_n_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_PRED_NONE_NONE_PRED, vcmpleq_m_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_PRED_NONE_NONE_PRED, vcmpleq_m_n_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_PRED_NONE_NONE_PRED, vcmpgtq_m_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_PRED_NONE_NONE_PRED, vcmpgtq_m_n_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_PRED_NONE_NONE_PRED, vcmpgeq_m_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_PRED_NONE_NONE_PRED, vcmpgeq_m_n_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_PRED_NONE_NONE_PRED, vcmpeqq_m_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_PRED_NONE_NONE_PRED, vcmpeqq_m_n_s, v16qi, v8hi, v4si) VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vshlq_m_r_s, v16qi, v8hi, v4si) VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vrshlq_m_n_s, v16qi, v8hi, v4si) VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vrev64q_m_s, v16qi, v8hi, v4si) @@ -405,17 +405,17 @@ VAR2 (TERNOP_UNONE_UNONE_NONE_IMM, vqshrunbq_n_s, v8hi, v4si) VAR2 (TERNOP_UNONE_UNONE_NONE_IMM, vqrshruntq_n_s, v8hi, v4si) VAR2 (TERNOP_UNONE_UNONE_IMM_UNONE, 
vorrq_m_n_u, v8hi, v4si) VAR2 (TERNOP_UNONE_UNONE_IMM_UNONE, vmvnq_m_n_u, v8hi, v4si) -VAR2 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpneq_m_n_f, v8hf, v4sf) -VAR2 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpneq_m_f, v8hf, v4sf) -VAR2 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpltq_m_n_f, v8hf, v4sf) -VAR2 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpltq_m_f, v8hf, v4sf) -VAR2 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpleq_m_n_f, v8hf, v4sf) -VAR2 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpleq_m_f, v8hf, v4sf) -VAR2 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpgtq_m_n_f, v8hf, v4sf) -VAR2 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpgtq_m_f, v8hf, v4sf) -VAR2 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpgeq_m_n_f, v8hf, v4sf) -VAR2 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpgeq_m_f, v8hf, v4sf) -VAR2 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpeqq_m_n_f, v8hf, v4sf) +VAR2 (TERNOP_PRED_NONE_NONE_PRED, vcmpneq_m_n_f, v8hf, v4sf) +VAR2 (TERNOP_PRED_NONE_NONE_PRED, vcmpneq_m_f, v8hf, v4sf) +VAR2 (TERNOP_PRED_NONE_NONE_PRED, vcmpltq_m_n_f, v8hf, v4sf) +VAR2 (TERNOP_PRED_NONE_NONE_PRED, vcmpltq_m_f, v8hf, v4sf) +VAR2 (TERNOP_PRED_NONE_NONE_PRED, vcmpleq_m_n_f, v8hf, v4sf) +VAR2 (TERNOP_PRED_NONE_NONE_PRED, vcmpleq_m_f, v8hf, v4sf) +VAR2 (TERNOP_PRED_NONE_NONE_PRED, vcmpgtq_m_n_f, v8hf, v4sf) +VAR2 (TERNOP_PRED_NONE_NONE_PRED, vcmpgtq_m_f, v8hf, v4sf) +VAR2 (TERNOP_PRED_NONE_NONE_PRED, vcmpgeq_m_n_f, v8hf, v4sf) +VAR2 (TERNOP_PRED_NONE_NONE_PRED, vcmpgeq_m_f, v8hf, v4sf) +VAR2 (TERNOP_PRED_NONE_NONE_PRED, vcmpeqq_m_n_f, v8hf, v4sf) VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vrndxq_m_f, v8hf, v4sf) VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vrndq_m_f, v8hf, v4sf) VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vrndpq_m_f, v8hf, v4sf) diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index d0c3100..12f05b3 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -853,8 +853,8 @@ ;; (define_insn "mve_vcmpq_n_" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (MVE_COMPARISONS:HI (match_operand:MVE_2 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (MVE_COMPARISONS: (match_operand:MVE_2 1 "s_register_operand" "w") (match_operand: 2 "s_register_operand" "r"))) ] "TARGET_HAVE_MVE" @@ -1943,8 +1943,8 @@ ;; (define_insn "@mve_vcmpq_n_f" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (MVE_FP_COMPARISONS:HI (match_operand:MVE_0 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (MVE_FP_COMPARISONS: (match_operand:MVE_0 1 "s_register_operand" "w") (match_operand: 2 "s_register_operand" "r"))) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -2593,10 +2593,10 @@ ;; (define_insn "mve_vcmpeqq_m_f" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_0 1 "s_register_operand" "w") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPEQQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -2809,10 +2809,10 @@ ;; (define_insn "mve_vcmpcsq_m_n_u" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_2 1 "s_register_operand" "w") (match_operand: 2 "s_register_operand" "r") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPCSQ_M_N_U)) ] "TARGET_HAVE_MVE" @@ -2825,10 
+2825,10 @@ ;; (define_insn "mve_vcmpcsq_m_u" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_2 1 "s_register_operand" "w") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPCSQ_M_U)) ] "TARGET_HAVE_MVE" @@ -2841,10 +2841,10 @@ ;; (define_insn "mve_vcmpeqq_m_n_" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_2 1 "s_register_operand" "w") (match_operand: 2 "s_register_operand" "r") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPEQQ_M_N)) ] "TARGET_HAVE_MVE" @@ -2857,10 +2857,10 @@ ;; (define_insn "mve_vcmpeqq_m_" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_2 1 "s_register_operand" "w") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPEQQ_M)) ] "TARGET_HAVE_MVE" @@ -2873,10 +2873,10 @@ ;; (define_insn "mve_vcmpgeq_m_n_s" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_2 1 "s_register_operand" "w") (match_operand: 2 "s_register_operand" "r") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPGEQ_M_N_S)) ] "TARGET_HAVE_MVE" @@ -2889,10 +2889,10 @@ ;; (define_insn "mve_vcmpgeq_m_s" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_2 1 "s_register_operand" "w") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPGEQ_M_S)) ] "TARGET_HAVE_MVE" @@ -2905,10 +2905,10 @@ ;; (define_insn "mve_vcmpgtq_m_n_s" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_2 1 "s_register_operand" "w") (match_operand: 2 "s_register_operand" "r") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPGTQ_M_N_S)) ] "TARGET_HAVE_MVE" @@ -2921,10 +2921,10 @@ ;; (define_insn "mve_vcmpgtq_m_s" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_2 1 "s_register_operand" "w") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPGTQ_M_S)) ] "TARGET_HAVE_MVE" @@ -2937,10 +2937,10 @@ ;; (define_insn "mve_vcmphiq_m_n_u" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") + (set (match_operand: 0 
"vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_2 1 "s_register_operand" "w") (match_operand: 2 "s_register_operand" "r") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPHIQ_M_N_U)) ] "TARGET_HAVE_MVE" @@ -2953,10 +2953,10 @@ ;; (define_insn "mve_vcmphiq_m_u" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_2 1 "s_register_operand" "w") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPHIQ_M_U)) ] "TARGET_HAVE_MVE" @@ -2969,10 +2969,10 @@ ;; (define_insn "mve_vcmpleq_m_n_s" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_2 1 "s_register_operand" "w") (match_operand: 2 "s_register_operand" "r") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPLEQ_M_N_S)) ] "TARGET_HAVE_MVE" @@ -2985,10 +2985,10 @@ ;; (define_insn "mve_vcmpleq_m_s" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_2 1 "s_register_operand" "w") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPLEQ_M_S)) ] "TARGET_HAVE_MVE" @@ -3001,10 +3001,10 @@ ;; (define_insn "mve_vcmpltq_m_n_s" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_2 1 "s_register_operand" "w") (match_operand: 2 "s_register_operand" "r") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPLTQ_M_N_S)) ] "TARGET_HAVE_MVE" @@ -3017,10 +3017,10 @@ ;; (define_insn "mve_vcmpltq_m_s" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_2 1 "s_register_operand" "w") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPLTQ_M_S)) ] "TARGET_HAVE_MVE" @@ -3033,10 +3033,10 @@ ;; (define_insn "mve_vcmpneq_m_n_" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_2 1 "s_register_operand" "w") (match_operand: 2 "s_register_operand" "r") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPNEQ_M_N)) ] "TARGET_HAVE_MVE" @@ -3049,10 +3049,10 @@ ;; (define_insn "mve_vcmpneq_m_" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_2 1 "s_register_operand" "w") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + 
(match_operand: 3 "vpr_register_operand" "Up")] VCMPNEQ_M)) ] "TARGET_HAVE_MVE" @@ -3770,10 +3770,10 @@ ;; (define_insn "mve_vcmpeqq_m_n_f" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_0 1 "s_register_operand" "w") (match_operand: 2 "s_register_operand" "r") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPEQQ_M_N_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -3786,10 +3786,10 @@ ;; (define_insn "mve_vcmpgeq_m_f" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_0 1 "s_register_operand" "w") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPGEQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -3802,10 +3802,10 @@ ;; (define_insn "mve_vcmpgeq_m_n_f" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_0 1 "s_register_operand" "w") (match_operand: 2 "s_register_operand" "r") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPGEQ_M_N_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -3818,10 +3818,10 @@ ;; (define_insn "mve_vcmpgtq_m_f" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_0 1 "s_register_operand" "w") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPGTQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -3834,10 +3834,10 @@ ;; (define_insn "mve_vcmpgtq_m_n_f" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_0 1 "s_register_operand" "w") (match_operand: 2 "s_register_operand" "r") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPGTQ_M_N_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -3850,10 +3850,10 @@ ;; (define_insn "mve_vcmpleq_m_f" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_0 1 "s_register_operand" "w") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPLEQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -3866,10 +3866,10 @@ ;; (define_insn "mve_vcmpleq_m_n_f" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_0 1 "s_register_operand" "w") (match_operand: 2 "s_register_operand" "r") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" 
"Up")] VCMPLEQ_M_N_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -3882,10 +3882,10 @@ ;; (define_insn "mve_vcmpltq_m_f" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_0 1 "s_register_operand" "w") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPLTQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -3898,10 +3898,10 @@ ;; (define_insn "mve_vcmpltq_m_n_f" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_0 1 "s_register_operand" "w") (match_operand: 2 "s_register_operand" "r") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPLTQ_M_N_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -3914,10 +3914,10 @@ ;; (define_insn "mve_vcmpneq_m_f" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_0 1 "s_register_operand" "w") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPNEQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -3930,10 +3930,10 @@ ;; (define_insn "mve_vcmpneq_m_n_f" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_0 1 "s_register_operand" "w") (match_operand: 2 "s_register_operand" "r") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPNEQ_M_N_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" -- cgit v1.1 From 724d6566cd11c676f3bc082a9771784c825affb1 Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Wed, 13 Oct 2021 09:16:40 +0000 Subject: arm: Convert more MVE builtins to predicate qualifiers This patch covers all builtins that have an HI operand and use the iterator, thus we can replace HI whe . Most of the work of this patch series was carried out while I was working at STMicroelectronics as a Linaro assignee. 2022-02-22 Christophe Lyon gcc/ PR target/100757 PR target/101325 * config/arm/arm-builtins.cc (TERNOP_UNONE_UNONE_NONE_UNONE_QUALIFIERS): Change to ... (TERNOP_UNONE_UNONE_NONE_PRED_QUALIFIERS): ... this. (TERNOP_UNONE_UNONE_IMM_UNONE_QUALIFIERS): Change to ... (TERNOP_UNONE_UNONE_IMM_PRED_QUALIFIERS): ... this. (TERNOP_NONE_NONE_IMM_UNONE_QUALIFIERS): Change to ... (TERNOP_NONE_NONE_IMM_PRED_QUALIFIERS): ... this. (TERNOP_NONE_NONE_UNONE_UNONE_QUALIFIERS): Change to ... (TERNOP_NONE_NONE_UNONE_PRED_QUALIFIERS): ... this. (QUADOP_UNONE_UNONE_NONE_NONE_UNONE_QUALIFIERS): Change to ... (QUADOP_UNONE_UNONE_NONE_NONE_PRED_QUALIFIERS): ... this. (QUADOP_NONE_NONE_NONE_NONE_PRED_QUALIFIERS): New. (QUADOP_NONE_NONE_NONE_IMM_UNONE_QUALIFIERS): Change to ... (QUADOP_NONE_NONE_NONE_IMM_PRED_QUALIFIERS): ... this. (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED_QUALIFIERS): New. (QUADOP_UNONE_UNONE_NONE_IMM_UNONE_QUALIFIERS): Change to ... (QUADOP_UNONE_UNONE_NONE_IMM_PRED_QUALIFIERS): ... this. 
(QUADOP_NONE_NONE_UNONE_IMM_UNONE_QUALIFIERS): Change to ... (QUADOP_NONE_NONE_UNONE_IMM_PRED_QUALIFIERS): ... this. (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE_QUALIFIERS): Change to ... (QUADOP_UNONE_UNONE_UNONE_IMM_PRED_QUALIFIERS): ... this. (QUADOP_UNONE_UNONE_UNONE_NONE_UNONE_QUALIFIERS): Change to ... (QUADOP_UNONE_UNONE_UNONE_NONE_PRED_QUALIFIERS): ... this. (STRS_P_QUALIFIERS): Use predicate qualifier. (STRU_P_QUALIFIERS): Likewise. (STRSU_P_QUALIFIERS): Likewise. (STRSS_P_QUALIFIERS): Likewise. (LDRGS_Z_QUALIFIERS): Likewise. (LDRGU_Z_QUALIFIERS): Likewise. (LDRS_Z_QUALIFIERS): Likewise. (LDRU_Z_QUALIFIERS): Likewise. (QUINOP_UNONE_UNONE_UNONE_UNONE_IMM_UNONE_QUALIFIERS): Change to ... (QUINOP_UNONE_UNONE_UNONE_UNONE_IMM_PRED_QUALIFIERS): ... this. (BINOP_NONE_NONE_PRED_QUALIFIERS): New. (BINOP_UNONE_UNONE_PRED_QUALIFIERS): New. * config/arm/arm_mve_builtins.def: Use new predicated qualifiers. * config/arm/mve.md: Use MVE_VPRED instead of HI. --- gcc/config/arm/arm-builtins.cc | 130 +++++---- gcc/config/arm/arm_mve_builtins.def | 562 ++++++++++++++++++------------------ gcc/config/arm/mve.md | 420 +++++++++++++-------------- 3 files changed, 569 insertions(+), 543 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm-builtins.cc b/gcc/config/arm/arm-builtins.cc index 02411c6..a9536b2 100644 --- a/gcc/config/arm/arm-builtins.cc +++ b/gcc/config/arm/arm-builtins.cc @@ -484,18 +484,18 @@ arm_ternop_unone_unone_none_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS] (arm_ternop_unone_unone_none_imm_qualifiers) static enum arm_type_qualifiers -arm_ternop_unone_unone_none_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] +arm_ternop_unone_unone_none_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_unsigned, qualifier_unsigned, qualifier_none, - qualifier_unsigned }; -#define TERNOP_UNONE_UNONE_NONE_UNONE_QUALIFIERS \ - (arm_ternop_unone_unone_none_unone_qualifiers) + qualifier_predicate }; +#define TERNOP_UNONE_UNONE_NONE_PRED_QUALIFIERS \ + (arm_ternop_unone_unone_none_pred_qualifiers) static enum arm_type_qualifiers -arm_ternop_unone_unone_imm_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] +arm_ternop_unone_unone_imm_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_unsigned, qualifier_unsigned, qualifier_immediate, - qualifier_unsigned }; -#define TERNOP_UNONE_UNONE_IMM_UNONE_QUALIFIERS \ - (arm_ternop_unone_unone_imm_unone_qualifiers) + qualifier_predicate }; +#define TERNOP_UNONE_UNONE_IMM_PRED_QUALIFIERS \ + (arm_ternop_unone_unone_imm_pred_qualifiers) static enum arm_type_qualifiers arm_ternop_pred_none_none_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] @@ -522,16 +522,16 @@ arm_ternop_none_none_none_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] (arm_ternop_none_none_none_pred_qualifiers) static enum arm_type_qualifiers -arm_ternop_none_none_imm_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] - = { qualifier_none, qualifier_none, qualifier_immediate, qualifier_unsigned }; -#define TERNOP_NONE_NONE_IMM_UNONE_QUALIFIERS \ - (arm_ternop_none_none_imm_unone_qualifiers) +arm_ternop_none_none_imm_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_none, qualifier_none, qualifier_immediate, qualifier_predicate }; +#define TERNOP_NONE_NONE_IMM_PRED_QUALIFIERS \ + (arm_ternop_none_none_imm_pred_qualifiers) static enum arm_type_qualifiers -arm_ternop_none_none_unone_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] - = { qualifier_none, qualifier_none, qualifier_unsigned, qualifier_unsigned }; -#define TERNOP_NONE_NONE_UNONE_UNONE_QUALIFIERS \ - (arm_ternop_none_none_unone_unone_qualifiers) 
+arm_ternop_none_none_unone_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_none, qualifier_none, qualifier_unsigned, qualifier_predicate }; +#define TERNOP_NONE_NONE_UNONE_PRED_QUALIFIERS \ + (arm_ternop_none_none_unone_pred_qualifiers) static enum arm_type_qualifiers arm_ternop_unone_unone_unone_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] @@ -561,11 +561,11 @@ arm_ternop_none_none_none_none_qualifiers[SIMD_MAX_BUILTIN_ARGS] (arm_ternop_none_none_none_none_qualifiers) static enum arm_type_qualifiers -arm_quadop_unone_unone_none_none_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] +arm_quadop_unone_unone_none_none_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_unsigned, qualifier_unsigned, qualifier_none, qualifier_none, - qualifier_unsigned }; -#define QUADOP_UNONE_UNONE_NONE_NONE_UNONE_QUALIFIERS \ - (arm_quadop_unone_unone_none_none_unone_qualifiers) + qualifier_predicate }; +#define QUADOP_UNONE_UNONE_NONE_NONE_PRED_QUALIFIERS \ + (arm_quadop_unone_unone_none_none_pred_qualifiers) static enum arm_type_qualifiers arm_quadop_none_none_none_none_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] @@ -575,11 +575,18 @@ arm_quadop_none_none_none_none_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] (arm_quadop_none_none_none_none_unone_qualifiers) static enum arm_type_qualifiers -arm_quadop_none_none_none_imm_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] +arm_quadop_none_none_none_none_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_none, qualifier_none, qualifier_none, qualifier_none, + qualifier_predicate }; +#define QUADOP_NONE_NONE_NONE_NONE_PRED_QUALIFIERS \ + (arm_quadop_none_none_none_none_pred_qualifiers) + +static enum arm_type_qualifiers +arm_quadop_none_none_none_imm_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_none, qualifier_none, qualifier_none, qualifier_immediate, - qualifier_unsigned }; -#define QUADOP_NONE_NONE_NONE_IMM_UNONE_QUALIFIERS \ - (arm_quadop_none_none_none_imm_unone_qualifiers) + qualifier_predicate }; +#define QUADOP_NONE_NONE_NONE_IMM_PRED_QUALIFIERS \ + (arm_quadop_none_none_none_imm_pred_qualifiers) static enum arm_type_qualifiers arm_quadop_unone_unone_unone_unone_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] @@ -589,32 +596,39 @@ arm_quadop_unone_unone_unone_unone_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] (arm_quadop_unone_unone_unone_unone_unone_qualifiers) static enum arm_type_qualifiers -arm_quadop_unone_unone_none_imm_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] +arm_quadop_unone_unone_unone_unone_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_unsigned, qualifier_unsigned, qualifier_unsigned, + qualifier_unsigned, qualifier_predicate }; +#define QUADOP_UNONE_UNONE_UNONE_UNONE_PRED_QUALIFIERS \ + (arm_quadop_unone_unone_unone_unone_pred_qualifiers) + +static enum arm_type_qualifiers +arm_quadop_unone_unone_none_imm_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_unsigned, qualifier_unsigned, qualifier_none, - qualifier_immediate, qualifier_unsigned }; -#define QUADOP_UNONE_UNONE_NONE_IMM_UNONE_QUALIFIERS \ - (arm_quadop_unone_unone_none_imm_unone_qualifiers) + qualifier_immediate, qualifier_predicate }; +#define QUADOP_UNONE_UNONE_NONE_IMM_PRED_QUALIFIERS \ + (arm_quadop_unone_unone_none_imm_pred_qualifiers) static enum arm_type_qualifiers -arm_quadop_none_none_unone_imm_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] +arm_quadop_none_none_unone_imm_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_none, qualifier_none, qualifier_unsigned, qualifier_immediate, - qualifier_unsigned }; -#define QUADOP_NONE_NONE_UNONE_IMM_UNONE_QUALIFIERS \ - 
(arm_quadop_none_none_unone_imm_unone_qualifiers) + qualifier_predicate }; +#define QUADOP_NONE_NONE_UNONE_IMM_PRED_QUALIFIERS \ + (arm_quadop_none_none_unone_imm_pred_qualifiers) static enum arm_type_qualifiers -arm_quadop_unone_unone_unone_imm_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] +arm_quadop_unone_unone_unone_imm_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_unsigned, qualifier_unsigned, qualifier_unsigned, - qualifier_immediate, qualifier_unsigned }; -#define QUADOP_UNONE_UNONE_UNONE_IMM_UNONE_QUALIFIERS \ - (arm_quadop_unone_unone_unone_imm_unone_qualifiers) + qualifier_immediate, qualifier_predicate }; +#define QUADOP_UNONE_UNONE_UNONE_IMM_PRED_QUALIFIERS \ + (arm_quadop_unone_unone_unone_imm_pred_qualifiers) static enum arm_type_qualifiers -arm_quadop_unone_unone_unone_none_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] +arm_quadop_unone_unone_unone_none_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_unsigned, qualifier_unsigned, qualifier_unsigned, - qualifier_none, qualifier_unsigned }; -#define QUADOP_UNONE_UNONE_UNONE_NONE_UNONE_QUALIFIERS \ - (arm_quadop_unone_unone_unone_none_unone_qualifiers) + qualifier_none, qualifier_predicate }; +#define QUADOP_UNONE_UNONE_UNONE_NONE_PRED_QUALIFIERS \ + (arm_quadop_unone_unone_unone_none_pred_qualifiers) static enum arm_type_qualifiers arm_strs_qualifiers[SIMD_MAX_BUILTIN_ARGS] @@ -651,25 +665,25 @@ arm_strsbu_qualifiers[SIMD_MAX_BUILTIN_ARGS] static enum arm_type_qualifiers arm_strs_p_qualifiers[SIMD_MAX_BUILTIN_ARGS] - = { qualifier_void, qualifier_pointer, qualifier_none, qualifier_unsigned}; + = { qualifier_void, qualifier_pointer, qualifier_none, qualifier_predicate}; #define STRS_P_QUALIFIERS (arm_strs_p_qualifiers) static enum arm_type_qualifiers arm_stru_p_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_void, qualifier_pointer, qualifier_unsigned, - qualifier_unsigned}; + qualifier_predicate}; #define STRU_P_QUALIFIERS (arm_stru_p_qualifiers) static enum arm_type_qualifiers arm_strsu_p_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_void, qualifier_pointer, qualifier_unsigned, - qualifier_unsigned, qualifier_unsigned}; + qualifier_unsigned, qualifier_predicate}; #define STRSU_P_QUALIFIERS (arm_strsu_p_qualifiers) static enum arm_type_qualifiers arm_strss_p_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_void, qualifier_pointer, qualifier_unsigned, - qualifier_none, qualifier_unsigned}; + qualifier_none, qualifier_predicate}; #define STRSS_P_QUALIFIERS (arm_strss_p_qualifiers) static enum arm_type_qualifiers @@ -729,31 +743,31 @@ arm_ldrgbu_z_qualifiers[SIMD_MAX_BUILTIN_ARGS] static enum arm_type_qualifiers arm_ldrgs_z_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_none, qualifier_pointer, qualifier_unsigned, - qualifier_unsigned}; + qualifier_predicate}; #define LDRGS_Z_QUALIFIERS (arm_ldrgs_z_qualifiers) static enum arm_type_qualifiers arm_ldrgu_z_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_unsigned, qualifier_pointer, qualifier_unsigned, - qualifier_unsigned}; + qualifier_predicate}; #define LDRGU_Z_QUALIFIERS (arm_ldrgu_z_qualifiers) static enum arm_type_qualifiers arm_ldrs_z_qualifiers[SIMD_MAX_BUILTIN_ARGS] - = { qualifier_none, qualifier_pointer, qualifier_unsigned}; + = { qualifier_none, qualifier_pointer, qualifier_predicate}; #define LDRS_Z_QUALIFIERS (arm_ldrs_z_qualifiers) static enum arm_type_qualifiers arm_ldru_z_qualifiers[SIMD_MAX_BUILTIN_ARGS] - = { qualifier_unsigned, qualifier_pointer, qualifier_unsigned}; + = { qualifier_unsigned, qualifier_pointer, qualifier_predicate}; #define LDRU_Z_QUALIFIERS 
(arm_ldru_z_qualifiers) static enum arm_type_qualifiers -arm_quinop_unone_unone_unone_unone_imm_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] +arm_quinop_unone_unone_unone_unone_imm_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_unsigned, qualifier_unsigned, qualifier_unsigned, - qualifier_unsigned, qualifier_immediate, qualifier_unsigned }; -#define QUINOP_UNONE_UNONE_UNONE_UNONE_IMM_UNONE_QUALIFIERS \ - (arm_quinop_unone_unone_unone_unone_imm_unone_qualifiers) + qualifier_unsigned, qualifier_immediate, qualifier_predicate }; +#define QUINOP_UNONE_UNONE_UNONE_UNONE_IMM_PRED_QUALIFIERS \ + (arm_quinop_unone_unone_unone_unone_imm_pred_qualifiers) static enum arm_type_qualifiers arm_ldrgbwbxu_qualifiers[SIMD_MAX_BUILTIN_ARGS] @@ -830,6 +844,18 @@ arm_sqshl_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_unsigned, qualifier_unsigned, qualifier_const}; #define SQSHL_QUALIFIERS (arm_sqshl_qualifiers) +static enum arm_type_qualifiers +arm_binop_none_none_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_none, qualifier_none, qualifier_predicate }; +#define BINOP_NONE_NONE_PRED_QUALIFIERS \ + (arm_binop_none_none_pred_qualifiers) + +static enum arm_type_qualifiers +arm_binop_unone_unone_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_unsigned, qualifier_unsigned, qualifier_predicate }; +#define BINOP_UNONE_UNONE_PRED_QUALIFIERS \ + (arm_binop_unone_unone_pred_qualifiers) + /* End of Qualifier for MVE builtins. */ /* void ([T element type] *, T, immediate). */ diff --git a/gcc/config/arm/arm_mve_builtins.def b/gcc/config/arm/arm_mve_builtins.def index b7ebbca..7db6d47 100644 --- a/gcc/config/arm/arm_mve_builtins.def +++ b/gcc/config/arm/arm_mve_builtins.def @@ -123,7 +123,7 @@ VAR3 (BINOP_PRED_UNONE_UNONE, vcmpcsq_, v16qi, v8hi, v4si) VAR3 (BINOP_PRED_UNONE_UNONE, vcmpcsq_n_, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vbicq_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vandq_u, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_UNONE_UNONE, vaddvq_p_u, v16qi, v8hi, v4si) +VAR3 (BINOP_UNONE_UNONE_PRED, vaddvq_p_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vaddvaq_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vaddq_n_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vabdq_u, v16qi, v8hi, v4si) @@ -154,7 +154,7 @@ VAR3 (BINOP_PRED_NONE_NONE, vcmpgeq_n_, v16qi, v8hi, v4si) VAR3 (BINOP_PRED_NONE_NONE, vcmpeqq_, v16qi, v8hi, v4si) VAR3 (BINOP_PRED_NONE_NONE, vcmpeqq_n_, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_NONE_IMM, vqshluq_n_s, v16qi, v8hi, v4si) -VAR3 (BINOP_NONE_NONE_UNONE, vaddvq_p_s, v16qi, v8hi, v4si) +VAR3 (BINOP_NONE_NONE_PRED, vaddvq_p_s, v16qi, v8hi, v4si) VAR3 (BINOP_NONE_NONE_NONE, vsubq_s, v16qi, v8hi, v4si) VAR3 (BINOP_NONE_NONE_NONE, vsubq_n_s, v16qi, v8hi, v4si) VAR3 (BINOP_NONE_NONE_NONE, vshlq_r_s, v16qi, v8hi, v4si) @@ -277,35 +277,35 @@ VAR1 (BINOP_NONE_NONE_NONE, vrmlaldavhq_s, v4si) VAR1 (BINOP_NONE_NONE_NONE, vcvttq_f16_f32, v8hf) VAR1 (BINOP_NONE_NONE_NONE, vcvtbq_f16_f32, v8hf) VAR1 (BINOP_NONE_NONE_NONE, vaddlvaq_s, v4si) -VAR2 (TERNOP_NONE_NONE_IMM_UNONE, vbicq_m_n_s, v8hi, v4si) -VAR2 (TERNOP_UNONE_UNONE_IMM_UNONE, vbicq_m_n_u, v8hi, v4si) +VAR2 (TERNOP_NONE_NONE_IMM_PRED, vbicq_m_n_s, v8hi, v4si) +VAR2 (TERNOP_UNONE_UNONE_IMM_PRED, vbicq_m_n_u, v8hi, v4si) VAR2 (TERNOP_NONE_NONE_NONE_IMM, vqrshrnbq_n_s, v8hi, v4si) VAR2 (TERNOP_UNONE_UNONE_UNONE_IMM, vqrshrnbq_n_u, v8hi, v4si) VAR1 (TERNOP_NONE_NONE_NONE_NONE, vrmlaldavhaq_s, v4si) VAR1 (TERNOP_UNONE_UNONE_UNONE_UNONE, vrmlaldavhaq_u, v4si) -VAR2 (TERNOP_NONE_NONE_UNONE_UNONE, vcvtq_m_to_f_u, 
v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vcvtq_m_to_f_s, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_UNONE_PRED, vcvtq_m_to_f_u, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vcvtq_m_to_f_s, v8hf, v4sf) VAR2 (TERNOP_PRED_NONE_NONE_PRED, vcmpeqq_m_f, v8hf, v4sf) VAR3 (TERNOP_UNONE_NONE_UNONE_IMM, vshlcq_carry_s, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_IMM, vshlcq_carry_u, v16qi, v8hi, v4si) VAR2 (TERNOP_UNONE_UNONE_NONE_IMM, vqrshrunbq_n_s, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_NONE_NONE, vabavq_s, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vabavq_u, v16qi, v8hi, v4si) -VAR2 (TERNOP_UNONE_UNONE_NONE_UNONE, vcvtaq_m_u, v8hi, v4si) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vcvtaq_m_s, v8hi, v4si) +VAR2 (TERNOP_UNONE_UNONE_NONE_PRED, vcvtaq_m_u, v8hi, v4si) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vcvtaq_m_s, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_IMM, vshlcq_vec_u, v16qi, v8hi, v4si) VAR3 (TERNOP_NONE_NONE_UNONE_IMM, vshlcq_vec_s, v16qi, v8hi, v4si) VAR4 (TERNOP_UNONE_UNONE_UNONE_PRED, vpselq_u, v16qi, v8hi, v4si, v2di) VAR4 (TERNOP_NONE_NONE_NONE_PRED, vpselq_s, v16qi, v8hi, v4si, v2di) -VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vrev64q_m_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vmvnq_m_u, v16qi, v8hi, v4si) +VAR3 (TERNOP_UNONE_UNONE_UNONE_PRED, vrev64q_m_u, v16qi, v8hi, v4si) +VAR3 (TERNOP_UNONE_UNONE_UNONE_PRED, vmvnq_m_u, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vmlasq_n_u, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vmlaq_n_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vmladavq_p_u, v16qi, v8hi, v4si) +VAR3 (TERNOP_UNONE_UNONE_UNONE_PRED, vmladavq_p_u, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vmladavaq_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vminvq_p_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vmaxvq_p_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vdupq_m_n_u, v16qi, v8hi, v4si) +VAR3 (TERNOP_UNONE_UNONE_UNONE_PRED, vminvq_p_u, v16qi, v8hi, v4si) +VAR3 (TERNOP_UNONE_UNONE_UNONE_PRED, vmaxvq_p_u, v16qi, v8hi, v4si) +VAR3 (TERNOP_UNONE_UNONE_UNONE_PRED, vdupq_m_n_u, v16qi, v8hi, v4si) VAR3 (TERNOP_PRED_UNONE_UNONE_PRED, vcmpneq_m_u, v16qi, v8hi, v4si) VAR3 (TERNOP_PRED_UNONE_UNONE_PRED, vcmpneq_m_n_u, v16qi, v8hi, v4si) VAR3 (TERNOP_PRED_UNONE_UNONE_PRED, vcmphiq_m_u, v16qi, v8hi, v4si) @@ -314,18 +314,18 @@ VAR3 (TERNOP_PRED_UNONE_UNONE_PRED, vcmpeqq_m_u, v16qi, v8hi, v4si) VAR3 (TERNOP_PRED_UNONE_UNONE_PRED, vcmpeqq_m_n_u, v16qi, v8hi, v4si) VAR3 (TERNOP_PRED_UNONE_UNONE_PRED, vcmpcsq_m_u, v16qi, v8hi, v4si) VAR3 (TERNOP_PRED_UNONE_UNONE_PRED, vcmpcsq_m_n_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vclzq_m_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vaddvaq_p_u, v16qi, v8hi, v4si) +VAR3 (TERNOP_UNONE_UNONE_UNONE_PRED, vclzq_m_u, v16qi, v8hi, v4si) +VAR3 (TERNOP_UNONE_UNONE_UNONE_PRED, vaddvaq_p_u, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_IMM, vsriq_n_u, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_IMM, vsliq_n_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_NONE_UNONE, vshlq_m_r_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_NONE_UNONE, vrshlq_m_n_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_NONE_UNONE, vqshlq_m_r_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_NONE_UNONE, vqrshlq_m_n_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_NONE_UNONE, vminavq_p_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_NONE_UNONE, vminaq_m_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_NONE_UNONE, vmaxavq_p_s, 
v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_NONE_UNONE, vmaxaq_m_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_UNONE_UNONE_NONE_PRED, vshlq_m_r_u, v16qi, v8hi, v4si) +VAR3 (TERNOP_UNONE_UNONE_NONE_PRED, vrshlq_m_n_u, v16qi, v8hi, v4si) +VAR3 (TERNOP_UNONE_UNONE_NONE_PRED, vqshlq_m_r_u, v16qi, v8hi, v4si) +VAR3 (TERNOP_UNONE_UNONE_NONE_PRED, vqrshlq_m_n_u, v16qi, v8hi, v4si) +VAR3 (TERNOP_UNONE_UNONE_NONE_PRED, vminavq_p_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_UNONE_UNONE_NONE_PRED, vminaq_m_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_UNONE_UNONE_NONE_PRED, vmaxavq_p_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_UNONE_UNONE_NONE_PRED, vmaxaq_m_s, v16qi, v8hi, v4si) VAR3 (TERNOP_PRED_NONE_NONE_PRED, vcmpneq_m_s, v16qi, v8hi, v4si) VAR3 (TERNOP_PRED_NONE_NONE_PRED, vcmpneq_m_n_s, v16qi, v8hi, v4si) VAR3 (TERNOP_PRED_NONE_NONE_PRED, vcmpltq_m_s, v16qi, v8hi, v4si) @@ -338,26 +338,26 @@ VAR3 (TERNOP_PRED_NONE_NONE_PRED, vcmpgeq_m_s, v16qi, v8hi, v4si) VAR3 (TERNOP_PRED_NONE_NONE_PRED, vcmpgeq_m_n_s, v16qi, v8hi, v4si) VAR3 (TERNOP_PRED_NONE_NONE_PRED, vcmpeqq_m_s, v16qi, v8hi, v4si) VAR3 (TERNOP_PRED_NONE_NONE_PRED, vcmpeqq_m_n_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vshlq_m_r_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vrshlq_m_n_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vrev64q_m_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vqshlq_m_r_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vqrshlq_m_n_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vqnegq_m_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vqabsq_m_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vnegq_m_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vmvnq_m_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vmlsdavxq_p_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vmlsdavq_p_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vmladavxq_p_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vmladavq_p_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vminvq_p_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vmaxvq_p_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vdupq_m_n_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vclzq_m_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vclsq_m_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vaddvaq_p_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vabsq_m_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_NONE_NONE_NONE_PRED, vshlq_m_r_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_NONE_NONE_NONE_PRED, vrshlq_m_n_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_NONE_NONE_NONE_PRED, vrev64q_m_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_NONE_NONE_NONE_PRED, vqshlq_m_r_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_NONE_NONE_NONE_PRED, vqrshlq_m_n_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_NONE_NONE_NONE_PRED, vqnegq_m_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_NONE_NONE_NONE_PRED, vqabsq_m_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_NONE_NONE_NONE_PRED, vnegq_m_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_NONE_NONE_NONE_PRED, vmvnq_m_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_NONE_NONE_NONE_PRED, vmlsdavxq_p_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_NONE_NONE_NONE_PRED, vmlsdavq_p_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_NONE_NONE_NONE_PRED, vmladavxq_p_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_NONE_NONE_NONE_PRED, vmladavq_p_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_NONE_NONE_NONE_PRED, vminvq_p_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_NONE_NONE_NONE_PRED, vmaxvq_p_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_NONE_NONE_NONE_PRED, vdupq_m_n_s, v16qi, 
v8hi, v4si) +VAR3 (TERNOP_NONE_NONE_NONE_PRED, vclzq_m_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_NONE_NONE_NONE_PRED, vclsq_m_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_NONE_NONE_NONE_PRED, vaddvaq_p_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_NONE_NONE_NONE_PRED, vabsq_m_s, v16qi, v8hi, v4si) VAR3 (TERNOP_NONE_NONE_NONE_NONE, vqrdmlsdhxq_s, v16qi, v8hi, v4si) VAR3 (TERNOP_NONE_NONE_NONE_NONE, vqrdmlsdhq_s, v16qi, v8hi, v4si) VAR3 (TERNOP_NONE_NONE_NONE_NONE, vqrdmlashq_n_s, v16qi, v8hi, v4si) @@ -378,14 +378,14 @@ VAR3 (TERNOP_NONE_NONE_NONE_NONE, vmladavaxq_s, v16qi, v8hi, v4si) VAR3 (TERNOP_NONE_NONE_NONE_NONE, vmladavaq_s, v16qi, v8hi, v4si) VAR3 (TERNOP_NONE_NONE_NONE_IMM, vsriq_n_s, v16qi, v8hi, v4si) VAR3 (TERNOP_NONE_NONE_NONE_IMM, vsliq_n_s, v16qi, v8hi, v4si) -VAR2 (TERNOP_UNONE_UNONE_UNONE_UNONE, vrev32q_m_u, v16qi, v8hi) -VAR2 (TERNOP_UNONE_UNONE_UNONE_UNONE, vqmovntq_m_u, v8hi, v4si) -VAR2 (TERNOP_UNONE_UNONE_UNONE_UNONE, vqmovnbq_m_u, v8hi, v4si) -VAR2 (TERNOP_UNONE_UNONE_UNONE_UNONE, vmovntq_m_u, v8hi, v4si) -VAR2 (TERNOP_UNONE_UNONE_UNONE_UNONE, vmovnbq_m_u, v8hi, v4si) -VAR2 (TERNOP_UNONE_UNONE_UNONE_UNONE, vmovltq_m_u, v16qi, v8hi) -VAR2 (TERNOP_UNONE_UNONE_UNONE_UNONE, vmovlbq_m_u, v16qi, v8hi) -VAR2 (TERNOP_UNONE_UNONE_UNONE_UNONE, vmlaldavq_p_u, v8hi, v4si) +VAR2 (TERNOP_UNONE_UNONE_UNONE_PRED, vrev32q_m_u, v16qi, v8hi) +VAR2 (TERNOP_UNONE_UNONE_UNONE_PRED, vqmovntq_m_u, v8hi, v4si) +VAR2 (TERNOP_UNONE_UNONE_UNONE_PRED, vqmovnbq_m_u, v8hi, v4si) +VAR2 (TERNOP_UNONE_UNONE_UNONE_PRED, vmovntq_m_u, v8hi, v4si) +VAR2 (TERNOP_UNONE_UNONE_UNONE_PRED, vmovnbq_m_u, v8hi, v4si) +VAR2 (TERNOP_UNONE_UNONE_UNONE_PRED, vmovltq_m_u, v16qi, v8hi) +VAR2 (TERNOP_UNONE_UNONE_UNONE_PRED, vmovlbq_m_u, v16qi, v8hi) +VAR2 (TERNOP_UNONE_UNONE_UNONE_PRED, vmlaldavq_p_u, v8hi, v4si) VAR2 (TERNOP_UNONE_UNONE_UNONE_UNONE, vmlaldavaq_u, v8hi, v4si) VAR2 (TERNOP_UNONE_UNONE_UNONE_IMM, vshrntq_n_u, v8hi, v4si) VAR2 (TERNOP_UNONE_UNONE_UNONE_IMM, vshrnbq_n_u, v8hi, v4si) @@ -394,17 +394,17 @@ VAR2 (TERNOP_UNONE_UNONE_UNONE_IMM, vrshrnbq_n_u, v8hi, v4si) VAR2 (TERNOP_UNONE_UNONE_UNONE_IMM, vqshrntq_n_u, v8hi, v4si) VAR2 (TERNOP_UNONE_UNONE_UNONE_IMM, vqshrnbq_n_u, v8hi, v4si) VAR2 (TERNOP_UNONE_UNONE_UNONE_IMM, vqrshrntq_n_u, v8hi, v4si) -VAR2 (TERNOP_UNONE_UNONE_NONE_UNONE, vqmovuntq_m_s, v8hi, v4si) -VAR2 (TERNOP_UNONE_UNONE_NONE_UNONE, vqmovunbq_m_s, v8hi, v4si) -VAR2 (TERNOP_UNONE_UNONE_NONE_UNONE, vcvtq_m_from_f_u, v8hi, v4si) -VAR2 (TERNOP_UNONE_UNONE_NONE_UNONE, vcvtpq_m_u, v8hi, v4si) -VAR2 (TERNOP_UNONE_UNONE_NONE_UNONE, vcvtnq_m_u, v8hi, v4si) -VAR2 (TERNOP_UNONE_UNONE_NONE_UNONE, vcvtmq_m_u, v8hi, v4si) +VAR2 (TERNOP_UNONE_UNONE_NONE_PRED, vqmovuntq_m_s, v8hi, v4si) +VAR2 (TERNOP_UNONE_UNONE_NONE_PRED, vqmovunbq_m_s, v8hi, v4si) +VAR2 (TERNOP_UNONE_UNONE_NONE_PRED, vcvtq_m_from_f_u, v8hi, v4si) +VAR2 (TERNOP_UNONE_UNONE_NONE_PRED, vcvtpq_m_u, v8hi, v4si) +VAR2 (TERNOP_UNONE_UNONE_NONE_PRED, vcvtnq_m_u, v8hi, v4si) +VAR2 (TERNOP_UNONE_UNONE_NONE_PRED, vcvtmq_m_u, v8hi, v4si) VAR2 (TERNOP_UNONE_UNONE_NONE_IMM, vqshruntq_n_s, v8hi, v4si) VAR2 (TERNOP_UNONE_UNONE_NONE_IMM, vqshrunbq_n_s, v8hi, v4si) VAR2 (TERNOP_UNONE_UNONE_NONE_IMM, vqrshruntq_n_s, v8hi, v4si) -VAR2 (TERNOP_UNONE_UNONE_IMM_UNONE, vorrq_m_n_u, v8hi, v4si) -VAR2 (TERNOP_UNONE_UNONE_IMM_UNONE, vmvnq_m_n_u, v8hi, v4si) +VAR2 (TERNOP_UNONE_UNONE_IMM_PRED, vorrq_m_n_u, v8hi, v4si) +VAR2 (TERNOP_UNONE_UNONE_IMM_PRED, vmvnq_m_n_u, v8hi, v4si) VAR2 (TERNOP_PRED_NONE_NONE_PRED, vcmpneq_m_n_f, v8hf, v4sf) VAR2 (TERNOP_PRED_NONE_NONE_PRED, vcmpneq_m_f, v8hf, 
v4sf) VAR2 (TERNOP_PRED_NONE_NONE_PRED, vcmpltq_m_n_f, v8hf, v4sf) @@ -416,38 +416,38 @@ VAR2 (TERNOP_PRED_NONE_NONE_PRED, vcmpgtq_m_f, v8hf, v4sf) VAR2 (TERNOP_PRED_NONE_NONE_PRED, vcmpgeq_m_n_f, v8hf, v4sf) VAR2 (TERNOP_PRED_NONE_NONE_PRED, vcmpgeq_m_f, v8hf, v4sf) VAR2 (TERNOP_PRED_NONE_NONE_PRED, vcmpeqq_m_n_f, v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vrndxq_m_f, v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vrndq_m_f, v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vrndpq_m_f, v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vrndnq_m_f, v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vrndmq_m_f, v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vrndaq_m_f, v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vrev64q_m_f, v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vrev32q_m_s, v16qi, v8hi) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vqmovntq_m_s, v8hi, v4si) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vqmovnbq_m_s, v8hi, v4si) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vrndxq_m_f, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vrndq_m_f, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vrndpq_m_f, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vrndnq_m_f, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vrndmq_m_f, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vrndaq_m_f, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vrev64q_m_f, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vrev32q_m_s, v16qi, v8hi) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vqmovntq_m_s, v8hi, v4si) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vqmovnbq_m_s, v8hi, v4si) VAR2 (TERNOP_NONE_NONE_NONE_PRED, vpselq_f, v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vnegq_m_f, v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vmovntq_m_s, v8hi, v4si) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vmovnbq_m_s, v8hi, v4si) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vmovltq_m_s, v16qi, v8hi) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vmovlbq_m_s, v16qi, v8hi) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vmlsldavxq_p_s, v8hi, v4si) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vmlsldavq_p_s, v8hi, v4si) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vmlaldavxq_p_s, v8hi, v4si) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vmlaldavq_p_s, v8hi, v4si) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vminnmvq_p_f, v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vminnmavq_p_f, v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vminnmaq_m_f, v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vmaxnmvq_p_f, v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vmaxnmavq_p_f, v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vmaxnmaq_m_f, v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vdupq_m_n_f, v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vcvtq_m_from_f_s, v8hi, v4si) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vcvtpq_m_s, v8hi, v4si) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vcvtnq_m_s, v8hi, v4si) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vcvtmq_m_s, v8hi, v4si) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vabsq_m_f, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vnegq_m_f, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vmovntq_m_s, v8hi, v4si) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vmovnbq_m_s, v8hi, v4si) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vmovltq_m_s, v16qi, v8hi) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vmovlbq_m_s, v16qi, v8hi) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vmlsldavxq_p_s, v8hi, v4si) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vmlsldavq_p_s, v8hi, v4si) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vmlaldavxq_p_s, v8hi, v4si) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vmlaldavq_p_s, v8hi, v4si) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vminnmvq_p_f, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vminnmavq_p_f, v8hf, 
v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vminnmaq_m_f, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vmaxnmvq_p_f, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vmaxnmavq_p_f, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vmaxnmaq_m_f, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vdupq_m_n_f, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vcvtq_m_from_f_s, v8hi, v4si) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vcvtpq_m_s, v8hi, v4si) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vcvtnq_m_s, v8hi, v4si) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vcvtmq_m_s, v8hi, v4si) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vabsq_m_f, v8hf, v4sf) VAR2 (TERNOP_NONE_NONE_NONE_NONE, vmlsldavaxq_s, v8hi, v4si) VAR2 (TERNOP_NONE_NONE_NONE_NONE, vmlsldavaq_s, v8hi, v4si) VAR2 (TERNOP_NONE_NONE_NONE_NONE, vmlaldavaxq_s, v8hi, v4si) @@ -463,8 +463,8 @@ VAR2 (TERNOP_NONE_NONE_NONE_IMM, vrshrnbq_n_s, v8hi, v4si) VAR2 (TERNOP_NONE_NONE_NONE_IMM, vqshrntq_n_s, v8hi, v4si) VAR2 (TERNOP_NONE_NONE_NONE_IMM, vqshrnbq_n_s, v8hi, v4si) VAR2 (TERNOP_NONE_NONE_NONE_IMM, vqrshrntq_n_s, v8hi, v4si) -VAR2 (TERNOP_NONE_NONE_IMM_UNONE, vorrq_m_n_s, v8hi, v4si) -VAR2 (TERNOP_NONE_NONE_IMM_UNONE, vmvnq_m_n_s, v8hi, v4si) +VAR2 (TERNOP_NONE_NONE_IMM_PRED, vorrq_m_n_s, v8hi, v4si) +VAR2 (TERNOP_NONE_NONE_IMM_PRED, vmvnq_m_n_s, v8hi, v4si) VAR1 (TERNOP_UNONE_UNONE_UNONE_UNONE, vrmlaldavhq_p_u, v4si) VAR1 (TERNOP_UNONE_UNONE_UNONE_UNONE, vrev16q_m_u, v16qi) VAR1 (TERNOP_UNONE_UNONE_UNONE_UNONE, vaddlvaq_p_u, v4si) @@ -482,189 +482,189 @@ VAR1 (TERNOP_NONE_NONE_NONE_UNONE, vaddlvaq_p_s, v4si) VAR1 (TERNOP_NONE_NONE_NONE_NONE, vrmlsldavhaxq_s, v4si) VAR1 (TERNOP_NONE_NONE_NONE_NONE, vrmlsldavhaq_s, v4si) VAR1 (TERNOP_NONE_NONE_NONE_NONE, vrmlaldavhaxq_s, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_IMM_UNONE, vsriq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vsriq_m_n_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vsubq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vsubq_m_u, v16qi, v8hi, v4si) -VAR2 (QUADOP_NONE_NONE_UNONE_IMM_UNONE, vcvtq_m_n_to_f_u, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_IMM_UNONE, vcvtq_m_n_to_f_s, v8hf, v4sf) -VAR3 (QUADOP_UNONE_UNONE_NONE_IMM_UNONE, vqshluq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_NONE_NONE_UNONE, vabavq_p_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vabavq_p_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_NONE_UNONE, vshlq_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vshlq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vsubq_m_n_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vrmulhq_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vrhaddq_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vqsubq_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vqsubq_m_n_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vqaddq_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vqaddq_m_n_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vorrq_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vornq_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vmulq_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vmulq_m_n_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vmulltq_int_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vmullbq_int_m_u, v16qi, v8hi, v4si) -VAR3 
(QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vmulhq_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vmlasq_m_n_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vmlaq_m_n_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vmladavaq_p_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vminq_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vmaxq_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vhsubq_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vhsubq_m_n_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vhaddq_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vhaddq_m_n_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, veorq_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vcaddq_rot90_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vcaddq_rot270_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vbicq_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vandq_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vaddq_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vaddq_m_n_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vabdq_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_NONE_UNONE, vrshlq_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_NONE_UNONE, vqshlq_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_NONE_UNONE, vqrshlq_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_NONE_UNONE, vbrsrq_m_n_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vsliq_m_n_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vshrq_m_n_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vshlq_m_n_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vrshrq_m_n_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vqshlq_m_n_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vsubq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vrshlq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vrmulhq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vrhaddq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqsubq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqsubq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqshlq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqrshlq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqrdmulhq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqrdmulhq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqrdmlsdhxq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqrdmlsdhq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqrdmlashq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqrdmlahq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqrdmladhxq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqrdmladhq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqdmulhq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqdmulhq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqdmlsdhxq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqdmlsdhq_m_s, v16qi, v8hi, v4si) 
-VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqdmlahq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqdmlashq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqdmladhxq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqdmladhq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqaddq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqaddq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vorrq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vornq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vmulq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vmulq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vmulltq_int_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vmullbq_int_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vmulhq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vmlsdavaxq_p_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vmlsdavaq_p_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vmlasq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vmlaq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vmladavaxq_p_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vmladavaq_p_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vminq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vmaxq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vhsubq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vhsubq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vhcaddq_rot90_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vhcaddq_rot270_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vhaddq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vhaddq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, veorq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vcaddq_rot90_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vcaddq_rot270_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vbrsrq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vbicq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vandq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vaddq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vaddq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vabdq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_IMM_UNONE, vsliq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_IMM_UNONE, vshrq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_IMM_UNONE, vshlq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_IMM_UNONE, vrshrq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_IMM_UNONE, vqshlq_m_n_s, v16qi, v8hi, v4si) -VAR2 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vmulltq_poly_m_p, v16qi, v8hi) -VAR2 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vmullbq_poly_m_p, v16qi, v8hi) -VAR2 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vmlaldavaq_p_u, v8hi, v4si) -VAR2 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vshrntq_m_n_u, v8hi, v4si) -VAR2 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vshrnbq_m_n_u, v8hi, v4si) -VAR2 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vshlltq_m_n_u, v16qi, v8hi) -VAR2 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vshllbq_m_n_u, v16qi, v8hi) -VAR2 
(QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vrshrntq_m_n_u, v8hi, v4si) -VAR2 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vrshrnbq_m_n_u, v8hi, v4si) -VAR2 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vqshrntq_m_n_u, v8hi, v4si) -VAR2 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vqshrnbq_m_n_u, v8hi, v4si) -VAR2 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vqrshrntq_m_n_u, v8hi, v4si) -VAR2 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vqrshrnbq_m_n_u, v8hi, v4si) -VAR2 (QUADOP_UNONE_UNONE_NONE_IMM_UNONE, vqshruntq_m_n_s, v8hi, v4si) -VAR2 (QUADOP_UNONE_UNONE_NONE_IMM_UNONE, vqshrunbq_m_n_s, v8hi, v4si) -VAR2 (QUADOP_UNONE_UNONE_NONE_IMM_UNONE, vqrshruntq_m_n_s, v8hi, v4si) -VAR2 (QUADOP_UNONE_UNONE_NONE_IMM_UNONE, vqrshrunbq_m_n_s, v8hi, v4si) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqdmulltq_m_s, v8hi, v4si) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqdmulltq_m_n_s, v8hi, v4si) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqdmullbq_m_s, v8hi, v4si) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqdmullbq_m_n_s, v8hi, v4si) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vmlsldavaxq_p_s, v8hi, v4si) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vmlsldavaq_p_s, v8hi, v4si) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vmlaldavaxq_p_s, v8hi, v4si) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vmlaldavaq_p_s, v8hi, v4si) -VAR2 (QUADOP_NONE_NONE_NONE_IMM_UNONE, vshrntq_m_n_s, v8hi, v4si) -VAR2 (QUADOP_NONE_NONE_NONE_IMM_UNONE, vshrnbq_m_n_s, v8hi, v4si) -VAR2 (QUADOP_NONE_NONE_NONE_IMM_UNONE, vshlltq_m_n_s, v16qi, v8hi) -VAR2 (QUADOP_NONE_NONE_NONE_IMM_UNONE, vshllbq_m_n_s, v16qi, v8hi) -VAR2 (QUADOP_NONE_NONE_NONE_IMM_UNONE, vrshrntq_m_n_s, v8hi, v4si) -VAR2 (QUADOP_NONE_NONE_NONE_IMM_UNONE, vrshrnbq_m_n_s, v8hi, v4si) -VAR2 (QUADOP_NONE_NONE_NONE_IMM_UNONE, vqshrntq_m_n_s, v8hi, v4si) -VAR2 (QUADOP_NONE_NONE_NONE_IMM_UNONE, vqshrnbq_m_n_s, v8hi, v4si) -VAR2 (QUADOP_NONE_NONE_NONE_IMM_UNONE, vqrshrntq_m_n_s, v8hi, v4si) -VAR2 (QUADOP_NONE_NONE_NONE_IMM_UNONE, vqrshrnbq_m_n_s, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_IMM_PRED, vsriq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vsriq_m_n_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vsubq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vsubq_m_u, v16qi, v8hi, v4si) +VAR2 (QUADOP_NONE_NONE_UNONE_IMM_PRED, vcvtq_m_n_to_f_u, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_IMM_PRED, vcvtq_m_n_to_f_s, v8hf, v4sf) +VAR3 (QUADOP_UNONE_UNONE_NONE_IMM_PRED, vqshluq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_NONE_NONE_PRED, vabavq_p_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vabavq_p_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_NONE_PRED, vshlq_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vshlq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vsubq_m_n_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vrmulhq_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vrhaddq_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vqsubq_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vqsubq_m_n_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vqaddq_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vqaddq_m_n_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vorrq_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vornq_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vmulq_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vmulq_m_n_u, v16qi, 
v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vmulltq_int_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vmullbq_int_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vmulhq_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vmlasq_m_n_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vmlaq_m_n_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vmladavaq_p_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vminq_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vmaxq_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vhsubq_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vhsubq_m_n_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vhaddq_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vhaddq_m_n_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, veorq_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vcaddq_rot90_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vcaddq_rot270_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vbicq_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vandq_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vaddq_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vaddq_m_n_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vabdq_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_NONE_PRED, vrshlq_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_NONE_PRED, vqshlq_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_NONE_PRED, vqrshlq_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_NONE_PRED, vbrsrq_m_n_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vsliq_m_n_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vshrq_m_n_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vshlq_m_n_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vrshrq_m_n_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vqshlq_m_n_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vsubq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vrshlq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vrmulhq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vrhaddq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqsubq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqsubq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqshlq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqrshlq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqrdmulhq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqrdmulhq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqrdmlsdhxq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqrdmlsdhq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqrdmlashq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqrdmlahq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqrdmladhxq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqrdmladhq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqdmulhq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqdmulhq_m_n_s, v16qi, v8hi, v4si) +VAR3 
(QUADOP_NONE_NONE_NONE_NONE_PRED, vqdmlsdhxq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqdmlsdhq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqdmlahq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqdmlashq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqdmladhxq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqdmladhq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqaddq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqaddq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vorrq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vornq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vmulq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vmulq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vmulltq_int_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vmullbq_int_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vmulhq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vmlsdavaxq_p_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vmlsdavaq_p_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vmlasq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vmlaq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vmladavaxq_p_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vmladavaq_p_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vminq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vmaxq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vhsubq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vhsubq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vhcaddq_rot90_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vhcaddq_rot270_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vhaddq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vhaddq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, veorq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vcaddq_rot90_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vcaddq_rot270_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vbrsrq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vbicq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vandq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vaddq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vaddq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vabdq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_IMM_PRED, vsliq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_IMM_PRED, vshrq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_IMM_PRED, vshlq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_IMM_PRED, vrshrq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_IMM_PRED, vqshlq_m_n_s, v16qi, v8hi, v4si) +VAR2 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vmulltq_poly_m_p, v16qi, v8hi) +VAR2 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vmullbq_poly_m_p, v16qi, v8hi) +VAR2 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vmlaldavaq_p_u, v8hi, v4si) +VAR2 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vshrntq_m_n_u, v8hi, v4si) +VAR2 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vshrnbq_m_n_u, v8hi, v4si) +VAR2 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vshlltq_m_n_u, v16qi, 
v8hi) +VAR2 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vshllbq_m_n_u, v16qi, v8hi) +VAR2 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vrshrntq_m_n_u, v8hi, v4si) +VAR2 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vrshrnbq_m_n_u, v8hi, v4si) +VAR2 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vqshrntq_m_n_u, v8hi, v4si) +VAR2 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vqshrnbq_m_n_u, v8hi, v4si) +VAR2 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vqrshrntq_m_n_u, v8hi, v4si) +VAR2 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vqrshrnbq_m_n_u, v8hi, v4si) +VAR2 (QUADOP_UNONE_UNONE_NONE_IMM_PRED, vqshruntq_m_n_s, v8hi, v4si) +VAR2 (QUADOP_UNONE_UNONE_NONE_IMM_PRED, vqshrunbq_m_n_s, v8hi, v4si) +VAR2 (QUADOP_UNONE_UNONE_NONE_IMM_PRED, vqrshruntq_m_n_s, v8hi, v4si) +VAR2 (QUADOP_UNONE_UNONE_NONE_IMM_PRED, vqrshrunbq_m_n_s, v8hi, v4si) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqdmulltq_m_s, v8hi, v4si) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqdmulltq_m_n_s, v8hi, v4si) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqdmullbq_m_s, v8hi, v4si) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqdmullbq_m_n_s, v8hi, v4si) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vmlsldavaxq_p_s, v8hi, v4si) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vmlsldavaq_p_s, v8hi, v4si) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vmlaldavaxq_p_s, v8hi, v4si) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vmlaldavaq_p_s, v8hi, v4si) +VAR2 (QUADOP_NONE_NONE_NONE_IMM_PRED, vshrntq_m_n_s, v8hi, v4si) +VAR2 (QUADOP_NONE_NONE_NONE_IMM_PRED, vshrnbq_m_n_s, v8hi, v4si) +VAR2 (QUADOP_NONE_NONE_NONE_IMM_PRED, vshlltq_m_n_s, v16qi, v8hi) +VAR2 (QUADOP_NONE_NONE_NONE_IMM_PRED, vshllbq_m_n_s, v16qi, v8hi) +VAR2 (QUADOP_NONE_NONE_NONE_IMM_PRED, vrshrntq_m_n_s, v8hi, v4si) +VAR2 (QUADOP_NONE_NONE_NONE_IMM_PRED, vrshrnbq_m_n_s, v8hi, v4si) +VAR2 (QUADOP_NONE_NONE_NONE_IMM_PRED, vqshrntq_m_n_s, v8hi, v4si) +VAR2 (QUADOP_NONE_NONE_NONE_IMM_PRED, vqshrnbq_m_n_s, v8hi, v4si) +VAR2 (QUADOP_NONE_NONE_NONE_IMM_PRED, vqrshrntq_m_n_s, v8hi, v4si) +VAR2 (QUADOP_NONE_NONE_NONE_IMM_PRED, vqrshrnbq_m_n_s, v8hi, v4si) VAR1 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vrmlaldavhaq_p_u, v4si) VAR1 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vrmlsldavhaxq_p_s, v4si) VAR1 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vrmlsldavhaq_p_s, v4si) VAR1 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vrmlaldavhaxq_p_s, v4si) VAR1 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vrmlaldavhaq_p_s, v4si) -VAR2 (QUADOP_UNONE_UNONE_NONE_IMM_UNONE, vcvtq_m_n_from_f_u, v8hi, v4si) -VAR2 (QUADOP_NONE_NONE_NONE_IMM_UNONE, vcvtq_m_n_from_f_s, v8hi, v4si) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vbrsrq_m_n_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vsubq_m_n_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vsubq_m_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vorrq_m_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vornq_m_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vmulq_m_n_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vmulq_m_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vminnmq_m_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vmaxnmq_m_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vfmsq_m_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vfmasq_m_n_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vfmaq_m_n_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vfmaq_m_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, veorq_m_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vcmulq_rot90_m_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vcmulq_rot270_m_f, v8hf, v4sf) -VAR2 
(QUADOP_NONE_NONE_NONE_NONE_UNONE, vcmulq_rot180_m_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vcmulq_m_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vcmlaq_rot90_m_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vcmlaq_rot270_m_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vcmlaq_rot180_m_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vcmlaq_m_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vcaddq_rot90_m_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vcaddq_rot270_m_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vbicq_m_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vandq_m_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vaddq_m_n_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vaddq_m_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vabdq_m_f, v8hf, v4sf) +VAR2 (QUADOP_UNONE_UNONE_NONE_IMM_PRED, vcvtq_m_n_from_f_u, v8hi, v4si) +VAR2 (QUADOP_NONE_NONE_NONE_IMM_PRED, vcvtq_m_n_from_f_s, v8hi, v4si) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vbrsrq_m_n_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vsubq_m_n_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vsubq_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vorrq_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vornq_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vmulq_m_n_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vmulq_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vminnmq_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vmaxnmq_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vfmsq_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vfmasq_m_n_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vfmaq_m_n_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vfmaq_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, veorq_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vcmulq_rot90_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vcmulq_rot270_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vcmulq_rot180_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vcmulq_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vcmlaq_rot90_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vcmlaq_rot270_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vcmlaq_rot180_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vcmlaq_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vcaddq_rot90_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vcaddq_rot270_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vbicq_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vandq_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vaddq_m_n_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vaddq_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vabdq_m_f, v8hf, v4sf) VAR3 (STRS, vstrbq_s, v16qi, v8hi, v4si) VAR3 (STRU, vstrbq_u, v16qi, v8hi, v4si) VAR3 (STRSS, vstrbq_scatter_offset_s, v16qi, v8hi, v4si) @@ -797,14 +797,14 @@ VAR1 (STRSU_P, vstrwq_scatter_offset_p_u, v4si) VAR1 (STRSU_P, vstrwq_scatter_shifted_offset_p_u, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_IMM, viwdupq_wb_u, v16qi, v4si, v8hi) VAR3 (TERNOP_UNONE_UNONE_UNONE_IMM, vdwdupq_wb_u, v16qi, v4si, v8hi) -VAR3 (QUINOP_UNONE_UNONE_UNONE_UNONE_IMM_UNONE, viwdupq_m_wb_u, v16qi, v8hi, v4si) -VAR3 (QUINOP_UNONE_UNONE_UNONE_UNONE_IMM_UNONE, vdwdupq_m_wb_u, v16qi, v8hi, v4si) -VAR3 (QUINOP_UNONE_UNONE_UNONE_UNONE_IMM_UNONE, 
viwdupq_m_n_u, v16qi, v8hi, v4si) -VAR3 (QUINOP_UNONE_UNONE_UNONE_UNONE_IMM_UNONE, vdwdupq_m_n_u, v16qi, v8hi, v4si) +VAR3 (QUINOP_UNONE_UNONE_UNONE_UNONE_IMM_PRED, viwdupq_m_wb_u, v16qi, v8hi, v4si) +VAR3 (QUINOP_UNONE_UNONE_UNONE_UNONE_IMM_PRED, vdwdupq_m_wb_u, v16qi, v8hi, v4si) +VAR3 (QUINOP_UNONE_UNONE_UNONE_UNONE_IMM_PRED, viwdupq_m_n_u, v16qi, v8hi, v4si) +VAR3 (QUINOP_UNONE_UNONE_UNONE_UNONE_IMM_PRED, vdwdupq_m_n_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_IMM, vddupq_n_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_IMM, vidupq_n_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vddupq_m_n_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vidupq_m_n_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vddupq_m_n_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vidupq_m_n_u, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_IMM, vdwdupq_n_u, v16qi, v4si, v8hi) VAR3 (TERNOP_UNONE_UNONE_UNONE_IMM, viwdupq_n_u, v16qi, v4si, v8hi) VAR1 (STRSBWBU, vstrwq_scatter_base_wb_u, v4si) @@ -870,10 +870,10 @@ VAR1 (UQSHL, urshr_, si) VAR1 (UQSHL, urshrl_, di) VAR1 (UQSHL, uqshl_, si) VAR1 (UQSHL, uqshll_, di) -VAR3 (QUADOP_NONE_NONE_UNONE_IMM_UNONE, vshlcq_m_vec_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_UNONE_IMM_UNONE, vshlcq_m_carry_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vshlcq_m_vec_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vshlcq_m_carry_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_UNONE_IMM_PRED, vshlcq_m_vec_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_UNONE_IMM_PRED, vshlcq_m_carry_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vshlcq_m_vec_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vshlcq_m_carry_u, v16qi, v8hi, v4si) /* optabs without any suffixes. 
*/ VAR5 (BINOP_NONE_NONE_NONE, vcaddq_rot90, v16qi, v8hi, v4si, v8hf, v4sf) diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index 12f05b3..5d51da1 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -130,7 +130,7 @@ (set (match_operand:MVE_0 0 "s_register_operand" "=w") (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VRNDQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -918,7 +918,7 @@ [ (set (match_operand:SI 0 "s_register_operand" "=Te") (unspec:SI [(match_operand:MVE_2 1 "s_register_operand" "w") - (match_operand:HI 2 "vpr_register_operand" "Up")] + (match_operand: 2 "vpr_register_operand" "Up")] VADDVQ_P)) ] "TARGET_HAVE_MVE" @@ -2581,7 +2581,7 @@ (set (match_operand:MVE_5 0 "s_register_operand" "=w") (unspec:MVE_5 [(match_operand:MVE_5 1 "s_register_operand" "0") (match_operand:SI 2 "immediate_operand" "i") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VBICQ_M_N)) ] "TARGET_HAVE_MVE" @@ -2611,7 +2611,7 @@ (set (match_operand:MVE_5 0 "s_register_operand" "=w") (unspec:MVE_5 [(match_operand:MVE_5 1 "s_register_operand" "0") (match_operand: 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCVTAQ_M)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -2626,7 +2626,7 @@ (set (match_operand:MVE_0 0 "s_register_operand" "=w") (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand: 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCVTQ_M_TO_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -2748,7 +2748,7 @@ (set (match_operand:MVE_2 0 "s_register_operand" "=w") (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VABSQ_M_S)) ] "TARGET_HAVE_MVE" @@ -2764,7 +2764,7 @@ (set (match_operand:SI 0 "s_register_operand" "=Te") (unspec:SI [(match_operand:SI 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VADDVAQ_P)) ] "TARGET_HAVE_MVE" @@ -2780,7 +2780,7 @@ (set (match_operand:MVE_2 0 "s_register_operand" "=w") (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCLSQ_M_S)) ] "TARGET_HAVE_MVE" @@ -2796,7 +2796,7 @@ (set (match_operand:MVE_2 0 "s_register_operand" "=w") (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCLZQ_M)) ] "TARGET_HAVE_MVE" @@ -3068,7 +3068,7 @@ (set (match_operand:MVE_2 0 "s_register_operand" "=w") (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand: 2 "s_register_operand" "r") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VDUPQ_M_N)) ] "TARGET_HAVE_MVE" @@ -3084,7 +3084,7 @@ (set (match_operand:MVE_2 0 "s_register_operand" "=w") (unspec:MVE_2 
[(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMAXAQ_M_S)) ] "TARGET_HAVE_MVE" @@ -3100,7 +3100,7 @@ (set (match_operand: 0 "s_register_operand" "=r") (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMAXAVQ_P_S)) ] "TARGET_HAVE_MVE" @@ -3116,7 +3116,7 @@ (set (match_operand: 0 "s_register_operand" "=r") (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMAXVQ_P)) ] "TARGET_HAVE_MVE" @@ -3132,7 +3132,7 @@ (set (match_operand:MVE_2 0 "s_register_operand" "=w") (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMINAQ_M_S)) ] "TARGET_HAVE_MVE" @@ -3148,7 +3148,7 @@ (set (match_operand: 0 "s_register_operand" "=r") (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMINAVQ_P_S)) ] "TARGET_HAVE_MVE" @@ -3164,7 +3164,7 @@ (set (match_operand: 0 "s_register_operand" "=r") (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMINVQ_P)) ] "TARGET_HAVE_MVE" @@ -3196,7 +3196,7 @@ (set (match_operand:SI 0 "s_register_operand" "=Te") (unspec:SI [(match_operand:MVE_2 1 "s_register_operand" "w") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMLADAVQ_P)) ] "TARGET_HAVE_MVE" @@ -3212,7 +3212,7 @@ (set (match_operand:SI 0 "s_register_operand" "=Te") (unspec:SI [(match_operand:MVE_2 1 "s_register_operand" "w") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMLADAVXQ_P_S)) ] "TARGET_HAVE_MVE" @@ -3260,7 +3260,7 @@ (set (match_operand:SI 0 "s_register_operand" "=Te") (unspec:SI [(match_operand:MVE_2 1 "s_register_operand" "w") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMLSDAVQ_P_S)) ] "TARGET_HAVE_MVE" @@ -3276,7 +3276,7 @@ (set (match_operand:SI 0 "s_register_operand" "=Te") (unspec:SI [(match_operand:MVE_2 1 "s_register_operand" "w") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMLSDAVXQ_P_S)) ] "TARGET_HAVE_MVE" @@ -3292,7 +3292,7 @@ (set (match_operand:MVE_2 0 "s_register_operand" "=w") (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMVNQ_M)) ] "TARGET_HAVE_MVE" @@ -3308,7 +3308,7 @@ (set (match_operand:MVE_2 0 "s_register_operand" "=w") (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") 
(match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VNEGQ_M_S)) ] "TARGET_HAVE_MVE" @@ -3340,7 +3340,7 @@ (set (match_operand:MVE_2 0 "s_register_operand" "=w") (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VQABSQ_M_S)) ] "TARGET_HAVE_MVE" @@ -3388,7 +3388,7 @@ (set (match_operand:MVE_2 0 "s_register_operand" "=w") (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VQNEGQ_M_S)) ] "TARGET_HAVE_MVE" @@ -3500,7 +3500,7 @@ (set (match_operand:MVE_2 0 "s_register_operand" "=w") (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:SI 2 "s_register_operand" "r") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VQRSHLQ_M_N)) ] "TARGET_HAVE_MVE" @@ -3516,7 +3516,7 @@ (set (match_operand:MVE_2 0 "s_register_operand" "=w") (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:SI 2 "s_register_operand" "r") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VQSHLQ_M_R)) ] "TARGET_HAVE_MVE" @@ -3532,7 +3532,7 @@ (set (match_operand:MVE_2 0 "s_register_operand" "=w") (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VREV64Q_M)) ] "TARGET_HAVE_MVE" @@ -3548,7 +3548,7 @@ (set (match_operand:MVE_2 0 "s_register_operand" "=w") (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:SI 2 "s_register_operand" "r") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VRSHLQ_M_N)) ] "TARGET_HAVE_MVE" @@ -3564,7 +3564,7 @@ (set (match_operand:MVE_2 0 "s_register_operand" "=w") (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:SI 2 "s_register_operand" "r") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VSHLQ_M_R)) ] "TARGET_HAVE_MVE" @@ -3723,7 +3723,7 @@ (set (match_operand:MVE_0 0 "s_register_operand" "=w") (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VABSQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -4013,7 +4013,7 @@ (set (match_operand:MVE_0 0 "s_register_operand" "=w") (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand: 2 "s_register_operand" "r") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VDUPQ_M_N_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -4092,7 +4092,7 @@ (set (match_operand:MVE_0 0 "s_register_operand" "=w") (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMAXNMAQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -4107,7 +4107,7 @@ (set (match_operand: 0 
"s_register_operand" "=r") (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMAXNMAVQ_P_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -4123,7 +4123,7 @@ (set (match_operand: 0 "s_register_operand" "=r") (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMAXNMVQ_P_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -4138,7 +4138,7 @@ (set (match_operand:MVE_0 0 "s_register_operand" "=w") (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMINNMAQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -4154,7 +4154,7 @@ (set (match_operand: 0 "s_register_operand" "=r") (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMINNMAVQ_P_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -4169,7 +4169,7 @@ (set (match_operand: 0 "s_register_operand" "=r") (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMINNMVQ_P_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -4217,7 +4217,7 @@ (set (match_operand:DI 0 "s_register_operand" "=r") (unspec:DI [(match_operand:MVE_5 1 "s_register_operand" "w") (match_operand:MVE_5 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMLALDAVQ_P)) ] "TARGET_HAVE_MVE" @@ -4233,7 +4233,7 @@ (set (match_operand:DI 0 "s_register_operand" "=r") (unspec:DI [(match_operand:MVE_5 1 "s_register_operand" "w") (match_operand:MVE_5 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMLALDAVXQ_P_S)) ] "TARGET_HAVE_MVE" @@ -4280,7 +4280,7 @@ (set (match_operand:DI 0 "s_register_operand" "=r") (unspec:DI [(match_operand:MVE_5 1 "s_register_operand" "w") (match_operand:MVE_5 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMLSLDAVQ_P_S)) ] "TARGET_HAVE_MVE" @@ -4296,7 +4296,7 @@ (set (match_operand:DI 0 "s_register_operand" "=r") (unspec:DI [(match_operand:MVE_5 1 "s_register_operand" "w") (match_operand:MVE_5 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMLSLDAVXQ_P_S)) ] "TARGET_HAVE_MVE" @@ -4311,7 +4311,7 @@ (set (match_operand: 0 "s_register_operand" "=w") (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_3 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMOVLBQ_M)) ] "TARGET_HAVE_MVE" @@ -4326,7 +4326,7 @@ (set (match_operand: 0 "s_register_operand" "=w") (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_3 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMOVLTQ_M)) ] 
"TARGET_HAVE_MVE" @@ -4341,7 +4341,7 @@ (set (match_operand: 0 "s_register_operand" "=w") (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMOVNBQ_M)) ] "TARGET_HAVE_MVE" @@ -4357,7 +4357,7 @@ (set (match_operand: 0 "s_register_operand" "=w") (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMOVNTQ_M)) ] "TARGET_HAVE_MVE" @@ -4373,7 +4373,7 @@ (set (match_operand:MVE_5 0 "s_register_operand" "=w") (unspec:MVE_5 [(match_operand:MVE_5 1 "s_register_operand" "0") (match_operand:SI 2 "immediate_operand" "i") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMVNQ_M_N)) ] "TARGET_HAVE_MVE" @@ -4388,7 +4388,7 @@ (set (match_operand:MVE_0 0 "s_register_operand" "=w") (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VNEGQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -4404,7 +4404,7 @@ (set (match_operand:MVE_5 0 "s_register_operand" "=w") (unspec:MVE_5 [(match_operand:MVE_5 1 "s_register_operand" "0") (match_operand:SI 2 "immediate_operand" "i") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VORRQ_M_N)) ] "TARGET_HAVE_MVE" @@ -4435,7 +4435,7 @@ (set (match_operand: 0 "s_register_operand" "=w") (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VQMOVNBQ_M)) ] "TARGET_HAVE_MVE" @@ -4451,7 +4451,7 @@ (set (match_operand: 0 "s_register_operand" "=w") (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VQMOVNTQ_M)) ] "TARGET_HAVE_MVE" @@ -4467,7 +4467,7 @@ (set (match_operand: 0 "s_register_operand" "=w") (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VQMOVUNBQ_M_S)) ] "TARGET_HAVE_MVE" @@ -4483,7 +4483,7 @@ (set (match_operand: 0 "s_register_operand" "=w") (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VQMOVUNTQ_M_S)) ] "TARGET_HAVE_MVE" @@ -4611,7 +4611,7 @@ (set (match_operand:MVE_3 0 "s_register_operand" "=w") (unspec:MVE_3 [(match_operand:MVE_3 1 "s_register_operand" "0") (match_operand:MVE_3 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VREV32Q_M)) ] "TARGET_HAVE_MVE" @@ -4627,7 +4627,7 @@ (set (match_operand:MVE_0 0 "s_register_operand" "=w") (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VREV64Q_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ 
-4723,7 +4723,7 @@ (set (match_operand:MVE_0 0 "s_register_operand" "=w") (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VRNDAQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -4739,7 +4739,7 @@ (set (match_operand:MVE_0 0 "s_register_operand" "=w") (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VRNDMQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -4755,7 +4755,7 @@ (set (match_operand:MVE_0 0 "s_register_operand" "=w") (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VRNDNQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -4771,7 +4771,7 @@ (set (match_operand:MVE_0 0 "s_register_operand" "=w") (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VRNDPQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -4787,7 +4787,7 @@ (set (match_operand:MVE_0 0 "s_register_operand" "=w") (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VRNDXQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -4867,7 +4867,7 @@ (set (match_operand:MVE_5 0 "s_register_operand" "=w") (unspec:MVE_5 [(match_operand:MVE_5 1 "s_register_operand" "0") (match_operand: 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCVTMQ_M)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -4883,7 +4883,7 @@ (set (match_operand:MVE_5 0 "s_register_operand" "=w") (unspec:MVE_5 [(match_operand:MVE_5 1 "s_register_operand" "0") (match_operand: 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCVTPQ_M)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -4899,7 +4899,7 @@ (set (match_operand:MVE_5 0 "s_register_operand" "=w") (unspec:MVE_5 [(match_operand:MVE_5 1 "s_register_operand" "0") (match_operand: 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCVTNQ_M)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -4916,7 +4916,7 @@ (unspec:MVE_5 [(match_operand:MVE_5 1 "s_register_operand" "0") (match_operand: 2 "s_register_operand" "w") (match_operand:SI 3 "" "") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VCVTQ_M_N_FROM_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -4948,7 +4948,7 @@ (set (match_operand:MVE_5 0 "s_register_operand" "=w") (unspec:MVE_5 [(match_operand:MVE_5 1 "s_register_operand" "0") (match_operand: 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCVTQ_M_FROM_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -4997,7 +4997,7 @@ (unspec:SI [(match_operand:SI 1 "s_register_operand" "0") (match_operand:MVE_2 2 
"s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VABAVQ_P)) ] "TARGET_HAVE_MVE" @@ -5014,7 +5014,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:SI 3 "mve_imm_7" "Ra") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQSHLUQ_M_N_S)) ] "TARGET_HAVE_MVE" @@ -5030,7 +5030,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VSHLQ_M)) ] "TARGET_HAVE_MVE" @@ -5046,7 +5046,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:SI 3 "mve_imm_selective_upto_8" "Rg") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VSRIQ_M_N)) ] "TARGET_HAVE_MVE" @@ -5062,7 +5062,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VSUBQ_M)) ] "TARGET_HAVE_MVE" @@ -5078,7 +5078,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand: 2 "s_register_operand" "w") (match_operand:SI 3 "" "") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VCVTQ_M_N_TO_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -5094,7 +5094,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VABDQ_M)) ] "TARGET_HAVE_MVE" @@ -5111,7 +5111,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VADDQ_M_N)) ] "TARGET_HAVE_MVE" @@ -5128,7 +5128,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VADDQ_M)) ] "TARGET_HAVE_MVE" @@ -5145,7 +5145,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VANDQ_M)) ] "TARGET_HAVE_MVE" @@ -5162,7 +5162,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VBICQ_M)) ] "TARGET_HAVE_MVE" @@ -5179,7 +5179,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:SI 3 "s_register_operand" "r") - 
(match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VBRSRQ_M_N)) ] "TARGET_HAVE_MVE" @@ -5196,7 +5196,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VCADDQ_ROT270_M)) ] "TARGET_HAVE_MVE" @@ -5213,7 +5213,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VCADDQ_ROT90_M)) ] "TARGET_HAVE_MVE" @@ -5230,7 +5230,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VEORQ_M)) ] "TARGET_HAVE_MVE" @@ -5247,7 +5247,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VHADDQ_M_N)) ] "TARGET_HAVE_MVE" @@ -5264,7 +5264,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VHADDQ_M)) ] "TARGET_HAVE_MVE" @@ -5281,7 +5281,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VHSUBQ_M_N)) ] "TARGET_HAVE_MVE" @@ -5298,7 +5298,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VHSUBQ_M)) ] "TARGET_HAVE_MVE" @@ -5315,7 +5315,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMAXQ_M)) ] "TARGET_HAVE_MVE" @@ -5332,7 +5332,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMINQ_M)) ] "TARGET_HAVE_MVE" @@ -5349,7 +5349,7 @@ (unspec:SI [(match_operand:SI 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMLADAVAQ_P)) ] "TARGET_HAVE_MVE" @@ -5366,7 +5366,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 
"vpr_register_operand" "Up")] VMLAQ_M_N)) ] "TARGET_HAVE_MVE" @@ -5383,7 +5383,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMLASQ_M_N)) ] "TARGET_HAVE_MVE" @@ -5400,7 +5400,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMULHQ_M)) ] "TARGET_HAVE_MVE" @@ -5417,7 +5417,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMULLBQ_INT_M)) ] "TARGET_HAVE_MVE" @@ -5434,7 +5434,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMULLTQ_INT_M)) ] "TARGET_HAVE_MVE" @@ -5451,7 +5451,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMULQ_M_N)) ] "TARGET_HAVE_MVE" @@ -5468,7 +5468,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMULQ_M)) ] "TARGET_HAVE_MVE" @@ -5485,7 +5485,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VORNQ_M)) ] "TARGET_HAVE_MVE" @@ -5502,7 +5502,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VORRQ_M)) ] "TARGET_HAVE_MVE" @@ -5519,7 +5519,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQADDQ_M_N)) ] "TARGET_HAVE_MVE" @@ -5536,7 +5536,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQADDQ_M)) ] "TARGET_HAVE_MVE" @@ -5553,7 +5553,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQDMLAHQ_M_N_S)) ] "TARGET_HAVE_MVE" @@ -5570,7 +5570,7 @@ 
(unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQDMLASHQ_M_N_S)) ] "TARGET_HAVE_MVE" @@ -5587,7 +5587,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQRDMLAHQ_M_N_S)) ] "TARGET_HAVE_MVE" @@ -5604,7 +5604,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQRDMLASHQ_M_N_S)) ] "TARGET_HAVE_MVE" @@ -5621,7 +5621,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQRSHLQ_M)) ] "TARGET_HAVE_MVE" @@ -5638,7 +5638,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:SI 3 "immediate_operand" "i") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQSHLQ_M_N)) ] "TARGET_HAVE_MVE" @@ -5655,7 +5655,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQSHLQ_M)) ] "TARGET_HAVE_MVE" @@ -5672,7 +5672,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQSUBQ_M_N)) ] "TARGET_HAVE_MVE" @@ -5689,7 +5689,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQSUBQ_M)) ] "TARGET_HAVE_MVE" @@ -5706,7 +5706,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VRHADDQ_M)) ] "TARGET_HAVE_MVE" @@ -5723,7 +5723,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VRMULHQ_M)) ] "TARGET_HAVE_MVE" @@ -5740,7 +5740,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VRSHLQ_M)) ] "TARGET_HAVE_MVE" @@ -5757,7 +5757,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" 
"0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:SI 3 "" "") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VRSHRQ_M_N)) ] "TARGET_HAVE_MVE" @@ -5774,7 +5774,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:SI 3 "immediate_operand" "i") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VSHLQ_M_N)) ] "TARGET_HAVE_MVE" @@ -5791,7 +5791,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:SI 3 "" "") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VSHRQ_M_N)) ] "TARGET_HAVE_MVE" @@ -5808,7 +5808,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:SI 3 "" "") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VSLIQ_M_N)) ] "TARGET_HAVE_MVE" @@ -5825,7 +5825,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VSUBQ_M_N)) ] "TARGET_HAVE_MVE" @@ -5842,7 +5842,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VHCADDQ_ROT270_M_S)) ] "TARGET_HAVE_MVE" @@ -5859,7 +5859,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VHCADDQ_ROT90_M_S)) ] "TARGET_HAVE_MVE" @@ -5876,7 +5876,7 @@ (unspec:SI [(match_operand:SI 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMLADAVAXQ_P_S)) ] "TARGET_HAVE_MVE" @@ -5893,7 +5893,7 @@ (unspec:SI [(match_operand:SI 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMLSDAVAQ_P_S)) ] "TARGET_HAVE_MVE" @@ -5910,7 +5910,7 @@ (unspec:SI [(match_operand:SI 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMLSDAVAXQ_P_S)) ] "TARGET_HAVE_MVE" @@ -5927,7 +5927,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQDMLADHQ_M_S)) ] "TARGET_HAVE_MVE" @@ -5944,7 +5944,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - 
(match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQDMLADHXQ_M_S)) ] "TARGET_HAVE_MVE" @@ -5961,7 +5961,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQDMLSDHQ_M_S)) ] "TARGET_HAVE_MVE" @@ -5978,7 +5978,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQDMLSDHXQ_M_S)) ] "TARGET_HAVE_MVE" @@ -5995,7 +5995,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQDMULHQ_M_N_S)) ] "TARGET_HAVE_MVE" @@ -6012,7 +6012,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQDMULHQ_M_S)) ] "TARGET_HAVE_MVE" @@ -6029,7 +6029,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQRDMLADHQ_M_S)) ] "TARGET_HAVE_MVE" @@ -6046,7 +6046,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQRDMLADHXQ_M_S)) ] "TARGET_HAVE_MVE" @@ -6063,7 +6063,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQRDMLSDHQ_M_S)) ] "TARGET_HAVE_MVE" @@ -6080,7 +6080,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQRDMLSDHXQ_M_S)) ] "TARGET_HAVE_MVE" @@ -6097,7 +6097,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQRDMULHQ_M_N_S)) ] "TARGET_HAVE_MVE" @@ -6114,7 +6114,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQRDMULHQ_M_S)) ] "TARGET_HAVE_MVE" @@ -6131,7 +6131,7 @@ (unspec:DI [(match_operand:DI 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") (match_operand:MVE_5 3 "s_register_operand" "w") - (match_operand:HI 
4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMLALDAVAQ_P)) ] "TARGET_HAVE_MVE" @@ -6148,7 +6148,7 @@ (unspec:DI [(match_operand:DI 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") (match_operand:MVE_5 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMLALDAVAXQ_P)) ] "TARGET_HAVE_MVE" @@ -6165,7 +6165,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") (match_operand:SI 3 "mve_imm_8" "Rb") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQRSHRNBQ_M_N)) ] "TARGET_HAVE_MVE" @@ -6182,7 +6182,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") (match_operand:SI 3 "mve_imm_8" "Rb") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQRSHRNTQ_M_N)) ] "TARGET_HAVE_MVE" @@ -6199,7 +6199,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") (match_operand:SI 3 "" "") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQSHRNBQ_M_N)) ] "TARGET_HAVE_MVE" @@ -6216,7 +6216,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") (match_operand:SI 3 "" "") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQSHRNTQ_M_N)) ] "TARGET_HAVE_MVE" @@ -6250,7 +6250,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") (match_operand:SI 3 "mve_imm_8" "Rb") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VRSHRNBQ_M_N)) ] "TARGET_HAVE_MVE" @@ -6267,7 +6267,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") (match_operand:SI 3 "mve_imm_8" "Rb") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VRSHRNTQ_M_N)) ] "TARGET_HAVE_MVE" @@ -6284,7 +6284,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_3 2 "s_register_operand" "w") (match_operand:SI 3 "immediate_operand" "i") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VSHLLBQ_M_N)) ] "TARGET_HAVE_MVE" @@ -6301,7 +6301,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_3 2 "s_register_operand" "w") (match_operand:SI 3 "immediate_operand" "i") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VSHLLTQ_M_N)) ] "TARGET_HAVE_MVE" @@ -6318,7 +6318,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") (match_operand:SI 3 "" "") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VSHRNBQ_M_N)) ] "TARGET_HAVE_MVE" @@ -6335,7 +6335,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") (match_operand:SI 3 "" "") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VSHRNTQ_M_N)) ] "TARGET_HAVE_MVE" @@ -6352,7 +6352,7 @@ (unspec:DI [(match_operand:DI 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") (match_operand:MVE_5 3 
"s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMLSLDAVAQ_P_S)) ] "TARGET_HAVE_MVE" @@ -6369,7 +6369,7 @@ (unspec:DI [(match_operand:DI 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") (match_operand:MVE_5 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMLSLDAVAXQ_P_S)) ] "TARGET_HAVE_MVE" @@ -6386,7 +6386,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_3 2 "s_register_operand" "w") (match_operand:MVE_3 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMULLBQ_POLY_M_P)) ] "TARGET_HAVE_MVE" @@ -6403,7 +6403,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_3 2 "s_register_operand" "w") (match_operand:MVE_3 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMULLTQ_POLY_M_P)) ] "TARGET_HAVE_MVE" @@ -6420,7 +6420,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQDMULLBQ_M_N_S)) ] "TARGET_HAVE_MVE" @@ -6437,7 +6437,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") (match_operand:MVE_5 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQDMULLBQ_M_S)) ] "TARGET_HAVE_MVE" @@ -6454,7 +6454,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQDMULLTQ_M_N_S)) ] "TARGET_HAVE_MVE" @@ -6471,7 +6471,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") (match_operand:MVE_5 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQDMULLTQ_M_S)) ] "TARGET_HAVE_MVE" @@ -6488,7 +6488,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") (match_operand:SI 3 "mve_imm_8" "Rb") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQRSHRUNBQ_M_N_S)) ] "TARGET_HAVE_MVE" @@ -6505,7 +6505,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") (match_operand:SI 3 "" "") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQRSHRUNTQ_M_N_S)) ] "TARGET_HAVE_MVE" @@ -6522,7 +6522,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") (match_operand:SI 3 "" "") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQSHRUNBQ_M_N_S)) ] "TARGET_HAVE_MVE" @@ -6539,7 +6539,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") (match_operand:SI 3 "" "") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQSHRUNTQ_M_N_S)) ] "TARGET_HAVE_MVE" @@ -6623,7 +6623,7 
@@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VABDQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6640,7 +6640,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VADDQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6657,7 +6657,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VADDQ_M_N_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6674,7 +6674,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VANDQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6691,7 +6691,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VBICQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6708,7 +6708,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:SI 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VBRSRQ_M_N_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6725,7 +6725,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VCADDQ_ROT270_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6742,7 +6742,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VCADDQ_ROT90_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6759,7 +6759,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VCMLAQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6776,7 +6776,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VCMLAQ_ROT180_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6793,7 +6793,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 
"s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VCMLAQ_ROT270_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6810,7 +6810,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VCMLAQ_ROT90_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6827,7 +6827,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VCMULQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6844,7 +6844,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VCMULQ_ROT180_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6861,7 +6861,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VCMULQ_ROT270_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6878,7 +6878,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VCMULQ_ROT90_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6895,7 +6895,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VEORQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6912,7 +6912,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VFMAQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6929,7 +6929,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VFMAQ_M_N_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6946,7 +6946,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VFMASQ_M_N_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6963,7 +6963,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - 
(match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VFMSQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6980,7 +6980,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMAXNMQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6997,7 +6997,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMINNMQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -7014,7 +7014,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMULQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -7031,7 +7031,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMULQ_M_N_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -7048,7 +7048,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VORNQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -7065,7 +7065,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VORRQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -7082,7 +7082,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VSUBQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -7099,7 +7099,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VSUBQ_M_N_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -7248,7 +7248,7 @@ [(match_operand: 0 "mve_scatter_memory") (match_operand:MVE_2 1 "s_register_operand") (match_operand:MVE_2 2 "s_register_operand") - (match_operand:HI 3 "vpr_register_operand" "Up") + (match_operand: 3 "vpr_register_operand" "Up") (unspec:V4SI [(const_int 0)] VSTRBSOQ)] "TARGET_HAVE_MVE" { @@ -7267,7 +7267,7 @@ [(match_operand:SI 0 "register_operand" "r") (match_operand:MVE_2 1 "s_register_operand" "w") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VSTRBSOQ))] "TARGET_HAVE_MVE" "vpst\;vstrbt.\t%q2, [%0, %q1]" @@ -7302,7 
+7302,7 @@ (define_insn "mve_vstrbq_p_" [(set (match_operand: 0 "mve_memory_operand" "=Ux") (unspec: [(match_operand:MVE_2 1 "s_register_operand" "w") - (match_operand:HI 2 "vpr_register_operand" "Up")] + (match_operand: 2 "vpr_register_operand" "Up")] VSTRBQ)) ] "TARGET_HAVE_MVE" @@ -7323,7 +7323,7 @@ [(set (match_operand:MVE_2 0 "s_register_operand" "=&w") (unspec:MVE_2 [(match_operand: 1 "memory_operand" "Us") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VLDRBGOQ)) ] "TARGET_HAVE_MVE" @@ -7347,7 +7347,7 @@ (define_insn "mve_vldrbq_z_" [(set (match_operand:MVE_2 0 "s_register_operand" "=w") (unspec:MVE_2 [(match_operand: 1 "mve_memory_operand" "Ux") - (match_operand:HI 2 "vpr_register_operand" "Up")] + (match_operand: 2 "vpr_register_operand" "Up")] VLDRBQ)) ] "TARGET_HAVE_MVE" @@ -7434,7 +7434,7 @@ [(set (match_operand:MVE_6 0 "s_register_operand" "=&w") (unspec:MVE_6 [(match_operand: 1 "memory_operand" "Us") (match_operand:MVE_6 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up") + (match_operand: 3 "vpr_register_operand" "Up") ]VLDRHGOQ)) ] "TARGET_HAVE_MVE" @@ -7482,7 +7482,7 @@ [(set (match_operand:MVE_6 0 "s_register_operand" "=&w") (unspec:MVE_6 [(match_operand: 1 "memory_operand" "Us") (match_operand:MVE_6 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up") + (match_operand: 3 "vpr_register_operand" "Up") ]VLDRHGSOQ)) ] "TARGET_HAVE_MVE" @@ -7548,7 +7548,7 @@ (define_insn "mve_vldrhq_z_" [(set (match_operand:MVE_6 0 "s_register_operand" "=w") (unspec:MVE_6 [(match_operand: 1 "mve_memory_operand" "Ux") - (match_operand:HI 2 "vpr_register_operand" "Up")] + (match_operand: 2 "vpr_register_operand" "Up")] VLDRHQ)) ] "TARGET_HAVE_MVE" @@ -8124,7 +8124,7 @@ (define_insn "mve_vstrhq_p_" [(set (match_operand: 0 "mve_memory_operand" "=Ux") (unspec: [(match_operand:MVE_6 1 "s_register_operand" "w") - (match_operand:HI 2 "vpr_register_operand" "Up")] + (match_operand: 2 "vpr_register_operand" "Up")] VSTRHQ)) ] "TARGET_HAVE_MVE" @@ -8145,7 +8145,7 @@ [(match_operand: 0 "mve_scatter_memory") (match_operand:MVE_6 1 "s_register_operand") (match_operand:MVE_6 2 "s_register_operand") - (match_operand:HI 3 "vpr_register_operand") + (match_operand: 3 "vpr_register_operand") (unspec:V4SI [(const_int 0)] VSTRHSOQ)] "TARGET_HAVE_MVE" { @@ -8164,7 +8164,7 @@ [(match_operand:SI 0 "register_operand" "r") (match_operand:MVE_6 1 "s_register_operand" "w") (match_operand:MVE_6 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VSTRHSOQ))] "TARGET_HAVE_MVE" "vpst\;vstrht.\t%q2, [%0, %q1]" @@ -8205,7 +8205,7 @@ [(match_operand: 0 "mve_scatter_memory") (match_operand:MVE_6 1 "s_register_operand") (match_operand:MVE_6 2 "s_register_operand") - (match_operand:HI 3 "vpr_register_operand") + (match_operand: 3 "vpr_register_operand") (unspec:V4SI [(const_int 0)] VSTRHSSOQ)] "TARGET_HAVE_MVE" { @@ -8224,7 +8224,7 @@ [(match_operand:SI 0 "register_operand" "r") (match_operand:MVE_6 1 "s_register_operand" "w") (match_operand:MVE_6 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VSTRHSSOQ))] "TARGET_HAVE_MVE" "vpst\;vstrht.\t%q2, [%0, %q1, uxtw #1]" @@ -9011,7 +9011,7 @@ (match_operand:MVE_2 1 "s_register_operand") (match_operand:SI 2 "s_register_operand") (match_operand:SI 3 "mve_imm_selective_upto_8") - 
(match_operand:HI 4 "vpr_register_operand")] + (match_operand: 4 "vpr_register_operand")] "TARGET_HAVE_MVE" { rtx temp = gen_reg_rtx (SImode); @@ -9031,7 +9031,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:SI 3 "s_register_operand" "2") (match_operand:SI 4 "mve_imm_selective_upto_8" "Rg") - (match_operand:HI 5 "vpr_register_operand" "Up")] + (match_operand: 5 "vpr_register_operand" "Up")] VIDUPQ_M)) (set (match_operand:SI 2 "s_register_operand" "=Te") (plus:SI (match_dup 3) @@ -9079,7 +9079,7 @@ (match_operand:MVE_2 1 "s_register_operand") (match_operand:SI 2 "s_register_operand") (match_operand:SI 3 "mve_imm_selective_upto_8") - (match_operand:HI 4 "vpr_register_operand")] + (match_operand: 4 "vpr_register_operand")] "TARGET_HAVE_MVE" { rtx temp = gen_reg_rtx (SImode); @@ -9099,7 +9099,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:SI 3 "s_register_operand" "2") (match_operand:SI 4 "mve_imm_selective_upto_8" "Rg") - (match_operand:HI 5 "vpr_register_operand" "Up")] + (match_operand: 5 "vpr_register_operand" "Up")] VDDUPQ_M)) (set (match_operand:SI 2 "s_register_operand" "=Te") (minus:SI (match_dup 3) @@ -9170,7 +9170,7 @@ (match_operand:SI 2 "s_register_operand") (match_operand:DI 3 "s_register_operand") (match_operand:SI 4 "mve_imm_selective_upto_8") - (match_operand:HI 5 "vpr_register_operand")] + (match_operand: 5 "vpr_register_operand")] "TARGET_HAVE_MVE" { rtx ignore_wb = gen_reg_rtx (SImode); @@ -9190,7 +9190,7 @@ (match_operand:SI 2 "s_register_operand") (match_operand:DI 3 "s_register_operand") (match_operand:SI 4 "mve_imm_selective_upto_8") - (match_operand:HI 5 "vpr_register_operand")] + (match_operand: 5 "vpr_register_operand")] "TARGET_HAVE_MVE" { rtx ignore_vec = gen_reg_rtx (mode); @@ -9210,7 +9210,7 @@ (match_operand:SI 3 "s_register_operand" "1") (subreg:SI (match_operand:DI 4 "s_register_operand" "r") 4) (match_operand:SI 5 "mve_imm_selective_upto_8" "Rg") - (match_operand:HI 6 "vpr_register_operand" "Up")] + (match_operand: 6 "vpr_register_operand" "Up")] VDWDUPQ_M)) (set (match_operand:SI 1 "s_register_operand" "=Te") (unspec:SI [(match_dup 2) @@ -9287,7 +9287,7 @@ (match_operand:SI 2 "s_register_operand") (match_operand:DI 3 "s_register_operand") (match_operand:SI 4 "mve_imm_selective_upto_8") - (match_operand:HI 5 "vpr_register_operand")] + (match_operand: 5 "vpr_register_operand")] "TARGET_HAVE_MVE" { rtx ignore_wb = gen_reg_rtx (SImode); @@ -9307,7 +9307,7 @@ (match_operand:SI 2 "s_register_operand") (match_operand:DI 3 "s_register_operand") (match_operand:SI 4 "mve_imm_selective_upto_8") - (match_operand:HI 5 "vpr_register_operand")] + (match_operand: 5 "vpr_register_operand")] "TARGET_HAVE_MVE" { rtx ignore_vec = gen_reg_rtx (mode); @@ -9327,7 +9327,7 @@ (match_operand:SI 3 "s_register_operand" "1") (subreg:SI (match_operand:DI 4 "s_register_operand" "r") 4) (match_operand:SI 5 "mve_imm_selective_upto_8" "Rg") - (match_operand:HI 6 "vpr_register_operand" "Up")] + (match_operand: 6 "vpr_register_operand" "Up")] VIWDUPQ_M)) (set (match_operand:SI 1 "s_register_operand" "=Te") (unspec:SI [(match_dup 2) @@ -10335,7 +10335,7 @@ (match_operand:MVE_2 1 "s_register_operand") (match_operand:SI 2 "s_register_operand") (match_operand:SI 3 "mve_imm_32") - (match_operand:HI 4 "vpr_register_operand") + (match_operand: 4 "vpr_register_operand") (unspec:MVE_2 [(const_int 0)] VSHLCQ_M)] "TARGET_HAVE_MVE" { @@ -10351,7 +10351,7 @@ (match_operand:MVE_2 1 "s_register_operand") (match_operand:SI 2 
"s_register_operand") (match_operand:SI 3 "mve_imm_32") - (match_operand:HI 4 "vpr_register_operand") + (match_operand: 4 "vpr_register_operand") (unspec:MVE_2 [(const_int 0)] VSHLCQ_M)] "TARGET_HAVE_MVE" { @@ -10367,7 +10367,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 2 "s_register_operand" "0") (match_operand:SI 3 "s_register_operand" "1") (match_operand:SI 4 "mve_imm_32" "Rf") - (match_operand:HI 5 "vpr_register_operand" "Up")] + (match_operand: 5 "vpr_register_operand" "Up")] VSHLCQ_M)) (set (match_operand:SI 1 "s_register_operand" "=r") (unspec:SI [(match_dup 2) -- cgit v1.1 From 6a7c13a0cf2290b60ab36f9ce1027b92838586bd Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Wed, 20 Oct 2021 15:39:17 +0000 Subject: arm: Convert more load/store MVE builtins to predicate qualifiers This patch covers a few builtins where we do not use the iterator and thus we cannot use . For v2di instructions, we keep the HI mode for predicates. Most of the work of this patch series was carried out while I was working at STMicroelectronics as a Linaro assignee. 2022-02-22 Christophe Lyon gcc/ PR target/100757 PR target/101325 * config/arm/arm-builtins.cc (STRSBS_P_QUALIFIERS): Use predicate qualifier. (STRSBU_P_QUALIFIERS): Likewise. (LDRGBS_Z_QUALIFIERS): Likewise. (LDRGBU_Z_QUALIFIERS): Likewise. (LDRGBWBXU_Z_QUALIFIERS): Likewise. (LDRGBWBS_Z_QUALIFIERS): Likewise. (LDRGBWBU_Z_QUALIFIERS): Likewise. (STRSBWBS_P_QUALIFIERS): Likewise. (STRSBWBU_P_QUALIFIERS): Likewise. * config/arm/mve.md: Use VxBI instead of HI. --- gcc/config/arm/arm-builtins.cc | 18 +++++------ gcc/config/arm/mve.md | 68 +++++++++++++++++++++--------------------- 2 files changed, 43 insertions(+), 43 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm-builtins.cc b/gcc/config/arm/arm-builtins.cc index a9536b2..5d582f1 100644 --- a/gcc/config/arm/arm-builtins.cc +++ b/gcc/config/arm/arm-builtins.cc @@ -689,13 +689,13 @@ arm_strss_p_qualifiers[SIMD_MAX_BUILTIN_ARGS] static enum arm_type_qualifiers arm_strsbs_p_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_void, qualifier_unsigned, qualifier_immediate, - qualifier_none, qualifier_unsigned}; + qualifier_none, qualifier_predicate}; #define STRSBS_P_QUALIFIERS (arm_strsbs_p_qualifiers) static enum arm_type_qualifiers arm_strsbu_p_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_void, qualifier_unsigned, qualifier_immediate, - qualifier_unsigned, qualifier_unsigned}; + qualifier_unsigned, qualifier_predicate}; #define STRSBU_P_QUALIFIERS (arm_strsbu_p_qualifiers) static enum arm_type_qualifiers @@ -731,13 +731,13 @@ arm_ldrgbu_qualifiers[SIMD_MAX_BUILTIN_ARGS] static enum arm_type_qualifiers arm_ldrgbs_z_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_none, qualifier_unsigned, qualifier_immediate, - qualifier_unsigned}; + qualifier_predicate}; #define LDRGBS_Z_QUALIFIERS (arm_ldrgbs_z_qualifiers) static enum arm_type_qualifiers arm_ldrgbu_z_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_unsigned, qualifier_unsigned, qualifier_immediate, - qualifier_unsigned}; + qualifier_predicate}; #define LDRGBU_Z_QUALIFIERS (arm_ldrgbu_z_qualifiers) static enum arm_type_qualifiers @@ -777,7 +777,7 @@ arm_ldrgbwbxu_qualifiers[SIMD_MAX_BUILTIN_ARGS] static enum arm_type_qualifiers arm_ldrgbwbxu_z_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_unsigned, qualifier_unsigned, qualifier_immediate, - qualifier_unsigned}; + qualifier_predicate}; #define LDRGBWBXU_Z_QUALIFIERS (arm_ldrgbwbxu_z_qualifiers) static enum arm_type_qualifiers @@ -793,13 +793,13 @@ 
From 6a7c13a0cf2290b60ab36f9ce1027b92838586bd Mon Sep 17 00:00:00 2001
From: Christophe Lyon
Date: Wed, 20 Oct 2021 15:39:17 +0000
Subject: arm: Convert more load/store MVE builtins to predicate qualifiers

This patch covers a few builtins where we do not use the <mode> iterator
and thus we cannot use <MVE_VPRED>.  For v2di instructions, we keep the
HI mode for predicates.

Most of the work of this patch series was carried out while I was
working at STMicroelectronics as a Linaro assignee.

2022-02-22  Christophe Lyon

gcc/
	PR target/100757
	PR target/101325
	* config/arm/arm-builtins.cc (STRSBS_P_QUALIFIERS): Use predicate
	qualifier.
	(STRSBU_P_QUALIFIERS): Likewise.
	(LDRGBS_Z_QUALIFIERS): Likewise.
	(LDRGBU_Z_QUALIFIERS): Likewise.
	(LDRGBWBXU_Z_QUALIFIERS): Likewise.
	(LDRGBWBS_Z_QUALIFIERS): Likewise.
	(LDRGBWBU_Z_QUALIFIERS): Likewise.
	(STRSBWBS_P_QUALIFIERS): Likewise.
	(STRSBWBU_P_QUALIFIERS): Likewise.
	* config/arm/mve.md: Use VxBI instead of HI.
---
 gcc/config/arm/arm-builtins.cc | 18 +++++------
 gcc/config/arm/mve.md          | 68 +++++++++++++++++++++---------------
 2 files changed, 43 insertions(+), 43 deletions(-)

(limited to 'gcc/config')

diff --git a/gcc/config/arm/arm-builtins.cc b/gcc/config/arm/arm-builtins.cc
index a9536b2..5d582f1 100644
--- a/gcc/config/arm/arm-builtins.cc
+++ b/gcc/config/arm/arm-builtins.cc
@@ -689,13 +689,13 @@ arm_strss_p_qualifiers[SIMD_MAX_BUILTIN_ARGS]
 static enum arm_type_qualifiers
 arm_strsbs_p_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_void, qualifier_unsigned, qualifier_immediate,
-      qualifier_none, qualifier_unsigned};
+      qualifier_none, qualifier_predicate};
 #define STRSBS_P_QUALIFIERS (arm_strsbs_p_qualifiers)
 
 static enum arm_type_qualifiers
 arm_strsbu_p_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_void, qualifier_unsigned, qualifier_immediate,
-      qualifier_unsigned, qualifier_unsigned};
+      qualifier_unsigned, qualifier_predicate};
 #define STRSBU_P_QUALIFIERS (arm_strsbu_p_qualifiers)
 
 static enum arm_type_qualifiers
@@ -731,13 +731,13 @@ arm_ldrgbu_qualifiers[SIMD_MAX_BUILTIN_ARGS]
 static enum arm_type_qualifiers
 arm_ldrgbs_z_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_none, qualifier_unsigned, qualifier_immediate,
-      qualifier_unsigned};
+      qualifier_predicate};
 #define LDRGBS_Z_QUALIFIERS (arm_ldrgbs_z_qualifiers)
 
 static enum arm_type_qualifiers
 arm_ldrgbu_z_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_unsigned, qualifier_unsigned, qualifier_immediate,
-      qualifier_unsigned};
+      qualifier_predicate};
 #define LDRGBU_Z_QUALIFIERS (arm_ldrgbu_z_qualifiers)
 
 static enum arm_type_qualifiers
@@ -777,7 +777,7 @@ arm_ldrgbwbxu_qualifiers[SIMD_MAX_BUILTIN_ARGS]
 static enum arm_type_qualifiers
 arm_ldrgbwbxu_z_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_unsigned, qualifier_unsigned, qualifier_immediate,
-      qualifier_unsigned};
+      qualifier_predicate};
 #define LDRGBWBXU_Z_QUALIFIERS (arm_ldrgbwbxu_z_qualifiers)
 
 static enum arm_type_qualifiers
@@ -793,13 +793,13 @@ arm_ldrgbwbu_qualifiers[SIMD_MAX_BUILTIN_ARGS]
 static enum arm_type_qualifiers
 arm_ldrgbwbs_z_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_none, qualifier_unsigned, qualifier_immediate,
-      qualifier_unsigned};
+      qualifier_predicate};
 #define LDRGBWBS_Z_QUALIFIERS (arm_ldrgbwbs_z_qualifiers)
 
 static enum arm_type_qualifiers
 arm_ldrgbwbu_z_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_unsigned, qualifier_unsigned, qualifier_immediate,
-      qualifier_unsigned};
+      qualifier_predicate};
 #define LDRGBWBU_Z_QUALIFIERS (arm_ldrgbwbu_z_qualifiers)
 
 static enum arm_type_qualifiers
@@ -815,13 +815,13 @@ arm_strsbwbu_qualifiers[SIMD_MAX_BUILTIN_ARGS]
 static enum arm_type_qualifiers
 arm_strsbwbs_p_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_unsigned, qualifier_unsigned, qualifier_const,
-      qualifier_none, qualifier_unsigned};
+      qualifier_none, qualifier_predicate};
 #define STRSBWBS_P_QUALIFIERS (arm_strsbwbs_p_qualifiers)
 
 static enum arm_type_qualifiers
 arm_strsbwbu_p_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_unsigned, qualifier_unsigned, qualifier_const,
-      qualifier_unsigned, qualifier_unsigned};
+      qualifier_unsigned, qualifier_predicate};
 #define STRSBWBU_P_QUALIFIERS (arm_strsbwbu_p_qualifiers)
 
 static enum arm_type_qualifiers
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 5d51da1..e291c67 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
   [... 34 hunks between lines 7282 and 9694: in each fixed-width pattern the
    predicate operand (match_operand:HI n "vpr_register_operand" ...) becomes
    (match_operand:V4BI ...) for the V4SI/V4SF insns and (match_operand:V8BI ...)
    for the V8HI/V8HF insns, covering the VSTRWSBQ, VLDRWGBQ, VLDRWQ_F, VLDRWQ,
    VLDRHQGO_F, VLDRHQGSO_F, VLDRWQGB_F, VLDRWQGO_F, VLDRWGOQ, VLDRWQGSO_F,
    VLDRWGSOQ, VSTRHQ_F, VSTRWQ, VSTRHQSO_F, VSTRHQSSO_F, VSTRWQSB_F, VSTRWQSO_F,
    VSTRWSOQ, VSTRWQSSO_F, VSTRWSSOQ, VSTRWSBWBQ, VSTRWQSBWB_F, VLDRWGBWBQ and
    VLDRWQGBWB_F patterns ...]

-- 
cgit v1.1
(match_operand:V4SF 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand")] + (match_operand:V4BI 4 "vpr_register_operand")] VSTRWQSBWB_F)) (set (match_operand:V4SI 0 "s_register_operand" "=w") (unspec:V4SI [(match_dup 1) (match_dup 2)] @@ -9551,7 +9551,7 @@ [(match_operand:V4SI 0 "s_register_operand") (match_operand:V4SI 1 "s_register_operand") (match_operand:SI 2 "mve_vldrd_immediate") - (match_operand:HI 3 "vpr_register_operand") + (match_operand:V4BI 3 "vpr_register_operand") (unspec:V4SI [(const_int 0)] VLDRWGBWBQ)] "TARGET_HAVE_MVE" { @@ -9566,7 +9566,7 @@ [(match_operand:V4SI 0 "s_register_operand") (match_operand:V4SI 1 "s_register_operand") (match_operand:SI 2 "mve_vldrd_immediate") - (match_operand:HI 3 "vpr_register_operand") + (match_operand:V4BI 3 "vpr_register_operand") (unspec:V4SI [(const_int 0)] VLDRWGBWBQ)] "TARGET_HAVE_MVE" { @@ -9585,7 +9585,7 @@ [(set (match_operand:V4SI 0 "s_register_operand" "=&w") (unspec:V4SI [(match_operand:V4SI 2 "s_register_operand" "1") (match_operand:SI 3 "mve_vldrd_immediate" "Ri") - (match_operand:HI 4 "vpr_register_operand" "Up") + (match_operand:V4BI 4 "vpr_register_operand" "Up") (mem:BLK (scratch))] VLDRWGBWBQ)) (set (match_operand:V4SI 1 "s_register_operand" "=&w") @@ -9659,7 +9659,7 @@ [(match_operand:V4SI 0 "s_register_operand") (match_operand:V4SI 1 "s_register_operand") (match_operand:SI 2 "mve_vldrd_immediate") - (match_operand:HI 3 "vpr_register_operand") + (match_operand:V4BI 3 "vpr_register_operand") (unspec:V4SI [(const_int 0)] VLDRWQGBWB_F)] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" { @@ -9675,7 +9675,7 @@ [(match_operand:V4SF 0 "s_register_operand") (match_operand:V4SI 1 "s_register_operand") (match_operand:SI 2 "mve_vldrd_immediate") - (match_operand:HI 3 "vpr_register_operand") + (match_operand:V4BI 3 "vpr_register_operand") (unspec:V4SI [(const_int 0)] VLDRWQGBWB_F)] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" { @@ -9694,7 +9694,7 @@ [(set (match_operand:V4SF 0 "s_register_operand" "=&w") (unspec:V4SF [(match_operand:V4SI 2 "s_register_operand" "1") (match_operand:SI 3 "mve_vldrd_immediate" "Ri") - (match_operand:HI 4 "vpr_register_operand" "Up") + (match_operand:V4BI 4 "vpr_register_operand" "Up") (mem:BLK (scratch))] VLDRWQGBWB_F)) (set (match_operand:V4SI 1 "s_register_operand" "=&w") -- cgit v1.1 From c6b4ea7ab1aa6c5c07798fa6c6ad15dd1761b5ed Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Wed, 13 Oct 2021 09:16:49 +0000 Subject: arm: Convert more MVE/CDE builtins to predicate qualifiers This patch covers a few non-load/store builtins where we do not use the iterator and thus we cannot use . Most of the work of this patch series was carried out while I was working at STMicroelectronics as a Linaro assignee. 2022-02-22 Christophe Lyon gcc/ PR target/100757 PR target/101325 * config/arm/arm-builtins.cc (CX_UNARY_UNONE_QUALIFIERS): Use predicate. (CX_BINARY_UNONE_QUALIFIERS): Likewise. (CX_TERNARY_UNONE_QUALIFIERS): Likewise. (TERNOP_NONE_NONE_NONE_UNONE_QUALIFIERS): Delete. (QUADOP_NONE_NONE_NONE_NONE_UNONE_QUALIFIERS): Delete. (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE_QUALIFIERS): Delete. * config/arm/arm_mve_builtins.def: Use predicated qualifiers. * config/arm/mve.md: Use VxBI instead of HI. 
--- gcc/config/arm/arm-builtins.cc | 26 ++--------------- gcc/config/arm/arm_mve_builtins.def | 58 ++++++++++++++++++------------------- gcc/config/arm/mve.md | 52 ++++++++++++++++----------------- 3 files changed, 58 insertions(+), 78 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm-builtins.cc b/gcc/config/arm/arm-builtins.cc index 5d582f1..a7acc1d 100644 --- a/gcc/config/arm/arm-builtins.cc +++ b/gcc/config/arm/arm-builtins.cc @@ -295,7 +295,7 @@ static enum arm_type_qualifiers arm_cx_unary_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_none, qualifier_immediate, qualifier_none, qualifier_unsigned_immediate, - qualifier_unsigned }; + qualifier_predicate }; #define CX_UNARY_UNONE_QUALIFIERS (arm_cx_unary_unone_qualifiers) /* T (immediate, T, T, unsigned immediate). */ @@ -304,7 +304,7 @@ arm_cx_binary_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_none, qualifier_immediate, qualifier_none, qualifier_none, qualifier_unsigned_immediate, - qualifier_unsigned }; + qualifier_predicate }; #define CX_BINARY_UNONE_QUALIFIERS (arm_cx_binary_unone_qualifiers) /* T (immediate, T, T, T, unsigned immediate). */ @@ -313,7 +313,7 @@ arm_cx_ternary_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_none, qualifier_immediate, qualifier_none, qualifier_none, qualifier_none, qualifier_unsigned_immediate, - qualifier_unsigned }; + qualifier_predicate }; #define CX_TERNARY_UNONE_QUALIFIERS (arm_cx_ternary_unone_qualifiers) /* The first argument (return type) of a store should be void type, @@ -510,12 +510,6 @@ arm_ternop_none_none_none_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS] (arm_ternop_none_none_none_imm_qualifiers) static enum arm_type_qualifiers -arm_ternop_none_none_none_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] - = { qualifier_none, qualifier_none, qualifier_none, qualifier_unsigned }; -#define TERNOP_NONE_NONE_NONE_UNONE_QUALIFIERS \ - (arm_ternop_none_none_none_unone_qualifiers) - -static enum arm_type_qualifiers arm_ternop_none_none_none_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_none, qualifier_none, qualifier_none, qualifier_predicate }; #define TERNOP_NONE_NONE_NONE_PRED_QUALIFIERS \ @@ -568,13 +562,6 @@ arm_quadop_unone_unone_none_none_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] (arm_quadop_unone_unone_none_none_pred_qualifiers) static enum arm_type_qualifiers -arm_quadop_none_none_none_none_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] - = { qualifier_none, qualifier_none, qualifier_none, qualifier_none, - qualifier_unsigned }; -#define QUADOP_NONE_NONE_NONE_NONE_UNONE_QUALIFIERS \ - (arm_quadop_none_none_none_none_unone_qualifiers) - -static enum arm_type_qualifiers arm_quadop_none_none_none_none_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_none, qualifier_none, qualifier_none, qualifier_none, qualifier_predicate }; @@ -589,13 +576,6 @@ arm_quadop_none_none_none_imm_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] (arm_quadop_none_none_none_imm_pred_qualifiers) static enum arm_type_qualifiers -arm_quadop_unone_unone_unone_unone_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] - = { qualifier_unsigned, qualifier_unsigned, qualifier_unsigned, - qualifier_unsigned, qualifier_unsigned }; -#define QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE_QUALIFIERS \ - (arm_quadop_unone_unone_unone_unone_unone_qualifiers) - -static enum arm_type_qualifiers arm_quadop_unone_unone_unone_unone_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_unsigned, qualifier_unsigned, qualifier_unsigned, qualifier_unsigned, qualifier_predicate }; diff --git a/gcc/config/arm/arm_mve_builtins.def 
b/gcc/config/arm/arm_mve_builtins.def index 7db6d47..1c8ee34 100644 --- a/gcc/config/arm/arm_mve_builtins.def +++ b/gcc/config/arm/arm_mve_builtins.def @@ -87,8 +87,8 @@ VAR4 (BINOP_UNONE_UNONE_UNONE, vcreateq_u, v16qi, v8hi, v4si, v2di) VAR4 (BINOP_NONE_UNONE_UNONE, vcreateq_s, v16qi, v8hi, v4si, v2di) VAR3 (BINOP_UNONE_UNONE_IMM, vshrq_n_u, v16qi, v8hi, v4si) VAR3 (BINOP_NONE_NONE_IMM, vshrq_n_s, v16qi, v8hi, v4si) -VAR1 (BINOP_NONE_NONE_UNONE, vaddlvq_p_s, v4si) -VAR1 (BINOP_UNONE_UNONE_UNONE, vaddlvq_p_u, v4si) +VAR1 (BINOP_NONE_NONE_PRED, vaddlvq_p_s, v4si) +VAR1 (BINOP_UNONE_UNONE_PRED, vaddlvq_p_u, v4si) VAR3 (BINOP_PRED_NONE_NONE, vcmpneq_, v16qi, v8hi, v4si) VAR3 (BINOP_NONE_NONE_NONE, vshlq_s, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_NONE, vshlq_u, v16qi, v8hi, v4si) @@ -465,20 +465,20 @@ VAR2 (TERNOP_NONE_NONE_NONE_IMM, vqshrnbq_n_s, v8hi, v4si) VAR2 (TERNOP_NONE_NONE_NONE_IMM, vqrshrntq_n_s, v8hi, v4si) VAR2 (TERNOP_NONE_NONE_IMM_PRED, vorrq_m_n_s, v8hi, v4si) VAR2 (TERNOP_NONE_NONE_IMM_PRED, vmvnq_m_n_s, v8hi, v4si) -VAR1 (TERNOP_UNONE_UNONE_UNONE_UNONE, vrmlaldavhq_p_u, v4si) -VAR1 (TERNOP_UNONE_UNONE_UNONE_UNONE, vrev16q_m_u, v16qi) -VAR1 (TERNOP_UNONE_UNONE_UNONE_UNONE, vaddlvaq_p_u, v4si) -VAR1 (TERNOP_NONE_NONE_NONE_UNONE, vrmlsldavhxq_p_s, v4si) -VAR1 (TERNOP_NONE_NONE_NONE_UNONE, vrmlsldavhq_p_s, v4si) -VAR1 (TERNOP_NONE_NONE_NONE_UNONE, vrmlaldavhxq_p_s, v4si) -VAR1 (TERNOP_NONE_NONE_NONE_UNONE, vrmlaldavhq_p_s, v4si) -VAR1 (TERNOP_NONE_NONE_NONE_UNONE, vrev32q_m_f, v8hf) -VAR1 (TERNOP_NONE_NONE_NONE_UNONE, vrev16q_m_s, v16qi) -VAR1 (TERNOP_NONE_NONE_NONE_UNONE, vcvttq_m_f32_f16, v4sf) -VAR1 (TERNOP_NONE_NONE_NONE_UNONE, vcvttq_m_f16_f32, v8hf) -VAR1 (TERNOP_NONE_NONE_NONE_UNONE, vcvtbq_m_f32_f16, v4sf) -VAR1 (TERNOP_NONE_NONE_NONE_UNONE, vcvtbq_m_f16_f32, v8hf) -VAR1 (TERNOP_NONE_NONE_NONE_UNONE, vaddlvaq_p_s, v4si) +VAR1 (TERNOP_UNONE_UNONE_UNONE_PRED, vrmlaldavhq_p_u, v4si) +VAR1 (TERNOP_UNONE_UNONE_UNONE_PRED, vrev16q_m_u, v16qi) +VAR1 (TERNOP_UNONE_UNONE_UNONE_PRED, vaddlvaq_p_u, v4si) +VAR1 (TERNOP_NONE_NONE_NONE_PRED, vrmlsldavhxq_p_s, v4si) +VAR1 (TERNOP_NONE_NONE_NONE_PRED, vrmlsldavhq_p_s, v4si) +VAR1 (TERNOP_NONE_NONE_NONE_PRED, vrmlaldavhxq_p_s, v4si) +VAR1 (TERNOP_NONE_NONE_NONE_PRED, vrmlaldavhq_p_s, v4si) +VAR1 (TERNOP_NONE_NONE_NONE_PRED, vrev32q_m_f, v8hf) +VAR1 (TERNOP_NONE_NONE_NONE_PRED, vrev16q_m_s, v16qi) +VAR1 (TERNOP_NONE_NONE_NONE_PRED, vcvttq_m_f32_f16, v4sf) +VAR1 (TERNOP_NONE_NONE_NONE_PRED, vcvttq_m_f16_f32, v8hf) +VAR1 (TERNOP_NONE_NONE_NONE_PRED, vcvtbq_m_f32_f16, v4sf) +VAR1 (TERNOP_NONE_NONE_NONE_PRED, vcvtbq_m_f16_f32, v8hf) +VAR1 (TERNOP_NONE_NONE_NONE_PRED, vaddlvaq_p_s, v4si) VAR1 (TERNOP_NONE_NONE_NONE_NONE, vrmlsldavhaxq_s, v4si) VAR1 (TERNOP_NONE_NONE_NONE_NONE, vrmlsldavhaq_s, v4si) VAR1 (TERNOP_NONE_NONE_NONE_NONE, vrmlaldavhaxq_s, v4si) @@ -629,11 +629,11 @@ VAR2 (QUADOP_NONE_NONE_NONE_IMM_PRED, vqshrntq_m_n_s, v8hi, v4si) VAR2 (QUADOP_NONE_NONE_NONE_IMM_PRED, vqshrnbq_m_n_s, v8hi, v4si) VAR2 (QUADOP_NONE_NONE_NONE_IMM_PRED, vqrshrntq_m_n_s, v8hi, v4si) VAR2 (QUADOP_NONE_NONE_NONE_IMM_PRED, vqrshrnbq_m_n_s, v8hi, v4si) -VAR1 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vrmlaldavhaq_p_u, v4si) -VAR1 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vrmlsldavhaxq_p_s, v4si) -VAR1 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vrmlsldavhaq_p_s, v4si) -VAR1 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vrmlaldavhaxq_p_s, v4si) -VAR1 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vrmlaldavhaq_p_s, v4si) +VAR1 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vrmlaldavhaq_p_u, v4si) +VAR1 
(QUADOP_NONE_NONE_NONE_NONE_PRED, vrmlsldavhaxq_p_s, v4si) +VAR1 (QUADOP_NONE_NONE_NONE_NONE_PRED, vrmlsldavhaq_p_s, v4si) +VAR1 (QUADOP_NONE_NONE_NONE_NONE_PRED, vrmlaldavhaxq_p_s, v4si) +VAR1 (QUADOP_NONE_NONE_NONE_NONE_PRED, vrmlaldavhaq_p_s, v4si) VAR2 (QUADOP_UNONE_UNONE_NONE_IMM_PRED, vcvtq_m_n_from_f_u, v8hi, v4si) VAR2 (QUADOP_NONE_NONE_NONE_IMM_PRED, vcvtq_m_n_from_f_s, v8hi, v4si) VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vbrsrq_m_n_f, v8hf, v4sf) @@ -845,14 +845,14 @@ VAR1 (BINOP_NONE_NONE_NONE, vsbciq_s, v4si) VAR1 (BINOP_UNONE_UNONE_UNONE, vsbciq_u, v4si) VAR1 (BINOP_NONE_NONE_NONE, vsbcq_s, v4si) VAR1 (BINOP_UNONE_UNONE_UNONE, vsbcq_u, v4si) -VAR1 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vadciq_m_s, v4si) -VAR1 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vadciq_m_u, v4si) -VAR1 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vadcq_m_s, v4si) -VAR1 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vadcq_m_u, v4si) -VAR1 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vsbciq_m_s, v4si) -VAR1 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vsbciq_m_u, v4si) -VAR1 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vsbcq_m_s, v4si) -VAR1 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vsbcq_m_u, v4si) +VAR1 (QUADOP_NONE_NONE_NONE_NONE_PRED, vadciq_m_s, v4si) +VAR1 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vadciq_m_u, v4si) +VAR1 (QUADOP_NONE_NONE_NONE_NONE_PRED, vadcq_m_s, v4si) +VAR1 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vadcq_m_u, v4si) +VAR1 (QUADOP_NONE_NONE_NONE_NONE_PRED, vsbciq_m_s, v4si) +VAR1 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vsbciq_m_u, v4si) +VAR1 (QUADOP_NONE_NONE_NONE_NONE_PRED, vsbcq_m_s, v4si) +VAR1 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vsbcq_m_u, v4si) VAR5 (STORE1, vst2q, v16qi, v8hi, v4si, v8hf, v4sf) VAR5 (LOAD1, vld4q, v16qi, v8hi, v4si, v8hf, v4sf) VAR5 (LOAD1, vld2q, v16qi, v8hi, v4si, v8hf, v4sf) diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index e291c67..908bedc 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -826,7 +826,7 @@ [ (set (match_operand:DI 0 "s_register_operand" "=r") (unspec:DI [(match_operand:V4SI 1 "s_register_operand" "w") - (match_operand:HI 2 "vpr_register_operand" "Up")] + (match_operand:V4BI 2 "vpr_register_operand" "Up")] VADDLVQ_P)) ] "TARGET_HAVE_MVE" @@ -3739,7 +3739,7 @@ (set (match_operand:DI 0 "s_register_operand" "=r") (unspec:DI [(match_operand:DI 1 "s_register_operand" "0") (match_operand:V4SI 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand:V4BI 3 "vpr_register_operand" "Up")] VADDLVAQ_P)) ] "TARGET_HAVE_MVE" @@ -3949,7 +3949,7 @@ (set (match_operand:V8HF 0 "s_register_operand" "=w") (unspec:V8HF [(match_operand:V8HF 1 "s_register_operand" "0") (match_operand:V4SF 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCVTBQ_M_F16_F32)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -3965,7 +3965,7 @@ (set (match_operand:V4SF 0 "s_register_operand" "=w") (unspec:V4SF [(match_operand:V4SF 1 "s_register_operand" "0") (match_operand:V8HF 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCVTBQ_M_F32_F16)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -3981,7 +3981,7 @@ (set (match_operand:V8HF 0 "s_register_operand" "=w") (unspec:V8HF [(match_operand:V8HF 1 "s_register_operand" "0") (match_operand:V4SF 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCVTTQ_M_F16_F32)) ] "TARGET_HAVE_MVE && 
TARGET_HAVE_MVE_FLOAT" @@ -3997,7 +3997,7 @@ (set (match_operand:V4SF 0 "s_register_operand" "=w") (unspec:V4SF [(match_operand:V4SF 1 "s_register_operand" "0") (match_operand:V8HF 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCVTTQ_M_F32_F16)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -4595,7 +4595,7 @@ (set (match_operand:V8HF 0 "s_register_operand" "=w") (unspec:V8HF [(match_operand:V8HF 1 "s_register_operand" "0") (match_operand:V8HF 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VREV32Q_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -4659,7 +4659,7 @@ (set (match_operand:DI 0 "s_register_operand" "=r") (unspec:DI [(match_operand:V4SI 1 "s_register_operand" "w") (match_operand:V4SI 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VRMLALDAVHXQ_P_S)) ] "TARGET_HAVE_MVE" @@ -4691,7 +4691,7 @@ (set (match_operand:DI 0 "s_register_operand" "=r") (unspec:DI [(match_operand:V4SI 1 "s_register_operand" "w") (match_operand:V4SI 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VRMLSLDAVHQ_P_S)) ] "TARGET_HAVE_MVE" @@ -4707,7 +4707,7 @@ (set (match_operand:DI 0 "s_register_operand" "=r") (unspec:DI [(match_operand:V4SI 1 "s_register_operand" "w") (match_operand:V4SI 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VRMLSLDAVHXQ_P_S)) ] "TARGET_HAVE_MVE" @@ -4932,7 +4932,7 @@ (set (match_operand:V16QI 0 "s_register_operand" "=w") (unspec:V16QI [(match_operand:V16QI 1 "s_register_operand" "0") (match_operand:V16QI 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand:V16BI 3 "vpr_register_operand" "Up")] VREV16Q_M)) ] "TARGET_HAVE_MVE" @@ -4964,7 +4964,7 @@ (set (match_operand:DI 0 "s_register_operand" "=r") (unspec:DI [(match_operand:V4SI 1 "s_register_operand" "w") (match_operand:V4SI 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand:V4BI 3 "vpr_register_operand" "Up")] VRMLALDAVHQ_P)) ] "TARGET_HAVE_MVE" @@ -6233,7 +6233,7 @@ (unspec:DI [(match_operand:DI 1 "s_register_operand" "0") (match_operand:V4SI 2 "s_register_operand" "w") (match_operand:V4SI 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VRMLALDAVHAQ_P_S)) ] "TARGET_HAVE_MVE" @@ -6556,7 +6556,7 @@ (unspec:DI [(match_operand:DI 1 "s_register_operand" "0") (match_operand:V4SI 2 "s_register_operand" "w") (match_operand:V4SI 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VRMLALDAVHAQ_P_U)) ] "TARGET_HAVE_MVE" @@ -6573,7 +6573,7 @@ (unspec:DI [(match_operand:DI 1 "s_register_operand" "0") (match_operand:V4SI 2 "s_register_operand" "w") (match_operand:V4SI 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VRMLALDAVHAXQ_P_S)) ] "TARGET_HAVE_MVE" @@ -6590,7 +6590,7 @@ (unspec:DI [(match_operand:DI 1 "s_register_operand" "0") (match_operand:V4SI 2 "s_register_operand" "w") (match_operand:V4SI 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 
"vpr_register_operand" "Up")] VRMLSLDAVHAQ_P_S)) ] "TARGET_HAVE_MVE" @@ -6607,7 +6607,7 @@ (unspec:DI [(match_operand:DI 1 "s_register_operand" "0") (match_operand:V4SI 2 "s_register_operand" "w") (match_operand:V4SI 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VRMLSLDAVHAXQ_P_S)) ] "TARGET_HAVE_MVE" @@ -7528,7 +7528,7 @@ (define_insn "mve_vldrhq_z_fv8hf" [(set (match_operand:V8HF 0 "s_register_operand" "=w") (unspec:V8HF [(match_operand:V8HI 1 "mve_memory_operand" "Ux") - (match_operand:HI 2 "vpr_register_operand" "Up")] + (match_operand: 2 "vpr_register_operand" "Up")] VLDRHQ_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -8303,7 +8303,7 @@ (define_insn "mve_vstrwq_p_fv4sf" [(set (match_operand:V4SI 0 "mve_memory_operand" "=Ux") (unspec:V4SI [(match_operand:V4SF 1 "s_register_operand" "w") - (match_operand:HI 2 "vpr_register_operand" "Up")] + (match_operand: 2 "vpr_register_operand" "Up")] VSTRWQ_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -9844,7 +9844,7 @@ (unspec:V4SI [(match_operand:V4SI 1 "s_register_operand" "0") (match_operand:V4SI 2 "s_register_operand" "w") (match_operand:V4SI 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand:V4BI 4 "vpr_register_operand" "Up")] VADCIQ_M)) (set (reg:SI VFPCC_REGNUM) (unspec:SI [(const_int 0)] @@ -9880,7 +9880,7 @@ (unspec:V4SI [(match_operand:V4SI 1 "s_register_operand" "0") (match_operand:V4SI 2 "s_register_operand" "w") (match_operand:V4SI 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand:V4BI 4 "vpr_register_operand" "Up")] VADCQ_M)) (set (reg:SI VFPCC_REGNUM) (unspec:SI [(reg:SI VFPCC_REGNUM)] @@ -9917,7 +9917,7 @@ (unspec:V4SI [(match_operand:V4SI 1 "s_register_operand" "w") (match_operand:V4SI 2 "s_register_operand" "w") (match_operand:V4SI 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand:V4BI 4 "vpr_register_operand" "Up")] VSBCIQ_M)) (set (reg:SI VFPCC_REGNUM) (unspec:SI [(const_int 0)] @@ -9953,7 +9953,7 @@ (unspec:V4SI [(match_operand:V4SI 1 "s_register_operand" "w") (match_operand:V4SI 2 "s_register_operand" "w") (match_operand:V4SI 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand:V4BI 4 "vpr_register_operand" "Up")] VSBCQ_M)) (set (reg:SI VFPCC_REGNUM) (unspec:SI [(reg:SI VFPCC_REGNUM)] @@ -10457,7 +10457,7 @@ (unspec:V16QI [(match_operand:SI 1 "const_int_coproc_operand" "i") (match_operand:V16QI 2 "register_operand" "0") (match_operand:SI 3 "const_int_mve_cde1_operand" "i") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand:V16BI 4 "vpr_register_operand" "Up")] CDE_VCX))] "TARGET_CDE && TARGET_HAVE_MVE" "vpst\;vcx1t\\tp%c1, %q0, #%c3" @@ -10471,7 +10471,7 @@ (match_operand:V16QI 2 "register_operand" "0") (match_operand:V16QI 3 "register_operand" "t") (match_operand:SI 4 "const_int_mve_cde2_operand" "i") - (match_operand:HI 5 "vpr_register_operand" "Up")] + (match_operand:V16BI 5 "vpr_register_operand" "Up")] CDE_VCX))] "TARGET_CDE && TARGET_HAVE_MVE" "vpst\;vcx2t\\tp%c1, %q0, %q3, #%c4" @@ -10486,7 +10486,7 @@ (match_operand:V16QI 3 "register_operand" "t") (match_operand:V16QI 4 "register_operand" "t") (match_operand:SI 5 "const_int_mve_cde3_operand" "i") - (match_operand:HI 6 "vpr_register_operand" "Up")] + (match_operand:V16BI 6 "vpr_register_operand" "Up")] CDE_VCX))] "TARGET_CDE && TARGET_HAVE_MVE" "vpst\;vcx3t\\tp%c1, %q0, %q3, 
%q4, #%c5" -- cgit v1.1 From e9f8443a9179c0e9e0d96dfa91c883d6ddb70d3b Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Wed, 13 Oct 2021 09:16:53 +0000 Subject: arm: Add VPR_REG to ALL_REGS VPR_REG should be part of ALL_REGS, this patch fixes this omission. Most of the work of this patch series was carried out while I was working at STMicroelectronics as a Linaro assignee. 2022-02-22 Christophe Lyon gcc/ * config/arm/arm.h (REG_CLASS_CONTENTS): Add VPR_REG to ALL_REGS. --- gcc/config/arm/arm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h index 61c0221..ef7b66f 100644 --- a/gcc/config/arm/arm.h +++ b/gcc/config/arm/arm.h @@ -1347,7 +1347,7 @@ enum reg_class { 0x00000000, 0x00000000, 0x00000000, 0x00000080 }, /* AFP_REG */ \ { 0x00000000, 0x00000000, 0x00000000, 0x00000400 }, /* VPR_REG. */ \ { 0x00005FFF, 0x00000000, 0x00000000, 0x00000400 }, /* GENERAL_AND_VPR_REGS. */ \ - { 0xFFFF7FFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0000000F } /* ALL_REGS. */ \ + { 0xFFFF7FFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0000040F } /* ALL_REGS. */ \ } #define FP_SYSREGS \ -- cgit v1.1 From 537c96588026aec09b9a00d6d0f3670f612428b5 Mon Sep 17 00:00:00 2001 From: Segher Boessenkool Date: Tue, 22 Feb 2022 15:49:09 +0000 Subject: rs6000: Fix GC on rs6000.c decls for atomic handling (PR88134) In PR88134 it is pointed out that we do not have GTY markup for some variables we use for atomic. So, let's add that. 2022-02-22 Segher Boessenkool PR target/88134 * config/rs6000/rs6000.cc (atomic_hold_decl, atomic_clear_decl, atomic_update_decl): Add GTY markup. --- gcc/config/rs6000/rs6000.cc | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index ca9e7b8..a855e8c 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -27699,14 +27699,13 @@ emit_fusion_gpr_load (rtx target, rtx mem) return ""; } - -#ifdef RS6000_GLIBC_ATOMIC_FENV -/* Function declarations for rs6000_atomic_assign_expand_fenv. */ -static tree atomic_hold_decl, atomic_clear_decl, atomic_update_decl; -#endif +/* This is not inside an #ifdef RS6000_GLIBC_ATOMIC_FENV because gengtype + ignores it then. */ +static GTY(()) tree atomic_hold_decl; +static GTY(()) tree atomic_clear_decl; +static GTY(()) tree atomic_update_decl; /* Implement TARGET_ATOMIC_ASSIGN_EXPAND_FENV hook. */ - static void rs6000_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update) { -- cgit v1.1 From 9d1796d82d46dd3086f07953129dc5761feb707b Mon Sep 17 00:00:00 2001 From: Roger Sayle Date: Tue, 22 Feb 2022 18:17:24 +0000 Subject: Restore bootstrap on x86_64-pc-linux-gnu This patch resolves the bootstrap failure on x86_64-pc-linux-gnu. 2022-02-22 Roger Sayle gcc/ChangeLog * config/i386/i386-expand.cc (ix86_expand_cmpxchg_loop): Restore bootstrap. 
--- gcc/config/i386/i386-expand.cc | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 7f7055b..faa0191 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -23287,11 +23287,11 @@ void ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val, switch (mode) { - case TImode: + case E_TImode: gendw = gen_atomic_compare_and_swapti_doubleword; hmode = DImode; break; - case DImode: + case E_DImode: if (doubleword) { gendw = gen_atomic_compare_and_swapdi_doubleword; @@ -23300,12 +23300,15 @@ void ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val, else gen = gen_atomic_compare_and_swapdi_1; break; - case SImode: - gen = gen_atomic_compare_and_swapsi_1; break; - case HImode: - gen = gen_atomic_compare_and_swaphi_1; break; - case QImode: - gen = gen_atomic_compare_and_swapqi_1; break; + case E_SImode: + gen = gen_atomic_compare_and_swapsi_1; + break; + case E_HImode: + gen = gen_atomic_compare_and_swaphi_1; + break; + case E_QImode: + gen = gen_atomic_compare_and_swapqi_1; + break; default: gcc_unreachable (); } -- cgit v1.1 From fd0ab7c734b04b91653467b94afd48ceca122083 Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Wed, 23 Feb 2022 06:44:12 +0000 Subject: arm: Fix typo in auto-vectorized MVE comparisons I made a last minute renaming of mve_const_bool_vec_to_hi () into mve_bool_vec_to_const () and forgot to update the call sites in vfp.md accordingly. Committed as obvious. 2022-02-23 Christophe Lyon gcc/ PR target/100757 PR target/101325 * config/arm/vfp.md (thumb2_movhi_vfp, thumb2_movhi_fp16): Fix typo. --- gcc/config/arm/vfp.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/vfp.md b/gcc/config/arm/vfp.md index f00d1ca..d0f423c 100644 --- a/gcc/config/arm/vfp.md +++ b/gcc/config/arm/vfp.md @@ -89,7 +89,7 @@ return "mov%?\t%0, %1\t%@ movhi"; case 1: if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_VECTOR_BOOL) - operands[1] = mve_const_bool_vec_to_hi (operands[1]); + operands[1] = mve_bool_vec_to_const (operands[1]); else operands[1] = gen_lowpart (HImode, operands[1]); return "mov%?\t%0, %1\t%@ movhi"; @@ -193,7 +193,7 @@ return "mov%?\t%0, %1\t%@ movhi"; case 1: if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_VECTOR_BOOL) - operands[1] = mve_const_bool_vec_to_hi (operands[1]); + operands[1] = mve_bool_vec_to_const (operands[1]); else operands[1] = gen_lowpart (HImode, operands[1]); return "mov%?\t%0, %1\t%@ movhi"; -- cgit v1.1 From 06770148711226ba243b964451dfa8816d5d23e5 Mon Sep 17 00:00:00 2001 From: Roger Sayle Date: Wed, 23 Feb 2022 07:24:50 +0000 Subject: nvptx: Back-end portion of a fix for PR target/104489. This one line fix/tweak is the back-end specific change for a fix for PR target/104489, that allows the ISA for GCC's nvptx backend to be bumped to sm_53. The machine-independent middle-end pieces were posted here: https://gcc.gnu.org/pipermail/gcc-patches/2022-February/590139.html 2022-02-23 Roger Sayle gcc/ChangeLog PR target/104489 * config/nvptx/nvptx.md (*movhf_insn): Add subregs_ok attribute. 
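(Illustrative only: the sketch below is not part of the patch and assumes _Float16 is usable on the nvptx target once the sm_53 pieces are in place; the function name is made up. It shows the kind of HFmode load/store that the *movhf_insn alternatives handle.)

/* Copy one 16-bit float; the load and store correspond to the ld.b16
   and st.b16 alternatives of *movhf_insn.  */
void
copy_hf (_Float16 *dst, const _Float16 *src)
{
  *dst = *src;
}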
--- gcc/config/nvptx/nvptx.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index f6dc817..216e89f 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -288,7 +288,8 @@ "@ %.\\tmov.b16\\t%0, %1; %.\\tld.b16\\t%0, %1; - %.\\tst.b16\\t%0, %1;") + %.\\tst.b16\\t%0, %1;" + [(set_attr "subregs_ok" "true")]) (define_expand "movhf" [(set (match_operand:HF 0 "nonimmediate_operand" "") -- cgit v1.1 From ffb2c67170768d5aa2d84a143405da658930e9b0 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Wed, 23 Feb 2022 14:32:29 +0800 Subject: Fix typo in v1ti3. For evex encoding vp{xor,or,and}, suffix is needed. Or there would be an error for vpxor %xmm0, %xmm31, %xmm1 Error: unsupported instruction `vpxor' gcc/ChangeLog: * config/i386/sse.md (v1ti3): Add suffix and replace isa attr of alternative 2 from avx to avx512vl. gcc/testsuite/ChangeLog: * gcc.target/i386/avx512vl-logicsuffix-1.c: New test. --- gcc/config/i386/sse.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index b2f5634..3066ea3 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -17025,8 +17025,8 @@ "@ p\t{%2, %0|%0, %2} vp\t{%2, %1, %0|%0, %1, %2} - vp\t{%2, %1, %0|%0, %1, %2}" - [(set_attr "isa" "noavx,avx,avx") + vpd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" "noavx,avx,avx512vl") (set_attr "prefix" "orig,vex,evex") (set_attr "prefix_data16" "1,*,*") (set_attr "type" "sselog") -- cgit v1.1 From 7862f6ccd85a001e4d70abb00bb95d8c7846ba80 Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Wed, 23 Feb 2022 09:33:33 +0100 Subject: [nvptx] Fix dummy location in gen_comment I committed "[nvptx] Add -mptx-comment", but tested it in combination with the proposed "[final] Handle compiler-generated asm insn" ( https://gcc.gnu.org/pipermail/gcc-patches/2022-February/590721.html ), so by itself the commit introduced some regressions: ... FAIL: gcc.dg/20020426-2.c (internal compiler error: Segmentation fault) FAIL: gcc.dg/analyzer/zlib-3.c (internal compiler error: Segmentation fault) FAIL: gcc.dg/pr101223.c (internal compiler error: Segmentation fault) FAIL: gcc.dg/torture/pr80764.c -O2 (internal compiler error: Segmentation fault) ... There are due to cfun->function_start_locus == 0. Fix these by using DECL_SOURCE_LOCATION (cfun->decl) instead. Tested on nvptx. gcc/ChangeLog: 2022-02-23 Tom de Vries * config/nvptx/nvptx.cc (gen_comment): Use DECL_SOURCE_LOCATION (cfun->decl) instead of cfun->function_start_locus. --- gcc/config/nvptx/nvptx.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index 858789e..6f6d592 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -5382,7 +5382,7 @@ gen_comment (const char *s) char *comment = (char *) alloca (len); snprintf (comment, len, "%s%s%s", ASM_COMMENT_START, sep, s); return gen_rtx_ASM_INPUT_loc (VOIDmode, ggc_strdup (comment), - cfun->function_start_locus); + DECL_SOURCE_LOCATION (cfun->decl)); } /* Initialize all declared regs at function entry. -- cgit v1.1 From c982d02ffe26fcd07280bf0f35f90df9be00716e Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Wed, 23 Feb 2022 09:39:53 +0100 Subject: [nvptx] Add shf.{l,r}.wrap insn Ptx contains funnel shift operations shf.l.wrap and shf.r.wrap that can be used to implement 32-bit left or right rotate. 
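(As an aside, not part of the patch: the portable C rotate idiom below is the kind of source GCC's rotate recognition turns into a 32-bit rotate, which the new patterns can then emit as a single shf.l.wrap.b32.)

/* 32-bit rotate-left written portably; the (-n) & 31 form avoids
   undefined behaviour when n is 0.  */
unsigned int
rotl32 (unsigned int x, unsigned int n)
{
  return (x << (n & 31)) | (x >> ((-n) & 31));
}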
Add define_insns rotlsi3 and rotrsi3. Tested on nvptx. gcc/ChangeLog: 2022-02-23 Tom de Vries * config/nvptx/nvptx.md (define_insn "rotlsi3", define_insn "rotrsi3"): New define_insn. gcc/testsuite/ChangeLog: 2022-02-23 Tom de Vries * gcc.target/nvptx/rotate-run.c: New test. * gcc.target/nvptx/rotate.c: New test. --- gcc/config/nvptx/nvptx.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index 216e89f..4989b56 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -808,6 +808,22 @@ "" "%.\\tshr.u%T0\\t%0, %1, %2;") +(define_insn "rotlsi3" + [(set (match_operand:SI 0 "nvptx_register_operand" "=R") + (rotate:SI (match_operand:SI 1 "nvptx_register_operand" "R") + (and:SI (match_operand:SI 2 "nvptx_nonmemory_operand" "Ri") + (const_int 31))))] + "TARGET_SM35" + "%.\\tshf.l.wrap.b32\\t%0, %1, %1, %2;") + +(define_insn "rotrsi3" + [(set (match_operand:SI 0 "nvptx_register_operand" "=R") + (rotatert:SI (match_operand:SI 1 "nvptx_register_operand" "R") + (and:SI (match_operand:SI 2 "nvptx_nonmemory_operand" "Ri") + (const_int 31))))] + "TARGET_SM35" + "%.\\tshf.r.wrap.b32\\t%0, %1, %1, %2;") + ;; Logical operations (define_code_iterator any_logic [and ior xor]) -- cgit v1.1 From a046033ea0ba97314265933bc48124574db2d62a Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Wed, 23 Feb 2022 15:58:59 +0100 Subject: [nvptx] Add missing t-omp-device isas In t-omp-device we list isas that can be used in omp declare variant like so: ... #pragma omp declare variant (f30) match (device={isa("sm_30")}) ... and in nvptx_omp_device_kind_arch_isa we handle them. Update both to reflect the current list of isas. Tested on x86_64-linux with nvptx accelerator. gcc/ChangeLog: 2022-02-23 Tom de Vries * config/nvptx/nvptx.cc (nvptx_omp_device_kind_arch_isa): Handle sm_70, sm_75 and sm_80. * config/nvptx/t-omp-device: Add sm_53, sm_70, sm_75 and sm_80. 
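(For illustration only, mirroring the sm_30 example above; the names f and f70 are hypothetical.) With the updated list, a variant can now be selected for the newer ISAs as well, e.g.:

void f70 (void);
#pragma omp declare variant (f70) match (device={isa("sm_70")})
void f (void);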
Co-Authored-By: Tobias Burnus --- gcc/config/nvptx/nvptx.cc | 8 +++++++- gcc/config/nvptx/t-omp-device | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index 6f6d592..b9451c2 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -6181,7 +6181,13 @@ nvptx_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait, if (strcmp (name, "sm_35") == 0) return TARGET_SM35 && !TARGET_SM53; if (strcmp (name, "sm_53") == 0) - return TARGET_SM53; + return TARGET_SM53 && !TARGET_SM70; + if (strcmp (name, "sm_70") == 0) + return TARGET_SM70 && !TARGET_SM75; + if (strcmp (name, "sm_75") == 0) + return TARGET_SM75 && !TARGET_SM80; + if (strcmp (name, "sm_80") == 0) + return TARGET_SM80; return 0; default: gcc_unreachable (); diff --git a/gcc/config/nvptx/t-omp-device b/gcc/config/nvptx/t-omp-device index 8765d9f..4228218 100644 --- a/gcc/config/nvptx/t-omp-device +++ b/gcc/config/nvptx/t-omp-device @@ -1,4 +1,4 @@ omp-device-properties-nvptx: $(srcdir)/config/nvptx/nvptx.cc echo kind: gpu > $@ echo arch: nvptx >> $@ - echo isa: sm_30 sm_35 >> $@ + echo isa: sm_30 sm_35 sm_53 sm_70 sm_75 sm_80 >> $@ -- cgit v1.1 From eabf7bbe601f2c0d87bd0a1012d7a602df2037da Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Fri, 25 Feb 2022 12:06:52 +0100 Subject: i386: Use a new temp slot kind for splitter to floatdi2_i387_with_xmm [PR104674] As mentioned in the PR, the following testcase is miscompiled for similar reasons as the already fixed PR78791 - we use SLOT_TEMP slots in various places during expansion and during expansion we can guarantee that the lifetime of those temporary slot doesn't overlap. But the following splitter uses SLOT_TEMP too and in between expansion and split1 there is a possibility that something extends the lifetime of SLOT_TEMP created slots across an instruction that will be split by this splitter. The following patch fixes it by using a new temp slot kind to make sure it doesn't reuse a SLOT_TEMP that could be live across the instruction. 2022-02-25 Jakub Jelinek PR target/104674 * config/i386/i386.h (enum ix86_stack_slot): Add SLOT_FLOATxFDI_387. * config/i386/i386.md (splitter to floatdi2_i387_with_xmm): Use SLOT_FLOATxFDI_387 rather than SLOT_TEMP. * gcc.target/i386/pr104674.c: New test. --- gcc/config/i386/i386.h | 1 + gcc/config/i386/i386.md | 5 ++--- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index f41e090..b37d4a9 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -2414,6 +2414,7 @@ enum ix86_stack_slot SLOT_CW_FLOOR, SLOT_CW_CEIL, SLOT_STV_TEMP, + SLOT_FLOATxFDI_387, MAX_386_STACK_LOCALS }; diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 8ffa641..e7c5490 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -5412,9 +5412,8 @@ && can_create_pseudo_p ()" [(const_int 0)] { - emit_insn (gen_floatdi2_i387_with_xmm - (operands[0], operands[1], - assign_386_stack_local (DImode, SLOT_TEMP))); + rtx s = assign_386_stack_local (DImode, SLOT_FLOATxFDI_387); + emit_insn (gen_floatdi2_i387_with_xmm (operands[0], operands[1], s)); DONE; }) -- cgit v1.1 From d54cdd1538deebed97fb9531dc3e1a42eaf0a80f Mon Sep 17 00:00:00 2001 From: Claudiu Zissulescu Date: Fri, 25 Feb 2022 13:39:22 +0200 Subject: arc: Fail conditional move expand patterns If the movcc comparison is not valid it triggers an assert in the current implementation. 
This behavior is not needed as we can FAIL the movcc expand pattern. gcc/ * config/arc/arc.cc (gen_compare_reg): Return NULL_RTX if the comparison is not valid. * config/arc/arc.md (movsicc): Fail if comparison is not valid. (movdicc): Likewise. (movsfcc): Likewise. (movdfcc): Likewise. Signed-off-by: Claudiu Zissulescu --- gcc/config/arc/arc.cc | 3 ++- gcc/config/arc/arc.md | 25 ++++++++++++++++++++----- 2 files changed, 22 insertions(+), 6 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arc/arc.cc b/gcc/config/arc/arc.cc index 8cc1735..c27ba99 100644 --- a/gcc/config/arc/arc.cc +++ b/gcc/config/arc/arc.cc @@ -2256,7 +2256,8 @@ gen_compare_reg (rtx comparison, machine_mode omode) cmode = GET_MODE (x); if (cmode == VOIDmode) cmode = GET_MODE (y); - gcc_assert (cmode == SImode || cmode == SFmode || cmode == DFmode); + if (cmode != SImode && cmode != SFmode && cmode != DFmode) + return NULL_RTX; if (cmode == SImode) { if (!register_operand (x, SImode)) diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md index ace3cb7..39b3580 100644 --- a/gcc/config/arc/arc.md +++ b/gcc/config/arc/arc.md @@ -1618,8 +1618,11 @@ core_3, archs4x, archs4xd, archs4xd_slow" (match_operand:SI 2 "nonmemory_operand" "") (match_operand:SI 3 "register_operand" "")))] "" - "operands[1] = gen_compare_reg (operands[1], VOIDmode);") - + " + operands[1] = gen_compare_reg (operands[1], VOIDmode); + if (operands[1] == NULL_RTX) + FAIL; + ") (define_expand "movdicc" [(set (match_operand:DI 0 "dest_reg_operand" "") @@ -1627,7 +1630,11 @@ core_3, archs4x, archs4xd, archs4xd_slow" (match_operand:DI 2 "nonmemory_operand" "") (match_operand:DI 3 "register_operand" "")))] "" - "operands[1] = gen_compare_reg (operands[1], VOIDmode);") + " + operands[1] = gen_compare_reg (operands[1], VOIDmode); + if (operands[1] == NULL_RTX) + FAIL; + ") (define_expand "movsfcc" @@ -1636,7 +1643,11 @@ core_3, archs4x, archs4xd, archs4xd_slow" (match_operand:SF 2 "nonmemory_operand" "") (match_operand:SF 3 "register_operand" "")))] "" - "operands[1] = gen_compare_reg (operands[1], VOIDmode);") + " + operands[1] = gen_compare_reg (operands[1], VOIDmode); + if (operands[1] == NULL_RTX) + FAIL; + ") (define_expand "movdfcc" [(set (match_operand:DF 0 "dest_reg_operand" "") @@ -1644,7 +1655,11 @@ core_3, archs4x, archs4xd, archs4xd_slow" (match_operand:DF 2 "nonmemory_operand" "") (match_operand:DF 3 "register_operand" "")))] "" - "operands[1] = gen_compare_reg (operands[1], VOIDmode);") + " + operands[1] = gen_compare_reg (operands[1], VOIDmode); + if (operands[1] == NULL_RTX) + FAIL; + ") (define_insn "*movsicc_insn" [(set (match_operand:SI 0 "dest_reg_operand" "=w,w") -- cgit v1.1 From 3885a122f817a1b6dca4a84ba9e020d5ab2060af Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Fri, 25 Feb 2022 18:58:48 +0100 Subject: rs6000: Use rs6000_emit_move in movmisalign expander [PR104681] The following testcase ICEs, because for some strange reason it decides to use movmisaligntf during expansion where the destination is MEM and source is CONST_DOUBLE. For normal mov expanders the rs6000 backend uses rs6000_emit_move to ensure that if one operand is a MEM, the other is a REG and a few other things, but for movmisalign nothing enforced this. The middle-end documents that movmisalign shouldn't fail, so we can't force that through predicates or condition on the expander. 2022-02-25 Jakub Jelinek PR target/104681 * config/rs6000/vector.md (movmisalign): Use rs6000_emit_move. * g++.dg/opt/pr104681.C: New test. 
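(A hedged illustration of the problematic shape only; this is not the PR's testcase, the struct and names are made up, and whether expansion really takes the misaligned-move path depends on the target's alignment assumptions.)

/* Force an under-aligned long double (TFmode) access so the store has
   a MEM destination and a CONST_DOUBLE source.  */
struct __attribute__ ((packed)) wrapper { char c; long double x; };

void
set_one (struct wrapper *w)
{
  w->x = 1.0L;
}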
--- gcc/config/rs6000/vector.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/vector.md b/gcc/config/rs6000/vector.md index b87a742..4d0797c 100644 --- a/gcc/config/rs6000/vector.md +++ b/gcc/config/rs6000/vector.md @@ -1519,7 +1519,10 @@ [(set (match_operand:VEC_N 0 "nonimmediate_operand") (match_operand:VEC_N 1 "any_operand"))] "VECTOR_MEM_VSX_P (mode) && TARGET_ALLOW_MOVMISALIGN" - "") +{ + rs6000_emit_move (operands[0], operands[1], mode); + DONE; +}) ;; Vector shift right in bits. Currently supported ony for shift ;; amounts that can be expressed as byte shifts (divisible by 8). -- cgit v1.1 From 50d9ca7104d40f0a331d0dd01e3c069ecf7f6c97 Mon Sep 17 00:00:00 2001 From: Hongyu Wang Date: Fri, 25 Feb 2022 15:09:03 +0800 Subject: AVX512F: Add helper enumeration for ternary logic intrinsics. Sync with llvm change in https://reviews.llvm.org/D120307 to add enumeration and truncate imm to unsigned char, so users could use ~ on immediates. gcc/ChangeLog: * config/i386/avx512fintrin.h (_MM_TERNLOG_ENUM): New enum. (_mm512_ternarylogic_epi64): Truncate imm to unsigned char to avoid error when using ~enum as parameter. (_mm512_mask_ternarylogic_epi64): Likewise. (_mm512_maskz_ternarylogic_epi64): Likewise. (_mm512_ternarylogic_epi32): Likewise. (_mm512_mask_ternarylogic_epi32): Likewise. (_mm512_maskz_ternarylogic_epi32): Likewise. * config/i386/avx512vlintrin.h (_mm256_ternarylogic_epi64): Adjust imm param type to unsigned char. (_mm256_mask_ternarylogic_epi64): Likewise. (_mm256_maskz_ternarylogic_epi64): Likewise. (_mm256_ternarylogic_epi32): Likewise. (_mm256_mask_ternarylogic_epi32): Likewise. (_mm256_maskz_ternarylogic_epi32): Likewise. (_mm_ternarylogic_epi64): Likewise. (_mm_mask_ternarylogic_epi64): Likewise. (_mm_maskz_ternarylogic_epi64): Likewise. (_mm_ternarylogic_epi32): Likewise. (_mm_mask_ternarylogic_epi32): Likewise. (_mm_maskz_ternarylogic_epi32): Likewise. gcc/testsuite/ChangeLog: * gcc.target/i386/avx512f-vpternlogd-1.c: Use new enum. * gcc.target/i386/avx512f-vpternlogq-1.c: Likewise. * gcc.target/i386/avx512vl-vpternlogd-1.c: Likewise. * gcc.target/i386/avx512vl-vpternlogq-1.c: Likewise. * gcc.target/i386/testimm-10.c: Remove imm check for vpternlog insns since the imm has been truncated in intrinsic. --- gcc/config/i386/avx512fintrin.h | 132 ++++++++++++------- gcc/config/i386/avx512vlintrin.h | 278 ++++++++++++++++++++++++--------------- 2 files changed, 262 insertions(+), 148 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/avx512fintrin.h b/gcc/config/i386/avx512fintrin.h index bc10c82..29511fd 100644 --- a/gcc/config/i386/avx512fintrin.h +++ b/gcc/config/i386/avx512fintrin.h @@ -1639,16 +1639,27 @@ _mm_maskz_sub_round_ss (__mmask8 __U, __m128 __A, __m128 __B, #endif +/* Constant helper to represent the ternary logic operations among + vector A, B and C. 
*/ +typedef enum +{ + _MM_TERNLOG_A = 0xF0, + _MM_TERNLOG_B = 0xCC, + _MM_TERNLOG_C = 0xAA +} _MM_TERNLOG_ENUM; + #ifdef __OPTIMIZE__ extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_ternarylogic_epi64 (__m512i __A, __m512i __B, __m512i __C, const int __imm) { - return (__m512i) __builtin_ia32_pternlogq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) __C, __imm, - (__mmask8) -1); + return (__m512i) + __builtin_ia32_pternlogq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __C, + (unsigned char) __imm, + (__mmask8) -1); } extern __inline __m512i @@ -1656,10 +1667,12 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_ternarylogic_epi64 (__m512i __A, __mmask8 __U, __m512i __B, __m512i __C, const int __imm) { - return (__m512i) __builtin_ia32_pternlogq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) __C, __imm, - (__mmask8) __U); + return (__m512i) + __builtin_ia32_pternlogq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __C, + (unsigned char) __imm, + (__mmask8) __U); } extern __inline __m512i @@ -1667,10 +1680,12 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_maskz_ternarylogic_epi64 (__mmask8 __U, __m512i __A, __m512i __B, __m512i __C, const int __imm) { - return (__m512i) __builtin_ia32_pternlogq512_maskz ((__v8di) __A, - (__v8di) __B, - (__v8di) __C, - __imm, (__mmask8) __U); + return (__m512i) + __builtin_ia32_pternlogq512_maskz ((__v8di) __A, + (__v8di) __B, + (__v8di) __C, + (unsigned char) __imm, + (__mmask8) __U); } extern __inline __m512i @@ -1678,10 +1693,12 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_ternarylogic_epi32 (__m512i __A, __m512i __B, __m512i __C, const int __imm) { - return (__m512i) __builtin_ia32_pternlogd512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) __C, - __imm, (__mmask16) -1); + return (__m512i) + __builtin_ia32_pternlogd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __C, + (unsigned char) __imm, + (__mmask16) -1); } extern __inline __m512i @@ -1689,10 +1706,12 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_ternarylogic_epi32 (__m512i __A, __mmask16 __U, __m512i __B, __m512i __C, const int __imm) { - return (__m512i) __builtin_ia32_pternlogd512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) __C, - __imm, (__mmask16) __U); + return (__m512i) + __builtin_ia32_pternlogd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __C, + (unsigned char) __imm, + (__mmask16) __U); } extern __inline __m512i @@ -1700,33 +1719,56 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_maskz_ternarylogic_epi32 (__mmask16 __U, __m512i __A, __m512i __B, __m512i __C, const int __imm) { - return (__m512i) __builtin_ia32_pternlogd512_maskz ((__v16si) __A, - (__v16si) __B, - (__v16si) __C, - __imm, (__mmask16) __U); + return (__m512i) + __builtin_ia32_pternlogd512_maskz ((__v16si) __A, + (__v16si) __B, + (__v16si) __C, + (unsigned char) __imm, + (__mmask16) __U); } #else -#define _mm512_ternarylogic_epi64(A, B, C, I) \ - ((__m512i) __builtin_ia32_pternlogq512_mask ((__v8di)(__m512i)(A), \ - (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), (int)(I), (__mmask8)-1)) -#define _mm512_mask_ternarylogic_epi64(A, U, B, C, I) \ - ((__m512i) __builtin_ia32_pternlogq512_mask ((__v8di)(__m512i)(A), \ - (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), (int)(I), (__mmask8)(U))) -#define _mm512_maskz_ternarylogic_epi64(U, A, B, C, I) \ - ((__m512i) __builtin_ia32_pternlogq512_maskz 
((__v8di)(__m512i)(A), \ - (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), (int)(I), (__mmask8)(U))) -#define _mm512_ternarylogic_epi32(A, B, C, I) \ - ((__m512i) __builtin_ia32_pternlogd512_mask ((__v16si)(__m512i)(A), \ - (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), (int)(I), \ - (__mmask16)-1)) -#define _mm512_mask_ternarylogic_epi32(A, U, B, C, I) \ - ((__m512i) __builtin_ia32_pternlogd512_mask ((__v16si)(__m512i)(A), \ - (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), (int)(I), \ - (__mmask16)(U))) -#define _mm512_maskz_ternarylogic_epi32(U, A, B, C, I) \ - ((__m512i) __builtin_ia32_pternlogd512_maskz ((__v16si)(__m512i)(A), \ - (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), (int)(I), \ - (__mmask16)(U))) +#define _mm512_ternarylogic_epi64(A, B, C, I) \ + ((__m512i) \ + __builtin_ia32_pternlogq512_mask ((__v8di) (__m512i) (A), \ + (__v8di) (__m512i) (B), \ + (__v8di) (__m512i) (C), \ + (unsigned char) (I), \ + (__mmask8) -1)) +#define _mm512_mask_ternarylogic_epi64(A, U, B, C, I) \ + ((__m512i) \ + __builtin_ia32_pternlogq512_mask ((__v8di) (__m512i) (A), \ + (__v8di) (__m512i) (B), \ + (__v8di) (__m512i) (C), \ + (unsigned char)(I), \ + (__mmask8) (U))) +#define _mm512_maskz_ternarylogic_epi64(U, A, B, C, I) \ + ((__m512i) \ + __builtin_ia32_pternlogq512_maskz ((__v8di) (__m512i) (A), \ + (__v8di) (__m512i) (B), \ + (__v8di) (__m512i) (C), \ + (unsigned char) (I), \ + (__mmask8) (U))) +#define _mm512_ternarylogic_epi32(A, B, C, I) \ + ((__m512i) \ + __builtin_ia32_pternlogd512_mask ((__v16si) (__m512i) (A), \ + (__v16si) (__m512i) (B), \ + (__v16si) (__m512i) (C), \ + (unsigned char) (I), \ + (__mmask16) -1)) +#define _mm512_mask_ternarylogic_epi32(A, U, B, C, I) \ + ((__m512i) \ + __builtin_ia32_pternlogd512_mask ((__v16si) (__m512i) (A), \ + (__v16si) (__m512i) (B), \ + (__v16si) (__m512i) (C), \ + (unsigned char) (I), \ + (__mmask16) (U))) +#define _mm512_maskz_ternarylogic_epi32(U, A, B, C, I) \ + ((__m512i) \ + __builtin_ia32_pternlogd512_maskz ((__v16si) (__m512i) (A), \ + (__v16si) (__m512i) (B), \ + (__v16si) (__m512i) (C), \ + (unsigned char) (I), \ + (__mmask16) (U))) #endif extern __inline __m512d diff --git a/gcc/config/i386/avx512vlintrin.h b/gcc/config/i386/avx512vlintrin.h index bbced24..26b286e 100644 --- a/gcc/config/i386/avx512vlintrin.h +++ b/gcc/config/i386/avx512vlintrin.h @@ -10575,10 +10575,12 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_ternarylogic_epi64 (__m256i __A, __m256i __B, __m256i __C, const int __imm) { - return (__m256i) __builtin_ia32_pternlogq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) __C, __imm, - (__mmask8) -1); + return (__m256i) + __builtin_ia32_pternlogq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __C, + (unsigned char) __imm, + (__mmask8) -1); } extern __inline __m256i @@ -10587,10 +10589,12 @@ _mm256_mask_ternarylogic_epi64 (__m256i __A, __mmask8 __U, __m256i __B, __m256i __C, const int __imm) { - return (__m256i) __builtin_ia32_pternlogq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) __C, __imm, - (__mmask8) __U); + return (__m256i) + __builtin_ia32_pternlogq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __C, + (unsigned char) __imm, + (__mmask8) __U); } extern __inline __m256i @@ -10599,11 +10603,12 @@ _mm256_maskz_ternarylogic_epi64 (__mmask8 __U, __m256i __A, __m256i __B, __m256i __C, const int __imm) { - return (__m256i) __builtin_ia32_pternlogq256_maskz ((__v4di) __A, - (__v4di) __B, - (__v4di) __C, - __imm, - (__mmask8) __U); + return (__m256i) + __builtin_ia32_pternlogq256_maskz 
((__v4di) __A, + (__v4di) __B, + (__v4di) __C, + (unsigned char) __imm, + (__mmask8) __U); } extern __inline __m256i @@ -10611,10 +10616,12 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_ternarylogic_epi32 (__m256i __A, __m256i __B, __m256i __C, const int __imm) { - return (__m256i) __builtin_ia32_pternlogd256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) __C, __imm, - (__mmask8) -1); + return (__m256i) + __builtin_ia32_pternlogd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __C, + (unsigned char) __imm, + (__mmask8) -1); } extern __inline __m256i @@ -10623,10 +10630,12 @@ _mm256_mask_ternarylogic_epi32 (__m256i __A, __mmask8 __U, __m256i __B, __m256i __C, const int __imm) { - return (__m256i) __builtin_ia32_pternlogd256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) __C, __imm, - (__mmask8) __U); + return (__m256i) + __builtin_ia32_pternlogd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __C, + (unsigned char) __imm, + (__mmask8) __U); } extern __inline __m256i @@ -10635,11 +10644,12 @@ _mm256_maskz_ternarylogic_epi32 (__mmask8 __U, __m256i __A, __m256i __B, __m256i __C, const int __imm) { - return (__m256i) __builtin_ia32_pternlogd256_maskz ((__v8si) __A, - (__v8si) __B, - (__v8si) __C, - __imm, - (__mmask8) __U); + return (__m256i) + __builtin_ia32_pternlogd256_maskz ((__v8si) __A, + (__v8si) __B, + (__v8si) __C, + (unsigned char) __imm, + (__mmask8) __U); } extern __inline __m128i @@ -10647,33 +10657,40 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_ternarylogic_epi64 (__m128i __A, __m128i __B, __m128i __C, const int __imm) { - return (__m128i) __builtin_ia32_pternlogq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) __C, __imm, - (__mmask8) -1); + return (__m128i) + __builtin_ia32_pternlogq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __C, + (unsigned char) __imm, + (__mmask8) -1); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_ternarylogic_epi64 (__m128i __A, __mmask8 __U, - __m128i __B, __m128i __C, const int __imm) + __m128i __B, __m128i __C, + const int __imm) { - return (__m128i) __builtin_ia32_pternlogq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) __C, __imm, - (__mmask8) __U); + return (__m128i) + __builtin_ia32_pternlogq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __C, + (unsigned char) __imm, + (__mmask8) __U); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_ternarylogic_epi64 (__mmask8 __U, __m128i __A, - __m128i __B, __m128i __C, const int __imm) + __m128i __B, __m128i __C, + const int __imm) { - return (__m128i) __builtin_ia32_pternlogq128_maskz ((__v2di) __A, - (__v2di) __B, - (__v2di) __C, - __imm, - (__mmask8) __U); + return (__m128i) + __builtin_ia32_pternlogq128_maskz ((__v2di) __A, + (__v2di) __B, + (__v2di) __C, + (unsigned char) __imm, + (__mmask8) __U); } extern __inline __m128i @@ -10681,33 +10698,40 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_ternarylogic_epi32 (__m128i __A, __m128i __B, __m128i __C, const int __imm) { - return (__m128i) __builtin_ia32_pternlogd128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) __C, __imm, - (__mmask8) -1); + return (__m128i) + __builtin_ia32_pternlogd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __C, + (unsigned char) __imm, + (__mmask8) -1); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_ternarylogic_epi32 (__m128i __A, __mmask8 __U, - __m128i 
__B, __m128i __C, const int __imm) + __m128i __B, __m128i __C, + const int __imm) { - return (__m128i) __builtin_ia32_pternlogd128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) __C, __imm, - (__mmask8) __U); + return (__m128i) + __builtin_ia32_pternlogd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __C, + (unsigned char) __imm, + (__mmask8) __U); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_ternarylogic_epi32 (__mmask8 __U, __m128i __A, - __m128i __B, __m128i __C, const int __imm) + __m128i __B, __m128i __C, + const int __imm) { - return (__m128i) __builtin_ia32_pternlogd128_maskz ((__v4si) __A, - (__v4si) __B, - (__v4si) __C, - __imm, - (__mmask8) __U); + return (__m128i) + __builtin_ia32_pternlogd128_maskz ((__v4si) __A, + (__v4si) __B, + (__v4si) __C, + (unsigned char) __imm, + (__mmask8) __U); } extern __inline __m256 @@ -12910,53 +12934,101 @@ _mm256_permutex_pd (__m256d __X, const int __M) (__v2di)(__m128i)_mm_setzero_si128 (),\ (__mmask8)(U))) -#define _mm256_ternarylogic_epi64(A, B, C, I) \ - ((__m256i) __builtin_ia32_pternlogq256_mask ((__v4di)(__m256i)(A), \ - (__v4di)(__m256i)(B), (__v4di)(__m256i)(C), (int)(I), (__mmask8)-1)) - -#define _mm256_mask_ternarylogic_epi64(A, U, B, C, I) \ - ((__m256i) __builtin_ia32_pternlogq256_mask ((__v4di)(__m256i)(A), \ - (__v4di)(__m256i)(B), (__v4di)(__m256i)(C), (int)(I), (__mmask8)(U))) - -#define _mm256_maskz_ternarylogic_epi64(U, A, B, C, I) \ - ((__m256i) __builtin_ia32_pternlogq256_maskz ((__v4di)(__m256i)(A), \ - (__v4di)(__m256i)(B), (__v4di)(__m256i)(C), (int)(I), (__mmask8)(U))) - -#define _mm256_ternarylogic_epi32(A, B, C, I) \ - ((__m256i) __builtin_ia32_pternlogd256_mask ((__v8si)(__m256i)(A), \ - (__v8si)(__m256i)(B), (__v8si)(__m256i)(C), (int)(I), (__mmask8)-1)) - -#define _mm256_mask_ternarylogic_epi32(A, U, B, C, I) \ - ((__m256i) __builtin_ia32_pternlogd256_mask ((__v8si)(__m256i)(A), \ - (__v8si)(__m256i)(B), (__v8si)(__m256i)(C), (int)(I), (__mmask8)(U))) - -#define _mm256_maskz_ternarylogic_epi32(U, A, B, C, I) \ - ((__m256i) __builtin_ia32_pternlogd256_maskz ((__v8si)(__m256i)(A), \ - (__v8si)(__m256i)(B), (__v8si)(__m256i)(C), (int)(I), (__mmask8)(U))) - -#define _mm_ternarylogic_epi64(A, B, C, I) \ - ((__m128i) __builtin_ia32_pternlogq128_mask ((__v2di)(__m128i)(A), \ - (__v2di)(__m128i)(B), (__v2di)(__m128i)(C), (int)(I), (__mmask8)-1)) - -#define _mm_mask_ternarylogic_epi64(A, U, B, C, I) \ - ((__m128i) __builtin_ia32_pternlogq128_mask ((__v2di)(__m128i)(A), \ - (__v2di)(__m128i)(B), (__v2di)(__m128i)(C), (int)(I), (__mmask8)(U))) - -#define _mm_maskz_ternarylogic_epi64(U, A, B, C, I) \ - ((__m128i) __builtin_ia32_pternlogq128_maskz ((__v2di)(__m128i)(A), \ - (__v2di)(__m128i)(B), (__v2di)(__m128i)(C), (int)(I), (__mmask8)(U))) - -#define _mm_ternarylogic_epi32(A, B, C, I) \ - ((__m128i) __builtin_ia32_pternlogd128_mask ((__v4si)(__m128i)(A), \ - (__v4si)(__m128i)(B), (__v4si)(__m128i)(C), (int)(I), (__mmask8)-1)) - -#define _mm_mask_ternarylogic_epi32(A, U, B, C, I) \ - ((__m128i) __builtin_ia32_pternlogd128_mask ((__v4si)(__m128i)(A), \ - (__v4si)(__m128i)(B), (__v4si)(__m128i)(C), (int)(I), (__mmask8)(U))) - -#define _mm_maskz_ternarylogic_epi32(U, A, B, C, I) \ - ((__m128i) __builtin_ia32_pternlogd128_maskz ((__v4si)(__m128i)(A), \ - (__v4si)(__m128i)(B), (__v4si)(__m128i)(C), (int)(I), (__mmask8)(U))) +#define _mm256_ternarylogic_epi64(A, B, C, I) \ + ((__m256i) \ + __builtin_ia32_pternlogq256_mask ((__v4di) (__m256i) (A), \ + (__v4di) (__m256i) 
(B), \ + (__v4di) (__m256i) (C), \ + (unsigned char) (I), \ + (__mmask8) -1)) + +#define _mm256_mask_ternarylogic_epi64(A, U, B, C, I) \ + ((__m256i) \ + __builtin_ia32_pternlogq256_mask ((__v4di) (__m256i) (A), \ + (__v4di) (__m256i) (B), \ + (__v4di) (__m256i) (C), \ + (unsigned char) (I), \ + (__mmask8) (U))) + +#define _mm256_maskz_ternarylogic_epi64(U, A, B, C, I) \ + ((__m256i) \ + __builtin_ia32_pternlogq256_maskz ((__v4di) (__m256i) (A), \ + (__v4di) (__m256i) (B), \ + (__v4di) (__m256i) (C), \ + (unsigned char) (I), \ + (__mmask8) (U))) + +#define _mm256_ternarylogic_epi32(A, B, C, I) \ + ((__m256i) \ + __builtin_ia32_pternlogd256_mask ((__v8si) (__m256i) (A), \ + (__v8si) (__m256i) (B), \ + (__v8si) (__m256i) (C), \ + (unsigned char) (I), \ + (__mmask8) -1)) + +#define _mm256_mask_ternarylogic_epi32(A, U, B, C, I) \ + ((__m256i) \ + __builtin_ia32_pternlogd256_mask ((__v8si) (__m256i) (A), \ + (__v8si) (__m256i) (B), \ + (__v8si) (__m256i) (C), \ + (unsigned char) (I), \ + (__mmask8) (U))) + +#define _mm256_maskz_ternarylogic_epi32(U, A, B, C, I) \ + ((__m256i) \ + __builtin_ia32_pternlogd256_maskz ((__v8si) (__m256i) (A), \ + (__v8si) (__m256i) (B), \ + (__v8si) (__m256i) (C), \ + (unsigned char) (I), \ + (__mmask8) (U))) + +#define _mm_ternarylogic_epi64(A, B, C, I) \ + ((__m128i) \ + __builtin_ia32_pternlogq128_mask ((__v2di) (__m128i) (A), \ + (__v2di) (__m128i) (B), \ + (__v2di) (__m128i) (C), \ + (unsigned char) (I), \ + (__mmask8) -1)) + +#define _mm_mask_ternarylogic_epi64(A, U, B, C, I) \ + ((__m128i) \ + __builtin_ia32_pternlogq128_mask ((__v2di) (__m128i) (A), \ + (__v2di) (__m128i) (B), \ + (__v2di) (__m128i) (C), \ + (unsigned char) (I), \ + (__mmask8) (U))) + +#define _mm_maskz_ternarylogic_epi64(U, A, B, C, I) \ + ((__m128i) \ + __builtin_ia32_pternlogq128_maskz ((__v2di) (__m128i) (A), \ + (__v2di) (__m128i) (B), \ + (__v2di) (__m128i) (C), \ + (unsigned char) (I), \ + (__mmask8) (U))) + +#define _mm_ternarylogic_epi32(A, B, C, I) \ + ((__m128i) \ + __builtin_ia32_pternlogd128_mask ((__v4si) (__m128i) (A), \ + (__v4si) (__m128i) (B), \ + (__v4si) (__m128i) (C), \ + (unsigned char) (I), \ + (__mmask8) -1)) + +#define _mm_mask_ternarylogic_epi32(A, U, B, C, I) \ + ((__m128i) \ + __builtin_ia32_pternlogd128_mask ((__v4si) (__m128i) (A), \ + (__v4si) (__m128i) (B), \ + (__v4si) (__m128i) (C), \ + (unsigned char) (I), \ + (__mmask8) (U))) + +#define _mm_maskz_ternarylogic_epi32(U, A, B, C, I) \ + ((__m128i) \ + __builtin_ia32_pternlogd128_maskz ((__v4si) (__m128i) (A), \ + (__v4si) (__m128i) (B), \ + (__v4si) (__m128i) (C), \ + (unsigned char) (I), \ + (__mmask8) (U))) #define _mm256_roundscale_ps(A, B) \ ((__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf)(__m256)(A), \ -- cgit v1.1 From 9d87ad0ca5cd18807546a081e7d539be8b5418bf Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Fri, 25 Feb 2022 16:11:23 +0100 Subject: [nvptx] Add -mptx=_ Add an -mptx=_ value, that indicates the default ptx version. It can be used to undo an explicit -mptx setting, so this: ... $ gcc test.c -mptx=3.1 -mptx=_ ... has the same effect as: ... $ gcc test.c ... Tested on nvptx. gcc/ChangeLog: 2022-02-28 Tom de Vries * config/nvptx/nvptx-opts.h (enum ptx_version): Add PTX_VERSION_default. * config/nvptx/nvptx.cc (handle_ptx_version_option): Handle PTX_VERSION_default. * config/nvptx/nvptx.opt: Add EnumValue "_" / PTX_VERSION_default. 
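[Editor's sketch] Before the diff below, the gist of the new option handling can be summarized in a few lines of C. This is only an illustration written from the description above, not the GCC sources; the enumerator names follow the patch, and the default chosen here is assumed purely for the sketch.

  enum ptx_version { PTX_VERSION_default, PTX_VERSION_3_1, PTX_VERSION_6_3 };

  enum ptx_version
  resolve_ptx_version (int option_set_p, enum ptx_version opt)
  {
    /* -mptx=_ maps to PTX_VERSION_default and is treated exactly like
       "no -mptx given": fall back to the compiler's default choice.  */
    if (!option_set_p || opt == PTX_VERSION_default)
      return PTX_VERSION_6_3;   /* assumed default for this sketch only */
    return opt;
  }

In other words, a later -mptx=_ on the command line simply undoes any earlier explicit -mptx= setting.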
--- gcc/config/nvptx/nvptx-opts.h | 1 + gcc/config/nvptx/nvptx.cc | 3 ++- gcc/config/nvptx/nvptx.opt | 3 +++ 3 files changed, 6 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx-opts.h b/gcc/config/nvptx/nvptx-opts.h index e918d43..30852b6 100644 --- a/gcc/config/nvptx/nvptx-opts.h +++ b/gcc/config/nvptx/nvptx-opts.h @@ -32,6 +32,7 @@ enum ptx_isa enum ptx_version { + PTX_VERSION_default, PTX_VERSION_3_0, PTX_VERSION_3_1, PTX_VERSION_4_2, diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index b9451c2..7862a90 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -296,7 +296,8 @@ sm_version_to_string (enum ptx_isa sm) static void handle_ptx_version_option (void) { - if (!OPTION_SET_P (ptx_version_option)) + if (!OPTION_SET_P (ptx_version_option) + || ptx_version_option == PTX_VERSION_default) { ptx_version_option = default_ptx_version_option (); return; diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt index 9776c3b..f555ad1 100644 --- a/gcc/config/nvptx/nvptx.opt +++ b/gcc/config/nvptx/nvptx.opt @@ -94,6 +94,9 @@ Enum(ptx_version) String(6.3) Value(PTX_VERSION_6_3) EnumValue Enum(ptx_version) String(7.0) Value(PTX_VERSION_7_0) +EnumValue +Enum(ptx_version) String(_) Value(PTX_VERSION_default) + mptx= Target RejectNegative ToLower Joined Enum(ptx_version) Var(ptx_version_option) Specify the version of the ptx version to use. -- cgit v1.1 From 28068d1115648adcc08ae57372170f3277915a0d Mon Sep 17 00:00:00 2001 From: Roger Sayle Date: Mon, 28 Feb 2022 22:30:27 +0000 Subject: PR tree-optimization/91384: peephole2 to eliminate testl after negl. This patch is my proposed solution to PR tree-optimization/91384 which is a missed-optimization/code quality regression on x86_64. The problematic idiom is "if (r = -a)" which is equivalent to both "r = -a; if (r != 0)" and alternatively "r = -a; if (a != 0)". In this particular case, on x86_64, we prefer to use the condition codes from the negation, rather than require an explicit testl instruction. Unfortunately, combine can't help, as it doesn't attempt to merge pairs of instructions that share the same operand(s), only pairs/triples of instructions where the result of each instruction feeds the next. But I doubt there's sufficient benefit to attempt this kind of "combination" (that wouldn't already be caught by the tree-ssa passes). Fortunately, it's relatively easy to fix this up (addressing the regression) during peephole2 to eliminate the unnecessary testl in: movl %edi, %ebx negl %ebx testl %edi, %edi je .L2 2022-02-28 Roger Sayle gcc/ChangeLog PR tree-optimization/91384 * config/i386/i386.md (peephole2): Eliminate final testl insn from the sequence *movsi_internal, *negsi_1, *cmpsi_ccno_1 by transforming using *negsi_2 for the negation. gcc/testsuite/ChangeLog PR tree-optimization/91384 * gcc.target/i386/pr91384.c: New test case. 
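[Editor's sketch] For reference, a minimal form of the idiom being optimized looks like the function below. It is written from the description above and need not match the committed gcc.target/i386/pr91384.c testcase.

  int sink (int);

  int
  foo (int a)
  {
    int r;
    if ((r = -a))      /* same as: r = -a; if (a != 0) ...  */
      return sink (r);
    return 0;
  }

With -O2 on x86_64 the expectation after this peephole2 is that the condition codes set by the negation are reused, so no separate testl instruction appears.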
--- gcc/config/i386/i386.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index e7c5490..5e0a980 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -11011,6 +11011,19 @@ [(set_attr "type" "negnot") (set_attr "mode" "")]) +;; Optimize *negsi_1 followed by *cmpsi_ccno_1 (PR target/91384) +(define_peephole2 + [(set (match_operand:SWI 0 "general_reg_operand") + (match_operand:SWI 1 "general_reg_operand")) + (parallel [(set (match_dup 0) (neg:SWI (match_dup 0))) + (clobber (reg:CC FLAGS_REG))]) + (set (reg:CCZ FLAGS_REG) (compare:CCZ (match_dup 1) (const_int 0)))] + "" + [(set (match_dup 0) (match_dup 1)) + (parallel [(set (reg:CCZ FLAGS_REG) + (compare:CCZ (neg:SWI (match_dup 0)) (const_int 0))) + (set (match_dup 0) (neg:SWI (match_dup 0)))])]) + ;; Special expand pattern to handle integer mode abs (define_expand "abs2" -- cgit v1.1 From e2385690a3ead66744e51115966f25f9c05bb3e2 Mon Sep 17 00:00:00 2001 From: Hongyu Wang Date: Mon, 28 Feb 2022 15:09:59 +0800 Subject: i386: Fix V8HF vector init under -mno-avx [PR 104664] For V8HFmode vector init with HFmode, do not directly emits V8HF move with subreg, which may cause reload to assign general register to move src. gcc/ChangeLog: PR target/104664 * config/i386/i386-expand.cc (ix86_expand_vector_init_duplicate): Use vec_setv8hf_0 for HF to V8HFmode move instead of subreg. gcc/testsuite/ChangeLog: PR target/104664 * gcc.target/i386/pr104664.c: New test. --- gcc/config/i386/i386-expand.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index faa0191..530f83f 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -14899,7 +14899,12 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode, dperm.one_operand_p = true; if (mode == V8HFmode) - tmp1 = lowpart_subreg (V8HFmode, force_reg (HFmode, val), HFmode); + { + tmp1 = force_reg (HFmode, val); + tmp2 = gen_reg_rtx (mode); + emit_insn (gen_vec_setv8hf_0 (tmp2, CONST0_RTX (mode), tmp1)); + tmp1 = gen_lowpart (mode, tmp2); + } else { /* Extend to SImode using a paradoxical SUBREG. */ -- cgit v1.1 From 2240ebd8e46e098f972a662d0aad85348b304889 Mon Sep 17 00:00:00 2001 From: Robin Dapp Date: Mon, 7 Feb 2022 08:39:41 +0100 Subject: arc: Fix for new ifcvt behavior [PR104154] ifcvt now passes a CC-mode "comparison" to backends. This patch simply returns from gen_compare_reg () in that case since nothing needs to be prepared anymore. gcc/ChangeLog: PR rtl-optimization/104154 * config/arc/arc.cc (gen_compare_reg): Return the CC-mode comparison ifcvt passed us. --- gcc/config/arc/arc.cc | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/arc/arc.cc b/gcc/config/arc/arc.cc index c27ba99..fbc17e6 100644 --- a/gcc/config/arc/arc.cc +++ b/gcc/config/arc/arc.cc @@ -2256,6 +2256,12 @@ gen_compare_reg (rtx comparison, machine_mode omode) cmode = GET_MODE (x); if (cmode == VOIDmode) cmode = GET_MODE (y); + + /* If ifcvt passed us a MODE_CC comparison we can + just return it. It should be in the proper form already. 
*/ + if (GET_MODE_CLASS (cmode) == MODE_CC) + return comparison; + if (cmode != SImode && cmode != SFmode && cmode != DFmode) return NULL_RTX; if (cmode == SImode) -- cgit v1.1 From 7efe46935c5fce8db13e00aa6f4b0f1599b330e4 Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Fri, 25 Feb 2022 11:47:12 +0100 Subject: [nvptx] Add nvptx-sm.def Add a file gcc/config/nvptx/nvptx-sm.def that lists all sm_xx versions used in the port, like so: ... NVPTX_SM(30, NVPTX_SM_SEP) NVPTX_SM(35, NVPTX_SM_SEP) NVPTX_SM(53, NVPTX_SM_SEP) NVPTX_SM(70, NVPTX_SM_SEP) NVPTX_SM(75, NVPTX_SM_SEP) NVPTX_SM(80,) ... and use it in various places using a pattern: ... #define NVPTX_SM(XX, SEP) { ... } #include "nvptx-sm.def" #undef NVPTX_SM ... Tested on nvptx. gcc/ChangeLog: 2022-02-25 Tom de Vries * config/nvptx/nvptx-sm.def: New file. * config/nvptx/nvptx-c.cc (nvptx_cpu_cpp_builtins): Use nvptx-sm.def. * config/nvptx/nvptx-opts.h (enum ptx_isa): Same. * config/nvptx/nvptx.cc (sm_version_to_string) (nvptx_omp_device_kind_arch_isa): Same. --- gcc/config/nvptx/nvptx-c.cc | 22 ++++++++++------------ gcc/config/nvptx/nvptx-opts.h | 11 +++++------ gcc/config/nvptx/nvptx-sm.def | 30 ++++++++++++++++++++++++++++++ gcc/config/nvptx/nvptx.cc | 36 ++++++++++++------------------------ 4 files changed, 57 insertions(+), 42 deletions(-) create mode 100644 gcc/config/nvptx/nvptx-sm.def (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx-c.cc b/gcc/config/nvptx/nvptx-c.cc index b2375fb..02f7562 100644 --- a/gcc/config/nvptx/nvptx-c.cc +++ b/gcc/config/nvptx/nvptx-c.cc @@ -39,17 +39,15 @@ nvptx_cpu_cpp_builtins (void) cpp_define (parse_in, "__nvptx_softstack__"); if (TARGET_UNIFORM_SIMT) cpp_define (parse_in,"__nvptx_unisimt__"); - if (TARGET_SM80) - cpp_define (parse_in, "__PTX_SM__=800"); - else if (TARGET_SM75) - cpp_define (parse_in, "__PTX_SM__=750"); - else if (TARGET_SM70) - cpp_define (parse_in, "__PTX_SM__=700"); - else if (TARGET_SM53) - cpp_define (parse_in, "__PTX_SM__=530"); - else if (TARGET_SM35) - cpp_define (parse_in, "__PTX_SM__=350"); - else - cpp_define (parse_in,"__PTX_SM__=300"); + + const char *ptx_sm = NULL; +#define NVPTX_SM(XX, SEP) \ + { \ + if (TARGET_SM ## XX) \ + ptx_sm = "__PTX_SM__=" #XX "0"; \ + } +#include "nvptx-sm.def" +#undef NVPTX_SM + cpp_define (parse_in, ptx_sm); } diff --git a/gcc/config/nvptx/nvptx-opts.h b/gcc/config/nvptx/nvptx-opts.h index 30852b6..86b433c 100644 --- a/gcc/config/nvptx/nvptx-opts.h +++ b/gcc/config/nvptx/nvptx-opts.h @@ -22,12 +22,11 @@ enum ptx_isa { - PTX_ISA_SM30, - PTX_ISA_SM35, - PTX_ISA_SM53, - PTX_ISA_SM70, - PTX_ISA_SM75, - PTX_ISA_SM80 +#define NVPTX_SM(XX, SEP) PTX_ISA_SM ## XX SEP +#define NVPTX_SM_SEP , +#include "nvptx-sm.def" +#undef NVPTX_SM_SEP +#undef NVPTX_SM }; enum ptx_version diff --git a/gcc/config/nvptx/nvptx-sm.def b/gcc/config/nvptx/nvptx-sm.def new file mode 100644 index 0000000..c552eb0 --- /dev/null +++ b/gcc/config/nvptx/nvptx-sm.def @@ -0,0 +1,30 @@ +/* Copyright (C) 2022 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. 
+ + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +#ifndef NVPTX_SM_SEP +#define NVPTX_SM_SEP +#endif + +NVPTX_SM (30, NVPTX_SM_SEP) +NVPTX_SM (35, NVPTX_SM_SEP) +NVPTX_SM (53, NVPTX_SM_SEP) +NVPTX_SM (70, NVPTX_SM_SEP) +NVPTX_SM (75, NVPTX_SM_SEP) +NVPTX_SM (80,) + +#undef NVPTX_SM_SEP diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index 7862a90..f3179ef 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -276,18 +276,11 @@ sm_version_to_string (enum ptx_isa sm) { switch (sm) { - case PTX_ISA_SM30: - return "30"; - case PTX_ISA_SM35: - return "35"; - case PTX_ISA_SM53: - return "53"; - case PTX_ISA_SM70: - return "70"; - case PTX_ISA_SM75: - return "75"; - case PTX_ISA_SM80: - return "80"; +#define NVPTX_SM(XX, SEP) \ + case PTX_ISA_SM ## XX: \ + return #XX; +#include "nvptx-sm.def" +#undef NVPTX_SM default: gcc_unreachable (); } @@ -6177,18 +6170,13 @@ nvptx_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait, case omp_device_arch: return strcmp (name, "nvptx") == 0; case omp_device_isa: - if (strcmp (name, "sm_30") == 0) - return !TARGET_SM35; - if (strcmp (name, "sm_35") == 0) - return TARGET_SM35 && !TARGET_SM53; - if (strcmp (name, "sm_53") == 0) - return TARGET_SM53 && !TARGET_SM70; - if (strcmp (name, "sm_70") == 0) - return TARGET_SM70 && !TARGET_SM75; - if (strcmp (name, "sm_75") == 0) - return TARGET_SM75 && !TARGET_SM80; - if (strcmp (name, "sm_80") == 0) - return TARGET_SM80; +#define NVPTX_SM(XX, SEP) \ + { \ + if (strcmp (name, "sm_" #XX) == 0) \ + return ptx_isa_option == PTX_ISA_SM ## XX; \ + } +#include "nvptx-sm.def" +#undef NVPTX_SM return 0; default: gcc_unreachable (); -- cgit v1.1 From 22adaa5e565a0355dc013b4c1eeefd8ff4a96d9a Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Fri, 25 Feb 2022 12:18:17 +0100 Subject: [nvptx] Use nvptx-sm.def for t-omp-device Add a script gen-omp-device-properties.sh that uses nvptx-sm.def to generate omp-device-properties-nvptx. Tested on x86_64 with nvptx accelerator. gcc/ChangeLog: 2022-02-25 Tom de Vries * config/nvptx/gen-omp-device-properties.sh: New file. * config/nvptx/t-omp-device: Use gen-omp-device-properties.sh. --- gcc/config/nvptx/gen-omp-device-properties.sh | 33 +++++++++++++++++++++++++++ gcc/config/nvptx/t-omp-device | 7 +++--- 2 files changed, 36 insertions(+), 4 deletions(-) create mode 100644 gcc/config/nvptx/gen-omp-device-properties.sh (limited to 'gcc/config') diff --git a/gcc/config/nvptx/gen-omp-device-properties.sh b/gcc/config/nvptx/gen-omp-device-properties.sh new file mode 100644 index 0000000..175092c --- /dev/null +++ b/gcc/config/nvptx/gen-omp-device-properties.sh @@ -0,0 +1,33 @@ +#!/bin/sh + +# Copyright (C) 2022 Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GCC is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . 
+ +nvptx_sm_def="$1/nvptx-sm.def" + +sms=$(grep ^NVPTX_SM $nvptx_sm_def | sed 's/.*(//;s/,.*//') + +echo kind: gpu +echo arch: nvptx + +isa="" +for sm in $sms; do + isa="$isa sm_$sm" +done + +echo isa: $isa diff --git a/gcc/config/nvptx/t-omp-device b/gcc/config/nvptx/t-omp-device index 4228218..c2b28a4 100644 --- a/gcc/config/nvptx/t-omp-device +++ b/gcc/config/nvptx/t-omp-device @@ -1,4 +1,3 @@ -omp-device-properties-nvptx: $(srcdir)/config/nvptx/nvptx.cc - echo kind: gpu > $@ - echo arch: nvptx >> $@ - echo isa: sm_30 sm_35 sm_53 sm_70 sm_75 sm_80 >> $@ +omp-device-properties-nvptx: $(srcdir)/config/nvptx/nvptx-sm.def + $(SHELL) $(srcdir)/config/nvptx/gen-omp-device-properties.sh \ + "$(srcdir)/config/nvptx" > $@ -- cgit v1.1 From d59d13c89503baf92d14b04c05708a6296916fad Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Fri, 25 Feb 2022 11:49:01 +0100 Subject: [nvptx] Add nvptx-gen.h and nvptx-gen.opt Use nvptx-sm.def to generate new files nvptx-gen.h and nvptx-gen.opt, and: - include nvptx-gen.h in nvptx.h, and - add nvptx-gen.opt to extra_options (before nvptx.opt, in case that matters). Tested on nvptx. gcc/ChangeLog: 2022-02-25 Tom de Vries * config.gcc (nvptx*-*-*): Add nvptx/nvptx-gen.opt to extra_options. * config/nvptx/gen-copyright.sh: New file. * config/nvptx/gen-h.sh: New file. * config/nvptx/gen-opt.sh: New file. * config/nvptx/nvptx.h (TARGET_SM35, TARGET_SM53, TARGET_SM70) (TARGET_SM75, TARGET_SM80): Move ... * config/nvptx/nvptx-gen.h: ... here. New file, generate. * config/nvptx/nvptx.opt (Enum ptx_isa): Move ... * config/nvptx/nvptx-gen.opt: ... here. New file, generate. * config/nvptx/t-nvptx ($(srcdir)/config/nvptx/nvptx-gen.h) ($(srcdir)/config/nvptx/nvptx-gen.opt): New make target. --- gcc/config/nvptx/gen-copyright.sh | 82 +++++++++++++++++++++++++++++++++++++++ gcc/config/nvptx/gen-h.sh | 44 +++++++++++++++++++++ gcc/config/nvptx/gen-opt.sh | 66 +++++++++++++++++++++++++++++++ gcc/config/nvptx/nvptx-gen.h | 29 ++++++++++++++ gcc/config/nvptx/nvptx-gen.opt | 42 ++++++++++++++++++++ gcc/config/nvptx/nvptx.h | 6 +-- gcc/config/nvptx/nvptx.opt | 22 ----------- gcc/config/nvptx/t-nvptx | 17 ++++++++ 8 files changed, 281 insertions(+), 27 deletions(-) create mode 100644 gcc/config/nvptx/gen-copyright.sh create mode 100644 gcc/config/nvptx/gen-h.sh create mode 100644 gcc/config/nvptx/gen-opt.sh create mode 100644 gcc/config/nvptx/nvptx-gen.h create mode 100644 gcc/config/nvptx/nvptx-gen.opt (limited to 'gcc/config') diff --git a/gcc/config/nvptx/gen-copyright.sh b/gcc/config/nvptx/gen-copyright.sh new file mode 100644 index 0000000..79f4899 --- /dev/null +++ b/gcc/config/nvptx/gen-copyright.sh @@ -0,0 +1,82 @@ +#!/bin/sh + +# Copyright (C) 2022 Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GCC is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . + +style="$1" +case $style in + opt) + ;; + c) + first=true + ;; + *) + echo "Unknown style: \"$style\"" + exit 1 + ;; +esac + +( cat <. 
+EOF +) | while read line; do + case $style in + opt) + if [ "$line" = "" ]; then + echo ";" + else + echo "; $line" + fi + ;; + c) + if $first; then + echo "/* $line" + first=false + else + if [ "$line" = "" ]; then + echo + else + echo " $line" + fi + fi + ;; + esac +done + + +case $style in + c) + echo "*/" + ;; +esac diff --git a/gcc/config/nvptx/gen-h.sh b/gcc/config/nvptx/gen-h.sh new file mode 100644 index 0000000..605f874 --- /dev/null +++ b/gcc/config/nvptx/gen-h.sh @@ -0,0 +1,44 @@ +#!/bin/sh + +# Copyright (C) 2022 Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GCC is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . + +nvptx_sm_def="$1/nvptx-sm.def" +gen_copyright_sh="$1/gen-copyright.sh" + +sms=$(grep ^NVPTX_SM $nvptx_sm_def | sed 's/.*(//;s/,.*//') + +cat <= PTX_ISA_SM$sm) +EOF +done diff --git a/gcc/config/nvptx/gen-opt.sh b/gcc/config/nvptx/gen-opt.sh new file mode 100644 index 0000000..5248ed2 --- /dev/null +++ b/gcc/config/nvptx/gen-opt.sh @@ -0,0 +1,66 @@ +#!/bin/sh + +# Copyright (C) 2022 Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GCC is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . + +nvptx_sm_def="$1/nvptx-sm.def" +gen_copyright_sh="$1/gen-copyright.sh" + +sms=$(grep ^NVPTX_SM $nvptx_sm_def | sed 's/.*(//;s/,.*//') + +last= +for sm in $sms; do + last="$sm" +done + +cat <. +*/ + +#define TARGET_SM30 (ptx_isa_option >= PTX_ISA_SM30) +#define TARGET_SM35 (ptx_isa_option >= PTX_ISA_SM35) +#define TARGET_SM53 (ptx_isa_option >= PTX_ISA_SM53) +#define TARGET_SM70 (ptx_isa_option >= PTX_ISA_SM70) +#define TARGET_SM75 (ptx_isa_option >= PTX_ISA_SM75) +#define TARGET_SM80 (ptx_isa_option >= PTX_ISA_SM80) diff --git a/gcc/config/nvptx/nvptx-gen.opt b/gcc/config/nvptx/nvptx-gen.opt new file mode 100644 index 0000000..b6d433e --- /dev/null +++ b/gcc/config/nvptx/nvptx-gen.opt @@ -0,0 +1,42 @@ +; -*- buffer-read-only: t -*- +; Generated automatically by gen-opt.sh from nvptx-sm.def. + +; Copyright (C) 2022 Free Software Foundation, Inc. +; +; This file is part of GCC. +; +; GCC is free software; you can redistribute it and/or modify it under +; the terms of the GNU General Public License as published by the Free +; Software Foundation; either version 3, or (at your option) any later +; version. +; +; GCC is distributed in the hope that it will be useful, but WITHOUT ANY +; WARRANTY; without even the implied warranty of MERCHANTABILITY or +; FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +; for more details. +; +; You should have received a copy of the GNU General Public License +; along with GCC; see the file COPYING3. If not see +; . + +Enum +Name(ptx_isa) Type(int) +Known PTX ISA versions (for use with the -misa= option): + +EnumValue +Enum(ptx_isa) String(sm_30) Value(PTX_ISA_SM30) + +EnumValue +Enum(ptx_isa) String(sm_35) Value(PTX_ISA_SM35) + +EnumValue +Enum(ptx_isa) String(sm_53) Value(PTX_ISA_SM53) + +EnumValue +Enum(ptx_isa) String(sm_70) Value(PTX_ISA_SM70) + +EnumValue +Enum(ptx_isa) String(sm_75) Value(PTX_ISA_SM75) + +EnumValue +Enum(ptx_isa) String(sm_80) Value(PTX_ISA_SM80) diff --git a/gcc/config/nvptx/nvptx.h b/gcc/config/nvptx/nvptx.h index edffd08..4ab412b 100644 --- a/gcc/config/nvptx/nvptx.h +++ b/gcc/config/nvptx/nvptx.h @@ -86,11 +86,7 @@ #define Pmode (TARGET_ABI64 ? DImode : SImode) #define STACK_SIZE_MODE Pmode -#define TARGET_SM35 (ptx_isa_option >= PTX_ISA_SM35) -#define TARGET_SM53 (ptx_isa_option >= PTX_ISA_SM53) -#define TARGET_SM70 (ptx_isa_option >= PTX_ISA_SM70) -#define TARGET_SM75 (ptx_isa_option >= PTX_ISA_SM75) -#define TARGET_SM80 (ptx_isa_option >= PTX_ISA_SM80) +#include "nvptx-gen.h" #define TARGET_PTX_6_0 (ptx_version_option >= PTX_VERSION_6_0) #define TARGET_PTX_6_3 (ptx_version_option >= PTX_VERSION_6_3) diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt index f555ad1..c83ceb3 100644 --- a/gcc/config/nvptx/nvptx.opt +++ b/gcc/config/nvptx/nvptx.opt @@ -51,28 +51,6 @@ mgomp Target Mask(GOMP) Generate code for OpenMP offloading: enables -msoft-stack and -muniform-simt. -Enum -Name(ptx_isa) Type(int) -Known PTX ISA versions (for use with the -misa= option): - -EnumValue -Enum(ptx_isa) String(sm_30) Value(PTX_ISA_SM30) - -EnumValue -Enum(ptx_isa) String(sm_35) Value(PTX_ISA_SM35) - -EnumValue -Enum(ptx_isa) String(sm_53) Value(PTX_ISA_SM53) - -EnumValue -Enum(ptx_isa) String(sm_70) Value(PTX_ISA_SM70) - -EnumValue -Enum(ptx_isa) String(sm_75) Value(PTX_ISA_SM75) - -EnumValue -Enum(ptx_isa) String(sm_80) Value(PTX_ISA_SM80) - ; Default needs to be in sync with default in ASM_SPEC in nvptx.h. 
misa= Target RejectNegative ToLower Joined Enum(ptx_isa) Var(ptx_isa_option) Init(PTX_ISA_SM35) diff --git a/gcc/config/nvptx/t-nvptx b/gcc/config/nvptx/t-nvptx index b170766..f17fc9c 100644 --- a/gcc/config/nvptx/t-nvptx +++ b/gcc/config/nvptx/t-nvptx @@ -13,4 +13,21 @@ mkoffload$(exeext): mkoffload.o collect-utils.o libcommon-target.a $(LIBIBERTY) +$(LINKER) $(ALL_LINKERFLAGS) $(LDFLAGS) -o $@ \ mkoffload.o collect-utils.o libcommon-target.a $(LIBIBERTY) $(LIBS) +$(srcdir)/config/nvptx/nvptx.h: $(srcdir)/config/nvptx/nvptx-gen.h +$(srcdir)/config/nvptx/nvptx-gen.h: s-nvptx-gen-h; @true +s-nvptx-gen-h: $(srcdir)/config/nvptx/nvptx-sm.def + $(SHELL) $(srcdir)/config/nvptx/gen-h.sh "$(srcdir)/config/nvptx" \ + > tmp-nvptx-gen.h + $(SHELL) $(srcdir)/../move-if-change \ + tmp-nvptx-gen.h $(srcdir)/config/nvptx/nvptx-gen.h + $(STAMP) s-nvptx-gen-h + +$(srcdir)/config/nvptx/nvptx-gen.opt: s-nvptx-gen-opt; @true +s-nvptx-gen-opt: $(srcdir)/config/nvptx/nvptx-sm.def + $(SHELL) $(srcdir)/config/nvptx/gen-opt.sh "$(srcdir)/config/nvptx" \ + > tmp-nvptx-gen.opt + $(SHELL) $(srcdir)/../move-if-change \ + tmp-nvptx-gen.opt $(srcdir)/config/nvptx/nvptx-gen.opt + $(STAMP) s-nvptx-gen-opt + MULTILIB_OPTIONS = mgomp -- cgit v1.1 From c2e0d0c1cfb4bf29daed189b39885841ee201a65 Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Mon, 28 Feb 2022 16:06:54 +0100 Subject: [nvptx] Handle DCmode in define_expand "omp_simt_xchg_{bfly,idx}" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For a test-case doing an openmp target simd reduction on a complex double: ... DOUBLE COMPLEX :: counter_N0 ... !$OMP TARGET SIMD reduction(+: counter_N0) ... we run into: ... during RTL pass: expand b.f90: In function ‘MAIN__._omp_fn.0’: b.f90:23:32: internal compiler error: in expand_insn, at optabs.cc:8029 23 | counter_N0 = counter_N0 + 1. | ^ 0x10f1cd3 expand_insn(insn_code, unsigned int, expand_operand*) gcc/optabs.cc:8029 0xeac435 expand_GOMP_SIMT_XCHG_BFLY gcc/internal-fn.cc:375 ... Fix this by handling DCmode and CDImode in define_expand "omp_simt_xchg_{bfly,idx}". Tested on x86_64 with nvptx accelerator. gcc/ChangeLog: 2022-02-28 Tom de Vries PR target/102429 * config/nvptx/nvptx.cc (nvptx_gen_shuffle): Handle DCmode and CDImode. * config/nvptx/nvptx.md (define_predicate "nvptx_register_or_complex_di_df_register_operand"): New predicate. (define_expand "omp_simt_xchg_bfly", define_expand "omp_simt_xchg_idx"): Use nvptx_register_or_complex_di_df_register_operand. 
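[Editor's sketch] A C analogue of the Fortran reproducer can be useful when re-testing this path. This is only a sketch, assuming an OpenMP build with nvptx offloading enabled (e.g. -fopenmp -foffload=nvptx-none); it is not the committed testcase.

  #include <complex.h>

  double _Complex
  count_up (int n)
  {
    double _Complex counter = 0.0;
  #pragma omp target simd reduction(+: counter)
    for (int i = 0; i < n; i++)
      counter += 1.0;          /* DCmode reduction in a SIMT region */
    return counter;
  }

The reduction over a complex double is what reaches IFN_GOMP_SIMT_XCHG_BFLY with a DCmode operand and previously tripped the expand_insn ICE.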
--- gcc/config/nvptx/nvptx.cc | 17 +++++++++++++++++ gcc/config/nvptx/nvptx.md | 20 ++++++++++++++++---- 2 files changed, 33 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index f3179ef..6ca99a6 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -1941,6 +1941,23 @@ nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind) switch (GET_MODE (dst)) { + case E_DCmode: + case E_CDImode: + { + gcc_assert (GET_CODE (dst) == CONCAT); + gcc_assert (GET_CODE (src) == CONCAT); + rtx dst_real = XEXP (dst, 0); + rtx dst_imag = XEXP (dst, 1); + rtx src_real = XEXP (src, 0); + rtx src_imag = XEXP (src, 1); + + start_sequence (); + emit_insn (nvptx_gen_shuffle (dst_real, src_real, idx, kind)); + emit_insn (nvptx_gen_shuffle (dst_imag, src_imag, idx, kind)); + res = get_insns (); + end_sequence (); + } + break; case E_SImode: res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind)); break; diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index 4989b56..a453c1d 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -94,6 +94,18 @@ return register_operand (op, mode); }) +(define_predicate "nvptx_register_or_complex_di_df_register_operand" + (ior (match_code "reg") + (match_code "concat")) +{ + if (GET_CODE (op) == CONCAT) + return ((GET_MODE (op) == DCmode || GET_MODE (op) == CDImode) + && nvptx_register_operand (XEXP (op, 0), mode) + && nvptx_register_operand (XEXP (op, 1), mode)); + + return nvptx_register_operand (op, mode); +}) + (define_predicate "nvptx_nonimmediate_operand" (match_code "mem,reg") { @@ -1902,8 +1914,8 @@ ;; Implement IFN_GOMP_SIMT_XCHG_BFLY: perform a "butterfly" exchange ;; across lanes (define_expand "omp_simt_xchg_bfly" - [(match_operand 0 "nvptx_register_operand" "=R") - (match_operand 1 "nvptx_register_operand" "R") + [(match_operand 0 "nvptx_register_or_complex_di_df_register_operand" "=R") + (match_operand 1 "nvptx_register_or_complex_di_df_register_operand" "R") (match_operand:SI 2 "nvptx_nonmemory_operand" "Ri")] "" { @@ -1915,8 +1927,8 @@ ;; Implement IFN_GOMP_SIMT_XCHG_IDX: broadcast value in operand 1 ;; from lane given by index in operand 2 to operand 0 in all lanes (define_expand "omp_simt_xchg_idx" - [(match_operand 0 "nvptx_register_operand" "=R") - (match_operand 1 "nvptx_register_operand" "R") + [(match_operand 0 "nvptx_register_or_complex_di_df_register_operand" "=R") + (match_operand 1 "nvptx_register_or_complex_di_df_register_operand" "R") (match_operand:SI 2 "nvptx_nonmemory_operand" "Ri")] "" { -- cgit v1.1 From 12fa7641ceed9c9139e2ea7b62c11f3dc5b6f6f4 Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Thu, 3 Mar 2022 09:21:04 +0100 Subject: [nvptx] Use --no-verify for sm_30 In PR97348, we ran into the problem that recent CUDA dropped support for sm_30, which inhibited the build when building with CUDA bin in the path, because the nvptx-tools assembler uses CUDA's ptxas to do ptx verification. To fix this, in gcc-11 the default sm_xx was moved from sm_30 to sm_35. This however broke support for sm_30 boards: an executable build for sm_30 might contain sm_35 code from the libraries, which are build with the default sm_xx (PR104758). We want to fix this by going back to having the libraries build with sm_30, as was the case for gcc-5 to gcc-10. That however reintroduces the problem from PR97348. Deal with PR97348 in the simplest way possible: when calling the assembler for sm_30, specify --no-verify. 
This has the unfortunate effect that after fixing PR104758 by building libraries with sm_30, the libraries are no longer verified. This can be improved upon by: - adding a configure test in gcc that tests if CUDA supports sm_30, and if so disabling this patch - dealing with this in nvptx-tools somehow, either: - detect at ptxas execution time that it doesn't support sm_30, or - detect this at nvptx-tool configure time. gcc/ChangeLog: 2022-03-03 Tom de Vries * config/nvptx/nvptx.h (ASM_SPEC): Add %{misa=sm_30:--no-verify}. --- gcc/config/nvptx/nvptx.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.h b/gcc/config/nvptx/nvptx.h index 4ab412b..3ca22a5 100644 --- a/gcc/config/nvptx/nvptx.h +++ b/gcc/config/nvptx/nvptx.h @@ -32,7 +32,7 @@ /* Default needs to be in sync with default for misa in nvptx.opt. We add a default here to work around a hard-coded sm_30 default in nvptx-as. */ -#define ASM_SPEC "%{misa=*:-m %*; :-m sm_35}" +#define ASM_SPEC "%{misa=*:-m %*; :-m sm_35}%{misa=sm_30:--no-verify}" #define TARGET_CPU_CPP_BUILTINS() nvptx_cpu_cpp_builtins () -- cgit v1.1 From 07667c911b1827fb98a1b5da621d51d8fcf0409a Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Wed, 2 Mar 2022 12:04:39 +0100 Subject: [nvptx] Build libraries with misa=sm_30 In gcc-11, when specifying -misa=sm_30, an executable may still contain sm_35 code (due to libraries being built with the default -misa=sm_35), so it won't run on an sm_30 board. Fix this by building libraries with sm_30, as was the case in gcc-5 to gcc-10. gcc/ChangeLog: 2022-03-03 Tom de Vries PR target/104758 * config/nvptx/t-nvptx (MULTILIB_EXTRA_OPTS): Add misa=sm_30. --- gcc/config/nvptx/t-nvptx | 2 ++ 1 file changed, 2 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/t-nvptx b/gcc/config/nvptx/t-nvptx index f17fc9c..056d2dd 100644 --- a/gcc/config/nvptx/t-nvptx +++ b/gcc/config/nvptx/t-nvptx @@ -31,3 +31,5 @@ s-nvptx-gen-opt: $(srcdir)/config/nvptx/nvptx-sm.def $(STAMP) s-nvptx-gen-opt MULTILIB_OPTIONS = mgomp + +MULTILIB_EXTRA_OPTS = misa=sm_30 -- cgit v1.1 From 5b5e456f0187406e17444b6e40d974f94524f2a2 Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Thu, 3 Mar 2022 09:22:42 +0100 Subject: [nvptx] Build libraries with mptx=3.1 In gcc-5 to gcc-11, the ptx isa version was 3.1. On trunk, the default is now 6.0, which is also what will be the value in the libraries. Consequently, there may be setups with an older driver that worked with gcc-11, but will become unsupported with gcc-12. Fix this by building the libraries with mptx=3.1. After this, setups with an older driver still won't work out of the box with gcc-12, because the default ptx isa version has changed, but should work after specifying mptx=3.1. gcc/ChangeLog: 2022-03-03 Tom de Vries * config/nvptx/t-nvptx (MULTILIB_EXTRA_OPTS): Add mptx=3.1. --- gcc/config/nvptx/t-nvptx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/t-nvptx b/gcc/config/nvptx/t-nvptx index 056d2dd..8f67264 100644 --- a/gcc/config/nvptx/t-nvptx +++ b/gcc/config/nvptx/t-nvptx @@ -32,4 +32,4 @@ s-nvptx-gen-opt: $(srcdir)/config/nvptx/nvptx-sm.def MULTILIB_OPTIONS = mgomp -MULTILIB_EXTRA_OPTS = misa=sm_30 +MULTILIB_EXTRA_OPTS = misa=sm_30 mptx=3.1 -- cgit v1.1 From 609e8c492d62d92465460eae3d43dfc4b2c68288 Mon Sep 17 00:00:00 2001 From: "H.J. 
Lu" Date: Sat, 26 Feb 2022 14:17:23 -0800 Subject: x86: Always return pseudo register in ix86_gen_scratch_sse_rtx ix86_gen_scratch_sse_rtx returns XMM7/XMM15/XMM31 as a scratch vector register to prevent RTL optimizers from removing vector register. It introduces a conflict with explicit XMM7/XMM15/XMM31 usage and when it is called by RTL optimizers, it may introduce conflicting usages of XMM7/XMM15/XMM31. Change ix86_gen_scratch_sse_rtx to always return a pseudo register and xfail x86 tests which are optimized with a hard scratch register. gcc/ PR target/104704 * config/i386/i386.cc (ix86_gen_scratch_sse_rtx): Always return a pseudo register. gcc/testsuite/ PR target/104704 * gcc.target/i386/incoming-11.c: Xfail. * gcc.target/i386/pieces-memset-3.c: Likewise. * gcc.target/i386/pieces-memset-37.c: Likewise. * gcc.target/i386/pieces-memset-39.c: Likewise. * gcc.target/i386/pieces-memset-46.c: Likewise. * gcc.target/i386/pieces-memset-47.c: Likewise. * gcc.target/i386/pieces-memset-48.c: Likewise. * gcc.target/i386/pr90773-5.c: Likewise. * gcc.target/i386/pr90773-14.c: Likewise. * gcc.target/i386/pr90773-17.c: Likewise. * gcc.target/i386/pr100865-8a.c: Likewise. * gcc.target/i386/pr100865-8c.c: Likewise. * gcc.target/i386/pr100865-9c.c: Likewise. * gcc.target/i386/pieces-memset-21.c: Always expect vzeroupper. * gcc.target/i386/pr82941-1.c: Likewise. * gcc.target/i386/pr82942-1.c: Likewise. * gcc.target/i386/pr82990-1.c: Likewise. * gcc.target/i386/pr82990-3.c: Likewise. * gcc.target/i386/pr82990-5.c: Likewise. * gcc.target/i386/pr100865-11b.c: Expect vmovdqa instead of vmovdqa64. * gcc.target/i386/pr100865-12b.c: Likewise. * gcc.target/i386/pr100865-8b.c: Likewise. * gcc.target/i386/pr100865-9b.c: Likewise. * gcc.target/i386/pr104704-1.c: New test. * gcc.target/i386/pr104704-2.c: Likewise. * gcc.target/i386/pr104704-3.c: Likewise. * gcc.target/i386/pr104704-4.c: Likewise. * gcc.target/i386/pr104704-5.c: Likewise. * gcc.target/i386/pr104704-6.c: Likewise. --- gcc/config/i386/i386.cc | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index b2bf905..9521990 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -23786,24 +23786,7 @@ ix86_optab_supported_p (int op, machine_mode mode1, machine_mode, rtx ix86_gen_scratch_sse_rtx (machine_mode mode) { - if (TARGET_SSE && !lra_in_progress) - { - unsigned int regno; - if (TARGET_64BIT) - { - /* In 64-bit mode, use XMM31 to avoid vzeroupper and always - use XMM31 for CSE. */ - if (ix86_hard_regno_mode_ok (LAST_EXT_REX_SSE_REG, mode)) - regno = LAST_EXT_REX_SSE_REG; - else - regno = LAST_REX_SSE_REG; - } - else - regno = LAST_SSE_REG; - return gen_rtx_REG (mode, regno); - } - else - return gen_reg_rtx (mode); + return gen_reg_rtx (mode); } /* Address space support. -- cgit v1.1 From cb16bc3b5f34733ef9bbf8d2e3acacdecb099a62 Mon Sep 17 00:00:00 2001 From: Peter Bergner Date: Fri, 4 Mar 2022 09:03:44 -0600 Subject: rs6000: Allow -mlong-double-64 after -mabi={ibm,ieee}longdouble [PR104208, PR87496] The glibc build is showing a build error due to extra "error" checking from my PR87496 fix. That checking was overeager, disallowing setting the long double size to 64-bits if the 128-bit long double ABI had already been specified. Now we only emit an error if we specify a 128-bit long double ABI if our long double size is not 128 bits. 
This also fixes an erroneous error when -mabi=ieeelongdouble is used and ISA 2.06 is not enabled, but the long double size has been changed to 64 bits. 2022-03-04 Peter Bergner gcc/ PR target/87496 PR target/104208 * config/rs6000/rs6000.cc (rs6000_option_override_internal): Make the ISA 2.06 requirement for -mabi=ieeelongdouble conditional on -mlong-double-128. Move the -mabi=ieeelongdouble and -mabi=ibmlongdouble error checking from here... * common/config/rs6000/rs6000-common.cc (rs6000_handle_option): ... to here. gcc/testsuite/ PR target/87496 PR target/104208 * gcc.target/powerpc/pr104208-1.c: New test. * gcc.target/powerpc/pr104208-2.c: Likewise. * gcc.target/powerpc/pr87496-2.c: Swap long double options to trigger the expected error. * gcc.target/powerpc/pr87496-3.c: Likewise. --- gcc/config/rs6000/rs6000.cc | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index a855e8c..5b100a8 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -4178,13 +4178,6 @@ rs6000_option_override_internal (bool global_init_p) ; /* The option value can be seen when cl_target_option_restore is called. */ else if (rs6000_long_double_type_size == 128) rs6000_long_double_type_size = FLOAT_PRECISION_TFmode; - else if (OPTION_SET_P (rs6000_ieeequad)) - { - if (global_options.x_rs6000_ieeequad) - error ("%qs requires %qs", "-mabi=ieeelongdouble", "-mlong-double-128"); - else - error ("%qs requires %qs", "-mabi=ibmlongdouble", "-mlong-double-128"); - } /* Set -mabi=ieeelongdouble on some old targets. In the future, power server systems will also set long double to be IEEE 128-bit. AIX and Darwin @@ -4194,13 +4187,13 @@ rs6000_option_override_internal (bool global_init_p) if (!OPTION_SET_P (rs6000_ieeequad)) rs6000_ieeequad = TARGET_IEEEQUAD_DEFAULT; - else + else if (TARGET_LONG_DOUBLE_128) { if (global_options.x_rs6000_ieeequad && (!TARGET_POPCNTD || !TARGET_VSX)) error ("%qs requires full ISA 2.06 support", "-mabi=ieeelongdouble"); - if (rs6000_ieeequad != TARGET_IEEEQUAD_DEFAULT && TARGET_LONG_DOUBLE_128) + if (rs6000_ieeequad != TARGET_IEEEQUAD_DEFAULT) { /* Determine if the user can change the default long double type at compilation time. You need GLIBC 2.32 or newer to be able to -- cgit v1.1 From f1b3e3853329b58fb2e50c17487df2ecbc4a5608 Mon Sep 17 00:00:00 2001 From: Iain Sandoe Date: Wed, 23 Feb 2022 13:53:44 +0000 Subject: LRA, rs6000, Darwin: Revise lo_sum use for forced constants [PR104117]. Follow up discussion to the initial patch for this PR identified that it is preferable to avoid the LRA change, and arrange for the target to reject the hi and lo_sum selections when presented with an invalid address. We split the Darwin high/low selectors into two: 1. One that handles non-PIC addresses (kernel mode, mdynamic-no-pic). 2. One that handles PIC addresses and rejects SYMBOL_REFs unless they are suitably wrapped in the MACHOPIC_OFFSET unspec. The second case is handled by providing a new predicate (macho_pic_address) that checks the requirements. Signed-off-by: Iain Sandoe PR target/104117 gcc/ChangeLog: * config/rs6000/darwin.md (@machopic_high_): New. (@machopic_low_): New. * config/rs6000/predicates.md (macho_pic_address): New. * config/rs6000/rs6000.cc (rs6000_legitimize_address): Do not apply the TLS processing to Darwin. * lra-constraints.cc (process_address_1): Revert the changes in r12-7209. 
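[Editor's sketch] As context for the new predicate in the diff below: the kind of source that materializes a high/lo_sum address pair is simply a reference to a global. Under -fPIC on Darwin the operand must now be the UNSPEC_MACHOPIC_OFFSET-wrapped form that macho_pic_address accepts. A tiny illustrative example (not part of the patch):

  extern int counter;

  int *
  counter_addr (void)
  {
    return &counter;   /* address formed with a hi/lo pair, e.g. lis/la  */
  }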
--- gcc/config/rs6000/darwin.md | 19 +++++++++++++++---- gcc/config/rs6000/predicates.md | 14 ++++++++++++++ gcc/config/rs6000/rs6000.cc | 2 +- 3 files changed, 30 insertions(+), 5 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/darwin.md b/gcc/config/rs6000/darwin.md index 8443585..e73d59e 100644 --- a/gcc/config/rs6000/darwin.md +++ b/gcc/config/rs6000/darwin.md @@ -121,21 +121,32 @@ You should have received a copy of the GNU General Public License stw %0,lo16(%2)(%1)" [(set_attr "type" "store")]) -;; 64-bit MachO load/store support - ;; Mach-O PIC. (define_insn "@macho_high_" [(set (match_operand:P 0 "gpc_reg_operand" "=b*r") (high:P (match_operand 1 "" "")))] - "TARGET_MACHO && (DEFAULT_ABI == ABI_DARWIN)" + "TARGET_MACHO && (DEFAULT_ABI == ABI_DARWIN) && !flag_pic" "lis %0,ha16(%1)") (define_insn "@macho_low_" [(set (match_operand:P 0 "gpc_reg_operand" "=r") (lo_sum:P (match_operand:P 1 "gpc_reg_operand" "b") (match_operand 2 "" "")))] - "TARGET_MACHO && (DEFAULT_ABI == ABI_DARWIN)" + "TARGET_MACHO && (DEFAULT_ABI == ABI_DARWIN) && !flag_pic" + "la %0,lo16(%2)(%1)") + +(define_insn "@machopic_high_" + [(set (match_operand:P 0 "gpc_reg_operand" "=b*r") + (high:P (match_operand 1 "macho_pic_address" "")))] + "TARGET_MACHO && flag_pic" + "lis %0,ha16(%1)") + +(define_insn "@machopic_low_" + [(set (match_operand:P 0 "gpc_reg_operand" "=r") + (lo_sum:P (match_operand:P 1 "gpc_reg_operand" "b") + (match_operand 2 "macho_pic_address" "")))] + "TARGET_MACHO && flag_pic" "la %0,lo16(%2)(%1)") (define_split diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md index c65dfb9..28f6e98 100644 --- a/gcc/config/rs6000/predicates.md +++ b/gcc/config/rs6000/predicates.md @@ -2045,3 +2045,17 @@ (if_then_else (match_test "TARGET_VSX") (match_operand 0 "reg_or_cint_operand") (match_operand 0 "const_int_operand"))) + +;; Return true if the operand is a valid Mach-O pic address. +;; +(define_predicate "macho_pic_address" + (match_code "const,unspec") +{ + if (GET_CODE (op) == CONST) + op = XEXP (op, 0); + + if (GET_CODE (op) == UNSPEC && XINT (op, 1) == UNSPEC_MACHOPIC_OFFSET) + return CONSTANT_P (XVECEXP (op, 0, 0)); + else + return false; +}) diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index 5b100a8..2388d44 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -9021,7 +9021,7 @@ rs6000_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED, else return force_reg (Pmode, x); } - if (SYMBOL_REF_P (x)) + if (SYMBOL_REF_P (x) && !TARGET_MACHO) { enum tls_model model = SYMBOL_REF_TLS_MODEL (x); if (model != 0) -- cgit v1.1 From 25587472ccd223c861fe77cfeca4ba33c3f6cd99 Mon Sep 17 00:00:00 2001 From: Iain Sandoe Date: Fri, 4 Mar 2022 12:39:03 +0000 Subject: Darwin: Fix a type mismatch warning for a non-GCC bootstrap compiler. DECL_MD_FUNCTION_CODE() returns an int, on one particular compiler the code in darwin_fold_builtin() triggers a warning. Fixed thus. Signed-off-by: Iain Sandoe gcc/ChangeLog: * config/darwin.cc (darwin_fold_builtin): Make fcode an int to avoid a mismatch with DECL_MD_FUNCTION_CODE(). 
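[Editor's sketch] The shape of the warning being avoided is the usual signed/unsigned mismatch. A contrived illustration, not GCC source (the helper here merely stands in for DECL_MD_FUNCTION_CODE):

  static int get_code (void) { return 42; }   /* stands in for DECL_MD_FUNCTION_CODE */

  int
  is_interesting (int interesting_code)
  {
    unsigned int fcode = get_code ();         /* int -> unsigned int: some compilers warn */
    return fcode == interesting_code;         /* signed/unsigned comparison: likewise */
  }

Declaring fcode as int, as the patch does, keeps both sides of the assignment and comparison signed.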
--- gcc/config/darwin.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/darwin.cc b/gcc/config/darwin.cc index 783fe3c..f065a13 100644 --- a/gcc/config/darwin.cc +++ b/gcc/config/darwin.cc @@ -3621,7 +3621,7 @@ tree darwin_fold_builtin (tree fndecl, int n_args, tree *argp, bool ARG_UNUSED (ignore)) { - unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl); + int fcode = DECL_MD_FUNCTION_CODE (fndecl); if (fcode == darwin_builtin_cfstring) { -- cgit v1.1 From 77eccbf39ed55297802bb66dff5f62507a7239e3 Mon Sep 17 00:00:00 2001 From: Segher Boessenkool Date: Tue, 1 Mar 2022 17:04:29 +0000 Subject: rs6000: Improve .machine This adds more correct .machine for most older CPUs. It should be conservative in the sense that everything we handled before we handle at least as well now. This does not yet revamp the server CPU handling, it is too risky at this point in time. Tested on powerpc64-linux {-m32,-m64}. Also manually tested with all -mcpu=, and the output of that passed through the GNU assembler. 2022-03-04 Segher Boessenkool * config/rs6000/rs6000.cc (rs6000_machine_from_flags): Restructure a bit. Handle most older CPUs. --- gcc/config/rs6000/rs6000.cc | 81 ++++++++++++++++++++++++++++++--------------- 1 file changed, 54 insertions(+), 27 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index 2388d44..7afbc29 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -5764,33 +5764,60 @@ const char *rs6000_machine; const char * rs6000_machine_from_flags (void) { - /* For some CPUs, the machine cannot be determined by ISA flags. We have to - check them first. */ - switch (rs6000_cpu) - { - case PROCESSOR_PPC8540: - case PROCESSOR_PPC8548: - return "e500"; - - case PROCESSOR_PPCE300C2: - case PROCESSOR_PPCE300C3: - return "e300"; - - case PROCESSOR_PPCE500MC: - return "e500mc"; - - case PROCESSOR_PPCE500MC64: - return "e500mc64"; - - case PROCESSOR_PPCE5500: - return "e5500"; - - case PROCESSOR_PPCE6500: - return "e6500"; - - default: - break; - } + /* e300 and e500 */ + if (rs6000_cpu == PROCESSOR_PPCE300C2 || rs6000_cpu == PROCESSOR_PPCE300C3) + return "e300"; + if (rs6000_cpu == PROCESSOR_PPC8540 || rs6000_cpu == PROCESSOR_PPC8548) + return "e500"; + if (rs6000_cpu == PROCESSOR_PPCE500MC) + return "e500mc"; + if (rs6000_cpu == PROCESSOR_PPCE500MC64) + return "e500mc64"; + if (rs6000_cpu == PROCESSOR_PPCE5500) + return "e5500"; + if (rs6000_cpu == PROCESSOR_PPCE6500) + return "e6500"; + + /* 400 series */ + if (rs6000_cpu == PROCESSOR_PPC403) + return "\"403\""; + if (rs6000_cpu == PROCESSOR_PPC405) + return "\"405\""; + if (rs6000_cpu == PROCESSOR_PPC440) + return "\"440\""; + if (rs6000_cpu == PROCESSOR_PPC476) + return "\"476\""; + + /* A2 */ + if (rs6000_cpu == PROCESSOR_PPCA2) + return "a2"; + + /* Cell BE */ + if (rs6000_cpu == PROCESSOR_CELL) + return "cell"; + + /* Titan */ + if (rs6000_cpu == PROCESSOR_TITAN) + return "titan"; + + /* 500 series and 800 series */ + if (rs6000_cpu == PROCESSOR_MPCCORE) + return "\"821\""; + + /* 600 series and 700 series, "classic" */ + if (rs6000_cpu == PROCESSOR_PPC601 || rs6000_cpu == PROCESSOR_PPC603 + || rs6000_cpu == PROCESSOR_PPC604 || rs6000_cpu == PROCESSOR_PPC604e + || rs6000_cpu == PROCESSOR_PPC750 || rs6000_cpu == PROCESSOR_POWERPC) + return "ppc"; + + /* Classic with AltiVec, "G4" */ + if (rs6000_cpu == PROCESSOR_PPC7400 || rs6000_cpu == PROCESSOR_PPC7450) + return "\"7450\""; + + /* The older 64-bit CPUs */ + 
if (rs6000_cpu == PROCESSOR_PPC620 || rs6000_cpu == PROCESSOR_PPC630 + || rs6000_cpu == PROCESSOR_RS64A || rs6000_cpu == PROCESSOR_POWERPC64) + return "ppc64"; HOST_WIDE_INT flags = rs6000_isa_flags; -- cgit v1.1 From 1301d7f647c7ac40da7f910aa6e790205e34bb8b Mon Sep 17 00:00:00 2001 From: Michael Meissner Date: Sat, 5 Mar 2022 00:01:52 -0500 Subject: Optimize signed DImode -> TImode on power10. On power10, GCC tries to optimize the signed conversion from DImode to TImode by using the vextsd2q instruction. However to generate this instruction, it would have to generate 3 direct moves (1 from the GPR registers to the altivec registers, and 2 from the altivec registers to the GPR register). This patch generates the shift right immediate instruction to do the conversion if the target/source registers ares GPR registers like it does on earlier systems. If the target/source registers are Altivec registers, it will generate the vextsd2q instruction. 2022-03-05 Michael Meissner gcc/ PR target/104698 * config/rs6000/vsx.md (UNSPEC_MTVSRD_DITI_W1): Delete. (mtvsrdd_diti_w1): Delete. (extendditi2): Convert from define_expand to define_insn_and_split. Replace with code to deal with both GPR registers and with altivec registers. gcc/testsuite/ PR target/104698 * gcc.target/powerpc/pr104698-1.c: New test. * gcc.target/powerpc/pr104698-2.c: New test. --- gcc/config/rs6000/vsx.md | 83 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 61 insertions(+), 22 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md index b53de10..d0fb92f 100644 --- a/gcc/config/rs6000/vsx.md +++ b/gcc/config/rs6000/vsx.md @@ -360,7 +360,6 @@ UNSPEC_XXGENPCV UNSPEC_MTVSBM UNSPEC_EXTENDDITI2 - UNSPEC_MTVSRD_DITI_W1 UNSPEC_VCNTMB UNSPEC_VEXPAND UNSPEC_VEXTRACT @@ -5023,15 +5022,67 @@ DONE; }) -;; ISA 3.1 vector sign extend -;; Move DI value from GPR to TI mode in VSX register, word 1. -(define_insn "mtvsrdd_diti_w1" - [(set (match_operand:TI 0 "register_operand" "=wa") - (unspec:TI [(match_operand:DI 1 "register_operand" "r")] - UNSPEC_MTVSRD_DITI_W1))] - "TARGET_POWERPC64 && TARGET_DIRECT_MOVE" - "mtvsrdd %x0,0,%1" - [(set_attr "type" "vecmove")]) +;; Sign extend DI to TI. We provide both GPR targets and Altivec targets on +;; power10. On earlier systems, the machine independent code will generate a +;; shift left to sign extend the 64-bit value to 128-bit. +;; +;; If the register allocator prefers to use GPR registers, we will use a shift +;; left instruction to sign extend the 64-bit value to 128-bit. +;; +;; If the register allocator prefers to use Altivec registers on power10, +;; generate the vextsd2q instruction. +(define_insn_and_split "extendditi2" + [(set (match_operand:TI 0 "register_operand" "=r,r,v,v,v") + (sign_extend:TI (match_operand:DI 1 "input_operand" "r,m,r,wa,Z"))) + (clobber (reg:DI CA_REGNO))] + "TARGET_POWERPC64 && TARGET_POWER10" + "#" + "&& reload_completed" + [(pc)] +{ + rtx dest = operands[0]; + rtx src = operands[1]; + int dest_regno = reg_or_subregno (dest); + + /* Handle conversion to GPR registers. Load up the low part and then do + a sign extension to the upper part. */ + if (INT_REGNO_P (dest_regno)) + { + rtx dest_hi = gen_highpart (DImode, dest); + rtx dest_lo = gen_lowpart (DImode, dest); + + emit_move_insn (dest_lo, src); + /* In case src is a MEM, we have to use the destination, which is a + register, instead of re-using the source. */ + rtx src2 = (REG_P (src) || SUBREG_P (src)) ? 
src : dest_lo; + emit_insn (gen_ashrdi3 (dest_hi, src2, GEN_INT (63))); + DONE; + } + + /* For conversion to an Altivec register, generate either a splat operation + or a load rightmost double word instruction. Both instructions gets the + DImode value into the lower 64 bits, and then do the vextsd2q + instruction. */ + + else if (ALTIVEC_REGNO_P (dest_regno)) + { + if (MEM_P (src)) + emit_insn (gen_vsx_lxvrdx (dest, src)); + else + { + rtx dest_v2di = gen_rtx_REG (V2DImode, dest_regno); + emit_insn (gen_vsx_splat_v2di (dest_v2di, src)); + } + + emit_insn (gen_extendditi2_vector (dest, dest)); + DONE; + } + + else + gcc_unreachable (); +} + [(set_attr "length" "8") + (set_attr "type" "shift,load,vecmove,vecperm,load")]) ;; Sign extend 64-bit value in TI reg, word 1, to 128-bit value in TI reg (define_insn "extendditi2_vector" @@ -5042,18 +5093,6 @@ "vextsd2q %0,%1" [(set_attr "type" "vecexts")]) -(define_expand "extendditi2" - [(set (match_operand:TI 0 "gpc_reg_operand") - (sign_extend:DI (match_operand:DI 1 "gpc_reg_operand")))] - "TARGET_POWER10" - { - /* Move 64-bit src from GPR to vector reg and sign extend to 128-bits. */ - rtx temp = gen_reg_rtx (TImode); - emit_insn (gen_mtvsrdd_diti_w1 (temp, operands[1])); - emit_insn (gen_extendditi2_vector (operands[0], temp)); - DONE; - }) - ;; ISA 3.0 Binary Floating-Point Support -- cgit v1.1 From 8ea4a34bd0b0a46277b5e077c89cbd86dfb09c48 Mon Sep 17 00:00:00 2001 From: Roger Sayle Date: Sat, 5 Mar 2022 08:50:45 +0000 Subject: PR 104732: Simplify/fix DI mode logic expansion/splitting on -m32. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This clean-up patch resolves PR testsuite/104732, the failure of the recent test gcc.target/i386/pr100711-1.c on 32-bit Solaris/x86. Rather than just tweak the testcase, the proposed approach is to fix the underlying problem by removing the "TARGET_STV && TARGET_SSE2" conditionals from the DI mode logical operation expanders and pre-reload splitters in i386.md, which as I'll show generate inferior code (even a GCC 12 regression) on !TARGET_64BIT whenever -mno-stv (such as Solaris) or -msse (but not -msse2). First a little bit of history. In the beginning, DImode operations on i386 weren't defined by the machine description, and lowered during RTL expansion to SI mode operations. The with PR 65105 in 2015, -mstv was added, together with a SWIM1248x mode iterator (later renamed to SWIM1248x) together with several *di3_doubleword post-reload splitters that made use of register allocation to perform some double word operations in 64-but XMM registers. A short while later in 2016, PR 70322 added similar support for one_cmpldi2. All of this logic was dependent upon "!TARGET_64BIT && TARGET_STV && TARGET_SSE2". With the passing of time, these conditions became irrelevant when in 2019, it was decided to split these double-word patterns before reload. https://gcc.gnu.org/pipermail/gcc-patches/2019-June/523877.html https://gcc.gnu.org/pipermail/gcc-patches/2019-October/532236.html Hence the current situation, where on most modern CPU architectures (where "TARGET_STV && TARGET_SSE2" is true), RTL is expanded with DI mode operations, that are then split into two SI mode instructions before reload, except on Solaris and other odd cases, where the splitting is to two SI mode instructions is done during RTL expansion. By the time compilation reaches register allocation both paths in theory produce identical or similar code, so the vestigial legacy/logic would appear to be harmless. 
Unfortunately, there is one place where this arbitrary choice of how to lower DI mode doubleword operations is visible to the middle-end, it controls whether the backend appears to have a suitable optab, and the presence (or not) of DImode optabs can influence vectorization cost models and veclower decisions. The issue (and code quality regression) can be seen in this test case: typedef long long v2di __attribute__((vector_size (16))); v2di x; void foo (long long a) { v2di t = {a, a}; x = ~t; } which when compiled with "-O2 -m32 -msse -march=pentiumpro" produces: foo: subl $28, %esp movl %ebx, 16(%esp) movl 32(%esp), %eax movl %esi, 20(%esp) movl 36(%esp), %edx movl %edi, 24(%esp) movl %eax, %esi movl %eax, %edi movl %edx, %ebx movl %edx, %ecx notl %esi notl %ebx movl %esi, (%esp) notl %edi notl %ecx movl %ebx, 4(%esp) movl 20(%esp), %esi movl %edi, 8(%esp) movl 16(%esp), %ebx movl %ecx, 12(%esp) movl 24(%esp), %edi movss 8(%esp), %xmm1 movss 12(%esp), %xmm2 movss (%esp), %xmm0 movss 4(%esp), %xmm3 unpcklps %xmm2, %xmm1 unpcklps %xmm3, %xmm0 movlhps %xmm1, %xmm0 movaps %xmm0, x addl $28, %esp ret Importantly notice the four "notl" instructions. With this patch: foo: subl $28, %esp movl 32(%esp), %edx movl 36(%esp), %eax notl %edx movl %edx, (%esp) notl %eax movl %eax, 4(%esp) movl %edx, 8(%esp) movl %eax, 12(%esp) movaps (%esp), %xmm1 movaps %xmm1, x addl $28, %esp ret Notice only two "notl" instructions. Checking with godbolt.org, GCC generated 4 NOTs in GCC 4.x and 5.x, 2 NOTs between GCC 6.x and 9.x, and regressed to 4 NOTs since GCC 10.x [which hopefully qualifies this clean-up as suitable for stage 4]. Most significantly, this patch allows pr100711-1.c to pass with -mno-stv, allowing pandn to be used with V2DImode on Solaris/x86. Fingers-crossed this should reduce the number of discrepancies encountered supporting Solaris/x86. 2022-03-05 Roger Sayle Uroš Bizjak gcc/ChangeLog PR testsuite/104732 * config/i386/i386.md (SWIM1248x): Renamed from SWIM1248s. Include DI mode unconditionally. (*anddi3_doubleword): Remove && TARGET_STV && TARGET_SSE2 condition, i.e. always split on !TARGET_64BIT. (*di3_doubleword): Likewise. (*one_cmpldi2_doubleword): Likewise. (and3 expander): Update to use SWIM1248x from SWIM1248s. (3 expander): Likewise. (one_cmpl2 expander): Likewise. gcc/testsuite/ChangeLog PR testsuite/104732 * gcc.target/i386/pr104732.c: New test case. --- gcc/config/i386/i386.md | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 5e0a980..d15170e 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -1079,11 +1079,11 @@ (HI "TARGET_HIMODE_MATH") SI]) -;; Math-dependant integer modes with DImode (enabled for 32bit with STV). -(define_mode_iterator SWIM1248s +;; Math-dependant integer modes with DImode. +(define_mode_iterator SWIM1248x [(QI "TARGET_QIMODE_MATH") (HI "TARGET_HIMODE_MATH") - SI (DI "TARGET_64BIT || (TARGET_STV && TARGET_SSE2)")]) + SI DI]) ;; Math-dependant single word integer modes without QImode. (define_mode_iterator SWIM248 [(HI "TARGET_HIMODE_MATH") @@ -9693,9 +9693,9 @@ ;; it should be done with splitters. 
(define_expand "and3" - [(set (match_operand:SWIM1248s 0 "nonimmediate_operand") - (and:SWIM1248s (match_operand:SWIM1248s 1 "nonimmediate_operand") - (match_operand:SWIM1248s 2 "")))] + [(set (match_operand:SWIM1248x 0 "nonimmediate_operand") + (and:SWIM1248x (match_operand:SWIM1248x 1 "nonimmediate_operand") + (match_operand:SWIM1248x 2 "")))] "" { machine_mode mode = mode; @@ -9733,7 +9733,7 @@ (match_operand:DI 1 "nonimmediate_operand") (match_operand:DI 2 "x86_64_szext_general_operand"))) (clobber (reg:CC FLAGS_REG))] - "!TARGET_64BIT && TARGET_STV && TARGET_SSE2 + "!TARGET_64BIT && ix86_binary_operator_ok (AND, DImode, operands) && ix86_pre_reload_split ()" "#" @@ -10337,9 +10337,9 @@ ;; If this is considered useful, it should be done with splitters. (define_expand "3" - [(set (match_operand:SWIM1248s 0 "nonimmediate_operand") - (any_or:SWIM1248s (match_operand:SWIM1248s 1 "nonimmediate_operand") - (match_operand:SWIM1248s 2 "")))] + [(set (match_operand:SWIM1248x 0 "nonimmediate_operand") + (any_or:SWIM1248x (match_operand:SWIM1248x 1 "nonimmediate_operand") + (match_operand:SWIM1248x 2 "")))] "" "ix86_expand_binary_operator (, mode, operands); DONE;") @@ -10349,7 +10349,7 @@ (match_operand:DI 1 "nonimmediate_operand") (match_operand:DI 2 "x86_64_szext_general_operand"))) (clobber (reg:CC FLAGS_REG))] - "!TARGET_64BIT && TARGET_STV && TARGET_SSE2 + "!TARGET_64BIT && ix86_binary_operator_ok (, DImode, operands) && ix86_pre_reload_split ()" "#" @@ -11427,15 +11427,15 @@ ;; One complement instructions (define_expand "one_cmpl2" - [(set (match_operand:SWIM1248s 0 "nonimmediate_operand") - (not:SWIM1248s (match_operand:SWIM1248s 1 "nonimmediate_operand")))] + [(set (match_operand:SWIM1248x 0 "nonimmediate_operand") + (not:SWIM1248x (match_operand:SWIM1248x 1 "nonimmediate_operand")))] "" "ix86_expand_unary_operator (NOT, mode, operands); DONE;") (define_insn_and_split "*one_cmpldi2_doubleword" [(set (match_operand:DI 0 "nonimmediate_operand") (not:DI (match_operand:DI 1 "nonimmediate_operand")))] - "!TARGET_64BIT && TARGET_STV && TARGET_SSE2 + "!TARGET_64BIT && ix86_unary_operator_ok (NOT, DImode, operands) && ix86_pre_reload_split ()" "#" -- cgit v1.1