From 206222e0ce349c1205b8c07c367cdaa62e4f7382 Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Thu, 27 Jan 2022 13:37:04 +0100 Subject: internal_error - do not use leading capital letter gcc/ChangeLog: * config/rs6000/host-darwin.cc (segv_crash_handler): Do not use leading capital letter. (segv_handler): Likewise. * ipa-sra.cc (verify_splitting_accesses): Likewise. * varasm.cc (get_section): Likewise. gcc/d/ChangeLog: * decl.cc (d_finish_decl): Do not use leading capital letter. --- gcc/config/rs6000/host-darwin.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/host-darwin.cc b/gcc/config/rs6000/host-darwin.cc index 541f7e1..6072a6c 100644 --- a/gcc/config/rs6000/host-darwin.cc +++ b/gcc/config/rs6000/host-darwin.cc @@ -58,7 +58,7 @@ extern int sigaltstack(const struct sigaltstack *, struct sigaltstack *); static void segv_crash_handler (int sig ATTRIBUTE_UNUSED) { - internal_error ("Segmentation Fault (code)"); + internal_error ("segmentation fault (code)"); } static void @@ -128,7 +128,7 @@ segv_handler (int sig ATTRIBUTE_UNUSED, fprintf (stderr, "[address=%08lx pc=%08x]\n", uc->uc_mcontext->MC_FLD(es).MC_FLD(dar), uc->uc_mcontext->MC_FLD(ss).MC_FLD(srr0)); - internal_error ("Segmentation Fault"); + internal_error ("segmentation fault"); exit (FATAL_EXIT_CODE); } -- cgit v1.1 From 3a5fdf986dc6ebb6e244087b462132590ad0a184 Mon Sep 17 00:00:00 2001 From: Iain Sandoe Date: Fri, 28 Jan 2022 19:17:16 +0000 Subject: Darwin, PPC: Fix bootstrap after GLIBC version changes. A recent patch added tests for OPTION_GLIBC that is defined in linux.h and linux64.h. This broke bootstrap for powerpc Darwin. Fixed by adding a definition to 0 for OPTION_GLIBC. Signed-off-by: Iain Sandoe gcc/ChangeLog: * config/rs6000/darwin.h (OPTION_GLIBC): Define to 0. --- gcc/config/rs6000/darwin.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/darwin.h b/gcc/config/rs6000/darwin.h index b5cef42..210c606 100644 --- a/gcc/config/rs6000/darwin.h +++ b/gcc/config/rs6000/darwin.h @@ -34,6 +34,8 @@ #endif #endif +#define OPTION_GLIBC 0 + /* The object file format is Mach-O. */ #define TARGET_OBJECT_FORMAT OBJECT_MACHO -- cgit v1.1 From 06995c2958aaae7e1f60b7d8aa5f07ffda10880a Mon Sep 17 00:00:00 2001 From: Yoshinori Sato Date: Fri, 28 Jan 2022 17:16:47 -0500 Subject: sh-linux fix target cpu sh-linux not supported any SH1 and SH2a little-endian. gcc * config/sh/t-linux (MULTILIB_EXCEPTIONS): Add m1, mb/m1 and m2a. --- gcc/config/sh/t-linux | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/sh/t-linux b/gcc/config/sh/t-linux index d33c638..4866dac 100644 --- a/gcc/config/sh/t-linux +++ b/gcc/config/sh/t-linux @@ -1,2 +1,3 @@ -MULTILIB_DIRNAMES= -MULTILIB_MATCHES = +MULTILIB_DIRNAMES= +MULTILIB_MATCHES= +MULTILIB_EXCEPTIONS=m1 mb/m1 m2a -- cgit v1.1 From 23987912ddb4207de0714d81237f93f613557d1f Mon Sep 17 00:00:00 2001 From: Eric Botcazou Date: Mon, 31 Jan 2022 09:21:48 +0100 Subject: Use V8+ default in 32-bit mode on SPARC64/Linux This is what has been done for ages on SPARC/Solaris and makes it possible to use 64-bit atomic instructions even in 32-bit mode. gcc/ PR target/104189 * config/sparc/linux64.h (TARGET_DEFAULT): Add MASK_V8PLUS. 
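As a rough illustration of the effect (not part of the patch; the flags, the built-in and the code below are only an assumed example), having V8+ in the 32-bit default lets GCC expand 64-bit __atomic built-ins inline (e.g. as a casx loop) rather than falling back to library calls:

  /* Sketch: 64-bit atomic update in 32-bit code on sparc64-linux.
     Illustrative build: gcc -m32 -O2 counter.c -- with V8+ enabled by
     default, the update below can be expanded inline.  */
  #include <stdint.h>

  static uint64_t counter;

  uint64_t
  bump (void)
  {
    return __atomic_add_fetch (&counter, 1, __ATOMIC_SEQ_CST);
  }
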
--- gcc/config/sparc/linux64.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/sparc/linux64.h b/gcc/config/sparc/linux64.h index 46823b6..d08a2ef 100644 --- a/gcc/config/sparc/linux64.h +++ b/gcc/config/sparc/linux64.h @@ -35,8 +35,8 @@ along with GCC; see the file COPYING3. If not see #if defined(TARGET_64BIT_DEFAULT) && TARGET_CPU_DEFAULT >= TARGET_CPU_v9 #undef TARGET_DEFAULT #define TARGET_DEFAULT \ - (MASK_V9 + MASK_PTR64 + MASK_64BIT + MASK_STACK_BIAS + \ - MASK_APP_REGS + MASK_FPU + MASK_LONG_DOUBLE_128) + (MASK_V9 + MASK_64BIT + MASK_PTR64 + MASK_STACK_BIAS + \ + MASK_V8PLUS + MASK_APP_REGS + MASK_FPU + MASK_LONG_DOUBLE_128) #endif /* This must be v9a not just v9 because by default we enable -- cgit v1.1 From 2cbe5dd54f15e88e0b42567319aa9c8e7bad7946 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Mon, 31 Jan 2022 20:08:18 +0100 Subject: rs6000: Fix up build of non-glibc/aix/darwin powerpc* targets [PR104298] As reported by Martin, while David has added OPTION_GLIBC define to aix and Iain to darwin, all the other non-linux targets now fail because rs6000.md macro isn't defined. One possibility is to define this macro in option-defaults.h which on rs6000 targets is included last, then we don't need to define it in aix/darwin headers and for targets using linux.h or linux64.h it will DTRT too. The other option is the first 2 hunks + changing the 3 if (!OPTION_GLIBC) FAIL; cases in rs6000.md to e.g. #ifdef OPTION_GLIBC if (!OPTION_GLIBC) #endif FAIL; or to: #ifdef OPTION_GLIBC if (!OPTION_GLIBC) #else if (true) #endif FAIL; (the latter case if Richi wants to push the -Wunreachable-code changes for GCC 13). 2022-01-31 Jakub Jelinek PR target/104298 * config/rs6000/aix.h (OPTION_GLIBC): Remove. * config/rs6000/darwin.h (OPTION_GLIBC): Likewise. * config/rs6000/option-defaults.h (OPTION_GLIBC): Define to 0 if not already defined. --- gcc/config/rs6000/aix.h | 1 - gcc/config/rs6000/darwin.h | 2 -- gcc/config/rs6000/option-defaults.h | 6 ++++++ 3 files changed, 6 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/aix.h b/gcc/config/rs6000/aix.h index eb7a0c0..ad3238b 100644 --- a/gcc/config/rs6000/aix.h +++ b/gcc/config/rs6000/aix.h @@ -23,7 +23,6 @@ #define DEFAULT_ABI ABI_AIX #undef TARGET_AIX #define TARGET_AIX 1 -#define OPTION_GLIBC 0 /* Linux64.h wants to redefine TARGET_AIX based on -m64, but it can't be used in the #if conditional in options-default.h, so provide another macro. */ diff --git a/gcc/config/rs6000/darwin.h b/gcc/config/rs6000/darwin.h index 210c606..b5cef42 100644 --- a/gcc/config/rs6000/darwin.h +++ b/gcc/config/rs6000/darwin.h @@ -34,8 +34,6 @@ #endif #endif -#define OPTION_GLIBC 0 - /* The object file format is Mach-O. */ #define TARGET_OBJECT_FORMAT OBJECT_MACHO diff --git a/gcc/config/rs6000/option-defaults.h b/gcc/config/rs6000/option-defaults.h index f03694e..2123bfd 100644 --- a/gcc/config/rs6000/option-defaults.h +++ b/gcc/config/rs6000/option-defaults.h @@ -62,3 +62,9 @@ {"cpu_32", "%{" OPT_ARCH32 ":%{!mcpu=*:-mcpu=%(VALUE)}}" }, \ {"cpu_64", "%{" OPT_ARCH64 ":%{!mcpu=*:-mcpu=%(VALUE)}}" }, \ {"float", "%{!msoft-float:%{!mhard-float:-m%(VALUE)-float}}" } + +/* rs6000.md uses OPTION_GLIBC unconditionally, while it is defined only in + linux{,64}.h. Define fallback for other targets here. 
*/ +#ifndef OPTION_GLIBC +#define OPTION_GLIBC 0 +#endif -- cgit v1.1 From 7e83607907151d5fbb3d2a7bceb7dcc6125c6c15 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Mon, 31 Jan 2022 12:28:12 -0600 Subject: rs6000: Don't #ifdef "short" built-in names It was recently pointed out that we get anomalous behavior when using __attribute__((target)) to select a CPU. As an example, when building for -mcpu=power8 but using __attribute__((target("mcpu=power10")), it is legal to call __builtin_vec_mod, but not vec_mod, even though these are equivalent. This is because the equivalence is established with a #define that is guarded by #ifdef _ARCH_PWR10. This goofy behavior occurs with both the old builtins support and the new. One of the goals of the new builtins support was to make sure all appropriate interfaces are available using __attribute__((target)), so I failed in this respect. This patch corrects the problem by removing the ifdef. Note that in a few cases we use an ifdef in a way that can't be overridden by __attribute__((target)), and we need to keep those. For example, #ifdef __PPU__ is still appropriate. 2022-01-06 Bill Schmidt gcc/ * config/rs6000/rs6000-overload.def (VEC_ABSD): Remove #ifdef token. (VEC_BLENDV): Likewise. (VEC_BPERM): Likewise. (VEC_CFUGE): Likewise. (VEC_CIPHER_BE): Likewise. (VEC_CIPHERLAST_BE): Likewise. (VEC_CLRL): Likewise. (VEC_CLRR): Likewise. (VEC_CMPNEZ): Likewise. (VEC_CNTLZ): Likewise. (VEC_CNTLZM): Likewise. (VEC_CNTTZM): Likewise. (VEC_CNTLZ_LSBB): Likewise. (VEC_CNTM): Likewise. (VEC_CNTTZ): Likewise. (VEC_CNTTZ_LSBB): Likewise. (VEC_CONVERT_4F32_8F16): Likewise. (VEC_DIV): Likewise. (VEC_DIVE): Likewise. (VEC_EQV): Likewise. (VEC_EXPANDM): Likewise. (VEC_EXTRACT_FP_FROM_SHORTH): Likewise. (VEC_EXTRACT_FP_FROM_SHORTL): Likewise. (VEC_EXTRACTH): Likewise. (VEC_EXTRACTL): Likewise. (VEC_EXTRACTM): Likewise. (VEC_EXTRACT4B): Likewise. (VEC_EXTULX): Likewise. (VEC_EXTURX): Likewise. (VEC_FIRSTMATCHINDEX): Likewise. (VEC_FIRSTMACHOREOSINDEX): Likewise. (VEC_FIRSTMISMATCHINDEX): Likewise. (VEC_FIRSTMISMATCHOREOSINDEX): Likewise. (VEC_GB): Likewise. (VEC_GENBM): Likewise. (VEC_GENHM): Likewise. (VEC_GENWM): Likewise. (VEC_GENDM): Likewise. (VEC_GENQM): Likewise. (VEC_GENPCVM): Likewise. (VEC_GNB): Likewise. (VEC_INSERTH): Likewise. (VEC_INSERTL): Likewise. (VEC_INSERT4B): Likewise. (VEC_LXVL): Likewise. (VEC_MERGEE): Likewise. (VEC_MERGEO): Likewise. (VEC_MOD): Likewise. (VEC_MSUB): Likewise. (VEC_MULH): Likewise. (VEC_NAND): Likewise. (VEC_NCIPHER_BE): Likewise. (VEC_NCIPHERLAST_BE): Likewise. (VEC_NEARBYINT): Likewise. (VEC_NMADD): Likewise. (VEC_ORC): Likewise. (VEC_PDEP): Likewise. (VEC_PERMX): Likewise. (VEC_PEXT): Likewise. (VEC_POPCNT): Likewise. (VEC_PARITY_LSBB): Likewise. (VEC_REPLACE_ELT): Likewise. (VEC_REPLACE_UN): Likewise. (VEC_REVB): Likewise. (VEC_RINT): Likewise. (VEC_RLMI): Likewise. (VEC_RLNM): Likewise. (VEC_SBOX_BE): Likewise. (VEC_SIGNEXTI): Likewise. (VEC_SIGNEXTLL): Likewise. (VEC_SIGNEXTQ): Likewise. (VEC_SLDB): Likewise. (VEC_SLV): Likewise. (VEC_SPLATI): Likewise. (VEC_SPLATID): Likewise. (VEC_SPLATI_INS): Likewise. (VEC_SQRT): Likewise. (VEC_SRDB): Likewise. (VEC_SRV): Likewise. (VEC_STRIL): Likewise. (VEC_STRIL_P): Likewise. (VEC_STRIR): Likewise. (VEC_STRIR_P): Likewise. (VEC_STXVL): Likewise. (VEC_TERNARYLOGIC): Likewise. (VEC_TEST_LSBB_ALL_ONES): Likewise. (VEC_TEST_LSBB_ALL_ZEROS): Likewise. (VEC_VEE): Likewise. (VEC_VES): Likewise. (VEC_VIE): Likewise. (VEC_VPRTYB): Likewise. (VEC_VSCEEQ): Likewise. (VEC_VSCEGT): Likewise. 
(VEC_VSCELT): Likewise. (VEC_VSCEUO): Likewise. (VEC_VSEE): Likewise. (VEC_VSES): Likewise. (VEC_VSIE): Likewise. (VEC_VSTDC): Likewise. (VEC_VSTDCN): Likewise. (VEC_VTDC): Likewise. (VEC_XL): Likewise. (VEC_XL_BE): Likewise. (VEC_XL_LEN_R): Likewise. (VEC_XL_SEXT): Likewise. (VEC_XL_ZEXT): Likewise. (VEC_XST): Likewise. (VEC_XST_BE): Likewise. (VEC_XST_LEN_R): Likewise. (VEC_XST_TRUNC): Likewise. (VEC_XXPERMDI): Likewise. (VEC_XXSLDWI): Likewise. (VEC_TSTSFI_EQ_DD): Likewise. (VEC_TSTSFI_EQ_TD): Likewise. (VEC_TSTSFI_GT_DD): Likewise. (VEC_TSTSFI_GT_TD): Likewise. (VEC_TSTSFI_LT_DD): Likewise. (VEC_TSTSFI_LT_TD): Likewise. (VEC_TSTSFI_OV_DD): Likewise. (VEC_TSTSFI_OV_TD): Likewise. (VEC_VADDCUQ): Likewise. (VEC_VADDECUQ): Likewise. (VEC_VADDEUQM): Likewise. (VEC_VADDUDM): Likewise. (VEC_VADDUQM): Likewise. (VEC_VBPERMQ): Likewise. (VEC_VCLZB): Likewise. (VEC_VCLZD): Likewise. (VEC_VCLZH): Likewise. (VEC_VCLZW): Likewise. (VEC_VCTZB): Likewise. (VEC_VCTZD): Likewise. (VEC_VCTZH): Likewise. (VEC_VCTZW): Likewise. (VEC_VEEDP): Likewise. (VEC_VEESP): Likewise. (VEC_VESDP): Likewise. (VEC_VESSP): Likewise. (VEC_VIEDP): Likewise. (VEC_VIESP): Likewise. (VEC_VPKSDSS): Likewise. (VEC_VPKSDUS): Likewise. (VEC_VPKUDUM): Likewise. (VEC_VPKUDUS): Likewise. (VEC_VPOPCNT): Likewise. (VEC_VPOPCNTB): Likewise. (VEC_VPOPCNTD): Likewise. (VEC_VPOPCNTH): Likewise. (VEC_VPOPCNTW): Likewise. (VEC_VPRTYBD): Likewise. (VEC_VPRTYBQ): Likewise. (VEC_VPRTYBW): Likewise. (VEC_VRLD): Likewise. (VEC_VSLD): Likewise. (VEC_VSRAD): Likewise. (VEC_VSRD): Likewise. (VEC_VSTDCDP): Likewise. (VEC_VSTDCNDP): Likewise. (VEC_VSTDCNQP): Likewise. (VEC_VSTDCNSP): Likewise. (VEC_VSTDCQP): Likewise. (VEC_VSTDCSP): Likewise. (VEC_VSUBECUQ): Likewise. (VEC_VSUBEUQM): Likewise. (VEC_VSUBUDM): Likewise. (VEC_VSUBUQM): Likewise. (VEC_VTDCDP): Likewise. (VEC_VTDCSP): Likewise. (VEC_VUPKHSW): Likewise. (VEC_VUPKLSW): Likewise. --- gcc/config/rs6000/rs6000-overload.def | 344 +++++++++++++++++----------------- 1 file changed, 174 insertions(+), 170 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-overload.def b/gcc/config/rs6000/rs6000-overload.def index 7d030ab..cdc703e 100644 --- a/gcc/config/rs6000/rs6000-overload.def +++ b/gcc/config/rs6000/rs6000-overload.def @@ -34,6 +34,10 @@ ; in rs6000-vecdefines.h. If no #define is desired, the should ; be replaced with the token SKIP. ; +; The token should be used sparingly, because a #define can't be +; overridden by __attribute__((target)). It is appropriate for cases +; where a target override isn't a possibility, like __PPU__. +; ; Each function entry has two lines. The first line is a prototype line. ; See rs6000-builtin-new.def for a description of the prototype line. 
; A prototype line in this file differs in that it doesn't have an @@ -205,7 +209,7 @@ vd __builtin_vec_abs (vd); XVABSDP -[VEC_ABSD, vec_absd, __builtin_vec_vadu, _ARCH_PWR9] +[VEC_ABSD, vec_absd, __builtin_vec_vadu] vuc __builtin_vec_vadu (vuc, vuc); VADUB vus __builtin_vec_vadu (vus, vus); @@ -503,7 +507,7 @@ vui __builtin_vec_avg (vui, vui); VAVGUW -[VEC_BLENDV, vec_blendv, __builtin_vec_xxblend, _ARCH_PWR10] +[VEC_BLENDV, vec_blendv, __builtin_vec_xxblend] vsc __builtin_vec_xxblend (vsc, vsc, vuc); VXXBLEND_V16QI VXXBLEND_VSC vuc __builtin_vec_xxblend (vuc, vuc, vuc); @@ -525,7 +529,7 @@ vd __builtin_vec_xxblend (vd, vd, vull); VXXBLEND_V2DF -[VEC_BPERM, vec_bperm, __builtin_vec_vbperm_api, _ARCH_PWR8] +[VEC_BPERM, vec_bperm, __builtin_vec_vbperm_api] vull __builtin_vec_vbperm_api (vull, vuc); VBPERMD VBPERMD_VULL vull __builtin_vec_vbperm_api (vuq, vuc); @@ -541,25 +545,25 @@ vd __builtin_vec_ceil (vd); XVRDPIP -[VEC_CFUGE, vec_cfuge, __builtin_vec_cfuge, _ARCH_PWR10] +[VEC_CFUGE, vec_cfuge, __builtin_vec_cfuge] vull __builtin_vec_cfuge (vull, vull); VCFUGED -[VEC_CIPHER_BE, vec_cipher_be, __builtin_vec_vcipher_be, _ARCH_PWR8] +[VEC_CIPHER_BE, vec_cipher_be, __builtin_vec_vcipher_be] vuc __builtin_vec_vcipher_be (vuc, vuc); VCIPHER_BE -[VEC_CIPHERLAST_BE, vec_cipherlast_be, __builtin_vec_vcipherlast_be, _ARCH_PWR8] +[VEC_CIPHERLAST_BE, vec_cipherlast_be, __builtin_vec_vcipherlast_be] vuc __builtin_vec_vcipherlast_be (vuc, vuc); VCIPHERLAST_BE -[VEC_CLRL, vec_clrl, __builtin_vec_clrl, _ARCH_PWR10] +[VEC_CLRL, vec_clrl, __builtin_vec_clrl] vsc __builtin_vec_clrl (vsc, unsigned int); VCLRLB VCLRLB_S vuc __builtin_vec_clrl (vuc, unsigned int); VCLRLB VCLRLB_U -[VEC_CLRR, vec_clrr, __builtin_vec_clrr, _ARCH_PWR10] +[VEC_CLRR, vec_clrr, __builtin_vec_clrr] vsc __builtin_vec_clrr (vsc, unsigned int); VCLRRB VCLRRB_S vuc __builtin_vec_clrr (vuc, unsigned int); @@ -1026,7 +1030,7 @@ signed int __builtin_vec_vcmpne_p (signed int, vbll, vsll); VCMPNED_P VCMPNED_P_SB -[VEC_CMPNEZ, vec_cmpnez, __builtin_vec_vcmpnez, _ARCH_PWR9] +[VEC_CMPNEZ, vec_cmpnez, __builtin_vec_vcmpnez] vbc __builtin_vec_cmpnez (vsc, vsc); CMPNEZB CMPNEZB_S vbc __builtin_vec_cmpnez (vuc, vuc); @@ -1064,7 +1068,7 @@ signed int __builtin_byte_in_range (unsigned int, unsigned int); CMPRB2 -[VEC_CNTLZ, vec_cntlz, __builtin_vec_vclz, _ARCH_PWR8] +[VEC_CNTLZ, vec_cntlz, __builtin_vec_vclz] vsc __builtin_vec_vclz (vsc); VCLZB VCLZB_S vuc __builtin_vec_vclz (vuc); @@ -1082,15 +1086,15 @@ vull __builtin_vec_vclz (vull); VCLZD VCLZD_U -[VEC_CNTLZM, vec_cntlzm, __builtin_vec_vclzdm, _ARCH_PWR10] +[VEC_CNTLZM, vec_cntlzm, __builtin_vec_vclzdm] vull __builtin_vec_vclzdm (vull, vull); VCLZDM -[VEC_CNTTZM, vec_cnttzm, __builtin_vec_vctzdm, _ARCH_PWR10] +[VEC_CNTTZM, vec_cnttzm, __builtin_vec_vctzdm] vull __builtin_vec_vctzdm (vull, vull); VCTZDM -[VEC_CNTLZ_LSBB, vec_cntlz_lsbb, __builtin_vec_vclzlsbb, _ARCH_PWR9] +[VEC_CNTLZ_LSBB, vec_cntlz_lsbb, __builtin_vec_vclzlsbb] signed int __builtin_vec_vclzlsbb (vsc); VCLZLSBB_V16QI VCLZLSBB_VSC signed int __builtin_vec_vclzlsbb (vuc); @@ -1104,7 +1108,7 @@ signed int __builtin_vec_vclzlsbb (vui); VCLZLSBB_V4SI VCLZLSBB_VUI -[VEC_CNTM, vec_cntm, __builtin_vec_cntm, _ARCH_PWR10] +[VEC_CNTM, vec_cntm, __builtin_vec_cntm] unsigned long long __builtin_vec_cntm (vuc, const int); VCNTMBB unsigned long long __builtin_vec_cntm (vus, const int); @@ -1114,7 +1118,7 @@ unsigned long long __builtin_vec_cntm (vull, const int); VCNTMBD -[VEC_CNTTZ, vec_cnttz, __builtin_vec_vctz, _ARCH_PWR9] +[VEC_CNTTZ, 
vec_cnttz, __builtin_vec_vctz] vsc __builtin_vec_vctz (vsc); VCTZB VCTZB_S vuc __builtin_vec_vctz (vuc); @@ -1132,7 +1136,7 @@ vull __builtin_vec_vctz (vull); VCTZD VCTZD_U -[VEC_CNTTZ_LSBB, vec_cnttz_lsbb, __builtin_vec_vctzlsbb, _ARCH_PWR9] +[VEC_CNTTZ_LSBB, vec_cnttz_lsbb, __builtin_vec_vctzlsbb] signed int __builtin_vec_vctzlsbb (vsc); VCTZLSBB_V16QI VCTZLSBB_VSC signed int __builtin_vec_vctzlsbb (vuc); @@ -1150,7 +1154,7 @@ vus __builtin_vec_convert_4f32_8i16 (vf, vf); CONVERT_4F32_8I16 -[VEC_CONVERT_4F32_8F16, vec_pack_to_short_fp32, __builtin_vec_convert_4f32_8f16, _ARCH_PWR9] +[VEC_CONVERT_4F32_8F16, vec_pack_to_short_fp32, __builtin_vec_convert_4f32_8f16] vus __builtin_vec_convert_4f32_8f16 (vf, vf); CONVERT_4F32_8F16 @@ -1182,7 +1186,7 @@ vull __builtin_vec_ctu (vd, const int); XVCVDPUXDS_SCALE -[VEC_DIV, vec_div, __builtin_vec_div, __VSX__] +[VEC_DIV, vec_div, __builtin_vec_div] vsi __builtin_vec_div (vsi, vsi); VDIVSW vui __builtin_vec_div (vui, vui); @@ -1200,7 +1204,7 @@ vd __builtin_vec_div (vd, vd); XVDIVDP -[VEC_DIVE, vec_dive, __builtin_vec_dive, _ARCH_PWR10] +[VEC_DIVE, vec_dive, __builtin_vec_dive] vsi __builtin_vec_dive (vsi, vsi); VDIVESW vui __builtin_vec_dive (vui, vui); @@ -1436,7 +1440,7 @@ void __builtin_vec_dstt (vf *, const int, const int); DSTT DSTT_VF -[VEC_EQV, vec_eqv, __builtin_vec_eqv, _ARCH_PWR8] +[VEC_EQV, vec_eqv, __builtin_vec_eqv] vsc __builtin_vec_eqv (vsc, vsc); EQV_V16QI vuc __builtin_vec_eqv (vuc, vuc); @@ -1499,7 +1503,7 @@ vull __builtin_vec_eqv (vull, vbll); EQV_V2DI_UNS EQV_VULL_VBLL -[VEC_EXPANDM, vec_expandm, __builtin_vec_vexpandm, _ARCH_PWR10] +[VEC_EXPANDM, vec_expandm, __builtin_vec_vexpandm] vuc __builtin_vec_vexpandm (vuc); VEXPANDMB vus __builtin_vec_vexpandm (vus); @@ -1524,15 +1528,15 @@ vsi __builtin_vec_extract (vsi, signed int); VSPLTW EXTRACT_FAKERY -[VEC_EXTRACT_FP_FROM_SHORTH, vec_extract_fp32_from_shorth, __builtin_vec_vextract_fp_from_shorth, _ARCH_PWR9] +[VEC_EXTRACT_FP_FROM_SHORTH, vec_extract_fp32_from_shorth, __builtin_vec_vextract_fp_from_shorth] vf __builtin_vec_vextract_fp_from_shorth (vus); VEXTRACT_FP_FROM_SHORTH -[VEC_EXTRACT_FP_FROM_SHORTL, vec_extract_fp32_from_shortl, __builtin_vec_vextract_fp_from_shortl, _ARCH_PWR9] +[VEC_EXTRACT_FP_FROM_SHORTL, vec_extract_fp32_from_shortl, __builtin_vec_vextract_fp_from_shortl] vf __builtin_vec_vextract_fp_from_shortl (vus); VEXTRACT_FP_FROM_SHORTL -[VEC_EXTRACTH, vec_extracth, __builtin_vec_extracth, _ARCH_PWR10] +[VEC_EXTRACTH, vec_extracth, __builtin_vec_extracth] vull __builtin_vec_extracth (vuc, vuc, unsigned char); VEXTRACTBR vull __builtin_vec_extracth (vus, vus, unsigned char); @@ -1542,7 +1546,7 @@ vull __builtin_vec_extracth (vull, vull, unsigned char); VEXTRACTDR -[VEC_EXTRACTL, vec_extractl, __builtin_vec_extractl, _ARCH_PWR10] +[VEC_EXTRACTL, vec_extractl, __builtin_vec_extractl] vull __builtin_vec_extractl (vuc, vuc, unsigned char); VEXTRACTBL vull __builtin_vec_extractl (vus, vus, unsigned char); @@ -1552,7 +1556,7 @@ vull __builtin_vec_extractl (vull, vull, unsigned char); VEXTRACTDL -[VEC_EXTRACTM, vec_extractm, __builtin_vec_vextractm, _ARCH_PWR10] +[VEC_EXTRACTM, vec_extractm, __builtin_vec_vextractm] signed int __builtin_vec_vextractm (vuc); VEXTRACTMB signed int __builtin_vec_vextractm (vus); @@ -1564,11 +1568,11 @@ signed int __builtin_vec_vextractm (vuq); VEXTRACTMQ -[VEC_EXTRACT4B, vec_extract4b, __builtin_vec_extract4b, _ARCH_PWR9] +[VEC_EXTRACT4B, vec_extract4b, __builtin_vec_extract4b] vull __builtin_vec_extract4b (vuc, const int); EXTRACT4B 
-[VEC_EXTULX, vec_xlx, __builtin_vec_vextulx, _ARCH_PWR9] +[VEC_EXTULX, vec_xlx, __builtin_vec_vextulx] signed char __builtin_vec_vextulx (unsigned int, vsc); VEXTUBLX VEXTUBLX_S unsigned char __builtin_vec_vextulx (unsigned int, vuc); @@ -1584,7 +1588,7 @@ float __builtin_vec_vextulx (unsigned int, vf); VEXTUWLX VEXTUWLX_F -[VEC_EXTURX, vec_xrx, __builtin_vec_vexturx, _ARCH_PWR9] +[VEC_EXTURX, vec_xrx, __builtin_vec_vexturx] signed char __builtin_vec_vexturx (unsigned int, vsc); VEXTUBRX VEXTUBRX_S unsigned char __builtin_vec_vexturx (unsigned int, vuc); @@ -1600,7 +1604,7 @@ float __builtin_vec_vexturx (unsigned int, vf); VEXTUWRX VEXTUWRX_F -[VEC_FIRSTMATCHINDEX, vec_first_match_index, __builtin_vec_first_match_index, _ARCH_PWR9] +[VEC_FIRSTMATCHINDEX, vec_first_match_index, __builtin_vec_first_match_index] unsigned int __builtin_vec_first_match_index (vsc, vsc); VFIRSTMATCHINDEX_V16QI FIRSTMATCHINDEX_VSC unsigned int __builtin_vec_first_match_index (vuc, vuc); @@ -1614,7 +1618,7 @@ unsigned int __builtin_vec_first_match_index (vui, vui); VFIRSTMATCHINDEX_V4SI FIRSTMATCHINDEX_VUI -[VEC_FIRSTMATCHOREOSINDEX, vec_first_match_or_eos_index, __builtin_vec_first_match_or_eos_index, _ARCH_PWR9] +[VEC_FIRSTMATCHOREOSINDEX, vec_first_match_or_eos_index, __builtin_vec_first_match_or_eos_index] unsigned int __builtin_vec_first_match_or_eos_index (vsc, vsc); VFIRSTMATCHOREOSINDEX_V16QI FIRSTMATCHOREOSINDEX_VSC unsigned int __builtin_vec_first_match_or_eos_index (vuc, vuc); @@ -1628,7 +1632,7 @@ unsigned int __builtin_vec_first_match_or_eos_index (vui, vui); VFIRSTMATCHOREOSINDEX_V4SI FIRSTMATCHOREOSINDEX_VUI -[VEC_FIRSTMISMATCHINDEX, vec_first_mismatch_index, __builtin_vec_first_mismatch_index, _ARCH_PWR9] +[VEC_FIRSTMISMATCHINDEX, vec_first_mismatch_index, __builtin_vec_first_mismatch_index] unsigned int __builtin_vec_first_mismatch_index (vsc, vsc); VFIRSTMISMATCHINDEX_V16QI FIRSTMISMATCHINDEX_VSC unsigned int __builtin_vec_first_mismatch_index (vuc, vuc); @@ -1642,7 +1646,7 @@ unsigned int __builtin_vec_first_mismatch_index (vui, vui); VFIRSTMISMATCHINDEX_V4SI FIRSTMISMATCHINDEX_VUI -[VEC_FIRSTMISMATCHOREOSINDEX, vec_first_mismatch_or_eos_index, __builtin_vec_first_mismatch_or_eos_index, _ARCH_PWR9] +[VEC_FIRSTMISMATCHOREOSINDEX, vec_first_mismatch_or_eos_index, __builtin_vec_first_mismatch_or_eos_index] unsigned int __builtin_vec_first_mismatch_or_eos_index (vsc, vsc); VFIRSTMISMATCHOREOSINDEX_V16QI FIRSTMISMATCHOREOSINDEX_VSC unsigned int __builtin_vec_first_mismatch_or_eos_index (vuc, vuc); @@ -1692,33 +1696,33 @@ vd __builtin_vec_floor (vd); XVRDPIM -[VEC_GB, vec_gb, __builtin_vec_vgbbd, _ARCH_PWR8] +[VEC_GB, vec_gb, __builtin_vec_vgbbd] vsc __builtin_vec_vgbbd (vsc); VGBBD VGBBD_S vuc __builtin_vec_vgbbd (vuc); VGBBD VGBBD_U -[VEC_GENBM, vec_genbm, __builtin_vec_mtvsrbm, _ARCH_PWR10] +[VEC_GENBM, vec_genbm, __builtin_vec_mtvsrbm] vuc __builtin_vec_mtvsrbm (unsigned long long); MTVSRBM -[VEC_GENHM, vec_genhm, __builtin_vec_mtvsrhm, _ARCH_PWR10] +[VEC_GENHM, vec_genhm, __builtin_vec_mtvsrhm] vus __builtin_vec_mtvsrhm (unsigned long long); MTVSRHM -[VEC_GENWM, vec_genwm, __builtin_vec_mtvsrwm, _ARCH_PWR10] +[VEC_GENWM, vec_genwm, __builtin_vec_mtvsrwm] vui __builtin_vec_mtvsrwm (unsigned long long); MTVSRWM -[VEC_GENDM, vec_gendm, __builtin_vec_mtvsrdm, _ARCH_PWR10] +[VEC_GENDM, vec_gendm, __builtin_vec_mtvsrdm] vull __builtin_vec_mtvsrdm (unsigned long long); MTVSRDM -[VEC_GENQM, vec_genqm, __builtin_vec_mtvsrqm, _ARCH_PWR10] +[VEC_GENQM, vec_genqm, __builtin_vec_mtvsrqm] vuq 
__builtin_vec_mtvsrqm (unsigned long long); MTVSRQM -[VEC_GENPCVM, vec_genpcvm, __builtin_vec_xxgenpcvm, _ARCH_PWR10] +[VEC_GENPCVM, vec_genpcvm, __builtin_vec_xxgenpcvm] vuc __builtin_vec_xxgenpcvm (vuc, const int); XXGENPCVM_V16QI vus __builtin_vec_xxgenpcvm (vus, const int); @@ -1728,7 +1732,7 @@ vull __builtin_vec_xxgenpcvm (vull, const int); XXGENPCVM_V2DI -[VEC_GNB, vec_gnb, __builtin_vec_gnb, _ARCH_PWR10] +[VEC_GNB, vec_gnb, __builtin_vec_gnb] unsigned long long __builtin_vec_gnb (vuq, const int); VGNB @@ -1740,7 +1744,7 @@ vsi __builtin_vec_insert (vsi, vsi, signed int); XXPERMDI_4SI INSERT_FAKERY -[VEC_INSERTH, vec_inserth, __builtin_vec_inserth, _ARCH_PWR10] +[VEC_INSERTH, vec_inserth, __builtin_vec_inserth] vuc __builtin_vec_inserth (unsigned char, vuc, unsigned int); VINSERTGPRBR vuc __builtin_vec_inserth (vuc, vuc, unsigned int); @@ -1756,7 +1760,7 @@ vull __builtin_vec_inserth (unsigned long long, vull, unsigned int); VINSERTGPRDR -[VEC_INSERTL, vec_insertl, __builtin_vec_insertl, _ARCH_PWR10] +[VEC_INSERTL, vec_insertl, __builtin_vec_insertl] vuc __builtin_vec_insertl (unsigned char, vuc, unsigned int); VINSERTGPRBL vuc __builtin_vec_insertl (vuc, vuc, unsigned int); @@ -1772,7 +1776,7 @@ vull __builtin_vec_insertl (unsigned long long, vull, unsigned int); VINSERTGPRDL -[VEC_INSERT4B, vec_insert4b, __builtin_vec_insert4b, _ARCH_PWR9] +[VEC_INSERT4B, vec_insert4b, __builtin_vec_insert4b] vuc __builtin_vec_insert4b (vsi, vuc, const int); INSERT4B INSERT4B_S vuc __builtin_vec_insert4b (vui, vuc, const int); @@ -2128,7 +2132,7 @@ vuc __builtin_vec_lvsr (signed long, const double *); LVSR LVSR_D -[VEC_LXVL, vec_xl_len, __builtin_vec_lxvl, _ARCH_PPC64_PWR9] +[VEC_LXVL, vec_xl_len, __builtin_vec_lxvl] vsc __builtin_vec_lxvl (const signed char *, unsigned int); LXVL LXVL_VSC vuc __builtin_vec_lxvl (const unsigned char *, unsigned int); @@ -2227,7 +2231,7 @@ vull __builtin_vec_max (vbll, vull); VMAXUD VMAXUD_BU -[VEC_MERGEE, vec_mergee, __builtin_vec_vmrgew, _ARCH_PWR8] +[VEC_MERGEE, vec_mergee, __builtin_vec_vmrgew] vsi __builtin_vec_vmrgew (vsi, vsi); VMRGEW_V4SI VMRGEW_VSI vui __builtin_vec_vmrgew (vui, vui); @@ -2327,7 +2331,7 @@ vull __builtin_vec_mergel (vbll, vull); VEC_MERGEL_V2DI VEC_MERGEL_VBLL_VULL -[VEC_MERGEO, vec_mergeo, __builtin_vec_vmrgow, _ARCH_PWR8] +[VEC_MERGEO, vec_mergeo, __builtin_vec_vmrgow] vsi __builtin_vec_vmrgow (vsi, vsi); VMRGOW_V4SI VMRGOW_VSI vui __builtin_vec_vmrgow (vui, vui); @@ -2414,7 +2418,7 @@ vus __builtin_vec_mladd (vus, vus, vus); VMLADDUHM VMLADDUHM_VUS2 -[VEC_MOD, vec_mod, __builtin_vec_mod, _ARCH_PWR10] +[VEC_MOD, vec_mod, __builtin_vec_mod] vsi __builtin_vec_mod (vsi, vsi); VMODSW vui __builtin_vec_mod (vui, vui); @@ -2432,7 +2436,7 @@ vss __builtin_vec_mradds (vss, vss, vss); VMHRADDSHS -[VEC_MSUB, vec_msub, __builtin_vec_msub, __VSX__] +[VEC_MSUB, vec_msub, __builtin_vec_msub] vf __builtin_vec_msub (vf, vf, vf); XVMSUBSP vd __builtin_vec_msub (vd, vd, vd); @@ -2511,7 +2515,7 @@ vuq __builtin_vec_mule (vull, vull); VMULEUD -[VEC_MULH, vec_mulh, __builtin_vec_mulh, _ARCH_PWR10] +[VEC_MULH, vec_mulh, __builtin_vec_mulh] vsi __builtin_vec_mulh (vsi, vsi); VMULHSW vui __builtin_vec_mulh (vui, vui); @@ -2553,7 +2557,7 @@ vd __builtin_vec_nabs (vd); NABS_V2DF -[VEC_NAND, vec_nand, __builtin_vec_nand, _ARCH_PWR8] +[VEC_NAND, vec_nand, __builtin_vec_nand] vsc __builtin_vec_nand (vsc, vsc); NAND_V16QI vuc __builtin_vec_nand (vuc, vuc); @@ -2616,15 +2620,15 @@ vull __builtin_vec_nand (vull, vbll); NAND_V2DI_UNS NAND_VULL_VBLL -[VEC_NCIPHER_BE, 
vec_ncipher_be, __builtin_vec_vncipher_be, _ARCH_PWR8] +[VEC_NCIPHER_BE, vec_ncipher_be, __builtin_vec_vncipher_be] vuc __builtin_vec_vncipher_be (vuc, vuc); VNCIPHER_BE -[VEC_NCIPHERLAST_BE, vec_ncipherlast_be, __builtin_vec_vncipherlast_be, _ARCH_PWR8] +[VEC_NCIPHERLAST_BE, vec_ncipherlast_be, __builtin_vec_vncipherlast_be] vuc __builtin_vec_vncipherlast_be (vuc, vuc); VNCIPHERLAST_BE -[VEC_NEARBYINT, vec_nearbyint, __builtin_vec_nearbyint, __VSX__] +[VEC_NEARBYINT, vec_nearbyint, __builtin_vec_nearbyint] vf __builtin_vec_nearbyint (vf); XVRSPI XVRSPI_NBI vd __builtin_vec_nearbyint (vd); @@ -2644,7 +2648,7 @@ vd __builtin_vec_neg (vd); NEG_V2DF -[VEC_NMADD, vec_nmadd, __builtin_vec_nmadd, __VSX__] +[VEC_NMADD, vec_nmadd, __builtin_vec_nmadd] vf __builtin_vec_nmadd (vf, vf, vf); XVNMADDSP vd __builtin_vec_nmadd (vd, vd, vd); @@ -2778,7 +2782,7 @@ vd __builtin_vec_or (vbll, vd); VOR_V2DF VOR_VBLL_VD -[VEC_ORC, vec_orc, __builtin_vec_orc, _ARCH_PWR8] +[VEC_ORC, vec_orc, __builtin_vec_orc] vsc __builtin_vec_orc (vsc, vsc); ORC_V16QI vuc __builtin_vec_orc (vuc, vuc); @@ -2895,7 +2899,7 @@ vui __builtin_vec_packsu (vsll, vsll); VPKSDUS -[VEC_PDEP, vec_pdep, __builtin_vec_vpdepd, _ARCH_PWR10] +[VEC_PDEP, vec_pdep, __builtin_vec_vpdepd] vull __builtin_vec_vpdepd (vull, vull); VPDEPD @@ -2940,7 +2944,7 @@ vbc __builtin_vec_perm (vbc, vbc, vbc); VPERM_16QI VPERM_VBC_VBC_VBC -[VEC_PERMX, vec_permx, __builtin_vec_xxpermx, _ARCH_PWR10] +[VEC_PERMX, vec_permx, __builtin_vec_xxpermx] vsc __builtin_vec_xxpermx (vsc, vsc, vuc, const int); XXPERMX_UV2DI XXPERMX_VSC vuc __builtin_vec_xxpermx (vuc, vuc, vuc, const int); @@ -2970,7 +2974,7 @@ vbc __builtin_vec_vpermxor (vbc, vbc, vbc); VPERMXOR VPERMXOR_VBC -[VEC_PEXT, vec_pext, __builtin_vec_vpextd, _ARCH_PWR10] +[VEC_PEXT, vec_pext, __builtin_vec_vpextd] vull __builtin_vec_vpextd (vull, vull); VPEXTD @@ -2984,7 +2988,7 @@ vuq __builtin_vec_vpmsum (vull, vull); VPMSUMD VPMSUMD_V -[VEC_POPCNT, vec_popcnt, __builtin_vec_vpopcntu, _ARCH_PWR8] +[VEC_POPCNT, vec_popcnt, __builtin_vec_vpopcntu] vuc __builtin_vec_vpopcntu (vsc); VPOPCNTB vuc __builtin_vec_vpopcntu (vuc); @@ -3002,7 +3006,7 @@ vull __builtin_vec_vpopcntu (vull); VPOPCNTUD -[VEC_PARITY_LSBB, vec_parity_lsbb, __builtin_vec_vparity_lsbb, _ARCH_PWR9] +[VEC_PARITY_LSBB, vec_parity_lsbb, __builtin_vec_vparity_lsbb] vui __builtin_vec_vparity_lsbb (vsi); VPRTYBW VPRTYBW_S vui __builtin_vec_vparity_lsbb (vui); @@ -3036,7 +3040,7 @@ vd __builtin_vec_recipdiv (vd, vd); RECIP_V2DF -[VEC_REPLACE_ELT, vec_replace_elt, __builtin_vec_replace_elt, _ARCH_PWR10] +[VEC_REPLACE_ELT, vec_replace_elt, __builtin_vec_replace_elt] vui __builtin_vec_replace_elt (vui, unsigned int, const int); VREPLACE_ELT_UV4SI vsi __builtin_vec_replace_elt (vsi, signed int, const int); @@ -3050,7 +3054,7 @@ vd __builtin_vec_replace_elt (vd, double, const int); VREPLACE_ELT_V2DF -[VEC_REPLACE_UN, vec_replace_unaligned, __builtin_vec_replace_un, _ARCH_PWR10] +[VEC_REPLACE_UN, vec_replace_unaligned, __builtin_vec_replace_un] vui __builtin_vec_replace_un (vui, unsigned int, const int); VREPLACE_UN_UV4SI vsi __builtin_vec_replace_un (vsi, signed int, const int); @@ -3064,7 +3068,7 @@ vd __builtin_vec_replace_un (vd, double, const int); VREPLACE_UN_V2DF -[VEC_REVB, vec_revb, __builtin_vec_revb, _ARCH_PWR8] +[VEC_REVB, vec_revb, __builtin_vec_revb] vss __builtin_vec_revb (vss); REVB_V8HI REVB_VSS vus __builtin_vec_revb (vus); @@ -3129,7 +3133,7 @@ vd __builtin_vec_vreve (vd); VREVE_V2DF -[VEC_RINT, vec_rint, __builtin_vec_rint, __VSX__] 
+[VEC_RINT, vec_rint, __builtin_vec_rint] vf __builtin_vec_rint (vf); XVRSPIC vd __builtin_vec_rint (vd); @@ -3157,7 +3161,7 @@ vuq __builtin_vec_rl (vuq, vuq); VRLQ VRLQ_VUQ -[VEC_RLMI, vec_rlmi, __builtin_vec_rlmi, _ARCH_PWR9] +[VEC_RLMI, vec_rlmi, __builtin_vec_rlmi] vui __builtin_vec_rlmi (vui, vui, vui); VRLWMI vull __builtin_vec_rlmi (vull, vull, vull); @@ -3167,7 +3171,7 @@ vuq __builtin_vec_rlmi (vuq, vuq, vuq); VRLQMI VRLQMI_VUQ -[VEC_RLNM, vec_vrlnm, __builtin_vec_rlnm, _ARCH_PWR9] +[VEC_RLNM, vec_vrlnm, __builtin_vec_rlnm] vui __builtin_vec_rlnm (vui, vui); VRLWNM vull __builtin_vec_rlnm (vull, vull); @@ -3195,7 +3199,7 @@ vd __builtin_vec_rsqrte (vd); XVRSQRTEDP -[VEC_SBOX_BE, vec_sbox_be, __builtin_vec_sbox_be, _ARCH_PWR8] +[VEC_SBOX_BE, vec_sbox_be, __builtin_vec_sbox_be] vuc __builtin_vec_sbox_be (vuc); VSBOX_BE @@ -3294,13 +3298,13 @@ vsi __builtin_vec_vsignedo (vd); VEC_VSIGNEDO_V2DF -[VEC_SIGNEXTI, vec_signexti, __builtin_vec_signexti, _ARCH_PWR9] +[VEC_SIGNEXTI, vec_signexti, __builtin_vec_signexti] vsi __builtin_vec_signexti (vsc); VSIGNEXTSB2W vsi __builtin_vec_signexti (vss); VSIGNEXTSH2W -[VEC_SIGNEXTLL, vec_signextll, __builtin_vec_signextll, _ARCH_PWR9] +[VEC_SIGNEXTLL, vec_signextll, __builtin_vec_signextll] vsll __builtin_vec_signextll (vsc); VSIGNEXTSB2D vsll __builtin_vec_signextll (vss); @@ -3308,7 +3312,7 @@ vsll __builtin_vec_signextll (vsi); VSIGNEXTSW2D -[VEC_SIGNEXTQ, vec_signextq, __builtin_vec_signextq, _ARCH_PWR10] +[VEC_SIGNEXTQ, vec_signextq, __builtin_vec_signextq] vsq __builtin_vec_signextq (vsll); VSIGNEXTSD2Q @@ -3366,7 +3370,7 @@ vd __builtin_vec_sld (vd, vd, const int); VSLDOI_2DF -[VEC_SLDB, vec_sldb, __builtin_vec_sldb, _ARCH_PWR10] +[VEC_SLDB, vec_sldb, __builtin_vec_sldb] vsc __builtin_vec_sldb (vsc, vsc, const int); VSLDB_V16QI VSLDB_VSC vuc __builtin_vec_sldb (vuc, vuc, const int); @@ -3521,7 +3525,7 @@ vf __builtin_vec_slo (vf, vuc); VSLO VSLO_VFU -[VEC_SLV, vec_slv, __builtin_vec_vslv, _ARCH_PWR9] +[VEC_SLV, vec_slv, __builtin_vec_vslv] vuc __builtin_vec_vslv (vuc, vuc); VSLV @@ -3572,17 +3576,17 @@ ; There are no entries for vec_splat_u{8,16,32}. These are handled ; in altivec.h with a #define and a cast. 
-[VEC_SPLATI, vec_splati, __builtin_vec_xxspltiw, _ARCH_PWR10] +[VEC_SPLATI, vec_splati, __builtin_vec_xxspltiw] vsi __builtin_vec_xxspltiw (signed int); VXXSPLTIW_V4SI vf __builtin_vec_xxspltiw (float); VXXSPLTIW_V4SF -[VEC_SPLATID, vec_splatid, __builtin_vec_xxspltid, _ARCH_PWR10] +[VEC_SPLATID, vec_splatid, __builtin_vec_xxspltid] vd __builtin_vec_xxspltid (float); VXXSPLTIDP -[VEC_SPLATI_INS, vec_splati_ins, __builtin_vec_xxsplti32dx, _ARCH_PWR10] +[VEC_SPLATI_INS, vec_splati_ins, __builtin_vec_xxsplti32dx] vsi __builtin_vec_xxsplti32dx (vsi, const int, signed int); VXXSPLTI32DX_V4SI VXXSPLTI32DX_VSI vui __builtin_vec_xxsplti32dx (vui, const int, unsigned int); @@ -3598,7 +3602,7 @@ vsi __builtin_vec_splats (vsi); ABS_V4SI SPLATS_FAKERY -[VEC_SQRT, vec_sqrt, __builtin_vec_sqrt, __VSX__] +[VEC_SQRT, vec_sqrt, __builtin_vec_sqrt] vf __builtin_vec_sqrt (vf); XVSQRTSP vd __builtin_vec_sqrt (vd); @@ -3648,7 +3652,7 @@ vuq __builtin_vec_sra (vuq, vuq); VSRAQ VSRAQ_VUQ -[VEC_SRDB, vec_srdb, __builtin_vec_srdb, _ARCH_PWR10] +[VEC_SRDB, vec_srdb, __builtin_vec_srdb] vsc __builtin_vec_srdb (vsc, vsc, const int); VSRDB_V16QI VSRDB_VSC vuc __builtin_vec_srdb (vuc, vuc, const int); @@ -3775,7 +3779,7 @@ vf __builtin_vec_sro (vf, vuc); VSRO VSRO_VFU -[VEC_SRV, vec_srv, __builtin_vec_vsrv, _ARCH_PWR9] +[VEC_SRV, vec_srv, __builtin_vec_vsrv] vuc __builtin_vec_vsrv (vuc, vuc); VSRV @@ -3956,7 +3960,7 @@ void __builtin_vec_stl (vd, signed long long, double *); STVXL_V2DF STVXL_D -[VEC_STRIL, vec_stril, __builtin_vec_stril, _ARCH_PWR10] +[VEC_STRIL, vec_stril, __builtin_vec_stril] vuc __builtin_vec_stril (vuc); VSTRIBL VSTRIBL_U vsc __builtin_vec_stril (vsc); @@ -3966,7 +3970,7 @@ vss __builtin_vec_stril (vss); VSTRIHL VSTRIHL_S -[VEC_STRIL_P, vec_stril_p, __builtin_vec_stril_p, _ARCH_PWR10] +[VEC_STRIL_P, vec_stril_p, __builtin_vec_stril_p] signed int __builtin_vec_stril_p (vuc); VSTRIBL_P VSTRIBL_PU signed int __builtin_vec_stril_p (vsc); @@ -3976,7 +3980,7 @@ signed int __builtin_vec_stril_p (vss); VSTRIHL_P VSTRIHL_PS -[VEC_STRIR, vec_strir, __builtin_vec_strir, _ARCH_PWR10] +[VEC_STRIR, vec_strir, __builtin_vec_strir] vuc __builtin_vec_strir (vuc); VSTRIBR VSTRIBR_U vsc __builtin_vec_strir (vsc); @@ -3986,7 +3990,7 @@ vss __builtin_vec_strir (vss); VSTRIHR VSTRIHR_S -[VEC_STRIR_P, vec_strir_p, __builtin_vec_strir_p, _ARCH_PWR10] +[VEC_STRIR_P, vec_strir_p, __builtin_vec_strir_p] signed int __builtin_vec_strir_p (vuc); VSTRIBR_P VSTRIBR_PU signed int __builtin_vec_strir_p (vsc); @@ -4148,7 +4152,7 @@ void __builtin_vec_stvrxl (vf, signed long long, float *); STVRXL STVRXL_F -[VEC_STXVL, vec_xst_len, __builtin_vec_stxvl, _ARCH_PPC64_PWR9] +[VEC_STXVL, vec_xst_len, __builtin_vec_stxvl] void __builtin_vec_stxvl (vsc, signed char *, unsigned int); STXVL STXVL_VSC void __builtin_vec_stxvl (vuc, unsigned char *, unsigned int); @@ -4316,7 +4320,7 @@ vsi __builtin_vec_sums (vsi, vsi); VSUMSWS -[VEC_TERNARYLOGIC, vec_ternarylogic, __builtin_vec_xxeval, _ARCH_PWR10] +[VEC_TERNARYLOGIC, vec_ternarylogic, __builtin_vec_xxeval] vuc __builtin_vec_xxeval (vuc, vuc, vuc, const int); XXEVAL XXEVAL_VUC vus __builtin_vec_xxeval (vus, vus, vus, const int); @@ -4328,11 +4332,11 @@ vuq __builtin_vec_xxeval (vuq, vuq, vuq, const int); XXEVAL XXEVAL_VUQ -[VEC_TEST_LSBB_ALL_ONES, vec_test_lsbb_all_ones, __builtin_vec_xvtlsbb_all_ones, _ARCH_PWR9] +[VEC_TEST_LSBB_ALL_ONES, vec_test_lsbb_all_ones, __builtin_vec_xvtlsbb_all_ones] signed int __builtin_vec_xvtlsbb_all_ones (vuc); XVTLSBB_ONES -[VEC_TEST_LSBB_ALL_ZEROS, 
vec_test_lsbb_all_zeros, __builtin_vec_xvtlsbb_all_zeros, _ARCH_PWR9] +[VEC_TEST_LSBB_ALL_ZEROS, vec_test_lsbb_all_zeros, __builtin_vec_xvtlsbb_all_zeros] signed int __builtin_vec_xvtlsbb_all_zeros (vuc); XVTLSBB_ZEROS @@ -4420,19 +4424,19 @@ vui __builtin_vec_vunsignedo (vd); VEC_VUNSIGNEDO_V2DF -[VEC_VEE, vec_extract_exp, __builtin_vec_extract_exp, _ARCH_PWR9] +[VEC_VEE, vec_extract_exp, __builtin_vec_extract_exp] vui __builtin_vec_extract_exp (vf); VEESP vull __builtin_vec_extract_exp (vd); VEEDP -[VEC_VES, vec_extract_sig, __builtin_vec_extract_sig, _ARCH_PWR9] +[VEC_VES, vec_extract_sig, __builtin_vec_extract_sig] vui __builtin_vec_extract_sig (vf); VESSP vull __builtin_vec_extract_sig (vd); VESDP -[VEC_VIE, vec_insert_exp, __builtin_vec_insert_exp, _ARCH_PWR9] +[VEC_VIE, vec_insert_exp, __builtin_vec_insert_exp] vf __builtin_vec_insert_exp (vf, vui); VIESP VIESP_VF vf __builtin_vec_insert_exp (vui, vui); @@ -4444,7 +4448,7 @@ ; It is truly unfortunate that vec_vprtyb has an incompatible set of ; interfaces with vec_parity_lsbb. So we can't even deprecate this. -[VEC_VPRTYB, vec_vprtyb, __builtin_vec_vprtyb, _ARCH_PWR9] +[VEC_VPRTYB, vec_vprtyb, __builtin_vec_vprtyb] vsi __builtin_vec_vprtyb (vsi); VPRTYBW VPRTYB_VSI vui __builtin_vec_vprtyb (vui); @@ -4462,43 +4466,43 @@ unsigned __int128 __builtin_vec_vprtyb (unsigned __int128); VPRTYBQ VPRTYB_UQ -[VEC_VSCEEQ, scalar_cmp_exp_eq, __builtin_vec_scalar_cmp_exp_eq, _ARCH_PWR9] +[VEC_VSCEEQ, scalar_cmp_exp_eq, __builtin_vec_scalar_cmp_exp_eq] signed int __builtin_vec_scalar_cmp_exp_eq (double, double); VSCEDPEQ signed int __builtin_vec_scalar_cmp_exp_eq (_Float128, _Float128); VSCEQPEQ -[VEC_VSCEGT, scalar_cmp_exp_gt, __builtin_vec_scalar_cmp_exp_gt, _ARCH_PWR9] +[VEC_VSCEGT, scalar_cmp_exp_gt, __builtin_vec_scalar_cmp_exp_gt] signed int __builtin_vec_scalar_cmp_exp_gt (double, double); VSCEDPGT signed int __builtin_vec_scalar_cmp_exp_gt (_Float128, _Float128); VSCEQPGT -[VEC_VSCELT, scalar_cmp_exp_lt, __builtin_vec_scalar_cmp_exp_lt, _ARCH_PWR9] +[VEC_VSCELT, scalar_cmp_exp_lt, __builtin_vec_scalar_cmp_exp_lt] signed int __builtin_vec_scalar_cmp_exp_lt (double, double); VSCEDPLT signed int __builtin_vec_scalar_cmp_exp_lt (_Float128, _Float128); VSCEQPLT -[VEC_VSCEUO, scalar_cmp_exp_unordered, __builtin_vec_scalar_cmp_exp_unordered, _ARCH_PWR9] +[VEC_VSCEUO, scalar_cmp_exp_unordered, __builtin_vec_scalar_cmp_exp_unordered] signed int __builtin_vec_scalar_cmp_exp_unordered (double, double); VSCEDPUO signed int __builtin_vec_scalar_cmp_exp_unordered (_Float128, _Float128); VSCEQPUO -[VEC_VSEE, scalar_extract_exp, __builtin_vec_scalar_extract_exp, _ARCH_PWR9] +[VEC_VSEE, scalar_extract_exp, __builtin_vec_scalar_extract_exp] unsigned int __builtin_vec_scalar_extract_exp (double); VSEEDP unsigned int __builtin_vec_scalar_extract_exp (_Float128); VSEEQP -[VEC_VSES, scalar_extract_sig, __builtin_vec_scalar_extract_sig, _ARCH_PWR9] +[VEC_VSES, scalar_extract_sig, __builtin_vec_scalar_extract_sig] unsigned long long __builtin_vec_scalar_extract_sig (double); VSESDP unsigned __int128 __builtin_vec_scalar_extract_sig (_Float128); VSESQP -[VEC_VSIE, scalar_insert_exp, __builtin_vec_scalar_insert_exp, _ARCH_PWR9] +[VEC_VSIE, scalar_insert_exp, __builtin_vec_scalar_insert_exp] double __builtin_vec_scalar_insert_exp (unsigned long long, unsigned long long); VSIEDP double __builtin_vec_scalar_insert_exp (double, unsigned long long); @@ -4508,7 +4512,7 @@ _Float128 __builtin_vec_scalar_insert_exp (_Float128, unsigned long long); VSIEQPF -[VEC_VSTDC, 
scalar_test_data_class, __builtin_vec_scalar_test_data_class, _ARCH_PWR9] +[VEC_VSTDC, scalar_test_data_class, __builtin_vec_scalar_test_data_class] unsigned int __builtin_vec_scalar_test_data_class (float, const int); VSTDCSP unsigned int __builtin_vec_scalar_test_data_class (double, const int); @@ -4516,7 +4520,7 @@ unsigned int __builtin_vec_scalar_test_data_class (_Float128, const int); VSTDCQP -[VEC_VSTDCN, scalar_test_neg, __builtin_vec_scalar_test_neg, _ARCH_PWR9] +[VEC_VSTDCN, scalar_test_neg, __builtin_vec_scalar_test_neg] unsigned int __builtin_vec_scalar_test_neg (float); VSTDCNSP unsigned int __builtin_vec_scalar_test_neg (double); @@ -4524,13 +4528,13 @@ unsigned int __builtin_vec_scalar_test_neg (_Float128); VSTDCNQP -[VEC_VTDC, vec_test_data_class, __builtin_vec_test_data_class, _ARCH_PWR9] +[VEC_VTDC, vec_test_data_class, __builtin_vec_test_data_class] vbi __builtin_vec_test_data_class (vf, const int); VTDCSP vbll __builtin_vec_test_data_class (vd, const int); VTDCDP -[VEC_XL, vec_xl, __builtin_vec_vsx_ld, __VSX__] +[VEC_XL, vec_xl, __builtin_vec_vsx_ld] vsc __builtin_vec_vsx_ld (signed long long, const vsc *); LXVW4X_V16QI LXVW4X_VSC vsc __builtin_vec_vsx_ld (signed long long, const signed char *); @@ -4588,7 +4592,7 @@ vd __builtin_vec_vsx_ld (signed long long, const double *); LXVD2X_V2DF LXVD2X_D -[VEC_XL_BE, vec_xl_be, __builtin_vec_xl_be, __VSX__] +[VEC_XL_BE, vec_xl_be, __builtin_vec_xl_be] vsc __builtin_vec_xl_be (signed long long, const vsc *); LD_ELEMREV_V16QI LD_ELEMREV_VSC vsc __builtin_vec_xl_be (signed long long, const signed char *); @@ -4634,11 +4638,11 @@ vd __builtin_vec_xl_be (signed long long, const double *); LD_ELEMREV_V2DF LD_ELEMREV_DD -[VEC_XL_LEN_R, vec_xl_len_r, __builtin_vec_xl_len_r, _ARCH_PPC64_PWR9] +[VEC_XL_LEN_R, vec_xl_len_r, __builtin_vec_xl_len_r] vuc __builtin_vsx_xl_len_r (const unsigned char *, unsigned int); XL_LEN_R -[VEC_XL_SEXT, vec_xl_sext, __builtin_vec_xl_sext, _ARCH_PWR10] +[VEC_XL_SEXT, vec_xl_sext, __builtin_vec_xl_sext] vsq __builtin_vec_xl_sext (signed long long, const signed char *); SE_LXVRBX vsq __builtin_vec_xl_sext (signed long long, const signed short *); @@ -4648,7 +4652,7 @@ vsq __builtin_vec_xl_sext (signed long long, const signed long long *); SE_LXVRDX -[VEC_XL_ZEXT, vec_xl_zext, __builtin_vec_xl_zext, _ARCH_PWR10] +[VEC_XL_ZEXT, vec_xl_zext, __builtin_vec_xl_zext] vuq __builtin_vec_xl_zext (signed long long, const unsigned char *); ZE_LXVRBX vuq __builtin_vec_xl_zext (signed long long, const unsigned short *); @@ -4733,7 +4737,7 @@ vd __builtin_vec_xor (vbll, vd); VXOR_V2DF VXOR_VBLL_VD -[VEC_XST, vec_xst, __builtin_vec_vsx_st, __VSX__] +[VEC_XST, vec_xst, __builtin_vec_vsx_st] void __builtin_vec_vsx_st (vsc, signed long long, vsc *); STXVW4X_V16QI STXVW4X_VSC void __builtin_vec_vsx_st (vsc, signed long long, signed char *); @@ -4801,7 +4805,7 @@ void __builtin_vec_vsx_st (vd, signed long long, double *); STXVD2X_V2DF STXVD2X_D -[VEC_XST_BE, vec_xst_be, __builtin_vec_xst_be, __VSX__] +[VEC_XST_BE, vec_xst_be, __builtin_vec_xst_be] void __builtin_vec_xst_be (vsc, signed long long, vsc *); ST_ELEMREV_V16QI ST_ELEMREV_VSC void __builtin_vec_xst_be (vsc, signed long long, signed char *); @@ -4847,11 +4851,11 @@ void __builtin_vec_xst_be (vd, signed long long, double *); ST_ELEMREV_V2DF ST_ELEMREV_D -[VEC_XST_LEN_R, vec_xst_len_r, __builtin_vec_xst_len_r, _ARCH_PPC64_PWR9] +[VEC_XST_LEN_R, vec_xst_len_r, __builtin_vec_xst_len_r] void __builtin_vsx_xst_len_r (vuc, unsigned char *, unsigned int); XST_LEN_R 
-[VEC_XST_TRUNC, vec_xst_trunc, __builtin_vec_xst_trunc, _ARCH_PWR10] +[VEC_XST_TRUNC, vec_xst_trunc, __builtin_vec_xst_trunc] void __builtin_vec_xst_trunc (vsq, signed long long, signed char *); TR_STXVRBX TR_STXVRBX_S void __builtin_vec_xst_trunc (vuq, signed long long, unsigned char *); @@ -4869,7 +4873,7 @@ void __builtin_vec_xst_trunc (vuq, signed long long, unsigned long long *); TR_STXVRDX TR_STXVRDX_U -[VEC_XXPERMDI, vec_xxpermdi, __builtin_vsx_xxpermdi, __VSX__] +[VEC_XXPERMDI, vec_xxpermdi, __builtin_vsx_xxpermdi] vsc __builtin_vsx_xxpermdi (vsc, vsc, const int); XXPERMDI_16QI XXPERMDI_VSC vuc __builtin_vsx_xxpermdi (vuc, vuc, const int); @@ -4891,7 +4895,7 @@ vd __builtin_vsx_xxpermdi (vd, vd, const int); XXPERMDI_2DF XXPERMDI_VD -[VEC_XXSLDWI, vec_xxsldwi, __builtin_vsx_xxsldwi, __VSX__] +[VEC_XXSLDWI, vec_xxsldwi, __builtin_vsx_xxsldwi] vsc __builtin_vsx_xxsldwi (vsc, vsc, const int); XXSLDWI_16QI XXSLDWI_VSC2 vuc __builtin_vsx_xxsldwi (vuc, vuc, const int); @@ -4990,51 +4994,51 @@ void __builtin_vec_stvewx (vui, signed long, void *); STVEWX STVEWX_DEPR8 -[VEC_TSTSFI_EQ_DD, SKIP, __builtin_dfp_dtstsfi_eq_dd, _ARCH_PWR9] +[VEC_TSTSFI_EQ_DD, SKIP, __builtin_dfp_dtstsfi_eq_dd] signed int __builtin_dfp_dtstsfi_eq_dd (const int, _Decimal64); TSTSFI_EQ_DD TSTSFI_EQ_DD_DEPR1 -[VEC_TSTSFI_EQ_TD, SKIP, __builtin_dfp_dtstsfi_eq_td, _ARCH_PWR9] +[VEC_TSTSFI_EQ_TD, SKIP, __builtin_dfp_dtstsfi_eq_td] signed int __builtin_dfp_dtstsfi_eq_td (const int, _Decimal128); TSTSFI_EQ_TD TSTSFI_EQ_TD_DEPR1 -[VEC_TSTSFI_GT_DD, SKIP, __builtin_dfp_dtstsfi_gt_dd, _ARCH_PWR9] +[VEC_TSTSFI_GT_DD, SKIP, __builtin_dfp_dtstsfi_gt_dd] signed int __builtin_dfp_dtstsfi_gt_dd (const int, _Decimal64); TSTSFI_GT_DD TSTSFI_GT_DD_DEPR1 -[VEC_TSTSFI_GT_TD, SKIP, __builtin_dfp_dtstsfi_gt_td, _ARCH_PWR9] +[VEC_TSTSFI_GT_TD, SKIP, __builtin_dfp_dtstsfi_gt_td] signed int __builtin_dfp_dtstsfi_gt_td (const int, _Decimal128); TSTSFI_GT_TD TSTSFI_GT_TD_DEPR1 -[VEC_TSTSFI_LT_DD, SKIP, __builtin_dfp_dtstsfi_lt_dd, _ARCH_PWR9] +[VEC_TSTSFI_LT_DD, SKIP, __builtin_dfp_dtstsfi_lt_dd] signed int __builtin_dfp_dtstsfi_lt_dd (const int, _Decimal64); TSTSFI_LT_DD TSTSFI_LT_DD_DEPR1 -[VEC_TSTSFI_LT_TD, SKIP, __builtin_dfp_dtstsfi_lt_td, _ARCH_PWR9] +[VEC_TSTSFI_LT_TD, SKIP, __builtin_dfp_dtstsfi_lt_td] signed int __builtin_dfp_dtstsfi_lt_td (const int, _Decimal128); TSTSFI_LT_TD TSTSFI_LT_TD_DEPR1 -[VEC_TSTSFI_OV_DD, SKIP, __builtin_dfp_dtstsfi_ov_dd, _ARCH_PWR9] +[VEC_TSTSFI_OV_DD, SKIP, __builtin_dfp_dtstsfi_ov_dd] signed int __builtin_dfp_dtstsfi_ov_dd (const int, _Decimal64); TSTSFI_OV_DD TSTSFI_OV_DD_DEPR1 -[VEC_TSTSFI_OV_TD, SKIP, __builtin_dfp_dtstsfi_ov_td, _ARCH_PWR9] +[VEC_TSTSFI_OV_TD, SKIP, __builtin_dfp_dtstsfi_ov_td] signed int __builtin_dfp_dtstsfi_ov_td (const int, _Decimal128); TSTSFI_OV_TD TSTSFI_OV_TD_DEPR1 -[VEC_VADDCUQ, vec_vaddcuq, __builtin_vec_vaddcuq, _ARCH_PWR8] +[VEC_VADDCUQ, vec_vaddcuq, __builtin_vec_vaddcuq] vsq __builtin_vec_vaddcuq (vsq, vsq); VADDCUQ VADDCUQ_DEPR1 vuq __builtin_vec_vaddcuq (vuq, vuq); VADDCUQ VADDCUQ_DEPR2 -[VEC_VADDECUQ, vec_vaddecuq, __builtin_vec_vaddecuq, _ARCH_PWR8] +[VEC_VADDECUQ, vec_vaddecuq, __builtin_vec_vaddecuq] vsq __builtin_vec_vaddecuq (vsq, vsq, vsq); VADDECUQ VADDECUQ_DEPR1 vuq __builtin_vec_vaddecuq (vuq, vuq, vuq); VADDECUQ VADDECUQ_DEPR2 -[VEC_VADDEUQM, vec_vaddeuqm, __builtin_vec_vaddeuqm, _ARCH_PWR8] +[VEC_VADDEUQM, vec_vaddeuqm, __builtin_vec_vaddeuqm] vsq __builtin_vec_vaddeuqm (vsq, vsq, vsq); VADDEUQM VADDEUQM_DEPR1 vuq __builtin_vec_vaddeuqm (vuq, vuq, vuq); 
@@ -5098,7 +5102,7 @@ vuc __builtin_vec_vaddubs (vuc, vbc); VADDUBS VADDUBS_DEPR5 -[VEC_VADDUDM, vec_vaddudm, __builtin_vec_vaddudm, _ARCH_PWR8] +[VEC_VADDUDM, vec_vaddudm, __builtin_vec_vaddudm] vsll __builtin_vec_vaddudm (vbll, vsll); VADDUDM VADDUDM_DEPR1 vsll __builtin_vec_vaddudm (vsll, vbll); @@ -5142,7 +5146,7 @@ vus __builtin_vec_vadduhs (vus, vbs); VADDUHS VADDUHS_DEPR5 -[VEC_VADDUQM, vec_vadduqm, __builtin_vec_vadduqm, _ARCH_PWR8] +[VEC_VADDUQM, vec_vadduqm, __builtin_vec_vadduqm] vsq __builtin_vec_vadduqm (vsq, vsq); VADDUQM VADDUQM_DEPR1 vuq __builtin_vec_vadduqm (vuq, vuq); @@ -5214,7 +5218,7 @@ vui __builtin_vec_vavguw (vui, vui); VAVGUW VAVGUW_DEPR1 -[VEC_VBPERMQ, vec_vbpermq, __builtin_vec_vbpermq, _ARCH_PWR8] +[VEC_VBPERMQ, vec_vbpermq, __builtin_vec_vbpermq] vull __builtin_vec_vbpermq (vull, vuc); VBPERMQ VBPERMQ_DEPR1 vsll __builtin_vec_vbpermq (vsc, vsc); @@ -5232,25 +5236,25 @@ vf __builtin_vec_vcfux (vui, const int); VCFUX VCFUX_DEPR1 -[VEC_VCLZB, vec_vclzb, __builtin_vec_vclzb, _ARCH_PWR8] +[VEC_VCLZB, vec_vclzb, __builtin_vec_vclzb] vsc __builtin_vec_vclzb (vsc); VCLZB VCLZB_DEPR1 vuc __builtin_vec_vclzb (vuc); VCLZB VCLZB_DEPR2 -[VEC_VCLZD, vec_vclzd, __builtin_vec_vclzd, _ARCH_PWR8] +[VEC_VCLZD, vec_vclzd, __builtin_vec_vclzd] vsll __builtin_vec_vclzd (vsll); VCLZD VCLZD_DEPR1 vull __builtin_vec_vclzd (vull); VCLZD VCLZD_DEPR2 -[VEC_VCLZH, vec_vclzh, __builtin_vec_vclzh, _ARCH_PWR8] +[VEC_VCLZH, vec_vclzh, __builtin_vec_vclzh] vss __builtin_vec_vclzh (vss); VCLZH VCLZH_DEPR1 vus __builtin_vec_vclzh (vus); VCLZH VCLZH_DEPR2 -[VEC_VCLZW, vec_vclzw, __builtin_vec_vclzw, _ARCH_PWR8] +[VEC_VCLZW, vec_vclzw, __builtin_vec_vclzw] vsi __builtin_vec_vclzw (vsi); VCLZW VCLZW_DEPR1 vui __builtin_vec_vclzw (vui); @@ -5306,53 +5310,53 @@ vbi __builtin_vec_vcmpgtuw (vui, vui); VCMPGTUW VCMPGTUW_DEPR1 -[VEC_VCTZB, vec_vctzb, __builtin_vec_vctzb, _ARCH_PWR9] +[VEC_VCTZB, vec_vctzb, __builtin_vec_vctzb] vsc __builtin_vec_vctzb (vsc); VCTZB VCTZB_DEPR1 vuc __builtin_vec_vctzb (vuc); VCTZB VCTZB_DEPR2 -[VEC_VCTZD, vec_vctzd, __builtin_vec_vctzd, _ARCH_PWR9] +[VEC_VCTZD, vec_vctzd, __builtin_vec_vctzd] vsll __builtin_vec_vctzd (vsll); VCTZD VCTZD_DEPR1 vull __builtin_vec_vctzd (vull); VCTZD VCTZD_DEPR2 -[VEC_VCTZH, vec_vctzh, __builtin_vec_vctzh, _ARCH_PWR9] +[VEC_VCTZH, vec_vctzh, __builtin_vec_vctzh] vss __builtin_vec_vctzh (vss); VCTZH VCTZH_DEPR1 vus __builtin_vec_vctzh (vus); VCTZH VCTZH_DEPR2 -[VEC_VCTZW, vec_vctzw, __builtin_vec_vctzw, _ARCH_PWR9] +[VEC_VCTZW, vec_vctzw, __builtin_vec_vctzw] vsi __builtin_vec_vctzw (vsi); VCTZW VCTZW_DEPR1 vui __builtin_vec_vctzw (vui); VCTZW VCTZW_DEPR2 -[VEC_VEEDP, vec_extract_exp_dp, __builtin_vec_extract_exp_dp, _ARCH_PWR9] +[VEC_VEEDP, vec_extract_exp_dp, __builtin_vec_extract_exp_dp] vull __builtin_vec_extract_exp_dp (vd); VEEDP VEEDP_DEPR1 -[VEC_VEESP, vec_extract_exp_sp, __builtin_vec_extract_exp_sp, _ARCH_PWR9] +[VEC_VEESP, vec_extract_exp_sp, __builtin_vec_extract_exp_sp] vui __builtin_vec_extract_exp_sp (vf); VEESP VEESP_DEPR1 -[VEC_VESDP, vec_extract_sig_dp, __builtin_vec_extract_sig_dp, _ARCH_PWR9] +[VEC_VESDP, vec_extract_sig_dp, __builtin_vec_extract_sig_dp] vull __builtin_vec_extract_sig_dp (vd); VESDP VESDP_DEPR1 -[VEC_VESSP, vec_extract_sig_sp, __builtin_vec_extract_sig_sp, _ARCH_PWR9] +[VEC_VESSP, vec_extract_sig_sp, __builtin_vec_extract_sig_sp] vui __builtin_vec_extract_sig_sp (vf); VESSP VESSP_DEPR1 -[VEC_VIEDP, vec_insert_exp_dp, __builtin_vec_insert_exp_dp, _ARCH_PWR9] +[VEC_VIEDP, vec_insert_exp_dp, 
__builtin_vec_insert_exp_dp] vd __builtin_vec_insert_exp_dp (vd, vull); VIEDP VIEDP_DEPR1 vd __builtin_vec_insert_exp_dp (vull, vull); VIEDP VIEDP_DEPR2 -[VEC_VIESP, vec_insert_exp_sp, __builtin_vec_insert_exp_sp, _ARCH_PWR9] +[VEC_VIESP, vec_insert_exp_sp, __builtin_vec_insert_exp_sp] vf __builtin_vec_insert_exp_sp (vf, vui); VIESP VIESP_DEPR1 vf __builtin_vec_insert_exp_sp (vui, vui); @@ -5650,11 +5654,11 @@ vull __builtin_vec_vmulouw (vui, vui); VMULOUW VMULOUW_DEPR1 -[VEC_VPKSDSS, vec_vpksdss, __builtin_vec_vpksdss, _ARCH_PWR8] +[VEC_VPKSDSS, vec_vpksdss, __builtin_vec_vpksdss] vsi __builtin_vec_vpksdss (vsll, vsll); VPKSDSS VPKSDSS_DEPR1 -[VEC_VPKSDUS, vec_vpksdus, __builtin_vec_vpksdus, _ARCH_PWR8] +[VEC_VPKSDUS, vec_vpksdus, __builtin_vec_vpksdus] vui __builtin_vec_vpksdus (vsll, vsll); VPKSDUS VPKSDUS_DEPR1 @@ -5674,7 +5678,7 @@ vus __builtin_vec_vpkswus (vsi, vsi); VPKSWUS VPKSWUS_DEPR1 -[VEC_VPKUDUM, vec_vpkudum, __builtin_vec_vpkudum, _ARCH_PWR8] +[VEC_VPKUDUM, vec_vpkudum, __builtin_vec_vpkudum] vsi __builtin_vec_vpkudum (vsll, vsll); VPKUDUM VPKUDUM_DEPR1 vui __builtin_vec_vpkudum (vull, vull); @@ -5682,7 +5686,7 @@ vbi __builtin_vec_vpkudum (vbll, vbll); VPKUDUM VPKUDUM_DEPR3 -[VEC_VPKUDUS, vec_vpkudus, __builtin_vec_vpkudus, _ARCH_PWR8] +[VEC_VPKUDUS, vec_vpkudus, __builtin_vec_vpkudus] vui __builtin_vec_vpkudus (vull, vull); VPKUDUS VPKUDUS_DEPR1 @@ -5710,7 +5714,7 @@ vus __builtin_vec_vpkuwus (vui, vui); VPKUWUS VPKUWUS_DEPR1 -[VEC_VPOPCNT, vec_vpopcnt, __builtin_vec_vpopcnt, _ARCH_PWR8] +[VEC_VPOPCNT, vec_vpopcnt, __builtin_vec_vpopcnt] vsc __builtin_vec_vpopcnt (vsc); VPOPCNTB VPOPCNT_DEPR1 vuc __builtin_vec_vpopcnt (vuc); @@ -5728,37 +5732,37 @@ vull __builtin_vec_vpopcnt (vull); VPOPCNTD VPOPCNT_DEPR8 -[VEC_VPOPCNTB, vec_vpopcntb, __builtin_vec_vpopcntb, _ARCH_PWR8] +[VEC_VPOPCNTB, vec_vpopcntb, __builtin_vec_vpopcntb] vsc __builtin_vec_vpopcntb (vsc); VPOPCNTB VPOPCNTB_DEPR1 vuc __builtin_vec_vpopcntb (vuc); VPOPCNTB VPOPCNTB_DEPR2 -[VEC_VPOPCNTD, vec_vpopcntd, __builtin_vec_vpopcntd, _ARCH_PWR8] +[VEC_VPOPCNTD, vec_vpopcntd, __builtin_vec_vpopcntd] vsll __builtin_vec_vpopcntd (vsll); VPOPCNTD VPOPCNTD_DEPR1 vull __builtin_vec_vpopcntd (vull); VPOPCNTD VPOPCNTD_DEPR2 -[VEC_VPOPCNTH, vec_vpopcnth, __builtin_vec_vpopcnth, _ARCH_PWR8] +[VEC_VPOPCNTH, vec_vpopcnth, __builtin_vec_vpopcnth] vss __builtin_vec_vpopcnth (vss); VPOPCNTH VPOPCNTH_DEPR1 vus __builtin_vec_vpopcnth (vus); VPOPCNTH VPOPCNTH_DEPR2 -[VEC_VPOPCNTW, vec_vpopcntw, __builtin_vec_vpopcntw, _ARCH_PWR8] +[VEC_VPOPCNTW, vec_vpopcntw, __builtin_vec_vpopcntw] vsi __builtin_vec_vpopcntw (vsi); VPOPCNTW VPOPCNTW_DEPR1 vui __builtin_vec_vpopcntw (vui); VPOPCNTW VPOPCNTW_DEPR2 -[VEC_VPRTYBD, vec_vprtybd, __builtin_vec_vprtybd, _ARCH_PWR9] +[VEC_VPRTYBD, vec_vprtybd, __builtin_vec_vprtybd] vsll __builtin_vec_vprtybd (vsll); VPRTYBD VPRTYBD_DEPR1 vull __builtin_vec_vprtybd (vull); VPRTYBD VPRTYBD_DEPR2 -[VEC_VPRTYBQ, vec_vprtybq, __builtin_vec_vprtybq, _ARCH_PPC64_PWR9] +[VEC_VPRTYBQ, vec_vprtybq, __builtin_vec_vprtybq] vsq __builtin_vec_vprtybq (vsq); VPRTYBQ VPRTYBQ_DEPR1 vuq __builtin_vec_vprtybq (vuq); @@ -5768,7 +5772,7 @@ unsigned __int128 __builtin_vec_vprtybq (unsigned __int128); VPRTYBQ VPRTYBQ_DEPR4 -[VEC_VPRTYBW, vec_vprtybw, __builtin_vec_vprtybw, _ARCH_PWR9] +[VEC_VPRTYBW, vec_vprtybw, __builtin_vec_vprtybw] vsi __builtin_vec_vprtybw (vsi); VPRTYBW VPRTYBW_DEPR1 vui __builtin_vec_vprtybw (vui); @@ -5780,7 +5784,7 @@ vuc __builtin_vec_vrlb (vuc, vuc); VRLB VRLB_DEPR2 -[VEC_VRLD, SKIP, __builtin_vec_vrld, 
_ARCH_PWR8] +[VEC_VRLD, SKIP, __builtin_vec_vrld] vsll __builtin_vec_vrld (vsll, vull); VRLD VRLD_DEPR1 vull __builtin_vec_vrld (vull, vull); @@ -5804,7 +5808,7 @@ vuc __builtin_vec_vslb (vuc, vuc); VSLB VSLB_DEPR2 -[VEC_VSLD, SKIP, __builtin_vec_vsld, _ARCH_PWR8] +[VEC_VSLD, SKIP, __builtin_vec_vsld] vsll __builtin_vec_vsld (vsll, vull); VSLD VSLD_DEPR1 vull __builtin_vec_vsld (vull, vull); @@ -5856,7 +5860,7 @@ vuc __builtin_vec_vsrab (vuc, vuc); VSRAB VSRAB_DEPR2 -[VEC_VSRAD, SKIP, __builtin_vec_vsrad, _ARCH_PWR8] +[VEC_VSRAD, SKIP, __builtin_vec_vsrad] vsll __builtin_vec_vsrad (vsll, vull); VSRAD VSRAD_DEPR1 vull __builtin_vec_vsrad (vull, vull); @@ -5880,7 +5884,7 @@ vuc __builtin_vec_vsrb (vuc, vuc); VSRB VSRB_DEPR2 -[VEC_VSRD, SKIP, __builtin_vec_vsrd, _ARCH_PWR8] +[VEC_VSRD, SKIP, __builtin_vec_vsrd] vsll __builtin_vec_vsrd (vsll, vull); VSRD VSRD_DEPR1 vull __builtin_vec_vsrd (vull, vull); @@ -5898,27 +5902,27 @@ vui __builtin_vec_vsrw (vui, vui); VSRW VSRW_DEPR2 -[VEC_VSTDCDP, scalar_test_data_class_dp, __builtin_vec_scalar_test_data_class_dp, _ARCH_PWR9] +[VEC_VSTDCDP, scalar_test_data_class_dp, __builtin_vec_scalar_test_data_class_dp] unsigned int __builtin_vec_scalar_test_data_class_dp (double, const int); VSTDCDP VSTDCDP_DEPR1 -[VEC_VSTDCNDP, scalar_test_neg_dp, __builtin_vec_scalar_test_neg_dp, _ARCH_PWR9] +[VEC_VSTDCNDP, scalar_test_neg_dp, __builtin_vec_scalar_test_neg_dp] unsigned int __builtin_vec_scalar_test_neg_dp (double); VSTDCNDP VSTDCNDP_DEPR1 -[VEC_VSTDCNQP, scalar_test_neg_qp, __builtin_vec_scalar_test_neg_qp, _ARCH_PWR9] +[VEC_VSTDCNQP, scalar_test_neg_qp, __builtin_vec_scalar_test_neg_qp] unsigned int __builtin_vec_scalar_test_neg_qp (_Float128); VSTDCNQP VSTDCNQP_DEPR1 -[VEC_VSTDCNSP, scalar_test_neg_sp, __builtin_vec_scalar_test_neg_sp, _ARCH_PWR9] +[VEC_VSTDCNSP, scalar_test_neg_sp, __builtin_vec_scalar_test_neg_sp] unsigned int __builtin_vec_scalar_test_neg_sp (float); VSTDCNSP VSTDCNSP_DEPR1 -[VEC_VSTDCQP, scalar_test_data_class_qp, __builtin_vec_scalar_test_data_class_qp, _ARCH_PWR9] +[VEC_VSTDCQP, scalar_test_data_class_qp, __builtin_vec_scalar_test_data_class_qp] unsigned int __builtin_vec_scalar_test_data_class_qp (_Float128, const int); VSTDCQP VSTDCQP_DEPR1 -[VEC_VSTDCSP, scalar_test_data_class_sp, __builtin_vec_scalar_test_data_class_sp, _ARCH_PWR9] +[VEC_VSTDCSP, scalar_test_data_class_sp, __builtin_vec_scalar_test_data_class_sp] unsigned int __builtin_vec_scalar_test_data_class_sp (float, const int); VSTDCSP VSTDCSP_DEPR1 @@ -5928,13 +5932,13 @@ vuq __builtin_vec_vsubcuq (vuq, vuq); VSUBCUQ VSUBCUQ_DEPR2 -[VEC_VSUBECUQ, vec_vsubecuq, __builtin_vec_vsubecuq, ARCH_PWR8] +[VEC_VSUBECUQ, vec_vsubecuq, __builtin_vec_vsubecuq] vsq __builtin_vec_vsubecuq (vsq, vsq, vsq); VSUBECUQ VSUBECUQ_DEPR1 vuq __builtin_vec_vsubecuq (vuq, vuq, vuq); VSUBECUQ VSUBECUQ_DEPR2 -[VEC_VSUBEUQM, vec_vsubeuqm, __builtin_vec_vsubeuqm, _ARCH_PWR8] +[VEC_VSUBEUQM, vec_vsubeuqm, __builtin_vec_vsubeuqm] vsq __builtin_vec_vsubeuqm (vsq, vsq, vsq); VSUBEUQM VSUBEUQM_DEPR1 vuq __builtin_vec_vsubeuqm (vuq, vuq, vuq); @@ -6004,7 +6008,7 @@ vuc __builtin_vec_vsububs (vuc, vbc); VSUBUBS VSUBUBS_DEPR8 -[VEC_VSUBUDM, vec_vsubudm, __builtin_vec_vsubudm, _ARCH_PWR8] +[VEC_VSUBUDM, vec_vsubudm, __builtin_vec_vsubudm] vsll __builtin_vec_vsubudm (vbll, vsll); VSUBUDM VSUBUDM_DEPR1 vsll __builtin_vec_vsubudm (vsll, vbll); @@ -6048,7 +6052,7 @@ vus __builtin_vec_vsubuhs (vus, vbs); VSUBUHS VSUBUHS_DEPR5 -[VEC_VSUBUQM, vec_vsubuqm, __builtin_vec_vsubuqm, _ARCH_PWR8] +[VEC_VSUBUQM, vec_vsubuqm, 
__builtin_vec_vsubuqm] vsq __builtin_vec_vsubuqm (vsq, vsq); VSUBUQM VSUBUQM_DEPR1 vuq __builtin_vec_vsubuqm (vuq, vuq); @@ -6096,11 +6100,11 @@ vui __builtin_vec_vsum4ubs (vuc, vui); VSUM4UBS VSUM4UBS_DEPR1 -[VEC_VTDCDP, vec_test_data_class_dp, __builtin_vec_test_data_class_dp, _ARCH_PWR9] +[VEC_VTDCDP, vec_test_data_class_dp, __builtin_vec_test_data_class_dp] vbll __builtin_vec_test_data_class_dp (vd, const int); VTDCDP VTDCDP_DEPR1 -[VEC_VTDCSP, vec_test_data_class_sp, __builtin_vec_test_data_class_sp, _ARCH_PWR9] +[VEC_VTDCSP, vec_test_data_class_sp, __builtin_vec_test_data_class_sp] vbi __builtin_vec_test_data_class_sp (vf, const int); VTDCSP VTDCSP_DEPR1 @@ -6138,7 +6142,7 @@ vbi __builtin_vec_vupkhsh (vbs); VUPKHSH VUPKHSH_DEPR2 -[VEC_VUPKHSW, vec_vupkhsw, __builtin_vec_vupkhsw, _ARCH_PWR8] +[VEC_VUPKHSW, vec_vupkhsw, __builtin_vec_vupkhsw] vsll __builtin_vec_vupkhsw (vsi); VUPKHSW VUPKHSW_DEPR1 vbll __builtin_vec_vupkhsw (vbi); @@ -6162,7 +6166,7 @@ vbi __builtin_vec_vupklsh (vbs); VUPKLSH VUPKLSH_DEPR2 -[VEC_VUPKLSW, vec_vupklsw, __builtin_vec_vupklsw, _ARCH_PWR8] +[VEC_VUPKLSW, vec_vupklsw, __builtin_vec_vupklsw] vsll __builtin_vec_vupklsw (vsi); VUPKLSW VUPKLSW_DEPR1 vbll __builtin_vec_vupklsw (vbi); -- cgit v1.1 From ca902055d056773bd0ca80f68bca4b20ad0e183f Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Fri, 21 Jan 2022 10:57:43 +0100 Subject: [nvptx] Fix reduction lock When I run the libgomp test-case reduction-cplx-dbl.c on an nvptx accelerator (T400, driver version 470.86), I run into: ... FAIL: libgomp.oacc-c/../libgomp.oacc-c-c++-common/reduction-cplx-dbl.c \ -DACC_DEVICE_TYPE_nvidia=1 -DACC_MEM_SHARED=0 -foffload=nvptx-none -O0 \ execution test FAIL: libgomp.oacc-c/../libgomp.oacc-c-c++-common/reduction-cplx-dbl.c \ -DACC_DEVICE_TYPE_nvidia=1 -DACC_MEM_SHARED=0 -foffload=nvptx-none -O2 \ execution test ... The problem is in this code generated for a gang reduction: ... $L39: atom.global.cas.b32 %r59, [__reduction_lock], 0, 1; setp.ne.u32 %r116, %r59, 0; @%r116 bra $L39; ld.f64 %r60, [%r44]; ld.f64 %r61, [%r44+8]; ld.f64 %r64, [%r44]; ld.f64 %r65, [%r44+8]; add.f64 %r117, %r64, %r22; add.f64 %r118, %r65, %r41; st.f64 [%r44], %r117; st.f64 [%r44+8], %r118; atom.global.cas.b32 %r119, [__reduction_lock], 1, 0; ... which is taking and releasing a lock, but missing the appropriate barriers to protect the loads and store inside the lock. Fix this by adding membar.gl barriers. Likewise, add membar.cta barriers if we protect shared memory loads and stores (even though the worker-partitioning part of the test-case is not failing). Tested on x86_64 with nvptx accelerator. gcc/ChangeLog: 2022-01-27 Tom de Vries * config/nvptx/nvptx.cc (enum nvptx_builtins): Add NVPTX_BUILTIN_MEMBAR_GL and NVPTX_BUILTIN_MEMBAR_CTA. (VOID): New macro. (nvptx_init_builtins): Add MEMBAR_GL and MEMBAR_CTA. (nvptx_expand_builtin): Handle NVPTX_BUILTIN_MEMBAR_GL and NVPTX_BUILTIN_MEMBAR_CTA. (nvptx_lockfull_update): Add level parameter. Emit barriers. (nvptx_reduction_update, nvptx_goacc_reduction_fini): Update call to nvptx_lockfull_update. * config/nvptx/nvptx.md (define_c_enum "unspecv"): Add UNSPECV_MEMBAR_GL. (define_expand "nvptx_membar_gl"): New expand. (define_insn "*nvptx_membar_gl"): New insn. 
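As an illustration only (a minimal C sketch, not part of the patch): the two
new membars act like the fences below, which keep the protected load and
store of the reduction variable inside the lock-protected region.

...
/* Sketch with GCC's __atomic builtins; the names here are made up and the
   real sequence is emitted as ptx by nvptx_lockfull_update.  */
static int reduction_lock;

static void
locked_add (double *accum, double contrib)
{
  while (__atomic_exchange_n (&reduction_lock, 1, __ATOMIC_RELAXED))
    ;                                        /* spin until the lock is ours */
  __atomic_thread_fence (__ATOMIC_ACQUIRE);  /* like the new pre-barrier */
  *accum += contrib;                         /* the protected update */
  __atomic_thread_fence (__ATOMIC_RELEASE);  /* like the new post-barrier */
  __atomic_store_n (&reduction_lock, 0, __ATOMIC_RELAXED);
}
...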
--- gcc/config/nvptx/nvptx.cc | 37 ++++++++++++++++++++++++++++++++----- gcc/config/nvptx/nvptx.md | 17 +++++++++++++++++ 2 files changed, 49 insertions(+), 5 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index db6a405..ceea4d3 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -5622,6 +5622,8 @@ enum nvptx_builtins NVPTX_BUILTIN_VECTOR_ADDR, NVPTX_BUILTIN_CMP_SWAP, NVPTX_BUILTIN_CMP_SWAPLL, + NVPTX_BUILTIN_MEMBAR_GL, + NVPTX_BUILTIN_MEMBAR_CTA, NVPTX_BUILTIN_MAX }; @@ -5652,6 +5654,7 @@ nvptx_init_builtins (void) #define UINT unsigned_type_node #define LLUINT long_long_unsigned_type_node #define PTRVOID ptr_type_node +#define VOID void_type_node DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE)); DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE)); @@ -5661,6 +5664,8 @@ nvptx_init_builtins (void) (PTRVOID, ST, UINT, UINT, NULL_TREE)); DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE)); DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE)); + DEF (MEMBAR_GL, "membar_gl", (VOID, VOID, NULL_TREE)); + DEF (MEMBAR_CTA, "membar_cta", (VOID, VOID, NULL_TREE)); #undef DEF #undef ST @@ -5696,6 +5701,14 @@ nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget), case NVPTX_BUILTIN_CMP_SWAPLL: return nvptx_expand_cmp_swap (exp, target, mode, ignore); + case NVPTX_BUILTIN_MEMBAR_GL: + emit_insn (gen_nvptx_membar_gl ()); + return NULL_RTX; + + case NVPTX_BUILTIN_MEMBAR_CTA: + emit_insn (gen_nvptx_membar_cta ()); + return NULL_RTX; + default: gcc_unreachable (); } } @@ -6243,7 +6256,7 @@ nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi, static tree nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi, - tree ptr, tree var, tree_code op) + tree ptr, tree var, tree_code op, int level) { tree var_type = TREE_TYPE (var); tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true); @@ -6295,8 +6308,17 @@ nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi, lock_loop->any_estimate = true; add_loop (lock_loop, entry_bb->loop_father); - /* Build and insert the reduction calculation. */ + /* Build the pre-barrier. */ gimple_seq red_seq = NULL; + enum nvptx_builtins barrier_builtin + = (level == GOMP_DIM_GANG + ? NVPTX_BUILTIN_MEMBAR_GL + : NVPTX_BUILTIN_MEMBAR_CTA); + tree barrier_fn = nvptx_builtin_decl (barrier_builtin, true); + tree barrier_expr = build_call_expr_loc (loc, barrier_fn, 0); + gimplify_stmt (&barrier_expr, &red_seq); + + /* Build the reduction calculation. */ tree acc_in = make_ssa_name (var_type); tree ref_in = build_simple_mem_ref (ptr); TREE_THIS_VOLATILE (ref_in) = 1; @@ -6310,6 +6332,11 @@ nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi, TREE_THIS_VOLATILE (ref_out) = 1; gimplify_assign (ref_out, acc_out, &red_seq); + /* Build the post-barrier. */ + barrier_expr = build_call_expr_loc (loc, barrier_fn, 0); + gimplify_stmt (&barrier_expr, &red_seq); + + /* Insert the reduction calculation. */ gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT); /* Build & insert the unlock sequence. 
*/ @@ -6330,7 +6357,7 @@ nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi, static tree nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi, - tree ptr, tree var, tree_code op) + tree ptr, tree var, tree_code op, int level) { tree type = TREE_TYPE (var); tree size = TYPE_SIZE (type); @@ -6339,7 +6366,7 @@ nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi, || size == TYPE_SIZE (long_long_unsigned_type_node)) return nvptx_lockless_update (loc, gsi, ptr, var, op); else - return nvptx_lockfull_update (loc, gsi, ptr, var, op); + return nvptx_lockfull_update (loc, gsi, ptr, var, op, level); } /* NVPTX implementation of GOACC_REDUCTION_SETUP. */ @@ -6531,7 +6558,7 @@ nvptx_goacc_reduction_fini (gcall *call, offload_attrs *oa) gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT); seq = NULL; r = nvptx_reduction_update (gimple_location (call), &gsi, - accum, var, op); + accum, var, op, level); } } diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index 5cf190a..773ae8f 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -58,6 +58,7 @@ UNSPECV_BARSYNC UNSPECV_MEMBAR UNSPECV_MEMBAR_CTA + UNSPECV_MEMBAR_GL UNSPECV_DIM_POS UNSPECV_FORK @@ -1932,6 +1933,22 @@ "\\tmembar.cta;" [(set_attr "predicable" "false")]) +(define_expand "nvptx_membar_gl" + [(set (match_dup 0) + (unspec_volatile:BLK [(match_dup 0)] UNSPECV_MEMBAR_GL))] + "" +{ + operands[0] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode)); + MEM_VOLATILE_P (operands[0]) = 1; +}) + +(define_insn "*nvptx_membar_gl" + [(set (match_operand:BLK 0 "" "") + (unspec_volatile:BLK [(match_dup 0)] UNSPECV_MEMBAR_GL))] + "" + "\\tmembar.gl;" + [(set_attr "predicable" "false")]) + (define_insn "nvptx_nounroll" [(unspec_volatile [(const_int 0)] UNSPECV_NOUNROLL)] "" -- cgit v1.1 From e0451f93d9faa13495132f4e246e9bef30b51417 Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Fri, 21 Jan 2022 21:46:05 +0100 Subject: [nvptx] Add some support for .local atomics The ptx insn atom doesn't support local memory. In case of doing an atomic operation on local memory, we run into: ... operation not supported on global/shared address space ... This is the cuGetErrorString message for CUDA_ERROR_INVALID_ADDRESS_SPACE. The message is somewhat confusing given that actually the operation is not supported on local address space. Fix this by falling back on a non-atomic version when detecting a frame-related memory operand. This only solves some cases that are detected at compile-time. It does however fix the openacc private-atomic-* test-cases. Tested on x86_64 with nvptx accelerator. gcc/ChangeLog: 2022-01-27 Tom de Vries * config/nvptx/nvptx.md (define_insn "atomic_compare_and_swap_1") (define_insn "atomic_exchange") (define_insn "atomic_fetch_add") (define_insn "atomic_fetch_addsf") (define_insn "atomic_fetch_"): Output non-atomic version if memory operands is frame-relative. gcc/testsuite/ChangeLog: 2022-01-31 Tom de Vries * gcc.target/nvptx/stack-atomics-run.c: New test. libgomp/ChangeLog: 2022-01-27 Tom de Vries * testsuite/libgomp.oacc-c-c++-common/private-atomic-1.c: Remove PR83812 workaround. * testsuite/libgomp.oacc-fortran/private-atomic-1-vector.f90: Same. * testsuite/libgomp.oacc-fortran/private-atomic-1-worker.f90: Same. 
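An illustrative test shape (a sketch only, not the actual new
gcc.target/nvptx/stack-atomics-run.c test): the compile-time detectable case
is an atomic operation whose memory operand is a frame-related, i.e. .local,
object, for which the new non-atomic fallback is emitted instead of "atom".

...
int
main (void)
{
  int v = 40;                     /* stack object: a .local memory operand */
  __atomic_fetch_add (&v, 2, __ATOMIC_RELAXED);
  if (v != 42)
    __builtin_abort ();
  return 0;
}
...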
--- gcc/config/nvptx/nvptx.md | 82 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 80 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index 773ae8f..9cbbd95 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -1790,11 +1790,28 @@ (unspec_volatile:SDIM [(const_int 0)] UNSPECV_CAS))] "" { + struct address_info info; + decompose_mem_address (&info, operands[1]); + if (info.base != NULL && REG_P (*info.base) + && REGNO_PTR_FRAME_P (REGNO (*info.base))) + { + output_asm_insn ("{", NULL); + output_asm_insn ("\\t" ".reg.pred" "\\t" "%%eq_p;", NULL); + output_asm_insn ("\\t" ".reg%t0" "\\t" "%%val;", operands); + output_asm_insn ("\\t" "ld%A1%t0" "\\t" "%%val,%1;", operands); + output_asm_insn ("\\t" "setp.eq%t0" "\\t" "%%eq_p, %%val, %2;", + operands); + output_asm_insn ("@%%eq_p\\t" "st%A1%t0" "\\t" "%1,%3;", operands); + output_asm_insn ("\\t" "mov%t0" "\\t" "%0,%%val;", operands); + output_asm_insn ("}", NULL); + return ""; + } const char *t - = "%.\\tatom%A1.cas.b%T0\\t%0, %1, %2, %3;"; + = "\\tatom%A1.cas.b%T0\\t%0, %1, %2, %3;"; return nvptx_output_atomic_insn (t, operands, 1, 4); } - [(set_attr "atomic" "true")]) + [(set_attr "atomic" "true") + (set_attr "predicable" "false")]) (define_insn "atomic_exchange" [(set (match_operand:SDIM 0 "nvptx_register_operand" "=R") ;; output @@ -1806,6 +1823,19 @@ (match_operand:SDIM 2 "nvptx_nonmemory_operand" "Ri"))] ;; input "" { + struct address_info info; + decompose_mem_address (&info, operands[1]); + if (info.base != NULL && REG_P (*info.base) + && REGNO_PTR_FRAME_P (REGNO (*info.base))) + { + output_asm_insn ("{", NULL); + output_asm_insn ("\\t" ".reg%t0" "\\t" "%%val;", operands); + output_asm_insn ("%.\\t" "ld%A1%t0" "\\t" "%%val,%1;", operands); + output_asm_insn ("%.\\t" "st%A1%t0" "\\t" "%1,%2;", operands); + output_asm_insn ("%.\\t" "mov%t0" "\\t" "%0,%%val;", operands); + output_asm_insn ("}", NULL); + return ""; + } const char *t = "%.\tatom%A1.exch.b%T0\t%0, %1, %2;"; return nvptx_output_atomic_insn (t, operands, 1, 3); @@ -1823,6 +1853,22 @@ (match_dup 1))] "" { + struct address_info info; + decompose_mem_address (&info, operands[1]); + if (info.base != NULL && REG_P (*info.base) + && REGNO_PTR_FRAME_P (REGNO (*info.base))) + { + output_asm_insn ("{", NULL); + output_asm_insn ("\\t" ".reg%t0" "\\t" "%%val;", operands); + output_asm_insn ("\\t" ".reg%t0" "\\t" "%%update;", operands); + output_asm_insn ("%.\\t" "ld%A1%t0" "\\t" "%%val,%1;", operands); + output_asm_insn ("%.\\t" "add%t0" "\\t" "%%update,%%val,%2;", + operands); + output_asm_insn ("%.\\t" "st%A1%t0" "\\t" "%1,%%update;", operands); + output_asm_insn ("%.\\t" "mov%t0" "\\t" "%0,%%val;", operands); + output_asm_insn ("}", NULL); + return ""; + } const char *t = "%.\\tatom%A1.add%t0\\t%0, %1, %2;"; return nvptx_output_atomic_insn (t, operands, 1, 3); @@ -1840,6 +1886,22 @@ (match_dup 1))] "" { + struct address_info info; + decompose_mem_address (&info, operands[1]); + if (info.base != NULL && REG_P (*info.base) + && REGNO_PTR_FRAME_P (REGNO (*info.base))) + { + output_asm_insn ("{", NULL); + output_asm_insn ("\\t" ".reg%t0" "\\t" "%%val;", operands); + output_asm_insn ("\\t" ".reg%t0" "\\t" "%%update;", operands); + output_asm_insn ("%.\\t" "ld%A1%t0" "\\t" "%%val,%1;", operands); + output_asm_insn ("%.\\t" "add%t0" "\\t" "%%update,%%val,%2;", + operands); + output_asm_insn ("%.\\t" "st%A1%t0" "\\t" "%1,%%update;", operands); + output_asm_insn ("%.\\t" "mov%t0" "\\t" 
"%0,%%val;", operands); + output_asm_insn ("}", NULL); + return ""; + } const char *t = "%.\\tatom%A1.add%t0\\t%0, %1, %2;"; return nvptx_output_atomic_insn (t, operands, 1, 3); @@ -1860,6 +1922,22 @@ (match_dup 1))] "mode == SImode || TARGET_SM35" { + struct address_info info; + decompose_mem_address (&info, operands[1]); + if (info.base != NULL && REG_P (*info.base) + && REGNO_PTR_FRAME_P (REGNO (*info.base))) + { + output_asm_insn ("{", NULL); + output_asm_insn ("\\t" ".reg.b%T0" "\\t" "%%val;", operands); + output_asm_insn ("\\t" ".reg.b%T0" "\\t" "%%update;", operands); + output_asm_insn ("%.\\t" "ld%A1%t0" "\\t" "%%val,%1;", operands); + output_asm_insn ("%.\\t" ".b%T0" "\\t" "%%update,%%val,%2;", + operands); + output_asm_insn ("%.\\t" "st%A1%t0" "\\t" "%1,%%update;", operands); + output_asm_insn ("%.\\t" "mov%t0" "\\t" "%0,%%val;", operands); + output_asm_insn ("}", NULL); + return ""; + } const char *t = "%.\\tatom%A1.b%T0.\\t%0, %1, %2;"; return nvptx_output_atomic_insn (t, operands, 1, 3); -- cgit v1.1 From 456de10c549379b74d4858f00d4b8817035a73fc Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Sun, 23 Jan 2022 06:42:24 +0100 Subject: [nvptx] Handle nop in prevent_branch_around_nothing When running libgomp test-case reduction-7.c on an nvptx accelerator (T400, driver version 470.86) and GOMP_NVPTX_JIT=-O0, I run into: ... reduction-7.exe:reduction-7.c:312: v_p_2: \ Assertion `out[j * 32 + i] == (i + j) * 2' failed. FAIL: libgomp.oacc-c/../libgomp.oacc-c-c++-common/reduction-7.c \ -DACC_DEVICE_TYPE_nvidia=1 -DACC_MEM_SHARED=0 -foffload=nvptx-none \ -O0 execution test ... During investigation I found ptx code like this: ... @ %r163 bra $L262; $L262: ... There's a known problem with executing this type of code, and a workaround is in place to address this: prevent_branch_around_nothing. The workaround does not trigger though because it doesn't handle the nop insn. Fix this by handling the nop insn in prevent_branch_around_nothing. Tested libgomp on x86_64 with nvptx accelerator. gcc/ChangeLog: 2022-01-27 Tom de Vries PR target/100428 * config/nvptx/nvptx.cc (prevent_branch_around_nothing): Handle nop insn. --- gcc/config/nvptx/nvptx.cc | 1 + 1 file changed, 1 insertion(+) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index ceea4d3..262e8f9 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -5103,6 +5103,7 @@ prevent_branch_around_nothing (void) case CODE_FOR_nvptx_forked: case CODE_FOR_nvptx_joining: case CODE_FOR_nvptx_join: + case CODE_FOR_nop: continue; default: seen_label = NULL; -- cgit v1.1 From 57f971f99209cc950d7e706b7b52f4c9ef1d10b0 Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Wed, 26 Jan 2022 14:16:42 +0100 Subject: [nvptx] Update bar.sync for ptx isa 6.0 In ptx isa 6.0, a new barrier instruction was added, and bar.sync was redefined as barrier.sync.aligned. The aligned modifier indicates that all threads in a CTA will execute the same barrier instruction. The seems fine for a form "bar.sync 0". But a "bar.sync %rx,64" (as used for vector length > 32) may execute a diffferent barrier depending on the value of %rx, so we can't assume it's aligned. Fix this by using "barrier.sync %rx,64" instead. Tested on x86_64 with nvptx accelerator. gcc/ChangeLog: 2022-01-27 Tom de Vries * config/nvptx/nvptx-opts.h (enum ptx_version): Add PTX_VERSION_6_0. * config/nvptx/nvptx.h (TARGET_PTX_6_0): New macro. * config/nvptx/nvptx.md (define_insn "nvptx_barsync"): Use barrier insn for TARGET_PTX_6_0. 
--- gcc/config/nvptx/nvptx-opts.h | 1 + gcc/config/nvptx/nvptx.h | 1 + gcc/config/nvptx/nvptx.md | 8 ++++++-- 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx-opts.h b/gcc/config/nvptx/nvptx-opts.h index daae72f..c754a51 100644 --- a/gcc/config/nvptx/nvptx-opts.h +++ b/gcc/config/nvptx/nvptx-opts.h @@ -32,6 +32,7 @@ enum ptx_isa enum ptx_version { PTX_VERSION_3_1, + PTX_VERSION_6_0, PTX_VERSION_6_3, PTX_VERSION_7_0 }; diff --git a/gcc/config/nvptx/nvptx.h b/gcc/config/nvptx/nvptx.h index 9fda2f0..065d7aa 100644 --- a/gcc/config/nvptx/nvptx.h +++ b/gcc/config/nvptx/nvptx.h @@ -91,6 +91,7 @@ #define TARGET_SM75 (ptx_isa_option >= PTX_ISA_SM75) #define TARGET_SM80 (ptx_isa_option >= PTX_ISA_SM80) +#define TARGET_PTX_6_0 (ptx_version_option >= PTX_VERSION_6_0) #define TARGET_PTX_6_3 (ptx_version_option >= PTX_VERSION_6_3) #define TARGET_PTX_7_0 (ptx_version_option >= PTX_VERSION_7_0) diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index 9cbbd95..b391165 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -1968,9 +1968,13 @@ "" { if (INTVAL (operands[1]) == 0) - return "\\tbar.sync\\t%0;"; + return (TARGET_PTX_6_0 + ? "\\tbarrier.sync.aligned\\t%0;" + : "\\tbar.sync\\t%0;"); else - return "\\tbar.sync\\t%0, %1;"; + return (TARGET_PTX_6_0 + ? "\\tbarrier.sync\\t%0, %1;" + : "\\tbar.sync\\t%0, %1;"); } [(set_attr "predicable" "false")]) -- cgit v1.1 From 8ff0669f6d1d6126b7c010da02fa6532abb5e1ca Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Wed, 26 Jan 2022 14:17:40 +0100 Subject: [nvptx] Update default ptx isa to 6.3 With the following example, minimized from parallel-dims.c: ... int main (void) { int vectors_max = -1; #pragma acc parallel num_gangs (1) num_workers (1) copy (vectors_max) { for (int i = 0; i < 2; i++) for (int j = 0; j < 2; j++) #pragma acc loop vector reduction (max: vectors_max) for (int k = 0; k < 32; k++) vectors_max = k; } if (vectors_max != 31) __builtin_abort (); return 0; } ... I run into (T400, driver version 470.94): ... FAIL: libgomp.oacc-c/../libgomp.oacc-c-c++-common/parallel-dims.c \ -DACC_DEVICE_TYPE_nvidia=1 -DACC_MEM_SHARED=0 -foffload=nvptx-none -O2 \ execution test ... The FAIL does not happen with GOMP_NVPTX_JIT=-O0. The problem seems to be that the shfl insns for the vector reduction are not executed uniformly by the warp. Enforcing this by using shfl.sync fixes the problem. Fix this by setting the ptx isa to 6.3 by default, which allows the use of shfl.sync. Tested on x86_64 with nvptx accelerator. gcc/ChangeLog: 2022-01-27 Tom de Vries * config/nvptx/nvptx.opt (mptx): Set to PTX_VERSION_6_3 by default. --- gcc/config/nvptx/nvptx.opt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt index 6514dd3..6e12b1f 100644 --- a/gcc/config/nvptx/nvptx.opt +++ b/gcc/config/nvptx/nvptx.opt @@ -89,5 +89,5 @@ EnumValue Enum(ptx_version) String(7.0) Value(PTX_VERSION_7_0) mptx= -Target RejectNegative ToLower Joined Enum(ptx_version) Var(ptx_version_option) Init(PTX_VERSION_3_1) +Target RejectNegative ToLower Joined Enum(ptx_version) Var(ptx_version_option) Init(PTX_VERSION_6_3) Specify the version of the ptx version to use. -- cgit v1.1 From bba61d403d05202deb698b352a4faef3feb1f04d Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Thu, 27 Jan 2022 15:03:59 +0100 Subject: [nvptx] Add bar.warp.sync On a GT 1030 (sm_61), with driver version 470.94 I run into: ... 
FAIL: libgomp.oacc-c/../libgomp.oacc-c-c++-common/parallel-dims.c \ -DACC_DEVICE_TYPE_nvidia=1 -DACC_MEM_SHARED=0 -foffload=nvptx-none \ -O2 execution test ... which minimizes to the same test-case as listed in commit "[nvptx] Update default ptx isa to 6.3". The first divergent branch looks like: ... { .reg .u32 %x; mov.u32 %x,%tid.x; setp.ne.u32 %r59,%x,0; } @ %r59 bra $L15; mov.u64 %r48,%ar0; mov.u32 %r22,2; ld.u64 %r53,[%r48]; mov.u32 %r55,%r22; mov.u32 %r54,1; $L15: ... and when inspecting the generated SASS, the branch is not setup as a divergent branch, but instead as a regular branch. This causes us to execute a shfl.sync insn in divergent mode, which is likely to cause trouble given a remark in the ptx isa version 6.3, which mentions that for .target sm_6x or below, all threads must excute the same shfl.sync instruction in convergence. Fix this by placing a "bar.warp.sync 0xffffffff" at the desired convergence point (in the example above, after $L15). Tested on x86_64 with nvptx accelerator. gcc/ChangeLog: 2022-01-31 Tom de Vries * config/nvptx/nvptx.cc (nvptx_single): Use nvptx_warpsync. * config/nvptx/nvptx.md (define_c_enum "unspecv"): Add UNSPECV_WARPSYNC. (define_insn "nvptx_warpsync"): New define_insn. --- gcc/config/nvptx/nvptx.cc | 7 +++++++ gcc/config/nvptx/nvptx.md | 7 +++++++ 2 files changed, 14 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index 262e8f9..1b91990 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -4598,6 +4598,7 @@ nvptx_single (unsigned mask, basic_block from, basic_block to) rtx_insn *neuter_start = NULL; rtx_insn *worker_label = NULL, *vector_label = NULL; rtx_insn *worker_jump = NULL, *vector_jump = NULL; + rtx_insn *warp_sync = NULL; for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++) if (GOMP_DIM_MASK (mode) & skip_mask) { @@ -4630,11 +4631,15 @@ nvptx_single (unsigned mask, basic_block from, basic_block to) if (tail_branch) { label_insn = emit_label_before (label, before); + if (TARGET_PTX_6_0 && mode == GOMP_DIM_VECTOR) + warp_sync = emit_insn_after (gen_nvptx_warpsync (), label_insn); before = label_insn; } else { label_insn = emit_label_after (label, tail); + if (TARGET_PTX_6_0 && mode == GOMP_DIM_VECTOR) + warp_sync = emit_insn_after (gen_nvptx_warpsync (), label_insn); if ((mode == GOMP_DIM_VECTOR || mode == GOMP_DIM_WORKER) && CALL_P (tail) && find_reg_note (tail, REG_NORETURN, NULL)) emit_insn_after (gen_exit (), label_insn); @@ -4702,6 +4707,8 @@ nvptx_single (unsigned mask, basic_block from, basic_block to) setp.ne.u32 %rcond,%rcondu32,0; */ rtx_insn *label = PREV_INSN (tail); + if (label == warp_sync) + label = PREV_INSN (label); gcc_assert (label && LABEL_P (label)); rtx tmp = gen_reg_rtx (BImode); emit_insn_before (gen_movbi (tmp, const0_rtx), diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index b391165..b4c7cd6 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -56,6 +56,7 @@ UNSPECV_CAS UNSPECV_XCHG UNSPECV_BARSYNC + UNSPECV_WARPSYNC UNSPECV_MEMBAR UNSPECV_MEMBAR_CTA UNSPECV_MEMBAR_GL @@ -1978,6 +1979,12 @@ } [(set_attr "predicable" "false")]) +(define_insn "nvptx_warpsync" + [(unspec_volatile [(const_int 0)] UNSPECV_WARPSYNC)] + "TARGET_PTX_6_0" + "\\tbar.warp.sync\\t0xffffffff;" + [(set_attr "predicable" "false")]) + (define_expand "memory_barrier" [(set (match_dup 0) (unspec_volatile:BLK [(match_dup 0)] UNSPECV_MEMBAR))] -- cgit v1.1 From f32f74c2e8cef5fe37af6d4e8d7e8f6b4c8ae9a8 Mon Sep 17 
00:00:00 2001 From: Tom de Vries Date: Fri, 28 Jan 2022 10:28:59 +0100 Subject: [nvptx] Add uniform_warp_check insn On a GT 1030, with driver version 470.94 and -mptx=3.1 I run into: ... FAIL: libgomp.oacc-c/../libgomp.oacc-c-c++-common/parallel-dims.c \ -DACC_DEVICE_TYPE_nvidia=1 -DACC_MEM_SHARED=0 -foffload=nvptx-none \ -O2 execution test ... which minimizes to the same test-case as listed in commit "[nvptx] Update default ptx isa to 6.3". The problem is again that the first diverging branch is not handled as such in SASS, which causes problems with a subsequent shfl insn, but given that we have -mptx=3.1 we can't use the bar.warp.sync insn. Given that the default is now -mptx=6.3, and consequently -mptx=3.1 is of a lesser importance, implement the next best thing: abort when detecting non-convergence using this insn: ... { .reg.b32 act; vote.ballot.b32 act,1; .reg.pred uni; setp.eq.b32 uni,act,0xffffffff; @ !uni trap; @ !uni exit; } ... Interestingly, the effect of this is that rather than aborting, the test-case now passes. Tested on x86_64 with nvptx accelerator. gcc/ChangeLog: 2022-01-31 Tom de Vries * config/nvptx/nvptx.cc (nvptx_single): Use nvptx_uniform_warp_check. * config/nvptx/nvptx.md (define_c_enum "unspecv"): Add UNSPECV_UNIFORM_WARP_CHECK. (define_insn "nvptx_uniform_warp_check"): New define_insn. --- gcc/config/nvptx/nvptx.cc | 22 ++++++++++++++++++---- gcc/config/nvptx/nvptx.md | 18 ++++++++++++++++++ 2 files changed, 36 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index 1b91990..b3bb97c 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -4631,15 +4631,29 @@ nvptx_single (unsigned mask, basic_block from, basic_block to) if (tail_branch) { label_insn = emit_label_before (label, before); - if (TARGET_PTX_6_0 && mode == GOMP_DIM_VECTOR) - warp_sync = emit_insn_after (gen_nvptx_warpsync (), label_insn); + if (mode == GOMP_DIM_VECTOR) + { + if (TARGET_PTX_6_0) + warp_sync = emit_insn_after (gen_nvptx_warpsync (), + label_insn); + else + warp_sync = emit_insn_after (gen_nvptx_uniform_warp_check (), + label_insn); + } before = label_insn; } else { label_insn = emit_label_after (label, tail); - if (TARGET_PTX_6_0 && mode == GOMP_DIM_VECTOR) - warp_sync = emit_insn_after (gen_nvptx_warpsync (), label_insn); + if (mode == GOMP_DIM_VECTOR) + { + if (TARGET_PTX_6_0) + warp_sync = emit_insn_after (gen_nvptx_warpsync (), + label_insn); + else + warp_sync = emit_insn_after (gen_nvptx_uniform_warp_check (), + label_insn); + } if ((mode == GOMP_DIM_VECTOR || mode == GOMP_DIM_WORKER) && CALL_P (tail) && find_reg_note (tail, REG_NORETURN, NULL)) emit_insn_after (gen_exit (), label_insn); diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index b4c7cd6..92768dd 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -57,6 +57,7 @@ UNSPECV_XCHG UNSPECV_BARSYNC UNSPECV_WARPSYNC + UNSPECV_UNIFORM_WARP_CHECK UNSPECV_MEMBAR UNSPECV_MEMBAR_CTA UNSPECV_MEMBAR_GL @@ -1985,6 +1986,23 @@ "\\tbar.warp.sync\\t0xffffffff;" [(set_attr "predicable" "false")]) +(define_insn "nvptx_uniform_warp_check" + [(unspec_volatile [(const_int 0)] UNSPECV_UNIFORM_WARP_CHECK)] + "" + { + output_asm_insn ("{", NULL); + output_asm_insn ("\\t" ".reg.b32" "\\t" "act;", NULL); + output_asm_insn ("\\t" "vote.ballot.b32" "\\t" "act,1;", NULL); + output_asm_insn ("\\t" ".reg.pred" "\\t" "uni;", NULL); + output_asm_insn ("\\t" "setp.eq.b32" "\\t" "uni,act,0xffffffff;", + NULL); + output_asm_insn 
("@ !uni\\t" "trap;", NULL); + output_asm_insn ("@ !uni\\t" "exit;", NULL); + output_asm_insn ("}", NULL); + return ""; + } + [(set_attr "predicable" "false")]) + (define_expand "memory_barrier" [(set (match_dup 0) (unspec_volatile:BLK [(match_dup 0)] UNSPECV_MEMBAR))] -- cgit v1.1 From fa882c3e3bf642e0ef30772e4b54a2851497db96 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Tue, 1 Feb 2022 20:22:14 +0100 Subject: rs6000: Fix up PCH on powerpc* [PR104323] As mentioned in the PR and as can be seen on: --- gcc/testsuite/gcc.dg/pch/pr104323-1.c.jj 2022-02-01 13:06:00.163192414 +0100 +++ gcc/testsuite/gcc.dg/pch/pr104323-1.c 2022-02-01 13:13:41.226712735 +0100 @@ -0,0 +1,16 @@ +/* PR target/104323 */ +/* { dg-require-effective-target powerpc_altivec_ok } */ +/* { dg-options "-maltivec" } */ + +#include "pr104323-1.h" + +__vector int a1 = { 100, 200, 300, 400 }; +__vector int a2 = { 500, 600, 700, 800 }; +__vector int r; + +int +main () +{ + r = vec_add (a1, a2); + return 0; +} --- gcc/testsuite/gcc.dg/pch/pr104323-1.hs.jj 2022-02-01 13:06:03.180149978 +0100 +++ gcc/testsuite/gcc.dg/pch/pr104323-1.hs 2022-02-01 13:12:30.175706620 +0100 @@ -0,0 +1,5 @@ +/* PR target/104323 */ +/* { dg-require-effective-target powerpc_altivec_ok } */ +/* { dg-options "-maltivec" } */ + +#include testcase which I'm not including into testsuite because for some reason the test fails on non-powerpc* targets (is done even on those and fails because of missing altivec.h etc.), PCH is broken on powerpc*-*-* since the new builtin generator has been introduced. The generator contains or emits comments like: /* #### Cannot mark this as a GC root because only pointer types can be marked as GTY((user)) and be GC roots. All trees in here are kept alive by other globals, so not a big deal. Alternatively, we could change the enum fields to ints and cast them in and out to avoid requiring a GTY((user)) designation, but that seems unnecessarily gross. */ Having the fntypes stored in other GC roots can work fine for GC, ggc_collect will then always mark them and so they won't disappear from the tables, but it definitely doesn't work for PCH, which when the arrays with fntype members aren't GTY marked means on PCH write we create copies of those FUNCTION_TYPEs and store in *.gch that the GC roots should be updated, but don't store that rs6000_builtin_info[?].fntype etc. should be updated. When PCH is read again, the blob is read at some other address, GC roots are updated, rs6000_builtin_info[?].fntype contains garbage pointers (GC freed pointers with random data, or random unrelated types or other trees). The following patch fixes that. It stops any user markings because that is totally unnecessary, just skips fields we don't need to mark and adds GTY(()) to the 2 array variables. We can get rid of all those global vars for the fn types, they can be now automatic vars. 
With the patch we get { &rs6000_instance_info[0].fntype, 1 * (RS6000_INST_MAX), sizeof (rs6000_instance_info[0]), >_ggc_mx_tree_node, >_pch_nx_tree_node }, { &rs6000_builtin_info[0].fntype, 1 * (RS6000_BIF_MAX), sizeof (rs6000_builtin_info[0]), >_ggc_mx_tree_node, >_pch_nx_tree_node }, as the new roots which is exactly what we want and significantly more compact than countless { &uv2di_ftype_pudi_usi, 1, sizeof (uv2di_ftype_pudi_usi), >_ggc_mx_tree_node, >_pch_nx_tree_node }, { &uv2di_ftype_lg_puv2di, 1, sizeof (uv2di_ftype_lg_puv2di), >_ggc_mx_tree_node, >_pch_nx_tree_node }, { &uv2di_ftype_lg_pudi, 1, sizeof (uv2di_ftype_lg_pudi), >_ggc_mx_tree_node, >_pch_nx_tree_node }, { &uv2di_ftype_di_puv2di, 1, sizeof (uv2di_ftype_di_puv2di), >_ggc_mx_tree_node, >_pch_nx_tree_node }, cases (822 of these instead of just those 4 shown). 2022-02-01 Jakub Jelinek PR target/104323 * config/rs6000/t-rs6000 (EXTRA_GTYPE_DEPS): Append rs6000-builtins.h rather than $(srcdir)/config/rs6000/rs6000-builtins.def. * config/rs6000/rs6000-gen-builtins.cc (write_decls): Don't use GTY((user)) for struct bifdata and struct ovlddata. Instead add GTY((skip(""))) to members with pointer and enum types that don't need to be tracked. Add GTY(()) to rs6000_builtin_info and rs6000_instance_info declarations. Don't emit gt_ggc_mx and gt_pch_nx declarations. (write_extern_fntype, write_fntype): Remove. (write_fntype_init): Emit the fntype vars as automatic vars instead of file scope ones. (write_header_file): Don't iterate with write_extern_fntype. (write_init_file): Don't iterate with write_fntype. Don't emit gt_ggc_mx and gt_pch_nx definitions. --- gcc/config/rs6000/rs6000-gen-builtins.cc | 109 ++++++------------------------- gcc/config/rs6000/t-rs6000 | 2 +- 2 files changed, 22 insertions(+), 89 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-gen-builtins.cc b/gcc/config/rs6000/rs6000-gen-builtins.cc index 6a0858a..629ead9 100644 --- a/gcc/config/rs6000/rs6000-gen-builtins.cc +++ b/gcc/config/rs6000/rs6000-gen-builtins.cc @@ -2255,20 +2255,20 @@ write_decls (void) fprintf (header_file, "};\n\n"); fprintf (header_file, "#define PPC_MAXRESTROPNDS 3\n"); - fprintf (header_file, "struct GTY((user)) bifdata\n"); + fprintf (header_file, "struct GTY(()) bifdata\n"); fprintf (header_file, "{\n"); - fprintf (header_file, " const char *bifname;\n"); - fprintf (header_file, " bif_enable enable;\n"); + fprintf (header_file, " const char *GTY((skip(\"\"))) bifname;\n"); + fprintf (header_file, " bif_enable GTY((skip(\"\"))) enable;\n"); fprintf (header_file, " tree fntype;\n"); - fprintf (header_file, " insn_code icode;\n"); + fprintf (header_file, " insn_code GTY((skip(\"\"))) icode;\n"); fprintf (header_file, " int nargs;\n"); fprintf (header_file, " int bifattrs;\n"); fprintf (header_file, " int restr_opnd[PPC_MAXRESTROPNDS];\n"); - fprintf (header_file, " restriction restr[PPC_MAXRESTROPNDS];\n"); + fprintf (header_file, " restriction GTY((skip(\"\"))) restr[PPC_MAXRESTROPNDS];\n"); fprintf (header_file, " int restr_val1[PPC_MAXRESTROPNDS];\n"); fprintf (header_file, " int restr_val2[PPC_MAXRESTROPNDS];\n"); - fprintf (header_file, " const char *attr_string;\n"); - fprintf (header_file, " rs6000_gen_builtins assoc_bif;\n"); + fprintf (header_file, " const char *GTY((skip(\"\"))) attr_string;\n"); + fprintf (header_file, " rs6000_gen_builtins GTY((skip(\"\"))) assoc_bif;\n"); fprintf (header_file, "};\n\n"); fprintf (header_file, "#define bif_init_bit\t\t(0x00000001)\n"); @@ -2343,21 +2343,15 @@ 
write_decls (void) "#define bif_is_ibmld(x)\t((x).bifattrs & bif_ibmld_bit)\n"); fprintf (header_file, "\n"); - /* #### Cannot mark this as a GC root because only pointer types can - be marked as GTY((user)) and be GC roots. All trees in here are - kept alive by other globals, so not a big deal. Alternatively, - we could change the enum fields to ints and cast them in and out - to avoid requiring a GTY((user)) designation, but that seems - unnecessarily gross. */ fprintf (header_file, - "extern bifdata rs6000_builtin_info[RS6000_BIF_MAX];\n\n"); + "extern GTY(()) bifdata rs6000_builtin_info[RS6000_BIF_MAX];\n\n"); - fprintf (header_file, "struct GTY((user)) ovlddata\n"); + fprintf (header_file, "struct GTY(()) ovlddata\n"); fprintf (header_file, "{\n"); - fprintf (header_file, " const char *bifname;\n"); - fprintf (header_file, " rs6000_gen_builtins bifid;\n"); + fprintf (header_file, " const char *GTY((skip(\"\"))) bifname;\n"); + fprintf (header_file, " rs6000_gen_builtins GTY((skip(\"\"))) bifid;\n"); fprintf (header_file, " tree fntype;\n"); - fprintf (header_file, " ovlddata *next;\n"); + fprintf (header_file, " ovlddata *GTY((skip(\"\"))) next;\n"); fprintf (header_file, "};\n\n"); fprintf (header_file, "struct ovldrecord\n"); @@ -2367,14 +2361,7 @@ write_decls (void) fprintf (header_file, "};\n\n"); fprintf (header_file, - "/* #### Cannot mark this as a GC root because only pointer\n" - " types can be marked as GTY((user)) and be GC roots. All\n" - " trees in here are kept alive by other globals, so not a big\n" - " deal. Alternatively, we could change the enum fields to ints\n" - " and cast them in and out to avoid requiring a GTY((user))\n" - " designation, but that seems unnecessarily gross. */\n"); - fprintf (header_file, - "extern ovlddata rs6000_instance_info[RS6000_INST_MAX];\n"); + "extern GTY(()) ovlddata rs6000_instance_info[RS6000_INST_MAX];\n"); fprintf (header_file, "extern ovldrecord rs6000_overload_info[];\n\n"); fprintf (header_file, "extern void rs6000_init_generated_builtins ();\n\n"); @@ -2383,33 +2370,6 @@ write_decls (void) fprintf (header_file, "extern tree rs6000_builtin_decl (unsigned, " "bool ATTRIBUTE_UNUSED);\n\n"); - fprintf (header_file, - "extern void gt_ggc_mx (bifdata *bd);\n"); - fprintf (header_file, - "extern void gt_pch_nx (bifdata *bd);\n"); - fprintf (header_file, - "extern void gt_pch_nx (bifdata *bd, gt_pointer_operator op, " - "void *cookie);\n"); - fprintf (header_file, - "extern void gt_ggc_mx (ovlddata *od);\n"); - fprintf (header_file, - "extern void gt_pch_nx (ovlddata *od);\n"); - fprintf (header_file, - "extern void gt_pch_nx (ovlddata *od, gt_pointer_operator op, " - "void *cookie);\n"); -} - -/* Callback functions used for generating trees for function types. */ -void -write_extern_fntype (char *str) -{ - fprintf (header_file, "extern GTY(()) tree %s;\n", str); -} - -void -write_fntype (char *str) -{ - fprintf (init_file, "tree %s;\n", str); } /* Comparator for bsearch on the type map. */ @@ -2452,12 +2412,17 @@ write_fntype_init (char *str) /* Avoid side effects of strtok on the original string by using a copy. 
*/ char *buf = strdup (str); + if (tf_found || dfp_found) + fprintf (init_file, " tree %s = NULL_TREE;\n", buf); + else + fprintf (init_file, " tree "); + if (tf_found) - fprintf (init_file, " if (float128_type_node)\n "); + fprintf (init_file, " if (float128_type_node)\n "); else if (dfp_found) - fprintf (init_file, " if (dfloat64_type_node)\n "); + fprintf (init_file, " if (dfloat64_type_node)\n "); - fprintf (init_file, " %s\n = build_function_type_list (", buf); + fprintf (init_file, "%s\n = build_function_type_list (", buf); tok = strtok (buf, "_"); write_type_node (tok, tf_found || dfp_found); tok = strtok (0, "_"); @@ -2491,8 +2456,6 @@ write_header_file (void) write_decls (); - /* Write function type list declarators to the header file. */ - rbt_inorder_callback (&fntype_rbt, fntype_rbt.rbt_root, write_extern_fntype); fprintf (header_file, "\n"); fprintf (header_file, "\n#endif\n"); @@ -2846,9 +2809,6 @@ write_init_file (void) write_bif_static_init (); write_ovld_static_init (); - rbt_inorder_callback (&fntype_rbt, fntype_rbt.rbt_root, write_fntype); - fprintf (init_file, "\n"); - fprintf (init_file, "void\n"); fprintf (init_file, "rs6000_init_generated_builtins ()\n"); fprintf (init_file, "{\n"); @@ -2868,33 +2828,6 @@ write_init_file (void) fprintf (init_file, "}\n\n"); - fprintf (init_file, - "void gt_ggc_mx (bifdata *bd)\n"); - fprintf (init_file, - "{\n gt_ggc_mx (bd->fntype);\n}\n\n"); - fprintf (init_file, - "void gt_pch_nx (bifdata *bd)\n"); - fprintf (init_file, - "{\n gt_pch_nx (bd->fntype);\n}\n\n"); - fprintf (init_file, - "void gt_pch_nx (bifdata *bd, gt_pointer_operator op, " - "void *cookie)\n"); - fprintf (init_file, - "{\n op(&(bd->fntype), NULL, cookie);\n}\n\n"); - fprintf (init_file, - "void gt_ggc_mx (ovlddata *od)\n"); - fprintf (init_file, - "{\n gt_ggc_mx (od->fntype);\n}\n\n"); - fprintf (init_file, - "void gt_pch_nx (ovlddata *od)\n"); - fprintf (init_file, - "{\n gt_pch_nx (od->fntype);\n}\n\n"); - fprintf (init_file, - "void gt_pch_nx (ovlddata *od, gt_pointer_operator op, " - "void *cookie)\n"); - fprintf (init_file, - "{\n op(&(od->fntype), NULL, cookie);\n}\n"); - return 1; } diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000 index 90079ce..1a460d9 100644 --- a/gcc/config/rs6000/t-rs6000 +++ b/gcc/config/rs6000/t-rs6000 @@ -21,7 +21,7 @@ TM_H += $(srcdir)/config/rs6000/rs6000-cpus.def TM_H += $(srcdir)/config/rs6000/rs6000-modes.h PASSES_EXTRA += $(srcdir)/config/rs6000/rs6000-passes.def -EXTRA_GTYPE_DEPS += $(srcdir)/config/rs6000/rs6000-builtins.def +EXTRA_GTYPE_DEPS += rs6000-builtins.h rs6000-pcrel-opt.o: $(srcdir)/config/rs6000/rs6000-pcrel-opt.cc $(COMPILE) $< -- cgit v1.1 From 8753b13a31c777cdab0265dae0b68534247908f7 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 28 Jan 2022 13:34:24 +0100 Subject: IBM Z: fix `section type conflict` with -mindirect-branch-table s390_code_end () puts indirect branch tables into separate sections and tries to switch back to wherever it was in the beginning by calling switch_to_section (current_function_section ()). First of all, this is unnecessary - the other backends don't do it. Furthermore, at this time there is no current function, but if the last processed function was cold, in_cold_section_p remains set. This causes targetm.asm_out.function_section () to call targetm.section_type_flags (), which in absence of current function decl classifies the section as SECTION_WRITE. This causes a section type conflict with the existing SECTION_CODE. 
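A hedged illustration of the shape involved (the reproducer proper is the new
gcc.target/s390/nobp-section-type-conflict.c test; the attribute and call
pattern below are assumptions, not its contents): the last function emitted
ends up in the cold section while -mindirect-branch-table asks for branch
tables to be emitted at code end.

...
/* Illustrative only.  */
static void noop (void) {}
void (*indirect_target) (void) = noop;

__attribute__ ((cold)) void
last_and_cold (void)
{
  indirect_target ();   /* indirect call, gets a branch-table entry */
}
...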
gcc/ChangeLog: * config/s390/s390.cc (s390_code_end): Do not switch back to code section. gcc/testsuite/ChangeLog: * gcc.target/s390/nobp-section-type-conflict.c: New test. --- gcc/config/s390/s390.cc | 1 - 1 file changed, 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc index 43c5c72..2db12d4 100644 --- a/gcc/config/s390/s390.cc +++ b/gcc/config/s390/s390.cc @@ -16809,7 +16809,6 @@ s390_code_end (void) assemble_name_raw (asm_out_file, label_start); fputs ("-.\n", asm_out_file); } - switch_to_section (current_function_section ()); } } } -- cgit v1.1 From 4c4d0af4c94ccf0cfa74c8b13b8ec1029f57cd63 Mon Sep 17 00:00:00 2001 From: Hans-Peter Nilsson Date: Wed, 2 Feb 2022 00:00:09 +0100 Subject: cris: Don't default to -mmul-bug-workaround This flips the default for the errata handling for an old version (TL;DR: workaround: no multiply instruction last on a cache-line). Newer versions of the CRIS cpu don't have that bug. While the impact of the workaround is very marginal (coremark: less than .05% larger, less than .0005% slower) it's an irritating pseudorandom factor when assessing the impact of other changes. Also, fix a wart requiring changes to more than TARGET_DEFAULT to flip the default. People building old kernels or operating systems to run on ETRAX 100 LX are advised to pass "-mmul-bug-workaround". gcc: * config/cris/cris.h (TARGET_DEFAULT): Don't include MASK_MUL_BUG. (MUL_BUG_ASM_DEFAULT): New macro. (MAYBE_AS_NO_MUL_BUG_ABORT): Define in terms of MUL_BUG_ASM_DEFAULT. * doc/invoke.texi (CRIS Options, -mmul-bug-workaround): Adjust accordingly. --- gcc/config/cris/cris.h | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/cris/cris.h b/gcc/config/cris/cris.h index b274e11..9245d78 100644 --- a/gcc/config/cris/cris.h +++ b/gcc/config/cris/cris.h @@ -153,7 +153,9 @@ extern int cris_cpu_version; #ifdef HAVE_AS_NO_MUL_BUG_ABORT_OPTION #define MAYBE_AS_NO_MUL_BUG_ABORT \ - "%{mno-mul-bug-workaround:-no-mul-bug-abort} " + "%{mno-mul-bug-workaround:-no-mul-bug-abort} " \ + "%{mmul-bug-workaround:-mul-bug-abort} " \ + "%{!mmul-bug-workaround:%{!mno-mul-bug-workaround:" MUL_BUG_ASM_DEFAULT "}} " #else #define MAYBE_AS_NO_MUL_BUG_ABORT #endif @@ -255,15 +257,26 @@ extern int cris_cpu_version; (MASK_SIDE_EFFECT_PREFIXES + MASK_STACK_ALIGN \ + MASK_CONST_ALIGN + MASK_DATA_ALIGN \ + MASK_ALIGN_BY_32 \ - + MASK_PROLOGUE_EPILOGUE + MASK_MUL_BUG) + + MASK_PROLOGUE_EPILOGUE) # else /* 0 */ # define TARGET_DEFAULT \ (MASK_SIDE_EFFECT_PREFIXES + MASK_STACK_ALIGN \ + MASK_CONST_ALIGN + MASK_DATA_ALIGN \ - + MASK_PROLOGUE_EPILOGUE + MASK_MUL_BUG) + + MASK_PROLOGUE_EPILOGUE) # endif #endif +/* Don't depend on the assembler default setting for the errata machinery; + always pass the option to turn it on or off explicitly. But, we have to + decide on which is the *GCC* default, and for that we should only need to + consider what's in TARGET_DEFAULT; no other changes should be necessary. */ + +#if (TARGET_DEFAULT & MASK_MUL_BUG) +#define MUL_BUG_ASM_DEFAULT "-mul-bug-abort" +#else +#define MUL_BUG_ASM_DEFAULT "-no-mul-bug-abort" +#endif + /* Local, providing a default for cris_cpu_version. 
*/ #define CRIS_DEFAULT_CPU_VERSION TARGET_CPU_DEFAULT -- cgit v1.1 From a58401d2e6d31eb8f0e4ded84b3dde28c98ba4da Mon Sep 17 00:00:00 2001 From: Hans-Peter Nilsson Date: Wed, 2 Feb 2022 00:00:10 +0100 Subject: cris: For expanded movsi, don't match operands we know will be reloaded In a session investigating unexpected fallout from a change, I noticed reload needs one operand being a register to make an informed decision. It can happen that there's just a constant and a memory operand, as in: (insn 668 667 42 104 (parallel [ (set (mem:SI (plus:SI (reg/v/f:SI 347 [ fs ]) (const_int 168 [0xa8])) \ [1 fs_126(D)->regs.cfa_how+0 S4 A8]) (const_int 2 [0x2])) (clobber (reg:CC 19 dccr)) ]) "<...>/gcc/libgcc/unwind-dw2.c":1121:21 22 {*movsi_internal} (expr_list:REG_UNUSED (reg:CC 19 dccr) (nil))) This was helpfully created by combine. When this happens, reload can't check for costs and preferred register classes, (both operands will start with NO_REGS as the preferred class) and will default to the constraints order in the insn in reload. (Which also does its own temporary merge in find_reloads, but that's a different story.) Better don't match the simple cases. Beware that subregs have to be matched. I'm doing this just for word_mode (SI) for now, but may repeat this for the other valid modes as well. In particular, that goes for DImode as I see the expanded movdi does *almost* this, but uses register_operand instead of REG_S_P (from cris.h). Using REG_S_P is the right choice here because register_operand also matches (subreg (mem ...) ...) *until* reload is done. By itself it's just a sub-0.1% performance win (coremark). Also removing a stale comment. gcc: * config/cris/cris.md ("*movsi_internal"): Conditionalize on (sub-)register operands or operand 1 being 0. --- gcc/config/cris/cris.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/cris/cris.md b/gcc/config/cris/cris.md index bc8d758..9d1c179 100644 --- a/gcc/config/cris/cris.md +++ b/gcc/config/cris/cris.md @@ -583,9 +583,10 @@ (match_operand:SI 1 "general_operand" "r,Q>,M,M, I,r, M,n,g,r,x, rQ>,x,gi")) (clobber (reg:CC CRIS_CC0_REGNUM))] - ;; Note that we prefer not to use the S alternative (if for some reason - ;; it competes with others) above, but g matches S. - "" + ;; Avoid matching insns we know must be reloaded. Without one + ;; operand being a (pseudo-)register, reload chooses + ;; reload-registers suboptimally. + "REG_S_P (operands[0]) || REG_S_P (operands[1]) || operands[1] == const0_rtx" { /* Better to have c-switch here; it is worth it to optimize the size of move insns. The alternative would be to try to find more constraint -- cgit v1.1 From 27e35bc4910e291d8676c69b08fb88fa51ba528e Mon Sep 17 00:00:00 2001 From: Hans-Peter Nilsson Date: Wed, 2 Feb 2022 00:00:10 +0100 Subject: cris: Remove CRIS v32 ACR artefacts This is the change to which I alluded to this in r11-220 / d0780379c1b6 as "causes extra register moves in libgcc". It has unfortunate side-effects due to the change in register-class topology. There's a slight improvement in coremark numbers (< 0.07%) though also increase in code size total (< 0.7%) but looking at the individual changes in functions, it's all-over (-7..+7%). Looking specifically at functions that improved in speed, it's also both plus and minus in code sizes. It's unworkable to separate improvements from regressions for this case. I'll follow up with patches to restore the previous code quality, in both size and speed. 
gcc: * config/cris/constraints.md (define_register_constraint "b"): Now GENERAL_REGS. * config/cris/cris.md (CRIS_ACR_REGNUM): Remove. * config/cris/cris.h: (reg_class, REG_CLASS_NAMES) (REG_CLASS_CONTENTS): Remove ACR_REGS, SPEC_ACR_REGS, GENNONACR_REGS, and SPEC_GENNONACR_REGS. * config/cris/cris.cc (cris_preferred_reload_class): Don't mention ACR_REGS and return GENERAL_REGS instead of GENNONACR_REGS. --- gcc/config/cris/constraints.md | 7 ++++++- gcc/config/cris/cris.cc | 5 ++--- gcc/config/cris/cris.h | 27 +++++---------------------- gcc/config/cris/cris.md | 1 - 4 files changed, 13 insertions(+), 27 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/cris/constraints.md b/gcc/config/cris/constraints.md index 01ec12c..83fab62 100644 --- a/gcc/config/cris/constraints.md +++ b/gcc/config/cris/constraints.md @@ -18,7 +18,12 @@ ;; . ;; Register constraints. -(define_register_constraint "b" "GENNONACR_REGS" + +;; Kept for compatibility. It used to exclude the CRIS v32 +;; register "ACR", which was like GENERAL_REGS except it +;; couldn't be used for autoincrement, and intended mainly +;; for use in user asm statements. +(define_register_constraint "b" "GENERAL_REGS" "@internal") (define_register_constraint "h" "MOF_REGS" diff --git a/gcc/config/cris/cris.cc b/gcc/config/cris/cris.cc index a7807b3..264439c 100644 --- a/gcc/config/cris/cris.cc +++ b/gcc/config/cris/cris.cc @@ -1663,13 +1663,12 @@ cris_reload_address_legitimized (rtx x, static reg_class_t cris_preferred_reload_class (rtx x ATTRIBUTE_UNUSED, reg_class_t rclass) { - if (rclass != ACR_REGS - && rclass != MOF_REGS + if (rclass != MOF_REGS && rclass != MOF_SRP_REGS && rclass != SRP_REGS && rclass != CC0_REGS && rclass != SPECIAL_REGS) - return GENNONACR_REGS; + return GENERAL_REGS; return rclass; } diff --git a/gcc/config/cris/cris.h b/gcc/config/cris/cris.h index 9245d78..6edfe13 100644 --- a/gcc/config/cris/cris.h +++ b/gcc/config/cris/cris.h @@ -436,19 +436,15 @@ extern int cris_cpu_version; /* Node: Register Classes */ -/* We need a separate register class to handle register allocation for - ACR, since it can't be used for post-increment. - - It's not obvious, but having subunions of all movable-between +/* It's not obvious, but having subunions of all movable-between register classes does really help register allocation (pre-IRA comment). 
*/ enum reg_class { NO_REGS, - ACR_REGS, MOF_REGS, SRP_REGS, CC0_REGS, + MOF_REGS, SRP_REGS, CC0_REGS, MOF_SRP_REGS, SPECIAL_REGS, - SPEC_ACR_REGS, GENNONACR_REGS, - SPEC_GENNONACR_REGS, GENERAL_REGS, + GENERAL_REGS, ALL_REGS, LIM_REG_CLASSES }; @@ -457,9 +453,8 @@ enum reg_class #define REG_CLASS_NAMES \ {"NO_REGS", \ - "ACR_REGS", "MOF_REGS", "SRP_REGS", "CC0_REGS", \ + "MOF_REGS", "SRP_REGS", "CC0_REGS", \ "MOF_SRP_REGS", "SPECIAL_REGS", \ - "SPEC_ACR_REGS", "GENNONACR_REGS", "SPEC_GENNONACR_REGS", \ "GENERAL_REGS", "ALL_REGS"} #define CRIS_SPECIAL_REGS_CONTENTS \ @@ -472,37 +467,25 @@ enum reg_class #define REG_CLASS_CONTENTS \ { \ {0}, \ - {1 << CRIS_ACR_REGNUM}, \ {1 << CRIS_MOF_REGNUM}, \ {1 << CRIS_SRP_REGNUM}, \ {1 << CRIS_CC0_REGNUM}, \ {(1 << CRIS_MOF_REGNUM) \ | (1 << CRIS_SRP_REGNUM)}, \ {CRIS_SPECIAL_REGS_CONTENTS}, \ - {CRIS_SPECIAL_REGS_CONTENTS \ - | (1 << CRIS_ACR_REGNUM)}, \ - {(0xffff | CRIS_FAKED_REGS_CONTENTS) \ - & ~(1 << CRIS_ACR_REGNUM)}, \ - {(0xffff | CRIS_FAKED_REGS_CONTENTS \ - | CRIS_SPECIAL_REGS_CONTENTS) \ - & ~(1 << CRIS_ACR_REGNUM)}, \ {0xffff | CRIS_FAKED_REGS_CONTENTS}, \ {0xffff | CRIS_FAKED_REGS_CONTENTS \ | CRIS_SPECIAL_REGS_CONTENTS} \ } #define REGNO_REG_CLASS(REGNO) \ - ((REGNO) == CRIS_ACR_REGNUM ? ACR_REGS : \ - (REGNO) == CRIS_MOF_REGNUM ? MOF_REGS : \ + ((REGNO) == CRIS_MOF_REGNUM ? MOF_REGS : \ (REGNO) == CRIS_SRP_REGNUM ? SRP_REGS : \ (REGNO) == CRIS_CC0_REGNUM ? CC0_REGS : \ GENERAL_REGS) #define BASE_REG_CLASS GENERAL_REGS -#define MODE_CODE_BASE_REG_CLASS(MODE, AS, OCODE, ICODE) \ - ((OCODE) != POST_INC ? BASE_REG_CLASS : GENNONACR_REGS) - #define INDEX_REG_CLASS GENERAL_REGS /* Since it uses reg_renumber, it is safe only once reg_renumber diff --git a/gcc/config/cris/cris.md b/gcc/config/cris/cris.md index 9d1c179..9d9eb8b 100644 --- a/gcc/config/cris/cris.md +++ b/gcc/config/cris/cris.md @@ -60,7 +60,6 @@ [(CRIS_STATIC_CHAIN_REGNUM 7) (CRIS_REAL_FP_REGNUM 8) (CRIS_SP_REGNUM 14) - (CRIS_ACR_REGNUM 15) (CRIS_SRP_REGNUM 16) (CRIS_MOF_REGNUM 17) (CRIS_AP_REGNUM 18) -- cgit v1.1 From 9a7f14ef9b6b287d99b8240cdb43e8fe089ea9b3 Mon Sep 17 00:00:00 2001 From: Hans-Peter Nilsson Date: Wed, 2 Feb 2022 00:00:10 +0100 Subject: cris: Don't discriminate against ALL_REGS in TARGET_REGISTER_MOVE_COST When the tightest class including both SPECIAL_REGS and GENERAL_REGS is ALL_REGS, artificially special-casing for *either* to or from, hits artificially hard. This gets the port back to the code quality before the previous patch ("cris: Remove CRIS v32 ACR artefacts") - except for_vfprintf_r and _vfiprintf_r in newlib (still .8 and .4% larger). gcc: * config/cris/cris.cc (cris_register_move_cost): Remove special pre-ira extra cost for ALL_REGS. --- gcc/config/cris/cris.cc | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/cris/cris.cc b/gcc/config/cris/cris.cc index 264439c..4f97722 100644 --- a/gcc/config/cris/cris.cc +++ b/gcc/config/cris/cris.cc @@ -1683,20 +1683,10 @@ cris_register_move_cost (machine_mode mode ATTRIBUTE_UNUSED, their move cost within that class is higher. How about 7? That's 3 for a move to a GENERAL_REGS register, 3 for the move from the GENERAL_REGS register, and 1 for the increased register pressure. - Also, it's higher than the memory move cost, as it should. - We also do this for ALL_REGS, since we don't want that class to be - preferred (even to memory) at all where GENERAL_REGS doesn't fit. - Whenever it's about to be used, it's for SPECIAL_REGS. 
If we don't - present a higher cost for ALL_REGS than memory, a SPECIAL_REGS may be - used when a GENERAL_REGS should be used, even if there are call-saved - GENERAL_REGS left to allocate. This is because the fall-back when - the most preferred register class isn't available, isn't the next - (or next good) wider register class, but the *most widest* register - class. FIXME: pre-IRA comment, perhaps obsolete now. */ - - if ((reg_classes_intersect_p (from, SPECIAL_REGS) - && reg_classes_intersect_p (to, SPECIAL_REGS)) - || from == ALL_REGS || to == ALL_REGS) + Also, it's higher than the memory move cost, as it should be. */ + + if (reg_classes_intersect_p (from, SPECIAL_REGS) + && reg_classes_intersect_p (to, SPECIAL_REGS)) return 7; /* Make moves to/from SPECIAL_REGS slightly more expensive, as we -- cgit v1.1 From 07a6c52c4cd145d20488c4823669a2d984ba2051 Mon Sep 17 00:00:00 2001 From: Hans-Peter Nilsson Date: Wed, 2 Feb 2022 00:00:10 +0100 Subject: cris: Reload using special-regs before general-regs On code where reload has an effect (i.e. quite rarely, just enough to be noticeable), this change gets code quality back to the situation prior to "Remove CRIS v32 ACR artefacts". We had from IRA a pseudoregister marked to be reloaded from a union of all allocatable registers (here: SPEC_GENNONACR_REGS) but where the register-class corresponding to the constraint for the register-type alternative (here: GENERAL_REGS) was *not* a subset of that class: SPEC_GENNONACR_REGS (and GENNONACR_REGS) had a one-register "hole" for the ACR register, a register present in GENERAL_REGS. Code in reload.cc:find_reloads adds 4 to the cost of a register-type alternative that is neither a subset of the preferred register class nor vice versa and thus reload thinks it can't use. It would be preferable to look for a non-empty intersection of the two, and use that intersection for that alternative, something that can't be expressed because a register class can't be formed from a random register set. The effect was here that the GENERAL_REGS to/from memory alternatives ("r") had their cost raised such that the SPECIAL_REGS alternatives ("x") looked better. This happened to improve code quality just a little bit compared to GENERAL_REGS being chosen. Anyway, with the improved CRIS register-class topology, the subset-checking code no longer has the GENERAL_REGS-demoting effect. To get the same quality, we have to adjust the port such that SPECIAL_REGS are specifically preferred when possible and advisible, i.e. when there's at least two of those registers as for the CPU variant with multiplication (which happens to be the variant maintained for performance). For the move-pattern, the obvious method may seem to simply "curse" the constraints of some alternatives (by prepending one of the "?!^$" characters) but that method can't be used, because we want the effect to be conditional on the CPU variant. It'd also be a shame to split the "*movsi_internal" into two CPU-variants (with different cursing). Iterators would help, but it still seems unwieldy. Instead, add copies of the GENERAL_REGS variants (to the SPECIAL_REGS alternatives) on the "other" side, and make use of the "enabled" attribute to activate just the desired order of alternatives. gcc: * config/cris/cris.cc (cris_preferred_reload_class): Reject "eliminated" registers and small-enough constants unless reloaded into a class that is a subset of GENERAL_REGS. * config/cris/cris.md (attribute "cpu_variant"): New. 
(attribute "enabled"): Conditionalize on a matching attribute cpu_variant, if specified. ("*movsi_internal"): For moves to and from memory, add cpu-variant-enabled variants for "r" alternatives on the far side of the "x" alternatives, preferring the "x" ones only for variants where MOF is present (in addition to SRP). --- gcc/config/cris/cris.cc | 13 ++++++++++++- gcc/config/cris/cris.md | 25 ++++++++++++++++++++----- 2 files changed, 32 insertions(+), 6 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/cris/cris.cc b/gcc/config/cris/cris.cc index 4f97722..f0017d6 100644 --- a/gcc/config/cris/cris.cc +++ b/gcc/config/cris/cris.cc @@ -1661,7 +1661,7 @@ cris_reload_address_legitimized (rtx x, a bug. */ static reg_class_t -cris_preferred_reload_class (rtx x ATTRIBUTE_UNUSED, reg_class_t rclass) +cris_preferred_reload_class (rtx x, reg_class_t rclass) { if (rclass != MOF_REGS && rclass != MOF_SRP_REGS @@ -1670,6 +1670,17 @@ cris_preferred_reload_class (rtx x ATTRIBUTE_UNUSED, reg_class_t rclass) && rclass != SPECIAL_REGS) return GENERAL_REGS; + /* We can't make use of something that's not a general register when + reloading an "eliminated" register (i.e. something that has turned into + e.g. sp + const_int). */ + if (GET_CODE (x) == PLUS && !reg_class_subset_p (rclass, GENERAL_REGS)) + return NO_REGS; + + /* Avoid putting constants into a special register, where the instruction is + shorter if loaded into a general register. */ + if (satisfies_constraint_P (x) && !reg_class_subset_p (rclass, GENERAL_REGS)) + return NO_REGS; + return rclass; } diff --git a/gcc/config/cris/cris.md b/gcc/config/cris/cris.md index 9d9eb8b..dd70941 100644 --- a/gcc/config/cris/cris.md +++ b/gcc/config/cris/cris.md @@ -153,9 +153,20 @@ (not (match_test "dead_or_set_regno_p (insn, CRIS_SRP_REGNUM)"))) (nil) (nil)]) +;; Enable choosing particular instructions. The discriminator choice +;; "v0" stands for "pre-v10", for brevity. +(define_attr "cpu_variant" "default,v0,v10" (const_string "default")) + (define_attr "enabled" "no,yes" (if_then_else - (eq_attr "cc_enabled" "normal") + (and + (eq_attr "cc_enabled" "normal") + (ior + (eq_attr "cpu_variant" "default") + (and (eq_attr "cpu_variant" "v10") + (match_test "TARGET_HAS_MUL_INSNS")) + (and (eq_attr "cpu_variant" "v0") + (not (match_test "TARGET_HAS_MUL_INSNS"))))) (const_string "yes") (const_string "no"))) @@ -578,9 +589,9 @@ (define_insn "*movsi_internal" [(set (match_operand:SI 0 "nonimmediate_operand" - "=r,r, r,Q>,r,Q>,g,r,r,g,rQ>,x, m,x") + "=r,r, r,Q>,r,Q>,g,r,r,g,rQ>,x, m,x, Q>,r,g") (match_operand:SI 1 "general_operand" - "r,Q>,M,M, I,r, M,n,g,r,x, rQ>,x,gi")) + "r,Q>,M,M, I,r, M,n,g,r,x, rQ>,x,gi,r, g,r")) (clobber (reg:CC CRIS_CC0_REGNUM))] ;; Avoid matching insns we know must be reloaded. Without one ;; operand being a (pseudo-)register, reload chooses @@ -597,6 +608,9 @@ case 5: case 8: case 9: + case 14: + case 15: + case 16: return "move.d %1,%0"; case 10: @@ -634,9 +648,10 @@ gcc_unreachable (); } } - [(set_attr "slottable" "yes,yes,yes,yes,yes,yes,no,no,no,no,yes,yes,no,no") + [(set_attr "cpu_variant" "*,*,*,*,*,v0,*,*,v0,v0,*,*,*,*,v10,v10,v10") + (set_attr "slottable" "yes,yes,yes,yes,yes,yes,no,no,no,no,yes,yes,no,no,yes,no,no") (set_attr "cc" - "*,*,none,none,*,none,none,*,*,none,none,none,none,none")]) + "*,*,none,none,*,none,none,*,*,none,none,none,none,none,none,*,none")]) ;; FIXME: See movsi. 
-- cgit v1.1 From ab95fe61fea38fbac7f4e00abd32c2530532351a Mon Sep 17 00:00:00 2001 From: Tamar Christina Date: Wed, 2 Feb 2022 10:51:38 +0000 Subject: AArch64: use canonical ordering for complex mul, fma and fms After the first patch in the series this updates the optabs to expect the canonical sequence. gcc/ChangeLog: PR tree-optimization/102819 PR tree-optimization/103169 * config/aarch64/aarch64-simd.md (cml4): Use canonical order. * config/aarch64/aarch64-sve.md (cml4): Likewise. --- gcc/config/aarch64/aarch64-simd.md | 14 +++++++------- gcc/config/aarch64/aarch64-sve.md | 6 +++--- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 71c429f..13255be 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -556,17 +556,17 @@ ;; remainder. Because of this, expand early. (define_expand "cml4" [(set (match_operand:VHSDF 0 "register_operand") - (plus:VHSDF (match_operand:VHSDF 1 "register_operand") - (unspec:VHSDF [(match_operand:VHSDF 2 "register_operand") - (match_operand:VHSDF 3 "register_operand")] - FCMLA_OP)))] + (plus:VHSDF (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand") + (match_operand:VHSDF 2 "register_operand")] + FCMLA_OP) + (match_operand:VHSDF 3 "register_operand")))] "TARGET_COMPLEX && !BYTES_BIG_ENDIAN" { rtx tmp = gen_reg_rtx (mode); - emit_insn (gen_aarch64_fcmla (tmp, operands[1], - operands[3], operands[2])); + emit_insn (gen_aarch64_fcmla (tmp, operands[3], + operands[2], operands[1])); emit_insn (gen_aarch64_fcmla (operands[0], tmp, - operands[3], operands[2])); + operands[2], operands[1])); DONE; }) diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index bd22fe5..bd60e65 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -7278,11 +7278,11 @@ rtx tmp = gen_reg_rtx (mode); emit_insn (gen_aarch64_pred_fcmla (tmp, operands[4], - operands[3], operands[2], - operands[1], operands[5])); + operands[2], operands[1], + operands[3], operands[5])); emit_insn (gen_aarch64_pred_fcmla (operands[0], operands[4], - operands[3], operands[2], + operands[2], operands[1], tmp, operands[5])); DONE; }) -- cgit v1.1 From 9f6f411f63f3aceddd846e4b0d27202a6e13d42c Mon Sep 17 00:00:00 2001 From: Tamar Christina Date: Wed, 2 Feb 2022 10:52:17 +0000 Subject: AArch32: use canonical ordering for complex mul, fma and fms After the first patch in the series this updates the optabs to expect the canonical sequence. gcc/ChangeLog: PR tree-optimization/102819 PR tree-optimization/103169 * config/arm/vec-common.md (cml4): Use canonical order. --- gcc/config/arm/vec-common.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md index cef358e..2718d82 100644 --- a/gcc/config/arm/vec-common.md +++ b/gcc/config/arm/vec-common.md @@ -265,18 +265,18 @@ ;; remainder. Because of this, expand early. 
(define_expand "cml4" [(set (match_operand:VF 0 "register_operand") - (plus:VF (match_operand:VF 1 "register_operand") - (unspec:VF [(match_operand:VF 2 "register_operand") - (match_operand:VF 3 "register_operand")] - VCMLA_OP)))] + (plus:VF (unspec:VF [(match_operand:VF 1 "register_operand") + (match_operand:VF 2 "register_operand")] + VCMLA_OP) + (match_operand:VF 3 "register_operand")))] "(TARGET_COMPLEX || (TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT && ARM_HAVE__ARITH)) && !BYTES_BIG_ENDIAN" { rtx tmp = gen_reg_rtx (mode); - emit_insn (gen_arm_vcmla (tmp, operands[1], - operands[3], operands[2])); + emit_insn (gen_arm_vcmla (tmp, operands[3], + operands[2], operands[1])); emit_insn (gen_arm_vcmla (operands[0], tmp, - operands[3], operands[2])); + operands[2], operands[1])); DONE; }) -- cgit v1.1 From cac2f69cdad434ad5cb60f5fe931d45cd82ef476 Mon Sep 17 00:00:00 2001 From: Bernd Kuhls Date: Fri, 27 Mar 2020 21:23:53 +0100 Subject: gcc: define _REENTRANT for OpenRISC when -pthread is passed The detection of pthread support fails on OpenRISC unless _REENTRANT is defined. Added the CPP_SPEC definition to correct this. gcc/ChangeLog: PR target/94372 * config/or1k/linux.h (CPP_SPEC): Define. Signed-off-by: Bernd Kuhls --- gcc/config/or1k/linux.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/or1k/linux.h b/gcc/config/or1k/linux.h index 52909af..80f77c7 100644 --- a/gcc/config/or1k/linux.h +++ b/gcc/config/or1k/linux.h @@ -32,6 +32,8 @@ #undef MUSL_DYNAMIC_LINKER #define MUSL_DYNAMIC_LINKER "/lib/ld-musl-or1k.so.1" +#define CPP_SPEC "%{pthread:-D_REENTRANT}" + #undef LINK_SPEC #define LINK_SPEC "%{h*} \ %{static:-Bstatic} \ -- cgit v1.1 From 0415470c8d66200f6ae8ffb5ff4342bafc06251b Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Thu, 3 Feb 2022 09:55:59 +0100 Subject: s390x: Fix one more -Wformat-diag. gcc/ChangeLog: * config/s390/s390.cc (s390_valid_target_attribute_inner_p): Use the error message for i386 target. --- gcc/config/s390/s390.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc index 2db12d4..63b78ab 100644 --- a/gcc/config/s390/s390.cc +++ b/gcc/config/s390/s390.cc @@ -15903,7 +15903,7 @@ s390_valid_target_attribute_inner_p (tree args, /* Process the option. */ if (!found) { - error ("attribute(target(\"%s\")) is unknown", orig_p); + error ("attribute %qs argument % is unknown", orig_p); return false; } else if (attrs[i].only_as_pragma && !force_pragma) @@ -15953,7 +15953,7 @@ s390_valid_target_attribute_inner_p (tree args, } else { - error ("attribute(target(\"%s\")) is unknown", orig_p); + error ("attribute %qs argument % is unknown", orig_p); ret = false; } } @@ -15970,7 +15970,7 @@ s390_valid_target_attribute_inner_p (tree args, global_dc); else { - error ("attribute(target(\"%s\")) is unknown", orig_p); + error ("attribute %qs argument % is unknown", orig_p); ret = false; } } -- cgit v1.1 From 9db03cd0caf6bbde1de302bf3509dc26ca8bff2b Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Thu, 3 Feb 2022 10:19:33 +0100 Subject: =?UTF-8?q?Fix=20wording=20for:=20attribute=20=E2=80=98-xyz?= =?UTF-8?q?=E2=80=99=20argument=20=E2=80=98target=E2=80=99=20is=20unknown?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gcc/ChangeLog: * config/i386/i386-options.cc (ix86_valid_target_attribute_inner_p): Change subject and object in the error message. 
* config/s390/s390.cc (s390_valid_target_attribute_inner_p): Likewise. --- gcc/config/i386/i386-options.cc | 2 +- gcc/config/s390/s390.cc | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc index 715d9a1..082abd2 100644 --- a/gcc/config/i386/i386-options.cc +++ b/gcc/config/i386/i386-options.cc @@ -1201,7 +1201,7 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[], if (opt == N_OPTS) { error_at (loc, "attribute %qs argument %qs is unknown", - orig_p, attr_name); + attr_name, orig_p); ret = false; } diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc index 63b78ab..5c2a830 100644 --- a/gcc/config/s390/s390.cc +++ b/gcc/config/s390/s390.cc @@ -15903,7 +15903,7 @@ s390_valid_target_attribute_inner_p (tree args, /* Process the option. */ if (!found) { - error ("attribute %qs argument % is unknown", orig_p); + error ("attribute % argument %qs is unknown", orig_p); return false; } else if (attrs[i].only_as_pragma && !force_pragma) @@ -15953,7 +15953,7 @@ s390_valid_target_attribute_inner_p (tree args, } else { - error ("attribute %qs argument % is unknown", orig_p); + error ("attribute % argument %qs is unknown", orig_p); ret = false; } } @@ -15970,7 +15970,7 @@ s390_valid_target_attribute_inner_p (tree args, global_dc); else { - error ("attribute %qs argument % is unknown", orig_p); + error ("attribute % argument %qs is unknown", orig_p); ret = false; } } -- cgit v1.1 From 6a770526600a7ffda1f288fa481a4322d5f149b4 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Thu, 3 Feb 2022 10:44:00 +0000 Subject: aarch64: Remove VALL_F16MOV iterator The VALL_F16MOV iterator now has the same modes as VALL_F16, in the same order. This patch removes the former in favour of the latter. This doesn't fix a bug as such, but it's ultra-safe (no change in object code) and it saves a follow-up patch from having to make a false choice between the iterators. gcc/ * config/aarch64/iterators.md (VALL_F16MOV): Delete. * config/aarch64/aarch64-simd.md (mov): Use VALL_F16 instead of VALL_F16MOV. --- gcc/config/aarch64/aarch64-simd.md | 4 ++-- gcc/config/aarch64/iterators.md | 5 ----- 2 files changed, 2 insertions(+), 7 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 13255be..f6d7b42 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -19,8 +19,8 @@ ;; . (define_expand "mov" - [(set (match_operand:VALL_F16MOV 0 "nonimmediate_operand") - (match_operand:VALL_F16MOV 1 "general_operand"))] + [(set (match_operand:VALL_F16 0 "nonimmediate_operand") + (match_operand:VALL_F16 1 "general_operand"))] "TARGET_SIMD" " /* Force the operand into a register if it is not an diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 9160ce3..a0c02e4 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -187,11 +187,6 @@ (define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V4HF V8HF V4BF V8BF V2SF V4SF V2DF]) -;; All Advanced SIMD modes suitable for moving, loading, and storing, -;; including special Bfloat vector types. -(define_mode_iterator VALL_F16MOV [V8QI V16QI V4HI V8HI V2SI V4SI V2DI - V4HF V8HF V4BF V8BF V2SF V4SF V2DF]) - ;; The VALL_F16 modes except the 128-bit 2-element ones. 
(define_mode_iterator VALL_F16_NO_V2Q [V8QI V16QI V4HI V8HI V2SI V4SI V4HF V8HF V2SF V4SF]) -- cgit v1.1 From 7e4f89a23e32604f71f8f6756c8856bf07bf7ac2 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Thu, 3 Feb 2022 10:44:00 +0000 Subject: aarch64: Add missing movmisalign patterns The Advanced SIMD movmisalign patterns didn't handle 16-bit FP modes, which meant that the vector loop for: void test (_Float16 *data) { _Pragma ("omp simd") for (int i = 0; i < 8; ++i) data[i] = 1.0; } would be versioned for alignment. This was causing some new failures in aarch64/sve/single_5.c: FAIL: gcc.target/aarch64/sve/single_5.c scan-assembler-not \\tb FAIL: gcc.target/aarch64/sve/single_5.c scan-assembler-not \\tcmp FAIL: gcc.target/aarch64/sve/single_5.c scan-assembler-times \\tstr\\tq[0-9]+, 10 but I didn't look into what changed from earlier releases. Adding the missing modes removes some existing xfails. gcc/ * config/aarch64/aarch64-simd.md (movmisalign): Extend from VALL to VALL_F16. gcc/testsuite/ * gcc.target/aarch64/sve/single_5.c: Remove some XFAILs. --- gcc/config/aarch64/aarch64-simd.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index f6d7b42..6646e06 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -50,8 +50,8 @@ ) (define_expand "movmisalign" - [(set (match_operand:VALL 0 "nonimmediate_operand") - (match_operand:VALL 1 "general_operand"))] + [(set (match_operand:VALL_F16 0 "nonimmediate_operand") + (match_operand:VALL_F16 1 "general_operand"))] "TARGET_SIMD && !STRICT_ALIGNMENT" { /* This pattern is not permitted to fail during expansion: if both arguments -- cgit v1.1 From 8439e866a38399f0d5e6aab16faaf10bdabc4b5f Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Thu, 3 Feb 2022 14:34:21 +0100 Subject: arm: Fix up help.exp regression On Thu, Jan 20, 2022 at 11:27:20AM +0000, Richard Earnshaw via Gcc-patches wrote: > gcc/ChangeLog: > > * config/arm/arm.opt (mfix-cortex-a57-aes-1742098): New command-line > option. > (mfix-cortex-a72-aes-1655431): New option alias. > --- a/gcc/config/arm/arm.opt > +++ b/gcc/config/arm/arm.opt > @@ -272,6 +272,16 @@ mfix-cmse-cve-2021-35465 > Target Var(fix_vlldm) Init(2) > Mitigate issues with VLLDM on some M-profile devices (CVE-2021-35465). > > +mfix-cortex-a57-aes-1742098 > +Target Var(fix_aes_erratum_1742098) Init(2) Save > +Mitigate issues with AES instructions on Cortex-A57 and Cortex-A72. > +Arm erratum #1742098 > + > +mfix-cortex-a72-aes-1655431 > +Target Alias(mfix-cortex-a57-aes-1742098) > +Mitigate issues with AES instructions on Cortex-A57 and Cortex-A72. > +Arm erratum #1655431 > + > munaligned-access > Target Var(unaligned_access) Init(2) Save > Enable unaligned word and halfword accesses to packed data. This breaks: Running /usr/src/gcc/gcc/testsuite/gcc.misc-tests/help.exp ... FAIL: compiler driver --help=target option(s): "^ +-.*[^:.]$" absent from output: " -mfix-cortex-a57-aes-1742098 Mitigate issues with AES instructions on Cortex-A57 and Cortex-A72. Arm erratum #1742098" help.exp with help of lib/options.exp tests whether all non-empty descriptions of options are terminated with . or :. 2022-02-03 Jakub Jelinek * config/arm/arm.opt (mfix-cortex-a57-aes-1742098, mfix-cortex-a72-aes-1655431): Ensure description ends with full stop. 
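For readers unfamiliar with this testsuite check, the rule quoted above (every non-empty option description printed by --help must end in '.' or ':') is simple to state as code. The following is a hypothetical C sketch of that rule only; the real check lives in the Tcl library lib/options.exp and is not reproduced here. The two sample strings are the before and after descriptions from the arm.opt hunk that follows.

/* Hypothetical illustration of the rule enforced by help.exp via
   lib/options.exp: a non-empty option description must end with a
   full stop or a colon.  Not the real (Tcl) implementation.  */
#include <stdio.h>
#include <string.h>

static int
description_ok (const char *desc)
{
  size_t len = strlen (desc);
  if (len == 0)
    return 1;                        /* empty descriptions are not checked */
  char last = desc[len - 1];
  return last == '.' || last == ':';
}

int
main (void)
{
  const char *before = "Mitigate issues with AES instructions on Cortex-A57 "
                       "and Cortex-A72. Arm erratum #1742098";
  const char *after  = "Mitigate issues with AES instructions on Cortex-A57 "
                       "and Cortex-A72 (Arm erratum #1742098).";

  printf ("before fix: %s\n", description_ok (before) ? "PASS" : "FAIL");
  printf ("after fix:  %s\n", description_ok (after) ? "PASS" : "FAIL");
  return 0;
}

Joining the two description lines of the old arm.opt entry yields help text ending in "#1742098", which is why help.exp reported the FAIL quoted in the message above.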
--- gcc/config/arm/arm.opt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm.opt b/gcc/config/arm/arm.opt index cc16534..3209b6c 100644 --- a/gcc/config/arm/arm.opt +++ b/gcc/config/arm/arm.opt @@ -274,13 +274,13 @@ Mitigate issues with VLLDM on some M-profile devices (CVE-2021-35465). mfix-cortex-a57-aes-1742098 Target Var(fix_aes_erratum_1742098) Init(2) Save -Mitigate issues with AES instructions on Cortex-A57 and Cortex-A72. -Arm erratum #1742098 +Mitigate issues with AES instructions on Cortex-A57 and Cortex-A72 +(Arm erratum #1742098). mfix-cortex-a72-aes-1655431 Target Alias(mfix-cortex-a57-aes-1742098) -Mitigate issues with AES instructions on Cortex-A57 and Cortex-A72. -Arm erratum #1655431 +Mitigate issues with AES instructions on Cortex-A57 and Cortex-A72 +(Arm erratum #1655431). munaligned-access Target Var(unaligned_access) Init(2) Save -- cgit v1.1 From a1b4d225d8cd07c79eea81fb6416e8ad5a07f018 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Wed, 2 Feb 2022 20:59:00 -0600 Subject: rs6000: Unify error messages for built-in constant restrictions We currently give different error messages for built-in functions that violate range restrictions on their arguments, depending on whether we record them as requiring an n-bit literal or a literal between two values. It's better to be consistent. Change the error message for the n-bit literal to look like the other one. 2022-02-02 Bill Schmidt gcc/ * config/rs6000/rs6000-call.cc (rs6000_expand_builtin): Revise error message for RES_BITS case. gcc/testsuite/ * gcc.target/powerpc/bfp/scalar-test-data-class-10.c: Adjust error messages. * gcc.target/powerpc/bfp/scalar-test-data-class-2.c: Likewise. * gcc.target/powerpc/bfp/scalar-test-data-class-3.c: Likewise. * gcc.target/powerpc/bfp/scalar-test-data-class-4.c: Likewise. * gcc.target/powerpc/bfp/scalar-test-data-class-5.c: Likewise. * gcc.target/powerpc/bfp/scalar-test-data-class-9.c: Likewise. * gcc.target/powerpc/bfp/vec-test-data-class-4.c: Likewise. * gcc.target/powerpc/bfp/vec-test-data-class-5.c: Likewise. * gcc.target/powerpc/bfp/vec-test-data-class-6.c: Likewise. * gcc.target/powerpc/bfp/vec-test-data-class-7.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-12.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-14.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-17.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-19.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-2.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-22.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-24.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-27.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-29.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-32.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-34.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-37.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-39.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-4.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-42.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-44.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-47.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-49.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-52.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-54.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-57.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-59.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-62.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-64.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-67.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-69.c: Likewise. 
* gcc.target/powerpc/dfp/dtstsfi-7.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-72.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-74.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-77.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-79.c: Likewise. * gcc.target/powerpc/dfp/dtstsfi-9.c: Likewise. * gcc.target/powerpc/pr80315-1.c: Likewise. * gcc.target/powerpc/pr80315-2.c: Likewise. * gcc.target/powerpc/pr80315-3.c: Likewise. * gcc.target/powerpc/pr80315-4.c: Likewise. * gcc.target/powerpc/pr82015.c: Likewise. * gcc.target/powerpc/pr91903.c: Likewise. * gcc.target/powerpc/test_fpscr_rn_builtin_error.c: Likewise. * gcc.target/powerpc/vec-ternarylogic-10.c: Likewise. --- gcc/config/rs6000/rs6000-call.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-call.cc b/gcc/config/rs6000/rs6000-call.cc index 5c870d4..d9bd5ca 100644 --- a/gcc/config/rs6000/rs6000-call.cc +++ b/gcc/config/rs6000/rs6000-call.cc @@ -5717,8 +5717,10 @@ rs6000_expand_builtin (tree exp, rtx target, rtx /* subtarget */, if (!(TREE_CODE (restr_arg) == INTEGER_CST && (TREE_INT_CST_LOW (restr_arg) & ~mask) == 0)) { - error ("argument %d must be a %d-bit unsigned literal", - bifaddr->restr_opnd[i], bifaddr->restr_val1[i]); + unsigned p = (1U << bifaddr->restr_val1[i]) - 1; + error ("argument %d must be a literal between 0 and %d," + " inclusive", + bifaddr->restr_opnd[i], p); return CONST0_RTX (mode[0]); } break; -- cgit v1.1 From eecee223f435fa01fb07a2fdba1615b89627d710 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Thu, 3 Feb 2022 10:26:29 -0600 Subject: rs6000: Consolidate target built-ins code Continuing with the refactoring effort, this patch moves as much of the target-specific built-in support code into a new file, rs6000-builtin.cc. However, we can't easily move the overloading support code out of rs6000-c.cc, because the build machinery understands that as a special file to be included with the C and C++ front ends. This patch is just a straightforward move, with one exception. I found that the builtin_mode_to_type[] array is no longer used, so I also removed all code having to do with it. The code in rs6000-builtin.cc is organized in related sections: - General support functions - Initialization support - GIMPLE folding support - Expansion support Overloading support remains in rs6000-c.cc. 2022-02-03 Bill Schmidt gcc/ * config.gcc (powerpc*-*-*): Add rs6000-builtin.o to extra_objs. * config/rs6000/rs6000-builtin.cc: New file, containing code moved from other files. * config/rs6000/rs6000-call.cc (cpu_is_info): Move to rs6000-builtin.cc. (cpu_supports_info): Likewise. (rs6000_type_string): Likewise. (altivec_expand_predicate_builtin): Likewise. (rs6000_htm_spr_icode): Likewise. (altivec_expand_vec_init_builtin): Likewise. (get_element_number): Likewise. (altivec_expand_vec_set_builtin): Likewise. (altivec_expand_vec_ext_builtin): Likewise. (rs6000_invalid_builtin): Likewise. (rs6000_fold_builtin): Likewise. (fold_build_vec_cmp): Likewise. (fold_compare_helper): Likewise. (map_to_integral_tree_type): Likewise. (fold_mergehl_helper): Likewise. (fold_mergeeo_helper): Likewise. (rs6000_builtin_valid_without_lhs): Likewise. (rs6000_builtin_is_supported): Likewise. (rs6000_gimple_fold_mma_builtin): Likewise. (rs6000_gimple_fold_builtin): Likewise. (rs6000_expand_ldst_mask): Likewise. (cpu_expand_builtin): Likewise. (elemrev_icode): Likewise. (ldv_expand_builtin): Likewise. (lxvrse_expand_builtin): Likewise. (lxvrze_expand_builtin): Likewise. 
(stv_expand_builtin): Likewise. (mma_expand_builtin): Likewise. (htm_spr_num): Likewise. (htm_expand_builtin): Likewise. (rs6000_expand_builtin): Likewise. (rs6000_vector_type): Likewise. (rs6000_init_builtins): Likewise. Remove initialization of builtin_mode_to_type entries. (rs6000_builtin_decl): Move to rs6000-builtin.cc. * config/rs6000/rs6000.cc (rs6000_builtin_mask_for_load): New external declaration. (rs6000_builtin_md_vectorized_function): Likewise. (rs6000_builtin_reciprocal): Likewise. (altivec_builtin_mask_for_load): Move to rs6000-builtin.cc. (rs6000_builtin_types): Likewise. (builtin_mode_to_type): Remove. (rs6000_builtin_mask_for_load): Move to rs6000-builtin.cc. Remove static qualifier. (rs6000_builtin_md_vectorized_function): Likewise. (rs6000_builtin_reciprocal): Likewise. * config/rs6000/rs6000.h (builtin_mode_to_type): Remove. * config/rs6000/t-rs6000 (rs6000-builtin.o): New target. --- gcc/config/rs6000/rs6000-builtin.cc | 3714 +++++++++++++++++++++++++++++++++++ gcc/config/rs6000/rs6000-call.cc | 3526 --------------------------------- gcc/config/rs6000/rs6000.cc | 163 +- gcc/config/rs6000/rs6000.h | 1 - gcc/config/rs6000/t-rs6000 | 4 + 5 files changed, 3722 insertions(+), 3686 deletions(-) create mode 100644 gcc/config/rs6000/rs6000-builtin.cc (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-builtin.cc b/gcc/config/rs6000/rs6000-builtin.cc new file mode 100644 index 0000000..005f936 --- /dev/null +++ b/gcc/config/rs6000/rs6000-builtin.cc @@ -0,0 +1,3714 @@ +/* Target-specific built-in function support for the Power architecture. + See also rs6000-c.c, rs6000-gen-builtins.c, rs6000-builtins.def, and + rs6000-overloads.def. + Note that "normal" builtins (generic math functions, etc.) are handled + in rs6000.c. + + Copyright (C) 2002-2022 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +#define IN_TARGET_CODE 1 + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "target.h" +#include "backend.h" +#include "rtl.h" +#include "tree.h" +#include "memmodel.h" +#include "gimple.h" +#include "tm_p.h" +#include "optabs.h" +#include "recog.h" +#include "diagnostic-core.h" +#include "fold-const.h" +#include "stor-layout.h" +#include "calls.h" +#include "varasm.h" +#include "explow.h" +#include "expr.h" +#include "langhooks.h" +#include "gimplify.h" +#include "gimple-fold.h" +#include "gimple-iterator.h" +#include "ssa.h" +#include "tree-ssa-propagate.h" +#include "builtins.h" +#include "tree-vector-builder.h" +#if TARGET_XCOFF +#include "xcoffout.h" /* get declarations of xcoff_*_section_name */ +#endif +#include "ppc-auxv.h" +#include "rs6000-internal.h" + +/* Built in types. */ +tree rs6000_builtin_types[RS6000_BTI_MAX]; + +/* Support targetm.vectorize.builtin_mask_for_load. 
*/ +tree altivec_builtin_mask_for_load; + +/* **** General support functions **** */ + +/* Raise an error message for a builtin function that is called without the + appropriate target options being set. */ + +void +rs6000_invalid_builtin (enum rs6000_gen_builtins fncode) +{ + size_t j = (size_t) fncode; + const char *name = rs6000_builtin_info[j].bifname; + + switch (rs6000_builtin_info[j].enable) + { + case ENB_P5: + error ("%qs requires the %qs option", name, "-mcpu=power5"); + break; + case ENB_P6: + error ("%qs requires the %qs option", name, "-mcpu=power6"); + break; + case ENB_P6_64: + error ("%qs requires the %qs option and either the %qs or %qs option", + name, "-mcpu=power6", "-m64", "-mpowerpc64"); + break; + case ENB_ALTIVEC: + error ("%qs requires the %qs option", name, "-maltivec"); + break; + case ENB_CELL: + error ("%qs requires the %qs option", name, "-mcpu=cell"); + break; + case ENB_VSX: + error ("%qs requires the %qs option", name, "-mvsx"); + break; + case ENB_P7: + error ("%qs requires the %qs option", name, "-mcpu=power7"); + break; + case ENB_P7_64: + error ("%qs requires the %qs option and either the %qs or %qs option", + name, "-mcpu=power7", "-m64", "-mpowerpc64"); + break; + case ENB_P8: + error ("%qs requires the %qs option", name, "-mcpu=power8"); + break; + case ENB_P8V: + error ("%qs requires the %qs and %qs options", name, "-mcpu=power8", + "-mvsx"); + break; + case ENB_P9: + error ("%qs requires the %qs option", name, "-mcpu=power9"); + break; + case ENB_P9_64: + error ("%qs requires the %qs option and either the %qs or %qs option", + name, "-mcpu=power9", "-m64", "-mpowerpc64"); + break; + case ENB_P9V: + error ("%qs requires the %qs and %qs options", name, "-mcpu=power9", + "-mvsx"); + break; + case ENB_IEEE128_HW: + error ("%qs requires quad-precision floating-point arithmetic", name); + break; + case ENB_DFP: + error ("%qs requires the %qs option", name, "-mhard-dfp"); + break; + case ENB_CRYPTO: + error ("%qs requires the %qs option", name, "-mcrypto"); + break; + case ENB_HTM: + error ("%qs requires the %qs option", name, "-mhtm"); + break; + case ENB_P10: + error ("%qs requires the %qs option", name, "-mcpu=power10"); + break; + case ENB_P10_64: + error ("%qs requires the %qs option and either the %qs or %qs option", + name, "-mcpu=power10", "-m64", "-mpowerpc64"); + break; + case ENB_MMA: + error ("%qs requires the %qs option", name, "-mmma"); + break; + default: + case ENB_ALWAYS: + gcc_unreachable (); + } +} + +/* Check whether a builtin function is supported in this target + configuration. 
*/ +bool +rs6000_builtin_is_supported (enum rs6000_gen_builtins fncode) +{ + switch (rs6000_builtin_info[(size_t) fncode].enable) + { + case ENB_ALWAYS: + return true; + case ENB_P5: + return TARGET_POPCNTB; + case ENB_P6: + return TARGET_CMPB; + case ENB_P6_64: + return TARGET_CMPB && TARGET_POWERPC64; + case ENB_P7: + return TARGET_POPCNTD; + case ENB_P7_64: + return TARGET_POPCNTD && TARGET_POWERPC64; + case ENB_P8: + return TARGET_DIRECT_MOVE; + case ENB_P8V: + return TARGET_P8_VECTOR; + case ENB_P9: + return TARGET_MODULO; + case ENB_P9_64: + return TARGET_MODULO && TARGET_POWERPC64; + case ENB_P9V: + return TARGET_P9_VECTOR; + case ENB_P10: + return TARGET_POWER10; + case ENB_P10_64: + return TARGET_POWER10 && TARGET_POWERPC64; + case ENB_ALTIVEC: + return TARGET_ALTIVEC; + case ENB_VSX: + return TARGET_VSX; + case ENB_CELL: + return TARGET_ALTIVEC && rs6000_cpu == PROCESSOR_CELL; + case ENB_IEEE128_HW: + return TARGET_FLOAT128_HW; + case ENB_DFP: + return TARGET_DFP; + case ENB_CRYPTO: + return TARGET_CRYPTO; + case ENB_HTM: + return TARGET_HTM; + case ENB_MMA: + return TARGET_MMA; + default: + gcc_unreachable (); + } + gcc_unreachable (); +} + +/* Target hook for early folding of built-ins, shamelessly stolen + from ia64.cc. */ + +tree +rs6000_fold_builtin (tree fndecl ATTRIBUTE_UNUSED, + int n_args ATTRIBUTE_UNUSED, + tree *args ATTRIBUTE_UNUSED, + bool ignore ATTRIBUTE_UNUSED) +{ +#ifdef SUBTARGET_FOLD_BUILTIN + return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore); +#else + return NULL_TREE; +#endif +} + +tree +rs6000_builtin_decl (unsigned code, bool /* initialize_p */) +{ + rs6000_gen_builtins fcode = (rs6000_gen_builtins) code; + + if (fcode >= RS6000_OVLD_MAX) + return error_mark_node; + + return rs6000_builtin_decls[code]; +} + +/* Implement targetm.vectorize.builtin_mask_for_load. */ +tree +rs6000_builtin_mask_for_load (void) +{ + /* Don't use lvsl/vperm for P8 and similarly efficient machines. */ + if ((TARGET_ALTIVEC && !TARGET_VSX) + || (TARGET_VSX && !TARGET_EFFICIENT_UNALIGNED_VSX)) + return altivec_builtin_mask_for_load; + else + return 0; +} + +/* Implement targetm.vectorize.builtin_md_vectorized_function. */ + +tree +rs6000_builtin_md_vectorized_function (tree fndecl, tree type_out, + tree type_in) +{ + machine_mode in_mode, out_mode; + int in_n, out_n; + + if (TARGET_DEBUG_BUILTIN) + fprintf (stderr, + "rs6000_builtin_md_vectorized_function (%s, %s, %s)\n", + IDENTIFIER_POINTER (DECL_NAME (fndecl)), + GET_MODE_NAME (TYPE_MODE (type_out)), + GET_MODE_NAME (TYPE_MODE (type_in))); + + /* TODO: Should this be gcc_assert? 
*/ + if (TREE_CODE (type_out) != VECTOR_TYPE + || TREE_CODE (type_in) != VECTOR_TYPE) + return NULL_TREE; + + out_mode = TYPE_MODE (TREE_TYPE (type_out)); + out_n = TYPE_VECTOR_SUBPARTS (type_out); + in_mode = TYPE_MODE (TREE_TYPE (type_in)); + in_n = TYPE_VECTOR_SUBPARTS (type_in); + + enum rs6000_gen_builtins fn + = (enum rs6000_gen_builtins) DECL_MD_FUNCTION_CODE (fndecl); + switch (fn) + { + case RS6000_BIF_RSQRTF: + if (VECTOR_UNIT_ALTIVEC_OR_VSX_P (V4SFmode) + && out_mode == SFmode && out_n == 4 + && in_mode == SFmode && in_n == 4) + return rs6000_builtin_decls[RS6000_BIF_VRSQRTFP]; + break; + case RS6000_BIF_RSQRT: + if (VECTOR_UNIT_VSX_P (V2DFmode) + && out_mode == DFmode && out_n == 2 + && in_mode == DFmode && in_n == 2) + return rs6000_builtin_decls[RS6000_BIF_RSQRT_2DF]; + break; + case RS6000_BIF_RECIPF: + if (VECTOR_UNIT_ALTIVEC_OR_VSX_P (V4SFmode) + && out_mode == SFmode && out_n == 4 + && in_mode == SFmode && in_n == 4) + return rs6000_builtin_decls[RS6000_BIF_VRECIPFP]; + break; + case RS6000_BIF_RECIP: + if (VECTOR_UNIT_VSX_P (V2DFmode) + && out_mode == DFmode && out_n == 2 + && in_mode == DFmode && in_n == 2) + return rs6000_builtin_decls[RS6000_BIF_RECIP_V2DF]; + break; + default: + break; + } + + machine_mode in_vmode = TYPE_MODE (type_in); + machine_mode out_vmode = TYPE_MODE (type_out); + + /* Power10 supported vectorized built-in functions. */ + if (TARGET_POWER10 + && in_vmode == out_vmode + && VECTOR_UNIT_ALTIVEC_OR_VSX_P (in_vmode)) + { + machine_mode exp_mode = DImode; + machine_mode exp_vmode = V2DImode; + enum rs6000_gen_builtins bif; + switch (fn) + { + case RS6000_BIF_DIVWE: + case RS6000_BIF_DIVWEU: + exp_mode = SImode; + exp_vmode = V4SImode; + if (fn == RS6000_BIF_DIVWE) + bif = RS6000_BIF_VDIVESW; + else + bif = RS6000_BIF_VDIVEUW; + break; + case RS6000_BIF_DIVDE: + case RS6000_BIF_DIVDEU: + if (fn == RS6000_BIF_DIVDE) + bif = RS6000_BIF_VDIVESD; + else + bif = RS6000_BIF_VDIVEUD; + break; + case RS6000_BIF_CFUGED: + bif = RS6000_BIF_VCFUGED; + break; + case RS6000_BIF_CNTLZDM: + bif = RS6000_BIF_VCLZDM; + break; + case RS6000_BIF_CNTTZDM: + bif = RS6000_BIF_VCTZDM; + break; + case RS6000_BIF_PDEPD: + bif = RS6000_BIF_VPDEPD; + break; + case RS6000_BIF_PEXTD: + bif = RS6000_BIF_VPEXTD; + break; + default: + return NULL_TREE; + } + + if (in_mode == exp_mode && in_vmode == exp_vmode) + return rs6000_builtin_decls[bif]; + } + + return NULL_TREE; +} + +/* Returns a code for a target-specific builtin that implements + reciprocal of the function, or NULL_TREE if not available. */ + +tree +rs6000_builtin_reciprocal (tree fndecl) +{ + switch (DECL_MD_FUNCTION_CODE (fndecl)) + { + case RS6000_BIF_XVSQRTDP: + if (!RS6000_RECIP_AUTO_RSQRTE_P (V2DFmode)) + return NULL_TREE; + + return rs6000_builtin_decls[RS6000_BIF_RSQRT_2DF]; + + case RS6000_BIF_XVSQRTSP: + if (!RS6000_RECIP_AUTO_RSQRTE_P (V4SFmode)) + return NULL_TREE; + + return rs6000_builtin_decls[RS6000_BIF_RSQRT_4SF]; + + default: + return NULL_TREE; + } +} + +/* **** Initialization support **** */ + +/* Create a builtin vector type with a name. Taking care not to give + the canonical type a name. */ + +static tree +rs6000_vector_type (const char *name, tree elt_type, unsigned num_elts) +{ + tree result = build_vector_type (elt_type, num_elts); + + /* Copy so we don't give the canonical type a name. */ + result = build_variant_type_copy (result); + + add_builtin_type (name, result); + + return result; +} + +/* Debug utility to translate a type node to a single textual token. 
*/ +static +const char *rs6000_type_string (tree type_node) +{ + if (type_node == void_type_node) + return "void"; + else if (type_node == long_integer_type_node) + return "long"; + else if (type_node == long_unsigned_type_node) + return "ulong"; + else if (type_node == long_long_integer_type_node) + return "longlong"; + else if (type_node == long_long_unsigned_type_node) + return "ulonglong"; + else if (type_node == bool_V2DI_type_node) + return "vbll"; + else if (type_node == bool_V4SI_type_node) + return "vbi"; + else if (type_node == bool_V8HI_type_node) + return "vbs"; + else if (type_node == bool_V16QI_type_node) + return "vbc"; + else if (type_node == bool_int_type_node) + return "bool"; + else if (type_node == dfloat64_type_node) + return "_Decimal64"; + else if (type_node == double_type_node) + return "double"; + else if (type_node == intDI_type_node) + return "sll"; + else if (type_node == intHI_type_node) + return "ss"; + else if (type_node == ibm128_float_type_node) + return "__ibm128"; + else if (type_node == opaque_V4SI_type_node) + return "opaque"; + else if (POINTER_TYPE_P (type_node)) + return "void*"; + else if (type_node == intQI_type_node || type_node == char_type_node) + return "sc"; + else if (type_node == dfloat32_type_node) + return "_Decimal32"; + else if (type_node == float_type_node) + return "float"; + else if (type_node == intSI_type_node || type_node == integer_type_node) + return "si"; + else if (type_node == dfloat128_type_node) + return "_Decimal128"; + else if (type_node == long_double_type_node) + return "longdouble"; + else if (type_node == intTI_type_node) + return "sq"; + else if (type_node == unsigned_intDI_type_node) + return "ull"; + else if (type_node == unsigned_intHI_type_node) + return "us"; + else if (type_node == unsigned_intQI_type_node) + return "uc"; + else if (type_node == unsigned_intSI_type_node) + return "ui"; + else if (type_node == unsigned_intTI_type_node) + return "uq"; + else if (type_node == unsigned_V1TI_type_node) + return "vuq"; + else if (type_node == unsigned_V2DI_type_node) + return "vull"; + else if (type_node == unsigned_V4SI_type_node) + return "vui"; + else if (type_node == unsigned_V8HI_type_node) + return "vus"; + else if (type_node == unsigned_V16QI_type_node) + return "vuc"; + else if (type_node == V16QI_type_node) + return "vsc"; + else if (type_node == V1TI_type_node) + return "vsq"; + else if (type_node == V2DF_type_node) + return "vd"; + else if (type_node == V2DI_type_node) + return "vsll"; + else if (type_node == V4SF_type_node) + return "vf"; + else if (type_node == V4SI_type_node) + return "vsi"; + else if (type_node == V8HI_type_node) + return "vss"; + else if (type_node == pixel_V8HI_type_node) + return "vp"; + else if (type_node == pcvoid_type_node) + return "voidc*"; + else if (type_node == float128_type_node) + return "_Float128"; + else if (type_node == vector_pair_type_node) + return "__vector_pair"; + else if (type_node == vector_quad_type_node) + return "__vector_quad"; + + return "unknown"; +} + +void +rs6000_init_builtins (void) +{ + tree tdecl; + tree t; + + if (TARGET_DEBUG_BUILTIN) + fprintf (stderr, "rs6000_init_builtins%s%s\n", + (TARGET_ALTIVEC) ? ", altivec" : "", + (TARGET_VSX) ? 
", vsx" : ""); + + V2DI_type_node = rs6000_vector_type ("__vector long long", + long_long_integer_type_node, 2); + ptr_V2DI_type_node + = build_pointer_type (build_qualified_type (V2DI_type_node, + TYPE_QUAL_CONST)); + + V2DF_type_node = rs6000_vector_type ("__vector double", double_type_node, 2); + ptr_V2DF_type_node + = build_pointer_type (build_qualified_type (V2DF_type_node, + TYPE_QUAL_CONST)); + + V4SI_type_node = rs6000_vector_type ("__vector signed int", + intSI_type_node, 4); + ptr_V4SI_type_node + = build_pointer_type (build_qualified_type (V4SI_type_node, + TYPE_QUAL_CONST)); + + V4SF_type_node = rs6000_vector_type ("__vector float", float_type_node, 4); + ptr_V4SF_type_node + = build_pointer_type (build_qualified_type (V4SF_type_node, + TYPE_QUAL_CONST)); + + V8HI_type_node = rs6000_vector_type ("__vector signed short", + intHI_type_node, 8); + ptr_V8HI_type_node + = build_pointer_type (build_qualified_type (V8HI_type_node, + TYPE_QUAL_CONST)); + + V16QI_type_node = rs6000_vector_type ("__vector signed char", + intQI_type_node, 16); + ptr_V16QI_type_node + = build_pointer_type (build_qualified_type (V16QI_type_node, + TYPE_QUAL_CONST)); + + unsigned_V16QI_type_node = rs6000_vector_type ("__vector unsigned char", + unsigned_intQI_type_node, 16); + ptr_unsigned_V16QI_type_node + = build_pointer_type (build_qualified_type (unsigned_V16QI_type_node, + TYPE_QUAL_CONST)); + + unsigned_V8HI_type_node = rs6000_vector_type ("__vector unsigned short", + unsigned_intHI_type_node, 8); + ptr_unsigned_V8HI_type_node + = build_pointer_type (build_qualified_type (unsigned_V8HI_type_node, + TYPE_QUAL_CONST)); + + unsigned_V4SI_type_node = rs6000_vector_type ("__vector unsigned int", + unsigned_intSI_type_node, 4); + ptr_unsigned_V4SI_type_node + = build_pointer_type (build_qualified_type (unsigned_V4SI_type_node, + TYPE_QUAL_CONST)); + + unsigned_V2DI_type_node + = rs6000_vector_type ("__vector unsigned long long", + long_long_unsigned_type_node, 2); + + ptr_unsigned_V2DI_type_node + = build_pointer_type (build_qualified_type (unsigned_V2DI_type_node, + TYPE_QUAL_CONST)); + + opaque_V4SI_type_node = build_opaque_vector_type (intSI_type_node, 4); + + const_str_type_node + = build_pointer_type (build_qualified_type (char_type_node, + TYPE_QUAL_CONST)); + + /* We use V1TI mode as a special container to hold __int128_t items that + must live in VSX registers. */ + if (intTI_type_node) + { + V1TI_type_node = rs6000_vector_type ("__vector __int128", + intTI_type_node, 1); + ptr_V1TI_type_node + = build_pointer_type (build_qualified_type (V1TI_type_node, + TYPE_QUAL_CONST)); + unsigned_V1TI_type_node + = rs6000_vector_type ("__vector unsigned __int128", + unsigned_intTI_type_node, 1); + ptr_unsigned_V1TI_type_node + = build_pointer_type (build_qualified_type (unsigned_V1TI_type_node, + TYPE_QUAL_CONST)); + } + + /* The 'vector bool ...' types must be kept distinct from 'vector unsigned ...' + types, especially in C++ land. Similarly, 'vector pixel' is distinct from + 'vector unsigned short'. 
*/ + + bool_char_type_node = build_distinct_type_copy (unsigned_intQI_type_node); + bool_short_type_node = build_distinct_type_copy (unsigned_intHI_type_node); + bool_int_type_node = build_distinct_type_copy (unsigned_intSI_type_node); + bool_long_long_type_node = build_distinct_type_copy (unsigned_intDI_type_node); + pixel_type_node = build_distinct_type_copy (unsigned_intHI_type_node); + + long_integer_type_internal_node = long_integer_type_node; + long_unsigned_type_internal_node = long_unsigned_type_node; + long_long_integer_type_internal_node = long_long_integer_type_node; + long_long_unsigned_type_internal_node = long_long_unsigned_type_node; + intQI_type_internal_node = intQI_type_node; + uintQI_type_internal_node = unsigned_intQI_type_node; + intHI_type_internal_node = intHI_type_node; + uintHI_type_internal_node = unsigned_intHI_type_node; + intSI_type_internal_node = intSI_type_node; + uintSI_type_internal_node = unsigned_intSI_type_node; + intDI_type_internal_node = intDI_type_node; + uintDI_type_internal_node = unsigned_intDI_type_node; + intTI_type_internal_node = intTI_type_node; + uintTI_type_internal_node = unsigned_intTI_type_node; + float_type_internal_node = float_type_node; + double_type_internal_node = double_type_node; + long_double_type_internal_node = long_double_type_node; + dfloat64_type_internal_node = dfloat64_type_node; + dfloat128_type_internal_node = dfloat128_type_node; + void_type_internal_node = void_type_node; + + ptr_intQI_type_node + = build_pointer_type (build_qualified_type (intQI_type_internal_node, + TYPE_QUAL_CONST)); + ptr_uintQI_type_node + = build_pointer_type (build_qualified_type (uintQI_type_internal_node, + TYPE_QUAL_CONST)); + ptr_intHI_type_node + = build_pointer_type (build_qualified_type (intHI_type_internal_node, + TYPE_QUAL_CONST)); + ptr_uintHI_type_node + = build_pointer_type (build_qualified_type (uintHI_type_internal_node, + TYPE_QUAL_CONST)); + ptr_intSI_type_node + = build_pointer_type (build_qualified_type (intSI_type_internal_node, + TYPE_QUAL_CONST)); + ptr_uintSI_type_node + = build_pointer_type (build_qualified_type (uintSI_type_internal_node, + TYPE_QUAL_CONST)); + ptr_intDI_type_node + = build_pointer_type (build_qualified_type (intDI_type_internal_node, + TYPE_QUAL_CONST)); + ptr_uintDI_type_node + = build_pointer_type (build_qualified_type (uintDI_type_internal_node, + TYPE_QUAL_CONST)); + ptr_intTI_type_node + = build_pointer_type (build_qualified_type (intTI_type_internal_node, + TYPE_QUAL_CONST)); + ptr_uintTI_type_node + = build_pointer_type (build_qualified_type (uintTI_type_internal_node, + TYPE_QUAL_CONST)); + + t = build_qualified_type (long_integer_type_internal_node, TYPE_QUAL_CONST); + ptr_long_integer_type_node = build_pointer_type (t); + + t = build_qualified_type (long_unsigned_type_internal_node, TYPE_QUAL_CONST); + ptr_long_unsigned_type_node = build_pointer_type (t); + + ptr_float_type_node + = build_pointer_type (build_qualified_type (float_type_internal_node, + TYPE_QUAL_CONST)); + ptr_double_type_node + = build_pointer_type (build_qualified_type (double_type_internal_node, + TYPE_QUAL_CONST)); + ptr_long_double_type_node + = build_pointer_type (build_qualified_type (long_double_type_internal_node, + TYPE_QUAL_CONST)); + if (dfloat64_type_node) + { + t = build_qualified_type (dfloat64_type_internal_node, TYPE_QUAL_CONST); + ptr_dfloat64_type_node = build_pointer_type (t); + } + else + ptr_dfloat64_type_node = NULL; + + if (dfloat128_type_node) + { + t = build_qualified_type 
(dfloat128_type_internal_node, TYPE_QUAL_CONST); + ptr_dfloat128_type_node = build_pointer_type (t); + } + else + ptr_dfloat128_type_node = NULL; + + t = build_qualified_type (long_long_integer_type_internal_node, + TYPE_QUAL_CONST); + ptr_long_long_integer_type_node = build_pointer_type (t); + + t = build_qualified_type (long_long_unsigned_type_internal_node, + TYPE_QUAL_CONST); + ptr_long_long_unsigned_type_node = build_pointer_type (t); + + /* 128-bit floating point support. KFmode is IEEE 128-bit floating point. + IFmode is the IBM extended 128-bit format that is a pair of doubles. + TFmode will be either IEEE 128-bit floating point or the IBM double-double + format that uses a pair of doubles, depending on the switches and + defaults. + + If we don't support for either 128-bit IBM double double or IEEE 128-bit + floating point, we need make sure the type is non-zero or else self-test + fails during bootstrap. + + Always create __ibm128 as a separate type, even if the current long double + format is IBM extended double. + + For IEEE 128-bit floating point, always create the type __ieee128. If the + user used -mfloat128, rs6000-c.cc will create a define from __float128 to + __ieee128. */ + if (TARGET_FLOAT128_TYPE) + { + if (!TARGET_IEEEQUAD && TARGET_LONG_DOUBLE_128) + ibm128_float_type_node = long_double_type_node; + else + { + ibm128_float_type_node = make_node (REAL_TYPE); + TYPE_PRECISION (ibm128_float_type_node) = 128; + SET_TYPE_MODE (ibm128_float_type_node, IFmode); + layout_type (ibm128_float_type_node); + } + t = build_qualified_type (ibm128_float_type_node, TYPE_QUAL_CONST); + ptr_ibm128_float_type_node = build_pointer_type (t); + lang_hooks.types.register_builtin_type (ibm128_float_type_node, + "__ibm128"); + + if (TARGET_IEEEQUAD && TARGET_LONG_DOUBLE_128) + ieee128_float_type_node = long_double_type_node; + else + ieee128_float_type_node = float128_type_node; + t = build_qualified_type (ieee128_float_type_node, TYPE_QUAL_CONST); + ptr_ieee128_float_type_node = build_pointer_type (t); + lang_hooks.types.register_builtin_type (ieee128_float_type_node, + "__ieee128"); + } + + else + ieee128_float_type_node = ibm128_float_type_node = long_double_type_node; + + /* Vector pair and vector quad support. 
*/ + vector_pair_type_node = make_node (OPAQUE_TYPE); + SET_TYPE_MODE (vector_pair_type_node, OOmode); + TYPE_SIZE (vector_pair_type_node) = bitsize_int (GET_MODE_BITSIZE (OOmode)); + TYPE_PRECISION (vector_pair_type_node) = GET_MODE_BITSIZE (OOmode); + TYPE_SIZE_UNIT (vector_pair_type_node) = size_int (GET_MODE_SIZE (OOmode)); + SET_TYPE_ALIGN (vector_pair_type_node, 256); + TYPE_USER_ALIGN (vector_pair_type_node) = 0; + lang_hooks.types.register_builtin_type (vector_pair_type_node, + "__vector_pair"); + t = build_qualified_type (vector_pair_type_node, TYPE_QUAL_CONST); + ptr_vector_pair_type_node = build_pointer_type (t); + + vector_quad_type_node = make_node (OPAQUE_TYPE); + SET_TYPE_MODE (vector_quad_type_node, XOmode); + TYPE_SIZE (vector_quad_type_node) = bitsize_int (GET_MODE_BITSIZE (XOmode)); + TYPE_PRECISION (vector_quad_type_node) = GET_MODE_BITSIZE (XOmode); + TYPE_SIZE_UNIT (vector_quad_type_node) = size_int (GET_MODE_SIZE (XOmode)); + SET_TYPE_ALIGN (vector_quad_type_node, 512); + TYPE_USER_ALIGN (vector_quad_type_node) = 0; + lang_hooks.types.register_builtin_type (vector_quad_type_node, + "__vector_quad"); + t = build_qualified_type (vector_quad_type_node, TYPE_QUAL_CONST); + ptr_vector_quad_type_node = build_pointer_type (t); + + tdecl = add_builtin_type ("__bool char", bool_char_type_node); + TYPE_NAME (bool_char_type_node) = tdecl; + + tdecl = add_builtin_type ("__bool short", bool_short_type_node); + TYPE_NAME (bool_short_type_node) = tdecl; + + tdecl = add_builtin_type ("__bool int", bool_int_type_node); + TYPE_NAME (bool_int_type_node) = tdecl; + + tdecl = add_builtin_type ("__pixel", pixel_type_node); + TYPE_NAME (pixel_type_node) = tdecl; + + bool_V16QI_type_node = rs6000_vector_type ("__vector __bool char", + bool_char_type_node, 16); + ptr_bool_V16QI_type_node + = build_pointer_type (build_qualified_type (bool_V16QI_type_node, + TYPE_QUAL_CONST)); + + bool_V8HI_type_node = rs6000_vector_type ("__vector __bool short", + bool_short_type_node, 8); + ptr_bool_V8HI_type_node + = build_pointer_type (build_qualified_type (bool_V8HI_type_node, + TYPE_QUAL_CONST)); + + bool_V4SI_type_node = rs6000_vector_type ("__vector __bool int", + bool_int_type_node, 4); + ptr_bool_V4SI_type_node + = build_pointer_type (build_qualified_type (bool_V4SI_type_node, + TYPE_QUAL_CONST)); + + bool_V2DI_type_node = rs6000_vector_type (TARGET_POWERPC64 + ? "__vector __bool long" + : "__vector __bool long long", + bool_long_long_type_node, 2); + ptr_bool_V2DI_type_node + = build_pointer_type (build_qualified_type (bool_V2DI_type_node, + TYPE_QUAL_CONST)); + + bool_V1TI_type_node = rs6000_vector_type ("__vector __bool __int128", + intTI_type_node, 1); + ptr_bool_V1TI_type_node + = build_pointer_type (build_qualified_type (bool_V1TI_type_node, + TYPE_QUAL_CONST)); + + pixel_V8HI_type_node = rs6000_vector_type ("__vector __pixel", + pixel_type_node, 8); + ptr_pixel_V8HI_type_node + = build_pointer_type (build_qualified_type (pixel_V8HI_type_node, + TYPE_QUAL_CONST)); + pcvoid_type_node + = build_pointer_type (build_qualified_type (void_type_node, + TYPE_QUAL_CONST)); + + /* Execute the autogenerated initialization code for builtins. 
*/ + rs6000_init_generated_builtins (); + + if (TARGET_DEBUG_BUILTIN) + { + fprintf (stderr, "\nAutogenerated built-in functions:\n\n"); + for (int i = 1; i < (int) RS6000_BIF_MAX; i++) + { + bif_enable e = rs6000_builtin_info[i].enable; + if (e == ENB_P5 && !TARGET_POPCNTB) + continue; + if (e == ENB_P6 && !TARGET_CMPB) + continue; + if (e == ENB_P6_64 && !(TARGET_CMPB && TARGET_POWERPC64)) + continue; + if (e == ENB_ALTIVEC && !TARGET_ALTIVEC) + continue; + if (e == ENB_VSX && !TARGET_VSX) + continue; + if (e == ENB_P7 && !TARGET_POPCNTD) + continue; + if (e == ENB_P7_64 && !(TARGET_POPCNTD && TARGET_POWERPC64)) + continue; + if (e == ENB_P8 && !TARGET_DIRECT_MOVE) + continue; + if (e == ENB_P8V && !TARGET_P8_VECTOR) + continue; + if (e == ENB_P9 && !TARGET_MODULO) + continue; + if (e == ENB_P9_64 && !(TARGET_MODULO && TARGET_POWERPC64)) + continue; + if (e == ENB_P9V && !TARGET_P9_VECTOR) + continue; + if (e == ENB_IEEE128_HW && !TARGET_FLOAT128_HW) + continue; + if (e == ENB_DFP && !TARGET_DFP) + continue; + if (e == ENB_CRYPTO && !TARGET_CRYPTO) + continue; + if (e == ENB_HTM && !TARGET_HTM) + continue; + if (e == ENB_P10 && !TARGET_POWER10) + continue; + if (e == ENB_P10_64 && !(TARGET_POWER10 && TARGET_POWERPC64)) + continue; + if (e == ENB_MMA && !TARGET_MMA) + continue; + tree fntype = rs6000_builtin_info[i].fntype; + tree t = TREE_TYPE (fntype); + fprintf (stderr, "%s %s (", rs6000_type_string (t), + rs6000_builtin_info[i].bifname); + t = TYPE_ARG_TYPES (fntype); + while (t && TREE_VALUE (t) != void_type_node) + { + fprintf (stderr, "%s", + rs6000_type_string (TREE_VALUE (t))); + t = TREE_CHAIN (t); + if (t && TREE_VALUE (t) != void_type_node) + fprintf (stderr, ", "); + } + fprintf (stderr, "); %s [%4d]\n", + rs6000_builtin_info[i].attr_string, (int) i); + } + fprintf (stderr, "\nEnd autogenerated built-in functions.\n\n\n"); + } + + if (TARGET_XCOFF) + { + /* AIX libm provides clog as __clog. */ + if ((tdecl = builtin_decl_explicit (BUILT_IN_CLOG)) != NULL_TREE) + set_user_assembler_name (tdecl, "__clog"); + + /* When long double is 64 bit, some long double builtins of libc + functions (like __builtin_frexpl) must call the double version + (frexp) not the long double version (frexpl) that expects a 128 bit + argument. */ + if (! TARGET_LONG_DOUBLE_128) + { + if ((tdecl = builtin_decl_explicit (BUILT_IN_FMODL)) != NULL_TREE) + set_user_assembler_name (tdecl, "fmod"); + if ((tdecl = builtin_decl_explicit (BUILT_IN_FREXPL)) != NULL_TREE) + set_user_assembler_name (tdecl, "frexp"); + if ((tdecl = builtin_decl_explicit (BUILT_IN_LDEXPL)) != NULL_TREE) + set_user_assembler_name (tdecl, "ldexp"); + if ((tdecl = builtin_decl_explicit (BUILT_IN_MODFL)) != NULL_TREE) + set_user_assembler_name (tdecl, "modf"); + } + } + + altivec_builtin_mask_for_load + = rs6000_builtin_decls[RS6000_BIF_MASK_FOR_LOAD]; + +#ifdef SUBTARGET_INIT_BUILTINS + SUBTARGET_INIT_BUILTINS; +#endif + + return; +} + +/* **** GIMPLE folding support **** */ + +/* Helper function to handle the gimple folding of a vector compare + operation. This sets up true/false vectors, and uses the + VEC_COND_EXPR operation. + CODE indicates which comparison is to be made. (EQ, GT, ...). + TYPE indicates the type of the result. + Code is inserted before GSI. 
*/ +static tree +fold_build_vec_cmp (tree_code code, tree type, tree arg0, tree arg1, + gimple_stmt_iterator *gsi) +{ + tree cmp_type = truth_type_for (type); + tree zero_vec = build_zero_cst (type); + tree minus_one_vec = build_minus_one_cst (type); + tree temp = create_tmp_reg_or_ssa_name (cmp_type); + gimple *g = gimple_build_assign (temp, code, arg0, arg1); + gsi_insert_before (gsi, g, GSI_SAME_STMT); + return fold_build3 (VEC_COND_EXPR, type, temp, minus_one_vec, zero_vec); +} + +/* Helper function to handle the in-between steps for the + vector compare built-ins. */ +static void +fold_compare_helper (gimple_stmt_iterator *gsi, tree_code code, gimple *stmt) +{ + tree arg0 = gimple_call_arg (stmt, 0); + tree arg1 = gimple_call_arg (stmt, 1); + tree lhs = gimple_call_lhs (stmt); + tree cmp = fold_build_vec_cmp (code, TREE_TYPE (lhs), arg0, arg1, gsi); + gimple *g = gimple_build_assign (lhs, cmp); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); +} + +/* Helper function to map V2DF and V4SF types to their + integral equivalents (V2DI and V4SI). */ +tree map_to_integral_tree_type (tree input_tree_type) +{ + if (INTEGRAL_TYPE_P (TREE_TYPE (input_tree_type))) + return input_tree_type; + else + { + if (types_compatible_p (TREE_TYPE (input_tree_type), + TREE_TYPE (V2DF_type_node))) + return V2DI_type_node; + else if (types_compatible_p (TREE_TYPE (input_tree_type), + TREE_TYPE (V4SF_type_node))) + return V4SI_type_node; + else + gcc_unreachable (); + } +} + +/* Helper function to handle the vector merge[hl] built-ins. The + implementation difference between h and l versions for this code are in + the values used when building of the permute vector for high word versus + low word merge. The variance is keyed off the use_high parameter. */ +static void +fold_mergehl_helper (gimple_stmt_iterator *gsi, gimple *stmt, int use_high) +{ + tree arg0 = gimple_call_arg (stmt, 0); + tree arg1 = gimple_call_arg (stmt, 1); + tree lhs = gimple_call_lhs (stmt); + tree lhs_type = TREE_TYPE (lhs); + int n_elts = TYPE_VECTOR_SUBPARTS (lhs_type); + int midpoint = n_elts / 2; + int offset = 0; + + if (use_high == 1) + offset = midpoint; + + /* The permute_type will match the lhs for integral types. For double and + float types, the permute type needs to map to the V2 or V4 type that + matches size. */ + tree permute_type; + permute_type = map_to_integral_tree_type (lhs_type); + tree_vector_builder elts (permute_type, VECTOR_CST_NELTS (arg0), 1); + + for (int i = 0; i < midpoint; i++) + { + elts.safe_push (build_int_cst (TREE_TYPE (permute_type), + offset + i)); + elts.safe_push (build_int_cst (TREE_TYPE (permute_type), + offset + n_elts + i)); + } + + tree permute = elts.build (); + + gimple *g = gimple_build_assign (lhs, VEC_PERM_EXPR, arg0, arg1, permute); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); +} + +/* Helper function to handle the vector merge[eo] built-ins. */ +static void +fold_mergeeo_helper (gimple_stmt_iterator *gsi, gimple *stmt, int use_odd) +{ + tree arg0 = gimple_call_arg (stmt, 0); + tree arg1 = gimple_call_arg (stmt, 1); + tree lhs = gimple_call_lhs (stmt); + tree lhs_type = TREE_TYPE (lhs); + int n_elts = TYPE_VECTOR_SUBPARTS (lhs_type); + + /* The permute_type will match the lhs for integral types. For double and + float types, the permute type needs to map to the V2 or V4 type that + matches size. 
*/ + tree permute_type; + permute_type = map_to_integral_tree_type (lhs_type); + + tree_vector_builder elts (permute_type, VECTOR_CST_NELTS (arg0), 1); + + /* Build the permute vector. */ + for (int i = 0; i < n_elts / 2; i++) + { + elts.safe_push (build_int_cst (TREE_TYPE (permute_type), + 2*i + use_odd)); + elts.safe_push (build_int_cst (TREE_TYPE (permute_type), + 2*i + use_odd + n_elts)); + } + + tree permute = elts.build (); + + gimple *g = gimple_build_assign (lhs, VEC_PERM_EXPR, arg0, arg1, permute); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); +} + +/* Helper function to sort out which built-ins may be valid without having + a LHS. */ +static bool +rs6000_builtin_valid_without_lhs (enum rs6000_gen_builtins fn_code, + tree fndecl) +{ + if (TREE_TYPE (TREE_TYPE (fndecl)) == void_type_node) + return true; + + switch (fn_code) + { + case RS6000_BIF_STVX_V16QI: + case RS6000_BIF_STVX_V8HI: + case RS6000_BIF_STVX_V4SI: + case RS6000_BIF_STVX_V4SF: + case RS6000_BIF_STVX_V2DI: + case RS6000_BIF_STVX_V2DF: + case RS6000_BIF_STXVW4X_V16QI: + case RS6000_BIF_STXVW4X_V8HI: + case RS6000_BIF_STXVW4X_V4SF: + case RS6000_BIF_STXVW4X_V4SI: + case RS6000_BIF_STXVD2X_V2DF: + case RS6000_BIF_STXVD2X_V2DI: + return true; + default: + return false; + } +} + +/* Expand the MMA built-ins early, so that we can convert the pass-by-reference + __vector_quad arguments into pass-by-value arguments, leading to more + efficient code generation. */ +static bool +rs6000_gimple_fold_mma_builtin (gimple_stmt_iterator *gsi, + rs6000_gen_builtins fn_code) +{ + gimple *stmt = gsi_stmt (*gsi); + size_t fncode = (size_t) fn_code; + + if (!bif_is_mma (rs6000_builtin_info[fncode])) + return false; + + /* Each call that can be gimple-expanded has an associated built-in + function that it will expand into. If this one doesn't, we have + already expanded it! Exceptions: lxvp and stxvp. */ + if (rs6000_builtin_info[fncode].assoc_bif == RS6000_BIF_NONE + && fncode != RS6000_BIF_LXVP + && fncode != RS6000_BIF_STXVP) + return false; + + bifdata *bd = &rs6000_builtin_info[fncode]; + unsigned nopnds = bd->nargs; + gimple_seq new_seq = NULL; + gimple *new_call; + tree new_decl; + + /* Compatibility built-ins; we used to call these + __builtin_mma_{dis,}assemble_pair, but now we call them + __builtin_vsx_{dis,}assemble_pair. Handle the old versions. */ + if (fncode == RS6000_BIF_ASSEMBLE_PAIR) + fncode = RS6000_BIF_ASSEMBLE_PAIR_V; + else if (fncode == RS6000_BIF_DISASSEMBLE_PAIR) + fncode = RS6000_BIF_DISASSEMBLE_PAIR_V; + + if (fncode == RS6000_BIF_DISASSEMBLE_ACC + || fncode == RS6000_BIF_DISASSEMBLE_PAIR_V) + { + /* This is an MMA disassemble built-in function. */ + push_gimplify_context (true); + unsigned nvec = (fncode == RS6000_BIF_DISASSEMBLE_ACC) ? 4 : 2; + tree dst_ptr = gimple_call_arg (stmt, 0); + tree src_ptr = gimple_call_arg (stmt, 1); + tree src_type = TREE_TYPE (src_ptr); + tree src = create_tmp_reg_or_ssa_name (TREE_TYPE (src_type)); + gimplify_assign (src, build_simple_mem_ref (src_ptr), &new_seq); + + /* If we are not disassembling an accumulator/pair or our destination is + another accumulator/pair, then just copy the entire thing as is. 
*/ + if ((fncode == RS6000_BIF_DISASSEMBLE_ACC + && TREE_TYPE (TREE_TYPE (dst_ptr)) == vector_quad_type_node) + || (fncode == RS6000_BIF_DISASSEMBLE_PAIR_V + && TREE_TYPE (TREE_TYPE (dst_ptr)) == vector_pair_type_node)) + { + tree dst = build_simple_mem_ref (build1 (VIEW_CONVERT_EXPR, + src_type, dst_ptr)); + gimplify_assign (dst, src, &new_seq); + pop_gimplify_context (NULL); + gsi_replace_with_seq (gsi, new_seq, true); + return true; + } + + /* If we're disassembling an accumulator into a different type, we need + to emit a xxmfacc instruction now, since we cannot do it later. */ + if (fncode == RS6000_BIF_DISASSEMBLE_ACC) + { + new_decl = rs6000_builtin_decls[RS6000_BIF_XXMFACC_INTERNAL]; + new_call = gimple_build_call (new_decl, 1, src); + src = create_tmp_reg_or_ssa_name (vector_quad_type_node); + gimple_call_set_lhs (new_call, src); + gimple_seq_add_stmt (&new_seq, new_call); + } + + /* Copy the accumulator/pair vector by vector. */ + new_decl + = rs6000_builtin_decls[rs6000_builtin_info[fncode].assoc_bif]; + tree dst_type = build_pointer_type_for_mode (unsigned_V16QI_type_node, + ptr_mode, true); + tree dst_base = build1 (VIEW_CONVERT_EXPR, dst_type, dst_ptr); + for (unsigned i = 0; i < nvec; i++) + { + unsigned index = WORDS_BIG_ENDIAN ? i : nvec - 1 - i; + tree dst = build2 (MEM_REF, unsigned_V16QI_type_node, dst_base, + build_int_cst (dst_type, index * 16)); + tree dstssa = create_tmp_reg_or_ssa_name (unsigned_V16QI_type_node); + new_call = gimple_build_call (new_decl, 2, src, + build_int_cstu (uint16_type_node, i)); + gimple_call_set_lhs (new_call, dstssa); + gimple_seq_add_stmt (&new_seq, new_call); + gimplify_assign (dst, dstssa, &new_seq); + } + pop_gimplify_context (NULL); + gsi_replace_with_seq (gsi, new_seq, true); + return true; + } + + /* TODO: Do some factoring on these two chunks. */ + if (fncode == RS6000_BIF_LXVP) + { + push_gimplify_context (true); + tree offset = gimple_call_arg (stmt, 0); + tree ptr = gimple_call_arg (stmt, 1); + tree lhs = gimple_call_lhs (stmt); + if (TREE_TYPE (TREE_TYPE (ptr)) != vector_pair_type_node) + ptr = build1 (VIEW_CONVERT_EXPR, + build_pointer_type (vector_pair_type_node), ptr); + tree mem = build_simple_mem_ref (build2 (POINTER_PLUS_EXPR, + TREE_TYPE (ptr), ptr, offset)); + gimplify_assign (lhs, mem, &new_seq); + pop_gimplify_context (NULL); + gsi_replace_with_seq (gsi, new_seq, true); + return true; + } + + if (fncode == RS6000_BIF_STXVP) + { + push_gimplify_context (true); + tree src = gimple_call_arg (stmt, 0); + tree offset = gimple_call_arg (stmt, 1); + tree ptr = gimple_call_arg (stmt, 2); + if (TREE_TYPE (TREE_TYPE (ptr)) != vector_pair_type_node) + ptr = build1 (VIEW_CONVERT_EXPR, + build_pointer_type (vector_pair_type_node), ptr); + tree mem = build_simple_mem_ref (build2 (POINTER_PLUS_EXPR, + TREE_TYPE (ptr), ptr, offset)); + gimplify_assign (mem, src, &new_seq); + pop_gimplify_context (NULL); + gsi_replace_with_seq (gsi, new_seq, true); + return true; + } + + /* Convert this built-in into an internal version that uses pass-by-value + arguments. The internal built-in is found in the assoc_bif field. */ + new_decl = rs6000_builtin_decls[rs6000_builtin_info[fncode].assoc_bif]; + tree lhs, op[MAX_MMA_OPERANDS]; + tree acc = gimple_call_arg (stmt, 0); + push_gimplify_context (true); + + if (bif_is_quad (*bd)) + { + /* This built-in has a pass-by-reference accumulator input, so load it + into a temporary accumulator for use as a pass-by-value input. 
*/ + op[0] = create_tmp_reg_or_ssa_name (vector_quad_type_node); + for (unsigned i = 1; i < nopnds; i++) + op[i] = gimple_call_arg (stmt, i); + gimplify_assign (op[0], build_simple_mem_ref (acc), &new_seq); + } + else + { + /* This built-in does not use its pass-by-reference accumulator argument + as an input argument, so remove it from the input list. */ + nopnds--; + for (unsigned i = 0; i < nopnds; i++) + op[i] = gimple_call_arg (stmt, i + 1); + } + + switch (nopnds) + { + case 0: + new_call = gimple_build_call (new_decl, 0); + break; + case 1: + new_call = gimple_build_call (new_decl, 1, op[0]); + break; + case 2: + new_call = gimple_build_call (new_decl, 2, op[0], op[1]); + break; + case 3: + new_call = gimple_build_call (new_decl, 3, op[0], op[1], op[2]); + break; + case 4: + new_call = gimple_build_call (new_decl, 4, op[0], op[1], op[2], op[3]); + break; + case 5: + new_call = gimple_build_call (new_decl, 5, op[0], op[1], op[2], op[3], + op[4]); + break; + case 6: + new_call = gimple_build_call (new_decl, 6, op[0], op[1], op[2], op[3], + op[4], op[5]); + break; + case 7: + new_call = gimple_build_call (new_decl, 7, op[0], op[1], op[2], op[3], + op[4], op[5], op[6]); + break; + default: + gcc_unreachable (); + } + + if (fncode == RS6000_BIF_BUILD_PAIR || fncode == RS6000_BIF_ASSEMBLE_PAIR_V) + lhs = create_tmp_reg_or_ssa_name (vector_pair_type_node); + else + lhs = create_tmp_reg_or_ssa_name (vector_quad_type_node); + gimple_call_set_lhs (new_call, lhs); + gimple_seq_add_stmt (&new_seq, new_call); + gimplify_assign (build_simple_mem_ref (acc), lhs, &new_seq); + pop_gimplify_context (NULL); + gsi_replace_with_seq (gsi, new_seq, true); + + return true; +} + +/* Fold a machine-dependent built-in in GIMPLE. (For folding into + a constant, use rs6000_fold_builtin.) */ +bool +rs6000_gimple_fold_builtin (gimple_stmt_iterator *gsi) +{ + gimple *stmt = gsi_stmt (*gsi); + tree fndecl = gimple_call_fndecl (stmt); + gcc_checking_assert (fndecl && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD); + enum rs6000_gen_builtins fn_code + = (enum rs6000_gen_builtins) DECL_MD_FUNCTION_CODE (fndecl); + tree arg0, arg1, lhs, temp; + enum tree_code bcode; + gimple *g; + + size_t uns_fncode = (size_t) fn_code; + enum insn_code icode = rs6000_builtin_info[uns_fncode].icode; + const char *fn_name1 = rs6000_builtin_info[uns_fncode].bifname; + const char *fn_name2 = (icode != CODE_FOR_nothing) + ? get_insn_name ((int) icode) + : "nothing"; + + if (TARGET_DEBUG_BUILTIN) + fprintf (stderr, "rs6000_gimple_fold_builtin %d %s %s\n", + fn_code, fn_name1, fn_name2); + + if (!rs6000_fold_gimple) + return false; + + /* Prevent gimple folding for code that does not have a LHS, unless it is + allowed per the rs6000_builtin_valid_without_lhs helper function. */ + if (!gimple_call_lhs (stmt) + && !rs6000_builtin_valid_without_lhs (fn_code, fndecl)) + return false; + + /* Don't fold invalid builtins, let rs6000_expand_builtin diagnose it. */ + if (!rs6000_builtin_is_supported (fn_code)) + return false; + + if (rs6000_gimple_fold_mma_builtin (gsi, fn_code)) + return true; + + switch (fn_code) + { + /* Flavors of vec_add. We deliberately don't expand + RS6000_BIF_VADDUQM as it gets lowered from V1TImode to + TImode, resulting in much poorer code generation. 
*/ + case RS6000_BIF_VADDUBM: + case RS6000_BIF_VADDUHM: + case RS6000_BIF_VADDUWM: + case RS6000_BIF_VADDUDM: + case RS6000_BIF_VADDFP: + case RS6000_BIF_XVADDDP: + case RS6000_BIF_XVADDSP: + bcode = PLUS_EXPR; + do_binary: + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + lhs = gimple_call_lhs (stmt); + if (INTEGRAL_TYPE_P (TREE_TYPE (TREE_TYPE (lhs))) + && !TYPE_OVERFLOW_WRAPS (TREE_TYPE (TREE_TYPE (lhs)))) + { + /* Ensure the binary operation is performed in a type + that wraps if it is integral type. */ + gimple_seq stmts = NULL; + tree type = unsigned_type_for (TREE_TYPE (lhs)); + tree uarg0 = gimple_build (&stmts, VIEW_CONVERT_EXPR, + type, arg0); + tree uarg1 = gimple_build (&stmts, VIEW_CONVERT_EXPR, + type, arg1); + tree res = gimple_build (&stmts, gimple_location (stmt), bcode, + type, uarg0, uarg1); + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); + g = gimple_build_assign (lhs, VIEW_CONVERT_EXPR, + build1 (VIEW_CONVERT_EXPR, + TREE_TYPE (lhs), res)); + gsi_replace (gsi, g, true); + return true; + } + g = gimple_build_assign (lhs, bcode, arg0, arg1); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + /* Flavors of vec_sub. We deliberately don't expand + RS6000_BIF_VSUBUQM. */ + case RS6000_BIF_VSUBUBM: + case RS6000_BIF_VSUBUHM: + case RS6000_BIF_VSUBUWM: + case RS6000_BIF_VSUBUDM: + case RS6000_BIF_VSUBFP: + case RS6000_BIF_XVSUBDP: + case RS6000_BIF_XVSUBSP: + bcode = MINUS_EXPR; + goto do_binary; + case RS6000_BIF_XVMULSP: + case RS6000_BIF_XVMULDP: + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + lhs = gimple_call_lhs (stmt); + g = gimple_build_assign (lhs, MULT_EXPR, arg0, arg1); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + /* Even element flavors of vec_mul (signed). */ + case RS6000_BIF_VMULESB: + case RS6000_BIF_VMULESH: + case RS6000_BIF_VMULESW: + /* Even element flavors of vec_mul (unsigned). */ + case RS6000_BIF_VMULEUB: + case RS6000_BIF_VMULEUH: + case RS6000_BIF_VMULEUW: + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + lhs = gimple_call_lhs (stmt); + g = gimple_build_assign (lhs, VEC_WIDEN_MULT_EVEN_EXPR, arg0, arg1); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + /* Odd element flavors of vec_mul (signed). */ + case RS6000_BIF_VMULOSB: + case RS6000_BIF_VMULOSH: + case RS6000_BIF_VMULOSW: + /* Odd element flavors of vec_mul (unsigned). */ + case RS6000_BIF_VMULOUB: + case RS6000_BIF_VMULOUH: + case RS6000_BIF_VMULOUW: + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + lhs = gimple_call_lhs (stmt); + g = gimple_build_assign (lhs, VEC_WIDEN_MULT_ODD_EXPR, arg0, arg1); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + /* Flavors of vec_div (Integer). */ + case RS6000_BIF_DIV_V2DI: + case RS6000_BIF_UDIV_V2DI: + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + lhs = gimple_call_lhs (stmt); + g = gimple_build_assign (lhs, TRUNC_DIV_EXPR, arg0, arg1); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + /* Flavors of vec_div (Float). 
*/ + case RS6000_BIF_XVDIVSP: + case RS6000_BIF_XVDIVDP: + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + lhs = gimple_call_lhs (stmt); + g = gimple_build_assign (lhs, RDIV_EXPR, arg0, arg1); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + /* Flavors of vec_and. */ + case RS6000_BIF_VAND_V16QI_UNS: + case RS6000_BIF_VAND_V16QI: + case RS6000_BIF_VAND_V8HI_UNS: + case RS6000_BIF_VAND_V8HI: + case RS6000_BIF_VAND_V4SI_UNS: + case RS6000_BIF_VAND_V4SI: + case RS6000_BIF_VAND_V2DI_UNS: + case RS6000_BIF_VAND_V2DI: + case RS6000_BIF_VAND_V4SF: + case RS6000_BIF_VAND_V2DF: + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + lhs = gimple_call_lhs (stmt); + g = gimple_build_assign (lhs, BIT_AND_EXPR, arg0, arg1); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + /* Flavors of vec_andc. */ + case RS6000_BIF_VANDC_V16QI_UNS: + case RS6000_BIF_VANDC_V16QI: + case RS6000_BIF_VANDC_V8HI_UNS: + case RS6000_BIF_VANDC_V8HI: + case RS6000_BIF_VANDC_V4SI_UNS: + case RS6000_BIF_VANDC_V4SI: + case RS6000_BIF_VANDC_V2DI_UNS: + case RS6000_BIF_VANDC_V2DI: + case RS6000_BIF_VANDC_V4SF: + case RS6000_BIF_VANDC_V2DF: + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + lhs = gimple_call_lhs (stmt); + temp = create_tmp_reg_or_ssa_name (TREE_TYPE (arg1)); + g = gimple_build_assign (temp, BIT_NOT_EXPR, arg1); + gimple_set_location (g, gimple_location (stmt)); + gsi_insert_before (gsi, g, GSI_SAME_STMT); + g = gimple_build_assign (lhs, BIT_AND_EXPR, arg0, temp); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + /* Flavors of vec_nand. */ + case RS6000_BIF_NAND_V16QI_UNS: + case RS6000_BIF_NAND_V16QI: + case RS6000_BIF_NAND_V8HI_UNS: + case RS6000_BIF_NAND_V8HI: + case RS6000_BIF_NAND_V4SI_UNS: + case RS6000_BIF_NAND_V4SI: + case RS6000_BIF_NAND_V2DI_UNS: + case RS6000_BIF_NAND_V2DI: + case RS6000_BIF_NAND_V4SF: + case RS6000_BIF_NAND_V2DF: + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + lhs = gimple_call_lhs (stmt); + temp = create_tmp_reg_or_ssa_name (TREE_TYPE (arg1)); + g = gimple_build_assign (temp, BIT_AND_EXPR, arg0, arg1); + gimple_set_location (g, gimple_location (stmt)); + gsi_insert_before (gsi, g, GSI_SAME_STMT); + g = gimple_build_assign (lhs, BIT_NOT_EXPR, temp); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + /* Flavors of vec_or. */ + case RS6000_BIF_VOR_V16QI_UNS: + case RS6000_BIF_VOR_V16QI: + case RS6000_BIF_VOR_V8HI_UNS: + case RS6000_BIF_VOR_V8HI: + case RS6000_BIF_VOR_V4SI_UNS: + case RS6000_BIF_VOR_V4SI: + case RS6000_BIF_VOR_V2DI_UNS: + case RS6000_BIF_VOR_V2DI: + case RS6000_BIF_VOR_V4SF: + case RS6000_BIF_VOR_V2DF: + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + lhs = gimple_call_lhs (stmt); + g = gimple_build_assign (lhs, BIT_IOR_EXPR, arg0, arg1); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + /* flavors of vec_orc. 
*/ + case RS6000_BIF_ORC_V16QI_UNS: + case RS6000_BIF_ORC_V16QI: + case RS6000_BIF_ORC_V8HI_UNS: + case RS6000_BIF_ORC_V8HI: + case RS6000_BIF_ORC_V4SI_UNS: + case RS6000_BIF_ORC_V4SI: + case RS6000_BIF_ORC_V2DI_UNS: + case RS6000_BIF_ORC_V2DI: + case RS6000_BIF_ORC_V4SF: + case RS6000_BIF_ORC_V2DF: + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + lhs = gimple_call_lhs (stmt); + temp = create_tmp_reg_or_ssa_name (TREE_TYPE (arg1)); + g = gimple_build_assign (temp, BIT_NOT_EXPR, arg1); + gimple_set_location (g, gimple_location (stmt)); + gsi_insert_before (gsi, g, GSI_SAME_STMT); + g = gimple_build_assign (lhs, BIT_IOR_EXPR, arg0, temp); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + /* Flavors of vec_xor. */ + case RS6000_BIF_VXOR_V16QI_UNS: + case RS6000_BIF_VXOR_V16QI: + case RS6000_BIF_VXOR_V8HI_UNS: + case RS6000_BIF_VXOR_V8HI: + case RS6000_BIF_VXOR_V4SI_UNS: + case RS6000_BIF_VXOR_V4SI: + case RS6000_BIF_VXOR_V2DI_UNS: + case RS6000_BIF_VXOR_V2DI: + case RS6000_BIF_VXOR_V4SF: + case RS6000_BIF_VXOR_V2DF: + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + lhs = gimple_call_lhs (stmt); + g = gimple_build_assign (lhs, BIT_XOR_EXPR, arg0, arg1); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + /* Flavors of vec_nor. */ + case RS6000_BIF_VNOR_V16QI_UNS: + case RS6000_BIF_VNOR_V16QI: + case RS6000_BIF_VNOR_V8HI_UNS: + case RS6000_BIF_VNOR_V8HI: + case RS6000_BIF_VNOR_V4SI_UNS: + case RS6000_BIF_VNOR_V4SI: + case RS6000_BIF_VNOR_V2DI_UNS: + case RS6000_BIF_VNOR_V2DI: + case RS6000_BIF_VNOR_V4SF: + case RS6000_BIF_VNOR_V2DF: + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + lhs = gimple_call_lhs (stmt); + temp = create_tmp_reg_or_ssa_name (TREE_TYPE (arg1)); + g = gimple_build_assign (temp, BIT_IOR_EXPR, arg0, arg1); + gimple_set_location (g, gimple_location (stmt)); + gsi_insert_before (gsi, g, GSI_SAME_STMT); + g = gimple_build_assign (lhs, BIT_NOT_EXPR, temp); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + /* flavors of vec_abs. */ + case RS6000_BIF_ABS_V16QI: + case RS6000_BIF_ABS_V8HI: + case RS6000_BIF_ABS_V4SI: + case RS6000_BIF_ABS_V4SF: + case RS6000_BIF_ABS_V2DI: + case RS6000_BIF_XVABSDP: + case RS6000_BIF_XVABSSP: + arg0 = gimple_call_arg (stmt, 0); + if (INTEGRAL_TYPE_P (TREE_TYPE (TREE_TYPE (arg0))) + && !TYPE_OVERFLOW_WRAPS (TREE_TYPE (TREE_TYPE (arg0)))) + return false; + lhs = gimple_call_lhs (stmt); + g = gimple_build_assign (lhs, ABS_EXPR, arg0); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + /* flavors of vec_min. */ + case RS6000_BIF_XVMINDP: + case RS6000_BIF_XVMINSP: + case RS6000_BIF_VMINFP: + { + lhs = gimple_call_lhs (stmt); + tree type = TREE_TYPE (lhs); + if (HONOR_NANS (type)) + return false; + gcc_fallthrough (); + } + case RS6000_BIF_VMINSD: + case RS6000_BIF_VMINUD: + case RS6000_BIF_VMINSB: + case RS6000_BIF_VMINSH: + case RS6000_BIF_VMINSW: + case RS6000_BIF_VMINUB: + case RS6000_BIF_VMINUH: + case RS6000_BIF_VMINUW: + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + lhs = gimple_call_lhs (stmt); + g = gimple_build_assign (lhs, MIN_EXPR, arg0, arg1); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + /* flavors of vec_max. 
*/ + case RS6000_BIF_XVMAXDP: + case RS6000_BIF_XVMAXSP: + case RS6000_BIF_VMAXFP: + { + lhs = gimple_call_lhs (stmt); + tree type = TREE_TYPE (lhs); + if (HONOR_NANS (type)) + return false; + gcc_fallthrough (); + } + case RS6000_BIF_VMAXSD: + case RS6000_BIF_VMAXUD: + case RS6000_BIF_VMAXSB: + case RS6000_BIF_VMAXSH: + case RS6000_BIF_VMAXSW: + case RS6000_BIF_VMAXUB: + case RS6000_BIF_VMAXUH: + case RS6000_BIF_VMAXUW: + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + lhs = gimple_call_lhs (stmt); + g = gimple_build_assign (lhs, MAX_EXPR, arg0, arg1); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + /* Flavors of vec_eqv. */ + case RS6000_BIF_EQV_V16QI: + case RS6000_BIF_EQV_V8HI: + case RS6000_BIF_EQV_V4SI: + case RS6000_BIF_EQV_V4SF: + case RS6000_BIF_EQV_V2DF: + case RS6000_BIF_EQV_V2DI: + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + lhs = gimple_call_lhs (stmt); + temp = create_tmp_reg_or_ssa_name (TREE_TYPE (arg1)); + g = gimple_build_assign (temp, BIT_XOR_EXPR, arg0, arg1); + gimple_set_location (g, gimple_location (stmt)); + gsi_insert_before (gsi, g, GSI_SAME_STMT); + g = gimple_build_assign (lhs, BIT_NOT_EXPR, temp); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + /* Flavors of vec_rotate_left. */ + case RS6000_BIF_VRLB: + case RS6000_BIF_VRLH: + case RS6000_BIF_VRLW: + case RS6000_BIF_VRLD: + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + lhs = gimple_call_lhs (stmt); + g = gimple_build_assign (lhs, LROTATE_EXPR, arg0, arg1); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + /* Flavors of vector shift right algebraic. + vec_sra{b,h,w} -> vsra{b,h,w}. */ + case RS6000_BIF_VSRAB: + case RS6000_BIF_VSRAH: + case RS6000_BIF_VSRAW: + case RS6000_BIF_VSRAD: + { + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + lhs = gimple_call_lhs (stmt); + tree arg1_type = TREE_TYPE (arg1); + tree unsigned_arg1_type = unsigned_type_for (TREE_TYPE (arg1)); + tree unsigned_element_type = unsigned_type_for (TREE_TYPE (arg1_type)); + location_t loc = gimple_location (stmt); + /* Force arg1 into the range valid matching the arg0 type. */ + /* Build a vector consisting of the max valid bit-size values. */ + int n_elts = VECTOR_CST_NELTS (arg1); + tree element_size = build_int_cst (unsigned_element_type, + 128 / n_elts); + tree_vector_builder elts (unsigned_arg1_type, n_elts, 1); + for (int i = 0; i < n_elts; i++) + elts.safe_push (element_size); + tree modulo_tree = elts.build (); + /* Modulo the provided shift value against that vector. */ + gimple_seq stmts = NULL; + tree unsigned_arg1 = gimple_build (&stmts, VIEW_CONVERT_EXPR, + unsigned_arg1_type, arg1); + tree new_arg1 = gimple_build (&stmts, loc, TRUNC_MOD_EXPR, + unsigned_arg1_type, unsigned_arg1, + modulo_tree); + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); + /* And finally, do the shift. */ + g = gimple_build_assign (lhs, RSHIFT_EXPR, arg0, new_arg1); + gimple_set_location (g, loc); + gsi_replace (gsi, g, true); + return true; + } + /* Flavors of vector shift left. + builtin_altivec_vsl{b,h,w} -> vsl{b,h,w}. 
*/ + case RS6000_BIF_VSLB: + case RS6000_BIF_VSLH: + case RS6000_BIF_VSLW: + case RS6000_BIF_VSLD: + { + location_t loc; + gimple_seq stmts = NULL; + arg0 = gimple_call_arg (stmt, 0); + tree arg0_type = TREE_TYPE (arg0); + if (INTEGRAL_TYPE_P (TREE_TYPE (arg0_type)) + && !TYPE_OVERFLOW_WRAPS (TREE_TYPE (arg0_type))) + return false; + arg1 = gimple_call_arg (stmt, 1); + tree arg1_type = TREE_TYPE (arg1); + tree unsigned_arg1_type = unsigned_type_for (TREE_TYPE (arg1)); + tree unsigned_element_type = unsigned_type_for (TREE_TYPE (arg1_type)); + loc = gimple_location (stmt); + lhs = gimple_call_lhs (stmt); + /* Force arg1 into the range valid matching the arg0 type. */ + /* Build a vector consisting of the max valid bit-size values. */ + int n_elts = VECTOR_CST_NELTS (arg1); + int tree_size_in_bits = TREE_INT_CST_LOW (size_in_bytes (arg1_type)) + * BITS_PER_UNIT; + tree element_size = build_int_cst (unsigned_element_type, + tree_size_in_bits / n_elts); + tree_vector_builder elts (unsigned_type_for (arg1_type), n_elts, 1); + for (int i = 0; i < n_elts; i++) + elts.safe_push (element_size); + tree modulo_tree = elts.build (); + /* Modulo the provided shift value against that vector. */ + tree unsigned_arg1 = gimple_build (&stmts, VIEW_CONVERT_EXPR, + unsigned_arg1_type, arg1); + tree new_arg1 = gimple_build (&stmts, loc, TRUNC_MOD_EXPR, + unsigned_arg1_type, unsigned_arg1, + modulo_tree); + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); + /* And finally, do the shift. */ + g = gimple_build_assign (lhs, LSHIFT_EXPR, arg0, new_arg1); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + } + /* Flavors of vector shift right. */ + case RS6000_BIF_VSRB: + case RS6000_BIF_VSRH: + case RS6000_BIF_VSRW: + case RS6000_BIF_VSRD: + { + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + lhs = gimple_call_lhs (stmt); + tree arg1_type = TREE_TYPE (arg1); + tree unsigned_arg1_type = unsigned_type_for (TREE_TYPE (arg1)); + tree unsigned_element_type = unsigned_type_for (TREE_TYPE (arg1_type)); + location_t loc = gimple_location (stmt); + gimple_seq stmts = NULL; + /* Convert arg0 to unsigned. */ + tree arg0_unsigned + = gimple_build (&stmts, VIEW_CONVERT_EXPR, + unsigned_type_for (TREE_TYPE (arg0)), arg0); + /* Force arg1 into the range valid matching the arg0 type. */ + /* Build a vector consisting of the max valid bit-size values. */ + int n_elts = VECTOR_CST_NELTS (arg1); + tree element_size = build_int_cst (unsigned_element_type, + 128 / n_elts); + tree_vector_builder elts (unsigned_arg1_type, n_elts, 1); + for (int i = 0; i < n_elts; i++) + elts.safe_push (element_size); + tree modulo_tree = elts.build (); + /* Modulo the provided shift value against that vector. */ + tree unsigned_arg1 = gimple_build (&stmts, VIEW_CONVERT_EXPR, + unsigned_arg1_type, arg1); + tree new_arg1 = gimple_build (&stmts, loc, TRUNC_MOD_EXPR, + unsigned_arg1_type, unsigned_arg1, + modulo_tree); + /* Do the shift. */ + tree res + = gimple_build (&stmts, RSHIFT_EXPR, + TREE_TYPE (arg0_unsigned), arg0_unsigned, new_arg1); + /* Convert result back to the lhs type. */ + res = gimple_build (&stmts, VIEW_CONVERT_EXPR, TREE_TYPE (lhs), res); + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); + replace_call_with_value (gsi, res); + return true; + } + /* Vector loads. 
*/ + case RS6000_BIF_LVX_V16QI: + case RS6000_BIF_LVX_V8HI: + case RS6000_BIF_LVX_V4SI: + case RS6000_BIF_LVX_V4SF: + case RS6000_BIF_LVX_V2DI: + case RS6000_BIF_LVX_V2DF: + case RS6000_BIF_LVX_V1TI: + { + arg0 = gimple_call_arg (stmt, 0); // offset + arg1 = gimple_call_arg (stmt, 1); // address + lhs = gimple_call_lhs (stmt); + location_t loc = gimple_location (stmt); + /* Since arg1 may be cast to a different type, just use ptr_type_node + here instead of trying to enforce TBAA on pointer types. */ + tree arg1_type = ptr_type_node; + tree lhs_type = TREE_TYPE (lhs); + /* POINTER_PLUS_EXPR wants the offset to be of type 'sizetype'. Create + the tree using the value from arg0. The resulting type will match + the type of arg1. */ + gimple_seq stmts = NULL; + tree temp_offset = gimple_convert (&stmts, loc, sizetype, arg0); + tree temp_addr = gimple_build (&stmts, loc, POINTER_PLUS_EXPR, + arg1_type, arg1, temp_offset); + /* Mask off any lower bits from the address. */ + tree aligned_addr = gimple_build (&stmts, loc, BIT_AND_EXPR, + arg1_type, temp_addr, + build_int_cst (arg1_type, -16)); + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); + if (!is_gimple_mem_ref_addr (aligned_addr)) + { + tree t = make_ssa_name (TREE_TYPE (aligned_addr)); + gimple *g = gimple_build_assign (t, aligned_addr); + gsi_insert_before (gsi, g, GSI_SAME_STMT); + aligned_addr = t; + } + /* Use the build2 helper to set up the mem_ref. The MEM_REF could also + take an offset, but since we've already incorporated the offset + above, here we just pass in a zero. */ + gimple *g + = gimple_build_assign (lhs, build2 (MEM_REF, lhs_type, aligned_addr, + build_int_cst (arg1_type, 0))); + gimple_set_location (g, loc); + gsi_replace (gsi, g, true); + return true; + } + /* Vector stores. */ + case RS6000_BIF_STVX_V16QI: + case RS6000_BIF_STVX_V8HI: + case RS6000_BIF_STVX_V4SI: + case RS6000_BIF_STVX_V4SF: + case RS6000_BIF_STVX_V2DI: + case RS6000_BIF_STVX_V2DF: + { + arg0 = gimple_call_arg (stmt, 0); /* Value to be stored. */ + arg1 = gimple_call_arg (stmt, 1); /* Offset. */ + tree arg2 = gimple_call_arg (stmt, 2); /* Store-to address. */ + location_t loc = gimple_location (stmt); + tree arg0_type = TREE_TYPE (arg0); + /* Use ptr_type_node (no TBAA) for the arg2_type. + FIXME: (Richard) "A proper fix would be to transition this type as + seen from the frontend to GIMPLE, for example in a similar way we + do for MEM_REFs by piggy-backing that on an extra argument, a + constant zero pointer of the alias pointer type to use (which would + also serve as a type indicator of the store itself). I'd use a + target specific internal function for this (not sure if we can have + those target specific, but I guess if it's folded away then that's + fine) and get away with the overload set." */ + tree arg2_type = ptr_type_node; + /* POINTER_PLUS_EXPR wants the offset to be of type 'sizetype'. Create + the tree using the value from arg0. The resulting type will match + the type of arg2. */ + gimple_seq stmts = NULL; + tree temp_offset = gimple_convert (&stmts, loc, sizetype, arg1); + tree temp_addr = gimple_build (&stmts, loc, POINTER_PLUS_EXPR, + arg2_type, arg2, temp_offset); + /* Mask off any lower bits from the address. 
*/ + tree aligned_addr = gimple_build (&stmts, loc, BIT_AND_EXPR, + arg2_type, temp_addr, + build_int_cst (arg2_type, -16)); + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); + if (!is_gimple_mem_ref_addr (aligned_addr)) + { + tree t = make_ssa_name (TREE_TYPE (aligned_addr)); + gimple *g = gimple_build_assign (t, aligned_addr); + gsi_insert_before (gsi, g, GSI_SAME_STMT); + aligned_addr = t; + } + /* The desired gimple result should be similar to: + MEM[(__vector floatD.1407 *)_1] = vf1D.2697; */ + gimple *g + = gimple_build_assign (build2 (MEM_REF, arg0_type, aligned_addr, + build_int_cst (arg2_type, 0)), arg0); + gimple_set_location (g, loc); + gsi_replace (gsi, g, true); + return true; + } + + /* unaligned Vector loads. */ + case RS6000_BIF_LXVW4X_V16QI: + case RS6000_BIF_LXVW4X_V8HI: + case RS6000_BIF_LXVW4X_V4SF: + case RS6000_BIF_LXVW4X_V4SI: + case RS6000_BIF_LXVD2X_V2DF: + case RS6000_BIF_LXVD2X_V2DI: + { + arg0 = gimple_call_arg (stmt, 0); // offset + arg1 = gimple_call_arg (stmt, 1); // address + lhs = gimple_call_lhs (stmt); + location_t loc = gimple_location (stmt); + /* Since arg1 may be cast to a different type, just use ptr_type_node + here instead of trying to enforce TBAA on pointer types. */ + tree arg1_type = ptr_type_node; + tree lhs_type = TREE_TYPE (lhs); + /* In GIMPLE the type of the MEM_REF specifies the alignment. The + required alignment (power) is 4 bytes regardless of data type. */ + tree align_ltype = build_aligned_type (lhs_type, 4); + /* POINTER_PLUS_EXPR wants the offset to be of type 'sizetype'. Create + the tree using the value from arg0. The resulting type will match + the type of arg1. */ + gimple_seq stmts = NULL; + tree temp_offset = gimple_convert (&stmts, loc, sizetype, arg0); + tree temp_addr = gimple_build (&stmts, loc, POINTER_PLUS_EXPR, + arg1_type, arg1, temp_offset); + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); + if (!is_gimple_mem_ref_addr (temp_addr)) + { + tree t = make_ssa_name (TREE_TYPE (temp_addr)); + gimple *g = gimple_build_assign (t, temp_addr); + gsi_insert_before (gsi, g, GSI_SAME_STMT); + temp_addr = t; + } + /* Use the build2 helper to set up the mem_ref. The MEM_REF could also + take an offset, but since we've already incorporated the offset + above, here we just pass in a zero. */ + gimple *g; + g = gimple_build_assign (lhs, build2 (MEM_REF, align_ltype, temp_addr, + build_int_cst (arg1_type, 0))); + gimple_set_location (g, loc); + gsi_replace (gsi, g, true); + return true; + } + + /* unaligned Vector stores. */ + case RS6000_BIF_STXVW4X_V16QI: + case RS6000_BIF_STXVW4X_V8HI: + case RS6000_BIF_STXVW4X_V4SF: + case RS6000_BIF_STXVW4X_V4SI: + case RS6000_BIF_STXVD2X_V2DF: + case RS6000_BIF_STXVD2X_V2DI: + { + arg0 = gimple_call_arg (stmt, 0); /* Value to be stored. */ + arg1 = gimple_call_arg (stmt, 1); /* Offset. */ + tree arg2 = gimple_call_arg (stmt, 2); /* Store-to address. */ + location_t loc = gimple_location (stmt); + tree arg0_type = TREE_TYPE (arg0); + /* Use ptr_type_node (no TBAA) for the arg2_type. */ + tree arg2_type = ptr_type_node; + /* In GIMPLE the type of the MEM_REF specifies the alignment. The + required alignment (power) is 4 bytes regardless of data type. */ + tree align_stype = build_aligned_type (arg0_type, 4); + /* POINTER_PLUS_EXPR wants the offset to be of type 'sizetype'. Create + the tree using the value from arg1. 
*/ + gimple_seq stmts = NULL; + tree temp_offset = gimple_convert (&stmts, loc, sizetype, arg1); + tree temp_addr = gimple_build (&stmts, loc, POINTER_PLUS_EXPR, + arg2_type, arg2, temp_offset); + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); + if (!is_gimple_mem_ref_addr (temp_addr)) + { + tree t = make_ssa_name (TREE_TYPE (temp_addr)); + gimple *g = gimple_build_assign (t, temp_addr); + gsi_insert_before (gsi, g, GSI_SAME_STMT); + temp_addr = t; + } + gimple *g; + g = gimple_build_assign (build2 (MEM_REF, align_stype, temp_addr, + build_int_cst (arg2_type, 0)), arg0); + gimple_set_location (g, loc); + gsi_replace (gsi, g, true); + return true; + } + + /* Vector Fused multiply-add (fma). */ + case RS6000_BIF_VMADDFP: + case RS6000_BIF_XVMADDDP: + case RS6000_BIF_XVMADDSP: + case RS6000_BIF_VMLADDUHM: + { + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + tree arg2 = gimple_call_arg (stmt, 2); + lhs = gimple_call_lhs (stmt); + gcall *g = gimple_build_call_internal (IFN_FMA, 3, arg0, arg1, arg2); + gimple_call_set_lhs (g, lhs); + gimple_call_set_nothrow (g, true); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + } + + /* Vector compares; EQ, NE, GE, GT, LE. */ + case RS6000_BIF_VCMPEQUB: + case RS6000_BIF_VCMPEQUH: + case RS6000_BIF_VCMPEQUW: + case RS6000_BIF_VCMPEQUD: + /* We deliberately omit RS6000_BIF_VCMPEQUT for now, because gimple + folding produces worse code for 128-bit compares. */ + fold_compare_helper (gsi, EQ_EXPR, stmt); + return true; + + case RS6000_BIF_VCMPNEB: + case RS6000_BIF_VCMPNEH: + case RS6000_BIF_VCMPNEW: + /* We deliberately omit RS6000_BIF_VCMPNET for now, because gimple + folding produces worse code for 128-bit compares. */ + fold_compare_helper (gsi, NE_EXPR, stmt); + return true; + + case RS6000_BIF_CMPGE_16QI: + case RS6000_BIF_CMPGE_U16QI: + case RS6000_BIF_CMPGE_8HI: + case RS6000_BIF_CMPGE_U8HI: + case RS6000_BIF_CMPGE_4SI: + case RS6000_BIF_CMPGE_U4SI: + case RS6000_BIF_CMPGE_2DI: + case RS6000_BIF_CMPGE_U2DI: + /* We deliberately omit RS6000_BIF_CMPGE_1TI and RS6000_BIF_CMPGE_U1TI + for now, because gimple folding produces worse code for 128-bit + compares. */ + fold_compare_helper (gsi, GE_EXPR, stmt); + return true; + + case RS6000_BIF_VCMPGTSB: + case RS6000_BIF_VCMPGTUB: + case RS6000_BIF_VCMPGTSH: + case RS6000_BIF_VCMPGTUH: + case RS6000_BIF_VCMPGTSW: + case RS6000_BIF_VCMPGTUW: + case RS6000_BIF_VCMPGTUD: + case RS6000_BIF_VCMPGTSD: + /* We deliberately omit RS6000_BIF_VCMPGTUT and RS6000_BIF_VCMPGTST + for now, because gimple folding produces worse code for 128-bit + compares. */ + fold_compare_helper (gsi, GT_EXPR, stmt); + return true; + + case RS6000_BIF_CMPLE_16QI: + case RS6000_BIF_CMPLE_U16QI: + case RS6000_BIF_CMPLE_8HI: + case RS6000_BIF_CMPLE_U8HI: + case RS6000_BIF_CMPLE_4SI: + case RS6000_BIF_CMPLE_U4SI: + case RS6000_BIF_CMPLE_2DI: + case RS6000_BIF_CMPLE_U2DI: + /* We deliberately omit RS6000_BIF_CMPLE_1TI and RS6000_BIF_CMPLE_U1TI + for now, because gimple folding produces worse code for 128-bit + compares. */ + fold_compare_helper (gsi, LE_EXPR, stmt); + return true; + + /* flavors of vec_splat_[us]{8,16,32}. */ + case RS6000_BIF_VSPLTISB: + case RS6000_BIF_VSPLTISH: + case RS6000_BIF_VSPLTISW: + { + arg0 = gimple_call_arg (stmt, 0); + lhs = gimple_call_lhs (stmt); + + /* Only fold the vec_splat_*() if the lower bits of arg 0 is a + 5-bit signed constant in range -16 to +15. 
*/ + if (TREE_CODE (arg0) != INTEGER_CST + || !IN_RANGE (TREE_INT_CST_LOW (arg0), -16, 15)) + return false; + gimple_seq stmts = NULL; + location_t loc = gimple_location (stmt); + tree splat_value = gimple_convert (&stmts, loc, + TREE_TYPE (TREE_TYPE (lhs)), arg0); + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); + tree splat_tree = build_vector_from_val (TREE_TYPE (lhs), splat_value); + g = gimple_build_assign (lhs, splat_tree); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + } + + /* Flavors of vec_splat. */ + /* a = vec_splat (b, 0x3) becomes a = { b[3],b[3],b[3],...}; */ + case RS6000_BIF_VSPLTB: + case RS6000_BIF_VSPLTH: + case RS6000_BIF_VSPLTW: + case RS6000_BIF_XXSPLTD_V2DI: + case RS6000_BIF_XXSPLTD_V2DF: + { + arg0 = gimple_call_arg (stmt, 0); /* input vector. */ + arg1 = gimple_call_arg (stmt, 1); /* index into arg0. */ + /* Only fold the vec_splat_*() if arg1 is both a constant value and + is a valid index into the arg0 vector. */ + unsigned int n_elts = VECTOR_CST_NELTS (arg0); + if (TREE_CODE (arg1) != INTEGER_CST + || TREE_INT_CST_LOW (arg1) > (n_elts -1)) + return false; + lhs = gimple_call_lhs (stmt); + tree lhs_type = TREE_TYPE (lhs); + tree arg0_type = TREE_TYPE (arg0); + tree splat; + if (TREE_CODE (arg0) == VECTOR_CST) + splat = VECTOR_CST_ELT (arg0, TREE_INT_CST_LOW (arg1)); + else + { + /* Determine (in bits) the length and start location of the + splat value for a call to the tree_vec_extract helper. */ + int splat_elem_size = TREE_INT_CST_LOW (size_in_bytes (arg0_type)) + * BITS_PER_UNIT / n_elts; + int splat_start_bit = TREE_INT_CST_LOW (arg1) * splat_elem_size; + tree len = build_int_cst (bitsizetype, splat_elem_size); + tree start = build_int_cst (bitsizetype, splat_start_bit); + splat = tree_vec_extract (gsi, TREE_TYPE (lhs_type), arg0, + len, start); + } + /* And finally, build the new vector. */ + tree splat_tree = build_vector_from_val (lhs_type, splat); + g = gimple_build_assign (lhs, splat_tree); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + } + + /* vec_mergel (integrals). */ + case RS6000_BIF_VMRGLH: + case RS6000_BIF_VMRGLW: + case RS6000_BIF_XXMRGLW_4SI: + case RS6000_BIF_VMRGLB: + case RS6000_BIF_VEC_MERGEL_V2DI: + case RS6000_BIF_XXMRGLW_4SF: + case RS6000_BIF_VEC_MERGEL_V2DF: + fold_mergehl_helper (gsi, stmt, 1); + return true; + /* vec_mergeh (integrals). */ + case RS6000_BIF_VMRGHH: + case RS6000_BIF_VMRGHW: + case RS6000_BIF_XXMRGHW_4SI: + case RS6000_BIF_VMRGHB: + case RS6000_BIF_VEC_MERGEH_V2DI: + case RS6000_BIF_XXMRGHW_4SF: + case RS6000_BIF_VEC_MERGEH_V2DF: + fold_mergehl_helper (gsi, stmt, 0); + return true; + + /* Flavors of vec_mergee. */ + case RS6000_BIF_VMRGEW_V4SI: + case RS6000_BIF_VMRGEW_V2DI: + case RS6000_BIF_VMRGEW_V4SF: + case RS6000_BIF_VMRGEW_V2DF: + fold_mergeeo_helper (gsi, stmt, 0); + return true; + /* Flavors of vec_mergeo. 
*/ + case RS6000_BIF_VMRGOW_V4SI: + case RS6000_BIF_VMRGOW_V2DI: + case RS6000_BIF_VMRGOW_V4SF: + case RS6000_BIF_VMRGOW_V2DF: + fold_mergeeo_helper (gsi, stmt, 1); + return true; + + /* d = vec_pack (a, b) */ + case RS6000_BIF_VPKUDUM: + case RS6000_BIF_VPKUHUM: + case RS6000_BIF_VPKUWUM: + { + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + lhs = gimple_call_lhs (stmt); + gimple *g = gimple_build_assign (lhs, VEC_PACK_TRUNC_EXPR, arg0, arg1); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + } + + /* d = vec_unpackh (a) */ + /* Note that the UNPACK_{HI,LO}_EXPR used in the gimple_build_assign call + in this code is sensitive to endian-ness, and needs to be inverted to + handle both LE and BE targets. */ + case RS6000_BIF_VUPKHSB: + case RS6000_BIF_VUPKHSH: + case RS6000_BIF_VUPKHSW: + { + arg0 = gimple_call_arg (stmt, 0); + lhs = gimple_call_lhs (stmt); + if (BYTES_BIG_ENDIAN) + g = gimple_build_assign (lhs, VEC_UNPACK_HI_EXPR, arg0); + else + g = gimple_build_assign (lhs, VEC_UNPACK_LO_EXPR, arg0); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + } + /* d = vec_unpackl (a) */ + case RS6000_BIF_VUPKLSB: + case RS6000_BIF_VUPKLSH: + case RS6000_BIF_VUPKLSW: + { + arg0 = gimple_call_arg (stmt, 0); + lhs = gimple_call_lhs (stmt); + if (BYTES_BIG_ENDIAN) + g = gimple_build_assign (lhs, VEC_UNPACK_LO_EXPR, arg0); + else + g = gimple_build_assign (lhs, VEC_UNPACK_HI_EXPR, arg0); + gimple_set_location (g, gimple_location (stmt)); + gsi_replace (gsi, g, true); + return true; + } + /* There is no gimple type corresponding with pixel, so just return. */ + case RS6000_BIF_VUPKHPX: + case RS6000_BIF_VUPKLPX: + return false; + + /* vec_perm. */ + case RS6000_BIF_VPERM_16QI: + case RS6000_BIF_VPERM_8HI: + case RS6000_BIF_VPERM_4SI: + case RS6000_BIF_VPERM_2DI: + case RS6000_BIF_VPERM_4SF: + case RS6000_BIF_VPERM_2DF: + case RS6000_BIF_VPERM_16QI_UNS: + case RS6000_BIF_VPERM_8HI_UNS: + case RS6000_BIF_VPERM_4SI_UNS: + case RS6000_BIF_VPERM_2DI_UNS: + { + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + tree permute = gimple_call_arg (stmt, 2); + lhs = gimple_call_lhs (stmt); + location_t loc = gimple_location (stmt); + gimple_seq stmts = NULL; + // convert arg0 and arg1 to match the type of the permute + // for the VEC_PERM_EXPR operation. + tree permute_type = (TREE_TYPE (permute)); + tree arg0_ptype = gimple_build (&stmts, loc, VIEW_CONVERT_EXPR, + permute_type, arg0); + tree arg1_ptype = gimple_build (&stmts, loc, VIEW_CONVERT_EXPR, + permute_type, arg1); + tree lhs_ptype = gimple_build (&stmts, loc, VEC_PERM_EXPR, + permute_type, arg0_ptype, arg1_ptype, + permute); + // Convert the result back to the desired lhs type upon completion. 
+          tree temp = gimple_build (&stmts, loc, VIEW_CONVERT_EXPR,
+                                    TREE_TYPE (lhs), lhs_ptype);
+          gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
+          g = gimple_build_assign (lhs, temp);
+          gimple_set_location (g, loc);
+          gsi_replace (gsi, g, true);
+          return true;
+        }
+
+    default:
+      if (TARGET_DEBUG_BUILTIN)
+        fprintf (stderr, "gimple builtin intrinsic not matched:%d %s %s\n",
+                 fn_code, fn_name1, fn_name2);
+      break;
+    }
+
+  return false;
+}
+
+/* **** Expansion support **** */
+
+static rtx
+altivec_expand_predicate_builtin (enum insn_code icode, tree exp, rtx target)
+{
+  rtx pat, scratch;
+  tree cr6_form = CALL_EXPR_ARG (exp, 0);
+  tree arg0 = CALL_EXPR_ARG (exp, 1);
+  tree arg1 = CALL_EXPR_ARG (exp, 2);
+  rtx op0 = expand_normal (arg0);
+  rtx op1 = expand_normal (arg1);
+  machine_mode tmode = SImode;
+  machine_mode mode0 = insn_data[icode].operand[1].mode;
+  machine_mode mode1 = insn_data[icode].operand[2].mode;
+  int cr6_form_int;
+
+  if (TREE_CODE (cr6_form) != INTEGER_CST)
+    {
+      error ("argument 1 of %qs must be a constant",
+             "__builtin_altivec_predicate");
+      return const0_rtx;
+    }
+  else
+    cr6_form_int = TREE_INT_CST_LOW (cr6_form);
+
+  gcc_assert (mode0 == mode1);
+
+  /* If we have invalid arguments, bail out before generating bad rtl.  */
+  if (arg0 == error_mark_node || arg1 == error_mark_node)
+    return const0_rtx;
+
+  if (target == 0
+      || GET_MODE (target) != tmode
+      || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
+    target = gen_reg_rtx (tmode);
+
+  if (! (*insn_data[icode].operand[1].predicate) (op0, mode0))
+    op0 = copy_to_mode_reg (mode0, op0);
+  if (! (*insn_data[icode].operand[2].predicate) (op1, mode1))
+    op1 = copy_to_mode_reg (mode1, op1);
+
+  /* Note that for many of the relevant operations (e.g. cmpne or
+     cmpeq) with float or double operands, it makes more sense for the
+     mode of the allocated scratch register to select a vector of
+     integer.  But the choice to copy the mode of operand 0 was made
+     long ago and there are no plans to change it.  */
+  scratch = gen_reg_rtx (mode0);
+
+  pat = GEN_FCN (icode) (scratch, op0, op1);
+  if (! pat)
+    return 0;
+  emit_insn (pat);
+
+  /* The vec_any* and vec_all* predicates use the same opcodes for two
+     different operations, but the bits in CR6 will be different
+     depending on what information we want.  So we have to play tricks
+     with CR6 to get the right bits out.
+
+     If you think this is disgusting, look at the specs for the
+     AltiVec predicates.  */
+
+  switch (cr6_form_int)
+    {
+    case 0:
+      emit_insn (gen_cr6_test_for_zero (target));
+      break;
+    case 1:
+      emit_insn (gen_cr6_test_for_zero_reverse (target));
+      break;
+    case 2:
+      emit_insn (gen_cr6_test_for_lt (target));
+      break;
+    case 3:
+      emit_insn (gen_cr6_test_for_lt_reverse (target));
+      break;
+    default:
+      error ("argument 1 of %qs is out of range",
+             "__builtin_altivec_predicate");
+      break;
+    }
+
+  return target;
+}
+
+/* Expand vec_init builtin.  */
+static rtx
+altivec_expand_vec_init_builtin (tree type, tree exp, rtx target)
+{
+  machine_mode tmode = TYPE_MODE (type);
+  machine_mode inner_mode = GET_MODE_INNER (tmode);
+  int i, n_elt = GET_MODE_NUNITS (tmode);
+
+  gcc_assert (VECTOR_MODE_P (tmode));
+  gcc_assert (n_elt == call_expr_nargs (exp));
+
+  if (!target || !register_operand (target, tmode))
+    target = gen_reg_rtx (tmode);
+
+  /* If we have a vector comprised of a single element, such as V1TImode, do
+     the initialization directly.  */
+  if (n_elt == 1 && GET_MODE_SIZE (tmode) == GET_MODE_SIZE (inner_mode))
+    {
+      rtx x = expand_normal (CALL_EXPR_ARG (exp, 0));
+      emit_move_insn (target, gen_lowpart (tmode, x));
+    }
+  else
+    {
+      rtvec v = rtvec_alloc (n_elt);
+
+      for (i = 0; i < n_elt; ++i)
+        {
+          rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
+          RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
+        }
+
+      rs6000_expand_vector_init (target, gen_rtx_PARALLEL (tmode, v));
+    }
+
+  return target;
+}
+
+/* Return the integer constant in ARG.  Constrain it to be in the range
+   of the subparts of VEC_TYPE; issue an error if not.  */
+
+static int
+get_element_number (tree vec_type, tree arg)
+{
+  unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
+
+  if (!tree_fits_uhwi_p (arg)
+      || (elt = tree_to_uhwi (arg), elt > max))
+    {
+      error ("selector must be an integer constant in the range [0, %wi]", max);
+      return 0;
+    }
+
+  return elt;
+}
+
+/* Expand vec_set builtin.  */
+static rtx
+altivec_expand_vec_set_builtin (tree exp)
+{
+  machine_mode tmode, mode1;
+  tree arg0, arg1, arg2;
+  int elt;
+  rtx op0, op1;
+
+  arg0 = CALL_EXPR_ARG (exp, 0);
+  arg1 = CALL_EXPR_ARG (exp, 1);
+  arg2 = CALL_EXPR_ARG (exp, 2);
+
+  tmode = TYPE_MODE (TREE_TYPE (arg0));
+  mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
+  gcc_assert (VECTOR_MODE_P (tmode));
+
+  op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
+  op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
+  elt = get_element_number (TREE_TYPE (arg0), arg2);
+
+  if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
+    op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
+
+  op0 = force_reg (tmode, op0);
+  op1 = force_reg (mode1, op1);
+
+  rs6000_expand_vector_set (op0, op1, GEN_INT (elt));
+
+  return op0;
+}
+
+/* Expand vec_ext builtin.  */
+static rtx
+altivec_expand_vec_ext_builtin (tree exp, rtx target)
+{
+  machine_mode tmode, mode0;
+  tree arg0, arg1;
+  rtx op0;
+  rtx op1;
+
+  arg0 = CALL_EXPR_ARG (exp, 0);
+  arg1 = CALL_EXPR_ARG (exp, 1);
+
+  op0 = expand_normal (arg0);
+  op1 = expand_normal (arg1);
+
+  if (TREE_CODE (arg1) == INTEGER_CST)
+    {
+      unsigned HOST_WIDE_INT elt;
+      unsigned HOST_WIDE_INT size = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
+      unsigned int truncated_selector;
+      /* Even if !tree_fits_uhwi_p (arg1), TREE_INT_CST_LOW (arg1) returns
+         the low-order bits of the INTEGER_CST for modulo indexing.  */
+      elt = TREE_INT_CST_LOW (arg1);
+      truncated_selector = elt % size;
+      op1 = GEN_INT (truncated_selector);
+    }
+
+  tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
+  mode0 = TYPE_MODE (TREE_TYPE (arg0));
+  gcc_assert (VECTOR_MODE_P (mode0));
+
+  op0 = force_reg (mode0, op0);
+
+  if (optimize || !target || !register_operand (target, tmode))
+    target = gen_reg_rtx (tmode);
+
+  rs6000_expand_vector_extract (target, op0, op1);
+
+  return target;
+}
+
+/* Expand ALTIVEC_BUILTIN_MASK_FOR_LOAD.  */
+rtx
+rs6000_expand_ldst_mask (rtx target, tree arg0)
+{
+  int icode2 = BYTES_BIG_ENDIAN ? (int) CODE_FOR_altivec_lvsr_direct
+                                : (int) CODE_FOR_altivec_lvsl_direct;
+  machine_mode tmode = insn_data[icode2].operand[0].mode;
+  machine_mode mode = insn_data[icode2].operand[1].mode;
+
+  gcc_assert (TARGET_ALTIVEC);
+
+  gcc_assert (POINTER_TYPE_P (TREE_TYPE (arg0)));
+  rtx op = expand_expr (arg0, NULL_RTX, Pmode, EXPAND_NORMAL);
+  rtx addr = memory_address (mode, op);
+  /* We need to negate the address.
*/ + op = gen_reg_rtx (GET_MODE (addr)); + emit_insn (gen_rtx_SET (op, gen_rtx_NEG (GET_MODE (addr), addr))); + op = gen_rtx_MEM (mode, op); + + if (target == 0 + || GET_MODE (target) != tmode + || !insn_data[icode2].operand[0].predicate (target, tmode)) + target = gen_reg_rtx (tmode); + + rtx pat = GEN_FCN (icode2) (target, op); + if (!pat) + return 0; + emit_insn (pat); + + return target; +} + +/* Used by __builtin_cpu_is(), mapping from PLATFORM names to values. */ +static const struct +{ + const char *cpu; + unsigned int cpuid; +} cpu_is_info[] = { + { "power10", PPC_PLATFORM_POWER10 }, + { "power9", PPC_PLATFORM_POWER9 }, + { "power8", PPC_PLATFORM_POWER8 }, + { "power7", PPC_PLATFORM_POWER7 }, + { "power6x", PPC_PLATFORM_POWER6X }, + { "power6", PPC_PLATFORM_POWER6 }, + { "power5+", PPC_PLATFORM_POWER5_PLUS }, + { "power5", PPC_PLATFORM_POWER5 }, + { "ppc970", PPC_PLATFORM_PPC970 }, + { "power4", PPC_PLATFORM_POWER4 }, + { "ppca2", PPC_PLATFORM_PPCA2 }, + { "ppc476", PPC_PLATFORM_PPC476 }, + { "ppc464", PPC_PLATFORM_PPC464 }, + { "ppc440", PPC_PLATFORM_PPC440 }, + { "ppc405", PPC_PLATFORM_PPC405 }, + { "ppc-cell-be", PPC_PLATFORM_CELL_BE } +}; + +/* Used by __builtin_cpu_supports(), mapping from HWCAP names to masks. */ +static const struct +{ + const char *hwcap; + int mask; + unsigned int id; +} cpu_supports_info[] = { + /* AT_HWCAP masks. */ + { "4xxmac", PPC_FEATURE_HAS_4xxMAC, 0 }, + { "altivec", PPC_FEATURE_HAS_ALTIVEC, 0 }, + { "arch_2_05", PPC_FEATURE_ARCH_2_05, 0 }, + { "arch_2_06", PPC_FEATURE_ARCH_2_06, 0 }, + { "archpmu", PPC_FEATURE_PERFMON_COMPAT, 0 }, + { "booke", PPC_FEATURE_BOOKE, 0 }, + { "cellbe", PPC_FEATURE_CELL_BE, 0 }, + { "dfp", PPC_FEATURE_HAS_DFP, 0 }, + { "efpdouble", PPC_FEATURE_HAS_EFP_DOUBLE, 0 }, + { "efpsingle", PPC_FEATURE_HAS_EFP_SINGLE, 0 }, + { "fpu", PPC_FEATURE_HAS_FPU, 0 }, + { "ic_snoop", PPC_FEATURE_ICACHE_SNOOP, 0 }, + { "mmu", PPC_FEATURE_HAS_MMU, 0 }, + { "notb", PPC_FEATURE_NO_TB, 0 }, + { "pa6t", PPC_FEATURE_PA6T, 0 }, + { "power4", PPC_FEATURE_POWER4, 0 }, + { "power5", PPC_FEATURE_POWER5, 0 }, + { "power5+", PPC_FEATURE_POWER5_PLUS, 0 }, + { "power6x", PPC_FEATURE_POWER6_EXT, 0 }, + { "ppc32", PPC_FEATURE_32, 0 }, + { "ppc601", PPC_FEATURE_601_INSTR, 0 }, + { "ppc64", PPC_FEATURE_64, 0 }, + { "ppcle", PPC_FEATURE_PPC_LE, 0 }, + { "smt", PPC_FEATURE_SMT, 0 }, + { "spe", PPC_FEATURE_HAS_SPE, 0 }, + { "true_le", PPC_FEATURE_TRUE_LE, 0 }, + { "ucache", PPC_FEATURE_UNIFIED_CACHE, 0 }, + { "vsx", PPC_FEATURE_HAS_VSX, 0 }, + + /* AT_HWCAP2 masks. */ + { "arch_2_07", PPC_FEATURE2_ARCH_2_07, 1 }, + { "dscr", PPC_FEATURE2_HAS_DSCR, 1 }, + { "ebb", PPC_FEATURE2_HAS_EBB, 1 }, + { "htm", PPC_FEATURE2_HAS_HTM, 1 }, + { "htm-nosc", PPC_FEATURE2_HTM_NOSC, 1 }, + { "htm-no-suspend", PPC_FEATURE2_HTM_NO_SUSPEND, 1 }, + { "isel", PPC_FEATURE2_HAS_ISEL, 1 }, + { "tar", PPC_FEATURE2_HAS_TAR, 1 }, + { "vcrypto", PPC_FEATURE2_HAS_VEC_CRYPTO, 1 }, + { "arch_3_00", PPC_FEATURE2_ARCH_3_00, 1 }, + { "ieee128", PPC_FEATURE2_HAS_IEEE128, 1 }, + { "darn", PPC_FEATURE2_DARN, 1 }, + { "scv", PPC_FEATURE2_SCV, 1 }, + { "arch_3_1", PPC_FEATURE2_ARCH_3_1, 1 }, + { "mma", PPC_FEATURE2_MMA, 1 }, +}; + +/* Expand the CPU builtin in FCODE and store the result in TARGET. */ +static rtx +cpu_expand_builtin (enum rs6000_gen_builtins fcode, + tree exp ATTRIBUTE_UNUSED, rtx target) +{ + /* __builtin_cpu_init () is a nop, so expand to nothing. 
*/ + if (fcode == RS6000_BIF_CPU_INIT) + return const0_rtx; + + if (target == 0 || GET_MODE (target) != SImode) + target = gen_reg_rtx (SImode); + + /* TODO: Factor the #ifdef'd code into a separate function. */ +#ifdef TARGET_LIBC_PROVIDES_HWCAP_IN_TCB + tree arg = TREE_OPERAND (CALL_EXPR_ARG (exp, 0), 0); + /* Target clones creates an ARRAY_REF instead of STRING_CST, convert it back + to a STRING_CST. */ + if (TREE_CODE (arg) == ARRAY_REF + && TREE_CODE (TREE_OPERAND (arg, 0)) == STRING_CST + && TREE_CODE (TREE_OPERAND (arg, 1)) == INTEGER_CST + && compare_tree_int (TREE_OPERAND (arg, 1), 0) == 0) + arg = TREE_OPERAND (arg, 0); + + if (TREE_CODE (arg) != STRING_CST) + { + error ("builtin %qs only accepts a string argument", + rs6000_builtin_info[(size_t) fcode].bifname); + return const0_rtx; + } + + if (fcode == RS6000_BIF_CPU_IS) + { + const char *cpu = TREE_STRING_POINTER (arg); + rtx cpuid = NULL_RTX; + for (size_t i = 0; i < ARRAY_SIZE (cpu_is_info); i++) + if (strcmp (cpu, cpu_is_info[i].cpu) == 0) + { + /* The CPUID value in the TCB is offset by _DL_FIRST_PLATFORM. */ + cpuid = GEN_INT (cpu_is_info[i].cpuid + _DL_FIRST_PLATFORM); + break; + } + if (cpuid == NULL_RTX) + { + /* Invalid CPU argument. */ + error ("cpu %qs is an invalid argument to builtin %qs", + cpu, rs6000_builtin_info[(size_t) fcode].bifname); + return const0_rtx; + } + + rtx platform = gen_reg_rtx (SImode); + rtx address = gen_rtx_PLUS (Pmode, + gen_rtx_REG (Pmode, TLS_REGNUM), + GEN_INT (TCB_PLATFORM_OFFSET)); + rtx tcbmem = gen_const_mem (SImode, address); + emit_move_insn (platform, tcbmem); + emit_insn (gen_eqsi3 (target, platform, cpuid)); + } + else if (fcode == RS6000_BIF_CPU_SUPPORTS) + { + const char *hwcap = TREE_STRING_POINTER (arg); + rtx mask = NULL_RTX; + int hwcap_offset; + for (size_t i = 0; i < ARRAY_SIZE (cpu_supports_info); i++) + if (strcmp (hwcap, cpu_supports_info[i].hwcap) == 0) + { + mask = GEN_INT (cpu_supports_info[i].mask); + hwcap_offset = TCB_HWCAP_OFFSET (cpu_supports_info[i].id); + break; + } + if (mask == NULL_RTX) + { + /* Invalid HWCAP argument. */ + error ("%s %qs is an invalid argument to builtin %qs", + "hwcap", hwcap, + rs6000_builtin_info[(size_t) fcode].bifname); + return const0_rtx; + } + + rtx tcb_hwcap = gen_reg_rtx (SImode); + rtx address = gen_rtx_PLUS (Pmode, + gen_rtx_REG (Pmode, TLS_REGNUM), + GEN_INT (hwcap_offset)); + rtx tcbmem = gen_const_mem (SImode, address); + emit_move_insn (tcb_hwcap, tcbmem); + rtx scratch1 = gen_reg_rtx (SImode); + emit_insn (gen_rtx_SET (scratch1, + gen_rtx_AND (SImode, tcb_hwcap, mask))); + rtx scratch2 = gen_reg_rtx (SImode); + emit_insn (gen_eqsi3 (scratch2, scratch1, const0_rtx)); + emit_insn (gen_rtx_SET (target, + gen_rtx_XOR (SImode, scratch2, const1_rtx))); + } + else + gcc_unreachable (); + + /* Record that we have expanded a CPU builtin, so that we can later + emit a reference to the special symbol exported by LIBC to ensure we + do not link against an old LIBC that doesn't support this feature. */ + cpu_builtin_p = true; + +#else + warning (0, "builtin %qs needs GLIBC (2.23 and newer) that exports hardware " + "capability bits", rs6000_builtin_info[(size_t) fcode].bifname); + + /* For old LIBCs, always return FALSE. */ + emit_move_insn (target, GEN_INT (0)); +#endif /* TARGET_LIBC_PROVIDES_HWCAP_IN_TCB */ + + return target; +} + +/* For the element-reversing load/store built-ins, produce the correct + insn_code depending on the target endianness. 
*/ +static insn_code +elemrev_icode (rs6000_gen_builtins fcode) +{ + switch (fcode) + { + case RS6000_BIF_ST_ELEMREV_V1TI: + return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_store_v1ti + : CODE_FOR_vsx_st_elemrev_v1ti; + + case RS6000_BIF_ST_ELEMREV_V2DF: + return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_store_v2df + : CODE_FOR_vsx_st_elemrev_v2df; + + case RS6000_BIF_ST_ELEMREV_V2DI: + return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_store_v2di + : CODE_FOR_vsx_st_elemrev_v2di; + + case RS6000_BIF_ST_ELEMREV_V4SF: + return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_store_v4sf + : CODE_FOR_vsx_st_elemrev_v4sf; + + case RS6000_BIF_ST_ELEMREV_V4SI: + return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_store_v4si + : CODE_FOR_vsx_st_elemrev_v4si; + + case RS6000_BIF_ST_ELEMREV_V8HI: + return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_store_v8hi + : CODE_FOR_vsx_st_elemrev_v8hi; + + case RS6000_BIF_ST_ELEMREV_V16QI: + return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_store_v16qi + : CODE_FOR_vsx_st_elemrev_v16qi; + + case RS6000_BIF_LD_ELEMREV_V2DF: + return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_load_v2df + : CODE_FOR_vsx_ld_elemrev_v2df; + + case RS6000_BIF_LD_ELEMREV_V1TI: + return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_load_v1ti + : CODE_FOR_vsx_ld_elemrev_v1ti; + + case RS6000_BIF_LD_ELEMREV_V2DI: + return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_load_v2di + : CODE_FOR_vsx_ld_elemrev_v2di; + + case RS6000_BIF_LD_ELEMREV_V4SF: + return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_load_v4sf + : CODE_FOR_vsx_ld_elemrev_v4sf; + + case RS6000_BIF_LD_ELEMREV_V4SI: + return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_load_v4si + : CODE_FOR_vsx_ld_elemrev_v4si; + + case RS6000_BIF_LD_ELEMREV_V8HI: + return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_load_v8hi + : CODE_FOR_vsx_ld_elemrev_v8hi; + + case RS6000_BIF_LD_ELEMREV_V16QI: + return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_load_v16qi + : CODE_FOR_vsx_ld_elemrev_v16qi; + default: + ; + } + + gcc_unreachable (); +} + +/* Expand an AltiVec vector load builtin, and return the expanded rtx. */ +static rtx +ldv_expand_builtin (rtx target, insn_code icode, rtx *op, machine_mode tmode) +{ + if (target == 0 + || GET_MODE (target) != tmode + || !insn_data[icode].operand[0].predicate (target, tmode)) + target = gen_reg_rtx (tmode); + + op[1] = copy_to_mode_reg (Pmode, op[1]); + + /* These CELL built-ins use BLKmode instead of tmode for historical + (i.e., unknown) reasons. TODO: Is this necessary? */ + bool blk = (icode == CODE_FOR_altivec_lvlx + || icode == CODE_FOR_altivec_lvlxl + || icode == CODE_FOR_altivec_lvrx + || icode == CODE_FOR_altivec_lvrxl); + + /* For LVX, express the RTL accurately by ANDing the address with -16. + LVXL and LVE*X expand to use UNSPECs to hide their special behavior, + so the raw address is fine. */ + /* TODO: That statement seems wrong, as the UNSPECs don't surround the + memory expression, so a latent bug may lie here. The &-16 is likely + needed for all VMX-style loads. */ + if (icode == CODE_FOR_altivec_lvx_v1ti + || icode == CODE_FOR_altivec_lvx_v2df + || icode == CODE_FOR_altivec_lvx_v2di + || icode == CODE_FOR_altivec_lvx_v4sf + || icode == CODE_FOR_altivec_lvx_v4si + || icode == CODE_FOR_altivec_lvx_v8hi + || icode == CODE_FOR_altivec_lvx_v16qi) + { + rtx rawaddr; + if (op[0] == const0_rtx) + rawaddr = op[1]; + else + { + op[0] = copy_to_mode_reg (Pmode, op[0]); + rawaddr = gen_rtx_PLUS (Pmode, op[1], op[0]); + } + rtx addr = gen_rtx_AND (Pmode, rawaddr, gen_rtx_CONST_INT (Pmode, -16)); + addr = gen_rtx_MEM (blk ? 
BLKmode : tmode, addr); + + emit_insn (gen_rtx_SET (target, addr)); + } + else + { + rtx addr; + if (op[0] == const0_rtx) + addr = gen_rtx_MEM (blk ? BLKmode : tmode, op[1]); + else + { + op[0] = copy_to_mode_reg (Pmode, op[0]); + addr = gen_rtx_MEM (blk ? BLKmode : tmode, + gen_rtx_PLUS (Pmode, op[1], op[0])); + } + + rtx pat = GEN_FCN (icode) (target, addr); + if (!pat) + return 0; + emit_insn (pat); + } + + return target; +} + +/* Expand a builtin function that loads a scalar into a vector register + with sign extension, and return the expanded rtx. */ +static rtx +lxvrse_expand_builtin (rtx target, insn_code icode, rtx *op, + machine_mode tmode, machine_mode smode) +{ + rtx pat, addr; + op[1] = copy_to_mode_reg (Pmode, op[1]); + + if (op[0] == const0_rtx) + addr = gen_rtx_MEM (tmode, op[1]); + else + { + op[0] = copy_to_mode_reg (Pmode, op[0]); + addr = gen_rtx_MEM (smode, + gen_rtx_PLUS (Pmode, op[1], op[0])); + } + + rtx discratch = gen_reg_rtx (V2DImode); + rtx tiscratch = gen_reg_rtx (TImode); + + /* Emit the lxvr*x insn. */ + pat = GEN_FCN (icode) (tiscratch, addr); + if (!pat) + return 0; + emit_insn (pat); + + /* Emit a sign extension from V16QI,V8HI,V4SI to V2DI. */ + rtx temp1; + if (icode == CODE_FOR_vsx_lxvrbx) + { + temp1 = simplify_gen_subreg (V16QImode, tiscratch, TImode, 0); + emit_insn (gen_vsx_sign_extend_qi_v2di (discratch, temp1)); + } + else if (icode == CODE_FOR_vsx_lxvrhx) + { + temp1 = simplify_gen_subreg (V8HImode, tiscratch, TImode, 0); + emit_insn (gen_vsx_sign_extend_hi_v2di (discratch, temp1)); + } + else if (icode == CODE_FOR_vsx_lxvrwx) + { + temp1 = simplify_gen_subreg (V4SImode, tiscratch, TImode, 0); + emit_insn (gen_vsx_sign_extend_si_v2di (discratch, temp1)); + } + else if (icode == CODE_FOR_vsx_lxvrdx) + discratch = simplify_gen_subreg (V2DImode, tiscratch, TImode, 0); + else + gcc_unreachable (); + + /* Emit the sign extension from V2DI (double) to TI (quad). */ + rtx temp2 = simplify_gen_subreg (TImode, discratch, V2DImode, 0); + emit_insn (gen_extendditi2_vector (target, temp2)); + + return target; +} + +/* Expand a builtin function that loads a scalar into a vector register + with zero extension, and return the expanded rtx. */ +static rtx +lxvrze_expand_builtin (rtx target, insn_code icode, rtx *op, + machine_mode tmode, machine_mode smode) +{ + rtx pat, addr; + op[1] = copy_to_mode_reg (Pmode, op[1]); + + if (op[0] == const0_rtx) + addr = gen_rtx_MEM (tmode, op[1]); + else + { + op[0] = copy_to_mode_reg (Pmode, op[0]); + addr = gen_rtx_MEM (smode, + gen_rtx_PLUS (Pmode, op[1], op[0])); + } + + pat = GEN_FCN (icode) (target, addr); + if (!pat) + return 0; + emit_insn (pat); + return target; +} + +/* Expand an AltiVec vector store builtin, and return the expanded rtx. */ +static rtx +stv_expand_builtin (insn_code icode, rtx *op, + machine_mode tmode, machine_mode smode) +{ + op[2] = copy_to_mode_reg (Pmode, op[2]); + + /* For STVX, express the RTL accurately by ANDing the address with -16. + STVXL and STVE*X expand to use UNSPECs to hide their special behavior, + so the raw address is fine. */ + /* TODO: That statement seems wrong, as the UNSPECs don't surround the + memory expression, so a latent bug may lie here. The &-16 is likely + needed for all VMX-style stores. 
*/ + if (icode == CODE_FOR_altivec_stvx_v2df + || icode == CODE_FOR_altivec_stvx_v2di + || icode == CODE_FOR_altivec_stvx_v4sf + || icode == CODE_FOR_altivec_stvx_v4si + || icode == CODE_FOR_altivec_stvx_v8hi + || icode == CODE_FOR_altivec_stvx_v16qi) + { + rtx rawaddr; + if (op[1] == const0_rtx) + rawaddr = op[2]; + else + { + op[1] = copy_to_mode_reg (Pmode, op[1]); + rawaddr = gen_rtx_PLUS (Pmode, op[2], op[1]); + } + + rtx addr = gen_rtx_AND (Pmode, rawaddr, gen_rtx_CONST_INT (Pmode, -16)); + addr = gen_rtx_MEM (tmode, addr); + op[0] = copy_to_mode_reg (tmode, op[0]); + emit_insn (gen_rtx_SET (addr, op[0])); + } + else if (icode == CODE_FOR_vsx_stxvrbx + || icode == CODE_FOR_vsx_stxvrhx + || icode == CODE_FOR_vsx_stxvrwx + || icode == CODE_FOR_vsx_stxvrdx) + { + rtx truncrtx = gen_rtx_TRUNCATE (tmode, op[0]); + op[0] = copy_to_mode_reg (E_TImode, truncrtx); + + rtx addr; + if (op[1] == const0_rtx) + addr = gen_rtx_MEM (Pmode, op[2]); + else + { + op[1] = copy_to_mode_reg (Pmode, op[1]); + addr = gen_rtx_MEM (tmode, gen_rtx_PLUS (Pmode, op[2], op[1])); + } + rtx pat = GEN_FCN (icode) (addr, op[0]); + if (pat) + emit_insn (pat); + } + else + { + if (!insn_data[icode].operand[1].predicate (op[0], smode)) + op[0] = copy_to_mode_reg (smode, op[0]); + + rtx addr; + if (op[1] == const0_rtx) + addr = gen_rtx_MEM (tmode, op[2]); + else + { + op[1] = copy_to_mode_reg (Pmode, op[1]); + addr = gen_rtx_MEM (tmode, gen_rtx_PLUS (Pmode, op[2], op[1])); + } + + rtx pat = GEN_FCN (icode) (addr, op[0]); + if (pat) + emit_insn (pat); + } + + return NULL_RTX; +} + +/* Expand the MMA built-in in EXP, and return it. */ +static rtx +mma_expand_builtin (tree exp, rtx target, insn_code icode, + rs6000_gen_builtins fcode) +{ + tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0); + bool void_func = TREE_TYPE (TREE_TYPE (fndecl)) == void_type_node; + machine_mode tmode = VOIDmode; + rtx op[MAX_MMA_OPERANDS]; + unsigned nopnds = 0; + + if (!void_func) + { + tmode = insn_data[icode].operand[0].mode; + if (!(target + && GET_MODE (target) == tmode + && insn_data[icode].operand[0].predicate (target, tmode))) + target = gen_reg_rtx (tmode); + op[nopnds++] = target; + } + else + target = const0_rtx; + + call_expr_arg_iterator iter; + tree arg; + FOR_EACH_CALL_EXPR_ARG (arg, iter, exp) + { + if (arg == error_mark_node) + return const0_rtx; + + rtx opnd; + const struct insn_operand_data *insn_op; + insn_op = &insn_data[icode].operand[nopnds]; + if (TREE_CODE (arg) == ADDR_EXPR + && MEM_P (DECL_RTL (TREE_OPERAND (arg, 0)))) + opnd = DECL_RTL (TREE_OPERAND (arg, 0)); + else + opnd = expand_normal (arg); + + if (!insn_op->predicate (opnd, insn_op->mode)) + { + /* TODO: This use of constraints needs explanation. */ + if (!strcmp (insn_op->constraint, "n")) + { + if (!CONST_INT_P (opnd)) + error ("argument %d must be an unsigned literal", nopnds); + else + error ("argument %d is an unsigned literal that is " + "out of range", nopnds); + return const0_rtx; + } + opnd = copy_to_mode_reg (insn_op->mode, opnd); + } + + /* Some MMA instructions have INOUT accumulator operands, so force + their target register to be the same as their input register. 
*/ + if (!void_func + && nopnds == 1 + && !strcmp (insn_op->constraint, "0") + && insn_op->mode == tmode + && REG_P (opnd) + && insn_data[icode].operand[0].predicate (opnd, tmode)) + target = op[0] = opnd; + + op[nopnds++] = opnd; + } + + rtx pat; + switch (nopnds) + { + case 1: + pat = GEN_FCN (icode) (op[0]); + break; + case 2: + pat = GEN_FCN (icode) (op[0], op[1]); + break; + case 3: + /* The ASSEMBLE builtin source operands are reversed in little-endian + mode, so reorder them. */ + if (fcode == RS6000_BIF_ASSEMBLE_PAIR_V_INTERNAL && !WORDS_BIG_ENDIAN) + std::swap (op[1], op[2]); + pat = GEN_FCN (icode) (op[0], op[1], op[2]); + break; + case 4: + pat = GEN_FCN (icode) (op[0], op[1], op[2], op[3]); + break; + case 5: + /* The ASSEMBLE builtin source operands are reversed in little-endian + mode, so reorder them. */ + if (fcode == RS6000_BIF_ASSEMBLE_ACC_INTERNAL && !WORDS_BIG_ENDIAN) + { + std::swap (op[1], op[4]); + std::swap (op[2], op[3]); + } + pat = GEN_FCN (icode) (op[0], op[1], op[2], op[3], op[4]); + break; + case 6: + pat = GEN_FCN (icode) (op[0], op[1], op[2], op[3], op[4], op[5]); + break; + case 7: + pat = GEN_FCN (icode) (op[0], op[1], op[2], op[3], op[4], op[5], op[6]); + break; + default: + gcc_unreachable (); + } + + if (!pat) + return NULL_RTX; + + emit_insn (pat); + return target; +} + +/* Return the correct ICODE value depending on whether we are + setting or reading the HTM SPRs. */ +static inline enum insn_code +rs6000_htm_spr_icode (bool nonvoid) +{ + if (nonvoid) + return (TARGET_POWERPC64) ? CODE_FOR_htm_mfspr_di : CODE_FOR_htm_mfspr_si; + else + return (TARGET_POWERPC64) ? CODE_FOR_htm_mtspr_di : CODE_FOR_htm_mtspr_si; +} + +/* Return the appropriate SPR number associated with the given builtin. */ +static inline HOST_WIDE_INT +htm_spr_num (enum rs6000_gen_builtins code) +{ + if (code == RS6000_BIF_GET_TFHAR + || code == RS6000_BIF_SET_TFHAR) + return TFHAR_SPR; + else if (code == RS6000_BIF_GET_TFIAR + || code == RS6000_BIF_SET_TFIAR) + return TFIAR_SPR; + else if (code == RS6000_BIF_GET_TEXASR + || code == RS6000_BIF_SET_TEXASR) + return TEXASR_SPR; + gcc_assert (code == RS6000_BIF_GET_TEXASRU + || code == RS6000_BIF_SET_TEXASRU); + return TEXASRU_SPR; +} + +/* Expand the HTM builtin in EXP and store the result in TARGET. + Return the expanded rtx. */ +static rtx +htm_expand_builtin (bifdata *bifaddr, rs6000_gen_builtins fcode, + tree exp, rtx target) +{ + if (!TARGET_POWERPC64 + && (fcode == RS6000_BIF_TABORTDC + || fcode == RS6000_BIF_TABORTDCI)) + { + error ("builtin %qs is only valid in 64-bit mode", bifaddr->bifname); + return const0_rtx; + } + + tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0); + bool nonvoid = TREE_TYPE (TREE_TYPE (fndecl)) != void_type_node; + bool uses_spr = bif_is_htmspr (*bifaddr); + insn_code icode = bifaddr->icode; + + if (uses_spr) + icode = rs6000_htm_spr_icode (nonvoid); + + rtx op[MAX_HTM_OPERANDS]; + int nopnds = 0; + const insn_operand_data *insn_op = &insn_data[icode].operand[0]; + + if (nonvoid) + { + machine_mode tmode = (uses_spr) ? 
insn_op->mode : E_SImode; + if (!target + || GET_MODE (target) != tmode + || (uses_spr && !insn_op->predicate (target, tmode))) + target = gen_reg_rtx (tmode); + if (uses_spr) + op[nopnds++] = target; + } + + tree arg; + call_expr_arg_iterator iter; + + FOR_EACH_CALL_EXPR_ARG (arg, iter, exp) + { + if (arg == error_mark_node || nopnds >= MAX_HTM_OPERANDS) + return const0_rtx; + + insn_op = &insn_data[icode].operand[nopnds]; + op[nopnds] = expand_normal (arg); + + if (!insn_op->predicate (op[nopnds], insn_op->mode)) + { + /* TODO: This use of constraints could use explanation. + This happens a couple of places, perhaps make that a + function to document what's happening. */ + if (!strcmp (insn_op->constraint, "n")) + { + int arg_num = nonvoid ? nopnds : nopnds + 1; + if (!CONST_INT_P (op[nopnds])) + error ("argument %d must be an unsigned literal", arg_num); + else + error ("argument %d is an unsigned literal that is " + "out of range", arg_num); + return const0_rtx; + } + op[nopnds] = copy_to_mode_reg (insn_op->mode, op[nopnds]); + } + + nopnds++; + } + + /* Handle the builtins for extended mnemonics. These accept + no arguments, but map to builtins that take arguments. */ + switch (fcode) + { + case RS6000_BIF_TENDALL: /* Alias for: tend. 1 */ + case RS6000_BIF_TRESUME: /* Alias for: tsr. 1 */ + op[nopnds++] = GEN_INT (1); + break; + case RS6000_BIF_TSUSPEND: /* Alias for: tsr. 0 */ + op[nopnds++] = GEN_INT (0); + break; + default: + break; + } + + /* If this builtin accesses SPRs, then pass in the appropriate + SPR number and SPR regno as the last two operands. */ + rtx cr = NULL_RTX; + if (uses_spr) + { + machine_mode mode = TARGET_POWERPC64 ? DImode : SImode; + op[nopnds++] = gen_rtx_CONST_INT (mode, htm_spr_num (fcode)); + } + /* If this builtin accesses a CR field, then pass in a scratch + CR field as the last operand. */ + else if (bif_is_htmcr (*bifaddr)) + { + cr = gen_reg_rtx (CCmode); + op[nopnds++] = cr; + } + + rtx pat; + switch (nopnds) + { + case 1: + pat = GEN_FCN (icode) (op[0]); + break; + case 2: + pat = GEN_FCN (icode) (op[0], op[1]); + break; + case 3: + pat = GEN_FCN (icode) (op[0], op[1], op[2]); + break; + case 4: + pat = GEN_FCN (icode) (op[0], op[1], op[2], op[3]); + break; + default: + gcc_unreachable (); + } + if (!pat) + return NULL_RTX; + emit_insn (pat); + + if (bif_is_htmcr (*bifaddr)) + { + if (fcode == RS6000_BIF_TBEGIN) + { + /* Emit code to set TARGET to true or false depending on + whether the tbegin. instruction succeeded or failed + to start a transaction. We do this by placing the 1's + complement of CR's EQ bit into TARGET. */ + rtx scratch = gen_reg_rtx (SImode); + emit_insn (gen_rtx_SET (scratch, + gen_rtx_EQ (SImode, cr, + const0_rtx))); + emit_insn (gen_rtx_SET (target, + gen_rtx_XOR (SImode, scratch, + GEN_INT (1)))); + } + else + { + /* Emit code to copy the 4-bit condition register field + CR into the least significant end of register TARGET. */ + rtx scratch1 = gen_reg_rtx (SImode); + rtx scratch2 = gen_reg_rtx (SImode); + rtx subreg = simplify_gen_subreg (CCmode, scratch1, SImode, 0); + emit_insn (gen_movcc (subreg, cr)); + emit_insn (gen_lshrsi3 (scratch2, scratch1, GEN_INT (28))); + emit_insn (gen_andsi3 (target, scratch2, GEN_INT (0xf))); + } + } + + if (nonvoid) + return target; + return const0_rtx; +} + +/* Expand an expression EXP that calls a built-in function, + with result going to TARGET if that's convenient + (and in mode MODE if that's convenient). + SUBTARGET may be used as the target for computing one of EXP's operands. 
+ IGNORE is nonzero if the value is to be ignored. + Use the new builtin infrastructure. */ +rtx +rs6000_expand_builtin (tree exp, rtx target, rtx /* subtarget */, + machine_mode /* mode */, int ignore) +{ + tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0); + enum rs6000_gen_builtins fcode + = (enum rs6000_gen_builtins) DECL_MD_FUNCTION_CODE (fndecl); + size_t uns_fcode = (size_t)fcode; + enum insn_code icode = rs6000_builtin_info[uns_fcode].icode; + + /* TODO: The following commentary and code is inherited from the original + builtin processing code. The commentary is a bit confusing, with the + intent being that KFmode is always IEEE-128, IFmode is always IBM + double-double, and TFmode is the current long double. The code is + confusing in that it converts from KFmode to TFmode pattern names, + when the other direction is more intuitive. Try to address this. */ + + /* We have two different modes (KFmode, TFmode) that are the IEEE + 128-bit floating point type, depending on whether long double is the + IBM extended double (KFmode) or long double is IEEE 128-bit (TFmode). + It is simpler if we only define one variant of the built-in function, + and switch the code when defining it, rather than defining two built- + ins and using the overload table in rs6000-c.cc to switch between the + two. If we don't have the proper assembler, don't do this switch + because CODE_FOR_*kf* and CODE_FOR_*tf* will be CODE_FOR_nothing. */ + if (FLOAT128_IEEE_P (TFmode)) + switch (icode) + { + case CODE_FOR_sqrtkf2_odd: + icode = CODE_FOR_sqrttf2_odd; + break; + case CODE_FOR_trunckfdf2_odd: + icode = CODE_FOR_trunctfdf2_odd; + break; + case CODE_FOR_addkf3_odd: + icode = CODE_FOR_addtf3_odd; + break; + case CODE_FOR_subkf3_odd: + icode = CODE_FOR_subtf3_odd; + break; + case CODE_FOR_mulkf3_odd: + icode = CODE_FOR_multf3_odd; + break; + case CODE_FOR_divkf3_odd: + icode = CODE_FOR_divtf3_odd; + break; + case CODE_FOR_fmakf4_odd: + icode = CODE_FOR_fmatf4_odd; + break; + case CODE_FOR_xsxexpqp_kf: + icode = CODE_FOR_xsxexpqp_tf; + break; + case CODE_FOR_xsxsigqp_kf: + icode = CODE_FOR_xsxsigqp_tf; + break; + case CODE_FOR_xststdcnegqp_kf: + icode = CODE_FOR_xststdcnegqp_tf; + break; + case CODE_FOR_xsiexpqp_kf: + icode = CODE_FOR_xsiexpqp_tf; + break; + case CODE_FOR_xsiexpqpf_kf: + icode = CODE_FOR_xsiexpqpf_tf; + break; + case CODE_FOR_xststdcqp_kf: + icode = CODE_FOR_xststdcqp_tf; + break; + case CODE_FOR_xscmpexpqp_eq_kf: + icode = CODE_FOR_xscmpexpqp_eq_tf; + break; + case CODE_FOR_xscmpexpqp_lt_kf: + icode = CODE_FOR_xscmpexpqp_lt_tf; + break; + case CODE_FOR_xscmpexpqp_gt_kf: + icode = CODE_FOR_xscmpexpqp_gt_tf; + break; + case CODE_FOR_xscmpexpqp_unordered_kf: + icode = CODE_FOR_xscmpexpqp_unordered_tf; + break; + default: + break; + } + + /* In case of "#pragma target" changes, we initialize all builtins + but check for actual availability now, during expand time. For + invalid builtins, generate a normal call. 
*/ + bifdata *bifaddr = &rs6000_builtin_info[uns_fcode]; + bif_enable e = bifaddr->enable; + + if (!(e == ENB_ALWAYS + || (e == ENB_P5 && TARGET_POPCNTB) + || (e == ENB_P6 && TARGET_CMPB) + || (e == ENB_P6_64 && TARGET_CMPB && TARGET_POWERPC64) + || (e == ENB_ALTIVEC && TARGET_ALTIVEC) + || (e == ENB_CELL && TARGET_ALTIVEC && rs6000_cpu == PROCESSOR_CELL) + || (e == ENB_VSX && TARGET_VSX) + || (e == ENB_P7 && TARGET_POPCNTD) + || (e == ENB_P7_64 && TARGET_POPCNTD && TARGET_POWERPC64) + || (e == ENB_P8 && TARGET_DIRECT_MOVE) + || (e == ENB_P8V && TARGET_P8_VECTOR) + || (e == ENB_P9 && TARGET_MODULO) + || (e == ENB_P9_64 && TARGET_MODULO && TARGET_POWERPC64) + || (e == ENB_P9V && TARGET_P9_VECTOR) + || (e == ENB_IEEE128_HW && TARGET_FLOAT128_HW) + || (e == ENB_DFP && TARGET_DFP) + || (e == ENB_CRYPTO && TARGET_CRYPTO) + || (e == ENB_HTM && TARGET_HTM) + || (e == ENB_P10 && TARGET_POWER10) + || (e == ENB_P10_64 && TARGET_POWER10 && TARGET_POWERPC64) + || (e == ENB_MMA && TARGET_MMA))) + { + rs6000_invalid_builtin (fcode); + return expand_call (exp, target, ignore); + } + + if (bif_is_nosoft (*bifaddr) + && rs6000_isa_flags & OPTION_MASK_SOFT_FLOAT) + { + error ("%qs not supported with %<-msoft-float%>", + bifaddr->bifname); + return const0_rtx; + } + + if (bif_is_no32bit (*bifaddr) && TARGET_32BIT) + { + error ("%qs is not supported in 32-bit mode", bifaddr->bifname); + return const0_rtx; + } + + if (bif_is_ibmld (*bifaddr) && !FLOAT128_2REG_P (TFmode)) + { + error ("%qs requires %<long double%> to be IBM 128-bit format", + bifaddr->bifname); + return const0_rtx; + } + + if (bif_is_cpu (*bifaddr)) + return cpu_expand_builtin (fcode, exp, target); + + if (bif_is_init (*bifaddr)) + return altivec_expand_vec_init_builtin (TREE_TYPE (exp), exp, target); + + if (bif_is_set (*bifaddr)) + return altivec_expand_vec_set_builtin (exp); + + if (bif_is_extract (*bifaddr)) + return altivec_expand_vec_ext_builtin (exp, target); + + if (bif_is_predicate (*bifaddr)) + return altivec_expand_predicate_builtin (icode, exp, target); + + if (bif_is_htm (*bifaddr)) + return htm_expand_builtin (bifaddr, fcode, exp, target); + + if (bif_is_32bit (*bifaddr) && TARGET_32BIT) + { + if (fcode == RS6000_BIF_MFTB) + icode = CODE_FOR_rs6000_mftb_si; + else if (fcode == RS6000_BIF_BPERMD) + icode = CODE_FOR_bpermd_si; + else if (fcode == RS6000_BIF_DARN) + icode = CODE_FOR_darn_64_si; + else if (fcode == RS6000_BIF_DARN_32) + icode = CODE_FOR_darn_32_si; + else if (fcode == RS6000_BIF_DARN_RAW) + icode = CODE_FOR_darn_raw_si; + else + gcc_unreachable (); + } + + if (bif_is_endian (*bifaddr) && BYTES_BIG_ENDIAN) + { + if (fcode == RS6000_BIF_LD_ELEMREV_V1TI) + icode = CODE_FOR_vsx_load_v1ti; + else if (fcode == RS6000_BIF_LD_ELEMREV_V2DF) + icode = CODE_FOR_vsx_load_v2df; + else if (fcode == RS6000_BIF_LD_ELEMREV_V2DI) + icode = CODE_FOR_vsx_load_v2di; + else if (fcode == RS6000_BIF_LD_ELEMREV_V4SF) + icode = CODE_FOR_vsx_load_v4sf; + else if (fcode == RS6000_BIF_LD_ELEMREV_V4SI) + icode = CODE_FOR_vsx_load_v4si; + else if (fcode == RS6000_BIF_LD_ELEMREV_V8HI) + icode = CODE_FOR_vsx_load_v8hi; + else if (fcode == RS6000_BIF_LD_ELEMREV_V16QI) + icode = CODE_FOR_vsx_load_v16qi; + else if (fcode == RS6000_BIF_ST_ELEMREV_V1TI) + icode = CODE_FOR_vsx_store_v1ti; + else if (fcode == RS6000_BIF_ST_ELEMREV_V2DF) + icode = CODE_FOR_vsx_store_v2df; + else if (fcode == RS6000_BIF_ST_ELEMREV_V2DI) + icode = CODE_FOR_vsx_store_v2di; + else if (fcode == RS6000_BIF_ST_ELEMREV_V4SF) + icode = CODE_FOR_vsx_store_v4sf; + else if (fcode == 
RS6000_BIF_ST_ELEMREV_V4SI) + icode = CODE_FOR_vsx_store_v4si; + else if (fcode == RS6000_BIF_ST_ELEMREV_V8HI) + icode = CODE_FOR_vsx_store_v8hi; + else if (fcode == RS6000_BIF_ST_ELEMREV_V16QI) + icode = CODE_FOR_vsx_store_v16qi; + else + gcc_unreachable (); + } + + + /* TRUE iff the built-in function returns void. */ + bool void_func = TREE_TYPE (TREE_TYPE (fndecl)) == void_type_node; + /* Position of first argument (0 for void-returning functions, else 1). */ + int k; + /* Modes for the return value, if any, and arguments. */ + const int MAX_BUILTIN_ARGS = 6; + machine_mode mode[MAX_BUILTIN_ARGS + 1]; + + if (void_func) + k = 0; + else + { + k = 1; + mode[0] = insn_data[icode].operand[0].mode; + } + + /* Tree expressions for each argument. */ + tree arg[MAX_BUILTIN_ARGS]; + /* RTL expressions for each argument. */ + rtx op[MAX_BUILTIN_ARGS]; + + int nargs = bifaddr->nargs; + gcc_assert (nargs <= MAX_BUILTIN_ARGS); + + + for (int i = 0; i < nargs; i++) + { + arg[i] = CALL_EXPR_ARG (exp, i); + if (arg[i] == error_mark_node) + return const0_rtx; + STRIP_NOPS (arg[i]); + op[i] = expand_normal (arg[i]); + /* We have a couple of pesky patterns that don't specify the mode... */ + mode[i+k] = insn_data[icode].operand[i+k].mode; + if (!mode[i+k]) + mode[i+k] = Pmode; + } + + /* Check for restricted constant arguments. */ + for (int i = 0; i < 2; i++) + { + switch (bifaddr->restr[i]) + { + case RES_BITS: + { + size_t mask = 1; + mask <<= bifaddr->restr_val1[i]; + mask--; + tree restr_arg = arg[bifaddr->restr_opnd[i] - 1]; + STRIP_NOPS (restr_arg); + if (!(TREE_CODE (restr_arg) == INTEGER_CST + && (TREE_INT_CST_LOW (restr_arg) & ~mask) == 0)) + { + unsigned p = (1U << bifaddr->restr_val1[i]) - 1; + error ("argument %d must be a literal between 0 and %d," + " inclusive", + bifaddr->restr_opnd[i], p); + return CONST0_RTX (mode[0]); + } + break; + } + case RES_RANGE: + { + tree restr_arg = arg[bifaddr->restr_opnd[i] - 1]; + STRIP_NOPS (restr_arg); + if (!(TREE_CODE (restr_arg) == INTEGER_CST + && IN_RANGE (tree_to_shwi (restr_arg), + bifaddr->restr_val1[i], + bifaddr->restr_val2[i]))) + { + error ("argument %d must be a literal between %d and %d," + " inclusive", + bifaddr->restr_opnd[i], bifaddr->restr_val1[i], + bifaddr->restr_val2[i]); + return CONST0_RTX (mode[0]); + } + break; + } + case RES_VAR_RANGE: + { + tree restr_arg = arg[bifaddr->restr_opnd[i] - 1]; + STRIP_NOPS (restr_arg); + if (TREE_CODE (restr_arg) == INTEGER_CST + && !IN_RANGE (tree_to_shwi (restr_arg), + bifaddr->restr_val1[i], + bifaddr->restr_val2[i])) + { + error ("argument %d must be a variable or a literal " + "between %d and %d, inclusive", + bifaddr->restr_opnd[i], bifaddr->restr_val1[i], + bifaddr->restr_val2[i]); + return CONST0_RTX (mode[0]); + } + break; + } + case RES_VALUES: + { + tree restr_arg = arg[bifaddr->restr_opnd[i] - 1]; + STRIP_NOPS (restr_arg); + if (!(TREE_CODE (restr_arg) == INTEGER_CST + && (tree_to_shwi (restr_arg) == bifaddr->restr_val1[i] + || tree_to_shwi (restr_arg) == bifaddr->restr_val2[i]))) + { + error ("argument %d must be either a literal %d or a " + "literal %d", + bifaddr->restr_opnd[i], bifaddr->restr_val1[i], + bifaddr->restr_val2[i]); + return CONST0_RTX (mode[0]); + } + break; + } + default: + case RES_NONE: + break; + } + } + + if (bif_is_ldstmask (*bifaddr)) + return rs6000_expand_ldst_mask (target, arg[0]); + + if (bif_is_stvec (*bifaddr)) + { + if (bif_is_reve (*bifaddr)) + icode = elemrev_icode (fcode); + return stv_expand_builtin (icode, op, mode[0], mode[1]); + } + + if 
(bif_is_ldvec (*bifaddr)) + { + if (bif_is_reve (*bifaddr)) + icode = elemrev_icode (fcode); + return ldv_expand_builtin (target, icode, op, mode[0]); + } + + if (bif_is_lxvrse (*bifaddr)) + return lxvrse_expand_builtin (target, icode, op, mode[0], mode[1]); + + if (bif_is_lxvrze (*bifaddr)) + return lxvrze_expand_builtin (target, icode, op, mode[0], mode[1]); + + if (bif_is_mma (*bifaddr)) + return mma_expand_builtin (exp, target, icode, fcode); + + if (fcode == RS6000_BIF_PACK_IF + && TARGET_LONG_DOUBLE_128 + && !TARGET_IEEEQUAD) + { + icode = CODE_FOR_packtf; + fcode = RS6000_BIF_PACK_TF; + uns_fcode = (size_t) fcode; + } + else if (fcode == RS6000_BIF_UNPACK_IF + && TARGET_LONG_DOUBLE_128 + && !TARGET_IEEEQUAD) + { + icode = CODE_FOR_unpacktf; + fcode = RS6000_BIF_UNPACK_TF; + uns_fcode = (size_t) fcode; + } + + if (TREE_TYPE (TREE_TYPE (fndecl)) == void_type_node) + target = NULL_RTX; + else if (target == 0 + || GET_MODE (target) != mode[0] + || !insn_data[icode].operand[0].predicate (target, mode[0])) + target = gen_reg_rtx (mode[0]); + + for (int i = 0; i < nargs; i++) + if (!insn_data[icode].operand[i+k].predicate (op[i], mode[i+k])) + op[i] = copy_to_mode_reg (mode[i+k], op[i]); + + rtx pat; + + switch (nargs) + { + case 0: + pat = (void_func + ? GEN_FCN (icode) () + : GEN_FCN (icode) (target)); + break; + case 1: + pat = (void_func + ? GEN_FCN (icode) (op[0]) + : GEN_FCN (icode) (target, op[0])); + break; + case 2: + pat = (void_func + ? GEN_FCN (icode) (op[0], op[1]) + : GEN_FCN (icode) (target, op[0], op[1])); + break; + case 3: + pat = (void_func + ? GEN_FCN (icode) (op[0], op[1], op[2]) + : GEN_FCN (icode) (target, op[0], op[1], op[2])); + break; + case 4: + pat = (void_func + ? GEN_FCN (icode) (op[0], op[1], op[2], op[3]) + : GEN_FCN (icode) (target, op[0], op[1], op[2], op[3])); + break; + case 5: + pat = (void_func + ? GEN_FCN (icode) (op[0], op[1], op[2], op[3], op[4]) + : GEN_FCN (icode) (target, op[0], op[1], op[2], op[3], op[4])); + break; + case 6: + pat = (void_func + ? GEN_FCN (icode) (op[0], op[1], op[2], op[3], op[4], op[5]) + : GEN_FCN (icode) (target, op[0], op[1], + op[2], op[3], op[4], op[5])); + break; + default: + gcc_assert (MAX_BUILTIN_ARGS == 6); + gcc_unreachable (); + } + + if (!pat) + return 0; + + emit_insn (pat); + return target; +} diff --git a/gcc/config/rs6000/rs6000-call.cc b/gcc/config/rs6000/rs6000-call.cc index d9bd5ca..f06c692 100644 --- a/gcc/config/rs6000/rs6000-call.cc +++ b/gcc/config/rs6000/rs6000-call.cc @@ -89,85 +89,6 @@ #define TARGET_NO_PROTOTYPE 0 #endif -/* Used by __builtin_cpu_is(), mapping from PLATFORM names to values. */ -static const struct -{ - const char *cpu; - unsigned int cpuid; -} cpu_is_info[] = { - { "power10", PPC_PLATFORM_POWER10 }, - { "power9", PPC_PLATFORM_POWER9 }, - { "power8", PPC_PLATFORM_POWER8 }, - { "power7", PPC_PLATFORM_POWER7 }, - { "power6x", PPC_PLATFORM_POWER6X }, - { "power6", PPC_PLATFORM_POWER6 }, - { "power5+", PPC_PLATFORM_POWER5_PLUS }, - { "power5", PPC_PLATFORM_POWER5 }, - { "ppc970", PPC_PLATFORM_PPC970 }, - { "power4", PPC_PLATFORM_POWER4 }, - { "ppca2", PPC_PLATFORM_PPCA2 }, - { "ppc476", PPC_PLATFORM_PPC476 }, - { "ppc464", PPC_PLATFORM_PPC464 }, - { "ppc440", PPC_PLATFORM_PPC440 }, - { "ppc405", PPC_PLATFORM_PPC405 }, - { "ppc-cell-be", PPC_PLATFORM_CELL_BE } -}; - -/* Used by __builtin_cpu_supports(), mapping from HWCAP names to masks. */ -static const struct -{ - const char *hwcap; - int mask; - unsigned int id; -} cpu_supports_info[] = { - /* AT_HWCAP masks. 
*/ - { "4xxmac", PPC_FEATURE_HAS_4xxMAC, 0 }, - { "altivec", PPC_FEATURE_HAS_ALTIVEC, 0 }, - { "arch_2_05", PPC_FEATURE_ARCH_2_05, 0 }, - { "arch_2_06", PPC_FEATURE_ARCH_2_06, 0 }, - { "archpmu", PPC_FEATURE_PERFMON_COMPAT, 0 }, - { "booke", PPC_FEATURE_BOOKE, 0 }, - { "cellbe", PPC_FEATURE_CELL_BE, 0 }, - { "dfp", PPC_FEATURE_HAS_DFP, 0 }, - { "efpdouble", PPC_FEATURE_HAS_EFP_DOUBLE, 0 }, - { "efpsingle", PPC_FEATURE_HAS_EFP_SINGLE, 0 }, - { "fpu", PPC_FEATURE_HAS_FPU, 0 }, - { "ic_snoop", PPC_FEATURE_ICACHE_SNOOP, 0 }, - { "mmu", PPC_FEATURE_HAS_MMU, 0 }, - { "notb", PPC_FEATURE_NO_TB, 0 }, - { "pa6t", PPC_FEATURE_PA6T, 0 }, - { "power4", PPC_FEATURE_POWER4, 0 }, - { "power5", PPC_FEATURE_POWER5, 0 }, - { "power5+", PPC_FEATURE_POWER5_PLUS, 0 }, - { "power6x", PPC_FEATURE_POWER6_EXT, 0 }, - { "ppc32", PPC_FEATURE_32, 0 }, - { "ppc601", PPC_FEATURE_601_INSTR, 0 }, - { "ppc64", PPC_FEATURE_64, 0 }, - { "ppcle", PPC_FEATURE_PPC_LE, 0 }, - { "smt", PPC_FEATURE_SMT, 0 }, - { "spe", PPC_FEATURE_HAS_SPE, 0 }, - { "true_le", PPC_FEATURE_TRUE_LE, 0 }, - { "ucache", PPC_FEATURE_UNIFIED_CACHE, 0 }, - { "vsx", PPC_FEATURE_HAS_VSX, 0 }, - - /* AT_HWCAP2 masks. */ - { "arch_2_07", PPC_FEATURE2_ARCH_2_07, 1 }, - { "dscr", PPC_FEATURE2_HAS_DSCR, 1 }, - { "ebb", PPC_FEATURE2_HAS_EBB, 1 }, - { "htm", PPC_FEATURE2_HAS_HTM, 1 }, - { "htm-nosc", PPC_FEATURE2_HTM_NOSC, 1 }, - { "htm-no-suspend", PPC_FEATURE2_HTM_NO_SUSPEND, 1 }, - { "isel", PPC_FEATURE2_HAS_ISEL, 1 }, - { "tar", PPC_FEATURE2_HAS_TAR, 1 }, - { "vcrypto", PPC_FEATURE2_HAS_VEC_CRYPTO, 1 }, - { "arch_3_00", PPC_FEATURE2_ARCH_3_00, 1 }, - { "ieee128", PPC_FEATURE2_HAS_IEEE128, 1 }, - { "darn", PPC_FEATURE2_DARN, 1 }, - { "scv", PPC_FEATURE2_SCV, 1 }, - { "arch_3_1", PPC_FEATURE2_ARCH_3_1, 1 }, - { "mma", PPC_FEATURE2_MMA, 1 }, -}; - /* Nonzero if we can use a floating-point register to pass this arg. */ #define USE_FP_FOR_ARG_P(CUM,MODE) \ (SCALAR_FLOAT_MODE_NOT_VECTOR_P (MODE) \ @@ -2880,188 +2801,6 @@ rs6000_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p, return build_va_arg_indirect_ref (addr); } -/* Debug utility to translate a type node to a single textual token. 
*/ -static -const char *rs6000_type_string (tree type_node) -{ - if (type_node == void_type_node) - return "void"; - else if (type_node == long_integer_type_node) - return "long"; - else if (type_node == long_unsigned_type_node) - return "ulong"; - else if (type_node == long_long_integer_type_node) - return "longlong"; - else if (type_node == long_long_unsigned_type_node) - return "ulonglong"; - else if (type_node == bool_V2DI_type_node) - return "vbll"; - else if (type_node == bool_V4SI_type_node) - return "vbi"; - else if (type_node == bool_V8HI_type_node) - return "vbs"; - else if (type_node == bool_V16QI_type_node) - return "vbc"; - else if (type_node == bool_int_type_node) - return "bool"; - else if (type_node == dfloat64_type_node) - return "_Decimal64"; - else if (type_node == double_type_node) - return "double"; - else if (type_node == intDI_type_node) - return "sll"; - else if (type_node == intHI_type_node) - return "ss"; - else if (type_node == ibm128_float_type_node) - return "__ibm128"; - else if (type_node == opaque_V4SI_type_node) - return "opaque"; - else if (POINTER_TYPE_P (type_node)) - return "void*"; - else if (type_node == intQI_type_node || type_node == char_type_node) - return "sc"; - else if (type_node == dfloat32_type_node) - return "_Decimal32"; - else if (type_node == float_type_node) - return "float"; - else if (type_node == intSI_type_node || type_node == integer_type_node) - return "si"; - else if (type_node == dfloat128_type_node) - return "_Decimal128"; - else if (type_node == long_double_type_node) - return "longdouble"; - else if (type_node == intTI_type_node) - return "sq"; - else if (type_node == unsigned_intDI_type_node) - return "ull"; - else if (type_node == unsigned_intHI_type_node) - return "us"; - else if (type_node == unsigned_intQI_type_node) - return "uc"; - else if (type_node == unsigned_intSI_type_node) - return "ui"; - else if (type_node == unsigned_intTI_type_node) - return "uq"; - else if (type_node == unsigned_V1TI_type_node) - return "vuq"; - else if (type_node == unsigned_V2DI_type_node) - return "vull"; - else if (type_node == unsigned_V4SI_type_node) - return "vui"; - else if (type_node == unsigned_V8HI_type_node) - return "vus"; - else if (type_node == unsigned_V16QI_type_node) - return "vuc"; - else if (type_node == V16QI_type_node) - return "vsc"; - else if (type_node == V1TI_type_node) - return "vsq"; - else if (type_node == V2DF_type_node) - return "vd"; - else if (type_node == V2DI_type_node) - return "vsll"; - else if (type_node == V4SF_type_node) - return "vf"; - else if (type_node == V4SI_type_node) - return "vsi"; - else if (type_node == V8HI_type_node) - return "vss"; - else if (type_node == pixel_V8HI_type_node) - return "vp"; - else if (type_node == pcvoid_type_node) - return "voidc*"; - else if (type_node == float128_type_node) - return "_Float128"; - else if (type_node == vector_pair_type_node) - return "__vector_pair"; - else if (type_node == vector_quad_type_node) - return "__vector_quad"; - - return "unknown"; -} - -static rtx -altivec_expand_predicate_builtin (enum insn_code icode, tree exp, rtx target) -{ - rtx pat, scratch; - tree cr6_form = CALL_EXPR_ARG (exp, 0); - tree arg0 = CALL_EXPR_ARG (exp, 1); - tree arg1 = CALL_EXPR_ARG (exp, 2); - rtx op0 = expand_normal (arg0); - rtx op1 = expand_normal (arg1); - machine_mode tmode = SImode; - machine_mode mode0 = insn_data[icode].operand[1].mode; - machine_mode mode1 = insn_data[icode].operand[2].mode; - int cr6_form_int; - - if (TREE_CODE (cr6_form) != INTEGER_CST) - { 
- error ("argument 1 of %qs must be a constant", - "__builtin_altivec_predicate"); - return const0_rtx; - } - else - cr6_form_int = TREE_INT_CST_LOW (cr6_form); - - gcc_assert (mode0 == mode1); - - /* If we have invalid arguments, bail out before generating bad rtl. */ - if (arg0 == error_mark_node || arg1 == error_mark_node) - return const0_rtx; - - if (target == 0 - || GET_MODE (target) != tmode - || ! (*insn_data[icode].operand[0].predicate) (target, tmode)) - target = gen_reg_rtx (tmode); - - if (! (*insn_data[icode].operand[1].predicate) (op0, mode0)) - op0 = copy_to_mode_reg (mode0, op0); - if (! (*insn_data[icode].operand[2].predicate) (op1, mode1)) - op1 = copy_to_mode_reg (mode1, op1); - - /* Note that for many of the relevant operations (e.g. cmpne or - cmpeq) with float or double operands, it makes more sense for the - mode of the allocated scratch register to select a vector of - integer. But the choice to copy the mode of operand 0 was made - long ago and there are no plans to change it. */ - scratch = gen_reg_rtx (mode0); - - pat = GEN_FCN (icode) (scratch, op0, op1); - if (! pat) - return 0; - emit_insn (pat); - - /* The vec_any* and vec_all* predicates use the same opcodes for two - different operations, but the bits in CR6 will be different - depending on what information we want. So we have to play tricks - with CR6 to get the right bits out. - - If you think this is disgusting, look at the specs for the - AltiVec predicates. */ - - switch (cr6_form_int) - { - case 0: - emit_insn (gen_cr6_test_for_zero (target)); - break; - case 1: - emit_insn (gen_cr6_test_for_zero_reverse (target)); - break; - case 2: - emit_insn (gen_cr6_test_for_lt (target)); - break; - case 3: - emit_insn (gen_cr6_test_for_lt_reverse (target)); - break; - default: - error ("argument 1 of %qs is out of range", - "__builtin_altivec_predicate"); - break; - } - - return target; -} - rtx swap_endian_selector_for_mode (machine_mode mode) { @@ -3100,3271 +2839,6 @@ swap_endian_selector_for_mode (machine_mode mode) gen_rtvec_v (16, perm))); } -/* Return the correct ICODE value depending on whether we are - setting or reading the HTM SPRs. */ -static inline enum insn_code -rs6000_htm_spr_icode (bool nonvoid) -{ - if (nonvoid) - return (TARGET_POWERPC64) ? CODE_FOR_htm_mfspr_di : CODE_FOR_htm_mfspr_si; - else - return (TARGET_POWERPC64) ? CODE_FOR_htm_mtspr_di : CODE_FOR_htm_mtspr_si; -} - -/* Expand vec_init builtin. */ -static rtx -altivec_expand_vec_init_builtin (tree type, tree exp, rtx target) -{ - machine_mode tmode = TYPE_MODE (type); - machine_mode inner_mode = GET_MODE_INNER (tmode); - int i, n_elt = GET_MODE_NUNITS (tmode); - - gcc_assert (VECTOR_MODE_P (tmode)); - gcc_assert (n_elt == call_expr_nargs (exp)); - - if (!target || !register_operand (target, tmode)) - target = gen_reg_rtx (tmode); - - /* If we have a vector compromised of a single element, such as V1TImode, do - the initialization directly. */ - if (n_elt == 1 && GET_MODE_SIZE (tmode) == GET_MODE_SIZE (inner_mode)) - { - rtx x = expand_normal (CALL_EXPR_ARG (exp, 0)); - emit_move_insn (target, gen_lowpart (tmode, x)); - } - else - { - rtvec v = rtvec_alloc (n_elt); - - for (i = 0; i < n_elt; ++i) - { - rtx x = expand_normal (CALL_EXPR_ARG (exp, i)); - RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x); - } - - rs6000_expand_vector_init (target, gen_rtx_PARALLEL (tmode, v)); - } - - return target; -} - -/* Return the integer constant in ARG. Constrain it to be in the range - of the subparts of VEC_TYPE; issue an error if not. 
*/ - -static int -get_element_number (tree vec_type, tree arg) -{ - unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1; - - if (!tree_fits_uhwi_p (arg) - || (elt = tree_to_uhwi (arg), elt > max)) - { - error ("selector must be an integer constant in the range [0, %wi]", max); - return 0; - } - - return elt; -} - -/* Expand vec_set builtin. */ -static rtx -altivec_expand_vec_set_builtin (tree exp) -{ - machine_mode tmode, mode1; - tree arg0, arg1, arg2; - int elt; - rtx op0, op1; - - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - arg2 = CALL_EXPR_ARG (exp, 2); - - tmode = TYPE_MODE (TREE_TYPE (arg0)); - mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0))); - gcc_assert (VECTOR_MODE_P (tmode)); - - op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL); - op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL); - elt = get_element_number (TREE_TYPE (arg0), arg2); - - if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode) - op1 = convert_modes (mode1, GET_MODE (op1), op1, true); - - op0 = force_reg (tmode, op0); - op1 = force_reg (mode1, op1); - - rs6000_expand_vector_set (op0, op1, GEN_INT (elt)); - - return op0; -} - -/* Expand vec_ext builtin. */ -static rtx -altivec_expand_vec_ext_builtin (tree exp, rtx target) -{ - machine_mode tmode, mode0; - tree arg0, arg1; - rtx op0; - rtx op1; - - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - - if (TREE_CODE (arg1) == INTEGER_CST) - { - unsigned HOST_WIDE_INT elt; - unsigned HOST_WIDE_INT size = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); - unsigned int truncated_selector; - /* Even if !tree_fits_uhwi_p (arg1)), TREE_INT_CST_LOW (arg0) - returns low-order bits of INTEGER_CST for modulo indexing. */ - elt = TREE_INT_CST_LOW (arg1); - truncated_selector = elt % size; - op1 = GEN_INT (truncated_selector); - } - - tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0))); - mode0 = TYPE_MODE (TREE_TYPE (arg0)); - gcc_assert (VECTOR_MODE_P (mode0)); - - op0 = force_reg (mode0, op0); - - if (optimize || !target || !register_operand (target, tmode)) - target = gen_reg_rtx (tmode); - - rs6000_expand_vector_extract (target, op0, op1); - - return target; -} - -/* Raise an error message for a builtin function that is called without the - appropriate target options being set. 
*/ - -void -rs6000_invalid_builtin (enum rs6000_gen_builtins fncode) -{ - size_t j = (size_t) fncode; - const char *name = rs6000_builtin_info[j].bifname; - - switch (rs6000_builtin_info[j].enable) - { - case ENB_P5: - error ("%qs requires the %qs option", name, "-mcpu=power5"); - break; - case ENB_P6: - error ("%qs requires the %qs option", name, "-mcpu=power6"); - break; - case ENB_P6_64: - error ("%qs requires the %qs option and either the %qs or %qs option", - name, "-mcpu=power6", "-m64", "-mpowerpc64"); - break; - case ENB_ALTIVEC: - error ("%qs requires the %qs option", name, "-maltivec"); - break; - case ENB_CELL: - error ("%qs requires the %qs option", name, "-mcpu=cell"); - break; - case ENB_VSX: - error ("%qs requires the %qs option", name, "-mvsx"); - break; - case ENB_P7: - error ("%qs requires the %qs option", name, "-mcpu=power7"); - break; - case ENB_P7_64: - error ("%qs requires the %qs option and either the %qs or %qs option", - name, "-mcpu=power7", "-m64", "-mpowerpc64"); - break; - case ENB_P8: - error ("%qs requires the %qs option", name, "-mcpu=power8"); - break; - case ENB_P8V: - error ("%qs requires the %qs and %qs options", name, "-mcpu=power8", - "-mvsx"); - break; - case ENB_P9: - error ("%qs requires the %qs option", name, "-mcpu=power9"); - break; - case ENB_P9_64: - error ("%qs requires the %qs option and either the %qs or %qs option", - name, "-mcpu=power9", "-m64", "-mpowerpc64"); - break; - case ENB_P9V: - error ("%qs requires the %qs and %qs options", name, "-mcpu=power9", - "-mvsx"); - break; - case ENB_IEEE128_HW: - error ("%qs requires quad-precision floating-point arithmetic", name); - break; - case ENB_DFP: - error ("%qs requires the %qs option", name, "-mhard-dfp"); - break; - case ENB_CRYPTO: - error ("%qs requires the %qs option", name, "-mcrypto"); - break; - case ENB_HTM: - error ("%qs requires the %qs option", name, "-mhtm"); - break; - case ENB_P10: - error ("%qs requires the %qs option", name, "-mcpu=power10"); - break; - case ENB_P10_64: - error ("%qs requires the %qs option and either the %qs or %qs option", - name, "-mcpu=power10", "-m64", "-mpowerpc64"); - break; - case ENB_MMA: - error ("%qs requires the %qs option", name, "-mmma"); - break; - default: - case ENB_ALWAYS: - gcc_unreachable (); - } -} - -/* Target hook for early folding of built-ins, shamelessly stolen - from ia64.cc. */ - -tree -rs6000_fold_builtin (tree fndecl ATTRIBUTE_UNUSED, - int n_args ATTRIBUTE_UNUSED, - tree *args ATTRIBUTE_UNUSED, - bool ignore ATTRIBUTE_UNUSED) -{ -#ifdef SUBTARGET_FOLD_BUILTIN - return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore); -#else - return NULL_TREE; -#endif -} - -/* Helper function to handle the gimple folding of a vector compare - operation. This sets up true/false vectors, and uses the - VEC_COND_EXPR operation. - CODE indicates which comparison is to be made. (EQ, GT, ...). - TYPE indicates the type of the result. - Code is inserted before GSI. */ -static tree -fold_build_vec_cmp (tree_code code, tree type, tree arg0, tree arg1, - gimple_stmt_iterator *gsi) -{ - tree cmp_type = truth_type_for (type); - tree zero_vec = build_zero_cst (type); - tree minus_one_vec = build_minus_one_cst (type); - tree temp = create_tmp_reg_or_ssa_name (cmp_type); - gimple *g = gimple_build_assign (temp, code, arg0, arg1); - gsi_insert_before (gsi, g, GSI_SAME_STMT); - return fold_build3 (VEC_COND_EXPR, type, temp, minus_one_vec, zero_vec); -} - -/* Helper function to handle the in-between steps for the - vector compare built-ins. 
*/ -static void -fold_compare_helper (gimple_stmt_iterator *gsi, tree_code code, gimple *stmt) -{ - tree arg0 = gimple_call_arg (stmt, 0); - tree arg1 = gimple_call_arg (stmt, 1); - tree lhs = gimple_call_lhs (stmt); - tree cmp = fold_build_vec_cmp (code, TREE_TYPE (lhs), arg0, arg1, gsi); - gimple *g = gimple_build_assign (lhs, cmp); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); -} - -/* Helper function to map V2DF and V4SF types to their - integral equivalents (V2DI and V4SI). */ -tree map_to_integral_tree_type (tree input_tree_type) -{ - if (INTEGRAL_TYPE_P (TREE_TYPE (input_tree_type))) - return input_tree_type; - else - { - if (types_compatible_p (TREE_TYPE (input_tree_type), - TREE_TYPE (V2DF_type_node))) - return V2DI_type_node; - else if (types_compatible_p (TREE_TYPE (input_tree_type), - TREE_TYPE (V4SF_type_node))) - return V4SI_type_node; - else - gcc_unreachable (); - } -} - -/* Helper function to handle the vector merge[hl] built-ins. The - implementation difference between h and l versions for this code are in - the values used when building of the permute vector for high word versus - low word merge. The variance is keyed off the use_high parameter. */ -static void -fold_mergehl_helper (gimple_stmt_iterator *gsi, gimple *stmt, int use_high) -{ - tree arg0 = gimple_call_arg (stmt, 0); - tree arg1 = gimple_call_arg (stmt, 1); - tree lhs = gimple_call_lhs (stmt); - tree lhs_type = TREE_TYPE (lhs); - int n_elts = TYPE_VECTOR_SUBPARTS (lhs_type); - int midpoint = n_elts / 2; - int offset = 0; - - if (use_high == 1) - offset = midpoint; - - /* The permute_type will match the lhs for integral types. For double and - float types, the permute type needs to map to the V2 or V4 type that - matches size. */ - tree permute_type; - permute_type = map_to_integral_tree_type (lhs_type); - tree_vector_builder elts (permute_type, VECTOR_CST_NELTS (arg0), 1); - - for (int i = 0; i < midpoint; i++) - { - elts.safe_push (build_int_cst (TREE_TYPE (permute_type), - offset + i)); - elts.safe_push (build_int_cst (TREE_TYPE (permute_type), - offset + n_elts + i)); - } - - tree permute = elts.build (); - - gimple *g = gimple_build_assign (lhs, VEC_PERM_EXPR, arg0, arg1, permute); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); -} - -/* Helper function to handle the vector merge[eo] built-ins. */ -static void -fold_mergeeo_helper (gimple_stmt_iterator *gsi, gimple *stmt, int use_odd) -{ - tree arg0 = gimple_call_arg (stmt, 0); - tree arg1 = gimple_call_arg (stmt, 1); - tree lhs = gimple_call_lhs (stmt); - tree lhs_type = TREE_TYPE (lhs); - int n_elts = TYPE_VECTOR_SUBPARTS (lhs_type); - - /* The permute_type will match the lhs for integral types. For double and - float types, the permute type needs to map to the V2 or V4 type that - matches size. */ - tree permute_type; - permute_type = map_to_integral_tree_type (lhs_type); - - tree_vector_builder elts (permute_type, VECTOR_CST_NELTS (arg0), 1); - - /* Build the permute vector. */ - for (int i = 0; i < n_elts / 2; i++) - { - elts.safe_push (build_int_cst (TREE_TYPE (permute_type), - 2*i + use_odd)); - elts.safe_push (build_int_cst (TREE_TYPE (permute_type), - 2*i + use_odd + n_elts)); - } - - tree permute = elts.build (); - - gimple *g = gimple_build_assign (lhs, VEC_PERM_EXPR, arg0, arg1, permute); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); -} - -/* Helper function to sort out which built-ins may be valid without having - a LHS. 
*/ -static bool -rs6000_builtin_valid_without_lhs (enum rs6000_gen_builtins fn_code, - tree fndecl) -{ - if (TREE_TYPE (TREE_TYPE (fndecl)) == void_type_node) - return true; - - switch (fn_code) - { - case RS6000_BIF_STVX_V16QI: - case RS6000_BIF_STVX_V8HI: - case RS6000_BIF_STVX_V4SI: - case RS6000_BIF_STVX_V4SF: - case RS6000_BIF_STVX_V2DI: - case RS6000_BIF_STVX_V2DF: - case RS6000_BIF_STXVW4X_V16QI: - case RS6000_BIF_STXVW4X_V8HI: - case RS6000_BIF_STXVW4X_V4SF: - case RS6000_BIF_STXVW4X_V4SI: - case RS6000_BIF_STXVD2X_V2DF: - case RS6000_BIF_STXVD2X_V2DI: - return true; - default: - return false; - } -} - -/* Check whether a builtin function is supported in this target - configuration. */ -bool -rs6000_builtin_is_supported (enum rs6000_gen_builtins fncode) -{ - switch (rs6000_builtin_info[(size_t) fncode].enable) - { - case ENB_ALWAYS: - return true; - case ENB_P5: - return TARGET_POPCNTB; - case ENB_P6: - return TARGET_CMPB; - case ENB_P6_64: - return TARGET_CMPB && TARGET_POWERPC64; - case ENB_P7: - return TARGET_POPCNTD; - case ENB_P7_64: - return TARGET_POPCNTD && TARGET_POWERPC64; - case ENB_P8: - return TARGET_DIRECT_MOVE; - case ENB_P8V: - return TARGET_P8_VECTOR; - case ENB_P9: - return TARGET_MODULO; - case ENB_P9_64: - return TARGET_MODULO && TARGET_POWERPC64; - case ENB_P9V: - return TARGET_P9_VECTOR; - case ENB_P10: - return TARGET_POWER10; - case ENB_P10_64: - return TARGET_POWER10 && TARGET_POWERPC64; - case ENB_ALTIVEC: - return TARGET_ALTIVEC; - case ENB_VSX: - return TARGET_VSX; - case ENB_CELL: - return TARGET_ALTIVEC && rs6000_cpu == PROCESSOR_CELL; - case ENB_IEEE128_HW: - return TARGET_FLOAT128_HW; - case ENB_DFP: - return TARGET_DFP; - case ENB_CRYPTO: - return TARGET_CRYPTO; - case ENB_HTM: - return TARGET_HTM; - case ENB_MMA: - return TARGET_MMA; - default: - gcc_unreachable (); - } - gcc_unreachable (); -} - -/* Expand the MMA built-ins early, so that we can convert the pass-by-reference - __vector_quad arguments into pass-by-value arguments, leading to more - efficient code generation. */ -static bool -rs6000_gimple_fold_mma_builtin (gimple_stmt_iterator *gsi, - rs6000_gen_builtins fn_code) -{ - gimple *stmt = gsi_stmt (*gsi); - size_t fncode = (size_t) fn_code; - - if (!bif_is_mma (rs6000_builtin_info[fncode])) - return false; - - /* Each call that can be gimple-expanded has an associated built-in - function that it will expand into. If this one doesn't, we have - already expanded it! Exceptions: lxvp and stxvp. */ - if (rs6000_builtin_info[fncode].assoc_bif == RS6000_BIF_NONE - && fncode != RS6000_BIF_LXVP - && fncode != RS6000_BIF_STXVP) - return false; - - bifdata *bd = &rs6000_builtin_info[fncode]; - unsigned nopnds = bd->nargs; - gimple_seq new_seq = NULL; - gimple *new_call; - tree new_decl; - - /* Compatibility built-ins; we used to call these - __builtin_mma_{dis,}assemble_pair, but now we call them - __builtin_vsx_{dis,}assemble_pair. Handle the old versions. */ - if (fncode == RS6000_BIF_ASSEMBLE_PAIR) - fncode = RS6000_BIF_ASSEMBLE_PAIR_V; - else if (fncode == RS6000_BIF_DISASSEMBLE_PAIR) - fncode = RS6000_BIF_DISASSEMBLE_PAIR_V; - - if (fncode == RS6000_BIF_DISASSEMBLE_ACC - || fncode == RS6000_BIF_DISASSEMBLE_PAIR_V) - { - /* This is an MMA disassemble built-in function. */ - push_gimplify_context (true); - unsigned nvec = (fncode == RS6000_BIF_DISASSEMBLE_ACC) ? 
4 : 2; - tree dst_ptr = gimple_call_arg (stmt, 0); - tree src_ptr = gimple_call_arg (stmt, 1); - tree src_type = TREE_TYPE (src_ptr); - tree src = create_tmp_reg_or_ssa_name (TREE_TYPE (src_type)); - gimplify_assign (src, build_simple_mem_ref (src_ptr), &new_seq); - - /* If we are not disassembling an accumulator/pair or our destination is - another accumulator/pair, then just copy the entire thing as is. */ - if ((fncode == RS6000_BIF_DISASSEMBLE_ACC - && TREE_TYPE (TREE_TYPE (dst_ptr)) == vector_quad_type_node) - || (fncode == RS6000_BIF_DISASSEMBLE_PAIR_V - && TREE_TYPE (TREE_TYPE (dst_ptr)) == vector_pair_type_node)) - { - tree dst = build_simple_mem_ref (build1 (VIEW_CONVERT_EXPR, - src_type, dst_ptr)); - gimplify_assign (dst, src, &new_seq); - pop_gimplify_context (NULL); - gsi_replace_with_seq (gsi, new_seq, true); - return true; - } - - /* If we're disassembling an accumulator into a different type, we need - to emit a xxmfacc instruction now, since we cannot do it later. */ - if (fncode == RS6000_BIF_DISASSEMBLE_ACC) - { - new_decl = rs6000_builtin_decls[RS6000_BIF_XXMFACC_INTERNAL]; - new_call = gimple_build_call (new_decl, 1, src); - src = create_tmp_reg_or_ssa_name (vector_quad_type_node); - gimple_call_set_lhs (new_call, src); - gimple_seq_add_stmt (&new_seq, new_call); - } - - /* Copy the accumulator/pair vector by vector. */ - new_decl - = rs6000_builtin_decls[rs6000_builtin_info[fncode].assoc_bif]; - tree dst_type = build_pointer_type_for_mode (unsigned_V16QI_type_node, - ptr_mode, true); - tree dst_base = build1 (VIEW_CONVERT_EXPR, dst_type, dst_ptr); - for (unsigned i = 0; i < nvec; i++) - { - unsigned index = WORDS_BIG_ENDIAN ? i : nvec - 1 - i; - tree dst = build2 (MEM_REF, unsigned_V16QI_type_node, dst_base, - build_int_cst (dst_type, index * 16)); - tree dstssa = create_tmp_reg_or_ssa_name (unsigned_V16QI_type_node); - new_call = gimple_build_call (new_decl, 2, src, - build_int_cstu (uint16_type_node, i)); - gimple_call_set_lhs (new_call, dstssa); - gimple_seq_add_stmt (&new_seq, new_call); - gimplify_assign (dst, dstssa, &new_seq); - } - pop_gimplify_context (NULL); - gsi_replace_with_seq (gsi, new_seq, true); - return true; - } - - /* TODO: Do some factoring on these two chunks. */ - if (fncode == RS6000_BIF_LXVP) - { - push_gimplify_context (true); - tree offset = gimple_call_arg (stmt, 0); - tree ptr = gimple_call_arg (stmt, 1); - tree lhs = gimple_call_lhs (stmt); - if (TREE_TYPE (TREE_TYPE (ptr)) != vector_pair_type_node) - ptr = build1 (VIEW_CONVERT_EXPR, - build_pointer_type (vector_pair_type_node), ptr); - tree mem = build_simple_mem_ref (build2 (POINTER_PLUS_EXPR, - TREE_TYPE (ptr), ptr, offset)); - gimplify_assign (lhs, mem, &new_seq); - pop_gimplify_context (NULL); - gsi_replace_with_seq (gsi, new_seq, true); - return true; - } - - if (fncode == RS6000_BIF_STXVP) - { - push_gimplify_context (true); - tree src = gimple_call_arg (stmt, 0); - tree offset = gimple_call_arg (stmt, 1); - tree ptr = gimple_call_arg (stmt, 2); - if (TREE_TYPE (TREE_TYPE (ptr)) != vector_pair_type_node) - ptr = build1 (VIEW_CONVERT_EXPR, - build_pointer_type (vector_pair_type_node), ptr); - tree mem = build_simple_mem_ref (build2 (POINTER_PLUS_EXPR, - TREE_TYPE (ptr), ptr, offset)); - gimplify_assign (mem, src, &new_seq); - pop_gimplify_context (NULL); - gsi_replace_with_seq (gsi, new_seq, true); - return true; - } - - /* Convert this built-in into an internal version that uses pass-by-value - arguments. The internal built-in is found in the assoc_bif field. 
*/ - new_decl = rs6000_builtin_decls[rs6000_builtin_info[fncode].assoc_bif]; - tree lhs, op[MAX_MMA_OPERANDS]; - tree acc = gimple_call_arg (stmt, 0); - push_gimplify_context (true); - - if (bif_is_quad (*bd)) - { - /* This built-in has a pass-by-reference accumulator input, so load it - into a temporary accumulator for use as a pass-by-value input. */ - op[0] = create_tmp_reg_or_ssa_name (vector_quad_type_node); - for (unsigned i = 1; i < nopnds; i++) - op[i] = gimple_call_arg (stmt, i); - gimplify_assign (op[0], build_simple_mem_ref (acc), &new_seq); - } - else - { - /* This built-in does not use its pass-by-reference accumulator argument - as an input argument, so remove it from the input list. */ - nopnds--; - for (unsigned i = 0; i < nopnds; i++) - op[i] = gimple_call_arg (stmt, i + 1); - } - - switch (nopnds) - { - case 0: - new_call = gimple_build_call (new_decl, 0); - break; - case 1: - new_call = gimple_build_call (new_decl, 1, op[0]); - break; - case 2: - new_call = gimple_build_call (new_decl, 2, op[0], op[1]); - break; - case 3: - new_call = gimple_build_call (new_decl, 3, op[0], op[1], op[2]); - break; - case 4: - new_call = gimple_build_call (new_decl, 4, op[0], op[1], op[2], op[3]); - break; - case 5: - new_call = gimple_build_call (new_decl, 5, op[0], op[1], op[2], op[3], - op[4]); - break; - case 6: - new_call = gimple_build_call (new_decl, 6, op[0], op[1], op[2], op[3], - op[4], op[5]); - break; - case 7: - new_call = gimple_build_call (new_decl, 7, op[0], op[1], op[2], op[3], - op[4], op[5], op[6]); - break; - default: - gcc_unreachable (); - } - - if (fncode == RS6000_BIF_BUILD_PAIR || fncode == RS6000_BIF_ASSEMBLE_PAIR_V) - lhs = create_tmp_reg_or_ssa_name (vector_pair_type_node); - else - lhs = create_tmp_reg_or_ssa_name (vector_quad_type_node); - gimple_call_set_lhs (new_call, lhs); - gimple_seq_add_stmt (&new_seq, new_call); - gimplify_assign (build_simple_mem_ref (acc), lhs, &new_seq); - pop_gimplify_context (NULL); - gsi_replace_with_seq (gsi, new_seq, true); - - return true; -} - -/* Fold a machine-dependent built-in in GIMPLE. (For folding into - a constant, use rs6000_fold_builtin.) */ -bool -rs6000_gimple_fold_builtin (gimple_stmt_iterator *gsi) -{ - gimple *stmt = gsi_stmt (*gsi); - tree fndecl = gimple_call_fndecl (stmt); - gcc_checking_assert (fndecl && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD); - enum rs6000_gen_builtins fn_code - = (enum rs6000_gen_builtins) DECL_MD_FUNCTION_CODE (fndecl); - tree arg0, arg1, lhs, temp; - enum tree_code bcode; - gimple *g; - - size_t uns_fncode = (size_t) fn_code; - enum insn_code icode = rs6000_builtin_info[uns_fncode].icode; - const char *fn_name1 = rs6000_builtin_info[uns_fncode].bifname; - const char *fn_name2 = (icode != CODE_FOR_nothing) - ? get_insn_name ((int) icode) - : "nothing"; - - if (TARGET_DEBUG_BUILTIN) - fprintf (stderr, "rs6000_gimple_fold_builtin %d %s %s\n", - fn_code, fn_name1, fn_name2); - - if (!rs6000_fold_gimple) - return false; - - /* Prevent gimple folding for code that does not have a LHS, unless it is - allowed per the rs6000_builtin_valid_without_lhs helper function. */ - if (!gimple_call_lhs (stmt) - && !rs6000_builtin_valid_without_lhs (fn_code, fndecl)) - return false; - - /* Don't fold invalid builtins, let rs6000_expand_builtin diagnose it. */ - if (!rs6000_builtin_is_supported (fn_code)) - return false; - - if (rs6000_gimple_fold_mma_builtin (gsi, fn_code)) - return true; - - switch (fn_code) - { - /* Flavors of vec_add. 
We deliberately don't expand - RS6000_BIF_VADDUQM as it gets lowered from V1TImode to - TImode, resulting in much poorer code generation. */ - case RS6000_BIF_VADDUBM: - case RS6000_BIF_VADDUHM: - case RS6000_BIF_VADDUWM: - case RS6000_BIF_VADDUDM: - case RS6000_BIF_VADDFP: - case RS6000_BIF_XVADDDP: - case RS6000_BIF_XVADDSP: - bcode = PLUS_EXPR; - do_binary: - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - lhs = gimple_call_lhs (stmt); - if (INTEGRAL_TYPE_P (TREE_TYPE (TREE_TYPE (lhs))) - && !TYPE_OVERFLOW_WRAPS (TREE_TYPE (TREE_TYPE (lhs)))) - { - /* Ensure the binary operation is performed in a type - that wraps if it is integral type. */ - gimple_seq stmts = NULL; - tree type = unsigned_type_for (TREE_TYPE (lhs)); - tree uarg0 = gimple_build (&stmts, VIEW_CONVERT_EXPR, - type, arg0); - tree uarg1 = gimple_build (&stmts, VIEW_CONVERT_EXPR, - type, arg1); - tree res = gimple_build (&stmts, gimple_location (stmt), bcode, - type, uarg0, uarg1); - gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); - g = gimple_build_assign (lhs, VIEW_CONVERT_EXPR, - build1 (VIEW_CONVERT_EXPR, - TREE_TYPE (lhs), res)); - gsi_replace (gsi, g, true); - return true; - } - g = gimple_build_assign (lhs, bcode, arg0, arg1); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - /* Flavors of vec_sub. We deliberately don't expand - RS6000_BIF_VSUBUQM. */ - case RS6000_BIF_VSUBUBM: - case RS6000_BIF_VSUBUHM: - case RS6000_BIF_VSUBUWM: - case RS6000_BIF_VSUBUDM: - case RS6000_BIF_VSUBFP: - case RS6000_BIF_XVSUBDP: - case RS6000_BIF_XVSUBSP: - bcode = MINUS_EXPR; - goto do_binary; - case RS6000_BIF_XVMULSP: - case RS6000_BIF_XVMULDP: - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - lhs = gimple_call_lhs (stmt); - g = gimple_build_assign (lhs, MULT_EXPR, arg0, arg1); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - /* Even element flavors of vec_mul (signed). */ - case RS6000_BIF_VMULESB: - case RS6000_BIF_VMULESH: - case RS6000_BIF_VMULESW: - /* Even element flavors of vec_mul (unsigned). */ - case RS6000_BIF_VMULEUB: - case RS6000_BIF_VMULEUH: - case RS6000_BIF_VMULEUW: - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - lhs = gimple_call_lhs (stmt); - g = gimple_build_assign (lhs, VEC_WIDEN_MULT_EVEN_EXPR, arg0, arg1); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - /* Odd element flavors of vec_mul (signed). */ - case RS6000_BIF_VMULOSB: - case RS6000_BIF_VMULOSH: - case RS6000_BIF_VMULOSW: - /* Odd element flavors of vec_mul (unsigned). */ - case RS6000_BIF_VMULOUB: - case RS6000_BIF_VMULOUH: - case RS6000_BIF_VMULOUW: - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - lhs = gimple_call_lhs (stmt); - g = gimple_build_assign (lhs, VEC_WIDEN_MULT_ODD_EXPR, arg0, arg1); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - /* Flavors of vec_div (Integer). */ - case RS6000_BIF_DIV_V2DI: - case RS6000_BIF_UDIV_V2DI: - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - lhs = gimple_call_lhs (stmt); - g = gimple_build_assign (lhs, TRUNC_DIV_EXPR, arg0, arg1); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - /* Flavors of vec_div (Float). 
*/ - case RS6000_BIF_XVDIVSP: - case RS6000_BIF_XVDIVDP: - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - lhs = gimple_call_lhs (stmt); - g = gimple_build_assign (lhs, RDIV_EXPR, arg0, arg1); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - /* Flavors of vec_and. */ - case RS6000_BIF_VAND_V16QI_UNS: - case RS6000_BIF_VAND_V16QI: - case RS6000_BIF_VAND_V8HI_UNS: - case RS6000_BIF_VAND_V8HI: - case RS6000_BIF_VAND_V4SI_UNS: - case RS6000_BIF_VAND_V4SI: - case RS6000_BIF_VAND_V2DI_UNS: - case RS6000_BIF_VAND_V2DI: - case RS6000_BIF_VAND_V4SF: - case RS6000_BIF_VAND_V2DF: - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - lhs = gimple_call_lhs (stmt); - g = gimple_build_assign (lhs, BIT_AND_EXPR, arg0, arg1); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - /* Flavors of vec_andc. */ - case RS6000_BIF_VANDC_V16QI_UNS: - case RS6000_BIF_VANDC_V16QI: - case RS6000_BIF_VANDC_V8HI_UNS: - case RS6000_BIF_VANDC_V8HI: - case RS6000_BIF_VANDC_V4SI_UNS: - case RS6000_BIF_VANDC_V4SI: - case RS6000_BIF_VANDC_V2DI_UNS: - case RS6000_BIF_VANDC_V2DI: - case RS6000_BIF_VANDC_V4SF: - case RS6000_BIF_VANDC_V2DF: - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - lhs = gimple_call_lhs (stmt); - temp = create_tmp_reg_or_ssa_name (TREE_TYPE (arg1)); - g = gimple_build_assign (temp, BIT_NOT_EXPR, arg1); - gimple_set_location (g, gimple_location (stmt)); - gsi_insert_before (gsi, g, GSI_SAME_STMT); - g = gimple_build_assign (lhs, BIT_AND_EXPR, arg0, temp); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - /* Flavors of vec_nand. */ - case RS6000_BIF_NAND_V16QI_UNS: - case RS6000_BIF_NAND_V16QI: - case RS6000_BIF_NAND_V8HI_UNS: - case RS6000_BIF_NAND_V8HI: - case RS6000_BIF_NAND_V4SI_UNS: - case RS6000_BIF_NAND_V4SI: - case RS6000_BIF_NAND_V2DI_UNS: - case RS6000_BIF_NAND_V2DI: - case RS6000_BIF_NAND_V4SF: - case RS6000_BIF_NAND_V2DF: - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - lhs = gimple_call_lhs (stmt); - temp = create_tmp_reg_or_ssa_name (TREE_TYPE (arg1)); - g = gimple_build_assign (temp, BIT_AND_EXPR, arg0, arg1); - gimple_set_location (g, gimple_location (stmt)); - gsi_insert_before (gsi, g, GSI_SAME_STMT); - g = gimple_build_assign (lhs, BIT_NOT_EXPR, temp); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - /* Flavors of vec_or. */ - case RS6000_BIF_VOR_V16QI_UNS: - case RS6000_BIF_VOR_V16QI: - case RS6000_BIF_VOR_V8HI_UNS: - case RS6000_BIF_VOR_V8HI: - case RS6000_BIF_VOR_V4SI_UNS: - case RS6000_BIF_VOR_V4SI: - case RS6000_BIF_VOR_V2DI_UNS: - case RS6000_BIF_VOR_V2DI: - case RS6000_BIF_VOR_V4SF: - case RS6000_BIF_VOR_V2DF: - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - lhs = gimple_call_lhs (stmt); - g = gimple_build_assign (lhs, BIT_IOR_EXPR, arg0, arg1); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - /* flavors of vec_orc. 
*/ - case RS6000_BIF_ORC_V16QI_UNS: - case RS6000_BIF_ORC_V16QI: - case RS6000_BIF_ORC_V8HI_UNS: - case RS6000_BIF_ORC_V8HI: - case RS6000_BIF_ORC_V4SI_UNS: - case RS6000_BIF_ORC_V4SI: - case RS6000_BIF_ORC_V2DI_UNS: - case RS6000_BIF_ORC_V2DI: - case RS6000_BIF_ORC_V4SF: - case RS6000_BIF_ORC_V2DF: - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - lhs = gimple_call_lhs (stmt); - temp = create_tmp_reg_or_ssa_name (TREE_TYPE (arg1)); - g = gimple_build_assign (temp, BIT_NOT_EXPR, arg1); - gimple_set_location (g, gimple_location (stmt)); - gsi_insert_before (gsi, g, GSI_SAME_STMT); - g = gimple_build_assign (lhs, BIT_IOR_EXPR, arg0, temp); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - /* Flavors of vec_xor. */ - case RS6000_BIF_VXOR_V16QI_UNS: - case RS6000_BIF_VXOR_V16QI: - case RS6000_BIF_VXOR_V8HI_UNS: - case RS6000_BIF_VXOR_V8HI: - case RS6000_BIF_VXOR_V4SI_UNS: - case RS6000_BIF_VXOR_V4SI: - case RS6000_BIF_VXOR_V2DI_UNS: - case RS6000_BIF_VXOR_V2DI: - case RS6000_BIF_VXOR_V4SF: - case RS6000_BIF_VXOR_V2DF: - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - lhs = gimple_call_lhs (stmt); - g = gimple_build_assign (lhs, BIT_XOR_EXPR, arg0, arg1); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - /* Flavors of vec_nor. */ - case RS6000_BIF_VNOR_V16QI_UNS: - case RS6000_BIF_VNOR_V16QI: - case RS6000_BIF_VNOR_V8HI_UNS: - case RS6000_BIF_VNOR_V8HI: - case RS6000_BIF_VNOR_V4SI_UNS: - case RS6000_BIF_VNOR_V4SI: - case RS6000_BIF_VNOR_V2DI_UNS: - case RS6000_BIF_VNOR_V2DI: - case RS6000_BIF_VNOR_V4SF: - case RS6000_BIF_VNOR_V2DF: - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - lhs = gimple_call_lhs (stmt); - temp = create_tmp_reg_or_ssa_name (TREE_TYPE (arg1)); - g = gimple_build_assign (temp, BIT_IOR_EXPR, arg0, arg1); - gimple_set_location (g, gimple_location (stmt)); - gsi_insert_before (gsi, g, GSI_SAME_STMT); - g = gimple_build_assign (lhs, BIT_NOT_EXPR, temp); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - /* flavors of vec_abs. */ - case RS6000_BIF_ABS_V16QI: - case RS6000_BIF_ABS_V8HI: - case RS6000_BIF_ABS_V4SI: - case RS6000_BIF_ABS_V4SF: - case RS6000_BIF_ABS_V2DI: - case RS6000_BIF_XVABSDP: - case RS6000_BIF_XVABSSP: - arg0 = gimple_call_arg (stmt, 0); - if (INTEGRAL_TYPE_P (TREE_TYPE (TREE_TYPE (arg0))) - && !TYPE_OVERFLOW_WRAPS (TREE_TYPE (TREE_TYPE (arg0)))) - return false; - lhs = gimple_call_lhs (stmt); - g = gimple_build_assign (lhs, ABS_EXPR, arg0); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - /* flavors of vec_min. */ - case RS6000_BIF_XVMINDP: - case RS6000_BIF_XVMINSP: - case RS6000_BIF_VMINFP: - { - lhs = gimple_call_lhs (stmt); - tree type = TREE_TYPE (lhs); - if (HONOR_NANS (type)) - return false; - gcc_fallthrough (); - } - case RS6000_BIF_VMINSD: - case RS6000_BIF_VMINUD: - case RS6000_BIF_VMINSB: - case RS6000_BIF_VMINSH: - case RS6000_BIF_VMINSW: - case RS6000_BIF_VMINUB: - case RS6000_BIF_VMINUH: - case RS6000_BIF_VMINUW: - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - lhs = gimple_call_lhs (stmt); - g = gimple_build_assign (lhs, MIN_EXPR, arg0, arg1); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - /* flavors of vec_max. 
*/ - case RS6000_BIF_XVMAXDP: - case RS6000_BIF_XVMAXSP: - case RS6000_BIF_VMAXFP: - { - lhs = gimple_call_lhs (stmt); - tree type = TREE_TYPE (lhs); - if (HONOR_NANS (type)) - return false; - gcc_fallthrough (); - } - case RS6000_BIF_VMAXSD: - case RS6000_BIF_VMAXUD: - case RS6000_BIF_VMAXSB: - case RS6000_BIF_VMAXSH: - case RS6000_BIF_VMAXSW: - case RS6000_BIF_VMAXUB: - case RS6000_BIF_VMAXUH: - case RS6000_BIF_VMAXUW: - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - lhs = gimple_call_lhs (stmt); - g = gimple_build_assign (lhs, MAX_EXPR, arg0, arg1); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - /* Flavors of vec_eqv. */ - case RS6000_BIF_EQV_V16QI: - case RS6000_BIF_EQV_V8HI: - case RS6000_BIF_EQV_V4SI: - case RS6000_BIF_EQV_V4SF: - case RS6000_BIF_EQV_V2DF: - case RS6000_BIF_EQV_V2DI: - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - lhs = gimple_call_lhs (stmt); - temp = create_tmp_reg_or_ssa_name (TREE_TYPE (arg1)); - g = gimple_build_assign (temp, BIT_XOR_EXPR, arg0, arg1); - gimple_set_location (g, gimple_location (stmt)); - gsi_insert_before (gsi, g, GSI_SAME_STMT); - g = gimple_build_assign (lhs, BIT_NOT_EXPR, temp); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - /* Flavors of vec_rotate_left. */ - case RS6000_BIF_VRLB: - case RS6000_BIF_VRLH: - case RS6000_BIF_VRLW: - case RS6000_BIF_VRLD: - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - lhs = gimple_call_lhs (stmt); - g = gimple_build_assign (lhs, LROTATE_EXPR, arg0, arg1); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - /* Flavors of vector shift right algebraic. - vec_sra{b,h,w} -> vsra{b,h,w}. */ - case RS6000_BIF_VSRAB: - case RS6000_BIF_VSRAH: - case RS6000_BIF_VSRAW: - case RS6000_BIF_VSRAD: - { - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - lhs = gimple_call_lhs (stmt); - tree arg1_type = TREE_TYPE (arg1); - tree unsigned_arg1_type = unsigned_type_for (TREE_TYPE (arg1)); - tree unsigned_element_type = unsigned_type_for (TREE_TYPE (arg1_type)); - location_t loc = gimple_location (stmt); - /* Force arg1 into the range valid matching the arg0 type. */ - /* Build a vector consisting of the max valid bit-size values. */ - int n_elts = VECTOR_CST_NELTS (arg1); - tree element_size = build_int_cst (unsigned_element_type, - 128 / n_elts); - tree_vector_builder elts (unsigned_arg1_type, n_elts, 1); - for (int i = 0; i < n_elts; i++) - elts.safe_push (element_size); - tree modulo_tree = elts.build (); - /* Modulo the provided shift value against that vector. */ - gimple_seq stmts = NULL; - tree unsigned_arg1 = gimple_build (&stmts, VIEW_CONVERT_EXPR, - unsigned_arg1_type, arg1); - tree new_arg1 = gimple_build (&stmts, loc, TRUNC_MOD_EXPR, - unsigned_arg1_type, unsigned_arg1, - modulo_tree); - gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); - /* And finally, do the shift. */ - g = gimple_build_assign (lhs, RSHIFT_EXPR, arg0, new_arg1); - gimple_set_location (g, loc); - gsi_replace (gsi, g, true); - return true; - } - /* Flavors of vector shift left. - builtin_altivec_vsl{b,h,w} -> vsl{b,h,w}. 
*/ - case RS6000_BIF_VSLB: - case RS6000_BIF_VSLH: - case RS6000_BIF_VSLW: - case RS6000_BIF_VSLD: - { - location_t loc; - gimple_seq stmts = NULL; - arg0 = gimple_call_arg (stmt, 0); - tree arg0_type = TREE_TYPE (arg0); - if (INTEGRAL_TYPE_P (TREE_TYPE (arg0_type)) - && !TYPE_OVERFLOW_WRAPS (TREE_TYPE (arg0_type))) - return false; - arg1 = gimple_call_arg (stmt, 1); - tree arg1_type = TREE_TYPE (arg1); - tree unsigned_arg1_type = unsigned_type_for (TREE_TYPE (arg1)); - tree unsigned_element_type = unsigned_type_for (TREE_TYPE (arg1_type)); - loc = gimple_location (stmt); - lhs = gimple_call_lhs (stmt); - /* Force arg1 into the range valid matching the arg0 type. */ - /* Build a vector consisting of the max valid bit-size values. */ - int n_elts = VECTOR_CST_NELTS (arg1); - int tree_size_in_bits = TREE_INT_CST_LOW (size_in_bytes (arg1_type)) - * BITS_PER_UNIT; - tree element_size = build_int_cst (unsigned_element_type, - tree_size_in_bits / n_elts); - tree_vector_builder elts (unsigned_type_for (arg1_type), n_elts, 1); - for (int i = 0; i < n_elts; i++) - elts.safe_push (element_size); - tree modulo_tree = elts.build (); - /* Modulo the provided shift value against that vector. */ - tree unsigned_arg1 = gimple_build (&stmts, VIEW_CONVERT_EXPR, - unsigned_arg1_type, arg1); - tree new_arg1 = gimple_build (&stmts, loc, TRUNC_MOD_EXPR, - unsigned_arg1_type, unsigned_arg1, - modulo_tree); - gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); - /* And finally, do the shift. */ - g = gimple_build_assign (lhs, LSHIFT_EXPR, arg0, new_arg1); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - } - /* Flavors of vector shift right. */ - case RS6000_BIF_VSRB: - case RS6000_BIF_VSRH: - case RS6000_BIF_VSRW: - case RS6000_BIF_VSRD: - { - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - lhs = gimple_call_lhs (stmt); - tree arg1_type = TREE_TYPE (arg1); - tree unsigned_arg1_type = unsigned_type_for (TREE_TYPE (arg1)); - tree unsigned_element_type = unsigned_type_for (TREE_TYPE (arg1_type)); - location_t loc = gimple_location (stmt); - gimple_seq stmts = NULL; - /* Convert arg0 to unsigned. */ - tree arg0_unsigned - = gimple_build (&stmts, VIEW_CONVERT_EXPR, - unsigned_type_for (TREE_TYPE (arg0)), arg0); - /* Force arg1 into the range valid matching the arg0 type. */ - /* Build a vector consisting of the max valid bit-size values. */ - int n_elts = VECTOR_CST_NELTS (arg1); - tree element_size = build_int_cst (unsigned_element_type, - 128 / n_elts); - tree_vector_builder elts (unsigned_arg1_type, n_elts, 1); - for (int i = 0; i < n_elts; i++) - elts.safe_push (element_size); - tree modulo_tree = elts.build (); - /* Modulo the provided shift value against that vector. */ - tree unsigned_arg1 = gimple_build (&stmts, VIEW_CONVERT_EXPR, - unsigned_arg1_type, arg1); - tree new_arg1 = gimple_build (&stmts, loc, TRUNC_MOD_EXPR, - unsigned_arg1_type, unsigned_arg1, - modulo_tree); - /* Do the shift. */ - tree res - = gimple_build (&stmts, RSHIFT_EXPR, - TREE_TYPE (arg0_unsigned), arg0_unsigned, new_arg1); - /* Convert result back to the lhs type. */ - res = gimple_build (&stmts, VIEW_CONVERT_EXPR, TREE_TYPE (lhs), res); - gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); - replace_call_with_value (gsi, res); - return true; - } - /* Vector loads. 
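A note on the vsl*/vsr* folding just above: the per-element shift count is first reduced modulo the element width (128 bits divided by the number of elements) and only then fed to a generic LSHIFT_EXPR/RSHIFT_EXPR, which preserves the hardware behaviour of the vector shift instructions. The following stand-alone sketch, in plain C and not part of the patch (the helper name vsrw_element is made up for illustration), shows the same arithmetic for one 32-bit lane:

#include <stdint.h>
#include <stdio.h>

/* Mimic what the folded gimple computes for one 32-bit element of vsrw:
   the shift amount is taken modulo the element width (32 bits here,
   i.e. 128 bits / 4 elements), then a logical shift is performed on the
   unsigned view of the element.  */
static uint32_t
vsrw_element (uint32_t x, uint32_t shift)
{
  shift %= 32;          /* The TRUNC_MOD_EXPR against the element size.  */
  return x >> shift;    /* The RSHIFT_EXPR on the unsigned value.  */
}

int
main (void)
{
  /* A shift count of 33 behaves exactly like a shift by 1.  */
  printf ("%u %u\n", vsrw_element (0x80u, 33), vsrw_element (0x80u, 1));
  return 0;
}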
*/ - case RS6000_BIF_LVX_V16QI: - case RS6000_BIF_LVX_V8HI: - case RS6000_BIF_LVX_V4SI: - case RS6000_BIF_LVX_V4SF: - case RS6000_BIF_LVX_V2DI: - case RS6000_BIF_LVX_V2DF: - case RS6000_BIF_LVX_V1TI: - { - arg0 = gimple_call_arg (stmt, 0); // offset - arg1 = gimple_call_arg (stmt, 1); // address - lhs = gimple_call_lhs (stmt); - location_t loc = gimple_location (stmt); - /* Since arg1 may be cast to a different type, just use ptr_type_node - here instead of trying to enforce TBAA on pointer types. */ - tree arg1_type = ptr_type_node; - tree lhs_type = TREE_TYPE (lhs); - /* POINTER_PLUS_EXPR wants the offset to be of type 'sizetype'. Create - the tree using the value from arg0. The resulting type will match - the type of arg1. */ - gimple_seq stmts = NULL; - tree temp_offset = gimple_convert (&stmts, loc, sizetype, arg0); - tree temp_addr = gimple_build (&stmts, loc, POINTER_PLUS_EXPR, - arg1_type, arg1, temp_offset); - /* Mask off any lower bits from the address. */ - tree aligned_addr = gimple_build (&stmts, loc, BIT_AND_EXPR, - arg1_type, temp_addr, - build_int_cst (arg1_type, -16)); - gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); - if (!is_gimple_mem_ref_addr (aligned_addr)) - { - tree t = make_ssa_name (TREE_TYPE (aligned_addr)); - gimple *g = gimple_build_assign (t, aligned_addr); - gsi_insert_before (gsi, g, GSI_SAME_STMT); - aligned_addr = t; - } - /* Use the build2 helper to set up the mem_ref. The MEM_REF could also - take an offset, but since we've already incorporated the offset - above, here we just pass in a zero. */ - gimple *g - = gimple_build_assign (lhs, build2 (MEM_REF, lhs_type, aligned_addr, - build_int_cst (arg1_type, 0))); - gimple_set_location (g, loc); - gsi_replace (gsi, g, true); - return true; - } - /* Vector stores. */ - case RS6000_BIF_STVX_V16QI: - case RS6000_BIF_STVX_V8HI: - case RS6000_BIF_STVX_V4SI: - case RS6000_BIF_STVX_V4SF: - case RS6000_BIF_STVX_V2DI: - case RS6000_BIF_STVX_V2DF: - { - arg0 = gimple_call_arg (stmt, 0); /* Value to be stored. */ - arg1 = gimple_call_arg (stmt, 1); /* Offset. */ - tree arg2 = gimple_call_arg (stmt, 2); /* Store-to address. */ - location_t loc = gimple_location (stmt); - tree arg0_type = TREE_TYPE (arg0); - /* Use ptr_type_node (no TBAA) for the arg2_type. - FIXME: (Richard) "A proper fix would be to transition this type as - seen from the frontend to GIMPLE, for example in a similar way we - do for MEM_REFs by piggy-backing that on an extra argument, a - constant zero pointer of the alias pointer type to use (which would - also serve as a type indicator of the store itself). I'd use a - target specific internal function for this (not sure if we can have - those target specific, but I guess if it's folded away then that's - fine) and get away with the overload set." */ - tree arg2_type = ptr_type_node; - /* POINTER_PLUS_EXPR wants the offset to be of type 'sizetype'. Create - the tree using the value from arg0. The resulting type will match - the type of arg2. */ - gimple_seq stmts = NULL; - tree temp_offset = gimple_convert (&stmts, loc, sizetype, arg1); - tree temp_addr = gimple_build (&stmts, loc, POINTER_PLUS_EXPR, - arg2_type, arg2, temp_offset); - /* Mask off any lower bits from the address. 
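The BIT_AND_EXPR with -16 used for the lvx/stvx folding above reflects the fact that the VMX load/store instructions ignore the low four bits of the effective address. A small illustrative sketch of that address computation in plain C follows; it is not part of the patch and the helper name vmx_effective_address is invented for the example:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* The address computation the folded MEM_REF is built from: add the byte
   offset, then clear the low four bits, because lvx/stvx ignore them.  */
static void *
vmx_effective_address (const void *base, ptrdiff_t offset)
{
  uintptr_t ea = (uintptr_t) base + (uintptr_t) offset;
  return (void *) (ea & ~(uintptr_t) 15);   /* The BIT_AND_EXPR with -16.  */
}

int
main (void)
{
  char buf[64];
  printf ("%p -> %p\n", (void *) (buf + 3), vmx_effective_address (buf, 3));
  return 0;
}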
*/ - tree aligned_addr = gimple_build (&stmts, loc, BIT_AND_EXPR, - arg2_type, temp_addr, - build_int_cst (arg2_type, -16)); - gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); - if (!is_gimple_mem_ref_addr (aligned_addr)) - { - tree t = make_ssa_name (TREE_TYPE (aligned_addr)); - gimple *g = gimple_build_assign (t, aligned_addr); - gsi_insert_before (gsi, g, GSI_SAME_STMT); - aligned_addr = t; - } - /* The desired gimple result should be similar to: - MEM[(__vector floatD.1407 *)_1] = vf1D.2697; */ - gimple *g - = gimple_build_assign (build2 (MEM_REF, arg0_type, aligned_addr, - build_int_cst (arg2_type, 0)), arg0); - gimple_set_location (g, loc); - gsi_replace (gsi, g, true); - return true; - } - - /* unaligned Vector loads. */ - case RS6000_BIF_LXVW4X_V16QI: - case RS6000_BIF_LXVW4X_V8HI: - case RS6000_BIF_LXVW4X_V4SF: - case RS6000_BIF_LXVW4X_V4SI: - case RS6000_BIF_LXVD2X_V2DF: - case RS6000_BIF_LXVD2X_V2DI: - { - arg0 = gimple_call_arg (stmt, 0); // offset - arg1 = gimple_call_arg (stmt, 1); // address - lhs = gimple_call_lhs (stmt); - location_t loc = gimple_location (stmt); - /* Since arg1 may be cast to a different type, just use ptr_type_node - here instead of trying to enforce TBAA on pointer types. */ - tree arg1_type = ptr_type_node; - tree lhs_type = TREE_TYPE (lhs); - /* In GIMPLE the type of the MEM_REF specifies the alignment. The - required alignment (power) is 4 bytes regardless of data type. */ - tree align_ltype = build_aligned_type (lhs_type, 4); - /* POINTER_PLUS_EXPR wants the offset to be of type 'sizetype'. Create - the tree using the value from arg0. The resulting type will match - the type of arg1. */ - gimple_seq stmts = NULL; - tree temp_offset = gimple_convert (&stmts, loc, sizetype, arg0); - tree temp_addr = gimple_build (&stmts, loc, POINTER_PLUS_EXPR, - arg1_type, arg1, temp_offset); - gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); - if (!is_gimple_mem_ref_addr (temp_addr)) - { - tree t = make_ssa_name (TREE_TYPE (temp_addr)); - gimple *g = gimple_build_assign (t, temp_addr); - gsi_insert_before (gsi, g, GSI_SAME_STMT); - temp_addr = t; - } - /* Use the build2 helper to set up the mem_ref. The MEM_REF could also - take an offset, but since we've already incorporated the offset - above, here we just pass in a zero. */ - gimple *g; - g = gimple_build_assign (lhs, build2 (MEM_REF, align_ltype, temp_addr, - build_int_cst (arg1_type, 0))); - gimple_set_location (g, loc); - gsi_replace (gsi, g, true); - return true; - } - - /* unaligned Vector stores. */ - case RS6000_BIF_STXVW4X_V16QI: - case RS6000_BIF_STXVW4X_V8HI: - case RS6000_BIF_STXVW4X_V4SF: - case RS6000_BIF_STXVW4X_V4SI: - case RS6000_BIF_STXVD2X_V2DF: - case RS6000_BIF_STXVD2X_V2DI: - { - arg0 = gimple_call_arg (stmt, 0); /* Value to be stored. */ - arg1 = gimple_call_arg (stmt, 1); /* Offset. */ - tree arg2 = gimple_call_arg (stmt, 2); /* Store-to address. */ - location_t loc = gimple_location (stmt); - tree arg0_type = TREE_TYPE (arg0); - /* Use ptr_type_node (no TBAA) for the arg2_type. */ - tree arg2_type = ptr_type_node; - /* In GIMPLE the type of the MEM_REF specifies the alignment. The - required alignment (power) is 4 bytes regardless of data type. */ - tree align_stype = build_aligned_type (arg0_type, 4); - /* POINTER_PLUS_EXPR wants the offset to be of type 'sizetype'. Create - the tree using the value from arg1. 
*/ - gimple_seq stmts = NULL; - tree temp_offset = gimple_convert (&stmts, loc, sizetype, arg1); - tree temp_addr = gimple_build (&stmts, loc, POINTER_PLUS_EXPR, - arg2_type, arg2, temp_offset); - gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); - if (!is_gimple_mem_ref_addr (temp_addr)) - { - tree t = make_ssa_name (TREE_TYPE (temp_addr)); - gimple *g = gimple_build_assign (t, temp_addr); - gsi_insert_before (gsi, g, GSI_SAME_STMT); - temp_addr = t; - } - gimple *g; - g = gimple_build_assign (build2 (MEM_REF, align_stype, temp_addr, - build_int_cst (arg2_type, 0)), arg0); - gimple_set_location (g, loc); - gsi_replace (gsi, g, true); - return true; - } - - /* Vector Fused multiply-add (fma). */ - case RS6000_BIF_VMADDFP: - case RS6000_BIF_XVMADDDP: - case RS6000_BIF_XVMADDSP: - case RS6000_BIF_VMLADDUHM: - { - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - tree arg2 = gimple_call_arg (stmt, 2); - lhs = gimple_call_lhs (stmt); - gcall *g = gimple_build_call_internal (IFN_FMA, 3, arg0, arg1, arg2); - gimple_call_set_lhs (g, lhs); - gimple_call_set_nothrow (g, true); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - } - - /* Vector compares; EQ, NE, GE, GT, LE. */ - case RS6000_BIF_VCMPEQUB: - case RS6000_BIF_VCMPEQUH: - case RS6000_BIF_VCMPEQUW: - case RS6000_BIF_VCMPEQUD: - /* We deliberately omit RS6000_BIF_VCMPEQUT for now, because gimple - folding produces worse code for 128-bit compares. */ - fold_compare_helper (gsi, EQ_EXPR, stmt); - return true; - - case RS6000_BIF_VCMPNEB: - case RS6000_BIF_VCMPNEH: - case RS6000_BIF_VCMPNEW: - /* We deliberately omit RS6000_BIF_VCMPNET for now, because gimple - folding produces worse code for 128-bit compares. */ - fold_compare_helper (gsi, NE_EXPR, stmt); - return true; - - case RS6000_BIF_CMPGE_16QI: - case RS6000_BIF_CMPGE_U16QI: - case RS6000_BIF_CMPGE_8HI: - case RS6000_BIF_CMPGE_U8HI: - case RS6000_BIF_CMPGE_4SI: - case RS6000_BIF_CMPGE_U4SI: - case RS6000_BIF_CMPGE_2DI: - case RS6000_BIF_CMPGE_U2DI: - /* We deliberately omit RS6000_BIF_CMPGE_1TI and RS6000_BIF_CMPGE_U1TI - for now, because gimple folding produces worse code for 128-bit - compares. */ - fold_compare_helper (gsi, GE_EXPR, stmt); - return true; - - case RS6000_BIF_VCMPGTSB: - case RS6000_BIF_VCMPGTUB: - case RS6000_BIF_VCMPGTSH: - case RS6000_BIF_VCMPGTUH: - case RS6000_BIF_VCMPGTSW: - case RS6000_BIF_VCMPGTUW: - case RS6000_BIF_VCMPGTUD: - case RS6000_BIF_VCMPGTSD: - /* We deliberately omit RS6000_BIF_VCMPGTUT and RS6000_BIF_VCMPGTST - for now, because gimple folding produces worse code for 128-bit - compares. */ - fold_compare_helper (gsi, GT_EXPR, stmt); - return true; - - case RS6000_BIF_CMPLE_16QI: - case RS6000_BIF_CMPLE_U16QI: - case RS6000_BIF_CMPLE_8HI: - case RS6000_BIF_CMPLE_U8HI: - case RS6000_BIF_CMPLE_4SI: - case RS6000_BIF_CMPLE_U4SI: - case RS6000_BIF_CMPLE_2DI: - case RS6000_BIF_CMPLE_U2DI: - /* We deliberately omit RS6000_BIF_CMPLE_1TI and RS6000_BIF_CMPLE_U1TI - for now, because gimple folding produces worse code for 128-bit - compares. */ - fold_compare_helper (gsi, LE_EXPR, stmt); - return true; - - /* flavors of vec_splat_[us]{8,16,32}. */ - case RS6000_BIF_VSPLTISB: - case RS6000_BIF_VSPLTISH: - case RS6000_BIF_VSPLTISW: - { - arg0 = gimple_call_arg (stmt, 0); - lhs = gimple_call_lhs (stmt); - - /* Only fold the vec_splat_*() if the lower bits of arg 0 is a - 5-bit signed constant in range -16 to +15. 
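The 5-bit restriction described in the comment above matches the source-level rule for the splat-immediate intrinsics, which only accept a signed literal in [-16, 15]. A hedged usage sketch, assuming a powerpc* target compiled with -maltivec (or an -mcpu level that implies it) and <altivec.h>; it is illustration only, not part of the patch:

#include <altivec.h>

vector signed char
all_minus_one (void)
{
  /* 5-bit signed immediates in [-16, 15] are accepted and can be folded
     to a constant vector; anything outside that range is rejected.  */
  return vec_splat_s8 (-1);
}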
*/ - if (TREE_CODE (arg0) != INTEGER_CST - || !IN_RANGE (TREE_INT_CST_LOW (arg0), -16, 15)) - return false; - gimple_seq stmts = NULL; - location_t loc = gimple_location (stmt); - tree splat_value = gimple_convert (&stmts, loc, - TREE_TYPE (TREE_TYPE (lhs)), arg0); - gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); - tree splat_tree = build_vector_from_val (TREE_TYPE (lhs), splat_value); - g = gimple_build_assign (lhs, splat_tree); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - } - - /* Flavors of vec_splat. */ - /* a = vec_splat (b, 0x3) becomes a = { b[3],b[3],b[3],...}; */ - case RS6000_BIF_VSPLTB: - case RS6000_BIF_VSPLTH: - case RS6000_BIF_VSPLTW: - case RS6000_BIF_XXSPLTD_V2DI: - case RS6000_BIF_XXSPLTD_V2DF: - { - arg0 = gimple_call_arg (stmt, 0); /* input vector. */ - arg1 = gimple_call_arg (stmt, 1); /* index into arg0. */ - /* Only fold the vec_splat_*() if arg1 is both a constant value and - is a valid index into the arg0 vector. */ - unsigned int n_elts = VECTOR_CST_NELTS (arg0); - if (TREE_CODE (arg1) != INTEGER_CST - || TREE_INT_CST_LOW (arg1) > (n_elts -1)) - return false; - lhs = gimple_call_lhs (stmt); - tree lhs_type = TREE_TYPE (lhs); - tree arg0_type = TREE_TYPE (arg0); - tree splat; - if (TREE_CODE (arg0) == VECTOR_CST) - splat = VECTOR_CST_ELT (arg0, TREE_INT_CST_LOW (arg1)); - else - { - /* Determine (in bits) the length and start location of the - splat value for a call to the tree_vec_extract helper. */ - int splat_elem_size = TREE_INT_CST_LOW (size_in_bytes (arg0_type)) - * BITS_PER_UNIT / n_elts; - int splat_start_bit = TREE_INT_CST_LOW (arg1) * splat_elem_size; - tree len = build_int_cst (bitsizetype, splat_elem_size); - tree start = build_int_cst (bitsizetype, splat_start_bit); - splat = tree_vec_extract (gsi, TREE_TYPE (lhs_type), arg0, - len, start); - } - /* And finally, build the new vector. */ - tree splat_tree = build_vector_from_val (lhs_type, splat); - g = gimple_build_assign (lhs, splat_tree); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - } - - /* vec_mergel (integrals). */ - case RS6000_BIF_VMRGLH: - case RS6000_BIF_VMRGLW: - case RS6000_BIF_XXMRGLW_4SI: - case RS6000_BIF_VMRGLB: - case RS6000_BIF_VEC_MERGEL_V2DI: - case RS6000_BIF_XXMRGLW_4SF: - case RS6000_BIF_VEC_MERGEL_V2DF: - fold_mergehl_helper (gsi, stmt, 1); - return true; - /* vec_mergeh (integrals). */ - case RS6000_BIF_VMRGHH: - case RS6000_BIF_VMRGHW: - case RS6000_BIF_XXMRGHW_4SI: - case RS6000_BIF_VMRGHB: - case RS6000_BIF_VEC_MERGEH_V2DI: - case RS6000_BIF_XXMRGHW_4SF: - case RS6000_BIF_VEC_MERGEH_V2DF: - fold_mergehl_helper (gsi, stmt, 0); - return true; - - /* Flavors of vec_mergee. */ - case RS6000_BIF_VMRGEW_V4SI: - case RS6000_BIF_VMRGEW_V2DI: - case RS6000_BIF_VMRGEW_V4SF: - case RS6000_BIF_VMRGEW_V2DF: - fold_mergeeo_helper (gsi, stmt, 0); - return true; - /* Flavors of vec_mergeo. 
*/ - case RS6000_BIF_VMRGOW_V4SI: - case RS6000_BIF_VMRGOW_V2DI: - case RS6000_BIF_VMRGOW_V4SF: - case RS6000_BIF_VMRGOW_V2DF: - fold_mergeeo_helper (gsi, stmt, 1); - return true; - - /* d = vec_pack (a, b) */ - case RS6000_BIF_VPKUDUM: - case RS6000_BIF_VPKUHUM: - case RS6000_BIF_VPKUWUM: - { - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - lhs = gimple_call_lhs (stmt); - gimple *g = gimple_build_assign (lhs, VEC_PACK_TRUNC_EXPR, arg0, arg1); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - } - - /* d = vec_unpackh (a) */ - /* Note that the UNPACK_{HI,LO}_EXPR used in the gimple_build_assign call - in this code is sensitive to endian-ness, and needs to be inverted to - handle both LE and BE targets. */ - case RS6000_BIF_VUPKHSB: - case RS6000_BIF_VUPKHSH: - case RS6000_BIF_VUPKHSW: - { - arg0 = gimple_call_arg (stmt, 0); - lhs = gimple_call_lhs (stmt); - if (BYTES_BIG_ENDIAN) - g = gimple_build_assign (lhs, VEC_UNPACK_HI_EXPR, arg0); - else - g = gimple_build_assign (lhs, VEC_UNPACK_LO_EXPR, arg0); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - } - /* d = vec_unpackl (a) */ - case RS6000_BIF_VUPKLSB: - case RS6000_BIF_VUPKLSH: - case RS6000_BIF_VUPKLSW: - { - arg0 = gimple_call_arg (stmt, 0); - lhs = gimple_call_lhs (stmt); - if (BYTES_BIG_ENDIAN) - g = gimple_build_assign (lhs, VEC_UNPACK_LO_EXPR, arg0); - else - g = gimple_build_assign (lhs, VEC_UNPACK_HI_EXPR, arg0); - gimple_set_location (g, gimple_location (stmt)); - gsi_replace (gsi, g, true); - return true; - } - /* There is no gimple type corresponding with pixel, so just return. */ - case RS6000_BIF_VUPKHPX: - case RS6000_BIF_VUPKLPX: - return false; - - /* vec_perm. */ - case RS6000_BIF_VPERM_16QI: - case RS6000_BIF_VPERM_8HI: - case RS6000_BIF_VPERM_4SI: - case RS6000_BIF_VPERM_2DI: - case RS6000_BIF_VPERM_4SF: - case RS6000_BIF_VPERM_2DF: - case RS6000_BIF_VPERM_16QI_UNS: - case RS6000_BIF_VPERM_8HI_UNS: - case RS6000_BIF_VPERM_4SI_UNS: - case RS6000_BIF_VPERM_2DI_UNS: - { - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - tree permute = gimple_call_arg (stmt, 2); - lhs = gimple_call_lhs (stmt); - location_t loc = gimple_location (stmt); - gimple_seq stmts = NULL; - // convert arg0 and arg1 to match the type of the permute - // for the VEC_PERM_EXPR operation. - tree permute_type = (TREE_TYPE (permute)); - tree arg0_ptype = gimple_build (&stmts, loc, VIEW_CONVERT_EXPR, - permute_type, arg0); - tree arg1_ptype = gimple_build (&stmts, loc, VIEW_CONVERT_EXPR, - permute_type, arg1); - tree lhs_ptype = gimple_build (&stmts, loc, VEC_PERM_EXPR, - permute_type, arg0_ptype, arg1_ptype, - permute); - // Convert the result back to the desired lhs type upon completion. - tree temp = gimple_build (&stmts, loc, VIEW_CONVERT_EXPR, - TREE_TYPE (lhs), lhs_ptype); - gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); - g = gimple_build_assign (lhs, temp); - gimple_set_location (g, loc); - gsi_replace (gsi, g, true); - return true; - } - - default: - if (TARGET_DEBUG_BUILTIN) - fprintf (stderr, "gimple builtin intrinsic not matched:%d %s %s\n", - fn_code, fn_name1, fn_name2); - break; - } - - return false; -} - -/* Expand ALTIVEC_BUILTIN_MASK_FOR_LOAD. */ -rtx -rs6000_expand_ldst_mask (rtx target, tree arg0) -{ - int icode2 = BYTES_BIG_ENDIAN ? 
(int) CODE_FOR_altivec_lvsr_direct - : (int) CODE_FOR_altivec_lvsl_direct; - machine_mode tmode = insn_data[icode2].operand[0].mode; - machine_mode mode = insn_data[icode2].operand[1].mode; - - gcc_assert (TARGET_ALTIVEC); - - gcc_assert (POINTER_TYPE_P (TREE_TYPE (arg0))); - rtx op = expand_expr (arg0, NULL_RTX, Pmode, EXPAND_NORMAL); - rtx addr = memory_address (mode, op); - /* We need to negate the address. */ - op = gen_reg_rtx (GET_MODE (addr)); - emit_insn (gen_rtx_SET (op, gen_rtx_NEG (GET_MODE (addr), addr))); - op = gen_rtx_MEM (mode, op); - - if (target == 0 - || GET_MODE (target) != tmode - || !insn_data[icode2].operand[0].predicate (target, tmode)) - target = gen_reg_rtx (tmode); - - rtx pat = GEN_FCN (icode2) (target, op); - if (!pat) - return 0; - emit_insn (pat); - - return target; -} - -/* Expand the CPU builtin in FCODE and store the result in TARGET. */ -static rtx -cpu_expand_builtin (enum rs6000_gen_builtins fcode, - tree exp ATTRIBUTE_UNUSED, rtx target) -{ - /* __builtin_cpu_init () is a nop, so expand to nothing. */ - if (fcode == RS6000_BIF_CPU_INIT) - return const0_rtx; - - if (target == 0 || GET_MODE (target) != SImode) - target = gen_reg_rtx (SImode); - - /* TODO: Factor the #ifdef'd code into a separate function. */ -#ifdef TARGET_LIBC_PROVIDES_HWCAP_IN_TCB - tree arg = TREE_OPERAND (CALL_EXPR_ARG (exp, 0), 0); - /* Target clones creates an ARRAY_REF instead of STRING_CST, convert it back - to a STRING_CST. */ - if (TREE_CODE (arg) == ARRAY_REF - && TREE_CODE (TREE_OPERAND (arg, 0)) == STRING_CST - && TREE_CODE (TREE_OPERAND (arg, 1)) == INTEGER_CST - && compare_tree_int (TREE_OPERAND (arg, 1), 0) == 0) - arg = TREE_OPERAND (arg, 0); - - if (TREE_CODE (arg) != STRING_CST) - { - error ("builtin %qs only accepts a string argument", - rs6000_builtin_info[(size_t) fcode].bifname); - return const0_rtx; - } - - if (fcode == RS6000_BIF_CPU_IS) - { - const char *cpu = TREE_STRING_POINTER (arg); - rtx cpuid = NULL_RTX; - for (size_t i = 0; i < ARRAY_SIZE (cpu_is_info); i++) - if (strcmp (cpu, cpu_is_info[i].cpu) == 0) - { - /* The CPUID value in the TCB is offset by _DL_FIRST_PLATFORM. */ - cpuid = GEN_INT (cpu_is_info[i].cpuid + _DL_FIRST_PLATFORM); - break; - } - if (cpuid == NULL_RTX) - { - /* Invalid CPU argument. */ - error ("cpu %qs is an invalid argument to builtin %qs", - cpu, rs6000_builtin_info[(size_t) fcode].bifname); - return const0_rtx; - } - - rtx platform = gen_reg_rtx (SImode); - rtx address = gen_rtx_PLUS (Pmode, - gen_rtx_REG (Pmode, TLS_REGNUM), - GEN_INT (TCB_PLATFORM_OFFSET)); - rtx tcbmem = gen_const_mem (SImode, address); - emit_move_insn (platform, tcbmem); - emit_insn (gen_eqsi3 (target, platform, cpuid)); - } - else if (fcode == RS6000_BIF_CPU_SUPPORTS) - { - const char *hwcap = TREE_STRING_POINTER (arg); - rtx mask = NULL_RTX; - int hwcap_offset; - for (size_t i = 0; i < ARRAY_SIZE (cpu_supports_info); i++) - if (strcmp (hwcap, cpu_supports_info[i].hwcap) == 0) - { - mask = GEN_INT (cpu_supports_info[i].mask); - hwcap_offset = TCB_HWCAP_OFFSET (cpu_supports_info[i].id); - break; - } - if (mask == NULL_RTX) - { - /* Invalid HWCAP argument. 
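For context, these are the user-visible built-ins that cpu_expand_builtin implements; as the fallback warning further below notes, they depend on glibc 2.23 or newer exporting the hardware capability and platform bits in the TCB. A usage sketch, assuming a powerpc*-linux target with a suitable glibc:

#include <stdio.h>

int
main (void)
{
  /* On powerpc this is a no-op (see the CPU_INIT handling above), but it
     keeps the code portable to targets where initialization is needed.  */
  __builtin_cpu_init ();

  if (__builtin_cpu_is ("power9"))
    puts ("running on a POWER9");

  if (__builtin_cpu_supports ("vsx"))
    puts ("VSX is available");

  return 0;
}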
*/ - error ("%s %qs is an invalid argument to builtin %qs", - "hwcap", hwcap, - rs6000_builtin_info[(size_t) fcode].bifname); - return const0_rtx; - } - - rtx tcb_hwcap = gen_reg_rtx (SImode); - rtx address = gen_rtx_PLUS (Pmode, - gen_rtx_REG (Pmode, TLS_REGNUM), - GEN_INT (hwcap_offset)); - rtx tcbmem = gen_const_mem (SImode, address); - emit_move_insn (tcb_hwcap, tcbmem); - rtx scratch1 = gen_reg_rtx (SImode); - emit_insn (gen_rtx_SET (scratch1, - gen_rtx_AND (SImode, tcb_hwcap, mask))); - rtx scratch2 = gen_reg_rtx (SImode); - emit_insn (gen_eqsi3 (scratch2, scratch1, const0_rtx)); - emit_insn (gen_rtx_SET (target, - gen_rtx_XOR (SImode, scratch2, const1_rtx))); - } - else - gcc_unreachable (); - - /* Record that we have expanded a CPU builtin, so that we can later - emit a reference to the special symbol exported by LIBC to ensure we - do not link against an old LIBC that doesn't support this feature. */ - cpu_builtin_p = true; - -#else - warning (0, "builtin %qs needs GLIBC (2.23 and newer) that exports hardware " - "capability bits", rs6000_builtin_info[(size_t) fcode].bifname); - - /* For old LIBCs, always return FALSE. */ - emit_move_insn (target, GEN_INT (0)); -#endif /* TARGET_LIBC_PROVIDES_HWCAP_IN_TCB */ - - return target; -} - -/* For the element-reversing load/store built-ins, produce the correct - insn_code depending on the target endianness. */ -static insn_code -elemrev_icode (rs6000_gen_builtins fcode) -{ - switch (fcode) - { - case RS6000_BIF_ST_ELEMREV_V1TI: - return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_store_v1ti - : CODE_FOR_vsx_st_elemrev_v1ti; - - case RS6000_BIF_ST_ELEMREV_V2DF: - return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_store_v2df - : CODE_FOR_vsx_st_elemrev_v2df; - - case RS6000_BIF_ST_ELEMREV_V2DI: - return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_store_v2di - : CODE_FOR_vsx_st_elemrev_v2di; - - case RS6000_BIF_ST_ELEMREV_V4SF: - return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_store_v4sf - : CODE_FOR_vsx_st_elemrev_v4sf; - - case RS6000_BIF_ST_ELEMREV_V4SI: - return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_store_v4si - : CODE_FOR_vsx_st_elemrev_v4si; - - case RS6000_BIF_ST_ELEMREV_V8HI: - return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_store_v8hi - : CODE_FOR_vsx_st_elemrev_v8hi; - - case RS6000_BIF_ST_ELEMREV_V16QI: - return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_store_v16qi - : CODE_FOR_vsx_st_elemrev_v16qi; - - case RS6000_BIF_LD_ELEMREV_V2DF: - return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_load_v2df - : CODE_FOR_vsx_ld_elemrev_v2df; - - case RS6000_BIF_LD_ELEMREV_V1TI: - return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_load_v1ti - : CODE_FOR_vsx_ld_elemrev_v1ti; - - case RS6000_BIF_LD_ELEMREV_V2DI: - return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_load_v2di - : CODE_FOR_vsx_ld_elemrev_v2di; - - case RS6000_BIF_LD_ELEMREV_V4SF: - return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_load_v4sf - : CODE_FOR_vsx_ld_elemrev_v4sf; - - case RS6000_BIF_LD_ELEMREV_V4SI: - return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_load_v4si - : CODE_FOR_vsx_ld_elemrev_v4si; - - case RS6000_BIF_LD_ELEMREV_V8HI: - return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_load_v8hi - : CODE_FOR_vsx_ld_elemrev_v8hi; - - case RS6000_BIF_LD_ELEMREV_V16QI: - return BYTES_BIG_ENDIAN ? CODE_FOR_vsx_load_v16qi - : CODE_FOR_vsx_ld_elemrev_v16qi; - default: - ; - } - - gcc_unreachable (); -} - -/* Expand an AltiVec vector load builtin, and return the expanded rtx. 
*/ -static rtx -ldv_expand_builtin (rtx target, insn_code icode, rtx *op, machine_mode tmode) -{ - if (target == 0 - || GET_MODE (target) != tmode - || !insn_data[icode].operand[0].predicate (target, tmode)) - target = gen_reg_rtx (tmode); - - op[1] = copy_to_mode_reg (Pmode, op[1]); - - /* These CELL built-ins use BLKmode instead of tmode for historical - (i.e., unknown) reasons. TODO: Is this necessary? */ - bool blk = (icode == CODE_FOR_altivec_lvlx - || icode == CODE_FOR_altivec_lvlxl - || icode == CODE_FOR_altivec_lvrx - || icode == CODE_FOR_altivec_lvrxl); - - /* For LVX, express the RTL accurately by ANDing the address with -16. - LVXL and LVE*X expand to use UNSPECs to hide their special behavior, - so the raw address is fine. */ - /* TODO: That statement seems wrong, as the UNSPECs don't surround the - memory expression, so a latent bug may lie here. The &-16 is likely - needed for all VMX-style loads. */ - if (icode == CODE_FOR_altivec_lvx_v1ti - || icode == CODE_FOR_altivec_lvx_v2df - || icode == CODE_FOR_altivec_lvx_v2di - || icode == CODE_FOR_altivec_lvx_v4sf - || icode == CODE_FOR_altivec_lvx_v4si - || icode == CODE_FOR_altivec_lvx_v8hi - || icode == CODE_FOR_altivec_lvx_v16qi) - { - rtx rawaddr; - if (op[0] == const0_rtx) - rawaddr = op[1]; - else - { - op[0] = copy_to_mode_reg (Pmode, op[0]); - rawaddr = gen_rtx_PLUS (Pmode, op[1], op[0]); - } - rtx addr = gen_rtx_AND (Pmode, rawaddr, gen_rtx_CONST_INT (Pmode, -16)); - addr = gen_rtx_MEM (blk ? BLKmode : tmode, addr); - - emit_insn (gen_rtx_SET (target, addr)); - } - else - { - rtx addr; - if (op[0] == const0_rtx) - addr = gen_rtx_MEM (blk ? BLKmode : tmode, op[1]); - else - { - op[0] = copy_to_mode_reg (Pmode, op[0]); - addr = gen_rtx_MEM (blk ? BLKmode : tmode, - gen_rtx_PLUS (Pmode, op[1], op[0])); - } - - rtx pat = GEN_FCN (icode) (target, addr); - if (!pat) - return 0; - emit_insn (pat); - } - - return target; -} - -/* Expand a builtin function that loads a scalar into a vector register - with sign extension, and return the expanded rtx. */ -static rtx -lxvrse_expand_builtin (rtx target, insn_code icode, rtx *op, - machine_mode tmode, machine_mode smode) -{ - rtx pat, addr; - op[1] = copy_to_mode_reg (Pmode, op[1]); - - if (op[0] == const0_rtx) - addr = gen_rtx_MEM (tmode, op[1]); - else - { - op[0] = copy_to_mode_reg (Pmode, op[0]); - addr = gen_rtx_MEM (smode, - gen_rtx_PLUS (Pmode, op[1], op[0])); - } - - rtx discratch = gen_reg_rtx (V2DImode); - rtx tiscratch = gen_reg_rtx (TImode); - - /* Emit the lxvr*x insn. */ - pat = GEN_FCN (icode) (tiscratch, addr); - if (!pat) - return 0; - emit_insn (pat); - - /* Emit a sign extension from V16QI,V8HI,V4SI to V2DI. */ - rtx temp1; - if (icode == CODE_FOR_vsx_lxvrbx) - { - temp1 = simplify_gen_subreg (V16QImode, tiscratch, TImode, 0); - emit_insn (gen_vsx_sign_extend_qi_v2di (discratch, temp1)); - } - else if (icode == CODE_FOR_vsx_lxvrhx) - { - temp1 = simplify_gen_subreg (V8HImode, tiscratch, TImode, 0); - emit_insn (gen_vsx_sign_extend_hi_v2di (discratch, temp1)); - } - else if (icode == CODE_FOR_vsx_lxvrwx) - { - temp1 = simplify_gen_subreg (V4SImode, tiscratch, TImode, 0); - emit_insn (gen_vsx_sign_extend_si_v2di (discratch, temp1)); - } - else if (icode == CODE_FOR_vsx_lxvrdx) - discratch = simplify_gen_subreg (V2DImode, tiscratch, TImode, 0); - else - gcc_unreachable (); - - /* Emit the sign extension from V2DI (double) to TI (quad). 
*/ - rtx temp2 = simplify_gen_subreg (TImode, discratch, V2DImode, 0); - emit_insn (gen_extendditi2_vector (target, temp2)); - - return target; -} - -/* Expand a builtin function that loads a scalar into a vector register - with zero extension, and return the expanded rtx. */ -static rtx -lxvrze_expand_builtin (rtx target, insn_code icode, rtx *op, - machine_mode tmode, machine_mode smode) -{ - rtx pat, addr; - op[1] = copy_to_mode_reg (Pmode, op[1]); - - if (op[0] == const0_rtx) - addr = gen_rtx_MEM (tmode, op[1]); - else - { - op[0] = copy_to_mode_reg (Pmode, op[0]); - addr = gen_rtx_MEM (smode, - gen_rtx_PLUS (Pmode, op[1], op[0])); - } - - pat = GEN_FCN (icode) (target, addr); - if (!pat) - return 0; - emit_insn (pat); - return target; -} - -/* Expand an AltiVec vector store builtin, and return the expanded rtx. */ -static rtx -stv_expand_builtin (insn_code icode, rtx *op, - machine_mode tmode, machine_mode smode) -{ - op[2] = copy_to_mode_reg (Pmode, op[2]); - - /* For STVX, express the RTL accurately by ANDing the address with -16. - STVXL and STVE*X expand to use UNSPECs to hide their special behavior, - so the raw address is fine. */ - /* TODO: That statement seems wrong, as the UNSPECs don't surround the - memory expression, so a latent bug may lie here. The &-16 is likely - needed for all VMX-style stores. */ - if (icode == CODE_FOR_altivec_stvx_v2df - || icode == CODE_FOR_altivec_stvx_v2di - || icode == CODE_FOR_altivec_stvx_v4sf - || icode == CODE_FOR_altivec_stvx_v4si - || icode == CODE_FOR_altivec_stvx_v8hi - || icode == CODE_FOR_altivec_stvx_v16qi) - { - rtx rawaddr; - if (op[1] == const0_rtx) - rawaddr = op[2]; - else - { - op[1] = copy_to_mode_reg (Pmode, op[1]); - rawaddr = gen_rtx_PLUS (Pmode, op[2], op[1]); - } - - rtx addr = gen_rtx_AND (Pmode, rawaddr, gen_rtx_CONST_INT (Pmode, -16)); - addr = gen_rtx_MEM (tmode, addr); - op[0] = copy_to_mode_reg (tmode, op[0]); - emit_insn (gen_rtx_SET (addr, op[0])); - } - else if (icode == CODE_FOR_vsx_stxvrbx - || icode == CODE_FOR_vsx_stxvrhx - || icode == CODE_FOR_vsx_stxvrwx - || icode == CODE_FOR_vsx_stxvrdx) - { - rtx truncrtx = gen_rtx_TRUNCATE (tmode, op[0]); - op[0] = copy_to_mode_reg (E_TImode, truncrtx); - - rtx addr; - if (op[1] == const0_rtx) - addr = gen_rtx_MEM (Pmode, op[2]); - else - { - op[1] = copy_to_mode_reg (Pmode, op[1]); - addr = gen_rtx_MEM (tmode, gen_rtx_PLUS (Pmode, op[2], op[1])); - } - rtx pat = GEN_FCN (icode) (addr, op[0]); - if (pat) - emit_insn (pat); - } - else - { - if (!insn_data[icode].operand[1].predicate (op[0], smode)) - op[0] = copy_to_mode_reg (smode, op[0]); - - rtx addr; - if (op[1] == const0_rtx) - addr = gen_rtx_MEM (tmode, op[2]); - else - { - op[1] = copy_to_mode_reg (Pmode, op[1]); - addr = gen_rtx_MEM (tmode, gen_rtx_PLUS (Pmode, op[2], op[1])); - } - - rtx pat = GEN_FCN (icode) (addr, op[0]); - if (pat) - emit_insn (pat); - } - - return NULL_RTX; -} - -/* Expand the MMA built-in in EXP, and return it. 
*/ -static rtx -mma_expand_builtin (tree exp, rtx target, insn_code icode, - rs6000_gen_builtins fcode) -{ - tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0); - bool void_func = TREE_TYPE (TREE_TYPE (fndecl)) == void_type_node; - machine_mode tmode = VOIDmode; - rtx op[MAX_MMA_OPERANDS]; - unsigned nopnds = 0; - - if (!void_func) - { - tmode = insn_data[icode].operand[0].mode; - if (!(target - && GET_MODE (target) == tmode - && insn_data[icode].operand[0].predicate (target, tmode))) - target = gen_reg_rtx (tmode); - op[nopnds++] = target; - } - else - target = const0_rtx; - - call_expr_arg_iterator iter; - tree arg; - FOR_EACH_CALL_EXPR_ARG (arg, iter, exp) - { - if (arg == error_mark_node) - return const0_rtx; - - rtx opnd; - const struct insn_operand_data *insn_op; - insn_op = &insn_data[icode].operand[nopnds]; - if (TREE_CODE (arg) == ADDR_EXPR - && MEM_P (DECL_RTL (TREE_OPERAND (arg, 0)))) - opnd = DECL_RTL (TREE_OPERAND (arg, 0)); - else - opnd = expand_normal (arg); - - if (!insn_op->predicate (opnd, insn_op->mode)) - { - /* TODO: This use of constraints needs explanation. */ - if (!strcmp (insn_op->constraint, "n")) - { - if (!CONST_INT_P (opnd)) - error ("argument %d must be an unsigned literal", nopnds); - else - error ("argument %d is an unsigned literal that is " - "out of range", nopnds); - return const0_rtx; - } - opnd = copy_to_mode_reg (insn_op->mode, opnd); - } - - /* Some MMA instructions have INOUT accumulator operands, so force - their target register to be the same as their input register. */ - if (!void_func - && nopnds == 1 - && !strcmp (insn_op->constraint, "0") - && insn_op->mode == tmode - && REG_P (opnd) - && insn_data[icode].operand[0].predicate (opnd, tmode)) - target = op[0] = opnd; - - op[nopnds++] = opnd; - } - - rtx pat; - switch (nopnds) - { - case 1: - pat = GEN_FCN (icode) (op[0]); - break; - case 2: - pat = GEN_FCN (icode) (op[0], op[1]); - break; - case 3: - /* The ASSEMBLE builtin source operands are reversed in little-endian - mode, so reorder them. */ - if (fcode == RS6000_BIF_ASSEMBLE_PAIR_V_INTERNAL && !WORDS_BIG_ENDIAN) - std::swap (op[1], op[2]); - pat = GEN_FCN (icode) (op[0], op[1], op[2]); - break; - case 4: - pat = GEN_FCN (icode) (op[0], op[1], op[2], op[3]); - break; - case 5: - /* The ASSEMBLE builtin source operands are reversed in little-endian - mode, so reorder them. */ - if (fcode == RS6000_BIF_ASSEMBLE_ACC_INTERNAL && !WORDS_BIG_ENDIAN) - { - std::swap (op[1], op[4]); - std::swap (op[2], op[3]); - } - pat = GEN_FCN (icode) (op[0], op[1], op[2], op[3], op[4]); - break; - case 6: - pat = GEN_FCN (icode) (op[0], op[1], op[2], op[3], op[4], op[5]); - break; - case 7: - pat = GEN_FCN (icode) (op[0], op[1], op[2], op[3], op[4], op[5], op[6]); - break; - default: - gcc_unreachable (); - } - - if (!pat) - return NULL_RTX; - - emit_insn (pat); - return target; -} - -/* Return the appropriate SPR number associated with the given builtin. */ -static inline HOST_WIDE_INT -htm_spr_num (enum rs6000_gen_builtins code) -{ - if (code == RS6000_BIF_GET_TFHAR - || code == RS6000_BIF_SET_TFHAR) - return TFHAR_SPR; - else if (code == RS6000_BIF_GET_TFIAR - || code == RS6000_BIF_SET_TFIAR) - return TFIAR_SPR; - else if (code == RS6000_BIF_GET_TEXASR - || code == RS6000_BIF_SET_TEXASR) - return TEXASR_SPR; - gcc_assert (code == RS6000_BIF_GET_TEXASRU - || code == RS6000_BIF_SET_TEXASRU); - return TEXASRU_SPR; -} - -/* Expand the HTM builtin in EXP and store the result in TARGET. - Return the expanded rtx. 
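The little-endian operand reordering for the ASSEMBLE built-ins handled above is invisible at the source level, where the MMA built-ins are used roughly as in the sketch below. The specific names shown (__builtin_mma_xxsetaccz, __builtin_mma_xvf32gerpp, __builtin_mma_disassemble_acc) follow the documented GCC MMA interface and assume a target built with -mcpu=power10 or -mmma; treat this as an illustration rather than part of the patch:

#include <altivec.h>

typedef vector unsigned char vec_t;

void
accumulate (vec_t a, vec_t b, vec_t result[4])
{
  __vector_quad acc;

  __builtin_mma_xxsetaccz (&acc);           /* Zero the accumulator.  */
  __builtin_mma_xvf32gerpp (&acc, a, b);    /* acc += a * b (fp32 GER).  */
  __builtin_mma_disassemble_acc (result, &acc);   /* Copy out four vectors.  */
}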
*/ -static rtx -htm_expand_builtin (bifdata *bifaddr, rs6000_gen_builtins fcode, - tree exp, rtx target) -{ - if (!TARGET_POWERPC64 - && (fcode == RS6000_BIF_TABORTDC - || fcode == RS6000_BIF_TABORTDCI)) - { - error ("builtin %qs is only valid in 64-bit mode", bifaddr->bifname); - return const0_rtx; - } - - tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0); - bool nonvoid = TREE_TYPE (TREE_TYPE (fndecl)) != void_type_node; - bool uses_spr = bif_is_htmspr (*bifaddr); - insn_code icode = bifaddr->icode; - - if (uses_spr) - icode = rs6000_htm_spr_icode (nonvoid); - - rtx op[MAX_HTM_OPERANDS]; - int nopnds = 0; - const insn_operand_data *insn_op = &insn_data[icode].operand[0]; - - if (nonvoid) - { - machine_mode tmode = (uses_spr) ? insn_op->mode : E_SImode; - if (!target - || GET_MODE (target) != tmode - || (uses_spr && !insn_op->predicate (target, tmode))) - target = gen_reg_rtx (tmode); - if (uses_spr) - op[nopnds++] = target; - } - - tree arg; - call_expr_arg_iterator iter; - - FOR_EACH_CALL_EXPR_ARG (arg, iter, exp) - { - if (arg == error_mark_node || nopnds >= MAX_HTM_OPERANDS) - return const0_rtx; - - insn_op = &insn_data[icode].operand[nopnds]; - op[nopnds] = expand_normal (arg); - - if (!insn_op->predicate (op[nopnds], insn_op->mode)) - { - /* TODO: This use of constraints could use explanation. - This happens a couple of places, perhaps make that a - function to document what's happening. */ - if (!strcmp (insn_op->constraint, "n")) - { - int arg_num = nonvoid ? nopnds : nopnds + 1; - if (!CONST_INT_P (op[nopnds])) - error ("argument %d must be an unsigned literal", arg_num); - else - error ("argument %d is an unsigned literal that is " - "out of range", arg_num); - return const0_rtx; - } - op[nopnds] = copy_to_mode_reg (insn_op->mode, op[nopnds]); - } - - nopnds++; - } - - /* Handle the builtins for extended mnemonics. These accept - no arguments, but map to builtins that take arguments. */ - switch (fcode) - { - case RS6000_BIF_TENDALL: /* Alias for: tend. 1 */ - case RS6000_BIF_TRESUME: /* Alias for: tsr. 1 */ - op[nopnds++] = GEN_INT (1); - break; - case RS6000_BIF_TSUSPEND: /* Alias for: tsr. 0 */ - op[nopnds++] = GEN_INT (0); - break; - default: - break; - } - - /* If this builtin accesses SPRs, then pass in the appropriate - SPR number and SPR regno as the last two operands. */ - rtx cr = NULL_RTX; - if (uses_spr) - { - machine_mode mode = TARGET_POWERPC64 ? DImode : SImode; - op[nopnds++] = gen_rtx_CONST_INT (mode, htm_spr_num (fcode)); - } - /* If this builtin accesses a CR field, then pass in a scratch - CR field as the last operand. */ - else if (bif_is_htmcr (*bifaddr)) - { - cr = gen_reg_rtx (CCmode); - op[nopnds++] = cr; - } - - rtx pat; - switch (nopnds) - { - case 1: - pat = GEN_FCN (icode) (op[0]); - break; - case 2: - pat = GEN_FCN (icode) (op[0], op[1]); - break; - case 3: - pat = GEN_FCN (icode) (op[0], op[1], op[2]); - break; - case 4: - pat = GEN_FCN (icode) (op[0], op[1], op[2], op[3]); - break; - default: - gcc_unreachable (); - } - if (!pat) - return NULL_RTX; - emit_insn (pat); - - if (bif_is_htmcr (*bifaddr)) - { - if (fcode == RS6000_BIF_TBEGIN) - { - /* Emit code to set TARGET to true or false depending on - whether the tbegin. instruction succeeded or failed - to start a transaction. We do this by placing the 1's - complement of CR's EQ bit into TARGET. 
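The CR0 handling described in the comment above is what makes __builtin_tbegin usable as a boolean at the source level: the complemented EQ bit means the built-in returns nonzero when the transaction actually starts. A hedged usage sketch, assuming -mhtm on a POWER8 or later target with transactional memory enabled; a real caller would also provide a non-transactional fallback path:

long counter;

void
transactional_increment (void)
{
  if (__builtin_tbegin (0))
    {
      /* Transactional state: tbegin. succeeded, so the built-in returned
         nonzero (the complemented CR0 EQ bit computed above).  */
      counter++;
      __builtin_tend (0);
    }
  else
    {
      /* Transaction failed to start or was aborted; retry or fall back.  */
    }
}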
*/ - rtx scratch = gen_reg_rtx (SImode); - emit_insn (gen_rtx_SET (scratch, - gen_rtx_EQ (SImode, cr, - const0_rtx))); - emit_insn (gen_rtx_SET (target, - gen_rtx_XOR (SImode, scratch, - GEN_INT (1)))); - } - else - { - /* Emit code to copy the 4-bit condition register field - CR into the least significant end of register TARGET. */ - rtx scratch1 = gen_reg_rtx (SImode); - rtx scratch2 = gen_reg_rtx (SImode); - rtx subreg = simplify_gen_subreg (CCmode, scratch1, SImode, 0); - emit_insn (gen_movcc (subreg, cr)); - emit_insn (gen_lshrsi3 (scratch2, scratch1, GEN_INT (28))); - emit_insn (gen_andsi3 (target, scratch2, GEN_INT (0xf))); - } - } - - if (nonvoid) - return target; - return const0_rtx; -} - -/* Expand an expression EXP that calls a built-in function, - with result going to TARGET if that's convenient - (and in mode MODE if that's convenient). - SUBTARGET may be used as the target for computing one of EXP's operands. - IGNORE is nonzero if the value is to be ignored. - Use the new builtin infrastructure. */ -rtx -rs6000_expand_builtin (tree exp, rtx target, rtx /* subtarget */, - machine_mode /* mode */, int ignore) -{ - tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0); - enum rs6000_gen_builtins fcode - = (enum rs6000_gen_builtins) DECL_MD_FUNCTION_CODE (fndecl); - size_t uns_fcode = (size_t)fcode; - enum insn_code icode = rs6000_builtin_info[uns_fcode].icode; - - /* TODO: The following commentary and code is inherited from the original - builtin processing code. The commentary is a bit confusing, with the - intent being that KFmode is always IEEE-128, IFmode is always IBM - double-double, and TFmode is the current long double. The code is - confusing in that it converts from KFmode to TFmode pattern names, - when the other direction is more intuitive. Try to address this. */ - - /* We have two different modes (KFmode, TFmode) that are the IEEE - 128-bit floating point type, depending on whether long double is the - IBM extended double (KFmode) or long double is IEEE 128-bit (TFmode). - It is simpler if we only define one variant of the built-in function, - and switch the code when defining it, rather than defining two built- - ins and using the overload table in rs6000-c.cc to switch between the - two. If we don't have the proper assembler, don't do this switch - because CODE_FOR_*kf* and CODE_FOR_*tf* will be CODE_FOR_nothing. 
*/ - if (FLOAT128_IEEE_P (TFmode)) - switch (icode) - { - case CODE_FOR_sqrtkf2_odd: - icode = CODE_FOR_sqrttf2_odd; - break; - case CODE_FOR_trunckfdf2_odd: - icode = CODE_FOR_trunctfdf2_odd; - break; - case CODE_FOR_addkf3_odd: - icode = CODE_FOR_addtf3_odd; - break; - case CODE_FOR_subkf3_odd: - icode = CODE_FOR_subtf3_odd; - break; - case CODE_FOR_mulkf3_odd: - icode = CODE_FOR_multf3_odd; - break; - case CODE_FOR_divkf3_odd: - icode = CODE_FOR_divtf3_odd; - break; - case CODE_FOR_fmakf4_odd: - icode = CODE_FOR_fmatf4_odd; - break; - case CODE_FOR_xsxexpqp_kf: - icode = CODE_FOR_xsxexpqp_tf; - break; - case CODE_FOR_xsxsigqp_kf: - icode = CODE_FOR_xsxsigqp_tf; - break; - case CODE_FOR_xststdcnegqp_kf: - icode = CODE_FOR_xststdcnegqp_tf; - break; - case CODE_FOR_xsiexpqp_kf: - icode = CODE_FOR_xsiexpqp_tf; - break; - case CODE_FOR_xsiexpqpf_kf: - icode = CODE_FOR_xsiexpqpf_tf; - break; - case CODE_FOR_xststdcqp_kf: - icode = CODE_FOR_xststdcqp_tf; - break; - case CODE_FOR_xscmpexpqp_eq_kf: - icode = CODE_FOR_xscmpexpqp_eq_tf; - break; - case CODE_FOR_xscmpexpqp_lt_kf: - icode = CODE_FOR_xscmpexpqp_lt_tf; - break; - case CODE_FOR_xscmpexpqp_gt_kf: - icode = CODE_FOR_xscmpexpqp_gt_tf; - break; - case CODE_FOR_xscmpexpqp_unordered_kf: - icode = CODE_FOR_xscmpexpqp_unordered_tf; - break; - default: - break; - } - - /* In case of "#pragma target" changes, we initialize all builtins - but check for actual availability now, during expand time. For - invalid builtins, generate a normal call. */ - bifdata *bifaddr = &rs6000_builtin_info[uns_fcode]; - bif_enable e = bifaddr->enable; - - if (!(e == ENB_ALWAYS - || (e == ENB_P5 && TARGET_POPCNTB) - || (e == ENB_P6 && TARGET_CMPB) - || (e == ENB_P6_64 && TARGET_CMPB && TARGET_POWERPC64) - || (e == ENB_ALTIVEC && TARGET_ALTIVEC) - || (e == ENB_CELL && TARGET_ALTIVEC && rs6000_cpu == PROCESSOR_CELL) - || (e == ENB_VSX && TARGET_VSX) - || (e == ENB_P7 && TARGET_POPCNTD) - || (e == ENB_P7_64 && TARGET_POPCNTD && TARGET_POWERPC64) - || (e == ENB_P8 && TARGET_DIRECT_MOVE) - || (e == ENB_P8V && TARGET_P8_VECTOR) - || (e == ENB_P9 && TARGET_MODULO) - || (e == ENB_P9_64 && TARGET_MODULO && TARGET_POWERPC64) - || (e == ENB_P9V && TARGET_P9_VECTOR) - || (e == ENB_IEEE128_HW && TARGET_FLOAT128_HW) - || (e == ENB_DFP && TARGET_DFP) - || (e == ENB_CRYPTO && TARGET_CRYPTO) - || (e == ENB_HTM && TARGET_HTM) - || (e == ENB_P10 && TARGET_POWER10) - || (e == ENB_P10_64 && TARGET_POWER10 && TARGET_POWERPC64) - || (e == ENB_MMA && TARGET_MMA))) - { - rs6000_invalid_builtin (fcode); - return expand_call (exp, target, ignore); - } - - if (bif_is_nosoft (*bifaddr) - && rs6000_isa_flags & OPTION_MASK_SOFT_FLOAT) - { - error ("%qs not supported with %<-msoft-float%>", - bifaddr->bifname); - return const0_rtx; - } - - if (bif_is_no32bit (*bifaddr) && TARGET_32BIT) - { - error ("%qs is not supported in 32-bit mode", bifaddr->bifname); - return const0_rtx; - } - - if (bif_is_ibmld (*bifaddr) && !FLOAT128_2REG_P (TFmode)) - { - error ("%qs requires % to be IBM 128-bit format", - bifaddr->bifname); - return const0_rtx; - } - - if (bif_is_cpu (*bifaddr)) - return cpu_expand_builtin (fcode, exp, target); - - if (bif_is_init (*bifaddr)) - return altivec_expand_vec_init_builtin (TREE_TYPE (exp), exp, target); - - if (bif_is_set (*bifaddr)) - return altivec_expand_vec_set_builtin (exp); - - if (bif_is_extract (*bifaddr)) - return altivec_expand_vec_ext_builtin (exp, target); - - if (bif_is_predicate (*bifaddr)) - return altivec_expand_predicate_builtin (icode, exp, target); - - if 
(bif_is_htm (*bifaddr)) - return htm_expand_builtin (bifaddr, fcode, exp, target); - - if (bif_is_32bit (*bifaddr) && TARGET_32BIT) - { - if (fcode == RS6000_BIF_MFTB) - icode = CODE_FOR_rs6000_mftb_si; - else if (fcode == RS6000_BIF_BPERMD) - icode = CODE_FOR_bpermd_si; - else if (fcode == RS6000_BIF_DARN) - icode = CODE_FOR_darn_64_si; - else if (fcode == RS6000_BIF_DARN_32) - icode = CODE_FOR_darn_32_si; - else if (fcode == RS6000_BIF_DARN_RAW) - icode = CODE_FOR_darn_raw_si; - else - gcc_unreachable (); - } - - if (bif_is_endian (*bifaddr) && BYTES_BIG_ENDIAN) - { - if (fcode == RS6000_BIF_LD_ELEMREV_V1TI) - icode = CODE_FOR_vsx_load_v1ti; - else if (fcode == RS6000_BIF_LD_ELEMREV_V2DF) - icode = CODE_FOR_vsx_load_v2df; - else if (fcode == RS6000_BIF_LD_ELEMREV_V2DI) - icode = CODE_FOR_vsx_load_v2di; - else if (fcode == RS6000_BIF_LD_ELEMREV_V4SF) - icode = CODE_FOR_vsx_load_v4sf; - else if (fcode == RS6000_BIF_LD_ELEMREV_V4SI) - icode = CODE_FOR_vsx_load_v4si; - else if (fcode == RS6000_BIF_LD_ELEMREV_V8HI) - icode = CODE_FOR_vsx_load_v8hi; - else if (fcode == RS6000_BIF_LD_ELEMREV_V16QI) - icode = CODE_FOR_vsx_load_v16qi; - else if (fcode == RS6000_BIF_ST_ELEMREV_V1TI) - icode = CODE_FOR_vsx_store_v1ti; - else if (fcode == RS6000_BIF_ST_ELEMREV_V2DF) - icode = CODE_FOR_vsx_store_v2df; - else if (fcode == RS6000_BIF_ST_ELEMREV_V2DI) - icode = CODE_FOR_vsx_store_v2di; - else if (fcode == RS6000_BIF_ST_ELEMREV_V4SF) - icode = CODE_FOR_vsx_store_v4sf; - else if (fcode == RS6000_BIF_ST_ELEMREV_V4SI) - icode = CODE_FOR_vsx_store_v4si; - else if (fcode == RS6000_BIF_ST_ELEMREV_V8HI) - icode = CODE_FOR_vsx_store_v8hi; - else if (fcode == RS6000_BIF_ST_ELEMREV_V16QI) - icode = CODE_FOR_vsx_store_v16qi; - else - gcc_unreachable (); - } - - - /* TRUE iff the built-in function returns void. */ - bool void_func = TREE_TYPE (TREE_TYPE (fndecl)) == void_type_node; - /* Position of first argument (0 for void-returning functions, else 1). */ - int k; - /* Modes for the return value, if any, and arguments. */ - const int MAX_BUILTIN_ARGS = 6; - machine_mode mode[MAX_BUILTIN_ARGS + 1]; - - if (void_func) - k = 0; - else - { - k = 1; - mode[0] = insn_data[icode].operand[0].mode; - } - - /* Tree expressions for each argument. */ - tree arg[MAX_BUILTIN_ARGS]; - /* RTL expressions for each argument. */ - rtx op[MAX_BUILTIN_ARGS]; - - int nargs = bifaddr->nargs; - gcc_assert (nargs <= MAX_BUILTIN_ARGS); - - - for (int i = 0; i < nargs; i++) - { - arg[i] = CALL_EXPR_ARG (exp, i); - if (arg[i] == error_mark_node) - return const0_rtx; - STRIP_NOPS (arg[i]); - op[i] = expand_normal (arg[i]); - /* We have a couple of pesky patterns that don't specify the mode... */ - mode[i+k] = insn_data[icode].operand[i+k].mode; - if (!mode[i+k]) - mode[i+k] = Pmode; - } - - /* Check for restricted constant arguments. 
*/ - for (int i = 0; i < 2; i++) - { - switch (bifaddr->restr[i]) - { - case RES_BITS: - { - size_t mask = 1; - mask <<= bifaddr->restr_val1[i]; - mask--; - tree restr_arg = arg[bifaddr->restr_opnd[i] - 1]; - STRIP_NOPS (restr_arg); - if (!(TREE_CODE (restr_arg) == INTEGER_CST - && (TREE_INT_CST_LOW (restr_arg) & ~mask) == 0)) - { - unsigned p = (1U << bifaddr->restr_val1[i]) - 1; - error ("argument %d must be a literal between 0 and %d," - " inclusive", - bifaddr->restr_opnd[i], p); - return CONST0_RTX (mode[0]); - } - break; - } - case RES_RANGE: - { - tree restr_arg = arg[bifaddr->restr_opnd[i] - 1]; - STRIP_NOPS (restr_arg); - if (!(TREE_CODE (restr_arg) == INTEGER_CST - && IN_RANGE (tree_to_shwi (restr_arg), - bifaddr->restr_val1[i], - bifaddr->restr_val2[i]))) - { - error ("argument %d must be a literal between %d and %d," - " inclusive", - bifaddr->restr_opnd[i], bifaddr->restr_val1[i], - bifaddr->restr_val2[i]); - return CONST0_RTX (mode[0]); - } - break; - } - case RES_VAR_RANGE: - { - tree restr_arg = arg[bifaddr->restr_opnd[i] - 1]; - STRIP_NOPS (restr_arg); - if (TREE_CODE (restr_arg) == INTEGER_CST - && !IN_RANGE (tree_to_shwi (restr_arg), - bifaddr->restr_val1[i], - bifaddr->restr_val2[i])) - { - error ("argument %d must be a variable or a literal " - "between %d and %d, inclusive", - bifaddr->restr_opnd[i], bifaddr->restr_val1[i], - bifaddr->restr_val2[i]); - return CONST0_RTX (mode[0]); - } - break; - } - case RES_VALUES: - { - tree restr_arg = arg[bifaddr->restr_opnd[i] - 1]; - STRIP_NOPS (restr_arg); - if (!(TREE_CODE (restr_arg) == INTEGER_CST - && (tree_to_shwi (restr_arg) == bifaddr->restr_val1[i] - || tree_to_shwi (restr_arg) == bifaddr->restr_val2[i]))) - { - error ("argument %d must be either a literal %d or a " - "literal %d", - bifaddr->restr_opnd[i], bifaddr->restr_val1[i], - bifaddr->restr_val2[i]); - return CONST0_RTX (mode[0]); - } - break; - } - default: - case RES_NONE: - break; - } - } - - if (bif_is_ldstmask (*bifaddr)) - return rs6000_expand_ldst_mask (target, arg[0]); - - if (bif_is_stvec (*bifaddr)) - { - if (bif_is_reve (*bifaddr)) - icode = elemrev_icode (fcode); - return stv_expand_builtin (icode, op, mode[0], mode[1]); - } - - if (bif_is_ldvec (*bifaddr)) - { - if (bif_is_reve (*bifaddr)) - icode = elemrev_icode (fcode); - return ldv_expand_builtin (target, icode, op, mode[0]); - } - - if (bif_is_lxvrse (*bifaddr)) - return lxvrse_expand_builtin (target, icode, op, mode[0], mode[1]); - - if (bif_is_lxvrze (*bifaddr)) - return lxvrze_expand_builtin (target, icode, op, mode[0], mode[1]); - - if (bif_is_mma (*bifaddr)) - return mma_expand_builtin (exp, target, icode, fcode); - - if (fcode == RS6000_BIF_PACK_IF - && TARGET_LONG_DOUBLE_128 - && !TARGET_IEEEQUAD) - { - icode = CODE_FOR_packtf; - fcode = RS6000_BIF_PACK_TF; - uns_fcode = (size_t) fcode; - } - else if (fcode == RS6000_BIF_UNPACK_IF - && TARGET_LONG_DOUBLE_128 - && !TARGET_IEEEQUAD) - { - icode = CODE_FOR_unpacktf; - fcode = RS6000_BIF_UNPACK_TF; - uns_fcode = (size_t) fcode; - } - - if (TREE_TYPE (TREE_TYPE (fndecl)) == void_type_node) - target = NULL_RTX; - else if (target == 0 - || GET_MODE (target) != mode[0] - || !insn_data[icode].operand[0].predicate (target, mode[0])) - target = gen_reg_rtx (mode[0]); - - for (int i = 0; i < nargs; i++) - if (!insn_data[icode].operand[i+k].predicate (op[i], mode[i+k])) - op[i] = copy_to_mode_reg (mode[i+k], op[i]); - - rtx pat; - - switch (nargs) - { - case 0: - pat = (void_func - ? 
GEN_FCN (icode) () - : GEN_FCN (icode) (target)); - break; - case 1: - pat = (void_func - ? GEN_FCN (icode) (op[0]) - : GEN_FCN (icode) (target, op[0])); - break; - case 2: - pat = (void_func - ? GEN_FCN (icode) (op[0], op[1]) - : GEN_FCN (icode) (target, op[0], op[1])); - break; - case 3: - pat = (void_func - ? GEN_FCN (icode) (op[0], op[1], op[2]) - : GEN_FCN (icode) (target, op[0], op[1], op[2])); - break; - case 4: - pat = (void_func - ? GEN_FCN (icode) (op[0], op[1], op[2], op[3]) - : GEN_FCN (icode) (target, op[0], op[1], op[2], op[3])); - break; - case 5: - pat = (void_func - ? GEN_FCN (icode) (op[0], op[1], op[2], op[3], op[4]) - : GEN_FCN (icode) (target, op[0], op[1], op[2], op[3], op[4])); - break; - case 6: - pat = (void_func - ? GEN_FCN (icode) (op[0], op[1], op[2], op[3], op[4], op[5]) - : GEN_FCN (icode) (target, op[0], op[1], - op[2], op[3], op[4], op[5])); - break; - default: - gcc_assert (MAX_BUILTIN_ARGS == 6); - gcc_unreachable (); - } - - if (!pat) - return 0; - - emit_insn (pat); - return target; -} - -/* Create a builtin vector type with a name. Taking care not to give - the canonical type a name. */ - -static tree -rs6000_vector_type (const char *name, tree elt_type, unsigned num_elts) -{ - tree result = build_vector_type (elt_type, num_elts); - - /* Copy so we don't give the canonical type a name. */ - result = build_variant_type_copy (result); - - add_builtin_type (name, result); - - return result; -} - -void -rs6000_init_builtins (void) -{ - tree tdecl; - tree t; - - if (TARGET_DEBUG_BUILTIN) - fprintf (stderr, "rs6000_init_builtins%s%s\n", - (TARGET_ALTIVEC) ? ", altivec" : "", - (TARGET_VSX) ? ", vsx" : ""); - - V2DI_type_node = rs6000_vector_type ("__vector long long", - long_long_integer_type_node, 2); - ptr_V2DI_type_node - = build_pointer_type (build_qualified_type (V2DI_type_node, - TYPE_QUAL_CONST)); - - V2DF_type_node = rs6000_vector_type ("__vector double", double_type_node, 2); - ptr_V2DF_type_node - = build_pointer_type (build_qualified_type (V2DF_type_node, - TYPE_QUAL_CONST)); - - V4SI_type_node = rs6000_vector_type ("__vector signed int", - intSI_type_node, 4); - ptr_V4SI_type_node - = build_pointer_type (build_qualified_type (V4SI_type_node, - TYPE_QUAL_CONST)); - - V4SF_type_node = rs6000_vector_type ("__vector float", float_type_node, 4); - ptr_V4SF_type_node - = build_pointer_type (build_qualified_type (V4SF_type_node, - TYPE_QUAL_CONST)); - - V8HI_type_node = rs6000_vector_type ("__vector signed short", - intHI_type_node, 8); - ptr_V8HI_type_node - = build_pointer_type (build_qualified_type (V8HI_type_node, - TYPE_QUAL_CONST)); - - V16QI_type_node = rs6000_vector_type ("__vector signed char", - intQI_type_node, 16); - ptr_V16QI_type_node - = build_pointer_type (build_qualified_type (V16QI_type_node, - TYPE_QUAL_CONST)); - - unsigned_V16QI_type_node = rs6000_vector_type ("__vector unsigned char", - unsigned_intQI_type_node, 16); - ptr_unsigned_V16QI_type_node - = build_pointer_type (build_qualified_type (unsigned_V16QI_type_node, - TYPE_QUAL_CONST)); - - unsigned_V8HI_type_node = rs6000_vector_type ("__vector unsigned short", - unsigned_intHI_type_node, 8); - ptr_unsigned_V8HI_type_node - = build_pointer_type (build_qualified_type (unsigned_V8HI_type_node, - TYPE_QUAL_CONST)); - - unsigned_V4SI_type_node = rs6000_vector_type ("__vector unsigned int", - unsigned_intSI_type_node, 4); - ptr_unsigned_V4SI_type_node - = build_pointer_type (build_qualified_type (unsigned_V4SI_type_node, - TYPE_QUAL_CONST)); - - unsigned_V2DI_type_node - = 
rs6000_vector_type ("__vector unsigned long long", - long_long_unsigned_type_node, 2); - - ptr_unsigned_V2DI_type_node - = build_pointer_type (build_qualified_type (unsigned_V2DI_type_node, - TYPE_QUAL_CONST)); - - opaque_V4SI_type_node = build_opaque_vector_type (intSI_type_node, 4); - - const_str_type_node - = build_pointer_type (build_qualified_type (char_type_node, - TYPE_QUAL_CONST)); - - /* We use V1TI mode as a special container to hold __int128_t items that - must live in VSX registers. */ - if (intTI_type_node) - { - V1TI_type_node = rs6000_vector_type ("__vector __int128", - intTI_type_node, 1); - ptr_V1TI_type_node - = build_pointer_type (build_qualified_type (V1TI_type_node, - TYPE_QUAL_CONST)); - unsigned_V1TI_type_node - = rs6000_vector_type ("__vector unsigned __int128", - unsigned_intTI_type_node, 1); - ptr_unsigned_V1TI_type_node - = build_pointer_type (build_qualified_type (unsigned_V1TI_type_node, - TYPE_QUAL_CONST)); - } - - /* The 'vector bool ...' types must be kept distinct from 'vector unsigned ...' - types, especially in C++ land. Similarly, 'vector pixel' is distinct from - 'vector unsigned short'. */ - - bool_char_type_node = build_distinct_type_copy (unsigned_intQI_type_node); - bool_short_type_node = build_distinct_type_copy (unsigned_intHI_type_node); - bool_int_type_node = build_distinct_type_copy (unsigned_intSI_type_node); - bool_long_long_type_node = build_distinct_type_copy (unsigned_intDI_type_node); - pixel_type_node = build_distinct_type_copy (unsigned_intHI_type_node); - - long_integer_type_internal_node = long_integer_type_node; - long_unsigned_type_internal_node = long_unsigned_type_node; - long_long_integer_type_internal_node = long_long_integer_type_node; - long_long_unsigned_type_internal_node = long_long_unsigned_type_node; - intQI_type_internal_node = intQI_type_node; - uintQI_type_internal_node = unsigned_intQI_type_node; - intHI_type_internal_node = intHI_type_node; - uintHI_type_internal_node = unsigned_intHI_type_node; - intSI_type_internal_node = intSI_type_node; - uintSI_type_internal_node = unsigned_intSI_type_node; - intDI_type_internal_node = intDI_type_node; - uintDI_type_internal_node = unsigned_intDI_type_node; - intTI_type_internal_node = intTI_type_node; - uintTI_type_internal_node = unsigned_intTI_type_node; - float_type_internal_node = float_type_node; - double_type_internal_node = double_type_node; - long_double_type_internal_node = long_double_type_node; - dfloat64_type_internal_node = dfloat64_type_node; - dfloat128_type_internal_node = dfloat128_type_node; - void_type_internal_node = void_type_node; - - ptr_intQI_type_node - = build_pointer_type (build_qualified_type (intQI_type_internal_node, - TYPE_QUAL_CONST)); - ptr_uintQI_type_node - = build_pointer_type (build_qualified_type (uintQI_type_internal_node, - TYPE_QUAL_CONST)); - ptr_intHI_type_node - = build_pointer_type (build_qualified_type (intHI_type_internal_node, - TYPE_QUAL_CONST)); - ptr_uintHI_type_node - = build_pointer_type (build_qualified_type (uintHI_type_internal_node, - TYPE_QUAL_CONST)); - ptr_intSI_type_node - = build_pointer_type (build_qualified_type (intSI_type_internal_node, - TYPE_QUAL_CONST)); - ptr_uintSI_type_node - = build_pointer_type (build_qualified_type (uintSI_type_internal_node, - TYPE_QUAL_CONST)); - ptr_intDI_type_node - = build_pointer_type (build_qualified_type (intDI_type_internal_node, - TYPE_QUAL_CONST)); - ptr_uintDI_type_node - = build_pointer_type (build_qualified_type (uintDI_type_internal_node, - TYPE_QUAL_CONST)); - 
ptr_intTI_type_node - = build_pointer_type (build_qualified_type (intTI_type_internal_node, - TYPE_QUAL_CONST)); - ptr_uintTI_type_node - = build_pointer_type (build_qualified_type (uintTI_type_internal_node, - TYPE_QUAL_CONST)); - - t = build_qualified_type (long_integer_type_internal_node, TYPE_QUAL_CONST); - ptr_long_integer_type_node = build_pointer_type (t); - - t = build_qualified_type (long_unsigned_type_internal_node, TYPE_QUAL_CONST); - ptr_long_unsigned_type_node = build_pointer_type (t); - - ptr_float_type_node - = build_pointer_type (build_qualified_type (float_type_internal_node, - TYPE_QUAL_CONST)); - ptr_double_type_node - = build_pointer_type (build_qualified_type (double_type_internal_node, - TYPE_QUAL_CONST)); - ptr_long_double_type_node - = build_pointer_type (build_qualified_type (long_double_type_internal_node, - TYPE_QUAL_CONST)); - if (dfloat64_type_node) - { - t = build_qualified_type (dfloat64_type_internal_node, TYPE_QUAL_CONST); - ptr_dfloat64_type_node = build_pointer_type (t); - } - else - ptr_dfloat64_type_node = NULL; - - if (dfloat128_type_node) - { - t = build_qualified_type (dfloat128_type_internal_node, TYPE_QUAL_CONST); - ptr_dfloat128_type_node = build_pointer_type (t); - } - else - ptr_dfloat128_type_node = NULL; - - t = build_qualified_type (long_long_integer_type_internal_node, - TYPE_QUAL_CONST); - ptr_long_long_integer_type_node = build_pointer_type (t); - - t = build_qualified_type (long_long_unsigned_type_internal_node, - TYPE_QUAL_CONST); - ptr_long_long_unsigned_type_node = build_pointer_type (t); - - /* 128-bit floating point support. KFmode is IEEE 128-bit floating point. - IFmode is the IBM extended 128-bit format that is a pair of doubles. - TFmode will be either IEEE 128-bit floating point or the IBM double-double - format that uses a pair of doubles, depending on the switches and - defaults. - - If we don't support for either 128-bit IBM double double or IEEE 128-bit - floating point, we need make sure the type is non-zero or else self-test - fails during bootstrap. - - Always create __ibm128 as a separate type, even if the current long double - format is IBM extended double. - - For IEEE 128-bit floating point, always create the type __ieee128. If the - user used -mfloat128, rs6000-c.cc will create a define from __float128 to - __ieee128. */ - if (TARGET_FLOAT128_TYPE) - { - if (!TARGET_IEEEQUAD && TARGET_LONG_DOUBLE_128) - ibm128_float_type_node = long_double_type_node; - else - { - ibm128_float_type_node = make_node (REAL_TYPE); - TYPE_PRECISION (ibm128_float_type_node) = 128; - SET_TYPE_MODE (ibm128_float_type_node, IFmode); - layout_type (ibm128_float_type_node); - } - t = build_qualified_type (ibm128_float_type_node, TYPE_QUAL_CONST); - ptr_ibm128_float_type_node = build_pointer_type (t); - lang_hooks.types.register_builtin_type (ibm128_float_type_node, - "__ibm128"); - - if (TARGET_IEEEQUAD && TARGET_LONG_DOUBLE_128) - ieee128_float_type_node = long_double_type_node; - else - ieee128_float_type_node = float128_type_node; - t = build_qualified_type (ieee128_float_type_node, TYPE_QUAL_CONST); - ptr_ieee128_float_type_node = build_pointer_type (t); - lang_hooks.types.register_builtin_type (ieee128_float_type_node, - "__ieee128"); - } - - else - ieee128_float_type_node = ibm128_float_type_node = long_double_type_node; - - /* Vector pair and vector quad support. 
*/ - vector_pair_type_node = make_node (OPAQUE_TYPE); - SET_TYPE_MODE (vector_pair_type_node, OOmode); - TYPE_SIZE (vector_pair_type_node) = bitsize_int (GET_MODE_BITSIZE (OOmode)); - TYPE_PRECISION (vector_pair_type_node) = GET_MODE_BITSIZE (OOmode); - TYPE_SIZE_UNIT (vector_pair_type_node) = size_int (GET_MODE_SIZE (OOmode)); - SET_TYPE_ALIGN (vector_pair_type_node, 256); - TYPE_USER_ALIGN (vector_pair_type_node) = 0; - lang_hooks.types.register_builtin_type (vector_pair_type_node, - "__vector_pair"); - t = build_qualified_type (vector_pair_type_node, TYPE_QUAL_CONST); - ptr_vector_pair_type_node = build_pointer_type (t); - - vector_quad_type_node = make_node (OPAQUE_TYPE); - SET_TYPE_MODE (vector_quad_type_node, XOmode); - TYPE_SIZE (vector_quad_type_node) = bitsize_int (GET_MODE_BITSIZE (XOmode)); - TYPE_PRECISION (vector_quad_type_node) = GET_MODE_BITSIZE (XOmode); - TYPE_SIZE_UNIT (vector_quad_type_node) = size_int (GET_MODE_SIZE (XOmode)); - SET_TYPE_ALIGN (vector_quad_type_node, 512); - TYPE_USER_ALIGN (vector_quad_type_node) = 0; - lang_hooks.types.register_builtin_type (vector_quad_type_node, - "__vector_quad"); - t = build_qualified_type (vector_quad_type_node, TYPE_QUAL_CONST); - ptr_vector_quad_type_node = build_pointer_type (t); - - /* Initialize the modes for builtin_function_type, mapping a machine mode to - tree type node. */ - builtin_mode_to_type[QImode][0] = integer_type_node; - builtin_mode_to_type[QImode][1] = unsigned_intSI_type_node; - builtin_mode_to_type[HImode][0] = integer_type_node; - builtin_mode_to_type[HImode][1] = unsigned_intSI_type_node; - builtin_mode_to_type[SImode][0] = intSI_type_node; - builtin_mode_to_type[SImode][1] = unsigned_intSI_type_node; - builtin_mode_to_type[DImode][0] = intDI_type_node; - builtin_mode_to_type[DImode][1] = unsigned_intDI_type_node; - builtin_mode_to_type[TImode][0] = intTI_type_node; - builtin_mode_to_type[TImode][1] = unsigned_intTI_type_node; - builtin_mode_to_type[SFmode][0] = float_type_node; - builtin_mode_to_type[DFmode][0] = double_type_node; - builtin_mode_to_type[IFmode][0] = ibm128_float_type_node; - builtin_mode_to_type[KFmode][0] = ieee128_float_type_node; - builtin_mode_to_type[TFmode][0] = long_double_type_node; - builtin_mode_to_type[DDmode][0] = dfloat64_type_node; - builtin_mode_to_type[TDmode][0] = dfloat128_type_node; - builtin_mode_to_type[V1TImode][0] = V1TI_type_node; - builtin_mode_to_type[V1TImode][1] = unsigned_V1TI_type_node; - builtin_mode_to_type[V2DImode][0] = V2DI_type_node; - builtin_mode_to_type[V2DImode][1] = unsigned_V2DI_type_node; - builtin_mode_to_type[V2DFmode][0] = V2DF_type_node; - builtin_mode_to_type[V4SImode][0] = V4SI_type_node; - builtin_mode_to_type[V4SImode][1] = unsigned_V4SI_type_node; - builtin_mode_to_type[V4SFmode][0] = V4SF_type_node; - builtin_mode_to_type[V8HImode][0] = V8HI_type_node; - builtin_mode_to_type[V8HImode][1] = unsigned_V8HI_type_node; - builtin_mode_to_type[V16QImode][0] = V16QI_type_node; - builtin_mode_to_type[V16QImode][1] = unsigned_V16QI_type_node; - builtin_mode_to_type[OOmode][1] = vector_pair_type_node; - builtin_mode_to_type[XOmode][1] = vector_quad_type_node; - - tdecl = add_builtin_type ("__bool char", bool_char_type_node); - TYPE_NAME (bool_char_type_node) = tdecl; - - tdecl = add_builtin_type ("__bool short", bool_short_type_node); - TYPE_NAME (bool_short_type_node) = tdecl; - - tdecl = add_builtin_type ("__bool int", bool_int_type_node); - TYPE_NAME (bool_int_type_node) = tdecl; - - tdecl = add_builtin_type ("__pixel", pixel_type_node); - 
TYPE_NAME (pixel_type_node) = tdecl; - - bool_V16QI_type_node = rs6000_vector_type ("__vector __bool char", - bool_char_type_node, 16); - ptr_bool_V16QI_type_node - = build_pointer_type (build_qualified_type (bool_V16QI_type_node, - TYPE_QUAL_CONST)); - - bool_V8HI_type_node = rs6000_vector_type ("__vector __bool short", - bool_short_type_node, 8); - ptr_bool_V8HI_type_node - = build_pointer_type (build_qualified_type (bool_V8HI_type_node, - TYPE_QUAL_CONST)); - - bool_V4SI_type_node = rs6000_vector_type ("__vector __bool int", - bool_int_type_node, 4); - ptr_bool_V4SI_type_node - = build_pointer_type (build_qualified_type (bool_V4SI_type_node, - TYPE_QUAL_CONST)); - - bool_V2DI_type_node = rs6000_vector_type (TARGET_POWERPC64 - ? "__vector __bool long" - : "__vector __bool long long", - bool_long_long_type_node, 2); - ptr_bool_V2DI_type_node - = build_pointer_type (build_qualified_type (bool_V2DI_type_node, - TYPE_QUAL_CONST)); - - bool_V1TI_type_node = rs6000_vector_type ("__vector __bool __int128", - intTI_type_node, 1); - ptr_bool_V1TI_type_node - = build_pointer_type (build_qualified_type (bool_V1TI_type_node, - TYPE_QUAL_CONST)); - - pixel_V8HI_type_node = rs6000_vector_type ("__vector __pixel", - pixel_type_node, 8); - ptr_pixel_V8HI_type_node - = build_pointer_type (build_qualified_type (pixel_V8HI_type_node, - TYPE_QUAL_CONST)); - pcvoid_type_node - = build_pointer_type (build_qualified_type (void_type_node, - TYPE_QUAL_CONST)); - - /* Execute the autogenerated initialization code for builtins. */ - rs6000_init_generated_builtins (); - - if (TARGET_DEBUG_BUILTIN) - { - fprintf (stderr, "\nAutogenerated built-in functions:\n\n"); - for (int i = 1; i < (int) RS6000_BIF_MAX; i++) - { - bif_enable e = rs6000_builtin_info[i].enable; - if (e == ENB_P5 && !TARGET_POPCNTB) - continue; - if (e == ENB_P6 && !TARGET_CMPB) - continue; - if (e == ENB_P6_64 && !(TARGET_CMPB && TARGET_POWERPC64)) - continue; - if (e == ENB_ALTIVEC && !TARGET_ALTIVEC) - continue; - if (e == ENB_VSX && !TARGET_VSX) - continue; - if (e == ENB_P7 && !TARGET_POPCNTD) - continue; - if (e == ENB_P7_64 && !(TARGET_POPCNTD && TARGET_POWERPC64)) - continue; - if (e == ENB_P8 && !TARGET_DIRECT_MOVE) - continue; - if (e == ENB_P8V && !TARGET_P8_VECTOR) - continue; - if (e == ENB_P9 && !TARGET_MODULO) - continue; - if (e == ENB_P9_64 && !(TARGET_MODULO && TARGET_POWERPC64)) - continue; - if (e == ENB_P9V && !TARGET_P9_VECTOR) - continue; - if (e == ENB_IEEE128_HW && !TARGET_FLOAT128_HW) - continue; - if (e == ENB_DFP && !TARGET_DFP) - continue; - if (e == ENB_CRYPTO && !TARGET_CRYPTO) - continue; - if (e == ENB_HTM && !TARGET_HTM) - continue; - if (e == ENB_P10 && !TARGET_POWER10) - continue; - if (e == ENB_P10_64 && !(TARGET_POWER10 && TARGET_POWERPC64)) - continue; - if (e == ENB_MMA && !TARGET_MMA) - continue; - tree fntype = rs6000_builtin_info[i].fntype; - tree t = TREE_TYPE (fntype); - fprintf (stderr, "%s %s (", rs6000_type_string (t), - rs6000_builtin_info[i].bifname); - t = TYPE_ARG_TYPES (fntype); - while (t && TREE_VALUE (t) != void_type_node) - { - fprintf (stderr, "%s", - rs6000_type_string (TREE_VALUE (t))); - t = TREE_CHAIN (t); - if (t && TREE_VALUE (t) != void_type_node) - fprintf (stderr, ", "); - } - fprintf (stderr, "); %s [%4d]\n", - rs6000_builtin_info[i].attr_string, (int) i); - } - fprintf (stderr, "\nEnd autogenerated built-in functions.\n\n\n"); - } - - if (TARGET_XCOFF) - { - /* AIX libm provides clog as __clog. 
*/ - if ((tdecl = builtin_decl_explicit (BUILT_IN_CLOG)) != NULL_TREE) - set_user_assembler_name (tdecl, "__clog"); - - /* When long double is 64 bit, some long double builtins of libc - functions (like __builtin_frexpl) must call the double version - (frexp) not the long double version (frexpl) that expects a 128 bit - argument. */ - if (! TARGET_LONG_DOUBLE_128) - { - if ((tdecl = builtin_decl_explicit (BUILT_IN_FMODL)) != NULL_TREE) - set_user_assembler_name (tdecl, "fmod"); - if ((tdecl = builtin_decl_explicit (BUILT_IN_FREXPL)) != NULL_TREE) - set_user_assembler_name (tdecl, "frexp"); - if ((tdecl = builtin_decl_explicit (BUILT_IN_LDEXPL)) != NULL_TREE) - set_user_assembler_name (tdecl, "ldexp"); - if ((tdecl = builtin_decl_explicit (BUILT_IN_MODFL)) != NULL_TREE) - set_user_assembler_name (tdecl, "modf"); - } - } - - altivec_builtin_mask_for_load - = rs6000_builtin_decls[RS6000_BIF_MASK_FOR_LOAD]; - -#ifdef SUBTARGET_INIT_BUILTINS - SUBTARGET_INIT_BUILTINS; -#endif - - return; -} - -tree -rs6000_builtin_decl (unsigned code, bool /* initialize_p */) -{ - rs6000_gen_builtins fcode = (rs6000_gen_builtins) code; - - if (fcode >= RS6000_OVLD_MAX) - return error_mark_node; - - return rs6000_builtin_decls[code]; -} - /* Return the internal arg pointer used for function incoming arguments. When -fsplit-stack, the arg pointer is r12 so we need to copy it to a pseudo in order for it to be preserved over calls diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index a5fd36b..ac6dd19 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -86,6 +86,10 @@ /* This file should be included last. */ #include "target-def.h" +extern tree rs6000_builtin_mask_for_load (void); +extern tree rs6000_builtin_md_vectorized_function (tree, tree, tree); +extern tree rs6000_builtin_reciprocal (tree); + /* Set -mabi=ieeelongdouble on some old targets. In the future, power server systems will also set long double to be IEEE 128-bit. AIX and Darwin explicitly redefine TARGET_IEEEQUAD and TARGET_IEEEQUAD_DEFAULT to 0, so @@ -105,9 +109,6 @@ #define PCREL_SUPPORTED_BY_OS 0 #endif -/* Support targetm.vectorize.builtin_mask_for_load. */ -tree altivec_builtin_mask_for_load; - #ifdef USING_ELFOS_H /* Counter for labels which are to be placed in .fixup. */ int fixuplabelno = 0; @@ -159,9 +160,6 @@ enum reg_class rs6000_regno_regclass[FIRST_PSEUDO_REGISTER]; static int dbg_cost_ctrl; -/* Built in types. */ -tree rs6000_builtin_types[RS6000_BTI_MAX]; - /* Flag to say the TOC is initialized */ int toc_initialized, need_toc_init; char toc_label_name[10]; @@ -190,9 +188,6 @@ enum reg_class rs6000_constraints[RS6000_CONSTRAINT_MAX]; /* Describe the alignment of a vector. */ int rs6000_vector_align[NUM_MACHINE_MODES]; -/* Map selected modes to types for builtins. */ -tree builtin_mode_to_type[MAX_MACHINE_MODE][2]; - /* What modes to automatically generate reciprocal divide estimate (fre) and reciprocal sqrt (frsqrte) for. */ unsigned char rs6000_recip_bits[MAX_MACHINE_MODE]; @@ -4969,18 +4964,6 @@ rs6000_option_override (void) } -/* Implement targetm.vectorize.builtin_mask_for_load. */ -static tree -rs6000_builtin_mask_for_load (void) -{ - /* Don't use lvsl/vperm for P8 and similarly efficient machines. */ - if ((TARGET_ALTIVEC && !TARGET_VSX) - || (TARGET_VSX && !TARGET_EFFICIENT_UNALIGNED_VSX)) - return altivec_builtin_mask_for_load; - else - return 0; -} - /* Implement LOOP_ALIGN. 
*/ align_flags rs6000_loop_align (rtx label) @@ -5689,119 +5672,6 @@ rs6000_builtin_vectorized_function (unsigned int fn, tree type_out, return NULL_TREE; } -/* Implement targetm.vectorize.builtin_md_vectorized_function. */ - -static tree -rs6000_builtin_md_vectorized_function (tree fndecl, tree type_out, - tree type_in) -{ - machine_mode in_mode, out_mode; - int in_n, out_n; - - if (TARGET_DEBUG_BUILTIN) - fprintf (stderr, - "rs6000_builtin_md_vectorized_function (%s, %s, %s)\n", - IDENTIFIER_POINTER (DECL_NAME (fndecl)), - GET_MODE_NAME (TYPE_MODE (type_out)), - GET_MODE_NAME (TYPE_MODE (type_in))); - - /* TODO: Should this be gcc_assert? */ - if (TREE_CODE (type_out) != VECTOR_TYPE - || TREE_CODE (type_in) != VECTOR_TYPE) - return NULL_TREE; - - out_mode = TYPE_MODE (TREE_TYPE (type_out)); - out_n = TYPE_VECTOR_SUBPARTS (type_out); - in_mode = TYPE_MODE (TREE_TYPE (type_in)); - in_n = TYPE_VECTOR_SUBPARTS (type_in); - - enum rs6000_gen_builtins fn - = (enum rs6000_gen_builtins) DECL_MD_FUNCTION_CODE (fndecl); - switch (fn) - { - case RS6000_BIF_RSQRTF: - if (VECTOR_UNIT_ALTIVEC_OR_VSX_P (V4SFmode) - && out_mode == SFmode && out_n == 4 - && in_mode == SFmode && in_n == 4) - return rs6000_builtin_decls[RS6000_BIF_VRSQRTFP]; - break; - case RS6000_BIF_RSQRT: - if (VECTOR_UNIT_VSX_P (V2DFmode) - && out_mode == DFmode && out_n == 2 - && in_mode == DFmode && in_n == 2) - return rs6000_builtin_decls[RS6000_BIF_RSQRT_2DF]; - break; - case RS6000_BIF_RECIPF: - if (VECTOR_UNIT_ALTIVEC_OR_VSX_P (V4SFmode) - && out_mode == SFmode && out_n == 4 - && in_mode == SFmode && in_n == 4) - return rs6000_builtin_decls[RS6000_BIF_VRECIPFP]; - break; - case RS6000_BIF_RECIP: - if (VECTOR_UNIT_VSX_P (V2DFmode) - && out_mode == DFmode && out_n == 2 - && in_mode == DFmode && in_n == 2) - return rs6000_builtin_decls[RS6000_BIF_RECIP_V2DF]; - break; - default: - break; - } - - machine_mode in_vmode = TYPE_MODE (type_in); - machine_mode out_vmode = TYPE_MODE (type_out); - - /* Power10 supported vectorized built-in functions. */ - if (TARGET_POWER10 - && in_vmode == out_vmode - && VECTOR_UNIT_ALTIVEC_OR_VSX_P (in_vmode)) - { - machine_mode exp_mode = DImode; - machine_mode exp_vmode = V2DImode; - enum rs6000_gen_builtins bif; - switch (fn) - { - case RS6000_BIF_DIVWE: - case RS6000_BIF_DIVWEU: - exp_mode = SImode; - exp_vmode = V4SImode; - if (fn == RS6000_BIF_DIVWE) - bif = RS6000_BIF_VDIVESW; - else - bif = RS6000_BIF_VDIVEUW; - break; - case RS6000_BIF_DIVDE: - case RS6000_BIF_DIVDEU: - if (fn == RS6000_BIF_DIVDE) - bif = RS6000_BIF_VDIVESD; - else - bif = RS6000_BIF_VDIVEUD; - break; - case RS6000_BIF_CFUGED: - bif = RS6000_BIF_VCFUGED; - break; - case RS6000_BIF_CNTLZDM: - bif = RS6000_BIF_VCLZDM; - break; - case RS6000_BIF_CNTTZDM: - bif = RS6000_BIF_VCTZDM; - break; - case RS6000_BIF_PDEPD: - bif = RS6000_BIF_VPDEPD; - break; - case RS6000_BIF_PEXTD: - bif = RS6000_BIF_VPEXTD; - break; - default: - return NULL_TREE; - } - - if (in_mode == exp_mode && in_vmode == exp_vmode) - return rs6000_builtin_decls[bif]; - } - - return NULL_TREE; -} - /* Handler for the Mathematical Acceleration Subsystem (mass) interface to a library with vectorized intrinsics. */ @@ -22543,31 +22413,6 @@ rs6000_ira_change_pseudo_allocno_class (int regno ATTRIBUTE_UNUSED, return allocno_class; } -/* Returns a code for a target-specific builtin that implements - reciprocal of the function, or NULL_TREE if not available. 
*/ - -static tree -rs6000_builtin_reciprocal (tree fndecl) -{ - switch (DECL_MD_FUNCTION_CODE (fndecl)) - { - case RS6000_BIF_XVSQRTDP: - if (!RS6000_RECIP_AUTO_RSQRTE_P (V2DFmode)) - return NULL_TREE; - - return rs6000_builtin_decls[RS6000_BIF_RSQRT_2DF]; - - case RS6000_BIF_XVSQRTSP: - if (!RS6000_RECIP_AUTO_RSQRTE_P (V4SFmode)) - return NULL_TREE; - - return rs6000_builtin_decls[RS6000_BIF_RSQRT_4SF]; - - default: - return NULL_TREE; - } -} - /* Load up a constant. If the mode is a vector mode, splat the value across all of the vector elements. */ diff --git a/gcc/config/rs6000/rs6000.h b/gcc/config/rs6000/rs6000.h index 5fdb8f2..17af314 100644 --- a/gcc/config/rs6000/rs6000.h +++ b/gcc/config/rs6000/rs6000.h @@ -2551,7 +2551,6 @@ enum rs6000_builtin_type_index extern GTY(()) tree rs6000_builtin_types[RS6000_BTI_MAX]; #ifndef USED_FOR_TARGET -extern GTY(()) tree builtin_mode_to_type[MAX_MACHINE_MODE][2]; extern GTY(()) tree altivec_builtin_mask_for_load; extern GTY(()) section *toc_section; diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000 index 1a460d9..597cea4 100644 --- a/gcc/config/rs6000/t-rs6000 +++ b/gcc/config/rs6000/t-rs6000 @@ -43,6 +43,10 @@ rs6000-logue.o: $(srcdir)/config/rs6000/rs6000-logue.cc $(COMPILE) $< $(POSTCOMPILE) +rs6000-builtin.o: $(srcdir)/config/rs6000/rs6000-builtin.cc + $(COMPILE) $< + $(POSTCOMPILE) + build/rs6000-gen-builtins.o: $(srcdir)/config/rs6000/rs6000-gen-builtins.cc build/rbtree.o: $(srcdir)/config/rs6000/rbtree.cc -- cgit v1.1 From 3f30f2d1dbb3228b8468b26239fe60c2974ce2ac Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Wed, 2 Feb 2022 21:24:22 -0600 Subject: rs6000: Fix LE code gen for vec_cnt[lt]z_lsbb [PR95082] These built-ins were misimplemented as always having big-endian semantics. 2022-01-18 Bill Schmidt gcc/ PR target/95082 * config/rs6000/rs6000-builtin.cc (rs6000_expand_builtin): Handle endianness for vclzlsbb and vctzlsbb. * config/rs6000/rs6000-builtins.def (VCLZLSBB_V16QI): Change default pattern and indicate a different pattern will be used for big endian. (VCLZLSBB_V4SI): Likewise. (VCLZLSBB_V8HI): Likewise. (VCTZLSBB_V16QI): Likewise. (VCTZLSBB_V4SI): Likewise. (VCTZLSBB_V8HI): Likewise. gcc/testsuite/ PR target/95082 * gcc.target/powerpc/vsu/vec-cntlz-lsbb-0.c: Restrict to -mbig. * gcc.target/powerpc/vsu/vec-cntlz-lsbb-1.c: Likewise. * gcc.target/powerpc/vsu/vec-cntlz-lsbb-3.c: New. * gcc.target/powerpc/vsu/vec-cntlz-lsbb-4.c: New. * gcc.target/powerpc/vsu/vec-cnttz-lsbb-0.c: Restrict to -mbig. * gcc.target/powerpc/vsu/vec-cnttz-lsbb-1.c: Likewise. * gcc.target/powerpc/vsu/vec-cnttz-lsbb-3.c: New. * gcc.target/powerpc/vsu/vec-cnttz-lsbb-4.c: New. 
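As a usage sketch only (assuming the Power9 vec_cntlz_lsbb intrinsic from <altivec.h>, compiled with -mcpu=power9 or later; this is not one of the new or adjusted tests), the key point is that the intrinsic is defined in terms of element 0 of the vector, which sits at opposite ends of the register on big and little endian, so the little-endian expansion must use the opposite hardware instruction:

  #include <altivec.h>

  /* Count leading byte elements, starting from element 0, whose least
     significant bit is zero.  With this fix the little-endian default
     expands to vctzlsbb and only big endian maps to vclzlsbb.  */
  int
  count_leading_lsbb_zeros (vector signed char v)
  {
    return vec_cntlz_lsbb (v);
  }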
--- gcc/config/rs6000/rs6000-builtin.cc | 12 ++++++++++++ gcc/config/rs6000/rs6000-builtins.def | 12 ++++++------ 2 files changed, 18 insertions(+), 6 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-builtin.cc b/gcc/config/rs6000/rs6000-builtin.cc index 005f936..69f8cee 100644 --- a/gcc/config/rs6000/rs6000-builtin.cc +++ b/gcc/config/rs6000/rs6000-builtin.cc @@ -3485,6 +3485,18 @@ rs6000_expand_builtin (tree exp, rtx target, rtx /* subtarget */, icode = CODE_FOR_vsx_store_v8hi; else if (fcode == RS6000_BIF_ST_ELEMREV_V16QI) icode = CODE_FOR_vsx_store_v16qi; + else if (fcode == RS6000_BIF_VCLZLSBB_V16QI) + icode = CODE_FOR_vclzlsbb_v16qi; + else if (fcode == RS6000_BIF_VCLZLSBB_V4SI) + icode = CODE_FOR_vclzlsbb_v4si; + else if (fcode == RS6000_BIF_VCLZLSBB_V8HI) + icode = CODE_FOR_vclzlsbb_v8hi; + else if (fcode == RS6000_BIF_VCTZLSBB_V16QI) + icode = CODE_FOR_vctzlsbb_v16qi; + else if (fcode == RS6000_BIF_VCTZLSBB_V4SI) + icode = CODE_FOR_vctzlsbb_v4si; + else if (fcode == RS6000_BIF_VCTZLSBB_V8HI) + icode = CODE_FOR_vctzlsbb_v8hi; else gcc_unreachable (); } diff --git a/gcc/config/rs6000/rs6000-builtins.def b/gcc/config/rs6000/rs6000-builtins.def index a8ebb4a..7f527b6 100644 --- a/gcc/config/rs6000/rs6000-builtins.def +++ b/gcc/config/rs6000/rs6000-builtins.def @@ -2550,13 +2550,13 @@ VBPERMD altivec_vbpermd {} const signed int __builtin_altivec_vclzlsbb_v16qi (vsc); - VCLZLSBB_V16QI vclzlsbb_v16qi {} + VCLZLSBB_V16QI vctzlsbb_v16qi {endian} const signed int __builtin_altivec_vclzlsbb_v4si (vsi); - VCLZLSBB_V4SI vclzlsbb_v4si {} + VCLZLSBB_V4SI vctzlsbb_v4si {endian} const signed int __builtin_altivec_vclzlsbb_v8hi (vss); - VCLZLSBB_V8HI vclzlsbb_v8hi {} + VCLZLSBB_V8HI vctzlsbb_v8hi {endian} const vsc __builtin_altivec_vctzb (vsc); VCTZB ctzv16qi2 {} @@ -2571,13 +2571,13 @@ VCTZW ctzv4si2 {} const signed int __builtin_altivec_vctzlsbb_v16qi (vsc); - VCTZLSBB_V16QI vctzlsbb_v16qi {} + VCTZLSBB_V16QI vclzlsbb_v16qi {endian} const signed int __builtin_altivec_vctzlsbb_v4si (vsi); - VCTZLSBB_V4SI vctzlsbb_v4si {} + VCTZLSBB_V4SI vclzlsbb_v4si {endian} const signed int __builtin_altivec_vctzlsbb_v8hi (vss); - VCTZLSBB_V8HI vctzlsbb_v8hi {} + VCTZLSBB_V8HI vclzlsbb_v8hi {endian} const signed int __builtin_altivec_vcmpaeb_p (vsc, vsc); VCMPAEB_P vector_ae_v16qi_p {} -- cgit v1.1 From 48bd780ee327c9ae6ffc0641e73cc1f4939fb204 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Wed, 2 Feb 2022 21:30:27 -0600 Subject: rs6000: Remove -m[no-]fold-gimple flag [PR103686] The -m[no-]fold-gimple flag was really intended primarily for internal testing while implementing GIMPLE folding for rs6000 vector built-in functions. It ended up leaking into other places, causing problems such as PR103686 identifies. Let's remove it. There are a number of tests in the testsuite that require adjustment. Some specify -mfold-gimple directly, which is the default, so that is handled by removing the option. Others unnecessarily specify -mno-fold-gimple, as the tests work fine without this. Again that is handled by removing the option. There are a couple of extra variants of tests specifically for -mno-fold-gimple; for those, we can just remove the whole test. gcc.target/powerpc/builtins-1.c was more problematic. It was written in such a way as to be extremely fragile. For this one, I rewrote the whole test in a different style, using individual functions to test each built-in function. 
These same tests are also largely covered by builtins-1-be-folded.c and builtins-1-le-folded.c, so I chose to explicitly make this test -mbig for simplicity, and use -O2 for clean code generation. I made some slight modifications to the expected instruction counts as a result, and tested on both 32- and 64-bit. 2022-02-02 Bill Schmidt gcc/ PR target/103686 * config/rs6000/rs6000-builtin.cc (rs6000_gimple_fold_builtin): Remove test for !rs6000_fold_gimple. * config/rs6000/rs6000.cc (rs6000_option_override_internal): Likewise. * config/rs6000/rs6000.opt (mfold-gimple): Remove. gcc/testsuite/ PR target/103686 * gcc.target/powerpc/builtins-1-be-folded.c: Remove -mfold-gimple option. * gcc.target/powerpc/builtins-1-le-folded.c: Likewise. * gcc.target/powerpc/builtins-1.c: Rewrite to use small functions and restrict to -O2 -mbig for predictability. Adjust instruction counts. * gcc.target/powerpc/builtins-5.c: Remove -mno-fold-gimple option. * gcc.target/powerpc/p8-vec-xl-xst.c: Likewise. * gcc.target/powerpc/pr83926.c: Likewise. * gcc.target/powerpc/pr86731-nogimplefold-longlong.c: Delete. * gcc.target/powerpc/pr86731-nogimplefold.c: Delete. * gcc.target/powerpc/swaps-p8-17.c: Remove -mno-fold-gimple option. --- gcc/config/rs6000/rs6000-builtin.cc | 3 --- gcc/config/rs6000/rs6000.cc | 4 ---- gcc/config/rs6000/rs6000.opt | 4 ---- 3 files changed, 11 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-builtin.cc b/gcc/config/rs6000/rs6000-builtin.cc index 69f8cee..5d34c1b 100644 --- a/gcc/config/rs6000/rs6000-builtin.cc +++ b/gcc/config/rs6000/rs6000-builtin.cc @@ -1299,9 +1299,6 @@ rs6000_gimple_fold_builtin (gimple_stmt_iterator *gsi) fprintf (stderr, "rs6000_gimple_fold_builtin %d %s %s\n", fn_code, fn_name1, fn_name2); - if (!rs6000_fold_gimple) - return false; - /* Prevent gimple folding for code that does not have a LHS, unless it is allowed per the rs6000_builtin_valid_without_lhs helper function. */ if (!gimple_call_lhs (stmt) diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index ac6dd19..b6f2309 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -3833,10 +3833,6 @@ rs6000_option_override_internal (bool global_init_p) & OPTION_MASK_DIRECT_MOVE)) rs6000_isa_flags |= ~rs6000_isa_flags_explicit & OPTION_MASK_STRICT_ALIGN; - if (!rs6000_fold_gimple) - fprintf (stderr, - "gimple folding of rs6000 builtins has been disabled.\n"); - /* Add some warnings for VSX. */ if (TARGET_VSX) { diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt index c2a7718..68c0cae 100644 --- a/gcc/config/rs6000/rs6000.opt +++ b/gcc/config/rs6000/rs6000.opt @@ -155,10 +155,6 @@ maltivec Target Mask(ALTIVEC) Var(rs6000_isa_flags) Use AltiVec instructions. -mfold-gimple -Target Var(rs6000_fold_gimple) Init(1) -Enable early gimple folding of builtins. - mhard-dfp Target Mask(DFP) Var(rs6000_isa_flags) Use decimal floating point instructions. -- cgit v1.1 From 599122fa690d55e5e14d74f4d514b2d8b6a98505 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 3 Feb 2022 22:24:21 +0100 Subject: i386: Do not use %ecx DRAP for functions that use __builtin_eh_return [PR104362] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit %ecx can't be used for both DRAP register and eh_return. Adjust find_drap_reg to choose %edi for functions that uses __builtin_eh_return to avoid the assert in ix86_expand_epilogue that enforces this rule. 
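To illustrate the combination being fixed, here is a hypothetical sketch (not the new gcc.target/i386/pr104362.c test, whose exact contents differ): a 32-bit function that uses __builtin_eh_return while also needing a dynamically realigned frame, so a DRAP register must be chosen and %ecx is unavailable for it. Whether realignment (and hence DRAP) is actually forced depends on options such as -m32 -mstackrealign:

  /* Hypothetical reproducer shape; the over-aligned local is only there
     to make dynamic stack realignment, and therefore a DRAP register,
     likely.  */
  void
  eh_return_with_realign (long offset, void *handler)
  {
    int big_align __attribute__ ((aligned (32))) = 0;
    __asm__ volatile ("" : : "r" (&big_align) : "memory");
    __builtin_eh_return (offset, handler);
  }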
2022-02-03 Uroš Bizjak gcc/ChangeLog: PR target/104362 * config/i386/i386.cc (find_drap_reg): For 32bit targets return DI_REG if function uses __builtin_eh_return. gcc/testsuite/ChangeLog: PR target/104362 * gcc.target/i386/pr104362.c: New test. --- gcc/config/i386/i386.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index ad5a5ca..dd5584f 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -7400,7 +7400,8 @@ find_drap_reg (void) register in such case. */ if (DECL_STATIC_CHAIN (decl) || cfun->machine->no_caller_saved_registers - || crtl->tail_call_emit) + || crtl->tail_call_emit + || crtl->calls_eh_return) return DI_REG; /* Reuse static chain register if it isn't used for parameter -- cgit v1.1 From 8d6fffc4bcd4afa0beb0efad4f3b95394aa15618 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Fri, 4 Feb 2022 18:30:59 +0100 Subject: rs6000: Fix up -D_FORTIFY_SOURCE* with -mabi=ieeelongdouble [PR104380] The following testcase FAILs when configured with --with-long-double-format=ieee . Only happens in the -std=c* modes, not the GNU modes; while the glibc headers have __asm redirects of vsnprintf and __vsnprinf_chk to __vsnprintfieee128 and __vsnprintf_chkieee128, the vsnprintf fortification extern inline gnu_inline always_inline wrapper calls __builtin_vsnprintf_chk and we actually emit a call to __vsnprinf_chk (i.e. with IBM extended long double) instead of __vsnprintf_chkieee128. rs6000_mangle_decl_assembler_name already had cases for *printf and *scanf, so this just adds another case for *printf_chk. *scanf_chk doesn't exist. __ prefixing isn't done because *printf_chk already starts with __. 2022-02-04 Jakub Jelinek PR target/104380 * config/rs6000/rs6000.cc (rs6000_mangle_decl_assembler_name): Also adjust mangling of __builtin*printf_chk. * gcc.dg/pr104380.c: New test. --- gcc/config/rs6000/rs6000.cc | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index b6f2309..d9fc67d 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -28069,6 +28069,7 @@ rs6000_mangle_decl_assembler_name (tree decl, tree id) { size_t printf_len = strlen ("printf"); size_t scanf_len = strlen ("scanf"); + size_t printf_chk_len = strlen ("printf_chk"); if (len >= printf_len && strcmp (name + len - printf_len, "printf") == 0) @@ -28078,6 +28079,10 @@ rs6000_mangle_decl_assembler_name (tree decl, tree id) && strcmp (name + len - scanf_len, "scanf") == 0) newname = xasprintf ("__isoc99_%sieee128", name); + else if (len >= printf_chk_len + && strcmp (name + len - printf_chk_len, "printf_chk") == 0) + newname = xasprintf ("%sieee128", name); + else if (name[len - 1] == 'l') { bool uses_ieee128_p = false; -- cgit v1.1 From b28b92bc008776c8b517841f99ba6a31bf7751d2 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Wed, 2 Feb 2022 20:55:36 -0600 Subject: rs6000: More factoring of overload processing This patch continues the refactoring started with r12-6014. I had previously noted that the resolve_vec* routines can be further simplified by processing the argument list earlier, so that all routines can use the arrays of arguments and types. I found that this was useful for some of the routines, but not for all of them. For several of the special-cased overloads, we don't specify all of the possible type combinations in rs6000-overload.def, because the types don't matter for the expansion we do. 
For these, we can't use generic error message handling when the number of arguments is incorrect, because the result is misleading error messages that indicate argument types are wrong. So this patch goes halfway and improves the factoring on the remaining special cases, but leaves vec_splats, vec_promote, vec_extract, vec_insert, and vec_step alone. 2022-02-02 Bill Schmidt gcc/ * config/rs6000/rs6000-c.cc (resolve_vec_mul): Accept args and types parameters instead of arglist and nargs. Simplify accordingly. Remove unnecessary test for argument count mismatch. (resolve_vec_cmpne): Likewise. (resolve_vec_adde_sube): Likewise. (resolve_vec_addec_subec): Likewise. (altivec_resolve_overloaded_builtin): Move overload special handling after the gathering of arguments into args[] and types[] and the test for correct number of arguments. Don't perform the test for correct number of arguments for certain special cases. Call the other special cases with args and types instead of arglist and nargs. --- gcc/config/rs6000/rs6000-c.cc | 304 ++++++++++++++++++------------------------ 1 file changed, 127 insertions(+), 177 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-c.cc b/gcc/config/rs6000/rs6000-c.cc index 145421a..15251ef 100644 --- a/gcc/config/rs6000/rs6000-c.cc +++ b/gcc/config/rs6000/rs6000-c.cc @@ -939,37 +939,25 @@ altivec_build_resolved_builtin (tree *args, int n, tree fntype, tree ret_type, enum resolution { unresolved, resolved, resolved_bad }; /* Resolve an overloaded vec_mul call and return a tree expression for the - resolved call if successful. NARGS is the number of arguments to the call. - ARGLIST contains the arguments. RES must be set to indicate the status of + resolved call if successful. ARGS contains the arguments to the call. + TYPES contains their types. RES must be set to indicate the status of the resolution attempt. LOC contains statement location information. */ static tree -resolve_vec_mul (resolution *res, vec *arglist, unsigned nargs, - location_t loc) +resolve_vec_mul (resolution *res, tree *args, tree *types, location_t loc) { /* vec_mul needs to be special cased because there are no instructions for it for the {un}signed char, {un}signed short, and {un}signed int types. */ - if (nargs != 2) - { - error ("builtin %qs only accepts 2 arguments", "vec_mul"); - *res = resolved; - return error_mark_node; - } - - tree arg0 = (*arglist)[0]; - tree arg0_type = TREE_TYPE (arg0); - tree arg1 = (*arglist)[1]; - tree arg1_type = TREE_TYPE (arg1); /* Both arguments must be vectors and the types must be compatible. */ - if (TREE_CODE (arg0_type) != VECTOR_TYPE - || !lang_hooks.types_compatible_p (arg0_type, arg1_type)) + if (TREE_CODE (types[0]) != VECTOR_TYPE + || !lang_hooks.types_compatible_p (types[0], types[1])) { *res = resolved_bad; return error_mark_node; } - switch (TYPE_MODE (TREE_TYPE (arg0_type))) + switch (TYPE_MODE (TREE_TYPE (types[0]))) { case E_QImode: case E_HImode: @@ -978,21 +966,21 @@ resolve_vec_mul (resolution *res, vec *arglist, unsigned nargs, case E_TImode: /* For scalar types just use a multiply expression. */ *res = resolved; - return fold_build2_loc (loc, MULT_EXPR, TREE_TYPE (arg0), arg0, - fold_convert (TREE_TYPE (arg0), arg1)); + return fold_build2_loc (loc, MULT_EXPR, types[0], args[0], + fold_convert (types[0], args[1])); case E_SFmode: { /* For floats use the xvmulsp instruction directly. 
*/ *res = resolved; tree call = rs6000_builtin_decls[RS6000_BIF_XVMULSP]; - return build_call_expr (call, 2, arg0, arg1); + return build_call_expr (call, 2, args[0], args[1]); } case E_DFmode: { /* For doubles use the xvmuldp instruction directly. */ *res = resolved; tree call = rs6000_builtin_decls[RS6000_BIF_XVMULDP]; - return build_call_expr (call, 2, arg0, arg1); + return build_call_expr (call, 2, args[0], args[1]); } /* Other types are errors. */ default: @@ -1002,37 +990,25 @@ resolve_vec_mul (resolution *res, vec *arglist, unsigned nargs, } /* Resolve an overloaded vec_cmpne call and return a tree expression for the - resolved call if successful. NARGS is the number of arguments to the call. - ARGLIST contains the arguments. RES must be set to indicate the status of + resolved call if successful. ARGS contains the arguments to the call. + TYPES contains their types. RES must be set to indicate the status of the resolution attempt. LOC contains statement location information. */ static tree -resolve_vec_cmpne (resolution *res, vec *arglist, unsigned nargs, - location_t loc) +resolve_vec_cmpne (resolution *res, tree *args, tree *types, location_t loc) { /* vec_cmpne needs to be special cased because there are no instructions for it (prior to power 9). */ - if (nargs != 2) - { - error ("builtin %qs only accepts 2 arguments", "vec_cmpne"); - *res = resolved; - return error_mark_node; - } - - tree arg0 = (*arglist)[0]; - tree arg0_type = TREE_TYPE (arg0); - tree arg1 = (*arglist)[1]; - tree arg1_type = TREE_TYPE (arg1); /* Both arguments must be vectors and the types must be compatible. */ - if (TREE_CODE (arg0_type) != VECTOR_TYPE - || !lang_hooks.types_compatible_p (arg0_type, arg1_type)) + if (TREE_CODE (types[0]) != VECTOR_TYPE + || !lang_hooks.types_compatible_p (types[0], types[1])) { *res = resolved_bad; return error_mark_node; } - machine_mode arg0_elt_mode = TYPE_MODE (TREE_TYPE (arg0_type)); + machine_mode arg0_elt_mode = TYPE_MODE (TREE_TYPE (types[0])); /* Power9 instructions provide the most efficient implementation of ALTIVEC_BUILTIN_VEC_CMPNE if the mode is not DImode or TImode @@ -1060,8 +1036,8 @@ resolve_vec_cmpne (resolution *res, vec *arglist, unsigned nargs, /* call = vec_cmpeq (va, vb) result = vec_nor (call, call). */ vec *params = make_tree_vector (); - vec_safe_push (params, arg0); - vec_safe_push (params, arg1); + vec_safe_push (params, args[0]); + vec_safe_push (params, args[1]); tree decl = rs6000_builtin_decls[RS6000_OVLD_VEC_CMPEQ]; tree call = altivec_resolve_overloaded_builtin (loc, decl, params); /* Use save_expr to ensure that operands used more than once @@ -1088,46 +1064,30 @@ resolve_vec_cmpne (resolution *res, vec *arglist, unsigned nargs, return error_mark_node; } -/* Resolve an overloaded vec_adde or vec_sube call and return a tree - expression for the resolved call if successful. NARGS is the number of - arguments to the call. ARGLIST contains the arguments. RES must be set - to indicate the status of the resolution attempt. LOC contains statement - location information. */ +/* Resolve an overloaded vec_adde or vec_sube call and return a tree expression + for the resolved call if successful. ARGS contains the arguments to the + call. TYPES contains their arguments. RES must be set to indicate the + status of the resolution attempt. LOC contains statement location + information. 
*/ static tree resolve_vec_adde_sube (resolution *res, rs6000_gen_builtins fcode, - vec *arglist, unsigned nargs, - location_t loc) + tree *args, tree *types, location_t loc) { /* vec_adde needs to be special cased because there is no instruction for the {un}signed int version. */ - if (nargs != 3) - { - const char *name; - name = fcode == RS6000_OVLD_VEC_ADDE ? "vec_adde" : "vec_sube"; - error ("builtin %qs only accepts 3 arguments", name); - *res = resolved; - return error_mark_node; - } - - tree arg0 = (*arglist)[0]; - tree arg0_type = TREE_TYPE (arg0); - tree arg1 = (*arglist)[1]; - tree arg1_type = TREE_TYPE (arg1); - tree arg2 = (*arglist)[2]; - tree arg2_type = TREE_TYPE (arg2); /* All 3 arguments must be vectors of (signed or unsigned) (int or __int128) and the types must be compatible. */ - if (TREE_CODE (arg0_type) != VECTOR_TYPE - || !lang_hooks.types_compatible_p (arg0_type, arg1_type) - || !lang_hooks.types_compatible_p (arg1_type, arg2_type)) + if (TREE_CODE (types[0]) != VECTOR_TYPE + || !lang_hooks.types_compatible_p (types[0], types[1]) + || !lang_hooks.types_compatible_p (types[1], types[2])) { *res = resolved_bad; return error_mark_node; } - switch (TYPE_MODE (TREE_TYPE (arg0_type))) + switch (TYPE_MODE (TREE_TYPE (types[0]))) { /* For {un}signed ints, vec_adde (va, vb, carryv) == vec_add (vec_add (va, vb), @@ -1137,8 +1097,8 @@ resolve_vec_adde_sube (resolution *res, rs6000_gen_builtins fcode, case E_SImode: { vec *params = make_tree_vector (); - vec_safe_push (params, arg0); - vec_safe_push (params, arg1); + vec_safe_push (params, args[0]); + vec_safe_push (params, args[1]); tree add_sub_builtin; if (fcode == RS6000_OVLD_VEC_ADDE) @@ -1148,10 +1108,10 @@ resolve_vec_adde_sube (resolution *res, rs6000_gen_builtins fcode, tree call = altivec_resolve_overloaded_builtin (loc, add_sub_builtin, params); - tree const1 = build_int_cstu (TREE_TYPE (arg0_type), 1); - tree ones_vector = build_vector_from_val (arg0_type, const1); - tree and_expr = fold_build2_loc (loc, BIT_AND_EXPR, arg0_type, - arg2, ones_vector); + tree const1 = build_int_cstu (TREE_TYPE (types[0]), 1); + tree ones_vector = build_vector_from_val (types[0], const1); + tree and_expr = fold_build2_loc (loc, BIT_AND_EXPR, types[0], + args[2], ones_vector); params = make_tree_vector (); vec_safe_push (params, call); vec_safe_push (params, and_expr); @@ -1175,45 +1135,29 @@ resolve_vec_adde_sube (resolution *res, rs6000_gen_builtins fcode, } /* Resolve an overloaded vec_addec or vec_subec call and return a tree - expression for the resolved call if successful. NARGS is the number of - arguments to the call. ARGLIST contains the arguments. RES must be set - to indicate the status of the resolution attempt. LOC contains statement - location information. */ + expression for the resolved call if successful. ARGS contains the arguments + to the call. TYPES contains their types. RES must be set to indicate the + status of the resolution attempt. LOC contains statement location + information. */ static tree resolve_vec_addec_subec (resolution *res, rs6000_gen_builtins fcode, - vec *arglist, unsigned nargs, - location_t loc) + tree *args, tree *types, location_t loc) { /* vec_addec and vec_subec needs to be special cased because there is no instruction for the (un)signed int version. */ - if (nargs != 3) - { - const char *name; - name = fcode == RS6000_OVLD_VEC_ADDEC ? 
"vec_addec" : "vec_subec"; - error ("builtin %qs only accepts 3 arguments", name); - *res = resolved; - return error_mark_node; - } - - tree arg0 = (*arglist)[0]; - tree arg0_type = TREE_TYPE (arg0); - tree arg1 = (*arglist)[1]; - tree arg1_type = TREE_TYPE (arg1); - tree arg2 = (*arglist)[2]; - tree arg2_type = TREE_TYPE (arg2); /* All 3 arguments must be vectors of (signed or unsigned) (int or __int128) and the types must be compatible. */ - if (TREE_CODE (arg0_type) != VECTOR_TYPE - || !lang_hooks.types_compatible_p (arg0_type, arg1_type) - || !lang_hooks.types_compatible_p (arg1_type, arg2_type)) + if (TREE_CODE (types[0]) != VECTOR_TYPE + || !lang_hooks.types_compatible_p (types[0], types[1]) + || !lang_hooks.types_compatible_p (types[1], types[2])) { *res = resolved_bad; return error_mark_node; } - switch (TYPE_MODE (TREE_TYPE (arg0_type))) + switch (TYPE_MODE (TREE_TYPE (types[0]))) { /* For {un}signed ints, vec_addec (va, vb, carryv) == @@ -1224,11 +1168,11 @@ resolve_vec_addec_subec (resolution *res, rs6000_gen_builtins fcode, { /* Use save_expr to ensure that operands used more than once that may have side effects (like calls) are only evaluated once. */ - arg0 = save_expr (arg0); - arg1 = save_expr (arg1); + args[0] = save_expr (args[0]); + args[1] = save_expr (args[1]); vec *params = make_tree_vector (); - vec_safe_push (params, arg0); - vec_safe_push (params, arg1); + vec_safe_push (params, args[0]); + vec_safe_push (params, args[1]); tree as_c_builtin; if (fcode == RS6000_OVLD_VEC_ADDEC) @@ -1239,8 +1183,8 @@ resolve_vec_addec_subec (resolution *res, rs6000_gen_builtins fcode, tree call1 = altivec_resolve_overloaded_builtin (loc, as_c_builtin, params); params = make_tree_vector (); - vec_safe_push (params, arg0); - vec_safe_push (params, arg1); + vec_safe_push (params, args[0]); + vec_safe_push (params, args[1]); tree as_builtin; if (fcode == RS6000_OVLD_VEC_ADDEC) @@ -1250,10 +1194,10 @@ resolve_vec_addec_subec (resolution *res, rs6000_gen_builtins fcode, tree call2 = altivec_resolve_overloaded_builtin (loc, as_builtin, params); - tree const1 = build_int_cstu (TREE_TYPE (arg0_type), 1); - tree ones_vector = build_vector_from_val (arg0_type, const1); - tree and_expr = fold_build2_loc (loc, BIT_AND_EXPR, arg0_type, - arg2, ones_vector); + tree const1 = build_int_cstu (TREE_TYPE (types[0]), 1); + tree ones_vector = build_vector_from_val (types[0], const1); + tree and_expr = fold_build2_loc (loc, BIT_AND_EXPR, types[0], + args[2], ones_vector); params = make_tree_vector (); vec_safe_push (params, call2); vec_safe_push (params, and_expr); @@ -1783,78 +1727,22 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl, "% is deprecated for little endian; use " "assignment for unaligned loads and stores"); - /* Some overloads require special handling. */ - /* FIXME: Could we simplify the helper functions if we gathered arguments - and types into arrays first? 
*/ - tree returned_expr = NULL; - resolution res = unresolved; - vec *arglist = static_cast *> (passed_arglist); - unsigned int nargs = vec_safe_length (arglist); - - switch (fcode) - { - case RS6000_OVLD_VEC_MUL: - returned_expr = resolve_vec_mul (&res, arglist, nargs, loc); - break; - - case RS6000_OVLD_VEC_CMPNE: - returned_expr = resolve_vec_cmpne (&res, arglist, nargs, loc); - break; - - case RS6000_OVLD_VEC_ADDE: - case RS6000_OVLD_VEC_SUBE: - returned_expr = resolve_vec_adde_sube (&res, fcode, arglist, nargs, loc); - break; - - case RS6000_OVLD_VEC_ADDEC: - case RS6000_OVLD_VEC_SUBEC: - returned_expr = resolve_vec_addec_subec (&res, fcode, arglist, nargs, - loc); - break; - - case RS6000_OVLD_VEC_SPLATS: - case RS6000_OVLD_VEC_PROMOTE: - returned_expr = resolve_vec_splats (&res, fcode, arglist, nargs); - break; - - case RS6000_OVLD_VEC_EXTRACT: - returned_expr = resolve_vec_extract (&res, arglist, nargs, loc); - break; - - case RS6000_OVLD_VEC_INSERT: - returned_expr = resolve_vec_insert (&res, arglist, nargs, loc); - break; - - case RS6000_OVLD_VEC_STEP: - returned_expr = resolve_vec_step (&res, arglist, nargs); - break; - - default: - ; - } - - if (res == resolved) - return returned_expr; - - /* "Regular" built-in functions and overloaded functions share a namespace - for some arrays, like rs6000_builtin_decls. But rs6000_overload_info - only has information for the overloaded functions, so we need an - adjusted index for that. */ - unsigned int adj_fcode = fcode - RS6000_OVLD_NONE; - - if (res == resolved_bad) - { - const char *name = rs6000_overload_info[adj_fcode].ovld_name; - error ("invalid parameter combination for AltiVec intrinsic %qs", name); - return error_mark_node; - } - /* Gather the arguments and their types into arrays for easier handling. */ tree fnargs = TYPE_ARG_TYPES (TREE_TYPE (fndecl)); tree types[MAX_OVLD_ARGS]; tree args[MAX_OVLD_ARGS]; unsigned int n; + /* Count the number of expected arguments. */ + unsigned expected_args = 0; + for (tree chain = fnargs; + chain && !VOID_TYPE_P (TREE_VALUE (chain)); + chain = TREE_CHAIN (chain)) + expected_args++; + + vec *arglist = static_cast *> (passed_arglist); + unsigned int nargs = vec_safe_length (arglist); + for (n = 0; !VOID_TYPE_P (TREE_VALUE (fnargs)) && n < nargs; fnargs = TREE_CHAIN (fnargs), n++) @@ -1915,10 +1803,72 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl, } /* If the number of arguments did not match the prototype, return NULL - and the generic code will issue the appropriate error message. */ - if (!VOID_TYPE_P (TREE_VALUE (fnargs)) || n < nargs) + and the generic code will issue the appropriate error message. Skip + this test for functions where we don't fully describe all the possible + overload signatures in rs6000-overload.def (because they aren't relevant + to the expansion here). If we don't, we get confusing error messages. */ + /* As an example, for vec_splats we have: + +; There are no actual builtins for vec_splats. There is special handling for +; this in altivec_resolve_overloaded_builtin in rs6000-c.cc, where the call +; is replaced by a constructor. The single overload here causes +; __builtin_vec_splats to be registered with the front end so that can happen. +[VEC_SPLATS, vec_splats, __builtin_vec_splats] + vsi __builtin_vec_splats (vsi); + ABS_V4SI SPLATS_FAKERY + + So even though __builtin_vec_splats accepts all vector types, the + infrastructure cheats and just records one prototype. 
We end up getting + an error message that refers to this specific prototype even when we + are handling a different argument type. That is completely confusing + to the user, so it's best to let these cases be handled individually + in the resolve_vec_splats, etc., helper functions. */ + + if (n != expected_args + && !(fcode == RS6000_OVLD_VEC_PROMOTE + || fcode == RS6000_OVLD_VEC_SPLATS + || fcode == RS6000_OVLD_VEC_EXTRACT + || fcode == RS6000_OVLD_VEC_INSERT + || fcode == RS6000_OVLD_VEC_STEP)) return NULL; + /* Some overloads require special handling. */ + tree returned_expr = NULL; + resolution res = unresolved; + + if (fcode == RS6000_OVLD_VEC_MUL) + returned_expr = resolve_vec_mul (&res, args, types, loc); + else if (fcode == RS6000_OVLD_VEC_CMPNE) + returned_expr = resolve_vec_cmpne (&res, args, types, loc); + else if (fcode == RS6000_OVLD_VEC_ADDE || fcode == RS6000_OVLD_VEC_SUBE) + returned_expr = resolve_vec_adde_sube (&res, fcode, args, types, loc); + else if (fcode == RS6000_OVLD_VEC_ADDEC || fcode == RS6000_OVLD_VEC_SUBEC) + returned_expr = resolve_vec_addec_subec (&res, fcode, args, types, loc); + else if (fcode == RS6000_OVLD_VEC_SPLATS || fcode == RS6000_OVLD_VEC_PROMOTE) + returned_expr = resolve_vec_splats (&res, fcode, arglist, nargs); + else if (fcode == RS6000_OVLD_VEC_EXTRACT) + returned_expr = resolve_vec_extract (&res, arglist, nargs, loc); + else if (fcode == RS6000_OVLD_VEC_INSERT) + returned_expr = resolve_vec_insert (&res, arglist, nargs, loc); + else if (fcode == RS6000_OVLD_VEC_STEP) + returned_expr = resolve_vec_step (&res, arglist, nargs); + + if (res == resolved) + return returned_expr; + + /* "Regular" built-in functions and overloaded functions share a namespace + for some arrays, like rs6000_builtin_decls. But rs6000_overload_info + only has information for the overloaded functions, so we need an + adjusted index for that. */ + unsigned int adj_fcode = fcode - RS6000_OVLD_NONE; + + if (res == resolved_bad) + { + const char *name = rs6000_overload_info[adj_fcode].ovld_name; + error ("invalid parameter combination for AltiVec intrinsic %qs", name); + return error_mark_node; + } + bool unsupported_builtin = false; rs6000_gen_builtins instance_code; bool supported = false; -- cgit v1.1 From 06e32a5ebf20c11dd31bc2677bede569fef84316 Mon Sep 17 00:00:00 2001 From: Kito Cheng Date: Tue, 25 Jan 2022 20:44:04 +0800 Subject: RISC-V: Always pass -misa-spec to assembler [PR104219] Add -misa-spec to OPTION_DEFAULT_SPECS to make sure -misa-spec will always pass that into assembler, that prevent GCC and binutils using different way to interpret the ISA string. gcc/ChangeLog: PR target/104219 * config.gcc (riscv*-*-*): Normalize the with_isa_spec value. (all_defaults): Add isa_spec. * config/riscv/riscv.h (OPTION_DEFAULT_SPECS): Add isa_spec. --- gcc/config/riscv/riscv.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/riscv/riscv.h b/gcc/config/riscv/riscv.h index 6956684..8a4d2cf 100644 --- a/gcc/config/riscv/riscv.h +++ b/gcc/config/riscv/riscv.h @@ -60,6 +60,7 @@ extern const char *riscv_default_mtune (int argc, const char **argv); --with-arch is ignored if -march or -mcpu is specified. --with-abi is ignored if -mabi is specified. --with-tune is ignored if -mtune or -mcpu is specified. + --with-isa-spec is ignored if -misa-spec is specified. But using default -march/-mtune value if -mcpu don't have valid option. 
*/ #define OPTION_DEFAULT_SPECS \ @@ -70,6 +71,7 @@ extern const char *riscv_default_mtune (int argc, const char **argv); " %{!mcpu=*:-march=%(VALUE)}" \ " %{mcpu=*:%:riscv_expand_arch_from_cpu(%* %(VALUE))}}" }, \ {"abi", "%{!mabi=*:-mabi=%(VALUE)}" }, \ + {"isa_spec", "%{!misa-spec=*:-misa-spec=%(VALUE)}" }, \ #ifdef IN_LIBGCC2 #undef TARGET_64BIT -- cgit v1.1 From 8103623923ac4ea19b97a369979d4bd5731aab57 Mon Sep 17 00:00:00 2001 From: Kewen Lin Date: Sun, 6 Feb 2022 21:29:32 -0600 Subject: rs6000: Disable MMA if no VSX support [PR103627] As PR103627 shows, there is an unexpected case where !TARGET_VSX and TARGET_MMA co-exist. As ISA3.1 claims, SIMD is a requirement for MMA. By looking into the ICE, I noticed that the current MMA implementation depends on vector pairs load/store which use VSX register, but we don't have a separated option to control Power10 vector support and Segher pointed out "-mpower9-vector is a workaround that should go away" and more explanations in [1]. So this patch makes MMA require VSX instead. [1] https://gcc.gnu.org/pipermail/gcc-patches/2022-January/589303.html gcc/ChangeLog: PR target/103627 * config/rs6000/rs6000.cc (rs6000_option_override_internal): Disable MMA if !TARGET_VSX. gcc/testsuite/ChangeLog: PR target/103627 * gcc.target/powerpc/pr103627-1.c: New test. * gcc.target/powerpc/pr103627-2.c: New test. --- gcc/config/rs6000/rs6000.cc | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index d9fc67d..a2843d1 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -4482,6 +4482,16 @@ rs6000_option_override_internal (bool global_init_p) rs6000_isa_flags &= ~OPTION_MASK_MMA; } + /* MMA requires SIMD support as ISA 3.1 claims and our implementation + such as "*movoo" uses vector pair access which use VSX registers. + So make MMA require VSX support here. */ + if (TARGET_MMA && !TARGET_VSX) + { + if ((rs6000_isa_flags_explicit & OPTION_MASK_MMA) != 0) + error ("%qs requires %qs", "-mmma", "-mvsx"); + rs6000_isa_flags &= ~OPTION_MASK_MMA; + } + if (!TARGET_PCREL && TARGET_PCREL_OPT) rs6000_isa_flags &= ~OPTION_MASK_PCREL_OPT; -- cgit v1.1 From e66ba0f55c000152df63fc67c11a64f79122ef86 Mon Sep 17 00:00:00 2001 From: Kewen Lin Date: Sun, 6 Feb 2022 21:30:02 -0600 Subject: rs6000: Move the hunk affecting VSX/ALTIVEC ahead [PR103627] The modified hunk can update VSX and ALTIVEC flag, we have some codes to check/warn for some flags related to VSX and ALTIVEC sitting where the hunk is proprosed to be moved to. Without this adjustment, the VSX and ALTIVEC update is too late, it can cause the incompatibility and result in unexpected behaviors, the associated test case is one typical case. Since we already have the code which sets TARGET_FLOAT128_TYPE and lays after the moved place, and OPTION_MASK_FLOAT128_KEYWORD will rely on TARGET_FLOAT128_TYPE, so it just simply remove them. gcc/ChangeLog: PR target/103627 * config/rs6000/rs6000.cc (rs6000_option_override_internal): Move the hunk affecting VSX and ALTIVEC to appropriate place. gcc/testsuite/ChangeLog: PR target/103627 * gcc.target/powerpc/pr103627-3.c: New test. 
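As a rough illustration of the new check added above, a compile-only sketch along these lines should now be rejected cleanly instead of ICEing on the "*movoo" vector-pair pattern. This is only a sketch with assumed options, not the contents of the new pr103627-*.c tests (which are not reproduced in this log).

/* Sketch, assuming -mdejagnu-cpu=power10 so that -mmma is otherwise valid;
   the explicit -mmma together with -mno-vsx should trigger the new error.  */
/* { dg-do compile } */
/* { dg-options "-mdejagnu-cpu=power10 -mno-vsx -mmma" } */

void
foo (void)
{
}

/* Expected diagnostic from rs6000_option_override_internal:
   error: '-mmma' requires '-mvsx'  */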
--- gcc/config/rs6000/rs6000.cc | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index a2843d1..e571a0b 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -3934,6 +3934,15 @@ rs6000_option_override_internal (bool global_init_p) else if (TARGET_ALTIVEC) rs6000_isa_flags |= (OPTION_MASK_PPC_GFXOPT & ~ignore_masks); + /* Disable VSX and Altivec silently if the user switched cpus to power7 in a + target attribute or pragma which automatically enables both options, + unless the altivec ABI was set. This is set by default for 64-bit, but + not for 32-bit. Don't move this before the above code using ignore_masks, + since it can reset the cleared VSX/ALTIVEC flag again. */ + if (main_target_opt && !main_target_opt->x_rs6000_altivec_abi) + rs6000_isa_flags &= ~((OPTION_MASK_VSX | OPTION_MASK_ALTIVEC) + & ~rs6000_isa_flags_explicit); + if (TARGET_CRYPTO && !TARGET_ALTIVEC) { if (rs6000_isa_flags_explicit & OPTION_MASK_CRYPTO) @@ -4350,18 +4359,6 @@ rs6000_option_override_internal (bool global_init_p) } } - /* Disable VSX and Altivec silently if the user switched cpus to power7 in a - target attribute or pragma which automatically enables both options, - unless the altivec ABI was set. This is set by default for 64-bit, but - not for 32-bit. */ - if (main_target_opt != NULL && !main_target_opt->x_rs6000_altivec_abi) - { - TARGET_FLOAT128_TYPE = 0; - rs6000_isa_flags &= ~((OPTION_MASK_VSX | OPTION_MASK_ALTIVEC - | OPTION_MASK_FLOAT128_KEYWORD) - & ~rs6000_isa_flags_explicit); - } - /* Enable Altivec ABI for AIX -maltivec. */ if (TARGET_XCOFF && (TARGET_ALTIVEC || TARGET_VSX) -- cgit v1.1 From db95441cf5399aabc46ca83df19f7290c3e23cb1 Mon Sep 17 00:00:00 2001 From: Andreas Krebbel Date: Sun, 6 Feb 2022 09:07:41 +0100 Subject: Check always_inline flag in s390_can_inline_p [PR104327] MASK_MVCLE is set for -Os but not for other optimization levels. In general it should not make much sense to inline across calls where the flag is different but we have to allow it for always_inline. The patch also rearranges the hook implementation a bit based on the recommendations from Jakub und Martin in the PR. Bootstrapped and regression tested on s390x with various arch flags. Will commit after giving a few days for comments. gcc/ChangeLog: PR target/104327 * config/s390/s390.cc (s390_can_inline_p): Accept a few more flags if always_inline is set. Don't inline when tune differs without always_inline. gcc/testsuite/ChangeLog: PR target/104327 * gcc.c-torture/compile/pr104327.c: New test. --- gcc/config/s390/s390.cc | 64 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 47 insertions(+), 17 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc index 5c2a830..c6cfe41 100644 --- a/gcc/config/s390/s390.cc +++ b/gcc/config/s390/s390.cc @@ -16091,6 +16091,23 @@ s390_valid_target_attribute_p (tree fndecl, static bool s390_can_inline_p (tree caller, tree callee) { + /* Flags which if present in the callee are required in the caller as well. */ + const unsigned HOST_WIDE_INT caller_required_masks = MASK_OPT_HTM; + + /* Flags which affect the ABI and in general prevent inlining. */ + unsigned HOST_WIDE_INT must_match_masks + = (MASK_64BIT | MASK_ZARCH | MASK_HARD_DFP | MASK_SOFT_FLOAT + | MASK_LONG_DOUBLE_128 | MASK_OPT_VX); + + /* Flags which we in general want to prevent inlining but accept for + always_inline. 
*/ + const unsigned HOST_WIDE_INT always_inline_safe_masks + = MASK_MVCLE | MASK_BACKCHAIN | MASK_SMALL_EXEC; + + const HOST_WIDE_INT all_masks + = (caller_required_masks | must_match_masks | always_inline_safe_masks + | MASK_DEBUG_ARG | MASK_PACKED_STACK | MASK_ZVECTOR); + tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller); tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee); @@ -16103,16 +16120,18 @@ s390_can_inline_p (tree caller, tree callee) struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree); struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree); - bool ret = true; - if ((caller_opts->x_target_flags & ~(MASK_SOFT_FLOAT | MASK_HARD_DFP)) - != (callee_opts->x_target_flags & ~(MASK_SOFT_FLOAT | MASK_HARD_DFP))) - ret = false; + /* If one of these triggers make sure to add proper handling of your + new flag to this hook. */ + gcc_assert (!(caller_opts->x_target_flags & ~all_masks)); + gcc_assert (!(callee_opts->x_target_flags & ~all_masks)); - /* Don't inline functions to be compiled for a more recent arch into a - function for an older arch. */ - else if (caller_opts->x_s390_arch < callee_opts->x_s390_arch) - ret = false; + bool always_inline + = (DECL_DISREGARD_INLINE_LIMITS (callee) + && lookup_attribute ("always_inline", DECL_ATTRIBUTES (callee))); + + if (!always_inline) + must_match_masks |= always_inline_safe_masks; /* Inlining a hard float function into a soft float function is only allowed if the hard float function doesn't actually make use of @@ -16120,16 +16139,27 @@ s390_can_inline_p (tree caller, tree callee) We are called from FEs for multi-versioning call optimization, so beware of ipa_fn_summaries not available. */ - else if (((TARGET_SOFT_FLOAT_P (caller_opts->x_target_flags) - && !TARGET_SOFT_FLOAT_P (callee_opts->x_target_flags)) - || (!TARGET_HARD_DFP_P (caller_opts->x_target_flags) - && TARGET_HARD_DFP_P (callee_opts->x_target_flags))) - && (! ipa_fn_summaries - || ipa_fn_summaries->get - (cgraph_node::get (callee))->fp_expressions)) - ret = false; + if (always_inline && ipa_fn_summaries + && !ipa_fn_summaries->get(cgraph_node::get (callee))->fp_expressions) + must_match_masks &= ~(MASK_HARD_DFP | MASK_SOFT_FLOAT); - return ret; + if ((caller_opts->x_target_flags & must_match_masks) + != (callee_opts->x_target_flags & must_match_masks)) + return false; + + if (~(caller_opts->x_target_flags & caller_required_masks) + & (callee_opts->x_target_flags & caller_required_masks)) + return false; + + /* Don't inline functions to be compiled for a more recent arch into a + function for an older arch. */ + if (caller_opts->x_s390_arch < callee_opts->x_s390_arch) + return false; + + if (!always_inline && caller_opts->x_s390_tune != callee_opts->x_s390_tune) + return false; + + return true; } #endif -- cgit v1.1 From 12aae3b93aeae50f5ced1bbef57fe207ecd12930 Mon Sep 17 00:00:00 2001 From: Tamar Christina Date: Mon, 7 Feb 2022 12:54:42 +0000 Subject: AArch32: correct dot-product RTL patterns. The previous fix for this problem was wrong due to a subtle difference between where NEON expects the RMW values and where intrinsics expects them. The insn pattern is modeled after the intrinsics and so needs an expand for the vectorizer optab to switch the RTL. However operand[3] is not expected to be written to so the current pattern is bogus. Instead we use the expand to shuffle around the RTL. 
The vectorizer expects operands[3] and operands[0] to be the same but the aarch64 intrinsics expanders expect operands[0] and operands[1] to be the same. This also fixes some issues with big-endian, each dot product performs 4 8-byte multiplications. However compared to AArch64 we don't enter lanes in GCC lane indexed in AArch32 aside from loads/stores. This means no lane remappings are done in arm-builtins.c and so none should be done at the instruction side. There are some other instructions that need inspections as I think there are more incorrect ones. Third there was a bug in the ACLE specication for dot product which has now been fixed[1]. This means some intrinsics were missing and are added by this patch. Bootstrapped and regtested on arm-none-linux-gnueabihf and no issues. Ok for master? and active branches after some stew? [1] https://github.com/ARM-software/acle/releases/tag/r2021Q3 gcc/ChangeLog: * config/arm/arm_neon.h (vdot_laneq_u32, vdotq_laneq_u32, vdot_laneq_s32, vdotq_laneq_s32): New. * config/arm/arm_neon_builtins.def (sdot_laneq, udot_laneq): New. * config/arm/neon.md (neon_dot): New. (dot_prod): Re-order rtl. (neon_dot_lane): Fix rtl order and endiannes. (neon_dot_laneq): New. gcc/testsuite/ChangeLog: * gcc.target/arm/simd/vdot-compile.c: Add new cases. * gcc.target/arm/simd/vdot-exec.c: Likewise. --- gcc/config/arm/arm_neon.h | 29 ++++++++ gcc/config/arm/arm_neon_builtins.def | 2 + gcc/config/arm/neon.md | 125 ++++++++++++++++++++--------------- 3 files changed, 101 insertions(+), 55 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index 9b6d599..fdfea33 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -18243,6 +18243,35 @@ vdotq_lane_s32 (int32x4_t __r, int8x16_t __a, int8x8_t __b, const int __index) return __builtin_neon_sdot_lanev16qi (__r, __a, __b, __index); } +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdot_laneq_u32 (uint32x2_t __r, uint8x8_t __a, uint8x16_t __b, const int __index) +{ + return __builtin_neon_udot_laneqv8qi_uuuus (__r, __a, __b, __index); +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdotq_laneq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b, + const int __index) +{ + return __builtin_neon_udot_laneqv16qi_uuuus (__r, __a, __b, __index); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdot_laneq_s32 (int32x2_t __r, int8x8_t __a, int8x16_t __b, const int __index) +{ + return __builtin_neon_sdot_laneqv8qi (__r, __a, __b, __index); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdotq_laneq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b, const int __index) +{ + return __builtin_neon_sdot_laneqv16qi (__r, __a, __b, __index); +} + #pragma GCC pop_options #endif diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def index 865de65..c29ae3a 100644 --- a/gcc/config/arm/arm_neon_builtins.def +++ b/gcc/config/arm/arm_neon_builtins.def @@ -342,6 +342,8 @@ VAR2 (TERNOP, sdot, v8qi, v16qi) VAR2 (UTERNOP, udot, v8qi, v16qi) VAR2 (MAC_LANE, sdot_lane, v8qi, v16qi) VAR2 (UMAC_LANE, udot_lane, v8qi, v16qi) +VAR2 (MAC_LANE, sdot_laneq, v8qi, v16qi) +VAR2 (UMAC_LANE, udot_laneq, v8qi, v16qi) VAR1 (USTERNOP, usdot, v8qi) VAR2 (USMAC_LANE_QUADTUP, usdot_lane, v8qi, v16qi) diff 
--git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index e06c824..4a8987b 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -2866,20 +2866,49 @@ }) -;; These instructions map to the __builtins for the Dot Product operations. -(define_insn "neon_dot" +;; These map to the auto-vectorizer Dot Product optab. +;; The auto-vectorizer expects a dot product builtin that also does an +;; accumulation into the provided register. +;; Given the following pattern +;; +;; for (i=0; idot_prod" [(set (match_operand:VCVTI 0 "register_operand" "=w") - (plus:VCVTI (match_operand:VCVTI 1 "register_operand" "0") - (unspec:VCVTI [(match_operand: 2 - "register_operand" "w") - (match_operand: 3 - "register_operand" "w")] - DOTPROD)))] + (plus:VCVTI + (unspec:VCVTI [(match_operand: 1 "register_operand" "w") + (match_operand: 2 "register_operand" "w")] + DOTPROD) + (match_operand:VCVTI 3 "register_operand" "0")))] "TARGET_DOTPROD" - "vdot.\\t%0, %2, %3" + "vdot.\\t%0, %1, %2" [(set_attr "type" "neon_dot")] ) +;; These instructions map to the __builtins for the Dot Product operations +(define_expand "neon_dot" + [(set (match_operand:VCVTI 0 "register_operand" "=w") + (plus:VCVTI + (unspec:VCVTI [(match_operand: 2 "register_operand") + (match_operand: 3 "register_operand")] + DOTPROD) + (match_operand:VCVTI 1 "register_operand")))] + "TARGET_DOTPROD" +) + ;; These instructions map to the __builtins for the Dot Product operations. (define_insn "neon_usdot" [(set (match_operand:VCVTI 0 "register_operand" "=w") @@ -2898,17 +2927,40 @@ ;; indexed operations. (define_insn "neon_dot_lane" [(set (match_operand:VCVTI 0 "register_operand" "=w") - (plus:VCVTI (match_operand:VCVTI 1 "register_operand" "0") - (unspec:VCVTI [(match_operand: 2 - "register_operand" "w") - (match_operand:V8QI 3 "register_operand" "t") - (match_operand:SI 4 "immediate_operand" "i")] - DOTPROD)))] + (plus:VCVTI + (unspec:VCVTI [(match_operand: 2 "register_operand" "w") + (match_operand:V8QI 3 "register_operand" "t") + (match_operand:SI 4 "immediate_operand" "i")] + DOTPROD) + (match_operand:VCVTI 1 "register_operand" "0")))] + "TARGET_DOTPROD" + "vdot.\\t%0, %2, %P3[%c4]"; + [(set_attr "type" "neon_dot")] +) + +;; These instructions map to the __builtins for the Dot Product +;; indexed operations. +(define_insn "neon_dot_laneq" + [(set (match_operand:VCVTI 0 "register_operand" "=w") + (plus:VCVTI + (unspec:VCVTI [(match_operand: 2 "register_operand" "w") + (match_operand:V16QI 3 "register_operand" "t") + (match_operand:SI 4 "immediate_operand" "i")] + DOTPROD) + (match_operand:VCVTI 1 "register_operand" "0")))] "TARGET_DOTPROD" { - operands[4] - = GEN_INT (NEON_ENDIAN_LANE_N (V8QImode, INTVAL (operands[4]))); - return "vdot.\\t%0, %2, %P3[%c4]"; + int lane = INTVAL (operands[4]); + if (lane > GET_MODE_NUNITS (V2SImode) - 1) + { + operands[4] = GEN_INT (lane - GET_MODE_NUNITS (V2SImode)); + return "vdot.\\t%0, %2, %f3[%c4]"; + } + else + { + operands[4] = GEN_INT (lane); + return "vdot.\\t%0, %2, %e3[%c4]"; + } } [(set_attr "type" "neon_dot")] ) @@ -2932,43 +2984,6 @@ [(set_attr "type" "neon_dot")] ) -;; These expands map to the Dot Product optab the vectorizer checks for. -;; The auto-vectorizer expects a dot product builtin that also does an -;; accumulation into the provided register. 
-;; Given the following pattern -;; -;; for (i=0; idot_prod" - [(set (match_operand:VCVTI 0 "register_operand") - (plus:VCVTI (unspec:VCVTI [(match_operand: 1 - "register_operand") - (match_operand: 2 - "register_operand")] - DOTPROD) - (match_operand:VCVTI 3 "register_operand")))] - "TARGET_DOTPROD" -{ - emit_insn ( - gen_neon_dot (operands[3], operands[3], operands[1], - operands[2])); - emit_insn (gen_rtx_SET (operands[0], operands[3])); - DONE; -}) - ;; Auto-vectorizer pattern for usdot (define_expand "usdot_prod" [(set (match_operand:VCVTI 0 "register_operand") -- cgit v1.1 From f2d131645114f14bd91a60107c941287370650ea Mon Sep 17 00:00:00 2001 From: Tamar Christina Date: Mon, 7 Feb 2022 12:55:12 +0000 Subject: AArch32: correct usdot-product RTL patterns. There was a bug in the ACLE specication for dot product which has now been fixed[1]. This means some intrinsics were missing and are added by this patch. Bootstrapped and regtested on arm-none-linux-gnueabihf and no issues. Ok for master? [1] https://github.com/ARM-software/acle/releases/tag/r2021Q3 gcc/ChangeLog: * config/arm/arm_neon.h (vusdotq_s32, vusdot_laneq_s32, vusdotq_laneq_s32, vsudot_laneq_s32, vsudotq_laneq_s32): New * config/arm/arm_neon_builtins.def (usdot): Add V16QI. (usdot_laneq, sudot_laneq): New. * config/arm/neon.md (neon_dot_laneq): New. (neon_dot_lane): Remote unneeded code. gcc/testsuite/ChangeLog: * gcc.target/arm/simd/vdot-2-1.c: Add new tests. * gcc.target/arm/simd/vdot-2-2.c: Likewise and fix output. --- gcc/config/arm/arm_neon.h | 39 ++++++++++++++++++++++++++++++++++++ gcc/config/arm/arm_neon_builtins.def | 4 +++- gcc/config/arm/neon.md | 28 ++++++++++++++++++++++++-- 3 files changed, 68 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index fdfea33..b30d04c 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -18930,6 +18930,13 @@ vusdot_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b) return __builtin_neon_usdotv8qi_ssus (__r, __a, __b); } +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vusdotq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b) +{ + return __builtin_neon_usdotv16qi_ssus (__r, __a, __b); +} + __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vusdot_lane_s32 (int32x2_t __r, uint8x8_t __a, @@ -18962,6 +18969,38 @@ vsudotq_lane_s32 (int32x4_t __r, int8x16_t __a, return __builtin_neon_sudot_lanev16qi_sssus (__r, __a, __b, __index); } +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vusdot_laneq_s32 (int32x2_t __r, uint8x8_t __a, + int8x16_t __b, const int __index) +{ + return __builtin_neon_usdot_laneqv8qi_ssuss (__r, __a, __b, __index); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vusdotq_laneq_s32 (int32x4_t __r, uint8x16_t __a, + int8x16_t __b, const int __index) +{ + return __builtin_neon_usdot_laneqv16qi_ssuss (__r, __a, __b, __index); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsudot_laneq_s32 (int32x2_t __r, int8x8_t __a, + uint8x16_t __b, const int __index) +{ + return __builtin_neon_sudot_laneqv8qi_sssus (__r, __a, __b, __index); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsudotq_laneq_s32 
(int32x4_t __r, int8x16_t __a, + uint8x16_t __b, const int __index) +{ + return __builtin_neon_sudot_laneqv16qi_sssus (__r, __a, __b, __index); +} + #pragma GCC pop_options #pragma GCC pop_options diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def index c29ae3a..445b2bf 100644 --- a/gcc/config/arm/arm_neon_builtins.def +++ b/gcc/config/arm/arm_neon_builtins.def @@ -345,9 +345,11 @@ VAR2 (UMAC_LANE, udot_lane, v8qi, v16qi) VAR2 (MAC_LANE, sdot_laneq, v8qi, v16qi) VAR2 (UMAC_LANE, udot_laneq, v8qi, v16qi) -VAR1 (USTERNOP, usdot, v8qi) +VAR2 (USTERNOP, usdot, v8qi, v16qi) VAR2 (USMAC_LANE_QUADTUP, usdot_lane, v8qi, v16qi) VAR2 (SUMAC_LANE_QUADTUP, sudot_lane, v8qi, v16qi) +VAR2 (USMAC_LANE_QUADTUP, usdot_laneq, v8qi, v16qi) +VAR2 (SUMAC_LANE_QUADTUP, sudot_laneq, v8qi, v16qi) VAR4 (BINOP, vcadd90, v4hf, v2sf, v8hf, v4sf) VAR4 (BINOP, vcadd270, v4hf, v2sf, v8hf, v4sf) diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 4a8987b..2b9a3de 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -2977,9 +2977,33 @@ DOTPROD_I8MM) (match_operand:VCVTI 1 "register_operand" "0")))] "TARGET_I8MM" + "vdot.\\t%0, %2, %P3[%c4]" + [(set_attr "type" "neon_dot")] +) + +;; These instructions map to the __builtins for the Dot Product +;; indexed operations in the v8.6 I8MM extension. +(define_insn "neon_dot_laneq" + [(set (match_operand:VCVTI 0 "register_operand" "=w") + (plus:VCVTI + (unspec:VCVTI [(match_operand: 2 "register_operand" "w") + (match_operand:V16QI 3 "register_operand" "t") + (match_operand:SI 4 "immediate_operand" "i")] + DOTPROD_I8MM) + (match_operand:VCVTI 1 "register_operand" "0")))] + "TARGET_I8MM" { - operands[4] = GEN_INT (INTVAL (operands[4])); - return "vdot.\\t%0, %2, %P3[%c4]"; + int lane = INTVAL (operands[4]); + if (lane > GET_MODE_NUNITS (V2SImode) - 1) + { + operands[4] = GEN_INT (lane - GET_MODE_NUNITS (V2SImode)); + return "vdot.\\t%0, %2, %f3[%c4]"; + } + else + { + operands[4] = GEN_INT (lane); + return "vdot.\\t%0, %2, %e3[%c4]"; + } } [(set_attr "type" "neon_dot")] ) -- cgit v1.1 From 04b54cc486cc6fcc40380445e500eaf46d7901dc Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Thu, 3 Feb 2022 14:00:02 +0100 Subject: [nvptx] Fix .local atomic regressions In PR target/104364, two problems were reported: - in muniform-simt mode, an atom.cas insn is no longer executed in the "master lane" only. - in msoft-stack mode, an __atomic_compare_exchange_n on stack memory is translated assuming it accesses local memory, while that's not the case. Fix these by: - ensuring that all insns with atomic attribute are also predicable, such that the validate_change in nvptx_reorg_uniform_simt will succeed, and asserting that it does, and - guarding the local atomics implementation with a new function nvptx_mem_local_p that correctly handles msoft-stack. Tested on x86_64 with nvptx accelerator. gcc/ChangeLog: 2022-02-04 Tom de Vries PR target/104364 * config/nvptx/nvptx-protos.h (nvptx_mem_local_p): Declare. * config/nvptx/nvptx.cc (nvptx_reorg_uniform_simt): Assert that change is validated. (nvptx_mem_local_p): New function. * config/nvptx/nvptx.md: Use nvptx_mem_local_p. (define_c_enum "unspecv"): Add UNSPECV_CAS_LOCAL. (define_insn "atomic_compare_and_swap_1_local"): New non-atomic, non-predicable define_insn, factored out of ... (define_insn "atomic_compare_and_swap_1"): ... here. Make predicable again. (define_expand "atomic_compare_and_swap"): Use atomic_compare_and_swap_1_local. 
gcc/testsuite/ChangeLog: 2022-02-04 Tom de Vries PR target/104364 * gcc.target/nvptx/softstack-2.c: New test. * gcc.target/nvptx/uniform-simt-1.c: New test. --- gcc/config/nvptx/nvptx-protos.h | 1 + gcc/config/nvptx/nvptx.cc | 25 +++++++++++++++- gcc/config/nvptx/nvptx.md | 63 +++++++++++++++++++++-------------------- 3 files changed, 58 insertions(+), 31 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx-protos.h b/gcc/config/nvptx/nvptx-protos.h index 3d6ad14..a846e34 100644 --- a/gcc/config/nvptx/nvptx-protos.h +++ b/gcc/config/nvptx/nvptx-protos.h @@ -59,5 +59,6 @@ extern const char *nvptx_output_simt_enter (rtx, rtx, rtx); extern const char *nvptx_output_simt_exit (rtx); extern const char *nvptx_output_red_partition (rtx, rtx); extern const char *nvptx_output_atomic_insn (const char *, rtx *, int, int); +extern bool nvptx_mem_local_p (rtx); #endif #endif diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index b3bb97c..2a69492 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -3150,7 +3150,8 @@ nvptx_reorg_uniform_simt () rtx pred = nvptx_get_unisimt_predicate (); pred = gen_rtx_NE (BImode, pred, const0_rtx); pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat); - validate_change (insn, &PATTERN (insn), pat, false); + bool changed_p = validate_change (insn, &PATTERN (insn), pat, false); + gcc_assert (changed_p); } } @@ -6894,6 +6895,28 @@ nvptx_libc_has_function (enum function_class fn_class, tree type) return default_libc_has_function (fn_class, type); } +bool +nvptx_mem_local_p (rtx mem) +{ + gcc_assert (GET_CODE (mem) == MEM); + + struct address_info info; + decompose_mem_address (&info, mem); + + if (info.base != NULL && REG_P (*info.base) + && REGNO_PTR_FRAME_P (REGNO (*info.base))) + { + if (TARGET_SOFT_STACK) + { + /* Frame-related doesn't mean local. 
*/ + } + else + return true; + } + + return false; +} + #undef TARGET_OPTION_OVERRIDE #define TARGET_OPTION_OVERRIDE nvptx_option_override diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index 92768dd..d64dbfd 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -54,6 +54,7 @@ (define_c_enum "unspecv" [ UNSPECV_LOCK UNSPECV_CAS + UNSPECV_CAS_LOCAL UNSPECV_XCHG UNSPECV_BARSYNC UNSPECV_WARPSYNC @@ -1771,8 +1772,14 @@ (match_operand:SI 7 "const_int_operand")] ;; failure model "" { - emit_insn (gen_atomic_compare_and_swap_1 - (operands[1], operands[2], operands[3], operands[4], operands[6])); + if (nvptx_mem_local_p (operands[2])) + emit_insn (gen_atomic_compare_and_swap_1_local + (operands[1], operands[2], operands[3], operands[4], + operands[6])); + else + emit_insn (gen_atomic_compare_and_swap_1 + (operands[1], operands[2], operands[3], operands[4], + operands[6])); rtx cond = gen_reg_rtx (BImode); emit_move_insn (cond, gen_rtx_EQ (BImode, operands[1], operands[3])); @@ -1780,23 +1787,18 @@ DONE; }) -(define_insn "atomic_compare_and_swap_1" +(define_insn "atomic_compare_and_swap_1_local" [(set (match_operand:SDIM 0 "nvptx_register_operand" "=R") (unspec_volatile:SDIM [(match_operand:SDIM 1 "memory_operand" "+m") (match_operand:SDIM 2 "nvptx_nonmemory_operand" "Ri") (match_operand:SDIM 3 "nvptx_nonmemory_operand" "Ri") (match_operand:SI 4 "const_int_operand")] - UNSPECV_CAS)) + UNSPECV_CAS_LOCAL)) (set (match_dup 1) - (unspec_volatile:SDIM [(const_int 0)] UNSPECV_CAS))] + (unspec_volatile:SDIM [(const_int 0)] UNSPECV_CAS_LOCAL))] "" { - struct address_info info; - decompose_mem_address (&info, operands[1]); - if (info.base != NULL && REG_P (*info.base) - && REGNO_PTR_FRAME_P (REGNO (*info.base))) - { output_asm_insn ("{", NULL); output_asm_insn ("\\t" ".reg.pred" "\\t" "%%eq_p;", NULL); output_asm_insn ("\\t" ".reg%t0" "\\t" "%%val;", operands); @@ -1807,13 +1809,26 @@ output_asm_insn ("\\t" "mov%t0" "\\t" "%0,%%val;", operands); output_asm_insn ("}", NULL); return ""; - } + } + [(set_attr "predicable" "false")]) + +(define_insn "atomic_compare_and_swap_1" + [(set (match_operand:SDIM 0 "nvptx_register_operand" "=R") + (unspec_volatile:SDIM + [(match_operand:SDIM 1 "memory_operand" "+m") + (match_operand:SDIM 2 "nvptx_nonmemory_operand" "Ri") + (match_operand:SDIM 3 "nvptx_nonmemory_operand" "Ri") + (match_operand:SI 4 "const_int_operand")] + UNSPECV_CAS)) + (set (match_dup 1) + (unspec_volatile:SDIM [(const_int 0)] UNSPECV_CAS))] + "" + { const char *t - = "\\tatom%A1.cas.b%T0\\t%0, %1, %2, %3;"; + = "%.\\tatom%A1.cas.b%T0\\t%0, %1, %2, %3;"; return nvptx_output_atomic_insn (t, operands, 1, 4); } - [(set_attr "atomic" "true") - (set_attr "predicable" "false")]) + [(set_attr "atomic" "true")]) (define_insn "atomic_exchange" [(set (match_operand:SDIM 0 "nvptx_register_operand" "=R") ;; output @@ -1825,10 +1840,7 @@ (match_operand:SDIM 2 "nvptx_nonmemory_operand" "Ri"))] ;; input "" { - struct address_info info; - decompose_mem_address (&info, operands[1]); - if (info.base != NULL && REG_P (*info.base) - && REGNO_PTR_FRAME_P (REGNO (*info.base))) + if (nvptx_mem_local_p (operands[1])) { output_asm_insn ("{", NULL); output_asm_insn ("\\t" ".reg%t0" "\\t" "%%val;", operands); @@ -1855,10 +1867,7 @@ (match_dup 1))] "" { - struct address_info info; - decompose_mem_address (&info, operands[1]); - if (info.base != NULL && REG_P (*info.base) - && REGNO_PTR_FRAME_P (REGNO (*info.base))) + if (nvptx_mem_local_p (operands[1])) { output_asm_insn ("{", NULL); 
output_asm_insn ("\\t" ".reg%t0" "\\t" "%%val;", operands); @@ -1888,10 +1897,7 @@ (match_dup 1))] "" { - struct address_info info; - decompose_mem_address (&info, operands[1]); - if (info.base != NULL && REG_P (*info.base) - && REGNO_PTR_FRAME_P (REGNO (*info.base))) + if (nvptx_mem_local_p (operands[1])) { output_asm_insn ("{", NULL); output_asm_insn ("\\t" ".reg%t0" "\\t" "%%val;", operands); @@ -1924,10 +1930,7 @@ (match_dup 1))] "mode == SImode || TARGET_SM35" { - struct address_info info; - decompose_mem_address (&info, operands[1]); - if (info.base != NULL && REG_P (*info.base) - && REGNO_PTR_FRAME_P (REGNO (*info.base))) + if (nvptx_mem_local_p (operands[1])) { output_asm_insn ("{", NULL); output_asm_insn ("\\t" ".reg.b%T0" "\\t" "%%val;", operands); -- cgit v1.1 From 73f4a989b7f8aeaf8bff37e7f33b65d26b8f179f Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Mon, 7 Feb 2022 14:50:13 +0100 Subject: [nvptx] Fix 'main (int argc)' compilation On nvptx, with test-case sso-12.c I run into: ... spawn nvptx-none-run ./sso-12.exe^M error: Prototype doesn't match for 'main' in 'input file 1 at offset 1796', \ first defined in 'input file 1 at offset 1796'^M nvptx-run: cuLinkAddData failed: device kernel image is invalid \ (CUDA_ERROR_INVALID_SOURCE, 300)^M FAIL: gcc.dg/sso-12.c execution test ... The problem is that the test case uses 'main (int)' prototype, while __main uses: ... extern int main (int, void **); ... There's code in write_fn_proto_1 to handle 'main (void)' as if 'main (int, void **)' was specified, but that's not active for 'main (int)'. Fix this in write_fn_proto_1 by handling 'main (int)' as if 'main (int, void **)' was specified. Tested on nvptx. gcc/ChangeLog: 2022-02-07 Tom de Vries * config/nvptx/nvptx.cc (write_fn_proto_1): Handle 'main (int)'. --- gcc/config/nvptx/nvptx.cc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index 2a69492..006fac8 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -938,10 +938,13 @@ write_fn_proto_1 (std::stringstream &s, bool is_defn, if (DECL_STATIC_CHAIN (decl)) argno = write_arg_type (s, -1, argno, ptr_type_node, true); - if (!argno && strcmp (name, "main") == 0) + if (argno < 2 && strcmp (name, "main") == 0) { - argno = write_arg_type (s, -1, argno, integer_type_node, true); - argno = write_arg_type (s, -1, argno, ptr_type_node, true); + if (argno == 0) + argno = write_arg_type (s, -1, argno, integer_type_node, true); + + if (argno == 1) + argno = write_arg_type (s, -1, argno, ptr_type_node, true); } if (argno) -- cgit v1.1 From 3faeba72cf93bdbf0b42d6b1b65fd4f0794f9d2a Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Tue, 8 Feb 2022 12:14:58 +0000 Subject: RISC-V: Add target machine headers as a dependency for riscv-sr.o Make riscv-sr.o depend on target machine headers, removing spurious test failures: FAIL: gcc.target/riscv/save-restore-3.c scan-assembler-not call[ \t]*t0,__riscv_save_0 FAIL: gcc.target/riscv/save-restore-3.c scan-assembler-not tail[ \t]*__riscv_restore_0 FAIL: gcc.target/riscv/save-restore-3.c scan-assembler tail[ \t]*foo FAIL: gcc.target/riscv/save-restore-6.c scan-assembler-not call[ \t]*t0,__riscv_save_0 FAIL: gcc.target/riscv/save-restore-6.c scan-assembler-not tail[ \t]*__riscv_restore_0 FAIL: gcc.target/riscv/save-restore-6.c scan-assembler tail[ \t]*other_func if the definitions of UNSPECs are locally changed and GCC rebuilt from a dirty tree. 
gcc/ * config/riscv/t-riscv (riscv-sr.o): Add $(TM_H) dependency. --- gcc/config/riscv/t-riscv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/riscv/t-riscv b/gcc/config/riscv/t-riscv index 096d70e..19736b3 100644 --- a/gcc/config/riscv/t-riscv +++ b/gcc/config/riscv/t-riscv @@ -6,7 +6,7 @@ riscv-builtins.o: $(srcdir)/config/riscv/riscv-builtins.cc $(CONFIG_H) \ $(srcdir)/config/riscv/riscv-builtins.cc riscv-sr.o: $(srcdir)/config/riscv/riscv-sr.cc $(CONFIG_H) \ - $(SYSTEM_H) + $(SYSTEM_H) $(TM_H) $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ $(srcdir)/config/riscv/riscv-sr.cc -- cgit v1.1 From decde11183bdccc46587d6614b75f3d56a2f2e4a Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Fri, 4 Feb 2022 08:53:52 +0100 Subject: [nvptx] Choose -mptx default based on -misa While testing with driver version 390.147 I ran into the problem that it doesn't support ptx isa version 6.3 (the new default), only 6.1. Furthermore, using the -mptx option is a bit user-unfriendly. Say we want to compile for sm_80. We can use -misa=sm_80 to specify that, but then run into errors because the default ptx version is 6.3, which doesn't support sm_80 yet. Address both these issues by: - picking a default -mptx based on the active -misa, and - ensuring that the default -mptx is at least 6.0 (instead of 6.3). Also add an error in case of incompatible options like "-misa=sm_80 -mptx=6.3": ... cc1: error: PTX version (-mptx) needs to be at least 7.0 to support \ selected -misa (sm_80) ... Tested on x86_64-linux with nvptx accelerator. gcc/ChangeLog: 2022-02-08 Tom de Vries PR target/104283 * config/nvptx/nvptx-opts.h (enum ptx_version): Add PTX_VERSION_3_0 and PTX_VERSION_4_2. * config/nvptx/nvptx.cc (first_ptx_version_supporting_sm) (default_ptx_version_option, ptx_version_to_string) (sm_version_to_string, handle_ptx_version_option): New function. (nvptx_option_override): Call handle_ptx_version_option. (nvptx_file_start): Use ptx_version_to_string and sm_version_to_string. * config/nvptx/nvptx.md (define_insn "nvptx_shuffle") (define_insn "nvptx_vote_ballot"): Use TARGET_PTX_6_0. * config/nvptx/nvptx.opt (mptx): Remove 'Init'. 
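The effect of the new -misa/-mptx handling can be sketched with a compile-only example; this is an assumed illustration rather than one of the committed nvptx tests, and the diagnostic text is the one quoted in the commit message above.

/* Sketch: an explicit -mptx too old for the selected -misa is now diagnosed
   up front instead of producing PTX the assembler rejects; -misa=sm_80 on
   its own now selects PTX 7.0 automatically.  */
/* { dg-do compile } */
/* { dg-options "-misa=sm_80 -mptx=6.3" } */

int
main (void)
{
  return 0;
}

/* Expected:
   cc1: error: PTX version (-mptx) needs to be at least 7.0 to support
   selected -misa (sm_80)  */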
--- gcc/config/nvptx/nvptx-opts.h | 2 + gcc/config/nvptx/nvptx.cc | 133 +++++++++++++++++++++++++++++++++++++----- gcc/config/nvptx/nvptx.md | 4 +- gcc/config/nvptx/nvptx.opt | 2 +- 4 files changed, 122 insertions(+), 19 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx-opts.h b/gcc/config/nvptx/nvptx-opts.h index c754a51..cc488b2 100644 --- a/gcc/config/nvptx/nvptx-opts.h +++ b/gcc/config/nvptx/nvptx-opts.h @@ -31,7 +31,9 @@ enum ptx_isa enum ptx_version { + PTX_VERSION_3_0, PTX_VERSION_3_1, + PTX_VERSION_4_2, PTX_VERSION_6_0, PTX_VERSION_6_3, PTX_VERSION_7_0 diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index 006fac8..1b0227a 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -205,6 +205,109 @@ diagnose_openacc_conflict (bool optval, const char *optname) error ("option %s is not supported together with %<-fopenacc%>", optname); } +static enum ptx_version +first_ptx_version_supporting_sm (enum ptx_isa sm) +{ + switch (sm) + { + case PTX_ISA_SM30: + return PTX_VERSION_3_0; + case PTX_ISA_SM35: + return PTX_VERSION_3_1; + case PTX_ISA_SM53: + return PTX_VERSION_4_2; + case PTX_ISA_SM75: + return PTX_VERSION_6_3; + case PTX_ISA_SM80: + return PTX_VERSION_7_0; + default: + gcc_unreachable (); + } +} + +static enum ptx_version +default_ptx_version_option (void) +{ + enum ptx_version first + = first_ptx_version_supporting_sm ((enum ptx_isa) ptx_isa_option); + + /* Pick a version that supports the sm. */ + enum ptx_version res = first; + + /* Pick at least 3.1. This has been the smallest version historically. */ + res = MAX (res, PTX_VERSION_3_1); + + /* Pick at least 6.0, to enable using bar.warp.sync to have a way to force + warp convergence. */ + res = MAX (res, PTX_VERSION_6_0); + + /* Verify that we pick a version that supports the sm. */ + gcc_assert (first <= res); + return res; +} + +static const char * +ptx_version_to_string (enum ptx_version v) +{ + switch (v) + { + case PTX_VERSION_3_0: + return "3.0"; + case PTX_VERSION_3_1: + return "3.1"; + case PTX_VERSION_4_2: + return "4.2"; + case PTX_VERSION_6_0: + return "6.0"; + case PTX_VERSION_6_3: + return "6.3"; + case PTX_VERSION_7_0: + return "7.0"; + default: + gcc_unreachable (); + } +} + +static const char * +sm_version_to_string (enum ptx_isa sm) +{ + switch (sm) + { + case PTX_ISA_SM30: + return "30"; + case PTX_ISA_SM35: + return "35"; + case PTX_ISA_SM53: + return "53"; + case PTX_ISA_SM70: + return "70"; + case PTX_ISA_SM75: + return "75"; + case PTX_ISA_SM80: + return "80"; + default: + gcc_unreachable (); + } +} + +static void +handle_ptx_version_option (void) +{ + if (!OPTION_SET_P (ptx_version_option)) + { + ptx_version_option = default_ptx_version_option (); + return; + } + + enum ptx_version first + = first_ptx_version_supporting_sm ((enum ptx_isa) ptx_isa_option); + + if (ptx_version_option < first) + error ("PTX version (-mptx) needs to be at least %s to support selected" + " -misa (sm_%s)", ptx_version_to_string (first), + sm_version_to_string ((enum ptx_isa)ptx_isa_option)); +} + /* Implement TARGET_OPTION_OVERRIDE. */ static void @@ -212,6 +315,8 @@ nvptx_option_override (void) { init_machine_status = nvptx_init_machine_status; + handle_ptx_version_option (); + /* Set toplevel_reorder, unless explicitly disabled. We need reordering so that we emit necessary assembler decls of undeclared variables. 
*/ @@ -5430,23 +5535,19 @@ static void nvptx_file_start (void) { fputs ("// BEGIN PREAMBLE\n", asm_out_file); - if (TARGET_PTX_7_0) - fputs ("\t.version\t7.0\n", asm_out_file); - else if (TARGET_PTX_6_3) - fputs ("\t.version\t6.3\n", asm_out_file); - else - fputs ("\t.version\t3.1\n", asm_out_file); - if (TARGET_SM80) - fputs ("\t.target\tsm_80\n", asm_out_file); - else if (TARGET_SM75) - fputs ("\t.target\tsm_75\n", asm_out_file); - else if (TARGET_SM53) - fputs ("\t.target\tsm_53\n", asm_out_file); - else if (TARGET_SM35) - fputs ("\t.target\tsm_35\n", asm_out_file); - else - fputs ("\t.target\tsm_30\n", asm_out_file); + + fputs ("\t.version\t", asm_out_file); + fputs (ptx_version_to_string ((enum ptx_version)ptx_version_option), + asm_out_file); + fputs ("\n", asm_out_file); + + fputs ("\t.target\tsm_", asm_out_file); + fputs (sm_version_to_string ((enum ptx_isa)ptx_isa_option), + asm_out_file); + fputs ("\n", asm_out_file); + fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode)); + fputs ("// END PREAMBLE\n", asm_out_file); } diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index d64dbfd..7463603 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -1603,7 +1603,7 @@ UNSPEC_SHUFFLE))] "" { - if (TARGET_PTX_6_3) + if (TARGET_PTX_6_0) return "%.\\tshfl.sync%S3.b32\\t%0, %1, %2, 31, 0xffffffff;"; else return "%.\\tshfl%S3.b32\\t%0, %1, %2, 31;"; @@ -1615,7 +1615,7 @@ UNSPEC_VOTE_BALLOT))] "" { - if (TARGET_PTX_6_3) + if (TARGET_PTX_6_0) return "%.\\tvote.sync.ballot.b32\\t%0, %1, 0xffffffff;"; else return "%.\\tvote.ballot.b32\\t%0, %1;"; diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt index 6e12b1f..e3f65b2 100644 --- a/gcc/config/nvptx/nvptx.opt +++ b/gcc/config/nvptx/nvptx.opt @@ -89,5 +89,5 @@ EnumValue Enum(ptx_version) String(7.0) Value(PTX_VERSION_7_0) mptx= -Target RejectNegative ToLower Joined Enum(ptx_version) Var(ptx_version_option) Init(PTX_VERSION_6_3) +Target RejectNegative ToLower Joined Enum(ptx_version) Var(ptx_version_option) Specify the version of the ptx version to use. -- cgit v1.1 From 1e3185e714e877b2b4d14ade0865322f71a8cbf6 Mon Sep 17 00:00:00 2001 From: Robin Dapp Date: Tue, 8 Feb 2022 14:56:29 +0100 Subject: s390: Increase costs for load on condition and change movqicc expander. This patch changes the costs for a load on condition from 5 to 6 in order to ensure that we only if-convert two and not three or more SETS like if (cond) { a = b; c = d; e = f; } In the movqicc expander we emit a paradoxical subreg directly that combine would otherwise try to create by using a non-optimal sequence (which would be too expensive). Also, fix two oversights in ifcvt testcases. gcc/ChangeLog: * config/s390/s390.cc (s390_rtx_costs): Increase costs for load on condition. * config/s390/s390.md: Use paradoxical subreg. gcc/testsuite/ChangeLog: * gcc.target/s390/ifcvt-two-insns-int.c: Fix array size. * gcc.target/s390/ifcvt-two-insns-long.c: Dito. --- gcc/config/s390/s390.cc | 2 +- gcc/config/s390/s390.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc index c6cfe41..d2af6d8 100644 --- a/gcc/config/s390/s390.cc +++ b/gcc/config/s390/s390.cc @@ -3636,7 +3636,7 @@ s390_rtx_costs (rtx x, machine_mode mode, int outer_code, /* It is going to be a load/store on condition. Make it slightly more expensive than a normal load. 
*/ - *total = COSTS_N_INSNS (1) + 1; + *total = COSTS_N_INSNS (1) + 2; rtx dst = SET_DEST (x); rtx then = XEXP (SET_SRC (x), 1); diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md index e3ccbac..5eee8e8 100644 --- a/gcc/config/s390/s390.md +++ b/gcc/config/s390/s390.md @@ -7003,9 +7003,9 @@ if (!CONSTANT_P (els)) els = simplify_gen_subreg (E_SImode, els, mode, 0); - rtx tmp_target = gen_reg_rtx (E_SImode); + rtx tmp_target = simplify_gen_subreg (E_SImode, operands[0], mode, 0); + emit_insn (gen_movsicc (tmp_target, operands[1], then, els)); - emit_move_insn (operands[0], gen_lowpart (mode, tmp_target)); DONE; }) -- cgit v1.1 From ab1355a4804f04700a6ad49c9cc90261334e9dc3 Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Tue, 8 Feb 2022 15:35:37 +0100 Subject: [nvptx] Unbreak build, add PTX_ISA_SM70 With the commit "[nvptx] Choose -mptx default based on -misa" I introduced a use of PTX_ISA_SM70, without adding it first. Add it, as well as the corresponding TARGET_SM70. Build for x86_64 with nvptx accelerator. gcc/ChangeLog: 2022-02-08 Tom de Vries * config/nvptx/nvptx-opts.h (enum ptx_isa): Add PTX_ISA_SM70. * config/nvptx/nvptx.h (TARGET_SM70): Define. --- gcc/config/nvptx/nvptx-opts.h | 1 + gcc/config/nvptx/nvptx.h | 1 + 2 files changed, 2 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx-opts.h b/gcc/config/nvptx/nvptx-opts.h index cc488b2..e918d43 100644 --- a/gcc/config/nvptx/nvptx-opts.h +++ b/gcc/config/nvptx/nvptx-opts.h @@ -25,6 +25,7 @@ enum ptx_isa PTX_ISA_SM30, PTX_ISA_SM35, PTX_ISA_SM53, + PTX_ISA_SM70, PTX_ISA_SM75, PTX_ISA_SM80 }; diff --git a/gcc/config/nvptx/nvptx.h b/gcc/config/nvptx/nvptx.h index 065d7aa..edffd08 100644 --- a/gcc/config/nvptx/nvptx.h +++ b/gcc/config/nvptx/nvptx.h @@ -88,6 +88,7 @@ #define TARGET_SM35 (ptx_isa_option >= PTX_ISA_SM35) #define TARGET_SM53 (ptx_isa_option >= PTX_ISA_SM53) +#define TARGET_SM70 (ptx_isa_option >= PTX_ISA_SM70) #define TARGET_SM75 (ptx_isa_option >= PTX_ISA_SM75) #define TARGET_SM80 (ptx_isa_option >= PTX_ISA_SM80) -- cgit v1.1 From 943d631abdd7be623cbf2b870d3d0cfef89f5f26 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Tue, 8 Feb 2022 10:36:14 -0600 Subject: rs6000: Add support for vmsumcud and vec_msumc 2022-02-08 Bill Schmidt gcc/ * config/rs6000/rs6000-builtins.def (VMSUMCUD): New. * config/rs6000/rs6000-overload.def (VEC_MSUMC): New. * config/rs6000/vsx.md (UNSPEC_VMSUMCUD): New constant. (vmsumcud): New define_insn. gcc/testsuite/ * gcc.target/powerpc/vec-msumc.c: New test. 
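A short usage sketch for the new overload: the argument and result types below are read off the overload entry added in rs6000-overload.def (vull, vull, vuq -> vuq), while the carry-out description is an assumption based on the vmsumcud mnemonic; this is not the committed vec-msumc.c test.

#include <altivec.h>

/* Sketch only: with -mcpu=power10 this is expected to compile to a single
   vmsumcud, accumulating the carries out of the unsigned doubleword
   multiply-sum into the 128-bit accumulator.  */
vector unsigned __int128
msum_carry (vector unsigned long long a, vector unsigned long long b,
            vector unsigned __int128 c)
{
  return vec_msumc (a, b, c);
}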
--- gcc/config/rs6000/rs6000-builtins.def | 3 +++ gcc/config/rs6000/rs6000-overload.def | 4 ++++ gcc/config/rs6000/vsx.md | 13 +++++++++++++ 3 files changed, 20 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-builtins.def b/gcc/config/rs6000/rs6000-builtins.def index 7f527b6..2d1e63fb 100644 --- a/gcc/config/rs6000/rs6000-builtins.def +++ b/gcc/config/rs6000/rs6000-builtins.def @@ -3497,6 +3497,9 @@ const signed int __builtin_altivec_vstrihr_p (vss); VSTRIHR_P vstrir_p_v8hi {} + const vuq __builtin_vsx_vmsumcud (vull, vull, vuq); + VMSUMCUD vmsumcud {} + const signed int __builtin_vsx_xvtlsbb_all_ones (vsc); XVTLSBB_ONES xvtlsbbo {} diff --git a/gcc/config/rs6000/rs6000-overload.def b/gcc/config/rs6000/rs6000-overload.def index cdc703e..49a6104 100644 --- a/gcc/config/rs6000/rs6000-overload.def +++ b/gcc/config/rs6000/rs6000-overload.def @@ -2456,6 +2456,10 @@ vuq __builtin_vec_msum (vull, vull, vuq); VMSUMUDM VMSUMUDM_U +[VEC_MSUMC, vec_msumc, __builtin_vec_msumc] + vuq __builtin_vec_msumc (vull, vull, vuq); + VMSUMCUD + [VEC_MSUMS, vec_msums, __builtin_vec_msums] vui __builtin_vec_msums (vus, vus, vui); VMSUMUHS diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md index c8c891e..2f5a2f7 100644 --- a/gcc/config/rs6000/vsx.md +++ b/gcc/config/rs6000/vsx.md @@ -372,6 +372,7 @@ UNSPEC_REPLACE_UN UNSPEC_VDIVES UNSPEC_VDIVEU + UNSPEC_VMSUMCUD UNSPEC_XXEVAL UNSPEC_XXSPLTIW UNSPEC_XXSPLTIDP @@ -6620,3 +6621,15 @@ emit_move_insn (operands[0], tmp4); DONE; }) + +;; vmsumcud +(define_insn "vmsumcud" +[(set (match_operand:V1TI 0 "register_operand" "+v") + (unspec:V1TI [(match_operand:V2DI 1 "register_operand" "v") + (match_operand:V2DI 2 "register_operand" "v") + (match_operand:V1TI 3 "register_operand" "v")] + UNSPEC_VMSUMCUD))] + "TARGET_POWER10" + "vmsumcud %0,%1,%2,%3" + [(set_attr "type" "veccomplex")] +) -- cgit v1.1 From 0c3e491a4e5ae74bfbed6d167d403d262b5a4adc Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Tue, 8 Feb 2022 20:14:30 +0100 Subject: rs6000: Fix up vspltis_shifted [PR102140] The following testcase ICEs, because (const_vector:V4SI [ (const_int 0 [0]) repeated x3 (const_int -2147483648 [0xffffffff80000000]) ]) is recognized as valid easy_vector_constant in between split1 pass and end of RA. The problem is that such constants need to be split, and the only splitter for that is: (define_split [(set (match_operand:VM 0 "altivec_register_operand") (match_operand:VM 1 "easy_vector_constant_vsldoi"))] "VECTOR_UNIT_ALTIVEC_OR_VSX_P (mode) && can_create_pseudo_p ()" There is only a single splitting pass before RA, so after that finishes, if something gets matched in between that and end of RA (after that can_create_pseudo_p () would be no longer true), it will never be successfully split and we ICE at final.cc time or earlier. The i386 backend (and a few others) already use (cfun->curr_properties & PROP_rtl_split_insns) as a test for split1 pass finished, so that some insns that should be split during split1 and shouldn't be matched afterwards are properly guarded. So, the following patch does that for vspltis_shifted too. 2022-02-08 Jakub Jelinek PR target/102140 * config/rs6000/rs6000.cc (vspltis_shifted): Return false also if split1 pass has finished already. * gcc.dg/pr102140.c: New test. 
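For reference, a C-level sketch of the kind of constant involved; this is illustrative only, not the committed gcc.dg/pr102140.c, and whether it reproduces the ICE depends on optimization flags and pass ordering.

/* Sketch: a V4SI constant shaped like { 0, 0, 0, 0x80000000 } is a
   "shifted vspltis" candidate; once split1 has run there is no later
   pre-RA splitting pass left to break it up, hence the new guard.  */
typedef int v4si __attribute__ ((vector_size (16)));

v4si
foo (void)
{
  return (v4si) { 0, 0, 0, (int) 0x80000000u };
}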
--- gcc/config/rs6000/rs6000.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index e571a0b..eaba9a2 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -6257,8 +6257,11 @@ vspltis_shifted (rtx op) return false; /* We need to create pseudo registers to do the shift, so don't recognize - shift vector constants after reload. */ - if (!can_create_pseudo_p ()) + shift vector constants after reload. Don't match it even before RA + after split1 is done, because there won't be further splitting pass + before RA to do the splitting. */ + if (!can_create_pseudo_p () + || (cfun->curr_properties & PROP_rtl_split_insns)) return false; nunits = GET_MODE_NUNITS (mode); -- cgit v1.1 From 1c827873ed283df282f2df11dfe0ff607e07dab3 Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Wed, 9 Feb 2022 08:48:35 +0100 Subject: target/104453 - guard call folding with NULL LHS This guards shift builtin folding to do nothing when there is no LHS, similar to what other foldings do. 2022-02-09 Richard Biener PR target/104453 * config/i386/i386.cc (ix86_gimple_fold_builtin): Guard shift folding for NULL LHS. * gcc.target/i386/pr104453.c: New testcase. --- gcc/config/i386/i386.cc | 2 ++ 1 file changed, 2 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index dd5584f..448c079 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -18642,6 +18642,8 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi) do_shift: gcc_assert (n_args >= 2); + if (!gimple_call_lhs (stmt)) + break; arg0 = gimple_call_arg (stmt, 0); arg1 = gimple_call_arg (stmt, 1); elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); -- cgit v1.1 From 59b31f0e2d187ebdb3d399661e22b28e4ebd8099 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Wed, 9 Feb 2022 13:14:43 +0800 Subject: ICE: QImode(not SImode) operand should be passed to gen_vec_initv16qiqi in ashlv16qi3. ix86_expand_vector_init expects vals to be a parallel containing values of individual fields which should be either element mode of the vector mode, or a vector mode with the same element mode and smaller number of elements. But in the expander ashlv16qi3, the second operand is SImode which can't be directly passed to gen_vec_initv16qiqi. gcc/ChangeLog: PR target/104451 * config/i386/sse.md (3): lowpart_subreg operands[2] from SImode to QImode. gcc/testsuite/ChangeLog: PR target/104451 * gcc.target/i386/pr104451.c: New test. --- gcc/config/i386/sse.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index d8cb7b6..36b35f6 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -24153,8 +24153,9 @@ negate = true; } par = gen_rtx_PARALLEL (V16QImode, rtvec_alloc (16)); + tmp = lowpart_subreg (QImode, operands[2], SImode); for (i = 0; i < 16; i++) - XVECEXP (par, 0, i) = operands[2]; + XVECEXP (par, 0, i) = tmp; tmp = gen_reg_rtx (V16QImode); emit_insn (gen_vec_initv16qiqi (tmp, par)); -- cgit v1.1 From 5390a2f191682dae3c6d1e1deac20e05be413514 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Sun, 30 Jan 2022 10:08:14 -0800 Subject: x86: Check each component of source operand for AVX_U128_DIRTY commit 9775e465c1fbfc32656de77c618c61acf5bd905d Author: H.J. 
Lu Date: Tue Jul 27 07:46:04 2021 -0700 x86: Don't set AVX_U128_DIRTY when zeroing YMM/ZMM register called ix86_check_avx_upper_register to check mode on source operand. But ix86_check_avx_upper_register doesn't work on source operand like (vec_select:V2DI (reg/v:V4DI 23 xmm3 [orig:91 ymm ] [91]) (parallel [ (const_int 2 [0x2]) (const_int 3 [0x3]) ])) Add ix86_avx_u128_mode_source to check mode for each component of source operand. gcc/ PR target/104441 * config/i386/i386.cc (ix86_avx_u128_mode_source): New function. (ix86_avx_u128_mode_needed): Return AVX_U128_ANY for debug INSN. Call ix86_avx_u128_mode_source to check mode for each component of source operand. gcc/testsuite/ PR target/104441 * gcc.target/i386/pr104441-1a.c: New test. * gcc.target/i386/pr104441-1b.c: Likewise. --- gcc/config/i386/i386.cc | 145 ++++++++++++++++++++++++++---------------------- 1 file changed, 79 insertions(+), 66 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 448c079..db5e168 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -14365,11 +14365,82 @@ ix86_check_avx_upper_stores (rtx dest, const_rtx, void *data) } } +/* For YMM/ZMM store or YMM/ZMM extract. Return mode for the source + operand of SRC DEFs in the same basic block before INSN. */ + +static int +ix86_avx_u128_mode_source (rtx_insn *insn, const_rtx src) +{ + basic_block bb = BLOCK_FOR_INSN (insn); + rtx_insn *end = BB_END (bb); + + /* Return AVX_U128_DIRTY if there is no DEF in the same basic + block. */ + int status = AVX_U128_DIRTY; + + for (df_ref def = DF_REG_DEF_CHAIN (REGNO (src)); + def; def = DF_REF_NEXT_REG (def)) + if (DF_REF_BB (def) == bb) + { + /* Ignore DEF from different basic blocks. */ + rtx_insn *def_insn = DF_REF_INSN (def); + + /* Check if DEF_INSN is before INSN. */ + rtx_insn *next; + for (next = NEXT_INSN (def_insn); + next != nullptr && next != end && next != insn; + next = NEXT_INSN (next)) + ; + + /* Skip if DEF_INSN isn't before INSN. */ + if (next != insn) + continue; + + /* Return AVX_U128_DIRTY if the source operand of DEF_INSN + isn't constant zero. */ + + if (CALL_P (def_insn)) + { + bool avx_upper_reg_found = false; + note_stores (def_insn, + ix86_check_avx_upper_stores, + &avx_upper_reg_found); + + /* Return AVX_U128_DIRTY if call returns AVX. */ + if (avx_upper_reg_found) + return AVX_U128_DIRTY; + + continue; + } + + rtx set = single_set (def_insn); + if (!set) + return AVX_U128_DIRTY; + + rtx dest = SET_DEST (set); + + /* Skip if DEF_INSN is not an AVX load. Return AVX_U128_DIRTY + if the source operand isn't constant zero. */ + if (ix86_check_avx_upper_register (dest) + && standard_sse_constant_p (SET_SRC (set), + GET_MODE (dest)) != 1) + return AVX_U128_DIRTY; + + /* We get here only if all AVX loads are from constant zero. */ + status = AVX_U128_ANY; + } + + return status; +} + /* Return needed mode for entity in optimize_mode_switching pass. */ static int ix86_avx_u128_mode_needed (rtx_insn *insn) { + if (DEBUG_INSN_P (insn)) + return AVX_U128_ANY; + if (CALL_P (insn)) { rtx link; @@ -14409,6 +14480,8 @@ ix86_avx_u128_mode_needed (rtx_insn *insn) return AVX_U128_CLEAN; } + subrtx_iterator::array_type array; + rtx set = single_set (insn); if (set) { @@ -14423,74 +14496,15 @@ ix86_avx_u128_mode_needed (rtx_insn *insn) else return AVX_U128_ANY; } - else if (ix86_check_avx_upper_register (src)) + else { - /* This is an YMM/ZMM store. Check for the source operand - of SRC DEFs in the same basic block before INSN. 
*/ - basic_block bb = BLOCK_FOR_INSN (insn); - rtx_insn *end = BB_END (bb); - - /* Return AVX_U128_DIRTY if there is no DEF in the same basic - block. */ - int status = AVX_U128_DIRTY; - - for (df_ref def = DF_REG_DEF_CHAIN (REGNO (src)); - def; def = DF_REF_NEXT_REG (def)) - if (DF_REF_BB (def) == bb) + FOR_EACH_SUBRTX (iter, array, src, NONCONST) + if (ix86_check_avx_upper_register (*iter)) { - /* Ignore DEF from different basic blocks. */ - rtx_insn *def_insn = DF_REF_INSN (def); - - /* Check if DEF_INSN is before INSN. */ - rtx_insn *next; - for (next = NEXT_INSN (def_insn); - next != nullptr && next != end && next != insn; - next = NEXT_INSN (next)) - ; - - /* Skip if DEF_INSN isn't before INSN. */ - if (next != insn) - continue; - - /* Return AVX_U128_DIRTY if the source operand of - DEF_INSN isn't constant zero. */ - - if (CALL_P (def_insn)) - { - bool avx_upper_reg_found = false; - note_stores (def_insn, ix86_check_avx_upper_stores, - &avx_upper_reg_found); - - /* Return AVX_U128_DIRTY if call returns AVX. */ - if (avx_upper_reg_found) - return AVX_U128_DIRTY; - - continue; - } - - set = single_set (def_insn); - if (!set) - return AVX_U128_DIRTY; - - dest = SET_DEST (set); - - /* Skip if DEF_INSN is not an AVX load. */ - if (ix86_check_avx_upper_register (dest)) - { - src = SET_SRC (set); - /* Return AVX_U128_DIRTY if the source operand isn't - constant zero. */ - if (standard_sse_constant_p (src, GET_MODE (dest)) - != 1) - return AVX_U128_DIRTY; - } - - /* We get here only if all AVX loads are from constant - zero. */ - status = AVX_U128_ANY; + int status = ix86_avx_u128_mode_source (insn, *iter); + if (status == AVX_U128_DIRTY) + return status; } - - return status; } /* This isn't YMM/ZMM load/store. */ @@ -14501,7 +14515,6 @@ ix86_avx_u128_mode_needed (rtx_insn *insn) Hardware changes state only when a 256bit register is written to, but we need to prevent the compiler from moving optimal insertion point above eventual read from 256bit or 512 bit register. */ - subrtx_iterator::array_type array; FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST) if (ix86_check_avx_upper_register (*iter)) return AVX_U128_DIRTY; -- cgit v1.1 From ab0b5fbfe90168d2e470aefb19e0cf31526290bc Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Sat, 19 Jun 2021 05:12:48 -0700 Subject: x86: Add -m[no-]direct-extern-access Add -m[no-]direct-extern-access and nodirect_extern_access attribute. -mdirect-extern-access is the default. With nodirect_extern_access attribute, GOT is always used to access undefined data and function symbols with nodirect_extern_access attribute, including in PIE and non-PIE. With -mno-direct-extern-access: 1. Always use GOT to access undefined data and function symbols, including in PIE and non-PIE. These will avoid copy relocations in executables. This is compatible with existing executables and shared libraries. 2. In executable and shared library, bind symbols with the STV_PROTECTED visibility locally: a. The address of data symbol is the address of data body. b. For systems without function descriptor, the function pointer is the address of function body. c. The resulting shared libraries may not be incompatible with executables which have copy relocations on protected symbols or use executable PLT entries as function addresses for protected functions in shared libraries. 3. Update asm_preferred_eh_data_format to select PC relative EH encoding format with -mno-direct-extern-access to avoid copy relocation. 4. 
Add ix86_reloc_rw_mask for TARGET_ASM_RELOC_RW_MASK to avoid copy relocation with -mno-direct-extern-access. gcc/ PR target/35513 PR target/100593 * config/i386/gnu-property.cc: Include "i386-protos.h". (file_end_indicate_exec_stack_and_gnu_property): Generate a GNU_PROPERTY_1_NEEDED note for -mno-direct-extern-access or nodirect_extern_access attribute. * config/i386/i386-options.cc (handle_nodirect_extern_access_attribute): New function. (ix86_attribute_table): Add nodirect_extern_access attribute. * config/i386/i386-protos.h (ix86_force_load_from_GOT_p): Add a bool argument. (ix86_has_no_direct_extern_access): New. * config/i386/i386.cc (ix86_has_no_direct_extern_access): New. (ix86_force_load_from_GOT_p): Add a bool argument to indicate call operand. Force non-call load from GOT for -mno-direct-extern-access or nodirect_extern_access attribute. (legitimate_pic_address_disp_p): Avoid copy relocation in PIE for -mno-direct-extern-access or nodirect_extern_access attribute. (ix86_print_operand): Pass true to ix86_force_load_from_GOT_p for call operand. (asm_preferred_eh_data_format): Use PC-relative format for -mno-direct-extern-access to avoid copy relocation. Check ptr_mode instead of TARGET_64BIT when selecting DW_EH_PE_sdata4. (ix86_binds_local_p): Set ix86_has_no_direct_extern_access to true for -mno-direct-extern-access or nodirect_extern_access attribute. Don't treat protected data as extern and avoid copy relocation on common symbol with -mno-direct-extern-access or nodirect_extern_access attribute. (ix86_reloc_rw_mask): New to avoid copy relocation for -mno-direct-extern-access. (TARGET_ASM_RELOC_RW_MASK): New. * config/i386/i386.opt: Add -mdirect-extern-access. * doc/extend.texi: Document nodirect_extern_access attribute. * doc/invoke.texi: Document -m[no-]direct-extern-access. gcc/testsuite/ PR target/35513 PR target/100593 * g++.target/i386/pr35513-1.C: New file. * g++.target/i386/pr35513-2.C: Likewise. * gcc.target/i386/pr35513-1a.c: Likewise. * gcc.target/i386/pr35513-1b.c: Likewise. * gcc.target/i386/pr35513-2a.c: Likewise. * gcc.target/i386/pr35513-2b.c: Likewise. * gcc.target/i386/pr35513-3a.c: Likewise. * gcc.target/i386/pr35513-3b.c: Likewise. * gcc.target/i386/pr35513-4a.c: Likewise. * gcc.target/i386/pr35513-4b.c: Likewise. * gcc.target/i386/pr35513-5a.c: Likewise. * gcc.target/i386/pr35513-5b.c: Likewise. * gcc.target/i386/pr35513-6a.c: Likewise. * gcc.target/i386/pr35513-6b.c: Likewise. * gcc.target/i386/pr35513-7a.c: Likewise. * gcc.target/i386/pr35513-7b.c: Likewise. * gcc.target/i386/pr35513-8.c: Likewise. * gcc.target/i386/pr35513-9a.c: Likewise. * gcc.target/i386/pr35513-9b.c: Likewise. * gcc.target/i386/pr35513-10a.c: Likewise. * gcc.target/i386/pr35513-10b.c: Likewise. * gcc.target/i386/pr35513-11a.c: Likewise. * gcc.target/i386/pr35513-11b.c: Likewise. * gcc.target/i386/pr35513-12a.c: Likewise. * gcc.target/i386/pr35513-12b.c: Likewise. --- gcc/config/i386/gnu-property.cc | 10 +++++- gcc/config/i386/i386-options.cc | 32 ++++++++++++++++++++ gcc/config/i386/i386-protos.h | 4 ++- gcc/config/i386/i386.cc | 67 ++++++++++++++++++++++++++++++++--------- gcc/config/i386/i386.opt | 4 +++ 5 files changed, 101 insertions(+), 16 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/gnu-property.cc b/gcc/config/i386/gnu-property.cc index f08984f..ea63c1e 100644 --- a/gcc/config/i386/gnu-property.cc +++ b/gcc/config/i386/gnu-property.cc @@ -23,6 +23,7 @@ along with GCC; see the file COPYING3. 
If not see #include "tm.h" #include "output.h" #include "linux-common.h" +#include "i386-protos.h" static void emit_gnu_property (unsigned int type, unsigned int data) @@ -60,7 +61,9 @@ file_end_indicate_exec_stack_and_gnu_property (void) { file_end_indicate_exec_stack (); - if (flag_cf_protection == CF_NONE && !ix86_needed) + if (flag_cf_protection == CF_NONE + && !ix86_needed + && !ix86_has_no_direct_extern_access) return; unsigned int feature_1 = 0; @@ -121,4 +124,9 @@ file_end_indicate_exec_stack_and_gnu_property (void) /* Generate GNU_PROPERTY_X86_ISA_1_NEEDED. */ if (isa_1) emit_gnu_property (0xc0008002, isa_1); + + if (ix86_has_no_direct_extern_access) + /* Emite a GNU_PROPERTY_1_NEEDED note with + GNU_PROPERTY_1_NEEDED_INDIRECT_EXTERN_ACCESS. */ + emit_gnu_property (0xb0008000, (1U << 0)); } diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc index 082abd2..8055393 100644 --- a/gcc/config/i386/i386-options.cc +++ b/gcc/config/i386/i386-options.cc @@ -3775,6 +3775,36 @@ ix86_handle_fentry_name (tree *node, tree name, tree args, return NULL_TREE; } +/* Handle a "nodirect_extern_access" attribute; arguments as in + struct attribute_spec.handler. */ + +static tree +handle_nodirect_extern_access_attribute (tree *pnode, tree name, + tree ARG_UNUSED (args), + int ARG_UNUSED (flags), + bool *no_add_attrs) +{ + tree node = *pnode; + + if (VAR_OR_FUNCTION_DECL_P (node)) + { + if ((!TREE_STATIC (node) && TREE_CODE (node) != FUNCTION_DECL + && !DECL_EXTERNAL (node)) || !TREE_PUBLIC (node)) + { + warning (OPT_Wattributes, + "%qE attribute have effect only on public objects", name); + *no_add_attrs = true; + } + } + else + { + warning (OPT_Wattributes, "%qE attribute ignored", name); + *no_add_attrs = true; + } + + return NULL_TREE; +} + /* Table of valid machine attributes. */ const struct attribute_spec ix86_attribute_table[] = { @@ -3855,6 +3885,8 @@ const struct attribute_spec ix86_attribute_table[] = ix86_handle_fentry_name, NULL }, { "cf_check", 0, 0, true, false, false, false, ix86_handle_fndecl_attribute, NULL }, + { "nodirect_extern_access", 0, 0, true, false, false, false, + handle_nodirect_extern_access_attribute, NULL }, /* End element. */ { NULL, 0, 0, false, false, false, false, NULL, NULL } diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 6b3c951..b7e9aa7 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -79,7 +79,7 @@ extern bool ix86_expand_cmpstrn_or_cmpmem (rtx, rtx, rtx, rtx, rtx, bool); extern bool constant_address_p (rtx); extern bool legitimate_pic_operand_p (rtx); extern bool legitimate_pic_address_disp_p (rtx); -extern bool ix86_force_load_from_GOT_p (rtx); +extern bool ix86_force_load_from_GOT_p (rtx, bool = false); extern void print_reg (rtx, int, FILE*); extern void ix86_print_operand (FILE *, rtx, int); @@ -401,3 +401,5 @@ extern rtl_opt_pass *make_pass_insert_endbr_and_patchable_area (gcc::context *); extern rtl_opt_pass *make_pass_remove_partial_avx_dependency (gcc::context *); + +extern bool ix86_has_no_direct_extern_access; diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index db5e168..6b97a2b 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -363,6 +363,9 @@ unsigned int ix86_default_incoming_stack_boundary; /* Alignment for incoming stack boundary in bits. */ unsigned int ix86_incoming_stack_boundary; +/* True if there is no direct access to extern symbols. 
*/ +bool ix86_has_no_direct_extern_access; + /* Calling abi specific va_list type nodes. */ tree sysv_va_list_type_node; tree ms_va_list_type_node; @@ -10514,13 +10517,17 @@ darwin_local_data_pic (rtx disp) } /* True if the function symbol operand X should be loaded from GOT. + If CALL_P is true, X is a call operand. + + NB: -mno-direct-extern-access doesn't force load from GOT for + call. NB: In 32-bit mode, only non-PIC is allowed in inline assembly statements, since a PIC register could not be available at the call site. */ bool -ix86_force_load_from_GOT_p (rtx x) +ix86_force_load_from_GOT_p (rtx x, bool call_p) { return ((TARGET_64BIT || (!flag_pic && HAVE_AS_IX86_GOT32X)) && !TARGET_PECOFF && !TARGET_MACHO @@ -10528,11 +10535,16 @@ ix86_force_load_from_GOT_p (rtx x) && ix86_cmodel != CM_LARGE && ix86_cmodel != CM_LARGE_PIC && GET_CODE (x) == SYMBOL_REF - && SYMBOL_REF_FUNCTION_P (x) - && (!flag_plt - || (SYMBOL_REF_DECL (x) - && lookup_attribute ("noplt", - DECL_ATTRIBUTES (SYMBOL_REF_DECL (x))))) + && ((!call_p + && (!ix86_direct_extern_access + || (SYMBOL_REF_DECL (x) + && lookup_attribute ("nodirect_extern_access", + DECL_ATTRIBUTES (SYMBOL_REF_DECL (x)))))) + || (SYMBOL_REF_FUNCTION_P (x) + && (!flag_plt + || (SYMBOL_REF_DECL (x) + && lookup_attribute ("noplt", + DECL_ATTRIBUTES (SYMBOL_REF_DECL (x))))))) && !SYMBOL_REF_LOCAL_P (x)); } @@ -10799,7 +10811,11 @@ legitimate_pic_address_disp_p (rtx disp) } else if (!SYMBOL_REF_FAR_ADDR_P (op0) && (SYMBOL_REF_LOCAL_P (op0) - || (HAVE_LD_PIE_COPYRELOC + || ((ix86_direct_extern_access + && !(SYMBOL_REF_DECL (op0) + && lookup_attribute ("nodirect_extern_access", + DECL_ATTRIBUTES (SYMBOL_REF_DECL (op0))))) + && HAVE_LD_PIE_COPYRELOC && flag_pie && !SYMBOL_REF_WEAK (op0) && !SYMBOL_REF_FUNCTION_P (op0))) @@ -13755,7 +13771,7 @@ ix86_print_operand (FILE *file, rtx x, int code) if (code == 'P') { - if (ix86_force_load_from_GOT_p (x)) + if (ix86_force_load_from_GOT_p (x, true)) { /* For inline assembly statement, load function address from GOT with 'P' operand modifier to avoid PLT. */ @@ -22536,10 +22552,10 @@ int asm_preferred_eh_data_format (int code, int global) { /* PE-COFF is effectively always -fPIC because of the .reloc section. */ - if (flag_pic || TARGET_PECOFF) + if (flag_pic || TARGET_PECOFF || !ix86_direct_extern_access) { int type = DW_EH_PE_sdata8; - if (!TARGET_64BIT + if (ptr_mode == SImode || ix86_cmodel == CM_SMALL_PIC || (ix86_cmodel == CM_MEDIUM_PIC && (global || code))) type = DW_EH_PE_sdata4; @@ -23629,10 +23645,28 @@ ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update) static bool ix86_binds_local_p (const_tree exp) { - return default_binds_local_p_3 (exp, flag_shlib != 0, true, true, - (!flag_pic - || (TARGET_64BIT - && HAVE_LD_PIE_COPYRELOC != 0))); + bool direct_extern_access + = (ix86_direct_extern_access + && !(VAR_OR_FUNCTION_DECL_P (exp) + && lookup_attribute ("nodirect_extern_access", + DECL_ATTRIBUTES (exp)))); + if (!direct_extern_access) + ix86_has_no_direct_extern_access = true; + return default_binds_local_p_3 (exp, flag_shlib != 0, true, + direct_extern_access, + (direct_extern_access + && (!flag_pic + || (TARGET_64BIT + && HAVE_LD_PIE_COPYRELOC != 0)))); +} + +/* If flag_pic or ix86_direct_extern_access is false, then neither + local nor global relocs should be placed in readonly memory. */ + +static int +ix86_reloc_rw_mask (void) +{ + return (flag_pic || !ix86_direct_extern_access) ? 
3 : 0; } #endif @@ -24697,6 +24731,11 @@ ix86_libgcc_floating_mode_supported_p #undef TARGET_IFUNC_REF_LOCAL_OK #define TARGET_IFUNC_REF_LOCAL_OK hook_bool_void_true +#if !TARGET_MACHO && !TARGET_DLLIMPORT_DECL_ATTRIBUTES +# undef TARGET_ASM_RELOC_RW_MASK +# define TARGET_ASM_RELOC_RW_MASK ix86_reloc_rw_mask +#endif + static bool ix86_libc_has_fast_function (int fcode ATTRIBUTE_UNUSED) { #ifdef OPTION_GLIBC diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt index eb829d1..d8e8656 100644 --- a/gcc/config/i386/i386.opt +++ b/gcc/config/i386/i386.opt @@ -1206,3 +1206,7 @@ Support MWAIT and MONITOR built-in functions and code generation. mavx512fp16 Target Mask(ISA2_AVX512FP16) Var(ix86_isa_flags2) Save Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, AVX512F and AVX512FP16 built-in functions and code generation. + +mdirect-extern-access +Target Var(ix86_direct_extern_access) Init(1) +Do not use GOT to access external symbols. -- cgit v1.1 From c48a6819d157fd648e77ef5be0dce887e047c734 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Wed, 9 Feb 2022 16:57:02 +0000 Subject: aarch64: Tighten general_operand predicates This patch fixes some case in which *general_operand was used over *nonimmediate_operand by patterns that don't accept immediates. This avoids some complication with later patches. gcc/ * config/aarch64/aarch64-simd.md (aarch64_simd_vec_set): Use aarch64_simd_nonimmediate_operand instead of aarch64_simd_general_operand. (@aarch64_combinez): Use nonimmediate_operand instead of general_operand. (@aarch64_combinez_be): Likewise. --- gcc/config/aarch64/aarch64-simd.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 6646e06..9529bdb 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -1039,7 +1039,7 @@ [(set (match_operand:VALL_F16 0 "register_operand" "=w,w,w") (vec_merge:VALL_F16 (vec_duplicate:VALL_F16 - (match_operand: 1 "aarch64_simd_general_operand" "w,?r,Utv")) + (match_operand: 1 "aarch64_simd_nonimmediate_operand" "w,?r,Utv")) (match_operand:VALL_F16 3 "register_operand" "0,0,0") (match_operand:SI 2 "immediate_operand" "i,i,i")))] "TARGET_SIMD" @@ -4380,7 +4380,7 @@ (define_insn "@aarch64_combinez" [(set (match_operand: 0 "register_operand" "=w,w,w") (vec_concat: - (match_operand:VDC 1 "general_operand" "w,?r,m") + (match_operand:VDC 1 "nonimmediate_operand" "w,?r,m") (match_operand:VDC 2 "aarch64_simd_or_scalar_imm_zero")))] "TARGET_SIMD && !BYTES_BIG_ENDIAN" "@ @@ -4395,7 +4395,7 @@ [(set (match_operand: 0 "register_operand" "=w,w,w") (vec_concat: (match_operand:VDC 2 "aarch64_simd_or_scalar_imm_zero") - (match_operand:VDC 1 "general_operand" "w,?r,m")))] + (match_operand:VDC 1 "nonimmediate_operand" "w,?r,m")))] "TARGET_SIMD && BYTES_BIG_ENDIAN" "@ mov\\t%0.8b, %1.8b -- cgit v1.1 From fabc5d9bceb0aec8db2147eb50ae375c711eea90 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Wed, 9 Feb 2022 16:57:02 +0000 Subject: aarch64: Generalise vec_set predicate The aarch64_simd_vec_set define_insn takes memory operands, so this patch makes the vec_set optab expander do the same. gcc/ * config/aarch64/aarch64-simd.md (vec_set): Allow the element to be an aarch64_simd_nonimmediate_operand. 
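A minimal intrinsics sketch of what the relaxed predicate permits; this is illustrative only, not part of the patch, and whether a single LD1 lane load is actually emitted still depends on register allocation:

#include <arm_neon.h>

/* The inserted element is a memory operand, so the vec_set expander can
   now pass it through directly instead of forcing it into a register.  */
float32x4_t
set_lane_from_mem (float32x4_t v, const float *p)
{
  return vsetq_lane_f32 (*p, v, 2);
}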
--- gcc/config/aarch64/aarch64-simd.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 9529bdb..872a3d7 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -1378,7 +1378,7 @@ (define_expand "vec_set" [(match_operand:VALL_F16 0 "register_operand") - (match_operand: 1 "register_operand") + (match_operand: 1 "aarch64_simd_nonimmediate_operand") (match_operand:SI 2 "immediate_operand")] "TARGET_SIMD" { -- cgit v1.1 From 958448a9441ee54e012c67cfc3cf88083f3d0e4a Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Wed, 9 Feb 2022 16:57:03 +0000 Subject: aarch64: Generalise adjacency check for load_pair_lanes This patch generalises the load_pair_lanes guard so that it uses aarch64_check_consecutive_mems to check for consecutive mems. It also allows the pattern to be used for STRICT_ALIGNMENT targets if the alignment is high enough. The main aim is to avoid an inline test, for the sake of a later patch that needs to repeat it. Reusing aarch64_check_consecutive_mems seemed simpler than writing an entirely new function. gcc/ * config/aarch64/aarch64-protos.h (aarch64_mergeable_load_pair_p): Declare. * config/aarch64/aarch64-simd.md (load_pair_lanes): Use aarch64_mergeable_load_pair_p instead of inline check. * config/aarch64/aarch64.cc (aarch64_expand_vector_init): Likewise. (aarch64_check_consecutive_mems): Allow the reversed parameter to be null. (aarch64_mergeable_load_pair_p): New function. --- gcc/config/aarch64/aarch64-protos.h | 1 + gcc/config/aarch64/aarch64-simd.md | 7 ++--- gcc/config/aarch64/aarch64.cc | 54 ++++++++++++++++++++++++------------- 3 files changed, 38 insertions(+), 24 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index 2636853..b75ed35 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -1000,6 +1000,7 @@ void aarch64_atomic_assign_expand_fenv (tree *, tree *, tree *); int aarch64_ccmp_mode_to_code (machine_mode mode); bool extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset); +bool aarch64_mergeable_load_pair_p (machine_mode, rtx, rtx); bool aarch64_operands_ok_for_ldpstp (rtx *, bool, machine_mode); bool aarch64_operands_adjust_ok_for_ldpstp (rtx *, bool, machine_mode); void aarch64_swap_ldrstr_operands (rtx *, bool); diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 872a3d7..c5bc2ea 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -4353,11 +4353,8 @@ (vec_concat: (match_operand:VDC 1 "memory_operand" "Utq") (match_operand:VDC 2 "memory_operand" "m")))] - "TARGET_SIMD && !STRICT_ALIGNMENT - && rtx_equal_p (XEXP (operands[2], 0), - plus_constant (Pmode, - XEXP (operands[1], 0), - GET_MODE_SIZE (mode)))" + "TARGET_SIMD + && aarch64_mergeable_load_pair_p (mode, operands[1], operands[2])" "ldr\\t%q0, %1" [(set_attr "type" "neon_load1_1reg_q")] ) diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 296145e..c47543a 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -21063,11 +21063,7 @@ aarch64_expand_vector_init (rtx target, rtx vals) for store_pair_lanes. 
*/ if (memory_operand (x0, inner_mode) && memory_operand (x1, inner_mode) - && !STRICT_ALIGNMENT - && rtx_equal_p (XEXP (x1, 0), - plus_constant (Pmode, - XEXP (x0, 0), - GET_MODE_SIZE (inner_mode)))) + && aarch64_mergeable_load_pair_p (mode, x0, x1)) { rtx t; if (inner_mode == DFmode) @@ -24687,14 +24683,20 @@ aarch64_sched_adjust_priority (rtx_insn *insn, int priority) return priority; } -/* Check if *MEM1 and *MEM2 are consecutive memory references and, +/* If REVERSED is null, return true if memory reference *MEM2 comes + immediately after memory reference *MEM1. Do not change the references + in this case. + + Otherwise, check if *MEM1 and *MEM2 are consecutive memory references and, if they are, try to make them use constant offsets from the same base register. Return true on success. When returning true, set *REVERSED to true if *MEM1 comes after *MEM2, false if *MEM1 comes before *MEM2. */ static bool aarch64_check_consecutive_mems (rtx *mem1, rtx *mem2, bool *reversed) { - *reversed = false; + if (reversed) + *reversed = false; + if (GET_RTX_CLASS (GET_CODE (XEXP (*mem1, 0))) == RTX_AUTOINC || GET_RTX_CLASS (GET_CODE (XEXP (*mem2, 0))) == RTX_AUTOINC) return false; @@ -24723,7 +24725,7 @@ aarch64_check_consecutive_mems (rtx *mem1, rtx *mem2, bool *reversed) if (known_eq (UINTVAL (offset1) + size1, UINTVAL (offset2))) return true; - if (known_eq (UINTVAL (offset2) + size2, UINTVAL (offset1))) + if (known_eq (UINTVAL (offset2) + size2, UINTVAL (offset1)) && reversed) { *reversed = true; return true; @@ -24756,22 +24758,25 @@ aarch64_check_consecutive_mems (rtx *mem1, rtx *mem2, bool *reversed) if (known_eq (expr_offset1 + size1, expr_offset2)) ; - else if (known_eq (expr_offset2 + size2, expr_offset1)) + else if (known_eq (expr_offset2 + size2, expr_offset1) && reversed) *reversed = true; else return false; - if (base2) + if (reversed) { - rtx addr1 = plus_constant (Pmode, XEXP (*mem2, 0), - expr_offset1 - expr_offset2); - *mem1 = replace_equiv_address_nv (*mem1, addr1); - } - else - { - rtx addr2 = plus_constant (Pmode, XEXP (*mem1, 0), - expr_offset2 - expr_offset1); - *mem2 = replace_equiv_address_nv (*mem2, addr2); + if (base2) + { + rtx addr1 = plus_constant (Pmode, XEXP (*mem2, 0), + expr_offset1 - expr_offset2); + *mem1 = replace_equiv_address_nv (*mem1, addr1); + } + else + { + rtx addr2 = plus_constant (Pmode, XEXP (*mem1, 0), + expr_offset2 - expr_offset1); + *mem2 = replace_equiv_address_nv (*mem2, addr2); + } } return true; } @@ -24779,6 +24784,17 @@ aarch64_check_consecutive_mems (rtx *mem1, rtx *mem2, bool *reversed) return false; } +/* Return true if MEM1 and MEM2 can be combined into a single access + of mode MODE, with the combined access having the same address as MEM1. */ + +bool +aarch64_mergeable_load_pair_p (machine_mode mode, rtx mem1, rtx mem2) +{ + if (STRICT_ALIGNMENT && MEM_ALIGN (mem1) < GET_MODE_ALIGNMENT (mode)) + return false; + return aarch64_check_consecutive_mems (&mem1, &mem2, nullptr); +} + /* Given OPERANDS of consecutive load/store, check if we can merge them into ldp/stp. LOAD is true if they are load instructions. MODE is the mode of memory operands. */ -- cgit v1.1 From aeef5c57f161ad0258c5ab066ade2274bef3271a Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Wed, 9 Feb 2022 16:57:04 +0000 Subject: aarch64: Remove redundant vec_concat patterns move_lo_quad_internal_ and move_lo_quad_internal_be_ partially duplicate the later aarch64_combinez{,_be} patterns. The duplication itself is a regression. 
The only substantive differences between the two are: * combinez uses vector MOV (ORR) instead of element MOV (DUP). The former seems more likely to be handled via renaming. * combinez disparages the GPR->FPR alternative whereas move_lo_quad gave it equal cost. The new test gives a token example of when the combinez behaviour helps. gcc/ * config/aarch64/aarch64-simd.md (move_lo_quad_internal_) (move_lo_quad_internal_be_): Delete. (move_lo_quad_): Use aarch64_combine instead of the above. gcc/testsuite/ * gcc.target/aarch64/vec-init-8.c: New test. --- gcc/config/aarch64/aarch64-simd.md | 37 ++----------------------------------- 1 file changed, 2 insertions(+), 35 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index c5bc2ea..d6cd4c7 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -1584,46 +1584,13 @@ ;; On little-endian this is { operand, zeroes } ;; On big-endian this is { zeroes, operand } -(define_insn "move_lo_quad_internal_" - [(set (match_operand:VQMOV 0 "register_operand" "=w,w,w") - (vec_concat:VQMOV - (match_operand: 1 "register_operand" "w,r,r") - (match_operand: 2 "aarch64_simd_or_scalar_imm_zero")))] - "TARGET_SIMD && !BYTES_BIG_ENDIAN" - "@ - dup\\t%d0, %1.d[0] - fmov\\t%d0, %1 - dup\\t%d0, %1" - [(set_attr "type" "neon_dup,f_mcr,neon_dup") - (set_attr "length" "4") - (set_attr "arch" "simd,fp,simd")] -) - -(define_insn "move_lo_quad_internal_be_" - [(set (match_operand:VQMOV 0 "register_operand" "=w,w,w") - (vec_concat:VQMOV - (match_operand: 2 "aarch64_simd_or_scalar_imm_zero") - (match_operand: 1 "register_operand" "w,r,r")))] - "TARGET_SIMD && BYTES_BIG_ENDIAN" - "@ - dup\\t%d0, %1.d[0] - fmov\\t%d0, %1 - dup\\t%d0, %1" - [(set_attr "type" "neon_dup,f_mcr,neon_dup") - (set_attr "length" "4") - (set_attr "arch" "simd,fp,simd")] -) - (define_expand "move_lo_quad_" [(match_operand:VQMOV 0 "register_operand") (match_operand: 1 "register_operand")] "TARGET_SIMD" { - rtx zs = CONST0_RTX (mode); - if (BYTES_BIG_ENDIAN) - emit_insn (gen_move_lo_quad_internal_be_ (operands[0], operands[1], zs)); - else - emit_insn (gen_move_lo_quad_internal_ (operands[0], operands[1], zs)); + emit_insn (gen_aarch64_combine (operands[0], operands[1], + CONST0_RTX (mode))); DONE; } ) -- cgit v1.1 From 85ac2fe44fd4acf8350dd74ccb003a2050baad2a Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Wed, 9 Feb 2022 16:57:05 +0000 Subject: aarch64: Add more vec_combine patterns vec_combine is really one instruction on aarch64, provided that the lowpart element is in the same register as the destination vector. This patch adds patterns for that. The patch fixes a regression from GCC 8. Before the patch: int64x2_t s64q_1(int64_t a0, int64_t a1) { if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) return (int64x2_t) { a1, a0 }; else return (int64x2_t) { a0, a1 }; } generated: fmov d0, x0 ins v0.d[1], x1 ins v0.d[1], x1 ret whereas GCC 8 generated the more respectable: dup v0.2d, x0 ins v0.d[1], x1 ret gcc/ * config/aarch64/predicates.md (aarch64_reg_or_mem_pair_operand): New predicate. * config/aarch64/aarch64-simd.md (*aarch64_combine_internal) (*aarch64_combine_internal_be): New patterns. gcc/testsuite/ * gcc.target/aarch64/vec-init-9.c: New test. * gcc.target/aarch64/vec-init-10.c: Likewise. * gcc.target/aarch64/vec-init-11.c: Likewise. 
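A hedged sketch of the other case the new patterns cover, a combine whose result goes straight to memory; this is illustrative only and the STP selection is not guaranteed:

#include <arm_neon.h>

/* The store of a vec_concat of two GPR values can be matched by the STP
   alternatives of the new *aarch64_combine_internal patterns.  */
void
store_combined (int64x2_t *dst, int64_t a0, int64_t a1)
{
  *dst = vcombine_s64 (vdup_n_s64 (a0), vdup_n_s64 (a1));
}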
--- gcc/config/aarch64/aarch64-simd.md | 62 ++++++++++++++++++++++++++++++++++++++ gcc/config/aarch64/predicates.md | 4 +++ 2 files changed, 66 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index d6cd4c7..ead8039 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -4326,6 +4326,25 @@ [(set_attr "type" "neon_load1_1reg_q")] ) +;; This STP pattern is a partial duplicate of the general vec_concat patterns +;; below. The reason for having both of them is that the alternatives of +;; the later patterns do not have consistent register preferences: the STP +;; alternatives have no preference between GPRs and FPRs (and if anything, +;; the GPR form is more natural for scalar integers) whereas the other +;; alternatives *require* an FPR for operand 1 and prefer one for operand 2. +;; +;; Using "*" to hide the STP alternatives from the RA penalizes cases in +;; which the destination was always memory. On the other hand, expressing +;; the true preferences makes GPRs seem more palatable than they really are +;; for register destinations. +;; +;; Despite that, we do still want the general form to have STP alternatives, +;; in order to handle cases where a register destination is spilled. +;; +;; The best compromise therefore seemed to be to have a dedicated STP +;; pattern to catch cases in which the destination was always memory. +;; This dedicated pattern must come first. + (define_insn "store_pair_lanes" [(set (match_operand: 0 "aarch64_mem_pair_lanes_operand" "=Umn, Umn") (vec_concat: @@ -4338,6 +4357,49 @@ [(set_attr "type" "neon_stp, store_16")] ) +;; Form a vector whose least significant half comes from operand 1 and whose +;; most significant half comes from operand 2. The register alternatives +;; tie the least significant half to the same register as the destination, +;; so that only the other half needs to be handled explicitly. For the +;; reasons given above, the STP alternatives use ? for constraints that +;; the register alternatives either don't accept or themselves disparage. + +(define_insn "*aarch64_combine_internal" + [(set (match_operand: 0 "aarch64_reg_or_mem_pair_operand" "=w, w, w, Umn, Umn") + (vec_concat: + (match_operand:VDC 1 "register_operand" "0, 0, 0, ?w, ?r") + (match_operand:VDC 2 "aarch64_simd_nonimmediate_operand" "w, ?r, Utv, w, ?r")))] + "TARGET_SIMD + && !BYTES_BIG_ENDIAN + && (register_operand (operands[0], mode) + || register_operand (operands[2], mode))" + "@ + ins\t%0.d[1], %2.d[0] + ins\t%0.d[1], %2 + ld1\t{%0.d}[1], %2 + stp\t%d1, %d2, %y0 + stp\t%x1, %x2, %y0" + [(set_attr "type" "neon_ins_q, neon_from_gp_q, neon_load1_one_lane_q, neon_stp, store_16")] +) + +(define_insn "*aarch64_combine_internal_be" + [(set (match_operand: 0 "aarch64_reg_or_mem_pair_operand" "=w, w, w, Umn, Umn") + (vec_concat: + (match_operand:VDC 2 "aarch64_simd_nonimmediate_operand" "w, ?r, Utv, ?w, ?r") + (match_operand:VDC 1 "register_operand" "0, 0, 0, ?w, ?r")))] + "TARGET_SIMD + && BYTES_BIG_ENDIAN + && (register_operand (operands[0], mode) + || register_operand (operands[2], mode))" + "@ + ins\t%0.d[1], %2.d[0] + ins\t%0.d[1], %2 + ld1\t{%0.d}[1], %2 + stp\t%d2, %d1, %y0 + stp\t%x2, %x1, %y0" + [(set_attr "type" "neon_ins_q, neon_from_gp_q, neon_load1_one_lane_q, neon_stp, store_16")] +) + ;; In this insn, operand 1 should be low, and operand 2 the high part of the ;; dest vector. 
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md index 7dc4c15..c308015 100644 --- a/gcc/config/aarch64/predicates.md +++ b/gcc/config/aarch64/predicates.md @@ -254,6 +254,10 @@ false, ADDR_QUERY_LDP_STP_N)"))) +(define_predicate "aarch64_reg_or_mem_pair_operand" + (ior (match_operand 0 "register_operand") + (match_operand 0 "aarch64_mem_pair_lanes_operand"))) + (define_predicate "aarch64_prefetch_operand" (match_test "aarch64_address_valid_for_prefetch_p (op, false)")) -- cgit v1.1 From 4057266ce5afc1fccd5d4e4971103afaa4be63d4 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Wed, 9 Feb 2022 16:57:05 +0000 Subject: aarch64: Add a general vec_concat expander After previous patches, we have a (mostly new) group of vec_concat patterns as well as vestiges of the old move_lo/hi_quad patterns. (A previous patch removed the move_lo_quad insns, but we still have the move_hi_quad insns and both sets of expanders.) This patch is the first of two to remove the old move_lo/hi_quad stuff. It isn't technically a regression fix, but it seemed better to make the changes now rather than leave things in a half-finished and inconsistent state. This patch defines an aarch64_vec_concat expander that coerces the element operands into a valid form, including the ones added by the previous patch. This in turn lets us get rid of one move_lo/hi_quad pair. As a side-effect, it also means that vcombines of 2 vectors make better use of the available forms, like vec_inits of 2 scalars already do. gcc/ * config/aarch64/aarch64-protos.h (aarch64_split_simd_combine): Delete. * config/aarch64/aarch64-simd.md (@aarch64_combinez): Rename to... (*aarch64_combinez): ...this. (@aarch64_combinez_be): Rename to... (*aarch64_combinez_be): ...this. (@aarch64_vec_concat): New expander. (aarch64_combine): Use it. (@aarch64_simd_combine): Delete. * config/aarch64/aarch64.cc (aarch64_split_simd_combine): Delete. (aarch64_expand_vector_init): Use aarch64_vec_concat. gcc/testsuite/ * gcc.target/aarch64/vec-init-12.c: New test. --- gcc/config/aarch64/aarch64-protos.h | 2 - gcc/config/aarch64/aarch64-simd.md | 76 ++++++++++++++++++++++++------------- gcc/config/aarch64/aarch64.cc | 55 ++++----------------------- 3 files changed, 57 insertions(+), 76 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index b75ed35..392efa0 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -925,8 +925,6 @@ bool aarch64_split_128bit_move_p (rtx, rtx); bool aarch64_mov128_immediate (rtx); -void aarch64_split_simd_combine (rtx, rtx, rtx); - void aarch64_split_simd_move (rtx, rtx); /* Check for a legitimate floating point constant for FMOV. */ diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index ead8039..7acde0d 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -4403,7 +4403,7 @@ ;; In this insn, operand 1 should be low, and operand 2 the high part of the ;; dest vector. 
-(define_insn "@aarch64_combinez" +(define_insn "*aarch64_combinez" [(set (match_operand: 0 "register_operand" "=w,w,w") (vec_concat: (match_operand:VDC 1 "nonimmediate_operand" "w,?r,m") @@ -4417,7 +4417,7 @@ (set_attr "arch" "simd,fp,simd")] ) -(define_insn "@aarch64_combinez_be" +(define_insn "*aarch64_combinez_be" [(set (match_operand: 0 "register_operand" "=w,w,w") (vec_concat: (match_operand:VDC 2 "aarch64_simd_or_scalar_imm_zero") @@ -4431,38 +4431,62 @@ (set_attr "arch" "simd,fp,simd")] ) -(define_expand "aarch64_combine" - [(match_operand: 0 "register_operand") - (match_operand:VDC 1 "register_operand") - (match_operand:VDC 2 "aarch64_simd_reg_or_zero")] +;; Form a vector whose first half (in array order) comes from operand 1 +;; and whose second half (in array order) comes from operand 2. +;; This operand order follows the RTL vec_concat operation. +(define_expand "@aarch64_vec_concat" + [(set (match_operand: 0 "register_operand") + (vec_concat: + (match_operand:VDC 1 "general_operand") + (match_operand:VDC 2 "general_operand")))] "TARGET_SIMD" { - if (operands[2] == CONST0_RTX (mode)) + int lo = BYTES_BIG_ENDIAN ? 2 : 1; + int hi = BYTES_BIG_ENDIAN ? 1 : 2; + + if (MEM_P (operands[1]) + && MEM_P (operands[2]) + && aarch64_mergeable_load_pair_p (mode, operands[1], operands[2])) + /* Use load_pair_lanes. */ + ; + else if (operands[hi] == CONST0_RTX (mode)) { - if (BYTES_BIG_ENDIAN) - emit_insn (gen_aarch64_combinez_be (operands[0], operands[1], - operands[2])); - else - emit_insn (gen_aarch64_combinez (operands[0], operands[1], - operands[2])); + /* Use *aarch64_combinez. */ + if (!nonimmediate_operand (operands[lo], mode)) + operands[lo] = force_reg (mode, operands[lo]); } else - aarch64_split_simd_combine (operands[0], operands[1], operands[2]); - DONE; -} -) + { + /* Use *aarch64_combine_general. */ + operands[lo] = force_reg (mode, operands[lo]); + if (!aarch64_simd_nonimmediate_operand (operands[hi], mode)) + { + if (MEM_P (operands[hi])) + { + rtx addr = force_reg (Pmode, XEXP (operands[hi], 0)); + operands[hi] = replace_equiv_address (operands[hi], addr); + } + else + operands[hi] = force_reg (mode, operands[hi]); + } + } +}) -(define_expand "@aarch64_simd_combine" +;; Form a vector whose least significant half comes from operand 1 and whose +;; most significant half comes from operand 2. This operand order follows +;; arm_neon.h vcombine* intrinsics. +(define_expand "aarch64_combine" [(match_operand: 0 "register_operand") - (match_operand:VDC 1 "register_operand") - (match_operand:VDC 2 "register_operand")] + (match_operand:VDC 1 "general_operand") + (match_operand:VDC 2 "general_operand")] "TARGET_SIMD" - { - emit_insn (gen_move_lo_quad_ (operands[0], operands[1])); - emit_insn (gen_move_hi_quad_ (operands[0], operands[2])); - DONE; - } -[(set_attr "type" "multiple")] +{ + if (BYTES_BIG_ENDIAN) + std::swap (operands[1], operands[2]); + emit_insn (gen_aarch64_vec_concat (operands[0], operands[1], + operands[2])); + DONE; +} ) ;; l. diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index c47543a..af42d1b 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -4239,23 +4239,6 @@ aarch64_split_128bit_move_p (rtx dst, rtx src) return true; } -/* Split a complex SIMD combine. 
*/ - -void -aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2) -{ - machine_mode src_mode = GET_MODE (src1); - machine_mode dst_mode = GET_MODE (dst); - - gcc_assert (VECTOR_MODE_P (dst_mode)); - gcc_assert (register_operand (dst, dst_mode) - && register_operand (src1, src_mode) - && register_operand (src2, src_mode)); - - emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2)); - return; -} - /* Split a complex SIMD move. */ void @@ -20941,37 +20924,13 @@ aarch64_expand_vector_init (rtx target, rtx vals) of mode N in VALS and we must put their concatentation into TARGET. */ if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0)))) { - gcc_assert (known_eq (GET_MODE_SIZE (mode), - 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0))))); - rtx lo = XVECEXP (vals, 0, 0); - rtx hi = XVECEXP (vals, 0, 1); - machine_mode narrow_mode = GET_MODE (lo); - gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode); - gcc_assert (narrow_mode == GET_MODE (hi)); - - /* When we want to concatenate a half-width vector with zeroes we can - use the aarch64_combinez[_be] patterns. Just make sure that the - zeroes are in the right half. */ - if (BYTES_BIG_ENDIAN - && aarch64_simd_imm_zero (lo, narrow_mode) - && general_operand (hi, narrow_mode)) - emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo)); - else if (!BYTES_BIG_ENDIAN - && aarch64_simd_imm_zero (hi, narrow_mode) - && general_operand (lo, narrow_mode)) - emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi)); - else - { - /* Else create the two half-width registers and combine them. */ - if (!REG_P (lo)) - lo = force_reg (GET_MODE (lo), lo); - if (!REG_P (hi)) - hi = force_reg (GET_MODE (hi), hi); - - if (BYTES_BIG_ENDIAN) - std::swap (lo, hi); - emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi)); - } + machine_mode narrow_mode = GET_MODE (XVECEXP (vals, 0, 0)); + gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode + && known_eq (GET_MODE_SIZE (mode), + 2 * GET_MODE_SIZE (narrow_mode))); + emit_insn (gen_aarch64_vec_concat (narrow_mode, target, + XVECEXP (vals, 0, 0), + XVECEXP (vals, 0, 1))); return; } -- cgit v1.1 From bce43c0493f65d2589776f0dafa396d5477a84c7 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Wed, 9 Feb 2022 16:57:06 +0000 Subject: aarch64: Remove move_lo/hi_quad expanders This patch is the second of two to remove the old move_lo/hi_quad expanders and move_hi_quad insns. gcc/ * config/aarch64/aarch64-simd.md (@aarch64_split_simd_mov): Use aarch64_combine instead of move_lo/hi_quad. Tabify. (move_lo_quad_, aarch64_simd_move_hi_quad_): Delete. (aarch64_simd_move_hi_quad_be_, move_hi_quad_): Delete. (vec_pack_trunc_): Take general_operand elements and use aarch64_combine rather than move_lo/hi_quad to combine them. (vec_pack_trunc_df): Likewise. 
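For context, vec_pack_trunc is the optab the vectoriser uses for narrowing conversions, so a loop like the one below is one way to exercise the rewritten expander; the sketch assumes something like -O3 on a little-endian aarch64 target and is not taken from the patch:

/* Double to float narrowing; when vectorised this goes through
   vec_pack_trunc_df, which now combines the two halves with the
   general aarch64_vec_concat expander.  */
void
narrow (float *restrict dst, const double *restrict src, int n)
{
  for (int i = 0; i < n; i++)
    dst[i] = (float) src[i];
}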
--- gcc/config/aarch64/aarch64-simd.md | 111 ++++++------------------------------- 1 file changed, 18 insertions(+), 93 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 7acde0d..ef6e772 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -272,7 +272,7 @@ (define_expand "@aarch64_split_simd_mov" [(set (match_operand:VQMOV 0) - (match_operand:VQMOV 1))] + (match_operand:VQMOV 1))] "TARGET_SIMD" { rtx dst = operands[0]; @@ -280,23 +280,22 @@ if (GP_REGNUM_P (REGNO (src))) { - rtx src_low_part = gen_lowpart (mode, src); - rtx src_high_part = gen_highpart (mode, src); + rtx src_low_part = gen_lowpart (mode, src); + rtx src_high_part = gen_highpart (mode, src); + rtx dst_low_part = gen_lowpart (mode, dst); - emit_insn - (gen_move_lo_quad_ (dst, src_low_part)); - emit_insn - (gen_move_hi_quad_ (dst, src_high_part)); + emit_move_insn (dst_low_part, src_low_part); + emit_insn (gen_aarch64_combine (dst, dst_low_part, + src_high_part)); } - else { - rtx dst_low_part = gen_lowpart (mode, dst); - rtx dst_high_part = gen_highpart (mode, dst); + rtx dst_low_part = gen_lowpart (mode, dst); + rtx dst_high_part = gen_highpart (mode, dst); rtx lo = aarch64_simd_vect_par_cnst_half (mode, , false); rtx hi = aarch64_simd_vect_par_cnst_half (mode, , true); - emit_insn (gen_aarch64_get_half (dst_low_part, src, lo)); - emit_insn (gen_aarch64_get_half (dst_high_part, src, hi)); + emit_insn (gen_aarch64_get_half (dst_low_part, src, lo)); + emit_insn (gen_aarch64_get_half (dst_high_part, src, hi)); } DONE; } @@ -1580,69 +1579,6 @@ ;; What that means, is that the RTL descriptions of the below patterns ;; need to change depending on endianness. -;; Move to the low architectural bits of the register. -;; On little-endian this is { operand, zeroes } -;; On big-endian this is { zeroes, operand } - -(define_expand "move_lo_quad_" - [(match_operand:VQMOV 0 "register_operand") - (match_operand: 1 "register_operand")] - "TARGET_SIMD" -{ - emit_insn (gen_aarch64_combine (operands[0], operands[1], - CONST0_RTX (mode))); - DONE; -} -) - -;; Move operand1 to the high architectural bits of the register, keeping -;; the low architectural bits of operand2. 
-;; For little-endian this is { operand2, operand1 } -;; For big-endian this is { operand1, operand2 } - -(define_insn "aarch64_simd_move_hi_quad_" - [(set (match_operand:VQMOV 0 "register_operand" "+w,w") - (vec_concat:VQMOV - (vec_select: - (match_dup 0) - (match_operand:VQMOV 2 "vect_par_cnst_lo_half" "")) - (match_operand: 1 "register_operand" "w,r")))] - "TARGET_SIMD && !BYTES_BIG_ENDIAN" - "@ - ins\\t%0.d[1], %1.d[0] - ins\\t%0.d[1], %1" - [(set_attr "type" "neon_ins")] -) - -(define_insn "aarch64_simd_move_hi_quad_be_" - [(set (match_operand:VQMOV 0 "register_operand" "+w,w") - (vec_concat:VQMOV - (match_operand: 1 "register_operand" "w,r") - (vec_select: - (match_dup 0) - (match_operand:VQMOV 2 "vect_par_cnst_lo_half" ""))))] - "TARGET_SIMD && BYTES_BIG_ENDIAN" - "@ - ins\\t%0.d[1], %1.d[0] - ins\\t%0.d[1], %1" - [(set_attr "type" "neon_ins")] -) - -(define_expand "move_hi_quad_" - [(match_operand:VQMOV 0 "register_operand") - (match_operand: 1 "register_operand")] - "TARGET_SIMD" -{ - rtx p = aarch64_simd_vect_par_cnst_half (mode, , false); - if (BYTES_BIG_ENDIAN) - emit_insn (gen_aarch64_simd_move_hi_quad_be_ (operands[0], - operands[1], p)); - else - emit_insn (gen_aarch64_simd_move_hi_quad_ (operands[0], - operands[1], p)); - DONE; -}) - ;; Narrowing operations. (define_insn "aarch64_xtn_insn_le" @@ -1743,16 +1679,12 @@ (define_expand "vec_pack_trunc_" [(match_operand: 0 "register_operand") - (match_operand:VDN 1 "register_operand") - (match_operand:VDN 2 "register_operand")] + (match_operand:VDN 1 "general_operand") + (match_operand:VDN 2 "general_operand")] "TARGET_SIMD" { rtx tempreg = gen_reg_rtx (mode); - int lo = BYTES_BIG_ENDIAN ? 2 : 1; - int hi = BYTES_BIG_ENDIAN ? 1 : 2; - - emit_insn (gen_move_lo_quad_ (tempreg, operands[lo])); - emit_insn (gen_move_hi_quad_ (tempreg, operands[hi])); + emit_insn (gen_aarch64_vec_concat (tempreg, operands[1], operands[2])); emit_insn (gen_trunc2 (operands[0], tempreg)); DONE; }) @@ -3402,20 +3334,13 @@ (define_expand "vec_pack_trunc_df" [(set (match_operand:V2SF 0 "register_operand") - (vec_concat:V2SF - (float_truncate:SF - (match_operand:DF 1 "register_operand")) - (float_truncate:SF - (match_operand:DF 2 "register_operand")) - ))] + (vec_concat:V2SF + (float_truncate:SF (match_operand:DF 1 "general_operand")) + (float_truncate:SF (match_operand:DF 2 "general_operand"))))] "TARGET_SIMD" { rtx tmp = gen_reg_rtx (V2SFmode); - int lo = BYTES_BIG_ENDIAN ? 2 : 1; - int hi = BYTES_BIG_ENDIAN ? 1 : 2; - - emit_insn (gen_move_lo_quad_v2df (tmp, operands[lo])); - emit_insn (gen_move_hi_quad_v2df (tmp, operands[hi])); + emit_insn (gen_aarch64_vec_concatdf (tmp, operands[1], operands[2])); emit_insn (gen_aarch64_float_truncate_lo_v2sf (operands[0], tmp)); DONE; } -- cgit v1.1 From 83d7e720cd1d075312e798c4ebd2e093f03465fb Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Wed, 9 Feb 2022 16:57:06 +0000 Subject: aarch64: Extend vec_concat patterns to 8-byte vectors This patch extends the previous support for 16-byte vec_concat so that it supports pairs of 4-byte elements. This too isn't strictly a regression fix, since the 8-byte forms weren't affected by the same problems as the 16-byte forms, but it leaves things in a more consistent state. gcc/ * config/aarch64/iterators.md (VDCSIF): New mode iterator. (VDBL): Handle SF. (single_wx, single_type, single_dtype, dblq): New mode attributes. * config/aarch64/aarch64-simd.md (load_pair_lanes): Extend from VDC to VDCSIF. (store_pair_lanes): Likewise. (*aarch64_combine_internal): Likewise. 
(*aarch64_combine_internal_be): Likewise. (*aarch64_combinez): Likewise. (*aarch64_combinez_be): Likewise. * config/aarch64/aarch64.cc (aarch64_classify_address): Handle 8-byte modes for ADDR_QUERY_LDP_STP_N. (aarch64_print_operand): Likewise for %y. gcc/testsuite/ * gcc.target/aarch64/vec-init-13.c: New test. * gcc.target/aarch64/vec-init-14.c: Likewise. * gcc.target/aarch64/vec-init-15.c: Likewise. * gcc.target/aarch64/vec-init-16.c: Likewise. * gcc.target/aarch64/vec-init-17.c: Likewise. --- gcc/config/aarch64/aarch64-simd.md | 72 +++++++++++++++++++------------------- gcc/config/aarch64/aarch64.cc | 16 ++++++--- gcc/config/aarch64/iterators.md | 38 ++++++++++++++++++-- 3 files changed, 84 insertions(+), 42 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index ef6e772..1873342 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -4243,12 +4243,12 @@ (define_insn "load_pair_lanes" [(set (match_operand: 0 "register_operand" "=w") (vec_concat: - (match_operand:VDC 1 "memory_operand" "Utq") - (match_operand:VDC 2 "memory_operand" "m")))] + (match_operand:VDCSIF 1 "memory_operand" "Utq") + (match_operand:VDCSIF 2 "memory_operand" "m")))] "TARGET_SIMD && aarch64_mergeable_load_pair_p (mode, operands[1], operands[2])" - "ldr\\t%q0, %1" - [(set_attr "type" "neon_load1_1reg_q")] + "ldr\\t%0, %1" + [(set_attr "type" "neon_load1_1reg")] ) ;; This STP pattern is a partial duplicate of the general vec_concat patterns @@ -4273,12 +4273,12 @@ (define_insn "store_pair_lanes" [(set (match_operand: 0 "aarch64_mem_pair_lanes_operand" "=Umn, Umn") (vec_concat: - (match_operand:VDC 1 "register_operand" "w, r") - (match_operand:VDC 2 "register_operand" "w, r")))] + (match_operand:VDCSIF 1 "register_operand" "w, r") + (match_operand:VDCSIF 2 "register_operand" "w, r")))] "TARGET_SIMD" "@ - stp\\t%d1, %d2, %y0 - stp\\t%x1, %x2, %y0" + stp\t%1, %2, %y0 + stp\t%1, %2, %y0" [(set_attr "type" "neon_stp, store_16")] ) @@ -4292,37 +4292,37 @@ (define_insn "*aarch64_combine_internal" [(set (match_operand: 0 "aarch64_reg_or_mem_pair_operand" "=w, w, w, Umn, Umn") (vec_concat: - (match_operand:VDC 1 "register_operand" "0, 0, 0, ?w, ?r") - (match_operand:VDC 2 "aarch64_simd_nonimmediate_operand" "w, ?r, Utv, w, ?r")))] + (match_operand:VDCSIF 1 "register_operand" "0, 0, 0, ?w, ?r") + (match_operand:VDCSIF 2 "aarch64_simd_nonimmediate_operand" "w, ?r, Utv, w, ?r")))] "TARGET_SIMD && !BYTES_BIG_ENDIAN && (register_operand (operands[0], mode) || register_operand (operands[2], mode))" "@ - ins\t%0.d[1], %2.d[0] - ins\t%0.d[1], %2 - ld1\t{%0.d}[1], %2 - stp\t%d1, %d2, %y0 - stp\t%x1, %x2, %y0" - [(set_attr "type" "neon_ins_q, neon_from_gp_q, neon_load1_one_lane_q, neon_stp, store_16")] + ins\t%0.[1], %2.[0] + ins\t%0.[1], %2 + ld1\t{%0.}[1], %2 + stp\t%1, %2, %y0 + stp\t%1, %2, %y0" + [(set_attr "type" "neon_ins, neon_from_gp, neon_load1_one_lane, neon_stp, store_16")] ) (define_insn "*aarch64_combine_internal_be" [(set (match_operand: 0 "aarch64_reg_or_mem_pair_operand" "=w, w, w, Umn, Umn") (vec_concat: - (match_operand:VDC 2 "aarch64_simd_nonimmediate_operand" "w, ?r, Utv, ?w, ?r") - (match_operand:VDC 1 "register_operand" "0, 0, 0, ?w, ?r")))] + (match_operand:VDCSIF 2 "aarch64_simd_nonimmediate_operand" "w, ?r, Utv, ?w, ?r") + (match_operand:VDCSIF 1 "register_operand" "0, 0, 0, ?w, ?r")))] "TARGET_SIMD && BYTES_BIG_ENDIAN && (register_operand (operands[0], mode) || register_operand (operands[2], mode))" "@ 
- ins\t%0.d[1], %2.d[0] - ins\t%0.d[1], %2 - ld1\t{%0.d}[1], %2 - stp\t%d2, %d1, %y0 - stp\t%x2, %x1, %y0" - [(set_attr "type" "neon_ins_q, neon_from_gp_q, neon_load1_one_lane_q, neon_stp, store_16")] + ins\t%0.[1], %2.[0] + ins\t%0.[1], %2 + ld1\t{%0.}[1], %2 + stp\t%2, %1, %y0 + stp\t%2, %1, %y0" + [(set_attr "type" "neon_ins, neon_from_gp, neon_load1_one_lane, neon_stp, store_16")] ) ;; In this insn, operand 1 should be low, and operand 2 the high part of the @@ -4331,13 +4331,13 @@ (define_insn "*aarch64_combinez" [(set (match_operand: 0 "register_operand" "=w,w,w") (vec_concat: - (match_operand:VDC 1 "nonimmediate_operand" "w,?r,m") - (match_operand:VDC 2 "aarch64_simd_or_scalar_imm_zero")))] + (match_operand:VDCSIF 1 "nonimmediate_operand" "w,?r,m") + (match_operand:VDCSIF 2 "aarch64_simd_or_scalar_imm_zero")))] "TARGET_SIMD && !BYTES_BIG_ENDIAN" "@ - mov\\t%0.8b, %1.8b - fmov\t%d0, %1 - ldr\\t%d0, %1" + fmov\\t%0, %1 + fmov\t%0, %1 + ldr\\t%0, %1" [(set_attr "type" "neon_move, neon_from_gp, neon_load1_1reg") (set_attr "arch" "simd,fp,simd")] ) @@ -4345,13 +4345,13 @@ (define_insn "*aarch64_combinez_be" [(set (match_operand: 0 "register_operand" "=w,w,w") (vec_concat: - (match_operand:VDC 2 "aarch64_simd_or_scalar_imm_zero") - (match_operand:VDC 1 "nonimmediate_operand" "w,?r,m")))] + (match_operand:VDCSIF 2 "aarch64_simd_or_scalar_imm_zero") + (match_operand:VDCSIF 1 "nonimmediate_operand" "w,?r,m")))] "TARGET_SIMD && BYTES_BIG_ENDIAN" "@ - mov\\t%0.8b, %1.8b - fmov\t%d0, %1 - ldr\\t%d0, %1" + fmov\\t%0, %1 + fmov\t%0, %1 + ldr\\t%0, %1" [(set_attr "type" "neon_move, neon_from_gp, neon_load1_1reg") (set_attr "arch" "simd,fp,simd")] ) @@ -4362,8 +4362,8 @@ (define_expand "@aarch64_vec_concat" [(set (match_operand: 0 "register_operand") (vec_concat: - (match_operand:VDC 1 "general_operand") - (match_operand:VDC 2 "general_operand")))] + (match_operand:VDCSIF 1 "general_operand") + (match_operand:VDCSIF 2 "general_operand")))] "TARGET_SIMD" { int lo = BYTES_BIG_ENDIAN ? 2 : 1; diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index af42d1b..7bb97bd 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -9922,9 +9922,15 @@ aarch64_classify_address (struct aarch64_address_info *info, /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode corresponds to the actual size of the memory being loaded/stored and the mode of the corresponding addressing mode is half of that. */ - if (type == ADDR_QUERY_LDP_STP_N - && known_eq (GET_MODE_SIZE (mode), 16)) - mode = DFmode; + if (type == ADDR_QUERY_LDP_STP_N) + { + if (known_eq (GET_MODE_SIZE (mode), 16)) + mode = DFmode; + else if (known_eq (GET_MODE_SIZE (mode), 8)) + mode = SFmode; + else + return false; + } bool allow_reg_index_p = (!load_store_pair_p && ((vec_flags == 0 @@ -11404,7 +11410,9 @@ aarch64_print_operand (FILE *f, rtx x, int code) machine_mode mode = GET_MODE (x); if (!MEM_P (x) - || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16))) + || (code == 'y' + && maybe_ne (GET_MODE_SIZE (mode), 8) + && maybe_ne (GET_MODE_SIZE (mode), 16))) { output_operand_lossage ("invalid operand for '%%%c'", code); return; diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index a0c02e4..88067a3 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -236,6 +236,9 @@ ;; Double vector modes for combines. (define_mode_iterator VDC [V8QI V4HI V4BF V4HF V2SI V2SF DI DF]) +;; VDC plus SI and SF. 
+(define_mode_iterator VDCSIF [V8QI V4HI V4BF V4HF V2SI V2SF SI SF DI DF]) + ;; Polynomial modes for vector combines. (define_mode_iterator VDC_P [V8QI V4HI DI]) @@ -1436,8 +1439,8 @@ (define_mode_attr VDBL [(V8QI "V16QI") (V4HI "V8HI") (V4HF "V8HF") (V4BF "V8BF") (V2SI "V4SI") (V2SF "V4SF") - (SI "V2SI") (DI "V2DI") - (DF "V2DF")]) + (SI "V2SI") (SF "V2SF") + (DI "V2DI") (DF "V2DF")]) ;; Register suffix for double-length mode. (define_mode_attr Vdtype [(V4HF "8h") (V2SF "4s")]) @@ -1557,6 +1560,30 @@ (V4SI "2s") (V8HF "4h") (V4SF "2s")]) +;; Whether a mode fits in W or X registers (i.e. "w" for 32-bit modes +;; and "x" for 64-bit modes). +(define_mode_attr single_wx [(SI "w") (SF "w") + (V8QI "x") (V4HI "x") + (V4HF "x") (V4BF "x") + (V2SI "x") (V2SF "x") + (DI "x") (DF "x")]) + +;; Whether a mode fits in S or D registers (i.e. "s" for 32-bit modes +;; and "d" for 64-bit modes). +(define_mode_attr single_type [(SI "s") (SF "s") + (V8QI "d") (V4HI "d") + (V4HF "d") (V4BF "d") + (V2SI "d") (V2SF "d") + (DI "d") (DF "d")]) + +;; Whether a double-width mode fits in D or Q registers (i.e. "d" for +;; 32-bit modes and "q" for 64-bit modes). +(define_mode_attr single_dtype [(SI "d") (SF "d") + (V8QI "q") (V4HI "q") + (V4HF "q") (V4BF "q") + (V2SI "q") (V2SF "q") + (DI "q") (DF "q")]) + ;; Define corresponding core/FP element mode for each vector mode. (define_mode_attr vw [(V8QI "w") (V16QI "w") (V4HI "w") (V8HI "w") @@ -1849,6 +1876,13 @@ (V4x1DF "") (V4x2DF "_q") (V4x4BF "") (V4x8BF "_q")]) +;; Equivalent of the "q" attribute for the mode. +(define_mode_attr dblq [(SI "") (SF "") + (V8QI "_q") (V4HI "_q") + (V4HF "_q") (V4BF "_q") + (V2SI "_q") (V2SF "_q") + (DI "_q") (DF "_q")]) + (define_mode_attr vp [(V8QI "v") (V16QI "v") (V4HI "v") (V8HI "v") (V2SI "p") (V4SI "v") -- cgit v1.1 From ed3fea09b18f67e757b5768b42cb6e816626f1db Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Fri, 4 Feb 2022 13:07:17 -0600 Subject: rs6000: Correct function prototypes for vec_replace_unaligned Due to a pasto error in the documentation, vec_replace_unaligned was implemented with the same function prototypes as vec_replace_elt. It was intended that vec_replace_unaligned always specify output vectors as having type vector unsigned char, to emphasize that elements are potentially misaligned by this built-in function. This patch corrects the misimplementation. 2022-02-04 Bill Schmidt gcc/ * config/rs6000/rs6000-builtins.def (VREPLACE_UN_UV2DI): Change function prototype. (VREPLACE_UN_UV4SI): Likewise. (VREPLACE_UN_V2DF): Likewise. (VREPLACE_UN_V2DI): Likewise. (VREPLACE_UN_V4SF): Likewise. (VREPLACE_UN_V4SI): Likewise. * config/rs6000/rs6000-overload.def (VEC_REPLACE_UN): Change all function prototypes. * config/rs6000/vsx.md (vreplace_un_): Remove define_expand. (vreplace_un_): New define_insn. gcc/testsuite/ * gcc.target/powerpc/vec-replace-word-runnable.c: Handle expected prototypes for each call to vec_replace_unaligned. 
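To make the corrected interface concrete, a small usage sketch (assumes -mcpu=power10 and <altivec.h>; the function name is illustrative):

#include <altivec.h>

/* With this fix the result type is always vector unsigned char,
   regardless of the element type of the first argument.  */
vector unsigned char
replace_word (vector unsigned int v, unsigned int x)
{
  return vec_replace_unaligned (v, x, 3);
}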
--- gcc/config/rs6000/rs6000-builtins.def | 16 ++++++++-------- gcc/config/rs6000/rs6000-overload.def | 12 ++++++------ gcc/config/rs6000/vsx.md | 25 ++++++++++--------------- 3 files changed, 24 insertions(+), 29 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-builtins.def b/gcc/config/rs6000/rs6000-builtins.def index 2d1e63fb..ae2760c 100644 --- a/gcc/config/rs6000/rs6000-builtins.def +++ b/gcc/config/rs6000/rs6000-builtins.def @@ -3387,25 +3387,25 @@ const vull __builtin_altivec_vpextd (vull, vull); VPEXTD vpextd {} - const vull __builtin_altivec_vreplace_un_uv2di (vull, unsigned long long, \ - const int<4>); + const vuc __builtin_altivec_vreplace_un_uv2di (vull, unsigned long long, \ + const int<4>); VREPLACE_UN_UV2DI vreplace_un_v2di {} - const vui __builtin_altivec_vreplace_un_uv4si (vui, unsigned int, \ + const vuc __builtin_altivec_vreplace_un_uv4si (vui, unsigned int, \ const int<4>); VREPLACE_UN_UV4SI vreplace_un_v4si {} - const vd __builtin_altivec_vreplace_un_v2df (vd, double, const int<4>); + const vuc __builtin_altivec_vreplace_un_v2df (vd, double, const int<4>); VREPLACE_UN_V2DF vreplace_un_v2df {} - const vsll __builtin_altivec_vreplace_un_v2di (vsll, signed long long, \ - const int<4>); + const vuc __builtin_altivec_vreplace_un_v2di (vsll, signed long long, \ + const int<4>); VREPLACE_UN_V2DI vreplace_un_v2di {} - const vf __builtin_altivec_vreplace_un_v4sf (vf, float, const int<4>); + const vuc __builtin_altivec_vreplace_un_v4sf (vf, float, const int<4>); VREPLACE_UN_V4SF vreplace_un_v4sf {} - const vsi __builtin_altivec_vreplace_un_v4si (vsi, signed int, const int<4>); + const vuc __builtin_altivec_vreplace_un_v4si (vsi, signed int, const int<4>); VREPLACE_UN_V4SI vreplace_un_v4si {} const vull __builtin_altivec_vreplace_uv2di (vull, unsigned long long, \ diff --git a/gcc/config/rs6000/rs6000-overload.def b/gcc/config/rs6000/rs6000-overload.def index 49a6104..44e2945 100644 --- a/gcc/config/rs6000/rs6000-overload.def +++ b/gcc/config/rs6000/rs6000-overload.def @@ -3059,17 +3059,17 @@ VREPLACE_ELT_V2DF [VEC_REPLACE_UN, vec_replace_unaligned, __builtin_vec_replace_un] - vui __builtin_vec_replace_un (vui, unsigned int, const int); + vuc __builtin_vec_replace_un (vui, unsigned int, const int); VREPLACE_UN_UV4SI - vsi __builtin_vec_replace_un (vsi, signed int, const int); + vuc __builtin_vec_replace_un (vsi, signed int, const int); VREPLACE_UN_V4SI - vull __builtin_vec_replace_un (vull, unsigned long long, const int); + vuc __builtin_vec_replace_un (vull, unsigned long long, const int); VREPLACE_UN_UV2DI - vsll __builtin_vec_replace_un (vsll, signed long long, const int); + vuc __builtin_vec_replace_un (vsll, signed long long, const int); VREPLACE_UN_V2DI - vf __builtin_vec_replace_un (vf, float, const int); + vuc __builtin_vec_replace_un (vf, float, const int); VREPLACE_UN_V4SF - vd __builtin_vec_replace_un (vd, double, const int); + vuc __builtin_vec_replace_un (vd, double, const int); VREPLACE_UN_V2DF [VEC_REVB, vec_revb, __builtin_vec_revb] diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md index 2f5a2f7..b53de10 100644 --- a/gcc/config/rs6000/vsx.md +++ b/gcc/config/rs6000/vsx.md @@ -4197,21 +4197,6 @@ } [(set_attr "type" "vecsimple")]) -(define_expand "vreplace_un_" - [(set (match_operand:REPLACE_ELT 0 "register_operand") - (unspec:REPLACE_ELT [(match_operand:REPLACE_ELT 1 "register_operand") - (match_operand: 2 "register_operand") - (match_operand:QI 3 "const_0_to_12_operand")] - UNSPEC_REPLACE_UN))] - "TARGET_POWER10" -{ - /* 
Immediate value is the byte index Big Endian numbering. */ - emit_insn (gen_vreplace_elt__inst (operands[0], operands[1], - operands[2], operands[3])); - DONE; - } -[(set_attr "type" "vecsimple")]) - (define_insn "vreplace_elt__inst" [(set (match_operand:REPLACE_ELT 0 "register_operand" "=v") (unspec:REPLACE_ELT [(match_operand:REPLACE_ELT 1 "register_operand" "0") @@ -4222,6 +4207,16 @@ "vins %0,%2,%3" [(set_attr "type" "vecsimple")]) +(define_insn "vreplace_un_" + [(set (match_operand:V16QI 0 "register_operand" "=v") + (unspec:V16QI [(match_operand:REPLACE_ELT 1 "register_operand" "0") + (match_operand: 2 "register_operand" "r") + (match_operand:QI 3 "const_0_to_12_operand" "n")] + UNSPEC_REPLACE_UN))] + "TARGET_POWER10" + "vins %0,%2,%3" + [(set_attr "type" "vecsimple")]) + ;; VSX_EXTRACT optimizations ;; Optimize double d = (double) vec_extract (vi, ) ;; Get the element into the top position and use XVCVSWDP/XVCVUWDP -- cgit v1.1 From eefec38c992e3622a69de9667e91f0cafbff03cc Mon Sep 17 00:00:00 2001 From: Jeff Law Date: Wed, 9 Feb 2022 14:10:53 -0500 Subject: Avoid using predefined insn name for instruction with different semantics This isn't technically a regression, but it only impacts the v850 target and fixes a long standing code correctness issue. As outlined in slightly more detail in the PR, the v850 is using the pattern name "fnmasf4" and "fnmssf4" to generate fnmaf.s and fnmsf.s instructions respectively. Unfortunately fnmasf4 is expected to produce (-a * b) + c and fnmssf4 (-a * b) - c. Those v850 instructions actually negate the entire result. The fix is trivial. Use a different pattern name so that the combiner can still generate those instructions, but prevent those instructions from being used to implement GCC's notion of what fnmas and fnmss should be. This fixes pr97040 as well as a handful of testsuite failures for the v3e5 multilib. gcc/ PR target/97040 * config/v850/v850.md (*v850_fnmasf4): Renamed from fnmasf4. (*v850_fnmssf4): Renamed from fnmssf4 --- gcc/config/v850/v850.md | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/v850/v850.md b/gcc/config/v850/v850.md index ed51157..6ca31e3 100644 --- a/gcc/config/v850/v850.md +++ b/gcc/config/v850/v850.md @@ -2601,7 +2601,12 @@ (set_attr "type" "fpu")]) ;;; negative-multiply-add -(define_insn "fnmasf4" +;; Note the name on this and the following insn were previously fnmasf4 +;; and fnmssf4. Those names are known to the gimple->rtl expanders and +;; must implement specific semantics (negating one of the inputs to the +;; multiplication). The v850 instructions actually negate the entire +;; result. Thus the names have been changed and hidden. 
+(define_insn "*v850_fnmasf4" [(set (match_operand:SF 0 "register_operand" "=r") (neg:SF (fma:SF (match_operand:SF 1 "register_operand" "r") (match_operand:SF 2 "register_operand" "r") @@ -2612,7 +2617,7 @@ (set_attr "type" "fpu")]) ;; negative-multiply-subtract -(define_insn "fnmssf4" +(define_insn "*v850_fnmssf4" [(set (match_operand:SF 0 "register_operand" "=r") (neg:SF (fma:SF (match_operand:SF 1 "register_operand" "r") (match_operand:SF 2 "register_operand" "r") -- cgit v1.1 From 2b399dbabd48639ab4daac462c9d82c6cf3f99cc Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 9 Feb 2022 20:18:10 +0100 Subject: i386: Force inputs to a register to avoid lowpart_subreg failure [PR104458] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Input operands can be in the form of: (subreg:DI (reg:V2SF 96) 0) which chokes lowpart_subreg. Force inputs to a register, which is preferable even when the input operand is from memory. 2022-02-09 Uroš Bizjak gcc/ChangeLog: PR target/104458 * config/i386/i386-expand.cc (ix86_split_idivmod): Force operands[2] and operands[3] into a register.. gcc/testsuite/ChangeLog: PR target/104458 * gcc.target/i386/pr104458.c: New test. --- gcc/config/i386/i386-expand.cc | 3 +++ 1 file changed, 3 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index eb1930b..ce9607e 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -1407,6 +1407,9 @@ ix86_split_idivmod (machine_mode mode, rtx operands[], rtx scratch, tmp0, tmp1, tmp2; rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx); + operands[2] = force_reg (mode, operands[2]); + operands[3] = force_reg (mode, operands[3]); + switch (mode) { case E_SImode: -- cgit v1.1 From 41582f88ec01c5ce2f85ebc4ac2743eb426d6e33 Mon Sep 17 00:00:00 2001 From: Andrew Pinski Date: Wed, 9 Feb 2022 14:56:58 -0800 Subject: [COMMITTED] Fix PR aarch64/104474: ICE with vector float initializers and non-consts. The problem here is that the aarch64 back-end was placing const0_rtx into the constant vector RTL even if the mode was a floating point mode. The fix is instead to use CONST0_RTX and pass the mode to select the correct zero (either const_int or const_double). Committed as obvious after a bootstrap/test on aarch64-linux-gnu with no regressions. PR target/104474 gcc/ChangeLog: * config/aarch64/aarch64.cc (aarch64_sve_expand_vector_init_handle_trailing_constants): Use CONST0_RTX instead of const0_rtx for the non-constant elements. gcc/testsuite/ChangeLog: * gcc.target/aarch64/sve/pr104474-1.c: New test. * gcc.target/aarch64/sve/pr104474-2.c: New test. * gcc.target/aarch64/sve/pr104474-3.c: New test. 
--- gcc/config/aarch64/aarch64.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 7bb97bd..e3f18fb 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -21164,7 +21164,7 @@ aarch64_sve_expand_vector_init_handle_trailing_constants { rtx x = builder.elt (i + nelts_reqd - n_trailing_constants); if (!valid_for_const_vector_p (elem_mode, x)) - x = const0_rtx; + x = CONST0_RTX (elem_mode); v.quick_push (x); } rtx const_vec = v.build (); -- cgit v1.1 From 91a7e1daa7520489fafc0001d03c68bad4304f15 Mon Sep 17 00:00:00 2001 From: Roger Sayle Date: Thu, 3 Feb 2022 09:07:22 +0100 Subject: nvptx: Improved support for HFMode including neghf2 and abshf2 This patch adds more support for _Float16 (HFmode) to the nvptx backend. Currently negation, absolute value and floating point comparisons are implemented by promoting to float (SFmode). This patch adds suitable define_insns to nvptx.md, most conditional on TARGET_SM53 (-misa=sm_53). This patch also adds support for HFmode fused multiply-add. One subtlety is that neghf2 and abshf2 are implemented by (HImode) bit manipulation operations to update the sign bit. The NVidia PTX ISA documentation for neg.f16 and abs.f16 contains the caution "Future implementations may comply with the IEEE 754 standard by preserving the (NaN) payload and modifying only the sign bit". Given the availability of suitable replacements, I thought it best to provide IEEE 754 compliant implementations. If anyone observes a performance penalty from this choice I'm happy to provide a -ffast-math variant (or revisit this decision). This patch has been tested on nvptx-none hosted on x86_64-pc-linux-gnu (including newlib) with a make and make -k check with no new failures. gcc/ChangeLog: * config/nvptx/nvptx.md (*cmpf): New define_insn. (cstorehf4): New define_expand. (fmahf4): New define_insn. (neghf2): New define_insn. (abshf2): New define_insn. gcc/testsuite/ChangeLog: * gcc.target/nvptx/float16-3.c: New test case for neghf2. * gcc.target/nvptx/float16-4.c: New test case for abshf2. * gcc.target/nvptx/float16-5.c: New test case for fmahf4. * gcc.target/nvptx/float16-6.c: New test case. 
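As an illustration of the kind of source that now reaches these patterns when compiled with -misa=sm_53 (a hypothetical sketch, not one of the committed float16-*.c tests):

    /* The multiply-add may be contracted into fmahf4; depending on
       optimization the second statement folds either to abshf2 or to an
       HFmode compare (the new setp pattern) plus neghf2, both of which now
       stay in 16-bit registers instead of being promoted to SFmode.  */
    _Float16
    fma_then_flip (_Float16 a, _Float16 b, _Float16 c)
    {
      _Float16 r = a * b + c;
      return r < (_Float16) 0.0f ? -r : r;
    }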
--- gcc/config/nvptx/nvptx.md | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index 7463603..e26d24e 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -783,6 +783,14 @@ "" "%.\\tsetp%c1\\t%0, %2, %3;") +(define_insn "*cmphf" + [(set (match_operand:BI 0 "nvptx_register_operand" "=R") + (match_operator:BI 1 "nvptx_float_comparison_operator" + [(match_operand:HF 2 "nvptx_register_operand" "R") + (match_operand:HF 3 "nvptx_nonmemory_operand" "RF")]))] + "TARGET_SM53" + "%.\\tsetp%c1\\t%0, %2, %3;") + (define_insn "jump" [(set (pc) (label_ref (match_operand 0 "" "")))] @@ -973,6 +981,21 @@ DONE; }) +(define_expand "cstorehf4" + [(set (match_operand:SI 0 "nvptx_register_operand") + (match_operator:SI 1 "nvptx_float_comparison_operator" + [(match_operand:HF 2 "nvptx_register_operand") + (match_operand:HF 3 "nvptx_nonmemory_operand")]))] + "TARGET_SM53" +{ + rtx reg = gen_reg_rtx (BImode); + rtx cmp = gen_rtx_fmt_ee (GET_CODE (operands[1]), BImode, + operands[2], operands[3]); + emit_move_insn (reg, cmp); + emit_insn (gen_setccsi_from_bi (operands[0], reg)); + DONE; +}) + ;; Calls (define_insn "call_insn_" @@ -1160,6 +1183,26 @@ "TARGET_SM53" "%.\\tmul.f16\\t%0, %1, %2;") +(define_insn "fmahf4" + [(set (match_operand:HF 0 "nvptx_register_operand" "=R") + (fma:HF (match_operand:HF 1 "nvptx_register_operand" "R") + (match_operand:HF 2 "nvptx_nonmemory_operand" "RF") + (match_operand:HF 3 "nvptx_nonmemory_operand" "RF")))] + "TARGET_SM53" + "%.\\tfma%#.f16\\t%0, %1, %2, %3;") + +(define_insn "neghf2" + [(set (match_operand:HF 0 "nvptx_register_operand" "=R") + (neg:HF (match_operand:HF 1 "nvptx_register_operand" "R")))] + "" + "%.\\txor.b16\\t%0, %1, -32768;") + +(define_insn "abshf2" + [(set (match_operand:HF 0 "nvptx_register_operand" "=R") + (abs:HF (match_operand:HF 1 "nvptx_register_operand" "R")))] + "" + "%.\\tand.b16\\t%0, %1, 32767;") + (define_insn "exp2hf2" [(set (match_operand:HF 0 "nvptx_register_operand" "=R") (unspec:HF [(match_operand:HF 1 "nvptx_register_operand" "R")] -- cgit v1.1 From de12b919c74307c5c2a4c79a29683d21e622422e Mon Sep 17 00:00:00 2001 From: Roger Sayle Date: Thu, 3 Feb 2022 09:21:58 +0100 Subject: nvptx: Expand QI mode operations using SI mode instructions One of the unusual target features of the Nvidia PTX ISA is that it doesn't provide QI mode (byte sized) operations or registers. Somewhat conventionally, 8-bit quantities are read from/written to memory using special instructions, but stored internally using SImode (32-bit) registers. GCC's middle-end accomodates targets without QImode optabs, by widening operations until suitable support is found, and with the current nvptx backend this means 16-bit HImode operations. The inconvenience is that nvptx is also a TARGET_TRULY_NOOP_TRUNCATION=false target, meaning that additional instructions are required to convert between the SImode registers used to hold QImode values, and the HImode registers used to operate on them (and back again). This results in a large amount of shuffling and type conversion in code dealing with bytes, i.e. using char or Boolean types. This patch improves the situation by providing expanders in the nvptx machine description to perform QImode operations natively in SImode instead of HImode. 
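A hypothetical byte-level example of the kind affected (not taken from the patch's testsuite) is:

    /* Saturating byte add: previously each QImode operation was widened to
       HImode and the result converted back to the SImode register that
       actually holds the byte; handling QImode directly in SImode removes
       that shuffling.  */
    unsigned char
    sat_add_u8 (unsigned char a, unsigned char b)
    {
      unsigned char s = a + b;
      return s < a ? 0xff : s;
    }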
An alternate implementation might be to provide some form of target hook to specify which fallback modes to use during RTL expansion, but I think this requirement is unusual, and a solution entirely in the nvptx backend doesn't disturb/affect other targets. The improvements can be quite dramatic, as shown in the example below: int foo(int x, int y) { return (x==21) && (y==69); } previously with -O2 required 15 instructions: mov.u32 %r26, %ar0; mov.u32 %r27, %ar1; setp.eq.u32 %r31, %r26, 21; selp.u32 %r30, 1, 0, %r31; mov.u32 %r29, %r30; setp.eq.u32 %r34, %r27, 69; selp.u32 %r33, 1, 0, %r34; mov.u32 %r32, %r33; cvt.u16.u8 %r39, %r29; mov.u16 %r36, %r39; cvt.u16.u8 %r39, %r32; mov.u16 %r37, %r39; and.b16 %r35, %r36, %r37; cvt.u32.u16 %r38, %r35; cvt.u32.u8 %value, %r38; with this patch, now requires only 7 instructions: mov.u32 %r26, %ar0; mov.u32 %r27, %ar1; setp.eq.u32 %r31, %r26, 21; setp.eq.u32 %r34, %r27, 69; selp.u32 %r37, 1, 0, %r31; selp.u32 %r38, 1, 0, %r34; and.b32 %value, %r37, %r38; This patch has been tested on nvptx-none hosted on x86_64-pc-linux-gnu (including newlib) with a make and make -k check with no new failures. gcc/ChangeLog: * config/nvptx/nvptx.md (cmp): Renamed from *cmp. (setcc_from_bi): Additionally support QImode. (extendbi2): Additionally support QImode. (zero_extendbi2): Additionally support QImode. (any_sbinary, any_ubinary, any_sunary, any_uunary): New code iterators for signed and unsigned, binary and unary operations. (qi3, qi3, qi2, qi2): New expanders to perform QImode operations using SImode instructions. (cstoreqi4): New define_expand. (*ext_truncsi2_qi): New define_insn. (*zext_truncsi2_qi): New define_insn. gcc/testsuite/ChangeLog: * gcc.target/nvptx/bool-1.c: New test case. --- gcc/config/nvptx/nvptx.md | 114 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 107 insertions(+), 7 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index e26d24e..f53809ea 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -767,7 +767,7 @@ ;; Comparisons and branches -(define_insn "*cmp" +(define_insn "cmp" [(set (match_operand:BI 0 "nvptx_register_operand" "=R") (match_operator:BI 1 "nvptx_comparison_operator" [(match_operand:HSDIM 2 "nvptx_register_operand" "R") @@ -879,22 +879,22 @@ ;; Conditional stores (define_insn "setcc_from_bi" - [(set (match_operand:HSDIM 0 "nvptx_register_operand" "=R") - (ne:HSDIM (match_operand:BI 1 "nvptx_register_operand" "R") + [(set (match_operand:QHSDIM 0 "nvptx_register_operand" "=R") + (ne:QHSDIM (match_operand:BI 1 "nvptx_register_operand" "R") (const_int 0)))] "" "%.\\tselp%t0\\t%0, 1, 0, %1;") (define_insn "extendbi2" - [(set (match_operand:HSDIM 0 "nvptx_register_operand" "=R") - (sign_extend:HSDIM + [(set (match_operand:QHSDIM 0 "nvptx_register_operand" "=R") + (sign_extend:QHSDIM (match_operand:BI 1 "nvptx_register_operand" "R")))] "" "%.\\tselp%t0\\t%0, -1, 0, %1;") (define_insn "zero_extendbi2" - [(set (match_operand:HSDIM 0 "nvptx_register_operand" "=R") - (zero_extend:HSDIM + [(set (match_operand:QHSDIM 0 "nvptx_register_operand" "=R") + (zero_extend:QHSDIM (match_operand:BI 1 "nvptx_register_operand" "R")))] "" "%.\\tselp%t0\\t%0, 1, 0, %1;") @@ -2117,3 +2117,103 @@ return nvptx_output_red_partition (operands[0], operands[1]); } [(set_attr "predicable" "false")]) + +;; Expand QI mode operations using SI mode instructions. 
+(define_code_iterator any_sbinary [plus minus smin smax]) +(define_code_attr sbinary [(plus "add") (minus "sub") (smin "smin") (smax "smax")]) + +(define_code_iterator any_ubinary [and ior xor umin umax]) +(define_code_attr ubinary [(and "and") (ior "ior") (xor "xor") (umin "umin") + (umax "umax")]) + +(define_code_iterator any_sunary [neg abs]) +(define_code_attr sunary [(neg "neg") (abs "abs")]) + +(define_code_iterator any_uunary [not]) +(define_code_attr uunary [(not "one_cmpl")]) + +(define_expand "qi3" + [(set (match_operand:QI 0 "nvptx_register_operand") + (any_sbinary:QI (match_operand:QI 1 "nvptx_nonmemory_operand") + (match_operand:QI 2 "nvptx_nonmemory_operand")))] + "" +{ + rtx reg = gen_reg_rtx (SImode); + rtx op0 = convert_modes (SImode, QImode, operands[1], 0); + rtx op1 = convert_modes (SImode, QImode, operands[2], 0); + if ( == MINUS) + op0 = force_reg (SImode, op0); + emit_insn (gen_si3 (reg, op0, op1)); + emit_insn (gen_truncsiqi2 (operands[0], reg)); + DONE; +}) + +(define_expand "qi3" + [(set (match_operand:QI 0 "nvptx_register_operand") + (any_ubinary:QI (match_operand:QI 1 "nvptx_nonmemory_operand") + (match_operand:QI 2 "nvptx_nonmemory_operand")))] + "" +{ + rtx reg = gen_reg_rtx (SImode); + rtx op0 = convert_modes (SImode, QImode, operands[1], 1); + rtx op1 = convert_modes (SImode, QImode, operands[2], 1); + emit_insn (gen_si3 (reg, op0, op1)); + emit_insn (gen_truncsiqi2 (operands[0], reg)); + DONE; +}) + +(define_expand "qi2" + [(set (match_operand:QI 0 "nvptx_register_operand") + (any_sunary:QI (match_operand:QI 1 "nvptx_nonmemory_operand")))] + "" +{ + rtx reg = gen_reg_rtx (SImode); + rtx op0 = convert_modes (SImode, QImode, operands[1], 0); + emit_insn (gen_si2 (reg, op0)); + emit_insn (gen_truncsiqi2 (operands[0], reg)); + DONE; +}) + +(define_expand "qi2" + [(set (match_operand:QI 0 "nvptx_register_operand") + (any_uunary:QI (match_operand:QI 1 "nvptx_nonmemory_operand")))] + "" +{ + rtx reg = gen_reg_rtx (SImode); + rtx op0 = convert_modes (SImode, QImode, operands[1], 1); + emit_insn (gen_si2 (reg, op0)); + emit_insn (gen_truncsiqi2 (operands[0], reg)); + DONE; +}) + +(define_expand "cstoreqi4" + [(set (match_operand:SI 0 "nvptx_register_operand") + (match_operator:SI 1 "nvptx_comparison_operator" + [(match_operand:QI 2 "nvptx_nonmemory_operand") + (match_operand:QI 3 "nvptx_nonmemory_operand")]))] + "" +{ + rtx reg = gen_reg_rtx (BImode); + enum rtx_code code = GET_CODE (operands[1]); + int unsignedp = unsigned_condition_p (code); + rtx op2 = convert_modes (SImode, QImode, operands[2], unsignedp); + rtx op3 = convert_modes (SImode, QImode, operands[3], unsignedp); + rtx cmp = gen_rtx_fmt_ee (code, SImode, op2, op3); + emit_insn (gen_cmpsi (reg, cmp, op2, op3)); + emit_insn (gen_setccsi_from_bi (operands[0], reg)); + DONE; +}) + +(define_insn "*ext_truncsi2_qi" + [(set (match_operand:SI 0 "nvptx_register_operand" "=R") + (sign_extend:SI + (truncate:QI (match_operand:SI 1 "nvptx_register_operand" "R"))))] + "" + "%.\\tcvt.s32.s8\\t%0, %1;") + +(define_insn "*zext_truncsi2_qi" + [(set (match_operand:SI 0 "nvptx_register_operand" "=R") + (zero_extend:SI + (truncate:QI (match_operand:SI 1 "nvptx_register_operand" "R"))))] + "" + "%.\\tcvt.u32.u8\\t%0, %1;") -- cgit v1.1 From 26d7b8f9bdf9ffb414beaa1133672f2d04c954eb Mon Sep 17 00:00:00 2001 From: Roger Sayle Date: Thu, 3 Feb 2022 14:41:01 +0100 Subject: nvptx: Add support for 64-bit mul.hi (and other) instructions Now that the middle-end MULT_HIGHPART_EXPR pieces are in place, this patch adds support for 
nvptx's mul.hi.s64 and mul.hi.u64 instructions, as previously reviewed (provisionally pre-approved) back in August 2020: https://gcc.gnu.org/pipermail/gcc-patches/2020-August/551373.html Since then a few things have changed, so this patch uses the new SMUL_HIGHPART and UMUL_HIGHPART RTX expressions, but the test cases remain the same. Like the x86_64 backend, this patch retains the "trunc" forms of these instructions (while the RTL optimizers/combine may still generate them). Given that we're rapidly approaching stage 4, I also took the liberty of including support in nvptx.md for a few other instructions. With the new 64-bit highpart multiplication instructions added above, we can now provide a define_expand for efficient 64-bit (to 128-bit) widening multiplications. This patch also adds support for nvptx's testp.infinite instruction (for implementing __builtin_isinf) and the not.pred instruction. As an example of the code generation improvements, the function int foo(double x) { return __builtin_isinf(x); } previously generated with -O2: mov.f64 %r26, %ar0; abs.f64 %r28, %r26; setp.leu.f64 %r31, %r28, 0d7fefffffffffffff; selp.u32 %r30, 1, 0, %r31; mov.u32 %r29, %r30; cvt.u16.u8 %r35, %r29; mov.u16 %r33, %r35; xor.b16 %r32, %r33, 1; cvt.u32.u16 %r34, %r32; cvt.u32.u8 %value, %r34; and with this patch now generates: mov.f64 %r23, %ar0; testp.infinite.f64 %r24, %r23; selp.u32 %value, 1, 0, %r24; This patch has been tested on nvptx-none hosted on x86_64-pc-linux-gnu (including newlib) with a make and make -k check with no new failures. gcc/ChangeLog: * config/nvptx/nvptx.md (UNSPEC_ISINF): New UNSPEC. (one_cmplbi2): New define_insn for not.pred. (mulditi3): New define_expand for signed widening multiply. (umulditi3): New define_expand for unsigned widening multiply. (smul3_highpart): New define_insn for signed highpart mult. (umul3_highpart): New define_insn for unsigned highpart mult. (*smulhi3_highpart_2): Renamed from smulhi3_highpart. (*smulsi3_highpart_2): Renamed from smulsi3_highpart. (*umulhi3_highpart_2): Renamed from umulhi3_highpart. (*umulsi3_highpart_2): Renamed from umulsi3_highpart. (*setcc_from_not_bi): New define_insn. (*setcc_isinf): New define_insn for testp.infinite. (isinf2): New define_expand. gcc/testsuite/ChangeLog: * gcc.target/nvptx/mul-hi64.c: New test case. * gcc.target/nvptx/umul-hi64.c: New test case. * gcc.target/nvptx/mul-wide64.c: New test case. * gcc.target/nvptx/umul-wide64.c: New test case. * gcc.target/nvptx/isinf.c: New test case. 
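For example, the high half of a 64x64-bit multiply (a hypothetical test, not one of the new testcases; it assumes __int128 support, which the 64-bit nvptx target provides) can now be emitted as a single mul.hi.u64:

    unsigned long long
    umul64_high (unsigned long long a, unsigned long long b)
    {
      return (unsigned long long) (((unsigned __int128) a * b) >> 64);
    }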
--- gcc/config/nvptx/nvptx.md | 91 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 87 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index f53809ea..d19a687 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -27,6 +27,7 @@ UNSPEC_SIN UNSPEC_COS UNSPEC_TANH + UNSPEC_ISINF UNSPEC_FPINT_FLOOR UNSPEC_FPINT_BTRUNC @@ -596,6 +597,12 @@ "" "%.\\tnot.b%T0\\t%0, %1;") +(define_insn "one_cmplbi2" + [(set (match_operand:BI 0 "nvptx_register_operand" "=R") + (not:BI (match_operand:BI 1 "nvptx_register_operand" "R")))] + "" + "%.\\tnot.pred\\t%0, %1;") + (define_insn "*cnot2" [(set (match_operand:HSDIM 0 "nvptx_register_operand" "=R") (eq:HSDIM (match_operand:HSDIM 1 "nvptx_register_operand" "R") @@ -671,7 +678,57 @@ "" "%.\\tmul.wide.u32\\t%0, %1, %2;") -(define_insn "smulhi3_highpart" +(define_expand "mulditi3" + [(set (match_operand:TI 0 "nvptx_register_operand") + (mult:TI (sign_extend:TI + (match_operand:DI 1 "nvptx_register_operand")) + (sign_extend:DI + (match_operand:DI 2 "nvptx_nonmemory_operand"))))] + "" +{ + rtx hi = gen_reg_rtx (DImode); + rtx lo = gen_reg_rtx (DImode); + emit_insn (gen_smuldi3_highpart (hi, operands[1], operands[2])); + emit_insn (gen_muldi3 (lo, operands[1], operands[2])); + emit_move_insn (gen_highpart (DImode, operands[0]), hi); + emit_move_insn (gen_lowpart (DImode, operands[0]), lo); + DONE; +}) + +(define_expand "umulditi3" + [(set (match_operand:TI 0 "nvptx_register_operand") + (mult:TI (zero_extend:TI + (match_operand:DI 1 "nvptx_register_operand")) + (zero_extend:DI + (match_operand:DI 2 "nvptx_nonmemory_operand"))))] + "" +{ + rtx hi = gen_reg_rtx (DImode); + rtx lo = gen_reg_rtx (DImode); + emit_insn (gen_umuldi3_highpart (hi, operands[1], operands[2])); + emit_insn (gen_muldi3 (lo, operands[1], operands[2])); + emit_move_insn (gen_highpart (DImode, operands[0]), hi); + emit_move_insn (gen_lowpart (DImode, operands[0]), lo); + DONE; +}) + +(define_insn "smul3_highpart" + [(set (match_operand:HSDIM 0 "nvptx_register_operand" "=R") + (smul_highpart:HSDIM + (match_operand:HSDIM 1 "nvptx_register_operand" "R") + (match_operand:HSDIM 2 "nvptx_nonmemory_operand" "Ri")))] + "" + "%.\\tmul.hi.s%T0\\t%0, %1, %2;") + +(define_insn "umul3_highpart" + [(set (match_operand:HSDIM 0 "nvptx_register_operand" "=R") + (umul_highpart:HSDIM + (match_operand:HSDIM 1 "nvptx_register_operand" "R") + (match_operand:HSDIM 2 "nvptx_nonmemory_operand" "Ri")))] + "" + "%.\\tmul.hi.u%T0\\t%0, %1, %2;") + +(define_insn "*smulhi3_highpart_2" [(set (match_operand:HI 0 "nvptx_register_operand" "=R") (truncate:HI (lshiftrt:SI @@ -683,7 +740,7 @@ "" "%.\\tmul.hi.s16\\t%0, %1, %2;") -(define_insn "smulsi3_highpart" +(define_insn "*smulsi3_highpart_2" [(set (match_operand:SI 0 "nvptx_register_operand" "=R") (truncate:SI (lshiftrt:DI @@ -695,7 +752,7 @@ "" "%.\\tmul.hi.s32\\t%0, %1, %2;") -(define_insn "umulhi3_highpart" +(define_insn "*umulhi3_highpart_2" [(set (match_operand:HI 0 "nvptx_register_operand" "=R") (truncate:HI (lshiftrt:SI @@ -707,7 +764,7 @@ "" "%.\\tmul.hi.u16\\t%0, %1, %2;") -(define_insn "umulsi3_highpart" +(define_insn "*umulsi3_highpart_2" [(set (match_operand:SI 0 "nvptx_register_operand" "=R") (truncate:SI (lshiftrt:DI @@ -885,6 +942,13 @@ "" "%.\\tselp%t0\\t%0, 1, 0, %1;") +(define_insn "*setcc_from_not_bi" + [(set (match_operand:HSDIM 0 "nvptx_register_operand" "=R") + (eq:HSDIM (match_operand:BI 1 "nvptx_register_operand" "R") + (const_int 0)))] + "" + 
"%.\\tselp%t0\\t%0, 0, 1, %1;") + (define_insn "extendbi2" [(set (match_operand:QHSDIM 0 "nvptx_register_operand" "=R") (sign_extend:QHSDIM @@ -1160,6 +1224,25 @@ "flag_unsafe_math_optimizations" "%.\\tex2.approx%t0\\t%0, %1;") +(define_insn "setcc_isinf" + [(set (match_operand:BI 0 "nvptx_register_operand" "=R") + (unspec:BI [(match_operand:SDFM 1 "nvptx_register_operand" "R")] + UNSPEC_ISINF))] + "" + "%.\\ttestp.infinite%t1\\t%0, %1;") + +(define_expand "isinf2" + [(set (match_operand:SI 0 "nvptx_register_operand" "=R") + (unspec:SI [(match_operand:SDFM 1 "nvptx_register_operand" "R")] + UNSPEC_ISINF))] + "" +{ + rtx pred = gen_reg_rtx (BImode); + emit_insn (gen_setcc_isinf (pred, operands[1])); + emit_insn (gen_setccsi_from_bi (operands[0], pred)); + DONE; +}) + ;; HFmode floating point arithmetic. (define_insn "addhf3" -- cgit v1.1 From f68c3de7fc9065d8c9ac75b3736ea27abffdce45 Mon Sep 17 00:00:00 2001 From: Roger Sayle Date: Thu, 3 Feb 2022 14:46:40 +0100 Subject: nvptx: Fix and use BI mode logic instructions (e.g. and.pred) This patch adds support for nvptx's BImode and.pred, or.pred and xor.pred instructions. Technically, nvptx.md previously defined andbi3, iorbi3 and xorbi3 instructions, but the assembly language mnemonic output for these was incorrect (e.g. and.b1) and would be rejected by the ptxas assembler. The most significant part of this patch is the new define_split which teaches the compiler to actually use these instructions when appropriate (exposing the latent bug above). After https://gcc.gnu.org/pipermail/gcc-patches/2022-January/587999.html, the function: int foo(int x, int y) { return (x==21) && (y==69); } when compiled with -O2 produces: mov.u32 %r26, %ar0; mov.u32 %r27, %ar1; setp.eq.u32 %r31, %r26, 21; setp.eq.u32 %r34, %r27, 69; selp.u32 %r37, 1, 0, %r31; selp.u32 %r38, 1, 0, %r34; and.b32 %value, %r37, %r38; with this patch we now save an extra instruction and generate: mov.u32 %r26, %ar0; mov.u32 %r27, %ar1; setp.eq.u32 %r31, %r26, 21; setp.eq.u32 %r34, %r27, 69; and.pred %r39, %r34, %r31; selp.u32 %value, 1, 0, %r39; This patch has been tested (on top of the patch mentioned above) on nvptx-none hosted on x86_64-pc-linux-gnu (including newlib) with a make and make -k check with no new failures. gcc/ChangeLog: * config/nvptx/nvptx.md (any_logic): Move code iterator earlier in machine description. (logic): Move code attribute earlier in machine description. (ilogic): New code attribute, like logic but "ior" for IOR. (and3, ior3, xor3): Delete. Replace with... (3): New define_insn for HSDIM logic operations. (bi3): New define_insn for BI mode logic operations. (define_split): Lower logic operations from integer modes to BI mode predicate operations. gcc/testsuite/ChangeLog: * gcc.target/nvptx/bool-1.c: Update. * gcc.target/nvptx/bool-2.c: New test case for and.pred. * gcc.target/nvptx/bool-3.c: New test case for or.pred. * gcc.target/nvptx/bool-4.c: New test case for xor.pred. 
--- gcc/config/nvptx/nvptx.md | 47 ++++++++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 19 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index d19a687..107df74 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -801,26 +801,38 @@ ;; Logical operations -(define_insn "and3" - [(set (match_operand:BHSDIM 0 "nvptx_register_operand" "=R") - (and:BHSDIM (match_operand:BHSDIM 1 "nvptx_register_operand" "R") - (match_operand:BHSDIM 2 "nvptx_nonmemory_operand" "Ri")))] - "" - "%.\\tand.b%T0\\t%0, %1, %2;") +(define_code_iterator any_logic [and ior xor]) +(define_code_attr logic [(and "and") (ior "or") (xor "xor")]) +(define_code_attr ilogic [(and "and") (ior "ior") (xor "xor")]) -(define_insn "ior3" - [(set (match_operand:BHSDIM 0 "nvptx_register_operand" "=R") - (ior:BHSDIM (match_operand:BHSDIM 1 "nvptx_register_operand" "R") - (match_operand:BHSDIM 2 "nvptx_nonmemory_operand" "Ri")))] +(define_insn "3" + [(set (match_operand:HSDIM 0 "nvptx_register_operand" "=R") + (any_logic:HSDIM + (match_operand:HSDIM 1 "nvptx_register_operand" "R") + (match_operand:HSDIM 2 "nvptx_nonmemory_operand" "Ri")))] "" - "%.\\tor.b%T0\\t%0, %1, %2;") + "%.\\t.b%T0\\t%0, %1, %2;") -(define_insn "xor3" - [(set (match_operand:BHSDIM 0 "nvptx_register_operand" "=R") - (xor:BHSDIM (match_operand:BHSDIM 1 "nvptx_register_operand" "R") - (match_operand:BHSDIM 2 "nvptx_nonmemory_operand" "Ri")))] +(define_insn "bi3" + [(set (match_operand:BI 0 "nvptx_register_operand" "=R") + (any_logic:BI (match_operand:BI 1 "nvptx_register_operand" "R") + (match_operand:BI 2 "nvptx_register_operand" "R")))] "" - "%.\\txor.b%T0\\t%0, %1, %2;") + "%.\\t.pred\\t%0, %1, %2;") + +(define_split + [(set (match_operand:HSDIM 0 "nvptx_register_operand") + (any_logic:HSDIM + (ne:HSDIM (match_operand:BI 1 "nvptx_register_operand") + (const_int 0)) + (ne:HSDIM (match_operand:BI 2 "nvptx_register_operand") + (const_int 0))))] + "can_create_pseudo_p ()" + [(set (match_dup 3) (any_logic:BI (match_dup 1) (match_dup 2))) + (set (match_dup 0) (ne:HSDIM (match_dup 3) (const_int 0)))] +{ + operands[3] = gen_reg_rtx (BImode); +}) ;; Comparisons and branches @@ -2042,9 +2054,6 @@ } [(set_attr "atomic" "true")]) -(define_code_iterator any_logic [and ior xor]) -(define_code_attr logic [(and "and") (ior "or") (xor "xor")]) - (define_insn "atomic_fetch_" [(set (match_operand:SDIM 1 "memory_operand" "+m") (unspec_volatile:SDIM -- cgit v1.1 From 9bacd7af2e3bba9ddad17e7de4e2d299419d819d Mon Sep 17 00:00:00 2001 From: Roger Sayle Date: Fri, 4 Feb 2022 04:13:53 +0100 Subject: PR target/104345: Use nvptx "set" instruction for cond ? -1 : 0 This patch addresses the "increased register pressure" regression on nvptx-none caused by my change to transition the backend to a STORE_FLAG_VALUE = 1 target. This improved code generation for the more common case of producing 0/1 Boolean values, but unfortunately made things marginally worse when a 0/-1 mask value is desired. Unfortunately, nvptx kernels are extremely sensitive to changes in register usage, which was observable in the reported PR. This patch provides optimizations for -(cond ? 1 : 0), effectively simplify this into cond ? -1 : 0, where these ternary operators are provided by nvptx's selp instruction, and for the specific case of SImode, using (restoring) nvptx's "set" instruction (which avoids the need for a predicate register). 
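A minimal illustration of the 0/-1 mask case being targeted (hypothetical, not the committed neg-selp.c test):

    /* -(cond ? 1 : 0) is now emitted as a single "set" instruction for
       SImode, or as a selp with -1/0 operands, instead of a selp producing
       0/1 followed by a separate negation.  */
    int
    mask_eq (int x, int y)
    {
      return -(x == y);
    }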
This patch has been tested on nvptx-none hosted on x86_64-pc-linux-gnu with a "make" and "make -k check" with no new failures. Unfortunately, the exact register usage of a nvptx kernel depends upon the version of the Cuda drivers being used (and the hardware), but I believe this change should resolve the PR (for Thomas) by improving code generation for the cases that regressed. gcc/ChangeLog: PR target/104345 * config/nvptx/nvptx.md (sel_true): Fix indentation. (sel_false): Likewise. (define_code_iterator eqne): New code iterator for EQ and NE. (*selp_neg_): New define_insn_and_split to optimize the negation of a selp instruction. (*selp_not_): New define_insn_and_split to optimize the bitwise not of a selp instruction. (*setcc_int): Use set instruction for neg:SI of a selp. gcc/testsuite/ChangeLog: PR target/104345 * gcc.target/nvptx/neg-selp.c: New test case. --- gcc/config/nvptx/nvptx.md | 58 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 54 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index 107df74..ad642e7 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -977,7 +977,7 @@ (define_insn "sel_true" [(set (match_operand:HSDIM 0 "nvptx_register_operand" "=R") - (if_then_else:HSDIM + (if_then_else:HSDIM (ne (match_operand:BI 1 "nvptx_register_operand" "R") (const_int 0)) (match_operand:HSDIM 2 "nvptx_nonmemory_operand" "Ri") (match_operand:HSDIM 3 "nvptx_nonmemory_operand" "Ri")))] @@ -986,7 +986,7 @@ (define_insn "sel_true" [(set (match_operand:SDFM 0 "nvptx_register_operand" "=R") - (if_then_else:SDFM + (if_then_else:SDFM (ne (match_operand:BI 1 "nvptx_register_operand" "R") (const_int 0)) (match_operand:SDFM 2 "nvptx_nonmemory_operand" "RF") (match_operand:SDFM 3 "nvptx_nonmemory_operand" "RF")))] @@ -995,7 +995,7 @@ (define_insn "sel_false" [(set (match_operand:HSDIM 0 "nvptx_register_operand" "=R") - (if_then_else:HSDIM + (if_then_else:HSDIM (eq (match_operand:BI 1 "nvptx_register_operand" "R") (const_int 0)) (match_operand:HSDIM 2 "nvptx_nonmemory_operand" "Ri") (match_operand:HSDIM 3 "nvptx_nonmemory_operand" "Ri")))] @@ -1004,13 +1004,63 @@ (define_insn "sel_false" [(set (match_operand:SDFM 0 "nvptx_register_operand" "=R") - (if_then_else:SDFM + (if_then_else:SDFM (eq (match_operand:BI 1 "nvptx_register_operand" "R") (const_int 0)) (match_operand:SDFM 2 "nvptx_nonmemory_operand" "RF") (match_operand:SDFM 3 "nvptx_nonmemory_operand" "RF")))] "" "%.\\tselp%t0\\t%0, %3, %2, %1;") +(define_code_iterator eqne [eq ne]) + +;; Split negation of a predicate into a conditional move. +(define_insn_and_split "*selp_neg_" + [(set (match_operand:HSDIM 0 "nvptx_register_operand" "=R") + (neg:HSDIM (eqne:HSDIM + (match_operand:BI 1 "nvptx_register_operand" "R") + (const_int 0))))] + "" + "#" + "&& 1" + [(set (match_dup 0) + (if_then_else:HSDIM + (eqne (match_dup 1) (const_int 0)) + (const_int -1) + (const_int 0)))]) + +;; Split bitwise not of a predicate into a conditional move. 
+(define_insn_and_split "*selp_not_" + [(set (match_operand:HSDIM 0 "nvptx_register_operand" "=R") + (not:HSDIM (eqne:HSDIM + (match_operand:BI 1 "nvptx_register_operand" "R") + (const_int 0))))] + "" + "#" + "&& 1" + [(set (match_dup 0) + (if_then_else:HSDIM + (eqne (match_dup 1) (const_int 0)) + (const_int -2) + (const_int -1)))]) + +(define_insn "*setcc_int" + [(set (match_operand:SI 0 "nvptx_register_operand" "=R") + (neg:SI + (match_operator:SI 1 "nvptx_comparison_operator" + [(match_operand:HSDIM 2 "nvptx_register_operand" "R") + (match_operand:HSDIM 3 "nvptx_nonmemory_operand" "Ri")])))] + "" + "%.\\tset%t0%c1\\t%0, %2, %3;") + +(define_insn "*setcc_int" + [(set (match_operand:SI 0 "nvptx_register_operand" "=R") + (neg:SI + (match_operator:SI 1 "nvptx_float_comparison_operator" + [(match_operand:SDFM 2 "nvptx_register_operand" "R") + (match_operand:SDFM 3 "nvptx_nonmemory_operand" "RF")])))] + "" + "%.\\tset%t0%c1\\t%0, %2, %3;") + (define_insn "setcc_float" [(set (match_operand:SF 0 "nvptx_register_operand" "=R") (match_operator:SF 1 "nvptx_comparison_operator" -- cgit v1.1 From 6d98e83b2c919bd9fba2c61333d613bafc37357f Mon Sep 17 00:00:00 2001 From: Roger Sayle Date: Tue, 8 Feb 2022 20:56:55 +0100 Subject: nvptx: Tweak constraints on copysign instructions Many thanks to Thomas Schwinge for confirming my hypothesis that the register usage regression, PR target/104345, is solely due to libgcc's _muldc3 function. In addition to the isinf functionality in the previously proposed nvptx patch at https://gcc.gnu.org/pipermail/gcc-patches/2022-January/588453.html which significantly reduces the number of instructions in _muldc3, the patch below further reduces both the number of instructions and the number of explicitly declared registers, by permitting floating point constant immediate operands in nvptx's copysign instruction. Fingers-crossed, the combination with all of the previous proposed nvptx patches improves things. Ultimately, increasing register usage from 50 to 51 registers, reducing the number of concurrent threads by ~2%, can easily be countered if we're now executing significantly fewer instructions in each kernel, for a net performance win. This patch has been tested on nvptx-none hosted on x86_64-pc-linux-gnu with a "make" and "make -k check" with no new failures. gcc/ChangeLog: * config/nvptx/nvptx.md (copysign3): Allow immediate floating point constants as operands 1 and/or 2. --- gcc/config/nvptx/nvptx.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index ad642e7..bb0c0b3 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -1209,8 +1209,8 @@ (define_insn "copysign3" [(set (match_operand:SDFM 0 "nvptx_register_operand" "=R") - (unspec:SDFM [(match_operand:SDFM 1 "nvptx_register_operand" "R") - (match_operand:SDFM 2 "nvptx_register_operand" "R")] + (unspec:SDFM [(match_operand:SDFM 1 "nvptx_nonmemory_operand" "RF") + (match_operand:SDFM 2 "nvptx_nonmemory_operand" "RF")] UNSPEC_COPYSIGN))] "" "%.\\tcopysign%t0\\t%0, %2, %1;") -- cgit v1.1 From 5b2d679bbbcc2b976c6e228ba63afdf67c33164e Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Mon, 7 Feb 2022 14:12:34 +0100 Subject: [nvptx] Workaround sub.u16 driver JIT bug There's a nvidia driver JIT bug that mishandles this code (minimized from builtin-arith-overflow-15.c): ... 
int main (void) { signed char r; unsigned char y = (unsigned char) 0x80; if (__builtin_sub_overflow ((unsigned char)0, (unsigned char)y, &r)) __builtin_abort (); return 0; } ... which at ptx level minimizes to: ... mov.u16 r22, 0x0080; st.local.u16 [frame_var],r22; ld.local.u16 r32,[frame_var]; sub.u16 r33,0x0000,r32; cvt.u32.u16 r35,r33; ... where we expect r35 == 0x0000ff80 but get instead 0xffffff80, and where using nvptx-none-run -O0 fixes the problem. [ See also https://github.com/vries/nvidia-bugs/tree/master/builtin-arith-overflow-15 . ] Try to workaround the bug by using sub.s16 instead of sub.u16. Tested on nvptx. gcc/ChangeLog: 2022-02-07 Tom de Vries PR target/97005 * config/nvptx/nvptx.md (define_insn "sub3"): Workaround driver JIT bug by using sub.s16 instead of sub.u16. --- gcc/config/nvptx/nvptx.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index bb0c0b3..cced68e 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -506,7 +506,14 @@ (minus:HSDIM (match_operand:HSDIM 1 "nvptx_register_operand" "R") (match_operand:HSDIM 2 "nvptx_register_operand" "R")))] "" - "%.\\tsub%t0\\t%0, %1, %2;") + { + if (GET_MODE (operands[0]) == HImode) + /* Workaround https://developer.nvidia.com/nvidia_bug/3527713. + See PR97005. */ + return "%.\\tsub.s16\\t%0, %1, %2;"; + + return "%.\\tsub%t0\\t%0, %1, %2;"; + }) (define_insn "mul3" [(set (match_operand:HSDIM 0 "nvptx_register_operand" "=R") -- cgit v1.1 From 3e7d4e82dc9fecb051e9ac422c312b26206d5ecd Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Thu, 13 Jan 2022 13:13:44 +0100 Subject: [nvptx] Handle pre-sm_7x shared atomic store using atomic exchange The ptx isa specifies (for pre-sm_7x) that atomic operations on shared memory locations do not guarantee atomicity with respect to normal store instructions to the same address. This can be fixed by: - inserting barriers between normal stores and atomic operations to a common address - using atom.exch to store to locations accessed by other atomic operations. It's not clearly spelled out which barriers are needed, and a barrier seem more expensive than atomic exchange. Implement the pre-sm_7x shared atomic store using atomic exchange. That includes stores using generic addressing, since those may also point to shared memory. Tested on x86-64 with nvptx accelerator. gcc/ChangeLog: 2022-02-02 Tom de Vries * config/nvptx/nvptx-protos.h (nvptx_mem_maybe_shared_p): Declare. * config/nvptx/nvptx.cc (nvptx_mem_data_area): New static function. (nvptx_mem_maybe_shared_p): New function. * config/nvptx/nvptx.md (define_expand "atomic_store"): New define_expand. gcc/testsuite/ChangeLog: 2022-02-02 Tom de Vries * gcc.target/nvptx/atomic-store-1.c: New test. * gcc.target/nvptx/atomic-store-3.c: New test. * gcc.target/nvptx/stack-atomics-run.c: Update. 
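A hypothetical example of an affected store (not one of the new atomic-store-*.c tests):

    /* On pre-sm_7x devices, if the pointer may refer to shared memory this
       is now emitted as atom.exch (with the result discarded) rather than a
       plain st, so it is atomic with respect to other atomic operations on
       the same location.  */
    void
    publish_flag (int *p)
    {
      __atomic_store_n (p, 1, __ATOMIC_RELAXED);
    }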
--- gcc/config/nvptx/nvptx-protos.h | 1 + gcc/config/nvptx/nvptx.cc | 22 ++++++++++++++++++++++ gcc/config/nvptx/nvptx.md | 30 ++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx-protos.h b/gcc/config/nvptx/nvptx-protos.h index a846e34..0bf9af4 100644 --- a/gcc/config/nvptx/nvptx-protos.h +++ b/gcc/config/nvptx/nvptx-protos.h @@ -60,5 +60,6 @@ extern const char *nvptx_output_simt_exit (rtx); extern const char *nvptx_output_red_partition (rtx, rtx); extern const char *nvptx_output_atomic_insn (const char *, rtx *, int, int); extern bool nvptx_mem_local_p (rtx); +extern bool nvptx_mem_maybe_shared_p (const_rtx); #endif #endif diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index 1b0227a..5b26c0f 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -76,6 +76,7 @@ #include "intl.h" #include "opts.h" #include "tree-pretty-print.h" +#include "rtl-iter.h" /* This file should be included last. */ #include "target-def.h" @@ -2787,6 +2788,27 @@ nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr) nvptx_print_address_operand (file, addr, mode); } +static nvptx_data_area +nvptx_mem_data_area (const_rtx x) +{ + gcc_assert (GET_CODE (x) == MEM); + + const_rtx addr = XEXP (x, 0); + subrtx_iterator::array_type array; + FOR_EACH_SUBRTX (iter, array, addr, ALL) + if (SYMBOL_REF_P (*iter)) + return SYMBOL_DATA_AREA (*iter); + + return DATA_AREA_GENERIC; +} + +bool +nvptx_mem_maybe_shared_p (const_rtx x) +{ + nvptx_data_area area = nvptx_mem_data_area (x); + return area == DATA_AREA_SHARED || area == DATA_AREA_GENERIC; +} + /* Print an operand, X, to FILE, with an optional modifier in CODE. Meaning of CODE: diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index cced68e..1a283b4 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -2051,6 +2051,36 @@ } [(set_attr "atomic" "true")]) +(define_expand "atomic_store" + [(match_operand:SDIM 0 "memory_operand" "=m") ;; memory + (match_operand:SDIM 1 "nvptx_nonmemory_operand" "Ri") ;; input + (match_operand:SI 2 "const_int_operand")] ;; model + "" +{ + struct address_info info; + decompose_mem_address (&info, operands[0]); + if (info.base != NULL && REG_P (*info.base) + && REGNO_PTR_FRAME_P (REGNO (*info.base))) + { + emit_insn (gen_mov (operands[0], operands[1])); + DONE; + } + + if (TARGET_SM70) + /* Fall back to expand_atomic_store. */ + FAIL; + + bool maybe_shared_p = nvptx_mem_maybe_shared_p (operands[0]); + if (!maybe_shared_p) + /* Fall back to expand_atomic_store. */ + FAIL; + + rtx tmpreg = gen_reg_rtx (mode); + emit_insn (gen_atomic_exchange (tmpreg, operands[0], operands[1], + operands[2])); + DONE; +}) + (define_insn "atomic_fetch_add" [(set (match_operand:SDIM 1 "memory_operand" "+m") (unspec_volatile:SDIM -- cgit v1.1 From 19a13d5a1d695465b3c3905b7c8ec888add1a39e Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Wed, 2 Feb 2022 16:23:37 +0100 Subject: [nvptx] Handle sm_7x shared atomic store more optimal For sm_7x atomic stores we fall back on expand_atomic_store, but this results in using membar.sys for shared stores. Fix this by adding an nvptx_atomic_store insn that adds a membar.cta for a shared store. Tested on x86_64 with nvptx accelerator. gcc/ChangeLog: 2022-02-02 Tom de Vries * config/nvptx/nvptx.md (define_insn "nvptx_atomic_store"): New define_insn. (define_expand "atomic_store"): Use nvptx_atomic_store for TARGET_SM70. (define_c_enum "unspecv"): Add UNSPECV_ST. 
gcc/testsuite/ChangeLog: 2022-02-02 Tom de Vries * gcc.target/nvptx/atomic-store-2.c: New test. --- gcc/config/nvptx/nvptx.md | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index 1a283b4..4c378ec 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -57,6 +57,7 @@ UNSPECV_CAS UNSPECV_CAS_LOCAL UNSPECV_XCHG + UNSPECV_ST UNSPECV_BARSYNC UNSPECV_WARPSYNC UNSPECV_UNIFORM_WARP_CHECK @@ -2067,8 +2068,11 @@ } if (TARGET_SM70) - /* Fall back to expand_atomic_store. */ - FAIL; + { + emit_insn (gen_nvptx_atomic_store (operands[0], operands[1], + operands[2])); + DONE; + } bool maybe_shared_p = nvptx_mem_maybe_shared_p (operands[0]); if (!maybe_shared_p) @@ -2081,6 +2085,20 @@ DONE; }) +(define_insn "nvptx_atomic_store" + [(set (match_operand:SDIM 0 "memory_operand" "+m") ;; memory + (unspec_volatile:SDIM + [(match_operand:SDIM 1 "nvptx_nonmemory_operand" "Ri") ;; input + (match_operand:SI 2 "const_int_operand")] ;; model + UNSPECV_ST))] + "TARGET_SM70" + { + const char *t + = "%.\tst%A0.b%T0\t%0, %1;"; + return nvptx_output_atomic_insn (t, operands, 0, 2); + } + [(set_attr "atomic" "true")]) + (define_insn "atomic_fetch_add" [(set (match_operand:SDIM 1 "memory_operand" "+m") (unspec_volatile:SDIM -- cgit v1.1 From 53fcc46339239c4958e2a15bb9e59274133bbcf7 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 10 Feb 2022 17:23:17 +0100 Subject: i386: Fix vec_unpacks_float_lo_v4si operand constraint [PR104469] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 2022-02-10 Uroš Bizjak gcc/ChangeLog: PR target/104469 * config/i386/sse.md (vec_unpacks_float_lo_v4si): Change operand 1 constraint to register_operand. gcc/testsuite/ChangeLog: PR target/104469 * gcc.target/i386/pr104469.c: New test. --- gcc/config/i386/sse.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 36b35f6..b2f5634 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -9223,7 +9223,7 @@ (define_expand "vec_unpacks_float_hi_v8si" [(set (match_dup 2) (vec_select:V4SI - (match_operand:V8SI 1 "vector_operand") + (match_operand:V8SI 1 "register_operand") (parallel [(const_int 4) (const_int 5) (const_int 6) (const_int 7)]))) (set (match_operand:V4DF 0 "register_operand") -- cgit v1.1 From fd64b09217fbe8fa33b559e61564071e8aca71e5 Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Thu, 10 Feb 2022 11:26:16 +0100 Subject: [nvptx] Handle asm insn in prevent_branch_around_nothing With GOMP_NVPTX_JIT=-00 and -mptx=3.1, I run into: ... FAIL: libgomp.oacc-c/../libgomp.oacc-c-c++-common/acc_prof-version-1.c \ -DACC_DEVICE_TYPE_nvidia=1 -DACC_MEM_SHARED=0 -foffload=nvptx-none -O2 \ execution test ... The problem is that we're generating a diverging branch around nothing: ... { .reg.u32 %x; mov.u32 %x, %tid.x; setp.ne.u32 %r23, %x, 0; } @%r23 bra $L2; $L2: ... which the driver JIT has problems with at -O0, so consequently we run into the nvptx_uniform_warp_check. Fix this by handling asm ("") and alike in prevent_branch_around_nothing. Tested on x86_64 with nvptx accelerator. gcc/ChangeLog: 2022-02-10 Tom de Vries PR target/104456 * config/nvptx/nvptx.cc (prevent_branch_around_nothing): Handle asm insn. 
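A rough, hypothetical reduction of the shape being handled (the original report came from an OpenACC profiling test):

    /* If the guarded region optimizes down to an insn that emits no code,
       such as an empty asm, the branch around it must still be recognized
       and neutralized by prevent_branch_around_nothing.  */
    void
    maybe_nothing (int cond)
    {
      if (cond)
        __asm__ ("" ::: "memory");
    }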
--- gcc/config/nvptx/nvptx.cc | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index 5b26c0f..afbad5b 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -5257,6 +5257,14 @@ prevent_branch_around_nothing (void) case CODE_FOR_nvptx_join: case CODE_FOR_nop: continue; + case -1: + /* Handle asm ("") and similar. */ + if (GET_CODE (PATTERN (insn)) == ASM_INPUT + || GET_CODE (PATTERN (insn)) == ASM_OPERANDS + || (GET_CODE (PATTERN (insn)) == PARALLEL + && asm_noperands (PATTERN (insn)) >= 0)) + continue; + /* FALLTHROUGH. */ default: seen_label = NULL; continue; -- cgit v1.1 From 4c3792d448964f7bd99e7eac2c29c9eb7c2bfb84 Mon Sep 17 00:00:00 2001 From: Iain Sandoe Date: Mon, 7 Feb 2022 15:36:35 +0000 Subject: LRA, rs6000, Darwin: Amend lo_sum use for forced constants [PR104117]. Two issues resulted in this PR, which manifests when we force a constant into memory in LRA (in PIC code on Darwin). The presence of such forced constants is quite dependent on other RTL optimisations, and it is easy for the issue to become latent for a specific case. First, in the Darwin-specific rs6000 backend code, we were not being careful enough in rejecting invalid symbolic addresses. Specifically, when generating PIC code, we require a SYMBOL_REF to be wrapped in an UNSPEC_MACHOPIC_OFFSET. Second, LRA was attempting to load a register using an invalid lo_sum address. Signed-off-by: Iain Sandoe Co-authored-by: Vladimir Makarov PR target/104117 gcc/ChangeLog: * config/rs6000/rs6000.cc (darwin_rs6000_legitimate_lo_sum_const_p): Check for UNSPEC_MACHOPIC_OFFSET wrappers on symbolic addresses when emitting PIC code. (legitimate_lo_sum_address_p): Likewise. * lra-constraints.cc (process_address_1): Do not attempt to emit a reg load from an invalid lo_sum address. --- gcc/config/rs6000/rs6000.cc | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index eaba9a2..bc3ef072 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -8317,8 +8317,14 @@ darwin_rs6000_legitimate_lo_sum_const_p (rtx x, machine_mode mode) if (GET_CODE (x) == CONST) x = XEXP (x, 0); + /* If we are building PIC code, then any symbol must be wrapped in an + UNSPEC_MACHOPIC_OFFSET so that it will get the picbase subtracted. */ + bool machopic_offs_p = false; if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_MACHOPIC_OFFSET) - x = XVECEXP (x, 0, 0); + { + x = XVECEXP (x, 0, 0); + machopic_offs_p = true; + } rtx sym = NULL_RTX; unsigned HOST_WIDE_INT offset = 0; @@ -8349,6 +8355,9 @@ darwin_rs6000_legitimate_lo_sum_const_p (rtx x, machine_mode mode) if (sym) { tree decl = SYMBOL_REF_DECL (sym); + /* As noted above, PIC code cannot use a bare SYMBOL_REF. 
*/ + if (TARGET_MACHO && flag_pic && !machopic_offs_p) + return false; #if TARGET_MACHO if (MACHO_SYMBOL_INDIRECTION_P (sym)) /* The decl in an indirection symbol is the original one, which might @@ -8936,7 +8945,7 @@ legitimate_lo_sum_address_p (machine_mode mode, rtx x, int strict) return false; x = XEXP (x, 1); - if (TARGET_ELF || TARGET_MACHO) + if (TARGET_ELF) { bool large_toc_ok; @@ -8962,7 +8971,32 @@ legitimate_lo_sum_address_p (machine_mode mode, rtx x, int strict) return CONSTANT_P (x) || large_toc_ok; } + else if (TARGET_MACHO) + { + if (GET_MODE_NUNITS (mode) != 1) + return false; + if (GET_MODE_SIZE (mode) > UNITS_PER_WORD + && !(/* see above */ + TARGET_HARD_FLOAT && (mode == DFmode || mode == DDmode))) + return false; +#if TARGET_MACHO + if (MACHO_DYNAMIC_NO_PIC_P || !flag_pic) + return CONSTANT_P (x); +#endif + /* Macho-O PIC code from here. */ + if (GET_CODE (x) == CONST) + x = XEXP (x, 0); + + /* SYMBOL_REFs need to be wrapped in an UNSPEC_MACHOPIC_OFFSET. */ + if (SYMBOL_REF_P (x)) + return false; + /* So this is OK if the wrapped object is const. */ + if (GET_CODE (x) == UNSPEC + && XINT (x, 1) == UNSPEC_MACHOPIC_OFFSET) + return CONSTANT_P (XVECEXP (x, 0, 0)); + return CONSTANT_P (x); + } return false; } -- cgit v1.1 From edadc7e0510b703d9727cf5ff68d55d84bb95def Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Sat, 12 Feb 2022 10:53:49 +0100 Subject: i386: Skip decimal float vector modes in type_natural_mode [PR79754] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 2022-02-12 Uroš Bizjak gcc/ChangeLog: PR target/79754 * config/i386/i386.cc (type_natural_mode): Skip decimal float vector modes. gcc/testsuite/ChangeLog: PR target/79754 * gcc.target/i386/pr79754.c: New test. --- gcc/config/i386/i386.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 6b97a2b..cf246e7 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -1876,10 +1876,14 @@ type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum, { machine_mode innermode = TYPE_MODE (TREE_TYPE (type)); - /* There are no XFmode vector modes. */ + /* There are no XFmode vector modes ... */ if (innermode == XFmode) return mode; + /* ... and no decimal float vector modes. */ + if (DECIMAL_FLOAT_MODE_P (innermode)) + return mode; + if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE) mode = MIN_MODE_VECTOR_FLOAT; else -- cgit v1.1 From 0538d42cdd68f6b65d72ed7768f1d00ba44f8631 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Sat, 12 Feb 2022 11:17:41 +0100 Subject: i386: Fix up cvtsd2ss splitter [PR104502] The following testcase ICEs, because AVX512F is enabled, AVX512VL is not, and the cvtsd2ss insn has %xmm0-15 as output operand and %xmm16-31 as input operand. For output operand %xmm16+ the splitter just gives up in such case, but for such input it just emits vmovddup which requires AVX512VL if either operand is EXT_REX_SSE_REG_P (when it is 128-bit). The following patch fixes it by treating that case like the pre-SSE3 output != input case - move the input to output and do everything on the output reg which is known to be < %xmm16. 2022-02-12 Jakub Jelinek PR target/104502 * config/i386/i386.md (cvtsd2ss splitter): If operands[1] is xmm16+ and AVX512VL isn't available, move operands[1] to operands[0] first. * gcc.target/i386/pr104502.c: New test. 
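Schematically, the affected conversion is the one below (a hypothetical sketch; triggering the bug additionally requires -mavx512f -mno-avx512vl and enough register pressure to place the DFmode input in %xmm16..%xmm31):

    float
    narrow_to_float (double x)
    {
      return (float) x;
    }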
--- gcc/config/i386/i386.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 74da0d4..8ffa641 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -4838,8 +4838,8 @@ movddup is available. */ if (REG_P (operands[1])) { - if (!TARGET_SSE3 - && REGNO (operands[0]) != REGNO (operands[1])) + if ((!TARGET_SSE3 && REGNO (operands[0]) != REGNO (operands[1])) + || (EXT_REX_SSE_REG_P (operands[1]) && !TARGET_AVX512VL)) { rtx tmp = lowpart_subreg (DFmode, operands[0], SFmode); emit_move_insn (tmp, operands[1]); -- cgit v1.1 From d51cad0b840a14c66732cb6a166c11ddf55d18b2 Mon Sep 17 00:00:00 2001 From: Andrew Stubbs Date: Sat, 12 Feb 2022 23:44:48 +0000 Subject: amdgcn: Allow vector reductions on constants Obviously it would be better if these reductions could be evaluated at compile time, but this will avoid an ICE. gcc/ChangeLog: * config/gcn/gcn.cc (gcn_expand_reduc_scalar): Use force_reg. --- gcc/config/gcn/gcn.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc index 74819c6..402f025 100644 --- a/gcc/config/gcn/gcn.cc +++ b/gcc/config/gcn/gcn.cc @@ -4460,7 +4460,7 @@ gcn_expand_reduc_scalar (machine_mode mode, rtx src, int unspec) pair of lanes, then on every pair of results from the previous iteration (thereby effectively reducing every 4 lanes) and so on until all lanes are reduced. */ - rtx in, out = src; + rtx in, out = force_reg (mode, src); for (int i = 0, shift = 1; i < 6; i++, shift <<= 1) { rtx shift_val = gen_rtx_CONST_INT (VOIDmode, shift); -- cgit v1.1 From 16b65b08484237cc2845c4f5c4f15efe3a43a32c Mon Sep 17 00:00:00 2001 From: Michael Meissner Date: Mon, 14 Feb 2022 17:42:14 -0500 Subject: Use correct names for __ibm128 if long double is IEEE 128-bit. If you are on a PowerPC system where the default long double is IEEE 128-bit (either through the compiler option -mabi=ieeelongdouble or via the configure option --with-long-double-format=ieee), GCC used the wrong names for some of the conversion functions for the __ibm128 type. Internally, GCC uses IFmode for __ibm128 if long double is IEEE 128-bit, instead of TFmode when long double is IBM 128-bit. This patch adds the missing conversions to prevent the 'if' name from being used. In particular, before the patch, the conversions used were: IFmode to DImode signed: __fixifdi instead of __fixtfdi IFmode to DImode unsigned __fixunsifti instead of __fixunstfti DImode to IFmode signed: __floatdiif instead of __floatditf DImode to IFmode unsigned: __floatundiif instead of __floatunditf 2022-02-14 Michael Meissner gcc/ PR target/104253 * config/rs6000/rs6000.cc (init_float128_ibm): Update the conversion functions used to convert IFmode types. gcc/testsuite/ PR target/104253 * gcc.target/powerpc/pr104253.c: New test. 
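A hypothetical illustration of conversions that must now call the TFmode entry points when long double is IEEE 128-bit:

    long long
    ibm128_to_ll (__ibm128 x)
    {
      return (long long) x;      /* __fixtfdi, not __fixifdi */
    }

    __ibm128
    ll_to_ibm128 (long long x)
    {
      return (__ibm128) x;       /* __floatditf, not __floatdiif */
    }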
--- gcc/config/rs6000/rs6000.cc | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index bc3ef072..e76c017 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -11018,6 +11018,12 @@ init_float128_ibm (machine_mode mode) set_conv_libfunc (trunc_optab, DDmode, mode, "__dpd_trunctfdd"); set_conv_libfunc (sext_optab, TDmode, mode, "__dpd_extendtftd"); + set_conv_libfunc (sfix_optab, DImode, mode, "__fixtfdi"); + set_conv_libfunc (ufix_optab, DImode, mode, "__fixunstfdi"); + + set_conv_libfunc (sfloat_optab, mode, DImode, "__floatditf"); + set_conv_libfunc (ufloat_optab, mode, DImode, "__floatunditf"); + if (TARGET_POWERPC64) { set_conv_libfunc (sfix_optab, TImode, mode, "__fixtfti"); -- cgit v1.1 From 0863d0ede34d21b2258686e6ccfd6dbb100bb754 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Tue, 15 Feb 2022 12:17:41 +0100 Subject: cygwin: Fix up -Werror=format-diag errors [PR104536] As the testcase reports, cygwin has 3 can%'t contractions in diagnostics, we use cannot everywhere else instead and -Wformat-diag enforces that. 2022-02-15 Jakub Jelinek PR target/104536 * config/i386/host-cygwin.cc (cygwin_gt_pch_get_address): Use cannot instead of can%'t in diagnostics. Formatting fixes. --- gcc/config/i386/host-cygwin.cc | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/host-cygwin.cc b/gcc/config/i386/host-cygwin.cc index fcf6333..05ad3a8 100644 --- a/gcc/config/i386/host-cygwin.cc +++ b/gcc/config/i386/host-cygwin.cc @@ -51,18 +51,18 @@ static void * cygwin_gt_pch_get_address (size_t sz, int fd) { void *base; - off_t p = lseek(fd, 0, SEEK_CUR); + off_t p = lseek (fd, 0, SEEK_CUR); if (p == (off_t) -1) - fatal_error (input_location, "can%'t get position in PCH file: %m"); + fatal_error (input_location, "cannot get position in PCH file: %m"); /* Cygwin requires that the underlying file be at least as large as the requested mapping. */ if ((size_t) p < sz) - { - if ( ftruncate (fd, sz) == -1 ) - fatal_error (input_location, "can%'t extend PCH file: %m"); - } + { + if (ftruncate (fd, sz) == -1) + fatal_error (input_location, "cannot extend PCH file: %m"); + } base = mmap (NULL, sz, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); @@ -71,8 +71,8 @@ cygwin_gt_pch_get_address (size_t sz, int fd) else munmap (base, sz); - if (lseek (fd, p, SEEK_SET) == (off_t) -1 ) - fatal_error (input_location, "can%'t set position in PCH file: %m"); + if (lseek (fd, p, SEEK_SET) == (off_t) -1) + fatal_error (input_location, "cannot set position in PCH file: %m"); return base; } -- cgit v1.1 From 4963079769c99c4073adfd799885410ad484cbbe Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Tue, 15 Feb 2022 18:09:33 +0000 Subject: vect+aarch64: Fix ldp_stp_* regressions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ldp_stp_1.c, ldp_stp_4.c and ldp_stp_5.c have been failing since vectorisation was enabled at -O2. In all three cases SLP is generating vector code when scalar code would be better. The problem is that the target costs do not model whether STP could be used for the scalar or vector code, so the normal latency-based costs for store-heavy code can be way off. It would be good to fix that “properly” at some point, but it isn't easy; see the existing discussion in aarch64_sve_adjust_stmt_cost for more details. 
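The shape at issue is essentially store-only code such as the hypothetical function below, where the scalar version is just two STPs but the latency-based costs alone cannot see that:

    void
    store_x4 (long *a, long x)
    {
      a[0] = x;
      a[1] = x;
      a[2] = x;
      a[3] = x;
    }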
This patch therefore adds an on-the-side check for whether the code is doing nothing more than set-up+stores. It then applies STP-based costs to those cases only, in addition to the normal latency-based costs. (That is, the vector code has to win on both counts rather than on one count individually.) However, at the moment, SLP costs one vector set-up instruction for every vector in an SLP node, even if the contents are the same as a previous vector in the same node. Fixing the STP costs without fixing that would regress other cases, tested in the patch. The patch therefore makes the SLP costing code check for duplicates within a node. Ideally we'd check for duplicates more globally, but that would require a more global approach to costs: the cost of an initialisation should be amoritised across all trees that use the initialisation, rather than fully counted against one arbitrarily-chosen subtree. Back on aarch64: an earlier version of the patch tried to apply the new heuristic to constant stores. However, that didn't work too well in practice; see the comments for details. The patch therefore just tests the status quo for constant cases, leaving out a match if the current choice is dubious. ldp_stp_5.c was affected by the same thing. The test would be worth vectorising if we generated better vector code, but: (1) We do a bad job of moving the { -1, 1 } constant, given that we have { -1, -1 } and { 1, 1 } to hand. (2) The vector code has 6 pairable stores to misaligned offsets. We have peephole patterns to handle such misalignment for 4 pairable stores, but not 6. So the SLP decision isn't wrong as such. It's just being let down by later codegen. The patch therefore adds -mstrict-align to preserve the original intention of the test while adding ldp_stp_19.c to check for the preferred vector code (XFAILed for now). gcc/ * tree-vectorizer.h (vect_scalar_ops_slice): New struct. (vect_scalar_ops_slice_hash): Likewise. (vect_scalar_ops_slice::op): New function. * tree-vect-slp.cc (vect_scalar_ops_slice::all_same_p): New function. (vect_scalar_ops_slice_hash::hash): Likewise. (vect_scalar_ops_slice_hash::equal): Likewise. (vect_prologue_cost_for_slp): Check for duplicate vectors. * config/aarch64/aarch64.cc (aarch64_vector_costs::m_stp_sequence_cost): New member variable. (aarch64_aligned_constant_offset_p): New function. (aarch64_stp_sequence_cost): Likewise. (aarch64_vector_costs::add_stmt_cost): Handle new STP heuristic. (aarch64_vector_costs::finish_cost): Likewise. gcc/testsuite/ * gcc.target/aarch64/ldp_stp_5.c: Require -mstrict-align. * gcc.target/aarch64/ldp_stp_14.h, * gcc.target/aarch64/ldp_stp_14.c: New test. * gcc.target/aarch64/ldp_stp_15.c: Likewise. * gcc.target/aarch64/ldp_stp_16.c: Likewise. * gcc.target/aarch64/ldp_stp_17.c: Likewise. * gcc.target/aarch64/ldp_stp_18.c: Likewise. * gcc.target/aarch64/ldp_stp_19.c: Likewise. --- gcc/config/aarch64/aarch64.cc | 140 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 140 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index e3f18fb..1a460d4 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -14932,6 +14932,31 @@ private: - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then we're costing SVE code. */ unsigned int m_vec_flags = 0; + /* At the moment, we do not model LDP and STP in the vector and scalar costs. 
+ This means that code such as: + + a[0] = x; + a[1] = x; + + will be costed as two scalar instructions and two vector instructions + (a scalar_to_vec and an unaligned_store). For SLP, the vector form + wins if the costs are equal, because of the fact that the vector costs + include constant initializations whereas the scalar costs don't. + We would therefore tend to vectorize the code above, even though + the scalar version can use a single STP. + + We should eventually fix this and model LDP and STP in the main costs; + see the comment in aarch64_sve_adjust_stmt_cost for some of the problems. + Until then, we look specifically for code that does nothing more than + STP-like operations. We cost them on that basis in addition to the + normal latency-based costs. + + If the scalar or vector code could be a sequence of STPs + + initialization, this variable counts the cost of the sequence, + with 2 units per instruction. The variable is ~0U for other + kinds of code. */ + unsigned int m_stp_sequence_cost = 0; + /* On some CPUs, SVE and Advanced SIMD provide the same theoretical vector throughput, such as 4x128 Advanced SIMD vs. 2x256 SVE. In those situations, we try to predict whether an Advanced SIMD implementation @@ -15724,6 +15749,104 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind, } } +/* Return true if STMT_INFO contains a memory access and if the constant + component of the memory address is aligned to SIZE bytes. */ +static bool +aarch64_aligned_constant_offset_p (stmt_vec_info stmt_info, + poly_uint64 size) +{ + if (!STMT_VINFO_DATA_REF (stmt_info)) + return false; + + if (auto first_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info)) + stmt_info = first_stmt; + tree constant_offset = DR_INIT (STMT_VINFO_DATA_REF (stmt_info)); + /* Needed for gathers & scatters, for example. */ + if (!constant_offset) + return false; + + return multiple_p (wi::to_poly_offset (constant_offset), size); +} + +/* Check if a scalar or vector stmt could be part of a region of code + that does nothing more than store values to memory, in the scalar + case using STP. Return the cost of the stmt if so, counting 2 for + one instruction. Return ~0U otherwise. + + The arguments are a subset of those passed to add_stmt_cost. */ +unsigned int +aarch64_stp_sequence_cost (unsigned int count, vect_cost_for_stmt kind, + stmt_vec_info stmt_info, tree vectype) +{ + /* Code that stores vector constants uses a vector_load to create + the constant. We don't apply the heuristic to that case for two + main reasons: + + - At the moment, STPs are only formed via peephole2, and the + constant scalar moves would often come between STRs and so + prevent STP formation. + + - The scalar code also has to load the constant somehow, and that + isn't costed. */ + switch (kind) + { + case scalar_to_vec: + /* Count 2 insns for a GPR->SIMD dup and 1 insn for a FPR->SIMD dup. */ + return (FLOAT_TYPE_P (vectype) ? 2 : 4) * count; + + case vec_construct: + if (FLOAT_TYPE_P (vectype)) + /* Count 1 insn for the maximum number of FP->SIMD INS + instructions. */ + return (vect_nunits_for_cost (vectype) - 1) * 2 * count; + + /* Count 2 insns for a GPR->SIMD move and 2 insns for the + maximum number of GPR->SIMD INS instructions. */ + return vect_nunits_for_cost (vectype) * 4 * count; + + case vector_store: + case unaligned_store: + /* Count 1 insn per vector if we can't form STP Q pairs. 
*/ + if (aarch64_sve_mode_p (TYPE_MODE (vectype))) + return count * 2; + if (aarch64_tune_params.extra_tuning_flags + & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) + return count * 2; + + if (stmt_info) + { + /* Assume we won't be able to use STP if the constant offset + component of the address is misaligned. ??? This could be + removed if we formed STP pairs earlier, rather than relying + on peephole2. */ + auto size = GET_MODE_SIZE (TYPE_MODE (vectype)); + if (!aarch64_aligned_constant_offset_p (stmt_info, size)) + return count * 2; + } + return CEIL (count, 2) * 2; + + case scalar_store: + if (stmt_info && STMT_VINFO_DATA_REF (stmt_info)) + { + /* Check for a mode in which STP pairs can be formed. */ + auto size = GET_MODE_SIZE (TYPE_MODE (aarch64_dr_type (stmt_info))); + if (maybe_ne (size, 4) && maybe_ne (size, 8)) + return ~0U; + + /* Assume we won't be able to use STP if the constant offset + component of the address is misaligned. ??? This could be + removed if we formed STP pairs earlier, rather than relying + on peephole2. */ + if (!aarch64_aligned_constant_offset_p (stmt_info, size)) + return ~0U; + } + return count; + + default: + return ~0U; + } +} + unsigned aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, stmt_vec_info stmt_info, tree vectype, @@ -15747,6 +15870,14 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, m_analyzed_vinfo = true; } + /* Apply the heuristic described above m_stp_sequence_cost. */ + if (m_stp_sequence_cost != ~0U) + { + uint64_t cost = aarch64_stp_sequence_cost (count, kind, + stmt_info, vectype); + m_stp_sequence_cost = MIN (m_stp_sequence_cost + cost, ~0U); + } + /* Try to get a more accurate cost by looking at STMT_INFO instead of just looking at KIND. */ if (stmt_info && aarch64_use_new_vector_costs_p ()) @@ -16017,6 +16148,15 @@ aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs) m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs, m_costs[vect_body]); + /* Apply the heuristic described above m_stp_sequence_cost. Prefer + the scalar code in the event of a tie, since there is more chance + of scalar code being optimized with surrounding operations. */ + if (!loop_vinfo + && scalar_costs + && m_stp_sequence_cost != ~0U + && m_stp_sequence_cost >= scalar_costs->m_stp_sequence_cost) + m_costs[vect_body] = 2 * scalar_costs->total_cost (); + vector_costs::finish_cost (scalar_costs); } -- cgit v1.1 From 8e84b2b37a541b27feea69769fc314d534464ebd Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Tue, 15 Feb 2022 18:09:35 +0000 Subject: aarch64: Fix subs_compare_2.c regression [PR100874] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit subs_compare_2.c tests that we can use a SUBS+CSEL sequence for: unsigned int foo (unsigned int a, unsigned int b) { unsigned int x = a - 4; if (a < 4) return x; else return 0; } As Andrew notes in the PR, this is effectively MIN (x, 4) - 4, and it is now recognised as such by phiopt. Previously it was if-converted in RTL instead. I tried to look for ways to generalise this to other situations and to other ?:-style operations, not just max and min. However, for general ?: we tend to push an outer “- CST” into the arms of the ?: -- at least if one of them simplifies -- so I didn't find any useful abstraction. This patch therefore adds a pattern specifically for max/min(a,cst)-cst. I'm not thrilled at having to do this, but it seems like the least worst fix in the circumstances. 
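A sketch of the max/min(a,cst)-cst shape the new pattern targets, with a made-up function name (not taken from the max_plus_*/min_plus_* tests added by this patch); the exact instruction and condition choice is up to the compiler:

```
/* MAX (a, 16) - 16, i.e. an unsigned saturating subtraction.  */
unsigned int
sat_sub16 (unsigned int a)
{
  return a > 16 ? a - 16 : 0;
}

/* One possible aarch64 sequence after the patch:
     subs  w1, w0, #16
     csel  w0, w1, wzr, hi  */
```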
Also, max(a,cst)-cst for unsigned a is a useful saturating subtraction idiom and so is arguably worth its own code for that reason. gcc/ PR target/100874 * config/aarch64/aarch64-protos.h (aarch64_maxmin_plus_const): Declare. * config/aarch64/aarch64.cc (aarch64_maxmin_plus_const): New function. * config/aarch64/aarch64.md (*aarch64_minmax_plus): New pattern. gcc/testsuite/ * gcc.target/aarch64/max_plus_1.c: New test. * gcc.target/aarch64/max_plus_2.c: Likewise. * gcc.target/aarch64/max_plus_3.c: Likewise. * gcc.target/aarch64/max_plus_4.c: Likewise. * gcc.target/aarch64/max_plus_5.c: Likewise. * gcc.target/aarch64/max_plus_6.c: Likewise. * gcc.target/aarch64/max_plus_7.c: Likewise. * gcc.target/aarch64/min_plus_1.c: Likewise. * gcc.target/aarch64/min_plus_2.c: Likewise. * gcc.target/aarch64/min_plus_3.c: Likewise. * gcc.target/aarch64/min_plus_4.c: Likewise. * gcc.target/aarch64/min_plus_5.c: Likewise. * gcc.target/aarch64/min_plus_6.c: Likewise. * gcc.target/aarch64/min_plus_7.c: Likewise. --- gcc/config/aarch64/aarch64-protos.h | 1 + gcc/config/aarch64/aarch64.cc | 104 ++++++++++++++++++++++++++++++++++++ gcc/config/aarch64/aarch64.md | 27 ++++++++++ 3 files changed, 132 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index 392efa0..d0e78d6 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -939,6 +939,7 @@ bool aarch64_legitimate_address_p (machine_mode, rtx, bool, aarch64_addr_query_type = ADDR_QUERY_M); machine_mode aarch64_select_cc_mode (RTX_CODE, rtx, rtx); rtx aarch64_gen_compare_reg (RTX_CODE, rtx, rtx); +bool aarch64_maxmin_plus_const (rtx_code, rtx *, bool); rtx aarch64_load_tp (rtx); void aarch64_expand_compare_and_swap (rtx op[]); diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 1a460d4..37ed22bc 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -3781,6 +3781,110 @@ aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y, return aarch64_gen_compare_reg (code, x, y); } +/* Consider the operation: + + OPERANDS[0] = CODE (OPERANDS[1], OPERANDS[2]) + OPERANDS[3] + + where: + + - CODE is [SU]MAX or [SU]MIN + - OPERANDS[2] and OPERANDS[3] are constant integers + - OPERANDS[3] is a positive or negative shifted 12-bit immediate + - all operands have mode MODE + + Decide whether it is possible to implement the operation using: + + SUBS , OPERANDS[1], -OPERANDS[3] + or + ADDS , OPERANDS[1], OPERANDS[3] + + followed by: + + OPERANDS[0], , [wx]zr, + + where is one of CSEL, CSINV or CSINC. Return true if so. + If GENERATE_P is true, also update OPERANDS as follows: + + OPERANDS[4] = -OPERANDS[3] + OPERANDS[5] = the rtl condition representing + OPERANDS[6] = + OPERANDS[7] = 0 for CSEL, -1 for CSINV or 1 for CSINC. */ +bool +aarch64_maxmin_plus_const (rtx_code code, rtx *operands, bool generate_p) +{ + signop sgn = (code == UMAX || code == UMIN ? UNSIGNED : SIGNED); + rtx dst = operands[0]; + rtx maxmin_op = operands[2]; + rtx add_op = operands[3]; + machine_mode mode = GET_MODE (dst); + + /* max (x, y) - z == (x >= y + 1 ? x : y) - z + == (x >= y ? x : y) - z + == (x > y ? x : y) - z + == (x > y - 1 ? x : y) - z + + min (x, y) - z == (x <= y - 1 ? x : y) - z + == (x <= y ? x : y) - z + == (x < y ? x : y) - z + == (x < y + 1 ? x : y) - z + + Check whether z is in { y - 1, y, y + 1 } and pick the form(s) for + which x is compared with z. Set DIFF to y - z. 
Thus the supported + combinations are as follows, with DIFF being the value after the ":": + + max (x, y) - z == x >= y + 1 ? x - (y + 1) : -1 [z == y + 1] + == x >= y ? x - y : 0 [z == y] + == x > y ? x - y : 0 [z == y] + == x > y - 1 ? x - (y - 1) : 1 [z == y - 1] + + min (x, y) - z == x <= y - 1 ? x - (y - 1) : 1 [z == y - 1] + == x <= y ? x - y : 0 [z == y] + == x < y ? x - y : 0 [z == y] + == x < y + 1 ? x - (y + 1) : -1 [z == y + 1]. */ + auto maxmin_val = rtx_mode_t (maxmin_op, mode); + auto add_val = rtx_mode_t (add_op, mode); + auto sub_val = wi::neg (add_val); + auto diff = wi::sub (maxmin_val, sub_val); + if (!(diff == 0 + || (diff == 1 && wi::gt_p (maxmin_val, sub_val, sgn)) + || (diff == -1 && wi::lt_p (maxmin_val, sub_val, sgn)))) + return false; + + if (!generate_p) + return true; + + rtx_code cmp; + switch (code) + { + case SMAX: + cmp = diff == 1 ? GT : GE; + break; + case UMAX: + cmp = diff == 1 ? GTU : GEU; + break; + case SMIN: + cmp = diff == -1 ? LT : LE; + break; + case UMIN: + cmp = diff == -1 ? LTU : LEU; + break; + default: + gcc_unreachable (); + } + rtx cc = gen_rtx_REG (CCmode, CC_REGNUM); + + operands[4] = immed_wide_int_const (sub_val, mode); + operands[5] = gen_rtx_fmt_ee (cmp, VOIDmode, cc, const0_rtx); + if (can_create_pseudo_p ()) + operands[6] = gen_reg_rtx (mode); + else + operands[6] = dst; + operands[7] = immed_wide_int_const (diff, mode); + + return true; +} + + /* Build the SYMBOL_REF for __tls_get_addr. */ static GTY(()) rtx tls_get_addr_libfunc; diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 3c72bda..64cc21d 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -4405,6 +4405,33 @@ } ) +;; Implement MAX/MIN (A, B) - C using SUBS/ADDS followed by CSEL/CSINV/CSINC. +;; See aarch64_maxmin_plus_const for details about the supported cases. +(define_insn_and_split "*aarch64_minmax_plus" + [(set (match_operand:GPI 0 "register_operand" "=r") + (plus:GPI + (MAXMIN:GPI + (match_operand:GPI 1 "register_operand" "r") + (match_operand:GPI 2 "const_int_operand")) + (match_operand:GPI 3 "aarch64_plus_immediate"))) + (clobber (reg:CC CC_REGNUM))] + "aarch64_maxmin_plus_const (, operands, false)" + "#" + "&& 1" + [(parallel + [(set (reg:CC CC_REGNUM) + (compare:CC (match_dup 1) (match_dup 4))) + (set (match_dup 6) + (plus:GPI (match_dup 1) (match_dup 3)))]) + (set (match_dup 0) + (if_then_else:GPI (match_dup 5) (match_dup 6) (match_dup 7)))] + { + if (!aarch64_maxmin_plus_const (, operands, true)) + gcc_unreachable (); + } + [(set_attr "length" "8")] +) + ;; ------------------------------------------------------------------- ;; Logical operations ;; ------------------------------------------------------------------- -- cgit v1.1 From 25332d2325c720f584444c3858efdb85b8a3c06a Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Wed, 16 Feb 2022 10:21:13 +0000 Subject: aarch64: Extend PR100056 patterns to + MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pr100056.c contains things like: int or_shift_u3a (unsigned i) { i &= 7; return i | (i << 11); } After g:96146e61cd7aee62c21c2845916ec42152918ab7, the preferred gimple representation of this is a multiplication: i_2 = i_1(D) & 7; _5 = i_2 * 2049; Expand then open-codes the multiplication back to individual shifts, but (of course) it uses + rather than | to combine the shifts. 
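For reference, a made-up variant of the testcase (not part of pr100056.c) showing why the PLUS form is interchangeable with IOR whenever the operands have no set bits in common:

```
unsigned int
or_shift_u3a_plus (unsigned int i)
{
  i &= 7;                  /* i now occupies bits 0..2 only.        */
  return i + (i << 11);    /* (i << 11) occupies bits 11..13, so no
                              carries occur: equals i | (i << 11).  */
}
```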
This means that we end up with the RTL equivalent of: i + (i << 11) I wondered about canonicalising the + to | (*back* to | in this case) when the operands have no set bits in common and when one of the operands is &, | or ^, but that didn't seem to be a popular idea when I asked on IRC. The feeling seemed to be that + is inherently simpler than |, so we shouldn't be “simplifying” the other way. This patch therefore adjusts the PR100056 patterns to handle + as well as |, in cases where the operands are provably disjoint. For: int or_shift_u8 (unsigned char i) { return i | (i << 11); } the instructions: 2: r95:SI=zero_extend(x0:QI) REG_DEAD x0:QI 7: r98:SI=r95:SI<<0xb are combined into: (parallel [ (set (reg:SI 98) (and:SI (ashift:SI (reg:SI 0 x0 [ i ]) (const_int 11 [0xb])) (const_int 522240 [0x7f800]))) (set (reg/v:SI 95 [ i ]) (zero_extend:SI (reg:QI 0 x0 [ i ]))) ]) which fails to match, but which is then split into its individual (independent) sets. Later the zero_extend is combined with the add to get an ADD UXTB: (set (reg:SI 99) (plus:SI (zero_extend:SI (reg:QI 0 x0 [ i ])) (reg:SI 98))) This means that there is never a 3-insn combo to match the split against. The end result is therefore: ubfiz w1, w0, 11, 8 add w0, w1, w0, uxtb This is a bit redundant, since it's doing the zero_extend twice. It is at least 2 instructions though, rather than the 3 that we had before the original patch for PR100056. or_shift_u8_asm is affected similarly. The net effect is that we do still have 2 UBFIZs, but we're at least back down to 2 instructions per function, as for GCC 11. I think that's good enough for now. There are probably other instructions that should be extended to support + as well as | (e.g. the EXTR ones), but those aren't regressions and so are GCC 13 material. gcc/ PR target/100056 * config/aarch64/iterators.md (LOGICAL_OR_PLUS): New iterator. * config/aarch64/aarch64.md: Extend the PR100056 patterns to handle plus in the same way as ior, if the operands have no set bits in common. gcc/testsuite/ PR target/100056 * gcc.target/aarch64/pr100056.c: XFAIL the original UBFIZ test and instead expect two UBFIZs + two ADD UXTBs. --- gcc/config/aarch64/aarch64.md | 33 +++++++++++++++++++++++---------- gcc/config/aarch64/iterators.md | 3 +++ 2 files changed, 26 insertions(+), 10 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 64cc21d..5909184 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -4558,7 +4558,7 @@ (define_split [(set (match_operand:GPI 0 "register_operand") - (LOGICAL:GPI + (LOGICAL_OR_PLUS:GPI (and:GPI (ashift:GPI (match_operand:GPI 1 "register_operand") (match_operand:QI 2 "aarch64_shift_imm_")) (match_operand:GPI 3 "const_int_operand")) @@ -4571,16 +4571,23 @@ && REGNO (operands[1]) == REGNO (operands[4]))) && (trunc_int_for_mode (GET_MODE_MASK (GET_MODE (operands[4])) << INTVAL (operands[2]), mode) - == INTVAL (operands[3]))" + == INTVAL (operands[3])) + && ( != PLUS + || (GET_MODE_MASK (GET_MODE (operands[4])) + & INTVAL (operands[3])) == 0)" [(set (match_dup 5) (zero_extend:GPI (match_dup 4))) - (set (match_dup 0) (LOGICAL:GPI (ashift:GPI (match_dup 5) (match_dup 2)) - (match_dup 5)))] - "operands[5] = gen_reg_rtx (mode);" + (set (match_dup 0) (match_dup 6))] + { + operands[5] = gen_reg_rtx (mode); + rtx shift = gen_rtx_ASHIFT (mode, operands[5], operands[2]); + rtx_code new_code = ( == PLUS ? 
IOR : ); + operands[6] = gen_rtx_fmt_ee (new_code, mode, shift, operands[5]); + } ) (define_split [(set (match_operand:GPI 0 "register_operand") - (LOGICAL:GPI + (LOGICAL_OR_PLUS:GPI (and:GPI (ashift:GPI (match_operand:GPI 1 "register_operand") (match_operand:QI 2 "aarch64_shift_imm_")) (match_operand:GPI 4 "const_int_operand")) @@ -4589,11 +4596,17 @@ && pow2_or_zerop (UINTVAL (operands[3]) + 1) && (trunc_int_for_mode (UINTVAL (operands[3]) << INTVAL (operands[2]), mode) - == INTVAL (operands[4]))" + == INTVAL (operands[4])) + && ( != PLUS + || (INTVAL (operands[4]) & INTVAL (operands[3])) == 0)" [(set (match_dup 5) (and:GPI (match_dup 1) (match_dup 3))) - (set (match_dup 0) (LOGICAL:GPI (ashift:GPI (match_dup 5) (match_dup 2)) - (match_dup 5)))] - "operands[5] = gen_reg_rtx (mode);" + (set (match_dup 0) (match_dup 6))] + { + operands[5] = gen_reg_rtx (mode); + rtx shift = gen_rtx_ASHIFT (mode, operands[5], operands[2]); + rtx_code new_code = ( == PLUS ? IOR : ); + operands[6] = gen_rtx_fmt_ee (new_code, mode, shift, operands[5]); + } ) (define_split diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 88067a3..e72fdf35 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -2122,6 +2122,9 @@ ;; Code iterator for logical operations (define_code_iterator LOGICAL [and ior xor]) +;; LOGICAL with plus, for when | gets converted to +. +(define_code_iterator LOGICAL_OR_PLUS [and ior xor plus]) + ;; LOGICAL without AND. (define_code_iterator LOGICAL_OR [ior xor]) -- cgit v1.1 From 687e57d7ac741d1c48ac030f87041aa56b888532 Mon Sep 17 00:00:00 2001 From: Michael Meissner Date: Wed, 16 Feb 2022 22:00:00 -0500 Subject: Define __SIZEOF_FLOAT128__ and __SIZEOF_IBM128__. Define the sizes of the PowerPC specific types __float128 and __ibm128 if those types are enabled. This patch will define __SIZEOF_IBM128__ and __SIZEOF_FLOAT128__ if their respective types are created in the compiler. Currently, this means both of these will be defined if float128 support is enabled. But at some point in the future, __ibm128 could be enabled without enabling float128 support and __SIZEOF_IBM128__ would be defined. 2022-02-16 Michael Meissner gcc/ PR target/99708 * config/rs6000/rs6000-c.cc (rs6000_cpu_cpp_builtins): Define __SIZEOF_IBM128__ if the IBM 128-bit long double type is created. Define __SIZEOF_FLOAT128__ if the IEEE 128-bit floating point type is created. gcc/testsuite/ PR target/99708 * gcc.target/powerpc/pr99708.c: New test. --- gcc/config/rs6000/rs6000-c.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000-c.cc b/gcc/config/rs6000/rs6000-c.cc index 15251ef..d2e480a 100644 --- a/gcc/config/rs6000/rs6000-c.cc +++ b/gcc/config/rs6000/rs6000-c.cc @@ -623,7 +623,11 @@ rs6000_cpu_cpp_builtins (cpp_reader *pfile) if (TARGET_FRSQRTES) builtin_define ("__RSQRTEF__"); if (TARGET_FLOAT128_TYPE) - builtin_define ("__FLOAT128_TYPE__"); + builtin_define ("__FLOAT128_TYPE__"); + if (ibm128_float_type_node) + builtin_define ("__SIZEOF_IBM128__=16"); + if (ieee128_float_type_node) + builtin_define ("__SIZEOF_FLOAT128__=16"); #ifdef TARGET_LIBC_PROVIDES_HWCAP_IN_TCB builtin_define ("__BUILTIN_CPU_SUPPORTS__"); #endif -- cgit v1.1 From 550cabd00238a8e74783ba6ad05a7580d074aabd Mon Sep 17 00:00:00 2001 From: liuhongt Date: Wed, 16 Feb 2022 15:00:59 +0800 Subject: Clean up MPX-related bit_{MPX,BNDREGS,BNDCSR}. gcc/ChangeLog: * config/i386/cpuid.h (bit_MPX): Removed. (bit_BNDREGS): Ditto. 
(bit_BNDCSR): Ditto. --- gcc/config/i386/cpuid.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/cpuid.h b/gcc/config/i386/cpuid.h index ed61130..8b3dc2b 100644 --- a/gcc/config/i386/cpuid.h +++ b/gcc/config/i386/cpuid.h @@ -86,7 +86,6 @@ #define bit_AVX2 (1 << 5) #define bit_BMI2 (1 << 8) #define bit_RTM (1 << 11) -#define bit_MPX (1 << 14) #define bit_AVX512F (1 << 16) #define bit_AVX512DQ (1 << 17) #define bit_RDSEED (1 << 18) @@ -136,10 +135,6 @@ #define bit_AMX_TILE (1 << 24) #define bit_AMX_INT8 (1 << 25) -/* XFEATURE_ENABLED_MASK register bits (%eax == 0xd, %ecx == 0) */ -#define bit_BNDREGS (1 << 3) -#define bit_BNDCSR (1 << 4) - /* Extended State Enumeration Sub-leaf (%eax == 0xd, %ecx == 1) */ #define bit_XSAVEOPT (1 << 0) #define bit_XSAVEC (1 << 1) -- cgit v1.1 From fac15bf84807a58f83c741b1034c1bc96348319d Mon Sep 17 00:00:00 2001 From: Robin Dapp Date: Thu, 17 Feb 2022 19:59:51 +0100 Subject: rs6000: Workaround for new ifcvt behavior [PR104335]. Since r12-6747-gaa8cfe785953a0 ifcvt passes a "cc comparison" i.e. the representation of the result of a comparison to the backend. rs6000_emit_int_cmove () is not prepared to handle this. Therefore, this patch makes it return false in such a case. PR target/104335 gcc/ChangeLog: * config/rs6000/rs6000.cc (rs6000_emit_int_cmove): Return false if the expected comparison's first operand is of mode MODE_CC. --- gcc/config/rs6000/rs6000.cc | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index e76c017..32a13cd 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -16215,6 +16215,12 @@ rs6000_emit_int_cmove (rtx dest, rtx op, rtx true_cond, rtx false_cond) if (mode != SImode && (!TARGET_POWERPC64 || mode != DImode)) return false; + /* PR104335: We now need to expect CC-mode "comparisons" + coming from ifcvt. The following code expects proper + comparisons so better abort here. */ + if (GET_MODE_CLASS (GET_MODE (XEXP (op, 0))) == MODE_CC) + return false; + /* We still have to do the compare, because isel doesn't do a compare, it just looks at the CRx bits set by a previous compare instruction. */ -- cgit v1.1 From efbb17db52afd802300c4dcce208fab326ec2915 Mon Sep 17 00:00:00 2001 From: "Paul A. Clarke" Date: Wed, 16 Feb 2022 20:01:41 -0600 Subject: rs6000: __Uglify non-uglified local variables in headers Properly prefix (with "__") all local variables in shipped headers for x86 compatibility intrinsics implementations. This avoids possible problems with usages like: ``` ``` 2022-02-16 Paul A. Clarke gcc PR target/104257 * config/rs6000/bmi2intrin.h: Uglify local variables. * config/rs6000/emmintrin.h: Likewise. * config/rs6000/mm_malloc.h: Likewise. * config/rs6000/mmintrin.h: Likewise. * config/rs6000/pmmintrin.h: Likewise. * config/rs6000/smmintrin.h: Likewise. * config/rs6000/tmmintrin.h: Likewise. * config/rs6000/xmmintrin.h: Likewise. 
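A hypothetical reproducer for the class of problem being fixed (the macro is illustrative user code, not taken from the PR):

```
/* Legal user code: an object-like macro whose name matched a local
   variable previously used inside the headers.  Before the renaming,
   declarations such as "__v2df result = (__v2df) __A;" in emmintrin.h
   would expand to "__v2df (1 + 1) = (__v2df) __A;" and fail to compile.  */
#define result (1 + 1)
#include <emmintrin.h>
```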
--- gcc/config/rs6000/bmi2intrin.h | 68 +-- gcc/config/rs6000/emmintrin.h | 908 ++++++++++++++++++++--------------------- gcc/config/rs6000/mm_malloc.h | 26 +- gcc/config/rs6000/mmintrin.h | 768 +++++++++++++++++----------------- gcc/config/rs6000/pmmintrin.h | 28 +- gcc/config/rs6000/smmintrin.h | 18 +- gcc/config/rs6000/tmmintrin.h | 4 +- gcc/config/rs6000/xmmintrin.h | 861 +++++++++++++++++++------------------- 8 files changed, 1340 insertions(+), 1341 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/bmi2intrin.h b/gcc/config/rs6000/bmi2intrin.h index f2d7eb5..b7a7ded 100644 --- a/gcc/config/rs6000/bmi2intrin.h +++ b/gcc/config/rs6000/bmi2intrin.h @@ -77,39 +77,39 @@ extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _pdep_u64 (unsigned long long __X, unsigned long long __M) { - unsigned long result = 0x0UL; - const unsigned long mask = 0x8000000000000000UL; - unsigned long m = __M; - unsigned long c, t; - unsigned long p; + unsigned long __result = 0x0UL; + const unsigned long __mask = 0x8000000000000000UL; + unsigned long __m = __M; + unsigned long __c, __t; + unsigned long __p; /* The pop-count of the mask gives the number of the bits from source to process. This is also needed to shift bits from the source into the correct position for the result. */ - p = 64 - __builtin_popcountl (__M); + __p = 64 - __builtin_popcountl (__M); /* The loop is for the number of '1' bits in the mask and clearing each mask bit as it is processed. */ - while (m != 0) + while (__m != 0) { - c = __builtin_clzl (m); - t = __X << (p - c); - m ^= (mask >> c); - result |= (t & (mask >> c)); - p++; + __c = __builtin_clzl (__m); + __t = __X << (__p - __c); + __m ^= (__mask >> __c); + __result |= (__t & (__mask >> __c)); + __p++; } - return (result); + return __result; } extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _pext_u64 (unsigned long long __X, unsigned long long __M) { - unsigned long p = 0x4040404040404040UL; // initial bit permute control - const unsigned long mask = 0x8000000000000000UL; - unsigned long m = __M; - unsigned long c; - unsigned long result; + unsigned long __p = 0x4040404040404040UL; // initial bit permute control + const unsigned long __mask = 0x8000000000000000UL; + unsigned long __m = __M; + unsigned long __c; + unsigned long __result; /* if the mask is constant and selects 8 bits or less we can use the Power8 Bit permute instruction. */ @@ -118,35 +118,35 @@ _pext_u64 (unsigned long long __X, unsigned long long __M) /* Also if the pext mask is constant, then the popcount is constant, we can evaluate the following loop at compile time and use a constant bit permute vector. */ - long i; - for (i = 0; i < __builtin_popcountl (__M); i++) + long __i; + for (__i = 0; __i < __builtin_popcountl (__M); __i++) { - c = __builtin_clzl (m); - p = (p << 8) | c; - m ^= (mask >> c); + __c = __builtin_clzl (__m); + __p = (__p << 8) | __c; + __m ^= (__mask >> __c); } - result = __builtin_bpermd (p, __X); + __result = __builtin_bpermd (__p, __X); } else { - p = 64 - __builtin_popcountl (__M); - result = 0; + __p = 64 - __builtin_popcountl (__M); + __result = 0; /* We could a use a for loop here, but that combined with -funroll-loops can expand to a lot of code. The while loop avoids unrolling and the compiler commons the xor from clearing the mask bit with the (m != 0) test. The result is a more compact loop setup and body. 
*/ - while (m != 0) + while (__m != 0) { - unsigned long t; - c = __builtin_clzl (m); - t = (__X & (mask >> c)) >> (p - c); - m ^= (mask >> c); - result |= (t); - p++; + unsigned long __t; + __c = __builtin_clzl (__m); + __t = (__X & (__mask >> __c)) >> (__p - __c); + __m ^= (__mask >> __c); + __result |= (__t); + __p++; } } - return (result); + return __result; } /* these 32-bit implementations depend on 64-bit pdep/pext diff --git a/gcc/config/rs6000/emmintrin.h b/gcc/config/rs6000/emmintrin.h index 71abcca..8329679 100644 --- a/gcc/config/rs6000/emmintrin.h +++ b/gcc/config/rs6000/emmintrin.h @@ -141,9 +141,9 @@ _mm_setzero_pd (void) extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_move_sd (__m128d __A, __m128d __B) { - __v2df result = (__v2df) __A; - result [0] = ((__v2df) __B)[0]; - return (__m128d) result; + __v2df __result = (__v2df) __A; + __result [0] = ((__v2df) __B)[0]; + return (__m128d) __result; } /* Load two DPFP values from P. The address must be 16-byte aligned. */ @@ -329,9 +329,9 @@ _mm_sqrt_pd (__m128d __A) extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sqrt_sd (__m128d __A, __m128d __B) { - __v2df c; - c = vec_sqrt ((__v2df) _mm_set1_pd (__B[0])); - return (__m128d) _mm_setr_pd (c[0], __A[1]); + __v2df __c; + __c = vec_sqrt ((__v2df) _mm_set1_pd (__B[0])); + return (__m128d) _mm_setr_pd (__c[0], __A[1]); } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -343,11 +343,11 @@ _mm_min_pd (__m128d __A, __m128d __B) extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_min_sd (__m128d __A, __m128d __B) { - __v2df a, b, c; - a = vec_splats (__A[0]); - b = vec_splats (__B[0]); - c = vec_min (a, b); - return (__m128d) _mm_setr_pd (c[0], __A[1]); + __v2df __a, __b, __c; + __a = vec_splats (__A[0]); + __b = vec_splats (__B[0]); + __c = vec_min (__a, __b); + return (__m128d) _mm_setr_pd (__c[0], __A[1]); } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -359,11 +359,11 @@ _mm_max_pd (__m128d __A, __m128d __B) extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_max_sd (__m128d __A, __m128d __B) { - __v2df a, b, c; - a = vec_splats (__A[0]); - b = vec_splats (__B[0]); - c = vec_max (a, b); - return (__m128d) _mm_setr_pd (c[0], __A[1]); + __v2df __a, __b, __c; + __a = vec_splats (__A[0]); + __b = vec_splats (__B[0]); + __c = vec_max (__a, __b); + return (__m128d) _mm_setr_pd (__c[0], __A[1]); } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -399,8 +399,8 @@ _mm_cmpge_pd (__m128d __A, __m128d __B) extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpneq_pd (__m128d __A, __m128d __B) { - __v2df temp = (__v2df) vec_cmpeq ((__v2df) __A, (__v2df)__B); - return ((__m128d)vec_nor (temp, temp)); + __v2df __temp = (__v2df) vec_cmpeq ((__v2df) __A, (__v2df)__B); + return ((__m128d)vec_nor (__temp, __temp)); } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -430,163 +430,163 @@ _mm_cmpnge_pd (__m128d __A, __m128d __B) extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpord_pd (__m128d __A, __m128d __B) { - __v2du c, d; + __v2du __c, __d; /* Compare against self will return false (0's) if NAN. 
*/ - c = (__v2du)vec_cmpeq (__A, __A); - d = (__v2du)vec_cmpeq (__B, __B); + __c = (__v2du)vec_cmpeq (__A, __A); + __d = (__v2du)vec_cmpeq (__B, __B); /* A != NAN and B != NAN. */ - return ((__m128d)vec_and(c, d)); + return ((__m128d)vec_and(__c, __d)); } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpunord_pd (__m128d __A, __m128d __B) { #if _ARCH_PWR8 - __v2du c, d; + __v2du __c, __d; /* Compare against self will return false (0's) if NAN. */ - c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A); - d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B); + __c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A); + __d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B); /* A == NAN OR B == NAN converts too: NOT(A != NAN) OR NOT(B != NAN). */ - c = vec_nor (c, c); - return ((__m128d)vec_orc(c, d)); + __c = vec_nor (__c, __c); + return ((__m128d)vec_orc(__c, __d)); #else - __v2du c, d; + __v2du __c, __d; /* Compare against self will return false (0's) if NAN. */ - c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A); - d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B); + __c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A); + __d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B); /* Convert the true ('1's) is NAN. */ - c = vec_nor (c, c); - d = vec_nor (d, d); - return ((__m128d)vec_or(c, d)); + __c = vec_nor (__c, __c); + __d = vec_nor (__d, __d); + return ((__m128d)vec_or(__c, __d)); #endif } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpeq_sd(__m128d __A, __m128d __B) { - __v2df a, b, c; + __v2df __a, __b, __c; /* PowerISA VSX does not allow partial (for just lower double) results. So to insure we don't generate spurious exceptions (from the upper double values) we splat the lower double before we do the operation. */ - a = vec_splats (__A[0]); - b = vec_splats (__B[0]); - c = (__v2df) vec_cmpeq(a, b); + __a = vec_splats (__A[0]); + __b = vec_splats (__B[0]); + __c = (__v2df) vec_cmpeq(__a, __b); /* Then we merge the lower double result with the original upper double from __A. 
*/ - return (__m128d) _mm_setr_pd (c[0], __A[1]); + return (__m128d) _mm_setr_pd (__c[0], __A[1]); } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmplt_sd (__m128d __A, __m128d __B) { - __v2df a, b, c; - a = vec_splats (__A[0]); - b = vec_splats (__B[0]); - c = (__v2df) vec_cmplt(a, b); - return (__m128d) _mm_setr_pd (c[0], __A[1]); + __v2df __a, __b, __c; + __a = vec_splats (__A[0]); + __b = vec_splats (__B[0]); + __c = (__v2df) vec_cmplt(__a, __b); + return (__m128d) _mm_setr_pd (__c[0], __A[1]); } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmple_sd (__m128d __A, __m128d __B) { - __v2df a, b, c; - a = vec_splats (__A[0]); - b = vec_splats (__B[0]); - c = (__v2df) vec_cmple(a, b); - return (__m128d) _mm_setr_pd (c[0], __A[1]); + __v2df __a, __b, __c; + __a = vec_splats (__A[0]); + __b = vec_splats (__B[0]); + __c = (__v2df) vec_cmple(__a, __b); + return (__m128d) _mm_setr_pd (__c[0], __A[1]); } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpgt_sd (__m128d __A, __m128d __B) { - __v2df a, b, c; - a = vec_splats (__A[0]); - b = vec_splats (__B[0]); - c = (__v2df) vec_cmpgt(a, b); - return (__m128d) _mm_setr_pd (c[0], __A[1]); + __v2df __a, __b, __c; + __a = vec_splats (__A[0]); + __b = vec_splats (__B[0]); + __c = (__v2df) vec_cmpgt(__a, __b); + return (__m128d) _mm_setr_pd (__c[0], __A[1]); } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpge_sd (__m128d __A, __m128d __B) { - __v2df a, b, c; - a = vec_splats (__A[0]); - b = vec_splats (__B[0]); - c = (__v2df) vec_cmpge(a, b); - return (__m128d) _mm_setr_pd (c[0], __A[1]); + __v2df __a, __b, __c; + __a = vec_splats (__A[0]); + __b = vec_splats (__B[0]); + __c = (__v2df) vec_cmpge(__a, __b); + return (__m128d) _mm_setr_pd (__c[0], __A[1]); } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpneq_sd (__m128d __A, __m128d __B) { - __v2df a, b, c; - a = vec_splats (__A[0]); - b = vec_splats (__B[0]); - c = (__v2df) vec_cmpeq(a, b); - c = vec_nor (c, c); - return (__m128d) _mm_setr_pd (c[0], __A[1]); + __v2df __a, __b, __c; + __a = vec_splats (__A[0]); + __b = vec_splats (__B[0]); + __c = (__v2df) vec_cmpeq(__a, __b); + __c = vec_nor (__c, __c); + return (__m128d) _mm_setr_pd (__c[0], __A[1]); } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpnlt_sd (__m128d __A, __m128d __B) { - __v2df a, b, c; - a = vec_splats (__A[0]); - b = vec_splats (__B[0]); + __v2df __a, __b, __c; + __a = vec_splats (__A[0]); + __b = vec_splats (__B[0]); /* Not less than is just greater than or equal. */ - c = (__v2df) vec_cmpge(a, b); - return (__m128d) _mm_setr_pd (c[0], __A[1]); + __c = (__v2df) vec_cmpge(__a, __b); + return (__m128d) _mm_setr_pd (__c[0], __A[1]); } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpnle_sd (__m128d __A, __m128d __B) { - __v2df a, b, c; - a = vec_splats (__A[0]); - b = vec_splats (__B[0]); + __v2df __a, __b, __c; + __a = vec_splats (__A[0]); + __b = vec_splats (__B[0]); /* Not less than or equal is just greater than. 
*/ - c = (__v2df) vec_cmpge(a, b); - return (__m128d) _mm_setr_pd (c[0], __A[1]); + __c = (__v2df) vec_cmpge(__a, __b); + return (__m128d) _mm_setr_pd (__c[0], __A[1]); } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpngt_sd (__m128d __A, __m128d __B) { - __v2df a, b, c; - a = vec_splats (__A[0]); - b = vec_splats (__B[0]); + __v2df __a, __b, __c; + __a = vec_splats (__A[0]); + __b = vec_splats (__B[0]); /* Not greater than is just less than or equal. */ - c = (__v2df) vec_cmple(a, b); - return (__m128d) _mm_setr_pd (c[0], __A[1]); + __c = (__v2df) vec_cmple(__a, __b); + return (__m128d) _mm_setr_pd (__c[0], __A[1]); } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpnge_sd (__m128d __A, __m128d __B) { - __v2df a, b, c; - a = vec_splats (__A[0]); - b = vec_splats (__B[0]); + __v2df __a, __b, __c; + __a = vec_splats (__A[0]); + __b = vec_splats (__B[0]); /* Not greater than or equal is just less than. */ - c = (__v2df) vec_cmplt(a, b); - return (__m128d) _mm_setr_pd (c[0], __A[1]); + __c = (__v2df) vec_cmplt(__a, __b); + return (__m128d) _mm_setr_pd (__c[0], __A[1]); } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpord_sd (__m128d __A, __m128d __B) { - __v2df r; - r = (__v2df)_mm_cmpord_pd (vec_splats (__A[0]), vec_splats (__B[0])); - return (__m128d) _mm_setr_pd (r[0], ((__v2df)__A)[1]); + __v2df __r; + __r = (__v2df)_mm_cmpord_pd (vec_splats (__A[0]), vec_splats (__B[0])); + return (__m128d) _mm_setr_pd (__r[0], ((__v2df)__A)[1]); } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpunord_sd (__m128d __A, __m128d __B) { - __v2df r; - r = _mm_cmpunord_pd (vec_splats (__A[0]), vec_splats (__B[0])); - return (__m128d) _mm_setr_pd (r[0], __A[1]); + __v2df __r; + __r = _mm_cmpunord_pd (vec_splats (__A[0]), vec_splats (__B[0])); + return (__m128d) _mm_setr_pd (__r[0], __A[1]); } /* FIXME @@ -845,12 +845,12 @@ _mm_setzero_si128 (void) extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtepi32_pd (__m128i __A) { - __v2di val; + __v2di __val; /* For LE need to generate Vector Unpack Low Signed Word. Which is generated from unpackh. */ - val = (__v2di)vec_unpackh ((__v4si)__A); + __val = (__v2di)vec_unpackh ((__v4si)__A); - return (__m128d)vec_ctf (val, 0); + return (__m128d)vec_ctf (__val, 0); } #endif @@ -863,116 +863,116 @@ _mm_cvtepi32_ps (__m128i __A) extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtpd_epi32 (__m128d __A) { - __v2df rounded = vec_rint (__A); - __v4si result, temp; - const __v4si vzero = + __v2df __rounded = vec_rint (__A); + __v4si __result, __temp; + const __v4si __vzero = { 0, 0, 0, 0 }; /* VSX Vector truncate Double-Precision to integer and Convert to Signed Integer Word format with Saturate. 
*/ __asm__( "xvcvdpsxws %x0,%x1" - : "=wa" (temp) - : "wa" (rounded) + : "=wa" (__temp) + : "wa" (__rounded) : ); #ifdef _ARCH_PWR8 #ifdef __LITTLE_ENDIAN__ - temp = vec_mergeo (temp, temp); + __temp = vec_mergeo (__temp, __temp); #else - temp = vec_mergee (temp, temp); + __temp = vec_mergee (__temp, __temp); #endif - result = (__v4si) vec_vpkudum ((__vector long long) temp, - (__vector long long) vzero); + __result = (__v4si) vec_vpkudum ((__vector long long) __temp, + (__vector long long) __vzero); #else { - const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, + const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f }; - result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm); + __result = (__v4si) vec_perm ((__v16qu) __temp, (__v16qu) __vzero, __pkperm); } #endif - return (__m128i) result; + return (__m128i) __result; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtpd_pi32 (__m128d __A) { - __m128i result = _mm_cvtpd_epi32(__A); + __m128i __result = _mm_cvtpd_epi32(__A); - return (__m64) result[0]; + return (__m64) __result[0]; } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtpd_ps (__m128d __A) { - __v4sf result; - __v4si temp; - const __v4si vzero = { 0, 0, 0, 0 }; + __v4sf __result; + __v4si __temp; + const __v4si __vzero = { 0, 0, 0, 0 }; __asm__( "xvcvdpsp %x0,%x1" - : "=wa" (temp) + : "=wa" (__temp) : "wa" (__A) : ); #ifdef _ARCH_PWR8 #ifdef __LITTLE_ENDIAN__ - temp = vec_mergeo (temp, temp); + __temp = vec_mergeo (__temp, __temp); #else - temp = vec_mergee (temp, temp); + __temp = vec_mergee (__temp, __temp); #endif - result = (__v4sf) vec_vpkudum ((__vector long long) temp, - (__vector long long) vzero); + __result = (__v4sf) vec_vpkudum ((__vector long long) __temp, + (__vector long long) __vzero); #else { - const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, + const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f }; - result = (__v4sf) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm); + __result = (__v4sf) vec_perm ((__v16qu) __temp, (__v16qu) __vzero, __pkperm); } #endif - return ((__m128)result); + return ((__m128)__result); } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvttpd_epi32 (__m128d __A) { - __v4si result; - __v4si temp; - const __v4si vzero = { 0, 0, 0, 0 }; + __v4si __result; + __v4si __temp; + const __v4si __vzero = { 0, 0, 0, 0 }; /* VSX Vector truncate Double-Precision to integer and Convert to Signed Integer Word format with Saturate. 
*/ __asm__( "xvcvdpsxws %x0,%x1" - : "=wa" (temp) + : "=wa" (__temp) : "wa" (__A) : ); #ifdef _ARCH_PWR8 #ifdef __LITTLE_ENDIAN__ - temp = vec_mergeo (temp, temp); + __temp = vec_mergeo (__temp, __temp); #else - temp = vec_mergee (temp, temp); + __temp = vec_mergee (__temp, __temp); #endif - result = (__v4si) vec_vpkudum ((__vector long long) temp, - (__vector long long) vzero); + __result = (__v4si) vec_vpkudum ((__vector long long) __temp, + (__vector long long) __vzero); #else { - const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, + const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f }; - result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm); + __result = (__v4si) vec_perm ((__v16qu) __temp, (__v16qu) __vzero, __pkperm); } #endif - return ((__m128i) result); + return ((__m128i) __result); } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvttpd_pi32 (__m128d __A) { - __m128i result = _mm_cvttpd_epi32 (__A); + __m128i __result = _mm_cvttpd_epi32 (__A); - return (__m64) result[0]; + return (__m64) __result[0]; } extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -985,35 +985,35 @@ _mm_cvtsi128_si32 (__m128i __A) extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtpi32_pd (__m64 __A) { - __v4si temp; - __v2di tmp2; - __v2df result; + __v4si __temp; + __v2di __tmp2; + __v2df __result; - temp = (__v4si)vec_splats (__A); - tmp2 = (__v2di)vec_unpackl (temp); - result = vec_ctf ((__vector signed long long) tmp2, 0); - return (__m128d)result; + __temp = (__v4si)vec_splats (__A); + __tmp2 = (__v2di)vec_unpackl (__temp); + __result = vec_ctf ((__vector signed long long) __tmp2, 0); + return (__m128d)__result; } #endif extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtps_epi32 (__m128 __A) { - __v4sf rounded; - __v4si result; + __v4sf __rounded; + __v4si __result; - rounded = vec_rint((__v4sf) __A); - result = vec_cts (rounded, 0); - return (__m128i) result; + __rounded = vec_rint((__v4sf) __A); + __result = vec_cts (__rounded, 0); + return (__m128i) __result; } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvttps_epi32 (__m128 __A) { - __v4si result; + __v4si __result; - result = vec_cts ((__v4sf) __A, 0); - return (__m128i) result; + __result = vec_cts ((__v4sf) __A, 0); + return (__m128i) __result; } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1025,48 +1025,48 @@ _mm_cvtps_pd (__m128 __A) #else /* Otherwise the compiler is not current and so need to generate the equivalent code. */ - __v4sf a = (__v4sf)__A; - __v4sf temp; - __v2df result; + __v4sf __a = (__v4sf)__A; + __v4sf __temp; + __v2df __result; #ifdef __LITTLE_ENDIAN__ /* The input float values are in elements {[0], [1]} but the convert instruction needs them in elements {[1], [3]}, So we use two shift left double vector word immediates to get the elements lined up. */ - temp = __builtin_vsx_xxsldwi (a, a, 3); - temp = __builtin_vsx_xxsldwi (a, temp, 2); + __temp = __builtin_vsx_xxsldwi (__a, __a, 3); + __temp = __builtin_vsx_xxsldwi (__a, __temp, 2); #else /* The input float values are in elements {[0], [1]} but the convert instruction needs them in elements {[0], [2]}, So we use two shift left double vector word immediates to get the elements lined up. 
*/ - temp = vec_vmrghw (a, a); + __temp = vec_vmrghw (__a, __a); #endif __asm__( " xvcvspdp %x0,%x1" - : "=wa" (result) - : "wa" (temp) + : "=wa" (__result) + : "wa" (__temp) : ); - return (__m128d) result; + return (__m128d) __result; #endif } extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtsd_si32 (__m128d __A) { - __v2df rounded = vec_rint((__v2df) __A); - int result = ((__v2df)rounded)[0]; + __v2df __rounded = vec_rint((__v2df) __A); + int __result = ((__v2df)__rounded)[0]; - return result; + return __result; } /* Intel intrinsic. */ extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtsd_si64 (__m128d __A) { - __v2df rounded = vec_rint ((__v2df) __A ); - long long result = ((__v2df) rounded)[0]; + __v2df __rounded = vec_rint ((__v2df) __A ); + long long __result = ((__v2df) __rounded)[0]; - return result; + return __result; } /* Microsoft intrinsic. */ @@ -1079,18 +1079,18 @@ _mm_cvtsd_si64x (__m128d __A) extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvttsd_si32 (__m128d __A) { - int result = ((__v2df)__A)[0]; + int __result = ((__v2df)__A)[0]; - return result; + return __result; } /* Intel intrinsic. */ extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvttsd_si64 (__m128d __A) { - long long result = ((__v2df)__A)[0]; + long long __result = ((__v2df)__A)[0]; - return result; + return __result; } /* Microsoft intrinsic. */ @@ -1103,46 +1103,46 @@ _mm_cvttsd_si64x (__m128d __A) extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtsd_ss (__m128 __A, __m128d __B) { - __v4sf result = (__v4sf)__A; + __v4sf __result = (__v4sf)__A; #ifdef __LITTLE_ENDIAN__ - __v4sf temp_s; + __v4sf __temp_s; /* Copy double element[0] to element [1] for conversion. */ - __v2df temp_b = vec_splat((__v2df)__B, 0); + __v2df __temp_b = vec_splat((__v2df)__B, 0); /* Pre-rotate __A left 3 (logically right 1) elements. */ - result = __builtin_vsx_xxsldwi (result, result, 3); + __result = __builtin_vsx_xxsldwi (__result, __result, 3); /* Convert double to single float scalar in a vector. */ __asm__( "xscvdpsp %x0,%x1" - : "=wa" (temp_s) - : "wa" (temp_b) + : "=wa" (__temp_s) + : "wa" (__temp_b) : ); /* Shift the resulting scalar into vector element [0]. */ - result = __builtin_vsx_xxsldwi (result, temp_s, 1); + __result = __builtin_vsx_xxsldwi (__result, __temp_s, 1); #else - result [0] = ((__v2df)__B)[0]; + __result [0] = ((__v2df)__B)[0]; #endif - return (__m128) result; + return (__m128) __result; } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtsi32_sd (__m128d __A, int __B) { - __v2df result = (__v2df)__A; - double db = __B; - result [0] = db; - return (__m128d)result; + __v2df __result = (__v2df)__A; + double __db = __B; + __result [0] = __db; + return (__m128d)__result; } /* Intel intrinsic. */ extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtsi64_sd (__m128d __A, long long __B) { - __v2df result = (__v2df)__A; - double db = __B; - result [0] = db; - return (__m128d)result; + __v2df __result = (__v2df)__A; + double __db = __B; + __result [0] = __db; + return (__m128d)__result; } /* Microsoft intrinsic. */ @@ -1157,45 +1157,45 @@ _mm_cvtss_sd (__m128d __A, __m128 __B) { #ifdef __LITTLE_ENDIAN__ /* Use splat to move element [0] into position for the convert. 
*/ - __v4sf temp = vec_splat ((__v4sf)__B, 0); - __v2df res; + __v4sf __temp = vec_splat ((__v4sf)__B, 0); + __v2df __res; /* Convert single float scalar to double in a vector. */ __asm__( "xscvspdp %x0,%x1" - : "=wa" (res) - : "wa" (temp) + : "=wa" (__res) + : "wa" (__temp) : ); - return (__m128d) vec_mergel (res, (__v2df)__A); + return (__m128d) vec_mergel (__res, (__v2df)__A); #else - __v2df res = (__v2df)__A; - res [0] = ((__v4sf)__B) [0]; - return (__m128d) res; + __v2df __res = (__v2df)__A; + __res [0] = ((__v4sf)__B) [0]; + return (__m128d) __res; #endif } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) { - __vector double result; - const int litmsk = __mask & 0x3; + __vector double __result; + const int __litmsk = __mask & 0x3; - if (litmsk == 0) - result = vec_mergeh (__A, __B); + if (__litmsk == 0) + __result = vec_mergeh (__A, __B); #if __GNUC__ < 6 - else if (litmsk == 1) - result = vec_xxpermdi (__B, __A, 2); - else if (litmsk == 2) - result = vec_xxpermdi (__B, __A, 1); + else if (__litmsk == 1) + __result = vec_xxpermdi (__B, __A, 2); + else if (__litmsk == 2) + __result = vec_xxpermdi (__B, __A, 1); #else - else if (litmsk == 1) - result = vec_xxpermdi (__A, __B, 2); - else if (litmsk == 2) - result = vec_xxpermdi (__A, __B, 1); + else if (__litmsk == 1) + __result = vec_xxpermdi (__A, __B, 2); + else if (__litmsk == 2) + __result = vec_xxpermdi (__A, __B, 1); #endif else - result = vec_mergel (__A, __B); + __result = vec_mergel (__A, __B); - return result; + return __result; } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1213,17 +1213,17 @@ _mm_unpacklo_pd (__m128d __A, __m128d __B) extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_loadh_pd (__m128d __A, double const *__B) { - __v2df result = (__v2df)__A; - result [1] = *__B; - return (__m128d)result; + __v2df __result = (__v2df)__A; + __result [1] = *__B; + return (__m128d)__result; } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_loadl_pd (__m128d __A, double const *__B) { - __v2df result = (__v2df)__A; - result [0] = *__B; - return (__m128d)result; + __v2df __result = (__v2df)__A; + __result [0] = *__B; + return (__m128d)__result; } #ifdef _ARCH_PWR8 @@ -1236,8 +1236,8 @@ _mm_movemask_pd (__m128d __A) #ifdef _ARCH_PWR10 return vec_extractm ((__v2du) __A); #else - __vector unsigned long long result; - static const __vector unsigned int perm_mask = + __vector unsigned long long __result; + static const __vector unsigned int __perm_mask = { #ifdef __LITTLE_ENDIAN__ 0x80800040, 0x80808080, 0x80808080, 0x80808080 @@ -1246,14 +1246,14 @@ _mm_movemask_pd (__m128d __A) #endif }; - result = ((__vector unsigned long long) + __result = ((__vector unsigned long long) vec_vbpermq ((__vector unsigned char) __A, - (__vector unsigned char) perm_mask)); + (__vector unsigned char) __perm_mask)); #ifdef __LITTLE_ENDIAN__ - return result[1]; + return __result[1]; #else - return result[0]; + return __result[0]; #endif #endif /* !_ARCH_PWR10 */ } @@ -1426,17 +1426,17 @@ _mm_subs_epu16 (__m128i __A, __m128i __B) extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_madd_epi16 (__m128i __A, __m128i __B) { - __vector signed int zero = {0, 0, 0, 0}; + __vector signed int __zero = {0, 0, 0, 0}; - return (__m128i) vec_vmsumshm ((__v8hi)__A, (__v8hi)__B, zero); + 
return (__m128i) vec_vmsumshm ((__v8hi)__A, (__v8hi)__B, __zero); } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mulhi_epi16 (__m128i __A, __m128i __B) { - __vector signed int w0, w1; + __vector signed int __w0, __w1; - __vector unsigned char xform1 = { + __vector unsigned char __xform1 = { #ifdef __LITTLE_ENDIAN__ 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F @@ -1446,9 +1446,9 @@ _mm_mulhi_epi16 (__m128i __A, __m128i __B) #endif }; - w0 = vec_vmulesh ((__v8hi)__A, (__v8hi)__B); - w1 = vec_vmulosh ((__v8hi)__A, (__v8hi)__B); - return (__m128i) vec_perm (w0, w1, xform1); + __w0 = vec_vmulesh ((__v8hi)__A, (__v8hi)__B); + __w1 = vec_vmulosh ((__v8hi)__A, (__v8hi)__B); + return (__m128i) vec_perm (__w0, __w1, __xform1); } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1460,10 +1460,10 @@ _mm_mullo_epi16 (__m128i __A, __m128i __B) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mul_su32 (__m64 __A, __m64 __B) { - unsigned int a = __A; - unsigned int b = __B; + unsigned int __a = __A; + unsigned int __b = __B; - return ((__m64)a * (__m64)b); + return ((__m64)__a * (__m64)__b); } #ifdef _ARCH_PWR8 @@ -1471,24 +1471,24 @@ extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __arti _mm_mul_epu32 (__m128i __A, __m128i __B) { #if __GNUC__ < 8 - __v2du result; + __v2du __result; #ifdef __LITTLE_ENDIAN__ /* VMX Vector Multiply Odd Unsigned Word. */ __asm__( "vmulouw %0,%1,%2" - : "=v" (result) + : "=v" (__result) : "v" (__A), "v" (__B) : ); #else /* VMX Vector Multiply Even Unsigned Word. */ __asm__( "vmuleuw %0,%1,%2" - : "=v" (result) + : "=v" (__result) : "v" (__A), "v" (__B) : ); #endif - return (__m128i) result; + return (__m128i) __result; #else return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B); #endif @@ -1498,122 +1498,122 @@ _mm_mul_epu32 (__m128i __A, __m128i __B) extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_slli_epi16 (__m128i __A, int __B) { - __v8hu lshift; - __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 }; + __v8hu __lshift; + __v8hi __result = { 0, 0, 0, 0, 0, 0, 0, 0 }; if (__B >= 0 && __B < 16) { if (__builtin_constant_p(__B)) - lshift = (__v8hu) vec_splat_s16(__B); + __lshift = (__v8hu) vec_splat_s16(__B); else - lshift = vec_splats ((unsigned short) __B); + __lshift = vec_splats ((unsigned short) __B); - result = vec_sl ((__v8hi) __A, lshift); + __result = vec_sl ((__v8hi) __A, __lshift); } - return (__m128i) result; + return (__m128i) __result; } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_slli_epi32 (__m128i __A, int __B) { - __v4su lshift; - __v4si result = { 0, 0, 0, 0 }; + __v4su __lshift; + __v4si __result = { 0, 0, 0, 0 }; if (__B >= 0 && __B < 32) { if (__builtin_constant_p(__B) && __B < 16) - lshift = (__v4su) vec_splat_s32(__B); + __lshift = (__v4su) vec_splat_s32(__B); else - lshift = vec_splats ((unsigned int) __B); + __lshift = vec_splats ((unsigned int) __B); - result = vec_sl ((__v4si) __A, lshift); + __result = vec_sl ((__v4si) __A, __lshift); } - return (__m128i) result; + return (__m128i) __result; } #ifdef _ARCH_PWR8 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_slli_epi64 (__m128i __A, int __B) { - __v2du lshift; - __v2di result = { 0, 0 }; + __v2du __lshift; + __v2di __result = { 0, 0 }; if (__B >= 0 && __B 
< 64) { if (__builtin_constant_p(__B) && __B < 16) - lshift = (__v2du) vec_splat_s32(__B); + __lshift = (__v2du) vec_splat_s32(__B); else - lshift = (__v2du) vec_splats ((unsigned int) __B); + __lshift = (__v2du) vec_splats ((unsigned int) __B); - result = vec_sl ((__v2di) __A, lshift); + __result = vec_sl ((__v2di) __A, __lshift); } - return (__m128i) result; + return (__m128i) __result; } #endif extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_srai_epi16 (__m128i __A, int __B) { - __v8hu rshift = { 15, 15, 15, 15, 15, 15, 15, 15 }; - __v8hi result; + __v8hu __rshift = { 15, 15, 15, 15, 15, 15, 15, 15 }; + __v8hi __result; if (__B < 16) { if (__builtin_constant_p(__B)) - rshift = (__v8hu) vec_splat_s16(__B); + __rshift = (__v8hu) vec_splat_s16(__B); else - rshift = vec_splats ((unsigned short) __B); + __rshift = vec_splats ((unsigned short) __B); } - result = vec_sra ((__v8hi) __A, rshift); + __result = vec_sra ((__v8hi) __A, __rshift); - return (__m128i) result; + return (__m128i) __result; } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_srai_epi32 (__m128i __A, int __B) { - __v4su rshift = { 31, 31, 31, 31 }; - __v4si result; + __v4su __rshift = { 31, 31, 31, 31 }; + __v4si __result; if (__B < 32) { if (__builtin_constant_p(__B)) { if (__B < 16) - rshift = (__v4su) vec_splat_s32(__B); + __rshift = (__v4su) vec_splat_s32(__B); else - rshift = (__v4su) vec_splats((unsigned int)__B); + __rshift = (__v4su) vec_splats((unsigned int)__B); } else - rshift = vec_splats ((unsigned int) __B); + __rshift = vec_splats ((unsigned int) __B); } - result = vec_sra ((__v4si) __A, rshift); + __result = vec_sra ((__v4si) __A, __rshift); - return (__m128i) result; + return (__m128i) __result; } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_bslli_si128 (__m128i __A, const int __N) { - __v16qu result; - const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + __v16qu __result; + const __v16qu __zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; if (__N < 16) - result = vec_sld ((__v16qu) __A, zeros, __N); + __result = vec_sld ((__v16qu) __A, __zeros, __N); else - result = zeros; + __result = __zeros; - return (__m128i) result; + return (__m128i) __result; } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_bsrli_si128 (__m128i __A, const int __N) { - __v16qu result; - const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + __v16qu __result; + const __v16qu __zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; if (__N < 16) #ifdef __LITTLE_ENDIAN__ @@ -1621,21 +1621,21 @@ _mm_bsrli_si128 (__m128i __A, const int __N) /* Would like to use Vector Shift Left Double by Octet Immediate here to use the immediate form and avoid load of __N * 8 value into a separate VR. 
*/ - result = vec_sld (zeros, (__v16qu) __A, (16 - __N)); + __result = vec_sld (__zeros, (__v16qu) __A, (16 - __N)); else #endif { - __v16qu shift = vec_splats((unsigned char)(__N*8)); + __v16qu __shift = vec_splats((unsigned char)(__N*8)); #ifdef __LITTLE_ENDIAN__ - result = vec_sro ((__v16qu)__A, shift); + __result = vec_sro ((__v16qu)__A, __shift); #else - result = vec_slo ((__v16qu)__A, shift); + __result = vec_slo ((__v16qu)__A, __shift); #endif } else - result = zeros; + __result = __zeros; - return (__m128i) result; + return (__m128i) __result; } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1647,239 +1647,239 @@ _mm_srli_si128 (__m128i __A, const int __N) extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_slli_si128 (__m128i __A, const int _imm5) { - __v16qu result; - const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + __v16qu __result; + const __v16qu __zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; if (_imm5 < 16) #ifdef __LITTLE_ENDIAN__ - result = vec_sld ((__v16qu) __A, zeros, _imm5); + __result = vec_sld ((__v16qu) __A, __zeros, _imm5); #else - result = vec_sld (zeros, (__v16qu) __A, (16 - _imm5)); + __result = vec_sld (__zeros, (__v16qu) __A, (16 - _imm5)); #endif else - result = zeros; + __result = __zeros; - return (__m128i) result; + return (__m128i) __result; } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_srli_epi16 (__m128i __A, int __B) { - __v8hu rshift; - __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 }; + __v8hu __rshift; + __v8hi __result = { 0, 0, 0, 0, 0, 0, 0, 0 }; if (__B < 16) { if (__builtin_constant_p(__B)) - rshift = (__v8hu) vec_splat_s16(__B); + __rshift = (__v8hu) vec_splat_s16(__B); else - rshift = vec_splats ((unsigned short) __B); + __rshift = vec_splats ((unsigned short) __B); - result = vec_sr ((__v8hi) __A, rshift); + __result = vec_sr ((__v8hi) __A, __rshift); } - return (__m128i) result; + return (__m128i) __result; } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_srli_epi32 (__m128i __A, int __B) { - __v4su rshift; - __v4si result = { 0, 0, 0, 0 }; + __v4su __rshift; + __v4si __result = { 0, 0, 0, 0 }; if (__B < 32) { if (__builtin_constant_p(__B)) { if (__B < 16) - rshift = (__v4su) vec_splat_s32(__B); + __rshift = (__v4su) vec_splat_s32(__B); else - rshift = (__v4su) vec_splats((unsigned int)__B); + __rshift = (__v4su) vec_splats((unsigned int)__B); } else - rshift = vec_splats ((unsigned int) __B); + __rshift = vec_splats ((unsigned int) __B); - result = vec_sr ((__v4si) __A, rshift); + __result = vec_sr ((__v4si) __A, __rshift); } - return (__m128i) result; + return (__m128i) __result; } #ifdef _ARCH_PWR8 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_srli_epi64 (__m128i __A, int __B) { - __v2du rshift; - __v2di result = { 0, 0 }; + __v2du __rshift; + __v2di __result = { 0, 0 }; if (__B < 64) { if (__builtin_constant_p(__B)) { if (__B < 16) - rshift = (__v2du) vec_splat_s32(__B); + __rshift = (__v2du) vec_splat_s32(__B); else - rshift = (__v2du) vec_splats((unsigned long long)__B); + __rshift = (__v2du) vec_splats((unsigned long long)__B); } else - rshift = (__v2du) vec_splats ((unsigned int) __B); + __rshift = (__v2du) vec_splats ((unsigned int) __B); - result = vec_sr ((__v2di) __A, rshift); + __result = vec_sr ((__v2di) __A, __rshift); } - return (__m128i) result; + return 
(__m128i) __result; } #endif extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sll_epi16 (__m128i __A, __m128i __B) { - __v8hu lshift; - __vector __bool short shmask; - const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 }; - __v8hu result; + __v8hu __lshift; + __vector __bool short __shmask; + const __v8hu __shmax = { 15, 15, 15, 15, 15, 15, 15, 15 }; + __v8hu __result; #ifdef __LITTLE_ENDIAN__ - lshift = vec_splat ((__v8hu) __B, 0); + __lshift = vec_splat ((__v8hu) __B, 0); #else - lshift = vec_splat ((__v8hu) __B, 3); + __lshift = vec_splat ((__v8hu) __B, 3); #endif - shmask = vec_cmple (lshift, shmax); - result = vec_sl ((__v8hu) __A, lshift); - result = vec_sel ((__v8hu) shmask, result, shmask); + __shmask = vec_cmple (__lshift, __shmax); + __result = vec_sl ((__v8hu) __A, __lshift); + __result = vec_sel ((__v8hu) __shmask, __result, __shmask); - return (__m128i) result; + return (__m128i) __result; } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sll_epi32 (__m128i __A, __m128i __B) { - __v4su lshift; - __vector __bool int shmask; - const __v4su shmax = { 32, 32, 32, 32 }; - __v4su result; + __v4su __lshift; + __vector __bool int __shmask; + const __v4su __shmax = { 32, 32, 32, 32 }; + __v4su __result; #ifdef __LITTLE_ENDIAN__ - lshift = vec_splat ((__v4su) __B, 0); + __lshift = vec_splat ((__v4su) __B, 0); #else - lshift = vec_splat ((__v4su) __B, 1); + __lshift = vec_splat ((__v4su) __B, 1); #endif - shmask = vec_cmplt (lshift, shmax); - result = vec_sl ((__v4su) __A, lshift); - result = vec_sel ((__v4su) shmask, result, shmask); + __shmask = vec_cmplt (__lshift, __shmax); + __result = vec_sl ((__v4su) __A, __lshift); + __result = vec_sel ((__v4su) __shmask, __result, __shmask); - return (__m128i) result; + return (__m128i) __result; } #ifdef _ARCH_PWR8 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sll_epi64 (__m128i __A, __m128i __B) { - __v2du lshift; - __vector __bool long long shmask; - const __v2du shmax = { 64, 64 }; - __v2du result; + __v2du __lshift; + __vector __bool long long __shmask; + const __v2du __shmax = { 64, 64 }; + __v2du __result; - lshift = vec_splat ((__v2du) __B, 0); - shmask = vec_cmplt (lshift, shmax); - result = vec_sl ((__v2du) __A, lshift); - result = vec_sel ((__v2du) shmask, result, shmask); + __lshift = vec_splat ((__v2du) __B, 0); + __shmask = vec_cmplt (__lshift, __shmax); + __result = vec_sl ((__v2du) __A, __lshift); + __result = vec_sel ((__v2du) __shmask, __result, __shmask); - return (__m128i) result; + return (__m128i) __result; } #endif extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sra_epi16 (__m128i __A, __m128i __B) { - const __v8hu rshmax = { 15, 15, 15, 15, 15, 15, 15, 15 }; - __v8hu rshift; - __v8hi result; + const __v8hu __rshmax = { 15, 15, 15, 15, 15, 15, 15, 15 }; + __v8hu __rshift; + __v8hi __result; #ifdef __LITTLE_ENDIAN__ - rshift = vec_splat ((__v8hu)__B, 0); + __rshift = vec_splat ((__v8hu)__B, 0); #else - rshift = vec_splat ((__v8hu)__B, 3); + __rshift = vec_splat ((__v8hu)__B, 3); #endif - rshift = vec_min (rshift, rshmax); - result = vec_sra ((__v8hi) __A, rshift); + __rshift = vec_min (__rshift, __rshmax); + __result = vec_sra ((__v8hi) __A, __rshift); - return (__m128i) result; + return (__m128i) __result; } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sra_epi32 (__m128i __A, __m128i 
__B) { - const __v4su rshmax = { 31, 31, 31, 31 }; - __v4su rshift; - __v4si result; + const __v4su __rshmax = { 31, 31, 31, 31 }; + __v4su __rshift; + __v4si __result; #ifdef __LITTLE_ENDIAN__ - rshift = vec_splat ((__v4su)__B, 0); + __rshift = vec_splat ((__v4su)__B, 0); #else - rshift = vec_splat ((__v4su)__B, 1); + __rshift = vec_splat ((__v4su)__B, 1); #endif - rshift = vec_min (rshift, rshmax); - result = vec_sra ((__v4si) __A, rshift); + __rshift = vec_min (__rshift, __rshmax); + __result = vec_sra ((__v4si) __A, __rshift); - return (__m128i) result; + return (__m128i) __result; } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_srl_epi16 (__m128i __A, __m128i __B) { - __v8hu rshift; - __vector __bool short shmask; - const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 }; - __v8hu result; + __v8hu __rshift; + __vector __bool short __shmask; + const __v8hu __shmax = { 15, 15, 15, 15, 15, 15, 15, 15 }; + __v8hu __result; #ifdef __LITTLE_ENDIAN__ - rshift = vec_splat ((__v8hu) __B, 0); + __rshift = vec_splat ((__v8hu) __B, 0); #else - rshift = vec_splat ((__v8hu) __B, 3); + __rshift = vec_splat ((__v8hu) __B, 3); #endif - shmask = vec_cmple (rshift, shmax); - result = vec_sr ((__v8hu) __A, rshift); - result = vec_sel ((__v8hu) shmask, result, shmask); + __shmask = vec_cmple (__rshift, __shmax); + __result = vec_sr ((__v8hu) __A, __rshift); + __result = vec_sel ((__v8hu) __shmask, __result, __shmask); - return (__m128i) result; + return (__m128i) __result; } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_srl_epi32 (__m128i __A, __m128i __B) { - __v4su rshift; - __vector __bool int shmask; - const __v4su shmax = { 32, 32, 32, 32 }; - __v4su result; + __v4su __rshift; + __vector __bool int __shmask; + const __v4su __shmax = { 32, 32, 32, 32 }; + __v4su __result; #ifdef __LITTLE_ENDIAN__ - rshift = vec_splat ((__v4su) __B, 0); + __rshift = vec_splat ((__v4su) __B, 0); #else - rshift = vec_splat ((__v4su) __B, 1); + __rshift = vec_splat ((__v4su) __B, 1); #endif - shmask = vec_cmplt (rshift, shmax); - result = vec_sr ((__v4su) __A, rshift); - result = vec_sel ((__v4su) shmask, result, shmask); + __shmask = vec_cmplt (__rshift, __shmax); + __result = vec_sr ((__v4su) __A, __rshift); + __result = vec_sel ((__v4su) __shmask, __result, __shmask); - return (__m128i) result; + return (__m128i) __result; } #ifdef _ARCH_PWR8 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_srl_epi64 (__m128i __A, __m128i __B) { - __v2du rshift; - __vector __bool long long shmask; - const __v2du shmax = { 64, 64 }; - __v2du result; + __v2du __rshift; + __vector __bool long long __shmask; + const __v2du __shmax = { 64, 64 }; + __v2du __result; - rshift = vec_splat ((__v2du) __B, 0); - shmask = vec_cmplt (rshift, shmax); - result = vec_sr ((__v2du) __A, rshift); - result = vec_sel ((__v2du) shmask, result, shmask); + __rshift = vec_splat ((__v2du) __B, 0); + __shmask = vec_cmplt (__rshift, __shmax); + __result = vec_sr ((__v2du) __A, __rshift); + __result = vec_sel ((__v2du) __shmask, __result, __shmask); - return (__m128i) result; + return (__m128i) __result; } #endif @@ -1994,11 +1994,11 @@ _mm_extract_epi16 (__m128i const __A, int const __N) extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_insert_epi16 (__m128i const __A, int const __D, int const __N) { - __v8hi result = (__v8hi)__A; + __v8hi __result = (__v8hi)__A; - result [(__N 
& 7)] = __D; + __result [(__N & 7)] = __D; - return (__m128i) result; + return (__m128i) __result; } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -2037,21 +2037,21 @@ _mm_movemask_epi8 (__m128i __A) #ifdef _ARCH_PWR10 return vec_extractm ((__v16qu) __A); #else - __vector unsigned long long result; - static const __vector unsigned char perm_mask = + __vector unsigned long long __result; + static const __vector unsigned char __perm_mask = { 0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40, 0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00 }; - result = ((__vector unsigned long long) + __result = ((__vector unsigned long long) vec_vbpermq ((__vector unsigned char) __A, - (__vector unsigned char) perm_mask)); + (__vector unsigned char) __perm_mask)); #ifdef __LITTLE_ENDIAN__ - return result[1]; + return __result[1]; #else - return result[0]; + return __result[0]; #endif #endif /* !_ARCH_PWR10 */ } @@ -2060,8 +2060,8 @@ _mm_movemask_epi8 (__m128i __A) extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mulhi_epu16 (__m128i __A, __m128i __B) { - __v4su w0, w1; - __v16qu xform1 = { + __v4su __w0, __w1; + __v16qu __xform1 = { #ifdef __LITTLE_ENDIAN__ 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F @@ -2071,19 +2071,19 @@ _mm_mulhi_epu16 (__m128i __A, __m128i __B) #endif }; - w0 = vec_vmuleuh ((__v8hu)__A, (__v8hu)__B); - w1 = vec_vmulouh ((__v8hu)__A, (__v8hu)__B); - return (__m128i) vec_perm (w0, w1, xform1); + __w0 = vec_vmuleuh ((__v8hu)__A, (__v8hu)__B); + __w1 = vec_vmulouh ((__v8hu)__A, (__v8hu)__B); + return (__m128i) vec_perm (__w0, __w1, __xform1); } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_shufflehi_epi16 (__m128i __A, const int __mask) { - unsigned long element_selector_98 = __mask & 0x03; - unsigned long element_selector_BA = (__mask >> 2) & 0x03; - unsigned long element_selector_DC = (__mask >> 4) & 0x03; - unsigned long element_selector_FE = (__mask >> 6) & 0x03; - static const unsigned short permute_selectors[4] = + unsigned long __element_selector_98 = __mask & 0x03; + unsigned long __element_selector_BA = (__mask >> 2) & 0x03; + unsigned long __element_selector_DC = (__mask >> 4) & 0x03; + unsigned long __element_selector_FE = (__mask >> 6) & 0x03; + static const unsigned short __permute_selectors[4] = { #ifdef __LITTLE_ENDIAN__ 0x0908, 0x0B0A, 0x0D0C, 0x0F0E @@ -2091,33 +2091,33 @@ _mm_shufflehi_epi16 (__m128i __A, const int __mask) 0x0809, 0x0A0B, 0x0C0D, 0x0E0F #endif }; - __v2du pmask = + __v2du __pmask = #ifdef __LITTLE_ENDIAN__ { 0x1716151413121110UL, 0UL}; #else { 0x1011121314151617UL, 0UL}; #endif - __m64_union t; - __v2du a, r; + __m64_union __t; + __v2du __a, __r; - t.as_short[0] = permute_selectors[element_selector_98]; - t.as_short[1] = permute_selectors[element_selector_BA]; - t.as_short[2] = permute_selectors[element_selector_DC]; - t.as_short[3] = permute_selectors[element_selector_FE]; - pmask[1] = t.as_m64; - a = (__v2du)__A; - r = vec_perm (a, a, (__vector unsigned char)pmask); - return (__m128i) r; + __t.as_short[0] = __permute_selectors[__element_selector_98]; + __t.as_short[1] = __permute_selectors[__element_selector_BA]; + __t.as_short[2] = __permute_selectors[__element_selector_DC]; + __t.as_short[3] = __permute_selectors[__element_selector_FE]; + __pmask[1] = __t.as_m64; + __a = (__v2du)__A; + __r = vec_perm (__a, __a, (__vector unsigned char)__pmask); + return (__m128i) 
__r; } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_shufflelo_epi16 (__m128i __A, const int __mask) { - unsigned long element_selector_10 = __mask & 0x03; - unsigned long element_selector_32 = (__mask >> 2) & 0x03; - unsigned long element_selector_54 = (__mask >> 4) & 0x03; - unsigned long element_selector_76 = (__mask >> 6) & 0x03; - static const unsigned short permute_selectors[4] = + unsigned long __element_selector_10 = __mask & 0x03; + unsigned long __element_selector_32 = (__mask >> 2) & 0x03; + unsigned long __element_selector_54 = (__mask >> 4) & 0x03; + unsigned long __element_selector_76 = (__mask >> 6) & 0x03; + static const unsigned short __permute_selectors[4] = { #ifdef __LITTLE_ENDIAN__ 0x0100, 0x0302, 0x0504, 0x0706 @@ -2125,32 +2125,32 @@ _mm_shufflelo_epi16 (__m128i __A, const int __mask) 0x0001, 0x0203, 0x0405, 0x0607 #endif }; - __v2du pmask = + __v2du __pmask = #ifdef __LITTLE_ENDIAN__ { 0UL, 0x1f1e1d1c1b1a1918UL}; #else { 0UL, 0x18191a1b1c1d1e1fUL}; #endif - __m64_union t; - __v2du a, r; - t.as_short[0] = permute_selectors[element_selector_10]; - t.as_short[1] = permute_selectors[element_selector_32]; - t.as_short[2] = permute_selectors[element_selector_54]; - t.as_short[3] = permute_selectors[element_selector_76]; - pmask[0] = t.as_m64; - a = (__v2du)__A; - r = vec_perm (a, a, (__vector unsigned char)pmask); - return (__m128i) r; + __m64_union __t; + __v2du __a, __r; + __t.as_short[0] = __permute_selectors[__element_selector_10]; + __t.as_short[1] = __permute_selectors[__element_selector_32]; + __t.as_short[2] = __permute_selectors[__element_selector_54]; + __t.as_short[3] = __permute_selectors[__element_selector_76]; + __pmask[0] = __t.as_m64; + __a = (__v2du)__A; + __r = vec_perm (__a, __a, (__vector unsigned char)__pmask); + return (__m128i) __r; } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_shuffle_epi32 (__m128i __A, const int __mask) { - unsigned long element_selector_10 = __mask & 0x03; - unsigned long element_selector_32 = (__mask >> 2) & 0x03; - unsigned long element_selector_54 = (__mask >> 4) & 0x03; - unsigned long element_selector_76 = (__mask >> 6) & 0x03; - static const unsigned int permute_selectors[4] = + unsigned long __element_selector_10 = __mask & 0x03; + unsigned long __element_selector_32 = (__mask >> 2) & 0x03; + unsigned long __element_selector_54 = (__mask >> 4) & 0x03; + unsigned long __element_selector_76 = (__mask >> 6) & 0x03; + static const unsigned int __permute_selectors[4] = { #ifdef __LITTLE_ENDIAN__ 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C @@ -2158,26 +2158,26 @@ _mm_shuffle_epi32 (__m128i __A, const int __mask) 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F #endif }; - __v4su t; + __v4su __t; - t[0] = permute_selectors[element_selector_10]; - t[1] = permute_selectors[element_selector_32]; - t[2] = permute_selectors[element_selector_54] + 0x10101010; - t[3] = permute_selectors[element_selector_76] + 0x10101010; - return (__m128i)vec_perm ((__v4si) __A, (__v4si)__A, (__vector unsigned char)t); + __t[0] = __permute_selectors[__element_selector_10]; + __t[1] = __permute_selectors[__element_selector_32]; + __t[2] = __permute_selectors[__element_selector_54] + 0x10101010; + __t[3] = __permute_selectors[__element_selector_76] + 0x10101010; + return (__m128i)vec_perm ((__v4si) __A, (__v4si)__A, (__vector unsigned char)__t); } extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskmoveu_si128 
(__m128i __A, __m128i __B, char *__C) { - __v2du hibit = { 0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL}; - __v16qu mask, tmp; - __m128i_u *p = (__m128i_u*)__C; + __v2du __hibit = { 0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL}; + __v16qu __mask, __tmp; + __m128i_u *__p = (__m128i_u*)__C; - tmp = (__v16qu)_mm_loadu_si128(p); - mask = (__v16qu)vec_cmpgt ((__v16qu)__B, (__v16qu)hibit); - tmp = vec_sel (tmp, (__v16qu)__A, mask); - _mm_storeu_si128 (p, (__m128i)tmp); + __tmp = (__v16qu)_mm_loadu_si128(__p); + __mask = (__v16qu)vec_cmpgt ((__v16qu)__B, (__v16qu)__hibit); + __tmp = vec_sel (__tmp, (__v16qu)__A, __mask); + _mm_storeu_si128 (__p, (__m128i)__tmp); } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -2196,26 +2196,26 @@ _mm_avg_epu16 (__m128i __A, __m128i __B) extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sad_epu8 (__m128i __A, __m128i __B) { - __v16qu a, b; - __v16qu vabsdiff; - __v4si vsum; - const __v4su zero = { 0, 0, 0, 0 }; - __v4si result; + __v16qu __a, __b; + __v16qu __vabsdiff; + __v4si __vsum; + const __v4su __zero = { 0, 0, 0, 0 }; + __v4si __result; - a = (__v16qu) __A; - b = (__v16qu) __B; + __a = (__v16qu) __A; + __b = (__v16qu) __B; #ifndef _ARCH_PWR9 - __v16qu vmin = vec_min (a, b); - __v16qu vmax = vec_max (a, b); - vabsdiff = vec_sub (vmax, vmin); + __v16qu __vmin = vec_min (__a, __b); + __v16qu __vmax = vec_max (__a, __b); + __vabsdiff = vec_sub (__vmax, __vmin); #else - vabsdiff = vec_absd (a, b); + __vabsdiff = vec_absd (__a, __b); #endif /* Sum four groups of bytes into integers. */ - vsum = (__vector signed int) vec_sum4s (vabsdiff, zero); + __vsum = (__vector signed int) vec_sum4s (__vabsdiff, __zero); #ifdef __LITTLE_ENDIAN__ /* Sum across four integers with two integer results. */ - __asm__ ("vsum2sws %0,%1,%2" : "=v" (result) : "v" (vsum), "v" (zero)); + __asm__ ("vsum2sws %0,%1,%2" : "=v" (__result) : "v" (__vsum), "v" (__zero)); /* Note: vec_sum2s could be used here, but on little-endian, vector shifts are added that are not needed for this use-case. A vector shift to correctly position the 32-bit integer results @@ -2224,11 +2224,11 @@ _mm_sad_epu8 (__m128i __A, __m128i __B) integers ([1]|[0] and [3]|[2]). Thus, no shift is performed. */ #else /* Sum across four integers with two integer results. */ - result = vec_sum2s (vsum, (__vector signed int) zero); + __result = vec_sum2s (__vsum, (__vector signed int) __zero); /* Rotate the sums into the correct position. */ - result = vec_sld (result, result, 6); + __result = vec_sld (__result, __result, 6); #endif - return (__m128i) result; + return (__m128i) __result; } extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) diff --git a/gcc/config/rs6000/mm_malloc.h b/gcc/config/rs6000/mm_malloc.h index 3d2e09e..721f756 100644 --- a/gcc/config/rs6000/mm_malloc.h +++ b/gcc/config/rs6000/mm_malloc.h @@ -35,28 +35,28 @@ extern "C" int posix_memalign (void **, size_t, size_t) throw (); #endif static __inline void * -_mm_malloc (size_t size, size_t alignment) +_mm_malloc (size_t __size, size_t __alignment) { /* PowerPC64 ELF V2 ABI requires quadword alignment. */ - size_t vec_align = sizeof (__vector float); + size_t __vec_align = sizeof (__vector float); /* Linux GLIBC malloc alignment is at least 2 X ptr size. 
*/ - size_t malloc_align = (sizeof (void *) + sizeof (void *)); - void *ptr; - - if (alignment == malloc_align && alignment == vec_align) - return malloc (size); - if (alignment < vec_align) - alignment = vec_align; - if (posix_memalign (&ptr, alignment, size) == 0) - return ptr; + size_t __malloc_align = (sizeof (void *) + sizeof (void *)); + void *__ptr; + + if (__alignment == __malloc_align && __alignment == __vec_align) + return malloc (__size); + if (__alignment < __vec_align) + __alignment = __vec_align; + if (__posix_memalign (&__ptr, __alignment, __size) == 0) + return __ptr; else return NULL; } static __inline void -_mm_free (void * ptr) +_mm_free (void * __ptr) { - free (ptr); + free (__ptr); } #endif /* _MM_MALLOC_H_INCLUDED */ diff --git a/gcc/config/rs6000/mmintrin.h b/gcc/config/rs6000/mmintrin.h index da4f7d5..bf7f3b1 100644 --- a/gcc/config/rs6000/mmintrin.h +++ b/gcc/config/rs6000/mmintrin.h @@ -170,17 +170,17 @@ _mm_cvtsi64_si64x (__m64 __i) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_packs_pi16 (__m64 __m1, __m64 __m2) { - __vector signed short vm1; - __vector signed char vresult; + __vector signed short __vm1; + __vector signed char __vresult; - vm1 = (__vector signed short) (__vector unsigned long long) + __vm1 = (__vector signed short) (__vector unsigned long long) #ifdef __LITTLE_ENDIAN__ { __m1, __m2 }; #else { __m2, __m1 }; #endif - vresult = vec_packs (vm1, vm1); - return (__m64) ((__vector long long) vresult)[0]; + __vresult = vec_packs (__vm1, __vm1); + return (__m64) ((__vector long long) __vresult)[0]; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -195,17 +195,17 @@ _m_packsswb (__m64 __m1, __m64 __m2) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_packs_pi32 (__m64 __m1, __m64 __m2) { - __vector signed int vm1; - __vector signed short vresult; + __vector signed int __vm1; + __vector signed short __vresult; - vm1 = (__vector signed int) (__vector unsigned long long) + __vm1 = (__vector signed int) (__vector unsigned long long) #ifdef __LITTLE_ENDIAN__ { __m1, __m2 }; #else { __m2, __m1 }; #endif - vresult = vec_packs (vm1, vm1); - return (__m64) ((__vector long long) vresult)[0]; + __vresult = vec_packs (__vm1, __vm1); + return (__m64) ((__vector long long) __vresult)[0]; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -220,19 +220,19 @@ _m_packssdw (__m64 __m1, __m64 __m2) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_packs_pu16 (__m64 __m1, __m64 __m2) { - __vector unsigned char r; - __vector signed short vm1 = (__vector signed short) (__vector long long) + __vector unsigned char __r; + __vector signed short __vm1 = (__vector signed short) (__vector long long) #ifdef __LITTLE_ENDIAN__ { __m1, __m2 }; #else { __m2, __m1 }; #endif const __vector signed short __zero = { 0 }; - __vector __bool short __select = vec_cmplt (vm1, __zero); - r = vec_packs ((__vector unsigned short) vm1, (__vector unsigned short) vm1); - __vector __bool char packsel = vec_pack (__select, __select); - r = vec_sel (r, (const __vector unsigned char) __zero, packsel); - return (__m64) ((__vector long long) r)[0]; + __vector __bool short __select = vec_cmplt (__vm1, __zero); + __r = vec_packs ((__vector unsigned short) __vm1, (__vector unsigned short) __vm1); + __vector __bool char __packsel = vec_pack (__select, __select); + __r = vec_sel (__r, (const 
__vector unsigned char) __zero, __packsel); + return (__m64) ((__vector long long) __r)[0]; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -248,28 +248,28 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artifi _mm_unpackhi_pi8 (__m64 __m1, __m64 __m2) { #if _ARCH_PWR8 - __vector unsigned char a, b, c; + __vector unsigned char __a, __b, __c; - a = (__vector unsigned char)vec_splats (__m1); - b = (__vector unsigned char)vec_splats (__m2); - c = vec_mergel (a, b); - return (__m64) ((__vector long long) c)[1]; + __a = (__vector unsigned char)vec_splats (__m1); + __b = (__vector unsigned char)vec_splats (__m2); + __c = vec_mergel (__a, __b); + return (__m64) ((__vector long long) __c)[1]; #else - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_char[0] = m1.as_char[4]; - res.as_char[1] = m2.as_char[4]; - res.as_char[2] = m1.as_char[5]; - res.as_char[3] = m2.as_char[5]; - res.as_char[4] = m1.as_char[6]; - res.as_char[5] = m2.as_char[6]; - res.as_char[6] = m1.as_char[7]; - res.as_char[7] = m2.as_char[7]; + __res.as_char[0] = __mu1.as_char[4]; + __res.as_char[1] = __mu2.as_char[4]; + __res.as_char[2] = __mu1.as_char[5]; + __res.as_char[3] = __mu2.as_char[5]; + __res.as_char[4] = __mu1.as_char[6]; + __res.as_char[5] = __mu2.as_char[6]; + __res.as_char[6] = __mu1.as_char[7]; + __res.as_char[7] = __mu2.as_char[7]; - return (__m64) res.as_m64; + return (__m64) __res.as_m64; #endif } @@ -284,17 +284,17 @@ _m_punpckhbw (__m64 __m1, __m64 __m2) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_unpackhi_pi16 (__m64 __m1, __m64 __m2) { - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_short[0] = m1.as_short[2]; - res.as_short[1] = m2.as_short[2]; - res.as_short[2] = m1.as_short[3]; - res.as_short[3] = m2.as_short[3]; + __res.as_short[0] = __mu1.as_short[2]; + __res.as_short[1] = __mu2.as_short[2]; + __res.as_short[2] = __mu1.as_short[3]; + __res.as_short[3] = __mu2.as_short[3]; - return (__m64) res.as_m64; + return (__m64) __res.as_m64; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -307,15 +307,15 @@ _m_punpckhwd (__m64 __m1, __m64 __m2) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_unpackhi_pi32 (__m64 __m1, __m64 __m2) { - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_int[0] = m1.as_int[1]; - res.as_int[1] = m2.as_int[1]; + __res.as_int[0] = __mu1.as_int[1]; + __res.as_int[1] = __mu2.as_int[1]; - return (__m64) res.as_m64; + return (__m64) __res.as_m64; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -329,28 +329,28 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artifi _mm_unpacklo_pi8 (__m64 __m1, __m64 __m2) { #if _ARCH_PWR8 - __vector unsigned char a, b, c; + __vector unsigned char __a, __b, __c; - a = (__vector unsigned char)vec_splats (__m1); - b = (__vector unsigned char)vec_splats (__m2); - c = vec_mergel (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector unsigned char)vec_splats (__m1); + __b = (__vector unsigned char)vec_splats (__m2); + __c = vec_mergel (__a, __b); + return 
(__m64) ((__vector long long) __c)[0]; #else - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_char[0] = m1.as_char[0]; - res.as_char[1] = m2.as_char[0]; - res.as_char[2] = m1.as_char[1]; - res.as_char[3] = m2.as_char[1]; - res.as_char[4] = m1.as_char[2]; - res.as_char[5] = m2.as_char[2]; - res.as_char[6] = m1.as_char[3]; - res.as_char[7] = m2.as_char[3]; + __res.as_char[0] = __mu1.as_char[0]; + __res.as_char[1] = __mu2.as_char[0]; + __res.as_char[2] = __mu1.as_char[1]; + __res.as_char[3] = __mu2.as_char[1]; + __res.as_char[4] = __mu1.as_char[2]; + __res.as_char[5] = __mu2.as_char[2]; + __res.as_char[6] = __mu1.as_char[3]; + __res.as_char[7] = __mu2.as_char[3]; - return (__m64) res.as_m64; + return (__m64) __res.as_m64; #endif } @@ -364,17 +364,17 @@ _m_punpcklbw (__m64 __m1, __m64 __m2) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_unpacklo_pi16 (__m64 __m1, __m64 __m2) { - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_short[0] = m1.as_short[0]; - res.as_short[1] = m2.as_short[0]; - res.as_short[2] = m1.as_short[1]; - res.as_short[3] = m2.as_short[1]; + __res.as_short[0] = __mu1.as_short[0]; + __res.as_short[1] = __mu2.as_short[0]; + __res.as_short[2] = __mu1.as_short[1]; + __res.as_short[3] = __mu2.as_short[1]; - return (__m64) res.as_m64; + return (__m64) __res.as_m64; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -388,15 +388,15 @@ _m_punpcklwd (__m64 __m1, __m64 __m2) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_unpacklo_pi32 (__m64 __m1, __m64 __m2) { - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_int[0] = m1.as_int[0]; - res.as_int[1] = m2.as_int[0]; + __res.as_int[0] = __mu1.as_int[0]; + __res.as_int[1] = __mu2.as_int[0]; - return (__m64) res.as_m64; + return (__m64) __res.as_m64; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -410,28 +410,28 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artifi _mm_add_pi8 (__m64 __m1, __m64 __m2) { #if _ARCH_PWR8 - __vector signed char a, b, c; + __vector signed char __a, __b, __c; - a = (__vector signed char)vec_splats (__m1); - b = (__vector signed char)vec_splats (__m2); - c = vec_add (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector signed char)vec_splats (__m1); + __b = (__vector signed char)vec_splats (__m2); + __c = vec_add (__a, __b); + return (__m64) ((__vector long long) __c)[0]; #else - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_char[0] = m1.as_char[0] + m2.as_char[0]; - res.as_char[1] = m1.as_char[1] + m2.as_char[1]; - res.as_char[2] = m1.as_char[2] + m2.as_char[2]; - res.as_char[3] = m1.as_char[3] + m2.as_char[3]; - res.as_char[4] = m1.as_char[4] + m2.as_char[4]; - res.as_char[5] = m1.as_char[5] + m2.as_char[5]; - res.as_char[6] = m1.as_char[6] + m2.as_char[6]; - res.as_char[7] = m1.as_char[7] + m2.as_char[7]; + __res.as_char[0] = __mu1.as_char[0] + __mu2.as_char[0]; + __res.as_char[1] = __mu1.as_char[1] + __mu2.as_char[1]; + __res.as_char[2] = __mu1.as_char[2] + 
__mu2.as_char[2]; + __res.as_char[3] = __mu1.as_char[3] + __mu2.as_char[3]; + __res.as_char[4] = __mu1.as_char[4] + __mu2.as_char[4]; + __res.as_char[5] = __mu1.as_char[5] + __mu2.as_char[5]; + __res.as_char[6] = __mu1.as_char[6] + __mu2.as_char[6]; + __res.as_char[7] = __mu1.as_char[7] + __mu2.as_char[7]; - return (__m64) res.as_m64; + return (__m64) __res.as_m64; #endif } @@ -446,24 +446,24 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artifi _mm_add_pi16 (__m64 __m1, __m64 __m2) { #if _ARCH_PWR8 - __vector signed short a, b, c; + __vector signed short __a, __b, __c; - a = (__vector signed short)vec_splats (__m1); - b = (__vector signed short)vec_splats (__m2); - c = vec_add (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector signed short)vec_splats (__m1); + __b = (__vector signed short)vec_splats (__m2); + __c = vec_add (__a, __b); + return (__m64) ((__vector long long) __c)[0]; #else - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_short[0] = m1.as_short[0] + m2.as_short[0]; - res.as_short[1] = m1.as_short[1] + m2.as_short[1]; - res.as_short[2] = m1.as_short[2] + m2.as_short[2]; - res.as_short[3] = m1.as_short[3] + m2.as_short[3]; + __res.as_short[0] = __mu1.as_short[0] + __mu2.as_short[0]; + __res.as_short[1] = __mu1.as_short[1] + __mu2.as_short[1]; + __res.as_short[2] = __mu1.as_short[2] + __mu2.as_short[2]; + __res.as_short[3] = __mu1.as_short[3] + __mu2.as_short[3]; - return (__m64) res.as_m64; + return (__m64) __res.as_m64; #endif } @@ -478,22 +478,22 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artifi _mm_add_pi32 (__m64 __m1, __m64 __m2) { #if _ARCH_PWR9 - __vector signed int a, b, c; + __vector signed int __a, __b, __c; - a = (__vector signed int)vec_splats (__m1); - b = (__vector signed int)vec_splats (__m2); - c = vec_add (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector signed int)vec_splats (__m1); + __b = (__vector signed int)vec_splats (__m2); + __c = vec_add (__a, __b); + return (__m64) ((__vector long long) __c)[0]; #else - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_int[0] = m1.as_int[0] + m2.as_int[0]; - res.as_int[1] = m1.as_int[1] + m2.as_int[1]; + __res.as_int[0] = __mu1.as_int[0] + __mu2.as_int[0]; + __res.as_int[1] = __mu1.as_int[1] + __mu2.as_int[1]; - return (__m64) res.as_m64; + return (__m64) __res.as_m64; #endif } @@ -508,28 +508,28 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artifi _mm_sub_pi8 (__m64 __m1, __m64 __m2) { #if _ARCH_PWR8 - __vector signed char a, b, c; + __vector signed char __a, __b, __c; - a = (__vector signed char)vec_splats (__m1); - b = (__vector signed char)vec_splats (__m2); - c = vec_sub (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector signed char)vec_splats (__m1); + __b = (__vector signed char)vec_splats (__m2); + __c = vec_sub (__a, __b); + return (__m64) ((__vector long long) __c)[0]; #else - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_char[0] = m1.as_char[0] - m2.as_char[0]; - res.as_char[1] = m1.as_char[1] - m2.as_char[1]; - res.as_char[2] = m1.as_char[2] - m2.as_char[2]; - res.as_char[3] = m1.as_char[3] - m2.as_char[3]; - res.as_char[4] = 
m1.as_char[4] - m2.as_char[4]; - res.as_char[5] = m1.as_char[5] - m2.as_char[5]; - res.as_char[6] = m1.as_char[6] - m2.as_char[6]; - res.as_char[7] = m1.as_char[7] - m2.as_char[7]; + __res.as_char[0] = __mu1.as_char[0] - __mu2.as_char[0]; + __res.as_char[1] = __mu1.as_char[1] - __mu2.as_char[1]; + __res.as_char[2] = __mu1.as_char[2] - __mu2.as_char[2]; + __res.as_char[3] = __mu1.as_char[3] - __mu2.as_char[3]; + __res.as_char[4] = __mu1.as_char[4] - __mu2.as_char[4]; + __res.as_char[5] = __mu1.as_char[5] - __mu2.as_char[5]; + __res.as_char[6] = __mu1.as_char[6] - __mu2.as_char[6]; + __res.as_char[7] = __mu1.as_char[7] - __mu2.as_char[7]; - return (__m64) res.as_m64; + return (__m64) __res.as_m64; #endif } @@ -544,24 +544,24 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artifi _mm_sub_pi16 (__m64 __m1, __m64 __m2) { #if _ARCH_PWR8 - __vector signed short a, b, c; + __vector signed short __a, __b, __c; - a = (__vector signed short)vec_splats (__m1); - b = (__vector signed short)vec_splats (__m2); - c = vec_sub (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector signed short)vec_splats (__m1); + __b = (__vector signed short)vec_splats (__m2); + __c = vec_sub (__a, __b); + return (__m64) ((__vector long long) __c)[0]; #else - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_short[0] = m1.as_short[0] - m2.as_short[0]; - res.as_short[1] = m1.as_short[1] - m2.as_short[1]; - res.as_short[2] = m1.as_short[2] - m2.as_short[2]; - res.as_short[3] = m1.as_short[3] - m2.as_short[3]; + __res.as_short[0] = __mu1.as_short[0] - __mu2.as_short[0]; + __res.as_short[1] = __mu1.as_short[1] - __mu2.as_short[1]; + __res.as_short[2] = __mu1.as_short[2] - __mu2.as_short[2]; + __res.as_short[3] = __mu1.as_short[3] - __mu2.as_short[3]; - return (__m64) res.as_m64; + return (__m64) __res.as_m64; #endif } @@ -576,22 +576,22 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artifi _mm_sub_pi32 (__m64 __m1, __m64 __m2) { #if _ARCH_PWR9 - __vector signed int a, b, c; + __vector signed int __a, __b, __c; - a = (__vector signed int)vec_splats (__m1); - b = (__vector signed int)vec_splats (__m2); - c = vec_sub (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector signed int)vec_splats (__m1); + __b = (__vector signed int)vec_splats (__m2); + __c = vec_sub (__a, __b); + return (__m64) ((__vector long long) __c)[0]; #else - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_int[0] = m1.as_int[0] - m2.as_int[0]; - res.as_int[1] = m1.as_int[1] - m2.as_int[1]; + __res.as_int[0] = __mu1.as_int[0] - __mu2.as_int[0]; + __res.as_int[1] = __mu1.as_int[1] - __mu2.as_int[1]; - return (__m64) res.as_m64; + return (__m64) __res.as_m64; #endif } @@ -729,30 +729,30 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artifi _mm_cmpeq_pi8 (__m64 __m1, __m64 __m2) { #if defined(_ARCH_PWR6) && defined(__powerpc64__) - __m64 res; + __m64 __res; __asm__( "cmpb %0,%1,%2;\n" - : "=r" (res) + : "=r" (__res) : "r" (__m1), "r" (__m2) : ); - return (res); + return (__res); #else - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_char[0] = (m1.as_char[0] == m2.as_char[0])? 
-1: 0; - res.as_char[1] = (m1.as_char[1] == m2.as_char[1])? -1: 0; - res.as_char[2] = (m1.as_char[2] == m2.as_char[2])? -1: 0; - res.as_char[3] = (m1.as_char[3] == m2.as_char[3])? -1: 0; - res.as_char[4] = (m1.as_char[4] == m2.as_char[4])? -1: 0; - res.as_char[5] = (m1.as_char[5] == m2.as_char[5])? -1: 0; - res.as_char[6] = (m1.as_char[6] == m2.as_char[6])? -1: 0; - res.as_char[7] = (m1.as_char[7] == m2.as_char[7])? -1: 0; + __res.as_char[0] = (__mu1.as_char[0] == __mu2.as_char[0])? -1: 0; + __res.as_char[1] = (__mu1.as_char[1] == __mu2.as_char[1])? -1: 0; + __res.as_char[2] = (__mu1.as_char[2] == __mu2.as_char[2])? -1: 0; + __res.as_char[3] = (__mu1.as_char[3] == __mu2.as_char[3])? -1: 0; + __res.as_char[4] = (__mu1.as_char[4] == __mu2.as_char[4])? -1: 0; + __res.as_char[5] = (__mu1.as_char[5] == __mu2.as_char[5])? -1: 0; + __res.as_char[6] = (__mu1.as_char[6] == __mu2.as_char[6])? -1: 0; + __res.as_char[7] = (__mu1.as_char[7] == __mu2.as_char[7])? -1: 0; - return (__m64) res.as_m64; + return (__m64) __res.as_m64; #endif } @@ -766,28 +766,28 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artifi _mm_cmpgt_pi8 (__m64 __m1, __m64 __m2) { #if _ARCH_PWR8 - __vector signed char a, b, c; + __vector signed char __a, __b, __c; - a = (__vector signed char)vec_splats (__m1); - b = (__vector signed char)vec_splats (__m2); - c = (__vector signed char)vec_cmpgt (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector signed char)vec_splats (__m1); + __b = (__vector signed char)vec_splats (__m2); + __c = (__vector signed char)vec_cmpgt (__a, __b); + return (__m64) ((__vector long long) __c)[0]; #else - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_char[0] = (m1.as_char[0] > m2.as_char[0])? -1: 0; - res.as_char[1] = (m1.as_char[1] > m2.as_char[1])? -1: 0; - res.as_char[2] = (m1.as_char[2] > m2.as_char[2])? -1: 0; - res.as_char[3] = (m1.as_char[3] > m2.as_char[3])? -1: 0; - res.as_char[4] = (m1.as_char[4] > m2.as_char[4])? -1: 0; - res.as_char[5] = (m1.as_char[5] > m2.as_char[5])? -1: 0; - res.as_char[6] = (m1.as_char[6] > m2.as_char[6])? -1: 0; - res.as_char[7] = (m1.as_char[7] > m2.as_char[7])? -1: 0; + __res.as_char[0] = (__mu1.as_char[0] > __mu2.as_char[0])? -1: 0; + __res.as_char[1] = (__mu1.as_char[1] > __mu2.as_char[1])? -1: 0; + __res.as_char[2] = (__mu1.as_char[2] > __mu2.as_char[2])? -1: 0; + __res.as_char[3] = (__mu1.as_char[3] > __mu2.as_char[3])? -1: 0; + __res.as_char[4] = (__mu1.as_char[4] > __mu2.as_char[4])? -1: 0; + __res.as_char[5] = (__mu1.as_char[5] > __mu2.as_char[5])? -1: 0; + __res.as_char[6] = (__mu1.as_char[6] > __mu2.as_char[6])? -1: 0; + __res.as_char[7] = (__mu1.as_char[7] > __mu2.as_char[7])? 
-1: 0; - return (__m64) res.as_m64; + return (__m64) __res.as_m64; #endif } @@ -803,24 +803,24 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artifi _mm_cmpeq_pi16 (__m64 __m1, __m64 __m2) { #if _ARCH_PWR8 - __vector signed short a, b, c; + __vector signed short __a, __b, __c; - a = (__vector signed short)vec_splats (__m1); - b = (__vector signed short)vec_splats (__m2); - c = (__vector signed short)vec_cmpeq (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector signed short)vec_splats (__m1); + __b = (__vector signed short)vec_splats (__m2); + __c = (__vector signed short)vec_cmpeq (__a, __b); + return (__m64) ((__vector long long) __c)[0]; #else - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_short[0] = (m1.as_short[0] == m2.as_short[0])? -1: 0; - res.as_short[1] = (m1.as_short[1] == m2.as_short[1])? -1: 0; - res.as_short[2] = (m1.as_short[2] == m2.as_short[2])? -1: 0; - res.as_short[3] = (m1.as_short[3] == m2.as_short[3])? -1: 0; + __res.as_short[0] = (__mu1.as_short[0] == __mu2.as_short[0])? -1: 0; + __res.as_short[1] = (__mu1.as_short[1] == __mu2.as_short[1])? -1: 0; + __res.as_short[2] = (__mu1.as_short[2] == __mu2.as_short[2])? -1: 0; + __res.as_short[3] = (__mu1.as_short[3] == __mu2.as_short[3])? -1: 0; - return (__m64) res.as_m64; + return (__m64) __res.as_m64; #endif } @@ -834,24 +834,24 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artifi _mm_cmpgt_pi16 (__m64 __m1, __m64 __m2) { #if _ARCH_PWR8 - __vector signed short a, b, c; + __vector signed short __a, __b, __c; - a = (__vector signed short)vec_splats (__m1); - b = (__vector signed short)vec_splats (__m2); - c = (__vector signed short)vec_cmpgt (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector signed short)vec_splats (__m1); + __b = (__vector signed short)vec_splats (__m2); + __c = (__vector signed short)vec_cmpgt (__a, __b); + return (__m64) ((__vector long long) __c)[0]; #else - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_short[0] = (m1.as_short[0] > m2.as_short[0])? -1: 0; - res.as_short[1] = (m1.as_short[1] > m2.as_short[1])? -1: 0; - res.as_short[2] = (m1.as_short[2] > m2.as_short[2])? -1: 0; - res.as_short[3] = (m1.as_short[3] > m2.as_short[3])? -1: 0; + __res.as_short[0] = (__mu1.as_short[0] > __mu2.as_short[0])? -1: 0; + __res.as_short[1] = (__mu1.as_short[1] > __mu2.as_short[1])? -1: 0; + __res.as_short[2] = (__mu1.as_short[2] > __mu2.as_short[2])? -1: 0; + __res.as_short[3] = (__mu1.as_short[3] > __mu2.as_short[3])? 
-1: 0; - return (__m64) res.as_m64; + return (__m64) __res.as_m64; #endif } @@ -867,22 +867,22 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artifi _mm_cmpeq_pi32 (__m64 __m1, __m64 __m2) { #if _ARCH_PWR9 - __vector signed int a, b, c; + __vector signed int __a, __b, __c; - a = (__vector signed int)vec_splats (__m1); - b = (__vector signed int)vec_splats (__m2); - c = (__vector signed int)vec_cmpeq (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector signed int)vec_splats (__m1); + __b = (__vector signed int)vec_splats (__m2); + __c = (__vector signed int)vec_cmpeq (__a, __b); + return (__m64) ((__vector long long) __c)[0]; #else - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_int[0] = (m1.as_int[0] == m2.as_int[0])? -1: 0; - res.as_int[1] = (m1.as_int[1] == m2.as_int[1])? -1: 0; + __res.as_int[0] = (__mu1.as_int[0] == __mu2.as_int[0])? -1: 0; + __res.as_int[1] = (__mu1.as_int[1] == __mu2.as_int[1])? -1: 0; - return (__m64) res.as_m64; + return (__m64) __res.as_m64; #endif } @@ -896,22 +896,22 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artifi _mm_cmpgt_pi32 (__m64 __m1, __m64 __m2) { #if _ARCH_PWR9 - __vector signed int a, b, c; + __vector signed int __a, __b, __c; - a = (__vector signed int)vec_splats (__m1); - b = (__vector signed int)vec_splats (__m2); - c = (__vector signed int)vec_cmpgt (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector signed int)vec_splats (__m1); + __b = (__vector signed int)vec_splats (__m2); + __c = (__vector signed int)vec_cmpgt (__a, __b); + return (__m64) ((__vector long long) __c)[0]; #else - __m64_union m1, m2, res; + __m64_union __mu1, __mu2, __res; - m1.as_m64 = __m1; - m2.as_m64 = __m2; + __mu1.as_m64 = __m1; + __mu2.as_m64 = __m2; - res.as_int[0] = (m1.as_int[0] > m2.as_int[0])? -1: 0; - res.as_int[1] = (m1.as_int[1] > m2.as_int[1])? -1: 0; + __res.as_int[0] = (__mu1.as_int[0] > __mu2.as_int[0])? -1: 0; + __res.as_int[1] = (__mu1.as_int[1] > __mu2.as_int[1])? 
-1: 0; - return (__m64) res.as_m64; + return (__m64) __res.as_m64; #endif } @@ -927,12 +927,12 @@ _m_pcmpgtd (__m64 __m1, __m64 __m2) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_adds_pi8 (__m64 __m1, __m64 __m2) { - __vector signed char a, b, c; + __vector signed char __a, __b, __c; - a = (__vector signed char)vec_splats (__m1); - b = (__vector signed char)vec_splats (__m2); - c = vec_adds (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector signed char)vec_splats (__m1); + __b = (__vector signed char)vec_splats (__m2); + __c = vec_adds (__a, __b); + return (__m64) ((__vector long long) __c)[0]; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -945,12 +945,12 @@ _m_paddsb (__m64 __m1, __m64 __m2) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_adds_pi16 (__m64 __m1, __m64 __m2) { - __vector signed short a, b, c; + __vector signed short __a, __b, __c; - a = (__vector signed short)vec_splats (__m1); - b = (__vector signed short)vec_splats (__m2); - c = vec_adds (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector signed short)vec_splats (__m1); + __b = (__vector signed short)vec_splats (__m2); + __c = vec_adds (__a, __b); + return (__m64) ((__vector long long) __c)[0]; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -963,12 +963,12 @@ _m_paddsw (__m64 __m1, __m64 __m2) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_adds_pu8 (__m64 __m1, __m64 __m2) { - __vector unsigned char a, b, c; + __vector unsigned char __a, __b, __c; - a = (__vector unsigned char)vec_splats (__m1); - b = (__vector unsigned char)vec_splats (__m2); - c = vec_adds (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector unsigned char)vec_splats (__m1); + __b = (__vector unsigned char)vec_splats (__m2); + __c = vec_adds (__a, __b); + return (__m64) ((__vector long long) __c)[0]; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -982,12 +982,12 @@ _m_paddusb (__m64 __m1, __m64 __m2) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_adds_pu16 (__m64 __m1, __m64 __m2) { - __vector unsigned short a, b, c; + __vector unsigned short __a, __b, __c; - a = (__vector unsigned short)vec_splats (__m1); - b = (__vector unsigned short)vec_splats (__m2); - c = vec_adds (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector unsigned short)vec_splats (__m1); + __b = (__vector unsigned short)vec_splats (__m2); + __c = vec_adds (__a, __b); + return (__m64) ((__vector long long) __c)[0]; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1001,12 +1001,12 @@ _m_paddusw (__m64 __m1, __m64 __m2) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_subs_pi8 (__m64 __m1, __m64 __m2) { - __vector signed char a, b, c; + __vector signed char __a, __b, __c; - a = (__vector signed char)vec_splats (__m1); - b = (__vector signed char)vec_splats (__m2); - c = vec_subs (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector signed char)vec_splats (__m1); + __b = (__vector signed char)vec_splats (__m2); + __c = vec_subs (__a, __b); + return (__m64) ((__vector long long) __c)[0]; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ 
-1020,12 +1020,12 @@ _m_psubsb (__m64 __m1, __m64 __m2) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_subs_pi16 (__m64 __m1, __m64 __m2) { - __vector signed short a, b, c; + __vector signed short __a, __b, __c; - a = (__vector signed short)vec_splats (__m1); - b = (__vector signed short)vec_splats (__m2); - c = vec_subs (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector signed short)vec_splats (__m1); + __b = (__vector signed short)vec_splats (__m2); + __c = vec_subs (__a, __b); + return (__m64) ((__vector long long) __c)[0]; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1039,12 +1039,12 @@ _m_psubsw (__m64 __m1, __m64 __m2) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_subs_pu8 (__m64 __m1, __m64 __m2) { - __vector unsigned char a, b, c; + __vector unsigned char __a, __b, __c; - a = (__vector unsigned char)vec_splats (__m1); - b = (__vector unsigned char)vec_splats (__m2); - c = vec_subs (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector unsigned char)vec_splats (__m1); + __b = (__vector unsigned char)vec_splats (__m2); + __c = vec_subs (__a, __b); + return (__m64) ((__vector long long) __c)[0]; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1058,12 +1058,12 @@ _m_psubusb (__m64 __m1, __m64 __m2) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_subs_pu16 (__m64 __m1, __m64 __m2) { - __vector unsigned short a, b, c; + __vector unsigned short __a, __b, __c; - a = (__vector unsigned short)vec_splats (__m1); - b = (__vector unsigned short)vec_splats (__m2); - c = vec_subs (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector unsigned short)vec_splats (__m1); + __b = (__vector unsigned short)vec_splats (__m2); + __c = vec_subs (__a, __b); + return (__m64) ((__vector long long) __c)[0]; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1078,14 +1078,14 @@ _m_psubusw (__m64 __m1, __m64 __m2) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_madd_pi16 (__m64 __m1, __m64 __m2) { - __vector signed short a, b; - __vector signed int c; - __vector signed int zero = {0, 0, 0, 0}; + __vector signed short __a, __b; + __vector signed int __c; + __vector signed int __zero = {0, 0, 0, 0}; - a = (__vector signed short)vec_splats (__m1); - b = (__vector signed short)vec_splats (__m2); - c = vec_vmsumshm (a, b, zero); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector signed short)vec_splats (__m1); + __b = (__vector signed short)vec_splats (__m2); + __c = vec_vmsumshm (__a, __b, __zero); + return (__m64) ((__vector long long) __c)[0]; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1098,10 +1098,10 @@ _m_pmaddwd (__m64 __m1, __m64 __m2) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mulhi_pi16 (__m64 __m1, __m64 __m2) { - __vector signed short a, b; - __vector signed short c; - __vector signed int w0, w1; - __vector unsigned char xform1 = { + __vector signed short __a, __b; + __vector signed short __c; + __vector signed int __w0, __w1; + __vector unsigned char __xform1 = { #ifdef __LITTLE_ENDIAN__ 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F @@ -1111,14 +1111,14 @@ _mm_mulhi_pi16 
(__m64 __m1, __m64 __m2) #endif }; - a = (__vector signed short)vec_splats (__m1); - b = (__vector signed short)vec_splats (__m2); + __a = (__vector signed short)vec_splats (__m1); + __b = (__vector signed short)vec_splats (__m2); - w0 = vec_vmulesh (a, b); - w1 = vec_vmulosh (a, b); - c = (__vector signed short)vec_perm (w0, w1, xform1); + __w0 = vec_vmulesh (__a, __b); + __w1 = vec_vmulosh (__a, __b); + __c = (__vector signed short)vec_perm (__w0, __w1, __xform1); - return (__m64) ((__vector long long) c)[0]; + return (__m64) ((__vector long long) __c)[0]; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1132,12 +1132,12 @@ _m_pmulhw (__m64 __m1, __m64 __m2) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mullo_pi16 (__m64 __m1, __m64 __m2) { - __vector signed short a, b, c; + __vector signed short __a, __b, __c; - a = (__vector signed short)vec_splats (__m1); - b = (__vector signed short)vec_splats (__m2); - c = a * b; - return (__m64) ((__vector long long) c)[0]; + __a = (__vector signed short)vec_splats (__m1); + __b = (__vector signed short)vec_splats (__m2); + __c = __a * __b; + return (__m64) ((__vector long long) __c)[0]; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1150,15 +1150,15 @@ _m_pmullw (__m64 __m1, __m64 __m2) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sll_pi16 (__m64 __m, __m64 __count) { - __vector signed short m, r; - __vector unsigned short c; + __vector signed short __r; + __vector unsigned short __c; if (__count <= 15) { - m = (__vector signed short)vec_splats (__m); - c = (__vector unsigned short)vec_splats ((unsigned short)__count); - r = vec_sl (m, (__vector unsigned short)c); - return (__m64) ((__vector long long) r)[0]; + __r = (__vector signed short)vec_splats (__m); + __c = (__vector unsigned short)vec_splats ((unsigned short)__count); + __r = vec_sl (__r, (__vector unsigned short)__c); + return (__m64) ((__vector long long) __r)[0]; } else return (0); @@ -1187,13 +1187,13 @@ _m_psllwi (__m64 __m, int __count) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sll_pi32 (__m64 __m, __m64 __count) { - __m64_union m, res; + __m64_union __res; - m.as_m64 = __m; + __res.as_m64 = __m; - res.as_int[0] = m.as_int[0] << __count; - res.as_int[1] = m.as_int[1] << __count; - return (res.as_m64); + __res.as_int[0] = __res.as_int[0] << __count; + __res.as_int[1] = __res.as_int[1] << __count; + return (__res.as_m64); } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1219,15 +1219,15 @@ _m_pslldi (__m64 __m, int __count) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sra_pi16 (__m64 __m, __m64 __count) { - __vector signed short m, r; - __vector unsigned short c; + __vector signed short __r; + __vector unsigned short __c; if (__count <= 15) { - m = (__vector signed short)vec_splats (__m); - c = (__vector unsigned short)vec_splats ((unsigned short)__count); - r = vec_sra (m, (__vector unsigned short)c); - return (__m64) ((__vector long long) r)[0]; + __r = (__vector signed short)vec_splats (__m); + __c = (__vector unsigned short)vec_splats ((unsigned short)__count); + __r = vec_sra (__r, (__vector unsigned short)__c); + return (__m64) ((__vector long long) __r)[0]; } else return (0); @@ -1256,13 +1256,13 @@ _m_psrawi (__m64 __m, int __count) extern __inline 
__m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sra_pi32 (__m64 __m, __m64 __count) { - __m64_union m, res; + __m64_union __res; - m.as_m64 = __m; + __res.as_m64 = __m; - res.as_int[0] = m.as_int[0] >> __count; - res.as_int[1] = m.as_int[1] >> __count; - return (res.as_m64); + __res.as_int[0] = __res.as_int[0] >> __count; + __res.as_int[1] = __res.as_int[1] >> __count; + return (__res.as_m64); } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1288,15 +1288,15 @@ _m_psradi (__m64 __m, int __count) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_srl_pi16 (__m64 __m, __m64 __count) { - __vector unsigned short m, r; - __vector unsigned short c; + __vector unsigned short __r; + __vector unsigned short __c; if (__count <= 15) { - m = (__vector unsigned short)vec_splats (__m); - c = (__vector unsigned short)vec_splats ((unsigned short)__count); - r = vec_sr (m, (__vector unsigned short)c); - return (__m64) ((__vector long long) r)[0]; + __r = (__vector unsigned short)vec_splats (__m); + __c = (__vector unsigned short)vec_splats ((unsigned short)__count); + __r = vec_sr (__r, (__vector unsigned short)__c); + return (__m64) ((__vector long long) __r)[0]; } else return (0); @@ -1325,13 +1325,13 @@ _m_psrlwi (__m64 __m, int __count) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_srl_pi32 (__m64 __m, __m64 __count) { - __m64_union m, res; + __m64_union __res; - m.as_m64 = __m; + __res.as_m64 = __m; - res.as_int[0] = (unsigned int)m.as_int[0] >> __count; - res.as_int[1] = (unsigned int)m.as_int[1] >> __count; - return (res.as_m64); + __res.as_int[0] = (unsigned int)__res.as_int[0] >> __count; + __res.as_int[1] = (unsigned int)__res.as_int[1] >> __count; + return (__res.as_m64); } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1358,24 +1358,24 @@ _m_psrldi (__m64 __m, int __count) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_set_pi32 (int __i1, int __i0) { - __m64_union res; + __m64_union __res; - res.as_int[0] = __i0; - res.as_int[1] = __i1; - return (res.as_m64); + __res.as_int[0] = __i0; + __res.as_int[1] = __i1; + return (__res.as_m64); } /* Creates a vector of four 16-bit values; W0 is least significant. */ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_set_pi16 (short __w3, short __w2, short __w1, short __w0) { - __m64_union res; + __m64_union __res; - res.as_short[0] = __w0; - res.as_short[1] = __w1; - res.as_short[2] = __w2; - res.as_short[3] = __w3; - return (res.as_m64); + __res.as_short[0] = __w0; + __res.as_short[1] = __w1; + __res.as_short[2] = __w2; + __res.as_short[3] = __w3; + return (__res.as_m64); } /* Creates a vector of eight 8-bit values; B0 is least significant. 
*/ @@ -1383,28 +1383,28 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artifi _mm_set_pi8 (char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0) { - __m64_union res; + __m64_union __res; - res.as_char[0] = __b0; - res.as_char[1] = __b1; - res.as_char[2] = __b2; - res.as_char[3] = __b3; - res.as_char[4] = __b4; - res.as_char[5] = __b5; - res.as_char[6] = __b6; - res.as_char[7] = __b7; - return (res.as_m64); + __res.as_char[0] = __b0; + __res.as_char[1] = __b1; + __res.as_char[2] = __b2; + __res.as_char[3] = __b3; + __res.as_char[4] = __b4; + __res.as_char[5] = __b5; + __res.as_char[6] = __b6; + __res.as_char[7] = __b7; + return (__res.as_m64); } /* Similar, but with the arguments in reverse order. */ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_setr_pi32 (int __i0, int __i1) { - __m64_union res; + __m64_union __res; - res.as_int[0] = __i0; - res.as_int[1] = __i1; - return (res.as_m64); + __res.as_int[0] = __i0; + __res.as_int[1] = __i1; + return (__res.as_m64); } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1424,11 +1424,11 @@ _mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3, extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_set1_pi32 (int __i) { - __m64_union res; + __m64_union __res; - res.as_int[0] = __i; - res.as_int[1] = __i; - return (res.as_m64); + __res.as_int[0] = __i; + __res.as_int[1] = __i; + return (__res.as_m64); } /* Creates a vector of four 16-bit values, all elements containing W. */ @@ -1441,13 +1441,13 @@ _mm_set1_pi16 (short __w) w = (__vector signed short)vec_splats (__w); return (__m64) ((__vector long long) w)[0]; #else - __m64_union res; + __m64_union __res; - res.as_short[0] = __w; - res.as_short[1] = __w; - res.as_short[2] = __w; - res.as_short[3] = __w; - return (res.as_m64); + __res.as_short[0] = __w; + __res.as_short[1] = __w; + __res.as_short[2] = __w; + __res.as_short[3] = __w; + return (__res.as_m64); #endif } @@ -1456,22 +1456,22 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artifi _mm_set1_pi8 (signed char __b) { #if _ARCH_PWR8 - __vector signed char b; + __vector signed char __res; - b = (__vector signed char)vec_splats (__b); - return (__m64) ((__vector long long) b)[0]; + __res = (__vector signed char)vec_splats (__b); + return (__m64) ((__vector long long) __res)[0]; #else - __m64_union res; - - res.as_char[0] = __b; - res.as_char[1] = __b; - res.as_char[2] = __b; - res.as_char[3] = __b; - res.as_char[4] = __b; - res.as_char[5] = __b; - res.as_char[6] = __b; - res.as_char[7] = __b; - return (res.as_m64); + __m64_union __res; + + __res.as_char[0] = __b; + __res.as_char[1] = __b; + __res.as_char[2] = __b; + __res.as_char[3] = __b; + __res.as_char[4] = __b; + __res.as_char[5] = __b; + __res.as_char[6] = __b; + __res.as_char[7] = __b; + return (__res.as_m64); #endif } #endif /* _MMINTRIN_H_INCLUDED */ diff --git a/gcc/config/rs6000/pmmintrin.h b/gcc/config/rs6000/pmmintrin.h index bcbca15..e1b5426 100644 --- a/gcc/config/rs6000/pmmintrin.h +++ b/gcc/config/rs6000/pmmintrin.h @@ -58,55 +58,55 @@ extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_addsub_ps (__m128 __X, __m128 __Y) { - const __v4sf even_n0 = {-0.0, 0.0, -0.0, 0.0}; - __v4sf even_neg_Y = vec_xor(__Y, even_n0); - return (__m128) vec_add (__X, even_neg_Y); + const __v4sf __even_n0 = {-0.0, 0.0, -0.0, 0.0}; + __v4sf 
__even_neg_Y = vec_xor(__Y, __even_n0); + return (__m128) vec_add (__X, __even_neg_Y); } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_addsub_pd (__m128d __X, __m128d __Y) { - const __v2df even_n0 = {-0.0, 0.0}; - __v2df even_neg_Y = vec_xor(__Y, even_n0); - return (__m128d) vec_add (__X, even_neg_Y); + const __v2df __even_n0 = {-0.0, 0.0}; + __v2df __even_neg_Y = vec_xor(__Y, __even_n0); + return (__m128d) vec_add (__X, __even_neg_Y); } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_hadd_ps (__m128 __X, __m128 __Y) { - __vector unsigned char xform2 = { + __vector unsigned char __xform2 = { 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B }; - __vector unsigned char xform1 = { + __vector unsigned char __xform1 = { 0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F, 0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F }; - return (__m128) vec_add (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2), - vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1)); + return (__m128) vec_add (vec_perm ((__v4sf) __X, (__v4sf) __Y, __xform2), + vec_perm ((__v4sf) __X, (__v4sf) __Y, __xform1)); } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_hsub_ps (__m128 __X, __m128 __Y) { - __vector unsigned char xform2 = { + __vector unsigned char __xform2 = { 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B }; - __vector unsigned char xform1 = { + __vector unsigned char __xform1 = { 0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F, 0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F }; - return (__m128) vec_sub (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2), - vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1)); + return (__m128) vec_sub (vec_perm ((__v4sf) __X, (__v4sf) __Y, __xform2), + vec_perm ((__v4sf) __X, (__v4sf) __Y, __xform1)); } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) diff --git a/gcc/config/rs6000/smmintrin.h b/gcc/config/rs6000/smmintrin.h index cca2f7d..3628c88 100644 --- a/gcc/config/rs6000/smmintrin.h +++ b/gcc/config/rs6000/smmintrin.h @@ -273,31 +273,31 @@ _mm_round_ss (__m128 __A, __m128 __B, int __rounding) extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_insert_epi8 (__m128i const __A, int const __D, int const __N) { - __v16qi result = (__v16qi)__A; + __v16qi __result = (__v16qi)__A; - result [__N & 0xf] = __D; + __result [__N & 0xf] = __D; - return (__m128i) result; + return (__m128i) __result; } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_insert_epi32 (__m128i const __A, int const __D, int const __N) { - __v4si result = (__v4si)__A; + __v4si __result = (__v4si)__A; - result [__N & 3] = __D; + __result [__N & 3] = __D; - return (__m128i) result; + return (__m128i) __result; } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_insert_epi64 (__m128i const __A, long long const __D, int const __N) { - __v2di result = (__v2di)__A; + __v2di __result = (__v2di)__A; - result [__N & 1] = __D; + __result [__N & 1] = __D; - return (__m128i) result; + return (__m128i) __result; } extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) diff --git a/gcc/config/rs6000/tmmintrin.h b/gcc/config/rs6000/tmmintrin.h index c06a643..05b985b 100644 --- a/gcc/config/rs6000/tmmintrin.h +++ 
b/gcc/config/rs6000/tmmintrin.h @@ -112,8 +112,8 @@ _mm_alignr_epi8 (__m128i __A, __m128i __B, const unsigned int __count) { if (__count >= 32) { - const __v16qu zero = { 0 }; - return (__m128i) zero; + const __v16qu __zero = { 0 }; + return (__m128i) __zero; } else { diff --git a/gcc/config/rs6000/xmmintrin.h b/gcc/config/rs6000/xmmintrin.h index 5867431..c602011 100644 --- a/gcc/config/rs6000/xmmintrin.h +++ b/gcc/config/rs6000/xmmintrin.h @@ -127,14 +127,14 @@ extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artif _mm_loadr_ps (float const *__P) { __v4sf __tmp; - __m128 result; - static const __vector unsigned char permute_vector = + __m128 __result; + static const __vector unsigned char __permute_vector = { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16, 0x17, 0x10, 0x11, 0x12, 0x13 }; __tmp = vec_ld (0, (__v4sf *) __P); - result = (__m128) vec_perm (__tmp, __tmp, permute_vector); - return result; + __result = (__m128) vec_perm (__tmp, __tmp, __permute_vector); + return __result; } /* Create a vector with all four elements equal to F. */ @@ -184,11 +184,11 @@ extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artific _mm_storer_ps (float *__P, __m128 __A) { __v4sf __tmp; - static const __vector unsigned char permute_vector = + static const __vector unsigned char __permute_vector = { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16, 0x17, 0x10, 0x11, 0x12, 0x13 }; - __tmp = (__m128) vec_perm (__A, __A, permute_vector); + __tmp = (__m128) vec_perm (__A, __A, __permute_vector); _mm_store_ps (__P, __tmp); } @@ -218,9 +218,9 @@ _mm_set_ss (float __F) extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_move_ss (__m128 __A, __m128 __B) { - static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; - return (vec_sel ((__v4sf)__A, (__v4sf)__B, mask)); + return (vec_sel ((__v4sf)__A, (__v4sf)__B, __mask)); } /* Create a vector with element 0 as *P and the rest zero. */ @@ -245,18 +245,18 @@ extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artif _mm_add_ss (__m128 __A, __m128 __B) { #ifdef _ARCH_PWR7 - __m128 a, b, c; - static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + __m128 __a, __b, __c; + static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; /* PowerISA VSX does not allow partial (for just lower double) results. So to insure we don't generate spurious exceptions (from the upper double values) we splat the lower double before we to the operation. */ - a = vec_splat (__A, 0); - b = vec_splat (__B, 0); - c = a + b; + __a = vec_splat (__A, 0); + __b = vec_splat (__B, 0); + __c = __a + __b; /* Then we merge the lower float result with the original upper float elements from __A. */ - return (vec_sel (__A, c, mask)); + return (vec_sel (__A, __c, __mask)); #else __A[0] = __A[0] + __B[0]; return (__A); @@ -267,18 +267,18 @@ extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artif _mm_sub_ss (__m128 __A, __m128 __B) { #ifdef _ARCH_PWR7 - __m128 a, b, c; - static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + __m128 __a, __b, __c; + static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; /* PowerISA VSX does not allow partial (for just lower double) results. So to insure we don't generate spurious exceptions (from the upper double values) we splat the lower double before we to the operation. 
*/ - a = vec_splat (__A, 0); - b = vec_splat (__B, 0); - c = a - b; + __a = vec_splat (__A, 0); + __b = vec_splat (__B, 0); + __c = __a - __b; /* Then we merge the lower float result with the original upper float elements from __A. */ - return (vec_sel (__A, c, mask)); + return (vec_sel (__A, __c, __mask)); #else __A[0] = __A[0] - __B[0]; return (__A); @@ -289,18 +289,18 @@ extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artif _mm_mul_ss (__m128 __A, __m128 __B) { #ifdef _ARCH_PWR7 - __m128 a, b, c; - static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + __m128 __a, __b, __c; + static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; /* PowerISA VSX does not allow partial (for just lower double) results. So to insure we don't generate spurious exceptions (from the upper double values) we splat the lower double before we to the operation. */ - a = vec_splat (__A, 0); - b = vec_splat (__B, 0); - c = a * b; + __a = vec_splat (__A, 0); + __b = vec_splat (__B, 0); + __c = __a * __b; /* Then we merge the lower float result with the original upper float elements from __A. */ - return (vec_sel (__A, c, mask)); + return (vec_sel (__A, __c, __mask)); #else __A[0] = __A[0] * __B[0]; return (__A); @@ -311,18 +311,18 @@ extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artif _mm_div_ss (__m128 __A, __m128 __B) { #ifdef _ARCH_PWR7 - __m128 a, b, c; - static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + __m128 __a, __b, __c; + static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; /* PowerISA VSX does not allow partial (for just lower double) results. So to insure we don't generate spurious exceptions (from the upper double values) we splat the lower double before we to the operation. */ - a = vec_splat (__A, 0); - b = vec_splat (__B, 0); - c = a / b; + __a = vec_splat (__A, 0); + __b = vec_splat (__B, 0); + __c = __a / __b; /* Then we merge the lower float result with the original upper float elements from __A. */ - return (vec_sel (__A, c, mask)); + return (vec_sel (__A, __c, __mask)); #else __A[0] = __A[0] / __B[0]; return (__A); @@ -332,17 +332,17 @@ _mm_div_ss (__m128 __A, __m128 __B) extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sqrt_ss (__m128 __A) { - __m128 a, c; - static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + __m128 __a, __c; + static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; /* PowerISA VSX does not allow partial (for just lower double) * results. So to insure we don't generate spurious exceptions * (from the upper double values) we splat the lower double * before we to the operation. */ - a = vec_splat (__A, 0); - c = vec_sqrt (a); + __a = vec_splat (__A, 0); + __c = vec_sqrt (__a); /* Then we merge the lower float result with the original upper * float elements from __A. */ - return (vec_sel (__A, c, mask)); + return (vec_sel (__A, __c, __mask)); } /* Perform the respective operation on the four SPFP values in A and B. */ @@ -391,81 +391,81 @@ _mm_rsqrt_ps (__m128 __A) extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_rcp_ss (__m128 __A) { - __m128 a, c; - static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + __m128 __a, __c; + static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; /* PowerISA VSX does not allow partial (for just lower double) * results. 
So to insure we don't generate spurious exceptions * (from the upper double values) we splat the lower double * before we to the operation. */ - a = vec_splat (__A, 0); - c = _mm_rcp_ps (a); + __a = vec_splat (__A, 0); + __c = _mm_rcp_ps (__a); /* Then we merge the lower float result with the original upper * float elements from __A. */ - return (vec_sel (__A, c, mask)); + return (vec_sel (__A, __c, __mask)); } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_rsqrt_ss (__m128 __A) { - __m128 a, c; - static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + __m128 __a, __c; + static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; /* PowerISA VSX does not allow partial (for just lower double) * results. So to insure we don't generate spurious exceptions * (from the upper double values) we splat the lower double * before we to the operation. */ - a = vec_splat (__A, 0); - c = vec_rsqrte (a); + __a = vec_splat (__A, 0); + __c = vec_rsqrte (__a); /* Then we merge the lower float result with the original upper * float elements from __A. */ - return (vec_sel (__A, c, mask)); + return (vec_sel (__A, __c, __mask)); } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_min_ss (__m128 __A, __m128 __B) { - __v4sf a, b, c; - static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + __v4sf __a, __b, __c; + static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; /* PowerISA VSX does not allow partial (for just lower float) * results. So to insure we don't generate spurious exceptions * (from the upper float values) we splat the lower float * before we to the operation. */ - a = vec_splat ((__v4sf)__A, 0); - b = vec_splat ((__v4sf)__B, 0); - c = vec_min (a, b); + __a = vec_splat ((__v4sf)__A, 0); + __b = vec_splat ((__v4sf)__B, 0); + __c = vec_min (__a, __b); /* Then we merge the lower float result with the original upper * float elements from __A. */ - return (vec_sel ((__v4sf)__A, c, mask)); + return (vec_sel ((__v4sf)__A, __c, __mask)); } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_max_ss (__m128 __A, __m128 __B) { - __v4sf a, b, c; - static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + __v4sf __a, __b, __c; + static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; /* PowerISA VSX does not allow partial (for just lower float) * results. So to insure we don't generate spurious exceptions * (from the upper float values) we splat the lower float * before we to the operation. */ - a = vec_splat (__A, 0); - b = vec_splat (__B, 0); - c = vec_max (a, b); + __a = vec_splat (__A, 0); + __b = vec_splat (__B, 0); + __c = vec_max (__a, __b); /* Then we merge the lower float result with the original upper * float elements from __A. 
*/ - return (vec_sel ((__v4sf)__A, c, mask)); + return (vec_sel ((__v4sf)__A, __c, __mask)); } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_min_ps (__m128 __A, __m128 __B) { - __vector __bool int m = vec_cmpgt ((__v4sf) __B, (__v4sf) __A); - return vec_sel (__B, __A, m); + __vector __bool int __m = vec_cmpgt ((__v4sf) __B, (__v4sf) __A); + return vec_sel (__B, __A, __m); } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_max_ps (__m128 __A, __m128 __B) { - __vector __bool int m = vec_cmpgt ((__v4sf) __A, (__v4sf) __B); - return vec_sel (__B, __A, m); + __vector __bool int __m = vec_cmpgt ((__v4sf) __A, (__v4sf) __B); + return vec_sel (__B, __A, __m); } /* Perform logical bit-wise operations on 128-bit values. */ @@ -530,8 +530,8 @@ _mm_cmpge_ps (__m128 __A, __m128 __B) extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpneq_ps (__m128 __A, __m128 __B) { - __v4sf temp = (__v4sf ) vec_cmpeq ((__v4sf) __A, (__v4sf)__B); - return ((__m128)vec_nor (temp, temp)); + __v4sf __temp = (__v4sf ) vec_cmpeq ((__v4sf) __A, (__v4sf)__B); + return ((__m128)vec_nor (__temp, __temp)); } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -561,31 +561,31 @@ _mm_cmpnge_ps (__m128 __A, __m128 __B) extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpord_ps (__m128 __A, __m128 __B) { - __vector unsigned int a, b; - __vector unsigned int c, d; - static const __vector unsigned int float_exp_mask = + __vector unsigned int __a, __b; + __vector unsigned int __c, __d; + static const __vector unsigned int __float_exp_mask = { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 }; - a = (__vector unsigned int) vec_abs ((__v4sf)__A); - b = (__vector unsigned int) vec_abs ((__v4sf)__B); - c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a); - d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b); - return ((__m128 ) vec_and (c, d)); + __a = (__vector unsigned int) vec_abs ((__v4sf)__A); + __b = (__vector unsigned int) vec_abs ((__v4sf)__B); + __c = (__vector unsigned int) vec_cmpgt (__float_exp_mask, __a); + __d = (__vector unsigned int) vec_cmpgt (__float_exp_mask, __b); + return ((__m128 ) vec_and (__c, __d)); } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpunord_ps (__m128 __A, __m128 __B) { - __vector unsigned int a, b; - __vector unsigned int c, d; - static const __vector unsigned int float_exp_mask = + __vector unsigned int __a, __b; + __vector unsigned int __c, __d; + static const __vector unsigned int __float_exp_mask = { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 }; - a = (__vector unsigned int) vec_abs ((__v4sf)__A); - b = (__vector unsigned int) vec_abs ((__v4sf)__B); - c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask); - d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask); - return ((__m128 ) vec_or (c, d)); + __a = (__vector unsigned int) vec_abs ((__v4sf)__A); + __b = (__vector unsigned int) vec_abs ((__v4sf)__B); + __c = (__vector unsigned int) vec_cmpgt (__a, __float_exp_mask); + __d = (__vector unsigned int) vec_cmpgt (__b, __float_exp_mask); + return ((__m128 ) vec_or (__c, __d)); } /* Perform a comparison on the lower SPFP values of A and B. 
If the @@ -594,222 +594,222 @@ _mm_cmpunord_ps (__m128 __A, __m128 __B) extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpeq_ss (__m128 __A, __m128 __B) { - static const __vector unsigned int mask = + static const __vector unsigned int __mask = { 0xffffffff, 0, 0, 0 }; - __v4sf a, b, c; + __v4sf __a, __b, __c; /* PowerISA VMX does not allow partial (for just element 0) * results. So to insure we don't generate spurious exceptions * (from the upper elements) we splat the lower float * before we to the operation. */ - a = vec_splat ((__v4sf) __A, 0); - b = vec_splat ((__v4sf) __B, 0); - c = (__v4sf) vec_cmpeq(a, b); + __a = vec_splat ((__v4sf) __A, 0); + __b = vec_splat ((__v4sf) __B, 0); + __c = (__v4sf) vec_cmpeq (__a, __b); /* Then we merge the lower float result with the original upper * float elements from __A. */ - return ((__m128)vec_sel ((__v4sf)__A, c, mask)); + return ((__m128)vec_sel ((__v4sf)__A, __c, __mask)); } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmplt_ss (__m128 __A, __m128 __B) { - static const __vector unsigned int mask = + static const __vector unsigned int __mask = { 0xffffffff, 0, 0, 0 }; - __v4sf a, b, c; + __v4sf __a, __b, __c; /* PowerISA VMX does not allow partial (for just element 0) * results. So to insure we don't generate spurious exceptions * (from the upper elements) we splat the lower float * before we to the operation. */ - a = vec_splat ((__v4sf) __A, 0); - b = vec_splat ((__v4sf) __B, 0); - c = (__v4sf) vec_cmplt(a, b); + __a = vec_splat ((__v4sf) __A, 0); + __b = vec_splat ((__v4sf) __B, 0); + __c = (__v4sf) vec_cmplt(__a, __b); /* Then we merge the lower float result with the original upper * float elements from __A. */ - return ((__m128)vec_sel ((__v4sf)__A, c, mask)); + return ((__m128)vec_sel ((__v4sf)__A, __c, __mask)); } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmple_ss (__m128 __A, __m128 __B) { - static const __vector unsigned int mask = + static const __vector unsigned int __mask = { 0xffffffff, 0, 0, 0 }; - __v4sf a, b, c; + __v4sf __a, __b, __c; /* PowerISA VMX does not allow partial (for just element 0) * results. So to insure we don't generate spurious exceptions * (from the upper elements) we splat the lower float * before we to the operation. */ - a = vec_splat ((__v4sf) __A, 0); - b = vec_splat ((__v4sf) __B, 0); - c = (__v4sf) vec_cmple(a, b); + __a = vec_splat ((__v4sf) __A, 0); + __b = vec_splat ((__v4sf) __B, 0); + __c = (__v4sf) vec_cmple(__a, __b); /* Then we merge the lower float result with the original upper * float elements from __A. */ - return ((__m128)vec_sel ((__v4sf)__A, c, mask)); + return ((__m128)vec_sel ((__v4sf)__A, __c, __mask)); } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpgt_ss (__m128 __A, __m128 __B) { - static const __vector unsigned int mask = + static const __vector unsigned int __mask = { 0xffffffff, 0, 0, 0 }; - __v4sf a, b, c; + __v4sf __a, __b, __c; /* PowerISA VMX does not allow partial (for just element 0) * results. So to insure we don't generate spurious exceptions * (from the upper elements) we splat the lower float * before we to the operation. 
*/ - a = vec_splat ((__v4sf) __A, 0); - b = vec_splat ((__v4sf) __B, 0); - c = (__v4sf) vec_cmpgt(a, b); + __a = vec_splat ((__v4sf) __A, 0); + __b = vec_splat ((__v4sf) __B, 0); + __c = (__v4sf) vec_cmpgt(__a, __b); /* Then we merge the lower float result with the original upper * float elements from __A. */ - return ((__m128)vec_sel ((__v4sf)__A, c, mask)); + return ((__m128)vec_sel ((__v4sf)__A, __c, __mask)); } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpge_ss (__m128 __A, __m128 __B) { - static const __vector unsigned int mask = + static const __vector unsigned int __mask = { 0xffffffff, 0, 0, 0 }; - __v4sf a, b, c; + __v4sf __a, __b, __c; /* PowerISA VMX does not allow partial (for just element 0) * results. So to insure we don't generate spurious exceptions * (from the upper elements) we splat the lower float * before we to the operation. */ - a = vec_splat ((__v4sf) __A, 0); - b = vec_splat ((__v4sf) __B, 0); - c = (__v4sf) vec_cmpge(a, b); + __a = vec_splat ((__v4sf) __A, 0); + __b = vec_splat ((__v4sf) __B, 0); + __c = (__v4sf) vec_cmpge(__a, __b); /* Then we merge the lower float result with the original upper * float elements from __A. */ - return ((__m128)vec_sel ((__v4sf)__A, c, mask)); + return ((__m128)vec_sel ((__v4sf)__A, __c, __mask)); } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpneq_ss (__m128 __A, __m128 __B) { - static const __vector unsigned int mask = + static const __vector unsigned int __mask = { 0xffffffff, 0, 0, 0 }; - __v4sf a, b, c; + __v4sf __a, __b, __c; /* PowerISA VMX does not allow partial (for just element 0) * results. So to insure we don't generate spurious exceptions * (from the upper elements) we splat the lower float * before we to the operation. */ - a = vec_splat ((__v4sf) __A, 0); - b = vec_splat ((__v4sf) __B, 0); - c = (__v4sf) vec_cmpeq(a, b); - c = vec_nor (c, c); + __a = vec_splat ((__v4sf) __A, 0); + __b = vec_splat ((__v4sf) __B, 0); + __c = (__v4sf) vec_cmpeq(__a, __b); + __c = vec_nor (__c, __c); /* Then we merge the lower float result with the original upper * float elements from __A. */ - return ((__m128)vec_sel ((__v4sf)__A, c, mask)); + return ((__m128)vec_sel ((__v4sf)__A, __c, __mask)); } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpnlt_ss (__m128 __A, __m128 __B) { - static const __vector unsigned int mask = + static const __vector unsigned int __mask = { 0xffffffff, 0, 0, 0 }; - __v4sf a, b, c; + __v4sf __a, __b, __c; /* PowerISA VMX does not allow partial (for just element 0) * results. So to insure we don't generate spurious exceptions * (from the upper elements) we splat the lower float * before we to the operation. */ - a = vec_splat ((__v4sf) __A, 0); - b = vec_splat ((__v4sf) __B, 0); - c = (__v4sf) vec_cmpge(a, b); + __a = vec_splat ((__v4sf) __A, 0); + __b = vec_splat ((__v4sf) __B, 0); + __c = (__v4sf) vec_cmpge(__a, __b); /* Then we merge the lower float result with the original upper * float elements from __A. 
*/ - return ((__m128)vec_sel ((__v4sf)__A, c, mask)); + return ((__m128)vec_sel ((__v4sf)__A, __c, __mask)); } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpnle_ss (__m128 __A, __m128 __B) { - static const __vector unsigned int mask = + static const __vector unsigned int __mask = { 0xffffffff, 0, 0, 0 }; - __v4sf a, b, c; + __v4sf __a, __b, __c; /* PowerISA VMX does not allow partial (for just element 0) * results. So to insure we don't generate spurious exceptions * (from the upper elements) we splat the lower float * before we to the operation. */ - a = vec_splat ((__v4sf) __A, 0); - b = vec_splat ((__v4sf) __B, 0); - c = (__v4sf) vec_cmpgt(a, b); + __a = vec_splat ((__v4sf) __A, 0); + __b = vec_splat ((__v4sf) __B, 0); + __c = (__v4sf) vec_cmpgt(__a, __b); /* Then we merge the lower float result with the original upper * float elements from __A. */ - return ((__m128)vec_sel ((__v4sf)__A, c, mask)); + return ((__m128)vec_sel ((__v4sf)__A, __c, __mask)); } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpngt_ss (__m128 __A, __m128 __B) { - static const __vector unsigned int mask = + static const __vector unsigned int __mask = { 0xffffffff, 0, 0, 0 }; - __v4sf a, b, c; + __v4sf __a, __b, __c; /* PowerISA VMX does not allow partial (for just element 0) * results. So to insure we don't generate spurious exceptions * (from the upper elements) we splat the lower float * before we to the operation. */ - a = vec_splat ((__v4sf) __A, 0); - b = vec_splat ((__v4sf) __B, 0); - c = (__v4sf) vec_cmple(a, b); + __a = vec_splat ((__v4sf) __A, 0); + __b = vec_splat ((__v4sf) __B, 0); + __c = (__v4sf) vec_cmple(__a, __b); /* Then we merge the lower float result with the original upper * float elements from __A. */ - return ((__m128)vec_sel ((__v4sf)__A, c, mask)); + return ((__m128)vec_sel ((__v4sf)__A, __c, __mask)); } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpnge_ss (__m128 __A, __m128 __B) { - static const __vector unsigned int mask = + static const __vector unsigned int __mask = { 0xffffffff, 0, 0, 0 }; - __v4sf a, b, c; + __v4sf __a, __b, __c; /* PowerISA VMX does not allow partial (for just element 0) * results. So to insure we don't generate spurious exceptions * (from the upper elements) we splat the lower float * before we do the operation. */ - a = vec_splat ((__v4sf) __A, 0); - b = vec_splat ((__v4sf) __B, 0); - c = (__v4sf) vec_cmplt(a, b); + __a = vec_splat ((__v4sf) __A, 0); + __b = vec_splat ((__v4sf) __B, 0); + __c = (__v4sf) vec_cmplt(__a, __b); /* Then we merge the lower float result with the original upper * float elements from __A. 
*/ - return ((__m128)vec_sel ((__v4sf)__A, c, mask)); + return ((__m128)vec_sel ((__v4sf)__A, __c, __mask)); } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpord_ss (__m128 __A, __m128 __B) { - __vector unsigned int a, b; - __vector unsigned int c, d; - static const __vector unsigned int float_exp_mask = + __vector unsigned int __a, __b; + __vector unsigned int __c, __d; + static const __vector unsigned int __float_exp_mask = { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 }; - static const __vector unsigned int mask = + static const __vector unsigned int __mask = { 0xffffffff, 0, 0, 0 }; - a = (__vector unsigned int) vec_abs ((__v4sf)__A); - b = (__vector unsigned int) vec_abs ((__v4sf)__B); - c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a); - d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b); - c = vec_and (c, d); + __a = (__vector unsigned int) vec_abs ((__v4sf)__A); + __b = (__vector unsigned int) vec_abs ((__v4sf)__B); + __c = (__vector unsigned int) vec_cmpgt (__float_exp_mask, __a); + __d = (__vector unsigned int) vec_cmpgt (__float_exp_mask, __b); + __c = vec_and (__c, __d); /* Then we merge the lower float result with the original upper * float elements from __A. */ - return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask)); + return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)__c, __mask)); } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpunord_ss (__m128 __A, __m128 __B) { - __vector unsigned int a, b; - __vector unsigned int c, d; - static const __vector unsigned int float_exp_mask = + __vector unsigned int __a, __b; + __vector unsigned int __c, __d; + static const __vector unsigned int __float_exp_mask = { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 }; - static const __vector unsigned int mask = + static const __vector unsigned int __mask = { 0xffffffff, 0, 0, 0 }; - a = (__vector unsigned int) vec_abs ((__v4sf)__A); - b = (__vector unsigned int) vec_abs ((__v4sf)__B); - c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask); - d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask); - c = vec_or (c, d); + __a = (__vector unsigned int) vec_abs ((__v4sf)__A); + __b = (__vector unsigned int) vec_abs ((__v4sf)__B); + __c = (__vector unsigned int) vec_cmpgt (__a, __float_exp_mask); + __d = (__vector unsigned int) vec_cmpgt (__b, __float_exp_mask); + __c = vec_or (__c, __d); /* Then we merge the lower float result with the original upper * float elements from __A. 
*/ - return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask)); + return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)__c, __mask)); } /* Compare the lower SPFP values of A and B and return 1 if true @@ -905,9 +905,9 @@ _mm_cvtss_f32 (__m128 __A) extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtss_si32 (__m128 __A) { - int res; + int __res; #ifdef _ARCH_PWR8 - double dtmp; + double __dtmp; __asm__( #ifdef __LITTLE_ENDIAN__ "xxsldwi %x0,%x0,%x0,3;\n" @@ -916,13 +916,13 @@ _mm_cvtss_si32 (__m128 __A) "fctiw %2,%2;\n" "mfvsrd %1,%x2;\n" : "+wa" (__A), - "=r" (res), - "=f" (dtmp) + "=r" (__res), + "=f" (__dtmp) : ); #else - res = __builtin_rint(__A[0]); + __res = __builtin_rint(__A[0]); #endif - return (res); + return __res; } extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -938,9 +938,9 @@ _mm_cvt_ss2si (__m128 __A) extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtss_si64 (__m128 __A) { - long long res; + long long __res; #if defined (_ARCH_PWR8) && defined (__powerpc64__) - double dtmp; + double __dtmp; __asm__( #ifdef __LITTLE_ENDIAN__ "xxsldwi %x0,%x0,%x0,3;\n" @@ -949,13 +949,13 @@ _mm_cvtss_si64 (__m128 __A) "fctid %2,%2;\n" "mfvsrd %1,%x2;\n" : "+wa" (__A), - "=r" (res), - "=f" (dtmp) + "=r" (__res), + "=f" (__dtmp) : ); #else - res = __builtin_llrint(__A[0]); + __res = __builtin_llrint(__A[0]); #endif - return (res); + return __res; } /* Microsoft intrinsic. */ @@ -992,15 +992,15 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artifi _mm_cvtps_pi32 (__m128 __A) { /* Splat two lower SPFP values to both halves. */ - __v4sf temp, rounded; - __vector unsigned long long result; + __v4sf __temp, __rounded; + __vector unsigned long long __result; /* Splat two lower SPFP values to both halves. */ - temp = (__v4sf) vec_splat ((__vector long long)__A, 0); - rounded = vec_rint(temp); - result = (__vector unsigned long long) vec_cts (rounded, 0); + __temp = (__v4sf) vec_splat ((__vector long long)__A, 0); + __rounded = vec_rint (__temp); + __result = (__vector unsigned long long) vec_cts (__rounded, 0); - return (__m64) ((__vector long long) result)[0]; + return (__m64) ((__vector long long) __result)[0]; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1014,9 +1014,9 @@ extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artifici _mm_cvttss_si32 (__m128 __A) { /* Extract the lower float element. */ - float temp = __A[0]; + float __temp = __A[0]; /* truncate to 32-bit integer and return. */ - return temp; + return __temp; } extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1030,9 +1030,9 @@ extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __ar _mm_cvttss_si64 (__m128 __A) { /* Extract the lower float element. */ - float temp = __A[0]; + float __temp = __A[0]; /* truncate to 32-bit integer and return. */ - return temp; + return __temp; } /* Microsoft intrinsic. */ @@ -1040,9 +1040,9 @@ extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __ar _mm_cvttss_si64x (__m128 __A) { /* Extract the lower float element. */ - float temp = __A[0]; + float __temp = __A[0]; /* truncate to 32-bit integer and return. */ - return temp; + return __temp; } /* Truncate the two lower SPFP values to 32-bit integers. 
Return the @@ -1050,14 +1050,14 @@ _mm_cvttss_si64x (__m128 __A) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvttps_pi32 (__m128 __A) { - __v4sf temp; - __vector unsigned long long result; + __v4sf __temp; + __vector unsigned long long __result; /* Splat two lower SPFP values to both halves. */ - temp = (__v4sf) vec_splat ((__vector long long)__A, 0); - result = (__vector unsigned long long) vec_cts (temp, 0); + __temp = (__v4sf) vec_splat ((__vector long long)__A, 0); + __result = (__vector unsigned long long) vec_cts (__temp, 0); - return (__m64) ((__vector long long) result)[0]; + return (__m64) ((__vector long long) __result)[0]; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1070,8 +1070,8 @@ _mm_cvtt_ps2pi (__m128 __A) extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtsi32_ss (__m128 __A, int __B) { - float temp = __B; - __A[0] = temp; + float __temp = __B; + __A[0] = __temp; return __A; } @@ -1087,8 +1087,8 @@ _mm_cvt_si2ss (__m128 __A, int __B) extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtsi64_ss (__m128 __A, long long __B) { - float temp = __B; - __A[0] = temp; + float __temp = __B; + __A[0] = __temp; return __A; } @@ -1105,14 +1105,14 @@ _mm_cvtsi64x_ss (__m128 __A, long long __B) extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtpi32_ps (__m128 __A, __m64 __B) { - __vector signed int vm1; - __vector float vf1; + __vector signed int __vm1; + __vector float __vf1; - vm1 = (__vector signed int) (__vector unsigned long long) {__B, __B}; - vf1 = (__vector float) vec_ctf (vm1, 0); + __vm1 = (__vector signed int) (__vector unsigned long long) {__B, __B}; + __vf1 = (__vector float) vec_ctf (__vm1, 0); return ((__m128) (__vector unsigned long long) - { ((__vector unsigned long long)vf1) [0], + { ((__vector unsigned long long)__vf1) [0], ((__vector unsigned long long)__A) [1]}); } @@ -1126,54 +1126,54 @@ _mm_cvt_pi2ps (__m128 __A, __m64 __B) extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtpi16_ps (__m64 __A) { - __vector signed short vs8; - __vector signed int vi4; - __vector float vf1; + __vector signed short __vs8; + __vector signed int __vi4; + __vector float __vf1; - vs8 = (__vector signed short) (__vector unsigned long long) { __A, __A }; - vi4 = vec_vupklsh (vs8); - vf1 = (__vector float) vec_ctf (vi4, 0); + __vs8 = (__vector signed short) (__vector unsigned long long) { __A, __A }; + __vi4 = vec_vupklsh (__vs8); + __vf1 = (__vector float) vec_ctf (__vi4, 0); - return (__m128) vf1; + return (__m128) __vf1; } /* Convert the four unsigned 16-bit values in A to SPFP form. 
*/ extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtpu16_ps (__m64 __A) { - const __vector unsigned short zero = + const __vector unsigned short __zero = { 0, 0, 0, 0, 0, 0, 0, 0 }; - __vector unsigned short vs8; - __vector unsigned int vi4; - __vector float vf1; + __vector unsigned short __vs8; + __vector unsigned int __vi4; + __vector float __vf1; - vs8 = (__vector unsigned short) (__vector unsigned long long) { __A, __A }; - vi4 = (__vector unsigned int) vec_mergel + __vs8 = (__vector unsigned short) (__vector unsigned long long) { __A, __A }; + __vi4 = (__vector unsigned int) vec_mergel #ifdef __LITTLE_ENDIAN__ - (vs8, zero); + (__vs8, __zero); #else - (zero, vs8); + (__zero, __vs8); #endif - vf1 = (__vector float) vec_ctf (vi4, 0); + __vf1 = (__vector float) vec_ctf (__vi4, 0); - return (__m128) vf1; + return (__m128) __vf1; } /* Convert the low four signed 8-bit values in A to SPFP form. */ extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtpi8_ps (__m64 __A) { - __vector signed char vc16; - __vector signed short vs8; - __vector signed int vi4; - __vector float vf1; + __vector signed char __vc16; + __vector signed short __vs8; + __vector signed int __vi4; + __vector float __vf1; - vc16 = (__vector signed char) (__vector unsigned long long) { __A, __A }; - vs8 = vec_vupkhsb (vc16); - vi4 = vec_vupkhsh (vs8); - vf1 = (__vector float) vec_ctf (vi4, 0); + __vc16 = (__vector signed char) (__vector unsigned long long) { __A, __A }; + __vs8 = vec_vupkhsb (__vc16); + __vi4 = vec_vupkhsh (__vs8); + __vf1 = (__vector float) vec_ctf (__vi4, 0); - return (__m128) vf1; + return (__m128) __vf1; } /* Convert the low four unsigned 8-bit values in A to SPFP form. */ @@ -1181,70 +1181,70 @@ extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __art _mm_cvtpu8_ps (__m64 __A) { - const __vector unsigned char zero = + const __vector unsigned char __zero = { 0, 0, 0, 0, 0, 0, 0, 0 }; - __vector unsigned char vc16; - __vector unsigned short vs8; - __vector unsigned int vi4; - __vector float vf1; + __vector unsigned char __vc16; + __vector unsigned short __vs8; + __vector unsigned int __vi4; + __vector float __vf1; - vc16 = (__vector unsigned char) (__vector unsigned long long) { __A, __A }; + __vc16 = (__vector unsigned char) (__vector unsigned long long) { __A, __A }; #ifdef __LITTLE_ENDIAN__ - vs8 = (__vector unsigned short) vec_mergel (vc16, zero); - vi4 = (__vector unsigned int) vec_mergeh (vs8, - (__vector unsigned short) zero); + __vs8 = (__vector unsigned short) vec_mergel (__vc16, __zero); + __vi4 = (__vector unsigned int) vec_mergeh (__vs8, + (__vector unsigned short) __zero); #else - vs8 = (__vector unsigned short) vec_mergel (zero, vc16); - vi4 = (__vector unsigned int) vec_mergeh ((__vector unsigned short) zero, - vs8); + __vs8 = (__vector unsigned short) vec_mergel (__zero, __vc16); + __vi4 = (__vector unsigned int) vec_mergeh ((__vector unsigned short) __zero, + __vs8); #endif - vf1 = (__vector float) vec_ctf (vi4, 0); + __vf1 = (__vector float) vec_ctf (__vi4, 0); - return (__m128) vf1; + return (__m128) __vf1; } /* Convert the four signed 32-bit values in A and B to SPFP form. 
*/ extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtpi32x2_ps (__m64 __A, __m64 __B) { - __vector signed int vi4; - __vector float vf4; + __vector signed int __vi4; + __vector float __vf4; - vi4 = (__vector signed int) (__vector unsigned long long) { __A, __B }; - vf4 = (__vector float) vec_ctf (vi4, 0); - return (__m128) vf4; + __vi4 = (__vector signed int) (__vector unsigned long long) { __A, __B }; + __vf4 = (__vector float) vec_ctf (__vi4, 0); + return (__m128) __vf4; } /* Convert the four SPFP values in A to four signed 16-bit integers. */ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtps_pi16 (__m128 __A) { - __v4sf rounded; - __vector signed int temp; - __vector unsigned long long result; + __v4sf __rounded; + __vector signed int __temp; + __vector unsigned long long __result; - rounded = vec_rint(__A); - temp = vec_cts (rounded, 0); - result = (__vector unsigned long long) vec_pack (temp, temp); + __rounded = vec_rint(__A); + __temp = vec_cts (__rounded, 0); + __result = (__vector unsigned long long) vec_pack (__temp, __temp); - return (__m64) ((__vector long long) result)[0]; + return (__m64) ((__vector long long) __result)[0]; } /* Convert the four SPFP values in A to four signed 8-bit integers. */ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtps_pi8 (__m128 __A) { - __v4sf rounded; - __vector signed int tmp_i; - static const __vector signed int zero = {0, 0, 0, 0}; - __vector signed short tmp_s; - __vector signed char res_v; + __v4sf __rounded; + __vector signed int __tmp_i; + static const __vector signed int __zero = {0, 0, 0, 0}; + __vector signed short __tmp_s; + __vector signed char __res_v; - rounded = vec_rint(__A); - tmp_i = vec_cts (rounded, 0); - tmp_s = vec_pack (tmp_i, zero); - res_v = vec_pack (tmp_s, tmp_s); - return (__m64) ((__vector long long) res_v)[0]; + __rounded = vec_rint(__A); + __tmp_i = vec_cts (__rounded, 0); + __tmp_s = vec_pack (__tmp_i, __zero); + __res_v = vec_pack (__tmp_s, __tmp_s); + return (__m64) ((__vector long long) __res_v)[0]; } /* Selects four specific SPFP values from A and B based on MASK. 
*/ @@ -1252,11 +1252,11 @@ extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __art _mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask) { - unsigned long element_selector_10 = __mask & 0x03; - unsigned long element_selector_32 = (__mask >> 2) & 0x03; - unsigned long element_selector_54 = (__mask >> 4) & 0x03; - unsigned long element_selector_76 = (__mask >> 6) & 0x03; - static const unsigned int permute_selectors[4] = + unsigned long __element_selector_10 = __mask & 0x03; + unsigned long __element_selector_32 = (__mask >> 2) & 0x03; + unsigned long __element_selector_54 = (__mask >> 4) & 0x03; + unsigned long __element_selector_76 = (__mask >> 6) & 0x03; + static const unsigned int __permute_selectors[4] = { #ifdef __LITTLE_ENDIAN__ 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C @@ -1264,13 +1264,13 @@ _mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask) 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F #endif }; - __vector unsigned int t; + __vector unsigned int __t; - t[0] = permute_selectors[element_selector_10]; - t[1] = permute_selectors[element_selector_32]; - t[2] = permute_selectors[element_selector_54] + 0x10101010; - t[3] = permute_selectors[element_selector_76] + 0x10101010; - return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)t); + __t[0] = __permute_selectors[__element_selector_10]; + __t[1] = __permute_selectors[__element_selector_32]; + __t[2] = __permute_selectors[__element_selector_54] + 0x10101010; + __t[3] = __permute_selectors[__element_selector_76] + 0x10101010; + return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)__t); } /* Selects and interleaves the upper two SPFP values from A and B. */ @@ -1355,8 +1355,8 @@ _mm_movemask_ps (__m128 __A) #ifdef _ARCH_PWR10 return vec_extractm ((__vector unsigned int) __A); #else - __vector unsigned long long result; - static const __vector unsigned int perm_mask = + __vector unsigned long long __result; + static const __vector unsigned int __perm_mask = { #ifdef __LITTLE_ENDIAN__ 0x00204060, 0x80808080, 0x80808080, 0x80808080 @@ -1365,14 +1365,14 @@ _mm_movemask_ps (__m128 __A) #endif }; - result = ((__vector unsigned long long) + __result = ((__vector unsigned long long) vec_vbpermq ((__vector unsigned char) __A, - (__vector unsigned char) perm_mask)); + (__vector unsigned char) __perm_mask)); #ifdef __LITTLE_ENDIAN__ - return result[1]; + return __result[1]; #else - return result[0]; + return __result[0]; #endif #endif /* !_ARCH_PWR10 */ } @@ -1395,12 +1395,12 @@ _mm_load_ps1 (float const *__P) extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_extract_pi16 (__m64 const __A, int const __N) { - unsigned int shiftr = __N & 3; + unsigned int __shiftr = __N & 3; #ifdef __BIG_ENDIAN__ - shiftr = 3 - shiftr; + __shiftr = 3 - __shiftr; #endif - return ((__A >> (shiftr * 16)) & 0xffff); + return ((__A >> (__shiftr * 16)) & 0xffff); } extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1414,12 +1414,12 @@ _m_pextrw (__m64 const __A, int const __N) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_insert_pi16 (__m64 const __A, int const __D, int const __N) { - const int shiftl = (__N & 3) * 16; - const __m64 shiftD = (const __m64) __D << shiftl; - const __m64 mask = 0xffffUL << shiftl; - __m64 result = (__A & (~mask)) | (shiftD & mask); + const int __shiftl = (__N & 3) * 16; + const __m64 __shiftD = (const __m64) __D << __shiftl; + const __m64 __mask = 
0xffffUL << __shiftl; + __m64 __result = (__A & (~__mask)) | (__shiftD & __mask); - return (result); + return __result; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1434,30 +1434,30 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artifi _mm_max_pi16 (__m64 __A, __m64 __B) { #if _ARCH_PWR8 - __vector signed short a, b, r; - __vector __bool short c; - - a = (__vector signed short)vec_splats (__A); - b = (__vector signed short)vec_splats (__B); - c = (__vector __bool short)vec_cmpgt (a, b); - r = vec_sel (b, a, c); - return (__m64) ((__vector long long) r)[0]; + __vector signed short __a, __b, __r; + __vector __bool short __c; + + __a = (__vector signed short)vec_splats (__A); + __b = (__vector signed short)vec_splats (__B); + __c = (__vector __bool short)vec_cmpgt (__a, __b); + __r = vec_sel (__b, __a, __c); + return (__m64) ((__vector long long) __r)[0]; #else - __m64_union m1, m2, res; + __m64_union __m1, __m2, __res; - m1.as_m64 = __A; - m2.as_m64 = __B; + __m1.as_m64 = __A; + __m2.as_m64 = __B; - res.as_short[0] = - (m1.as_short[0] > m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0]; - res.as_short[1] = - (m1.as_short[1] > m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1]; - res.as_short[2] = - (m1.as_short[2] > m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2]; - res.as_short[3] = - (m1.as_short[3] > m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3]; + __res.as_short[0] = + (__m1.as_short[0] > __m2.as_short[0]) ? __m1.as_short[0] : __m2.as_short[0]; + __res.as_short[1] = + (__m1.as_short[1] > __m2.as_short[1]) ? __m1.as_short[1] : __m2.as_short[1]; + __res.as_short[2] = + (__m1.as_short[2] > __m2.as_short[2]) ? __m1.as_short[2] : __m2.as_short[2]; + __res.as_short[3] = + (__m1.as_short[3] > __m2.as_short[3]) ? __m1.as_short[3] : __m2.as_short[3]; - return (__m64) res.as_m64; + return (__m64) __res.as_m64; #endif } @@ -1472,28 +1472,27 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artifi _mm_max_pu8 (__m64 __A, __m64 __B) { #if _ARCH_PWR8 - __vector unsigned char a, b, r; - __vector __bool char c; - - a = (__vector unsigned char)vec_splats (__A); - b = (__vector unsigned char)vec_splats (__B); - c = (__vector __bool char)vec_cmpgt (a, b); - r = vec_sel (b, a, c); - return (__m64) ((__vector long long) r)[0]; + __vector unsigned char __a, __b, __r; + __vector __bool char __c; + + __a = (__vector unsigned char)vec_splats (__A); + __b = (__vector unsigned char)vec_splats (__B); + __c = (__vector __bool char)vec_cmpgt (__a, __b); + __r = vec_sel (__b, __a, __c); + return (__m64) ((__vector long long) __r)[0]; #else - __m64_union m1, m2, res; - long i; + __m64_union __m1, __m2, __res; + long __i; - m1.as_m64 = __A; - m2.as_m64 = __B; + __m1.as_m64 = __A; + __m2.as_m64 = __B; + for (__i = 0; __i < 8; __i++) + __res.as_char[__i] = + ((unsigned char) __m1.as_char[__i] > (unsigned char) __m2.as_char[__i]) ? + __m1.as_char[__i] : __m2.as_char[__i]; - for (i = 0; i < 8; i++) - res.as_char[i] = - ((unsigned char) m1.as_char[i] > (unsigned char) m2.as_char[i]) ? 
- m1.as_char[i] : m2.as_char[i]; - - return (__m64) res.as_m64; + return (__m64) __res.as_m64; #endif } @@ -1508,30 +1507,30 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artifi _mm_min_pi16 (__m64 __A, __m64 __B) { #if _ARCH_PWR8 - __vector signed short a, b, r; - __vector __bool short c; - - a = (__vector signed short)vec_splats (__A); - b = (__vector signed short)vec_splats (__B); - c = (__vector __bool short)vec_cmplt (a, b); - r = vec_sel (b, a, c); - return (__m64) ((__vector long long) r)[0]; + __vector signed short __a, __b, __r; + __vector __bool short __c; + + __a = (__vector signed short)vec_splats (__A); + __b = (__vector signed short)vec_splats (__B); + __c = (__vector __bool short)vec_cmplt (__a, __b); + __r = vec_sel (__b, __a, __c); + return (__m64) ((__vector long long) __r)[0]; #else - __m64_union m1, m2, res; + __m64_union __m1, __m2, __res; - m1.as_m64 = __A; - m2.as_m64 = __B; + __m1.as_m64 = __A; + __m2.as_m64 = __B; - res.as_short[0] = - (m1.as_short[0] < m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0]; - res.as_short[1] = - (m1.as_short[1] < m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1]; - res.as_short[2] = - (m1.as_short[2] < m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2]; - res.as_short[3] = - (m1.as_short[3] < m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3]; + __res.as_short[0] = + (__m1.as_short[0] < __m2.as_short[0]) ? __m1.as_short[0] : __m2.as_short[0]; + __res.as_short[1] = + (__m1.as_short[1] < __m2.as_short[1]) ? __m1.as_short[1] : __m2.as_short[1]; + __res.as_short[2] = + (__m1.as_short[2] < __m2.as_short[2]) ? __m1.as_short[2] : __m2.as_short[2]; + __res.as_short[3] = + (__m1.as_short[3] < __m2.as_short[3]) ? __m1.as_short[3] : __m2.as_short[3]; - return (__m64) res.as_m64; + return (__m64) __res.as_m64; #endif } @@ -1546,28 +1545,28 @@ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artifi _mm_min_pu8 (__m64 __A, __m64 __B) { #if _ARCH_PWR8 - __vector unsigned char a, b, r; - __vector __bool char c; - - a = (__vector unsigned char)vec_splats (__A); - b = (__vector unsigned char)vec_splats (__B); - c = (__vector __bool char)vec_cmplt (a, b); - r = vec_sel (b, a, c); - return (__m64) ((__vector long long) r)[0]; + __vector unsigned char __a, __b, __r; + __vector __bool char __c; + + __a = (__vector unsigned char)vec_splats (__A); + __b = (__vector unsigned char)vec_splats (__B); + __c = (__vector __bool char)vec_cmplt (__a, __b); + __r = vec_sel (__b, __a, __c); + return (__m64) ((__vector long long) __r)[0]; #else - __m64_union m1, m2, res; - long i; + __m64_union __m1, __m2, __res; + long __i; - m1.as_m64 = __A; - m2.as_m64 = __B; + __m1.as_m64 = __A; + __m2.as_m64 = __B; - for (i = 0; i < 8; i++) - res.as_char[i] = - ((unsigned char) m1.as_char[i] < (unsigned char) m2.as_char[i]) ? - m1.as_char[i] : m2.as_char[i]; + for (__i = 0; __i < 8; __i++) + __res.as_char[__i] = + ((unsigned char) __m1.as_char[__i] < (unsigned char) __m2.as_char[__i]) ? 
+ __m1.as_char[__i] : __m2.as_char[__i]; - return (__m64) res.as_m64; + return (__m64) __res.as_m64; #endif } @@ -1582,24 +1581,24 @@ extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artifici _mm_movemask_pi8 (__m64 __A) { #ifdef __powerpc64__ - unsigned long long p = + unsigned long long __p = #ifdef __LITTLE_ENDIAN__ 0x0008101820283038UL; // permute control for sign bits #else 0x3830282018100800UL; // permute control for sign bits #endif - return __builtin_bpermd (p, __A); + return __builtin_bpermd (__p, __A); #else #ifdef __LITTLE_ENDIAN__ - unsigned int mask = 0x20283038UL; - unsigned int r1 = __builtin_bpermd (mask, __A) & 0xf; - unsigned int r2 = __builtin_bpermd (mask, __A >> 32) & 0xf; + unsigned int __mask = 0x20283038UL; + unsigned int __r1 = __builtin_bpermd (__mask, __A) & 0xf; + unsigned int __r2 = __builtin_bpermd (__mask, __A >> 32) & 0xf; #else - unsigned int mask = 0x38302820UL; - unsigned int r1 = __builtin_bpermd (mask, __A >> 32) & 0xf; - unsigned int r2 = __builtin_bpermd (mask, __A) & 0xf; + unsigned int __mask = 0x38302820UL; + unsigned int __r1 = __builtin_bpermd (__mask, __A >> 32) & 0xf; + unsigned int __r2 = __builtin_bpermd (__mask, __A) & 0xf; #endif - return (r2 << 4) | r1; + return (__r2 << 4) | __r1; #endif } @@ -1614,10 +1613,10 @@ _m_pmovmskb (__m64 __A) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mulhi_pu16 (__m64 __A, __m64 __B) { - __vector unsigned short a, b; - __vector unsigned short c; - __vector unsigned int w0, w1; - __vector unsigned char xform1 = { + __vector unsigned short __a, __b; + __vector unsigned short __c; + __vector unsigned int __w0, __w1; + __vector unsigned char __xform1 = { #ifdef __LITTLE_ENDIAN__ 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F @@ -1627,14 +1626,14 @@ _mm_mulhi_pu16 (__m64 __A, __m64 __B) #endif }; - a = (__vector unsigned short)vec_splats (__A); - b = (__vector unsigned short)vec_splats (__B); + __a = (__vector unsigned short)vec_splats (__A); + __b = (__vector unsigned short)vec_splats (__B); - w0 = vec_vmuleuh (a, b); - w1 = vec_vmulouh (a, b); - c = (__vector unsigned short)vec_perm (w0, w1, xform1); + __w0 = vec_vmuleuh (__a, __b); + __w1 = vec_vmulouh (__a, __b); + __c = (__vector unsigned short)vec_perm (__w0, __w1, __xform1); - return (__m64) ((__vector long long) c)[0]; + return (__m64) ((__vector long long) __c)[0]; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1648,11 +1647,11 @@ _m_pmulhuw (__m64 __A, __m64 __B) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_shuffle_pi16 (__m64 __A, int const __N) { - unsigned long element_selector_10 = __N & 0x03; - unsigned long element_selector_32 = (__N >> 2) & 0x03; - unsigned long element_selector_54 = (__N >> 4) & 0x03; - unsigned long element_selector_76 = (__N >> 6) & 0x03; - static const unsigned short permute_selectors[4] = + unsigned long __element_selector_10 = __N & 0x03; + unsigned long __element_selector_32 = (__N >> 2) & 0x03; + unsigned long __element_selector_54 = (__N >> 4) & 0x03; + unsigned long __element_selector_76 = (__N >> 6) & 0x03; + static const unsigned short __permute_selectors[4] = { #ifdef __LITTLE_ENDIAN__ 0x0908, 0x0B0A, 0x0D0C, 0x0F0E @@ -1660,24 +1659,24 @@ _mm_shuffle_pi16 (__m64 __A, int const __N) 0x0607, 0x0405, 0x0203, 0x0001 #endif }; - __m64_union t; - __vector unsigned long long a, p, r; + __m64_union __t; + 
__vector unsigned long long __a, __p, __r; #ifdef __LITTLE_ENDIAN__ - t.as_short[0] = permute_selectors[element_selector_10]; - t.as_short[1] = permute_selectors[element_selector_32]; - t.as_short[2] = permute_selectors[element_selector_54]; - t.as_short[3] = permute_selectors[element_selector_76]; + __t.as_short[0] = __permute_selectors[__element_selector_10]; + __t.as_short[1] = __permute_selectors[__element_selector_32]; + __t.as_short[2] = __permute_selectors[__element_selector_54]; + __t.as_short[3] = __permute_selectors[__element_selector_76]; #else - t.as_short[3] = permute_selectors[element_selector_10]; - t.as_short[2] = permute_selectors[element_selector_32]; - t.as_short[1] = permute_selectors[element_selector_54]; - t.as_short[0] = permute_selectors[element_selector_76]; + __t.as_short[3] = __permute_selectors[__element_selector_10]; + __t.as_short[2] = __permute_selectors[__element_selector_32]; + __t.as_short[1] = __permute_selectors[__element_selector_54]; + __t.as_short[0] = __permute_selectors[__element_selector_76]; #endif - p = vec_splats (t.as_m64); - a = vec_splats (__A); - r = vec_perm (a, a, (__vector unsigned char)p); - return (__m64) ((__vector long long) r)[0]; + __p = vec_splats (__t.as_m64); + __a = vec_splats (__A); + __r = vec_perm (__a, __a, (__vector unsigned char)__p); + return (__m64) ((__vector long long) __r)[0]; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1692,14 +1691,14 @@ _m_pshufw (__m64 __A, int const __N) extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P) { - __m64 hibit = 0x8080808080808080UL; - __m64 mask, tmp; - __m64 *p = (__m64*)__P; + __m64 __hibit = 0x8080808080808080UL; + __m64 __mask, __tmp; + __m64 *__p = (__m64*)__P; - tmp = *p; - mask = _mm_cmpeq_pi8 ((__N & hibit), hibit); - tmp = (tmp & (~mask)) | (__A & mask); - *p = tmp; + __tmp = *__p; + __mask = _mm_cmpeq_pi8 ((__N & __hibit), __hibit); + __tmp = (__tmp & (~__mask)) | (__A & __mask); + *__p = __tmp; } extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1712,12 +1711,12 @@ _m_maskmovq (__m64 __A, __m64 __N, char *__P) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_avg_pu8 (__m64 __A, __m64 __B) { - __vector unsigned char a, b, c; + __vector unsigned char __a, __b, __c; - a = (__vector unsigned char)vec_splats (__A); - b = (__vector unsigned char)vec_splats (__B); - c = vec_avg (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector unsigned char)vec_splats (__A); + __b = (__vector unsigned char)vec_splats (__B); + __c = vec_avg (__a, __b); + return (__m64) ((__vector long long) __c)[0]; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1730,12 +1729,12 @@ _m_pavgb (__m64 __A, __m64 __B) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_avg_pu16 (__m64 __A, __m64 __B) { - __vector unsigned short a, b, c; + __vector unsigned short __a, __b, __c; - a = (__vector unsigned short)vec_splats (__A); - b = (__vector unsigned short)vec_splats (__B); - c = vec_avg (a, b); - return (__m64) ((__vector long long) c)[0]; + __a = (__vector unsigned short)vec_splats (__A); + __b = (__vector unsigned short)vec_splats (__B); + __c = vec_avg (__a, __b); + return (__m64) ((__vector long long) __c)[0]; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) @@ -1750,26 +1749,26 @@ _m_pavgw (__m64 __A, __m64 __B) extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sad_pu8 (__m64 __A, __m64 __B) { - __vector unsigned char a, b; - __vector unsigned char vmin, vmax, vabsdiff; - __vector signed int vsum; - const __vector unsigned int zero = + __vector unsigned char __a, __b; + __vector unsigned char __vmin, __vmax, __vabsdiff; + __vector signed int __vsum; + const __vector unsigned int __zero = { 0, 0, 0, 0 }; - __m64_union result = {0}; + __m64_union __result = {0}; - a = (__vector unsigned char) (__vector unsigned long long) { 0UL, __A }; - b = (__vector unsigned char) (__vector unsigned long long) { 0UL, __B }; - vmin = vec_min (a, b); - vmax = vec_max (a, b); - vabsdiff = vec_sub (vmax, vmin); + __a = (__vector unsigned char) (__vector unsigned long long) { 0UL, __A }; + __b = (__vector unsigned char) (__vector unsigned long long) { 0UL, __B }; + __vmin = vec_min (__a, __b); + __vmax = vec_max (__a, __b); + __vabsdiff = vec_sub (__vmax, __vmin); /* Sum four groups of bytes into integers. */ - vsum = (__vector signed int) vec_sum4s (vabsdiff, zero); + __vsum = (__vector signed int) vec_sum4s (__vabsdiff, __zero); /* Sum across four integers with integer result. */ - vsum = vec_sums (vsum, (__vector signed int) zero); + __vsum = vec_sums (__vsum, (__vector signed int) __zero); /* The sum is in the right most 32-bits of the vector result. Transfer to a GPR and truncate to 16 bits. */ - result.as_short[0] = vsum[3]; - return result.as_m64; + __result.as_short[0] = __vsum[3]; + return __result.as_m64; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -- cgit v1.1 From fe79d652c96b53384ddfa43e312cb0010251391b Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Thu, 17 Feb 2022 14:40:16 +0100 Subject: target/104581 - compile-time regression in mode-switching The x86 backend piggy-backs on mode-switching for insertion of vzeroupper. A recent improvement there was implemented in a way to walk possibly the whole basic-block for all DF reg def definitions in its mode_needed hook which is called for each instruction in a basic-block during mode-switching local analysis. The following mostly reverts this improvement. It needs to be re-done in a way more consistent with a local dataflow which probably means making targets aware of the state of the local dataflow analysis. 2022-02-17 Richard Biener PR target/104581 * config/i386/i386.cc (ix86_avx_u128_mode_source): Remove. (ix86_avx_u128_mode_needed): Return AVX_U128_DIRTY instead of calling ix86_avx_u128_mode_source which would eventually have returned AVX_U128_ANY in some very special case. * gcc.target/i386/pr101456-1.c: XFAIL. --- gcc/config/i386/i386.cc | 78 ++----------------------------------------------- 1 file changed, 3 insertions(+), 75 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index cf246e7..e4b42fb 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -14377,80 +14377,12 @@ ix86_check_avx_upper_register (const_rtx exp) static void ix86_check_avx_upper_stores (rtx dest, const_rtx, void *data) - { - if (ix86_check_avx_upper_register (dest)) +{ + if (ix86_check_avx_upper_register (dest)) { bool *used = (bool *) data; *used = true; } - } - -/* For YMM/ZMM store or YMM/ZMM extract. Return mode for the source - operand of SRC DEFs in the same basic block before INSN. 
*/ - -static int -ix86_avx_u128_mode_source (rtx_insn *insn, const_rtx src) -{ - basic_block bb = BLOCK_FOR_INSN (insn); - rtx_insn *end = BB_END (bb); - - /* Return AVX_U128_DIRTY if there is no DEF in the same basic - block. */ - int status = AVX_U128_DIRTY; - - for (df_ref def = DF_REG_DEF_CHAIN (REGNO (src)); - def; def = DF_REF_NEXT_REG (def)) - if (DF_REF_BB (def) == bb) - { - /* Ignore DEF from different basic blocks. */ - rtx_insn *def_insn = DF_REF_INSN (def); - - /* Check if DEF_INSN is before INSN. */ - rtx_insn *next; - for (next = NEXT_INSN (def_insn); - next != nullptr && next != end && next != insn; - next = NEXT_INSN (next)) - ; - - /* Skip if DEF_INSN isn't before INSN. */ - if (next != insn) - continue; - - /* Return AVX_U128_DIRTY if the source operand of DEF_INSN - isn't constant zero. */ - - if (CALL_P (def_insn)) - { - bool avx_upper_reg_found = false; - note_stores (def_insn, - ix86_check_avx_upper_stores, - &avx_upper_reg_found); - - /* Return AVX_U128_DIRTY if call returns AVX. */ - if (avx_upper_reg_found) - return AVX_U128_DIRTY; - - continue; - } - - rtx set = single_set (def_insn); - if (!set) - return AVX_U128_DIRTY; - - rtx dest = SET_DEST (set); - - /* Skip if DEF_INSN is not an AVX load. Return AVX_U128_DIRTY - if the source operand isn't constant zero. */ - if (ix86_check_avx_upper_register (dest) - && standard_sse_constant_p (SET_SRC (set), - GET_MODE (dest)) != 1) - return AVX_U128_DIRTY; - - /* We get here only if all AVX loads are from constant zero. */ - status = AVX_U128_ANY; - } - - return status; } /* Return needed mode for entity in optimize_mode_switching pass. */ @@ -14520,11 +14452,7 @@ ix86_avx_u128_mode_needed (rtx_insn *insn) { FOR_EACH_SUBRTX (iter, array, src, NONCONST) if (ix86_check_avx_upper_register (*iter)) - { - int status = ix86_avx_u128_mode_source (insn, *iter); - if (status == AVX_U128_DIRTY) - return status; - } + return AVX_U128_DIRTY; } /* This isn't YMM/ZMM load/store. */ -- cgit v1.1 From df5ed150ee5fbcb8255e05eed978c4af2b3d9bcc Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Fri, 18 Feb 2022 17:21:43 +0100 Subject: rs6000: Fix up posix_memalign call in _mm_malloc [PR104598] The uglification changes went one spot too far and also uglified the name of the function itself; posix_memalign should be called by that name, not replaced with a non-existent function. 2022-02-18 Jakub Jelinek PR target/104257 PR target/104598 * config/rs6000/mm_malloc.h (_mm_malloc): Call posix_memalign rather than __posix_memalign. --- gcc/config/rs6000/mm_malloc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/mm_malloc.h b/gcc/config/rs6000/mm_malloc.h index 721f756..ae47cac 100644 --- a/gcc/config/rs6000/mm_malloc.h +++ b/gcc/config/rs6000/mm_malloc.h @@ -47,7 +47,7 @@ _mm_malloc (size_t __size, size_t __alignment) return malloc (__size); if (__alignment < __vec_align) __alignment = __vec_align; - if (__posix_memalign (&__ptr, __alignment, __size) == 0) + if (posix_memalign (&__ptr, __alignment, __size) == 0) return __ptr; else return NULL; -- cgit v1.1 From 4984f882f41be1472df6ce7c439c98c4bc4e6f08 Mon Sep 17 00:00:00 2001 From: Pat Haugen Date: Fri, 18 Feb 2022 15:38:23 -0600 Subject: Mark Power10 fusion option undocumented and remove sub-options. gcc/ * config/rs6000/rs6000.opt (mpower10-fusion): Mark Undocumented. (mpower10-fusion-ld-cmpi, mpower10-fusion-2logical, mpower10-fusion-logical-add, mpower10-fusion-add-logical, mpower10-fusion-2add, mpower10-fusion-2store): Remove.
* config/rs6000/rs6000-cpus.def (ISA_3_1_MASKS_SERVER, OTHER_P9_VECTOR_MASKS): Remove Power10 fusion sub-options. * config/rs6000/rs6000.cc (rs6000_option_override_internal, power10_sched_reorder): Likewise. * config/rs6000/genfusion.pl (gen_ld_cmpi_p10, gen_logical_addsubf, gen_addadd): Likewise * config/rs6000/fusion.md: Regenerate. --- gcc/config/rs6000/fusion.md | 332 +++++++++++++++++++------------------- gcc/config/rs6000/genfusion.pl | 13 +- gcc/config/rs6000/rs6000-cpus.def | 14 +- gcc/config/rs6000/rs6000.cc | 27 +--- gcc/config/rs6000/rs6000.opt | 26 +-- 5 files changed, 174 insertions(+), 238 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/fusion.md b/gcc/config/rs6000/fusion.md index 6f9f534..15f0c16 100644 --- a/gcc/config/rs6000/fusion.md +++ b/gcc/config/rs6000/fusion.md @@ -25,7 +25,7 @@ (compare:CC (match_operand:DI 1 "ds_form_mem_operand" "m") (match_operand:DI 3 "const_m1_to_1_operand" "n"))) (clobber (match_scratch:DI 0 "=r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LD_CMPI)" + "(TARGET_P10_FUSION)" "ld%X1 %0,%1\;cmpdi %2,%0,%3" "&& reload_completed && (cc_reg_not_cr0_operand (operands[2], CCmode) @@ -46,7 +46,7 @@ (compare:CCUNS (match_operand:DI 1 "ds_form_mem_operand" "m") (match_operand:DI 3 "const_0_to_1_operand" "n"))) (clobber (match_scratch:DI 0 "=r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LD_CMPI)" + "(TARGET_P10_FUSION)" "ld%X1 %0,%1\;cmpldi %2,%0,%3" "&& reload_completed && (cc_reg_not_cr0_operand (operands[2], CCmode) @@ -67,7 +67,7 @@ (compare:CC (match_operand:DI 1 "ds_form_mem_operand" "m") (match_operand:DI 3 "const_m1_to_1_operand" "n"))) (set (match_operand:DI 0 "gpc_reg_operand" "=r") (match_dup 1))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LD_CMPI)" + "(TARGET_P10_FUSION)" "ld%X1 %0,%1\;cmpdi %2,%0,%3" "&& reload_completed && (cc_reg_not_cr0_operand (operands[2], CCmode) @@ -88,7 +88,7 @@ (compare:CCUNS (match_operand:DI 1 "ds_form_mem_operand" "m") (match_operand:DI 3 "const_0_to_1_operand" "n"))) (set (match_operand:DI 0 "gpc_reg_operand" "=r") (match_dup 1))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LD_CMPI)" + "(TARGET_P10_FUSION)" "ld%X1 %0,%1\;cmpldi %2,%0,%3" "&& reload_completed && (cc_reg_not_cr0_operand (operands[2], CCmode) @@ -109,7 +109,7 @@ (compare:CC (match_operand:SI 1 "ds_form_mem_operand" "m") (match_operand:SI 3 "const_m1_to_1_operand" "n"))) (clobber (match_scratch:SI 0 "=r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LD_CMPI)" + "(TARGET_P10_FUSION)" "lwa%X1 %0,%1\;cmpdi %2,%0,%3" "&& reload_completed && (cc_reg_not_cr0_operand (operands[2], CCmode) @@ -130,7 +130,7 @@ (compare:CCUNS (match_operand:SI 1 "non_update_memory_operand" "m") (match_operand:SI 3 "const_0_to_1_operand" "n"))) (clobber (match_scratch:SI 0 "=r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LD_CMPI)" + "(TARGET_P10_FUSION)" "lwz%X1 %0,%1\;cmpldi %2,%0,%3" "&& reload_completed && (cc_reg_not_cr0_operand (operands[2], CCmode) @@ -151,7 +151,7 @@ (compare:CC (match_operand:SI 1 "ds_form_mem_operand" "m") (match_operand:SI 3 "const_m1_to_1_operand" "n"))) (set (match_operand:SI 0 "gpc_reg_operand" "=r") (match_dup 1))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LD_CMPI)" + "(TARGET_P10_FUSION)" "lwa%X1 %0,%1\;cmpdi %2,%0,%3" "&& reload_completed && (cc_reg_not_cr0_operand (operands[2], CCmode) @@ -172,7 +172,7 @@ (compare:CCUNS (match_operand:SI 1 "non_update_memory_operand" "m") (match_operand:SI 3 "const_0_to_1_operand" "n"))) (set (match_operand:SI 0 "gpc_reg_operand" "=r") (match_dup 1))] - "(TARGET_P10_FUSION && 
TARGET_P10_FUSION_LD_CMPI)" + "(TARGET_P10_FUSION)" "lwz%X1 %0,%1\;cmpldi %2,%0,%3" "&& reload_completed && (cc_reg_not_cr0_operand (operands[2], CCmode) @@ -193,7 +193,7 @@ (compare:CC (match_operand:SI 1 "ds_form_mem_operand" "m") (match_operand:SI 3 "const_m1_to_1_operand" "n"))) (set (match_operand:EXTSI 0 "gpc_reg_operand" "=r") (sign_extend:EXTSI (match_dup 1)))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LD_CMPI)" + "(TARGET_P10_FUSION)" "lwa%X1 %0,%1\;cmpdi %2,%0,%3" "&& reload_completed && (cc_reg_not_cr0_operand (operands[2], CCmode) @@ -214,7 +214,7 @@ (compare:CCUNS (match_operand:SI 1 "non_update_memory_operand" "m") (match_operand:SI 3 "const_0_to_1_operand" "n"))) (set (match_operand:EXTSI 0 "gpc_reg_operand" "=r") (zero_extend:EXTSI (match_dup 1)))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LD_CMPI)" + "(TARGET_P10_FUSION)" "lwz%X1 %0,%1\;cmpldi %2,%0,%3" "&& reload_completed && (cc_reg_not_cr0_operand (operands[2], CCmode) @@ -235,7 +235,7 @@ (compare:CC (match_operand:HI 1 "non_update_memory_operand" "m") (match_operand:HI 3 "const_m1_to_1_operand" "n"))) (clobber (match_scratch:GPR 0 "=r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LD_CMPI)" + "(TARGET_P10_FUSION)" "lha%X1 %0,%1\;cmpdi %2,%0,%3" "&& reload_completed && (cc_reg_not_cr0_operand (operands[2], CCmode) @@ -256,7 +256,7 @@ (compare:CCUNS (match_operand:HI 1 "non_update_memory_operand" "m") (match_operand:HI 3 "const_0_to_1_operand" "n"))) (clobber (match_scratch:GPR 0 "=r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LD_CMPI)" + "(TARGET_P10_FUSION)" "lhz%X1 %0,%1\;cmpldi %2,%0,%3" "&& reload_completed && (cc_reg_not_cr0_operand (operands[2], CCmode) @@ -277,7 +277,7 @@ (compare:CC (match_operand:HI 1 "non_update_memory_operand" "m") (match_operand:HI 3 "const_m1_to_1_operand" "n"))) (set (match_operand:EXTHI 0 "gpc_reg_operand" "=r") (sign_extend:EXTHI (match_dup 1)))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LD_CMPI)" + "(TARGET_P10_FUSION)" "lha%X1 %0,%1\;cmpdi %2,%0,%3" "&& reload_completed && (cc_reg_not_cr0_operand (operands[2], CCmode) @@ -298,7 +298,7 @@ (compare:CCUNS (match_operand:HI 1 "non_update_memory_operand" "m") (match_operand:HI 3 "const_0_to_1_operand" "n"))) (set (match_operand:EXTHI 0 "gpc_reg_operand" "=r") (zero_extend:EXTHI (match_dup 1)))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LD_CMPI)" + "(TARGET_P10_FUSION)" "lhz%X1 %0,%1\;cmpldi %2,%0,%3" "&& reload_completed && (cc_reg_not_cr0_operand (operands[2], CCmode) @@ -319,7 +319,7 @@ (compare:CCUNS (match_operand:QI 1 "non_update_memory_operand" "m") (match_operand:QI 3 "const_0_to_1_operand" "n"))) (clobber (match_scratch:GPR 0 "=r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LD_CMPI)" + "(TARGET_P10_FUSION)" "lbz%X1 %0,%1\;cmpldi %2,%0,%3" "&& reload_completed && (cc_reg_not_cr0_operand (operands[2], CCmode) @@ -340,7 +340,7 @@ (compare:CCUNS (match_operand:QI 1 "non_update_memory_operand" "m") (match_operand:QI 3 "const_0_to_1_operand" "n"))) (set (match_operand:GPR 0 "gpc_reg_operand" "=r") (zero_extend:GPR (match_dup 1)))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LD_CMPI)" + "(TARGET_P10_FUSION)" "lbz%X1 %0,%1\;cmpldi %2,%0,%3" "&& reload_completed && (cc_reg_not_cr0_operand (operands[2], CCmode) @@ -363,7 +363,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "%r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ and %3,%1,%0\;and %3,%3,%2 and %3,%1,%0\;and %3,%3,%2 @@ -381,7 +381,7 @@ 
(match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ andc %3,%1,%0\;and %3,%3,%2 andc %3,%1,%0\;and %3,%3,%2 @@ -399,7 +399,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ eqv %3,%1,%0\;and %3,%3,%2 eqv %3,%1,%0\;and %3,%3,%2 @@ -417,7 +417,7 @@ (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ nand %3,%1,%0\;and %3,%3,%2 nand %3,%1,%0\;and %3,%3,%2 @@ -435,7 +435,7 @@ (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ nor %3,%1,%0\;and %3,%3,%2 nor %3,%1,%0\;and %3,%3,%2 @@ -453,7 +453,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ or %3,%1,%0\;and %3,%3,%2 or %3,%1,%0\;and %3,%3,%2 @@ -471,7 +471,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ orc %3,%1,%0\;and %3,%3,%2 orc %3,%1,%0\;and %3,%3,%2 @@ -489,7 +489,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ xor %3,%1,%0\;and %3,%3,%2 xor %3,%1,%0\;and %3,%3,%2 @@ -507,7 +507,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_ADDLOG)" + "(TARGET_P10_FUSION)" "@ add %3,%1,%0\;and %3,%3,%2 add %3,%1,%0\;and %3,%3,%2 @@ -525,7 +525,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_ADDLOG)" + "(TARGET_P10_FUSION)" "@ subf %3,%1,%0\;and %3,%3,%2 subf %3,%1,%0\;and %3,%3,%2 @@ -543,7 +543,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ and %3,%1,%0\;andc %3,%3,%2 and %3,%1,%0\;andc %3,%3,%2 @@ -561,7 +561,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ andc %3,%1,%0\;andc %3,%3,%2 andc %3,%1,%0\;andc %3,%3,%2 @@ -579,7 +579,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - 
"(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ eqv %3,%1,%0\;andc %3,%3,%2 eqv %3,%1,%0\;andc %3,%3,%2 @@ -597,7 +597,7 @@ (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ nand %3,%1,%0\;andc %3,%3,%2 nand %3,%1,%0\;andc %3,%3,%2 @@ -615,7 +615,7 @@ (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ nor %3,%1,%0\;andc %3,%3,%2 nor %3,%1,%0\;andc %3,%3,%2 @@ -633,7 +633,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ or %3,%1,%0\;andc %3,%3,%2 or %3,%1,%0\;andc %3,%3,%2 @@ -651,7 +651,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ orc %3,%1,%0\;andc %3,%3,%2 orc %3,%1,%0\;andc %3,%3,%2 @@ -669,7 +669,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ xor %3,%1,%0\;andc %3,%3,%2 xor %3,%1,%0\;andc %3,%3,%2 @@ -687,7 +687,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ and %3,%1,%0\;eqv %3,%3,%2 and %3,%1,%0\;eqv %3,%3,%2 @@ -705,7 +705,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ andc %3,%1,%0\;eqv %3,%3,%2 andc %3,%1,%0\;eqv %3,%3,%2 @@ -723,7 +723,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "%r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ eqv %3,%1,%0\;eqv %3,%3,%2 eqv %3,%1,%0\;eqv %3,%3,%2 @@ -741,7 +741,7 @@ (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ nand %3,%1,%0\;eqv %3,%3,%2 nand %3,%1,%0\;eqv %3,%3,%2 @@ -759,7 +759,7 @@ (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ nor %3,%1,%0\;eqv %3,%3,%2 nor %3,%1,%0\;eqv %3,%3,%2 @@ -777,7 +777,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ or %3,%1,%0\;eqv %3,%3,%2 or 
%3,%1,%0\;eqv %3,%3,%2 @@ -795,7 +795,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ orc %3,%1,%0\;eqv %3,%3,%2 orc %3,%1,%0\;eqv %3,%3,%2 @@ -813,7 +813,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ xor %3,%1,%0\;eqv %3,%3,%2 xor %3,%1,%0\;eqv %3,%3,%2 @@ -831,7 +831,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ and %3,%1,%0\;nand %3,%3,%2 and %3,%1,%0\;nand %3,%3,%2 @@ -849,7 +849,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ andc %3,%1,%0\;nand %3,%3,%2 andc %3,%1,%0\;nand %3,%3,%2 @@ -867,7 +867,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ eqv %3,%1,%0\;nand %3,%3,%2 eqv %3,%1,%0\;nand %3,%3,%2 @@ -885,7 +885,7 @@ (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ nand %3,%1,%0\;nand %3,%3,%2 nand %3,%1,%0\;nand %3,%3,%2 @@ -903,7 +903,7 @@ (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ nor %3,%1,%0\;nand %3,%3,%2 nor %3,%1,%0\;nand %3,%3,%2 @@ -921,7 +921,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ or %3,%1,%0\;nand %3,%3,%2 or %3,%1,%0\;nand %3,%3,%2 @@ -939,7 +939,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ orc %3,%1,%0\;nand %3,%3,%2 orc %3,%1,%0\;nand %3,%3,%2 @@ -957,7 +957,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ xor %3,%1,%0\;nand %3,%3,%2 xor %3,%1,%0\;nand %3,%3,%2 @@ -975,7 +975,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_ADDLOG)" + "(TARGET_P10_FUSION)" "@ add %3,%1,%0\;nand %3,%3,%2 add %3,%1,%0\;nand %3,%3,%2 @@ -993,7 +993,7 @@ (match_operand:GPR 1 "gpc_reg_operand" 
"r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_ADDLOG)" + "(TARGET_P10_FUSION)" "@ subf %3,%1,%0\;nand %3,%3,%2 subf %3,%1,%0\;nand %3,%3,%2 @@ -1011,7 +1011,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ and %3,%1,%0\;nor %3,%3,%2 and %3,%1,%0\;nor %3,%3,%2 @@ -1029,7 +1029,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ andc %3,%1,%0\;nor %3,%3,%2 andc %3,%1,%0\;nor %3,%3,%2 @@ -1047,7 +1047,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ eqv %3,%1,%0\;nor %3,%3,%2 eqv %3,%1,%0\;nor %3,%3,%2 @@ -1065,7 +1065,7 @@ (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ nand %3,%1,%0\;nor %3,%3,%2 nand %3,%1,%0\;nor %3,%3,%2 @@ -1083,7 +1083,7 @@ (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ nor %3,%1,%0\;nor %3,%3,%2 nor %3,%1,%0\;nor %3,%3,%2 @@ -1101,7 +1101,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ or %3,%1,%0\;nor %3,%3,%2 or %3,%1,%0\;nor %3,%3,%2 @@ -1119,7 +1119,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ orc %3,%1,%0\;nor %3,%3,%2 orc %3,%1,%0\;nor %3,%3,%2 @@ -1137,7 +1137,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ xor %3,%1,%0\;nor %3,%3,%2 xor %3,%1,%0\;nor %3,%3,%2 @@ -1155,7 +1155,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_ADDLOG)" + "(TARGET_P10_FUSION)" "@ add %3,%1,%0\;nor %3,%3,%2 add %3,%1,%0\;nor %3,%3,%2 @@ -1173,7 +1173,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_ADDLOG)" + "(TARGET_P10_FUSION)" "@ subf %3,%1,%0\;nor %3,%3,%2 subf %3,%1,%0\;nor %3,%3,%2 @@ -1191,7 +1191,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" 
"r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ and %3,%1,%0\;or %3,%3,%2 and %3,%1,%0\;or %3,%3,%2 @@ -1209,7 +1209,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ andc %3,%1,%0\;or %3,%3,%2 andc %3,%1,%0\;or %3,%3,%2 @@ -1227,7 +1227,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ eqv %3,%1,%0\;or %3,%3,%2 eqv %3,%1,%0\;or %3,%3,%2 @@ -1245,7 +1245,7 @@ (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ nand %3,%1,%0\;or %3,%3,%2 nand %3,%1,%0\;or %3,%3,%2 @@ -1263,7 +1263,7 @@ (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ nor %3,%1,%0\;or %3,%3,%2 nor %3,%1,%0\;or %3,%3,%2 @@ -1281,7 +1281,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "%r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ or %3,%1,%0\;or %3,%3,%2 or %3,%1,%0\;or %3,%3,%2 @@ -1299,7 +1299,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ orc %3,%1,%0\;or %3,%3,%2 orc %3,%1,%0\;or %3,%3,%2 @@ -1317,7 +1317,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ xor %3,%1,%0\;or %3,%3,%2 xor %3,%1,%0\;or %3,%3,%2 @@ -1335,7 +1335,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_ADDLOG)" + "(TARGET_P10_FUSION)" "@ add %3,%1,%0\;or %3,%3,%2 add %3,%1,%0\;or %3,%3,%2 @@ -1353,7 +1353,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_ADDLOG)" + "(TARGET_P10_FUSION)" "@ subf %3,%1,%0\;or %3,%3,%2 subf %3,%1,%0\;or %3,%3,%2 @@ -1371,7 +1371,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ and %3,%1,%0\;orc %3,%3,%2 and %3,%1,%0\;orc %3,%3,%2 @@ -1389,7 +1389,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ andc %3,%1,%0\;orc %3,%3,%2 andc 
%3,%1,%0\;orc %3,%3,%2 @@ -1407,7 +1407,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ eqv %3,%1,%0\;orc %3,%3,%2 eqv %3,%1,%0\;orc %3,%3,%2 @@ -1425,7 +1425,7 @@ (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ nand %3,%1,%0\;orc %3,%3,%2 nand %3,%1,%0\;orc %3,%3,%2 @@ -1443,7 +1443,7 @@ (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ nor %3,%1,%0\;orc %3,%3,%2 nor %3,%1,%0\;orc %3,%3,%2 @@ -1461,7 +1461,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ or %3,%1,%0\;orc %3,%3,%2 or %3,%1,%0\;orc %3,%3,%2 @@ -1479,7 +1479,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ orc %3,%1,%0\;orc %3,%3,%2 orc %3,%1,%0\;orc %3,%3,%2 @@ -1497,7 +1497,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ xor %3,%1,%0\;orc %3,%3,%2 xor %3,%1,%0\;orc %3,%3,%2 @@ -1515,7 +1515,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ and %3,%1,%0\;xor %3,%3,%2 and %3,%1,%0\;xor %3,%3,%2 @@ -1533,7 +1533,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ andc %3,%1,%0\;xor %3,%3,%2 andc %3,%1,%0\;xor %3,%3,%2 @@ -1551,7 +1551,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ eqv %3,%1,%0\;xor %3,%3,%2 eqv %3,%1,%0\;xor %3,%3,%2 @@ -1569,7 +1569,7 @@ (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ nand %3,%1,%0\;xor %3,%3,%2 nand %3,%1,%0\;xor %3,%3,%2 @@ -1587,7 +1587,7 @@ (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ nor %3,%1,%0\;xor %3,%3,%2 nor %3,%1,%0\;xor %3,%3,%2 @@ -1605,7 +1605,7 @@ (match_operand:GPR 1 "gpc_reg_operand" 
"r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ or %3,%1,%0\;xor %3,%3,%2 or %3,%1,%0\;xor %3,%3,%2 @@ -1623,7 +1623,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ orc %3,%1,%0\;xor %3,%3,%2 orc %3,%1,%0\;xor %3,%3,%2 @@ -1641,7 +1641,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "%r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ xor %3,%1,%0\;xor %3,%3,%2 xor %3,%1,%0\;xor %3,%3,%2 @@ -1659,7 +1659,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)" + "(TARGET_P10_FUSION)" "@ and %3,%1,%0\;add %3,%3,%2 and %3,%1,%0\;add %3,%3,%2 @@ -1677,7 +1677,7 @@ (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)" + "(TARGET_P10_FUSION)" "@ nand %3,%1,%0\;add %3,%3,%2 nand %3,%1,%0\;add %3,%3,%2 @@ -1695,7 +1695,7 @@ (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)" + "(TARGET_P10_FUSION)" "@ nor %3,%1,%0\;add %3,%3,%2 nor %3,%1,%0\;add %3,%3,%2 @@ -1713,7 +1713,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)" + "(TARGET_P10_FUSION)" "@ or %3,%1,%0\;add %3,%3,%2 or %3,%1,%0\;add %3,%3,%2 @@ -1731,7 +1731,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)" + "(TARGET_P10_FUSION)" "@ and %3,%1,%0\;subf %3,%2,%3 and %3,%1,%0\;subf %3,%2,%3 @@ -1749,7 +1749,7 @@ (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)" + "(TARGET_P10_FUSION)" "@ nand %3,%1,%0\;subf %3,%2,%3 nand %3,%1,%0\;subf %3,%2,%3 @@ -1767,7 +1767,7 @@ (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)" + "(TARGET_P10_FUSION)" "@ nor %3,%1,%0\;subf %3,%2,%3 nor %3,%1,%0\;subf %3,%2,%3 @@ -1785,7 +1785,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)" + "(TARGET_P10_FUSION)" "@ or %3,%1,%0\;subf %3,%2,%3 or %3,%1,%0\;subf %3,%2,%3 @@ -1803,7 +1803,7 @@ (and:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && 
TARGET_P10_FUSION_LOGADD)" + "(TARGET_P10_FUSION)" "@ and %3,%1,%0\;subf %3,%3,%2 and %3,%1,%0\;subf %3,%3,%2 @@ -1821,7 +1821,7 @@ (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)" + "(TARGET_P10_FUSION)" "@ nand %3,%1,%0\;subf %3,%3,%2 nand %3,%1,%0\;subf %3,%3,%2 @@ -1839,7 +1839,7 @@ (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")) (not:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)" + "(TARGET_P10_FUSION)" "@ nor %3,%1,%0\;subf %3,%3,%2 nor %3,%1,%0\;subf %3,%3,%2 @@ -1857,7 +1857,7 @@ (ior:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)" + "(TARGET_P10_FUSION)" "@ or %3,%1,%0\;subf %3,%3,%2 or %3,%1,%0\;subf %3,%3,%2 @@ -1875,7 +1875,7 @@ (match_operand:VM 1 "altivec_register_operand" "%v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vand %3,%1,%0\;vand %3,%3,%2 vand %3,%1,%0\;vand %3,%3,%2 @@ -1893,7 +1893,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vandc %3,%1,%0\;vand %3,%3,%2 vandc %3,%1,%0\;vand %3,%3,%2 @@ -1911,7 +1911,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ veqv %3,%1,%0\;vand %3,%3,%2 veqv %3,%1,%0\;vand %3,%3,%2 @@ -1929,7 +1929,7 @@ (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vnand %3,%1,%0\;vand %3,%3,%2 vnand %3,%1,%0\;vand %3,%3,%2 @@ -1947,7 +1947,7 @@ (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vnor %3,%1,%0\;vand %3,%3,%2 vnor %3,%1,%0\;vand %3,%3,%2 @@ -1965,7 +1965,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vor %3,%1,%0\;vand %3,%3,%2 vor %3,%1,%0\;vand %3,%3,%2 @@ -1983,7 +1983,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vorc %3,%1,%0\;vand %3,%3,%2 vorc %3,%1,%0\;vand %3,%3,%2 @@ -2001,7 +2001,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 
"=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vxor %3,%1,%0\;vand %3,%3,%2 vxor %3,%1,%0\;vand %3,%3,%2 @@ -2019,7 +2019,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vand %3,%1,%0\;vandc %3,%3,%2 vand %3,%1,%0\;vandc %3,%3,%2 @@ -2037,7 +2037,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vandc %3,%1,%0\;vandc %3,%3,%2 vandc %3,%1,%0\;vandc %3,%3,%2 @@ -2055,7 +2055,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ veqv %3,%1,%0\;vandc %3,%3,%2 veqv %3,%1,%0\;vandc %3,%3,%2 @@ -2073,7 +2073,7 @@ (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vnand %3,%1,%0\;vandc %3,%3,%2 vnand %3,%1,%0\;vandc %3,%3,%2 @@ -2091,7 +2091,7 @@ (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vnor %3,%1,%0\;vandc %3,%3,%2 vnor %3,%1,%0\;vandc %3,%3,%2 @@ -2109,7 +2109,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vor %3,%1,%0\;vandc %3,%3,%2 vor %3,%1,%0\;vandc %3,%3,%2 @@ -2127,7 +2127,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vorc %3,%1,%0\;vandc %3,%3,%2 vorc %3,%1,%0\;vandc %3,%3,%2 @@ -2145,7 +2145,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vxor %3,%1,%0\;vandc %3,%3,%2 vxor %3,%1,%0\;vandc %3,%3,%2 @@ -2163,7 +2163,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vand %3,%1,%0\;veqv %3,%3,%2 vand %3,%1,%0\;veqv %3,%3,%2 @@ -2181,7 +2181,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vandc %3,%1,%0\;veqv %3,%3,%2 vandc %3,%1,%0\;veqv %3,%3,%2 @@ -2199,7 +2199,7 @@ (match_operand:VM 1 
"altivec_register_operand" "%v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ veqv %3,%1,%0\;veqv %3,%3,%2 veqv %3,%1,%0\;veqv %3,%3,%2 @@ -2217,7 +2217,7 @@ (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vnand %3,%1,%0\;veqv %3,%3,%2 vnand %3,%1,%0\;veqv %3,%3,%2 @@ -2235,7 +2235,7 @@ (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vnor %3,%1,%0\;veqv %3,%3,%2 vnor %3,%1,%0\;veqv %3,%3,%2 @@ -2253,7 +2253,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vor %3,%1,%0\;veqv %3,%3,%2 vor %3,%1,%0\;veqv %3,%3,%2 @@ -2271,7 +2271,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vorc %3,%1,%0\;veqv %3,%3,%2 vorc %3,%1,%0\;veqv %3,%3,%2 @@ -2289,7 +2289,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vxor %3,%1,%0\;veqv %3,%3,%2 vxor %3,%1,%0\;veqv %3,%3,%2 @@ -2307,7 +2307,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vand %3,%1,%0\;vnand %3,%3,%2 vand %3,%1,%0\;vnand %3,%3,%2 @@ -2325,7 +2325,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vandc %3,%1,%0\;vnand %3,%3,%2 vandc %3,%1,%0\;vnand %3,%3,%2 @@ -2343,7 +2343,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ veqv %3,%1,%0\;vnand %3,%3,%2 veqv %3,%1,%0\;vnand %3,%3,%2 @@ -2361,7 +2361,7 @@ (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vnand %3,%1,%0\;vnand %3,%3,%2 vnand %3,%1,%0\;vnand %3,%3,%2 @@ -2379,7 +2379,7 @@ (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + 
"(TARGET_P10_FUSION)" "@ vnor %3,%1,%0\;vnand %3,%3,%2 vnor %3,%1,%0\;vnand %3,%3,%2 @@ -2397,7 +2397,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vor %3,%1,%0\;vnand %3,%3,%2 vor %3,%1,%0\;vnand %3,%3,%2 @@ -2415,7 +2415,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vorc %3,%1,%0\;vnand %3,%3,%2 vorc %3,%1,%0\;vnand %3,%3,%2 @@ -2433,7 +2433,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vxor %3,%1,%0\;vnand %3,%3,%2 vxor %3,%1,%0\;vnand %3,%3,%2 @@ -2451,7 +2451,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vand %3,%1,%0\;vnor %3,%3,%2 vand %3,%1,%0\;vnor %3,%3,%2 @@ -2469,7 +2469,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vandc %3,%1,%0\;vnor %3,%3,%2 vandc %3,%1,%0\;vnor %3,%3,%2 @@ -2487,7 +2487,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ veqv %3,%1,%0\;vnor %3,%3,%2 veqv %3,%1,%0\;vnor %3,%3,%2 @@ -2505,7 +2505,7 @@ (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vnand %3,%1,%0\;vnor %3,%3,%2 vnand %3,%1,%0\;vnor %3,%3,%2 @@ -2523,7 +2523,7 @@ (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vnor %3,%1,%0\;vnor %3,%3,%2 vnor %3,%1,%0\;vnor %3,%3,%2 @@ -2541,7 +2541,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vor %3,%1,%0\;vnor %3,%3,%2 vor %3,%1,%0\;vnor %3,%3,%2 @@ -2559,7 +2559,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vorc %3,%1,%0\;vnor %3,%3,%2 vorc %3,%1,%0\;vnor %3,%3,%2 @@ -2577,7 +2577,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM 
(match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vxor %3,%1,%0\;vnor %3,%3,%2 vxor %3,%1,%0\;vnor %3,%3,%2 @@ -2595,7 +2595,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vand %3,%1,%0\;vor %3,%3,%2 vand %3,%1,%0\;vor %3,%3,%2 @@ -2613,7 +2613,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vandc %3,%1,%0\;vor %3,%3,%2 vandc %3,%1,%0\;vor %3,%3,%2 @@ -2631,7 +2631,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ veqv %3,%1,%0\;vor %3,%3,%2 veqv %3,%1,%0\;vor %3,%3,%2 @@ -2649,7 +2649,7 @@ (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vnand %3,%1,%0\;vor %3,%3,%2 vnand %3,%1,%0\;vor %3,%3,%2 @@ -2667,7 +2667,7 @@ (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vnor %3,%1,%0\;vor %3,%3,%2 vnor %3,%1,%0\;vor %3,%3,%2 @@ -2685,7 +2685,7 @@ (match_operand:VM 1 "altivec_register_operand" "%v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vor %3,%1,%0\;vor %3,%3,%2 vor %3,%1,%0\;vor %3,%3,%2 @@ -2703,7 +2703,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vorc %3,%1,%0\;vor %3,%3,%2 vorc %3,%1,%0\;vor %3,%3,%2 @@ -2721,7 +2721,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vxor %3,%1,%0\;vor %3,%3,%2 vxor %3,%1,%0\;vor %3,%3,%2 @@ -2739,7 +2739,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vand %3,%1,%0\;vorc %3,%3,%2 vand %3,%1,%0\;vorc %3,%3,%2 @@ -2757,7 +2757,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vandc %3,%1,%0\;vorc %3,%3,%2 vandc %3,%1,%0\;vorc %3,%3,%2 @@ -2775,7 +2775,7 @@ (match_operand:VM 1 
"altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ veqv %3,%1,%0\;vorc %3,%3,%2 veqv %3,%1,%0\;vorc %3,%3,%2 @@ -2793,7 +2793,7 @@ (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vnand %3,%1,%0\;vorc %3,%3,%2 vnand %3,%1,%0\;vorc %3,%3,%2 @@ -2811,7 +2811,7 @@ (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vnor %3,%1,%0\;vorc %3,%3,%2 vnor %3,%1,%0\;vorc %3,%3,%2 @@ -2829,7 +2829,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vor %3,%1,%0\;vorc %3,%3,%2 vor %3,%1,%0\;vorc %3,%3,%2 @@ -2847,7 +2847,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vorc %3,%1,%0\;vorc %3,%3,%2 vorc %3,%1,%0\;vorc %3,%3,%2 @@ -2865,7 +2865,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v")))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vxor %3,%1,%0\;vorc %3,%3,%2 vxor %3,%1,%0\;vorc %3,%3,%2 @@ -2883,7 +2883,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vand %3,%1,%0\;vxor %3,%3,%2 vand %3,%1,%0\;vxor %3,%3,%2 @@ -2901,7 +2901,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vandc %3,%1,%0\;vxor %3,%3,%2 vandc %3,%1,%0\;vxor %3,%3,%2 @@ -2919,7 +2919,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ veqv %3,%1,%0\;vxor %3,%3,%2 veqv %3,%1,%0\;vxor %3,%3,%2 @@ -2937,7 +2937,7 @@ (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vnand %3,%1,%0\;vxor %3,%3,%2 vnand %3,%1,%0\;vxor %3,%3,%2 @@ -2955,7 +2955,7 @@ (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + 
"(TARGET_P10_FUSION)" "@ vnor %3,%1,%0\;vxor %3,%3,%2 vnor %3,%1,%0\;vxor %3,%3,%2 @@ -2973,7 +2973,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vor %3,%1,%0\;vxor %3,%3,%2 vor %3,%1,%0\;vxor %3,%3,%2 @@ -2991,7 +2991,7 @@ (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vorc %3,%1,%0\;vxor %3,%3,%2 vorc %3,%1,%0\;vxor %3,%3,%2 @@ -3009,7 +3009,7 @@ (match_operand:VM 1 "altivec_register_operand" "%v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:VM 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" + "(TARGET_P10_FUSION)" "@ vxor %3,%1,%0\;vxor %3,%3,%2 vxor %3,%1,%0\;vxor %3,%3,%2 @@ -3027,7 +3027,7 @@ (match_operand:GPR 1 "gpc_reg_operand" "%r,r,r,r")) (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r"))) (clobber (match_scratch:GPR 4 "=X,X,X,&r"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2ADD)" + "(TARGET_P10_FUSION)" "@ add %3,%1,%0\;add %3,%3,%2 add %3,%1,%0\;add %3,%3,%2 @@ -3045,7 +3045,7 @@ (match_operand:V2DI 1 "altivec_register_operand" "%v,v,v,v")) (match_operand:V2DI 2 "altivec_register_operand" "v,v,v,v"))) (clobber (match_scratch:V2DI 4 "=X,X,X,&v"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2ADD)" + "(TARGET_P10_FUSION)" "@ vaddudm %3,%1,%0\;vaddudm %3,%3,%2 vaddudm %3,%1,%0\;vaddudm %3,%3,%2 diff --git a/gcc/config/rs6000/genfusion.pl b/gcc/config/rs6000/genfusion.pl index 7e201f7..81cc225 100755 --- a/gcc/config/rs6000/genfusion.pl +++ b/gcc/config/rs6000/genfusion.pl @@ -118,7 +118,7 @@ sub gen_ld_cmpi_p10 } else { print " (set (match_operand:${result} 0 \"gpc_reg_operand\" \"=r\") (${extend}_extend:${result} (match_dup 1)))]\n"; } - print " \"(TARGET_P10_FUSION && TARGET_P10_FUSION_LD_CMPI)\"\n"; + print " \"(TARGET_P10_FUSION)\"\n"; print " \"l${ldst}${echr}%X1 %0,%1\\;cmp${cmpl}di %2,%0,%3\"\n"; print " \"&& reload_completed\n"; print " && (cc_reg_not_cr0_operand (operands[2], CCmode)\n"; @@ -166,8 +166,8 @@ sub gen_logical_addsubf $outer_op, $outer_comp, $outer_inv, $outer_rtl, $inner, @inner_ops, $inner_comp, $inner_inv, $inner_rtl, $inner_op, $both_commute, $c4, $bc, $inner_arg0, $inner_arg1, $inner_exp, $outer_arg2, $outer_exp, - $target_flag, $ftype, $insn, $is_subf, $is_rsubf, $outer_32, $outer_42, - $outer_name, $fuse_type); + $ftype, $insn, $is_subf, $is_rsubf, $outer_32, $outer_42,$outer_name, + $fuse_type); KIND: foreach $kind ('scalar','vector') { @outer_ops = @logicals; if ( $kind eq 'vector' ) { @@ -199,18 +199,15 @@ sub gen_logical_addsubf $outer_rtl = $rtlop{$outer}; @inner_ops = @logicals; $ftype = "logical-logical"; - $target_flag = "TARGET_P10_FUSION_2LOGICAL"; if ( exists $isaddsub{$outer} ) { @inner_ops = sort keys %logicals_addsub; $ftype = "logical-add"; - $target_flag = "TARGET_P10_FUSION_LOGADD"; } elsif ( $kind ne 'vector' && exists $logicals_addsub{$outer} ) { push (@inner_ops, @addsub); } INNER: foreach $inner ( @inner_ops ) { if ( exists $isaddsub{$inner} ) { $ftype = "add-logical"; - $target_flag = "TARGET_P10_FUSION_ADDLOG"; } $inner_comp = $complement{$inner}; $inner_inv = $invert{$inner}; @@ -266,7 +263,7 @@ sub gen_logical_addsubf [(set (match_operand:${mode} 3 "${pred}" 
"=&0,&1,&${constraint},${constraint}") ${outer_exp}) (clobber (match_scratch:${mode} 4 "=X,X,X,&${constraint}"))] - "(TARGET_P10_FUSION && $target_flag)" + "(TARGET_P10_FUSION)" "@ ${inner_op} %3,%1,%0\\;${outer_op} %3,${outer_32} ${inner_op} %3,%1,%0\\;${outer_op} %3,${outer_32} @@ -313,7 +310,7 @@ sub gen_addadd (match_operand:${mode} 1 "${pred}" "%${c4}")) (match_operand:${mode} 2 "${pred}" "${c4}"))) (clobber (match_scratch:${mode} 4 "=X,X,X,&${constraint}"))] - "(TARGET_P10_FUSION && TARGET_P10_FUSION_2ADD)" + "(TARGET_P10_FUSION)" "@ ${op} %3,%1,%0\\;${op} %3,%3,%2 ${op} %3,%1,%0\\;${op} %3,%3,%2 diff --git a/gcc/config/rs6000/rs6000-cpus.def b/gcc/config/rs6000/rs6000-cpus.def index 325b219..963947f 100644 --- a/gcc/config/rs6000/rs6000-cpus.def +++ b/gcc/config/rs6000/rs6000-cpus.def @@ -85,13 +85,7 @@ #define ISA_3_1_MASKS_SERVER (ISA_3_0_MASKS_SERVER \ | OPTION_MASK_POWER10 \ | OTHER_POWER10_MASKS \ - | OPTION_MASK_P10_FUSION \ - | OPTION_MASK_P10_FUSION_LD_CMPI \ - | OPTION_MASK_P10_FUSION_2LOGICAL \ - | OPTION_MASK_P10_FUSION_LOGADD \ - | OPTION_MASK_P10_FUSION_ADDLOG \ - | OPTION_MASK_P10_FUSION_2ADD \ - | OPTION_MASK_P10_FUSION_2STORE) + | OPTION_MASK_P10_FUSION) /* Flags that need to be turned off if -mno-power9-vector. */ #define OTHER_P9_VECTOR_MASKS (OPTION_MASK_FLOAT128_HW \ @@ -139,12 +133,6 @@ | OPTION_MASK_FPRND \ | OPTION_MASK_POWER10 \ | OPTION_MASK_P10_FUSION \ - | OPTION_MASK_P10_FUSION_LD_CMPI \ - | OPTION_MASK_P10_FUSION_2LOGICAL \ - | OPTION_MASK_P10_FUSION_LOGADD \ - | OPTION_MASK_P10_FUSION_ADDLOG \ - | OPTION_MASK_P10_FUSION_2ADD \ - | OPTION_MASK_P10_FUSION_2STORE \ | OPTION_MASK_HTM \ | OPTION_MASK_ISEL \ | OPTION_MASK_MFCRF \ diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index 32a13cd..d7a7cfe 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -4446,30 +4446,6 @@ rs6000_option_override_internal (bool global_init_p) && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION) == 0) rs6000_isa_flags |= OPTION_MASK_P10_FUSION; - if (TARGET_POWER10 && - (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_LD_CMPI) == 0) - rs6000_isa_flags |= OPTION_MASK_P10_FUSION_LD_CMPI; - - if (TARGET_POWER10 - && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_2LOGICAL) == 0) - rs6000_isa_flags |= OPTION_MASK_P10_FUSION_2LOGICAL; - - if (TARGET_POWER10 - && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_LOGADD) == 0) - rs6000_isa_flags |= OPTION_MASK_P10_FUSION_LOGADD; - - if (TARGET_POWER10 - && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_ADDLOG) == 0) - rs6000_isa_flags |= OPTION_MASK_P10_FUSION_ADDLOG; - - if (TARGET_POWER10 - && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_2ADD) == 0) - rs6000_isa_flags |= OPTION_MASK_P10_FUSION_2ADD; - - if (TARGET_POWER10 - && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_2STORE) == 0) - rs6000_isa_flags |= OPTION_MASK_P10_FUSION_2STORE; - /* Turn off vector pair/mma options on non-power10 systems. */ else if (!TARGET_POWER10 && TARGET_MMA) { @@ -19032,8 +19008,7 @@ power10_sched_reorder (rtx_insn **ready, int lastpos) /* Try to pair certain store insns to adjacent memory locations so that the hardware will fuse them to a single operation. */ - if (TARGET_P10_FUSION && TARGET_P10_FUSION_2STORE - && is_fusable_store (last_scheduled_insn, &mem1)) + if (TARGET_P10_FUSION && is_fusable_store (last_scheduled_insn, &mem1)) { /* A fusable store was just scheduled. 
Scan the ready list for another diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt index 68c0cae..4931d78 100644 --- a/gcc/config/rs6000/rs6000.opt +++ b/gcc/config/rs6000/rs6000.opt @@ -487,33 +487,9 @@ Target Mask(P8_VECTOR) Var(rs6000_isa_flags) Use vector and scalar instructions added in ISA 2.07. mpower10-fusion -Target Mask(P10_FUSION) Var(rs6000_isa_flags) +Target Undocumented Mask(P10_FUSION) Var(rs6000_isa_flags) Fuse certain integer operations together for better performance on power10. -mpower10-fusion-ld-cmpi -Target Undocumented Mask(P10_FUSION_LD_CMPI) Var(rs6000_isa_flags) -Fuse certain integer operations together for better performance on power10. - -mpower10-fusion-2logical -Target Undocumented Mask(P10_FUSION_2LOGICAL) Var(rs6000_isa_flags) -Fuse pairs of scalar or vector logical operations together for better performance on power10. - -mpower10-fusion-logical-add -Target Undocumented Mask(P10_FUSION_LOGADD) Var(rs6000_isa_flags) -Fuse scalar logical op with add/subf for better performance on power10. - -mpower10-fusion-add-logical -Target Undocumented Mask(P10_FUSION_ADDLOG) Var(rs6000_isa_flags) -Fuse scalar add/subf with logical op for better performance on power10. - -mpower10-fusion-2add -Target Undocumented Mask(P10_FUSION_2ADD) Var(rs6000_isa_flags) -Fuse dependent pairs of add or vaddudm instructions for better performance on power10. - -mpower10-fusion-2store -Target Undocumented Mask(P10_FUSION_2STORE) Var(rs6000_isa_flags) -Fuse certain store operations together for better performance on power10. - mcrypto Target Mask(CRYPTO) Var(rs6000_isa_flags) Use ISA 2.07 Category:Vector.AES and Category:Vector.SHA2 instructions. -- cgit v1.1 From 8e5c34ab45f34aadea65c5ba33ec685264b6ec66 Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Fri, 18 Feb 2022 16:50:03 +0100 Subject: [nvptx] Use nvptx_warpsync / nvptx_uniform_warp_check for -muniform-simt With the default ptx isa 6.0, we have for uniform-simt-1.c: ... @%r33 atom.global.cas.b32 %r26, [a], %r28, %r29; shfl.sync.idx.b32 %r26, %r26, %r32, 31, 0xffffffff; ... The atomic insn is predicated by -muniform-simt, and the subsequent insn does a warp sync, at which point the warp is uniform again. But with -mptx=3.1, we have instead: ... @%r33 atom.global.cas.b32 %r26, [a], %r28, %r29; shfl.idx.b32 %r26, %r26, %r32, 31; ... The shfl does not sync the warp, and we want the warp to go back to executing uniformly asap. We cannot enforce this, but at least check this using nvptx_uniform_warp_check, similar to how that is done for openacc. Likewise, detect the case that no shfl insn is emitted, and add a nvptx_uniform_warp_check or nvptx_warpsync. gcc/ChangeLog: 2022-02-19 Tom de Vries * config/nvptx/nvptx.cc (nvptx_unisimt_handle_set): Change return type to bool. (nvptx_reorg_uniform_simt): Insert nvptx_uniform_warp_check or nvptx_warpsync, if necessary. gcc/testsuite/ChangeLog: 2022-02-19 Tom de Vries * gcc.target/nvptx/uniform-simt-1.c: Add scan-assembler test. * gcc.target/nvptx/uniform-simt-2.c: New test. --- gcc/config/nvptx/nvptx.cc | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index afbad5b..4942f11 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -3248,12 +3248,18 @@ nvptx_call_insn_is_syscall_p (rtx_insn *insn) /* If SET subexpression of INSN sets a register, emit a shuffle instruction to propagate its value from lane MASTER to current lane. 
*/ -static void +static bool nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master) { rtx reg; if (GET_CODE (set) == SET && REG_P (reg = SET_DEST (set))) - emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX), insn); + { + emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX), + insn); + return true; + } + + return false; } /* Adjust code for uniform-simt code generation variant by making atomics and @@ -3275,8 +3281,30 @@ nvptx_reorg_uniform_simt () continue; rtx pat = PATTERN (insn); rtx master = nvptx_get_unisimt_master (); + bool shuffle_p = false; for (int i = 0; i < XVECLEN (pat, 0); i++) - nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master); + shuffle_p + |= nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master); + if (shuffle_p && TARGET_PTX_6_0) + { + /* The shuffle is a sync, so uniformity is guaranteed. */ + } + else + { + if (TARGET_PTX_6_0) + { + gcc_assert (!shuffle_p); + /* Emit after the insn, to guarantee uniformity. */ + emit_insn_after (gen_nvptx_warpsync (), insn); + } + else + { + /* Emit after the insn (and before the shuffle, if there are any) + to check uniformity. */ + emit_insn_after (gen_nvptx_uniform_warp_check (), insn); + } + } + rtx pred = nvptx_get_unisimt_predicate (); pred = gen_rtx_NE (BImode, pred, const0_rtx); pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat); -- cgit v1.1 From 9ed52438b8ca99a0dffe74da96c2281cbc9cbb4b Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Fri, 18 Feb 2022 17:38:50 +0100 Subject: [nvptx] Don't skip atomic insns in nvptx_reorg_uniform_simt In nvptx_reorg_uniform_simt we have a loop: ... for (insn = get_insns (); insn; insn = next) { next = NEXT_INSN (insn); if (!(CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn)) && !(NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PARALLEL && get_attr_atomic (insn))) continue; ... that intends to handle syscalls and atomic insns. However, this also silently skips the atomic insn nvptx_atomic_store, which has GET_CODE (PATTERN (insn)) == SET. This does not cause problems, because the nvptx_atomic_store actually maps onto a "st" insn, and therefore is not atomic and doesn't need to be handled by nvptx_reorg_uniform_simt. Fix this by: - explicitly setting nvptx_atomic_store's atomic attribute to false, - rewriting the skip condition to make sure all insn with atomic attribute are handled, and - asserting that all handled insns are PARALLEL. Tested on nvptx. gcc/ChangeLog: 2022-02-19 Tom de Vries * config/nvptx/nvptx.cc (nvptx_reorg_uniform_simt): Handle all insns with atomic attribute. Assert that all handled insns are PARALLELs. * config/nvptx/nvptx.md (define_insn "nvptx_atomic_store"): Set atomic attribute to false. gcc/testsuite/ChangeLog: 2022-02-19 Tom de Vries * gcc.target/nvptx/uniform-simt-3.c: New test. --- gcc/config/nvptx/nvptx.cc | 20 ++++++++++++++++---- gcc/config/nvptx/nvptx.md | 2 +- 2 files changed, 17 insertions(+), 5 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index 4942f11..55fab3e 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -3274,12 +3274,24 @@ nvptx_reorg_uniform_simt () for (insn = get_insns (); insn; insn = next) { next = NEXT_INSN (insn); - if (!(CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn)) - && !(NONJUMP_INSN_P (insn) - && GET_CODE (PATTERN (insn)) == PARALLEL - && get_attr_atomic (insn))) + + /* Skip NOTE, USE, etc. 
*/ + if (!INSN_P (insn) || recog_memoized (insn) == -1) continue; + + if (CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn)) + { + /* Handle syscall. */ + } + else if (get_attr_atomic (insn)) + { + /* Handle atomic insn. */ + } + else + continue; + rtx pat = PATTERN (insn); + gcc_assert (GET_CODE (pat) == PARALLEL); rtx master = nvptx_get_unisimt_master (); bool shuffle_p = false; for (int i = 0; i < XVECLEN (pat, 0); i++) diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index 4c378ec..132ef2f 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -2097,7 +2097,7 @@ = "%.\tst%A0.b%T0\t%0, %1;"; return nvptx_output_atomic_insn (t, operands, 0, 2); } - [(set_attr "atomic" "true")]) + [(set_attr "atomic" "false")]) ;; Note: st is not an atomic insn. (define_insn "atomic_fetch_add" [(set (match_operand:SDIM 1 "memory_operand" "+m") -- cgit v1.1 From 69cb3f2abb911acebfc7ffede2ee7151a3e14a59 Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Tue, 15 Feb 2022 14:36:26 +0100 Subject: [nvptx] Use _ as destination operand of atom.exch We currently generate this code for an atomic store: ... .reg.u32 %r21; atom.exch.b32 %r21,[%r22],%r23; ... where %r21 is set but unused. Use the ptx bit bucket operand '_' instead, such that we have: ... atom.exch.b32 _,[%r22],%r23; ... [ Note that the same problem still occurs for this code: ... void atomic_store (int *ptr, int val) { __atomic_exchange_n (ptr, val, MEMMODEL_RELAXED); } ... ] Tested on nvptx. gcc/ChangeLog: 2022-02-19 Tom de Vries * config/nvptx/nvptx.cc (nvptx_reorg_uniform_simt): Handle SET insn. * config/nvptx/nvptx.md (define_insn "nvptx_atomic_store"): Rename to ... (define_insn "nvptx_atomic_store_sm70"): This. (define_insn "nvptx_atomic_store"): New define_insn. (define_expand "atomic_store"): Handle rename. Use nvptx_atomic_store instead of atomic_exchange. gcc/testsuite/ChangeLog: 2022-02-19 Tom de Vries * gcc.target/nvptx/atomic-store-1.c: Update. --- gcc/config/nvptx/nvptx.cc | 18 ++++++++++++++---- gcc/config/nvptx/nvptx.md | 25 +++++++++++++++++++------ 2 files changed, 33 insertions(+), 10 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index 55fab3e..ed347ca 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -3291,12 +3291,22 @@ nvptx_reorg_uniform_simt () continue; rtx pat = PATTERN (insn); - gcc_assert (GET_CODE (pat) == PARALLEL); rtx master = nvptx_get_unisimt_master (); bool shuffle_p = false; - for (int i = 0; i < XVECLEN (pat, 0); i++) - shuffle_p - |= nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master); + switch (GET_CODE (pat)) + { + case PARALLEL: + for (int i = 0; i < XVECLEN (pat, 0); i++) + shuffle_p + |= nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master); + break; + case SET: + shuffle_p |= nvptx_unisimt_handle_set (pat, insn, master); + break; + default: + gcc_unreachable (); + } + if (shuffle_p && TARGET_PTX_6_0) { /* The shuffle is a sync, so uniformity is guaranteed. */ diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index 132ef2f..f6dc817 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -2069,8 +2069,8 @@ if (TARGET_SM70) { - emit_insn (gen_nvptx_atomic_store (operands[0], operands[1], - operands[2])); + emit_insn (gen_nvptx_atomic_store_sm70 (operands[0], operands[1], + operands[2])); DONE; } @@ -2079,13 +2079,12 @@ /* Fall back to expand_atomic_store. 
*/ FAIL; - rtx tmpreg = gen_reg_rtx (mode); - emit_insn (gen_atomic_exchange (tmpreg, operands[0], operands[1], - operands[2])); + emit_insn (gen_nvptx_atomic_store (operands[0], operands[1], + operands[2])); DONE; }) -(define_insn "nvptx_atomic_store" +(define_insn "nvptx_atomic_store_sm70" [(set (match_operand:SDIM 0 "memory_operand" "+m") ;; memory (unspec_volatile:SDIM [(match_operand:SDIM 1 "nvptx_nonmemory_operand" "Ri") ;; input @@ -2099,6 +2098,20 @@ } [(set_attr "atomic" "false")]) ;; Note: st is not an atomic insn. +(define_insn "nvptx_atomic_store" + [(set (match_operand:SDIM 0 "memory_operand" "+m") ;; memory + (unspec_volatile:SDIM + [(match_operand:SDIM 1 "nvptx_nonmemory_operand" "Ri") ;; input + (match_operand:SI 2 "const_int_operand")] ;; model + UNSPECV_ST))] + "!TARGET_SM70" + { + const char *t + = "%.\tatom%A0.exch.b%T0\t_, %0, %1;"; + return nvptx_output_atomic_insn (t, operands, 0, 2); + } + [(set_attr "atomic" "true")]) + (define_insn "atomic_fetch_add" [(set (match_operand:SDIM 1 "memory_operand" "+m") (unspec_volatile:SDIM -- cgit v1.1 From 02aedc6f269b5e3c1f354edcf5b84d27b0a15946 Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Wed, 16 Feb 2022 17:09:11 +0100 Subject: [nvptx] Initialize ptx regs With nvptx target, driver version 510.47.03 and board GT 1030 I, we run into: ... FAIL: gcc.c-torture/execute/pr53465.c -O1 execution test FAIL: gcc.c-torture/execute/pr53465.c -O2 execution test FAIL: gcc.c-torture/execute/pr53465.c -O3 -g execution test ... while the test-cases pass with nvptx-none-run -O0. The problem is that the generated ptx contains a read from an uninitialized ptx register, and the driver JIT doesn't handle this well. For -O2 and -O3, we can get rid of the FAIL using --param logical-op-non-short-circuit=0. But not for -O1. At -O1, the test-case minimizes to: ... void __attribute__((noinline, noclone)) foo (int y) { int c; for (int i = 0; i < y; i++) { int d = i + 1; if (i && d <= c) __builtin_abort (); c = d; } } int main () { foo (2); return 0; } ... Note that the test-case does not contain an uninitialized use. In the first iteration, i is 0 and consequently c is not read. In the second iteration, c is read, but by that time it's already initialized by 'c = d' from the first iteration. AFAICT the problem is introduced as follows: the conditional use of c in the loop body is translated into an unconditional use of c in the loop header: ... # c_1 = PHI ... which forwprop1 propagates the 'c_9 = d_7' assignment into: ... # c_1 = PHI ... which ends up being translated by expand into an unconditional: ... (insn 13 12 0 (set (reg/v:SI 22 [ c ]) (reg/v:SI 23 [ d ])) -1 (nil)) ... at the start of the loop body, creating an uninitialized read of d on the path from loop entry. By disabling coalesce_ssa_name, we get the more usual copies on the incoming edges. The copy on the loop entry path still does an uninitialized read, but that one's now initialized by init-regs. The test-case passes, also when disabling init-regs, so it's possible that the JIT driver doesn't object to this type of uninitialized read. Now that we characterized the problem to some degree, we need to fix this, because either: - we're violating an undocumented ptx invariant, and this is a compiler bug, or - this is is a driver JIT bug and we need to work around it. 
There are essentially two strategies to address this: - stop the compiler from creating uninitialized reads - patch up uninitialized reads using additional initialization The former will probably involve: - making some optimizations more conservative in the presence of uninitialized reads, and - disabling some other optimizations (where making them more conservative is not possible, or cannot easily be achieved). This will probably will have a cost penalty for code that does not suffer from the original problem. The latter has the problem that it may paper over uninitialized reads in the source code, or indeed over ones that were incorrectly introduced by the compiler. But it has the advantage that it allows for the problem to be addressed at a single location. There's an existing pass, init-regs, which implements a form of the latter, but it doesn't work for this example because it only inserts additional initialization for uses that have not a single reaching definition. Fix this by adding initialization of uninitialized ptx regs in reorg. Control the new functionality using -minit-regs=<0|1|2|3>, meaning: - 0: disabled. - 1: add initialization of all regs at the entry bb - 2: add initialization of uninitialized regs at the entry bb - 3: add initialization of uninitialized regs close to the use and defaulting to 3. Tested on nvptx. gcc/ChangeLog: 2022-02-17 Tom de Vries PR target/104440 * config/nvptx/nvptx.cc (workaround_uninit_method_1) (workaround_uninit_method_2, workaround_uninit_method_3) (workaround_uninit): New function. (nvptx_reorg): Use workaround_uninit. * config/nvptx/nvptx.opt (minit-regs): New option. --- gcc/config/nvptx/nvptx.cc | 188 +++++++++++++++++++++++++++++++++++++++++++++ gcc/config/nvptx/nvptx.opt | 4 + 2 files changed, 192 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index ed347ca..a37a6c7 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -5372,6 +5372,190 @@ workaround_barsyncs (void) } #endif +/* Initialize all declared regs at function entry. + Advantage : Fool-proof. + Disadvantage: Potentially creates a lot of long live ranges and adds a lot + of insns. */ + +static void +workaround_uninit_method_1 (void) +{ + rtx_insn *first = get_insns (); + rtx_insn *insert_here = NULL; + + for (int ix = LAST_VIRTUAL_REGISTER + 1; ix < max_reg_num (); ix++) + { + rtx reg = regno_reg_rtx[ix]; + + /* Skip undeclared registers. */ + if (reg == const0_rtx) + continue; + + gcc_assert (CONST0_RTX (GET_MODE (reg))); + + start_sequence (); + emit_move_insn (reg, CONST0_RTX (GET_MODE (reg))); + rtx_insn *inits = get_insns (); + end_sequence (); + + if (dump_file && (dump_flags & TDF_DETAILS)) + for (rtx_insn *init = inits; init != NULL; init = NEXT_INSN (init)) + fprintf (dump_file, "Default init of reg %u inserted: insn %u\n", + ix, INSN_UID (init)); + + if (first != NULL) + { + insert_here = emit_insn_before (inits, first); + first = NULL; + } + else + insert_here = emit_insn_after (inits, insert_here); + } +} + +/* Find uses of regs that are not defined on all incoming paths, and insert a + corresponding def at function entry. + Advantage : Simple. + Disadvantage: Potentially creates long live ranges. + May not catch all cases. F.i. a clobber cuts a live range in + the compiler and may prevent entry_lr_in from being set for a + reg, but the clobber does not translate to a ptx insn, so in + ptx there still may be an uninitialized ptx reg. See f.i. + gcc.c-torture/compile/20020926-1.c. 
*/ + +static void +workaround_uninit_method_2 (void) +{ + auto_bitmap entry_pseudo_uninit; + { + auto_bitmap not_pseudo; + bitmap_set_range (not_pseudo, 0, LAST_VIRTUAL_REGISTER); + + bitmap entry_lr_in = DF_LR_IN (ENTRY_BLOCK_PTR_FOR_FN (cfun)); + bitmap_and_compl (entry_pseudo_uninit, entry_lr_in, not_pseudo); + } + + rtx_insn *first = get_insns (); + rtx_insn *insert_here = NULL; + + bitmap_iterator iterator; + unsigned ix; + EXECUTE_IF_SET_IN_BITMAP (entry_pseudo_uninit, 0, ix, iterator) + { + rtx reg = regno_reg_rtx[ix]; + gcc_assert (CONST0_RTX (GET_MODE (reg))); + + start_sequence (); + emit_move_insn (reg, CONST0_RTX (GET_MODE (reg))); + rtx_insn *inits = get_insns (); + end_sequence (); + + if (dump_file && (dump_flags & TDF_DETAILS)) + for (rtx_insn *init = inits; init != NULL; init = NEXT_INSN (init)) + fprintf (dump_file, "Missing init of reg %u inserted: insn %u\n", + ix, INSN_UID (init)); + + if (first != NULL) + { + insert_here = emit_insn_before (inits, first); + first = NULL; + } + else + insert_here = emit_insn_after (inits, insert_here); + } +} + +/* Find uses of regs that are not defined on all incoming paths, and insert a + corresponding def on those. + Advantage : Doesn't create long live ranges. + Disadvantage: More complex, and potentially also more defs. */ + +static void +workaround_uninit_method_3 (void) +{ + auto_bitmap not_pseudo; + bitmap_set_range (not_pseudo, 0, LAST_VIRTUAL_REGISTER); + + basic_block bb; + FOR_EACH_BB_FN (bb, cfun) + { + if (single_pred_p (bb)) + continue; + + auto_bitmap bb_pseudo_uninit; + bitmap_and_compl (bb_pseudo_uninit, DF_LIVE_IN (bb), DF_MIR_IN (bb)); + bitmap_and_compl_into (bb_pseudo_uninit, not_pseudo); + + bitmap_iterator iterator; + unsigned ix; + EXECUTE_IF_SET_IN_BITMAP (bb_pseudo_uninit, 0, ix, iterator) + { + bool have_false = false; + bool have_true = false; + + edge e; + edge_iterator ei; + FOR_EACH_EDGE (e, ei, bb->preds) + { + if (bitmap_bit_p (DF_LIVE_OUT (e->src), ix)) + have_true = true; + else + have_false = true; + } + if (have_false ^ have_true) + continue; + + FOR_EACH_EDGE (e, ei, bb->preds) + { + if (bitmap_bit_p (DF_LIVE_OUT (e->src), ix)) + continue; + + rtx reg = regno_reg_rtx[ix]; + gcc_assert (CONST0_RTX (GET_MODE (reg))); + + start_sequence (); + emit_move_insn (reg, CONST0_RTX (GET_MODE (reg))); + rtx_insn *inits = get_insns (); + end_sequence (); + + if (dump_file && (dump_flags & TDF_DETAILS)) + for (rtx_insn *init = inits; init != NULL; + init = NEXT_INSN (init)) + fprintf (dump_file, + "Missing init of reg %u inserted on edge: %d -> %d:" + " insn %u\n", ix, e->src->index, e->dest->index, + INSN_UID (init)); + + insert_insn_on_edge (inits, e); + } + } + } + + commit_edge_insertions (); +} + +static void +workaround_uninit (void) +{ + switch (nvptx_init_regs) + { + case 0: + /* Skip. 
*/ + break; + case 1: + workaround_uninit_method_1 (); + break; + case 2: + workaround_uninit_method_2 (); + break; + case 3: + workaround_uninit_method_3 (); + break; + default: + gcc_unreachable (); + } +} + /* PTX-specific reorganization - Split blocks at fork and join instructions - Compute live registers @@ -5401,6 +5585,8 @@ nvptx_reorg (void) df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS); df_live_add_problem (); df_live_set_all_dirty (); + if (nvptx_init_regs == 3) + df_mir_add_problem (); df_analyze (); regstat_init_n_sets_and_refs (); @@ -5413,6 +5599,8 @@ nvptx_reorg (void) if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0) regno_reg_rtx[i] = const0_rtx; + workaround_uninit (); + /* Determine launch dimensions of the function. If it is not an offloaded function (i.e. this is a regular compiler), the function has no neutering. */ diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt index e3f65b2..0858007 100644 --- a/gcc/config/nvptx/nvptx.opt +++ b/gcc/config/nvptx/nvptx.opt @@ -91,3 +91,7 @@ Enum(ptx_version) String(7.0) Value(PTX_VERSION_7_0) mptx= Target RejectNegative ToLower Joined Enum(ptx_version) Var(ptx_version_option) Specify the version of the ptx version to use. + +minit-regs= +Target Var(nvptx_init_regs) IntegerRange(0, 3) Joined UInteger Init(3) +Initialize ptx registers. -- cgit v1.1 From ce09ab17ddd21f73ff2caf6eec3b0ee9b0e1a11e Mon Sep 17 00:00:00 2001 From: Dan Li Date: Mon, 21 Feb 2022 20:01:14 +0000 Subject: aarch64: Add compiler support for Shadow Call Stack Shadow Call Stack can be used to protect the return address of a function at runtime, and clang already supports this feature[1]. To enable SCS in user mode, in addition to compiler, other support is also required (as discussed in [2]). This patch only adds basic support for SCS from the compiler side, and provides convenience for users to enable SCS. For linux kernel, only the support of the compiler is required. [1] https://clang.llvm.org/docs/ShadowCallStack.html [2] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102768 Signed-off-by: Dan Li gcc/ChangeLog: * config/aarch64/aarch64.cc (SLOT_REQUIRED): Change wb_candidate[12] to wb_push_candidate[12]. (aarch64_layout_frame): Likewise, and change callee_adjust when scs is enabled. (aarch64_save_callee_saves): Change wb_candidate[12] to wb_push_candidate[12]. (aarch64_restore_callee_saves): Change wb_candidate[12] to wb_pop_candidate[12]. (aarch64_get_separate_components): Change wb_candidate[12] to wb_push_candidate[12]. (aarch64_expand_prologue): Push x30 onto SCS before it's pushed onto stack. (aarch64_expand_epilogue): Pop x30 frome SCS, while preventing it from being popped from the regular stack again. (aarch64_override_options_internal): Add SCS compile option check. (TARGET_HAVE_SHADOW_CALL_STACK): New hook. * config/aarch64/aarch64.h (struct GTY): Add is_scs_enabled, wb_pop_candidate[12], and rename wb_candidate[12] to wb_push_candidate[12]. * config/aarch64/aarch64.md (scs_push): New template. (scs_pop): Likewise. * doc/invoke.texi: Document -fsanitize=shadow-call-stack. * doc/tm.texi: Regenerate. * doc/tm.texi.in: Add hook have_shadow_call_stack. * flag-types.h (enum sanitize_code): Add SANITIZE_SHADOW_CALL_STACK. * opts.cc (parse_sanitizer_options): Add shadow-call-stack and exclude SANITIZE_SHADOW_CALL_STACK. * target.def: New hook. * toplev.cc (process_options): Add SCS compile option check. * ubsan.cc (ubsan_expand_null_ifn): Enum type conversion. gcc/testsuite/ChangeLog: * gcc.target/aarch64/shadow_call_stack_1.c: New test. 
* gcc.target/aarch64/shadow_call_stack_2.c: New test. * gcc.target/aarch64/shadow_call_stack_3.c: New test. * gcc.target/aarch64/shadow_call_stack_4.c: New test. * gcc.target/aarch64/shadow_call_stack_5.c: New test. * gcc.target/aarch64/shadow_call_stack_6.c: New test. * gcc.target/aarch64/shadow_call_stack_7.c: New test. * gcc.target/aarch64/shadow_call_stack_8.c: New test. --- gcc/config/aarch64/aarch64.cc | 113 +++++++++++++++++++++++++++++++----------- gcc/config/aarch64/aarch64.h | 21 ++++++-- gcc/config/aarch64/aarch64.md | 10 ++++ 3 files changed, 113 insertions(+), 31 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 37ed22bc..8bcee8b 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -80,6 +80,7 @@ #include "fractional-cost.h" #include "rtlanal.h" #include "tree-dfa.h" +#include "asan.h" /* This file should be included last. */ #include "target-def.h" @@ -7547,8 +7548,8 @@ aarch64_layout_frame (void) #define SLOT_NOT_REQUIRED (-2) #define SLOT_REQUIRED (-1) - frame.wb_candidate1 = INVALID_REGNUM; - frame.wb_candidate2 = INVALID_REGNUM; + frame.wb_push_candidate1 = INVALID_REGNUM; + frame.wb_push_candidate2 = INVALID_REGNUM; frame.spare_pred_reg = INVALID_REGNUM; /* First mark all the registers that really need to be saved... */ @@ -7663,9 +7664,9 @@ aarch64_layout_frame (void) { /* FP and LR are placed in the linkage record. */ frame.reg_offset[R29_REGNUM] = offset; - frame.wb_candidate1 = R29_REGNUM; + frame.wb_push_candidate1 = R29_REGNUM; frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD; - frame.wb_candidate2 = R30_REGNUM; + frame.wb_push_candidate2 = R30_REGNUM; offset += 2 * UNITS_PER_WORD; } @@ -7673,10 +7674,10 @@ aarch64_layout_frame (void) if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED)) { frame.reg_offset[regno] = offset; - if (frame.wb_candidate1 == INVALID_REGNUM) - frame.wb_candidate1 = regno; - else if (frame.wb_candidate2 == INVALID_REGNUM) - frame.wb_candidate2 = regno; + if (frame.wb_push_candidate1 == INVALID_REGNUM) + frame.wb_push_candidate1 = regno; + else if (frame.wb_push_candidate2 == INVALID_REGNUM) + frame.wb_push_candidate2 = regno; offset += UNITS_PER_WORD; } @@ -7699,11 +7700,11 @@ aarch64_layout_frame (void) } frame.reg_offset[regno] = offset; - if (frame.wb_candidate1 == INVALID_REGNUM) - frame.wb_candidate1 = regno; - else if (frame.wb_candidate2 == INVALID_REGNUM - && frame.wb_candidate1 >= V0_REGNUM) - frame.wb_candidate2 = regno; + if (frame.wb_push_candidate1 == INVALID_REGNUM) + frame.wb_push_candidate1 = regno; + else if (frame.wb_push_candidate2 == INVALID_REGNUM + && frame.wb_push_candidate1 >= V0_REGNUM) + frame.wb_push_candidate2 = regno; offset += vector_save_size; } @@ -7734,10 +7735,38 @@ aarch64_layout_frame (void) frame.sve_callee_adjust = 0; frame.callee_offset = 0; + frame.wb_pop_candidate1 = frame.wb_push_candidate1; + frame.wb_pop_candidate2 = frame.wb_push_candidate2; + + /* Shadow call stack only deals with functions where the LR is pushed + onto the stack and without specifying the "no_sanitize" attribute + with the argument "shadow-call-stack". */ + frame.is_scs_enabled + = (!crtl->calls_eh_return + && sanitize_flags_p (SANITIZE_SHADOW_CALL_STACK) + && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)); + + /* When shadow call stack is enabled, the scs_pop in the epilogue will + restore x30, and we don't need to pop x30 again in the traditional + way. 
Pop candidates record the registers that need to be popped + eventually. */ + if (frame.is_scs_enabled) + { + if (frame.wb_pop_candidate2 == R30_REGNUM) + frame.wb_pop_candidate2 = INVALID_REGNUM; + else if (frame.wb_pop_candidate1 == R30_REGNUM) + frame.wb_pop_candidate1 = INVALID_REGNUM; + } + + /* If candidate2 is INVALID_REGNUM, we need to adjust max_push_offset to + 256 to ensure that the offset meets the requirements of emit_move_insn. + Similarly, if candidate1 is INVALID_REGNUM, we need to set + max_push_offset to 0, because no registers are popped at this time, + so callee_adjust cannot be adjusted. */ HOST_WIDE_INT max_push_offset = 0; - if (frame.wb_candidate2 != INVALID_REGNUM) + if (frame.wb_pop_candidate2 != INVALID_REGNUM) max_push_offset = 512; - else if (frame.wb_candidate1 != INVALID_REGNUM) + else if (frame.wb_pop_candidate1 != INVALID_REGNUM) max_push_offset = 256; HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset; @@ -7827,8 +7856,8 @@ aarch64_layout_frame (void) { /* We've decided not to associate any register saves with the initial stack allocation. */ - frame.wb_candidate1 = INVALID_REGNUM; - frame.wb_candidate2 = INVALID_REGNUM; + frame.wb_pop_candidate1 = frame.wb_push_candidate1 = INVALID_REGNUM; + frame.wb_pop_candidate2 = frame.wb_push_candidate2 = INVALID_REGNUM; } frame.laid_out = true; @@ -8141,8 +8170,8 @@ aarch64_save_callee_saves (poly_int64 start_offset, bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno); if (skip_wb - && (regno == cfun->machine->frame.wb_candidate1 - || regno == cfun->machine->frame.wb_candidate2)) + && (regno == cfun->machine->frame.wb_push_candidate1 + || regno == cfun->machine->frame.wb_push_candidate2)) continue; if (cfun->machine->reg_is_wrapped_separately[regno]) @@ -8252,8 +8281,8 @@ aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start, rtx reg, mem; if (skip_wb - && (regno == cfun->machine->frame.wb_candidate1 - || regno == cfun->machine->frame.wb_candidate2)) + && (regno == cfun->machine->frame.wb_pop_candidate1 + || regno == cfun->machine->frame.wb_pop_candidate2)) continue; machine_mode mode = aarch64_reg_save_mode (regno); @@ -8424,8 +8453,8 @@ aarch64_get_separate_components (void) if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM) bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg); - unsigned reg1 = cfun->machine->frame.wb_candidate1; - unsigned reg2 = cfun->machine->frame.wb_candidate2; + unsigned reg1 = cfun->machine->frame.wb_push_candidate1; + unsigned reg2 = cfun->machine->frame.wb_push_candidate2; /* If registers have been chosen to be stored/restored with writeback don't interfere with them to avoid having to output explicit stack adjustment instructions. */ @@ -9034,8 +9063,8 @@ aarch64_expand_prologue (void) poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust; poly_int64 below_hard_fp_saved_regs_size = cfun->machine->frame.below_hard_fp_saved_regs_size; - unsigned reg1 = cfun->machine->frame.wb_candidate1; - unsigned reg2 = cfun->machine->frame.wb_candidate2; + unsigned reg1 = cfun->machine->frame.wb_push_candidate1; + unsigned reg2 = cfun->machine->frame.wb_push_candidate2; bool emit_frame_chain = cfun->machine->frame.emit_frame_chain; rtx_insn *insn; @@ -9066,6 +9095,10 @@ aarch64_expand_prologue (void) RTX_FRAME_RELATED_P (insn) = 1; } + /* Push return address to shadow call stack. 
*/ + if (cfun->machine->frame.is_scs_enabled) + emit_insn (gen_scs_push ()); + if (flag_stack_usage_info) current_function_static_stack_size = constant_lower_bound (frame_size); @@ -9212,8 +9245,10 @@ aarch64_expand_epilogue (bool for_sibcall) poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust; poly_int64 below_hard_fp_saved_regs_size = cfun->machine->frame.below_hard_fp_saved_regs_size; - unsigned reg1 = cfun->machine->frame.wb_candidate1; - unsigned reg2 = cfun->machine->frame.wb_candidate2; + unsigned reg1 = cfun->machine->frame.wb_pop_candidate1; + unsigned reg2 = cfun->machine->frame.wb_pop_candidate2; + unsigned int last_gpr = (cfun->machine->frame.is_scs_enabled + ? R29_REGNUM : R30_REGNUM); rtx cfi_ops = NULL; rtx_insn *insn; /* A stack clash protection prologue may not have left EP0_REGNUM or @@ -9283,8 +9318,12 @@ aarch64_expand_epilogue (bool for_sibcall) false, &cfi_ops); if (maybe_ne (sve_callee_adjust, 0)) aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true); + + /* When shadow call stack is enabled, the scs_pop in the epilogue will + restore x30, we don't need to restore x30 again in the traditional + way. */ aarch64_restore_callee_saves (callee_offset - sve_callee_adjust, - R0_REGNUM, R30_REGNUM, + R0_REGNUM, last_gpr, callee_adjust != 0, &cfi_ops); if (need_barrier_p) @@ -9322,6 +9361,17 @@ aarch64_expand_epilogue (bool for_sibcall) RTX_FRAME_RELATED_P (insn) = 1; } + /* Pop return address from shadow call stack. */ + if (cfun->machine->frame.is_scs_enabled) + { + machine_mode mode = aarch64_reg_save_mode (R30_REGNUM); + rtx reg = gen_rtx_REG (mode, R30_REGNUM); + + insn = emit_insn (gen_scs_pop ()); + add_reg_note (insn, REG_CFA_RESTORE, reg); + RTX_FRAME_RELATED_P (insn) = 1; + } + /* We prefer to emit the combined return/authenticate instruction RETAA, however there are three cases in which we must instead emit an explicit authentication instruction. @@ -16878,6 +16928,10 @@ aarch64_override_options_internal (struct gcc_options *opts) aarch64_stack_protector_guard_offset = offs; } + if ((flag_sanitize & SANITIZE_SHADOW_CALL_STACK) + && !fixed_regs[R18_REGNUM]) + error ("%<-fsanitize=shadow-call-stack%> requires %<-ffixed-x18%>"); + initialize_aarch64_code_model (opts); initialize_aarch64_tls_size (opts); @@ -27084,6 +27138,9 @@ aarch64_libgcc_floating_mode_supported_p #undef TARGET_ASM_FUNCTION_EPILOGUE #define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks +#undef TARGET_HAVE_SHADOW_CALL_STACK +#define TARGET_HAVE_SHADOW_CALL_STACK true + struct gcc_target targetm = TARGET_INITIALIZER; #include "gt-aarch64.h" diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h index dddf133..27ba4f4 100644 --- a/gcc/config/aarch64/aarch64.h +++ b/gcc/config/aarch64/aarch64.h @@ -922,9 +922,21 @@ struct GTY (()) aarch64_frame Indicated by CALLEE_ADJUST == 0 && EMIT_FRAME_CHAIN. These fields indicate which registers we've decided to handle using - (1) or (2), or INVALID_REGNUM if none. */ - unsigned wb_candidate1; - unsigned wb_candidate2; + (1) or (2), or INVALID_REGNUM if none. + + In some cases we don't always need to pop all registers in the push + candidates, pop candidates record which registers need to be popped + eventually. The initial value of a pop candidate is copied from its + corresponding push candidate. + + Currently, different pop candidates are only used for shadow call + stack. 
When "-fsanitize=shadow-call-stack" is specified, we replace + x30 in the pop candidate with INVALID_REGNUM to ensure that x30 is + not popped twice. */ + unsigned wb_push_candidate1; + unsigned wb_push_candidate2; + unsigned wb_pop_candidate1; + unsigned wb_pop_candidate2; /* Big-endian SVE frames need a spare predicate register in order to save vector registers in the correct layout for unwinding. @@ -932,6 +944,9 @@ struct GTY (()) aarch64_frame unsigned spare_pred_reg; bool laid_out; + + /* True if shadow call stack should be enabled for the current function. */ + bool is_scs_enabled; }; typedef struct GTY (()) machine_function diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 5909184..c985250 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -7093,6 +7093,16 @@ "hint\t7 // xpaclri" ) +;; Save X30 in the X18-based POST_INC stack (consistent with clang). +(define_expand "scs_push" + [(set (mem:DI (post_inc:DI (reg:DI R18_REGNUM))) + (reg:DI R30_REGNUM))]) + +;; Load X30 form the X18-based PRE_DEC stack (consistent with clang). +(define_expand "scs_pop" + [(set (reg:DI R30_REGNUM) + (mem:DI (pre_dec:DI (reg:DI R18_REGNUM))))]) + ;; UNSPEC_VOLATILE is considered to use and clobber all hard registers and ;; all of memory. This blocks insns from being moved across this point. -- cgit v1.1 From 0435b978f95971e139882549f5a1765c50682216 Mon Sep 17 00:00:00 2001 From: Hongyu Wang Date: Fri, 11 Feb 2022 14:44:15 +0800 Subject: i386: Relax cmpxchg instruction under -mrelax-cmpxchg-loop [PR103069] For cmpxchg, it is commonly used in spin loop, and several user code such as pthread directly takes cmpxchg as loop condition, which cause huge cache bouncing. This patch extends previous implementation to relax all cmpxchg instruction under -mrelax-cmpxchg-loop with an extra atomic load, compare and emulate the failed cmpxchg behavior. For original spin loop which looks like loop: mov %eax,%r8d or $1,%r8d lock cmpxchg %r8d,(%rdi) jne loop It will now truns to loop: mov %eax,%r8d or $1,%r8d mov (%r8),%rsi <--- load lock first cmp %rsi,%rax <--- compare with expected input jne .L2 <--- lock ne expected lock cmpxchg %r8d,(%rdi) jne loop L2: mov %rsi,%rax <--- perform the behavior of failed cmpxchg jne loop under -mrelax-cmpxchg-loop. gcc/ChangeLog: PR target/103069 * config/i386/i386-expand.cc (ix86_expand_atomic_fetch_op_loop): Split atomic fetch and loop part. (ix86_expand_cmpxchg_loop): New expander for cmpxchg loop. * config/i386/i386-protos.h (ix86_expand_cmpxchg_loop): New prototype. * config/i386/sync.md (atomic_compare_and_swap): Call new expander under TARGET_RELAX_CMPXCHG_LOOP. (atomic_compare_and_swap): Likewise for doubleword modes. gcc/testsuite/ChangeLog: PR target/103069 * gcc.target/i386/pr103069-2.c: Adjust result check. * gcc.target/i386/pr103069-3.c: New test. * gcc.target/i386/pr103069-4.c: Likewise. 
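For reference, the spin loops this change targets are ordinary compare-and-swap retry loops at the source level. The sketch below is an illustrative example only, not taken from the patch or the PR103069 testcases, and the names lock_word/spin_lock_or are made up; it uses GCC's __atomic_compare_exchange_n builtin, which expands through the atomic_compare_and_swap patterns adjusted above, so with -mrelax-cmpxchg-loop the emitted loop gets the extra atomic load and compare in front of the lock cmpxchg.

/* Illustrative sketch only (hypothetical names); compile for x86 with
   -mrelax-cmpxchg-loop to see the relaxed sequence.  */
static int lock_word;

void
spin_lock_or (int mask)
{
  int expected = __atomic_load_n (&lock_word, __ATOMIC_RELAXED);
  for (;;)
    {
      int desired = expected | mask;
      /* Expands to a cmpxchg loop; the relaxed form re-reads and
         compares the memory word first and skips lock cmpxchg while
         it cannot succeed.  */
      if (__atomic_compare_exchange_n (&lock_word, &expected, desired,
                                       0 /* strong */, __ATOMIC_SEQ_CST,
                                       __ATOMIC_RELAXED))
        break;
      /* On failure, expected now holds the value observed in memory.  */
    }
}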
--- gcc/config/i386/i386-expand.cc | 153 ++++++++++++++++++++++++++++++----------- gcc/config/i386/i386-protos.h | 2 + gcc/config/i386/sync.md | 65 ++++++++++------- 3 files changed, 157 insertions(+), 63 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index ce9607e..6cf1a0b 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -23203,16 +23203,14 @@ void ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val, enum rtx_code code, bool after, bool doubleword) { - rtx old_reg, new_reg, old_mem, success, oldval, new_mem; - rtx_code_label *loop_label, *pause_label, *done_label; + rtx old_reg, new_reg, old_mem, success; machine_mode mode = GET_MODE (target); + rtx_code_label *loop_label = NULL; old_reg = gen_reg_rtx (mode); new_reg = old_reg; - loop_label = gen_label_rtx (); - pause_label = gen_label_rtx (); - done_label = gen_label_rtx (); old_mem = copy_to_reg (mem); + loop_label = gen_label_rtx (); emit_label (loop_label); emit_move_insn (old_reg, old_mem); @@ -23234,50 +23232,125 @@ void ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val, if (after) emit_move_insn (target, new_reg); - /* Load memory again inside loop. */ - new_mem = copy_to_reg (mem); - /* Compare mem value with expected value. */ + success = NULL_RTX; + + ix86_expand_cmpxchg_loop (&success, old_mem, mem, old_reg, new_reg, + gen_int_mode (MEMMODEL_SYNC_SEQ_CST, + SImode), + doubleword, loop_label); +} + +/* Relax cmpxchg instruction, param loop_label indicates whether + the instruction should be relaxed with a pause loop. If not, + it will be relaxed to an atomic load + compare, and skip + cmpxchg instruction if mem != exp_input. */ + +void ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val, + rtx mem, rtx exp_input, rtx new_input, + rtx mem_model, bool doubleword, + rtx_code_label *loop_label) +{ + rtx_code_label *cmp_label = NULL; + rtx_code_label *done_label = NULL; + rtx target_bool = NULL_RTX, new_mem = NULL_RTX; + rtx (*gen) (rtx, rtx, rtx, rtx, rtx) = NULL; + rtx (*gendw) (rtx, rtx, rtx, rtx, rtx, rtx) = NULL; + machine_mode mode = GET_MODE (target_val), hmode = mode; + + if (*ptarget_bool == NULL) + target_bool = gen_reg_rtx (QImode); + else + target_bool = *ptarget_bool; + + cmp_label = gen_label_rtx (); + done_label = gen_label_rtx (); + + new_mem = gen_reg_rtx (mode); + /* Load memory first. */ + expand_atomic_load (new_mem, mem, MEMMODEL_SEQ_CST); + + switch (mode) + { + case TImode: + gendw = gen_atomic_compare_and_swapti_doubleword; + hmode = DImode; + break; + case DImode: + if (doubleword) + { + gendw = gen_atomic_compare_and_swapdi_doubleword; + hmode = SImode; + } + else + gen = gen_atomic_compare_and_swapdi_1; + break; + case SImode: + gen = gen_atomic_compare_and_swapsi_1; break; + case HImode: + gen = gen_atomic_compare_and_swaphi_1; break; + case QImode: + gen = gen_atomic_compare_and_swapqi_1; break; + default: + gcc_unreachable (); + } + /* Compare mem value with expected value. */ if (doubleword) { - machine_mode half_mode = (mode == DImode)? 
SImode : DImode; - rtx low_new_mem = gen_lowpart (half_mode, new_mem); - rtx low_old_mem = gen_lowpart (half_mode, old_mem); - rtx high_new_mem = gen_highpart (half_mode, new_mem); - rtx high_old_mem = gen_highpart (half_mode, old_mem); - emit_cmp_and_jump_insns (low_new_mem, low_old_mem, NE, NULL_RTX, - half_mode, 1, pause_label, + rtx low_new_mem = gen_lowpart (hmode, new_mem); + rtx low_exp_input = gen_lowpart (hmode, exp_input); + rtx high_new_mem = gen_highpart (hmode, new_mem); + rtx high_exp_input = gen_highpart (hmode, exp_input); + emit_cmp_and_jump_insns (low_new_mem, low_exp_input, NE, NULL_RTX, + hmode, 1, cmp_label, profile_probability::guessed_never ()); - emit_cmp_and_jump_insns (high_new_mem, high_old_mem, NE, NULL_RTX, - half_mode, 1, pause_label, + emit_cmp_and_jump_insns (high_new_mem, high_exp_input, NE, NULL_RTX, + hmode, 1, cmp_label, profile_probability::guessed_never ()); } else - emit_cmp_and_jump_insns (new_mem, old_mem, NE, NULL_RTX, - GET_MODE (old_mem), 1, pause_label, + emit_cmp_and_jump_insns (new_mem, exp_input, NE, NULL_RTX, + GET_MODE (exp_input), 1, cmp_label, profile_probability::guessed_never ()); - success = NULL_RTX; - oldval = old_mem; - expand_atomic_compare_and_swap (&success, &oldval, mem, old_reg, - new_reg, false, MEMMODEL_SYNC_SEQ_CST, - MEMMODEL_RELAXED); - if (oldval != old_mem) - emit_move_insn (old_mem, oldval); - - emit_cmp_and_jump_insns (success, const0_rtx, EQ, const0_rtx, - GET_MODE (success), 1, loop_label, - profile_probability::guessed_never ()); - - emit_jump_insn (gen_jump (done_label)); - emit_barrier (); - - /* If mem is not expected, pause and loop back. */ - emit_label (pause_label); - emit_insn (gen_pause ()); - emit_jump_insn (gen_jump (loop_label)); - emit_barrier (); - emit_label (done_label); + /* Directly emits cmpxchg here. */ + if (doubleword) + emit_insn (gendw (target_val, mem, exp_input, + gen_lowpart (hmode, new_input), + gen_highpart (hmode, new_input), + mem_model)); + else + emit_insn (gen (target_val, mem, exp_input, new_input, mem_model)); + + if (!loop_label) + { + emit_jump_insn (gen_jump (done_label)); + emit_barrier (); + emit_label (cmp_label); + emit_move_insn (target_val, new_mem); + emit_label (done_label); + ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG), + const0_rtx); + } + else + { + ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG), + const0_rtx); + emit_cmp_and_jump_insns (target_bool, const0_rtx, EQ, const0_rtx, + GET_MODE (target_bool), 1, loop_label, + profile_probability::guessed_never ()); + emit_jump_insn (gen_jump (done_label)); + emit_barrier (); + + /* If mem is not expected, pause and loop back. 
*/ + emit_label (cmp_label); + emit_insn (gen_pause ()); + emit_jump_insn (gen_jump (loop_label)); + emit_barrier (); + emit_label (done_label); + } + + *ptarget_bool = target_bool; } #include "gt-i386-expand.h" diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index b7e9aa7..d5e1125 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -221,6 +221,8 @@ extern void ix86_split_mmx_punpck (rtx[], bool); extern void ix86_expand_avx_vzeroupper (void); extern void ix86_expand_atomic_fetch_op_loop (rtx, rtx, rtx, enum rtx_code, bool, bool); +extern void ix86_expand_cmpxchg_loop (rtx *, rtx, rtx, rtx, rtx, rtx, + bool, rtx_code_label *); #ifdef TREE_CODE extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int); diff --git a/gcc/config/i386/sync.md b/gcc/config/i386/sync.md index 36417c5..820e9ca 100644 --- a/gcc/config/i386/sync.md +++ b/gcc/config/i386/sync.md @@ -373,11 +373,20 @@ (match_operand:SI 7 "const_int_operand")] ;; failure model "TARGET_CMPXCHG" { - emit_insn - (gen_atomic_compare_and_swap_1 - (operands[1], operands[2], operands[3], operands[4], operands[6])); - ix86_expand_setcc (operands[0], EQ, gen_rtx_REG (CCZmode, FLAGS_REG), - const0_rtx); + if (TARGET_RELAX_CMPXCHG_LOOP) + { + ix86_expand_cmpxchg_loop (&operands[0], operands[1], operands[2], + operands[3], operands[4], operands[6], + false, NULL); + } + else + { + emit_insn + (gen_atomic_compare_and_swap_1 + (operands[1], operands[2], operands[3], operands[4], operands[6])); + ix86_expand_setcc (operands[0], EQ, gen_rtx_REG (CCZmode, FLAGS_REG), + const0_rtx); + } DONE; }) @@ -397,25 +406,35 @@ (match_operand:SI 7 "const_int_operand")] ;; failure model "TARGET_CMPXCHG" { - if (mode == DImode && TARGET_64BIT) - { - emit_insn - (gen_atomic_compare_and_swapdi_1 - (operands[1], operands[2], operands[3], operands[4], operands[6])); - } + int doubleword = !(mode == DImode && TARGET_64BIT); + if (TARGET_RELAX_CMPXCHG_LOOP) + { + ix86_expand_cmpxchg_loop (&operands[0], operands[1], operands[2], + operands[3], operands[4], operands[6], + doubleword, NULL); + } else - { - machine_mode hmode = mode; - - emit_insn - (gen_atomic_compare_and_swap_doubleword - (operands[1], operands[2], operands[3], - gen_lowpart (hmode, operands[4]), gen_highpart (hmode, operands[4]), - operands[6])); - } - - ix86_expand_setcc (operands[0], EQ, gen_rtx_REG (CCZmode, FLAGS_REG), - const0_rtx); + { + if (!doubleword) + { + emit_insn + (gen_atomic_compare_and_swapdi_1 + (operands[1], operands[2], operands[3], operands[4], operands[6])); + } + else + { + machine_mode hmode = mode; + + emit_insn + (gen_atomic_compare_and_swap_doubleword + (operands[1], operands[2], operands[3], + gen_lowpart (hmode, operands[4]), gen_highpart (hmode, operands[4]), + operands[6])); + } + + ix86_expand_setcc (operands[0], EQ, gen_rtx_REG (CCZmode, FLAGS_REG), + const0_rtx); + } DONE; }) -- cgit v1.1 From f24dfc76177b3994434c8beb287cde1a9976b5ce Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Fri, 18 Feb 2022 11:50:44 +0100 Subject: tree-optimization/104582 - make SLP node available in vector cost hook This adjusts the vectorizer costing API to allow passing down the SLP node the vector stmt is created from. 2022-02-18 Richard Biener PR tree-optimization/104582 * tree-vectorizer.h (stmt_info_for_cost::node): New field. (vector_costs::add_stmt_cost): Add SLP node parameter. (dump_stmt_cost): Likewise. (add_stmt_cost): Likewise, new overload and adjust. (add_stmt_costs): Adjust. 
(record_stmt_cost): New overload. * tree-vectorizer.cc (dump_stmt_cost): Dump the SLP node. (vector_costs::add_stmt_cost): Adjust. * tree-vect-loop.cc (vect_estimate_min_profitable_iters): Adjust. * tree-vect-slp.cc (vect_prologue_cost_for_slp): Record the SLP node for costing. (vectorizable_slp_permutation): Likewise. * tree-vect-stmts.cc (record_stmt_cost): Adjust and add new overloads. * config/i386/i386.cc (ix86_vector_costs::add_stmt_cost): Adjust. * config/aarch64/aarch64.cc (aarch64_vector_costs::add_stmt_cost): Adjust. * config/rs6000/rs6000.cc (rs6000_vector_costs::add_stmt_cost): Adjust. (rs6000_cost_data::adjust_vect_cost_per_loop): Likewise. --- gcc/config/aarch64/aarch64.cc | 6 +++--- gcc/config/i386/i386.cc | 9 +++++---- gcc/config/rs6000/rs6000.cc | 10 ++++++---- 3 files changed, 14 insertions(+), 11 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 8bcee8b..dbeaaf4 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -15058,7 +15058,7 @@ public: aarch64_vector_costs (vec_info *, bool); unsigned int add_stmt_cost (int count, vect_cost_for_stmt kind, - stmt_vec_info stmt_info, tree vectype, + stmt_vec_info stmt_info, slp_tree, tree vectype, int misalign, vect_cost_model_location where) override; void finish_cost (const vector_costs *) override; @@ -16003,8 +16003,8 @@ aarch64_stp_sequence_cost (unsigned int count, vect_cost_for_stmt kind, unsigned aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, - stmt_vec_info stmt_info, tree vectype, - int misalign, + stmt_vec_info stmt_info, slp_tree, + tree vectype, int misalign, vect_cost_model_location where) { fractional_cost stmt_cost diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index e4b42fb..0830dbd 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -22982,8 +22982,8 @@ class ix86_vector_costs : public vector_costs using vector_costs::vector_costs; unsigned int add_stmt_cost (int count, vect_cost_for_stmt kind, - stmt_vec_info stmt_info, tree vectype, - int misalign, + stmt_vec_info stmt_info, slp_tree node, + tree vectype, int misalign, vect_cost_model_location where) override; }; @@ -22997,8 +22997,9 @@ ix86_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar) unsigned ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, - stmt_vec_info stmt_info, tree vectype, - int misalign, vect_cost_model_location where) + stmt_vec_info stmt_info, slp_tree, + tree vectype, int misalign, + vect_cost_model_location where) { unsigned retval = 0; bool scalar_p diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index d7a7cfe..ca9e7b8 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -5212,7 +5212,7 @@ public: using vector_costs::vector_costs; unsigned int add_stmt_cost (int count, vect_cost_for_stmt kind, - stmt_vec_info stmt_info, tree vectype, + stmt_vec_info stmt_info, slp_tree, tree vectype, int misalign, vect_cost_model_location where) override; void finish_cost (const vector_costs *) override; @@ -5428,8 +5428,9 @@ rs6000_cost_data::update_target_cost_per_stmt (vect_cost_for_stmt kind, unsigned rs6000_cost_data::add_stmt_cost (int count, vect_cost_for_stmt kind, - stmt_vec_info stmt_info, tree vectype, - int misalign, vect_cost_model_location where) + stmt_vec_info stmt_info, slp_tree, + tree vectype, int misalign, + vect_cost_model_location where) { unsigned retval = 0; @@ -5470,7 +5471,8 @@ 
rs6000_cost_data::adjust_vect_cost_per_loop (loop_vec_info loop_vinfo) /* Each length needs one shift to fill into bits 0-7. */ shift_cnt += num_vectors_m1 + 1; - add_stmt_cost (shift_cnt, scalar_stmt, NULL, NULL_TREE, 0, vect_body); + add_stmt_cost (shift_cnt, scalar_stmt, NULL, NULL, + NULL_TREE, 0, vect_body); } } -- cgit v1.1 From 90d693bdc9d71841f51d68826ffa5bd685d7f0bc Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Fri, 18 Feb 2022 14:32:14 +0100 Subject: target/99881 - x86 vector cost of CTOR from integer regs This uses the now passed SLP node to the vectorizer costing hook to adjust vector construction costs for the cost of moving an integer component from a GPR to a vector register when that's required for building a vector from components. A crucial difference here is whether the component is loaded from memory or extracted from a vector register, as in those cases no intermediate GPR is involved. The pr99881.c testcase can be un-XFAILed with this patch; the pr91446.c testcase now produces scalar code which looks superior to me, so I've adjusted it as well. 2022-02-18 Richard Biener PR tree-optimization/104582 PR target/99881 * config/i386/i386.cc (ix86_vector_costs::add_stmt_cost): Cost GPR to vector register moves for integer vector construction. * gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-1.c: New. * gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-2.c: Likewise. * gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-3.c: Likewise. * gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-4.c: Likewise. * gcc.target/i386/pr99881.c: Un-XFAIL. * gcc.target/i386/pr91446.c: Adjust to not expect vectorization. --- gcc/config/i386/i386.cc | 45 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 0830dbd..b2bf905 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -22997,7 +22997,7 @@ ix86_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar) unsigned ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, - stmt_vec_info stmt_info, slp_tree, + stmt_vec_info stmt_info, slp_tree node, tree vectype, int misalign, vect_cost_model_location where) { @@ -23160,6 +23160,49 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); stmt_cost *= (TYPE_VECTOR_SUBPARTS (vectype) + 1); } + else if (kind == vec_construct + && node + && SLP_TREE_DEF_TYPE (node) == vect_external_def + && INTEGRAL_TYPE_P (TREE_TYPE (vectype))) + { + stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); + unsigned i; + tree op; + FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op) + if (TREE_CODE (op) == SSA_NAME) + TREE_VISITED (op) = 0; + FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op) + { + if (TREE_CODE (op) != SSA_NAME + || TREE_VISITED (op)) + continue; + TREE_VISITED (op) = 1; + gimple *def = SSA_NAME_DEF_STMT (op); + tree tem; + if (is_gimple_assign (def) + && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)) + && ((tem = gimple_assign_rhs1 (def)), true) + && TREE_CODE (tem) == SSA_NAME + /* A sign-change expands to nothing. */ + && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (def)), + TREE_TYPE (tem))) + def = SSA_NAME_DEF_STMT (tem); + /* When the component is loaded from memory we can directly + move it to a vector register, otherwise we have to go + via a GPR or via vpinsr which involves similar cost.
+ Likewise with a BIT_FIELD_REF extracting from a vector + register we can hope to avoid using a GPR. */ + if (!is_gimple_assign (def) + || (!gimple_assign_load_p (def) + && (gimple_assign_rhs_code (def) != BIT_FIELD_REF + || !VECTOR_TYPE_P (TREE_TYPE + (TREE_OPERAND (gimple_assign_rhs1 (def), 0)))))) + stmt_cost += ix86_cost->sse_to_integer; + } + FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op) + if (TREE_CODE (op) == SSA_NAME) + TREE_VISITED (op) = 0; + } if (stmt_cost == -1) stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); -- cgit v1.1 From 7e691189ca9c04fdba71ceada1faba62afbc1463 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Tue, 22 Feb 2022 10:38:37 +0100 Subject: i386: Fix up copysign/xorsign expansion [PR104612] We ICE on the following testcase for -m32 since r12-3435, because operands[2] is (subreg:SF (reg:DI ...) 0) and lowpart_subreg (V4SFmode, operands[2], SFmode) returns NULL, and that is what we use in AND etc. insns we emit. My earlier version of the patch fixes that by calling force_reg for the input operands, to make sure they are really REGs and so lowpart_subreg will succeed on them - even for theoretical MEMs, using REGs there seems desirable, as we don't want to read following memory slots for the paradoxical subreg. For the outputs, I thought we'd get better code by always computing the result into a new pseudo and then moving the lowpart of that pseudo into dest. Unfortunately it regressed FAIL: gcc.target/i386/pr89984-2.c scan-assembler-not vmovaps, on which the patch changes: vandps .LC0(%rip), %xmm1, %xmm1 - vxorps %xmm0, %xmm1, %xmm0 + vxorps %xmm0, %xmm1, %xmm1 + vmovaps %xmm1, %xmm0 ret The RA sees: (insn 8 4 9 2 (set (reg:V4SF 85) (and:V4SF (subreg:V4SF (reg:SF 90) 0) (mem/u/c:V4SF (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S16 A128]))) "pr89984-2.c":7:12 2838 {*andv4sf3} (expr_list:REG_DEAD (reg:SF 90) (nil))) (insn 9 8 10 2 (set (reg:V4SF 87) (xor:V4SF (reg:V4SF 85) (subreg:V4SF (reg:SF 89) 0))) "pr89984-2.c":7:12 2842 {*xorv4sf3} (expr_list:REG_DEAD (reg:SF 89) (expr_list:REG_DEAD (reg:V4SF 85) (nil)))) (insn 10 9 14 2 (set (reg:SF 82 [ ]) (subreg:SF (reg:V4SF 87) 0)) "pr89984-2.c":7:12 142 {*movsf_internal} (expr_list:REG_DEAD (reg:V4SF 87) (nil))) (insn 14 10 15 2 (set (reg/i:SF 20 xmm0) (reg:SF 82 [ ])) "pr89984-2.c":8:1 142 {*movsf_internal} (expr_list:REG_DEAD (reg:SF 82 [ ]) (nil))) (insn 15 14 0 2 (use (reg/i:SF 20 xmm0)) "pr89984-2.c":8:1 -1 (nil)) and doesn't know that if it would use xmm0 not just for pseudo 82 but also for pseudo 87, it could create a noop move in insn 10 and so could avoid an extra register copy and nothing later on is able to figure that out either. I don't know how the RA should know that though. So that we don't regress, this version of the patch will do this stuff (i.e. use fresh vector pseudo as destination and then move lowpart of that to dest) over what it used before (i.e. use paradoxical subreg of the dest) only if lowpart_subreg returns NULL. 2022-02-22 Jakub Jelinek PR target/104612 * config/i386/i386-expand.cc (ix86_expand_copysign): Call force_reg on input operands before calling lowpart_subreg on it. For output operand, use a vmode pseudo as destination and then move its lowpart subreg into operands[0] if lowpart_subreg fails on dest. (ix86_expand_xorsign): Likewise. * gcc.dg/pr104612.c: New test.
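As an illustration of the failure mode (a hypothetical minimal sketch, not the committed gcc.dg/pr104612.c testcase): when the sign source is a float living in the low half of a wider 64-bit object, the expander can see (subreg:SF (reg:DI ...) 0) at -O2 -m32, and lowpart_subreg to V4SFmode then returns NULL.

/* Hypothetical reproducer sketch; the union only serves to get the
   float packed into a 64-bit value so that its SFmode use becomes a
   subreg of a DImode pseudo.  */
union u64f { unsigned long long i; float f[2]; };

float
foo (union u64f x, float y)
{
  return __builtin_copysignf (y, x.f[0]);
}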
--- gcc/config/i386/i386-expand.cc | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 6cf1a0b..7f7055b 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -2153,7 +2153,7 @@ void ix86_expand_copysign (rtx operands[]) { machine_mode mode, vmode; - rtx dest, op0, op1, mask, op2, op3; + rtx dest, vdest, op0, op1, mask, op2, op3; mode = GET_MODE (operands[0]); @@ -2174,8 +2174,13 @@ ix86_expand_copysign (rtx operands[]) return; } - dest = lowpart_subreg (vmode, operands[0], mode); - op1 = lowpart_subreg (vmode, operands[2], mode); + dest = operands[0]; + vdest = lowpart_subreg (vmode, dest, mode); + if (vdest == NULL_RTX) + vdest = gen_reg_rtx (vmode); + else + dest = NULL_RTX; + op1 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode); mask = ix86_build_signbit_mask (vmode, 0, 0); if (CONST_DOUBLE_P (operands[1])) @@ -2184,7 +2189,9 @@ ix86_expand_copysign (rtx operands[]) /* Optimize for 0, simplify b = copy_signf (0.0f, a) to b = mask & a. */ if (op0 == CONST0_RTX (mode)) { - emit_move_insn (dest, gen_rtx_AND (vmode, mask, op1)); + emit_move_insn (vdest, gen_rtx_AND (vmode, mask, op1)); + if (dest) + emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode)); return; } @@ -2193,7 +2200,7 @@ ix86_expand_copysign (rtx operands[]) op0 = force_reg (vmode, op0); } else - op0 = lowpart_subreg (vmode, operands[1], mode); + op0 = lowpart_subreg (vmode, force_reg (mode, operands[1]), mode); op2 = gen_reg_rtx (vmode); op3 = gen_reg_rtx (vmode); @@ -2201,7 +2208,9 @@ ix86_expand_copysign (rtx operands[]) gen_rtx_NOT (vmode, mask), op0)); emit_move_insn (op3, gen_rtx_AND (vmode, mask, op1)); - emit_move_insn (dest, gen_rtx_IOR (vmode, op2, op3)); + emit_move_insn (vdest, gen_rtx_IOR (vmode, op2, op3)); + if (dest) + emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode)); } /* Expand an xorsign operation. */ @@ -2210,7 +2219,7 @@ void ix86_expand_xorsign (rtx operands[]) { machine_mode mode, vmode; - rtx dest, op0, op1, mask, x, temp; + rtx dest, vdest, op0, op1, mask, x, temp; dest = operands[0]; op0 = operands[1]; @@ -2230,15 +2239,22 @@ ix86_expand_xorsign (rtx operands[]) temp = gen_reg_rtx (vmode); mask = ix86_build_signbit_mask (vmode, 0, 0); - op1 = lowpart_subreg (vmode, op1, mode); + op1 = lowpart_subreg (vmode, force_reg (mode, op1), mode); x = gen_rtx_AND (vmode, op1, mask); emit_insn (gen_rtx_SET (temp, x)); - op0 = lowpart_subreg (vmode, op0, mode); + op0 = lowpart_subreg (vmode, force_reg (mode, op0), mode); x = gen_rtx_XOR (vmode, temp, op0); - dest = lowpart_subreg (vmode, dest, mode); - emit_insn (gen_rtx_SET (dest, x)); + vdest = lowpart_subreg (vmode, dest, mode); + if (vdest == NULL_RTX) + vdest = gen_reg_rtx (vmode); + else + dest = NULL_RTX; + emit_insn (gen_rtx_SET (vdest, x)); + + if (dest) + emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode)); } static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1); -- cgit v1.1 From c2b23aaaf4457278403c01cd145cd3936683384e Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Fri, 18 Feb 2022 12:31:02 +0100 Subject: [nvptx] Add -mptx-comment Add functionality that indicates which insns are added by -minit-regs, such that for instance we have for pr53465.s: ... 
// #APP // 9 "gcc/testsuite/gcc.c-torture/execute/pr53465.c" 1 // Start: Added by -minit-regs=3: // #NO_APP mov.u32 %r26, 0; // #APP // 9 "gcc/testsuite/gcc.c-torture/execute/pr53465.c" 1 // End: Added by -minit-regs=3: // #NO_APP ... Can be switched off using -mno-ptx-comment. Tested on nvptx. gcc/ChangeLog: 2022-02-21 Tom de Vries * config/nvptx/nvptx.cc (gen_comment): New function. (workaround_uninit_method_1, workaround_uninit_method_2) (workaround_uninit_method_3): : Use gen_comment. * config/nvptx/nvptx.opt (mptx-comment): New option. --- gcc/config/nvptx/nvptx.cc | 42 ++++++++++++++++++++++++++++++++++++++++++ gcc/config/nvptx/nvptx.opt | 3 +++ 2 files changed, 45 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index a37a6c7..981b91f 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -5372,6 +5372,17 @@ workaround_barsyncs (void) } #endif +static rtx +gen_comment (const char *s) +{ + const char *sep = " "; + size_t len = strlen (ASM_COMMENT_START) + strlen (sep) + strlen (s) + 1; + char *comment = (char *) alloca (len); + snprintf (comment, len, "%s%s%s", ASM_COMMENT_START, sep, s); + return gen_rtx_ASM_INPUT_loc (VOIDmode, ggc_strdup (comment), + cfun->function_start_locus); +} + /* Initialize all declared regs at function entry. Advantage : Fool-proof. Disadvantage: Potentially creates a lot of long live ranges and adds a lot @@ -5394,6 +5405,8 @@ workaround_uninit_method_1 (void) gcc_assert (CONST0_RTX (GET_MODE (reg))); start_sequence (); + if (nvptx_comment && first != NULL) + emit_insn (gen_comment ("Start: Added by -minit-regs=1")); emit_move_insn (reg, CONST0_RTX (GET_MODE (reg))); rtx_insn *inits = get_insns (); end_sequence (); @@ -5411,6 +5424,9 @@ workaround_uninit_method_1 (void) else insert_here = emit_insn_after (inits, insert_here); } + + if (nvptx_comment && insert_here != NULL) + emit_insn_after (gen_comment ("End: Added by -minit-regs=1"), insert_here); } /* Find uses of regs that are not defined on all incoming paths, and insert a @@ -5446,6 +5462,8 @@ workaround_uninit_method_2 (void) gcc_assert (CONST0_RTX (GET_MODE (reg))); start_sequence (); + if (nvptx_comment && first != NULL) + emit_insn (gen_comment ("Start: Added by -minit-regs=2:")); emit_move_insn (reg, CONST0_RTX (GET_MODE (reg))); rtx_insn *inits = get_insns (); end_sequence (); @@ -5463,6 +5481,9 @@ workaround_uninit_method_2 (void) else insert_here = emit_insn_after (inits, insert_here); } + + if (nvptx_comment && insert_here != NULL) + emit_insn_after (gen_comment ("End: Added by -minit-regs=2"), insert_here); } /* Find uses of regs that are not defined on all incoming paths, and insert a @@ -5531,6 +5552,27 @@ workaround_uninit_method_3 (void) } } + if (nvptx_comment) + FOR_EACH_BB_FN (bb, cfun) + { + if (single_pred_p (bb)) + continue; + + edge e; + edge_iterator ei; + FOR_EACH_EDGE (e, ei, bb->preds) + { + if (e->insns.r == NULL_RTX) + continue; + start_sequence (); + emit_insn (gen_comment ("Start: Added by -minit-regs=3:")); + emit_insn (e->insns.r); + emit_insn (gen_comment ("End: Added by -minit-regs=3:")); + e->insns.r = get_insns (); + end_sequence (); + } + } + commit_edge_insertions (); } diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt index 0858007..e56ec92 100644 --- a/gcc/config/nvptx/nvptx.opt +++ b/gcc/config/nvptx/nvptx.opt @@ -95,3 +95,6 @@ Specify the version of the ptx version to use. 
minit-regs= Target Var(nvptx_init_regs) IntegerRange(0, 3) Joined UInteger Init(3) Initialize ptx registers. + +mptx-comment +Target Var(nvptx_comment) Init(1) Undocumented -- cgit v1.1 From bc91cb8d8cf1d4abbb74fb69d918071e1801fd77 Mon Sep 17 00:00:00 2001 From: Tobias Burnus Date: Sat, 19 Feb 2022 23:28:49 +0100 Subject: nvptx: Add -mptx=6.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently supported internally are 3.1, 6.0, 6.3 and 7.0. However, -mptx= supports 3.1, 6.3, 7.0 – but not the internal default 6.0. Add -mptx=6.0 for consistency. Tested on nvptx. gcc/ChangeLog: * config/nvptx/nvptx.opt (mptx): Add 6.0 alias PTX_VERSION_6_0. * doc/invoke.texi (-mptx): Update for new values and defaults. Co-Authored-By: Tom de Vries --- gcc/config/nvptx/nvptx.opt | 3 +++ 1 file changed, 3 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt index e56ec92..97e127c 100644 --- a/gcc/config/nvptx/nvptx.opt +++ b/gcc/config/nvptx/nvptx.opt @@ -83,6 +83,9 @@ EnumValue Enum(ptx_version) String(3.1) Value(PTX_VERSION_3_1) EnumValue +Enum(ptx_version) String(6.0) Value(PTX_VERSION_6_0) + +EnumValue Enum(ptx_version) String(6.3) Value(PTX_VERSION_6_3) EnumValue -- cgit v1.1 From bd73d8dd312c759ee505b401d6b4fd7be07a3f1a Mon Sep 17 00:00:00 2001 From: Tobias Burnus Date: Sun, 20 Feb 2022 00:25:33 +0100 Subject: nvptx: Add -misa=sm_70 Add -misa=sm_70, and use it to specify the misa value in test-case gcc.target/nvptx/atomic-store-2.c. Tested on nvptx. gcc/ChangeLog: * config/nvptx/nvptx-c.cc (nvptx_cpu_cpp_builtins): Handle SM70. * config/nvptx/nvptx.cc (first_ptx_version_supporting_sm): Likewise. * config/nvptx/nvptx.opt (misa): Add sm_70 alias PTX_ISA_SM70. gcc/testsuite/ChangeLog: 2022-02-22 Tom de Vries * gcc.target/nvptx/atomic-store-2.c: Use -misa=sm_70. * gcc.target/nvptx/uniform-simt-3.c: Same. 
Co-Authored-By: Tom de Vries --- gcc/config/nvptx/nvptx-c.cc | 2 ++ gcc/config/nvptx/nvptx.cc | 2 ++ gcc/config/nvptx/nvptx.opt | 3 +++ 3 files changed, 7 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx-c.cc b/gcc/config/nvptx/nvptx-c.cc index d68b991..b2375fb 100644 --- a/gcc/config/nvptx/nvptx-c.cc +++ b/gcc/config/nvptx/nvptx-c.cc @@ -43,6 +43,8 @@ nvptx_cpu_cpp_builtins (void) cpp_define (parse_in, "__PTX_SM__=800"); else if (TARGET_SM75) cpp_define (parse_in, "__PTX_SM__=750"); + else if (TARGET_SM70) + cpp_define (parse_in, "__PTX_SM__=700"); else if (TARGET_SM53) cpp_define (parse_in, "__PTX_SM__=530"); else if (TARGET_SM35) diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index 981b91f..858789e 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -217,6 +217,8 @@ first_ptx_version_supporting_sm (enum ptx_isa sm) return PTX_VERSION_3_1; case PTX_ISA_SM53: return PTX_VERSION_4_2; + case PTX_ISA_SM70: + return PTX_VERSION_6_0; case PTX_ISA_SM75: return PTX_VERSION_6_3; case PTX_ISA_SM80: diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt index 97e127c..9776c3b 100644 --- a/gcc/config/nvptx/nvptx.opt +++ b/gcc/config/nvptx/nvptx.opt @@ -65,6 +65,9 @@ EnumValue Enum(ptx_isa) String(sm_53) Value(PTX_ISA_SM53) EnumValue +Enum(ptx_isa) String(sm_70) Value(PTX_ISA_SM70) + +EnumValue Enum(ptx_isa) String(sm_75) Value(PTX_ISA_SM75) EnumValue -- cgit v1.1 From bf3e36fbf13f0db44a79988036cb9c042288841a Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Wed, 13 Oct 2021 09:16:09 +0000 Subject: arm: Add GENERAL_AND_VPR_REGS regclass MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At some point during the development of this patch series, it appeared that in some cases the register allocator wants “VPR or general” rather than “VPR or general or FP” (which is the same thing as ALL_REGS). The series does not seem to require this anymore, but it seems to be a good thing to do anyway, to give the register allocator more freedom. CLASS_MAX_NREGS and arm_hard_regno_nregs need adjustment to avoid a regression in gcc.dg/stack-usage-1.c when compiled with -mthumb -mfloat-abi=hard -march=armv8.1-m.main+mve.fp+fp.dp. Most of the work of this patch series was carried out while I was working at STMicroelectronics as a Linaro assignee. 2022-02-22 Christophe Lyon gcc/ * config/arm/arm.h (reg_class): Add GENERAL_AND_VPR_REGS. (REG_CLASS_NAMES): Likewise. (REG_CLASS_CONTENTS): Likewise. (CLASS_MAX_NREGS): Handle VPR. * config/arm/arm.cc (arm_hard_regno_nregs): Handle VPR. 
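For a worked view of the new VPR handling (an editorial sketch, not GCC source): both CLASS_MAX_NREGS and arm_hard_regno_nregs compute CEIL (GET_MODE_SIZE (mode), 2), so any mode of at most two bytes (which covers the 16-bit MVE predicate modes) fits in a single VPR register.

#include <stdio.h>

/* Mirrors the CEIL (GET_MODE_SIZE (mode), 2) arithmetic used for VPR;
   the 2-byte size assumed below corresponds to a 16-bit predicate.  */
static unsigned
vpr_nregs (unsigned mode_size_bytes)
{
  return (mode_size_bytes + 1) / 2;
}

int
main (void)
{
  printf ("2-byte predicate mode -> %u VPR register(s)\n", vpr_nregs (2));
  return 0;
}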
--- gcc/config/arm/arm.cc | 3 +++ gcc/config/arm/arm.h | 7 ++++++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc index 663f459..9c19589 100644 --- a/gcc/config/arm/arm.cc +++ b/gcc/config/arm/arm.cc @@ -25339,6 +25339,9 @@ thumb2_asm_output_opcode (FILE * stream) static unsigned int arm_hard_regno_nregs (unsigned int regno, machine_mode mode) { + if (IS_VPR_REGNUM (regno)) + return CEIL (GET_MODE_SIZE (mode), 2); + if (TARGET_32BIT && regno > PC_REGNUM && regno != FRAME_POINTER_REGNUM diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h index f52724d..61c0221 100644 --- a/gcc/config/arm/arm.h +++ b/gcc/config/arm/arm.h @@ -1287,6 +1287,7 @@ enum reg_class SFP_REG, AFP_REG, VPR_REG, + GENERAL_AND_VPR_REGS, ALL_REGS, LIM_REG_CLASSES }; @@ -1316,6 +1317,7 @@ enum reg_class "SFP_REG", \ "AFP_REG", \ "VPR_REG", \ + "GENERAL_AND_VPR_REGS", \ "ALL_REGS" \ } @@ -1344,6 +1346,7 @@ enum reg_class { 0x00000000, 0x00000000, 0x00000000, 0x00000040 }, /* SFP_REG */ \ { 0x00000000, 0x00000000, 0x00000000, 0x00000080 }, /* AFP_REG */ \ { 0x00000000, 0x00000000, 0x00000000, 0x00000400 }, /* VPR_REG. */ \ + { 0x00005FFF, 0x00000000, 0x00000000, 0x00000400 }, /* GENERAL_AND_VPR_REGS. */ \ { 0xFFFF7FFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0000000F } /* ALL_REGS. */ \ } @@ -1453,7 +1456,9 @@ extern const char *fp_sysreg_names[NB_FP_SYSREGS]; ARM regs are UNITS_PER_WORD bits. FIXME: Is this true for iWMMX? */ #define CLASS_MAX_NREGS(CLASS, MODE) \ - (ARM_NUM_REGS (MODE)) + (CLASS == VPR_REG) \ + ? CEIL (GET_MODE_SIZE (MODE), 2) \ + : (ARM_NUM_REGS (MODE)) /* If defined, gives a class of registers that cannot be used as the operand of a SUBREG that changes the mode of the object illegally. */ -- cgit v1.1 From 6769084fdf159fb5c0fd20c8d28cfef5b2126cb0 Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Wed, 13 Oct 2021 09:16:14 +0000 Subject: arm: Add support for VPR_REG in arm_class_likely_spilled_p VPR_REG is the only register in its class, so it should be handled by TARGET_CLASS_LIKELY_SPILLED_P, which is achieved by calling default_class_likely_spilled_p. No test fails without this patch, but it seems it should be implemented. Most of the work of this patch series was carried out while I was working at STMicroelectronics as a Linaro assignee. 2022-02-22 Christophe Lyon gcc/ * config/arm/arm.cc (arm_class_likely_spilled_p): Handle VPR_REG. --- gcc/config/arm/arm.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc index 9c19589..8d7f095 100644 --- a/gcc/config/arm/arm.cc +++ b/gcc/config/arm/arm.cc @@ -29369,7 +29369,7 @@ arm_class_likely_spilled_p (reg_class_t rclass) || rclass == CC_REG) return true; - return false; + return default_class_likely_spilled_p (rclass); } /* Implements target hook small_register_classes_for_mode_p. */ -- cgit v1.1 From 0d0aaea105f6b5ddd9b4763e4cbd16ef65a74cb9 Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Wed, 13 Oct 2021 09:16:17 +0000 Subject: arm: Fix mve_vmvnq_n_ argument mode The vmvnq_n* intrinsics and have [u]int[16|32]_t arguments, so use iterator instead of HI in mve_vmvnq_n_. Most of the work of this patch series was carried out while I was working at STMicroelectronics as a Linaro assignee. 2022-02-22 Christophe Lyon gcc/ * config/arm/mve.md (mve_vmvnq_n_): Use V_elem mode for operand 1. 
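A usage sketch (assuming arm_mve.h and an MVE-enabled -march; the immediate values are arbitrary but encodable) showing that the intrinsic's immediate is a 16- or 32-bit element value, which is why the pattern needs the per-mode element mode rather than a fixed HImode operand:

#include <arm_mve.h>

uint16x8_t
not_imm_u16 (void)
{
  /* Per-lane bitwise NOT of a 16-bit immediate.  */
  return vmvnq_n_u16 (0x5500);
}

uint32x4_t
not_imm_u32 (void)
{
  /* Per-lane bitwise NOT of a 32-bit immediate.  */
  return vmvnq_n_u32 (0xff00);
}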
--- gcc/config/arm/mve.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index 171dd38..5c3b34d 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -617,7 +617,7 @@ (define_insn "mve_vmvnq_n_" [ (set (match_operand:MVE_5 0 "s_register_operand" "=w") - (unspec:MVE_5 [(match_operand:HI 1 "immediate_operand" "i")] + (unspec:MVE_5 [(match_operand: 1 "immediate_operand" "i")] VMVNQ_N)) ] "TARGET_HAVE_MVE" -- cgit v1.1 From 884f77b4222289510e1df9db2889b60c5df6fcda Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Wed, 13 Oct 2021 09:16:22 +0000 Subject: arm: Implement MVE predicates as vectors of booleans This patch implements support for vectors of booleans to support MVE predicates, instead of HImode. Since the ABI mandates pred16_t (aka uint16_t) to represent predicates in intrinsics prototypes, we introduce a new "predicate" type qualifier so that we can map relevant builtins HImode arguments and return value to the appropriate vector of booleans (VxBI). We have to update test_vector_ops_duplicate, because it iterates using an offset in bytes, where we would need to iterate in bits: we stop iterating when we reach the end of the vector of booleans. In addition, we have to fix the underlying definition of vectors of booleans because ARM/MVE needs a different representation than AArch64/SVE. With ARM/MVE the 'true' bit is duplicated over the element size, so that a true element of V4BI is represented by '0b1111'. This patch updates the aarch64 definition of VNx*BI as needed. Most of the work of this patch series was carried out while I was working at STMicroelectronics as a Linaro assignee. 2022-02-22 Christophe Lyon Richard Sandiford gcc/ PR target/100757 PR target/101325 * config/aarch64/aarch64-modes.def (VNx16BI, VNx8BI, VNx4BI, VNx2BI): Update definition. * config/arm/arm-builtins.cc (arm_init_simd_builtin_types): Add new simd types. (arm_init_builtin): Map predicate vectors arguments to HImode. (arm_expand_builtin_args): Move HImode predicate arguments to VxBI rtx. Move return value to HImode rtx. * config/arm/arm-builtins.h (arm_type_qualifiers): Add qualifier_predicate. * config/arm/arm-modes.def (B2I, B4I, V16BI, V8BI, V4BI): New modes. * config/arm/arm-simd-builtin-types.def (Pred1x16_t, Pred2x8_t,Pred4x4_t): New. * emit-rtl.cc (init_emit_once): Handle all boolean modes. * genmodes.cc (mode_data): Add boolean field. (blank_mode): Initialize it. (make_complex_modes): Fix handling of boolean modes. (make_vector_modes): Likewise. (VECTOR_BOOL_MODE): Use new COMPONENT parameter. (make_vector_bool_mode): Likewise. (BOOL_MODE): New. (make_bool_mode): New. (emit_insn_modes_h): Fix generation of boolean modes. (emit_class_narrowest_mode): Likewise. * machmode.def: (VECTOR_BOOL_MODE): Document new COMPONENT parameter. Use new BOOL_MODE instead of FRACTIONAL_INT_MODE to define BImode. * rtx-vector-builder.cc (rtx_vector_builder::find_cached_value): Fix handling of constm1_rtx for VECTOR_BOOL. * simplify-rtx.cc (native_encode_rtx): Fix support for VECTOR_BOOL. (native_decode_vector_rtx): Likewise. (test_vector_ops_duplicate): Skip vec_merge test with vectors of booleans. * varasm.cc (output_constant_pool_2): Likewise. 
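A small usage sketch of the ABI-level view described above (assumes arm_mve.h and MVE enabled): intrinsics keep exchanging predicates as mve_pred16_t (an unsigned 16-bit integer), and for 32-bit lanes each lane's truth value is replicated over 4 bits, so selecting lanes 0 and 2 uses the mask 0x0f0f rather than 0x5.

#include <arm_mve.h>

int32x4_t
pick_lanes_0_and_2 (int32x4_t a, int32x4_t b)
{
  /* Lane 0 occupies bits 0-3 and lane 2 bits 8-11 of the 16-bit mask.  */
  mve_pred16_t p = 0x0f0f;
  return vpselq_s32 (a, b, p);
}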
--- gcc/config/aarch64/aarch64-modes.def | 8 +++---- gcc/config/arm/arm-builtins.cc | 39 +++++++++++++++++++++++++++++-- gcc/config/arm/arm-builtins.h | 4 +++- gcc/config/arm/arm-modes.def | 8 +++++++ gcc/config/arm/arm-simd-builtin-types.def | 4 ++++ 5 files changed, 56 insertions(+), 7 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/aarch64/aarch64-modes.def b/gcc/config/aarch64/aarch64-modes.def index 976bf9b..8f39922 100644 --- a/gcc/config/aarch64/aarch64-modes.def +++ b/gcc/config/aarch64/aarch64-modes.def @@ -47,10 +47,10 @@ ADJUST_FLOAT_FORMAT (HF, &ieee_half_format); /* Vector modes. */ -VECTOR_BOOL_MODE (VNx16BI, 16, 2); -VECTOR_BOOL_MODE (VNx8BI, 8, 2); -VECTOR_BOOL_MODE (VNx4BI, 4, 2); -VECTOR_BOOL_MODE (VNx2BI, 2, 2); +VECTOR_BOOL_MODE (VNx16BI, 16, BI, 2); +VECTOR_BOOL_MODE (VNx8BI, 8, BI, 2); +VECTOR_BOOL_MODE (VNx4BI, 4, BI, 2); +VECTOR_BOOL_MODE (VNx2BI, 2, BI, 2); ADJUST_NUNITS (VNx16BI, aarch64_sve_vg * 8); ADJUST_NUNITS (VNx8BI, aarch64_sve_vg * 4); diff --git a/gcc/config/arm/arm-builtins.cc b/gcc/config/arm/arm-builtins.cc index e6bbda2..993a2f7 100644 --- a/gcc/config/arm/arm-builtins.cc +++ b/gcc/config/arm/arm-builtins.cc @@ -1553,11 +1553,28 @@ arm_init_simd_builtin_types (void) tree eltype = arm_simd_types[i].eltype; machine_mode mode = arm_simd_types[i].mode; - if (eltype == NULL) + if (eltype == NULL + /* VECTOR_BOOL is not supported unless MVE is activated, + this would make build_truth_vector_type_for_mode + crash. */ + && ((GET_MODE_CLASS (mode) != MODE_VECTOR_BOOL) + || !TARGET_HAVE_MVE)) continue; if (arm_simd_types[i].itype == NULL) { - tree type = build_vector_type (eltype, GET_MODE_NUNITS (mode)); + tree type; + if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL) + { + /* Handle MVE predicates: they are internally stored as + 16 bits, but are used as vectors of 1, 2 or 4-bit + elements. */ + type = build_truth_vector_type_for_mode (GET_MODE_NUNITS (mode), + mode); + eltype = TREE_TYPE (type); + } + else + type = build_vector_type (eltype, GET_MODE_NUNITS (mode)); + type = build_distinct_type_copy (type); SET_TYPE_STRUCTURAL_EQUALITY (type); @@ -1695,6 +1712,11 @@ arm_init_builtin (unsigned int fcode, arm_builtin_datum *d, if (qualifiers & qualifier_map_mode) op_mode = d->mode; + /* MVE Predicates use HImode as mandated by the ABI: pred16_t is + unsigned short. */ + if (qualifiers & qualifier_predicate) + op_mode = HImode; + /* For pointers, we want a pointer to the basic type of the vector. */ if (qualifiers & qualifier_pointer && VECTOR_MODE_P (op_mode)) @@ -2939,6 +2961,12 @@ arm_expand_builtin_args (rtx target, machine_mode map_mode, int fcode, case ARG_BUILTIN_COPY_TO_REG: if (POINTER_TYPE_P (TREE_TYPE (arg[argc]))) op[argc] = convert_memory_address (Pmode, op[argc]); + + /* MVE uses mve_pred16_t (aka HImode) for vectors of + predicates. 
*/ + if (GET_MODE_CLASS (mode[argc]) == MODE_VECTOR_BOOL) + op[argc] = gen_lowpart (mode[argc], op[argc]); + /*gcc_assert (GET_MODE (op[argc]) == mode[argc]); */ if (!(*insn_data[icode].operand[opno].predicate) (op[argc], mode[argc])) @@ -3144,6 +3172,13 @@ constant_arg: else emit_insn (insn); + if (GET_MODE_CLASS (tmode) == MODE_VECTOR_BOOL) + { + rtx HItarget = gen_reg_rtx (HImode); + emit_move_insn (HItarget, gen_lowpart (HImode, target)); + return HItarget; + } + return target; } diff --git a/gcc/config/arm/arm-builtins.h b/gcc/config/arm/arm-builtins.h index e5130d6..a8ef8ae 100644 --- a/gcc/config/arm/arm-builtins.h +++ b/gcc/config/arm/arm-builtins.h @@ -84,7 +84,9 @@ enum arm_type_qualifiers qualifier_lane_pair_index = 0x1000, /* Lane indices selected in quadtuplets - must be within range of previous argument = a vector. */ - qualifier_lane_quadtup_index = 0x2000 + qualifier_lane_quadtup_index = 0x2000, + /* MVE vector predicates. */ + qualifier_predicate = 0x4000 }; struct arm_simd_type_info diff --git a/gcc/config/arm/arm-modes.def b/gcc/config/arm/arm-modes.def index de689c8..9ed0cd0 100644 --- a/gcc/config/arm/arm-modes.def +++ b/gcc/config/arm/arm-modes.def @@ -84,6 +84,14 @@ VECTOR_MODE (FLOAT, BF, 2); /* V2BF. */ VECTOR_MODE (FLOAT, BF, 4); /* V4BF. */ VECTOR_MODE (FLOAT, BF, 8); /* V8BF. */ +/* Predicates for MVE. */ +BOOL_MODE (B2I, 2, 1); +BOOL_MODE (B4I, 4, 1); + +VECTOR_BOOL_MODE (V16BI, 16, BI, 2); +VECTOR_BOOL_MODE (V8BI, 8, B2I, 2); +VECTOR_BOOL_MODE (V4BI, 4, B4I, 2); + /* Fraction and accumulator vector modes. */ VECTOR_MODES (FRACT, 4); /* V4QQ V2HQ */ VECTOR_MODES (UFRACT, 4); /* V4UQQ V2UHQ */ diff --git a/gcc/config/arm/arm-simd-builtin-types.def b/gcc/config/arm/arm-simd-builtin-types.def index 6ba6f21..d1d6416 100644 --- a/gcc/config/arm/arm-simd-builtin-types.def +++ b/gcc/config/arm/arm-simd-builtin-types.def @@ -51,3 +51,7 @@ ENTRY (Bfloat16x2_t, V2BF, none, 32, bfloat16, 20) ENTRY (Bfloat16x4_t, V4BF, none, 64, bfloat16, 20) ENTRY (Bfloat16x8_t, V8BF, none, 128, bfloat16, 20) + + ENTRY (Pred1x16_t, V16BI, predicate, 16, pred1, 16) + ENTRY (Pred2x8_t, V8BI, predicate, 8, pred1, 15) + ENTRY (Pred4x4_t, V4BI, predicate, 4, pred1, 15) -- cgit v1.1 From 91224cf625dc90304bb515a0cc602beed48fe3da Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Wed, 13 Oct 2021 09:16:27 +0000 Subject: arm: Implement auto-vectorized MVE comparisons with vectors of boolean predicates We make use of qualifier_predicate to describe MVE builtins prototypes, restricting to auto-vectorizable vcmp* and vpsel builtins, as they are exercised by the tests added earlier in the series. Special handling is needed for mve_vpselq because it has a v2di variant, which has no natural VPR.P0 representation: we keep HImode for it. The vector_compare expansion code is updated to use the right VxBI mode instead of HI for the result. We extend the existing thumb2_movhi_vfp and thumb2_movhi_fp16 patterns to use the new MVE_7_HI iterator which covers HI and the new VxBI modes, in conjunction with the new DB constraint for a constant vector of booleans. This patch also adds tests derived from the one provided in PR target/101325: there is a compile-only test because I did not have access to anything that could execute MVE code until recently. I have been able to add an executable test since QEMU supports MVE. Instead of adding arm_v8_1m_mve_hw, I update arm_mve_hw so that it uses add_options_for_arm_v8_1m_mve_fp, like arm_neon_hw does. 
This ensures arm_mve_hw passes even if the toolchain does not generate MVE code by default. Most of the work of this patch series was carried out while I was working at STMicroelectronics as a Linaro assignee. 2022-02-22 Christophe Lyon Richard Sandiford gcc/ PR target/100757 PR target/101325 * config/arm/arm-builtins.cc (BINOP_PRED_UNONE_UNONE_QUALIFIERS) (BINOP_PRED_NONE_NONE_QUALIFIERS) (TERNOP_NONE_NONE_NONE_PRED_QUALIFIERS) (TERNOP_UNONE_UNONE_UNONE_PRED_QUALIFIERS): New. * config/arm/arm-protos.h (mve_bool_vec_to_const): New. * config/arm/arm.cc (arm_hard_regno_mode_ok): Handle new VxBI modes. (arm_mode_to_pred_mode): New. (arm_expand_vector_compare): Use the right VxBI mode instead of HI. (arm_expand_vcond): Likewise. (simd_valid_immediate): Handle MODE_VECTOR_BOOL. (mve_bool_vec_to_const): New. (neon_make_constant): Call mve_bool_vec_to_const when needed. * config/arm/arm_mve_builtins.def (vcmpneq_, vcmphiq_, vcmpcsq_) (vcmpltq_, vcmpleq_, vcmpgtq_, vcmpgeq_, vcmpeqq_, vcmpneq_f) (vcmpltq_f, vcmpleq_f, vcmpgtq_f, vcmpgeq_f, vcmpeqq_f, vpselq_u) (vpselq_s, vpselq_f): Use new predicated qualifiers. * config/arm/constraints.md (DB): New. * config/arm/iterators.md (MVE_7, MVE_7_HI): New mode iterators. (MVE_VPRED, MVE_vpred): New attribute iterators. * config/arm/mve.md (@mve_vcmpq_) (@mve_vcmpq_f, @mve_vpselq_) (@mve_vpselq_f): Use MVE_VPRED instead of HI. (@mve_vpselq_v2di): Define separately. (mov): New expander for VxBI modes. * config/arm/vfp.md (thumb2_movhi_vfp, thumb2_movhi_fp16): Use MVE_7_HI iterator and add support for DB constraint. gcc/testsuite/ PR target/100757 PR target/101325 * gcc.dg/rtl/arm/mve-vxbi.c: New test. * gcc.target/arm/simd/pr101325.c: New. * gcc.target/arm/simd/pr101325-2.c: New. * lib/target-supports.exp (check_effective_target_arm_mve_hw): Use add_options_for_arm_v8_1m_mve_fp. 
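For illustration, a loop in the spirit of the PR target/101325 reproducer (a hedged sketch, not the committed test): with -O3 and an MVE-enabled -march the comparison is expected to become a VCMP writing the VPR.P0 predicate and the select a VPSEL, exercising the predicated qualifiers added here.

void
f (const unsigned int *a, const unsigned int *b, unsigned int *c, int n)
{
  for (int i = 0; i < n; i++)
    c[i] = a[i] > b[i] ? 2u : 5u;
}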
--- gcc/config/arm/arm-builtins.cc | 25 +++++++++++++++++ gcc/config/arm/arm-protos.h | 1 + gcc/config/arm/arm.cc | 56 +++++++++++++++++++++++++++++++++---- gcc/config/arm/arm_mve_builtins.def | 34 +++++++++++----------- gcc/config/arm/constraints.md | 6 ++++ gcc/config/arm/iterators.md | 6 ++++ gcc/config/arm/mve.md | 23 +++++++++++---- gcc/config/arm/vfp.md | 34 ++++++++++++++-------- 8 files changed, 144 insertions(+), 41 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm-builtins.cc b/gcc/config/arm/arm-builtins.cc index 993a2f7..1c6b9c9 100644 --- a/gcc/config/arm/arm-builtins.cc +++ b/gcc/config/arm/arm-builtins.cc @@ -421,6 +421,12 @@ arm_binop_unone_unone_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] (arm_binop_unone_unone_unone_qualifiers) static enum arm_type_qualifiers +arm_binop_pred_unone_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_predicate, qualifier_unsigned, qualifier_unsigned }; +#define BINOP_PRED_UNONE_UNONE_QUALIFIERS \ + (arm_binop_pred_unone_unone_qualifiers) + +static enum arm_type_qualifiers arm_binop_unone_none_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_unsigned, qualifier_none, qualifier_immediate }; #define BINOP_UNONE_NONE_IMM_QUALIFIERS \ @@ -439,6 +445,12 @@ arm_binop_unone_none_none_qualifiers[SIMD_MAX_BUILTIN_ARGS] (arm_binop_unone_none_none_qualifiers) static enum arm_type_qualifiers +arm_binop_pred_none_none_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_predicate, qualifier_none, qualifier_none }; +#define BINOP_PRED_NONE_NONE_QUALIFIERS \ + (arm_binop_pred_none_none_qualifiers) + +static enum arm_type_qualifiers arm_binop_unone_unone_none_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_unsigned, qualifier_unsigned, qualifier_none }; #define BINOP_UNONE_UNONE_NONE_QUALIFIERS \ @@ -510,6 +522,12 @@ arm_ternop_none_none_none_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] (arm_ternop_none_none_none_unone_qualifiers) static enum arm_type_qualifiers +arm_ternop_none_none_none_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_none, qualifier_none, qualifier_none, qualifier_predicate }; +#define TERNOP_NONE_NONE_NONE_PRED_QUALIFIERS \ + (arm_ternop_none_none_none_pred_qualifiers) + +static enum arm_type_qualifiers arm_ternop_none_none_imm_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_none, qualifier_none, qualifier_immediate, qualifier_unsigned }; #define TERNOP_NONE_NONE_IMM_UNONE_QUALIFIERS \ @@ -529,6 +547,13 @@ arm_ternop_unone_unone_unone_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] (arm_ternop_unone_unone_unone_unone_qualifiers) static enum arm_type_qualifiers +arm_ternop_unone_unone_unone_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_unsigned, qualifier_unsigned, qualifier_unsigned, + qualifier_predicate }; +#define TERNOP_UNONE_UNONE_UNONE_PRED_QUALIFIERS \ + (arm_ternop_unone_unone_unone_pred_qualifiers) + +static enum arm_type_qualifiers arm_ternop_none_none_none_none_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_none, qualifier_none, qualifier_none, qualifier_none }; #define TERNOP_NONE_NONE_NONE_NONE_QUALIFIERS \ diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h index 881c72c..f2f7ca6 100644 --- a/gcc/config/arm/arm-protos.h +++ b/gcc/config/arm/arm-protos.h @@ -101,6 +101,7 @@ extern char *neon_output_shift_immediate (const char *, char, rtx *, machine_mode, int, bool); extern void neon_pairwise_reduce (rtx, rtx, machine_mode, rtx (*) (rtx, rtx, rtx)); +extern rtx mve_bool_vec_to_const (rtx const_vec); extern rtx neon_make_constant (rtx, bool generate = true); extern tree 
arm_builtin_vectorized_function (unsigned int, tree, tree); extern void neon_expand_vector_init (rtx, rtx); diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc index 8d7f095..df43c67 100644 --- a/gcc/config/arm/arm.cc +++ b/gcc/config/arm/arm.cc @@ -12802,7 +12802,10 @@ simd_valid_immediate (rtx op, machine_mode mode, int inverse, innersize = GET_MODE_UNIT_SIZE (mode); /* Only support 128-bit vectors for MVE. */ - if (TARGET_HAVE_MVE && (!vector || n_elts * innersize != 16)) + if (TARGET_HAVE_MVE + && (!vector + || (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL) + || n_elts * innersize != 16)) return -1; /* Vectors of float constants. */ @@ -13167,6 +13170,29 @@ neon_vdup_constant (rtx vals, bool generate) return gen_vec_duplicate (mode, x); } +/* Return a HI representation of CONST_VEC suitable for MVE predicates. */ +rtx +mve_bool_vec_to_const (rtx const_vec) +{ + int n_elts = GET_MODE_NUNITS ( GET_MODE (const_vec)); + int repeat = 16 / n_elts; + int i; + int hi_val = 0; + + for (i = 0; i < n_elts; i++) + { + rtx el = CONST_VECTOR_ELT (const_vec, i); + unsigned HOST_WIDE_INT elpart; + + gcc_assert (CONST_INT_P (el)); + elpart = INTVAL (el); + + for (int j = 0; j < repeat; j++) + hi_val |= elpart << (i * repeat + j); + } + return gen_int_mode (hi_val, HImode); +} + /* Return a non-NULL RTX iff VALS, which is a PARALLEL containing only constants (for vec_init) or CONST_VECTOR, can be effeciently loaded into a register. @@ -13207,6 +13233,8 @@ neon_make_constant (rtx vals, bool generate) && simd_immediate_valid_for_move (const_vec, mode, NULL, NULL)) /* Load using VMOV. On Cortex-A8 this takes one cycle. */ return const_vec; + else if (TARGET_HAVE_MVE && (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)) + return mve_bool_vec_to_const (const_vec); else if ((target = neon_vdup_constant (vals, generate)) != NULL_RTX) /* Loaded using VDUP. On Cortex-A8 the VDUP takes one NEON pipeline cycle; creating the constant takes one or two ARM @@ -25365,7 +25393,10 @@ arm_hard_regno_mode_ok (unsigned int regno, machine_mode mode) return false; if (IS_VPR_REGNUM (regno)) - return mode == HImode; + return mode == HImode + || mode == V16BImode + || mode == V8BImode + || mode == V4BImode; if (TARGET_THUMB1) /* For the Thumb we only allow values bigger than SImode in @@ -31053,6 +31084,19 @@ arm_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem, arm_post_atomic_barrier (model); } +/* Return the mode for the MVE vector of predicates corresponding to MODE. */ +machine_mode +arm_mode_to_pred_mode (machine_mode mode) +{ + switch (GET_MODE_NUNITS (mode)) + { + case 16: return V16BImode; + case 8: return V8BImode; + case 4: return V4BImode; + } + gcc_unreachable (); +} + /* Expand code to compare vectors OP0 and OP1 using condition CODE. If CAN_INVERT, store either the result or its inverse in TARGET and return true if TARGET contains the inverse. 
If !CAN_INVERT, @@ -31136,7 +31180,7 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1, if (vcond_mve) vpr_p0 = target; else - vpr_p0 = gen_reg_rtx (HImode); + vpr_p0 = gen_reg_rtx (arm_mode_to_pred_mode (cmp_mode)); switch (GET_MODE_CLASS (cmp_mode)) { @@ -31178,7 +31222,7 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1, if (vcond_mve) vpr_p0 = target; else - vpr_p0 = gen_reg_rtx (HImode); + vpr_p0 = gen_reg_rtx (arm_mode_to_pred_mode (cmp_mode)); emit_insn (gen_mve_vcmpq (code, cmp_mode, vpr_p0, op0, force_reg (cmp_mode, op1))); if (!vcond_mve) @@ -31205,7 +31249,7 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1, if (vcond_mve) vpr_p0 = target; else - vpr_p0 = gen_reg_rtx (HImode); + vpr_p0 = gen_reg_rtx (arm_mode_to_pred_mode (cmp_mode)); emit_insn (gen_mve_vcmpq (swap_condition (code), cmp_mode, vpr_p0, force_reg (cmp_mode, op1), op0)); if (!vcond_mve) @@ -31258,7 +31302,7 @@ arm_expand_vcond (rtx *operands, machine_mode cmp_result_mode) if (TARGET_HAVE_MVE) { vcond_mve=true; - mask = gen_reg_rtx (HImode); + mask = gen_reg_rtx (arm_mode_to_pred_mode (cmp_result_mode)); } else mask = gen_reg_rtx (cmp_result_mode); diff --git a/gcc/config/arm/arm_mve_builtins.def b/gcc/config/arm/arm_mve_builtins.def index c3ae407..44b41ea 100644 --- a/gcc/config/arm/arm_mve_builtins.def +++ b/gcc/config/arm/arm_mve_builtins.def @@ -89,7 +89,7 @@ VAR3 (BINOP_UNONE_UNONE_IMM, vshrq_n_u, v16qi, v8hi, v4si) VAR3 (BINOP_NONE_NONE_IMM, vshrq_n_s, v16qi, v8hi, v4si) VAR1 (BINOP_NONE_NONE_UNONE, vaddlvq_p_s, v4si) VAR1 (BINOP_UNONE_UNONE_UNONE, vaddlvq_p_u, v4si) -VAR3 (BINOP_UNONE_NONE_NONE, vcmpneq_, v16qi, v8hi, v4si) +VAR3 (BINOP_PRED_NONE_NONE, vcmpneq_, v16qi, v8hi, v4si) VAR3 (BINOP_NONE_NONE_NONE, vshlq_s, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_NONE, vshlq_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vsubq_u, v16qi, v8hi, v4si) @@ -117,9 +117,9 @@ VAR3 (BINOP_UNONE_UNONE_UNONE, vhsubq_n_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vhaddq_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vhaddq_n_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, veorq_u, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_UNONE_UNONE, vcmphiq_, v16qi, v8hi, v4si) +VAR3 (BINOP_PRED_UNONE_UNONE, vcmphiq_, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vcmphiq_n_, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpcsq_, v16qi, v8hi, v4si) +VAR3 (BINOP_PRED_UNONE_UNONE, vcmpcsq_, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpcsq_n_, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vbicq_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vandq_u, v16qi, v8hi, v4si) @@ -143,15 +143,15 @@ VAR3 (BINOP_UNONE_UNONE_IMM, vshlq_n_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_IMM, vrshrq_n_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_IMM, vqshlq_n_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_NONE_NONE, vcmpneq_n_, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_NONE_NONE, vcmpltq_, v16qi, v8hi, v4si) +VAR3 (BINOP_PRED_NONE_NONE, vcmpltq_, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_NONE_NONE, vcmpltq_n_, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_NONE_NONE, vcmpleq_, v16qi, v8hi, v4si) +VAR3 (BINOP_PRED_NONE_NONE, vcmpleq_, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_NONE_NONE, vcmpleq_n_, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_NONE_NONE, vcmpgtq_, v16qi, v8hi, v4si) +VAR3 (BINOP_PRED_NONE_NONE, vcmpgtq_, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_NONE_NONE, vcmpgtq_n_, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_NONE_NONE, vcmpgeq_, v16qi, v8hi, v4si) +VAR3 
(BINOP_PRED_NONE_NONE, vcmpgeq_, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_NONE_NONE, vcmpgeq_n_, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_NONE_NONE, vcmpeqq_, v16qi, v8hi, v4si) +VAR3 (BINOP_PRED_NONE_NONE, vcmpeqq_, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_NONE_NONE, vcmpeqq_n_, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_NONE_IMM, vqshluq_n_s, v16qi, v8hi, v4si) VAR3 (BINOP_NONE_NONE_UNONE, vaddvq_p_s, v16qi, v8hi, v4si) @@ -219,17 +219,17 @@ VAR2 (BINOP_UNONE_UNONE_IMM, vshllbq_n_u, v16qi, v8hi) VAR2 (BINOP_UNONE_UNONE_IMM, vorrq_n_u, v8hi, v4si) VAR2 (BINOP_UNONE_UNONE_IMM, vbicq_n_u, v8hi, v4si) VAR2 (BINOP_UNONE_NONE_NONE, vcmpneq_n_f, v8hf, v4sf) -VAR2 (BINOP_UNONE_NONE_NONE, vcmpneq_f, v8hf, v4sf) +VAR2 (BINOP_PRED_NONE_NONE, vcmpneq_f, v8hf, v4sf) VAR2 (BINOP_UNONE_NONE_NONE, vcmpltq_n_f, v8hf, v4sf) -VAR2 (BINOP_UNONE_NONE_NONE, vcmpltq_f, v8hf, v4sf) +VAR2 (BINOP_PRED_NONE_NONE, vcmpltq_f, v8hf, v4sf) VAR2 (BINOP_UNONE_NONE_NONE, vcmpleq_n_f, v8hf, v4sf) -VAR2 (BINOP_UNONE_NONE_NONE, vcmpleq_f, v8hf, v4sf) +VAR2 (BINOP_PRED_NONE_NONE, vcmpleq_f, v8hf, v4sf) VAR2 (BINOP_UNONE_NONE_NONE, vcmpgtq_n_f, v8hf, v4sf) -VAR2 (BINOP_UNONE_NONE_NONE, vcmpgtq_f, v8hf, v4sf) +VAR2 (BINOP_PRED_NONE_NONE, vcmpgtq_f, v8hf, v4sf) VAR2 (BINOP_UNONE_NONE_NONE, vcmpgeq_n_f, v8hf, v4sf) -VAR2 (BINOP_UNONE_NONE_NONE, vcmpgeq_f, v8hf, v4sf) +VAR2 (BINOP_PRED_NONE_NONE, vcmpgeq_f, v8hf, v4sf) VAR2 (BINOP_UNONE_NONE_NONE, vcmpeqq_n_f, v8hf, v4sf) -VAR2 (BINOP_UNONE_NONE_NONE, vcmpeqq_f, v8hf, v4sf) +VAR2 (BINOP_PRED_NONE_NONE, vcmpeqq_f, v8hf, v4sf) VAR2 (BINOP_NONE_NONE_NONE, vsubq_f, v8hf, v4sf) VAR2 (BINOP_NONE_NONE_NONE, vqmovntq_s, v8hi, v4si) VAR2 (BINOP_NONE_NONE_NONE, vqmovnbq_s, v8hi, v4si) @@ -295,8 +295,8 @@ VAR2 (TERNOP_UNONE_UNONE_NONE_UNONE, vcvtaq_m_u, v8hi, v4si) VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vcvtaq_m_s, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_IMM, vshlcq_vec_u, v16qi, v8hi, v4si) VAR3 (TERNOP_NONE_NONE_UNONE_IMM, vshlcq_vec_s, v16qi, v8hi, v4si) -VAR4 (TERNOP_UNONE_UNONE_UNONE_UNONE, vpselq_u, v16qi, v8hi, v4si, v2di) -VAR4 (TERNOP_NONE_NONE_NONE_UNONE, vpselq_s, v16qi, v8hi, v4si, v2di) +VAR4 (TERNOP_UNONE_UNONE_UNONE_PRED, vpselq_u, v16qi, v8hi, v4si, v2di) +VAR4 (TERNOP_NONE_NONE_NONE_PRED, vpselq_s, v16qi, v8hi, v4si, v2di) VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vrev64q_m_u, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vmvnq_m_u, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vmlasq_n_u, v16qi, v8hi, v4si) @@ -426,7 +426,7 @@ VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vrev64q_m_f, v8hf, v4sf) VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vrev32q_m_s, v16qi, v8hi) VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vqmovntq_m_s, v8hi, v4si) VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vqmovnbq_m_s, v8hi, v4si) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vpselq_f, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vpselq_f, v8hf, v4sf) VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vnegq_m_f, v8hf, v4sf) VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vmovntq_m_s, v8hi, v4si) VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vmovnbq_m_s, v8hi, v4si) diff --git a/gcc/config/arm/constraints.md b/gcc/config/arm/constraints.md index 1920004..2b411b0 100644 --- a/gcc/config/arm/constraints.md +++ b/gcc/config/arm/constraints.md @@ -312,6 +312,12 @@ (and (match_code "const_vector") (match_test "(TARGET_NEON || TARGET_HAVE_MVE) && op == CONST0_RTX (mode)"))) +(define_constraint "DB" + "@internal + In ARM/Thumb-2 state with MVE a constant vector of booleans." 
+ (and (match_code "const_vector") + (match_test "TARGET_HAVE_MVE && GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL"))) + (define_constraint "Da" "@internal In ARM/Thumb-2 state a const_int, const_double or const_vector that can diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index 8202c27..37cf797 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -272,6 +272,8 @@ (define_mode_iterator MVE_2 [V16QI V8HI V4SI]) (define_mode_iterator MVE_5 [V8HI V4SI]) (define_mode_iterator MVE_6 [V8HI V4SI]) +(define_mode_iterator MVE_7 [V16BI V8BI V4BI]) +(define_mode_iterator MVE_7_HI [HI V16BI V8BI V4BI]) ;;---------------------------------------------------------------------------- ;; Code iterators @@ -946,6 +948,10 @@ (V8HF "u16") (V4SF "32")]) (define_mode_attr earlyclobber_32 [(V16QI "=w") (V8HI "=w") (V4SI "=&w") (V8HF "=w") (V4SF "=&w")]) +(define_mode_attr MVE_VPRED [(V16QI "V16BI") (V8HI "V8BI") (V4SI "V4BI") + (V2DI "HI") (V8HF "V8BI") (V4SF "V4BI")]) +(define_mode_attr MVE_vpred [(V16QI "v16bi") (V8HI "v8bi") (V4SI "v4bi") + (V2DI "hi") (V8HF "v8bi") (V4SF "v4bi")]) ;;---------------------------------------------------------------------------- ;; Code attributes diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index 5c3b34d..983aa10 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -839,8 +839,8 @@ ;; (define_insn "@mve_vcmpq_" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (MVE_COMPARISONS:HI (match_operand:MVE_2 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (MVE_COMPARISONS: (match_operand:MVE_2 1 "s_register_operand" "w") (match_operand:MVE_2 2 "s_register_operand" "w"))) ] "TARGET_HAVE_MVE" @@ -1929,8 +1929,8 @@ ;; (define_insn "@mve_vcmpq_f" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (MVE_FP_COMPARISONS:HI (match_operand:MVE_0 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (MVE_FP_COMPARISONS: (match_operand:MVE_0 1 "s_register_operand" "w") (match_operand:MVE_0 2 "s_register_operand" "w"))) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -3324,7 +3324,7 @@ (set (match_operand:MVE_1 0 "s_register_operand" "=w") (unspec:MVE_1 [(match_operand:MVE_1 1 "s_register_operand" "w") (match_operand:MVE_1 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VPSELQ)) ] "TARGET_HAVE_MVE" @@ -4419,7 +4419,7 @@ (set (match_operand:MVE_0 0 "s_register_operand" "=w") (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VPSELQ_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -10516,3 +10516,14 @@ "vldr.\t%q0, %E1" [(set_attr "type" "mve_load")] ) + +;; Expander for VxBI moves +(define_expand "mov" + [(set (match_operand:MVE_7 0 "nonimmediate_operand") + (match_operand:MVE_7 1 "general_operand"))] + "TARGET_HAVE_MVE" + { + if (!register_operand (operands[0], mode)) + operands[1] = force_reg (mode, operands[1]); + } +) diff --git a/gcc/config/arm/vfp.md b/gcc/config/arm/vfp.md index f5ccb92..f00d1ca 100644 --- a/gcc/config/arm/vfp.md +++ b/gcc/config/arm/vfp.md @@ -73,21 +73,26 @@ (define_insn "*thumb2_movhi_vfp" [(set - (match_operand:HI 0 "nonimmediate_operand" + (match_operand:MVE_7_HI 0 "nonimmediate_operand" "=rk, r, l, r, m, r, *t, r, *t, Up, r") - (match_operand:HI 1 "general_operand" - 
"rk, I, Py, n, r, m, r, *t, *t, r, Up"))] + (match_operand:MVE_7_HI 1 "general_operand" + "rk, IDB, Py, n, r, m, r, *t, *t, r, Up"))] "TARGET_THUMB2 && TARGET_VFP_BASE && !TARGET_VFP_FP16INST - && (register_operand (operands[0], HImode) - || register_operand (operands[1], HImode))" + && (register_operand (operands[0], mode) + || register_operand (operands[1], mode))" { switch (which_alternative) { case 0: - case 1: case 2: return "mov%?\t%0, %1\t%@ movhi"; + case 1: + if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_VECTOR_BOOL) + operands[1] = mve_const_bool_vec_to_hi (operands[1]); + else + operands[1] = gen_lowpart (HImode, operands[1]); + return "mov%?\t%0, %1\t%@ movhi"; case 3: return "movw%?\t%0, %L1\t%@ movhi"; case 4: @@ -173,20 +178,25 @@ (define_insn "*thumb2_movhi_fp16" [(set - (match_operand:HI 0 "nonimmediate_operand" + (match_operand:MVE_7_HI 0 "nonimmediate_operand" "=rk, r, l, r, m, r, *t, r, *t, Up, r") - (match_operand:HI 1 "general_operand" - "rk, I, Py, n, r, m, r, *t, *t, r, Up"))] + (match_operand:MVE_7_HI 1 "general_operand" + "rk, IDB, Py, n, r, m, r, *t, *t, r, Up"))] "TARGET_THUMB2 && (TARGET_VFP_FP16INST || TARGET_HAVE_MVE) - && (register_operand (operands[0], HImode) - || register_operand (operands[1], HImode))" + && (register_operand (operands[0], mode) + || register_operand (operands[1], mode))" { switch (which_alternative) { case 0: - case 1: case 2: return "mov%?\t%0, %1\t%@ movhi"; + case 1: + if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_VECTOR_BOOL) + operands[1] = mve_const_bool_vec_to_hi (operands[1]); + else + operands[1] = gen_lowpart (HImode, operands[1]); + return "mov%?\t%0, %1\t%@ movhi"; case 3: return "movw%?\t%0, %L1\t%@ movhi"; case 4: -- cgit v1.1 From df0e57c2c032cea0f77f2e68231c035f282b26d6 Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Wed, 20 Oct 2021 15:30:16 +0000 Subject: arm: Fix vcond_mask expander for MVE (PR target/100757) The problem in this PR is that we call VPSEL with a mask of vector type instead of HImode. This happens because operand 3 in vcond_mask is the pre-computed vector comparison and has vector type. This patch fixes it by implementing TARGET_VECTORIZE_GET_MASK_MODE, returning the appropriate VxBI mode when targeting MVE. In turn, this implies implementing vec_cmp, vec_cmpu and vcond_mask_, and we can move vec_cmp, vec_cmpu and vcond_mask_ back to neon.md since they are not used by MVE anymore. The new * patterns listed above are implemented in mve.md since they are only valid for MVE. However this may make maintenance/comparison more painful than having all of them in vec-common.md. In the process, we can get rid of the recently added vcond_mve parameter of arm_expand_vector_compare. Compared to neon.md's vcond_mask_ before my "arm: Auto-vectorization for MVE: vcmp" patch (r12-834), it keeps the VDQWH iterator added in r12-835 (to have V4HF/V8HF support), as well as the (! || flag_unsafe_math_optimizations) condition which was not present before r12-834 although SF modes were enabled by VDQW (I think this was a bug). Using TARGET_VECTORIZE_GET_MASK_MODE has the advantage that we no longer need to generate vpsel with vectors of 0 and 1: the masks are now merged via scalar 'ands' instructions operating on 16-bit masks after converting the boolean vectors. In addition, this patch fixes a problem in arm_expand_vcond() where the result would be a vector of 0 or 1 instead of operand 1 or 2. Since we want to skip gcc.dg/signbit-2.c for MVE, we also add a new arm_mve effective target. 
Reducing the number of iterations in pr100757-3.c from 32 to 8, we generate the code below: float a[32]; float fn1(int d) { float c = 4.0f; for (int b = 0; b < 8; b++) if (a[b] != 2.0f) c = 5.0f; return c; } fn1: ldr r3, .L3+48 vldr.64 d4, .L3 // q2=(2.0,2.0,2.0,2.0) vldr.64 d5, .L3+8 vldrw.32 q0, [r3] // q0=a(0..3) adds r3, r3, #16 vcmp.f32 eq, q0, q2 // cmp a(0..3) == (2.0,2.0,2.0,2.0) vldrw.32 q1, [r3] // q1=a(4..7) vmrs r3, P0 vcmp.f32 eq, q1, q2 // cmp a(4..7) == (2.0,2.0,2.0,2.0) vmrs r2, P0 @ movhi ands r3, r3, r2 // r3=select(a(0..3]) & select(a(4..7)) vldr.64 d4, .L3+16 // q2=(5.0,5.0,5.0,5.0) vldr.64 d5, .L3+24 vmsr P0, r3 vldr.64 d6, .L3+32 // q3=(4.0,4.0,4.0,4.0) vldr.64 d7, .L3+40 vpsel q3, q3, q2 // q3=vcond_mask(4.0,5.0) vmov.32 r2, q3[1] // keep the scalar max vmov.32 r0, q3[3] vmov.32 r3, q3[2] vmov.f32 s11, s12 vmov s15, r2 vmov s14, r3 vmaxnm.f32 s15, s11, s15 vmaxnm.f32 s15, s15, s14 vmov s14, r0 vmaxnm.f32 s15, s15, s14 vmov r0, s15 bx lr .L4: .align 3 .L3: .word 1073741824 // 2.0f .word 1073741824 .word 1073741824 .word 1073741824 .word 1084227584 // 5.0f .word 1084227584 .word 1084227584 .word 1084227584 .word 1082130432 // 4.0f .word 1082130432 .word 1082130432 .word 1082130432 This patch adds tests that trigger an ICE without this fix. The pr100757*.c testcases are derived from gcc.c-torture/compile/20160205-1.c, forcing the use of MVE, and using various types and return values different from 0 and 1 to avoid commonalization with boolean masks. In addition, since we should not need these masks, the tests make sure they are not present. Most of the work of this patch series was carried out while I was working at STMicroelectronics as a Linaro assignee. 2022-02-22 Christophe Lyon PR target/100757 gcc/ * config/arm/arm-protos.h (arm_get_mask_mode): New prototype. (arm_expand_vector_compare): Update prototype. * config/arm/arm.cc (TARGET_VECTORIZE_GET_MASK_MODE): New. (arm_vector_mode_supported_p): Add support for VxBI modes. (arm_expand_vector_compare): Remove useless generation of vpsel. (arm_expand_vcond): Fix select operands. (arm_get_mask_mode): New. * config/arm/mve.md (vec_cmp): New. (vec_cmpu): New. (vcond_mask_): New. * config/arm/vec-common.md (vec_cmp) (vec_cmpu): Move to ... * config/arm/neon.md (vec_cmp) (vec_cmpu): ... here and disable for MVE. * doc/sourcebuild.texi (arm_mve): Document new effective-target. gcc/testsuite/ PR target/100757 * gcc.target/arm/simd/pr100757-2.c: New. * gcc.target/arm/simd/pr100757-3.c: New. * gcc.target/arm/simd/pr100757-4.c: New. * gcc.target/arm/simd/pr100757.c: New. * gcc.dg/signbit-2.c: Skip when targeting ARM/MVE. * lib/target-supports.exp (check_effective_target_arm_mve): New. 
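The lane-count-to-predicate-mode mapping that arm_get_mask_mode relies on can be summarized by the following stand-alone sketch (illustrative C only; the real hook returns an opt_machine_mode and falls back to default_get_mask_mode when MVE is not enabled):

#include <stdio.h>

static const char *
mve_pred_mode (int nunits)
{
  switch (nunits)
    {
    case 16: return "V16BI";
    case 8:  return "V8BI";
    case 4:  return "V4BI";
    default: return "(no MVE predicate mode)";
    }
}

int
main (void)
{
  printf ("V16QI -> %s\n", mve_pred_mode (16));
  printf ("V8HI  -> %s\n", mve_pred_mode (8));
  printf ("V4SF  -> %s\n", mve_pred_mode (4));
  return 0;
}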
--- gcc/config/arm/arm-protos.h | 3 +- gcc/config/arm/arm.cc | 117 +++++++++++++++---------------------------- gcc/config/arm/mve.md | 51 +++++++++++++++++++ gcc/config/arm/neon.md | 39 +++++++++++++++ gcc/config/arm/vec-common.md | 52 ------------------- 5 files changed, 132 insertions(+), 130 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h index f2f7ca6..9d14209 100644 --- a/gcc/config/arm/arm-protos.h +++ b/gcc/config/arm/arm-protos.h @@ -204,6 +204,7 @@ extern void arm_init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree); extern bool arm_pad_reg_upward (machine_mode, tree, int); #endif extern int arm_apply_result_size (void); +extern opt_machine_mode arm_get_mask_mode (machine_mode mode); #endif /* RTX_CODE */ @@ -380,7 +381,7 @@ extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx, extern bool arm_fusion_enabled_p (tune_params::fuse_ops); extern bool arm_valid_symbolic_address_p (rtx); extern bool arm_validize_comparison (rtx *, rtx *, rtx *); -extern bool arm_expand_vector_compare (rtx, rtx_code, rtx, rtx, bool, bool); +extern bool arm_expand_vector_compare (rtx, rtx_code, rtx, rtx, bool); #endif /* RTX_CODE */ extern bool arm_gen_setmem (rtx *); diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc index df43c67..c1103d9 100644 --- a/gcc/config/arm/arm.cc +++ b/gcc/config/arm/arm.cc @@ -832,6 +832,9 @@ static const struct attribute_spec arm_attribute_table[] = #undef TARGET_STACK_PROTECT_GUARD #define TARGET_STACK_PROTECT_GUARD arm_stack_protect_guard + +#undef TARGET_VECTORIZE_GET_MASK_MODE +#define TARGET_VECTORIZE_GET_MASK_MODE arm_get_mask_mode /* Obstack for minipool constant handling. */ static struct obstack minipool_obstack; @@ -29286,7 +29289,8 @@ arm_vector_mode_supported_p (machine_mode mode) if (TARGET_HAVE_MVE && (mode == V2DImode || mode == V4SImode || mode == V8HImode - || mode == V16QImode)) + || mode == V16QImode + || mode == V16BImode || mode == V8BImode || mode == V4BImode)) return true; if (TARGET_HAVE_MVE_FLOAT @@ -31085,7 +31089,7 @@ arm_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem, } /* Return the mode for the MVE vector of predicates corresponding to MODE. */ -machine_mode +opt_machine_mode arm_mode_to_pred_mode (machine_mode mode) { switch (GET_MODE_NUNITS (mode)) @@ -31094,7 +31098,7 @@ arm_mode_to_pred_mode (machine_mode mode) case 8: return V8BImode; case 4: return V4BImode; } - gcc_unreachable (); + return opt_machine_mode (); } /* Expand code to compare vectors OP0 and OP1 using condition CODE. @@ -31102,16 +31106,12 @@ arm_mode_to_pred_mode (machine_mode mode) and return true if TARGET contains the inverse. If !CAN_INVERT, always store the result in TARGET, never its inverse. - If VCOND_MVE, do not emit the vpsel instruction here, let arm_expand_vcond do - it with the right destination type to avoid emiting two vpsel, one here and - one in arm_expand_vcond. - Note that the handling of floating-point comparisons is not IEEE compliant. */ bool arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1, - bool can_invert, bool vcond_mve) + bool can_invert) { machine_mode cmp_result_mode = GET_MODE (target); machine_mode cmp_mode = GET_MODE (op0); @@ -31140,7 +31140,7 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1, and then store its inverse in TARGET. This avoids reusing TARGET (which for integer NE could be one of the inputs). 
*/ rtx tmp = gen_reg_rtx (cmp_result_mode); - if (arm_expand_vector_compare (tmp, code, op0, op1, true, vcond_mve)) + if (arm_expand_vector_compare (tmp, code, op0, op1, true)) gcc_unreachable (); emit_insn (gen_rtx_SET (target, gen_rtx_NOT (cmp_result_mode, tmp))); return false; @@ -31176,36 +31176,22 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1, case NE: if (TARGET_HAVE_MVE) { - rtx vpr_p0; - if (vcond_mve) - vpr_p0 = target; - else - vpr_p0 = gen_reg_rtx (arm_mode_to_pred_mode (cmp_mode)); - switch (GET_MODE_CLASS (cmp_mode)) { case MODE_VECTOR_INT: - emit_insn (gen_mve_vcmpq (code, cmp_mode, vpr_p0, op0, force_reg (cmp_mode, op1))); + emit_insn (gen_mve_vcmpq (code, cmp_mode, target, + op0, force_reg (cmp_mode, op1))); break; case MODE_VECTOR_FLOAT: if (TARGET_HAVE_MVE_FLOAT) - emit_insn (gen_mve_vcmpq_f (code, cmp_mode, vpr_p0, op0, force_reg (cmp_mode, op1))); + emit_insn (gen_mve_vcmpq_f (code, cmp_mode, target, + op0, force_reg (cmp_mode, op1))); else gcc_unreachable (); break; default: gcc_unreachable (); } - - /* If we are not expanding a vcond, build the result here. */ - if (!vcond_mve) - { - rtx zero = gen_reg_rtx (cmp_result_mode); - rtx one = gen_reg_rtx (cmp_result_mode); - emit_move_insn (zero, CONST0_RTX (cmp_result_mode)); - emit_move_insn (one, CONST1_RTX (cmp_result_mode)); - emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, target, one, zero, vpr_p0)); - } } else emit_insn (gen_neon_vc (code, cmp_mode, target, op0, op1)); @@ -31217,23 +31203,8 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1, case GEU: case GTU: if (TARGET_HAVE_MVE) - { - rtx vpr_p0; - if (vcond_mve) - vpr_p0 = target; - else - vpr_p0 = gen_reg_rtx (arm_mode_to_pred_mode (cmp_mode)); - - emit_insn (gen_mve_vcmpq (code, cmp_mode, vpr_p0, op0, force_reg (cmp_mode, op1))); - if (!vcond_mve) - { - rtx zero = gen_reg_rtx (cmp_result_mode); - rtx one = gen_reg_rtx (cmp_result_mode); - emit_move_insn (zero, CONST0_RTX (cmp_result_mode)); - emit_move_insn (one, CONST1_RTX (cmp_result_mode)); - emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, target, one, zero, vpr_p0)); - } - } + emit_insn (gen_mve_vcmpq (code, cmp_mode, target, + op0, force_reg (cmp_mode, op1))); else emit_insn (gen_neon_vc (code, cmp_mode, target, op0, force_reg (cmp_mode, op1))); @@ -31244,23 +31215,8 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1, case LEU: case LTU: if (TARGET_HAVE_MVE) - { - rtx vpr_p0; - if (vcond_mve) - vpr_p0 = target; - else - vpr_p0 = gen_reg_rtx (arm_mode_to_pred_mode (cmp_mode)); - - emit_insn (gen_mve_vcmpq (swap_condition (code), cmp_mode, vpr_p0, force_reg (cmp_mode, op1), op0)); - if (!vcond_mve) - { - rtx zero = gen_reg_rtx (cmp_result_mode); - rtx one = gen_reg_rtx (cmp_result_mode); - emit_move_insn (zero, CONST0_RTX (cmp_result_mode)); - emit_move_insn (one, CONST1_RTX (cmp_result_mode)); - emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, target, one, zero, vpr_p0)); - } - } + emit_insn (gen_mve_vcmpq (swap_condition (code), cmp_mode, target, + force_reg (cmp_mode, op1), op0)); else emit_insn (gen_neon_vc (swap_condition (code), cmp_mode, target, force_reg (cmp_mode, op1), op0)); @@ -31275,8 +31231,8 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1, rtx gt_res = gen_reg_rtx (cmp_result_mode); rtx alt_res = gen_reg_rtx (cmp_result_mode); rtx_code alt_code = (code == LTGT ? 
LT : LE); - if (arm_expand_vector_compare (gt_res, GT, op0, op1, true, vcond_mve) - || arm_expand_vector_compare (alt_res, alt_code, op0, op1, true, vcond_mve)) + if (arm_expand_vector_compare (gt_res, GT, op0, op1, true) + || arm_expand_vector_compare (alt_res, alt_code, op0, op1, true)) gcc_unreachable (); emit_insn (gen_rtx_SET (target, gen_rtx_IOR (cmp_result_mode, gt_res, alt_res))); @@ -31296,19 +31252,15 @@ arm_expand_vcond (rtx *operands, machine_mode cmp_result_mode) { /* When expanding for MVE, we do not want to emit a (useless) vpsel in arm_expand_vector_compare, and another one here. */ - bool vcond_mve=false; rtx mask; if (TARGET_HAVE_MVE) - { - vcond_mve=true; - mask = gen_reg_rtx (arm_mode_to_pred_mode (cmp_result_mode)); - } + mask = gen_reg_rtx (arm_mode_to_pred_mode (cmp_result_mode).require ()); else mask = gen_reg_rtx (cmp_result_mode); bool inverted = arm_expand_vector_compare (mask, GET_CODE (operands[3]), - operands[4], operands[5], true, vcond_mve); + operands[4], operands[5], true); if (inverted) std::swap (operands[1], operands[2]); if (TARGET_NEON) @@ -31316,20 +31268,20 @@ arm_expand_vcond (rtx *operands, machine_mode cmp_result_mode) mask, operands[1], operands[2])); else { - machine_mode cmp_mode = GET_MODE (operands[4]); - rtx vpr_p0 = mask; - rtx zero = gen_reg_rtx (cmp_mode); - rtx one = gen_reg_rtx (cmp_mode); - emit_move_insn (zero, CONST0_RTX (cmp_mode)); - emit_move_insn (one, CONST1_RTX (cmp_mode)); + machine_mode cmp_mode = GET_MODE (operands[0]); + switch (GET_MODE_CLASS (cmp_mode)) { case MODE_VECTOR_INT: - emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, operands[0], one, zero, vpr_p0)); + emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_mode, operands[0], + operands[1], operands[2], mask)); break; case MODE_VECTOR_FLOAT: if (TARGET_HAVE_MVE_FLOAT) - emit_insn (gen_mve_vpselq_f (cmp_mode, operands[0], one, zero, vpr_p0)); + emit_insn (gen_mve_vpselq_f (cmp_mode, operands[0], + operands[1], operands[2], mask)); + else + gcc_unreachable (); break; default: gcc_unreachable (); @@ -34251,4 +34203,15 @@ arm_mode_base_reg_class (machine_mode mode) struct gcc_target targetm = TARGET_INITIALIZER; +/* Implement TARGET_VECTORIZE_GET_MASK_MODE. */ + +opt_machine_mode +arm_get_mask_mode (machine_mode mode) +{ + if (TARGET_HAVE_MVE) + return arm_mode_to_pred_mode (mode); + + return default_get_mask_mode (mode); +} + #include "gt-arm.h" diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index 983aa10..d0c3100 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -10527,3 +10527,54 @@ operands[1] = force_reg (mode, operands[1]); } ) + +;; Expanders for vec_cmp and vcond + +(define_expand "vec_cmp" + [(set (match_operand: 0 "s_register_operand") + (match_operator: 1 "comparison_operator" + [(match_operand:MVE_VLD_ST 2 "s_register_operand") + (match_operand:MVE_VLD_ST 3 "reg_or_zero_operand")]))] + "TARGET_HAVE_MVE + && (! 
|| flag_unsafe_math_optimizations)" +{ + arm_expand_vector_compare (operands[0], GET_CODE (operands[1]), + operands[2], operands[3], false); + DONE; +}) + +(define_expand "vec_cmpu" + [(set (match_operand: 0 "s_register_operand") + (match_operator: 1 "comparison_operator" + [(match_operand:MVE_2 2 "s_register_operand") + (match_operand:MVE_2 3 "reg_or_zero_operand")]))] + "TARGET_HAVE_MVE" +{ + arm_expand_vector_compare (operands[0], GET_CODE (operands[1]), + operands[2], operands[3], false); + DONE; +}) + +(define_expand "vcond_mask_" + [(set (match_operand:MVE_VLD_ST 0 "s_register_operand") + (if_then_else:MVE_VLD_ST + (match_operand: 3 "s_register_operand") + (match_operand:MVE_VLD_ST 1 "s_register_operand") + (match_operand:MVE_VLD_ST 2 "s_register_operand")))] + "TARGET_HAVE_MVE" +{ + switch (GET_MODE_CLASS (mode)) + { + case MODE_VECTOR_INT: + emit_insn (gen_mve_vpselq (VPSELQ_S, mode, operands[0], + operands[1], operands[2], operands[3])); + break; + case MODE_VECTOR_FLOAT: + emit_insn (gen_mve_vpselq_f (mode, operands[0], + operands[1], operands[2], operands[3])); + break; + default: + gcc_unreachable (); + } + DONE; +}) diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 2b9a3de..f270ded 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -1394,6 +1394,45 @@ [(set_attr "type" "neon_qsub")] ) +(define_expand "vec_cmp" + [(set (match_operand: 0 "s_register_operand") + (match_operator: 1 "comparison_operator" + [(match_operand:VDQWH 2 "s_register_operand") + (match_operand:VDQWH 3 "reg_or_zero_operand")]))] + "TARGET_NEON + && (! || flag_unsafe_math_optimizations)" +{ + arm_expand_vector_compare (operands[0], GET_CODE (operands[1]), + operands[2], operands[3], false); + DONE; +}) + +(define_expand "vec_cmpu" + [(set (match_operand:VDQIW 0 "s_register_operand") + (match_operator:VDQIW 1 "comparison_operator" + [(match_operand:VDQIW 2 "s_register_operand") + (match_operand:VDQIW 3 "reg_or_zero_operand")]))] + "TARGET_NEON" +{ + arm_expand_vector_compare (operands[0], GET_CODE (operands[1]), + operands[2], operands[3], false); + DONE; +}) + +(define_expand "vcond_mask_" + [(set (match_operand:VDQWH 0 "s_register_operand") + (if_then_else:VDQWH + (match_operand: 3 "s_register_operand") + (match_operand:VDQWH 1 "s_register_operand") + (match_operand:VDQWH 2 "s_register_operand")))] + "TARGET_NEON + && (! || flag_unsafe_math_optimizations)" +{ + emit_insn (gen_neon_vbsl (operands[0], operands[3], operands[1], + operands[2])); + DONE; +}) + ;; Patterns for builtins. ; good for plain vadd, vaddq. diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md index 2718d82..f130090 100644 --- a/gcc/config/arm/vec-common.md +++ b/gcc/config/arm/vec-common.md @@ -363,33 +363,6 @@ } }) -(define_expand "vec_cmp" - [(set (match_operand: 0 "s_register_operand") - (match_operator: 1 "comparison_operator" - [(match_operand:VDQWH 2 "s_register_operand") - (match_operand:VDQWH 3 "reg_or_zero_operand")]))] - "ARM_HAVE__ARITH - && !TARGET_REALLY_IWMMXT - && (! 
|| flag_unsafe_math_optimizations)"
-{
-  arm_expand_vector_compare (operands[0], GET_CODE (operands[1]),
-                             operands[2], operands[3], false, false);
-  DONE;
-})
-
-(define_expand "vec_cmpu"
-  [(set (match_operand:VDQIW 0 "s_register_operand")
-        (match_operator:VDQIW 1 "comparison_operator"
-          [(match_operand:VDQIW 2 "s_register_operand")
-           (match_operand:VDQIW 3 "reg_or_zero_operand")]))]
-  "ARM_HAVE__ARITH
-   && !TARGET_REALLY_IWMMXT"
-{
-  arm_expand_vector_compare (operands[0], GET_CODE (operands[1]),
-                             operands[2], operands[3], false, false);
-  DONE;
-})
-
 ;; Conditional instructions.  These are comparisons with conditional moves for
 ;; vectors.  They perform the assignment:
 ;;
@@ -461,31 +434,6 @@
   DONE;
 })
 
-(define_expand "vcond_mask_"
-  [(set (match_operand:VDQWH 0 "s_register_operand")
-        (if_then_else:VDQWH
-          (match_operand: 3 "s_register_operand")
-          (match_operand:VDQWH 1 "s_register_operand")
-          (match_operand:VDQWH 2 "s_register_operand")))]
-  "ARM_HAVE__ARITH
-   && !TARGET_REALLY_IWMMXT
-   && (! || flag_unsafe_math_optimizations)"
-{
-  if (TARGET_NEON)
-    {
-      emit_insn (gen_neon_vbsl (mode, operands[0], operands[3],
-                                operands[1], operands[2]));
-    }
-  else if (TARGET_HAVE_MVE)
-    {
-      emit_insn (gen_mve_vpselq (VPSELQ_S, mode, operands[0],
-                                 operands[1], operands[2], operands[3]));
-    }
-  else
-    gcc_unreachable ();
-  DONE;
-})
-
 (define_expand "vec_load_lanesoi"
   [(set (match_operand:OI 0 "s_register_operand")
         (unspec:OI [(match_operand:OI 1 "neon_struct_operand")
-- 
cgit v1.1

From e6a4aefce8e47a7d3ba781066a1410ebfa963e59 Mon Sep 17 00:00:00 2001
From: Christophe Lyon
Date: Wed, 13 Oct 2021 09:16:35 +0000
Subject: arm: Convert remaining MVE vcmp builtins to predicate qualifiers

This is mostly a mechanical change, only tested by the intrinsics
expansion tests.

Most of the work of this patch series was carried out while I was
working at STMicroelectronics as a Linaro assignee.

2022-02-22  Christophe Lyon

gcc/
        PR target/100757
        PR target/101325
        * config/arm/arm-builtins.cc (BINOP_UNONE_NONE_NONE_QUALIFIERS):
        Delete.
        (TERNOP_UNONE_NONE_NONE_UNONE_QUALIFIERS): Change to ...
        (TERNOP_PRED_NONE_NONE_PRED_QUALIFIERS): ... this.
        (TERNOP_PRED_UNONE_UNONE_PRED_QUALIFIERS): New.
        * config/arm/arm_mve_builtins.def (vcmp*q_n_, vcmp*q_m_f): Use new
        predicated qualifiers.
        * config/arm/mve.md (mve_vcmpq_n_)
        (mve_vcmp*q_m_f): Use MVE_VPRED instead of HI.
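At the source level, the builtins converted here back the predicated
ACLE MVE intrinsics, where both the incoming mask and the comparison
result are predicates rather than plain unsigned integers, which is
exactly what the new *_PRED qualifiers describe.  A minimal usage
sketch (an illustration, not part of the patch; it assumes arm_mve.h
and an MVE-enabled compiler, e.g. -march=armv8.1-m.main+mve.fp
-mfloat-abi=hard):

    /* Minimal sketch of the intrinsics backed by the retyped builtins.  */
    #include <arm_mve.h>

    float32x4_t
    select_eq (float32x4_t a, float32x4_t b,
               float32x4_t x, float32x4_t y, mve_pred16_t p)
    {
      /* Predicated compare: mask input and result are both predicates,
         now modelled with predicate qualifiers instead of unsigned HImode.  */
      mve_pred16_t m = vcmpeqq_m_f32 (a, b, p);
      /* Lane select under the predicate.  */
      return vpselq_f32 (x, y, m);
    }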
--- gcc/config/arm/arm-builtins.cc | 21 +++-- gcc/config/arm/arm_mve_builtins.def | 92 +++++++++---------- gcc/config/arm/mve.md | 176 ++++++++++++++++++------------------ 3 files changed, 145 insertions(+), 144 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm-builtins.cc b/gcc/config/arm/arm-builtins.cc index 1c6b9c9..02411c6 100644 --- a/gcc/config/arm/arm-builtins.cc +++ b/gcc/config/arm/arm-builtins.cc @@ -439,12 +439,6 @@ arm_binop_none_none_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] (arm_binop_none_none_unone_qualifiers) static enum arm_type_qualifiers -arm_binop_unone_none_none_qualifiers[SIMD_MAX_BUILTIN_ARGS] - = { qualifier_unsigned, qualifier_none, qualifier_none }; -#define BINOP_UNONE_NONE_NONE_QUALIFIERS \ - (arm_binop_unone_none_none_qualifiers) - -static enum arm_type_qualifiers arm_binop_pred_none_none_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_predicate, qualifier_none, qualifier_none }; #define BINOP_PRED_NONE_NONE_QUALIFIERS \ @@ -504,10 +498,10 @@ arm_ternop_unone_unone_imm_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] (arm_ternop_unone_unone_imm_unone_qualifiers) static enum arm_type_qualifiers -arm_ternop_unone_none_none_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] - = { qualifier_unsigned, qualifier_none, qualifier_none, qualifier_unsigned }; -#define TERNOP_UNONE_NONE_NONE_UNONE_QUALIFIERS \ - (arm_ternop_unone_none_none_unone_qualifiers) +arm_ternop_pred_none_none_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_predicate, qualifier_none, qualifier_none, qualifier_predicate }; +#define TERNOP_PRED_NONE_NONE_PRED_QUALIFIERS \ + (arm_ternop_pred_none_none_pred_qualifiers) static enum arm_type_qualifiers arm_ternop_none_none_none_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS] @@ -554,6 +548,13 @@ arm_ternop_unone_unone_unone_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] (arm_ternop_unone_unone_unone_pred_qualifiers) static enum arm_type_qualifiers +arm_ternop_pred_unone_unone_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_predicate, qualifier_unsigned, qualifier_unsigned, + qualifier_predicate }; +#define TERNOP_PRED_UNONE_UNONE_PRED_QUALIFIERS \ + (arm_ternop_pred_unone_unone_pred_qualifiers) + +static enum arm_type_qualifiers arm_ternop_none_none_none_none_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_none, qualifier_none, qualifier_none, qualifier_none }; #define TERNOP_NONE_NONE_NONE_NONE_QUALIFIERS \ diff --git a/gcc/config/arm/arm_mve_builtins.def b/gcc/config/arm/arm_mve_builtins.def index 44b41ea..b7ebbca 100644 --- a/gcc/config/arm/arm_mve_builtins.def +++ b/gcc/config/arm/arm_mve_builtins.def @@ -118,9 +118,9 @@ VAR3 (BINOP_UNONE_UNONE_UNONE, vhaddq_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vhaddq_n_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, veorq_u, v16qi, v8hi, v4si) VAR3 (BINOP_PRED_UNONE_UNONE, vcmphiq_, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_UNONE_UNONE, vcmphiq_n_, v16qi, v8hi, v4si) +VAR3 (BINOP_PRED_UNONE_UNONE, vcmphiq_n_, v16qi, v8hi, v4si) VAR3 (BINOP_PRED_UNONE_UNONE, vcmpcsq_, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpcsq_n_, v16qi, v8hi, v4si) +VAR3 (BINOP_PRED_UNONE_UNONE, vcmpcsq_n_, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vbicq_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vandq_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vaddvq_p_u, v16qi, v8hi, v4si) @@ -142,17 +142,17 @@ VAR3 (BINOP_UNONE_UNONE_NONE, vbrsrq_n_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_IMM, vshlq_n_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_IMM, vrshrq_n_u, v16qi, v8hi, v4si) VAR3 
(BINOP_UNONE_UNONE_IMM, vqshlq_n_u, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_NONE_NONE, vcmpneq_n_, v16qi, v8hi, v4si) +VAR3 (BINOP_PRED_NONE_NONE, vcmpneq_n_, v16qi, v8hi, v4si) VAR3 (BINOP_PRED_NONE_NONE, vcmpltq_, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_NONE_NONE, vcmpltq_n_, v16qi, v8hi, v4si) +VAR3 (BINOP_PRED_NONE_NONE, vcmpltq_n_, v16qi, v8hi, v4si) VAR3 (BINOP_PRED_NONE_NONE, vcmpleq_, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_NONE_NONE, vcmpleq_n_, v16qi, v8hi, v4si) +VAR3 (BINOP_PRED_NONE_NONE, vcmpleq_n_, v16qi, v8hi, v4si) VAR3 (BINOP_PRED_NONE_NONE, vcmpgtq_, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_NONE_NONE, vcmpgtq_n_, v16qi, v8hi, v4si) +VAR3 (BINOP_PRED_NONE_NONE, vcmpgtq_n_, v16qi, v8hi, v4si) VAR3 (BINOP_PRED_NONE_NONE, vcmpgeq_, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_NONE_NONE, vcmpgeq_n_, v16qi, v8hi, v4si) +VAR3 (BINOP_PRED_NONE_NONE, vcmpgeq_n_, v16qi, v8hi, v4si) VAR3 (BINOP_PRED_NONE_NONE, vcmpeqq_, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_NONE_NONE, vcmpeqq_n_, v16qi, v8hi, v4si) +VAR3 (BINOP_PRED_NONE_NONE, vcmpeqq_n_, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_NONE_IMM, vqshluq_n_s, v16qi, v8hi, v4si) VAR3 (BINOP_NONE_NONE_UNONE, vaddvq_p_s, v16qi, v8hi, v4si) VAR3 (BINOP_NONE_NONE_NONE, vsubq_s, v16qi, v8hi, v4si) @@ -218,17 +218,17 @@ VAR2 (BINOP_UNONE_UNONE_IMM, vshlltq_n_u, v16qi, v8hi) VAR2 (BINOP_UNONE_UNONE_IMM, vshllbq_n_u, v16qi, v8hi) VAR2 (BINOP_UNONE_UNONE_IMM, vorrq_n_u, v8hi, v4si) VAR2 (BINOP_UNONE_UNONE_IMM, vbicq_n_u, v8hi, v4si) -VAR2 (BINOP_UNONE_NONE_NONE, vcmpneq_n_f, v8hf, v4sf) +VAR2 (BINOP_PRED_NONE_NONE, vcmpneq_n_f, v8hf, v4sf) VAR2 (BINOP_PRED_NONE_NONE, vcmpneq_f, v8hf, v4sf) -VAR2 (BINOP_UNONE_NONE_NONE, vcmpltq_n_f, v8hf, v4sf) +VAR2 (BINOP_PRED_NONE_NONE, vcmpltq_n_f, v8hf, v4sf) VAR2 (BINOP_PRED_NONE_NONE, vcmpltq_f, v8hf, v4sf) -VAR2 (BINOP_UNONE_NONE_NONE, vcmpleq_n_f, v8hf, v4sf) +VAR2 (BINOP_PRED_NONE_NONE, vcmpleq_n_f, v8hf, v4sf) VAR2 (BINOP_PRED_NONE_NONE, vcmpleq_f, v8hf, v4sf) -VAR2 (BINOP_UNONE_NONE_NONE, vcmpgtq_n_f, v8hf, v4sf) +VAR2 (BINOP_PRED_NONE_NONE, vcmpgtq_n_f, v8hf, v4sf) VAR2 (BINOP_PRED_NONE_NONE, vcmpgtq_f, v8hf, v4sf) -VAR2 (BINOP_UNONE_NONE_NONE, vcmpgeq_n_f, v8hf, v4sf) +VAR2 (BINOP_PRED_NONE_NONE, vcmpgeq_n_f, v8hf, v4sf) VAR2 (BINOP_PRED_NONE_NONE, vcmpgeq_f, v8hf, v4sf) -VAR2 (BINOP_UNONE_NONE_NONE, vcmpeqq_n_f, v8hf, v4sf) +VAR2 (BINOP_PRED_NONE_NONE, vcmpeqq_n_f, v8hf, v4sf) VAR2 (BINOP_PRED_NONE_NONE, vcmpeqq_f, v8hf, v4sf) VAR2 (BINOP_NONE_NONE_NONE, vsubq_f, v8hf, v4sf) VAR2 (BINOP_NONE_NONE_NONE, vqmovntq_s, v8hi, v4si) @@ -285,7 +285,7 @@ VAR1 (TERNOP_NONE_NONE_NONE_NONE, vrmlaldavhaq_s, v4si) VAR1 (TERNOP_UNONE_UNONE_UNONE_UNONE, vrmlaldavhaq_u, v4si) VAR2 (TERNOP_NONE_NONE_UNONE_UNONE, vcvtq_m_to_f_u, v8hf, v4sf) VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vcvtq_m_to_f_s, v8hf, v4sf) -VAR2 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpeqq_m_f, v8hf, v4sf) +VAR2 (TERNOP_PRED_NONE_NONE_PRED, vcmpeqq_m_f, v8hf, v4sf) VAR3 (TERNOP_UNONE_NONE_UNONE_IMM, vshlcq_carry_s, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_IMM, vshlcq_carry_u, v16qi, v8hi, v4si) VAR2 (TERNOP_UNONE_UNONE_NONE_IMM, vqrshrunbq_n_s, v8hi, v4si) @@ -306,14 +306,14 @@ VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vmladavaq_u, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vminvq_p_u, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vmaxvq_p_u, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vdupq_m_n_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vcmpneq_m_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vcmpneq_m_n_u, 
v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vcmphiq_m_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vcmphiq_m_n_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vcmpeqq_m_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vcmpeqq_m_n_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vcmpcsq_m_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vcmpcsq_m_n_u, v16qi, v8hi, v4si) +VAR3 (TERNOP_PRED_UNONE_UNONE_PRED, vcmpneq_m_u, v16qi, v8hi, v4si) +VAR3 (TERNOP_PRED_UNONE_UNONE_PRED, vcmpneq_m_n_u, v16qi, v8hi, v4si) +VAR3 (TERNOP_PRED_UNONE_UNONE_PRED, vcmphiq_m_u, v16qi, v8hi, v4si) +VAR3 (TERNOP_PRED_UNONE_UNONE_PRED, vcmphiq_m_n_u, v16qi, v8hi, v4si) +VAR3 (TERNOP_PRED_UNONE_UNONE_PRED, vcmpeqq_m_u, v16qi, v8hi, v4si) +VAR3 (TERNOP_PRED_UNONE_UNONE_PRED, vcmpeqq_m_n_u, v16qi, v8hi, v4si) +VAR3 (TERNOP_PRED_UNONE_UNONE_PRED, vcmpcsq_m_u, v16qi, v8hi, v4si) +VAR3 (TERNOP_PRED_UNONE_UNONE_PRED, vcmpcsq_m_n_u, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vclzq_m_u, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vaddvaq_p_u, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_IMM, vsriq_n_u, v16qi, v8hi, v4si) @@ -326,18 +326,18 @@ VAR3 (TERNOP_UNONE_UNONE_NONE_UNONE, vminavq_p_s, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_NONE_UNONE, vminaq_m_s, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_NONE_UNONE, vmaxavq_p_s, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_NONE_UNONE, vmaxaq_m_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpneq_m_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpneq_m_n_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpltq_m_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpltq_m_n_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpleq_m_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpleq_m_n_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpgtq_m_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpgtq_m_n_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpgeq_m_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpgeq_m_n_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpeqq_m_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpeqq_m_n_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_PRED_NONE_NONE_PRED, vcmpneq_m_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_PRED_NONE_NONE_PRED, vcmpneq_m_n_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_PRED_NONE_NONE_PRED, vcmpltq_m_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_PRED_NONE_NONE_PRED, vcmpltq_m_n_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_PRED_NONE_NONE_PRED, vcmpleq_m_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_PRED_NONE_NONE_PRED, vcmpleq_m_n_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_PRED_NONE_NONE_PRED, vcmpgtq_m_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_PRED_NONE_NONE_PRED, vcmpgtq_m_n_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_PRED_NONE_NONE_PRED, vcmpgeq_m_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_PRED_NONE_NONE_PRED, vcmpgeq_m_n_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_PRED_NONE_NONE_PRED, vcmpeqq_m_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_PRED_NONE_NONE_PRED, vcmpeqq_m_n_s, v16qi, v8hi, v4si) VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vshlq_m_r_s, v16qi, v8hi, v4si) VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vrshlq_m_n_s, v16qi, v8hi, v4si) VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vrev64q_m_s, v16qi, v8hi, v4si) @@ -405,17 +405,17 @@ VAR2 (TERNOP_UNONE_UNONE_NONE_IMM, vqshrunbq_n_s, v8hi, v4si) VAR2 (TERNOP_UNONE_UNONE_NONE_IMM, vqrshruntq_n_s, v8hi, v4si) VAR2 (TERNOP_UNONE_UNONE_IMM_UNONE, 
vorrq_m_n_u, v8hi, v4si) VAR2 (TERNOP_UNONE_UNONE_IMM_UNONE, vmvnq_m_n_u, v8hi, v4si) -VAR2 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpneq_m_n_f, v8hf, v4sf) -VAR2 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpneq_m_f, v8hf, v4sf) -VAR2 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpltq_m_n_f, v8hf, v4sf) -VAR2 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpltq_m_f, v8hf, v4sf) -VAR2 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpleq_m_n_f, v8hf, v4sf) -VAR2 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpleq_m_f, v8hf, v4sf) -VAR2 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpgtq_m_n_f, v8hf, v4sf) -VAR2 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpgtq_m_f, v8hf, v4sf) -VAR2 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpgeq_m_n_f, v8hf, v4sf) -VAR2 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpgeq_m_f, v8hf, v4sf) -VAR2 (TERNOP_UNONE_NONE_NONE_UNONE, vcmpeqq_m_n_f, v8hf, v4sf) +VAR2 (TERNOP_PRED_NONE_NONE_PRED, vcmpneq_m_n_f, v8hf, v4sf) +VAR2 (TERNOP_PRED_NONE_NONE_PRED, vcmpneq_m_f, v8hf, v4sf) +VAR2 (TERNOP_PRED_NONE_NONE_PRED, vcmpltq_m_n_f, v8hf, v4sf) +VAR2 (TERNOP_PRED_NONE_NONE_PRED, vcmpltq_m_f, v8hf, v4sf) +VAR2 (TERNOP_PRED_NONE_NONE_PRED, vcmpleq_m_n_f, v8hf, v4sf) +VAR2 (TERNOP_PRED_NONE_NONE_PRED, vcmpleq_m_f, v8hf, v4sf) +VAR2 (TERNOP_PRED_NONE_NONE_PRED, vcmpgtq_m_n_f, v8hf, v4sf) +VAR2 (TERNOP_PRED_NONE_NONE_PRED, vcmpgtq_m_f, v8hf, v4sf) +VAR2 (TERNOP_PRED_NONE_NONE_PRED, vcmpgeq_m_n_f, v8hf, v4sf) +VAR2 (TERNOP_PRED_NONE_NONE_PRED, vcmpgeq_m_f, v8hf, v4sf) +VAR2 (TERNOP_PRED_NONE_NONE_PRED, vcmpeqq_m_n_f, v8hf, v4sf) VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vrndxq_m_f, v8hf, v4sf) VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vrndq_m_f, v8hf, v4sf) VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vrndpq_m_f, v8hf, v4sf) diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index d0c3100..12f05b3 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -853,8 +853,8 @@ ;; (define_insn "mve_vcmpq_n_" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (MVE_COMPARISONS:HI (match_operand:MVE_2 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (MVE_COMPARISONS: (match_operand:MVE_2 1 "s_register_operand" "w") (match_operand: 2 "s_register_operand" "r"))) ] "TARGET_HAVE_MVE" @@ -1943,8 +1943,8 @@ ;; (define_insn "@mve_vcmpq_n_f" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (MVE_FP_COMPARISONS:HI (match_operand:MVE_0 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (MVE_FP_COMPARISONS: (match_operand:MVE_0 1 "s_register_operand" "w") (match_operand: 2 "s_register_operand" "r"))) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -2593,10 +2593,10 @@ ;; (define_insn "mve_vcmpeqq_m_f" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_0 1 "s_register_operand" "w") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPEQQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -2809,10 +2809,10 @@ ;; (define_insn "mve_vcmpcsq_m_n_u" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_2 1 "s_register_operand" "w") (match_operand: 2 "s_register_operand" "r") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPCSQ_M_N_U)) ] "TARGET_HAVE_MVE" @@ -2825,10 
+2825,10 @@ ;; (define_insn "mve_vcmpcsq_m_u" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_2 1 "s_register_operand" "w") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPCSQ_M_U)) ] "TARGET_HAVE_MVE" @@ -2841,10 +2841,10 @@ ;; (define_insn "mve_vcmpeqq_m_n_" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_2 1 "s_register_operand" "w") (match_operand: 2 "s_register_operand" "r") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPEQQ_M_N)) ] "TARGET_HAVE_MVE" @@ -2857,10 +2857,10 @@ ;; (define_insn "mve_vcmpeqq_m_" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_2 1 "s_register_operand" "w") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPEQQ_M)) ] "TARGET_HAVE_MVE" @@ -2873,10 +2873,10 @@ ;; (define_insn "mve_vcmpgeq_m_n_s" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_2 1 "s_register_operand" "w") (match_operand: 2 "s_register_operand" "r") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPGEQ_M_N_S)) ] "TARGET_HAVE_MVE" @@ -2889,10 +2889,10 @@ ;; (define_insn "mve_vcmpgeq_m_s" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_2 1 "s_register_operand" "w") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPGEQ_M_S)) ] "TARGET_HAVE_MVE" @@ -2905,10 +2905,10 @@ ;; (define_insn "mve_vcmpgtq_m_n_s" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_2 1 "s_register_operand" "w") (match_operand: 2 "s_register_operand" "r") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPGTQ_M_N_S)) ] "TARGET_HAVE_MVE" @@ -2921,10 +2921,10 @@ ;; (define_insn "mve_vcmpgtq_m_s" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_2 1 "s_register_operand" "w") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPGTQ_M_S)) ] "TARGET_HAVE_MVE" @@ -2937,10 +2937,10 @@ ;; (define_insn "mve_vcmphiq_m_n_u" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") + (set (match_operand: 0 
"vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_2 1 "s_register_operand" "w") (match_operand: 2 "s_register_operand" "r") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPHIQ_M_N_U)) ] "TARGET_HAVE_MVE" @@ -2953,10 +2953,10 @@ ;; (define_insn "mve_vcmphiq_m_u" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_2 1 "s_register_operand" "w") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPHIQ_M_U)) ] "TARGET_HAVE_MVE" @@ -2969,10 +2969,10 @@ ;; (define_insn "mve_vcmpleq_m_n_s" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_2 1 "s_register_operand" "w") (match_operand: 2 "s_register_operand" "r") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPLEQ_M_N_S)) ] "TARGET_HAVE_MVE" @@ -2985,10 +2985,10 @@ ;; (define_insn "mve_vcmpleq_m_s" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_2 1 "s_register_operand" "w") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPLEQ_M_S)) ] "TARGET_HAVE_MVE" @@ -3001,10 +3001,10 @@ ;; (define_insn "mve_vcmpltq_m_n_s" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_2 1 "s_register_operand" "w") (match_operand: 2 "s_register_operand" "r") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPLTQ_M_N_S)) ] "TARGET_HAVE_MVE" @@ -3017,10 +3017,10 @@ ;; (define_insn "mve_vcmpltq_m_s" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_2 1 "s_register_operand" "w") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPLTQ_M_S)) ] "TARGET_HAVE_MVE" @@ -3033,10 +3033,10 @@ ;; (define_insn "mve_vcmpneq_m_n_" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_2 1 "s_register_operand" "w") (match_operand: 2 "s_register_operand" "r") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPNEQ_M_N)) ] "TARGET_HAVE_MVE" @@ -3049,10 +3049,10 @@ ;; (define_insn "mve_vcmpneq_m_" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_2 1 "s_register_operand" "w") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + 
(match_operand: 3 "vpr_register_operand" "Up")] VCMPNEQ_M)) ] "TARGET_HAVE_MVE" @@ -3770,10 +3770,10 @@ ;; (define_insn "mve_vcmpeqq_m_n_f" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_0 1 "s_register_operand" "w") (match_operand: 2 "s_register_operand" "r") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPEQQ_M_N_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -3786,10 +3786,10 @@ ;; (define_insn "mve_vcmpgeq_m_f" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_0 1 "s_register_operand" "w") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPGEQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -3802,10 +3802,10 @@ ;; (define_insn "mve_vcmpgeq_m_n_f" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_0 1 "s_register_operand" "w") (match_operand: 2 "s_register_operand" "r") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPGEQ_M_N_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -3818,10 +3818,10 @@ ;; (define_insn "mve_vcmpgtq_m_f" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_0 1 "s_register_operand" "w") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPGTQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -3834,10 +3834,10 @@ ;; (define_insn "mve_vcmpgtq_m_n_f" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_0 1 "s_register_operand" "w") (match_operand: 2 "s_register_operand" "r") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPGTQ_M_N_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -3850,10 +3850,10 @@ ;; (define_insn "mve_vcmpleq_m_f" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_0 1 "s_register_operand" "w") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPLEQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -3866,10 +3866,10 @@ ;; (define_insn "mve_vcmpleq_m_n_f" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_0 1 "s_register_operand" "w") (match_operand: 2 "s_register_operand" "r") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" 
"Up")] VCMPLEQ_M_N_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -3882,10 +3882,10 @@ ;; (define_insn "mve_vcmpltq_m_f" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_0 1 "s_register_operand" "w") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPLTQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -3898,10 +3898,10 @@ ;; (define_insn "mve_vcmpltq_m_n_f" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_0 1 "s_register_operand" "w") (match_operand: 2 "s_register_operand" "r") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPLTQ_M_N_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -3914,10 +3914,10 @@ ;; (define_insn "mve_vcmpneq_m_f" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_0 1 "s_register_operand" "w") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPNEQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -3930,10 +3930,10 @@ ;; (define_insn "mve_vcmpneq_m_n_f" [ - (set (match_operand:HI 0 "vpr_register_operand" "=Up") - (unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w") + (set (match_operand: 0 "vpr_register_operand" "=Up") + (unspec: [(match_operand:MVE_0 1 "s_register_operand" "w") (match_operand: 2 "s_register_operand" "r") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCMPNEQ_M_N_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" -- cgit v1.1 From 724d6566cd11c676f3bc082a9771784c825affb1 Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Wed, 13 Oct 2021 09:16:40 +0000 Subject: arm: Convert more MVE builtins to predicate qualifiers This patch covers all builtins that have an HI operand and use the iterator, thus we can replace HI whe . Most of the work of this patch series was carried out while I was working at STMicroelectronics as a Linaro assignee. 2022-02-22 Christophe Lyon gcc/ PR target/100757 PR target/101325 * config/arm/arm-builtins.cc (TERNOP_UNONE_UNONE_NONE_UNONE_QUALIFIERS): Change to ... (TERNOP_UNONE_UNONE_NONE_PRED_QUALIFIERS): ... this. (TERNOP_UNONE_UNONE_IMM_UNONE_QUALIFIERS): Change to ... (TERNOP_UNONE_UNONE_IMM_PRED_QUALIFIERS): ... this. (TERNOP_NONE_NONE_IMM_UNONE_QUALIFIERS): Change to ... (TERNOP_NONE_NONE_IMM_PRED_QUALIFIERS): ... this. (TERNOP_NONE_NONE_UNONE_UNONE_QUALIFIERS): Change to ... (TERNOP_NONE_NONE_UNONE_PRED_QUALIFIERS): ... this. (QUADOP_UNONE_UNONE_NONE_NONE_UNONE_QUALIFIERS): Change to ... (QUADOP_UNONE_UNONE_NONE_NONE_PRED_QUALIFIERS): ... this. (QUADOP_NONE_NONE_NONE_NONE_PRED_QUALIFIERS): New. (QUADOP_NONE_NONE_NONE_IMM_UNONE_QUALIFIERS): Change to ... (QUADOP_NONE_NONE_NONE_IMM_PRED_QUALIFIERS): ... this. (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED_QUALIFIERS): New. (QUADOP_UNONE_UNONE_NONE_IMM_UNONE_QUALIFIERS): Change to ... (QUADOP_UNONE_UNONE_NONE_IMM_PRED_QUALIFIERS): ... this. 
(QUADOP_NONE_NONE_UNONE_IMM_UNONE_QUALIFIERS): Change to ... (QUADOP_NONE_NONE_UNONE_IMM_PRED_QUALIFIERS): ... this. (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE_QUALIFIERS): Change to ... (QUADOP_UNONE_UNONE_UNONE_IMM_PRED_QUALIFIERS): ... this. (QUADOP_UNONE_UNONE_UNONE_NONE_UNONE_QUALIFIERS): Change to ... (QUADOP_UNONE_UNONE_UNONE_NONE_PRED_QUALIFIERS): ... this. (STRS_P_QUALIFIERS): Use predicate qualifier. (STRU_P_QUALIFIERS): Likewise. (STRSU_P_QUALIFIERS): Likewise. (STRSS_P_QUALIFIERS): Likewise. (LDRGS_Z_QUALIFIERS): Likewise. (LDRGU_Z_QUALIFIERS): Likewise. (LDRS_Z_QUALIFIERS): Likewise. (LDRU_Z_QUALIFIERS): Likewise. (QUINOP_UNONE_UNONE_UNONE_UNONE_IMM_UNONE_QUALIFIERS): Change to ... (QUINOP_UNONE_UNONE_UNONE_UNONE_IMM_PRED_QUALIFIERS): ... this. (BINOP_NONE_NONE_PRED_QUALIFIERS): New. (BINOP_UNONE_UNONE_PRED_QUALIFIERS): New. * config/arm/arm_mve_builtins.def: Use new predicated qualifiers. * config/arm/mve.md: Use MVE_VPRED instead of HI. --- gcc/config/arm/arm-builtins.cc | 130 +++++---- gcc/config/arm/arm_mve_builtins.def | 562 ++++++++++++++++++------------------ gcc/config/arm/mve.md | 420 +++++++++++++-------------- 3 files changed, 569 insertions(+), 543 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm-builtins.cc b/gcc/config/arm/arm-builtins.cc index 02411c6..a9536b2 100644 --- a/gcc/config/arm/arm-builtins.cc +++ b/gcc/config/arm/arm-builtins.cc @@ -484,18 +484,18 @@ arm_ternop_unone_unone_none_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS] (arm_ternop_unone_unone_none_imm_qualifiers) static enum arm_type_qualifiers -arm_ternop_unone_unone_none_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] +arm_ternop_unone_unone_none_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_unsigned, qualifier_unsigned, qualifier_none, - qualifier_unsigned }; -#define TERNOP_UNONE_UNONE_NONE_UNONE_QUALIFIERS \ - (arm_ternop_unone_unone_none_unone_qualifiers) + qualifier_predicate }; +#define TERNOP_UNONE_UNONE_NONE_PRED_QUALIFIERS \ + (arm_ternop_unone_unone_none_pred_qualifiers) static enum arm_type_qualifiers -arm_ternop_unone_unone_imm_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] +arm_ternop_unone_unone_imm_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_unsigned, qualifier_unsigned, qualifier_immediate, - qualifier_unsigned }; -#define TERNOP_UNONE_UNONE_IMM_UNONE_QUALIFIERS \ - (arm_ternop_unone_unone_imm_unone_qualifiers) + qualifier_predicate }; +#define TERNOP_UNONE_UNONE_IMM_PRED_QUALIFIERS \ + (arm_ternop_unone_unone_imm_pred_qualifiers) static enum arm_type_qualifiers arm_ternop_pred_none_none_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] @@ -522,16 +522,16 @@ arm_ternop_none_none_none_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] (arm_ternop_none_none_none_pred_qualifiers) static enum arm_type_qualifiers -arm_ternop_none_none_imm_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] - = { qualifier_none, qualifier_none, qualifier_immediate, qualifier_unsigned }; -#define TERNOP_NONE_NONE_IMM_UNONE_QUALIFIERS \ - (arm_ternop_none_none_imm_unone_qualifiers) +arm_ternop_none_none_imm_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_none, qualifier_none, qualifier_immediate, qualifier_predicate }; +#define TERNOP_NONE_NONE_IMM_PRED_QUALIFIERS \ + (arm_ternop_none_none_imm_pred_qualifiers) static enum arm_type_qualifiers -arm_ternop_none_none_unone_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] - = { qualifier_none, qualifier_none, qualifier_unsigned, qualifier_unsigned }; -#define TERNOP_NONE_NONE_UNONE_UNONE_QUALIFIERS \ - (arm_ternop_none_none_unone_unone_qualifiers) 
+arm_ternop_none_none_unone_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_none, qualifier_none, qualifier_unsigned, qualifier_predicate }; +#define TERNOP_NONE_NONE_UNONE_PRED_QUALIFIERS \ + (arm_ternop_none_none_unone_pred_qualifiers) static enum arm_type_qualifiers arm_ternop_unone_unone_unone_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] @@ -561,11 +561,11 @@ arm_ternop_none_none_none_none_qualifiers[SIMD_MAX_BUILTIN_ARGS] (arm_ternop_none_none_none_none_qualifiers) static enum arm_type_qualifiers -arm_quadop_unone_unone_none_none_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] +arm_quadop_unone_unone_none_none_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_unsigned, qualifier_unsigned, qualifier_none, qualifier_none, - qualifier_unsigned }; -#define QUADOP_UNONE_UNONE_NONE_NONE_UNONE_QUALIFIERS \ - (arm_quadop_unone_unone_none_none_unone_qualifiers) + qualifier_predicate }; +#define QUADOP_UNONE_UNONE_NONE_NONE_PRED_QUALIFIERS \ + (arm_quadop_unone_unone_none_none_pred_qualifiers) static enum arm_type_qualifiers arm_quadop_none_none_none_none_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] @@ -575,11 +575,18 @@ arm_quadop_none_none_none_none_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] (arm_quadop_none_none_none_none_unone_qualifiers) static enum arm_type_qualifiers -arm_quadop_none_none_none_imm_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] +arm_quadop_none_none_none_none_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_none, qualifier_none, qualifier_none, qualifier_none, + qualifier_predicate }; +#define QUADOP_NONE_NONE_NONE_NONE_PRED_QUALIFIERS \ + (arm_quadop_none_none_none_none_pred_qualifiers) + +static enum arm_type_qualifiers +arm_quadop_none_none_none_imm_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_none, qualifier_none, qualifier_none, qualifier_immediate, - qualifier_unsigned }; -#define QUADOP_NONE_NONE_NONE_IMM_UNONE_QUALIFIERS \ - (arm_quadop_none_none_none_imm_unone_qualifiers) + qualifier_predicate }; +#define QUADOP_NONE_NONE_NONE_IMM_PRED_QUALIFIERS \ + (arm_quadop_none_none_none_imm_pred_qualifiers) static enum arm_type_qualifiers arm_quadop_unone_unone_unone_unone_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] @@ -589,32 +596,39 @@ arm_quadop_unone_unone_unone_unone_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] (arm_quadop_unone_unone_unone_unone_unone_qualifiers) static enum arm_type_qualifiers -arm_quadop_unone_unone_none_imm_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] +arm_quadop_unone_unone_unone_unone_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_unsigned, qualifier_unsigned, qualifier_unsigned, + qualifier_unsigned, qualifier_predicate }; +#define QUADOP_UNONE_UNONE_UNONE_UNONE_PRED_QUALIFIERS \ + (arm_quadop_unone_unone_unone_unone_pred_qualifiers) + +static enum arm_type_qualifiers +arm_quadop_unone_unone_none_imm_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_unsigned, qualifier_unsigned, qualifier_none, - qualifier_immediate, qualifier_unsigned }; -#define QUADOP_UNONE_UNONE_NONE_IMM_UNONE_QUALIFIERS \ - (arm_quadop_unone_unone_none_imm_unone_qualifiers) + qualifier_immediate, qualifier_predicate }; +#define QUADOP_UNONE_UNONE_NONE_IMM_PRED_QUALIFIERS \ + (arm_quadop_unone_unone_none_imm_pred_qualifiers) static enum arm_type_qualifiers -arm_quadop_none_none_unone_imm_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] +arm_quadop_none_none_unone_imm_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_none, qualifier_none, qualifier_unsigned, qualifier_immediate, - qualifier_unsigned }; -#define QUADOP_NONE_NONE_UNONE_IMM_UNONE_QUALIFIERS \ - 
(arm_quadop_none_none_unone_imm_unone_qualifiers) + qualifier_predicate }; +#define QUADOP_NONE_NONE_UNONE_IMM_PRED_QUALIFIERS \ + (arm_quadop_none_none_unone_imm_pred_qualifiers) static enum arm_type_qualifiers -arm_quadop_unone_unone_unone_imm_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] +arm_quadop_unone_unone_unone_imm_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_unsigned, qualifier_unsigned, qualifier_unsigned, - qualifier_immediate, qualifier_unsigned }; -#define QUADOP_UNONE_UNONE_UNONE_IMM_UNONE_QUALIFIERS \ - (arm_quadop_unone_unone_unone_imm_unone_qualifiers) + qualifier_immediate, qualifier_predicate }; +#define QUADOP_UNONE_UNONE_UNONE_IMM_PRED_QUALIFIERS \ + (arm_quadop_unone_unone_unone_imm_pred_qualifiers) static enum arm_type_qualifiers -arm_quadop_unone_unone_unone_none_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] +arm_quadop_unone_unone_unone_none_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_unsigned, qualifier_unsigned, qualifier_unsigned, - qualifier_none, qualifier_unsigned }; -#define QUADOP_UNONE_UNONE_UNONE_NONE_UNONE_QUALIFIERS \ - (arm_quadop_unone_unone_unone_none_unone_qualifiers) + qualifier_none, qualifier_predicate }; +#define QUADOP_UNONE_UNONE_UNONE_NONE_PRED_QUALIFIERS \ + (arm_quadop_unone_unone_unone_none_pred_qualifiers) static enum arm_type_qualifiers arm_strs_qualifiers[SIMD_MAX_BUILTIN_ARGS] @@ -651,25 +665,25 @@ arm_strsbu_qualifiers[SIMD_MAX_BUILTIN_ARGS] static enum arm_type_qualifiers arm_strs_p_qualifiers[SIMD_MAX_BUILTIN_ARGS] - = { qualifier_void, qualifier_pointer, qualifier_none, qualifier_unsigned}; + = { qualifier_void, qualifier_pointer, qualifier_none, qualifier_predicate}; #define STRS_P_QUALIFIERS (arm_strs_p_qualifiers) static enum arm_type_qualifiers arm_stru_p_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_void, qualifier_pointer, qualifier_unsigned, - qualifier_unsigned}; + qualifier_predicate}; #define STRU_P_QUALIFIERS (arm_stru_p_qualifiers) static enum arm_type_qualifiers arm_strsu_p_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_void, qualifier_pointer, qualifier_unsigned, - qualifier_unsigned, qualifier_unsigned}; + qualifier_unsigned, qualifier_predicate}; #define STRSU_P_QUALIFIERS (arm_strsu_p_qualifiers) static enum arm_type_qualifiers arm_strss_p_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_void, qualifier_pointer, qualifier_unsigned, - qualifier_none, qualifier_unsigned}; + qualifier_none, qualifier_predicate}; #define STRSS_P_QUALIFIERS (arm_strss_p_qualifiers) static enum arm_type_qualifiers @@ -729,31 +743,31 @@ arm_ldrgbu_z_qualifiers[SIMD_MAX_BUILTIN_ARGS] static enum arm_type_qualifiers arm_ldrgs_z_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_none, qualifier_pointer, qualifier_unsigned, - qualifier_unsigned}; + qualifier_predicate}; #define LDRGS_Z_QUALIFIERS (arm_ldrgs_z_qualifiers) static enum arm_type_qualifiers arm_ldrgu_z_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_unsigned, qualifier_pointer, qualifier_unsigned, - qualifier_unsigned}; + qualifier_predicate}; #define LDRGU_Z_QUALIFIERS (arm_ldrgu_z_qualifiers) static enum arm_type_qualifiers arm_ldrs_z_qualifiers[SIMD_MAX_BUILTIN_ARGS] - = { qualifier_none, qualifier_pointer, qualifier_unsigned}; + = { qualifier_none, qualifier_pointer, qualifier_predicate}; #define LDRS_Z_QUALIFIERS (arm_ldrs_z_qualifiers) static enum arm_type_qualifiers arm_ldru_z_qualifiers[SIMD_MAX_BUILTIN_ARGS] - = { qualifier_unsigned, qualifier_pointer, qualifier_unsigned}; + = { qualifier_unsigned, qualifier_pointer, qualifier_predicate}; #define LDRU_Z_QUALIFIERS 
(arm_ldru_z_qualifiers) static enum arm_type_qualifiers -arm_quinop_unone_unone_unone_unone_imm_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] +arm_quinop_unone_unone_unone_unone_imm_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_unsigned, qualifier_unsigned, qualifier_unsigned, - qualifier_unsigned, qualifier_immediate, qualifier_unsigned }; -#define QUINOP_UNONE_UNONE_UNONE_UNONE_IMM_UNONE_QUALIFIERS \ - (arm_quinop_unone_unone_unone_unone_imm_unone_qualifiers) + qualifier_unsigned, qualifier_immediate, qualifier_predicate }; +#define QUINOP_UNONE_UNONE_UNONE_UNONE_IMM_PRED_QUALIFIERS \ + (arm_quinop_unone_unone_unone_unone_imm_pred_qualifiers) static enum arm_type_qualifiers arm_ldrgbwbxu_qualifiers[SIMD_MAX_BUILTIN_ARGS] @@ -830,6 +844,18 @@ arm_sqshl_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_unsigned, qualifier_unsigned, qualifier_const}; #define SQSHL_QUALIFIERS (arm_sqshl_qualifiers) +static enum arm_type_qualifiers +arm_binop_none_none_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_none, qualifier_none, qualifier_predicate }; +#define BINOP_NONE_NONE_PRED_QUALIFIERS \ + (arm_binop_none_none_pred_qualifiers) + +static enum arm_type_qualifiers +arm_binop_unone_unone_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_unsigned, qualifier_unsigned, qualifier_predicate }; +#define BINOP_UNONE_UNONE_PRED_QUALIFIERS \ + (arm_binop_unone_unone_pred_qualifiers) + /* End of Qualifier for MVE builtins. */ /* void ([T element type] *, T, immediate). */ diff --git a/gcc/config/arm/arm_mve_builtins.def b/gcc/config/arm/arm_mve_builtins.def index b7ebbca..7db6d47 100644 --- a/gcc/config/arm/arm_mve_builtins.def +++ b/gcc/config/arm/arm_mve_builtins.def @@ -123,7 +123,7 @@ VAR3 (BINOP_PRED_UNONE_UNONE, vcmpcsq_, v16qi, v8hi, v4si) VAR3 (BINOP_PRED_UNONE_UNONE, vcmpcsq_n_, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vbicq_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vandq_u, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_UNONE_UNONE, vaddvq_p_u, v16qi, v8hi, v4si) +VAR3 (BINOP_UNONE_UNONE_PRED, vaddvq_p_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vaddvaq_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vaddq_n_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vabdq_u, v16qi, v8hi, v4si) @@ -154,7 +154,7 @@ VAR3 (BINOP_PRED_NONE_NONE, vcmpgeq_n_, v16qi, v8hi, v4si) VAR3 (BINOP_PRED_NONE_NONE, vcmpeqq_, v16qi, v8hi, v4si) VAR3 (BINOP_PRED_NONE_NONE, vcmpeqq_n_, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_NONE_IMM, vqshluq_n_s, v16qi, v8hi, v4si) -VAR3 (BINOP_NONE_NONE_UNONE, vaddvq_p_s, v16qi, v8hi, v4si) +VAR3 (BINOP_NONE_NONE_PRED, vaddvq_p_s, v16qi, v8hi, v4si) VAR3 (BINOP_NONE_NONE_NONE, vsubq_s, v16qi, v8hi, v4si) VAR3 (BINOP_NONE_NONE_NONE, vsubq_n_s, v16qi, v8hi, v4si) VAR3 (BINOP_NONE_NONE_NONE, vshlq_r_s, v16qi, v8hi, v4si) @@ -277,35 +277,35 @@ VAR1 (BINOP_NONE_NONE_NONE, vrmlaldavhq_s, v4si) VAR1 (BINOP_NONE_NONE_NONE, vcvttq_f16_f32, v8hf) VAR1 (BINOP_NONE_NONE_NONE, vcvtbq_f16_f32, v8hf) VAR1 (BINOP_NONE_NONE_NONE, vaddlvaq_s, v4si) -VAR2 (TERNOP_NONE_NONE_IMM_UNONE, vbicq_m_n_s, v8hi, v4si) -VAR2 (TERNOP_UNONE_UNONE_IMM_UNONE, vbicq_m_n_u, v8hi, v4si) +VAR2 (TERNOP_NONE_NONE_IMM_PRED, vbicq_m_n_s, v8hi, v4si) +VAR2 (TERNOP_UNONE_UNONE_IMM_PRED, vbicq_m_n_u, v8hi, v4si) VAR2 (TERNOP_NONE_NONE_NONE_IMM, vqrshrnbq_n_s, v8hi, v4si) VAR2 (TERNOP_UNONE_UNONE_UNONE_IMM, vqrshrnbq_n_u, v8hi, v4si) VAR1 (TERNOP_NONE_NONE_NONE_NONE, vrmlaldavhaq_s, v4si) VAR1 (TERNOP_UNONE_UNONE_UNONE_UNONE, vrmlaldavhaq_u, v4si) -VAR2 (TERNOP_NONE_NONE_UNONE_UNONE, vcvtq_m_to_f_u, 
v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vcvtq_m_to_f_s, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_UNONE_PRED, vcvtq_m_to_f_u, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vcvtq_m_to_f_s, v8hf, v4sf) VAR2 (TERNOP_PRED_NONE_NONE_PRED, vcmpeqq_m_f, v8hf, v4sf) VAR3 (TERNOP_UNONE_NONE_UNONE_IMM, vshlcq_carry_s, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_IMM, vshlcq_carry_u, v16qi, v8hi, v4si) VAR2 (TERNOP_UNONE_UNONE_NONE_IMM, vqrshrunbq_n_s, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_NONE_NONE, vabavq_s, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vabavq_u, v16qi, v8hi, v4si) -VAR2 (TERNOP_UNONE_UNONE_NONE_UNONE, vcvtaq_m_u, v8hi, v4si) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vcvtaq_m_s, v8hi, v4si) +VAR2 (TERNOP_UNONE_UNONE_NONE_PRED, vcvtaq_m_u, v8hi, v4si) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vcvtaq_m_s, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_IMM, vshlcq_vec_u, v16qi, v8hi, v4si) VAR3 (TERNOP_NONE_NONE_UNONE_IMM, vshlcq_vec_s, v16qi, v8hi, v4si) VAR4 (TERNOP_UNONE_UNONE_UNONE_PRED, vpselq_u, v16qi, v8hi, v4si, v2di) VAR4 (TERNOP_NONE_NONE_NONE_PRED, vpselq_s, v16qi, v8hi, v4si, v2di) -VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vrev64q_m_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vmvnq_m_u, v16qi, v8hi, v4si) +VAR3 (TERNOP_UNONE_UNONE_UNONE_PRED, vrev64q_m_u, v16qi, v8hi, v4si) +VAR3 (TERNOP_UNONE_UNONE_UNONE_PRED, vmvnq_m_u, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vmlasq_n_u, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vmlaq_n_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vmladavq_p_u, v16qi, v8hi, v4si) +VAR3 (TERNOP_UNONE_UNONE_UNONE_PRED, vmladavq_p_u, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vmladavaq_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vminvq_p_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vmaxvq_p_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vdupq_m_n_u, v16qi, v8hi, v4si) +VAR3 (TERNOP_UNONE_UNONE_UNONE_PRED, vminvq_p_u, v16qi, v8hi, v4si) +VAR3 (TERNOP_UNONE_UNONE_UNONE_PRED, vmaxvq_p_u, v16qi, v8hi, v4si) +VAR3 (TERNOP_UNONE_UNONE_UNONE_PRED, vdupq_m_n_u, v16qi, v8hi, v4si) VAR3 (TERNOP_PRED_UNONE_UNONE_PRED, vcmpneq_m_u, v16qi, v8hi, v4si) VAR3 (TERNOP_PRED_UNONE_UNONE_PRED, vcmpneq_m_n_u, v16qi, v8hi, v4si) VAR3 (TERNOP_PRED_UNONE_UNONE_PRED, vcmphiq_m_u, v16qi, v8hi, v4si) @@ -314,18 +314,18 @@ VAR3 (TERNOP_PRED_UNONE_UNONE_PRED, vcmpeqq_m_u, v16qi, v8hi, v4si) VAR3 (TERNOP_PRED_UNONE_UNONE_PRED, vcmpeqq_m_n_u, v16qi, v8hi, v4si) VAR3 (TERNOP_PRED_UNONE_UNONE_PRED, vcmpcsq_m_u, v16qi, v8hi, v4si) VAR3 (TERNOP_PRED_UNONE_UNONE_PRED, vcmpcsq_m_n_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vclzq_m_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vaddvaq_p_u, v16qi, v8hi, v4si) +VAR3 (TERNOP_UNONE_UNONE_UNONE_PRED, vclzq_m_u, v16qi, v8hi, v4si) +VAR3 (TERNOP_UNONE_UNONE_UNONE_PRED, vaddvaq_p_u, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_IMM, vsriq_n_u, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_IMM, vsliq_n_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_NONE_UNONE, vshlq_m_r_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_NONE_UNONE, vrshlq_m_n_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_NONE_UNONE, vqshlq_m_r_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_NONE_UNONE, vqrshlq_m_n_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_NONE_UNONE, vminavq_p_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_NONE_UNONE, vminaq_m_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_NONE_UNONE, vmaxavq_p_s, 
v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_NONE_UNONE, vmaxaq_m_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_UNONE_UNONE_NONE_PRED, vshlq_m_r_u, v16qi, v8hi, v4si) +VAR3 (TERNOP_UNONE_UNONE_NONE_PRED, vrshlq_m_n_u, v16qi, v8hi, v4si) +VAR3 (TERNOP_UNONE_UNONE_NONE_PRED, vqshlq_m_r_u, v16qi, v8hi, v4si) +VAR3 (TERNOP_UNONE_UNONE_NONE_PRED, vqrshlq_m_n_u, v16qi, v8hi, v4si) +VAR3 (TERNOP_UNONE_UNONE_NONE_PRED, vminavq_p_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_UNONE_UNONE_NONE_PRED, vminaq_m_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_UNONE_UNONE_NONE_PRED, vmaxavq_p_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_UNONE_UNONE_NONE_PRED, vmaxaq_m_s, v16qi, v8hi, v4si) VAR3 (TERNOP_PRED_NONE_NONE_PRED, vcmpneq_m_s, v16qi, v8hi, v4si) VAR3 (TERNOP_PRED_NONE_NONE_PRED, vcmpneq_m_n_s, v16qi, v8hi, v4si) VAR3 (TERNOP_PRED_NONE_NONE_PRED, vcmpltq_m_s, v16qi, v8hi, v4si) @@ -338,26 +338,26 @@ VAR3 (TERNOP_PRED_NONE_NONE_PRED, vcmpgeq_m_s, v16qi, v8hi, v4si) VAR3 (TERNOP_PRED_NONE_NONE_PRED, vcmpgeq_m_n_s, v16qi, v8hi, v4si) VAR3 (TERNOP_PRED_NONE_NONE_PRED, vcmpeqq_m_s, v16qi, v8hi, v4si) VAR3 (TERNOP_PRED_NONE_NONE_PRED, vcmpeqq_m_n_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vshlq_m_r_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vrshlq_m_n_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vrev64q_m_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vqshlq_m_r_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vqrshlq_m_n_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vqnegq_m_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vqabsq_m_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vnegq_m_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vmvnq_m_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vmlsdavxq_p_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vmlsdavq_p_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vmladavxq_p_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vmladavq_p_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vminvq_p_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vmaxvq_p_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vdupq_m_n_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vclzq_m_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vclsq_m_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vaddvaq_p_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_NONE_NONE_NONE_UNONE, vabsq_m_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_NONE_NONE_NONE_PRED, vshlq_m_r_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_NONE_NONE_NONE_PRED, vrshlq_m_n_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_NONE_NONE_NONE_PRED, vrev64q_m_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_NONE_NONE_NONE_PRED, vqshlq_m_r_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_NONE_NONE_NONE_PRED, vqrshlq_m_n_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_NONE_NONE_NONE_PRED, vqnegq_m_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_NONE_NONE_NONE_PRED, vqabsq_m_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_NONE_NONE_NONE_PRED, vnegq_m_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_NONE_NONE_NONE_PRED, vmvnq_m_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_NONE_NONE_NONE_PRED, vmlsdavxq_p_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_NONE_NONE_NONE_PRED, vmlsdavq_p_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_NONE_NONE_NONE_PRED, vmladavxq_p_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_NONE_NONE_NONE_PRED, vmladavq_p_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_NONE_NONE_NONE_PRED, vminvq_p_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_NONE_NONE_NONE_PRED, vmaxvq_p_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_NONE_NONE_NONE_PRED, vdupq_m_n_s, v16qi, 
v8hi, v4si) +VAR3 (TERNOP_NONE_NONE_NONE_PRED, vclzq_m_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_NONE_NONE_NONE_PRED, vclsq_m_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_NONE_NONE_NONE_PRED, vaddvaq_p_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_NONE_NONE_NONE_PRED, vabsq_m_s, v16qi, v8hi, v4si) VAR3 (TERNOP_NONE_NONE_NONE_NONE, vqrdmlsdhxq_s, v16qi, v8hi, v4si) VAR3 (TERNOP_NONE_NONE_NONE_NONE, vqrdmlsdhq_s, v16qi, v8hi, v4si) VAR3 (TERNOP_NONE_NONE_NONE_NONE, vqrdmlashq_n_s, v16qi, v8hi, v4si) @@ -378,14 +378,14 @@ VAR3 (TERNOP_NONE_NONE_NONE_NONE, vmladavaxq_s, v16qi, v8hi, v4si) VAR3 (TERNOP_NONE_NONE_NONE_NONE, vmladavaq_s, v16qi, v8hi, v4si) VAR3 (TERNOP_NONE_NONE_NONE_IMM, vsriq_n_s, v16qi, v8hi, v4si) VAR3 (TERNOP_NONE_NONE_NONE_IMM, vsliq_n_s, v16qi, v8hi, v4si) -VAR2 (TERNOP_UNONE_UNONE_UNONE_UNONE, vrev32q_m_u, v16qi, v8hi) -VAR2 (TERNOP_UNONE_UNONE_UNONE_UNONE, vqmovntq_m_u, v8hi, v4si) -VAR2 (TERNOP_UNONE_UNONE_UNONE_UNONE, vqmovnbq_m_u, v8hi, v4si) -VAR2 (TERNOP_UNONE_UNONE_UNONE_UNONE, vmovntq_m_u, v8hi, v4si) -VAR2 (TERNOP_UNONE_UNONE_UNONE_UNONE, vmovnbq_m_u, v8hi, v4si) -VAR2 (TERNOP_UNONE_UNONE_UNONE_UNONE, vmovltq_m_u, v16qi, v8hi) -VAR2 (TERNOP_UNONE_UNONE_UNONE_UNONE, vmovlbq_m_u, v16qi, v8hi) -VAR2 (TERNOP_UNONE_UNONE_UNONE_UNONE, vmlaldavq_p_u, v8hi, v4si) +VAR2 (TERNOP_UNONE_UNONE_UNONE_PRED, vrev32q_m_u, v16qi, v8hi) +VAR2 (TERNOP_UNONE_UNONE_UNONE_PRED, vqmovntq_m_u, v8hi, v4si) +VAR2 (TERNOP_UNONE_UNONE_UNONE_PRED, vqmovnbq_m_u, v8hi, v4si) +VAR2 (TERNOP_UNONE_UNONE_UNONE_PRED, vmovntq_m_u, v8hi, v4si) +VAR2 (TERNOP_UNONE_UNONE_UNONE_PRED, vmovnbq_m_u, v8hi, v4si) +VAR2 (TERNOP_UNONE_UNONE_UNONE_PRED, vmovltq_m_u, v16qi, v8hi) +VAR2 (TERNOP_UNONE_UNONE_UNONE_PRED, vmovlbq_m_u, v16qi, v8hi) +VAR2 (TERNOP_UNONE_UNONE_UNONE_PRED, vmlaldavq_p_u, v8hi, v4si) VAR2 (TERNOP_UNONE_UNONE_UNONE_UNONE, vmlaldavaq_u, v8hi, v4si) VAR2 (TERNOP_UNONE_UNONE_UNONE_IMM, vshrntq_n_u, v8hi, v4si) VAR2 (TERNOP_UNONE_UNONE_UNONE_IMM, vshrnbq_n_u, v8hi, v4si) @@ -394,17 +394,17 @@ VAR2 (TERNOP_UNONE_UNONE_UNONE_IMM, vrshrnbq_n_u, v8hi, v4si) VAR2 (TERNOP_UNONE_UNONE_UNONE_IMM, vqshrntq_n_u, v8hi, v4si) VAR2 (TERNOP_UNONE_UNONE_UNONE_IMM, vqshrnbq_n_u, v8hi, v4si) VAR2 (TERNOP_UNONE_UNONE_UNONE_IMM, vqrshrntq_n_u, v8hi, v4si) -VAR2 (TERNOP_UNONE_UNONE_NONE_UNONE, vqmovuntq_m_s, v8hi, v4si) -VAR2 (TERNOP_UNONE_UNONE_NONE_UNONE, vqmovunbq_m_s, v8hi, v4si) -VAR2 (TERNOP_UNONE_UNONE_NONE_UNONE, vcvtq_m_from_f_u, v8hi, v4si) -VAR2 (TERNOP_UNONE_UNONE_NONE_UNONE, vcvtpq_m_u, v8hi, v4si) -VAR2 (TERNOP_UNONE_UNONE_NONE_UNONE, vcvtnq_m_u, v8hi, v4si) -VAR2 (TERNOP_UNONE_UNONE_NONE_UNONE, vcvtmq_m_u, v8hi, v4si) +VAR2 (TERNOP_UNONE_UNONE_NONE_PRED, vqmovuntq_m_s, v8hi, v4si) +VAR2 (TERNOP_UNONE_UNONE_NONE_PRED, vqmovunbq_m_s, v8hi, v4si) +VAR2 (TERNOP_UNONE_UNONE_NONE_PRED, vcvtq_m_from_f_u, v8hi, v4si) +VAR2 (TERNOP_UNONE_UNONE_NONE_PRED, vcvtpq_m_u, v8hi, v4si) +VAR2 (TERNOP_UNONE_UNONE_NONE_PRED, vcvtnq_m_u, v8hi, v4si) +VAR2 (TERNOP_UNONE_UNONE_NONE_PRED, vcvtmq_m_u, v8hi, v4si) VAR2 (TERNOP_UNONE_UNONE_NONE_IMM, vqshruntq_n_s, v8hi, v4si) VAR2 (TERNOP_UNONE_UNONE_NONE_IMM, vqshrunbq_n_s, v8hi, v4si) VAR2 (TERNOP_UNONE_UNONE_NONE_IMM, vqrshruntq_n_s, v8hi, v4si) -VAR2 (TERNOP_UNONE_UNONE_IMM_UNONE, vorrq_m_n_u, v8hi, v4si) -VAR2 (TERNOP_UNONE_UNONE_IMM_UNONE, vmvnq_m_n_u, v8hi, v4si) +VAR2 (TERNOP_UNONE_UNONE_IMM_PRED, vorrq_m_n_u, v8hi, v4si) +VAR2 (TERNOP_UNONE_UNONE_IMM_PRED, vmvnq_m_n_u, v8hi, v4si) VAR2 (TERNOP_PRED_NONE_NONE_PRED, vcmpneq_m_n_f, v8hf, v4sf) VAR2 (TERNOP_PRED_NONE_NONE_PRED, vcmpneq_m_f, v8hf, 
v4sf) VAR2 (TERNOP_PRED_NONE_NONE_PRED, vcmpltq_m_n_f, v8hf, v4sf) @@ -416,38 +416,38 @@ VAR2 (TERNOP_PRED_NONE_NONE_PRED, vcmpgtq_m_f, v8hf, v4sf) VAR2 (TERNOP_PRED_NONE_NONE_PRED, vcmpgeq_m_n_f, v8hf, v4sf) VAR2 (TERNOP_PRED_NONE_NONE_PRED, vcmpgeq_m_f, v8hf, v4sf) VAR2 (TERNOP_PRED_NONE_NONE_PRED, vcmpeqq_m_n_f, v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vrndxq_m_f, v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vrndq_m_f, v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vrndpq_m_f, v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vrndnq_m_f, v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vrndmq_m_f, v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vrndaq_m_f, v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vrev64q_m_f, v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vrev32q_m_s, v16qi, v8hi) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vqmovntq_m_s, v8hi, v4si) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vqmovnbq_m_s, v8hi, v4si) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vrndxq_m_f, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vrndq_m_f, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vrndpq_m_f, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vrndnq_m_f, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vrndmq_m_f, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vrndaq_m_f, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vrev64q_m_f, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vrev32q_m_s, v16qi, v8hi) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vqmovntq_m_s, v8hi, v4si) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vqmovnbq_m_s, v8hi, v4si) VAR2 (TERNOP_NONE_NONE_NONE_PRED, vpselq_f, v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vnegq_m_f, v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vmovntq_m_s, v8hi, v4si) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vmovnbq_m_s, v8hi, v4si) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vmovltq_m_s, v16qi, v8hi) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vmovlbq_m_s, v16qi, v8hi) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vmlsldavxq_p_s, v8hi, v4si) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vmlsldavq_p_s, v8hi, v4si) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vmlaldavxq_p_s, v8hi, v4si) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vmlaldavq_p_s, v8hi, v4si) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vminnmvq_p_f, v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vminnmavq_p_f, v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vminnmaq_m_f, v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vmaxnmvq_p_f, v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vmaxnmavq_p_f, v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vmaxnmaq_m_f, v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vdupq_m_n_f, v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vcvtq_m_from_f_s, v8hi, v4si) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vcvtpq_m_s, v8hi, v4si) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vcvtnq_m_s, v8hi, v4si) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vcvtmq_m_s, v8hi, v4si) -VAR2 (TERNOP_NONE_NONE_NONE_UNONE, vabsq_m_f, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vnegq_m_f, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vmovntq_m_s, v8hi, v4si) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vmovnbq_m_s, v8hi, v4si) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vmovltq_m_s, v16qi, v8hi) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vmovlbq_m_s, v16qi, v8hi) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vmlsldavxq_p_s, v8hi, v4si) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vmlsldavq_p_s, v8hi, v4si) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vmlaldavxq_p_s, v8hi, v4si) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vmlaldavq_p_s, v8hi, v4si) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vminnmvq_p_f, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vminnmavq_p_f, v8hf, 
v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vminnmaq_m_f, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vmaxnmvq_p_f, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vmaxnmavq_p_f, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vmaxnmaq_m_f, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vdupq_m_n_f, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vcvtq_m_from_f_s, v8hi, v4si) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vcvtpq_m_s, v8hi, v4si) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vcvtnq_m_s, v8hi, v4si) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vcvtmq_m_s, v8hi, v4si) +VAR2 (TERNOP_NONE_NONE_NONE_PRED, vabsq_m_f, v8hf, v4sf) VAR2 (TERNOP_NONE_NONE_NONE_NONE, vmlsldavaxq_s, v8hi, v4si) VAR2 (TERNOP_NONE_NONE_NONE_NONE, vmlsldavaq_s, v8hi, v4si) VAR2 (TERNOP_NONE_NONE_NONE_NONE, vmlaldavaxq_s, v8hi, v4si) @@ -463,8 +463,8 @@ VAR2 (TERNOP_NONE_NONE_NONE_IMM, vrshrnbq_n_s, v8hi, v4si) VAR2 (TERNOP_NONE_NONE_NONE_IMM, vqshrntq_n_s, v8hi, v4si) VAR2 (TERNOP_NONE_NONE_NONE_IMM, vqshrnbq_n_s, v8hi, v4si) VAR2 (TERNOP_NONE_NONE_NONE_IMM, vqrshrntq_n_s, v8hi, v4si) -VAR2 (TERNOP_NONE_NONE_IMM_UNONE, vorrq_m_n_s, v8hi, v4si) -VAR2 (TERNOP_NONE_NONE_IMM_UNONE, vmvnq_m_n_s, v8hi, v4si) +VAR2 (TERNOP_NONE_NONE_IMM_PRED, vorrq_m_n_s, v8hi, v4si) +VAR2 (TERNOP_NONE_NONE_IMM_PRED, vmvnq_m_n_s, v8hi, v4si) VAR1 (TERNOP_UNONE_UNONE_UNONE_UNONE, vrmlaldavhq_p_u, v4si) VAR1 (TERNOP_UNONE_UNONE_UNONE_UNONE, vrev16q_m_u, v16qi) VAR1 (TERNOP_UNONE_UNONE_UNONE_UNONE, vaddlvaq_p_u, v4si) @@ -482,189 +482,189 @@ VAR1 (TERNOP_NONE_NONE_NONE_UNONE, vaddlvaq_p_s, v4si) VAR1 (TERNOP_NONE_NONE_NONE_NONE, vrmlsldavhaxq_s, v4si) VAR1 (TERNOP_NONE_NONE_NONE_NONE, vrmlsldavhaq_s, v4si) VAR1 (TERNOP_NONE_NONE_NONE_NONE, vrmlaldavhaxq_s, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_IMM_UNONE, vsriq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vsriq_m_n_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vsubq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vsubq_m_u, v16qi, v8hi, v4si) -VAR2 (QUADOP_NONE_NONE_UNONE_IMM_UNONE, vcvtq_m_n_to_f_u, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_IMM_UNONE, vcvtq_m_n_to_f_s, v8hf, v4sf) -VAR3 (QUADOP_UNONE_UNONE_NONE_IMM_UNONE, vqshluq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_NONE_NONE_UNONE, vabavq_p_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vabavq_p_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_NONE_UNONE, vshlq_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vshlq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vsubq_m_n_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vrmulhq_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vrhaddq_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vqsubq_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vqsubq_m_n_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vqaddq_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vqaddq_m_n_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vorrq_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vornq_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vmulq_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vmulq_m_n_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vmulltq_int_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vmullbq_int_m_u, v16qi, v8hi, v4si) -VAR3 
(QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vmulhq_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vmlasq_m_n_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vmlaq_m_n_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vmladavaq_p_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vminq_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vmaxq_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vhsubq_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vhsubq_m_n_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vhaddq_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vhaddq_m_n_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, veorq_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vcaddq_rot90_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vcaddq_rot270_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vbicq_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vandq_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vaddq_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vaddq_m_n_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vabdq_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_NONE_UNONE, vrshlq_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_NONE_UNONE, vqshlq_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_NONE_UNONE, vqrshlq_m_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_NONE_UNONE, vbrsrq_m_n_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vsliq_m_n_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vshrq_m_n_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vshlq_m_n_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vrshrq_m_n_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vqshlq_m_n_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vsubq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vrshlq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vrmulhq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vrhaddq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqsubq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqsubq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqshlq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqrshlq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqrdmulhq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqrdmulhq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqrdmlsdhxq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqrdmlsdhq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqrdmlashq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqrdmlahq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqrdmladhxq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqrdmladhq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqdmulhq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqdmulhq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqdmlsdhxq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqdmlsdhq_m_s, v16qi, v8hi, v4si) 
-VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqdmlahq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqdmlashq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqdmladhxq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqdmladhq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqaddq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqaddq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vorrq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vornq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vmulq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vmulq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vmulltq_int_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vmullbq_int_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vmulhq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vmlsdavaxq_p_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vmlsdavaq_p_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vmlasq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vmlaq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vmladavaxq_p_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vmladavaq_p_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vminq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vmaxq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vhsubq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vhsubq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vhcaddq_rot90_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vhcaddq_rot270_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vhaddq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vhaddq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, veorq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vcaddq_rot90_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vcaddq_rot270_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vbrsrq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vbicq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vandq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vaddq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vaddq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vabdq_m_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_IMM_UNONE, vsliq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_IMM_UNONE, vshrq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_IMM_UNONE, vshlq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_IMM_UNONE, vrshrq_m_n_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_NONE_IMM_UNONE, vqshlq_m_n_s, v16qi, v8hi, v4si) -VAR2 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vmulltq_poly_m_p, v16qi, v8hi) -VAR2 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vmullbq_poly_m_p, v16qi, v8hi) -VAR2 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vmlaldavaq_p_u, v8hi, v4si) -VAR2 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vshrntq_m_n_u, v8hi, v4si) -VAR2 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vshrnbq_m_n_u, v8hi, v4si) -VAR2 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vshlltq_m_n_u, v16qi, v8hi) -VAR2 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vshllbq_m_n_u, v16qi, v8hi) -VAR2 
(QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vrshrntq_m_n_u, v8hi, v4si) -VAR2 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vrshrnbq_m_n_u, v8hi, v4si) -VAR2 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vqshrntq_m_n_u, v8hi, v4si) -VAR2 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vqshrnbq_m_n_u, v8hi, v4si) -VAR2 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vqrshrntq_m_n_u, v8hi, v4si) -VAR2 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vqrshrnbq_m_n_u, v8hi, v4si) -VAR2 (QUADOP_UNONE_UNONE_NONE_IMM_UNONE, vqshruntq_m_n_s, v8hi, v4si) -VAR2 (QUADOP_UNONE_UNONE_NONE_IMM_UNONE, vqshrunbq_m_n_s, v8hi, v4si) -VAR2 (QUADOP_UNONE_UNONE_NONE_IMM_UNONE, vqrshruntq_m_n_s, v8hi, v4si) -VAR2 (QUADOP_UNONE_UNONE_NONE_IMM_UNONE, vqrshrunbq_m_n_s, v8hi, v4si) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqdmulltq_m_s, v8hi, v4si) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqdmulltq_m_n_s, v8hi, v4si) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqdmullbq_m_s, v8hi, v4si) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqdmullbq_m_n_s, v8hi, v4si) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vmlsldavaxq_p_s, v8hi, v4si) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vmlsldavaq_p_s, v8hi, v4si) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vmlaldavaxq_p_s, v8hi, v4si) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vmlaldavaq_p_s, v8hi, v4si) -VAR2 (QUADOP_NONE_NONE_NONE_IMM_UNONE, vshrntq_m_n_s, v8hi, v4si) -VAR2 (QUADOP_NONE_NONE_NONE_IMM_UNONE, vshrnbq_m_n_s, v8hi, v4si) -VAR2 (QUADOP_NONE_NONE_NONE_IMM_UNONE, vshlltq_m_n_s, v16qi, v8hi) -VAR2 (QUADOP_NONE_NONE_NONE_IMM_UNONE, vshllbq_m_n_s, v16qi, v8hi) -VAR2 (QUADOP_NONE_NONE_NONE_IMM_UNONE, vrshrntq_m_n_s, v8hi, v4si) -VAR2 (QUADOP_NONE_NONE_NONE_IMM_UNONE, vrshrnbq_m_n_s, v8hi, v4si) -VAR2 (QUADOP_NONE_NONE_NONE_IMM_UNONE, vqshrntq_m_n_s, v8hi, v4si) -VAR2 (QUADOP_NONE_NONE_NONE_IMM_UNONE, vqshrnbq_m_n_s, v8hi, v4si) -VAR2 (QUADOP_NONE_NONE_NONE_IMM_UNONE, vqrshrntq_m_n_s, v8hi, v4si) -VAR2 (QUADOP_NONE_NONE_NONE_IMM_UNONE, vqrshrnbq_m_n_s, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_IMM_PRED, vsriq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vsriq_m_n_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vsubq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vsubq_m_u, v16qi, v8hi, v4si) +VAR2 (QUADOP_NONE_NONE_UNONE_IMM_PRED, vcvtq_m_n_to_f_u, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_IMM_PRED, vcvtq_m_n_to_f_s, v8hf, v4sf) +VAR3 (QUADOP_UNONE_UNONE_NONE_IMM_PRED, vqshluq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_NONE_NONE_PRED, vabavq_p_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vabavq_p_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_NONE_PRED, vshlq_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vshlq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vsubq_m_n_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vrmulhq_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vrhaddq_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vqsubq_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vqsubq_m_n_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vqaddq_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vqaddq_m_n_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vorrq_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vornq_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vmulq_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vmulq_m_n_u, v16qi, 
v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vmulltq_int_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vmullbq_int_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vmulhq_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vmlasq_m_n_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vmlaq_m_n_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vmladavaq_p_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vminq_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vmaxq_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vhsubq_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vhsubq_m_n_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vhaddq_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vhaddq_m_n_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, veorq_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vcaddq_rot90_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vcaddq_rot270_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vbicq_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vandq_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vaddq_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vaddq_m_n_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vabdq_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_NONE_PRED, vrshlq_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_NONE_PRED, vqshlq_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_NONE_PRED, vqrshlq_m_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_NONE_PRED, vbrsrq_m_n_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vsliq_m_n_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vshrq_m_n_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vshlq_m_n_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vrshrq_m_n_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vqshlq_m_n_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vsubq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vrshlq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vrmulhq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vrhaddq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqsubq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqsubq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqshlq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqrshlq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqrdmulhq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqrdmulhq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqrdmlsdhxq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqrdmlsdhq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqrdmlashq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqrdmlahq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqrdmladhxq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqrdmladhq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqdmulhq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqdmulhq_m_n_s, v16qi, v8hi, v4si) +VAR3 
(QUADOP_NONE_NONE_NONE_NONE_PRED, vqdmlsdhxq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqdmlsdhq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqdmlahq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqdmlashq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqdmladhxq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqdmladhq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqaddq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqaddq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vorrq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vornq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vmulq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vmulq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vmulltq_int_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vmullbq_int_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vmulhq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vmlsdavaxq_p_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vmlsdavaq_p_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vmlasq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vmlaq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vmladavaxq_p_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vmladavaq_p_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vminq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vmaxq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vhsubq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vhsubq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vhcaddq_rot90_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vhcaddq_rot270_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vhaddq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vhaddq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, veorq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vcaddq_rot90_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vcaddq_rot270_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vbrsrq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vbicq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vandq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vaddq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vaddq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vabdq_m_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_IMM_PRED, vsliq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_IMM_PRED, vshrq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_IMM_PRED, vshlq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_IMM_PRED, vrshrq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_IMM_PRED, vqshlq_m_n_s, v16qi, v8hi, v4si) +VAR2 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vmulltq_poly_m_p, v16qi, v8hi) +VAR2 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vmullbq_poly_m_p, v16qi, v8hi) +VAR2 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vmlaldavaq_p_u, v8hi, v4si) +VAR2 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vshrntq_m_n_u, v8hi, v4si) +VAR2 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vshrnbq_m_n_u, v8hi, v4si) +VAR2 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vshlltq_m_n_u, v16qi, 
v8hi) +VAR2 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vshllbq_m_n_u, v16qi, v8hi) +VAR2 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vrshrntq_m_n_u, v8hi, v4si) +VAR2 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vrshrnbq_m_n_u, v8hi, v4si) +VAR2 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vqshrntq_m_n_u, v8hi, v4si) +VAR2 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vqshrnbq_m_n_u, v8hi, v4si) +VAR2 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vqrshrntq_m_n_u, v8hi, v4si) +VAR2 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vqrshrnbq_m_n_u, v8hi, v4si) +VAR2 (QUADOP_UNONE_UNONE_NONE_IMM_PRED, vqshruntq_m_n_s, v8hi, v4si) +VAR2 (QUADOP_UNONE_UNONE_NONE_IMM_PRED, vqshrunbq_m_n_s, v8hi, v4si) +VAR2 (QUADOP_UNONE_UNONE_NONE_IMM_PRED, vqrshruntq_m_n_s, v8hi, v4si) +VAR2 (QUADOP_UNONE_UNONE_NONE_IMM_PRED, vqrshrunbq_m_n_s, v8hi, v4si) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqdmulltq_m_s, v8hi, v4si) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqdmulltq_m_n_s, v8hi, v4si) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqdmullbq_m_s, v8hi, v4si) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vqdmullbq_m_n_s, v8hi, v4si) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vmlsldavaxq_p_s, v8hi, v4si) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vmlsldavaq_p_s, v8hi, v4si) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vmlaldavaxq_p_s, v8hi, v4si) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vmlaldavaq_p_s, v8hi, v4si) +VAR2 (QUADOP_NONE_NONE_NONE_IMM_PRED, vshrntq_m_n_s, v8hi, v4si) +VAR2 (QUADOP_NONE_NONE_NONE_IMM_PRED, vshrnbq_m_n_s, v8hi, v4si) +VAR2 (QUADOP_NONE_NONE_NONE_IMM_PRED, vshlltq_m_n_s, v16qi, v8hi) +VAR2 (QUADOP_NONE_NONE_NONE_IMM_PRED, vshllbq_m_n_s, v16qi, v8hi) +VAR2 (QUADOP_NONE_NONE_NONE_IMM_PRED, vrshrntq_m_n_s, v8hi, v4si) +VAR2 (QUADOP_NONE_NONE_NONE_IMM_PRED, vrshrnbq_m_n_s, v8hi, v4si) +VAR2 (QUADOP_NONE_NONE_NONE_IMM_PRED, vqshrntq_m_n_s, v8hi, v4si) +VAR2 (QUADOP_NONE_NONE_NONE_IMM_PRED, vqshrnbq_m_n_s, v8hi, v4si) +VAR2 (QUADOP_NONE_NONE_NONE_IMM_PRED, vqrshrntq_m_n_s, v8hi, v4si) +VAR2 (QUADOP_NONE_NONE_NONE_IMM_PRED, vqrshrnbq_m_n_s, v8hi, v4si) VAR1 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vrmlaldavhaq_p_u, v4si) VAR1 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vrmlsldavhaxq_p_s, v4si) VAR1 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vrmlsldavhaq_p_s, v4si) VAR1 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vrmlaldavhaxq_p_s, v4si) VAR1 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vrmlaldavhaq_p_s, v4si) -VAR2 (QUADOP_UNONE_UNONE_NONE_IMM_UNONE, vcvtq_m_n_from_f_u, v8hi, v4si) -VAR2 (QUADOP_NONE_NONE_NONE_IMM_UNONE, vcvtq_m_n_from_f_s, v8hi, v4si) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vbrsrq_m_n_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vsubq_m_n_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vsubq_m_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vorrq_m_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vornq_m_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vmulq_m_n_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vmulq_m_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vminnmq_m_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vmaxnmq_m_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vfmsq_m_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vfmasq_m_n_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vfmaq_m_n_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vfmaq_m_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, veorq_m_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vcmulq_rot90_m_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vcmulq_rot270_m_f, v8hf, v4sf) -VAR2 
(QUADOP_NONE_NONE_NONE_NONE_UNONE, vcmulq_rot180_m_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vcmulq_m_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vcmlaq_rot90_m_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vcmlaq_rot270_m_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vcmlaq_rot180_m_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vcmlaq_m_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vcaddq_rot90_m_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vcaddq_rot270_m_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vbicq_m_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vandq_m_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vaddq_m_n_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vaddq_m_f, v8hf, v4sf) -VAR2 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vabdq_m_f, v8hf, v4sf) +VAR2 (QUADOP_UNONE_UNONE_NONE_IMM_PRED, vcvtq_m_n_from_f_u, v8hi, v4si) +VAR2 (QUADOP_NONE_NONE_NONE_IMM_PRED, vcvtq_m_n_from_f_s, v8hi, v4si) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vbrsrq_m_n_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vsubq_m_n_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vsubq_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vorrq_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vornq_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vmulq_m_n_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vmulq_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vminnmq_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vmaxnmq_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vfmsq_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vfmasq_m_n_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vfmaq_m_n_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vfmaq_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, veorq_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vcmulq_rot90_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vcmulq_rot270_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vcmulq_rot180_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vcmulq_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vcmlaq_rot90_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vcmlaq_rot270_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vcmlaq_rot180_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vcmlaq_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vcaddq_rot90_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vcaddq_rot270_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vbicq_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vandq_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vaddq_m_n_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vaddq_m_f, v8hf, v4sf) +VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vabdq_m_f, v8hf, v4sf) VAR3 (STRS, vstrbq_s, v16qi, v8hi, v4si) VAR3 (STRU, vstrbq_u, v16qi, v8hi, v4si) VAR3 (STRSS, vstrbq_scatter_offset_s, v16qi, v8hi, v4si) @@ -797,14 +797,14 @@ VAR1 (STRSU_P, vstrwq_scatter_offset_p_u, v4si) VAR1 (STRSU_P, vstrwq_scatter_shifted_offset_p_u, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_IMM, viwdupq_wb_u, v16qi, v4si, v8hi) VAR3 (TERNOP_UNONE_UNONE_UNONE_IMM, vdwdupq_wb_u, v16qi, v4si, v8hi) -VAR3 (QUINOP_UNONE_UNONE_UNONE_UNONE_IMM_UNONE, viwdupq_m_wb_u, v16qi, v8hi, v4si) -VAR3 (QUINOP_UNONE_UNONE_UNONE_UNONE_IMM_UNONE, vdwdupq_m_wb_u, v16qi, v8hi, v4si) -VAR3 (QUINOP_UNONE_UNONE_UNONE_UNONE_IMM_UNONE, 
viwdupq_m_n_u, v16qi, v8hi, v4si) -VAR3 (QUINOP_UNONE_UNONE_UNONE_UNONE_IMM_UNONE, vdwdupq_m_n_u, v16qi, v8hi, v4si) +VAR3 (QUINOP_UNONE_UNONE_UNONE_UNONE_IMM_PRED, viwdupq_m_wb_u, v16qi, v8hi, v4si) +VAR3 (QUINOP_UNONE_UNONE_UNONE_UNONE_IMM_PRED, vdwdupq_m_wb_u, v16qi, v8hi, v4si) +VAR3 (QUINOP_UNONE_UNONE_UNONE_UNONE_IMM_PRED, viwdupq_m_n_u, v16qi, v8hi, v4si) +VAR3 (QUINOP_UNONE_UNONE_UNONE_UNONE_IMM_PRED, vdwdupq_m_n_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_IMM, vddupq_n_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_IMM, vidupq_n_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vddupq_m_n_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vidupq_m_n_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vddupq_m_n_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vidupq_m_n_u, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_IMM, vdwdupq_n_u, v16qi, v4si, v8hi) VAR3 (TERNOP_UNONE_UNONE_UNONE_IMM, viwdupq_n_u, v16qi, v4si, v8hi) VAR1 (STRSBWBU, vstrwq_scatter_base_wb_u, v4si) @@ -870,10 +870,10 @@ VAR1 (UQSHL, urshr_, si) VAR1 (UQSHL, urshrl_, di) VAR1 (UQSHL, uqshl_, si) VAR1 (UQSHL, uqshll_, di) -VAR3 (QUADOP_NONE_NONE_UNONE_IMM_UNONE, vshlcq_m_vec_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_UNONE_IMM_UNONE, vshlcq_m_carry_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vshlcq_m_vec_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vshlcq_m_carry_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_UNONE_IMM_PRED, vshlcq_m_vec_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_UNONE_IMM_PRED, vshlcq_m_carry_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vshlcq_m_vec_u, v16qi, v8hi, v4si) +VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vshlcq_m_carry_u, v16qi, v8hi, v4si) /* optabs without any suffixes. 
*/ VAR5 (BINOP_NONE_NONE_NONE, vcaddq_rot90, v16qi, v8hi, v4si, v8hf, v4sf) diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index 12f05b3..5d51da1 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -130,7 +130,7 @@ (set (match_operand:MVE_0 0 "s_register_operand" "=w") (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VRNDQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -918,7 +918,7 @@ [ (set (match_operand:SI 0 "s_register_operand" "=Te") (unspec:SI [(match_operand:MVE_2 1 "s_register_operand" "w") - (match_operand:HI 2 "vpr_register_operand" "Up")] + (match_operand: 2 "vpr_register_operand" "Up")] VADDVQ_P)) ] "TARGET_HAVE_MVE" @@ -2581,7 +2581,7 @@ (set (match_operand:MVE_5 0 "s_register_operand" "=w") (unspec:MVE_5 [(match_operand:MVE_5 1 "s_register_operand" "0") (match_operand:SI 2 "immediate_operand" "i") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VBICQ_M_N)) ] "TARGET_HAVE_MVE" @@ -2611,7 +2611,7 @@ (set (match_operand:MVE_5 0 "s_register_operand" "=w") (unspec:MVE_5 [(match_operand:MVE_5 1 "s_register_operand" "0") (match_operand: 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCVTAQ_M)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -2626,7 +2626,7 @@ (set (match_operand:MVE_0 0 "s_register_operand" "=w") (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand: 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCVTQ_M_TO_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -2748,7 +2748,7 @@ (set (match_operand:MVE_2 0 "s_register_operand" "=w") (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VABSQ_M_S)) ] "TARGET_HAVE_MVE" @@ -2764,7 +2764,7 @@ (set (match_operand:SI 0 "s_register_operand" "=Te") (unspec:SI [(match_operand:SI 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VADDVAQ_P)) ] "TARGET_HAVE_MVE" @@ -2780,7 +2780,7 @@ (set (match_operand:MVE_2 0 "s_register_operand" "=w") (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCLSQ_M_S)) ] "TARGET_HAVE_MVE" @@ -2796,7 +2796,7 @@ (set (match_operand:MVE_2 0 "s_register_operand" "=w") (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCLZQ_M)) ] "TARGET_HAVE_MVE" @@ -3068,7 +3068,7 @@ (set (match_operand:MVE_2 0 "s_register_operand" "=w") (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand: 2 "s_register_operand" "r") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VDUPQ_M_N)) ] "TARGET_HAVE_MVE" @@ -3084,7 +3084,7 @@ (set (match_operand:MVE_2 0 "s_register_operand" "=w") (unspec:MVE_2 
[(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMAXAQ_M_S)) ] "TARGET_HAVE_MVE" @@ -3100,7 +3100,7 @@ (set (match_operand: 0 "s_register_operand" "=r") (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMAXAVQ_P_S)) ] "TARGET_HAVE_MVE" @@ -3116,7 +3116,7 @@ (set (match_operand: 0 "s_register_operand" "=r") (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMAXVQ_P)) ] "TARGET_HAVE_MVE" @@ -3132,7 +3132,7 @@ (set (match_operand:MVE_2 0 "s_register_operand" "=w") (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMINAQ_M_S)) ] "TARGET_HAVE_MVE" @@ -3148,7 +3148,7 @@ (set (match_operand: 0 "s_register_operand" "=r") (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMINAVQ_P_S)) ] "TARGET_HAVE_MVE" @@ -3164,7 +3164,7 @@ (set (match_operand: 0 "s_register_operand" "=r") (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMINVQ_P)) ] "TARGET_HAVE_MVE" @@ -3196,7 +3196,7 @@ (set (match_operand:SI 0 "s_register_operand" "=Te") (unspec:SI [(match_operand:MVE_2 1 "s_register_operand" "w") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMLADAVQ_P)) ] "TARGET_HAVE_MVE" @@ -3212,7 +3212,7 @@ (set (match_operand:SI 0 "s_register_operand" "=Te") (unspec:SI [(match_operand:MVE_2 1 "s_register_operand" "w") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMLADAVXQ_P_S)) ] "TARGET_HAVE_MVE" @@ -3260,7 +3260,7 @@ (set (match_operand:SI 0 "s_register_operand" "=Te") (unspec:SI [(match_operand:MVE_2 1 "s_register_operand" "w") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMLSDAVQ_P_S)) ] "TARGET_HAVE_MVE" @@ -3276,7 +3276,7 @@ (set (match_operand:SI 0 "s_register_operand" "=Te") (unspec:SI [(match_operand:MVE_2 1 "s_register_operand" "w") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMLSDAVXQ_P_S)) ] "TARGET_HAVE_MVE" @@ -3292,7 +3292,7 @@ (set (match_operand:MVE_2 0 "s_register_operand" "=w") (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMVNQ_M)) ] "TARGET_HAVE_MVE" @@ -3308,7 +3308,7 @@ (set (match_operand:MVE_2 0 "s_register_operand" "=w") (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") 
(match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VNEGQ_M_S)) ] "TARGET_HAVE_MVE" @@ -3340,7 +3340,7 @@ (set (match_operand:MVE_2 0 "s_register_operand" "=w") (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VQABSQ_M_S)) ] "TARGET_HAVE_MVE" @@ -3388,7 +3388,7 @@ (set (match_operand:MVE_2 0 "s_register_operand" "=w") (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VQNEGQ_M_S)) ] "TARGET_HAVE_MVE" @@ -3500,7 +3500,7 @@ (set (match_operand:MVE_2 0 "s_register_operand" "=w") (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:SI 2 "s_register_operand" "r") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VQRSHLQ_M_N)) ] "TARGET_HAVE_MVE" @@ -3516,7 +3516,7 @@ (set (match_operand:MVE_2 0 "s_register_operand" "=w") (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:SI 2 "s_register_operand" "r") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VQSHLQ_M_R)) ] "TARGET_HAVE_MVE" @@ -3532,7 +3532,7 @@ (set (match_operand:MVE_2 0 "s_register_operand" "=w") (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VREV64Q_M)) ] "TARGET_HAVE_MVE" @@ -3548,7 +3548,7 @@ (set (match_operand:MVE_2 0 "s_register_operand" "=w") (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:SI 2 "s_register_operand" "r") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VRSHLQ_M_N)) ] "TARGET_HAVE_MVE" @@ -3564,7 +3564,7 @@ (set (match_operand:MVE_2 0 "s_register_operand" "=w") (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:SI 2 "s_register_operand" "r") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VSHLQ_M_R)) ] "TARGET_HAVE_MVE" @@ -3723,7 +3723,7 @@ (set (match_operand:MVE_0 0 "s_register_operand" "=w") (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VABSQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -4013,7 +4013,7 @@ (set (match_operand:MVE_0 0 "s_register_operand" "=w") (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand: 2 "s_register_operand" "r") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VDUPQ_M_N_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -4092,7 +4092,7 @@ (set (match_operand:MVE_0 0 "s_register_operand" "=w") (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMAXNMAQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -4107,7 +4107,7 @@ (set (match_operand: 0 
"s_register_operand" "=r") (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMAXNMAVQ_P_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -4123,7 +4123,7 @@ (set (match_operand: 0 "s_register_operand" "=r") (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMAXNMVQ_P_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -4138,7 +4138,7 @@ (set (match_operand:MVE_0 0 "s_register_operand" "=w") (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMINNMAQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -4154,7 +4154,7 @@ (set (match_operand: 0 "s_register_operand" "=r") (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMINNMAVQ_P_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -4169,7 +4169,7 @@ (set (match_operand: 0 "s_register_operand" "=r") (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMINNMVQ_P_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -4217,7 +4217,7 @@ (set (match_operand:DI 0 "s_register_operand" "=r") (unspec:DI [(match_operand:MVE_5 1 "s_register_operand" "w") (match_operand:MVE_5 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMLALDAVQ_P)) ] "TARGET_HAVE_MVE" @@ -4233,7 +4233,7 @@ (set (match_operand:DI 0 "s_register_operand" "=r") (unspec:DI [(match_operand:MVE_5 1 "s_register_operand" "w") (match_operand:MVE_5 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMLALDAVXQ_P_S)) ] "TARGET_HAVE_MVE" @@ -4280,7 +4280,7 @@ (set (match_operand:DI 0 "s_register_operand" "=r") (unspec:DI [(match_operand:MVE_5 1 "s_register_operand" "w") (match_operand:MVE_5 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMLSLDAVQ_P_S)) ] "TARGET_HAVE_MVE" @@ -4296,7 +4296,7 @@ (set (match_operand:DI 0 "s_register_operand" "=r") (unspec:DI [(match_operand:MVE_5 1 "s_register_operand" "w") (match_operand:MVE_5 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMLSLDAVXQ_P_S)) ] "TARGET_HAVE_MVE" @@ -4311,7 +4311,7 @@ (set (match_operand: 0 "s_register_operand" "=w") (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_3 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMOVLBQ_M)) ] "TARGET_HAVE_MVE" @@ -4326,7 +4326,7 @@ (set (match_operand: 0 "s_register_operand" "=w") (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_3 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMOVLTQ_M)) ] 
"TARGET_HAVE_MVE" @@ -4341,7 +4341,7 @@ (set (match_operand: 0 "s_register_operand" "=w") (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMOVNBQ_M)) ] "TARGET_HAVE_MVE" @@ -4357,7 +4357,7 @@ (set (match_operand: 0 "s_register_operand" "=w") (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMOVNTQ_M)) ] "TARGET_HAVE_MVE" @@ -4373,7 +4373,7 @@ (set (match_operand:MVE_5 0 "s_register_operand" "=w") (unspec:MVE_5 [(match_operand:MVE_5 1 "s_register_operand" "0") (match_operand:SI 2 "immediate_operand" "i") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VMVNQ_M_N)) ] "TARGET_HAVE_MVE" @@ -4388,7 +4388,7 @@ (set (match_operand:MVE_0 0 "s_register_operand" "=w") (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VNEGQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -4404,7 +4404,7 @@ (set (match_operand:MVE_5 0 "s_register_operand" "=w") (unspec:MVE_5 [(match_operand:MVE_5 1 "s_register_operand" "0") (match_operand:SI 2 "immediate_operand" "i") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VORRQ_M_N)) ] "TARGET_HAVE_MVE" @@ -4435,7 +4435,7 @@ (set (match_operand: 0 "s_register_operand" "=w") (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VQMOVNBQ_M)) ] "TARGET_HAVE_MVE" @@ -4451,7 +4451,7 @@ (set (match_operand: 0 "s_register_operand" "=w") (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VQMOVNTQ_M)) ] "TARGET_HAVE_MVE" @@ -4467,7 +4467,7 @@ (set (match_operand: 0 "s_register_operand" "=w") (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VQMOVUNBQ_M_S)) ] "TARGET_HAVE_MVE" @@ -4483,7 +4483,7 @@ (set (match_operand: 0 "s_register_operand" "=w") (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VQMOVUNTQ_M_S)) ] "TARGET_HAVE_MVE" @@ -4611,7 +4611,7 @@ (set (match_operand:MVE_3 0 "s_register_operand" "=w") (unspec:MVE_3 [(match_operand:MVE_3 1 "s_register_operand" "0") (match_operand:MVE_3 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VREV32Q_M)) ] "TARGET_HAVE_MVE" @@ -4627,7 +4627,7 @@ (set (match_operand:MVE_0 0 "s_register_operand" "=w") (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VREV64Q_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ 
-4723,7 +4723,7 @@ (set (match_operand:MVE_0 0 "s_register_operand" "=w") (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VRNDAQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -4739,7 +4739,7 @@ (set (match_operand:MVE_0 0 "s_register_operand" "=w") (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VRNDMQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -4755,7 +4755,7 @@ (set (match_operand:MVE_0 0 "s_register_operand" "=w") (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VRNDNQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -4771,7 +4771,7 @@ (set (match_operand:MVE_0 0 "s_register_operand" "=w") (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VRNDPQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -4787,7 +4787,7 @@ (set (match_operand:MVE_0 0 "s_register_operand" "=w") (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VRNDXQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -4867,7 +4867,7 @@ (set (match_operand:MVE_5 0 "s_register_operand" "=w") (unspec:MVE_5 [(match_operand:MVE_5 1 "s_register_operand" "0") (match_operand: 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCVTMQ_M)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -4883,7 +4883,7 @@ (set (match_operand:MVE_5 0 "s_register_operand" "=w") (unspec:MVE_5 [(match_operand:MVE_5 1 "s_register_operand" "0") (match_operand: 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCVTPQ_M)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -4899,7 +4899,7 @@ (set (match_operand:MVE_5 0 "s_register_operand" "=w") (unspec:MVE_5 [(match_operand:MVE_5 1 "s_register_operand" "0") (match_operand: 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCVTNQ_M)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -4916,7 +4916,7 @@ (unspec:MVE_5 [(match_operand:MVE_5 1 "s_register_operand" "0") (match_operand: 2 "s_register_operand" "w") (match_operand:SI 3 "" "") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VCVTQ_M_N_FROM_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -4948,7 +4948,7 @@ (set (match_operand:MVE_5 0 "s_register_operand" "=w") (unspec:MVE_5 [(match_operand:MVE_5 1 "s_register_operand" "0") (match_operand: 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCVTQ_M_FROM_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -4997,7 +4997,7 @@ (unspec:SI [(match_operand:SI 1 "s_register_operand" "0") (match_operand:MVE_2 2 
"s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VABAVQ_P)) ] "TARGET_HAVE_MVE" @@ -5014,7 +5014,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:SI 3 "mve_imm_7" "Ra") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQSHLUQ_M_N_S)) ] "TARGET_HAVE_MVE" @@ -5030,7 +5030,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VSHLQ_M)) ] "TARGET_HAVE_MVE" @@ -5046,7 +5046,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:SI 3 "mve_imm_selective_upto_8" "Rg") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VSRIQ_M_N)) ] "TARGET_HAVE_MVE" @@ -5062,7 +5062,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VSUBQ_M)) ] "TARGET_HAVE_MVE" @@ -5078,7 +5078,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand: 2 "s_register_operand" "w") (match_operand:SI 3 "" "") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VCVTQ_M_N_TO_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -5094,7 +5094,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VABDQ_M)) ] "TARGET_HAVE_MVE" @@ -5111,7 +5111,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VADDQ_M_N)) ] "TARGET_HAVE_MVE" @@ -5128,7 +5128,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VADDQ_M)) ] "TARGET_HAVE_MVE" @@ -5145,7 +5145,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VANDQ_M)) ] "TARGET_HAVE_MVE" @@ -5162,7 +5162,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VBICQ_M)) ] "TARGET_HAVE_MVE" @@ -5179,7 +5179,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:SI 3 "s_register_operand" "r") - 
(match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VBRSRQ_M_N)) ] "TARGET_HAVE_MVE" @@ -5196,7 +5196,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VCADDQ_ROT270_M)) ] "TARGET_HAVE_MVE" @@ -5213,7 +5213,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VCADDQ_ROT90_M)) ] "TARGET_HAVE_MVE" @@ -5230,7 +5230,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VEORQ_M)) ] "TARGET_HAVE_MVE" @@ -5247,7 +5247,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VHADDQ_M_N)) ] "TARGET_HAVE_MVE" @@ -5264,7 +5264,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VHADDQ_M)) ] "TARGET_HAVE_MVE" @@ -5281,7 +5281,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VHSUBQ_M_N)) ] "TARGET_HAVE_MVE" @@ -5298,7 +5298,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VHSUBQ_M)) ] "TARGET_HAVE_MVE" @@ -5315,7 +5315,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMAXQ_M)) ] "TARGET_HAVE_MVE" @@ -5332,7 +5332,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMINQ_M)) ] "TARGET_HAVE_MVE" @@ -5349,7 +5349,7 @@ (unspec:SI [(match_operand:SI 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMLADAVAQ_P)) ] "TARGET_HAVE_MVE" @@ -5366,7 +5366,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 
"vpr_register_operand" "Up")] VMLAQ_M_N)) ] "TARGET_HAVE_MVE" @@ -5383,7 +5383,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMLASQ_M_N)) ] "TARGET_HAVE_MVE" @@ -5400,7 +5400,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMULHQ_M)) ] "TARGET_HAVE_MVE" @@ -5417,7 +5417,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMULLBQ_INT_M)) ] "TARGET_HAVE_MVE" @@ -5434,7 +5434,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMULLTQ_INT_M)) ] "TARGET_HAVE_MVE" @@ -5451,7 +5451,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMULQ_M_N)) ] "TARGET_HAVE_MVE" @@ -5468,7 +5468,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMULQ_M)) ] "TARGET_HAVE_MVE" @@ -5485,7 +5485,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VORNQ_M)) ] "TARGET_HAVE_MVE" @@ -5502,7 +5502,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VORRQ_M)) ] "TARGET_HAVE_MVE" @@ -5519,7 +5519,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQADDQ_M_N)) ] "TARGET_HAVE_MVE" @@ -5536,7 +5536,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQADDQ_M)) ] "TARGET_HAVE_MVE" @@ -5553,7 +5553,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQDMLAHQ_M_N_S)) ] "TARGET_HAVE_MVE" @@ -5570,7 +5570,7 @@ 
(unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQDMLASHQ_M_N_S)) ] "TARGET_HAVE_MVE" @@ -5587,7 +5587,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQRDMLAHQ_M_N_S)) ] "TARGET_HAVE_MVE" @@ -5604,7 +5604,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQRDMLASHQ_M_N_S)) ] "TARGET_HAVE_MVE" @@ -5621,7 +5621,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQRSHLQ_M)) ] "TARGET_HAVE_MVE" @@ -5638,7 +5638,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:SI 3 "immediate_operand" "i") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQSHLQ_M_N)) ] "TARGET_HAVE_MVE" @@ -5655,7 +5655,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQSHLQ_M)) ] "TARGET_HAVE_MVE" @@ -5672,7 +5672,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQSUBQ_M_N)) ] "TARGET_HAVE_MVE" @@ -5689,7 +5689,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQSUBQ_M)) ] "TARGET_HAVE_MVE" @@ -5706,7 +5706,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VRHADDQ_M)) ] "TARGET_HAVE_MVE" @@ -5723,7 +5723,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VRMULHQ_M)) ] "TARGET_HAVE_MVE" @@ -5740,7 +5740,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VRSHLQ_M)) ] "TARGET_HAVE_MVE" @@ -5757,7 +5757,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" 
"0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:SI 3 "" "") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VRSHRQ_M_N)) ] "TARGET_HAVE_MVE" @@ -5774,7 +5774,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:SI 3 "immediate_operand" "i") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VSHLQ_M_N)) ] "TARGET_HAVE_MVE" @@ -5791,7 +5791,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:SI 3 "" "") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VSHRQ_M_N)) ] "TARGET_HAVE_MVE" @@ -5808,7 +5808,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:SI 3 "" "") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VSLIQ_M_N)) ] "TARGET_HAVE_MVE" @@ -5825,7 +5825,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VSUBQ_M_N)) ] "TARGET_HAVE_MVE" @@ -5842,7 +5842,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VHCADDQ_ROT270_M_S)) ] "TARGET_HAVE_MVE" @@ -5859,7 +5859,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VHCADDQ_ROT90_M_S)) ] "TARGET_HAVE_MVE" @@ -5876,7 +5876,7 @@ (unspec:SI [(match_operand:SI 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMLADAVAXQ_P_S)) ] "TARGET_HAVE_MVE" @@ -5893,7 +5893,7 @@ (unspec:SI [(match_operand:SI 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMLSDAVAQ_P_S)) ] "TARGET_HAVE_MVE" @@ -5910,7 +5910,7 @@ (unspec:SI [(match_operand:SI 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMLSDAVAXQ_P_S)) ] "TARGET_HAVE_MVE" @@ -5927,7 +5927,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQDMLADHQ_M_S)) ] "TARGET_HAVE_MVE" @@ -5944,7 +5944,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - 
(match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQDMLADHXQ_M_S)) ] "TARGET_HAVE_MVE" @@ -5961,7 +5961,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQDMLSDHQ_M_S)) ] "TARGET_HAVE_MVE" @@ -5978,7 +5978,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQDMLSDHXQ_M_S)) ] "TARGET_HAVE_MVE" @@ -5995,7 +5995,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQDMULHQ_M_N_S)) ] "TARGET_HAVE_MVE" @@ -6012,7 +6012,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQDMULHQ_M_S)) ] "TARGET_HAVE_MVE" @@ -6029,7 +6029,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQRDMLADHQ_M_S)) ] "TARGET_HAVE_MVE" @@ -6046,7 +6046,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQRDMLADHXQ_M_S)) ] "TARGET_HAVE_MVE" @@ -6063,7 +6063,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQRDMLSDHQ_M_S)) ] "TARGET_HAVE_MVE" @@ -6080,7 +6080,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQRDMLSDHXQ_M_S)) ] "TARGET_HAVE_MVE" @@ -6097,7 +6097,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQRDMULHQ_M_N_S)) ] "TARGET_HAVE_MVE" @@ -6114,7 +6114,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:MVE_2 2 "s_register_operand" "w") (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQRDMULHQ_M_S)) ] "TARGET_HAVE_MVE" @@ -6131,7 +6131,7 @@ (unspec:DI [(match_operand:DI 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") (match_operand:MVE_5 3 "s_register_operand" "w") - (match_operand:HI 
4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMLALDAVAQ_P)) ] "TARGET_HAVE_MVE" @@ -6148,7 +6148,7 @@ (unspec:DI [(match_operand:DI 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") (match_operand:MVE_5 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMLALDAVAXQ_P)) ] "TARGET_HAVE_MVE" @@ -6165,7 +6165,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") (match_operand:SI 3 "mve_imm_8" "Rb") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQRSHRNBQ_M_N)) ] "TARGET_HAVE_MVE" @@ -6182,7 +6182,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") (match_operand:SI 3 "mve_imm_8" "Rb") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQRSHRNTQ_M_N)) ] "TARGET_HAVE_MVE" @@ -6199,7 +6199,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") (match_operand:SI 3 "" "") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQSHRNBQ_M_N)) ] "TARGET_HAVE_MVE" @@ -6216,7 +6216,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") (match_operand:SI 3 "" "") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQSHRNTQ_M_N)) ] "TARGET_HAVE_MVE" @@ -6250,7 +6250,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") (match_operand:SI 3 "mve_imm_8" "Rb") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VRSHRNBQ_M_N)) ] "TARGET_HAVE_MVE" @@ -6267,7 +6267,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") (match_operand:SI 3 "mve_imm_8" "Rb") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VRSHRNTQ_M_N)) ] "TARGET_HAVE_MVE" @@ -6284,7 +6284,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_3 2 "s_register_operand" "w") (match_operand:SI 3 "immediate_operand" "i") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VSHLLBQ_M_N)) ] "TARGET_HAVE_MVE" @@ -6301,7 +6301,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_3 2 "s_register_operand" "w") (match_operand:SI 3 "immediate_operand" "i") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VSHLLTQ_M_N)) ] "TARGET_HAVE_MVE" @@ -6318,7 +6318,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") (match_operand:SI 3 "" "") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VSHRNBQ_M_N)) ] "TARGET_HAVE_MVE" @@ -6335,7 +6335,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") (match_operand:SI 3 "" "") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VSHRNTQ_M_N)) ] "TARGET_HAVE_MVE" @@ -6352,7 +6352,7 @@ (unspec:DI [(match_operand:DI 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") (match_operand:MVE_5 3 
"s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMLSLDAVAQ_P_S)) ] "TARGET_HAVE_MVE" @@ -6369,7 +6369,7 @@ (unspec:DI [(match_operand:DI 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") (match_operand:MVE_5 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMLSLDAVAXQ_P_S)) ] "TARGET_HAVE_MVE" @@ -6386,7 +6386,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_3 2 "s_register_operand" "w") (match_operand:MVE_3 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMULLBQ_POLY_M_P)) ] "TARGET_HAVE_MVE" @@ -6403,7 +6403,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_3 2 "s_register_operand" "w") (match_operand:MVE_3 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMULLTQ_POLY_M_P)) ] "TARGET_HAVE_MVE" @@ -6420,7 +6420,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQDMULLBQ_M_N_S)) ] "TARGET_HAVE_MVE" @@ -6437,7 +6437,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") (match_operand:MVE_5 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQDMULLBQ_M_S)) ] "TARGET_HAVE_MVE" @@ -6454,7 +6454,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQDMULLTQ_M_N_S)) ] "TARGET_HAVE_MVE" @@ -6471,7 +6471,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") (match_operand:MVE_5 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQDMULLTQ_M_S)) ] "TARGET_HAVE_MVE" @@ -6488,7 +6488,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") (match_operand:SI 3 "mve_imm_8" "Rb") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQRSHRUNBQ_M_N_S)) ] "TARGET_HAVE_MVE" @@ -6505,7 +6505,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") (match_operand:SI 3 "" "") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQRSHRUNTQ_M_N_S)) ] "TARGET_HAVE_MVE" @@ -6522,7 +6522,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") (match_operand:SI 3 "" "") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQSHRUNBQ_M_N_S)) ] "TARGET_HAVE_MVE" @@ -6539,7 +6539,7 @@ (unspec: [(match_operand: 1 "s_register_operand" "0") (match_operand:MVE_5 2 "s_register_operand" "w") (match_operand:SI 3 "" "") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VQSHRUNTQ_M_N_S)) ] "TARGET_HAVE_MVE" @@ -6623,7 +6623,7 
@@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VABDQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6640,7 +6640,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VADDQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6657,7 +6657,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VADDQ_M_N_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6674,7 +6674,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VANDQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6691,7 +6691,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VBICQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6708,7 +6708,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:SI 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VBRSRQ_M_N_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6725,7 +6725,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VCADDQ_ROT270_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6742,7 +6742,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VCADDQ_ROT90_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6759,7 +6759,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VCMLAQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6776,7 +6776,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VCMLAQ_ROT180_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6793,7 +6793,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 
"s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VCMLAQ_ROT270_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6810,7 +6810,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VCMLAQ_ROT90_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6827,7 +6827,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VCMULQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6844,7 +6844,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VCMULQ_ROT180_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6861,7 +6861,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VCMULQ_ROT270_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6878,7 +6878,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VCMULQ_ROT90_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6895,7 +6895,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VEORQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6912,7 +6912,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VFMAQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6929,7 +6929,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VFMAQ_M_N_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6946,7 +6946,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VFMASQ_M_N_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6963,7 +6963,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - 
(match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VFMSQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6980,7 +6980,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMAXNMQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -6997,7 +6997,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMINNMQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -7014,7 +7014,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMULQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -7031,7 +7031,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VMULQ_M_N_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -7048,7 +7048,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VORNQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -7065,7 +7065,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VORRQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -7082,7 +7082,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VSUBQ_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -7099,7 +7099,7 @@ (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") (match_operand:MVE_0 2 "s_register_operand" "w") (match_operand: 3 "s_register_operand" "r") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VSUBQ_M_N_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -7248,7 +7248,7 @@ [(match_operand: 0 "mve_scatter_memory") (match_operand:MVE_2 1 "s_register_operand") (match_operand:MVE_2 2 "s_register_operand") - (match_operand:HI 3 "vpr_register_operand" "Up") + (match_operand: 3 "vpr_register_operand" "Up") (unspec:V4SI [(const_int 0)] VSTRBSOQ)] "TARGET_HAVE_MVE" { @@ -7267,7 +7267,7 @@ [(match_operand:SI 0 "register_operand" "r") (match_operand:MVE_2 1 "s_register_operand" "w") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VSTRBSOQ))] "TARGET_HAVE_MVE" "vpst\;vstrbt.\t%q2, [%0, %q1]" @@ -7302,7 
+7302,7 @@ (define_insn "mve_vstrbq_p_" [(set (match_operand: 0 "mve_memory_operand" "=Ux") (unspec: [(match_operand:MVE_2 1 "s_register_operand" "w") - (match_operand:HI 2 "vpr_register_operand" "Up")] + (match_operand: 2 "vpr_register_operand" "Up")] VSTRBQ)) ] "TARGET_HAVE_MVE" @@ -7323,7 +7323,7 @@ [(set (match_operand:MVE_2 0 "s_register_operand" "=&w") (unspec:MVE_2 [(match_operand: 1 "memory_operand" "Us") (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VLDRBGOQ)) ] "TARGET_HAVE_MVE" @@ -7347,7 +7347,7 @@ (define_insn "mve_vldrbq_z_" [(set (match_operand:MVE_2 0 "s_register_operand" "=w") (unspec:MVE_2 [(match_operand: 1 "mve_memory_operand" "Ux") - (match_operand:HI 2 "vpr_register_operand" "Up")] + (match_operand: 2 "vpr_register_operand" "Up")] VLDRBQ)) ] "TARGET_HAVE_MVE" @@ -7434,7 +7434,7 @@ [(set (match_operand:MVE_6 0 "s_register_operand" "=&w") (unspec:MVE_6 [(match_operand: 1 "memory_operand" "Us") (match_operand:MVE_6 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up") + (match_operand: 3 "vpr_register_operand" "Up") ]VLDRHGOQ)) ] "TARGET_HAVE_MVE" @@ -7482,7 +7482,7 @@ [(set (match_operand:MVE_6 0 "s_register_operand" "=&w") (unspec:MVE_6 [(match_operand: 1 "memory_operand" "Us") (match_operand:MVE_6 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up") + (match_operand: 3 "vpr_register_operand" "Up") ]VLDRHGSOQ)) ] "TARGET_HAVE_MVE" @@ -7548,7 +7548,7 @@ (define_insn "mve_vldrhq_z_" [(set (match_operand:MVE_6 0 "s_register_operand" "=w") (unspec:MVE_6 [(match_operand: 1 "mve_memory_operand" "Ux") - (match_operand:HI 2 "vpr_register_operand" "Up")] + (match_operand: 2 "vpr_register_operand" "Up")] VLDRHQ)) ] "TARGET_HAVE_MVE" @@ -8124,7 +8124,7 @@ (define_insn "mve_vstrhq_p_" [(set (match_operand: 0 "mve_memory_operand" "=Ux") (unspec: [(match_operand:MVE_6 1 "s_register_operand" "w") - (match_operand:HI 2 "vpr_register_operand" "Up")] + (match_operand: 2 "vpr_register_operand" "Up")] VSTRHQ)) ] "TARGET_HAVE_MVE" @@ -8145,7 +8145,7 @@ [(match_operand: 0 "mve_scatter_memory") (match_operand:MVE_6 1 "s_register_operand") (match_operand:MVE_6 2 "s_register_operand") - (match_operand:HI 3 "vpr_register_operand") + (match_operand: 3 "vpr_register_operand") (unspec:V4SI [(const_int 0)] VSTRHSOQ)] "TARGET_HAVE_MVE" { @@ -8164,7 +8164,7 @@ [(match_operand:SI 0 "register_operand" "r") (match_operand:MVE_6 1 "s_register_operand" "w") (match_operand:MVE_6 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VSTRHSOQ))] "TARGET_HAVE_MVE" "vpst\;vstrht.\t%q2, [%0, %q1]" @@ -8205,7 +8205,7 @@ [(match_operand: 0 "mve_scatter_memory") (match_operand:MVE_6 1 "s_register_operand") (match_operand:MVE_6 2 "s_register_operand") - (match_operand:HI 3 "vpr_register_operand") + (match_operand: 3 "vpr_register_operand") (unspec:V4SI [(const_int 0)] VSTRHSSOQ)] "TARGET_HAVE_MVE" { @@ -8224,7 +8224,7 @@ [(match_operand:SI 0 "register_operand" "r") (match_operand:MVE_6 1 "s_register_operand" "w") (match_operand:MVE_6 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VSTRHSSOQ))] "TARGET_HAVE_MVE" "vpst\;vstrht.\t%q2, [%0, %q1, uxtw #1]" @@ -9011,7 +9011,7 @@ (match_operand:MVE_2 1 "s_register_operand") (match_operand:SI 2 "s_register_operand") (match_operand:SI 3 "mve_imm_selective_upto_8") - 
(match_operand:HI 4 "vpr_register_operand")] + (match_operand: 4 "vpr_register_operand")] "TARGET_HAVE_MVE" { rtx temp = gen_reg_rtx (SImode); @@ -9031,7 +9031,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:SI 3 "s_register_operand" "2") (match_operand:SI 4 "mve_imm_selective_upto_8" "Rg") - (match_operand:HI 5 "vpr_register_operand" "Up")] + (match_operand: 5 "vpr_register_operand" "Up")] VIDUPQ_M)) (set (match_operand:SI 2 "s_register_operand" "=Te") (plus:SI (match_dup 3) @@ -9079,7 +9079,7 @@ (match_operand:MVE_2 1 "s_register_operand") (match_operand:SI 2 "s_register_operand") (match_operand:SI 3 "mve_imm_selective_upto_8") - (match_operand:HI 4 "vpr_register_operand")] + (match_operand: 4 "vpr_register_operand")] "TARGET_HAVE_MVE" { rtx temp = gen_reg_rtx (SImode); @@ -9099,7 +9099,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:SI 3 "s_register_operand" "2") (match_operand:SI 4 "mve_imm_selective_upto_8" "Rg") - (match_operand:HI 5 "vpr_register_operand" "Up")] + (match_operand: 5 "vpr_register_operand" "Up")] VDDUPQ_M)) (set (match_operand:SI 2 "s_register_operand" "=Te") (minus:SI (match_dup 3) @@ -9170,7 +9170,7 @@ (match_operand:SI 2 "s_register_operand") (match_operand:DI 3 "s_register_operand") (match_operand:SI 4 "mve_imm_selective_upto_8") - (match_operand:HI 5 "vpr_register_operand")] + (match_operand: 5 "vpr_register_operand")] "TARGET_HAVE_MVE" { rtx ignore_wb = gen_reg_rtx (SImode); @@ -9190,7 +9190,7 @@ (match_operand:SI 2 "s_register_operand") (match_operand:DI 3 "s_register_operand") (match_operand:SI 4 "mve_imm_selective_upto_8") - (match_operand:HI 5 "vpr_register_operand")] + (match_operand: 5 "vpr_register_operand")] "TARGET_HAVE_MVE" { rtx ignore_vec = gen_reg_rtx (mode); @@ -9210,7 +9210,7 @@ (match_operand:SI 3 "s_register_operand" "1") (subreg:SI (match_operand:DI 4 "s_register_operand" "r") 4) (match_operand:SI 5 "mve_imm_selective_upto_8" "Rg") - (match_operand:HI 6 "vpr_register_operand" "Up")] + (match_operand: 6 "vpr_register_operand" "Up")] VDWDUPQ_M)) (set (match_operand:SI 1 "s_register_operand" "=Te") (unspec:SI [(match_dup 2) @@ -9287,7 +9287,7 @@ (match_operand:SI 2 "s_register_operand") (match_operand:DI 3 "s_register_operand") (match_operand:SI 4 "mve_imm_selective_upto_8") - (match_operand:HI 5 "vpr_register_operand")] + (match_operand: 5 "vpr_register_operand")] "TARGET_HAVE_MVE" { rtx ignore_wb = gen_reg_rtx (SImode); @@ -9307,7 +9307,7 @@ (match_operand:SI 2 "s_register_operand") (match_operand:DI 3 "s_register_operand") (match_operand:SI 4 "mve_imm_selective_upto_8") - (match_operand:HI 5 "vpr_register_operand")] + (match_operand: 5 "vpr_register_operand")] "TARGET_HAVE_MVE" { rtx ignore_vec = gen_reg_rtx (mode); @@ -9327,7 +9327,7 @@ (match_operand:SI 3 "s_register_operand" "1") (subreg:SI (match_operand:DI 4 "s_register_operand" "r") 4) (match_operand:SI 5 "mve_imm_selective_upto_8" "Rg") - (match_operand:HI 6 "vpr_register_operand" "Up")] + (match_operand: 6 "vpr_register_operand" "Up")] VIWDUPQ_M)) (set (match_operand:SI 1 "s_register_operand" "=Te") (unspec:SI [(match_dup 2) @@ -10335,7 +10335,7 @@ (match_operand:MVE_2 1 "s_register_operand") (match_operand:SI 2 "s_register_operand") (match_operand:SI 3 "mve_imm_32") - (match_operand:HI 4 "vpr_register_operand") + (match_operand: 4 "vpr_register_operand") (unspec:MVE_2 [(const_int 0)] VSHLCQ_M)] "TARGET_HAVE_MVE" { @@ -10351,7 +10351,7 @@ (match_operand:MVE_2 1 "s_register_operand") (match_operand:SI 2 
"s_register_operand") (match_operand:SI 3 "mve_imm_32") - (match_operand:HI 4 "vpr_register_operand") + (match_operand: 4 "vpr_register_operand") (unspec:MVE_2 [(const_int 0)] VSHLCQ_M)] "TARGET_HAVE_MVE" { @@ -10367,7 +10367,7 @@ (unspec:MVE_2 [(match_operand:MVE_2 2 "s_register_operand" "0") (match_operand:SI 3 "s_register_operand" "1") (match_operand:SI 4 "mve_imm_32" "Rf") - (match_operand:HI 5 "vpr_register_operand" "Up")] + (match_operand: 5 "vpr_register_operand" "Up")] VSHLCQ_M)) (set (match_operand:SI 1 "s_register_operand" "=r") (unspec:SI [(match_dup 2) -- cgit v1.1 From 6a7c13a0cf2290b60ab36f9ce1027b92838586bd Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Wed, 20 Oct 2021 15:39:17 +0000 Subject: arm: Convert more load/store MVE builtins to predicate qualifiers This patch covers a few builtins where we do not use the iterator and thus we cannot use . For v2di instructions, we keep the HI mode for predicates. Most of the work of this patch series was carried out while I was working at STMicroelectronics as a Linaro assignee. 2022-02-22 Christophe Lyon gcc/ PR target/100757 PR target/101325 * config/arm/arm-builtins.cc (STRSBS_P_QUALIFIERS): Use predicate qualifier. (STRSBU_P_QUALIFIERS): Likewise. (LDRGBS_Z_QUALIFIERS): Likewise. (LDRGBU_Z_QUALIFIERS): Likewise. (LDRGBWBXU_Z_QUALIFIERS): Likewise. (LDRGBWBS_Z_QUALIFIERS): Likewise. (LDRGBWBU_Z_QUALIFIERS): Likewise. (STRSBWBS_P_QUALIFIERS): Likewise. (STRSBWBU_P_QUALIFIERS): Likewise. * config/arm/mve.md: Use VxBI instead of HI. --- gcc/config/arm/arm-builtins.cc | 18 +++++------ gcc/config/arm/mve.md | 68 +++++++++++++++++++++--------------------- 2 files changed, 43 insertions(+), 43 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm-builtins.cc b/gcc/config/arm/arm-builtins.cc index a9536b2..5d582f1 100644 --- a/gcc/config/arm/arm-builtins.cc +++ b/gcc/config/arm/arm-builtins.cc @@ -689,13 +689,13 @@ arm_strss_p_qualifiers[SIMD_MAX_BUILTIN_ARGS] static enum arm_type_qualifiers arm_strsbs_p_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_void, qualifier_unsigned, qualifier_immediate, - qualifier_none, qualifier_unsigned}; + qualifier_none, qualifier_predicate}; #define STRSBS_P_QUALIFIERS (arm_strsbs_p_qualifiers) static enum arm_type_qualifiers arm_strsbu_p_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_void, qualifier_unsigned, qualifier_immediate, - qualifier_unsigned, qualifier_unsigned}; + qualifier_unsigned, qualifier_predicate}; #define STRSBU_P_QUALIFIERS (arm_strsbu_p_qualifiers) static enum arm_type_qualifiers @@ -731,13 +731,13 @@ arm_ldrgbu_qualifiers[SIMD_MAX_BUILTIN_ARGS] static enum arm_type_qualifiers arm_ldrgbs_z_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_none, qualifier_unsigned, qualifier_immediate, - qualifier_unsigned}; + qualifier_predicate}; #define LDRGBS_Z_QUALIFIERS (arm_ldrgbs_z_qualifiers) static enum arm_type_qualifiers arm_ldrgbu_z_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_unsigned, qualifier_unsigned, qualifier_immediate, - qualifier_unsigned}; + qualifier_predicate}; #define LDRGBU_Z_QUALIFIERS (arm_ldrgbu_z_qualifiers) static enum arm_type_qualifiers @@ -777,7 +777,7 @@ arm_ldrgbwbxu_qualifiers[SIMD_MAX_BUILTIN_ARGS] static enum arm_type_qualifiers arm_ldrgbwbxu_z_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_unsigned, qualifier_unsigned, qualifier_immediate, - qualifier_unsigned}; + qualifier_predicate}; #define LDRGBWBXU_Z_QUALIFIERS (arm_ldrgbwbxu_z_qualifiers) static enum arm_type_qualifiers @@ -793,13 +793,13 @@ 
From 6a7c13a0cf2290b60ab36f9ce1027b92838586bd Mon Sep 17 00:00:00 2001
From: Christophe Lyon
Date: Wed, 20 Oct 2021 15:39:17 +0000
Subject: arm: Convert more load/store MVE builtins to predicate qualifiers

This patch covers a few builtins where we do not use the <mode> iterator
and thus we cannot use <MVE_VPRED>.  For v2di instructions, we keep the
HI mode for predicates.

Most of the work of this patch series was carried out while I was
working at STMicroelectronics as a Linaro assignee.

2022-02-22  Christophe Lyon

gcc/
	PR target/100757
	PR target/101325
	* config/arm/arm-builtins.cc (STRSBS_P_QUALIFIERS): Use predicate
	qualifier.
	(STRSBU_P_QUALIFIERS): Likewise.
	(LDRGBS_Z_QUALIFIERS): Likewise.
	(LDRGBU_Z_QUALIFIERS): Likewise.
	(LDRGBWBXU_Z_QUALIFIERS): Likewise.
	(LDRGBWBS_Z_QUALIFIERS): Likewise.
	(LDRGBWBU_Z_QUALIFIERS): Likewise.
	(STRSBWBS_P_QUALIFIERS): Likewise.
	(STRSBWBU_P_QUALIFIERS): Likewise.
	* config/arm/mve.md: Use VxBI instead of HI.
---
 gcc/config/arm/arm-builtins.cc | 18 +++++------
 gcc/config/arm/mve.md          | 68 +++++++++++++++++++++---------------
 2 files changed, 43 insertions(+), 43 deletions(-)

(limited to 'gcc/config')

diff --git a/gcc/config/arm/arm-builtins.cc b/gcc/config/arm/arm-builtins.cc
index a9536b2..5d582f1 100644
--- a/gcc/config/arm/arm-builtins.cc
+++ b/gcc/config/arm/arm-builtins.cc
@@ -689,13 +689,13 @@ arm_strss_p_qualifiers[SIMD_MAX_BUILTIN_ARGS]
 static enum arm_type_qualifiers
 arm_strsbs_p_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_void, qualifier_unsigned, qualifier_immediate,
-      qualifier_none, qualifier_unsigned};
+      qualifier_none, qualifier_predicate};
 #define STRSBS_P_QUALIFIERS (arm_strsbs_p_qualifiers)
 
 static enum arm_type_qualifiers
 arm_strsbu_p_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_void, qualifier_unsigned, qualifier_immediate,
-      qualifier_unsigned, qualifier_unsigned};
+      qualifier_unsigned, qualifier_predicate};
 #define STRSBU_P_QUALIFIERS (arm_strsbu_p_qualifiers)
 
 static enum arm_type_qualifiers
@@ -731,13 +731,13 @@ arm_ldrgbu_qualifiers[SIMD_MAX_BUILTIN_ARGS]
 static enum arm_type_qualifiers
 arm_ldrgbs_z_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_none, qualifier_unsigned, qualifier_immediate,
-      qualifier_unsigned};
+      qualifier_predicate};
 #define LDRGBS_Z_QUALIFIERS (arm_ldrgbs_z_qualifiers)
 
 static enum arm_type_qualifiers
 arm_ldrgbu_z_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_unsigned, qualifier_unsigned, qualifier_immediate,
-      qualifier_unsigned};
+      qualifier_predicate};
 #define LDRGBU_Z_QUALIFIERS (arm_ldrgbu_z_qualifiers)
 
 static enum arm_type_qualifiers
@@ -777,7 +777,7 @@ arm_ldrgbwbxu_qualifiers[SIMD_MAX_BUILTIN_ARGS]
 static enum arm_type_qualifiers
 arm_ldrgbwbxu_z_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_unsigned, qualifier_unsigned, qualifier_immediate,
-      qualifier_unsigned};
+      qualifier_predicate};
 #define LDRGBWBXU_Z_QUALIFIERS (arm_ldrgbwbxu_z_qualifiers)
 
 static enum arm_type_qualifiers
@@ -793,13 +793,13 @@ arm_ldrgbwbu_qualifiers[SIMD_MAX_BUILTIN_ARGS]
 static enum arm_type_qualifiers
 arm_ldrgbwbs_z_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_none, qualifier_unsigned, qualifier_immediate,
-      qualifier_unsigned};
+      qualifier_predicate};
 #define LDRGBWBS_Z_QUALIFIERS (arm_ldrgbwbs_z_qualifiers)
 
 static enum arm_type_qualifiers
 arm_ldrgbwbu_z_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_unsigned, qualifier_unsigned, qualifier_immediate,
-      qualifier_unsigned};
+      qualifier_predicate};
 #define LDRGBWBU_Z_QUALIFIERS (arm_ldrgbwbu_z_qualifiers)
 
 static enum arm_type_qualifiers
@@ -815,13 +815,13 @@ arm_strsbwbu_qualifiers[SIMD_MAX_BUILTIN_ARGS]
 static enum arm_type_qualifiers
 arm_strsbwbs_p_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_unsigned, qualifier_unsigned, qualifier_const,
-      qualifier_none, qualifier_unsigned};
+      qualifier_none, qualifier_predicate};
 #define STRSBWBS_P_QUALIFIERS (arm_strsbwbs_p_qualifiers)
 
 static enum arm_type_qualifiers
 arm_strsbwbu_p_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_unsigned, qualifier_unsigned, qualifier_const,
-      qualifier_unsigned, qualifier_unsigned};
+      qualifier_unsigned, qualifier_predicate};
 #define STRSBWBU_P_QUALIFIERS (arm_strsbwbu_p_qualifiers)
 
 static enum arm_type_qualifiers
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 5d51da1..e291c67 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
   [... 34 hunks between lines 7282 and 9694: in each fixed-width pattern the
    predicate operand (match_operand:HI n "vpr_register_operand" ...) becomes
    (match_operand:V4BI ...) for the V4SI/V4SF insns and (match_operand:V8BI ...)
    for the V8HI/V8HF insns, covering the VSTRWSBQ, VLDRWGBQ, VLDRWQ_F, VLDRWQ,
    VLDRHQGO_F, VLDRHQGSO_F, VLDRWQGB_F, VLDRWQGO_F, VLDRWGOQ, VLDRWQGSO_F,
    VLDRWGSOQ, VSTRHQ_F, VSTRWQ, VSTRHQSO_F, VSTRHQSSO_F, VSTRWQSB_F, VSTRWQSO_F,
    VSTRWSOQ, VSTRWQSSO_F, VSTRWSSOQ, VSTRWSBWBQ, VSTRWQSBWB_F, VLDRWGBWBQ and
    VLDRWQGBWB_F patterns ...]

-- 
cgit v1.1
(match_operand:V4SF 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand")] + (match_operand:V4BI 4 "vpr_register_operand")] VSTRWQSBWB_F)) (set (match_operand:V4SI 0 "s_register_operand" "=w") (unspec:V4SI [(match_dup 1) (match_dup 2)] @@ -9551,7 +9551,7 @@ [(match_operand:V4SI 0 "s_register_operand") (match_operand:V4SI 1 "s_register_operand") (match_operand:SI 2 "mve_vldrd_immediate") - (match_operand:HI 3 "vpr_register_operand") + (match_operand:V4BI 3 "vpr_register_operand") (unspec:V4SI [(const_int 0)] VLDRWGBWBQ)] "TARGET_HAVE_MVE" { @@ -9566,7 +9566,7 @@ [(match_operand:V4SI 0 "s_register_operand") (match_operand:V4SI 1 "s_register_operand") (match_operand:SI 2 "mve_vldrd_immediate") - (match_operand:HI 3 "vpr_register_operand") + (match_operand:V4BI 3 "vpr_register_operand") (unspec:V4SI [(const_int 0)] VLDRWGBWBQ)] "TARGET_HAVE_MVE" { @@ -9585,7 +9585,7 @@ [(set (match_operand:V4SI 0 "s_register_operand" "=&w") (unspec:V4SI [(match_operand:V4SI 2 "s_register_operand" "1") (match_operand:SI 3 "mve_vldrd_immediate" "Ri") - (match_operand:HI 4 "vpr_register_operand" "Up") + (match_operand:V4BI 4 "vpr_register_operand" "Up") (mem:BLK (scratch))] VLDRWGBWBQ)) (set (match_operand:V4SI 1 "s_register_operand" "=&w") @@ -9659,7 +9659,7 @@ [(match_operand:V4SI 0 "s_register_operand") (match_operand:V4SI 1 "s_register_operand") (match_operand:SI 2 "mve_vldrd_immediate") - (match_operand:HI 3 "vpr_register_operand") + (match_operand:V4BI 3 "vpr_register_operand") (unspec:V4SI [(const_int 0)] VLDRWQGBWB_F)] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" { @@ -9675,7 +9675,7 @@ [(match_operand:V4SF 0 "s_register_operand") (match_operand:V4SI 1 "s_register_operand") (match_operand:SI 2 "mve_vldrd_immediate") - (match_operand:HI 3 "vpr_register_operand") + (match_operand:V4BI 3 "vpr_register_operand") (unspec:V4SI [(const_int 0)] VLDRWQGBWB_F)] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" { @@ -9694,7 +9694,7 @@ [(set (match_operand:V4SF 0 "s_register_operand" "=&w") (unspec:V4SF [(match_operand:V4SI 2 "s_register_operand" "1") (match_operand:SI 3 "mve_vldrd_immediate" "Ri") - (match_operand:HI 4 "vpr_register_operand" "Up") + (match_operand:V4BI 4 "vpr_register_operand" "Up") (mem:BLK (scratch))] VLDRWQGBWB_F)) (set (match_operand:V4SI 1 "s_register_operand" "=&w") -- cgit v1.1 From c6b4ea7ab1aa6c5c07798fa6c6ad15dd1761b5ed Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Wed, 13 Oct 2021 09:16:49 +0000 Subject: arm: Convert more MVE/CDE builtins to predicate qualifiers This patch covers a few non-load/store builtins where we do not use the iterator and thus we cannot use . Most of the work of this patch series was carried out while I was working at STMicroelectronics as a Linaro assignee. 2022-02-22 Christophe Lyon gcc/ PR target/100757 PR target/101325 * config/arm/arm-builtins.cc (CX_UNARY_UNONE_QUALIFIERS): Use predicate. (CX_BINARY_UNONE_QUALIFIERS): Likewise. (CX_TERNARY_UNONE_QUALIFIERS): Likewise. (TERNOP_NONE_NONE_NONE_UNONE_QUALIFIERS): Delete. (QUADOP_NONE_NONE_NONE_NONE_UNONE_QUALIFIERS): Delete. (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE_QUALIFIERS): Delete. * config/arm/arm_mve_builtins.def: Use predicated qualifiers. * config/arm/mve.md: Use VxBI instead of HI. 
--- gcc/config/arm/arm-builtins.cc | 26 ++--------------- gcc/config/arm/arm_mve_builtins.def | 58 ++++++++++++++++++------------------- gcc/config/arm/mve.md | 52 ++++++++++++++++----------------- 3 files changed, 58 insertions(+), 78 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm-builtins.cc b/gcc/config/arm/arm-builtins.cc index 5d582f1..a7acc1d 100644 --- a/gcc/config/arm/arm-builtins.cc +++ b/gcc/config/arm/arm-builtins.cc @@ -295,7 +295,7 @@ static enum arm_type_qualifiers arm_cx_unary_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_none, qualifier_immediate, qualifier_none, qualifier_unsigned_immediate, - qualifier_unsigned }; + qualifier_predicate }; #define CX_UNARY_UNONE_QUALIFIERS (arm_cx_unary_unone_qualifiers) /* T (immediate, T, T, unsigned immediate). */ @@ -304,7 +304,7 @@ arm_cx_binary_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_none, qualifier_immediate, qualifier_none, qualifier_none, qualifier_unsigned_immediate, - qualifier_unsigned }; + qualifier_predicate }; #define CX_BINARY_UNONE_QUALIFIERS (arm_cx_binary_unone_qualifiers) /* T (immediate, T, T, T, unsigned immediate). */ @@ -313,7 +313,7 @@ arm_cx_ternary_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_none, qualifier_immediate, qualifier_none, qualifier_none, qualifier_none, qualifier_unsigned_immediate, - qualifier_unsigned }; + qualifier_predicate }; #define CX_TERNARY_UNONE_QUALIFIERS (arm_cx_ternary_unone_qualifiers) /* The first argument (return type) of a store should be void type, @@ -510,12 +510,6 @@ arm_ternop_none_none_none_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS] (arm_ternop_none_none_none_imm_qualifiers) static enum arm_type_qualifiers -arm_ternop_none_none_none_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] - = { qualifier_none, qualifier_none, qualifier_none, qualifier_unsigned }; -#define TERNOP_NONE_NONE_NONE_UNONE_QUALIFIERS \ - (arm_ternop_none_none_none_unone_qualifiers) - -static enum arm_type_qualifiers arm_ternop_none_none_none_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_none, qualifier_none, qualifier_none, qualifier_predicate }; #define TERNOP_NONE_NONE_NONE_PRED_QUALIFIERS \ @@ -568,13 +562,6 @@ arm_quadop_unone_unone_none_none_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] (arm_quadop_unone_unone_none_none_pred_qualifiers) static enum arm_type_qualifiers -arm_quadop_none_none_none_none_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] - = { qualifier_none, qualifier_none, qualifier_none, qualifier_none, - qualifier_unsigned }; -#define QUADOP_NONE_NONE_NONE_NONE_UNONE_QUALIFIERS \ - (arm_quadop_none_none_none_none_unone_qualifiers) - -static enum arm_type_qualifiers arm_quadop_none_none_none_none_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_none, qualifier_none, qualifier_none, qualifier_none, qualifier_predicate }; @@ -589,13 +576,6 @@ arm_quadop_none_none_none_imm_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] (arm_quadop_none_none_none_imm_pred_qualifiers) static enum arm_type_qualifiers -arm_quadop_unone_unone_unone_unone_unone_qualifiers[SIMD_MAX_BUILTIN_ARGS] - = { qualifier_unsigned, qualifier_unsigned, qualifier_unsigned, - qualifier_unsigned, qualifier_unsigned }; -#define QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE_QUALIFIERS \ - (arm_quadop_unone_unone_unone_unone_unone_qualifiers) - -static enum arm_type_qualifiers arm_quadop_unone_unone_unone_unone_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_unsigned, qualifier_unsigned, qualifier_unsigned, qualifier_unsigned, qualifier_predicate }; diff --git a/gcc/config/arm/arm_mve_builtins.def 
b/gcc/config/arm/arm_mve_builtins.def index 7db6d47..1c8ee34 100644 --- a/gcc/config/arm/arm_mve_builtins.def +++ b/gcc/config/arm/arm_mve_builtins.def @@ -87,8 +87,8 @@ VAR4 (BINOP_UNONE_UNONE_UNONE, vcreateq_u, v16qi, v8hi, v4si, v2di) VAR4 (BINOP_NONE_UNONE_UNONE, vcreateq_s, v16qi, v8hi, v4si, v2di) VAR3 (BINOP_UNONE_UNONE_IMM, vshrq_n_u, v16qi, v8hi, v4si) VAR3 (BINOP_NONE_NONE_IMM, vshrq_n_s, v16qi, v8hi, v4si) -VAR1 (BINOP_NONE_NONE_UNONE, vaddlvq_p_s, v4si) -VAR1 (BINOP_UNONE_UNONE_UNONE, vaddlvq_p_u, v4si) +VAR1 (BINOP_NONE_NONE_PRED, vaddlvq_p_s, v4si) +VAR1 (BINOP_UNONE_UNONE_PRED, vaddlvq_p_u, v4si) VAR3 (BINOP_PRED_NONE_NONE, vcmpneq_, v16qi, v8hi, v4si) VAR3 (BINOP_NONE_NONE_NONE, vshlq_s, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_NONE, vshlq_u, v16qi, v8hi, v4si) @@ -465,20 +465,20 @@ VAR2 (TERNOP_NONE_NONE_NONE_IMM, vqshrnbq_n_s, v8hi, v4si) VAR2 (TERNOP_NONE_NONE_NONE_IMM, vqrshrntq_n_s, v8hi, v4si) VAR2 (TERNOP_NONE_NONE_IMM_PRED, vorrq_m_n_s, v8hi, v4si) VAR2 (TERNOP_NONE_NONE_IMM_PRED, vmvnq_m_n_s, v8hi, v4si) -VAR1 (TERNOP_UNONE_UNONE_UNONE_UNONE, vrmlaldavhq_p_u, v4si) -VAR1 (TERNOP_UNONE_UNONE_UNONE_UNONE, vrev16q_m_u, v16qi) -VAR1 (TERNOP_UNONE_UNONE_UNONE_UNONE, vaddlvaq_p_u, v4si) -VAR1 (TERNOP_NONE_NONE_NONE_UNONE, vrmlsldavhxq_p_s, v4si) -VAR1 (TERNOP_NONE_NONE_NONE_UNONE, vrmlsldavhq_p_s, v4si) -VAR1 (TERNOP_NONE_NONE_NONE_UNONE, vrmlaldavhxq_p_s, v4si) -VAR1 (TERNOP_NONE_NONE_NONE_UNONE, vrmlaldavhq_p_s, v4si) -VAR1 (TERNOP_NONE_NONE_NONE_UNONE, vrev32q_m_f, v8hf) -VAR1 (TERNOP_NONE_NONE_NONE_UNONE, vrev16q_m_s, v16qi) -VAR1 (TERNOP_NONE_NONE_NONE_UNONE, vcvttq_m_f32_f16, v4sf) -VAR1 (TERNOP_NONE_NONE_NONE_UNONE, vcvttq_m_f16_f32, v8hf) -VAR1 (TERNOP_NONE_NONE_NONE_UNONE, vcvtbq_m_f32_f16, v4sf) -VAR1 (TERNOP_NONE_NONE_NONE_UNONE, vcvtbq_m_f16_f32, v8hf) -VAR1 (TERNOP_NONE_NONE_NONE_UNONE, vaddlvaq_p_s, v4si) +VAR1 (TERNOP_UNONE_UNONE_UNONE_PRED, vrmlaldavhq_p_u, v4si) +VAR1 (TERNOP_UNONE_UNONE_UNONE_PRED, vrev16q_m_u, v16qi) +VAR1 (TERNOP_UNONE_UNONE_UNONE_PRED, vaddlvaq_p_u, v4si) +VAR1 (TERNOP_NONE_NONE_NONE_PRED, vrmlsldavhxq_p_s, v4si) +VAR1 (TERNOP_NONE_NONE_NONE_PRED, vrmlsldavhq_p_s, v4si) +VAR1 (TERNOP_NONE_NONE_NONE_PRED, vrmlaldavhxq_p_s, v4si) +VAR1 (TERNOP_NONE_NONE_NONE_PRED, vrmlaldavhq_p_s, v4si) +VAR1 (TERNOP_NONE_NONE_NONE_PRED, vrev32q_m_f, v8hf) +VAR1 (TERNOP_NONE_NONE_NONE_PRED, vrev16q_m_s, v16qi) +VAR1 (TERNOP_NONE_NONE_NONE_PRED, vcvttq_m_f32_f16, v4sf) +VAR1 (TERNOP_NONE_NONE_NONE_PRED, vcvttq_m_f16_f32, v8hf) +VAR1 (TERNOP_NONE_NONE_NONE_PRED, vcvtbq_m_f32_f16, v4sf) +VAR1 (TERNOP_NONE_NONE_NONE_PRED, vcvtbq_m_f16_f32, v8hf) +VAR1 (TERNOP_NONE_NONE_NONE_PRED, vaddlvaq_p_s, v4si) VAR1 (TERNOP_NONE_NONE_NONE_NONE, vrmlsldavhaxq_s, v4si) VAR1 (TERNOP_NONE_NONE_NONE_NONE, vrmlsldavhaq_s, v4si) VAR1 (TERNOP_NONE_NONE_NONE_NONE, vrmlaldavhaxq_s, v4si) @@ -629,11 +629,11 @@ VAR2 (QUADOP_NONE_NONE_NONE_IMM_PRED, vqshrntq_m_n_s, v8hi, v4si) VAR2 (QUADOP_NONE_NONE_NONE_IMM_PRED, vqshrnbq_m_n_s, v8hi, v4si) VAR2 (QUADOP_NONE_NONE_NONE_IMM_PRED, vqrshrntq_m_n_s, v8hi, v4si) VAR2 (QUADOP_NONE_NONE_NONE_IMM_PRED, vqrshrnbq_m_n_s, v8hi, v4si) -VAR1 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vrmlaldavhaq_p_u, v4si) -VAR1 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vrmlsldavhaxq_p_s, v4si) -VAR1 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vrmlsldavhaq_p_s, v4si) -VAR1 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vrmlaldavhaxq_p_s, v4si) -VAR1 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vrmlaldavhaq_p_s, v4si) +VAR1 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vrmlaldavhaq_p_u, v4si) +VAR1 
(QUADOP_NONE_NONE_NONE_NONE_PRED, vrmlsldavhaxq_p_s, v4si) +VAR1 (QUADOP_NONE_NONE_NONE_NONE_PRED, vrmlsldavhaq_p_s, v4si) +VAR1 (QUADOP_NONE_NONE_NONE_NONE_PRED, vrmlaldavhaxq_p_s, v4si) +VAR1 (QUADOP_NONE_NONE_NONE_NONE_PRED, vrmlaldavhaq_p_s, v4si) VAR2 (QUADOP_UNONE_UNONE_NONE_IMM_PRED, vcvtq_m_n_from_f_u, v8hi, v4si) VAR2 (QUADOP_NONE_NONE_NONE_IMM_PRED, vcvtq_m_n_from_f_s, v8hi, v4si) VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vbrsrq_m_n_f, v8hf, v4sf) @@ -845,14 +845,14 @@ VAR1 (BINOP_NONE_NONE_NONE, vsbciq_s, v4si) VAR1 (BINOP_UNONE_UNONE_UNONE, vsbciq_u, v4si) VAR1 (BINOP_NONE_NONE_NONE, vsbcq_s, v4si) VAR1 (BINOP_UNONE_UNONE_UNONE, vsbcq_u, v4si) -VAR1 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vadciq_m_s, v4si) -VAR1 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vadciq_m_u, v4si) -VAR1 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vadcq_m_s, v4si) -VAR1 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vadcq_m_u, v4si) -VAR1 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vsbciq_m_s, v4si) -VAR1 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vsbciq_m_u, v4si) -VAR1 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vsbcq_m_s, v4si) -VAR1 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vsbcq_m_u, v4si) +VAR1 (QUADOP_NONE_NONE_NONE_NONE_PRED, vadciq_m_s, v4si) +VAR1 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vadciq_m_u, v4si) +VAR1 (QUADOP_NONE_NONE_NONE_NONE_PRED, vadcq_m_s, v4si) +VAR1 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vadcq_m_u, v4si) +VAR1 (QUADOP_NONE_NONE_NONE_NONE_PRED, vsbciq_m_s, v4si) +VAR1 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vsbciq_m_u, v4si) +VAR1 (QUADOP_NONE_NONE_NONE_NONE_PRED, vsbcq_m_s, v4si) +VAR1 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vsbcq_m_u, v4si) VAR5 (STORE1, vst2q, v16qi, v8hi, v4si, v8hf, v4sf) VAR5 (LOAD1, vld4q, v16qi, v8hi, v4si, v8hf, v4sf) VAR5 (LOAD1, vld2q, v16qi, v8hi, v4si, v8hf, v4sf) diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index e291c67..908bedc 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -826,7 +826,7 @@ [ (set (match_operand:DI 0 "s_register_operand" "=r") (unspec:DI [(match_operand:V4SI 1 "s_register_operand" "w") - (match_operand:HI 2 "vpr_register_operand" "Up")] + (match_operand:V4BI 2 "vpr_register_operand" "Up")] VADDLVQ_P)) ] "TARGET_HAVE_MVE" @@ -3739,7 +3739,7 @@ (set (match_operand:DI 0 "s_register_operand" "=r") (unspec:DI [(match_operand:DI 1 "s_register_operand" "0") (match_operand:V4SI 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand:V4BI 3 "vpr_register_operand" "Up")] VADDLVAQ_P)) ] "TARGET_HAVE_MVE" @@ -3949,7 +3949,7 @@ (set (match_operand:V8HF 0 "s_register_operand" "=w") (unspec:V8HF [(match_operand:V8HF 1 "s_register_operand" "0") (match_operand:V4SF 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCVTBQ_M_F16_F32)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -3965,7 +3965,7 @@ (set (match_operand:V4SF 0 "s_register_operand" "=w") (unspec:V4SF [(match_operand:V4SF 1 "s_register_operand" "0") (match_operand:V8HF 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCVTBQ_M_F32_F16)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -3981,7 +3981,7 @@ (set (match_operand:V8HF 0 "s_register_operand" "=w") (unspec:V8HF [(match_operand:V8HF 1 "s_register_operand" "0") (match_operand:V4SF 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCVTTQ_M_F16_F32)) ] "TARGET_HAVE_MVE && 
TARGET_HAVE_MVE_FLOAT" @@ -3997,7 +3997,7 @@ (set (match_operand:V4SF 0 "s_register_operand" "=w") (unspec:V4SF [(match_operand:V4SF 1 "s_register_operand" "0") (match_operand:V8HF 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VCVTTQ_M_F32_F16)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -4595,7 +4595,7 @@ (set (match_operand:V8HF 0 "s_register_operand" "=w") (unspec:V8HF [(match_operand:V8HF 1 "s_register_operand" "0") (match_operand:V8HF 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VREV32Q_M_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -4659,7 +4659,7 @@ (set (match_operand:DI 0 "s_register_operand" "=r") (unspec:DI [(match_operand:V4SI 1 "s_register_operand" "w") (match_operand:V4SI 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VRMLALDAVHXQ_P_S)) ] "TARGET_HAVE_MVE" @@ -4691,7 +4691,7 @@ (set (match_operand:DI 0 "s_register_operand" "=r") (unspec:DI [(match_operand:V4SI 1 "s_register_operand" "w") (match_operand:V4SI 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VRMLSLDAVHQ_P_S)) ] "TARGET_HAVE_MVE" @@ -4707,7 +4707,7 @@ (set (match_operand:DI 0 "s_register_operand" "=r") (unspec:DI [(match_operand:V4SI 1 "s_register_operand" "w") (match_operand:V4SI 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand: 3 "vpr_register_operand" "Up")] VRMLSLDAVHXQ_P_S)) ] "TARGET_HAVE_MVE" @@ -4932,7 +4932,7 @@ (set (match_operand:V16QI 0 "s_register_operand" "=w") (unspec:V16QI [(match_operand:V16QI 1 "s_register_operand" "0") (match_operand:V16QI 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand:V16BI 3 "vpr_register_operand" "Up")] VREV16Q_M)) ] "TARGET_HAVE_MVE" @@ -4964,7 +4964,7 @@ (set (match_operand:DI 0 "s_register_operand" "=r") (unspec:DI [(match_operand:V4SI 1 "s_register_operand" "w") (match_operand:V4SI 2 "s_register_operand" "w") - (match_operand:HI 3 "vpr_register_operand" "Up")] + (match_operand:V4BI 3 "vpr_register_operand" "Up")] VRMLALDAVHQ_P)) ] "TARGET_HAVE_MVE" @@ -6233,7 +6233,7 @@ (unspec:DI [(match_operand:DI 1 "s_register_operand" "0") (match_operand:V4SI 2 "s_register_operand" "w") (match_operand:V4SI 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VRMLALDAVHAQ_P_S)) ] "TARGET_HAVE_MVE" @@ -6556,7 +6556,7 @@ (unspec:DI [(match_operand:DI 1 "s_register_operand" "0") (match_operand:V4SI 2 "s_register_operand" "w") (match_operand:V4SI 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VRMLALDAVHAQ_P_U)) ] "TARGET_HAVE_MVE" @@ -6573,7 +6573,7 @@ (unspec:DI [(match_operand:DI 1 "s_register_operand" "0") (match_operand:V4SI 2 "s_register_operand" "w") (match_operand:V4SI 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VRMLALDAVHAXQ_P_S)) ] "TARGET_HAVE_MVE" @@ -6590,7 +6590,7 @@ (unspec:DI [(match_operand:DI 1 "s_register_operand" "0") (match_operand:V4SI 2 "s_register_operand" "w") (match_operand:V4SI 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 
"vpr_register_operand" "Up")] VRMLSLDAVHAQ_P_S)) ] "TARGET_HAVE_MVE" @@ -6607,7 +6607,7 @@ (unspec:DI [(match_operand:DI 1 "s_register_operand" "0") (match_operand:V4SI 2 "s_register_operand" "w") (match_operand:V4SI 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand: 4 "vpr_register_operand" "Up")] VRMLSLDAVHAXQ_P_S)) ] "TARGET_HAVE_MVE" @@ -7528,7 +7528,7 @@ (define_insn "mve_vldrhq_z_fv8hf" [(set (match_operand:V8HF 0 "s_register_operand" "=w") (unspec:V8HF [(match_operand:V8HI 1 "mve_memory_operand" "Ux") - (match_operand:HI 2 "vpr_register_operand" "Up")] + (match_operand: 2 "vpr_register_operand" "Up")] VLDRHQ_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -8303,7 +8303,7 @@ (define_insn "mve_vstrwq_p_fv4sf" [(set (match_operand:V4SI 0 "mve_memory_operand" "=Ux") (unspec:V4SI [(match_operand:V4SF 1 "s_register_operand" "w") - (match_operand:HI 2 "vpr_register_operand" "Up")] + (match_operand: 2 "vpr_register_operand" "Up")] VSTRWQ_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" @@ -9844,7 +9844,7 @@ (unspec:V4SI [(match_operand:V4SI 1 "s_register_operand" "0") (match_operand:V4SI 2 "s_register_operand" "w") (match_operand:V4SI 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand:V4BI 4 "vpr_register_operand" "Up")] VADCIQ_M)) (set (reg:SI VFPCC_REGNUM) (unspec:SI [(const_int 0)] @@ -9880,7 +9880,7 @@ (unspec:V4SI [(match_operand:V4SI 1 "s_register_operand" "0") (match_operand:V4SI 2 "s_register_operand" "w") (match_operand:V4SI 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand:V4BI 4 "vpr_register_operand" "Up")] VADCQ_M)) (set (reg:SI VFPCC_REGNUM) (unspec:SI [(reg:SI VFPCC_REGNUM)] @@ -9917,7 +9917,7 @@ (unspec:V4SI [(match_operand:V4SI 1 "s_register_operand" "w") (match_operand:V4SI 2 "s_register_operand" "w") (match_operand:V4SI 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand:V4BI 4 "vpr_register_operand" "Up")] VSBCIQ_M)) (set (reg:SI VFPCC_REGNUM) (unspec:SI [(const_int 0)] @@ -9953,7 +9953,7 @@ (unspec:V4SI [(match_operand:V4SI 1 "s_register_operand" "w") (match_operand:V4SI 2 "s_register_operand" "w") (match_operand:V4SI 3 "s_register_operand" "w") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand:V4BI 4 "vpr_register_operand" "Up")] VSBCQ_M)) (set (reg:SI VFPCC_REGNUM) (unspec:SI [(reg:SI VFPCC_REGNUM)] @@ -10457,7 +10457,7 @@ (unspec:V16QI [(match_operand:SI 1 "const_int_coproc_operand" "i") (match_operand:V16QI 2 "register_operand" "0") (match_operand:SI 3 "const_int_mve_cde1_operand" "i") - (match_operand:HI 4 "vpr_register_operand" "Up")] + (match_operand:V16BI 4 "vpr_register_operand" "Up")] CDE_VCX))] "TARGET_CDE && TARGET_HAVE_MVE" "vpst\;vcx1t\\tp%c1, %q0, #%c3" @@ -10471,7 +10471,7 @@ (match_operand:V16QI 2 "register_operand" "0") (match_operand:V16QI 3 "register_operand" "t") (match_operand:SI 4 "const_int_mve_cde2_operand" "i") - (match_operand:HI 5 "vpr_register_operand" "Up")] + (match_operand:V16BI 5 "vpr_register_operand" "Up")] CDE_VCX))] "TARGET_CDE && TARGET_HAVE_MVE" "vpst\;vcx2t\\tp%c1, %q0, %q3, #%c4" @@ -10486,7 +10486,7 @@ (match_operand:V16QI 3 "register_operand" "t") (match_operand:V16QI 4 "register_operand" "t") (match_operand:SI 5 "const_int_mve_cde3_operand" "i") - (match_operand:HI 6 "vpr_register_operand" "Up")] + (match_operand:V16BI 6 "vpr_register_operand" "Up")] CDE_VCX))] "TARGET_CDE && TARGET_HAVE_MVE" "vpst\;vcx3t\\tp%c1, %q0, %q3, 
%q4, #%c5" -- cgit v1.1 From e9f8443a9179c0e9e0d96dfa91c883d6ddb70d3b Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Wed, 13 Oct 2021 09:16:53 +0000 Subject: arm: Add VPR_REG to ALL_REGS VPR_REG should be part of ALL_REGS, this patch fixes this omission. Most of the work of this patch series was carried out while I was working at STMicroelectronics as a Linaro assignee. 2022-02-22 Christophe Lyon gcc/ * config/arm/arm.h (REG_CLASS_CONTENTS): Add VPR_REG to ALL_REGS. --- gcc/config/arm/arm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h index 61c0221..ef7b66f 100644 --- a/gcc/config/arm/arm.h +++ b/gcc/config/arm/arm.h @@ -1347,7 +1347,7 @@ enum reg_class { 0x00000000, 0x00000000, 0x00000000, 0x00000080 }, /* AFP_REG */ \ { 0x00000000, 0x00000000, 0x00000000, 0x00000400 }, /* VPR_REG. */ \ { 0x00005FFF, 0x00000000, 0x00000000, 0x00000400 }, /* GENERAL_AND_VPR_REGS. */ \ - { 0xFFFF7FFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0000000F } /* ALL_REGS. */ \ + { 0xFFFF7FFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0000040F } /* ALL_REGS. */ \ } #define FP_SYSREGS \ -- cgit v1.1 From 537c96588026aec09b9a00d6d0f3670f612428b5 Mon Sep 17 00:00:00 2001 From: Segher Boessenkool Date: Tue, 22 Feb 2022 15:49:09 +0000 Subject: rs6000: Fix GC on rs6000.c decls for atomic handling (PR88134) In PR88134 it is pointed out that we do not have GTY markup for some variables we use for atomic. So, let's add that. 2022-02-22 Segher Boessenkool PR target/88134 * config/rs6000/rs6000.cc (atomic_hold_decl, atomic_clear_decl, atomic_update_decl): Add GTY markup. --- gcc/config/rs6000/rs6000.cc | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index ca9e7b8..a855e8c 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -27699,14 +27699,13 @@ emit_fusion_gpr_load (rtx target, rtx mem) return ""; } - -#ifdef RS6000_GLIBC_ATOMIC_FENV -/* Function declarations for rs6000_atomic_assign_expand_fenv. */ -static tree atomic_hold_decl, atomic_clear_decl, atomic_update_decl; -#endif +/* This is not inside an #ifdef RS6000_GLIBC_ATOMIC_FENV because gengtype + ignores it then. */ +static GTY(()) tree atomic_hold_decl; +static GTY(()) tree atomic_clear_decl; +static GTY(()) tree atomic_update_decl; /* Implement TARGET_ATOMIC_ASSIGN_EXPAND_FENV hook. */ - static void rs6000_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update) { -- cgit v1.1 From 9d1796d82d46dd3086f07953129dc5761feb707b Mon Sep 17 00:00:00 2001 From: Roger Sayle Date: Tue, 22 Feb 2022 18:17:24 +0000 Subject: Restore bootstrap on x86_64-pc-linux-gnu This patch resolves the bootstrap failure on x86_64-pc-linux-gnu. 2022-02-22 Roger Sayle gcc/ChangeLog * config/i386/i386-expand.cc (ix86_expand_cmpxchg_loop): Restore bootstrap. 
--- gcc/config/i386/i386-expand.cc | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 7f7055b..faa0191 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -23287,11 +23287,11 @@ void ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val, switch (mode) { - case TImode: + case E_TImode: gendw = gen_atomic_compare_and_swapti_doubleword; hmode = DImode; break; - case DImode: + case E_DImode: if (doubleword) { gendw = gen_atomic_compare_and_swapdi_doubleword; @@ -23300,12 +23300,15 @@ void ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val, else gen = gen_atomic_compare_and_swapdi_1; break; - case SImode: - gen = gen_atomic_compare_and_swapsi_1; break; - case HImode: - gen = gen_atomic_compare_and_swaphi_1; break; - case QImode: - gen = gen_atomic_compare_and_swapqi_1; break; + case E_SImode: + gen = gen_atomic_compare_and_swapsi_1; + break; + case E_HImode: + gen = gen_atomic_compare_and_swaphi_1; + break; + case E_QImode: + gen = gen_atomic_compare_and_swapqi_1; + break; default: gcc_unreachable (); } -- cgit v1.1 From fd0ab7c734b04b91653467b94afd48ceca122083 Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Wed, 23 Feb 2022 06:44:12 +0000 Subject: arm: Fix typo in auto-vectorized MVE comparisons I made a last minute renaming of mve_const_bool_vec_to_hi () into mve_bool_vec_to_const () and forgot to update the call sites in vfp.md accordingly. Committed as obvious. 2022-02-23 Christophe Lyon gcc/ PR target/100757 PR target/101325 * config/arm/vfp.md (thumb2_movhi_vfp, thumb2_movhi_fp16): Fix typo. --- gcc/config/arm/vfp.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arm/vfp.md b/gcc/config/arm/vfp.md index f00d1ca..d0f423c 100644 --- a/gcc/config/arm/vfp.md +++ b/gcc/config/arm/vfp.md @@ -89,7 +89,7 @@ return "mov%?\t%0, %1\t%@ movhi"; case 1: if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_VECTOR_BOOL) - operands[1] = mve_const_bool_vec_to_hi (operands[1]); + operands[1] = mve_bool_vec_to_const (operands[1]); else operands[1] = gen_lowpart (HImode, operands[1]); return "mov%?\t%0, %1\t%@ movhi"; @@ -193,7 +193,7 @@ return "mov%?\t%0, %1\t%@ movhi"; case 1: if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_VECTOR_BOOL) - operands[1] = mve_const_bool_vec_to_hi (operands[1]); + operands[1] = mve_bool_vec_to_const (operands[1]); else operands[1] = gen_lowpart (HImode, operands[1]); return "mov%?\t%0, %1\t%@ movhi"; -- cgit v1.1 From 06770148711226ba243b964451dfa8816d5d23e5 Mon Sep 17 00:00:00 2001 From: Roger Sayle Date: Wed, 23 Feb 2022 07:24:50 +0000 Subject: nvptx: Back-end portion of a fix for PR target/104489. This one line fix/tweak is the back-end specific change for a fix for PR target/104489, that allows the ISA for GCC's nvptx backend to be bumped to sm_53. The machine-independent middle-end pieces were posted here: https://gcc.gnu.org/pipermail/gcc-patches/2022-February/590139.html 2022-02-23 Roger Sayle gcc/ChangeLog PR target/104489 * config/nvptx/nvptx.md (*movhf_insn): Add subregs_ok attribute. 
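(Illustrative only: the sketch below is not part of the patch and assumes _Float16 is usable on the nvptx target once the sm_53 pieces are in place; the function name is made up. It shows the kind of HFmode load/store that the *movhf_insn alternatives handle.)

/* Copy one 16-bit float; the load and store correspond to the ld.b16
   and st.b16 alternatives of *movhf_insn.  */
void
copy_hf (_Float16 *dst, const _Float16 *src)
{
  *dst = *src;
}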
--- gcc/config/nvptx/nvptx.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index f6dc817..216e89f 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -288,7 +288,8 @@ "@ %.\\tmov.b16\\t%0, %1; %.\\tld.b16\\t%0, %1; - %.\\tst.b16\\t%0, %1;") + %.\\tst.b16\\t%0, %1;" + [(set_attr "subregs_ok" "true")]) (define_expand "movhf" [(set (match_operand:HF 0 "nonimmediate_operand" "") -- cgit v1.1 From ffb2c67170768d5aa2d84a143405da658930e9b0 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Wed, 23 Feb 2022 14:32:29 +0800 Subject: Fix typo in v1ti3. For evex encoding vp{xor,or,and}, suffix is needed. Or there would be an error for vpxor %xmm0, %xmm31, %xmm1 Error: unsupported instruction `vpxor' gcc/ChangeLog: * config/i386/sse.md (v1ti3): Add suffix and replace isa attr of alternative 2 from avx to avx512vl. gcc/testsuite/ChangeLog: * gcc.target/i386/avx512vl-logicsuffix-1.c: New test. --- gcc/config/i386/sse.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index b2f5634..3066ea3 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -17025,8 +17025,8 @@ "@ p\t{%2, %0|%0, %2} vp\t{%2, %1, %0|%0, %1, %2} - vp\t{%2, %1, %0|%0, %1, %2}" - [(set_attr "isa" "noavx,avx,avx") + vpd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" "noavx,avx,avx512vl") (set_attr "prefix" "orig,vex,evex") (set_attr "prefix_data16" "1,*,*") (set_attr "type" "sselog") -- cgit v1.1 From 7862f6ccd85a001e4d70abb00bb95d8c7846ba80 Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Wed, 23 Feb 2022 09:33:33 +0100 Subject: [nvptx] Fix dummy location in gen_comment I committed "[nvptx] Add -mptx-comment", but tested it in combination with the proposed "[final] Handle compiler-generated asm insn" ( https://gcc.gnu.org/pipermail/gcc-patches/2022-February/590721.html ), so by itself the commit introduced some regressions: ... FAIL: gcc.dg/20020426-2.c (internal compiler error: Segmentation fault) FAIL: gcc.dg/analyzer/zlib-3.c (internal compiler error: Segmentation fault) FAIL: gcc.dg/pr101223.c (internal compiler error: Segmentation fault) FAIL: gcc.dg/torture/pr80764.c -O2 (internal compiler error: Segmentation fault) ... There are due to cfun->function_start_locus == 0. Fix these by using DECL_SOURCE_LOCATION (cfun->decl) instead. Tested on nvptx. gcc/ChangeLog: 2022-02-23 Tom de Vries * config/nvptx/nvptx.cc (gen_comment): Use DECL_SOURCE_LOCATION (cfun->decl) instead of cfun->function_start_locus. --- gcc/config/nvptx/nvptx.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index 858789e..6f6d592 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -5382,7 +5382,7 @@ gen_comment (const char *s) char *comment = (char *) alloca (len); snprintf (comment, len, "%s%s%s", ASM_COMMENT_START, sep, s); return gen_rtx_ASM_INPUT_loc (VOIDmode, ggc_strdup (comment), - cfun->function_start_locus); + DECL_SOURCE_LOCATION (cfun->decl)); } /* Initialize all declared regs at function entry. -- cgit v1.1 From c982d02ffe26fcd07280bf0f35f90df9be00716e Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Wed, 23 Feb 2022 09:39:53 +0100 Subject: [nvptx] Add shf.{l,r}.wrap insn Ptx contains funnel shift operations shf.l.wrap and shf.r.wrap that can be used to implement 32-bit left or right rotate. 
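(As an aside, not part of the patch: the portable C rotate idiom below is the kind of source GCC's rotate recognition turns into a 32-bit rotate, which the new patterns can then emit as a single shf.l.wrap.b32.)

/* 32-bit rotate-left written portably; the (-n) & 31 form avoids
   undefined behaviour when n is 0.  */
unsigned int
rotl32 (unsigned int x, unsigned int n)
{
  return (x << (n & 31)) | (x >> ((-n) & 31));
}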
Add define_insns rotlsi3 and rotrsi3. Tested on nvptx. gcc/ChangeLog: 2022-02-23 Tom de Vries * config/nvptx/nvptx.md (define_insn "rotlsi3", define_insn "rotrsi3"): New define_insn. gcc/testsuite/ChangeLog: 2022-02-23 Tom de Vries * gcc.target/nvptx/rotate-run.c: New test. * gcc.target/nvptx/rotate.c: New test. --- gcc/config/nvptx/nvptx.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index 216e89f..4989b56 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -808,6 +808,22 @@ "" "%.\\tshr.u%T0\\t%0, %1, %2;") +(define_insn "rotlsi3" + [(set (match_operand:SI 0 "nvptx_register_operand" "=R") + (rotate:SI (match_operand:SI 1 "nvptx_register_operand" "R") + (and:SI (match_operand:SI 2 "nvptx_nonmemory_operand" "Ri") + (const_int 31))))] + "TARGET_SM35" + "%.\\tshf.l.wrap.b32\\t%0, %1, %1, %2;") + +(define_insn "rotrsi3" + [(set (match_operand:SI 0 "nvptx_register_operand" "=R") + (rotatert:SI (match_operand:SI 1 "nvptx_register_operand" "R") + (and:SI (match_operand:SI 2 "nvptx_nonmemory_operand" "Ri") + (const_int 31))))] + "TARGET_SM35" + "%.\\tshf.r.wrap.b32\\t%0, %1, %1, %2;") + ;; Logical operations (define_code_iterator any_logic [and ior xor]) -- cgit v1.1 From a046033ea0ba97314265933bc48124574db2d62a Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Wed, 23 Feb 2022 15:58:59 +0100 Subject: [nvptx] Add missing t-omp-device isas In t-omp-device we list isas that can be used in omp declare variant like so: ... #pragma omp declare variant (f30) match (device={isa("sm_30")}) ... and in nvptx_omp_device_kind_arch_isa we handle them. Update both to reflect the current list of isas. Tested on x86_64-linux with nvptx accelerator. gcc/ChangeLog: 2022-02-23 Tom de Vries * config/nvptx/nvptx.cc (nvptx_omp_device_kind_arch_isa): Handle sm_70, sm_75 and sm_80. * config/nvptx/t-omp-device: Add sm_53, sm_70, sm_75 and sm_80. 
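(For illustration only, mirroring the sm_30 example above; the names f and f70 are hypothetical.) With the updated list, a variant can now be selected for the newer ISAs as well, e.g.:

void f70 (void);
#pragma omp declare variant (f70) match (device={isa("sm_70")})
void f (void);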
Co-Authored-By: Tobias Burnus --- gcc/config/nvptx/nvptx.cc | 8 +++++++- gcc/config/nvptx/t-omp-device | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index 6f6d592..b9451c2 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -6181,7 +6181,13 @@ nvptx_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait, if (strcmp (name, "sm_35") == 0) return TARGET_SM35 && !TARGET_SM53; if (strcmp (name, "sm_53") == 0) - return TARGET_SM53; + return TARGET_SM53 && !TARGET_SM70; + if (strcmp (name, "sm_70") == 0) + return TARGET_SM70 && !TARGET_SM75; + if (strcmp (name, "sm_75") == 0) + return TARGET_SM75 && !TARGET_SM80; + if (strcmp (name, "sm_80") == 0) + return TARGET_SM80; return 0; default: gcc_unreachable (); diff --git a/gcc/config/nvptx/t-omp-device b/gcc/config/nvptx/t-omp-device index 8765d9f..4228218 100644 --- a/gcc/config/nvptx/t-omp-device +++ b/gcc/config/nvptx/t-omp-device @@ -1,4 +1,4 @@ omp-device-properties-nvptx: $(srcdir)/config/nvptx/nvptx.cc echo kind: gpu > $@ echo arch: nvptx >> $@ - echo isa: sm_30 sm_35 >> $@ + echo isa: sm_30 sm_35 sm_53 sm_70 sm_75 sm_80 >> $@ -- cgit v1.1 From eabf7bbe601f2c0d87bd0a1012d7a602df2037da Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Fri, 25 Feb 2022 12:06:52 +0100 Subject: i386: Use a new temp slot kind for splitter to floatdi2_i387_with_xmm [PR104674] As mentioned in the PR, the following testcase is miscompiled for similar reasons as the already fixed PR78791 - we use SLOT_TEMP slots in various places during expansion and during expansion we can guarantee that the lifetime of those temporary slot doesn't overlap. But the following splitter uses SLOT_TEMP too and in between expansion and split1 there is a possibility that something extends the lifetime of SLOT_TEMP created slots across an instruction that will be split by this splitter. The following patch fixes it by using a new temp slot kind to make sure it doesn't reuse a SLOT_TEMP that could be live across the instruction. 2022-02-25 Jakub Jelinek PR target/104674 * config/i386/i386.h (enum ix86_stack_slot): Add SLOT_FLOATxFDI_387. * config/i386/i386.md (splitter to floatdi2_i387_with_xmm): Use SLOT_FLOATxFDI_387 rather than SLOT_TEMP. * gcc.target/i386/pr104674.c: New test. --- gcc/config/i386/i386.h | 1 + gcc/config/i386/i386.md | 5 ++--- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index f41e090..b37d4a9 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -2414,6 +2414,7 @@ enum ix86_stack_slot SLOT_CW_FLOOR, SLOT_CW_CEIL, SLOT_STV_TEMP, + SLOT_FLOATxFDI_387, MAX_386_STACK_LOCALS }; diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 8ffa641..e7c5490 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -5412,9 +5412,8 @@ && can_create_pseudo_p ()" [(const_int 0)] { - emit_insn (gen_floatdi2_i387_with_xmm - (operands[0], operands[1], - assign_386_stack_local (DImode, SLOT_TEMP))); + rtx s = assign_386_stack_local (DImode, SLOT_FLOATxFDI_387); + emit_insn (gen_floatdi2_i387_with_xmm (operands[0], operands[1], s)); DONE; }) -- cgit v1.1 From d54cdd1538deebed97fb9531dc3e1a42eaf0a80f Mon Sep 17 00:00:00 2001 From: Claudiu Zissulescu Date: Fri, 25 Feb 2022 13:39:22 +0200 Subject: arc: Fail conditional move expand patterns If the movcc comparison is not valid it triggers an assert in the current implementation. 
This behavior is not needed as we can FAIL the movcc expand pattern. gcc/ * config/arc/arc.cc (gen_compare_reg): Return NULL_RTX if the comparison is not valid. * config/arc/arc.md (movsicc): Fail if comparison is not valid. (movdicc): Likewise. (movsfcc): Likewise. (movdfcc): Likewise. Signed-off-by: Claudiu Zissulescu --- gcc/config/arc/arc.cc | 3 ++- gcc/config/arc/arc.md | 25 ++++++++++++++++++++----- 2 files changed, 22 insertions(+), 6 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/arc/arc.cc b/gcc/config/arc/arc.cc index 8cc1735..c27ba99 100644 --- a/gcc/config/arc/arc.cc +++ b/gcc/config/arc/arc.cc @@ -2256,7 +2256,8 @@ gen_compare_reg (rtx comparison, machine_mode omode) cmode = GET_MODE (x); if (cmode == VOIDmode) cmode = GET_MODE (y); - gcc_assert (cmode == SImode || cmode == SFmode || cmode == DFmode); + if (cmode != SImode && cmode != SFmode && cmode != DFmode) + return NULL_RTX; if (cmode == SImode) { if (!register_operand (x, SImode)) diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md index ace3cb7..39b3580 100644 --- a/gcc/config/arc/arc.md +++ b/gcc/config/arc/arc.md @@ -1618,8 +1618,11 @@ core_3, archs4x, archs4xd, archs4xd_slow" (match_operand:SI 2 "nonmemory_operand" "") (match_operand:SI 3 "register_operand" "")))] "" - "operands[1] = gen_compare_reg (operands[1], VOIDmode);") - + " + operands[1] = gen_compare_reg (operands[1], VOIDmode); + if (operands[1] == NULL_RTX) + FAIL; + ") (define_expand "movdicc" [(set (match_operand:DI 0 "dest_reg_operand" "") @@ -1627,7 +1630,11 @@ core_3, archs4x, archs4xd, archs4xd_slow" (match_operand:DI 2 "nonmemory_operand" "") (match_operand:DI 3 "register_operand" "")))] "" - "operands[1] = gen_compare_reg (operands[1], VOIDmode);") + " + operands[1] = gen_compare_reg (operands[1], VOIDmode); + if (operands[1] == NULL_RTX) + FAIL; + ") (define_expand "movsfcc" @@ -1636,7 +1643,11 @@ core_3, archs4x, archs4xd, archs4xd_slow" (match_operand:SF 2 "nonmemory_operand" "") (match_operand:SF 3 "register_operand" "")))] "" - "operands[1] = gen_compare_reg (operands[1], VOIDmode);") + " + operands[1] = gen_compare_reg (operands[1], VOIDmode); + if (operands[1] == NULL_RTX) + FAIL; + ") (define_expand "movdfcc" [(set (match_operand:DF 0 "dest_reg_operand" "") @@ -1644,7 +1655,11 @@ core_3, archs4x, archs4xd, archs4xd_slow" (match_operand:DF 2 "nonmemory_operand" "") (match_operand:DF 3 "register_operand" "")))] "" - "operands[1] = gen_compare_reg (operands[1], VOIDmode);") + " + operands[1] = gen_compare_reg (operands[1], VOIDmode); + if (operands[1] == NULL_RTX) + FAIL; + ") (define_insn "*movsicc_insn" [(set (match_operand:SI 0 "dest_reg_operand" "=w,w") -- cgit v1.1 From 3885a122f817a1b6dca4a84ba9e020d5ab2060af Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Fri, 25 Feb 2022 18:58:48 +0100 Subject: rs6000: Use rs6000_emit_move in movmisalign expander [PR104681] The following testcase ICEs, because for some strange reason it decides to use movmisaligntf during expansion where the destination is MEM and source is CONST_DOUBLE. For normal mov expanders the rs6000 backend uses rs6000_emit_move to ensure that if one operand is a MEM, the other is a REG and a few other things, but for movmisalign nothing enforced this. The middle-end documents that movmisalign shouldn't fail, so we can't force that through predicates or condition on the expander. 2022-02-25 Jakub Jelinek PR target/104681 * config/rs6000/vector.md (movmisalign): Use rs6000_emit_move. * g++.dg/opt/pr104681.C: New test. 
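(A hedged illustration of the problematic shape only; this is not the PR's testcase, the struct and names are made up, and whether expansion really takes the misaligned-move path depends on the target's alignment assumptions.)

/* Force an under-aligned long double (TFmode) access so the store has
   a MEM destination and a CONST_DOUBLE source.  */
struct __attribute__ ((packed)) wrapper { char c; long double x; };

void
set_one (struct wrapper *w)
{
  w->x = 1.0L;
}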
--- gcc/config/rs6000/vector.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/vector.md b/gcc/config/rs6000/vector.md index b87a742..4d0797c 100644 --- a/gcc/config/rs6000/vector.md +++ b/gcc/config/rs6000/vector.md @@ -1519,7 +1519,10 @@ [(set (match_operand:VEC_N 0 "nonimmediate_operand") (match_operand:VEC_N 1 "any_operand"))] "VECTOR_MEM_VSX_P (mode) && TARGET_ALLOW_MOVMISALIGN" - "") +{ + rs6000_emit_move (operands[0], operands[1], mode); + DONE; +}) ;; Vector shift right in bits. Currently supported ony for shift ;; amounts that can be expressed as byte shifts (divisible by 8). -- cgit v1.1 From 50d9ca7104d40f0a331d0dd01e3c069ecf7f6c97 Mon Sep 17 00:00:00 2001 From: Hongyu Wang Date: Fri, 25 Feb 2022 15:09:03 +0800 Subject: AVX512F: Add helper enumeration for ternary logic intrinsics. Sync with llvm change in https://reviews.llvm.org/D120307 to add enumeration and truncate imm to unsigned char, so users could use ~ on immediates. gcc/ChangeLog: * config/i386/avx512fintrin.h (_MM_TERNLOG_ENUM): New enum. (_mm512_ternarylogic_epi64): Truncate imm to unsigned char to avoid error when using ~enum as parameter. (_mm512_mask_ternarylogic_epi64): Likewise. (_mm512_maskz_ternarylogic_epi64): Likewise. (_mm512_ternarylogic_epi32): Likewise. (_mm512_mask_ternarylogic_epi32): Likewise. (_mm512_maskz_ternarylogic_epi32): Likewise. * config/i386/avx512vlintrin.h (_mm256_ternarylogic_epi64): Adjust imm param type to unsigned char. (_mm256_mask_ternarylogic_epi64): Likewise. (_mm256_maskz_ternarylogic_epi64): Likewise. (_mm256_ternarylogic_epi32): Likewise. (_mm256_mask_ternarylogic_epi32): Likewise. (_mm256_maskz_ternarylogic_epi32): Likewise. (_mm_ternarylogic_epi64): Likewise. (_mm_mask_ternarylogic_epi64): Likewise. (_mm_maskz_ternarylogic_epi64): Likewise. (_mm_ternarylogic_epi32): Likewise. (_mm_mask_ternarylogic_epi32): Likewise. (_mm_maskz_ternarylogic_epi32): Likewise. gcc/testsuite/ChangeLog: * gcc.target/i386/avx512f-vpternlogd-1.c: Use new enum. * gcc.target/i386/avx512f-vpternlogq-1.c: Likewise. * gcc.target/i386/avx512vl-vpternlogd-1.c: Likewise. * gcc.target/i386/avx512vl-vpternlogq-1.c: Likewise. * gcc.target/i386/testimm-10.c: Remove imm check for vpternlog insns since the imm has been truncated in intrinsic. --- gcc/config/i386/avx512fintrin.h | 132 ++++++++++++------- gcc/config/i386/avx512vlintrin.h | 278 ++++++++++++++++++++++++--------------- 2 files changed, 262 insertions(+), 148 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/avx512fintrin.h b/gcc/config/i386/avx512fintrin.h index bc10c82..29511fd 100644 --- a/gcc/config/i386/avx512fintrin.h +++ b/gcc/config/i386/avx512fintrin.h @@ -1639,16 +1639,27 @@ _mm_maskz_sub_round_ss (__mmask8 __U, __m128 __A, __m128 __B, #endif +/* Constant helper to represent the ternary logic operations among + vector A, B and C. 
*/ +typedef enum +{ + _MM_TERNLOG_A = 0xF0, + _MM_TERNLOG_B = 0xCC, + _MM_TERNLOG_C = 0xAA +} _MM_TERNLOG_ENUM; + #ifdef __OPTIMIZE__ extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_ternarylogic_epi64 (__m512i __A, __m512i __B, __m512i __C, const int __imm) { - return (__m512i) __builtin_ia32_pternlogq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) __C, __imm, - (__mmask8) -1); + return (__m512i) + __builtin_ia32_pternlogq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __C, + (unsigned char) __imm, + (__mmask8) -1); } extern __inline __m512i @@ -1656,10 +1667,12 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_ternarylogic_epi64 (__m512i __A, __mmask8 __U, __m512i __B, __m512i __C, const int __imm) { - return (__m512i) __builtin_ia32_pternlogq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) __C, __imm, - (__mmask8) __U); + return (__m512i) + __builtin_ia32_pternlogq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __C, + (unsigned char) __imm, + (__mmask8) __U); } extern __inline __m512i @@ -1667,10 +1680,12 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_maskz_ternarylogic_epi64 (__mmask8 __U, __m512i __A, __m512i __B, __m512i __C, const int __imm) { - return (__m512i) __builtin_ia32_pternlogq512_maskz ((__v8di) __A, - (__v8di) __B, - (__v8di) __C, - __imm, (__mmask8) __U); + return (__m512i) + __builtin_ia32_pternlogq512_maskz ((__v8di) __A, + (__v8di) __B, + (__v8di) __C, + (unsigned char) __imm, + (__mmask8) __U); } extern __inline __m512i @@ -1678,10 +1693,12 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_ternarylogic_epi32 (__m512i __A, __m512i __B, __m512i __C, const int __imm) { - return (__m512i) __builtin_ia32_pternlogd512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) __C, - __imm, (__mmask16) -1); + return (__m512i) + __builtin_ia32_pternlogd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __C, + (unsigned char) __imm, + (__mmask16) -1); } extern __inline __m512i @@ -1689,10 +1706,12 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_ternarylogic_epi32 (__m512i __A, __mmask16 __U, __m512i __B, __m512i __C, const int __imm) { - return (__m512i) __builtin_ia32_pternlogd512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) __C, - __imm, (__mmask16) __U); + return (__m512i) + __builtin_ia32_pternlogd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __C, + (unsigned char) __imm, + (__mmask16) __U); } extern __inline __m512i @@ -1700,33 +1719,56 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_maskz_ternarylogic_epi32 (__mmask16 __U, __m512i __A, __m512i __B, __m512i __C, const int __imm) { - return (__m512i) __builtin_ia32_pternlogd512_maskz ((__v16si) __A, - (__v16si) __B, - (__v16si) __C, - __imm, (__mmask16) __U); + return (__m512i) + __builtin_ia32_pternlogd512_maskz ((__v16si) __A, + (__v16si) __B, + (__v16si) __C, + (unsigned char) __imm, + (__mmask16) __U); } #else -#define _mm512_ternarylogic_epi64(A, B, C, I) \ - ((__m512i) __builtin_ia32_pternlogq512_mask ((__v8di)(__m512i)(A), \ - (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), (int)(I), (__mmask8)-1)) -#define _mm512_mask_ternarylogic_epi64(A, U, B, C, I) \ - ((__m512i) __builtin_ia32_pternlogq512_mask ((__v8di)(__m512i)(A), \ - (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), (int)(I), (__mmask8)(U))) -#define _mm512_maskz_ternarylogic_epi64(U, A, B, C, I) \ - ((__m512i) __builtin_ia32_pternlogq512_maskz 
((__v8di)(__m512i)(A), \ - (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), (int)(I), (__mmask8)(U))) -#define _mm512_ternarylogic_epi32(A, B, C, I) \ - ((__m512i) __builtin_ia32_pternlogd512_mask ((__v16si)(__m512i)(A), \ - (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), (int)(I), \ - (__mmask16)-1)) -#define _mm512_mask_ternarylogic_epi32(A, U, B, C, I) \ - ((__m512i) __builtin_ia32_pternlogd512_mask ((__v16si)(__m512i)(A), \ - (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), (int)(I), \ - (__mmask16)(U))) -#define _mm512_maskz_ternarylogic_epi32(U, A, B, C, I) \ - ((__m512i) __builtin_ia32_pternlogd512_maskz ((__v16si)(__m512i)(A), \ - (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), (int)(I), \ - (__mmask16)(U))) +#define _mm512_ternarylogic_epi64(A, B, C, I) \ + ((__m512i) \ + __builtin_ia32_pternlogq512_mask ((__v8di) (__m512i) (A), \ + (__v8di) (__m512i) (B), \ + (__v8di) (__m512i) (C), \ + (unsigned char) (I), \ + (__mmask8) -1)) +#define _mm512_mask_ternarylogic_epi64(A, U, B, C, I) \ + ((__m512i) \ + __builtin_ia32_pternlogq512_mask ((__v8di) (__m512i) (A), \ + (__v8di) (__m512i) (B), \ + (__v8di) (__m512i) (C), \ + (unsigned char)(I), \ + (__mmask8) (U))) +#define _mm512_maskz_ternarylogic_epi64(U, A, B, C, I) \ + ((__m512i) \ + __builtin_ia32_pternlogq512_maskz ((__v8di) (__m512i) (A), \ + (__v8di) (__m512i) (B), \ + (__v8di) (__m512i) (C), \ + (unsigned char) (I), \ + (__mmask8) (U))) +#define _mm512_ternarylogic_epi32(A, B, C, I) \ + ((__m512i) \ + __builtin_ia32_pternlogd512_mask ((__v16si) (__m512i) (A), \ + (__v16si) (__m512i) (B), \ + (__v16si) (__m512i) (C), \ + (unsigned char) (I), \ + (__mmask16) -1)) +#define _mm512_mask_ternarylogic_epi32(A, U, B, C, I) \ + ((__m512i) \ + __builtin_ia32_pternlogd512_mask ((__v16si) (__m512i) (A), \ + (__v16si) (__m512i) (B), \ + (__v16si) (__m512i) (C), \ + (unsigned char) (I), \ + (__mmask16) (U))) +#define _mm512_maskz_ternarylogic_epi32(U, A, B, C, I) \ + ((__m512i) \ + __builtin_ia32_pternlogd512_maskz ((__v16si) (__m512i) (A), \ + (__v16si) (__m512i) (B), \ + (__v16si) (__m512i) (C), \ + (unsigned char) (I), \ + (__mmask16) (U))) #endif extern __inline __m512d diff --git a/gcc/config/i386/avx512vlintrin.h b/gcc/config/i386/avx512vlintrin.h index bbced24..26b286e 100644 --- a/gcc/config/i386/avx512vlintrin.h +++ b/gcc/config/i386/avx512vlintrin.h @@ -10575,10 +10575,12 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_ternarylogic_epi64 (__m256i __A, __m256i __B, __m256i __C, const int __imm) { - return (__m256i) __builtin_ia32_pternlogq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) __C, __imm, - (__mmask8) -1); + return (__m256i) + __builtin_ia32_pternlogq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __C, + (unsigned char) __imm, + (__mmask8) -1); } extern __inline __m256i @@ -10587,10 +10589,12 @@ _mm256_mask_ternarylogic_epi64 (__m256i __A, __mmask8 __U, __m256i __B, __m256i __C, const int __imm) { - return (__m256i) __builtin_ia32_pternlogq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) __C, __imm, - (__mmask8) __U); + return (__m256i) + __builtin_ia32_pternlogq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __C, + (unsigned char) __imm, + (__mmask8) __U); } extern __inline __m256i @@ -10599,11 +10603,12 @@ _mm256_maskz_ternarylogic_epi64 (__mmask8 __U, __m256i __A, __m256i __B, __m256i __C, const int __imm) { - return (__m256i) __builtin_ia32_pternlogq256_maskz ((__v4di) __A, - (__v4di) __B, - (__v4di) __C, - __imm, - (__mmask8) __U); + return (__m256i) + __builtin_ia32_pternlogq256_maskz 
((__v4di) __A, + (__v4di) __B, + (__v4di) __C, + (unsigned char) __imm, + (__mmask8) __U); } extern __inline __m256i @@ -10611,10 +10616,12 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_ternarylogic_epi32 (__m256i __A, __m256i __B, __m256i __C, const int __imm) { - return (__m256i) __builtin_ia32_pternlogd256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) __C, __imm, - (__mmask8) -1); + return (__m256i) + __builtin_ia32_pternlogd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __C, + (unsigned char) __imm, + (__mmask8) -1); } extern __inline __m256i @@ -10623,10 +10630,12 @@ _mm256_mask_ternarylogic_epi32 (__m256i __A, __mmask8 __U, __m256i __B, __m256i __C, const int __imm) { - return (__m256i) __builtin_ia32_pternlogd256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) __C, __imm, - (__mmask8) __U); + return (__m256i) + __builtin_ia32_pternlogd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __C, + (unsigned char) __imm, + (__mmask8) __U); } extern __inline __m256i @@ -10635,11 +10644,12 @@ _mm256_maskz_ternarylogic_epi32 (__mmask8 __U, __m256i __A, __m256i __B, __m256i __C, const int __imm) { - return (__m256i) __builtin_ia32_pternlogd256_maskz ((__v8si) __A, - (__v8si) __B, - (__v8si) __C, - __imm, - (__mmask8) __U); + return (__m256i) + __builtin_ia32_pternlogd256_maskz ((__v8si) __A, + (__v8si) __B, + (__v8si) __C, + (unsigned char) __imm, + (__mmask8) __U); } extern __inline __m128i @@ -10647,33 +10657,40 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_ternarylogic_epi64 (__m128i __A, __m128i __B, __m128i __C, const int __imm) { - return (__m128i) __builtin_ia32_pternlogq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) __C, __imm, - (__mmask8) -1); + return (__m128i) + __builtin_ia32_pternlogq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __C, + (unsigned char) __imm, + (__mmask8) -1); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_ternarylogic_epi64 (__m128i __A, __mmask8 __U, - __m128i __B, __m128i __C, const int __imm) + __m128i __B, __m128i __C, + const int __imm) { - return (__m128i) __builtin_ia32_pternlogq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) __C, __imm, - (__mmask8) __U); + return (__m128i) + __builtin_ia32_pternlogq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __C, + (unsigned char) __imm, + (__mmask8) __U); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_ternarylogic_epi64 (__mmask8 __U, __m128i __A, - __m128i __B, __m128i __C, const int __imm) + __m128i __B, __m128i __C, + const int __imm) { - return (__m128i) __builtin_ia32_pternlogq128_maskz ((__v2di) __A, - (__v2di) __B, - (__v2di) __C, - __imm, - (__mmask8) __U); + return (__m128i) + __builtin_ia32_pternlogq128_maskz ((__v2di) __A, + (__v2di) __B, + (__v2di) __C, + (unsigned char) __imm, + (__mmask8) __U); } extern __inline __m128i @@ -10681,33 +10698,40 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_ternarylogic_epi32 (__m128i __A, __m128i __B, __m128i __C, const int __imm) { - return (__m128i) __builtin_ia32_pternlogd128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) __C, __imm, - (__mmask8) -1); + return (__m128i) + __builtin_ia32_pternlogd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __C, + (unsigned char) __imm, + (__mmask8) -1); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_ternarylogic_epi32 (__m128i __A, __mmask8 __U, - __m128i 
__B, __m128i __C, const int __imm) + __m128i __B, __m128i __C, + const int __imm) { - return (__m128i) __builtin_ia32_pternlogd128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) __C, __imm, - (__mmask8) __U); + return (__m128i) + __builtin_ia32_pternlogd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __C, + (unsigned char) __imm, + (__mmask8) __U); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_ternarylogic_epi32 (__mmask8 __U, __m128i __A, - __m128i __B, __m128i __C, const int __imm) + __m128i __B, __m128i __C, + const int __imm) { - return (__m128i) __builtin_ia32_pternlogd128_maskz ((__v4si) __A, - (__v4si) __B, - (__v4si) __C, - __imm, - (__mmask8) __U); + return (__m128i) + __builtin_ia32_pternlogd128_maskz ((__v4si) __A, + (__v4si) __B, + (__v4si) __C, + (unsigned char) __imm, + (__mmask8) __U); } extern __inline __m256 @@ -12910,53 +12934,101 @@ _mm256_permutex_pd (__m256d __X, const int __M) (__v2di)(__m128i)_mm_setzero_si128 (),\ (__mmask8)(U))) -#define _mm256_ternarylogic_epi64(A, B, C, I) \ - ((__m256i) __builtin_ia32_pternlogq256_mask ((__v4di)(__m256i)(A), \ - (__v4di)(__m256i)(B), (__v4di)(__m256i)(C), (int)(I), (__mmask8)-1)) - -#define _mm256_mask_ternarylogic_epi64(A, U, B, C, I) \ - ((__m256i) __builtin_ia32_pternlogq256_mask ((__v4di)(__m256i)(A), \ - (__v4di)(__m256i)(B), (__v4di)(__m256i)(C), (int)(I), (__mmask8)(U))) - -#define _mm256_maskz_ternarylogic_epi64(U, A, B, C, I) \ - ((__m256i) __builtin_ia32_pternlogq256_maskz ((__v4di)(__m256i)(A), \ - (__v4di)(__m256i)(B), (__v4di)(__m256i)(C), (int)(I), (__mmask8)(U))) - -#define _mm256_ternarylogic_epi32(A, B, C, I) \ - ((__m256i) __builtin_ia32_pternlogd256_mask ((__v8si)(__m256i)(A), \ - (__v8si)(__m256i)(B), (__v8si)(__m256i)(C), (int)(I), (__mmask8)-1)) - -#define _mm256_mask_ternarylogic_epi32(A, U, B, C, I) \ - ((__m256i) __builtin_ia32_pternlogd256_mask ((__v8si)(__m256i)(A), \ - (__v8si)(__m256i)(B), (__v8si)(__m256i)(C), (int)(I), (__mmask8)(U))) - -#define _mm256_maskz_ternarylogic_epi32(U, A, B, C, I) \ - ((__m256i) __builtin_ia32_pternlogd256_maskz ((__v8si)(__m256i)(A), \ - (__v8si)(__m256i)(B), (__v8si)(__m256i)(C), (int)(I), (__mmask8)(U))) - -#define _mm_ternarylogic_epi64(A, B, C, I) \ - ((__m128i) __builtin_ia32_pternlogq128_mask ((__v2di)(__m128i)(A), \ - (__v2di)(__m128i)(B), (__v2di)(__m128i)(C), (int)(I), (__mmask8)-1)) - -#define _mm_mask_ternarylogic_epi64(A, U, B, C, I) \ - ((__m128i) __builtin_ia32_pternlogq128_mask ((__v2di)(__m128i)(A), \ - (__v2di)(__m128i)(B), (__v2di)(__m128i)(C), (int)(I), (__mmask8)(U))) - -#define _mm_maskz_ternarylogic_epi64(U, A, B, C, I) \ - ((__m128i) __builtin_ia32_pternlogq128_maskz ((__v2di)(__m128i)(A), \ - (__v2di)(__m128i)(B), (__v2di)(__m128i)(C), (int)(I), (__mmask8)(U))) - -#define _mm_ternarylogic_epi32(A, B, C, I) \ - ((__m128i) __builtin_ia32_pternlogd128_mask ((__v4si)(__m128i)(A), \ - (__v4si)(__m128i)(B), (__v4si)(__m128i)(C), (int)(I), (__mmask8)-1)) - -#define _mm_mask_ternarylogic_epi32(A, U, B, C, I) \ - ((__m128i) __builtin_ia32_pternlogd128_mask ((__v4si)(__m128i)(A), \ - (__v4si)(__m128i)(B), (__v4si)(__m128i)(C), (int)(I), (__mmask8)(U))) - -#define _mm_maskz_ternarylogic_epi32(U, A, B, C, I) \ - ((__m128i) __builtin_ia32_pternlogd128_maskz ((__v4si)(__m128i)(A), \ - (__v4si)(__m128i)(B), (__v4si)(__m128i)(C), (int)(I), (__mmask8)(U))) +#define _mm256_ternarylogic_epi64(A, B, C, I) \ + ((__m256i) \ + __builtin_ia32_pternlogq256_mask ((__v4di) (__m256i) (A), \ + (__v4di) (__m256i) 
(B), \ + (__v4di) (__m256i) (C), \ + (unsigned char) (I), \ + (__mmask8) -1)) + +#define _mm256_mask_ternarylogic_epi64(A, U, B, C, I) \ + ((__m256i) \ + __builtin_ia32_pternlogq256_mask ((__v4di) (__m256i) (A), \ + (__v4di) (__m256i) (B), \ + (__v4di) (__m256i) (C), \ + (unsigned char) (I), \ + (__mmask8) (U))) + +#define _mm256_maskz_ternarylogic_epi64(U, A, B, C, I) \ + ((__m256i) \ + __builtin_ia32_pternlogq256_maskz ((__v4di) (__m256i) (A), \ + (__v4di) (__m256i) (B), \ + (__v4di) (__m256i) (C), \ + (unsigned char) (I), \ + (__mmask8) (U))) + +#define _mm256_ternarylogic_epi32(A, B, C, I) \ + ((__m256i) \ + __builtin_ia32_pternlogd256_mask ((__v8si) (__m256i) (A), \ + (__v8si) (__m256i) (B), \ + (__v8si) (__m256i) (C), \ + (unsigned char) (I), \ + (__mmask8) -1)) + +#define _mm256_mask_ternarylogic_epi32(A, U, B, C, I) \ + ((__m256i) \ + __builtin_ia32_pternlogd256_mask ((__v8si) (__m256i) (A), \ + (__v8si) (__m256i) (B), \ + (__v8si) (__m256i) (C), \ + (unsigned char) (I), \ + (__mmask8) (U))) + +#define _mm256_maskz_ternarylogic_epi32(U, A, B, C, I) \ + ((__m256i) \ + __builtin_ia32_pternlogd256_maskz ((__v8si) (__m256i) (A), \ + (__v8si) (__m256i) (B), \ + (__v8si) (__m256i) (C), \ + (unsigned char) (I), \ + (__mmask8) (U))) + +#define _mm_ternarylogic_epi64(A, B, C, I) \ + ((__m128i) \ + __builtin_ia32_pternlogq128_mask ((__v2di) (__m128i) (A), \ + (__v2di) (__m128i) (B), \ + (__v2di) (__m128i) (C), \ + (unsigned char) (I), \ + (__mmask8) -1)) + +#define _mm_mask_ternarylogic_epi64(A, U, B, C, I) \ + ((__m128i) \ + __builtin_ia32_pternlogq128_mask ((__v2di) (__m128i) (A), \ + (__v2di) (__m128i) (B), \ + (__v2di) (__m128i) (C), \ + (unsigned char) (I), \ + (__mmask8) (U))) + +#define _mm_maskz_ternarylogic_epi64(U, A, B, C, I) \ + ((__m128i) \ + __builtin_ia32_pternlogq128_maskz ((__v2di) (__m128i) (A), \ + (__v2di) (__m128i) (B), \ + (__v2di) (__m128i) (C), \ + (unsigned char) (I), \ + (__mmask8) (U))) + +#define _mm_ternarylogic_epi32(A, B, C, I) \ + ((__m128i) \ + __builtin_ia32_pternlogd128_mask ((__v4si) (__m128i) (A), \ + (__v4si) (__m128i) (B), \ + (__v4si) (__m128i) (C), \ + (unsigned char) (I), \ + (__mmask8) -1)) + +#define _mm_mask_ternarylogic_epi32(A, U, B, C, I) \ + ((__m128i) \ + __builtin_ia32_pternlogd128_mask ((__v4si) (__m128i) (A), \ + (__v4si) (__m128i) (B), \ + (__v4si) (__m128i) (C), \ + (unsigned char) (I), \ + (__mmask8) (U))) + +#define _mm_maskz_ternarylogic_epi32(U, A, B, C, I) \ + ((__m128i) \ + __builtin_ia32_pternlogd128_maskz ((__v4si) (__m128i) (A), \ + (__v4si) (__m128i) (B), \ + (__v4si) (__m128i) (C), \ + (unsigned char) (I), \ + (__mmask8) (U))) #define _mm256_roundscale_ps(A, B) \ ((__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf)(__m256)(A), \ -- cgit v1.1 From 9d87ad0ca5cd18807546a081e7d539be8b5418bf Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Fri, 25 Feb 2022 16:11:23 +0100 Subject: [nvptx] Add -mptx=_ Add an -mptx=_ value, that indicates the default ptx version. It can be used to undo an explicit -mptx setting, so this: ... $ gcc test.c -mptx=3.1 -mptx=_ ... has the same effect as: ... $ gcc test.c ... Tested on nvptx. gcc/ChangeLog: 2022-02-28 Tom de Vries * config/nvptx/nvptx-opts.h (enum ptx_version): Add PTX_VERSION_default. * config/nvptx/nvptx.cc (handle_ptx_version_option): Handle PTX_VERSION_default. * config/nvptx/nvptx.opt: Add EnumValue "_" / PTX_VERSION_default. 
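[Editor's sketch] Before the diff below, the gist of the new option handling can be summarized in a few lines of C. This is only an illustration written from the description above, not the GCC sources; the enumerator names follow the patch, and the default chosen here is assumed purely for the sketch.

  enum ptx_version { PTX_VERSION_default, PTX_VERSION_3_1, PTX_VERSION_6_3 };

  enum ptx_version
  resolve_ptx_version (int option_set_p, enum ptx_version opt)
  {
    /* -mptx=_ maps to PTX_VERSION_default and is treated exactly like
       "no -mptx given": fall back to the compiler's default choice.  */
    if (!option_set_p || opt == PTX_VERSION_default)
      return PTX_VERSION_6_3;   /* assumed default for this sketch only */
    return opt;
  }

In other words, a later -mptx=_ on the command line simply undoes any earlier explicit -mptx= setting.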
--- gcc/config/nvptx/nvptx-opts.h | 1 + gcc/config/nvptx/nvptx.cc | 3 ++- gcc/config/nvptx/nvptx.opt | 3 +++ 3 files changed, 6 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx-opts.h b/gcc/config/nvptx/nvptx-opts.h index e918d43..30852b6 100644 --- a/gcc/config/nvptx/nvptx-opts.h +++ b/gcc/config/nvptx/nvptx-opts.h @@ -32,6 +32,7 @@ enum ptx_isa enum ptx_version { + PTX_VERSION_default, PTX_VERSION_3_0, PTX_VERSION_3_1, PTX_VERSION_4_2, diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index b9451c2..7862a90 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -296,7 +296,8 @@ sm_version_to_string (enum ptx_isa sm) static void handle_ptx_version_option (void) { - if (!OPTION_SET_P (ptx_version_option)) + if (!OPTION_SET_P (ptx_version_option) + || ptx_version_option == PTX_VERSION_default) { ptx_version_option = default_ptx_version_option (); return; diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt index 9776c3b..f555ad1 100644 --- a/gcc/config/nvptx/nvptx.opt +++ b/gcc/config/nvptx/nvptx.opt @@ -94,6 +94,9 @@ Enum(ptx_version) String(6.3) Value(PTX_VERSION_6_3) EnumValue Enum(ptx_version) String(7.0) Value(PTX_VERSION_7_0) +EnumValue +Enum(ptx_version) String(_) Value(PTX_VERSION_default) + mptx= Target RejectNegative ToLower Joined Enum(ptx_version) Var(ptx_version_option) Specify the version of the ptx version to use. -- cgit v1.1 From 28068d1115648adcc08ae57372170f3277915a0d Mon Sep 17 00:00:00 2001 From: Roger Sayle Date: Mon, 28 Feb 2022 22:30:27 +0000 Subject: PR tree-optimization/91384: peephole2 to eliminate testl after negl. This patch is my proposed solution to PR tree-optimization/91384 which is a missed-optimization/code quality regression on x86_64. The problematic idiom is "if (r = -a)" which is equivalent to both "r = -a; if (r != 0)" and alternatively "r = -a; if (a != 0)". In this particular case, on x86_64, we prefer to use the condition codes from the negation, rather than require an explicit testl instruction. Unfortunately, combine can't help, as it doesn't attempt to merge pairs of instructions that share the same operand(s), only pairs/triples of instructions where the result of each instruction feeds the next. But I doubt there's sufficient benefit to attempt this kind of "combination" (that wouldn't already be caught by the tree-ssa passes). Fortunately, it's relatively easy to fix this up (addressing the regression) during peephole2 to eliminate the unnecessary testl in: movl %edi, %ebx negl %ebx testl %edi, %edi je .L2 2022-02-28 Roger Sayle gcc/ChangeLog PR tree-optimization/91384 * config/i386/i386.md (peephole2): Eliminate final testl insn from the sequence *movsi_internal, *negsi_1, *cmpsi_ccno_1 by transforming using *negsi_2 for the negation. gcc/testsuite/ChangeLog PR tree-optimization/91384 * gcc.target/i386/pr91384.c: New test case. 
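[Editor's sketch] For reference, a minimal form of the idiom being optimized looks like the function below. It is written from the description above and need not match the committed gcc.target/i386/pr91384.c testcase.

  int sink (int);

  int
  foo (int a)
  {
    int r;
    if ((r = -a))      /* same as: r = -a; if (a != 0) ...  */
      return sink (r);
    return 0;
  }

With -O2 on x86_64 the expectation after this peephole2 is that the condition codes set by the negation are reused, so no separate testl instruction appears.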
--- gcc/config/i386/i386.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index e7c5490..5e0a980 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -11011,6 +11011,19 @@ [(set_attr "type" "negnot") (set_attr "mode" "")]) +;; Optimize *negsi_1 followed by *cmpsi_ccno_1 (PR target/91384) +(define_peephole2 + [(set (match_operand:SWI 0 "general_reg_operand") + (match_operand:SWI 1 "general_reg_operand")) + (parallel [(set (match_dup 0) (neg:SWI (match_dup 0))) + (clobber (reg:CC FLAGS_REG))]) + (set (reg:CCZ FLAGS_REG) (compare:CCZ (match_dup 1) (const_int 0)))] + "" + [(set (match_dup 0) (match_dup 1)) + (parallel [(set (reg:CCZ FLAGS_REG) + (compare:CCZ (neg:SWI (match_dup 0)) (const_int 0))) + (set (match_dup 0) (neg:SWI (match_dup 0)))])]) + ;; Special expand pattern to handle integer mode abs (define_expand "abs2" -- cgit v1.1 From e2385690a3ead66744e51115966f25f9c05bb3e2 Mon Sep 17 00:00:00 2001 From: Hongyu Wang Date: Mon, 28 Feb 2022 15:09:59 +0800 Subject: i386: Fix V8HF vector init under -mno-avx [PR 104664] For V8HFmode vector init with HFmode, do not directly emits V8HF move with subreg, which may cause reload to assign general register to move src. gcc/ChangeLog: PR target/104664 * config/i386/i386-expand.cc (ix86_expand_vector_init_duplicate): Use vec_setv8hf_0 for HF to V8HFmode move instead of subreg. gcc/testsuite/ChangeLog: PR target/104664 * gcc.target/i386/pr104664.c: New test. --- gcc/config/i386/i386-expand.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index faa0191..530f83f 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -14899,7 +14899,12 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode, dperm.one_operand_p = true; if (mode == V8HFmode) - tmp1 = lowpart_subreg (V8HFmode, force_reg (HFmode, val), HFmode); + { + tmp1 = force_reg (HFmode, val); + tmp2 = gen_reg_rtx (mode); + emit_insn (gen_vec_setv8hf_0 (tmp2, CONST0_RTX (mode), tmp1)); + tmp1 = gen_lowpart (mode, tmp2); + } else { /* Extend to SImode using a paradoxical SUBREG. */ -- cgit v1.1 From 2240ebd8e46e098f972a662d0aad85348b304889 Mon Sep 17 00:00:00 2001 From: Robin Dapp Date: Mon, 7 Feb 2022 08:39:41 +0100 Subject: arc: Fix for new ifcvt behavior [PR104154] ifcvt now passes a CC-mode "comparison" to backends. This patch simply returns from gen_compare_reg () in that case since nothing needs to be prepared anymore. gcc/ChangeLog: PR rtl-optimization/104154 * config/arc/arc.cc (gen_compare_reg): Return the CC-mode comparison ifcvt passed us. --- gcc/config/arc/arc.cc | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/arc/arc.cc b/gcc/config/arc/arc.cc index c27ba99..fbc17e6 100644 --- a/gcc/config/arc/arc.cc +++ b/gcc/config/arc/arc.cc @@ -2256,6 +2256,12 @@ gen_compare_reg (rtx comparison, machine_mode omode) cmode = GET_MODE (x); if (cmode == VOIDmode) cmode = GET_MODE (y); + + /* If ifcvt passed us a MODE_CC comparison we can + just return it. It should be in the proper form already. 
*/ + if (GET_MODE_CLASS (cmode) == MODE_CC) + return comparison; + if (cmode != SImode && cmode != SFmode && cmode != DFmode) return NULL_RTX; if (cmode == SImode) -- cgit v1.1 From 7efe46935c5fce8db13e00aa6f4b0f1599b330e4 Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Fri, 25 Feb 2022 11:47:12 +0100 Subject: [nvptx] Add nvptx-sm.def Add a file gcc/config/nvptx/nvptx-sm.def that lists all sm_xx versions used in the port, like so: ... NVPTX_SM(30, NVPTX_SM_SEP) NVPTX_SM(35, NVPTX_SM_SEP) NVPTX_SM(53, NVPTX_SM_SEP) NVPTX_SM(70, NVPTX_SM_SEP) NVPTX_SM(75, NVPTX_SM_SEP) NVPTX_SM(80,) ... and use it in various places using a pattern: ... #define NVPTX_SM(XX, SEP) { ... } #include "nvptx-sm.def" #undef NVPTX_SM ... Tested on nvptx. gcc/ChangeLog: 2022-02-25 Tom de Vries * config/nvptx/nvptx-sm.def: New file. * config/nvptx/nvptx-c.cc (nvptx_cpu_cpp_builtins): Use nvptx-sm.def. * config/nvptx/nvptx-opts.h (enum ptx_isa): Same. * config/nvptx/nvptx.cc (sm_version_to_string) (nvptx_omp_device_kind_arch_isa): Same. --- gcc/config/nvptx/nvptx-c.cc | 22 ++++++++++------------ gcc/config/nvptx/nvptx-opts.h | 11 +++++------ gcc/config/nvptx/nvptx-sm.def | 30 ++++++++++++++++++++++++++++++ gcc/config/nvptx/nvptx.cc | 36 ++++++++++++------------------------ 4 files changed, 57 insertions(+), 42 deletions(-) create mode 100644 gcc/config/nvptx/nvptx-sm.def (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx-c.cc b/gcc/config/nvptx/nvptx-c.cc index b2375fb..02f7562 100644 --- a/gcc/config/nvptx/nvptx-c.cc +++ b/gcc/config/nvptx/nvptx-c.cc @@ -39,17 +39,15 @@ nvptx_cpu_cpp_builtins (void) cpp_define (parse_in, "__nvptx_softstack__"); if (TARGET_UNIFORM_SIMT) cpp_define (parse_in,"__nvptx_unisimt__"); - if (TARGET_SM80) - cpp_define (parse_in, "__PTX_SM__=800"); - else if (TARGET_SM75) - cpp_define (parse_in, "__PTX_SM__=750"); - else if (TARGET_SM70) - cpp_define (parse_in, "__PTX_SM__=700"); - else if (TARGET_SM53) - cpp_define (parse_in, "__PTX_SM__=530"); - else if (TARGET_SM35) - cpp_define (parse_in, "__PTX_SM__=350"); - else - cpp_define (parse_in,"__PTX_SM__=300"); + + const char *ptx_sm = NULL; +#define NVPTX_SM(XX, SEP) \ + { \ + if (TARGET_SM ## XX) \ + ptx_sm = "__PTX_SM__=" #XX "0"; \ + } +#include "nvptx-sm.def" +#undef NVPTX_SM + cpp_define (parse_in, ptx_sm); } diff --git a/gcc/config/nvptx/nvptx-opts.h b/gcc/config/nvptx/nvptx-opts.h index 30852b6..86b433c 100644 --- a/gcc/config/nvptx/nvptx-opts.h +++ b/gcc/config/nvptx/nvptx-opts.h @@ -22,12 +22,11 @@ enum ptx_isa { - PTX_ISA_SM30, - PTX_ISA_SM35, - PTX_ISA_SM53, - PTX_ISA_SM70, - PTX_ISA_SM75, - PTX_ISA_SM80 +#define NVPTX_SM(XX, SEP) PTX_ISA_SM ## XX SEP +#define NVPTX_SM_SEP , +#include "nvptx-sm.def" +#undef NVPTX_SM_SEP +#undef NVPTX_SM }; enum ptx_version diff --git a/gcc/config/nvptx/nvptx-sm.def b/gcc/config/nvptx/nvptx-sm.def new file mode 100644 index 0000000..c552eb0 --- /dev/null +++ b/gcc/config/nvptx/nvptx-sm.def @@ -0,0 +1,30 @@ +/* Copyright (C) 2022 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. 
+ + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +#ifndef NVPTX_SM_SEP +#define NVPTX_SM_SEP +#endif + +NVPTX_SM (30, NVPTX_SM_SEP) +NVPTX_SM (35, NVPTX_SM_SEP) +NVPTX_SM (53, NVPTX_SM_SEP) +NVPTX_SM (70, NVPTX_SM_SEP) +NVPTX_SM (75, NVPTX_SM_SEP) +NVPTX_SM (80,) + +#undef NVPTX_SM_SEP diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index 7862a90..f3179ef 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -276,18 +276,11 @@ sm_version_to_string (enum ptx_isa sm) { switch (sm) { - case PTX_ISA_SM30: - return "30"; - case PTX_ISA_SM35: - return "35"; - case PTX_ISA_SM53: - return "53"; - case PTX_ISA_SM70: - return "70"; - case PTX_ISA_SM75: - return "75"; - case PTX_ISA_SM80: - return "80"; +#define NVPTX_SM(XX, SEP) \ + case PTX_ISA_SM ## XX: \ + return #XX; +#include "nvptx-sm.def" +#undef NVPTX_SM default: gcc_unreachable (); } @@ -6177,18 +6170,13 @@ nvptx_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait, case omp_device_arch: return strcmp (name, "nvptx") == 0; case omp_device_isa: - if (strcmp (name, "sm_30") == 0) - return !TARGET_SM35; - if (strcmp (name, "sm_35") == 0) - return TARGET_SM35 && !TARGET_SM53; - if (strcmp (name, "sm_53") == 0) - return TARGET_SM53 && !TARGET_SM70; - if (strcmp (name, "sm_70") == 0) - return TARGET_SM70 && !TARGET_SM75; - if (strcmp (name, "sm_75") == 0) - return TARGET_SM75 && !TARGET_SM80; - if (strcmp (name, "sm_80") == 0) - return TARGET_SM80; +#define NVPTX_SM(XX, SEP) \ + { \ + if (strcmp (name, "sm_" #XX) == 0) \ + return ptx_isa_option == PTX_ISA_SM ## XX; \ + } +#include "nvptx-sm.def" +#undef NVPTX_SM return 0; default: gcc_unreachable (); -- cgit v1.1 From 22adaa5e565a0355dc013b4c1eeefd8ff4a96d9a Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Fri, 25 Feb 2022 12:18:17 +0100 Subject: [nvptx] Use nvptx-sm.def for t-omp-device Add a script gen-omp-device-properties.sh that uses nvptx-sm.def to generate omp-device-properties-nvptx. Tested on x86_64 with nvptx accelerator. gcc/ChangeLog: 2022-02-25 Tom de Vries * config/nvptx/gen-omp-device-properties.sh: New file. * config/nvptx/t-omp-device: Use gen-omp-device-properties.sh. --- gcc/config/nvptx/gen-omp-device-properties.sh | 33 +++++++++++++++++++++++++++ gcc/config/nvptx/t-omp-device | 7 +++--- 2 files changed, 36 insertions(+), 4 deletions(-) create mode 100644 gcc/config/nvptx/gen-omp-device-properties.sh (limited to 'gcc/config') diff --git a/gcc/config/nvptx/gen-omp-device-properties.sh b/gcc/config/nvptx/gen-omp-device-properties.sh new file mode 100644 index 0000000..175092c --- /dev/null +++ b/gcc/config/nvptx/gen-omp-device-properties.sh @@ -0,0 +1,33 @@ +#!/bin/sh + +# Copyright (C) 2022 Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GCC is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . 
+ +nvptx_sm_def="$1/nvptx-sm.def" + +sms=$(grep ^NVPTX_SM $nvptx_sm_def | sed 's/.*(//;s/,.*//') + +echo kind: gpu +echo arch: nvptx + +isa="" +for sm in $sms; do + isa="$isa sm_$sm" +done + +echo isa: $isa diff --git a/gcc/config/nvptx/t-omp-device b/gcc/config/nvptx/t-omp-device index 4228218..c2b28a4 100644 --- a/gcc/config/nvptx/t-omp-device +++ b/gcc/config/nvptx/t-omp-device @@ -1,4 +1,3 @@ -omp-device-properties-nvptx: $(srcdir)/config/nvptx/nvptx.cc - echo kind: gpu > $@ - echo arch: nvptx >> $@ - echo isa: sm_30 sm_35 sm_53 sm_70 sm_75 sm_80 >> $@ +omp-device-properties-nvptx: $(srcdir)/config/nvptx/nvptx-sm.def + $(SHELL) $(srcdir)/config/nvptx/gen-omp-device-properties.sh \ + "$(srcdir)/config/nvptx" > $@ -- cgit v1.1 From d59d13c89503baf92d14b04c05708a6296916fad Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Fri, 25 Feb 2022 11:49:01 +0100 Subject: [nvptx] Add nvptx-gen.h and nvptx-gen.opt Use nvptx-sm.def to generate new files nvptx-gen.h and nvptx-gen.opt, and: - include nvptx-gen.h in nvptx.h, and - add nvptx-gen.opt to extra_options (before nvptx.opt, in case that matters). Tested on nvptx. gcc/ChangeLog: 2022-02-25 Tom de Vries * config.gcc (nvptx*-*-*): Add nvptx/nvptx-gen.opt to extra_options. * config/nvptx/gen-copyright.sh: New file. * config/nvptx/gen-h.sh: New file. * config/nvptx/gen-opt.sh: New file. * config/nvptx/nvptx.h (TARGET_SM35, TARGET_SM53, TARGET_SM70) (TARGET_SM75, TARGET_SM80): Move ... * config/nvptx/nvptx-gen.h: ... here. New file, generate. * config/nvptx/nvptx.opt (Enum ptx_isa): Move ... * config/nvptx/nvptx-gen.opt: ... here. New file, generate. * config/nvptx/t-nvptx ($(srcdir)/config/nvptx/nvptx-gen.h) ($(srcdir)/config/nvptx/nvptx-gen.opt): New make target. --- gcc/config/nvptx/gen-copyright.sh | 82 +++++++++++++++++++++++++++++++++++++++ gcc/config/nvptx/gen-h.sh | 44 +++++++++++++++++++++ gcc/config/nvptx/gen-opt.sh | 66 +++++++++++++++++++++++++++++++ gcc/config/nvptx/nvptx-gen.h | 29 ++++++++++++++ gcc/config/nvptx/nvptx-gen.opt | 42 ++++++++++++++++++++ gcc/config/nvptx/nvptx.h | 6 +-- gcc/config/nvptx/nvptx.opt | 22 ----------- gcc/config/nvptx/t-nvptx | 17 ++++++++ 8 files changed, 281 insertions(+), 27 deletions(-) create mode 100644 gcc/config/nvptx/gen-copyright.sh create mode 100644 gcc/config/nvptx/gen-h.sh create mode 100644 gcc/config/nvptx/gen-opt.sh create mode 100644 gcc/config/nvptx/nvptx-gen.h create mode 100644 gcc/config/nvptx/nvptx-gen.opt (limited to 'gcc/config') diff --git a/gcc/config/nvptx/gen-copyright.sh b/gcc/config/nvptx/gen-copyright.sh new file mode 100644 index 0000000..79f4899 --- /dev/null +++ b/gcc/config/nvptx/gen-copyright.sh @@ -0,0 +1,82 @@ +#!/bin/sh + +# Copyright (C) 2022 Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GCC is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . + +style="$1" +case $style in + opt) + ;; + c) + first=true + ;; + *) + echo "Unknown style: \"$style\"" + exit 1 + ;; +esac + +( cat <. 
+EOF +) | while read line; do + case $style in + opt) + if [ "$line" = "" ]; then + echo ";" + else + echo "; $line" + fi + ;; + c) + if $first; then + echo "/* $line" + first=false + else + if [ "$line" = "" ]; then + echo + else + echo " $line" + fi + fi + ;; + esac +done + + +case $style in + c) + echo "*/" + ;; +esac diff --git a/gcc/config/nvptx/gen-h.sh b/gcc/config/nvptx/gen-h.sh new file mode 100644 index 0000000..605f874 --- /dev/null +++ b/gcc/config/nvptx/gen-h.sh @@ -0,0 +1,44 @@ +#!/bin/sh + +# Copyright (C) 2022 Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GCC is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . + +nvptx_sm_def="$1/nvptx-sm.def" +gen_copyright_sh="$1/gen-copyright.sh" + +sms=$(grep ^NVPTX_SM $nvptx_sm_def | sed 's/.*(//;s/,.*//') + +cat <= PTX_ISA_SM$sm) +EOF +done diff --git a/gcc/config/nvptx/gen-opt.sh b/gcc/config/nvptx/gen-opt.sh new file mode 100644 index 0000000..5248ed2 --- /dev/null +++ b/gcc/config/nvptx/gen-opt.sh @@ -0,0 +1,66 @@ +#!/bin/sh + +# Copyright (C) 2022 Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GCC is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . + +nvptx_sm_def="$1/nvptx-sm.def" +gen_copyright_sh="$1/gen-copyright.sh" + +sms=$(grep ^NVPTX_SM $nvptx_sm_def | sed 's/.*(//;s/,.*//') + +last= +for sm in $sms; do + last="$sm" +done + +cat <. +*/ + +#define TARGET_SM30 (ptx_isa_option >= PTX_ISA_SM30) +#define TARGET_SM35 (ptx_isa_option >= PTX_ISA_SM35) +#define TARGET_SM53 (ptx_isa_option >= PTX_ISA_SM53) +#define TARGET_SM70 (ptx_isa_option >= PTX_ISA_SM70) +#define TARGET_SM75 (ptx_isa_option >= PTX_ISA_SM75) +#define TARGET_SM80 (ptx_isa_option >= PTX_ISA_SM80) diff --git a/gcc/config/nvptx/nvptx-gen.opt b/gcc/config/nvptx/nvptx-gen.opt new file mode 100644 index 0000000..b6d433e --- /dev/null +++ b/gcc/config/nvptx/nvptx-gen.opt @@ -0,0 +1,42 @@ +; -*- buffer-read-only: t -*- +; Generated automatically by gen-opt.sh from nvptx-sm.def. + +; Copyright (C) 2022 Free Software Foundation, Inc. +; +; This file is part of GCC. +; +; GCC is free software; you can redistribute it and/or modify it under +; the terms of the GNU General Public License as published by the Free +; Software Foundation; either version 3, or (at your option) any later +; version. +; +; GCC is distributed in the hope that it will be useful, but WITHOUT ANY +; WARRANTY; without even the implied warranty of MERCHANTABILITY or +; FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +; for more details. +; +; You should have received a copy of the GNU General Public License +; along with GCC; see the file COPYING3. If not see +; . + +Enum +Name(ptx_isa) Type(int) +Known PTX ISA versions (for use with the -misa= option): + +EnumValue +Enum(ptx_isa) String(sm_30) Value(PTX_ISA_SM30) + +EnumValue +Enum(ptx_isa) String(sm_35) Value(PTX_ISA_SM35) + +EnumValue +Enum(ptx_isa) String(sm_53) Value(PTX_ISA_SM53) + +EnumValue +Enum(ptx_isa) String(sm_70) Value(PTX_ISA_SM70) + +EnumValue +Enum(ptx_isa) String(sm_75) Value(PTX_ISA_SM75) + +EnumValue +Enum(ptx_isa) String(sm_80) Value(PTX_ISA_SM80) diff --git a/gcc/config/nvptx/nvptx.h b/gcc/config/nvptx/nvptx.h index edffd08..4ab412b 100644 --- a/gcc/config/nvptx/nvptx.h +++ b/gcc/config/nvptx/nvptx.h @@ -86,11 +86,7 @@ #define Pmode (TARGET_ABI64 ? DImode : SImode) #define STACK_SIZE_MODE Pmode -#define TARGET_SM35 (ptx_isa_option >= PTX_ISA_SM35) -#define TARGET_SM53 (ptx_isa_option >= PTX_ISA_SM53) -#define TARGET_SM70 (ptx_isa_option >= PTX_ISA_SM70) -#define TARGET_SM75 (ptx_isa_option >= PTX_ISA_SM75) -#define TARGET_SM80 (ptx_isa_option >= PTX_ISA_SM80) +#include "nvptx-gen.h" #define TARGET_PTX_6_0 (ptx_version_option >= PTX_VERSION_6_0) #define TARGET_PTX_6_3 (ptx_version_option >= PTX_VERSION_6_3) diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt index f555ad1..c83ceb3 100644 --- a/gcc/config/nvptx/nvptx.opt +++ b/gcc/config/nvptx/nvptx.opt @@ -51,28 +51,6 @@ mgomp Target Mask(GOMP) Generate code for OpenMP offloading: enables -msoft-stack and -muniform-simt. -Enum -Name(ptx_isa) Type(int) -Known PTX ISA versions (for use with the -misa= option): - -EnumValue -Enum(ptx_isa) String(sm_30) Value(PTX_ISA_SM30) - -EnumValue -Enum(ptx_isa) String(sm_35) Value(PTX_ISA_SM35) - -EnumValue -Enum(ptx_isa) String(sm_53) Value(PTX_ISA_SM53) - -EnumValue -Enum(ptx_isa) String(sm_70) Value(PTX_ISA_SM70) - -EnumValue -Enum(ptx_isa) String(sm_75) Value(PTX_ISA_SM75) - -EnumValue -Enum(ptx_isa) String(sm_80) Value(PTX_ISA_SM80) - ; Default needs to be in sync with default in ASM_SPEC in nvptx.h. 
misa= Target RejectNegative ToLower Joined Enum(ptx_isa) Var(ptx_isa_option) Init(PTX_ISA_SM35) diff --git a/gcc/config/nvptx/t-nvptx b/gcc/config/nvptx/t-nvptx index b170766..f17fc9c 100644 --- a/gcc/config/nvptx/t-nvptx +++ b/gcc/config/nvptx/t-nvptx @@ -13,4 +13,21 @@ mkoffload$(exeext): mkoffload.o collect-utils.o libcommon-target.a $(LIBIBERTY) +$(LINKER) $(ALL_LINKERFLAGS) $(LDFLAGS) -o $@ \ mkoffload.o collect-utils.o libcommon-target.a $(LIBIBERTY) $(LIBS) +$(srcdir)/config/nvptx/nvptx.h: $(srcdir)/config/nvptx/nvptx-gen.h +$(srcdir)/config/nvptx/nvptx-gen.h: s-nvptx-gen-h; @true +s-nvptx-gen-h: $(srcdir)/config/nvptx/nvptx-sm.def + $(SHELL) $(srcdir)/config/nvptx/gen-h.sh "$(srcdir)/config/nvptx" \ + > tmp-nvptx-gen.h + $(SHELL) $(srcdir)/../move-if-change \ + tmp-nvptx-gen.h $(srcdir)/config/nvptx/nvptx-gen.h + $(STAMP) s-nvptx-gen-h + +$(srcdir)/config/nvptx/nvptx-gen.opt: s-nvptx-gen-opt; @true +s-nvptx-gen-opt: $(srcdir)/config/nvptx/nvptx-sm.def + $(SHELL) $(srcdir)/config/nvptx/gen-opt.sh "$(srcdir)/config/nvptx" \ + > tmp-nvptx-gen.opt + $(SHELL) $(srcdir)/../move-if-change \ + tmp-nvptx-gen.opt $(srcdir)/config/nvptx/nvptx-gen.opt + $(STAMP) s-nvptx-gen-opt + MULTILIB_OPTIONS = mgomp -- cgit v1.1 From c2e0d0c1cfb4bf29daed189b39885841ee201a65 Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Mon, 28 Feb 2022 16:06:54 +0100 Subject: [nvptx] Handle DCmode in define_expand "omp_simt_xchg_{bfly,idx}" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For a test-case doing an openmp target simd reduction on a complex double: ... DOUBLE COMPLEX :: counter_N0 ... !$OMP TARGET SIMD reduction(+: counter_N0) ... we run into: ... during RTL pass: expand b.f90: In function ‘MAIN__._omp_fn.0’: b.f90:23:32: internal compiler error: in expand_insn, at optabs.cc:8029 23 | counter_N0 = counter_N0 + 1. | ^ 0x10f1cd3 expand_insn(insn_code, unsigned int, expand_operand*) gcc/optabs.cc:8029 0xeac435 expand_GOMP_SIMT_XCHG_BFLY gcc/internal-fn.cc:375 ... Fix this by handling DCmode and CDImode in define_expand "omp_simt_xchg_{bfly,idx}". Tested on x86_64 with nvptx accelerator. gcc/ChangeLog: 2022-02-28 Tom de Vries PR target/102429 * config/nvptx/nvptx.cc (nvptx_gen_shuffle): Handle DCmode and CDImode. * config/nvptx/nvptx.md (define_predicate "nvptx_register_or_complex_di_df_register_operand"): New predicate. (define_expand "omp_simt_xchg_bfly", define_expand "omp_simt_xchg_idx"): Use nvptx_register_or_complex_di_df_register_operand. 
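[Editor's sketch] A C analogue of the Fortran reproducer can be useful when re-testing this path. This is only a sketch, assuming an OpenMP build with nvptx offloading enabled (e.g. -fopenmp -foffload=nvptx-none); it is not the committed testcase.

  #include <complex.h>

  double _Complex
  count_up (int n)
  {
    double _Complex counter = 0.0;
  #pragma omp target simd reduction(+: counter)
    for (int i = 0; i < n; i++)
      counter += 1.0;          /* DCmode reduction in a SIMT region */
    return counter;
  }

The reduction over a complex double is what reaches IFN_GOMP_SIMT_XCHG_BFLY with a DCmode operand and previously tripped the expand_insn ICE.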
--- gcc/config/nvptx/nvptx.cc | 17 +++++++++++++++++ gcc/config/nvptx/nvptx.md | 20 ++++++++++++++++---- 2 files changed, 33 insertions(+), 4 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index f3179ef..6ca99a6 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -1941,6 +1941,23 @@ nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind) switch (GET_MODE (dst)) { + case E_DCmode: + case E_CDImode: + { + gcc_assert (GET_CODE (dst) == CONCAT); + gcc_assert (GET_CODE (src) == CONCAT); + rtx dst_real = XEXP (dst, 0); + rtx dst_imag = XEXP (dst, 1); + rtx src_real = XEXP (src, 0); + rtx src_imag = XEXP (src, 1); + + start_sequence (); + emit_insn (nvptx_gen_shuffle (dst_real, src_real, idx, kind)); + emit_insn (nvptx_gen_shuffle (dst_imag, src_imag, idx, kind)); + res = get_insns (); + end_sequence (); + } + break; case E_SImode: res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind)); break; diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index 4989b56..a453c1d 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -94,6 +94,18 @@ return register_operand (op, mode); }) +(define_predicate "nvptx_register_or_complex_di_df_register_operand" + (ior (match_code "reg") + (match_code "concat")) +{ + if (GET_CODE (op) == CONCAT) + return ((GET_MODE (op) == DCmode || GET_MODE (op) == CDImode) + && nvptx_register_operand (XEXP (op, 0), mode) + && nvptx_register_operand (XEXP (op, 1), mode)); + + return nvptx_register_operand (op, mode); +}) + (define_predicate "nvptx_nonimmediate_operand" (match_code "mem,reg") { @@ -1902,8 +1914,8 @@ ;; Implement IFN_GOMP_SIMT_XCHG_BFLY: perform a "butterfly" exchange ;; across lanes (define_expand "omp_simt_xchg_bfly" - [(match_operand 0 "nvptx_register_operand" "=R") - (match_operand 1 "nvptx_register_operand" "R") + [(match_operand 0 "nvptx_register_or_complex_di_df_register_operand" "=R") + (match_operand 1 "nvptx_register_or_complex_di_df_register_operand" "R") (match_operand:SI 2 "nvptx_nonmemory_operand" "Ri")] "" { @@ -1915,8 +1927,8 @@ ;; Implement IFN_GOMP_SIMT_XCHG_IDX: broadcast value in operand 1 ;; from lane given by index in operand 2 to operand 0 in all lanes (define_expand "omp_simt_xchg_idx" - [(match_operand 0 "nvptx_register_operand" "=R") - (match_operand 1 "nvptx_register_operand" "R") + [(match_operand 0 "nvptx_register_or_complex_di_df_register_operand" "=R") + (match_operand 1 "nvptx_register_or_complex_di_df_register_operand" "R") (match_operand:SI 2 "nvptx_nonmemory_operand" "Ri")] "" { -- cgit v1.1 From 12fa7641ceed9c9139e2ea7b62c11f3dc5b6f6f4 Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Thu, 3 Mar 2022 09:21:04 +0100 Subject: [nvptx] Use --no-verify for sm_30 In PR97348, we ran into the problem that recent CUDA dropped support for sm_30, which inhibited the build when building with CUDA bin in the path, because the nvptx-tools assembler uses CUDA's ptxas to do ptx verification. To fix this, in gcc-11 the default sm_xx was moved from sm_30 to sm_35. This however broke support for sm_30 boards: an executable build for sm_30 might contain sm_35 code from the libraries, which are build with the default sm_xx (PR104758). We want to fix this by going back to having the libraries build with sm_30, as was the case for gcc-5 to gcc-10. That however reintroduces the problem from PR97348. Deal with PR97348 in the simplest way possible: when calling the assembler for sm_30, specify --no-verify. 
This has the unfortunate effect that after fixing PR104758 by building libraries with sm_30, the libraries are no longer verified. This can be improved upon by: - adding a configure test in gcc that tests if CUDA supports sm_30, and if so disabling this patch - dealing with this in nvptx-tools somehow, either: - detect at ptxas execution time that it doesn't support sm_30, or - detect this at nvptx-tool configure time. gcc/ChangeLog: 2022-03-03 Tom de Vries * config/nvptx/nvptx.h (ASM_SPEC): Add %{misa=sm_30:--no-verify}. --- gcc/config/nvptx/nvptx.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/nvptx.h b/gcc/config/nvptx/nvptx.h index 4ab412b..3ca22a5 100644 --- a/gcc/config/nvptx/nvptx.h +++ b/gcc/config/nvptx/nvptx.h @@ -32,7 +32,7 @@ /* Default needs to be in sync with default for misa in nvptx.opt. We add a default here to work around a hard-coded sm_30 default in nvptx-as. */ -#define ASM_SPEC "%{misa=*:-m %*; :-m sm_35}" +#define ASM_SPEC "%{misa=*:-m %*; :-m sm_35}%{misa=sm_30:--no-verify}" #define TARGET_CPU_CPP_BUILTINS() nvptx_cpu_cpp_builtins () -- cgit v1.1 From 07667c911b1827fb98a1b5da621d51d8fcf0409a Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Wed, 2 Mar 2022 12:04:39 +0100 Subject: [nvptx] Build libraries with misa=sm_30 In gcc-11, when specifying -misa=sm_30, an executable may still contain sm_35 code (due to libraries being built with the default -misa=sm_35), so it won't run on an sm_30 board. Fix this by building libraries with sm_30, as was the case in gcc-5 to gcc-10. gcc/ChangeLog: 2022-03-03 Tom de Vries PR target/104758 * config/nvptx/t-nvptx (MULTILIB_EXTRA_OPTS): Add misa=sm_30. --- gcc/config/nvptx/t-nvptx | 2 ++ 1 file changed, 2 insertions(+) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/t-nvptx b/gcc/config/nvptx/t-nvptx index f17fc9c..056d2dd 100644 --- a/gcc/config/nvptx/t-nvptx +++ b/gcc/config/nvptx/t-nvptx @@ -31,3 +31,5 @@ s-nvptx-gen-opt: $(srcdir)/config/nvptx/nvptx-sm.def $(STAMP) s-nvptx-gen-opt MULTILIB_OPTIONS = mgomp + +MULTILIB_EXTRA_OPTS = misa=sm_30 -- cgit v1.1 From 5b5e456f0187406e17444b6e40d974f94524f2a2 Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Thu, 3 Mar 2022 09:22:42 +0100 Subject: [nvptx] Build libraries with mptx=3.1 In gcc-5 to gcc-11, the ptx isa version was 3.1. On trunk, the default is now 6.0, which is also what will be the value in the libraries. Consequently, there may be setups with an older driver that worked with gcc-11, but will become unsupported with gcc-12. Fix this by building the libraries with mptx=3.1. After this, setups with an older driver still won't work out of the box with gcc-12, because the default ptx isa version has changed, but should work after specifying mptx=3.1. gcc/ChangeLog: 2022-03-03 Tom de Vries * config/nvptx/t-nvptx (MULTILIB_EXTRA_OPTS): Add mptx=3.1. --- gcc/config/nvptx/t-nvptx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/nvptx/t-nvptx b/gcc/config/nvptx/t-nvptx index 056d2dd..8f67264 100644 --- a/gcc/config/nvptx/t-nvptx +++ b/gcc/config/nvptx/t-nvptx @@ -32,4 +32,4 @@ s-nvptx-gen-opt: $(srcdir)/config/nvptx/nvptx-sm.def MULTILIB_OPTIONS = mgomp -MULTILIB_EXTRA_OPTS = misa=sm_30 +MULTILIB_EXTRA_OPTS = misa=sm_30 mptx=3.1 -- cgit v1.1 From 609e8c492d62d92465460eae3d43dfc4b2c68288 Mon Sep 17 00:00:00 2001 From: "H.J. 
Lu" Date: Sat, 26 Feb 2022 14:17:23 -0800 Subject: x86: Always return pseudo register in ix86_gen_scratch_sse_rtx ix86_gen_scratch_sse_rtx returns XMM7/XMM15/XMM31 as a scratch vector register to prevent RTL optimizers from removing vector register. It introduces a conflict with explicit XMM7/XMM15/XMM31 usage and when it is called by RTL optimizers, it may introduce conflicting usages of XMM7/XMM15/XMM31. Change ix86_gen_scratch_sse_rtx to always return a pseudo register and xfail x86 tests which are optimized with a hard scratch register. gcc/ PR target/104704 * config/i386/i386.cc (ix86_gen_scratch_sse_rtx): Always return a pseudo register. gcc/testsuite/ PR target/104704 * gcc.target/i386/incoming-11.c: Xfail. * gcc.target/i386/pieces-memset-3.c: Likewise. * gcc.target/i386/pieces-memset-37.c: Likewise. * gcc.target/i386/pieces-memset-39.c: Likewise. * gcc.target/i386/pieces-memset-46.c: Likewise. * gcc.target/i386/pieces-memset-47.c: Likewise. * gcc.target/i386/pieces-memset-48.c: Likewise. * gcc.target/i386/pr90773-5.c: Likewise. * gcc.target/i386/pr90773-14.c: Likewise. * gcc.target/i386/pr90773-17.c: Likewise. * gcc.target/i386/pr100865-8a.c: Likewise. * gcc.target/i386/pr100865-8c.c: Likewise. * gcc.target/i386/pr100865-9c.c: Likewise. * gcc.target/i386/pieces-memset-21.c: Always expect vzeroupper. * gcc.target/i386/pr82941-1.c: Likewise. * gcc.target/i386/pr82942-1.c: Likewise. * gcc.target/i386/pr82990-1.c: Likewise. * gcc.target/i386/pr82990-3.c: Likewise. * gcc.target/i386/pr82990-5.c: Likewise. * gcc.target/i386/pr100865-11b.c: Expect vmovdqa instead of vmovdqa64. * gcc.target/i386/pr100865-12b.c: Likewise. * gcc.target/i386/pr100865-8b.c: Likewise. * gcc.target/i386/pr100865-9b.c: Likewise. * gcc.target/i386/pr104704-1.c: New test. * gcc.target/i386/pr104704-2.c: Likewise. * gcc.target/i386/pr104704-3.c: Likewise. * gcc.target/i386/pr104704-4.c: Likewise. * gcc.target/i386/pr104704-5.c: Likewise. * gcc.target/i386/pr104704-6.c: Likewise. --- gcc/config/i386/i386.cc | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index b2bf905..9521990 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -23786,24 +23786,7 @@ ix86_optab_supported_p (int op, machine_mode mode1, machine_mode, rtx ix86_gen_scratch_sse_rtx (machine_mode mode) { - if (TARGET_SSE && !lra_in_progress) - { - unsigned int regno; - if (TARGET_64BIT) - { - /* In 64-bit mode, use XMM31 to avoid vzeroupper and always - use XMM31 for CSE. */ - if (ix86_hard_regno_mode_ok (LAST_EXT_REX_SSE_REG, mode)) - regno = LAST_EXT_REX_SSE_REG; - else - regno = LAST_REX_SSE_REG; - } - else - regno = LAST_SSE_REG; - return gen_rtx_REG (mode, regno); - } - else - return gen_reg_rtx (mode); + return gen_reg_rtx (mode); } /* Address space support. -- cgit v1.1 From cb16bc3b5f34733ef9bbf8d2e3acacdecb099a62 Mon Sep 17 00:00:00 2001 From: Peter Bergner Date: Fri, 4 Mar 2022 09:03:44 -0600 Subject: rs6000: Allow -mlong-double-64 after -mabi={ibm,ieee}longdouble [PR104208, PR87496] The glibc build is showing a build error due to extra "error" checking from my PR87496 fix. That checking was overeager, disallowing setting the long double size to 64-bits if the 128-bit long double ABI had already been specified. Now we only emit an error if we specify a 128-bit long double ABI if our long double size is not 128 bits. 
This also fixes an erroneous error when -mabi=ieeelongdouble is used and ISA 2.06 is not enabled, but the long double size has been changed to 64 bits. 2022-03-04 Peter Bergner gcc/ PR target/87496 PR target/104208 * config/rs6000/rs6000.cc (rs6000_option_override_internal): Make the ISA 2.06 requirement for -mabi=ieeelongdouble conditional on -mlong-double-128. Move the -mabi=ieeelongdouble and -mabi=ibmlongdouble error checking from here... * common/config/rs6000/rs6000-common.cc (rs6000_handle_option): ... to here. gcc/testsuite/ PR target/87496 PR target/104208 * gcc.target/powerpc/pr104208-1.c: New test. * gcc.target/powerpc/pr104208-2.c: Likewise. * gcc.target/powerpc/pr87496-2.c: Swap long double options to trigger the expected error. * gcc.target/powerpc/pr87496-3.c: Likewise. --- gcc/config/rs6000/rs6000.cc | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index a855e8c..5b100a8 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -4178,13 +4178,6 @@ rs6000_option_override_internal (bool global_init_p) ; /* The option value can be seen when cl_target_option_restore is called. */ else if (rs6000_long_double_type_size == 128) rs6000_long_double_type_size = FLOAT_PRECISION_TFmode; - else if (OPTION_SET_P (rs6000_ieeequad)) - { - if (global_options.x_rs6000_ieeequad) - error ("%qs requires %qs", "-mabi=ieeelongdouble", "-mlong-double-128"); - else - error ("%qs requires %qs", "-mabi=ibmlongdouble", "-mlong-double-128"); - } /* Set -mabi=ieeelongdouble on some old targets. In the future, power server systems will also set long double to be IEEE 128-bit. AIX and Darwin @@ -4194,13 +4187,13 @@ rs6000_option_override_internal (bool global_init_p) if (!OPTION_SET_P (rs6000_ieeequad)) rs6000_ieeequad = TARGET_IEEEQUAD_DEFAULT; - else + else if (TARGET_LONG_DOUBLE_128) { if (global_options.x_rs6000_ieeequad && (!TARGET_POPCNTD || !TARGET_VSX)) error ("%qs requires full ISA 2.06 support", "-mabi=ieeelongdouble"); - if (rs6000_ieeequad != TARGET_IEEEQUAD_DEFAULT && TARGET_LONG_DOUBLE_128) + if (rs6000_ieeequad != TARGET_IEEEQUAD_DEFAULT) { /* Determine if the user can change the default long double type at compilation time. You need GLIBC 2.32 or newer to be able to -- cgit v1.1 From f1b3e3853329b58fb2e50c17487df2ecbc4a5608 Mon Sep 17 00:00:00 2001 From: Iain Sandoe Date: Wed, 23 Feb 2022 13:53:44 +0000 Subject: LRA, rs6000, Darwin: Revise lo_sum use for forced constants [PR104117]. Follow up discussion to the initial patch for this PR identified that it is preferable to avoid the LRA change, and arrange for the target to reject the hi and lo_sum selections when presented with an invalid address. We split the Darwin high/low selectors into two: 1. One that handles non-PIC addresses (kernel mode, mdynamic-no-pic). 2. One that handles PIC addresses and rejects SYMBOL_REFs unless they are suitably wrapped in the MACHOPIC_OFFSET unspec. The second case is handled by providing a new predicate (macho_pic_address) that checks the requirements. Signed-off-by: Iain Sandoe PR target/104117 gcc/ChangeLog: * config/rs6000/darwin.md (@machopic_high_): New. (@machopic_low_): New. * config/rs6000/predicates.md (macho_pic_address): New. * config/rs6000/rs6000.cc (rs6000_legitimize_address): Do not apply the TLS processing to Darwin. * lra-constraints.cc (process_address_1): Revert the changes in r12-7209. 
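[Editor's sketch] As context for the new predicate in the diff below: the kind of source that materializes a high/lo_sum address pair is simply a reference to a global. Under -fPIC on Darwin the operand must now be the UNSPEC_MACHOPIC_OFFSET-wrapped form that macho_pic_address accepts. A tiny illustrative example (not part of the patch):

  extern int counter;

  int *
  counter_addr (void)
  {
    return &counter;   /* address formed with a hi/lo pair, e.g. lis/la  */
  }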
--- gcc/config/rs6000/darwin.md | 19 +++++++++++++++---- gcc/config/rs6000/predicates.md | 14 ++++++++++++++ gcc/config/rs6000/rs6000.cc | 2 +- 3 files changed, 30 insertions(+), 5 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/darwin.md b/gcc/config/rs6000/darwin.md index 8443585..e73d59e 100644 --- a/gcc/config/rs6000/darwin.md +++ b/gcc/config/rs6000/darwin.md @@ -121,21 +121,32 @@ You should have received a copy of the GNU General Public License stw %0,lo16(%2)(%1)" [(set_attr "type" "store")]) -;; 64-bit MachO load/store support - ;; Mach-O PIC. (define_insn "@macho_high_" [(set (match_operand:P 0 "gpc_reg_operand" "=b*r") (high:P (match_operand 1 "" "")))] - "TARGET_MACHO && (DEFAULT_ABI == ABI_DARWIN)" + "TARGET_MACHO && (DEFAULT_ABI == ABI_DARWIN) && !flag_pic" "lis %0,ha16(%1)") (define_insn "@macho_low_" [(set (match_operand:P 0 "gpc_reg_operand" "=r") (lo_sum:P (match_operand:P 1 "gpc_reg_operand" "b") (match_operand 2 "" "")))] - "TARGET_MACHO && (DEFAULT_ABI == ABI_DARWIN)" + "TARGET_MACHO && (DEFAULT_ABI == ABI_DARWIN) && !flag_pic" + "la %0,lo16(%2)(%1)") + +(define_insn "@machopic_high_" + [(set (match_operand:P 0 "gpc_reg_operand" "=b*r") + (high:P (match_operand 1 "macho_pic_address" "")))] + "TARGET_MACHO && flag_pic" + "lis %0,ha16(%1)") + +(define_insn "@machopic_low_" + [(set (match_operand:P 0 "gpc_reg_operand" "=r") + (lo_sum:P (match_operand:P 1 "gpc_reg_operand" "b") + (match_operand 2 "macho_pic_address" "")))] + "TARGET_MACHO && flag_pic" "la %0,lo16(%2)(%1)") (define_split diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md index c65dfb9..28f6e98 100644 --- a/gcc/config/rs6000/predicates.md +++ b/gcc/config/rs6000/predicates.md @@ -2045,3 +2045,17 @@ (if_then_else (match_test "TARGET_VSX") (match_operand 0 "reg_or_cint_operand") (match_operand 0 "const_int_operand"))) + +;; Return true if the operand is a valid Mach-O pic address. +;; +(define_predicate "macho_pic_address" + (match_code "const,unspec") +{ + if (GET_CODE (op) == CONST) + op = XEXP (op, 0); + + if (GET_CODE (op) == UNSPEC && XINT (op, 1) == UNSPEC_MACHOPIC_OFFSET) + return CONSTANT_P (XVECEXP (op, 0, 0)); + else + return false; +}) diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index 5b100a8..2388d44 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -9021,7 +9021,7 @@ rs6000_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED, else return force_reg (Pmode, x); } - if (SYMBOL_REF_P (x)) + if (SYMBOL_REF_P (x) && !TARGET_MACHO) { enum tls_model model = SYMBOL_REF_TLS_MODEL (x); if (model != 0) -- cgit v1.1 From 25587472ccd223c861fe77cfeca4ba33c3f6cd99 Mon Sep 17 00:00:00 2001 From: Iain Sandoe Date: Fri, 4 Mar 2022 12:39:03 +0000 Subject: Darwin: Fix a type mismatch warning for a non-GCC bootstrap compiler. DECL_MD_FUNCTION_CODE() returns an int, on one particular compiler the code in darwin_fold_builtin() triggers a warning. Fixed thus. Signed-off-by: Iain Sandoe gcc/ChangeLog: * config/darwin.cc (darwin_fold_builtin): Make fcode an int to avoid a mismatch with DECL_MD_FUNCTION_CODE(). 
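[Editor's sketch] The shape of the warning being avoided is the usual signed/unsigned mismatch. A contrived illustration, not GCC source (the helper here merely stands in for DECL_MD_FUNCTION_CODE):

  static int get_code (void) { return 42; }   /* stands in for DECL_MD_FUNCTION_CODE */

  int
  is_interesting (int interesting_code)
  {
    unsigned int fcode = get_code ();         /* int -> unsigned int: some compilers warn */
    return fcode == interesting_code;         /* signed/unsigned comparison: likewise */
  }

Declaring fcode as int, as the patch does, keeps both sides of the assignment and comparison signed.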
--- gcc/config/darwin.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/config') diff --git a/gcc/config/darwin.cc b/gcc/config/darwin.cc index 783fe3c..f065a13 100644 --- a/gcc/config/darwin.cc +++ b/gcc/config/darwin.cc @@ -3621,7 +3621,7 @@ tree darwin_fold_builtin (tree fndecl, int n_args, tree *argp, bool ARG_UNUSED (ignore)) { - unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl); + int fcode = DECL_MD_FUNCTION_CODE (fndecl); if (fcode == darwin_builtin_cfstring) { -- cgit v1.1 From 77eccbf39ed55297802bb66dff5f62507a7239e3 Mon Sep 17 00:00:00 2001 From: Segher Boessenkool Date: Tue, 1 Mar 2022 17:04:29 +0000 Subject: rs6000: Improve .machine This adds more correct .machine for most older CPUs. It should be conservative in the sense that everything we handled before we handle at least as well now. This does not yet revamp the server CPU handling, it is too risky at this point in time. Tested on powerpc64-linux {-m32,-m64}. Also manually tested with all -mcpu=, and the output of that passed through the GNU assembler. 2022-03-04 Segher Boessenkool * config/rs6000/rs6000.cc (rs6000_machine_from_flags): Restructure a bit. Handle most older CPUs. --- gcc/config/rs6000/rs6000.cc | 81 ++++++++++++++++++++++++++++++--------------- 1 file changed, 54 insertions(+), 27 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index 2388d44..7afbc29 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -5764,33 +5764,60 @@ const char *rs6000_machine; const char * rs6000_machine_from_flags (void) { - /* For some CPUs, the machine cannot be determined by ISA flags. We have to - check them first. */ - switch (rs6000_cpu) - { - case PROCESSOR_PPC8540: - case PROCESSOR_PPC8548: - return "e500"; - - case PROCESSOR_PPCE300C2: - case PROCESSOR_PPCE300C3: - return "e300"; - - case PROCESSOR_PPCE500MC: - return "e500mc"; - - case PROCESSOR_PPCE500MC64: - return "e500mc64"; - - case PROCESSOR_PPCE5500: - return "e5500"; - - case PROCESSOR_PPCE6500: - return "e6500"; - - default: - break; - } + /* e300 and e500 */ + if (rs6000_cpu == PROCESSOR_PPCE300C2 || rs6000_cpu == PROCESSOR_PPCE300C3) + return "e300"; + if (rs6000_cpu == PROCESSOR_PPC8540 || rs6000_cpu == PROCESSOR_PPC8548) + return "e500"; + if (rs6000_cpu == PROCESSOR_PPCE500MC) + return "e500mc"; + if (rs6000_cpu == PROCESSOR_PPCE500MC64) + return "e500mc64"; + if (rs6000_cpu == PROCESSOR_PPCE5500) + return "e5500"; + if (rs6000_cpu == PROCESSOR_PPCE6500) + return "e6500"; + + /* 400 series */ + if (rs6000_cpu == PROCESSOR_PPC403) + return "\"403\""; + if (rs6000_cpu == PROCESSOR_PPC405) + return "\"405\""; + if (rs6000_cpu == PROCESSOR_PPC440) + return "\"440\""; + if (rs6000_cpu == PROCESSOR_PPC476) + return "\"476\""; + + /* A2 */ + if (rs6000_cpu == PROCESSOR_PPCA2) + return "a2"; + + /* Cell BE */ + if (rs6000_cpu == PROCESSOR_CELL) + return "cell"; + + /* Titan */ + if (rs6000_cpu == PROCESSOR_TITAN) + return "titan"; + + /* 500 series and 800 series */ + if (rs6000_cpu == PROCESSOR_MPCCORE) + return "\"821\""; + + /* 600 series and 700 series, "classic" */ + if (rs6000_cpu == PROCESSOR_PPC601 || rs6000_cpu == PROCESSOR_PPC603 + || rs6000_cpu == PROCESSOR_PPC604 || rs6000_cpu == PROCESSOR_PPC604e + || rs6000_cpu == PROCESSOR_PPC750 || rs6000_cpu == PROCESSOR_POWERPC) + return "ppc"; + + /* Classic with AltiVec, "G4" */ + if (rs6000_cpu == PROCESSOR_PPC7400 || rs6000_cpu == PROCESSOR_PPC7450) + return "\"7450\""; + + /* The older 64-bit CPUs */ + 
if (rs6000_cpu == PROCESSOR_PPC620 || rs6000_cpu == PROCESSOR_PPC630 + || rs6000_cpu == PROCESSOR_RS64A || rs6000_cpu == PROCESSOR_POWERPC64) + return "ppc64"; HOST_WIDE_INT flags = rs6000_isa_flags; -- cgit v1.1 From 1301d7f647c7ac40da7f910aa6e790205e34bb8b Mon Sep 17 00:00:00 2001 From: Michael Meissner Date: Sat, 5 Mar 2022 00:01:52 -0500 Subject: Optimize signed DImode -> TImode on power10. On power10, GCC tries to optimize the signed conversion from DImode to TImode by using the vextsd2q instruction. However to generate this instruction, it would have to generate 3 direct moves (1 from the GPR registers to the altivec registers, and 2 from the altivec registers to the GPR register). This patch generates the shift right immediate instruction to do the conversion if the target/source registers ares GPR registers like it does on earlier systems. If the target/source registers are Altivec registers, it will generate the vextsd2q instruction. 2022-03-05 Michael Meissner gcc/ PR target/104698 * config/rs6000/vsx.md (UNSPEC_MTVSRD_DITI_W1): Delete. (mtvsrdd_diti_w1): Delete. (extendditi2): Convert from define_expand to define_insn_and_split. Replace with code to deal with both GPR registers and with altivec registers. gcc/testsuite/ PR target/104698 * gcc.target/powerpc/pr104698-1.c: New test. * gcc.target/powerpc/pr104698-2.c: New test. --- gcc/config/rs6000/vsx.md | 83 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 61 insertions(+), 22 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md index b53de10..d0fb92f 100644 --- a/gcc/config/rs6000/vsx.md +++ b/gcc/config/rs6000/vsx.md @@ -360,7 +360,6 @@ UNSPEC_XXGENPCV UNSPEC_MTVSBM UNSPEC_EXTENDDITI2 - UNSPEC_MTVSRD_DITI_W1 UNSPEC_VCNTMB UNSPEC_VEXPAND UNSPEC_VEXTRACT @@ -5023,15 +5022,67 @@ DONE; }) -;; ISA 3.1 vector sign extend -;; Move DI value from GPR to TI mode in VSX register, word 1. -(define_insn "mtvsrdd_diti_w1" - [(set (match_operand:TI 0 "register_operand" "=wa") - (unspec:TI [(match_operand:DI 1 "register_operand" "r")] - UNSPEC_MTVSRD_DITI_W1))] - "TARGET_POWERPC64 && TARGET_DIRECT_MOVE" - "mtvsrdd %x0,0,%1" - [(set_attr "type" "vecmove")]) +;; Sign extend DI to TI. We provide both GPR targets and Altivec targets on +;; power10. On earlier systems, the machine independent code will generate a +;; shift left to sign extend the 64-bit value to 128-bit. +;; +;; If the register allocator prefers to use GPR registers, we will use a shift +;; left instruction to sign extend the 64-bit value to 128-bit. +;; +;; If the register allocator prefers to use Altivec registers on power10, +;; generate the vextsd2q instruction. +(define_insn_and_split "extendditi2" + [(set (match_operand:TI 0 "register_operand" "=r,r,v,v,v") + (sign_extend:TI (match_operand:DI 1 "input_operand" "r,m,r,wa,Z"))) + (clobber (reg:DI CA_REGNO))] + "TARGET_POWERPC64 && TARGET_POWER10" + "#" + "&& reload_completed" + [(pc)] +{ + rtx dest = operands[0]; + rtx src = operands[1]; + int dest_regno = reg_or_subregno (dest); + + /* Handle conversion to GPR registers. Load up the low part and then do + a sign extension to the upper part. */ + if (INT_REGNO_P (dest_regno)) + { + rtx dest_hi = gen_highpart (DImode, dest); + rtx dest_lo = gen_lowpart (DImode, dest); + + emit_move_insn (dest_lo, src); + /* In case src is a MEM, we have to use the destination, which is a + register, instead of re-using the source. */ + rtx src2 = (REG_P (src) || SUBREG_P (src)) ? 
src : dest_lo; + emit_insn (gen_ashrdi3 (dest_hi, src2, GEN_INT (63))); + DONE; + } + + /* For conversion to an Altivec register, generate either a splat operation + or a load rightmost double word instruction. Both instructions gets the + DImode value into the lower 64 bits, and then do the vextsd2q + instruction. */ + + else if (ALTIVEC_REGNO_P (dest_regno)) + { + if (MEM_P (src)) + emit_insn (gen_vsx_lxvrdx (dest, src)); + else + { + rtx dest_v2di = gen_rtx_REG (V2DImode, dest_regno); + emit_insn (gen_vsx_splat_v2di (dest_v2di, src)); + } + + emit_insn (gen_extendditi2_vector (dest, dest)); + DONE; + } + + else + gcc_unreachable (); +} + [(set_attr "length" "8") + (set_attr "type" "shift,load,vecmove,vecperm,load")]) ;; Sign extend 64-bit value in TI reg, word 1, to 128-bit value in TI reg (define_insn "extendditi2_vector" @@ -5042,18 +5093,6 @@ "vextsd2q %0,%1" [(set_attr "type" "vecexts")]) -(define_expand "extendditi2" - [(set (match_operand:TI 0 "gpc_reg_operand") - (sign_extend:DI (match_operand:DI 1 "gpc_reg_operand")))] - "TARGET_POWER10" - { - /* Move 64-bit src from GPR to vector reg and sign extend to 128-bits. */ - rtx temp = gen_reg_rtx (TImode); - emit_insn (gen_mtvsrdd_diti_w1 (temp, operands[1])); - emit_insn (gen_extendditi2_vector (operands[0], temp)); - DONE; - }) - ;; ISA 3.0 Binary Floating-Point Support -- cgit v1.1 From 8ea4a34bd0b0a46277b5e077c89cbd86dfb09c48 Mon Sep 17 00:00:00 2001 From: Roger Sayle Date: Sat, 5 Mar 2022 08:50:45 +0000 Subject: PR 104732: Simplify/fix DI mode logic expansion/splitting on -m32. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This clean-up patch resolves PR testsuite/104732, the failure of the recent test gcc.target/i386/pr100711-1.c on 32-bit Solaris/x86. Rather than just tweak the testcase, the proposed approach is to fix the underlying problem by removing the "TARGET_STV && TARGET_SSE2" conditionals from the DI mode logical operation expanders and pre-reload splitters in i386.md, which as I'll show generate inferior code (even a GCC 12 regression) on !TARGET_64BIT whenever -mno-stv (such as Solaris) or -msse (but not -msse2). First a little bit of history. In the beginning, DImode operations on i386 weren't defined by the machine description, and lowered during RTL expansion to SI mode operations. The with PR 65105 in 2015, -mstv was added, together with a SWIM1248x mode iterator (later renamed to SWIM1248x) together with several *di3_doubleword post-reload splitters that made use of register allocation to perform some double word operations in 64-but XMM registers. A short while later in 2016, PR 70322 added similar support for one_cmpldi2. All of this logic was dependent upon "!TARGET_64BIT && TARGET_STV && TARGET_SSE2". With the passing of time, these conditions became irrelevant when in 2019, it was decided to split these double-word patterns before reload. https://gcc.gnu.org/pipermail/gcc-patches/2019-June/523877.html https://gcc.gnu.org/pipermail/gcc-patches/2019-October/532236.html Hence the current situation, where on most modern CPU architectures (where "TARGET_STV && TARGET_SSE2" is true), RTL is expanded with DI mode operations, that are then split into two SI mode instructions before reload, except on Solaris and other odd cases, where the splitting is to two SI mode instructions is done during RTL expansion. By the time compilation reaches register allocation both paths in theory produce identical or similar code, so the vestigial legacy/logic would appear to be harmless. 
Unfortunately, there is one place where this arbitrary choice of how to lower DI mode doubleword operations is visible to the middle-end, it controls whether the backend appears to have a suitable optab, and the presence (or not) of DImode optabs can influence vectorization cost models and veclower decisions. The issue (and code quality regression) can be seen in this test case: typedef long long v2di __attribute__((vector_size (16))); v2di x; void foo (long long a) { v2di t = {a, a}; x = ~t; } which when compiled with "-O2 -m32 -msse -march=pentiumpro" produces: foo: subl $28, %esp movl %ebx, 16(%esp) movl 32(%esp), %eax movl %esi, 20(%esp) movl 36(%esp), %edx movl %edi, 24(%esp) movl %eax, %esi movl %eax, %edi movl %edx, %ebx movl %edx, %ecx notl %esi notl %ebx movl %esi, (%esp) notl %edi notl %ecx movl %ebx, 4(%esp) movl 20(%esp), %esi movl %edi, 8(%esp) movl 16(%esp), %ebx movl %ecx, 12(%esp) movl 24(%esp), %edi movss 8(%esp), %xmm1 movss 12(%esp), %xmm2 movss (%esp), %xmm0 movss 4(%esp), %xmm3 unpcklps %xmm2, %xmm1 unpcklps %xmm3, %xmm0 movlhps %xmm1, %xmm0 movaps %xmm0, x addl $28, %esp ret Importantly notice the four "notl" instructions. With this patch: foo: subl $28, %esp movl 32(%esp), %edx movl 36(%esp), %eax notl %edx movl %edx, (%esp) notl %eax movl %eax, 4(%esp) movl %edx, 8(%esp) movl %eax, 12(%esp) movaps (%esp), %xmm1 movaps %xmm1, x addl $28, %esp ret Notice only two "notl" instructions. Checking with godbolt.org, GCC generated 4 NOTs in GCC 4.x and 5.x, 2 NOTs between GCC 6.x and 9.x, and regressed to 4 NOTs since GCC 10.x [which hopefully qualifies this clean-up as suitable for stage 4]. Most significantly, this patch allows pr100711-1.c to pass with -mno-stv, allowing pandn to be used with V2DImode on Solaris/x86. Fingers-crossed this should reduce the number of discrepancies encountered supporting Solaris/x86. 2022-03-05 Roger Sayle Uroš Bizjak gcc/ChangeLog PR testsuite/104732 * config/i386/i386.md (SWIM1248x): Renamed from SWIM1248s. Include DI mode unconditionally. (*anddi3_doubleword): Remove && TARGET_STV && TARGET_SSE2 condition, i.e. always split on !TARGET_64BIT. (*di3_doubleword): Likewise. (*one_cmpldi2_doubleword): Likewise. (and3 expander): Update to use SWIM1248x from SWIM1248s. (3 expander): Likewise. (one_cmpl2 expander): Likewise. gcc/testsuite/ChangeLog PR testsuite/104732 * gcc.target/i386/pr104732.c: New test case. --- gcc/config/i386/i386.md | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'gcc/config') diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 5e0a980..d15170e 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -1079,11 +1079,11 @@ (HI "TARGET_HIMODE_MATH") SI]) -;; Math-dependant integer modes with DImode (enabled for 32bit with STV). -(define_mode_iterator SWIM1248s +;; Math-dependant integer modes with DImode. +(define_mode_iterator SWIM1248x [(QI "TARGET_QIMODE_MATH") (HI "TARGET_HIMODE_MATH") - SI (DI "TARGET_64BIT || (TARGET_STV && TARGET_SSE2)")]) + SI DI]) ;; Math-dependant single word integer modes without QImode. (define_mode_iterator SWIM248 [(HI "TARGET_HIMODE_MATH") @@ -9693,9 +9693,9 @@ ;; it should be done with splitters. 
(define_expand "and3" - [(set (match_operand:SWIM1248s 0 "nonimmediate_operand") - (and:SWIM1248s (match_operand:SWIM1248s 1 "nonimmediate_operand") - (match_operand:SWIM1248s 2 "")))] + [(set (match_operand:SWIM1248x 0 "nonimmediate_operand") + (and:SWIM1248x (match_operand:SWIM1248x 1 "nonimmediate_operand") + (match_operand:SWIM1248x 2 "")))] "" { machine_mode mode = mode; @@ -9733,7 +9733,7 @@ (match_operand:DI 1 "nonimmediate_operand") (match_operand:DI 2 "x86_64_szext_general_operand"))) (clobber (reg:CC FLAGS_REG))] - "!TARGET_64BIT && TARGET_STV && TARGET_SSE2 + "!TARGET_64BIT && ix86_binary_operator_ok (AND, DImode, operands) && ix86_pre_reload_split ()" "#" @@ -10337,9 +10337,9 @@ ;; If this is considered useful, it should be done with splitters. (define_expand "3" - [(set (match_operand:SWIM1248s 0 "nonimmediate_operand") - (any_or:SWIM1248s (match_operand:SWIM1248s 1 "nonimmediate_operand") - (match_operand:SWIM1248s 2 "")))] + [(set (match_operand:SWIM1248x 0 "nonimmediate_operand") + (any_or:SWIM1248x (match_operand:SWIM1248x 1 "nonimmediate_operand") + (match_operand:SWIM1248x 2 "")))] "" "ix86_expand_binary_operator (, mode, operands); DONE;") @@ -10349,7 +10349,7 @@ (match_operand:DI 1 "nonimmediate_operand") (match_operand:DI 2 "x86_64_szext_general_operand"))) (clobber (reg:CC FLAGS_REG))] - "!TARGET_64BIT && TARGET_STV && TARGET_SSE2 + "!TARGET_64BIT && ix86_binary_operator_ok (, DImode, operands) && ix86_pre_reload_split ()" "#" @@ -11427,15 +11427,15 @@ ;; One complement instructions (define_expand "one_cmpl2" - [(set (match_operand:SWIM1248s 0 "nonimmediate_operand") - (not:SWIM1248s (match_operand:SWIM1248s 1 "nonimmediate_operand")))] + [(set (match_operand:SWIM1248x 0 "nonimmediate_operand") + (not:SWIM1248x (match_operand:SWIM1248x 1 "nonimmediate_operand")))] "" "ix86_expand_unary_operator (NOT, mode, operands); DONE;") (define_insn_and_split "*one_cmpldi2_doubleword" [(set (match_operand:DI 0 "nonimmediate_operand") (not:DI (match_operand:DI 1 "nonimmediate_operand")))] - "!TARGET_64BIT && TARGET_STV && TARGET_SSE2 + "!TARGET_64BIT && ix86_unary_operator_ok (NOT, DImode, operands) && ix86_pre_reload_split ()" "#" -- cgit v1.1