author     Ian Lance Taylor <iant@golang.org>    2020-10-12 09:46:38 -0700
committer  Ian Lance Taylor <iant@golang.org>    2020-10-12 09:46:38 -0700
commit     9cd320ea6572c577cdf17ce1f9ea5230b166af6d
tree       d1c8e7c2e09a91ed75f0e5476c648c2e745aa2de /gcc/config
parent     4854d721be78358e59367982bdd94461b4be3c5a
parent     3175d40fc52fb8eb3c3b18cc343d773da24434fb
Merge from trunk revision 3175d40fc52fb8eb3c3b18cc343d773da24434fb.
Diffstat (limited to 'gcc/config')
207 files changed, 12689 insertions, 6566 deletions
diff --git a/gcc/config/aarch64/aarch64-arches.def b/gcc/config/aarch64/aarch64-arches.def index 3be55fa..389084f 100644 --- a/gcc/config/aarch64/aarch64-arches.def +++ b/gcc/config/aarch64/aarch64-arches.def @@ -37,5 +37,6 @@ AARCH64_ARCH("armv8.3-a", generic, 8_3A, 8, AARCH64_FL_FOR_ARCH8_3) AARCH64_ARCH("armv8.4-a", generic, 8_4A, 8, AARCH64_FL_FOR_ARCH8_4) AARCH64_ARCH("armv8.5-a", generic, 8_5A, 8, AARCH64_FL_FOR_ARCH8_5) AARCH64_ARCH("armv8.6-a", generic, 8_6A, 8, AARCH64_FL_FOR_ARCH8_6) +AARCH64_ARCH("armv8-r", generic, 8R , 8, AARCH64_FL_FOR_ARCH8_R) #undef AARCH64_ARCH diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c index 49dfbaf..732a4dc 100644 --- a/gcc/config/aarch64/aarch64-builtins.c +++ b/gcc/config/aarch64/aarch64-builtins.c @@ -117,6 +117,22 @@ enum aarch64_type_qualifiers qualifier_lane_quadtup_index = 0x1000, }; +/* Flags that describe what a function might do. */ +const unsigned int FLAG_NONE = 0U; +const unsigned int FLAG_READ_FPCR = 1U << 0; +const unsigned int FLAG_RAISE_FP_EXCEPTIONS = 1U << 1; +const unsigned int FLAG_READ_MEMORY = 1U << 2; +const unsigned int FLAG_PREFETCH_MEMORY = 1U << 3; +const unsigned int FLAG_WRITE_MEMORY = 1U << 4; + +/* Not all FP intrinsics raise FP exceptions or read FPCR register, + use this flag to suppress it. */ +const unsigned int FLAG_AUTO_FP = 1U << 5; + +const unsigned int FLAG_FP = FLAG_READ_FPCR | FLAG_RAISE_FP_EXCEPTIONS; +const unsigned int FLAG_ALL = FLAG_READ_FPCR | FLAG_RAISE_FP_EXCEPTIONS + | FLAG_READ_MEMORY | FLAG_PREFETCH_MEMORY | FLAG_WRITE_MEMORY; + typedef struct { const char *name; @@ -124,6 +140,7 @@ typedef struct const enum insn_code code; unsigned int fcode; enum aarch64_type_qualifiers *qualifiers; + unsigned int flags; } aarch64_simd_builtin_datum; static enum aarch64_type_qualifiers @@ -336,53 +353,53 @@ aarch64_types_storestruct_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS] #define CF4(N, X) CODE_FOR_##N##X##4 #define CF10(N, X) CODE_FOR_##N##X -#define VAR1(T, N, MAP, A) \ - {#N #A, UP (A), CF##MAP (N, A), 0, TYPES_##T}, -#define VAR2(T, N, MAP, A, B) \ - VAR1 (T, N, MAP, A) \ - VAR1 (T, N, MAP, B) -#define VAR3(T, N, MAP, A, B, C) \ - VAR2 (T, N, MAP, A, B) \ - VAR1 (T, N, MAP, C) -#define VAR4(T, N, MAP, A, B, C, D) \ - VAR3 (T, N, MAP, A, B, C) \ - VAR1 (T, N, MAP, D) -#define VAR5(T, N, MAP, A, B, C, D, E) \ - VAR4 (T, N, MAP, A, B, C, D) \ - VAR1 (T, N, MAP, E) -#define VAR6(T, N, MAP, A, B, C, D, E, F) \ - VAR5 (T, N, MAP, A, B, C, D, E) \ - VAR1 (T, N, MAP, F) -#define VAR7(T, N, MAP, A, B, C, D, E, F, G) \ - VAR6 (T, N, MAP, A, B, C, D, E, F) \ - VAR1 (T, N, MAP, G) -#define VAR8(T, N, MAP, A, B, C, D, E, F, G, H) \ - VAR7 (T, N, MAP, A, B, C, D, E, F, G) \ - VAR1 (T, N, MAP, H) -#define VAR9(T, N, MAP, A, B, C, D, E, F, G, H, I) \ - VAR8 (T, N, MAP, A, B, C, D, E, F, G, H) \ - VAR1 (T, N, MAP, I) -#define VAR10(T, N, MAP, A, B, C, D, E, F, G, H, I, J) \ - VAR9 (T, N, MAP, A, B, C, D, E, F, G, H, I) \ - VAR1 (T, N, MAP, J) -#define VAR11(T, N, MAP, A, B, C, D, E, F, G, H, I, J, K) \ - VAR10 (T, N, MAP, A, B, C, D, E, F, G, H, I, J) \ - VAR1 (T, N, MAP, K) -#define VAR12(T, N, MAP, A, B, C, D, E, F, G, H, I, J, K, L) \ - VAR11 (T, N, MAP, A, B, C, D, E, F, G, H, I, J, K) \ - VAR1 (T, N, MAP, L) -#define VAR13(T, N, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M) \ - VAR12 (T, N, MAP, A, B, C, D, E, F, G, H, I, J, K, L) \ - VAR1 (T, N, MAP, M) -#define VAR14(T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N) \ - VAR13 (T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, 
M) \ - VAR1 (T, X, MAP, N) -#define VAR15(T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O) \ - VAR14 (T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N) \ - VAR1 (T, X, MAP, O) -#define VAR16(T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \ - VAR15 (T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O) \ - VAR1 (T, X, MAP, P) +#define VAR1(T, N, MAP, FLAG, A) \ + {#N #A, UP (A), CF##MAP (N, A), 0, TYPES_##T, FLAG_##FLAG}, +#define VAR2(T, N, MAP, FLAG, A, B) \ + VAR1 (T, N, MAP, FLAG, A) \ + VAR1 (T, N, MAP, FLAG, B) +#define VAR3(T, N, MAP, FLAG, A, B, C) \ + VAR2 (T, N, MAP, FLAG, A, B) \ + VAR1 (T, N, MAP, FLAG, C) +#define VAR4(T, N, MAP, FLAG, A, B, C, D) \ + VAR3 (T, N, MAP, FLAG, A, B, C) \ + VAR1 (T, N, MAP, FLAG, D) +#define VAR5(T, N, MAP, FLAG, A, B, C, D, E) \ + VAR4 (T, N, MAP, FLAG, A, B, C, D) \ + VAR1 (T, N, MAP, FLAG, E) +#define VAR6(T, N, MAP, FLAG, A, B, C, D, E, F) \ + VAR5 (T, N, MAP, FLAG, A, B, C, D, E) \ + VAR1 (T, N, MAP, FLAG, F) +#define VAR7(T, N, MAP, FLAG, A, B, C, D, E, F, G) \ + VAR6 (T, N, MAP, FLAG, A, B, C, D, E, F) \ + VAR1 (T, N, MAP, FLAG, G) +#define VAR8(T, N, MAP, FLAG, A, B, C, D, E, F, G, H) \ + VAR7 (T, N, MAP, FLAG, A, B, C, D, E, F, G) \ + VAR1 (T, N, MAP, FLAG, H) +#define VAR9(T, N, MAP, FLAG, A, B, C, D, E, F, G, H, I) \ + VAR8 (T, N, MAP, FLAG, A, B, C, D, E, F, G, H) \ + VAR1 (T, N, MAP, FLAG, I) +#define VAR10(T, N, MAP, FLAG, A, B, C, D, E, F, G, H, I, J) \ + VAR9 (T, N, MAP, FLAG, A, B, C, D, E, F, G, H, I) \ + VAR1 (T, N, MAP, FLAG, J) +#define VAR11(T, N, MAP, FLAG, A, B, C, D, E, F, G, H, I, J, K) \ + VAR10 (T, N, MAP, FLAG, A, B, C, D, E, F, G, H, I, J) \ + VAR1 (T, N, MAP, FLAG, K) +#define VAR12(T, N, MAP, FLAG, A, B, C, D, E, F, G, H, I, J, K, L) \ + VAR11 (T, N, MAP, FLAG, A, B, C, D, E, F, G, H, I, J, K) \ + VAR1 (T, N, MAP, FLAG, L) +#define VAR13(T, N, MAP, FLAG, A, B, C, D, E, F, G, H, I, J, K, L, M) \ + VAR12 (T, N, MAP, FLAG, A, B, C, D, E, F, G, H, I, J, K, L) \ + VAR1 (T, N, MAP, FLAG, M) +#define VAR14(T, X, MAP, FLAG, A, B, C, D, E, F, G, H, I, J, K, L, M, N) \ + VAR13 (T, X, MAP, FLAG, A, B, C, D, E, F, G, H, I, J, K, L, M) \ + VAR1 (T, X, MAP, FLAG, N) +#define VAR15(T, X, MAP, FLAG, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O) \ + VAR14 (T, X, MAP, FLAG, A, B, C, D, E, F, G, H, I, J, K, L, M, N) \ + VAR1 (T, X, MAP, FLAG, O) +#define VAR16(T, X, MAP, FLAG, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \ + VAR15 (T, X, MAP, FLAG, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O) \ + VAR1 (T, X, MAP, FLAG, P) #include "aarch64-builtin-iterators.h" @@ -438,7 +455,7 @@ typedef struct AARCH64_SIMD_BUILTIN_FCMLA_LANEQ##I##_##M, #undef VAR1 -#define VAR1(T, N, MAP, A) \ +#define VAR1(T, N, MAP, FLAG, A) \ AARCH64_SIMD_BUILTIN_##T##_##N##A, enum aarch64_builtins @@ -615,14 +632,16 @@ tree aarch64_bf16_type_node = NULL_TREE; tree aarch64_bf16_ptr_type_node = NULL_TREE; /* Wrapper around add_builtin_function. NAME is the name of the built-in - function, TYPE is the function type, and CODE is the function subcode - (relative to AARCH64_BUILTIN_GENERAL). */ + function, TYPE is the function type, CODE is the function subcode + (relative to AARCH64_BUILTIN_GENERAL), and ATTRS is the function + attributes. 
*/ static tree -aarch64_general_add_builtin (const char *name, tree type, unsigned int code) +aarch64_general_add_builtin (const char *name, tree type, unsigned int code, + tree attrs = NULL_TREE) { code = (code << AARCH64_BUILTIN_SHIFT) | AARCH64_BUILTIN_GENERAL; return add_builtin_function (name, type, code, BUILT_IN_MD, - NULL, NULL_TREE); + NULL, attrs); } static const char * @@ -879,6 +898,95 @@ aarch64_init_simd_builtin_scalar_types (void) "__builtin_aarch64_simd_udi"); } +/* Return a set of FLAG_* flags that describe what the function could do, + taking the command-line flags into account. */ +static unsigned int +aarch64_call_properties (aarch64_simd_builtin_datum *d) +{ + unsigned int flags = d->flags; + + if (!(flags & FLAG_AUTO_FP) && FLOAT_MODE_P (d->mode)) + flags |= FLAG_FP; + + /* -fno-trapping-math means that we can assume any FP exceptions + are not user-visible. */ + if (!flag_trapping_math) + flags &= ~FLAG_RAISE_FP_EXCEPTIONS; + + return flags; +} + +/* Return true if calls to the function could modify some form of + global state. */ +static bool +aarch64_modifies_global_state_p (aarch64_simd_builtin_datum *d) +{ + unsigned int flags = aarch64_call_properties (d); + + if (flags & FLAG_RAISE_FP_EXCEPTIONS) + return true; + + if (flags & FLAG_PREFETCH_MEMORY) + return true; + + return flags & FLAG_WRITE_MEMORY; +} + +/* Return true if calls to the function could read some form of + global state. */ +static bool +aarch64_reads_global_state_p (aarch64_simd_builtin_datum *d) +{ + unsigned int flags = aarch64_call_properties (d); + + if (flags & FLAG_READ_FPCR) + return true; + + return flags & FLAG_READ_MEMORY; +} + +/* Return true if calls to the function could raise a signal. */ +static bool +aarch64_could_trap_p (aarch64_simd_builtin_datum *d) +{ + unsigned int flags = aarch64_call_properties (d); + + if (flags & FLAG_RAISE_FP_EXCEPTIONS) + return true; + + if (flags & (FLAG_READ_MEMORY | FLAG_WRITE_MEMORY)) + return true; + + return false; +} + +/* Add attribute NAME to ATTRS. */ +static tree +aarch64_add_attribute (const char *name, tree attrs) +{ + return tree_cons (get_identifier (name), NULL_TREE, attrs); +} + +/* Return the appropriate function attributes. */ +static tree +aarch64_get_attributes (aarch64_simd_builtin_datum *d) +{ + tree attrs = NULL_TREE; + + if (!aarch64_modifies_global_state_p (d)) + { + if (aarch64_reads_global_state_p (d)) + attrs = aarch64_add_attribute ("pure", attrs); + else + attrs = aarch64_add_attribute ("const", attrs); + } + + if (!flag_non_call_exceptions || !aarch64_could_trap_p (d)) + attrs = aarch64_add_attribute ("nothrow", attrs); + + return aarch64_add_attribute ("leaf", attrs); +} + static bool aarch64_simd_builtins_initialized_p = false; /* Due to the architecture not providing lane variant of the lane instructions @@ -1032,7 +1140,9 @@ aarch64_init_simd_builtins (void) snprintf (namebuf, sizeof (namebuf), "__builtin_aarch64_%s", d->name); - fndecl = aarch64_general_add_builtin (namebuf, ftype, fcode); + tree attrs = aarch64_get_attributes (d); + + fndecl = aarch64_general_add_builtin (namebuf, ftype, fcode, attrs); aarch64_builtin_decls[fcode] = fndecl; } @@ -1914,7 +2024,7 @@ aarch64_expand_builtin_memtag (int fcode, tree exp, rtx target) return target; } -/* Expand an expression EXP as fpsr or cpsr setter (depending on +/* Expand an expression EXP as fpsr or fpcr setter (depending on UNSPEC) using MODE. 
*/ static void aarch64_expand_fpsr_fpcr_setter (int unspec, machine_mode mode, tree exp) @@ -1924,6 +2034,18 @@ aarch64_expand_fpsr_fpcr_setter (int unspec, machine_mode mode, tree exp) emit_insn (gen_aarch64_set (unspec, mode, op)); } +/* Expand a fpsr or fpcr getter (depending on UNSPEC) using MODE. + Return the target. */ +static rtx +aarch64_expand_fpsr_fpcr_getter (enum insn_code icode, machine_mode mode, + rtx target) +{ + expand_operand op; + create_output_operand (&op, target, mode); + expand_insn (icode, 1, &op); + return op.value; +} + /* Expand an expression EXP that calls built-in function FCODE, with result going to TARGET if that's convenient. IGNORE is true if the result of the builtin is ignored. */ @@ -1938,26 +2060,26 @@ aarch64_general_expand_builtin (unsigned int fcode, tree exp, rtx target, switch (fcode) { case AARCH64_BUILTIN_GET_FPCR: - emit_insn (gen_aarch64_get (UNSPECV_GET_FPCR, SImode, target)); - return target; + return aarch64_expand_fpsr_fpcr_getter (CODE_FOR_aarch64_get_fpcrsi, + SImode, target); case AARCH64_BUILTIN_SET_FPCR: aarch64_expand_fpsr_fpcr_setter (UNSPECV_SET_FPCR, SImode, exp); return target; case AARCH64_BUILTIN_GET_FPSR: - emit_insn (gen_aarch64_get (UNSPECV_GET_FPSR, SImode, target)); - return target; + return aarch64_expand_fpsr_fpcr_getter (CODE_FOR_aarch64_get_fpsrsi, + SImode, target); case AARCH64_BUILTIN_SET_FPSR: aarch64_expand_fpsr_fpcr_setter (UNSPECV_SET_FPSR, SImode, exp); return target; case AARCH64_BUILTIN_GET_FPCR64: - emit_insn (gen_aarch64_get (UNSPECV_GET_FPCR, DImode, target)); - return target; + return aarch64_expand_fpsr_fpcr_getter (CODE_FOR_aarch64_get_fpcrdi, + DImode, target); case AARCH64_BUILTIN_SET_FPCR64: aarch64_expand_fpsr_fpcr_setter (UNSPECV_SET_FPCR, DImode, exp); return target; case AARCH64_BUILTIN_GET_FPSR64: - emit_insn (gen_aarch64_get (UNSPECV_GET_FPSR, DImode, target)); - return target; + return aarch64_expand_fpsr_fpcr_getter (CODE_FOR_aarch64_get_fpsrdi, + DImode, target); case AARCH64_BUILTIN_SET_FPSR64: aarch64_expand_fpsr_fpcr_setter (UNSPECV_SET_FPSR, DImode, exp); return target; @@ -1969,20 +2091,13 @@ aarch64_general_expand_builtin (unsigned int fcode, tree exp, rtx target, arg0 = CALL_EXPR_ARG (exp, 0); op0 = force_reg (Pmode, expand_normal (arg0)); - if (!target) - target = gen_reg_rtx (Pmode); - else - target = force_reg (Pmode, target); - - emit_move_insn (target, op0); - if (fcode == AARCH64_PAUTH_BUILTIN_XPACLRI) { rtx lr = gen_rtx_REG (Pmode, R30_REGNUM); icode = CODE_FOR_xpaclri; emit_move_insn (lr, op0); emit_insn (GEN_FCN (icode) ()); - emit_move_insn (target, lr); + return lr; } else { @@ -2012,20 +2127,18 @@ aarch64_general_expand_builtin (unsigned int fcode, tree exp, rtx target, emit_move_insn (x17_reg, op0); emit_move_insn (x16_reg, op1); emit_insn (GEN_FCN (icode) ()); - emit_move_insn (target, x17_reg); + return x17_reg; } - return target; - case AARCH64_JSCVT: - arg0 = CALL_EXPR_ARG (exp, 0); - op0 = force_reg (DFmode, expand_normal (arg0)); - if (!target) - target = gen_reg_rtx (SImode); - else - target = force_reg (SImode, target); - emit_insn (GEN_FCN (CODE_FOR_aarch64_fjcvtzs) (target, op0)); - return target; + { + expand_operand ops[2]; + create_output_operand (&ops[0], target, SImode); + op0 = expand_normal (CALL_EXPR_ARG (exp, 0)); + create_input_operand (&ops[1], op0, DFmode); + expand_insn (CODE_FOR_aarch64_fjcvtzs, 2, ops); + return ops[0].value; + } case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ0_V2SF: case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ90_V2SF: @@ -2196,7 +2309,7 @@ 
aarch64_general_builtin_rsqrt (unsigned int fn) } #undef VAR1 -#define VAR1(T, N, MAP, A) \ +#define VAR1(T, N, MAP, FLAG, A) \ case AARCH64_SIMD_BUILTIN_##T##_##N##A: /* Try to fold a call to the built-in function with subcode FCODE. The @@ -2209,11 +2322,11 @@ aarch64_general_fold_builtin (unsigned int fcode, tree type, { switch (fcode) { - BUILTIN_VDQF (UNOP, abs, 2) + BUILTIN_VDQF (UNOP, abs, 2, ALL) return fold_build1 (ABS_EXPR, type, args[0]); - VAR1 (UNOP, floatv2si, 2, v2sf) - VAR1 (UNOP, floatv4si, 2, v4sf) - VAR1 (UNOP, floatv2di, 2, v2df) + VAR1 (UNOP, floatv2si, 2, ALL, v2sf) + VAR1 (UNOP, floatv4si, 2, ALL, v4sf) + VAR1 (UNOP, floatv2di, 2, ALL, v2df) return fold_build1 (FLOAT_EXPR, type, args[0]); default: break; @@ -2239,24 +2352,24 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt) the arguments to the __builtin. */ switch (fcode) { - BUILTIN_VALL (UNOP, reduc_plus_scal_, 10) + BUILTIN_VALL (UNOP, reduc_plus_scal_, 10, ALL) new_stmt = gimple_build_call_internal (IFN_REDUC_PLUS, 1, args[0]); gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt)); break; - BUILTIN_VDQIF (UNOP, reduc_smax_scal_, 10) - BUILTIN_VDQ_BHSI (UNOPU, reduc_umax_scal_, 10) + BUILTIN_VDQIF (UNOP, reduc_smax_scal_, 10, ALL) + BUILTIN_VDQ_BHSI (UNOPU, reduc_umax_scal_, 10, ALL) new_stmt = gimple_build_call_internal (IFN_REDUC_MAX, 1, args[0]); gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt)); break; - BUILTIN_VDQIF (UNOP, reduc_smin_scal_, 10) - BUILTIN_VDQ_BHSI (UNOPU, reduc_umin_scal_, 10) + BUILTIN_VDQIF (UNOP, reduc_smin_scal_, 10, ALL) + BUILTIN_VDQ_BHSI (UNOPU, reduc_umin_scal_, 10, ALL) new_stmt = gimple_build_call_internal (IFN_REDUC_MIN, 1, args[0]); gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt)); break; - BUILTIN_GPF (BINOP, fmulx, 0) + BUILTIN_GPF (BINOP, fmulx, 0, ALL) { gcc_assert (nargs == 2); bool a0_cst_p = TREE_CODE (args[0]) == REAL_CST; diff --git a/gcc/config/aarch64/aarch64-c.c b/gcc/config/aarch64/aarch64-c.c index 1882288..5e23328 100644 --- a/gcc/config/aarch64/aarch64-c.c +++ b/gcc/config/aarch64/aarch64-c.c @@ -63,7 +63,8 @@ aarch64_define_unconditional_macros (cpp_reader *pfile) as interoperability with the same arm macro. */ builtin_define ("__ARM_ARCH_8A"); - builtin_define_with_int_value ("__ARM_ARCH_PROFILE", 'A'); + builtin_define_with_int_value ("__ARM_ARCH_PROFILE", + AARCH64_ISA_V8_R ? 
'R' : 'A'); builtin_define ("__ARM_FEATURE_CLZ"); builtin_define ("__ARM_FEATURE_IDIV"); builtin_define ("__ARM_FEATURE_UNALIGNED"); @@ -149,7 +150,7 @@ aarch64_update_cpp_builtins (cpp_reader *pfile) bits = 0; builtin_define_with_int_value ("__ARM_FEATURE_SVE_BITS", bits); } - aarch64_def_or_undef (TARGET_SVE, "__ARM_FEATURE_SVE_VECTOR_OPERATIONS", + aarch64_def_or_undef (TARGET_SVE, "__ARM_FEATURE_SVE_VECTOR_OPERATORS", pfile); aarch64_def_or_undef (TARGET_SVE_I8MM, "__ARM_FEATURE_SVE_MATMUL_INT8", pfile); @@ -181,6 +182,19 @@ aarch64_update_cpp_builtins (cpp_reader *pfile) aarch64_def_or_undef (aarch64_bti_enabled (), "__ARM_FEATURE_BTI_DEFAULT", pfile); + cpp_undef (pfile, "__ARM_FEATURE_PAC_DEFAULT"); + if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE) + { + int v = 0; + if (aarch64_ra_sign_key == AARCH64_KEY_A) + v |= 1; + if (aarch64_ra_sign_key == AARCH64_KEY_B) + v |= 2; + if (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL) + v |= 4; + builtin_define_with_int_value ("__ARM_FEATURE_PAC_DEFAULT", v); + } + aarch64_def_or_undef (TARGET_I8MM, "__ARM_FEATURE_MATMUL_INT8", pfile); aarch64_def_or_undef (TARGET_BF16_SIMD, "__ARM_FEATURE_BF16_VECTOR_ARITHMETIC", pfile); @@ -228,12 +242,12 @@ aarch64_pragma_target_parse (tree args, tree pop_target) else { pop_target = pop_target ? pop_target : target_option_default_node; - cl_target_option_restore (&global_options, + cl_target_option_restore (&global_options, &global_options_set, TREE_TARGET_OPTION (pop_target)); } target_option_current_node - = build_target_option_node (&global_options); + = build_target_option_node (&global_options, &global_options_set); aarch64_reset_previous_fndecl (); /* For the definitions, ensure all newly defined macros are considered diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def index bc89a14..b2e1932 100644 --- a/gcc/config/aarch64/aarch64-cores.def +++ b/gcc/config/aarch64/aarch64-cores.def @@ -103,8 +103,11 @@ AARCH64_CORE("cortex-a75", cortexa75, cortexa57, 8_2A, AARCH64_FL_FOR_ARCH8_2 AARCH64_CORE("cortex-a76", cortexa76, cortexa57, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD, neoversen1, 0x41, 0xd0b, -1) AARCH64_CORE("cortex-a76ae", cortexa76ae, cortexa57, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD | AARCH64_FL_SSBS, neoversen1, 0x41, 0xd0e, -1) AARCH64_CORE("cortex-a77", cortexa77, cortexa57, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD | AARCH64_FL_SSBS, neoversen1, 0x41, 0xd0d, -1) +AARCH64_CORE("cortex-a78", cortexa78, cortexa57, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD | AARCH64_FL_SSBS | AARCH64_FL_PROFILE, neoversen1, 0x41, 0xd41, -1) +AARCH64_CORE("cortex-a78ae", cortexa78ae, cortexa57, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD | AARCH64_FL_SSBS | AARCH64_FL_PROFILE, neoversen1, 0x41, 0xd42, -1) AARCH64_CORE("cortex-a65", cortexa65, cortexa53, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD | AARCH64_FL_SSBS, cortexa73, 0x41, 0xd06, -1) AARCH64_CORE("cortex-a65ae", cortexa65ae, cortexa53, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD | AARCH64_FL_SSBS, cortexa73, 0x41, 0xd43, -1) +AARCH64_CORE("cortex-x1", cortexx1, cortexa57, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD | AARCH64_FL_SSBS | AARCH64_FL_PROFILE, neoversen1, 0x41, 0xd44, -1) 
AARCH64_CORE("ares", ares, cortexa57, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD | AARCH64_FL_PROFILE, neoversen1, 0x41, 0xd0c, -1) AARCH64_CORE("neoverse-n1", neoversen1, cortexa57, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD | AARCH64_FL_PROFILE, neoversen1, 0x41, 0xd0c, -1) AARCH64_CORE("neoverse-e1", neoversee1, cortexa53, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD | AARCH64_FL_SSBS, cortexa73, 0x41, 0xd4a, -1) @@ -119,6 +122,9 @@ AARCH64_CORE("octeontx2f95", octeontx2f95, cortexa57, 8_2A, AARCH64_FL_FOR_ AARCH64_CORE("octeontx2f95n", octeontx2f95n, cortexa57, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_CRYPTO | AARCH64_FL_PROFILE, cortexa57, 0x43, 0x0b4, -1) AARCH64_CORE("octeontx2f95mm", octeontx2f95mm, cortexa57, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_CRYPTO | AARCH64_FL_PROFILE, cortexa57, 0x43, 0x0b5, -1) +/* Fujitsu ('F') cores. */ +AARCH64_CORE("a64fx", a64fx, a64fx, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_SVE, a64fx, 0x46, 0x001, -1) + /* HiSilicon ('H') cores. */ AARCH64_CORE("tsv110", tsv110, tsv110, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_CRYPTO | AARCH64_FL_F16 | AARCH64_FL_AES | AARCH64_FL_SHA2, tsv110, 0x48, 0xd01, -1) @@ -130,11 +136,15 @@ AARCH64_CORE("thunderx3t110", thunderx3t110, thunderx3t110, 8_3A, AARCH64_FL_ /* ARMv8.4-A Architecture Processors. */ /* Arm ('A') cores. */ -AARCH64_CORE("zeus", zeus, cortexa57, 8_4A, AARCH64_FL_FOR_ARCH8_4 | AARCH64_FL_SVE | AARCH64_FL_RCPC | AARCH64_FL_I8MM | AARCH64_FL_BF16 | AARCH64_FL_F16 | AARCH64_FL_PROFILE | AARCH64_FL_SSBS | AARCH64_FL_RNG, neoversen1, 0x41, 0xd40, -1) +AARCH64_CORE("zeus", zeus, cortexa57, 8_4A, AARCH64_FL_FOR_ARCH8_4 | AARCH64_FL_SVE | AARCH64_FL_RCPC | AARCH64_FL_I8MM | AARCH64_FL_BF16 | AARCH64_FL_F16 | AARCH64_FL_PROFILE | AARCH64_FL_SSBS | AARCH64_FL_RNG, neoversev1, 0x41, 0xd40, -1) +AARCH64_CORE("neoverse-v1", neoversev1, cortexa57, 8_4A, AARCH64_FL_FOR_ARCH8_4 | AARCH64_FL_SVE | AARCH64_FL_RCPC | AARCH64_FL_I8MM | AARCH64_FL_BF16 | AARCH64_FL_F16 | AARCH64_FL_PROFILE | AARCH64_FL_SSBS | AARCH64_FL_RNG, neoversev1, 0x41, 0xd40, -1) /* Qualcomm ('Q') cores. */ AARCH64_CORE("saphira", saphira, saphira, 8_4A, AARCH64_FL_FOR_ARCH8_4 | AARCH64_FL_CRYPTO | AARCH64_FL_RCPC, saphira, 0x51, 0xC01, -1) +/* Armv8.5-A Architecture Processors. */ +AARCH64_CORE("neoverse-n2", neoversen2, cortexa57, 8_5A, AARCH64_FL_FOR_ARCH8_5 | AARCH64_FL_I8MM | AARCH64_FL_BF16 | AARCH64_FL_F16 | AARCH64_FL_SVE | AARCH64_FL_SVE2 | AARCH64_FL_SVE2_BITPERM | AARCH64_FL_RNG | AARCH64_FL_MEMTAG, neoversen1, 0x41, 0xd49, -1) + /* ARMv8-A big.LITTLE implementations. */ AARCH64_CORE("cortex-a57.cortex-a53", cortexa57cortexa53, cortexa53, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa57, 0x41, AARCH64_BIG_LITTLE (0xd07, 0xd03), -1) @@ -147,4 +157,7 @@ AARCH64_CORE("cortex-a73.cortex-a53", cortexa73cortexa53, cortexa53, 8A, AARCH AARCH64_CORE("cortex-a75.cortex-a55", cortexa75cortexa55, cortexa53, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD, cortexa73, 0x41, AARCH64_BIG_LITTLE (0xd0a, 0xd05), -1) AARCH64_CORE("cortex-a76.cortex-a55", cortexa76cortexa55, cortexa53, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD, neoversen1, 0x41, AARCH64_BIG_LITTLE (0xd0b, 0xd05), -1) +/* Armv8-R Architecture Processors. 
*/ +AARCH64_CORE("cortex-r82", cortexr82, cortexa53, 8R, AARCH64_FL_FOR_ARCH8_R, cortexa53, 0x41, 0xd15, -1) + #undef AARCH64_CORE diff --git a/gcc/config/aarch64/aarch64-ldpstp.md b/gcc/config/aarch64/aarch64-ldpstp.md index dd6f396..02d7a5b 100644 --- a/gcc/config/aarch64/aarch64-ldpstp.md +++ b/gcc/config/aarch64/aarch64-ldpstp.md @@ -294,3 +294,45 @@ else FAIL; }) + +(define_peephole2 + [(match_scratch:DI 8 "r") + (set (match_operand:VP_2E 0 "memory_operand" "") + (match_operand:VP_2E 1 "aarch64_reg_or_zero" "")) + (set (match_operand:VP_2E 2 "memory_operand" "") + (match_operand:VP_2E 3 "aarch64_reg_or_zero" "")) + (set (match_operand:VP_2E 4 "memory_operand" "") + (match_operand:VP_2E 5 "aarch64_reg_or_zero" "")) + (set (match_operand:VP_2E 6 "memory_operand" "") + (match_operand:VP_2E 7 "aarch64_reg_or_zero" "")) + (match_dup 8)] + "TARGET_SIMD + && aarch64_operands_adjust_ok_for_ldpstp (operands, false, <MODE>mode)" + [(const_int 0)] +{ + if (aarch64_gen_adjusted_ldpstp (operands, false, <MODE>mode, UNKNOWN)) + DONE; + else + FAIL; +}) + +(define_peephole2 + [(match_scratch:DI 8 "r") + (set (match_operand:VP_2E 0 "register_operand" "") + (match_operand:VP_2E 1 "memory_operand" "")) + (set (match_operand:VP_2E 2 "register_operand" "") + (match_operand:VP_2E 3 "memory_operand" "")) + (set (match_operand:VP_2E 4 "register_operand" "") + (match_operand:VP_2E 5 "memory_operand" "")) + (set (match_operand:VP_2E 6 "register_operand" "") + (match_operand:VP_2E 7 "memory_operand" "")) + (match_dup 8)] + "TARGET_SIMD + && aarch64_operands_adjust_ok_for_ldpstp (operands, true, <MODE>mode)" + [(const_int 0)] +{ + if (aarch64_gen_adjusted_ldpstp (operands, true, <MODE>mode, UNKNOWN)) + DONE; + else + FAIL; +}) diff --git a/gcc/config/aarch64/aarch64-option-extensions.def b/gcc/config/aarch64/aarch64-option-extensions.def index 8257df9..ca08642 100644 --- a/gcc/config/aarch64/aarch64-option-extensions.def +++ b/gcc/config/aarch64/aarch64-option-extensions.def @@ -155,7 +155,7 @@ AARCH64_OPT_EXTENSION("sve", AARCH64_FL_SVE, AARCH64_FL_FP | AARCH64_FL_SIMD | \ AARCH64_OPT_EXTENSION("profile", AARCH64_FL_PROFILE, 0, 0, false, "") /* Enabling/Disabling "rng" only changes "rng". */ -AARCH64_OPT_EXTENSION("rng", AARCH64_FL_RNG, 0, 0, false, "") +AARCH64_OPT_EXTENSION("rng", AARCH64_FL_RNG, 0, 0, false, "rng") /* Enabling/Disabling "memtag" only changes "memtag". */ AARCH64_OPT_EXTENSION("memtag", AARCH64_FL_MEMTAG, 0, 0, false, "") diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index 865ad67..7a34c84 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -136,6 +136,25 @@ enum aarch64_addr_query_type { ADDR_QUERY_ANY }; +/* Enumerates values that can be arbitrarily mixed into a calculation + in order to make the result of the calculation unique to its use case. + + AARCH64_SALT_SSP_SET + AARCH64_SALT_SSP_TEST + Used when calculating the address of the stack protection canary value. + There is a separate value for setting and testing the canary, meaning + that these two operations produce unique addresses: they are different + from each other, and from all other address calculations. + + The main purpose of this is to prevent the SET address being spilled + to the stack and reloaded for the TEST, since that would give an + attacker the opportunity to change the address of the expected + canary value. 
*/ +enum aarch64_salt_type { + AARCH64_SALT_SSP_SET, + AARCH64_SALT_SSP_TEST +}; + /* A set of tuning parameters contains references to size and time cost models and vectors for address cost calculations, register move costs and memory move costs. */ @@ -578,6 +597,7 @@ int aarch64_vec_fpconst_pow_of_2 (rtx); rtx aarch64_eh_return_handler_rtx (void); rtx aarch64_mask_from_zextract_ops (rtx, rtx); const char *aarch64_output_move_struct (rtx *operands); +rtx aarch64_return_addr_rtx (void); rtx aarch64_return_addr (int, rtx); rtx aarch64_simd_gen_const_vector_dup (machine_mode, HOST_WIDE_INT); bool aarch64_simd_mem_operand_p (rtx); @@ -607,9 +627,9 @@ opt_machine_mode aarch64_ptrue_all_mode (rtx); rtx aarch64_convert_sve_data_to_pred (rtx, machine_mode, rtx); rtx aarch64_expand_sve_dupq (rtx, machine_mode, rtx); void aarch64_expand_mov_immediate (rtx, rtx); +rtx aarch64_stack_protect_canary_mem (machine_mode, rtx, aarch64_salt_type); rtx aarch64_ptrue_reg (machine_mode); rtx aarch64_pfalse_reg (machine_mode); -bool aarch64_sve_pred_dominates_p (rtx *, rtx); bool aarch64_sve_same_pred_for_ptest_p (rtx *, rtx *); void aarch64_emit_sve_pred_move (rtx, rtx, rtx); void aarch64_expand_sve_mem_move (rtx, rtx, machine_mode); @@ -681,7 +701,7 @@ void aarch64_split_compare_and_swap (rtx op[]); void aarch64_split_atomic_op (enum rtx_code, rtx, rtx, rtx, rtx, rtx, rtx); -bool aarch64_gen_adjusted_ldpstp (rtx *, bool, scalar_mode, RTX_CODE); +bool aarch64_gen_adjusted_ldpstp (rtx *, bool, machine_mode, RTX_CODE); void aarch64_expand_sve_vec_cmp_int (rtx, rtx_code, rtx, rtx); bool aarch64_expand_sve_vec_cmp_float (rtx, rtx_code, rtx, rtx, bool); @@ -732,7 +752,7 @@ int aarch64_ccmp_mode_to_code (machine_mode mode); bool extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset); bool aarch64_operands_ok_for_ldpstp (rtx *, bool, machine_mode); -bool aarch64_operands_adjust_ok_for_ldpstp (rtx *, bool, scalar_mode); +bool aarch64_operands_adjust_ok_for_ldpstp (rtx *, bool, machine_mode); void aarch64_swap_ldrstr_operands (rtx *, bool); extern void aarch64_asm_output_pool_epilogue (FILE *, const char *, diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index 332a0b6..3554fb0 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -37,695 +37,698 @@ macro holding the RTL pattern for the intrinsic. This mapping is: 0 - CODE_FOR_aarch64_<name><mode> 1-9 - CODE_FOR_<name><mode><1-9> - 10 - CODE_FOR_<name><mode>. */ - - BUILTIN_VDC (COMBINE, combine, 0) - VAR1 (COMBINEP, combine, 0, di) - BUILTIN_VB (BINOP, pmul, 0) - BUILTIN_VHSDF_HSDF (BINOP, fmulx, 0) - BUILTIN_VHSDF_DF (UNOP, sqrt, 2) - BUILTIN_VD_BHSI (BINOP, addp, 0) - VAR1 (UNOP, addp, 0, di) - BUILTIN_VDQ_BHSI (UNOP, clrsb, 2) - BUILTIN_VDQ_BHSI (UNOP, clz, 2) - BUILTIN_VS (UNOP, ctz, 2) - BUILTIN_VB (UNOP, popcount, 2) + 10 - CODE_FOR_<name><mode>. + + Parameter 4 is the 'flag' of the intrinsic. This is used to + help describe the attributes (for example, pure) for the intrinsic + function. 
*/ + + BUILTIN_VDC (COMBINE, combine, 0, ALL) + VAR1 (COMBINEP, combine, 0, ALL, di) + BUILTIN_VB (BINOP, pmul, 0, ALL) + BUILTIN_VHSDF_HSDF (BINOP, fmulx, 0, ALL) + BUILTIN_VHSDF_DF (UNOP, sqrt, 2, ALL) + BUILTIN_VD_BHSI (BINOP, addp, 0, NONE) + VAR1 (UNOP, addp, 0, NONE, di) + BUILTIN_VDQ_BHSI (UNOP, clrsb, 2, ALL) + BUILTIN_VDQ_BHSI (UNOP, clz, 2, ALL) + BUILTIN_VS (UNOP, ctz, 2, ALL) + BUILTIN_VB (UNOP, popcount, 2, ALL) /* Implemented by aarch64_<sur>q<r>shl<mode>. */ - BUILTIN_VSDQ_I (BINOP, sqshl, 0) - BUILTIN_VSDQ_I (BINOP_UUS, uqshl, 0) - BUILTIN_VSDQ_I (BINOP, sqrshl, 0) - BUILTIN_VSDQ_I (BINOP_UUS, uqrshl, 0) + BUILTIN_VSDQ_I (BINOP, sqshl, 0, ALL) + BUILTIN_VSDQ_I (BINOP_UUS, uqshl, 0, ALL) + BUILTIN_VSDQ_I (BINOP, sqrshl, 0, ALL) + BUILTIN_VSDQ_I (BINOP_UUS, uqrshl, 0, ALL) /* Implemented by aarch64_<su_optab><optab><mode>. */ - BUILTIN_VSDQ_I (BINOP, sqadd, 0) - BUILTIN_VSDQ_I (BINOPU, uqadd, 0) - BUILTIN_VSDQ_I (BINOP, sqsub, 0) - BUILTIN_VSDQ_I (BINOPU, uqsub, 0) + BUILTIN_VSDQ_I (BINOP, sqadd, 0, ALL) + BUILTIN_VSDQ_I (BINOPU, uqadd, 0, ALL) + BUILTIN_VSDQ_I (BINOP, sqsub, 0, ALL) + BUILTIN_VSDQ_I (BINOPU, uqsub, 0, ALL) /* Implemented by aarch64_<sur>qadd<mode>. */ - BUILTIN_VSDQ_I (BINOP_SSU, suqadd, 0) - BUILTIN_VSDQ_I (BINOP_UUS, usqadd, 0) + BUILTIN_VSDQ_I (BINOP_SSU, suqadd, 0, ALL) + BUILTIN_VSDQ_I (BINOP_UUS, usqadd, 0, ALL) /* Implemented by aarch64_get_dreg<VSTRUCT:mode><VDC:mode>. */ - BUILTIN_VDC (GETREG, get_dregoi, 0) - BUILTIN_VDC (GETREG, get_dregci, 0) - BUILTIN_VDC (GETREG, get_dregxi, 0) - VAR1 (GETREGP, get_dregoi, 0, di) - VAR1 (GETREGP, get_dregci, 0, di) - VAR1 (GETREGP, get_dregxi, 0, di) + BUILTIN_VDC (GETREG, get_dregoi, 0, ALL) + BUILTIN_VDC (GETREG, get_dregci, 0, ALL) + BUILTIN_VDC (GETREG, get_dregxi, 0, ALL) + VAR1 (GETREGP, get_dregoi, 0, ALL, di) + VAR1 (GETREGP, get_dregci, 0, ALL, di) + VAR1 (GETREGP, get_dregxi, 0, ALL, di) /* Implemented by aarch64_get_qreg<VSTRUCT:mode><VQ:mode>. */ - BUILTIN_VQ (GETREG, get_qregoi, 0) - BUILTIN_VQ (GETREG, get_qregci, 0) - BUILTIN_VQ (GETREG, get_qregxi, 0) - VAR1 (GETREGP, get_qregoi, 0, v2di) - VAR1 (GETREGP, get_qregci, 0, v2di) - VAR1 (GETREGP, get_qregxi, 0, v2di) + BUILTIN_VQ (GETREG, get_qregoi, 0, ALL) + BUILTIN_VQ (GETREG, get_qregci, 0, ALL) + BUILTIN_VQ (GETREG, get_qregxi, 0, ALL) + VAR1 (GETREGP, get_qregoi, 0, ALL, v2di) + VAR1 (GETREGP, get_qregci, 0, ALL, v2di) + VAR1 (GETREGP, get_qregxi, 0, ALL, v2di) /* Implemented by aarch64_set_qreg<VSTRUCT:mode><VQ:mode>. */ - BUILTIN_VQ (SETREG, set_qregoi, 0) - BUILTIN_VQ (SETREG, set_qregci, 0) - BUILTIN_VQ (SETREG, set_qregxi, 0) - VAR1 (SETREGP, set_qregoi, 0, v2di) - VAR1 (SETREGP, set_qregci, 0, v2di) - VAR1 (SETREGP, set_qregxi, 0, v2di) + BUILTIN_VQ (SETREG, set_qregoi, 0, ALL) + BUILTIN_VQ (SETREG, set_qregci, 0, ALL) + BUILTIN_VQ (SETREG, set_qregxi, 0, ALL) + VAR1 (SETREGP, set_qregoi, 0, ALL, v2di) + VAR1 (SETREGP, set_qregci, 0, ALL, v2di) + VAR1 (SETREGP, set_qregxi, 0, ALL, v2di) /* Implemented by aarch64_ld1x2<VQ:mode>. */ - BUILTIN_VQ (LOADSTRUCT, ld1x2, 0) + BUILTIN_VQ (LOADSTRUCT, ld1x2, 0, ALL) /* Implemented by aarch64_ld1x2<VDC:mode>. */ - BUILTIN_VDC (LOADSTRUCT, ld1x2, 0) + BUILTIN_VDC (LOADSTRUCT, ld1x2, 0, ALL) /* Implemented by aarch64_ld<VSTRUCT:nregs><VDC:mode>. 
*/ - BUILTIN_VDC (LOADSTRUCT, ld2, 0) - BUILTIN_VDC (LOADSTRUCT, ld3, 0) - BUILTIN_VDC (LOADSTRUCT, ld4, 0) + BUILTIN_VDC (LOADSTRUCT, ld2, 0, ALL) + BUILTIN_VDC (LOADSTRUCT, ld3, 0, ALL) + BUILTIN_VDC (LOADSTRUCT, ld4, 0, ALL) /* Implemented by aarch64_ld<VSTRUCT:nregs><VQ:mode>. */ - BUILTIN_VQ (LOADSTRUCT, ld2, 0) - BUILTIN_VQ (LOADSTRUCT, ld3, 0) - BUILTIN_VQ (LOADSTRUCT, ld4, 0) + BUILTIN_VQ (LOADSTRUCT, ld2, 0, ALL) + BUILTIN_VQ (LOADSTRUCT, ld3, 0, ALL) + BUILTIN_VQ (LOADSTRUCT, ld4, 0, ALL) /* Implemented by aarch64_ld<VSTRUCT:nregs>r<VALLDIF:mode>. */ - BUILTIN_VALLDIF (LOADSTRUCT, ld2r, 0) - BUILTIN_VALLDIF (LOADSTRUCT, ld3r, 0) - BUILTIN_VALLDIF (LOADSTRUCT, ld4r, 0) + BUILTIN_VALLDIF (LOADSTRUCT, ld2r, 0, ALL) + BUILTIN_VALLDIF (LOADSTRUCT, ld3r, 0, ALL) + BUILTIN_VALLDIF (LOADSTRUCT, ld4r, 0, ALL) /* Implemented by aarch64_ld<VSTRUCT:nregs>_lane<VQ:mode>. */ - BUILTIN_VALLDIF (LOADSTRUCT_LANE, ld2_lane, 0) - BUILTIN_VALLDIF (LOADSTRUCT_LANE, ld3_lane, 0) - BUILTIN_VALLDIF (LOADSTRUCT_LANE, ld4_lane, 0) + BUILTIN_VALLDIF (LOADSTRUCT_LANE, ld2_lane, 0, ALL) + BUILTIN_VALLDIF (LOADSTRUCT_LANE, ld3_lane, 0, ALL) + BUILTIN_VALLDIF (LOADSTRUCT_LANE, ld4_lane, 0, ALL) /* Implemented by aarch64_st<VSTRUCT:nregs><VDC:mode>. */ - BUILTIN_VDC (STORESTRUCT, st2, 0) - BUILTIN_VDC (STORESTRUCT, st3, 0) - BUILTIN_VDC (STORESTRUCT, st4, 0) + BUILTIN_VDC (STORESTRUCT, st2, 0, ALL) + BUILTIN_VDC (STORESTRUCT, st3, 0, ALL) + BUILTIN_VDC (STORESTRUCT, st4, 0, ALL) /* Implemented by aarch64_st<VSTRUCT:nregs><VQ:mode>. */ - BUILTIN_VQ (STORESTRUCT, st2, 0) - BUILTIN_VQ (STORESTRUCT, st3, 0) - BUILTIN_VQ (STORESTRUCT, st4, 0) - - BUILTIN_VALLDIF (STORESTRUCT_LANE, st2_lane, 0) - BUILTIN_VALLDIF (STORESTRUCT_LANE, st3_lane, 0) - BUILTIN_VALLDIF (STORESTRUCT_LANE, st4_lane, 0) - - BUILTIN_VQW (BINOP, saddl2, 0) - BUILTIN_VQW (BINOP, uaddl2, 0) - BUILTIN_VQW (BINOP, ssubl2, 0) - BUILTIN_VQW (BINOP, usubl2, 0) - BUILTIN_VQW (BINOP, saddw2, 0) - BUILTIN_VQW (BINOP, uaddw2, 0) - BUILTIN_VQW (BINOP, ssubw2, 0) - BUILTIN_VQW (BINOP, usubw2, 0) + BUILTIN_VQ (STORESTRUCT, st2, 0, ALL) + BUILTIN_VQ (STORESTRUCT, st3, 0, ALL) + BUILTIN_VQ (STORESTRUCT, st4, 0, ALL) + + BUILTIN_VALLDIF (STORESTRUCT_LANE, st2_lane, 0, ALL) + BUILTIN_VALLDIF (STORESTRUCT_LANE, st3_lane, 0, ALL) + BUILTIN_VALLDIF (STORESTRUCT_LANE, st4_lane, 0, ALL) + + BUILTIN_VQW (BINOP, saddl2, 0, NONE) + BUILTIN_VQW (BINOP, uaddl2, 0, NONE) + BUILTIN_VQW (BINOP, ssubl2, 0, NONE) + BUILTIN_VQW (BINOP, usubl2, 0, NONE) + BUILTIN_VQW (BINOP, saddw2, 0, NONE) + BUILTIN_VQW (BINOP, uaddw2, 0, NONE) + BUILTIN_VQW (BINOP, ssubw2, 0, NONE) + BUILTIN_VQW (BINOP, usubw2, 0, NONE) /* Implemented by aarch64_<ANY_EXTEND:su><ADDSUB:optab>l<mode>. */ - BUILTIN_VD_BHSI (BINOP, saddl, 0) - BUILTIN_VD_BHSI (BINOP, uaddl, 0) - BUILTIN_VD_BHSI (BINOP, ssubl, 0) - BUILTIN_VD_BHSI (BINOP, usubl, 0) + BUILTIN_VD_BHSI (BINOP, saddl, 0, NONE) + BUILTIN_VD_BHSI (BINOP, uaddl, 0, NONE) + BUILTIN_VD_BHSI (BINOP, ssubl, 0, NONE) + BUILTIN_VD_BHSI (BINOP, usubl, 0, NONE) /* Implemented by aarch64_<ANY_EXTEND:su><ADDSUB:optab>w<mode>. */ - BUILTIN_VD_BHSI (BINOP, saddw, 0) - BUILTIN_VD_BHSI (BINOP, uaddw, 0) - BUILTIN_VD_BHSI (BINOP, ssubw, 0) - BUILTIN_VD_BHSI (BINOP, usubw, 0) + BUILTIN_VD_BHSI (BINOP, saddw, 0, NONE) + BUILTIN_VD_BHSI (BINOP, uaddw, 0, NONE) + BUILTIN_VD_BHSI (BINOP, ssubw, 0, NONE) + BUILTIN_VD_BHSI (BINOP, usubw, 0, NONE) /* Implemented by aarch64_<sur>h<addsub><mode>. 
*/ - BUILTIN_VDQ_BHSI (BINOP, shadd, 0) - BUILTIN_VDQ_BHSI (BINOP, shsub, 0) - BUILTIN_VDQ_BHSI (BINOP, uhadd, 0) - BUILTIN_VDQ_BHSI (BINOP, uhsub, 0) - BUILTIN_VDQ_BHSI (BINOP, srhadd, 0) - BUILTIN_VDQ_BHSI (BINOP, urhadd, 0) + BUILTIN_VDQ_BHSI (BINOP, shadd, 0, NONE) + BUILTIN_VDQ_BHSI (BINOP, shsub, 0, NONE) + BUILTIN_VDQ_BHSI (BINOP, uhadd, 0, NONE) + BUILTIN_VDQ_BHSI (BINOP, uhsub, 0, NONE) + BUILTIN_VDQ_BHSI (BINOP, srhadd, 0, NONE) + BUILTIN_VDQ_BHSI (BINOP, urhadd, 0, NONE) /* Implemented by aarch64_<sur><addsub>hn<mode>. */ - BUILTIN_VQN (BINOP, addhn, 0) - BUILTIN_VQN (BINOP, subhn, 0) - BUILTIN_VQN (BINOP, raddhn, 0) - BUILTIN_VQN (BINOP, rsubhn, 0) + BUILTIN_VQN (BINOP, addhn, 0, NONE) + BUILTIN_VQN (BINOP, subhn, 0, NONE) + BUILTIN_VQN (BINOP, raddhn, 0, NONE) + BUILTIN_VQN (BINOP, rsubhn, 0, NONE) /* Implemented by aarch64_<sur><addsub>hn2<mode>. */ - BUILTIN_VQN (TERNOP, addhn2, 0) - BUILTIN_VQN (TERNOP, subhn2, 0) - BUILTIN_VQN (TERNOP, raddhn2, 0) - BUILTIN_VQN (TERNOP, rsubhn2, 0) + BUILTIN_VQN (TERNOP, addhn2, 0, NONE) + BUILTIN_VQN (TERNOP, subhn2, 0, NONE) + BUILTIN_VQN (TERNOP, raddhn2, 0, NONE) + BUILTIN_VQN (TERNOP, rsubhn2, 0, NONE) - BUILTIN_VSQN_HSDI (UNOP, sqmovun, 0) + BUILTIN_VSQN_HSDI (UNOPUS, sqmovun, 0, ALL) /* Implemented by aarch64_<sur>qmovn<mode>. */ - BUILTIN_VSQN_HSDI (UNOP, sqmovn, 0) - BUILTIN_VSQN_HSDI (UNOP, uqmovn, 0) + BUILTIN_VSQN_HSDI (UNOP, sqmovn, 0, ALL) + BUILTIN_VSQN_HSDI (UNOP, uqmovn, 0, ALL) /* Implemented by aarch64_s<optab><mode>. */ - BUILTIN_VSDQ_I (UNOP, sqabs, 0) - BUILTIN_VSDQ_I (UNOP, sqneg, 0) + BUILTIN_VSDQ_I (UNOP, sqabs, 0, ALL) + BUILTIN_VSDQ_I (UNOP, sqneg, 0, ALL) /* Implemented by aarch64_sqdml<SBINQOPS:as>l<mode>. */ - BUILTIN_VSD_HSI (TERNOP, sqdmlal, 0) - BUILTIN_VSD_HSI (TERNOP, sqdmlsl, 0) + BUILTIN_VSD_HSI (TERNOP, sqdmlal, 0, ALL) + BUILTIN_VSD_HSI (TERNOP, sqdmlsl, 0, ALL) /* Implemented by aarch64_sqdml<SBINQOPS:as>l_lane<mode>. */ - BUILTIN_VSD_HSI (QUADOP_LANE, sqdmlal_lane, 0) - BUILTIN_VSD_HSI (QUADOP_LANE, sqdmlsl_lane, 0) + BUILTIN_VSD_HSI (QUADOP_LANE, sqdmlal_lane, 0, ALL) + BUILTIN_VSD_HSI (QUADOP_LANE, sqdmlsl_lane, 0, ALL) /* Implemented by aarch64_sqdml<SBINQOPS:as>l_laneq<mode>. */ - BUILTIN_VSD_HSI (QUADOP_LANE, sqdmlal_laneq, 0) - BUILTIN_VSD_HSI (QUADOP_LANE, sqdmlsl_laneq, 0) + BUILTIN_VSD_HSI (QUADOP_LANE, sqdmlal_laneq, 0, ALL) + BUILTIN_VSD_HSI (QUADOP_LANE, sqdmlsl_laneq, 0, ALL) /* Implemented by aarch64_sqdml<SBINQOPS:as>l_n<mode>. 
*/ - BUILTIN_VD_HSI (TERNOP, sqdmlal_n, 0) - BUILTIN_VD_HSI (TERNOP, sqdmlsl_n, 0) - - BUILTIN_VQ_HSI (TERNOP, sqdmlal2, 0) - BUILTIN_VQ_HSI (TERNOP, sqdmlsl2, 0) - BUILTIN_VQ_HSI (QUADOP_LANE, sqdmlal2_lane, 0) - BUILTIN_VQ_HSI (QUADOP_LANE, sqdmlsl2_lane, 0) - BUILTIN_VQ_HSI (QUADOP_LANE, sqdmlal2_laneq, 0) - BUILTIN_VQ_HSI (QUADOP_LANE, sqdmlsl2_laneq, 0) - BUILTIN_VQ_HSI (TERNOP, sqdmlal2_n, 0) - BUILTIN_VQ_HSI (TERNOP, sqdmlsl2_n, 0) - - BUILTIN_VD_BHSI (BINOP, intrinsic_vec_smult_lo_, 0) - BUILTIN_VD_BHSI (BINOPU, intrinsic_vec_umult_lo_, 0) - - BUILTIN_VQW (BINOP, vec_widen_smult_hi_, 10) - BUILTIN_VQW (BINOPU, vec_widen_umult_hi_, 10) - - BUILTIN_VD_HSI (TERNOP_LANE, vec_smult_lane_, 0) - BUILTIN_VD_HSI (QUADOP_LANE, vec_smlal_lane_, 0) - BUILTIN_VD_HSI (TERNOP_LANE, vec_smult_laneq_, 0) - BUILTIN_VD_HSI (QUADOP_LANE, vec_smlal_laneq_, 0) - BUILTIN_VD_HSI (TERNOPU_LANE, vec_umult_lane_, 0) - BUILTIN_VD_HSI (QUADOPU_LANE, vec_umlal_lane_, 0) - BUILTIN_VD_HSI (TERNOPU_LANE, vec_umult_laneq_, 0) - BUILTIN_VD_HSI (QUADOPU_LANE, vec_umlal_laneq_, 0) - - BUILTIN_VSD_HSI (BINOP, sqdmull, 0) - BUILTIN_VSD_HSI (TERNOP_LANE, sqdmull_lane, 0) - BUILTIN_VSD_HSI (TERNOP_LANE, sqdmull_laneq, 0) - BUILTIN_VD_HSI (BINOP, sqdmull_n, 0) - BUILTIN_VQ_HSI (BINOP, sqdmull2, 0) - BUILTIN_VQ_HSI (TERNOP_LANE, sqdmull2_lane, 0) - BUILTIN_VQ_HSI (TERNOP_LANE, sqdmull2_laneq, 0) - BUILTIN_VQ_HSI (BINOP, sqdmull2_n, 0) + BUILTIN_VD_HSI (TERNOP, sqdmlal_n, 0, ALL) + BUILTIN_VD_HSI (TERNOP, sqdmlsl_n, 0, ALL) + + BUILTIN_VQ_HSI (TERNOP, sqdmlal2, 0, ALL) + BUILTIN_VQ_HSI (TERNOP, sqdmlsl2, 0, ALL) + BUILTIN_VQ_HSI (QUADOP_LANE, sqdmlal2_lane, 0, ALL) + BUILTIN_VQ_HSI (QUADOP_LANE, sqdmlsl2_lane, 0, ALL) + BUILTIN_VQ_HSI (QUADOP_LANE, sqdmlal2_laneq, 0, ALL) + BUILTIN_VQ_HSI (QUADOP_LANE, sqdmlsl2_laneq, 0, ALL) + BUILTIN_VQ_HSI (TERNOP, sqdmlal2_n, 0, ALL) + BUILTIN_VQ_HSI (TERNOP, sqdmlsl2_n, 0, ALL) + + BUILTIN_VD_BHSI (BINOP, intrinsic_vec_smult_lo_, 0, ALL) + BUILTIN_VD_BHSI (BINOPU, intrinsic_vec_umult_lo_, 0, ALL) + + BUILTIN_VQW (BINOP, vec_widen_smult_hi_, 10, ALL) + BUILTIN_VQW (BINOPU, vec_widen_umult_hi_, 10, ALL) + + BUILTIN_VD_HSI (TERNOP_LANE, vec_smult_lane_, 0, ALL) + BUILTIN_VD_HSI (QUADOP_LANE, vec_smlal_lane_, 0, ALL) + BUILTIN_VD_HSI (TERNOP_LANE, vec_smult_laneq_, 0, ALL) + BUILTIN_VD_HSI (QUADOP_LANE, vec_smlal_laneq_, 0, ALL) + BUILTIN_VD_HSI (TERNOPU_LANE, vec_umult_lane_, 0, ALL) + BUILTIN_VD_HSI (QUADOPU_LANE, vec_umlal_lane_, 0, ALL) + BUILTIN_VD_HSI (TERNOPU_LANE, vec_umult_laneq_, 0, ALL) + BUILTIN_VD_HSI (QUADOPU_LANE, vec_umlal_laneq_, 0, ALL) + + BUILTIN_VSD_HSI (BINOP, sqdmull, 0, ALL) + BUILTIN_VSD_HSI (TERNOP_LANE, sqdmull_lane, 0, ALL) + BUILTIN_VSD_HSI (TERNOP_LANE, sqdmull_laneq, 0, ALL) + BUILTIN_VD_HSI (BINOP, sqdmull_n, 0, ALL) + BUILTIN_VQ_HSI (BINOP, sqdmull2, 0, ALL) + BUILTIN_VQ_HSI (TERNOP_LANE, sqdmull2_lane, 0, ALL) + BUILTIN_VQ_HSI (TERNOP_LANE, sqdmull2_laneq, 0, ALL) + BUILTIN_VQ_HSI (BINOP, sqdmull2_n, 0, ALL) /* Implemented by aarch64_sq<r>dmulh<mode>. */ - BUILTIN_VSDQ_HSI (BINOP, sqdmulh, 0) - BUILTIN_VSDQ_HSI (BINOP, sqrdmulh, 0) + BUILTIN_VSDQ_HSI (BINOP, sqdmulh, 0, ALL) + BUILTIN_VSDQ_HSI (BINOP, sqrdmulh, 0, ALL) /* Implemented by aarch64_sq<r>dmulh_lane<q><mode>. 
*/ - BUILTIN_VSDQ_HSI (TERNOP_LANE, sqdmulh_lane, 0) - BUILTIN_VSDQ_HSI (TERNOP_LANE, sqdmulh_laneq, 0) - BUILTIN_VSDQ_HSI (TERNOP_LANE, sqrdmulh_lane, 0) - BUILTIN_VSDQ_HSI (TERNOP_LANE, sqrdmulh_laneq, 0) + BUILTIN_VSDQ_HSI (TERNOP_LANE, sqdmulh_lane, 0, ALL) + BUILTIN_VSDQ_HSI (TERNOP_LANE, sqdmulh_laneq, 0, ALL) + BUILTIN_VSDQ_HSI (TERNOP_LANE, sqrdmulh_lane, 0, ALL) + BUILTIN_VSDQ_HSI (TERNOP_LANE, sqrdmulh_laneq, 0, ALL) - BUILTIN_VSDQ_I_DI (BINOP, ashl, 3) + BUILTIN_VSDQ_I_DI (BINOP, ashl, 3, ALL) /* Implemented by aarch64_<sur>shl<mode>. */ - BUILTIN_VSDQ_I_DI (BINOP, sshl, 0) - BUILTIN_VSDQ_I_DI (BINOP_UUS, ushl, 0) - BUILTIN_VSDQ_I_DI (BINOP, srshl, 0) - BUILTIN_VSDQ_I_DI (BINOP_UUS, urshl, 0) + BUILTIN_VSDQ_I_DI (BINOP, sshl, 0, ALL) + BUILTIN_VSDQ_I_DI (BINOP_UUS, ushl, 0, ALL) + BUILTIN_VSDQ_I_DI (BINOP, srshl, 0, ALL) + BUILTIN_VSDQ_I_DI (BINOP_UUS, urshl, 0, ALL) /* Implemented by aarch64_<sur><dotprod>{_lane}{q}<dot_mode>. */ - BUILTIN_VB (TERNOP, sdot, 0) - BUILTIN_VB (TERNOPU, udot, 0) - BUILTIN_VB (TERNOP_SSUS, usdot, 0) - BUILTIN_VB (QUADOP_LANE, sdot_lane, 0) - BUILTIN_VB (QUADOPU_LANE, udot_lane, 0) - BUILTIN_VB (QUADOP_LANE, sdot_laneq, 0) - BUILTIN_VB (QUADOPU_LANE, udot_laneq, 0) - BUILTIN_VB (QUADOPSSUS_LANE_QUADTUP, usdot_lane, 0) - BUILTIN_VB (QUADOPSSUS_LANE_QUADTUP, usdot_laneq, 0) - BUILTIN_VB (QUADOPSSSU_LANE_QUADTUP, sudot_lane, 0) - BUILTIN_VB (QUADOPSSSU_LANE_QUADTUP, sudot_laneq, 0) + BUILTIN_VB (TERNOP, sdot, 0, ALL) + BUILTIN_VB (TERNOPU, udot, 0, ALL) + BUILTIN_VB (TERNOP_SSUS, usdot, 0, ALL) + BUILTIN_VB (QUADOP_LANE, sdot_lane, 0, ALL) + BUILTIN_VB (QUADOPU_LANE, udot_lane, 0, ALL) + BUILTIN_VB (QUADOP_LANE, sdot_laneq, 0, ALL) + BUILTIN_VB (QUADOPU_LANE, udot_laneq, 0, ALL) + BUILTIN_VB (QUADOPSSUS_LANE_QUADTUP, usdot_lane, 0, ALL) + BUILTIN_VB (QUADOPSSUS_LANE_QUADTUP, usdot_laneq, 0, ALL) + BUILTIN_VB (QUADOPSSSU_LANE_QUADTUP, sudot_lane, 0, ALL) + BUILTIN_VB (QUADOPSSSU_LANE_QUADTUP, sudot_laneq, 0, ALL) /* Implemented by aarch64_fcadd<rot><mode>. */ - BUILTIN_VHSDF (BINOP, fcadd90, 0) - BUILTIN_VHSDF (BINOP, fcadd270, 0) + BUILTIN_VHSDF (BINOP, fcadd90, 0, FP) + BUILTIN_VHSDF (BINOP, fcadd270, 0, FP) /* Implemented by aarch64_fcmla{_lane}{q}<rot><mode>. 
*/ - BUILTIN_VHSDF (TERNOP, fcmla0, 0) - BUILTIN_VHSDF (TERNOP, fcmla90, 0) - BUILTIN_VHSDF (TERNOP, fcmla180, 0) - BUILTIN_VHSDF (TERNOP, fcmla270, 0) - BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane0, 0) - BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane90, 0) - BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane180, 0) - BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane270, 0) - - BUILTIN_VQ_HSF (QUADOP_LANE_PAIR, fcmlaq_lane0, 0) - BUILTIN_VQ_HSF (QUADOP_LANE_PAIR, fcmlaq_lane90, 0) - BUILTIN_VQ_HSF (QUADOP_LANE_PAIR, fcmlaq_lane180, 0) - BUILTIN_VQ_HSF (QUADOP_LANE_PAIR, fcmlaq_lane270, 0) - - BUILTIN_VDQ_I (SHIFTIMM, ashr, 3) - VAR1 (SHIFTIMM, ashr_simd, 0, di) - BUILTIN_VDQ_I (SHIFTIMM, lshr, 3) - VAR1 (USHIFTIMM, lshr_simd, 0, di) + BUILTIN_VHSDF (TERNOP, fcmla0, 0, ALL) + BUILTIN_VHSDF (TERNOP, fcmla90, 0, ALL) + BUILTIN_VHSDF (TERNOP, fcmla180, 0, ALL) + BUILTIN_VHSDF (TERNOP, fcmla270, 0, ALL) + BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane0, 0, ALL) + BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane90, 0, ALL) + BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane180, 0, ALL) + BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane270, 0, ALL) + + BUILTIN_VQ_HSF (QUADOP_LANE_PAIR, fcmlaq_lane0, 0, ALL) + BUILTIN_VQ_HSF (QUADOP_LANE_PAIR, fcmlaq_lane90, 0, ALL) + BUILTIN_VQ_HSF (QUADOP_LANE_PAIR, fcmlaq_lane180, 0, ALL) + BUILTIN_VQ_HSF (QUADOP_LANE_PAIR, fcmlaq_lane270, 0, ALL) + + BUILTIN_VDQ_I (SHIFTIMM, ashr, 3, ALL) + VAR1 (SHIFTIMM, ashr_simd, 0, ALL, di) + BUILTIN_VDQ_I (SHIFTIMM, lshr, 3, ALL) + VAR1 (USHIFTIMM, lshr_simd, 0, ALL, di) /* Implemented by aarch64_<sur>shr_n<mode>. */ - BUILTIN_VSDQ_I_DI (SHIFTIMM, srshr_n, 0) - BUILTIN_VSDQ_I_DI (USHIFTIMM, urshr_n, 0) + BUILTIN_VSDQ_I_DI (SHIFTIMM, srshr_n, 0, ALL) + BUILTIN_VSDQ_I_DI (USHIFTIMM, urshr_n, 0, ALL) /* Implemented by aarch64_<sur>sra_n<mode>. */ - BUILTIN_VSDQ_I_DI (SHIFTACC, ssra_n, 0) - BUILTIN_VSDQ_I_DI (USHIFTACC, usra_n, 0) - BUILTIN_VSDQ_I_DI (SHIFTACC, srsra_n, 0) - BUILTIN_VSDQ_I_DI (USHIFTACC, ursra_n, 0) + BUILTIN_VSDQ_I_DI (SHIFTACC, ssra_n, 0, ALL) + BUILTIN_VSDQ_I_DI (USHIFTACC, usra_n, 0, ALL) + BUILTIN_VSDQ_I_DI (SHIFTACC, srsra_n, 0, ALL) + BUILTIN_VSDQ_I_DI (USHIFTACC, ursra_n, 0, ALL) /* Implemented by aarch64_<sur>shll_n<mode>. */ - BUILTIN_VD_BHSI (SHIFTIMM, sshll_n, 0) - BUILTIN_VD_BHSI (USHIFTIMM, ushll_n, 0) + BUILTIN_VD_BHSI (SHIFTIMM, sshll_n, 0, ALL) + BUILTIN_VD_BHSI (USHIFTIMM, ushll_n, 0, ALL) /* Implemented by aarch64_<sur>shll2_n<mode>. */ - BUILTIN_VQW (SHIFTIMM, sshll2_n, 0) - BUILTIN_VQW (SHIFTIMM, ushll2_n, 0) + BUILTIN_VQW (SHIFTIMM, sshll2_n, 0, ALL) + BUILTIN_VQW (SHIFTIMM, ushll2_n, 0, ALL) /* Implemented by aarch64_<sur>q<r>shr<u>n_n<mode>. */ - BUILTIN_VSQN_HSDI (SHIFTIMM, sqshrun_n, 0) - BUILTIN_VSQN_HSDI (SHIFTIMM, sqrshrun_n, 0) - BUILTIN_VSQN_HSDI (SHIFTIMM, sqshrn_n, 0) - BUILTIN_VSQN_HSDI (USHIFTIMM, uqshrn_n, 0) - BUILTIN_VSQN_HSDI (SHIFTIMM, sqrshrn_n, 0) - BUILTIN_VSQN_HSDI (USHIFTIMM, uqrshrn_n, 0) + BUILTIN_VSQN_HSDI (SHIFTIMM, sqshrun_n, 0, ALL) + BUILTIN_VSQN_HSDI (SHIFTIMM, sqrshrun_n, 0, ALL) + BUILTIN_VSQN_HSDI (SHIFTIMM, sqshrn_n, 0, ALL) + BUILTIN_VSQN_HSDI (USHIFTIMM, uqshrn_n, 0, ALL) + BUILTIN_VSQN_HSDI (SHIFTIMM, sqrshrn_n, 0, ALL) + BUILTIN_VSQN_HSDI (USHIFTIMM, uqrshrn_n, 0, ALL) /* Implemented by aarch64_<sur>s<lr>i_n<mode>. 
*/ - BUILTIN_VSDQ_I_DI (SHIFTINSERT, ssri_n, 0) - BUILTIN_VSDQ_I_DI (USHIFTACC, usri_n, 0) - BUILTIN_VSDQ_I_DI (SHIFTINSERT, ssli_n, 0) - VAR2 (SHIFTINSERTP, ssli_n, 0, di, v2di) - BUILTIN_VSDQ_I_DI (USHIFTACC, usli_n, 0) + BUILTIN_VSDQ_I_DI (SHIFTINSERT, ssri_n, 0, ALL) + BUILTIN_VSDQ_I_DI (USHIFTACC, usri_n, 0, ALL) + BUILTIN_VSDQ_I_DI (SHIFTINSERT, ssli_n, 0, ALL) + VAR2 (SHIFTINSERTP, ssli_n, 0, ALL, di, v2di) + BUILTIN_VSDQ_I_DI (USHIFTACC, usli_n, 0, ALL) /* Implemented by aarch64_<sur>qshl<u>_n<mode>. */ - BUILTIN_VSDQ_I (SHIFTIMM_USS, sqshlu_n, 0) - BUILTIN_VSDQ_I (SHIFTIMM, sqshl_n, 0) - BUILTIN_VSDQ_I (USHIFTIMM, uqshl_n, 0) + BUILTIN_VSDQ_I (SHIFTIMM_USS, sqshlu_n, 0, ALL) + BUILTIN_VSDQ_I (SHIFTIMM, sqshl_n, 0, ALL) + BUILTIN_VSDQ_I (USHIFTIMM, uqshl_n, 0, ALL) /* Implemented by aarch64_reduc_plus_<mode>. */ - BUILTIN_VALL (UNOP, reduc_plus_scal_, 10) + BUILTIN_VALL (UNOP, reduc_plus_scal_, 10, ALL) /* Implemented by reduc_<maxmin_uns>_scal_<mode> (producing scalar). */ - BUILTIN_VDQIF_F16 (UNOP, reduc_smax_scal_, 10) - BUILTIN_VDQIF_F16 (UNOP, reduc_smin_scal_, 10) - BUILTIN_VDQ_BHSI (UNOPU, reduc_umax_scal_, 10) - BUILTIN_VDQ_BHSI (UNOPU, reduc_umin_scal_, 10) - BUILTIN_VHSDF (UNOP, reduc_smax_nan_scal_, 10) - BUILTIN_VHSDF (UNOP, reduc_smin_nan_scal_, 10) + BUILTIN_VDQIF_F16 (UNOP, reduc_smax_scal_, 10, NONE) + BUILTIN_VDQIF_F16 (UNOP, reduc_smin_scal_, 10, NONE) + BUILTIN_VDQ_BHSI (UNOPU, reduc_umax_scal_, 10, NONE) + BUILTIN_VDQ_BHSI (UNOPU, reduc_umin_scal_, 10, NONE) + BUILTIN_VHSDF (UNOP, reduc_smax_nan_scal_, 10, NONE) + BUILTIN_VHSDF (UNOP, reduc_smin_nan_scal_, 10, NONE) /* Implemented by <maxmin_uns><mode>3. smax variants map to fmaxnm, smax_nan variants map to fmax. */ - BUILTIN_VDQ_BHSI (BINOP, smax, 3) - BUILTIN_VDQ_BHSI (BINOP, smin, 3) - BUILTIN_VDQ_BHSI (BINOP, umax, 3) - BUILTIN_VDQ_BHSI (BINOP, umin, 3) - BUILTIN_VHSDF_DF (BINOP, smax_nan, 3) - BUILTIN_VHSDF_DF (BINOP, smin_nan, 3) + BUILTIN_VDQ_BHSI (BINOP, smax, 3, NONE) + BUILTIN_VDQ_BHSI (BINOP, smin, 3, NONE) + BUILTIN_VDQ_BHSI (BINOP, umax, 3, NONE) + BUILTIN_VDQ_BHSI (BINOP, umin, 3, NONE) + BUILTIN_VHSDF_DF (BINOP, smax_nan, 3, NONE) + BUILTIN_VHSDF_DF (BINOP, smin_nan, 3, NONE) /* Implemented by <maxmin_uns><mode>3. */ - BUILTIN_VHSDF_HSDF (BINOP, fmax, 3) - BUILTIN_VHSDF_HSDF (BINOP, fmin, 3) + BUILTIN_VHSDF_HSDF (BINOP, fmax, 3, FP) + BUILTIN_VHSDF_HSDF (BINOP, fmin, 3, FP) /* Implemented by aarch64_<maxmin_uns>p<mode>. */ - BUILTIN_VDQ_BHSI (BINOP, smaxp, 0) - BUILTIN_VDQ_BHSI (BINOP, sminp, 0) - BUILTIN_VDQ_BHSI (BINOP, umaxp, 0) - BUILTIN_VDQ_BHSI (BINOP, uminp, 0) - BUILTIN_VHSDF (BINOP, smaxp, 0) - BUILTIN_VHSDF (BINOP, sminp, 0) - BUILTIN_VHSDF (BINOP, smax_nanp, 0) - BUILTIN_VHSDF (BINOP, smin_nanp, 0) + BUILTIN_VDQ_BHSI (BINOP, smaxp, 0, NONE) + BUILTIN_VDQ_BHSI (BINOP, sminp, 0, NONE) + BUILTIN_VDQ_BHSI (BINOP, umaxp, 0, NONE) + BUILTIN_VDQ_BHSI (BINOP, uminp, 0, NONE) + BUILTIN_VHSDF (BINOP, smaxp, 0, NONE) + BUILTIN_VHSDF (BINOP, sminp, 0, NONE) + BUILTIN_VHSDF (BINOP, smax_nanp, 0, NONE) + BUILTIN_VHSDF (BINOP, smin_nanp, 0, NONE) /* Implemented by <frint_pattern><mode>2. 
*/ - BUILTIN_VHSDF (UNOP, btrunc, 2) - BUILTIN_VHSDF (UNOP, ceil, 2) - BUILTIN_VHSDF (UNOP, floor, 2) - BUILTIN_VHSDF (UNOP, nearbyint, 2) - BUILTIN_VHSDF (UNOP, rint, 2) - BUILTIN_VHSDF (UNOP, round, 2) - BUILTIN_VHSDF_DF (UNOP, frintn, 2) - - VAR1 (UNOP, btrunc, 2, hf) - VAR1 (UNOP, ceil, 2, hf) - VAR1 (UNOP, floor, 2, hf) - VAR1 (UNOP, frintn, 2, hf) - VAR1 (UNOP, nearbyint, 2, hf) - VAR1 (UNOP, rint, 2, hf) - VAR1 (UNOP, round, 2, hf) + BUILTIN_VHSDF (UNOP, btrunc, 2, FP) + BUILTIN_VHSDF (UNOP, ceil, 2, FP) + BUILTIN_VHSDF (UNOP, floor, 2, FP) + BUILTIN_VHSDF (UNOP, nearbyint, 2, FP) + BUILTIN_VHSDF (UNOP, rint, 2, FP) + BUILTIN_VHSDF (UNOP, round, 2, FP) + BUILTIN_VHSDF_HSDF (UNOP, frintn, 2, FP) + + VAR1 (UNOP, btrunc, 2, FP, hf) + VAR1 (UNOP, ceil, 2, FP, hf) + VAR1 (UNOP, floor, 2, FP, hf) + VAR1 (UNOP, nearbyint, 2, FP, hf) + VAR1 (UNOP, rint, 2, FP, hf) + VAR1 (UNOP, round, 2, FP, hf) /* Implemented by l<fcvt_pattern><su_optab><VQDF:mode><vcvt_target>2. */ - VAR1 (UNOP, lbtruncv4hf, 2, v4hi) - VAR1 (UNOP, lbtruncv8hf, 2, v8hi) - VAR1 (UNOP, lbtruncv2sf, 2, v2si) - VAR1 (UNOP, lbtruncv4sf, 2, v4si) - VAR1 (UNOP, lbtruncv2df, 2, v2di) - - VAR1 (UNOPUS, lbtruncuv4hf, 2, v4hi) - VAR1 (UNOPUS, lbtruncuv8hf, 2, v8hi) - VAR1 (UNOPUS, lbtruncuv2sf, 2, v2si) - VAR1 (UNOPUS, lbtruncuv4sf, 2, v4si) - VAR1 (UNOPUS, lbtruncuv2df, 2, v2di) - - VAR1 (UNOP, lroundv4hf, 2, v4hi) - VAR1 (UNOP, lroundv8hf, 2, v8hi) - VAR1 (UNOP, lroundv2sf, 2, v2si) - VAR1 (UNOP, lroundv4sf, 2, v4si) - VAR1 (UNOP, lroundv2df, 2, v2di) + VAR1 (UNOP, lbtruncv4hf, 2, FP, v4hi) + VAR1 (UNOP, lbtruncv8hf, 2, FP, v8hi) + VAR1 (UNOP, lbtruncv2sf, 2, FP, v2si) + VAR1 (UNOP, lbtruncv4sf, 2, FP, v4si) + VAR1 (UNOP, lbtruncv2df, 2, FP, v2di) + + VAR1 (UNOPUS, lbtruncuv4hf, 2, FP, v4hi) + VAR1 (UNOPUS, lbtruncuv8hf, 2, FP, v8hi) + VAR1 (UNOPUS, lbtruncuv2sf, 2, FP, v2si) + VAR1 (UNOPUS, lbtruncuv4sf, 2, FP, v4si) + VAR1 (UNOPUS, lbtruncuv2df, 2, FP, v2di) + + VAR1 (UNOP, lroundv4hf, 2, FP, v4hi) + VAR1 (UNOP, lroundv8hf, 2, FP, v8hi) + VAR1 (UNOP, lroundv2sf, 2, FP, v2si) + VAR1 (UNOP, lroundv4sf, 2, FP, v4si) + VAR1 (UNOP, lroundv2df, 2, FP, v2di) /* Implemented by l<fcvt_pattern><su_optab><GPF_F16:mode><GPI:mode>2. 
*/ - BUILTIN_GPI_I16 (UNOP, lroundhf, 2) - VAR1 (UNOP, lroundsf, 2, si) - VAR1 (UNOP, lrounddf, 2, di) - - VAR1 (UNOPUS, lrounduv4hf, 2, v4hi) - VAR1 (UNOPUS, lrounduv8hf, 2, v8hi) - VAR1 (UNOPUS, lrounduv2sf, 2, v2si) - VAR1 (UNOPUS, lrounduv4sf, 2, v4si) - VAR1 (UNOPUS, lrounduv2df, 2, v2di) - BUILTIN_GPI_I16 (UNOPUS, lrounduhf, 2) - VAR1 (UNOPUS, lroundusf, 2, si) - VAR1 (UNOPUS, lroundudf, 2, di) - - VAR1 (UNOP, lceilv4hf, 2, v4hi) - VAR1 (UNOP, lceilv8hf, 2, v8hi) - VAR1 (UNOP, lceilv2sf, 2, v2si) - VAR1 (UNOP, lceilv4sf, 2, v4si) - VAR1 (UNOP, lceilv2df, 2, v2di) - BUILTIN_GPI_I16 (UNOP, lceilhf, 2) - - VAR1 (UNOPUS, lceiluv4hf, 2, v4hi) - VAR1 (UNOPUS, lceiluv8hf, 2, v8hi) - VAR1 (UNOPUS, lceiluv2sf, 2, v2si) - VAR1 (UNOPUS, lceiluv4sf, 2, v4si) - VAR1 (UNOPUS, lceiluv2df, 2, v2di) - BUILTIN_GPI_I16 (UNOPUS, lceiluhf, 2) - VAR1 (UNOPUS, lceilusf, 2, si) - VAR1 (UNOPUS, lceiludf, 2, di) - - VAR1 (UNOP, lfloorv4hf, 2, v4hi) - VAR1 (UNOP, lfloorv8hf, 2, v8hi) - VAR1 (UNOP, lfloorv2sf, 2, v2si) - VAR1 (UNOP, lfloorv4sf, 2, v4si) - VAR1 (UNOP, lfloorv2df, 2, v2di) - BUILTIN_GPI_I16 (UNOP, lfloorhf, 2) - - VAR1 (UNOPUS, lflooruv4hf, 2, v4hi) - VAR1 (UNOPUS, lflooruv8hf, 2, v8hi) - VAR1 (UNOPUS, lflooruv2sf, 2, v2si) - VAR1 (UNOPUS, lflooruv4sf, 2, v4si) - VAR1 (UNOPUS, lflooruv2df, 2, v2di) - BUILTIN_GPI_I16 (UNOPUS, lflooruhf, 2) - VAR1 (UNOPUS, lfloorusf, 2, si) - VAR1 (UNOPUS, lfloorudf, 2, di) - - VAR1 (UNOP, lfrintnv4hf, 2, v4hi) - VAR1 (UNOP, lfrintnv8hf, 2, v8hi) - VAR1 (UNOP, lfrintnv2sf, 2, v2si) - VAR1 (UNOP, lfrintnv4sf, 2, v4si) - VAR1 (UNOP, lfrintnv2df, 2, v2di) - BUILTIN_GPI_I16 (UNOP, lfrintnhf, 2) - VAR1 (UNOP, lfrintnsf, 2, si) - VAR1 (UNOP, lfrintndf, 2, di) - - VAR1 (UNOPUS, lfrintnuv4hf, 2, v4hi) - VAR1 (UNOPUS, lfrintnuv8hf, 2, v8hi) - VAR1 (UNOPUS, lfrintnuv2sf, 2, v2si) - VAR1 (UNOPUS, lfrintnuv4sf, 2, v4si) - VAR1 (UNOPUS, lfrintnuv2df, 2, v2di) - BUILTIN_GPI_I16 (UNOPUS, lfrintnuhf, 2) - VAR1 (UNOPUS, lfrintnusf, 2, si) - VAR1 (UNOPUS, lfrintnudf, 2, di) + BUILTIN_GPI_I16 (UNOP, lroundhf, 2, FP) + VAR1 (UNOP, lroundsf, 2, FP, si) + VAR1 (UNOP, lrounddf, 2, FP, di) + + VAR1 (UNOPUS, lrounduv4hf, 2, FP, v4hi) + VAR1 (UNOPUS, lrounduv8hf, 2, FP, v8hi) + VAR1 (UNOPUS, lrounduv2sf, 2, FP, v2si) + VAR1 (UNOPUS, lrounduv4sf, 2, FP, v4si) + VAR1 (UNOPUS, lrounduv2df, 2, FP, v2di) + BUILTIN_GPI_I16 (UNOPUS, lrounduhf, 2, FP) + VAR1 (UNOPUS, lroundusf, 2, FP, si) + VAR1 (UNOPUS, lroundudf, 2, FP, di) + + VAR1 (UNOP, lceilv4hf, 2, FP, v4hi) + VAR1 (UNOP, lceilv8hf, 2, FP, v8hi) + VAR1 (UNOP, lceilv2sf, 2, FP, v2si) + VAR1 (UNOP, lceilv4sf, 2, FP, v4si) + VAR1 (UNOP, lceilv2df, 2, FP, v2di) + BUILTIN_GPI_I16 (UNOP, lceilhf, 2, FP) + + VAR1 (UNOPUS, lceiluv4hf, 2, FP, v4hi) + VAR1 (UNOPUS, lceiluv8hf, 2, FP, v8hi) + VAR1 (UNOPUS, lceiluv2sf, 2, FP, v2si) + VAR1 (UNOPUS, lceiluv4sf, 2, FP, v4si) + VAR1 (UNOPUS, lceiluv2df, 2, FP, v2di) + BUILTIN_GPI_I16 (UNOPUS, lceiluhf, 2, FP) + VAR1 (UNOPUS, lceilusf, 2, FP, si) + VAR1 (UNOPUS, lceiludf, 2, FP, di) + + VAR1 (UNOP, lfloorv4hf, 2, FP, v4hi) + VAR1 (UNOP, lfloorv8hf, 2, FP, v8hi) + VAR1 (UNOP, lfloorv2sf, 2, FP, v2si) + VAR1 (UNOP, lfloorv4sf, 2, FP, v4si) + VAR1 (UNOP, lfloorv2df, 2, FP, v2di) + BUILTIN_GPI_I16 (UNOP, lfloorhf, 2, FP) + + VAR1 (UNOPUS, lflooruv4hf, 2, FP, v4hi) + VAR1 (UNOPUS, lflooruv8hf, 2, FP, v8hi) + VAR1 (UNOPUS, lflooruv2sf, 2, FP, v2si) + VAR1 (UNOPUS, lflooruv4sf, 2, FP, v4si) + VAR1 (UNOPUS, lflooruv2df, 2, FP, v2di) + BUILTIN_GPI_I16 (UNOPUS, lflooruhf, 2, FP) + VAR1 (UNOPUS, lfloorusf, 2, FP, si) + 
VAR1 (UNOPUS, lfloorudf, 2, FP, di) + + VAR1 (UNOP, lfrintnv4hf, 2, FP, v4hi) + VAR1 (UNOP, lfrintnv8hf, 2, FP, v8hi) + VAR1 (UNOP, lfrintnv2sf, 2, FP, v2si) + VAR1 (UNOP, lfrintnv4sf, 2, FP, v4si) + VAR1 (UNOP, lfrintnv2df, 2, FP, v2di) + BUILTIN_GPI_I16 (UNOP, lfrintnhf, 2, FP) + VAR1 (UNOP, lfrintnsf, 2, FP, si) + VAR1 (UNOP, lfrintndf, 2, FP, di) + + VAR1 (UNOPUS, lfrintnuv4hf, 2, FP, v4hi) + VAR1 (UNOPUS, lfrintnuv8hf, 2, FP, v8hi) + VAR1 (UNOPUS, lfrintnuv2sf, 2, FP, v2si) + VAR1 (UNOPUS, lfrintnuv4sf, 2, FP, v4si) + VAR1 (UNOPUS, lfrintnuv2df, 2, FP, v2di) + BUILTIN_GPI_I16 (UNOPUS, lfrintnuhf, 2, FP) + VAR1 (UNOPUS, lfrintnusf, 2, FP, si) + VAR1 (UNOPUS, lfrintnudf, 2, FP, di) /* Implemented by <optab><fcvt_target><VDQF:mode>2. */ - VAR1 (UNOP, floatv4hi, 2, v4hf) - VAR1 (UNOP, floatv8hi, 2, v8hf) - VAR1 (UNOP, floatv2si, 2, v2sf) - VAR1 (UNOP, floatv4si, 2, v4sf) - VAR1 (UNOP, floatv2di, 2, v2df) + VAR1 (UNOP, floatv4hi, 2, ALL, v4hf) + VAR1 (UNOP, floatv8hi, 2, ALL, v8hf) + VAR1 (UNOP, floatv2si, 2, ALL, v2sf) + VAR1 (UNOP, floatv4si, 2, ALL, v4sf) + VAR1 (UNOP, floatv2di, 2, ALL, v2df) - VAR1 (UNOP, floatunsv4hi, 2, v4hf) - VAR1 (UNOP, floatunsv8hi, 2, v8hf) - VAR1 (UNOP, floatunsv2si, 2, v2sf) - VAR1 (UNOP, floatunsv4si, 2, v4sf) - VAR1 (UNOP, floatunsv2di, 2, v2df) + VAR1 (UNOP, floatunsv4hi, 2, ALL, v4hf) + VAR1 (UNOP, floatunsv8hi, 2, ALL, v8hf) + VAR1 (UNOP, floatunsv2si, 2, ALL, v2sf) + VAR1 (UNOP, floatunsv4si, 2, ALL, v4sf) + VAR1 (UNOP, floatunsv2di, 2, ALL, v2df) - VAR5 (UNOPU, bswap, 2, v4hi, v8hi, v2si, v4si, v2di) + VAR5 (UNOPU, bswap, 2, ALL, v4hi, v8hi, v2si, v4si, v2di) - BUILTIN_VB (UNOP, rbit, 0) + BUILTIN_VB (UNOP, rbit, 0, ALL) /* Implemented by aarch64_<PERMUTE:perm_insn><mode>. */ - BUILTIN_VALL (BINOP, zip1, 0) - BUILTIN_VALL (BINOP, zip2, 0) - BUILTIN_VALL (BINOP, uzp1, 0) - BUILTIN_VALL (BINOP, uzp2, 0) - BUILTIN_VALL (BINOP, trn1, 0) - BUILTIN_VALL (BINOP, trn2, 0) + BUILTIN_VALL (BINOP, zip1, 0, ALL) + BUILTIN_VALL (BINOP, zip2, 0, ALL) + BUILTIN_VALL (BINOP, uzp1, 0, ALL) + BUILTIN_VALL (BINOP, uzp2, 0, ALL) + BUILTIN_VALL (BINOP, trn1, 0, ALL) + BUILTIN_VALL (BINOP, trn2, 0, ALL) - BUILTIN_GPF_F16 (UNOP, frecpe, 0) - BUILTIN_GPF_F16 (UNOP, frecpx, 0) + BUILTIN_GPF_F16 (UNOP, frecpe, 0, ALL) + BUILTIN_GPF_F16 (UNOP, frecpx, 0, ALL) - BUILTIN_VDQ_SI (UNOP, urecpe, 0) + BUILTIN_VDQ_SI (UNOP, urecpe, 0, ALL) - BUILTIN_VHSDF (UNOP, frecpe, 0) - BUILTIN_VHSDF_HSDF (BINOP, frecps, 0) + BUILTIN_VHSDF (UNOP, frecpe, 0, ALL) + BUILTIN_VHSDF_HSDF (BINOP, frecps, 0, ALL) /* Implemented by a mixture of abs2 patterns. Note the DImode builtin is only ever used for the int64x1_t intrinsic, there is no scalar version. */ - BUILTIN_VSDQ_I_DI (UNOP, abs, 0) - BUILTIN_VHSDF (UNOP, abs, 2) - VAR1 (UNOP, abs, 2, hf) + BUILTIN_VSDQ_I_DI (UNOP, abs, 0, ALL) + BUILTIN_VHSDF (UNOP, abs, 2, ALL) + VAR1 (UNOP, abs, 2, ALL, hf) - BUILTIN_VQ_HSF (UNOP, vec_unpacks_hi_, 10) - VAR1 (BINOP, float_truncate_hi_, 0, v4sf) - VAR1 (BINOP, float_truncate_hi_, 0, v8hf) + BUILTIN_VQ_HSF (UNOP, vec_unpacks_hi_, 10, ALL) + VAR1 (BINOP, float_truncate_hi_, 0, ALL, v4sf) + VAR1 (BINOP, float_truncate_hi_, 0, ALL, v8hf) - VAR1 (UNOP, float_extend_lo_, 0, v2df) - VAR1 (UNOP, float_extend_lo_, 0, v4sf) - BUILTIN_VDF (UNOP, float_truncate_lo_, 0) + VAR1 (UNOP, float_extend_lo_, 0, ALL, v2df) + VAR1 (UNOP, float_extend_lo_, 0, ALL, v4sf) + BUILTIN_VDF (UNOP, float_truncate_lo_, 0, ALL) /* Implemented by aarch64_ld1<VALL_F16:mode>. 
*/ - BUILTIN_VALL_F16 (LOAD1, ld1, 0) - VAR1(STORE1P, ld1, 0, v2di) + BUILTIN_VALL_F16 (LOAD1, ld1, 0, ALL) + VAR1(STORE1P, ld1, 0, ALL, v2di) /* Implemented by aarch64_st1<VALL_F16:mode>. */ - BUILTIN_VALL_F16 (STORE1, st1, 0) - VAR1(STORE1P, st1, 0, v2di) + BUILTIN_VALL_F16 (STORE1, st1, 0, ALL) + VAR1(STORE1P, st1, 0, ALL, v2di) /* Implemented by aarch64_ld1x3<VALLDIF:mode>. */ - BUILTIN_VALLDIF (LOADSTRUCT, ld1x3, 0) + BUILTIN_VALLDIF (LOADSTRUCT, ld1x3, 0, ALL) /* Implemented by aarch64_ld1x4<VALLDIF:mode>. */ - BUILTIN_VALLDIF (LOADSTRUCT, ld1x4, 0) + BUILTIN_VALLDIF (LOADSTRUCT, ld1x4, 0, ALL) /* Implemented by aarch64_st1x2<VALLDIF:mode>. */ - BUILTIN_VALLDIF (STORESTRUCT, st1x2, 0) + BUILTIN_VALLDIF (STORESTRUCT, st1x2, 0, ALL) /* Implemented by aarch64_st1x3<VALLDIF:mode>. */ - BUILTIN_VALLDIF (STORESTRUCT, st1x3, 0) + BUILTIN_VALLDIF (STORESTRUCT, st1x3, 0, ALL) /* Implemented by aarch64_st1x4<VALLDIF:mode>. */ - BUILTIN_VALLDIF (STORESTRUCT, st1x4, 0) + BUILTIN_VALLDIF (STORESTRUCT, st1x4, 0, ALL) /* Implemented by fma<mode>4. */ - BUILTIN_VHSDF (TERNOP, fma, 4) - VAR1 (TERNOP, fma, 4, hf) + BUILTIN_VHSDF (TERNOP, fma, 4, ALL) + VAR1 (TERNOP, fma, 4, ALL, hf) /* Implemented by fnma<mode>4. */ - BUILTIN_VHSDF (TERNOP, fnma, 4) - VAR1 (TERNOP, fnma, 4, hf) + BUILTIN_VHSDF (TERNOP, fnma, 4, ALL) + VAR1 (TERNOP, fnma, 4, ALL, hf) /* Implemented by aarch64_simd_bsl<mode>. */ - BUILTIN_VDQQH (BSL_P, simd_bsl, 0) - VAR2 (BSL_P, simd_bsl,0, di, v2di) - BUILTIN_VSDQ_I_DI (BSL_U, simd_bsl, 0) - BUILTIN_VALLDIF (BSL_S, simd_bsl, 0) + BUILTIN_VDQQH (BSL_P, simd_bsl, 0, ALL) + VAR2 (BSL_P, simd_bsl,0, ALL, di, v2di) + BUILTIN_VSDQ_I_DI (BSL_U, simd_bsl, 0, ALL) + BUILTIN_VALLDIF (BSL_S, simd_bsl, 0, ALL) /* Implemented by aarch64_crypto_aes<op><mode>. */ - VAR1 (BINOPU, crypto_aese, 0, v16qi) - VAR1 (BINOPU, crypto_aesd, 0, v16qi) - VAR1 (UNOPU, crypto_aesmc, 0, v16qi) - VAR1 (UNOPU, crypto_aesimc, 0, v16qi) + VAR1 (BINOPU, crypto_aese, 0, ALL, v16qi) + VAR1 (BINOPU, crypto_aesd, 0, ALL, v16qi) + VAR1 (UNOPU, crypto_aesmc, 0, ALL, v16qi) + VAR1 (UNOPU, crypto_aesimc, 0, ALL, v16qi) /* Implemented by aarch64_crypto_sha1<op><mode>. */ - VAR1 (UNOPU, crypto_sha1h, 0, si) - VAR1 (BINOPU, crypto_sha1su1, 0, v4si) - VAR1 (TERNOPU, crypto_sha1c, 0, v4si) - VAR1 (TERNOPU, crypto_sha1m, 0, v4si) - VAR1 (TERNOPU, crypto_sha1p, 0, v4si) - VAR1 (TERNOPU, crypto_sha1su0, 0, v4si) + VAR1 (UNOPU, crypto_sha1h, 0, ALL, si) + VAR1 (BINOPU, crypto_sha1su1, 0, ALL, v4si) + VAR1 (TERNOPU, crypto_sha1c, 0, ALL, v4si) + VAR1 (TERNOPU, crypto_sha1m, 0, ALL, v4si) + VAR1 (TERNOPU, crypto_sha1p, 0, ALL, v4si) + VAR1 (TERNOPU, crypto_sha1su0, 0, ALL, v4si) /* Implemented by aarch64_crypto_sha256<op><mode>. */ - VAR1 (TERNOPU, crypto_sha256h, 0, v4si) - VAR1 (TERNOPU, crypto_sha256h2, 0, v4si) - VAR1 (BINOPU, crypto_sha256su0, 0, v4si) - VAR1 (TERNOPU, crypto_sha256su1, 0, v4si) + VAR1 (TERNOPU, crypto_sha256h, 0, ALL, v4si) + VAR1 (TERNOPU, crypto_sha256h2, 0, ALL, v4si) + VAR1 (BINOPU, crypto_sha256su0, 0, ALL, v4si) + VAR1 (TERNOPU, crypto_sha256su1, 0, ALL, v4si) /* Implemented by aarch64_crypto_pmull<mode>. */ - VAR1 (BINOPP, crypto_pmull, 0, di) - VAR1 (BINOPP, crypto_pmull, 0, v2di) + VAR1 (BINOPP, crypto_pmull, 0, ALL, di) + VAR1 (BINOPP, crypto_pmull, 0, ALL, v2di) /* Implemented by aarch64_tbl3<mode>. */ - VAR1 (BINOP, tbl3, 0, v8qi) - VAR1 (BINOP, tbl3, 0, v16qi) + VAR1 (BINOP, tbl3, 0, ALL, v8qi) + VAR1 (BINOP, tbl3, 0, ALL, v16qi) /* Implemented by aarch64_qtbl3<mode>. 
*/ - VAR1 (BINOP, qtbl3, 0, v8qi) - VAR1 (BINOP, qtbl3, 0, v16qi) + VAR1 (BINOP, qtbl3, 0, ALL, v8qi) + VAR1 (BINOP, qtbl3, 0, ALL, v16qi) /* Implemented by aarch64_qtbl4<mode>. */ - VAR1 (BINOP, qtbl4, 0, v8qi) - VAR1 (BINOP, qtbl4, 0, v16qi) + VAR1 (BINOP, qtbl4, 0, ALL, v8qi) + VAR1 (BINOP, qtbl4, 0, ALL, v16qi) /* Implemented by aarch64_tbx4<mode>. */ - VAR1 (TERNOP, tbx4, 0, v8qi) - VAR1 (TERNOP, tbx4, 0, v16qi) + VAR1 (TERNOP, tbx4, 0, ALL, v8qi) + VAR1 (TERNOP, tbx4, 0, ALL, v16qi) /* Implemented by aarch64_qtbx3<mode>. */ - VAR1 (TERNOP, qtbx3, 0, v8qi) - VAR1 (TERNOP, qtbx3, 0, v16qi) + VAR1 (TERNOP, qtbx3, 0, ALL, v8qi) + VAR1 (TERNOP, qtbx3, 0, ALL, v16qi) /* Implemented by aarch64_qtbx4<mode>. */ - VAR1 (TERNOP, qtbx4, 0, v8qi) - VAR1 (TERNOP, qtbx4, 0, v16qi) + VAR1 (TERNOP, qtbx4, 0, ALL, v8qi) + VAR1 (TERNOP, qtbx4, 0, ALL, v16qi) /* Builtins for ARMv8.1-A Adv.SIMD instructions. */ /* Implemented by aarch64_sqrdml<SQRDMLH_AS:rdma_as>h<mode>. */ - BUILTIN_VSDQ_HSI (TERNOP, sqrdmlah, 0) - BUILTIN_VSDQ_HSI (TERNOP, sqrdmlsh, 0) + BUILTIN_VSDQ_HSI (TERNOP, sqrdmlah, 0, ALL) + BUILTIN_VSDQ_HSI (TERNOP, sqrdmlsh, 0, ALL) /* Implemented by aarch64_sqrdml<SQRDMLH_AS:rdma_as>h_lane<mode>. */ - BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlah_lane, 0) - BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlsh_lane, 0) + BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlah_lane, 0, ALL) + BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlsh_lane, 0, ALL) /* Implemented by aarch64_sqrdml<SQRDMLH_AS:rdma_as>h_laneq<mode>. */ - BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlah_laneq, 0) - BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlsh_laneq, 0) + BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlah_laneq, 0, ALL) + BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlsh_laneq, 0, ALL) /* Implemented by <FCVT_F2FIXED/FIXED2F:fcvt_fixed_insn><*><*>3. */ - BUILTIN_VSDQ_HSDI (SHIFTIMM, scvtf, 3) - BUILTIN_VSDQ_HSDI (FCVTIMM_SUS, ucvtf, 3) - BUILTIN_VHSDF_HSDF (SHIFTIMM, fcvtzs, 3) - BUILTIN_VHSDF_HSDF (SHIFTIMM_USS, fcvtzu, 3) - VAR1 (SHIFTIMM, scvtfsi, 3, hf) - VAR1 (SHIFTIMM, scvtfdi, 3, hf) - VAR1 (FCVTIMM_SUS, ucvtfsi, 3, hf) - VAR1 (FCVTIMM_SUS, ucvtfdi, 3, hf) - BUILTIN_GPI (SHIFTIMM, fcvtzshf, 3) - BUILTIN_GPI (SHIFTIMM_USS, fcvtzuhf, 3) + BUILTIN_VSDQ_HSDI (SHIFTIMM, scvtf, 3, ALL) + BUILTIN_VSDQ_HSDI (FCVTIMM_SUS, ucvtf, 3, ALL) + BUILTIN_VHSDF_HSDF (SHIFTIMM, fcvtzs, 3, ALL) + BUILTIN_VHSDF_HSDF (SHIFTIMM_USS, fcvtzu, 3, ALL) + VAR1 (SHIFTIMM, scvtfsi, 3, ALL, hf) + VAR1 (SHIFTIMM, scvtfdi, 3, ALL, hf) + VAR1 (FCVTIMM_SUS, ucvtfsi, 3, ALL, hf) + VAR1 (FCVTIMM_SUS, ucvtfdi, 3, ALL, hf) + BUILTIN_GPI (SHIFTIMM, fcvtzshf, 3, ALL) + BUILTIN_GPI (SHIFTIMM_USS, fcvtzuhf, 3, ALL) /* Implemented by aarch64_rsqrte<mode>. */ - BUILTIN_VHSDF_HSDF (UNOP, rsqrte, 0) + BUILTIN_VHSDF_HSDF (UNOP, rsqrte, 0, ALL) /* Implemented by aarch64_rsqrts<mode>. */ - BUILTIN_VHSDF_HSDF (BINOP, rsqrts, 0) + BUILTIN_VHSDF_HSDF (BINOP, rsqrts, 0, ALL) /* Implemented by fabd<mode>3. */ - BUILTIN_VHSDF_HSDF (BINOP, fabd, 3) + BUILTIN_VHSDF_HSDF (BINOP, fabd, 3, ALL) /* Implemented by aarch64_faddp<mode>. */ - BUILTIN_VHSDF (BINOP, faddp, 0) + BUILTIN_VHSDF (BINOP, faddp, 0, FP) /* Implemented by aarch64_cm<optab><mode>. 
*/ - BUILTIN_VHSDF_HSDF (BINOP_USS, cmeq, 0) - BUILTIN_VHSDF_HSDF (BINOP_USS, cmge, 0) - BUILTIN_VHSDF_HSDF (BINOP_USS, cmgt, 0) - BUILTIN_VHSDF_HSDF (BINOP_USS, cmle, 0) - BUILTIN_VHSDF_HSDF (BINOP_USS, cmlt, 0) + BUILTIN_VHSDF_HSDF (BINOP_USS, cmeq, 0, ALL) + BUILTIN_VHSDF_HSDF (BINOP_USS, cmge, 0, ALL) + BUILTIN_VHSDF_HSDF (BINOP_USS, cmgt, 0, ALL) + BUILTIN_VHSDF_HSDF (BINOP_USS, cmle, 0, ALL) + BUILTIN_VHSDF_HSDF (BINOP_USS, cmlt, 0, ALL) /* Implemented by neg<mode>2. */ - BUILTIN_VHSDF_HSDF (UNOP, neg, 2) + BUILTIN_VHSDF_HSDF (UNOP, neg, 2, ALL) /* Implemented by aarch64_fac<optab><mode>. */ - BUILTIN_VHSDF_HSDF (BINOP_USS, faclt, 0) - BUILTIN_VHSDF_HSDF (BINOP_USS, facle, 0) - BUILTIN_VHSDF_HSDF (BINOP_USS, facgt, 0) - BUILTIN_VHSDF_HSDF (BINOP_USS, facge, 0) + BUILTIN_VHSDF_HSDF (BINOP_USS, faclt, 0, ALL) + BUILTIN_VHSDF_HSDF (BINOP_USS, facle, 0, ALL) + BUILTIN_VHSDF_HSDF (BINOP_USS, facgt, 0, ALL) + BUILTIN_VHSDF_HSDF (BINOP_USS, facge, 0, ALL) /* Implemented by sqrt<mode>2. */ - VAR1 (UNOP, sqrt, 2, hf) + VAR1 (UNOP, sqrt, 2, ALL, hf) /* Implemented by <optab><mode>hf2. */ - VAR1 (UNOP, floatdi, 2, hf) - VAR1 (UNOP, floatsi, 2, hf) - VAR1 (UNOP, floathi, 2, hf) - VAR1 (UNOPUS, floatunsdi, 2, hf) - VAR1 (UNOPUS, floatunssi, 2, hf) - VAR1 (UNOPUS, floatunshi, 2, hf) - BUILTIN_GPI_I16 (UNOP, fix_trunchf, 2) - BUILTIN_GPI (UNOP, fix_truncsf, 2) - BUILTIN_GPI (UNOP, fix_truncdf, 2) - BUILTIN_GPI_I16 (UNOPUS, fixuns_trunchf, 2) - BUILTIN_GPI (UNOPUS, fixuns_truncsf, 2) - BUILTIN_GPI (UNOPUS, fixuns_truncdf, 2) + VAR1 (UNOP, floatdi, 2, ALL, hf) + VAR1 (UNOP, floatsi, 2, ALL, hf) + VAR1 (UNOP, floathi, 2, ALL, hf) + VAR1 (UNOPUS, floatunsdi, 2, ALL, hf) + VAR1 (UNOPUS, floatunssi, 2, ALL, hf) + VAR1 (UNOPUS, floatunshi, 2, ALL, hf) + BUILTIN_GPI_I16 (UNOP, fix_trunchf, 2, ALL) + BUILTIN_GPI (UNOP, fix_truncsf, 2, ALL) + BUILTIN_GPI (UNOP, fix_truncdf, 2, ALL) + BUILTIN_GPI_I16 (UNOPUS, fixuns_trunchf, 2, ALL) + BUILTIN_GPI (UNOPUS, fixuns_truncsf, 2, ALL) + BUILTIN_GPI (UNOPUS, fixuns_truncdf, 2, ALL) /* Implemented by aarch64_sm3ss1qv4si. */ - VAR1 (TERNOPU, sm3ss1q, 0, v4si) + VAR1 (TERNOPU, sm3ss1q, 0, ALL, v4si) /* Implemented by aarch64_sm3tt<sm3tt_op>qv4si. */ - VAR1 (QUADOPUI, sm3tt1aq, 0, v4si) - VAR1 (QUADOPUI, sm3tt1bq, 0, v4si) - VAR1 (QUADOPUI, sm3tt2aq, 0, v4si) - VAR1 (QUADOPUI, sm3tt2bq, 0, v4si) + VAR1 (QUADOPUI, sm3tt1aq, 0, ALL, v4si) + VAR1 (QUADOPUI, sm3tt1bq, 0, ALL, v4si) + VAR1 (QUADOPUI, sm3tt2aq, 0, ALL, v4si) + VAR1 (QUADOPUI, sm3tt2bq, 0, ALL, v4si) /* Implemented by aarch64_sm3partw<sm3part_op>qv4si. */ - VAR1 (TERNOPU, sm3partw1q, 0, v4si) - VAR1 (TERNOPU, sm3partw2q, 0, v4si) + VAR1 (TERNOPU, sm3partw1q, 0, ALL, v4si) + VAR1 (TERNOPU, sm3partw2q, 0, ALL, v4si) /* Implemented by aarch64_sm4eqv4si. */ - VAR1 (BINOPU, sm4eq, 0, v4si) + VAR1 (BINOPU, sm4eq, 0, ALL, v4si) /* Implemented by aarch64_sm4ekeyqv4si. */ - VAR1 (BINOPU, sm4ekeyq, 0, v4si) + VAR1 (BINOPU, sm4ekeyq, 0, ALL, v4si) /* Implemented by aarch64_crypto_sha512hqv2di. */ - VAR1 (TERNOPU, crypto_sha512hq, 0, v2di) + VAR1 (TERNOPU, crypto_sha512hq, 0, ALL, v2di) /* Implemented by aarch64_sha512h2qv2di. */ - VAR1 (TERNOPU, crypto_sha512h2q, 0, v2di) + VAR1 (TERNOPU, crypto_sha512h2q, 0, ALL, v2di) /* Implemented by aarch64_crypto_sha512su0qv2di. */ - VAR1 (BINOPU, crypto_sha512su0q, 0, v2di) + VAR1 (BINOPU, crypto_sha512su0q, 0, ALL, v2di) /* Implemented by aarch64_crypto_sha512su1qv2di. 
*/ - VAR1 (TERNOPU, crypto_sha512su1q, 0, v2di) + VAR1 (TERNOPU, crypto_sha512su1q, 0, ALL, v2di) /* Implemented by eor3q<mode>4. */ - BUILTIN_VQ_I (TERNOPU, eor3q, 4) - BUILTIN_VQ_I (TERNOP, eor3q, 4) + BUILTIN_VQ_I (TERNOPU, eor3q, 4, ALL) + BUILTIN_VQ_I (TERNOP, eor3q, 4, ALL) /* Implemented by aarch64_rax1qv2di. */ - VAR1 (BINOPU, rax1q, 0, v2di) + VAR1 (BINOPU, rax1q, 0, ALL, v2di) /* Implemented by aarch64_xarqv2di. */ - VAR1 (TERNOPUI, xarq, 0, v2di) + VAR1 (TERNOPUI, xarq, 0, ALL, v2di) /* Implemented by bcaxq<mode>4. */ - BUILTIN_VQ_I (TERNOPU, bcaxq, 4) - BUILTIN_VQ_I (TERNOP, bcaxq, 4) + BUILTIN_VQ_I (TERNOPU, bcaxq, 4, ALL) + BUILTIN_VQ_I (TERNOP, bcaxq, 4, ALL) /* Implemented by aarch64_fml<f16mac1>l<f16quad>_low<mode>. */ - VAR1 (TERNOP, fmlal_low, 0, v2sf) - VAR1 (TERNOP, fmlsl_low, 0, v2sf) - VAR1 (TERNOP, fmlalq_low, 0, v4sf) - VAR1 (TERNOP, fmlslq_low, 0, v4sf) + VAR1 (TERNOP, fmlal_low, 0, ALL, v2sf) + VAR1 (TERNOP, fmlsl_low, 0, ALL, v2sf) + VAR1 (TERNOP, fmlalq_low, 0, ALL, v4sf) + VAR1 (TERNOP, fmlslq_low, 0, ALL, v4sf) /* Implemented by aarch64_fml<f16mac1>l<f16quad>_high<mode>. */ - VAR1 (TERNOP, fmlal_high, 0, v2sf) - VAR1 (TERNOP, fmlsl_high, 0, v2sf) - VAR1 (TERNOP, fmlalq_high, 0, v4sf) - VAR1 (TERNOP, fmlslq_high, 0, v4sf) + VAR1 (TERNOP, fmlal_high, 0, ALL, v2sf) + VAR1 (TERNOP, fmlsl_high, 0, ALL, v2sf) + VAR1 (TERNOP, fmlalq_high, 0, ALL, v4sf) + VAR1 (TERNOP, fmlslq_high, 0, ALL, v4sf) /* Implemented by aarch64_fml<f16mac1>l_lane_lowv2sf. */ - VAR1 (QUADOP_LANE, fmlal_lane_low, 0, v2sf) - VAR1 (QUADOP_LANE, fmlsl_lane_low, 0, v2sf) + VAR1 (QUADOP_LANE, fmlal_lane_low, 0, ALL, v2sf) + VAR1 (QUADOP_LANE, fmlsl_lane_low, 0, ALL, v2sf) /* Implemented by aarch64_fml<f16mac1>l_laneq_lowv2sf. */ - VAR1 (QUADOP_LANE, fmlal_laneq_low, 0, v2sf) - VAR1 (QUADOP_LANE, fmlsl_laneq_low, 0, v2sf) + VAR1 (QUADOP_LANE, fmlal_laneq_low, 0, ALL, v2sf) + VAR1 (QUADOP_LANE, fmlsl_laneq_low, 0, ALL, v2sf) /* Implemented by aarch64_fml<f16mac1>lq_lane_lowv4sf. */ - VAR1 (QUADOP_LANE, fmlalq_lane_low, 0, v4sf) - VAR1 (QUADOP_LANE, fmlslq_lane_low, 0, v4sf) + VAR1 (QUADOP_LANE, fmlalq_lane_low, 0, ALL, v4sf) + VAR1 (QUADOP_LANE, fmlslq_lane_low, 0, ALL, v4sf) /* Implemented by aarch64_fml<f16mac1>lq_laneq_lowv4sf. */ - VAR1 (QUADOP_LANE, fmlalq_laneq_low, 0, v4sf) - VAR1 (QUADOP_LANE, fmlslq_laneq_low, 0, v4sf) + VAR1 (QUADOP_LANE, fmlalq_laneq_low, 0, ALL, v4sf) + VAR1 (QUADOP_LANE, fmlslq_laneq_low, 0, ALL, v4sf) /* Implemented by aarch64_fml<f16mac1>l_lane_highv2sf. */ - VAR1 (QUADOP_LANE, fmlal_lane_high, 0, v2sf) - VAR1 (QUADOP_LANE, fmlsl_lane_high, 0, v2sf) + VAR1 (QUADOP_LANE, fmlal_lane_high, 0, ALL, v2sf) + VAR1 (QUADOP_LANE, fmlsl_lane_high, 0, ALL, v2sf) /* Implemented by aarch64_fml<f16mac1>l_laneq_highv2sf. */ - VAR1 (QUADOP_LANE, fmlal_laneq_high, 0, v2sf) - VAR1 (QUADOP_LANE, fmlsl_laneq_high, 0, v2sf) + VAR1 (QUADOP_LANE, fmlal_laneq_high, 0, ALL, v2sf) + VAR1 (QUADOP_LANE, fmlsl_laneq_high, 0, ALL, v2sf) /* Implemented by aarch64_fml<f16mac1>lq_lane_highv4sf. */ - VAR1 (QUADOP_LANE, fmlalq_lane_high, 0, v4sf) - VAR1 (QUADOP_LANE, fmlslq_lane_high, 0, v4sf) + VAR1 (QUADOP_LANE, fmlalq_lane_high, 0, ALL, v4sf) + VAR1 (QUADOP_LANE, fmlslq_lane_high, 0, ALL, v4sf) /* Implemented by aarch64_fml<f16mac1>lq_laneq_highv4sf. 
*/ - VAR1 (QUADOP_LANE, fmlalq_laneq_high, 0, v4sf) - VAR1 (QUADOP_LANE, fmlslq_laneq_high, 0, v4sf) + VAR1 (QUADOP_LANE, fmlalq_laneq_high, 0, ALL, v4sf) + VAR1 (QUADOP_LANE, fmlslq_laneq_high, 0, ALL, v4sf) /* Implemented by aarch64_<frintnzs_op><mode>. */ - BUILTIN_VSFDF (UNOP, frint32z, 0) - BUILTIN_VSFDF (UNOP, frint32x, 0) - BUILTIN_VSFDF (UNOP, frint64z, 0) - BUILTIN_VSFDF (UNOP, frint64x, 0) + BUILTIN_VSFDF (UNOP, frint32z, 0, ALL) + BUILTIN_VSFDF (UNOP, frint32x, 0, ALL) + BUILTIN_VSFDF (UNOP, frint64z, 0, ALL) + BUILTIN_VSFDF (UNOP, frint64x, 0, ALL) /* Implemented by aarch64_bfdot{_lane}{q}<mode>. */ - VAR2 (TERNOP, bfdot, 0, v2sf, v4sf) - VAR2 (QUADOP_LANE_PAIR, bfdot_lane, 0, v2sf, v4sf) - VAR2 (QUADOP_LANE_PAIR, bfdot_laneq, 0, v2sf, v4sf) + VAR2 (TERNOP, bfdot, 0, ALL, v2sf, v4sf) + VAR2 (QUADOP_LANE_PAIR, bfdot_lane, 0, ALL, v2sf, v4sf) + VAR2 (QUADOP_LANE_PAIR, bfdot_laneq, 0, ALL, v2sf, v4sf) /* Implemented by aarch64_bfmmlaqv4sf */ - VAR1 (TERNOP, bfmmlaq, 0, v4sf) + VAR1 (TERNOP, bfmmlaq, 0, ALL, v4sf) /* Implemented by aarch64_bfmlal<bt>{_lane{q}}v4sf */ - VAR1 (TERNOP, bfmlalb, 0, v4sf) - VAR1 (TERNOP, bfmlalt, 0, v4sf) - VAR1 (QUADOP_LANE, bfmlalb_lane, 0, v4sf) - VAR1 (QUADOP_LANE, bfmlalt_lane, 0, v4sf) - VAR1 (QUADOP_LANE, bfmlalb_lane_q, 0, v4sf) - VAR1 (QUADOP_LANE, bfmlalt_lane_q, 0, v4sf) + VAR1 (TERNOP, bfmlalb, 0, ALL, v4sf) + VAR1 (TERNOP, bfmlalt, 0, ALL, v4sf) + VAR1 (QUADOP_LANE, bfmlalb_lane, 0, ALL, v4sf) + VAR1 (QUADOP_LANE, bfmlalt_lane, 0, ALL, v4sf) + VAR1 (QUADOP_LANE, bfmlalb_lane_q, 0, ALL, v4sf) + VAR1 (QUADOP_LANE, bfmlalt_lane_q, 0, ALL, v4sf) /* Implemented by aarch64_simd_<sur>mmlav16qi. */ - VAR1 (TERNOP, simd_smmla, 0, v16qi) - VAR1 (TERNOPU, simd_ummla, 0, v16qi) - VAR1 (TERNOP_SSUS, simd_usmmla, 0, v16qi) + VAR1 (TERNOP, simd_smmla, 0, ALL, v16qi) + VAR1 (TERNOPU, simd_ummla, 0, ALL, v16qi) + VAR1 (TERNOP_SSUS, simd_usmmla, 0, ALL, v16qi) /* Implemented by aarch64_bfcvtn{q}{2}<mode> */ - VAR1 (UNOP, bfcvtn, 0, v4bf) - VAR1 (UNOP, bfcvtn_q, 0, v8bf) - VAR1 (BINOP, bfcvtn2, 0, v8bf) - VAR1 (UNOP, bfcvt, 0, bf) + VAR1 (UNOP, bfcvtn, 0, ALL, v4bf) + VAR1 (UNOP, bfcvtn_q, 0, ALL, v8bf) + VAR1 (BINOP, bfcvtn2, 0, ALL, v8bf) + VAR1 (UNOP, bfcvt, 0, ALL, bf) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 9f0e2bd..381a702 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -187,7 +187,7 @@ plus_constant (Pmode, XEXP (operands[1], 0), GET_MODE_SIZE (<DREG:MODE>mode)))" - "ldp\\t%d0, %d2, %1" + "ldp\\t%d0, %d2, %z1" [(set_attr "type" "neon_ldp")] ) @@ -201,7 +201,7 @@ plus_constant (Pmode, XEXP (operands[0], 0), GET_MODE_SIZE (<DREG:MODE>mode)))" - "stp\\t%d1, %d3, %0" + "stp\\t%d1, %d3, %z0" [(set_attr "type" "neon_stp")] ) @@ -215,7 +215,7 @@ plus_constant (Pmode, XEXP (operands[1], 0), GET_MODE_SIZE (<VQ:MODE>mode)))" - "ldp\\t%q0, %q2, %1" + "ldp\\t%q0, %q2, %z1" [(set_attr "type" "neon_ldp_q")] ) @@ -228,7 +228,7 @@ plus_constant (Pmode, XEXP (operands[0], 0), GET_MODE_SIZE (<VQ:MODE>mode)))" - "stp\\t%q1, %q3, %0" + "stp\\t%q1, %q3, %z0" [(set_attr "type" "neon_stp_q")] ) @@ -958,7 +958,7 @@ [(set_attr "type" "neon_ins<q>, neon_from_gp<q>, neon_load1_one_lane<q>")] ) -(define_insn "*aarch64_simd_vec_copy_lane<mode>" +(define_insn "@aarch64_simd_vec_copy_lane<mode>" [(set (match_operand:VALL_F16 0 "register_operand" "=w") (vec_merge:VALL_F16 (vec_duplicate:VALL_F16 diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc 
b/gcc/config/aarch64/aarch64-sve-builtins.cc index c49fceb..e73aa9a 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins.cc +++ b/gcc/config/aarch64/aarch64-sve-builtins.cc @@ -101,7 +101,7 @@ struct registered_function_hasher : nofree_ptr_hash <registered_function> /* Information about each single-predicate or single-vector type. */ static CONSTEXPR const vector_type_info vector_types[] = { #define DEF_SVE_TYPE(ACLE_NAME, NCHARS, ABI_NAME, SCALAR_TYPE) \ - { #ACLE_NAME, #ABI_NAME, #NCHARS #ABI_NAME }, + { #ACLE_NAME, #ABI_NAME, "u" #NCHARS #ABI_NAME }, #include "aarch64-sve-builtins.def" }; @@ -564,15 +564,16 @@ static bool reported_missing_registers_p; /* Record that TYPE is an ABI-defined SVE type that contains NUM_ZR SVE vectors and NUM_PR SVE predicates. MANGLED_NAME, if nonnull, is the ABI-defined - mangling of the type. */ + mangling of the type. ACLE_NAME is the <arm_sve.h> name of the type. */ static void add_sve_type_attribute (tree type, unsigned int num_zr, unsigned int num_pr, - const char *mangled_name) + const char *mangled_name, const char *acle_name) { tree mangled_name_tree = (mangled_name ? get_identifier (mangled_name) : NULL_TREE); - tree value = tree_cons (NULL_TREE, mangled_name_tree, NULL_TREE); + tree value = tree_cons (NULL_TREE, get_identifier (acle_name), NULL_TREE); + value = tree_cons (NULL_TREE, mangled_name_tree, value); value = tree_cons (NULL_TREE, size_int (num_pr), value); value = tree_cons (NULL_TREE, size_int (num_zr), value); TYPE_ATTRIBUTES (type) = tree_cons (get_identifier ("SVE type"), value, @@ -2792,7 +2793,7 @@ function_expander::add_output_operand (insn_code icode) { unsigned int opno = m_ops.length (); machine_mode mode = insn_data[icode].operand[opno].mode; - m_ops.safe_grow (opno + 1); + m_ops.safe_grow (opno + 1, true); create_output_operand (&m_ops.last (), possible_target, mode); } @@ -2829,7 +2830,7 @@ function_expander::add_input_operand (insn_code icode, rtx x) gcc_assert (GET_MODE (x) == VNx16BImode); x = gen_lowpart (mode, x); } - m_ops.safe_grow (m_ops.length () + 1); + m_ops.safe_grow (m_ops.length () + 1, true); create_input_operand (&m_ops.last (), x, mode); } @@ -2837,7 +2838,7 @@ function_expander::add_input_operand (insn_code icode, rtx x) void function_expander::add_integer_operand (HOST_WIDE_INT x) { - m_ops.safe_grow (m_ops.length () + 1); + m_ops.safe_grow (m_ops.length () + 1, true); create_integer_operand (&m_ops.last (), x); } @@ -2861,7 +2862,7 @@ function_expander::add_mem_operand (machine_mode mode, rtx addr) void function_expander::add_address_operand (rtx x) { - m_ops.safe_grow (m_ops.length () + 1); + m_ops.safe_grow (m_ops.length () + 1, true); create_address_operand (&m_ops.last (), x); } @@ -2870,7 +2871,7 @@ function_expander::add_address_operand (rtx x) void function_expander::add_fixed_operand (rtx x) { - m_ops.safe_grow (m_ops.length () + 1); + m_ops.safe_grow (m_ops.length () + 1, true); create_fixed_operand (&m_ops.last (), x); } @@ -3363,7 +3364,8 @@ register_builtin_types () TYPE_ARTIFICIAL (vectype) = 1; TYPE_INDIVISIBLE_P (vectype) = 1; add_sve_type_attribute (vectype, num_zr, num_pr, - vector_types[i].mangled_name); + vector_types[i].mangled_name, + vector_types[i].acle_name); make_type_sizeless (vectype); abi_vector_types[i] = vectype; lang_hooks.types.register_builtin_type (vectype, @@ -3409,6 +3411,13 @@ register_tuple_type (unsigned int num_vectors, vector_type_index type) { tree tuple_type = lang_hooks.types.make_type (RECORD_TYPE); + /* Work out the structure name. 
*/ + char buffer[sizeof ("svbfloat16x4_t")]; + const char *vector_type_name = vector_types[type].acle_name; + snprintf (buffer, sizeof (buffer), "%.*sx%d_t", + (int) strlen (vector_type_name) - 2, vector_type_name, + num_vectors); + /* The contents of the type are opaque, so we can define them in any way that maps to the correct ABI type. @@ -3432,20 +3441,13 @@ register_tuple_type (unsigned int num_vectors, vector_type_index type) get_identifier ("__val"), array_type); DECL_FIELD_CONTEXT (field) = tuple_type; TYPE_FIELDS (tuple_type) = field; - add_sve_type_attribute (tuple_type, num_vectors, 0, NULL); + add_sve_type_attribute (tuple_type, num_vectors, 0, NULL, buffer); make_type_sizeless (tuple_type); layout_type (tuple_type); gcc_assert (VECTOR_MODE_P (TYPE_MODE (tuple_type)) && TYPE_MODE_RAW (tuple_type) == TYPE_MODE (tuple_type) && TYPE_ALIGN (tuple_type) == 128); - /* Work out the structure name. */ - char buffer[sizeof ("svbfloat16x4_t")]; - const char *vector_type_name = vector_types[type].acle_name; - snprintf (buffer, sizeof (buffer), "%.*sx%d_t", - (int) strlen (vector_type_name) - 2, vector_type_name, - num_vectors); - tree decl = build_decl (input_location, TYPE_DECL, get_identifier (buffer), tuple_type); TYPE_NAME (tuple_type) = decl; @@ -3646,6 +3648,29 @@ builtin_type_p (const_tree type, unsigned int *num_zr, unsigned int *num_pr) return false; } +/* ATTRS is the attribute list for a sizeless SVE type. Return the + attributes of the associated fixed-length SVE type, taking the + "SVE type" attributes from NEW_SVE_TYPE_ARGS. */ +static tree +get_arm_sve_vector_bits_attributes (tree old_attrs, tree new_sve_type_args) +{ + tree new_attrs = NULL_TREE; + tree *ptr = &new_attrs; + for (tree attr = old_attrs; attr; attr = TREE_CHAIN (attr)) + { + tree name = get_attribute_name (attr); + if (is_attribute_p ("SVE sizeless type", name)) + continue; + + tree args = TREE_VALUE (attr); + if (is_attribute_p ("SVE type", name)) + args = new_sve_type_args; + *ptr = tree_cons (TREE_PURPOSE (attr), args, NULL_TREE); + ptr = &TREE_CHAIN (*ptr); + } + return new_attrs; +} + /* An attribute callback for the "arm_sve_vector_bits" attribute. */ tree handle_arm_sve_vector_bits_attribute (tree *node, tree, tree args, int, @@ -3654,12 +3679,27 @@ handle_arm_sve_vector_bits_attribute (tree *node, tree, tree args, int, *no_add_attrs = true; tree type = *node; - if (!VECTOR_TYPE_P (type) || !builtin_type_p (type)) + tree attr = lookup_sve_type_attribute (type); + if (!attr) { error ("%qs applied to non-SVE type %qT", "arm_sve_vector_bits", type); return NULL_TREE; } + if (!VECTOR_TYPE_P (type)) + { + error ("%qs applied to non-vector type %qT", + "arm_sve_vector_bits", type); + return NULL_TREE; + } + + if (!sizeless_type_p (type)) + { + error ("%qs applied to type %qT, which already has a size", + "arm_sve_vector_bits", type); + return NULL_TREE; + } + tree size = TREE_VALUE (args); if (TREE_CODE (size) != INTEGER_CST) { @@ -3675,6 +3715,23 @@ handle_arm_sve_vector_bits_attribute (tree *node, tree, tree args, int, return NULL_TREE; } + /* Construct a new list of "SVE type" attribute arguments. */ + tree new_sve_type_args = copy_list (TREE_VALUE (attr)); + + /* Mangle the type as an instance of the imaginary template: + + __SVE_VLS<typename, unsigned> + + where the first parameter is the SVE type and where the second + parameter is the SVE vector length in bits. 
*/ + tree mangled_name_node = chain_index (2, new_sve_type_args); + const char *old_mangled_name + = IDENTIFIER_POINTER (TREE_VALUE (mangled_name_node)); + char *new_mangled_name + = xasprintf ("9__SVE_VLSI%sLj%dEE", old_mangled_name, (int) value); + TREE_VALUE (mangled_name_node) = get_identifier (new_mangled_name); + free (new_mangled_name); + /* FIXME: The type ought to be a distinct copy in all cases, but currently that makes the C frontend reject conversions between svbool_t and its fixed-length variants. Using a type variant @@ -3687,6 +3744,44 @@ handle_arm_sve_vector_bits_attribute (tree *node, tree, tree args, int, else new_type = build_distinct_type_copy (base_type); + /* Construct a TYPE_DECL for the new type. This serves two purposes: + + - It ensures we don't print the original TYPE_DECL in error messages. + Printing the original name would be confusing because there are + situations in which the distinction between the original type and + the new type matters. For example: + + __SVInt8_t __attribute__((arm_sve_vector_bits(512))) *a; + __SVInt8_t *b; + + a = b; + + is invalid in C++, but without this, we'd print both types in + the same way. + + - Having a separate TYPE_DECL is necessary to ensure that C++ + mangling works correctly. See mangle_builtin_type for details. + + The name of the decl is something like: + + svint8_t __attribute__((arm_sve_vector_bits(512))) + + This is a compromise. It would be more accurate to use something like: + + __SVInt8_t __attribute__((arm_sve_vector_bits(512))) + + but the <arm_sve.h> name is likely to be more meaningful. */ + tree acle_name_node = TREE_CHAIN (mangled_name_node); + const char *old_type_name = IDENTIFIER_POINTER (TREE_VALUE (acle_name_node)); + char *new_type_name + = xasprintf ("%s __attribute__((arm_sve_vector_bits(%d)))", + old_type_name, (int) value); + tree decl = build_decl (BUILTINS_LOCATION, TYPE_DECL, + get_identifier (new_type_name), new_type); + DECL_ARTIFICIAL (decl) = 1; + TYPE_NAME (new_type) = decl; + free (new_type_name); + /* Allow the GNU vector extensions to be applied to vectors. The extensions aren't yet defined for packed predicates, so continue to treat them as abstract entities for now. */ @@ -3696,16 +3791,17 @@ handle_arm_sve_vector_bits_attribute (tree *node, tree, tree args, int, /* The new type is a normal sized type; it doesn't have the same restrictions as sizeless types. */ TYPE_ATTRIBUTES (new_type) - = remove_attribute ("SVE sizeless type", - copy_list (TYPE_ATTRIBUTES (new_type))); + = get_arm_sve_vector_bits_attributes (TYPE_ATTRIBUTES (new_type), + new_sve_type_args); /* Apply the relevant attributes, qualifiers and alignment of TYPE, if they differ from the original (sizeless) BASE_TYPE. */ if (TYPE_ATTRIBUTES (base_type) != TYPE_ATTRIBUTES (type) || TYPE_QUALS (base_type) != TYPE_QUALS (type)) { - tree attrs = remove_attribute ("SVE sizeless type", - copy_list (TYPE_ATTRIBUTES (type))); + tree attrs + = get_arm_sve_vector_bits_attributes (TYPE_ATTRIBUTES (type), + new_sve_type_args); new_type = build_type_attribute_qual_variant (new_type, attrs, TYPE_QUALS (type)); } diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index 9d06bf7..31a8c5a 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -464,6 +464,95 @@ ;; ;; - MNEMONIC is the mnemonic of the associated SVE instruction. 
;; +;; For (3) and (4), we combine these operations with an UNSPEC_SEL +;; that selects between the result of the FP operation and the "else" +;; value. (This else value is a merge input for _m ACLE functions +;; and zero for _z ACLE functions.) The outer pattern then has the form: +;; +;; (unspec [pred fp_operation else_value] UNSPEC_SEL) +;; +;; This means that the patterns for (3) and (4) have two predicates: +;; one for the FP operation itself and one for the UNSPEC_SEL. +;; This pattern is equivalent to the result of combining an instance +;; of (1) or (2) with a separate vcond instruction, so these patterns +;; are useful as combine targets too. +;; +;; However, in the combine case, the instructions that we want to +;; combine might use different predicates. Then: +;; +;; - Some of the active lanes of the FP operation might be discarded +;; by the UNSPEC_SEL. It's OK to drop the FP operation on those lanes, +;; even for SVE_STRICT_GP, since the operations on those lanes are +;; effectively dead code. +;; +;; - Some of the inactive lanes of the FP operation might be selected +;; by the UNSPEC_SEL, giving unspecified values for those lanes. +;; SVE_RELAXED_GP lets us extend the FP operation to cover these +;; extra lanes, but SVE_STRICT_GP does not. +;; +;; Thus SVE_RELAXED_GP allows us to ignore the predicate on the FP operation +;; and operate on exactly the lanes selected by the UNSPEC_SEL predicate. +;; This typically leads to patterns like: +;; +;; (unspec [(match_operand 1 "register_operand" "Upl") +;; (unspec [(match_operand N) +;; (const_int SVE_RELAXED_GP) +;; ...] +;; UNSPEC_COND_<MNEMONIC>) +;; ...]) +;; +;; where operand N is allowed to be anything. These instructions then +;; have rewrite rules to replace operand N with operand 1, which gives the +;; instructions a canonical form and means that the original operand N is +;; not kept live unnecessarily. +;; +;; In contrast, SVE_STRICT_GP only allows the UNSPEC_SEL predicate to be +;; a subset of the FP operation predicate. This case isn't interesting +;; for FP operations that have an all-true predicate, since such operations +;; use SVE_RELAXED_GP instead. And it is not possible for instruction +;; conditions to track the subset relationship for arbitrary registers. +;; So in practice, the only useful case for SVE_STRICT_GP is the one +;; in which the predicates match: +;; +;; (unspec [(match_operand 1 "register_operand" "Upl") +;; (unspec [(match_dup 1) +;; (const_int SVE_STRICT_GP) +;; ...] +;; UNSPEC_COND_<MNEMONIC>) +;; ...]) +;; +;; This pattern would also be correct for SVE_RELAXED_GP, but it would +;; be redundant with the one above. However, if the combine pattern +;; has multiple FP operations, using a match_operand allows combinations +;; of SVE_STRICT_GP and SVE_RELAXED_GP in the same operation, provided +;; that the predicates are the same: +;; +;; (unspec [(match_operand 1 "register_operand" "Upl") +;; (... +;; (unspec [(match_dup 1) +;; (match_operand:SI N "aarch64_sve_gp_strictness") +;; ...] +;; UNSPEC_COND_<MNEMONIC1>) +;; (unspec [(match_dup 1) +;; (match_operand:SI M "aarch64_sve_gp_strictness") +;; ...] +;; UNSPEC_COND_<MNEMONIC2>) ...) +;; ...]) +;; +;; The fully-relaxed version of this pattern is: +;; +;; (unspec [(match_operand 1 "register_operand" "Upl") +;; (... +;; (unspec [(match_operand:SI N) +;; (const_int SVE_RELAXED_GP) +;; ...] +;; UNSPEC_COND_<MNEMONIC1>) +;; (unspec [(match_operand:SI M) +;; (const_int SVE_RELAXED_GP) +;; ...] +;; UNSPEC_COND_<MNEMONIC2>) ...) 
+;; ...]) +;; ;; ------------------------------------------------------------------------- ;; ---- Note on FFR handling ;; ------------------------------------------------------------------------- @@ -3304,18 +3393,18 @@ ) ;; Predicated floating-point unary arithmetic, merging with the first input. -(define_insn_and_rewrite "*cond_<optab><mode>_2" +(define_insn_and_rewrite "*cond_<optab><mode>_2_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") (unspec:SVE_FULL_F [(match_operand 3) - (match_operand:SI 4 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "0, w")] SVE_COND_FP_UNARY) (match_dup 2)] UNSPEC_SEL))] - "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[3], operands[1])" + "TARGET_SVE" "@ <sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype> movprfx\t%0, %2\;<sve_fp_op>\t%0.<Vetype>, %1/m, %2.<Vetype>" @@ -3326,6 +3415,24 @@ [(set_attr "movprfx" "*,yes")] ) +(define_insn "*cond_<optab><mode>_2_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "0, w")] + SVE_COND_FP_UNARY) + (match_dup 2)] + UNSPEC_SEL))] + "TARGET_SVE" + "@ + <sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype> + movprfx\t%0, %2\;<sve_fp_op>\t%0.<Vetype>, %1/m, %2.<Vetype>" + [(set_attr "movprfx" "*,yes")] +) + ;; Predicated floating-point unary arithmetic, merging with an independent ;; value. ;; @@ -3334,20 +3441,18 @@ ;; which is handled above rather than here. Marking all the alternatives ;; as earlyclobber helps to make the instruction more regular to the ;; register allocator. 
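(Illustrative note, not part of the patch.) The lane-by-lane reasoning in the long comment block added above (on combining an FP operation with UNSPEC_SEL) can be restated as plain C, with pg standing for the UNSPEC_SEL predicate and gp for the predicate of the FP operation itself; all names here are hypothetical:

    /* Per-lane semantics of (unspec [pg fp_op else] UNSPEC_SEL), written as
       scalar C purely for illustration.  '+' stands for any conditional FP
       operation, and 0.0f stands in for the unspecified value an inactive
       lane of the FP operation would produce.  */
    static void
    cond_fp_sel (int nlanes, const _Bool *pg, const _Bool *gp,
                 const float *a, const float *b,
                 const float *else_value, float *result)
    {
      for (int i = 0; i < nlanes; i++)
        {
          float op = gp[i] ? a[i] + b[i] : 0.0f;   /* the FP operation */
          result[i] = pg[i] ? op : else_value[i];  /* the UNSPEC_SEL   */
        }
    }

Re-predicating the FP operation onto pg drops it on lanes where gp[i] && !pg[i] (dead results, always safe) and introduces it on lanes where pg[i] && !gp[i]; the latter can raise new FP exceptions, which is why it is only allowed when the inner predicate is marked SVE_RELAXED_GP, as in the _relaxed patterns that follow.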
-(define_insn_and_rewrite "*cond_<optab><mode>_any" +(define_insn_and_rewrite "*cond_<optab><mode>_any_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, ?&w, ?&w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl") (unspec:SVE_FULL_F [(match_operand 4) - (match_operand:SI 5 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "w, w, w")] SVE_COND_FP_UNARY) (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero" "0, Dz, w")] UNSPEC_SEL))] - "TARGET_SVE - && !rtx_equal_p (operands[2], operands[3]) - && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" + "TARGET_SVE && !rtx_equal_p (operands[2], operands[3])" "@ <sve_fp_op>\t%0.<Vetype>, %1/m, %2.<Vetype> movprfx\t%0.<Vetype>, %1/z, %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %2.<Vetype> @@ -3359,6 +3464,25 @@ [(set_attr "movprfx" "*,yes,yes")] ) +(define_insn "*cond_<optab><mode>_any_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, ?&w, ?&w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "w, w, w")] + SVE_COND_FP_UNARY) + (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero" "0, Dz, w")] + UNSPEC_SEL))] + "TARGET_SVE && !rtx_equal_p (operands[2], operands[3])" + "@ + <sve_fp_op>\t%0.<Vetype>, %1/m, %2.<Vetype> + movprfx\t%0.<Vetype>, %1/z, %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %2.<Vetype> + movprfx\t%0, %3\;<sve_fp_op>\t%0.<Vetype>, %1/m, %2.<Vetype>" + [(set_attr "movprfx" "*,yes,yes")] +) + ;; ------------------------------------------------------------------------- ;; ---- [FP] Square root ;; ------------------------------------------------------------------------- @@ -3644,10 +3768,10 @@ ;; ------------------------------------------------------------------------- (define_insn "sub<mode>3" - [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, w, ?&w") - (minus:SVE_FULL_I - (match_operand:SVE_FULL_I 1 "aarch64_sve_arith_operand" "w, vsa, vsa") - (match_operand:SVE_FULL_I 2 "register_operand" "w, 0, w")))] + [(set (match_operand:SVE_I 0 "register_operand" "=w, w, ?&w") + (minus:SVE_I + (match_operand:SVE_I 1 "aarch64_sve_arith_operand" "w, vsa, vsa") + (match_operand:SVE_I 2 "register_operand" "w, 0, w")))] "TARGET_SVE" "@ sub\t%0.<Vetype>, %1.<Vetype>, %2.<Vetype> @@ -4649,19 +4773,19 @@ ;; Predicated floating-point binary operations that take an integer as their ;; second operand, with inactive lanes coming from the first operand. 
-(define_insn_and_rewrite "*cond_<optab><mode>_2" +(define_insn_and_rewrite "*cond_<optab><mode>_2_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") (unspec:SVE_FULL_F [(match_operand 4) - (match_operand:SI 5 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "0, w") (match_operand:<V_INT_EQUIV> 3 "register_operand" "w, w")] SVE_COND_FP_BINARY_INT) (match_dup 2)] UNSPEC_SEL))] - "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" + "TARGET_SVE" "@ <sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> movprfx\t%0, %2\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>" @@ -4672,24 +4796,41 @@ [(set_attr "movprfx" "*,yes")] ) +(define_insn "*cond_<optab><mode>_2_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "0, w") + (match_operand:<V_INT_EQUIV> 3 "register_operand" "w, w")] + SVE_COND_FP_BINARY_INT) + (match_dup 2)] + UNSPEC_SEL))] + "TARGET_SVE" + "@ + <sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> + movprfx\t%0, %2\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>" + [(set_attr "movprfx" "*,yes")] +) + ;; Predicated floating-point binary operations that take an integer as ;; their second operand, with the values of inactive lanes being distinct ;; from the other inputs. -(define_insn_and_rewrite "*cond_<optab><mode>_any" +(define_insn_and_rewrite "*cond_<optab><mode>_any_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, &w, &w, ?&w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl") (unspec:SVE_FULL_F [(match_operand 5) - (match_operand:SI 6 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "0, w, w, w") (match_operand:<V_INT_EQUIV> 3 "register_operand" "w, w, w, w")] SVE_COND_FP_BINARY_INT) (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "Dz, Dz, 0, w")] UNSPEC_SEL))] - "TARGET_SVE - && !rtx_equal_p (operands[2], operands[4]) - && aarch64_sve_pred_dominates_p (&operands[5], operands[1])" + "TARGET_SVE && !rtx_equal_p (operands[2], operands[4])" "@ movprfx\t%0.<Vetype>, %1/z, %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> movprfx\t%0.<Vetype>, %1/z, %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> @@ -4713,6 +4854,35 @@ [(set_attr "movprfx" "yes")] ) +(define_insn_and_rewrite "*cond_<optab><mode>_any_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, &w, &w, ?&w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "0, w, w, w") + (match_operand:<V_INT_EQUIV> 3 "register_operand" "w, w, w, w")] + SVE_COND_FP_BINARY_INT) + (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "Dz, Dz, 0, w")] + UNSPEC_SEL))] + "TARGET_SVE && !rtx_equal_p (operands[2], operands[4])" + "@ + movprfx\t%0.<Vetype>, %1/z, %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> + movprfx\t%0.<Vetype>, %1/z, %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> + movprfx\t%0.<Vetype>, %1/m, %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, 
%0.<Vetype>, %3.<Vetype> + #" + "&& reload_completed + && register_operand (operands[4], <MODE>mode) + && !rtx_equal_p (operands[0], operands[4])" + { + emit_insn (gen_vcond_mask_<mode><vpred> (operands[0], operands[2], + operands[4], operands[1])); + operands[4] = operands[2] = operands[0]; + } + [(set_attr "movprfx" "yes")] +) + ;; ------------------------------------------------------------------------- ;; ---- [FP] General binary arithmetic corresponding to rtx codes ;; ------------------------------------------------------------------------- @@ -4813,19 +4983,19 @@ ) ;; Predicated floating-point operations, merging with the first input. -(define_insn_and_rewrite "*cond_<optab><mode>_2" +(define_insn_and_rewrite "*cond_<optab><mode>_2_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") (unspec:SVE_FULL_F [(match_operand 4) - (match_operand:SI 5 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "0, w") (match_operand:SVE_FULL_F 3 "register_operand" "w, w")] SVE_COND_FP_BINARY) (match_dup 2)] UNSPEC_SEL))] - "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" + "TARGET_SVE" "@ <sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> movprfx\t%0, %2\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>" @@ -4836,20 +5006,39 @@ [(set_attr "movprfx" "*,yes")] ) +(define_insn "*cond_<optab><mode>_2_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "0, w") + (match_operand:SVE_FULL_F 3 "register_operand" "w, w")] + SVE_COND_FP_BINARY) + (match_dup 2)] + UNSPEC_SEL))] + "TARGET_SVE" + "@ + <sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> + movprfx\t%0, %2\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>" + [(set_attr "movprfx" "*,yes")] +) + ;; Same for operations that take a 1-bit constant. 
-(define_insn_and_rewrite "*cond_<optab><mode>_2_const" +(define_insn_and_rewrite "*cond_<optab><mode>_2_const_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") (unspec:SVE_FULL_F [(match_operand 4) - (match_operand:SI 5 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "0, w") (match_operand:SVE_FULL_F 3 "<sve_pred_fp_rhs2_immediate>")] SVE_COND_FP_BINARY_I1) (match_dup 2)] UNSPEC_SEL))] - "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" + "TARGET_SVE" "@ <sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3 movprfx\t%0, %2\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3" @@ -4860,20 +5049,39 @@ [(set_attr "movprfx" "*,yes")] ) +(define_insn "*cond_<optab><mode>_2_const_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "0, w") + (match_operand:SVE_FULL_F 3 "<sve_pred_fp_rhs2_immediate>")] + SVE_COND_FP_BINARY_I1) + (match_dup 2)] + UNSPEC_SEL))] + "TARGET_SVE" + "@ + <sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3 + movprfx\t%0, %2\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3" + [(set_attr "movprfx" "*,yes")] +) + ;; Predicated floating-point operations, merging with the second input. -(define_insn_and_rewrite "*cond_<optab><mode>_3" +(define_insn_and_rewrite "*cond_<optab><mode>_3_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") (unspec:SVE_FULL_F [(match_operand 4) - (match_operand:SI 5 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "w, w") (match_operand:SVE_FULL_F 3 "register_operand" "0, w")] SVE_COND_FP_BINARY) (match_dup 3)] UNSPEC_SEL))] - "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" + "TARGET_SVE" "@ <sve_fp_op_rev>\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype> movprfx\t%0, %3\;<sve_fp_op_rev>\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype>" @@ -4884,14 +5092,33 @@ [(set_attr "movprfx" "*,yes")] ) +(define_insn "*cond_<optab><mode>_3_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "w, w") + (match_operand:SVE_FULL_F 3 "register_operand" "0, w")] + SVE_COND_FP_BINARY) + (match_dup 3)] + UNSPEC_SEL))] + "TARGET_SVE" + "@ + <sve_fp_op_rev>\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype> + movprfx\t%0, %3\;<sve_fp_op_rev>\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype>" + [(set_attr "movprfx" "*,yes")] +) + ;; Predicated floating-point operations, merging with an independent value. 
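(Illustrative note, not part of the patch.) "Merging with an independent value" means the else input of the selection is a third value that is neither FP operand, i.e. per lane (names hypothetical):

    /* Plain-C reading of the "_any" merge patterns (illustration only;
       '+' stands for any SVE_COND_FP_BINARY operation).  */
    static void
    cond_fp_binary_any (int nlanes, const _Bool *pg,
                        const float *a, const float *b,
                        const float *else_value, float *result)
    {
      for (int i = 0; i < nlanes; i++)
        result[i] = pg[i] ? a[i] + b[i] : else_value[i];
    }

When the else value sits in a register that cannot be tied to the destination, the alternative emits "#" and is split after reload: a vcond_mask move first fills the destination with the first operand in the active lanes and the else value in the inactive lanes, after which the predicated FP instruction overwrites only the active lanes.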
-(define_insn_and_rewrite "*cond_<optab><mode>_any" +(define_insn_and_rewrite "*cond_<optab><mode>_any_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, &w, &w, &w, ?&w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl, Upl") (unspec:SVE_FULL_F [(match_operand 5) - (match_operand:SI 6 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "0, w, w, w, w") (match_operand:SVE_FULL_F 3 "register_operand" "w, 0, w, w, w")] SVE_COND_FP_BINARY) @@ -4899,8 +5126,7 @@ UNSPEC_SEL))] "TARGET_SVE && !rtx_equal_p (operands[2], operands[4]) - && !rtx_equal_p (operands[3], operands[4]) - && aarch64_sve_pred_dominates_p (&operands[5], operands[1])" + && !rtx_equal_p (operands[3], operands[4])" "@ movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;<sve_fp_op_rev>\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype> @@ -4925,22 +5151,52 @@ [(set_attr "movprfx" "yes")] ) +(define_insn_and_rewrite "*cond_<optab><mode>_any_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, &w, &w, &w, ?&w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "0, w, w, w, w") + (match_operand:SVE_FULL_F 3 "register_operand" "w, 0, w, w, w")] + SVE_COND_FP_BINARY) + (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "Dz, Dz, Dz, 0, w")] + UNSPEC_SEL))] + "TARGET_SVE + && !rtx_equal_p (operands[2], operands[4]) + && !rtx_equal_p (operands[3], operands[4])" + "@ + movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> + movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;<sve_fp_op_rev>\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype> + movprfx\t%0.<Vetype>, %1/z, %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> + movprfx\t%0.<Vetype>, %1/m, %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> + #" + "&& reload_completed + && register_operand (operands[4], <MODE>mode) + && !rtx_equal_p (operands[0], operands[4])" + { + emit_insn (gen_vcond_mask_<mode><vpred> (operands[0], operands[2], + operands[4], operands[1])); + operands[4] = operands[2] = operands[0]; + } + [(set_attr "movprfx" "yes")] +) + ;; Same for operations that take a 1-bit constant. 
-(define_insn_and_rewrite "*cond_<optab><mode>_any_const" +(define_insn_and_rewrite "*cond_<optab><mode>_any_const_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, w, ?w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl") (unspec:SVE_FULL_F [(match_operand 5) - (match_operand:SI 6 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "w, w, w") (match_operand:SVE_FULL_F 3 "<sve_pred_fp_rhs2_immediate>")] SVE_COND_FP_BINARY_I1) (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "Dz, 0, w")] UNSPEC_SEL))] - "TARGET_SVE - && !rtx_equal_p (operands[2], operands[4]) - && aarch64_sve_pred_dominates_p (&operands[5], operands[1])" + "TARGET_SVE && !rtx_equal_p (operands[2], operands[4])" "@ movprfx\t%0.<Vetype>, %1/z, %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3 movprfx\t%0.<Vetype>, %1/m, %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3 @@ -4963,6 +5219,34 @@ [(set_attr "movprfx" "yes")] ) +(define_insn_and_rewrite "*cond_<optab><mode>_any_const_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, w, ?w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "w, w, w") + (match_operand:SVE_FULL_F 3 "<sve_pred_fp_rhs2_immediate>")] + SVE_COND_FP_BINARY_I1) + (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "Dz, 0, w")] + UNSPEC_SEL))] + "TARGET_SVE && !rtx_equal_p (operands[2], operands[4])" + "@ + movprfx\t%0.<Vetype>, %1/z, %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3 + movprfx\t%0.<Vetype>, %1/m, %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3 + #" + "&& reload_completed + && register_operand (operands[4], <MODE>mode) + && !rtx_equal_p (operands[0], operands[4])" + { + emit_insn (gen_vcond_mask_<mode><vpred> (operands[0], operands[2], + operands[4], operands[1])); + operands[4] = operands[2] = operands[0]; + } + [(set_attr "movprfx" "yes")] +) + ;; ------------------------------------------------------------------------- ;; ---- [FP] Addition ;; ------------------------------------------------------------------------- @@ -5001,19 +5285,19 @@ ;; Predicated floating-point addition of a constant, merging with the ;; first input. 
-(define_insn_and_rewrite "*cond_add<mode>_2_const" +(define_insn_and_rewrite "*cond_add<mode>_2_const_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, w, ?w, ?w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl") (unspec:SVE_FULL_F [(match_operand 4) - (match_operand:SI 5 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "0, 0, w, w") (match_operand:SVE_FULL_F 3 "aarch64_sve_float_arith_with_sub_immediate" "vsA, vsN, vsA, vsN")] UNSPEC_COND_FADD) (match_dup 2)] UNSPEC_SEL))] - "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" + "TARGET_SVE" "@ fadd\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3 fsub\t%0.<Vetype>, %1/m, %0.<Vetype>, #%N3 @@ -5026,23 +5310,42 @@ [(set_attr "movprfx" "*,*,yes,yes")] ) +(define_insn "*cond_add<mode>_2_const_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, w, ?w, ?w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "0, 0, w, w") + (match_operand:SVE_FULL_F 3 "aarch64_sve_float_arith_with_sub_immediate" "vsA, vsN, vsA, vsN")] + UNSPEC_COND_FADD) + (match_dup 2)] + UNSPEC_SEL))] + "TARGET_SVE" + "@ + fadd\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3 + fsub\t%0.<Vetype>, %1/m, %0.<Vetype>, #%N3 + movprfx\t%0, %2\;fadd\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3 + movprfx\t%0, %2\;fsub\t%0.<Vetype>, %1/m, %0.<Vetype>, #%N3" + [(set_attr "movprfx" "*,*,yes,yes")] +) + ;; Predicated floating-point addition of a constant, merging with an ;; independent value. -(define_insn_and_rewrite "*cond_add<mode>_any_const" +(define_insn_and_rewrite "*cond_add<mode>_any_const_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, w, w, w, ?w, ?w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl") (unspec:SVE_FULL_F [(match_operand 5) - (match_operand:SI 6 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "w, w, w, w, w, w") (match_operand:SVE_FULL_F 3 "aarch64_sve_float_arith_with_sub_immediate" "vsA, vsN, vsA, vsN, vsA, vsN")] UNSPEC_COND_FADD) (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "Dz, Dz, 0, 0, w, w")] UNSPEC_SEL))] - "TARGET_SVE - && !rtx_equal_p (operands[2], operands[4]) - && aarch64_sve_pred_dominates_p (&operands[5], operands[1])" + "TARGET_SVE && !rtx_equal_p (operands[2], operands[4])" "@ movprfx\t%0.<Vetype>, %1/z, %2.<Vetype>\;fadd\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3 movprfx\t%0.<Vetype>, %1/z, %2.<Vetype>\;fsub\t%0.<Vetype>, %1/m, %0.<Vetype>, #%N3 @@ -5068,6 +5371,37 @@ [(set_attr "movprfx" "yes")] ) +(define_insn_and_rewrite "*cond_add<mode>_any_const_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, w, w, w, ?w, ?w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "w, w, w, w, w, w") + (match_operand:SVE_FULL_F 3 "aarch64_sve_float_arith_with_sub_immediate" "vsA, vsN, vsA, vsN, vsA, vsN")] + UNSPEC_COND_FADD) + (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "Dz, Dz, 0, 0, w, w")] + UNSPEC_SEL))] + "TARGET_SVE && !rtx_equal_p (operands[2], operands[4])" + "@ + movprfx\t%0.<Vetype>, %1/z, %2.<Vetype>\;fadd\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3 + movprfx\t%0.<Vetype>, 
%1/z, %2.<Vetype>\;fsub\t%0.<Vetype>, %1/m, %0.<Vetype>, #%N3 + movprfx\t%0.<Vetype>, %1/m, %2.<Vetype>\;fadd\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3 + movprfx\t%0.<Vetype>, %1/m, %2.<Vetype>\;fsub\t%0.<Vetype>, %1/m, %0.<Vetype>, #%N3 + # + #" + "&& reload_completed + && register_operand (operands[4], <MODE>mode) + && !rtx_equal_p (operands[0], operands[4])" + { + emit_insn (gen_vcond_mask_<mode><vpred> (operands[0], operands[2], + operands[4], operands[1])); + operands[4] = operands[2] = operands[0]; + } + [(set_attr "movprfx" "yes")] +) + ;; Register merging forms are handled through SVE_COND_FP_BINARY. ;; ------------------------------------------------------------------------- @@ -5110,19 +5444,19 @@ ) ;; Predicated FCADD, merging with the first input. -(define_insn_and_rewrite "*cond_<optab><mode>_2" +(define_insn_and_rewrite "*cond_<optab><mode>_2_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") (unspec:SVE_FULL_F [(match_operand 4) - (match_operand:SI 5 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "0, w") (match_operand:SVE_FULL_F 3 "register_operand" "w, w")] SVE_COND_FCADD) (match_dup 2)] UNSPEC_SEL))] - "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" + "TARGET_SVE" "@ fcadd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>, #<rot> movprfx\t%0, %2\;fcadd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>, #<rot>" @@ -5133,22 +5467,39 @@ [(set_attr "movprfx" "*,yes")] ) +(define_insn "*cond_<optab><mode>_2_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "0, w") + (match_operand:SVE_FULL_F 3 "register_operand" "w, w")] + SVE_COND_FCADD) + (match_dup 2)] + UNSPEC_SEL))] + "TARGET_SVE" + "@ + fcadd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>, #<rot> + movprfx\t%0, %2\;fcadd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>, #<rot>" + [(set_attr "movprfx" "*,yes")] +) + ;; Predicated FCADD, merging with an independent value. 
-(define_insn_and_rewrite "*cond_<optab><mode>_any" +(define_insn_and_rewrite "*cond_<optab><mode>_any_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, &w, &w, ?&w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl") (unspec:SVE_FULL_F [(match_operand 5) - (match_operand:SI 6 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "w, 0, w, w") (match_operand:SVE_FULL_F 3 "register_operand" "w, w, w, w")] SVE_COND_FCADD) (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "Dz, Dz, 0, w")] UNSPEC_SEL))] - "TARGET_SVE - && !rtx_equal_p (operands[2], operands[4]) - && aarch64_sve_pred_dominates_p (&operands[5], operands[1])" + "TARGET_SVE && !rtx_equal_p (operands[2], operands[4])" "@ movprfx\t%0.<Vetype>, %1/z, %2.<Vetype>\;fcadd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>, #<rot> movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;fcadd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>, #<rot> @@ -5172,6 +5523,35 @@ [(set_attr "movprfx" "yes")] ) +(define_insn_and_rewrite "*cond_<optab><mode>_any_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, &w, &w, ?&w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "w, 0, w, w") + (match_operand:SVE_FULL_F 3 "register_operand" "w, w, w, w")] + SVE_COND_FCADD) + (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "Dz, Dz, 0, w")] + UNSPEC_SEL))] + "TARGET_SVE && !rtx_equal_p (operands[2], operands[4])" + "@ + movprfx\t%0.<Vetype>, %1/z, %2.<Vetype>\;fcadd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>, #<rot> + movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;fcadd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>, #<rot> + movprfx\t%0.<Vetype>, %1/m, %2.<Vetype>\;fcadd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>, #<rot> + #" + "&& reload_completed + && register_operand (operands[4], <MODE>mode) + && !rtx_equal_p (operands[0], operands[4])" + { + emit_insn (gen_vcond_mask_<mode><vpred> (operands[0], operands[2], + operands[4], operands[1])); + operands[4] = operands[2] = operands[0]; + } + [(set_attr "movprfx" "yes")] +) + ;; ------------------------------------------------------------------------- ;; ---- [FP] Subtraction ;; ------------------------------------------------------------------------- @@ -5209,19 +5589,19 @@ ;; Predicated floating-point subtraction from a constant, merging with the ;; second input. 
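A hypothetical intrinsics-level counterpart of the reversed-subtraction patterns introduced by the comment above (illustrative sketch only, not part of the patch):

#include <arm_sve.h>

/* Computes 1.0 - X on active lanes and keeps X on inactive lanes,
   the shape matched by the *cond_sub<mode>_3_const_* patterns that
   follow (FSUBR with an immediate).  */
svfloat32_t
one_minus_merge (svbool_t pg, svfloat32_t x)
{
  return svsubr_n_f32_m (pg, x, 1.0f);
}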
-(define_insn_and_rewrite "*cond_sub<mode>_3_const" +(define_insn_and_rewrite "*cond_sub<mode>_3_const_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") (unspec:SVE_FULL_F [(match_operand 4) - (match_operand:SI 5 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "aarch64_sve_float_arith_immediate") (match_operand:SVE_FULL_F 3 "register_operand" "0, w")] UNSPEC_COND_FSUB) (match_dup 3)] UNSPEC_SEL))] - "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" + "TARGET_SVE" "@ fsubr\t%0.<Vetype>, %1/m, %0.<Vetype>, #%2 movprfx\t%0, %3\;fsubr\t%0.<Vetype>, %1/m, %0.<Vetype>, #%2" @@ -5232,23 +5612,40 @@ [(set_attr "movprfx" "*,yes")] ) +(define_insn "*cond_sub<mode>_3_const_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "aarch64_sve_float_arith_immediate") + (match_operand:SVE_FULL_F 3 "register_operand" "0, w")] + UNSPEC_COND_FSUB) + (match_dup 3)] + UNSPEC_SEL))] + "TARGET_SVE" + "@ + fsubr\t%0.<Vetype>, %1/m, %0.<Vetype>, #%2 + movprfx\t%0, %3\;fsubr\t%0.<Vetype>, %1/m, %0.<Vetype>, #%2" + [(set_attr "movprfx" "*,yes")] +) + ;; Predicated floating-point subtraction from a constant, merging with an ;; independent value. -(define_insn_and_rewrite "*cond_sub<mode>_any_const" +(define_insn_and_rewrite "*cond_sub<mode>_const_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, w, ?w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl") (unspec:SVE_FULL_F [(match_operand 5) - (match_operand:SI 6 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "aarch64_sve_float_arith_immediate") (match_operand:SVE_FULL_F 3 "register_operand" "w, w, w")] UNSPEC_COND_FSUB) (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "Dz, 0, w")] UNSPEC_SEL))] - "TARGET_SVE - && !rtx_equal_p (operands[3], operands[4]) - && aarch64_sve_pred_dominates_p (&operands[5], operands[1])" + "TARGET_SVE && !rtx_equal_p (operands[3], operands[4])" "@ movprfx\t%0.<Vetype>, %1/z, %3.<Vetype>\;fsubr\t%0.<Vetype>, %1/m, %0.<Vetype>, #%2 movprfx\t%0.<Vetype>, %1/m, %3.<Vetype>\;fsubr\t%0.<Vetype>, %1/m, %0.<Vetype>, #%2 @@ -5271,6 +5668,33 @@ [(set_attr "movprfx" "yes")] ) +(define_insn_and_rewrite "*cond_sub<mode>_const_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, w, ?w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "aarch64_sve_float_arith_immediate") + (match_operand:SVE_FULL_F 3 "register_operand" "w, w, w")] + UNSPEC_COND_FSUB) + (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "Dz, 0, w")] + UNSPEC_SEL))] + "TARGET_SVE && !rtx_equal_p (operands[3], operands[4])" + "@ + movprfx\t%0.<Vetype>, %1/z, %3.<Vetype>\;fsubr\t%0.<Vetype>, %1/m, %0.<Vetype>, #%2 + movprfx\t%0.<Vetype>, %1/m, %3.<Vetype>\;fsubr\t%0.<Vetype>, %1/m, %0.<Vetype>, #%2 + #" + "&& reload_completed + && register_operand (operands[4], <MODE>mode) + && !rtx_equal_p (operands[0], operands[4])" + { + emit_insn (gen_vcond_mask_<mode><vpred> (operands[0], operands[3], + operands[4], operands[1])); + operands[4] = operands[3] = operands[0]; + } + [(set_attr "movprfx" "yes")] +) ;; 
Register merging forms are handled through SVE_COND_FP_BINARY. ;; ------------------------------------------------------------------------- @@ -5297,19 +5721,19 @@ ) ;; Predicated floating-point absolute difference. -(define_insn_and_rewrite "*aarch64_pred_abd<mode>" +(define_insn_and_rewrite "*aarch64_pred_abd<mode>_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") (match_operand:SI 4 "aarch64_sve_gp_strictness") (unspec:SVE_FULL_F [(match_operand 5) - (match_operand:SI 6 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "%0, w") (match_operand:SVE_FULL_F 3 "register_operand" "w, w")] UNSPEC_COND_FSUB)] UNSPEC_COND_FABS))] - "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[5], operands[1])" + "TARGET_SVE" "@ fabd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> movprfx\t%0, %2\;fabd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>" @@ -5320,6 +5744,25 @@ [(set_attr "movprfx" "*,yes")] ) +(define_insn "*aarch64_pred_abd<mode>_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") + (match_operand:SI 4 "aarch64_sve_gp_strictness") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "%0, w") + (match_operand:SVE_FULL_F 3 "register_operand" "w, w")] + UNSPEC_COND_FSUB)] + UNSPEC_COND_FABS))] + "TARGET_SVE" + "@ + fabd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> + movprfx\t%0, %2\;fabd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>" + [(set_attr "movprfx" "*,yes")] +) + (define_expand "@aarch64_cond_abd<mode>" [(set (match_operand:SVE_FULL_F 0 "register_operand") (unspec:SVE_FULL_F @@ -5344,82 +5787,124 @@ ;; Predicated floating-point absolute difference, merging with the first ;; input. 
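For orientation, a sketch of the source-level operation behind the FABD merging patterns that follow (hypothetical ACLE example, not part of the patch):

#include <arm_sve.h>

/* |A - B| on active lanes, A preserved on inactive lanes.  The
   *aarch64_cond_abd<mode>_2_* patterns below describe this
   merge-with-first-input form as a single predicated FABD.  */
svfloat32_t
abs_diff_merge (svbool_t pg, svfloat32_t a, svfloat32_t b)
{
  return svabd_f32_m (pg, a, b);
}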
-(define_insn_and_rewrite "*aarch64_cond_abd<mode>_2" +(define_insn_and_rewrite "*aarch64_cond_abd<mode>_2_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") (unspec:SVE_FULL_F [(match_operand 4) - (match_operand:SI 5 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (unspec:SVE_FULL_F - [(match_operand 6) - (match_operand:SI 7 "aarch64_sve_gp_strictness") + [(match_operand 5) + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "0, w") (match_operand:SVE_FULL_F 3 "register_operand" "w, w")] UNSPEC_COND_FSUB)] UNSPEC_COND_FABS) (match_dup 2)] UNSPEC_SEL))] - "TARGET_SVE - && aarch64_sve_pred_dominates_p (&operands[4], operands[1]) - && aarch64_sve_pred_dominates_p (&operands[6], operands[1])" + "TARGET_SVE" "@ fabd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> movprfx\t%0, %2\;fabd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>" "&& (!rtx_equal_p (operands[1], operands[4]) - || !rtx_equal_p (operands[1], operands[6]))" + || !rtx_equal_p (operands[1], operands[5]))" { operands[4] = copy_rtx (operands[1]); - operands[6] = copy_rtx (operands[1]); + operands[5] = copy_rtx (operands[1]); } [(set_attr "movprfx" "*,yes")] ) +(define_insn "*aarch64_cond_abd<mode>_2_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (match_operand:SI 4 "aarch64_sve_gp_strictness") + (unspec:SVE_FULL_F + [(match_dup 1) + (match_operand:SI 5 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 2 "register_operand" "0, w") + (match_operand:SVE_FULL_F 3 "register_operand" "w, w")] + UNSPEC_COND_FSUB)] + UNSPEC_COND_FABS) + (match_dup 2)] + UNSPEC_SEL))] + "TARGET_SVE" + "@ + fabd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> + movprfx\t%0, %2\;fabd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>" + [(set_attr "movprfx" "*,yes")] +) + ;; Predicated floating-point absolute difference, merging with the second ;; input. 
-(define_insn_and_rewrite "*aarch64_cond_abd<mode>_3" +(define_insn_and_rewrite "*aarch64_cond_abd<mode>_3_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") (unspec:SVE_FULL_F [(match_operand 4) - (match_operand:SI 5 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (unspec:SVE_FULL_F - [(match_operand 6) - (match_operand:SI 7 "aarch64_sve_gp_strictness") + [(match_operand 5) + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "w, w") (match_operand:SVE_FULL_F 3 "register_operand" "0, w")] UNSPEC_COND_FSUB)] UNSPEC_COND_FABS) (match_dup 3)] UNSPEC_SEL))] - "TARGET_SVE - && aarch64_sve_pred_dominates_p (&operands[4], operands[1]) - && aarch64_sve_pred_dominates_p (&operands[6], operands[1])" + "TARGET_SVE" "@ fabd\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype> movprfx\t%0, %3\;fabd\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype>" "&& (!rtx_equal_p (operands[1], operands[4]) - || !rtx_equal_p (operands[1], operands[6]))" + || !rtx_equal_p (operands[1], operands[5]))" { operands[4] = copy_rtx (operands[1]); - operands[6] = copy_rtx (operands[1]); + operands[5] = copy_rtx (operands[1]); } [(set_attr "movprfx" "*,yes")] ) +(define_insn "*aarch64_cond_abd<mode>_3_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (match_operand:SI 4 "aarch64_sve_gp_strictness") + (unspec:SVE_FULL_F + [(match_dup 1) + (match_operand:SI 5 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 2 "register_operand" "w, w") + (match_operand:SVE_FULL_F 3 "register_operand" "0, w")] + UNSPEC_COND_FSUB)] + UNSPEC_COND_FABS) + (match_dup 3)] + UNSPEC_SEL))] + "TARGET_SVE" + "@ + fabd\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype> + movprfx\t%0, %3\;fabd\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype>" + [(set_attr "movprfx" "*,yes")] +) + ;; Predicated floating-point absolute difference, merging with an ;; independent value. 
-(define_insn_and_rewrite "*aarch64_cond_abd<mode>_any" +(define_insn_and_rewrite "*aarch64_cond_abd<mode>_any_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, &w, &w, &w, ?&w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl, Upl") (unspec:SVE_FULL_F [(match_operand 5) - (match_operand:SI 6 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (unspec:SVE_FULL_F - [(match_operand 7) - (match_operand:SI 8 "aarch64_sve_gp_strictness") + [(match_operand 6) + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "0, w, w, w, w") (match_operand:SVE_FULL_F 3 "register_operand" "w, 0, w, w, w")] UNSPEC_COND_FSUB)] @@ -5428,9 +5913,7 @@ UNSPEC_SEL))] "TARGET_SVE && !rtx_equal_p (operands[2], operands[4]) - && !rtx_equal_p (operands[3], operands[4]) - && aarch64_sve_pred_dominates_p (&operands[5], operands[1]) - && aarch64_sve_pred_dominates_p (&operands[7], operands[1])" + && !rtx_equal_p (operands[3], operands[4])" "@ movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;fabd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;fabd\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype> @@ -5440,18 +5923,18 @@ "&& 1" { if (reload_completed - && register_operand (operands[4], <MODE>mode) - && !rtx_equal_p (operands[0], operands[4])) + && register_operand (operands[4], <MODE>mode) + && !rtx_equal_p (operands[0], operands[4])) { emit_insn (gen_vcond_mask_<mode><vpred> (operands[0], operands[3], operands[4], operands[1])); operands[4] = operands[3] = operands[0]; } else if (!rtx_equal_p (operands[1], operands[5]) - || !rtx_equal_p (operands[1], operands[7])) + || !rtx_equal_p (operands[1], operands[6])) { operands[5] = copy_rtx (operands[1]); - operands[7] = copy_rtx (operands[1]); + operands[6] = copy_rtx (operands[1]); } else FAIL; @@ -5459,6 +5942,42 @@ [(set_attr "movprfx" "yes")] ) +(define_insn_and_rewrite "*aarch64_cond_abd<mode>_any_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, &w, &w, &w, ?&w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (match_operand:SI 5 "aarch64_sve_gp_strictness") + (unspec:SVE_FULL_F + [(match_dup 1) + (match_operand:SI 6 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 2 "register_operand" "0, w, w, w, w") + (match_operand:SVE_FULL_F 3 "register_operand" "w, 0, w, w, w")] + UNSPEC_COND_FSUB)] + UNSPEC_COND_FABS) + (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "Dz, Dz, Dz, 0, w")] + UNSPEC_SEL))] + "TARGET_SVE + && !rtx_equal_p (operands[2], operands[4]) + && !rtx_equal_p (operands[3], operands[4])" + "@ + movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;fabd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> + movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;fabd\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype> + movprfx\t%0.<Vetype>, %1/z, %2.<Vetype>\;fabd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> + movprfx\t%0.<Vetype>, %1/m, %2.<Vetype>\;fabd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> + #" + "&& reload_completed + && register_operand (operands[4], <MODE>mode) + && !rtx_equal_p (operands[0], operands[4])" + { + emit_insn (gen_vcond_mask_<mode><vpred> (operands[0], operands[3], + operands[4], operands[1])); + operands[4] = operands[3] = operands[0]; + } + [(set_attr "movprfx" "yes")] +) + ;; ------------------------------------------------------------------------- ;; ---- [FP] Multiplication ;; ------------------------------------------------------------------------- @@ 
-6384,20 +6903,20 @@ ;; Predicated floating-point ternary operations, merging with the ;; first input. -(define_insn_and_rewrite "*cond_<optab><mode>_2" +(define_insn_and_rewrite "*cond_<optab><mode>_2_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") (unspec:SVE_FULL_F [(match_operand 5) - (match_operand:SI 6 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "0, w") (match_operand:SVE_FULL_F 3 "register_operand" "w, w") (match_operand:SVE_FULL_F 4 "register_operand" "w, w")] SVE_COND_FP_TERNARY) (match_dup 2)] UNSPEC_SEL))] - "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[5], operands[1])" + "TARGET_SVE" "@ <sve_fmad_op>\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype> movprfx\t%0, %2\;<sve_fmad_op>\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>" @@ -6408,22 +6927,42 @@ [(set_attr "movprfx" "*,yes")] ) +(define_insn "*cond_<optab><mode>_2_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "0, w") + (match_operand:SVE_FULL_F 3 "register_operand" "w, w") + (match_operand:SVE_FULL_F 4 "register_operand" "w, w")] + SVE_COND_FP_TERNARY) + (match_dup 2)] + UNSPEC_SEL))] + "TARGET_SVE" + "@ + <sve_fmad_op>\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype> + movprfx\t%0, %2\;<sve_fmad_op>\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>" + [(set_attr "movprfx" "*,yes")] +) + ;; Predicated floating-point ternary operations, merging with the ;; third input. -(define_insn_and_rewrite "*cond_<optab><mode>_4" +(define_insn_and_rewrite "*cond_<optab><mode>_4_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") (unspec:SVE_FULL_F [(match_operand 5) - (match_operand:SI 6 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "w, w") (match_operand:SVE_FULL_F 3 "register_operand" "w, w") (match_operand:SVE_FULL_F 4 "register_operand" "0, w")] SVE_COND_FP_TERNARY) (match_dup 4)] UNSPEC_SEL))] - "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[5], operands[1])" + "TARGET_SVE" "@ <sve_fmla_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype> movprfx\t%0, %4\;<sve_fmla_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>" @@ -6434,15 +6973,35 @@ [(set_attr "movprfx" "*,yes")] ) +(define_insn "*cond_<optab><mode>_4_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "w, w") + (match_operand:SVE_FULL_F 3 "register_operand" "w, w") + (match_operand:SVE_FULL_F 4 "register_operand" "0, w")] + SVE_COND_FP_TERNARY) + (match_dup 4)] + UNSPEC_SEL))] + "TARGET_SVE" + "@ + <sve_fmla_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype> + movprfx\t%0, %4\;<sve_fmla_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>" + [(set_attr "movprfx" "*,yes")] +) + ;; Predicated floating-point ternary operations, merging with an ;; independent value. 
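An illustrative ACLE-level equivalent of the fused multiply-add forms handled here (a sketch, not part of the patch); the select-with-fallback case is what the *cond_<optab><mode>_any_* patterns below cover:

#include <arm_sve.h>

/* ACC + A * B on active lanes, ACC on inactive lanes: the
   merge-with-accumulator (third input) case shown above.  */
svfloat32_t
fma_merge (svbool_t pg, svfloat32_t acc, svfloat32_t a, svfloat32_t b)
{
  return svmla_f32_m (pg, acc, a, b);
}

/* Same computation, but inactive lanes come from an unrelated vector;
   this generally needs a MOVPRFX before the FMLA/FMAD.  */
svfloat32_t
fma_select (svbool_t pg, svfloat32_t acc, svfloat32_t a, svfloat32_t b,
	    svfloat32_t other)
{
  return svsel_f32 (pg, svmla_f32_m (pg, acc, a, b), other);
}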
-(define_insn_and_rewrite "*cond_<optab><mode>_any" +(define_insn_and_rewrite "*cond_<optab><mode>_any_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, &w, &w, &w, &w, ?&w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl") (unspec:SVE_FULL_F [(match_operand 6) - (match_operand:SI 7 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "w, w, 0, w, w, w") (match_operand:SVE_FULL_F 3 "register_operand" "w, w, w, 0, w, w") (match_operand:SVE_FULL_F 4 "register_operand" "w, 0, w, w, w, w")] @@ -6452,8 +7011,7 @@ "TARGET_SVE && !rtx_equal_p (operands[2], operands[5]) && !rtx_equal_p (operands[3], operands[5]) - && !rtx_equal_p (operands[4], operands[5]) - && aarch64_sve_pred_dominates_p (&operands[6], operands[1])" + && !rtx_equal_p (operands[4], operands[5])" "@ movprfx\t%0.<Vetype>, %1/z, %4.<Vetype>\;<sve_fmla_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype> movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;<sve_fmla_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype> @@ -6479,6 +7037,41 @@ [(set_attr "movprfx" "yes")] ) +(define_insn_and_rewrite "*cond_<optab><mode>_any_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, &w, &w, &w, &w, ?&w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "w, w, 0, w, w, w") + (match_operand:SVE_FULL_F 3 "register_operand" "w, w, w, 0, w, w") + (match_operand:SVE_FULL_F 4 "register_operand" "w, 0, w, w, w, w")] + SVE_COND_FP_TERNARY) + (match_operand:SVE_FULL_F 5 "aarch64_simd_reg_or_zero" "Dz, Dz, Dz, Dz, 0, w")] + UNSPEC_SEL))] + "TARGET_SVE + && !rtx_equal_p (operands[2], operands[5]) + && !rtx_equal_p (operands[3], operands[5]) + && !rtx_equal_p (operands[4], operands[5])" + "@ + movprfx\t%0.<Vetype>, %1/z, %4.<Vetype>\;<sve_fmla_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype> + movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;<sve_fmla_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype> + movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;<sve_fmad_op>\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype> + movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;<sve_fmad_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %4.<Vetype> + movprfx\t%0.<Vetype>, %1/m, %4.<Vetype>\;<sve_fmla_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype> + #" + "&& reload_completed + && register_operand (operands[5], <MODE>mode) + && !rtx_equal_p (operands[0], operands[5])" + { + emit_insn (gen_vcond_mask_<mode><vpred> (operands[0], operands[4], + operands[5], operands[1])); + operands[5] = operands[4] = operands[0]; + } + [(set_attr "movprfx" "yes")] +) + ;; Unpredicated FMLA and FMLS by selected lanes. It doesn't seem worth using ;; (fma ...) since target-independent code won't understand the indexing. (define_insn "@aarch64_<optab>_lane_<mode>" @@ -6540,20 +7133,20 @@ ) ;; Predicated FCMLA, merging with the third input. 
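A brief intrinsics sketch of the complex multiply-accumulate case introduced by the comment above (hypothetical example, not part of the patch):

#include <arm_sve.h>

/* Complex FMA with a 90-degree rotation: ACC accumulates A * B
   (rotated) on active lanes and is preserved on inactive lanes,
   i.e. merging with the third input as in the *cond_<optab><mode>_4_*
   FCMLA patterns that follow.  */
svfloat32_t
cmla_merge (svbool_t pg, svfloat32_t acc, svfloat32_t a, svfloat32_t b)
{
  return svcmla_f32_m (pg, acc, a, b, 90);
}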
-(define_insn_and_rewrite "*cond_<optab><mode>_4" +(define_insn_and_rewrite "*cond_<optab><mode>_4_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") (unspec:SVE_FULL_F [(match_operand 5) - (match_operand:SI 6 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "w, w") (match_operand:SVE_FULL_F 3 "register_operand" "w, w") (match_operand:SVE_FULL_F 4 "register_operand" "0, w")] SVE_COND_FCMLA) (match_dup 4)] UNSPEC_SEL))] - "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[5], operands[1])" + "TARGET_SVE" "@ fcmla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>, #<rot> movprfx\t%0, %4\;fcmla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>, #<rot>" @@ -6564,23 +7157,41 @@ [(set_attr "movprfx" "*,yes")] ) +(define_insn "*cond_<optab><mode>_4_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "w, w") + (match_operand:SVE_FULL_F 3 "register_operand" "w, w") + (match_operand:SVE_FULL_F 4 "register_operand" "0, w")] + SVE_COND_FCMLA) + (match_dup 4)] + UNSPEC_SEL))] + "TARGET_SVE" + "@ + fcmla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>, #<rot> + movprfx\t%0, %4\;fcmla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>, #<rot>" + [(set_attr "movprfx" "*,yes")] +) + ;; Predicated FCMLA, merging with an independent value. -(define_insn_and_rewrite "*cond_<optab><mode>_any" +(define_insn_and_rewrite "*cond_<optab><mode>_any_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, &w, &w, ?&w") (unspec:SVE_FULL_F [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl") (unspec:SVE_FULL_F [(match_operand 6) - (match_operand:SI 7 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "w, w, w, w") (match_operand:SVE_FULL_F 3 "register_operand" "w, w, w, w") (match_operand:SVE_FULL_F 4 "register_operand" "w, 0, w, w")] SVE_COND_FCMLA) (match_operand:SVE_FULL_F 5 "aarch64_simd_reg_or_zero" "Dz, Dz, 0, w")] UNSPEC_SEL))] - "TARGET_SVE - && !rtx_equal_p (operands[4], operands[5]) - && aarch64_sve_pred_dominates_p (&operands[6], operands[1])" + "TARGET_SVE && !rtx_equal_p (operands[4], operands[5])" "@ movprfx\t%0.<Vetype>, %1/z, %4.<Vetype>\;fcmla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>, #<rot> movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;fcmla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>, #<rot> @@ -6604,6 +7215,36 @@ [(set_attr "movprfx" "yes")] ) +(define_insn_and_rewrite "*cond_<optab><mode>_any_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, &w, &w, ?&w") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "w, w, w, w") + (match_operand:SVE_FULL_F 3 "register_operand" "w, w, w, w") + (match_operand:SVE_FULL_F 4 "register_operand" "w, 0, w, w")] + SVE_COND_FCMLA) + (match_operand:SVE_FULL_F 5 "aarch64_simd_reg_or_zero" "Dz, Dz, 0, w")] + UNSPEC_SEL))] + "TARGET_SVE && !rtx_equal_p (operands[4], operands[5])" + "@ + movprfx\t%0.<Vetype>, %1/z, %4.<Vetype>\;fcmla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>, #<rot> + movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;fcmla\t%0.<Vetype>, %1/m, %2.<Vetype>, 
%3.<Vetype>, #<rot> + movprfx\t%0.<Vetype>, %1/m, %4.<Vetype>\;fcmla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>, #<rot> + #" + "&& reload_completed + && register_operand (operands[5], <MODE>mode) + && !rtx_equal_p (operands[0], operands[5])" + { + emit_insn (gen_vcond_mask_<mode><vpred> (operands[0], operands[4], + operands[5], operands[1])); + operands[5] = operands[4] = operands[0]; + } + [(set_attr "movprfx" "yes")] +) + ;; Unpredicated FCMLA with indexing. (define_insn "@aarch64_<optab>_lane_<mode>" [(set (match_operand:SVE_FULL_HSF 0 "register_operand" "=w, ?&w") @@ -7296,34 +7937,52 @@ "TARGET_SVE" ) -(define_insn_and_rewrite "*aarch64_pred_fac<cmp_op><mode>" +(define_insn_and_rewrite "*aarch64_pred_fac<cmp_op><mode>_relaxed" [(set (match_operand:<VPRED> 0 "register_operand" "=Upa") (unspec:<VPRED> [(match_operand:<VPRED> 1 "register_operand" "Upl") (match_operand:SI 4 "aarch64_sve_ptrue_flag") (unspec:SVE_FULL_F [(match_operand 5) - (match_operand:SI 6 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "w")] UNSPEC_COND_FABS) (unspec:SVE_FULL_F - [(match_operand 7) - (match_operand:SI 8 "aarch64_sve_gp_strictness") + [(match_operand 6) + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 3 "register_operand" "w")] UNSPEC_COND_FABS)] SVE_COND_FP_ABS_CMP))] - "TARGET_SVE - && aarch64_sve_pred_dominates_p (&operands[5], operands[1]) - && aarch64_sve_pred_dominates_p (&operands[7], operands[1])" + "TARGET_SVE" "fac<cmp_op>\t%0.<Vetype>, %1/z, %2.<Vetype>, %3.<Vetype>" "&& (!rtx_equal_p (operands[1], operands[5]) - || !rtx_equal_p (operands[1], operands[7]))" + || !rtx_equal_p (operands[1], operands[6]))" { operands[5] = copy_rtx (operands[1]); - operands[7] = copy_rtx (operands[1]); + operands[6] = copy_rtx (operands[1]); } ) +(define_insn "*aarch64_pred_fac<cmp_op><mode>_strict" + [(set (match_operand:<VPRED> 0 "register_operand" "=Upa") + (unspec:<VPRED> + [(match_operand:<VPRED> 1 "register_operand" "Upl") + (match_operand:SI 4 "aarch64_sve_ptrue_flag") + (unspec:SVE_FULL_F + [(match_dup 1) + (match_operand:SI 5 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 2 "register_operand" "w")] + UNSPEC_COND_FABS) + (unspec:SVE_FULL_F + [(match_dup 1) + (match_operand:SI 6 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 3 "register_operand" "w")] + UNSPEC_COND_FABS)] + SVE_COND_FP_ABS_CMP))] + "TARGET_SVE" + "fac<cmp_op>\t%0.<Vetype>, %1/z, %2.<Vetype>, %3.<Vetype>" +) + ;; ------------------------------------------------------------------------- ;; ---- [PRED] Select ;; ------------------------------------------------------------------------- @@ -7905,20 +8564,18 @@ ;; the same register (despite having different modes). Making all the ;; alternatives earlyclobber makes things more consistent for the ;; register allocator. 
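As a rough illustration of the conversions these patterns handle (assumed ACLE intrinsics, not part of the patch); the earlyclobber float-to-integer pattern discussed in the comment above follows this aside:

#include <arm_sve.h>

/* FCVTZS with merging: active lanes of X are converted to signed
   integers, inactive lanes take their value from FALLBACK.  */
svint32_t
to_int_merge (svbool_t pg, svfloat32_t x, svint32_t fallback)
{
  return svcvt_s32_f32_m (fallback, pg, x);
}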
-(define_insn_and_rewrite "*cond_<optab>_nontrunc<SVE_FULL_F:mode><SVE_FULL_HSDI:mode>" +(define_insn_and_rewrite "*cond_<optab>_nontrunc<SVE_FULL_F:mode><SVE_FULL_HSDI:mode>_relaxed" [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=&w, &w, ?&w") (unspec:SVE_FULL_HSDI [(match_operand:<SVE_FULL_HSDI:VPRED> 1 "register_operand" "Upl, Upl, Upl") (unspec:SVE_FULL_HSDI [(match_operand 4) - (match_operand:SI 5 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "w, w, w")] SVE_COND_FCVTI) (match_operand:SVE_FULL_HSDI 3 "aarch64_simd_reg_or_zero" "0, Dz, w")] UNSPEC_SEL))] - "TARGET_SVE - && <SVE_FULL_HSDI:elem_bits> >= <SVE_FULL_F:elem_bits> - && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" + "TARGET_SVE && <SVE_FULL_HSDI:elem_bits> >= <SVE_FULL_F:elem_bits>" "@ fcvtz<su>\t%0.<SVE_FULL_HSDI:Vetype>, %1/m, %2.<SVE_FULL_F:Vetype> movprfx\t%0.<SVE_FULL_HSDI:Vetype>, %1/z, %2.<SVE_FULL_HSDI:Vetype>\;fcvtz<su>\t%0.<SVE_FULL_HSDI:Vetype>, %1/m, %2.<SVE_FULL_F:Vetype> @@ -7930,6 +8587,25 @@ [(set_attr "movprfx" "*,yes,yes")] ) +(define_insn "*cond_<optab>_nontrunc<SVE_FULL_F:mode><SVE_FULL_HSDI:mode>_strict" + [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=&w, &w, ?&w") + (unspec:SVE_FULL_HSDI + [(match_operand:<SVE_FULL_HSDI:VPRED> 1 "register_operand" "Upl, Upl, Upl") + (unspec:SVE_FULL_HSDI + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "w, w, w")] + SVE_COND_FCVTI) + (match_operand:SVE_FULL_HSDI 3 "aarch64_simd_reg_or_zero" "0, Dz, w")] + UNSPEC_SEL))] + "TARGET_SVE && <SVE_FULL_HSDI:elem_bits> >= <SVE_FULL_F:elem_bits>" + "@ + fcvtz<su>\t%0.<SVE_FULL_HSDI:Vetype>, %1/m, %2.<SVE_FULL_F:Vetype> + movprfx\t%0.<SVE_FULL_HSDI:Vetype>, %1/z, %2.<SVE_FULL_HSDI:Vetype>\;fcvtz<su>\t%0.<SVE_FULL_HSDI:Vetype>, %1/m, %2.<SVE_FULL_F:Vetype> + movprfx\t%0, %3\;fcvtz<su>\t%0.<SVE_FULL_HSDI:Vetype>, %1/m, %2.<SVE_FULL_F:Vetype>" + [(set_attr "movprfx" "*,yes,yes")] +) + ;; Predicated narrowing float-to-integer conversion with merging. (define_expand "@cond_<optab>_trunc<VNx2DF_ONLY:mode><VNx4SI_ONLY:mode>" [(set (match_operand:VNx4SI_ONLY 0 "register_operand") @@ -8069,20 +8745,18 @@ ;; the same register (despite having different modes). Making all the ;; alternatives earlyclobber makes things more consistent for the ;; register allocator. 
-(define_insn_and_rewrite "*cond_<optab>_nonextend<SVE_FULL_HSDI:mode><SVE_FULL_F:mode>" +(define_insn_and_rewrite "*cond_<optab>_nonextend<SVE_FULL_HSDI:mode><SVE_FULL_F:mode>_relaxed" [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, &w, ?&w") (unspec:SVE_FULL_F [(match_operand:<SVE_FULL_HSDI:VPRED> 1 "register_operand" "Upl, Upl, Upl") (unspec:SVE_FULL_F [(match_operand 4) - (match_operand:SI 5 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_HSDI 2 "register_operand" "w, w, w")] SVE_COND_ICVTF) (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero" "0, Dz, w")] UNSPEC_SEL))] - "TARGET_SVE - && <SVE_FULL_HSDI:elem_bits> >= <SVE_FULL_F:elem_bits> - && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" + "TARGET_SVE && <SVE_FULL_HSDI:elem_bits> >= <SVE_FULL_F:elem_bits>" "@ <su>cvtf\t%0.<SVE_FULL_F:Vetype>, %1/m, %2.<SVE_FULL_HSDI:Vetype> movprfx\t%0.<SVE_FULL_HSDI:Vetype>, %1/z, %2.<SVE_FULL_HSDI:Vetype>\;<su>cvtf\t%0.<SVE_FULL_F:Vetype>, %1/m, %2.<SVE_FULL_HSDI:Vetype> @@ -8094,6 +8768,25 @@ [(set_attr "movprfx" "*,yes,yes")] ) +(define_insn "*cond_<optab>_nonextend<SVE_FULL_HSDI:mode><SVE_FULL_F:mode>_strict" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, &w, ?&w") + (unspec:SVE_FULL_F + [(match_operand:<SVE_FULL_HSDI:VPRED> 1 "register_operand" "Upl, Upl, Upl") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_HSDI 2 "register_operand" "w, w, w")] + SVE_COND_ICVTF) + (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero" "0, Dz, w")] + UNSPEC_SEL))] + "TARGET_SVE && <SVE_FULL_HSDI:elem_bits> >= <SVE_FULL_F:elem_bits>" + "@ + <su>cvtf\t%0.<SVE_FULL_F:Vetype>, %1/m, %2.<SVE_FULL_HSDI:Vetype> + movprfx\t%0.<SVE_FULL_HSDI:Vetype>, %1/z, %2.<SVE_FULL_HSDI:Vetype>\;<su>cvtf\t%0.<SVE_FULL_F:Vetype>, %1/m, %2.<SVE_FULL_HSDI:Vetype> + movprfx\t%0, %3\;<su>cvtf\t%0.<SVE_FULL_F:Vetype>, %1/m, %2.<SVE_FULL_HSDI:Vetype>" + [(set_attr "movprfx" "*,yes,yes")] +) + ;; Predicated widening integer-to-float conversion with merging. (define_expand "@cond_<optab>_extend<VNx4SI_ONLY:mode><VNx2DF_ONLY:mode>" [(set (match_operand:VNx2DF_ONLY 0 "register_operand") diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md index e18b9fe..0cafd0b 100644 --- a/gcc/config/aarch64/aarch64-sve2.md +++ b/gcc/config/aarch64/aarch64-sve2.md @@ -1890,18 +1890,18 @@ ) ;; These instructions do not take MOVPRFX. 
-(define_insn_and_rewrite "*cond_<sve_fp_op><mode>" +(define_insn_and_rewrite "*cond_<sve_fp_op><mode>_relaxed" [(set (match_operand:SVE_FULL_SDF 0 "register_operand" "=w") (unspec:SVE_FULL_SDF [(match_operand:<VPRED> 1 "register_operand" "Upl") (unspec:SVE_FULL_SDF [(match_operand 4) - (match_operand:SI 5 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:<VNARROW> 2 "register_operand" "w")] SVE2_COND_FP_UNARY_LONG) (match_operand:SVE_FULL_SDF 3 "register_operand" "0")] UNSPEC_SEL))] - "TARGET_SVE2 && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" + "TARGET_SVE2" "<sve_fp_op>\t%0.<Vetype>, %1/m, %2.<Ventype>" "&& !rtx_equal_p (operands[1], operands[4])" { @@ -1909,6 +1909,21 @@ } ) +(define_insn "*cond_<sve_fp_op><mode>_strict" + [(set (match_operand:SVE_FULL_SDF 0 "register_operand" "=w") + (unspec:SVE_FULL_SDF + [(match_operand:<VPRED> 1 "register_operand" "Upl") + (unspec:SVE_FULL_SDF + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:<VNARROW> 2 "register_operand" "w")] + SVE2_COND_FP_UNARY_LONG) + (match_operand:SVE_FULL_SDF 3 "register_operand" "0")] + UNSPEC_SEL))] + "TARGET_SVE2" + "<sve_fp_op>\t%0.<Vetype>, %1/m, %2.<Ventype>" +) + ;; ------------------------------------------------------------------------- ;; ---- [FP<-FP] Narrowing conversions ;; ------------------------------------------------------------------------- @@ -1963,20 +1978,18 @@ "TARGET_SVE2" ) -(define_insn_and_rewrite "*cond_<sve_fp_op><mode>_any" +(define_insn_and_rewrite "*cond_<sve_fp_op><mode>_any_relaxed" [(set (match_operand:VNx4SF_ONLY 0 "register_operand" "=&w, &w, &w") (unspec:VNx4SF_ONLY [(match_operand:<VWIDE_PRED> 1 "register_operand" "Upl, Upl, Upl") (unspec:VNx4SF_ONLY [(match_operand 4) - (match_operand:SI 5 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:<VWIDE> 2 "register_operand" "w, w, w")] SVE2_COND_FP_UNARY_NARROWB) (match_operand:VNx4SF_ONLY 3 "aarch64_simd_reg_or_zero" "0, Dz, w")] UNSPEC_SEL))] - "TARGET_SVE2 - && !rtx_equal_p (operands[2], operands[3]) - && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" + "TARGET_SVE2 && !rtx_equal_p (operands[2], operands[3])" "@ <sve_fp_op>\t%0.<Vetype>, %1/m, %2.<Vewtype> movprfx\t%0.<Vewtype>, %1/z, %2.<Vewtype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %2.<Vewtype> @@ -1988,6 +2001,25 @@ [(set_attr "movprfx" "*,yes,yes")] ) +(define_insn "*cond_<sve_fp_op><mode>_any_strict" + [(set (match_operand:VNx4SF_ONLY 0 "register_operand" "=&w, &w, &w") + (unspec:VNx4SF_ONLY + [(match_operand:<VWIDE_PRED> 1 "register_operand" "Upl, Upl, Upl") + (unspec:VNx4SF_ONLY + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:<VWIDE> 2 "register_operand" "w, w, w")] + SVE2_COND_FP_UNARY_NARROWB) + (match_operand:VNx4SF_ONLY 3 "aarch64_simd_reg_or_zero" "0, Dz, w")] + UNSPEC_SEL))] + "TARGET_SVE2 && !rtx_equal_p (operands[2], operands[3])" + "@ + <sve_fp_op>\t%0.<Vetype>, %1/m, %2.<Vewtype> + movprfx\t%0.<Vewtype>, %1/z, %2.<Vewtype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %2.<Vewtype> + movprfx\t%0, %3\;<sve_fp_op>\t%0.<Vetype>, %1/m, %2.<Vewtype>" + [(set_attr "movprfx" "*,yes,yes")] +) + ;; Predicated FCVTXNT. This doesn't give a natural aarch64_pred_*/cond_* ;; pair because the even elements always have to be supplied for active ;; elements, even if the inactive elements don't matter. 
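The next hunk applies the same relaxed/strict split to the SVE2 predicated FP-to-integer unary operations (SVE2_COND_INT_UNARY_FP, which covers FLOGB). A hypothetical intrinsics-level example, assuming the SVE2 ACLE svlogb intrinsic and compilation with SVE2 enabled; it is a sketch, not part of the patch:

#include <arm_sve.h>

/* FLOGB with merging: exponents of the active lanes of X, inactive
   lanes taken from FALLBACK.  */
svint32_t
logb_merge (svbool_t pg, svfloat32_t x, svint32_t fallback)
{
  return svlogb_f32_m (fallback, pg, x);
}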
@@ -2113,14 +2145,12 @@ [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl") (unspec:<V_INT_EQUIV> [(match_operand 4) - (match_operand:SI 5 "aarch64_sve_gp_strictness") + (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 2 "register_operand" "w, w, w")] SVE2_COND_INT_UNARY_FP) (match_operand:<V_INT_EQUIV> 3 "aarch64_simd_reg_or_zero" "0, Dz, w")] UNSPEC_SEL))] - "TARGET_SVE2 - && !rtx_equal_p (operands[2], operands[3]) - && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" + "TARGET_SVE2 && !rtx_equal_p (operands[2], operands[3])" "@ <sve_fp_op>\t%0.<Vetype>, %1/m, %2.<Vetype> movprfx\t%0.<Vetype>, %1/z, %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %2.<Vetype> @@ -2132,6 +2162,25 @@ [(set_attr "movprfx" "*,yes,yes")] ) +(define_insn "*cond_<sve_fp_op><mode>_strict" + [(set (match_operand:<V_INT_EQUIV> 0 "register_operand" "=&w, ?&w, ?&w") + (unspec:<V_INT_EQUIV> + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl") + (unspec:<V_INT_EQUIV> + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "w, w, w")] + SVE2_COND_INT_UNARY_FP) + (match_operand:<V_INT_EQUIV> 3 "aarch64_simd_reg_or_zero" "0, Dz, w")] + UNSPEC_SEL))] + "TARGET_SVE2 && !rtx_equal_p (operands[2], operands[3])" + "@ + <sve_fp_op>\t%0.<Vetype>, %1/m, %2.<Vetype> + movprfx\t%0.<Vetype>, %1/z, %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %2.<Vetype> + movprfx\t%0, %3\;<sve_fp_op>\t%0.<Vetype>, %1/m, %2.<Vetype>" + [(set_attr "movprfx" "*,yes,yes")] +) + ;; ------------------------------------------------------------------------- ;; ---- [INT] Polynomial multiplication ;; ------------------------------------------------------------------------- diff --git a/gcc/config/aarch64/aarch64-tune.md b/gcc/config/aarch64/aarch64-tune.md index 841af9d..e060302 100644 --- a/gcc/config/aarch64/aarch64-tune.md +++ b/gcc/config/aarch64/aarch64-tune.md @@ -1,5 +1,5 @@ ;; -*- buffer-read-only: t -*- ;; Generated automatically by gentune.sh from aarch64-cores.def (define_attr "tune" - "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa65,cortexa65ae,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,tsv110,thunderx3t110,zeus,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55" + "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa65,cortexa65ae,cortexx1,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,zeus,neoversev1,saphira,neoversen2,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82" (const (symbol_ref "((enum attr_tune) aarch64_tune)"))) diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 17dbe67..3cf20ea 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -868,6 +868,17 @@ static const cpu_prefetch_tune xgene1_prefetch_tune = -1 
/* default_opt_level */ }; +static const cpu_prefetch_tune a64fx_prefetch_tune = +{ + 8, /* num_slots */ + 64, /* l1_cache_size */ + 256, /* l1_cache_line_size */ + 32768, /* l2_cache_size */ + true, /* prefetch_dynamic_strides */ + -1, /* minimum_stride */ + -1 /* default_opt_level */ +}; + static const struct tune_params generic_tunings = { &cortexa57_extra_costs, @@ -1325,6 +1336,58 @@ static const struct tune_params neoversen1_tunings = &generic_prefetch_tune }; +static const struct tune_params neoversev1_tunings = +{ + &cortexa57_extra_costs, + &generic_addrcost_table, + &generic_regmove_cost, + &cortexa57_vector_cost, + &generic_branch_cost, + &generic_approx_modes, + SVE_256, /* sve_width */ + 4, /* memmov_cost */ + 3, /* issue_rate */ + (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */ + "32:16", /* function_align. */ + "4", /* jump_align. */ + "32:16", /* loop_align. */ + 2, /* int_reassoc_width. */ + 4, /* fp_reassoc_width. */ + 2, /* vec_reassoc_width. */ + 2, /* min_div_recip_mul_sf. */ + 2, /* min_div_recip_mul_df. */ + 0, /* max_case_values. */ + tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ + (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ + &generic_prefetch_tune +}; + +static const struct tune_params a64fx_tunings = +{ + &generic_extra_costs, + &generic_addrcost_table, + &generic_regmove_cost, + &generic_vector_cost, + &generic_branch_cost, + &generic_approx_modes, + SVE_512, /* sve_width */ + 4, /* memmov_cost */ + 7, /* issue_rate */ + (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */ + "32", /* function_align. */ + "16", /* jump_align. */ + "32", /* loop_align. */ + 4, /* int_reassoc_width. */ + 2, /* fp_reassoc_width. */ + 2, /* vec_reassoc_width. */ + 2, /* min_div_recip_mul_sf. */ + 2, /* min_div_recip_mul_df. */ + 0, /* max_case_values. */ + tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ + (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ + &a64fx_prefetch_tune +}; + /* Support for fine-grained override of the tuning structures. */ struct aarch64_tuning_override_function { @@ -1898,6 +1961,29 @@ aarch64_sve_abi (void) return sve_abi; } +/* If X is an UNSPEC_SALT_ADDR expression, return the address that it + wraps, otherwise return X itself. */ + +static rtx +strip_salt (rtx x) +{ + rtx search = x; + if (GET_CODE (search) == CONST) + search = XEXP (search, 0); + if (GET_CODE (search) == UNSPEC && XINT (search, 1) == UNSPEC_SALT_ADDR) + x = XVECEXP (search, 0, 0); + return x; +} + +/* Like strip_offset, but also strip any UNSPEC_SALT_ADDR from the + expression. */ + +static rtx +strip_offset_and_salt (rtx addr, poly_int64 *offset) +{ + return strip_salt (strip_offset (addr, offset)); +} + /* Generate code to enable conditional branches in functions over 1 MiB. 
*/ const char * aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest, @@ -2895,14 +2981,9 @@ static enum tls_model tls_symbolic_operand_type (rtx addr) { enum tls_model tls_kind = TLS_MODEL_NONE; - if (GET_CODE (addr) == CONST) - { - poly_int64 addend; - rtx sym = strip_offset (addr, &addend); - if (GET_CODE (sym) == SYMBOL_REF) - tls_kind = SYMBOL_REF_TLS_MODEL (sym); - } - else if (GET_CODE (addr) == SYMBOL_REF) + poly_int64 offset; + addr = strip_offset_and_salt (addr, &offset); + if (GET_CODE (addr) == SYMBOL_REF) tls_kind = SYMBOL_REF_TLS_MODEL (addr); return tls_kind; @@ -3367,11 +3448,16 @@ aarch64_split_128bit_move (rtx dst, rtx src) } } +/* Return true if we should split a move from 128-bit value SRC + to 128-bit register DEST. */ + bool aarch64_split_128bit_move_p (rtx dst, rtx src) { - return (! REG_P (src) - || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src)))); + if (FP_REGNUM_P (REGNO (dst))) + return REG_P (src) && !FP_REGNUM_P (REGNO (src)); + /* All moves to GPRs need to be split. */ + return true; } /* Split a complex SIMD combine. */ @@ -3657,24 +3743,6 @@ aarch64_pfalse_reg (machine_mode mode) return gen_lowpart (mode, reg); } -/* Return true if predicate PRED1[0] is true whenever predicate PRED2 is - true, or alternatively if we know that the operation predicated by - PRED1[0] is safe to perform whenever PRED2 is true. PRED1[1] is a - aarch64_sve_gp_strictness operand that describes the operation - predicated by PRED1[0]. */ - -bool -aarch64_sve_pred_dominates_p (rtx *pred1, rtx pred2) -{ - machine_mode mode = GET_MODE (pred2); - gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL - && mode == GET_MODE (pred1[0]) - && aarch64_sve_gp_strictness (pred1[1], SImode)); - return (pred1[0] == CONSTM1_RTX (mode) - || INTVAL (pred1[1]) == SVE_RELAXED_GP - || rtx_equal_p (pred1[0], pred2)); -} - /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag for it. PRED2[0] is the predicate for the instruction whose result is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag @@ -5202,6 +5270,48 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm) as_a <scalar_int_mode> (mode)); } +/* Return the MEM rtx that provides the canary value that should be used + for stack-smashing protection. MODE is the mode of the memory. + For SSP_GLOBAL, DECL_RTL is the MEM rtx for the canary variable + (__stack_chk_guard), otherwise it has no useful value. SALT_TYPE + indicates whether the caller is performing a SET or a TEST operation. */ + +rtx +aarch64_stack_protect_canary_mem (machine_mode mode, rtx decl_rtl, + aarch64_salt_type salt_type) +{ + rtx addr; + if (aarch64_stack_protector_guard == SSP_GLOBAL) + { + gcc_assert (MEM_P (decl_rtl)); + addr = XEXP (decl_rtl, 0); + poly_int64 offset; + rtx base = strip_offset_and_salt (addr, &offset); + if (!SYMBOL_REF_P (base)) + return decl_rtl; + + rtvec v = gen_rtvec (2, base, GEN_INT (salt_type)); + addr = gen_rtx_UNSPEC (Pmode, v, UNSPEC_SALT_ADDR); + addr = gen_rtx_CONST (Pmode, addr); + addr = plus_constant (Pmode, addr, offset); + } + else + { + /* Calculate the address from the system register. 
*/ + rtx salt = GEN_INT (salt_type); + addr = gen_reg_rtx (mode); + if (mode == DImode) + emit_insn (gen_reg_stack_protect_address_di (addr, salt)); + else + { + emit_insn (gen_reg_stack_protect_address_si (addr, salt)); + addr = convert_memory_address (Pmode, addr); + } + addr = plus_constant (Pmode, addr, aarch64_stack_protector_guard_offset); + } + return gen_rtx_MEM (mode, force_reg (Pmode, addr)); +} + /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate that is known to contain PTRUE. */ @@ -5980,7 +6090,6 @@ aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum, if (!silent_p && !TARGET_FLOAT - && fndecl && TREE_PUBLIC (fndecl) && fntype && fntype != error_mark_node) { const_tree type = TREE_TYPE (fntype); @@ -6920,6 +7029,9 @@ aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2, case E_TFmode: return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2); + case E_V4SImode: + return gen_vec_store_pairv4siv4si (mem1, reg1, mem2, reg2); + default: gcc_unreachable (); } @@ -6943,6 +7055,9 @@ aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2, case E_TFmode: return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2); + case E_V4SImode: + return gen_load_pairv4siv4si (reg1, mem1, reg2, mem2); + default: gcc_unreachable (); } @@ -6957,6 +7072,17 @@ aarch64_return_address_signing_enabled (void) /* This function should only be called after frame laid out. */ gcc_assert (cfun->machine->frame.laid_out); + /* Turn return address signing off in any function that uses + __builtin_eh_return. The address passed to __builtin_eh_return + is not signed so either it has to be signed (with original sp) + or the code path that uses it has to avoid authenticating it. + Currently eh return introduces a return to anywhere gadget, no + matter what we do here since it uses ret with user provided + address. An ideal fix for that is to use indirect branch which + can be protected with BTI j (to some extent). */ + if (crtl->calls_eh_return) + return false; + /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function if its LR is pushed onto stack. */ return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL @@ -8624,8 +8750,6 @@ aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode) static bool aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x) { - rtx base, offset; - if (GET_CODE (x) == HIGH) return true; @@ -8635,10 +8759,12 @@ aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x) if (GET_CODE (*iter) == CONST_POLY_INT) return true; - split_const (x, &base, &offset); + poly_int64 offset; + rtx base = strip_offset_and_salt (x, &offset); if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF) { - if (aarch64_classify_symbol (base, INTVAL (offset)) + /* We checked for POLY_INT_CST offsets above. 
*/ + if (aarch64_classify_symbol (base, offset.to_constant ()) != SYMBOL_FORCE_TO_MEM) return true; else @@ -9164,9 +9290,8 @@ aarch64_classify_address (struct aarch64_address_info *info, && GET_MODE_SIZE (mode).is_constant (&const_size) && const_size >= 4) { - rtx sym, addend; - - split_const (x, &sym, &addend); + poly_int64 offset; + rtx sym = strip_offset_and_salt (x, &offset); return ((GET_CODE (sym) == LABEL_REF || (GET_CODE (sym) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (sym) @@ -9181,10 +9306,12 @@ aarch64_classify_address (struct aarch64_address_info *info, if (allow_reg_index_p && aarch64_base_register_rtx_p (info->base, strict_p)) { - rtx sym, offs; - split_const (info->offset, &sym, &offs); + poly_int64 offset; + HOST_WIDE_INT const_offset; + rtx sym = strip_offset_and_salt (info->offset, &offset); if (GET_CODE (sym) == SYMBOL_REF - && (aarch64_classify_symbol (sym, INTVAL (offs)) + && offset.is_constant (&const_offset) + && (aarch64_classify_symbol (sym, const_offset) == SYMBOL_SMALL_ABSOLUTE)) { /* The symbol and offset must be aligned to the access size. */ @@ -9210,7 +9337,7 @@ aarch64_classify_address (struct aarch64_address_info *info, if (known_eq (ref_size, 0)) ref_size = GET_MODE_SIZE (DImode); - return (multiple_p (INTVAL (offs), ref_size) + return (multiple_p (const_offset, ref_size) && multiple_p (align / BITS_PER_UNIT, ref_size)); } } @@ -9242,9 +9369,8 @@ aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p) bool aarch64_symbolic_address_p (rtx x) { - rtx offset; - - split_const (x, &x, &offset); + poly_int64 offset; + x = strip_offset_and_salt (x, &offset); return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF; } @@ -9975,27 +10101,16 @@ aarch64_print_operand (FILE *f, rtx x, int code) switch (code) { case 'c': - switch (GET_CODE (x)) + if (CONST_INT_P (x)) + fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x)); + else { - case CONST_INT: - fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x)); - break; - - case SYMBOL_REF: - output_addr_const (f, x); - break; - - case CONST: - if (GET_CODE (XEXP (x, 0)) == PLUS - && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF) - { - output_addr_const (f, x); - break; - } - /* Fall through. */ - - default: - output_operand_lossage ("unsupported operand for code '%c'", code); + poly_int64 offset; + rtx base = strip_offset_and_salt (x, &offset); + if (SYMBOL_REF_P (base)) + output_addr_const (f, x); + else + output_operand_lossage ("unsupported operand for code '%c'", code); } break; @@ -10570,6 +10685,19 @@ aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x) output_addr_const (f, x); } +/* Implement TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */ + +static bool +aarch64_output_addr_const_extra (FILE *file, rtx x) +{ + if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SALT_ADDR) + { + output_addr_const (file, XVECEXP (x, 0, 0)); + return true; + } + return false; +} + bool aarch64_label_mentioned_p (rtx x) { @@ -10825,6 +10953,24 @@ aarch64_initial_elimination_offset (unsigned from, unsigned to) return cfun->machine->frame.frame_size; } + +/* Get return address without mangling. */ + +rtx +aarch64_return_addr_rtx (void) +{ + rtx val = get_hard_reg_initial_val (Pmode, LR_REGNUM); + /* Note: aarch64_return_address_signing_enabled only + works after cfun->machine->frame.laid_out is set, + so here we don't know if the return address will + be signed or not. 
*/ + rtx lr = gen_rtx_REG (Pmode, LR_REGNUM); + emit_move_insn (lr, val); + emit_insn (GEN_FCN (CODE_FOR_xpaclri) ()); + return lr; +} + + /* Implement RETURN_ADDR_RTX. We do not support moving back to a previous frame. */ @@ -10833,7 +10979,7 @@ aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED) { if (count != 0) return const0_rtx; - return get_hard_reg_initial_val (Pmode, LR_REGNUM); + return aarch64_return_addr_rtx (); } static void @@ -11708,8 +11854,6 @@ aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed) if (speed) { machine_mode mode = GET_MODE (XEXP (op1, 0)); - const struct cpu_cost_table *extra_cost - = aarch64_tune_params.insn_extra_cost; if (GET_MODE_CLASS (mode) == MODE_INT) *cost += extra_cost->alu.arith; @@ -14965,7 +15109,7 @@ aarch64_override_options (void) /* Save these options as the default ones in case we push and pop them later while processing functions with potential target attributes. */ target_option_default_node = target_option_current_node - = build_target_option_node (&global_options); + = build_target_option_node (&global_options, &global_options_set); } /* Implement targetm.override_options_after_change. */ @@ -14976,6 +15120,16 @@ aarch64_override_options_after_change (void) aarch64_override_options_after_change_1 (&global_options); } +/* Implement the TARGET_OFFLOAD_OPTIONS hook. */ +static char * +aarch64_offload_options (void) +{ + if (TARGET_ILP32) + return xstrdup ("-foffload-abi=ilp32"); + else + return xstrdup ("-foffload-abi=lp64"); +} + static struct machine_function * aarch64_init_machine_status (void) { @@ -15030,7 +15184,8 @@ initialize_aarch64_code_model (struct gcc_options *opts) /* Implement TARGET_OPTION_SAVE. */ static void -aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts) +aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts, + struct gcc_options */* opts_set */) { ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string; ptr->x_aarch64_branch_protection_string @@ -15041,7 +15196,9 @@ aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts) using the information saved in PTR. */ static void -aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr) +aarch64_option_restore (struct gcc_options *opts, + struct gcc_options */* opts_set */, + struct cl_target_option *ptr) { opts->x_explicit_tune_core = ptr->x_explicit_tune_core; selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core); @@ -15131,7 +15288,8 @@ aarch64_set_current_function (tree fndecl) aarch64_previous_fndecl = fndecl; /* First set the target options. */ - cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree)); + cl_target_option_restore (&global_options, &global_options_set, + TREE_TARGET_OPTION (new_tree)); aarch64_save_restore_target_globals (new_tree); } @@ -15630,17 +15788,18 @@ aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int) } tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl); - old_optimize = build_optimization_node (&global_options); + old_optimize + = build_optimization_node (&global_options, &global_options_set); func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl); /* If the function changed the optimization levels as well as setting target options, start with the optimizations specified. 
*/ if (func_optimize && func_optimize != old_optimize) - cl_optimization_restore (&global_options, + cl_optimization_restore (&global_options, &global_options_set, TREE_OPTIMIZATION (func_optimize)); /* Save the current target options to restore at the end. */ - cl_target_option_save (&cur_target, &global_options); + cl_target_option_save (&cur_target, &global_options, &global_options_set); /* If fndecl already has some target attributes applied to it, unpack them so that we add this attribute on top of them, rather than @@ -15651,11 +15810,12 @@ aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int) = TREE_TARGET_OPTION (existing_target); if (existing_options) - cl_target_option_restore (&global_options, existing_options); + cl_target_option_restore (&global_options, &global_options_set, + existing_options); } else - cl_target_option_restore (&global_options, - TREE_TARGET_OPTION (target_option_current_node)); + cl_target_option_restore (&global_options, &global_options_set, + TREE_TARGET_OPTION (target_option_current_node)); ret = aarch64_process_target_attr (args); @@ -15675,12 +15835,14 @@ aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int) aarch64_init_simd_builtins (); current_target_pragma = saved_current_target_pragma; } - new_target = build_target_option_node (&global_options); + new_target = build_target_option_node (&global_options, + &global_options_set); } else new_target = NULL; - new_optimize = build_optimization_node (&global_options); + new_optimize = build_optimization_node (&global_options, + &global_options_set); if (fndecl && ret) { @@ -15690,10 +15852,10 @@ aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int) DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize; } - cl_target_option_restore (&global_options, &cur_target); + cl_target_option_restore (&global_options, &global_options_set, &cur_target); if (old_optimize != new_optimize) - cl_optimization_restore (&global_options, + cl_optimization_restore (&global_options, &global_options_set, TREE_OPTIMIZATION (old_optimize)); return ret; } @@ -15845,6 +16007,7 @@ aarch64_tls_symbol_p (rtx x) if (! TARGET_HAVE_TLS) return false; + x = strip_salt (x); if (GET_CODE (x) != SYMBOL_REF) return false; @@ -15900,6 +16063,8 @@ aarch64_classify_tls_symbol (rtx x) enum aarch64_symbol_type aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset) { + x = strip_salt (x); + if (GET_CODE (x) == LABEL_REF) { switch (aarch64_cmodel) @@ -15999,11 +16164,10 @@ aarch64_constant_address_p (rtx x) bool aarch64_legitimate_pic_operand_p (rtx x) { - if (GET_CODE (x) == SYMBOL_REF - || (GET_CODE (x) == CONST - && GET_CODE (XEXP (x, 0)) == PLUS - && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)) - return false; + poly_int64 offset; + x = strip_offset_and_salt (x, &offset); + if (GET_CODE (x) == SYMBOL_REF) + return false; return true; } @@ -16049,7 +16213,7 @@ aarch64_legitimate_constant_p (machine_mode mode, rtx x) /* If an offset is being added to something else, we need to allow the base to be moved into the destination register, meaning that there are no free temporaries for the offset. 
*/ - x = strip_offset (x, &offset); + x = strip_offset_and_salt (x, &offset); if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0) return false; @@ -17948,6 +18112,7 @@ aarch64_mov_operand_p (rtx x, machine_mode mode) return aarch64_simd_valid_immediate (x, NULL); } + x = strip_salt (x); if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x)) return true; @@ -19991,6 +20156,8 @@ struct expand_vec_perm_d bool testing_p; }; +static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d); + /* Generate a variable permutation. */ static void @@ -20176,6 +20343,59 @@ aarch64_evpc_trn (struct expand_vec_perm_d *d) return true; } +/* Try to re-encode the PERM constant so it combines odd and even elements. + This rewrites constants such as {0, 1, 4, 5}/V4SF to {0, 2}/V2DI. + We retry with this new constant with the full suite of patterns. */ +static bool +aarch64_evpc_reencode (struct expand_vec_perm_d *d) +{ + expand_vec_perm_d newd; + unsigned HOST_WIDE_INT nelt; + + if (d->vec_flags != VEC_ADVSIMD) + return false; + + /* Get the new mode. Always twice the size of the inner + and half the elements. */ + poly_uint64 vec_bits = GET_MODE_BITSIZE (d->vmode); + unsigned int new_elt_bits = GET_MODE_UNIT_BITSIZE (d->vmode) * 2; + auto new_elt_mode = int_mode_for_size (new_elt_bits, false).require (); + machine_mode new_mode = aarch64_simd_container_mode (new_elt_mode, vec_bits); + + if (new_mode == word_mode) + return false; + + /* to_constant is safe since this routine is specific to Advanced SIMD + vectors. */ + nelt = d->perm.length ().to_constant (); + + vec_perm_builder newpermconst; + newpermconst.new_vector (nelt / 2, nelt / 2, 1); + + /* Convert the perm constant if we can. Require even, odd as the pairs. */ + for (unsigned int i = 0; i < nelt; i += 2) + { + poly_int64 elt0 = d->perm[i]; + poly_int64 elt1 = d->perm[i + 1]; + poly_int64 newelt; + if (!multiple_p (elt0, 2, &newelt) || maybe_ne (elt0 + 1, elt1)) + return false; + newpermconst.quick_push (newelt.to_constant ()); + } + newpermconst.finalize (); + + newd.vmode = new_mode; + newd.vec_flags = VEC_ADVSIMD; + newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL; + newd.op0 = d->op0 ? gen_lowpart (new_mode, d->op0) : NULL; + newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL; + newd.testing_p = d->testing_p; + newd.one_vector_p = d->one_vector_p; + + newd.perm.new_vector (newpermconst, newd.one_vector_p ? 1 : 2, nelt / 2); + return aarch64_expand_vec_perm_const_1 (&newd); +} + /* Recognize patterns suitable for the UZP instructions. */ static bool aarch64_evpc_uzp (struct expand_vec_perm_d *d) @@ -20539,6 +20759,81 @@ aarch64_evpc_sel (struct expand_vec_perm_d *d) return true; } +/* Recognize patterns suitable for the INS instructions. */ +static bool +aarch64_evpc_ins (struct expand_vec_perm_d *d) +{ + machine_mode mode = d->vmode; + unsigned HOST_WIDE_INT nelt; + + if (d->vec_flags != VEC_ADVSIMD) + return false; + + /* to_constant is safe since this routine is specific to Advanced SIMD + vectors. 
*/ + nelt = d->perm.length ().to_constant (); + rtx insv = d->op0; + + HOST_WIDE_INT idx = -1; + + for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++) + { + HOST_WIDE_INT elt; + if (!d->perm[i].is_constant (&elt)) + return false; + if (elt == (HOST_WIDE_INT) i) + continue; + if (idx != -1) + { + idx = -1; + break; + } + idx = i; + } + + if (idx == -1) + { + insv = d->op1; + for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++) + { + if (d->perm[i].to_constant () == (HOST_WIDE_INT) (i + nelt)) + continue; + if (idx != -1) + return false; + idx = i; + } + + if (idx == -1) + return false; + } + + if (d->testing_p) + return true; + + gcc_assert (idx != -1); + + unsigned extractindex = d->perm[idx].to_constant (); + rtx extractv = d->op0; + if (extractindex >= nelt) + { + extractv = d->op1; + extractindex -= nelt; + } + gcc_assert (extractindex < nelt); + + emit_move_insn (d->target, insv); + insn_code icode = code_for_aarch64_simd_vec_copy_lane (mode); + expand_operand ops[5]; + create_output_operand (&ops[0], d->target, mode); + create_input_operand (&ops[1], d->target, mode); + create_integer_operand (&ops[2], 1 << idx); + create_input_operand (&ops[3], extractv, mode); + create_integer_operand (&ops[4], extractindex); + expand_insn (icode, 5, ops); + + return true; +} + static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) { @@ -20573,6 +20868,10 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) return true; else if (aarch64_evpc_sel (d)) return true; + else if (aarch64_evpc_ins (d)) + return true; + else if (aarch64_evpc_reencode (d)) + return true; if (d->vec_flags == VEC_SVE_DATA) return aarch64_evpc_sve_tbl (d); else if (d->vec_flags == VEC_ADVSIMD) @@ -20934,6 +21233,27 @@ static void aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst, machine_mode mode) { + /* Handle 256-bit memcpy separately. We do this by making 2 adjacent memory + address copies using V4SImode so that we can use Q registers. */ + if (known_eq (GET_MODE_BITSIZE (mode), 256)) + { + mode = V4SImode; + rtx reg1 = gen_reg_rtx (mode); + rtx reg2 = gen_reg_rtx (mode); + /* "Cast" the pointers to the correct mode. */ + *src = adjust_address (*src, mode, 0); + *dst = adjust_address (*dst, mode, 0); + /* Emit the memcpy. */ + emit_insn (aarch64_gen_load_pair (mode, reg1, *src, reg2, + aarch64_progress_pointer (*src))); + emit_insn (aarch64_gen_store_pair (mode, *dst, reg1, + aarch64_progress_pointer (*dst), reg2)); + /* Move the pointers forward. */ + *src = aarch64_move_pointer (*src, 32); + *dst = aarch64_move_pointer (*dst, 32); + return; + } + rtx reg = gen_reg_rtx (mode); /* "Cast" the pointers to the correct mode. */ @@ -20987,9 +21307,12 @@ aarch64_expand_cpymem (rtx *operands) /* Convert n to bits to make the rest of the code simpler. */ n = n * BITS_PER_UNIT; - /* Maximum amount to copy in one go. The AArch64 back-end has integer modes - larger than TImode, but we should not use them for loads/stores here. */ - const int copy_limit = GET_MODE_BITSIZE (TImode); + /* Maximum amount to copy in one go. We allow 256-bit chunks based on the + AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter and TARGET_SIMD. */ + const int copy_limit = ((aarch64_tune_params.extra_tuning_flags + & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) + || !TARGET_SIMD) + ? 
GET_MODE_BITSIZE (TImode) : 256; while (n > 0) { @@ -21995,7 +22318,7 @@ aarch64_ldrstr_offset_compare (const void *x, const void *y) bool aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load, - scalar_mode mode) + machine_mode mode) { const int num_insns = 4; enum reg_class rclass; @@ -22072,7 +22395,7 @@ aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load, for (int i = 0; i < num_insns; i++) offvals[i] = INTVAL (offset[i]); - msize = GET_MODE_SIZE (mode); + msize = GET_MODE_SIZE (mode).to_constant (); /* Check if the offsets can be put in the right order to do a ldp/stp. */ qsort (offvals, num_insns, sizeof (HOST_WIDE_INT), @@ -22112,7 +22435,7 @@ aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load, bool aarch64_gen_adjusted_ldpstp (rtx *operands, bool load, - scalar_mode mode, RTX_CODE code) + machine_mode mode, RTX_CODE code) { rtx base, offset_1, offset_3, t1, t2; rtx mem_1, mem_2, mem_3, mem_4; @@ -22151,7 +22474,7 @@ aarch64_gen_adjusted_ldpstp (rtx *operands, bool load, && offset_3 != NULL_RTX); /* Adjust offset so it can fit in LDP/STP instruction. */ - msize = GET_MODE_SIZE (mode); + msize = GET_MODE_SIZE (mode).to_constant(); stp_off_upper_limit = msize * (0x40 - 1); stp_off_lower_limit = - msize * 0x40; @@ -23417,6 +23740,9 @@ aarch64_libgcc_floating_mode_supported_p #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \ aarch64_override_options_after_change +#undef TARGET_OFFLOAD_OPTIONS +#define TARGET_OFFLOAD_OPTIONS aarch64_offload_options + #undef TARGET_OPTION_SAVE #define TARGET_OPTION_SAVE aarch64_option_save @@ -23642,6 +23968,9 @@ aarch64_libgcc_floating_mode_supported_p #undef TARGET_PRINT_OPERAND_ADDRESS #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address +#undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA +#define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA aarch64_output_addr_const_extra + #undef TARGET_OPTAB_SUPPORTED_P #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h index 4534e37..00b5f84 100644 --- a/gcc/config/aarch64/aarch64.h +++ b/gcc/config/aarch64/aarch64.h @@ -161,6 +161,8 @@ extern unsigned aarch64_architecture_version; #define AARCH64_FL_LSE (1 << 4) /* Has Large System Extensions. */ #define AARCH64_FL_RDMA (1 << 5) /* Has Round Double Multiply Add. */ #define AARCH64_FL_V8_1 (1 << 6) /* Has ARMv8.1-A extensions. */ +/* Armv8-R. */ +#define AARCH64_FL_V8_R (1 << 7) /* Armv8-R AArch64. */ /* ARMv8.2-A architecture extensions. */ #define AARCH64_FL_V8_2 (1 << 8) /* Has ARMv8.2-A features. */ #define AARCH64_FL_F16 (1 << 9) /* Has ARMv8.2-A FP16 extensions. */ @@ -246,6 +248,8 @@ extern unsigned aarch64_architecture_version; #define AARCH64_FL_FOR_ARCH8_6 \ (AARCH64_FL_FOR_ARCH8_5 | AARCH64_FL_V8_6 | AARCH64_FL_FPSIMD \ | AARCH64_FL_I8MM | AARCH64_FL_BF16) +#define AARCH64_FL_FOR_ARCH8_R \ + (AARCH64_FL_FOR_ARCH8_4 | AARCH64_FL_V8_R) /* Macros to test ISA flags. */ @@ -282,6 +286,7 @@ extern unsigned aarch64_architecture_version; #define AARCH64_ISA_F64MM (aarch64_isa_flags & AARCH64_FL_F64MM) #define AARCH64_ISA_BF16 (aarch64_isa_flags & AARCH64_FL_BF16) #define AARCH64_ISA_SB (aarch64_isa_flags & AARCH64_FL_SB) +#define AARCH64_ISA_V8_R (aarch64_isa_flags & AARCH64_FL_V8_R) /* Crypto is an optional extension to AdvSIMD. 
*/ #define TARGET_CRYPTO (TARGET_SIMD && AARCH64_ISA_CRYPTO) @@ -1133,7 +1138,7 @@ typedef struct #define PROFILE_HOOK(LABEL) \ { \ rtx fun, lr; \ - lr = get_hard_reg_initial_val (Pmode, LR_REGNUM); \ + lr = aarch64_return_addr_rtx (); \ fun = gen_rtx_SYMBOL_REF (Pmode, MCOUNT_NAME); \ emit_library_call (fun, LCT_NORMAL, VOIDmode, lr, Pmode); \ } diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index d5ca189..78fe7c43 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -281,6 +281,7 @@ UNSPEC_GEN_TAG_RND ; Generate a random 4-bit MTE tag. UNSPEC_TAG_SPACE ; Translate address to MTE tag address space. UNSPEC_LD1RO + UNSPEC_SALT_ADDR ]) (define_c_enum "unspecv" [ @@ -1360,13 +1361,14 @@ (define_insn "*movti_aarch64" [(set (match_operand:TI 0 - "nonimmediate_operand" "= r,w, r,w,r,m,m,w,m") + "nonimmediate_operand" "= r,w,w, r,w,r,m,m,w,m") (match_operand:TI 1 - "aarch64_movti_operand" " rUti,r, w,w,m,r,Z,m,w"))] + "aarch64_movti_operand" " rUti,Z,r, w,w,m,r,Z,m,w"))] "(register_operand (operands[0], TImode) || aarch64_reg_or_zero (operands[1], TImode))" "@ # + movi\\t%0.2d, #0 # # mov\\t%0.16b, %1.16b @@ -1375,11 +1377,11 @@ stp\\txzr, xzr, %0 ldr\\t%q0, %1 str\\t%q1, %0" - [(set_attr "type" "multiple,f_mcr,f_mrc,neon_logic_q, \ + [(set_attr "type" "multiple,neon_move,f_mcr,f_mrc,neon_logic_q, \ load_16,store_16,store_16,\ load_16,store_16") - (set_attr "length" "8,8,8,4,4,4,4,4,4") - (set_attr "arch" "*,*,*,simd,*,*,*,fp,fp")] + (set_attr "length" "8,4,8,8,4,4,4,4,4,4") + (set_attr "arch" "*,simd,*,*,simd,*,*,*,fp,fp")] ) ;; Split a TImode register-register or register-immediate move into @@ -1510,9 +1512,9 @@ (define_insn "*movtf_aarch64" [(set (match_operand:TF 0 - "nonimmediate_operand" "=w,?&r,w ,?r,w,?w,w,m,?r,m ,m") + "nonimmediate_operand" "=w,?r ,w ,?r,w,?w,w,m,?r,m ,m") (match_operand:TF 1 - "general_operand" " w,?r, ?r,w ,Y,Y ,m,w,m ,?r,Y"))] + "general_operand" " w,?rY,?r,w ,Y,Y ,m,w,m ,?r,Y"))] "TARGET_FLOAT && (register_operand (operands[0], TFmode) || aarch64_reg_or_fp_zero (operands[1], TFmode))" "@ @@ -1535,7 +1537,7 @@ (define_split [(set (match_operand:TF 0 "register_operand" "") - (match_operand:TF 1 "aarch64_reg_or_imm" ""))] + (match_operand:TF 1 "nonmemory_operand" ""))] "reload_completed && aarch64_split_128bit_move_p (operands[0], operands[1])" [(const_int 0)] { @@ -1574,8 +1576,8 @@ XEXP (operands[1], 0), GET_MODE_SIZE (<SX:MODE>mode)))" "@ - ldp\\t%w0, %w2, %1 - ldp\\t%s0, %s2, %1" + ldp\\t%w0, %w2, %z1 + ldp\\t%s0, %s2, %z1" [(set_attr "type" "load_8,neon_load1_2reg") (set_attr "arch" "*,fp")] ) @@ -1591,8 +1593,8 @@ XEXP (operands[1], 0), GET_MODE_SIZE (<DX:MODE>mode)))" "@ - ldp\\t%x0, %x2, %1 - ldp\\t%d0, %d2, %1" + ldp\\t%x0, %x2, %z1 + ldp\\t%d0, %d2, %z1" [(set_attr "type" "load_16,neon_load1_2reg") (set_attr "arch" "*,fp")] ) @@ -1607,7 +1609,7 @@ plus_constant (Pmode, XEXP (operands[1], 0), GET_MODE_SIZE (TFmode)))" - "ldp\\t%q0, %q2, %1" + "ldp\\t%q0, %q2, %z1" [(set_attr "type" "neon_ldp_q") (set_attr "fp" "yes")] ) @@ -1624,8 +1626,8 @@ XEXP (operands[0], 0), GET_MODE_SIZE (<SX:MODE>mode)))" "@ - stp\\t%w1, %w3, %0 - stp\\t%s1, %s3, %0" + stp\\t%w1, %w3, %z0 + stp\\t%s1, %s3, %z0" [(set_attr "type" "store_8,neon_store1_2reg") (set_attr "arch" "*,fp")] ) @@ -1641,8 +1643,8 @@ XEXP (operands[0], 0), GET_MODE_SIZE (<DX:MODE>mode)))" "@ - stp\\t%x1, %x3, %0 - stp\\t%d1, %d3, %0" + stp\\t%x1, %x3, %z0 + stp\\t%d1, %d3, %z0" [(set_attr "type" "store_16,neon_store1_2reg") (set_attr "arch" "*,fp")] ) @@ 
-1657,7 +1659,7 @@ plus_constant (Pmode, XEXP (operands[0], 0), GET_MODE_SIZE (TFmode)))" - "stp\\t%q1, %q3, %0" + "stp\\t%q1, %q3, %z0" [(set_attr "type" "neon_stp_q") (set_attr "fp" "yes")] ) @@ -1790,7 +1792,7 @@ plus_constant (Pmode, XEXP (operands[1], 0), GET_MODE_SIZE (SImode)))" - "ldpsw\\t%0, %2, %1" + "ldpsw\\t%0, %2, %z1" [(set_attr "type" "load_8")] ) @@ -1819,8 +1821,8 @@ XEXP (operands[1], 0), GET_MODE_SIZE (SImode)))" "@ - ldp\t%w0, %w2, %1 - ldp\t%s0, %s2, %1" + ldp\t%w0, %w2, %z1 + ldp\t%s0, %s2, %z1" [(set_attr "type" "load_8,neon_load1_2reg") (set_attr "arch" "*,fp")] ) @@ -2341,38 +2343,6 @@ [(set_attr "type" "alus_shift_imm")] ) -(define_insn "*adds_mul_imm_<mode>" - [(set (reg:CC_NZ CC_REGNUM) - (compare:CC_NZ - (plus:GPI (mult:GPI - (match_operand:GPI 1 "register_operand" "r") - (match_operand:QI 2 "aarch64_pwr_2_<mode>" "n")) - (match_operand:GPI 3 "register_operand" "r")) - (const_int 0))) - (set (match_operand:GPI 0 "register_operand" "=r") - (plus:GPI (mult:GPI (match_dup 1) (match_dup 2)) - (match_dup 3)))] - "" - "adds\\t%<w>0, %<w>3, %<w>1, lsl %p2" - [(set_attr "type" "alus_shift_imm")] -) - -(define_insn "*subs_mul_imm_<mode>" - [(set (reg:CC_NZ CC_REGNUM) - (compare:CC_NZ - (minus:GPI (match_operand:GPI 1 "register_operand" "r") - (mult:GPI - (match_operand:GPI 2 "register_operand" "r") - (match_operand:QI 3 "aarch64_pwr_2_<mode>" "n"))) - (const_int 0))) - (set (match_operand:GPI 0 "register_operand" "=r") - (minus:GPI (match_dup 1) - (mult:GPI (match_dup 2) (match_dup 3))))] - "" - "subs\\t%<w>0, %<w>1, %<w>2, lsl %p3" - [(set_attr "type" "alus_shift_imm")] -) - (define_insn "*adds_<optab><ALLX:mode>_<GPI:mode>" [(set (reg:CC_NZ CC_REGNUM) (compare:CC_NZ @@ -2383,7 +2353,7 @@ (set (match_operand:GPI 0 "register_operand" "=r") (plus:GPI (ANY_EXTEND:GPI (match_dup 1)) (match_dup 2)))] "" - "adds\\t%<GPI:w>0, %<GPI:w>2, %<GPI:w>1, <su>xt<ALLX:size>" + "adds\\t%<GPI:w>0, %<GPI:w>2, %w1, <su>xt<ALLX:size>" [(set_attr "type" "alus_ext")] ) @@ -2397,7 +2367,7 @@ (set (match_operand:GPI 0 "register_operand" "=r") (minus:GPI (match_dup 1) (ANY_EXTEND:GPI (match_dup 2))))] "" - "subs\\t%<GPI:w>0, %<GPI:w>1, %<GPI:w>2, <su>xt<ALLX:size>" + "subs\\t%<GPI:w>0, %<GPI:w>1, %w2, <su>xt<ALLX:size>" [(set_attr "type" "alus_ext")] ) @@ -2415,7 +2385,7 @@ (match_dup 2)) (match_dup 3)))] "" - "adds\\t%<GPI:w>0, %<GPI:w>3, %<GPI:w>1, <su>xt<ALLX:size> %2" + "adds\\t%<GPI:w>0, %<GPI:w>3, %w1, <su>xt<ALLX:size> %2" [(set_attr "type" "alus_ext")] ) @@ -2433,47 +2403,7 @@ (ashift:GPI (ANY_EXTEND:GPI (match_dup 2)) (match_dup 3))))] "" - "subs\\t%<GPI:w>0, %<GPI:w>1, %<GPI:w>2, <su>xt<ALLX:size> %3" - [(set_attr "type" "alus_ext")] -) - -(define_insn "*adds_<optab><mode>_multp2" - [(set (reg:CC_NZ CC_REGNUM) - (compare:CC_NZ - (plus:GPI (ANY_EXTRACT:GPI - (mult:GPI (match_operand:GPI 1 "register_operand" "r") - (match_operand 2 "aarch64_pwr_imm3" "Up3")) - (match_operand 3 "const_int_operand" "n") - (const_int 0)) - (match_operand:GPI 4 "register_operand" "rk")) - (const_int 0))) - (set (match_operand:GPI 0 "register_operand" "=r") - (plus:GPI (ANY_EXTRACT:GPI (mult:GPI (match_dup 1) (match_dup 2)) - (match_dup 3) - (const_int 0)) - (match_dup 4)))] - "aarch64_is_extend_from_extract (<MODE>mode, operands[2], operands[3])" - "adds\\t%<w>0, %<w>4, %<w>1, <su>xt%e3 %p2" - [(set_attr "type" "alus_ext")] -) - -(define_insn "*subs_<optab><mode>_multp2" - [(set (reg:CC_NZ CC_REGNUM) - (compare:CC_NZ - (minus:GPI (match_operand:GPI 4 "register_operand" "rk") - (ANY_EXTRACT:GPI - (mult:GPI 
(match_operand:GPI 1 "register_operand" "r") - (match_operand 2 "aarch64_pwr_imm3" "Up3")) - (match_operand 3 "const_int_operand" "n") - (const_int 0))) - (const_int 0))) - (set (match_operand:GPI 0 "register_operand" "=r") - (minus:GPI (match_dup 4) (ANY_EXTRACT:GPI - (mult:GPI (match_dup 1) (match_dup 2)) - (match_dup 3) - (const_int 0))))] - "aarch64_is_extend_from_extract (<MODE>mode, operands[2], operands[3])" - "subs\\t%<w>0, %<w>4, %<w>1, <su>xt%e3 %p2" + "subs\\t%<GPI:w>0, %<GPI:w>1, %w2, <su>xt<ALLX:size> %3" [(set_attr "type" "alus_ext")] ) @@ -2534,22 +2464,12 @@ [(set_attr "type" "alu_shift_imm")] ) -(define_insn "*add_mul_imm_<mode>" - [(set (match_operand:GPI 0 "register_operand" "=r") - (plus:GPI (mult:GPI (match_operand:GPI 1 "register_operand" "r") - (match_operand:QI 2 "aarch64_pwr_2_<mode>" "n")) - (match_operand:GPI 3 "register_operand" "r")))] - "" - "add\\t%<w>0, %<w>3, %<w>1, lsl %p2" - [(set_attr "type" "alu_shift_imm")] -) - (define_insn "*add_<optab><ALLX:mode>_<GPI:mode>" [(set (match_operand:GPI 0 "register_operand" "=rk") (plus:GPI (ANY_EXTEND:GPI (match_operand:ALLX 1 "register_operand" "r")) (match_operand:GPI 2 "register_operand" "r")))] "" - "add\\t%<GPI:w>0, %<GPI:w>2, %<GPI:w>1, <su>xt<ALLX:size>" + "add\\t%<GPI:w>0, %<GPI:w>2, %w1, <su>xt<ALLX:size>" [(set_attr "type" "alu_ext")] ) @@ -2571,7 +2491,7 @@ (match_operand 2 "aarch64_imm3" "Ui3")) (match_operand:GPI 3 "register_operand" "r")))] "" - "add\\t%<GPI:w>0, %<GPI:w>3, %<GPI:w>1, <su>xt<ALLX:size> %2" + "add\\t%<GPI:w>0, %<GPI:w>3, %w1, <su>xt<ALLX:size> %2" [(set_attr "type" "alu_ext")] ) @@ -2588,57 +2508,6 @@ [(set_attr "type" "alu_ext")] ) -(define_insn "*add_<optab><ALLX:mode>_mult_<GPI:mode>" - [(set (match_operand:GPI 0 "register_operand" "=rk") - (plus:GPI (mult:GPI (ANY_EXTEND:GPI - (match_operand:ALLX 1 "register_operand" "r")) - (match_operand 2 "aarch64_pwr_imm3" "Up3")) - (match_operand:GPI 3 "register_operand" "r")))] - "" - "add\\t%<GPI:w>0, %<GPI:w>3, %<GPI:w>1, <su>xt<ALLX:size> %p2" - [(set_attr "type" "alu_ext")] -) - -;; zero_extend version of above -(define_insn "*add_<optab><SHORT:mode>_mult_si_uxtw" - [(set (match_operand:DI 0 "register_operand" "=rk") - (zero_extend:DI (plus:SI (mult:SI (ANY_EXTEND:SI - (match_operand:SHORT 1 "register_operand" "r")) - (match_operand 2 "aarch64_pwr_imm3" "Up3")) - (match_operand:SI 3 "register_operand" "r"))))] - "" - "add\\t%w0, %w3, %w1, <su>xt<SHORT:size> %p2" - [(set_attr "type" "alu_ext")] -) - -(define_insn "*add_<optab><mode>_multp2" - [(set (match_operand:GPI 0 "register_operand" "=rk") - (plus:GPI (ANY_EXTRACT:GPI - (mult:GPI (match_operand:GPI 1 "register_operand" "r") - (match_operand 2 "aarch64_pwr_imm3" "Up3")) - (match_operand 3 "const_int_operand" "n") - (const_int 0)) - (match_operand:GPI 4 "register_operand" "r")))] - "aarch64_is_extend_from_extract (<MODE>mode, operands[2], operands[3])" - "add\\t%<w>0, %<w>4, %<w>1, <su>xt%e3 %p2" - [(set_attr "type" "alu_ext")] -) - -;; zero_extend version of above -(define_insn "*add_<optab>si_multp2_uxtw" - [(set (match_operand:DI 0 "register_operand" "=rk") - (zero_extend:DI - (plus:SI (ANY_EXTRACT:SI - (mult:SI (match_operand:SI 1 "register_operand" "r") - (match_operand 2 "aarch64_pwr_imm3" "Up3")) - (match_operand 3 "const_int_operand" "n") - (const_int 0)) - (match_operand:SI 4 "register_operand" "r"))))] - "aarch64_is_extend_from_extract (SImode, operands[2], operands[3])" - "add\\t%w0, %w4, %w1, <su>xt%e3 %p2" - [(set_attr "type" "alu_ext")] -) - (define_expand "add<mode>3_carryin" 
[(set (match_operand:GPI 0 "register_operand") (plus:GPI @@ -2819,7 +2688,7 @@ "* operands[3] = GEN_INT (aarch64_uxt_size (INTVAL(operands[2]), INTVAL (operands[3]))); - return \"add\t%<w>0, %<w>4, %<w>1, uxt%e3 %2\";" + return \"add\t%<w>0, %<w>4, %w1, uxt%e3 %2\";" [(set_attr "type" "alu_ext")] ) @@ -2840,38 +2709,6 @@ [(set_attr "type" "alu_ext")] ) -(define_insn "*add_uxt<mode>_multp2" - [(set (match_operand:GPI 0 "register_operand" "=rk") - (plus:GPI (and:GPI - (mult:GPI (match_operand:GPI 1 "register_operand" "r") - (match_operand 2 "aarch64_pwr_imm3" "Up3")) - (match_operand 3 "const_int_operand" "n")) - (match_operand:GPI 4 "register_operand" "r")))] - "aarch64_uxt_size (exact_log2 (INTVAL (operands[2])), INTVAL (operands[3])) != 0" - "* - operands[3] = GEN_INT (aarch64_uxt_size (exact_log2 (INTVAL (operands[2])), - INTVAL (operands[3]))); - return \"add\t%<w>0, %<w>4, %<w>1, uxt%e3 %p2\";" - [(set_attr "type" "alu_ext")] -) - -;; zero_extend version of above -(define_insn "*add_uxtsi_multp2_uxtw" - [(set (match_operand:DI 0 "register_operand" "=rk") - (zero_extend:DI - (plus:SI (and:SI - (mult:SI (match_operand:SI 1 "register_operand" "r") - (match_operand 2 "aarch64_pwr_imm3" "Up3")) - (match_operand 3 "const_int_operand" "n")) - (match_operand:SI 4 "register_operand" "r"))))] - "aarch64_uxt_size (exact_log2 (INTVAL (operands[2])), INTVAL (operands[3])) != 0" - "* - operands[3] = GEN_INT (aarch64_uxt_size (exact_log2 (INTVAL (operands[2])), - INTVAL (operands[3]))); - return \"add\t%w0, %w4, %w1, uxt%e3 %p2\";" - [(set_attr "type" "alu_ext")] -) - (define_insn "subsi3" [(set (match_operand:SI 0 "register_operand" "=rk") (minus:SI (match_operand:SI 1 "register_operand" "rk") @@ -3275,37 +3112,13 @@ [(set_attr "type" "alu_shift_imm")] ) -(define_insn "*sub_mul_imm_<mode>" - [(set (match_operand:GPI 0 "register_operand" "=r") - (minus:GPI (match_operand:GPI 3 "register_operand" "r") - (mult:GPI - (match_operand:GPI 1 "register_operand" "r") - (match_operand:QI 2 "aarch64_pwr_2_<mode>" "n"))))] - "" - "sub\\t%<w>0, %<w>3, %<w>1, lsl %p2" - [(set_attr "type" "alu_shift_imm")] -) - -;; zero_extend version of above -(define_insn "*sub_mul_imm_si_uxtw" - [(set (match_operand:DI 0 "register_operand" "=r") - (zero_extend:DI - (minus:SI (match_operand:SI 3 "register_operand" "r") - (mult:SI - (match_operand:SI 1 "register_operand" "r") - (match_operand:QI 2 "aarch64_pwr_2_si" "n")))))] - "" - "sub\\t%w0, %w3, %w1, lsl %p2" - [(set_attr "type" "alu_shift_imm")] -) - (define_insn "*sub_<optab><ALLX:mode>_<GPI:mode>" [(set (match_operand:GPI 0 "register_operand" "=rk") (minus:GPI (match_operand:GPI 1 "register_operand" "rk") (ANY_EXTEND:GPI (match_operand:ALLX 2 "register_operand" "r"))))] "" - "sub\\t%<GPI:w>0, %<GPI:w>1, %<GPI:w>2, <su>xt<ALLX:size>" + "sub\\t%<GPI:w>0, %<GPI:w>1, %w2, <su>xt<ALLX:size>" [(set_attr "type" "alu_ext")] ) @@ -3328,7 +3141,7 @@ (match_operand:ALLX 2 "register_operand" "r")) (match_operand 3 "aarch64_imm3" "Ui3"))))] "" - "sub\\t%<GPI:w>0, %<GPI:w>1, %<GPI:w>2, <su>xt<ALLX:size> %3" + "sub\\t%<GPI:w>0, %<GPI:w>1, %w2, <su>xt<ALLX:size> %3" [(set_attr "type" "alu_ext")] ) @@ -3345,34 +3158,6 @@ [(set_attr "type" "alu_ext")] ) -(define_insn "*sub_<optab><mode>_multp2" - [(set (match_operand:GPI 0 "register_operand" "=rk") - (minus:GPI (match_operand:GPI 4 "register_operand" "rk") - (ANY_EXTRACT:GPI - (mult:GPI (match_operand:GPI 1 "register_operand" "r") - (match_operand 2 "aarch64_pwr_imm3" "Up3")) - (match_operand 3 "const_int_operand" "n") - (const_int 0))))] - 
"aarch64_is_extend_from_extract (<MODE>mode, operands[2], operands[3])" - "sub\\t%<w>0, %<w>4, %<w>1, <su>xt%e3 %p2" - [(set_attr "type" "alu_ext")] -) - -;; zero_extend version of above -(define_insn "*sub_<optab>si_multp2_uxtw" - [(set (match_operand:DI 0 "register_operand" "=rk") - (zero_extend:DI - (minus:SI (match_operand:SI 4 "register_operand" "rk") - (ANY_EXTRACT:SI - (mult:SI (match_operand:SI 1 "register_operand" "r") - (match_operand 2 "aarch64_pwr_imm3" "Up3")) - (match_operand 3 "const_int_operand" "n") - (const_int 0)))))] - "aarch64_is_extend_from_extract (SImode, operands[2], operands[3])" - "sub\\t%w0, %w4, %w1, <su>xt%e3 %p2" - [(set_attr "type" "alu_ext")] -) - ;; The hardware description is op1 + ~op2 + C. ;; = op1 + (-op2 + 1) + (1 - !C) ;; = op1 - op2 - 1 + 1 - !C @@ -3607,7 +3392,7 @@ "* operands[3] = GEN_INT (aarch64_uxt_size (INTVAL (operands[2]), INTVAL (operands[3]))); - return \"sub\t%<w>0, %<w>4, %<w>1, uxt%e3 %2\";" + return \"sub\t%<w>0, %<w>4, %w1, uxt%e3 %2\";" [(set_attr "type" "alu_ext")] ) @@ -3628,38 +3413,6 @@ [(set_attr "type" "alu_ext")] ) -(define_insn "*sub_uxt<mode>_multp2" - [(set (match_operand:GPI 0 "register_operand" "=rk") - (minus:GPI (match_operand:GPI 4 "register_operand" "rk") - (and:GPI - (mult:GPI (match_operand:GPI 1 "register_operand" "r") - (match_operand 2 "aarch64_pwr_imm3" "Up3")) - (match_operand 3 "const_int_operand" "n"))))] - "aarch64_uxt_size (exact_log2 (INTVAL (operands[2])),INTVAL (operands[3])) != 0" - "* - operands[3] = GEN_INT (aarch64_uxt_size (exact_log2 (INTVAL (operands[2])), - INTVAL (operands[3]))); - return \"sub\t%<w>0, %<w>4, %<w>1, uxt%e3 %p2\";" - [(set_attr "type" "alu_ext")] -) - -;; zero_extend version of above -(define_insn "*sub_uxtsi_multp2_uxtw" - [(set (match_operand:DI 0 "register_operand" "=rk") - (zero_extend:DI - (minus:SI (match_operand:SI 4 "register_operand" "rk") - (and:SI - (mult:SI (match_operand:SI 1 "register_operand" "r") - (match_operand 2 "aarch64_pwr_imm3" "Up3")) - (match_operand 3 "const_int_operand" "n")))))] - "aarch64_uxt_size (exact_log2 (INTVAL (operands[2])),INTVAL (operands[3])) != 0" - "* - operands[3] = GEN_INT (aarch64_uxt_size (exact_log2 (INTVAL (operands[2])), - INTVAL (operands[3]))); - return \"sub\t%w0, %w4, %w1, uxt%e3 %p2\";" - [(set_attr "type" "alu_ext")] -) - (define_expand "abs<mode>2" [(match_operand:GPI 0 "register_operand") (match_operand:GPI 1 "register_operand")] @@ -3772,28 +3525,6 @@ [(set_attr "type" "alu_shift_imm")] ) -(define_insn "*neg_mul_imm_<mode>2" - [(set (match_operand:GPI 0 "register_operand" "=r") - (neg:GPI (mult:GPI - (match_operand:GPI 1 "register_operand" "r") - (match_operand:QI 2 "aarch64_pwr_2_<mode>" "n"))))] - "" - "neg\\t%<w>0, %<w>1, lsl %p2" - [(set_attr "type" "alu_shift_imm")] -) - -;; zero_extend version of above -(define_insn "*neg_mul_imm_si2_uxtw" - [(set (match_operand:DI 0 "register_operand" "=r") - (zero_extend:DI - (neg:SI (mult:SI - (match_operand:SI 1 "register_operand" "r") - (match_operand:QI 2 "aarch64_pwr_2_si" "n")))))] - "" - "neg\\t%w0, %w1, lsl %p2" - [(set_attr "type" "alu_shift_imm")] -) - (define_insn "mul<mode>3" [(set (match_operand:GPI 0 "register_operand" "=r") (mult:GPI (match_operand:GPI 1 "register_operand" "r") @@ -4054,7 +3785,7 @@ (match_operand:ALLX 0 "register_operand" "r")) (match_operand:GPI 1 "register_operand" "r")))] "" - "cmp\\t%<GPI:w>1, %<GPI:w>0, <su>xt<ALLX:size>" + "cmp\\t%<GPI:w>1, %w0, <su>xt<ALLX:size>" [(set_attr "type" "alus_ext")] ) @@ -4066,7 +3797,7 @@ (match_operand 1 
"aarch64_imm3" "Ui3")) (match_operand:GPI 2 "register_operand" "r")))] "" - "cmp\\t%<GPI:w>2, %<GPI:w>0, <su>xt<ALLX:size> %1" + "cmp\\t%<GPI:w>2, %w0, <su>xt<ALLX:size> %1" [(set_attr "type" "alus_ext")] ) @@ -7059,7 +6790,8 @@ (define_insn "aarch64_fjcvtzs" [(set (match_operand:SI 0 "register_operand" "=r") (unspec:SI [(match_operand:DF 1 "register_operand" "w")] - UNSPEC_FJCVTZS))] + UNSPEC_FJCVTZS)) + (clobber (reg:CC CC_REGNUM))] "TARGET_JSCVT" "fjcvtzs\\t%w0, %d1" [(set_attr "type" "f_cvtf2i")] @@ -7151,43 +6883,37 @@ DONE; }) -;; Named patterns for stack smashing protection. +;; Defined for -mstack-protector-guard=sysreg, which goes through this +;; pattern rather than stack_protect_combined_set. Our implementation +;; of the latter can handle both. (define_expand "stack_protect_set" [(match_operand 0 "memory_operand") - (match_operand 1 "memory_operand")] + (match_operand 1 "")] "" { - machine_mode mode = GET_MODE (operands[0]); - if (aarch64_stack_protector_guard != SSP_GLOBAL) - { - /* Generate access through the system register. */ - rtx tmp_reg = gen_reg_rtx (mode); - if (mode == DImode) - { - emit_insn (gen_reg_stack_protect_address_di (tmp_reg)); - emit_insn (gen_adddi3 (tmp_reg, tmp_reg, - GEN_INT (aarch64_stack_protector_guard_offset))); - } - else - { - emit_insn (gen_reg_stack_protect_address_si (tmp_reg)); - emit_insn (gen_addsi3 (tmp_reg, tmp_reg, - GEN_INT (aarch64_stack_protector_guard_offset))); + emit_insn (gen_stack_protect_combined_set (operands[0], operands[1])); + DONE; +}) - } - operands[1] = gen_rtx_MEM (mode, tmp_reg); - } - +(define_expand "stack_protect_combined_set" + [(match_operand 0 "memory_operand") + (match_operand 1 "")] + "" +{ + machine_mode mode = GET_MODE (operands[0]); + operands[1] = aarch64_stack_protect_canary_mem (mode, operands[1], + AARCH64_SALT_SSP_SET); emit_insn ((mode == DImode ? gen_stack_protect_set_di : gen_stack_protect_set_si) (operands[0], operands[1])); DONE; }) +;; Operand 1 is either AARCH64_SALT_SSP_SET or AARCH64_SALT_SSP_TEST. (define_insn "reg_stack_protect_address_<mode>" [(set (match_operand:PTR 0 "register_operand" "=r") - (unspec:PTR [(const_int 0)] - UNSPEC_SSP_SYSREG))] + (unspec:PTR [(match_operand 1 "const_int_operand")] + UNSPEC_SSP_SYSREG))] "aarch64_stack_protector_guard != SSP_GLOBAL" { char buf[150]; @@ -7210,63 +6936,51 @@ [(set_attr "length" "12") (set_attr "type" "multiple")]) +;; Defined for -mstack-protector-guard=sysreg, which goes through this +;; pattern rather than stack_protect_combined_test. Our implementation +;; of the latter can handle both. (define_expand "stack_protect_test" [(match_operand 0 "memory_operand") - (match_operand 1 "memory_operand") + (match_operand 1 "") (match_operand 2)] "" { - rtx result; - machine_mode mode = GET_MODE (operands[0]); - - result = gen_reg_rtx(mode); - if (aarch64_stack_protector_guard != SSP_GLOBAL) - { - /* Generate access through the system register. The - sequence we want here is the access - of the stack offset to come with - mrs scratch_reg, <system_register> - add scratch_reg, scratch_reg, :lo12:offset. 
*/ - rtx tmp_reg = gen_reg_rtx (mode); - if (mode == DImode) - { - emit_insn (gen_reg_stack_protect_address_di (tmp_reg)); - emit_insn (gen_adddi3 (tmp_reg, tmp_reg, - GEN_INT (aarch64_stack_protector_guard_offset))); - } - else - { - emit_insn (gen_reg_stack_protect_address_si (tmp_reg)); - emit_insn (gen_addsi3 (tmp_reg, tmp_reg, - GEN_INT (aarch64_stack_protector_guard_offset))); + emit_insn (gen_stack_protect_combined_test (operands[0], operands[1], + operands[2])); + DONE; +}) - } - operands[1] = gen_rtx_MEM (mode, tmp_reg); - } +(define_expand "stack_protect_combined_test" + [(match_operand 0 "memory_operand") + (match_operand 1 "") + (match_operand 2)] + "" +{ + machine_mode mode = GET_MODE (operands[0]); + operands[1] = aarch64_stack_protect_canary_mem (mode, operands[1], + AARCH64_SALT_SSP_TEST); emit_insn ((mode == DImode - ? gen_stack_protect_test_di - : gen_stack_protect_test_si) (result, - operands[0], - operands[1])); - - if (mode == DImode) - emit_jump_insn (gen_cbranchdi4 (gen_rtx_EQ (VOIDmode, result, const0_rtx), - result, const0_rtx, operands[2])); - else - emit_jump_insn (gen_cbranchsi4 (gen_rtx_EQ (VOIDmode, result, const0_rtx), - result, const0_rtx, operands[2])); + ? gen_stack_protect_test_di + : gen_stack_protect_test_si) (operands[0], operands[1])); + + rtx cc_reg = gen_rtx_REG (CCmode, CC_REGNUM); + emit_jump_insn (gen_condjump (gen_rtx_EQ (VOIDmode, cc_reg, const0_rtx), + cc_reg, operands[2])); DONE; }) +;; DO NOT SPLIT THIS PATTERN. It is important for security reasons that the +;; canary value does not live beyond the end of this sequence. (define_insn "stack_protect_test_<mode>" - [(set (match_operand:PTR 0 "register_operand" "=r") - (unspec:PTR [(match_operand:PTR 1 "memory_operand" "m") - (match_operand:PTR 2 "memory_operand" "m")] - UNSPEC_SP_TEST)) + [(set (reg:CC CC_REGNUM) + (unspec:CC [(match_operand:PTR 0 "memory_operand" "m") + (match_operand:PTR 1 "memory_operand" "m")] + UNSPEC_SP_TEST)) + (clobber (match_scratch:PTR 2 "=&r")) (clobber (match_scratch:PTR 3 "=&r"))] "" - "ldr\t%<w>3, %1\;ldr\t%<w>0, %2\;eor\t%<w>0, %<w>3, %<w>0" - [(set_attr "length" "12") + "ldr\t%<w>2, %0\;ldr\t%<w>3, %1\;subs\t%<w>2, %<w>2, %<w>3\;mov\t%3, 0" + [(set_attr "length" "16") (set_attr "type" "multiple")]) ;; Write into the Floating-point Status or Control Register. diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 50f8b23..85c0d62 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -6088,6 +6088,20 @@ vreinterpretq_u32_p128 (poly128_t __a) return (uint32x4_t)__a; } +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_f64_p128 (poly128_t __a) +{ + return (float64x2_t) __a; +} + +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p128_f64 (float64x2_t __a) +{ + return (poly128_t) __a; +} + /* vset_lane */ __extension__ extern __inline float16x4_t @@ -12670,6 +12684,13 @@ vceqq_u64 (uint64x2_t __a, uint64x2_t __b) return (__a == __b); } +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vceqq_p64 (poly64x2_t __a, poly64x2_t __b) +{ + return (__a == __b); +} + /* vceq - scalar. 
*/ __extension__ extern __inline uint32_t @@ -12779,6 +12800,13 @@ vceqz_u64 (uint64x1_t __a) return (__a == __AARCH64_UINT64_C (0)); } +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vceqz_p64 (poly64x1_t __a) +{ + return (__a == __AARCH64_UINT64_C (0)); +} + __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vceqzq_f32 (float32x4_t __a) @@ -12856,6 +12884,13 @@ vceqzq_u64 (uint64x2_t __a) return (__a == __AARCH64_UINT64_C (0)); } +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vceqzq_p64 (poly64x2_t __a) +{ + return (__a == __AARCH64_UINT64_C (0)); +} + /* vceqz - scalar. */ __extension__ extern __inline uint32_t @@ -14054,6 +14089,48 @@ vclsq_s32 (int32x4_t __a) return __builtin_aarch64_clrsbv4si (__a); } +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcls_u8 (uint8x8_t __a) +{ + return __builtin_aarch64_clrsbv8qi ((int8x8_t) __a); +} + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcls_u16 (uint16x4_t __a) +{ + return __builtin_aarch64_clrsbv4hi ((int16x4_t) __a); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcls_u32 (uint32x2_t __a) +{ + return __builtin_aarch64_clrsbv2si ((int32x2_t) __a); +} + +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclsq_u8 (uint8x16_t __a) +{ + return __builtin_aarch64_clrsbv16qi ((int8x16_t) __a); +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclsq_u16 (uint16x8_t __a) +{ + return __builtin_aarch64_clrsbv8hi ((int16x8_t) __a); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclsq_u32 (uint32x4_t __a) +{ + return __builtin_aarch64_clrsbv4si ((int32x4_t) __a); +} + /* vclz. 
*/ __extension__ extern __inline int8x8_t @@ -15538,7 +15615,7 @@ vdupq_n_f64 (float64_t __a) __extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_n_p8 (uint32_t __a) +vdupq_n_p8 (poly8_t __a) { return (poly8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a}; @@ -15546,21 +15623,21 @@ vdupq_n_p8 (uint32_t __a) __extension__ extern __inline poly16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_n_p16 (uint32_t __a) +vdupq_n_p16 (poly16_t __a) { return (poly16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } __extension__ extern __inline poly64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_n_p64 (uint64_t __a) +vdupq_n_p64 (poly64_t __a) { return (poly64x2_t) {__a, __a}; } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_n_s8 (int32_t __a) +vdupq_n_s8 (int8_t __a) { return (int8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a}; @@ -15568,7 +15645,7 @@ vdupq_n_s8 (int32_t __a) __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_n_s16 (int32_t __a) +vdupq_n_s16 (int16_t __a) { return (int16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } @@ -15589,7 +15666,7 @@ vdupq_n_s64 (int64_t __a) __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_n_u8 (uint32_t __a) +vdupq_n_u8 (uint8_t __a) { return (uint8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a}; @@ -15597,7 +15674,7 @@ vdupq_n_u8 (uint32_t __a) __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_n_u16 (uint32_t __a) +vdupq_n_u16 (uint16_t __a) { return (uint16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } @@ -19613,6 +19690,13 @@ vld4q_p64 (const poly64_t * __a) return ret; } +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vldrq_p128 (const poly128_t * __ptr) +{ + return *__ptr; +} + /* vldn_dup */ __extension__ extern __inline int8x8x2_t @@ -23962,42 +24046,42 @@ __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqmovun_s16 (int16x8_t __a) { - return (uint8x8_t) __builtin_aarch64_sqmovunv8hi (__a); + return __builtin_aarch64_sqmovunv8hi_us (__a); } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqmovun_s32 (int32x4_t __a) { - return (uint16x4_t) __builtin_aarch64_sqmovunv4si (__a); + return __builtin_aarch64_sqmovunv4si_us (__a); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqmovun_s64 (int64x2_t __a) { - return (uint32x2_t) __builtin_aarch64_sqmovunv2di (__a); + return __builtin_aarch64_sqmovunv2di_us (__a); } -__extension__ extern __inline int8_t +__extension__ extern __inline uint8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqmovunh_s16 (int16_t __a) { - return (int8_t) __builtin_aarch64_sqmovunhi (__a); + return __builtin_aarch64_sqmovunhi_us (__a); } -__extension__ extern __inline int16_t +__extension__ extern __inline uint16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqmovuns_s32 (int32_t __a) { - return (int16_t) __builtin_aarch64_sqmovunsi 
(__a); + return __builtin_aarch64_sqmovunsi_us (__a); } -__extension__ extern __inline int32_t +__extension__ extern __inline uint32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqmovund_s64 (int64_t __a) { - return (int32_t) __builtin_aarch64_sqmovundi (__a); + return __builtin_aarch64_sqmovundi_us (__a); } /* vqneg */ @@ -24253,28 +24337,28 @@ vqrshld_s64 (int64_t __a, int64_t __b) __extension__ extern __inline uint8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshlb_u8 (uint8_t __a, uint8_t __b) +vqrshlb_u8 (uint8_t __a, int8_t __b) { return __builtin_aarch64_uqrshlqi_uus (__a, __b); } __extension__ extern __inline uint16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshlh_u16 (uint16_t __a, uint16_t __b) +vqrshlh_u16 (uint16_t __a, int16_t __b) { return __builtin_aarch64_uqrshlhi_uus (__a, __b); } __extension__ extern __inline uint32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshls_u32 (uint32_t __a, uint32_t __b) +vqrshls_u32 (uint32_t __a, int32_t __b) { return __builtin_aarch64_uqrshlsi_uus (__a, __b); } __extension__ extern __inline uint64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshld_u64 (uint64_t __a, uint64_t __b) +vqrshld_u64 (uint64_t __a, int64_t __b) { return __builtin_aarch64_uqrshldi_uus (__a, __b); } @@ -24553,28 +24637,28 @@ vqshld_s64 (int64_t __a, int64_t __b) __extension__ extern __inline uint8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshlb_u8 (uint8_t __a, uint8_t __b) +vqshlb_u8 (uint8_t __a, int8_t __b) { return __builtin_aarch64_uqshlqi_uus (__a, __b); } __extension__ extern __inline uint16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshlh_u16 (uint16_t __a, uint16_t __b) +vqshlh_u16 (uint16_t __a, int16_t __b) { return __builtin_aarch64_uqshlhi_uus (__a, __b); } __extension__ extern __inline uint32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshls_u32 (uint32_t __a, uint32_t __b) +vqshls_u32 (uint32_t __a, int32_t __b) { return __builtin_aarch64_uqshlsi_uus (__a, __b); } __extension__ extern __inline uint64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshld_u64 (uint64_t __a, uint64_t __b) +vqshld_u64 (uint64_t __a, int64_t __b) { return __builtin_aarch64_uqshldi_uus (__a, __b); } @@ -26003,6 +26087,13 @@ vrndmq_f64 (float64x2_t __a) /* vrndn */ +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrndns_f32 (float32_t __a) +{ + return __builtin_aarch64_frintnsf (__a); +} + __extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrndn_f32 (float32x2_t __a) @@ -26908,7 +26999,7 @@ vshld_s64 (int64_t __a, int64_t __b) __extension__ extern __inline uint64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshld_u64 (uint64_t __a, uint64_t __b) +vshld_u64 (uint64_t __a, int64_t __b) { return __builtin_aarch64_ushldi_uus (__a, __b); } @@ -30104,6 +30195,13 @@ vst4q_p64 (poly64_t * __a, poly64x2x4_t __val) __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o); } +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vstrq_p128 (poly128_t * __ptr, poly128_t __val) +{ + *__ptr = __val; +} + /* vsub */ __extension__ extern __inline int64_t @@ -30491,6 +30589,17 @@ vtrn1q_u32 (uint32x4_t __a, uint32x4_t __b) #endif } 
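As an illustrative aside (not part of the patch): the poly64/poly128 intrinsics introduced in this arm_neon.h change — vldrq_p128, vstrq_p128, vceqq_p64 and the poly64x2_t permutes that follow — could be exercised with a small sketch like the one below. The function name is made up, and it assumes a toolchain that already carries these additions on a target where the poly64/poly128 types are available.

#include <arm_neon.h>

/* Copy a 128-bit polynomial value only when both 64-bit halves of VAL
   match EXPECTED.  Sketch only; not part of the patch.  */
void
copy_if_equal (poly128_t *dst, const poly128_t *src,
               poly64x2_t val, poly64x2_t expected)
{
  /* vceqq_p64 (added earlier in this file) sets a lane to all-ones
     where the corresponding poly64 lanes compare equal.  */
  uint64x2_t eq = vceqq_p64 (val, expected);

  if (vgetq_lane_u64 (eq, 0) && vgetq_lane_u64 (eq, 1))
    /* vldrq_p128/vstrq_p128 are the new 128-bit polynomial
       load/store intrinsics.  */
    vstrq_p128 (dst, vldrq_p128 (src));
}
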
+__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn1q_p64 (poly64x2_t __a, poly64x2_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (poly64x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (poly64x2_t) {0, 2}); +#endif +} + __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtrn1q_u64 (uint64x2_t __a, uint64x2_t __b) @@ -30761,6 +30870,18 @@ vtrn2q_u64 (uint64x2_t __a, uint64x2_t __b) #endif } + +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn2q_p64 (poly64x2_t __a, poly64x2_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (poly64x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (poly64x2_t) {1, 3}); +#endif +} + __extension__ extern __inline float16x4x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtrn_f16 (float16x4_t __a, float16x4_t __b) @@ -31407,6 +31528,17 @@ vuzp1q_u64 (uint64x2_t __a, uint64x2_t __b) #endif } +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp1q_p64 (poly64x2_t __a, poly64x2_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (poly64x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (poly64x2_t) {0, 2}); +#endif +} + __extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vuzp2_f16 (float16x4_t __a, float16x4_t __b) @@ -31666,6 +31798,17 @@ vuzp2q_u64 (uint64x2_t __a, uint64x2_t __b) #endif } +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp2q_p64 (poly64x2_t __a, poly64x2_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (poly64x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (poly64x2_t) {1, 3}); +#endif +} + __INTERLEAVE_LIST (uzp) /* vzip */ @@ -31934,6 +32077,17 @@ vzip1q_u64 (uint64x2_t __a, uint64x2_t __b) #endif } +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip1q_p64 (poly64x2_t __a, poly64x2_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (poly64x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (poly64x2_t) {0, 2}); +#endif +} + __extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vzip2_f16 (float16x4_t __a, float16x4_t __b) @@ -32198,6 +32352,17 @@ vzip2q_u64 (uint64x2_t __a, uint64x2_t __b) #endif } +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip2q_p64 (poly64x2_t __a, poly64x2_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (poly64x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (poly64x2_t) {1, 3}); +#endif +} + __INTERLEAVE_LIST (zip) #undef __INTERLEAVE_LIST @@ -35659,6 +35824,55 @@ vusmmlaq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b) #pragma GCC pop_options +__extension__ extern __inline poly8x8_t +__attribute ((__always_inline__, __gnu_inline__, __artificial__)) +vadd_p8 (poly8x8_t __a, poly8x8_t __b) +{ + return __a ^ __b; +} + +__extension__ extern __inline poly16x4_t +__attribute ((__always_inline__, __gnu_inline__, __artificial__)) +vadd_p16 (poly16x4_t __a, poly16x4_t __b) +{ + return __a ^ __b; +} + +__extension__ extern __inline poly64x1_t +__attribute 
((__always_inline__, __gnu_inline__, __artificial__)) +vadd_p64 (poly64x1_t __a, poly64x1_t __b) +{ + return __a ^ __b; +} + +__extension__ extern __inline poly8x16_t +__attribute ((__always_inline__, __gnu_inline__, __artificial__)) +vaddq_p8 (poly8x16_t __a, poly8x16_t __b) +{ + return __a ^ __b; +} + +__extension__ extern __inline poly16x8_t +__attribute ((__always_inline__, __gnu_inline__, __artificial__)) +vaddq_p16 (poly16x8_t __a, poly16x8_t __b) +{ + return __a ^__b; +} + +__extension__ extern __inline poly64x2_t +__attribute ((__always_inline__, __gnu_inline__, __artificial__)) +vaddq_p64 (poly64x2_t __a, poly64x2_t __b) +{ + return __a ^ __b; +} + +__extension__ extern __inline poly128_t +__attribute ((__always_inline__, __gnu_inline__, __artificial__)) +vaddq_p128 (poly128_t __a, poly128_t __b) +{ + return __a ^ __b; +} + #undef __aarch64_vget_lane_any #undef __aarch64_vdup_lane_any diff --git a/gcc/config/aarch64/driver-aarch64.c b/gcc/config/aarch64/driver-aarch64.c index d1229e6..d68a725 100644 --- a/gcc/config/aarch64/driver-aarch64.c +++ b/gcc/config/aarch64/driver-aarch64.c @@ -21,6 +21,7 @@ #include "config.h" #define INCLUDE_STRING +#define INCLUDE_SET #include "system.h" #include "coretypes.h" #include "tm.h" @@ -116,9 +117,15 @@ valid_bL_core_p (unsigned int *core, unsigned int bL_core) /* Returns the hex integer that is after ':' for the FIELD. Returns -1 is returned if there was problem parsing the integer. */ static unsigned -parse_field (const char *field) +parse_field (const std::string &field) { - const char *rest = strchr (field, ':'); + const char *rest = strchr (field.c_str (), ':'); + + /* The line must be in the format of <name>:<value>, if it's not + then we have a weird format. */ + if (rest == NULL) + return -1; + char *after; unsigned fint = strtol (rest + 1, &after, 16); if (after == rest + 1) @@ -126,6 +133,82 @@ parse_field (const char *field) return fint; } +/* Returns the index of the ':' inside the FIELD which must be found + after the value of KEY. Returns string::npos if line does not contain + a field. */ + +static size_t +find_field (const std::string &field, const std::string &key) +{ + size_t key_pos, sep_pos; + key_pos = field.find (key); + if (key_pos == std::string::npos) + return std::string::npos; + + sep_pos = field.find (":", key_pos + 1); + if (sep_pos == std::string::npos) + return std::string::npos; + + return sep_pos; +} + +/* Splits and returns a string based on whitespace and return it as + part of a set. Empty strings are ignored. */ + +static void +split_words (const std::string &val, std::set<std::string> &result) +{ + size_t cur, prev = 0; + std::string word; + while ((cur = val.find_first_of (" \n", prev)) != std::string::npos) + { + word = val.substr (prev, cur - prev); + /* Skip adding empty words. */ + if (!word.empty ()) + result.insert (word); + prev = cur + 1; + } + + if (prev != cur) + result.insert (val.substr (prev)); +} + +/* Read an entire line from F until '\n' or EOF. */ + +static std::string +readline (FILE *f) +{ + char *buf = NULL; + int size = 0; + int last = 0; + const int buf_size = 128; + + if (feof (f)) + return std::string (); + + do + { + size += buf_size; + buf = (char*) xrealloc (buf, size); + gcc_assert (buf); + /* If fgets fails it returns NULL, but if it reaches EOF + with 0 characters read it also returns EOF. However + the condition on the loop would have broken out of the + loop in that case, and if we are in the first iteration + then the empty string is the correct thing to return. 
*/ + if (!fgets (buf + last, buf_size, f)) + return std::string (); + /* If we're not at the end of the line then override the + \0 added by fgets. */ + last = strnlen (buf, size) - 1; + } + while (!feof (f) && buf[last] != '\n'); + + std::string result (buf); + free (buf); + return result; +} + /* Return true iff ARR contains CORE, in either of the two elements. */ static bool @@ -164,7 +247,6 @@ host_detect_local_cpu (int argc, const char **argv) { const char *res = NULL; static const int num_exts = ARRAY_SIZE (aarch64_extensions); - char buf[128]; FILE *f = NULL; bool arch = false; bool tune = false; @@ -178,6 +260,9 @@ host_detect_local_cpu (int argc, const char **argv) bool processed_exts = false; uint64_t extension_flags = 0; uint64_t default_flags = 0; + std::string buf; + size_t sep_pos = -1; + char *fcpu_info; gcc_assert (argc); @@ -195,16 +280,20 @@ host_detect_local_cpu (int argc, const char **argv) if (!arch && !tune && !cpu) goto not_found; - f = fopen ("/proc/cpuinfo", "r"); + fcpu_info = getenv ("GCC_CPUINFO"); + if (fcpu_info) + f = fopen (fcpu_info, "r"); + else + f = fopen ("/proc/cpuinfo", "r"); if (f == NULL) goto not_found; /* Look through /proc/cpuinfo to determine the implementer and then the part number that identifies a particular core. */ - while (fgets (buf, sizeof (buf), f) != NULL) + while (!(buf = readline (f)).empty ()) { - if (strstr (buf, "implementer") != NULL) + if (find_field (buf, "implementer") != std::string::npos) { unsigned cimp = parse_field (buf); if (cimp == INVALID_IMP) @@ -216,8 +305,7 @@ host_detect_local_cpu (int argc, const char **argv) else if (imp != cimp) goto not_found; } - - if (strstr (buf, "variant") != NULL) + else if (find_field (buf, "variant") != std::string::npos) { unsigned cvariant = parse_field (buf); if (!contains_core_p (variants, cvariant)) @@ -229,8 +317,7 @@ host_detect_local_cpu (int argc, const char **argv) } continue; } - - if (strstr (buf, "part") != NULL) + else if (find_field (buf, "part") != std::string::npos) { unsigned ccore = parse_field (buf); if (!contains_core_p (cores, ccore)) @@ -242,39 +329,36 @@ host_detect_local_cpu (int argc, const char **argv) } continue; } - if (!tune && !processed_exts && strstr (buf, "Features") != NULL) + else if (!tune && !processed_exts + && (sep_pos = find_field (buf, "Features")) != std::string::npos) { + /* First create the list of features in the buffer. */ + std::set<std::string> features; + /* Drop everything till the :. */ + buf = buf.substr (sep_pos + 1); + split_words (buf, features); + for (i = 0; i < num_exts; i++) { - const char *p = aarch64_extensions[i].feat_string; + const std::string val (aarch64_extensions[i].feat_string); /* If the feature contains no HWCAPS string then ignore it for the auto detection. */ - if (*p == '\0') + if (val.empty ()) continue; bool enabled = true; /* This may be a multi-token feature string. We need to match all parts, which could be in any order. */ - size_t len = strlen (buf); - do - { - const char *end = strchr (p, ' '); - if (end == NULL) - end = strchr (p, '\0'); - if (memmem (buf, len, p, end - p) == NULL) - { - /* Failed to match this token. Turn off the - features we'd otherwise enable. */ - enabled = false; - break; - } - if (*end == '\0') - break; - p = end + 1; - } - while (1); + std::set<std::string> tokens; + split_words (val, tokens); + std::set<std::string>::iterator it; + + /* Iterate till the first feature isn't found or all of them + are found. 
*/ + for (it = tokens.begin (); enabled && it != tokens.end (); ++it) + enabled = enabled && features.count (*it); if (enabled) extension_flags |= aarch64_extensions[i].flag; diff --git a/gcc/config/aarch64/geniterators.sh b/gcc/config/aarch64/geniterators.sh index a742096..43feb48 100644 --- a/gcc/config/aarch64/geniterators.sh +++ b/gcc/config/aarch64/geniterators.sh @@ -70,8 +70,8 @@ iterdef { sub(/ *\]/, "", s) n = split(s, a) - printf "#define BUILTIN_" a[1] "(T, N, MAP) \\\n" - printf " VAR" (n-1) " (T, N, MAP" + printf "#define BUILTIN_" a[1] "(T, N, MAP, FLAG) \\\n" + printf " VAR" (n-1) " (T, N, MAP, FLAG" for (i = 2; i <= n; i++) printf ", " tolower(a[i]) printf ")\n" diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 9a51916..054fd85 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -98,6 +98,9 @@ ;; Copy of the above. (define_mode_iterator DREG2 [V8QI V4HI V4HF V2SI V2SF DF]) +;; All modes suitable to store/load pair (2 elements) using STP/LDP. +(define_mode_iterator VP_2E [V2SI V2SF V2DI V2DF]) + ;; Advanced SIMD, 64-bit container, all integer modes. (define_mode_iterator VD_BHSI [V8QI V4HI V2SI]) diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md index 1754b1e..91b5148 100644 --- a/gcc/config/aarch64/predicates.md +++ b/gcc/config/aarch64/predicates.md @@ -235,21 +235,6 @@ (and (match_code "const_int") (match_test "IN_RANGE (UINTVAL (op), 0, 0xffffff)"))) -(define_predicate "aarch64_pwr_imm3" - (and (match_code "const_int") - (match_test "INTVAL (op) != 0 - && (unsigned) exact_log2 (INTVAL (op)) <= 4"))) - -(define_predicate "aarch64_pwr_2_si" - (and (match_code "const_int") - (match_test "INTVAL (op) != 0 - && (unsigned) exact_log2 (INTVAL (op)) < 32"))) - -(define_predicate "aarch64_pwr_2_di" - (and (match_code "const_int") - (match_test "INTVAL (op) != 0 - && (unsigned) exact_log2 (INTVAL (op)) < 64"))) - (define_predicate "aarch64_mem_pair_offset" (and (match_code "const_int") (match_test "aarch64_offset_7bit_signed_scaled_p (mode, INTVAL (op))"))) diff --git a/gcc/config/arm/arm-c.c b/gcc/config/arm/arm-c.c index 7468a20..899b890 100644 --- a/gcc/config/arm/arm-c.c +++ b/gcc/config/arm/arm-c.c @@ -390,7 +390,7 @@ arm_pragma_target_parse (tree args, tree pop_target) if (! args) { cur_tree = ((pop_target) ? pop_target : target_option_default_node); - cl_target_option_restore (&global_options, + cl_target_option_restore (&global_options, &global_options_set, TREE_TARGET_OPTION (cur_tree)); } else @@ -399,7 +399,7 @@ arm_pragma_target_parse (tree args, tree pop_target) &global_options_set); if (cur_tree == NULL_TREE) { - cl_target_option_restore (&global_options, + cl_target_option_restore (&global_options, &global_options_set, TREE_TARGET_OPTION (prev_tree)); return false; } diff --git a/gcc/config/arm/arm-cpus.in b/gcc/config/arm/arm-cpus.in index 728be50..8c61ad0 100644 --- a/gcc/config/arm/arm-cpus.in +++ b/gcc/config/arm/arm-cpus.in @@ -135,10 +135,6 @@ define feature armv8_1m_main # Floating point and Neon extensions. # VFPv1 is not supported in GCC. -# This feature bit is enabled for all VFP, MVE and -# MVE with floating point extensions. -define feature vfp_base - # Vector floating point v2. define feature vfpv2 @@ -251,7 +247,7 @@ define fgroup ALL_SIMD ALL_SIMD_INTERNAL ALL_SIMD_EXTERNAL # List of all FPU bits to strip out if -mfpu is used to override the # default. fp16 is deliberately missing from this list. 
-define fgroup ALL_FPU_INTERNAL vfp_base vfpv2 vfpv3 vfpv4 fpv5 fp16conv fp_dbl ALL_SIMD_INTERNAL +define fgroup ALL_FPU_INTERNAL vfpv2 vfpv3 vfpv4 fpv5 fp16conv fp_dbl ALL_SIMD_INTERNAL # Similarly, but including fp16 and other extensions that aren't part of # -mfpu support. define fgroup ALL_FPU_EXTERNAL fp16 bf16 @@ -296,11 +292,11 @@ define fgroup ARMv8r ARMv8a define fgroup ARMv8_1m_main ARMv8m_main armv8_1m_main # Useful combinations. -define fgroup VFPv2 vfp_base vfpv2 +define fgroup VFPv2 vfpv2 define fgroup VFPv3 VFPv2 vfpv3 define fgroup VFPv4 VFPv3 vfpv4 fp16conv define fgroup FPv5 VFPv4 fpv5 -define fgroup MVE mve vfp_base armv7em +define fgroup MVE mve armv7em define fgroup MVE_FP MVE FPv5 fp16 mve_float define fgroup FP_DBL fp_dbl @@ -310,6 +306,18 @@ define fgroup NEON FP_D32 neon define fgroup CRYPTO NEON crypto define fgroup DOTPROD NEON dotprod +# Implied feature bits. These are for non-named features shared between fgroups. +# Shared feature f belonging to fgroups A and B will be erroneously removed if: +# A and B are enabled by default AND A is disabled by a removal flag. +# To ensure that f is retained, we must add such bits to the ISA after +# processing the removal flags. This is implemented by 'implied bits': +# define implied <name> [<feature-or-fgroup>]+ +# This indicates that, if any of the listed features are enabled, or if any +# member of a listed fgroup is enabled, then <name> will be implicitly enabled. + +# Enabled for all VFP, MVE and MVE with floating point extensions. +define implied vfp_base MVE MVE_FP ALL_FP + # List of all quirk bits to strip out when comparing CPU features with # architectures. # xscale isn't really a 'quirk', but it isn't an architecture either and we @@ -716,7 +724,7 @@ begin arch armv8-r end arch armv8-r begin arch armv8.1-m.main - tune for cortex-m7 + tune for cortex-m55 tune flags CO_PROC base 8M_MAIN profile M @@ -1447,6 +1455,39 @@ begin cpu cortex-a77 part d0d end cpu cortex-a77 +begin cpu cortex-a78 + cname cortexa78 + tune for cortex-a57 + tune flags LDSCHED + architecture armv8.2-a+fp16+dotprod + option crypto add FP_ARMv8 CRYPTO + costs cortex_a57 + vendor 41 + part d41 +end cpu cortex-a78 + +begin cpu cortex-a78ae + cname cortexa78ae + tune for cortex-a57 + tune flags LDSCHED + architecture armv8.2-a+fp16+dotprod + option crypto add FP_ARMv8 CRYPTO + costs cortex_a57 + vendor 41 + part d42 +end cpu cortex-a78ae + +begin cpu cortex-x1 + cname cortexx1 + tune for cortex-a57 + tune flags LDSCHED + architecture armv8.2-a+fp16+dotprod + option crypto add FP_ARMv8 CRYPTO + costs cortex_a57 + vendor 41 + part d44 +end cpu cortex-x1 + begin cpu neoverse-n1 cname neoversen1 alias !ares @@ -1478,6 +1519,30 @@ begin cpu cortex-a76.cortex-a55 costs cortex_a57 end cpu cortex-a76.cortex-a55 +# Armv8.4 A-profile Architecture Processors +begin cpu neoverse-v1 + cname neoversev1 + tune for cortex-a57 + tune flags LDSCHED + architecture armv8.4-a+fp16+bf16+i8mm + option crypto add FP_ARMv8 CRYPTO + costs cortex_a57 + vendor 41 + part 0xd40 +end cpu neoverse-v1 + +# Armv8.5 A-profile Architecture Processors +begin cpu neoverse-n2 + cname neoversen2 + tune for cortex-a57 + tune flags LDSCHED + architecture armv8.5-a+fp16+bf16+i8mm + option crypto add FP_ARMv8 CRYPTO + costs cortex_a57 + vendor 41 + part 0xd49 +end cpu neoverse-n2 + # V8 M-profile implementations. 
begin cpu cortex-m23 cname cortexm23 @@ -1508,6 +1573,10 @@ begin cpu cortex-m55 cname cortexm55 tune flags LDSCHED architecture armv8.1-m.main+mve.fp+fp.dp + option nomve.fp remove mve_float + option nomve remove mve mve_float + option nofp remove ALL_FP mve_float + option nodsp remove MVE mve_float isa quirk_no_asmcpu costs v7m vendor 41 diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h index d52e8bf..703d616 100644 --- a/gcc/config/arm/arm-protos.h +++ b/gcc/config/arm/arm-protos.h @@ -116,6 +116,8 @@ extern enum reg_class coproc_secondary_reload_class (machine_mode, rtx, extern bool arm_tls_referenced_p (rtx); extern int arm_coproc_mem_operand (rtx, bool); +extern int arm_coproc_mem_operand_no_writeback (rtx); +extern int arm_coproc_mem_operand_wb (rtx, int); extern int neon_vector_mem_operand (rtx, int, bool); extern int mve_vector_mem_operand (machine_mode, rtx, bool); extern int neon_struct_mem_operand (rtx); @@ -370,9 +372,11 @@ extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx, extern bool arm_fusion_enabled_p (tune_params::fuse_ops); extern bool arm_valid_symbolic_address_p (rtx); extern bool arm_validize_comparison (rtx *, rtx *, rtx *); +extern bool arm_expand_vector_compare (rtx, rtx_code, rtx, rtx, bool); #endif /* RTX_CODE */ extern bool arm_gen_setmem (rtx *); +extern void arm_expand_vcond (rtx *, machine_mode); extern void arm_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel); extern bool arm_autoinc_modes_ok_p (machine_mode, enum arm_auto_incmodes); diff --git a/gcc/config/arm/arm-tables.opt b/gcc/config/arm/arm-tables.opt index ce35661..05f5c08 100644 --- a/gcc/config/arm/arm-tables.opt +++ b/gcc/config/arm/arm-tables.opt @@ -241,6 +241,15 @@ EnumValue Enum(processor_type) String(cortex-a77) Value( TARGET_CPU_cortexa77) EnumValue +Enum(processor_type) String(cortex-a78) Value( TARGET_CPU_cortexa78) + +EnumValue +Enum(processor_type) String(cortex-a78ae) Value( TARGET_CPU_cortexa78ae) + +EnumValue +Enum(processor_type) String(cortex-x1) Value( TARGET_CPU_cortexx1) + +EnumValue Enum(processor_type) String(neoverse-n1) Value( TARGET_CPU_neoversen1) EnumValue @@ -250,6 +259,12 @@ EnumValue Enum(processor_type) String(cortex-a76.cortex-a55) Value( TARGET_CPU_cortexa76cortexa55) EnumValue +Enum(processor_type) String(neoverse-v1) Value( TARGET_CPU_neoversev1) + +EnumValue +Enum(processor_type) String(neoverse-n2) Value( TARGET_CPU_neoversen2) + +EnumValue Enum(processor_type) String(cortex-m23) Value( TARGET_CPU_cortexm23) EnumValue diff --git a/gcc/config/arm/arm-tune.md b/gcc/config/arm/arm-tune.md index 8ea9435..32657da 100644 --- a/gcc/config/arm/arm-tune.md +++ b/gcc/config/arm/arm-tune.md @@ -45,7 +45,9 @@ cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35, cortexa73cortexa53,cortexa55,cortexa75, cortexa76,cortexa76ae,cortexa77, + cortexa78,cortexa78ae,cortexx1, neoversen1,cortexa75cortexa55,cortexa76cortexa55, - cortexm23,cortexm33,cortexm35p, - cortexm55,cortexr52" + neoversev1,neoversen2,cortexm23, + cortexm33,cortexm35p,cortexm55, + cortexr52" (const (symbol_ref "((enum attr_tune) arm_tune)"))) diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 4ed7173..0b8c5fa 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -247,8 +247,7 @@ static tree arm_build_builtin_va_list (void); static void arm_expand_builtin_va_start (tree, rtx); static tree arm_gimplify_va_arg_expr (tree, tree, gimple_seq *, gimple_seq *); static void arm_option_override (void); -static void arm_option_save (struct 
cl_target_option *, struct gcc_options *); -static void arm_option_restore (struct gcc_options *, +static void arm_option_restore (struct gcc_options *, struct gcc_options *, struct cl_target_option *); static void arm_override_options_after_change (void); static void arm_option_print (FILE *, int, struct cl_target_option *); @@ -442,9 +441,6 @@ static const struct attribute_spec arm_attribute_table[] = #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE arm_override_options_after_change -#undef TARGET_OPTION_SAVE -#define TARGET_OPTION_SAVE arm_option_save - #undef TARGET_OPTION_RESTORE #define TARGET_OPTION_RESTORE arm_option_restore @@ -3024,10 +3020,11 @@ static GTY(()) bool thumb_flipper; static GTY(()) tree init_optimize; static void -arm_override_options_after_change_1 (struct gcc_options *opts) +arm_override_options_after_change_1 (struct gcc_options *opts, + struct gcc_options *opts_set) { /* -falign-functions without argument: supply one. */ - if (opts->x_flag_align_functions && !opts->x_str_align_functions) + if (opts->x_flag_align_functions && !opts_set->x_str_align_functions) opts->x_str_align_functions = TARGET_THUMB_P (opts->x_target_flags) && opts->x_optimize_size ? "2" : "4"; } @@ -3037,31 +3034,15 @@ arm_override_options_after_change_1 (struct gcc_options *opts) static void arm_override_options_after_change (void) { - arm_configure_build_target (&arm_active_target, - TREE_TARGET_OPTION (target_option_default_node), - &global_options_set, false); - - arm_override_options_after_change_1 (&global_options); -} - -/* Implement TARGET_OPTION_SAVE. */ -static void -arm_option_save (struct cl_target_option *ptr, struct gcc_options *opts) -{ - ptr->x_arm_arch_string = opts->x_arm_arch_string; - ptr->x_arm_cpu_string = opts->x_arm_cpu_string; - ptr->x_arm_tune_string = opts->x_arm_tune_string; + arm_override_options_after_change_1 (&global_options, &global_options_set); } /* Implement TARGET_OPTION_RESTORE. */ static void -arm_option_restore (struct gcc_options *opts, struct cl_target_option *ptr) +arm_option_restore (struct gcc_options */* opts */, + struct gcc_options *opts_set, struct cl_target_option *ptr) { - opts->x_arm_arch_string = ptr->x_arm_arch_string; - opts->x_arm_cpu_string = ptr->x_arm_cpu_string; - opts->x_arm_tune_string = ptr->x_arm_tune_string; - arm_configure_build_target (&arm_active_target, ptr, &global_options_set, - false); + arm_configure_build_target (&arm_active_target, ptr, opts_set, false); } /* Reset options between modes that the user has specified. */ @@ -3069,7 +3050,7 @@ static void arm_option_override_internal (struct gcc_options *opts, struct gcc_options *opts_set) { - arm_override_options_after_change_1 (opts); + arm_override_options_after_change_1 (opts, opts_set); if (TARGET_INTERWORK && !bitmap_bit_p (arm_active_target.isa, isa_bit_thumb)) { @@ -3410,6 +3391,20 @@ arm_configure_build_target (struct arm_build_target *target, bitmap_ior (target->isa, target->isa, fpu_bits); } + /* There may be implied bits which we still need to enable. These are + non-named features which are needed to complete other sets of features, + but cannot be enabled from arm-cpus.in due to being shared between + multiple fgroups. Each entry in all_implied_fbits is of the form + ante -> cons, meaning that if the feature "ante" is enabled, we should + implicitly enable "cons". 
*/ + const struct fbit_implication *impl = all_implied_fbits; + while (impl->ante) + { + if (bitmap_bit_p (target->isa, impl->ante)) + bitmap_set_bit (target->isa, impl->cons); + impl++; + } + if (!arm_selected_tune) arm_selected_tune = arm_selected_cpu; else /* Validate the features passed to -mtune. */ @@ -3460,7 +3455,7 @@ arm_option_override (void) arm_fpu_index = (enum fpu_type) fpu_index; } - cl_target_option_save (&opts, &global_options); + cl_target_option_save (&opts, &global_options, &global_options_set); arm_configure_build_target (&arm_active_target, &opts, &global_options_set, true); @@ -3685,7 +3680,8 @@ arm_option_override (void) flag_schedule_fusion = 0; /* Need to remember initial options before they are overriden. */ - init_optimize = build_optimization_node (&global_options); + init_optimize = build_optimization_node (&global_options, + &global_options_set); arm_options_perform_arch_sanity_checks (); arm_option_override_internal (&global_options, &global_options_set); @@ -3694,7 +3690,7 @@ arm_option_override (void) /* Create the default target_options structure. */ target_option_default_node = target_option_current_node - = build_target_option_node (&global_options); + = build_target_option_node (&global_options, &global_options_set); /* Register global variables with the garbage collector. */ arm_add_gc_roots (); @@ -3838,7 +3834,7 @@ arm_options_perform_arch_sanity_checks (void) /* We don't clear D16-D31 VFP registers for cmse_nonsecure_call functions and ARMv8-M Baseline and Mainline do not allow such configuration. */ - if (use_cmse && LAST_VFP_REGNUM > LAST_LO_VFP_REGNUM) + if (use_cmse && TARGET_HARD_FLOAT && LAST_VFP_REGNUM > LAST_LO_VFP_REGNUM) error ("ARMv8-M Security Extensions incompatible with selected FPU"); @@ -13235,13 +13231,14 @@ neon_element_bits (machine_mode mode) /* Predicates for `match_operand' and `match_operator'. */ /* Return TRUE if OP is a valid coprocessor memory address pattern. - WB is true if full writeback address modes are allowed and is false + WB level is 2 if full writeback address modes are allowed, 1 if limited writeback address modes (POST_INC and PRE_DEC) are - allowed. */ + allowed and 0 if no writeback at all is supported. */ int -arm_coproc_mem_operand (rtx op, bool wb) +arm_coproc_mem_operand_wb (rtx op, int wb_level) { + gcc_assert (wb_level == 0 || wb_level == 1 || wb_level == 2); rtx ind; /* Reject eliminable registers. */ @@ -13274,16 +13271,18 @@ arm_coproc_mem_operand (rtx op, bool wb) /* Autoincremment addressing modes. POST_INC and PRE_DEC are acceptable in any case (subject to verification by - arm_address_register_rtx_p). We need WB to be true to accept + arm_address_register_rtx_p). We need full writeback to accept + PRE_INC and POST_DEC, and at least restricted writeback for PRE_INC and POST_DEC. */ - if (GET_CODE (ind) == POST_INC - || GET_CODE (ind) == PRE_DEC - || (wb - && (GET_CODE (ind) == PRE_INC - || GET_CODE (ind) == POST_DEC))) + if (wb_level > 0 + && (GET_CODE (ind) == POST_INC + || GET_CODE (ind) == PRE_DEC + || (wb_level > 1 + && (GET_CODE (ind) == PRE_INC + || GET_CODE (ind) == POST_DEC)))) return arm_address_register_rtx_p (XEXP (ind, 0), 0); - if (wb + if (wb_level > 1 && (GET_CODE (ind) == POST_MODIFY || GET_CODE (ind) == PRE_MODIFY) && arm_address_register_rtx_p (XEXP (ind, 0), 0) && GET_CODE (XEXP (ind, 1)) == PLUS @@ -13292,19 +13291,42 @@ arm_coproc_mem_operand (rtx op, bool wb) /* Match: (plus (reg) - (const)). 
*/ + (const)) + + The encoded immediate for 16-bit modes is multiplied by 2, + while the encoded immediate for 32-bit and 64-bit modes is + multiplied by 4. */ + int factor = MIN (GET_MODE_SIZE (GET_MODE (op)), 4); if (GET_CODE (ind) == PLUS && REG_P (XEXP (ind, 0)) && REG_MODE_OK_FOR_BASE_P (XEXP (ind, 0), VOIDmode) && CONST_INT_P (XEXP (ind, 1)) - && INTVAL (XEXP (ind, 1)) > -1024 - && INTVAL (XEXP (ind, 1)) < 1024 - && (INTVAL (XEXP (ind, 1)) & 3) == 0) + && IN_RANGE (INTVAL (XEXP (ind, 1)), -255 * factor, 255 * factor) + && (INTVAL (XEXP (ind, 1)) & (factor - 1)) == 0) return TRUE; return FALSE; } +/* Return TRUE if OP is a valid coprocessor memory address pattern. + WB is true if full writeback address modes are allowed and is false + if limited writeback address modes (POST_INC and PRE_DEC) are + allowed. */ + +int arm_coproc_mem_operand (rtx op, bool wb) +{ + return arm_coproc_mem_operand_wb (op, wb ? 2 : 1); +} + +/* Return TRUE if OP is a valid coprocessor memory address pattern in a + context in which no writeback address modes are allowed. */ + +int +arm_coproc_mem_operand_no_writeback (rtx op) +{ + return arm_coproc_mem_operand_wb (op, 0); +} + /* This function returns TRUE on matching mode and op. 1. For given modes, check for [Rn], return TRUE for Rn <= LO_REGS. 2. For other modes, check for [Rn], return TRUE for Rn < R15 (expect R13). */ @@ -23568,7 +23590,7 @@ arm_print_condition (FILE *stream) /* Globally reserved letters: acln Puncutation letters currently used: @_|?().!# Lower case letters currently used: bcdefhimpqtvwxyz - Upper case letters currently used: ABCDFGHJKLMNOPQRSTU + Upper case letters currently used: ABCDEFGHIJKLMNOPQRSTU Letters previously used, but now deprecated/obsolete: sVWXYZ. Note that the global reservation for 'c' is only for CONSTANT_ADDRESS_P. @@ -24134,11 +24156,12 @@ arm_print_operand (FILE *stream, rtx x, int code) } return; - /* To print the memory operand with "Ux" constraint. Based on the rtx_code - the memory operands output looks like following. + /* To print the memory operand with "Ux" or "Uj" constraint. Based on the + rtx_code the memory operands output looks like following. 1. [Rn], #+/-<imm> 2. [Rn, #+/-<imm>]! - 3. [Rn]. */ + 3. [Rn, #+/-<imm>] + 4. [Rn]. */ case 'E': { rtx addr; @@ -24173,6 +24196,16 @@ arm_print_operand (FILE *stream, rtx x, int code) asm_fprintf (stream, ", #%wd]!",INTVAL (postinc_reg)); } } + else if (code == PLUS) + { + rtx base = XEXP (addr, 0); + rtx index = XEXP (addr, 1); + + gcc_assert (REG_P (base) && CONST_INT_P (index)); + + HOST_WIDE_INT offset = INTVAL (index); + asm_fprintf (stream, "[%r, #%wd]", REGNO (base), offset); + } else { gcc_assert (REG_P (addr)); @@ -28931,6 +28964,30 @@ arm_preferred_simd_mode (scalar_mode mode) default:; } + if (TARGET_HAVE_MVE) + switch (mode) + { + case E_QImode: + return V16QImode; + case E_HImode: + return V8HImode; + case E_SImode: + return V4SImode; + + default:; + } + + if (TARGET_HAVE_MVE_FLOAT) + switch (mode) + { + case E_HFmode: + return V8HFmode; + case E_SFmode: + return V4SFmode; + + default:; + } + return word_mode; } @@ -29851,12 +29908,23 @@ arm_frame_pointer_required (void) return false; } -/* Only thumb1 can't support conditional execution, so return true if - the target is not thumb1. */ +/* Implement the TARGET_HAVE_CONDITIONAL_EXECUTION hook. + All modes except THUMB1 have conditional execution. + If we have conditional arithmetic, return false before reload to + enable some ifcvt transformations. 
*/ static bool arm_have_conditional_execution (void) { - return !TARGET_THUMB1; + bool has_cond_exec, enable_ifcvt_trans; + + /* Only THUMB1 cannot support conditional execution. */ + has_cond_exec = !TARGET_THUMB1; + + /* Enable ifcvt transformations if we have conditional arithmetic, but only + before reload. */ + enable_ifcvt_trans = TARGET_COND_ARITH && !reload_completed; + + return has_cond_exec && !enable_ifcvt_trans; } /* The AAPCS sets the maximum alignment of a vector to 64 bits. */ @@ -30604,6 +30672,127 @@ arm_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem, arm_post_atomic_barrier (model); } +/* Expand code to compare vectors OP0 and OP1 using condition CODE. + If CAN_INVERT, store either the result or its inverse in TARGET + and return true if TARGET contains the inverse. If !CAN_INVERT, + always store the result in TARGET, never its inverse. + + Note that the handling of floating-point comparisons is not + IEEE compliant. */ + +bool +arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1, + bool can_invert) +{ + machine_mode cmp_result_mode = GET_MODE (target); + machine_mode cmp_mode = GET_MODE (op0); + + bool inverted; + switch (code) + { + /* For these we need to compute the inverse of the requested + comparison. */ + case UNORDERED: + case UNLT: + case UNLE: + case UNGT: + case UNGE: + case UNEQ: + case NE: + code = reverse_condition_maybe_unordered (code); + if (!can_invert) + { + /* Recursively emit the inverted comparison into a temporary + and then store its inverse in TARGET. This avoids reusing + TARGET (which for integer NE could be one of the inputs). */ + rtx tmp = gen_reg_rtx (cmp_result_mode); + if (arm_expand_vector_compare (tmp, code, op0, op1, true)) + gcc_unreachable (); + emit_insn (gen_rtx_SET (target, gen_rtx_NOT (cmp_result_mode, tmp))); + return false; + } + inverted = true; + break; + + default: + inverted = false; + break; + } + + switch (code) + { + /* These are natively supported for zero comparisons, but otherwise + require the operands to be swapped. */ + case LE: + case LT: + if (op1 != CONST0_RTX (cmp_mode)) + { + code = swap_condition (code); + std::swap (op0, op1); + } + /* Fall through. */ + + /* These are natively supported for both register and zero operands. */ + case EQ: + case GE: + case GT: + emit_insn (gen_neon_vc (code, cmp_mode, target, op0, op1)); + return inverted; + + /* These are natively supported for register operands only. + Comparisons with zero aren't useful and should be folded + or canonicalized by target-independent code. */ + case GEU: + case GTU: + emit_insn (gen_neon_vc (code, cmp_mode, target, + op0, force_reg (cmp_mode, op1))); + return inverted; + + /* These require the operands to be swapped and likewise do not + support comparisons with zero. */ + case LEU: + case LTU: + emit_insn (gen_neon_vc (swap_condition (code), cmp_mode, + target, force_reg (cmp_mode, op1), op0)); + return inverted; + + /* These need a combination of two comparisons. */ + case LTGT: + case ORDERED: + { + /* Operands are LTGT iff (a > b || a < b). + Operands are ORDERED iff (a > b || a <= b). */ + rtx gt_res = gen_reg_rtx (cmp_result_mode); + rtx alt_res = gen_reg_rtx (cmp_result_mode); + rtx_code alt_code = (code == LTGT ?
LT : LE); + if (arm_expand_vector_compare (gt_res, GT, op0, op1, true) + || arm_expand_vector_compare (alt_res, alt_code, op0, op1, true)) + gcc_unreachable (); + emit_insn (gen_rtx_SET (target, gen_rtx_IOR (cmp_result_mode, + gt_res, alt_res))); + return inverted; + } + + default: + gcc_unreachable (); + } +} + +/* Expand a vcond or vcondu pattern with operands OPERANDS. + CMP_RESULT_MODE is the mode of the comparison result. */ + +void +arm_expand_vcond (rtx *operands, machine_mode cmp_result_mode) +{ + rtx mask = gen_reg_rtx (cmp_result_mode); + bool inverted = arm_expand_vector_compare (mask, GET_CODE (operands[3]), + operands[4], operands[5], true); + if (inverted) + std::swap (operands[1], operands[2]); + emit_insn (gen_neon_vbsl (GET_MODE (operands[0]), operands[0], + mask, operands[1], operands[2])); +} + #define MAX_VECT_LEN 16 struct expand_vec_perm_d @@ -32302,9 +32491,12 @@ arm_set_current_function (tree fndecl) arm_previous_fndecl = fndecl; /* First set the target options. */ - cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree)); + cl_target_option_restore (&global_options, &global_options_set, + TREE_TARGET_OPTION (new_tree)); save_restore_target_globals (new_tree); + + arm_override_options_after_change_1 (&global_options, &global_options_set); } /* Implement TARGET_OPTION_PRINT. */ @@ -32502,7 +32694,7 @@ arm_valid_target_attribute_tree (tree args, struct gcc_options *opts, if (!arm_valid_target_attribute_rec (args, opts)) return NULL_TREE; - cl_target_option_save (&cl_opts, opts); + cl_target_option_save (&cl_opts, opts, opts_set); arm_configure_build_target (&arm_active_target, &cl_opts, opts_set, false); arm_option_check_internal (opts); /* Do any overrides, such as global options arch=xxx. @@ -32511,11 +32703,11 @@ arm_valid_target_attribute_tree (tree args, struct gcc_options *opts, arm_options_perform_arch_sanity_checks (); arm_option_override_internal (opts, opts_set); - return build_target_option_node (opts); + return build_target_option_node (opts, opts_set); } static void -add_attribute (const char * mode, tree *attributes) +add_attribute (const char * mode, tree *attributes) { size_t len = strlen (mode); tree value = build_string (len, mode); @@ -32567,7 +32759,7 @@ arm_valid_target_attribute_p (tree fndecl, tree ARG_UNUSED (name), tree args, int ARG_UNUSED (flags)) { bool ret = true; - struct gcc_options func_options; + struct gcc_options func_options, func_options_set; tree cur_tree, new_optimize; gcc_assert ((fndecl != NULL_TREE) && (args != NULL_TREE)); @@ -32583,22 +32775,23 @@ arm_valid_target_attribute_p (tree fndecl, tree ARG_UNUSED (name), memset (&func_options, 0, sizeof (func_options)); init_options_struct (&func_options, NULL); lang_hooks.init_options_struct (&func_options); + memset (&func_options_set, 0, sizeof (func_options_set)); /* Initialize func_options to the defaults. */ - cl_optimization_restore (&func_options, + cl_optimization_restore (&func_options, &func_options_set, TREE_OPTIMIZATION (func_optimize)); - cl_target_option_restore (&func_options, + cl_target_option_restore (&func_options, &func_options_set, TREE_TARGET_OPTION (target_option_default_node)); /* Set func_options flags with new target mode. 
*/ cur_tree = arm_valid_target_attribute_tree (args, &func_options, - &global_options_set); + &func_options_set); if (cur_tree == NULL_TREE) ret = false; - new_optimize = build_optimization_node (&func_options); + new_optimize = build_optimization_node (&func_options, &func_options_set); DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = cur_tree; @@ -33082,9 +33275,7 @@ arm_expand_divmod_libfunc (rtx libfunc, machine_mode mode, = smallest_int_mode_for_size (2 * GET_MODE_BITSIZE (mode)); rtx libval = emit_library_call_value (libfunc, NULL_RTX, LCT_CONST, - libval_mode, - op0, GET_MODE (op0), - op1, GET_MODE (op1)); + libval_mode, op0, mode, op1, mode); rtx quotient = simplify_gen_subreg (mode, libval, libval_mode, 0); rtx remainder = simplify_gen_subreg (mode, libval, libval_mode, diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h index 3887c51..4a63d33 100644 --- a/gcc/config/arm/arm.h +++ b/gcc/config/arm/arm.h @@ -177,6 +177,10 @@ emission of floating point pcs attributes. */ #define TARGET_CRC32 (arm_arch_crc) +/* Thumb-2 but also has some conditional arithmetic instructions like csinc, + csinv, etc. */ +#define TARGET_COND_ARITH (arm_arch8_1m_main) + /* The following two macros concern the ability to execute coprocessor instructions for VFPv3 or NEON. TARGET_VFP3/TARGET_VFPD32 are currently only ever tested when we know we are generating for VFP hardware; we need @@ -1106,6 +1110,47 @@ extern const int arm_arch_cde_coproc_bits[]; #define VALID_MVE_STRUCT_MODE(MODE) \ ((MODE) == TImode || (MODE) == OImode || (MODE) == XImode) +/* The conditions under which vector modes are supported for general + arithmetic using Neon. */ + +#define ARM_HAVE_NEON_V8QI_ARITH TARGET_NEON +#define ARM_HAVE_NEON_V4HI_ARITH TARGET_NEON +#define ARM_HAVE_NEON_V2SI_ARITH TARGET_NEON + +#define ARM_HAVE_NEON_V16QI_ARITH TARGET_NEON +#define ARM_HAVE_NEON_V8HI_ARITH TARGET_NEON +#define ARM_HAVE_NEON_V4SI_ARITH TARGET_NEON +#define ARM_HAVE_NEON_V2DI_ARITH TARGET_NEON + +/* HF operations have their own flush-to-zero control (FPSCR.FZ16). */ +#define ARM_HAVE_NEON_V4HF_ARITH TARGET_NEON_FP16INST +#define ARM_HAVE_NEON_V8HF_ARITH TARGET_NEON_FP16INST + +/* SF operations always flush to zero, regardless of FPSCR.FZ, so we can + only use them for general arithmetic when -funsafe-math-optimizations + is in effect. */ +#define ARM_HAVE_NEON_V2SF_ARITH \ + (TARGET_NEON && flag_unsafe_math_optimizations) +#define ARM_HAVE_NEON_V4SF_ARITH ARM_HAVE_NEON_V2SF_ARITH + +/* The conditions under which vector modes are supported for general + arithmetic by any vector extension. */ + +#define ARM_HAVE_V8QI_ARITH (ARM_HAVE_NEON_V8QI_ARITH || TARGET_REALLY_IWMMXT) +#define ARM_HAVE_V4HI_ARITH (ARM_HAVE_NEON_V4HI_ARITH || TARGET_REALLY_IWMMXT) +#define ARM_HAVE_V2SI_ARITH (ARM_HAVE_NEON_V2SI_ARITH || TARGET_REALLY_IWMMXT) + +#define ARM_HAVE_V16QI_ARITH (ARM_HAVE_NEON_V16QI_ARITH || TARGET_HAVE_MVE) +#define ARM_HAVE_V8HI_ARITH (ARM_HAVE_NEON_V8HI_ARITH || TARGET_HAVE_MVE) +#define ARM_HAVE_V4SI_ARITH (ARM_HAVE_NEON_V4SI_ARITH || TARGET_HAVE_MVE) +#define ARM_HAVE_V2DI_ARITH ARM_HAVE_NEON_V2DI_ARITH + +#define ARM_HAVE_V4HF_ARITH ARM_HAVE_NEON_V4HF_ARITH +#define ARM_HAVE_V2SF_ARITH ARM_HAVE_NEON_V2SF_ARITH + +#define ARM_HAVE_V8HF_ARITH (ARM_HAVE_NEON_V8HF_ARITH || TARGET_HAVE_MVE_FLOAT) +#define ARM_HAVE_V4SF_ARITH (ARM_HAVE_NEON_V4SF_ARITH || TARGET_HAVE_MVE_FLOAT) + /* The register numbers in sequence, for passing to arm_gen_load_multiple. 
*/ extern int arm_regs_in_sequence[]; diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md index a6a31f8..1a8e498 100644 --- a/gcc/config/arm/arm.md +++ b/gcc/config/arm/arm.md @@ -7289,7 +7289,9 @@ (define_insn "*arm32_mov<mode>" [(set (match_operand:HFBF 0 "nonimmediate_operand" "=r,m,r,r") (match_operand:HFBF 1 "general_operand" " m,r,r,F"))] - "TARGET_32BIT && !TARGET_HARD_FLOAT + "TARGET_32BIT + && !TARGET_HARD_FLOAT + && !TARGET_HAVE_MVE && ( s_register_operand (operands[0], <MODE>mode) || s_register_operand (operands[1], <MODE>mode))" "* @@ -7355,7 +7357,7 @@ if (arm_disable_literal_pool && (REG_P (operands[0]) || SUBREG_P (operands[0])) && CONST_DOUBLE_P (operands[1]) - && TARGET_HARD_FLOAT + && TARGET_VFP_BASE && !vfp3_const_double_rtx (operands[1])) { rtx clobreg = gen_reg_rtx (SFmode); @@ -7452,7 +7454,7 @@ if (arm_disable_literal_pool && (REG_P (operands[0]) || SUBREG_P (operands[0])) && CONSTANT_P (operands[1]) - && TARGET_HARD_FLOAT + && TARGET_VFP_BASE && !arm_const_double_rtx (operands[1]) && !(TARGET_VFP_DOUBLE && vfp3_const_double_rtx (operands[1]))) { @@ -9212,7 +9214,7 @@ operands[2] = operands[1]; else { - rtx mem = XEXP (force_const_mem (SImode, operands[1]), 0); + rtx mem = force_const_mem (SImode, operands[1]); emit_move_insn (operands[2], mem); } } @@ -9295,7 +9297,7 @@ operands[3] = operands[1]; else { - rtx mem = XEXP (force_const_mem (SImode, operands[1]), 0); + rtx mem = force_const_mem (SImode, operands[1]); emit_move_insn (operands[3], mem); } } @@ -9320,6 +9322,8 @@ [(set_attr "arch" "t1,32")] ) +;; DO NOT SPLIT THIS PATTERN. It is important for security reasons that the +;; canary value does not live beyond the end of this sequence. (define_insn "arm_stack_protect_test_insn" [(set (reg:CC_Z CC_REGNUM) (compare:CC_Z (unspec:SI [(match_operand:SI 1 "memory_operand" "m,m") @@ -9329,8 +9333,8 @@ (clobber (match_operand:SI 0 "register_operand" "=&l,&r")) (clobber (match_dup 2))] "TARGET_32BIT" - "ldr\t%0, [%2]\;ldr\t%2, %1\;eors\t%0, %2, %0" - [(set_attr "length" "8,12") + "ldr\t%0, [%2]\;ldr\t%2, %1\;eors\t%0, %2, %0\;mov\t%2, #0" + [(set_attr "length" "12,16") (set_attr "conds" "set") (set_attr "type" "multiple") (set_attr "arch" "t,32")] @@ -11211,7 +11215,7 @@ [(match_operand 3 "cc_register" "") (const_int 0)]) (neg:SI (match_operand:SI 2 "s_register_operand" "l,r")) (match_operand:SI 1 "s_register_operand" "0,0")))] - "TARGET_32BIT" + "TARGET_32BIT && !TARGET_COND_ARITH" "#" "&& reload_completed" [(cond_exec (match_op_dup 4 [(match_dup 3) (const_int 0)]) diff --git a/gcc/config/arm/arm.opt b/gcc/config/arm/arm.opt index cd3d8e1..f01cd65 100644 --- a/gcc/config/arm/arm.opt +++ b/gcc/config/arm/arm.opt @@ -21,15 +21,6 @@ HeaderInclude config/arm/arm-opts.h -TargetSave -const char *x_arm_arch_string - -TargetSave -const char *x_arm_cpu_string - -TargetSave -const char *x_arm_tune_string - Enum Name(tls_type) Type(enum arm_tls_type) TLS dialect to use: @@ -82,7 +73,7 @@ mapcs-stack-check Target Report Mask(APCS_STACK) Undocumented march= -Target RejectNegative Negative(march=) ToLower Joined Var(arm_arch_string) +Target Save RejectNegative Negative(march=) ToLower Joined Var(arm_arch_string) Specify the name of the target architecture. ; Other arm_arch values are loaded from arm-tables.opt @@ -107,7 +98,7 @@ Target Report Mask(CALLER_INTERWORKING) Thumb: Assume function pointers may go to non-Thumb aware code. 
mcpu= -Target RejectNegative Negative(mcpu=) ToLower Joined Var(arm_cpu_string) +Target Save RejectNegative Negative(mcpu=) ToLower Joined Var(arm_cpu_string) Specify the name of the target CPU. mfloat-abi= @@ -232,7 +223,7 @@ Target Report Mask(TPCS_LEAF_FRAME) Thumb: Generate (leaf) stack frames even if not needed. mtune= -Target RejectNegative Negative(mtune=) ToLower Joined Var(arm_tune_string) +Target Save RejectNegative Negative(mtune=) ToLower Joined Var(arm_tune_string) Tune code for the given processor. mprint-tune-info diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h index a801705..ccdac67 100644 --- a/gcc/config/arm/arm_mve.h +++ b/gcc/config/arm/arm_mve.h @@ -141,6 +141,7 @@ #define vrev64q_m(__inactive, __a, __p) __arm_vrev64q_m(__inactive, __a, __p) #define vqrdmlashq(__a, __b, __c) __arm_vqrdmlashq(__a, __b, __c) #define vqrdmlahq(__a, __b, __c) __arm_vqrdmlahq(__a, __b, __c) +#define vqdmlashq(__a, __b, __c) __arm_vqdmlashq(__a, __b, __c) #define vqdmlahq(__a, __b, __c) __arm_vqdmlahq(__a, __b, __c) #define vmvnq_m(__inactive, __a, __p) __arm_vmvnq_m(__inactive, __a, __p) #define vmlasq(__a, __b, __c) __arm_vmlasq(__a, __b, __c) @@ -260,6 +261,7 @@ #define vorrq_m(__inactive, __a, __b, __p) __arm_vorrq_m(__inactive, __a, __b, __p) #define vqaddq_m(__inactive, __a, __b, __p) __arm_vqaddq_m(__inactive, __a, __b, __p) #define vqdmladhq_m(__inactive, __a, __b, __p) __arm_vqdmladhq_m(__inactive, __a, __b, __p) +#define vqdmlashq_m(__a, __b, __c, __p) __arm_vqdmlashq_m(__a, __b, __c, __p) #define vqdmladhxq_m(__inactive, __a, __b, __p) __arm_vqdmladhxq_m(__inactive, __a, __b, __p) #define vqdmlahq_m(__a, __b, __c, __p) __arm_vqdmlahq_m(__a, __b, __c, __p) #define vqdmlsdhq_m(__inactive, __a, __b, __p) __arm_vqdmlsdhq_m(__inactive, __a, __b, __p) @@ -643,6 +645,7 @@ #define vcvtpq_u16_f16(__a) __arm_vcvtpq_u16_f16(__a) #define vcvtpq_u32_f32(__a) __arm_vcvtpq_u32_f32(__a) #define vcvtnq_u16_f16(__a) __arm_vcvtnq_u16_f16(__a) +#define vcvtnq_u32_f32(__a) __arm_vcvtnq_u32_f32(__a) #define vcvtmq_u16_f16(__a) __arm_vcvtmq_u16_f16(__a) #define vcvtmq_u32_f32(__a) __arm_vcvtmq_u32_f32(__a) #define vcvtaq_u16_f16(__a) __arm_vcvtaq_u16_f16(__a) @@ -1234,9 +1237,6 @@ #define vpselq_u8(__a, __b, __p) __arm_vpselq_u8(__a, __b, __p) #define vpselq_s8(__a, __b, __p) __arm_vpselq_s8(__a, __b, __p) #define vrev64q_m_u8(__inactive, __a, __p) __arm_vrev64q_m_u8(__inactive, __a, __p) -#define vqrdmlashq_n_u8(__a, __b, __c) __arm_vqrdmlashq_n_u8(__a, __b, __c) -#define vqrdmlahq_n_u8(__a, __b, __c) __arm_vqrdmlahq_n_u8(__a, __b, __c) -#define vqdmlahq_n_u8(__a, __b, __c) __arm_vqdmlahq_n_u8(__a, __b, __c) #define vmvnq_m_u8(__inactive, __a, __p) __arm_vmvnq_m_u8(__inactive, __a, __p) #define vmlasq_n_u8(__a, __b, __c) __arm_vmlasq_n_u8(__a, __b, __c) #define vmlaq_n_u8(__a, __b, __c) __arm_vmlaq_n_u8(__a, __b, __c) @@ -1306,6 +1306,7 @@ #define vqdmlsdhxq_s8(__inactive, __a, __b) __arm_vqdmlsdhxq_s8(__inactive, __a, __b) #define vqdmlsdhq_s8(__inactive, __a, __b) __arm_vqdmlsdhq_s8(__inactive, __a, __b) #define vqdmlahq_n_s8(__a, __b, __c) __arm_vqdmlahq_n_s8(__a, __b, __c) +#define vqdmlashq_n_s8(__a, __b, __c) __arm_vqdmlashq_n_s8(__a, __b, __c) #define vqdmladhxq_s8(__inactive, __a, __b) __arm_vqdmladhxq_s8(__inactive, __a, __b) #define vqdmladhq_s8(__inactive, __a, __b) __arm_vqdmladhq_s8(__inactive, __a, __b) #define vmlsdavaxq_s8(__a, __b, __c) __arm_vmlsdavaxq_s8(__a, __b, __c) @@ -1319,9 +1320,6 @@ #define vpselq_u16(__a, __b, __p) __arm_vpselq_u16(__a, __b, __p) #define 
vpselq_s16(__a, __b, __p) __arm_vpselq_s16(__a, __b, __p) #define vrev64q_m_u16(__inactive, __a, __p) __arm_vrev64q_m_u16(__inactive, __a, __p) -#define vqrdmlashq_n_u16(__a, __b, __c) __arm_vqrdmlashq_n_u16(__a, __b, __c) -#define vqrdmlahq_n_u16(__a, __b, __c) __arm_vqrdmlahq_n_u16(__a, __b, __c) -#define vqdmlahq_n_u16(__a, __b, __c) __arm_vqdmlahq_n_u16(__a, __b, __c) #define vmvnq_m_u16(__inactive, __a, __p) __arm_vmvnq_m_u16(__inactive, __a, __p) #define vmlasq_n_u16(__a, __b, __c) __arm_vmlasq_n_u16(__a, __b, __c) #define vmlaq_n_u16(__a, __b, __c) __arm_vmlaq_n_u16(__a, __b, __c) @@ -1390,6 +1388,7 @@ #define vqrdmladhq_s16(__inactive, __a, __b) __arm_vqrdmladhq_s16(__inactive, __a, __b) #define vqdmlsdhxq_s16(__inactive, __a, __b) __arm_vqdmlsdhxq_s16(__inactive, __a, __b) #define vqdmlsdhq_s16(__inactive, __a, __b) __arm_vqdmlsdhq_s16(__inactive, __a, __b) +#define vqdmlashq_n_s16(__a, __b, __c) __arm_vqdmlashq_n_s16(__a, __b, __c) #define vqdmlahq_n_s16(__a, __b, __c) __arm_vqdmlahq_n_s16(__a, __b, __c) #define vqdmladhxq_s16(__inactive, __a, __b) __arm_vqdmladhxq_s16(__inactive, __a, __b) #define vqdmladhq_s16(__inactive, __a, __b) __arm_vqdmladhq_s16(__inactive, __a, __b) @@ -1404,9 +1403,6 @@ #define vpselq_u32(__a, __b, __p) __arm_vpselq_u32(__a, __b, __p) #define vpselq_s32(__a, __b, __p) __arm_vpselq_s32(__a, __b, __p) #define vrev64q_m_u32(__inactive, __a, __p) __arm_vrev64q_m_u32(__inactive, __a, __p) -#define vqrdmlashq_n_u32(__a, __b, __c) __arm_vqrdmlashq_n_u32(__a, __b, __c) -#define vqrdmlahq_n_u32(__a, __b, __c) __arm_vqrdmlahq_n_u32(__a, __b, __c) -#define vqdmlahq_n_u32(__a, __b, __c) __arm_vqdmlahq_n_u32(__a, __b, __c) #define vmvnq_m_u32(__inactive, __a, __p) __arm_vmvnq_m_u32(__inactive, __a, __p) #define vmlasq_n_u32(__a, __b, __c) __arm_vmlasq_n_u32(__a, __b, __c) #define vmlaq_n_u32(__a, __b, __c) __arm_vmlaq_n_u32(__a, __b, __c) @@ -1475,6 +1471,7 @@ #define vqrdmladhq_s32(__inactive, __a, __b) __arm_vqrdmladhq_s32(__inactive, __a, __b) #define vqdmlsdhxq_s32(__inactive, __a, __b) __arm_vqdmlsdhxq_s32(__inactive, __a, __b) #define vqdmlsdhq_s32(__inactive, __a, __b) __arm_vqdmlsdhq_s32(__inactive, __a, __b) +#define vqdmlashq_n_s32(__a, __b, __c) __arm_vqdmlashq_n_s32(__a, __b, __c) #define vqdmlahq_n_s32(__a, __b, __c) __arm_vqdmlahq_n_s32(__a, __b, __c) #define vqdmladhxq_s32(__inactive, __a, __b) __arm_vqdmladhxq_s32(__inactive, __a, __b) #define vqdmladhq_s32(__inactive, __a, __b) __arm_vqdmladhq_s32(__inactive, __a, __b) @@ -1901,6 +1898,9 @@ #define vqdmladhxq_m_s8(__inactive, __a, __b, __p) __arm_vqdmladhxq_m_s8(__inactive, __a, __b, __p) #define vqdmladhxq_m_s32(__inactive, __a, __b, __p) __arm_vqdmladhxq_m_s32(__inactive, __a, __b, __p) #define vqdmladhxq_m_s16(__inactive, __a, __b, __p) __arm_vqdmladhxq_m_s16(__inactive, __a, __b, __p) +#define vqdmlashq_m_n_s8(__a, __b, __c, __p) __arm_vqdmlashq_m_n_s8(__a, __b, __c, __p) +#define vqdmlashq_m_n_s32(__a, __b, __c, __p) __arm_vqdmlashq_m_n_s32(__a, __b, __c, __p) +#define vqdmlashq_m_n_s16(__a, __b, __c, __p) __arm_vqdmlashq_m_n_s16(__a, __b, __c, __p) #define vqdmlahq_m_n_s8(__a, __b, __c, __p) __arm_vqdmlahq_m_n_s8(__a, __b, __c, __p) #define vqdmlahq_m_n_s32(__a, __b, __c, __p) __arm_vqdmlahq_m_n_s32(__a, __b, __c, __p) #define vqdmlahq_m_n_s16(__a, __b, __c, __p) __arm_vqdmlahq_m_n_s16(__a, __b, __c, __p) @@ -2024,8 +2024,6 @@ #define vmlaldavaq_p_u16(__a, __b, __c, __p) __arm_vmlaldavaq_p_u16(__a, __b, __c, __p) #define vmlaldavaxq_p_s32(__a, __b, __c, __p) __arm_vmlaldavaxq_p_s32(__a, __b, 
__c, __p) #define vmlaldavaxq_p_s16(__a, __b, __c, __p) __arm_vmlaldavaxq_p_s16(__a, __b, __c, __p) -#define vmlaldavaxq_p_u32(__a, __b, __c, __p) __arm_vmlaldavaxq_p_u32(__a, __b, __c, __p) -#define vmlaldavaxq_p_u16(__a, __b, __c, __p) __arm_vmlaldavaxq_p_u16(__a, __b, __c, __p) #define vmlsldavaq_p_s32(__a, __b, __c, __p) __arm_vmlsldavaq_p_s32(__a, __b, __c, __p) #define vmlsldavaq_p_s16(__a, __b, __c, __p) __arm_vmlsldavaq_p_s16(__a, __b, __c, __p) #define vmlsldavaxq_p_s32(__a, __b, __c, __p) __arm_vmlsldavaxq_p_s32(__a, __b, __c, __p) @@ -6961,27 +6959,6 @@ __arm_vrev64q_m_u8 (uint8x16_t __inactive, uint8x16_t __a, mve_pred16_t __p) __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vqrdmlashq_n_u8 (uint8x16_t __a, uint8x16_t __b, uint8_t __c) -{ - return __builtin_mve_vqrdmlashq_n_uv16qi (__a, __b, __c); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vqrdmlahq_n_u8 (uint8x16_t __a, uint8x16_t __b, uint8_t __c) -{ - return __builtin_mve_vqrdmlahq_n_uv16qi (__a, __b, __c); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vqdmlahq_n_u8 (uint8x16_t __a, uint8x16_t __b, uint8_t __c) -{ - return __builtin_mve_vqdmlahq_n_uv16qi (__a, __b, __c); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vmvnq_m_u8 (uint8x16_t __inactive, uint8x16_t __a, mve_pred16_t __p) { return __builtin_mve_vmvnq_m_uv16qi (__inactive, __a, __p); @@ -7424,6 +7401,13 @@ __arm_vqrdmlashq_n_s8 (int8x16_t __a, int8x16_t __b, int8_t __c) __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__arm_vqdmlashq_n_s8 (int8x16_t __a, int8x16_t __b, int8_t __c) +{ + return __builtin_mve_vqdmlashq_n_sv16qi (__a, __b, __c); +} + +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vqrdmlahq_n_s8 (int8x16_t __a, int8x16_t __b, int8_t __c) { return __builtin_mve_vqrdmlahq_n_sv16qi (__a, __b, __c); @@ -7557,27 +7541,6 @@ __arm_vrev64q_m_u16 (uint16x8_t __inactive, uint16x8_t __a, mve_pred16_t __p) __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vqrdmlashq_n_u16 (uint16x8_t __a, uint16x8_t __b, uint16_t __c) -{ - return __builtin_mve_vqrdmlashq_n_uv8hi (__a, __b, __c); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vqrdmlahq_n_u16 (uint16x8_t __a, uint16x8_t __b, uint16_t __c) -{ - return __builtin_mve_vqrdmlahq_n_uv8hi (__a, __b, __c); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vqdmlahq_n_u16 (uint16x8_t __a, uint16x8_t __b, uint16_t __c) -{ - return __builtin_mve_vqdmlahq_n_uv8hi (__a, __b, __c); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vmvnq_m_u16 (uint16x8_t __inactive, uint16x8_t __a, mve_pred16_t __p) { return __builtin_mve_vmvnq_m_uv8hi (__inactive, __a, __p); @@ -8019,6 +7982,13 @@ __arm_vqrdmlashq_n_s16 (int16x8_t __a, int16x8_t __b, int16_t __c) __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__arm_vqdmlashq_n_s16 (int16x8_t __a, int16x8_t __b, int16_t __c) +{ + 
return __builtin_mve_vqdmlashq_n_sv8hi (__a, __b, __c); +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vqrdmlahq_n_s16 (int16x8_t __a, int16x8_t __b, int16_t __c) { return __builtin_mve_vqrdmlahq_n_sv8hi (__a, __b, __c); @@ -8152,27 +8122,6 @@ __arm_vrev64q_m_u32 (uint32x4_t __inactive, uint32x4_t __a, mve_pred16_t __p) __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vqrdmlashq_n_u32 (uint32x4_t __a, uint32x4_t __b, uint32_t __c) -{ - return __builtin_mve_vqrdmlashq_n_uv4si (__a, __b, __c); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vqrdmlahq_n_u32 (uint32x4_t __a, uint32x4_t __b, uint32_t __c) -{ - return __builtin_mve_vqrdmlahq_n_uv4si (__a, __b, __c); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vqdmlahq_n_u32 (uint32x4_t __a, uint32x4_t __b, uint32_t __c) -{ - return __builtin_mve_vqdmlahq_n_uv4si (__a, __b, __c); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vmvnq_m_u32 (uint32x4_t __inactive, uint32x4_t __a, mve_pred16_t __p) { return __builtin_mve_vmvnq_m_uv4si (__inactive, __a, __p); @@ -8614,6 +8563,13 @@ __arm_vqrdmlashq_n_s32 (int32x4_t __a, int32x4_t __b, int32_t __c) __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__arm_vqdmlashq_n_s32 (int32x4_t __a, int32x4_t __b, int32_t __c) +{ + return __builtin_mve_vqdmlashq_n_sv4si (__a, __b, __c); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vqrdmlahq_n_s32 (int32x4_t __a, int32x4_t __b, int32_t __c) { return __builtin_mve_vqrdmlahq_n_sv4si (__a, __b, __c); @@ -11141,6 +11097,27 @@ __arm_vqrdmlashq_m_n_s16 (int16x8_t __a, int16x8_t __b, int16_t __c, mve_pred16_ __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__arm_vqdmlashq_m_n_s8 (int8x16_t __a, int8x16_t __b, int8_t __c, mve_pred16_t __p) +{ + return __builtin_mve_vqdmlashq_m_n_sv16qi (__a, __b, __c, __p); +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__arm_vqdmlashq_m_n_s16 (int16x8_t __a, int16x8_t __b, int16_t __c, mve_pred16_t __p) +{ + return __builtin_mve_vqdmlashq_m_n_sv8hi (__a, __b, __c, __p); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__arm_vqdmlashq_m_n_s32 (int32x4_t __a, int32x4_t __b, int32_t __c, mve_pred16_t __p) +{ + return __builtin_mve_vqdmlashq_m_n_sv4si (__a, __b, __c, __p); +} + +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vqrdmlsdhq_m_s8 (int8x16_t __inactive, int8x16_t __a, int8x16_t __b, mve_pred16_t __p) { return __builtin_mve_vqrdmlsdhq_m_sv16qi (__inactive, __a, __b, __p); @@ -11811,20 +11788,6 @@ __arm_vmlaldavaxq_p_s16 (int64_t __a, int16x8_t __b, int16x8_t __c, mve_pred16_t return __builtin_mve_vmlaldavaxq_p_sv8hi (__a, __b, __c, __p); } -__extension__ extern __inline uint64_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vmlaldavaxq_p_u32 (uint64_t __a, uint32x4_t __b, uint32x4_t __c, mve_pred16_t __p) -{ - return __builtin_mve_vmlaldavaxq_p_uv4si 
(__a, __b, __c, __p); -} - -__extension__ extern __inline uint64_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vmlaldavaxq_p_u16 (uint64_t __a, uint16x8_t __b, uint16x8_t __c, mve_pred16_t __p) -{ - return __builtin_mve_vmlaldavaxq_p_uv8hi (__a, __b, __c, __p); -} - __extension__ extern __inline int64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vmlsldavaq_p_s32 (int64_t __a, int32x4_t __b, int32x4_t __c, mve_pred16_t __p) @@ -17012,6 +16975,13 @@ __arm_vcvtnq_u16_f16 (float16x8_t __a) return __builtin_mve_vcvtnq_uv8hi (__a); } +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__arm_vcvtnq_u32_f32 (float32x4_t __a) +{ + return __builtin_mve_vcvtnq_uv4si (__a); +} + __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcvtmq_u16_f16 (float16x8_t __a) @@ -23742,27 +23712,6 @@ __arm_vrev64q_m (uint8x16_t __inactive, uint8x16_t __a, mve_pred16_t __p) __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vqrdmlashq (uint8x16_t __a, uint8x16_t __b, uint8_t __c) -{ - return __arm_vqrdmlashq_n_u8 (__a, __b, __c); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vqrdmlahq (uint8x16_t __a, uint8x16_t __b, uint8_t __c) -{ - return __arm_vqrdmlahq_n_u8 (__a, __b, __c); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vqdmlahq (uint8x16_t __a, uint8x16_t __b, uint8_t __c) -{ - return __arm_vqdmlahq_n_u8 (__a, __b, __c); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vmvnq_m (uint8x16_t __inactive, uint8x16_t __a, mve_pred16_t __p) { return __arm_vmvnq_m_u8 (__inactive, __a, __p); @@ -24204,6 +24153,13 @@ __arm_vqrdmlashq (int8x16_t __a, int8x16_t __b, int8_t __c) __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__arm_vqdmlashq (int8x16_t __a, int8x16_t __b, int8_t __c) +{ + return __arm_vqdmlashq_n_s8 (__a, __b, __c); +} + +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vqrdmlahq (int8x16_t __a, int8x16_t __b, int8_t __c) { return __arm_vqrdmlahq_n_s8 (__a, __b, __c); @@ -24337,27 +24293,6 @@ __arm_vrev64q_m (uint16x8_t __inactive, uint16x8_t __a, mve_pred16_t __p) __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vqrdmlashq (uint16x8_t __a, uint16x8_t __b, uint16_t __c) -{ - return __arm_vqrdmlashq_n_u16 (__a, __b, __c); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vqrdmlahq (uint16x8_t __a, uint16x8_t __b, uint16_t __c) -{ - return __arm_vqrdmlahq_n_u16 (__a, __b, __c); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vqdmlahq (uint16x8_t __a, uint16x8_t __b, uint16_t __c) -{ - return __arm_vqdmlahq_n_u16 (__a, __b, __c); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vmvnq_m (uint16x8_t __inactive, uint16x8_t __a, mve_pred16_t __p) { return __arm_vmvnq_m_u16 (__inactive, __a, __p); @@ -24799,6 +24734,13 @@ __arm_vqrdmlashq 
(int16x8_t __a, int16x8_t __b, int16_t __c) __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__arm_vqdmlashq (int16x8_t __a, int16x8_t __b, int16_t __c) +{ + return __arm_vqdmlashq_n_s16 (__a, __b, __c); +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vqrdmlahq (int16x8_t __a, int16x8_t __b, int16_t __c) { return __arm_vqrdmlahq_n_s16 (__a, __b, __c); @@ -24932,27 +24874,6 @@ __arm_vrev64q_m (uint32x4_t __inactive, uint32x4_t __a, mve_pred16_t __p) __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vqrdmlashq (uint32x4_t __a, uint32x4_t __b, uint32_t __c) -{ - return __arm_vqrdmlashq_n_u32 (__a, __b, __c); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vqrdmlahq (uint32x4_t __a, uint32x4_t __b, uint32_t __c) -{ - return __arm_vqrdmlahq_n_u32 (__a, __b, __c); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vqdmlahq (uint32x4_t __a, uint32x4_t __b, uint32_t __c) -{ - return __arm_vqdmlahq_n_u32 (__a, __b, __c); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vmvnq_m (uint32x4_t __inactive, uint32x4_t __a, mve_pred16_t __p) { return __arm_vmvnq_m_u32 (__inactive, __a, __p); @@ -25394,6 +25315,13 @@ __arm_vqrdmlashq (int32x4_t __a, int32x4_t __b, int32_t __c) __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__arm_vqdmlashq (int32x4_t __a, int32x4_t __b, int32_t __c) +{ + return __arm_vqdmlashq_n_s32 (__a, __b, __c); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vqrdmlahq (int32x4_t __a, int32x4_t __b, int32_t __c) { return __arm_vqrdmlahq_n_s32 (__a, __b, __c); @@ -27921,6 +27849,27 @@ __arm_vqrdmlashq_m (int16x8_t __a, int16x8_t __b, int16_t __c, mve_pred16_t __p) __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__arm_vqdmlashq_m (int8x16_t __a, int8x16_t __b, int8_t __c, mve_pred16_t __p) +{ + return __arm_vqdmlashq_m_n_s8 (__a, __b, __c, __p); +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__arm_vqdmlashq_m (int16x8_t __a, int16x8_t __b, int16_t __c, mve_pred16_t __p) +{ + return __arm_vqdmlashq_m_n_s16 (__a, __b, __c, __p); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__arm_vqdmlashq_m (int32x4_t __a, int32x4_t __b, int32_t __c, mve_pred16_t __p) +{ + return __arm_vqdmlashq_m_n_s32 (__a, __b, __c, __p); +} + +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vqrdmlsdhq_m (int8x16_t __inactive, int8x16_t __a, int8x16_t __b, mve_pred16_t __p) { return __arm_vqrdmlsdhq_m_s8 (__inactive, __a, __b, __p); @@ -28591,20 +28540,6 @@ __arm_vmlaldavaxq_p (int64_t __a, int16x8_t __b, int16x8_t __c, mve_pred16_t __p return __arm_vmlaldavaxq_p_s16 (__a, __b, __c, __p); } -__extension__ extern __inline uint64_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vmlaldavaxq_p (uint64_t __a, uint32x4_t __b, uint32x4_t __c, mve_pred16_t __p) -{ - return 
__arm_vmlaldavaxq_p_u32 (__a, __b, __c, __p); -} - -__extension__ extern __inline uint64_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vmlaldavaxq_p (uint64_t __a, uint16x8_t __b, uint16x8_t __c, mve_pred16_t __p) -{ - return __arm_vmlaldavaxq_p_u16 (__a, __b, __c, __p); -} - __extension__ extern __inline int64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vmlsldavaq_p (int64_t __a, int32x4_t __b, int32x4_t __c, mve_pred16_t __p) @@ -35651,6 +35586,7 @@ enum { short: __ARM_mve_type_int_n, \ int: __ARM_mve_type_int_n, \ long: __ARM_mve_type_int_n, \ + double: __ARM_mve_type_fp_n, \ long long: __ARM_mve_type_int_n, \ unsigned char: __ARM_mve_type_int_n, \ unsigned short: __ARM_mve_type_int_n, \ @@ -35723,6 +35659,8 @@ extern void *__ARM_undef; _Generic(param, type: param, default: *(type *)__ARM_undef) #define __ARM_mve_coerce1(param, type) \ _Generic(param, type: param, const type: param, default: *(type *)__ARM_undef) +#define __ARM_mve_coerce2(param, type) \ + _Generic(param, type: param, float16_t: param, float32_t: param, default: *(type *)__ARM_undef) #if (__ARM_FEATURE_MVE & 2) /* MVE Floating point. */ @@ -35939,14 +35877,14 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vaddq_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t)), \ int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]: __arm_vaddq_f16 (__ARM_mve_coerce(p0, float16x8_t), __ARM_mve_coerce(p1, float16x8_t)), \ int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]: __arm_vaddq_f32 (__ARM_mve_coerce(p0, float32x4_t), __ARM_mve_coerce(p1, float32x4_t)), \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vaddq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8_t)), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vaddq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16_t)), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vaddq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32_t)), \ - int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vaddq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8_t)), \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vaddq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16_t)), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vaddq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32_t)), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vaddq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16_t)), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vaddq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32_t)));}) + int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vaddq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, int)), \ + int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vaddq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, int)), \ + int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vaddq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, int)), \ + int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vaddq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int)), \ + int 
(*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vaddq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int)), \ + int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vaddq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int)), \ + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vaddq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce2(__p1, double)), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vaddq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce2(__p1, double)));}) #define __arm_vandq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -35997,8 +35935,8 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vmulq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8_t)), \ int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vmulq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16_t)), \ int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vmulq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32_t)), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vmulq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16_t)), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vmulq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32_t)), \ + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vmulq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce2(__p1, double)), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vmulq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce2(__p1, double)), \ int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vmulq_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t)), \ int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vmulq_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t)), \ int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vmulq_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t)), \ @@ -36029,8 +35967,8 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vcmpeqq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8_t)), \ int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vcmpeqq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16_t)), \ int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vcmpeqq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32_t)), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpeqq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16_t)), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpeqq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32_t)), \ + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpeqq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce2(__p1, double)), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpeqq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce2(__p1, double)), \ int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vcmpeqq_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t)), \ int 
(*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vcmpeqq_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t)), \ int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vcmpeqq_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t)), \ @@ -36069,8 +36007,8 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vcmpeqq_m_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32_t), p2), \ int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]: __arm_vcmpeqq_m_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t), p2), \ int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]: __arm_vcmpeqq_m_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), p2), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpeqq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16_t), p2), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpeqq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32_t), p2));}) + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpeqq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce2(__p1, double), p2), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpeqq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce2(__p1, double), p2));}) #define __arm_vcmpgtq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -36083,8 +36021,8 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vcmpgtq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32_t)), \ int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]: __arm_vcmpgtq_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t)), \ int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]: __arm_vcmpgtq_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t)), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpgtq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16_t)), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpgtq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32_t)));}) + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpgtq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce2(__p1, double)), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpgtq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce2(__p1, double)));}) #define __arm_vcmpleq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -36097,8 +36035,8 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vcmpleq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8_t)), \ int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vcmpleq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16_t)), \ int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vcmpleq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32_t)), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpleq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16_t)), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpleq_n_f32 
(__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32_t)));}) + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpleq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce2(__p1, double)), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpleq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce2(__p1, double)));}) #define __arm_vcmpltq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -36111,8 +36049,8 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vcmpltq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32_t)), \ int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]: __arm_vcmpltq_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t)), \ int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]: __arm_vcmpltq_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t)), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpltq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16_t)), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpltq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32_t)));}) + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpltq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce2(__p1, double)), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpltq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce2(__p1, double)));}) #define __arm_vcmpneq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -36123,8 +36061,8 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vcmpneq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8_t)), \ int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vcmpneq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16_t)), \ int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vcmpneq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32_t)), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpneq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16_t)), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpneq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32_t)), \ + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpneq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce2(__p1, double)), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpneq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce2(__p1, double)), \ int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vcmpneq_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t)), \ int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vcmpneq_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t)), \ int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vcmpneq_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t)), \ @@ -36179,8 +36117,8 @@ extern void *__ARM_undef; #define __arm_vmaxnmavq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int 
(*)[__ARM_mve_type_fp_n][__ARM_mve_type_float16x8_t]: __arm_vmaxnmavq_f16 (__ARM_mve_coerce(__p0, float16_t), __ARM_mve_coerce(__p1, float16x8_t)), \ - int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float32x4_t]: __arm_vmaxnmavq_f32 (__ARM_mve_coerce(__p0, float32_t), __ARM_mve_coerce(__p1, float32x4_t)));}) + int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float16x8_t]: __arm_vmaxnmavq_f16 (__ARM_mve_coerce2(__p0, double), __ARM_mve_coerce(__p1, float16x8_t)), \ + int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float32x4_t]: __arm_vmaxnmavq_f32 (__ARM_mve_coerce2(__p0, double), __ARM_mve_coerce(__p1, float32x4_t)));}) #define __arm_vmaxnmq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -36191,14 +36129,14 @@ extern void *__ARM_undef; #define __arm_vmaxnmvq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float16x8_t]: __arm_vmaxnmvq_f16 (__ARM_mve_coerce(__p0, float16_t), __ARM_mve_coerce(__p1, float16x8_t)), \ - int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float32x4_t]: __arm_vmaxnmvq_f32 (__ARM_mve_coerce(__p0, float32_t), __ARM_mve_coerce(__p1, float32x4_t)));}) + int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float16x8_t]: __arm_vmaxnmvq_f16 (__ARM_mve_coerce2(__p0, double), __ARM_mve_coerce(__p1, float16x8_t)), \ + int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float32x4_t]: __arm_vmaxnmvq_f32 (__ARM_mve_coerce2(__p0, double), __ARM_mve_coerce(__p1, float32x4_t)));}) #define __arm_vmaxnmvq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float16x8_t]: __arm_vmaxnmvq_f16 (__ARM_mve_coerce(__p0, float16_t), __ARM_mve_coerce(__p1, float16x8_t)), \ - int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float32x4_t]: __arm_vmaxnmvq_f32 (__ARM_mve_coerce(__p0, float32_t), __ARM_mve_coerce(__p1, float32x4_t)));}) + int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float16x8_t]: __arm_vmaxnmvq_f16 (__ARM_mve_coerce2(__p0, double), __ARM_mve_coerce(__p1, float16x8_t)), \ + int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float32x4_t]: __arm_vmaxnmvq_f32 (__ARM_mve_coerce2(__p0, double), __ARM_mve_coerce(__p1, float32x4_t)));}) #define __arm_vminnmaq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -36209,8 +36147,8 @@ extern void *__ARM_undef; #define __arm_vminnmavq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float16x8_t]: __arm_vminnmavq_f16 (__ARM_mve_coerce(__p0, float16_t), __ARM_mve_coerce(__p1, float16x8_t)), \ - int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float32x4_t]: __arm_vminnmavq_f32 (__ARM_mve_coerce(__p0, float32_t), __ARM_mve_coerce(__p1, float32x4_t)));}) + int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float16x8_t]: __arm_vminnmavq_f16 (__ARM_mve_coerce2(__p0, double), __ARM_mve_coerce(__p1, float16x8_t)), \ + int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float32x4_t]: __arm_vminnmavq_f32 (__ARM_mve_coerce2(__p0, double), __ARM_mve_coerce(__p1, float32x4_t)));}) #define __arm_vbrsrq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \ @@ -36232,8 +36170,8 @@ extern void *__ARM_undef; #define __arm_vsubq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - 
int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vsubq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16_t)), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vsubq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32_t)), \ + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vsubq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce2(__p1, double)), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vsubq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce2(__p1, double)), \ int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vsubq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8_t)), \ int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vsubq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16_t)), \ int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vsubq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32_t)), \ @@ -36252,8 +36190,8 @@ extern void *__ARM_undef; #define __arm_vminnmvq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float16x8_t]: __arm_vminnmvq_f16 (__ARM_mve_coerce(__p0, float16_t), __ARM_mve_coerce(__p1, float16x8_t)), \ - int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float32x4_t]: __arm_vminnmvq_f32 (__ARM_mve_coerce(__p0, float32_t), __ARM_mve_coerce(__p1, float32x4_t)));}) + int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float16x8_t]: __arm_vminnmvq_f16 (__ARM_mve_coerce2(__p0, double), __ARM_mve_coerce(__p1, float16x8_t)), \ + int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float32x4_t]: __arm_vminnmvq_f32 (__ARM_mve_coerce2(__p0, double), __ARM_mve_coerce(__p1, float32x4_t)));}) #define __arm_vshlq_r(p0,p1) ({ __typeof(p0) __p0 = (p0); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \ @@ -36782,10 +36720,15 @@ extern void *__ARM_undef; _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vqrdmlashq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8_t)), \ int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vqrdmlashq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16_t)), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vqrdmlashq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32_t)), \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vqrdmlashq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8_t)), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vqrdmlashq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16_t)), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vqrdmlashq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32_t)));}) + int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vqrdmlashq_n_s32 
(__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32_t)));}) + +#define __arm_vqdmlashq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ + __typeof(p1) __p1 = (p1); \ + __typeof(p2) __p2 = (p2); \ + _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ + int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vqdmlashq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8_t)), \ + int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vqdmlashq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16_t)), \ + int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vqdmlashq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32_t)));}) #define __arm_vqrdmlahq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -36793,10 +36736,7 @@ extern void *__ARM_undef; _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vqrdmlahq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8_t)), \ int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vqrdmlahq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16_t)), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vqrdmlahq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32_t)), \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vqrdmlahq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8_t)), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vqrdmlahq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16_t)), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vqrdmlahq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32_t)));}) + int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vqrdmlahq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32_t)));}) #define __arm_vmlasq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -36815,10 +36755,7 @@ extern void *__ARM_undef; _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vqdmlahq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8_t)), \ int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vqdmlahq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16_t)), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vqdmlahq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, 
int32x4_t), __ARM_mve_coerce(__p2, int32_t)), \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vqdmlahq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8_t)), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vqdmlahq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16_t)), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vqdmlahq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32_t)));}) + int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vqdmlahq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32_t)));}) #define __arm_vqrdmladhxq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -37011,8 +36948,8 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vcmpgtq_m_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8_t), p2), \ int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vcmpgtq_m_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16_t), p2), \ int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vcmpgtq_m_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32_t), p2), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpgtq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16_t), p2), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpgtq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32_t), p2), \ + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpgtq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce2(__p1, double), p2), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpgtq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce2(__p1, double), p2), \ int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]: __arm_vcmpgtq_m_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t), p2), \ int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]: __arm_vcmpgtq_m_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), p2));}) @@ -37027,8 +36964,8 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vcmpleq_m_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8_t), p2), \ int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vcmpleq_m_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16_t), p2), \ int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vcmpleq_m_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32_t), p2), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpleq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16_t), p2), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpleq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32_t), p2));}) + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpleq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce2(__p1, double), p2), \ + int 
(*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpleq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce2(__p1, double), p2));}) #define __arm_vcmpltq_m(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -37041,8 +36978,8 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vcmpltq_m_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8_t), p2), \ int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vcmpltq_m_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16_t), p2), \ int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vcmpltq_m_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32_t), p2), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpltq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16_t), p2), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpltq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32_t), p2));}) + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpltq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce2(__p1, double), p2), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpltq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce2(__p1, double), p2));}) #define __arm_vcmpneq_m(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -37061,8 +36998,8 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vcmpneq_m_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8_t), p2), \ int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vcmpneq_m_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16_t), p2), \ int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vcmpneq_m_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32_t), p2), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpneq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16_t), p2), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpneq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32_t), p2));}) + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpneq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce2(__p1, double), p2), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpneq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce2(__p1, double), p2));}) #define __arm_vcvtbq_m(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -37116,8 +37053,8 @@ extern void *__ARM_undef; __typeof(p1) __p1 = (p1); \ __typeof(p2) __p2 = (p2); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vfmaq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce(__p2, float16_t)), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vfmaq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce(__p2, float32_t)), \ + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vfmaq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), 
__ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce2(__p2, double)), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vfmaq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce2(__p2, double)), \ int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]: __arm_vfmaq_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce(__p2, float16x8_t)), \ int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]: __arm_vfmaq_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce(__p2, float32x4_t)));}) @@ -37132,8 +37069,8 @@ extern void *__ARM_undef; __typeof(p1) __p1 = (p1); \ __typeof(p2) __p2 = (p2); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vfmasq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce(__p2, float16_t)), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vfmasq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce(__p2, float32_t)));}) + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vfmasq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce2(__p2, double)), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vfmasq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce2(__p2, double)));}) #define __arm_vmaxnmaq_m(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -37156,14 +37093,14 @@ extern void *__ARM_undef; #define __arm_vmaxnmavq_p(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float16x8_t]: __arm_vmaxnmavq_p_f16 (__ARM_mve_coerce(__p0, float16_t), __ARM_mve_coerce(__p1, float16x8_t), p2), \ - int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float32x4_t]: __arm_vmaxnmavq_p_f32 (__ARM_mve_coerce(__p0, float32_t), __ARM_mve_coerce(__p1, float32x4_t), p2));}) + int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float16x8_t]: __arm_vmaxnmavq_p_f16 (__ARM_mve_coerce2(__p0, double), __ARM_mve_coerce(__p1, float16x8_t), p2), \ + int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float32x4_t]: __arm_vmaxnmavq_p_f32 (__ARM_mve_coerce2(__p0, double), __ARM_mve_coerce(__p1, float32x4_t), p2));}) #define __arm_vmaxnmvq_p(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float16x8_t]: __arm_vmaxnmvq_p_f16 (__ARM_mve_coerce(__p0, float16_t), __ARM_mve_coerce(__p1, float16x8_t), p2), \ - int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float32x4_t]: __arm_vmaxnmvq_p_f32 (__ARM_mve_coerce(__p0, float32_t), __ARM_mve_coerce(__p1, float32x4_t), p2));}) + int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float16x8_t]: __arm_vmaxnmvq_p_f16 (__ARM_mve_coerce2(__p0, double), __ARM_mve_coerce(__p1, float16x8_t), p2), \ + int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float32x4_t]: __arm_vmaxnmvq_p_f32 (__ARM_mve_coerce2(__p0, double), 
__ARM_mve_coerce(__p1, float32x4_t), p2));}) #define __arm_vminnmaq_m(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -37174,14 +37111,14 @@ extern void *__ARM_undef; #define __arm_vminnmavq_p(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float16x8_t]: __arm_vminnmavq_p_f16 (__ARM_mve_coerce(__p0, float16_t), __ARM_mve_coerce(__p1, float16x8_t), p2), \ - int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float32x4_t]: __arm_vminnmavq_p_f32 (__ARM_mve_coerce(__p0, float32_t), __ARM_mve_coerce(__p1, float32x4_t), p2));}) + int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float16x8_t]: __arm_vminnmavq_p_f16 (__ARM_mve_coerce2(__p0, double), __ARM_mve_coerce(__p1, float16x8_t), p2), \ + int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float32x4_t]: __arm_vminnmavq_p_f32 (__ARM_mve_coerce2(__p0, double), __ARM_mve_coerce(__p1, float32x4_t), p2));}) #define __arm_vminnmvq_p(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float16x8_t]: __arm_vminnmvq_p_f16 (__ARM_mve_coerce(__p0, float16_t), __ARM_mve_coerce(__p1, float16x8_t), p2), \ - int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float32x4_t]: __arm_vminnmvq_p_f32 (__ARM_mve_coerce(__p0, float32_t), __ARM_mve_coerce(__p1, float32x4_t), p2));}) + int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float16x8_t]: __arm_vminnmvq_p_f16 (__ARM_mve_coerce2(__p0, double), __ARM_mve_coerce(__p1, float16x8_t), p2), \ + int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float32x4_t]: __arm_vminnmvq_p_f32 (__ARM_mve_coerce2(__p0, double), __ARM_mve_coerce(__p1, float32x4_t), p2));}) #define __arm_vrndnq_m(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -37248,8 +37185,8 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vcmpgeq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32_t)), \ int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]: __arm_vcmpgeq_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t)), \ int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]: __arm_vcmpgeq_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t)), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpgeq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16_t)), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpgeq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32_t)));}) + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpgeq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce2(__p1, double)), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpgeq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce2(__p1, double)));}) #define __arm_vrshrnbq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -37353,8 +37290,8 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vcmpgeq_m_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8_t), p2), \ int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vcmpgeq_m_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16_t), p2), \ int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: 
__arm_vcmpgeq_m_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32_t), p2), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpgeq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16_t), p2), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpgeq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32_t), p2), \ + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vcmpgeq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce2(__p1, double), p2), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vcmpgeq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce2(__p1, double), p2), \ int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]: __arm_vcmpgeq_m_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t), p2), \ int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]: __arm_vcmpgeq_m_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), p2));}) @@ -37389,8 +37326,8 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vaddq_m_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, int), p3), \ int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vaddq_m_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, int), p3), \ int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vaddq_m_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, int), p3), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vaddq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce(__p2, float16_t), p3), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vaddq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce(__p2, float32_t), p3));}) + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vaddq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce2(__p2, double), p3), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vaddq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce2(__p2, double), p3));}) #define __arm_vandq_m(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -37531,15 +37468,15 @@ extern void *__ARM_undef; _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]: __arm_vfmaq_m_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce(__p2, float16x8_t), p3), \ int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]: __arm_vfmaq_m_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce(__p2, float32x4_t), p3), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vfmaq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t), 
__ARM_mve_coerce(__p2, float16_t), p3), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vfmaq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce(__p2, float32_t), p3));}) + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vfmaq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce2(__p2, double), p3), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vfmaq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce2(__p2, double), p3));}) #define __arm_vfmasq_m(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ __typeof(p2) __p2 = (p2); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vfmasq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce(__p2, float16_t), p3), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vfmasq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce(__p2, float32_t), p3));}) + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vfmasq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce2(__p2, double), p3), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vfmasq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce2(__p2, double), p3));}) #define __arm_vfmsq_m(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -37580,8 +37517,8 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vmulq_m_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8_t), p3), \ int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vmulq_m_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16_t), p3), \ int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vmulq_m_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32_t), p3), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vmulq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce(__p2, float16_t), p3), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vmulq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce(__p2, float32_t), p3));}) + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vmulq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce2(__p2, double), p3), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vmulq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce2(__p2, double), p3));}) #define 
__arm_vornq_m(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -37614,8 +37551,8 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vsubq_m_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8_t), p3), \ int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vsubq_m_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16_t), p3), \ int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vsubq_m_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32_t), p3), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vsubq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce(__p2, float16_t), p3), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vsubq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce(__p2, float32_t), p3));}) + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vsubq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce2(__p2, double), p3), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vsubq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce2(__p2, double), p3));}) #define __arm_vorrq_m(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -38113,8 +38050,8 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vaddq_x_n_u32 (__ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32_t), p3), \ int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]: __arm_vaddq_x_f16 (__ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce(__p2, float16x8_t), p3), \ int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]: __arm_vaddq_x_f32 (__ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce(__p2, float32x4_t), p3), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vaddq_x_n_f16 (__ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce(__p2, float16_t), p3), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vaddq_x_n_f32 (__ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce(__p2, float32_t), p3));}) + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vaddq_x_n_f16 (__ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce2(__p2, double), p3), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vaddq_x_n_f32 (__ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce2(__p2, double), p3));}) #define __arm_vandq_x(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \ __typeof(p2) __p2 = (p2); \ @@ -38248,8 +38185,8 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vmulq_x_n_u32 (__ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32_t), p3), \ int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]: __arm_vmulq_x_f16 (__ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce(__p2, float16x8_t), p3), \ int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]: __arm_vmulq_x_f32 (__ARM_mve_coerce(__p1, float32x4_t), 
__ARM_mve_coerce(__p2, float32x4_t), p3), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vmulq_x_n_f16 (__ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce(__p2, float16_t), p3), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vmulq_x_n_f32 (__ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce(__p2, float32_t), p3));}) + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vmulq_x_n_f16 (__ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce2(__p2, double), p3), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vmulq_x_n_f32 (__ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce2(__p2, double), p3));}) #define __arm_vnegq_x(p1,p2) ({ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p1)])0, \ @@ -38337,8 +38274,8 @@ extern void *__ARM_undef; _Generic( (int (*)[__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]: __arm_vsubq_x_f16 (__ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce(__p2, float16x8_t), p3), \ int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]: __arm_vsubq_x_f32 (__ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce(__p2, float32x4_t), p3), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vsubq_x_n_f16 (__ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce(__p2, float16_t), p3), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vsubq_x_n_f32 (__ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce(__p2, float32_t), p3));}) + int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vsubq_x_n_f16 (__ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce2(__p2, double), p3), \ + int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vsubq_x_n_f32 (__ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce2(__p2, double), p3));}) #define __arm_vcmulq_rot90_x(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \ __typeof(p2) __p2 = (p2); \ @@ -38370,8 +38307,8 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint16x8_t]: __arm_vsetq_lane_u16 (__ARM_mve_coerce(__p0, uint16_t), __ARM_mve_coerce(__p1, uint16x8_t), p2), \ int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint32x4_t]: __arm_vsetq_lane_u32 (__ARM_mve_coerce(__p0, uint32_t), __ARM_mve_coerce(__p1, uint32x4_t), p2), \ int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint64x2_t]: __arm_vsetq_lane_u64 (__ARM_mve_coerce(__p0, uint64_t), __ARM_mve_coerce(__p1, uint64x2_t), p2), \ - int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float16x8_t]: __arm_vsetq_lane_f16 (__ARM_mve_coerce(__p0, float16_t), __ARM_mve_coerce(__p1, float16x8_t), p2), \ - int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float32x4_t]: __arm_vsetq_lane_f32 (__ARM_mve_coerce(__p0, float32_t), __ARM_mve_coerce(__p1, float32x4_t), p2));}) + int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float16x8_t]: __arm_vsetq_lane_f16 (__ARM_mve_coerce2(__p0, double), __ARM_mve_coerce(__p1, float16x8_t), p2), \ + int (*)[__ARM_mve_type_fp_n][__ARM_mve_type_float32x4_t]: __arm_vsetq_lane_f32 (__ARM_mve_coerce2(__p0, double), __ARM_mve_coerce(__p1, float32x4_t), p2));}) #else /* MVE Integer. 
*/ @@ -38895,12 +38832,12 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]: __arm_vaddq_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t)), \ int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vaddq_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t)), \ int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vaddq_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t)), \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vaddq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8_t)), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vaddq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16_t)), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vaddq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32_t)), \ - int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vaddq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8_t)), \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vaddq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16_t)), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vaddq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32_t)));}) + int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vaddq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, int)), \ + int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vaddq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, int)), \ + int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vaddq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, int)), \ + int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vaddq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int)), \ + int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vaddq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int)), \ + int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vaddq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int)));}) #define __arm_vandq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -39254,10 +39191,15 @@ extern void *__ARM_undef; _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vqrdmlashq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8_t)), \ int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vqrdmlashq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16_t)), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vqrdmlashq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32_t)), \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vqrdmlashq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8_t)), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vqrdmlashq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, 
uint16x8_t), __ARM_mve_coerce(__p2, uint16_t)), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vqrdmlashq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32_t)));}) + int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vqrdmlashq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32_t)));}) + +#define __arm_vqdmlashq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ + __typeof(p1) __p1 = (p1); \ + __typeof(p2) __p2 = (p2); \ + _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ + int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vqdmlashq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8_t)), \ + int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vqdmlashq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16_t)), \ + int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vqdmlashq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32_t)));}) #define __arm_vqrdmlahq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -39265,10 +39207,7 @@ extern void *__ARM_undef; _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vqrdmlahq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8_t)), \ int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vqrdmlahq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16_t)), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vqrdmlahq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32_t)), \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vqrdmlahq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8_t)), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vqrdmlahq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16_t)), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vqrdmlahq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32_t)));}) + int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vqrdmlahq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32_t)));}) #define __arm_vqrdmladhxq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -39399,10 +39338,7 @@ extern void *__ARM_undef; _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vqdmlahq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8_t)), \ 
int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vqdmlahq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16_t)), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vqdmlahq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32_t)), \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vqdmlahq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8_t)), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vqdmlahq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16_t)), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vqdmlahq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32_t)));}) + int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vqdmlahq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32_t)));}) #define __arm_vqdmlsdhq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -40800,6 +40736,14 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vqrdmlashq_m_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16_t), p3), \ int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vqrdmlashq_m_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32_t), p3));}) +#define __arm_vqdmlashq_m(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \ + __typeof(p1) __p1 = (p1); \ + __typeof(p2) __p2 = (p2); \ + _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ + int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vqdmlashq_m_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8_t), p3), \ + int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vqdmlashq_m_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16_t), p3), \ + int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vqdmlashq_m_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32_t), p3));}) + #define __arm_vqrshlq_m(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ __typeof(p2) __p2 = (p2); \ @@ -41057,9 +41001,7 @@ extern void *__ARM_undef; __typeof(p2) __p2 = (p2); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vmlaldavaxq_p_s16 (__ARM_mve_coerce(__p0, int64_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16x8_t), p3), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vmlaldavaxq_p_s32 (__ARM_mve_coerce(__p0, int64_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3), \ - int 
(*)[__ARM_mve_type_int_n][__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vmlaldavaxq_p_u16 (__ARM_mve_coerce(__p0, uint64_t), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16x8_t), p3), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vmlaldavaxq_p_u32 (__ARM_mve_coerce(__p0, uint64_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t), p3));}) + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vmlaldavaxq_p_s32 (__ARM_mve_coerce(__p0, int64_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3));}) #define __arm_vmlsldavaq_p(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ @@ -41679,16 +41621,16 @@ extern void *__ARM_undef; #define __arm_vmaxavq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int8x16_t]: __arm_vmaxavq_s8 (__ARM_mve_coerce(__p0, uint8_t), __ARM_mve_coerce(__p1, int8x16_t)), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int16x8_t]: __arm_vmaxavq_s16 (__ARM_mve_coerce(__p0, uint16_t), __ARM_mve_coerce(__p1, int16x8_t)), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t]: __arm_vmaxavq_s32 (__ARM_mve_coerce(__p0, uint32_t), __ARM_mve_coerce(__p1, int32x4_t)));}) + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int8x16_t]: __arm_vmaxavq_s8 (__p0, __ARM_mve_coerce(__p1, int8x16_t)), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int16x8_t]: __arm_vmaxavq_s16 (__p0, __ARM_mve_coerce(__p1, int16x8_t)), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t]: __arm_vmaxavq_s32 (__p0, __ARM_mve_coerce(__p1, int32x4_t)));}) #define __arm_vmaxavq_p(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int8x16_t]: __arm_vmaxavq_p_s8 (__ARM_mve_coerce(__p0, uint8_t), __ARM_mve_coerce(__p1, int8x16_t), p2), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int16x8_t]: __arm_vmaxavq_p_s16 (__ARM_mve_coerce(__p0, uint16_t), __ARM_mve_coerce(__p1, int16x8_t), p2), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t]: __arm_vmaxavq_p_s32 (__ARM_mve_coerce(__p0, uint32_t), __ARM_mve_coerce(__p1, int32x4_t), p2));}) + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int8x16_t]: __arm_vmaxavq_p_s8 (__p0, __ARM_mve_coerce(__p1, int8x16_t), p2), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int16x8_t]: __arm_vmaxavq_p_s16 (__p0, __ARM_mve_coerce(__p1, int16x8_t), p2), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t]: __arm_vmaxavq_p_s32 (__p0, __ARM_mve_coerce(__p1, int32x4_t), p2));}) #define __arm_vmaxq_x(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \ __typeof(p2) __p2 = (p2); \ @@ -41703,36 +41645,36 @@ extern void *__ARM_undef; #define __arm_vmaxvq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int8x16_t]: __arm_vmaxvq_s8 (__ARM_mve_coerce(__p0, int8_t), __ARM_mve_coerce(__p1, int8x16_t)), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int16x8_t]: __arm_vmaxvq_s16 (__ARM_mve_coerce(__p0, int16_t), __ARM_mve_coerce(__p1, int16x8_t)), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t]: __arm_vmaxvq_s32 (__ARM_mve_coerce(__p0, int32_t), __ARM_mve_coerce(__p1, int32x4_t)), \ - 
int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint8x16_t]: __arm_vmaxvq_u8 (__ARM_mve_coerce(__p0, uint8_t), __ARM_mve_coerce(__p1, uint8x16_t)), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint16x8_t]: __arm_vmaxvq_u16 (__ARM_mve_coerce(__p0, uint16_t), __ARM_mve_coerce(__p1, uint16x8_t)), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint32x4_t]: __arm_vmaxvq_u32 (__ARM_mve_coerce(__p0, uint32_t), __ARM_mve_coerce(__p1, uint32x4_t)));}) + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int8x16_t]: __arm_vmaxvq_s8 (__p0, __ARM_mve_coerce(__p1, int8x16_t)), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int16x8_t]: __arm_vmaxvq_s16 (__p0, __ARM_mve_coerce(__p1, int16x8_t)), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t]: __arm_vmaxvq_s32 (__p0, __ARM_mve_coerce(__p1, int32x4_t)), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint8x16_t]: __arm_vmaxvq_u8 (__p0, __ARM_mve_coerce(__p1, uint8x16_t)), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint16x8_t]: __arm_vmaxvq_u16 (__p0, __ARM_mve_coerce(__p1, uint16x8_t)), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint32x4_t]: __arm_vmaxvq_u32 (__p0,__ARM_mve_coerce(__p1, uint32x4_t)));}) #define __arm_vmaxvq_p(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int8x16_t]: __arm_vmaxvq_p_s8 (__ARM_mve_coerce(__p0, int8_t), __ARM_mve_coerce(__p1, int8x16_t), p2), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int16x8_t]: __arm_vmaxvq_p_s16 (__ARM_mve_coerce(__p0, int16_t), __ARM_mve_coerce(__p1, int16x8_t), p2), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t]: __arm_vmaxvq_p_s32 (__ARM_mve_coerce(__p0, int32_t), __ARM_mve_coerce(__p1, int32x4_t), p2), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint8x16_t]: __arm_vmaxvq_p_u8 (__ARM_mve_coerce(__p0, uint8_t), __ARM_mve_coerce(__p1, uint8x16_t), p2), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint16x8_t]: __arm_vmaxvq_p_u16 (__ARM_mve_coerce(__p0, uint16_t), __ARM_mve_coerce(__p1, uint16x8_t), p2), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint32x4_t]: __arm_vmaxvq_p_u32 (__ARM_mve_coerce(__p0, uint32_t), __ARM_mve_coerce(__p1, uint32x4_t), p2));}) + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int8x16_t]: __arm_vmaxvq_p_s8 (__p0, __ARM_mve_coerce(__p1, int8x16_t), p2), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int16x8_t]: __arm_vmaxvq_p_s16 (__p0, __ARM_mve_coerce(__p1, int16x8_t), p2), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t]: __arm_vmaxvq_p_s32 (__p0, __ARM_mve_coerce(__p1, int32x4_t), p2), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint8x16_t]: __arm_vmaxvq_p_u8 (__p0, __ARM_mve_coerce(__p1, uint8x16_t), p2), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint16x8_t]: __arm_vmaxvq_p_u16 (__p0, __ARM_mve_coerce(__p1, uint16x8_t), p2), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint32x4_t]: __arm_vmaxvq_p_u32 (__p0, __ARM_mve_coerce(__p1, uint32x4_t), p2));}) #define __arm_vminavq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int8x16_t]: __arm_vminavq_s8 (__ARM_mve_coerce(__p0, uint8_t), __ARM_mve_coerce(__p1, int8x16_t)), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int16x8_t]: __arm_vminavq_s16 (__ARM_mve_coerce(__p0, uint16_t), __ARM_mve_coerce(__p1, int16x8_t)), \ - int 
(*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t]: __arm_vminavq_s32 (__ARM_mve_coerce(__p0, uint32_t), __ARM_mve_coerce(__p1, int32x4_t)));}) + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int8x16_t]: __arm_vminavq_s8 (__p0, __ARM_mve_coerce(__p1, int8x16_t)), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int16x8_t]: __arm_vminavq_s16 (__p0, __ARM_mve_coerce(__p1, int16x8_t)), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t]: __arm_vminavq_s32 (__p0, __ARM_mve_coerce(__p1, int32x4_t)));}) #define __arm_vminavq_p(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int8x16_t]: __arm_vminavq_p_s8 (__ARM_mve_coerce(__p0, uint8_t), __ARM_mve_coerce(__p1, int8x16_t), p2), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int16x8_t]: __arm_vminavq_p_s16 (__ARM_mve_coerce(__p0, uint16_t), __ARM_mve_coerce(__p1, int16x8_t), p2), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t]: __arm_vminavq_p_s32 (__ARM_mve_coerce(__p0, uint32_t), __ARM_mve_coerce(__p1, int32x4_t), p2));}) + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int8x16_t]: __arm_vminavq_p_s8 (__p0, __ARM_mve_coerce(__p1, int8x16_t), p2), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int16x8_t]: __arm_vminavq_p_s16 (__p0, __ARM_mve_coerce(__p1, int16x8_t), p2), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t]: __arm_vminavq_p_s32 (__p0, __ARM_mve_coerce(__p1, int32x4_t), p2));}) #define __arm_vminq_x(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \ __typeof(p2) __p2 = (p2); \ @@ -41747,22 +41689,22 @@ extern void *__ARM_undef; #define __arm_vminvq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int8x16_t]: __arm_vminvq_s8 (__ARM_mve_coerce(__p0, int8_t), __ARM_mve_coerce(__p1, int8x16_t)), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int16x8_t]: __arm_vminvq_s16 (__ARM_mve_coerce(__p0, int16_t), __ARM_mve_coerce(__p1, int16x8_t)), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t]: __arm_vminvq_s32 (__ARM_mve_coerce(__p0, int32_t), __ARM_mve_coerce(__p1, int32x4_t)), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint8x16_t]: __arm_vminvq_u8 (__ARM_mve_coerce(__p0, uint8_t), __ARM_mve_coerce(__p1, uint8x16_t)), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint16x8_t]: __arm_vminvq_u16 (__ARM_mve_coerce(__p0, uint16_t), __ARM_mve_coerce(__p1, uint16x8_t)), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint32x4_t]: __arm_vminvq_u32 (__ARM_mve_coerce(__p0, uint32_t), __ARM_mve_coerce(__p1, uint32x4_t)));}) + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int8x16_t]: __arm_vminvq_s8 (__p0, __ARM_mve_coerce(__p1, int8x16_t)), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int16x8_t]: __arm_vminvq_s16 (__p0, __ARM_mve_coerce(__p1, int16x8_t)), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t]: __arm_vminvq_s32 (__p0, __ARM_mve_coerce(__p1, int32x4_t)), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint8x16_t]: __arm_vminvq_u8 (__p0, __ARM_mve_coerce(__p1, uint8x16_t)), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint16x8_t]: __arm_vminvq_u16 (__p0, __ARM_mve_coerce(__p1, uint16x8_t)), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint32x4_t]: __arm_vminvq_u32 (__p0, __ARM_mve_coerce(__p1, uint32x4_t)));}) #define __arm_vminvq_p(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ 
_Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int8x16_t]: __arm_vminvq_p_s8 (__ARM_mve_coerce(__p0, int8_t), __ARM_mve_coerce(__p1, int8x16_t), p2), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int16x8_t]: __arm_vminvq_p_s16 (__ARM_mve_coerce(__p0, int16_t), __ARM_mve_coerce(__p1, int16x8_t), p2), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t]: __arm_vminvq_p_s32 (__ARM_mve_coerce(__p0, int32_t), __ARM_mve_coerce(__p1, int32x4_t), p2), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint8x16_t]: __arm_vminvq_p_u8 (__ARM_mve_coerce(__p0, uint8_t), __ARM_mve_coerce(__p1, uint8x16_t), p2), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint16x8_t]: __arm_vminvq_p_u16 (__ARM_mve_coerce(__p0, uint16_t), __ARM_mve_coerce(__p1, uint16x8_t), p2), \ - int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint32x4_t]: __arm_vminvq_p_u32 (__ARM_mve_coerce(__p0, uint32_t), __ARM_mve_coerce(__p1, uint32x4_t), p2));}) + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int8x16_t]: __arm_vminvq_p_s8 (__p0, __ARM_mve_coerce(__p1, int8x16_t), p2), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int16x8_t]: __arm_vminvq_p_s16 (__p0, __ARM_mve_coerce(__p1, int16x8_t), p2), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_int32x4_t]: __arm_vminvq_p_s32 (__p0, __ARM_mve_coerce(__p1, int32x4_t), p2), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint8x16_t]: __arm_vminvq_p_u8 (__p0, __ARM_mve_coerce(__p1, uint8x16_t), p2), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint16x8_t]: __arm_vminvq_p_u16 (__p0, __ARM_mve_coerce(__p1, uint16x8_t), p2), \ + int (*)[__ARM_mve_type_int_n][__ARM_mve_type_uint32x4_t]: __arm_vminvq_p_u32 (__p0, __ARM_mve_coerce(__p1, uint32x4_t), p2));}) #define __arm_vmladavaq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ diff --git a/gcc/config/arm/arm_mve_builtins.def b/gcc/config/arm/arm_mve_builtins.def index 753e40a..ac92818 100644 --- a/gcc/config/arm/arm_mve_builtins.def +++ b/gcc/config/arm/arm_mve_builtins.def @@ -312,9 +312,6 @@ VAR3 (TERNOP_NONE_NONE_UNONE_IMM, vshlcq_vec_s, v16qi, v8hi, v4si) VAR4 (TERNOP_UNONE_UNONE_UNONE_UNONE, vpselq_u, v16qi, v8hi, v4si, v2di) VAR4 (TERNOP_NONE_NONE_NONE_UNONE, vpselq_s, v16qi, v8hi, v4si, v2di) VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vrev64q_m_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vqrdmlashq_n_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vqrdmlahq_n_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vqdmlahq_n_u, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vmvnq_m_u, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vmlasq_n_u, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vmlaq_n_u, v16qi, v8hi, v4si) @@ -384,6 +381,7 @@ VAR3 (TERNOP_NONE_NONE_NONE_NONE, vqrdmladhq_s, v16qi, v8hi, v4si) VAR3 (TERNOP_NONE_NONE_NONE_NONE, vqdmlsdhxq_s, v16qi, v8hi, v4si) VAR3 (TERNOP_NONE_NONE_NONE_NONE, vqdmlsdhq_s, v16qi, v8hi, v4si) VAR3 (TERNOP_NONE_NONE_NONE_NONE, vqdmlahq_n_s, v16qi, v8hi, v4si) +VAR3 (TERNOP_NONE_NONE_NONE_NONE, vqdmlashq_n_s, v16qi, v8hi, v4si) VAR3 (TERNOP_NONE_NONE_NONE_NONE, vqdmladhxq_s, v16qi, v8hi, v4si) VAR3 (TERNOP_NONE_NONE_NONE_NONE, vqdmladhq_s, v16qi, v8hi, v4si) VAR3 (TERNOP_NONE_NONE_NONE_NONE, vmlsdavaxq_s, v16qi, v8hi, v4si) @@ -574,6 +572,7 @@ VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqdmulhq_m_n_s, v16qi, v8hi, v4si) VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqdmlsdhxq_m_s, v16qi, v8hi, v4si) VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, 
vqdmlsdhq_m_s, v16qi, v8hi, v4si) VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqdmlahq_m_n_s, v16qi, v8hi, v4si) +VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqdmlashq_m_n_s, v16qi, v8hi, v4si) VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqdmladhxq_m_s, v16qi, v8hi, v4si) VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqdmladhq_m_s, v16qi, v8hi, v4si) VAR3 (QUADOP_NONE_NONE_NONE_NONE_UNONE, vqaddq_m_s, v16qi, v8hi, v4si) @@ -615,7 +614,6 @@ VAR3 (QUADOP_NONE_NONE_NONE_IMM_UNONE, vrshrq_m_n_s, v16qi, v8hi, v4si) VAR3 (QUADOP_NONE_NONE_NONE_IMM_UNONE, vqshlq_m_n_s, v16qi, v8hi, v4si) VAR2 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vmulltq_poly_m_p, v16qi, v8hi) VAR2 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vmullbq_poly_m_p, v16qi, v8hi) -VAR2 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vmlaldavaxq_p_u, v8hi, v4si) VAR2 (QUADOP_UNONE_UNONE_UNONE_UNONE_UNONE, vmlaldavaq_p_u, v8hi, v4si) VAR2 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vshrntq_m_n_u, v8hi, v4si) VAR2 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vshrnbq_m_n_u, v8hi, v4si) diff --git a/gcc/config/arm/constraints.md b/gcc/config/arm/constraints.md index 011badc..789e333 100644 --- a/gcc/config/arm/constraints.md +++ b/gcc/config/arm/constraints.md @@ -452,6 +452,16 @@ (and (match_code "mem") (match_test "TARGET_32BIT && arm_coproc_mem_operand (op, FALSE)"))) +(define_memory_constraint "Uj" + "@internal + In ARM/Thumb-2 state a VFP load/store address that supports writeback + for Neon but not for MVE" + (and (match_code "mem") + (match_test "TARGET_32BIT") + (match_test "TARGET_HAVE_MVE + ? arm_coproc_mem_operand_no_writeback (op) + : neon_vector_mem_operand (op, 2, true)"))) + (define_memory_constraint "Uy" "@internal In ARM/Thumb-2 state a valid iWMMX load/store address." diff --git a/gcc/config/arm/driver-arm.c b/gcc/config/arm/driver-arm.c index 254e5ba..85058f2 100644 --- a/gcc/config/arm/driver-arm.c +++ b/gcc/config/arm/driver-arm.c @@ -61,6 +61,7 @@ host_detect_local_cpu (int argc, const char **argv) FILE *f = NULL; bool arch; const struct vendor_cpu *cpu_table = NULL; + char *fcpu_info = NULL; if (argc < 1) goto not_found; @@ -69,7 +70,12 @@ host_detect_local_cpu (int argc, const char **argv) if (!arch && strcmp (argv[0], "cpu") != 0 && strcmp (argv[0], "tune")) goto not_found; - f = fopen ("/proc/cpuinfo", "r"); + fcpu_info = getenv ("GCC_CPUINFO"); + if (fcpu_info) + f = fopen (fcpu_info, "r"); + else + f = fopen ("/proc/cpuinfo", "r"); + if (f == NULL) goto not_found; diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index 0bc9eba..f934872 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -66,14 +66,6 @@ ;; Integer and float modes supported by Neon and IWMMXT. (define_mode_iterator VALL [V2DI V2SI V4HI V8QI V2SF V4SI V8HI V16QI V4SF]) -;; Integer and float modes supported by Neon, IWMMXT and MVE, used by -;; arithmetic epxand patterns. -(define_mode_iterator VNIM [V16QI V8HI V4SI V4SF]) - -;; Integer and float modes supported by Neon and IWMMXT but not MVE, used by -;; arithmetic epxand patterns. -(define_mode_iterator VNINOTM [V2SI V4HI V8QI V2SF V2DI]) - ;; Integer and float modes supported by Neon, IWMMXT and MVE. (define_mode_iterator VNIM1 [V16QI V8HI V4SI V4SF V2DI]) @@ -267,6 +259,16 @@ (define_mode_iterator VBFCVT [V4BF V8BF]) (define_mode_iterator VBFCVTM [V2SI SF]) +;; MVE mode iterator. 
+(define_mode_iterator MVE_types [V16QI V8HI V4SI V2DI TI V8HF V4SF V2DF]) +(define_mode_iterator MVE_VLD_ST [V16QI V8HI V4SI V8HF V4SF]) +(define_mode_iterator MVE_0 [V8HF V4SF]) +(define_mode_iterator MVE_1 [V16QI V8HI V4SI V2DI]) +(define_mode_iterator MVE_3 [V16QI V8HI]) +(define_mode_iterator MVE_2 [V16QI V8HI V4SI]) +(define_mode_iterator MVE_5 [V8HI V4SI]) +(define_mode_iterator MVE_6 [V8HI V4SI]) + ;;---------------------------------------------------------------------------- ;; Code iterators ;;---------------------------------------------------------------------------- @@ -901,6 +903,35 @@ (define_mode_attr cde_suffix [(SI "") (DI "d")]) (define_mode_attr cde_dest [(SI "%0") (DI "%0, %H0")]) +;;MVE mode attribute. +(define_mode_attr MVE_CNVT [(V8HI "V8HF") (V4SI "V4SF") (V8HF "V8HI") + (V4SF "V4SI")]) +(define_mode_attr MVE_LANES [(V16QI "16") (V8HI "8") (V4SI "4")]) + +(define_mode_attr MVE_constraint [ (V16QI "Ra") (V8HI "Rc") (V4SI "Re")]) +(define_mode_attr MVE_constraint1 [ (V8HI "Ra") (V4SI "Rc")]) +(define_mode_attr MVE_constraint2 [(V16QI "Rb") (V8HI "Rd") (V4SI "Rf") + (V8HF "Rd") (V4SF "Rf")]) +(define_mode_attr MVE_constraint3 [ (V8HI "Rb") (V4SI "Rd")]) + +(define_mode_attr MVE_pred [ (V16QI "mve_imm_7") (V8HI "mve_imm_15") + (V4SI "mve_imm_31")]) +(define_mode_attr MVE_pred1 [ (V8HI "mve_imm_7") (V4SI "mve_imm_15")]) +(define_mode_attr MVE_pred2 [(V16QI "mve_imm_8") (V8HI "mve_imm_16") + (V4SI "mve_imm_32") + (V8HF "mve_imm_16") (V4SF "mve_imm_32")]) +(define_mode_attr MVE_pred3 [ (V8HI "mve_imm_8") (V4SI "mve_imm_16")]) + +(define_mode_attr MVE_B_ELEM [ (V16QI "V16QI") (V8HI "V8QI") (V4SI "V4QI")]) +(define_mode_attr MVE_H_ELEM [ (V8HI "V8HI") (V4SI "V4HI")]) + +(define_mode_attr V_sz_elem1 [(V16QI "b") (V8HI "h") (V4SI "w") (V8HF "h") + (V4SF "w")]) +(define_mode_attr V_extr_elem [(V16QI "u8") (V8HI "u16") (V4SI "32") + (V8HF "u16") (V4SF "32")]) +(define_mode_attr earlyclobber_32 [(V16QI "=w") (V8HI "=w") (V4SI "=&w") + (V8HF "=w") (V4SF "=&w")]) + ;;---------------------------------------------------------------------------- ;; Code attributes ;;---------------------------------------------------------------------------- @@ -1181,6 +1212,188 @@ (define_int_attr mmla_sfx [(UNSPEC_MATMUL_S "s8") (UNSPEC_MATMUL_U "u8") (UNSPEC_MATMUL_US "s8")]) +;;MVE int attribute. 
+(define_int_attr supf [(VCVTQ_TO_F_S "s") (VCVTQ_TO_F_U "u") (VREV16Q_S "s") + (VREV16Q_U "u") (VMVNQ_N_S "s") (VMVNQ_N_U "u") + (VCVTAQ_U "u") (VCVTAQ_S "s") (VREV64Q_S "s") + (VREV64Q_U "u") (VMVNQ_S "s") (VMVNQ_U "u") + (VDUPQ_N_U "u") (VDUPQ_N_S"s") (VADDVQ_S "s") + (VADDVQ_U "u") (VADDVQ_S "s") (VADDVQ_U "u") + (VMOVLTQ_U "u") (VMOVLTQ_S "s") (VMOVLBQ_S "s") + (VMOVLBQ_U "u") (VCVTQ_FROM_F_S "s") (VCVTQ_FROM_F_U "u") + (VCVTPQ_S "s") (VCVTPQ_U "u") (VCVTNQ_S "s") + (VCVTNQ_U "u") (VCVTMQ_S "s") (VCVTMQ_U "u") + (VCLZQ_U "u") (VCLZQ_S "s") (VREV32Q_U "u") + (VREV32Q_S "s") (VADDLVQ_U "u") (VADDLVQ_S "s") + (VCVTQ_N_TO_F_S "s") (VCVTQ_N_TO_F_U "u") + (VCREATEQ_U "u") (VCREATEQ_S "s") (VSHRQ_N_S "s") + (VSHRQ_N_U "u") (VCVTQ_N_FROM_F_S "s") (VSHLQ_U "u") + (VCVTQ_N_FROM_F_U "u") (VADDLVQ_P_S "s") (VSHLQ_S "s") + (VADDLVQ_P_U "u") (VCMPNEQ_U "u") (VCMPNEQ_S "s") + (VABDQ_M_S "s") (VABDQ_M_U "u") (VABDQ_S "s") + (VABDQ_U "u") (VADDQ_N_S "s") (VADDQ_N_U "u") + (VADDVQ_P_S "s") (VADDVQ_P_U "u") (VANDQ_S "s") + (VANDQ_U "u") (VBICQ_S "s") (VBICQ_U "u") + (VBRSRQ_N_S "s") (VBRSRQ_N_U "u") (VCADDQ_ROT270_S "s") + (VCADDQ_ROT270_U "u") (VCADDQ_ROT90_S "s") + (VCMPEQQ_S "s") (VCMPEQQ_U "u") (VCADDQ_ROT90_U "u") + (VCMPEQQ_N_S "s") (VCMPEQQ_N_U "u") (VCMPNEQ_N_S "s") + (VCMPNEQ_N_U "u") (VEORQ_S "s") (VEORQ_U "u") + (VHADDQ_N_S "s") (VHADDQ_N_U "u") (VHADDQ_S "s") + (VHADDQ_U "u") (VHSUBQ_N_S "s") (VHSUBQ_N_U "u") + (VHSUBQ_S "s") (VMAXQ_S "s") (VMAXQ_U "u") (VHSUBQ_U "u") + (VMAXVQ_S "s") (VMAXVQ_U "u") (VMINQ_S "s") (VMINQ_U "u") + (VMINVQ_S "s") (VMINVQ_U "u") (VMLADAVQ_S "s") + (VMLADAVQ_U "u") (VMULHQ_S "s") (VMULHQ_U "u") + (VMULLBQ_INT_S "s") (VMULLBQ_INT_U "u") (VQADDQ_S "s") + (VMULLTQ_INT_S "s") (VMULLTQ_INT_U "u") (VQADDQ_U "u") + (VMULQ_N_S "s") (VMULQ_N_U "u") (VMULQ_S "s") + (VMULQ_U "u") (VORNQ_S "s") (VORNQ_U "u") (VORRQ_S "s") + (VORRQ_U "u") (VQADDQ_N_S "s") (VQADDQ_N_U "u") + (VQRSHLQ_N_S "s") (VQRSHLQ_N_U "u") (VQRSHLQ_S "s") + (VQRSHLQ_U "u") (VQSHLQ_N_S "s") (VQSHLQ_N_U "u") + (VQSHLQ_R_S "s") (VQSHLQ_R_U "u") (VQSHLQ_S "s") + (VQSHLQ_U "u") (VQSUBQ_N_S "s") (VQSUBQ_N_U "u") + (VQSUBQ_S "s") (VQSUBQ_U "u") (VRHADDQ_S "s") + (VRHADDQ_U "u") (VRMULHQ_S "s") (VRMULHQ_U "u") + (VRSHLQ_N_S "s") (VRSHLQ_N_U "u") (VRSHLQ_S "s") + (VRSHLQ_U "u") (VRSHRQ_N_S "s") (VRSHRQ_N_U "u") + (VSHLQ_N_S "s") (VSHLQ_N_U "u") (VSHLQ_R_S "s") + (VSHLQ_R_U "u") (VSUBQ_N_S "s") (VSUBQ_N_U "u") + (VSUBQ_S "s") (VSUBQ_U "u") (VADDVAQ_S "s") + (VADDVAQ_U "u") (VADDLVAQ_S "s") (VADDLVAQ_U "u") + (VBICQ_N_S "s") (VBICQ_N_U "u") (VMLALDAVQ_U "u") + (VMLALDAVQ_S "s") (VMLALDAVXQ_U "u") (VMLALDAVXQ_S "s") + (VMOVNBQ_U "u") (VMOVNBQ_S "s") (VMOVNTQ_U "u") + (VMOVNTQ_S "s") (VORRQ_N_S "s") (VORRQ_N_U "u") + (VQMOVNBQ_U "u") (VQMOVNBQ_S "s") (VQMOVNTQ_S "s") + (VQMOVNTQ_U "u") (VSHLLBQ_N_U "u") (VSHLLBQ_N_S "s") + (VSHLLTQ_N_U "u") (VSHLLTQ_N_S "s") (VRMLALDAVHQ_U "u") + (VRMLALDAVHQ_S "s") (VBICQ_M_N_S "s") (VBICQ_M_N_U "u") + (VCVTAQ_M_S "s") (VCVTAQ_M_U "u") (VCVTQ_M_TO_F_S "s") + (VCVTQ_M_TO_F_U "u") (VQRSHRNBQ_N_S "s") + (VQRSHRNBQ_N_U "u") (VABAVQ_S "s") (VABAVQ_U "u") + (VRMLALDAVHAQ_U "u") (VRMLALDAVHAQ_S "s") (VSHLCQ_S "s") + (VSHLCQ_U "u") (VADDVAQ_P_S "s") (VADDVAQ_P_U "u") + (VCLZQ_M_S "s") (VCLZQ_M_U "u") (VCMPEQQ_M_N_S "s") + (VCMPEQQ_M_N_U "u") (VCMPEQQ_M_S "s") (VCMPEQQ_M_U "u") + (VCMPNEQ_M_N_S "s") (VCMPNEQ_M_N_U "u") (VCMPNEQ_M_S "s") + (VCMPNEQ_M_U "u") (VDUPQ_M_N_S "s") (VDUPQ_M_N_U "u") + (VMAXVQ_P_S "s") (VMAXVQ_P_U "u") (VMINVQ_P_S "s") + (VMINVQ_P_U "u") (VMLADAVAQ_S "s") (VMLADAVAQ_U 
"u") + (VMLADAVQ_P_S "s") (VMLADAVQ_P_U "u") (VMLAQ_N_S "s") + (VMLAQ_N_U "u") (VMLASQ_N_S "s") (VMLASQ_N_U "u") + (VMVNQ_M_S "s") (VMVNQ_M_U "u") (VPSELQ_S "s") + (VPSELQ_U "u") (VQDMLAHQ_N_S "s") + (VQDMLASHQ_N_S "s") + (VQRDMLAHQ_N_S "s") + (VQRDMLASHQ_N_S "s") + (VQRSHLQ_M_N_S "s") (VQRSHLQ_M_N_U "u") + (VQSHLQ_M_R_S "s") (VQSHLQ_M_R_U "u") (VSRIQ_N_S "s") + (VREV64Q_M_S "s") (VREV64Q_M_U "u") (VSRIQ_N_U "u") + (VRSHLQ_M_N_S "s") (VRSHLQ_M_N_U "u") (VSHLQ_M_R_S "s") + (VSHLQ_M_R_U "u") (VSLIQ_N_S "s") (VSLIQ_N_U "u") + (VMLALDAVQ_P_S "s") (VQMOVNBQ_M_S "s") (VMOVLTQ_M_S "s") + (VMOVNBQ_M_S "s") (VRSHRNTQ_N_S "s") (VORRQ_M_N_S "s") + (VREV32Q_M_S "s") (VQRSHRNTQ_N_S "s") (VMOVNTQ_M_S "s") + (VMOVLBQ_M_S "s") (VMLALDAVAQ_S "s") (VQSHRNBQ_N_S "s") + (VSHRNBQ_N_S "s") (VRSHRNBQ_N_S "s") (VMLALDAVXQ_P_S "s") + (VQMOVNTQ_M_S "s") (VMVNQ_M_N_S "s") (VQSHRNTQ_N_S "s") + (VMLALDAVAXQ_S "s") (VSHRNTQ_N_S "s") (VMLALDAVQ_P_U "u") + (VQMOVNBQ_M_U "u") (VMOVLTQ_M_U "u") (VMOVNBQ_M_U "u") + (VRSHRNTQ_N_U "u") (VORRQ_M_N_U "u") (VREV32Q_M_U "u") + (VREV16Q_M_S "s") (VREV16Q_M_U "u") + (VQRSHRNTQ_N_U "u") (VMOVNTQ_M_U "u") (VMOVLBQ_M_U "u") + (VMLALDAVAQ_U "u") (VQSHRNBQ_N_U "u") (VSHRNBQ_N_U "u") + (VRSHRNBQ_N_U "u") (VMLALDAVXQ_P_U "u") + (VMVNQ_M_N_U "u") (VQSHRNTQ_N_U "u") (VMLALDAVAXQ_U "u") + (VQMOVNTQ_M_U "u") (VSHRNTQ_N_U "u") (VCVTMQ_M_S "s") + (VCVTMQ_M_U "u") (VCVTNQ_M_S "s") (VCVTNQ_M_U "u") + (VCVTPQ_M_S "s") (VCVTPQ_M_U "u") (VADDLVAQ_P_S "s") + (VCVTQ_M_N_FROM_F_U "u") (VCVTQ_M_FROM_F_S "s") + (VCVTQ_M_FROM_F_U "u") (VRMLALDAVHQ_P_U "u") + (VRMLALDAVHQ_P_S "s") (VADDLVAQ_P_U "u") + (VCVTQ_M_N_FROM_F_S "s") (VABAVQ_P_U "u") + (VABAVQ_P_S "s") (VSHLQ_M_S "s") (VSHLQ_M_U "u") + (VSRIQ_M_N_S "s") (VSRIQ_M_N_U "u") (VSUBQ_M_S "s") + (VSUBQ_M_U "u") (VCVTQ_M_N_TO_F_S "s") + (VCVTQ_M_N_TO_F_U "u") (VADDQ_M_N_U "u") + (VSHLQ_M_N_S "s") (VMAXQ_M_U "u") (VHSUBQ_M_N_U "u") + (VMULQ_M_N_S "s") (VQSHLQ_M_U "u") (VRHADDQ_M_S "s") + (VEORQ_M_U "u") (VSHRQ_M_N_U "u") (VCADDQ_ROT90_M_U "u") + (VMLADAVAQ_P_U "u") (VEORQ_M_S "s") (VBRSRQ_M_N_S "s") + (VMULQ_M_U "u") (VQRDMLAHQ_M_N_S "s") (VHSUBQ_M_N_S "s") + (VQRSHLQ_M_S "s") (VMULQ_M_N_U "u") + (VMULQ_M_S "s") (VQSHLQ_M_N_U "u") (VSLIQ_M_N_U "u") + (VMLADAVAQ_P_S "s") (VQRSHLQ_M_U "u") + (VMULLBQ_INT_M_U "u") (VSHLQ_M_N_U "u") (VQSUBQ_M_U "u") + (VQDMLASHQ_M_N_S "s") + (VQRDMLASHQ_M_N_U "u") (VRSHRQ_M_N_S "s") + (VORNQ_M_S "s") (VCADDQ_ROT270_M_S "s") (VRHADDQ_M_U "u") + (VRSHRQ_M_N_U "u") (VMLASQ_M_N_U "u") (VHSUBQ_M_U "u") + (VQSUBQ_M_N_S "s") (VMULLTQ_INT_M_S "s") + (VORRQ_M_S "s") (VQDMLAHQ_M_N_U "u") (VRSHLQ_M_S "s") + (VHADDQ_M_U "u") (VHADDQ_M_N_S "s") (VMULLTQ_INT_M_U "u") + (VORRQ_M_U "u") (VHADDQ_M_S "s") (VHADDQ_M_N_U "u") + (VQDMLAHQ_M_N_S "s") (VMAXQ_M_S "s") (VORNQ_M_U "u") + (VCADDQ_ROT270_M_U "u") (VQADDQ_M_U "u") + (VQRDMLASHQ_M_N_S "s") (VBICQ_M_U "u") (VMINQ_M_U "u") + (VSUBQ_M_N_S "s") (VMULLBQ_INT_M_S "s") (VQSUBQ_M_S "s") + (VCADDQ_ROT90_M_S "s") (VRMULHQ_M_S "s") (VANDQ_M_U "u") + (VMULHQ_M_S "s") (VADDQ_M_S "s") (VQRDMLAHQ_M_N_U "u") + (VMLASQ_M_N_S "s") (VHSUBQ_M_S "s") (VRMULHQ_M_U "u") + (VQADDQ_M_N_S "s") (VSHRQ_M_N_S "s") (VANDQ_M_S "s") + (VABDQ_M_U "u") (VQSHLQ_M_S "s") (VABDQ_M_S "s") + (VSUBQ_M_N_U "u") (VMLAQ_M_N_S "s") (VBRSRQ_M_N_U "u") + (VADDQ_M_U "u") (VRSHLQ_M_U "u") (VSLIQ_M_N_S "s") + (VQADDQ_M_N_U "u") (VADDQ_M_N_S "s") (VQSUBQ_M_N_U "u") + (VMLAQ_M_N_U "u") (VMINQ_M_S "s") (VMULHQ_M_U "u") + (VQADDQ_M_S "s") (VBICQ_M_S "s") (VQSHLQ_M_N_S "s") + (VQSHRNTQ_M_N_S "s") (VQSHRNTQ_M_N_U "u") + (VSHRNTQ_M_N_U "u") 
(VSHRNTQ_M_N_S "s") + (VSHRNBQ_M_N_S "s") (VSHRNBQ_M_N_U "u") + (VSHLLTQ_M_N_S "s") (VSHLLTQ_M_N_U "u") + (VSHLLBQ_M_N_S "s") (VSHLLBQ_M_N_U "u") + (VRSHRNTQ_M_N_S "s") (VRSHRNTQ_M_N_U "u") + (VRSHRNBQ_M_N_U "u") (VRSHRNBQ_M_N_S "s") + (VQSHRNTQ_M_N_U "u") (VQSHRNTQ_M_N_S "s") + (VQSHRNBQ_M_N_S "s") (VQSHRNBQ_M_N_U "u") + (VQRSHRNTQ_M_N_S "s") (VQRSHRNTQ_M_N_U "u") + (VQRSHRNBQ_M_N_S "s") (VQRSHRNBQ_M_N_U "u") + (VMLALDAVAXQ_P_S "s") + (VMLALDAVAQ_P_S "s") (VMLALDAVAQ_P_U "u") + (VSTRWQSB_S "s") (VSTRWQSB_U "u") (VSTRBQSO_S "s") + (VSTRBQSO_U "u") (VSTRBQ_S "s") (VSTRBQ_U "u") + (VLDRBQGO_S "s") (VLDRBQGO_U "u") (VLDRBQ_S "s") + (VLDRBQ_U "u") (VLDRWQGB_S "s") (VLDRWQGB_U "u") + (VLD1Q_S "s") (VLD1Q_U "u") (VLDRHQGO_S "s") + (VLDRHQGO_U "u") (VLDRHQGSO_S "s") (VLDRHQGSO_U "u") + (VLDRHQ_S "s") (VLDRHQ_U "u") (VLDRWQ_S "s") + (VLDRWQ_U "u") (VLDRDQGB_S "s") (VLDRDQGB_U "u") + (VLDRDQGO_S "s") (VLDRDQGO_U "u") (VLDRDQGSO_S "s") + (VLDRDQGSO_U "u") (VLDRWQGO_S "s") (VLDRWQGO_U "u") + (VLDRWQGSO_S "s") (VLDRWQGSO_U "u") (VST1Q_S "s") + (VST1Q_U "u") (VSTRHQSO_S "s") (VSTRHQSO_U "u") + (VSTRHQSSO_S "s") (VSTRHQSSO_U "u") (VSTRHQ_S "s") + (VSTRHQ_U "u") (VSTRWQ_S "s") (VSTRWQ_U "u") + (VSTRDQSB_S "s") (VSTRDQSB_U "u") (VSTRDQSO_S "s") + (VSTRDQSO_U "u") (VSTRDQSSO_S "s") (VSTRDQSSO_U "u") + (VSTRWQSO_U "u") (VSTRWQSO_S "s") (VSTRWQSSO_U "u") + (VSTRWQSSO_S "s") (VSTRWQSBWB_S "s") (VSTRWQSBWB_U "u") + (VLDRWQGBWB_S "s") (VLDRWQGBWB_U "u") (VLDRDQGBWB_S "s") + (VLDRDQGBWB_U "u") (VSTRDQSBWB_S "s") (VADCQ_M_S "s") + (VSTRDQSBWB_U "u") (VSBCQ_U "u") (VSBCQ_M_U "u") + (VSBCQ_S "s") (VSBCQ_M_S "s") (VSBCIQ_U "u") + (VSBCIQ_M_U "u") (VSBCIQ_S "s") (VSBCIQ_M_S "s") + (VADCQ_U "u") (VADCQ_M_U "u") (VADCQ_S "s") + (VADCIQ_U "u") (VADCIQ_M_U "u") (VADCIQ_S "s") + (VADCIQ_M_S "s") (SQRSHRL_64 "64") (SQRSHRL_48 "48") + (UQRSHLL_64 "64") (UQRSHLL_48 "48") (VSHLCQ_M_S "s") + (VSHLCQ_M_U "u")]) + +(define_int_attr mode1 [(VCTP8Q "8") (VCTP16Q "16") (VCTP32Q "32") + (VCTP64Q "64") (VCTP8Q_M "8") (VCTP16Q_M "16") + (VCTP32Q_M "32") (VCTP64Q_M "64")]) ;; Both kinds of return insn. (define_code_iterator RETURNS [return simple_return]) @@ -1256,3 +1469,249 @@ ;; An iterator for CDE MVE accumulator/non-accumulator versions. (define_int_attr a [(UNSPEC_VCDE "") (UNSPEC_VCDEA "a")]) + +;; MVE int iterator. 
+(define_int_iterator VCVTQ_TO_F [VCVTQ_TO_F_S VCVTQ_TO_F_U]) +(define_int_iterator VMVNQ_N [VMVNQ_N_U VMVNQ_N_S]) +(define_int_iterator VREV64Q [VREV64Q_S VREV64Q_U]) +(define_int_iterator VCVTQ_FROM_F [VCVTQ_FROM_F_S VCVTQ_FROM_F_U]) +(define_int_iterator VREV16Q [VREV16Q_U VREV16Q_S]) +(define_int_iterator VCVTAQ [VCVTAQ_U VCVTAQ_S]) +(define_int_iterator VMVNQ [VMVNQ_U VMVNQ_S]) +(define_int_iterator VDUPQ_N [VDUPQ_N_U VDUPQ_N_S]) +(define_int_iterator VCLZQ [VCLZQ_U VCLZQ_S]) +(define_int_iterator VADDVQ [VADDVQ_U VADDVQ_S]) +(define_int_iterator VREV32Q [VREV32Q_U VREV32Q_S]) +(define_int_iterator VMOVLBQ [VMOVLBQ_S VMOVLBQ_U]) +(define_int_iterator VMOVLTQ [VMOVLTQ_U VMOVLTQ_S]) +(define_int_iterator VCVTPQ [VCVTPQ_S VCVTPQ_U]) +(define_int_iterator VCVTNQ [VCVTNQ_S VCVTNQ_U]) +(define_int_iterator VCVTMQ [VCVTMQ_S VCVTMQ_U]) +(define_int_iterator VADDLVQ [VADDLVQ_U VADDLVQ_S]) +(define_int_iterator VCTPQ [VCTP8Q VCTP16Q VCTP32Q VCTP64Q]) +(define_int_iterator VCTPQ_M [VCTP8Q_M VCTP16Q_M VCTP32Q_M VCTP64Q_M]) +(define_int_iterator VCVTQ_N_TO_F [VCVTQ_N_TO_F_S VCVTQ_N_TO_F_U]) +(define_int_iterator VCREATEQ [VCREATEQ_U VCREATEQ_S]) +(define_int_iterator VSHRQ_N [VSHRQ_N_S VSHRQ_N_U]) +(define_int_iterator VCVTQ_N_FROM_F [VCVTQ_N_FROM_F_S VCVTQ_N_FROM_F_U]) +(define_int_iterator VADDLVQ_P [VADDLVQ_P_S VADDLVQ_P_U]) +(define_int_iterator VCMPNEQ [VCMPNEQ_U VCMPNEQ_S]) +(define_int_iterator VSHLQ [VSHLQ_S VSHLQ_U]) +(define_int_iterator VABDQ [VABDQ_S VABDQ_U]) +(define_int_iterator VADDQ_N [VADDQ_N_S VADDQ_N_U]) +(define_int_iterator VADDVAQ [VADDVAQ_S VADDVAQ_U]) +(define_int_iterator VADDVQ_P [VADDVQ_P_U VADDVQ_P_S]) +(define_int_iterator VANDQ [VANDQ_U VANDQ_S]) +(define_int_iterator VBICQ [VBICQ_S VBICQ_U]) +(define_int_iterator VBRSRQ_N [VBRSRQ_N_U VBRSRQ_N_S]) +(define_int_iterator VCADDQ_ROT270 [VCADDQ_ROT270_S VCADDQ_ROT270_U]) +(define_int_iterator VCADDQ_ROT90 [VCADDQ_ROT90_U VCADDQ_ROT90_S]) +(define_int_iterator VCMPEQQ [VCMPEQQ_U VCMPEQQ_S]) +(define_int_iterator VCMPEQQ_N [VCMPEQQ_N_S VCMPEQQ_N_U]) +(define_int_iterator VCMPNEQ_N [VCMPNEQ_N_U VCMPNEQ_N_S]) +(define_int_iterator VEORQ [VEORQ_U VEORQ_S]) +(define_int_iterator VHADDQ [VHADDQ_S VHADDQ_U]) +(define_int_iterator VHADDQ_N [VHADDQ_N_U VHADDQ_N_S]) +(define_int_iterator VHSUBQ [VHSUBQ_S VHSUBQ_U]) +(define_int_iterator VHSUBQ_N [VHSUBQ_N_U VHSUBQ_N_S]) +(define_int_iterator VMAXQ [VMAXQ_U VMAXQ_S]) +(define_int_iterator VMAXVQ [VMAXVQ_U VMAXVQ_S]) +(define_int_iterator VMINQ [VMINQ_S VMINQ_U]) +(define_int_iterator VMINVQ [VMINVQ_U VMINVQ_S]) +(define_int_iterator VMLADAVQ [VMLADAVQ_U VMLADAVQ_S]) +(define_int_iterator VMULHQ [VMULHQ_S VMULHQ_U]) +(define_int_iterator VMULLBQ_INT [VMULLBQ_INT_U VMULLBQ_INT_S]) +(define_int_iterator VMULLTQ_INT [VMULLTQ_INT_U VMULLTQ_INT_S]) +(define_int_iterator VMULQ [VMULQ_U VMULQ_S]) +(define_int_iterator VMULQ_N [VMULQ_N_U VMULQ_N_S]) +(define_int_iterator VORNQ [VORNQ_U VORNQ_S]) +(define_int_iterator VORRQ [VORRQ_S VORRQ_U]) +(define_int_iterator VQADDQ [VQADDQ_U VQADDQ_S]) +(define_int_iterator VQADDQ_N [VQADDQ_N_S VQADDQ_N_U]) +(define_int_iterator VQRSHLQ [VQRSHLQ_S VQRSHLQ_U]) +(define_int_iterator VQRSHLQ_N [VQRSHLQ_N_S VQRSHLQ_N_U]) +(define_int_iterator VQSHLQ [VQSHLQ_S VQSHLQ_U]) +(define_int_iterator VQSHLQ_N [VQSHLQ_N_S VQSHLQ_N_U]) +(define_int_iterator VQSHLQ_R [VQSHLQ_R_U VQSHLQ_R_S]) +(define_int_iterator VQSUBQ [VQSUBQ_U VQSUBQ_S]) +(define_int_iterator VQSUBQ_N [VQSUBQ_N_S VQSUBQ_N_U]) +(define_int_iterator VRHADDQ [VRHADDQ_S VRHADDQ_U]) +(define_int_iterator 
VRMULHQ [VRMULHQ_S VRMULHQ_U]) +(define_int_iterator VRSHLQ [VRSHLQ_S VRSHLQ_U]) +(define_int_iterator VRSHLQ_N [VRSHLQ_N_U VRSHLQ_N_S]) +(define_int_iterator VRSHRQ_N [VRSHRQ_N_S VRSHRQ_N_U]) +(define_int_iterator VSHLQ_N [VSHLQ_N_U VSHLQ_N_S]) +(define_int_iterator VSHLQ_R [VSHLQ_R_S VSHLQ_R_U]) +(define_int_iterator VSUBQ [VSUBQ_S VSUBQ_U]) +(define_int_iterator VSUBQ_N [VSUBQ_N_S VSUBQ_N_U]) +(define_int_iterator VADDLVAQ [VADDLVAQ_S VADDLVAQ_U]) +(define_int_iterator VBICQ_N [VBICQ_N_S VBICQ_N_U]) +(define_int_iterator VMLALDAVQ [VMLALDAVQ_U VMLALDAVQ_S]) +(define_int_iterator VMLALDAVXQ [VMLALDAVXQ_U VMLALDAVXQ_S]) +(define_int_iterator VMOVNBQ [VMOVNBQ_U VMOVNBQ_S]) +(define_int_iterator VMOVNTQ [VMOVNTQ_S VMOVNTQ_U]) +(define_int_iterator VORRQ_N [VORRQ_N_U VORRQ_N_S]) +(define_int_iterator VQMOVNBQ [VQMOVNBQ_U VQMOVNBQ_S]) +(define_int_iterator VQMOVNTQ [VQMOVNTQ_U VQMOVNTQ_S]) +(define_int_iterator VSHLLBQ_N [VSHLLBQ_N_S VSHLLBQ_N_U]) +(define_int_iterator VSHLLTQ_N [VSHLLTQ_N_U VSHLLTQ_N_S]) +(define_int_iterator VRMLALDAVHQ [VRMLALDAVHQ_U VRMLALDAVHQ_S]) +(define_int_iterator VBICQ_M_N [VBICQ_M_N_S VBICQ_M_N_U]) +(define_int_iterator VCVTAQ_M [VCVTAQ_M_S VCVTAQ_M_U]) +(define_int_iterator VCVTQ_M_TO_F [VCVTQ_M_TO_F_S VCVTQ_M_TO_F_U]) +(define_int_iterator VQRSHRNBQ_N [VQRSHRNBQ_N_U VQRSHRNBQ_N_S]) +(define_int_iterator VABAVQ [VABAVQ_S VABAVQ_U]) +(define_int_iterator VSHLCQ [VSHLCQ_S VSHLCQ_U]) +(define_int_iterator VRMLALDAVHAQ [VRMLALDAVHAQ_S VRMLALDAVHAQ_U]) +(define_int_iterator VADDVAQ_P [VADDVAQ_P_S VADDVAQ_P_U]) +(define_int_iterator VCLZQ_M [VCLZQ_M_S VCLZQ_M_U]) +(define_int_iterator VCMPEQQ_M_N [VCMPEQQ_M_N_S VCMPEQQ_M_N_U]) +(define_int_iterator VCMPEQQ_M [VCMPEQQ_M_S VCMPEQQ_M_U]) +(define_int_iterator VCMPNEQ_M_N [VCMPNEQ_M_N_S VCMPNEQ_M_N_U]) +(define_int_iterator VCMPNEQ_M [VCMPNEQ_M_S VCMPNEQ_M_U]) +(define_int_iterator VDUPQ_M_N [VDUPQ_M_N_S VDUPQ_M_N_U]) +(define_int_iterator VMAXVQ_P [VMAXVQ_P_S VMAXVQ_P_U]) +(define_int_iterator VMINVQ_P [VMINVQ_P_S VMINVQ_P_U]) +(define_int_iterator VMLADAVAQ [VMLADAVAQ_S VMLADAVAQ_U]) +(define_int_iterator VMLADAVQ_P [VMLADAVQ_P_S VMLADAVQ_P_U]) +(define_int_iterator VMLAQ_N [VMLAQ_N_S VMLAQ_N_U]) +(define_int_iterator VMLASQ_N [VMLASQ_N_S VMLASQ_N_U]) +(define_int_iterator VMVNQ_M [VMVNQ_M_S VMVNQ_M_U]) +(define_int_iterator VPSELQ [VPSELQ_S VPSELQ_U]) +(define_int_iterator VQDMLAHQ_N [VQDMLAHQ_N_S]) +(define_int_iterator VQDMLASHQ_N [VQDMLASHQ_N_S]) +(define_int_iterator VQRDMLAHQ_N [VQRDMLAHQ_N_S]) +(define_int_iterator VQRDMLASHQ_N [VQRDMLASHQ_N_S]) +(define_int_iterator VQRSHLQ_M_N [VQRSHLQ_M_N_S VQRSHLQ_M_N_U]) +(define_int_iterator VQSHLQ_M_R [VQSHLQ_M_R_S VQSHLQ_M_R_U]) +(define_int_iterator VREV64Q_M [VREV64Q_M_S VREV64Q_M_U]) +(define_int_iterator VRSHLQ_M_N [VRSHLQ_M_N_S VRSHLQ_M_N_U]) +(define_int_iterator VSHLQ_M_R [VSHLQ_M_R_S VSHLQ_M_R_U]) +(define_int_iterator VSLIQ_N [VSLIQ_N_S VSLIQ_N_U]) +(define_int_iterator VSRIQ_N [VSRIQ_N_S VSRIQ_N_U]) +(define_int_iterator VMLALDAVQ_P [VMLALDAVQ_P_U VMLALDAVQ_P_S]) +(define_int_iterator VQMOVNBQ_M [VQMOVNBQ_M_S VQMOVNBQ_M_U]) +(define_int_iterator VMOVLTQ_M [VMOVLTQ_M_U VMOVLTQ_M_S]) +(define_int_iterator VMOVNBQ_M [VMOVNBQ_M_U VMOVNBQ_M_S]) +(define_int_iterator VRSHRNTQ_N [VRSHRNTQ_N_U VRSHRNTQ_N_S]) +(define_int_iterator VORRQ_M_N [VORRQ_M_N_S VORRQ_M_N_U]) +(define_int_iterator VREV32Q_M [VREV32Q_M_S VREV32Q_M_U]) +(define_int_iterator VREV16Q_M [VREV16Q_M_S VREV16Q_M_U]) +(define_int_iterator VQRSHRNTQ_N [VQRSHRNTQ_N_U VQRSHRNTQ_N_S]) +(define_int_iterator 
VMOVNTQ_M [VMOVNTQ_M_U VMOVNTQ_M_S]) +(define_int_iterator VMOVLBQ_M [VMOVLBQ_M_U VMOVLBQ_M_S]) +(define_int_iterator VMLALDAVAQ [VMLALDAVAQ_S VMLALDAVAQ_U]) +(define_int_iterator VQSHRNBQ_N [VQSHRNBQ_N_U VQSHRNBQ_N_S]) +(define_int_iterator VSHRNBQ_N [VSHRNBQ_N_U VSHRNBQ_N_S]) +(define_int_iterator VRSHRNBQ_N [VRSHRNBQ_N_S VRSHRNBQ_N_U]) +(define_int_iterator VMLALDAVXQ_P [VMLALDAVXQ_P_U VMLALDAVXQ_P_S]) +(define_int_iterator VQMOVNTQ_M [VQMOVNTQ_M_U VQMOVNTQ_M_S]) +(define_int_iterator VMVNQ_M_N [VMVNQ_M_N_U VMVNQ_M_N_S]) +(define_int_iterator VQSHRNTQ_N [VQSHRNTQ_N_U VQSHRNTQ_N_S]) +(define_int_iterator VMLALDAVAXQ [VMLALDAVAXQ_S VMLALDAVAXQ_U]) +(define_int_iterator VSHRNTQ_N [VSHRNTQ_N_S VSHRNTQ_N_U]) +(define_int_iterator VCVTMQ_M [VCVTMQ_M_S VCVTMQ_M_U]) +(define_int_iterator VCVTNQ_M [VCVTNQ_M_S VCVTNQ_M_U]) +(define_int_iterator VCVTPQ_M [VCVTPQ_M_S VCVTPQ_M_U]) +(define_int_iterator VCVTQ_M_N_FROM_F [VCVTQ_M_N_FROM_F_S VCVTQ_M_N_FROM_F_U]) +(define_int_iterator VCVTQ_M_FROM_F [VCVTQ_M_FROM_F_U VCVTQ_M_FROM_F_S]) +(define_int_iterator VRMLALDAVHQ_P [VRMLALDAVHQ_P_S VRMLALDAVHQ_P_U]) +(define_int_iterator VADDLVAQ_P [VADDLVAQ_P_U VADDLVAQ_P_S]) +(define_int_iterator VABAVQ_P [VABAVQ_P_S VABAVQ_P_U]) +(define_int_iterator VSHLQ_M [VSHLQ_M_S VSHLQ_M_U]) +(define_int_iterator VSRIQ_M_N [VSRIQ_M_N_S VSRIQ_M_N_U]) +(define_int_iterator VSUBQ_M [VSUBQ_M_U VSUBQ_M_S]) +(define_int_iterator VCVTQ_M_N_TO_F [VCVTQ_M_N_TO_F_U VCVTQ_M_N_TO_F_S]) +(define_int_iterator VHSUBQ_M [VHSUBQ_M_S VHSUBQ_M_U]) +(define_int_iterator VSLIQ_M_N [VSLIQ_M_N_U VSLIQ_M_N_S]) +(define_int_iterator VRSHLQ_M [VRSHLQ_M_S VRSHLQ_M_U]) +(define_int_iterator VMINQ_M [VMINQ_M_S VMINQ_M_U]) +(define_int_iterator VMULLBQ_INT_M [VMULLBQ_INT_M_U VMULLBQ_INT_M_S]) +(define_int_iterator VMULHQ_M [VMULHQ_M_S VMULHQ_M_U]) +(define_int_iterator VMULQ_M [VMULQ_M_S VMULQ_M_U]) +(define_int_iterator VHSUBQ_M_N [VHSUBQ_M_N_S VHSUBQ_M_N_U]) +(define_int_iterator VHADDQ_M_N [VHADDQ_M_N_S VHADDQ_M_N_U]) +(define_int_iterator VORRQ_M [VORRQ_M_S VORRQ_M_U]) +(define_int_iterator VRMULHQ_M [VRMULHQ_M_U VRMULHQ_M_S]) +(define_int_iterator VQADDQ_M [VQADDQ_M_U VQADDQ_M_S]) +(define_int_iterator VRSHRQ_M_N [VRSHRQ_M_N_S VRSHRQ_M_N_U]) +(define_int_iterator VQSUBQ_M_N [VQSUBQ_M_N_U VQSUBQ_M_N_S]) +(define_int_iterator VADDQ_M [VADDQ_M_U VADDQ_M_S]) +(define_int_iterator VORNQ_M [VORNQ_M_U VORNQ_M_S]) +(define_int_iterator VRHADDQ_M [VRHADDQ_M_U VRHADDQ_M_S]) +(define_int_iterator VQSHLQ_M [VQSHLQ_M_U VQSHLQ_M_S]) +(define_int_iterator VANDQ_M [VANDQ_M_U VANDQ_M_S]) +(define_int_iterator VBICQ_M [VBICQ_M_U VBICQ_M_S]) +(define_int_iterator VSHLQ_M_N [VSHLQ_M_N_S VSHLQ_M_N_U]) +(define_int_iterator VCADDQ_ROT270_M [VCADDQ_ROT270_M_U VCADDQ_ROT270_M_S]) +(define_int_iterator VQRSHLQ_M [VQRSHLQ_M_U VQRSHLQ_M_S]) +(define_int_iterator VQADDQ_M_N [VQADDQ_M_N_U VQADDQ_M_N_S]) +(define_int_iterator VADDQ_M_N [VADDQ_M_N_S VADDQ_M_N_U]) +(define_int_iterator VMAXQ_M [VMAXQ_M_S VMAXQ_M_U]) +(define_int_iterator VQSUBQ_M [VQSUBQ_M_U VQSUBQ_M_S]) +(define_int_iterator VMLASQ_M_N [VMLASQ_M_N_U VMLASQ_M_N_S]) +(define_int_iterator VMLADAVAQ_P [VMLADAVAQ_P_U VMLADAVAQ_P_S]) +(define_int_iterator VBRSRQ_M_N [VBRSRQ_M_N_U VBRSRQ_M_N_S]) +(define_int_iterator VMULQ_M_N [VMULQ_M_N_U VMULQ_M_N_S]) +(define_int_iterator VCADDQ_ROT90_M [VCADDQ_ROT90_M_U VCADDQ_ROT90_M_S]) +(define_int_iterator VMULLTQ_INT_M [VMULLTQ_INT_M_S VMULLTQ_INT_M_U]) +(define_int_iterator VEORQ_M [VEORQ_M_S VEORQ_M_U]) +(define_int_iterator VSHRQ_M_N [VSHRQ_M_N_S VSHRQ_M_N_U]) 
+(define_int_iterator VSUBQ_M_N [VSUBQ_M_N_S VSUBQ_M_N_U]) +(define_int_iterator VHADDQ_M [VHADDQ_M_S VHADDQ_M_U]) +(define_int_iterator VABDQ_M [VABDQ_M_S VABDQ_M_U]) +(define_int_iterator VMLAQ_M_N [VMLAQ_M_N_S VMLAQ_M_N_U]) +(define_int_iterator VQSHLQ_M_N [VQSHLQ_M_N_S VQSHLQ_M_N_U]) +(define_int_iterator VMLALDAVAQ_P [VMLALDAVAQ_P_U VMLALDAVAQ_P_S]) +(define_int_iterator VMLALDAVAXQ_P [VMLALDAVAXQ_P_S]) +(define_int_iterator VQRSHRNBQ_M_N [VQRSHRNBQ_M_N_U VQRSHRNBQ_M_N_S]) +(define_int_iterator VQRSHRNTQ_M_N [VQRSHRNTQ_M_N_S VQRSHRNTQ_M_N_U]) +(define_int_iterator VQSHRNBQ_M_N [VQSHRNBQ_M_N_U VQSHRNBQ_M_N_S]) +(define_int_iterator VQSHRNTQ_M_N [VQSHRNTQ_M_N_S VQSHRNTQ_M_N_U]) +(define_int_iterator VRSHRNBQ_M_N [VRSHRNBQ_M_N_U VRSHRNBQ_M_N_S]) +(define_int_iterator VRSHRNTQ_M_N [VRSHRNTQ_M_N_U VRSHRNTQ_M_N_S]) +(define_int_iterator VSHLLBQ_M_N [VSHLLBQ_M_N_U VSHLLBQ_M_N_S]) +(define_int_iterator VSHLLTQ_M_N [VSHLLTQ_M_N_U VSHLLTQ_M_N_S]) +(define_int_iterator VSHRNBQ_M_N [VSHRNBQ_M_N_S VSHRNBQ_M_N_U]) +(define_int_iterator VSHRNTQ_M_N [VSHRNTQ_M_N_S VSHRNTQ_M_N_U]) +(define_int_iterator VSTRWSBQ [VSTRWQSB_S VSTRWQSB_U]) +(define_int_iterator VSTRBSOQ [VSTRBQSO_S VSTRBQSO_U]) +(define_int_iterator VSTRBQ [VSTRBQ_S VSTRBQ_U]) +(define_int_iterator VLDRBGOQ [VLDRBQGO_S VLDRBQGO_U]) +(define_int_iterator VLDRBQ [VLDRBQ_S VLDRBQ_U]) +(define_int_iterator VLDRWGBQ [VLDRWQGB_S VLDRWQGB_U]) +(define_int_iterator VLD1Q [VLD1Q_S VLD1Q_U]) +(define_int_iterator VLDRHGOQ [VLDRHQGO_S VLDRHQGO_U]) +(define_int_iterator VLDRHGSOQ [VLDRHQGSO_S VLDRHQGSO_U]) +(define_int_iterator VLDRHQ [VLDRHQ_S VLDRHQ_U]) +(define_int_iterator VLDRWQ [VLDRWQ_S VLDRWQ_U]) +(define_int_iterator VLDRDGBQ [VLDRDQGB_S VLDRDQGB_U]) +(define_int_iterator VLDRDGOQ [VLDRDQGO_S VLDRDQGO_U]) +(define_int_iterator VLDRDGSOQ [VLDRDQGSO_S VLDRDQGSO_U]) +(define_int_iterator VLDRWGOQ [VLDRWQGO_S VLDRWQGO_U]) +(define_int_iterator VLDRWGSOQ [VLDRWQGSO_S VLDRWQGSO_U]) +(define_int_iterator VST1Q [VST1Q_S VST1Q_U]) +(define_int_iterator VSTRHSOQ [VSTRHQSO_S VSTRHQSO_U]) +(define_int_iterator VSTRHSSOQ [VSTRHQSSO_S VSTRHQSSO_U]) +(define_int_iterator VSTRHQ [VSTRHQ_S VSTRHQ_U]) +(define_int_iterator VSTRWQ [VSTRWQ_S VSTRWQ_U]) +(define_int_iterator VSTRDSBQ [VSTRDQSB_S VSTRDQSB_U]) +(define_int_iterator VSTRDSOQ [VSTRDQSO_S VSTRDQSO_U]) +(define_int_iterator VSTRDSSOQ [VSTRDQSSO_S VSTRDQSSO_U]) +(define_int_iterator VSTRWSOQ [VSTRWQSO_S VSTRWQSO_U]) +(define_int_iterator VSTRWSSOQ [VSTRWQSSO_S VSTRWQSSO_U]) +(define_int_iterator VSTRWSBWBQ [VSTRWQSBWB_S VSTRWQSBWB_U]) +(define_int_iterator VLDRWGBWBQ [VLDRWQGBWB_S VLDRWQGBWB_U]) +(define_int_iterator VSTRDSBWBQ [VSTRDQSBWB_S VSTRDQSBWB_U]) +(define_int_iterator VLDRDGBWBQ [VLDRDQGBWB_S VLDRDQGBWB_U]) +(define_int_iterator VADCIQ [VADCIQ_U VADCIQ_S]) +(define_int_iterator VADCIQ_M [VADCIQ_M_U VADCIQ_M_S]) +(define_int_iterator VSBCQ [VSBCQ_U VSBCQ_S]) +(define_int_iterator VSBCQ_M [VSBCQ_M_U VSBCQ_M_S]) +(define_int_iterator VSBCIQ [VSBCIQ_U VSBCIQ_S]) +(define_int_iterator VSBCIQ_M [VSBCIQ_M_U VSBCIQ_M_S]) +(define_int_iterator VADCQ [VADCQ_U VADCQ_S]) +(define_int_iterator VADCQ_M [VADCQ_M_U VADCQ_M_S]) +(define_int_iterator UQRSHLLQ [UQRSHLL_64 UQRSHLL_48]) +(define_int_iterator SQRSHRLQ [SQRSHRL_64 SQRSHRL_48]) +(define_int_iterator VSHLCQ_M [VSHLCQ_M_S VSHLCQ_M_U]) diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index 9758862..0d77601 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -17,654 +17,6 @@ ;; along with GCC; see the file COPYING3. 
If not see ;; <http://www.gnu.org/licenses/>. -(define_mode_iterator MVE_types [V16QI V8HI V4SI V2DI TI V8HF V4SF V2DF]) -(define_mode_iterator MVE_VLD_ST [V16QI V8HI V4SI V8HF V4SF]) -(define_mode_iterator MVE_0 [V8HF V4SF]) -(define_mode_iterator MVE_1 [V16QI V8HI V4SI V2DI]) -(define_mode_iterator MVE_3 [V16QI V8HI]) -(define_mode_iterator MVE_2 [V16QI V8HI V4SI]) -(define_mode_iterator MVE_5 [V8HI V4SI]) -(define_mode_iterator MVE_6 [V8HI V4SI]) - -(define_c_enum "unspec" [VST4Q VRNDXQ_F VRNDQ_F VRNDPQ_F VRNDNQ_F VRNDMQ_F - VRNDAQ_F VREV64Q_F VNEGQ_F VDUPQ_N_F VABSQ_F VREV32Q_F - VCVTTQ_F32_F16 VCVTBQ_F32_F16 VCVTQ_TO_F_S VQNEGQ_S - VCVTQ_TO_F_U VREV16Q_S VREV16Q_U VADDLVQ_S VMVNQ_N_S - VMVNQ_N_U VCVTAQ_S VCVTAQ_U VREV64Q_S VREV64Q_U - VQABSQ_S VNEGQ_S VMVNQ_S VMVNQ_U VDUPQ_N_U VDUPQ_N_S - VCLZQ_U VCLZQ_S VCLSQ_S VADDVQ_S VADDVQ_U VABSQ_S - VREV32Q_U VREV32Q_S VMOVLTQ_U VMOVLTQ_S VMOVLBQ_S - VMOVLBQ_U VCVTQ_FROM_F_S VCVTQ_FROM_F_U VCVTPQ_S - VCVTPQ_U VCVTNQ_S VCVTNQ_U VCVTMQ_S VCVTMQ_U - VADDLVQ_U VCTP8Q VCTP16Q VCTP32Q VCTP64Q VPNOT - VCREATEQ_F VCVTQ_N_TO_F_S VCVTQ_N_TO_F_U VBRSRQ_N_F - VSUBQ_N_F VCREATEQ_U VCREATEQ_S VSHRQ_N_S VSHRQ_N_U - VCVTQ_N_FROM_F_S VCVTQ_N_FROM_F_U VADDLVQ_P_S - VADDLVQ_P_U VCMPNEQ_U VCMPNEQ_S VSHLQ_S VSHLQ_U VABDQ_S - VADDQ_N_S VADDVAQ_S VADDVQ_P_S VANDQ_S VBICQ_S - VBRSRQ_N_S VCADDQ_ROT270_S VCADDQ_ROT90_S VCMPEQQ_S - VCMPEQQ_N_S VCMPNEQ_N_S VEORQ_S VHADDQ_S VHADDQ_N_S - VHSUBQ_S VHSUBQ_N_S VMAXQ_S VMAXVQ_S VMINQ_S VMINVQ_S - VMLADAVQ_S VMULHQ_S VMULLBQ_INT_S VMULLTQ_INT_S VMULQ_S - VMULQ_N_S VORNQ_S VORRQ_S VQADDQ_S VQADDQ_N_S VQRSHLQ_S - VQRSHLQ_N_S VQSHLQ_S VQSHLQ_N_S VQSHLQ_R_S VQSUBQ_S - VQSUBQ_N_S VRHADDQ_S VRMULHQ_S VRSHLQ_S VRSHLQ_N_S - VRSHRQ_N_S VSHLQ_N_S VSHLQ_R_S VSUBQ_S VSUBQ_N_S - VABDQ_U VADDQ_N_U VADDVAQ_U VADDVQ_P_U VANDQ_U VBICQ_U - VBRSRQ_N_U VCADDQ_ROT270_U VCADDQ_ROT90_U VCMPEQQ_U - VCMPEQQ_N_U VCMPNEQ_N_U VEORQ_U VHADDQ_U VHADDQ_N_U - VHSUBQ_U VHSUBQ_N_U VMAXQ_U VMAXVQ_U VMINQ_U VMINVQ_U - VMLADAVQ_U VMULHQ_U VMULLBQ_INT_U VMULLTQ_INT_U VMULQ_U - VMULQ_N_U VORNQ_U VORRQ_U VQADDQ_U VQADDQ_N_U VQRSHLQ_U - VQRSHLQ_N_U VQSHLQ_U VQSHLQ_N_U VQSHLQ_R_U VQSUBQ_U - VQSUBQ_N_U VRHADDQ_U VRMULHQ_U VRSHLQ_U VRSHLQ_N_U - VRSHRQ_N_U VSHLQ_N_U VSHLQ_R_U VSUBQ_U VSUBQ_N_U - VCMPGEQ_N_S VCMPGEQ_S VCMPGTQ_N_S VCMPGTQ_S VCMPLEQ_N_S - VCMPLEQ_S VCMPLTQ_N_S VCMPLTQ_S VHCADDQ_ROT270_S - VHCADDQ_ROT90_S VMAXAQ_S VMAXAVQ_S VMINAQ_S VMINAVQ_S - VMLADAVXQ_S VMLSDAVQ_S VMLSDAVXQ_S VQDMULHQ_N_S - VQDMULHQ_S VQRDMULHQ_N_S VQRDMULHQ_S VQSHLUQ_N_S - VCMPCSQ_N_U VCMPCSQ_U VCMPHIQ_N_U VCMPHIQ_U VABDQ_M_S - VABDQ_M_U VABDQ_F VADDQ_N_F VANDQ_F VBICQ_F - VCADDQ_ROT270_F VCADDQ_ROT90_F VCMPEQQ_F VCMPEQQ_N_F - VCMPGEQ_F VCMPGEQ_N_F VCMPGTQ_F VCMPGTQ_N_F VCMPLEQ_F - VCMPLEQ_N_F VCMPLTQ_F VCMPLTQ_N_F VCMPNEQ_F VCMPNEQ_N_F - VCMULQ_F VCMULQ_ROT180_F VCMULQ_ROT270_F VCMULQ_ROT90_F - VEORQ_F VMAXNMAQ_F VMAXNMAVQ_F VMAXNMQ_F VMAXNMVQ_F - VMINNMAQ_F VMINNMAVQ_F VMINNMQ_F VMINNMVQ_F VMULQ_F - VMULQ_N_F VORNQ_F VORRQ_F VSUBQ_F VADDLVAQ_U - VADDLVAQ_S VBICQ_N_U VBICQ_N_S VCTP8Q_M VCTP16Q_M - VCTP32Q_M VCTP64Q_M VCVTBQ_F16_F32 VCVTTQ_F16_F32 - VMLALDAVQ_U VMLALDAVXQ_U VMLALDAVXQ_S VMLALDAVQ_S - VMLSLDAVQ_S VMLSLDAVXQ_S VMOVNBQ_U VMOVNBQ_S - VMOVNTQ_U VMOVNTQ_S VORRQ_N_S VORRQ_N_U VQDMULLBQ_N_S - VQDMULLBQ_S VQDMULLTQ_N_S VQDMULLTQ_S VQMOVNBQ_U - VQMOVNBQ_S VQMOVUNBQ_S VQMOVUNTQ_S VRMLALDAVHXQ_S - VRMLSLDAVHQ_S VRMLSLDAVHXQ_S VSHLLBQ_S - VSHLLBQ_U VSHLLTQ_U VSHLLTQ_S VQMOVNTQ_U VQMOVNTQ_S - VSHLLBQ_N_S VSHLLBQ_N_U VSHLLTQ_N_U VSHLLTQ_N_S - VRMLALDAVHQ_U VRMLALDAVHQ_S VMULLTQ_POLY_P - 
VMULLBQ_POLY_P VBICQ_M_N_S VBICQ_M_N_U VCMPEQQ_M_F - VCVTAQ_M_S VCVTAQ_M_U VCVTQ_M_TO_F_S VCVTQ_M_TO_F_U - VQRSHRNBQ_N_U VQRSHRNBQ_N_S VQRSHRUNBQ_N_S - VRMLALDAVHAQ_S VABAVQ_S VABAVQ_U VSHLCQ_S VSHLCQ_U - VRMLALDAVHAQ_U VABSQ_M_S VADDVAQ_P_S VADDVAQ_P_U - VCLSQ_M_S VCLZQ_M_S VCLZQ_M_U VCMPCSQ_M_N_U - VCMPCSQ_M_U VCMPEQQ_M_N_S VCMPEQQ_M_N_U VCMPEQQ_M_S - VCMPEQQ_M_U VCMPGEQ_M_N_S VCMPGEQ_M_S VCMPGTQ_M_N_S - VCMPGTQ_M_S VCMPHIQ_M_N_U VCMPHIQ_M_U VCMPLEQ_M_N_S - VCMPLEQ_M_S VCMPLTQ_M_N_S VCMPLTQ_M_S VCMPNEQ_M_N_S - VCMPNEQ_M_N_U VCMPNEQ_M_S VCMPNEQ_M_U VDUPQ_M_N_S - VDUPQ_M_N_U VDWDUPQ_N_U VDWDUPQ_WB_U VIWDUPQ_N_U - VIWDUPQ_WB_U VMAXAQ_M_S VMAXAVQ_P_S VMAXVQ_P_S - VMAXVQ_P_U VMINAQ_M_S VMINAVQ_P_S VMINVQ_P_S VMINVQ_P_U - VMLADAVAQ_S VMLADAVAQ_U VMLADAVQ_P_S VMLADAVQ_P_U - VMLADAVXQ_P_S VMLAQ_N_S VMLAQ_N_U VMLASQ_N_S VMLASQ_N_U - VMLSDAVQ_P_S VMLSDAVXQ_P_S VMVNQ_M_S VMVNQ_M_U - VNEGQ_M_S VPSELQ_S VPSELQ_U VQABSQ_M_S VQDMLAHQ_N_S - VQDMLAHQ_N_U VQNEGQ_M_S VQRDMLADHQ_S VQRDMLADHXQ_S - VQRDMLAHQ_N_S VQRDMLAHQ_N_U VQRDMLASHQ_N_S - VQRDMLASHQ_N_U VQRDMLSDHQ_S VQRDMLSDHXQ_S VQRSHLQ_M_N_S - VQRSHLQ_M_N_U VQSHLQ_M_R_S VQSHLQ_M_R_U VREV64Q_M_S - VREV64Q_M_U VRSHLQ_M_N_S VRSHLQ_M_N_U VSHLQ_M_R_S - VSHLQ_M_R_U VSLIQ_N_S VSLIQ_N_U VSRIQ_N_S VSRIQ_N_U - VQDMLSDHXQ_S VQDMLSDHQ_S VQDMLADHXQ_S VQDMLADHQ_S - VMLSDAVAXQ_S VMLSDAVAQ_S VMLADAVAXQ_S - VCMPGEQ_M_F VCMPGTQ_M_N_F VMLSLDAVQ_P_S VRMLALDAVHAXQ_S - VMLSLDAVXQ_P_S VFMAQ_F VMLSLDAVAQ_S VQSHRUNBQ_N_S - VQRSHRUNTQ_N_S VCMLAQ_F VMINNMAQ_M_F VFMASQ_N_F - VDUPQ_M_N_F VCMPGTQ_M_F VCMPLTQ_M_F VRMLSLDAVHQ_P_S - VQSHRUNTQ_N_S VABSQ_M_F VMAXNMAVQ_P_F VFMAQ_N_F - VRMLSLDAVHXQ_P_S VREV32Q_M_F VRMLSLDAVHAQ_S - VRMLSLDAVHAXQ_S VCMPLTQ_M_N_F VCMPNEQ_M_F VRNDAQ_M_F - VRNDPQ_M_F VADDLVAQ_P_S VQMOVUNBQ_M_S VCMPLEQ_M_F - VCMLAQ_ROT180_F VMLSLDAVAXQ_S VRNDXQ_M_F VFMSQ_F - VMINNMVQ_P_F VMAXNMVQ_P_F VPSELQ_F VCMLAQ_ROT90_F - VQMOVUNTQ_M_S VREV64Q_M_F VNEGQ_M_F VRNDMQ_M_F - VCMPLEQ_M_N_F VCMPGEQ_M_N_F VRNDNQ_M_F VMINNMAVQ_P_F - VCMPNEQ_M_N_F VRMLALDAVHQ_P_S VRMLALDAVHXQ_P_S - VCMPEQQ_M_N_F VCMLAQ_ROT270_F VMAXNMAQ_M_F VRNDQ_M_F - VMLALDAVQ_P_U VMLALDAVQ_P_S VQMOVNBQ_M_S VQMOVNBQ_M_U - VMOVLTQ_M_U VMOVLTQ_M_S VMOVNBQ_M_U VMOVNBQ_M_S - VRSHRNTQ_N_U VRSHRNTQ_N_S VORRQ_M_N_S VORRQ_M_N_U - VREV32Q_M_S VREV32Q_M_U VQRSHRNTQ_N_U VQRSHRNTQ_N_S - VMOVNTQ_M_U VMOVNTQ_M_S VMOVLBQ_M_U VMOVLBQ_M_S - VMLALDAVAQ_S VMLALDAVAQ_U VQSHRNBQ_N_U VQSHRNBQ_N_S - VSHRNBQ_N_U VSHRNBQ_N_S VRSHRNBQ_N_S VRSHRNBQ_N_U - VMLALDAVXQ_P_U VMLALDAVXQ_P_S VQMOVNTQ_M_U VQMOVNTQ_M_S - VMVNQ_M_N_U VMVNQ_M_N_S VQSHRNTQ_N_U VQSHRNTQ_N_S - VMLALDAVAXQ_S VMLALDAVAXQ_U VSHRNTQ_N_S VSHRNTQ_N_U - VCVTBQ_M_F16_F32 VCVTBQ_M_F32_F16 VCVTTQ_M_F16_F32 - VCVTTQ_M_F32_F16 VCVTMQ_M_S VCVTMQ_M_U VCVTNQ_M_S - VCVTPQ_M_S VCVTPQ_M_U VCVTQ_M_N_FROM_F_S VCVTNQ_M_U - VREV16Q_M_S VREV16Q_M_U VREV32Q_M VCVTQ_M_FROM_F_U - VCVTQ_M_FROM_F_S VRMLALDAVHQ_P_U VADDLVAQ_P_U - VCVTQ_M_N_FROM_F_U VQSHLUQ_M_N_S VABAVQ_P_S - VABAVQ_P_U VSHLQ_M_S VSHLQ_M_U VSRIQ_M_N_S - VSRIQ_M_N_U VSUBQ_M_U VSUBQ_M_S VCVTQ_M_N_TO_F_U - VCVTQ_M_N_TO_F_S VQADDQ_M_U VQADDQ_M_S - VRSHRQ_M_N_S VSUBQ_M_N_S VSUBQ_M_N_U VBRSRQ_M_N_S - VSUBQ_M_N_F VBICQ_M_F VHADDQ_M_U VBICQ_M_U VBICQ_M_S - VMULQ_M_N_U VHADDQ_M_S VORNQ_M_F VMLAQ_M_N_S VQSUBQ_M_U - VQSUBQ_M_S VMLAQ_M_N_U VQSUBQ_M_N_U VQSUBQ_M_N_S - VMULLTQ_INT_M_S VMULLTQ_INT_M_U VMULQ_M_N_S VMULQ_M_N_F - VMLASQ_M_N_U VMLASQ_M_N_S VMAXQ_M_U VQRDMLAHQ_M_N_U - VCADDQ_ROT270_M_F VCADDQ_ROT270_M_U VCADDQ_ROT270_M_S - VQRSHLQ_M_S VMULQ_M_F VRHADDQ_M_U VSHRQ_M_N_U - VRHADDQ_M_S VMULQ_M_S VMULQ_M_U VQRDMLASHQ_M_N_S - VRSHLQ_M_S 
VRSHLQ_M_U VRSHRQ_M_N_U VADDQ_M_N_F - VADDQ_M_N_S VADDQ_M_N_U VQRDMLASHQ_M_N_U VMAXQ_M_S - VQRDMLAHQ_M_N_S VORRQ_M_S VORRQ_M_U VORRQ_M_F - VQRSHLQ_M_U VRMULHQ_M_U VRMULHQ_M_S VMINQ_M_S VMINQ_M_U - VANDQ_M_F VANDQ_M_U VANDQ_M_S VHSUBQ_M_N_S VHSUBQ_M_N_U - VMULHQ_M_S VMULHQ_M_U VMULLBQ_INT_M_U - VMULLBQ_INT_M_S VCADDQ_ROT90_M_F - VSHRQ_M_N_S VADDQ_M_U VSLIQ_M_N_U - VQADDQ_M_N_S VBRSRQ_M_N_F VABDQ_M_F VBRSRQ_M_N_U - VEORQ_M_F VSHLQ_M_N_S VQDMLAHQ_M_N_U VQDMLAHQ_M_N_S - VSHLQ_M_N_U VMLADAVAQ_P_U VMLADAVAQ_P_S VSLIQ_M_N_S - VQSHLQ_M_U VQSHLQ_M_S VCADDQ_ROT90_M_U VCADDQ_ROT90_M_S - VORNQ_M_U VORNQ_M_S VQSHLQ_M_N_S VQSHLQ_M_N_U VADDQ_M_S - VHADDQ_M_N_S VADDQ_M_F VQADDQ_M_N_U VEORQ_M_S VEORQ_M_U - VHSUBQ_M_S VHSUBQ_M_U VHADDQ_M_N_U VHCADDQ_ROT90_M_S - VQRDMLSDHQ_M_S VQRDMLSDHXQ_M_S VQRDMLADHXQ_M_S - VQDMULHQ_M_S VMLADAVAXQ_P_S VQDMLADHXQ_M_S - VQRDMULHQ_M_S VMLSDAVAXQ_P_S VQDMULHQ_M_N_S - VHCADDQ_ROT270_M_S VQDMLSDHQ_M_S VQDMLSDHXQ_M_S - VMLSDAVAQ_P_S VQRDMLADHQ_M_S VQDMLADHQ_M_S - VMLALDAVAQ_P_U VMLALDAVAQ_P_S VMLALDAVAXQ_P_U - VQRSHRNBQ_M_N_U VQRSHRNBQ_M_N_S VQRSHRNTQ_M_N_S - VQSHRNBQ_M_N_U VQSHRNBQ_M_N_S VQSHRNTQ_M_N_S - VRSHRNBQ_M_N_U VRSHRNBQ_M_N_S VRSHRNTQ_M_N_U - VSHLLBQ_M_N_U VSHLLBQ_M_N_S VSHLLTQ_M_N_U VSHLLTQ_M_N_S - VSHRNBQ_M_N_S VSHRNBQ_M_N_U VSHRNTQ_M_N_S VSHRNTQ_M_N_U - VMLALDAVAXQ_P_S VQRSHRNTQ_M_N_U VQSHRNTQ_M_N_U - VRSHRNTQ_M_N_S VQRDMULHQ_M_N_S VRMLALDAVHAQ_P_S - VMLSLDAVAQ_P_S VMLSLDAVAXQ_P_S VMULLBQ_POLY_M_P - VMULLTQ_POLY_M_P VQDMULLBQ_M_N_S VQDMULLBQ_M_S - VQDMULLTQ_M_N_S VQDMULLTQ_M_S VQRSHRUNBQ_M_N_S - VQRSHRUNTQ_M_N_SVQSHRUNBQ_M_N_S VQSHRUNTQ_M_N_S - VRMLALDAVHAQ_P_U VRMLALDAVHAXQ_P_S VRMLSLDAVHAQ_P_S - VRMLSLDAVHAXQ_P_S VQRSHRUNTQ_M_N_S VQSHRUNBQ_M_N_S - VCMLAQ_M_F VCMLAQ_ROT180_M_F VCMLAQ_ROT270_M_F - VCMLAQ_ROT90_M_F VCMULQ_M_F VCMULQ_ROT180_M_F - VCMULQ_ROT270_M_F VCMULQ_ROT90_M_F VFMAQ_M_F - VFMAQ_M_N_F VFMASQ_M_N_F VFMSQ_M_F VMAXNMQ_M_F - VMINNMQ_M_F VSUBQ_M_F VSTRWQSB_S VSTRWQSB_U - VSTRBQSO_S VSTRBQSO_U VSTRBQ_S VSTRBQ_U VLDRBQGO_S - VLDRBQGO_U VLDRBQ_S VLDRBQ_U VLDRWQGB_S VLDRWQGB_U - VLD1Q_F VLD1Q_S VLD1Q_U VLDRHQ_F VLDRHQGO_S - VLDRHQGO_U VLDRHQGSO_S VLDRHQGSO_U VLDRHQ_S VLDRHQ_U - VLDRWQ_F VLDRWQ_S VLDRWQ_U VLDRDQGB_S VLDRDQGB_U - VLDRDQGO_S VLDRDQGO_U VLDRDQGSO_S VLDRDQGSO_U - VLDRHQGO_F VLDRHQGSO_F VLDRWQGB_F VLDRWQGO_F - VLDRWQGO_S VLDRWQGO_U VLDRWQGSO_F VLDRWQGSO_S - VLDRWQGSO_U VSTRHQ_F VST1Q_S VST1Q_U VSTRHQSO_S - VSTRHQSO_U VSTRHQSSO_S VSTRHQSSO_U VSTRHQ_S - VSTRHQ_U VSTRWQ_S VSTRWQ_U VSTRWQ_F VST1Q_F VSTRDQSB_S - VSTRDQSB_U VSTRDQSO_S VSTRDQSO_U VSTRDQSSO_S - VSTRDQSSO_U VSTRWQSO_S VSTRWQSO_U VSTRWQSSO_S - VSTRWQSSO_U VSTRHQSO_F VSTRHQSSO_F VSTRWQSB_F - VSTRWQSO_F VSTRWQSSO_F VDDUPQ VDDUPQ_M VDWDUPQ - VDWDUPQ_M VIDUPQ VIDUPQ_M VIWDUPQ VIWDUPQ_M - VSTRWQSBWB_S VSTRWQSBWB_U VLDRWQGBWB_S VLDRWQGBWB_U - VSTRWQSBWB_F VLDRWQGBWB_F VSTRDQSBWB_S VSTRDQSBWB_U - VLDRDQGBWB_S VLDRDQGBWB_U VADCQ_U VADCQ_M_U VADCQ_S - VADCQ_M_S VSBCIQ_U VSBCIQ_S VSBCIQ_M_U VSBCIQ_M_S - VSBCQ_U VSBCQ_S VSBCQ_M_U VSBCQ_M_S VADCIQ_U VADCIQ_M_U - VADCIQ_S VADCIQ_M_S VLD2Q VLD4Q VST2Q SRSHRL SRSHR - URSHR URSHRL SQRSHR UQRSHL UQRSHLL_64 VSHLCQ_M_U - UQRSHLL_48 SQRSHRL_64 SQRSHRL_48 VSHLCQ_M_S]) - -(define_mode_attr MVE_CNVT [(V8HI "V8HF") (V4SI "V4SF") (V8HF "V8HI") - (V4SF "V4SI")]) - -(define_int_attr supf [(VCVTQ_TO_F_S "s") (VCVTQ_TO_F_U "u") (VREV16Q_S "s") - (VREV16Q_U "u") (VMVNQ_N_S "s") (VMVNQ_N_U "u") - (VCVTAQ_U "u") (VCVTAQ_S "s") (VREV64Q_S "s") - (VREV64Q_U "u") (VMVNQ_S "s") (VMVNQ_U "u") - (VDUPQ_N_U "u") (VDUPQ_N_S"s") (VADDVQ_S "s") - (VADDVQ_U "u") (VADDVQ_S "s") 
(VADDVQ_U "u") - (VMOVLTQ_U "u") (VMOVLTQ_S "s") (VMOVLBQ_S "s") - (VMOVLBQ_U "u") (VCVTQ_FROM_F_S "s") (VCVTQ_FROM_F_U "u") - (VCVTPQ_S "s") (VCVTPQ_U "u") (VCVTNQ_S "s") - (VCVTNQ_U "u") (VCVTMQ_S "s") (VCVTMQ_U "u") - (VCLZQ_U "u") (VCLZQ_S "s") (VREV32Q_U "u") - (VREV32Q_S "s") (VADDLVQ_U "u") (VADDLVQ_S "s") - (VCVTQ_N_TO_F_S "s") (VCVTQ_N_TO_F_U "u") - (VCREATEQ_U "u") (VCREATEQ_S "s") (VSHRQ_N_S "s") - (VSHRQ_N_U "u") (VCVTQ_N_FROM_F_S "s") (VSHLQ_U "u") - (VCVTQ_N_FROM_F_U "u") (VADDLVQ_P_S "s") (VSHLQ_S "s") - (VADDLVQ_P_U "u") (VCMPNEQ_U "u") (VCMPNEQ_S "s") - (VABDQ_M_S "s") (VABDQ_M_U "u") (VABDQ_S "s") - (VABDQ_U "u") (VADDQ_N_S "s") (VADDQ_N_U "u") - (VADDVQ_P_S "s") (VADDVQ_P_U "u") (VANDQ_S "s") - (VANDQ_U "u") (VBICQ_S "s") (VBICQ_U "u") - (VBRSRQ_N_S "s") (VBRSRQ_N_U "u") (VCADDQ_ROT270_S "s") - (VCADDQ_ROT270_U "u") (VCADDQ_ROT90_S "s") - (VCMPEQQ_S "s") (VCMPEQQ_U "u") (VCADDQ_ROT90_U "u") - (VCMPEQQ_N_S "s") (VCMPEQQ_N_U "u") (VCMPNEQ_N_S "s") - (VCMPNEQ_N_U "u") (VEORQ_S "s") (VEORQ_U "u") - (VHADDQ_N_S "s") (VHADDQ_N_U "u") (VHADDQ_S "s") - (VHADDQ_U "u") (VHSUBQ_N_S "s") (VHSUBQ_N_U "u") - (VHSUBQ_S "s") (VMAXQ_S "s") (VMAXQ_U "u") (VHSUBQ_U "u") - (VMAXVQ_S "s") (VMAXVQ_U "u") (VMINQ_S "s") (VMINQ_U "u") - (VMINVQ_S "s") (VMINVQ_U "u") (VMLADAVQ_S "s") - (VMLADAVQ_U "u") (VMULHQ_S "s") (VMULHQ_U "u") - (VMULLBQ_INT_S "s") (VMULLBQ_INT_U "u") (VQADDQ_S "s") - (VMULLTQ_INT_S "s") (VMULLTQ_INT_U "u") (VQADDQ_U "u") - (VMULQ_N_S "s") (VMULQ_N_U "u") (VMULQ_S "s") - (VMULQ_U "u") (VORNQ_S "s") (VORNQ_U "u") (VORRQ_S "s") - (VORRQ_U "u") (VQADDQ_N_S "s") (VQADDQ_N_U "u") - (VQRSHLQ_N_S "s") (VQRSHLQ_N_U "u") (VQRSHLQ_S "s") - (VQRSHLQ_U "u") (VQSHLQ_N_S "s") (VQSHLQ_N_U "u") - (VQSHLQ_R_S "s") (VQSHLQ_R_U "u") (VQSHLQ_S "s") - (VQSHLQ_U "u") (VQSUBQ_N_S "s") (VQSUBQ_N_U "u") - (VQSUBQ_S "s") (VQSUBQ_U "u") (VRHADDQ_S "s") - (VRHADDQ_U "u") (VRMULHQ_S "s") (VRMULHQ_U "u") - (VRSHLQ_N_S "s") (VRSHLQ_N_U "u") (VRSHLQ_S "s") - (VRSHLQ_U "u") (VRSHRQ_N_S "s") (VRSHRQ_N_U "u") - (VSHLQ_N_S "s") (VSHLQ_N_U "u") (VSHLQ_R_S "s") - (VSHLQ_R_U "u") (VSUBQ_N_S "s") (VSUBQ_N_U "u") - (VSUBQ_S "s") (VSUBQ_U "u") (VADDVAQ_S "s") - (VADDVAQ_U "u") (VADDLVAQ_S "s") (VADDLVAQ_U "u") - (VBICQ_N_S "s") (VBICQ_N_U "u") (VMLALDAVQ_U "u") - (VMLALDAVQ_S "s") (VMLALDAVXQ_U "u") (VMLALDAVXQ_S "s") - (VMOVNBQ_U "u") (VMOVNBQ_S "s") (VMOVNTQ_U "u") - (VMOVNTQ_S "s") (VORRQ_N_S "s") (VORRQ_N_U "u") - (VQMOVNBQ_U "u") (VQMOVNBQ_S "s") (VQMOVNTQ_S "s") - (VQMOVNTQ_U "u") (VSHLLBQ_N_U "u") (VSHLLBQ_N_S "s") - (VSHLLTQ_N_U "u") (VSHLLTQ_N_S "s") (VRMLALDAVHQ_U "u") - (VRMLALDAVHQ_S "s") (VBICQ_M_N_S "s") (VBICQ_M_N_U "u") - (VCVTAQ_M_S "s") (VCVTAQ_M_U "u") (VCVTQ_M_TO_F_S "s") - (VCVTQ_M_TO_F_U "u") (VQRSHRNBQ_N_S "s") - (VQRSHRNBQ_N_U "u") (VABAVQ_S "s") (VABAVQ_U "u") - (VRMLALDAVHAQ_U "u") (VRMLALDAVHAQ_S "s") (VSHLCQ_S "s") - (VSHLCQ_U "u") (VADDVAQ_P_S "s") (VADDVAQ_P_U "u") - (VCLZQ_M_S "s") (VCLZQ_M_U "u") (VCMPEQQ_M_N_S "s") - (VCMPEQQ_M_N_U "u") (VCMPEQQ_M_S "s") (VCMPEQQ_M_U "u") - (VCMPNEQ_M_N_S "s") (VCMPNEQ_M_N_U "u") (VCMPNEQ_M_S "s") - (VCMPNEQ_M_U "u") (VDUPQ_M_N_S "s") (VDUPQ_M_N_U "u") - (VMAXVQ_P_S "s") (VMAXVQ_P_U "u") (VMINVQ_P_S "s") - (VMINVQ_P_U "u") (VMLADAVAQ_S "s") (VMLADAVAQ_U "u") - (VMLADAVQ_P_S "s") (VMLADAVQ_P_U "u") (VMLAQ_N_S "s") - (VMLAQ_N_U "u") (VMLASQ_N_S "s") (VMLASQ_N_U "u") - (VMVNQ_M_S "s") (VMVNQ_M_U "u") (VPSELQ_S "s") - (VPSELQ_U "u") (VQDMLAHQ_N_S "s") (VQDMLAHQ_N_U "u") - (VQRDMLAHQ_N_S "s") (VQRDMLAHQ_N_U "u") - (VQRDMLASHQ_N_S "s") (VQRDMLASHQ_N_U "u") 
- (VQRSHLQ_M_N_S "s") (VQRSHLQ_M_N_U "u") - (VQSHLQ_M_R_S "s") (VQSHLQ_M_R_U "u") (VSRIQ_N_S "s") - (VREV64Q_M_S "s") (VREV64Q_M_U "u") (VSRIQ_N_U "u") - (VRSHLQ_M_N_S "s") (VRSHLQ_M_N_U "u") (VSHLQ_M_R_S "s") - (VSHLQ_M_R_U "u") (VSLIQ_N_S "s") (VSLIQ_N_U "u") - (VMLALDAVQ_P_S "s") (VQMOVNBQ_M_S "s") (VMOVLTQ_M_S "s") - (VMOVNBQ_M_S "s") (VRSHRNTQ_N_S "s") (VORRQ_M_N_S "s") - (VREV32Q_M_S "s") (VQRSHRNTQ_N_S "s") (VMOVNTQ_M_S "s") - (VMOVLBQ_M_S "s") (VMLALDAVAQ_S "s") (VQSHRNBQ_N_S "s") - (VSHRNBQ_N_S "s") (VRSHRNBQ_N_S "s") (VMLALDAVXQ_P_S "s") - (VQMOVNTQ_M_S "s") (VMVNQ_M_N_S "s") (VQSHRNTQ_N_S "s") - (VMLALDAVAXQ_S "s") (VSHRNTQ_N_S "s") (VMLALDAVQ_P_U "u") - (VQMOVNBQ_M_U "u") (VMOVLTQ_M_U "u") (VMOVNBQ_M_U "u") - (VRSHRNTQ_N_U "u") (VORRQ_M_N_U "u") (VREV32Q_M_U "u") - (VREV16Q_M_S "s") (VREV16Q_M_U "u") - (VQRSHRNTQ_N_U "u") (VMOVNTQ_M_U "u") (VMOVLBQ_M_U "u") - (VMLALDAVAQ_U "u") (VQSHRNBQ_N_U "u") (VSHRNBQ_N_U "u") - (VRSHRNBQ_N_U "u") (VMLALDAVXQ_P_U "u") - (VMVNQ_M_N_U "u") (VQSHRNTQ_N_U "u") (VMLALDAVAXQ_U "u") - (VQMOVNTQ_M_U "u") (VSHRNTQ_N_U "u") (VCVTMQ_M_S "s") - (VCVTMQ_M_U "u") (VCVTNQ_M_S "s") (VCVTNQ_M_U "u") - (VCVTPQ_M_S "s") (VCVTPQ_M_U "u") (VADDLVAQ_P_S "s") - (VCVTQ_M_N_FROM_F_U "u") (VCVTQ_M_FROM_F_S "s") - (VCVTQ_M_FROM_F_U "u") (VRMLALDAVHQ_P_U "u") - (VRMLALDAVHQ_P_S "s") (VADDLVAQ_P_U "u") - (VCVTQ_M_N_FROM_F_S "s") (VABAVQ_P_U "u") - (VABAVQ_P_S "s") (VSHLQ_M_S "s") (VSHLQ_M_U "u") - (VSRIQ_M_N_S "s") (VSRIQ_M_N_U "u") (VSUBQ_M_S "s") - (VSUBQ_M_U "u") (VCVTQ_M_N_TO_F_S "s") - (VCVTQ_M_N_TO_F_U "u") (VADDQ_M_N_U "u") - (VSHLQ_M_N_S "s") (VMAXQ_M_U "u") (VHSUBQ_M_N_U "u") - (VMULQ_M_N_S "s") (VQSHLQ_M_U "u") (VRHADDQ_M_S "s") - (VEORQ_M_U "u") (VSHRQ_M_N_U "u") (VCADDQ_ROT90_M_U "u") - (VMLADAVAQ_P_U "u") (VEORQ_M_S "s") (VBRSRQ_M_N_S "s") - (VMULQ_M_U "u") (VQRDMLAHQ_M_N_S "s") (VHSUBQ_M_N_S "s") - (VQRSHLQ_M_S "s") (VMULQ_M_N_U "u") - (VMULQ_M_S "s") (VQSHLQ_M_N_U "u") (VSLIQ_M_N_U "u") - (VMLADAVAQ_P_S "s") (VQRSHLQ_M_U "u") - (VMULLBQ_INT_M_U "u") (VSHLQ_M_N_U "u") (VQSUBQ_M_U "u") - (VQRDMLASHQ_M_N_U "u") (VRSHRQ_M_N_S "s") - (VORNQ_M_S "s") (VCADDQ_ROT270_M_S "s") (VRHADDQ_M_U "u") - (VRSHRQ_M_N_U "u") (VMLASQ_M_N_U "u") (VHSUBQ_M_U "u") - (VQSUBQ_M_N_S "s") (VMULLTQ_INT_M_S "s") - (VORRQ_M_S "s") (VQDMLAHQ_M_N_U "u") (VRSHLQ_M_S "s") - (VHADDQ_M_U "u") (VHADDQ_M_N_S "s") (VMULLTQ_INT_M_U "u") - (VORRQ_M_U "u") (VHADDQ_M_S "s") (VHADDQ_M_N_U "u") - (VQDMLAHQ_M_N_S "s") (VMAXQ_M_S "s") (VORNQ_M_U "u") - (VCADDQ_ROT270_M_U "u") (VQADDQ_M_U "u") - (VQRDMLASHQ_M_N_S "s") (VBICQ_M_U "u") (VMINQ_M_U "u") - (VSUBQ_M_N_S "s") (VMULLBQ_INT_M_S "s") (VQSUBQ_M_S "s") - (VCADDQ_ROT90_M_S "s") (VRMULHQ_M_S "s") (VANDQ_M_U "u") - (VMULHQ_M_S "s") (VADDQ_M_S "s") (VQRDMLAHQ_M_N_U "u") - (VMLASQ_M_N_S "s") (VHSUBQ_M_S "s") (VRMULHQ_M_U "u") - (VQADDQ_M_N_S "s") (VSHRQ_M_N_S "s") (VANDQ_M_S "s") - (VABDQ_M_U "u") (VQSHLQ_M_S "s") (VABDQ_M_S "s") - (VSUBQ_M_N_U "u") (VMLAQ_M_N_S "s") (VBRSRQ_M_N_U "u") - (VADDQ_M_U "u") (VRSHLQ_M_U "u") (VSLIQ_M_N_S "s") - (VQADDQ_M_N_U "u") (VADDQ_M_N_S "s") (VQSUBQ_M_N_U "u") - (VMLAQ_M_N_U "u") (VMINQ_M_S "s") (VMULHQ_M_U "u") - (VQADDQ_M_S "s") (VBICQ_M_S "s") (VQSHLQ_M_N_S "s") - (VQSHRNTQ_M_N_S "s") (VQSHRNTQ_M_N_U "u") - (VSHRNTQ_M_N_U "u") (VSHRNTQ_M_N_S "s") - (VSHRNBQ_M_N_S "s") (VSHRNBQ_M_N_U "u") - (VSHLLTQ_M_N_S "s") (VSHLLTQ_M_N_U "u") - (VSHLLBQ_M_N_S "s") (VSHLLBQ_M_N_U "u") - (VRSHRNTQ_M_N_S "s") (VRSHRNTQ_M_N_U "u") - (VRSHRNBQ_M_N_U "u") (VRSHRNBQ_M_N_S "s") - (VQSHRNTQ_M_N_U "u") (VQSHRNTQ_M_N_S "s") - 
(VQSHRNBQ_M_N_S "s") (VQSHRNBQ_M_N_U "u") - (VQRSHRNTQ_M_N_S "s") (VQRSHRNTQ_M_N_U "u") - (VQRSHRNBQ_M_N_S "s") (VQRSHRNBQ_M_N_U "u") - (VMLALDAVAXQ_P_S "s") (VMLALDAVAXQ_P_U "u") - (VMLALDAVAQ_P_S "s") (VMLALDAVAQ_P_U "u") - (VSTRWQSB_S "s") (VSTRWQSB_U "u") (VSTRBQSO_S "s") - (VSTRBQSO_U "u") (VSTRBQ_S "s") (VSTRBQ_U "u") - (VLDRBQGO_S "s") (VLDRBQGO_U "u") (VLDRBQ_S "s") - (VLDRBQ_U "u") (VLDRWQGB_S "s") (VLDRWQGB_U "u") - (VLD1Q_S "s") (VLD1Q_U "u") (VLDRHQGO_S "s") - (VLDRHQGO_U "u") (VLDRHQGSO_S "s") (VLDRHQGSO_U "u") - (VLDRHQ_S "s") (VLDRHQ_U "u") (VLDRWQ_S "s") - (VLDRWQ_U "u") (VLDRDQGB_S "s") (VLDRDQGB_U "u") - (VLDRDQGO_S "s") (VLDRDQGO_U "u") (VLDRDQGSO_S "s") - (VLDRDQGSO_U "u") (VLDRWQGO_S "s") (VLDRWQGO_U "u") - (VLDRWQGSO_S "s") (VLDRWQGSO_U "u") (VST1Q_S "s") - (VST1Q_U "u") (VSTRHQSO_S "s") (VSTRHQSO_U "u") - (VSTRHQSSO_S "s") (VSTRHQSSO_U "u") (VSTRHQ_S "s") - (VSTRHQ_U "u") (VSTRWQ_S "s") (VSTRWQ_U "u") - (VSTRDQSB_S "s") (VSTRDQSB_U "u") (VSTRDQSO_S "s") - (VSTRDQSO_U "u") (VSTRDQSSO_S "s") (VSTRDQSSO_U "u") - (VSTRWQSO_U "u") (VSTRWQSO_S "s") (VSTRWQSSO_U "u") - (VSTRWQSSO_S "s") (VSTRWQSBWB_S "s") (VSTRWQSBWB_U "u") - (VLDRWQGBWB_S "s") (VLDRWQGBWB_U "u") (VLDRDQGBWB_S "s") - (VLDRDQGBWB_U "u") (VSTRDQSBWB_S "s") (VADCQ_M_S "s") - (VSTRDQSBWB_U "u") (VSBCQ_U "u") (VSBCQ_M_U "u") - (VSBCQ_S "s") (VSBCQ_M_S "s") (VSBCIQ_U "u") - (VSBCIQ_M_U "u") (VSBCIQ_S "s") (VSBCIQ_M_S "s") - (VADCQ_U "u") (VADCQ_M_U "u") (VADCQ_S "s") - (VADCIQ_U "u") (VADCIQ_M_U "u") (VADCIQ_S "s") - (VADCIQ_M_S "s") (SQRSHRL_64 "64") (SQRSHRL_48 "48") - (UQRSHLL_64 "64") (UQRSHLL_48 "48") (VSHLCQ_M_S "s") - (VSHLCQ_M_U "u")]) - -(define_int_attr mode1 [(VCTP8Q "8") (VCTP16Q "16") (VCTP32Q "32") - (VCTP64Q "64") (VCTP8Q_M "8") (VCTP16Q_M "16") - (VCTP32Q_M "32") (VCTP64Q_M "64")]) -(define_mode_attr MVE_pred2 [(V16QI "mve_imm_8") (V8HI "mve_imm_16") - (V4SI "mve_imm_32") - (V8HF "mve_imm_16") (V4SF "mve_imm_32")]) -(define_mode_attr MVE_constraint2 [(V16QI "Rb") (V8HI "Rd") (V4SI "Rf") - (V8HF "Rd") (V4SF "Rf")]) -(define_mode_attr MVE_LANES [(V16QI "16") (V8HI "8") (V4SI "4")]) -(define_mode_attr MVE_constraint [ (V16QI "Ra") (V8HI "Rc") (V4SI "Re")]) -(define_mode_attr MVE_pred [ (V16QI "mve_imm_7") (V8HI "mve_imm_15") - (V4SI "mve_imm_31")]) -(define_mode_attr MVE_constraint3 [ (V8HI "Rb") (V4SI "Rd")]) -(define_mode_attr MVE_pred3 [ (V8HI "mve_imm_8") (V4SI "mve_imm_16")]) -(define_mode_attr MVE_constraint1 [ (V8HI "Ra") (V4SI "Rc")]) -(define_mode_attr MVE_pred1 [ (V8HI "mve_imm_7") (V4SI "mve_imm_15")]) -(define_mode_attr MVE_B_ELEM [ (V16QI "V16QI") (V8HI "V8QI") (V4SI "V4QI")]) -(define_mode_attr MVE_H_ELEM [ (V8HI "V8HI") (V4SI "V4HI")]) -(define_mode_attr V_sz_elem1 [(V16QI "b") (V8HI "h") (V4SI "w") (V8HF "h") - (V4SF "w")]) -(define_mode_attr V_extr_elem [(V16QI "u8") (V8HI "u16") (V4SI "32") - (V8HF "u16") (V4SF "32")]) - -(define_mode_attr earlyclobber_32 [(V16QI "=w") (V8HI "=w") (V4SI "=&w") - (V8HF "=w") (V4SF "=&w")]) - -(define_int_iterator VCVTQ_TO_F [VCVTQ_TO_F_S VCVTQ_TO_F_U]) -(define_int_iterator VMVNQ_N [VMVNQ_N_U VMVNQ_N_S]) -(define_int_iterator VREV64Q [VREV64Q_S VREV64Q_U]) -(define_int_iterator VCVTQ_FROM_F [VCVTQ_FROM_F_S VCVTQ_FROM_F_U]) -(define_int_iterator VREV16Q [VREV16Q_U VREV16Q_S]) -(define_int_iterator VCVTAQ [VCVTAQ_U VCVTAQ_S]) -(define_int_iterator VMVNQ [VMVNQ_U VMVNQ_S]) -(define_int_iterator VDUPQ_N [VDUPQ_N_U VDUPQ_N_S]) -(define_int_iterator VCLZQ [VCLZQ_U VCLZQ_S]) -(define_int_iterator VADDVQ [VADDVQ_U VADDVQ_S]) -(define_int_iterator VREV32Q 
[VREV32Q_U VREV32Q_S]) -(define_int_iterator VMOVLBQ [VMOVLBQ_S VMOVLBQ_U]) -(define_int_iterator VMOVLTQ [VMOVLTQ_U VMOVLTQ_S]) -(define_int_iterator VCVTPQ [VCVTPQ_S VCVTPQ_U]) -(define_int_iterator VCVTNQ [VCVTNQ_S VCVTNQ_U]) -(define_int_iterator VCVTMQ [VCVTMQ_S VCVTMQ_U]) -(define_int_iterator VADDLVQ [VADDLVQ_U VADDLVQ_S]) -(define_int_iterator VCTPQ [VCTP8Q VCTP16Q VCTP32Q VCTP64Q]) -(define_int_iterator VCTPQ_M [VCTP8Q_M VCTP16Q_M VCTP32Q_M VCTP64Q_M]) -(define_int_iterator VCVTQ_N_TO_F [VCVTQ_N_TO_F_S VCVTQ_N_TO_F_U]) -(define_int_iterator VCREATEQ [VCREATEQ_U VCREATEQ_S]) -(define_int_iterator VSHRQ_N [VSHRQ_N_S VSHRQ_N_U]) -(define_int_iterator VCVTQ_N_FROM_F [VCVTQ_N_FROM_F_S VCVTQ_N_FROM_F_U]) -(define_int_iterator VADDLVQ_P [VADDLVQ_P_S VADDLVQ_P_U]) -(define_int_iterator VCMPNEQ [VCMPNEQ_U VCMPNEQ_S]) -(define_int_iterator VSHLQ [VSHLQ_S VSHLQ_U]) -(define_int_iterator VABDQ [VABDQ_S VABDQ_U]) -(define_int_iterator VADDQ_N [VADDQ_N_S VADDQ_N_U]) -(define_int_iterator VADDVAQ [VADDVAQ_S VADDVAQ_U]) -(define_int_iterator VADDVQ_P [VADDVQ_P_U VADDVQ_P_S]) -(define_int_iterator VANDQ [VANDQ_U VANDQ_S]) -(define_int_iterator VBICQ [VBICQ_S VBICQ_U]) -(define_int_iterator VBRSRQ_N [VBRSRQ_N_U VBRSRQ_N_S]) -(define_int_iterator VCADDQ_ROT270 [VCADDQ_ROT270_S VCADDQ_ROT270_U]) -(define_int_iterator VCADDQ_ROT90 [VCADDQ_ROT90_U VCADDQ_ROT90_S]) -(define_int_iterator VCMPEQQ [VCMPEQQ_U VCMPEQQ_S]) -(define_int_iterator VCMPEQQ_N [VCMPEQQ_N_S VCMPEQQ_N_U]) -(define_int_iterator VCMPNEQ_N [VCMPNEQ_N_U VCMPNEQ_N_S]) -(define_int_iterator VEORQ [VEORQ_U VEORQ_S]) -(define_int_iterator VHADDQ [VHADDQ_S VHADDQ_U]) -(define_int_iterator VHADDQ_N [VHADDQ_N_U VHADDQ_N_S]) -(define_int_iterator VHSUBQ [VHSUBQ_S VHSUBQ_U]) -(define_int_iterator VHSUBQ_N [VHSUBQ_N_U VHSUBQ_N_S]) -(define_int_iterator VMAXQ [VMAXQ_U VMAXQ_S]) -(define_int_iterator VMAXVQ [VMAXVQ_U VMAXVQ_S]) -(define_int_iterator VMINQ [VMINQ_S VMINQ_U]) -(define_int_iterator VMINVQ [VMINVQ_U VMINVQ_S]) -(define_int_iterator VMLADAVQ [VMLADAVQ_U VMLADAVQ_S]) -(define_int_iterator VMULHQ [VMULHQ_S VMULHQ_U]) -(define_int_iterator VMULLBQ_INT [VMULLBQ_INT_U VMULLBQ_INT_S]) -(define_int_iterator VMULLTQ_INT [VMULLTQ_INT_U VMULLTQ_INT_S]) -(define_int_iterator VMULQ [VMULQ_U VMULQ_S]) -(define_int_iterator VMULQ_N [VMULQ_N_U VMULQ_N_S]) -(define_int_iterator VORNQ [VORNQ_U VORNQ_S]) -(define_int_iterator VORRQ [VORRQ_S VORRQ_U]) -(define_int_iterator VQADDQ [VQADDQ_U VQADDQ_S]) -(define_int_iterator VQADDQ_N [VQADDQ_N_S VQADDQ_N_U]) -(define_int_iterator VQRSHLQ [VQRSHLQ_S VQRSHLQ_U]) -(define_int_iterator VQRSHLQ_N [VQRSHLQ_N_S VQRSHLQ_N_U]) -(define_int_iterator VQSHLQ [VQSHLQ_S VQSHLQ_U]) -(define_int_iterator VQSHLQ_N [VQSHLQ_N_S VQSHLQ_N_U]) -(define_int_iterator VQSHLQ_R [VQSHLQ_R_U VQSHLQ_R_S]) -(define_int_iterator VQSUBQ [VQSUBQ_U VQSUBQ_S]) -(define_int_iterator VQSUBQ_N [VQSUBQ_N_S VQSUBQ_N_U]) -(define_int_iterator VRHADDQ [VRHADDQ_S VRHADDQ_U]) -(define_int_iterator VRMULHQ [VRMULHQ_S VRMULHQ_U]) -(define_int_iterator VRSHLQ [VRSHLQ_S VRSHLQ_U]) -(define_int_iterator VRSHLQ_N [VRSHLQ_N_U VRSHLQ_N_S]) -(define_int_iterator VRSHRQ_N [VRSHRQ_N_S VRSHRQ_N_U]) -(define_int_iterator VSHLQ_N [VSHLQ_N_U VSHLQ_N_S]) -(define_int_iterator VSHLQ_R [VSHLQ_R_S VSHLQ_R_U]) -(define_int_iterator VSUBQ [VSUBQ_S VSUBQ_U]) -(define_int_iterator VSUBQ_N [VSUBQ_N_S VSUBQ_N_U]) -(define_int_iterator VADDLVAQ [VADDLVAQ_S VADDLVAQ_U]) -(define_int_iterator VBICQ_N [VBICQ_N_S VBICQ_N_U]) -(define_int_iterator VMLALDAVQ [VMLALDAVQ_U 
VMLALDAVQ_S]) -(define_int_iterator VMLALDAVXQ [VMLALDAVXQ_U VMLALDAVXQ_S]) -(define_int_iterator VMOVNBQ [VMOVNBQ_U VMOVNBQ_S]) -(define_int_iterator VMOVNTQ [VMOVNTQ_S VMOVNTQ_U]) -(define_int_iterator VORRQ_N [VORRQ_N_U VORRQ_N_S]) -(define_int_iterator VQMOVNBQ [VQMOVNBQ_U VQMOVNBQ_S]) -(define_int_iterator VQMOVNTQ [VQMOVNTQ_U VQMOVNTQ_S]) -(define_int_iterator VSHLLBQ_N [VSHLLBQ_N_S VSHLLBQ_N_U]) -(define_int_iterator VSHLLTQ_N [VSHLLTQ_N_U VSHLLTQ_N_S]) -(define_int_iterator VRMLALDAVHQ [VRMLALDAVHQ_U VRMLALDAVHQ_S]) -(define_int_iterator VBICQ_M_N [VBICQ_M_N_S VBICQ_M_N_U]) -(define_int_iterator VCVTAQ_M [VCVTAQ_M_S VCVTAQ_M_U]) -(define_int_iterator VCVTQ_M_TO_F [VCVTQ_M_TO_F_S VCVTQ_M_TO_F_U]) -(define_int_iterator VQRSHRNBQ_N [VQRSHRNBQ_N_U VQRSHRNBQ_N_S]) -(define_int_iterator VABAVQ [VABAVQ_S VABAVQ_U]) -(define_int_iterator VSHLCQ [VSHLCQ_S VSHLCQ_U]) -(define_int_iterator VRMLALDAVHAQ [VRMLALDAVHAQ_S VRMLALDAVHAQ_U]) -(define_int_iterator VADDVAQ_P [VADDVAQ_P_S VADDVAQ_P_U]) -(define_int_iterator VCLZQ_M [VCLZQ_M_S VCLZQ_M_U]) -(define_int_iterator VCMPEQQ_M_N [VCMPEQQ_M_N_S VCMPEQQ_M_N_U]) -(define_int_iterator VCMPEQQ_M [VCMPEQQ_M_S VCMPEQQ_M_U]) -(define_int_iterator VCMPNEQ_M_N [VCMPNEQ_M_N_S VCMPNEQ_M_N_U]) -(define_int_iterator VCMPNEQ_M [VCMPNEQ_M_S VCMPNEQ_M_U]) -(define_int_iterator VDUPQ_M_N [VDUPQ_M_N_S VDUPQ_M_N_U]) -(define_int_iterator VMAXVQ_P [VMAXVQ_P_S VMAXVQ_P_U]) -(define_int_iterator VMINVQ_P [VMINVQ_P_S VMINVQ_P_U]) -(define_int_iterator VMLADAVAQ [VMLADAVAQ_S VMLADAVAQ_U]) -(define_int_iterator VMLADAVQ_P [VMLADAVQ_P_S VMLADAVQ_P_U]) -(define_int_iterator VMLAQ_N [VMLAQ_N_S VMLAQ_N_U]) -(define_int_iterator VMLASQ_N [VMLASQ_N_S VMLASQ_N_U]) -(define_int_iterator VMVNQ_M [VMVNQ_M_S VMVNQ_M_U]) -(define_int_iterator VPSELQ [VPSELQ_S VPSELQ_U]) -(define_int_iterator VQDMLAHQ_N [VQDMLAHQ_N_S VQDMLAHQ_N_U]) -(define_int_iterator VQRDMLAHQ_N [VQRDMLAHQ_N_S VQRDMLAHQ_N_U]) -(define_int_iterator VQRDMLASHQ_N [VQRDMLASHQ_N_S VQRDMLASHQ_N_U]) -(define_int_iterator VQRSHLQ_M_N [VQRSHLQ_M_N_S VQRSHLQ_M_N_U]) -(define_int_iterator VQSHLQ_M_R [VQSHLQ_M_R_S VQSHLQ_M_R_U]) -(define_int_iterator VREV64Q_M [VREV64Q_M_S VREV64Q_M_U]) -(define_int_iterator VRSHLQ_M_N [VRSHLQ_M_N_S VRSHLQ_M_N_U]) -(define_int_iterator VSHLQ_M_R [VSHLQ_M_R_S VSHLQ_M_R_U]) -(define_int_iterator VSLIQ_N [VSLIQ_N_S VSLIQ_N_U]) -(define_int_iterator VSRIQ_N [VSRIQ_N_S VSRIQ_N_U]) -(define_int_iterator VMLALDAVQ_P [VMLALDAVQ_P_U VMLALDAVQ_P_S]) -(define_int_iterator VQMOVNBQ_M [VQMOVNBQ_M_S VQMOVNBQ_M_U]) -(define_int_iterator VMOVLTQ_M [VMOVLTQ_M_U VMOVLTQ_M_S]) -(define_int_iterator VMOVNBQ_M [VMOVNBQ_M_U VMOVNBQ_M_S]) -(define_int_iterator VRSHRNTQ_N [VRSHRNTQ_N_U VRSHRNTQ_N_S]) -(define_int_iterator VORRQ_M_N [VORRQ_M_N_S VORRQ_M_N_U]) -(define_int_iterator VREV32Q_M [VREV32Q_M_S VREV32Q_M_U]) -(define_int_iterator VREV16Q_M [VREV16Q_M_S VREV16Q_M_U]) -(define_int_iterator VQRSHRNTQ_N [VQRSHRNTQ_N_U VQRSHRNTQ_N_S]) -(define_int_iterator VMOVNTQ_M [VMOVNTQ_M_U VMOVNTQ_M_S]) -(define_int_iterator VMOVLBQ_M [VMOVLBQ_M_U VMOVLBQ_M_S]) -(define_int_iterator VMLALDAVAQ [VMLALDAVAQ_S VMLALDAVAQ_U]) -(define_int_iterator VQSHRNBQ_N [VQSHRNBQ_N_U VQSHRNBQ_N_S]) -(define_int_iterator VSHRNBQ_N [VSHRNBQ_N_U VSHRNBQ_N_S]) -(define_int_iterator VRSHRNBQ_N [VRSHRNBQ_N_S VRSHRNBQ_N_U]) -(define_int_iterator VMLALDAVXQ_P [VMLALDAVXQ_P_U VMLALDAVXQ_P_S]) -(define_int_iterator VQMOVNTQ_M [VQMOVNTQ_M_U VQMOVNTQ_M_S]) -(define_int_iterator VMVNQ_M_N [VMVNQ_M_N_U VMVNQ_M_N_S]) -(define_int_iterator VQSHRNTQ_N 
[VQSHRNTQ_N_U VQSHRNTQ_N_S]) -(define_int_iterator VMLALDAVAXQ [VMLALDAVAXQ_S VMLALDAVAXQ_U]) -(define_int_iterator VSHRNTQ_N [VSHRNTQ_N_S VSHRNTQ_N_U]) -(define_int_iterator VCVTMQ_M [VCVTMQ_M_S VCVTMQ_M_U]) -(define_int_iterator VCVTNQ_M [VCVTNQ_M_S VCVTNQ_M_U]) -(define_int_iterator VCVTPQ_M [VCVTPQ_M_S VCVTPQ_M_U]) -(define_int_iterator VCVTQ_M_N_FROM_F [VCVTQ_M_N_FROM_F_S VCVTQ_M_N_FROM_F_U]) -(define_int_iterator VCVTQ_M_FROM_F [VCVTQ_M_FROM_F_U VCVTQ_M_FROM_F_S]) -(define_int_iterator VRMLALDAVHQ_P [VRMLALDAVHQ_P_S VRMLALDAVHQ_P_U]) -(define_int_iterator VADDLVAQ_P [VADDLVAQ_P_U VADDLVAQ_P_S]) -(define_int_iterator VABAVQ_P [VABAVQ_P_S VABAVQ_P_U]) -(define_int_iterator VSHLQ_M [VSHLQ_M_S VSHLQ_M_U]) -(define_int_iterator VSRIQ_M_N [VSRIQ_M_N_S VSRIQ_M_N_U]) -(define_int_iterator VSUBQ_M [VSUBQ_M_U VSUBQ_M_S]) -(define_int_iterator VCVTQ_M_N_TO_F [VCVTQ_M_N_TO_F_U VCVTQ_M_N_TO_F_S]) -(define_int_iterator VHSUBQ_M [VHSUBQ_M_S VHSUBQ_M_U]) -(define_int_iterator VSLIQ_M_N [VSLIQ_M_N_U VSLIQ_M_N_S]) -(define_int_iterator VRSHLQ_M [VRSHLQ_M_S VRSHLQ_M_U]) -(define_int_iterator VMINQ_M [VMINQ_M_S VMINQ_M_U]) -(define_int_iterator VMULLBQ_INT_M [VMULLBQ_INT_M_U VMULLBQ_INT_M_S]) -(define_int_iterator VMULHQ_M [VMULHQ_M_S VMULHQ_M_U]) -(define_int_iterator VMULQ_M [VMULQ_M_S VMULQ_M_U]) -(define_int_iterator VHSUBQ_M_N [VHSUBQ_M_N_S VHSUBQ_M_N_U]) -(define_int_iterator VHADDQ_M_N [VHADDQ_M_N_S VHADDQ_M_N_U]) -(define_int_iterator VORRQ_M [VORRQ_M_S VORRQ_M_U]) -(define_int_iterator VRMULHQ_M [VRMULHQ_M_U VRMULHQ_M_S]) -(define_int_iterator VQADDQ_M [VQADDQ_M_U VQADDQ_M_S]) -(define_int_iterator VRSHRQ_M_N [VRSHRQ_M_N_S VRSHRQ_M_N_U]) -(define_int_iterator VQSUBQ_M_N [VQSUBQ_M_N_U VQSUBQ_M_N_S]) -(define_int_iterator VADDQ_M [VADDQ_M_U VADDQ_M_S]) -(define_int_iterator VORNQ_M [VORNQ_M_U VORNQ_M_S]) -(define_int_iterator VRHADDQ_M [VRHADDQ_M_U VRHADDQ_M_S]) -(define_int_iterator VQSHLQ_M [VQSHLQ_M_U VQSHLQ_M_S]) -(define_int_iterator VANDQ_M [VANDQ_M_U VANDQ_M_S]) -(define_int_iterator VBICQ_M [VBICQ_M_U VBICQ_M_S]) -(define_int_iterator VSHLQ_M_N [VSHLQ_M_N_S VSHLQ_M_N_U]) -(define_int_iterator VCADDQ_ROT270_M [VCADDQ_ROT270_M_U VCADDQ_ROT270_M_S]) -(define_int_iterator VQRSHLQ_M [VQRSHLQ_M_U VQRSHLQ_M_S]) -(define_int_iterator VQADDQ_M_N [VQADDQ_M_N_U VQADDQ_M_N_S]) -(define_int_iterator VADDQ_M_N [VADDQ_M_N_S VADDQ_M_N_U]) -(define_int_iterator VMAXQ_M [VMAXQ_M_S VMAXQ_M_U]) -(define_int_iterator VQSUBQ_M [VQSUBQ_M_U VQSUBQ_M_S]) -(define_int_iterator VMLASQ_M_N [VMLASQ_M_N_U VMLASQ_M_N_S]) -(define_int_iterator VMLADAVAQ_P [VMLADAVAQ_P_U VMLADAVAQ_P_S]) -(define_int_iterator VBRSRQ_M_N [VBRSRQ_M_N_U VBRSRQ_M_N_S]) -(define_int_iterator VMULQ_M_N [VMULQ_M_N_U VMULQ_M_N_S]) -(define_int_iterator VCADDQ_ROT90_M [VCADDQ_ROT90_M_U VCADDQ_ROT90_M_S]) -(define_int_iterator VMULLTQ_INT_M [VMULLTQ_INT_M_S VMULLTQ_INT_M_U]) -(define_int_iterator VEORQ_M [VEORQ_M_S VEORQ_M_U]) -(define_int_iterator VSHRQ_M_N [VSHRQ_M_N_S VSHRQ_M_N_U]) -(define_int_iterator VSUBQ_M_N [VSUBQ_M_N_S VSUBQ_M_N_U]) -(define_int_iterator VHADDQ_M [VHADDQ_M_S VHADDQ_M_U]) -(define_int_iterator VABDQ_M [VABDQ_M_S VABDQ_M_U]) -(define_int_iterator VMLAQ_M_N [VMLAQ_M_N_S VMLAQ_M_N_U]) -(define_int_iterator VQSHLQ_M_N [VQSHLQ_M_N_S VQSHLQ_M_N_U]) -(define_int_iterator VMLALDAVAQ_P [VMLALDAVAQ_P_U VMLALDAVAQ_P_S]) -(define_int_iterator VMLALDAVAXQ_P [VMLALDAVAXQ_P_U VMLALDAVAXQ_P_S]) -(define_int_iterator VQRSHRNBQ_M_N [VQRSHRNBQ_M_N_U VQRSHRNBQ_M_N_S]) -(define_int_iterator VQRSHRNTQ_M_N [VQRSHRNTQ_M_N_S VQRSHRNTQ_M_N_U]) 
-(define_int_iterator VQSHRNBQ_M_N [VQSHRNBQ_M_N_U VQSHRNBQ_M_N_S]) -(define_int_iterator VQSHRNTQ_M_N [VQSHRNTQ_M_N_S VQSHRNTQ_M_N_U]) -(define_int_iterator VRSHRNBQ_M_N [VRSHRNBQ_M_N_U VRSHRNBQ_M_N_S]) -(define_int_iterator VRSHRNTQ_M_N [VRSHRNTQ_M_N_U VRSHRNTQ_M_N_S]) -(define_int_iterator VSHLLBQ_M_N [VSHLLBQ_M_N_U VSHLLBQ_M_N_S]) -(define_int_iterator VSHLLTQ_M_N [VSHLLTQ_M_N_U VSHLLTQ_M_N_S]) -(define_int_iterator VSHRNBQ_M_N [VSHRNBQ_M_N_S VSHRNBQ_M_N_U]) -(define_int_iterator VSHRNTQ_M_N [VSHRNTQ_M_N_S VSHRNTQ_M_N_U]) -(define_int_iterator VSTRWSBQ [VSTRWQSB_S VSTRWQSB_U]) -(define_int_iterator VSTRBSOQ [VSTRBQSO_S VSTRBQSO_U]) -(define_int_iterator VSTRBQ [VSTRBQ_S VSTRBQ_U]) -(define_int_iterator VLDRBGOQ [VLDRBQGO_S VLDRBQGO_U]) -(define_int_iterator VLDRBQ [VLDRBQ_S VLDRBQ_U]) -(define_int_iterator VLDRWGBQ [VLDRWQGB_S VLDRWQGB_U]) -(define_int_iterator VLD1Q [VLD1Q_S VLD1Q_U]) -(define_int_iterator VLDRHGOQ [VLDRHQGO_S VLDRHQGO_U]) -(define_int_iterator VLDRHGSOQ [VLDRHQGSO_S VLDRHQGSO_U]) -(define_int_iterator VLDRHQ [VLDRHQ_S VLDRHQ_U]) -(define_int_iterator VLDRWQ [VLDRWQ_S VLDRWQ_U]) -(define_int_iterator VLDRDGBQ [VLDRDQGB_S VLDRDQGB_U]) -(define_int_iterator VLDRDGOQ [VLDRDQGO_S VLDRDQGO_U]) -(define_int_iterator VLDRDGSOQ [VLDRDQGSO_S VLDRDQGSO_U]) -(define_int_iterator VLDRWGOQ [VLDRWQGO_S VLDRWQGO_U]) -(define_int_iterator VLDRWGSOQ [VLDRWQGSO_S VLDRWQGSO_U]) -(define_int_iterator VST1Q [VST1Q_S VST1Q_U]) -(define_int_iterator VSTRHSOQ [VSTRHQSO_S VSTRHQSO_U]) -(define_int_iterator VSTRHSSOQ [VSTRHQSSO_S VSTRHQSSO_U]) -(define_int_iterator VSTRHQ [VSTRHQ_S VSTRHQ_U]) -(define_int_iterator VSTRWQ [VSTRWQ_S VSTRWQ_U]) -(define_int_iterator VSTRDSBQ [VSTRDQSB_S VSTRDQSB_U]) -(define_int_iterator VSTRDSOQ [VSTRDQSO_S VSTRDQSO_U]) -(define_int_iterator VSTRDSSOQ [VSTRDQSSO_S VSTRDQSSO_U]) -(define_int_iterator VSTRWSOQ [VSTRWQSO_S VSTRWQSO_U]) -(define_int_iterator VSTRWSSOQ [VSTRWQSSO_S VSTRWQSSO_U]) -(define_int_iterator VSTRWSBWBQ [VSTRWQSBWB_S VSTRWQSBWB_U]) -(define_int_iterator VLDRWGBWBQ [VLDRWQGBWB_S VLDRWQGBWB_U]) -(define_int_iterator VSTRDSBWBQ [VSTRDQSBWB_S VSTRDQSBWB_U]) -(define_int_iterator VLDRDGBWBQ [VLDRDQGBWB_S VLDRDQGBWB_U]) -(define_int_iterator VADCIQ [VADCIQ_U VADCIQ_S]) -(define_int_iterator VADCIQ_M [VADCIQ_M_U VADCIQ_M_S]) -(define_int_iterator VSBCQ [VSBCQ_U VSBCQ_S]) -(define_int_iterator VSBCQ_M [VSBCQ_M_U VSBCQ_M_S]) -(define_int_iterator VSBCIQ [VSBCIQ_U VSBCIQ_S]) -(define_int_iterator VSBCIQ_M [VSBCIQ_M_U VSBCIQ_M_S]) -(define_int_iterator VADCQ [VADCQ_U VADCQ_S]) -(define_int_iterator VADCQ_M [VADCQ_M_U VADCQ_M_S]) -(define_int_iterator UQRSHLLQ [UQRSHLL_64 UQRSHLL_48]) -(define_int_iterator SQRSHRLQ [SQRSHRL_64 SQRSHRL_48]) -(define_int_iterator VSHLCQ_M [VSHLCQ_M_S VSHLCQ_M_U]) - (define_insn "*mve_mov<mode>" [(set (match_operand:MVE_types 0 "nonimmediate_operand" "=w,w,r,w,w,r,w,Ux,w") (match_operand:MVE_types 1 "general_operand" "w,r,w,Dn,Uxi,r,Dm,w,Ul"))] @@ -4310,7 +3662,7 @@ (set_attr "length""8")]) ;; -;; [vqdmlahq_n_s, vqdmlahq_n_u]) +;; [vqdmlahq_n_s]) ;; (define_insn "mve_vqdmlahq_n_<supf><mode>" [ @@ -4326,6 +3678,22 @@ ]) ;; +;; [vqdmlashq_n_s]) +;; +(define_insn "mve_vqdmlashq_n_<supf><mode>" + [ + (set (match_operand:MVE_2 0 "s_register_operand" "=w") + (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") + (match_operand:MVE_2 2 "s_register_operand" "w") + (match_operand:<V_elem> 3 "s_register_operand" "r")] + VQDMLASHQ_N)) + ] + "TARGET_HAVE_MVE" + "vqdmlash.s%#<V_sz_elem>\t%q0, %q2, %3" + [(set_attr "type" 
"mve_move") +]) + +;; ;; [vqnegq_m_s]) ;; (define_insn "mve_vqnegq_m_s<mode>" @@ -4374,7 +3742,7 @@ ]) ;; -;; [vqrdmlahq_n_s, vqrdmlahq_n_u]) +;; [vqrdmlahq_n_s]) ;; (define_insn "mve_vqrdmlahq_n_<supf><mode>" [ @@ -4390,7 +3758,7 @@ ]) ;; -;; [vqrdmlashq_n_s, vqrdmlashq_n_u]) +;; [vqrdmlashq_n_s]) ;; (define_insn "mve_vqrdmlashq_n_<supf><mode>" [ @@ -6552,6 +5920,23 @@ (set_attr "length""8")]) ;; +;; [vqdmlashq_m_n_s]) +;; +(define_insn "mve_vqdmlashq_m_n_s<mode>" + [ + (set (match_operand:MVE_2 0 "s_register_operand" "=w") + (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") + (match_operand:MVE_2 2 "s_register_operand" "w") + (match_operand:<V_elem> 3 "s_register_operand" "r") + (match_operand:HI 4 "vpr_register_operand" "Up")] + VQDMLASHQ_M_N_S)) + ] + "TARGET_HAVE_MVE" + "vpst\;vqdmlasht.s%#<V_sz_elem>\t%q0, %q2, %3" + [(set_attr "type" "mve_move") + (set_attr "length""8")]) + +;; ;; [vqrdmlahq_m_n_s]) ;; (define_insn "mve_vqrdmlahq_m_n_s<mode>" @@ -7113,7 +6498,7 @@ (set_attr "length""8")]) ;; -;; [vmlaldavaxq_p_u, vmlaldavaxq_p_s]) +;; [vmlaldavaxq_p_s]) ;; (define_insn "mve_vmlaldavaxq_p_<supf><mode>" [ @@ -9330,7 +8715,7 @@ [(set_attr "length" "4")]) (define_expand "mve_vst1q_f<mode>" - [(match_operand:<MVE_CNVT> 0 "memory_operand") + [(match_operand:<MVE_CNVT> 0 "mve_memory_operand") (unspec:<MVE_CNVT> [(match_operand:MVE_0 1 "s_register_operand")] VST1Q_F) ] "TARGET_HAVE_MVE || TARGET_HAVE_MVE_FLOAT" @@ -9340,7 +8725,7 @@ }) (define_expand "mve_vst1q_<supf><mode>" - [(match_operand:MVE_2 0 "memory_operand") + [(match_operand:MVE_2 0 "mve_memory_operand") (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand")] VST1Q) ] "TARGET_HAVE_MVE" diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 3e7b51d..85e424e 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -501,7 +501,7 @@ [(set (match_operand:VDQ 0 "s_register_operand" "=w") (plus:VDQ (match_operand:VDQ 1 "s_register_operand" "w") (match_operand:VDQ 2 "s_register_operand" "w")))] - "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)" + "ARM_HAVE_NEON_<MODE>_ARITH" "vadd.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2" [(set (attr "type") (if_then_else (match_test "<Is_float_mode>") @@ -509,54 +509,11 @@ (const_string "neon_add<q>")))] ) -;; As with SFmode, full support for HFmode vector arithmetic is only available -;; when flag-unsafe-math-optimizations is enabled. - -;; Add pattern with modes V8HF and V4HF is split into separate patterns to add -;; support for standard pattern addv8hf3 in MVE. Following pattern is called -;; from "addv8hf3" standard pattern inside vec-common.md file. 
- -(define_insn "addv8hf3_neon" - [(set - (match_operand:V8HF 0 "s_register_operand" "=w") - (plus:V8HF - (match_operand:V8HF 1 "s_register_operand" "w") - (match_operand:V8HF 2 "s_register_operand" "w")))] - "TARGET_NEON_FP16INST && flag_unsafe_math_optimizations" - "vadd.f16\t%<V_reg>0, %<V_reg>1, %<V_reg>2" - [(set_attr "type" "neon_fp_addsub_s_q")] -) - -(define_insn "addv4hf3" - [(set - (match_operand:V4HF 0 "s_register_operand" "=w") - (plus:V4HF - (match_operand:V4HF 1 "s_register_operand" "w") - (match_operand:V4HF 2 "s_register_operand" "w")))] - "TARGET_NEON_FP16INST && flag_unsafe_math_optimizations" - "vadd.f16\t%<V_reg>0, %<V_reg>1, %<V_reg>2" - [(set_attr "type" "neon_fp_addsub_s_q")] -) - -(define_insn "add<mode>3_fp16" - [(set - (match_operand:VH 0 "s_register_operand" "=w") - (plus:VH - (match_operand:VH 1 "s_register_operand" "w") - (match_operand:VH 2 "s_register_operand" "w")))] - "TARGET_NEON_FP16INST" - "vadd.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2" - [(set (attr "type") - (if_then_else (match_test "<Is_float_mode>") - (const_string "neon_fp_addsub_s<q>") - (const_string "neon_add<q>")))] -) - (define_insn "*sub<mode>3_neon" [(set (match_operand:VDQ 0 "s_register_operand" "=w") (minus:VDQ (match_operand:VDQ 1 "s_register_operand" "w") (match_operand:VDQ 2 "s_register_operand" "w")))] - "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)" + "ARM_HAVE_NEON_<MODE>_ARITH" "vsub.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2" [(set (attr "type") (if_then_else (match_test "<Is_float_mode>") @@ -570,7 +527,7 @@ (minus:VH (match_operand:VH 1 "s_register_operand" "w") (match_operand:VH 2 "s_register_operand" "w")))] - "TARGET_NEON_FP16INST && flag_unsafe_math_optimizations" + "ARM_HAVE_NEON_<MODE>_ARITH" "vsub.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2" [(set_attr "type" "neon_sub<q>")] ) @@ -590,7 +547,7 @@ [(set (match_operand:VDQW 0 "s_register_operand" "=w") (mult:VDQW (match_operand:VDQW 1 "s_register_operand" "w") (match_operand:VDQW 2 "s_register_operand" "w")))] - "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)" + "ARM_HAVE_NEON_<MODE>_ARITH" "vmul.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2" [(set (attr "type") (if_then_else (match_test "<Is_float_mode>") @@ -635,7 +592,7 @@ (plus:VDQW (mult:VDQW (match_operand:VDQW 2 "s_register_operand" "w") (match_operand:VDQW 3 "s_register_operand" "w")) (match_operand:VDQW 1 "s_register_operand" "0")))] - "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)" + "ARM_HAVE_NEON_<MODE>_ARITH" "vmla.<V_if_elem>\t%<V_reg>0, %<V_reg>2, %<V_reg>3" [(set (attr "type") (if_then_else (match_test "<Is_float_mode>") @@ -648,7 +605,7 @@ (plus:VH (mult:VH (match_operand:VH 2 "s_register_operand" "w") (match_operand:VH 3 "s_register_operand" "w")) (match_operand:VH 1 "s_register_operand" "0")))] - "TARGET_NEON_FP16INST && (!<Is_float_mode> || flag_unsafe_math_optimizations)" + "ARM_HAVE_NEON_<MODE>_ARITH" "vmla.f16\t%<V_reg>0, %<V_reg>2, %<V_reg>3" [(set_attr "type" "neon_fp_mla_s<q>")] ) @@ -658,7 +615,7 @@ (minus:VDQW (match_operand:VDQW 1 "s_register_operand" "0") (mult:VDQW (match_operand:VDQW 2 "s_register_operand" "w") (match_operand:VDQW 3 "s_register_operand" "w"))))] - "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)" + "ARM_HAVE_NEON_<MODE>_ARITH" "vmls.<V_if_elem>\t%<V_reg>0, %<V_reg>2, %<V_reg>3" [(set (attr "type") (if_then_else (match_test "<Is_float_mode>") @@ -676,7 +633,7 @@ (fma:VCVTF (match_operand:VCVTF 1 "register_operand" "w") 
(match_operand:VCVTF 2 "register_operand" "w") (match_operand:VCVTF 3 "register_operand" "0")))] - "TARGET_NEON && TARGET_FMA && flag_unsafe_math_optimizations" + "ARM_HAVE_NEON_<MODE>_ARITH && TARGET_FMA" "vfma.<V_if_elem>\\t%<V_reg>0, %<V_reg>1, %<V_reg>2" [(set_attr "type" "neon_fp_mla_s<q>")] ) @@ -697,18 +654,7 @@ (match_operand:VH 1 "register_operand" "w") (match_operand:VH 2 "register_operand" "w") (match_operand:VH 3 "register_operand" "0")))] - "TARGET_NEON_FP16INST && flag_unsafe_math_optimizations" - "vfma.<V_if_elem>\\t%<V_reg>0, %<V_reg>1, %<V_reg>2" - [(set_attr "type" "neon_fp_mla_s<q>")] -) - -(define_insn "fma<VH:mode>4_intrinsic" - [(set (match_operand:VH 0 "register_operand" "=w") - (fma:VH - (match_operand:VH 1 "register_operand" "w") - (match_operand:VH 2 "register_operand" "w") - (match_operand:VH 3 "register_operand" "0")))] - "TARGET_NEON_FP16INST" + "ARM_HAVE_NEON_<MODE>_ARITH" "vfma.<V_if_elem>\\t%<V_reg>0, %<V_reg>1, %<V_reg>2" [(set_attr "type" "neon_fp_mla_s<q>")] ) @@ -718,7 +664,7 @@ (fma:VCVTF (neg:VCVTF (match_operand:VCVTF 1 "register_operand" "w")) (match_operand:VCVTF 2 "register_operand" "w") (match_operand:VCVTF 3 "register_operand" "0")))] - "TARGET_NEON && TARGET_FMA && flag_unsafe_math_optimizations" + "ARM_HAVE_NEON_<MODE>_ARITH && TARGET_FMA" "vfms.<V_if_elem>\\t%<V_reg>0, %<V_reg>1, %<V_reg>2" [(set_attr "type" "neon_fp_mla_s<q>")] ) @@ -1238,7 +1184,7 @@ (parallel [(const_int 0) (const_int 1)])) (vec_select:V2SF (match_dup 1) (parallel [(const_int 2) (const_int 3)]))))] - "TARGET_NEON && flag_unsafe_math_optimizations" + "ARM_HAVE_NEON_V4SF_ARITH" "<VQH_mnem>.f32\t%P0, %e1, %f1" [(set_attr "vqh_mnem" "<VQH_mnem>") (set_attr "type" "neon_fp_reduc_<VQH_type>_s_q")] @@ -1305,7 +1251,7 @@ (define_expand "reduc_plus_scal_<mode>" [(match_operand:<V_elem> 0 "nonimmediate_operand") (match_operand:VD 1 "s_register_operand")] - "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)" + "ARM_HAVE_NEON_<MODE>_ARITH" { rtx vec = gen_reg_rtx (<MODE>mode); neon_pairwise_reduce (vec, operands[1], <MODE>mode, @@ -1318,8 +1264,7 @@ (define_expand "reduc_plus_scal_<mode>" [(match_operand:<V_elem> 0 "nonimmediate_operand") (match_operand:VQ 1 "s_register_operand")] - "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations) - && !BYTES_BIG_ENDIAN" + "ARM_HAVE_NEON_<MODE>_ARITH && !BYTES_BIG_ENDIAN" { rtx step1 = gen_reg_rtx (<V_HALF>mode); @@ -1354,7 +1299,7 @@ (define_expand "reduc_smin_scal_<mode>" [(match_operand:<V_elem> 0 "nonimmediate_operand") (match_operand:VD 1 "s_register_operand")] - "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)" + "ARM_HAVE_NEON_<MODE>_ARITH" { rtx vec = gen_reg_rtx (<MODE>mode); @@ -1368,8 +1313,7 @@ (define_expand "reduc_smin_scal_<mode>" [(match_operand:<V_elem> 0 "nonimmediate_operand") (match_operand:VQ 1 "s_register_operand")] - "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations) - && !BYTES_BIG_ENDIAN" + "ARM_HAVE_NEON_<MODE>_ARITH && !BYTES_BIG_ENDIAN" { rtx step1 = gen_reg_rtx (<V_HALF>mode); @@ -1382,7 +1326,7 @@ (define_expand "reduc_smax_scal_<mode>" [(match_operand:<V_elem> 0 "nonimmediate_operand") (match_operand:VD 1 "s_register_operand")] - "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)" + "ARM_HAVE_NEON_<MODE>_ARITH" { rtx vec = gen_reg_rtx (<MODE>mode); neon_pairwise_reduce (vec, operands[1], <MODE>mode, @@ -1395,8 +1339,7 @@ (define_expand "reduc_smax_scal_<mode>" [(match_operand:<V_elem> 0 "nonimmediate_operand") (match_operand:VQ 1 
"s_register_operand")] - "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations) - && !BYTES_BIG_ENDIAN" + "ARM_HAVE_NEON_<MODE>_ARITH && !BYTES_BIG_ENDIAN" { rtx step1 = gen_reg_rtx (<V_HALF>mode); @@ -1573,6 +1516,30 @@ [(set_attr "type" "neon_qsub<q>")] ) +(define_expand "vec_cmp<mode><v_cmp_result>" + [(set (match_operand:<V_cmp_result> 0 "s_register_operand") + (match_operator:<V_cmp_result> 1 "comparison_operator" + [(match_operand:VDQW 2 "s_register_operand") + (match_operand:VDQW 3 "reg_or_zero_operand")]))] + "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)" +{ + arm_expand_vector_compare (operands[0], GET_CODE (operands[1]), + operands[2], operands[3], false); + DONE; +}) + +(define_expand "vec_cmpu<mode><mode>" + [(set (match_operand:VDQIW 0 "s_register_operand") + (match_operator:VDQIW 1 "comparison_operator" + [(match_operand:VDQIW 2 "s_register_operand") + (match_operand:VDQIW 3 "reg_or_zero_operand")]))] + "TARGET_NEON" +{ + arm_expand_vector_compare (operands[0], GET_CODE (operands[1]), + operands[2], operands[3], false); + DONE; +}) + ;; Conditional instructions. These are comparisons with conditional moves for ;; vectors. They perform the assignment: ;; @@ -1586,230 +1553,53 @@ (if_then_else:VDQW (match_operator 3 "comparison_operator" [(match_operand:VDQW 4 "s_register_operand") - (match_operand:VDQW 5 "nonmemory_operand")]) + (match_operand:VDQW 5 "reg_or_zero_operand")]) (match_operand:VDQW 1 "s_register_operand") (match_operand:VDQW 2 "s_register_operand")))] "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)" { - int inverse = 0; - int use_zero_form = 0; - int swap_bsl_operands = 0; - rtx mask = gen_reg_rtx (<V_cmp_result>mode); - rtx tmp = gen_reg_rtx (<V_cmp_result>mode); - - rtx (*base_comparison) (rtx, rtx, rtx); - rtx (*complimentary_comparison) (rtx, rtx, rtx); - - switch (GET_CODE (operands[3])) - { - case GE: - case GT: - case LE: - case LT: - case EQ: - if (operands[5] == CONST0_RTX (<MODE>mode)) - { - use_zero_form = 1; - break; - } - /* Fall through. */ - default: - if (!REG_P (operands[5])) - operands[5] = force_reg (<MODE>mode, operands[5]); - } - - switch (GET_CODE (operands[3])) - { - case LT: - case UNLT: - inverse = 1; - /* Fall through. */ - case GE: - case UNGE: - case ORDERED: - case UNORDERED: - base_comparison = gen_neon_vcge<mode>; - complimentary_comparison = gen_neon_vcgt<mode>; - break; - case LE: - case UNLE: - inverse = 1; - /* Fall through. */ - case GT: - case UNGT: - base_comparison = gen_neon_vcgt<mode>; - complimentary_comparison = gen_neon_vcge<mode>; - break; - case EQ: - case NE: - case UNEQ: - base_comparison = gen_neon_vceq<mode>; - complimentary_comparison = gen_neon_vceq<mode>; - break; - default: - gcc_unreachable (); - } - - switch (GET_CODE (operands[3])) - { - case LT: - case LE: - case GT: - case GE: - case EQ: - /* The easy case. Here we emit one of vcge, vcgt or vceq. - As a LT b <=> b GE a && a LE b <=> b GT a. Our transformations are: - a GE b -> a GE b - a GT b -> a GT b - a LE b -> b GE a - a LT b -> b GT a - a EQ b -> a EQ b - Note that there also exist direct comparison against 0 forms, - so catch those as a special case. */ - if (use_zero_form) - { - inverse = 0; - switch (GET_CODE (operands[3])) - { - case LT: - base_comparison = gen_neon_vclt<mode>; - break; - case LE: - base_comparison = gen_neon_vcle<mode>; - break; - default: - /* Do nothing, other zero form cases already have the correct - base_comparison. 
*/ - break; - } - } - - if (!inverse) - emit_insn (base_comparison (mask, operands[4], operands[5])); - else - emit_insn (complimentary_comparison (mask, operands[5], operands[4])); - break; - case UNLT: - case UNLE: - case UNGT: - case UNGE: - case NE: - /* Vector compare returns false for lanes which are unordered, so if we use - the inverse of the comparison we actually want to emit, then - swap the operands to BSL, we will end up with the correct result. - Note that a NE NaN and NaN NE b are true for all a, b. - - Our transformations are: - a GE b -> !(b GT a) - a GT b -> !(b GE a) - a LE b -> !(a GT b) - a LT b -> !(a GE b) - a NE b -> !(a EQ b) */ - - if (inverse) - emit_insn (base_comparison (mask, operands[4], operands[5])); - else - emit_insn (complimentary_comparison (mask, operands[5], operands[4])); - - swap_bsl_operands = 1; - break; - case UNEQ: - /* We check (a > b || b > a). combining these comparisons give us - true iff !(a != b && a ORDERED b), swapping the operands to BSL - will then give us (a == b || a UNORDERED b) as intended. */ - - emit_insn (gen_neon_vcgt<mode> (mask, operands[4], operands[5])); - emit_insn (gen_neon_vcgt<mode> (tmp, operands[5], operands[4])); - emit_insn (gen_ior<v_cmp_result>3 (mask, mask, tmp)); - swap_bsl_operands = 1; - break; - case UNORDERED: - /* Operands are ORDERED iff (a > b || b >= a). - Swapping the operands to BSL will give the UNORDERED case. */ - swap_bsl_operands = 1; - /* Fall through. */ - case ORDERED: - emit_insn (gen_neon_vcgt<mode> (tmp, operands[4], operands[5])); - emit_insn (gen_neon_vcge<mode> (mask, operands[5], operands[4])); - emit_insn (gen_ior<v_cmp_result>3 (mask, mask, tmp)); - break; - default: - gcc_unreachable (); - } + arm_expand_vcond (operands, <V_cmp_result>mode); + DONE; +}) - if (swap_bsl_operands) - emit_insn (gen_neon_vbsl<mode> (operands[0], mask, operands[2], - operands[1])); - else - emit_insn (gen_neon_vbsl<mode> (operands[0], mask, operands[1], - operands[2])); +(define_expand "vcond<V_cvtto><mode>" + [(set (match_operand:<V_CVTTO> 0 "s_register_operand") + (if_then_else:<V_CVTTO> + (match_operator 3 "comparison_operator" + [(match_operand:V32 4 "s_register_operand") + (match_operand:V32 5 "reg_or_zero_operand")]) + (match_operand:<V_CVTTO> 1 "s_register_operand") + (match_operand:<V_CVTTO> 2 "s_register_operand")))] + "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)" +{ + arm_expand_vcond (operands, <V_cmp_result>mode); DONE; }) -(define_expand "vcondu<mode><mode>" - [(set (match_operand:VDQIW 0 "s_register_operand") - (if_then_else:VDQIW +(define_expand "vcondu<mode><v_cmp_result>" + [(set (match_operand:VDQW 0 "s_register_operand") + (if_then_else:VDQW (match_operator 3 "arm_comparison_operator" - [(match_operand:VDQIW 4 "s_register_operand") - (match_operand:VDQIW 5 "s_register_operand")]) - (match_operand:VDQIW 1 "s_register_operand") - (match_operand:VDQIW 2 "s_register_operand")))] + [(match_operand:<V_cmp_result> 4 "s_register_operand") + (match_operand:<V_cmp_result> 5 "reg_or_zero_operand")]) + (match_operand:VDQW 1 "s_register_operand") + (match_operand:VDQW 2 "s_register_operand")))] "TARGET_NEON" { - rtx mask; - int inverse = 0, immediate_zero = 0; - - mask = gen_reg_rtx (<V_cmp_result>mode); - - if (operands[5] == CONST0_RTX (<MODE>mode)) - immediate_zero = 1; - else if (!REG_P (operands[5])) - operands[5] = force_reg (<MODE>mode, operands[5]); - - switch (GET_CODE (operands[3])) - { - case GEU: - emit_insn (gen_neon_vcgeu<mode> (mask, operands[4], operands[5])); - 
break; - - case GTU: - emit_insn (gen_neon_vcgtu<mode> (mask, operands[4], operands[5])); - break; - - case EQ: - emit_insn (gen_neon_vceq<mode> (mask, operands[4], operands[5])); - break; - - case LEU: - if (immediate_zero) - emit_insn (gen_neon_vcle<mode> (mask, operands[4], operands[5])); - else - emit_insn (gen_neon_vcgeu<mode> (mask, operands[5], operands[4])); - break; - - case LTU: - if (immediate_zero) - emit_insn (gen_neon_vclt<mode> (mask, operands[4], operands[5])); - else - emit_insn (gen_neon_vcgtu<mode> (mask, operands[5], operands[4])); - break; - - case NE: - emit_insn (gen_neon_vceq<mode> (mask, operands[4], operands[5])); - inverse = 1; - break; - - default: - gcc_unreachable (); - } - - if (inverse) - emit_insn (gen_neon_vbsl<mode> (operands[0], mask, operands[2], - operands[1])); - else - emit_insn (gen_neon_vbsl<mode> (operands[0], mask, operands[1], - operands[2])); + arm_expand_vcond (operands, <V_cmp_result>mode); + DONE; +}) +(define_expand "vcond_mask_<mode><v_cmp_result>" + [(set (match_operand:VDQW 0 "s_register_operand") + (if_then_else:VDQW + (match_operand:<V_cmp_result> 3 "s_register_operand") + (match_operand:VDQW 1 "s_register_operand") + (match_operand:VDQW 2 "s_register_operand")))] + "TARGET_NEON" +{ + emit_insn (gen_neon_vbsl<mode> (operands[0], operands[3], operands[1], + operands[2])); DONE; }) @@ -1823,7 +1613,7 @@ (match_operand:VCVTF 2 "s_register_operand")] "TARGET_NEON" { - if (!<Is_float_mode> || flag_unsafe_math_optimizations) + if (ARM_HAVE_NEON_<MODE>_ARITH) emit_insn (gen_add<mode>3 (operands[0], operands[1], operands[2])); else emit_insn (gen_neon_vadd<mode>_unspec (operands[0], operands[1], @@ -1837,7 +1627,7 @@ (match_operand:VH 2 "s_register_operand")] "TARGET_NEON_FP16INST" { - emit_insn (gen_add<mode>3_fp16 (operands[0], operands[1], operands[2])); + emit_insn (gen_add<mode>3 (operands[0], operands[1], operands[2])); DONE; }) @@ -1948,7 +1738,7 @@ (mult:VH (match_operand:VH 1 "s_register_operand" "w") (match_operand:VH 2 "s_register_operand" "w")))] - "TARGET_NEON_FP16INST && flag_unsafe_math_optimizations" + "ARM_HAVE_NEON_<MODE>_ARITH" "vmul.f16\t%<V_reg>0, %<V_reg>1, %<V_reg>2" [(set_attr "type" "neon_mul_<VH_elem_ch><q>")] ) @@ -1971,7 +1761,7 @@ (match_operand:VDQW 3 "s_register_operand")] "TARGET_NEON" { - if (!<Is_float_mode> || flag_unsafe_math_optimizations) + if (ARM_HAVE_NEON_<MODE>_ARITH) emit_insn (gen_mul<mode>3add<mode>_neon (operands[0], operands[1], operands[2], operands[3])); else @@ -1999,8 +1789,8 @@ (match_operand:VH 3 "s_register_operand")] "TARGET_NEON_FP16INST" { - emit_insn (gen_fma<mode>4_intrinsic (operands[0], operands[2], operands[3], - operands[1])); + emit_insn (gen_fma<mode>4 (operands[0], operands[2], operands[3], + operands[1])); DONE; }) @@ -2462,7 +2252,7 @@ (match_operand:VDQW 3 "s_register_operand")] "TARGET_NEON" { - if (!<Is_float_mode> || flag_unsafe_math_optimizations) + if (ARM_HAVE_NEON_<MODE>_ARITH) emit_insn (gen_mul<mode>3neg<mode>add<mode>_neon (operands[0], operands[1], operands[2], operands[3])); else @@ -2569,7 +2359,7 @@ (match_operand:VCVTF 2 "s_register_operand")] "TARGET_NEON" { - if (!<Is_float_mode> || flag_unsafe_math_optimizations) + if (ARM_HAVE_NEON_<MODE>_ARITH) emit_insn (gen_sub<mode>3 (operands[0], operands[1], operands[2])); else emit_insn (gen_neon_vsub<mode>_unspec (operands[0], operands[1], @@ -2644,7 +2434,7 @@ ;; These may expand to an UNSPEC pattern when a floating point mode is used ;; without unsafe math optimizations. 
-(define_expand "neon_vc<cmp_op><mode>" +(define_expand "@neon_vc<cmp_op><mode>" [(match_operand:<V_cmp_result> 0 "s_register_operand") (neg:<V_cmp_result> (COMPARISONS:VDQW (match_operand:VDQW 1 "s_register_operand") @@ -2684,7 +2474,7 @@ } ) -(define_insn "neon_vc<cmp_op><mode>_insn" +(define_insn "@neon_vc<cmp_op><mode>_insn" [(set (match_operand:<V_cmp_result> 0 "s_register_operand" "=w,w") (neg:<V_cmp_result> (COMPARISONS:<V_cmp_result> @@ -2728,7 +2518,7 @@ [(set_attr "type" "neon_fp_compare_s<q>")] ) -(define_expand "neon_vc<cmp_op><mode>" +(define_expand "@neon_vc<cmp_op><mode>" [(match_operand:<V_cmp_result> 0 "s_register_operand") (neg:<V_cmp_result> (COMPARISONS:VH @@ -2794,7 +2584,7 @@ } [(set_attr "type" "neon_fp_compare_s<q>")]) -(define_insn "neon_vc<cmp_op>u<mode>" +(define_insn "@neon_vc<code><mode>" [(set (match_operand:<V_cmp_result> 0 "s_register_operand" "=w") (neg:<V_cmp_result> (GTUGEU:<V_cmp_result> @@ -4751,7 +4541,7 @@ if (BYTES_BIG_ENDIAN) [(set_attr "type" "neon_bsl<q>")] ) -(define_expand "neon_vbsl<mode>" +(define_expand "@neon_vbsl<mode>" [(set (match_operand:VDQX 0 "s_register_operand") (unspec:VDQX [(match_operand:<V_cmp_result> 1 "s_register_operand") (match_operand:VDQX 2 "s_register_operand") @@ -6658,7 +6448,7 @@ if (BYTES_BIG_ENDIAN) [(set (match_operand:VF 0 "s_register_operand" "=w") (abs:VF (minus:VF (match_operand:VF 1 "s_register_operand" "w") (match_operand:VF 2 "s_register_operand" "w"))))] - "TARGET_NEON && flag_unsafe_math_optimizations" + "ARM_HAVE_NEON_<MODE>_ARITH" "vabd.<V_s_elem> %<V_reg>0, %<V_reg>1, %<V_reg>2" [(set_attr "type" "neon_fp_abd_s<q>")] ) @@ -6668,7 +6458,7 @@ if (BYTES_BIG_ENDIAN) (abs:VF (unspec:VF [(match_operand:VF 1 "s_register_operand" "w") (match_operand:VF 2 "s_register_operand" "w")] UNSPEC_VSUB)))] - "TARGET_NEON && flag_unsafe_math_optimizations" + "ARM_HAVE_NEON_<MODE>_ARITH" "vabd.<V_if_elem> %<V_reg>0, %<V_reg>1, %<V_reg>2" [(set_attr "type" "neon_fp_abd_s<q>")] ) diff --git a/gcc/config/arm/parsecpu.awk b/gcc/config/arm/parsecpu.awk index 7fc3754..9423e8a 100644 --- a/gcc/config/arm/parsecpu.awk +++ b/gcc/config/arm/parsecpu.awk @@ -190,6 +190,23 @@ function gen_isa () { ORS = z print "\n" } + + print "struct fbit_implication {" + print " /* Represents a feature implication, where:" + print " ante IMPLIES cons" + print " meaning that if ante is enabled then we should" + print " also implicitly enable cons. */" + print " enum isa_feature ante;" + print " enum isa_feature cons;" + print "};\n" + print "static const struct fbit_implication all_implied_fbits[] =" + print "{" + for (impl in implied_bits) { + split (impl, impl_parts, SUBSEP) + print " { isa_bit_" impl_parts[2] ", isa_bit_" impl_parts[1] " }," + } + print " { isa_nobit, isa_nobit }" + print "};\n" } function gen_data () { @@ -600,6 +617,40 @@ BEGIN { parse_ok = 1 } +/^define implied / { + if (NF < 4) fatal("syntax: define implied <name> [<feature-or-fgroup>]+\n" \ + "Implied bits must be defined with at least one antecedent.") + toplevel() + fbit = $3 + if (fbit in features) fatal("implied feature " fbit " aliases a real feature") + if (fbit in fgroup) fatal("implied feature " fbit " aliases a feature group") + fcount = NF + features[fbit] = 1 + for (n = 4; n <= fcount; n++) { + ante = $n + if (fbit == ante) fatal("feature cannot imply itself") + else if (ante in features) { + for (impl in implied_bits) { + split(impl, impl_sep, SUBSEP) + if (ante == impl_sep[1]) + fatal(ante " implies implied bit " fbit \ + ". 
Chained implications not currently supported") + } + implied_bits[fbit, ante] = 1 + } else if (ante in fgroup) { + for (bitcomb in fgrp_bits) { + split(bitcomb, bitsep, SUBSEP) + if (bitsep[1] == ante) { + implied_bits[fbit, bitsep[2]] = 1 + } + } + } else { + fatal("implied bit antecedent " ante " unrecognized") + } + } + parse_ok = 1 +} + /^begin fpu / { if (NF != 3) fatal("syntax: begin fpu <name>") toplevel() diff --git a/gcc/config/arm/predicates.md b/gcc/config/arm/predicates.md index 981eec5..2144520 100644 --- a/gcc/config/arm/predicates.md +++ b/gcc/config/arm/predicates.md @@ -485,6 +485,18 @@ (and (match_operand 0 "expandable_comparison_operator") (match_test "maybe_get_arm_condition_code (op) != ARM_NV"))) +(define_special_predicate "arm_comparison_operation" + (match_code "eq,ne,le,lt,ge,gt,geu,gtu,leu,ltu,unordered, + ordered,unlt,unle,unge,ungt") +{ + if (XEXP (op, 1) != const0_rtx) + return false; + rtx op0 = XEXP (op, 0); + if (!REG_P (op0) || REGNO (op0) != CC_REGNUM) + return false; + return maybe_get_arm_condition_code (op) != ARM_NV; +}) + (define_special_predicate "lt_ge_comparison_operator" (match_code "lt,ge")) diff --git a/gcc/config/arm/thumb1.md b/gcc/config/arm/thumb1.md index 2486163..b5377a3 100644 --- a/gcc/config/arm/thumb1.md +++ b/gcc/config/arm/thumb1.md @@ -64,20 +64,6 @@ (set_attr "conds" "clob")] ) -(define_split - [(set (match_operand:SI 0 "register_operand" "") - (match_operand:SI 1 "immediate_operand" ""))] - "TARGET_THUMB1 - && arm_disable_literal_pool - && GET_CODE (operands[1]) == CONST_INT - && !satisfies_constraint_I (operands[1])" - [(clobber (const_int 0))] - " - thumb1_gen_const_int (operands[0], INTVAL (operands[1])); - DONE; - " -) - (define_insn "*thumb1_adddi3" [(set (match_operand:DI 0 "register_operand" "=l") (plus:DI (match_operand:DI 1 "register_operand" "%0") @@ -696,18 +682,59 @@ "TARGET_THUMB1 && ( register_operand (operands[0], SImode) || register_operand (operands[1], SImode))" - "@ - movs %0, %1 - movs %0, %1 - movw %0, %1 - # - # - ldmia\\t%1, {%0} - stmia\\t%0, {%1} - movs\\t%0, #:upper8_15:%1; lsls\\t%0, #8; adds\\t%0, #:upper0_7:%1; lsls\\t%0, #8; adds\\t%0, #:lower8_15:%1; lsls\\t%0, #8; adds\\t%0, #:lower0_7:%1 - ldr\\t%0, %1 - str\\t%1, %0 - mov\\t%0, %1" +{ + switch (which_alternative) + { + default: + case 0: return "movs\t%0, %1"; + case 1: return "movs\t%0, %1"; + case 2: return "movw\t%0, %1"; + case 3: return "#"; + case 4: return "#"; + case 5: return "ldmia\t%1, {%0}"; + case 6: return "stmia\t%0, {%1}"; + case 7: + /* pure-code alternative: build the constant byte by byte, + instead of loading it from a constant pool. */ + { + int i; + HOST_WIDE_INT op1 = INTVAL (operands[1]); + bool mov_done_p = false; + rtx ops[2]; + ops[0] = operands[0]; + + /* Emit upper 3 bytes if needed. */ + for (i = 0; i < 3; i++) + { + int byte = (op1 >> (8 * (3 - i))) & 0xff; + + if (byte) + { + ops[1] = GEN_INT (byte); + if (mov_done_p) + output_asm_insn ("adds\t%0, %1", ops); + else + output_asm_insn ("movs\t%0, %1", ops); + mov_done_p = true; + } + + if (mov_done_p) + output_asm_insn ("lsls\t%0, #8", ops); + } + + /* Emit lower byte if needed. 
*/ + ops[1] = GEN_INT (op1 & 0xff); + if (!mov_done_p) + output_asm_insn ("movs\t%0, %1", ops); + else if (op1 & 0xff) + output_asm_insn ("adds\t%0, %1", ops); + return ""; + } + case 8: return "ldr\t%0, %1"; + case 9: return "str\t%1, %0"; + case 10: return "mov\t%0, %1"; + } +} [(set_attr "length" "2,2,4,4,4,2,2,14,2,2,2") (set_attr "type" "mov_reg,mov_imm,mov_imm,multiple,multiple,load_4,store_4,alu_sreg,load_4,store_4,mov_reg") (set_attr "pool_range" "*,*,*,*,*,*,*, *,1018,*,*") @@ -790,6 +817,21 @@ }" ) +(define_split + [(set (match_operand:SI 0 "register_operand" "") + (match_operand:SI 1 "const_int_operand" ""))] + "TARGET_THUMB1 + && arm_disable_literal_pool + && GET_CODE (operands[1]) == CONST_INT + && !TARGET_HAVE_MOVT + && !satisfies_constraint_K (operands[1])" + [(clobber (const_int 0))] + " + thumb1_gen_const_int (operands[0], INTVAL (operands[1])); + DONE; + " +) + (define_insn "*thumb1_movhi_insn" [(set (match_operand:HI 0 "nonimmediate_operand" "=l,l,m,l*r,*h,l,r") (match_operand:HI 1 "general_operand" "l,m,l,k*h,*r,I,n"))] @@ -2020,6 +2062,8 @@ [(set_attr "type" "mov_reg")] ) +;; DO NOT SPLIT THIS PATTERN. It is important for security reasons that the +;; canary value does not live beyond the end of this sequence. (define_insn "thumb1_stack_protect_test_insn" [(set (match_operand:SI 0 "register_operand" "=&l") (unspec:SI [(match_operand:SI 1 "memory_operand" "m") @@ -2027,9 +2071,9 @@ UNSPEC_SP_TEST)) (clobber (match_dup 2))] "TARGET_THUMB1" - "ldr\t%0, [%2]\;ldr\t%2, %1\;eors\t%0, %2, %0" - [(set_attr "length" "8") - (set_attr "conds" "set") + "ldr\t%0, [%2]\;ldr\t%2, %1\;eors\t%0, %2, %0\;movs\t%2, #0" + [(set_attr "length" "10") + (set_attr "conds" "clob") (set_attr "type" "multiple")] ) diff --git a/gcc/config/arm/thumb2.md b/gcc/config/arm/thumb2.md index 1a5f24e..2a8fdf2 100644 --- a/gcc/config/arm/thumb2.md +++ b/gcc/config/arm/thumb2.md @@ -744,6 +744,10 @@ return \"%i5\\t%0, %1, %2, lsr #31\"; output_asm_insn (\"cmp\\t%2, %3\", operands); + + if (GET_CODE (operands[5]) == PLUS && TARGET_COND_ARITH) + return \"cinc\\t%0, %1, %d4\"; + if (GET_CODE (operands[5]) == AND) { output_asm_insn (\"ite\\t%D4\", operands); @@ -877,7 +881,7 @@ [(match_operand:SI 1 "s_register_operand" "r") (match_operand:SI 2 "arm_rhs_operand" "rI")]))) (clobber (reg:CC CC_REGNUM))] - "TARGET_THUMB2" + "TARGET_THUMB2 && !TARGET_COND_ARITH" "#" "&& reload_completed" [(const_int 0)] @@ -938,6 +942,49 @@ (set_attr "type" "multiple")] ) +(define_insn "*thumb2_csinv" + [(set (match_operand:SI 0 "arm_general_register_operand" "=r, r") + (if_then_else:SI + (match_operand 1 "arm_comparison_operation" "") + (not:SI (match_operand:SI 2 "arm_general_register_operand" "r, r")) + (match_operand:SI 3 "reg_or_zero_operand" "r, Pz")))] + "TARGET_COND_ARITH" + "@ + csinv\\t%0, %3, %2, %D1 + csinv\\t%0, zr, %2, %D1" + [(set_attr "type" "csel") + (set_attr "predicable" "no")] +) + +(define_insn "*thumb2_csinc" + [(set (match_operand:SI 0 "arm_general_register_operand" "=r, r") + (if_then_else:SI + (match_operand 1 "arm_comparison_operation" "") + (plus:SI (match_operand:SI 2 "arm_general_register_operand" "r, r") + (const_int 1)) + (match_operand:SI 3 "reg_or_zero_operand" "r, Pz")))] + "TARGET_COND_ARITH" + "@ + csinc\\t%0, %3, %2, %D1 + csinc\\t%0, zr, %2, %D1" + [(set_attr "type" "csel") + (set_attr "predicable" "no")] +) + +(define_insn "*thumb2_csneg" + [(set (match_operand:SI 0 "arm_general_register_operand" "=r, r") + (if_then_else:SI + (match_operand 1 "arm_comparison_operation" "") + (neg:SI 
(match_operand:SI 2 "arm_general_register_operand" "r, r")) + (match_operand:SI 3 "reg_or_zero_operand" "r, Pz")))] + "TARGET_COND_ARITH" + "@ + csneg\\t%0, %3, %2, %D1 + csneg\\t%0, zr, %2, %D1" + [(set_attr "type" "csel") + (set_attr "predicable" "no")] +) + (define_insn "*thumb2_movcond" [(set (match_operand:SI 0 "s_register_operand" "=Ts,Ts,Ts") (if_then_else:SI @@ -947,7 +994,7 @@ (match_operand:SI 1 "arm_rhs_operand" "0,TsI,?TsI") (match_operand:SI 2 "arm_rhs_operand" "TsI,0,TsI"))) (clobber (reg:CC CC_REGNUM))] - "TARGET_THUMB2" + "TARGET_THUMB2 && !TARGET_COND_ARITH" "* if (GET_CODE (operands[5]) == LT && (operands[4] == const0_rtx)) diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md index 0a2399d..a3844e9 100644 --- a/gcc/config/arm/unspecs.md +++ b/gcc/config/arm/unspecs.md @@ -519,3 +519,803 @@ UNSPEC_BFMAB UNSPEC_BFMAT ]) + +;; Enumerators for MVE unspecs. +(define_c_enum "unspec" [ + VST4Q + VRNDXQ_F + VRNDQ_F + VRNDPQ_F + VRNDNQ_F + VRNDMQ_F + VRNDAQ_F + VREV64Q_F + VNEGQ_F + VDUPQ_N_F + VABSQ_F + VREV32Q_F + VCVTTQ_F32_F16 + VCVTBQ_F32_F16 + VCVTQ_TO_F_S + VQNEGQ_S + VCVTQ_TO_F_U + VREV16Q_S + VREV16Q_U + VADDLVQ_S + VMVNQ_N_S + VMVNQ_N_U + VCVTAQ_S + VCVTAQ_U + VREV64Q_S + VREV64Q_U + VQABSQ_S + VNEGQ_S + VMVNQ_S + VMVNQ_U + VDUPQ_N_U + VDUPQ_N_S + VCLZQ_U + VCLZQ_S + VCLSQ_S + VADDVQ_S + VADDVQ_U + VABSQ_S + VREV32Q_U + VREV32Q_S + VMOVLTQ_U + VMOVLTQ_S + VMOVLBQ_S + VMOVLBQ_U + VCVTQ_FROM_F_S + VCVTQ_FROM_F_U + VCVTPQ_S + VCVTPQ_U + VCVTNQ_S + VCVTNQ_U + VCVTMQ_S + VCVTMQ_U + VADDLVQ_U + VCTP8Q + VCTP16Q + VCTP32Q + VCTP64Q + VPNOT + VCREATEQ_F + VCVTQ_N_TO_F_S + VCVTQ_N_TO_F_U + VBRSRQ_N_F + VSUBQ_N_F + VCREATEQ_U + VCREATEQ_S + VSHRQ_N_S + VSHRQ_N_U + VCVTQ_N_FROM_F_S + VCVTQ_N_FROM_F_U + VADDLVQ_P_S + VADDLVQ_P_U + VCMPNEQ_U + VCMPNEQ_S + VSHLQ_S + VSHLQ_U + VABDQ_S + VADDQ_N_S + VADDVAQ_S + VADDVQ_P_S + VANDQ_S + VBICQ_S + VBRSRQ_N_S + VCADDQ_ROT270_S + VCADDQ_ROT90_S + VCMPEQQ_S + VCMPEQQ_N_S + VCMPNEQ_N_S + VEORQ_S + VHADDQ_S + VHADDQ_N_S + VHSUBQ_S + VHSUBQ_N_S + VMAXQ_S + VMAXVQ_S + VMINQ_S + VMINVQ_S + VMLADAVQ_S + VMULHQ_S + VMULLBQ_INT_S + VMULLTQ_INT_S + VMULQ_S + VMULQ_N_S + VORNQ_S + VORRQ_S + VQADDQ_S + VQADDQ_N_S + VQRSHLQ_S + VQRSHLQ_N_S + VQSHLQ_S + VQSHLQ_N_S + VQSHLQ_R_S + VQSUBQ_S + VQSUBQ_N_S + VRHADDQ_S + VRMULHQ_S + VRSHLQ_S + VRSHLQ_N_S + VRSHRQ_N_S + VSHLQ_N_S + VSHLQ_R_S + VSUBQ_S + VSUBQ_N_S + VABDQ_U + VADDQ_N_U + VADDVAQ_U + VADDVQ_P_U + VANDQ_U + VBICQ_U + VBRSRQ_N_U + VCADDQ_ROT270_U + VCADDQ_ROT90_U + VCMPEQQ_U + VCMPEQQ_N_U + VCMPNEQ_N_U + VEORQ_U + VHADDQ_U + VHADDQ_N_U + VHSUBQ_U + VHSUBQ_N_U + VMAXQ_U + VMAXVQ_U + VMINQ_U + VMINVQ_U + VMLADAVQ_U + VMULHQ_U + VMULLBQ_INT_U + VMULLTQ_INT_U + VMULQ_U + VMULQ_N_U + VORNQ_U + VORRQ_U + VQADDQ_U + VQADDQ_N_U + VQRSHLQ_U + VQRSHLQ_N_U + VQSHLQ_U + VQSHLQ_N_U + VQSHLQ_R_U + VQSUBQ_U + VQSUBQ_N_U + VRHADDQ_U + VRMULHQ_U + VRSHLQ_U + VRSHLQ_N_U + VRSHRQ_N_U + VSHLQ_N_U + VSHLQ_R_U + VSUBQ_U + VSUBQ_N_U + VCMPGEQ_N_S + VCMPGEQ_S + VCMPGTQ_N_S + VCMPGTQ_S + VCMPLEQ_N_S + VCMPLEQ_S + VCMPLTQ_N_S + VCMPLTQ_S + VHCADDQ_ROT270_S + VHCADDQ_ROT90_S + VMAXAQ_S + VMAXAVQ_S + VMINAQ_S + VMINAVQ_S + VMLADAVXQ_S + VMLSDAVQ_S + VMLSDAVXQ_S + VQDMULHQ_N_S + VQDMULHQ_S + VQRDMULHQ_N_S + VQRDMULHQ_S + VQSHLUQ_N_S + VCMPCSQ_N_U + VCMPCSQ_U + VCMPHIQ_N_U + VCMPHIQ_U + VABDQ_M_S + VABDQ_M_U + VABDQ_F + VADDQ_N_F + VANDQ_F + VBICQ_F + VCADDQ_ROT270_F + VCADDQ_ROT90_F + VCMPEQQ_F + VCMPEQQ_N_F + VCMPGEQ_F + VCMPGEQ_N_F + VCMPGTQ_F + VCMPGTQ_N_F + VCMPLEQ_F + VCMPLEQ_N_F + VCMPLTQ_F + VCMPLTQ_N_F + 
VCMPNEQ_F + VCMPNEQ_N_F + VCMULQ_F + VCMULQ_ROT180_F + VCMULQ_ROT270_F + VCMULQ_ROT90_F + VEORQ_F + VMAXNMAQ_F + VMAXNMAVQ_F + VMAXNMQ_F + VMAXNMVQ_F + VMINNMAQ_F + VMINNMAVQ_F + VMINNMQ_F + VMINNMVQ_F + VMULQ_F + VMULQ_N_F + VORNQ_F + VORRQ_F + VSUBQ_F + VADDLVAQ_U + VADDLVAQ_S + VBICQ_N_U + VBICQ_N_S + VCTP8Q_M + VCTP16Q_M + VCTP32Q_M + VCTP64Q_M + VCVTBQ_F16_F32 + VCVTTQ_F16_F32 + VMLALDAVQ_U + VMLALDAVXQ_U + VMLALDAVXQ_S + VMLALDAVQ_S + VMLSLDAVQ_S + VMLSLDAVXQ_S + VMOVNBQ_U + VMOVNBQ_S + VMOVNTQ_U + VMOVNTQ_S + VORRQ_N_S + VORRQ_N_U + VQDMULLBQ_N_S + VQDMULLBQ_S + VQDMULLTQ_N_S + VQDMULLTQ_S + VQMOVNBQ_U + VQMOVNBQ_S + VQMOVUNBQ_S + VQMOVUNTQ_S + VRMLALDAVHXQ_S + VRMLSLDAVHQ_S + VRMLSLDAVHXQ_S + VSHLLBQ_S + VSHLLBQ_U + VSHLLTQ_U + VSHLLTQ_S + VQMOVNTQ_U + VQMOVNTQ_S + VSHLLBQ_N_S + VSHLLBQ_N_U + VSHLLTQ_N_U + VSHLLTQ_N_S + VRMLALDAVHQ_U + VRMLALDAVHQ_S + VMULLTQ_POLY_P + VMULLBQ_POLY_P + VBICQ_M_N_S + VBICQ_M_N_U + VCMPEQQ_M_F + VCVTAQ_M_S + VCVTAQ_M_U + VCVTQ_M_TO_F_S + VCVTQ_M_TO_F_U + VQRSHRNBQ_N_U + VQRSHRNBQ_N_S + VQRSHRUNBQ_N_S + VRMLALDAVHAQ_S + VABAVQ_S + VABAVQ_U + VSHLCQ_S + VSHLCQ_U + VRMLALDAVHAQ_U + VABSQ_M_S + VADDVAQ_P_S + VADDVAQ_P_U + VCLSQ_M_S + VCLZQ_M_S + VCLZQ_M_U + VCMPCSQ_M_N_U + VCMPCSQ_M_U + VCMPEQQ_M_N_S + VCMPEQQ_M_N_U + VCMPEQQ_M_S + VCMPEQQ_M_U + VCMPGEQ_M_N_S + VCMPGEQ_M_S + VCMPGTQ_M_N_S + VCMPGTQ_M_S + VCMPHIQ_M_N_U + VCMPHIQ_M_U + VCMPLEQ_M_N_S + VCMPLEQ_M_S + VCMPLTQ_M_N_S + VCMPLTQ_M_S + VCMPNEQ_M_N_S + VCMPNEQ_M_N_U + VCMPNEQ_M_S + VCMPNEQ_M_U + VDUPQ_M_N_S + VDUPQ_M_N_U + VDWDUPQ_N_U + VDWDUPQ_WB_U + VIWDUPQ_N_U + VIWDUPQ_WB_U + VMAXAQ_M_S + VMAXAVQ_P_S + VMAXVQ_P_S + VMAXVQ_P_U + VMINAQ_M_S + VMINAVQ_P_S + VMINVQ_P_S + VMINVQ_P_U + VMLADAVAQ_S + VMLADAVAQ_U + VMLADAVQ_P_S + VMLADAVQ_P_U + VMLADAVXQ_P_S + VMLAQ_N_S + VMLAQ_N_U + VMLASQ_N_S + VMLASQ_N_U + VMLSDAVQ_P_S + VMLSDAVXQ_P_S + VMVNQ_M_S + VMVNQ_M_U + VNEGQ_M_S + VPSELQ_S + VPSELQ_U + VQABSQ_M_S + VQDMLAHQ_N_S + VQDMLASHQ_N_S + VQNEGQ_M_S + VQRDMLADHQ_S + VQRDMLADHXQ_S + VQRDMLAHQ_N_S + VQRDMLASHQ_N_S + VQRDMLSDHQ_S + VQRDMLSDHXQ_S + VQRSHLQ_M_N_S + VQRSHLQ_M_N_U + VQSHLQ_M_R_S + VQSHLQ_M_R_U + VREV64Q_M_S + VREV64Q_M_U + VRSHLQ_M_N_S + VRSHLQ_M_N_U + VSHLQ_M_R_S + VSHLQ_M_R_U + VSLIQ_N_S + VSLIQ_N_U + VSRIQ_N_S + VSRIQ_N_U + VQDMLSDHXQ_S + VQDMLSDHQ_S + VQDMLADHXQ_S + VQDMLADHQ_S + VMLSDAVAXQ_S + VMLSDAVAQ_S + VMLADAVAXQ_S + VCMPGEQ_M_F + VCMPGTQ_M_N_F + VMLSLDAVQ_P_S + VRMLALDAVHAXQ_S + VMLSLDAVXQ_P_S + VFMAQ_F + VMLSLDAVAQ_S + VQSHRUNBQ_N_S + VQRSHRUNTQ_N_S + VCMLAQ_F + VMINNMAQ_M_F + VFMASQ_N_F + VDUPQ_M_N_F + VCMPGTQ_M_F + VCMPLTQ_M_F + VRMLSLDAVHQ_P_S + VQSHRUNTQ_N_S + VABSQ_M_F + VMAXNMAVQ_P_F + VFMAQ_N_F + VRMLSLDAVHXQ_P_S + VREV32Q_M_F + VRMLSLDAVHAQ_S + VRMLSLDAVHAXQ_S + VCMPLTQ_M_N_F + VCMPNEQ_M_F + VRNDAQ_M_F + VRNDPQ_M_F + VADDLVAQ_P_S + VQMOVUNBQ_M_S + VCMPLEQ_M_F + VCMLAQ_ROT180_F + VMLSLDAVAXQ_S + VRNDXQ_M_F + VFMSQ_F + VMINNMVQ_P_F + VMAXNMVQ_P_F + VPSELQ_F + VCMLAQ_ROT90_F + VQMOVUNTQ_M_S + VREV64Q_M_F + VNEGQ_M_F + VRNDMQ_M_F + VCMPLEQ_M_N_F + VCMPGEQ_M_N_F + VRNDNQ_M_F + VMINNMAVQ_P_F + VCMPNEQ_M_N_F + VRMLALDAVHQ_P_S + VRMLALDAVHXQ_P_S + VCMPEQQ_M_N_F + VCMLAQ_ROT270_F + VMAXNMAQ_M_F + VRNDQ_M_F + VMLALDAVQ_P_U + VMLALDAVQ_P_S + VQMOVNBQ_M_S + VQMOVNBQ_M_U + VMOVLTQ_M_U + VMOVLTQ_M_S + VMOVNBQ_M_U + VMOVNBQ_M_S + VRSHRNTQ_N_U + VRSHRNTQ_N_S + VORRQ_M_N_S + VORRQ_M_N_U + VREV32Q_M_S + VREV32Q_M_U + VQRSHRNTQ_N_U + VQRSHRNTQ_N_S + VMOVNTQ_M_U + VMOVNTQ_M_S + VMOVLBQ_M_U + VMOVLBQ_M_S + VMLALDAVAQ_S + VMLALDAVAQ_U + VQSHRNBQ_N_U + VQSHRNBQ_N_S + VSHRNBQ_N_U + VSHRNBQ_N_S + 
VRSHRNBQ_N_S + VRSHRNBQ_N_U + VMLALDAVXQ_P_U + VMLALDAVXQ_P_S + VQMOVNTQ_M_U + VQMOVNTQ_M_S + VMVNQ_M_N_U + VMVNQ_M_N_S + VQSHRNTQ_N_U + VQSHRNTQ_N_S + VMLALDAVAXQ_S + VMLALDAVAXQ_U + VSHRNTQ_N_S + VSHRNTQ_N_U + VCVTBQ_M_F16_F32 + VCVTBQ_M_F32_F16 + VCVTTQ_M_F16_F32 + VCVTTQ_M_F32_F16 + VCVTMQ_M_S + VCVTMQ_M_U + VCVTNQ_M_S + VCVTPQ_M_S + VCVTPQ_M_U + VCVTQ_M_N_FROM_F_S + VCVTNQ_M_U + VREV16Q_M_S + VREV16Q_M_U + VREV32Q_M + VCVTQ_M_FROM_F_U + VCVTQ_M_FROM_F_S + VRMLALDAVHQ_P_U + VADDLVAQ_P_U + VCVTQ_M_N_FROM_F_U + VQSHLUQ_M_N_S + VABAVQ_P_S + VABAVQ_P_U + VSHLQ_M_S + VSHLQ_M_U + VSRIQ_M_N_S + VSRIQ_M_N_U + VSUBQ_M_U + VSUBQ_M_S + VCVTQ_M_N_TO_F_U + VCVTQ_M_N_TO_F_S + VQADDQ_M_U + VQADDQ_M_S + VRSHRQ_M_N_S + VSUBQ_M_N_S + VSUBQ_M_N_U + VBRSRQ_M_N_S + VSUBQ_M_N_F + VBICQ_M_F + VHADDQ_M_U + VBICQ_M_U + VBICQ_M_S + VMULQ_M_N_U + VHADDQ_M_S + VORNQ_M_F + VMLAQ_M_N_S + VQSUBQ_M_U + VQSUBQ_M_S + VMLAQ_M_N_U + VQSUBQ_M_N_U + VQSUBQ_M_N_S + VMULLTQ_INT_M_S + VMULLTQ_INT_M_U + VMULQ_M_N_S + VMULQ_M_N_F + VMLASQ_M_N_U + VMLASQ_M_N_S + VMAXQ_M_U + VQRDMLAHQ_M_N_U + VCADDQ_ROT270_M_F + VCADDQ_ROT270_M_U + VCADDQ_ROT270_M_S + VQRSHLQ_M_S + VMULQ_M_F + VRHADDQ_M_U + VSHRQ_M_N_U + VRHADDQ_M_S + VMULQ_M_S + VMULQ_M_U + VQDMLASHQ_M_N_S + VQRDMLASHQ_M_N_S + VRSHLQ_M_S + VRSHLQ_M_U + VRSHRQ_M_N_U + VADDQ_M_N_F + VADDQ_M_N_S + VADDQ_M_N_U + VQRDMLASHQ_M_N_U + VMAXQ_M_S + VQRDMLAHQ_M_N_S + VORRQ_M_S + VORRQ_M_U + VORRQ_M_F + VQRSHLQ_M_U + VRMULHQ_M_U + VRMULHQ_M_S + VMINQ_M_S + VMINQ_M_U + VANDQ_M_F + VANDQ_M_U + VANDQ_M_S + VHSUBQ_M_N_S + VHSUBQ_M_N_U + VMULHQ_M_S + VMULHQ_M_U + VMULLBQ_INT_M_U + VMULLBQ_INT_M_S + VCADDQ_ROT90_M_F + VSHRQ_M_N_S + VADDQ_M_U + VSLIQ_M_N_U + VQADDQ_M_N_S + VBRSRQ_M_N_F + VABDQ_M_F + VBRSRQ_M_N_U + VEORQ_M_F + VSHLQ_M_N_S + VQDMLAHQ_M_N_U + VQDMLAHQ_M_N_S + VSHLQ_M_N_U + VMLADAVAQ_P_U + VMLADAVAQ_P_S + VSLIQ_M_N_S + VQSHLQ_M_U + VQSHLQ_M_S + VCADDQ_ROT90_M_U + VCADDQ_ROT90_M_S + VORNQ_M_U + VORNQ_M_S + VQSHLQ_M_N_S + VQSHLQ_M_N_U + VADDQ_M_S + VHADDQ_M_N_S + VADDQ_M_F + VQADDQ_M_N_U + VEORQ_M_S + VEORQ_M_U + VHSUBQ_M_S + VHSUBQ_M_U + VHADDQ_M_N_U + VHCADDQ_ROT90_M_S + VQRDMLSDHQ_M_S + VQRDMLSDHXQ_M_S + VQRDMLADHXQ_M_S + VQDMULHQ_M_S + VMLADAVAXQ_P_S + VQDMLADHXQ_M_S + VQRDMULHQ_M_S + VMLSDAVAXQ_P_S + VQDMULHQ_M_N_S + VHCADDQ_ROT270_M_S + VQDMLSDHQ_M_S + VQDMLSDHXQ_M_S + VMLSDAVAQ_P_S + VQRDMLADHQ_M_S + VQDMLADHQ_M_S + VMLALDAVAQ_P_U + VMLALDAVAQ_P_S + VQRSHRNBQ_M_N_U + VQRSHRNBQ_M_N_S + VQRSHRNTQ_M_N_S + VQSHRNBQ_M_N_U + VQSHRNBQ_M_N_S + VQSHRNTQ_M_N_S + VRSHRNBQ_M_N_U + VRSHRNBQ_M_N_S + VRSHRNTQ_M_N_U + VSHLLBQ_M_N_U + VSHLLBQ_M_N_S + VSHLLTQ_M_N_U + VSHLLTQ_M_N_S + VSHRNBQ_M_N_S + VSHRNBQ_M_N_U + VSHRNTQ_M_N_S + VSHRNTQ_M_N_U + VMLALDAVAXQ_P_S + VQRSHRNTQ_M_N_U + VQSHRNTQ_M_N_U + VRSHRNTQ_M_N_S + VQRDMULHQ_M_N_S + VRMLALDAVHAQ_P_S + VMLSLDAVAQ_P_S + VMLSLDAVAXQ_P_S + VMULLBQ_POLY_M_P + VMULLTQ_POLY_M_P + VQDMULLBQ_M_N_S + VQDMULLBQ_M_S + VQDMULLTQ_M_N_S + VQDMULLTQ_M_S + VQRSHRUNBQ_M_N_S + VQSHRUNBQ_M_N_S + VQSHRUNTQ_M_N_S + VRMLALDAVHAQ_P_U + VRMLALDAVHAXQ_P_S + VRMLSLDAVHAQ_P_S + VRMLSLDAVHAXQ_P_S + VQRSHRUNTQ_M_N_S + VCMLAQ_M_F + VCMLAQ_ROT180_M_F + VCMLAQ_ROT270_M_F + VCMLAQ_ROT90_M_F + VCMULQ_M_F + VCMULQ_ROT180_M_F + VCMULQ_ROT270_M_F + VCMULQ_ROT90_M_F + VFMAQ_M_F + VFMAQ_M_N_F + VFMASQ_M_N_F + VFMSQ_M_F + VMAXNMQ_M_F + VMINNMQ_M_F + VSUBQ_M_F + VSTRWQSB_S + VSTRWQSB_U + VSTRBQSO_S + VSTRBQSO_U + VSTRBQ_S + VSTRBQ_U + VLDRBQGO_S + VLDRBQGO_U + VLDRBQ_S + VLDRBQ_U + VLDRWQGB_S + VLDRWQGB_U + VLD1Q_F + VLD1Q_S + VLD1Q_U + VLDRHQ_F + VLDRHQGO_S + VLDRHQGO_U + VLDRHQGSO_S + 
VLDRHQGSO_U + VLDRHQ_S + VLDRHQ_U + VLDRWQ_F + VLDRWQ_S + VLDRWQ_U + VLDRDQGB_S + VLDRDQGB_U + VLDRDQGO_S + VLDRDQGO_U + VLDRDQGSO_S + VLDRDQGSO_U + VLDRHQGO_F + VLDRHQGSO_F + VLDRWQGB_F + VLDRWQGO_F + VLDRWQGO_S + VLDRWQGO_U + VLDRWQGSO_F + VLDRWQGSO_S + VLDRWQGSO_U + VSTRHQ_F + VST1Q_S + VST1Q_U + VSTRHQSO_S + VSTRHQ_U + VSTRWQ_S + VSTRWQ_U + VSTRWQ_F + VST1Q_F + VSTRDQSB_S + VSTRDQSB_U + VSTRDQSO_S + VSTRDQSO_U + VSTRDQSSO_S + VSTRDQSSO_U + VSTRWQSO_S + VSTRWQSO_U + VSTRWQSSO_S + VSTRWQSSO_U + VSTRHQSO_F + VSTRHQSSO_F + VSTRWQSB_F + VSTRWQSO_F + VSTRWQSSO_F + VDDUPQ + VDDUPQ_M + VDWDUPQ + VDWDUPQ_M + VIDUPQ + VIDUPQ_M + VIWDUPQ + VIWDUPQ_M + VSTRWQSBWB_S + VSTRWQSBWB_U + VLDRWQGBWB_S + VLDRWQGBWB_U + VSTRWQSBWB_F + VLDRWQGBWB_F + VSTRDQSBWB_S + VSTRDQSBWB_U + VLDRDQGBWB_S + VLDRDQGBWB_U + VADCQ_U + VADCQ_M_U + VADCQ_S + VADCQ_M_S + VSBCIQ_U + VSBCIQ_S + VSBCIQ_M_U + VSBCIQ_M_S + VSBCQ_U + VSBCQ_S + VSBCQ_M_U + VSBCQ_M_S + VADCIQ_U + VADCIQ_M_U + VADCIQ_S + VADCIQ_M_S + VLD2Q + VLD4Q + VST2Q + VSHLCQ_M_U + VSHLCQ_M_S + VSTRHQSO_U + VSTRHQSSO_S + VSTRHQSSO_U + VSTRHQ_S + SRSHRL + SRSHR + URSHR + URSHRL + SQRSHR + UQRSHL + UQRSHLL_64 + UQRSHLL_48 + SQRSHRL_64 + SQRSHRL_48 + VSHLCQ_M_ +]) diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md index b7e3619..c3c86c4 100644 --- a/gcc/config/arm/vec-common.md +++ b/gcc/config/arm/vec-common.md @@ -81,43 +81,11 @@ ;; patterns separately for Neon, IWMMXT and MVE. (define_expand "add<mode>3" - [(set (match_operand:VNIM 0 "s_register_operand") - (plus:VNIM (match_operand:VNIM 1 "s_register_operand") - (match_operand:VNIM 2 "s_register_operand")))] - "(TARGET_NEON && ((<MODE>mode != V2SFmode && <MODE>mode != V4SFmode) - || flag_unsafe_math_optimizations)) - || (TARGET_REALLY_IWMMXT && VALID_IWMMXT_REG_MODE (<MODE>mode)) - || (TARGET_HAVE_MVE && VALID_MVE_SI_MODE(<MODE>mode)) - || (TARGET_HAVE_MVE_FLOAT && VALID_MVE_SF_MODE(<MODE>mode))" -{ -}) - -;; Vector arithmetic. Expanders are blank, then unnamed insns implement -;; patterns separately for Neon and MVE. - -(define_expand "addv8hf3" - [(set (match_operand:V8HF 0 "s_register_operand") - (plus:V8HF (match_operand:V8HF 1 "s_register_operand") - (match_operand:V8HF 2 "s_register_operand")))] - "(TARGET_HAVE_MVE_FLOAT && VALID_MVE_SF_MODE(V8HFmode)) - || (TARGET_NEON_FP16INST && flag_unsafe_math_optimizations)" -{ - if (TARGET_NEON_FP16INST && flag_unsafe_math_optimizations) - emit_insn (gen_addv8hf3_neon (operands[0], operands[1], operands[2])); -}) - -;; Vector arithmetic. Expanders are blank, then unnamed insns implement -;; patterns separately for Neon and IWMMXT. - -(define_expand "add<mode>3" - [(set (match_operand:VNINOTM 0 "s_register_operand") - (plus:VNINOTM (match_operand:VNINOTM 1 "s_register_operand") - (match_operand:VNINOTM 2 "s_register_operand")))] - "(TARGET_NEON && ((<MODE>mode != V2SFmode && <MODE>mode != V4SFmode) - || flag_unsafe_math_optimizations)) - || (TARGET_REALLY_IWMMXT && VALID_IWMMXT_REG_MODE (<MODE>mode))" -{ -}) + [(set (match_operand:VDQ 0 "s_register_operand") + (plus:VDQ (match_operand:VDQ 1 "s_register_operand") + (match_operand:VDQ 2 "s_register_operand")))] + "ARM_HAVE_<MODE>_ARITH" +) ;; Vector arithmetic. Expanders are blank, then unnamed insns implement ;; patterns separately for IWMMXT and Neon. 
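[Editor's note] The vec-common.md hunk above folds the three separate add<mode>3 expanders (Neon, IWMMXT, MVE) into a single expander whose condition is just ARM_HAVE_<MODE>_ARITH, so the per-mode enabling logic moves out of the machine description and into target macros. As a minimal sketch only — the real definitions belong in gcc/config/arm/arm.h and their exact names and bodies may differ from this — each per-mode macro simply gathers the conditions that the removed expanders used to test inline:

/* Illustrative sketch, not the actual gcc/config/arm/arm.h code: one
   ARM_HAVE_<MODE>_ARITH macro per vector mode, collecting the Neon/MVE
   conditions that the three deleted add<mode>3 expanders tested inline.  */
#define ARM_HAVE_V16QI_ARITH \
  (TARGET_NEON \
   || (TARGET_HAVE_MVE && VALID_MVE_SI_MODE (V16QImode)))

#define ARM_HAVE_V4SF_ARITH \
  ((TARGET_NEON && flag_unsafe_math_optimizations) \
   || (TARGET_HAVE_MVE_FLOAT && VALID_MVE_SF_MODE (V4SFmode)))

Centralising the condition this way lets other vec-common.md expanders reuse the same per-mode test instead of repeating the Neon/IWMMXT/MVE logic in every pattern.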
diff --git a/gcc/config/arm/vfp.md b/gcc/config/arm/vfp.md index 3470679..e6c287c 100644 --- a/gcc/config/arm/vfp.md +++ b/gcc/config/arm/vfp.md @@ -391,11 +391,11 @@ (define_insn "*mov<mode>_vfp_<mode>16" [(set (match_operand:HFBF 0 "nonimmediate_operand" - "= ?r,?m,t,r,t,r,t, t, Um,r") + "= ?r,?m,t,r,t,r,t, t, Uj,r") (match_operand:HFBF 1 "general_operand" - " m,r,t,r,r,t,Dv,Um,t, F"))] + " m,r,t,r,r,t,Dv,Uj,t, F"))] "TARGET_32BIT - && TARGET_VFP_FP16INST + && (TARGET_VFP_FP16INST || TARGET_HAVE_MVE) && (s_register_operand (operands[0], <MODE>mode) || s_register_operand (operands[1], <MODE>mode))" { @@ -415,12 +415,12 @@ return \"vmov.f16\\t%0, %1\t%@ __<fporbf>\"; case 7: /* S register from memory. */ if (TARGET_HAVE_MVE) - return \"vldr.16\\t%0, %A1\"; + return \"vldr.16\\t%0, %1\"; else return \"vld1.16\\t{%z0}, %A1\"; case 8: /* Memory from S register. */ if (TARGET_HAVE_MVE) - return \"vstr.16\\t%1, %A0\"; + return \"vstr.16\\t%1, %0\"; else return \"vst1.16\\t{%z1}, %A0\"; case 9: /* ARM register from constant. */ @@ -2125,7 +2125,7 @@ (match_operand:DF 1 "const_double_operand" "F")) (clobber (match_operand:DF 2 "s_register_operand" "=r"))] "arm_disable_literal_pool - && TARGET_HARD_FLOAT + && TARGET_VFP_BASE && !arm_const_double_rtx (operands[1]) && !(TARGET_VFP_DOUBLE && vfp3_const_double_rtx (operands[1]))" "#" @@ -2151,7 +2151,7 @@ (match_operand:SF 1 "const_double_operand" "E")) (clobber (match_operand:SF 2 "s_register_operand" "=r"))] "arm_disable_literal_pool - && TARGET_HARD_FLOAT + && TARGET_VFP_BASE && !vfp3_const_double_rtx (operands[1])" "#" "" diff --git a/gcc/config/bpf/bpf-helpers.def b/gcc/config/bpf/bpf-helpers.def deleted file mode 100644 index 249ad24..0000000 --- a/gcc/config/bpf/bpf-helpers.def +++ /dev/null @@ -1,194 +0,0 @@ -/* Kernel helpers database. - Copyright (C) 2019-2020 Free Software Foundation, Inc. - -This file is part of GCC. - -GCC is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 3, or (at your option) -any later version. - -GCC is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with GCC; see the file COPYING3. If not see -<http://www.gnu.org/licenses/>. */ - -/* This file contains the definition of the kernel helpers that are - available to eBPF programs. - - The primary source for information on kernel helpers is the - linux/include/uapi/linux/bpf.h file in the Linux source tree. - Please keep this database in sync. - - The first column is the first kernel version featuring the helper - function. This should be an enumerate from bpf_kernel_version, - defined in bpf-opts.h. Note that the backend assumes that helpers - never get deprecated in the kernel. If that eventually happens, - then we will need to use a bitmask here instead of an enumerate. - - The second column is the constant-name for the helper. - The third column is the program-name of the helper. - - The fourth column is a list of names describing the types of the - values returned and accepted by the helper, in one of these forms: - - TYPES (type1, type2, ..., 0) - VTYPES (type1, type2, ..., 0) - - VTYPES should be used should the helper accept a variable number of - arguments, TYPES otherwise. 
The valid type names are: - - `vt' for void. - `it' for signed int. - `ut' for unsigned int. - `pt' for void*. - `cpt' for const void*. - `st' for short int. - `ust' for unsigned short int. - `cst' for const char *. - `ullt' for unsigned long long. - `llt' for long long. - `u32t' for uint32. - `u64t' for uint64. - - In types descriptions, the firt entry corresponds to the value - returned by the helper. Subsequent names correspond to the helper - arguments. Finally, a 0 should close the list. - - VERY IMPORTANT: the helper entries should be listed in the same - order than in the definition of __BPF_FUNC_MAPPER in - linux/include/uapi/linux/bpf.h! */ - -DEF_HELPER (LINUX_V4_0, MAP_LOOKUP_ELEM, map_lookup_elem, TYPES (pt, pt, pt, 0)) -DEF_HELPER (LINUX_V4_0, MAP_UPDATE_ELEM, map_update_elem, TYPES (it, pt, pt, pt, ullt, 0)) -DEF_HELPER (LINUX_V4_0, MAP_DELETE_ELEM, map_delete_elem, TYPES (it, pt, pt, 0)) -DEF_HELPER (LINUX_V4_1, PROBE_READ, probe_read, TYPES (it, pt, ut, cpt, 0)) -DEF_HELPER (LINUX_V4_1, KTIME_GET_NS, ktime_get_ns, TYPES (ullt, 0)) -DEF_HELPER (LINUX_V4_1, TRACE_PRINTK, trace_printk, VTYPES (it, cst, it, 0)) -DEF_HELPER (LINUX_V4_1, GET_PRANDOM_U32, get_prandom_u32, TYPES (ullt, 0)) -DEF_HELPER (LINUX_V4_1, GET_SMP_PROCESSOR_ID, get_smp_processor_id, TYPES (ullt, 0)) -DEF_HELPER (LINUX_V4_1, SKB_STORE_BYTES, skb_store_bytes, TYPES (it, pt, it, pt, it, it, 0)) -DEF_HELPER (LINUX_V4_1, L3_CSUM_REPLACE, l3_csum_replace, TYPES (it, pt, it, it ,it ,it, 0)) -DEF_HELPER (LINUX_V4_1, L4_CSUM_REPLACE, l4_csum_replace, TYPES (it, pt, it, it, it, it, 0)) -DEF_HELPER (LINUX_V4_2, TAIL_CALL, tail_call, TYPES (vt, pt, pt, it, 0)) -DEF_HELPER (LINUX_V4_2, CLONE_REDIRECT, clone_redirect, TYPES (it, pt, it, it, 0)) -DEF_HELPER (LINUX_V4_2, GET_CURRENT_PID_TGID, get_current_pid_tgid, TYPES (ullt, 0)) -DEF_HELPER (LINUX_V4_2, GET_CURRENT_UID_GID, get_current_uid_gid, TYPES (ullt, 0)) -DEF_HELPER (LINUX_V4_2, GET_CURRENT_COMM, get_current_comm, TYPES (it, pt, it, 0)) -DEF_HELPER (LINUX_V4_3, GET_CGROUP_CLASSID, get_cgroup_classid, TYPES (it, pt, 0)) -DEF_HELPER (LINUX_V4_3, SKB_VLAN_PUSH, skb_vlan_push, TYPES (it, pt, st, ust, 0)) -DEF_HELPER (LINUX_V4_3, SKB_VLAN_POP, skb_vlan_pop, TYPES (it, pt, 0)) -DEF_HELPER (LINUX_V4_3, SKB_GET_TUNNEL_KEY, skb_get_tunnel_key, TYPES (it, pt, pt, it, it, 0)) -DEF_HELPER (LINUX_V4_3, SKB_SET_TUNNEL_KEY, skb_set_tunnel_key, TYPES (it, pt, pt, it, it, 0)) -DEF_HELPER (LINUX_V4_3, PERF_EVENT_READ, perf_event_read, TYPES (ullt, pt, ullt, 0)) -DEF_HELPER (LINUX_V4_4, REDIRECT, redirect, TYPES (it, it, it, 0)) -DEF_HELPER (LINUX_V4_4, GET_ROUTE_REALM, get_route_realm, TYPES (ut, pt, 0)) -DEF_HELPER (LINUX_V4_4, PERF_EVENT_OUTPUT, perf_event_output, \ - TYPES (it, pt, pt, ullt, pt, it, 0)) -DEF_HELPER (LINUX_V4_5, SKB_LOAD_BYTES, skb_load_bytes, TYPES (it, pt, it, pt, it, 0)) -DEF_HELPER (LINUX_V4_6, GET_STACKID, get_stackid, TYPES (it, pt, pt, it, 0)) -DEF_HELPER (LINUX_V4_6, CSUM_DIFF, csum_diff, TYPES (it, pt, it, pt, it, it, 0)) -DEF_HELPER (LINUX_V4_6, SKB_GET_TUNNEL_OPT, skb_get_tunnel_opt, TYPES (it, pt, pt, it, 0)) -DEF_HELPER (LINUX_V4_6, SKB_SET_TUNNEL_OPT, skb_set_tunnel_opt, TYPES (it, pt, pt, it, 0)) -DEF_HELPER (LINUX_V4_8, SKB_CHANGE_PROTO, skb_change_proto, TYPES (it, pt, st, u64t, 0)) -DEF_HELPER (LINUX_V4_8, SKB_CHANGE_TYPE, skb_change_type, TYPES (it, pt, u32t, 0)) -DEF_HELPER (LINUX_V4_8, SKB_UNDER_CGROUP, skb_under_cgroup, TYPES (it, pt, pt, it, 0)) -DEF_HELPER (LINUX_V4_8, GET_HASH_RECALC, get_hash_recalc, TYPES (ut, pt, 0)) -DEF_HELPER 
(LINUX_V4_8, GET_CURRENT_TASK, get_current_task, TYPES (ullt, 0)) -DEF_HELPER (LINUX_V4_8, PROBE_WRITE_USER, probe_write_user, TYPES (it, pt, cpt, ut, 0)) -DEF_HELPER (LINUX_V4_9, CURRENT_TASK_UNDER_CGROUP, current_task_under_cgroup, \ - TYPES (it, pt, it, 0)) -DEF_HELPER (LINUX_V4_9, SKB_CHANGE_TAIL, skb_change_tail, TYPES (it, pt, ut, u64t, 0)) -DEF_HELPER (LINUX_V4_9, SKB_PULL_DATA, skb_pull_data, TYPES (it, pt, it, 0)) -DEF_HELPER (LINUX_V4_9, CSUM_UPDATE, csum_update, TYPES (llt, pt, u32t, 0)) -DEF_HELPER (LINUX_V4_9, SET_HASH_INVALID, set_hash_invalid, TYPES (vt, pt, 0)) -DEF_HELPER (LINUX_V4_10, GET_NUMA_NODE_ID, get_numa_node_id, TYPES (it, 0)) -DEF_HELPER (LINUX_V4_10, SKB_CHANGE_HEAD, skb_change_head, TYPES (it, pt, it, it, 0)) -DEF_HELPER (LINUX_V4_10, XDP_ADJUST_HEAD, xdp_adjust_head, TYPES (it, pt, it, 0)) -DEF_HELPER (LINUX_V4_11, PROBE_READ_STR, probe_read_str, TYPES (it, pt, u32t, cpt, 0)) -DEF_HELPER (LINUX_V4_12, GET_SOCKET_COOKIE, get_socket_cookie, TYPES (it, pt, 0)) -DEF_HELPER (LINUX_V4_12, GET_SOCKET_UID, get_socket_uid, TYPES (ut, pt, 0)) -DEF_HELPER (LINUX_V4_13, SET_HASH, set_hash, TYPES (ut, pt, u32t, 0)) -DEF_HELPER (LINUX_V4_13, SETSOCKOPT, setsockopt, TYPES (it, pt, it, it, pt, it, 0)) -DEF_HELPER (LINUX_V4_13, SKB_ADJUST_ROOM, skb_adjust_room, TYPES (it, pt, st, u32t, ullt, 0)) -DEF_HELPER (LINUX_V4_14, REDIRECT_MAP, redirect_map, TYPES (it, pt, it, it, 0)) -DEF_HELPER (LINUX_V4_14, SK_REDIRECT_MAP, sk_redirect_map, TYPES (it, pt, pt, it, it, 0)) -DEF_HELPER (LINUX_V4_14, SOCK_MAP_UPDATE, sock_map_update, TYPES (it, pt, pt, pt, ullt, 0)) -DEF_HELPER (LINUX_V4_15, XDP_ADJUST_META, xdp_adjust_meta, TYPES (it, pt, it, 0)) -DEF_HELPER (LINUX_V4_15, PERF_EVENT_READ_VALUE, perf_event_read_value, - TYPES (it, pt, ullt, pt, ut, 0)) -DEF_HELPER (LINUX_V4_15, PERF_PROG_READ_VALUE, perf_prog_read_value, - TYPES (it, pt, pt, ut, 0)) -DEF_HELPER (LINUX_V4_15, GETSOCKOPT, getsockopt, TYPES (it, pt, it, it, pt, it, 0)) - -DEF_HELPER (LINUX_V4_16, OVERRIDE_RETURN, override_return, TYPES (it, pt, ult, 0)) -DEF_HELPER (LINUX_V4_16, SOCK_OPS_CB_FLAGS_SET, sock_ops_cb_flags_set, TYPES (it, pt, it, 0)) -DEF_HELPER (LINUX_V4_17, MSG_REDIRECT_MAP, msg_redirect_map, TYPES (it, pt, pt, it, it, 0)) -DEF_HELPER (LINUX_V4_17, MSG_APPLY_BYTES, msg_apply_bytes, TYPES (it, pt, it, 0)) -DEF_HELPER (LINUX_V4_17, MSG_CORK_BYTES, msg_cork_bytes, TYPES (it, pt, it, 0)) -DEF_HELPER (LINUX_V4_17, MSG_PULL_DATA, msg_pull_data, TYPES (it, pt, it, it, it, 0)) -DEF_HELPER (LINUX_V4_17, BIND, bind, TYPES (it, pt, pt, it, 0)) -DEF_HELPER (LINUX_V4_18, XDP_ADJUST_TAIL, xdp_adjust_tail, TYPES (it, pt, it, 0)) -DEF_HELPER (LINUX_V4_18, SKB_GET_XFRM_STATE, - skb_get_xfrm_state, TYPES (it, pt, it, pt, it, it, 0)) -DEF_HELPER (LINUX_V4_18, GET_STACK, get_stack, TYPES (it, pt, pt, it, it, 0)) -DEF_HELPER (LINUX_V4_18, SKB_LOAD_BYTES_RELATIVE, skb_load_bytes_relative, - TYPES (it, pt, it, pt, it, ut, 0)) -DEF_HELPER (LINUX_V4_18, FIB_LOOKUP, fib_lookup, TYPES (it, pt, pt, it, ut, 0)) -DEF_HELPER (LINUX_V4_18, SOCK_HASH_UPDATE, sock_hash_update, TYPES (it, pt, pt, pt, ullt, 0)) -DEF_HELPER (LINUX_V4_18, MSG_REDIRECT_HASH, msg_redirect_hash, TYPES (it, pt, pt, pt, it, 0)) -DEF_HELPER (LINUX_V4_18, SK_REDIRECT_HASH, sk_redirect_hash, TYPES (it, pt, pt, pt, it, 0)) -DEF_HELPER (LINUX_V4_18, LWT_PUSH_ENCAP, lwt_push_encap, TYPES (it, pt, ut, pt, ut, 0)) -DEF_HELPER (LINUX_V4_18, LWT_SEG6_STORE_BYTES, lwt_seg6_store_bytes, - TYPES (it, pt, ut, pt, ut, 0)) -DEF_HELPER (LINUX_V4_18, LWT_SEG6_ADJUST_SRH, 
lwt_seg6_adjust_srh, TYPES (it, pt, ut, ut, 0)) -DEF_HELPER (LINUX_V4_18, LWT_SEG6_ACTION, lwt_seg6_action, TYPES (it, pt, ut, pt, ut, 0)) -DEF_HELPER (LINUX_V4_18, RC_REPEAT, rc_repeat, TYPES (it, pt, 0)) -DEF_HELPER (LINUX_V4_18, RC_KEYDOWN, rc_keydown, TYPES (it, pt, ut, ullt, ut, 0)) -DEF_HELPER (LINUX_V4_18, SKB_CGROUP_ID, skb_cgroup_id, TYPES (ullt, pt, 0)) -DEF_HELPER (LINUX_V4_18, GET_CURRENT_CGROUP_ID, get_current_cgroup_id, TYPES (ullt, 0)) -DEF_HELPER (LINUX_V4_19, GET_LOCAL_STORAGE, get_local_storage, TYPES (pt, pt, ullt, 0)) -DEF_HELPER (LINUX_V4_19, SK_SELECT_REUSEPORT, sk_select_reuseport, - TYPES (it, pt, pt, pt, ut, 0)) -DEF_HELPER (LINUX_V4_19, SKB_ANCESTOR_CGROUP_ID, skb_ancestor_cgroup_id, - TYPES (ullt, pt, it, 0)) -DEF_HELPER (LINUX_V4_20, SK_LOOKUP_TCP, sk_lookup_tcp, TYPES (pt, pt, pt, it, ullt, ullt, 0)) -DEF_HELPER (LINUX_V4_20, SK_LOOKUP_UDP, sk_lookup_udp, TYPES (pt, pt, pt, it, ullt, ullt, 0)) -DEF_HELPER (LINUX_V4_20, SK_RELEASE, sk_release, TYPES (it, pt, 0)) -DEF_HELPER (LINUX_V4_20, MAP_PUSH_ELEM, map_push_elem, TYPES (it, pt, pt, ullt, 0)) -DEF_HELPER (LINUX_V4_20, MAP_POP_ELEM, map_pop_elem, TYPES (it, pt, pt, 0)) -DEF_HELPER (LINUX_V4_20, MAP_PEEK_ELEM, map_peek_elem, TYPES (it, pt, pt, 0)) -DEF_HELPER (LINUX_V4_20, MSG_PUSH_DATA, msg_push_data, TYPES (it, pt, it, it, it, 0)) -DEF_HELPER (LINUX_V5_0, MSG_POP_DATA, msg_pop_data, TYPES (it, pt, it, it, it, 0)) -DEF_HELPER (LINUX_V5_0, RC_POINTER_REL, rc_pointer_rel, TYPES (it, pt, it, it, 0)) -DEF_HELPER (LINUX_V5_1, SPIN_LOCK, spin_lock, TYPES (vt, pt, 0)) -DEF_HELPER (LINUX_V5_1, SPIN_UNLOCK, spin_unlock, TYPES (vt, pt, 0)) -DEF_HELPER (LINUX_V5_1, SK_FULLSOCK, sk_fullsock, TYPES (pt, pt, 0)) -DEF_HELPER (LINUX_V5_1, TCP_SOCK, tcp_sock, TYPES (pt, pt, 0)) -DEF_HELPER (LINUX_V5_1, SKB_ECN_SET_CE, skb_ecn_set_ce, TYPES (it, pt, 0)) -DEF_HELPER (LINUX_V5_1, GET_LISTENER_SOCK, get_listener_sock, TYPES (pt, pt, 0)) -DEF_HELPER (LINUX_V5_2, SKC_LOOKUP_TCP, skc_lookup_tcp, - TYPES (pt, pt, pt, u32t, u64t, u64t, 0)) -DEF_HELPER (LINUX_V5_2, TCP_CHECK_SYNCOOKIE, tcp_check_syncookie, - TYPES (it, pt, pt, u32t, pt, u32t, 0)) -DEF_HELPER (LINUX_V5_2, SYSCTL_GET_NAME, sysctl_get_name, TYPES (it, pt, pt, ullt, u64t, 0)) -DEF_HELPER (LINUX_V5_2, SYSCTL_GET_CURRENT_VALUE, sysctl_get_current_value, - TYPES (it, pt, pt, ullt, 0)) -DEF_HELPER (LINUX_V5_2, SYSCTL_GET_NEW_VALUE, sysctl_get_new_value, - TYPES (it, pt, pt, ullt, 0)) -DEF_HELPER (LINUX_V5_2, SYSCTL_SET_NEW_VALUE, sysctl_set_new_value, - TYPES (it, pt, pt, ullt, 0)) -DEF_HELPER (LINUX_V5_2, STRTOL, strtol, TYPES (it, cst, ullt, u64t, pt, 0)) -DEF_HELPER (LINUX_V5_2, STRTOUL, strtoul, TYPES (it, pt, ullt, u64t, pt, 0)) -DEF_HELPER (LINUX_V5_2, SK_STORAGE_GET, sk_storage_get, TYPES (pt, pt, pt, pt, u64t, 0)) -DEF_HELPER (LINUX_V5_2, SK_STORAGE_DELETE, sk_storage_delete, TYPES (it, pt, pt, 0)) - -/* -Local variables: -mode:c -End: -*/ diff --git a/gcc/config/bpf/bpf-helpers.h b/gcc/config/bpf/bpf-helpers.h index 1dd05c8..a615321 100644 --- a/gcc/config/bpf/bpf-helpers.h +++ b/gcc/config/bpf/bpf-helpers.h @@ -30,6 +30,7 @@ #define __BPF_HELPERS_H #define SEC(NAME) __attribute__((section(NAME), used)) +#define KERNEL_HELPER(NUM) __attribute__((kernel_helper(NUM))) /* Flags used in some kernel helpers. */ @@ -41,277 +42,376 @@ #define BPF_F_NO_COMMON_LRU (1U << 1) #define BPF_F_NUMA_NODE (1U << 2) -/* Functions to call kernel helpers. We provide the "standard" bpf_* - names as synonyms of the corresponding GCC builtins. 
In some - cases, where non-void pointers are passed to the helper, inline - functions are used to achieve proper type checking. */ +/* Prototypes of functions to call kernel helpers. + Please keep these protoypes sorted by helper number. */ -#ifndef KERNEL_VERSION -# define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c)) -#endif +void *bpf_map_lookup_elem (void *map, const void *key) + KERNEL_HELPER (1); -#if __BPF_KERNEL_VERSION_CODE__ >= KERNEL_VERSION (4,0,0) +int bpf_map_update_elem (void *map, const void *key, const void *value, + unsigned long long flags) + KERNEL_HELPER (2); -#define bpf_map_lookup_elem __builtin_bpf_helper_map_lookup_elem -#define bpf_map_update_elem __builtin_bpf_helper_map_update_elem -#define bpf_map_delete_elem __builtin_bpf_helper_map_delete_elem +int bpf_map_delete_elem (void *map, const void *key) + KERNEL_HELPER (3); -#if __BPF_KERNEL_VERSION_CODE__ >= KERNEL_VERSION (4,1,0) +int bpf_probe_read (void *dst, int size, const void *unsafe_ptr) + KERNEL_HELPER (4); -#define bpf_probe_read __builtin_bpf_helper_probe_read -#define bpf_ktime_get_ns __builtin_bpf_helper_ktime_get_ns -#define bpf_trace_printk __builtin_bpf_helper_trace_printk -#define bpf_get_prandom_u32 __builtin_bpf_helper_get_prandom_u32 -#define bpf_get_smp_processor_id __builtin_bpf_helper_get_smp_processor_id -#define bpf_skb_store_bytes __builtin_bpf_helper_skb_store_bytes -#define bpf_l3_csum_replace __builtin_bpf_helper_l3_csum_replace -#define bpf_l4_csum_replace __builtin_bpf_helper_l4_csum_replace +unsigned long long bpf_ktime_get_ns (void) + KERNEL_HELPER (5); -#if __BPF_KERNEL_VERSION_CODE__ >= KERNEL_VERSION (4,2,0) +int bpf_trace_printk (const char *fmt, int fmt_size, ...) + KERNEL_HELPER (6); -#define bpf_tail_call __builtin_bpf_helper_tail_call -#define bpf_clone_redirect __builtin_bpf_helper_clone_redirect -#define bpf_get_current_pid_tgid __builtin_bpf_helper_get_current_pid_tgid -#define bpf_get_current_uid_gid __builtin_bpf_helper_get_current_uid_gid -#define bpf_get_current_comm __builtin_bpf_helper_get_current_comm +unsigned long long bpf_get_prandom_u32 (void) + KERNEL_HELPER (7); -#if __BPF_KERNEL_VERSION_CODE__ >= KERNEL_VERSION (4,3,0) +unsigned long long bpf_get_smp_processor_id (void) + KERNEL_HELPER (8); -#define bpf_get_cgroup_classid __builtin_bpf_helper_get_cgroup_classid -#define bpf_skb_vlan_push __builtin_bpf_helper_skb_vlan_push -#define bpf_skb_vlan_pop __builtin_bpf_helper_skb_vlan_pop -#define bpf_skb_get_tunnel_key __builtin_bpf_helper_skb_get_tunnel_key -#define bpf_skb_set_tunnel_key __builtin_bpf_helper_skb_set_tunnel_key -#define bpf_perf_event_read __builtin_bpf_helper_perf_event_read +int bpf_skb_store_bytes (void *ctx, int off, void *from, int len, + unsigned int start_header) + KERNEL_HELPER (9); -#if __BPF_KERNEL_VERSION_CODE__ >= KERNEL_VERSION (4,4,0) +int bpf_l3_csum_replace (void *ctx, int off, int from, int to, int flags) + KERNEL_HELPER (10); -#define bpf_redirect __builtin_bpf_helper_redirect -#define bpf_get_route_realm __builtin_bpf_helper_get_route_realm -#define bpf_perf_event_output __builtin_bpf_helper_perf_event_output +int bpf_l4_csum_replace (void *ctx, int off, int from, int to, int flags) + KERNEL_HELPER (11); -#if __BPF_KERNEL_VERSION_CODE__ >= KERNEL_VERSION (4,5,0) +int bpf_tail_call (void *ctx, void *map, unsigned int index) + KERNEL_HELPER (12); -#define bpf_skb_load_bytes __builtin_bpf_helper_skb_load_bytes +int bpf_clone_redirect (void *ctx, int ifindex, int flags) + KERNEL_HELPER (13); -#if __BPF_KERNEL_VERSION_CODE__ 
>= KERNEL_VERSION (4,6,0) +unsigned long long bpf_get_current_pid_tgid (void) + KERNEL_HELPER (14); -#define bpf_get_stackid __builtin_bpf_helper_get_stackid -#define bpf_csum_diff __builtin_bpf_helper_csum_diff -#define bpf_skb_get_tunnel_opt __builtin_bpf_helper_skb_get_tunnel_opt -#define bpf_skb_set_tunnel_opt __builtin_bpf_helper_skb_set_tunnel_opt +unsigned long long bpf_get_current_uid_gid (void) + KERNEL_HELPER (15); -#if __BPF_KERNEL_VERSION_CODE__ >= KERNEL_VERSION (4,8,0) +int bpf_get_current_comm (void *buf, int buf_size) + KERNEL_HELPER (16); -#define bpf_skb_change_proto __builtin_bpf_helper_skb_change_proto -#define bpf_skb_change_type __builtin_bpf_helper_skb_change_type -#define bpf_skb_under_cgroup __builtin_bpf_helper_skb_under_cgroup -#define bpf_get_hash_recalc __builtin_bpf_helper_get_hash_recalc -#define bpf_get_current_task __builtin_bpf_helper_get_current_task -#define bpf_probe_write_user __builtin_bpf_helper_probe_write_user +unsigned int bpf_get_cgroup_classid (void *ctx) + KERNEL_HELPER (17); -#if __BPF_KERNEL_VERSION_CODE__ >= KERNEL_VERSION (4,9,0) +int bpf_skb_vlan_push (void *ctx, short vlan_proto, + unsigned short vlan_tci) + KERNEL_HELPER (18); -#define bpf_current_task_under_cgroup __builtin_bpf_helper_current_task_under_cgroup -#define bpf_skb_change_tail __builtin_bpf_helper_skb_change_tail -#define bpf_skb_pull_data __builtin_bpf_helper_skb_pull_data -#define bpf_csum_update __builtin_bpf_helper_csum_update -#define bpf_set_hash_invalid __builtin_bpf_helper_set_hash_invalid +int bpf_skb_vlan_pop (void *ctx) + KERNEL_HELPER (19); -#if __BPF_KERNEL_VERSION_CODE__ >= KERNEL_VERSION (4,10,0) +int bpf_skb_get_tunnel_key (void *ctx, void *key, int size, int flags) + KERNEL_HELPER (20); -#define bpf_get_numa_node_id __builtin_bpf_helper_get_numa_node_id -#define bpf_skb_change_head __builtin_bpf_helper_skb_change_head -#define bpf_xdp_adjust_head __builtin_bpf_helper_xdp_adjust_head +int bpf_skb_set_tunnel_key (void *ctx, void *key, int size, int flags) + KERNEL_HELPER (21); -#if __BPF_KERNEL_VERSION_CODE__ >= KERNEL_VERSION (4,11,0) +unsigned long long bpf_perf_event_read (void *map, unsigned long long flags) + KERNEL_HELPER (22); -#define bpf_probe_read_str __builtin_bpf_helper_probe_read_str +int bpf_redirect (int ifindex, int flags) + KERNEL_HELPER (23); -#if __BPF_KERNEL_VERSION_CODE__ >= KERNEL_VERSION (4,12,0) +unsigned int bpf_get_route_realm (void *ctx) + KERNEL_HELPER (24); -#define bpf_get_socket_cookie __builtin_bpf_helper_get_socket_cookie -#define bpf_get_socket_uid __builtin_bpf_helper_get_socket_uid +int bpf_perf_event_output (void *ctx, void *map, unsigned long long flags, + void *data, int size) + KERNEL_HELPER (25); -#if __BPF_KERNEL_VERSION_CODE__ >= KERNEL_VERSION (4,13,0) +int bpf_skb_load_bytes (void *ctx, int off, void *to, int len) + KERNEL_HELPER (26); -#define bpf_set_hash __builtin_bpf_helper_set_hash -#define bpf_setsockopt __builtin_bpf_helper_setsockopt -#define bpf_skb_adjust_room __builtin_bpf_helper_skb_adjust_room +int bpf_get_stackid (void *ctx, void *map, int flags) + KERNEL_HELPER (27); -#if __BPF_KERNEL_VERSION_CODE__ >= KERNEL_VERSION (4,14,0) +int bpf_csum_diff (void *from, int from_size, void *to, int to_size, int seed) + KERNEL_HELPER (28); -#define bpf_redirect_map __builtin_bpf_helper_redirect_map -#define bpf_sk_redirect_map __builtin_bpf_helper_sk_redirect_map -#define bpf_sock_map_update __builtin_bpf_helper_sock_map_update +int bpf_skb_get_tunnel_opt (void *ctx, void *md, int size) + KERNEL_HELPER (29); -#if 
__BPF_KERNEL_VERSION_CODE__ >= KERNEL_VERSION (4,15,0) +int bpf_skb_set_tunnel_opt (void *ctx, void *md, int size) + KERNEL_HELPER (30); -#define bpf_perf_event_read_value __builtin_bpf_helper_perf_event_read_value -#define bpf_perf_prog_read_value __builtin_bpf_helper_perf_prog_read_value -#define bpf_getsockopt __builtin_bpf_helper_getsockopt -#define bpf_xdp_adjust_meta __builtin_bpf_helper_xdp_adjust_meta +int bpf_skb_change_proto (void *ctx, short proto, unsigned long flags) + KERNEL_HELPER (31); -#if __BPF_KERNEL_VERSION_CODE__ >= KERNEL_VERSION (4,16,0) +int bpf_skb_change_type (void *ctx, unsigned int type) + KERNEL_HELPER (32); -#define bpf_override_return __builtin_bpf_helper_override_return -#define bpf_sock_ops_cb_flags_set __builtin_bpf_helper_sock_ops_cb_flags_set +int bpf_skb_under_cgroup (void *ctx, void *map, int index) + KERNEL_HELPER (33); -#if __BPF_KERNEL_VERSION_CODE__ >= KERNEL_VERSION (4,17,0) +unsigned int bpf_get_hash_recalc (void *ctx) + KERNEL_HELPER (34); -#define bpf_msg_redirect_map __builtin_bpf_helper_msg_redirect_map -#define bpf_msg_apply_bytes __builtin_bpf_helper_msg_apply_bytes -#define bpf_msg_cork_bytes __builtin_bpf_helper_msg_cork_bytes -#define bpf_pull_data __builtin_bpf_helper_pull_data -#define bpf_bind __builtin_bpf_helper_bpf_bind +unsigned long long bpf_get_current_task (void) + KERNEL_HELPER (35); -#if __BPF_KERNEL_VERSION_CODE__ >= KERNEL_VERSION (4,18,0) +int bpf_probe_write_user (void *dst, const void *src, int size) + KERNEL_HELPER (36); -#define bpf_xdp_adjust_tail __builtin_bpf_helper_xdp_adjust_tail -#define bpf_skb_get_xfrm_state __builtin_bpf_helper_skb_get_xfrm_state -#define bpf_get_stack __builtin_bpf_helper_get_stack -#define bpf_skb_load_bytes_relative __builtin_bpf_helper_skb_load_bytes_relative -#define bpf_sock_hash_update __builtin_bpf_helper_sock_hash_update -#define bpf_msg_redirect_hash __builtin_bpf_helper_msg_redirect_hash -#define bpf_sk_redirect_hash __builtin_bpf_helper_sk_redirect_hash -#define bpf_lwt_push_encap __builtin_bpf_helper_lwt_push_encap -#define bpf_lwt_seg6_store_bytes __builtin_bpf_helper_lwt_seg6_store_bytes -#define bpf_lwt_seg6_adjust_srh __builtin_bpf_helper_lwt_seg6_adjust_srh -#define bpf_lwt_seg6_action __builtin_bpf_helper_lwt_seg6_action -#define bpf_rc_repeat __builtin_bpf_helper_rc_repeat -#define bpf_rc_keydown __builtin_bpf_helper_rc_keydown -#define bpf_skb_cgroup_id __builtin_bpf_helper_skb_cgroup_id -#define bpf_get_current_cgroup_id __builtin_bpf_helper_get_current_cgroup_id +int bpf_current_task_under_cgroup (void *map, int index) + KERNEL_HELPER (37); -static inline int -bpf_fib_lookup (void *ctx, struct bpf_fib_lookup *param, int plen, - unsigned int flags) -{ - return __builtin_bpf_helper_fib_lookup (ctx, (void *) param, plen, flags); -} +int bpf_skb_change_tail (void *ctx, unsigned int len, unsigned long flags) + KERNEL_HELPER (38); +int bpf_skb_pull_data (void *, int len) + KERNEL_HELPER (39); -#if __BPF_KERNEL_VERSION_CODE__ >= KERNEL_VERSION (4,19,0) +long long bpf_csum_update (void *ctx, unsigned int csum) + KERNEL_HELPER (40); -#define bpf_get_local_storage __builtin_bpf_helper_get_local_storage -#define bpf_sk_select_reuseport __builtin_bpf_helper_sk_select_reuseport -#define bpf_skb_ancestor_cgroup_id __builtin_bpf_helper_skb_ancestor_cgroup_id +void bpf_set_hash_invalid (void *ctx) + KERNEL_HELPER (41); -#if __BPF_KERNEL_VERSION_CODE__ >= KERNEL_VERSION (4,20,0) +int bpf_get_numa_node_id (void) + KERNEL_HELPER (42); -#define bpf_sk_release __builtin_bpf_helper_sk_release 
-#define bpf_map_push_elem __builtin_bpf_helper_map_push_elem -#define bpf_map_pop_elem __builtin_bpf_helper_map_pop_elem -#define bpf_map_peek_elem __builtin_bpf_helper_map_peek_elem -#define bpf_msg_push_data __builtin_bpf_helper_msg_push_data +int bpf_skb_change_head (void *, int len, int flags) + KERNEL_HELPER (43); -static inline struct bpf_sock * -bpf_sk_lookup_tcp (void *ctx, struct bpf_sock_tuple *tuple, - int size, unsigned long long netns_id, - unsigned long long flags) -{ - return - (struct bpf_sock *) __builtin_bpf_helper_sk_lookup_tcp (ctx, - (void *) tuple, - size, - netns_id, flags); -} - -static inline struct bpf_sock * -bpf_sk_lookup_udp (void *ctx, struct bpf_sock_tuple *tuple, - int size, unsigned long long netns_id, - unsigned long long flags) -{ - return - (struct bpf_sock *) __builtin_bpf_helper_sk_lookup_udp (ctx, - (void *) tuple, - size, - netns_id, flags); -} +int bpf_xdp_adjust_head (void *ctx, int offset) + KERNEL_HELPER (44); -#if __BPF_KERNEL_VERSION_CODE__ >= KERNEL_VERSION (5,0,0) +int bpf_probe_read_str (void *ctx, unsigned int size, const void *unsafe_ptr) + KERNEL_HELPER (45); -#define bpf_msg_pop_data __builtin_bpf_helper_pop_data -#define bpf_rc_pointer_rel __builtin_bpf_helper_rc_pointer_rel +int bpf_get_socket_cookie (void *ctx) + KERNEL_HELPER (46); -#if __BPF_KERNEL_VERSION_CODE__ >= KERNEL_VERSION (5,1,0) +unsigned int bpf_get_socket_uid (void *ctx) + KERNEL_HELPER (47); -#define bpf_spin_lock __builtin_bpf_helper_spin_lock -#define bpf_spin_unlock __builtin_bpf_helper_spin_unlock -#define bpf_skb_ecn_set_ce __builtin_bpf_helper_skb_ecn_set_ce +unsigned int bpf_set_hash (void *ctx, unsigned int hash) + KERNEL_HELPER (48); -static inline struct bpf_sock * -bpf_sk_fullsock (struct bpf_sock *sk) -{ - return - (struct bpf_sock *) __builtin_bpf_helper_sk_fullsock ((void *) sk); -} +int bpf_setsockopt (void *ctx, int level, int optname, void *optval, int optlen) + KERNEL_HELPER (49); -static inline struct bpf_sock * -bpf_tcp_sock (struct bpf_sock *sk) -{ - return - (struct bpf_sock *) __builtin_bpf_helper_tcp_sock ((void *) sk); -} +int bpf_skb_adjust_room (void *ctx, int len_diff, unsigned int mode, + unsigned long long flags) + KERNEL_HELPER (50); -static inline struct bpf_sock * -bpf_get_listener_sock (struct bpf_sock *sk) -{ - return - (struct bpf_sock *) __builtin_bpf_helper_get_listener_sock ((void *) sk); -} - -#if __BPF_KERNEL_VERSION_CODE__ >= KERNEL_VERSION (5,2,0) - - -#endif /* 5.2 */ -#endif /* 5.1 */ -#endif /* 5.0 */ -#endif /* 4.20 */ -#endif /* 4.19 */ -#endif /* 4.18 */ -#endif /* 4.17 */ -#endif /* 4.16 */ -#endif /* 4.15 */ -#endif /* 4.14 */ -#endif /* 4.13 */ -#endif /* 4.12 */ -#endif /* 4.11 */ -#endif /* 4.10 */ -#endif /* 4.9 */ -#endif /* 4.8 */ -#endif /* 4.6 */ -#endif /* 4.5 */ -#endif /* 4.4 */ -#endif /* 4.3 */ -#endif /* 4.2 */ -#endif /* 4.1 */ -#endif /* 4.0 */ +int bpf_redirect_map (void *map, int key, int flags) + KERNEL_HELPER (51); + +int bpf_sk_redirect_map (void *ctx, void *map, int key, int flags) + KERNEL_HELPER (52); + +int bpf_sock_map_update (void *map, void *key, void *value, + unsigned long long flags) + KERNEL_HELPER (53); + +int bpf_xdp_adjust_meta (void *ctx, int offset) + KERNEL_HELPER (54); + +int bpf_perf_event_read_value (void *map, unsigned long long flags, + void *buf, unsigned int buf_size) + KERNEL_HELPER (55); + +int bpf_perf_prog_read_value (void *ctx, void *buf, unsigned int buf_size) + KERNEL_HELPER (56); + +int bpf_getsockopt (void *ctx, int level, int optname, void *optval, + int optlen) + 
KERNEL_HELPER (57); + +int bpf_override_return (void *ctx, unsigned long rc) + KERNEL_HELPER (58); + +int bpf_sock_ops_cb_flags_set (void *ctx, int flags) + KERNEL_HELPER (59); + +int bpf_msg_redirect_map (void *ctx, void *map, int key, int flags) + KERNEL_HELPER (60); + +int bpf_msg_apply_bytes (void *ctx, int len) + KERNEL_HELPER (61); + +int bpf_msg_cork_bytes (void *ctx, int len) + KERNEL_HELPER (62); + +int bpf_msg_pull_data (void *, int len) + KERNEL_HELPER (63); + +int bpf_bind (void *ctx, void *addr, int addr_len) + KERNEL_HELPER (64); + +int bpf_xdp_adjust_tail (struct xdp_md *xdp_md, int delta) + KERNEL_HELPER (65); + +int bpf_skb_get_xfrm_state (void *ctx, int index, void *state, + int size, int flags) + KERNEL_HELPER (66); + +int bpf_get_stack (void *ctx, void *buf, int size, int flags) + KERNEL_HELPER (67); + +int bpf_skb_load_bytes_relative (void *ctx, int off, void *to, int len, + unsigned int start_header) + KERNEL_HELPER (68); + +int bpf_fib_lookup (void *ctx, struct bpf_fib_lookup *params, + int plen, unsigned int flags) + KERNEL_HELPER (69); + +int bpf_sock_hash_update (void *map, void *key, void *value, + unsigned long long flags) + KERNEL_HELPER (70); + +int bpf_msg_redirect_hash (void *ctx, void *map, void *key, int flags) + KERNEL_HELPER (71); + +int bpf_sk_redirect_hash (void *ctx, void *map, void *key, int flags) + KERNEL_HELPER (72); + +int bpf_lwt_push_encap (void *ctx, unsigned int type, void *hdr, + unsigned int len) + KERNEL_HELPER (73); + +int bpf_lwt_seg6_store_bytes (void *ctx, unsigned int offset, + void *from, unsigned int len) + KERNEL_HELPER (74); + +int bpf_lwt_seg6_adjust_srh (void *ctx, unsigned int offset, + unsigned int len) + KERNEL_HELPER (75); + +int bpf_lwt_seg6_action (void *ctx, unsigned int action, void *param, + unsigned int param_len) + KERNEL_HELPER (76); + +int bpf_rc_repeat (void *ctx) + KERNEL_HELPER (77); + +int bpf_rc_keydown (void *ctx, unsigned int protocol, + unsigned long long scancode, unsigned int toggle) + KERNEL_HELPER (78); + +unsigned bpf_skb_cgroup_id (void *ctx) + KERNEL_HELPER (79); + +unsigned long long bpf_get_current_cgroup_id (void) + KERNEL_HELPER (80); + +void *bpf_get_local_storage (void *map, unsigned long long flags) + KERNEL_HELPER (81); + +int bpf_sk_select_reuseport (void *ctx, void *map, void *key, unsigned int flags) + KERNEL_HELPER (82); + +unsigned long long bpf_skb_ancestor_cgroup_id (void *ctx, int level) + KERNEL_HELPER (83); + +struct bpf_sock *bpf_sk_lookup_tcp (void *ctx, struct bpf_sock_tuple *tuple, + int size, unsigned long long netns_id, + unsigned long long flags) + KERNEL_HELPER (84); + +struct bpf_sock *bpf_sk_lookup_udp (void *ctx, struct bpf_sock_tuple *tuple, + int size, unsigned long long netns_id, + unsigned long long flags) + KERNEL_HELPER (85); + +int bpf_sk_release (struct bpf_sock *sk) + KERNEL_HELPER (86); + +int bpf_map_push_elem (void *map, const void *value, unsigned long long flags) + KERNEL_HELPER (87); + +int bpf_map_pop_elem (void *map, void *value) + KERNEL_HELPER (88); + +int bpf_map_peek_elem (void *map, void *value) + KERNEL_HELPER (89); + +int bpf_msg_push_data (void *ctx, int start, int cut, int flags) + KERNEL_HELPER (90); + +int bpf_msg_pop_data (void *ctx, int start, int cut, int flags) + KERNEL_HELPER (91); + +int bpf_rc_pointer_rel (void *ctx, int rel_x, int rel_y) + KERNEL_HELPER (92); + +void bpf_spin_lock (struct bpf_spin_lock *lock) + KERNEL_HELPER (93); + +void bpf_spin_unlock (struct bpf_spin_lock *lock) + KERNEL_HELPER (94); + +struct bpf_sock 
*bpf_sk_fullsock (struct bpf_sock *sk) + KERNEL_HELPER (95); + +struct bpf_sock *bpf_tcp_sock (struct bpf_sock *sk) + KERNEL_HELPER (96); + +int bpf_skb_ecn_set_ce (void *ctx) + KERNEL_HELPER (97); + +struct bpf_sock *bpf_get_listener_sock (struct bpf_sock *sk) + KERNEL_HELPER (98); + +struct bpf_sock *bpf_skc_lookup_tcp (void *ctx, + struct bpf_sock_tuple *tuple, + unsigned int tuple_size, + unsigned long netns, + unsigned long flags) + KERNEL_HELPER (99); + +int bpf_tcp_check_syncookie (struct bpf_sock *sk, void *iph, + unsigned int iph_len, + struct tcp_hdr *th, + unsigned int th_len) + KERNEL_HELPER (100); + +int bpf_sysctl_get_name (struct bpf_sysctl *ctx, + char *buf, unsigned long buf_len, + unsigned long flags) + KERNEL_HELPER (101); + +int bpf_sysctl_get_current_value (struct bpf_sysctl *ctx, + char *buf, unsigned long buf_len) + KERNEL_HELPER (102); + +int bpf_sysctl_get_new_value (struct bpf_sysctl *ctx, char *buf, + unsigned long buf_len) + KERNEL_HELPER (103); + +int bpf_sysctl_set_new_value (struct bpf_sysctl *ctx, const char *buf, + unsigned long buf_len) + KERNEL_HELPER (104); + +int bpf_strtol (const char *buf, unsigned long buf_len, + unsigned long flags, long *res) + KERNEL_HELPER (105); + +int bpf_strtoul (const char *buf, unsigned long buf_len, + unsigned long flags, unsigned long *res) + KERNEL_HELPER (106); + +void *bpf_sk_storage_get (void *map, struct bpf_sock *sk, + void *value, long flags) + KERNEL_HELPER (107); + +int bpf_sk_storage_delete (void *map, struct bpf_sock *sk) + KERNEL_HELPER (108); /* Functions to emit BPF_LD_ABS and BPF_LD_IND instructions. We provide the "standard" names as synonyms of the corresponding GCC builtins. Note how the SKB argument is ignored. */ -static inline long long -load_byte (void *skb __attribute__ ((unused)), - unsigned long long off) -{ - return __builtin_bpf_load_byte (off); -} - -static inline long long -load_half (void *skb __attribute__ ((unused)), - unsigned long long off) -{ - return __builtin_bpf_load_half (off); -} - -static inline long long -load_word (void *skb __attribute__ ((unused)), - unsigned long long off) -{ - return __builtin_bpf_load_word (off); -} +#define load_byte(SKB,OFF) __builtin_bpf_load_byte ((OFF)) +#define load_half(SKB,OFF) __builtin_bpf_load_half ((OFF)) +#define load_word(SKB,OFF) __builtin_bpf_load_word ((OFF)) struct bpf_map_def { diff --git a/gcc/config/bpf/bpf.c b/gcc/config/bpf/bpf.c index 36e0833..13181f2 100644 --- a/gcc/config/bpf/bpf.c +++ b/gcc/config/bpf/bpf.c @@ -66,6 +66,63 @@ struct GTY(()) machine_function int callee_saved_reg_size; }; +/* Handle an attribute requiring a FUNCTION_DECL; + arguments as in struct attribute_spec.handler. */ + +static tree +bpf_handle_fndecl_attribute (tree *node, tree name, + tree args, + int flags ATTRIBUTE_UNUSED, + bool *no_add_attrs) +{ + if (TREE_CODE (*node) != FUNCTION_DECL) + { + warning (OPT_Wattributes, "%qE attribute only applies to functions", + name); + *no_add_attrs = true; + } + + if (is_attribute_p ("kernel_helper", name)) + { + if (args) + { + tree cst = TREE_VALUE (args); + if (TREE_CODE (cst) != INTEGER_CST) + { + warning (OPT_Wattributes, "%qE attribute requires an integer argument", + name); + *no_add_attrs = true; + } + } + else + { + warning (OPT_Wattributes, "%qE requires an argument", name); + *no_add_attrs = true; + } + } + + return NULL_TREE; +} + +/* Target-specific attributes. 
*/ + +static const struct attribute_spec bpf_attribute_table[] = +{ + /* Syntax: { name, min_len, max_len, decl_required, type_required, + function_type_required, affects_type_identity, handler, + exclude } */ + + /* Attribute to mark function prototypes as kernel helpers. */ + { "kernel_helper", 1, 1, true, false, false, false, + bpf_handle_fndecl_attribute, NULL }, + + /* The last attribute spec is set to be NULL. */ + { NULL, 0, 0, false, false, false, false, NULL, NULL } +}; + +#undef TARGET_ATTRIBUTE_TABLE +#define TARGET_ATTRIBUTE_TABLE bpf_attribute_table + /* Data structures for the eBPF specific built-ins. */ /* Maximum number of arguments taken by a builtin function, plus @@ -75,47 +132,13 @@ struct GTY(()) machine_function enum bpf_builtins { BPF_BUILTIN_UNUSED = 0, - /* Built-ins for kernel helpers. */ -#define DEF_HELPER(V,D,N,T) BPF_BUILTIN_HELPER_##D, -# include "bpf-helpers.def" -#undef DEF_HELPER - BPF_BUILTIN_HELPER_MAX, /* Built-ins for non-generic loads and stores. */ - BPF_BUILTIN_LOAD_BYTE = BPF_BUILTIN_HELPER_MAX, + BPF_BUILTIN_LOAD_BYTE, BPF_BUILTIN_LOAD_HALF, BPF_BUILTIN_LOAD_WORD, BPF_BUILTIN_MAX, }; -/* This table is indexed by an enum bpf_builtin. */ -static const char *bpf_helper_names[] = -{ - NULL, -#define DEF_HELPER(V,D,N,T) #N, -# include "bpf-helpers.def" -#undef DEF_HELPER - NULL, - NULL, - NULL, - NULL -}; - -/* Return the builtin code corresponding to the kernel helper builtin - __builtin_NAME, or 0 if the name doesn't correspond to a kernel - helper builtin. */ - -static inline int -bpf_helper_code (const char *name) -{ - int i; - - for (i = 1; i < BPF_BUILTIN_HELPER_MAX; ++i) - if (strcmp (name, bpf_helper_names[i]) == 0) - return i; - - return 0; -} - static GTY (()) tree bpf_builtins[(int) BPF_BUILTIN_MAX]; /* Initialize the per-function machine status. */ @@ -149,7 +172,7 @@ void bpf_target_macros (cpp_reader *pfile) { builtin_define ("__BPF__"); - + if (TARGET_BIG_ENDIAN) builtin_define ("__BPF_BIG_ENDIAN__"); else @@ -187,7 +210,7 @@ bpf_target_macros (cpp_reader *pfile) case LINUX_V5_1: version_code = "0x50100"; break; case LINUX_V5_2: version_code = "0x50200"; break; default: - gcc_unreachable (); + gcc_unreachable (); } kernel_version_code = ACONCAT (("__BPF_KERNEL_VERSION_CODE__=", @@ -196,23 +219,6 @@ bpf_target_macros (cpp_reader *pfile) } } -/* Output assembly directives to switch to section NAME. The section - should have attributes as specified by FLAGS, which is a bit mask - of the 'SECTION_*' flags defined in 'output.h'. If DECL is - non-NULL, it is the 'VAR_DECL' or 'FUNCTION_DECL' with which this - section is associated. */ - -static void -bpf_asm_named_section (const char *name, - unsigned int flags ATTRIBUTE_UNUSED, - tree decl ATTRIBUTE_UNUSED) -{ - fprintf (asm_out_file, "\t.section\t%s\n", name); -} - -#undef TARGET_ASM_NAMED_SECTION -#define TARGET_ASM_NAMED_SECTION bpf_asm_named_section - /* Return an RTX representing the place where a function returns or receives a value of data type RET_TYPE, a tree node representing a data type. 
*/ @@ -359,7 +365,7 @@ bpf_expand_prologue (void) insn = emit_move_insn (stack_pointer_rtx, hard_frame_pointer_rtx); RTX_FRAME_RELATED_P (insn) = 1; - + if (size > 0) { insn = emit_insn (gen_rtx_SET (stack_pointer_rtx, @@ -528,7 +534,7 @@ bpf_legitimate_address_p (machine_mode mode ATTRIBUTE_UNUSED, rtx x0 = XEXP (x, 0); rtx x1 = XEXP (x, 1); - + if (bpf_address_base_p (x0, strict) && GET_CODE (x1) == CONST_INT) return IN_RANGE (INTVAL (x1), -1 - 0x7fff, 0x7fff); @@ -681,13 +687,16 @@ bpf_output_call (rtx target) break; case SYMBOL_REF: { - const char *function_name = XSTR (target, 0); - int code; - - if (strncmp (function_name, "__builtin_bpf_helper_", 21) == 0 - && ((code = bpf_helper_code (function_name + 21)) != 0)) + tree decl = SYMBOL_REF_DECL (target); + tree attr; + + if (decl + && (attr = lookup_attribute ("kernel_helper", + DECL_ATTRIBUTES (decl)))) { - xops[0] = GEN_INT (code); + tree attr_args = TREE_VALUE (attr); + + xops[0] = GEN_INT (TREE_INT_CST_LOW (TREE_VALUE (attr_args))); output_asm_insn ("call\t%0", xops); } else @@ -696,8 +705,13 @@ bpf_output_call (rtx target) break; } default: - error ("indirect call in function, which are not supported by eBPF"); - output_asm_insn ("call 0", NULL); + if (TARGET_XBPF) + output_asm_insn ("call\t%0", &target); + else + { + error ("indirect call in function, which are not supported by eBPF"); + output_asm_insn ("call 0", NULL); + } break; } @@ -792,40 +806,7 @@ def_builtin (const char *name, enum bpf_builtins code, tree type) static void bpf_init_builtins (void) { - /* Built-ins for calling kernel helpers. */ - - tree pt = build_pointer_type (void_type_node); - tree const_void_type - = build_qualified_type (void_type_node, TYPE_QUAL_CONST); - tree cpt = build_pointer_type (const_void_type); - tree st = short_integer_type_node; - tree ust = uint16_type_node; - tree it = integer_type_node; - tree ut = unsigned_type_node; - tree const_char_type - = build_qualified_type (char_type_node, TYPE_QUAL_CONST); - tree cst = build_pointer_type (const_char_type); - tree vt = void_type_node; - tree ult = long_unsigned_type_node; - tree u32t = uint32_type_node; - tree u64t = uint64_type_node; - tree llt = long_long_integer_type_node; tree ullt = long_long_unsigned_type_node; - -#define TYPES build_function_type_list -#define VTYPES build_varargs_function_type_list -#define DEF_HELPER(V,D,N,T) \ - do \ - { \ - if (bpf_kernel >= (V)) \ - def_builtin ("__builtin_bpf_helper_" #N, \ - BPF_BUILTIN_HELPER_##D, \ - T); \ - } while (0); -# include "bpf-helpers.def" -#undef TYPES -#undef VTYPES -#undef DEF_HELPER /* Built-ins for BPF_LD_ABS and BPF_LD_IND instructions. */ @@ -844,30 +825,17 @@ bpf_init_builtins (void) with bpf_init_builtins. */ static rtx -bpf_expand_builtin (tree exp, rtx target, +bpf_expand_builtin (tree exp, rtx target ATTRIBUTE_UNUSED, rtx subtarget ATTRIBUTE_UNUSED, machine_mode mode ATTRIBUTE_UNUSED, - int ignore) + int ignore ATTRIBUTE_UNUSED) { tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0); int code = DECL_MD_FUNCTION_CODE (fndecl); - if (code >= 1 && code < BPF_BUILTIN_HELPER_MAX) - { - /* This is a builtin to call a kernel helper function. - - For these builtins, we just expand the function call normally - with expand_call like we would do for a libcall. The function - bpf_output_call below will then do The Right Thing (TM), - recognizing the name of the called __builtin_helper_* symbol - and emitting the corresponding CALL N instruction whenever - necessary. 
*/ - - return expand_call (exp, target, ignore); - } - else if (code == BPF_BUILTIN_LOAD_BYTE - || code == BPF_BUILTIN_LOAD_HALF - || code == BPF_BUILTIN_LOAD_WORD) + if (code == BPF_BUILTIN_LOAD_BYTE + || code == BPF_BUILTIN_LOAD_HALF + || code == BPF_BUILTIN_LOAD_WORD) { /* Expand an indirect load from the sk_buff in the context. There is just one argument to the builtin, which is the diff --git a/gcc/config/bpf/bpf.h b/gcc/config/bpf/bpf.h index 08ecd11..359f389 100644 --- a/gcc/config/bpf/bpf.h +++ b/gcc/config/bpf/bpf.h @@ -22,7 +22,7 @@ /**** Controlling the Compilation Driver. */ -#define ASM_SPEC "%{mbig-endian:-EB} %{!mbig-endian:-EL}" +#define ASM_SPEC "%{mbig-endian:-EB} %{!mbig-endian:-EL} %{mxbpf:-mxbpf}" #define LINK_SPEC "%{mbig-endian:-EB} %{!mbig-endian:-EL}" #define LIB_SPEC "" #define STARTFILE_SPEC "" @@ -50,11 +50,6 @@ M = DImode; \ } while (0) -/* Biggest alignment supported by the object file format of this - machine. In this case this is ELF. Use the same definition than - in elfos.h */ -#define MAX_OFILE_ALIGNMENT (((unsigned int) 1 << 28) * 8) - /* Align argument parameters on the stack to 64-bit, at a minimum. */ #define PARM_BOUNDARY 64 @@ -241,6 +236,15 @@ enum reg_class /**** Debugging Info ****/ /* We cannot support DWARF2 because of the limitations of eBPF. */ + +/* elfos.h insists in using DWARF. Undo that here. */ +#ifdef DWARF2_DEBUGGING_INFO +# undef DWARF2_DEBUGGING_INFO +#endif +#ifdef PREFERRED_DEBUGGING_TYPE +# undef PREFERRED_DEBUGGING_TYPE +#endif + #define DBX_DEBUGGING_INFO /**** Stack Layout and Calling Conventions. */ @@ -387,7 +391,6 @@ enum reg_class #define TEXT_SECTION_ASM_OP "\t.text" #define DATA_SECTION_ASM_OP "\t.data" #define BSS_SECTION_ASM_OP "\t.bss" -#define COMMON_ASM_OP "\t.common\t" /**** Defining the Output Assembler Language. */ @@ -413,18 +416,6 @@ enum reg_class /*** Output of Uninitialized Variables. */ -/* How to output an assembler line to define a local common - symbol. */ - -#define ASM_OUTPUT_ALIGNED_COMMON(FILE, NAME, SIZE, ALIGN) \ - do \ - { \ - fprintf ((FILE), "%s", COMMON_ASM_OP); \ - assemble_name ((FILE), (NAME)); \ - fprintf ((FILE), ",%u,%u\n", (int)(SIZE), (ALIGN) / (BITS_PER_UNIT)); \ - } \ - while (0) - /* A C statement (sans semicolon) to output to the stdio stream FILE the assembler definition of uninitialized global DECL named NAME whose size is SIZE bytes and alignment is ALIGN bytes. @@ -435,15 +426,6 @@ enum reg_class ASM_OUTPUT_ALIGNED_LOCAL (FILE, NAME, SIZE, ALIGN); \ } while (0) -/* This says how to output an assembler line to define a local common - symbol. */ - -#define ASM_OUTPUT_ALIGNED_LOCAL(FILE,NAME,SIZE,ALIGN) \ - ( fputs ("\t.lcomm ", (FILE)), \ - assemble_name ((FILE), (NAME)), \ - fprintf ((FILE), "," HOST_WIDE_INT_PRINT_UNSIGNED "\n", \ - (SIZE), ((ALIGN) / BITS_PER_UNIT))) - /*** Output and Generation of Labels. */ /* Globalizing directive for a label. */ @@ -458,11 +440,6 @@ enum reg_class #define ASM_GENERATE_INTERNAL_LABEL(LABEL,PREFIX,NUM) \ sprintf ((LABEL), "*%s%s%ld", (LOCAL_LABEL_PREFIX), (PREFIX), (long)(NUM)) -/*** Macros Controlling Initialization Routines. */ - -#define INIT_SECTION_ASM_OP "\t.init" -#define FINI_SECTION_ASM_OP "\t.fini" - /*** Output of Assembler Instructions. */ #define REGISTER_NAMES \ @@ -488,11 +465,6 @@ enum reg_class #define ASM_OUTPUT_ALIGN(STREAM,LOG) \ fprintf (STREAM, "\t.align\t%d\n", (LOG)) -/* This is how to output an assembler line - that says to advance the location counter by SIZE bytes. 
*/ -#define ASM_OUTPUT_SKIP(FILE,SIZE) \ - fprintf (FILE, "\t.skip\t" HOST_WIDE_INT_PRINT_UNSIGNED "\n", (SIZE)) - /**** Miscellaneous Parameters. */ /* Specify the machine mode that this machine uses for the index in diff --git a/gcc/config/bpf/bpf.md b/gcc/config/bpf/bpf.md index 3aa8644..8e7cf50 100644 --- a/gcc/config/bpf/bpf.md +++ b/gcc/config/bpf/bpf.md @@ -82,10 +82,15 @@ ;;;; NOPs +;; The Linux kernel verifier performs some optimizations that rely on +;; nop instructions to be encoded as `ja 0', i.e. a jump to offset 0, +;; which actually means to jump to the next instruction, since in BPF +;; offsets are expressed in 64-bit words _minus one_. + (define_insn "nop" [(const_int 0)] "" - "mov\t%%r0,%%r0" + "ja\t0" [(set_attr "type" "alu")]) ;;;; Arithmetic/Logical @@ -160,6 +165,16 @@ "div<msuffix>\t%0,%2" [(set_attr "type" "<mtype>")]) +;; However, xBPF does provide a signed division operator, sdiv. + +(define_insn "div<AM:mode>3" + [(set (match_operand:AM 0 "register_operand" "=r,r") + (div:AM (match_operand:AM 1 "register_operand" " 0,0") + (match_operand:AM 2 "reg_or_imm_operand" "r,I")))] + "TARGET_XBPF" + "sdiv<msuffix>\t%0,%2" + [(set_attr "type" "<mtype>")]) + ;;; Modulus ;; Note that eBPF doesn't provide instructions for signed integer @@ -173,6 +188,16 @@ "mod<msuffix>\t%0,%2" [(set_attr "type" "<mtype>")]) +;; Again, xBPF provides a signed version, smod. + +(define_insn "mod<AM:mode>3" + [(set (match_operand:AM 0 "register_operand" "=r,r") + (mod:AM (match_operand:AM 1 "register_operand" " 0,0") + (match_operand:AM 2 "reg_or_imm_operand" "r,I")))] + "TARGET_XBPF" + "smod<msuffix>\t%0,%2" + [(set_attr "type" "<mtype>")]) + ;;; Logical AND (define_insn "and<AM:mode>3" [(set (match_operand:AM 0 "register_operand" "=r,r") @@ -271,7 +296,7 @@ { if (!register_operand(operands[0], <MM:MODE>mode) && !register_operand(operands[1], <MM:MODE>mode)) - operands[1] = force_reg (<MM:MODE>mode, operands[1]); + operands[1] = force_reg (<MM:MODE>mode, operands[1]); }") (define_insn "*mov<MM:mode>" diff --git a/gcc/config/bpf/constraints.md b/gcc/config/bpf/constraints.md index 050383e..9e203be 100644 --- a/gcc/config/bpf/constraints.md +++ b/gcc/config/bpf/constraints.md @@ -29,4 +29,3 @@ (define_constraint "S" "A constant call address." (match_code "const,symbol_ref,label_ref,const_int")) - diff --git a/gcc/config/bpf/predicates.md b/gcc/config/bpf/predicates.md index 865a527..ce3cbc6 100644 --- a/gcc/config/bpf/predicates.md +++ b/gcc/config/bpf/predicates.md @@ -61,4 +61,3 @@ (define_predicate "register_compare_operator" (match_code "eq,ne,geu,gtu,ge,gt")) - diff --git a/gcc/config/c6x/c6x.c b/gcc/config/c6x/c6x.c index 39b0f75..9aa7ef0 100644 --- a/gcc/config/c6x/c6x.c +++ b/gcc/config/c6x/c6x.c @@ -3698,7 +3698,7 @@ insn_set_clock (rtx insn, int cycle) unsigned uid = INSN_UID (insn); if (uid >= INSN_INFO_LENGTH) - insn_info.safe_grow (uid * 5 / 4 + 10); + insn_info.safe_grow (uid * 5 / 4 + 10, true); INSN_INFO_ENTRY (uid).clock = cycle; INSN_INFO_ENTRY (uid).new_cond = NULL; @@ -5600,7 +5600,8 @@ hwloop_optimize (hwloop_info loop) int j; rtx_insn *this_iter; - this_iter = duplicate_insn_chain (head_insn, tail_insn); + copy_bb_data id; + this_iter = duplicate_insn_chain (head_insn, tail_insn, NULL, &id); j = 0; while (this_iter) { diff --git a/gcc/config/cris/cris-passes.def b/gcc/config/cris/cris-passes.def new file mode 100644 index 0000000..db3c74d --- /dev/null +++ b/gcc/config/cris/cris-passes.def @@ -0,0 +1,20 @@ +/* Description of target passes for Visium. 
+ Copyright (C) 2020 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +<http://www.gnu.org/licenses/>. */ + +INSERT_PASS_AFTER (pass_delay_slots, 1, pass_cris_postdbr_cmpelim); diff --git a/gcc/config/cris/cris-protos.h b/gcc/config/cris/cris-protos.h index 2db1ea1..053bba9 100644 --- a/gcc/config/cris/cris-protos.h +++ b/gcc/config/cris/cris-protos.h @@ -60,3 +60,5 @@ extern int cris_fatal (char *); extern int cris_initial_elimination_offset (int, int); extern void cris_init_expanders (void); + +extern rtl_opt_pass *make_pass_cris_postdbr_cmpelim (gcc::context *); diff --git a/gcc/config/cris/cris.c b/gcc/config/cris/cris.c index b26b9f2..59cbcee 100644 --- a/gcc/config/cris/cris.c +++ b/gcc/config/cris/cris.c @@ -51,6 +51,8 @@ along with GCC; see the file COPYING3. If not see #include "output.h" #include "tm-constrs.h" #include "builtins.h" +#include "cfgrtl.h" +#include "tree-pass.h" /* This file should be included last. */ #include "target-def.h" @@ -129,6 +131,8 @@ static void cris_asm_output_mi_thunk static void cris_file_start (void); static void cris_init_libfuncs (void); +static unsigned int cris_postdbr_cmpelim (void); + static reg_class_t cris_preferred_reload_class (rtx, reg_class_t); static int cris_register_move_cost (machine_mode, reg_class_t, reg_class_t); @@ -298,6 +302,204 @@ int cris_cpu_version = CRIS_DEFAULT_CPU_VERSION; struct gcc_target targetm = TARGET_INITIALIZER; +namespace { + +const pass_data pass_data_cris_postdbr_cmpelim = +{ + RTL_PASS, /* type */ + "mach2", /* name */ + OPTGROUP_NONE, /* optinfo_flags */ + TV_MACH_DEP, /* tv_id */ + 0, /* properties_required */ + 0, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + 0, /* todo_flags_finish */ +}; + +class pass_cris_postdbr_cmpelim : public rtl_opt_pass +{ +public: + pass_cris_postdbr_cmpelim (gcc::context *ctxt) + : rtl_opt_pass (pass_data_cris_postdbr_cmpelim, ctxt) + {} + + /* opt_pass methods: */ + virtual unsigned int execute (function *) + { + return cris_postdbr_cmpelim (); + } + + /* No use running this if reorg and cmpelim aren't both run. */ + virtual bool gate (function *) + { + return + optimize > 0 + && flag_delayed_branch + && flag_compare_elim_after_reload; + } +}; + +} // anon namespace + +rtl_opt_pass * +make_pass_cris_postdbr_cmpelim (gcc::context *ctxt) +{ + return new pass_cris_postdbr_cmpelim (ctxt); +} + +/* "Cheap version" of cmpelim, making use of the opportunities opened up + by reorg. + + Go through the insns of a function and look at each actual compare + insn; considering only those that compare a register to 0. If the + previous CC-affecting insn sets the compared register or if a move + reads from it, try to change that into a CC-setting move and try to + have it recognized. Bail at labels or non-matching insns that + clobber the compared register. If successful, delete the compare. 
+ + Also, reorg isn't up to date regarding data-flow handling, so we + can't go beyond classic RTL scanning. */ + +static unsigned int +cris_postdbr_cmpelim () +{ + rtx_insn *insn; + rtx_insn *next; + rtx_insn *prev_cc_setter = 0; + rtx_insn *prev_cc_outer = 0; + rtx dccr = gen_rtx_REG (CCmode, CRIS_CC0_REGNUM); + + /* Now look for compares in the insn stream. */ + for (insn = get_insns (); insn; insn = next) + { + rtx_insn *outer_insn = insn; + rtx pat = PATTERN (insn); + + next = NEXT_INSN (outer_insn); + + /* Forget previous state when we see a label; we can't track or + merge its state. */ + if (LABEL_P (insn)) + { + prev_cc_setter = 0; + continue; + } + + if (!NONDEBUG_INSN_P (insn)) + continue; + + /* Consider filled delay slots; there might be a comparison there. + It's only the second insn in a sequence that is interesting. */ + if (GET_CODE (pat) == SEQUENCE) + insn = as_a <rtx_insn *> XVECEXP (pat, 0, 1); + /* The "else" eliminates temptations to consider an insn in a + delay slot for elimination; it can only be a prev_cc_setter. */ + else if (prev_cc_setter != 0 && GET_CODE (pat) == SET) + { + rtx dest = SET_DEST (pat); + rtx src = SET_SRC (pat); + rtx prev_set; + + if (REG_P (dest) + && REGNO (dest) == CRIS_CC0_REGNUM + && GET_CODE (src) == COMPARE + && REG_P (XEXP (src, 0)) + && XEXP (src, 1) == const0_rtx + && (prev_set = single_set (prev_cc_setter)) != 0) + { + /* We have a candidate, and a prev_cc_setter to inspect. */ + rtx reg = XEXP (src, 0); + rtx prev_dest = SET_DEST (prev_set); + rtx prev_src = SET_SRC (prev_set); + bool src_same = rtx_equal_p (prev_src, reg); + + /* If the prev_cc_setter isn't a simple SET, or if the + compared register is modified in prev_cc_setter without + being the destination, or if it's modified between + prev_cc_setter (equal to or contained in prev_cc_outer) + and this insn, then we can't use the flags result. And + of course, the SET_DEST of prev_cc_setter (the main + interest, not dccr) has to be the same register and + mode we're interested in - or the SET_SRC. We've + already checked that the compared register isn't + changed in-between. */ + if (REG_P (prev_dest) + && ! reg_set_p (reg, prev_src) + && ! reg_set_between_p (reg, prev_cc_outer, outer_insn) + && (src_same || rtx_equal_p (prev_dest, reg))) + { + machine_mode ccmode = GET_MODE (src); + rtx modeadjusted_dccr + = (ccmode == CCmode ? dccr + : gen_rtx_REG (CCmode, CRIS_CC0_REGNUM)); + rtx compare + /* We don't need to copy_rtx pat: we're going to + delete that insn. */ + = (src_same ? pat + : gen_rtx_SET (modeadjusted_dccr, + gen_rtx_COMPARE (ccmode, + copy_rtx (prev_src), + const0_rtx))); + + /* Replace tentatively, the prev_set combo that is + ((set d s) (clobber dccr)) with + ((cmp s 0) (set d s)) where (cmp s 0) is the + compare we're looking at, and validate it or fail + the whole thing. First replace the ((set d s) ...) + with ((cmp s 0) ...)). */ + validate_change (prev_cc_setter, + &XVECEXP (PATTERN (prev_cc_setter), + 0, 0), compare, true); + + /* Then the clobber with the (set d s). */ + validate_change (prev_cc_setter, + &XVECEXP (PATTERN (prev_cc_setter), + 0, 1), prev_set, true); + + if (apply_change_group ()) + { + delete_insn (insn); + + /* We eliminated the compare. Then we must go to + the next insn: we can't consider the eliminated + insn for the next prev_cc_setter. + + FIXME: if later insns still match, we could do + the delete_insn part only, for them. 
But, it + seems rare that reorg would manage to move a + second CC-clobber to another delay-slot, + leaving two identical compares (and presumably + users). */ + prev_cc_setter = 0; + continue; + } + } + } + } + + if (reg_set_p (dccr, insn)) + { + rtx pat = PATTERN (insn); + + prev_cc_setter = 0; + + /* Make sure we can use it later on, otherwise forget it. + Don't look too close, we're going to pass a lot of these. + Just make sure the structure is that we can work with. */ + if (GET_CODE (pat) == PARALLEL + && XVECLEN (pat, 0) == 2 + && GET_CODE (XVECEXP (pat, 0, 1)) == CLOBBER) + { + prev_cc_setter = insn; + prev_cc_outer = outer_insn; + } + } + } + + return 0; +} + /* Helper for cris_load_multiple_op and cris_ret_movem_op. */ bool diff --git a/gcc/config/cris/cris.md b/gcc/config/cris/cris.md index c36a540..efafb5b 100644 --- a/gcc/config/cris/cris.md +++ b/gcc/config/cris/cris.md @@ -275,7 +275,7 @@ "reload_completed" [(set (reg:CC_NZ CRIS_CC0_REGNUM) (compare:CC_NZ (match_dup 1) (const_int 0))) - (set (match_operand 0) (match_operand 1))]) + (set (match_dup 0) (match_dup 1))]) (define_subst_attr "setnzvc" "setnzvc_subst" "" "_setnzvc") (define_subst_attr "ccnzvc" "setnzvc_subst" "" "_enabled") @@ -288,7 +288,7 @@ "reload_completed" [(set (reg:CC_NZVC CRIS_CC0_REGNUM) (compare:CC_NZVC (match_dup 1) (const_int 0))) - (set (match_operand 0) (match_operand 1))]) + (set (match_dup 0) (match_dup 1))]) (define_subst_attr "setcc" "setcc_subst" "" "_setcc") (define_subst_attr "cccc" "setcc_subst" "" "_enabled") @@ -301,7 +301,7 @@ "reload_completed" [(set (reg:CC CRIS_CC0_REGNUM) (compare:CC (match_dup 1) (const_int 0))) - (set (match_operand 0) (match_operand 1))]) + (set (match_dup 0) (match_dup 1))]) ;; Operand and operator predicates. @@ -973,7 +973,6 @@ ;; The last constraint is due to that after reload, the '%' is not ;; honored, and canonicalization doesn't care about keeping the same ;; register as in destination. This will happen after insn splitting. -;; gcc <= 2.7.2. FIXME: Check for gcc-2.9x "" { @@ -1291,6 +1290,45 @@ [(set_attr "slottable" "yes") (set_attr "cc" "none")]) +;; This pattern is usually generated after reload, so a '%' is +;; ineffective; use explicit combinations. +(define_insn "*addi_b_<mode>" + [(set (match_operand:BWD 0 "register_operand" "=r,r") + (plus:BWD + (match_operand:BWD 1 "register_operand" "0,r") + (match_operand:BWD 2 "register_operand" "r,0")))] + "" + "@ + addi %2.b,%0 + addi %1.b,%0" + [(set_attr "slottable" "yes")]) + +;; Strip the dccr clobber from addM3 with register operands, if the +;; next instruction isn't using it. +;; Not clobbering dccr may let cmpelim match a later compare with a +;; previous operation of interest. This has to run before cmpelim so it +;; can't be a peephole2. See gcc.target/cris/pr93372-45.c for a +;; test-case. 
+(define_split ;; "*add<mode>3_addi" + [(parallel + [(set (match_operand:BWD 0 "register_operand") + (plus:BWD + (match_operand:BWD 1 "register_operand") + (match_operand:BWD 2 "register_operand"))) + (clobber (reg:CC CRIS_CC0_REGNUM))])] + "reload_completed" + [(set (match_dup 0) (plus:BWD (match_dup 1) (match_dup 2)))] +{ + rtx reg = operands[0]; + rtx_insn *i = next_nonnote_nondebug_insn_bb (curr_insn); + + while (i != NULL_RTX && (!INSN_P (i) || DEBUG_INSN_P (i))) + i = next_nonnote_nondebug_insn_bb (i); + + if (i == NULL_RTX || reg_mentioned_p (reg, i) || BARRIER_P (i)) + FAIL; +}) + (define_insn "<u>mul<s><mode>3" [(set (match_operand:WD 0 "register_operand" "=r") (mult:WD diff --git a/gcc/config/cris/t-cris b/gcc/config/cris/t-cris index af5535b..eb4411e 100644 --- a/gcc/config/cris/t-cris +++ b/gcc/config/cris/t-cris @@ -25,5 +25,4 @@ # section "Target Fragment" in the gcc info-files (or the paper copy) of # "Using and Porting GCC" -$(out_object_file): gt-cris.h -gt-cris.h : s-gtype ; @true +PASSES_EXTRA += $(srcdir)/config/cris/cris-passes.def diff --git a/gcc/config/csky/csky-elf.h b/gcc/config/csky/csky-elf.h index 0a319c0..a79d757 100644 --- a/gcc/config/csky/csky-elf.h +++ b/gcc/config/csky/csky-elf.h @@ -47,6 +47,8 @@ %{mcpu=*:-mcpu=%*} \ %{march=*:-march=%*} \ %{mhard-float:-mhard-float} \ + %{mfloat-abi=softfp:-mhard-float} \ + %{mfloat-abi=hard:-mhard-float} \ %{melrw:-melrw} \ %{mno-elrw:-mno-elrw} \ %{mistack:-mistack} \ @@ -68,8 +70,14 @@ %{EL:-EL} -X" #undef LIB_SPEC -#define LIB_SPEC \ - "%{pthread:-lpthread} -lc %{mccrt:-lcc-rt}" +#define LIB_SPEC "\ +%{pthread:-lpthread} \ +--start-group \ +-lc \ +%{msim:-lsemi}%{!msim:-lnosys} \ +--end-group \ +%{mccrt:-lcc-rt} \ +" /* FIXME add this to LIB_SPEC when need */ /* %{!shared:%{profile:-lc_p}%{!profile:-lc}}" */ diff --git a/gcc/config/csky/csky-linux-elf.h b/gcc/config/csky/csky-linux-elf.h index 2f052fd..cf587ae 100644 --- a/gcc/config/csky/csky-linux-elf.h +++ b/gcc/config/csky/csky-linux-elf.h @@ -47,6 +47,8 @@ %{mcpu=*:-mcpu=%*} \ %{march=*:-march=%*} \ %{mhard-float:-mhard-float} \ + %{mfloat-abi=softfp:-mhard-float} \ + %{mfloat-abi=hard:-mhard-float} \ %{melrw:-melrw} \ %{mno-elrw:-mno-elrw} \ %{mistack:-mistack} \ @@ -61,7 +63,7 @@ %{mvdsp:-mvdsp} \ " -#define GLIBC_DYNAMIC_LINKER "/lib/ld-linux-cskyv2%{mhard-float:-hf}%{mbig-endian:-be}.so.1" +#define GLIBC_DYNAMIC_LINKER "/lib/ld-linux-cskyv2%{mfloat-abi=hard:-hf}%{mbig-endian:-be}.so.1" #define LINUX_TARGET_LINK_SPEC "%{h*} %{version:-v} \ %{b} \ diff --git a/gcc/config/csky/csky-protos.h b/gcc/config/csky/csky-protos.h index cc1a033..2c02399 100644 --- a/gcc/config/csky/csky-protos.h +++ b/gcc/config/csky/csky-protos.h @@ -68,4 +68,6 @@ extern int csky_compute_pushpop_length (rtx *); extern int csky_default_branch_cost (bool, bool); extern bool csky_default_logical_op_non_short_circuit (void); + +extern void csky_init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree); #endif /* GCC_CSKY_PROTOS_H */ diff --git a/gcc/config/csky/csky.c b/gcc/config/csky/csky.c index 7ba3ed3..5aa2336 100644 --- a/gcc/config/csky/csky.c +++ b/gcc/config/csky/csky.c @@ -328,6 +328,16 @@ csky_cpu_cpp_builtins (cpp_reader *pfile) { builtin_define ("__csky_hard_float__"); builtin_define ("__CSKY_HARD_FLOAT__"); + if (TARGET_HARD_FLOAT_ABI) + { + builtin_define ("__csky_hard_float_abi__"); + builtin_define ("__CSKY_HARD_FLOAT_ABI__"); + } + if (TARGET_SINGLE_FPU) + { + builtin_define ("__csky_hard_float_fpu_sf__"); + builtin_define ("__CSKY_HARD_FLOAT_FPU_SF__"); + } } else { @@ 
-1790,9 +1800,22 @@ static rtx csky_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg) { CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v); + int reg = pcum->reg; + machine_mode mode = arg.mode; - if (*pcum < CSKY_NPARM_REGS) - return gen_rtx_REG (arg.mode, CSKY_FIRST_PARM_REGNUM + *pcum); + if (FUNCTION_VARG_MODE_P(mode) + && !pcum->is_stdarg) + { + reg = pcum->freg; + + if (reg < CSKY_NPARM_FREGS) + return gen_rtx_REG (mode, CSKY_FIRST_VFP_REGNUM + reg); + else + return NULL_RTX; + } + + if (reg < CSKY_NPARM_REGS) + return gen_rtx_REG (mode, CSKY_FIRST_PARM_REGNUM + reg); return NULL_RTX; } @@ -1802,7 +1825,7 @@ csky_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg) MODE and TYPE. */ static int -csky_num_arg_regs (machine_mode mode, const_tree type) +csky_num_arg_regs (machine_mode mode, const_tree type, bool is_stdarg) { int size; @@ -1811,6 +1834,14 @@ csky_num_arg_regs (machine_mode mode, const_tree type) else size = GET_MODE_SIZE (mode); + if (TARGET_HARD_FLOAT_ABI + && !is_stdarg) + { + if (CSKY_VREG_MODE_P(mode) + && !TARGET_SINGLE_FPU) + return ((CSKY_NUM_WORDS (size) + 1) / 2); + } + return CSKY_NUM_WORDS (size); } @@ -1822,12 +1853,23 @@ csky_function_arg_advance (cumulative_args_t pcum_v, const function_arg_info &arg) { CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v); - int param_size = csky_num_arg_regs (arg.mode, arg.type); + int *reg = &pcum->reg; + machine_mode mode = arg.mode; - if (*pcum + param_size > CSKY_NPARM_REGS) - *pcum = CSKY_NPARM_REGS; + int param_size = csky_num_arg_regs (mode, arg.type, pcum->is_stdarg); + int param_regs_nums = CSKY_NPARM_REGS; + + if (FUNCTION_VARG_MODE_P(mode) + && !pcum->is_stdarg) + { + reg = &pcum->freg; + param_regs_nums = CSKY_NPARM_FREGS; + } + + if (*reg + param_size > param_regs_nums) + *reg = param_regs_nums; else - *pcum += param_size; + *reg += param_size; } @@ -1843,6 +1885,12 @@ csky_function_value (const_tree type, const_tree func, mode = TYPE_MODE (type); size = int_size_in_bytes (type); + if (FUNCTION_VARG_MODE_P(mode)) + { + mode = promote_function_mode (type, mode, &unsignedp, func, 1); + return gen_rtx_REG (mode, CSKY_FIRST_VFP_REGNUM); + } + /* Since we promote return types, we must promote the mode here too. 
*/ if (INTEGRAL_TYPE_P (type)) { @@ -1877,6 +1925,10 @@ static rtx csky_libcall_value (machine_mode mode, const_rtx libcall ATTRIBUTE_UNUSED) { + if (FUNCTION_VARG_MODE_P(mode)) + { + return gen_rtx_REG (mode, CSKY_FIRST_VFP_REGNUM); + } return gen_rtx_REG (mode, CSKY_FIRST_RET_REGNUM); } @@ -1887,7 +1939,11 @@ csky_libcall_value (machine_mode mode, static bool csky_function_value_regno_p (const unsigned int regno) { - return (regno == CSKY_FIRST_RET_REGNUM); + if (regno == CSKY_FIRST_RET_REGNUM + || (TARGET_HARD_FLOAT_ABI + && regno == CSKY_FIRST_VFP_REGNUM)) + return true; + return false; } @@ -1912,11 +1968,16 @@ static int csky_arg_partial_bytes (cumulative_args_t pcum_v, const function_arg_info &arg) { CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v); - int param_size = csky_num_arg_regs (arg.mode, arg.type); + int param_size = csky_num_arg_regs (arg.mode, arg.type, pcum->is_stdarg); + int reg = pcum->reg; + + if (FUNCTION_VARG_MODE_P(arg.mode) + && !pcum->is_stdarg) + return 0; - if (*pcum < CSKY_NPARM_REGS - && *pcum + param_size > CSKY_NPARM_REGS) - return (CSKY_NPARM_REGS - *pcum) * UNITS_PER_WORD; + if (reg < CSKY_NPARM_REGS + && reg + param_size > CSKY_NPARM_REGS) + return (CSKY_NPARM_REGS - reg) * UNITS_PER_WORD; return 0; } @@ -1941,7 +2002,7 @@ csky_setup_incoming_varargs (cumulative_args_t pcum_v, cfun->machine->uses_anonymous_args = 1; local_cum = *pcum; csky_function_arg_advance (local_cum_v, arg); - regs_to_push = CSKY_NPARM_REGS - local_cum; + regs_to_push = CSKY_NPARM_REGS - local_cum.reg; if (regs_to_push) *pretend_size = regs_to_push * UNITS_PER_WORD; } @@ -2435,7 +2496,7 @@ csky_option_override (void) /* Create the default target_options structure. We need this early to configure the overall build target. */ target_option_default_node = target_option_current_node - = build_target_option_node (&global_options); + = build_target_option_node (&global_options, &global_options_set); csky_configure_build_target (&csky_active_target, TREE_TARGET_OPTION (target_option_default_node), @@ -2600,7 +2661,7 @@ csky_option_override (void) /* Resynchronize the saved target options. */ cl_target_option_save (TREE_TARGET_OPTION (target_option_default_node), - &global_options); + &global_options, &global_options_set); #ifdef ENABLE_TPF_DEBUG /* Don't emit DWARF4 unless specifically selected. The TPF @@ -6775,6 +6836,15 @@ csky_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2) return true; } +void +csky_init_cumulative_args (CUMULATIVE_ARGS *pcum, tree fntype, + rtx libname ATTRIBUTE_UNUSED, + tree fndecl ATTRIBUTE_UNUSED) +{ + memset(pcum, 0, sizeof(*pcum)); + if (stdarg_p (fntype)) + pcum->is_stdarg = true; +} struct gcc_target targetm = TARGET_INITIALIZER; diff --git a/gcc/config/csky/csky.h b/gcc/config/csky/csky.h index 2d5a66c..190a668 100644 --- a/gcc/config/csky/csky.h +++ b/gcc/config/csky/csky.h @@ -126,6 +126,29 @@ #define TARGET_TLS \ (CSKY_TARGET_ARCH (CK807) || CSKY_TARGET_ARCH (CK810)) +/* Run-time Target Specification. */ +#define TARGET_SOFT_FLOAT (csky_float_abi == CSKY_FLOAT_ABI_SOFT) +/* Use hardware floating point instructions. */ +#define TARGET_HARD_FLOAT (csky_float_abi != CSKY_FLOAT_ABI_SOFT) +/* Use hardware floating point calling convention. 
*/ +#define TARGET_HARD_FLOAT_ABI (csky_float_abi == CSKY_FLOAT_ABI_HARD) + +#define TARGET_SINGLE_FPU (csky_fpu_index == TARGET_FPU_fpv2_sf) +#define TARGET_DOUBLE_FPU (TARGET_HARD_FLOAT && !TARGET_SINGLE_FPU) + +#define FUNCTION_VARG_REGNO_P(REGNO) \ + (TARGET_HARD_FLOAT_ABI \ + && IN_RANGE ((REGNO), CSKY_FIRST_VFP_REGNUM, \ + CSKY_FIRST_VFP_REGNUM + CSKY_NPARM_FREGS - 1)) + +#define CSKY_VREG_MODE_P(mode) \ + ((mode) == SFmode || (mode) == DFmode) + +#define FUNCTION_VARG_MODE_P(mode) \ + (TARGET_HARD_FLOAT_ABI \ + && CSKY_VREG_MODE_P(mode) \ + && !(mode == DFmode && TARGET_SINGLE_FPU)) + /* Number of loads/stores handled by ldm/stm. */ #define CSKY_MIN_MULTIPLE_STLD 3 #define CSKY_MAX_MULTIPLE_STLD 12 @@ -353,7 +376,14 @@ extern int csky_arch_isa_features[]; /* A C type for declaring a variable that is used as the first argument of TARGET_ FUNCTION_ARG and other related values. */ -#define CUMULATIVE_ARGS int +#if !defined (USED_FOR_TARGET) +typedef struct +{ + int reg; + int freg; + bool is_stdarg; +} CUMULATIVE_ARGS; +#endif /* Initialize a variable CUM of type CUMULATIVE_ARGS for a call to a function whose data type is FNTYPE. @@ -362,15 +392,16 @@ extern int csky_arch_isa_features[]; On CSKY, the offset always starts at 0: the first parm reg is always the same reg. */ #define INIT_CUMULATIVE_ARGS(CUM, FNTYPE, LIBNAME, INDIRECT, N_NAMED_ARGS) \ - ((CUM) = 0) + csky_init_cumulative_args (&(CUM), (FNTYPE), (LIBNAME), (INDIRECT)) /* True if N is a possible register number for function argument passing. On the CSKY, r0-r3 are used to pass args. The int cast is to prevent a complaint about unsigned comparison to zero, since CSKY_FIRST_PARM_REGNUM is zero. */ -#define FUNCTION_ARG_REGNO_P(REGNO) \ - (((int)(REGNO) >= CSKY_FIRST_PARM_REGNUM) && \ - ((REGNO) < (CSKY_NPARM_REGS + CSKY_FIRST_PARM_REGNUM))) +#define FUNCTION_ARG_REGNO_P(REGNO) \ + (((REGNO) >= CSKY_FIRST_PARM_REGNUM \ + && (REGNO) < (CSKY_NPARM_REGS + CSKY_FIRST_PARM_REGNUM)) \ + || FUNCTION_VARG_REGNO_P(REGNO)) /* How Large Values Are Returned */ @@ -818,7 +849,7 @@ while (0) {"arch", "%{!march=*:%{!mcpu=*:-march=%(VALUE)}}" }, \ {"cpu", "%{!march=*:%{!mcpu=*:-mcpu=%(VALUE)}}" }, \ {"endian", "%{!mbig-endian:%{!mlittle-endian:-m%(VALUE)-endian}}" }, \ - {"float", "%{!msoft-float:%{!mhard-float:-m%(VALUE)-float}}" }, + {"float", "%{!mfloat-abi=*:-mfloat-abi=%(VALUE)}" }, /****************************************************************** diff --git a/gcc/config/csky/csky.md b/gcc/config/csky/csky.md index 2644acd..78c9b80 100644 --- a/gcc/config/csky/csky.md +++ b/gcc/config/csky/csky.md @@ -50,6 +50,7 @@ (CSKY_LAST_EH_RETDATA_REGNUM 1) (CSKY_EH_STACKADJ_REGNUM 2) (CSKY_STACKADJUST_REGNUM 4) + (CSKY_NPARM_FREGS 4) ]) ;; Supported TLS relocations. @@ -100,6 +101,7 @@ ; Support for the eh_return pattern. VUNSPEC_EH_RETURN + VUNSPEC_BLOCKAGE ]) @@ -3310,6 +3312,88 @@ force_reg (Pmode, XEXP (operands[1], 0))); }") +;; Call subroutine returning any type. + +(define_expand "untyped_call" + [(parallel [(call (match_operand 0 "" "") + (const_int 0)) + (match_operand 1 "" "") + (match_operand 2 "" "")])] + "" +{ + int i; + + emit_call_insn (gen_call (operands[0], const0_rtx)); + + for (i = 0; i < XVECLEN (operands[2], 0); i++) + { + rtx set = XVECEXP (operands[2], 0, i); + emit_move_insn (SET_DEST (set), SET_SRC (set)); + } + + /* The optimizer does not know that the call sets the function value + registers we stored in the result block. We avoid problems by + claiming that all hard registers are used and clobbered at this + point. 
*/ + emit_insn (gen_blockage ()); + + DONE; +}) + +;; UNSPEC_VOLATILE is considered to use and clobber all hard registers and +;; all of memory. This blocks insns from being moved across this point. + +(define_insn "blockage" + [(unspec_volatile [(const_int 0)] VUNSPEC_BLOCKAGE)] + "" + "" + [(set_attr "length" "0")]) + +(define_insn "*call_value_internal_vs" + [(set (match_operand:SF 0 "register_operand" "=v,v,v") + (call (mem:SI (match_operand:SI 1 "csky_call_address_operand" "b, r,S")) + (match_operand 2 "" ""))) + (clobber (reg:SI CSKY_LR_REGNUM))] + "TARGET_HARD_FLOAT_ABI" + "@ + jsr\t%1 + jsr\t%1 + jbsr\t%1" + [(set_attr "length" "2,4,4") + (set_attr "type" "call_jsr,call_jsr,call")] +) + +(define_insn "*call_value_internal_vd" + [(set (match_operand:DF 0 "register_operand" "=v,v,v") + (call (mem:SI (match_operand:SI 1 "csky_call_address_operand" "b, r,S")) + (match_operand 2 "" ""))) + (clobber (reg:SI CSKY_LR_REGNUM))] + "TARGET_HARD_FLOAT_ABI && TARGET_DOUBLE_FPU" + "@ + jsr\t%1 + jsr\t%1 + jbsr\t%1" + [(set_attr "length" "2,4,4") + (set_attr "type" "call_jsr,call_jsr,call")] +) + +(define_insn "*call_value_internal_pic_vs" + [(set (match_operand:SF 0 "register_operand" "=v") + (call (mem:SI (match_operand:SI 1 "csky_unspec_operand" "X")) + (match_operand 2 "" ""))) + (clobber (reg:SI CSKY_LR_REGNUM))] + "flag_pic && TARGET_HARD_FLOAT_ABI" + "* return csky_output_call (operands, 1);" +) + +(define_insn "*call_value_internal_pic_vd" + [(set (match_operand:DF 0 "register_operand" "=v") + (call (mem:SI (match_operand:SI 1 "csky_unspec_operand" "X")) + (match_operand 2 "" ""))) + (clobber (reg:SI CSKY_LR_REGNUM))] + "flag_pic && TARGET_HARD_FLOAT_ABI && TARGET_DOUBLE_FPU" + "* return csky_output_call (operands, 1);" +) (define_insn "*call_value_internal" [(set (match_operand 0 "register_operand" "=r,r,r") diff --git a/gcc/config/csky/csky.opt b/gcc/config/csky/csky.opt index 5846e50..505a764 100644 --- a/gcc/config/csky/csky.opt +++ b/gcc/config/csky/csky.opt @@ -57,12 +57,33 @@ Target RejectNegative Report Alias(mlittle-endian) Undocumented ;; assembly. mhard-float -Target Report RejectNegative Mask(HARD_FLOAT) -Enable hardware floating-point instructions. +Target RejectNegative Alias(mfloat-abi=, hard) Undocumented msoft-float -Target Report RejectNegative InverseMask(HARD_FLOAT) -Use library calls to perform floating-point operations (default). +Target RejectNegative Alias(mfloat-abi=, soft) Undocumented + +mfloat-abi=v2 +Target RejectNegative Alias(mfloat-abi=, hard) Undocumented + +mfloat-abi=v1 +Target RejectNegative Alias(mfloat-abi=, softfp) Undocumented + +mfloat-abi= +Target RejectNegative Joined Enum(float_abi_type) Var(csky_float_abi) Init(CSKY_FLOAT_ABI_SOFT) +Specify if floating point hardware should be used. + +Enum +Name(float_abi_type) Type(enum float_abi_type) +Known floating-point ABIs (for use with the -mfloat-abi= option): + +EnumValue +Enum(float_abi_type) String(soft) Value(CSKY_FLOAT_ABI_SOFT) + +EnumValue +Enum(float_abi_type) String(softfp) Value(CSKY_FLOAT_ABI_SOFTFP) + +EnumValue +Enum(float_abi_type) String(hard) Value(CSKY_FLOAT_ABI_HARD) mfpu= Target RejectNegative Joined Enum(csky_fpu) Var(csky_fpu_index) Init(TARGET_FPU_auto) Save @@ -171,3 +192,7 @@ Set the branch costs to roughly the specified number of instructions. msched-prolog Target Report Var(flag_sched_prolog) Init(0) Permit scheduling of function prologue and epilogue sequences. + +msim +Target +Use the simulator runtime. 
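For readers tracking the CSKY float-ABI rework in the hunks above: the old -mhard-float/-msoft-float mask is replaced by a three-way -mfloat-abi={soft,softfp,hard} option, with the old flags (and -mfloat-abi=v1/v2) kept as undocumented aliases. The sketch below is an illustrative summary, not part of the commit, of how those levels decode into the TARGET_HARD_FLOAT and TARGET_HARD_FLOAT_ABI tests added in csky.h earlier in this patch; the helper functions and the test driver are hypothetical names introduced only for this example.

    /* Illustrative sketch only; restates the option-to-macro mapping from the
       csky.opt, csky_opts.h and csky.h hunks in this patch.  */
    #include <stdio.h>

    enum float_abi_type            /* as defined in csky_opts.h below */
    {
      CSKY_FLOAT_ABI_SOFT,         /* -mfloat-abi=soft   (alias -msoft-float)   */
      CSKY_FLOAT_ABI_SOFTFP,       /* -mfloat-abi=softfp (alias -mfloat-abi=v1) */
      CSKY_FLOAT_ABI_HARD          /* -mfloat-abi=hard   (aliases -mhard-float,
                                                          -mfloat-abi=v2)       */
    };

    /* Mirrors TARGET_HARD_FLOAT: FPU instructions may be emitted.  */
    static int
    uses_fpu_insns (enum float_abi_type abi)
    {
      return abi != CSKY_FLOAT_ABI_SOFT;
    }

    /* Mirrors TARGET_HARD_FLOAT_ABI: FP arguments and return values go in
       VFP registers (see the csky_function_arg changes above).  */
    static int
    uses_fp_arg_regs (enum float_abi_type abi)
    {
      return abi == CSKY_FLOAT_ABI_HARD;
    }

    int
    main (void)
    {
      static const char *const names[] = { "soft", "softfp", "hard" };
      for (int abi = CSKY_FLOAT_ABI_SOFT; abi <= CSKY_FLOAT_ABI_HARD; abi++)
        printf ("-mfloat-abi=%-6s  fpu-insns=%d  fp-arg-regs=%d\n",
                names[abi], uses_fpu_insns (abi), uses_fp_arg_regs (abi));
      return 0;
    }

The same three levels also select the multilib directories (soft, soft-fp, hard-fp) in the t-csky-elf and t-csky-linux fragments later in this section, replacing the old two-way soft/hard split.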
diff --git a/gcc/config/csky/csky_opts.h b/gcc/config/csky/csky_opts.h index a6dbf5a..7ee56be 100644 --- a/gcc/config/csky/csky_opts.h +++ b/gcc/config/csky/csky_opts.h @@ -59,5 +59,12 @@ enum csky_fpu_type }; #define CSKY_TARGET_FPU_GET(name) TARGET_FPU_ ## name +enum float_abi_type +{ + CSKY_FLOAT_ABI_SOFT, + CSKY_FLOAT_ABI_SOFTFP, + CSKY_FLOAT_ABI_HARD +}; + #endif /* CSKY_OPTS_H */ diff --git a/gcc/config/csky/t-csky-elf b/gcc/config/csky/t-csky-elf index cd690bc..62a2d83 100644 --- a/gcc/config/csky/t-csky-elf +++ b/gcc/config/csky/t-csky-elf @@ -100,8 +100,11 @@ MULTILIB_MATCHES += mcpu?ck807f=march?ck807ef MULTILIB_MATCHES += mcpu?ck807f=march?ck807 MULTILIB_MATCHES += mcpu?ck807f=mcpu?ck807 -# For option -msoft-float/-mhard-float. -MULTILIB_OPTIONS += msoft-float/mhard-float -MULTILIB_DIRNAMES += soft-fp hard-fp -MULTILIB_EXCEPTIONS += *mcpu=ck801/*mhard-float* -MULTILIB_EXCEPTIONS += *mcpu=ck802/*mhard-float* +# For option -mfloat-abi= +MULTILIB_OPTIONS += mfloat-abi=soft/mfloat-abi=softfp/mfloat-abi=hard +MULTILIB_DIRNAMES += soft soft-fp hard-fp + +MULTILIB_EXCEPTIONS += *mcpu=ck801/*mfloat-abi=softfp* +MULTILIB_EXCEPTIONS += *mcpu=ck802/*mfloat-abi=softfp* +MULTILIB_EXCEPTIONS += *mcpu=ck801/*mfloat-abi=hard* +MULTILIB_EXCEPTIONS += *mcpu=ck802/*mfloat-abi=hard* diff --git a/gcc/config/csky/t-csky-linux b/gcc/config/csky/t-csky-linux index 16656c3..f4d656a 100644 --- a/gcc/config/csky/t-csky-linux +++ b/gcc/config/csky/t-csky-linux @@ -20,14 +20,8 @@ # <http://www.gnu.org/licenses/>. -# Endiannesses. -MULTILIB_OPTIONS = mlittle-endian/mbig-endian -MULTILIB_DIRNAMES = little big -MULTILIB_MATCHES = mlittle-endian=EL -MULTILIB_MATCHES = mbig-endian=EB - MULTILIB_EXCEPTIONS = -CSKY_MULTILIB_OSDIRNAMES = mbig-endian=/big mlittle-endian=/. mhard-float=/hard-fp msoft-float=/. mcpu.ck810f=/. mcpu.ck807f=/ck807 +CSKY_MULTILIB_OSDIRNAMES = mfloat-abi.softfp=/soft-fp mfloat-abi.hard=/hard-fp mfloat-abi.soft=/. mcpu.ck810f=/. mcpu.ck807f=/ck807 # Arch variants. MULTILIB_OPTIONS += mcpu=ck810f/mcpu=ck807f @@ -47,6 +41,6 @@ MULTILIB_MATCHES += mcpu?ck810f=mcpu?ck810vf MULTILIB_MATCHES += mcpu?ck810f=mcpu?ck810ft MULTILIB_MATCHES += mcpu?ck810f=mcpu?ck810vft -# For option -msoft-float/-mhard-float. -MULTILIB_OPTIONS += msoft-float/mhard-float -MULTILIB_DIRNAMES += soft-fp hard-fp +# For option -mfloat-abi= +MULTILIB_OPTIONS += mfloat-abi=soft/mfloat-abi=softfp/mfloat-abi=hard +MULTILIB_DIRNAMES += soft soft-fp hard-fp diff --git a/gcc/config/darwin-protos.h b/gcc/config/darwin-protos.h index 54cd1e4..49c540f 100644 --- a/gcc/config/darwin-protos.h +++ b/gcc/config/darwin-protos.h @@ -125,6 +125,6 @@ extern bool darwin_kextabi_p (void); extern void darwin_override_options (void); extern void darwin_patch_builtins (void); extern void darwin_rename_builtins (void); -extern bool darwin_libc_has_function (enum function_class fn_class); +extern bool darwin_libc_has_function (enum function_class fn_class, tree); #endif /* CONFIG_DARWIN_PROTOS_H */ diff --git a/gcc/config/darwin-sections.def b/gcc/config/darwin-sections.def index 98677f6..65bf5ad 100644 --- a/gcc/config/darwin-sections.def +++ b/gcc/config/darwin-sections.def @@ -198,3 +198,18 @@ DEF_SECTION (objc2_image_info_section, 0, ".section __DATA, __objc_imageinfo, regular, no_dead_strip", 1) DEF_SECTION (objc2_constant_string_object_section, 0, ".section __DATA, __objc_stringobj, regular, no_dead_strip", 1) + +/* Additions for compatibility with later runtime conventions especially for + sections containing strings. 
*/ +DEF_SECTION (objc2_data_section, 0, ".section __DATA, __data", 1) + +DEF_SECTION (objc2_ivar_section, 0, ".section __DATA, __objc_ivar", 1) + +DEF_SECTION (objc2_class_names_section, 0, + ".section __TEXT, __objc_classname, cstring_literals", 1) + +DEF_SECTION (objc2_method_names_section, 0, + ".section __TEXT, __objc_methname, cstring_literals", 1) + +DEF_SECTION (objc2_method_types_section, 0, + ".section __TEXT, __objc_methtype, cstring_literals", 1) diff --git a/gcc/config/darwin.c b/gcc/config/darwin.c index d3c0af8..dd4857f 100644 --- a/gcc/config/darwin.c +++ b/gcc/config/darwin.c @@ -18,8 +18,6 @@ You should have received a copy of the GNU General Public License along with GCC; see the file COPYING3. If not see <http://www.gnu.org/licenses/>. */ -#define IN_TARGET_CODE 1 - #include "config.h" #include "system.h" #include "coretypes.h" @@ -138,7 +136,7 @@ output_objc_section_asm_op (const void *directive) order in the object. The code below implements this by emitting a section header for each ObjC section the first time that an ObjC section is requested. */ - if (! been_here) + if (darwin_symbol_stubs && ! been_here) { section *saved_in_section = in_section; static const enum darwin_section_enum tomark[] = @@ -176,20 +174,23 @@ output_objc_section_asm_op (const void *directive) /* ABI=2 */ static const enum darwin_section_enum tomarkv2[] = { + objc2_method_names_section, objc2_message_refs_section, + objc2_selector_refs_section, + objc2_ivar_section, objc2_classdefs_section, objc2_metadata_section, objc2_classrefs_section, + objc2_class_names_section, objc2_classlist_section, objc2_categorylist_section, - objc2_selector_refs_section, objc2_nonlazy_class_section, objc2_nonlazy_category_section, objc2_protocollist_section, objc2_protocolrefs_section, objc2_super_classrefs_section, + objc2_constant_string_object_section, objc2_image_info_section, - objc2_constant_string_object_section } ; size_t i; @@ -1351,34 +1352,40 @@ darwin_mergeable_constant_section (tree exp, unsigned HOST_WIDE_INT align, bool zsize) { - machine_mode mode = DECL_MODE (exp); - unsigned int modesize = GET_MODE_BITSIZE (mode); - if (zsize) return darwin_sections[zobj_const_section]; - if (flag_merge_constants - && mode != VOIDmode - && mode != BLKmode - && modesize <= align - && align >= 8 - && align <= 256 - && (align & (align -1)) == 0) - { - tree size = TYPE_SIZE_UNIT (TREE_TYPE (exp)); + machine_mode mode = DECL_MODE (exp); + if (!flag_merge_constants + || mode == VOIDmode + || mode == BLKmode + || align < 8 + || align > 256 + || (align & (align -1)) != 0) + return readonly_data_section; - if (TREE_CODE (size) == INTEGER_CST) - { - if (wi::to_wide (size) == 4) - return darwin_sections[literal4_section]; - else if (wi::to_wide (size) == 8) - return darwin_sections[literal8_section]; - else if (HAVE_GAS_LITERAL16 - && TARGET_64BIT - && wi::to_wide (size) == 16) - return darwin_sections[literal16_section]; - } - } + /* This will ICE if the mode is not a constant size, but that is reasonable, + since one cannot put a variable-sized thing into a constant section, we + shouldn't be trying. 
*/ + const unsigned int modesize = GET_MODE_BITSIZE (mode).to_constant (); + + if (modesize > align) + return readonly_data_section; + + tree size = TYPE_SIZE_UNIT (TREE_TYPE (exp)); + + if (TREE_CODE (size) != INTEGER_CST) + return readonly_data_section; + + unsigned isize = TREE_INT_CST_LOW (size); + if (isize == 4) + return darwin_sections[literal4_section]; + else if (isize == 8) + return darwin_sections[literal8_section]; + else if (HAVE_GAS_LITERAL16 + && TARGET_64BIT + && isize == 16) + return darwin_sections[literal16_section]; return readonly_data_section; } @@ -1432,7 +1439,7 @@ darwin_objc2_section (tree decl ATTRIBUTE_UNUSED, tree meta, section * base) gcc_assert (TREE_CODE (ident) == IDENTIFIER_NODE); p = IDENTIFIER_POINTER (ident); - gcc_checking_assert (flag_next_runtime == 1 && flag_objc_abi == 2); + gcc_checking_assert (flag_next_runtime >= 1 && flag_objc_abi == 2); objc_metadata_seen = 1; @@ -1443,11 +1450,20 @@ darwin_objc2_section (tree decl ATTRIBUTE_UNUSED, tree meta, section * base) first. */ if (!strncmp (p, "V2_BASE", 7)) return base; + else if (!strncmp (p, "V2_CNAM", 7)) + return darwin_sections[objc2_class_names_section]; + else if (!strncmp (p, "V2_MNAM", 7)) + return darwin_sections[objc2_method_names_section]; + else if (!strncmp (p, "V2_MTYP", 7)) + return darwin_sections[objc2_method_types_section]; else if (!strncmp (p, "V2_STRG", 7)) return darwin_sections[cstring_section]; else if (!strncmp (p, "G2_META", 7) || !strncmp (p, "G2_CLAS", 7)) return darwin_sections[objc2_classdefs_section]; + else if (!strncmp (p, "V2_PCOL", 7)) + return ld_uses_coal_sects ? darwin_sections[data_coal_section] + : darwin_sections[objc2_data_section]; else if (!strncmp (p, "V2_MREF", 7)) return darwin_sections[objc2_message_refs_section]; else if (!strncmp (p, "V2_CLRF", 7)) @@ -1483,6 +1499,9 @@ darwin_objc2_section (tree decl ATTRIBUTE_UNUSED, tree meta, section * base) else if (!strncmp (p, "V2_CSTR", 7)) return darwin_sections[objc2_constant_string_object_section]; + else if (!strncmp (p, "V2_IVRF", 7)) + return darwin_sections[objc2_ivar_section]; + /* Not recognized, default. */ return base; } @@ -1496,7 +1515,7 @@ darwin_objc1_section (tree decl ATTRIBUTE_UNUSED, tree meta, section * base) gcc_assert (TREE_CODE (ident) == IDENTIFIER_NODE); p = IDENTIFIER_POINTER (ident); - gcc_checking_assert (flag_next_runtime == 1 && flag_objc_abi < 2); + gcc_checking_assert (flag_next_runtime >= 1 && flag_objc_abi < 2); objc_metadata_seen = 1; @@ -1747,19 +1766,19 @@ section * machopic_select_rtx_section (machine_mode mode, rtx x, unsigned HOST_WIDE_INT align ATTRIBUTE_UNUSED) { - if (GET_MODE_SIZE (mode) == 8 + if (known_eq (GET_MODE_SIZE (mode), 8) && (GET_CODE (x) == CONST_INT || GET_CODE (x) == CONST_WIDE_INT || GET_CODE (x) == CONST_DOUBLE)) return darwin_sections[literal8_section]; - else if (GET_MODE_SIZE (mode) == 4 + else if (known_eq (GET_MODE_SIZE (mode), 4) && (GET_CODE (x) == CONST_INT || GET_CODE (x) == CONST_WIDE_INT || GET_CODE (x) == CONST_DOUBLE)) return darwin_sections[literal4_section]; else if (HAVE_GAS_LITERAL16 && TARGET_64BIT - && GET_MODE_SIZE (mode) == 16 + && known_eq (GET_MODE_SIZE (mode), 16) && (GET_CODE (x) == CONST_INT || GET_CODE (x) == CONST_WIDE_INT || GET_CODE (x) == CONST_DOUBLE @@ -1857,6 +1876,14 @@ darwin_globalize_label (FILE *stream, const char *name) { if (!!strncmp (name, "_OBJC_", 6)) default_globalize_label (stream, name); + /* We have some Objective C cases that need to be global, but only on newer + OS versions. 
*/ + if (flag_objc_abi < 2 || flag_next_runtime < 100700) + return; + if (!strncmp (name+6, "LabelPro", 8)) + default_globalize_label (stream, name); + if (!strncmp (name+6, "Protocol_", 9)) + default_globalize_label (stream, name); } /* This routine returns non-zero if 'name' starts with the special objective-c @@ -1875,7 +1902,49 @@ darwin_label_is_anonymous_local_objc_name (const char *name) while (*p >= '0' && *p <= '9') p++; } - return (!strncmp ((const char *)p, "_OBJC_", 6)); + if (strncmp ((const char *)p, "_OBJC_", 6) != 0) + return false; + + /* We need some of the objective c meta-data symbols to be visible to the + linker (when the target OS version is newer). FIXME: this is horrible, + we need a better mechanism. */ + + if (flag_objc_abi < 2 || flag_next_runtime < 100700) + return true; + + p += 6; + if (!strncmp ((const char *)p, "ClassRef", 8)) + return false; + else if (!strncmp ((const char *)p, "SelRef", 6)) + return false; + else if (!strncmp ((const char *)p, "Category", 8)) + { + if (p[8] == '_' || p[8] == 'I' || p[8] == 'P' || p[8] == 'C' ) + return false; + return true; + } + else if (!strncmp ((const char *)p, "ClassMethods", 12)) + return false; + else if (!strncmp ((const char *)p, "Instance", 8)) + { + if (p[8] == 'I' || p[8] == 'M') + return false; + return true; + } + else if (!strncmp ((const char *)p, "CLASS_RO", 8)) + return false; + else if (!strncmp ((const char *)p, "METACLASS_RO", 12)) + return false; + else if (!strncmp ((const char *)p, "Protocol", 8)) + { + if (p[8] == '_' || p[8] == 'I' || p[8] == 'P' + || p[8] == 'M' || p[8] == 'C' || p[8] == 'O') + return false; + return true; + } + else if (!strncmp ((const char *)p, "LabelPro", 8)) + return false; + return true; } /* LTO support for Mach-O. @@ -2380,11 +2449,7 @@ darwin_emit_local_bss (FILE *fp, tree decl, const char *name, unsigned HOST_WIDE_INT size, unsigned int l2align) { - /* FIXME: We have a fudge to make this work with Java even when the target does - not use sections anchors -- Java seems to need at least one small item in a - non-zerofill segment. */ - if ((DARWIN_SECTION_ANCHORS && flag_section_anchors && size < BYTES_ZFILL) - || (size && size <= 2)) + if (DARWIN_SECTION_ANCHORS && flag_section_anchors && size < BYTES_ZFILL) { /* Put smaller objects in _static_data, where the section anchors system can get them. @@ -2410,16 +2475,13 @@ darwin_emit_local_bss (FILE *fp, tree decl, const char *name, } else { - /* When we are on a non-section anchor target, we can get zero-sized - items here. However, all we need to do is to bump them to one byte - and the section alignment will take care of the rest. */ + /* When we are on a non-section anchor target (or not using section + anchors, we can get zero-sized items here. However, all we need to + do is to bump them to one byte and the section alignment will take + care of the rest. */ char secnam[64]; - unsigned int flags ; - snprintf (secnam, 64, "__DATA,__%sbss%u", ((size)?"":"zo_"), - (unsigned) l2align); - /* We can't anchor (yet, if ever) in zerofill sections, because we can't - switch to them and emit a label. 
*/ - flags = SECTION_BSS|SECTION_WRITE|SECTION_NO_ANCHOR; + snprintf (secnam, 64, "__DATA,__bss"); + unsigned int flags = SECTION_BSS|SECTION_WRITE|SECTION_NO_ANCHOR; in_section = get_section (secnam, flags, NULL); fprintf (fp, "\t.zerofill %s,", secnam); assemble_name (fp, name); @@ -2430,7 +2492,7 @@ darwin_emit_local_bss (FILE *fp, tree decl, const char *name, fprintf (fp, "," HOST_WIDE_INT_PRINT_UNSIGNED",%u\n", size, (unsigned) l2align); else - fprintf (fp, "," HOST_WIDE_INT_PRINT_UNSIGNED"\n", size); + fprintf (fp, "," HOST_WIDE_INT_PRINT_UNSIGNED",0\n", size); } (*targetm.encode_section_info) (decl, DECL_RTL (decl), false); @@ -2555,9 +2617,8 @@ fprintf (fp, "# albss: %s (%lld,%d) ro %d cst %d stat %d com %d" return; } - /* So we have a public symbol (small item fudge for Java, see above). */ - if ((DARWIN_SECTION_ANCHORS && flag_section_anchors && size < BYTES_ZFILL) - || (size && size <= 2)) + /* So we have a public symbol. */ + if (DARWIN_SECTION_ANCHORS && flag_section_anchors && size < BYTES_ZFILL) { /* Put smaller objects in data, where the section anchors system can get them. However, if they are zero-sized punt them to yet a different @@ -2582,16 +2643,10 @@ fprintf (fp, "# albss: %s (%lld,%d) ro %d cst %d stat %d com %d" } else { + /* Section anchors not in use. */ + unsigned int flags = SECTION_BSS|SECTION_WRITE|SECTION_NO_ANCHOR; char secnam[64]; - unsigned int flags ; - /* When we are on a non-section anchor target, we can get zero-sized - items here. However, all we need to do is to bump them to one byte - and the section alignment will take care of the rest. */ - snprintf (secnam, 64, "__DATA,__%spu_bss%u", ((size)?"":"zo_"), l2align); - - /* We can't anchor in zerofill sections, because we can't switch - to them and emit a label. */ - flags = SECTION_BSS|SECTION_WRITE|SECTION_NO_ANCHOR; + snprintf (secnam, 64, "__DATA,__common"); in_section = get_section (secnam, flags, NULL); fprintf (fp, "\t.zerofill %s,", secnam); assemble_name (fp, name); @@ -2601,7 +2656,7 @@ fprintf (fp, "# albss: %s (%lld,%d) ro %d cst %d stat %d com %d" if (l2align) fprintf (fp, "," HOST_WIDE_INT_PRINT_UNSIGNED",%u\n", size, l2align); else - fprintf (fp, "," HOST_WIDE_INT_PRINT_UNSIGNED"\n", size); + fprintf (fp, "," HOST_WIDE_INT_PRINT_UNSIGNED",0\n", size); } (* targetm.encode_section_info) (decl, DECL_RTL (decl), false); } @@ -3137,10 +3192,14 @@ darwin_override_options (void) /* Keep track of which (major) version we're generating code for. */ if (darwin_macosx_version_min) { - if (strverscmp (darwin_macosx_version_min, "10.6") >= 0) + if (strverscmp (darwin_macosx_version_min, "10.7") >= 0) + generating_for_darwin_version = 11; + else if (strverscmp (darwin_macosx_version_min, "10.6") >= 0) generating_for_darwin_version = 10; else if (strverscmp (darwin_macosx_version_min, "10.5") >= 0) generating_for_darwin_version = 9; + else if (strverscmp (darwin_macosx_version_min, "10.4") >= 0) + generating_for_darwin_version = 8; /* Earlier versions are not specifically accounted, until required. */ } @@ -3156,6 +3215,20 @@ darwin_override_options (void) should check for correctness re. the ABI. TODO: check and provide the flags (runtime & ABI) from the lto wrapper). */ + /* At present, make a hard update to the runtime version based on the target + OS version. 
*/ + if (flag_next_runtime) + { + if (generating_for_darwin_version > 10) + flag_next_runtime = 100705; + else if (generating_for_darwin_version > 9) + flag_next_runtime = 100608; + else if (generating_for_darwin_version > 8) + flag_next_runtime = 100508; + else + flag_next_runtime = 100000; + } + /* Unless set, force ABI=2 for NeXT and m64, 0 otherwise. */ if (!global_options_set.x_flag_objc_abi) global_options.x_flag_objc_abi @@ -3538,10 +3611,12 @@ darwin_rename_builtins (void) } bool -darwin_libc_has_function (enum function_class fn_class) +darwin_libc_has_function (enum function_class fn_class, + tree type ATTRIBUTE_UNUSED) { if (fn_class == function_sincos) - return false; + return (strverscmp (darwin_macosx_version_min, "10.9") >= 0); + if (fn_class == function_c99_math_complex || fn_class == function_c99_misc) return (TARGET_64BIT diff --git a/gcc/config/darwin.h b/gcc/config/darwin.h index f528b17..f9d4fec 100644 --- a/gcc/config/darwin.h +++ b/gcc/config/darwin.h @@ -107,7 +107,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see /* Default to using the NeXT-style runtime, since that's what is pre-installed on Darwin systems. */ -#define NEXT_OBJC_RUNTIME 1 +#define NEXT_OBJC_RUNTIME 100508 /* Don't default to pcc-struct-return, because gcc is the only compiler, and we want to retain compatibility with older gcc versions. */ @@ -476,6 +476,7 @@ extern GTY(()) int darwin_ms_struct; debugging data. */ #define ASM_DEBUG_SPEC "%{g*:%{%:debug-level-gt(0):%{!gdwarf*:--gstabs}}}" +#define ASM_DEBUG_OPTION_SPEC "" #define ASM_FINAL_SPEC \ "%{gsplit-dwarf:%ngsplit-dwarf is not supported on this platform} %<gsplit-dwarf" @@ -652,6 +653,7 @@ extern GTY(()) int darwin_ms_struct; that the name *is* defined in this module, so it doesn't need to make them indirect. */ +#undef ASM_DECLARE_FUNCTION_NAME #define ASM_DECLARE_FUNCTION_NAME(FILE, NAME, DECL) \ do { \ const char *xname = NAME; \ @@ -1059,7 +1061,9 @@ extern void darwin_driver_init (unsigned int *,struct cl_decoded_option **); providing an osx-version-min of this unless overridden by the User. 10.5 is the only version that fully supports all our archs so that's the fall-back default. */ +#ifndef DEF_MIN_OSX_VERSION #define DEF_MIN_OSX_VERSION "10.5" +#endif /* Later versions of ld64 support coalescing weak code/data without requiring that they be placed in specially identified sections. This is the earliest diff --git a/gcc/config/darwin9.h b/gcc/config/darwin9.h index b7bdf63..787aca7 100644 --- a/gcc/config/darwin9.h +++ b/gcc/config/darwin9.h @@ -41,6 +41,9 @@ along with GCC; see the file COPYING3. If not see #undef ASM_DEBUG_SPEC #define ASM_DEBUG_SPEC "%{g*:%{%:debug-level-gt(0):%{gstabs:--gstabs}}}" +#undef ASM_DEBUG_OPTION_SPEC +#define ASM_DEBUG_OPTION_SPEC "" + #undef ASM_OUTPUT_ALIGNED_COMMON #define ASM_OUTPUT_ALIGNED_COMMON(FILE, NAME, SIZE, ALIGN) \ do { \ diff --git a/gcc/config/gcn/gcn-passes.def b/gcc/config/gcn/gcn-passes.def deleted file mode 100644 index bcf928d..0000000 --- a/gcc/config/gcn/gcn-passes.def +++ /dev/null @@ -1,19 +0,0 @@ -/* Copyright (C) 2017-2020 Free Software Foundation, Inc. - - This file is part of GCC. - - GCC is free software; you can redistribute it and/or modify it under - the terms of the GNU General Public License as published by the Free - Software Foundation; either version 3, or (at your option) any later - version. 
- - GCC is distributed in the hope that it will be useful, but WITHOUT ANY - WARRANTY; without even the implied warranty of MERCHANTABILITY or - FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - for more details. - - You should have received a copy of the GNU General Public License - along with GCC; see the file COPYING3. If not see - <http://www.gnu.org/licenses/>. */ - -INSERT_PASS_AFTER (pass_omp_target_link, 1, pass_omp_gcn); diff --git a/gcc/config/gcn/gcn-run.c b/gcc/config/gcn/gcn-run.c index 8961ea1..31f14f3 100644 --- a/gcc/config/gcn/gcn-run.c +++ b/gcc/config/gcn/gcn-run.c @@ -34,24 +34,6 @@ #include <elf.h> #include <signal.h> -/* These probably won't be in elf.h for a while. */ -#ifndef R_AMDGPU_NONE -#define R_AMDGPU_NONE 0 -#define R_AMDGPU_ABS32_LO 1 /* (S + A) & 0xFFFFFFFF */ -#define R_AMDGPU_ABS32_HI 2 /* (S + A) >> 32 */ -#define R_AMDGPU_ABS64 3 /* S + A */ -#define R_AMDGPU_REL32 4 /* S + A - P */ -#define R_AMDGPU_REL64 5 /* S + A - P */ -#define R_AMDGPU_ABS32 6 /* S + A */ -#define R_AMDGPU_GOTPCREL 7 /* G + GOT + A - P */ -#define R_AMDGPU_GOTPCREL32_LO 8 /* (G + GOT + A - P) & 0xFFFFFFFF */ -#define R_AMDGPU_GOTPCREL32_HI 9 /* (G + GOT + A - P) >> 32 */ -#define R_AMDGPU_REL32_LO 10 /* (S + A - P) & 0xFFFFFFFF */ -#define R_AMDGPU_REL32_HI 11 /* (S + A - P) >> 32 */ -#define reserved 12 -#define R_AMDGPU_RELATIVE64 13 /* B + A */ -#endif - #include "hsa.h" #ifndef HSA_RUNTIME_LIB diff --git a/gcc/config/gcn/gcn-tree.c b/gcc/config/gcn/gcn-tree.c index 4dcb179..4304f13 100644 --- a/gcc/config/gcn/gcn-tree.c +++ b/gcc/config/gcn/gcn-tree.c @@ -46,125 +46,6 @@ #include "langhooks-def.h" /* }}} */ -/* {{{ OMP GCN pass. - - This pass is intended to make any GCN-specfic transformations to OpenMP - target regions. - - At present, its only purpose is to convert some "omp" built-in functions - to use closer-to-the-metal "gcn" built-in functions. 
*/ - -unsigned int -execute_omp_gcn (void) -{ - tree thr_num_tree = builtin_decl_explicit (BUILT_IN_OMP_GET_THREAD_NUM); - tree thr_num_id = DECL_NAME (thr_num_tree); - tree team_num_tree = builtin_decl_explicit (BUILT_IN_OMP_GET_TEAM_NUM); - tree team_num_id = DECL_NAME (team_num_tree); - basic_block bb; - gimple_stmt_iterator gsi; - unsigned int todo = 0; - - FOR_EACH_BB_FN (bb, cfun) - for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) - { - gimple *call = gsi_stmt (gsi); - tree decl; - - if (is_gimple_call (call) && (decl = gimple_call_fndecl (call))) - { - tree decl_id = DECL_NAME (decl); - tree lhs = gimple_get_lhs (call); - - if (decl_id == thr_num_id) - { - if (dump_file && (dump_flags & TDF_DETAILS)) - fprintf (dump_file, - "Replace '%s' with __builtin_gcn_dim_pos.\n", - IDENTIFIER_POINTER (decl_id)); - - /* Transform this: - lhs = __builtin_omp_get_thread_num () - to this: - lhs = __builtin_gcn_dim_pos (1) */ - tree fn = targetm.builtin_decl (GCN_BUILTIN_OMP_DIM_POS, 0); - tree fnarg = build_int_cst (unsigned_type_node, 1); - gimple *stmt = gimple_build_call (fn, 1, fnarg); - gimple_call_set_lhs (stmt, lhs); - gsi_replace (&gsi, stmt, true); - - todo |= TODO_update_ssa; - } - else if (decl_id == team_num_id) - { - if (dump_file && (dump_flags & TDF_DETAILS)) - fprintf (dump_file, - "Replace '%s' with __builtin_gcn_dim_pos.\n", - IDENTIFIER_POINTER (decl_id)); - - /* Transform this: - lhs = __builtin_omp_get_team_num () - to this: - lhs = __builtin_gcn_dim_pos (0) */ - tree fn = targetm.builtin_decl (GCN_BUILTIN_OMP_DIM_POS, 0); - tree fnarg = build_zero_cst (unsigned_type_node); - gimple *stmt = gimple_build_call (fn, 1, fnarg); - gimple_call_set_lhs (stmt, lhs); - gsi_replace (&gsi, stmt, true); - - todo |= TODO_update_ssa; - } - } - } - - return todo; -} - -namespace -{ - - const pass_data pass_data_omp_gcn = { - GIMPLE_PASS, - "omp_gcn", /* name */ - OPTGROUP_NONE, /* optinfo_flags */ - TV_NONE, /* tv_id */ - 0, /* properties_required */ - 0, /* properties_provided */ - 0, /* properties_destroyed */ - 0, /* todo_flags_start */ - TODO_df_finish, /* todo_flags_finish */ - }; - - class pass_omp_gcn : public gimple_opt_pass - { - public: - pass_omp_gcn (gcc::context *ctxt) - : gimple_opt_pass (pass_data_omp_gcn, ctxt) - { - } - - /* opt_pass methods: */ - virtual bool gate (function *) - { - return flag_openmp; - } - - virtual unsigned int execute (function *) - { - return execute_omp_gcn (); - } - - }; /* class pass_omp_gcn. */ - -} /* anon namespace. */ - -gimple_opt_pass * -make_pass_omp_gcn (gcc::context *ctxt) -{ - return new pass_omp_gcn (ctxt); -} - -/* }}} */ /* {{{ OpenACC reductions. */ /* Global lock variable, needed for 128bit worker & gang reductions. */ @@ -456,7 +337,7 @@ gcn_goacc_get_worker_red_decl (tree type, unsigned offset) varpool_node::finalize_decl (decl); - vec_safe_grow_cleared (machfun->reduc_decls, offset + 1); + vec_safe_grow_cleared (machfun->reduc_decls, offset + 1, true); (*machfun->reduc_decls)[offset] = decl; return decl; diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md index 26559ff..e4d7f2a 100644 --- a/gcc/config/gcn/gcn-valu.md +++ b/gcc/config/gcn/gcn-valu.md @@ -923,7 +923,7 @@ { addr_space_t as = INTVAL (operands[3]); static char buf[200]; - sprintf (buf, "ds_write%%b2\t%%0, %%2 offset:%%1%s", + sprintf (buf, "ds_write%%b2\t%%0, %%2 offset:%%1%s\;s_waitcnt\tlgkmcnt(0)", (AS_GDS_P (as) ? 
" gds" : "")); return buf; } diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c index babecef7..e868a8d 100644 --- a/gcc/config/gcn/gcn.c +++ b/gcc/config/gcn/gcn.c @@ -350,6 +350,19 @@ static const struct attribute_spec gcn_attribute_table[] = { /* }}} */ /* {{{ Registers and modes. */ +/* Implement TARGET_SCALAR_MODE_SUPPORTED_P. */ + +bool +gcn_scalar_mode_supported_p (scalar_mode mode) +{ + return (mode == BImode + || mode == QImode + || mode == HImode /* || mode == HFmode */ + || mode == SImode || mode == SFmode + || mode == DImode || mode == DFmode + || mode == TImode); +} + /* Implement TARGET_CLASS_MAX_NREGS. Return the number of hard registers needed to hold a value of MODE in @@ -462,7 +475,8 @@ gcn_hard_regno_mode_ok (unsigned int regno, machine_mode mode) return (vgpr_1reg_mode_p (mode) || (!((regno - FIRST_VGPR_REG) & 1) && vgpr_2reg_mode_p (mode)) /* TImode is used by DImode compare_and_swap. */ - || mode == TImode); + || (mode == TImode + && !((regno - FIRST_VGPR_REG) & 3))); return false; } @@ -4961,26 +4975,28 @@ gcn_fixup_accel_lto_options (tree fndecl) if (!func_optimize) return; - tree old_optimize = build_optimization_node (&global_options); + tree old_optimize + = build_optimization_node (&global_options, &global_options_set); tree new_optimize; /* If the function changed the optimization levels as well as setting target options, start with the optimizations specified. */ if (func_optimize != old_optimize) - cl_optimization_restore (&global_options, + cl_optimization_restore (&global_options, &global_options_set, TREE_OPTIMIZATION (func_optimize)); gcn_option_override (); /* The target attributes may also change some optimization flags, so update the optimization options if necessary. */ - new_optimize = build_optimization_node (&global_options); + new_optimize = build_optimization_node (&global_options, + &global_options_set); if (old_optimize != new_optimize) { DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize; - cl_optimization_restore (&global_options, + cl_optimization_restore (&global_options, &global_options_set, TREE_OPTIMIZATION (old_optimize)); } } @@ -6331,6 +6347,8 @@ gcn_dwarf_register_span (rtx rtl) #define TARGET_SECONDARY_RELOAD gcn_secondary_reload #undef TARGET_SECTION_TYPE_FLAGS #define TARGET_SECTION_TYPE_FLAGS gcn_section_type_flags +#undef TARGET_SCALAR_MODE_SUPPORTED_P +#define TARGET_SCALAR_MODE_SUPPORTED_P gcn_scalar_mode_supported_p #undef TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P #define TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P \ gcn_small_register_classes_for_mode_p diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md index 8cfb3a8..763e770 100644 --- a/gcc/config/gcn/gcn.md +++ b/gcc/config/gcn/gcn.md @@ -67,6 +67,7 @@ UNSPECV_ICACHE_INV]) (define_c_enum "unspec" [ + UNSPEC_ADDPTR UNSPEC_VECTOR UNSPEC_BPERMUTE UNSPEC_SGPRBASE @@ -554,7 +555,7 @@ flat_load_dword\t%0, %A1%O1%g1\;s_waitcnt\t0 flat_store_dword\t%A0, %1%O0%g0 v_mov_b32\t%0, %1 - ds_write_b32\t%A0, %1%O0 + ds_write_b32\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0) ds_read_b32\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0) s_mov_b32\t%0, %1 global_load_dword\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0) @@ -582,7 +583,7 @@ flat_load%o1\t%0, %A1%O1%g1\;s_waitcnt\t0 flat_store%s0\t%A0, %1%O0%g0 v_mov_b32\t%0, %1 - ds_write%b0\t%A0, %1%O0 + ds_write%b0\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0) ds_read%u1\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0) global_load%o1\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0) global_store%s0\t%A0, %1%O0%g0" @@ -611,7 +612,7 @@ # flat_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\t0 
flat_store_dwordx2\t%A0, %1%O0%g0 - ds_write_b64\t%A0, %1%O0 + ds_write_b64\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0) ds_read_b64\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0) global_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0) global_store_dwordx2\t%A0, %1%O0%g0" @@ -667,7 +668,7 @@ # global_store_dwordx4\t%A0, %1%O0%g0 global_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0) - ds_write_b128\t%A0, %1%O0 + ds_write_b128\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0) ds_read_b128\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)" "reload_completed && REG_P (operands[0]) @@ -677,6 +678,8 @@ (set (match_dup 4) (match_dup 5)) (set (match_dup 6) (match_dup 7))] { + gcc_assert (rtx_equal_p (operands[0], operands[1]) + || !reg_overlap_mentioned_p (operands[0], operands[1])); operands[6] = gcn_operand_part (TImode, operands[0], 3); operands[7] = gcn_operand_part (TImode, operands[1], 3); operands[4] = gcn_operand_part (TImode, operands[0], 2); @@ -1217,29 +1220,47 @@ ; "addptr" is the same as "add" except that it must not write to VCC or SCC ; as a side-effect. Unfortunately GCN does not have a suitable instruction -; for this, so we use a custom VOP3 add with CC_SAVE_REG as a temp. -; Note that it is not safe to save/clobber/restore SCC because doing so will -; break data-flow analysis, so this must use vector registers. +; for this, so we use CC_SAVE_REG as a temp. +; Note that it is not safe to save/clobber/restore as separate insns because +; doing so will break data-flow analysis, so this must use multiple +; instructions in one insn. ; ; The "v0" should be just "v", but somehow the "0" helps LRA not loop forever ; on testcase pr54713-2.c with -O0. It's only an optimization hint anyway. +; +; The SGPR alternative is preferred as it is typically used with mov_sgprbase. (define_insn "addptrdi3" - [(set (match_operand:DI 0 "register_operand" "= v") - (plus:DI (match_operand:DI 1 "register_operand" " v0") - (match_operand:DI 2 "nonmemory_operand" "vDA")))] + [(set (match_operand:DI 0 "register_operand" "= v, Sg") + (unspec:DI [ + (plus:DI (match_operand:DI 1 "register_operand" "^v0,Sg0") + (match_operand:DI 2 "nonmemory_operand" "vDA,SgDB"))] + UNSPEC_ADDPTR))] "" { - rtx new_operands[4] = { operands[0], operands[1], operands[2], - gen_rtx_REG (DImode, CC_SAVE_REG) }; + if (which_alternative == 0) + { + rtx new_operands[4] = { operands[0], operands[1], operands[2], + gen_rtx_REG (DImode, CC_SAVE_REG) }; - output_asm_insn ("v_add%^_u32 %L0, %3, %L2, %L1", new_operands); - output_asm_insn ("v_addc%^_u32 %H0, %3, %H2, %H1, %3", new_operands); + output_asm_insn ("v_add%^_u32\t%L0, %3, %L2, %L1", new_operands); + output_asm_insn ("v_addc%^_u32\t%H0, %3, %H2, %H1, %3", new_operands); + } + else + { + rtx new_operands[4] = { operands[0], operands[1], operands[2], + gen_rtx_REG (BImode, CC_SAVE_REG) }; + + output_asm_insn ("s_mov_b32\t%3, scc", new_operands); + output_asm_insn ("s_add_u32\t%L0, %L1, %L2", new_operands); + output_asm_insn ("s_addc_u32\t%H0, %H1, %H2", new_operands); + output_asm_insn ("s_cmpk_lg_u32\t%3, 0", new_operands); + } return ""; } - [(set_attr "type" "vmult") - (set_attr "length" "16")]) + [(set_attr "type" "vmult,mult") + (set_attr "length" "16,24")]) ;; }}} ;; {{{ ALU special cases: Minus @@ -1539,6 +1560,111 @@ (set_attr "length" "8")]) ;; }}} +;; {{{ ALU: generic 128-bit binop + +; TImode shifts can't be synthesized by the middle-end +(define_expand "<expander>ti3" + [(set (match_operand:TI 0 "register_operand") + (vec_and_scalar_nocom:TI + (match_operand:TI 1 "gcn_alu_operand") + (match_operand:SI 2 
"gcn_alu_operand")))] + "" + { + rtx dest = operands[0]; + rtx src = operands[1]; + rtx shift = operands[2]; + + enum {ashr, lshr, ashl} shiftop = <expander>; + rtx (*inverse_shift_fn) (rtx, rtx, rtx) + = (shiftop == ashl ? gen_lshrdi3 : gen_ashldi3); + rtx (*logical_shift_fn) (rtx, rtx, rtx) + = (shiftop == ashl ? gen_ashldi3 : gen_lshrdi3); + + /* We shift "from" one subreg "to" the other, according to shiftop. */ + int from = (shiftop == ashl ? 0 : 8); + int to = (shiftop == ashl ? 8 : 0); + rtx destfrom = simplify_gen_subreg (DImode, dest, TImode, from); + rtx destto = simplify_gen_subreg (DImode, dest, TImode, to); + rtx srcfrom = simplify_gen_subreg (DImode, src, TImode, from); + rtx srcto = simplify_gen_subreg (DImode, src, TImode, to); + + int shiftval = (CONST_INT_P (shift) ? INTVAL (shift) : -1); + enum {RUNTIME, ZERO, SMALL, LARGE} shiftcomparison + = (!CONST_INT_P (shift) ? RUNTIME + : shiftval == 0 ? ZERO + : shiftval < 64 ? SMALL + : LARGE); + + rtx large_label, zero_label, exit_label; + + if (shiftcomparison == RUNTIME) + { + zero_label = gen_label_rtx (); + large_label = gen_label_rtx (); + exit_label = gen_label_rtx (); + + rtx cond = gen_rtx_EQ (VOIDmode, shift, const0_rtx); + emit_insn (gen_cbranchsi4 (cond, shift, const0_rtx, zero_label)); + + rtx sixtyfour = GEN_INT (64); + cond = gen_rtx_GE (VOIDmode, shift, sixtyfour); + emit_insn (gen_cbranchsi4 (cond, shift, sixtyfour, large_label)); + } + + if (shiftcomparison == SMALL || shiftcomparison == RUNTIME) + { + /* Shift both parts by the same amount, then patch in the bits that + cross the boundary. + This does *not* work for zero-length shifts. */ + rtx tmpto1 = gen_reg_rtx (DImode); + rtx tmpto2 = gen_reg_rtx (DImode); + emit_insn (gen_<expander>di3 (destfrom, srcfrom, shift)); + emit_insn (logical_shift_fn (tmpto1, srcto, shift)); + rtx lessershiftval = gen_reg_rtx (SImode); + emit_insn (gen_subsi3 (lessershiftval, GEN_INT (64), shift)); + emit_insn (inverse_shift_fn (tmpto2, srcfrom, lessershiftval)); + emit_insn (gen_iordi3 (destto, tmpto1, tmpto2)); + } + + if (shiftcomparison == RUNTIME) + { + emit_jump_insn (gen_jump (exit_label)); + emit_barrier (); + + emit_label (zero_label); + } + + if (shiftcomparison == ZERO || shiftcomparison == RUNTIME) + emit_move_insn (dest, src); + + if (shiftcomparison == RUNTIME) + { + emit_jump_insn (gen_jump (exit_label)); + emit_barrier (); + + emit_label (large_label); + } + + if (shiftcomparison == LARGE || shiftcomparison == RUNTIME) + { + /* Do the shift within one part, and set the other part appropriately. + Shifts of 128+ bits are an error. */ + rtx lessershiftval = gen_reg_rtx (SImode); + emit_insn (gen_subsi3 (lessershiftval, shift, GEN_INT (64))); + emit_insn (gen_<expander>di3 (destto, srcfrom, lessershiftval)); + if (shiftop == ashr) + emit_insn (gen_ashrdi3 (destfrom, srcfrom, GEN_INT (63))); + else + emit_move_insn (destfrom, const0_rtx); + } + + if (shiftcomparison == RUNTIME) + emit_label (exit_label); + + DONE; + }) + +;; }}} ;; {{{ Atomics ; Each compute unit has it's own L1 cache. The L2 cache is shared between diff --git a/gcc/config/gcn/mkoffload.c b/gcc/config/gcn/mkoffload.c index 0415d94..0983b98 100644 --- a/gcc/config/gcn/mkoffload.c +++ b/gcc/config/gcn/mkoffload.c @@ -33,31 +33,58 @@ #include <libgen.h> #include "collect-utils.h" #include "gomp-constants.h" +#include "simple-object.h" +#include "elf.h" + +/* These probably won't (all) be in elf.h for a while. 
*/ +#undef EM_AMDGPU +#define EM_AMDGPU 0xe0; + +#undef ELFOSABI_AMDGPU_HSA +#define ELFOSABI_AMDGPU_HSA 64 +#undef ELFABIVERSION_AMDGPU_HSA +#define ELFABIVERSION_AMDGPU_HSA 1 + +#undef EF_AMDGPU_MACH_AMDGCN_GFX803 +#define EF_AMDGPU_MACH_AMDGCN_GFX803 0x2a +#undef EF_AMDGPU_MACH_AMDGCN_GFX900 +#define EF_AMDGPU_MACH_AMDGCN_GFX900 0x2c +#undef EF_AMDGPU_MACH_AMDGCN_GFX906 +#define EF_AMDGPU_MACH_AMDGCN_GFX906 0x2f + +#ifndef R_AMDGPU_NONE +#define R_AMDGPU_NONE 0 +#define R_AMDGPU_ABS32_LO 1 /* (S + A) & 0xFFFFFFFF */ +#define R_AMDGPU_ABS32_HI 2 /* (S + A) >> 32 */ +#define R_AMDGPU_ABS64 3 /* S + A */ +#define R_AMDGPU_REL32 4 /* S + A - P */ +#define R_AMDGPU_REL64 5 /* S + A - P */ +#define R_AMDGPU_ABS32 6 /* S + A */ +#define R_AMDGPU_GOTPCREL 7 /* G + GOT + A - P */ +#define R_AMDGPU_GOTPCREL32_LO 8 /* (G + GOT + A - P) & 0xFFFFFFFF */ +#define R_AMDGPU_GOTPCREL32_HI 9 /* (G + GOT + A - P) >> 32 */ +#define R_AMDGPU_REL32_LO 10 /* (S + A - P) & 0xFFFFFFFF */ +#define R_AMDGPU_REL32_HI 11 /* (S + A - P) >> 32 */ +#define R_AMDGPU_RELATIVE64 13 /* B + A */ +#endif const char tool_name[] = "gcn mkoffload"; -/* Files to unlink. */ -static const char *gcn_s1_name; -static const char *gcn_s2_name; -static const char *gcn_o_name; -static const char *gcn_cfile_name; static const char *gcn_dumpbase; +static struct obstack files_to_cleanup; enum offload_abi offload_abi = OFFLOAD_ABI_UNSET; +uint32_t elf_arch = EF_AMDGPU_MACH_AMDGCN_GFX803; // Default GPU architecture. /* Delete tempfiles. */ void tool_cleanup (bool from_signal ATTRIBUTE_UNUSED) { - if (gcn_cfile_name) - maybe_unlink (gcn_cfile_name); - if (gcn_s1_name) - maybe_unlink (gcn_s1_name); - if (gcn_s2_name) - maybe_unlink (gcn_s2_name); - if (gcn_o_name) - maybe_unlink (gcn_o_name); + obstack_ptr_grow (&files_to_cleanup, NULL); + const char **files = XOBFINISH (&files_to_cleanup, const char **); + for (int i = 0; files[i]; i++) + maybe_unlink (files[i]); } static void @@ -204,6 +231,180 @@ access_check (const char *name, int mode) return access (name, mode); } +/* Copy the early-debug-info from the incoming LTO object to a new object + that will be linked into the output HSACO file. The host relocations + must be translated into GCN relocations, and any global undefined symbols + must be weakened (so as not to have the debug info try to pull in host + junk). + + Returns true if the file was created, false otherwise. */ + +static bool +copy_early_debug_info (const char *infile, const char *outfile) +{ + const char *errmsg; + int err; + + /* The simple_object code can handle extracting the debug sections. + This code is based on that in lto-wrapper.c. */ + int infd = open (infile, O_RDONLY | O_BINARY); + if (infd == -1) + return false; + simple_object_read *inobj = simple_object_start_read (infd, 0, + "__GNU_LTO", + &errmsg, &err); + if (!inobj) + return false; + + off_t off, len; + if (simple_object_find_section (inobj, ".gnu.debuglto_.debug_info", + &off, &len, &errmsg, &err) != 1) + { + simple_object_release_read (inobj); + close (infd); + return false; + } + + errmsg = simple_object_copy_lto_debug_sections (inobj, outfile, &err, true); + if (errmsg) + { + unlink_if_ordinary (outfile); + return false; + } + + simple_object_release_read (inobj); + close (infd); + + /* Open the file we just created for some adjustments. + The simple_object code can't do this, so we do it manually. 
*/ + FILE *outfd = fopen (outfile, "r+b"); + if (!outfd) + return false; + + Elf64_Ehdr ehdr; + if (fread (&ehdr, sizeof (ehdr), 1, outfd) != 1) + { + fclose (outfd); + return true; + } + + /* We only support host relocations of x86_64, for now. */ + gcc_assert (ehdr.e_machine == EM_X86_64); + + /* Patch the correct elf architecture flag into the file. */ + ehdr.e_ident[7] = ELFOSABI_AMDGPU_HSA; + ehdr.e_ident[8] = ELFABIVERSION_AMDGPU_HSA; + ehdr.e_type = ET_REL; + ehdr.e_machine = EM_AMDGPU; + ehdr.e_flags = elf_arch; + + /* Load the section headers so we can walk them later. */ + Elf64_Shdr *sections = (Elf64_Shdr *)xmalloc (sizeof (Elf64_Shdr) + * ehdr.e_shnum); + if (fseek (outfd, ehdr.e_shoff, SEEK_SET) == -1 + || fread (sections, sizeof (Elf64_Shdr), ehdr.e_shnum, + outfd) != ehdr.e_shnum) + { + free (sections); + fclose (outfd); + return true; + } + + /* Convert the host relocations to target relocations. */ + for (int i = 0; i < ehdr.e_shnum; i++) + { + if (sections[i].sh_type != SHT_RELA) + continue; + + char *data = (char *)xmalloc (sections[i].sh_size); + if (fseek (outfd, sections[i].sh_offset, SEEK_SET) == -1 + || fread (data, sections[i].sh_size, 1, outfd) != 1) + { + free (data); + continue; + } + + for (size_t offset = 0; + offset < sections[i].sh_size; + offset += sections[i].sh_entsize) + { + Elf64_Rela *reloc = (Elf64_Rela *) (data + offset); + + /* Map the host relocations to GCN relocations. + Only relocations that can appear in DWARF need be handled. */ + switch (ELF64_R_TYPE (reloc->r_info)) + { + case R_X86_64_32: + case R_X86_64_32S: + reloc->r_info = R_AMDGPU_ABS32; + break; + case R_X86_64_PC32: + reloc->r_info = R_AMDGPU_REL32; + break; + case R_X86_64_PC64: + reloc->r_info = R_AMDGPU_REL64; + break; + case R_X86_64_64: + reloc->r_info = R_AMDGPU_ABS64; + break; + case R_X86_64_RELATIVE: + reloc->r_info = R_AMDGPU_RELATIVE64; + break; + default: + gcc_unreachable (); + } + } + + /* Write back our relocation changes. */ + if (fseek (outfd, sections[i].sh_offset, SEEK_SET) != -1) + fwrite (data, sections[i].sh_size, 1, outfd); + + free (data); + } + + /* Weaken any global undefined symbols that would pull in unwanted + objects. */ + for (int i = 0; i < ehdr.e_shnum; i++) + { + if (sections[i].sh_type != SHT_SYMTAB) + continue; + + char *data = (char *)xmalloc (sections[i].sh_size); + if (fseek (outfd, sections[i].sh_offset, SEEK_SET) == -1 + || fread (data, sections[i].sh_size, 1, outfd) != 1) + { + free (data); + continue; + } + + for (size_t offset = 0; + offset < sections[i].sh_size; + offset += sections[i].sh_entsize) + { + Elf64_Sym *sym = (Elf64_Sym *) (data + offset); + int type = ELF64_ST_TYPE (sym->st_info); + int bind = ELF64_ST_BIND (sym->st_info); + + if (bind == STB_GLOBAL && sym->st_shndx == 0) + sym->st_info = ELF64_ST_INFO (STB_WEAK, type); + } + + /* Write back our symbol changes. */ + if (fseek (outfd, sections[i].sh_offset, SEEK_SET) != -1) + fwrite (data, sections[i].sh_size, 1, outfd); + + free (data); + } + free (sections); + + /* Write back our header changes. 
*/ + rewind (outfd); + fwrite (&ehdr, sizeof (ehdr), 1, outfd); + + fclose (outfd); + return true; +} + /* Parse an input assembler file, extract the offload tables etc., and output (1) the assembler code, minus the tables (which can contain problematic relocations), and (2) a C file with the offload tables @@ -231,7 +432,7 @@ process_asm (FILE *in, FILE *out, FILE *cfile) int sgpr_count; int vgpr_count; char *kernel_name; - } regcount; + } regcount = { -1, -1, NULL }; /* Always add _init_array and _fini_array as kernels. */ obstack_ptr_grow (&fns_os, xstrdup ("_init_array")); @@ -239,7 +440,12 @@ process_asm (FILE *in, FILE *out, FILE *cfile) fn_count += 2; char buf[1000]; - enum { IN_CODE, IN_AMD_KERNEL_CODE_T, IN_VARS, IN_FUNCS } state = IN_CODE; + enum + { IN_CODE, + IN_METADATA, + IN_VARS, + IN_FUNCS + } state = IN_CODE; while (fgets (buf, sizeof (buf), in)) { switch (state) @@ -252,21 +458,25 @@ process_asm (FILE *in, FILE *out, FILE *cfile) obstack_grow (&dims_os, &dim, sizeof (dim)); dims_count++; } - else if (sscanf (buf, " .amdgpu_hsa_kernel %ms\n", - ®count.kernel_name) == 1) - break; break; } - case IN_AMD_KERNEL_CODE_T: + case IN_METADATA: { - gcc_assert (regcount.kernel_name); - if (sscanf (buf, " wavefront_sgpr_count = %d\n", - ®count.sgpr_count) == 1) + if (sscanf (buf, " - .name: %ms\n", ®count.kernel_name) == 1) break; - else if (sscanf (buf, " workitem_vgpr_count = %d\n", + else if (sscanf (buf, " .sgpr_count: %d\n", + ®count.sgpr_count) == 1) + { + gcc_assert (regcount.kernel_name); + break; + } + else if (sscanf (buf, " .vgpr_count: %d\n", ®count.vgpr_count) == 1) - break; + { + gcc_assert (regcount.kernel_name); + break; + } break; } @@ -307,9 +517,10 @@ process_asm (FILE *in, FILE *out, FILE *cfile) state = IN_VARS; else if (sscanf (buf, " .section .gnu.offload_funcs%c", &dummy) > 0) state = IN_FUNCS; - else if (sscanf (buf, " .amd_kernel_code_%c", &dummy) > 0) + else if (sscanf (buf, " .amdgpu_metadata%c", &dummy) > 0) { - state = IN_AMD_KERNEL_CODE_T; + state = IN_METADATA; + regcount.kernel_name = NULL; regcount.sgpr_count = regcount.vgpr_count = -1; } else if (sscanf (buf, " .section %c", &dummy) > 0 @@ -318,7 +529,7 @@ process_asm (FILE *in, FILE *out, FILE *cfile) || sscanf (buf, " .data%c", &dummy) > 0 || sscanf (buf, " .ident %c", &dummy) > 0) state = IN_CODE; - else if (sscanf (buf, " .end_amd_kernel_code_%c", &dummy) > 0) + else if (sscanf (buf, " .end_amdgpu_metadata%c", &dummy) > 0) { state = IN_CODE; gcc_assert (regcount.kernel_name != NULL @@ -330,7 +541,7 @@ process_asm (FILE *in, FILE *out, FILE *cfile) regcount.sgpr_count = regcount.vgpr_count = -1; } - if (state == IN_CODE || state == IN_AMD_KERNEL_CODE_T) + if (state == IN_CODE || state == IN_METADATA) fputs (buf, out); } @@ -538,9 +749,15 @@ main (int argc, char **argv) FILE *cfile = stdout; const char *outname = 0; + const char *gcn_s1_name; + const char *gcn_s2_name; + const char *gcn_o_name; + const char *gcn_cfile_name; + progname = "mkoffload"; diagnostic_initialize (global_dc, 0); + obstack_init (&files_to_cleanup); if (atexit (mkoffload_cleanup) != 0) fatal_error (input_location, "atexit failed"); @@ -632,7 +849,14 @@ main (int argc, char **argv) else if (strcmp (argv[i], "-dumpbase") == 0 && i + 1 < argc) dumppfx = argv[++i]; + else if (strcmp (argv[i], "-march=fiji") == 0) + elf_arch = EF_AMDGPU_MACH_AMDGCN_GFX803; + else if (strcmp (argv[i], "-march=gfx900") == 0) + elf_arch = EF_AMDGPU_MACH_AMDGCN_GFX900; + else if (strcmp (argv[i], "-march=gfx906") == 0) + elf_arch = 
EF_AMDGPU_MACH_AMDGCN_GFX906; } + if (!(fopenacc ^ fopenmp)) fatal_error (input_location, "either -fopenacc or -fopenmp must be set"); @@ -693,6 +917,10 @@ main (int argc, char **argv) gcn_o_name = make_temp_file (".mkoffload.hsaco"); gcn_cfile_name = make_temp_file (".c"); } + obstack_ptr_grow (&files_to_cleanup, gcn_s1_name); + obstack_ptr_grow (&files_to_cleanup, gcn_s2_name); + obstack_ptr_grow (&files_to_cleanup, gcn_o_name); + obstack_ptr_grow (&files_to_cleanup, gcn_cfile_name); obstack_ptr_grow (&cc_argv_obstack, "-dumpdir"); obstack_ptr_grow (&cc_argv_obstack, ""); @@ -710,6 +938,39 @@ main (int argc, char **argv) struct obstack ld_argv_obstack; obstack_init (&ld_argv_obstack); obstack_ptr_grow (&ld_argv_obstack, driver); + + /* Extract early-debug information from the input objects. + This loop finds all the inputs that end ".o" and aren't the output. */ + int dbgcount = 0; + for (int ix = 1; ix != argc; ix++) + { + if (!strcmp (argv[ix], "-o") && ix + 1 != argc) + ++ix; + else + { + if (strcmp (argv[ix] + strlen(argv[ix]) - 2, ".o") == 0) + { + char *dbgobj; + if (save_temps) + { + char buf[10]; + sprintf (buf, "%d", dbgcount++); + dbgobj = concat (dumppfx, ".mkoffload.dbg", buf, ".o", NULL); + } + else + dbgobj = make_temp_file (".mkoffload.dbg.o"); + + /* If the copy fails then just ignore it. */ + if (copy_early_debug_info (argv[ix], dbgobj)) + { + obstack_ptr_grow (&ld_argv_obstack, dbgobj); + obstack_ptr_grow (&files_to_cleanup, dbgobj); + } + else + free (dbgobj); + } + } + } obstack_ptr_grow (&ld_argv_obstack, gcn_s2_name); obstack_ptr_grow (&ld_argv_obstack, "-lgomp"); diff --git a/gcc/config/gcn/t-gcn-hsa b/gcc/config/gcn/t-gcn-hsa index af203c5..16d243c 100644 --- a/gcc/config/gcn/t-gcn-hsa +++ b/gcc/config/gcn/t-gcn-hsa @@ -45,7 +45,6 @@ gcn-run$(exeext): gcn-run.o MULTILIB_OPTIONS = march=gfx900/march=gfx906 MULTILIB_DIRNAMES = gfx900 gfx906 -PASSES_EXTRA += $(srcdir)/config/gcn/gcn-passes.def gcn-tree.o: $(srcdir)/config/gcn/gcn-tree.c $(COMPILE) $< $(POSTCOMPILE) diff --git a/gcc/config/i386/adxintrin.h b/gcc/config/i386/adxintrin.h index 6c15417..6dffe45 100644 --- a/gcc/config/i386/adxintrin.h +++ b/gcc/config/i386/adxintrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#if !defined _IMMINTRIN_H_INCLUDED -# error "Never use <adxintrin.h> directly; include <immintrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <adxintrin.h> directly; include <x86gprintrin.h> instead." #endif #ifndef _ADXINTRIN_H_INCLUDED diff --git a/gcc/config/i386/amxbf16intrin.h b/gcc/config/i386/amxbf16intrin.h new file mode 100644 index 0000000..77cc395 --- /dev/null +++ b/gcc/config/i386/amxbf16intrin.h @@ -0,0 +1,52 @@ +/* Copyright (C) 2020 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. 
+ + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#if !defined _IMMINTRIN_H_INCLUDED +#error "Never use <amxbf16intrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef _AMXBF16INTRIN_H_INCLUDED +#define _AMXBF16INTRIN_H_INCLUDED + +#if !defined(__AMX_BF16__) +#pragma GCC push_options +#pragma GCC target("amx-bf16") +#define __DISABLE_AMX_BF16__ +#endif /* __AMX_BF16__ */ + +#if defined(__x86_64__) && defined(__AMX_BF16__) +#define _tile_dpbf16ps_internal(dst,src1,src2) \ + __asm__ volatile\ + ("{tdpbf16ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdpbf16ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::) + +#define _tile_dpbf16ps(dst,src1,src2) \ + _tile_dpbf16ps_internal (dst, src1, src2) + +#endif + +#ifdef __DISABLE_AMX_BF16__ +#undef __DISABLE_AMX_BF16__ +#pragma GCC pop_options +#endif /* __DISABLE_AMX_BF16__ */ + +#endif /* _AMXBF16INTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/amxint8intrin.h b/gcc/config/i386/amxint8intrin.h new file mode 100644 index 0000000..f4e410b --- /dev/null +++ b/gcc/config/i386/amxint8intrin.h @@ -0,0 +1,61 @@ +/* Copyright (C) 2020 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#if !defined _IMMINTRIN_H_INCLUDED +#error "Never use <amxint8intrin.h> directly; include <immintrin.h> instead." 
+#endif + +#ifndef _AMXINT8INTRIN_H_INCLUDED +#define _AMXINT8INTRIN_H_INCLUDED + +#if !defined(__AMX_INT8__) +#pragma GCC push_options +#pragma GCC target("amx-int8") +#define __DISABLE_AMX_INT8__ +#endif /* __AMX_INT8__ */ + +#if defined(__x86_64__) && defined(__AMX_INT8__) +#define _tile_int8_dp_internal(name,dst,src1,src2) \ + __asm__ volatile \ + ("{"#name"\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|"#name"\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::) + +#define _tile_dpbssd(dst,src1,src2) \ + _tile_int8_dp_internal (tdpbssd, dst, src1, src2) + +#define _tile_dpbsud(dst,src1,src2) \ + _tile_int8_dp_internal (tdpbsud, dst, src1, src2) + +#define _tile_dpbusd(dst,src1,src2) \ + _tile_int8_dp_internal (tdpbusd, dst, src1, src2) + +#define _tile_dpbuud(dst,src1,src2) \ + _tile_int8_dp_internal (tdpbuud, dst, src1, src2) + +#endif + +#ifdef __DISABLE_AMX_INT8__ +#undef __DISABLE_AMX_INT8__ +#pragma GCC pop_options +#endif /* __DISABLE_AMX_INT8__ */ + +#endif /* _AMXINT8INTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/amxtileintrin.h b/gcc/config/i386/amxtileintrin.h new file mode 100644 index 0000000..41fb9a5 --- /dev/null +++ b/gcc/config/i386/amxtileintrin.h @@ -0,0 +1,98 @@ +/* Copyright (C) 2020 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#if !defined _IMMINTRIN_H_INCLUDED +#error "Never use <amxtileintrin.h> directly; include <immintrin.h> instead." 
+#endif + +#ifndef _AMXTILEINTRIN_H_INCLUDED +#define _AMXTILEINTRIN_H_INCLUDED + +#if !defined(__AMX_TILE__) +#pragma GCC push_options +#pragma GCC target("amx-tile") +#define __DISABLE_AMX_TILE__ +#endif /* __AMX_TILE__ */ + +#if defined(__x86_64__) && defined(__AMX_TILE__) +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_tile_loadconfig (const void *__config) +{ + __asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void **)__config))); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_tile_storeconfig (void *__config) +{ + __asm__ volatile ("sttilecfg\t%X0" : "=m" (*((void **)__config))); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_tile_release (void) +{ + __asm__ volatile ("tilerelease" ::); +} + +#define _tile_loadd(dst,base,stride) \ + _tile_loadd_internal (dst, base, stride) + +#define _tile_loadd_internal(dst,base,stride) \ + __asm__ volatile \ + ("{tileloadd\t(%0,%1,1), %%tmm"#dst"|tileloadd\t%%tmm"#dst", [%0+%1*1]}" \ + :: "r" ((const void*) base), "r" ((long) stride)) + +#define _tile_stream_loadd(dst,base,stride) \ + _tile_stream_loadd_internal (dst, base, stride) + +#define _tile_stream_loadd_internal(dst,base,stride) \ + __asm__ volatile \ + ("{tileloaddt1\t(%0,%1,1), %%tmm"#dst"|tileloaddt1\t%%tmm"#dst", [%0+%1*1]}" \ + :: "r" ((const void*) base), "r" ((long) stride)) + +#define _tile_stored(dst,base,stride) \ + _tile_stored_internal (dst, base, stride) + +#define _tile_stored_internal(src,base,stride) \ + __asm__ volatile \ + ("{tilestored\t%%tmm"#src", (%0,%1,1)|tilestored\t[%0+%1*1], %%tmm"#src"}" \ + :: "r" ((void*) base), "r" ((long) stride) \ + : "memory") + +#define _tile_zero(dst) \ + _tile_zero_internal (dst) + +#define _tile_zero_internal(dst) \ + __asm__ volatile \ + ("tilezero\t%%tmm"#dst ::) + +#endif + +#ifdef __DISABLE_AMX_TILE__ +#undef __DISABLE_AMX_TILE__ +#pragma GCC pop_options +#endif /* __DISABLE_AMX_TILE__ */ + +#endif /* _AMXTILEINTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/avx512fintrin.h b/gcc/config/i386/avx512fintrin.h index 4bcd697..729d568 100644 --- a/gcc/config/i386/avx512fintrin.h +++ b/gcc/config/i386/avx512fintrin.h @@ -239,22 +239,17 @@ extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_set1_pd (double __A) { - return (__m512d) __builtin_ia32_broadcastsd512 (__extension__ - (__v2df) { __A, }, - (__v8df) - _mm512_undefined_pd (), - (__mmask8) -1); + return __extension__ (__m512d)(__v8df) + { __A, __A, __A, __A, __A, __A, __A, __A }; } extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_set1_ps (float __A) { - return (__m512) __builtin_ia32_broadcastss512 (__extension__ - (__v4sf) { __A, }, - (__v16sf) - _mm512_undefined_ps (), - (__mmask16) -1); + return __extension__ (__m512)(__v16sf) + { __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A }; } /* Create the vector [A B C D A B C D A B C D A B C D]. 
*/ @@ -4072,10 +4067,9 @@ extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_set1_epi32 (int __A) { - return (__m512i) __builtin_ia32_pbroadcastd512_gpr_mask (__A, - (__v16si) - _mm512_undefined_epi32 (), - (__mmask16)(-1)); + return (__m512i)(__v16si) + { __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A }; } extern __inline __m512i @@ -4128,10 +4122,7 @@ extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_set1_epi64 (long long __A) { - return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A, - (__v8di) - _mm512_undefined_epi32 (), - (__mmask8)(-1)); + return (__m512i)(__v8di) { __A, __A, __A, __A, __A, __A, __A, __A }; } extern __inline __m512i @@ -15127,6 +15118,88 @@ _mm512_mask_cmp_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y, const int __P) extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_sd_mask (__m128d __X, __m128d __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X, + (__v2df) __Y, __P, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmp_sd_mask (__mmask8 __M, __m128d __X, __m128d __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X, + (__v2df) __Y, __P, + (__mmask8) __M, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_ss_mask (__m128 __X, __m128 __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X, + (__v4sf) __Y, __P, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmp_ss_mask (__mmask8 __M, __m128 __X, __m128 __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X, + (__v4sf) __Y, __P, + (__mmask8) __M, + _MM_FROUND_CUR_DIRECTION); +} + +#else +#define _mm512_cmp_pd_mask(X, Y, P) \ + ((__mmask8) __builtin_ia32_cmppd512_mask ((__v8df)(__m512d)(X), \ + (__v8df)(__m512d)(Y), (int)(P),\ + (__mmask8)-1,_MM_FROUND_CUR_DIRECTION)) + +#define _mm512_cmp_ps_mask(X, Y, P) \ + ((__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf)(__m512)(X), \ + (__v16sf)(__m512)(Y), (int)(P),\ + (__mmask16)-1,_MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_cmp_pd_mask(M, X, Y, P) \ + ((__mmask8) __builtin_ia32_cmppd512_mask ((__v8df)(__m512d)(X), \ + (__v8df)(__m512d)(Y), (int)(P),\ + (__mmask8)(M), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_cmp_ps_mask(M, X, Y, P) \ + ((__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf)(__m512)(X), \ + (__v16sf)(__m512)(Y), (int)(P),\ + (__mmask16)(M),_MM_FROUND_CUR_DIRECTION)) + +#define _mm_cmp_sd_mask(X, Y, P) \ + ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (int)(P),\ + (__mmask8)-1,_MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_cmp_sd_mask(M, X, Y, P) \ + ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (int)(P),\ + M,_MM_FROUND_CUR_DIRECTION)) + +#define _mm_cmp_ss_mask(X, Y, P) \ + ((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (int)(P), \ + (__mmask8)-1,_MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_cmp_ss_mask(M, X, Y, P) \ + ((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (int)(P), \ + 
M,_MM_FROUND_CUR_DIRECTION)) +#endif + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_cmpeq_pd_mask (__m512d __X, __m512d __Y) { return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, @@ -15445,88 +15518,6 @@ _mm512_mask_cmpord_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y) _MM_FROUND_CUR_DIRECTION); } -extern __inline __mmask8 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmp_sd_mask (__m128d __X, __m128d __Y, const int __P) -{ - return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X, - (__v2df) __Y, __P, - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); -} - -extern __inline __mmask8 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cmp_sd_mask (__mmask8 __M, __m128d __X, __m128d __Y, const int __P) -{ - return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X, - (__v2df) __Y, __P, - (__mmask8) __M, - _MM_FROUND_CUR_DIRECTION); -} - -extern __inline __mmask8 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmp_ss_mask (__m128 __X, __m128 __Y, const int __P) -{ - return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X, - (__v4sf) __Y, __P, - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); -} - -extern __inline __mmask8 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cmp_ss_mask (__mmask8 __M, __m128 __X, __m128 __Y, const int __P) -{ - return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X, - (__v4sf) __Y, __P, - (__mmask8) __M, - _MM_FROUND_CUR_DIRECTION); -} - -#else -#define _mm512_cmp_pd_mask(X, Y, P) \ - ((__mmask8) __builtin_ia32_cmppd512_mask ((__v8df)(__m512d)(X), \ - (__v8df)(__m512d)(Y), (int)(P),\ - (__mmask8)-1,_MM_FROUND_CUR_DIRECTION)) - -#define _mm512_cmp_ps_mask(X, Y, P) \ - ((__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf)(__m512)(X), \ - (__v16sf)(__m512)(Y), (int)(P),\ - (__mmask16)-1,_MM_FROUND_CUR_DIRECTION)) - -#define _mm512_mask_cmp_pd_mask(M, X, Y, P) \ - ((__mmask8) __builtin_ia32_cmppd512_mask ((__v8df)(__m512d)(X), \ - (__v8df)(__m512d)(Y), (int)(P),\ - (__mmask8)(M), _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_mask_cmp_ps_mask(M, X, Y, P) \ - ((__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf)(__m512)(X), \ - (__v16sf)(__m512)(Y), (int)(P),\ - (__mmask16)(M),_MM_FROUND_CUR_DIRECTION)) - -#define _mm_cmp_sd_mask(X, Y, P) \ - ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), (int)(P),\ - (__mmask8)-1,_MM_FROUND_CUR_DIRECTION)) - -#define _mm_mask_cmp_sd_mask(M, X, Y, P) \ - ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), (int)(P),\ - M,_MM_FROUND_CUR_DIRECTION)) - -#define _mm_cmp_ss_mask(X, Y, P) \ - ((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), (int)(P), \ - (__mmask8)-1,_MM_FROUND_CUR_DIRECTION)) - -#define _mm_mask_cmp_ss_mask(M, X, Y, P) \ - ((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), (int)(P), \ - M,_MM_FROUND_CUR_DIRECTION)) -#endif - extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_kmov (__mmask16 __A) diff --git a/gcc/config/i386/avx512vp2intersectintrin.h b/gcc/config/i386/avx512vp2intersectintrin.h index 60cb52c..f368d83 100644 --- a/gcc/config/i386/avx512vp2intersectintrin.h +++ b/gcc/config/i386/avx512vp2intersectintrin.h @@ -1,3 +1,26 @@ +/* Copyright (C) 2019-2020 Free Software Foundation, Inc. + + This file is part of GCC. 
+ + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + #if !defined _IMMINTRIN_H_INCLUDED #error "Never use <avx512vp2intersectintrin.h> directly; include <immintrin.h> instead." #endif diff --git a/gcc/config/i386/avx512vp2intersectvlintrin.h b/gcc/config/i386/avx512vp2intersectvlintrin.h index 26eee36..f657840 100644 --- a/gcc/config/i386/avx512vp2intersectvlintrin.h +++ b/gcc/config/i386/avx512vp2intersectvlintrin.h @@ -1,3 +1,26 @@ +/* Copyright (C) 2019-2020 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + #if !defined _IMMINTRIN_H_INCLUDED #error "Never use <avx512vp2intersectintrin.h> directly; include <immintrin.h> instead." #endif diff --git a/gcc/config/i386/bmi2intrin.h b/gcc/config/i386/bmi2intrin.h index c5de9eb..9fdd08c 100644 --- a/gcc/config/i386/bmi2intrin.h +++ b/gcc/config/i386/bmi2intrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#if !defined _X86INTRIN_H_INCLUDED && !defined _IMMINTRIN_H_INCLUDED -# error "Never use <bmi2intrin.h> directly; include <x86intrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <bmi2intrin.h> directly; include <x86gprintrin.h> instead." #endif #ifndef _BMI2INTRIN_H_INCLUDED diff --git a/gcc/config/i386/bmiintrin.h b/gcc/config/i386/bmiintrin.h index 8ba6e5b..5bd712a 100644 --- a/gcc/config/i386/bmiintrin.h +++ b/gcc/config/i386/bmiintrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#if !defined _X86INTRIN_H_INCLUDED && !defined _IMMINTRIN_H_INCLUDED -# error "Never use <bmiintrin.h> directly; include <x86intrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <bmiintrin.h> directly; include <x86gprintrin.h> instead." 
#endif #ifndef _BMIINTRIN_H_INCLUDED diff --git a/gcc/config/i386/cetintrin.h b/gcc/config/i386/cetintrin.h index 095bbe0..81c4d72 100644 --- a/gcc/config/i386/cetintrin.h +++ b/gcc/config/i386/cetintrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#if !defined _IMMINTRIN_H_INCLUDED -# error "Never use <cetintrin.h> directly; include <x86intrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <cetintrin.h> directly; include <x86gprintrin.h> instead." #endif #ifndef _CETINTRIN_H_INCLUDED diff --git a/gcc/config/i386/cldemoteintrin.h b/gcc/config/i386/cldemoteintrin.h index 8c0feca..0c31c35 100644 --- a/gcc/config/i386/cldemoteintrin.h +++ b/gcc/config/i386/cldemoteintrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#if !defined _IMMINTRIN_H_INCLUDED -# error "Never use <cldemoteintrin.h> directly; include <immintrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <cldemoteintrin.h> directly; include <x86gprintrin.h> instead." #endif #ifndef _CLDEMOTE_H_INCLUDED diff --git a/gcc/config/i386/clflushoptintrin.h b/gcc/config/i386/clflushoptintrin.h index 037f044..a3697f0 100644 --- a/gcc/config/i386/clflushoptintrin.h +++ b/gcc/config/i386/clflushoptintrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#if !defined _IMMINTRIN_H_INCLUDED -# error "Never use <clflushoptintrin.h> directly; include <immintrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <clflushoptintrin.h> directly; include <x86gprintrin.h> instead." #endif #ifndef _CLFLUSHOPTINTRIN_H_INCLUDED diff --git a/gcc/config/i386/clwbintrin.h b/gcc/config/i386/clwbintrin.h index 84d0939..3f83962 100644 --- a/gcc/config/i386/clwbintrin.h +++ b/gcc/config/i386/clwbintrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#if !defined _IMMINTRIN_H_INCLUDED -# error "Never use <clwbintrin.h> directly; include <immintrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <clwbintrin.h> directly; include <x86gprintrin.h> instead." #endif #ifndef _CLWBINTRIN_H_INCLUDED diff --git a/gcc/config/i386/cpuid.h b/gcc/config/i386/cpuid.h index 94af491..4598434 100644 --- a/gcc/config/i386/cpuid.h +++ b/gcc/config/i386/cpuid.h @@ -21,6 +21,9 @@ * <http://www.gnu.org/licenses/>. 
*/ +#ifndef _CPUID_H_INCLUDED +#define _CPUID_H_INCLUDED + /* %eax */ #define bit_AVX512BF16 (1 << 5) @@ -124,6 +127,9 @@ #define bit_PCONFIG (1 << 18) #define bit_SERIALIZE (1 << 14) #define bit_TSXLDTRK (1 << 16) +#define bit_AMX_BF16 (1 << 22) +#define bit_AMX_TILE (1 << 24) +#define bit_AMX_INT8 (1 << 25) /* XFEATURE_ENABLED_MASK register bits (%eax == 0xd, %ecx == 0) */ #define bit_BNDREGS (1 << 3) @@ -313,3 +319,12 @@ __get_cpuid_count (unsigned int __leaf, unsigned int __subleaf, __cpuid_count (__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx); return 1; } + +static __inline void +__cpuidex (int __cpuid_info[4], int __leaf, int __subleaf) +{ + __cpuid_count (__leaf, __subleaf, __cpuid_info[0], __cpuid_info[1], + __cpuid_info[2], __cpuid_info[3]); +} + +#endif /* _CPUID_H_INCLUDED */ diff --git a/gcc/config/i386/enqcmdintrin.h b/gcc/config/i386/enqcmdintrin.h index 4b2efcb..dcb6507 100644 --- a/gcc/config/i386/enqcmdintrin.h +++ b/gcc/config/i386/enqcmdintrin.h @@ -21,12 +21,12 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#if !defined _IMMINTRIN_H_INCLUDED -# error "Never use <enqcmdntrin.h> directly; include <x86intrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <enqcmdintrin.h> directly; include <x86gprintrin.h> instead." #endif -#ifndef _ENQCMDNTRIN_H_INCLUDED -#define _ENQCMDNTRIN_H_INCLUDED +#ifndef _ENQCMDINTRIN_H_INCLUDED +#define _ENQCMDINTRIN_H_INCLUDED #ifndef __ENQCMD__ #pragma GCC push_options @@ -52,4 +52,4 @@ _enqcmds (void * __P, const void * __Q) #undef __DISABLE_ENQCMD__ #pragma GCC pop_options #endif /* __DISABLE_ENQCMD__ */ -#endif /* _ENQCMDNTRIN_H_INCLUDED. */ +#endif /* _ENQCMDINTRIN_H_INCLUDED. */ diff --git a/gcc/config/i386/fxsrintrin.h b/gcc/config/i386/fxsrintrin.h index fde05a7..6e059df 100644 --- a/gcc/config/i386/fxsrintrin.h +++ b/gcc/config/i386/fxsrintrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#if !defined _IMMINTRIN_H_INCLUDED -# error "Never use <fxsrintrin.h> directly; include <immintrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <fxsrintrin.h> directly; include <x86gprintrin.h> instead." 
#endif #ifndef _FXSRINTRIN_H_INCLUDED diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def index 6270068..fec5cef 100644 --- a/gcc/config/i386/i386-builtin.def +++ b/gcc/config/i386/i386-builtin.def @@ -260,12 +260,12 @@ BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_loadsf_mask, "__builtin_ia32 BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_storedf_mask, "__builtin_ia32_storesd_mask", IX86_BUILTIN_STORESD_MASK, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF_UQI) BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_storesf_mask, "__builtin_ia32_storess_mask", IX86_BUILTIN_STORESS_MASK, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF_UQI) -BDESC (OPTION_MASK_ISA_LWP, 0, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID) -BDESC (OPTION_MASK_ISA_LWP, 0, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID) -BDESC (OPTION_MASK_ISA_LWP, 0, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT) -BDESC (OPTION_MASK_ISA_LWP | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT) -BDESC (OPTION_MASK_ISA_LWP, 0, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT) -BDESC (OPTION_MASK_ISA_LWP | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT) +BDESC (OPTION_MASK_ISA_LWP, 0, CODE_FOR_nothing, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID) +BDESC (OPTION_MASK_ISA_LWP, 0, CODE_FOR_nothing, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID) +BDESC (OPTION_MASK_ISA_LWP, 0, CODE_FOR_nothing, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT) +BDESC (OPTION_MASK_ISA_LWP | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_nothing, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT) +BDESC (OPTION_MASK_ISA_LWP, 0, CODE_FOR_nothing, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT) +BDESC (OPTION_MASK_ISA_LWP | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_nothing, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT) /* FSGSBASE */ BDESC (OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID) @@ -1270,8 +1270,8 @@ BDESC (OPTION_MASK_ISA_BMI, 0, CODE_FOR_tzcnt_si, "__builtin_ia32_tzcnt_u32", IX BDESC (OPTION_MASK_ISA_BMI | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_tzcnt_di, "__builtin_ia32_tzcnt_u64", IX86_BUILTIN_TZCNT64, UNKNOWN, (int) UINT64_FTYPE_UINT64) /* TBM */ -BDESC (OPTION_MASK_ISA_TBM, 0, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT) -BDESC (OPTION_MASK_ISA_TBM | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64) +BDESC (OPTION_MASK_ISA_TBM, 0, CODE_FOR_nothing, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT) +BDESC (OPTION_MASK_ISA_TBM | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_nothing, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64) /* F16C */ 
BDESC (OPTION_MASK_ISA_F16C, 0, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI) @@ -3126,21 +3126,17 @@ BDESC_END (MULTI_ARG, CET) /* CET. */ BDESC_FIRST (cet, CET, - OPTION_MASK_ISA_SHSTK, 0, CODE_FOR_incsspsi, "__builtin_ia32_incsspd", IX86_BUILTIN_INCSSPD, UNKNOWN, (int) VOID_FTYPE_UNSIGNED) -BDESC (OPTION_MASK_ISA_SHSTK | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_incsspdi, "__builtin_ia32_incsspq", IX86_BUILTIN_INCSSPQ, UNKNOWN, (int) VOID_FTYPE_UINT64) + OPTION_MASK_ISA_SHSTK, 0, CODE_FOR_nothing, "__builtin_ia32_rdsspd", IX86_BUILTIN_RDSSPD, UNKNOWN, (int) UINT_FTYPE_VOID) +BDESC (OPTION_MASK_ISA_SHSTK | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_nothing, "__builtin_ia32_rdsspq", IX86_BUILTIN_RDSSPQ, UNKNOWN, (int) UINT64_FTYPE_VOID) +BDESC (OPTION_MASK_ISA_SHSTK, 0, CODE_FOR_nothing, "__builtin_ia32_incsspd", IX86_BUILTIN_INCSSPD, UNKNOWN, (int) VOID_FTYPE_UNSIGNED) +BDESC (OPTION_MASK_ISA_SHSTK | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_nothing, "__builtin_ia32_incsspq", IX86_BUILTIN_INCSSPQ, UNKNOWN, (int) VOID_FTYPE_UINT64) BDESC (OPTION_MASK_ISA_SHSTK, 0, CODE_FOR_saveprevssp, "__builtin_ia32_saveprevssp", IX86_BUILTIN_SAVEPREVSSP, UNKNOWN, (int) VOID_FTYPE_VOID) BDESC (OPTION_MASK_ISA_SHSTK, 0, CODE_FOR_rstorssp, "__builtin_ia32_rstorssp", IX86_BUILTIN_RSTORSSP, UNKNOWN, (int) VOID_FTYPE_PVOID) -BDESC (OPTION_MASK_ISA_SHSTK, 0, CODE_FOR_wrsssi, "__builtin_ia32_wrssd", IX86_BUILTIN_WRSSD, UNKNOWN, (int) VOID_FTYPE_UNSIGNED_PVOID) -BDESC (OPTION_MASK_ISA_SHSTK | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_wrssdi, "__builtin_ia32_wrssq", IX86_BUILTIN_WRSSQ, UNKNOWN, (int) VOID_FTYPE_UINT64_PVOID) -BDESC (OPTION_MASK_ISA_SHSTK, 0, CODE_FOR_wrusssi, "__builtin_ia32_wrussd", IX86_BUILTIN_WRUSSD, UNKNOWN, (int) VOID_FTYPE_UNSIGNED_PVOID) -BDESC (OPTION_MASK_ISA_SHSTK | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_wrussdi, "__builtin_ia32_wrussq", IX86_BUILTIN_WRUSSQ, UNKNOWN, (int) VOID_FTYPE_UINT64_PVOID) +BDESC (OPTION_MASK_ISA_SHSTK, 0, CODE_FOR_nothing, "__builtin_ia32_wrssd", IX86_BUILTIN_WRSSD, UNKNOWN, (int) VOID_FTYPE_UNSIGNED_PVOID) +BDESC (OPTION_MASK_ISA_SHSTK | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_nothing, "__builtin_ia32_wrssq", IX86_BUILTIN_WRSSQ, UNKNOWN, (int) VOID_FTYPE_UINT64_PVOID) +BDESC (OPTION_MASK_ISA_SHSTK, 0, CODE_FOR_nothing, "__builtin_ia32_wrussd", IX86_BUILTIN_WRUSSD, UNKNOWN, (int) VOID_FTYPE_UNSIGNED_PVOID) +BDESC (OPTION_MASK_ISA_SHSTK | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_nothing, "__builtin_ia32_wrussq", IX86_BUILTIN_WRUSSQ, UNKNOWN, (int) VOID_FTYPE_UINT64_PVOID) BDESC (OPTION_MASK_ISA_SHSTK, 0, CODE_FOR_setssbsy, "__builtin_ia32_setssbsy", IX86_BUILTIN_SETSSBSY, UNKNOWN, (int) VOID_FTYPE_VOID) BDESC (OPTION_MASK_ISA_SHSTK, 0, CODE_FOR_clrssbsy, "__builtin_ia32_clrssbsy", IX86_BUILTIN_CLRSSBSY, UNKNOWN, (int) VOID_FTYPE_PVOID) -BDESC_END (CET, CET_NORMAL) - -BDESC_FIRST (cet_rdssp, CET_NORMAL, - OPTION_MASK_ISA_SHSTK, 0, CODE_FOR_rdsspsi, "__builtin_ia32_rdsspd", IX86_BUILTIN_RDSSPD, UNKNOWN, (int) UINT_FTYPE_VOID) -BDESC (OPTION_MASK_ISA_SHSTK | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_rdsspdi, "__builtin_ia32_rdsspq", IX86_BUILTIN_RDSSPQ, UNKNOWN, (int) UINT64_FTYPE_VOID) - -BDESC_END (CET_NORMAL, MAX) +BDESC_END (CET, MAX) diff --git a/gcc/config/i386/i386-builtins.c b/gcc/config/i386/i386-builtins.c index 2246507..ca7a870 100644 --- a/gcc/config/i386/i386-builtins.c +++ b/gcc/config/i386/i386-builtins.c @@ -116,10 +116,8 @@ BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, 1); BDESC_VERIFYS 
(IX86_BUILTIN__BDESC_CET_FIRST, IX86_BUILTIN__BDESC_MULTI_ARG_LAST, 1); -BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, - IX86_BUILTIN__BDESC_CET_LAST, 1); BDESC_VERIFYS (IX86_BUILTIN_MAX, - IX86_BUILTIN__BDESC_CET_NORMAL_LAST, 1); + IX86_BUILTIN__BDESC_CET_LAST, 1); /* Table for the ix86 builtin non-function types. */ @@ -1227,21 +1225,6 @@ ix86_init_mmx_sse_builtins (void) BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_LAST, IX86_BUILTIN__BDESC_CET_FIRST, ARRAY_SIZE (bdesc_cet) - 1); - - for (i = 0, d = bdesc_cet_rdssp; - i < ARRAY_SIZE (bdesc_cet_rdssp); - i++, d++) - { - BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, i); - if (d->name == 0) - continue; - - ftype = (enum ix86_builtin_func_type) d->flag; - def_builtin (d->mask, d->mask2, d->name, ftype, d->code); - } - BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_LAST, - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, - ARRAY_SIZE (bdesc_cet_rdssp) - 1); } #undef BDESC_VERIFY @@ -1883,7 +1866,8 @@ get_builtin_code_for_version (tree decl, tree *predicate_list) before the ssse3 version. */ if (strstr (attrs_str, "arch=") != NULL) { - cl_target_option_save (&cur_target, &global_options); + cl_target_option_save (&cur_target, &global_options, + &global_options_set); target_node = ix86_valid_target_attribute_tree (decl, attrs, &global_options, &global_options_set, 0); @@ -1952,7 +1936,8 @@ get_builtin_code_for_version (tree decl, tree *predicate_list) break; } - cl_target_option_restore (&global_options, &cur_target); + cl_target_option_restore (&global_options, &global_options_set, + &cur_target); if (predicate_list && arg_str == NULL) { diff --git a/gcc/config/i386/i386-c.c b/gcc/config/i386/i386-c.c index 2d61a0c..e647fce 100644 --- a/gcc/config/i386/i386-c.c +++ b/gcc/config/i386/i386-c.c @@ -588,6 +588,17 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag, def_or_undef (parse_in, "__ENQCMD__"); if (isa_flag2 & OPTION_MASK_ISA2_TSXLDTRK) def_or_undef (parse_in, "__TSXLDTRK__"); + if (isa_flag2 & OPTION_MASK_ISA2_AMX_TILE) + def_or_undef (parse_in, "__AMX_TILE__"); + if (isa_flag2 & OPTION_MASK_ISA2_AMX_INT8) + def_or_undef (parse_in, "__AMX_INT8__"); + if (isa_flag2 & OPTION_MASK_ISA2_AMX_BF16) + def_or_undef (parse_in, "__AMX_BF16__"); + if (isa_flag & OPTION_MASK_ISA_SAHF) + def_or_undef (parse_in, "__LAHF_SAHF__"); + if (isa_flag2 & OPTION_MASK_ISA2_MOVBE) + def_or_undef (parse_in, "__MOVBE__"); + if (TARGET_IAMCU) { def_or_undef (parse_in, "__iamcu"); @@ -603,7 +614,8 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag, static bool ix86_pragma_target_parse (tree args, tree pop_target) { - tree prev_tree = build_target_option_node (&global_options); + tree prev_tree + = build_target_option_node (&global_options, &global_options_set); tree cur_tree; struct cl_target_option *prev_opt; struct cl_target_option *cur_opt; @@ -621,7 +633,7 @@ ix86_pragma_target_parse (tree args, tree pop_target) if (! args) { cur_tree = (pop_target ? 
pop_target : target_option_default_node); - cl_target_option_restore (&global_options, + cl_target_option_restore (&global_options, &global_options_set, TREE_TARGET_OPTION (cur_tree)); } else @@ -631,7 +643,7 @@ ix86_pragma_target_parse (tree args, tree pop_target) &global_options_set, 0); if (!cur_tree || cur_tree == error_mark_node) { - cl_target_option_restore (&global_options, + cl_target_option_restore (&global_options, &global_options_set, TREE_TARGET_OPTION (prev_tree)); return false; } diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index e194214..e6f8b31 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -116,6 +116,12 @@ split_double_mode (machine_mode mode, rtx operands[], case E_DImode: half_mode = SImode; break; + case E_P2HImode: + half_mode = HImode; + break; + case E_P2QImode: + half_mode = QImode; + break; default: gcc_unreachable (); } @@ -3305,7 +3311,17 @@ ix86_expand_int_movcc (rtx operands[]) { var = operands[2]; if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx) - operands[2] = constm1_rtx, op = and_optab; + { + /* For smin (x, 0), expand as "x < 0 ? x : 0" instead of + "x <= 0 ? x : 0" to enable sign_bit_compare_p. */ + if (code == LE && op1 == const0_rtx && rtx_equal_p (op0, var)) + operands[1] = simplify_gen_relational (LT, VOIDmode, + GET_MODE (op0), + op0, const0_rtx); + + operands[2] = constm1_rtx; + op = and_optab; + } else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx) operands[2] = const0_rtx, op = ior_optab; else @@ -3480,6 +3496,13 @@ ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1, || (op_false && reg_overlap_mentioned_p (dest, op_false))) dest = gen_reg_rtx (maskcmp ? cmp_mode : mode); + if (maskcmp) + { + bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1); + gcc_assert (ok); + return dest; + } + x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1); if (cmp_mode != mode && !maskcmp) @@ -3915,11 +3938,10 @@ ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode) /* Expand AVX-512 vector comparison. 
*/ bool -ix86_expand_mask_vec_cmp (rtx operands[]) +ix86_expand_mask_vec_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1) { - machine_mode mask_mode = GET_MODE (operands[0]); - machine_mode cmp_mode = GET_MODE (operands[2]); - enum rtx_code code = GET_CODE (operands[1]); + machine_mode mask_mode = GET_MODE (dest); + machine_mode cmp_mode = GET_MODE (cmp_op0); rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode)); int unspec_code; rtx unspec; @@ -3937,10 +3959,9 @@ ix86_expand_mask_vec_cmp (rtx operands[]) unspec_code = UNSPEC_PCMP; } - unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2], - operands[3], imm), + unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, cmp_op0, cmp_op1, imm), unspec_code); - emit_insn (gen_rtx_SET (operands[0], unspec)); + emit_insn (gen_rtx_SET (dest, unspec)); return true; } @@ -10650,15 +10671,6 @@ ix86_expand_special_args_builtin (const struct builtin_description *d, klass = load; memory = 0; break; - case VOID_FTYPE_UINT_UINT_UINT: - case VOID_FTYPE_UINT64_UINT_UINT: - case UCHAR_FTYPE_UINT_UINT_UINT: - case UCHAR_FTYPE_UINT64_UINT_UINT: - nargs = 3; - klass = load; - memory = ARRAY_SIZE (args); - last_arg_constant = true; - break; default: gcc_unreachable (); } @@ -10713,13 +10725,7 @@ ix86_expand_special_args_builtin (const struct builtin_description *d, { if (!match) { - if (icode == CODE_FOR_lwp_lwpvalsi3 - || icode == CODE_FOR_lwp_lwpinssi3 - || icode == CODE_FOR_lwp_lwpvaldi3 - || icode == CODE_FOR_lwp_lwpinsdi3) - error ("the last argument must be a 32-bit immediate"); - else - error ("the last argument must be an 8-bit immediate"); + error ("the last argument must be an 8-bit immediate"); return const0_rtx; } } @@ -11433,24 +11439,24 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget, } else { - rtx pat; + if (target == 0 + || !register_operand (target, SImode)) + target = gen_reg_rtx (SImode); - target = gen_reg_rtx (SImode); emit_move_insn (target, const0_rtx); target = gen_rtx_SUBREG (QImode, target, 0); - if (fcode == IX86_BUILTIN_ENQCMD) - pat = gen_enqcmd (UNSPECV_ENQCMD, Pmode, op0, op1); - else - pat = gen_enqcmd (UNSPECV_ENQCMDS, Pmode, op0, op1); - - emit_insn (pat); - - emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), - gen_rtx_fmt_ee (EQ, QImode, - SET_DEST (pat), - const0_rtx))); + int unspecv = (fcode == IX86_BUILTIN_ENQCMD + ? 
UNSPECV_ENQCMD + : UNSPECV_ENQCMDS); + icode = code_for_enqcmd (unspecv, Pmode); + emit_insn (GEN_FCN (icode) (op0, op1)); + emit_insn + (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), + gen_rtx_fmt_ee (EQ, QImode, + gen_rtx_REG (CCZmode, FLAGS_REG), + const0_rtx))); return SUBREG_REG (target); } @@ -11643,40 +11649,92 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget, case IX86_BUILTIN_LLWPCB: arg0 = CALL_EXPR_ARG (exp, 0); op0 = expand_normal (arg0); - icode = CODE_FOR_lwp_llwpcb; - if (!insn_data[icode].operand[0].predicate (op0, Pmode)) + + if (!register_operand (op0, Pmode)) op0 = ix86_zero_extend_to_Pmode (op0); - emit_insn (gen_lwp_llwpcb (op0)); + emit_insn (gen_lwp_llwpcb (Pmode, op0)); return 0; case IX86_BUILTIN_SLWPCB: - icode = CODE_FOR_lwp_slwpcb; if (!target - || !insn_data[icode].operand[0].predicate (target, Pmode)) + || !register_operand (target, Pmode)) target = gen_reg_rtx (Pmode); - emit_insn (gen_lwp_slwpcb (target)); + emit_insn (gen_lwp_slwpcb (Pmode, target)); return target; + case IX86_BUILTIN_LWPVAL32: + case IX86_BUILTIN_LWPVAL64: + case IX86_BUILTIN_LWPINS32: + case IX86_BUILTIN_LWPINS64: + mode = ((fcode == IX86_BUILTIN_LWPVAL32 + || fcode == IX86_BUILTIN_LWPINS32) + ? SImode : DImode); + + if (fcode == IX86_BUILTIN_LWPVAL32 + || fcode == IX86_BUILTIN_LWPVAL64) + icode = code_for_lwp_lwpval (mode); + else + icode = code_for_lwp_lwpins (mode); + + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + arg2 = CALL_EXPR_ARG (exp, 2); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + op2 = expand_normal (arg2); + mode0 = insn_data[icode].operand[0].mode; + + if (!insn_data[icode].operand[0].predicate (op0, mode0)) + op0 = copy_to_mode_reg (mode0, op0); + if (!insn_data[icode].operand[1].predicate (op1, SImode)) + op1 = copy_to_mode_reg (SImode, op1); + + if (!CONST_INT_P (op2)) + { + error ("the last argument must be a 32-bit immediate"); + return const0_rtx; + } + + emit_insn (GEN_FCN (icode) (op0, op1, op2)); + + if (fcode == IX86_BUILTIN_LWPINS32 + || fcode == IX86_BUILTIN_LWPINS64) + { + if (target == 0 + || !nonimmediate_operand (target, QImode)) + target = gen_reg_rtx (QImode); + + pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG), + const0_rtx); + emit_insn (gen_rtx_SET (target, pat)); + + return target; + } + else + return 0; + case IX86_BUILTIN_BEXTRI32: case IX86_BUILTIN_BEXTRI64: + mode = (fcode == IX86_BUILTIN_BEXTRI32 ? SImode : DImode); + arg0 = CALL_EXPR_ARG (exp, 0); arg1 = CALL_EXPR_ARG (exp, 1); op0 = expand_normal (arg0); op1 = expand_normal (arg1); - icode = (fcode == IX86_BUILTIN_BEXTRI32 - ? 
CODE_FOR_tbm_bextri_si - : CODE_FOR_tbm_bextri_di); + if (!CONST_INT_P (op1)) - { - error ("last argument must be an immediate"); - return const0_rtx; - } + { + error ("last argument must be an immediate"); + return const0_rtx; + } else - { - unsigned char length = (INTVAL (op1) >> 8) & 0xFF; - unsigned char lsb_index = INTVAL (op1) & 0xFF; - op1 = GEN_INT (length); - op2 = GEN_INT (lsb_index); + { + unsigned char lsb_index = UINTVAL (op1); + unsigned char length = UINTVAL (op1) >> 8; + + unsigned char bitsize = GET_MODE_BITSIZE (mode); + + icode = code_for_tbm_bextri (mode); mode1 = insn_data[icode].operand[1].mode; if (!insn_data[icode].operand[1].predicate (op0, mode1)) @@ -11687,25 +11745,32 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget, || !register_operand (target, mode0)) target = gen_reg_rtx (mode0); - pat = GEN_FCN (icode) (target, op0, op1, op2); - if (pat) - emit_insn (pat); - return target; - } + if (length == 0 || lsb_index >= bitsize) + { + emit_move_insn (target, const0_rtx); + return target; + } + + if (length + lsb_index > bitsize) + length = bitsize - lsb_index; + + op1 = GEN_INT (length); + op2 = GEN_INT (lsb_index); + + emit_insn (GEN_FCN (icode) (target, op0, op1, op2)); + return target; + } case IX86_BUILTIN_RDRAND16_STEP: - icode = CODE_FOR_rdrandhi_1; - mode0 = HImode; + mode = HImode; goto rdrand_step; case IX86_BUILTIN_RDRAND32_STEP: - icode = CODE_FOR_rdrandsi_1; - mode0 = SImode; + mode = SImode; goto rdrand_step; case IX86_BUILTIN_RDRAND64_STEP: - icode = CODE_FOR_rdranddi_1; - mode0 = DImode; + mode = DImode; rdrand_step: arg0 = CALL_EXPR_ARG (exp, 0); @@ -11716,16 +11781,15 @@ rdrand_step: op1 = copy_addr_to_reg (op1); } - op0 = gen_reg_rtx (mode0); - emit_insn (GEN_FCN (icode) (op0)); + op0 = gen_reg_rtx (mode); + emit_insn (gen_rdrand (mode, op0)); - emit_move_insn (gen_rtx_MEM (mode0, op1), op0); + emit_move_insn (gen_rtx_MEM (mode, op1), op0); - op1 = gen_reg_rtx (SImode); - emit_move_insn (op1, CONST1_RTX (SImode)); + op1 = force_reg (SImode, const1_rtx); /* Emit SImode conditional move. */ - if (mode0 == HImode) + if (mode == HImode) { if (TARGET_ZERO_EXTEND_WITH_AND && optimize_function_for_speed_p (cfun)) @@ -11742,7 +11806,7 @@ rdrand_step: emit_insn (gen_zero_extendhisi2 (op2, op0)); } } - else if (mode0 == SImode) + else if (mode == SImode) op2 = op0; else op2 = gen_rtx_SUBREG (SImode, op0, 0); @@ -11758,18 +11822,15 @@ rdrand_step: return target; case IX86_BUILTIN_RDSEED16_STEP: - icode = CODE_FOR_rdseedhi_1; - mode0 = HImode; + mode = HImode; goto rdseed_step; case IX86_BUILTIN_RDSEED32_STEP: - icode = CODE_FOR_rdseedsi_1; - mode0 = SImode; + mode = SImode; goto rdseed_step; case IX86_BUILTIN_RDSEED64_STEP: - icode = CODE_FOR_rdseeddi_1; - mode0 = DImode; + mode = DImode; rdseed_step: arg0 = CALL_EXPR_ARG (exp, 0); @@ -11780,10 +11841,10 @@ rdseed_step: op1 = copy_addr_to_reg (op1); } - op0 = gen_reg_rtx (mode0); - emit_insn (GEN_FCN (icode) (op0)); + op0 = gen_reg_rtx (mode); + emit_insn (gen_rdseed (mode, op0)); - emit_move_insn (gen_rtx_MEM (mode0, op1), op0); + emit_move_insn (gen_rtx_MEM (mode, op1), op0); op2 = gen_reg_rtx (QImode); @@ -12721,55 +12782,75 @@ rdseed_step: emit_insn (gen_xabort (op0)); return 0; + case IX86_BUILTIN_RDSSPD: + case IX86_BUILTIN_RDSSPQ: + mode = (fcode == IX86_BUILTIN_RDSSPD ? 
SImode : DImode); + + if (target == 0 + || !register_operand (target, mode)) + target = gen_reg_rtx (mode); + + op0 = force_reg (mode, const0_rtx); + + emit_insn (gen_rdssp (mode, target, op0)); + return target; + + case IX86_BUILTIN_INCSSPD: + case IX86_BUILTIN_INCSSPQ: + mode = (fcode == IX86_BUILTIN_INCSSPD ? SImode : DImode); + + arg0 = CALL_EXPR_ARG (exp, 0); + op0 = expand_normal (arg0); + + op0 = force_reg (mode, op0); + + emit_insn (gen_incssp (mode, op0)); + return 0; + case IX86_BUILTIN_RSTORSSP: case IX86_BUILTIN_CLRSSBSY: arg0 = CALL_EXPR_ARG (exp, 0); op0 = expand_normal (arg0); icode = (fcode == IX86_BUILTIN_RSTORSSP - ? CODE_FOR_rstorssp - : CODE_FOR_clrssbsy); + ? CODE_FOR_rstorssp + : CODE_FOR_clrssbsy); + if (!address_operand (op0, VOIDmode)) { - op1 = convert_memory_address (Pmode, op0); - op0 = copy_addr_to_reg (op1); + op0 = convert_memory_address (Pmode, op0); + op0 = copy_addr_to_reg (op0); } - emit_insn (GEN_FCN (icode) (gen_rtx_MEM (Pmode, op0))); + emit_insn (GEN_FCN (icode) (gen_rtx_MEM (DImode, op0))); return 0; case IX86_BUILTIN_WRSSD: case IX86_BUILTIN_WRSSQ: case IX86_BUILTIN_WRUSSD: case IX86_BUILTIN_WRUSSQ: + mode = ((fcode == IX86_BUILTIN_WRSSD + || fcode == IX86_BUILTIN_WRUSSD) + ? SImode : DImode); + arg0 = CALL_EXPR_ARG (exp, 0); op0 = expand_normal (arg0); arg1 = CALL_EXPR_ARG (exp, 1); op1 = expand_normal (arg1); - switch (fcode) - { - case IX86_BUILTIN_WRSSD: - icode = CODE_FOR_wrsssi; - mode = SImode; - break; - case IX86_BUILTIN_WRSSQ: - icode = CODE_FOR_wrssdi; - mode = DImode; - break; - case IX86_BUILTIN_WRUSSD: - icode = CODE_FOR_wrusssi; - mode = SImode; - break; - case IX86_BUILTIN_WRUSSQ: - icode = CODE_FOR_wrussdi; - mode = DImode; - break; - } + op0 = force_reg (mode, op0); + if (!address_operand (op1, VOIDmode)) { - op2 = convert_memory_address (Pmode, op1); - op1 = copy_addr_to_reg (op2); + op1 = convert_memory_address (Pmode, op1); + op1 = copy_addr_to_reg (op1); } - emit_insn (GEN_FCN (icode) (op0, gen_rtx_MEM (mode, op1))); + op1 = gen_rtx_MEM (mode, op1); + + icode = ((fcode == IX86_BUILTIN_WRSSD + || fcode == IX86_BUILTIN_WRSSQ) + ? 
code_for_wrss (mode) + : code_for_wruss (mode)); + emit_insn (GEN_FCN (icode) (op0, op1)); + return 0; default: @@ -13071,14 +13152,6 @@ s4fma_expand: target); } - if (fcode >= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST - && fcode <= IX86_BUILTIN__BDESC_CET_NORMAL_LAST) - { - i = fcode - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST; - return ix86_expand_special_args_builtin (bdesc_cet_rdssp + i, exp, - target); - } - gcc_unreachable (); } @@ -19537,7 +19610,7 @@ bool ix86_expand_vec_shift_qihi_constant (enum rtx_code code, rtx dest, rtx op1, rtx op2) { machine_mode qimode, himode; - unsigned int and_constant, xor_constant; + HOST_WIDE_INT and_constant, xor_constant; HOST_WIDE_INT shift_amount; rtx vec_const_and, vec_const_xor; rtx tmp, op1_subreg; @@ -19612,7 +19685,7 @@ ix86_expand_vec_shift_qihi_constant (enum rtx_code code, rtx dest, rtx op1, rtx emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0)); emit_move_insn (vec_const_and, ix86_build_const_vector (qimode, true, - GEN_INT (and_constant))); + gen_int_mode (and_constant, QImode))); emit_insn (gen_and (dest, dest, vec_const_and)); /* For ASHIFTRT, perform extra operation like @@ -19623,7 +19696,7 @@ ix86_expand_vec_shift_qihi_constant (enum rtx_code code, rtx dest, rtx op1, rtx vec_const_xor = gen_reg_rtx (qimode); emit_move_insn (vec_const_xor, ix86_build_const_vector (qimode, true, - GEN_INT (xor_constant))); + gen_int_mode (xor_constant, QImode))); emit_insn (gen_xor (dest, dest, vec_const_xor)); emit_insn (gen_sub (dest, dest, vec_const_xor)); } @@ -20237,7 +20310,6 @@ ix86_expand_pextr (rtx *operands) case E_V4SImode: case E_V2DImode: case E_V1TImode: - case E_TImode: { machine_mode srcmode, dstmode; rtx d, pat; @@ -20333,7 +20405,6 @@ ix86_expand_pinsr (rtx *operands) case E_V4SImode: case E_V2DImode: case E_V1TImode: - case E_TImode: { machine_mode srcmode, dstmode; rtx (*pinsr)(rtx, rtx, rtx, rtx); diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c index 535fc7e..620f7f1 100644 --- a/gcc/config/i386/i386-features.c +++ b/gcc/config/i386/i386-features.c @@ -2162,6 +2162,81 @@ make_pass_insert_endbr_and_patchable_area (gcc::context *ctxt) return new pass_insert_endbr_and_patchable_area (ctxt); } +/* Replace all one-value const vector that are referenced by SYMBOL_REFs in x + with embedded broadcast. i.e.transform + + vpaddq .LC0(%rip), %zmm0, %zmm0 + ret + .LC0: + .quad 3 + .quad 3 + .quad 3 + .quad 3 + .quad 3 + .quad 3 + .quad 3 + .quad 3 + + to + + vpaddq .LC0(%rip){1to8}, %zmm0, %zmm0 + ret + .LC0: + .quad 3 */ +static void +replace_constant_pool_with_broadcast (rtx_insn *insn) +{ + subrtx_ptr_iterator::array_type array; + FOR_EACH_SUBRTX_PTR (iter, array, &PATTERN (insn), ALL) + { + rtx *loc = *iter; + rtx x = *loc; + rtx broadcast_mem, vec_dup, constant, first; + machine_mode mode; + + /* Constant pool. */ + if (!MEM_P (x) + || !SYMBOL_REF_P (XEXP (x, 0)) + || !CONSTANT_POOL_ADDRESS_P (XEXP (x, 0))) + continue; + + /* Const vector. */ + mode = GET_MODE (x); + if (!VECTOR_MODE_P (mode)) + return; + constant = get_pool_constant (XEXP (x, 0)); + if (GET_CODE (constant) != CONST_VECTOR) + return; + + /* There could be some rtx like + (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1"))) + but with "*.LC1" refer to V2DI constant vector. 
*/ + if (GET_MODE (constant) != mode) + { + constant = simplify_subreg (mode, constant, GET_MODE (constant), 0); + if (constant == NULL_RTX || GET_CODE (constant) != CONST_VECTOR) + return; + } + first = XVECEXP (constant, 0, 0); + + for (int i = 1; i < GET_MODE_NUNITS (mode); ++i) + { + rtx tmp = XVECEXP (constant, 0, i); + /* Vector duplicate value. */ + if (!rtx_equal_p (tmp, first)) + return; + } + + /* Replace with embedded broadcast. */ + broadcast_mem = force_const_mem (GET_MODE_INNER (mode), first); + vec_dup = gen_rtx_VEC_DUPLICATE (mode, broadcast_mem); + validate_change (insn, loc, vec_dup, 0); + + /* At most 1 memory_operand in an insn. */ + return; + } +} + /* At entry of the nearest common dominator for basic blocks with conversions, generate a single vxorps %xmmN, %xmmN, %xmmN @@ -2197,6 +2272,10 @@ remove_partial_avx_dependency (void) if (!NONDEBUG_INSN_P (insn)) continue; + /* Handle AVX512 embedded broadcast here to save compile time. */ + if (TARGET_AVX512F) + replace_constant_pool_with_broadcast (insn); + set = single_set (insn); if (!set) continue; @@ -2333,6 +2412,16 @@ remove_partial_avx_dependency (void) return 0; } +static bool +remove_partial_avx_dependency_gate () +{ + return (TARGET_AVX + && TARGET_SSE_PARTIAL_REG_DEPENDENCY + && TARGET_SSE_MATH + && optimize + && optimize_function_for_speed_p (cfun)); +} + namespace { const pass_data pass_data_remove_partial_avx_dependency = @@ -2358,11 +2447,7 @@ public: /* opt_pass methods: */ virtual bool gate (function *) { - return (TARGET_AVX - && TARGET_SSE_PARTIAL_REG_DEPENDENCY - && TARGET_SSE_MATH - && optimize - && optimize_function_for_speed_p (cfun)); + return remove_partial_avx_dependency_gate (); } virtual unsigned int execute (function *) @@ -2379,6 +2464,68 @@ make_pass_remove_partial_avx_dependency (gcc::context *ctxt) return new pass_remove_partial_avx_dependency (ctxt); } +/* For const vector having one duplicated value, there's no need to put + whole vector in the constant pool when target supports embedded broadcast. */ +static unsigned int +constant_pool_broadcast (void) +{ + timevar_push (TV_MACH_DEP); + rtx_insn *insn; + + for (insn = get_insns (); insn; insn = NEXT_INSN (insn)) + { + if (INSN_P (insn)) + replace_constant_pool_with_broadcast (insn); + } + timevar_pop (TV_MACH_DEP); + return 0; +} + +namespace { + +const pass_data pass_data_constant_pool_broadcast = +{ + RTL_PASS, /* type */ + "cpb", /* name */ + OPTGROUP_NONE, /* optinfo_flags */ + TV_MACH_DEP, /* tv_id */ + 0, /* properties_required */ + 0, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + TODO_df_finish, /* todo_flags_finish */ +}; + +class pass_constant_pool_broadcast : public rtl_opt_pass +{ +public: + pass_constant_pool_broadcast (gcc::context *ctxt) + : rtl_opt_pass (pass_data_constant_pool_broadcast, ctxt) + {} + + /* opt_pass methods: */ + virtual bool gate (function *) + { + /* Return false if rpad pass gate is true. + replace_constant_pool_with_broadcast is called + from both this pass and rpad pass. */ + return (TARGET_AVX512F && !remove_partial_avx_dependency_gate ()); + } + + virtual unsigned int execute (function *) + { + return constant_pool_broadcast (); + } +}; // class pass_cpb + +} // anon namespace + +rtl_opt_pass * +make_pass_constant_pool_broadcast (gcc::context *ctxt) +{ + return new pass_constant_pool_broadcast (ctxt); +} + /* This compares the priority of target features in function DECL1 and DECL2. 
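A minimal C sketch of the kind of code the cpb/rpad broadcast rewrite above targets, assuming compilation with -O2 -mavx512f; the function name is illustrative, not part of the patch:

#include <immintrin.h>

/* Every element of the addend is the same constant, so instead of a full
   64-byte constant-pool entry the pool only needs the scalar 3 and the
   memory operand can be emitted with an embedded broadcast, e.g.
   vpaddq .LC0(%rip){1to8}, %zmm0, %zmm0.  */
__m512i
add_three (__m512i x)
{
  return _mm512_add_epi64 (x, _mm512_set1_epi64 (3));
}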
It returns positive value if DECL1 is higher priority, negative value if DECL2 is higher priority and 0 if they are the diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c index 26d1ea1..a59bd70 100644 --- a/gcc/config/i386/i386-options.c +++ b/gcc/config/i386/i386-options.c @@ -209,7 +209,10 @@ static struct ix86_target_opts isa2_opts[] = { "-mavx512bf16", OPTION_MASK_ISA2_AVX512BF16 }, { "-menqcmd", OPTION_MASK_ISA2_ENQCMD }, { "-mserialize", OPTION_MASK_ISA2_SERIALIZE }, - { "-mtsxldtrk", OPTION_MASK_ISA2_TSXLDTRK } + { "-mtsxldtrk", OPTION_MASK_ISA2_TSXLDTRK }, + { "-mamx-tile", OPTION_MASK_ISA2_AMX_TILE }, + { "-mamx-int8", OPTION_MASK_ISA2_AMX_INT8 }, + { "-mamx-bf16", OPTION_MASK_ISA2_AMX_BF16 } }; static struct ix86_target_opts isa_opts[] = { @@ -627,7 +630,8 @@ ix86_debug_options (void) void ix86_function_specific_save (struct cl_target_option *ptr, - struct gcc_options *opts) + struct gcc_options *opts, + struct gcc_options */* opts_set */) { ptr->arch = ix86_arch; ptr->schedule = ix86_schedule; @@ -754,6 +758,7 @@ set_ix86_tune_features (struct gcc_options *opts, void ix86_function_specific_restore (struct gcc_options *opts, + struct gcc_options */* opts_set */, struct cl_target_option *ptr) { enum processor_type old_tune = ix86_tune; @@ -922,12 +927,18 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[], #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 } #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M } #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M } +#define IX86_ATTR_IX86_YES(S,O,M) \ + { S, sizeof (S)-1, ix86_opt_ix86_yes, O, M } +#define IX86_ATTR_IX86_NO(S,O,M) \ + { S, sizeof (S)-1, ix86_opt_ix86_no, O, M } enum ix86_opt_type { ix86_opt_unknown, ix86_opt_yes, ix86_opt_no, + ix86_opt_ix86_yes, + ix86_opt_ix86_no, ix86_opt_str, ix86_opt_enum, ix86_opt_isa @@ -1025,6 +1036,9 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[], IX86_ATTR_ISA ("enqcmd", OPT_menqcmd), IX86_ATTR_ISA ("serialize", OPT_mserialize), IX86_ATTR_ISA ("tsxldtrk", OPT_mtsxldtrk), + IX86_ATTR_ISA ("amx-tile", OPT_mamx_tile), + IX86_ATTR_ISA ("amx-int8", OPT_mamx_int8), + IX86_ATTR_ISA ("amx-bf16", OPT_mamx_bf16), /* enum options */ IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_), @@ -1062,6 +1076,10 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[], IX86_ATTR_YES ("recip", OPT_mrecip, MASK_RECIP), + + IX86_ATTR_IX86_YES ("general-regs-only", + OPT_mgeneral_regs_only, + OPTION_MASK_GENERAL_REGS_ONLY), }; location_t loc @@ -1175,6 +1193,40 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[], opts->x_target_flags &= ~mask; } + else if (type == ix86_opt_ix86_yes || type == ix86_opt_ix86_no) + { + if (mask == OPTION_MASK_GENERAL_REGS_ONLY) + { + if (!opt_set_p) + { + error_at (loc, "pragma or attribute %<target(\"%s\")%> " + "does not allow a negated form", p); + return false; + } + + if (type != ix86_opt_ix86_yes) + gcc_unreachable (); + + opts->x_ix86_target_flags |= mask; + + struct cl_decoded_option decoded; + generate_option (opt, NULL, opt_set_p, CL_TARGET, + &decoded); + ix86_handle_option (opts, opts_set, &decoded, + input_location); + } + else + { + if (type == ix86_opt_ix86_no) + opt_set_p = !opt_set_p; + + if (opt_set_p) + opts->x_ix86_target_flags |= mask; + else + opts->x_ix86_target_flags &= ~mask; + } + } + else if (type == ix86_opt_str) { if (p_strings[opt]) @@ -1312,7 +1364,7 @@ 
ix86_valid_target_attribute_tree (tree fndecl, tree args, /* Save the current options unless we are validating options for #pragma. */ - t = build_target_option_node (opts); + t = build_target_option_node (opts, opts_set); opts->x_ix86_arch_string = orig_arch_string; opts->x_ix86_tune_string = orig_tune_string; @@ -1333,7 +1385,7 @@ ix86_valid_target_attribute_p (tree fndecl, tree args, int flags) { - struct gcc_options func_options; + struct gcc_options func_options, func_options_set; tree new_target, new_optimize; bool ret = true; @@ -1345,7 +1397,8 @@ ix86_valid_target_attribute_p (tree fndecl, && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0) return true; - tree old_optimize = build_optimization_node (&global_options); + tree old_optimize = build_optimization_node (&global_options, + &global_options_set); /* Get the optimization options of the current function. */ tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl); @@ -1357,21 +1410,22 @@ ix86_valid_target_attribute_p (tree fndecl, memset (&func_options, 0, sizeof (func_options)); init_options_struct (&func_options, NULL); lang_hooks.init_options_struct (&func_options); - - cl_optimization_restore (&func_options, + memset (&func_options_set, 0, sizeof (func_options_set)); + + cl_optimization_restore (&func_options, &func_options_set, TREE_OPTIMIZATION (func_optimize)); /* Initialize func_options to the default before its target options can be set. */ - cl_target_option_restore (&func_options, + cl_target_option_restore (&func_options, &func_options_set, TREE_TARGET_OPTION (target_option_default_node)); /* FLAGS == 1 is used for target_clones attribute. */ new_target = ix86_valid_target_attribute_tree (fndecl, args, &func_options, - &global_options_set, flags == 1); + &func_options_set, flags == 1); - new_optimize = build_optimization_node (&func_options); + new_optimize = build_optimization_node (&func_options, &func_options_set); if (new_target == error_mark_node) ret = false; @@ -2004,10 +2058,27 @@ ix86_option_override_internal (bool main_args_p, return false; } + /* The feature-only micro-architecture levels that use + PTA_NO_TUNE are only defined for the x86-64 psABI. */ + if ((processor_alias_table[i].flags & PTA_NO_TUNE) != 0 + && (!TARGET_64BIT_P (opts->x_ix86_isa_flags) + || opts->x_ix86_abi != SYSV_ABI)) + { + error (G_("%<%s%> architecture level is only defined" + " for the x86-64 psABI"), opts->x_ix86_arch_string); + return false; + } + ix86_schedule = processor_alias_table[i].schedule; ix86_arch = processor_alias_table[i].processor; - /* Default cpu tuning to the architecture. */ - ix86_tune = ix86_arch; + + /* Default cpu tuning to the architecture, unless the table + entry requests not to do this. Used by the x86-64 psABI + micro-architecture levels. 
*/ + if ((processor_alias_table[i].flags & PTA_NO_TUNE) == 0) + ix86_tune = ix86_arch; + else + ix86_tune = PROCESSOR_GENERIC; if (((processor_alias_table[i].flags & PTA_MMX) != 0) && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX)) @@ -2210,6 +2281,18 @@ ix86_option_override_internal (bool main_args_p, && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA2_AVX512BF16)) opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_AVX512BF16; + if (((processor_alias_table[i].flags & PTA_AMX_TILE) != 0) + && !(opts->x_ix86_isa_flags2_explicit + & OPTION_MASK_ISA2_AMX_TILE)) + opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_AMX_TILE; + if (((processor_alias_table[i].flags & PTA_AMX_INT8) != 0) + && !(opts->x_ix86_isa_flags2_explicit + & OPTION_MASK_ISA2_AMX_INT8)) + opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_AMX_INT8; + if (((processor_alias_table[i].flags & PTA_AMX_BF16) != 0) + && !(opts->x_ix86_isa_flags2_explicit + & OPTION_MASK_ISA2_AMX_BF16)) + opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_AMX_BF16; if (((processor_alias_table[i].flags & PTA_MOVDIRI) != 0) && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVDIRI)) opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MOVDIRI; @@ -2260,9 +2343,10 @@ ix86_option_override_internal (bool main_args_p, && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PKU)) opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PKU; - /* Don't enable x87 instructions if only - general registers are allowed. */ - if (!(opts_set->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY) + /* Don't enable x87 instructions if only general registers are + allowed by target("general-regs-only") function attribute or + -mgeneral-regs-only. */ + if (!(opts->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY) && !(opts_set->x_target_flags & MASK_80387)) { if (((processor_alias_table[i].flags & PTA_NO_80387) != 0)) @@ -2317,7 +2401,8 @@ ix86_option_override_internal (bool main_args_p, ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask); for (i = 0; i < pta_size; i++) - if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name)) + if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name) + && (processor_alias_table[i].flags & PTA_NO_TUNE) == 0) { ix86_schedule = processor_alias_table[i].schedule; ix86_tune = processor_alias_table[i].processor; @@ -2361,8 +2446,9 @@ ix86_option_override_internal (bool main_args_p, auto_vec <const char *> candidates; for (i = 0; i < pta_size; i++) - if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) - || ((processor_alias_table[i].flags & PTA_64BIT) != 0)) + if ((!TARGET_64BIT_P (opts->x_ix86_isa_flags) + || ((processor_alias_table[i].flags & PTA_64BIT) != 0)) + && (processor_alias_table[i].flags & PTA_NO_TUNE) == 0) candidates.safe_push (processor_alias_table[i].name); #ifdef HAVE_LOCAL_CPU_DETECT @@ -2909,7 +2995,7 @@ ix86_option_override_internal (bool main_args_p, options. 
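The new attribute strings wired up above ("amx-tile", "amx-int8", "amx-bf16" and "general-regs-only") can be applied per function; a small hedged sketch, with illustrative function names:

/* "general-regs-only" has no negated form and keeps the body to integer
   registers only (no x87/MMX/SSE/AVX).  */
__attribute__ ((target ("general-regs-only")))
int
gpr_only_helper (int x)
{
  return x + 1;
}

/* The AMX ISA strings enable the corresponding builtins for just this
   function, independently of the translation unit's -m flags.  */
__attribute__ ((target ("amx-tile,amx-bf16")))
void amx_kernel (void);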
*/ if (main_args_p) target_option_default_node = target_option_current_node - = build_target_option_node (opts); + = build_target_option_node (opts, opts_set); if (opts->x_flag_cf_protection != CF_NONE) opts->x_flag_cf_protection @@ -2946,7 +3032,8 @@ void ix86_reset_previous_fndecl (void) { tree new_tree = target_option_current_node; - cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree)); + cl_target_option_restore (&global_options, &global_options_set, + TREE_TARGET_OPTION (new_tree)); if (TREE_TARGET_GLOBALS (new_tree)) restore_target_globals (TREE_TARGET_GLOBALS (new_tree)); else if (new_tree == target_option_default_node) @@ -3205,7 +3292,8 @@ ix86_set_current_function (tree fndecl) if (old_tree != new_tree) { - cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree)); + cl_target_option_restore (&global_options, &global_options_set, + TREE_TARGET_OPTION (new_tree)); if (TREE_TARGET_GLOBALS (new_tree)) restore_target_globals (TREE_TARGET_GLOBALS (new_tree)); else if (new_tree == target_option_default_node) diff --git a/gcc/config/i386/i386-options.h b/gcc/config/i386/i386-options.h index 646d3d5..9172936 100644 --- a/gcc/config/i386/i386-options.h +++ b/gcc/config/i386/i386-options.h @@ -70,8 +70,10 @@ extern const char *stringop_alg_names[]; void ix86_add_new_builtins (HOST_WIDE_INT isa, HOST_WIDE_INT isa2); void ix86_function_specific_save (struct cl_target_option *, - struct gcc_options *opts); + struct gcc_options *opts, + struct gcc_options *opts_set); void ix86_function_specific_restore (struct gcc_options *opts, + struct gcc_options *opts_set, struct cl_target_option *); void ix86_function_specific_post_stream_in (struct cl_target_option *); void ix86_function_specific_print (FILE *, int, diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def index d83c7b9..07ecf8e 100644 --- a/gcc/config/i386/i386-passes.def +++ b/gcc/config/i386/i386-passes.def @@ -33,3 +33,4 @@ along with GCC; see the file COPYING3. 
If not see INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_endbr_and_patchable_area); INSERT_PASS_AFTER (pass_combine, 1, pass_remove_partial_avx_dependency); + INSERT_PASS_AFTER (pass_combine, 1, pass_constant_pool_broadcast); diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 7c2ce61..c5b700e 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -143,7 +143,7 @@ extern bool ix86_expand_fp_movcc (rtx[]); extern bool ix86_expand_fp_vcond (rtx[]); extern bool ix86_expand_int_vcond (rtx[]); extern void ix86_expand_vec_perm (rtx[]); -extern bool ix86_expand_mask_vec_cmp (rtx[]); +extern bool ix86_expand_mask_vec_cmp (rtx, enum rtx_code, rtx, rtx); extern bool ix86_expand_int_vec_cmp (rtx[]); extern bool ix86_expand_fp_vec_cmp (rtx[]); extern void ix86_expand_sse_movcc (rtx, rtx, rtx, rtx); @@ -223,7 +223,7 @@ extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int); #ifdef TREE_CODE extern int ix86_data_alignment (tree, unsigned int, bool); extern unsigned int ix86_local_alignment (tree, machine_mode, - unsigned int); + unsigned int, bool = false); extern unsigned int ix86_minimum_alignment (tree, machine_mode, unsigned int); extern tree ix86_handle_shared_attribute (tree *, tree, tree, int, bool *); @@ -386,3 +386,4 @@ extern rtl_opt_pass *make_pass_insert_endbr_and_patchable_area (gcc::context *); extern rtl_opt_pass *make_pass_remove_partial_avx_dependency (gcc::context *); +extern rtl_opt_pass *make_pass_constant_pool_broadcast (gcc::context *); diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 5c373c0..f684954 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -1484,7 +1484,7 @@ ix86_reg_parm_stack_space (const_tree fndecl) bool ix86_libc_has_function (enum function_class fn_class) { - return targetm.libc_has_function (fn_class); + return targetm.libc_has_function (fn_class, NULL_TREE); } /* Returns value SYSV_ABI, MS_ABI dependent on fntype, @@ -6169,10 +6169,7 @@ ix86_compute_frame_layout (void) } frame->save_regs_using_mov - = (TARGET_PROLOGUE_USING_MOVE && m->use_fast_prologue_epilogue - /* If static stack checking is enabled and done with probes, - the registers need to be saved before allocating the frame. */ - && flag_stack_check != STATIC_BUILTIN_STACK_CHECK); + = TARGET_PROLOGUE_USING_MOVE && m->use_fast_prologue_epilogue; /* Skip return address and error code in exception handler. */ offset = INCOMING_FRAME_SP_OFFSET; @@ -6329,6 +6326,9 @@ ix86_compute_frame_layout (void) if ((!to_allocate && frame->nregs <= 1) || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000)) + /* If static stack checking is enabled and done with probes, + the registers need to be saved before allocating the frame. */ + || flag_stack_check == STATIC_BUILTIN_STACK_CHECK /* If stack clash probing needs a loop, then it needs a scratch register. But the returned register is only guaranteed to be safe to use after register saves are complete. So if @@ -6968,10 +6968,12 @@ ix86_update_stack_boundary (void) static rtx ix86_get_drap_rtx (void) { - /* We must use DRAP if there are outgoing arguments on stack and + /* We must use DRAP if there are outgoing arguments on stack or + the stack pointer register is clobbered by asm statment and ACCUMULATE_OUTGOING_ARGS is false. 
*/ if (ix86_force_drap - || (cfun->machine->outgoing_args_on_stack + || ((cfun->machine->outgoing_args_on_stack + || crtl->sp_is_clobbered_by_asm) && !ACCUMULATE_OUTGOING_ARGS)) crtl->need_drap = true; @@ -7122,17 +7124,20 @@ release_scratch_register_on_entry (struct scratch_reg *sr, HOST_WIDE_INT offset, /* Emit code to adjust the stack pointer by SIZE bytes while probing it. - This differs from the next routine in that it tries hard to prevent - attacks that jump the stack guard. Thus it is never allowed to allocate - more than PROBE_INTERVAL bytes of stack space without a suitable - probe. + If INT_REGISTERS_SAVED is true, then integer registers have already been + pushed on the stack. - INT_REGISTERS_SAVED is true if integer registers have already been - pushed on the stack. */ + If PROTECTION AREA is true, then probe PROBE_INTERVAL plus a small dope + beyond SIZE bytes. + + This assumes no knowledge of the current probing state, i.e. it is never + allowed to allocate more than PROBE_INTERVAL bytes of stack space without + a suitable probe. */ static void -ix86_adjust_stack_and_probe_stack_clash (HOST_WIDE_INT size, - const bool int_registers_saved) +ix86_adjust_stack_and_probe (HOST_WIDE_INT size, + const bool int_registers_saved, + const bool protection_area) { struct machine_function *m = cfun->machine; @@ -7194,10 +7199,17 @@ ix86_adjust_stack_and_probe_stack_clash (HOST_WIDE_INT size, emit_insn (gen_blockage ()); } + const HOST_WIDE_INT probe_interval = get_probe_interval (); + const int dope = 4 * UNITS_PER_WORD; + + /* If there is protection area, take it into account in the size. */ + if (protection_area) + size += probe_interval + dope; + /* If we allocate less than the size of the guard statically, then no probing is necessary, but we do need to allocate the stack. */ - if (size < (1 << param_stack_clash_protection_guard_size)) + else if (size < (1 << param_stack_clash_protection_guard_size)) { pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, GEN_INT (-size), -1, @@ -7209,7 +7221,6 @@ ix86_adjust_stack_and_probe_stack_clash (HOST_WIDE_INT size, /* We're allocating a large enough stack frame that we need to emit probes. Either emit them inline or in a loop depending on the size. */ - HOST_WIDE_INT probe_interval = get_probe_interval (); if (size <= 4 * probe_interval) { HOST_WIDE_INT i; @@ -7228,12 +7239,19 @@ ix86_adjust_stack_and_probe_stack_clash (HOST_WIDE_INT size, } /* We need to allocate space for the residual, but we do not need - to probe the residual. */ + to probe the residual... */ HOST_WIDE_INT residual = (i - probe_interval - size); if (residual) - pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, - GEN_INT (residual), -1, - m->fs.cfa_reg == stack_pointer_rtx); + { + pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (residual), -1, + m->fs.cfa_reg == stack_pointer_rtx); + + /* ...except if there is a protection area to maintain. */ + if (protection_area) + emit_stack_probe (stack_pointer_rtx); + } + dump_stack_clash_frame_info (PROBE_INLINE, residual != 0); } else @@ -7296,186 +7314,27 @@ ix86_adjust_stack_and_probe_stack_clash (HOST_WIDE_INT size, is equal to ROUNDED_SIZE. */ if (size != rounded_size) - pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, - GEN_INT (rounded_size - size), -1, - m->fs.cfa_reg == stack_pointer_rtx); - dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size); - - /* This does not deallocate the space reserved for the scratch - register. 
That will be deallocated in the epilogue. */ - release_scratch_register_on_entry (&sr, size, false); - } - - /* Make sure nothing is scheduled before we are done. */ - emit_insn (gen_blockage ()); -} - -/* Emit code to adjust the stack pointer by SIZE bytes while probing it. - - INT_REGISTERS_SAVED is true if integer registers have already been - pushed on the stack. */ - -static void -ix86_adjust_stack_and_probe (HOST_WIDE_INT size, - const bool int_registers_saved) -{ - /* We skip the probe for the first interval + a small dope of 4 words and - probe that many bytes past the specified size to maintain a protection - area at the botton of the stack. */ - const int dope = 4 * UNITS_PER_WORD; - rtx size_rtx = GEN_INT (size), last; - - /* See if we have a constant small number of probes to generate. If so, - that's the easy case. The run-time loop is made up of 9 insns in the - generic case while the compile-time loop is made up of 3+2*(n-1) insns - for n # of intervals. */ - if (size <= 4 * get_probe_interval ()) - { - HOST_WIDE_INT i, adjust; - bool first_probe = true; - - /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for - values of N from 1 until it exceeds SIZE. If only one probe is - needed, this will not generate any code. Then adjust and probe - to PROBE_INTERVAL + SIZE. */ - for (i = get_probe_interval (); i < size; i += get_probe_interval ()) - { - if (first_probe) - { - adjust = 2 * get_probe_interval () + dope; - first_probe = false; - } - else - adjust = get_probe_interval (); - - emit_insn (gen_rtx_SET (stack_pointer_rtx, - plus_constant (Pmode, stack_pointer_rtx, - -adjust))); - emit_stack_probe (stack_pointer_rtx); - } - - if (first_probe) - adjust = size + get_probe_interval () + dope; - else - adjust = size + get_probe_interval () - i; - - emit_insn (gen_rtx_SET (stack_pointer_rtx, - plus_constant (Pmode, stack_pointer_rtx, - -adjust))); - emit_stack_probe (stack_pointer_rtx); - - /* Adjust back to account for the additional first interval. */ - last = emit_insn (gen_rtx_SET (stack_pointer_rtx, - plus_constant (Pmode, stack_pointer_rtx, - (get_probe_interval () - + dope)))); - } - - /* Otherwise, do the same as above, but in a loop. Note that we must be - extra careful with variables wrapping around because we might be at - the very top (or the very bottom) of the address space and we have - to be able to handle this case properly; in particular, we use an - equality test for the loop condition. */ - else - { - /* We expect the GP registers to be saved when probes are used - as the probing sequences might need a scratch register and - the routine to allocate one assumes the integer registers - have already been saved. */ - gcc_assert (int_registers_saved); - - HOST_WIDE_INT rounded_size; - struct scratch_reg sr; - - get_scratch_register_on_entry (&sr); - - /* If we needed to save a register, then account for any space - that was pushed (we are not going to pop the register when - we do the restore). */ - if (sr.saved) - size -= UNITS_PER_WORD; - - /* Step 1: round SIZE to the previous multiple of the interval. */ - - rounded_size = ROUND_DOWN (size, get_probe_interval ()); - - - /* Step 2: compute initial and final value of the loop counter. */ - - /* SP = SP_0 + PROBE_INTERVAL. */ - emit_insn (gen_rtx_SET (stack_pointer_rtx, - plus_constant (Pmode, stack_pointer_rtx, - - (get_probe_interval () + dope)))); - - /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. 
*/ - if (rounded_size <= (HOST_WIDE_INT_1 << 31)) - emit_insn (gen_rtx_SET (sr.reg, - plus_constant (Pmode, stack_pointer_rtx, - -rounded_size))); - else { - emit_move_insn (sr.reg, GEN_INT (-rounded_size)); - emit_insn (gen_rtx_SET (sr.reg, - gen_rtx_PLUS (Pmode, sr.reg, - stack_pointer_rtx))); - } - - - /* Step 3: the loop - - do - { - SP = SP + PROBE_INTERVAL - probe at SP - } - while (SP != LAST_ADDR) - - adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for - values of N from 1 until it is equal to ROUNDED_SIZE. */ - - emit_insn (gen_adjust_stack_and_probe (Pmode, sr.reg, sr.reg, size_rtx)); - - - /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot - assert at compile-time that SIZE is equal to ROUNDED_SIZE. */ + pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (rounded_size - size), -1, + m->fs.cfa_reg == stack_pointer_rtx); - if (size != rounded_size) - { - emit_insn (gen_rtx_SET (stack_pointer_rtx, - plus_constant (Pmode, stack_pointer_rtx, - rounded_size - size))); - emit_stack_probe (stack_pointer_rtx); + if (protection_area) + emit_stack_probe (stack_pointer_rtx); } - /* Adjust back to account for the additional first interval. */ - last = emit_insn (gen_rtx_SET (stack_pointer_rtx, - plus_constant (Pmode, stack_pointer_rtx, - (get_probe_interval () - + dope)))); + dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size); /* This does not deallocate the space reserved for the scratch register. That will be deallocated in the epilogue. */ release_scratch_register_on_entry (&sr, size, false); } - /* Even if the stack pointer isn't the CFA register, we need to correctly - describe the adjustments made to it, in particular differentiate the - frame-related ones from the frame-unrelated ones. */ - if (size > 0) - { - rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2)); - XVECEXP (expr, 0, 0) - = gen_rtx_SET (stack_pointer_rtx, - plus_constant (Pmode, stack_pointer_rtx, -size)); - XVECEXP (expr, 0, 1) - = gen_rtx_SET (stack_pointer_rtx, - plus_constant (Pmode, stack_pointer_rtx, - get_probe_interval () + dope + size)); - add_reg_note (last, REG_FRAME_RELATED_EXPR, expr); - RTX_FRAME_RELATED_P (last) = 1; - - cfun->machine->fs.sp_offset += size; - } + /* Adjust back to account for the protection area. */ + if (protection_area) + pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (probe_interval + dope), -1, + m->fs.cfa_reg == stack_pointer_rtx); /* Make sure nothing is scheduled before we are done. */ emit_insn (gen_blockage ()); @@ -7527,18 +7386,20 @@ static void ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size, const bool int_registers_saved) { + const HOST_WIDE_INT probe_interval = get_probe_interval (); + /* See if we have a constant small number of probes to generate. If so, that's the easy case. The run-time loop is made up of 6 insns in the generic case while the compile-time loop is made up of n insns for n # of intervals. */ - if (size <= 6 * get_probe_interval ()) + if (size <= 6 * probe_interval) { HOST_WIDE_INT i; /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until it exceeds SIZE. If only one probe is needed, this will not generate any code. Then probe at FIRST + SIZE. 
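A small example of the kind of frame the probing paths above have to handle, assuming -O2 with either -fstack-clash-protection or -fstack-check; the function name is illustrative:

/* The frame is larger than the probe interval, so the prologue cannot
   simply drop the stack pointer past the guard page: it has to touch the
   stack at every interval while allocating (and, for -fstack-check, keep
   the protection area below the final stack pointer).  */
void
large_frame (void)
{
  volatile char buf[64 * 1024];
  buf[0] = 1;
}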
*/ - for (i = get_probe_interval (); i < size; i += get_probe_interval ()) + for (i = probe_interval; i < size; i += probe_interval) emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx, -(first + i))); @@ -7567,7 +7428,7 @@ ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size, /* Step 1: round SIZE to the previous multiple of the interval. */ - rounded_size = ROUND_DOWN (size, get_probe_interval ()); + rounded_size = ROUND_DOWN (size, probe_interval); /* Step 2: compute initial and final value of the loop counter. */ @@ -8324,27 +8185,33 @@ ix86_expand_prologue (void) sse_registers_saved = true; } + /* If stack clash protection is requested, then probe the stack. */ + if (allocate >= 0 && flag_stack_clash_protection) + { + ix86_adjust_stack_and_probe (allocate, int_registers_saved, false); + allocate = 0; + } + /* The stack has already been decremented by the instruction calling us so probe if the size is non-negative to preserve the protection area. */ - if (allocate >= 0 - && (flag_stack_check == STATIC_BUILTIN_STACK_CHECK - || flag_stack_clash_protection)) + else if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK) { - if (flag_stack_clash_protection) - { - ix86_adjust_stack_and_probe_stack_clash (allocate, - int_registers_saved); - allocate = 0; - } - else if (STACK_CHECK_MOVING_SP) + const HOST_WIDE_INT probe_interval = get_probe_interval (); + + if (STACK_CHECK_MOVING_SP) { - if (!(crtl->is_leaf && !cfun->calls_alloca - && allocate <= get_probe_interval ())) + if (crtl->is_leaf + && !cfun->calls_alloca + && allocate <= probe_interval) + ; + + else { - ix86_adjust_stack_and_probe (allocate, int_registers_saved); + ix86_adjust_stack_and_probe (allocate, int_registers_saved, true); allocate = 0; } } + else { HOST_WIDE_INT size = allocate; @@ -8356,7 +8223,7 @@ ix86_expand_prologue (void) { if (crtl->is_leaf && !cfun->calls_alloca) { - if (size > get_probe_interval ()) + if (size > probe_interval) ix86_emit_probe_stack_range (0, size, int_registers_saved); } else @@ -8368,7 +8235,7 @@ ix86_expand_prologue (void) { if (crtl->is_leaf && !cfun->calls_alloca) { - if (size > get_probe_interval () + if (size > probe_interval && size > get_stack_check_protect ()) ix86_emit_probe_stack_range (get_stack_check_protect (), (size @@ -10191,6 +10058,9 @@ ix86_legitimate_constant_p (machine_mode mode, rtx x) break; CASE_CONST_SCALAR_INT: + if (ix86_endbr_immediate_operand (x, VOIDmode)) + return false; + switch (mode) { case E_TImode: @@ -10584,6 +10454,9 @@ ix86_legitimate_address_p (machine_mode, rtx addr, bool strict) /* Validate displacement. */ if (disp) { + if (ix86_endbr_immediate_operand (disp, VOIDmode)) + return false; + if (GET_CODE (disp) == CONST && GET_CODE (XEXP (disp, 0)) == UNSPEC && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET) @@ -12544,7 +12417,6 @@ print_reg (rtx x, int code, FILE *file) M -- print addr32 prefix for TARGET_X32 with VSIB address. ! -- print NOTRACK prefix for jxx/call/ret instructions if required. N -- print maskz if it's constant 0 operand. - I -- print comparision predicate operand for sse cmp condition. 
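The ix86_endbr_immediate_operand checks added above reject constants whose byte pattern spells an ENDBR instruction; a hedged illustration (the value below is the little-endian encoding of endbr64, f3 0f 1e fa):

/* With -fcf-protection=branch the backend now refuses to emit this value
   as an instruction immediate or address displacement, so the endbr64
   byte sequence cannot be planted inside another instruction and later
   reused as an indirect-branch target.  */
unsigned int
endbr64_pattern (void)
{
  return 0xfa1e0ff3u;
}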
*/ void @@ -12774,40 +12646,6 @@ ix86_print_operand (FILE *file, rtx x, int code) } return; - case 'I': - if (ASSEMBLER_DIALECT == ASM_ATT) - putc ('$', file); - switch (GET_CODE (x)) - { - case EQ: - putc ('0', file); - break; - case NE: - putc ('4', file); - break; - case GE: - case GEU: - putc ('5', file); - break; - case GT: - case GTU: - putc ('6', file); - break; - case LE: - case LEU: - putc ('2', file); - break; - case LT: - case LTU: - putc ('1', file); - break; - default: - output_operand_lossage ("operand is not a condition code, " - "invalid operand code 'I'"); - return; - } - return; - case 'Y': switch (GET_CODE (x)) { @@ -16651,7 +16489,11 @@ iamcu_alignment (tree type, int align) /* Intel MCU psABI specifies scalar types > 4 bytes aligned to 4 bytes. */ - mode = TYPE_MODE (strip_array_types (type)); + type = strip_array_types (type); + if (TYPE_ATOMIC (type)) + return align; + + mode = TYPE_MODE (type); switch (GET_MODE_CLASS (mode)) { case MODE_INT: @@ -16768,6 +16610,16 @@ ix86_data_alignment (tree type, unsigned int align, bool opt) return align; } +/* Implememnt TARGET_LOWER_LOCAL_DECL_ALIGNMENT. */ +static void +ix86_lower_local_decl_alignment (tree decl) +{ + unsigned int new_align = ix86_local_alignment (decl, VOIDmode, + DECL_ALIGN (decl), true); + if (new_align < DECL_ALIGN (decl)) + SET_DECL_ALIGN (decl, new_align); +} + /* Compute the alignment for a local variable or a stack slot. EXP is the data type or decl itself, MODE is the widest mode available and ALIGN is the alignment that the object would ordinarily have. The @@ -16776,7 +16628,7 @@ ix86_data_alignment (tree type, unsigned int align, bool opt) unsigned int ix86_local_alignment (tree exp, machine_mode mode, - unsigned int align) + unsigned int align, bool may_lower) { tree type, decl; @@ -16793,11 +16645,13 @@ ix86_local_alignment (tree exp, machine_mode mode, /* Don't do dynamic stack realignment for long long objects with -mpreferred-stack-boundary=2. */ - if (!TARGET_64BIT + if (may_lower + && !TARGET_64BIT && align == 64 && ix86_preferred_stack_boundary < 64 && (mode == DImode || (type && TYPE_MODE (type) == DImode)) - && (!type || !TYPE_USER_ALIGN (type)) + && (!type || (!TYPE_USER_ALIGN (type) + && !TYPE_ATOMIC (strip_array_types (type)))) && (!decl || !DECL_USER_ALIGN (decl))) align = 32; @@ -16910,7 +16764,8 @@ ix86_minimum_alignment (tree exp, machine_mode mode, /* Don't do dynamic stack realignment for long long objects with -mpreferred-stack-boundary=2. */ if ((mode == DImode || (type && TYPE_MODE (type) == DImode)) - && (!type || !TYPE_USER_ALIGN (type)) + && (!type || (!TYPE_USER_ALIGN (type) + && !TYPE_ATOMIC (strip_array_types (type)))) && (!decl || !DECL_USER_ALIGN (decl))) { gcc_checking_assert (!TARGET_STV); @@ -18531,13 +18386,15 @@ ix86_preferred_reload_class (rtx x, reg_class_t regclass) return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS; /* QImode constants are easy to load, but non-constant QImode data - must go into Q_REGS. */ + must go into Q_REGS or ALL_MASK_REGS. */ if (GET_MODE (x) == QImode && !CONSTANT_P (x)) { if (Q_CLASS_P (regclass)) return regclass; else if (reg_class_subset_p (Q_REGS, regclass)) return Q_REGS; + else if (MASK_CLASS_P (regclass)) + return regclass; else return NO_REGS; } @@ -18893,6 +18750,29 @@ inline_memory_move_cost (machine_mode mode, enum reg_class regclass, int in) return in ? 
ix86_cost->hard_register.sse_load [index] : ix86_cost->hard_register.sse_store [index]; } + if (MASK_CLASS_P (regclass)) + { + int index; + switch (GET_MODE_SIZE (mode)) + { + case 1: + index = 0; + break; + case 2: + index = 1; + break; + /* DImode loads and stores assumed to cost the same as SImode. */ + default: + index = 2; + break; + } + + if (in == 2) + return MAX (ix86_cost->hard_register.mask_load[index], + ix86_cost->hard_register.mask_store[index]); + return in ? ix86_cost->hard_register.mask_load[2] + : ix86_cost->hard_register.mask_store[2]; + } if (MMX_CLASS_P (regclass)) { int index; @@ -19018,6 +18898,17 @@ ix86_register_move_cost (machine_mode mode, reg_class_t class1_i, ? ix86_cost->hard_register.sse_to_integer : ix86_cost->hard_register.integer_to_sse); + /* Moves between mask register and GPR. */ + if (MASK_CLASS_P (class1) != MASK_CLASS_P (class2)) + { + return (MASK_CLASS_P (class1) + ? ix86_cost->hard_register.mask_to_integer + : ix86_cost->hard_register.integer_to_mask); + } + /* Moving between mask registers. */ + if (MASK_CLASS_P (class1) && MASK_CLASS_P (class2)) + return ix86_cost->hard_register.mask_move; + if (MAYBE_FLOAT_CLASS_P (class1)) return ix86_cost->hard_register.fp_move; if (MAYBE_SSE_CLASS_P (class1)) @@ -19090,7 +18981,7 @@ ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode) if ((mode == P2QImode || mode == P2HImode)) return MASK_PAIR_REGNO_P(regno); - return (VALID_MASK_REG_MODE (mode) + return ((TARGET_AVX512F && VALID_MASK_REG_MODE (mode)) || (TARGET_AVX512BW && VALID_MASK_AVX512BW_MODE (mode))); } @@ -20410,11 +20301,30 @@ x86_field_alignment (tree type, int computed) return computed; if (TARGET_IAMCU) return iamcu_alignment (type, computed); - mode = TYPE_MODE (strip_array_types (type)); + type = strip_array_types (type); + mode = TYPE_MODE (type); if (mode == DFmode || mode == DCmode || GET_MODE_CLASS (mode) == MODE_INT || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT) - return MIN (32, computed); + { + if (TYPE_ATOMIC (type) && computed > 32) + { + static bool warned; + + if (!warned && warn_psabi) + { + const char *url + = CHANGES_ROOT_URL "gcc-11/changes.html#ia32_atomic"; + + warned = true; + inform (input_location, "the alignment of %<_Atomic %T%> " + "fields changed in %{GCC 11.1%}", + TYPE_MAIN_VARIANT (type), url); + } + } + else + return MIN (32, computed); + } return computed; } @@ -23521,6 +23431,9 @@ ix86_run_selftests (void) #undef TARGET_CAN_CHANGE_MODE_CLASS #define TARGET_CAN_CHANGE_MODE_CLASS ix86_can_change_mode_class +#undef TARGET_LOWER_LOCAL_DECL_ALIGNMENT +#define TARGET_LOWER_LOCAL_DECL_ALIGNMENT ix86_lower_local_decl_alignment + #undef TARGET_STATIC_RTX_ALIGNMENT #define TARGET_STATIC_RTX_ALIGNMENT ix86_static_rtx_alignment #undef TARGET_CONSTANT_ALIGNMENT diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index f4a8f13..9a5de6a 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -203,6 +203,12 @@ see the files COPYING3 and COPYING.RUNTIME respectively. 
If not, see #define TARGET_SERIALIZE_P(x) TARGET_ISA2_SERIALIZE_P(x) #define TARGET_TSXLDTRK TARGET_ISA2_TSXLDTRK #define TARGET_TSXLDTRK_P(x) TARGET_ISA2_TSXLDTRK_P(x) +#define TARGET_AMX_TILE TARGET_ISA2_AMX_TILE +#define TARGET_AMX_TILE_P(x) TARGET_ISA2_AMX_TILE(x) +#define TARGET_AMX_INT8 TARGET_ISA2_AMX_INT8 +#define TARGET_AMX_INT8_P(x) TARGET_ISA2_AMX_INT8(x) +#define TARGET_AMX_BF16 TARGET_ISA2_AMX_BF16 +#define TARGET_AMX_BF16_P(x) TARGET_ISA2_AMX_BF16(x) #define TARGET_LP64 TARGET_ABI_64 #define TARGET_LP64_P(x) TARGET_ABI_64_P(x) @@ -279,6 +285,13 @@ struct processor_costs { in SImode, DImode and TImode. */ const int sse_to_integer; /* cost of moving SSE register to integer. */ const int integer_to_sse; /* cost of moving integer register to SSE. */ + const int mask_to_integer; /* cost of moving mask register to integer. */ + const int integer_to_mask; /* cost of moving integer register to mask. */ + const int mask_load[3]; /* cost of loading mask registers + in QImode, HImode and SImode. */ + const int mask_store[3]; /* cost of storing mask register + in QImode, HImode and SImode. */ + const int mask_move; /* cost of moving mask register. */ } hard_register; const int add; /* cost of an add instruction */ @@ -598,8 +611,7 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; ix86_tune_features[X86_TUNE_AVOID_FALSE_DEP_FOR_BMI] #define TARGET_ONE_IF_CONV_INSN \ ix86_tune_features[X86_TUNE_ONE_IF_CONV_INSN] -#define TARGET_USE_XCHG_FOR_ATOMIC_STORE \ - ix86_tune_features[X86_TUNE_USE_XCHG_FOR_ATOMIC_STORE] +#define TARGET_AVOID_MFENCE ix86_tune_features[X86_TUNE_AVOID_MFENCE] #define TARGET_EMIT_VZEROUPPER \ ix86_tune_features[X86_TUNE_EMIT_VZEROUPPER] #define TARGET_EXPAND_ABS \ @@ -1412,6 +1424,7 @@ enum reg_class FLOAT_INT_SSE_REGS, MASK_REGS, ALL_MASK_REGS, + INT_MASK_REGS, ALL_REGS, LIM_REG_CLASSES }; @@ -1471,6 +1484,7 @@ enum reg_class "FLOAT_INT_SSE_REGS", \ "MASK_REGS", \ "ALL_MASK_REGS", \ + "INT_MASK_REGS", \ "ALL_REGS" } /* Define which registers fit in which classes. This is an initializer @@ -1509,6 +1523,7 @@ enum reg_class { 0xff9ffff, 0xfffffff0, 0xf }, /* FLOAT_INT_SSE_REGS */ \ { 0x0, 0x0, 0xfe0 }, /* MASK_REGS */ \ { 0x0, 0x0, 0xff0 }, /* ALL_MASK_REGS */ \ + { 0x900ff, 0xff0, 0xff0 }, /* INT_MASK_REGS */ \ { 0xffffffff, 0xffffffff, 0xfff } /* ALL_REGS */ \ } @@ -2418,7 +2433,7 @@ const wide_int_bitmask PTA_AVX512F (HOST_WIDE_INT_1U << 40); const wide_int_bitmask PTA_AVX512ER (HOST_WIDE_INT_1U << 41); const wide_int_bitmask PTA_AVX512PF (HOST_WIDE_INT_1U << 42); const wide_int_bitmask PTA_AVX512CD (HOST_WIDE_INT_1U << 43); -/* Hole after PTA_MPX was removed. 
*/ +const wide_int_bitmask PTA_NO_TUNE (HOST_WIDE_INT_1U << 44); const wide_int_bitmask PTA_SHA (HOST_WIDE_INT_1U << 45); const wide_int_bitmask PTA_PREFETCHWT1 (HOST_WIDE_INT_1U << 46); const wide_int_bitmask PTA_CLFLUSHOPT (HOST_WIDE_INT_1U << 47); @@ -2457,7 +2472,19 @@ const wide_int_bitmask PTA_ENQCMD (0, HOST_WIDE_INT_1U << 15); const wide_int_bitmask PTA_CLDEMOTE (0, HOST_WIDE_INT_1U << 16); const wide_int_bitmask PTA_SERIALIZE (0, HOST_WIDE_INT_1U << 17); const wide_int_bitmask PTA_TSXLDTRK (0, HOST_WIDE_INT_1U << 18); - +const wide_int_bitmask PTA_AMX_TILE(0, HOST_WIDE_INT_1U << 19); +const wide_int_bitmask PTA_AMX_INT8(0, HOST_WIDE_INT_1U << 20); +const wide_int_bitmask PTA_AMX_BF16(0, HOST_WIDE_INT_1U << 21); + +const wide_int_bitmask PTA_X86_64_BASELINE = PTA_64BIT | PTA_MMX | PTA_SSE + | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR; +const wide_int_bitmask PTA_X86_64_V2 = (PTA_X86_64_BASELINE & (~PTA_NO_SAHF)) + | PTA_CX16 | PTA_POPCNT | PTA_SSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_SSSE3; +const wide_int_bitmask PTA_X86_64_V3 = PTA_X86_64_V2 + | PTA_AVX | PTA_AVX2 | PTA_BMI | PTA_BMI2 | PTA_F16C | PTA_FMA | PTA_LZCNT + | PTA_MOVBE | PTA_XSAVE; +const wide_int_bitmask PTA_X86_64_V4 = PTA_X86_64_V3 + | PTA_AVX512F | PTA_AVX512BW | PTA_AVX512CD | PTA_AVX512DQ | PTA_AVX512VL; const wide_int_bitmask PTA_CORE2 = PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 | PTA_CX16 | PTA_FXSR; const wide_int_bitmask PTA_NEHALEM = PTA_CORE2 | PTA_SSE4_1 | PTA_SSE4_2 @@ -2490,7 +2517,8 @@ const wide_int_bitmask PTA_TIGERLAKE = PTA_ICELAKE_CLIENT | PTA_MOVDIRI | PTA_MOVDIR64B | PTA_CLWB | PTA_AVX512VP2INTERSECT; const wide_int_bitmask PTA_SAPPHIRERAPIDS = PTA_COOPERLAKE | PTA_MOVDIRI | PTA_MOVDIR64B | PTA_AVX512VP2INTERSECT | PTA_ENQCMD | PTA_CLDEMOTE - | PTA_PTWRITE | PTA_WAITPKG | PTA_SERIALIZE | PTA_TSXLDTRK; + | PTA_PTWRITE | PTA_WAITPKG | PTA_SERIALIZE | PTA_TSXLDTRK | PTA_AMX_TILE + | PTA_AMX_INT8 | PTA_AMX_BF16; const wide_int_bitmask PTA_ALDERLAKE = PTA_SKYLAKE | PTA_CLDEMOTE | PTA_PTWRITE | PTA_WAITPKG | PTA_SERIALIZE; const wide_int_bitmask PTA_KNL = PTA_BROADWELL | PTA_AVX512PF | PTA_AVX512ER @@ -2946,9 +2974,9 @@ extern void debug_dispatch_window (int); /* The value at zero is only defined for the BMI instructions LZCNT and TZCNT, not the BSR/BSF insns in the original isa. */ #define CTZ_DEFINED_VALUE_AT_ZERO(MODE, VALUE) \ - ((VALUE) = GET_MODE_BITSIZE (MODE), TARGET_BMI ? 1 : 0) + ((VALUE) = GET_MODE_BITSIZE (MODE), TARGET_BMI ? 2 : 0) #define CLZ_DEFINED_VALUE_AT_ZERO(MODE, VALUE) \ - ((VALUE) = GET_MODE_BITSIZE (MODE), TARGET_LZCNT ? 1 : 0) + ((VALUE) = GET_MODE_BITSIZE (MODE), TARGET_LZCNT ? 2 : 0) /* Flags returned by ix86_get_callcvt (). 
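The PTA_X86_64_BASELINE/V2/V3/V4 masks above back the x86-64 psABI micro-architecture levels (-march=x86-64, x86-64-v2, x86-64-v3, x86-64-v4); a rough sketch of what user code can expect from the predefined feature macros per level, not an exhaustive list:

/* x86-64-v2 implies SSE4.2/POPCNT, x86-64-v3 additionally AVX2/FMA/BMI2
   among others, x86-64-v4 additionally the AVX-512 F/BW/CD/DQ/VL group.  */
#if defined (__AVX512F__) && defined (__AVX512VL__)
/* at least x86-64-v4 (or an equivalent explicit -m set) */
#elif defined (__AVX2__) && defined (__FMA__) && defined (__BMI2__)
/* at least x86-64-v3 */
#elif defined (__SSE4_2__) && defined (__POPCNT__)
/* at least x86-64-v2 */
#endif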
*/ diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index d0ecd9e..9dd12cf 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -186,6 +186,10 @@ ;; IRET support UNSPEC_INTERRUPT_RETURN + + ;; For MOVDIRI and MOVDIR64B support + UNSPEC_MOVDIRI + UNSPEC_MOVDIR64B ]) (define_c_enum "unspecv" [ @@ -280,10 +284,6 @@ UNSPECV_SETSSBSY UNSPECV_CLRSSBSY - ;; For MOVDIRI and MOVDIR64B support - UNSPECV_MOVDIRI - UNSPECV_MOVDIR64B - ;; For TSXLDTRK support UNSPECV_XSUSLDTRK UNSPECV_XRESLDTRK @@ -2403,8 +2403,8 @@ (symbol_ref "true")))]) (define_insn "*movhi_internal" - [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r ,r ,m ,k,k ,r,m,k") - (match_operand:HI 1 "general_operand" "r ,rn,rm,rn,r,km,k,k,CBC"))] + [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r ,r ,m ,*k,*k ,*r,*m,*k") + (match_operand:HI 1 "general_operand" "r ,rn,rm,rn,*r,*km,*k,*k,CBC"))] "!(MEM_P (operands[0]) && MEM_P (operands[1]))" { switch (get_attr_type (insn)) @@ -2491,9 +2491,9 @@ (define_insn "*movqi_internal" [(set (match_operand:QI 0 "nonimmediate_operand" - "=Q,R,r,q,q,r,r ,?r,m ,k,k,r,m,k,k,k") + "=Q,R,r,q,q,r,r ,?r,m ,*k,*k,*r,*m,*k,*k,*k") (match_operand:QI 1 "general_operand" - "Q ,R,r,n,m,q,rn, m,qn,r,k,k,k,m,C,BC"))] + "Q ,R,r,n,m,q,rn, m,qn,*r,*k,*k,*k,*m,C,BC"))] "!(MEM_P (operands[0]) && MEM_P (operands[1]))" { char buf[128]; @@ -2624,6 +2624,19 @@ ] (const_string "QI")))]) +/* Reload dislikes loading 0/-1 directly into mask registers. + Try to tidy things up here. */ +(define_peephole2 + [(set (match_operand:SWI 0 "general_reg_operand") + (match_operand:SWI 1 "immediate_operand")) + (set (match_operand:SWI 2 "mask_reg_operand") + (match_dup 0))] + "peep2_reg_dead_p (2, operands[0]) + && (const0_operand (operands[1], <MODE>mode) + || (constm1_operand (operands[1], <MODE>mode) + && (<MODE_SIZE> > 1 || TARGET_AVX512DQ)))" + [(set (match_dup 2) (match_dup 1))]) + ;; Stores and loads of ax to arbitrary constant address. 
;; We fake an second form of instruction to force reload to load address ;; into register when rax is not available @@ -9044,19 +9057,21 @@ }) (define_insn "*anddi_1" - [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r,r") + [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r,r,k") (and:DI - (match_operand:DI 1 "nonimmediate_operand" "%0,0,0,qm") - (match_operand:DI 2 "x86_64_szext_general_operand" "Z,re,m,L"))) + (match_operand:DI 1 "nonimmediate_operand" "%0,0,0,qm,k") + (match_operand:DI 2 "x86_64_szext_general_operand" "Z,re,m,L,k"))) (clobber (reg:CC FLAGS_REG))] "TARGET_64BIT && ix86_binary_operator_ok (AND, DImode, operands)" "@ and{l}\t{%k2, %k0|%k0, %k2} and{q}\t{%2, %0|%0, %2} and{q}\t{%2, %0|%0, %2} + # #" - [(set_attr "type" "alu,alu,alu,imovx") - (set_attr "length_immediate" "*,*,*,0") + [(set_attr "isa" "x64,x64,x64,x64,avx512bw") + (set_attr "type" "alu,alu,alu,imovx,msklog") + (set_attr "length_immediate" "*,*,*,0,*") (set (attr "prefix_rex") (if_then_else (and (eq_attr "type" "imovx") @@ -9064,7 +9079,7 @@ (match_operand 1 "ext_QIreg_operand"))) (const_string "1") (const_string "*"))) - (set_attr "mode" "SI,DI,DI,SI")]) + (set_attr "mode" "SI,DI,DI,SI,DI")]) (define_insn_and_split "*anddi_1_btr" [(set (match_operand:DI 0 "nonimmediate_operand" "=rm") @@ -9130,17 +9145,25 @@ (set_attr "mode" "SI")]) (define_insn "*and<mode>_1" - [(set (match_operand:SWI24 0 "nonimmediate_operand" "=rm,r,Ya") - (and:SWI24 (match_operand:SWI24 1 "nonimmediate_operand" "%0,0,qm") - (match_operand:SWI24 2 "<general_operand>" "r<i>,m,L"))) + [(set (match_operand:SWI24 0 "nonimmediate_operand" "=rm,r,Ya,k") + (and:SWI24 (match_operand:SWI24 1 "nonimmediate_operand" "%0,0,qm,k") + (match_operand:SWI24 2 "<general_operand>" "r<i>,m,L,k"))) (clobber (reg:CC FLAGS_REG))] "ix86_binary_operator_ok (AND, <MODE>mode, operands)" "@ and{<imodesuffix>}\t{%2, %0|%0, %2} and{<imodesuffix>}\t{%2, %0|%0, %2} + # #" - [(set_attr "type" "alu,alu,imovx") - (set_attr "length_immediate" "*,*,0") + [(set (attr "isa") + (cond [(eq_attr "alternative" "3") + (if_then_else (eq_attr "mode" "SI") + (const_string "avx512bw") + (const_string "avx512f")) + ] + (const_string "*"))) + (set_attr "type" "alu,alu,imovx,msklog") + (set_attr "length_immediate" "*,*,0,*") (set (attr "prefix_rex") (if_then_else (and (eq_attr "type" "imovx") @@ -9148,20 +9171,28 @@ (match_operand 1 "ext_QIreg_operand"))) (const_string "1") (const_string "*"))) - (set_attr "mode" "<MODE>,<MODE>,SI")]) + (set_attr "mode" "<MODE>,<MODE>,SI,<MODE>")]) (define_insn "*andqi_1" - [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r") - (and:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0") - (match_operand:QI 2 "general_operand" "qn,m,rn"))) + [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,k") + (and:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0,k") + (match_operand:QI 2 "general_operand" "qn,m,rn,k"))) (clobber (reg:CC FLAGS_REG))] "ix86_binary_operator_ok (AND, QImode, operands)" "@ and{b}\t{%2, %0|%0, %2} and{b}\t{%2, %0|%0, %2} - and{l}\t{%k2, %k0|%k0, %k2}" - [(set_attr "type" "alu") - (set_attr "mode" "QI,QI,SI") + and{l}\t{%k2, %k0|%k0, %k2} + #" + [(set_attr "type" "alu,alu,alu,msklog") + (set (attr "mode") + (cond [(eq_attr "alternative" "2") + (const_string "SI") + (and (eq_attr "alternative" "3") + (match_test "!TARGET_AVX512DQ")) + (const_string "HI") + ] + (const_string "QI"))) ;; Potential partial reg stall on alternative 2. 
(set (attr "preferred_for_speed") (cond [(eq_attr "alternative" "2") @@ -9539,28 +9570,42 @@ }) (define_insn "*andn<mode>_1" - [(set (match_operand:SWI48 0 "register_operand" "=r,r") + [(set (match_operand:SWI48 0 "register_operand" "=r,r,k") (and:SWI48 - (not:SWI48 (match_operand:SWI48 1 "register_operand" "r,r")) - (match_operand:SWI48 2 "nonimmediate_operand" "r,m"))) + (not:SWI48 (match_operand:SWI48 1 "register_operand" "r,r,k")) + (match_operand:SWI48 2 "nonimmediate_operand" "r,m,k"))) (clobber (reg:CC FLAGS_REG))] - "TARGET_BMI" - "andn\t{%2, %1, %0|%0, %1, %2}" - [(set_attr "type" "bitmanip") - (set_attr "btver2_decode" "direct, double") + "TARGET_BMI || TARGET_AVX512BW" + "@ + andn\t{%2, %1, %0|%0, %1, %2} + andn\t{%2, %1, %0|%0, %1, %2} + #" + [(set_attr "isa" "bmi,bmi,avx512bw") + (set_attr "type" "bitmanip,bitmanip,msklog") + (set_attr "btver2_decode" "direct, double,*") (set_attr "mode" "<MODE>")]) (define_insn "*andn<mode>_1" - [(set (match_operand:SWI12 0 "register_operand" "=r") + [(set (match_operand:SWI12 0 "register_operand" "=r,k") (and:SWI12 - (not:SWI12 (match_operand:SWI12 1 "register_operand" "r")) - (match_operand:SWI12 2 "register_operand" "r"))) + (not:SWI12 (match_operand:SWI12 1 "register_operand" "r,k")) + (match_operand:SWI12 2 "register_operand" "r,k"))) (clobber (reg:CC FLAGS_REG))] - "TARGET_BMI" - "andn\t{%k2, %k1, %k0|%k0, %k1, %k2}" - [(set_attr "type" "bitmanip") - (set_attr "btver2_decode" "direct") - (set_attr "mode" "SI")]) + "TARGET_BMI || TARGET_AVX512BW" + "@ + andn\t{%k2, %k1, %k0|%k0, %k1, %k2} + #" + [(set_attr "isa" "bmi,avx512f") + (set_attr "type" "bitmanip,msklog") + (set_attr "btver2_decode" "direct,*") + (set (attr "mode") + (cond [(eq_attr "alternative" "0") + (const_string "SI") + (and (eq_attr "alternative" "1") + (match_test "!TARGET_AVX512DQ")) + (const_string "HI") + ] + (const_string "<MODE>")))]) (define_insn "*andn_<mode>_ccno" [(set (reg FLAGS_REG) @@ -9631,14 +9676,24 @@ }) (define_insn "*<code><mode>_1" - [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,r") + [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,r,k") (any_or:SWI248 - (match_operand:SWI248 1 "nonimmediate_operand" "%0,0") - (match_operand:SWI248 2 "<general_operand>" "r<i>,m"))) + (match_operand:SWI248 1 "nonimmediate_operand" "%0,0,k") + (match_operand:SWI248 2 "<general_operand>" "r<i>,m,k"))) (clobber (reg:CC FLAGS_REG))] "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)" - "<logic>{<imodesuffix>}\t{%2, %0|%0, %2}" - [(set_attr "type" "alu") + "@ + <logic>{<imodesuffix>}\t{%2, %0|%0, %2} + <logic>{<imodesuffix>}\t{%2, %0|%0, %2} + #" + [(set (attr "isa") + (cond [(eq_attr "alternative" "2") + (if_then_else (eq_attr "mode" "SI,DI") + (const_string "avx512bw") + (const_string "avx512f")) + ] + (const_string "*"))) + (set_attr "type" "alu, alu, msklog") (set_attr "mode" "<MODE>")]) (define_insn_and_split "*iordi_1_bts" @@ -9711,17 +9766,26 @@ (set_attr "mode" "SI")]) (define_insn "*<code>qi_1" - [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r") - (any_or:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0") - (match_operand:QI 2 "general_operand" "qn,m,rn"))) + [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,k") + (any_or:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0,k") + (match_operand:QI 2 "general_operand" "qn,m,rn,k"))) (clobber (reg:CC FLAGS_REG))] "ix86_binary_operator_ok (<CODE>, QImode, operands)" "@ <logic>{b}\t{%2, %0|%0, %2} <logic>{b}\t{%2, %0|%0, %2} - <logic>{l}\t{%k2, %k0|%k0, %k2}" - 
[(set_attr "type" "alu") - (set_attr "mode" "QI,QI,SI") + <logic>{l}\t{%k2, %k0|%k0, %k2} + #" + [(set_attr "isa" "*,*,*,avx512f") + (set_attr "type" "alu,alu,alu,msklog") + (set (attr "mode") + (cond [(eq_attr "alternative" "2") + (const_string "SI") + (and (eq_attr "alternative" "3") + (match_test "!TARGET_AVX512DQ")) + (const_string "HI") + ] + (const_string "QI"))) ;; Potential partial reg stall on alternative 2. (set (attr "preferred_for_speed") (cond [(eq_attr "alternative" "2") @@ -10370,31 +10434,52 @@ "split_double_mode (DImode, &operands[0], 2, &operands[0], &operands[2]);") (define_insn "*one_cmpl<mode>2_1" - [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm") - (not:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "0")))] + [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,k") + (not:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "0,k")))] "ix86_unary_operator_ok (NOT, <MODE>mode, operands)" - "not{<imodesuffix>}\t%0" - [(set_attr "type" "negnot") + "@ + not{<imodesuffix>}\t%0 + #" + [(set (attr "isa") + (cond [(eq_attr "alternative" "2") + (if_then_else (eq_attr "mode" "SI,DI") + (const_string "avx512bw") + (const_string "avx512f")) + ] + (const_string "*"))) + (set_attr "type" "negnot,msklog") (set_attr "mode" "<MODE>")]) (define_insn "*one_cmplsi2_1_zext" - [(set (match_operand:DI 0 "register_operand" "=r") + [(set (match_operand:DI 0 "register_operand" "=r,k") (zero_extend:DI - (not:SI (match_operand:SI 1 "register_operand" "0"))))] + (not:SI (match_operand:SI 1 "register_operand" "0,k"))))] "TARGET_64BIT && ix86_unary_operator_ok (NOT, SImode, operands)" - "not{l}\t%k0" - [(set_attr "type" "negnot") - (set_attr "mode" "SI")]) + "@ + not{l}\t%k0 + #" + [(set_attr "isa" "x64,avx512bw") + (set_attr "type" "negnot,msklog") + (set_attr "mode" "SI,SI")]) (define_insn "*one_cmplqi2_1" - [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r") - (not:QI (match_operand:QI 1 "nonimmediate_operand" "0,0")))] + [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r,k") + (not:QI (match_operand:QI 1 "nonimmediate_operand" "0,0,k")))] "ix86_unary_operator_ok (NOT, QImode, operands)" "@ not{b}\t%0 - not{l}\t%k0" - [(set_attr "type" "negnot") - (set_attr "mode" "QI,SI") + not{l}\t%k0 + #" + [(set_attr "isa" "*,*,avx512f") + (set_attr "type" "negnot,negnot,msklog") + (set (attr "mode") + (cond [(eq_attr "alternative" "1") + (const_string "SI") + (and (eq_attr "alternative" "2") + (match_test "!TARGET_AVX512DQ")) + (const_string "HI") + ] + (const_string "QI"))) ;; Potential partial reg stall on alternative 1. (set (attr "preferred_for_speed") (cond [(eq_attr "alternative" "1") @@ -14524,28 +14609,7 @@ (set_attr "mode" "<MODE>")]) ;; TBM instructions. 
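The extra "k"-register ("msklog") alternatives added to the scalar AND/ANDN/OR/XOR/NOT patterns above let mask values stay in k0-k7; a hedged intrinsics sketch, assuming -mavx512bw, with illustrative names:

#include <immintrin.h>

/* The mask AND/ANDN here is a candidate for kandq/kandnq instead of
   bouncing the masks through general registers.  */
__mmask64
nonzero_in_a_only (__m512i a, __m512i b)
{
  __mmask64 in_a = _mm512_test_epi8_mask (a, a);
  __mmask64 in_b = _mm512_test_epi8_mask (b, b);
  return in_a & ~in_b;
}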
-(define_expand "tbm_bextri_<mode>" - [(parallel - [(set (match_operand:SWI48 0 "register_operand") - (zero_extract:SWI48 - (match_operand:SWI48 1 "nonimmediate_operand") - (match_operand 2 "const_0_to_255_operand" "N") - (match_operand 3 "const_0_to_255_operand" "N"))) - (clobber (reg:CC FLAGS_REG))])] - "TARGET_TBM" -{ - if (operands[2] == const0_rtx - || INTVAL (operands[3]) >= <MODE_SIZE> * BITS_PER_UNIT) - { - emit_move_insn (operands[0], const0_rtx); - DONE; - } - if (INTVAL (operands[2]) + INTVAL (operands[3]) - > <MODE_SIZE> * BITS_PER_UNIT) - operands[2] = GEN_INT (<MODE_SIZE> * BITS_PER_UNIT - INTVAL (operands[3])); -}) - -(define_insn "*tbm_bextri_<mode>" +(define_insn "@tbm_bextri_<mode>" [(set (match_operand:SWI48 0 "register_operand" "=r") (zero_extract:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "rm") @@ -15047,7 +15111,7 @@ ;; Replace zero_extend:HI followed by parityhi2_cmp with parityqi2_cmp (define_peephole2 [(set (match_operand:HI 0 "register_operand") - (zero_extend:HI (match_operand:QI 1 "register_operand"))) + (zero_extend:HI (match_operand:QI 1 "general_reg_operand"))) (parallel [(set (reg:CC FLAGS_REG) (unspec:CC [(match_dup 0)] UNSPEC_PARITY)) (clobber (match_dup 0))])] @@ -15058,7 +15122,7 @@ ;; Eliminate QImode popcount&1 using parity flag (define_peephole2 [(set (match_operand:SI 0 "register_operand") - (zero_extend:SI (match_operand:QI 1 "register_operand"))) + (zero_extend:SI (match_operand:QI 1 "general_reg_operand"))) (parallel [(set (match_operand:SI 2 "register_operand") (popcount:SI (match_dup 0))) (clobber (reg:CC FLAGS_REG))]) @@ -15369,6 +15433,16 @@ (clobber (reg:CC FLAGS_REG))])]) ;; Load and add the thread base pointer from %<tp_seg>:0. +(define_expand "get_thread_pointer<mode>" + [(set (match_operand:PTR 0 "register_operand") + (unspec:PTR [(const_int 0)] UNSPEC_TP))] + "" +{ + /* targetm is not visible in the scope of the condition. */ + if (!targetm.have_tls) + error ("%<__builtin_thread_pointer%> is not supported on this target"); +}) + (define_insn_and_split "*load_tp_<mode>" [(set (match_operand:PTR 0 "register_operand" "=r") (unspec:PTR [(const_int 0)] UNSPEC_TP))] @@ -18057,7 +18131,11 @@ if (addr2 != XEXP (operands[2], 0)) operands[2] = replace_equiv_address_nv (operands[2], addr2); - countreg = ix86_zero_extend_to_Pmode (operands[3]); + /* NB: Make a copy of the data length to avoid changing the original + data length by cmpstrnqi patterns. */ + rtx count = ix86_zero_extend_to_Pmode (operands[3]); + countreg = gen_reg_rtx (Pmode); + emit_move_insn (countreg, count); /* %%% Iff we are testing strict equality, we can use known alignment to good advantage. 
This may be possible with combine, particularly @@ -18805,45 +18883,68 @@ ;; min/max patterns -(define_mode_iterator MAXMIN_IMODE - [(SI "TARGET_SSE4_1") (DI "TARGET_AVX512VL")]) (define_code_attr maxmin_rel [(smax "GE") (smin "LE") (umax "GEU") (umin "LEU")]) (define_expand "<code><mode>3" [(parallel - [(set (match_operand:MAXMIN_IMODE 0 "register_operand") - (maxmin:MAXMIN_IMODE - (match_operand:MAXMIN_IMODE 1 "register_operand") - (match_operand:MAXMIN_IMODE 2 "nonimmediate_operand"))) + [(set (match_operand:SWI248 0 "register_operand") + (maxmin:SWI248 + (match_operand:SWI248 1 "register_operand") + (match_operand:SWI248 2 "general_operand"))) (clobber (reg:CC FLAGS_REG))])] - "TARGET_STV") + "TARGET_CMOVE") (define_insn_and_split "*<code><mode>3_1" - [(set (match_operand:MAXMIN_IMODE 0 "register_operand") - (maxmin:MAXMIN_IMODE - (match_operand:MAXMIN_IMODE 1 "register_operand") - (match_operand:MAXMIN_IMODE 2 "nonimmediate_operand"))) + [(set (match_operand:SWI248 0 "register_operand") + (maxmin:SWI248 + (match_operand:SWI248 1 "register_operand") + (match_operand:SWI248 2 "general_operand"))) (clobber (reg:CC FLAGS_REG))] - "(TARGET_64BIT || <MODE>mode != DImode) && TARGET_STV + "TARGET_CMOVE && ix86_pre_reload_split ()" "#" "&& 1" [(set (match_dup 0) - (if_then_else:MAXMIN_IMODE (match_dup 3) + (if_then_else:SWI248 (match_dup 3) (match_dup 1) (match_dup 2)))] { machine_mode mode = <MODE>mode; + rtx cmp_op = operands[2]; - if (!register_operand (operands[2], mode)) - operands[2] = force_reg (mode, operands[2]); + if (!register_operand (cmp_op, mode)) + operands[2] = force_reg (mode, cmp_op); enum rtx_code code = <maxmin_rel>; - machine_mode cmpmode = SELECT_CC_MODE (code, operands[1], operands[2]); + + if (cmp_op == const1_rtx) + { + /* Convert smax (x, 1) into (x > 0 ? x : 1). + Convert umax (x, 1) into (x != 0 ? x : 1). + Convert ?min (x, 1) into (x <= 0 ? x : 1). */ + cmp_op = const0_rtx; + if (code == GE) + code = GT; + else if (code == GEU) + code = NE; + } + /* Convert smin (x, -1) into (x < 0 ? x : -1). */ + else if (cmp_op == constm1_rtx && code == LE) + { + cmp_op = const0_rtx; + code = LT; + } + /* Convert smax (x, -1) into (x >= 0 ? x : -1). */ + else if (cmp_op == constm1_rtx && code == GE) + cmp_op = const0_rtx; + else if (cmp_op != const0_rtx) + cmp_op = operands[2]; + + machine_mode cmpmode = SELECT_CC_MODE (code, operands[1], cmp_op); rtx flags = gen_rtx_REG (cmpmode, FLAGS_REG); - rtx tmp = gen_rtx_COMPARE (cmpmode, operands[1], operands[2]); + rtx tmp = gen_rtx_COMPARE (cmpmode, operands[1], cmp_op); emit_insn (gen_rtx_SET (flags, tmp)); operands[3] = gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx); @@ -18852,9 +18953,9 @@ (define_insn_and_split "*<code>di3_doubleword" [(set (match_operand:DI 0 "register_operand") (maxmin:DI (match_operand:DI 1 "register_operand") - (match_operand:DI 2 "nonimmediate_operand"))) + (match_operand:DI 2 "general_operand"))) (clobber (reg:CC FLAGS_REG))] - "!TARGET_64BIT && TARGET_STV && TARGET_AVX512VL + "!TARGET_64BIT && TARGET_CMOVE && ix86_pre_reload_split ()" "#" "&& 1" @@ -18906,6 +19007,29 @@ gcc_unreachable (); } }) + +;; Avoid clearing a register between a flags setting comparison and its use, +;; i.e. prefer "xorl %eax,%eax; test/cmp" over "test/cmp; movl $0, %eax". 
+(define_peephole2 + [(set (reg FLAGS_REG) (match_operand 0)) + (set (match_operand:SWI 1 "general_reg_operand") (const_int 0))] + "peep2_regno_dead_p (0, FLAGS_REG) + && !reg_overlap_mentioned_p (operands[1], operands[0])" + [(set (match_dup 2) (match_dup 0))] +{ + operands[2] = gen_rtx_REG (GET_MODE (operands[0]), FLAGS_REG); + ix86_expand_clear (operands[1]); +}) + +;; Reload dislikes loading constants directly into class_likely_spilled +;; hard registers. Try to tidy things up here. +(define_peephole2 + [(set (match_operand:SWI 0 "general_reg_operand") + (match_operand:SWI 1 "x86_64_general_operand")) + (set (match_operand:SWI 2 "general_reg_operand") + (match_dup 0))] + "peep2_reg_dead_p (2, operands[0])" + [(set (match_dup 2) (match_dup 1))]) ;; Misc patterns (?) @@ -19083,17 +19207,17 @@ "" { rtx stack_slot; - if ((flag_cf_protection & CF_RETURN)) + + if (flag_cf_protection & CF_RETURN) { - /* Copy shadow stack pointer to the first slot and stack ppointer - to the second slot. */ + /* Copy shadow stack pointer to the first slot + and stack pointer to the second slot. */ rtx ssp_slot = adjust_address (operands[0], word_mode, 0); stack_slot = adjust_address (operands[0], Pmode, UNITS_PER_WORD); - rtx ssp = gen_reg_rtx (word_mode); - emit_insn ((word_mode == SImode) - ? gen_rdsspsi (ssp) - : gen_rdsspdi (ssp)); - emit_move_insn (ssp_slot, ssp); + + rtx reg_ssp = force_reg (word_mode, const0_rtx); + emit_insn (gen_rdssp (word_mode, reg_ssp, reg_ssp)); + emit_move_insn (ssp_slot, reg_ssp); } else stack_slot = adjust_address (operands[0], Pmode, 0); @@ -19107,103 +19231,65 @@ "" { rtx stack_slot; - if ((flag_cf_protection & CF_RETURN)) + + if (flag_cf_protection & CF_RETURN) { - /* Restore shadow stack pointer from the first slot and stack - pointer from the second slot. */ + /* Restore shadow stack pointer from the first slot + and stack pointer from the second slot. */ rtx ssp_slot = adjust_address (operands[1], word_mode, 0); stack_slot = adjust_address (operands[1], Pmode, UNITS_PER_WORD); - rtx flags, jump, noadj_label, inc_label, loop_label; - rtx reg_adj, reg_ssp, tmp, clob; - /* Get the current shadow stack pointer. The code below will check if SHSTK feature is enabled. If it is not enabled the RDSSP instruction is a NOP. */ - reg_ssp = gen_reg_rtx (word_mode); - emit_insn (gen_rtx_SET (reg_ssp, const0_rtx)); - emit_insn ((word_mode == SImode) - ? gen_rdsspsi (reg_ssp) - : gen_rdsspdi (reg_ssp)); - - /* Compare through substraction the saved and the current ssp to decide - if ssp has to be adjusted. */ - tmp = gen_rtx_SET (reg_ssp, gen_rtx_MINUS (word_mode, reg_ssp, - ssp_slot)); - clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); - tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob)); - emit_insn (tmp); + rtx reg_ssp = force_reg (word_mode, const0_rtx); + emit_insn (gen_rdssp (word_mode, reg_ssp, reg_ssp)); + + /* Compare through subtraction the saved and the current ssp + to decide if ssp has to be adjusted. */ + reg_ssp = expand_simple_binop (word_mode, MINUS, + reg_ssp, ssp_slot, + reg_ssp, 1, OPTAB_DIRECT); /* Compare and jump over adjustment code. */ - noadj_label = gen_label_rtx (); - flags = gen_rtx_REG (CCZmode, FLAGS_REG); - tmp = gen_rtx_EQ (VOIDmode, flags, const0_rtx); - tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, - gen_rtx_LABEL_REF (VOIDmode, noadj_label), - pc_rtx); - jump = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); - JUMP_LABEL (jump) = noadj_label; - - /* Compute the numebr of frames to adjust. 
*/ - reg_adj = gen_lowpart (ptr_mode, reg_ssp); - tmp = gen_rtx_SET (reg_adj, - gen_rtx_LSHIFTRT (ptr_mode, - negate_rtx (ptr_mode, reg_adj), - GEN_INT ((word_mode == SImode) - ? 2 - : 3))); - clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); - tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob)); - emit_insn (tmp); + rtx noadj_label = gen_label_rtx (); + emit_cmp_and_jump_insns (reg_ssp, const0_rtx, EQ, NULL_RTX, + word_mode, 1, noadj_label); - /* Check if number of frames <= 255 so no loop is needed. */ - tmp = gen_rtx_COMPARE (CCmode, reg_adj, GEN_INT (255)); - flags = gen_rtx_REG (CCmode, FLAGS_REG); - emit_insn (gen_rtx_SET (flags, tmp)); + /* Compute the number of frames to adjust. */ + rtx reg_adj = gen_lowpart (ptr_mode, reg_ssp); + rtx reg_adj_neg = expand_simple_unop (ptr_mode, NEG, reg_adj, + NULL_RTX, 1); - inc_label = gen_label_rtx (); - tmp = gen_rtx_LEU (VOIDmode, flags, const0_rtx); - tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, - gen_rtx_LABEL_REF (VOIDmode, inc_label), - pc_rtx); - jump = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); - JUMP_LABEL (jump) = inc_label; + reg_adj = expand_simple_binop (ptr_mode, LSHIFTRT, reg_adj_neg, + GEN_INT (exact_log2 (UNITS_PER_WORD)), + reg_adj, 1, OPTAB_DIRECT); - rtx reg_255 = gen_reg_rtx (word_mode); - emit_move_insn (reg_255, GEN_INT (255)); + /* Check if number of frames <= 255 so no loop is needed. */ + rtx inc_label = gen_label_rtx (); + emit_cmp_and_jump_insns (reg_adj, GEN_INT (255), LEU, NULL_RTX, + ptr_mode, 1, inc_label); /* Adjust the ssp in a loop. */ - loop_label = gen_label_rtx (); + rtx loop_label = gen_label_rtx (); emit_label (loop_label); LABEL_NUSES (loop_label) = 1; - emit_insn ((word_mode == SImode) - ? gen_incsspsi (reg_255) - : gen_incsspdi (reg_255)); - tmp = gen_rtx_SET (reg_adj, gen_rtx_MINUS (ptr_mode, - reg_adj, - GEN_INT (255))); - clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); - tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob)); - emit_insn (tmp); - - tmp = gen_rtx_COMPARE (CCmode, reg_adj, GEN_INT (255)); - flags = gen_rtx_REG (CCmode, FLAGS_REG); - emit_insn (gen_rtx_SET (flags, tmp)); - - /* Jump to the loop label. */ - tmp = gen_rtx_GTU (VOIDmode, flags, const0_rtx); - tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, - gen_rtx_LABEL_REF (VOIDmode, loop_label), - pc_rtx); - jump = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); - JUMP_LABEL (jump) = loop_label; + rtx reg_255 = force_reg (word_mode, GEN_INT (255)); + emit_insn (gen_incssp (word_mode, reg_255)); + + reg_adj = expand_simple_binop (ptr_mode, MINUS, + reg_adj, GEN_INT (255), + reg_adj, 1, OPTAB_DIRECT); + + /* Compare and jump to the loop label. */ + emit_cmp_and_jump_insns (reg_adj, GEN_INT (255), GTU, NULL_RTX, + ptr_mode, 1, loop_label); emit_label (inc_label); LABEL_NUSES (inc_label) = 1; - emit_insn ((word_mode == SImode) - ? 
gen_incsspsi (reg_ssp) - : gen_incsspdi (reg_ssp)); + + emit_insn (gen_incssp (word_mode, reg_ssp)); emit_label (noadj_label); LABEL_NUSES (noadj_label) = 1; @@ -21052,12 +21138,7 @@ ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(define_expand "lwp_llwpcb" - [(unspec_volatile [(match_operand 0 "register_operand")] - UNSPECV_LLWP_INTRINSIC)] - "TARGET_LWP") - -(define_insn "*lwp_llwpcb<mode>_1" +(define_insn "@lwp_llwpcb<mode>" [(unspec_volatile [(match_operand:P 0 "register_operand" "r")] UNSPECV_LLWP_INTRINSIC)] "TARGET_LWP" @@ -21066,13 +21147,7 @@ (set_attr "mode" "<MODE>") (set_attr "length" "5")]) -(define_expand "lwp_slwpcb" - [(set (match_operand 0 "register_operand") - (unspec_volatile [(const_int 0)] UNSPECV_SLWP_INTRINSIC))] - "TARGET_LWP" - "emit_insn (gen_lwp_slwpcb_1 (Pmode, operands[0])); DONE;") - -(define_insn "@lwp_slwpcb<mode>_1" +(define_insn "@lwp_slwpcb<mode>" [(set (match_operand:P 0 "register_operand" "=r") (unspec_volatile:P [(const_int 0)] UNSPECV_SLWP_INTRINSIC))] "TARGET_LWP" @@ -21081,16 +21156,7 @@ (set_attr "mode" "<MODE>") (set_attr "length" "5")]) -(define_expand "lwp_lwpval<mode>3" - [(unspec_volatile [(match_operand:SWI48 1 "register_operand") - (match_operand:SI 2 "nonimmediate_operand") - (match_operand:SI 3 "const_int_operand")] - UNSPECV_LWPVAL_INTRINSIC)] - "TARGET_LWP" - ;; Avoid unused variable warning. - "(void) operands[0];") - -(define_insn "*lwp_lwpval<mode>3_1" +(define_insn "@lwp_lwpval<mode>" [(unspec_volatile [(match_operand:SWI48 0 "register_operand" "r") (match_operand:SI 1 "nonimmediate_operand" "rm") (match_operand:SI 2 "const_int_operand" "i")] @@ -21102,17 +21168,7 @@ (set (attr "length") (symbol_ref "ix86_attr_length_address_default (insn) + 9"))]) -(define_expand "lwp_lwpins<mode>3" - [(set (reg:CCC FLAGS_REG) - (unspec_volatile:CCC [(match_operand:SWI48 1 "register_operand") - (match_operand:SI 2 "nonimmediate_operand") - (match_operand:SI 3 "const_int_operand")] - UNSPECV_LWPINS_INTRINSIC)) - (set (match_operand:QI 0 "nonimmediate_operand") - (eq:QI (reg:CCC FLAGS_REG) (const_int 0)))] - "TARGET_LWP") - -(define_insn "*lwp_lwpins<mode>3_1" +(define_insn "@lwp_lwpins<mode>" [(set (reg:CCC FLAGS_REG) (unspec_volatile:CCC [(match_operand:SWI48 0 "register_operand" "r") (match_operand:SI 1 "nonimmediate_operand" "rm") @@ -21163,7 +21219,7 @@ [(set_attr "type" "other") (set_attr "prefix_extra" "2")]) -(define_insn "rdrand<mode>_1" +(define_insn "@rdrand<mode>" [(set (match_operand:SWI248 0 "register_operand" "=r") (unspec_volatile:SWI248 [(const_int 0)] UNSPECV_RDRAND)) (set (reg:CCC FLAGS_REG) @@ -21173,7 +21229,7 @@ [(set_attr "type" "other") (set_attr "prefix_extra" "1")]) -(define_insn "rdseed<mode>_1" +(define_insn "@rdseed<mode>" [(set (match_operand:SWI248 0 "register_operand" "=r") (unspec_volatile:SWI248 [(const_int 0)] UNSPECV_RDSEED)) (set (reg:CCC FLAGS_REG) @@ -21203,16 +21259,17 @@ (set_attr "memory" "unknown")]) ;; CET instructions -(define_insn "rdssp<mode>" - [(set (match_operand:SWI48x 0 "register_operand" "=r") - (unspec_volatile:SWI48x [(const_int 0)] UNSPECV_NOP_RDSSP))] +(define_insn "@rdssp<mode>" + [(set (match_operand:SWI48 0 "register_operand" "=r") + (unspec_volatile:SWI48 [(match_operand:SWI48 1 "register_operand" "0")] + UNSPECV_NOP_RDSSP))] "TARGET_SHSTK || (flag_cf_protection & CF_RETURN)" - "xor{l}\t%k0, %k0\n\trdssp<mskmodesuffix>\t%0" + "rdssp<mskmodesuffix>\t%0" [(set_attr "length" "6") (set_attr "type" "other")]) -(define_insn "incssp<mode>" - [(unspec_volatile 
[(match_operand:SWI48x 0 "register_operand" "r")] +(define_insn "@incssp<mode>" + [(unspec_volatile [(match_operand:SWI48 0 "register_operand" "r")] UNSPECV_INCSSP)] "TARGET_SHSTK || (flag_cf_protection & CF_RETURN)" "incssp<mskmodesuffix>\t%0" @@ -21226,31 +21283,26 @@ [(set_attr "length" "5") (set_attr "type" "other")]) -(define_expand "rstorssp" - [(unspec_volatile [(match_operand 0 "memory_operand")] - UNSPECV_RSTORSSP)] - "TARGET_SHSTK") - -(define_insn "*rstorssp<mode>" - [(unspec_volatile [(match_operand:P 0 "memory_operand" "m")] +(define_insn "rstorssp" + [(unspec_volatile [(match_operand:DI 0 "memory_operand" "m")] UNSPECV_RSTORSSP)] "TARGET_SHSTK" "rstorssp\t%0" [(set_attr "length" "5") (set_attr "type" "other")]) -(define_insn "wrss<mode>" - [(unspec_volatile [(match_operand:SWI48x 0 "register_operand" "r") - (match_operand:SWI48x 1 "memory_operand" "m")] +(define_insn "@wrss<mode>" + [(unspec_volatile [(match_operand:SWI48 0 "register_operand" "r") + (match_operand:SWI48 1 "memory_operand" "m")] UNSPECV_WRSS)] "TARGET_SHSTK" "wrss<mskmodesuffix>\t%0, %1" [(set_attr "length" "3") (set_attr "type" "other")]) -(define_insn "wruss<mode>" - [(unspec_volatile [(match_operand:SWI48x 0 "register_operand" "r") - (match_operand:SWI48x 1 "memory_operand" "m")] +(define_insn "@wruss<mode>" + [(unspec_volatile [(match_operand:SWI48 0 "register_operand" "r") + (match_operand:SWI48 1 "memory_operand" "m")] UNSPECV_WRUSS)] "TARGET_SHSTK" "wruss<mskmodesuffix>\t%0, %1" @@ -21264,13 +21316,8 @@ [(set_attr "length" "4") (set_attr "type" "other")]) -(define_expand "clrssbsy" - [(unspec_volatile [(match_operand 0 "memory_operand")] - UNSPECV_CLRSSBSY)] - "TARGET_SHSTK") - -(define_insn "*clrssbsy<mode>" - [(unspec_volatile [(match_operand:P 0 "memory_operand" "m")] +(define_insn "clrssbsy" + [(unspec_volatile [(match_operand:DI 0 "memory_operand" "m")] UNSPECV_CLRSSBSY)] "TARGET_SHSTK" "clrssbsy\t%0" @@ -21484,17 +21531,17 @@ ;; MOVDIRI and MOVDIR64B (define_insn "movdiri<mode>" - [(unspec_volatile:SWI48 [(match_operand:SWI48 0 "memory_operand" "m") - (match_operand:SWI48 1 "register_operand" "r")] - UNSPECV_MOVDIRI)] + [(set (match_operand:SWI48 0 "memory_operand" "=m") + (unspec:SWI48 [(match_operand:SWI48 1 "register_operand" "r")] + UNSPEC_MOVDIRI))] "TARGET_MOVDIRI" "movdiri\t{%1, %0|%0, %1}" [(set_attr "type" "other")]) (define_insn "@movdir64b_<mode>" - [(unspec_volatile:XI [(match_operand:P 0 "register_operand" "r") - (match_operand:XI 1 "memory_operand")] - UNSPECV_MOVDIR64B)] + [(set (mem:XI (match_operand:P 0 "register_operand" "r")) + (unspec:XI [(match_operand:XI 1 "memory_operand" "m")] + UNSPEC_MOVDIR64B))] "TARGET_MOVDIR64B" "movdir64b\t{%1, %0|%0, %1}" [(set_attr "type" "other")]) diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt index c9f7195..9389dc2 100644 --- a/gcc/config/i386/i386.opt +++ b/gcc/config/i386/i386.opt @@ -1114,4 +1114,16 @@ Support SERIALIZE built-in functions and code generation. mtsxldtrk Target Report Mask(ISA2_TSXLDTRK) Var(ix86_isa_flags2) Save -Support TSXLDTRK built-in functions and code generation.
\ No newline at end of file +Support TSXLDTRK built-in functions and code generation. + +mamx-tile +Target Report Mask(ISA2_AMX_TILE) Var(ix86_isa_flags2) Save +Support AMX-TILE built-in functions and code generation. + +mamx-int8 +Target Report Mask(ISA2_AMX_INT8) Var(ix86_isa_flags2) Save +Support AMX-INT8 built-in functions and code generation. + +mamx-bf16 +Target Report Mask(ISA2_AMX_BF16) Var(ix86_isa_flags2) Save +Support AMX-BF16 built-in functions and code generation. diff --git a/gcc/config/i386/ia32intrin.h b/gcc/config/i386/ia32intrin.h index fd29797..3568d1f 100644 --- a/gcc/config/i386/ia32intrin.h +++ b/gcc/config/i386/ia32intrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#ifndef _X86INTRIN_H_INCLUDED -# error "Never use <ia32intrin.h> directly; include <x86intrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <ia32intrin.h> directly; include <x86gprintrin.h> instead." #endif /* 32bit bsf */ diff --git a/gcc/config/i386/immintrin.h b/gcc/config/i386/immintrin.h index b660d0d..71eae83 100644 --- a/gcc/config/i386/immintrin.h +++ b/gcc/config/i386/immintrin.h @@ -24,6 +24,8 @@ #ifndef _IMMINTRIN_H_INCLUDED #define _IMMINTRIN_H_INCLUDED +#include <x86gprintrin.h> + #include <mmintrin.h> #include <xmmintrin.h> @@ -38,16 +40,6 @@ #include <wmmintrin.h> -#include <fxsrintrin.h> - -#include <xsaveintrin.h> - -#include <xsaveoptintrin.h> - -#include <xsavesintrin.h> - -#include <xsavecintrin.h> - #include <avxintrin.h> #include <avx2intrin.h> @@ -102,217 +94,28 @@ #include <shaintrin.h> -#include <lzcntintrin.h> - -#include <bmiintrin.h> - -#include <bmi2intrin.h> - #include <fmaintrin.h> #include <f16cintrin.h> #include <rtmintrin.h> -#include <xtestintrin.h> - -#include <cetintrin.h> - #include <gfniintrin.h> #include <vaesintrin.h> #include <vpclmulqdqintrin.h> -#include <movdirintrin.h> - -#include <sgxintrin.h> - -#include <pconfigintrin.h> - -#include <waitpkgintrin.h> - -#include <cldemoteintrin.h> - #include <avx512bf16vlintrin.h> #include <avx512bf16intrin.h> -#include <enqcmdintrin.h> +#include <amxtileintrin.h> -#include <serializeintrin.h> +#include <amxint8intrin.h> -#include <tsxldtrkintrin.h> - -#include <rdseedintrin.h> +#include <amxbf16intrin.h> #include <prfchwintrin.h> -#include <adxintrin.h> - -#include <clwbintrin.h> - -#include <clflushoptintrin.h> - -#include <wbnoinvdintrin.h> - -#include <pkuintrin.h> - -extern __inline void -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_wbinvd (void) -{ - __builtin_ia32_wbinvd (); -} - -#ifndef __RDRND__ -#pragma GCC push_options -#pragma GCC target("rdrnd") -#define __DISABLE_RDRND__ -#endif /* __RDRND__ */ -extern __inline int -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_rdrand16_step (unsigned short *__P) -{ - return __builtin_ia32_rdrand16_step (__P); -} - -extern __inline int -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_rdrand32_step (unsigned int *__P) -{ - return __builtin_ia32_rdrand32_step (__P); -} -#ifdef __DISABLE_RDRND__ -#undef __DISABLE_RDRND__ -#pragma GCC pop_options -#endif /* __DISABLE_RDRND__ */ - -#ifndef __RDPID__ -#pragma GCC push_options -#pragma GCC target("rdpid") -#define __DISABLE_RDPID__ -#endif /* __RDPID__ */ -extern __inline unsigned int -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_rdpid_u32 (void) -{ - return __builtin_ia32_rdpid (); -} -#ifdef __DISABLE_RDPID__ -#undef __DISABLE_RDPID__ 
-#pragma GCC pop_options -#endif /* __DISABLE_RDPID__ */ - -#ifdef __x86_64__ - -#ifndef __FSGSBASE__ -#pragma GCC push_options -#pragma GCC target("fsgsbase") -#define __DISABLE_FSGSBASE__ -#endif /* __FSGSBASE__ */ -extern __inline unsigned int -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_readfsbase_u32 (void) -{ - return __builtin_ia32_rdfsbase32 (); -} - -extern __inline unsigned long long -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_readfsbase_u64 (void) -{ - return __builtin_ia32_rdfsbase64 (); -} - -extern __inline unsigned int -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_readgsbase_u32 (void) -{ - return __builtin_ia32_rdgsbase32 (); -} - -extern __inline unsigned long long -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_readgsbase_u64 (void) -{ - return __builtin_ia32_rdgsbase64 (); -} - -extern __inline void -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_writefsbase_u32 (unsigned int __B) -{ - __builtin_ia32_wrfsbase32 (__B); -} - -extern __inline void -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_writefsbase_u64 (unsigned long long __B) -{ - __builtin_ia32_wrfsbase64 (__B); -} - -extern __inline void -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_writegsbase_u32 (unsigned int __B) -{ - __builtin_ia32_wrgsbase32 (__B); -} - -extern __inline void -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_writegsbase_u64 (unsigned long long __B) -{ - __builtin_ia32_wrgsbase64 (__B); -} -#ifdef __DISABLE_FSGSBASE__ -#undef __DISABLE_FSGSBASE__ -#pragma GCC pop_options -#endif /* __DISABLE_FSGSBASE__ */ - -#ifndef __RDRND__ -#pragma GCC push_options -#pragma GCC target("rdrnd") -#define __DISABLE_RDRND__ -#endif /* __RDRND__ */ -extern __inline int -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_rdrand64_step (unsigned long long *__P) -{ - return __builtin_ia32_rdrand64_step (__P); -} -#ifdef __DISABLE_RDRND__ -#undef __DISABLE_RDRND__ -#pragma GCC pop_options -#endif /* __DISABLE_RDRND__ */ - -#endif /* __x86_64__ */ - -#ifndef __PTWRITE__ -#pragma GCC push_options -#pragma GCC target("ptwrite") -#define __DISABLE_PTWRITE__ -#endif - -#ifdef __x86_64__ -extern __inline void -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_ptwrite64 (unsigned long long __B) -{ - __builtin_ia32_ptwrite64 (__B); -} -#endif /* __x86_64__ */ - -extern __inline void -__attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_ptwrite32 (unsigned __B) -{ - __builtin_ia32_ptwrite32 (__B); -} -#ifdef __DISABLE_PTWRITE__ -#undef __DISABLE_PTWRITE__ -#pragma GCC pop_options -#endif /* __DISABLE_PTWRITE__ */ - #endif /* _IMMINTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/intelmic-mkoffload.c b/gcc/config/i386/intelmic-mkoffload.c index e108bc0..15b5c3d 100644 --- a/gcc/config/i386/intelmic-mkoffload.c +++ b/gcc/config/i386/intelmic-mkoffload.c @@ -245,8 +245,13 @@ compile_for_target (struct obstack *argv_obstack) static const char * generate_target_descr_file (const char *target_compiler) { - const char *src_filename = make_temp_file ("_target_descr.c"); - const char *obj_filename = make_temp_file ("_target_descr.o"); + char *dump_filename = concat (dumppfx, "_target_descr.c", NULL); + const char *src_filename = save_temps + ? dump_filename + : make_temp_file ("_target_descr.c"); + const char *obj_filename = save_temps + ? 
concat (dumppfx, "_target_descr.o", NULL) + : make_temp_file ("_target_descr.o"); temp_files[num_temps++] = src_filename; temp_files[num_temps++] = obj_filename; FILE *src_file = fopen (src_filename, "w"); @@ -293,6 +298,12 @@ generate_target_descr_file (const char *target_compiler) obstack_ptr_grow (&argv_obstack, "-save-temps"); if (verbose) obstack_ptr_grow (&argv_obstack, "-v"); + obstack_ptr_grow (&argv_obstack, "-dumpdir"); + obstack_ptr_grow (&argv_obstack, ""); + obstack_ptr_grow (&argv_obstack, "-dumpbase"); + obstack_ptr_grow (&argv_obstack, dump_filename); + obstack_ptr_grow (&argv_obstack, "-dumpbase-ext"); + obstack_ptr_grow (&argv_obstack, ".c"); obstack_ptr_grow (&argv_obstack, "-c"); obstack_ptr_grow (&argv_obstack, "-shared"); obstack_ptr_grow (&argv_obstack, "-fPIC"); @@ -309,8 +320,13 @@ generate_target_descr_file (const char *target_compiler) static const char * generate_target_offloadend_file (const char *target_compiler) { - const char *src_filename = make_temp_file ("_target_offloadend.c"); - const char *obj_filename = make_temp_file ("_target_offloadend.o"); + char *dump_filename = concat (dumppfx, "_target_offloadend.c", NULL); + const char *src_filename = save_temps + ? dump_filename + : make_temp_file ("_target_offloadend.c"); + const char *obj_filename = save_temps + ? concat (dumppfx, "_target_offloadend.o", NULL) + : make_temp_file ("_target_offloadend.o"); temp_files[num_temps++] = src_filename; temp_files[num_temps++] = obj_filename; FILE *src_file = fopen (src_filename, "w"); @@ -335,6 +351,12 @@ generate_target_offloadend_file (const char *target_compiler) obstack_ptr_grow (&argv_obstack, "-save-temps"); if (verbose) obstack_ptr_grow (&argv_obstack, "-v"); + obstack_ptr_grow (&argv_obstack, "-dumpdir"); + obstack_ptr_grow (&argv_obstack, ""); + obstack_ptr_grow (&argv_obstack, "-dumpbase"); + obstack_ptr_grow (&argv_obstack, dump_filename); + obstack_ptr_grow (&argv_obstack, "-dumpbase-ext"); + obstack_ptr_grow (&argv_obstack, ".c"); obstack_ptr_grow (&argv_obstack, "-c"); obstack_ptr_grow (&argv_obstack, "-shared"); obstack_ptr_grow (&argv_obstack, "-fPIC"); @@ -350,8 +372,13 @@ generate_target_offloadend_file (const char *target_compiler) static const char * generate_host_descr_file (const char *host_compiler) { - const char *src_filename = make_temp_file ("_host_descr.c"); - const char *obj_filename = make_temp_file ("_host_descr.o"); + char *dump_filename = concat (dumppfx, "_host_descr.c", NULL); + const char *src_filename = save_temps + ? dump_filename + : make_temp_file ("_host_descr.c"); + const char *obj_filename = save_temps + ? 
concat (dumppfx, "_host_descr.o", NULL) + : make_temp_file ("_host_descr.o"); temp_files[num_temps++] = src_filename; temp_files[num_temps++] = obj_filename; FILE *src_file = fopen (src_filename, "w"); @@ -402,6 +429,12 @@ generate_host_descr_file (const char *host_compiler) obstack_ptr_grow (&argv_obstack, "-save-temps"); if (verbose) obstack_ptr_grow (&argv_obstack, "-v"); + obstack_ptr_grow (&argv_obstack, "-dumpdir"); + obstack_ptr_grow (&argv_obstack, ""); + obstack_ptr_grow (&argv_obstack, "-dumpbase"); + obstack_ptr_grow (&argv_obstack, dump_filename); + obstack_ptr_grow (&argv_obstack, "-dumpbase-ext"); + obstack_ptr_grow (&argv_obstack, ".c"); obstack_ptr_grow (&argv_obstack, "-c"); obstack_ptr_grow (&argv_obstack, "-fPIC"); obstack_ptr_grow (&argv_obstack, "-shared"); @@ -443,7 +476,10 @@ prepare_target_image (const char *target_compiler, int argc, char **argv) sprintf (opt1, "-Wl,%s", target_descr_filename); sprintf (opt2, "-Wl,%s", target_offloadend_filename); - const char *target_so_filename = make_temp_file ("_offload_intelmic.so"); + char *dump_filename = concat (dumppfx, ".mkoffload", NULL); + const char *target_so_filename = save_temps + ? concat (dumppfx, "_offload_intelmic.so", NULL) + : make_temp_file ("_offload_intelmic.so"); temp_files[num_temps++] = target_so_filename; struct obstack argv_obstack; obstack_init (&argv_obstack); @@ -457,16 +493,20 @@ prepare_target_image (const char *target_compiler, int argc, char **argv) for (int i = 1; i < argc; i++) { if (!strcmp (argv[i], "-o") && i + 1 != argc) - out_obj_filename = argv[++i]; + ++i; else obstack_ptr_grow (&argv_obstack, argv[i]); } - if (!out_obj_filename) - fatal_error (input_location, "output file not specified"); obstack_ptr_grow (&argv_obstack, opt2); /* NB: Put -fPIC and -shared the last to create shared library. */ obstack_ptr_grow (&argv_obstack, "-fPIC"); obstack_ptr_grow (&argv_obstack, "-shared"); + obstack_ptr_grow (&argv_obstack, "-dumpdir"); + obstack_ptr_grow (&argv_obstack, ""); + obstack_ptr_grow (&argv_obstack, "-dumpbase"); + obstack_ptr_grow (&argv_obstack, dump_filename); + obstack_ptr_grow (&argv_obstack, "-dumpbase-ext"); + obstack_ptr_grow (&argv_obstack, ""); obstack_ptr_grow (&argv_obstack, "-o"); obstack_ptr_grow (&argv_obstack, target_so_filename); compile_for_target (&argv_obstack); @@ -589,8 +629,20 @@ main (int argc, char **argv) save_temps = true; else if (strcmp (argv[i], "-v") == 0) verbose = true; + else if (strcmp (argv[i], "-dumpbase") == 0 + && i + 1 < argc) + dumppfx = argv[++i]; + else if (strcmp (argv[i], "-o") == 0 + && i + 1 < argc) + out_obj_filename = argv[++i]; } + if (!out_obj_filename) + fatal_error (input_location, "output file not specified"); + + if (!dumppfx) + dumppfx = out_obj_filename; + const char *target_so_filename = prepare_target_image (target_compiler, argc, argv); diff --git a/gcc/config/i386/lwpintrin.h b/gcc/config/i386/lwpintrin.h index d7c3acb..0b5c8bb 100644 --- a/gcc/config/i386/lwpintrin.h +++ b/gcc/config/i386/lwpintrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#ifndef _X86INTRIN_H_INCLUDED -# error "Never use <lwpintrin.h> directly; include <x86intrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <lwpintrin.h> directly; include <x86gprintrin.h> instead." 
#endif #ifndef _LWPINTRIN_H_INCLUDED diff --git a/gcc/config/i386/lzcntintrin.h b/gcc/config/i386/lzcntintrin.h index 1863a58..6d00e9f 100644 --- a/gcc/config/i386/lzcntintrin.h +++ b/gcc/config/i386/lzcntintrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#if !defined _X86INTRIN_H_INCLUDED && !defined _IMMINTRIN_H_INCLUDED -# error "Never use <lzcntintrin.h> directly; include <x86intrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <lzcntintrin.h> directly; include <x86gprintrin.h> instead." #endif diff --git a/gcc/config/i386/movdirintrin.h b/gcc/config/i386/movdirintrin.h index e7f374a..b2f8406 100644 --- a/gcc/config/i386/movdirintrin.h +++ b/gcc/config/i386/movdirintrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#if !defined _IMMINTRIN_H_INCLUDED -# error "Never use <movdirintrin.h> directly; include <x86intrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <movdirintrin.h> directly; include <x86gprintrin.h> instead." #endif #ifndef _MOVDIRINTRIN_H_INCLUDED diff --git a/gcc/config/i386/pconfigintrin.h b/gcc/config/i386/pconfigintrin.h index d2a3261..31c493a 100644 --- a/gcc/config/i386/pconfigintrin.h +++ b/gcc/config/i386/pconfigintrin.h @@ -1,5 +1,28 @@ -#ifndef _IMMINTRIN_H_INCLUDED -#error "Never use <pconfigintrin.h> directly; include <immintrin.h> instead." +/* Copyright (C) 2018-2020 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <pconfigintrin.h> directly; include <x86gprintrin.h> instead." #endif #ifndef _PCONFIGINTRIN_H_INCLUDED diff --git a/gcc/config/i386/pkuintrin.h b/gcc/config/i386/pkuintrin.h index 6840914..0d2dd51 100644 --- a/gcc/config/i386/pkuintrin.h +++ b/gcc/config/i386/pkuintrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#if !defined _IMMINTRIN_H_INCLUDED -# error "Never use <pkuintrin.h> directly; include <immintrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <pkuintrin.h> directly; include <x86gprintrin.h> instead." #endif #ifndef _PKUINTRIN_H_INCLUDED diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index 07e69d5..b03f9cd 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -87,6 +87,11 @@ (and (match_code "reg") (match_test "REGNO (op) == FLAGS_REG"))) +;; True if the operand is a MASK register. 
+(define_predicate "mask_reg_operand" + (and (match_code "reg") + (match_test "MASK_REGNO_P (REGNO (op))"))) + ;; Match a DI, SI, HI or QImode nonimmediate_operand. (define_special_predicate "int_nonimmediate_operand" (and (match_operand 0 "nonimmediate_operand") @@ -130,10 +135,35 @@ (define_predicate "symbol_operand" (match_code "symbol_ref")) +;; Return true if VALUE is an ENDBR opcode in immediate field. +(define_predicate "ix86_endbr_immediate_operand" + (match_code "const_int") +{ + if (flag_cf_protection & CF_BRANCH) + { + unsigned HOST_WIDE_INT imm = UINTVAL (op); + unsigned HOST_WIDE_INT val = TARGET_64BIT ? 0xfa1e0ff3 : 0xfb1e0ff3; + + if (imm == val) + return 1; + + /* NB: Encoding is byte based. */ + if (TARGET_64BIT) + for (; imm >= val; imm >>= 8) + if (imm == val) + return 1; + } + + return 0; +}) + ;; Return true if VALUE can be stored in a sign extended immediate field. (define_predicate "x86_64_immediate_operand" (match_code "const_int,symbol_ref,label_ref,const") { + if (ix86_endbr_immediate_operand (op, VOIDmode)) + return false; + if (!TARGET_64BIT) return immediate_operand (op, mode); @@ -260,6 +290,9 @@ (define_predicate "x86_64_zext_immediate_operand" (match_code "const_int,symbol_ref,label_ref,const") { + if (ix86_endbr_immediate_operand (op, VOIDmode)) + return false; + switch (GET_CODE (op)) { case CONST_INT: @@ -374,6 +407,9 @@ (define_predicate "x86_64_dwzext_immediate_operand" (match_code "const_int,const_wide_int") { + if (ix86_endbr_immediate_operand (op, VOIDmode)) + return false; + switch (GET_CODE (op)) { case CONST_INT: diff --git a/gcc/config/i386/rdseedintrin.h b/gcc/config/i386/rdseedintrin.h index efc7cea..168053a 100644 --- a/gcc/config/i386/rdseedintrin.h +++ b/gcc/config/i386/rdseedintrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#if !defined _IMMINTRIN_H_INCLUDED -# error "Never use <rdseedintrin.h> directly; include <immintrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <rdseedintrin.h> directly; include <x86gprintrin.h> instead." #endif #ifndef _RDSEEDINTRIN_H_INCLUDED diff --git a/gcc/config/i386/rtmintrin.h b/gcc/config/i386/rtmintrin.h index 463a989..436e517 100644 --- a/gcc/config/i386/rtmintrin.h +++ b/gcc/config/i386/rtmintrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#ifndef _IMMINTRIN_H_INCLUDED -# error "Never use <rtmintrin.h> directly; include <immintrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <rtmintrin.h> directly; include <x86gprintrin.h> instead." #endif #ifndef _RTMINTRIN_H_INCLUDED diff --git a/gcc/config/i386/serializeintrin.h b/gcc/config/i386/serializeintrin.h index 0c35b9e..95f26d6 100644 --- a/gcc/config/i386/serializeintrin.h +++ b/gcc/config/i386/serializeintrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#if !defined _IMMINTRIN_H_INCLUDED -# error "Never use <serializeintrin.h> directly; include <immintrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <serializeintrin.h> directly; include <x86gprintrin.h> instead." 
#endif #ifndef _SERIALIZE_H_INCLUDED diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index d3ad583..934b60a 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -326,11 +326,9 @@ [V16SF (V8SF "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL") V8DF (V4DF "TARGET_AVX512VL") (V2DF "TARGET_AVX512VL")]) -;; AVX512VL SF/DF plus 128- and 256-bit SF vector modes -(define_mode_iterator VF_AVX512VL_VF1_128_256 - [(V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF - (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX512VL") - (V2DF "TARGET_AVX512VL")]) +;; AVX512ER SF plus 128- and 256-bit SF vector modes +(define_mode_iterator VF1_AVX512ER_128_256 + [(V16SF "TARGET_AVX512ER") (V8SF "TARGET_AVX") V4SF]) (define_mode_iterator VF2_AVX512VL [V8DF (V4DF "TARGET_AVX512VL") (V2DF "TARGET_AVX512VL")]) @@ -1057,11 +1055,15 @@ (define_insn "<avx512>_load<mode>_mask" [(set (match_operand:V48_AVX512VL 0 "register_operand" "=v,v") (vec_merge:V48_AVX512VL - (match_operand:V48_AVX512VL 1 "nonimmediate_operand" "v,m") - (match_operand:V48_AVX512VL 2 "nonimm_or_0_operand" "0C,0C") + (match_operand:V48_AVX512VL 1 "nonimmediate_operand" "vm,vm") + (match_operand:V48_AVX512VL 2 "nonimm_or_0_operand" "0C,v") (match_operand:<avx512fmaskmode> 3 "register_operand" "Yk,Yk")))] "TARGET_AVX512F" { + if (REG_P (operands[2]) + && REGNO (operands[2]) != REGNO (operands[0])) + return "v<sseintprefix>blendm<ssemodesuffix>\t{%1, %2, %0%{%3%}|%0%{%3%}, %2, %1}"; + if (FLOAT_MODE_P (GET_MODE_INNER (<MODE>mode))) { if (misaligned_operand (operands[1], <MODE>mode)) @@ -1079,20 +1081,20 @@ } [(set_attr "type" "ssemov") (set_attr "prefix" "evex") - (set_attr "memory" "none,load") (set_attr "mode" "<sseinsnmode>")]) (define_insn "<avx512>_load<mode>_mask" [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v,v") (vec_merge:VI12_AVX512VL - (match_operand:VI12_AVX512VL 1 "nonimmediate_operand" "v,m") - (match_operand:VI12_AVX512VL 2 "nonimm_or_0_operand" "0C,0C") + (match_operand:VI12_AVX512VL 1 "nonimmediate_operand" "vm,vm") + (match_operand:VI12_AVX512VL 2 "nonimm_or_0_operand" "0C,v") (match_operand:<avx512fmaskmode> 3 "register_operand" "Yk,Yk")))] "TARGET_AVX512BW" - "vmovdqu<ssescalarsize>\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}" + "@ + vmovdqu<ssescalarsize>\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1} + vpblendm<ssemodesuffix>\t{%1, %2, %0%{%3%}|%0%{%3%}, %2, %1}" [(set_attr "type" "ssemov") (set_attr "prefix" "evex") - (set_attr "memory" "none,load") (set_attr "mode" "<sseinsnmode>")]) (define_insn "avx512f_mov<ssescalarmodelower>_mask" @@ -1156,29 +1158,21 @@ (set_attr "memory" "store") (set_attr "mode" "<MODE>")]) -(define_insn "<avx512>_blendm<mode>" +(define_expand "<avx512>_blendm<mode>" [(set (match_operand:V48_AVX512VL 0 "register_operand" "=v") (vec_merge:V48_AVX512VL (match_operand:V48_AVX512VL 2 "nonimmediate_operand" "vm") (match_operand:V48_AVX512VL 1 "register_operand" "v") (match_operand:<avx512fmaskmode> 3 "register_operand" "Yk")))] - "TARGET_AVX512F" - "v<sseintprefix>blendm<ssemodesuffix>\t{%2, %1, %0%{%3%}|%0%{%3%}, %1, %2}" - [(set_attr "type" "ssemov") - (set_attr "prefix" "evex") - (set_attr "mode" "<sseinsnmode>")]) + "TARGET_AVX512F") -(define_insn "<avx512>_blendm<mode>" +(define_expand "<avx512>_blendm<mode>" [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v") (vec_merge:VI12_AVX512VL (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm") (match_operand:VI12_AVX512VL 1 "register_operand" "v") (match_operand:<avx512fmaskmode> 3 "register_operand" "Yk")))] - "TARGET_AVX512BW" - 
"vpblendm<ssemodesuffix>\t{%2, %1, %0%{%3%}|%0%{%3%}, %1, %2}" - [(set_attr "type" "ssemov") - (set_attr "prefix" "evex") - (set_attr "mode" "<sseinsnmode>")]) + "TARGET_AVX512BW") (define_insn "<avx512>_store<mode>_mask" [(set (match_operand:V48_AVX512VL 0 "memory_operand" "=m") @@ -1476,6 +1470,18 @@ ] (const_string "<MODE>")))]) +(define_split + [(set (match_operand:SWI1248_AVX512BW 0 "mask_reg_operand") + (any_logic:SWI1248_AVX512BW + (match_operand:SWI1248_AVX512BW 1 "mask_reg_operand") + (match_operand:SWI1248_AVX512BW 2 "mask_reg_operand"))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_AVX512F && reload_completed" + [(parallel + [(set (match_dup 0) + (any_logic:SWI1248_AVX512BW (match_dup 1) (match_dup 2))) + (unspec [(const_int 0)] UNSPEC_MASKOP)])]) + (define_insn "kandn<mode>" [(set (match_operand:SWI1248_AVX512BW 0 "register_operand" "=k") (and:SWI1248_AVX512BW @@ -1499,6 +1505,21 @@ ] (const_string "<MODE>")))]) +(define_split + [(set (match_operand:SWI1248_AVX512BW 0 "mask_reg_operand") + (and:SWI1248_AVX512BW + (not:SWI1248_AVX512BW + (match_operand:SWI1248_AVX512BW 1 "mask_reg_operand")) + (match_operand:SWI1248_AVX512BW 2 "mask_reg_operand"))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_AVX512F && reload_completed" + [(parallel + [(set (match_dup 0) + (and:SWI1248_AVX512BW + (not:SWI1248_AVX512BW (match_dup 1)) + (match_dup 2))) + (unspec [(const_int 0)] UNSPEC_MASKOP)])]) + (define_insn "kxnor<mode>" [(set (match_operand:SWI1248_AVX512BW 0 "register_operand" "=k") (not:SWI1248_AVX512BW @@ -1543,6 +1564,38 @@ ] (const_string "<MODE>")))]) +(define_split + [(set (match_operand:SWI1248_AVX512BW 0 "mask_reg_operand") + (not:SWI1248_AVX512BW + (match_operand:SWI1248_AVX512BW 1 "mask_reg_operand")))] + "TARGET_AVX512F && reload_completed" + [(parallel + [(set (match_dup 0) + (not:SWI1248_AVX512BW (match_dup 1))) + (unspec [(const_int 0)] UNSPEC_MASKOP)])]) + +(define_insn "*knotsi_1_zext" + [(set (match_operand:DI 0 "register_operand" "=k") + (zero_extend:DI + (not:SI (match_operand:SI 1 "register_operand" "k")))) + (unspec [(const_int 0)] UNSPEC_MASKOP)] + "TARGET_AVX512BW" + "knotd\t{%1, %0|%0, %1}"; + [(set_attr "type" "msklog") + (set_attr "prefix" "vex") + (set_attr "mode" "SI")]) + +(define_split + [(set (match_operand:DI 0 "mask_reg_operand") + (zero_extend:DI + (not:SI (match_operand:SI 1 "mask_reg_operand"))))] + "TARGET_AVX512BW && reload_completed" + [(parallel + [(set (match_dup 0) + (zero_extend:DI + (not:SI (match_dup 1)))) + (unspec [(const_int 0)] UNSPEC_MASKOP)])]) + (define_insn "kadd<mode>" [(set (match_operand:SWI1248_AVX512BWDQ2 0 "register_operand" "=k") (plus:SWI1248_AVX512BWDQ2 @@ -1814,7 +1867,7 @@ (match_operand:<ssescalarmode> 1 "memory_operand" "m")) (match_operand:VF_AVX512 2 "register_operand" "v")))] "TARGET_AVX512F && <mask_mode512bit_condition>" - "vmul<ssemodesuffix>\t{%1<avx512bcst>, %2, %0<mask_operand3>|%0<mask_operand3>, %2, %1<<avx512bcst>>}" + "vmul<ssemodesuffix>\t{%1<avx512bcst>, %2, %0<mask_operand3>|%0<mask_operand3>, %2, %1<avx512bcst>}" [(set_attr "prefix" "evex") (set_attr "type" "ssemul") (set_attr "mode" "<MODE>")]) @@ -1907,7 +1960,7 @@ (vec_duplicate:VF_AVX512 (match_operand:<ssescalarmode> 2 "memory_operand" "m"))))] "TARGET_AVX512F && <mask_mode512bit_condition>" - "vdiv<ssemodesuffix>\t{%2<avx512bcst>, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2<<avx512bcst>>}" + "vdiv<ssemodesuffix>\t{%2<avx512bcst>, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2<avx512bcst>}" [(set_attr "prefix" "evex") (set_attr "type" "ssediv") (set_attr 
"mode" "<MODE>")]) @@ -2076,9 +2129,9 @@ (set_attr "mode" "<ssescalarmode>")]) (define_expand "rsqrt<mode>2" - [(set (match_operand:VF_AVX512VL_VF1_128_256 0 "register_operand") - (unspec:VF_AVX512VL_VF1_128_256 - [(match_operand:VF_AVX512VL_VF1_128_256 1 "vector_operand")] + [(set (match_operand:VF1_AVX512ER_128_256 0 "register_operand") + (unspec:VF1_AVX512ER_128_256 + [(match_operand:VF1_AVX512ER_128_256 1 "vector_operand")] UNSPEC_RSQRT))] "TARGET_SSE && TARGET_SSE_MATH" { @@ -2947,18 +3000,6 @@ (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) -(define_insn "*<avx512>_cmp<mode>3<mask_scalar_merge_name><round_saeonly_name>" - [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") - (match_operator:<avx512fmaskmode> 3 "ix86_comparison_int_operator" - [(match_operand:VI48_AVX512VL 1 "register_operand" "v") - (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "<round_saeonly_constraint>")]))] - "TARGET_AVX512F && <round_saeonly_mode512bit_condition>" - "vpcmp<ssemodesuffix>\t{%I3, <round_saeonly_mask_scalar_merge_op4>%2, %1, %0<mask_scalar_merge_operand4>|%0<mask_scalar_merge_operand4>, %1, %2<round_saeonly_mask_scalar_merge_op4>, %I3}" - [(set_attr "type" "ssecmp") - (set_attr "length_immediate" "1") - (set_attr "prefix" "evex") - (set_attr "mode" "<sseinsnmode>")]) - (define_insn "<avx512>_cmp<mode>3<mask_scalar_merge_name>" [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") (unspec:<avx512fmaskmode> @@ -2973,18 +3014,6 @@ (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) -(define_insn "*<avx512>_cmp<mode>3<mask_scalar_merge_name>" - [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") - (match_operator:<avx512fmaskmode> 3 "ix86_comparison_int_operator" - [(match_operand:VI12_AVX512VL 1 "register_operand" "v") - (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")]))] - "TARGET_AVX512BW" - "vpcmp<ssemodesuffix>\t{%I3, %2, %1, %0<mask_scalar_merge_operand4>|%0<mask_scalar_merge_operand4>, %1, %2, %I3}" - [(set_attr "type" "ssecmp") - (set_attr "length_immediate" "1") - (set_attr "prefix" "evex") - (set_attr "mode" "<sseinsnmode>")]) - (define_insn "<avx512>_ucmp<mode>3<mask_scalar_merge_name>" [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") (unspec:<avx512fmaskmode> @@ -2999,18 +3028,6 @@ (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) -(define_insn "*<avx512>_ucmp<mode>3<mask_scalar_merge_name>" - [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") - (match_operator:<avx512fmaskmode> 3 "ix86_comparison_uns_operator" - [(match_operand:VI12_AVX512VL 1 "register_operand" "v") - (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")]))] - "TARGET_AVX512BW" - "vpcmpu<ssemodesuffix>\t{%I3, %2, %1, %0<mask_scalar_merge_operand4>|%0<mask_scalar_merge_operand4>, %1, %2, %I3}" - [(set_attr "type" "ssecmp") - (set_attr "length_immediate" "1") - (set_attr "prefix" "evex") - (set_attr "mode" "<sseinsnmode>")]) - (define_insn "<avx512>_ucmp<mode>3<mask_scalar_merge_name>" [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") (unspec:<avx512fmaskmode> @@ -3025,18 +3042,6 @@ (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) -(define_insn "*<avx512>_ucmp<mode>3<mask_scalar_merge_name>" - [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") - (match_operator:<avx512fmaskmode> 3 "ix86_comparison_uns_operator" - [(match_operand:VI48_AVX512VL 1 "register_operand" "v") - (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm")]))] - 
"TARGET_AVX512F" - "vpcmpu<ssemodesuffix>\t{%I3, %2, %1, %0<mask_scalar_merge_operand4>|%0<mask_scalar_merge_operand4>, %1, %2, %I3}" - [(set_attr "type" "ssecmp") - (set_attr "length_immediate" "1") - (set_attr "prefix" "evex") - (set_attr "mode" "<sseinsnmode>")]) - (define_insn "avx512f_vmcmp<mode>3<round_saeonly_name>" [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") (and:<avx512fmaskmode> @@ -3071,18 +3076,6 @@ (set_attr "prefix" "evex") (set_attr "mode" "<ssescalarmode>")]) -(define_insn "avx512f_maskcmp<mode>3" - [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") - (match_operator:<avx512fmaskmode> 3 "sse_comparison_operator" - [(match_operand:VF_AVX512VL 1 "register_operand" "v") - (match_operand:VF_AVX512VL 2 "nonimmediate_operand" "vm")]))] - "TARGET_AVX512F" - "vcmp%D3<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}" - [(set_attr "type" "ssecmp") - (set_attr "length_immediate" "1") - (set_attr "prefix" "evex") - (set_attr "mode" "<sseinsnmode>")]) - (define_insn "<sse>_<unord>comi<round_saeonly_name>" [(set (reg:CCFP FLAGS_REG) (compare:CCFP @@ -3110,7 +3103,8 @@ (match_operand:V48_AVX512VL 3 "nonimmediate_operand")]))] "TARGET_AVX512F" { - bool ok = ix86_expand_mask_vec_cmp (operands); + bool ok = ix86_expand_mask_vec_cmp (operands[0], GET_CODE (operands[1]), + operands[2], operands[3]); gcc_assert (ok); DONE; }) @@ -3122,7 +3116,8 @@ (match_operand:VI12_AVX512VL 3 "nonimmediate_operand")]))] "TARGET_AVX512BW" { - bool ok = ix86_expand_mask_vec_cmp (operands); + bool ok = ix86_expand_mask_vec_cmp (operands[0], GET_CODE (operands[1]), + operands[2], operands[3]); gcc_assert (ok); DONE; }) @@ -3194,7 +3189,8 @@ (match_operand:VI48_AVX512VL 3 "nonimmediate_operand")]))] "TARGET_AVX512F" { - bool ok = ix86_expand_mask_vec_cmp (operands); + bool ok = ix86_expand_mask_vec_cmp (operands[0], GET_CODE (operands[1]), + operands[2], operands[3]); gcc_assert (ok); DONE; }) @@ -3206,7 +3202,8 @@ (match_operand:VI12_AVX512VL 3 "nonimmediate_operand")]))] "TARGET_AVX512BW" { - bool ok = ix86_expand_mask_vec_cmp (operands); + bool ok = ix86_expand_mask_vec_cmp (operands[0], GET_CODE (operands[1]), + operands[2], operands[3]); gcc_assert (ok); DONE; }) @@ -7029,7 +7026,7 @@ emit_insn (gen_vec_extract_hi_v16si (tmp[3], operands[1])); emit_insn (gen_floatv8siv8df2 (tmp[2], tmp[3])); - emit_insn (gen_rtx_SET (k, gen_rtx_LT (QImode, tmp[2], tmp[0]))); + ix86_expand_mask_vec_cmp (k, LT, tmp[2], tmp[0]); emit_insn (gen_addv8df3_mask (tmp[2], tmp[2], tmp[1], tmp[2], k)); emit_move_insn (operands[0], tmp[2]); DONE; @@ -7076,7 +7073,7 @@ k = gen_reg_rtx (QImode); emit_insn (gen_avx512f_cvtdq2pd512_2 (tmp[2], operands[1])); - emit_insn (gen_rtx_SET (k, gen_rtx_LT (QImode, tmp[2], tmp[0]))); + ix86_expand_mask_vec_cmp (k, LT, tmp[2], tmp[0]); emit_insn (gen_addv8df3_mask (tmp[2], tmp[2], tmp[1], tmp[2], k)); emit_move_insn (operands[0], tmp[2]); DONE; @@ -12123,6 +12120,18 @@ (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) +(define_insn "*avx512dq_mul<mode>3<mask_name>_bcst" + [(set (match_operand:VI8_AVX512VL 0 "register_operand" "=v") + (mult:VI8_AVX512VL + (vec_duplicate:VI8_AVX512VL + (match_operand:<ssescalarmode> 1 "memory_operand" "m")) + (match_operand:VI8_AVX512VL 2 "register_operand" "v")))] + "TARGET_AVX512DQ" + "vpmullq\t{%1<avx512bcst>, %2, %0<mask_operand3>|%0<mask_operand3>, %2, %1<avx512bcst>}" + [(set_attr "type" "sseimul") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + (define_expand "mul<mode>3<mask_name>" [(set 
(match_operand:VI4_AVX512F 0 "register_operand") (mult:VI4_AVX512F @@ -12163,6 +12172,18 @@ (set_attr "btver2_decode" "vector,vector,vector") (set_attr "mode" "<sseinsnmode>")]) +(define_insn "*avx512f_mul<mode>3<mask_name>_bcst" + [(set (match_operand:VI4_AVX512VL 0 "register_operand" "=v") + (mult:VI4_AVX512VL + (vec_duplicate:VI4_AVX512VL + (match_operand:<ssescalarmode> 1 "memory_operand" "m")) + (match_operand:VI4_AVX512VL 2 "register_operand" "v")))] + "TARGET_AVX512F" + "vpmulld\t{%1<avx512bcst>, %2, %0<mask_operand3>|%0<mask_operand3>, %2, %1<avx512bcst>}" + [(set_attr "type" "sseimul") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + (define_expand "mul<mode>3" [(set (match_operand:VI8_AVX2_AVX512F 0 "register_operand") (mult:VI8_AVX2_AVX512F @@ -16941,11 +16962,8 @@ GET_MODE (operands[2])); operands[4] = lowpart_subreg (V16QImode, operands[3], GET_MODE (operands[3])); - rtvec par = gen_rtvec (4, GEN_INT (0xf7f7f7f7), - GEN_INT (0xf7f7f7f7), - GEN_INT (0xf7f7f7f7), - GEN_INT (0xf7f7f7f7)); - rtx vec_const = gen_rtx_CONST_VECTOR (V4SImode, par); + rtx vec_const = ix86_build_const_vector (V4SImode, true, + gen_int_mode (0xf7f7f7f7, SImode)); operands[5] = force_const_mem (V4SImode, vec_const); } [(set_attr "mmx_isa" "native,sse_noavx,avx") @@ -23466,6 +23484,30 @@ (V4DI "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL") (V8SI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL")]) +(define_mode_iterator MASK_DWI [P2QI P2HI]) + +(define_expand "mov<mode>" + [(set (match_operand:MASK_DWI 0 "nonimmediate_operand") + (match_operand:MASK_DWI 1 "nonimmediate_operand"))] + "TARGET_AVX512VP2INTERSECT" +{ + if (MEM_P (operands[0]) && MEM_P (operands[1])) + operands[1] = force_reg (<MODE>mode, operands[1]); +}) + +(define_insn_and_split "*mov<mode>_internal" + [(set (match_operand:MASK_DWI 0 "nonimmediate_operand" "=k,o") + (match_operand:MASK_DWI 1 "nonimmediate_operand" "ko,k"))] + "TARGET_AVX512VP2INTERSECT + && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "#" + "&& reload_completed" + [(set (match_dup 0) (match_dup 1)) + (set (match_dup 2) (match_dup 3))] +{ + split_double_mode (<MODE>mode, &operands[0], 2, &operands[0], &operands[2]); +}) + (define_insn "avx512vp2intersect_2intersect<mode>" [(set (match_operand:P2QI 0 "register_operand" "=k") (unspec:P2QI diff --git a/gcc/config/i386/sync.md b/gcc/config/i386/sync.md index 9ab5456..ed17bb0 100644 --- a/gcc/config/i386/sync.md +++ b/gcc/config/i386/sync.md @@ -100,8 +100,13 @@ [(set (match_operand:BLK 0) (unspec:BLK [(match_dup 0)] UNSPEC_MFENCE)) (clobber (reg:CC FLAGS_REG))] - "!(TARGET_64BIT || TARGET_SSE2)" - "lock{%;} or{l}\t{$0, (%%esp)|DWORD PTR [esp], 0}" + "" +{ + rtx mem = gen_rtx_MEM (word_mode, stack_pointer_rtx); + + output_asm_insn ("lock{%;} or%z0\t{$0, %0|%0, 0}", &mem); + return ""; +} [(set_attr "memory" "unknown")]) (define_expand "mem_thread_fence" @@ -117,7 +122,9 @@ rtx (*mfence_insn)(rtx); rtx mem; - if (TARGET_64BIT || TARGET_SSE2) + if ((TARGET_64BIT || TARGET_SSE2) + && (optimize_function_for_size_p (cfun) + || !TARGET_AVOID_MFENCE)) mfence_insn = gen_mfence_sse2; else mfence_insn = gen_mfence_nosse; @@ -306,11 +313,10 @@ { operands[1] = force_reg (<MODE>mode, operands[1]); - /* For seq-cst stores, use XCHG when we lack MFENCE - or when target prefers XCHG. */ + /* For seq-cst stores, use XCHG when we lack MFENCE. 
*/ if (is_mm_seq_cst (model) && (!(TARGET_64BIT || TARGET_SSE2) - || TARGET_USE_XCHG_FOR_ATOMIC_STORE)) + || TARGET_AVOID_MFENCE)) { emit_insn (gen_atomic_exchange<mode> (gen_reg_rtx (<MODE>mode), operands[0], operands[1], @@ -594,6 +600,75 @@ "TARGET_CMPXCHG" "lock{%;} %K4cmpxchg{<imodesuffix>}\t{%3, %1|%1, %3}") +(define_peephole2 + [(set (match_operand:SWI 0 "register_operand") + (match_operand:SWI 1 "general_operand")) + (parallel [(set (match_dup 0) + (unspec_volatile:SWI + [(match_operand:SWI 2 "memory_operand") + (match_dup 0) + (match_operand:SWI 3 "register_operand") + (match_operand:SI 4 "const_int_operand")] + UNSPECV_CMPXCHG)) + (set (match_dup 2) + (unspec_volatile:SWI [(const_int 0)] UNSPECV_CMPXCHG)) + (set (reg:CCZ FLAGS_REG) + (unspec_volatile:CCZ [(const_int 0)] UNSPECV_CMPXCHG))]) + (set (reg:CCZ FLAGS_REG) + (compare:CCZ (match_operand:SWI 5 "register_operand") + (match_operand:SWI 6 "general_operand")))] + "(rtx_equal_p (operands[0], operands[5]) + && rtx_equal_p (operands[1], operands[6])) + || (rtx_equal_p (operands[0], operands[6]) + && rtx_equal_p (operands[1], operands[5]))" + [(set (match_dup 0) + (match_dup 1)) + (parallel [(set (match_dup 0) + (unspec_volatile:SWI + [(match_dup 2) + (match_dup 0) + (match_dup 3) + (match_dup 4)] + UNSPECV_CMPXCHG)) + (set (match_dup 2) + (unspec_volatile:SWI [(const_int 0)] UNSPECV_CMPXCHG)) + (set (reg:CCZ FLAGS_REG) + (unspec_volatile:CCZ [(const_int 0)] UNSPECV_CMPXCHG))])]) + +(define_peephole2 + [(parallel [(set (match_operand:SWI48 0 "register_operand") + (match_operand:SWI48 1 "const_int_operand")) + (clobber (reg:CC FLAGS_REG))]) + (parallel [(set (match_operand:SWI 2 "register_operand") + (unspec_volatile:SWI + [(match_operand:SWI 3 "memory_operand") + (match_dup 2) + (match_operand:SWI 4 "register_operand") + (match_operand:SI 5 "const_int_operand")] + UNSPECV_CMPXCHG)) + (set (match_dup 3) + (unspec_volatile:SWI [(const_int 0)] UNSPECV_CMPXCHG)) + (set (reg:CCZ FLAGS_REG) + (unspec_volatile:CCZ [(const_int 0)] UNSPECV_CMPXCHG))]) + (set (reg:CCZ FLAGS_REG) + (compare:CCZ (match_dup 2) + (match_dup 1)))] + "REGNO (operands[0]) == REGNO (operands[2])" + [(parallel [(set (match_dup 0) + (match_dup 1)) + (clobber (reg:CC FLAGS_REG))]) + (parallel [(set (match_dup 2) + (unspec_volatile:SWI + [(match_dup 3) + (match_dup 2) + (match_dup 4) + (match_dup 5)] + UNSPECV_CMPXCHG)) + (set (match_dup 3) + (unspec_volatile:SWI [(const_int 0)] UNSPECV_CMPXCHG)) + (set (reg:CCZ FLAGS_REG) + (unspec_volatile:CCZ [(const_int 0)] UNSPECV_CMPXCHG))])]) + ;; For operand 2 nonmemory_operand predicate is used instead of ;; register_operand to allow combiner to better optimize atomic ;; additions of constants. diff --git a/gcc/config/i386/t-rtems b/gcc/config/i386/t-rtems index 7626970..5f078c6 100644 --- a/gcc/config/i386/t-rtems +++ b/gcc/config/i386/t-rtems @@ -17,10 +17,10 @@ # <http://www.gnu.org/licenses/>. 
# -MULTILIB_OPTIONS = mtune=i486/mtune=pentium/mtune=pentiumpro msoft-float +MULTILIB_OPTIONS = march=i486/march=pentium/march=pentiumpro msoft-float MULTILIB_DIRNAMES= m486 mpentium mpentiumpro soft-float MULTILIB_MATCHES = msoft-float=mno-80387 -MULTILIB_MATCHES += mtune?pentium=mtune?k6 mtune?pentiumpro=mtune?athlon +MULTILIB_MATCHES += march?pentium=march?k6 march?pentiumpro=march?athlon MULTILIB_EXCEPTIONS = \ -mtune=pentium/*msoft-float* \ -mtune=pentiumpro/*msoft-float* +march=pentium/*msoft-float* \ +march=pentiumpro/*msoft-float* diff --git a/gcc/config/i386/tbmintrin.h b/gcc/config/i386/tbmintrin.h index c8a9d77..e03bf91 100644 --- a/gcc/config/i386/tbmintrin.h +++ b/gcc/config/i386/tbmintrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#ifndef _X86INTRIN_H_INCLUDED -# error "Never use <tbmintrin.h> directly; include <x86intrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <tbmintrin.h> directly; include <x86gprintrin.h> instead." #endif #ifndef _TBMINTRIN_H_INCLUDED diff --git a/gcc/config/i386/tsxldtrkintrin.h b/gcc/config/i386/tsxldtrkintrin.h index 08b76a9..eab36d0 100644 --- a/gcc/config/i386/tsxldtrkintrin.h +++ b/gcc/config/i386/tsxldtrkintrin.h @@ -1,5 +1,28 @@ -#if !defined _IMMINTRIN_H_INCLUDED -#error "Never use <tsxldtrkintrin.h> directly; include <immintrin.h> instead." +/* Copyright (C) 2020 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <tsxldtrkintrin.h> directly; include <x86gprintrin.h> instead." #endif #ifndef _TSXLDTRKINTRIN_H_INCLUDED diff --git a/gcc/config/i386/waitpkgintrin.h b/gcc/config/i386/waitpkgintrin.h index 5dbcde3..5046c98 100644 --- a/gcc/config/i386/waitpkgintrin.h +++ b/gcc/config/i386/waitpkgintrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#if !defined _IMMINTRIN_H_INCLUDED -# error "Never use <waitpkgintrin.h> directly; include <immintrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <waitpkgintrin.h> directly; include <x86gprintrin.h> instead." #endif #ifndef _WAITPKG_H_INCLUDED diff --git a/gcc/config/i386/wbnoinvdintrin.h b/gcc/config/i386/wbnoinvdintrin.h index 5393698..7089e61 100644 --- a/gcc/config/i386/wbnoinvdintrin.h +++ b/gcc/config/i386/wbnoinvdintrin.h @@ -1,5 +1,28 @@ -#ifndef _IMMINTRIN_H_INCLUDED -#error "Never use <wbnoinvdintrin.h> directly; include <immintrin.h> instead." +/* Copyright (C) 2018-2020 Free Software Foundation, Inc. + + This file is part of GCC. 
+ + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <wbnoinvdintrin.h> directly; include <x86gprintrin.h> instead." #endif #ifndef _WBNOINVDINTRIN_H_INCLUDED diff --git a/gcc/config/i386/x86-64.h b/gcc/config/i386/x86-64.h index 88db428..0c5b8af 100644 --- a/gcc/config/i386/x86-64.h +++ b/gcc/config/i386/x86-64.h @@ -59,6 +59,17 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see #define ASM_OUTPUT_ALIGNED_DECL_COMMON(FILE, DECL, NAME, SIZE, ALIGN) \ x86_elf_aligned_decl_common (FILE, DECL, NAME, SIZE, ALIGN); +#undef ASM_OUTPUT_ALIGNED_DECL_LOCAL +#define ASM_OUTPUT_ALIGNED_DECL_LOCAL(FILE, DECL, NAME, SIZE, ALIGN) \ + do \ + { \ + fprintf ((FILE), "%s", LOCAL_ASM_OP); \ + assemble_name ((FILE), (NAME)); \ + fprintf ((FILE), "\n"); \ + ASM_OUTPUT_ALIGNED_DECL_COMMON (FILE, DECL, NAME, SIZE, ALIGN); \ + } \ + while (0) + /* This is used to align code labels according to Intel recommendations. */ #define SUBALIGN_LOG 3 diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h index c73917e..5de4149 100644 --- a/gcc/config/i386/x86-tune-costs.h +++ b/gcc/config/i386/x86-tune-costs.h @@ -58,7 +58,13 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */ in 32,64,128,256 and 512-bit */ {3, 3, 3, 3, 3}, /* cost of storing SSE registers in 32,64,128,256 and 512-bit */ - 3, 3, /* SSE->integer and integer->SSE moves */ + 3, 3, /* SSE->integer and integer->SSE moves */ + 3, 3, /* mask->integer and integer->mask moves */ + {2, 2, 2}, /* cost of loading mask register + in QImode, HImode, SImode. */ + {2, 2, 2}, /* cost if storing mask register + in QImode, HImode, SImode. */ + 2, /* cost of moving mask register. */ /* End of register allocator costs. */ }, @@ -163,7 +169,13 @@ struct processor_costs i386_cost = { /* 386 specific costs */ in 32,64,128,256 and 512-bit */ {4, 8, 16, 32, 64}, /* cost of storing SSE registers in 32,64,128,256 and 512-bit */ - 3, 3, /* SSE->integer and integer->SSE moves */ + 3, 3, /* SSE->integer and integer->SSE moves */ + 3, 3, /* mask->integer and integer->mask moves */ + {2, 4, 2}, /* cost of loading mask register + in QImode, HImode, SImode. */ + {2, 4, 2}, /* cost if storing mask register + in QImode, HImode, SImode. */ + 2, /* cost of moving mask register. */ /* End of register allocator costs. 
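/* Rough illustration of the x86-64.h change above (hedged; the exact
   directives depend on the code model and section choices): a file-scope

       static char pool[4096] __attribute__ ((aligned (64)));

   that is emitted as aligned common data would now be announced as

       .local  pool
       .comm   pool,4096,64

   i.e. the new ASM_OUTPUT_ALIGNED_DECL_LOCAL first marks the symbol local
   via LOCAL_ASM_OP and then reuses the existing aligned-common path, which
   may still pick .largecomm for the medium/large code models.  */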
*/ }, @@ -265,7 +277,13 @@ struct processor_costs i486_cost = { /* 486 specific costs */ in 32,64,128,256 and 512-bit */ {4, 8, 16, 32, 64}, /* cost of storing SSE registers in 32,64,128,256 and 512-bit */ - 3, 3, /* SSE->integer and integer->SSE moves */ + 3, 3, /* SSE->integer and integer->SSE moves */ + 3, 3, /* mask->integer and integer->mask moves */ + {2, 4, 2}, /* cost of loading mask register + in QImode, HImode, SImode. */ + {2, 4, 2}, /* cost if storing mask register + in QImode, HImode, SImode. */ + 2, /* cost of moving mask register. */ /* End of register allocator costs. */ }, @@ -369,7 +387,13 @@ struct processor_costs pentium_cost = { in 32,64,128,256 and 512-bit */ {4, 8, 16, 32, 64}, /* cost of storing SSE registers in 32,64,128,256 and 512-bit */ - 3, 3, /* SSE->integer and integer->SSE moves */ + 3, 3, /* SSE->integer and integer->SSE moves */ + 3, 3, /* mask->integer and integer->mask moves */ + {2, 4, 2}, /* cost of loading mask register + in QImode, HImode, SImode. */ + {2, 4, 2}, /* cost if storing mask register + in QImode, HImode, SImode. */ + 2, /* cost of moving mask register. */ /* End of register allocator costs. */ }, @@ -464,7 +488,13 @@ struct processor_costs lakemont_cost = { in 32,64,128,256 and 512-bit */ {4, 8, 16, 32, 64}, /* cost of storing SSE registers in 32,64,128,256 and 512-bit */ - 3, 3, /* SSE->integer and integer->SSE moves */ + 3, 3, /* SSE->integer and integer->SSE moves */ + 3, 3, /* mask->integer and integer->mask moves */ + {2, 4, 2}, /* cost of loading mask register + in QImode, HImode, SImode. */ + {2, 4, 2}, /* cost if storing mask register + in QImode, HImode, SImode. */ + 2, /* cost of moving mask register. */ /* End of register allocator costs. */ }, @@ -574,7 +604,13 @@ struct processor_costs pentiumpro_cost = { in 32,64,128,256 and 512-bit */ {4, 8, 16, 32, 64}, /* cost of storing SSE registers in 32,64,128,256 and 512-bit */ - 3, 3, /* SSE->integer and integer->SSE moves */ + 3, 3, /* SSE->integer and integer->SSE moves */ + 3, 3, /* mask->integer and integer->mask moves */ + {4, 4, 4}, /* cost of loading mask register + in QImode, HImode, SImode. */ + {2, 2, 2}, /* cost if storing mask register + in QImode, HImode, SImode. */ + 2, /* cost of moving mask register. */ /* End of register allocator costs. */ }, @@ -675,7 +711,13 @@ struct processor_costs geode_cost = { in 32,64,128,256 and 512-bit */ {2, 2, 8, 16, 32}, /* cost of storing SSE registers in 32,64,128,256 and 512-bit */ - 6, 6, /* SSE->integer and integer->SSE moves */ + 6, 6, /* SSE->integer and integer->SSE moves */ + 6, 6, /* mask->integer and integer->mask moves */ + {2, 2, 2}, /* cost of loading mask register + in QImode, HImode, SImode. */ + {2, 2, 2}, /* cost if storing mask register + in QImode, HImode, SImode. */ + 2, /* cost of moving mask register. */ /* End of register allocator costs. */ }, @@ -776,7 +818,13 @@ struct processor_costs k6_cost = { in 32,64,128,256 and 512-bit */ {2, 2, 8, 16, 32}, /* cost of storing SSE registers in 32,64,128,256 and 512-bit */ - 6, 6, /* SSE->integer and integer->SSE moves */ + 6, 6, /* SSE->integer and integer->SSE moves */ + 6, 6, /* mask->integer and integer->mask moves */ + {4, 5, 4}, /* cost of loading mask register + in QImode, HImode, SImode. */ + {2, 3, 2}, /* cost if storing mask register + in QImode, HImode, SImode. */ + 2, /* cost of moving mask register. */ /* End of register allocator costs. 
*/ }, @@ -883,7 +931,13 @@ struct processor_costs athlon_cost = { in 32,64,128,256 and 512-bit */ {4, 4, 10, 10, 20}, /* cost of storing SSE registers in 32,64,128,256 and 512-bit */ - 5, 5, /* SSE->integer and integer->SSE moves */ + 5, 5, /* SSE->integer and integer->SSE moves */ + 5, 5, /* mask->integer and integer->mask moves */ + {3, 4, 3}, /* cost of loading mask register + in QImode, HImode, SImode. */ + {3, 4, 3}, /* cost if storing mask register + in QImode, HImode, SImode. */ + 2, /* cost of moving mask register. */ /* End of register allocator costs. */ }, @@ -992,7 +1046,13 @@ struct processor_costs k8_cost = { in 32,64,128,256 and 512-bit */ {4, 4, 10, 10, 20}, /* cost of storing SSE registers in 32,64,128,256 and 512-bit */ - 5, 5, /* SSE->integer and integer->SSE moves */ + 5, 5, /* SSE->integer and integer->SSE moves */ + 5, 5, /* mask->integer and integer->mask moves */ + {3, 4, 3}, /* cost of loading mask register + in QImode, HImode, SImode. */ + {3, 4, 3}, /* cost if storing mask register + in QImode, HImode, SImode. */ + 2, /* cost of moving mask register. */ /* End of register allocator costs. */ }, @@ -1105,7 +1165,13 @@ struct processor_costs amdfam10_cost = { in 32,64,128,256 and 512-bit */ {4, 4, 5, 10, 20}, /* cost of storing SSE registers in 32,64,128,256 and 512-bit */ - 3, 3, /* SSE->integer and integer->SSE moves */ + 3, 3, /* SSE->integer and integer->SSE moves */ + 3, 3, /* mask->integer and integer->mask moves */ + {3, 4, 3}, /* cost of loading mask register + in QImode, HImode, SImode. */ + {3, 4, 3}, /* cost if storing mask register + in QImode, HImode, SImode. */ + 2, /* cost of moving mask register. */ /* On K8: MOVD reg64, xmmreg Double FSTORE 4 @@ -1229,6 +1295,12 @@ const struct processor_costs bdver_cost = { {10, 10, 10, 40, 60}, /* cost of storing SSE registers in 32,64,128,256 and 512-bit */ 16, 20, /* SSE->integer and integer->SSE moves */ + 16, 20, /* mask->integer and integer->mask moves */ + {8, 8, 8}, /* cost of loading mask register + in QImode, HImode, SImode. */ + {8, 8, 8}, /* cost if storing mask register + in QImode, HImode, SImode. */ + 2, /* cost of moving mask register. */ /* End of register allocator costs. */ }, @@ -1311,14 +1383,23 @@ const struct processor_costs bdver_cost = { very small blocks it is better to use loop. For large blocks, libcall can do nontemporary accesses and beat inline considerably. */ static stringop_algs znver1_memcpy[2] = { - {libcall, {{6, loop, false}, {14, unrolled_loop, false}, - {-1, rep_prefix_4_byte, false}}}, - {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, + /* 32-bit tuning. */ + {libcall, {{6, loop, false}, + {14, unrolled_loop, false}, + {-1, libcall, false}}}, + /* 64-bit tuning. */ + {libcall, {{16, loop, false}, + {128, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; static stringop_algs znver1_memset[2] = { - {libcall, {{8, loop, false}, {24, unrolled_loop, false}, - {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, - {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, + /* 32-bit tuning. */ + {libcall, {{8, loop, false}, + {24, unrolled_loop, false}, + {128, rep_prefix_4_byte, false}, + {-1, libcall, false}}}, + /* 64-bit tuning. */ + {libcall, {{48, unrolled_loop, false}, + {128, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; struct processor_costs znver1_cost = { { @@ -1350,7 +1431,13 @@ struct processor_costs znver1_cost = { in 32,64,128,256 and 512-bit. 
*/ {8, 8, 8, 16, 32}, /* cost of storing SSE registers in 32,64,128,256 and 512-bit. */ - 6, 6, /* SSE->integer and integer->SSE moves. */ + 6, 6, /* SSE->integer and integer->SSE moves. */ + 8, 8, /* mask->integer and integer->mask moves */ + {6, 6, 6}, /* cost of loading mask register + in QImode, HImode, SImode. */ + {8, 8, 8}, /* cost if storing mask register + in QImode, HImode, SImode. */ + 2, /* cost of moving mask register. */ /* End of register allocator costs. */ }, @@ -1448,14 +1535,23 @@ struct processor_costs znver1_cost = { very small blocks it is better to use loop. For large blocks, libcall can do nontemporary accesses and beat inline considerably. */ static stringop_algs znver2_memcpy[2] = { - {libcall, {{6, loop, false}, {14, unrolled_loop, false}, - {-1, rep_prefix_4_byte, false}}}, - {libcall, {{16, loop, false}, {64, rep_prefix_4_byte, false}, + /* 32-bit tuning. */ + {libcall, {{6, loop, false}, + {14, unrolled_loop, false}, + {-1, libcall, false}}}, + /* 64-bit tuning. */ + {libcall, {{16, loop, false}, + {64, rep_prefix_4_byte, false}, {-1, libcall, false}}}}; static stringop_algs znver2_memset[2] = { - {libcall, {{8, loop, false}, {24, unrolled_loop, false}, - {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, - {libcall, {{24, rep_prefix_4_byte, false}, {128, rep_prefix_8_byte, false}, + /* 32-bit tuning. */ + {libcall, {{8, loop, false}, + {24, unrolled_loop, false}, + {128, rep_prefix_4_byte, false}, + {-1, libcall, false}}}, + /* 64-bit tuning. */ + {libcall, {{24, rep_prefix_4_byte, false}, + {128, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; struct processor_costs znver2_cost = { @@ -1491,6 +1587,12 @@ struct processor_costs znver2_cost = { in 32,64,128,256 and 512-bit. */ 6, 6, /* SSE->integer and integer->SSE moves. */ + 8, 8, /* mask->integer and integer->mask moves */ + {6, 6, 6}, /* cost of loading mask register + in QImode, HImode, SImode. */ + {8, 8, 8}, /* cost if storing mask register + in QImode, HImode, SImode. */ + 2, /* cost of moving mask register. */ /* End of register allocator costs. */ }, @@ -1624,7 +1726,13 @@ struct processor_costs skylake_cost = { in 32,64,128,256 and 512-bit */ {8, 8, 8, 12, 24}, /* cost of storing SSE registers in 32,64,128,256 and 512-bit */ - 6, 6, /* SSE->integer and integer->SSE moves */ + 6, 6, /* SSE->integer and integer->SSE moves */ + 5, 5, /* mask->integer and integer->mask moves */ + {8, 8, 8}, /* cost of loading mask register + in QImode, HImode, SImode. */ + {6, 6, 6}, /* cost if storing mask register + in QImode, HImode, SImode. */ + 3, /* cost of moving mask register. */ /* End of register allocator costs. */ }, @@ -1661,7 +1769,7 @@ struct processor_costs skylake_cost = { {6, 6, 6, 10, 20}, /* cost of unaligned loads. */ {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ - 2, /* cost of moving SSE register to integer. */ + 6, /* cost of moving SSE register to integer. */ 20, 8, /* Gather load static, per_elt. */ 22, 10, /* Gather store static, per_elt. */ 64, /* size of l1 cache. */ @@ -1733,6 +1841,12 @@ const struct processor_costs btver1_cost = { {10, 10, 12, 48, 96}, /* cost of storing SSE registers in 32,64,128,256 and 512-bit */ 14, 14, /* SSE->integer and integer->SSE moves */ + 14, 14, /* mask->integer and integer->mask moves */ + {6, 8, 6}, /* cost of loading mask register + in QImode, HImode, SImode. */ + {6, 8, 6}, /* cost if storing mask register + in QImode, HImode, SImode. */ + 2, /* cost of moving mask register. 
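/* How the new mask-register entries read, using the Skylake table above as
   the example (the field names below are an assumption, not taken from the
   patch): moving between a mask register and a GPR is costed 5 in either
   direction, loading a mask register from memory costs 8 for QImode, HImode
   and SImode, storing it costs 6, and a mask-to-mask move costs 3, e.g.

       skylake_cost.mask_to_integer = 5;   /* kmov %k, %r                  */
       skylake_cost.mask_load[2]    = 8;   /* SImode load into a mask reg  */
       skylake_cost.mask_move       = 3;   /* kmov %k, %k                  */

   These numbers feed the register-allocator cost model, alongside the
   Skylake SSE->integer move cost that is raised from 2 to 6 in the same
   hunk.  */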
*/ /* End of register allocator costs. */ }, @@ -1837,6 +1951,12 @@ const struct processor_costs btver2_cost = { {10, 10, 12, 48, 96}, /* cost of storing SSE registers in 32,64,128,256 and 512-bit */ 14, 14, /* SSE->integer and integer->SSE moves */ + 14, 14, /* mask->integer and integer->mask moves */ + {8, 8, 6}, /* cost of loading mask register + in QImode, HImode, SImode. */ + {8, 8, 6}, /* cost if storing mask register + in QImode, HImode, SImode. */ + 2, /* cost of moving mask register. */ /* End of register allocator costs. */ }, @@ -1940,6 +2060,12 @@ struct processor_costs pentium4_cost = { {16, 16, 16, 32, 64}, /* cost of storing SSE registers in 32,64,128,256 and 512-bit */ 20, 12, /* SSE->integer and integer->SSE moves */ + 20, 12, /* mask->integer and integer->mask moves */ + {4, 5, 4}, /* cost of loading mask register + in QImode, HImode, SImode. */ + {2, 3, 2}, /* cost if storing mask register + in QImode, HImode, SImode. */ + 2, /* cost of moving mask register. */ /* End of register allocator costs. */ }, @@ -2046,6 +2172,12 @@ struct processor_costs nocona_cost = { {12, 12, 12, 24, 48}, /* cost of storing SSE registers in 32,64,128,256 and 512-bit */ 20, 12, /* SSE->integer and integer->SSE moves */ + 20, 12, /* mask->integer and integer->mask moves */ + {4, 4, 4}, /* cost of loading mask register + in QImode, HImode, SImode. */ + {4, 4, 4}, /* cost if storing mask register + in QImode, HImode, SImode. */ + 2, /* cost of moving mask register. */ /* End of register allocator costs. */ }, @@ -2149,7 +2281,13 @@ struct processor_costs atom_cost = { in 32,64,128,256 and 512-bit */ {8, 8, 8, 16, 32}, /* cost of storing SSE registers in 32,64,128,256 and 512-bit */ - 8, 6, /* SSE->integer and integer->SSE moves */ + 8, 6, /* SSE->integer and integer->SSE moves */ + 8, 6, /* mask->integer and integer->mask moves */ + {6, 6, 6}, /* cost of loading mask register + in QImode, HImode, SImode. */ + {6, 6, 6}, /* cost if storing mask register + in QImode, HImode, SImode. */ + 2, /* cost of moving mask register. */ /* End of register allocator costs. */ }, @@ -2253,7 +2391,13 @@ struct processor_costs slm_cost = { in 32,64,128,256 and 512-bit */ {8, 8, 8, 16, 32}, /* cost of storing SSE registers in 32,64,128,256 and 512-bit */ - 8, 6, /* SSE->integer and integer->SSE moves */ + 8, 6, /* SSE->integer and integer->SSE moves */ + 8, 6, /* mask->integer and integer->mask moves */ + {8, 8, 8}, /* cost of loading mask register + in QImode, HImode, SImode. */ + {6, 6, 6}, /* cost if storing mask register + in QImode, HImode, SImode. */ + 2, /* cost of moving mask register. */ /* End of register allocator costs. */ }, @@ -2357,7 +2501,13 @@ struct processor_costs intel_cost = { in 32,64,128,256 and 512-bit */ {6, 6, 6, 6, 6}, /* cost of storing SSE registers in 32,64,128,256 and 512-bit */ - 4, 4, /* SSE->integer and integer->SSE moves */ + 4, 4, /* SSE->integer and integer->SSE moves */ + 4, 4, /* mask->integer and integer->mask moves */ + {4, 4, 4}, /* cost of loading mask register + in QImode, HImode, SImode. */ + {6, 6, 6}, /* cost if storing mask register + in QImode, HImode, SImode. */ + 2, /* cost of moving mask register. */ /* End of register allocator costs. 
*/ }, @@ -2465,7 +2615,13 @@ struct processor_costs generic_cost = { in 32,64,128,256 and 512-bit */ {6, 6, 6, 10, 15}, /* cost of storing SSE registers in 32,64,128,256 and 512-bit */ - 6, 6, /* SSE->integer and integer->SSE moves */ + 6, 6, /* SSE->integer and integer->SSE moves */ + 6, 6, /* mask->integer and integer->mask moves */ + {6, 6, 6}, /* cost of loading mask register + in QImode, HImode, SImode. */ + {6, 6, 6}, /* cost if storing mask register + in QImode, HImode, SImode. */ + 2, /* cost of moving mask register. */ /* End of register allocator costs. */ }, @@ -2578,7 +2734,13 @@ struct processor_costs core_cost = { in 32,64,128,256 and 512-bit */ {6, 6, 6, 6, 12}, /* cost of storing SSE registers in 32,64,128,256 and 512-bit */ - 6, 6, /* SSE->integer and integer->SSE moves */ + 6, 6, /* SSE->integer and integer->SSE moves */ + 6, 6, /* mask->integer and integer->mask moves */ + {4, 4, 4}, /* cost of loading mask register + in QImode, HImode, SImode. */ + {6, 6, 6}, /* cost if storing mask register + in QImode, HImode, SImode. */ + 2, /* cost of moving mask register. */ /* End of register allocator costs. */ }, diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index 1776aba..6eff825 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -313,8 +313,8 @@ DEF_TUNE (X86_TUNE_ONE_IF_CONV_INSN, "one_if_conv_insn", m_SILVERMONT | m_KNL | m_KNM | m_INTEL | m_CORE_ALL | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_GENERIC) -/* X86_TUNE_USE_XCHG_FOR_ATOMIC_STORE: Use xchg instead of mov+mfence. */ -DEF_TUNE (X86_TUNE_USE_XCHG_FOR_ATOMIC_STORE, "use_xchg_for_atomic_store", +/* X86_TUNE_AVOID_MFENCE: Use lock prefixed instructions instead of mfence. */ +DEF_TUNE (X86_TUNE_AVOID_MFENCE, "avoid_mfence", m_CORE_ALL | m_BDVER | m_ZNVER | m_GENERIC) /* X86_TUNE_EXPAND_ABS: This enables a new abs pattern by diff --git a/gcc/config/i386/x86gprintrin.h b/gcc/config/i386/x86gprintrin.h new file mode 100644 index 0000000..ecfb1c0 --- /dev/null +++ b/gcc/config/i386/x86gprintrin.h @@ -0,0 +1,252 @@ +/* Copyright (C) 2020 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. 
*/ + +#ifndef _X86GPRINTRIN_H_INCLUDED +#define _X86GPRINTRIN_H_INCLUDED + +#include <ia32intrin.h> + +#ifndef __iamcu__ + +#include <stddef.h> + +#include <adxintrin.h> + +#include <bmiintrin.h> + +#include <bmi2intrin.h> + +#include <cetintrin.h> + +#include <cldemoteintrin.h> + +#include <clflushoptintrin.h> + +#include <clwbintrin.h> + +#include <clzerointrin.h> + +#include <enqcmdintrin.h> + +#include <fxsrintrin.h> + +#include <lzcntintrin.h> + +#include <lwpintrin.h> + +#include <movdirintrin.h> + +#include <mwaitxintrin.h> + +#include <pconfigintrin.h> + +#include <popcntintrin.h> + +#include <pkuintrin.h> + +#include <rdseedintrin.h> + +#include <rtmintrin.h> + +#include <serializeintrin.h> + +#include <sgxintrin.h> + +#include <tbmintrin.h> + +#include <tsxldtrkintrin.h> + +#include <waitpkgintrin.h> + +#include <wbnoinvdintrin.h> + +#include <xsaveintrin.h> + +#include <xsavecintrin.h> + +#include <xsaveoptintrin.h> + +#include <xsavesintrin.h> + +#include <xtestintrin.h> + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_wbinvd (void) +{ + __builtin_ia32_wbinvd (); +} + +#ifndef __RDRND__ +#pragma GCC push_options +#pragma GCC target("rdrnd") +#define __DISABLE_RDRND__ +#endif /* __RDRND__ */ +extern __inline int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_rdrand16_step (unsigned short *__P) +{ + return __builtin_ia32_rdrand16_step (__P); +} + +extern __inline int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_rdrand32_step (unsigned int *__P) +{ + return __builtin_ia32_rdrand32_step (__P); +} +#ifdef __DISABLE_RDRND__ +#undef __DISABLE_RDRND__ +#pragma GCC pop_options +#endif /* __DISABLE_RDRND__ */ + +#ifndef __RDPID__ +#pragma GCC push_options +#pragma GCC target("rdpid") +#define __DISABLE_RDPID__ +#endif /* __RDPID__ */ +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_rdpid_u32 (void) +{ + return __builtin_ia32_rdpid (); +} +#ifdef __DISABLE_RDPID__ +#undef __DISABLE_RDPID__ +#pragma GCC pop_options +#endif /* __DISABLE_RDPID__ */ + +#ifdef __x86_64__ + +#ifndef __FSGSBASE__ +#pragma GCC push_options +#pragma GCC target("fsgsbase") +#define __DISABLE_FSGSBASE__ +#endif /* __FSGSBASE__ */ +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_readfsbase_u32 (void) +{ + return __builtin_ia32_rdfsbase32 (); +} + +extern __inline unsigned long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_readfsbase_u64 (void) +{ + return __builtin_ia32_rdfsbase64 (); +} + +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_readgsbase_u32 (void) +{ + return __builtin_ia32_rdgsbase32 (); +} + +extern __inline unsigned long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_readgsbase_u64 (void) +{ + return __builtin_ia32_rdgsbase64 (); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_writefsbase_u32 (unsigned int __B) +{ + __builtin_ia32_wrfsbase32 (__B); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_writefsbase_u64 (unsigned long long __B) +{ + __builtin_ia32_wrfsbase64 (__B); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_writegsbase_u32 (unsigned int __B) +{ + __builtin_ia32_wrgsbase32 (__B); +} + +extern __inline void +__attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) +_writegsbase_u64 (unsigned long long __B) +{ + __builtin_ia32_wrgsbase64 (__B); +} +#ifdef __DISABLE_FSGSBASE__ +#undef __DISABLE_FSGSBASE__ +#pragma GCC pop_options +#endif /* __DISABLE_FSGSBASE__ */ + +#ifndef __RDRND__ +#pragma GCC push_options +#pragma GCC target("rdrnd") +#define __DISABLE_RDRND__ +#endif /* __RDRND__ */ +extern __inline int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_rdrand64_step (unsigned long long *__P) +{ + return __builtin_ia32_rdrand64_step (__P); +} +#ifdef __DISABLE_RDRND__ +#undef __DISABLE_RDRND__ +#pragma GCC pop_options +#endif /* __DISABLE_RDRND__ */ + +#endif /* __x86_64__ */ + +#ifndef __PTWRITE__ +#pragma GCC push_options +#pragma GCC target("ptwrite") +#define __DISABLE_PTWRITE__ +#endif + +#ifdef __x86_64__ +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_ptwrite64 (unsigned long long __B) +{ + __builtin_ia32_ptwrite64 (__B); +} +#endif /* __x86_64__ */ + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_ptwrite32 (unsigned __B) +{ + __builtin_ia32_ptwrite32 (__B); +} +#ifdef __DISABLE_PTWRITE__ +#undef __DISABLE_PTWRITE__ +#pragma GCC pop_options +#endif /* __DISABLE_PTWRITE__ */ + +#endif /* __iamcu__ */ + +#endif /* _X86GPRINTRIN_H_INCLUDED. */ diff --git a/gcc/config/i386/x86intrin.h b/gcc/config/i386/x86intrin.h index 59fdceb..bc6cb40 100644 --- a/gcc/config/i386/x86intrin.h +++ b/gcc/config/i386/x86intrin.h @@ -24,7 +24,7 @@ #ifndef _X86INTRIN_H_INCLUDED #define _X86INTRIN_H_INCLUDED -#include <ia32intrin.h> +#include <x86gprintrin.h> #ifndef __iamcu__ @@ -37,16 +37,6 @@ #include <xopintrin.h> -#include <lwpintrin.h> - -#include <tbmintrin.h> - -#include <popcntintrin.h> - -#include <mwaitxintrin.h> - -#include <clzerointrin.h> - #endif /* __iamcu__ */ #endif /* _X86INTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/xsavecintrin.h b/gcc/config/i386/xsavecintrin.h index 039e215..06c9f36 100644 --- a/gcc/config/i386/xsavecintrin.h +++ b/gcc/config/i386/xsavecintrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#if !defined _IMMINTRIN_H_INCLUDED -# error "Never use <xsavecintrin.h> directly; include <immintrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <xsavecintrin.h> directly; include <x86gprintrin.h> instead." #endif #ifndef _XSAVECINTRIN_H_INCLUDED diff --git a/gcc/config/i386/xsaveintrin.h b/gcc/config/i386/xsaveintrin.h index 9f0b8bb..f9cac0d 100644 --- a/gcc/config/i386/xsaveintrin.h +++ b/gcc/config/i386/xsaveintrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#if !defined _IMMINTRIN_H_INCLUDED -# error "Never use <xsaveintrin.h> directly; include <immintrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <xsaveintrin.h> directly; include <x86gprintrin.h> instead." #endif #ifndef _XSAVEINTRIN_H_INCLUDED diff --git a/gcc/config/i386/xsaveoptintrin.h b/gcc/config/i386/xsaveoptintrin.h index 9da3297..4f2756b 100644 --- a/gcc/config/i386/xsaveoptintrin.h +++ b/gcc/config/i386/xsaveoptintrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#if !defined _IMMINTRIN_H_INCLUDED -# error "Never use <xsaveoptintrin.h> directly; include <immintrin.h> instead." 
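/* Usage sketch (not part of the patch): the per-feature GPR intrinsic
   headers now #error unless _X86GPRINTRIN_H_INCLUDED is defined, so user
   code includes the new umbrella header (or keeps using <x86intrin.h>,
   which now pulls it in) instead of the individual files:

       #include <x86gprintrin.h>

       unsigned long long
       next_random (void)
       {
         unsigned long long r;
         while (!_rdrand64_step (&r))   /* defined above; needs -mrdrnd,
                                           64-bit only */
           ;
         return r;
       }

   The same applies to the FSGSBASE, PTWRITE, XSAVE*, WAITPKG and related
   intrinsics collected in this header.  */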
+#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <xsaveoptintrin.h> directly; include <x86gprintrin.h> instead." #endif #ifndef _XSAVEOPTINTRIN_H_INCLUDED diff --git a/gcc/config/i386/xsavesintrin.h b/gcc/config/i386/xsavesintrin.h index 264f1c4..629a1f3 100644 --- a/gcc/config/i386/xsavesintrin.h +++ b/gcc/config/i386/xsavesintrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#if !defined _IMMINTRIN_H_INCLUDED -# error "Never use <xsavesintrin.h> directly; include <immintrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <xsavesintrin.h> directly; include <x86gprintrin.h> instead." #endif #ifndef _XSAVESINTRIN_H_INCLUDED diff --git a/gcc/config/i386/xtestintrin.h b/gcc/config/i386/xtestintrin.h index cb187e4..757cc34 100644 --- a/gcc/config/i386/xtestintrin.h +++ b/gcc/config/i386/xtestintrin.h @@ -21,8 +21,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#ifndef _IMMINTRIN_H_INCLUDED -# error "Never use <xtestintrin.h> directly; include <immintrin.h> instead." +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <xtestintrin.h> directly; include <x86gprintrin.h> instead." #endif #ifndef _XTESTINTRIN_H_INCLUDED diff --git a/gcc/config/linux-protos.h b/gcc/config/linux-protos.h index 3759187..c52778b 100644 --- a/gcc/config/linux-protos.h +++ b/gcc/config/linux-protos.h @@ -19,4 +19,4 @@ along with GCC; see the file COPYING3. If not see extern bool linux_has_ifunc_p (void); -extern bool linux_libc_has_function (enum function_class fn_class); +extern bool linux_libc_has_function (enum function_class fn_class, tree); diff --git a/gcc/config/linux.c b/gcc/config/linux.c index 9876153..83ffff4 100644 --- a/gcc/config/linux.c +++ b/gcc/config/linux.c @@ -25,7 +25,8 @@ along with GCC; see the file COPYING3. If not see #include "linux-protos.h" bool -linux_libc_has_function (enum function_class fn_class) +linux_libc_has_function (enum function_class fn_class, + tree type ATTRIBUTE_UNUSED) { if (OPTION_GLIBC || OPTION_MUSL) return true; diff --git a/gcc/config/mips/mips.md b/gcc/config/mips/mips.md index 6383a31..2e75a72 100644 --- a/gcc/config/mips/mips.md +++ b/gcc/config/mips/mips.md @@ -6562,9 +6562,19 @@ /* This bit is similar to expand_builtin_longjmp except that it restores $gp as well. */ - mips_emit_move (hard_frame_pointer_rtx, fp); mips_emit_move (pv, lab); + /* Restore the frame pointer and stack pointer and gp. We must use a + temporary since the setjmp buffer may be a local. */ + fp = copy_to_reg (fp); + gpv = copy_to_reg (gpv); emit_stack_restore (SAVE_NONLOCAL, stack); + + /* Ensure the frame pointer move is not optimized. 
*/ + emit_insn (gen_blockage ()); + emit_clobber (hard_frame_pointer_rtx); + emit_clobber (frame_pointer_rtx); + emit_clobber (gp); + mips_emit_move (hard_frame_pointer_rtx, fp); mips_emit_move (gp, gpv); emit_use (hard_frame_pointer_rtx); emit_use (stack_pointer_rtx); diff --git a/gcc/config/mmix/mmix.c b/gcc/config/mmix/mmix.c index 68f7ec3..4c4fb21 100644 --- a/gcc/config/mmix/mmix.c +++ b/gcc/config/mmix/mmix.c @@ -220,6 +220,9 @@ static HOST_WIDE_INT mmix_starting_frame_offset (void); #undef TARGET_ASM_OUTPUT_SOURCE_FILENAME #define TARGET_ASM_OUTPUT_SOURCE_FILENAME mmix_asm_output_source_filename +#undef TARGET_ASM_OUTPUT_IDENT +#define TARGET_ASM_OUTPUT_IDENT default_asm_output_ident_directive + #undef TARGET_INIT_LIBFUNCS #define TARGET_INIT_LIBFUNCS mmix_init_libfuncs @@ -318,6 +321,32 @@ mmix_option_override (void) (flag_pic > 1) ? "PIC" : "pic"); flag_pic = 0; } + + /* Don't bother with mmixal-compatible syntax if it's likely that a + certain format of the assembly is expected, like no new-line + after the .byte (or BYTE) parameter, when scanning debug-info + output, as happens in many places in the gcc testsuite. The + dwarf2 output code (maybe others) takes a shortcut based on the + presence of certain assembler directives, instead of calling + assemble_integer. Not worthwhile editing the test-cases: + mixed-syntax assembly output already looks too ugly for the + intent of being readable, and the resulting mix certainly fails + the intent of being compatible with mmixal. See + varasm.c:default_file_start for this triple. See also + mmix_assemble_integer. */ + if (flag_verbose_asm || flag_debug_asm || flag_dump_rtl_in_asm) + { + /* "Reinstate" the defaults from target-def.h that we + overrode. */ + targetm.asm_out.byte_op = "\t.byte\t"; + targetm.asm_out.aligned_op.hi = "\t.short\t"; + targetm.asm_out.aligned_op.si = "\t.long\t"; + + /* Note that TARGET_ASM_ALIGNED_DI_OP is default NULL, so + there's nothing to "reinstate". Still, we add the universal + default (with "recent" gas) for an address. */ + targetm.asm_out.aligned_op.di = "\t.dc.a\t"; + } } /* INIT_EXPANDERS. */ @@ -1379,10 +1408,11 @@ mmix_assemble_integer (rtx x, unsigned int size, int aligned_p) that's ok, because we can punt to generic functions. We then pretend that aligned data isn't needed, so the usual .<pseudo> syntax is used (which works for aligned data too). We actually - *must* do that, since we say we don't have simple aligned - pseudos, causing this function to be called. We just try and - keep as much compatibility as possible with mmixal syntax for - normal cases (i.e. without GNU extensions and C only). */ + *must* do that, since we (usually) say we don't have simple aligned + pseudos, causing this function to be called. See + mmix_option_override for an exception. We just try and keep as + much compatibility as possible with mmixal syntax for normal + cases (i.e. without GNU extensions and C only). */ case 1: if (GET_CODE (x) != CONST_INT) { @@ -1987,6 +2017,7 @@ mmix_expand_prologue (void) + crtl->args.pretend_args_size + locals_size + 7) & ~7; HOST_WIDE_INT offset = -8; + HOST_WIDE_INT total_allocated_stack_space = 0; /* Add room needed to save global non-register-stack registers. */ for (regno = 255; @@ -2036,6 +2067,8 @@ mmix_expand_prologue (void) ? (256 - 8) : stack_space_to_allocate; mmix_emit_sp_add (-stack_chunk); + total_allocated_stack_space += stack_chunk; + offset += stack_chunk; stack_space_to_allocate -= stack_chunk; } @@ -2064,6 +2097,7 @@ mmix_expand_prologue (void) ? 
(256 - 8 - 8) : stack_space_to_allocate; mmix_emit_sp_add (-stack_chunk); + total_allocated_stack_space += stack_chunk; offset += stack_chunk; stack_space_to_allocate -= stack_chunk; @@ -2099,6 +2133,7 @@ mmix_expand_prologue (void) ? (256 - 8 - 8) : stack_space_to_allocate; mmix_emit_sp_add (-stack_chunk); + total_allocated_stack_space += stack_chunk; offset += stack_chunk; stack_space_to_allocate -= stack_chunk; @@ -2143,6 +2178,7 @@ mmix_expand_prologue (void) ? (256 - 8 - 8) : stack_space_to_allocate; mmix_emit_sp_add (-stack_chunk); + total_allocated_stack_space += stack_chunk; offset += stack_chunk; stack_space_to_allocate -= stack_chunk; @@ -2193,6 +2229,8 @@ mmix_expand_prologue (void) ? (256 - offset - 8) : stack_space_to_allocate); mmix_emit_sp_add (-stack_chunk); + total_allocated_stack_space += stack_chunk; + offset += stack_chunk; stack_space_to_allocate -= stack_chunk; } @@ -2210,6 +2248,14 @@ mmix_expand_prologue (void) wasn't allocated above. */ if (stack_space_to_allocate) mmix_emit_sp_add (-stack_space_to_allocate); + total_allocated_stack_space += stack_space_to_allocate; + + /* Let's assume that reporting the usage of the regular stack on its + own, is more useful than either not supporting -fstack-usage or + reporting the sum of the usages of the regular stack and the + register stack. */ + if (flag_stack_usage_info) + current_function_static_stack_size = total_allocated_stack_space; } /* Expands the function epilogue into RTX. */ diff --git a/gcc/config/mmix/mmix.h b/gcc/config/mmix/mmix.h index dd04dd3..ac0be10 100644 --- a/gcc/config/mmix/mmix.h +++ b/gcc/config/mmix/mmix.h @@ -577,6 +577,9 @@ typedef struct { int regs; int lib; } CUMULATIVE_ARGS; #define SLOW_BYTE_ACCESS 0 +/* A PUSHJ doesn't cost more than a PUSHGO, so don't needlessly create + the latter. */ +#define NO_FUNCTION_CSE 1 /* Node: Sections */ @@ -617,6 +620,11 @@ typedef struct { int regs; int lib; } CUMULATIVE_ARGS; #define ASM_OUTPUT_ASCII(STREAM, PTR, LEN) \ mmix_asm_output_ascii (STREAM, PTR, LEN) +/* Make output more ELF-like, by emitting .hidden for hidden symbols + (which don't really matter for mmix-knuth-mmixware). */ +#define ASM_OUTPUT_EXTERNAL(FILE, DECL, NAME) \ + default_elf_asm_output_external (FILE, DECL, NAME) + /* Node: Uninitialized Data */ #define ASM_OUTPUT_ALIGNED_COMMON(ST, N, S, A) \ diff --git a/gcc/config/mmix/mmix.md b/gcc/config/mmix/mmix.md index d49297a..f41a5b2 100644 --- a/gcc/config/mmix/mmix.md +++ b/gcc/config/mmix/mmix.md @@ -38,6 +38,8 @@ (MMIX_rR_REGNUM 260) (MMIX_fp_rO_OFFSET -24)] ) + +(define_mode_iterator MM [QI HI SI DI SF DF]) ;; Operand and operator predicates. @@ -46,10 +48,25 @@ ;; FIXME: Can we remove the reg-to-reg for smaller modes? Shouldn't they ;; be synthesized ok? -(define_insn "movqi" +(define_expand "mov<mode>" + [(set (match_operand:MM 0 "nonimmediate_operand") + (match_operand:MM 1 "general_operand"))] + "" +{ + /* Help pre-register-allocation to use at least one register in a move. + FIXME: support STCO also for DFmode (storing 0.0). 
*/ + if (!REG_P (operands[0]) && !REG_P (operands[1]) + && (<MODE>mode != DImode + || !memory_operand (operands[0], DImode) + || !satisfies_constraint_I (operands[1]))) + operands[1] = force_reg (<MODE>mode, operands[1]); +}) + +(define_insn "*movqi_expanded" [(set (match_operand:QI 0 "nonimmediate_operand" "=r,r ,r,x ,r,r,m,??r") (match_operand:QI 1 "general_operand" "r,LS,K,rI,x,m,r,n"))] - "" + "register_operand (operands[0], QImode) + || register_operand (operands[1], QImode)" "@ SET %0,%1 %s1 %0,%v1 @@ -60,10 +77,11 @@ STBU %1,%0 %r0%I1") -(define_insn "movhi" +(define_insn "*movhi_expanded" [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r ,r ,x,r,r,m,??r") (match_operand:HI 1 "general_operand" "r,LS,K,r,x,m,r,n"))] - "" + "register_operand (operands[0], HImode) + || register_operand (operands[1], HImode)" "@ SET %0,%1 %s1 %0,%v1 @@ -75,10 +93,11 @@ %r0%I1") ;; gcc.c-torture/compile/920428-2.c fails if there's no "n". -(define_insn "movsi" +(define_insn "*movsi_expanded" [(set (match_operand:SI 0 "nonimmediate_operand" "=r,r ,r,x,r,r,m,??r") (match_operand:SI 1 "general_operand" "r,LS,K,r,x,m,r,n"))] - "" + "register_operand (operands[0], SImode) + || register_operand (operands[1], SImode)" "@ SET %0,%1 %s1 %0,%v1 @@ -90,10 +109,13 @@ %r0%I1") ;; We assume all "s" are addresses. Does that hold? -(define_insn "movdi" +(define_insn "*movdi_expanded" [(set (match_operand:DI 0 "nonimmediate_operand" "=r,r ,r,x,r,m,r,m,r,r,??r") (match_operand:DI 1 "general_operand" "r,LS,K,r,x,I,m,r,R,s,n"))] - "" + "register_operand (operands[0], DImode) + || register_operand (operands[1], DImode) + || (memory_operand (operands[0], DImode) + && satisfies_constraint_I (operands[1]))" "@ SET %0,%1 %s1 %0,%v1 @@ -109,10 +131,11 @@ ;; Note that we move around the float as a collection of bits; no ;; conversion to double. -(define_insn "movsf" +(define_insn "*movsf_expanded" [(set (match_operand:SF 0 "nonimmediate_operand" "=r,r,x,r,r,m,??r") (match_operand:SF 1 "general_operand" "r,G,r,x,m,r,F"))] - "" + "register_operand (operands[0], SFmode) + || register_operand (operands[1], SFmode)" "@ SET %0,%1 SETL %0,0 @@ -122,10 +145,11 @@ STTU %1,%0 %r0%I1") -(define_insn "movdf" +(define_insn "*movdf_expanded" [(set (match_operand:DF 0 "nonimmediate_operand" "=r,r,x,r,r,m,??r") (match_operand:DF 1 "general_operand" "r,G,r,x,m,r,F"))] - "" + "register_operand (operands[0], DFmode) + || register_operand (operands[1], DFmode)" "@ SET %0,%1 SETL %0,0 diff --git a/gcc/config/msp430/constraints.md b/gcc/config/msp430/constraints.md index 14368f7..b8f9674 100644 --- a/gcc/config/msp430/constraints.md +++ b/gcc/config/msp430/constraints.md @@ -25,15 +25,16 @@ "Register R13.") (define_constraint "K" - "Integer constant 1." + "Integer constant 1-19." (and (match_code "const_int") - (match_test "IN_RANGE (ival, 1, 1)"))) + (match_test "IN_RANGE (ival, 1, 19)"))) (define_constraint "L" "Integer constant -1^20..1^19." (and (match_code "const_int") (match_test "IN_RANGE (ival, HOST_WIDE_INT_M1U << 20, 1 << 19)"))) +;; Valid shift amount for RRUM, RRAM, RLAM, RRCM. (define_constraint "M" "Integer constant 1-4." (and (match_code "const_int") @@ -49,6 +50,11 @@ (and (match_code "const_int") (match_test "IN_RANGE (ival, 256, 65535)"))) +(define_constraint "P" + "Integer constant 1-16." 
+ (and (match_code "const_int") + (match_test "IN_RANGE (ival, 1, 16)"))) + ;; We do not allow arbitrary constants, eg symbols or labels, ;; because their address may be above the 16-bit address limit ;; supported by the offset used in the MOVA instruction. diff --git a/gcc/config/msp430/msp430-opts.h b/gcc/config/msp430/msp430-opts.h index 4d20830..fa64677 100644 --- a/gcc/config/msp430/msp430-opts.h +++ b/gcc/config/msp430/msp430-opts.h @@ -29,6 +29,18 @@ enum msp430_hwmult_types MSP430_HWMULT_F5SERIES }; +enum msp430_cpu_types +{ + MSP430_CPU_MSP430, + MSP430_CPU_430, + MSP430_CPU_MSP430X_DEFAULT, /* The default setting, which will be overriden + by any other -mcpu= value. */ + MSP430_CPU_MSP430X, + MSP430_CPU_430X, + MSP430_CPU_MSP430XV2, + MSP430_CPU_430XV2 +}; + enum msp430_regions { MSP430_REGION_ANY, diff --git a/gcc/config/msp430/msp430-protos.h b/gcc/config/msp430/msp430-protos.h index 29ce9bab..0b4d9a4 100644 --- a/gcc/config/msp430/msp430-protos.h +++ b/gcc/config/msp430/msp430-protos.h @@ -21,7 +21,6 @@ #ifndef GCC_MSP430_PROTOS_H #define GCC_MSP430_PROTOS_H -bool msp430_do_not_relax_short_jumps (void); rtx msp430_eh_return_stackadj_rtx (void); void msp430_expand_eh_return (rtx); void msp430_expand_epilogue (int); @@ -36,7 +35,6 @@ rtx msp430_incoming_return_addr_rtx (void); void msp430_init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int); int msp430_initial_elimination_offset (int, int); bool msp430_is_interrupt_func (void); -const char * msp430x_logical_shift_right (rtx); const char * msp430_mcu_name (void); void msp430_output_aligned_decl_common (FILE *, const tree, const char *, unsigned HOST_WIDE_INT, unsigned, @@ -52,4 +50,9 @@ bool msp430_use_f5_series_hwmult (void); bool msp430_has_hwmult (void); bool msp430_op_not_in_high_mem (rtx op); +#ifdef RTX_CODE +int msp430_expand_shift (enum rtx_code code, machine_mode mode, rtx *operands); +const char * msp430_output_asm_shift_insns (enum rtx_code code, machine_mode mode, rtx *operands); +#endif + #endif /* GCC_MSP430_PROTOS_H */ diff --git a/gcc/config/msp430/msp430.c b/gcc/config/msp430/msp430.c index 6bb1714..de4b16b 100644 --- a/gcc/config/msp430/msp430.c +++ b/gcc/config/msp430/msp430.c @@ -160,15 +160,7 @@ msp430_option_override (void) init_machine_status = msp430_init_machine_status; - if (target_cpu) - { - /* gcc/common/config/msp430-common.c will have - already canonicalised the string in target_cpu. */ - if (strcasecmp (target_cpu, "msp430x") == 0) - msp430x = true; - else /* target_cpu == "msp430" - already handled by the front end. */ - msp430x = false; - } + msp430x = target_cpu >= MSP430_CPU_MSP430X_DEFAULT; if (target_mcu) { @@ -180,7 +172,7 @@ msp430_option_override (void) if (msp430_warn_mcu) { - if (target_cpu && msp430x != xisa) + if (target_cpu != MSP430_CPU_MSP430X_DEFAULT && msp430x != xisa) warning (0, "MCU %qs supports %s ISA but %<-mcpu%> option " "is set to %s", target_mcu, xisa ? "430X" : "430", @@ -212,7 +204,10 @@ msp430_option_override (void) "but %<-mhwmult%> is set to f5series", target_mcu, hwmult_name (extracted_mcu_data.hwmpy)); } - msp430x = xisa; + /* Only override the default setting with the extracted ISA value if + the user has not passed -mcpu=. 
*/ + if (target_cpu == MSP430_CPU_MSP430X_DEFAULT) + msp430x = xisa; } else { @@ -220,10 +215,10 @@ msp430_option_override (void) { if (msp430_warn_mcu) { - if (target_cpu == NULL) + if (target_cpu == MSP430_CPU_MSP430X_DEFAULT) warning (0, "Unrecognized MCU name %qs, assuming that it is " - "just a MSP430 with no hardware multiply.\n" + "just a MSP430X with no hardware multiply.\n" "Use the %<-mcpu%> and %<-mhwmult%> options to " "set these explicitly.", target_mcu); @@ -237,27 +232,20 @@ msp430_option_override (void) msp430_hwmult_type = MSP430_HWMULT_NONE; } - else if (target_cpu == NULL) + else if (target_cpu == MSP430_CPU_MSP430X_DEFAULT) { if (msp430_warn_mcu) warning (0, "Unrecognized MCU name %qs, assuming that it just " - "supports the MSP430 ISA.\nUse the %<-mcpu%> option " + "supports the MSP430X ISA.\nUse the %<-mcpu%> option " "to set the ISA explicitly.", target_mcu); - - msp430x = false; } else if (msp430_warn_mcu) warning (0, "Unrecognized MCU name %qs.", target_mcu); } } - /* The F5 series are all able to support the 430X ISA. */ - if (target_cpu == NULL && target_mcu == NULL - && msp430_hwmult_type == MSP430_HWMULT_F5SERIES) - msp430x = true; - if (TARGET_LARGE && !msp430x) error ("%<-mlarge%> requires a 430X-compatible %<-mmcu=%>"); @@ -1064,15 +1052,6 @@ static bool msp430_rtx_costs (rtx x ATTRIBUTE_UNUSED, return true; } break; - case ASHIFT: - case ASHIFTRT: - case LSHIFTRT: - if (!msp430x) - { - *total = COSTS_N_INSNS (100); - return true; - } - break; } return false; } @@ -1716,9 +1695,9 @@ increment_stack (HOST_WIDE_INT amount) { inc = GEN_INT (amount); if (TARGET_LARGE) - emit_insn (gen_addpsi3 (sp, sp, inc)); + F (emit_insn (gen_addpsi3 (sp, sp, inc))); else - emit_insn (gen_addhi3 (sp, sp, inc)); + F (emit_insn (gen_addhi3 (sp, sp, inc))); } } @@ -2112,7 +2091,7 @@ msp430_output_aligned_decl_common (FILE * stream, static void msp430_file_end (void) { -#ifdef HAVE_AS_GNU_ATTRIBUTE +#ifdef HAVE_AS_MSPABI_ATTRIBUTE /* Enum for tag names. */ enum { @@ -2151,7 +2130,7 @@ msp430_file_end (void) OFBA_MSPABI_Tag_Data_Model, TARGET_LARGE ? OFBA_MSPABI_Val_Model_Large : OFBA_MSPABI_Val_Model_Small); -#ifdef HAVE_AS_MSPABI_ATTRIBUTE +#ifdef HAVE_AS_GNU_ATTRIBUTE /* Emit .gnu_attribute directive for Tag_GNU_MSP430_Data_Region. */ fprintf (asm_out_file, "\t%s %d, %d\n", gnu_attr, Tag_GNU_MSP430_Data_Region, msp430_data_region == MSP430_REGION_LOWER @@ -2161,19 +2140,6 @@ msp430_file_end (void) #endif } -bool -msp430_do_not_relax_short_jumps (void) -{ - /* When placing code into "either" low or high memory we do not want the - linker to grow the size of sections, which it can do if it is encounters a - branch to a label that is too far away. So we tell the cbranch patterns to - avoid using short jumps when there is a chance that the instructions will - end up in a low section. */ - return - msp430_code_region == MSP430_REGION_EITHER - || has_attr (ATTR_EITHER, current_function_decl); -} - enum msp430_builtin { MSP430_BUILTIN_BIC_SR, @@ -2442,6 +2408,8 @@ msp430_expand_prologue (void) for (i = 15; i >= 4; i--) if (cfun->machine->need_to_save[i]) { + /* We need to save COUNT sequential registers starting from regnum + I. */ int seq, count; rtx note; @@ -2456,6 +2424,7 @@ msp430_expand_prologue (void) p = F (emit_insn (gen_pushm (gen_rtx_REG (Pmode, i), GEN_INT (count)))); + /* Document the stack decrement as a result of PUSHM. 
*/ note = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (count + 1)); XVECEXP (note, 0, 0) @@ -2504,8 +2473,10 @@ msp430_expand_prologue (void) void msp430_expand_epilogue (int is_eh) { - int i; + int i, j; int fs; + rtx sp = stack_pointer_rtx; + rtx p; int helper_n = 0; if (is_naked_func ()) @@ -2574,19 +2545,27 @@ msp430_expand_epilogue (int is_eh) for (i = 4; i <= 15; i++) if (cfun->machine->need_to_save[i]) { - int seq, count; + /* We need to restore COUNT sequential registers starting from regnum + I. */ + int seq; + int count = 1; + int helper_used = 0; + rtx note, addr; - for (seq = i + 1; seq <= 15 && cfun->machine->need_to_save[seq]; seq ++) - ; - count = seq - i; + if (msp430x) + { + for (seq = i + 1; seq <= 15 && cfun->machine->need_to_save[seq]; + seq++) + ; + count = seq - i; + } if (msp430x) { /* Note: With TARGET_LARGE we still use POPM as POPX.A is two bytes bigger. */ - emit_insn (gen_popm (stack_pointer_rtx, GEN_INT (seq - 1), - GEN_INT (count))); - i += count - 1; + p = F (emit_insn (gen_popm (stack_pointer_rtx, GEN_INT (seq - 1), + GEN_INT (count)))); } else if (i == 11 - helper_n && ! msp430_is_interrupt_func () @@ -2598,11 +2577,44 @@ msp430_expand_epilogue (int is_eh) && helper_n > 1 && !is_eh) { - emit_jump_insn (gen_epilogue_helper (GEN_INT (helper_n))); - return; + p = F (emit_jump_insn (gen_epilogue_helper (GEN_INT (helper_n)))); + count = helper_n; + helper_used = 1; } else - emit_insn (gen_pop (gen_rtx_REG (Pmode, i))); + p = F (emit_insn (gen_pop (gen_rtx_REG (Pmode, i)))); + + /* Document the stack increment as a result of POPM. */ + note = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (count + 1)); + + addr = gen_rtx_PLUS (Pmode, stack_pointer_rtx, + GEN_INT (count * (TARGET_LARGE ? 4 : 2))); + + XVECEXP (note, 0, 0) = F (gen_rtx_SET (stack_pointer_rtx, addr)); + + + /* *sp++ = R[i+j] */ + /* sp R4 + ... + sp+N R10. */ + for (j = 0; j < count; j++) + { + int ofs = j * (TARGET_LARGE ? 4 : 2); + + if (ofs) + addr = gen_rtx_PLUS (Pmode, sp, GEN_INT (ofs)); + else + addr = stack_pointer_rtx; + + XVECEXP (note, 0, j + 1) + = F (gen_rtx_SET (gen_rtx_MEM (Pmode, addr), + gen_rtx_REG (Pmode, i + j))); + } + add_reg_note (p, REG_FRAME_RELATED_EXPR, note); + i += count - 1; + + if (helper_used) + return; } if (is_eh) @@ -2687,32 +2699,6 @@ msp430_init_dwarf_reg_sizes_extra (tree address) } } -/* This is a list of MD patterns that implement fixed-count shifts. */ -static struct -{ - const char *name; - int count; - int need_430x; - rtx (*genfunc)(rtx,rtx); -} -const_shift_helpers[] = -{ -#define CSH(N,C,X,G) { "__mspabi_" N, C, X, gen_##G } - - CSH ("slli", 1, 1, slli_1), - CSH ("slll", 1, 1, slll_1), - CSH ("slll", 2, 1, slll_2), - - CSH ("srai", 1, 0, srai_1), - CSH ("sral", 1, 0, sral_1), - CSH ("sral", 2, 0, sral_2), - - CSH ("srll", 1, 0, srll_1), - CSH ("srll", 2, 1, srll_2x), - { 0, 0, 0, 0 } -#undef CSH -}; - /* The MSP430 ABI defines a number of helper functions that should be used for, for example, 32-bit shifts. This function is called to emit such a function, using the table above to optimize some @@ -2729,31 +2715,12 @@ msp430_expand_helper (rtx *operands, const char *helper_name, machine_mode arg0mode = GET_MODE (operands[0]); machine_mode arg1mode = GET_MODE (operands[1]); machine_mode arg2mode = GET_MODE (operands[2]); - int have_430x = msp430x ? 1 : 0; int expand_mpy = strncmp (helper_name, "__mspabi_mpy", sizeof ("__mspabi_mpy") - 1) == 0; /* This function has been used incorrectly if CONST_VARIANTS is TRUE for a hwmpy function. 
*/ gcc_assert (!(expand_mpy && const_variants)); - /* Emit size-optimal insns for small shifts we can easily do inline. */ - if (CONST_INT_P (operands[2]) && !expand_mpy) - { - int i; - - for (i=0; const_shift_helpers[i].name; i++) - { - if (const_shift_helpers[i].need_430x <= have_430x - && strcmp (helper_name, const_shift_helpers[i].name) == 0 - && INTVAL (operands[2]) == const_shift_helpers[i].count) - { - emit_insn (const_shift_helpers[i].genfunc (operands[0], - operands[1])); - return; - } - } - } - if (arg1mode != VOIDmode && arg2mode != VOIDmode) /* Modes of arguments must be equal if not constants. */ gcc_assert (arg1mode == arg2mode); @@ -2848,6 +2815,190 @@ msp430_expand_helper (rtx *operands, const char *helper_name, gen_rtx_REG (arg0mode, 12)); } +/* Return TRUE if the helper function should be used and FALSE if the shifts + insns should be emitted inline. */ +static bool +use_helper_for_const_shift (enum rtx_code code, machine_mode mode, + HOST_WIDE_INT amt) +{ + const int default_inline_shift = 4; + /* We initialize the option to 65 so we know if the user set it or not. */ + int user_set_max_inline = (msp430_max_inline_shift == 65 ? 0 : 1); + int max_inline = (user_set_max_inline ? msp430_max_inline_shift + : default_inline_shift); + /* 32-bit shifts are roughly twice as costly as 16-bit shifts so we adjust + the heuristic accordingly. */ + int max_inline_32 = max_inline / 2; + + /* Don't use helpers for these modes on 430X, when optimizing for speed, or + when emitting a small number of insns. */ + if ((mode == E_QImode || mode == E_HImode || mode == E_PSImode) + && (msp430x + /* If the user set max_inline then we always obey that number. + Otherwise we always emit the shifts inline at -O2 and above. */ + || amt <= max_inline + || (!user_set_max_inline + && (optimize >= 2 && !optimize_size)))) + return false; + + /* 430 and 430X codegen for SImode shifts is the same. + Set a hard limit of 15 for the number of shifts that will be emitted + inline by default, even at -O2 and above, to prevent code size + explosion. */ + if (mode == E_SImode + && (amt <= max_inline_32 + || (!user_set_max_inline + && (optimize >= 2 && !optimize_size) + && amt <= 15))) + return false; + + return true; +} + +/* For shift operations which will use an mspabi helper function, setup the + call to msp430_expand helper. Return 1 to indicate we have finished with + this insn and invoke "DONE". + Otherwise return 0 to indicate the insn should fallthrough. + Never FAIL. */ +int +msp430_expand_shift (enum rtx_code code, machine_mode mode, rtx *operands) +{ + /* Always use the helper function when the shift amount is not a + constant. */ + if (!CONST_INT_P (operands[2]) + || mode == E_DImode + || use_helper_for_const_shift (code, mode, INTVAL (operands[2]))) + { + const char *helper_name = NULL; + /* The const variants of mspabi shifts have significantly larger code + size than the generic version, so use the generic version if + optimizing for size. */ + bool const_variant = !optimize_size; + switch (mode) + { + case E_HImode: + helper_name = (code == ASHIFT ? "__mspabi_slli" : + (code == ASHIFTRT ? "__mspabi_srai" : + (code == LSHIFTRT ? "__mspabi_srli" : + NULL))); + break; + case E_PSImode: + helper_name = (code == ASHIFT ? "__gnu_mspabi_sllp" : + (code == ASHIFTRT ? "__gnu_mspabi_srap" : + (code == LSHIFTRT ? "__gnu_mspabi_srlp" : + NULL))); + /* No const variant for PSImode shifts FIXME. */ + const_variant = false; + break; + case E_SImode: + helper_name = (code == ASHIFT ? 
"__mspabi_slll" : + (code == ASHIFTRT ? "__mspabi_sral" : + (code == LSHIFTRT ? "__mspabi_srll" : + NULL))); + break; + case E_DImode: + helper_name = (code == ASHIFT ? "__mspabi_sllll" : + (code == ASHIFTRT ? "__mspabi_srall" : + (code == LSHIFTRT ? "__mspabi_srlll" : + NULL))); + /* No const variant for DImode shifts. */ + const_variant = false; + break; + default: + gcc_unreachable (); + break; + } + gcc_assert (helper_name); + msp430_expand_helper (operands, helper_name, const_variant); + return 1; + } + /* When returning 0, there must be an insn to match the RTL pattern + otherwise there will be an unrecognizeable insn. */ + return 0; +} + +/* Helper function to emit a sequence of shift instructions. The amount of + shift instructions to emit is in OPERANDS[2]. + For 430 we output copies of identical inline shifts for all modes. + For 430X it is inneficient to do so for any modes except SI and DI, since we + can make use of R*M insns or RPT with 430X insns, so this function is only + used for SImode in that case. */ +const char * +msp430_output_asm_shift_insns (enum rtx_code code, machine_mode mode, + rtx *operands) +{ + int i; + int amt; + int max_shift = GET_MODE_BITSIZE (mode) - 1; + gcc_assert (CONST_INT_P (operands[2])); + amt = INTVAL (operands[2]); + + if (amt == 0 || amt > max_shift) + { + switch (code) + { + case ASHIFT: + output_asm_insn ("# ignored undefined behaviour left shift " + "of %1 by %2", operands); + break; + case ASHIFTRT: + output_asm_insn ("# ignored undefined behaviour arithmetic right " + "shift of %1 by %2", operands); + break; + case LSHIFTRT: + output_asm_insn ("# ignored undefined behaviour logical right shift " + "of %1 by %2", operands); + break; + default: + gcc_unreachable (); + } + return ""; + } + + if (code == ASHIFT) + { + if (!msp430x && mode == HImode) + for (i = 0; i < amt; i++) + output_asm_insn ("RLA.W\t%0", operands); + else if (mode == SImode) + for (i = 0; i < amt; i++) + output_asm_insn ("RLA%X0.W\t%L0 { RLC%X0.W\t%H0", operands); + else + /* Catch unhandled cases. */ + gcc_unreachable (); + } + else if (code == ASHIFTRT) + { + if (!msp430x && mode == HImode) + for (i = 0; i < amt; i++) + output_asm_insn ("RRA.W\t%0", operands); + else if (mode == SImode) + for (i = 0; i < amt; i++) + output_asm_insn ("RRA%X0.W\t%H0 { RRC%X0.W\t%L0", operands); + else + gcc_unreachable (); + } + else if (code == LSHIFTRT) + { + if (!msp430x && mode == HImode) + for (i = 0; i < amt; i++) + output_asm_insn ("CLRC { RRC.W\t%0", operands); + else if (mode == SImode) + for (i = 0; i < amt; i++) + output_asm_insn ("CLRC { RRC%X0.W\t%H0 { RRC%X0.W\t%L0", operands); + /* FIXME: Why doesn't "RRUX.W\t%H0 { RRC%X0.W\t%L0" work for msp430x? + It causes execution timeouts e.g. pr41963.c. */ +#if 0 + else if (msp430x && mode == SImode) + for (i = 0; i < amt; i++) + output_asm_insn ("RRUX.W\t%H0 { RRC%X0.W\t%L0", operands); +#endif + else + gcc_unreachable (); + } + return ""; +} + /* Called by cbranch<mode>4 to coerce operands into usable forms. 
*/ void msp430_fixup_compare_operands (machine_mode my_mode, rtx * operands) @@ -3370,29 +3521,43 @@ msp430_op_not_in_high_mem (rtx op) #undef TARGET_PRINT_OPERAND #define TARGET_PRINT_OPERAND msp430_print_operand -/* A low 16-bits of int/lower of register pair - B high 16-bits of int/higher of register pair - C bits 32-47 of a 64-bit value/reg 3 of a DImode value - D bits 48-63 of a 64-bit value/reg 4 of a DImode value - H like %B (for backwards compatibility) - I inverse of value - J an integer without a # prefix - L like %A (for backwards compatibility) - O offset of the top of the stack - Q like X but generates an A postfix - R inverse of condition code, unsigned. - X X instruction postfix in large mode - Y value - 4 - Z value - 1 - b .B or .W or .A, depending upon the mode - p bit position - r inverse of condition code - x like X but only for pointers. */ +/* A Select low 16-bits of the constant/register/memory operand. + B Select high 16-bits of the constant/register/memory + operand. + C Select bits 32-47 of the constant/register/memory operand. + D Select bits 48-63 of the constant/register/memory operand. + H Equivalent to @code{B} (for backwards compatibility). + I Print the inverse (logical @code{NOT}) of the constant + value. + J Print an integer without a @code{#} prefix. + L Equivalent to @code{A} (for backwards compatibility). + O Offset of the current frame from the top of the stack. + Q Use the @code{A} instruction postfix. + R Inverse of condition code, for unsigned comparisons. + W Subtract 16 from the constant value. + X Use the @code{X} instruction postfix. + Y Subtract 4 from the constant value. + Z Subtract 1 from the constant value. + b Append @code{.B}, @code{.W} or @code{.A} to the + instruction, depending on the mode. + d Offset 1 byte of a memory reference or constant value. + e Offset 3 bytes of a memory reference or constant value. + f Offset 5 bytes of a memory reference or constant value. + g Offset 7 bytes of a memory reference or constant value. + p Print the value of 2, raised to the power of the given + constant. Used to select the specified bit position. + r Inverse of condition code, for signed comparisons. + x Equivialent to @code{X}, but only for pointers. */ static void msp430_print_operand (FILE * file, rtx op, int letter) { rtx addr; + /* These are used by the 'A', 'B', 'C', 'D', 'd', 'e', 'f' and 'g' modifiers + to describe how to process the operand to get the requested value. */ + int mem_off = 0; + int reg_off = 0; + int const_shift = 0; /* We can't use c, n, a, or l. */ switch (letter) @@ -3400,12 +3565,17 @@ msp430_print_operand (FILE * file, rtx op, int letter) case 'Z': gcc_assert (CONST_INT_P (op)); /* Print the constant value, less one. */ - fprintf (file, "#%ld", INTVAL (op) - 1); + fprintf (file, "#%ld", (long) (INTVAL (op) - 1)); return; case 'Y': gcc_assert (CONST_INT_P (op)); /* Print the constant value, less four. */ - fprintf (file, "#%ld", INTVAL (op) - 4); + fprintf (file, "#%ld", (long) (INTVAL (op) - 4)); + return; + case 'W': + gcc_assert (CONST_INT_P (op)); + /* Print the constant value, less 16. */ + fprintf (file, "#%ld", (long) (INTVAL (op) - 16)); return; case 'I': if (GET_CODE (op) == CONST_INT) @@ -3462,76 +3632,71 @@ msp430_print_operand (FILE * file, rtx op, int letter) default: return; } - case 'A': - case 'L': /* Low half. 
*/ - switch (GET_CODE (op)) + case 'd': case 'e': case 'f': case 'g': + if (REG_P (op)) { - case MEM: - op = adjust_address (op, Pmode, 0); - break; - case REG: - break; - case CONST_INT: - op = GEN_INT (INTVAL (op) & 0xffff); - letter = 0; - break; - default: - /* If you get here, figure out a test case :-) */ - gcc_unreachable (); + output_operand_lossage ("%%d, %%e, %%f, %%g operand modifiers are " + "for memory references or constant values " + "only"); + return; } - break; - case 'B': - case 'H': /* high half */ - switch (GET_CODE (op)) + /* fallthru */ + case 'B': case 'H': /* high half */ + case 'C': + case 'D': + switch (letter) { - case MEM: - /* We don't need to adjust the address for post_inc. */ - op = adjust_address (op, Pmode, - (GET_CODE (XEXP (op, 0)) == POST_INC) ? 0 : 2); + case 'd': + mem_off = 1; + const_shift = 8; break; - case REG: - op = gen_rtx_REG (Pmode, REGNO (op) + 1); + case 'B': + case 'H': + mem_off = 2; + reg_off = 1; + const_shift = 16; break; - case CONST_INT: - op = GEN_INT (INTVAL (op) >> 16); - letter = 0; + case 'e': + mem_off = 3; + const_shift = 24; break; - default: - /* If you get here, figure out a test case :-) */ - gcc_unreachable (); - } - break; - case 'C': - switch (GET_CODE (op)) - { - case MEM: - op = adjust_address (op, Pmode, - (GET_CODE (XEXP (op, 0)) == POST_INC) ? 0 : 4); + case 'C': + mem_off = 4; + reg_off = 2; + const_shift = 32; break; - case REG: - op = gen_rtx_REG (Pmode, REGNO (op) + 2); + case 'f': + mem_off = 5; + const_shift = 40; break; - case CONST_INT: - op = GEN_INT ((long long) INTVAL (op) >> 32); - letter = 0; + case 'D': + mem_off = 6; + reg_off = 3; + const_shift = 48; + break; + case 'g': + mem_off = 7; + const_shift = 56; break; default: - /* If you get here, figure out a test case :-) */ gcc_unreachable (); + break; } - break; - case 'D': + /* fallthru */ + case 'A': case 'L': /* Low half. */ switch (GET_CODE (op)) { case MEM: + /* We don't need to adjust the address for post_inc. */ op = adjust_address (op, Pmode, - (GET_CODE (XEXP (op, 0)) == POST_INC) ? 0 : 6); + (GET_CODE (XEXP (op, 0)) == POST_INC) + ? 0 : mem_off); break; case REG: - op = gen_rtx_REG (Pmode, REGNO (op) + 3); + op = gen_rtx_REG (Pmode, REGNO (op) + reg_off); break; case CONST_INT: - op = GEN_INT ((long long) INTVAL (op) >> 48); + op = GEN_INT (((long long) INTVAL (op) >> const_shift) & 0xffff); letter = 0; break; default: @@ -3724,34 +3889,6 @@ msp430x_extendhisi (rtx * operands) return "MOV.W\t%1, %L0 { MOV.W\t%1, %H0 { RPT\t#15 { RRAX.W\t%H0"; } -/* Likewise for logical right shifts. */ -const char * -msp430x_logical_shift_right (rtx amount) -{ - /* The MSP430X's logical right shift instruction - RRUM - does - not use an extension word, so we cannot encode a repeat count. - Try various alternatives to work around this. If the count - is in a register we are stuck, hence the assert. */ - gcc_assert (CONST_INT_P (amount)); - - if (INTVAL (amount) <= 0 - || INTVAL (amount) >= 16) - return "# nop logical shift."; - - if (INTVAL (amount) > 0 - && INTVAL (amount) < 5) - return "rrum.w\t%2, %0"; /* Two bytes. */ - - if (INTVAL (amount) > 4 - && INTVAL (amount) < 9) - return "rrum.w\t#4, %0 { rrum.w\t%Y2, %0 "; /* Four bytes. */ - - /* First we logically shift right by one. Now we know - that the top bit is zero and we can use the arithmetic - right shift instruction to perform the rest of the shift. */ - return "rrum.w\t#1, %0 { rpt\t%Z2 { rrax.w\t%0"; /* Six bytes. */ -} - /* Stop GCC from thinking that it can eliminate (SUBREG:PSI (SI)). 
*/ #undef TARGET_CAN_CHANGE_MODE_CLASS diff --git a/gcc/config/msp430/msp430.h b/gcc/config/msp430/msp430.h index f198981..2500771 100644 --- a/gcc/config/msp430/msp430.h +++ b/gcc/config/msp430/msp430.h @@ -65,8 +65,6 @@ extern bool msp430x; "%{mrelax=-mQ} " /* Pass the relax option on to the assembler. */ \ /* Tell the assembler if we are building for the LARGE pointer model. */ \ "%{mlarge:-ml} " \ - /* Copy data from ROM to RAM if necessary. */ \ - "%{!msim:-md} %{msim:%{mlarge:-md}} " \ "%{msilicon-errata=*:-msilicon-errata=%*} " \ "%{msilicon-errata-warn=*:-msilicon-errata-warn=%*} " \ /* Create DWARF line number sections for -ffunction-sections. */ \ @@ -257,6 +255,11 @@ extern const char *msp430_get_linker_devices_include_path (int, const char **); msp430_return_addr_rtx (COUNT) #define SLOW_BYTE_ACCESS 0 + +/* Calling a constant function address costs the same number of clock + cycles as calling an address stored in a register. However, in terms of + instruction length, calling a constant address is more expensive. */ +#define NO_FUNCTION_CSE (optimize >= 2 && !optimize_size) /* Register Usage */ diff --git a/gcc/config/msp430/msp430.md b/gcc/config/msp430/msp430.md index b6602fb..f70e61b 100644 --- a/gcc/config/msp430/msp430.md +++ b/gcc/config/msp430/msp430.md @@ -65,6 +65,15 @@ (include "constraints.md") (define_mode_iterator QHI [QI HI PSI]) +(define_mode_iterator HPSI [HI PSI]) +(define_mode_iterator HDI [HI PSI SI DI]) + +;; Mapping of all shift operators +(define_code_iterator any_shift [ashift ashiftrt lshiftrt]) + +;; Base name for define_insn +(define_code_attr shift_insn + [(ashift "ashl") (lshiftrt "lshr") (ashiftrt "ashr")]) ;; There are two basic "family" tests we do here: ;; @@ -553,6 +562,15 @@ SXT%X0\t%0" ) +(define_insn "extendqipsi2" + [(set (match_operand:PSI 0 "msp430_general_dst_operand" "=r,m") + (sign_extend:PSI (match_operand:QI 1 "msp430_general_operand" "0,0")))] + "" + "@ + SXT\t%0 + SXTX.A\t%0" +) + ;; ------------------------ ;; ZERO EXTEND INSTRUCTIONS ;; Byte-writes to registers clear bits 19:8 @@ -680,31 +698,42 @@ MOV%X1.B\t%1, %0" ) +;; The next three insns emit identical assembly code. +;; They take a QImode and shift it in SImode. Only shift counts <= 8 +;; are handled since that is the simple case where the high 16-bits (i.e. the +;; high register) are always 0. (define_insn "" - [(set (match_operand:SI 0 "register_operand" "=r") - (ashift:SI (zero_extend:SI (match_operand:QI 1 "general_operand" "rm")) - (match_operand:HI 2 "immediate_operand" "M")))] + [(set (match_operand:SI 0 "register_operand" "=r,r,r") + (ashift:SI (zero_extend:SI (match_operand:QI 1 "general_operand" "0,rm,rm")) + (match_operand:HI 2 "const_1_to_8_operand" "M,M,i")))] "msp430x" - "MOV%X1.B %1, %L0 { RLAM.W %2, %L0 { CLR %H0" + "@ + RLAM.W %2, %L0 { CLR %H0 + MOV%X1.B %1, %L0 { RLAM.W %2, %L0 { CLR %H0 + MOV%X1.B %1, %L0 { RPT %2 { RLAX.W %L0 { CLR %H0" ) -;; We are taking a char and shifting it and putting the result in 2 registers. -;; the high register will always be for 0 shift counts < 8. 
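As a rough source-level illustration of the zero-extend-and-shift patterns in this group (a sketch only; the function below is hypothetical and not part of the patch, and the exact RTL that combine presents depends on optimization options): an unsigned char is widened to 32 bits and shifted left by a constant of at most 8, so the upper 16-bit register of the SImode result can simply be cleared.

  /* Hypothetical example: on msp430, int is 16 bits and long is 32 bits
     (SImode), so this is a QImode value zero-extended into SImode and
     shifted by a small constant (count <= 8).  */
  unsigned long
  scale_byte (unsigned char c)
  {
    return (unsigned long) c << 3;
  }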
(define_insn "" - [(set (match_operand:SI 0 "register_operand" "=r") - (ashift:SI (zero_extend:SI (subreg:HI (match_operand:QI 1 "general_operand" "rm") 0)) - (match_operand:HI 2 "immediate_operand" "M")))] + [(set (match_operand:SI 0 "register_operand" "=r,r,r") + (ashift:SI (zero_extend:SI (subreg:HI (match_operand:QI 1 "general_operand" "0,rm,rm") 0)) + (match_operand:HI 2 "const_1_to_8_operand" "M,M,i")))] "msp430x" - "MOV%X1.B %1, %L0 { RLAM.W %2, %L0 { CLR %H0" + "@ + RLAM.W %2, %L0 { CLR %H0 + MOV%X1.B %1, %L0 { RLAM.W %2, %L0 { CLR %H0 + MOV%X1.B %1, %L0 { RPT %2 { RLAX.W %L0 { CLR %H0" ) ;; Same as above but with a NOP sign_extend round the subreg (define_insn "" - [(set (match_operand:SI 0 "register_operand" "=r") - (ashift:SI (zero_extend:SI (sign_extend:PSI (subreg:HI (match_operand:QI 1 "general_operand" "rm") 0))) - (match_operand:HI 2 "immediate_operand" "M")))] + [(set (match_operand:SI 0 "register_operand" "=r,r,r") + (ashift:SI (zero_extend:SI (sign_extend:PSI (subreg:HI (match_operand:QI 1 "general_operand" "0,rm,rm") 0))) + (match_operand:HI 2 "const_1_to_8_operand" "M,M,i")))] "msp430x" - "MOV%X1.B %1, %L0 { RLAM.W %2, %L0 { CLR %H0" + "@ + RLAM.W %2, %L0 { CLR %H0 + MOV%X1.B %1, %L0 { RLAM.W %2, %L0 { CLR %H0 + MOV%X1.B %1, %L0 { RPT %2 { RLAX.W %L0 { CLR %H0" ) (define_insn "" @@ -715,11 +744,14 @@ ) (define_insn "" - [(set (match_operand:PSI 0 "register_operand" "=r") - (ashift:PSI (sign_extend:PSI (subreg:HI (match_operand:QI 1 "general_operand" "rm") 0)) - (match_operand:HI 2 "immediate_operand" "M")))] + [(set (match_operand:PSI 0 "register_operand" "=r,r,r") + (ashift:PSI (sign_extend:PSI (subreg:HI (match_operand:QI 1 "general_operand" "0,rm,rm") 0)) + (match_operand:HI 2 "const_1_to_19_operand" "M,M,i")))] "msp430x" - "MOV%X1.B %1, %0 { RLAM.W %2, %0" + "@ + RLAM.W %2, %0 + MOV%X1.B %1, %0 { RLAM.W %2, %0 + MOV%X1.B %1, %0 { RPT %2 { RLAX.A %0" ) ;; END msp430 pointer manipulation combine insn patterns @@ -831,287 +863,75 @@ ;; Note - we ignore shift counts of less than one or more than 15. ;; This is permitted by the ISO C99 standard as such shifts result ;; in "undefined" behavior. [6.5.7 (3)] +;; +;; We avoid emitting insns in msp430_expand_shift, since we would have to handle +;; many extra cases such as op0 != op1, or, op0 or op1 in memory. Instead we +;; let reload coerce op0 and op1 into the same register. -;; signed A << C - -(define_expand "ashlhi3" - [(set (match_operand:HI 0 "msp430_general_dst_nonv_operand") - (ashift:HI (match_operand:HI 1 "general_operand") - (match_operand:HI 2 "general_operand")))] +(define_expand "<shift_insn><mode>3" + [(set (match_operand:HDI 0 "msp430_general_dst_nonv_operand") + (any_shift:HDI (match_operand:HDI 1 "general_operand") + (match_operand:HDI 2 "general_operand")))] "" { - if ((GET_CODE (operands[1]) == SUBREG - && REG_P (XEXP (operands[1], 0))) - || MEM_P (operands[1])) - operands[1] = force_reg (HImode, operands[1]); - if (msp430x - && REG_P (operands[0]) - && REG_P (operands[1]) - && CONST_INT_P (operands[2])) - emit_insn (gen_430x_shift_left (operands[0], operands[1], operands[2])); - else if (CONST_INT_P (operands[2]) - && INTVAL (operands[2]) == 1) - emit_insn (gen_slli_1 (operands[0], operands[1])); - else - /* The const variants of mspabi shifts have larger code size than the - generic version, so use the generic version if optimizing for - size. 
*/ - msp430_expand_helper (operands, \"__mspabi_slli\", !optimize_size); - DONE; + if (msp430_expand_shift (<CODE>, <MODE>mode, operands)) + DONE; + /* Otherwise, fallthrough. */ } ) -(define_insn "slli_1" - [(set (match_operand:HI 0 "msp430_general_dst_nonv_operand" "=rm") - (ashift:HI (match_operand:HI 1 "general_operand" "0") - (const_int 1)))] - "" - "RLA%X0.W\t%0" ;; Note - this is a macro for ADD -) - -(define_insn "430x_shift_left" - [(set (match_operand:HI 0 "register_operand" "=r") - (ashift:HI (match_operand:HI 1 "register_operand" "0") - (match_operand 2 "immediate_operand" "n")))] - "msp430x" - "* - if (INTVAL (operands[2]) > 0 && INTVAL (operands[2]) < 5) - return \"RLAM.W\t%2, %0\"; - else if (INTVAL (operands[2]) >= 5 && INTVAL (operands[2]) < 16) - return \"RPT\t%2 { RLAX.W\t%0\"; - return \"# nop left shift\"; - " -) - -(define_insn "slll_1" - [(set (match_operand:SI 0 "msp430_general_dst_nonv_operand" "=rm") - (ashift:SI (match_operand:SI 1 "general_operand" "0") - (const_int 1)))] - "" - "RLA%X0.W\t%L0 { RLC%X0.W\t%H0" -) - -(define_insn "slll_2" - [(set (match_operand:SI 0 "msp430_general_dst_nonv_operand" "=rm") - (ashift:SI (match_operand:SI 1 "general_operand" "0") - (const_int 2)))] - "" - "RLA%X0.W\t%L0 { RLC%X0.W\t%H0 { RLA%X0.W\t%L0 { RLC%X0.W\t%H0" -) - -(define_expand "ashlsi3" - [(set (match_operand:SI 0 "msp430_general_dst_nonv_operand") - (ashift:SI (match_operand:SI 1 "general_operand") - (match_operand:SI 2 "general_operand")))] - "" - "msp430_expand_helper (operands, \"__mspabi_slll\", !optimize_size); - DONE;" -) - -(define_expand "ashldi3" - [(set (match_operand:DI 0 "msp430_general_dst_nonv_operand") - (ashift:DI (match_operand:DI 1 "general_operand") - (match_operand:DI 2 "general_operand")))] - "" - { - /* No const_variant for 64-bit shifts. 
*/ - msp430_expand_helper (operands, \"__mspabi_sllll\", false); - DONE; - } +;; All 430 HImode constant shifts +(define_insn "<shift_insn>hi3_430" + [(set (match_operand:HI 0 "msp430_general_dst_nonv_operand" "=rm") + (any_shift:HI (match_operand:HI 1 "general_operand" "0") + (match_operand:HI 2 "const_int_operand" "n")))] + "!msp430x" + "* return msp430_output_asm_shift_insns (<CODE>, HImode, operands);" ) -;;---------- - -;; signed A >> C - -(define_expand "ashrhi3" - [(set (match_operand:HI 0 "msp430_general_dst_nonv_operand") - (ashiftrt:HI (match_operand:HI 1 "general_operand") - (match_operand:HI 2 "general_operand")))] +;; All 430 and 430X SImode constant shifts +(define_insn "<shift_insn>si3_const" + [(set (match_operand:SI 0 "msp430_general_dst_nonv_operand" "=rm") + (any_shift:SI (match_operand:SI 1 "general_operand" "0") + (match_operand:SI 2 "const_int_operand" "n")))] "" - { - if ((GET_CODE (operands[1]) == SUBREG - && REG_P (XEXP (operands[1], 0))) - || MEM_P (operands[1])) - operands[1] = force_reg (HImode, operands[1]); - if (msp430x - && REG_P (operands[0]) - && REG_P (operands[1]) - && CONST_INT_P (operands[2])) - emit_insn (gen_430x_arithmetic_shift_right (operands[0], operands[1], operands[2])); - else if (CONST_INT_P (operands[2]) - && INTVAL (operands[2]) == 1) - emit_insn (gen_srai_1 (operands[0], operands[1])); - else - msp430_expand_helper (operands, \"__mspabi_srai\", !optimize_size); - DONE; - } -) - -(define_insn "srai_1" - [(set (match_operand:HI 0 "msp430_general_dst_operand" "=rm") - (ashiftrt:HI (match_operand:HI 1 "msp430_general_operand" "0") - (const_int 1)))] - "" - "RRA%X0.W\t%0" -) - -(define_insn "430x_arithmetic_shift_right" - [(set (match_operand:HI 0 "register_operand" "=r") - (ashiftrt:HI (match_operand:HI 1 "register_operand" "0") - (match_operand 2 "immediate_operand" "n")))] - "msp430x" - "* - if (INTVAL (operands[2]) > 0 && INTVAL (operands[2]) < 5) - return \"RRAM.W\t%2, %0\"; - else if (INTVAL (operands[2]) >= 5 && INTVAL (operands[2]) < 16) - return \"RPT\t%2 { RRAX.W\t%0\"; - return \"# nop arith right shift\"; - " -) - -(define_insn "srap_1" - [(set (match_operand:PSI 0 "register_operand" "=r") - (ashiftrt:PSI (match_operand:PSI 1 "general_operand" "0") - (const_int 1)))] - "msp430x" - "RRAM.A #1,%0" + "* return msp430_output_asm_shift_insns (<CODE>, SImode, operands);" ) -(define_insn "srap_2" - [(set (match_operand:PSI 0 "register_operand" "=r") - (ashiftrt:PSI (match_operand:PSI 1 "general_operand" "0") - (const_int 2)))] +(define_insn "ashl<mode>3_430x" + [(set (match_operand:HPSI 0 "msp430_general_dst_nonv_operand" "=r,r,r,r") + (ashift:HPSI (match_operand:HPSI 1 "general_operand" "0 ,0,0,0") + (match_operand:HPSI 2 "const_int_operand" "M ,P,K,i")))] "msp430x" - "RRAM.A #2,%0" -) - -(define_insn "sral_1" - [(set (match_operand:SI 0 "msp430_general_dst_nonv_operand" "=rm") - (ashiftrt:SI (match_operand:SI 1 "general_operand" "0") - (const_int 1)))] - "" - "RRA%X0.W\t%H0 { RRC%X0.W\t%L0" -) - -(define_insn "sral_2" - [(set (match_operand:SI 0 "msp430_general_dst_nonv_operand" "=rm") - (ashiftrt:SI (match_operand:SI 1 "general_operand" "0") - (const_int 2)))] - "" - "RRA%X0.W\t%H0 { RRC%X0.W\t%L0 { RRA%X0.W\t%H0 { RRC%X0.W\t%L0" -) - -(define_expand "ashrsi3" - [(set (match_operand:SI 0 "msp430_general_dst_nonv_operand") - (ashiftrt:SI (match_operand:SI 1 "general_operand") - (match_operand:SI 2 "general_operand")))] - "" - "msp430_expand_helper (operands, \"__mspabi_sral\", !optimize_size); - DONE;" -) - -(define_expand "ashrdi3" - 
[(set (match_operand:DI 0 "msp430_general_dst_nonv_operand") - (ashift:DI (match_operand:DI 1 "general_operand") - (match_operand:DI 2 "general_operand")))] - "" - { - /* No const_variant for 64-bit shifts. */ - msp430_expand_helper (operands, \"__mspabi_srall\", false); - DONE; - } -) - -;;---------- - -;; unsigned A >> C - -(define_expand "lshrhi3" - [(set (match_operand:HI 0 "msp430_general_dst_nonv_operand") - (lshiftrt:HI (match_operand:HI 1 "general_operand") - (match_operand:HI 2 "general_operand")))] - "" - { - if ((GET_CODE (operands[1]) == SUBREG - && REG_P (XEXP (operands[1], 0))) - || MEM_P (operands[1])) - operands[1] = force_reg (HImode, operands[1]); - if (msp430x - && REG_P (operands[0]) - && REG_P (operands[1]) - && CONST_INT_P (operands[2])) - emit_insn (gen_430x_logical_shift_right (operands[0], operands[1], operands[2])); - else if (CONST_INT_P (operands[2]) - && INTVAL (operands[2]) == 1) - emit_insn (gen_srli_1 (operands[0], operands[1])); - else - msp430_expand_helper (operands, \"__mspabi_srli\", !optimize_size); - DONE; - } -) - -(define_insn "srli_1" - [(set (match_operand:HI 0 "msp430_general_dst_nonv_operand" "=rm") - (lshiftrt:HI (match_operand:HI 1 "general_operand" "0") - (const_int 1)))] - "" - "CLRC { RRC%X0.W\t%0" + "@ + RLAM%b0\t%2, %0 + RPT\t%2 { RLAX%b0\t%0 + RPT\t#16 { RLAX%b0\t%0 { RPT\t%W2 { RLAX%b0\t%0 + # undefined behavior left shift of %1 by %2" ) -(define_insn "430x_logical_shift_right" - [(set (match_operand:HI 0 "register_operand" "=r") - (lshiftrt:HI (match_operand:HI 1 "register_operand" "0") - (match_operand 2 "immediate_operand" "n")))] +(define_insn "ashr<mode>3_430x" + [(set (match_operand:HPSI 0 "msp430_general_dst_nonv_operand" "=r,r,r,r") + (ashiftrt:HPSI (match_operand:HPSI 1 "general_operand" "0,0,0,0") + (match_operand:HPSI 2 "const_int_operand" "M,P,K,i")))] "msp430x" - { - return msp430x_logical_shift_right (operands[2]); - } -) - -(define_insn "srlp_1" - [(set (match_operand:PSI 0 "register_operand" "=r") - (lshiftrt:PSI (match_operand:PSI 1 "general_operand" "0") - (const_int 1)))] - "" - "RRUM.A #1,%0" -) - -(define_insn "srll_1" - [(set (match_operand:SI 0 "msp430_general_dst_nonv_operand" "=rm") - (lshiftrt:SI (match_operand:SI 1 "general_operand" "0") - (const_int 1)))] - "" - "CLRC { RRC%X0.W\t%H0 { RRC%X0.W\t%L0" + "@ + RRAM%b0\t%2, %0 + RPT\t%2 { RRAX%b0\t%0 + RPT\t#16 { RRAX%b0\t%0 { RPT\t%W2 { RRAX%b0\t%0 + # undefined behavior arithmetic right shift of %1 by %2" ) -(define_insn "srll_2x" - [(set (match_operand:SI 0 "msp430_general_dst_nonv_operand" "=r") - (lshiftrt:SI (match_operand:SI 1 "general_operand" "0") - (const_int 2)))] +(define_insn "lshr<mode>3_430x" + [(set (match_operand:HPSI 0 "msp430_general_dst_nonv_operand" "=r,r,r,r") + (lshiftrt:HPSI (match_operand:HPSI 1 "general_operand" "0,0,0,0") + (match_operand:HPSI 2 "const_int_operand" "M,P,K,i")))] "msp430x" - "RRUX.W\t%H0 { RRC.W\t%L0 { RRUX.W\t%H0 { RRC.W\t%L0" -) - -(define_expand "lshrsi3" - [(set (match_operand:SI 0 "msp430_general_dst_nonv_operand") - (lshiftrt:SI (match_operand:SI 1 "general_operand") - (match_operand:SI 2 "general_operand")))] - "" - "msp430_expand_helper (operands, \"__mspabi_srll\", !optimize_size); - DONE;" -) - -(define_expand "lshrdi3" - [(set (match_operand:DI 0 "msp430_general_dst_nonv_operand") - (ashift:DI (match_operand:DI 1 "general_operand") - (match_operand:DI 2 "general_operand")))] - "" - { - /* No const_variant for 64-bit shifts. 
*/ - msp430_expand_helper (operands, \"__mspabi_srlll\", false); - DONE; - } + "@ + RRUM%b0\t%2, %0 + RPT\t%2 { RRUX%b0\t%0 + RPT\t#16 { RRUX%b0\t%0 { RPT\t%W2 { RRUX%b0\t%0 + # undefined behavior logical right shift of %1 by %2" ) ;;------------------------------------------------------------ @@ -1300,33 +1120,10 @@ (clobber (reg:BI CARRY)) ] "" - "* - /* This is nasty. If we are splitting code between low and high memory - then we do not want the linker to increase the size of sections by - relaxing out of range jump instructions. (Since relaxation occurs - after section placement). So we have to generate pessimal branches - here. But we only want to do this when really necessary. - - FIXME: Do we need code in the other cbranch patterns ? */ - if (msp430_do_not_relax_short_jumps () && get_attr_length (insn) > 6) - { - return which_alternative == 0 ? - \"CMP.W\t%2, %1 { J%r0 1f { BRA #%l3 { 1:\" : - \"CMPX.W\t%2, %1 { J%r0 1f { BRA #%l3 { 1:\"; - } - - return which_alternative == 0 ? - \"CMP.W\t%2, %1 { J%0\t%l3\" : - \"CMPX.W\t%2, %1 { J%0\t%l3\"; - " - [(set (attr "length") - (if_then_else - (and (ge (minus (match_dup 3) (pc)) (const_int -510)) - (le (minus (match_dup 3) (pc)) (const_int 510))) - (const_int 6) - (const_int 10)) - )] - ) + "@ + CMP.W\t%2, %1 { J%0\t%l3 + CMPX.W\t%2, %1 { J%0\t%l3" +) (define_insn "cbranchpsi4_reversed" [(set (pc) (if_then_else @@ -1441,7 +1238,7 @@ [(set (pc) (if_then_else (ne (zero_extract:HI (match_operand:QHI 0 "msp430_general_dst_operand" "rYs,rm") (const_int 1) - (match_operand 1 "msp430_bitpos" "i,i")) + (match_operand 1 "const_0_to_15_operand" "i,i")) (const_int 0)) (label_ref (match_operand 2 "" "")) (pc))) @@ -1457,7 +1254,7 @@ [(set (pc) (if_then_else (eq (zero_extract:HI (match_operand:QHI 0 "msp430_general_dst_operand" "rm") (const_int 1) - (match_operand 1 "msp430_bitpos" "i")) + (match_operand 1 "const_0_to_15_operand" "i")) (const_int 0)) (label_ref (match_operand 2 "" "")) (pc))) @@ -1471,7 +1268,7 @@ [(set (pc) (if_then_else (eq (zero_extract:HI (match_operand:QHI 0 "msp430_general_dst_operand" "rm") (const_int 1) - (match_operand 1 "msp430_bitpos" "i")) + (match_operand 1 "const_0_to_15_operand" "i")) (const_int 0)) (pc) (label_ref (match_operand 2 "" "")))) @@ -1485,7 +1282,7 @@ [(set (pc) (if_then_else (ne (zero_extract:HI (match_operand:QHI 0 "msp430_general_dst_operand" "rm") (const_int 1) - (match_operand 1 "msp430_bitpos" "i")) + (match_operand 1 "const_0_to_15_operand" "i")) (const_int 0)) (pc) (label_ref (match_operand 2 "" "")))) diff --git a/gcc/config/msp430/msp430.opt b/gcc/config/msp430/msp430.opt index b451174..692e7dc 100644 --- a/gcc/config/msp430/msp430.opt +++ b/gcc/config/msp430/msp430.opt @@ -23,9 +23,30 @@ Target Report Var(msp430_warn_devices_csv) Init(1) Warn if devices.csv is not found or there are problem parsing it (default: on). mcpu= -Target Report Joined RejectNegative Var(target_cpu) +Target Report Joined RejectNegative Var(target_cpu) ToLower Enum(msp430_cpu_types) Init(MSP430_CPU_MSP430X_DEFAULT) Specify the ISA to build for: msp430, msp430x, msp430xv2. 
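Tying the 430X constant-shift alternatives earlier in this hunk back to source level (a hedged sketch; the function below is illustrative only, and which alternative is selected depends on the shift count and on the optimization options): a 16-bit logical right shift by a small constant can be emitted inline, for example via the RRUM or RPT/RRUX forms shown above, rather than through the __mspabi_srli helper.

  /* Hypothetical example: on msp430, unsigned int is 16 bits (HImode),
     so this is a constant HImode logical right shift.  */
  unsigned int
  shr_by_3 (unsigned int x)
  {
    return x >> 3;
  }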
+Enum +Name(msp430_cpu_types) Type(enum msp430_cpu_types) + +EnumValue +Enum(msp430_cpu_types) String(msp430) Value(MSP430_CPU_MSP430) Canonical + +EnumValue +Enum(msp430_cpu_types) String(430) Value(MSP430_CPU_MSP430) + +EnumValue +Enum(msp430_cpu_types) String(msp430x) Value(MSP430_CPU_MSP430X) Canonical + +EnumValue +Enum(msp430_cpu_types) String(430x) Value(MSP430_CPU_MSP430X) + +EnumValue +Enum(msp430_cpu_types) String(msp430xv2) Value(MSP430_CPU_MSP430XV2) Canonical + +EnumValue +Enum(msp430_cpu_types) String(430xv2) Value(MSP430_CPU_MSP430XV2) + mlarge Target Report Mask(LARGE) RejectNegative Select large model - 20-bit addresses/pointers. @@ -109,3 +130,9 @@ mdevices-csv-loc= Target Joined Var(msp430_devices_csv_loc) RejectNegative Report The path to devices.csv. The GCC driver can normally locate devices.csv itself and pass this option to the compiler, so the user shouldn't need to pass this. + +mmax-inline-shift= +Target RejectNegative Joined UInteger IntegerRange(0,65) Var(msp430_max_inline_shift) Init(65) Report +For shift operations by a constant amount, which require an individual instruction to shift by one +position, set the maximum number of inline shift instructions (maximum value 64) to emit instead of using the corresponding __mspabi helper function. +The default value is 4. diff --git a/gcc/config/msp430/predicates.md b/gcc/config/msp430/predicates.md index 408d56f..4bfa0c0 100644 --- a/gcc/config/msp430/predicates.md +++ b/gcc/config/msp430/predicates.md @@ -113,12 +113,21 @@ (ior (match_code "reg,mem") (match_operand 0 "immediate_operand")))) -; TRUE for constants which are bit positions for zero_extract -(define_predicate "msp430_bitpos" +(define_predicate "const_1_to_8_operand" + (and (match_code "const_int") + (match_test (" INTVAL (op) >= 1 + && INTVAL (op) <= 8 ")))) + +(define_predicate "const_0_to_15_operand" (and (match_code "const_int") (match_test (" INTVAL (op) >= 0 && INTVAL (op) <= 15 ")))) +(define_predicate "const_1_to_19_operand" + (and (match_code "const_int") + (match_test (" INTVAL (op) >= 1 + && INTVAL (op) <= 19 ")))) + (define_predicate "msp430_symbol_operand" (match_code "symbol_ref") ) diff --git a/gcc/config/nios2/nios2.c b/gcc/config/nios2/nios2.c index ba0a0a9..5566435 100644 --- a/gcc/config/nios2/nios2.c +++ b/gcc/config/nios2/nios2.c @@ -1448,7 +1448,7 @@ nios2_option_override (void) /* Save the initial options in case the user does function specific options. */ target_option_default_node = target_option_current_node - = build_target_option_node (&global_options); + = build_target_option_node (&global_options, &global_options_set); } @@ -4137,7 +4137,8 @@ nios2_deregister_custom_code (unsigned int N) static void nios2_option_save (struct cl_target_option *ptr, - struct gcc_options *opts ATTRIBUTE_UNUSED) + struct gcc_options *opts ATTRIBUTE_UNUSED, + struct gcc_options *opts_set ATTRIBUTE_UNUSED) { unsigned int i; for (i = 0; i < ARRAY_SIZE (nios2_fpu_insn); i++) @@ -4150,6 +4151,7 @@ nios2_option_save (struct cl_target_option *ptr, static void nios2_option_restore (struct gcc_options *opts ATTRIBUTE_UNUSED, + struct gcc_options *opts_set ATTRIBUTE_UNUSED, struct cl_target_option *ptr) { unsigned int i; @@ -4310,7 +4312,7 @@ nios2_valid_target_attribute_tree (tree args) if (!nios2_valid_target_attribute_rec (args)) return NULL_TREE; nios2_custom_check_insns (); - return build_target_option_node (&global_options); + return build_target_option_node (&global_options, &global_options_set); } /* Hook to validate attribute((target("string"))). 
*/ @@ -4321,21 +4323,22 @@ nios2_valid_target_attribute_p (tree fndecl, tree ARG_UNUSED (name), { struct cl_target_option cur_target; bool ret = true; - tree old_optimize = build_optimization_node (&global_options); + tree old_optimize + = build_optimization_node (&global_options, &global_options_set); tree new_target, new_optimize; tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl); /* If the function changed the optimization levels as well as setting target options, start with the optimizations specified. */ if (func_optimize && func_optimize != old_optimize) - cl_optimization_restore (&global_options, + cl_optimization_restore (&global_options, &global_options_set, TREE_OPTIMIZATION (func_optimize)); /* The target attributes may also change some optimization flags, so update the optimization options if necessary. */ - cl_target_option_save (&cur_target, &global_options); + cl_target_option_save (&cur_target, &global_options, &global_options_set); new_target = nios2_valid_target_attribute_tree (args); - new_optimize = build_optimization_node (&global_options); + new_optimize = build_optimization_node (&global_options, &global_options_set); if (!new_target) ret = false; @@ -4348,10 +4351,10 @@ nios2_valid_target_attribute_p (tree fndecl, tree ARG_UNUSED (name), DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize; } - cl_target_option_restore (&global_options, &cur_target); + cl_target_option_restore (&global_options, &global_options_set, &cur_target); if (old_optimize != new_optimize) - cl_optimization_restore (&global_options, + cl_optimization_restore (&global_options, &global_options_set, TREE_OPTIMIZATION (old_optimize)); return ret; } @@ -4381,7 +4384,7 @@ nios2_set_current_function (tree fndecl) else if (new_tree) { - cl_target_option_restore (&global_options, + cl_target_option_restore (&global_options, &global_options_set, TREE_TARGET_OPTION (new_tree)); target_reinit (); } @@ -4391,7 +4394,7 @@ nios2_set_current_function (tree fndecl) struct cl_target_option *def = TREE_TARGET_OPTION (target_option_current_node); - cl_target_option_restore (&global_options, def); + cl_target_option_restore (&global_options, &global_options_set, def); target_reinit (); } } @@ -4409,7 +4412,7 @@ nios2_pragma_target_parse (tree args, tree pop_target) cur_tree = ((pop_target) ? pop_target : target_option_default_node); - cl_target_option_restore (&global_options, + cl_target_option_restore (&global_options, &global_options_set, TREE_TARGET_OPTION (cur_tree)); } else diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index d2f321f..1734947 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -368,6 +368,22 @@ nvptx_name_replacement (const char *name) return name; } +/* Return NULL if NAME contains no dot. Otherwise return a copy of NAME + with the dots replaced with dollar signs. */ + +static char * +nvptx_replace_dot (const char *name) +{ + if (strchr (name, '.') == NULL) + return NULL; + + char *p = xstrdup (name); + for (size_t i = 0; i < strlen (p); ++i) + if (p[i] == '.') + p[i] = '$'; + return p; +} + /* If MODE should be treated as two registers of an inner mode, return that inner mode. Otherwise return VOIDmode. */ @@ -827,26 +843,12 @@ write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name) fputs ("\n", file); } -/* Write a .func or .kernel declaration or definition along with - a helper comment for use by ld. S is the stream to write to, DECL - the decl for the function with name NAME. 
For definitions, emit - a declaration too. */ +/* Helper function for write_fn_proto. */ -static const char * -write_fn_proto (std::stringstream &s, bool is_defn, - const char *name, const_tree decl) +static void +write_fn_proto_1 (std::stringstream &s, bool is_defn, + const char *name, const_tree decl) { - if (is_defn) - /* Emit a declaration. The PTX assembler gets upset without it. */ - name = write_fn_proto (s, false, name, decl); - else - { - /* Avoid repeating the name replacement. */ - name = nvptx_name_replacement (name); - if (name[0] == '*') - name++; - } - write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name); /* PTX declaration. */ @@ -910,7 +912,7 @@ write_fn_proto (std::stringstream &s, bool is_defn, if (not_atomic_weak_arg) argno = write_arg_type (s, -1, argno, type, prototyped); else - gcc_assert (type == boolean_type_node); + gcc_assert (TREE_CODE (type) == BOOLEAN_TYPE); } if (stdarg_p (fntype)) @@ -929,8 +931,38 @@ write_fn_proto (std::stringstream &s, bool is_defn, s << ")"; s << (is_defn ? "\n" : ";\n"); +} - return name; +/* Write a .func or .kernel declaration or definition along with + a helper comment for use by ld. S is the stream to write to, DECL + the decl for the function with name NAME. For definitions, emit + a declaration too. */ + +static void +write_fn_proto (std::stringstream &s, bool is_defn, + const char *name, const_tree decl) +{ + const char *replacement = nvptx_name_replacement (name); + char *replaced_dots = NULL; + if (replacement != name) + name = replacement; + else + { + replaced_dots = nvptx_replace_dot (name); + if (replaced_dots) + name = replaced_dots; + } + if (name[0] == '*') + name++; + + if (is_defn) + /* Emit a declaration. The PTX assembler gets upset without it. */ + write_fn_proto_1 (s, false, name, decl); + + write_fn_proto_1 (s, is_defn, name, decl); + + if (replaced_dots) + XDELETE (replaced_dots); } /* Construct a function declaration from a call insn. 
This can be @@ -942,6 +974,8 @@ static void write_fn_proto_from_insn (std::stringstream &s, const char *name, rtx result, rtx pat) { + char *replaced_dots = NULL; + if (!name) { s << "\t.callprototype "; @@ -949,7 +983,15 @@ write_fn_proto_from_insn (std::stringstream &s, const char *name, } else { - name = nvptx_name_replacement (name); + const char *replacement = nvptx_name_replacement (name); + if (replacement != name) + name = replacement; + else + { + replaced_dots = nvptx_replace_dot (name); + if (replaced_dots) + name = replaced_dots; + } write_fn_marker (s, false, true, name); s << "\t.extern .func "; } @@ -958,6 +1000,8 @@ write_fn_proto_from_insn (std::stringstream &s, const char *name, write_return_mode (s, true, GET_MODE (result)); s << name; + if (replaced_dots) + XDELETE (replaced_dots); int arg_end = XVECLEN (pat, 0); for (int i = 1; i < arg_end; i++) @@ -1796,6 +1840,44 @@ nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind) end_sequence (); } break; + case E_V2SImode: + { + rtx src0 = gen_rtx_SUBREG (SImode, src, 0); + rtx src1 = gen_rtx_SUBREG (SImode, src, 4); + rtx dst0 = gen_rtx_SUBREG (SImode, dst, 0); + rtx dst1 = gen_rtx_SUBREG (SImode, dst, 4); + rtx tmp0 = gen_reg_rtx (SImode); + rtx tmp1 = gen_reg_rtx (SImode); + start_sequence (); + emit_insn (gen_movsi (tmp0, src0)); + emit_insn (gen_movsi (tmp1, src1)); + emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind)); + emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind)); + emit_insn (gen_movsi (dst0, tmp0)); + emit_insn (gen_movsi (dst1, tmp1)); + res = get_insns (); + end_sequence (); + } + break; + case E_V2DImode: + { + rtx src0 = gen_rtx_SUBREG (DImode, src, 0); + rtx src1 = gen_rtx_SUBREG (DImode, src, 8); + rtx dst0 = gen_rtx_SUBREG (DImode, dst, 0); + rtx dst1 = gen_rtx_SUBREG (DImode, dst, 8); + rtx tmp0 = gen_reg_rtx (DImode); + rtx tmp1 = gen_reg_rtx (DImode); + start_sequence (); + emit_insn (gen_movdi (tmp0, src0)); + emit_insn (gen_movdi (tmp1, src1)); + emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind)); + emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind)); + emit_insn (gen_movdi (dst0, tmp0)); + emit_insn (gen_movdi (dst1, tmp1)); + res = get_insns (); + end_sequence (); + } + break; case E_BImode: { rtx tmp = gen_reg_rtx (SImode); @@ -2012,11 +2094,20 @@ output_init_frag (rtx sym) static void nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size) { - val &= ((unsigned HOST_WIDE_INT)2 << (size * BITS_PER_UNIT - 1)) - 1; + bool negative_p + = val & (HOST_WIDE_INT_1U << (HOST_BITS_PER_WIDE_INT - 1)); + + /* Avoid undefined behaviour. */ + if (size * BITS_PER_UNIT < HOST_BITS_PER_WIDE_INT) + val &= (HOST_WIDE_INT_1U << (size * BITS_PER_UNIT)) - 1; for (unsigned part = 0; size; size -= part) { - val >>= part * BITS_PER_UNIT; + if (part * BITS_PER_UNIT == HOST_BITS_PER_WIDE_INT) + /* Avoid undefined behaviour. */ + val = negative_p ? -1 : 0; + else + val >>= (part * BITS_PER_UNIT); part = init_frag.size - init_frag.offset; part = MIN (part, size); @@ -2054,7 +2145,7 @@ nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p)) val = INTVAL (XEXP (x, 1)); x = XEXP (x, 0); gcc_assert (GET_CODE (x) == SYMBOL_REF); - /* FALLTHROUGH */ + gcc_fallthrough (); /* FALLTHROUGH */ case SYMBOL_REF: gcc_assert (size == init_frag.size); @@ -2164,7 +2255,7 @@ nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section, /* Neither vector nor complex types can contain the other. 
*/ type = TREE_TYPE (type); - unsigned elt_size = int_size_in_bytes (type); + unsigned HOST_WIDE_INT elt_size = int_size_in_bytes (type); /* Largest mode we're prepared to accept. For BLKmode types we don't know if it'll contain pointer constants, so have to choose @@ -2186,7 +2277,7 @@ nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section, elt_size. */ init_frag.remaining = (size + elt_size - 1) / elt_size; - fprintf (file, "%s .align %d .u%d ", + fprintf (file, "%s .align %d .u" HOST_WIDE_INT_PRINT_UNSIGNED " ", section, align / BITS_PER_UNIT, elt_size * BITS_PER_UNIT); assemble_name (file, name); @@ -2194,7 +2285,7 @@ nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section, if (size) /* We make everything an array, to simplify any initialization emission. */ - fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]", init_frag.remaining); + fprintf (file, "[" HOST_WIDE_INT_PRINT_UNSIGNED "]", init_frag.remaining); else if (atype) fprintf (file, "[]"); } @@ -2302,6 +2393,7 @@ const char * nvptx_output_mov_insn (rtx dst, rtx src) { machine_mode dst_mode = GET_MODE (dst); + machine_mode src_mode = GET_MODE (src); machine_mode dst_inner = (GET_CODE (dst) == SUBREG ? GET_MODE (XEXP (dst, 0)) : dst_mode); machine_mode src_inner = (GET_CODE (src) == SUBREG @@ -2328,7 +2420,7 @@ nvptx_output_mov_insn (rtx dst, rtx src) if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner)) { if (GET_MODE_BITSIZE (dst_mode) == 128 - && GET_MODE_BITSIZE (GET_MODE (src)) == 128) + && GET_MODE_BITSIZE (src_mode) == 128) { /* mov.b128 is not supported. */ if (dst_inner == V2DImode && src_inner == TImode) @@ -2341,6 +2433,10 @@ nvptx_output_mov_insn (rtx dst, rtx src) return "%.\tmov.b%T0\t%0, %1;"; } + if (GET_MODE_BITSIZE (src_inner) == 128 + && GET_MODE_BITSIZE (src_mode) == 64) + return "%.\tmov.b%T0\t%0, %1;"; + return "%.\tcvt%t0%t1\t%0, %1;"; } @@ -2411,9 +2507,20 @@ nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee) if (decl) { + char *replaced_dots = NULL; const char *name = get_fnname_from_decl (decl); - name = nvptx_name_replacement (name); + const char *replacement = nvptx_name_replacement (name); + if (replacement != name) + name = replacement; + else + { + replaced_dots = nvptx_replace_dot (name); + if (replaced_dots) + name = replaced_dots; + } assemble_name (asm_out_file, name); + if (replaced_dots) + XDELETE (replaced_dots); } else output_address (VOIDmode, callee); @@ -2551,7 +2658,7 @@ nvptx_print_operand (FILE *file, rtx x, int code) { case 'A': x = XEXP (x, 0); - /* FALLTHROUGH. */ + gcc_fallthrough (); /* FALLTHROUGH. */ case 'D': if (GET_CODE (x) == CONST) @@ -6463,6 +6570,14 @@ nvptx_can_change_mode_class (machine_mode, machine_mode, reg_class_t) return false; } +/* Implement TARGET_TRULY_NOOP_TRUNCATION. */ + +static bool +nvptx_truly_noop_truncation (poly_uint64, poly_uint64) +{ + return false; +} + static GTY(()) tree nvptx_previous_fndecl; static void @@ -6476,6 +6591,23 @@ nvptx_set_current_function (tree fndecl) oacc_bcast_partition = 0; } +/* Implement TARGET_LIBC_HAS_FUNCTION. */ + +bool +nvptx_libc_has_function (enum function_class fn_class, tree type) +{ + if (fn_class == function_sincos) + { + if (type != NULL_TREE) + /* Currently, newlib does not support sincosl. 
*/ + return type == float_type_node || type == double_type_node; + else + return true; + } + + return default_libc_has_function (fn_class, type); +} + #undef TARGET_OPTION_OVERRIDE #define TARGET_OPTION_OVERRIDE nvptx_option_override @@ -6612,12 +6744,18 @@ nvptx_set_current_function (tree fndecl) #undef TARGET_CAN_CHANGE_MODE_CLASS #define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class +#undef TARGET_TRULY_NOOP_TRUNCATION +#define TARGET_TRULY_NOOP_TRUNCATION nvptx_truly_noop_truncation + #undef TARGET_HAVE_SPECULATION_SAFE_VALUE #define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed #undef TARGET_SET_CURRENT_FUNCTION #define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function +#undef TARGET_LIBC_HAS_FUNCTION +#define TARGET_LIBC_HAS_FUNCTION nvptx_libc_has_function + struct gcc_target targetm = TARGET_INITIALIZER; #include "gt-nvptx.h" diff --git a/gcc/config/nvptx/nvptx.h b/gcc/config/nvptx/nvptx.h index 6ebcc76..17fe157 100644 --- a/gcc/config/nvptx/nvptx.h +++ b/gcc/config/nvptx/nvptx.h @@ -29,7 +29,10 @@ #define STARTFILE_SPEC "%{mmainkernel:crt0.o}" -#define ASM_SPEC "%{misa=*:-m %*}" +/* Default needs to be in sync with default for misa in nvptx.opt. + We add a default here to work around a hard-coded sm_30 default in + nvptx-as. */ +#define ASM_SPEC "%{misa=*:-m %*; :-m sm_35}" #define TARGET_CPU_CPP_BUILTINS() \ do \ diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index 6545b81..ccbcd09 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -118,6 +118,10 @@ (define_predicate "nvptx_float_comparison_operator" (match_code "eq,ne,le,ge,lt,gt,uneq,unle,unge,unlt,ungt,unordered,ordered")) +(define_predicate "nvptx_vector_index_operand" + (and (match_code "const_int") + (match_test "UINTVAL (op) < 4"))) + ;; Test for a valid operand for a call instruction. (define_predicate "call_insn_operand" (match_code "symbol_ref,reg") @@ -142,6 +146,13 @@ return true; }) +;; Test for a function symbol ref operand +(define_predicate "symbol_ref_function_operand" + (match_code "symbol_ref") +{ + return SYMBOL_REF_FUNCTION_P (op); +}) + (define_attr "predicable" "false,true" (const_string "true")) @@ -194,6 +205,10 @@ ;; pointer-sized quantities. Exactly one of the two alternatives will match. (define_mode_iterator P [(SI "Pmode == SImode") (DI "Pmode == DImode")]) +;; Define element mode for each vector mode. +(define_mode_attr VECELEM [(V2SI "SI") (V2DI "DI")]) +(define_mode_attr Vecelem [(V2SI "si") (V2DI "di")]) + ;; We should get away with not defining memory alternatives, since we don't ;; get variables in this mode and pseudos are never spilled. (define_insn "movbi" @@ -233,6 +248,17 @@ } [(set_attr "subregs_ok" "true")]) +;; ptxas segfaults on 'mov.u64 %r24,bar+4096', so break it up. 
+(define_split + [(set (match_operand:DI 0 "nvptx_register_operand") + (const:DI (plus:DI (match_operand:DI 1 "symbol_ref_function_operand") + (match_operand 2 "const_int_operand"))))] + "" + [(set (match_dup 0) (match_dup 1)) + (set (match_dup 0) (plus:DI (match_dup 0) (match_dup 2))) + ] + "") + (define_insn "*mov<mode>_insn" [(set (match_operand:SDFM 0 "nonimmediate_operand" "=R,R,m") (match_operand:SDFM 1 "general_operand" "RF,m,R"))] @@ -319,6 +345,13 @@ %.\\tld%A1%u1\\t%0, %1;" [(set_attr "subregs_ok" "true")]) +(define_insn "extendqihi2" + [(set (match_operand:HI 0 "nvptx_register_operand" "=R") + (sign_extend:HI (match_operand:QI 1 "nvptx_register_operand" "R")))] + "" + "%.\\tcvt.s16.s8\\t%0, %1;" + [(set_attr "subregs_ok" "true")]) + (define_insn "extend<mode>si2" [(set (match_operand:SI 0 "nvptx_register_operand" "=R,R") (sign_extend:SI (match_operand:QHIM 1 "nvptx_nonimmediate_operand" "R,m")))] @@ -350,9 +383,13 @@ [(set (match_operand:QHIM 0 "nvptx_nonimmediate_operand" "=R,m") (truncate:QHIM (match_operand:SI 1 "nvptx_register_operand" "R,R")))] "" - "@ - %.\\tcvt%t0.u32\\t%0, %1; - %.\\tst%A0.u%T0\\t%0, %1;" + { + if (which_alternative == 1) + return "%.\\tst%A0.u%T0\\t%0, %1;"; + if (GET_MODE (operands[0]) == QImode) + return "%.\\tmov%t0\\t%0, %1;"; + return "%.\\tcvt%t0.u32\\t%0, %1;"; + } [(set_attr "subregs_ok" "true")]) (define_insn "truncdi<mode>2" @@ -553,26 +590,74 @@ "" "%.\\tmul.wide.u32\\t%0, %1, %2;") +(define_insn "smulhi3_highpart" + [(set (match_operand:HI 0 "nvptx_register_operand" "=R") + (truncate:HI + (lshiftrt:SI + (mult:SI (sign_extend:SI + (match_operand:HI 1 "nvptx_register_operand" "R")) + (sign_extend:SI + (match_operand:HI 2 "nvptx_register_operand" "R"))) + (const_int 16))))] + "" + "%.\\tmul.hi.s16\\t%0, %1, %2;") + +(define_insn "smulsi3_highpart" + [(set (match_operand:SI 0 "nvptx_register_operand" "=R") + (truncate:SI + (lshiftrt:DI + (mult:DI (sign_extend:DI + (match_operand:SI 1 "nvptx_register_operand" "R")) + (sign_extend:DI + (match_operand:SI 2 "nvptx_register_operand" "R"))) + (const_int 32))))] + "" + "%.\\tmul.hi.s32\\t%0, %1, %2;") + +(define_insn "umulhi3_highpart" + [(set (match_operand:HI 0 "nvptx_register_operand" "=R") + (truncate:HI + (lshiftrt:SI + (mult:SI (zero_extend:SI + (match_operand:HI 1 "nvptx_register_operand" "R")) + (zero_extend:SI + (match_operand:HI 2 "nvptx_register_operand" "R"))) + (const_int 16))))] + "" + "%.\\tmul.hi.u16\\t%0, %1, %2;") + +(define_insn "umulsi3_highpart" + [(set (match_operand:SI 0 "nvptx_register_operand" "=R") + (truncate:SI + (lshiftrt:DI + (mult:DI (zero_extend:DI + (match_operand:SI 1 "nvptx_register_operand" "R")) + (zero_extend:DI + (match_operand:SI 2 "nvptx_register_operand" "R"))) + (const_int 32))))] + "" + "%.\\tmul.hi.u32\\t%0, %1, %2;") + ;; Shifts (define_insn "ashl<mode>3" - [(set (match_operand:SDIM 0 "nvptx_register_operand" "=R") - (ashift:SDIM (match_operand:SDIM 1 "nvptx_register_operand" "R") - (match_operand:SI 2 "nvptx_nonmemory_operand" "Ri")))] + [(set (match_operand:HSDIM 0 "nvptx_register_operand" "=R") + (ashift:HSDIM (match_operand:HSDIM 1 "nvptx_register_operand" "R") + (match_operand:SI 2 "nvptx_nonmemory_operand" "Ri")))] "" "%.\\tshl.b%T0\\t%0, %1, %2;") (define_insn "ashr<mode>3" - [(set (match_operand:SDIM 0 "nvptx_register_operand" "=R") - (ashiftrt:SDIM (match_operand:SDIM 1 "nvptx_register_operand" "R") - (match_operand:SI 2 "nvptx_nonmemory_operand" "Ri")))] + [(set (match_operand:HSDIM 0 "nvptx_register_operand" "=R") + (ashiftrt:HSDIM 
(match_operand:HSDIM 1 "nvptx_register_operand" "R") + (match_operand:SI 2 "nvptx_nonmemory_operand" "Ri")))] "" "%.\\tshr.s%T0\\t%0, %1, %2;") (define_insn "lshr<mode>3" - [(set (match_operand:SDIM 0 "nvptx_register_operand" "=R") - (lshiftrt:SDIM (match_operand:SDIM 1 "nvptx_register_operand" "R") - (match_operand:SI 2 "nvptx_nonmemory_operand" "Ri")))] + [(set (match_operand:HSDIM 0 "nvptx_register_operand" "=R") + (lshiftrt:HSDIM (match_operand:HSDIM 1 "nvptx_register_operand" "R") + (match_operand:SI 2 "nvptx_nonmemory_operand" "Ri")))] "" "%.\\tshr.u%T0\\t%0, %1, %2;") @@ -872,6 +957,15 @@ "" "%.\\tfma%#%t0\\t%0, %1, %2, %3;") +(define_insn "*recip<mode>2" + [(set (match_operand:SDFM 0 "nvptx_register_operand" "=R") + (div:SDFM + (match_operand:SDFM 2 "const_double_operand" "F") + (match_operand:SDFM 1 "nvptx_register_operand" "R")))] + "CONST_DOUBLE_P (operands[2]) + && real_identical (CONST_DOUBLE_REAL_VALUE (operands[2]), &dconst1)" + "%.\\trcp%#%t0\\t%0, %1;") + (define_insn "div<mode>3" [(set (match_operand:SDFM 0 "nvptx_register_operand" "=R") (div:SDFM (match_operand:SDFM 1 "nvptx_register_operand" "R") @@ -1051,6 +1145,78 @@ "" "%.\\tcvt<FPINT2:fpint2_roundingmode>.s%T0%t1\\t%0, %1;") +;; Vector operations + +(define_insn "*vec_set<mode>_0" + [(set (match_operand:VECIM 0 "nvptx_register_operand" "=R") + (vec_merge:VECIM + (vec_duplicate:VECIM + (match_operand:<VECELEM> 1 "nvptx_register_operand" "R")) + (match_dup 0) + (const_int 1)))] + "" + "%.\\tmov%t1\\t%0.x, %1;") + +(define_insn "*vec_set<mode>_1" + [(set (match_operand:VECIM 0 "nvptx_register_operand" "=R") + (vec_merge:VECIM + (vec_duplicate:VECIM + (match_operand:<VECELEM> 1 "nvptx_register_operand" "R")) + (match_dup 0) + (const_int 2)))] + "" + "%.\\tmov%t1\\t%0.y, %1;") + +(define_insn "*vec_set<mode>_2" + [(set (match_operand:VECIM 0 "nvptx_register_operand" "=R") + (vec_merge:VECIM + (vec_duplicate:VECIM + (match_operand:<VECELEM> 1 "nvptx_register_operand" "R")) + (match_dup 0) + (const_int 4)))] + "" + "%.\\tmov%t1\\t%0.z, %1;") + +(define_insn "*vec_set<mode>_3" + [(set (match_operand:VECIM 0 "nvptx_register_operand" "=R") + (vec_merge:VECIM + (vec_duplicate:VECIM + (match_operand:<VECELEM> 1 "nvptx_register_operand" "R")) + (match_dup 0) + (const_int 8)))] + "" + "%.\\tmov%t1\\t%0.w, %1;") + +(define_expand "vec_set<mode>" + [(match_operand:VECIM 0 "nvptx_register_operand") + (match_operand:<VECELEM> 1 "nvptx_register_operand") + (match_operand:SI 2 "nvptx_vector_index_operand")] + "" +{ + enum machine_mode mode = GET_MODE (operands[0]); + int mask = 1 << INTVAL (operands[2]); + rtx tmp = gen_rtx_VEC_DUPLICATE (mode, operands[1]); + tmp = gen_rtx_VEC_MERGE (mode, tmp, operands[0], GEN_INT (mask)); + emit_insn (gen_rtx_SET (operands[0], tmp)); + DONE; +}) + +(define_insn "vec_extract<mode><Vecelem>" + [(set (match_operand:<VECELEM> 0 "nvptx_register_operand" "=R") + (vec_select:<VECELEM> + (match_operand:VECIM 1 "nvptx_register_operand" "R") + (parallel [(match_operand:SI 2 "nvptx_vector_index_operand" "")])))] + "" +{ + static const char *const asms[4] = { + "%.\\tmov%t0\\t%0, %1.x;", + "%.\\tmov%t0\\t%0, %1.y;", + "%.\\tmov%t0\\t%0, %1.z;", + "%.\\tmov%t0\\t%0, %1.w;" + }; + return asms[INTVAL (operands[2])]; +}) + ;; Miscellaneous (define_insn "nop" @@ -1523,6 +1689,22 @@ "%.\\tatom%A1.b%T0.<logic>\\t%0, %1, %2;" [(set_attr "atomic" "true")]) +(define_expand "atomic_test_and_set" + [(match_operand:SI 0 "nvptx_register_operand") ;; bool success output + (match_operand:QI 1 "memory_operand") ;; memory + 
(match_operand:SI 2 "const_int_operand")] ;; model + "" +{ + rtx libfunc; + rtx addr; + libfunc = init_one_libfunc ("__atomic_test_and_set_1"); + addr = convert_memory_address (ptr_mode, XEXP (operands[1], 0)); + emit_library_call_value (libfunc, operands[0], LCT_NORMAL, SImode, + addr, ptr_mode, + operands[2], SImode); + DONE; +}) + (define_insn "nvptx_barsync" [(unspec_volatile [(match_operand:SI 0 "nvptx_nonmemory_operand" "Ri") (match_operand:SI 1 "const_int_operand")] diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt index 75c3d54..3845422 100644 --- a/gcc/config/nvptx/nvptx.opt +++ b/gcc/config/nvptx/nvptx.opt @@ -37,7 +37,7 @@ msoft-stack Target Report Mask(SOFT_STACK) Use custom stacks instead of local memory for automatic storage. -msoft-stack-reserve-local +msoft-stack-reserve-local= Target Report Joined RejectNegative UInteger Var(nvptx_softstack_size) Init(128) Specify size of .local memory used for stack when the exact amount is not known. @@ -59,6 +59,7 @@ Enum(ptx_isa) String(sm_30) Value(PTX_ISA_SM30) EnumValue Enum(ptx_isa) String(sm_35) Value(PTX_ISA_SM35) +; Default needs to be in sync with default in ASM_SPEC in nvptx.h. misa= -Target RejectNegative ToLower Joined Enum(ptx_isa) Var(ptx_isa_option) Init(PTX_ISA_SM30) +Target RejectNegative ToLower Joined Enum(ptx_isa) Var(ptx_isa_option) Init(PTX_ISA_SM35) Specify the version of the ptx ISA to use. diff --git a/gcc/config/pa/pa-hpux11.h b/gcc/config/pa/pa-hpux11.h index 794bf8e..2820720 100644 --- a/gcc/config/pa/pa-hpux11.h +++ b/gcc/config/pa/pa-hpux11.h @@ -154,11 +154,6 @@ along with GCC; see the file COPYING3. If not see %{!mt:%{!pthread:-a shared -lc -a archive}}}}\ %{shared:%{mt|pthread:-lpthread}}" -/* The libgcc_stub.a library needs to come last. */ -#undef LINK_GCC_C_SEQUENCE_SPEC -#define LINK_GCC_C_SEQUENCE_SPEC \ - "%G %{!nolibc:%L} %G %{!nostdlib:%{!nodefaultlibs:%{!shared:-lgcc_stub}}}" - #undef STARTFILE_SPEC #define STARTFILE_SPEC \ "%{!shared:%{pg:gcrt0%O%s}%{!pg:%{p:mcrt0%O%s}%{!p:crt0%O%s}} \ diff --git a/gcc/config/pa/pa.c b/gcc/config/pa/pa.c index 07d3287..210e44f 100644 --- a/gcc/config/pa/pa.c +++ b/gcc/config/pa/pa.c @@ -1492,6 +1492,33 @@ hppa_address_cost (rtx X, machine_mode mode ATTRIBUTE_UNUSED, } } +/* Return true if X represents a (possibly non-canonical) shNadd pattern. + The machine mode of X is known to be SImode or DImode. */ + +static bool +hppa_rtx_costs_shadd_p (rtx x) +{ + if (GET_CODE (x) != PLUS + || !REG_P (XEXP (x, 1))) + return false; + rtx op0 = XEXP (x, 0); + if (GET_CODE (op0) == ASHIFT + && CONST_INT_P (XEXP (op0, 1)) + && REG_P (XEXP (op0, 0))) + { + unsigned HOST_WIDE_INT x = UINTVAL (XEXP (op0, 1)); + return x == 1 || x == 2 || x == 3; + } + if (GET_CODE (op0) == MULT + && CONST_INT_P (XEXP (op0, 1)) + && REG_P (XEXP (op0, 0))) + { + unsigned HOST_WIDE_INT x = UINTVAL (XEXP (op0, 1)); + return x == 2 || x == 4 || x == 8; + } + return false; +} + /* Compute a (partial) cost for rtx X. Return true if the complete cost has been computed, and false if subexpressions should be scanned. In either case, *TOTAL contains the cost result. 
*/ @@ -1499,15 +1526,16 @@ hppa_address_cost (rtx X, machine_mode mode ATTRIBUTE_UNUSED, static bool hppa_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UNUSED, - int *total, bool speed ATTRIBUTE_UNUSED) + int *total, bool speed) { - int factor; int code = GET_CODE (x); switch (code) { case CONST_INT: - if (INTVAL (x) == 0) + if (outer_code == SET) + *total = COSTS_N_INSNS (1); + else if (INTVAL (x) == 0) *total = 0; else if (INT_14_BITS (x)) *total = 1; @@ -1530,32 +1558,35 @@ hppa_rtx_costs (rtx x, machine_mode mode, int outer_code, && outer_code != SET) *total = 0; else - *total = 8; + *total = 8; return true; case MULT: if (GET_MODE_CLASS (mode) == MODE_FLOAT) { *total = COSTS_N_INSNS (3); - return true; } - - /* A mode size N times larger than SImode needs O(N*N) more insns. */ - factor = GET_MODE_SIZE (mode) / 4; - if (factor == 0) - factor = 1; - - if (TARGET_PA_11 && !TARGET_DISABLE_FPREGS && !TARGET_SOFT_FLOAT) - *total = factor * factor * COSTS_N_INSNS (8); + else if (mode == DImode) + { + if (TARGET_PA_11 && !TARGET_DISABLE_FPREGS && !TARGET_SOFT_FLOAT) + *total = COSTS_N_INSNS (32); + else + *total = COSTS_N_INSNS (80); + } else - *total = factor * factor * COSTS_N_INSNS (20); - return true; + { + if (TARGET_PA_11 && !TARGET_DISABLE_FPREGS && !TARGET_SOFT_FLOAT) + *total = COSTS_N_INSNS (8); + else + *total = COSTS_N_INSNS (20); + } + return REG_P (XEXP (x, 0)) && REG_P (XEXP (x, 1)); case DIV: if (GET_MODE_CLASS (mode) == MODE_FLOAT) { *total = COSTS_N_INSNS (14); - return true; + return false; } /* FALLTHRU */ @@ -1563,34 +1594,137 @@ hppa_rtx_costs (rtx x, machine_mode mode, int outer_code, case MOD: case UMOD: /* A mode size N times larger than SImode needs O(N*N) more insns. */ - factor = GET_MODE_SIZE (mode) / 4; - if (factor == 0) - factor = 1; - - *total = factor * factor * COSTS_N_INSNS (60); - return true; + if (mode == DImode) + *total = COSTS_N_INSNS (240); + else + *total = COSTS_N_INSNS (60); + return REG_P (XEXP (x, 0)) && REG_P (XEXP (x, 1)); case PLUS: /* this includes shNadd insns */ case MINUS: if (GET_MODE_CLASS (mode) == MODE_FLOAT) + *total = COSTS_N_INSNS (3); + else if (mode == DImode) { - *total = COSTS_N_INSNS (3); - return true; + if (TARGET_64BIT) + { + *total = COSTS_N_INSNS (1); + /* Handle shladd,l instructions. */ + if (hppa_rtx_costs_shadd_p (x)) + return true; + } + else + *total = COSTS_N_INSNS (2); } - - /* A size N times larger than UNITS_PER_WORD needs N times as - many insns, taking N times as long. */ - factor = GET_MODE_SIZE (mode) / UNITS_PER_WORD; - if (factor == 0) - factor = 1; - *total = factor * COSTS_N_INSNS (1); - return true; + else + { + *total = COSTS_N_INSNS (1); + /* Handle shNadd instructions. 
*/ + if (hppa_rtx_costs_shadd_p (x)) + return true; + } + return REG_P (XEXP (x, 0)) + && (REG_P (XEXP (x, 1)) + || CONST_INT_P (XEXP (x, 1))); case ASHIFT: + if (mode == DImode) + { + if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1))) + { + if (TARGET_64BIT) + *total = COSTS_N_INSNS (1); + else + *total = COSTS_N_INSNS (2); + return true; + } + else if (TARGET_64BIT) + *total = COSTS_N_INSNS (3); + else if (speed) + *total = COSTS_N_INSNS (13); + else + *total = COSTS_N_INSNS (18); + } + else if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1))) + { + if (TARGET_64BIT) + *total = COSTS_N_INSNS (2); + else + *total = COSTS_N_INSNS (1); + return true; + } + else if (TARGET_64BIT) + *total = COSTS_N_INSNS (4); + else + *total = COSTS_N_INSNS (2); + return REG_P (XEXP (x, 0)) + && (REG_P (XEXP (x, 1)) + || CONST_INT_P (XEXP (x, 1))); + case ASHIFTRT: + if (mode == DImode) + { + if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1))) + { + if (TARGET_64BIT) + *total = COSTS_N_INSNS (1); + else + *total = COSTS_N_INSNS (2); + return true; + } + else if (TARGET_64BIT) + *total = COSTS_N_INSNS (3); + else if (speed) + *total = COSTS_N_INSNS (14); + else + *total = COSTS_N_INSNS (19); + } + else if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1))) + { + if (TARGET_64BIT) + *total = COSTS_N_INSNS (2); + else + *total = COSTS_N_INSNS (1); + return true; + } + else if (TARGET_64BIT) + *total = COSTS_N_INSNS (4); + else + *total = COSTS_N_INSNS (2); + return REG_P (XEXP (x, 0)) + && (REG_P (XEXP (x, 1)) + || CONST_INT_P (XEXP (x, 1))); + case LSHIFTRT: - *total = COSTS_N_INSNS (1); - return true; + if (mode == DImode) + { + if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1))) + { + if (TARGET_64BIT) + *total = COSTS_N_INSNS (1); + else + *total = COSTS_N_INSNS (2); + return true; + } + else if (TARGET_64BIT) + *total = COSTS_N_INSNS (2); + else if (speed) + *total = COSTS_N_INSNS (12); + else + *total = COSTS_N_INSNS (15); + } + else if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1))) + { + *total = COSTS_N_INSNS (1); + return true; + } + else if (TARGET_64BIT) + *total = COSTS_N_INSNS (3); + else + *total = COSTS_N_INSNS (2); + return REG_P (XEXP (x, 0)) + && (REG_P (XEXP (x, 1)) + || CONST_INT_P (XEXP (x, 1))); default: return false; diff --git a/gcc/config/pa/pa.md b/gcc/config/pa/pa.md index 6350c68..3a82fac 100644 --- a/gcc/config/pa/pa.md +++ b/gcc/config/pa/pa.md @@ -6416,9 +6416,32 @@ [(set (match_operand:DI 0 "register_operand" "") (ashift:DI (match_operand:DI 1 "lhs_lshift_operand" "") (match_operand:DI 2 "arith32_operand" "")))] - "TARGET_64BIT" + "" " { + if (!TARGET_64BIT) + { + if (REG_P (operands[0]) && GET_CODE (operands[2]) == CONST_INT) + { + unsigned HOST_WIDE_INT shift = UINTVAL (operands[2]); + if (shift >= 1 && shift <= 31) + { + rtx dst = operands[0]; + rtx src = force_reg (DImode, operands[1]); + emit_insn (gen_shd_internal (gen_highpart (SImode, dst), + gen_lowpart (SImode, src), + GEN_INT (32-shift), + gen_highpart (SImode, src), + GEN_INT (shift))); + emit_insn (gen_ashlsi3 (gen_lowpart (SImode, dst), + gen_lowpart (SImode, src), + GEN_INT (shift))); + DONE; + } + } + /* Fallback to using optabs.c's expand_doubleword_shift. */ + FAIL; + } if (GET_CODE (operands[2]) != CONST_INT) { rtx temp = gen_reg_rtx (DImode); @@ -6604,32 +6627,82 @@ (set_attr "length" "4")]) ; Shift right pair word 0 to 31 bits. 
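To make the !TARGET_64BIT ashldi3 expansion above concrete, here is a rough C equivalent of the sequence it emits for a constant shift count between 1 and 31 (a sketch under those assumptions; the function name and the explicit high/low recombination are illustrative, while the real expansion operates on the SImode high and low parts via gen_shd_internal and gen_ashlsi3).

  /* Hypothetical illustration, not part of the patch.  */
  unsigned long long
  ashldi3_by_const (unsigned long long x, unsigned int n) /* 1 <= n <= 31 */
  {
    unsigned int lo = (unsigned int) x;
    unsigned int hi = (unsigned int) (x >> 32);
    unsigned int new_hi = (lo >> (32 - n)) | (hi << n);  /* the "shd" step */
    unsigned int new_lo = lo << n;                        /* plain SImode shift */
    return ((unsigned long long) new_hi << 32) | new_lo;
  }

The combined-register step is the same shift-pair operation implemented by the shrp patterns below.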
-(define_insn "shrpsi4" - [(set (match_operand:SI 0 "register_operand" "=r,r") - (ior:SI (ashift:SI (match_operand:SI 1 "register_operand" "r,r") - (minus:SI (const_int 32) - (match_operand:SI 3 "shift5_operand" "q,n"))) - (lshiftrt:SI (match_operand:SI 2 "register_operand" "r,r") - (match_dup 3))))] +(define_insn "*shrpsi4_1" + [(set (match_operand:SI 0 "register_operand" "=r") + (match_operator:SI 4 "plus_xor_ior_operator" + [(ashift:SI (match_operand:SI 1 "register_operand" "r") + (minus:SI (const_int 32) + (match_operand:SI 3 "register_operand" "q"))) + (lshiftrt:SI (match_operand:SI 2 "register_operand" "r") + (match_dup 3))]))] "" - "@ - {vshd %1,%2,%0|shrpw %1,%2,%%sar,%0} - {shd|shrpw} %1,%2,%3,%0" + "{vshd %1,%2,%0|shrpw %1,%2,%%sar,%0}" + [(set_attr "type" "shift") + (set_attr "length" "4")]) + +(define_insn "*shrpsi4_2" + [(set (match_operand:SI 0 "register_operand" "=r") + (match_operator:SI 4 "plus_xor_ior_operator" + [(lshiftrt:SI (match_operand:SI 2 "register_operand" "r") + (match_operand:SI 3 "register_operand" "q")) + (ashift:SI (match_operand:SI 1 "register_operand" "r") + (minus:SI (const_int 32) + (match_dup 3)))]))] + "" + "{vshd %1,%2,%0|shrpw %1,%2,%%sar,%0}" [(set_attr "type" "shift") (set_attr "length" "4")]) ; Shift right pair doubleword 0 to 63 bits. -(define_insn "shrpdi4" - [(set (match_operand:DI 0 "register_operand" "=r,r") - (ior:DI (ashift:DI (match_operand:SI 1 "register_operand" "r,r") - (minus:DI (const_int 64) - (match_operand:DI 3 "shift6_operand" "q,n"))) - (lshiftrt:DI (match_operand:DI 2 "register_operand" "r,r") - (match_dup 3))))] +(define_insn "*shrpdi4_1" + [(set (match_operand:DI 0 "register_operand" "=r") + (match_operator:DI 4 "plus_xor_ior_operator" + [(ashift:DI (match_operand:DI 1 "register_operand" "r") + (minus:DI (const_int 64) + (match_operand:DI 3 "register_operand" "q"))) + (lshiftrt:DI (match_operand:DI 2 "register_operand" "r") + (match_dup 3))]))] "TARGET_64BIT" - "@ - shrpd %1,%2,%%sar,%0 - shrpd %1,%2,%3,%0" + "shrpd %1,%2,%%sar,%0" + [(set_attr "type" "shift") + (set_attr "length" "4")]) + +(define_insn "*shrpdi4_2" + [(set (match_operand:DI 0 "register_operand" "=r") + (match_operator:DI 4 "plus_xor_ior_operator" + [(lshiftrt:DI (match_operand:DI 2 "register_operand" "r") + (match_operand:DI 3 "shift6_operand" "q")) + (ashift:DI (match_operand:SI 1 "register_operand" "r") + (minus:DI (const_int 64) + (match_dup 3)))]))] + "TARGET_64BIT" + "shrpd %1,%2,%%sar,%0" + [(set_attr "type" "shift") + (set_attr "length" "4")]) + +(define_insn "*shrpdi4_3" + [(set (match_operand:DI 0 "register_operand" "=r") + (match_operator:DI 5 "plus_xor_ior_operator" + [(ashift:DI (match_operand:DI 1 "register_operand" "r") + (match_operand:DI 3 "const_int_operand" "n")) + (lshiftrt:DI (match_operand:DI 2 "register_operand" "r") + (match_operand:DI 4 "const_int_operand" "n"))]))] + "TARGET_64BIT + && INTVAL (operands[3]) + INTVAL (operands[4]) == 64" + "shrpd %1,%2,%4,%0" + [(set_attr "type" "shift") + (set_attr "length" "4")]) + +(define_insn "*shrpdi4_4" + [(set (match_operand:DI 0 "register_operand" "=r") + (match_operator:DI 5 "plus_xor_ior_operator" + [(lshiftrt:DI (match_operand:DI 2 "register_operand" "r") + (match_operand:DI 4 "const_int_operand" "n")) + (ashift:DI (match_operand:DI 1 "register_operand" "r") + (match_operand:DI 3 "const_int_operand" "n"))]))] + "TARGET_64BIT + && INTVAL (operands[3]) + INTVAL (operands[4]) == 64" + "shrpd %1,%2,%4,%0" [(set_attr "type" "shift") (set_attr "length" "4")]) @@ -6668,7 +6741,7 @@ /* Else expand 
normally. */ }") -(define_insn "" +(define_insn "*rotlsi3_internal" [(set (match_operand:SI 0 "register_operand" "=r") (rotate:SI (match_operand:SI 1 "register_operand" "r") (match_operand:SI 2 "const_int_operand" "n")))] @@ -6681,6 +6754,54 @@ [(set_attr "type" "shift") (set_attr "length" "4")]) +(define_insn "rotrdi3" + [(set (match_operand:DI 0 "register_operand" "=r,r") + (rotatert:DI (match_operand:DI 1 "register_operand" "r,r") + (match_operand:DI 2 "shift6_operand" "q,n")))] + "TARGET_64BIT" + "* +{ + if (GET_CODE (operands[2]) == CONST_INT) + { + operands[2] = GEN_INT (INTVAL (operands[2]) & 63); + return \"shrpd %1,%1,%2,%0\"; + } + else + return \"shrpd %1,%1,%%sar,%0\"; +}" + [(set_attr "type" "shift") + (set_attr "length" "4")]) + +(define_expand "rotldi3" + [(set (match_operand:DI 0 "register_operand" "") + (rotate:DI (match_operand:DI 1 "register_operand" "") + (match_operand:DI 2 "arith32_operand" "")))] + "TARGET_64BIT" + " +{ + if (GET_CODE (operands[2]) != CONST_INT) + { + rtx temp = gen_reg_rtx (DImode); + emit_insn (gen_subdi3 (temp, GEN_INT (64), operands[2])); + emit_insn (gen_rotrdi3 (operands[0], operands[1], temp)); + DONE; + } + /* Else expand normally. */ +}") + +(define_insn "*rotldi3_internal" + [(set (match_operand:DI 0 "register_operand" "=r") + (rotate:DI (match_operand:DI 1 "register_operand" "r") + (match_operand:DI 2 "const_int_operand" "n")))] + "TARGET_64BIT" + "* +{ + operands[2] = GEN_INT ((64 - INTVAL (operands[2])) & 63); + return \"shrpd %1,%1,%2,%0\"; +}" + [(set_attr "type" "shift") + (set_attr "length" "4")]) + (define_insn "" [(set (match_operand:SI 0 "register_operand" "=r") (match_operator:SI 5 "plus_xor_ior_operator" @@ -6705,6 +6826,15 @@ [(set_attr "type" "shift") (set_attr "length" "4")]) +(define_expand "shd_internal" + [(set (match_operand:SI 0 "register_operand") + (ior:SI + (lshiftrt:SI (match_operand:SI 1 "register_operand") + (match_operand:SI 2 "const_int_operand")) + (ashift:SI (match_operand:SI 3 "register_operand") + (match_operand:SI 4 "const_int_operand"))))] + "") + (define_insn "" [(set (match_operand:SI 0 "register_operand" "=r") (and:SI (ashift:SI (match_operand:SI 1 "register_operand" "r") diff --git a/gcc/config/pa/pa32-linux.h b/gcc/config/pa/pa32-linux.h index f271bbf..970722a 100644 --- a/gcc/config/pa/pa32-linux.h +++ b/gcc/config/pa/pa32-linux.h @@ -57,6 +57,11 @@ call_ ## FUNC (void) \ } #endif +/* We need to link against libgcc.a for __canonicalize_funcptr_for_compare + and $$dyncall. */ +#undef ENDFILE_SPEC +#define ENDFILE_SPEC GNU_USER_TARGET_ENDFILE_SPEC "libgcc.a%s" + #undef WCHAR_TYPE #define WCHAR_TYPE "long int" diff --git a/gcc/config/pa/pa64-hpux.h b/gcc/config/pa/pa64-hpux.h index c7d127f7..096aa4b 100644 --- a/gcc/config/pa/pa64-hpux.h +++ b/gcc/config/pa/pa64-hpux.h @@ -103,12 +103,6 @@ along with GCC; see the file COPYING3. If not see %{shared:%{mt|pthread:-lpthread}}" #endif -/* The libgcc_stub.a and milli.a libraries need to come last. */ -#undef LINK_GCC_C_SEQUENCE_SPEC -#define LINK_GCC_C_SEQUENCE_SPEC "\ - %G %{!nolibc:%L} %G %{!nostdlib:%{!nodefaultlibs:%{!shared:-lgcc_stub}\ - milli.a%s}}" - /* Under hpux11, the normal location of the `ld' and `as' programs is the /usr/ccs/bin directory. */ @@ -335,8 +329,12 @@ do { \ %{static:crtbeginT%O%s} %{!static:%{!shared:crtbegin%O%s} \ %{shared:crtbeginS%O%s}}" #endif + +/* The libgcc_stub.a and milli.a libraries must come last. We need + to link with these libraries whenever start files are needed. 
*/ #undef ENDFILE_SPEC -#define ENDFILE_SPEC "%{!shared:crtend%O%s} %{shared:crtendS%O%s}" +#define ENDFILE_SPEC \ + "%{!shared:crtend%O%s libgcc_stub.a%s} %{shared:crtendS%O%s} milli.a%s" /* Since HP uses the .init and .fini sections for array initializers and finalizers, we need different defines for INIT_SECTION_ASM_OP diff --git a/gcc/config/pru/pru.c b/gcc/config/pru/pru.c index a715f6b..39104e5 100644 --- a/gcc/config/pru/pru.c +++ b/gcc/config/pru/pru.c @@ -621,7 +621,7 @@ pru_option_override (void) /* Save the initial options in case the user does function specific options. */ target_option_default_node = target_option_current_node - = build_target_option_node (&global_options); + = build_target_option_node (&global_options, &global_options_set); /* Due to difficulties in implementing the TI ABI with GCC, at least check and error-out if GCC cannot compile a @@ -2848,7 +2848,7 @@ pru_set_current_function (tree fndecl) else if (new_tree) { - cl_target_option_restore (&global_options, + cl_target_option_restore (&global_options, &global_options_set, TREE_TARGET_OPTION (new_tree)); target_reinit (); } @@ -2858,7 +2858,7 @@ pru_set_current_function (tree fndecl) struct cl_target_option *def = TREE_TARGET_OPTION (target_option_current_node); - cl_target_option_restore (&global_options, def); + cl_target_option_restore (&global_options, &global_options_set, def); target_reinit (); } } diff --git a/gcc/config/riscv/riscv-c.c b/gcc/config/riscv/riscv-c.c index 735f2f2..c600badb 100644 --- a/gcc/config/riscv/riscv-c.c +++ b/gcc/config/riscv/riscv-c.c @@ -90,12 +90,15 @@ riscv_cpu_cpp_builtins (cpp_reader *pfile) builtin_define ("__riscv_cmodel_medlow"); break; + case CM_PIC: + /* __riscv_cmodel_pic is deprecated, and will removed in next GCC release. + see https://github.com/riscv/riscv-c-api-doc/pull/11 */ + builtin_define ("__riscv_cmodel_pic"); + /* FALLTHROUGH. */ + case CM_MEDANY: builtin_define ("__riscv_cmodel_medany"); break; - case CM_PIC: - builtin_define ("__riscv_cmodel_pic"); - break; } } diff --git a/gcc/config/riscv/riscv-opts.h b/gcc/config/riscv/riscv-opts.h index 8f12e50..2a3f9d9 100644 --- a/gcc/config/riscv/riscv-opts.h +++ b/gcc/config/riscv/riscv-opts.h @@ -51,4 +51,10 @@ enum riscv_align_data { riscv_align_data_type_natural }; +/* Where to get the canary for the stack protector. */ +enum stack_protector_guard { + SSP_TLS, /* per-thread canary in TLS block */ + SSP_GLOBAL /* global canary */ +}; + #endif /* ! 
GCC_RISCV_OPTS_H */ diff --git a/gcc/config/riscv/riscv.c b/gcc/config/riscv/riscv.c index bfb3885..63b0c38 100644 --- a/gcc/config/riscv/riscv.c +++ b/gcc/config/riscv/riscv.c @@ -4775,6 +4775,53 @@ riscv_option_override (void) " [%<-mriscv-attribute%>]"); #endif + if (riscv_stack_protector_guard == SSP_GLOBAL + && global_options_set.x_riscv_stack_protector_guard_offset_str) + { + error ("incompatible options %<-mstack-protector-guard=global%> and " + "%<-mstack-protector-guard-offset=%s%>", + riscv_stack_protector_guard_offset_str); + } + + if (riscv_stack_protector_guard == SSP_TLS + && !(global_options_set.x_riscv_stack_protector_guard_offset_str + && global_options_set.x_riscv_stack_protector_guard_reg_str)) + { + error ("both %<-mstack-protector-guard-offset%> and " + "%<-mstack-protector-guard-reg%> must be used " + "with %<-mstack-protector-guard=sysreg%>"); + } + + if (global_options_set.x_riscv_stack_protector_guard_reg_str) + { + const char *str = riscv_stack_protector_guard_reg_str; + int reg = decode_reg_name (str); + + if (!IN_RANGE (reg, GP_REG_FIRST + 1, GP_REG_LAST)) + error ("%qs is not a valid base register in %qs", str, + "-mstack-protector-guard-reg="); + + riscv_stack_protector_guard_reg = reg; + } + + if (global_options_set.x_riscv_stack_protector_guard_offset_str) + { + char *end; + const char *str = riscv_stack_protector_guard_offset_str; + errno = 0; + long offs = strtol (riscv_stack_protector_guard_offset_str, &end, 0); + + if (!*str || *end || errno) + error ("%qs is not a valid number in %qs", str, + "-mstack-protector-guard-offset="); + + if (!SMALL_OPERAND (offs)) + error ("%qs is not a valid offset in %qs", str, + "-mstack-protector-guard-offset="); + + riscv_stack_protector_guard_offset = offs; + } + } /* Implement TARGET_CONDITIONAL_REGISTER_USAGE. */ diff --git a/gcc/config/riscv/riscv.h b/gcc/config/riscv/riscv.h index 9f67d82..b7b4a1c 100644 --- a/gcc/config/riscv/riscv.h +++ b/gcc/config/riscv/riscv.h @@ -941,7 +941,7 @@ extern unsigned riscv_stack_boundary; /* This is the maximum value that can be represented in a compressed load/store offset (an unsigned 5-bit value scaled by 4). */ -#define CSW_MAX_OFFSET ((4LL << C_S_BITS) - 1) & ~3 +#define CSW_MAX_OFFSET (((4LL << C_S_BITS) - 1) & ~3) /* Called from RISCV_REORG, this is defined in riscv-sr.c. */ diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md index 95a02eca..f15bad3 100644 --- a/gcc/config/riscv/riscv.md +++ b/gcc/config/riscv/riscv.md @@ -65,6 +65,10 @@ UNSPECV_BLOCKAGE UNSPECV_FENCE UNSPECV_FENCE_I + + ;; Stack Smash Protector + UNSPEC_SSP_SET + UNSPEC_SSP_TEST ]) (define_constants @@ -2523,6 +2527,82 @@ "" {}) +;; Named patterns for stack smashing protection. + +(define_expand "stack_protect_set" + [(match_operand 0 "memory_operand") + (match_operand 1 "memory_operand")] + "" +{ + machine_mode mode = GET_MODE (operands[0]); + if (riscv_stack_protector_guard == SSP_TLS) + { + rtx reg = gen_rtx_REG (Pmode, riscv_stack_protector_guard_reg); + rtx offset = GEN_INT (riscv_stack_protector_guard_offset); + rtx addr = gen_rtx_PLUS (Pmode, reg, offset); + operands[1] = gen_rtx_MEM (Pmode, addr); + } + + emit_insn ((mode == DImode + ? gen_stack_protect_set_di + : gen_stack_protect_set_si) (operands[0], operands[1])); + DONE; +}) + +;; DO NOT SPLIT THIS PATTERN. It is important for security reasons that the +;; canary value does not live beyond the life of this sequence. 
+(define_insn "stack_protect_set_<mode>" + [(set (match_operand:GPR 0 "memory_operand" "=m") + (unspec:GPR [(match_operand:GPR 1 "memory_operand" "m")] + UNSPEC_SSP_SET)) + (set (match_scratch:GPR 2 "=&r") (const_int 0))] + "" + "<load>\\t%2, %1\;<store>\\t%2, %0\;li\t%2, 0" + [(set_attr "length" "12")]) + +(define_expand "stack_protect_test" + [(match_operand 0 "memory_operand") + (match_operand 1 "memory_operand") + (match_operand 2)] + "" +{ + rtx result; + machine_mode mode = GET_MODE (operands[0]); + + result = gen_reg_rtx(mode); + if (riscv_stack_protector_guard == SSP_TLS) + { + rtx reg = gen_rtx_REG (Pmode, riscv_stack_protector_guard_reg); + rtx offset = GEN_INT (riscv_stack_protector_guard_offset); + rtx addr = gen_rtx_PLUS (Pmode, reg, offset); + operands[1] = gen_rtx_MEM (Pmode, addr); + } + emit_insn ((mode == DImode + ? gen_stack_protect_test_di + : gen_stack_protect_test_si) (result, + operands[0], + operands[1])); + + if (mode == DImode) + emit_jump_insn (gen_cbranchdi4 (gen_rtx_EQ (VOIDmode, result, const0_rtx), + result, const0_rtx, operands[2])); + else + emit_jump_insn (gen_cbranchsi4 (gen_rtx_EQ (VOIDmode, result, const0_rtx), + result, const0_rtx, operands[2])); + + DONE; +}) + +(define_insn "stack_protect_test_<mode>" + [(set (match_operand:GPR 0 "register_operand" "=r") + (unspec:GPR [(match_operand:GPR 1 "memory_operand" "m") + (match_operand:GPR 2 "memory_operand" "m")] + UNSPEC_SSP_TEST)) + (clobber (match_scratch:GPR 3 "=&r"))] + "" + "<load>\t%3, %1\;<load>\t%0, %2\;xor\t%0, %3, %0\;li\t%3, 0" + [(set_attr "length" "12")]) + (include "sync.md") (include "peephole.md") (include "pic.md") diff --git a/gcc/config/riscv/riscv.opt b/gcc/config/riscv/riscv.opt index e4bfcb8..f01d3ab 100644 --- a/gcc/config/riscv/riscv.opt +++ b/gcc/config/riscv/riscv.opt @@ -151,3 +151,31 @@ Enum(riscv_align_data) String(xlen) Value(riscv_align_data_type_xlen) EnumValue Enum(riscv_align_data) String(natural) Value(riscv_align_data_type_natural) + +mstack-protector-guard= +Target RejectNegative Joined Enum(stack_protector_guard) Var(riscv_stack_protector_guard) Init(SSP_GLOBAL) +Use given stack-protector guard. + +Enum +Name(stack_protector_guard) Type(enum stack_protector_guard) +Valid arguments to -mstack-protector-guard=: + +EnumValue +Enum(stack_protector_guard) String(tls) Value(SSP_TLS) + +EnumValue +Enum(stack_protector_guard) String(global) Value(SSP_GLOBAL) + +mstack-protector-guard-reg= +Target RejectNegative Joined Var(riscv_stack_protector_guard_reg_str) +Use the given base register for addressing the stack-protector guard. + +TargetVariable +int riscv_stack_protector_guard_reg = 0 + +mstack-protector-guard-offset= +Target RejectNegative Joined Integer Var(riscv_stack_protector_guard_offset_str) +Use the given offset for addressing the stack-protector guard. + +TargetVariable +long riscv_stack_protector_guard_offset = 0 diff --git a/gcc/config/rs6000/aix71.h b/gcc/config/rs6000/aix71.h index 5f98eb0..8b12a2d 100644 --- a/gcc/config/rs6000/aix71.h +++ b/gcc/config/rs6000/aix71.h @@ -64,8 +64,9 @@ do { \ } \ } while (0) -#undef ASM_SPEC -#define ASM_SPEC "-u %{maix64:-a64} %(asm_cpu)" +#define ASM_SPEC32 "-a32" +#define ASM_SPEC64 "-a64" +#define ASM_SPEC_COMMON "-u %(asm_cpu)" /* Common ASM definitions used by ASM_SPEC amongst the various targets for handling -mcpu=xxx switches. 
There is a parallel list in driver-rs6000.c to @@ -91,10 +92,7 @@ do { \ mcpu=620: -m620; \ mcpu=630: -m620; \ mcpu=970|mcpu=G5: -m970; \ - !mcpu*: %{mvsx: -mpwr6; \ - maltivec: -m970; \ - maix64|mpowerpc64: -mppc64; \ - : %(asm_default)}} \ + !mcpu*: %(asm_default)} \ -many" #undef ASM_DEFAULT_SPEC @@ -114,19 +112,17 @@ do { \ } \ while (0) -#undef CPP_SPEC -#define CPP_SPEC "%{posix: -D_POSIX_SOURCE} \ +#define CPP_SPEC32 "" +#define CPP_SPEC64 "-D__64BIT__" +#define CPP_SPEC_COMMON "%{posix: -D_POSIX_SOURCE} \ %{ansi: -D_ANSI_C_SOURCE} \ - %{maix64: -D__64BIT__} \ %{mpe: -I%R/usr/lpp/ppe.poe/include} \ %{pthread: -D_THREAD_SAFE}" /* The GNU C++ standard library requires that these macros be defined. Synchronize with libstdc++ os_defines.h. */ -#undef CPLUSPLUS_CPP_SPEC -#define CPLUSPLUS_CPP_SPEC \ +#define CPLUSPLUS_CPP_SPEC_COMMON \ "-D_ALL_SOURCE -D__COMPATMATH__ \ - %{maix64: -D__64BIT__} \ %{mpe: -I%R/usr/lpp/ppe.poe/include} \ %{pthread: -D_THREAD_SAFE}" @@ -135,7 +131,11 @@ do { \ #undef RS6000_CPU #undef TARGET_DEFAULT +#ifdef RS6000_BI_ARCH +#define TARGET_DEFAULT (MASK_PPC_GPOPT | MASK_PPC_GFXOPT | MASK_MFCRF | MASK_POWERPC64 | MASK_64BIT) +#else #define TARGET_DEFAULT (MASK_PPC_GPOPT | MASK_PPC_GFXOPT | MASK_MFCRF) +#endif #undef PROCESSOR_DEFAULT #define PROCESSOR_DEFAULT PROCESSOR_POWER7 @@ -154,29 +154,78 @@ do { \ the target makefile fragment or if none of the options listed in `MULTILIB_OPTIONS' are set by default. *Note Target Fragment::. */ -#undef MULTILIB_DEFAULTS +#undef MULTILIB_DEFAULTS -#undef LIB_SPEC -#define LIB_SPEC "%{pg:-L%R/lib/profiled -L%R/usr/lib/profiled}\ +#define DEFAULT_ARCH64_P (TARGET_DEFAULT & MASK_64BIT) + +#define LIB_SPEC32 "%{!shared:%{g*:-lg}}" +#define LIB_SPEC64 "" +#define LIB_SPEC_COMMON "%{pg:-L%R/lib/profiled -L%R/usr/lib/profiled}\ %{p:-L%R/lib/profiled -L%R/usr/lib/profiled}\ - %{!maix64:%{!shared:%{g*:-lg}}}\ %{fprofile-arcs|fprofile-generate*|coverage:-lpthreads}\ %{mpe:-L%R/usr/lpp/ppe.poe/lib -lmpi -lvtd}\ %{mlong-double-128:-lc128}\ %{pthread:-lpthreads} -lc" -#undef LINK_SPEC -#define LINK_SPEC "-bpT:0x10000000 -bpD:0x20000000 %{!r:-btextro}\ +#define LINK_SPEC32 "%{!shared:%{g*: %(link_libg) }} -b32" +#define LINK_SPEC64 "-b64" +#define LINK_SPEC_COMMON "-bpT:0x10000000 -bpD:0x20000000 %{!r:-btextro}\ %{static:-bnso %(link_syscalls) } %{shared:-bM:SRE %{!e:-bnoentry}}\ - %{!maix64:%{!shared:%{g*: %(link_libg) }}} %{maix64:-b64}\ - %{mpe:-binitfini:poe_remote_main}" + %{mpe:-binitfini:poe_remote_main} " #undef STARTFILE_SPEC +#if DEFAULT_ARCH64_P +#define STARTFILE_SPEC "%{!shared:\ + %{!maix32:%{pg:gcrt0_64%O%s;:%{p:mcrt0_64%O%s;:crt0_64%O%s}};:\ + %{pthread:%{pg:gcrt0_r%O%s;:%{p:mcrt0_r%O%s;:crt0_r%O%s}};:\ + %{pg:gcrt0%O%s;:%{p:mcrt0%O%s;:crt0%O%s}}}}}\ + %{!maix32:%{shared:crtcxa_64_s%O%s;:crtcxa_64%O%s} crtdbase_64%O%s;:\ + %{shared:crtcxa_s%O%s;:crtcxa%O%s} crtdbase%O%s}" +#else #define STARTFILE_SPEC "%{!shared:\ %{maix64:%{pg:gcrt0_64%O%s;:%{p:mcrt0_64%O%s;:crt0_64%O%s}};:\ %{pthread:%{pg:gcrt0_r%O%s;:%{p:mcrt0_r%O%s;:crt0_r%O%s}};:\ %{pg:gcrt0%O%s;:%{p:mcrt0%O%s;:crt0%O%s}}}}}\ - %{shared:crtcxa_s%O%s;:crtcxa%O%s} crtdbase%O%s" + %{maix64:%{shared:crtcxa_64_s%O%s;:crtcxa_64%O%s} crtdbase_64%O%s;:\ + %{shared:crtcxa_s%O%s;:crtcxa%O%s} crtdbase%O%s}" +#endif + + +#undef ASM_SPEC +#undef CPP_SPEC +#undef CPLUSPLUS_CPP_SPEC +#undef LIB_SPEC +#undef LINK_SPEC + +#if DEFAULT_ARCH64_P +#define ASM_SPEC "%{maix32:%(asm_spec32);:%(asm_spec64)} %(asm_spec_common)" +#define CPP_SPEC "%{maix32:%(cpp_spec32);:%(cpp_spec64)} 
%(cpp_spec_common)" +#define CPLUSPLUS_CPP_SPEC "%{maix32:%(cpp_spec32);:%(cpp_spec64)} %(cplusplus_cpp_spec_common)" +#define LIB_SPEC "%{maix32:%(lib_spec32);:%(lib_spec64)} %(lib_spec_common)" +#define LINK_SPEC "%{maix32:%(link_spec32);:%(link_spec64)} %(link_spec_common)" +#else +#define ASM_SPEC "%{maix64:%(asm_spec64);:%(asm_spec32)} %(asm_spec_common)" +#define CPP_SPEC "%{maix64:%(cpp_spec64);:%(cpp_spec32)} %(cpp_spec_common)" +#define CPLUSPLUS_CPP_SPEC "%{maix64:%(cpp_spec64);:%(cpp_spec32)} %(cplusplus_cpp_spec_common)" +#define LIB_SPEC "%{maix64:%(lib_spec64);:%(lib_spec32)} %(lib_spec_common)" +#define LINK_SPEC "%{maix64:%(link_spec64);:%(link_spec32)} %(link_spec_common)" +#endif + +#undef SUBTARGET_EXTRA_SPECS +#define SUBTARGET_EXTRA_SPECS \ + { "asm_spec_common", ASM_SPEC_COMMON }, \ + { "asm_spec32", ASM_SPEC32 }, \ + { "asm_spec64", ASM_SPEC64 }, \ + { "cpp_spec_common", CPP_SPEC_COMMON }, \ + { "cplusplus_cpp_spec_common", CPLUSPLUS_CPP_SPEC_COMMON }, \ + { "cpp_spec32", CPP_SPEC32 }, \ + { "cpp_spec64", CPP_SPEC64 }, \ + { "lib_spec_common", LIB_SPEC_COMMON }, \ + { "lib_spec32", LIB_SPEC32 }, \ + { "lib_spec64", LIB_SPEC64 }, \ + { "link_spec_common", LINK_SPEC_COMMON }, \ + { "link_spec32", LINK_SPEC32 }, \ + { "link_spec64", LINK_SPEC64 }, /* AIX V5 typedefs ptrdiff_t as "long" while earlier releases used "int". */ diff --git a/gcc/config/rs6000/aix72.h b/gcc/config/rs6000/aix72.h index b0262af..121420b 100644 --- a/gcc/config/rs6000/aix72.h +++ b/gcc/config/rs6000/aix72.h @@ -131,8 +131,10 @@ do { \ #include "rs6000-cpus.def" #undef RS6000_CPU -#ifndef RS6000_BI_ARCH #undef TARGET_DEFAULT +#ifdef RS6000_BI_ARCH +#define TARGET_DEFAULT (ISA_2_6_MASKS_EMBEDDED | MASK_POWERPC64 | MASK_64BIT) +#else #define TARGET_DEFAULT ISA_2_6_MASKS_EMBEDDED #endif diff --git a/gcc/config/rs6000/altivec.h b/gcc/config/rs6000/altivec.h index 6c43124..8a2dcda 100644 --- a/gcc/config/rs6000/altivec.h +++ b/gcc/config/rs6000/altivec.h @@ -407,10 +407,6 @@ #define vec_vpopcnth __builtin_vec_vpopcnth #define vec_vpopcntw __builtin_vec_vpopcntw #define vec_popcnt __builtin_vec_vpopcntu -#define vec_popcntb __builtin_vec_vpopcntub -#define vec_popcnth __builtin_vec_vpopcntuh -#define vec_popcntw __builtin_vec_vpopcntuw -#define vec_popcntd __builtin_vec_vpopcntud #define vec_vrld __builtin_vec_vrld #define vec_vsld __builtin_vec_vsld #define vec_vsrad __builtin_vec_vsrad @@ -493,6 +489,9 @@ #define vec_cntlz_lsbb __builtin_vec_vclzlsbb #define vec_cnttz_lsbb __builtin_vec_vctzlsbb +#define vec_test_lsbb_all_ones __builtin_vec_xvtlsbb_all_ones +#define vec_test_lsbb_all_zeros __builtin_vec_xvtlsbb_all_zeros + #define vec_xlx __builtin_vec_vextulx #define vec_xrx __builtin_vec_vexturx #endif @@ -700,6 +699,17 @@ __altivec_scalar_pred(vec_any_nle, /* Overloaded built-in functions for ISA 3.1. 
*/ #define vec_extractl(a, b, c) __builtin_vec_extractl (a, b, c) #define vec_extracth(a, b, c) __builtin_vec_extracth (a, b, c) +#define vec_insertl(a, b, c) __builtin_vec_insertl (a, b, c) +#define vec_inserth(a, b, c) __builtin_vec_inserth (a, b, c) +#define vec_replace_elt(a, b, c) __builtin_vec_replace_elt (a, b, c) +#define vec_replace_unaligned(a, b, c) __builtin_vec_replace_un (a, b, c) +#define vec_sldb(a, b, c) __builtin_vec_sldb (a, b, c) +#define vec_srdb(a, b, c) __builtin_vec_srdb (a, b, c) +#define vec_splati(a) __builtin_vec_xxspltiw (a) +#define vec_splatid(a) __builtin_vec_xxspltid (a) +#define vec_splati_ins(a, b, c) __builtin_vec_xxsplti32dx (a, b, c) +#define vec_blendv(a, b, c) __builtin_vec_xxblend (a, b, c) +#define vec_permx(a, b, c, d) __builtin_vec_xxpermx (a, b, c, d) #define vec_gnb(a, b) __builtin_vec_gnb (a, b) #define vec_clrl(a, b) __builtin_vec_clrl (a, b) diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index 0481642..0a2e634 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -173,8 +173,13 @@ UNSPEC_XXEVAL UNSPEC_VSTRIR UNSPEC_VSTRIL - UNSPEC_EXTRACTL - UNSPEC_EXTRACTR + UNSPEC_SLDB + UNSPEC_SRDB + UNSPEC_XXSPLTIW + UNSPEC_XXSPLTID + UNSPEC_XXSPLTI32DX + UNSPEC_XXBLEND + UNSPEC_XXPERMX ]) (define_c_enum "unspecv" @@ -185,8 +190,6 @@ UNSPECV_DSS ]) -;; Like VI, defined in vector.md, but add ISA 2.07 integer vector ops -(define_mode_iterator VI2 [V4SI V8HI V16QI V2DI]) ;; Short vec int modes (define_mode_iterator VIshort [V8HI V16QI]) ;; Longer vec int modes for rotate/mask ops @@ -219,6 +222,21 @@ (KF "FLOAT128_VECTOR_P (KFmode)") (TF "FLOAT128_VECTOR_P (TFmode)")]) +;; Like VM2, just do char, short, int, long, float and double +(define_mode_iterator VM3 [V4SI + V8HI + V16QI + V4SF + V2DF + V2DI]) + +(define_mode_attr VM3_char [(V2DI "d") + (V4SI "w") + (V8HI "h") + (V16QI "b") + (V2DF "d") + (V4SF "w")]) + ;; Map the Vector convert single precision to double precision for integer ;; versus floating point (define_mode_attr VS_sxwsp [(V4SI "sxw") (V4SF "sp")]) @@ -787,64 +805,177 @@ DONE; }) -(define_expand "vextractl<mode>" - [(set (match_operand:V2DI 0 "altivec_register_operand") - (unspec:V2DI [(match_operand:VI2 1 "altivec_register_operand") - (match_operand:VI2 2 "altivec_register_operand") - (match_operand:SI 3 "register_operand")] - UNSPEC_EXTRACTL))] +;; Map UNSPEC_SLDB to "l" and UNSPEC_SRDB to "r". 
+(define_int_attr SLDB_lr [(UNSPEC_SLDB "l") + (UNSPEC_SRDB "r")]) + +(define_int_iterator VSHIFT_DBL_LR [UNSPEC_SLDB UNSPEC_SRDB]) + +(define_insn "vs<SLDB_lr>db_<mode>" + [(set (match_operand:VI2 0 "register_operand" "=v") + (unspec:VI2 [(match_operand:VI2 1 "register_operand" "v") + (match_operand:VI2 2 "register_operand" "v") + (match_operand:QI 3 "const_0_to_12_operand" "n")] + VSHIFT_DBL_LR))] "TARGET_POWER10" + "vs<SLDB_lr>dbi %0,%1,%2,%3" + [(set_attr "type" "vecsimple")]) + +(define_insn "xxspltiw_v4si" + [(set (match_operand:V4SI 0 "register_operand" "=wa") + (unspec:V4SI [(match_operand:SI 1 "s32bit_cint_operand" "n")] + UNSPEC_XXSPLTIW))] + "TARGET_POWER10" + "xxspltiw %x0,%1" + [(set_attr "type" "vecsimple")]) + +(define_expand "xxspltiw_v4sf" + [(set (match_operand:V4SF 0 "register_operand" "=wa") + (unspec:V4SF [(match_operand:SF 1 "const_double_operand" "n")] + UNSPEC_XXSPLTIW))] + "TARGET_POWER10" { - if (BYTES_BIG_ENDIAN) - { - emit_insn (gen_vextractl<mode>_internal (operands[0], operands[1], - operands[2], operands[3])); - emit_insn (gen_xxswapd_v2di (operands[0], operands[0])); - } - else - emit_insn (gen_vextractr<mode>_internal (operands[0], operands[2], - operands[1], operands[3])); + long long value = rs6000_const_f32_to_i32 (operands[1]); + emit_insn (gen_xxspltiw_v4sf_inst (operands[0], GEN_INT (value))); DONE; }) -(define_insn "vextractl<mode>_internal" - [(set (match_operand:V2DI 0 "altivec_register_operand" "=v") - (unspec:V2DI [(match_operand:VEC_I 1 "altivec_register_operand" "v") - (match_operand:VEC_I 2 "altivec_register_operand" "v") - (match_operand:SI 3 "register_operand" "r")] - UNSPEC_EXTRACTL))] +(define_insn "xxspltiw_v4sf_inst" + [(set (match_operand:V4SF 0 "register_operand" "=wa") + (unspec:V4SF [(match_operand:SI 1 "c32bit_cint_operand" "n")] + UNSPEC_XXSPLTIW))] + "TARGET_POWER10" + "xxspltiw %x0,%1" + [(set_attr "type" "vecsimple")]) + +(define_expand "xxspltidp_v2df" + [(set (match_operand:V2DF 0 "register_operand" ) + (unspec:V2DF [(match_operand:SF 1 "const_double_operand")] + UNSPEC_XXSPLTID))] + "TARGET_POWER10" +{ + long value = rs6000_const_f32_to_i32 (operands[1]); + rs6000_emit_xxspltidp_v2df (operands[0], value); + DONE; +}) + +(define_insn "xxspltidp_v2df_inst" + [(set (match_operand:V2DF 0 "register_operand" "=wa") + (unspec:V2DF [(match_operand:SI 1 "c32bit_cint_operand" "n")] + UNSPEC_XXSPLTID))] "TARGET_POWER10" - "vext<du_or_d><wd>vlx %0,%1,%2,%3" + "xxspltidp %x0,%1" [(set_attr "type" "vecsimple")]) -(define_expand "vextractr<mode>" - [(set (match_operand:V2DI 0 "altivec_register_operand") - (unspec:V2DI [(match_operand:VI2 1 "altivec_register_operand") - (match_operand:VI2 2 "altivec_register_operand") - (match_operand:SI 3 "register_operand")] - UNSPEC_EXTRACTR))] +(define_expand "xxsplti32dx_v4si" + [(set (match_operand:V4SI 0 "register_operand" "=wa") + (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "0") + (match_operand:QI 2 "u1bit_cint_operand" "n") + (match_operand:SI 3 "s32bit_cint_operand" "n")] + UNSPEC_XXSPLTI32DX))] + "TARGET_POWER10" +{ + int index = INTVAL (operands[2]); + + if (!BYTES_BIG_ENDIAN) + index = 1 - index; + + emit_insn (gen_xxsplti32dx_v4si_inst (operands[0], operands[1], + GEN_INT (index), operands[3])); + DONE; +} + [(set_attr "type" "vecsimple")]) + +(define_insn "xxsplti32dx_v4si_inst" + [(set (match_operand:V4SI 0 "register_operand" "=wa") + (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "0") + (match_operand:QI 2 "u1bit_cint_operand" "n") + (match_operand:SI 3 
"s32bit_cint_operand" "n")] + UNSPEC_XXSPLTI32DX))] + "TARGET_POWER10" + "xxsplti32dx %x0,%2,%3" + [(set_attr "type" "vecsimple")]) + +(define_expand "xxsplti32dx_v4sf" + [(set (match_operand:V4SF 0 "register_operand" "=wa") + (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "0") + (match_operand:QI 2 "u1bit_cint_operand" "n") + (match_operand:SF 3 "const_double_operand" "n")] + UNSPEC_XXSPLTI32DX))] + "TARGET_POWER10" +{ + int index = INTVAL (operands[2]); + long value = rs6000_const_f32_to_i32 (operands[3]); + if (!BYTES_BIG_ENDIAN) + index = 1 - index; + + emit_insn (gen_xxsplti32dx_v4sf_inst (operands[0], operands[1], + GEN_INT (index), GEN_INT (value))); + DONE; +}) + +(define_insn "xxsplti32dx_v4sf_inst" + [(set (match_operand:V4SF 0 "register_operand" "=wa") + (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "0") + (match_operand:QI 2 "u1bit_cint_operand" "n") + (match_operand:SI 3 "s32bit_cint_operand" "n")] + UNSPEC_XXSPLTI32DX))] + "TARGET_POWER10" + "xxsplti32dx %x0,%2,%3" + [(set_attr "type" "vecsimple")]) + +(define_insn "xxblend_<mode>" + [(set (match_operand:VM3 0 "register_operand" "=wa") + (unspec:VM3 [(match_operand:VM3 1 "register_operand" "wa") + (match_operand:VM3 2 "register_operand" "wa") + (match_operand:VM3 3 "register_operand" "wa")] + UNSPEC_XXBLEND))] + "TARGET_POWER10" + "xxblendv<VM3_char> %x0,%x1,%x2,%x3" + [(set_attr "type" "vecsimple")]) + +(define_expand "xxpermx" + [(set (match_operand:V2DI 0 "register_operand" "+wa") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "wa") + (match_operand:V2DI 2 "register_operand" "wa") + (match_operand:V16QI 3 "register_operand" "wa") + (match_operand:QI 4 "u8bit_cint_operand" "n")] + UNSPEC_XXPERMX))] "TARGET_POWER10" { if (BYTES_BIG_ENDIAN) + emit_insn (gen_xxpermx_inst (operands[0], operands[1], + operands[2], operands[3], + operands[4])); + else { - emit_insn (gen_vextractr<mode>_internal (operands[0], operands[1], - operands[2], operands[3])); - emit_insn (gen_xxswapd_v2di (operands[0], operands[0])); + /* Reverse value of byte element indexes by XORing with 0xFF. + Reverse the 32-byte section identifier match by subracting bits [0:2] + of elemet from 7. 
*/ + int value = INTVAL (operands[4]); + rtx vreg = gen_reg_rtx (V16QImode); + + emit_insn (gen_xxspltib_v16qi (vreg, GEN_INT (-1))); + emit_insn (gen_xorv16qi3 (operands[3], operands[3], vreg)); + value = 7 - value; + emit_insn (gen_xxpermx_inst (operands[0], operands[2], + operands[1], operands[3], + GEN_INT (value))); } - else - emit_insn (gen_vextractl<mode>_internal (operands[0], operands[2], - operands[1], operands[3])); + DONE; -}) +} + [(set_attr "type" "vecsimple")]) -(define_insn "vextractr<mode>_internal" - [(set (match_operand:V2DI 0 "altivec_register_operand" "=v") - (unspec:V2DI [(match_operand:VEC_I 1 "altivec_register_operand" "v") - (match_operand:VEC_I 2 "altivec_register_operand" "v") - (match_operand:SI 3 "register_operand" "r")] - UNSPEC_EXTRACTR))] +(define_insn "xxpermx_inst" + [(set (match_operand:V2DI 0 "register_operand" "+v") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "v") + (match_operand:V2DI 2 "register_operand" "v") + (match_operand:V16QI 3 "register_operand" "v") + (match_operand:QI 4 "u3bit_cint_operand" "n")] + UNSPEC_XXPERMX))] "TARGET_POWER10" - "vext<du_or_d><wd>vrx %0,%1,%2,%3" + "xxpermx %x0,%x1,%x2,%x3,%4" [(set_attr "type" "vecsimple")]) (define_expand "vstrir_<mode>" diff --git a/gcc/config/rs6000/defaultaix64.h b/gcc/config/rs6000/defaultaix64.h deleted file mode 100644 index ecac576..0000000 --- a/gcc/config/rs6000/defaultaix64.h +++ /dev/null @@ -1,28 +0,0 @@ -/* Definitions of target machine for GNU compiler, - for 64 bit powerpc linux defaulting to -m64. - Copyright (C) 2003-2020 Free Software Foundation, Inc. - -This file is part of GCC. - -GCC is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 3, or (at your option) -any later version. - -GCC is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with GCC; see the file COPYING3. If not see -<http://www.gnu.org/licenses/>. 
*/ - -#define RS6000_CPU(NAME, CPU, FLAGS) -#include "rs6000-cpus.def" -#undef RS6000_CPU - -#undef TARGET_DEFAULT -#define TARGET_DEFAULT (ISA_2_6_MASKS_EMBEDDED | MASK_POWERPC64 | MASK_64BIT) -#undef ASM_DEFAULT_SPEC -#define ASM_DEFAULT_SPEC "-mpwr7" diff --git a/gcc/config/rs6000/dfp.md b/gcc/config/rs6000/dfp.md index e91d6f5..8f82273 100644 --- a/gcc/config/rs6000/dfp.md +++ b/gcc/config/rs6000/dfp.md @@ -155,6 +155,19 @@ [(set_attr "type" "dfp") (set_attr "length" "8")]) +(define_insn "trunctdsd2" + [(set (match_operand:SD 0 "gpc_reg_operand" "=d,d") + (float_truncate:SD (match_operand:TD 1 "gpc_reg_operand" "d,d"))) + (clobber (match_scratch:TD 2 "=&d,&d")) + (clobber (match_scratch:DF 3 "=&d,&d"))] + "TARGET_DFP" + "@ + mffscdrni %3,7\;drdpq %2,%1\;mffscdrn %3,%3\;drsp %0,%2 + mffs %3\;mtfsfi 7,7,1\;drdpq %2,%1\;mtfsf 0xff,%3,1,0\;drsp %0,%2" + [(set_attr "type" "dfp") + (set_attr "isa" "p9,*") + (set_attr "length" "16,20")]) + (define_insn "add<mode>3" [(set (match_operand:DDTD 0 "gpc_reg_operand" "=d") (plus:DDTD (match_operand:DDTD 1 "gpc_reg_operand" "%d") diff --git a/gcc/config/rs6000/freebsd64.h b/gcc/config/rs6000/freebsd64.h index c991363..6984ca5 100644 --- a/gcc/config/rs6000/freebsd64.h +++ b/gcc/config/rs6000/freebsd64.h @@ -78,65 +78,7 @@ extern int dot_symbols; #undef SUBSUBTARGET_OVERRIDE_OPTIONS #define SUBSUBTARGET_OVERRIDE_OPTIONS \ - do \ - { \ - if (!global_options_set.x_rs6000_alignment_flags) \ - rs6000_alignment_flags = MASK_ALIGN_NATURAL; \ - if (TARGET_64BIT) \ - { \ - if (DEFAULT_ABI != ABI_AIX) \ - { \ - rs6000_current_abi = ABI_AIX; \ - error (INVALID_64BIT, "call"); \ - } \ - dot_symbols = !strcmp (rs6000_abi_name, "aixdesc"); \ - if (rs6000_isa_flags & OPTION_MASK_RELOCATABLE) \ - { \ - rs6000_isa_flags &= ~OPTION_MASK_RELOCATABLE; \ - error (INVALID_64BIT, "relocatable"); \ - } \ - if (ELFv2_ABI_CHECK) \ - { \ - rs6000_current_abi = ABI_ELFv2; \ - if (dot_symbols) \ - error ("%<-mcall-aixdesc%> incompatible with %<-mabi=elfv2%>"); \ - } \ - if (rs6000_isa_flags & OPTION_MASK_EABI) \ - { \ - rs6000_isa_flags &= ~OPTION_MASK_EABI; \ - error (INVALID_64BIT, "eabi"); \ - } \ - if (TARGET_PROTOTYPE) \ - { \ - target_prototype = 0; \ - error (INVALID_64BIT, "prototype"); \ - } \ - if ((rs6000_isa_flags & OPTION_MASK_POWERPC64) == 0) \ - { \ - rs6000_isa_flags |= OPTION_MASK_POWERPC64; \ - error ("%<-m64%> requires a PowerPC64 cpu"); \ - } \ - if ((rs6000_isa_flags_explicit \ - & OPTION_MASK_MINIMAL_TOC) != 0) \ - { \ - if (global_options_set.x_rs6000_current_cmodel \ - && rs6000_current_cmodel != CMODEL_SMALL) \ - error ("%<-mcmodel%> incompatible with other toc options"); \ - SET_CMODEL (CMODEL_SMALL); \ - } \ - else \ - { \ - if (!global_options_set.x_rs6000_current_cmodel) \ - SET_CMODEL (CMODEL_MEDIUM); \ - if (rs6000_current_cmodel != CMODEL_SMALL) \ - { \ - TARGET_NO_FP_IN_TOC = 0; \ - TARGET_NO_SUM_IN_TOC = 0; \ - } \ - } \ - } \ - } \ - while (0) + do rs6000_linux64_override_options (); while (0) #undef ASM_SPEC #undef LINK_OS_FREEBSD_SPEC diff --git a/gcc/config/rs6000/linux64.h b/gcc/config/rs6000/linux64.h index 2ded330..73b6c01 100644 --- a/gcc/config/rs6000/linux64.h +++ b/gcc/config/rs6000/linux64.h @@ -96,90 +96,7 @@ extern int dot_symbols; #undef SUBSUBTARGET_OVERRIDE_OPTIONS #define SUBSUBTARGET_OVERRIDE_OPTIONS \ - do \ - { \ - if (!global_options_set.x_rs6000_alignment_flags) \ - rs6000_alignment_flags = MASK_ALIGN_NATURAL; \ - if (rs6000_isa_flags & OPTION_MASK_64BIT) \ - { \ - if (DEFAULT_ABI != ABI_AIX) \ - { \ - rs6000_current_abi = 
ABI_AIX; \ - error (INVALID_64BIT, "call"); \ - } \ - dot_symbols = !strcmp (rs6000_abi_name, "aixdesc"); \ - if (ELFv2_ABI_CHECK) \ - { \ - rs6000_current_abi = ABI_ELFv2; \ - if (dot_symbols) \ - error ("%<-mcall-aixdesc%> incompatible with %<-mabi=elfv2%>"); \ - } \ - if (rs6000_isa_flags & OPTION_MASK_RELOCATABLE) \ - { \ - rs6000_isa_flags &= ~OPTION_MASK_RELOCATABLE; \ - error (INVALID_64BIT, "relocatable"); \ - } \ - if (rs6000_isa_flags & OPTION_MASK_EABI) \ - { \ - rs6000_isa_flags &= ~OPTION_MASK_EABI; \ - error (INVALID_64BIT, "eabi"); \ - } \ - if (TARGET_PROTOTYPE) \ - { \ - target_prototype = 0; \ - error (INVALID_64BIT, "prototype"); \ - } \ - if ((rs6000_isa_flags & OPTION_MASK_POWERPC64) == 0) \ - { \ - rs6000_isa_flags |= OPTION_MASK_POWERPC64; \ - error ("%<-m64%> requires a PowerPC64 cpu"); \ - } \ - if ((rs6000_isa_flags_explicit \ - & OPTION_MASK_MINIMAL_TOC) != 0) \ - { \ - if (global_options_set.x_rs6000_current_cmodel \ - && rs6000_current_cmodel != CMODEL_SMALL) \ - error ("%<-mcmodel incompatible with other toc options%>"); \ - SET_CMODEL (CMODEL_SMALL); \ - } \ - else \ - { \ - if (!global_options_set.x_rs6000_current_cmodel) \ - SET_CMODEL (CMODEL_MEDIUM); \ - if (rs6000_current_cmodel != CMODEL_SMALL) \ - { \ - if (!global_options_set.x_TARGET_NO_FP_IN_TOC) \ - TARGET_NO_FP_IN_TOC \ - = rs6000_current_cmodel == CMODEL_MEDIUM; \ - if (!global_options_set.x_TARGET_NO_SUM_IN_TOC) \ - TARGET_NO_SUM_IN_TOC = 0; \ - } \ - } \ - if (TARGET_PLTSEQ && DEFAULT_ABI != ABI_ELFv2) \ - { \ - if (global_options_set.x_rs6000_pltseq) \ - warning (0, "%qs unsupported for this ABI", \ - "-mpltseq"); \ - rs6000_pltseq = false; \ - } \ - } \ - else \ - { \ - if (!RS6000_BI_ARCH_P) \ - error (INVALID_32BIT, "32"); \ - if (TARGET_PROFILE_KERNEL) \ - { \ - TARGET_PROFILE_KERNEL = 0; \ - error (INVALID_32BIT, "profile-kernel"); \ - } \ - if (global_options_set.x_rs6000_current_cmodel) \ - { \ - SET_CMODEL (CMODEL_SMALL); \ - error (INVALID_32BIT, "cmodel"); \ - } \ - } \ - } \ - while (0) + do rs6000_linux64_override_options (); while (0) #undef ASM_SPEC #undef LINK_OS_LINUX_SPEC diff --git a/gcc/config/rs6000/mma.md b/gcc/config/rs6000/mma.md index 15cacfb..a3fd28b 100644 --- a/gcc/config/rs6000/mma.md +++ b/gcc/config/rs6000/mma.md @@ -328,11 +328,15 @@ [(set (match_operand:PXI 0 "nonimmediate_operand" "=d,m,d,d") (match_operand:PXI 1 "input_operand" "m,d,d,O"))] "TARGET_MMA - && ((gpc_reg_operand (operands[0], PXImode) - && !(CONST_INT_P (operands[1]) && INTVAL (operands[1]) == 0)) + && (gpc_reg_operand (operands[0], PXImode) || gpc_reg_operand (operands[1], PXImode))" - "#" - "&& reload_completed" + "@ + # + # + # + xxsetaccz %A0" + "&& reload_completed + && !(fpr_reg_operand (operands[0], PXImode) && operands[1] == const0_rtx)" [(const_int 0)] { rs6000_split_multireg_move (operands[0], operands[1]); @@ -409,12 +413,14 @@ "<acc> %A0" [(set_attr "type" "mma")]) -(define_insn "mma_xxsetaccz" - [(set (match_operand:PXI 0 "fpr_reg_operand" "=d") +(define_expand "mma_xxsetaccz" + [(set (match_operand:PXI 0 "fpr_reg_operand") (const_int 0))] "TARGET_MMA" - "xxsetaccz %A0" - [(set_attr "type" "mma")]) +{ + emit_insn (gen_movpxi (operands[0], const0_rtx)); + DONE; +}) (define_insn "mma_<vv>" [(set (match_operand:PXI 0 "fpr_reg_operand" "=&d") diff --git a/gcc/config/rs6000/power10.md b/gcc/config/rs6000/power10.md index 9f8a582..2b4d882 100644 --- a/gcc/config/rs6000/power10.md +++ b/gcc/config/rs6000/power10.md @@ -468,13 +468,13 @@ (eq_attr "cpu" "power10")) 
"DU_super_power10,dfu_power10*8") -(define_insn_reservation "power10-mffgpr" 2 - (and (eq_attr "type" "mffgpr") +(define_insn_reservation "power10-mtvsr" 2 + (and (eq_attr "type" "mtvsr") (eq_attr "cpu" "power10")) "DU_slice_3_power10,VSU_power10") -(define_insn_reservation "power10-mftgpr" 2 - (and (eq_attr "type" "mftgpr") +(define_insn_reservation "power10-mfvsr" 2 + (and (eq_attr "type" "mfvsr") (eq_attr "cpu" "power10")) "DU_slice_3_power10,VSU_power10") diff --git a/gcc/config/rs6000/power6.md b/gcc/config/rs6000/power6.md index a94ce52..e2e7582 100644 --- a/gcc/config/rs6000/power6.md +++ b/gcc/config/rs6000/power6.md @@ -56,10 +56,6 @@ (define_reservation "FX2_power6" "iu1_power6+iu2_power6") -(define_reservation "X2F_power6" - "(iu1_power6+iu2_power6+fpu1_power6)\ - |(iu1_power6+iu2_power6+fpu2_power6)") - (define_reservation "BX2_power6" "iu1_power6+iu2_power6+bpu_power6") @@ -605,20 +601,3 @@ (define_bypass 5 "power6-vecperm" "power6-vecstore" ) -(define_insn_reservation "power6-mftgpr" 8 - (and (eq_attr "type" "mftgpr") - (eq_attr "cpu" "power6")) - "X2F_power6") - -(define_insn_reservation "power6-mffgpr" 14 - (and (eq_attr "type" "mffgpr") - (eq_attr "cpu" "power6")) - "LX2_power6") - -(define_bypass 4 "power6-mftgpr" "power6-imul,\ - power6-lmul,\ - power6-imul-cmp,\ - power6-lmul-cmp,\ - power6-imul3,\ - power6-idiv,\ - power6-ldiv" ) diff --git a/gcc/config/rs6000/power8.md b/gcc/config/rs6000/power8.md index fae2ad8..a3f46c6 100644 --- a/gcc/config/rs6000/power8.md +++ b/gcc/config/rs6000/power8.md @@ -379,13 +379,13 @@ (eq_attr "cpu" "power8")) "DU_any_power8,VSU_power8") -(define_insn_reservation "power8-mffgpr" 5 - (and (eq_attr "type" "mffgpr") +(define_insn_reservation "power8-mtvsr" 5 + (and (eq_attr "type" "mtvsr") (eq_attr "cpu" "power8")) "DU_any_power8,VSU_power8") -(define_insn_reservation "power8-mftgpr" 6 - (and (eq_attr "type" "mftgpr") +(define_insn_reservation "power8-mfvsr" 6 + (and (eq_attr "type" "mfvsr") (eq_attr "cpu" "power8")) "DU_any_power8,VSU_power8") diff --git a/gcc/config/rs6000/power9.md b/gcc/config/rs6000/power9.md index 2277b14..c86d643 100644 --- a/gcc/config/rs6000/power9.md +++ b/gcc/config/rs6000/power9.md @@ -466,13 +466,13 @@ (eq_attr "cpu" "power9")) "DU_super_power9,dfu_power9*8") -(define_insn_reservation "power9-mffgpr" 2 - (and (eq_attr "type" "mffgpr") +(define_insn_reservation "power9-mtvsr" 2 + (and (eq_attr "type" "mtvsr") (eq_attr "cpu" "power9")) "DU_slice_3_power9,VSU_power9") -(define_insn_reservation "power9-mftgpr" 2 - (and (eq_attr "type" "mftgpr") +(define_insn_reservation "power9-mfvsr" 2 + (and (eq_attr "type" "mfvsr") (eq_attr "cpu" "power9")) "DU_slice_3_power9,VSU_power9") diff --git a/gcc/config/rs6000/ppc-asm.h b/gcc/config/rs6000/ppc-asm.h index 48edc99..e0bce9c 100644 --- a/gcc/config/rs6000/ppc-asm.h +++ b/gcc/config/rs6000/ppc-asm.h @@ -262,6 +262,14 @@ see the files COPYING3 and COPYING.RUNTIME respectively. 
If not, see #undef toc #define FUNC_NAME(name) GLUE(__USER_LABEL_PREFIX__,name) +#ifdef __PCREL__ +#define JUMP_TARGET(name) GLUE(FUNC_NAME(name),@notoc) +#define FUNC_START(name) \ + .type FUNC_NAME(name),@function; \ + .globl FUNC_NAME(name); \ +FUNC_NAME(name): \ + .localentry FUNC_NAME(name),1 +#else #define JUMP_TARGET(name) FUNC_NAME(name) #define FUNC_START(name) \ .type FUNC_NAME(name),@function; \ @@ -270,6 +278,7 @@ FUNC_NAME(name): \ 0: addis 2,12,(.TOC.-0b)@ha; \ addi 2,2,(.TOC.-0b)@l; \ .localentry FUNC_NAME(name),.-FUNC_NAME(name) +#endif /* !__PCREL__ */ #define HIDDEN_FUNC(name) \ FUNC_START(name) \ diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md index 9762855..4c2fe7f 100644 --- a/gcc/config/rs6000/predicates.md +++ b/gcc/config/rs6000/predicates.md @@ -214,6 +214,11 @@ (and (match_code "const_int") (match_test "INTVAL (op) >= -16 && INTVAL (op) <= 15"))) +;; Return 1 if op is an unsigned 1-bit constant integer. +(define_predicate "u1bit_cint_operand" + (and (match_code "const_int") + (match_test "INTVAL (op) >= 0 && INTVAL (op) <= 1"))) + ;; Return 1 if op is a unsigned 3-bit constant integer. (define_predicate "u3bit_cint_operand" (and (match_code "const_int") @@ -272,6 +277,16 @@ (match_test "(unsigned HOST_WIDE_INT) (INTVAL (op) + 0x8000) >= 0x10000"))) +;; Return 1 if op is a 32-bit constant signed integer +(define_predicate "s32bit_cint_operand" + (and (match_code "const_int") + (match_test "(0x80000000 + UINTVAL (op)) >> 32 == 0"))) + +;; Return 1 if op is a constant 32-bit unsigned +(define_predicate "c32bit_cint_operand" + (and (match_code "const_int") + (match_test "((UINTVAL (op) >> 32) == 0)"))) + ;; Return 1 if op is a positive constant integer that is an exact power of 2. (define_predicate "exact_log2_cint_operand" (and (match_code "const_int") @@ -1036,7 +1051,12 @@ && !((DEFAULT_ABI == ABI_AIX || DEFAULT_ABI == ABI_ELFv2) && (SYMBOL_REF_EXTERNAL_P (op) - || SYMBOL_REF_WEAK (op)))"))) + || SYMBOL_REF_WEAK (op))) + && !(DEFAULT_ABI == ABI_ELFv2 + && SYMBOL_REF_DECL (op) != NULL + && TREE_CODE (SYMBOL_REF_DECL (op)) == FUNCTION_DECL + && (rs6000_fndecl_pcrel_p (SYMBOL_REF_DECL (op)) + != rs6000_pcrel_p ()))"))) ;; Return 1 if this operand is a valid input for a move insn. (define_predicate "input_operand" diff --git a/gcc/config/rs6000/rs6000-builtin.def b/gcc/config/rs6000/rs6000-builtin.def index f703755..3eb55f0 100644 --- a/gcc/config/rs6000/rs6000-builtin.def +++ b/gcc/config/rs6000/rs6000-builtin.def @@ -1019,44 +1019,43 @@ | RS6000_BTC_BINARY), \ CODE_FOR_ ## ICODE) /* ICODE */ -/* For builtins for power10 vector instructions that are encoded as altivec - instructions, use __builtin_altivec_ as the builtin name. 
*/ +/* Power 10 VSX builtins */ -#define BU_P10V_0(ENUM, NAME, ATTR, ICODE) \ +#define BU_P10V_VSX_0(ENUM, NAME, ATTR, ICODE) \ RS6000_BUILTIN_0 (P10V_BUILTIN_ ## ENUM, /* ENUM */ \ - "__builtin_altivec_" NAME, /* NAME */ \ + "__builtin_vsx_" NAME, /* NAME */ \ RS6000_BTM_P10, /* MASK */ \ (RS6000_BTC_ ## ATTR /* ATTR */ \ | RS6000_BTC_SPECIAL), \ CODE_FOR_ ## ICODE) /* ICODE */ -#define BU_P10V_1(ENUM, NAME, ATTR, ICODE) \ - RS6000_BUILTIN_1 (P10_BUILTIN_ ## ENUM, /* ENUM */ \ - "__builtin_altivec_" NAME, /* NAME */ \ +#define BU_P10V_VSX_1(ENUM, NAME, ATTR, ICODE) \ + RS6000_BUILTIN_1 (P10V_BUILTIN_ ## ENUM, /* ENUM */ \ + "__builtin_vsx_" NAME, /* NAME */ \ RS6000_BTM_P10, /* MASK */ \ (RS6000_BTC_ ## ATTR /* ATTR */ \ | RS6000_BTC_UNARY), \ CODE_FOR_ ## ICODE) /* ICODE */ -#define BU_P10V_2(ENUM, NAME, ATTR, ICODE) \ - RS6000_BUILTIN_2 (P10_BUILTIN_ ## ENUM, /* ENUM */ \ - "__builtin_altivec_" NAME, /* NAME */ \ +#define BU_P10V_VSX_2(ENUM, NAME, ATTR, ICODE) \ + RS6000_BUILTIN_2 (P10V_BUILTIN_ ## ENUM, /* ENUM */ \ + "__builtin_vsx_" NAME, /* NAME */ \ RS6000_BTM_P10, /* MASK */ \ (RS6000_BTC_ ## ATTR /* ATTR */ \ | RS6000_BTC_BINARY), \ CODE_FOR_ ## ICODE) /* ICODE */ -#define BU_P10V_3(ENUM, NAME, ATTR, ICODE) \ - RS6000_BUILTIN_3 (P10_BUILTIN_ ## ENUM, /* ENUM */ \ - "__builtin_altivec_" NAME, /* NAME */ \ +#define BU_P10V_VSX_3(ENUM, NAME, ATTR, ICODE) \ + RS6000_BUILTIN_3 (P10V_BUILTIN_ ## ENUM, /* ENUM */ \ + "__builtin_vsx_" NAME, /* NAME */ \ RS6000_BTM_P10, /* MASK */ \ (RS6000_BTC_ ## ATTR /* ATTR */ \ | RS6000_BTC_TERNARY), \ CODE_FOR_ ## ICODE) /* ICODE */ -#define BU_P10V_4(ENUM, NAME, ATTR, ICODE) \ - RS6000_BUILTIN_4 (P10_BUILTIN_ ## ENUM, /* ENUM */ \ - "__builtin_altivec_" NAME, /* NAME */ \ +#define BU_P10V_VSX_4(ENUM, NAME, ATTR, ICODE) \ + RS6000_BUILTIN_4 (P10V_BUILTIN_ ## ENUM, /* ENUM */ \ + "__builtin_vsx_" NAME, /* NAME */ \ RS6000_BTM_P10, /* MASK */ \ (RS6000_BTC_ ## ATTR /* ATTR */ \ | RS6000_BTC_QUATERNARY), \ @@ -1112,7 +1111,7 @@ | RS6000_BTC_UNARY), \ CODE_FOR_ ## ICODE) /* ICODE */ -#define BU_P10_MISC_2(ENUM, NAME, ATTR, ICODE) \ +#define BU_P10_POWERPC64_MISC_2(ENUM, NAME, ATTR, ICODE) \ RS6000_BUILTIN_2 (P10_BUILTIN_ ## ENUM, /* ENUM */ \ "__builtin_" NAME, /* NAME */ \ RS6000_BTM_P10 \ @@ -1146,6 +1145,40 @@ CODE_FOR_ ## ICODE) /* ICODE */ #endif +/* Power 10 Altivec builtins */ + +#define BU_P10V_AV_0(ENUM, NAME, ATTR, ICODE) \ + RS6000_BUILTIN_0 (P10V_BUILTIN_ ## ENUM, /* ENUM */ \ + "__builtin_altivec_" NAME, /* NAME */ \ + RS6000_BTM_P10, /* MASK */ \ + (RS6000_BTC_ ## ATTR /* ATTR */ \ + | RS6000_BTC_SPECIAL), \ + CODE_FOR_ ## ICODE) /* ICODE */ + +#define BU_P10V_AV_1(ENUM, NAME, ATTR, ICODE) \ + RS6000_BUILTIN_1 (P10V_BUILTIN_ ## ENUM, /* ENUM */ \ + "__builtin_altivec_" NAME, /* NAME */ \ + RS6000_BTM_P10, /* MASK */ \ + (RS6000_BTC_ ## ATTR /* ATTR */ \ + | RS6000_BTC_UNARY), \ + CODE_FOR_ ## ICODE) /* ICODE */ + +#define BU_P10V_AV_2(ENUM, NAME, ATTR, ICODE) \ + RS6000_BUILTIN_2 (P10V_BUILTIN_ ## ENUM, /* ENUM */ \ + "__builtin_altivec_" NAME, /* NAME */ \ + RS6000_BTM_P10, /* MASK */ \ + (RS6000_BTC_ ## ATTR /* ATTR */ \ + | RS6000_BTC_BINARY), \ + CODE_FOR_ ## ICODE) /* ICODE */ + +#define BU_P10V_AV_3(ENUM, NAME, ATTR, ICODE) \ + RS6000_BUILTIN_3 (P10V_BUILTIN_ ## ENUM, /* ENUM */ \ + "__builtin_altivec_" NAME, /* NAME */ \ + RS6000_BTM_P10, /* MASK */ \ + (RS6000_BTC_ ## ATTR /* ATTR */ \ + | RS6000_BTC_TERNARY), \ + CODE_FOR_ ## ICODE) /* ICODE */ + /* Insure 0 is not a legitimate index. 
*/ BU_SPECIAL_X (RS6000_BUILTIN_NONE, NULL, 0, RS6000_BTC_MISC) @@ -1482,7 +1515,6 @@ BU_ALTIVEC_C (STVLXL, "stvlxl", MEM) BU_ALTIVEC_C (STVRX, "stvrx", MEM) BU_ALTIVEC_C (STVRXL, "stvrxl", MEM) BU_ALTIVEC_X (MASK_FOR_LOAD, "mask_for_load", MISC) -BU_ALTIVEC_X (MASK_FOR_STORE, "mask_for_store", MISC) BU_ALTIVEC_X (VEC_INIT_V4SI, "vec_init_v4si", CONST) BU_ALTIVEC_X (VEC_INIT_V8HI, "vec_init_v8hi", CONST) BU_ALTIVEC_X (VEC_INIT_V16QI, "vec_init_v16qi", CONST) @@ -2695,66 +2727,126 @@ BU_P9_OVERLOAD_2 (CMPRB2, "byte_in_either_range") BU_P9_OVERLOAD_2 (CMPEQB, "byte_in_set") /* Builtins for scalar instructions added in ISA 3.1 (power10). */ -BU_P10_MISC_2 (CFUGED, "cfuged", CONST, cfuged) -BU_P10_MISC_2 (CNTLZDM, "cntlzdm", CONST, cntlzdm) -BU_P10_MISC_2 (CNTTZDM, "cnttzdm", CONST, cnttzdm) -BU_P10_MISC_2 (PDEPD, "pdepd", CONST, pdepd) -BU_P10_MISC_2 (PEXTD, "pextd", CONST, pextd) +BU_P10_POWERPC64_MISC_2 (CFUGED, "cfuged", CONST, cfuged) +BU_P10_POWERPC64_MISC_2 (CNTLZDM, "cntlzdm", CONST, cntlzdm) +BU_P10_POWERPC64_MISC_2 (CNTTZDM, "cnttzdm", CONST, cnttzdm) +BU_P10_POWERPC64_MISC_2 (PDEPD, "pdepd", CONST, pdepd) +BU_P10_POWERPC64_MISC_2 (PEXTD, "pextd", CONST, pextd) /* Builtins for vector instructions added in ISA 3.1 (power10). */ -BU_P10V_2 (VCLRLB, "vclrlb", CONST, vclrlb) -BU_P10V_2 (VCLRRB, "vclrrb", CONST, vclrrb) -BU_P10V_2 (VCFUGED, "vcfuged", CONST, vcfuged) -BU_P10V_2 (VCLZDM, "vclzdm", CONST, vclzdm) -BU_P10V_2 (VCTZDM, "vctzdm", CONST, vctzdm) -BU_P10V_2 (VPDEPD, "vpdepd", CONST, vpdepd) -BU_P10V_2 (VPEXTD, "vpextd", CONST, vpextd) -BU_P10V_2 (VGNB, "vgnb", CONST, vgnb) -BU_P10V_4 (XXEVAL, "xxeval", CONST, xxeval) -BU_P10V_2 (XXGENPCVM_V16QI, "xxgenpcvm_v16qi", CONST, xxgenpcvm_v16qi) -BU_P10V_2 (XXGENPCVM_V8HI, "xxgenpcvm_v8hi", CONST, xxgenpcvm_v8hi) -BU_P10V_2 (XXGENPCVM_V4SI, "xxgenpcvm_v4si", CONST, xxgenpcvm_v4si) -BU_P10V_2 (XXGENPCVM_V2DI, "xxgenpcvm_v2di", CONST, xxgenpcvm_v2di) - -BU_P10V_3 (VEXTRACTBL, "vextdubvlx", CONST, vextractlv16qi) -BU_P10V_3 (VEXTRACTHL, "vextduhvlx", CONST, vextractlv8hi) -BU_P10V_3 (VEXTRACTWL, "vextduwvlx", CONST, vextractlv4si) -BU_P10V_3 (VEXTRACTDL, "vextddvlx", CONST, vextractlv2di) - -BU_P10V_3 (VEXTRACTBR, "vextdubvhx", CONST, vextractrv16qi) -BU_P10V_3 (VEXTRACTHR, "vextduhvhx", CONST, vextractrv8hi) -BU_P10V_3 (VEXTRACTWR, "vextduwvhx", CONST, vextractrv4si) -BU_P10V_3 (VEXTRACTDR, "vextddvhx", CONST, vextractrv2di) - -BU_P10V_1 (VSTRIBR, "vstribr", CONST, vstrir_v16qi) -BU_P10V_1 (VSTRIHR, "vstrihr", CONST, vstrir_v8hi) -BU_P10V_1 (VSTRIBL, "vstribl", CONST, vstril_v16qi) -BU_P10V_1 (VSTRIHL, "vstrihl", CONST, vstril_v8hi) - -BU_P10V_1 (VSTRIBR_P, "vstribr_p", CONST, vstrir_p_v16qi) -BU_P10V_1 (VSTRIHR_P, "vstrihr_p", CONST, vstrir_p_v8hi) -BU_P10V_1 (VSTRIBL_P, "vstribl_p", CONST, vstril_p_v16qi) -BU_P10V_1 (VSTRIHL_P, "vstrihl_p", CONST, vstril_p_v8hi) - -BU_P10V_1 (MTVSRBM, "mtvsrbm", CONST, vec_mtvsr_v16qi) -BU_P10V_1 (MTVSRHM, "mtvsrhm", CONST, vec_mtvsr_v8hi) -BU_P10V_1 (MTVSRWM, "mtvsrwm", CONST, vec_mtvsr_v4si) -BU_P10V_1 (MTVSRDM, "mtvsrdm", CONST, vec_mtvsr_v2di) -BU_P10V_1 (MTVSRQM, "mtvsrqm", CONST, vec_mtvsr_v1ti) -BU_P10V_2 (VCNTMBB, "cntmbb", CONST, vec_cntmb_v16qi) -BU_P10V_2 (VCNTMBH, "cntmbh", CONST, vec_cntmb_v8hi) -BU_P10V_2 (VCNTMBW, "cntmbw", CONST, vec_cntmb_v4si) -BU_P10V_2 (VCNTMBD, "cntmbd", CONST, vec_cntmb_v2di) -BU_P10V_1 (VEXPANDMB, "vexpandmb", CONST, vec_expand_v16qi) -BU_P10V_1 (VEXPANDMH, "vexpandmh", CONST, vec_expand_v8hi) -BU_P10V_1 (VEXPANDMW, "vexpandmw", CONST, vec_expand_v4si) -BU_P10V_1 
(VEXPANDMD, "vexpandmd", CONST, vec_expand_v2di) -BU_P10V_1 (VEXPANDMQ, "vexpandmq", CONST, vec_expand_v1ti) -BU_P10V_1 (VEXTRACTMB, "vextractmb", CONST, vec_extract_v16qi) -BU_P10V_1 (VEXTRACTMH, "vextractmh", CONST, vec_extract_v8hi) -BU_P10V_1 (VEXTRACTMW, "vextractmw", CONST, vec_extract_v4si) -BU_P10V_1 (VEXTRACTMD, "vextractmd", CONST, vec_extract_v2di) -BU_P10V_1 (VEXTRACTMQ, "vextractmq", CONST, vec_extract_v1ti) +BU_P10V_AV_2 (VCLRLB, "vclrlb", CONST, vclrlb) +BU_P10V_AV_2 (VCLRRB, "vclrrb", CONST, vclrrb) +BU_P10V_AV_2 (VCFUGED, "vcfuged", CONST, vcfuged) +BU_P10V_AV_2 (VCLZDM, "vclzdm", CONST, vclzdm) +BU_P10V_AV_2 (VCTZDM, "vctzdm", CONST, vctzdm) +BU_P10V_AV_2 (VPDEPD, "vpdepd", CONST, vpdepd) +BU_P10V_AV_2 (VPEXTD, "vpextd", CONST, vpextd) +BU_P10V_AV_2 (VGNB, "vgnb", CONST, vgnb) +BU_P10V_VSX_4 (XXEVAL, "xxeval", CONST, xxeval) +BU_P10V_VSX_2 (XXGENPCVM_V16QI, "xxgenpcvm_v16qi", CONST, xxgenpcvm_v16qi) +BU_P10V_VSX_2 (XXGENPCVM_V8HI, "xxgenpcvm_v8hi", CONST, xxgenpcvm_v8hi) +BU_P10V_VSX_2 (XXGENPCVM_V4SI, "xxgenpcvm_v4si", CONST, xxgenpcvm_v4si) +BU_P10V_VSX_2 (XXGENPCVM_V2DI, "xxgenpcvm_v2di", CONST, xxgenpcvm_v2di) + +BU_P10V_AV_3 (VEXTRACTBL, "vextdubvlx", CONST, vextractlv16qi) +BU_P10V_AV_3 (VEXTRACTHL, "vextduhvlx", CONST, vextractlv8hi) +BU_P10V_AV_3 (VEXTRACTWL, "vextduwvlx", CONST, vextractlv4si) +BU_P10V_AV_3 (VEXTRACTDL, "vextddvlx", CONST, vextractlv2di) + +BU_P10V_AV_3 (VEXTRACTBR, "vextdubvhx", CONST, vextractrv16qi) +BU_P10V_AV_3 (VEXTRACTHR, "vextduhvhx", CONST, vextractrv8hi) +BU_P10V_AV_3 (VEXTRACTWR, "vextduwvhx", CONST, vextractrv4si) +BU_P10V_AV_3 (VEXTRACTDR, "vextddvhx", CONST, vextractrv2di) + +BU_P10V_AV_3 (VINSERTGPRBL, "vinsgubvlx", CONST, vinsertgl_v16qi) +BU_P10V_AV_3 (VINSERTGPRHL, "vinsguhvlx", CONST, vinsertgl_v8hi) +BU_P10V_AV_3 (VINSERTGPRWL, "vinsguwvlx", CONST, vinsertgl_v4si) +BU_P10V_AV_3 (VINSERTGPRDL, "vinsgudvlx", CONST, vinsertgl_v2di) +BU_P10V_AV_3 (VINSERTVPRBL, "vinsvubvlx", CONST, vinsertvl_v16qi) +BU_P10V_AV_3 (VINSERTVPRHL, "vinsvuhvlx", CONST, vinsertvl_v8hi) +BU_P10V_AV_3 (VINSERTVPRWL, "vinsvuwvlx", CONST, vinsertvl_v4si) + +BU_P10V_AV_3 (VINSERTGPRBR, "vinsgubvrx", CONST, vinsertgr_v16qi) +BU_P10V_AV_3 (VINSERTGPRHR, "vinsguhvrx", CONST, vinsertgr_v8hi) +BU_P10V_AV_3 (VINSERTGPRWR, "vinsguwvrx", CONST, vinsertgr_v4si) +BU_P10V_AV_3 (VINSERTGPRDR, "vinsgudvrx", CONST, vinsertgr_v2di) +BU_P10V_AV_3 (VINSERTVPRBR, "vinsvubvrx", CONST, vinsertvr_v16qi) +BU_P10V_AV_3 (VINSERTVPRHR, "vinsvuhvrx", CONST, vinsertvr_v8hi) +BU_P10V_AV_3 (VINSERTVPRWR, "vinsvuwvrx", CONST, vinsertvr_v4si) + +BU_P10V_AV_3 (VREPLACE_ELT_V4SI, "vreplace_v4si", CONST, vreplace_elt_v4si) +BU_P10V_AV_3 (VREPLACE_ELT_UV4SI, "vreplace_uv4si", CONST, vreplace_elt_v4si) +BU_P10V_AV_3 (VREPLACE_ELT_V4SF, "vreplace_v4sf", CONST, vreplace_elt_v4sf) +BU_P10V_AV_3 (VREPLACE_ELT_V2DI, "vreplace_v2di", CONST, vreplace_elt_v2di) +BU_P10V_AV_3 (VREPLACE_ELT_UV2DI, "vreplace_uv2di", CONST, vreplace_elt_v2di) +BU_P10V_AV_3 (VREPLACE_ELT_V2DF, "vreplace_v2df", CONST, vreplace_elt_v2df) + +BU_P10V_AV_3 (VREPLACE_UN_V4SI, "vreplace_un_v4si", CONST, vreplace_un_v4si) +BU_P10V_AV_3 (VREPLACE_UN_UV4SI, "vreplace_un_uv4si", CONST, vreplace_un_v4si) +BU_P10V_AV_3 (VREPLACE_UN_V4SF, "vreplace_un_v4sf", CONST, vreplace_un_v4sf) +BU_P10V_AV_3 (VREPLACE_UN_V2DI, "vreplace_un_v2di", CONST, vreplace_un_v2di) +BU_P10V_AV_3 (VREPLACE_UN_UV2DI, "vreplace_un_uv2di", CONST, vreplace_un_v2di) +BU_P10V_AV_3 (VREPLACE_UN_V2DF, "vreplace_un_v2df", CONST, vreplace_un_v2df) + +BU_P10V_AV_3 
(VSLDB_V16QI, "vsldb_v16qi", CONST, vsldb_v16qi) +BU_P10V_AV_3 (VSLDB_V8HI, "vsldb_v8hi", CONST, vsldb_v8hi) +BU_P10V_AV_3 (VSLDB_V4SI, "vsldb_v4si", CONST, vsldb_v4si) +BU_P10V_AV_3 (VSLDB_V2DI, "vsldb_v2di", CONST, vsldb_v2di) + +BU_P10V_AV_3 (VSRDB_V16QI, "vsrdb_v16qi", CONST, vsrdb_v16qi) +BU_P10V_AV_3 (VSRDB_V8HI, "vsrdb_v8hi", CONST, vsrdb_v8hi) +BU_P10V_AV_3 (VSRDB_V4SI, "vsrdb_v4si", CONST, vsrdb_v4si) +BU_P10V_AV_3 (VSRDB_V2DI, "vsrdb_v2di", CONST, vsrdb_v2di) + +BU_P10V_VSX_1 (VXXSPLTIW_V4SI, "vxxspltiw_v4si", CONST, xxspltiw_v4si) +BU_P10V_VSX_1 (VXXSPLTIW_V4SF, "vxxspltiw_v4sf", CONST, xxspltiw_v4sf) + +BU_P10V_VSX_1 (VXXSPLTID, "vxxspltidp", CONST, xxspltidp_v2df) + +BU_P10V_VSX_3 (VXXSPLTI32DX_V4SI, "vxxsplti32dx_v4si", CONST, xxsplti32dx_v4si) +BU_P10V_VSX_3 (VXXSPLTI32DX_V4SF, "vxxsplti32dx_v4sf", CONST, xxsplti32dx_v4sf) + +BU_P10V_VSX_3 (VXXBLEND_V16QI, "xxblend_v16qi", CONST, xxblend_v16qi) +BU_P10V_VSX_3 (VXXBLEND_V8HI, "xxblend_v8hi", CONST, xxblend_v8hi) +BU_P10V_VSX_3 (VXXBLEND_V4SI, "xxblend_v4si", CONST, xxblend_v4si) +BU_P10V_VSX_3 (VXXBLEND_V2DI, "xxblend_v2di", CONST, xxblend_v2di) +BU_P10V_VSX_3 (VXXBLEND_V4SF, "xxblend_v4sf", CONST, xxblend_v4sf) +BU_P10V_VSX_3 (VXXBLEND_V2DF, "xxblend_v2df", CONST, xxblend_v2df) + +BU_P10V_VSX_4 (VXXPERMX, "xxpermx", CONST, xxpermx) + +BU_P10V_AV_1 (VSTRIBR, "vstribr", CONST, vstrir_v16qi) +BU_P10V_AV_1 (VSTRIHR, "vstrihr", CONST, vstrir_v8hi) +BU_P10V_AV_1 (VSTRIBL, "vstribl", CONST, vstril_v16qi) +BU_P10V_AV_1 (VSTRIHL, "vstrihl", CONST, vstril_v8hi) + +BU_P10V_AV_1 (VSTRIBR_P, "vstribr_p", CONST, vstrir_p_v16qi) +BU_P10V_AV_1 (VSTRIHR_P, "vstrihr_p", CONST, vstrir_p_v8hi) +BU_P10V_AV_1 (VSTRIBL_P, "vstribl_p", CONST, vstril_p_v16qi) +BU_P10V_AV_1 (VSTRIHL_P, "vstrihl_p", CONST, vstril_p_v8hi) + +BU_P10V_VSX_1 (XVTLSBB_ZEROS, "xvtlsbb_all_zeros", CONST, xvtlsbbz) +BU_P10V_VSX_1 (XVTLSBB_ONES, "xvtlsbb_all_ones", CONST, xvtlsbbo) + +BU_P10V_AV_1 (MTVSRBM, "mtvsrbm", CONST, vec_mtvsr_v16qi) +BU_P10V_AV_1 (MTVSRHM, "mtvsrhm", CONST, vec_mtvsr_v8hi) +BU_P10V_AV_1 (MTVSRWM, "mtvsrwm", CONST, vec_mtvsr_v4si) +BU_P10V_AV_1 (MTVSRDM, "mtvsrdm", CONST, vec_mtvsr_v2di) +BU_P10V_AV_1 (MTVSRQM, "mtvsrqm", CONST, vec_mtvsr_v1ti) +BU_P10V_AV_2 (VCNTMBB, "cntmbb", CONST, vec_cntmb_v16qi) +BU_P10V_AV_2 (VCNTMBH, "cntmbh", CONST, vec_cntmb_v8hi) +BU_P10V_AV_2 (VCNTMBW, "cntmbw", CONST, vec_cntmb_v4si) +BU_P10V_AV_2 (VCNTMBD, "cntmbd", CONST, vec_cntmb_v2di) +BU_P10V_AV_1 (VEXPANDMB, "vexpandmb", CONST, vec_expand_v16qi) +BU_P10V_AV_1 (VEXPANDMH, "vexpandmh", CONST, vec_expand_v8hi) +BU_P10V_AV_1 (VEXPANDMW, "vexpandmw", CONST, vec_expand_v4si) +BU_P10V_AV_1 (VEXPANDMD, "vexpandmd", CONST, vec_expand_v2di) +BU_P10V_AV_1 (VEXPANDMQ, "vexpandmq", CONST, vec_expand_v1ti) +BU_P10V_AV_1 (VEXTRACTMB, "vextractmb", CONST, vec_extract_v16qi) +BU_P10V_AV_1 (VEXTRACTMH, "vextractmh", CONST, vec_extract_v8hi) +BU_P10V_AV_1 (VEXTRACTMW, "vextractmw", CONST, vec_extract_v4si) +BU_P10V_AV_1 (VEXTRACTMD, "vextractmd", CONST, vec_extract_v2di) +BU_P10V_AV_1 (VEXTRACTMQ, "vextractmq", CONST, vec_extract_v1ti) /* Overloaded vector builtins for ISA 3.1 (power10). 
*/ BU_P10_OVERLOAD_2 (CLRL, "clrl") @@ -2765,12 +2857,22 @@ BU_P10_OVERLOAD_2 (XXGENPCVM, "xxgenpcvm") BU_P10_OVERLOAD_3 (EXTRACTL, "extractl") BU_P10_OVERLOAD_3 (EXTRACTH, "extracth") +BU_P10_OVERLOAD_3 (INSERTL, "insertl") +BU_P10_OVERLOAD_3 (INSERTH, "inserth") +BU_P10_OVERLOAD_3 (REPLACE_ELT, "replace_elt") +BU_P10_OVERLOAD_3 (REPLACE_UN, "replace_un") +BU_P10_OVERLOAD_3 (SLDB, "sldb") +BU_P10_OVERLOAD_3 (SRDB, "srdb") BU_P10_OVERLOAD_1 (VSTRIR, "strir") BU_P10_OVERLOAD_1 (VSTRIL, "stril") BU_P10_OVERLOAD_1 (VSTRIR_P, "strir_p") BU_P10_OVERLOAD_1 (VSTRIL_P, "stril_p") + +BU_P10_OVERLOAD_1 (XVTLSBB_ZEROS, "xvtlsbb_all_zeros") +BU_P10_OVERLOAD_1 (XVTLSBB_ONES, "xvtlsbb_all_ones") + BU_P10_OVERLOAD_1 (MTVSRBM, "mtvsrbm") BU_P10_OVERLOAD_1 (MTVSRHM, "mtvsrhm") @@ -2780,6 +2882,11 @@ BU_P10_OVERLOAD_1 (MTVSRQM, "mtvsrqm") BU_P10_OVERLOAD_2 (VCNTM, "cntm") BU_P10_OVERLOAD_1 (VEXPANDM, "vexpandm") BU_P10_OVERLOAD_1 (VEXTRACTM, "vextractm") +BU_P10_OVERLOAD_1 (XXSPLTIW, "xxspltiw") +BU_P10_OVERLOAD_1 (XXSPLTID, "xxspltid") +BU_P10_OVERLOAD_3 (XXSPLTI32DX, "xxsplti32dx") +BU_P10_OVERLOAD_3 (XXBLEND, "xxblend") +BU_P10_OVERLOAD_4 (XXPERMX, "xxpermx") /* 1 argument crypto functions. */ BU_CRYPTO_1 (VSBOX, "vsbox", CONST, crypto_vsbox_v2di) @@ -2915,8 +3022,8 @@ BU_SPECIAL_X (RS6000_BUILTIN_CFSTRING, "__builtin_cfstring", RS6000_BTM_ALWAYS, RS6000_BTC_MISC) /* POWER10 MMA builtins. */ -BU_VSX_1 (XVCVBF16SP, "xvcvbf16sp", MISC, vsx_xvcvbf16sp) -BU_VSX_1 (XVCVSPBF16, "xvcvspbf16", MISC, vsx_xvcvspbf16) +BU_P10V_VSX_1 (XVCVBF16SPN, "xvcvbf16spn", MISC, vsx_xvcvbf16spn) +BU_P10V_VSX_1 (XVCVSPBF16, "xvcvspbf16", MISC, vsx_xvcvspbf16) BU_MMA_1 (XXMFACC, "xxmfacc", QUAD, mma_xxmfacc) BU_MMA_1 (XXMTACC, "xxmtacc", QUAD, mma_xxmtacc) diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c index cb7d34d..cc1e997 100644 --- a/gcc/config/rs6000/rs6000-c.c +++ b/gcc/config/rs6000/rs6000-c.c @@ -597,6 +597,9 @@ rs6000_target_modify_macros (bool define_p, HOST_WIDE_INT flags, /* Tell the user if we support the MMA instructions. */ if ((flags & OPTION_MASK_MMA) != 0) rs6000_define_or_undefine_macro (define_p, "__MMA__"); + /* Whether pc-relative code is being generated. */ + if ((flags & OPTION_MASK_PCREL) != 0) + rs6000_define_or_undefine_macro (define_p, "__PCREL__"); } void @@ -1800,22 +1803,34 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl, unsupported_builtin = true; } } - else if (fcode == P10_BUILTIN_VEC_XXEVAL) + else if ((fcode == P10_BUILTIN_VEC_XXEVAL) + || (fcode == P10V_BUILTIN_VXXPERMX)) { - /* Need to special case __builtin_vec_xxeval because this takes - 4 arguments, and the existing infrastructure handles no - more than three. */ + signed char op3_type; + + /* Need to special case P10_BUILTIN_VEC_XXEVAL and + P10V_BUILTIN_VXXPERMX because they take 4 arguments and the + existing infrastructure only handles three. */ if (nargs != 4) { - error ("builtin %qs requires 4 arguments", - "__builtin_vec_xxeval"); + const char *name = fcode == P10_BUILTIN_VEC_XXEVAL ? 
+ "__builtin_vec_xxeval":"__builtin_vec_xxpermx"; + + error ("builtin %qs requires 4 arguments", name); return error_mark_node; } + for ( ; desc->code == fcode; desc++) { + if (fcode == P10_BUILTIN_VEC_XXEVAL) + op3_type = desc->op3; + else /* P10V_BUILTIN_VXXPERMX */ + op3_type = RS6000_BTI_V16QI; + if (rs6000_builtin_type_compatible (types[0], desc->op1) && rs6000_builtin_type_compatible (types[1], desc->op2) && rs6000_builtin_type_compatible (types[2], desc->op3) + && rs6000_builtin_type_compatible (types[2], op3_type) && rs6000_builtin_type_compatible (types[3], RS6000_BTI_UINTSI)) { diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c index 5ec3f2c..9fdf97b 100644 --- a/gcc/config/rs6000/rs6000-call.c +++ b/gcc/config/rs6000/rs6000-call.c @@ -57,16 +57,14 @@ #include "gimplify.h" #include "gimple-fold.h" #include "gimple-iterator.h" -#include "gimple-ssa.h" +#include "ssa.h" +#include "tree-ssa-propagate.h" #include "builtins.h" #include "tree-vector-builder.h" #if TARGET_XCOFF #include "xcoffout.h" /* get declarations of xcoff_*_section_name */ #endif #include "ppc-auxv.h" -#include "tree-ssa-propagate.h" -#include "tree-vrp.h" -#include "tree-ssanames.h" #include "targhooks.h" #include "opts.h" @@ -5528,159 +5526,368 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = { RS6000_BTI_INTSI, RS6000_BTI_INTSI }, /* Overloaded built-in functions for ISA3.1 (power10). */ - { P10_BUILTIN_VEC_CLRL, P10_BUILTIN_VCLRLB, + { P10_BUILTIN_VEC_CLRL, P10V_BUILTIN_VCLRLB, RS6000_BTI_V16QI, RS6000_BTI_V16QI, RS6000_BTI_UINTSI, 0 }, - { P10_BUILTIN_VEC_CLRL, P10_BUILTIN_VCLRLB, + { P10_BUILTIN_VEC_CLRL, P10V_BUILTIN_VCLRLB, RS6000_BTI_unsigned_V16QI, RS6000_BTI_unsigned_V16QI, RS6000_BTI_UINTSI, 0 }, - { P10_BUILTIN_VEC_CLRR, P10_BUILTIN_VCLRRB, + { P10_BUILTIN_VEC_CLRR, P10V_BUILTIN_VCLRRB, RS6000_BTI_V16QI, RS6000_BTI_V16QI, RS6000_BTI_UINTSI, 0 }, - { P10_BUILTIN_VEC_CLRR, P10_BUILTIN_VCLRRB, + { P10_BUILTIN_VEC_CLRR, P10V_BUILTIN_VCLRRB, RS6000_BTI_unsigned_V16QI, RS6000_BTI_unsigned_V16QI, RS6000_BTI_UINTSI, 0 }, - { P10_BUILTIN_VEC_GNB, P10_BUILTIN_VGNB, RS6000_BTI_unsigned_long_long, + { P10_BUILTIN_VEC_GNB, P10V_BUILTIN_VGNB, RS6000_BTI_unsigned_long_long, RS6000_BTI_unsigned_V1TI, RS6000_BTI_UINTQI, 0 }, - { P10_BUILTIN_VEC_XXGENPCVM, P10_BUILTIN_XXGENPCVM_V2DI, + { P10_BUILTIN_VEC_XXGENPCVM, P10V_BUILTIN_XXGENPCVM_V2DI, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, RS6000_BTI_INTSI, 0 }, - { P10_BUILTIN_VEC_XXGENPCVM, P10_BUILTIN_XXGENPCVM_V4SI, + { P10_BUILTIN_VEC_XXGENPCVM, P10V_BUILTIN_XXGENPCVM_V4SI, RS6000_BTI_unsigned_V4SI, RS6000_BTI_unsigned_V4SI, RS6000_BTI_INTSI, 0 }, - { P10_BUILTIN_VEC_XXGENPCVM, P10_BUILTIN_XXGENPCVM_V8HI, + { P10_BUILTIN_VEC_XXGENPCVM, P10V_BUILTIN_XXGENPCVM_V8HI, RS6000_BTI_unsigned_V8HI, RS6000_BTI_unsigned_V8HI, RS6000_BTI_INTSI, 0 }, - { P10_BUILTIN_VEC_XXGENPCVM, P10_BUILTIN_XXGENPCVM_V16QI, + { P10_BUILTIN_VEC_XXGENPCVM, P10V_BUILTIN_XXGENPCVM_V16QI, RS6000_BTI_unsigned_V16QI, RS6000_BTI_unsigned_V16QI, RS6000_BTI_INTSI, 0 }, /* The overloaded XXEVAL definitions are handled specially because the fourth unsigned char operand is not encoded in this table. 
*/ - { P10_BUILTIN_VEC_XXEVAL, P10_BUILTIN_XXEVAL, + { P10_BUILTIN_VEC_XXEVAL, P10V_BUILTIN_XXEVAL, RS6000_BTI_unsigned_V16QI, RS6000_BTI_unsigned_V16QI, RS6000_BTI_unsigned_V16QI, RS6000_BTI_unsigned_V16QI }, - { P10_BUILTIN_VEC_XXEVAL, P10_BUILTIN_XXEVAL, + { P10_BUILTIN_VEC_XXEVAL, P10V_BUILTIN_XXEVAL, RS6000_BTI_unsigned_V8HI, RS6000_BTI_unsigned_V8HI, RS6000_BTI_unsigned_V8HI, RS6000_BTI_unsigned_V8HI }, - { P10_BUILTIN_VEC_XXEVAL, P10_BUILTIN_XXEVAL, + { P10_BUILTIN_VEC_XXEVAL, P10V_BUILTIN_XXEVAL, RS6000_BTI_unsigned_V4SI, RS6000_BTI_unsigned_V4SI, RS6000_BTI_unsigned_V4SI, RS6000_BTI_unsigned_V4SI }, - { P10_BUILTIN_VEC_XXEVAL, P10_BUILTIN_XXEVAL, + { P10_BUILTIN_VEC_XXEVAL, P10V_BUILTIN_XXEVAL, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI }, - { P10_BUILTIN_VEC_XXEVAL, P10_BUILTIN_XXEVAL, + { P10_BUILTIN_VEC_XXEVAL, P10V_BUILTIN_XXEVAL, RS6000_BTI_unsigned_V1TI, RS6000_BTI_unsigned_V1TI, RS6000_BTI_unsigned_V1TI, RS6000_BTI_unsigned_V1TI }, - { P10_BUILTIN_VEC_EXTRACTL, P10_BUILTIN_VEXTRACTBL, + /* The overloaded XXPERMX definitions are handled specially because the + fourth unsigned char operand is not encoded in this table. */ + { P10_BUILTIN_VEC_XXPERMX, P10V_BUILTIN_VXXPERMX, + RS6000_BTI_V16QI, RS6000_BTI_V16QI, RS6000_BTI_V16QI, + RS6000_BTI_unsigned_V16QI }, + { P10_BUILTIN_VEC_XXPERMX, P10V_BUILTIN_VXXPERMX, + RS6000_BTI_unsigned_V16QI, RS6000_BTI_unsigned_V16QI, + RS6000_BTI_unsigned_V16QI, RS6000_BTI_unsigned_V16QI }, + { P10_BUILTIN_VEC_XXPERMX, P10V_BUILTIN_VXXPERMX, + RS6000_BTI_V8HI, RS6000_BTI_V8HI, RS6000_BTI_V8HI, + RS6000_BTI_unsigned_V16QI }, + { P10_BUILTIN_VEC_XXPERMX, P10V_BUILTIN_VXXPERMX, + RS6000_BTI_unsigned_V8HI, RS6000_BTI_unsigned_V8HI, + RS6000_BTI_unsigned_V8HI, RS6000_BTI_unsigned_V16QI }, + { P10_BUILTIN_VEC_XXPERMX, P10V_BUILTIN_VXXPERMX, + RS6000_BTI_V4SI, RS6000_BTI_V4SI, RS6000_BTI_V4SI, + RS6000_BTI_unsigned_V16QI }, + { P10_BUILTIN_VEC_XXPERMX, P10V_BUILTIN_VXXPERMX, + RS6000_BTI_unsigned_V4SI, RS6000_BTI_unsigned_V4SI, + RS6000_BTI_unsigned_V4SI, RS6000_BTI_unsigned_V16QI }, + { P10_BUILTIN_VEC_XXPERMX, P10V_BUILTIN_VXXPERMX, + RS6000_BTI_V2DI, RS6000_BTI_V2DI, RS6000_BTI_V2DI, + RS6000_BTI_unsigned_V16QI }, + { P10_BUILTIN_VEC_XXPERMX, P10V_BUILTIN_VXXPERMX, + RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, + RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V16QI }, + { P10_BUILTIN_VEC_XXPERMX, P10V_BUILTIN_VXXPERMX, + RS6000_BTI_V4SF, RS6000_BTI_V4SF, RS6000_BTI_V4SF, + RS6000_BTI_unsigned_V16QI }, + { P10_BUILTIN_VEC_XXPERMX, P10V_BUILTIN_VXXPERMX, + RS6000_BTI_V2DF, RS6000_BTI_V2DF, RS6000_BTI_V2DF, + RS6000_BTI_unsigned_V16QI }, + + { P10_BUILTIN_VEC_EXTRACTL, P10V_BUILTIN_VEXTRACTBL, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V16QI, RS6000_BTI_unsigned_V16QI, RS6000_BTI_UINTQI }, - { P10_BUILTIN_VEC_EXTRACTL, P10_BUILTIN_VEXTRACTHL, + { P10_BUILTIN_VEC_EXTRACTL, P10V_BUILTIN_VEXTRACTHL, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V8HI, RS6000_BTI_unsigned_V8HI, RS6000_BTI_UINTQI }, - { P10_BUILTIN_VEC_EXTRACTL, P10_BUILTIN_VEXTRACTWL, + { P10_BUILTIN_VEC_EXTRACTL, P10V_BUILTIN_VEXTRACTWL, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V4SI, RS6000_BTI_unsigned_V4SI, RS6000_BTI_UINTQI }, - { P10_BUILTIN_VEC_EXTRACTL, P10_BUILTIN_VEXTRACTDL, + { P10_BUILTIN_VEC_EXTRACTL, P10V_BUILTIN_VEXTRACTDL, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, RS6000_BTI_UINTQI }, - { P10_BUILTIN_VEC_EXTRACTH, P10_BUILTIN_VEXTRACTBR, + { P10_BUILTIN_VEC_INSERTL, 
P10V_BUILTIN_VINSERTGPRBL, + RS6000_BTI_unsigned_V16QI, RS6000_BTI_UINTQI, + RS6000_BTI_unsigned_V16QI, RS6000_BTI_UINTSI }, + { P10_BUILTIN_VEC_INSERTL, P10V_BUILTIN_VINSERTGPRHL, + RS6000_BTI_unsigned_V8HI, RS6000_BTI_UINTHI, + RS6000_BTI_unsigned_V8HI, RS6000_BTI_UINTSI }, + { P10_BUILTIN_VEC_INSERTL, P10V_BUILTIN_VINSERTGPRWL, + RS6000_BTI_unsigned_V4SI, RS6000_BTI_UINTSI, + RS6000_BTI_unsigned_V4SI, RS6000_BTI_UINTSI }, + { P10_BUILTIN_VEC_INSERTL, P10V_BUILTIN_VINSERTGPRDL, + RS6000_BTI_unsigned_V2DI, RS6000_BTI_UINTDI, + RS6000_BTI_unsigned_V2DI, RS6000_BTI_UINTSI }, + { P10_BUILTIN_VEC_INSERTL, P10V_BUILTIN_VINSERTVPRBL, + RS6000_BTI_unsigned_V16QI, RS6000_BTI_unsigned_V16QI, + RS6000_BTI_unsigned_V16QI, RS6000_BTI_UINTQI }, + { P10_BUILTIN_VEC_INSERTL, P10V_BUILTIN_VINSERTVPRHL, + RS6000_BTI_unsigned_V8HI, RS6000_BTI_unsigned_V8HI, + RS6000_BTI_unsigned_V8HI, RS6000_BTI_UINTQI }, + { P10_BUILTIN_VEC_INSERTL, P10V_BUILTIN_VINSERTVPRWL, + RS6000_BTI_unsigned_V4SI, RS6000_BTI_unsigned_V4SI, + RS6000_BTI_unsigned_V4SI, RS6000_BTI_UINTQI }, + + { P10_BUILTIN_VEC_EXTRACTH, P10V_BUILTIN_VEXTRACTBR, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V16QI, RS6000_BTI_unsigned_V16QI, RS6000_BTI_UINTQI }, - { P10_BUILTIN_VEC_EXTRACTH, P10_BUILTIN_VEXTRACTHR, + { P10_BUILTIN_VEC_EXTRACTH, P10V_BUILTIN_VEXTRACTHR, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V8HI, RS6000_BTI_unsigned_V8HI, RS6000_BTI_UINTQI }, - { P10_BUILTIN_VEC_EXTRACTH, P10_BUILTIN_VEXTRACTWR, + { P10_BUILTIN_VEC_EXTRACTH, P10V_BUILTIN_VEXTRACTWR, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V4SI, RS6000_BTI_unsigned_V4SI, RS6000_BTI_UINTQI }, - { P10_BUILTIN_VEC_EXTRACTH, P10_BUILTIN_VEXTRACTDR, + { P10_BUILTIN_VEC_EXTRACTH, P10V_BUILTIN_VEXTRACTDR, + RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, + RS6000_BTI_unsigned_V2DI, RS6000_BTI_UINTQI }, + + { P10_BUILTIN_VEC_INSERTH, P10V_BUILTIN_VINSERTGPRBR, + RS6000_BTI_unsigned_V16QI, RS6000_BTI_UINTQI, + RS6000_BTI_unsigned_V16QI, RS6000_BTI_UINTSI }, + { P10_BUILTIN_VEC_INSERTH, P10V_BUILTIN_VINSERTGPRHR, + RS6000_BTI_unsigned_V8HI, RS6000_BTI_UINTHI, + RS6000_BTI_unsigned_V8HI, RS6000_BTI_UINTSI }, + { P10_BUILTIN_VEC_INSERTH, P10V_BUILTIN_VINSERTGPRWR, + RS6000_BTI_unsigned_V4SI, RS6000_BTI_UINTSI, + RS6000_BTI_unsigned_V4SI, RS6000_BTI_UINTSI }, + { P10_BUILTIN_VEC_INSERTH, P10V_BUILTIN_VINSERTGPRDR, + RS6000_BTI_unsigned_V2DI, RS6000_BTI_UINTDI, + RS6000_BTI_unsigned_V2DI, RS6000_BTI_UINTSI }, + { P10_BUILTIN_VEC_INSERTH, P10V_BUILTIN_VINSERTVPRBR, + RS6000_BTI_unsigned_V16QI, RS6000_BTI_unsigned_V16QI, + RS6000_BTI_unsigned_V16QI, RS6000_BTI_UINTQI }, + { P10_BUILTIN_VEC_INSERTH, P10V_BUILTIN_VINSERTVPRHR, + RS6000_BTI_unsigned_V8HI, RS6000_BTI_unsigned_V8HI, + RS6000_BTI_unsigned_V8HI, RS6000_BTI_UINTQI }, + { P10_BUILTIN_VEC_INSERTH, P10V_BUILTIN_VINSERTVPRWR, + RS6000_BTI_unsigned_V4SI, RS6000_BTI_unsigned_V4SI, + RS6000_BTI_unsigned_V4SI, RS6000_BTI_UINTQI }, + + { P10_BUILTIN_VEC_REPLACE_ELT, P10V_BUILTIN_VREPLACE_ELT_UV4SI, + RS6000_BTI_unsigned_V4SI, RS6000_BTI_unsigned_V4SI, + RS6000_BTI_UINTSI, RS6000_BTI_UINTQI }, + { P10_BUILTIN_VEC_REPLACE_ELT, P10V_BUILTIN_VREPLACE_ELT_V4SI, + RS6000_BTI_V4SI, RS6000_BTI_V4SI, RS6000_BTI_INTSI, RS6000_BTI_INTQI }, + { P10_BUILTIN_VEC_REPLACE_ELT, P10V_BUILTIN_VREPLACE_ELT_V4SF, + RS6000_BTI_V4SF, RS6000_BTI_V4SF, RS6000_BTI_float, RS6000_BTI_INTQI }, + { P10_BUILTIN_VEC_REPLACE_ELT, P10V_BUILTIN_VREPLACE_ELT_UV2DI, + RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, + RS6000_BTI_UINTDI, RS6000_BTI_UINTQI }, + { 
P10_BUILTIN_VEC_REPLACE_ELT, P10V_BUILTIN_VREPLACE_ELT_V2DI, + RS6000_BTI_V2DI, RS6000_BTI_V2DI, RS6000_BTI_INTDI, RS6000_BTI_INTQI }, + { P10_BUILTIN_VEC_REPLACE_ELT, P10V_BUILTIN_VREPLACE_ELT_V2DF, + RS6000_BTI_V2DF, RS6000_BTI_V2DF, RS6000_BTI_double, RS6000_BTI_INTQI }, + + { P10_BUILTIN_VEC_REPLACE_UN, P10V_BUILTIN_VREPLACE_UN_UV4SI, + RS6000_BTI_unsigned_V4SI, RS6000_BTI_unsigned_V4SI, + RS6000_BTI_UINTSI, RS6000_BTI_UINTQI }, + { P10_BUILTIN_VEC_REPLACE_UN, P10V_BUILTIN_VREPLACE_UN_V4SI, + RS6000_BTI_V4SI, RS6000_BTI_V4SI, RS6000_BTI_INTSI, RS6000_BTI_INTQI }, + { P10_BUILTIN_VEC_REPLACE_UN, P10V_BUILTIN_VREPLACE_UN_V4SF, + RS6000_BTI_V4SF, RS6000_BTI_V4SF, RS6000_BTI_float, RS6000_BTI_INTQI }, + { P10_BUILTIN_VEC_REPLACE_UN, P10V_BUILTIN_VREPLACE_UN_UV2DI, + RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, + RS6000_BTI_UINTDI, RS6000_BTI_UINTQI }, + { P10_BUILTIN_VEC_REPLACE_UN, P10V_BUILTIN_VREPLACE_UN_V2DI, + RS6000_BTI_V2DI, RS6000_BTI_V2DI, RS6000_BTI_INTDI, RS6000_BTI_INTQI }, + { P10_BUILTIN_VEC_REPLACE_UN, P10V_BUILTIN_VREPLACE_UN_V2DF, + RS6000_BTI_V2DF, RS6000_BTI_V2DF, RS6000_BTI_double, RS6000_BTI_INTQI }, + + { P10_BUILTIN_VEC_SLDB, P10V_BUILTIN_VSLDB_V16QI, + RS6000_BTI_V16QI, RS6000_BTI_V16QI, + RS6000_BTI_V16QI, RS6000_BTI_UINTQI }, + { P10_BUILTIN_VEC_SLDB, P10V_BUILTIN_VSLDB_V16QI, + RS6000_BTI_unsigned_V16QI, RS6000_BTI_unsigned_V16QI, + RS6000_BTI_unsigned_V16QI, RS6000_BTI_UINTQI }, + { P10_BUILTIN_VEC_SLDB, P10V_BUILTIN_VSLDB_V8HI, + RS6000_BTI_V8HI, RS6000_BTI_V8HI, + RS6000_BTI_V8HI, RS6000_BTI_UINTQI }, + { P10_BUILTIN_VEC_SLDB, P10V_BUILTIN_VSLDB_V8HI, + RS6000_BTI_unsigned_V8HI, RS6000_BTI_unsigned_V8HI, + RS6000_BTI_unsigned_V8HI, RS6000_BTI_UINTQI }, + { P10_BUILTIN_VEC_SLDB, P10V_BUILTIN_VSLDB_V4SI, + RS6000_BTI_V4SI, RS6000_BTI_V4SI, + RS6000_BTI_V4SI, RS6000_BTI_UINTQI }, + { P10_BUILTIN_VEC_SLDB, P10V_BUILTIN_VSLDB_V4SI, + RS6000_BTI_unsigned_V4SI, RS6000_BTI_unsigned_V4SI, + RS6000_BTI_unsigned_V4SI, RS6000_BTI_UINTQI }, + { P10_BUILTIN_VEC_SLDB, P10V_BUILTIN_VSLDB_V2DI, + RS6000_BTI_V2DI, RS6000_BTI_V2DI, + RS6000_BTI_V2DI, RS6000_BTI_UINTQI }, + { P10_BUILTIN_VEC_SLDB, P10V_BUILTIN_VSLDB_V2DI, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, RS6000_BTI_UINTQI }, - { P10_BUILTIN_VEC_VSTRIL, P10_BUILTIN_VSTRIBL, + { P10_BUILTIN_VEC_XXSPLTIW, P10V_BUILTIN_VXXSPLTIW_V4SI, + RS6000_BTI_V4SI, RS6000_BTI_INTSI, 0, 0 }, + { P10_BUILTIN_VEC_XXSPLTIW, P10V_BUILTIN_VXXSPLTIW_V4SF, + RS6000_BTI_V4SF, RS6000_BTI_float, 0, 0 }, + + { P10_BUILTIN_VEC_XXSPLTID, P10V_BUILTIN_VXXSPLTID, + RS6000_BTI_V2DF, RS6000_BTI_float, 0, 0 }, + + { P10_BUILTIN_VEC_XXSPLTI32DX, P10V_BUILTIN_VXXSPLTI32DX_V4SI, + RS6000_BTI_V4SI, RS6000_BTI_V4SI, RS6000_BTI_UINTQI, RS6000_BTI_INTSI }, + { P10_BUILTIN_VEC_XXSPLTI32DX, P10V_BUILTIN_VXXSPLTI32DX_V4SI, + RS6000_BTI_unsigned_V4SI, RS6000_BTI_unsigned_V4SI, RS6000_BTI_UINTQI, + RS6000_BTI_UINTSI }, + { P10_BUILTIN_VEC_XXSPLTI32DX, P10V_BUILTIN_VXXSPLTI32DX_V4SF, + RS6000_BTI_V4SF, RS6000_BTI_V4SF, RS6000_BTI_UINTQI, RS6000_BTI_float }, + + { P10_BUILTIN_VEC_XXBLEND, P10V_BUILTIN_VXXBLEND_V16QI, + RS6000_BTI_V16QI, RS6000_BTI_V16QI, RS6000_BTI_V16QI, + RS6000_BTI_unsigned_V16QI }, + { P10_BUILTIN_VEC_XXBLEND, P10V_BUILTIN_VXXBLEND_V16QI, + RS6000_BTI_unsigned_V16QI, RS6000_BTI_unsigned_V16QI, + RS6000_BTI_unsigned_V16QI, RS6000_BTI_unsigned_V16QI }, + { P10_BUILTIN_VEC_XXBLEND, P10V_BUILTIN_VXXBLEND_V8HI, + RS6000_BTI_V8HI, RS6000_BTI_V8HI, RS6000_BTI_V8HI, + RS6000_BTI_unsigned_V8HI }, + { 
P10_BUILTIN_VEC_XXBLEND, P10V_BUILTIN_VXXBLEND_V8HI, + RS6000_BTI_unsigned_V8HI, RS6000_BTI_unsigned_V8HI, + RS6000_BTI_unsigned_V8HI, RS6000_BTI_unsigned_V8HI }, + { P10_BUILTIN_VEC_XXBLEND, P10V_BUILTIN_VXXBLEND_V4SI, + RS6000_BTI_V4SI, RS6000_BTI_V4SI, RS6000_BTI_V4SI, + RS6000_BTI_unsigned_V4SI }, + { P10_BUILTIN_VEC_XXBLEND, P10V_BUILTIN_VXXBLEND_V4SI, + RS6000_BTI_unsigned_V4SI, RS6000_BTI_unsigned_V4SI, + RS6000_BTI_unsigned_V4SI, RS6000_BTI_unsigned_V4SI }, + { P10_BUILTIN_VEC_XXBLEND, P10V_BUILTIN_VXXBLEND_V2DI, + RS6000_BTI_V2DI, RS6000_BTI_V2DI, RS6000_BTI_V2DI, + RS6000_BTI_unsigned_V2DI }, + { P10_BUILTIN_VEC_XXBLEND, P10V_BUILTIN_VXXBLEND_V2DI, + RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, + RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI }, + { P10_BUILTIN_VEC_XXBLEND, P10V_BUILTIN_VXXBLEND_V4SF, + RS6000_BTI_V4SF, RS6000_BTI_V4SF, RS6000_BTI_V4SF, + RS6000_BTI_unsigned_V4SI }, + { P10_BUILTIN_VEC_XXBLEND, P10V_BUILTIN_VXXBLEND_V2DF, + RS6000_BTI_V2DF, RS6000_BTI_V2DF, RS6000_BTI_V2DF, + RS6000_BTI_unsigned_V2DI }, + + { P10_BUILTIN_VEC_SRDB, P10V_BUILTIN_VSRDB_V16QI, + RS6000_BTI_V16QI, RS6000_BTI_V16QI, + RS6000_BTI_V16QI, RS6000_BTI_UINTQI }, + { P10_BUILTIN_VEC_SRDB, P10V_BUILTIN_VSRDB_V16QI, + RS6000_BTI_unsigned_V16QI, RS6000_BTI_unsigned_V16QI, + RS6000_BTI_unsigned_V16QI, RS6000_BTI_UINTQI }, + { P10_BUILTIN_VEC_SRDB, P10V_BUILTIN_VSRDB_V8HI, + RS6000_BTI_V8HI, RS6000_BTI_V8HI, + RS6000_BTI_V8HI, RS6000_BTI_UINTQI }, + { P10_BUILTIN_VEC_SRDB, P10V_BUILTIN_VSRDB_V8HI, + RS6000_BTI_unsigned_V8HI, RS6000_BTI_unsigned_V8HI, + RS6000_BTI_unsigned_V8HI, RS6000_BTI_UINTQI }, + { P10_BUILTIN_VEC_SRDB, P10V_BUILTIN_VSRDB_V4SI, + RS6000_BTI_V4SI, RS6000_BTI_V4SI, + RS6000_BTI_V4SI, RS6000_BTI_UINTQI }, + { P10_BUILTIN_VEC_SRDB, P10V_BUILTIN_VSRDB_V4SI, + RS6000_BTI_unsigned_V4SI, RS6000_BTI_unsigned_V4SI, + RS6000_BTI_unsigned_V4SI, RS6000_BTI_UINTQI }, + { P10_BUILTIN_VEC_SRDB, P10V_BUILTIN_VSRDB_V2DI, + RS6000_BTI_V2DI, RS6000_BTI_V2DI, + RS6000_BTI_V2DI, RS6000_BTI_UINTQI }, + { P10_BUILTIN_VEC_SRDB, P10V_BUILTIN_VSRDB_V2DI, + RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, + RS6000_BTI_unsigned_V2DI, RS6000_BTI_UINTQI }, + + { P10_BUILTIN_VEC_VSTRIL, P10V_BUILTIN_VSTRIBL, RS6000_BTI_unsigned_V16QI, RS6000_BTI_unsigned_V16QI, 0, 0 }, - { P10_BUILTIN_VEC_VSTRIL, P10_BUILTIN_VSTRIBL, + { P10_BUILTIN_VEC_VSTRIL, P10V_BUILTIN_VSTRIBL, RS6000_BTI_V16QI, RS6000_BTI_V16QI, 0, 0 }, - { P10_BUILTIN_VEC_VSTRIL, P10_BUILTIN_VSTRIHL, + { P10_BUILTIN_VEC_VSTRIL, P10V_BUILTIN_VSTRIHL, RS6000_BTI_unsigned_V8HI, RS6000_BTI_unsigned_V8HI, 0, 0 }, - { P10_BUILTIN_VEC_VSTRIL, P10_BUILTIN_VSTRIHL, + { P10_BUILTIN_VEC_VSTRIL, P10V_BUILTIN_VSTRIHL, RS6000_BTI_V8HI, RS6000_BTI_V8HI, 0, 0 }, - { P10_BUILTIN_VEC_VSTRIL_P, P10_BUILTIN_VSTRIBL_P, + { P10_BUILTIN_VEC_VSTRIL_P, P10V_BUILTIN_VSTRIBL_P, RS6000_BTI_INTSI, RS6000_BTI_unsigned_V16QI, 0, 0 }, - { P10_BUILTIN_VEC_VSTRIL_P, P10_BUILTIN_VSTRIBL_P, + { P10_BUILTIN_VEC_VSTRIL_P, P10V_BUILTIN_VSTRIBL_P, RS6000_BTI_INTSI, RS6000_BTI_V16QI, 0, 0 }, - { P10_BUILTIN_VEC_VSTRIL_P, P10_BUILTIN_VSTRIHL_P, + { P10_BUILTIN_VEC_VSTRIL_P, P10V_BUILTIN_VSTRIHL_P, RS6000_BTI_INTSI, RS6000_BTI_unsigned_V8HI, 0, 0 }, - { P10_BUILTIN_VEC_VSTRIL_P, P10_BUILTIN_VSTRIHL_P, + { P10_BUILTIN_VEC_VSTRIL_P, P10V_BUILTIN_VSTRIHL_P, RS6000_BTI_INTSI, RS6000_BTI_V8HI, 0, 0 }, - { P10_BUILTIN_VEC_VSTRIR, P10_BUILTIN_VSTRIBR, + { P10_BUILTIN_VEC_VSTRIR, P10V_BUILTIN_VSTRIBR, RS6000_BTI_unsigned_V16QI, RS6000_BTI_unsigned_V16QI, 0, 0 }, - { 
P10_BUILTIN_VEC_VSTRIR, P10_BUILTIN_VSTRIBR, + { P10_BUILTIN_VEC_VSTRIR, P10V_BUILTIN_VSTRIBR, RS6000_BTI_V16QI, RS6000_BTI_V16QI, 0, 0 }, - { P10_BUILTIN_VEC_VSTRIR, P10_BUILTIN_VSTRIHR, + { P10_BUILTIN_VEC_VSTRIR, P10V_BUILTIN_VSTRIHR, RS6000_BTI_unsigned_V8HI, RS6000_BTI_unsigned_V8HI, 0, 0 }, - { P10_BUILTIN_VEC_VSTRIR, P10_BUILTIN_VSTRIHR, + { P10_BUILTIN_VEC_VSTRIR, P10V_BUILTIN_VSTRIHR, RS6000_BTI_V8HI, RS6000_BTI_V8HI, 0, 0 }, - { P10_BUILTIN_VEC_VSTRIR_P, P10_BUILTIN_VSTRIBR_P, + { P10_BUILTIN_VEC_VSTRIR_P, P10V_BUILTIN_VSTRIBR_P, RS6000_BTI_INTSI, RS6000_BTI_unsigned_V16QI, 0, 0 }, - { P10_BUILTIN_VEC_VSTRIR_P, P10_BUILTIN_VSTRIBR_P, + { P10_BUILTIN_VEC_VSTRIR_P, P10V_BUILTIN_VSTRIBR_P, RS6000_BTI_INTSI, RS6000_BTI_V16QI, 0, 0 }, - { P10_BUILTIN_VEC_VSTRIR_P, P10_BUILTIN_VSTRIHR_P, + { P10_BUILTIN_VEC_VSTRIR_P, P10V_BUILTIN_VSTRIHR_P, RS6000_BTI_INTSI, RS6000_BTI_unsigned_V8HI, 0, 0 }, - { P10_BUILTIN_VEC_VSTRIR_P, P10_BUILTIN_VSTRIHR_P, + { P10_BUILTIN_VEC_VSTRIR_P, P10V_BUILTIN_VSTRIHR_P, RS6000_BTI_INTSI, RS6000_BTI_V8HI, 0, 0 }, - { P10_BUILTIN_VEC_MTVSRBM, P10_BUILTIN_MTVSRBM, + { P10_BUILTIN_VEC_MTVSRBM, P10V_BUILTIN_MTVSRBM, RS6000_BTI_unsigned_V16QI, RS6000_BTI_UINTDI, 0, 0 }, - { P10_BUILTIN_VEC_MTVSRHM, P10_BUILTIN_MTVSRHM, + { P10_BUILTIN_VEC_MTVSRHM, P10V_BUILTIN_MTVSRHM, RS6000_BTI_unsigned_V8HI, RS6000_BTI_UINTDI, 0, 0 }, - { P10_BUILTIN_VEC_MTVSRWM, P10_BUILTIN_MTVSRWM, + { P10_BUILTIN_VEC_MTVSRWM, P10V_BUILTIN_MTVSRWM, RS6000_BTI_unsigned_V4SI, RS6000_BTI_UINTDI, 0, 0 }, - { P10_BUILTIN_VEC_MTVSRDM, P10_BUILTIN_MTVSRDM, + { P10_BUILTIN_VEC_MTVSRDM, P10V_BUILTIN_MTVSRDM, RS6000_BTI_unsigned_V2DI, RS6000_BTI_UINTDI, 0, 0 }, - { P10_BUILTIN_VEC_MTVSRQM, P10_BUILTIN_MTVSRQM, + { P10_BUILTIN_VEC_MTVSRQM, P10V_BUILTIN_MTVSRQM, RS6000_BTI_unsigned_V1TI, RS6000_BTI_UINTDI, 0, 0 }, - { P10_BUILTIN_VEC_VCNTM, P10_BUILTIN_VCNTMBB, + { P10_BUILTIN_VEC_VCNTM, P10V_BUILTIN_VCNTMBB, RS6000_BTI_unsigned_long_long, RS6000_BTI_unsigned_V16QI, RS6000_BTI_UINTQI, 0 }, - { P10_BUILTIN_VEC_VCNTM, P10_BUILTIN_VCNTMBH, + { P10_BUILTIN_VEC_VCNTM, P10V_BUILTIN_VCNTMBH, RS6000_BTI_unsigned_long_long, RS6000_BTI_unsigned_V8HI, RS6000_BTI_UINTQI, 0 }, - { P10_BUILTIN_VEC_VCNTM, P10_BUILTIN_VCNTMBW, + { P10_BUILTIN_VEC_VCNTM, P10V_BUILTIN_VCNTMBW, RS6000_BTI_unsigned_long_long, RS6000_BTI_unsigned_V4SI, RS6000_BTI_UINTQI, 0 }, - { P10_BUILTIN_VEC_VCNTM, P10_BUILTIN_VCNTMBD, + { P10_BUILTIN_VEC_VCNTM, P10V_BUILTIN_VCNTMBD, RS6000_BTI_unsigned_long_long, RS6000_BTI_unsigned_V2DI, RS6000_BTI_UINTQI, 0 }, - { P10_BUILTIN_VEC_VEXPANDM, P10_BUILTIN_VEXPANDMB, + { P10_BUILTIN_VEC_VEXPANDM, P10V_BUILTIN_VEXPANDMB, RS6000_BTI_unsigned_V16QI, RS6000_BTI_unsigned_V16QI, 0, 0 }, - { P10_BUILTIN_VEC_VEXPANDM, P10_BUILTIN_VEXPANDMH, + { P10_BUILTIN_VEC_VEXPANDM, P10V_BUILTIN_VEXPANDMH, RS6000_BTI_unsigned_V8HI, RS6000_BTI_unsigned_V8HI, 0, 0 }, - { P10_BUILTIN_VEC_VEXPANDM, P10_BUILTIN_VEXPANDMW, + { P10_BUILTIN_VEC_VEXPANDM, P10V_BUILTIN_VEXPANDMW, RS6000_BTI_unsigned_V4SI, RS6000_BTI_unsigned_V4SI, 0, 0 }, - { P10_BUILTIN_VEC_VEXPANDM, P10_BUILTIN_VEXPANDMD, + { P10_BUILTIN_VEC_VEXPANDM, P10V_BUILTIN_VEXPANDMD, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, 0, 0 }, - { P10_BUILTIN_VEC_VEXPANDM, P10_BUILTIN_VEXPANDMQ, + { P10_BUILTIN_VEC_VEXPANDM, P10V_BUILTIN_VEXPANDMQ, RS6000_BTI_unsigned_V1TI, RS6000_BTI_unsigned_V1TI, 0, 0 }, - { P10_BUILTIN_VEC_VEXTRACTM, P10_BUILTIN_VEXTRACTMB, + { P10_BUILTIN_VEC_VEXTRACTM, P10V_BUILTIN_VEXTRACTMB, RS6000_BTI_INTSI, RS6000_BTI_unsigned_V16QI, 0, 0 }, - { 
P10_BUILTIN_VEC_VEXTRACTM, P10_BUILTIN_VEXTRACTMH, + { P10_BUILTIN_VEC_VEXTRACTM, P10V_BUILTIN_VEXTRACTMH, RS6000_BTI_INTSI, RS6000_BTI_unsigned_V8HI, 0, 0 }, - { P10_BUILTIN_VEC_VEXTRACTM, P10_BUILTIN_VEXTRACTMW, + { P10_BUILTIN_VEC_VEXTRACTM, P10V_BUILTIN_VEXTRACTMW, RS6000_BTI_INTSI, RS6000_BTI_unsigned_V4SI, 0, 0 }, - { P10_BUILTIN_VEC_VEXTRACTM, P10_BUILTIN_VEXTRACTMD, + { P10_BUILTIN_VEC_VEXTRACTM, P10V_BUILTIN_VEXTRACTMD, RS6000_BTI_INTSI, RS6000_BTI_unsigned_V2DI, 0, 0 }, - { P10_BUILTIN_VEC_VEXTRACTM, P10_BUILTIN_VEXTRACTMQ, + { P10_BUILTIN_VEC_VEXTRACTM, P10V_BUILTIN_VEXTRACTMQ, RS6000_BTI_INTSI, RS6000_BTI_unsigned_V1TI, 0, 0 }, + { P10_BUILTIN_VEC_XVTLSBB_ZEROS, P10V_BUILTIN_XVTLSBB_ZEROS, + RS6000_BTI_INTSI, RS6000_BTI_unsigned_V16QI, 0, 0 }, + { P10_BUILTIN_VEC_XVTLSBB_ONES, P10V_BUILTIN_XVTLSBB_ONES, + RS6000_BTI_INTSI, RS6000_BTI_unsigned_V16QI, 0, 0 }, + { RS6000_BUILTIN_NONE, RS6000_BUILTIN_NONE, 0, 0, 0, 0 } }; @@ -6235,8 +6442,30 @@ machine_mode rs6000_promote_function_mode (const_tree type ATTRIBUTE_UNUSED, machine_mode mode, int *punsignedp ATTRIBUTE_UNUSED, - const_tree, int) + const_tree, int for_return) { + /* Warning: this is a static local variable and not always NULL! + This function is called multiple times for the same function + and return value. PREV_FUNC is used to keep track of the + first time we encounter a function's return value in order + to not report an error with that return value multiple times. */ + static struct function *prev_func = NULL; + + /* We do not allow MMA types being used as return values. Only report + the invalid return value usage the first time we encounter it. */ + if (for_return + && prev_func != cfun + && (mode == POImode || mode == PXImode)) + { + /* Record we have now handled function CFUN, so the next time we + are called, we do not re-report the same error. */ + prev_func = cfun; + if (TYPE_CANONICAL (type) != NULL_TREE) + type = TYPE_CANONICAL (type); + error ("invalid use of MMA type %qs as a function return value", + IDENTIFIER_POINTER (DECL_NAME (TYPE_NAME (type)))); + } + PROMOTE_MODE (mode, *punsignedp, type); return mode; @@ -7187,6 +7416,16 @@ rs6000_function_arg (cumulative_args_t cum_v, const function_arg_info &arg) machine_mode elt_mode; int n_elts; + /* We do not allow MMA types being used as function arguments. */ + if (mode == POImode || mode == PXImode) + { + if (TYPE_CANONICAL (type) != NULL_TREE) + type = TYPE_CANONICAL (type); + error ("invalid use of MMA operand of type %qs as a function parameter", + IDENTIFIER_POINTER (DECL_NAME (TYPE_NAME (type)))); + return NULL_RTX; + } + /* Return a marker to indicate whether CR1 needs to set or clear the bit that V.4 uses to say fp args were passed in registers. Assume that we don't need the marker for software floating point, @@ -10018,6 +10257,66 @@ rs6000_expand_quaternop_builtin (enum insn_code icode, tree exp, rtx target) } } + else if (icode == CODE_FOR_xxpermx) + { + /* Only allow 3-bit unsigned literals. */ + STRIP_NOPS (arg3); + if (TREE_CODE (arg3) != INTEGER_CST + || TREE_INT_CST_LOW (arg3) & ~0x7) + { + error ("argument 4 must be a 3-bit unsigned literal"); + return CONST0_RTX (tmode); + } + } + + else if (icode == CODE_FOR_vreplace_elt_v4si + || icode == CODE_FOR_vreplace_elt_v4sf) + { + /* Check whether the 3rd argument is an integer constant in the range + 0 to 3 inclusive. 
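
/* Editorial sketch, not part of the patch: the checks added above in
   rs6000_promote_function_mode and rs6000_function_arg reject passing or
   returning MMA types by value, so code like the following now gets a
   diagnostic; operating on the accumulator through a pointer remains the
   supported style.  */
__vector_quad
bad_return (__vector_quad *p)
{
  return *p;	/* error: invalid use of MMA type as a function return value */
}
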
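
/* Editorial sketch, not part of the patch: the literal-range checks in
   rs6000_expand_quaternop_builtin (this hunk) are what users of the new
   element-replace and shift-double builtins see.  vec_replace_elt and
   vec_sldb are assumed to be the <altivec.h> spellings of the
   __builtin_vec_replace_elt and __builtin_vec_sldb overloads added to the
   table above.  */
#include <altivec.h>

vector unsigned int
patch_and_shift (vector unsigned int v, unsigned int x, vector unsigned int w)
{
  v = vec_replace_elt (v, x, 2);   /* word index must be 0..3 */
  return vec_sldb (v, w, 5);       /* shift amount must be 0..7 */
  /* vec_replace_elt (v, x, 4) or vec_sldb (v, w, 9) would be rejected
     with the range diagnostics added here.  */
}
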
*/ + STRIP_NOPS (arg2); + if (TREE_CODE (arg2) != INTEGER_CST + || !IN_RANGE (TREE_INT_CST_LOW (arg2), 0, 3)) + { + error ("argument 3 must be in the range 0 to 3"); + return CONST0_RTX (tmode); + } + } + + else if (icode == CODE_FOR_vreplace_un_v4si + || icode == CODE_FOR_vreplace_un_v4sf) + { + /* Check whether the 3rd argument is an integer constant in the range + 0 to 12 inclusive. */ + STRIP_NOPS (arg2); + if (TREE_CODE (arg2) != INTEGER_CST + || !IN_RANGE(TREE_INT_CST_LOW (arg2), 0, 12)) + { + error ("argument 3 must be in the range 0 to 12"); + return CONST0_RTX (tmode); + } + } + + else if (icode == CODE_FOR_vsldb_v16qi + || icode == CODE_FOR_vsldb_v8hi + || icode == CODE_FOR_vsldb_v4si + || icode == CODE_FOR_vsldb_v2di + || icode == CODE_FOR_vsrdb_v16qi + || icode == CODE_FOR_vsrdb_v8hi + || icode == CODE_FOR_vsrdb_v4si + || icode == CODE_FOR_vsrdb_v2di) + { + /* Check whether the 3rd argument is an integer constant in the range + 0 to 7 inclusive. */ + STRIP_NOPS (arg2); + if (TREE_CODE (arg2) != INTEGER_CST + || !IN_RANGE (TREE_INT_CST_LOW (arg2), 0, 7)) + { + error ("argument 3 must be a constant in the range 0 to 7"); + return CONST0_RTX (tmode); + } + } + if (target == 0 || GET_MODE (target) != tmode || ! (*insn_data[icode].operand[0].predicate) (target, tmode)) @@ -11154,11 +11453,12 @@ rs6000_gimple_fold_mma_builtin (gimple_stmt_iterator *gsi) tree src_array = build1 (VIEW_CONVERT_EXPR, array_type, src); for (unsigned i = 0; i < 4; i++) { + unsigned index = WORDS_BIG_ENDIAN ? i : 3 - i; tree ref = build4 (ARRAY_REF, unsigned_V16QI_type_node, src_array, build_int_cst (size_type_node, i), NULL_TREE, NULL_TREE); tree dst = build2 (MEM_REF, unsigned_V16QI_type_node, dst_base, - build_int_cst (dst_type, i * 16)); + build_int_cst (dst_type, index * 16)); gimplify_assign (dst, ref, &new_seq); } pop_gimplify_context (NULL); @@ -11169,12 +11469,8 @@ rs6000_gimple_fold_mma_builtin (gimple_stmt_iterator *gsi) /* Convert this built-in into an internal version that uses pass-by-value arguments. The internal built-in follows immediately after this one. */ new_decl = rs6000_builtin_decls[fncode + 1]; - tree lhs, mem, op[MAX_MMA_OPERANDS]; + tree lhs, op[MAX_MMA_OPERANDS]; tree acc = gimple_call_arg (stmt, 0); - if (TREE_CODE (acc) == PARM_DECL) - mem = build1 (INDIRECT_REF, TREE_TYPE (TREE_TYPE (acc)), acc); - else - mem = build_simple_mem_ref (acc); push_gimplify_context (true); if ((attr & RS6000_BTC_QUAD) != 0) @@ -11184,7 +11480,7 @@ rs6000_gimple_fold_mma_builtin (gimple_stmt_iterator *gsi) op[0] = make_ssa_name (vector_quad_type_node); for (unsigned i = 1; i < nopnds; i++) op[i] = gimple_call_arg (stmt, i); - gimplify_assign (op[0], mem, &new_seq); + gimplify_assign (op[0], build_simple_mem_ref (acc), &new_seq); } else { @@ -11234,7 +11530,7 @@ rs6000_gimple_fold_mma_builtin (gimple_stmt_iterator *gsi) lhs = make_ssa_name (vector_quad_type_node); gimple_call_set_lhs (new_call, lhs); gimple_seq_add_stmt (&new_seq, new_call); - gimplify_assign (mem, lhs, &new_seq); + gimplify_assign (build_simple_mem_ref (acc), lhs, &new_seq); pop_gimplify_context (NULL); gsi_replace_with_seq (gsi, new_seq, true); @@ -12341,7 +12637,6 @@ rs6000_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, } case ALTIVEC_BUILTIN_MASK_FOR_LOAD: - case ALTIVEC_BUILTIN_MASK_FOR_STORE: { int icode2 = (BYTES_BIG_ENDIAN ? 
(int) CODE_FOR_altivec_lvsr_direct : (int) CODE_FOR_altivec_lvsl_direct); @@ -12356,14 +12651,9 @@ rs6000_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, gcc_assert (POINTER_TYPE_P (TREE_TYPE (arg))); op = expand_expr (arg, NULL_RTX, Pmode, EXPAND_NORMAL); addr = memory_address (mode, op); - if (fcode == ALTIVEC_BUILTIN_MASK_FOR_STORE) - op = addr; - else - { - /* For the load case need to negate the address. */ - op = gen_reg_rtx (GET_MODE (addr)); - emit_insn (gen_rtx_SET (op, gen_rtx_NEG (GET_MODE (addr), addr))); - } + /* We need to negate the address. */ + op = gen_reg_rtx (GET_MODE (addr)); + emit_insn (gen_rtx_SET (op, gen_rtx_NEG (GET_MODE (addr), addr))); op = gen_rtx_MEM (mode, op); if (target == 0 @@ -12509,7 +12799,7 @@ rs6000_init_builtins (void) V2DI_type_node = rs6000_vector_type (TARGET_POWERPC64 ? "__vector long" : "__vector long long", - intDI_type_node, 2); + long_long_integer_type_node, 2); V2DF_type_node = rs6000_vector_type ("__vector double", double_type_node, 2); V4SI_type_node = rs6000_vector_type ("__vector signed int", intSI_type_node, 4); @@ -12528,7 +12818,7 @@ rs6000_init_builtins (void) unsigned_V2DI_type_node = rs6000_vector_type (TARGET_POWERPC64 ? "__vector unsigned long" : "__vector unsigned long long", - unsigned_intDI_type_node, 2); + long_long_unsigned_type_node, 2); opaque_V4SI_type_node = build_opaque_vector_type (intSI_type_node, 4); @@ -13198,7 +13488,7 @@ altivec_init_builtins (void) { def_builtin ("__builtin_altivec_stxvl", void_ftype_v16qi_pvoid_long, P9V_BUILTIN_STXVL); - def_builtin ("__builtin_xst_len_r", void_ftype_v16qi_pvoid_long, + def_builtin ("__builtin_altivec_xst_len_r", void_ftype_v16qi_pvoid_long, P9V_BUILTIN_XST_LEN_R); } @@ -13654,15 +13944,20 @@ builtin_quaternary_function_type (machine_mode mode_ret, tree function_type = NULL; static tree v2udi_type = builtin_mode_to_type[V2DImode][1]; + static tree v16uqi_type = builtin_mode_to_type[V16QImode][1]; static tree uchar_type = builtin_mode_to_type[QImode][1]; static tree xxeval_type = build_function_type_list (v2udi_type, v2udi_type, v2udi_type, v2udi_type, uchar_type, NULL_TREE); + static tree xxpermx_type = + build_function_type_list (v2udi_type, v2udi_type, v2udi_type, + v16uqi_type, uchar_type, NULL_TREE); + switch (builtin) { - case P10_BUILTIN_XXEVAL: + case P10V_BUILTIN_XXEVAL: gcc_assert ((mode_ret == V2DImode) && (mode_arg0 == V2DImode) && (mode_arg1 == V2DImode) @@ -13671,6 +13966,15 @@ builtin_quaternary_function_type (machine_mode mode_ret, function_type = xxeval_type; break; + case P10V_BUILTIN_VXXPERMX: + gcc_assert ((mode_ret == V2DImode) + && (mode_arg0 == V2DImode) + && (mode_arg1 == V2DImode) + && (mode_arg2 == V16QImode) + && (mode_arg3 == QImode)); + function_type = xxpermx_type; + break; + default: /* A case for each quaternary built-in must be provided above. 
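
/* Editorial sketch, not part of the patch: in C terms the two quaternary
   signatures assembled above are roughly the following; the typedef and
   function names are placeholders, not the real builtin identifiers.  */
#include <altivec.h>

typedef vector unsigned long long v2du_t;
typedef vector unsigned char v16qu_t;

v2du_t xxeval_like  (v2du_t, v2du_t, v2du_t, unsigned char);
v2du_t xxpermx_like (v2du_t, v2du_t, v16qu_t, unsigned char);
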
*/ gcc_unreachable (); @@ -13720,22 +14024,22 @@ builtin_function_type (machine_mode mode_ret, machine_mode mode_arg0, case P8V_BUILTIN_VGBBD: case MISC_BUILTIN_CDTBCD: case MISC_BUILTIN_CBCDTD: - case VSX_BUILTIN_XVCVSPBF16: - case VSX_BUILTIN_XVCVBF16SP: - case P10_BUILTIN_MTVSRBM: - case P10_BUILTIN_MTVSRHM: - case P10_BUILTIN_MTVSRWM: - case P10_BUILTIN_MTVSRDM: - case P10_BUILTIN_MTVSRQM: - case P10_BUILTIN_VCNTMBB: - case P10_BUILTIN_VCNTMBH: - case P10_BUILTIN_VCNTMBW: - case P10_BUILTIN_VCNTMBD: - case P10_BUILTIN_VEXPANDMB: - case P10_BUILTIN_VEXPANDMH: - case P10_BUILTIN_VEXPANDMW: - case P10_BUILTIN_VEXPANDMD: - case P10_BUILTIN_VEXPANDMQ: + case P10V_BUILTIN_XVCVSPBF16: + case P10V_BUILTIN_XVCVBF16SPN: + case P10V_BUILTIN_MTVSRBM: + case P10V_BUILTIN_MTVSRHM: + case P10V_BUILTIN_MTVSRWM: + case P10V_BUILTIN_MTVSRDM: + case P10V_BUILTIN_MTVSRQM: + case P10V_BUILTIN_VCNTMBB: + case P10V_BUILTIN_VCNTMBH: + case P10V_BUILTIN_VCNTMBW: + case P10V_BUILTIN_VCNTMBD: + case P10V_BUILTIN_VEXPANDMB: + case P10V_BUILTIN_VEXPANDMH: + case P10V_BUILTIN_VEXPANDMW: + case P10V_BUILTIN_VEXPANDMD: + case P10V_BUILTIN_VEXPANDMQ: h.uns_p[0] = 1; h.uns_p[1] = 1; break; @@ -13807,16 +14111,16 @@ builtin_function_type (machine_mode mode_ret, machine_mode mode_arg0, case P8V_BUILTIN_ORC_V4SI_UNS: case P8V_BUILTIN_ORC_V2DI_UNS: case P8V_BUILTIN_ORC_V1TI_UNS: - case P10_BUILTIN_VCFUGED: - case P10_BUILTIN_VCLZDM: - case P10_BUILTIN_VCTZDM: - case P10_BUILTIN_VGNB: - case P10_BUILTIN_VPDEPD: - case P10_BUILTIN_VPEXTD: - case P10_BUILTIN_XXGENPCVM_V16QI: - case P10_BUILTIN_XXGENPCVM_V8HI: - case P10_BUILTIN_XXGENPCVM_V4SI: - case P10_BUILTIN_XXGENPCVM_V2DI: + case P10V_BUILTIN_VCFUGED: + case P10V_BUILTIN_VCLZDM: + case P10V_BUILTIN_VCTZDM: + case P10V_BUILTIN_VGNB: + case P10V_BUILTIN_VPDEPD: + case P10V_BUILTIN_VPEXTD: + case P10V_BUILTIN_XXGENPCVM_V16QI: + case P10V_BUILTIN_XXGENPCVM_V8HI: + case P10V_BUILTIN_XXGENPCVM_V4SI: + case P10V_BUILTIN_XXGENPCVM_V2DI: h.uns_p[0] = 1; h.uns_p[1] = 1; h.uns_p[2] = 1; @@ -13847,14 +14151,29 @@ builtin_function_type (machine_mode mode_ret, machine_mode mode_arg0, case CRYPTO_BUILTIN_VSHASIGMAW: case CRYPTO_BUILTIN_VSHASIGMAD: case CRYPTO_BUILTIN_VSHASIGMA: - case P10_BUILTIN_VEXTRACTBL: - case P10_BUILTIN_VEXTRACTHL: - case P10_BUILTIN_VEXTRACTWL: - case P10_BUILTIN_VEXTRACTDL: - case P10_BUILTIN_VEXTRACTBR: - case P10_BUILTIN_VEXTRACTHR: - case P10_BUILTIN_VEXTRACTWR: - case P10_BUILTIN_VEXTRACTDR: + case P10V_BUILTIN_VEXTRACTBL: + case P10V_BUILTIN_VEXTRACTHL: + case P10V_BUILTIN_VEXTRACTWL: + case P10V_BUILTIN_VEXTRACTDL: + case P10V_BUILTIN_VEXTRACTBR: + case P10V_BUILTIN_VEXTRACTHR: + case P10V_BUILTIN_VEXTRACTWR: + case P10V_BUILTIN_VEXTRACTDR: + case P10V_BUILTIN_VINSERTGPRBL: + case P10V_BUILTIN_VINSERTGPRHL: + case P10V_BUILTIN_VINSERTGPRWL: + case P10V_BUILTIN_VINSERTGPRDL: + case P10V_BUILTIN_VINSERTVPRBL: + case P10V_BUILTIN_VINSERTVPRHL: + case P10V_BUILTIN_VINSERTVPRWL: + case P10V_BUILTIN_VREPLACE_ELT_UV4SI: + case P10V_BUILTIN_VREPLACE_ELT_UV2DI: + case P10V_BUILTIN_VREPLACE_UN_UV4SI: + case P10V_BUILTIN_VREPLACE_UN_UV2DI: + case P10V_BUILTIN_VXXBLEND_V16QI: + case P10V_BUILTIN_VXXBLEND_V8HI: + case P10V_BUILTIN_VXXBLEND_V4SI: + case P10V_BUILTIN_VXXBLEND_V2DI: h.uns_p[0] = 1; h.uns_p[1] = 1; h.uns_p[2] = 1; @@ -13921,6 +14240,9 @@ builtin_function_type (machine_mode mode_ret, machine_mode mode_arg0, case ALTIVEC_BUILTIN_VSRH: case ALTIVEC_BUILTIN_VSRW: case P8V_BUILTIN_VSRD: + /* Vector splat immediate insert */ + case 
P10V_BUILTIN_VXXSPLTI32DX_V4SI: + case P10V_BUILTIN_VXXSPLTI32DX_V4SF: h.uns_p[2] = 1; break; diff --git a/gcc/config/rs6000/rs6000-internal.h b/gcc/config/rs6000/rs6000-internal.h index 9caef01..32681b6 100644 --- a/gcc/config/rs6000/rs6000-internal.h +++ b/gcc/config/rs6000/rs6000-internal.h @@ -32,7 +32,7 @@ typedef struct rs6000_stack { int cr_save_p; /* true if the CR reg needs to be saved */ unsigned int vrsave_mask; /* mask of vec registers to save */ int push_p; /* true if we need to allocate stack space */ - int calls_p; /* true if the function makes any calls */ + int calls_p; /* true if there are non-sibling calls */ int world_save_p; /* true if we're saving *everything*: r13-r31, cr, f14-f31, vrsave, v20-v31 */ enum rs6000_abi abi; /* which ABI to use */ diff --git a/gcc/config/rs6000/rs6000-logue.c b/gcc/config/rs6000/rs6000-logue.c index 6aad1ff..d90cd57 100644 --- a/gcc/config/rs6000/rs6000-logue.c +++ b/gcc/config/rs6000/rs6000-logue.c @@ -714,7 +714,7 @@ rs6000_stack_info (void) info->altivec_size = 16 * (LAST_ALTIVEC_REGNO + 1 - info->first_altivec_reg_save); - /* Does this function call anything? */ + /* Does this function call anything (apart from sibling calls)? */ info->calls_p = (!crtl->is_leaf || cfun->machine->ra_needs_full_frame); /* Determine if we need to save the condition code registers. */ @@ -1080,28 +1080,28 @@ rs6000_decl_ok_for_sibcall (tree decl) if (DEFAULT_ABI == ABI_AIX || DEFAULT_ABI == ABI_ELFv2) { - /* Under the AIX or ELFv2 ABIs we can't allow calls to non-local - functions, because the callee may have a different TOC pointer to - the caller and there's no way to ensure we restore the TOC when + /* A function compiled using the PC-relative addressing model does not + use a TOC pointer; nor is it guaranteed to preserve the value of + r2 for its caller's TOC. Such a function may make sibcalls to any + function, whether local or external, without restriction based on + TOC-save/restore rules. */ + if (rs6000_pcrel_p ()) + return true; + + /* Otherwise, under the AIX or ELFv2 ABIs we can't allow sibcalls + to non-local functions, because the callee may not preserve the + TOC pointer, and there's no way to ensure we restore the TOC when we return. */ if (!decl || DECL_EXTERNAL (decl) || DECL_WEAK (decl) || !(*targetm.binds_local_p) (decl)) return false; - /* Similarly, if the caller preserves the TOC pointer and the callee - doesn't (or vice versa), proper TOC setup or restoration will be - missed. For example, suppose A, B, and C are in the same binary - and A -> B -> C. A and B preserve the TOC pointer but C does not, - and B -> C is eligible as a sibcall. A will call B through its - local entry point, so A will not restore its TOC itself. B calls - C with a sibcall, so it will not restore the TOC. C does not - preserve the TOC, so it may clobber r2 with impunity. Returning - from C will result in a corrupted TOC for A. */ - else if (rs6000_fndecl_pcrel_p (decl) != rs6000_pcrel_p (cfun)) + /* A local sibcall from a function that preserves the TOC pointer + to a function that does not is invalid for the same reason. */ + if (rs6000_fndecl_pcrel_p (decl)) return false; - else - return true; + return true; } /* With the secure-plt SYSV ABI we can't make non-local calls when @@ -2562,7 +2562,7 @@ rs6000_global_entry_point_prologue_needed_p (void) return false; /* PC-relative functions never generate a global entry point prologue. */ - if (rs6000_pcrel_p (cfun)) + if (rs6000_pcrel_p ()) return false; /* Ensure we have a global entry point for thunks. ??? 
We could @@ -3978,7 +3978,7 @@ rs6000_output_function_prologue (FILE *file) fputs ("\n", file); } - else if (rs6000_pcrel_p (cfun)) + else if (rs6000_pcrel_p ()) { const char *name = XSTR (XEXP (DECL_RTL (current_function_decl), 0), 0); /* All functions compiled to use PC-relative addressing will @@ -5479,7 +5479,18 @@ rs6000_expand_split_stack_prologue (void) gcc_assert (flag_split_stack && reload_completed); if (!info->push_p) - return; + { + /* We need the -fsplit-stack prologue for functions that make + tail calls. Tail calls don't count against crtl->is_leaf. + Note that we are called inside a sequence. get_insns will + just return that (as yet empty) sequence, so instead we + access the function rtl with get_topmost_sequence. */ + for (insn = get_topmost_sequence ()->first; insn; insn = NEXT_INSN (insn)) + if (CALL_P (insn)) + break; + if (!insn) + return; + } if (global_regs[29]) { diff --git a/gcc/config/rs6000/rs6000-p8swap.c b/gcc/config/rs6000/rs6000-p8swap.c index 3d5dc7d..fff1b08 100644 --- a/gcc/config/rs6000/rs6000-p8swap.c +++ b/gcc/config/rs6000/rs6000-p8swap.c @@ -2095,11 +2095,15 @@ alignment_mask (rtx_insn *insn) return alignment_with_canonical_addr (SET_SRC (body)); } -/* Given INSN that's a load or store based at BASE_REG, look for a - feeding computation that aligns its address on a 16-byte boundary. - Return the rtx and its containing AND_INSN. */ -static rtx -find_alignment_op (rtx_insn *insn, rtx base_reg, rtx_insn **and_insn) +/* Given INSN that's a load or store based at BASE_REG, check if + all of its feeding computations align its address on a 16-byte + boundary. If so, return true and add all definition insns into + AND_INSNS and their corresponding fully-expanded rtxes for the + masking operations into AND_OPS. */ + +static bool +find_alignment_op (rtx_insn *insn, rtx base_reg, vec<rtx_insn *> *and_insns, + vec<rtx> *and_ops) { df_ref base_use; struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn); @@ -2111,19 +2115,28 @@ find_alignment_op (rtx_insn *insn, rtx base_reg, rtx_insn **and_insn) continue; struct df_link *base_def_link = DF_REF_CHAIN (base_use); - if (!base_def_link || base_def_link->next) - break; + if (!base_def_link) + return false; - /* With stack-protector code enabled, and possibly in other - circumstances, there may not be an associated insn for - the def. */ - if (DF_REF_IS_ARTIFICIAL (base_def_link->ref)) - break; + while (base_def_link) + { + /* With stack-protector code enabled, and possibly in other + circumstances, there may not be an associated insn for + the def. */ + if (DF_REF_IS_ARTIFICIAL (base_def_link->ref)) + return false; - *and_insn = DF_REF_INSN (base_def_link->ref); - and_operation = alignment_mask (*and_insn); - if (and_operation != 0) - break; + rtx_insn *and_insn = DF_REF_INSN (base_def_link->ref); + and_operation = alignment_mask (and_insn); + + /* Stop if we find any one which doesn't align. 
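
/* Editorial sketch, not part of the patch: the reworked find_alignment_op
   above now accepts a base register with several reaching definitions,
   provided every one of them masks the address to a 16-byte boundary.
   Roughly the shape of code this covers (names are illustrative; assumes
   <altivec.h> and a VSX target):  */
#include <altivec.h>

vector unsigned char
load_from_masked (unsigned char *p, unsigned char *q, int pick)
{
  /* Two definitions of ADDR reach the load below, and both are AND
     operations with a -16 mask.  */
  unsigned long addr;
  if (pick)
    addr = (unsigned long) p & ~15UL;
  else
    addr = (unsigned long) q & ~15UL;
  return vec_vsx_ld (0, (const unsigned char *) addr);
}
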
*/ + if (!and_operation) + return false; + + and_insns->safe_push (and_insn); + and_ops->safe_push (and_operation); + base_def_link = base_def_link->next; + } } return and_operation; @@ -2143,11 +2156,14 @@ recombine_lvx_pattern (rtx_insn *insn, del_info *to_delete) rtx mem = XEXP (SET_SRC (body), 0); rtx base_reg = XEXP (mem, 0); - rtx_insn *and_insn; - rtx and_operation = find_alignment_op (insn, base_reg, &and_insn); + auto_vec<rtx_insn *> and_insns; + auto_vec<rtx> and_ops; + bool is_any_def_and + = find_alignment_op (insn, base_reg, &and_insns, &and_ops); - if (and_operation != 0) + if (is_any_def_and) { + gcc_assert (and_insns.length () == and_ops.length ()); df_ref def; struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn); FOR_EACH_INSN_INFO_DEF (def, insn_info) @@ -2168,25 +2184,35 @@ recombine_lvx_pattern (rtx_insn *insn, del_info *to_delete) to_delete[INSN_UID (swap_insn)].replace = true; to_delete[INSN_UID (swap_insn)].replace_insn = swap_insn; - /* However, first we must be sure that we make the - base register from the AND operation available - in case the register has been overwritten. Copy - the base register to a new pseudo and use that - as the base register of the AND operation in - the new LVX instruction. */ - rtx and_base = XEXP (and_operation, 0); - rtx new_reg = gen_reg_rtx (GET_MODE (and_base)); - rtx copy = gen_rtx_SET (new_reg, and_base); - rtx_insn *new_insn = emit_insn_after (copy, and_insn); - set_block_for_insn (new_insn, BLOCK_FOR_INSN (and_insn)); - df_insn_rescan (new_insn); - - XEXP (mem, 0) = gen_rtx_AND (GET_MODE (and_base), new_reg, - XEXP (and_operation, 1)); + rtx new_reg = 0; + rtx and_mask = 0; + for (unsigned i = 0; i < and_insns.length (); i++) + { + /* However, first we must be sure that we make the + base register from the AND operation available + in case the register has been overwritten. Copy + the base register to a new pseudo and use that + as the base register of the AND operation in + the new LVX instruction. */ + rtx_insn *and_insn = and_insns[i]; + rtx and_op = and_ops[i]; + rtx and_base = XEXP (and_op, 0); + if (!new_reg) + { + new_reg = gen_reg_rtx (GET_MODE (and_base)); + and_mask = XEXP (and_op, 1); + } + rtx copy = gen_rtx_SET (new_reg, and_base); + rtx_insn *new_insn = emit_insn_after (copy, and_insn); + set_block_for_insn (new_insn, BLOCK_FOR_INSN (and_insn)); + df_insn_rescan (new_insn); + } + + XEXP (mem, 0) = gen_rtx_AND (GET_MODE (new_reg), new_reg, and_mask); SET_SRC (body) = mem; INSN_CODE (insn) = -1; /* Force re-recognition. 
*/ df_insn_rescan (insn); - + if (dump_file) fprintf (dump_file, "lvx opportunity found at %d\n", INSN_UID (insn)); @@ -2205,11 +2231,14 @@ recombine_stvx_pattern (rtx_insn *insn, del_info *to_delete) rtx mem = SET_DEST (body); rtx base_reg = XEXP (mem, 0); - rtx_insn *and_insn; - rtx and_operation = find_alignment_op (insn, base_reg, &and_insn); + auto_vec<rtx_insn *> and_insns; + auto_vec<rtx> and_ops; + bool is_any_def_and + = find_alignment_op (insn, base_reg, &and_insns, &and_ops); - if (and_operation != 0) + if (is_any_def_and) { + gcc_assert (and_insns.length () == and_ops.length ()); rtx src_reg = XEXP (SET_SRC (body), 0); df_ref src_use; struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn); @@ -2234,25 +2263,35 @@ recombine_stvx_pattern (rtx_insn *insn, del_info *to_delete) to_delete[INSN_UID (swap_insn)].replace = true; to_delete[INSN_UID (swap_insn)].replace_insn = swap_insn; - /* However, first we must be sure that we make the - base register from the AND operation available - in case the register has been overwritten. Copy - the base register to a new pseudo and use that - as the base register of the AND operation in - the new STVX instruction. */ - rtx and_base = XEXP (and_operation, 0); - rtx new_reg = gen_reg_rtx (GET_MODE (and_base)); - rtx copy = gen_rtx_SET (new_reg, and_base); - rtx_insn *new_insn = emit_insn_after (copy, and_insn); - set_block_for_insn (new_insn, BLOCK_FOR_INSN (and_insn)); - df_insn_rescan (new_insn); - - XEXP (mem, 0) = gen_rtx_AND (GET_MODE (and_base), new_reg, - XEXP (and_operation, 1)); + rtx new_reg = 0; + rtx and_mask = 0; + for (unsigned i = 0; i < and_insns.length (); i++) + { + /* However, first we must be sure that we make the + base register from the AND operation available + in case the register has been overwritten. Copy + the base register to a new pseudo and use that + as the base register of the AND operation in + the new STVX instruction. */ + rtx_insn *and_insn = and_insns[i]; + rtx and_op = and_ops[i]; + rtx and_base = XEXP (and_op, 0); + if (!new_reg) + { + new_reg = gen_reg_rtx (GET_MODE (and_base)); + and_mask = XEXP (and_op, 1); + } + rtx copy = gen_rtx_SET (new_reg, and_base); + rtx_insn *new_insn = emit_insn_after (copy, and_insn); + set_block_for_insn (new_insn, BLOCK_FOR_INSN (and_insn)); + df_insn_rescan (new_insn); + } + + XEXP (mem, 0) = gen_rtx_AND (GET_MODE (new_reg), new_reg, and_mask); SET_SRC (body) = src_reg; INSN_CODE (insn) = -1; /* Force re-recognition. 
*/ df_insn_rescan (insn); - + if (dump_file) fprintf (dump_file, "stvx opportunity found at %d\n", INSN_UID (insn)); diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h index 5508484..25fa5dd 100644 --- a/gcc/config/rs6000/rs6000-protos.h +++ b/gcc/config/rs6000/rs6000-protos.h @@ -119,8 +119,8 @@ extern char * output_cbranch (rtx, const char *, int, rtx_insn *); extern const char * output_probe_stack_range (rtx, rtx, rtx); extern void rs6000_emit_dot_insn (rtx dst, rtx src, int dot, rtx ccreg); extern bool rs6000_emit_set_const (rtx, rtx); -extern int rs6000_emit_cmove (rtx, rtx, rtx, rtx); -extern int rs6000_emit_int_cmove (rtx, rtx, rtx, rtx); +extern bool rs6000_emit_cmove (rtx, rtx, rtx, rtx); +extern bool rs6000_emit_int_cmove (rtx, rtx, rtx, rtx); extern int rs6000_emit_vector_cond_expr (rtx, rtx, rtx, rtx, rtx, rtx); extern void rs6000_emit_minmax (rtx, enum rtx_code, rtx, rtx); extern void rs6000_expand_atomic_compare_and_swap (rtx op[]); @@ -152,7 +152,8 @@ extern rtx rs6000_machopic_legitimize_pic_address (rtx, machine_mode, extern rtx rs6000_allocate_stack_temp (machine_mode, bool, bool); extern align_flags rs6000_loop_align (rtx); extern void rs6000_split_logical (rtx [], enum rtx_code, bool, bool, bool); -extern bool rs6000_pcrel_p (struct function *); +extern bool rs6000_function_pcrel_p (struct function *); +extern bool rs6000_pcrel_p (void); extern bool rs6000_fndecl_pcrel_p (const_tree); /* Different PowerPC instruction formats that are used by GCC. There are @@ -274,6 +275,7 @@ extern void rs6000_asm_output_dwarf_pcrel (FILE *file, int size, const char *label); extern void rs6000_asm_output_dwarf_datarel (FILE *file, int size, const char *label); +extern long long rs6000_const_f32_to_i32 (rtx operand); /* Declare functions in rs6000-c.c */ @@ -309,4 +311,5 @@ extern bool rs6000_quadword_masked_address_p (const_rtx exp); extern rtx rs6000_gen_lvx (enum machine_mode, rtx, rtx); extern rtx rs6000_gen_stvx (enum machine_mode, rtx, rtx); +extern void rs6000_emit_xxspltidp_v2df (rtx, long value); #endif /* rs6000-protos.h */ diff --git a/gcc/config/rs6000/rs6000-string.c b/gcc/config/rs6000/rs6000-string.c index c35d931..82cc24e 100644 --- a/gcc/config/rs6000/rs6000-string.c +++ b/gcc/config/rs6000/rs6000-string.c @@ -2708,6 +2708,32 @@ gen_lvx_v4si_move (rtx dest, rtx src) return gen_altivec_lvx_v4si_internal (dest, src); } +static rtx +gen_lxvl_stxvl_move (rtx dest, rtx src, int length) +{ + gcc_assert (MEM_P (dest) ^ MEM_P (src)); + gcc_assert (GET_MODE (dest) == V16QImode && GET_MODE (src) == V16QImode); + gcc_assert (length <= 16); + + bool is_store = MEM_P (dest); + rtx addr; + + /* If the address form is not a simple register, make it so. */ + if (is_store) + addr = XEXP (dest, 0); + else + addr = XEXP (src, 0); + + if (!REG_P (addr)) + addr = force_reg (Pmode, addr); + + rtx len = force_reg (DImode, gen_int_mode (length, DImode)); + if (is_store) + return gen_stxvl (src, addr, len); + else + return gen_lxvl (dest, addr, len); +} + /* Expand a block move operation, and return 1 if successful. Return 0 if we should let the compiler generate normal code. 
@@ -2750,18 +2776,56 @@ expand_block_move (rtx operands[], bool might_overlap) if (bytes > rs6000_block_move_inline_limit) return 0; + int orig_bytes = bytes; for (offset = 0; bytes > 0; offset += move_bytes, bytes -= move_bytes) { union { - rtx (*movmemsi) (rtx, rtx, rtx, rtx); rtx (*mov) (rtx, rtx); + rtx (*movlen) (rtx, rtx, int); } gen_func; machine_mode mode = BLKmode; rtx src, dest; - - /* Altivec first, since it will be faster than a string move - when it applies, and usually not significantly larger. */ - if (TARGET_ALTIVEC && bytes >= 16 && align >= 128) + bool move_with_length = false; + + /* Use POImode for paired vsx load/store. Use V2DI for single + unaligned vsx load/store, for consistency with what other + expansions (compare) already do, and so we can use lxvd2x on + p8. Order is VSX pair unaligned, VSX unaligned, Altivec, VSX + with length < 16 (if allowed), then gpr load/store. */ + + if (TARGET_MMA && TARGET_BLOCK_OPS_UNALIGNED_VSX + && TARGET_BLOCK_OPS_VECTOR_PAIR + && bytes >= 32 + && (align >= 256 || !STRICT_ALIGNMENT)) + { + move_bytes = 32; + mode = POImode; + gen_func.mov = gen_movpoi; + } + else if (TARGET_POWERPC64 && TARGET_BLOCK_OPS_UNALIGNED_VSX + && VECTOR_MEM_VSX_P (V2DImode) + && bytes >= 16 && (align >= 128 || !STRICT_ALIGNMENT)) + { + move_bytes = 16; + mode = V2DImode; + gen_func.mov = gen_vsx_movv2di_64bit; + } + else if (TARGET_BLOCK_OPS_UNALIGNED_VSX + && TARGET_POWER10 && bytes < 16 + && orig_bytes > 16 + && !(bytes == 1 || bytes == 2 + || bytes == 4 || bytes == 8) + && (align >= 128 || !STRICT_ALIGNMENT)) + { + /* Only use lxvl/stxvl if it could replace multiple ordinary + loads+stores. Also don't use it unless we likely already + did one vsx copy so we aren't mixing gpr and vsx. */ + move_bytes = bytes; + mode = V16QImode; + gen_func.movlen = gen_lxvl_stxvl_move; + move_with_length = true; + } + else if (TARGET_ALTIVEC && bytes >= 16 && align >= 128) { move_bytes = 16; mode = V4SImode; @@ -2818,7 +2882,16 @@ expand_block_move (rtx operands[], bool might_overlap) gen_func.mov = gen_movqi; } - /* Mode is always set to something other than BLKmode by one of the + /* If we can't succeed in doing the move in one pass, we can't + do it in the might_overlap case. Bail out and return + failure. We test num_reg + 1 >= MAX_MOVE_REG here to check + the same condition as the test of num_reg >= MAX_MOVE_REG + that is done below after the increment of num_reg. */ + if (might_overlap && num_reg + 1 >= MAX_MOVE_REG + && bytes > move_bytes) + return 0; + + /* Mode is always set to something other than BLKmode by one of the cases of the if statement above. */ gcc_assert (mode != BLKmode); @@ -2826,15 +2899,17 @@ expand_block_move (rtx operands[], bool might_overlap) dest = adjust_address (orig_dest, mode, offset); rtx tmp_reg = gen_reg_rtx (mode); - - loads[num_reg] = (*gen_func.mov) (tmp_reg, src); - stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg); - /* If we didn't succeed in doing it in one pass, we can't do it in the - might_overlap case. Bail out and return failure. */ - if (might_overlap && num_reg >= MAX_MOVE_REG - && bytes > move_bytes) - return 0; + if (move_with_length) + { + loads[num_reg] = (*gen_func.movlen) (tmp_reg, src, move_bytes); + stores[num_reg++] = (*gen_func.movlen) (dest, tmp_reg, move_bytes); + } + else + { + loads[num_reg] = (*gen_func.mov) (tmp_reg, src); + stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg); + } /* Emit loads and stores saved up. 
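
/* Editorial sketch, not part of the patch: expand_block_move above is also
   used for memmove expansion with MIGHT_OVERLAP set, in which case all
   loads must be emitted before any store; the early bail-out added above
   therefore falls back to the library call once more than MAX_MOVE_REG
   pieces would be needed.  A small overlapping move like this one can
   still be expanded inline (size illustrative).  */
#include <string.h>

void
slide_down (char *buf)
{
  memmove (buf, buf + 3, 24);	/* two pieces: 16-byte + 8-byte */
}
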
*/ if (num_reg >= MAX_MOVE_REG || bytes == move_bytes) diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index b42f0c5..b58eeae 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -59,11 +59,12 @@ #include "gimplify.h" #include "gimple-fold.h" #include "gimple-iterator.h" -#include "gimple-ssa.h" #include "gimple-walk.h" +#include "ssa.h" +#include "tree-vectorizer.h" +#include "tree-ssa-propagate.h" #include "intl.h" #include "tm-constrs.h" -#include "tree-vectorizer.h" #include "target-globals.h" #include "builtins.h" #include "tree-vector-builder.h" @@ -75,9 +76,6 @@ #endif #include "case-cfn-macros.h" #include "ppc-auxv.h" -#include "tree-ssa-propagate.h" -#include "tree-vrp.h" -#include "tree-ssanames.h" #include "rs6000-internal.h" #include "opts.h" @@ -1493,6 +1491,9 @@ static const struct attribute_spec rs6000_attribute_table[] = #undef TARGET_PROMOTE_FUNCTION_MODE #define TARGET_PROMOTE_FUNCTION_MODE rs6000_promote_function_mode +#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE +#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE rs6000_override_options_after_change + #undef TARGET_RETURN_IN_MEMORY #define TARGET_RETURN_IN_MEMORY rs6000_return_in_memory @@ -2390,7 +2391,7 @@ rs6000_debug_reg_global (void) else fprintf (stderr, DEBUG_FMT_S, "tune", "<none>"); - cl_target_option_save (&cl_opts, &global_options); + cl_target_option_save (&cl_opts, &global_options, &global_options_set); rs6000_print_isa_options (stderr, 0, "rs6000_isa_flags", rs6000_isa_flags); @@ -3420,6 +3421,124 @@ rs6000_md_asm_adjust (vec<rtx> &/*outputs*/, vec<rtx> &/*inputs*/, return NULL; } +/* This target function is similar to the hook TARGET_OPTION_OVERRIDE + but is called when the optimize level is changed via an attribute or + pragma or when it is reset at the end of the code affected by the + attribute or pragma. It is not called at the beginning of compilation + when TARGET_OPTION_OVERRIDE is called so if you want to perform these + actions then, you should have TARGET_OPTION_OVERRIDE call + TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE. */ + +static void +rs6000_override_options_after_change (void) +{ + /* Explicit -funroll-loops turns -munroll-only-small-loops off, and + turns -frename-registers on. 
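
/* Editorial sketch, not part of the patch: rs6000_override_options_after_change
   (this function) is also invoked when the optimization options change via
   an attribute or pragma, so the unrolling-related flags it adjusts stay
   consistent for a single function like this one (attribute and function
   are illustrative).  */
__attribute__ ((optimize ("unroll-loops")))
void
scale_by_two (float *a, int n)
{
  for (int i = 0; i < n; i++)
    a[i] *= 2.0f;
}
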
*/ + if ((global_options_set.x_flag_unroll_loops && flag_unroll_loops) + || (global_options_set.x_flag_unroll_all_loops + && flag_unroll_all_loops)) + { + if (!global_options_set.x_unroll_only_small_loops) + unroll_only_small_loops = 0; + if (!global_options_set.x_flag_rename_registers) + flag_rename_registers = 1; + if (!global_options_set.x_flag_cunroll_grow_size) + flag_cunroll_grow_size = 1; + } + else if (!global_options_set.x_flag_cunroll_grow_size) + flag_cunroll_grow_size = flag_peel_loops || optimize >= 3; +} + +#ifdef TARGET_USES_LINUX64_OPT +static void +rs6000_linux64_override_options () +{ + if (!global_options_set.x_rs6000_alignment_flags) + rs6000_alignment_flags = MASK_ALIGN_NATURAL; + if (rs6000_isa_flags & OPTION_MASK_64BIT) + { + if (DEFAULT_ABI != ABI_AIX) + { + rs6000_current_abi = ABI_AIX; + error (INVALID_64BIT, "call"); + } + dot_symbols = !strcmp (rs6000_abi_name, "aixdesc"); + if (ELFv2_ABI_CHECK) + { + rs6000_current_abi = ABI_ELFv2; + if (dot_symbols) + error ("%<-mcall-aixdesc%> incompatible with %<-mabi=elfv2%>"); + } + if (rs6000_isa_flags & OPTION_MASK_RELOCATABLE) + { + rs6000_isa_flags &= ~OPTION_MASK_RELOCATABLE; + error (INVALID_64BIT, "relocatable"); + } + if (rs6000_isa_flags & OPTION_MASK_EABI) + { + rs6000_isa_flags &= ~OPTION_MASK_EABI; + error (INVALID_64BIT, "eabi"); + } + if (TARGET_PROTOTYPE) + { + target_prototype = 0; + error (INVALID_64BIT, "prototype"); + } + if ((rs6000_isa_flags & OPTION_MASK_POWERPC64) == 0) + { + rs6000_isa_flags |= OPTION_MASK_POWERPC64; + error ("%<-m64%> requires a PowerPC64 cpu"); + } + if (!global_options_set.x_rs6000_current_cmodel) + SET_CMODEL (CMODEL_MEDIUM); + if ((rs6000_isa_flags_explicit & OPTION_MASK_MINIMAL_TOC) != 0) + { + if (global_options_set.x_rs6000_current_cmodel + && rs6000_current_cmodel != CMODEL_SMALL) + error ("%<-mcmodel incompatible with other toc options%>"); + if (TARGET_MINIMAL_TOC) + SET_CMODEL (CMODEL_SMALL); + else if (TARGET_PCREL + || (PCREL_SUPPORTED_BY_OS + && (rs6000_isa_flags_explicit & OPTION_MASK_PCREL) == 0)) + /* Ignore -mno-minimal-toc. */ + ; + else + SET_CMODEL (CMODEL_SMALL); + } + if (rs6000_current_cmodel != CMODEL_SMALL) + { + if (!global_options_set.x_TARGET_NO_FP_IN_TOC) + TARGET_NO_FP_IN_TOC = rs6000_current_cmodel == CMODEL_MEDIUM; + if (!global_options_set.x_TARGET_NO_SUM_IN_TOC) + TARGET_NO_SUM_IN_TOC = 0; + } + if (TARGET_PLTSEQ && DEFAULT_ABI != ABI_ELFv2) + { + if (global_options_set.x_rs6000_pltseq) + warning (0, "%qs unsupported for this ABI", + "-mpltseq"); + rs6000_pltseq = false; + } + } + else if (TARGET_64BIT) + error (INVALID_32BIT, "32"); + else + { + if (TARGET_PROFILE_KERNEL) + { + profile_kernel = 0; + error (INVALID_32BIT, "profile-kernel"); + } + if (global_options_set.x_rs6000_current_cmodel) + { + SET_CMODEL (CMODEL_SMALL); + error (INVALID_32BIT, "cmodel"); + } + } +} +#endif + /* Override command line options. Combine build-specific configuration information with options @@ -3979,6 +4098,23 @@ rs6000_option_override_internal (bool global_init_p) } } + if (!(rs6000_isa_flags_explicit & OPTION_MASK_BLOCK_OPS_UNALIGNED_VSX)) + { + if (TARGET_EFFICIENT_UNALIGNED_VSX) + rs6000_isa_flags |= OPTION_MASK_BLOCK_OPS_UNALIGNED_VSX; + else + rs6000_isa_flags &= ~OPTION_MASK_BLOCK_OPS_UNALIGNED_VSX; + } + + if (!(rs6000_isa_flags_explicit & OPTION_MASK_BLOCK_OPS_VECTOR_PAIR)) + { + /* When the POImode issues of PR96791 are resolved, then we can + once again enable use of vector pair for memcpy/memmove on + P10 if we have TARGET_MMA. 
For now we make it disabled by + default for all targets. */ + rs6000_isa_flags &= ~OPTION_MASK_BLOCK_OPS_VECTOR_PAIR; + } + /* Use long double size to select the appropriate long double. We use TYPE_PRECISION to differentiate the 3 different long double types. We map 128 into the precision used for TFmode. */ @@ -4629,28 +4765,16 @@ rs6000_option_override_internal (bool global_init_p) SET_OPTION_IF_UNSET (&global_options, &global_options_set, param_max_completely_peeled_insns, 400); + /* Temporarily disable it for now since lxvl/stxvl on the default + supported hardware Power9 has unexpected performance behaviors. */ + SET_OPTION_IF_UNSET (&global_options, &global_options_set, + param_vect_partial_vector_usage, 0); + /* Use the 'model' -fsched-pressure algorithm by default. */ SET_OPTION_IF_UNSET (&global_options, &global_options_set, param_sched_pressure_algorithm, SCHED_PRESSURE_MODEL); - /* Explicit -funroll-loops turns -munroll-only-small-loops off, and - turns -frename-registers on. */ - if ((global_options_set.x_flag_unroll_loops && flag_unroll_loops) - || (global_options_set.x_flag_unroll_all_loops - && flag_unroll_all_loops)) - { - if (!global_options_set.x_unroll_only_small_loops) - unroll_only_small_loops = 0; - if (!global_options_set.x_flag_rename_registers) - flag_rename_registers = 1; - if (!global_options_set.x_flag_cunroll_grow_size) - flag_cunroll_grow_size = 1; - } - else - if (!global_options_set.x_flag_cunroll_grow_size) - flag_cunroll_grow_size = flag_peel_loops || optimize >= 3; - /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0) can be optimized to ap = __builtin_next_arg (0). */ @@ -4658,6 +4782,8 @@ rs6000_option_override_internal (bool global_init_p) targetm.expand_builtin_va_start = NULL; } + rs6000_override_options_after_change (); + /* If not explicitly specified via option, decide whether to generate indexed load/store instructions. A value of -1 indicates that the initial value of this variable has not been overwritten. During @@ -4731,7 +4857,7 @@ rs6000_option_override_internal (bool global_init_p) /* Save the initial options in case the user does function specific options */ if (global_init_p) target_option_default_node = target_option_current_node - = build_target_option_node (&global_options); + = build_target_option_node (&global_options, &global_options_set); /* If not explicitly specified via option, decide whether to generate the extra blr's required to preserve the link stack on some cpus (eg, 476). */ @@ -5115,8 +5241,8 @@ rs6000_init_cost (struct loop *loop_info) compare + branch or compare + isel instructions. */ static unsigned -adjust_vectorization_cost (enum vect_cost_for_stmt kind, - struct _stmt_vec_info *stmt_info) +rs6000_adjust_vect_cost_per_stmt (enum vect_cost_for_stmt kind, + struct _stmt_vec_info *stmt_info) { if (kind == scalar_stmt && stmt_info && stmt_info->stmt && gimple_code (stmt_info->stmt) == GIMPLE_ASSIGN) @@ -5144,7 +5270,7 @@ rs6000_add_stmt_cost (class vec_info *vinfo, void *data, int count, { int stmt_cost = rs6000_builtin_vectorization_cost (kind, vectype, misalign); - stmt_cost += adjust_vectorization_cost (kind, stmt_info); + stmt_cost += rs6000_adjust_vect_cost_per_stmt (kind, stmt_info); /* Statements in an inner loop relative to the loop being vectorized are weighted more heavily. The value here is arbitrary and could potentially be improved with analysis. 
*/ @@ -5168,6 +5294,34 @@ rs6000_add_stmt_cost (class vec_info *vinfo, void *data, int count, return retval; } +/* For some target specific vectorization cost which can't be handled per stmt, + we check the requisite conditions and adjust the vectorization cost + accordingly if satisfied. One typical example is to model shift cost for + vector with length by counting number of required lengths under condition + LOOP_VINFO_FULLY_WITH_LENGTH_P. */ + +static void +rs6000_adjust_vect_cost_per_loop (rs6000_cost_data *data) +{ + struct loop *loop = data->loop_info; + gcc_assert (loop); + loop_vec_info loop_vinfo = loop_vec_info_for_loop (loop); + + if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)) + { + rgroup_controls *rgc; + unsigned int num_vectors_m1; + unsigned int shift_cnt = 0; + FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc) + if (rgc->type) + /* Each length needs one shift to fill into bits 0-7. */ + shift_cnt += num_vectors_m1 + 1; + + rs6000_add_stmt_cost (loop_vinfo, (void *) data, shift_cnt, scalar_stmt, + NULL, NULL_TREE, 0, vect_body); + } +} + /* Implement targetm.vectorize.finish_cost. */ static void @@ -5177,7 +5331,10 @@ rs6000_finish_cost (void *data, unsigned *prologue_cost, rs6000_cost_data *cost_data = (rs6000_cost_data*) data; if (cost_data->loop_info) - rs6000_density_test (cost_data); + { + rs6000_adjust_vect_cost_per_loop (cost_data); + rs6000_density_test (cost_data); + } /* Don't vectorize minimum-vectorization-factor, simple copy loops that require versioning for any reason. The vectorization is at @@ -5212,13 +5369,14 @@ rs6000_loop_unroll_adjust (unsigned nunroll, struct loop *loop) { if (unroll_only_small_loops) { - /* TODO: This is hardcoded to 10 right now. It can be refined, for - example we may want to unroll very small loops more times (4 perhaps). - We also should use a PARAM for this. */ + /* TODO: These are hardcoded values right now. We probably should use + a PARAM here. */ + if (loop->ninsns <= 6) + return MIN (4, nunroll); if (loop->ninsns <= 10) return MIN (2, nunroll); - else - return 0; + + return 0; } return nunroll; @@ -5661,7 +5819,7 @@ direct_return (void) /* Helper for num_insns_constant. Calculate number of instructions to load VALUE to a single gpr using combinations of addi, addis, ori, - oris and sldi instructions. */ + oris, sldi and rldimi instructions. */ static int num_insns_constant_gpr (HOST_WIDE_INT value) @@ -5689,7 +5847,7 @@ num_insns_constant_gpr (HOST_WIDE_INT value) high >>= 1; - if (low == 0) + if (low == 0 || low == high) return num_insns_constant_gpr (high) + 1; else if (high == 0) return num_insns_constant_gpr (low) + 1; @@ -6498,29 +6656,48 @@ rs6000_expand_vector_init (rtx target, rtx vals) } else { - rtx dbl_even = gen_reg_rtx (V2DFmode); - rtx dbl_odd = gen_reg_rtx (V2DFmode); - rtx flt_even = gen_reg_rtx (V4SFmode); - rtx flt_odd = gen_reg_rtx (V4SFmode); - rtx op0 = force_reg (SFmode, XVECEXP (vals, 0, 0)); - rtx op1 = force_reg (SFmode, XVECEXP (vals, 0, 1)); - rtx op2 = force_reg (SFmode, XVECEXP (vals, 0, 2)); - rtx op3 = force_reg (SFmode, XVECEXP (vals, 0, 3)); - - /* Use VMRGEW if we can instead of doing a permute. 
*/ - if (TARGET_P8_VECTOR) + if (TARGET_P8_VECTOR && TARGET_POWERPC64) { - emit_insn (gen_vsx_concat_v2sf (dbl_even, op0, op2)); - emit_insn (gen_vsx_concat_v2sf (dbl_odd, op1, op3)); - emit_insn (gen_vsx_xvcvdpsp (flt_even, dbl_even)); - emit_insn (gen_vsx_xvcvdpsp (flt_odd, dbl_odd)); - if (BYTES_BIG_ENDIAN) - emit_insn (gen_p8_vmrgew_v4sf_direct (target, flt_even, flt_odd)); - else - emit_insn (gen_p8_vmrgew_v4sf_direct (target, flt_odd, flt_even)); + rtx tmp_sf[4]; + rtx tmp_si[4]; + rtx tmp_di[4]; + rtx mrg_di[4]; + for (i = 0; i < 4; i++) + { + tmp_si[i] = gen_reg_rtx (SImode); + tmp_di[i] = gen_reg_rtx (DImode); + mrg_di[i] = gen_reg_rtx (DImode); + tmp_sf[i] = force_reg (SFmode, XVECEXP (vals, 0, i)); + emit_insn (gen_movsi_from_sf (tmp_si[i], tmp_sf[i])); + emit_insn (gen_zero_extendsidi2 (tmp_di[i], tmp_si[i])); + } + + if (!BYTES_BIG_ENDIAN) + { + std::swap (tmp_di[0], tmp_di[1]); + std::swap (tmp_di[2], tmp_di[3]); + } + + emit_insn (gen_ashldi3 (mrg_di[0], tmp_di[0], GEN_INT (32))); + emit_insn (gen_iordi3 (mrg_di[1], mrg_di[0], tmp_di[1])); + emit_insn (gen_ashldi3 (mrg_di[2], tmp_di[2], GEN_INT (32))); + emit_insn (gen_iordi3 (mrg_di[3], mrg_di[2], tmp_di[3])); + + rtx tmp_v2di = gen_reg_rtx (V2DImode); + emit_insn (gen_vsx_concat_v2di (tmp_v2di, mrg_di[1], mrg_di[3])); + emit_move_insn (target, gen_lowpart (V4SFmode, tmp_v2di)); } else { + rtx dbl_even = gen_reg_rtx (V2DFmode); + rtx dbl_odd = gen_reg_rtx (V2DFmode); + rtx flt_even = gen_reg_rtx (V4SFmode); + rtx flt_odd = gen_reg_rtx (V4SFmode); + rtx op0 = force_reg (SFmode, XVECEXP (vals, 0, 0)); + rtx op1 = force_reg (SFmode, XVECEXP (vals, 0, 1)); + rtx op2 = force_reg (SFmode, XVECEXP (vals, 0, 2)); + rtx op3 = force_reg (SFmode, XVECEXP (vals, 0, 3)); + emit_insn (gen_vsx_concat_v2sf (dbl_even, op0, op1)); emit_insn (gen_vsx_concat_v2sf (dbl_odd, op2, op3)); emit_insn (gen_vsx_xvcvdpsp (flt_even, dbl_even)); @@ -8275,7 +8452,7 @@ rs6000_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED, low_int = 0; high_int = INTVAL (XEXP (x, 1)) - low_int; sum = force_operand (gen_rtx_PLUS (Pmode, XEXP (x, 0), - GEN_INT (high_int)), 0); + gen_int_mode (high_int, Pmode)), 0); return plus_constant (Pmode, sum, low_int); } else if (GET_CODE (x) == PLUS @@ -8691,7 +8868,7 @@ rs6000_legitimize_tls_address (rtx addr, enum tls_model model) dest = gen_reg_rtx (Pmode); if (model == TLS_MODEL_LOCAL_EXEC - && (rs6000_tls_size == 16 || rs6000_pcrel_p (cfun))) + && (rs6000_tls_size == 16 || rs6000_pcrel_p ())) { rtx tlsreg; @@ -8738,7 +8915,7 @@ rs6000_legitimize_tls_address (rtx addr, enum tls_model model) them in the .got section. So use a pointer to the .got section, not one to secondary TOC sections used by 64-bit -mminimal-toc, or to secondary GOT sections used by 32-bit -fPIC. 
*/ - if (rs6000_pcrel_p (cfun)) + if (rs6000_pcrel_p ()) got = const0_rtx; else if (TARGET_64BIT) got = gen_rtx_REG (Pmode, 2); @@ -8803,7 +8980,7 @@ rs6000_legitimize_tls_address (rtx addr, enum tls_model model) rtx uns = gen_rtx_UNSPEC (Pmode, vec, UNSPEC_TLS_GET_ADDR); set_unique_reg_note (get_last_insn (), REG_EQUAL, uns); - if (rs6000_tls_size == 16 || rs6000_pcrel_p (cfun)) + if (rs6000_tls_size == 16 || rs6000_pcrel_p ()) { if (TARGET_64BIT) insn = gen_tls_dtprel_64 (dest, tmp1, addr); @@ -8844,7 +9021,7 @@ rs6000_legitimize_tls_address (rtx addr, enum tls_model model) else insn = gen_tls_got_tprel_32 (tmp2, got, addr); emit_insn (insn); - if (rs6000_pcrel_p (cfun)) + if (rs6000_pcrel_p ()) { if (TARGET_64BIT) insn = gen_tls_tls_pcrel_64 (dest, tmp2, addr); @@ -13651,7 +13828,7 @@ rs6000_call_template_1 (rtx *operands, unsigned int funop, bool sibcall) ? "+32768" : "")); static char str[32]; /* 1 spare */ - if (rs6000_pcrel_p (cfun)) + if (rs6000_pcrel_p ()) sprintf (str, "b%s %s@notoc%s", sibcall ? "" : "l", z, arg); else if (DEFAULT_ABI == ABI_AIX || DEFAULT_ABI == ABI_ELFv2) sprintf (str, "b%s %s%s%s", sibcall ? "" : "l", z, arg, @@ -13791,7 +13968,7 @@ rs6000_indirect_call_template_1 (rtx *operands, unsigned int funop, rel64); } - const char *notoc = rs6000_pcrel_p (cfun) ? "_NOTOC" : ""; + const char *notoc = rs6000_pcrel_p () ? "_NOTOC" : ""; const char *addend = (DEFAULT_ABI == ABI_V4 && TARGET_SECURE_PLT && flag_pic == 2 ? "+32768" : ""); if (!speculate) @@ -13808,7 +13985,7 @@ rs6000_indirect_call_template_1 (rtx *operands, unsigned int funop, else if (!speculate) s += sprintf (s, "crset 2\n\t"); - if (rs6000_pcrel_p (cfun)) + if (rs6000_pcrel_p ()) { if (speculate) sprintf (s, "b%%T%ul", funop); @@ -14968,13 +15145,33 @@ rs6000_emit_vector_cond_expr (rtx dest, rtx op_true, rtx op_false, return 1; } -/* ISA 3.0 (power9) minmax subcase to emit a XSMAXCDP or XSMINCDP instruction - for SF/DF scalars. Move TRUE_COND to DEST if OP of the operands of the last - comparison is nonzero/true, FALSE_COND if it is zero/false. Return 0 if the - hardware has no such operation. */ +/* Possibly emit the xsmaxcdp and xsmincdp instructions to emit a maximum or + minimum with "C" semantics. -static int -rs6000_emit_p9_fp_minmax (rtx dest, rtx op, rtx true_cond, rtx false_cond) + Unless you use -ffast-math, you can't use these instructions to replace + conditions that implicitly reverse the condition because the comparison + might generate a NaN or signed zero. + + I.e. the following can be replaced all of the time + ret = (op1 > op2) ? op1 : op2 ; generate xsmaxcdp + ret = (op1 >= op2) ? op1 : op2 ; generate xsmaxcdp + ret = (op1 < op2) ? op1 : op2; ; generate xsmincdp + ret = (op1 <= op2) ? op1 : op2; ; generate xsmincdp + + The following can be replaced only if -ffast-math is used: + ret = (op1 < op2) ? op2 : op1 ; generate xsmaxcdp + ret = (op1 <= op2) ? op2 : op1 ; generate xsmaxcdp + ret = (op1 > op2) ? op2 : op1; ; generate xsmincdp + ret = (op1 >= op2) ? op2 : op1; ; generate xsmincdp + + Move TRUE_COND to DEST if OP of the operands of the last comparison is + nonzero/true, FALSE_COND if it is zero/false. + + Return false if we can't generate the appropriate minimum or maximum, and + true if we did emit the minimum or maximum. 
*/ + +static bool +rs6000_maybe_emit_maxc_minc (rtx dest, rtx op, rtx true_cond, rtx false_cond) { enum rtx_code code = GET_CODE (op); rtx op0 = XEXP (op, 0); @@ -14984,14 +15181,14 @@ rs6000_emit_p9_fp_minmax (rtx dest, rtx op, rtx true_cond, rtx false_cond) bool max_p = false; if (result_mode != compare_mode) - return 0; + return false; if (code == GE || code == GT) max_p = true; else if (code == LE || code == LT) max_p = false; else - return 0; + return false; if (rtx_equal_p (op0, true_cond) && rtx_equal_p (op1, false_cond)) ; @@ -15004,19 +15201,23 @@ rs6000_emit_p9_fp_minmax (rtx dest, rtx op, rtx true_cond, rtx false_cond) max_p = !max_p; else - return 0; + return false; rs6000_emit_minmax (dest, max_p ? SMAX : SMIN, op0, op1); - return 1; + return true; } -/* ISA 3.0 (power9) conditional move subcase to emit XSCMP{EQ,GE,GT,NE}DP and - XXSEL instructions for SF/DF scalars. Move TRUE_COND to DEST if OP of the - operands of the last comparison is nonzero/true, FALSE_COND if it is - zero/false. Return 0 if the hardware has no such operation. */ +/* Possibly emit a floating point conditional move by generating a compare that + sets a mask instruction and a XXSEL select instruction. -static int -rs6000_emit_p9_fp_cmove (rtx dest, rtx op, rtx true_cond, rtx false_cond) + Move TRUE_COND to DEST if OP of the operands of the last comparison is + nonzero/true, FALSE_COND if it is zero/false. + + Return false if the operation cannot be generated, and true if we could + generate the instruction. */ + +static bool +rs6000_maybe_emit_fp_cmove (rtx dest, rtx op, rtx true_cond, rtx false_cond) { enum rtx_code code = GET_CODE (op); rtx op0 = XEXP (op, 0); @@ -15044,7 +15245,7 @@ rs6000_emit_p9_fp_cmove (rtx dest, rtx op, rtx true_cond, rtx false_cond) break; default: - return 0; + return false; } /* Generate: [(parallel [(set (dest) @@ -15064,14 +15265,35 @@ rs6000_emit_p9_fp_cmove (rtx dest, rtx op, rtx true_cond, rtx false_cond) emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, cmove_rtx, clobber_rtx))); - return 1; + return true; +} + +/* Helper function to return true if the target has instructions to do a + compare and set mask instruction that can be used with XXSEL to implement a + conditional move. It is also assumed that such a target also supports the + "C" minimum and maximum instructions. */ + +static bool +have_compare_and_set_mask (machine_mode mode) +{ + switch (mode) + { + case E_SFmode: + case E_DFmode: + return TARGET_P9_MINMAX; + + default: + break; + } + + return false; } /* Emit a conditional move: move TRUE_COND to DEST if OP of the operands of the last comparison is nonzero/true, FALSE_COND if it is zero/false. Return 0 if the hardware has no such operation. */ -int +bool rs6000_emit_cmove (rtx dest, rtx op, rtx true_cond, rtx false_cond) { enum rtx_code code = GET_CODE (op); @@ -15087,28 +15309,28 @@ rs6000_emit_cmove (rtx dest, rtx op, rtx true_cond, rtx false_cond) /* In the isel case however, we can use a compare immediate, so op1 may be a small constant. */ && (!TARGET_ISEL || !short_cint_operand (op1, VOIDmode))) - return 0; + return false; if (GET_MODE (true_cond) != result_mode) - return 0; + return false; if (GET_MODE (false_cond) != result_mode) - return 0; + return false; - /* See if we can use the ISA 3.0 (power9) min/max/compare functions. 
*/ - if (TARGET_P9_MINMAX - && (compare_mode == SFmode || compare_mode == DFmode) - && (result_mode == SFmode || result_mode == DFmode)) + /* See if we can use the "C" minimum, "C" maximum, and compare and set mask + instructions. */ + if (have_compare_and_set_mask (compare_mode) + && have_compare_and_set_mask (result_mode)) { - if (rs6000_emit_p9_fp_minmax (dest, op, true_cond, false_cond)) - return 1; + if (rs6000_maybe_emit_maxc_minc (dest, op, true_cond, false_cond)) + return true; - if (rs6000_emit_p9_fp_cmove (dest, op, true_cond, false_cond)) - return 1; + if (rs6000_maybe_emit_fp_cmove (dest, op, true_cond, false_cond)) + return true; } /* Don't allow using floating point comparisons for integer results for now. */ if (FLOAT_MODE_P (compare_mode) && !FLOAT_MODE_P (result_mode)) - return 0; + return false; /* First, work out if the hardware can do this at all, or if it's too slow.... */ @@ -15116,7 +15338,7 @@ rs6000_emit_cmove (rtx dest, rtx op, rtx true_cond, rtx false_cond) { if (TARGET_ISEL) return rs6000_emit_int_cmove (dest, op, true_cond, false_cond); - return 0; + return false; } is_against_zero = op1 == CONST0_RTX (compare_mode); @@ -15128,7 +15350,7 @@ rs6000_emit_cmove (rtx dest, rtx op, rtx true_cond, rtx false_cond) generated. */ if (SCALAR_FLOAT_MODE_P (compare_mode) && flag_trapping_math && ! is_against_zero) - return 0; + return false; /* Eliminate half of the comparisons by switching operands, this makes the remaining code simpler. */ @@ -15144,7 +15366,7 @@ rs6000_emit_cmove (rtx dest, rtx op, rtx true_cond, rtx false_cond) /* UNEQ and LTGT take four instructions for a comparison with zero, it'll probably be faster to use a branch here too. */ if (code == UNEQ && HONOR_NANS (compare_mode)) - return 0; + return false; /* We're going to try to implement comparisons by performing a subtract, then comparing against zero. Unfortunately, @@ -15159,14 +15381,14 @@ rs6000_emit_cmove (rtx dest, rtx op, rtx true_cond, rtx false_cond) && ((! rtx_equal_p (op0, false_cond) && ! rtx_equal_p (op1, false_cond)) || (! rtx_equal_p (op0, true_cond) && ! rtx_equal_p (op1, true_cond)))) - return 0; + return false; /* At this point we know we can use fsel. */ /* Don't allow compare_mode other than SFmode or DFmode, for others there is no fsel instruction. */ if (compare_mode != SFmode && compare_mode != DFmode) - return 0; + return false; /* Reduce the comparison to a comparison against zero. */ if (! is_against_zero) @@ -15265,12 +15487,12 @@ rs6000_emit_cmove (rtx dest, rtx op, rtx true_cond, rtx false_cond) gen_rtx_GE (VOIDmode, op0, op1), true_cond, false_cond))); - return 1; + return true; } /* Same as above, but for ints (isel). 
*/ -int +bool rs6000_emit_int_cmove (rtx dest, rtx op, rtx true_cond, rtx false_cond) { rtx condition_rtx, cr; @@ -15280,7 +15502,7 @@ rs6000_emit_int_cmove (rtx dest, rtx op, rtx true_cond, rtx false_cond) bool signedp; if (mode != SImode && (!TARGET_POWERPC64 || mode != DImode)) - return 0; + return false; /* We still have to do the compare, because isel doesn't do a compare, it just looks at the CRx bits set by a previous compare @@ -15315,7 +15537,7 @@ rs6000_emit_int_cmove (rtx dest, rtx op, rtx true_cond, rtx false_cond) emit_insn (isel_func (dest, condition_rtx, true_cond, false_cond, cr)); - return 1; + return true; } void @@ -17364,14 +17586,6 @@ rs6000_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost, } break; - case TYPE_FPLOAD: - if ((rs6000_tune == PROCESSOR_POWER6) - && get_attr_update (insn) == UPDATE_NO - && recog_memoized (dep_insn) - && (INSN_CODE (dep_insn) >= 0) - && (get_attr_type (dep_insn) == TYPE_MFFGPR)) - return 2; - default: break; } @@ -17397,11 +17611,6 @@ rs6000_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost, || get_attr_type (dep_insn) == TYPE_FPSIMPLE) return 1; break; - case TYPE_FPLOAD: - if (get_attr_update (insn) == UPDATE_NO - && get_attr_type (dep_insn) == TYPE_MFFGPR) - return 2; - break; default: break; } @@ -19547,7 +19756,7 @@ rs6000_longcall_ref (rtx call_ref, rtx arg) { rtx base = const0_rtx; int regno = 12; - if (rs6000_pcrel_p (cfun)) + if (rs6000_pcrel_p ()) { rtx reg = gen_rtx_REG (Pmode, regno); rtx u = gen_rtx_UNSPEC_VOLATILE (Pmode, @@ -21055,9 +21264,9 @@ rs6000_rtx_costs (rtx x, machine_mode mode, int outer_code, return true; } else if ((outer_code == PLUS - && reg_or_add_cint_operand (x, VOIDmode)) + && reg_or_add_cint_operand (x, mode)) || (outer_code == MINUS - && reg_or_sub_cint_operand (x, VOIDmode)) + && reg_or_sub_cint_operand (x, mode)) || ((outer_code == SET || outer_code == IOR || outer_code == XOR) @@ -23142,6 +23351,10 @@ struct rs6000_opt_mask { static struct rs6000_opt_mask const rs6000_opt_masks[] = { { "altivec", OPTION_MASK_ALTIVEC, false, true }, + { "block-ops-unaligned-vsx", OPTION_MASK_BLOCK_OPS_UNALIGNED_VSX, + false, true }, + { "block-ops-vector-pair", OPTION_MASK_BLOCK_OPS_VECTOR_PAIR, + false, true }, { "cmpb", OPTION_MASK_CMPB, false, true }, { "crypto", OPTION_MASK_CRYPTO, false, true }, { "direct-move", OPTION_MASK_DIRECT_MOVE, false, true }, @@ -23524,18 +23737,19 @@ rs6000_valid_attribute_p (tree fndecl, && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0) return true; - old_optimize = build_optimization_node (&global_options); + old_optimize = build_optimization_node (&global_options, + &global_options_set); func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl); /* If the function changed the optimization levels as well as setting target options, start with the optimizations specified. */ if (func_optimize && func_optimize != old_optimize) - cl_optimization_restore (&global_options, + cl_optimization_restore (&global_options, &global_options_set, TREE_OPTIMIZATION (func_optimize)); /* The target attributes may also change some optimization flags, so update the optimization options if necessary. 
*/ - cl_target_option_save (&cur_target, &global_options); + cl_target_option_save (&cur_target, &global_options, &global_options_set); rs6000_cpu_index = rs6000_tune_index = -1; ret = rs6000_inner_target_options (args, true); @@ -23543,12 +23757,14 @@ rs6000_valid_attribute_p (tree fndecl, if (ret) { ret = rs6000_option_override_internal (false); - new_target = build_target_option_node (&global_options); + new_target = build_target_option_node (&global_options, + &global_options_set); } else new_target = NULL; - new_optimize = build_optimization_node (&global_options); + new_optimize = build_optimization_node (&global_options, + &global_options_set); if (!new_target) ret = false; @@ -23561,10 +23777,10 @@ rs6000_valid_attribute_p (tree fndecl, DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize; } - cl_target_option_restore (&global_options, &cur_target); + cl_target_option_restore (&global_options, &global_options_set, &cur_target); if (old_optimize != new_optimize) - cl_optimization_restore (&global_options, + cl_optimization_restore (&global_options, &global_options_set, TREE_OPTIMIZATION (old_optimize)); return ret; @@ -23578,7 +23794,8 @@ rs6000_valid_attribute_p (tree fndecl, bool rs6000_pragma_target_parse (tree args, tree pop_target) { - tree prev_tree = build_target_option_node (&global_options); + tree prev_tree = build_target_option_node (&global_options, + &global_options_set); tree cur_tree; struct cl_target_option *prev_opt, *cur_opt; HOST_WIDE_INT prev_flags, cur_flags, diff_flags; @@ -23607,7 +23824,7 @@ rs6000_pragma_target_parse (tree args, tree pop_target) cur_tree = ((pop_target) ? pop_target : target_option_default_node); - cl_target_option_restore (&global_options, + cl_target_option_restore (&global_options, &global_options_set, TREE_TARGET_OPTION (cur_tree)); } else @@ -23615,7 +23832,8 @@ rs6000_pragma_target_parse (tree args, tree pop_target) rs6000_cpu_index = rs6000_tune_index = -1; if (!rs6000_inner_target_options (args, false) || !rs6000_option_override_internal (false) - || (cur_tree = build_target_option_node (&global_options)) + || (cur_tree = build_target_option_node (&global_options, + &global_options_set)) == NULL_TREE) { if (TARGET_DEBUG_BUILTIN || TARGET_DEBUG_TARGET) @@ -23670,7 +23888,8 @@ static GTY(()) tree rs6000_previous_fndecl; void rs6000_activate_target_options (tree new_tree) { - cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree)); + cl_target_option_restore (&global_options, &global_options_set, + TREE_TARGET_OPTION (new_tree)); if (TREE_TARGET_GLOBALS (new_tree)) restore_target_globals (TREE_TARGET_GLOBALS (new_tree)); else if (new_tree == target_option_default_node) @@ -23761,7 +23980,8 @@ rs6000_set_current_function (tree fndecl) static void rs6000_function_specific_save (struct cl_target_option *ptr, - struct gcc_options *opts) + struct gcc_options *opts, + struct gcc_options */* opts_set */) { ptr->x_rs6000_isa_flags = opts->x_rs6000_isa_flags; ptr->x_rs6000_isa_flags_explicit = opts->x_rs6000_isa_flags_explicit; @@ -23771,6 +23991,7 @@ rs6000_function_specific_save (struct cl_target_option *ptr, static void rs6000_function_specific_restore (struct gcc_options *opts, + struct gcc_options */* opts_set */, struct cl_target_option *ptr) { @@ -24596,7 +24817,7 @@ rs6000_call_aix (rtx value, rtx func_desc, rtx tlsarg, rtx cookie) if (!SYMBOL_REF_P (func) || (DEFAULT_ABI == ABI_AIX && !SYMBOL_REF_FUNCTION_P (func))) { - if (!rs6000_pcrel_p (cfun)) + if (!rs6000_pcrel_p ()) { /* Save the TOC into its reserved 
slot before the call, and prepare to restore it after the call. */ @@ -24633,8 +24854,7 @@ rs6000_call_aix (rtx value, rtx func_desc, rtx tlsarg, rtx cookie) /* A function pointer in the ELFv2 ABI is just a plain address, but the ABI requires it to be loaded into r12 before the call. */ func_addr = gen_rtx_REG (Pmode, 12); - if (!rtx_equal_p (func_addr, func)) - emit_move_insn (func_addr, func); + emit_move_insn (func_addr, func); abi_reg = func_addr; /* Indirect calls via CTR are strongly preferred over indirect calls via LR, so move the address there. Needed to mark @@ -24703,7 +24923,7 @@ rs6000_call_aix (rtx value, rtx func_desc, rtx tlsarg, rtx cookie) else { /* No TOC register needed for calls from PC-relative callers. */ - if (!rs6000_pcrel_p (cfun)) + if (!rs6000_pcrel_p ()) /* Direct calls use the TOC: for local calls, the callee will assume the TOC register is set; for non-local calls, the PLT stub needs the TOC register. */ @@ -24741,14 +24961,26 @@ rs6000_sibcall_aix (rtx value, rtx func_desc, rtx tlsarg, rtx cookie) { rtx call[2]; rtx insn; + rtx r12 = NULL_RTX; + rtx func_addr = func_desc; gcc_assert (INTVAL (cookie) == 0); if (global_tlsarg) tlsarg = global_tlsarg; + /* For ELFv2, r12 and CTR need to hold the function address + for an indirect call. */ + if (GET_CODE (func_desc) != SYMBOL_REF && DEFAULT_ABI == ABI_ELFv2) + { + r12 = gen_rtx_REG (Pmode, 12); + emit_move_insn (r12, func_desc); + func_addr = gen_rtx_REG (Pmode, CTR_REGNO); + emit_move_insn (func_addr, r12); + } + /* Create the call. */ - call[0] = gen_rtx_CALL (VOIDmode, gen_rtx_MEM (SImode, func_desc), tlsarg); + call[0] = gen_rtx_CALL (VOIDmode, gen_rtx_MEM (SImode, func_addr), tlsarg); if (value != NULL_RTX) call[0] = gen_rtx_SET (value, call[0]); @@ -24758,9 +24990,13 @@ rs6000_sibcall_aix (rtx value, rtx func_desc, rtx tlsarg, rtx cookie) insn = emit_call_insn (insn); /* Note use of the TOC register. */ - if (!rs6000_pcrel_p (cfun)) + if (!rs6000_pcrel_p ()) use_reg (&CALL_INSN_FUNCTION_USAGE (insn), gen_rtx_REG (Pmode, TOC_REGNUM)); + + /* Note use of r12. */ + if (r12) + use_reg (&CALL_INSN_FUNCTION_USAGE (insn), r12); } /* Expand code to perform a call under the SYSV4 ABI. */ @@ -25041,7 +25277,7 @@ rs6000_fndecl_pcrel_p (const_tree fndecl) /* Return whether we should generate PC-relative code for *FN. */ bool -rs6000_pcrel_p (struct function *fn) +rs6000_function_pcrel_p (struct function *fn) { if (DEFAULT_ABI != ABI_ELFv2) return false; @@ -25054,6 +25290,16 @@ rs6000_pcrel_p (struct function *fn) return rs6000_fndecl_pcrel_p (fn->decl); } +/* Return whether we should generate PC-relative code for the current + function. */ +bool +rs6000_pcrel_p () +{ + return (DEFAULT_ABI == ABI_ELFv2 + && (rs6000_isa_flags & OPTION_MASK_PCREL) != 0 + && TARGET_CMODEL == CMODEL_MEDIUM); +} + /* Given an address (ADDR), a mode (MODE), and what the format of the non-prefixed address (NON_PREFIXED_FORMAT) is, return the instruction format @@ -26737,40 +26983,76 @@ rs6000_cannot_substitute_mem_equiv_p (rtx mem) static const char * rs6000_invalid_conversion (const_tree fromtype, const_tree totype) { - if (element_mode (fromtype) != element_mode (totype)) + /* Make sure we're working with the canonical types. 
*/ + if (TYPE_CANONICAL (fromtype) != NULL_TREE) + fromtype = TYPE_CANONICAL (fromtype); + if (TYPE_CANONICAL (totype) != NULL_TREE) + totype = TYPE_CANONICAL (totype); + + machine_mode frommode = TYPE_MODE (fromtype); + machine_mode tomode = TYPE_MODE (totype); + + if (frommode != tomode) { /* Do not allow conversions to/from PXImode and POImode types. */ - if (TYPE_MODE (fromtype) == PXImode) + if (frommode == PXImode) return N_("invalid conversion from type %<__vector_quad%>"); - if (TYPE_MODE (totype) == PXImode) + if (tomode == PXImode) return N_("invalid conversion to type %<__vector_quad%>"); - if (TYPE_MODE (fromtype) == POImode) + if (frommode == POImode) return N_("invalid conversion from type %<__vector_pair%>"); - if (TYPE_MODE (totype) == POImode) + if (tomode == POImode) return N_("invalid conversion to type %<__vector_pair%>"); } else if (POINTER_TYPE_P (fromtype) && POINTER_TYPE_P (totype)) { + /* We really care about the modes of the base types. */ + frommode = TYPE_MODE (TREE_TYPE (fromtype)); + tomode = TYPE_MODE (TREE_TYPE (totype)); + /* Do not allow conversions to/from PXImode and POImode pointer types, except to/from void pointers. */ - if (TYPE_MODE (TREE_TYPE (fromtype)) == PXImode - && TYPE_MODE (TREE_TYPE (totype)) != VOIDmode) - return N_("invalid conversion from type %<* __vector_quad%>"); - if (TYPE_MODE (TREE_TYPE (totype)) == PXImode - && TYPE_MODE (TREE_TYPE (fromtype)) != VOIDmode) - return N_("invalid conversion to type %<* __vector_quad%>"); - if (TYPE_MODE (TREE_TYPE (fromtype)) == POImode - && TYPE_MODE (TREE_TYPE (totype)) != VOIDmode) - return N_("invalid conversion from type %<* __vector_pair%>"); - if (TYPE_MODE (TREE_TYPE (totype)) == POImode - && TYPE_MODE (TREE_TYPE (fromtype)) != VOIDmode) - return N_("invalid conversion to type %<* __vector_pair%>"); + if (frommode != tomode + && frommode != VOIDmode + && tomode != VOIDmode) + { + if (frommode == PXImode) + return N_("invalid conversion from type %<* __vector_quad%>"); + if (tomode == PXImode) + return N_("invalid conversion to type %<* __vector_quad%>"); + if (frommode == POImode) + return N_("invalid conversion from type %<* __vector_pair%>"); + if (tomode == POImode) + return N_("invalid conversion to type %<* __vector_pair%>"); + } } /* Conversion allowed. 
*/ return NULL; } +long long +rs6000_const_f32_to_i32 (rtx operand) +{ + long long value; + const struct real_value *rv = CONST_DOUBLE_REAL_VALUE (operand); + + gcc_assert (GET_MODE (operand) == SFmode); + REAL_VALUE_TO_TARGET_SINGLE (*rv, value); + return value; +} + +void +rs6000_emit_xxspltidp_v2df (rtx dst, long value) +{ + printf("rs6000_emit_xxspltidp_v2df called %ld\n", value); + printf("rs6000_emit_xxspltidp_v2df called 0x%lx\n", value); + if (((value & 0x7F800000) == 0) && ((value & 0x7FFFFF) != 0)) + inform (input_location, + "the result for the xxspltidp instruction is undefined for subnormal input values.\n"); + emit_insn( gen_xxspltidp_v2df_inst (dst, GEN_INT (value))); +} + struct gcc_target targetm = TARGET_INITIALIZER; #include "gt-rs6000.h" diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index 7baaa61..779bfd1 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -77,6 +77,7 @@ UNSPEC_TLS_GET_ADDR UNSPEC_MOVESI_FROM_CR UNSPEC_MOVESI_TO_CR + UNSPEC_XVTLSBB UNSPEC_TLSDTPREL UNSPEC_TLSDTPRELHA UNSPEC_TLSDTPRELLO @@ -201,7 +202,7 @@ cr_logical,mfcr,mfcrf,mtcr, fpcompare,fp,fpsimple,dmul,qmul,sdiv,ddiv,ssqrt,dsqrt, vecsimple,veccomplex,vecdiv,veccmp,veccmpsimple,vecperm, - vecfloat,vecfdiv,vecdouble,mffgpr,mftgpr,crypto, + vecfloat,vecfdiv,vecdouble,mtvsr,mfvsr,crypto, veclogical,veccmpfx,vecexts,vecmove, htm,htmsimple,dfp,mma" (const_string "integer")) @@ -322,7 +323,7 @@ (const (symbol_ref "(enum attr_cpu) rs6000_tune"))) ;; The ISA we implement. -(define_attr "isa" "any,p5,p6,p7,p7v,p8v,p9v,p9kf,p9tf,p10" +(define_attr "isa" "any,p5,p6,p7,p7v,p8v,p9,p9v,p9kf,p9tf,p10" (const_string "any")) ;; Is this alternative enabled for the current CPU/ISA/etc.? @@ -351,6 +352,10 @@ (match_test "TARGET_P8_VECTOR")) (const_int 1) + (and (eq_attr "isa" "p9") + (match_test "TARGET_MODULO")) + (const_int 1) + (and (eq_attr "isa" "p9v") (match_test "TARGET_P9_VECTOR")) (const_int 1) @@ -916,7 +921,7 @@ mtvsrwz %x0,%1 mfvsrwz %0,%x1 xxextractuw %x0,%x1,4" - [(set_attr "type" "load,shift,fpload,fpload,mffgpr,mftgpr,vecexts") + [(set_attr "type" "load,shift,fpload,fpload,mtvsr,mfvsr,vecexts") (set_attr "isa" "*,*,p7,p8v,p8v,p8v,p9v")]) (define_insn_and_split "*zero_extendsi<mode>2_dot" @@ -1103,7 +1108,7 @@ vextsw2d %0,%1 # #" - [(set_attr "type" "load,exts,fpload,fpload,mffgpr,vecexts,vecperm,mftgpr") + [(set_attr "type" "load,exts,fpload,fpload,mtvsr,vecexts,vecperm,mfvsr") (set_attr "sign_extend" "yes") (set_attr "length" "*,*,*,*,*,*,8,8") (set_attr "isa" "*,*,p6,p8v,p8v,p9v,p8v,p8v")]) @@ -2586,15 +2591,16 @@ [(set_attr "type" "store")]) (define_insn_and_split "bswaphi2_reg" - [(set (match_operand:HI 0 "gpc_reg_operand" "=&r,wa") + [(set (match_operand:HI 0 "gpc_reg_operand" "=r,&r,wa") (bswap:HI - (match_operand:HI 1 "gpc_reg_operand" "r,wa"))) - (clobber (match_scratch:SI 2 "=&r,X"))] + (match_operand:HI 1 "gpc_reg_operand" "r,r,wa"))) + (clobber (match_scratch:SI 2 "=X,&r,X"))] "" "@ + brh %0,%1 # xxbrh %x0,%x1" - "reload_completed && int_reg_operand (operands[0], HImode)" + "reload_completed && !TARGET_POWER10 && int_reg_operand (operands[0], HImode)" [(set (match_dup 3) (and:SI (lshiftrt:SI (match_dup 4) (const_int 8)) @@ -2610,21 +2616,22 @@ operands[3] = simplify_gen_subreg (SImode, operands[0], HImode, 0); operands[4] = simplify_gen_subreg (SImode, operands[1], HImode, 0); } - [(set_attr "length" "12,4") - (set_attr "type" "*,vecperm") - (set_attr "isa" "*,p9v")]) + [(set_attr "length" "*,12,*") + (set_attr "type" "shift,*,vecperm") + 
(set_attr "isa" "p10,*,p9v")]) ;; We are always BITS_BIG_ENDIAN, so the bit positions below in ;; zero_extract insns do not change for -mlittle. (define_insn_and_split "bswapsi2_reg" - [(set (match_operand:SI 0 "gpc_reg_operand" "=&r,wa") + [(set (match_operand:SI 0 "gpc_reg_operand" "=r,&r,wa") (bswap:SI - (match_operand:SI 1 "gpc_reg_operand" "r,wa")))] + (match_operand:SI 1 "gpc_reg_operand" "r,r,wa")))] "" "@ + brw %0,%1 # xxbrw %x0,%x1" - "reload_completed && int_reg_operand (operands[0], SImode)" + "reload_completed && !TARGET_POWER10 && int_reg_operand (operands[0], SImode)" [(set (match_dup 0) ; DABC (rotate:SI (match_dup 1) (const_int 24))) @@ -2641,9 +2648,9 @@ (and:SI (match_dup 0) (const_int -256))))] "" - [(set_attr "length" "12,4") - (set_attr "type" "*,vecperm") - (set_attr "isa" "*,p9v")]) + [(set_attr "length" "4,12,4") + (set_attr "type" "shift,*,vecperm") + (set_attr "isa" "p10,*,p9v")]) ;; On systems with LDBRX/STDBRX generate the loads/stores directly, just like ;; we do for L{H,W}BRX and ST{H,W}BRX above. If not, we have to generate more @@ -2676,7 +2683,7 @@ emit_insn (gen_bswapdi2_store (dest, src)); } else if (TARGET_P9_VECTOR) - emit_insn (gen_bswapdi2_xxbrd (dest, src)); + emit_insn (gen_bswapdi2_brd (dest, src)); else emit_insn (gen_bswapdi2_reg (dest, src)); DONE; @@ -2707,13 +2714,15 @@ "stdbrx %1,%y0" [(set_attr "type" "store")]) -(define_insn "bswapdi2_xxbrd" - [(set (match_operand:DI 0 "gpc_reg_operand" "=wa") - (bswap:DI (match_operand:DI 1 "gpc_reg_operand" "wa")))] +(define_insn "bswapdi2_brd" + [(set (match_operand:DI 0 "gpc_reg_operand" "=r,wa") + (bswap:DI (match_operand:DI 1 "gpc_reg_operand" "r,wa")))] "TARGET_P9_VECTOR" - "xxbrd %x0,%x1" - [(set_attr "type" "vecperm") - (set_attr "isa" "p9v")]) + "@ + brd %0,%1 + xxbrd %x0,%x1" + [(set_attr "type" "shift,vecperm") + (set_attr "isa" "p10,p9v")]) (define_insn "bswapdi2_reg" [(set (match_operand:DI 0 "gpc_reg_operand" "=&r") @@ -4240,6 +4249,32 @@ operands[5] = GEN_INT ((HOST_WIDE_INT_1U << <bits>) - 1); }) +; rldimi with UNSPEC_SI_FROM_SF. +(define_insn_and_split "*rotldi3_insert_sf" + [(set (match_operand:DI 0 "gpc_reg_operand") + (ior:DI + (ashift:DI (match_operand:DI 1 "gpc_reg_operand") + (match_operand:SI 2 "const_int_operand")) + (zero_extend:DI + (unspec:QHSI + [(match_operand:SF 3 "memory_operand")] + UNSPEC_SI_FROM_SF)))) + (clobber (match_scratch:V4SF 4))] + "TARGET_POWERPC64 && INTVAL (operands[2]) == <bits>" + "#" + "" + [(parallel [(set (match_dup 5) + (zero_extend:DI (unspec:QHSI [(match_dup 3)] UNSPEC_SI_FROM_SF))) + (clobber (match_dup 4))]) + (set (match_dup 0) + (ior:DI + (and:DI (match_dup 5) (match_dup 6)) + (ashift:DI (match_dup 1) (match_dup 2))))] +{ + operands[5] = gen_reg_rtx (DImode); + operands[6] = GEN_INT ((HOST_WIDE_INT_1U << <bits>) - 1); +}) + ; rlwimi, too. (define_split [(set (match_operand:SI 0 "gpc_reg_operand") @@ -5013,7 +5048,7 @@ { operands[2] = gen_highpart (DImode, operands[1]); } - [(set_attr "type" "mftgpr,*")]) + [(set_attr "type" "mfvsr,*")]) ;; Optimize IEEE 128-bit signbit on to avoid loading the value into a vector ;; register and then doing a direct move if the value comes from memory. 
On @@ -5367,7 +5402,7 @@ lxsiwax %x0,%y1 mtvsrwa %x0,%1 vextsw2d %0,%1" - [(set_attr "type" "fpload,fpload,mffgpr,vecexts") + [(set_attr "type" "fpload,fpload,mtvsr,vecexts") (set_attr "isa" "*,p8v,p8v,p9v")]) ; This split must be run before register allocation because it allocates the @@ -5448,7 +5483,7 @@ lxsiwzx %x0,%y1 mtvsrwz %x0,%1 xxextractuw %x0,%x1,4" - [(set_attr "type" "fpload,fpload,mftgpr,vecexts") + [(set_attr "type" "fpload,fpload,mtvsr,vecexts") (set_attr "isa" "*,p8v,p8v,p9v")]) (define_insn_and_split "floatunssi<mode>2_lfiwzx" @@ -7052,7 +7087,7 @@ *, *, *, *, veclogical, vecsimple, vecsimple, vecsimple, veclogical, veclogical, vecsimple, - mffgpr, mftgpr, + mtvsr, mfvsr, *, *, *") (set_attr "length" "*, *, @@ -7138,8 +7173,8 @@ } [(set_attr "type" "*, load, fpload, fpload, store, - fpstore, fpstore, fpstore, mftgpr, fp, - mffgpr") + fpstore, fpstore, fpstore, mfvsr, fp, + mtvsr") (set_attr "length" "*, *, *, *, *, *, *, *, 8, *, @@ -7193,7 +7228,7 @@ } [(set_attr "type" "*, load, fpload, fpload, two, - two, mffgpr") + two, mtvsr") (set_attr "length" "*, *, *, *, 8, 8, *") @@ -7320,8 +7355,8 @@ nop" [(set_attr "type" "*, load, fpload, store, fpstore, *, - vecsimple, vecperm, vecperm, vecperm, vecperm, mftgpr, - mffgpr, mfjmpr, mtjmpr, *") + vecsimple, vecperm, vecperm, vecperm, vecperm, mfvsr, + mtvsr, mfjmpr, mtjmpr, *") (set_attr "length" "*, *, *, *, *, *, *, *, *, *, 8, *, @@ -7496,7 +7531,7 @@ mf%1 %0 nop" [(set_attr "type" - "load, fpload, store, fpstore, mffgpr, mftgpr, + "load, fpload, store, fpstore, mtvsr, mfvsr, fpsimple, *, mtjmpr, mfjmpr, *") (set_attr "isa" "*, p7, *, *, p8v, p8v, @@ -7599,11 +7634,42 @@ *, 12, *, *") (set_attr "type" "load, fpload, fpload, fpload, store, fpstore, - fpstore, vecfloat, mffgpr, *") + fpstore, vecfloat, mfvsr, *") (set_attr "isa" "*, *, p9v, p8v, *, *, p8v, p8v, p8v, *")]) +;; For extracting high part element from DImode register like: +;; {%1:SF=unspec[r122:DI>>0x20#0] 86;clobber scratch;} +;; split it before reload with "and mask" to avoid generating shift right +;; 32 bit then shift left 32 bit. 
+(define_insn_and_split "movsf_from_si2" + [(set (match_operand:SF 0 "gpc_reg_operand" "=wa") + (unspec:SF + [(subreg:SI + (ashiftrt:DI + (match_operand:DI 1 "input_operand" "r") + (const_int 32)) + 0)] + UNSPEC_SF_FROM_SI)) + (clobber (match_scratch:DI 2 "=r"))] + "TARGET_NO_SF_SUBREG" + "#" + "&& 1" + [(const_int 0)] +{ + if (GET_CODE (operands[2]) == SCRATCH) + operands[2] = gen_reg_rtx (DImode); + + rtx mask = GEN_INT (HOST_WIDE_INT_M1U << 32); + emit_insn (gen_anddi3 (operands[2], operands[1], mask)); + emit_insn (gen_p8_mtvsrd_sf (operands[0], operands[2])); + emit_insn (gen_vsx_xscvspdpn_directmove (operands[0], operands[0])); + DONE; +} + [(set_attr "length" "12") + (set_attr "type" "vecfloat") + (set_attr "isa" "p8v")]) ;; Move 64-bit binary/decimal floating point (define_expand "mov<mode>" @@ -7805,7 +7871,7 @@ "fpstore, fpload, fpsimple, fpload, fpstore, fpload, fpstore, veclogical, veclogical, integer, store, load, *, mtjmpr, mfjmpr, - *, mftgpr, mffgpr") + *, mfvsr, mtvsr") (set_attr "size" "64") (set_attr "isa" "*, *, *, p9v, p9v, @@ -8645,7 +8711,7 @@ UNSPEC_P8V_MTVSRWZ))] "!TARGET_POWERPC64 && TARGET_DIRECT_MOVE" "mtvsrwz %x0,%1" - [(set_attr "type" "mftgpr")]) + [(set_attr "type" "mtvsr")]) (define_insn_and_split "reload_fpr_from_gpr<mode>" [(set (match_operand:FMOVE64X 0 "register_operand" "=d") @@ -8679,7 +8745,7 @@ UNSPEC_P8V_MTVSRD))] "TARGET_POWERPC64 && TARGET_DIRECT_MOVE" "mtvsrd %x0,%1" - [(set_attr "type" "mftgpr")]) + [(set_attr "type" "mfvsr")]) (define_insn "p8_xxpermdi_<mode>" [(set (match_operand:FMOVE128_GPR 0 "register_operand" "=wa") @@ -8744,7 +8810,7 @@ UNSPEC_P8V_MTVSRD))] "TARGET_POWERPC64 && TARGET_DIRECT_MOVE" "mtvsrd %x0,%1" - [(set_attr "type" "mftgpr")]) + [(set_attr "type" "mtvsr")]) (define_insn_and_split "reload_vsx_from_gprsf" [(set (match_operand:SF 0 "register_operand" "=wa") @@ -8779,7 +8845,7 @@ UNSPEC_P8V_RELOAD_FROM_VSX))] "TARGET_POWERPC64 && TARGET_DIRECT_MOVE" "mfvsrd %0,%x1" - [(set_attr "type" "mftgpr")]) + [(set_attr "type" "mfvsr")]) (define_insn_and_split "reload_gpr_from_vsx<mode>" [(set (match_operand:FMOVE128_GPR 0 "register_operand" "=r") @@ -8994,7 +9060,7 @@ vecsimple, vecsimple, vecsimple, veclogical, veclogical, vecsimple, vecsimple, mfjmpr, mtjmpr, *, - mftgpr, mffgpr") + mfvsr, mtvsr") (set_attr "size" "64") (set_attr "length" "*, *, *, @@ -10487,7 +10553,7 @@ (match_operand:P 3 "" "")] UNSPECV_PLT_PCREL))] "HAVE_AS_PLTSEQ && TARGET_ELF - && rs6000_pcrel_p (cfun)" + && rs6000_pcrel_p ()" { return rs6000_pltseq_template (operands, RS6000_PLTSEQ_PLT_PCREL34); } @@ -10572,7 +10638,7 @@ else if (INTVAL (operands[2]) & CALL_V4_CLEAR_FP_ARGS) output_asm_insn ("creqv 6,6,6", operands); - if (rs6000_pcrel_p (cfun)) + if (rs6000_pcrel_p ()) return "bl %z0@notoc"; return (DEFAULT_ABI == ABI_V4 && flag_pic) ? "bl %z0@local" : "bl %z0"; } @@ -10593,7 +10659,7 @@ else if (INTVAL (operands[3]) & CALL_V4_CLEAR_FP_ARGS) output_asm_insn ("creqv 6,6,6", operands); - if (rs6000_pcrel_p (cfun)) + if (rs6000_pcrel_p ()) return "bl %z1@notoc"; return (DEFAULT_ABI == ABI_V4 && flag_pic) ? 
"bl %z1@local" : "bl %z1"; } @@ -10768,7 +10834,7 @@ } [(set_attr "type" "branch") (set (attr "length") - (if_then_else (match_test "rs6000_pcrel_p (cfun)") + (if_then_else (match_test "rs6000_pcrel_p ()") (const_int 4) (const_int 8)))]) @@ -10785,7 +10851,7 @@ } [(set_attr "type" "branch") (set (attr "length") - (if_then_else (match_test "rs6000_pcrel_p (cfun)") + (if_then_else (match_test "rs6000_pcrel_p ()") (const_int 4) (const_int 8)))]) @@ -10859,7 +10925,7 @@ (match_operand 1)) (use (match_operand:SI 2 "immediate_operand" "n,n,n")) (clobber (reg:P LR_REGNO))] - "rs6000_pcrel_p (cfun)" + "rs6000_pcrel_p ()" { return rs6000_indirect_call_template (operands, 0); } @@ -10896,7 +10962,7 @@ (match_operand:P 2 "unspec_tls" ""))) (use (match_operand:SI 3 "immediate_operand" "n,n,n")) (clobber (reg:P LR_REGNO))] - "rs6000_pcrel_p (cfun)" + "rs6000_pcrel_p ()" { return rs6000_indirect_call_template (operands, 1); } @@ -10998,8 +11064,8 @@ DONE; }) -(define_insn "*sibcall_local32" - [(call (mem:SI (match_operand:SI 0 "current_file_function_operand" "s,s")) +(define_insn "*sibcall_local<mode>" + [(call (mem:SI (match_operand:P 0 "current_file_function_operand" "s,s")) (match_operand 1)) (use (match_operand:SI 2 "immediate_operand" "O,n")) (simple_return)] @@ -11016,27 +11082,9 @@ [(set_attr "type" "branch") (set_attr "length" "4,8")]) -(define_insn "*sibcall_local64" - [(call (mem:SI (match_operand:DI 0 "current_file_function_operand" "s,s")) - (match_operand 1)) - (use (match_operand:SI 2 "immediate_operand" "O,n")) - (simple_return)] - "TARGET_64BIT && (INTVAL (operands[2]) & CALL_LONG) == 0" -{ - if (INTVAL (operands[2]) & CALL_V4_SET_FP_ARGS) - output_asm_insn ("crxor 6,6,6", operands); - - else if (INTVAL (operands[2]) & CALL_V4_CLEAR_FP_ARGS) - output_asm_insn ("creqv 6,6,6", operands); - - return (DEFAULT_ABI == ABI_V4 && flag_pic) ? "b %z0@local" : "b %z0"; -} - [(set_attr "type" "branch") - (set_attr "length" "4,8")]) - -(define_insn "*sibcall_value_local32" +(define_insn "*sibcall_value_local<mode>" [(set (match_operand 0 "" "") - (call (mem:SI (match_operand:SI 1 "current_file_function_operand" "s,s")) + (call (mem:SI (match_operand:P 1 "current_file_function_operand" "s,s")) (match_operand 2))) (use (match_operand:SI 3 "immediate_operand" "O,n")) (simple_return)] @@ -11053,25 +11101,6 @@ [(set_attr "type" "branch") (set_attr "length" "4,8")]) -(define_insn "*sibcall_value_local64" - [(set (match_operand 0 "" "") - (call (mem:SI (match_operand:DI 1 "current_file_function_operand" "s,s")) - (match_operand 2))) - (use (match_operand:SI 3 "immediate_operand" "O,n")) - (simple_return)] - "TARGET_64BIT && (INTVAL (operands[3]) & CALL_LONG) == 0" -{ - if (INTVAL (operands[3]) & CALL_V4_SET_FP_ARGS) - output_asm_insn ("crxor 6,6,6", operands); - - else if (INTVAL (operands[3]) & CALL_V4_CLEAR_FP_ARGS) - output_asm_insn ("creqv 6,6,6", operands); - - return (DEFAULT_ABI == ABI_V4 && flag_pic) ? 
"b %z1@local" : "b %z1"; -} - [(set_attr "type" "branch") - (set_attr "length" "4,8")]) - (define_insn "*sibcall_indirect_nonlocal_sysv<mode>" [(call (mem:SI (match_operand:P 0 "indirect_call_operand" "c,*l,X")) (match_operand 1)) @@ -12668,12 +12697,7 @@ "" { if (rs6000_speculate_indirect_jumps) - { - if (TARGET_32BIT) - emit_jump_insn (gen_tablejumpsi (operands[0], operands[1])); - else - emit_jump_insn (gen_tablejumpdi (operands[0], operands[1])); - } + emit_jump_insn (gen_tablejump_normal (Pmode, operands[0], operands[1])); else { rtx ccreg = gen_reg_rtx (CCmode); @@ -12687,69 +12711,57 @@ DONE; }) -(define_expand "tablejumpsi" - [(set (match_dup 3) - (plus:SI (match_operand:SI 0) - (match_dup 2))) - (parallel [(set (pc) - (match_dup 3)) - (use (label_ref (match_operand 1)))])] - "TARGET_32BIT && rs6000_speculate_indirect_jumps" +(define_expand "@tablejump<mode>_normal" + [(use (match_operand:SI 0)) + (use (match_operand:P 1))] + "rs6000_speculate_indirect_jumps" { + rtx off; operands[0] = force_reg (SImode, operands[0]); - operands[2] = force_reg (SImode, gen_rtx_LABEL_REF (SImode, operands[1])); - operands[3] = gen_reg_rtx (SImode); + if (<MODE>mode == SImode) + off = operands[0]; + else + { + off = gen_reg_rtx (Pmode); + rtx src = gen_rtx_fmt_e (SIGN_EXTEND, Pmode, operands[0]); + emit_move_insn (off, src); + } + + rtx lab = force_reg (Pmode, gen_rtx_LABEL_REF (Pmode, operands[1])); + rtx addr = gen_reg_rtx (Pmode); + + emit_insn (gen_add<mode>3 (addr, off, lab)); + emit_jump_insn (gen_tablejump_insn_normal (Pmode, addr, operands[1])); + DONE; }) -(define_expand "tablejumpsi_nospec" - [(set (match_dup 4) - (plus:SI (match_operand:SI 0) - (match_dup 3))) - (parallel [(set (pc) - (match_dup 4)) - (use (label_ref (match_operand 1))) - (clobber (match_operand 2))])] - "TARGET_32BIT && !rs6000_speculate_indirect_jumps" +(define_expand "@tablejump<mode>_nospec" + [(use (match_operand:SI 0)) + (use (match_operand:P 1)) + (use (match_operand:CC 2))] + "!rs6000_speculate_indirect_jumps" { + rtx off; operands[0] = force_reg (SImode, operands[0]); - operands[3] = force_reg (SImode, gen_rtx_LABEL_REF (SImode, operands[1])); - operands[4] = gen_reg_rtx (SImode); -}) + if (<MODE>mode == SImode) + off = operands[0]; + else + { + off = gen_reg_rtx (Pmode); + rtx src = gen_rtx_fmt_e (SIGN_EXTEND, Pmode, operands[0]); + emit_move_insn (off, src); + } -(define_expand "tablejumpdi" - [(set (match_dup 4) - (sign_extend:DI (match_operand:SI 0 "lwa_operand"))) - (set (match_dup 3) - (plus:DI (match_dup 4) - (match_dup 2))) - (parallel [(set (pc) - (match_dup 3)) - (use (label_ref (match_operand 1)))])] - "TARGET_64BIT && rs6000_speculate_indirect_jumps" -{ - operands[2] = force_reg (DImode, gen_rtx_LABEL_REF (DImode, operands[1])); - operands[3] = gen_reg_rtx (DImode); - operands[4] = gen_reg_rtx (DImode); -}) + rtx lab = force_reg (Pmode, gen_rtx_LABEL_REF (Pmode, operands[1])); + rtx addr = gen_reg_rtx (Pmode); -(define_expand "tablejumpdi_nospec" - [(set (match_dup 5) - (sign_extend:DI (match_operand:SI 0 "lwa_operand"))) - (set (match_dup 4) - (plus:DI (match_dup 5) - (match_dup 3))) - (parallel [(set (pc) - (match_dup 4)) - (use (label_ref (match_operand 1))) - (clobber (match_operand 2))])] - "TARGET_64BIT && !rs6000_speculate_indirect_jumps" -{ - operands[3] = force_reg (DImode, gen_rtx_LABEL_REF (DImode, operands[1])); - operands[4] = gen_reg_rtx (DImode); - operands[5] = gen_reg_rtx (DImode); + emit_insn (gen_add<mode>3 (addr, off, lab)); + emit_jump_insn (gen_tablejump_insn_nospec (Pmode, addr, 
operands[1], + operands[2])); + DONE; }) -(define_insn "*tablejump<mode>_internal1" +(define_insn "@tablejump<mode>_insn_normal" [(set (pc) (match_operand:P 0 "register_operand" "c,*l")) (use (label_ref (match_operand 1)))] @@ -12757,7 +12769,7 @@ "b%T0" [(set_attr "type" "jmpreg")]) -(define_insn "*tablejump<mode>_internal1_nospec" +(define_insn "@tablejump<mode>_insn_nospec" [(set (pc) (match_operand:P 0 "register_operand" "c,*l")) (use (label_ref (match_operand 1))) @@ -13988,7 +14000,7 @@ operands[3] = gen_rtx_REG (<FP128_64>mode, fp_regno); } - [(set_attr "type" "fp,fpstore,mffgpr,mftgpr,store")]) + [(set_attr "type" "fp,fpstore,mtvsr,mfvsr,store")]) (define_insn_and_split "unpack<mode>_nodm" [(set (match_operand:<FP128_64> 0 "nonimmediate_operand" "=d,m") diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt index 6b426f2..b2a70e8 100644 --- a/gcc/config/rs6000/rs6000.opt +++ b/gcc/config/rs6000/rs6000.opt @@ -324,6 +324,14 @@ mblock-move-inline-limit= Target Report Var(rs6000_block_move_inline_limit) Init(0) RejectNegative Joined UInteger Save Max number of bytes to move inline. +mblock-ops-unaligned-vsx +Target Report Mask(BLOCK_OPS_UNALIGNED_VSX) Var(rs6000_isa_flags) +Generate unaligned VSX load/store for inline expansion of memcpy/memmove. + +mblock-ops-vector-pair +Target Undocumented Mask(BLOCK_OPS_VECTOR_PAIR) Var(rs6000_isa_flags) +Generate unaligned VSX vector pair load/store for inline expansion of memcpy/memmove. + mblock-compare-inline-limit= Target Report Var(rs6000_block_compare_inline_limit) Init(63) RejectNegative Joined UInteger Save Max number of bytes to compare without loops. @@ -568,8 +576,7 @@ mspeculate-indirect-jumps Target Undocumented Var(rs6000_speculate_indirect_jumps) Init(1) Save mpower10 -Target Report Mask(POWER10) Var(rs6000_isa_flags) -Use instructions added in ISA 3.1. 
+Target Undocumented Mask(POWER10) Var(rs6000_isa_flags) WarnRemoved mprefixed Target Report Mask(PREFIXED) Var(rs6000_isa_flags) diff --git a/gcc/config/rs6000/smmintrin.h b/gcc/config/rs6000/smmintrin.h index d78ddba..4c0fc86 100644 --- a/gcc/config/rs6000/smmintrin.h +++ b/gcc/config/rs6000/smmintrin.h @@ -42,6 +42,36 @@ #include <altivec.h> #include <tmmintrin.h> +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_epi8 (__m128i const __A, int const __D, int const __N) +{ + __v16qi result = (__v16qi)__A; + + result [__N & 0xf] = __D; + + return (__m128i) result; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_epi32 (__m128i const __A, int const __D, int const __N) +{ + __v4si result = (__v4si)__A; + + result [__N & 3] = __D; + + return (__m128i) result; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_epi64 (__m128i const __A, long long const __D, int const __N) +{ + __v2di result = (__v2di)__A; + + result [__N & 1] = __D; + + return (__m128i) result; +} + extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_extract_epi8 (__m128i __X, const int __N) { diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md index f753771..4ff5245 100644 --- a/gcc/config/rs6000/vsx.md +++ b/gcc/config/rs6000/vsx.md @@ -300,7 +300,7 @@ UNSPEC_VSX_DIVUD UNSPEC_VSX_MULSD UNSPEC_VSX_SIGN_EXTEND - UNSPEC_VSX_XVCVBF16SP + UNSPEC_VSX_XVCVBF16SPN UNSPEC_VSX_XVCVSPBF16 UNSPEC_VSX_XVCVSPSXDS UNSPEC_VSX_XVCVSPHP @@ -355,13 +355,31 @@ UNSPEC_VCNTMB UNSPEC_VEXPAND UNSPEC_VEXTRACT + UNSPEC_EXTRACTL + UNSPEC_EXTRACTR + UNSPEC_INSERTL + UNSPEC_INSERTR + UNSPEC_REPLACE_ELT + UNSPEC_REPLACE_UN ]) (define_int_iterator XVCVBF16 [UNSPEC_VSX_XVCVSPBF16 - UNSPEC_VSX_XVCVBF16SP]) + UNSPEC_VSX_XVCVBF16SPN]) (define_int_attr xvcvbf16 [(UNSPEC_VSX_XVCVSPBF16 "xvcvspbf16") - (UNSPEC_VSX_XVCVBF16SP "xvcvbf16sp")]) + (UNSPEC_VSX_XVCVBF16SPN "xvcvbf16spn")]) + +;; Like VI, defined in vector.md, but add ISA 2.07 integer vector ops +(define_mode_iterator VI2 [V4SI V8HI V16QI V2DI]) + +;; Vector extract_elt iterator/attr for 32-bit and 64-bit elements +(define_mode_iterator REPLACE_ELT [V4SI V4SF V2DI V2DF]) +(define_mode_attr REPLACE_ELT_char [(V4SI "w") (V4SF "w") + (V2DI "d") (V2DF "d")]) +(define_mode_attr REPLACE_ELT_sh [(V4SI "2") (V4SF "2") + (V2DI "3") (V2DF "3")]) +(define_mode_attr REPLACE_ELT_max [(V4SI "12") (V4SF "12") + (V2DI "8") (V2DF "8")]) ;; VSX moves @@ -1155,7 +1173,7 @@ return rs6000_output_move_128bit (operands); } [(set_attr "type" - "vecstore, vecload, vecsimple, mffgpr, mftgpr, load, + "vecstore, vecload, vecsimple, mtvsr, mfvsr, load, store, load, store, *, vecsimple, vecsimple, vecsimple, *, *, vecstore, vecload") (set_attr "num_insns" @@ -1985,6 +2003,45 @@ "xvcmpgt<sd>p. %x0,%x1,%x2" [(set_attr "type" "<VStype_simple>")]) +;; xvtlsbb BF,XB +;; Set the CR field BF to indicate if the lowest bit (bit 7) of every byte +;; element in VSR[XB] is equal to 1 (ALL_TRUE) or equal to 0 (ALL_FALSE). 
+(define_insn "*xvtlsbb_internal" + [(set (match_operand:CC 0 "cc_reg_operand" "=y") + (unspec:CC [(match_operand:V16QI 1 "vsx_register_operand" "wa")] + UNSPEC_XVTLSBB))] + "TARGET_POWER10" + "xvtlsbb %0,%x1" + [(set_attr "type" "logical")]) + +;; Vector Test Least Significant Bit by Byte +;; for the implementation of the builtin +;; __builtin_vec_test_lsbb_all_ones +;; int vec_test_lsbb_all_ones (vector unsigned char); +;; and +;; __builtin_vec_test_lsbb_all_zeros +;; int vec_test_lsbb_all_zeros (vector unsigned char); +(define_expand "xvtlsbbo" + [(set (match_dup 2) + (unspec:CC [(match_operand:V16QI 1 "vsx_register_operand" "v")] + UNSPEC_XVTLSBB)) + (set (match_operand:SI 0 "gpc_reg_operand" "=r") + (lt:SI (match_dup 2) (const_int 0)))] + "TARGET_POWER10" +{ + operands[2] = gen_reg_rtx (CCmode); +}) +(define_expand "xvtlsbbz" + [(set (match_dup 2) + (unspec:CC [(match_operand:V16QI 1 "vsx_register_operand" "v")] + UNSPEC_XVTLSBB)) + (set (match_operand:SI 0 "gpc_reg_operand" "=r") + (eq:SI (match_dup 2) (const_int 0)))] + "TARGET_POWER10" +{ + operands[2] = gen_reg_rtx (CCmode); +}) + (define_insn "*vsx_ge_<mode>_p" [(set (reg:CC CR6_REGNO) (unspec:CC @@ -2828,7 +2885,7 @@ else gcc_unreachable (); } - [(set_attr "type" "vecperm")]) + [(set_attr "type" "vecperm,vecmove")]) ;; Combiner patterns to allow creating XXPERMDI's to access either double ;; word element in a vector register. @@ -3257,7 +3314,7 @@ else gcc_unreachable (); } - [(set_attr "type" "veclogical,mftgpr,mftgpr,vecperm") + [(set_attr "type" "veclogical,mfvsr,mfvsr,vecperm") (set_attr "isa" "*,*,p8v,p9v")]) ;; Optimize extracting a single scalar element from memory. @@ -3700,7 +3757,7 @@ DONE; } - [(set_attr "type" "mftgpr,vecperm,fpstore") + [(set_attr "type" "mfvsr,vecperm,fpstore") (set_attr "length" "8") (set_attr "isa" "*,p8v,*")]) @@ -3749,7 +3806,7 @@ gen_rtx_REG (DImode, REGNO (vec_tmp))); DONE; } - [(set_attr "type" "mftgpr")]) + [(set_attr "type" "mfvsr")]) ;; Optimize extracting a single scalar element from memory. 
(define_insn_and_split "*vsx_extract_<mode>_load" @@ -3807,6 +3864,224 @@ } [(set_attr "type" "load")]) +;; ISA 3.1 extract +(define_expand "vextractl<mode>" + [(set (match_operand:V2DI 0 "altivec_register_operand") + (unspec:V2DI [(match_operand:VI2 1 "altivec_register_operand") + (match_operand:VI2 2 "altivec_register_operand") + (match_operand:SI 3 "register_operand")] + UNSPEC_EXTRACTL))] + "TARGET_POWER10" +{ + if (BYTES_BIG_ENDIAN) + { + emit_insn (gen_vextractl<mode>_internal (operands[0], operands[1], + operands[2], operands[3])); + emit_insn (gen_xxswapd_v2di (operands[0], operands[0])); + } + else + emit_insn (gen_vextractr<mode>_internal (operands[0], operands[2], + operands[1], operands[3])); + DONE; +}) + +(define_insn "vextractl<mode>_internal" + [(set (match_operand:V2DI 0 "altivec_register_operand" "=v") + (unspec:V2DI [(match_operand:VEC_I 1 "altivec_register_operand" "v") + (match_operand:VEC_I 2 "altivec_register_operand" "v") + (match_operand:SI 3 "register_operand" "r")] + UNSPEC_EXTRACTL))] + "TARGET_POWER10" + "vext<du_or_d><wd>vlx %0,%1,%2,%3" + [(set_attr "type" "vecsimple")]) + +(define_expand "vextractr<mode>" + [(set (match_operand:V2DI 0 "altivec_register_operand") + (unspec:V2DI [(match_operand:VI2 1 "altivec_register_operand") + (match_operand:VI2 2 "altivec_register_operand") + (match_operand:SI 3 "register_operand")] + UNSPEC_EXTRACTR))] + "TARGET_POWER10" +{ + if (BYTES_BIG_ENDIAN) + { + emit_insn (gen_vextractr<mode>_internal (operands[0], operands[1], + operands[2], operands[3])); + emit_insn (gen_xxswapd_v2di (operands[0], operands[0])); + } + else + emit_insn (gen_vextractl<mode>_internal (operands[0], operands[2], + operands[1], operands[3])); + DONE; +}) + +(define_insn "vextractr<mode>_internal" + [(set (match_operand:V2DI 0 "altivec_register_operand" "=v") + (unspec:V2DI [(match_operand:VEC_I 1 "altivec_register_operand" "v") + (match_operand:VEC_I 2 "altivec_register_operand" "v") + (match_operand:SI 3 "register_operand" "r")] + UNSPEC_EXTRACTR))] + "TARGET_POWER10" + "vext<du_or_d><wd>vrx %0,%1,%2,%3" + [(set_attr "type" "vecsimple")]) + +(define_expand "vinsertvl_<mode>" + [(set (match_operand:VI2 0 "altivec_register_operand") + (unspec:VI2 [(match_operand:VI2 1 "altivec_register_operand") + (match_operand:VI2 2 "altivec_register_operand") + (match_operand:SI 3 "register_operand" "r")] + UNSPEC_INSERTL))] + "TARGET_POWER10" +{ + if (BYTES_BIG_ENDIAN) + emit_insn (gen_vinsertvl_internal_<mode> (operands[0], operands[3], + operands[1], operands[2])); + else + emit_insn (gen_vinsertvr_internal_<mode> (operands[0], operands[3], + operands[1], operands[2])); + DONE; +}) + +(define_insn "vinsertvl_internal_<mode>" + [(set (match_operand:VEC_I 0 "altivec_register_operand" "=v") + (unspec:VEC_I [(match_operand:SI 1 "register_operand" "r") + (match_operand:VEC_I 2 "altivec_register_operand" "v") + (match_operand:VEC_I 3 "altivec_register_operand" "0")] + UNSPEC_INSERTL))] + "TARGET_POWER10" + "vins<wd>vlx %0,%1,%2" + [(set_attr "type" "vecsimple")]) + +(define_expand "vinsertvr_<mode>" + [(set (match_operand:VI2 0 "altivec_register_operand") + (unspec:VI2 [(match_operand:VI2 1 "altivec_register_operand") + (match_operand:VI2 2 "altivec_register_operand") + (match_operand:SI 3 "register_operand" "r")] + UNSPEC_INSERTR))] + "TARGET_POWER10" +{ + if (BYTES_BIG_ENDIAN) + emit_insn (gen_vinsertvr_internal_<mode> (operands[0], operands[3], + operands[1], operands[2])); + else + emit_insn (gen_vinsertvl_internal_<mode> (operands[0], operands[3], + operands[1], 
operands[2])); + DONE; +}) + +(define_insn "vinsertvr_internal_<mode>" + [(set (match_operand:VEC_I 0 "altivec_register_operand" "=v") + (unspec:VEC_I [(match_operand:SI 1 "register_operand" "r") + (match_operand:VEC_I 2 "altivec_register_operand" "v") + (match_operand:VEC_I 3 "altivec_register_operand" "0")] + UNSPEC_INSERTR))] + "TARGET_POWER10" + "vins<wd>vrx %0,%1,%2" + [(set_attr "type" "vecsimple")]) + +(define_expand "vinsertgl_<mode>" + [(set (match_operand:VI2 0 "altivec_register_operand") + (unspec:VI2 [(match_operand:SI 1 "register_operand") + (match_operand:VI2 2 "altivec_register_operand") + (match_operand:SI 3 "register_operand")] + UNSPEC_INSERTL))] + "TARGET_POWER10" +{ + if (BYTES_BIG_ENDIAN) + emit_insn (gen_vinsertgl_internal_<mode> (operands[0], operands[3], + operands[1], operands[2])); + else + emit_insn (gen_vinsertgr_internal_<mode> (operands[0], operands[3], + operands[1], operands[2])); + DONE; + }) + +(define_insn "vinsertgl_internal_<mode>" + [(set (match_operand:VEC_I 0 "altivec_register_operand" "=v") + (unspec:VEC_I [(match_operand:SI 1 "register_operand" "r") + (match_operand:SI 2 "register_operand" "r") + (match_operand:VEC_I 3 "altivec_register_operand" "0")] + UNSPEC_INSERTL))] + "TARGET_POWER10" + "vins<wd>lx %0,%1,%2" + [(set_attr "type" "vecsimple")]) + +(define_expand "vinsertgr_<mode>" + [(set (match_operand:VI2 0 "altivec_register_operand") + (unspec:VI2 [(match_operand:SI 1 "register_operand") + (match_operand:VI2 2 "altivec_register_operand") + (match_operand:SI 3 "register_operand")] + UNSPEC_INSERTR))] + "TARGET_POWER10" +{ + if (BYTES_BIG_ENDIAN) + emit_insn (gen_vinsertgr_internal_<mode> (operands[0], operands[3], + operands[1], operands[2])); + else + emit_insn (gen_vinsertgl_internal_<mode> (operands[0], operands[3], + operands[1], operands[2])); + DONE; + }) + +(define_insn "vinsertgr_internal_<mode>" + [(set (match_operand:VEC_I 0 "altivec_register_operand" "=v") + (unspec:VEC_I [(match_operand:SI 1 "register_operand" "r") + (match_operand:SI 2 "register_operand" "r") + (match_operand:VEC_I 3 "altivec_register_operand" "0")] + UNSPEC_INSERTR))] + "TARGET_POWER10" + "vins<wd>rx %0,%1,%2" + [(set_attr "type" "vecsimple")]) + +(define_expand "vreplace_elt_<mode>" + [(set (match_operand:REPLACE_ELT 0 "register_operand") + (unspec:REPLACE_ELT [(match_operand:REPLACE_ELT 1 "register_operand") + (match_operand:<VS_scalar> 2 "register_operand") + (match_operand:QI 3 "const_0_to_3_operand")] + UNSPEC_REPLACE_ELT))] + "TARGET_POWER10" +{ + int index; + /* Immediate value is the word index, convert to byte index and adjust for + Endianness if needed. */ + if (BYTES_BIG_ENDIAN) + index = INTVAL (operands[3]) << <REPLACE_ELT_sh>; + + else + index = <REPLACE_ELT_max> - (INTVAL (operands[3]) << <REPLACE_ELT_sh>); + + emit_insn (gen_vreplace_elt_<mode>_inst (operands[0], operands[1], + operands[2], + GEN_INT (index))); + DONE; + } +[(set_attr "type" "vecsimple")]) + +(define_expand "vreplace_un_<mode>" + [(set (match_operand:REPLACE_ELT 0 "register_operand") + (unspec:REPLACE_ELT [(match_operand:REPLACE_ELT 1 "register_operand") + (match_operand:<VS_scalar> 2 "register_operand") + (match_operand:QI 3 "const_0_to_12_operand")] + UNSPEC_REPLACE_UN))] + "TARGET_POWER10" +{ + /* Immediate value is the byte index Big Endian numbering. 
*/ + emit_insn (gen_vreplace_elt_<mode>_inst (operands[0], operands[1], + operands[2], operands[3])); + DONE; + } +[(set_attr "type" "vecsimple")]) + +(define_insn "vreplace_elt_<mode>_inst" + [(set (match_operand:REPLACE_ELT 0 "register_operand" "=v") + (unspec:REPLACE_ELT [(match_operand:REPLACE_ELT 1 "register_operand" "0") + (match_operand:<VS_scalar> 2 "register_operand" "r") + (match_operand:QI 3 "const_0_to_12_operand" "n")] + UNSPEC_REPLACE_ELT))] + "TARGET_POWER10" + "vins<REPLACE_ELT_char> %0,%2,%3" + [(set_attr "type" "vecsimple")]) + ;; VSX_EXTRACT optimizations ;; Optimize double d = (double) vec_extract (vi, <n>) ;; Get the element into the top position and use XVCVSWDP/XVCVUWDP @@ -4165,7 +4440,7 @@ "@ xxpermdi %x0,%x1,%x1,0 mtvsrdd %x0,%1,%1" - [(set_attr "type" "vecperm")]) + [(set_attr "type" "vecperm,vecmove")]) (define_insn "vsx_splat_<mode>_mem" [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa") @@ -4218,7 +4493,7 @@ (unspec:V4SF [(match_dup 0) (const_int 0)] UNSPEC_VSX_XXSPLTW))] "" - [(set_attr "type" "vecload,vecperm,mftgpr") + [(set_attr "type" "vecload,vecperm,vecperm") (set_attr "length" "*,8,*") (set_attr "isa" "*,p8v,*")]) diff --git a/gcc/config/s390/s390-c.c b/gcc/config/s390/s390-c.c index f236c55..8e5f2c9 100644 --- a/gcc/config/s390/s390-c.c +++ b/gcc/config/s390/s390-c.c @@ -388,7 +388,7 @@ s390_cpu_cpp_builtins (cpp_reader *pfile) cpp_define (pfile, "__s390x__"); if (TARGET_LONG_DOUBLE_128) cpp_define (pfile, "__LONG_DOUBLE_128__"); - cl_target_option_save (&opts, &global_options); + cl_target_option_save (&opts, &global_options, &global_options_set); s390_cpu_cpp_builtins_internal (pfile, &opts, NULL); } @@ -400,7 +400,8 @@ s390_cpu_cpp_builtins (cpp_reader *pfile) static bool s390_pragma_target_parse (tree args, tree pop_target) { - tree prev_tree = build_target_option_node (&global_options); + tree prev_tree = build_target_option_node (&global_options, + &global_options_set); tree cur_tree; if (! args) @@ -411,7 +412,7 @@ s390_pragma_target_parse (tree args, tree pop_target) &global_options_set, true); if (!cur_tree || cur_tree == error_mark_node) { - cl_target_option_restore (&global_options, + cl_target_option_restore (&global_options, &global_options_set, TREE_TARGET_OPTION (prev_tree)); return false; } diff --git a/gcc/config/s390/s390-protos.h b/gcc/config/s390/s390-protos.h index 6f1bc07..029f728 100644 --- a/gcc/config/s390/s390-protos.h +++ b/gcc/config/s390/s390-protos.h @@ -121,6 +121,7 @@ extern void s390_expand_vec_compare_cc (rtx, enum rtx_code, rtx, rtx, bool); extern enum rtx_code s390_reverse_condition (machine_mode, enum rtx_code); extern void s390_expand_vcond (rtx, rtx, rtx, enum rtx_code, rtx, rtx); extern void s390_expand_vec_init (rtx, rtx); +extern rtx s390_build_signbit_mask (machine_mode); extern rtx s390_return_addr_rtx (int, rtx); extern rtx s390_back_chain_rtx (void); extern rtx_insn *s390_emit_call (rtx, rtx, rtx, rtx); diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c index bd49a89..dbb541b 100644 --- a/gcc/config/s390/s390.c +++ b/gcc/config/s390/s390.c @@ -2467,6 +2467,9 @@ s390_contiguous_bitmask_vector_p (rtx op, int *start, int *end) rtx elt; bool b; + /* Handle floats by bitcasting them to ints. */ + op = gen_lowpart (related_int_vector_mode (GET_MODE (op)).require (), op); + gcc_assert (!!start == !!end); if (!const_vec_duplicate_p (op, &elt) || !CONST_INT_P (elt)) @@ -4106,6 +4109,18 @@ s390_cannot_force_const_mem (machine_mode mode, rtx x) /* Accept all non-symbolic constants. 
*/ return false; + case NEG: + /* Accept an unary '-' only on scalar numeric constants. */ + switch (GET_CODE (XEXP (x, 0))) + { + case CONST_INT: + case CONST_DOUBLE: + case CONST_WIDE_INT: + return false; + default: + return true; + } + case LABEL_REF: /* Labels are OK iff we are non-PIC. */ return flag_pic != 0; @@ -5268,6 +5283,7 @@ legitimize_tls_address (rtx addr, rtx reg) { switch (XINT (XEXP (addr, 0), 1)) { + case UNSPEC_NTPOFF: case UNSPEC_INDNTPOFF: new_rtx = addr; break; @@ -5290,6 +5306,18 @@ legitimize_tls_address (rtx addr, rtx reg) new_rtx = force_operand (new_rtx, 0); } + /* (const (neg (unspec (symbol_ref)))) -> (neg (const (unspec (symbol_ref)))) */ + else if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == NEG) + { + new_rtx = XEXP (XEXP (addr, 0), 0); + if (GET_CODE (new_rtx) != SYMBOL_REF) + new_rtx = gen_rtx_CONST (Pmode, new_rtx); + + new_rtx = legitimize_tls_address (new_rtx, reg); + new_rtx = gen_rtx_NEG (Pmode, new_rtx); + new_rtx = force_operand (new_rtx, 0); + } + else gcc_unreachable (); /* for now ... */ @@ -6436,11 +6464,16 @@ s390_expand_insv (rtx dest, rtx op1, rtx op2, rtx src) /* Emit a strict_low_part pattern if possible. */ if (smode_bsize == bitsize && bitpos == mode_bsize - smode_bsize) { - op = gen_rtx_STRICT_LOW_PART (VOIDmode, gen_lowpart (smode, dest)); - op = gen_rtx_SET (op, gen_lowpart (smode, src)); - clobber = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, CC_REGNUM)); - emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clobber))); - return true; + rtx low_dest = gen_lowpart (smode, dest); + rtx low_src = gen_lowpart (smode, src); + + switch (smode) + { + case E_QImode: emit_insn (gen_movstrictqi (low_dest, low_src)); return true; + case E_HImode: emit_insn (gen_movstricthi (low_dest, low_src)); return true; + case E_SImode: emit_insn (gen_movstrictsi (low_dest, low_src)); return true; + default: break; + } } /* ??? There are more powerful versions of ICM that are not @@ -6833,15 +6866,16 @@ s390_expand_vec_init (rtx target, rtx vals) } /* Use vector gen mask or vector gen byte mask if possible. */ - if (all_same && all_const_int - && (XVECEXP (vals, 0, 0) == const0_rtx - || s390_contiguous_bitmask_vector_p (XVECEXP (vals, 0, 0), - NULL, NULL) - || s390_bytemask_vector_p (XVECEXP (vals, 0, 0), NULL))) + if (all_same && all_const_int) { - emit_insn (gen_rtx_SET (target, - gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)))); - return; + rtx vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)); + if (XVECEXP (vals, 0, 0) == const0_rtx + || s390_contiguous_bitmask_vector_p (vec, NULL, NULL) + || s390_bytemask_vector_p (vec, NULL)) + { + emit_insn (gen_rtx_SET (target, vec)); + return; + } } /* Use vector replicate instructions. vlrep/vrepi/vrep */ @@ -6919,6 +6953,30 @@ s390_expand_vec_init (rtx target, rtx vals) } } +/* Emit a vector constant that contains 1s in each element's sign bit position + and 0s in other positions. MODE is the desired constant's mode. */ +extern rtx +s390_build_signbit_mask (machine_mode mode) +{ + /* Generate the integral element mask value. */ + machine_mode inner_mode = GET_MODE_INNER (mode); + int inner_bitsize = GET_MODE_BITSIZE (inner_mode); + wide_int mask_val = wi::set_bit_in_zero (inner_bitsize - 1, inner_bitsize); + + /* Emit the element mask rtx. Use gen_lowpart in order to cast the integral + value to the desired mode. 
*/ + machine_mode int_mode = related_int_vector_mode (mode).require (); + rtx mask = immed_wide_int_const (mask_val, GET_MODE_INNER (int_mode)); + mask = gen_lowpart (inner_mode, mask); + + /* Emit the vector mask rtx by mode the element mask rtx. */ + int nunits = GET_MODE_NUNITS (mode); + rtvec v = rtvec_alloc (nunits); + for (int i = 0; i < nunits; i++) + RTVEC_ELT (v, i) = mask; + return gen_rtx_CONST_VECTOR (mode, v); +} + /* Structure to hold the initial parameters for a compare_and_swap operation in HImode and QImode. */ @@ -15177,6 +15235,7 @@ s390_loop_unroll_adjust (unsigned nunroll, struct loop *loop) static void s390_function_specific_restore (struct gcc_options *opts, + struct gcc_options */* opts_set */, struct cl_target_option *ptr ATTRIBUTE_UNUSED) { opts->x_s390_cost_pointer = (long)processor_table[opts->x_s390_tune].cost; @@ -15200,7 +15259,7 @@ s390_override_options_after_change (void) static void s390_option_override_internal (struct gcc_options *opts, - const struct gcc_options *opts_set) + struct gcc_options *opts_set) { /* Architecture mode defaults according to ABI. */ if (!(opts_set->x_target_flags & MASK_ZARCH)) @@ -15414,7 +15473,7 @@ s390_option_override_internal (struct gcc_options *opts, /* Call target specific restore function to do post-init work. At the moment, this just sets opts->x_s390_cost_pointer. */ - s390_function_specific_restore (opts, NULL); + s390_function_specific_restore (opts, opts_set, NULL); /* Check whether -mfentry is supported. It cannot be used in 31-bit mode, because 31-bit PLT stubs assume that %r12 contains GOT address, which is @@ -15483,7 +15542,8 @@ s390_option_override (void) /* Save the initial options in case the user does function specific options. */ - target_option_default_node = build_target_option_node (&global_options); + target_option_default_node + = build_target_option_node (&global_options, &global_options_set); target_option_current_node = target_option_default_node; /* This cannot reside in s390_option_optimization_table since HAVE_prefetch @@ -15773,7 +15833,7 @@ s390_valid_target_attribute_tree (tree args, s390_option_override_internal (opts, &new_opts_set); /* Save the current options unless we are validating options for #pragma. */ - t = build_target_option_node (opts); + t = build_target_option_node (opts, &new_opts_set); } return t; } @@ -15786,7 +15846,7 @@ s390_valid_target_attribute_p (tree fndecl, tree args, int ARG_UNUSED (flags)) { - struct gcc_options func_options; + struct gcc_options func_options, func_options_set; tree new_target, new_optimize; bool ret = true; @@ -15798,7 +15858,8 @@ s390_valid_target_attribute_p (tree fndecl, && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0) return true; - tree old_optimize = build_optimization_node (&global_options); + tree old_optimize + = build_optimization_node (&global_options, &global_options_set); /* Get the optimization options of the current function. */ tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl); @@ -15810,19 +15871,21 @@ s390_valid_target_attribute_p (tree fndecl, memset (&func_options, 0, sizeof (func_options)); init_options_struct (&func_options, NULL); lang_hooks.init_options_struct (&func_options); + memset (&func_options_set, 0, sizeof (func_options_set)); - cl_optimization_restore (&func_options, TREE_OPTIMIZATION (func_optimize)); + cl_optimization_restore (&func_options, &func_options_set, + TREE_OPTIMIZATION (func_optimize)); /* Initialize func_options to the default before its target options can be set. 
*/ - cl_target_option_restore (&func_options, + cl_target_option_restore (&func_options, &func_options_set, TREE_TARGET_OPTION (target_option_default_node)); new_target = s390_valid_target_attribute_tree (args, &func_options, &global_options_set, (args == current_target_pragma)); - new_optimize = build_optimization_node (&func_options); + new_optimize = build_optimization_node (&func_options, &func_options_set); if (new_target == error_mark_node) ret = false; else if (fndecl && new_target) @@ -15960,7 +16023,8 @@ s390_indirect_branch_settings (tree fndecl) void s390_activate_target_options (tree new_tree) { - cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree)); + cl_target_option_restore (&global_options, &global_options_set, + TREE_TARGET_OPTION (new_tree)); if (TREE_TARGET_GLOBALS (new_tree)) restore_target_globals (TREE_TARGET_GLOBALS (new_tree)); else if (new_tree == target_option_default_node) @@ -16046,12 +16110,13 @@ s390_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update) fenv_var = __builtin_s390_efpc (); __builtin_s390_sfpc (fenv_var & mask) */ - tree old_fpc = build2 (MODIFY_EXPR, unsigned_type_node, fenv_var, call_efpc); - tree new_fpc = - build2 (BIT_AND_EXPR, unsigned_type_node, fenv_var, - build_int_cst (unsigned_type_node, - ~(FPC_DXC_MASK | FPC_FLAGS_MASK | - FPC_EXCEPTION_MASK))); + tree old_fpc = build4 (TARGET_EXPR, unsigned_type_node, fenv_var, call_efpc, + NULL_TREE, NULL_TREE); + tree new_fpc + = build2 (BIT_AND_EXPR, unsigned_type_node, fenv_var, + build_int_cst (unsigned_type_node, + ~(FPC_DXC_MASK | FPC_FLAGS_MASK + | FPC_EXCEPTION_MASK))); tree set_new_fpc = build_call_expr (sfpc, 1, new_fpc); *hold = build2 (COMPOUND_EXPR, void_type_node, old_fpc, set_new_fpc); @@ -16070,8 +16135,8 @@ s390_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update) __atomic_feraiseexcept ((old_fpc & FPC_FLAGS_MASK) >> FPC_FLAGS_SHIFT); */ old_fpc = create_tmp_var_raw (unsigned_type_node); - tree store_old_fpc = build2 (MODIFY_EXPR, void_type_node, - old_fpc, call_efpc); + tree store_old_fpc = build4 (TARGET_EXPR, void_type_node, old_fpc, call_efpc, + NULL_TREE, NULL_TREE); set_new_fpc = build_call_expr (sfpc, 1, fenv_var); diff --git a/gcc/config/s390/s390.h b/gcc/config/s390/s390.h index e4ef63e..ec5128c 100644 --- a/gcc/config/s390/s390.h +++ b/gcc/config/s390/s390.h @@ -175,6 +175,11 @@ enum processor_flags #define TARGET_VECTOR_LOADSTORE_ALIGNMENT_HINTS 0 #endif +/* Evaluate to true if it is ok to emit a non-signaling vector + comparison. */ +#define TARGET_NONSIGNALING_VECTOR_COMPARE_OK \ + (TARGET_VX && !TARGET_VXE && (flag_finite_math_only || !flag_trapping_math)) + #ifdef HAVE_AS_MACHINE_MACHINEMODE #define S390_USE_TARGET_ATTRIBUTE 1 #else diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md index cd1c063..18edea1 100644 --- a/gcc/config/s390/s390.md +++ b/gcc/config/s390/s390.md @@ -1391,23 +1391,55 @@ ; (TF|DF|SF|TD|DD|SD) instructions -; FIXME: load and test instructions turn SNaN into QNaN what is not -; acceptable if the target will be used afterwards. On the other hand -; they are quite convenient for implementing comparisons with 0.0. So -; try to enable them via splitter/peephole if the value isn't needed anymore. -; See testcases: load-and-test-fp-1.c and load-and-test-fp-2.c +; load and test instructions turn a signaling NaN into a quiet NaN. Thus they +; may only be used if the target register is dead afterwards or if fast math +; is enabled. The former is done via a peephole optimization. 
Note, load and +; test instructions may only be used for (in)equality comparisons because +; relational comparisons must treat a quiet NaN like a signaling NaN which is +; not the case for load and test instructions. For fast math insn +; "cmp<mode>_ccs_0_fastmath" applies. +; See testcases load-and-test-fp-{1,2}.c + +(define_peephole2 + [(set (match_operand:FP 0 "register_operand") + (match_operand:FP 1 "const0_operand")) + (set (reg:CCZ CC_REGNUM) + (compare:CCZ (match_operand:FP 2 "register_operand") + (match_operand:FP 3 "register_operand")))] + "TARGET_HARD_FLOAT + && FP_REG_P (operands[2]) + && REGNO (operands[0]) == REGNO (operands[3]) + && peep2_reg_dead_p (2, operands[0]) + && peep2_reg_dead_p (2, operands[2])" + [(parallel + [(set (reg:CCZ CC_REGNUM) + (compare:CCZ (match_dup 2) (match_dup 1))) + (clobber (match_dup 2))])] + "") ; ltxbr, ltdbr, ltebr, ltxtr, ltdtr -(define_insn "*cmp<mode>_ccs_0" - [(set (reg CC_REGNUM) - (compare (match_operand:FP 0 "register_operand" "f") - (match_operand:FP 1 "const0_operand" ""))) - (clobber (match_operand:FP 2 "register_operand" "=0"))] - "s390_match_ccmode(insn, CCSmode) && TARGET_HARD_FLOAT" +(define_insn "*cmp<mode>_ccz_0" + [(set (reg:CCZ CC_REGNUM) + (compare:CCZ (match_operand:FP 0 "register_operand" "f") + (match_operand:FP 1 "const0_operand"))) + (clobber (match_operand:FP 2 "register_operand" "=0"))] + "TARGET_HARD_FLOAT" "lt<xde><bt>r\t%0,%0" [(set_attr "op_type" "RRE") (set_attr "type" "fsimp<mode>")]) +(define_insn "*cmp<mode>_ccs_0_fastmath" + [(set (reg CC_REGNUM) + (compare (match_operand:FP 0 "register_operand" "f") + (match_operand:FP 1 "const0_operand")))] + "s390_match_ccmode (insn, CCSmode) + && TARGET_HARD_FLOAT + && !flag_trapping_math + && !flag_signaling_nans" + "lt<xde><bt>r\t%0,%0" + [(set_attr "op_type" "RRE") + (set_attr "type" "fsimp<mode>")]) + ; VX: TFmode in FPR pairs: use cxbr instead of wfcxb ; cxtr, cdtr, cxbr, cdbr, cebr, cdb, ceb, wfcsb, wfcdb (define_insn "*cmp<mode>_ccs" @@ -2413,7 +2445,7 @@ ; movstrictqi instruction pattern(s). ; -(define_insn "*movstrictqi" +(define_insn "movstrictqi" [(set (strict_low_part (match_operand:QI 0 "register_operand" "+d,d")) (match_operand:QI 1 "memory_operand" "R,T"))] "" @@ -2428,7 +2460,7 @@ ; movstricthi instruction pattern(s). ; -(define_insn "*movstricthi" +(define_insn "movstricthi" [(set (strict_low_part (match_operand:HI 0 "register_operand" "+d,d")) (match_operand:HI 1 "memory_operand" "Q,S")) (clobber (reg:CC CC_REGNUM))] diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md index 08f2d4c..e9332ba 100644 --- a/gcc/config/s390/vector.md +++ b/gcc/config/s390/vector.md @@ -567,7 +567,7 @@ ; single vector register. (define_insn "*vec_tf_to_v1tf" [(set (match_operand:V1TF 0 "nonimmediate_operand" "=v,v,R,v,v") - (vec_duplicate:V1TF (match_operand:TF 1 "general_operand" "v,R,v,G,d")))] + (vec_duplicate:V1TF (match_operand:TF 1 "general_operand" "f,R,f,G,d")))] "TARGET_VX" "@ vmrhg\t%v0,%1,%N1 @@ -622,7 +622,7 @@ case GT: case LTGT: /* Signaling vector comparisons are supported only on z14+. 
*/ - return TARGET_Z14; + return TARGET_VXE || TARGET_NONSIGNALING_VECTOR_COMPARE_OK; default: return true; } @@ -1425,28 +1425,16 @@ ; Vector copysign, implement using vector select (define_expand "copysign<mode>3" - [(set (match_operand:VFT 0 "register_operand" "") - (if_then_else:VFT - (eq (match_dup 3) - (match_dup 4)) - (match_operand:VFT 1 "register_operand" "") - (match_operand:VFT 2 "register_operand" "")))] + [(set (match_operand:VFT 0 "register_operand" "") + (ior:VFT + (and:VFT (match_operand:VFT 2 "register_operand" "") + (match_dup 3)) + (and:VFT (not:VFT (match_dup 3)) + (match_operand:VFT 1 "register_operand" ""))))] "TARGET_VX" { - int sz = GET_MODE_BITSIZE (GET_MODE_INNER (<MODE>mode)); - int prec = GET_MODE_PRECISION (GET_MODE_INNER (<tointvec>mode)); - wide_int mask_val = wi::shwi (1l << (sz - 1), prec); - - rtx mask = gen_reg_rtx (<tointvec>mode); - - int nunits = GET_MODE_NUNITS (<tointvec>mode); - rtvec v = rtvec_alloc (nunits); - for (int i = 0; i < nunits; i++) - RTVEC_ELT (v, i) = GEN_INT (mask_val.to_shwi ()); - - mask = gen_rtx_CONST_VECTOR (<tointvec>mode, v); - operands[3] = force_reg (<tointvec>mode, mask); - operands[4] = CONST0_RTX (<tointvec>mode); + rtx mask = s390_build_signbit_mask (<MODE>mode); + operands[3] = force_reg (<MODE>mode, mask); }) ;; @@ -1534,7 +1522,7 @@ [(set (match_operand:<tointvec> 0 "register_operand" "=v") (gt:<tointvec> (match_operand:VFT 1 "register_operand" "v") (match_operand:VFT 2 "register_operand" "v")))] - "TARGET_VX && !TARGET_VXE && flag_finite_math_only" + "TARGET_NONSIGNALING_VECTOR_COMPARE_OK" "<vw>fch<sdx>b\t%v0,%v1,%v2" [(set_attr "op_type" "VRR")]) @@ -1551,7 +1539,7 @@ [(set (match_operand:<tointvec> 0 "register_operand" "=v") (ge:<tointvec> (match_operand:VFT 1 "register_operand" "v") (match_operand:VFT 2 "register_operand" "v")))] - "TARGET_VX && !TARGET_VXE && flag_finite_math_only" + "TARGET_NONSIGNALING_VECTOR_COMPARE_OK" "<vw>fche<sdx>b\t%v0,%v1,%v2" [(set_attr "op_type" "VRR")]) diff --git a/gcc/config/sparc/linux.h b/gcc/config/sparc/linux.h index 81201e6..63853e6 100644 --- a/gcc/config/sparc/linux.h +++ b/gcc/config/sparc/linux.h @@ -27,16 +27,10 @@ along with GCC; see the file COPYING3. If not see } \ while (0) -/* Provide a ENDFILE_SPEC appropriate for GNU/Linux. Here we tack on - the GNU/Linux magical crtend.o file (see crtstuff.c) which - provides part of the support for getting C++ file-scope static - object constructed before entering `main', followed by a normal - GNU/Linux "finalizer" file, `crtn.o'. */ - #undef ENDFILE_SPEC #define ENDFILE_SPEC \ - "%{shared|pie:crtendS.o%s;:crtend.o%s} crtn.o%s\ - %{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s}" + GNU_USER_TARGET_ENDFILE_SPEC \ + "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s}" /* -mcpu=native handling only makes sense with compiler running on a SPARC chip. */ diff --git a/gcc/config/sparc/linux64.h b/gcc/config/sparc/linux64.h index a1a0efd..19ce84d 100644 --- a/gcc/config/sparc/linux64.h +++ b/gcc/config/sparc/linux64.h @@ -44,16 +44,10 @@ along with GCC; see the file COPYING3. If not see #undef ASM_CPU64_DEFAULT_SPEC #define ASM_CPU64_DEFAULT_SPEC "-Av9a" -/* Provide a ENDFILE_SPEC appropriate for GNU/Linux. Here we tack on - the GNU/Linux magical crtend.o file (see crtstuff.c) which - provides part of the support for getting C++ file-scope static - object constructed before entering `main', followed by a normal - GNU/Linux "finalizer" file, `crtn.o'. 
*/ - #undef ENDFILE_SPEC #define ENDFILE_SPEC \ - "%{shared|pie:crtendS.o%s;:crtend.o%s} crtn.o%s\ - %{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s}" + GNU_USER_TARGET_ENDFILE_SPEC \ + "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s}" /* The default code model. */ #undef SPARC_DEFAULT_CMODEL diff --git a/gcc/config/tilepro/gen-mul-tables.cc b/gcc/config/tilepro/gen-mul-tables.cc index 2a34502..7f9fb65 100644 --- a/gcc/config/tilepro/gen-mul-tables.cc +++ b/gcc/config/tilepro/gen-mul-tables.cc @@ -1252,6 +1252,8 @@ main () printf ("/* Note this file is auto-generated from gen-mul-tables.cc.\n"); printf (" Make any required changes there. */\n"); printf ("\n"); + printf ("#define IN_TARGET_CODE 1\n"); + printf ("\n"); printf ("#include \"config.h\"\n"); printf ("#include \"system.h\"\n"); printf ("#include \"coretypes.h\"\n"); diff --git a/gcc/config/vxworks.h b/gcc/config/vxworks.h index d648d2f..e50260b0 100644 --- a/gcc/config/vxworks.h +++ b/gcc/config/vxworks.h @@ -36,11 +36,16 @@ along with GCC; see the file COPYING3. If not see /* Since we provide a default -isystem, expand -isystem on the command line early. */ + +/* Self-tests may be run in contexts where the VxWorks environment isn't + available. Prevent attempts at designating the location of runtime header + files, libraries or startfiles, which would fail on unset environment + variables and aren't needed for such tests. */ #if TARGET_VXWORKS7 #undef VXWORKS_ADDITIONAL_CPP_SPEC #define VXWORKS_ADDITIONAL_CPP_SPEC \ - "%{!nostdinc: \ + "%{!nostdinc:%{!fself-test=*: \ %{isystem*} \ %{mrtp: -idirafter %:getenv(VSB_DIR /h) \ -idirafter %:getenv(VSB_DIR /share/h) \ @@ -49,19 +54,19 @@ along with GCC; see the file COPYING3. If not see ;: -idirafter %:getenv(VSB_DIR /h) \ -idirafter %:getenv(VSB_DIR /share/h) \ -idirafter %:getenv(VSB_DIR /krnl/h/system) \ - -idirafter %:getenv(VSB_DIR /krnl/h/public)}}" + -idirafter %:getenv(VSB_DIR /krnl/h/public)}}}" #else /* TARGET_VXWORKS7 */ #undef VXWORKS_ADDITIONAL_CPP_SPEC #define VXWORKS_ADDITIONAL_CPP_SPEC \ - "%{!nostdinc: \ + "%{!nostdinc:%{!fself-test=*: \ %{isystem*} \ %{mrtp: -idirafter %:getenv(WIND_USR /h) \ -idirafter %:getenv(WIND_USR /h/wrn/coreip) \ ;: -idirafter %:getenv(WIND_BASE /target/h) \ -idirafter %:getenv(WIND_BASE /target/h/wrn/coreip) \ -}}" +}}}" #endif @@ -108,7 +113,8 @@ along with GCC; see the file COPYING3. If not see #if TARGET_VXWORKS7 #undef STARTFILE_PREFIX_SPEC -#define STARTFILE_PREFIX_SPEC "%:getenv(VSB_DIR /usr/lib/common)" +#define STARTFILE_PREFIX_SPEC \ + "%{!fself-test=*:%:getenv(VSB_DIR /usr/lib/common)}" #define TLS_SYM "-u __tls__" #else #define TLS_SYM "" |
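
For readers unfamiliar with the POWER10 `vins*` encoding used by the new `vreplace_elt_<mode>` expander above, the word-index operand is converted to a byte offset and mirrored on little-endian targets, exactly as the expander's comment describes. The sketch below is illustrative only and not part of the commit; the function name is hypothetical, and the element size of 4 bytes with a maximum byte offset of 12 is inferred from the `const_0_to_12_operand` predicate on the instruction pattern.

```cpp
#include <cassert>

// Hypothetical helper: map a 32-bit element index (0..3) to the byte offset
// consumed by the vinsw-style instruction, which numbers bytes big-endian.
// On little-endian targets the offset is mirrored around the highest valid
// byte offset (12 = 16-byte vector minus one 4-byte element).
static int element_to_byte_index (int elt, bool big_endian)
{
  const int shift = 2;        // log2 of the element size in bytes
  const int max_offset = 12;  // highest byte offset of a 4-byte element
  return big_endian ? (elt << shift) : (max_offset - (elt << shift));
}

int main ()
{
  assert (element_to_byte_index (0, true)  == 0);
  assert (element_to_byte_index (3, true)  == 12);
  assert (element_to_byte_index (0, false) == 12);
  assert (element_to_byte_index (3, false) == 0);
  return 0;
}
```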
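
The reworked s390 `copysign<mode>3` expander selects bits with the sign-bit mask built by the new `s390_build_signbit_mask` helper: the result takes its sign bit from operand 2 and all remaining bits from operand 1. The standalone C++ sketch below shows the same selection on a single 32-bit float rather than on vector registers; it is an assumption-laden illustration (the helper name and the use of `float`/`uint32_t` are mine), not GCC code.

```cpp
#include <cstdint>
#include <cstring>
#include <cstdio>

// Illustrative only: the (op2 & mask) | (~mask & op1) selection performed by
// the new copysign<mode>3 pattern, applied here to one IEEE single-precision
// value instead of a whole vector register.
static float copysign_via_mask (float magnitude, float sign)
{
  uint32_t m, s;
  std::memcpy (&m, &magnitude, sizeof m);
  std::memcpy (&s, &sign, sizeof s);

  const uint32_t signbit = UINT32_C (1) << 31;   // per-element sign-bit mask
  uint32_t r = (m & ~signbit) | (s & signbit);   // magnitude from op1, sign from op2

  float result;
  std::memcpy (&result, &r, sizeof result);
  return result;
}

int main ()
{
  std::printf ("%f\n", copysign_via_mask (3.5f, -0.0f));  // prints -3.500000
  std::printf ("%f\n", copysign_via_mask (-2.0f, 1.0f));  // prints 2.000000
  return 0;
}
```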